Diffstat (limited to 'fs')
-rw-r--r--  fs/9p/v9fs.c | 2
-rw-r--r--  fs/9p/vfs_file.c | 2
-rw-r--r--  fs/Kconfig | 24
-rw-r--r--  fs/Kconfig.binfmt | 7
-rw-r--r--  fs/Makefile | 2
-rw-r--r--  fs/affs/affs.h | 2
-rw-r--r--  fs/affs/amigaffs.c | 13
-rw-r--r--  fs/affs/bitmap.c | 1
-rw-r--r--  fs/affs/dir.c | 11
-rw-r--r--  fs/affs/file.c | 49
-rw-r--r--  fs/affs/inode.c | 7
-rw-r--r--  fs/affs/namei.c | 47
-rw-r--r--  fs/affs/super.c | 69
-rw-r--r--  fs/afs/rxrpc.c | 14
-rw-r--r--  fs/afs/volume.c | 2
-rw-r--r--  fs/aio.c | 20
-rw-r--r--  fs/befs/linuxvfs.c | 6
-rw-r--r--  fs/binfmt_elf.c | 5
-rw-r--r--  fs/binfmt_som.c | 299
-rw-r--r--  fs/block_dev.c | 77
-rw-r--r--  fs/btrfs/Kconfig | 1
-rw-r--r--  fs/btrfs/backref.c | 41
-rw-r--r--  fs/btrfs/backref.h | 3
-rw-r--r--  fs/btrfs/btrfs_inode.h | 3
-rw-r--r--  fs/btrfs/ctree.c | 55
-rw-r--r--  fs/btrfs/ctree.h | 40
-rw-r--r--  fs/btrfs/delayed-inode.c | 46
-rw-r--r--  fs/btrfs/dev-replace.c | 25
-rw-r--r--  fs/btrfs/disk-io.c | 108
-rw-r--r--  fs/btrfs/disk-io.h | 6
-rw-r--r--  fs/btrfs/extent-tree.c | 264
-rw-r--r--  fs/btrfs/extent_io.c | 91
-rw-r--r--  fs/btrfs/extent_io.h | 65
-rw-r--r--  fs/btrfs/file.c | 3
-rw-r--r--  fs/btrfs/free-space-cache.c | 13
-rw-r--r--  fs/btrfs/inode-item.c | 9
-rw-r--r--  fs/btrfs/inode.c | 166
-rw-r--r--  fs/btrfs/qgroup.c | 3
-rw-r--r--  fs/btrfs/raid56.c | 103
-rw-r--r--  fs/btrfs/raid56.h | 11
-rw-r--r--  fs/btrfs/reada.c | 19
-rw-r--r--  fs/btrfs/relocation.c | 12
-rw-r--r--  fs/btrfs/scrub.c | 315
-rw-r--r--  fs/btrfs/send.c | 9
-rw-r--r--  fs/btrfs/super.c | 20
-rw-r--r--  fs/btrfs/sysfs.c | 10
-rw-r--r--  fs/btrfs/tests/extent-buffer-tests.c | 2
-rw-r--r--  fs/btrfs/tests/extent-io-tests.c | 3
-rw-r--r--  fs/btrfs/tests/inode-tests.c | 4
-rw-r--r--  fs/btrfs/tests/qgroup-tests.c | 23
-rw-r--r--  fs/btrfs/transaction.c | 29
-rw-r--r--  fs/btrfs/transaction.h | 7
-rw-r--r--  fs/btrfs/tree-log.c | 235
-rw-r--r--  fs/btrfs/volumes.c | 242
-rw-r--r--  fs/btrfs/volumes.h | 18
-rw-r--r--  fs/ceph/acl.c | 14
-rw-r--r--  fs/ceph/addr.c | 22
-rw-r--r--  fs/ceph/caps.c | 127
-rw-r--r--  fs/ceph/dir.c | 33
-rw-r--r--  fs/ceph/file.c | 39
-rw-r--r--  fs/ceph/inode.c | 43
-rw-r--r--  fs/ceph/locks.c | 63
-rw-r--r--  fs/ceph/mds_client.c | 131
-rw-r--r--  fs/ceph/mds_client.h | 2
-rw-r--r--  fs/ceph/snap.c | 54
-rw-r--r--  fs/ceph/super.c | 24
-rw-r--r--  fs/ceph/super.h | 5
-rw-r--r--  fs/char_dev.c | 24
-rw-r--r--  fs/cifs/cifs_debug.c | 6
-rw-r--r--  fs/cifs/cifsglob.h | 6
-rw-r--r--  fs/cifs/connect.c | 2
-rw-r--r--  fs/cifs/file.c | 41
-rw-r--r--  fs/cifs/inode.c | 2
-rw-r--r--  fs/cifs/ioctl.c | 21
-rw-r--r--  fs/cifs/netmisc.c | 12
-rw-r--r--  fs/cifs/readdir.c | 10
-rw-r--r--  fs/cifs/smb2misc.c | 12
-rw-r--r--  fs/cifs/smb2ops.c | 3
-rw-r--r--  fs/cifs/smb2pdu.h | 2
-rw-r--r--  fs/cifs/smb2transport.c | 2
-rw-r--r--  fs/cifs/smbencrypt.c | 2
-rw-r--r--  fs/coda/dir.c | 138
-rw-r--r--  fs/coda/inode.c | 2
-rw-r--r--  fs/configfs/configfs_internal.h | 2
-rw-r--r--  fs/configfs/inode.c | 17
-rw-r--r--  fs/configfs/mount.c | 11
-rw-r--r--  fs/dax.c | 534
-rw-r--r--  fs/dcache.c | 189
-rw-r--r--  fs/debugfs/inode.c | 291
-rw-r--r--  fs/dlm/netlink.c | 7
-rw-r--r--  fs/drop_caches.c | 14
-rw-r--r--  fs/ecryptfs/inode.c | 1
-rw-r--r--  fs/ecryptfs/main.c | 2
-rw-r--r--  fs/efivarfs/Kconfig | 1
-rw-r--r--  fs/efivarfs/super.c | 2
-rw-r--r--  fs/eventfd.c | 12
-rw-r--r--  fs/eventpoll.c | 4
-rw-r--r--  fs/exec.c | 10
-rw-r--r--  fs/exofs/inode.c | 3
-rw-r--r--  fs/exofs/super.c | 2
-rw-r--r--  fs/ext2/Kconfig | 11
-rw-r--r--  fs/ext2/Makefile | 1
-rw-r--r--  fs/ext2/ext2.h | 10
-rw-r--r--  fs/ext2/file.c | 44
-rw-r--r--  fs/ext2/ialloc.c | 2
-rw-r--r--  fs/ext2/inode.c | 38
-rw-r--r--  fs/ext2/namei.c | 13
-rw-r--r--  fs/ext2/super.c | 53
-rw-r--r--  fs/ext2/xip.c | 91
-rw-r--r--  fs/ext2/xip.h | 26
-rw-r--r--  fs/ext3/super.c | 2
-rw-r--r--  fs/ext4/ext4.h | 6
-rw-r--r--  fs/ext4/extents.c | 4
-rw-r--r--  fs/ext4/file.c | 270
-rw-r--r--  fs/ext4/indirect.c | 18
-rw-r--r--  fs/ext4/inode.c | 159
-rw-r--r--  fs/ext4/namei.c | 10
-rw-r--r--  fs/ext4/resize.c | 24
-rw-r--r--  fs/ext4/super.c | 95
-rw-r--r--  fs/f2fs/Kconfig | 10
-rw-r--r--  fs/f2fs/Makefile | 1
-rw-r--r--  fs/f2fs/acl.c | 6
-rw-r--r--  fs/f2fs/checkpoint.c | 95
-rw-r--r--  fs/f2fs/data.c | 218
-rw-r--r--  fs/f2fs/debug.c | 59
-rw-r--r--  fs/f2fs/dir.c | 3
-rw-r--r--  fs/f2fs/f2fs.h | 120
-rw-r--r--  fs/f2fs/file.c | 101
-rw-r--r--  fs/f2fs/gc.c | 38
-rw-r--r--  fs/f2fs/gc.h | 33
-rw-r--r--  fs/f2fs/inline.c | 32
-rw-r--r--  fs/f2fs/inode.c | 37
-rw-r--r--  fs/f2fs/namei.c | 2
-rw-r--r--  fs/f2fs/node.c | 154
-rw-r--r--  fs/f2fs/node.h | 45
-rw-r--r--  fs/f2fs/recovery.c | 11
-rw-r--r--  fs/f2fs/segment.c | 194
-rw-r--r--  fs/f2fs/segment.h | 29
-rw-r--r--  fs/f2fs/super.c | 75
-rw-r--r--  fs/f2fs/trace.c | 159
-rw-r--r--  fs/f2fs/trace.h | 46
-rw-r--r--  fs/fat/inode.c | 2
-rw-r--r--  fs/fcntl.c | 5
-rw-r--r--  fs/fs-writeback.c | 76
-rw-r--r--  fs/fs_pin.c | 96
-rw-r--r--  fs/fuse/dev.c | 51
-rw-r--r--  fs/fuse/dir.c | 31
-rw-r--r--  fs/fuse/file.c | 11
-rw-r--r--  fs/fuse/fuse_i.h | 2
-rw-r--r--  fs/fuse/inode.c | 6
-rw-r--r--  fs/gfs2/acl.c | 2
-rw-r--r--  fs/gfs2/aops.c | 2
-rw-r--r--  fs/gfs2/dir.c | 3
-rw-r--r--  fs/gfs2/file.c | 5
-rw-r--r--  fs/gfs2/glock.c | 14
-rw-r--r--  fs/gfs2/inode.c | 3
-rw-r--r--  fs/gfs2/ops_fstype.c | 1
-rw-r--r--  fs/gfs2/quota.c | 60
-rw-r--r--  fs/gfs2/recovery.c | 2
-rw-r--r--  fs/gfs2/super.c | 2
-rw-r--r--  fs/gfs2/sys.c | 2
-rw-r--r--  fs/hugetlbfs/inode.c | 13
-rw-r--r--  fs/inode.c | 138
-rw-r--r--  fs/internal.h | 9
-rw-r--r--  fs/ioctl.c | 5
-rw-r--r--  fs/isofs/rock.c | 3
-rw-r--r--  fs/isofs/util.c | 18
-rw-r--r--  fs/jffs2/compr_rubin.c | 5
-rw-r--r--  fs/jffs2/scan.c | 5
-rw-r--r--  fs/jfs/endian24.h | 49
-rw-r--r--  fs/jfs/file.c | 2
-rw-r--r--  fs/jfs/jfs_dtree.c | 4
-rw-r--r--  fs/jfs/jfs_types.h | 55
-rw-r--r--  fs/jfs/jfs_xtree.h | 25
-rw-r--r--  fs/jfs/super.c | 3
-rw-r--r--  fs/kernfs/dir.c | 36
-rw-r--r--  fs/kernfs/file.c | 4
-rw-r--r--  fs/kernfs/inode.c | 13
-rw-r--r--  fs/kernfs/kernfs-internal.h | 1
-rw-r--r--  fs/kernfs/mount.c | 1
-rw-r--r--  fs/libfs.c | 2
-rw-r--r--  fs/lockd/mon.c | 13
-rw-r--r--  fs/lockd/svc.c | 8
-rw-r--r--  fs/lockd/svclock.c | 4
-rw-r--r--  fs/lockd/svcsubs.c | 26
-rw-r--r--  fs/lockd/xdr.c | 8
-rw-r--r--  fs/locks.c | 588
-rw-r--r--  fs/mount.h | 4
-rw-r--r--  fs/namei.c | 143
-rw-r--r--  fs/namespace.c | 50
-rw-r--r--  fs/ncpfs/dir.c | 98
-rw-r--r--  fs/ncpfs/inode.c | 3
-rw-r--r--  fs/ncpfs/ncp_fs_i.h | 1
-rw-r--r--  fs/ncpfs/ncplib_kernel.h | 30
-rw-r--r--  fs/nfs/Kconfig | 5
-rw-r--r--  fs/nfs/Makefile | 3
-rw-r--r--  fs/nfs/blocklayout/blocklayout.c | 2
-rw-r--r--  fs/nfs/callback.c | 8
-rw-r--r--  fs/nfs/callback_proc.c | 2
-rw-r--r--  fs/nfs/callback_xdr.c | 8
-rw-r--r--  fs/nfs/delegation.c | 45
-rw-r--r--  fs/nfs/direct.c | 120
-rw-r--r--  fs/nfs/file.c | 1
-rw-r--r--  fs/nfs/filelayout/filelayout.c | 366
-rw-r--r--  fs/nfs/filelayout/filelayout.h | 40
-rw-r--r--  fs/nfs/filelayout/filelayoutdev.c | 469
-rw-r--r--  fs/nfs/flexfilelayout/Makefile | 5
-rw-r--r--  fs/nfs/flexfilelayout/flexfilelayout.c | 1533
-rw-r--r--  fs/nfs/flexfilelayout/flexfilelayout.h | 155
-rw-r--r--  fs/nfs/flexfilelayout/flexfilelayoutdev.c | 552
-rw-r--r--  fs/nfs/idmap.c | 3
-rw-r--r--  fs/nfs/inode.c | 14
-rw-r--r--  fs/nfs/internal.h | 69
-rw-r--r--  fs/nfs/nfs2xdr.c | 10
-rw-r--r--  fs/nfs/nfs3_fs.h | 2
-rw-r--r--  fs/nfs/nfs3client.c | 41
-rw-r--r--  fs/nfs/nfs3proc.c | 9
-rw-r--r--  fs/nfs/nfs3super.c | 2
-rw-r--r--  fs/nfs/nfs3xdr.c | 3
-rw-r--r--  fs/nfs/nfs4_fs.h | 9
-rw-r--r--  fs/nfs/nfs4client.c | 51
-rw-r--r--  fs/nfs/nfs4proc.c | 411
-rw-r--r--  fs/nfs/nfs4session.c | 2
-rw-r--r--  fs/nfs/nfs4session.h | 6
-rw-r--r--  fs/nfs/nfs4state.c | 101
-rw-r--r--  fs/nfs/nfs4super.c | 4
-rw-r--r--  fs/nfs/nfs4xdr.c | 145
-rw-r--r--  fs/nfs/nfsroot.c | 4
-rw-r--r--  fs/nfs/objlayout/objio_osd.c | 5
-rw-r--r--  fs/nfs/pagelist.c | 300
-rw-r--r--  fs/nfs/pnfs.c | 471
-rw-r--r--  fs/nfs/pnfs.h | 139
-rw-r--r--  fs/nfs/pnfs_nfs.c | 870
-rw-r--r--  fs/nfs/read.c | 33
-rw-r--r--  fs/nfs/super.c | 33
-rw-r--r--  fs/nfs/write.c | 111
-rw-r--r--  fs/nfsd/Kconfig | 10
-rw-r--r--  fs/nfsd/Makefile | 8
-rw-r--r--  fs/nfsd/blocklayout.c | 189
-rw-r--r--  fs/nfsd/blocklayoutxdr.c | 157
-rw-r--r--  fs/nfsd/blocklayoutxdr.h | 62
-rw-r--r--  fs/nfsd/export.c | 8
-rw-r--r--  fs/nfsd/export.h | 2
-rw-r--r--  fs/nfsd/nfs4callback.c | 99
-rw-r--r--  fs/nfsd/nfs4layouts.c | 721
-rw-r--r--  fs/nfsd/nfs4proc.c | 310
-rw-r--r--  fs/nfsd/nfs4state.c | 97
-rw-r--r--  fs/nfsd/nfs4xdr.c | 362
-rw-r--r--  fs/nfsd/nfsctl.c | 9
-rw-r--r--  fs/nfsd/nfsd.h | 16
-rw-r--r--  fs/nfsd/nfsfh.h | 18
-rw-r--r--  fs/nfsd/nfssvc.c | 1
-rw-r--r--  fs/nfsd/pnfs.h | 86
-rw-r--r--  fs/nfsd/state.h | 43
-rw-r--r--  fs/nfsd/trace.c | 5
-rw-r--r--  fs/nfsd/trace.h | 54
-rw-r--r--  fs/nfsd/xdr4.h | 59
-rw-r--r--  fs/nfsd/xdr4cb.h | 7
-rw-r--r--  fs/nilfs2/file.c | 1
-rw-r--r--  fs/nilfs2/gcinode.c | 1
-rw-r--r--  fs/nilfs2/mdt.c | 6
-rw-r--r--  fs/nilfs2/nilfs.h | 2
-rw-r--r--  fs/nilfs2/page.c | 4
-rw-r--r--  fs/nilfs2/page.h | 3
-rw-r--r--  fs/nilfs2/segment.c | 44
-rw-r--r--  fs/nilfs2/segment.h | 5
-rw-r--r--  fs/nilfs2/super.c | 6
-rw-r--r--  fs/notify/Kconfig | 1
-rw-r--r--  fs/notify/fanotify/fanotify.c | 2
-rw-r--r--  fs/notify/fanotify/fanotify_user.c | 45
-rw-r--r--  fs/ntfs/file.c | 3
-rw-r--r--  fs/ocfs2/acl.c | 14
-rw-r--r--  fs/ocfs2/alloc.c | 18
-rw-r--r--  fs/ocfs2/aops.c | 242
-rw-r--r--  fs/ocfs2/cluster/tcp.c | 3
-rw-r--r--  fs/ocfs2/cluster/tcp_internal.h | 12
-rw-r--r--  fs/ocfs2/dir.c | 10
-rw-r--r--  fs/ocfs2/dlm/dlmast.c | 6
-rw-r--r--  fs/ocfs2/dlm/dlmdebug.c | 4
-rw-r--r--  fs/ocfs2/dlm/dlmdomain.c | 14
-rw-r--r--  fs/ocfs2/dlm/dlmdomain.h | 1
-rw-r--r--  fs/ocfs2/dlm/dlmrecovery.c | 12
-rw-r--r--  fs/ocfs2/dlmfs/dlmfs.c | 14
-rw-r--r--  fs/ocfs2/dlmglue.c | 3
-rw-r--r--  fs/ocfs2/file.c | 80
-rw-r--r--  fs/ocfs2/file.h | 9
-rw-r--r--  fs/ocfs2/inode.c | 2
-rw-r--r--  fs/ocfs2/inode.h | 2
-rw-r--r--  fs/ocfs2/journal.c | 111
-rw-r--r--  fs/ocfs2/journal.h | 5
-rw-r--r--  fs/ocfs2/mmap.c | 1
-rw-r--r--  fs/ocfs2/namei.c | 327
-rw-r--r--  fs/ocfs2/namei.h | 8
-rw-r--r--  fs/ocfs2/ocfs2.h | 25
-rw-r--r--  fs/ocfs2/ocfs2_fs.h | 14
-rw-r--r--  fs/ocfs2/quota.h | 1
-rw-r--r--  fs/ocfs2/quota_local.c | 20
-rw-r--r--  fs/ocfs2/refcounttree.c | 2
-rw-r--r--  fs/ocfs2/reservations.c | 2
-rw-r--r--  fs/ocfs2/super.c | 51
-rw-r--r--  fs/ocfs2/xattr.c | 10
-rw-r--r--  fs/open.c | 15
-rw-r--r--  fs/proc/array.c | 44
-rw-r--r--  fs/proc/generic.c | 27
-rw-r--r--  fs/proc/inode.c | 2
-rw-r--r--  fs/proc/page.c | 16
-rw-r--r--  fs/proc/task_mmu.c | 250
-rw-r--r--  fs/proc/vmcore.c | 8
-rw-r--r--  fs/proc_namespace.c | 1
-rw-r--r--  fs/pstore/Kconfig | 10
-rw-r--r--  fs/pstore/Makefile | 2
-rw-r--r--  fs/pstore/inode.c | 26
-rw-r--r--  fs/pstore/internal.h | 6
-rw-r--r--  fs/pstore/platform.c | 5
-rw-r--r--  fs/pstore/pmsg.c | 114
-rw-r--r--  fs/pstore/ram.c | 53
-rw-r--r--  fs/quota/Kconfig | 1
-rw-r--r--  fs/quota/dquot.c | 186
-rw-r--r--  fs/quota/quota.c | 214
-rw-r--r--  fs/quota/quota_v1.c | 4
-rw-r--r--  fs/quota/quota_v2.c | 16
-rw-r--r--  fs/ramfs/file-nommu.c | 7
-rw-r--r--  fs/ramfs/inode.c | 21
-rw-r--r--  fs/read_write.c | 48
-rw-r--r--  fs/reiserfs/inode.c | 2
-rw-r--r--  fs/romfs/mmap-nommu.c | 10
-rw-r--r--  fs/romfs/super.c | 3
-rw-r--r--  fs/select.c | 2
-rw-r--r--  fs/seq_file.c | 32
-rw-r--r--  fs/splice.c | 23
-rw-r--r--  fs/super.c | 63
-rw-r--r--  fs/sync.c | 8
-rw-r--r--  fs/sysfs/file.c | 2
-rw-r--r--  fs/sysfs/group.c | 2
-rw-r--r--  fs/ubifs/debug.c | 4
-rw-r--r--  fs/ubifs/dir.c | 18
-rw-r--r--  fs/ubifs/file.c | 5
-rw-r--r--  fs/ubifs/replay.c | 19
-rw-r--r--  fs/ubifs/super.c | 6
-rw-r--r--  fs/ubifs/ubifs.h | 4
-rw-r--r--  fs/ubifs/xattr.c | 112
-rw-r--r--  fs/udf/Kconfig | 10
-rw-r--r--  fs/udf/dir.c | 31
-rw-r--r--  fs/udf/file.c | 2
-rw-r--r--  fs/udf/inode.c | 42
-rw-r--r--  fs/udf/namei.c | 17
-rw-r--r--  fs/udf/super.c | 5
-rw-r--r--  fs/udf/symlink.c | 57
-rw-r--r--  fs/udf/udfdecl.h | 3
-rw-r--r--  fs/udf/unicode.c | 28
-rw-r--r--  fs/ufs/super.c | 8
-rw-r--r--  fs/xfs/Makefile | 1
-rw-r--r--  fs/xfs/kmem.c | 10
-rw-r--r--  fs/xfs/kmem.h | 5
-rw-r--r--  fs/xfs/libxfs/xfs_attr_leaf.c | 2
-rw-r--r--  fs/xfs/libxfs/xfs_bmap.c | 20
-rw-r--r--  fs/xfs/libxfs/xfs_bmap.h | 33
-rw-r--r--  fs/xfs/libxfs/xfs_format.h | 24
-rw-r--r--  fs/xfs/libxfs/xfs_fs.h (renamed from fs/xfs/xfs_fs.h) | 0
-rw-r--r--  fs/xfs/libxfs/xfs_sb.c | 320
-rw-r--r--  fs/xfs/libxfs/xfs_sb.h | 11
-rw-r--r--  fs/xfs/libxfs/xfs_shared.h | 33
-rw-r--r--  fs/xfs/libxfs/xfs_symlink_remote.c | 2
-rw-r--r--  fs/xfs/libxfs/xfs_trans_resv.c | 14
-rw-r--r--  fs/xfs/libxfs/xfs_trans_resv.h | 1
-rw-r--r--  fs/xfs/libxfs/xfs_types.h (renamed from fs/xfs/xfs_types.h) | 0
-rw-r--r--  fs/xfs/xfs_aops.c | 149
-rw-r--r--  fs/xfs/xfs_aops.h | 3
-rw-r--r--  fs/xfs/xfs_bmap_util.h | 37
-rw-r--r--  fs/xfs/xfs_buf.c | 13
-rw-r--r--  fs/xfs/xfs_buf_item.c | 6
-rw-r--r--  fs/xfs/xfs_dquot.h | 2
-rw-r--r--  fs/xfs/xfs_export.c | 6
-rw-r--r--  fs/xfs/xfs_file.c | 81
-rw-r--r--  fs/xfs/xfs_fsops.c | 40
-rw-r--r--  fs/xfs/xfs_inode.c | 136
-rw-r--r--  fs/xfs/xfs_inode.h | 11
-rw-r--r--  fs/xfs/xfs_ioctl.c | 510
-rw-r--r--  fs/xfs/xfs_ioctl32.c | 2
-rw-r--r--  fs/xfs/xfs_iomap.c | 2
-rw-r--r--  fs/xfs/xfs_iomap.h | 2
-rw-r--r--  fs/xfs/xfs_iops.c | 34
-rw-r--r--  fs/xfs/xfs_iops.h | 1
-rw-r--r--  fs/xfs/xfs_log.c | 28
-rw-r--r--  fs/xfs/xfs_mount.c | 107
-rw-r--r--  fs/xfs/xfs_mount.h | 16
-rw-r--r--  fs/xfs/xfs_pnfs.c | 322
-rw-r--r--  fs/xfs/xfs_pnfs.h | 18
-rw-r--r--  fs/xfs/xfs_qm.c | 55
-rw-r--r--  fs/xfs/xfs_qm.h | 5
-rw-r--r--  fs/xfs/xfs_qm_syscalls.c | 244
-rw-r--r--  fs/xfs/xfs_quotaops.c | 67
-rw-r--r--  fs/xfs/xfs_super.c | 27
-rw-r--r--  fs/xfs/xfs_sysctl.c | 18
-rw-r--r--  fs/xfs/xfs_trans.c | 1
-rw-r--r--  fs/xfs/xfs_trans_buf.c | 5
396 files changed, 15379 insertions, 6965 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 6894b085f0ee..620d93489539 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -335,7 +335,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 	}
 	init_rwsem(&v9ses->rename_sem);
 
-	rc = bdi_setup_and_register(&v9ses->bdi, "9p", BDI_CAP_MAP_COPY);
+	rc = bdi_setup_and_register(&v9ses->bdi, "9p");
 	if (rc) {
 		kfree(v9ses->aname);
 		kfree(v9ses->uname);
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 5594505e6e73..b40133796b87 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -831,7 +831,6 @@ static const struct vm_operations_struct v9fs_file_vm_ops = {
 	.fault = filemap_fault,
 	.map_pages = filemap_map_pages,
 	.page_mkwrite = v9fs_vm_page_mkwrite,
-	.remap_pages = generic_file_remap_pages,
 };
 
 static const struct vm_operations_struct v9fs_mmap_file_vm_ops = {
@@ -839,7 +838,6 @@ static const struct vm_operations_struct v9fs_mmap_file_vm_ops = {
 	.fault = filemap_fault,
 	.map_pages = filemap_map_pages,
 	.page_mkwrite = v9fs_vm_page_mkwrite,
-	.remap_pages = generic_file_remap_pages,
 };
 
 
diff --git a/fs/Kconfig b/fs/Kconfig
index 664991afe0c0..ec35851e5b71 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -13,13 +13,6 @@ if BLOCK
 source "fs/ext2/Kconfig"
 source "fs/ext3/Kconfig"
 source "fs/ext4/Kconfig"
-
-config FS_XIP
-# execute in place
-	bool
-	depends on EXT2_FS_XIP
-	default y
-
 source "fs/jbd/Kconfig"
 source "fs/jbd2/Kconfig"
 
@@ -40,6 +33,21 @@ source "fs/ocfs2/Kconfig"
 source "fs/btrfs/Kconfig"
 source "fs/nilfs2/Kconfig"
 
+config FS_DAX
+	bool "Direct Access (DAX) support"
+	depends on MMU
+	depends on !(ARM || MIPS || SPARC)
+	help
+	  Direct Access (DAX) can be used on memory-backed block devices.
+	  If the block device supports DAX and the filesystem supports DAX,
+	  then you can avoid using the pagecache to buffer I/Os.  Turning
+	  on this option will compile in support for DAX; you will need to
+	  mount the filesystem using the -o dax option.
+
+	  If you do not have a block device that is capable of using this,
+	  or if unsure, say N.  Saying Y will increase the size of the kernel
+	  by about 5kB.
+
 endif # BLOCK
 
 # Posix ACL utility routines
@@ -165,6 +173,7 @@ config HUGETLB_PAGE
 	def_bool HUGETLBFS
 
 source "fs/configfs/Kconfig"
+source "fs/efivarfs/Kconfig"
 
 endmenu
 
@@ -209,7 +218,6 @@ source "fs/sysv/Kconfig"
 source "fs/ufs/Kconfig"
 source "fs/exofs/Kconfig"
 source "fs/f2fs/Kconfig"
-source "fs/efivarfs/Kconfig"
 
 endif # MISC_FILESYSTEMS
 
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index c055d56ec63d..270c48148f79 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -149,13 +149,6 @@ config BINFMT_EM86
 	  later load the module when you want to use a Linux/Intel binary. The
 	  module will be called binfmt_em86. If unsure, say Y.
 
-config BINFMT_SOM
-	tristate "Kernel support for SOM binaries"
-	depends on PARISC && HPUX
-	help
-	  SOM is a binary executable format inherited from HP/UX. Say
-	  Y here to be able to load and execute SOM binaries directly.
-
 config BINFMT_MISC
 	tristate "Kernel support for MISC binaries"
 	---help---
diff --git a/fs/Makefile b/fs/Makefile
index bedff48e8fdc..a88ac4838c9e 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -28,6 +28,7 @@ obj-$(CONFIG_SIGNALFD) += signalfd.o
 obj-$(CONFIG_TIMERFD)		+= timerfd.o
 obj-$(CONFIG_EVENTFD)		+= eventfd.o
 obj-$(CONFIG_AIO)		+= aio.o
+obj-$(CONFIG_FS_DAX)		+= dax.o
 obj-$(CONFIG_FILE_LOCKING)	+= locks.o
 obj-$(CONFIG_COMPAT)		+= compat.o compat_ioctl.o
 obj-$(CONFIG_BINFMT_AOUT)	+= binfmt_aout.o
@@ -37,7 +38,6 @@ obj-$(CONFIG_BINFMT_SCRIPT) += binfmt_script.o
 obj-$(CONFIG_BINFMT_ELF)	+= binfmt_elf.o
 obj-$(CONFIG_COMPAT_BINFMT_ELF)	+= compat_binfmt_elf.o
 obj-$(CONFIG_BINFMT_ELF_FDPIC)	+= binfmt_elf_fdpic.o
-obj-$(CONFIG_BINFMT_SOM)	+= binfmt_som.o
 obj-$(CONFIG_BINFMT_FLAT)	+= binfmt_flat.o
 
 obj-$(CONFIG_FS_MBCACHE)	+= mbcache.o
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index ff44ff3ff015..c8764bd7497d 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -30,6 +30,8 @@
 #define AFFS_AC_SIZE	(AFFS_CACHE_SIZE/sizeof(struct affs_ext_key)/2)
 #define AFFS_AC_MASK	(AFFS_AC_SIZE-1)
 
+#define AFFSNAMEMAX 30U
+
 struct affs_ext_key {
 	u32	ext;	/* idx of the extended block */
 	u32	key;	/* block number */
diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c
index c852f2fa1710..388da1ea815d 100644
--- a/fs/affs/amigaffs.c
+++ b/fs/affs/amigaffs.c
@@ -30,7 +30,7 @@ affs_insert_hash(struct inode *dir, struct buffer_head *bh)
 	ino = bh->b_blocknr;
 	offset = affs_hash_name(sb, AFFS_TAIL(sb, bh)->name + 1, AFFS_TAIL(sb, bh)->name[0]);
 
-	pr_debug("%s(dir=%u, ino=%d)\n", __func__, (u32)dir->i_ino, ino);
+	pr_debug("%s(dir=%lu, ino=%d)\n", __func__, dir->i_ino, ino);
 
 	dir_bh = affs_bread(sb, dir->i_ino);
 	if (!dir_bh)
@@ -80,8 +80,8 @@ affs_remove_hash(struct inode *dir, struct buffer_head *rem_bh)
 	sb = dir->i_sb;
 	rem_ino = rem_bh->b_blocknr;
 	offset = affs_hash_name(sb, AFFS_TAIL(sb, rem_bh)->name+1, AFFS_TAIL(sb, rem_bh)->name[0]);
-	pr_debug("%s(dir=%d, ino=%d, hashval=%d)\n",
-		__func__, (u32)dir->i_ino, rem_ino, offset);
+	pr_debug("%s(dir=%lu, ino=%d, hashval=%d)\n", __func__, dir->i_ino,
+		 rem_ino, offset);
 
 	bh = affs_bread(sb, dir->i_ino);
 	if (!bh)
@@ -483,11 +483,10 @@ affs_check_name(const unsigned char *name, int len, bool notruncate)
 {
 	int	 i;
 
-	if (len > 30) {
+	if (len > AFFSNAMEMAX) {
 		if (notruncate)
 			return -ENAMETOOLONG;
-		else
-			len = 30;
+		len = AFFSNAMEMAX;
 	}
 	for (i = 0; i < len; i++) {
 		if (name[i] < ' ' || name[i] == ':'
@@ -508,7 +507,7 @@ affs_check_name(const unsigned char *name, int len, bool notruncate)
 int
 affs_copy_name(unsigned char *bstr, struct dentry *dentry)
 {
-	int len = min(dentry->d_name.len, 30u);
+	u32 len = min(dentry->d_name.len, AFFSNAMEMAX);
 
 	*bstr++ = len;
 	memcpy(bstr, dentry->d_name.name, len);
diff --git a/fs/affs/bitmap.c b/fs/affs/bitmap.c
index c8de51185c23..675148950fed 100644
--- a/fs/affs/bitmap.c
+++ b/fs/affs/bitmap.c
@@ -99,7 +99,6 @@ err_bh_read:
 
 err_range:
 	affs_error(sb, "affs_free_block","Block %u outside partition", block);
-	return;
 }
 
 /*
diff --git a/fs/affs/dir.c b/fs/affs/dir.c
index 59f07bec92a6..ac4f318aafba 100644
--- a/fs/affs/dir.c
+++ b/fs/affs/dir.c
@@ -54,8 +54,7 @@ affs_readdir(struct file *file, struct dir_context *ctx)
 	u32			 ino;
 	int			 error = 0;
 
-	pr_debug("%s(ino=%lu,f_pos=%lx)\n",
-		 __func__, inode->i_ino, (unsigned long)ctx->pos);
+	pr_debug("%s(ino=%lu,f_pos=%llx)\n", __func__, inode->i_ino, ctx->pos);
 
 	if (ctx->pos < 2) {
 		file->private_data = (void *)0;
@@ -115,11 +114,11 @@ inside:
 			break;
 		}
 
-		namelen = min(AFFS_TAIL(sb, fh_bh)->name[0], (u8)30);
+		namelen = min(AFFS_TAIL(sb, fh_bh)->name[0],
+			      (u8)AFFSNAMEMAX);
 		name = AFFS_TAIL(sb, fh_bh)->name + 1;
-		pr_debug("readdir(): dir_emit(\"%.*s\", "
-			 "ino=%u), hash=%d, f_pos=%x\n",
-			 namelen, name, ino, hash_pos, (u32)ctx->pos);
+		pr_debug("readdir(): dir_emit(\"%.*s\", ino=%u), hash=%d, f_pos=%llx\n",
+			 namelen, name, ino, hash_pos, ctx->pos);
 
 		if (!dir_emit(ctx, name, namelen, ino, DT_UNKNOWN))
 			goto done;
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 8faa6593ca6d..d2468bf95669 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -180,8 +180,7 @@ affs_get_extblock_slow(struct inode *inode, u32 ext)
 		ext_key = be32_to_cpu(AFFS_TAIL(sb, bh)->extension);
 		if (ext < AFFS_I(inode)->i_extcnt)
 			goto read_ext;
-		if (ext > AFFS_I(inode)->i_extcnt)
-			BUG();
+		BUG_ON(ext > AFFS_I(inode)->i_extcnt);
 		bh = affs_alloc_extblock(inode, bh, ext);
 		if (IS_ERR(bh))
 			return bh;
@@ -198,8 +197,7 @@ affs_get_extblock_slow(struct inode *inode, u32 ext)
 		struct buffer_head *prev_bh;
 
 		/* allocate a new extended block */
-		if (ext > AFFS_I(inode)->i_extcnt)
-			BUG();
+		BUG_ON(ext > AFFS_I(inode)->i_extcnt);
 
 		/* get previous extended block */
 		prev_bh = affs_get_extblock(inode, ext - 1);
@@ -299,8 +297,8 @@ affs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh_resul
 	struct buffer_head	*ext_bh;
 	u32			 ext;
 
-	pr_debug("%s(%u, %lu)\n",
-		 __func__, (u32)inode->i_ino, (unsigned long)block);
+	pr_debug("%s(%lu, %llu)\n", __func__, inode->i_ino,
+		 (unsigned long long)block);
 
 	BUG_ON(block > (sector_t)0x7fffffffUL);
 
@@ -330,8 +328,9 @@ affs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh_resul
 
 		/* store new block */
 		if (bh_result->b_blocknr)
-			affs_warning(sb, "get_block", "block already set (%lx)",
-				     (unsigned long)bh_result->b_blocknr);
+			affs_warning(sb, "get_block",
+				     "block already set (%llx)",
+				     (unsigned long long)bh_result->b_blocknr);
 		AFFS_BLOCK(sb, ext_bh, block) = cpu_to_be32(blocknr);
 		AFFS_HEAD(ext_bh)->block_count = cpu_to_be32(block + 1);
 		affs_adjust_checksum(ext_bh, blocknr - bh_result->b_blocknr + 1);
@@ -353,8 +352,8 @@ affs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh_resul
 	return 0;
 
 err_big:
-	affs_error(inode->i_sb, "get_block", "strange block request %d",
-		   (int)block);
+	affs_error(inode->i_sb, "get_block", "strange block request %llu",
+		   (unsigned long long)block);
 	return -EIO;
 err_ext:
 	// unlock cache
@@ -399,6 +398,13 @@ affs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter,
 	size_t count = iov_iter_count(iter);
 	ssize_t ret;
 
+	if (rw == WRITE) {
+		loff_t size = offset + count;
+
+		if (AFFS_I(inode)->mmu_private < size)
+			return 0;
+	}
+
 	ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, affs_get_block);
 	if (ret < 0 && (rw & WRITE))
 		affs_write_failed(mapping, offset + count);
@@ -503,7 +509,7 @@ affs_do_readpage_ofs(struct page *page, unsigned to)
 	u32 bidx, boff, bsize;
 	u32 tmp;
 
-	pr_debug("%s(%u, %ld, 0, %d)\n", __func__, (u32)inode->i_ino,
+	pr_debug("%s(%lu, %ld, 0, %d)\n", __func__, inode->i_ino,
 		 page->index, to);
 	BUG_ON(to > PAGE_CACHE_SIZE);
 	kmap(page);
@@ -539,7 +545,7 @@ affs_extent_file_ofs(struct inode *inode, u32 newsize)
 	u32 size, bsize;
 	u32 tmp;
 
-	pr_debug("%s(%u, %d)\n", __func__, (u32)inode->i_ino, newsize);
+	pr_debug("%s(%lu, %d)\n", __func__, inode->i_ino, newsize);
 	bsize = AFFS_SB(sb)->s_data_blksize;
 	bh = NULL;
 	size = AFFS_I(inode)->mmu_private;
@@ -608,7 +614,7 @@ affs_readpage_ofs(struct file *file, struct page *page)
 	u32 to;
 	int err;
 
-	pr_debug("%s(%u, %ld)\n", __func__, (u32)inode->i_ino, page->index);
+	pr_debug("%s(%lu, %ld)\n", __func__, inode->i_ino, page->index);
 	to = PAGE_CACHE_SIZE;
 	if (((page->index + 1) << PAGE_CACHE_SHIFT) > inode->i_size) {
 		to = inode->i_size & ~PAGE_CACHE_MASK;
@@ -631,8 +637,8 @@ static int affs_write_begin_ofs(struct file *file, struct address_space *mapping
 	pgoff_t index;
 	int err = 0;
 
-	pr_debug("%s(%u, %llu, %llu)\n", __func__, (u32)inode->i_ino,
-		 (unsigned long long)pos, (unsigned long long)pos + len);
+	pr_debug("%s(%lu, %llu, %llu)\n", __func__, inode->i_ino, pos,
+		 pos + len);
 	if (pos > AFFS_I(inode)->mmu_private) {
 		/* XXX: this probably leaves a too-big i_size in case of
 		 * failure. Should really be updating i_size at write_end time
@@ -681,9 +687,8 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping,
 	 * due to write_begin.
 	 */
 
-	pr_debug("%s(%u, %llu, %llu)\n",
-		 __func__, (u32)inode->i_ino, (unsigned long long)pos,
-		 (unsigned long long)pos + len);
+	pr_debug("%s(%lu, %llu, %llu)\n", __func__, inode->i_ino, pos,
+		 pos + len);
 	bsize = AFFS_SB(sb)->s_data_blksize;
 	data = page_address(page);
 
@@ -831,8 +836,8 @@ affs_truncate(struct inode *inode)
 	struct buffer_head *ext_bh;
 	int i;
 
-	pr_debug("truncate(inode=%d, oldsize=%u, newsize=%u)\n",
-		 (u32)inode->i_ino, (u32)AFFS_I(inode)->mmu_private, (u32)inode->i_size);
+	pr_debug("truncate(inode=%lu, oldsize=%llu, newsize=%llu)\n",
+		 inode->i_ino, AFFS_I(inode)->mmu_private, inode->i_size);
 
 	last_blk = 0;
 	ext = 0;
@@ -863,7 +868,7 @@ affs_truncate(struct inode *inode)
 	if (IS_ERR(ext_bh)) {
 		affs_warning(sb, "truncate",
 			     "unexpected read error for ext block %u (%ld)",
-			     (unsigned int)ext, PTR_ERR(ext_bh));
+			     ext, PTR_ERR(ext_bh));
 		return;
 	}
 	if (AFFS_I(inode)->i_lc) {
@@ -911,7 +916,7 @@ affs_truncate(struct inode *inode)
 	if (IS_ERR(bh)) {
 		affs_warning(sb, "truncate",
 			     "unexpected read error for last block %u (%ld)",
-			     (unsigned int)ext, PTR_ERR(bh));
+			     ext, PTR_ERR(bh));
 		return;
 	}
 	tmp = be32_to_cpu(AFFS_DATA_HEAD(bh)->next);
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index d0609a282e1d..6f34510449e8 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -13,8 +13,6 @@
 #include <linux/gfp.h>
 #include "affs.h"
 
-extern const struct inode_operations affs_symlink_inode_operations;
-
 struct inode *affs_iget(struct super_block *sb, unsigned long ino)
 {
 	struct affs_sb_info	*sbi = AFFS_SB(sb);
@@ -348,9 +346,8 @@ affs_add_entry(struct inode *dir, struct inode *inode, struct dentry *dentry, s3
 	u32 block = 0;
 	int retval;
 
-	pr_debug("%s(dir=%u, inode=%u, \"%pd\", type=%d)\n",
-		 __func__, (u32)dir->i_ino,
-		 (u32)inode->i_ino, dentry, type);
+	pr_debug("%s(dir=%lu, inode=%lu, \"%pd\", type=%d)\n", __func__,
+		 dir->i_ino, inode->i_ino, dentry, type);
 
 	retval = -EIO;
 	bh = affs_bread(sb, inode->i_ino);
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index bbc38530e924..ffb7bd82c2a5 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -64,15 +64,16 @@ __affs_hash_dentry(struct qstr *qstr, toupper_t toupper, bool notruncate)
 {
 	const u8 *name = qstr->name;
 	unsigned long hash;
-	int i;
+	int retval;
+	u32 len;
 
-	i = affs_check_name(qstr->name, qstr->len, notruncate);
-	if (i)
-		return i;
+	retval = affs_check_name(qstr->name, qstr->len, notruncate);
+	if (retval)
+		return retval;
 
 	hash = init_name_hash();
-	i = min(qstr->len, 30u);
-	for (; i > 0; name++, i--)
+	len = min(qstr->len, AFFSNAMEMAX);
+	for (; len > 0; name++, len--)
 		hash = partial_name_hash(toupper(*name), hash);
 	qstr->hash = end_name_hash(hash);
 
@@ -114,10 +115,10 @@ static inline int __affs_compare_dentry(unsigned int len,
 	 * If the names are longer than the allowed 30 chars,
 	 * the excess is ignored, so their length may differ.
 	 */
-	if (len >= 30) {
-		if (name->len < 30)
+	if (len >= AFFSNAMEMAX) {
+		if (name->len < AFFSNAMEMAX)
 			return 1;
-		len = 30;
+		len = AFFSNAMEMAX;
 	} else if (len != name->len)
 		return 1;
 
@@ -156,10 +157,10 @@ affs_match(struct dentry *dentry, const u8 *name2, toupper_t toupper)
 	const u8 *name = dentry->d_name.name;
 	int len = dentry->d_name.len;
 
-	if (len >= 30) {
-		if (*name2 < 30)
+	if (len >= AFFSNAMEMAX) {
+		if (*name2 < AFFSNAMEMAX)
 			return 0;
-		len = 30;
+		len = AFFSNAMEMAX;
 	} else if (len != *name2)
 		return 0;
 
@@ -173,9 +174,9 @@ int
 affs_hash_name(struct super_block *sb, const u8 *name, unsigned int len)
 {
 	toupper_t toupper = affs_get_toupper(sb);
-	int hash;
+	u32 hash;
 
-	hash = len = min(len, 30u);
+	hash = len = min(len, AFFSNAMEMAX);
 	for (; len > 0; len--)
 		hash = (hash * 13 + toupper(*name++)) & 0x7ff;
 
@@ -248,9 +249,8 @@ affs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
 int
 affs_unlink(struct inode *dir, struct dentry *dentry)
 {
-	pr_debug("%s(dir=%d, %lu \"%pd\")\n",
-		 __func__, (u32)dir->i_ino, dentry->d_inode->i_ino,
-		 dentry);
+	pr_debug("%s(dir=%lu, %lu \"%pd\")\n", __func__, dir->i_ino,
+		 dentry->d_inode->i_ino, dentry);
 
 	return affs_remove_header(dentry);
 }
@@ -317,9 +317,8 @@ affs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 int
 affs_rmdir(struct inode *dir, struct dentry *dentry)
 {
-	pr_debug("%s(dir=%u, %lu \"%pd\")\n",
-		 __func__, (u32)dir->i_ino, dentry->d_inode->i_ino,
-		 dentry);
+	pr_debug("%s(dir=%lu, %lu \"%pd\")\n", __func__, dir->i_ino,
+		 dentry->d_inode->i_ino, dentry);
 
 	return affs_remove_header(dentry);
 }
@@ -404,8 +403,7 @@ affs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
 {
 	struct inode *inode = old_dentry->d_inode;
 
-	pr_debug("%s(%u, %u, \"%pd\")\n",
-		 __func__, (u32)inode->i_ino, (u32)dir->i_ino,
+	pr_debug("%s(%lu, %lu, \"%pd\")\n", __func__, inode->i_ino, dir->i_ino,
 		 dentry);
 
 	return affs_add_entry(dir, inode, dentry, ST_LINKFILE);
@@ -419,9 +417,8 @@ affs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	struct buffer_head *bh = NULL;
 	int retval;
 
-	pr_debug("%s(old=%u,\"%pd\" to new=%u,\"%pd\")\n",
-		 __func__, (u32)old_dir->i_ino, old_dentry,
-		 (u32)new_dir->i_ino, new_dentry);
+	pr_debug("%s(old=%lu,\"%pd\" to new=%lu,\"%pd\")\n", __func__,
+		 old_dir->i_ino, old_dentry, new_dir->i_ino, new_dentry);
 
 	retval = affs_check_name(new_dentry->d_name.name,
 				 new_dentry->d_name.len,
diff --git a/fs/affs/super.c b/fs/affs/super.c
index f754ab68a840..4cf0e9113fb6 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -432,39 +432,39 @@ got_root:
 		sb->s_flags |= MS_RDONLY;
 	}
 	switch (chksum) {
-		case MUFS_FS:
-		case MUFS_INTLFFS:
-		case MUFS_DCFFS:
-			sbi->s_flags |= SF_MUFS;
-			/* fall thru */
-		case FS_INTLFFS:
-		case FS_DCFFS:
-			sbi->s_flags |= SF_INTL;
-			break;
-		case MUFS_FFS:
-			sbi->s_flags |= SF_MUFS;
-			break;
-		case FS_FFS:
-			break;
-		case MUFS_OFS:
-			sbi->s_flags |= SF_MUFS;
-			/* fall thru */
-		case FS_OFS:
-			sbi->s_flags |= SF_OFS;
-			sb->s_flags |= MS_NOEXEC;
-			break;
-		case MUFS_DCOFS:
-		case MUFS_INTLOFS:
-			sbi->s_flags |= SF_MUFS;
-		case FS_DCOFS:
-		case FS_INTLOFS:
-			sbi->s_flags |= SF_INTL | SF_OFS;
-			sb->s_flags |= MS_NOEXEC;
-			break;
-		default:
-			pr_err("Unknown filesystem on device %s: %08X\n",
-			       sb->s_id, chksum);
-			return -EINVAL;
+	case MUFS_FS:
+	case MUFS_INTLFFS:
+	case MUFS_DCFFS:
+		sbi->s_flags |= SF_MUFS;
+		/* fall thru */
+	case FS_INTLFFS:
+	case FS_DCFFS:
+		sbi->s_flags |= SF_INTL;
+		break;
+	case MUFS_FFS:
+		sbi->s_flags |= SF_MUFS;
+		break;
+	case FS_FFS:
+		break;
+	case MUFS_OFS:
+		sbi->s_flags |= SF_MUFS;
+		/* fall thru */
+	case FS_OFS:
+		sbi->s_flags |= SF_OFS;
+		sb->s_flags |= MS_NOEXEC;
+		break;
+	case MUFS_DCOFS:
+	case MUFS_INTLOFS:
+		sbi->s_flags |= SF_MUFS;
+	case FS_DCOFS:
+	case FS_INTLOFS:
+		sbi->s_flags |= SF_INTL | SF_OFS;
+		sb->s_flags |= MS_NOEXEC;
+		break;
+	default:
+		pr_err("Unknown filesystem on device %s: %08X\n",
+		       sb->s_id, chksum);
+		return -EINVAL;
 	}
 
 	if (mount_flags & SF_VERBOSE) {
@@ -584,7 +584,7 @@ affs_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_bavail  = free;
 	buf->f_fsid.val[0] = (u32)id;
 	buf->f_fsid.val[1] = (u32)(id >> 32);
-	buf->f_namelen = 30;
+	buf->f_namelen = AFFSNAMEMAX;
 	return 0;
 }
 
@@ -602,6 +602,7 @@ static void affs_kill_sb(struct super_block *sb)
 		affs_free_bitmap(sb);
 		affs_brelse(sbi->s_root_bh);
 		kfree(sbi->s_prefix);
+		mutex_destroy(&sbi->s_bmlock);
 		kfree(sbi);
 	}
 }
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 06e14bfb3496..dbc732e9a5c0 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -306,8 +306,8 @@ static int afs_send_pages(struct afs_call *call, struct msghdr *msg,
 
 		_debug("- range %u-%u%s",
 		       offset, to, msg->msg_flags ? " [more]" : "");
-		iov_iter_init(&msg->msg_iter, WRITE,
-			      (struct iovec *) iov, 1, to - offset);
+		iov_iter_kvec(&msg->msg_iter, WRITE | ITER_KVEC,
+			      iov, 1, to - offset);
 
 		/* have to change the state *before* sending the last
 		 * packet as RxRPC might give us the reply before it
@@ -384,7 +384,7 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp,
 
 	msg.msg_name		= NULL;
 	msg.msg_namelen		= 0;
-	iov_iter_init(&msg.msg_iter, WRITE, (struct iovec *)iov, 1,
+	iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, iov, 1,
 		      call->request_size);
 	msg.msg_control		= NULL;
 	msg.msg_controllen	= 0;
@@ -770,7 +770,7 @@ static int afs_deliver_cm_op_id(struct afs_call *call, struct sk_buff *skb,
 void afs_send_empty_reply(struct afs_call *call)
 {
 	struct msghdr msg;
-	struct iovec iov[1];
+	struct kvec iov[1];
 
 	_enter("");
 
@@ -778,7 +778,7 @@ void afs_send_empty_reply(struct afs_call *call)
 	iov[0].iov_len		= 0;
 	msg.msg_name		= NULL;
 	msg.msg_namelen		= 0;
-	iov_iter_init(&msg.msg_iter, WRITE, iov, 0, 0);	/* WTF? */
+	iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, iov, 0, 0);	/* WTF? */
 	msg.msg_control		= NULL;
 	msg.msg_controllen	= 0;
 	msg.msg_flags		= 0;
@@ -805,7 +805,7 @@ void afs_send_empty_reply(struct afs_call *call)
 void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len)
 {
 	struct msghdr msg;
-	struct iovec iov[1];
+	struct kvec iov[1];
 	int n;
 
 	_enter("");
@@ -814,7 +814,7 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len)
 	iov[0].iov_len		= len;
 	msg.msg_name		= NULL;
 	msg.msg_namelen		= 0;
-	iov_iter_init(&msg.msg_iter, WRITE, iov, 1, len);
+	iov_iter_kvec(&msg.msg_iter, WRITE | ITER_KVEC, iov, 1, len);
 	msg.msg_control		= NULL;
 	msg.msg_controllen	= 0;
 	msg.msg_flags		= 0;
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index 2b607257820c..d142a2449e65 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -106,7 +106,7 @@ struct afs_volume *afs_volume_lookup(struct afs_mount_params *params)
 	volume->cell		= params->cell;
 	volume->vid		= vlocation->vldb.vid[params->type];
 
-	ret = bdi_setup_and_register(&volume->bdi, "afs", BDI_CAP_MAP_COPY);
+	ret = bdi_setup_and_register(&volume->bdi, "afs");
 	if (ret)
 		goto error_bdi;
 
diff --git a/fs/aio.c b/fs/aio.c
index 1b7893ecc296..118a2e0088d8 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -165,15 +165,6 @@ static struct vfsmount *aio_mnt;
 static const struct file_operations aio_ring_fops;
 static const struct address_space_operations aio_ctx_aops;
 
-/* Backing dev info for aio fs.
- * -no dirty page accounting or writeback happens
- */
-static struct backing_dev_info aio_fs_backing_dev_info = {
-	.name		= "aiofs",
-	.state		= 0,
-	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_MAP_COPY,
-};
-
 static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages)
 {
 	struct qstr this = QSTR_INIT("[aio]", 5);
@@ -185,7 +176,6 @@ static struct file *aio_private_file(struct kioctx *ctx, loff_t nr_pages)
 
 	inode->i_mapping->a_ops = &aio_ctx_aops;
 	inode->i_mapping->private_data = ctx;
-	inode->i_mapping->backing_dev_info = &aio_fs_backing_dev_info;
 	inode->i_size = PAGE_SIZE * nr_pages;
 
 	path.dentry = d_alloc_pseudo(aio_mnt->mnt_sb, &this);
@@ -230,9 +220,6 @@ static int __init aio_setup(void)
 	if (IS_ERR(aio_mnt))
 		panic("Failed to create aio fs mount.");
 
-	if (bdi_init(&aio_fs_backing_dev_info))
-		panic("Failed to init aio fs backing dev info.");
-
 	kiocb_cachep = KMEM_CACHE(kiocb, SLAB_HWCACHE_ALIGN|SLAB_PANIC);
 	kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC);
 
@@ -1140,6 +1127,13 @@ static long aio_read_events_ring(struct kioctx *ctx,
 	long ret = 0;
 	int copy_ret;
 
+	/*
+	 * The mutex can block and wake us up and that will cause
+	 * wait_event_interruptible_hrtimeout() to schedule without sleeping
+	 * and repeat. This should be rare enough that it doesn't cause
+	 * peformance issues. See the comment in read_events() for more detail.
+	 */
+	sched_annotate_sleep();
 	mutex_lock(&ctx->ring_lock);
 
 	/* Access to ->ring_pages here is protected by ctx->ring_lock. */
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index edf47774b03d..e089f1985fca 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -274,9 +274,9 @@ more:
 static struct inode *
 befs_alloc_inode(struct super_block *sb)
 {
 	struct befs_inode_info *bi;
-	bi = (struct befs_inode_info *)kmem_cache_alloc(befs_inode_cachep,
-							GFP_KERNEL);
+
+	bi = kmem_cache_alloc(befs_inode_cachep, GFP_KERNEL);
 	if (!bi)
 		return NULL;
 	return &bi->vfs_inode;
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 02b16910f4c9..995986b8e36b 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -645,11 +645,12 @@ out:
 
 static unsigned long randomize_stack_top(unsigned long stack_top)
 {
-	unsigned int random_variable = 0;
+	unsigned long random_variable = 0;
 
 	if ((current->flags & PF_RANDOMIZE) &&
 		!(current->personality & ADDR_NO_RANDOMIZE)) {
-		random_variable = get_random_int() & STACK_RND_MASK;
+		random_variable = (unsigned long) get_random_int();
+		random_variable &= STACK_RND_MASK;
 		random_variable <<= PAGE_SHIFT;
 	}
 #ifdef CONFIG_STACK_GROWSUP
diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c
deleted file mode 100644
index 4e00ed68d4a6..000000000000
--- a/fs/binfmt_som.c
+++ /dev/null
@@ -1,299 +0,0 @@
-/*
- * linux/fs/binfmt_som.c
- *
- * These are the functions used to load SOM format executables as used
- * by HP-UX.
- *
- * Copyright 1999 Matthew Wilcox <willy@bofh.ai>
- * based on binfmt_elf which is
- * Copyright 1993, 1994: Eric Youngdale (ericy@cais.com).
- */
-
-#include <linux/module.h>
-
-#include <linux/fs.h>
-#include <linux/stat.h>
-#include <linux/sched.h>
-#include <linux/mm.h>
-#include <linux/mman.h>
-#include <linux/errno.h>
-#include <linux/signal.h>
-#include <linux/binfmts.h>
-#include <linux/som.h>
-#include <linux/string.h>
-#include <linux/file.h>
-#include <linux/fcntl.h>
-#include <linux/ptrace.h>
-#include <linux/slab.h>
-#include <linux/shm.h>
-#include <linux/personality.h>
-#include <linux/init.h>
-
-#include <asm/uaccess.h>
-#include <asm/pgtable.h>
-
-
-#include <linux/elf.h>
-
-static int load_som_binary(struct linux_binprm * bprm);
-static int load_som_library(struct file *);
-
-/*
- * If we don't support core dumping, then supply a NULL so we
- * don't even try.
- */
-#if 0
-static int som_core_dump(struct coredump_params *cprm);
-#else
-#define som_core_dump	NULL
-#endif
-
-#define SOM_PAGESTART(_v) ((_v) & ~(unsigned long)(SOM_PAGESIZE-1))
-#define SOM_PAGEOFFSET(_v) ((_v) & (SOM_PAGESIZE-1))
-#define SOM_PAGEALIGN(_v) (((_v) + SOM_PAGESIZE - 1) & ~(SOM_PAGESIZE - 1))
-
-static struct linux_binfmt som_format = {
-	.module		= THIS_MODULE,
-	.load_binary	= load_som_binary,
-	.load_shlib	= load_som_library,
-	.core_dump	= som_core_dump,
-	.min_coredump	= SOM_PAGESIZE
-};
-
-/*
- * create_som_tables() parses the env- and arg-strings in new user
- * memory and creates the pointer tables from them, and puts their
- * addresses on the "stack", returning the new stack pointer value.
- */
-static void create_som_tables(struct linux_binprm *bprm)
-{
-	char **argv, **envp;
-	int argc = bprm->argc;
-	int envc = bprm->envc;
-	unsigned long p;
-	unsigned long *sp;
-
-	/* Word-align the stack pointer */
-	sp = (unsigned long *)((bprm->p + 3) & ~3);
-
-	envp = (char **) sp;
-	sp += envc + 1;
-	argv = (char **) sp;
-	sp += argc + 1;
-
-	__put_user((unsigned long) envp,++sp);
-	__put_user((unsigned long) argv,++sp);
-
-	__put_user(argc, ++sp);
-
-	bprm->p = (unsigned long) sp;
-
-	p = current->mm->arg_start;
-	while (argc-- > 0) {
-		__put_user((char *)p,argv++);
-		p += strlen_user((char *)p);
-	}
-	__put_user(NULL, argv);
-	current->mm->arg_end = current->mm->env_start = p;
-	while (envc-- > 0) {
-		__put_user((char *)p,envp++);
-		p += strlen_user((char *)p);
-	}
-	__put_user(NULL, envp);
-	current->mm->env_end = p;
-}
-
-static int check_som_header(struct som_hdr *som_ex)
-{
-	int *buf = (int *)som_ex;
-	int i, ck;
-
-	if (som_ex->system_id != SOM_SID_PARISC_1_0 &&
-	    som_ex->system_id != SOM_SID_PARISC_1_1 &&
-	    som_ex->system_id != SOM_SID_PARISC_2_0)
-		return -ENOEXEC;
-
-	if (som_ex->a_magic != SOM_EXEC_NONSHARE &&
-	    som_ex->a_magic != SOM_EXEC_SHARE &&
-	    som_ex->a_magic != SOM_EXEC_DEMAND)
-		return -ENOEXEC;
-
-	if (som_ex->version_id != SOM_ID_OLD &&
-	    som_ex->version_id != SOM_ID_NEW)
-		return -ENOEXEC;
-
-	ck = 0;
-	for (i=0; i<32; i++)
-		ck ^= buf[i];
-	if (ck != 0)
-		return -ENOEXEC;
-
-	return 0;
-}
-
-static int map_som_binary(struct file *file,
-	const struct som_exec_auxhdr *hpuxhdr)
-{
-	unsigned long code_start, code_size, data_start, data_size;
-	unsigned long bss_start, som_brk;
-	int retval;
-	int prot = PROT_READ | PROT_EXEC;
-	int flags = MAP_FIXED|MAP_PRIVATE|MAP_DENYWRITE|MAP_EXECUTABLE;
-
-	mm_segment_t old_fs = get_fs();
-	set_fs(get_ds());
-
-	code_start = SOM_PAGESTART(hpuxhdr->exec_tmem);
-	code_size = SOM_PAGEALIGN(hpuxhdr->exec_tsize);
-	current->mm->start_code = code_start;
-	current->mm->end_code = code_start + code_size;
-	retval = vm_mmap(file, code_start, code_size, prot,
-			flags, SOM_PAGESTART(hpuxhdr->exec_tfile));
-	if (retval < 0 && retval > -1024)
-		goto out;
-
-	data_start = SOM_PAGESTART(hpuxhdr->exec_dmem);
-	data_size = SOM_PAGEALIGN(hpuxhdr->exec_dsize);
-	current->mm->start_data = data_start;
-	current->mm->end_data = bss_start = data_start + data_size;
-	retval = vm_mmap(file, data_start, data_size,
-			prot | PROT_WRITE, flags,
-			SOM_PAGESTART(hpuxhdr->exec_dfile));
-	if (retval < 0 && retval > -1024)
-		goto out;
-
-	som_brk = bss_start + SOM_PAGEALIGN(hpuxhdr->exec_bsize);
-	current->mm->start_brk = current->mm->brk = som_brk;
-	retval = vm_mmap(NULL, bss_start, som_brk - bss_start,
-			prot | PROT_WRITE, MAP_FIXED | MAP_PRIVATE, 0);
-	if (retval > 0 || retval < -1024)
-		retval = 0;
-out:
-	set_fs(old_fs);
-	return retval;
-}
-
-
-/*
- * These are the functions used to load SOM executables and shared
- * libraries.  There is no binary dependent code anywhere else.
- */
-
-static int
-load_som_binary(struct linux_binprm * bprm)
-{
-	int retval;
-	unsigned int size;
-	unsigned long som_entry;
-	struct som_hdr *som_ex;
-	struct som_exec_auxhdr *hpuxhdr;
-	struct pt_regs *regs = current_pt_regs();
-
-	/* Get the exec-header */
-	som_ex = (struct som_hdr *) bprm->buf;
-
-	retval = check_som_header(som_ex);
-	if (retval != 0)
-		goto out;
-
-	/* Now read in the auxiliary header information */
-
-	retval = -ENOMEM;
-	size = som_ex->aux_header_size;
-	if (size > SOM_PAGESIZE)
-		goto out;
-	hpuxhdr = kmalloc(size, GFP_KERNEL);
-	if (!hpuxhdr)
-		goto out;
-
-	retval = kernel_read(bprm->file, som_ex->aux_header_location,
-			(char *) hpuxhdr, size);
-	if (retval != size) {
-		if (retval >= 0)
-			retval = -EIO;
-		goto out_free;
-	}
-
-	/* Flush all traces of the currently running executable */
-	retval = flush_old_exec(bprm);
-	if (retval)
-		goto out_free;
-
-	/* OK, This is the point of no return */
-	current->personality = PER_HPUX;
-	setup_new_exec(bprm);
-
-	/* Set the task size for HP-UX processes such that
-	 * the gateway page is outside the address space.
-	 * This can be fixed later, but for now, this is much
-	 * easier.
-	 */
-
-	current->thread.task_size = 0xc0000000;
-
-	/* Set map base to allow enough room for hp-ux heap growth */
-
-	current->thread.map_base = 0x80000000;
-
-	retval = map_som_binary(bprm->file, hpuxhdr);
-	if (retval < 0)
-		goto out_free;
-
-	som_entry = hpuxhdr->exec_entry;
-	kfree(hpuxhdr);
-
-	set_binfmt(&som_format);
-	install_exec_creds(bprm);
-	setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT);
-
-	create_som_tables(bprm);
-
-	current->mm->start_stack = bprm->p;
-
-#if 0
-	printk("(start_brk) %08lx\n" , (unsigned long) current->mm->start_brk);
-	printk("(end_code) %08lx\n" , (unsigned long) current->mm->end_code);
-	printk("(start_code) %08lx\n" , (unsigned long) current->mm->start_code);
-	printk("(end_data) %08lx\n" , (unsigned long) current->mm->end_data);
-	printk("(start_stack) %08lx\n" , (unsigned long) current->mm->start_stack);
-	printk("(brk) %08lx\n" , (unsigned long) current->mm->brk);
-#endif
-
-	map_hpux_gateway_page(current,current->mm);
-
-	start_thread_som(regs, som_entry, bprm->p);
-	return 0;
-
-	/* error cleanup */
-out_free:
-	kfree(hpuxhdr);
-out:
-	return retval;
-}
-
-static int load_som_library(struct file *f)
-{
-/* No lib support in SOM yet.  gizza chance.. */
-	return -ENOEXEC;
-}
-	/* Install the SOM loader.
-	 * N.B. We *rely* on the table being the right size with the
-	 * right number of free slots...
-	 */
-
-static int __init init_som_binfmt(void)
-{
-	register_binfmt(&som_format);
-	return 0;
-}
-
-static void __exit exit_som_binfmt(void)
-{
-	/* Remove the SOM loader. */
-	unregister_binfmt(&som_format);
-}
-
-core_initcall(init_som_binfmt);
-module_exit(exit_som_binfmt);
-
-MODULE_LICENSE("GPL");
diff --git a/fs/block_dev.c b/fs/block_dev.c
index b48c41bf0f86..975266be67d3 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -49,23 +49,15 @@ inline struct block_device *I_BDEV(struct inode *inode)
49} 49}
50EXPORT_SYMBOL(I_BDEV); 50EXPORT_SYMBOL(I_BDEV);
51 51
52/* 52static void bdev_write_inode(struct inode *inode)
53 * Move the inode from its current bdi to a new bdi. Make sure the inode
54 * is clean before moving so that it doesn't linger on the old bdi.
55 */
56static void bdev_inode_switch_bdi(struct inode *inode,
57 struct backing_dev_info *dst)
58{ 53{
59 while (true) { 54 spin_lock(&inode->i_lock);
60 spin_lock(&inode->i_lock); 55 while (inode->i_state & I_DIRTY) {
61 if (!(inode->i_state & I_DIRTY)) {
62 inode->i_data.backing_dev_info = dst;
63 spin_unlock(&inode->i_lock);
64 return;
65 }
66 spin_unlock(&inode->i_lock); 56 spin_unlock(&inode->i_lock);
67 WARN_ON_ONCE(write_inode_now(inode, true)); 57 WARN_ON_ONCE(write_inode_now(inode, true));
58 spin_lock(&inode->i_lock);
68 } 59 }
60 spin_unlock(&inode->i_lock);
69} 61}
70 62
71/* Kill _all_ buffers and pagecache , dirty or not.. */ 63/* Kill _all_ buffers and pagecache , dirty or not.. */
@@ -429,6 +421,46 @@ int bdev_write_page(struct block_device *bdev, sector_t sector,
429} 421}
430EXPORT_SYMBOL_GPL(bdev_write_page); 422EXPORT_SYMBOL_GPL(bdev_write_page);
431 423
424/**
425 * bdev_direct_access() - Get the address for directly-accessibly memory
426 * @bdev: The device containing the memory
427 * @sector: The offset within the device
428 * @addr: Where to put the address of the memory
429 * @pfn: The Page Frame Number for the memory
430 * @size: The number of bytes requested
431 *
432 * If a block device is made up of directly addressable memory, this function
433 * will tell the caller the PFN and the address of the memory. The address
434 * may be directly dereferenced within the kernel without the need to call
435 * ioremap(), kmap() or similar. The PFN is suitable for inserting into
436 * page tables.
437 *
438 * Return: negative errno if an error occurs, otherwise the number of bytes
439 * accessible at this address.
440 */
441long bdev_direct_access(struct block_device *bdev, sector_t sector,
442 void **addr, unsigned long *pfn, long size)
443{
444 long avail;
445 const struct block_device_operations *ops = bdev->bd_disk->fops;
446
447 if (size < 0)
448 return size;
449 if (!ops->direct_access)
450 return -EOPNOTSUPP;
451 if ((sector + DIV_ROUND_UP(size, 512)) >
452 part_nr_sects_read(bdev->bd_part))
453 return -ERANGE;
454 sector += get_start_sect(bdev);
455 if (sector % (PAGE_SIZE / 512))
456 return -EINVAL;
457 avail = ops->direct_access(bdev, sector, addr, pfn, size);
458 if (!avail)
459 return -ERANGE;
460 return min(avail, size);
461}
462EXPORT_SYMBOL_GPL(bdev_direct_access);
463
432/* 464/*
433 * pseudo-fs 465 * pseudo-fs
434 */ 466 */
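
A sketch of how a caller might use the new helper, loosely modelled on the DAX code this series introduces; the function name and the buffer_head-based addressing are assumptions for illustration:

#include <linux/blkdev.h>
#include <linux/buffer_head.h>

static long example_get_dax_addr(struct buffer_head *bh, void **addr,
				 unsigned blkbits)
{
	unsigned long pfn;
	sector_t sector = bh->b_blocknr << (blkbits - 9);

	/* Returns the number of directly addressable bytes, or -errno. */
	return bdev_direct_access(bh->b_bdev, sector, addr, &pfn, bh->b_size);
}
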
@@ -584,7 +616,6 @@ struct block_device *bdget(dev_t dev)
584 inode->i_bdev = bdev; 616 inode->i_bdev = bdev;
585 inode->i_data.a_ops = &def_blk_aops; 617 inode->i_data.a_ops = &def_blk_aops;
586 mapping_set_gfp_mask(&inode->i_data, GFP_USER); 618 mapping_set_gfp_mask(&inode->i_data, GFP_USER);
587 inode->i_data.backing_dev_info = &default_backing_dev_info;
588 spin_lock(&bdev_lock); 619 spin_lock(&bdev_lock);
589 list_add(&bdev->bd_list, &all_bdevs); 620 list_add(&bdev->bd_list, &all_bdevs);
590 spin_unlock(&bdev_lock); 621 spin_unlock(&bdev_lock);
@@ -1145,8 +1176,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1145 bdev->bd_queue = disk->queue; 1176 bdev->bd_queue = disk->queue;
1146 bdev->bd_contains = bdev; 1177 bdev->bd_contains = bdev;
1147 if (!partno) { 1178 if (!partno) {
1148 struct backing_dev_info *bdi;
1149
1150 ret = -ENXIO; 1179 ret = -ENXIO;
1151 bdev->bd_part = disk_get_part(disk, partno); 1180 bdev->bd_part = disk_get_part(disk, partno);
1152 if (!bdev->bd_part) 1181 if (!bdev->bd_part)
@@ -1172,11 +1201,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1172 } 1201 }
1173 } 1202 }
1174 1203
1175 if (!ret) { 1204 if (!ret)
1176 bd_set_size(bdev,(loff_t)get_capacity(disk)<<9); 1205 bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
1177 bdi = blk_get_backing_dev_info(bdev);
1178 bdev_inode_switch_bdi(bdev->bd_inode, bdi);
1179 }
1180 1206
1181 /* 1207 /*
1182 * If the device is invalidated, rescan partition 1208 * If the device is invalidated, rescan partition
@@ -1203,8 +1229,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1203 if (ret) 1229 if (ret)
1204 goto out_clear; 1230 goto out_clear;
1205 bdev->bd_contains = whole; 1231 bdev->bd_contains = whole;
1206 bdev_inode_switch_bdi(bdev->bd_inode,
1207 whole->bd_inode->i_data.backing_dev_info);
1208 bdev->bd_part = disk_get_part(disk, partno); 1232 bdev->bd_part = disk_get_part(disk, partno);
1209 if (!(disk->flags & GENHD_FL_UP) || 1233 if (!(disk->flags & GENHD_FL_UP) ||
1210 !bdev->bd_part || !bdev->bd_part->nr_sects) { 1234 !bdev->bd_part || !bdev->bd_part->nr_sects) {
@@ -1244,7 +1268,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
1244 bdev->bd_disk = NULL; 1268 bdev->bd_disk = NULL;
1245 bdev->bd_part = NULL; 1269 bdev->bd_part = NULL;
1246 bdev->bd_queue = NULL; 1270 bdev->bd_queue = NULL;
1247 bdev_inode_switch_bdi(bdev->bd_inode, &default_backing_dev_info);
1248 if (bdev != bdev->bd_contains) 1271 if (bdev != bdev->bd_contains)
1249 __blkdev_put(bdev->bd_contains, mode, 1); 1272 __blkdev_put(bdev->bd_contains, mode, 1);
1250 bdev->bd_contains = NULL; 1273 bdev->bd_contains = NULL;
@@ -1464,11 +1487,11 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part)
1464 WARN_ON_ONCE(bdev->bd_holders); 1487 WARN_ON_ONCE(bdev->bd_holders);
1465 sync_blockdev(bdev); 1488 sync_blockdev(bdev);
1466 kill_bdev(bdev); 1489 kill_bdev(bdev);
1467 /* ->release can cause the old bdi to disappear, 1490 /*
1468 * so must switch it out first 1491 * ->release can cause the queue to disappear, so flush all
 1492 * dirty data beforehand.
1469 */ 1493 */
1470 bdev_inode_switch_bdi(bdev->bd_inode, 1494 bdev_write_inode(bdev->bd_inode);
1471 &default_backing_dev_info);
1472 } 1495 }
1473 if (bdev->bd_contains == bdev) { 1496 if (bdev->bd_contains == bdev) {
1474 if (disk->fops->release) 1497 if (disk->fops->release)
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index a66768ebc8d1..80e9c18ea64f 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -8,6 +8,7 @@ config BTRFS_FS
8 select LZO_DECOMPRESS 8 select LZO_DECOMPRESS
9 select RAID6_PQ 9 select RAID6_PQ
10 select XOR_BLOCKS 10 select XOR_BLOCKS
11 select SRCU
11 12
12 help 13 help
13 Btrfs is a general purpose copy-on-write filesystem with extents, 14 Btrfs is a general purpose copy-on-write filesystem with extents,
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 2d3e32ebfd15..f55721ff9385 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1246,25 +1246,6 @@ int btrfs_check_shared(struct btrfs_trans_handle *trans,
1246 return ret; 1246 return ret;
1247} 1247}
1248 1248
1249/*
1250 * this makes the path point to (inum INODE_ITEM ioff)
1251 */
1252int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
1253 struct btrfs_path *path)
1254{
1255 struct btrfs_key key;
1256 return btrfs_find_item(fs_root, path, inum, ioff,
1257 BTRFS_INODE_ITEM_KEY, &key);
1258}
1259
1260static int inode_ref_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
1261 struct btrfs_path *path,
1262 struct btrfs_key *found_key)
1263{
1264 return btrfs_find_item(fs_root, path, inum, ioff,
1265 BTRFS_INODE_REF_KEY, found_key);
1266}
1267
1268int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid, 1249int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
1269 u64 start_off, struct btrfs_path *path, 1250 u64 start_off, struct btrfs_path *path,
1270 struct btrfs_inode_extref **ret_extref, 1251 struct btrfs_inode_extref **ret_extref,
@@ -1374,7 +1355,8 @@ char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
1374 btrfs_tree_read_unlock_blocking(eb); 1355 btrfs_tree_read_unlock_blocking(eb);
1375 free_extent_buffer(eb); 1356 free_extent_buffer(eb);
1376 } 1357 }
1377 ret = inode_ref_info(parent, 0, fs_root, path, &found_key); 1358 ret = btrfs_find_item(fs_root, path, parent, 0,
1359 BTRFS_INODE_REF_KEY, &found_key);
1378 if (ret > 0) 1360 if (ret > 0)
1379 ret = -ENOENT; 1361 ret = -ENOENT;
1380 if (ret) 1362 if (ret)
@@ -1552,7 +1534,6 @@ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
1552{ 1534{
1553 int ret; 1535 int ret;
1554 int type; 1536 int type;
1555 struct btrfs_tree_block_info *info;
1556 struct btrfs_extent_inline_ref *eiref; 1537 struct btrfs_extent_inline_ref *eiref;
1557 1538
1558 if (*ptr == (unsigned long)-1) 1539 if (*ptr == (unsigned long)-1)
@@ -1573,9 +1554,17 @@ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
1573 } 1554 }
1574 1555
1575 /* we can treat both ref types equally here */ 1556 /* we can treat both ref types equally here */
1576 info = (struct btrfs_tree_block_info *)(ei + 1);
1577 *out_root = btrfs_extent_inline_ref_offset(eb, eiref); 1557 *out_root = btrfs_extent_inline_ref_offset(eb, eiref);
1578 *out_level = btrfs_tree_block_level(eb, info); 1558
1559 if (key->type == BTRFS_EXTENT_ITEM_KEY) {
1560 struct btrfs_tree_block_info *info;
1561
1562 info = (struct btrfs_tree_block_info *)(ei + 1);
1563 *out_level = btrfs_tree_block_level(eb, info);
1564 } else {
1565 ASSERT(key->type == BTRFS_METADATA_ITEM_KEY);
1566 *out_level = (u8)key->offset;
1567 }
1579 1568
1580 if (ret == 1) 1569 if (ret == 1)
1581 *ptr = (unsigned long)-1; 1570 *ptr = (unsigned long)-1;
@@ -1720,8 +1709,10 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root,
1720 struct btrfs_key found_key; 1709 struct btrfs_key found_key;
1721 1710
1722 while (!ret) { 1711 while (!ret) {
1723 ret = inode_ref_info(inum, parent ? parent+1 : 0, fs_root, path, 1712 ret = btrfs_find_item(fs_root, path, inum,
1724 &found_key); 1713 parent ? parent + 1 : 0, BTRFS_INODE_REF_KEY,
1714 &found_key);
1715
1725 if (ret < 0) 1716 if (ret < 0)
1726 break; 1717 break;
1727 if (ret) { 1718 if (ret) {
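
With the one-line wrappers removed, callers search for the key type they want directly. A sketch of the old inode_item_info() behaviour expressed through btrfs_find_item() (the example_ name is illustrative); note that the caller must now supply the path, since btrfs_find_item() no longer allocates one:

static int example_inode_item_info(struct btrfs_root *fs_root,
				   struct btrfs_path *path,
				   u64 inum, u64 ioff)
{
	struct btrfs_key key;

	/* Positions 'path' at (inum, BTRFS_INODE_ITEM_KEY, ioff). */
	return btrfs_find_item(fs_root, path, inum, ioff,
			       BTRFS_INODE_ITEM_KEY, &key);
}
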
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 2a1ac6bfc724..9c41fbac3009 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -32,9 +32,6 @@ struct inode_fs_paths {
32typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root, 32typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root,
33 void *ctx); 33 void *ctx);
34 34
35int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
36 struct btrfs_path *path);
37
38int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, 35int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
39 struct btrfs_path *path, struct btrfs_key *found_key, 36 struct btrfs_path *path, struct btrfs_key *found_key,
40 u64 *flags); 37 u64 *flags);
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 4aadadcfab20..de5e4f2adfea 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -185,6 +185,9 @@ struct btrfs_inode {
185 185
186 struct btrfs_delayed_node *delayed_node; 186 struct btrfs_delayed_node *delayed_node;
187 187
188 /* File creation time. */
189 struct timespec i_otime;
190
188 struct inode vfs_inode; 191 struct inode vfs_inode;
189}; 192};
190 193
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 14a72ed14ef7..993642199326 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -213,11 +213,19 @@ static struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
213 */ 213 */
214static void add_root_to_dirty_list(struct btrfs_root *root) 214static void add_root_to_dirty_list(struct btrfs_root *root)
215{ 215{
216 if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
217 !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
218 return;
219
216 spin_lock(&root->fs_info->trans_lock); 220 spin_lock(&root->fs_info->trans_lock);
217 if (test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state) && 221 if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
218 list_empty(&root->dirty_list)) { 222 /* Want the extent tree to be the last on the list */
219 list_add(&root->dirty_list, 223 if (root->objectid == BTRFS_EXTENT_TREE_OBJECTID)
220 &root->fs_info->dirty_cowonly_roots); 224 list_move_tail(&root->dirty_list,
225 &root->fs_info->dirty_cowonly_roots);
226 else
227 list_move(&root->dirty_list,
228 &root->fs_info->dirty_cowonly_roots);
221 } 229 }
222 spin_unlock(&root->fs_info->trans_lock); 230 spin_unlock(&root->fs_info->trans_lock);
223} 231}
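
The reworked helper pairs a lock-free test_bit() fast path with test_and_set_bit() under the lock, so repeated calls stay cheap and the list insertion happens exactly once. A self-contained sketch of the idiom, with illustrative names:

#include <linux/bitops.h>
#include <linux/list.h>
#include <linux/spinlock.h>

#define EXAMPLE_DIRTY 0

struct example_root {
	unsigned long state;
	struct list_head dirty_list;	/* INIT_LIST_HEAD() before first use */
};

static void example_mark_dirty(struct example_root *root,
			       struct list_head *dirty_roots,
			       spinlock_t *lock)
{
	/* Fast path: bit already set means the root is already queued. */
	if (test_bit(EXAMPLE_DIRTY, &root->state))
		return;

	spin_lock(lock);
	if (!test_and_set_bit(EXAMPLE_DIRTY, &root->state))
		list_move(&root->dirty_list, dirty_roots);
	spin_unlock(lock);
}
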
@@ -1363,8 +1371,7 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
1363 1371
1364 if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) { 1372 if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
1365 BUG_ON(tm->slot != 0); 1373 BUG_ON(tm->slot != 0);
1366 eb_rewin = alloc_dummy_extent_buffer(eb->start, 1374 eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
1367 fs_info->tree_root->nodesize);
1368 if (!eb_rewin) { 1375 if (!eb_rewin) {
1369 btrfs_tree_read_unlock_blocking(eb); 1376 btrfs_tree_read_unlock_blocking(eb);
1370 free_extent_buffer(eb); 1377 free_extent_buffer(eb);
@@ -1444,7 +1451,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
1444 } else if (old_root) { 1451 } else if (old_root) {
1445 btrfs_tree_read_unlock(eb_root); 1452 btrfs_tree_read_unlock(eb_root);
1446 free_extent_buffer(eb_root); 1453 free_extent_buffer(eb_root);
1447 eb = alloc_dummy_extent_buffer(logical, root->nodesize); 1454 eb = alloc_dummy_extent_buffer(root->fs_info, logical);
1448 } else { 1455 } else {
1449 btrfs_set_lock_blocking_rw(eb_root, BTRFS_READ_LOCK); 1456 btrfs_set_lock_blocking_rw(eb_root, BTRFS_READ_LOCK);
1450 eb = btrfs_clone_extent_buffer(eb_root); 1457 eb = btrfs_clone_extent_buffer(eb_root);
@@ -2282,7 +2289,7 @@ static void reada_for_search(struct btrfs_root *root,
2282 if ((search <= target && target - search <= 65536) || 2289 if ((search <= target && target - search <= 65536) ||
2283 (search > target && search - target <= 65536)) { 2290 (search > target && search - target <= 65536)) {
2284 gen = btrfs_node_ptr_generation(node, nr); 2291 gen = btrfs_node_ptr_generation(node, nr);
2285 readahead_tree_block(root, search, blocksize); 2292 readahead_tree_block(root, search);
2286 nread += blocksize; 2293 nread += blocksize;
2287 } 2294 }
2288 nscan++; 2295 nscan++;
@@ -2301,7 +2308,6 @@ static noinline void reada_for_balance(struct btrfs_root *root,
2301 u64 gen; 2308 u64 gen;
2302 u64 block1 = 0; 2309 u64 block1 = 0;
2303 u64 block2 = 0; 2310 u64 block2 = 0;
2304 int blocksize;
2305 2311
2306 parent = path->nodes[level + 1]; 2312 parent = path->nodes[level + 1];
2307 if (!parent) 2313 if (!parent)
@@ -2309,7 +2315,6 @@ static noinline void reada_for_balance(struct btrfs_root *root,
2309 2315
2310 nritems = btrfs_header_nritems(parent); 2316 nritems = btrfs_header_nritems(parent);
2311 slot = path->slots[level + 1]; 2317 slot = path->slots[level + 1];
2312 blocksize = root->nodesize;
2313 2318
2314 if (slot > 0) { 2319 if (slot > 0) {
2315 block1 = btrfs_node_blockptr(parent, slot - 1); 2320 block1 = btrfs_node_blockptr(parent, slot - 1);
@@ -2334,9 +2339,9 @@ static noinline void reada_for_balance(struct btrfs_root *root,
2334 } 2339 }
2335 2340
2336 if (block1) 2341 if (block1)
2337 readahead_tree_block(root, block1, blocksize); 2342 readahead_tree_block(root, block1);
2338 if (block2) 2343 if (block2)
2339 readahead_tree_block(root, block2, blocksize); 2344 readahead_tree_block(root, block2);
2340} 2345}
2341 2346
2342 2347
@@ -2609,32 +2614,24 @@ static int key_search(struct extent_buffer *b, struct btrfs_key *key,
2609 return 0; 2614 return 0;
2610} 2615}
2611 2616
2612int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *found_path, 2617int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
2613 u64 iobjectid, u64 ioff, u8 key_type, 2618 u64 iobjectid, u64 ioff, u8 key_type,
2614 struct btrfs_key *found_key) 2619 struct btrfs_key *found_key)
2615{ 2620{
2616 int ret; 2621 int ret;
2617 struct btrfs_key key; 2622 struct btrfs_key key;
2618 struct extent_buffer *eb; 2623 struct extent_buffer *eb;
2619 struct btrfs_path *path; 2624
2625 ASSERT(path);
2626 ASSERT(found_key);
2620 2627
2621 key.type = key_type; 2628 key.type = key_type;
2622 key.objectid = iobjectid; 2629 key.objectid = iobjectid;
2623 key.offset = ioff; 2630 key.offset = ioff;
2624 2631
2625 if (found_path == NULL) {
2626 path = btrfs_alloc_path();
2627 if (!path)
2628 return -ENOMEM;
2629 } else
2630 path = found_path;
2631
2632 ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0); 2632 ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
2633 if ((ret < 0) || (found_key == NULL)) { 2633 if (ret < 0)
2634 if (path != found_path)
2635 btrfs_free_path(path);
2636 return ret; 2634 return ret;
2637 }
2638 2635
2639 eb = path->nodes[0]; 2636 eb = path->nodes[0];
2640 if (ret && path->slots[0] >= btrfs_header_nritems(eb)) { 2637 if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
@@ -3383,7 +3380,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
3383 add_root_to_dirty_list(root); 3380 add_root_to_dirty_list(root);
3384 extent_buffer_get(c); 3381 extent_buffer_get(c);
3385 path->nodes[level] = c; 3382 path->nodes[level] = c;
3386 path->locks[level] = BTRFS_WRITE_LOCK; 3383 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
3387 path->slots[level] = 0; 3384 path->slots[level] = 0;
3388 return 0; 3385 return 0;
3389} 3386}
@@ -4356,13 +4353,15 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
4356 path->search_for_split = 1; 4353 path->search_for_split = 1;
4357 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 4354 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
4358 path->search_for_split = 0; 4355 path->search_for_split = 0;
4356 if (ret > 0)
4357 ret = -EAGAIN;
4359 if (ret < 0) 4358 if (ret < 0)
4360 goto err; 4359 goto err;
4361 4360
4362 ret = -EAGAIN; 4361 ret = -EAGAIN;
4363 leaf = path->nodes[0]; 4362 leaf = path->nodes[0];
4364 /* if our item isn't there or got smaller, return now */ 4363 /* if our item isn't there, return now */
4365 if (ret > 0 || item_size != btrfs_item_size_nr(leaf, path->slots[0])) 4364 if (item_size != btrfs_item_size_nr(leaf, path->slots[0]))
4366 goto err; 4365 goto err;
4367 4366
4368 /* the leaf has changed, it now has room. return now */ 4367 /* the leaf has changed, it now has room. return now */
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 7e607416755a..84c3b00f3de8 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -198,6 +198,8 @@ static int btrfs_csum_sizes[] = { 4, 0 };
198 198
199#define BTRFS_DIRTY_METADATA_THRESH (32 * 1024 * 1024) 199#define BTRFS_DIRTY_METADATA_THRESH (32 * 1024 * 1024)
200 200
201#define BTRFS_MAX_EXTENT_SIZE (128 * 1024 * 1024)
202
201/* 203/*
202 * The key defines the order in the tree, and so it also defines (optimal) 204 * The key defines the order in the tree, and so it also defines (optimal)
203 * block layout. 205 * block layout.
@@ -1020,6 +1022,9 @@ enum btrfs_raid_types {
1020 BTRFS_BLOCK_GROUP_RAID6 | \ 1022 BTRFS_BLOCK_GROUP_RAID6 | \
1021 BTRFS_BLOCK_GROUP_DUP | \ 1023 BTRFS_BLOCK_GROUP_DUP | \
1022 BTRFS_BLOCK_GROUP_RAID10) 1024 BTRFS_BLOCK_GROUP_RAID10)
1025#define BTRFS_BLOCK_GROUP_RAID56_MASK (BTRFS_BLOCK_GROUP_RAID5 | \
1026 BTRFS_BLOCK_GROUP_RAID6)
1027
1023/* 1028/*
1024 * We need a bit for restriper to be able to tell when chunks of type 1029 * We need a bit for restriper to be able to tell when chunks of type
1025 * SINGLE are available. This "extended" profile format is used in 1030 * SINGLE are available. This "extended" profile format is used in
@@ -1171,6 +1176,7 @@ struct btrfs_space_info {
1171 struct percpu_counter total_bytes_pinned; 1176 struct percpu_counter total_bytes_pinned;
1172 1177
1173 struct list_head list; 1178 struct list_head list;
1179 /* Protected by the spinlock 'lock'. */
1174 struct list_head ro_bgs; 1180 struct list_head ro_bgs;
1175 1181
1176 struct rw_semaphore groups_sem; 1182 struct rw_semaphore groups_sem;
@@ -1238,7 +1244,6 @@ enum btrfs_disk_cache_state {
1238 BTRFS_DC_ERROR = 1, 1244 BTRFS_DC_ERROR = 1,
1239 BTRFS_DC_CLEAR = 2, 1245 BTRFS_DC_CLEAR = 2,
1240 BTRFS_DC_SETUP = 3, 1246 BTRFS_DC_SETUP = 3,
1241 BTRFS_DC_NEED_WRITE = 4,
1242}; 1247};
1243 1248
1244struct btrfs_caching_control { 1249struct btrfs_caching_control {
@@ -1276,7 +1281,6 @@ struct btrfs_block_group_cache {
1276 unsigned long full_stripe_len; 1281 unsigned long full_stripe_len;
1277 1282
1278 unsigned int ro:1; 1283 unsigned int ro:1;
1279 unsigned int dirty:1;
1280 unsigned int iref:1; 1284 unsigned int iref:1;
1281 unsigned int has_caching_ctl:1; 1285 unsigned int has_caching_ctl:1;
1282 unsigned int removed:1; 1286 unsigned int removed:1;
@@ -1314,6 +1318,9 @@ struct btrfs_block_group_cache {
1314 struct list_head ro_list; 1318 struct list_head ro_list;
1315 1319
1316 atomic_t trimming; 1320 atomic_t trimming;
1321
1322 /* For dirty block groups */
1323 struct list_head dirty_list;
1317}; 1324};
1318 1325
1319/* delayed seq elem */ 1326/* delayed seq elem */
@@ -1740,6 +1747,7 @@ struct btrfs_fs_info {
1740 1747
1741 spinlock_t unused_bgs_lock; 1748 spinlock_t unused_bgs_lock;
1742 struct list_head unused_bgs; 1749 struct list_head unused_bgs;
1750 struct mutex unused_bg_unpin_mutex;
1743 1751
1744 /* For btrfs to record security options */ 1752 /* For btrfs to record security options */
1745 struct security_mnt_opts security_opts; 1753 struct security_mnt_opts security_opts;
@@ -1775,6 +1783,7 @@ struct btrfs_subvolume_writers {
1775#define BTRFS_ROOT_DEFRAG_RUNNING 6 1783#define BTRFS_ROOT_DEFRAG_RUNNING 6
1776#define BTRFS_ROOT_FORCE_COW 7 1784#define BTRFS_ROOT_FORCE_COW 7
1777#define BTRFS_ROOT_MULTI_LOG_TASKS 8 1785#define BTRFS_ROOT_MULTI_LOG_TASKS 8
1786#define BTRFS_ROOT_DIRTY 9
1778 1787
1779/* 1788/*
1780 * in ram representation of the tree. extent_root is used for all allocations 1789 * in ram representation of the tree. extent_root is used for all allocations
@@ -1793,8 +1802,6 @@ struct btrfs_root {
1793 struct btrfs_fs_info *fs_info; 1802 struct btrfs_fs_info *fs_info;
1794 struct extent_io_tree dirty_log_pages; 1803 struct extent_io_tree dirty_log_pages;
1795 1804
1796 struct kobject root_kobj;
1797 struct completion kobj_unregister;
1798 struct mutex objectid_mutex; 1805 struct mutex objectid_mutex;
1799 1806
1800 spinlock_t accounting_lock; 1807 spinlock_t accounting_lock;
@@ -2464,31 +2471,6 @@ BTRFS_SETGET_STACK_FUNCS(stack_inode_gid, struct btrfs_inode_item, gid, 32);
2464BTRFS_SETGET_STACK_FUNCS(stack_inode_mode, struct btrfs_inode_item, mode, 32); 2471BTRFS_SETGET_STACK_FUNCS(stack_inode_mode, struct btrfs_inode_item, mode, 32);
2465BTRFS_SETGET_STACK_FUNCS(stack_inode_rdev, struct btrfs_inode_item, rdev, 64); 2472BTRFS_SETGET_STACK_FUNCS(stack_inode_rdev, struct btrfs_inode_item, rdev, 64);
2466BTRFS_SETGET_STACK_FUNCS(stack_inode_flags, struct btrfs_inode_item, flags, 64); 2473BTRFS_SETGET_STACK_FUNCS(stack_inode_flags, struct btrfs_inode_item, flags, 64);
2467
2468static inline struct btrfs_timespec *
2469btrfs_inode_atime(struct btrfs_inode_item *inode_item)
2470{
2471 unsigned long ptr = (unsigned long)inode_item;
2472 ptr += offsetof(struct btrfs_inode_item, atime);
2473 return (struct btrfs_timespec *)ptr;
2474}
2475
2476static inline struct btrfs_timespec *
2477btrfs_inode_mtime(struct btrfs_inode_item *inode_item)
2478{
2479 unsigned long ptr = (unsigned long)inode_item;
2480 ptr += offsetof(struct btrfs_inode_item, mtime);
2481 return (struct btrfs_timespec *)ptr;
2482}
2483
2484static inline struct btrfs_timespec *
2485btrfs_inode_ctime(struct btrfs_inode_item *inode_item)
2486{
2487 unsigned long ptr = (unsigned long)inode_item;
2488 ptr += offsetof(struct btrfs_inode_item, ctime);
2489 return (struct btrfs_timespec *)ptr;
2490}
2491
2492BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_timespec, sec, 64); 2474BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_timespec, sec, 64);
2493BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32); 2475BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32);
2494BTRFS_SETGET_STACK_FUNCS(stack_timespec_sec, struct btrfs_timespec, sec, 64); 2476BTRFS_SETGET_STACK_FUNCS(stack_timespec_sec, struct btrfs_timespec, sec, 64);
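
The deleted accessors only recomputed addresses the compiler already knows: offsetof()-based pointer math into an embedded member is identical to taking the member's address. A small sketch of the equivalence, mirrored by the delayed-inode hunk below (the example_ name is illustrative):

static void example_equivalence(struct btrfs_inode_item *item)
{
	struct btrfs_timespec *a, *b;

	a = (struct btrfs_timespec *)((unsigned long)item +
			offsetof(struct btrfs_inode_item, atime));
	b = &item->atime;	/* identical pointer; helper not needed */
	WARN_ON(a != b);
}
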
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 054577bddaf2..82f0c7c95474 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1755,27 +1755,31 @@ static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
1755 btrfs_set_stack_inode_flags(inode_item, BTRFS_I(inode)->flags); 1755 btrfs_set_stack_inode_flags(inode_item, BTRFS_I(inode)->flags);
1756 btrfs_set_stack_inode_block_group(inode_item, 0); 1756 btrfs_set_stack_inode_block_group(inode_item, 0);
1757 1757
1758 btrfs_set_stack_timespec_sec(btrfs_inode_atime(inode_item), 1758 btrfs_set_stack_timespec_sec(&inode_item->atime,
1759 inode->i_atime.tv_sec); 1759 inode->i_atime.tv_sec);
1760 btrfs_set_stack_timespec_nsec(btrfs_inode_atime(inode_item), 1760 btrfs_set_stack_timespec_nsec(&inode_item->atime,
1761 inode->i_atime.tv_nsec); 1761 inode->i_atime.tv_nsec);
1762 1762
1763 btrfs_set_stack_timespec_sec(btrfs_inode_mtime(inode_item), 1763 btrfs_set_stack_timespec_sec(&inode_item->mtime,
1764 inode->i_mtime.tv_sec); 1764 inode->i_mtime.tv_sec);
1765 btrfs_set_stack_timespec_nsec(btrfs_inode_mtime(inode_item), 1765 btrfs_set_stack_timespec_nsec(&inode_item->mtime,
1766 inode->i_mtime.tv_nsec); 1766 inode->i_mtime.tv_nsec);
1767 1767
1768 btrfs_set_stack_timespec_sec(btrfs_inode_ctime(inode_item), 1768 btrfs_set_stack_timespec_sec(&inode_item->ctime,
1769 inode->i_ctime.tv_sec); 1769 inode->i_ctime.tv_sec);
1770 btrfs_set_stack_timespec_nsec(btrfs_inode_ctime(inode_item), 1770 btrfs_set_stack_timespec_nsec(&inode_item->ctime,
1771 inode->i_ctime.tv_nsec); 1771 inode->i_ctime.tv_nsec);
1772
1773 btrfs_set_stack_timespec_sec(&inode_item->otime,
1774 BTRFS_I(inode)->i_otime.tv_sec);
1775 btrfs_set_stack_timespec_nsec(&inode_item->otime,
1776 BTRFS_I(inode)->i_otime.tv_nsec);
1772} 1777}
1773 1778
1774int btrfs_fill_inode(struct inode *inode, u32 *rdev) 1779int btrfs_fill_inode(struct inode *inode, u32 *rdev)
1775{ 1780{
1776 struct btrfs_delayed_node *delayed_node; 1781 struct btrfs_delayed_node *delayed_node;
1777 struct btrfs_inode_item *inode_item; 1782 struct btrfs_inode_item *inode_item;
1778 struct btrfs_timespec *tspec;
1779 1783
1780 delayed_node = btrfs_get_delayed_node(inode); 1784 delayed_node = btrfs_get_delayed_node(inode);
1781 if (!delayed_node) 1785 if (!delayed_node)
@@ -1802,17 +1806,19 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev)
1802 *rdev = btrfs_stack_inode_rdev(inode_item); 1806 *rdev = btrfs_stack_inode_rdev(inode_item);
1803 BTRFS_I(inode)->flags = btrfs_stack_inode_flags(inode_item); 1807 BTRFS_I(inode)->flags = btrfs_stack_inode_flags(inode_item);
1804 1808
1805 tspec = btrfs_inode_atime(inode_item); 1809 inode->i_atime.tv_sec = btrfs_stack_timespec_sec(&inode_item->atime);
1806 inode->i_atime.tv_sec = btrfs_stack_timespec_sec(tspec); 1810 inode->i_atime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->atime);
1807 inode->i_atime.tv_nsec = btrfs_stack_timespec_nsec(tspec); 1811
1812 inode->i_mtime.tv_sec = btrfs_stack_timespec_sec(&inode_item->mtime);
1813 inode->i_mtime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->mtime);
1808 1814
1809 tspec = btrfs_inode_mtime(inode_item); 1815 inode->i_ctime.tv_sec = btrfs_stack_timespec_sec(&inode_item->ctime);
1810 inode->i_mtime.tv_sec = btrfs_stack_timespec_sec(tspec); 1816 inode->i_ctime.tv_nsec = btrfs_stack_timespec_nsec(&inode_item->ctime);
1811 inode->i_mtime.tv_nsec = btrfs_stack_timespec_nsec(tspec);
1812 1817
1813 tspec = btrfs_inode_ctime(inode_item); 1818 BTRFS_I(inode)->i_otime.tv_sec =
1814 inode->i_ctime.tv_sec = btrfs_stack_timespec_sec(tspec); 1819 btrfs_stack_timespec_sec(&inode_item->otime);
1815 inode->i_ctime.tv_nsec = btrfs_stack_timespec_nsec(tspec); 1820 BTRFS_I(inode)->i_otime.tv_nsec =
1821 btrfs_stack_timespec_nsec(&inode_item->otime);
1816 1822
1817 inode->i_generation = BTRFS_I(inode)->generation; 1823 inode->i_generation = BTRFS_I(inode)->generation;
1818 BTRFS_I(inode)->index_cnt = (u64)-1; 1824 BTRFS_I(inode)->index_cnt = (u64)-1;
@@ -1857,6 +1863,14 @@ int btrfs_delayed_delete_inode_ref(struct inode *inode)
1857{ 1863{
1858 struct btrfs_delayed_node *delayed_node; 1864 struct btrfs_delayed_node *delayed_node;
1859 1865
1866 /*
 1867 * We don't do delayed inode updates during log recovery because it
 1868 * leads to ENOSPC problems. This means we also can't do delayed
 1869 * inode refs.
1870 */
1871 if (BTRFS_I(inode)->root->fs_info->log_root_recovering)
1872 return -EAGAIN;
1873
1860 delayed_node = btrfs_get_or_create_delayed_node(inode); 1874 delayed_node = btrfs_get_or_create_delayed_node(inode);
1861 if (IS_ERR(delayed_node)) 1875 if (IS_ERR(delayed_node))
1862 return PTR_ERR(delayed_node); 1876 return PTR_ERR(delayed_node);
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index ca6a3a3b6b6c..5ec03d999c37 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -440,18 +440,9 @@ leave:
440 */ 440 */
441static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info) 441static void btrfs_rm_dev_replace_blocked(struct btrfs_fs_info *fs_info)
442{ 442{
443 s64 writers;
444 DEFINE_WAIT(wait);
445
446 set_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state); 443 set_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state);
447 do { 444 wait_event(fs_info->replace_wait, !percpu_counter_sum(
448 prepare_to_wait(&fs_info->replace_wait, &wait, 445 &fs_info->bio_counter));
449 TASK_UNINTERRUPTIBLE);
450 writers = percpu_counter_sum(&fs_info->bio_counter);
451 if (writers)
452 schedule();
453 finish_wait(&fs_info->replace_wait, &wait);
454 } while (writers);
455} 446}
456 447
457/* 448/*
@@ -932,15 +923,15 @@ void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount)
932 923
933void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info) 924void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info)
934{ 925{
935 DEFINE_WAIT(wait); 926 while (1) {
936again: 927 percpu_counter_inc(&fs_info->bio_counter);
937 percpu_counter_inc(&fs_info->bio_counter); 928 if (likely(!test_bit(BTRFS_FS_STATE_DEV_REPLACING,
938 if (test_bit(BTRFS_FS_STATE_DEV_REPLACING, &fs_info->fs_state)) { 929 &fs_info->fs_state)))
930 break;
931
939 btrfs_bio_counter_dec(fs_info); 932 btrfs_bio_counter_dec(fs_info);
940 wait_event(fs_info->replace_wait, 933 wait_event(fs_info->replace_wait,
941 !test_bit(BTRFS_FS_STATE_DEV_REPLACING, 934 !test_bit(BTRFS_FS_STATE_DEV_REPLACING,
942 &fs_info->fs_state)); 935 &fs_info->fs_state));
943 goto again;
944 } 936 }
945
946} 937}
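
Both hunks replace open-coded prepare_to_wait()/schedule()/finish_wait() loops with wait_event(), which expresses the same sleep-until-condition logic in one call. A minimal self-contained sketch of the waiter/waker pairing (names are illustrative):

#include <linux/wait.h>
#include <linux/atomic.h>

static DECLARE_WAIT_QUEUE_HEAD(example_wait);
static atomic_t example_writers = ATOMIC_INIT(0);

static void example_wait_until_idle(void)
{
	/* Sleeps in TASK_UNINTERRUPTIBLE; recheck happens on each wakeup. */
	wait_event(example_wait, !atomic_read(&example_writers));
}

static void example_writer_done(void)
{
	if (atomic_dec_and_test(&example_writers))
		wake_up(&example_wait);
}
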
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 8c63419a7f70..f79f38542a73 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -318,7 +318,7 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
318 memcpy(&found, result, csum_size); 318 memcpy(&found, result, csum_size);
319 319
320 read_extent_buffer(buf, &val, 0, csum_size); 320 read_extent_buffer(buf, &val, 0, csum_size);
321 printk_ratelimited(KERN_INFO 321 printk_ratelimited(KERN_WARNING
322 "BTRFS: %s checksum verify failed on %llu wanted %X found %X " 322 "BTRFS: %s checksum verify failed on %llu wanted %X found %X "
323 "level %d\n", 323 "level %d\n",
324 root->fs_info->sb->s_id, buf->start, 324 root->fs_info->sb->s_id, buf->start,
@@ -367,7 +367,8 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
367 ret = 0; 367 ret = 0;
368 goto out; 368 goto out;
369 } 369 }
370 printk_ratelimited(KERN_INFO "BTRFS (device %s): parent transid verify failed on %llu wanted %llu found %llu\n", 370 printk_ratelimited(KERN_ERR
371 "BTRFS (device %s): parent transid verify failed on %llu wanted %llu found %llu\n",
371 eb->fs_info->sb->s_id, eb->start, 372 eb->fs_info->sb->s_id, eb->start,
372 parent_transid, btrfs_header_generation(eb)); 373 parent_transid, btrfs_header_generation(eb));
373 ret = 1; 374 ret = 1;
@@ -633,21 +634,21 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
633 634
634 found_start = btrfs_header_bytenr(eb); 635 found_start = btrfs_header_bytenr(eb);
635 if (found_start != eb->start) { 636 if (found_start != eb->start) {
636 printk_ratelimited(KERN_INFO "BTRFS (device %s): bad tree block start " 637 printk_ratelimited(KERN_ERR "BTRFS (device %s): bad tree block start "
637 "%llu %llu\n", 638 "%llu %llu\n",
638 eb->fs_info->sb->s_id, found_start, eb->start); 639 eb->fs_info->sb->s_id, found_start, eb->start);
639 ret = -EIO; 640 ret = -EIO;
640 goto err; 641 goto err;
641 } 642 }
642 if (check_tree_block_fsid(root, eb)) { 643 if (check_tree_block_fsid(root, eb)) {
643 printk_ratelimited(KERN_INFO "BTRFS (device %s): bad fsid on block %llu\n", 644 printk_ratelimited(KERN_ERR "BTRFS (device %s): bad fsid on block %llu\n",
644 eb->fs_info->sb->s_id, eb->start); 645 eb->fs_info->sb->s_id, eb->start);
645 ret = -EIO; 646 ret = -EIO;
646 goto err; 647 goto err;
647 } 648 }
648 found_level = btrfs_header_level(eb); 649 found_level = btrfs_header_level(eb);
649 if (found_level >= BTRFS_MAX_LEVEL) { 650 if (found_level >= BTRFS_MAX_LEVEL) {
650 btrfs_info(root->fs_info, "bad tree block level %d", 651 btrfs_err(root->fs_info, "bad tree block level %d",
651 (int)btrfs_header_level(eb)); 652 (int)btrfs_header_level(eb));
652 ret = -EIO; 653 ret = -EIO;
653 goto err; 654 goto err;
@@ -1073,12 +1074,12 @@ static const struct address_space_operations btree_aops = {
1073 .set_page_dirty = btree_set_page_dirty, 1074 .set_page_dirty = btree_set_page_dirty,
1074}; 1075};
1075 1076
1076void readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize) 1077void readahead_tree_block(struct btrfs_root *root, u64 bytenr)
1077{ 1078{
1078 struct extent_buffer *buf = NULL; 1079 struct extent_buffer *buf = NULL;
1079 struct inode *btree_inode = root->fs_info->btree_inode; 1080 struct inode *btree_inode = root->fs_info->btree_inode;
1080 1081
1081 buf = btrfs_find_create_tree_block(root, bytenr, blocksize); 1082 buf = btrfs_find_create_tree_block(root, bytenr);
1082 if (!buf) 1083 if (!buf)
1083 return; 1084 return;
1084 read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, 1085 read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
@@ -1086,7 +1087,7 @@ void readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize)
1086 free_extent_buffer(buf); 1087 free_extent_buffer(buf);
1087} 1088}
1088 1089
1089int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize, 1090int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr,
1090 int mirror_num, struct extent_buffer **eb) 1091 int mirror_num, struct extent_buffer **eb)
1091{ 1092{
1092 struct extent_buffer *buf = NULL; 1093 struct extent_buffer *buf = NULL;
@@ -1094,7 +1095,7 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
1094 struct extent_io_tree *io_tree = &BTRFS_I(btree_inode)->io_tree; 1095 struct extent_io_tree *io_tree = &BTRFS_I(btree_inode)->io_tree;
1095 int ret; 1096 int ret;
1096 1097
1097 buf = btrfs_find_create_tree_block(root, bytenr, blocksize); 1098 buf = btrfs_find_create_tree_block(root, bytenr);
1098 if (!buf) 1099 if (!buf)
1099 return 0; 1100 return 0;
1100 1101
@@ -1125,12 +1126,11 @@ struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
1125} 1126}
1126 1127
1127struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, 1128struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
1128 u64 bytenr, u32 blocksize) 1129 u64 bytenr)
1129{ 1130{
1130 if (btrfs_test_is_dummy_root(root)) 1131 if (btrfs_test_is_dummy_root(root))
1131 return alloc_test_extent_buffer(root->fs_info, bytenr, 1132 return alloc_test_extent_buffer(root->fs_info, bytenr);
1132 blocksize); 1133 return alloc_extent_buffer(root->fs_info, bytenr);
1133 return alloc_extent_buffer(root->fs_info, bytenr, blocksize);
1134} 1134}
1135 1135
1136 1136
@@ -1152,7 +1152,7 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
1152 struct extent_buffer *buf = NULL; 1152 struct extent_buffer *buf = NULL;
1153 int ret; 1153 int ret;
1154 1154
1155 buf = btrfs_find_create_tree_block(root, bytenr, root->nodesize); 1155 buf = btrfs_find_create_tree_block(root, bytenr);
1156 if (!buf) 1156 if (!buf)
1157 return NULL; 1157 return NULL;
1158 1158
@@ -1275,12 +1275,10 @@ static void __setup_root(u32 nodesize, u32 sectorsize, u32 stripesize,
1275 memset(&root->root_key, 0, sizeof(root->root_key)); 1275 memset(&root->root_key, 0, sizeof(root->root_key));
1276 memset(&root->root_item, 0, sizeof(root->root_item)); 1276 memset(&root->root_item, 0, sizeof(root->root_item));
1277 memset(&root->defrag_progress, 0, sizeof(root->defrag_progress)); 1277 memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
1278 memset(&root->root_kobj, 0, sizeof(root->root_kobj));
1279 if (fs_info) 1278 if (fs_info)
1280 root->defrag_trans_start = fs_info->generation; 1279 root->defrag_trans_start = fs_info->generation;
1281 else 1280 else
1282 root->defrag_trans_start = 0; 1281 root->defrag_trans_start = 0;
1283 init_completion(&root->kobj_unregister);
1284 root->root_key.objectid = objectid; 1282 root->root_key.objectid = objectid;
1285 root->anon_dev = 0; 1283 root->anon_dev = 0;
1286 1284
@@ -1630,6 +1628,8 @@ struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
1630 bool check_ref) 1628 bool check_ref)
1631{ 1629{
1632 struct btrfs_root *root; 1630 struct btrfs_root *root;
1631 struct btrfs_path *path;
1632 struct btrfs_key key;
1633 int ret; 1633 int ret;
1634 1634
1635 if (location->objectid == BTRFS_ROOT_TREE_OBJECTID) 1635 if (location->objectid == BTRFS_ROOT_TREE_OBJECTID)
@@ -1669,8 +1669,17 @@ again:
1669 if (ret) 1669 if (ret)
1670 goto fail; 1670 goto fail;
1671 1671
1672 ret = btrfs_find_item(fs_info->tree_root, NULL, BTRFS_ORPHAN_OBJECTID, 1672 path = btrfs_alloc_path();
1673 location->objectid, BTRFS_ORPHAN_ITEM_KEY, NULL); 1673 if (!path) {
1674 ret = -ENOMEM;
1675 goto fail;
1676 }
1677 key.objectid = BTRFS_ORPHAN_OBJECTID;
1678 key.type = BTRFS_ORPHAN_ITEM_KEY;
1679 key.offset = location->objectid;
1680
1681 ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
1682 btrfs_free_path(path);
1674 if (ret < 0) 1683 if (ret < 0)
1675 goto fail; 1684 goto fail;
1676 if (ret == 0) 1685 if (ret == 0)
@@ -1715,12 +1724,11 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
1715{ 1724{
1716 int err; 1725 int err;
1717 1726
1718 bdi->capabilities = BDI_CAP_MAP_COPY; 1727 err = bdi_setup_and_register(bdi, "btrfs");
1719 err = bdi_setup_and_register(bdi, "btrfs", BDI_CAP_MAP_COPY);
1720 if (err) 1728 if (err)
1721 return err; 1729 return err;
1722 1730
1723 bdi->ra_pages = default_backing_dev_info.ra_pages; 1731 bdi->ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE;
1724 bdi->congested_fn = btrfs_congested_fn; 1732 bdi->congested_fn = btrfs_congested_fn;
1725 bdi->congested_data = info; 1733 bdi->congested_data = info;
1726 return 0; 1734 return 0;
@@ -2233,6 +2241,7 @@ int open_ctree(struct super_block *sb,
2233 spin_lock_init(&fs_info->qgroup_op_lock); 2241 spin_lock_init(&fs_info->qgroup_op_lock);
2234 spin_lock_init(&fs_info->buffer_lock); 2242 spin_lock_init(&fs_info->buffer_lock);
2235 spin_lock_init(&fs_info->unused_bgs_lock); 2243 spin_lock_init(&fs_info->unused_bgs_lock);
2244 mutex_init(&fs_info->unused_bg_unpin_mutex);
2236 rwlock_init(&fs_info->tree_mod_log_lock); 2245 rwlock_init(&fs_info->tree_mod_log_lock);
2237 mutex_init(&fs_info->reloc_mutex); 2246 mutex_init(&fs_info->reloc_mutex);
2238 mutex_init(&fs_info->delalloc_root_mutex); 2247 mutex_init(&fs_info->delalloc_root_mutex);
@@ -2319,7 +2328,6 @@ int open_ctree(struct super_block *sb,
2319 */ 2328 */
2320 fs_info->btree_inode->i_size = OFFSET_MAX; 2329 fs_info->btree_inode->i_size = OFFSET_MAX;
2321 fs_info->btree_inode->i_mapping->a_ops = &btree_aops; 2330 fs_info->btree_inode->i_mapping->a_ops = &btree_aops;
2322 fs_info->btree_inode->i_mapping->backing_dev_info = &fs_info->bdi;
2323 2331
2324 RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node); 2332 RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node);
2325 extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree, 2333 extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree,
@@ -2498,7 +2506,7 @@ int open_ctree(struct super_block *sb,
2498 features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; 2506 features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
2499 2507
2500 if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA) 2508 if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA)
2501 printk(KERN_ERR "BTRFS: has skinny extents\n"); 2509 printk(KERN_INFO "BTRFS: has skinny extents\n");
2502 2510
2503 /* 2511 /*
2504 * flag our filesystem as having big metadata blocks if 2512 * flag our filesystem as having big metadata blocks if
@@ -2522,7 +2530,7 @@ int open_ctree(struct super_block *sb,
2522 */ 2530 */
2523 if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) && 2531 if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) &&
2524 (sectorsize != nodesize)) { 2532 (sectorsize != nodesize)) {
2525 printk(KERN_WARNING "BTRFS: unequal leaf/node/sector sizes " 2533 printk(KERN_ERR "BTRFS: unequal leaf/node/sector sizes "
2526 "are not allowed for mixed block groups on %s\n", 2534 "are not allowed for mixed block groups on %s\n",
2527 sb->s_id); 2535 sb->s_id);
2528 goto fail_alloc; 2536 goto fail_alloc;
@@ -2630,12 +2638,12 @@ int open_ctree(struct super_block *sb,
2630 sb->s_blocksize_bits = blksize_bits(sectorsize); 2638 sb->s_blocksize_bits = blksize_bits(sectorsize);
2631 2639
2632 if (btrfs_super_magic(disk_super) != BTRFS_MAGIC) { 2640 if (btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
2633 printk(KERN_INFO "BTRFS: valid FS not found on %s\n", sb->s_id); 2641 printk(KERN_ERR "BTRFS: valid FS not found on %s\n", sb->s_id);
2634 goto fail_sb_buffer; 2642 goto fail_sb_buffer;
2635 } 2643 }
2636 2644
2637 if (sectorsize != PAGE_SIZE) { 2645 if (sectorsize != PAGE_SIZE) {
2638 printk(KERN_WARNING "BTRFS: Incompatible sector size(%lu) " 2646 printk(KERN_ERR "BTRFS: incompatible sector size (%lu) "
2639 "found on %s\n", (unsigned long)sectorsize, sb->s_id); 2647 "found on %s\n", (unsigned long)sectorsize, sb->s_id);
2640 goto fail_sb_buffer; 2648 goto fail_sb_buffer;
2641 } 2649 }
@@ -2644,7 +2652,7 @@ int open_ctree(struct super_block *sb,
2644 ret = btrfs_read_sys_array(tree_root); 2652 ret = btrfs_read_sys_array(tree_root);
2645 mutex_unlock(&fs_info->chunk_mutex); 2653 mutex_unlock(&fs_info->chunk_mutex);
2646 if (ret) { 2654 if (ret) {
2647 printk(KERN_WARNING "BTRFS: failed to read the system " 2655 printk(KERN_ERR "BTRFS: failed to read the system "
2648 "array on %s\n", sb->s_id); 2656 "array on %s\n", sb->s_id);
2649 goto fail_sb_buffer; 2657 goto fail_sb_buffer;
2650 } 2658 }
@@ -2659,7 +2667,7 @@ int open_ctree(struct super_block *sb,
2659 generation); 2667 generation);
2660 if (!chunk_root->node || 2668 if (!chunk_root->node ||
2661 !test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) { 2669 !test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) {
2662 printk(KERN_WARNING "BTRFS: failed to read chunk root on %s\n", 2670 printk(KERN_ERR "BTRFS: failed to read chunk root on %s\n",
2663 sb->s_id); 2671 sb->s_id);
2664 goto fail_tree_roots; 2672 goto fail_tree_roots;
2665 } 2673 }
@@ -2671,7 +2679,7 @@ int open_ctree(struct super_block *sb,
2671 2679
2672 ret = btrfs_read_chunk_tree(chunk_root); 2680 ret = btrfs_read_chunk_tree(chunk_root);
2673 if (ret) { 2681 if (ret) {
2674 printk(KERN_WARNING "BTRFS: failed to read chunk tree on %s\n", 2682 printk(KERN_ERR "BTRFS: failed to read chunk tree on %s\n",
2675 sb->s_id); 2683 sb->s_id);
2676 goto fail_tree_roots; 2684 goto fail_tree_roots;
2677 } 2685 }
@@ -2683,7 +2691,7 @@ int open_ctree(struct super_block *sb,
2683 btrfs_close_extra_devices(fs_info, fs_devices, 0); 2691 btrfs_close_extra_devices(fs_info, fs_devices, 0);
2684 2692
2685 if (!fs_devices->latest_bdev) { 2693 if (!fs_devices->latest_bdev) {
2686 printk(KERN_CRIT "BTRFS: failed to read devices on %s\n", 2694 printk(KERN_ERR "BTRFS: failed to read devices on %s\n",
2687 sb->s_id); 2695 sb->s_id);
2688 goto fail_tree_roots; 2696 goto fail_tree_roots;
2689 } 2697 }
@@ -2767,7 +2775,7 @@ retry_root_backup:
2767 2775
2768 ret = btrfs_recover_balance(fs_info); 2776 ret = btrfs_recover_balance(fs_info);
2769 if (ret) { 2777 if (ret) {
2770 printk(KERN_WARNING "BTRFS: failed to recover balance\n"); 2778 printk(KERN_ERR "BTRFS: failed to recover balance\n");
2771 goto fail_block_groups; 2779 goto fail_block_groups;
2772 } 2780 }
2773 2781
@@ -3862,6 +3870,21 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
3862 printk(KERN_WARNING "BTRFS: log_root block unaligned: %llu\n", 3870 printk(KERN_WARNING "BTRFS: log_root block unaligned: %llu\n",
3863 btrfs_super_log_root(sb)); 3871 btrfs_super_log_root(sb));
3864 3872
3873 /*
 3874 * Check the lower bound; the alignment and other constraints are
3875 * checked later.
3876 */
3877 if (btrfs_super_nodesize(sb) < 4096) {
3878 printk(KERN_ERR "BTRFS: nodesize too small: %u < 4096\n",
3879 btrfs_super_nodesize(sb));
3880 ret = -EINVAL;
3881 }
3882 if (btrfs_super_sectorsize(sb) < 4096) {
3883 printk(KERN_ERR "BTRFS: sectorsize too small: %u < 4096\n",
3884 btrfs_super_sectorsize(sb));
3885 ret = -EINVAL;
3886 }
3887
3865 if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_UUID_SIZE) != 0) { 3888 if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_UUID_SIZE) != 0) {
3866 printk(KERN_ERR "BTRFS: dev_item UUID does not match fsid: %pU != %pU\n", 3889 printk(KERN_ERR "BTRFS: dev_item UUID does not match fsid: %pU != %pU\n",
3867 fs_info->fsid, sb->dev_item.fsid); 3890 fs_info->fsid, sb->dev_item.fsid);
@@ -3875,6 +3898,10 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
3875 if (btrfs_super_num_devices(sb) > (1UL << 31)) 3898 if (btrfs_super_num_devices(sb) > (1UL << 31))
3876 printk(KERN_WARNING "BTRFS: suspicious number of devices: %llu\n", 3899 printk(KERN_WARNING "BTRFS: suspicious number of devices: %llu\n",
3877 btrfs_super_num_devices(sb)); 3900 btrfs_super_num_devices(sb));
3901 if (btrfs_super_num_devices(sb) == 0) {
3902 printk(KERN_ERR "BTRFS: number of devices is 0\n");
3903 ret = -EINVAL;
3904 }
3878 3905
3879 if (btrfs_super_bytenr(sb) != BTRFS_SUPER_INFO_OFFSET) { 3906 if (btrfs_super_bytenr(sb) != BTRFS_SUPER_INFO_OFFSET) {
3880 printk(KERN_ERR "BTRFS: super offset mismatch %llu != %u\n", 3907 printk(KERN_ERR "BTRFS: super offset mismatch %llu != %u\n",
@@ -3883,6 +3910,25 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
3883 } 3910 }
3884 3911
3885 /* 3912 /*
 3913 * Obvious sys_chunk_array corruption: it must hold at least one key
3914 * and one chunk
3915 */
3916 if (btrfs_super_sys_array_size(sb) > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
3917 printk(KERN_ERR "BTRFS: system chunk array too big %u > %u\n",
3918 btrfs_super_sys_array_size(sb),
3919 BTRFS_SYSTEM_CHUNK_ARRAY_SIZE);
3920 ret = -EINVAL;
3921 }
3922 if (btrfs_super_sys_array_size(sb) < sizeof(struct btrfs_disk_key)
3923 + sizeof(struct btrfs_chunk)) {
3924 printk(KERN_ERR "BTRFS: system chunk array too small %u < %lu\n",
3925 btrfs_super_sys_array_size(sb),
3926 sizeof(struct btrfs_disk_key)
3927 + sizeof(struct btrfs_chunk));
3928 ret = -EINVAL;
3929 }
3930
3931 /*
3886 * The generation is a global counter, we'll trust it more than the others 3932 * The generation is a global counter, we'll trust it more than the others
3887 * but it's still possible that it's the one that's wrong. 3933 * but it's still possible that it's the one that's wrong.
3888 */ 3934 */
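
The open-coded orphan lookup earlier in this file relies on btrfs_search_slot()'s return convention: negative on error, 0 when the exact key exists, positive when it does not. A sketch of a boolean-style wrapper built on that convention (the example_ name is illustrative):

static int example_key_exists(struct btrfs_root *root, struct btrfs_key *key)
{
	struct btrfs_path *path;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	/* <0: error; 0: exact match; >0: key absent, slot points past it. */
	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
	btrfs_free_path(path);
	if (ret < 0)
		return ret;
	return ret == 0;
}
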
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 414651821fb3..27d44c0fd236 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -46,11 +46,11 @@ struct btrfs_fs_devices;
46 46
47struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, 47struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
48 u64 parent_transid); 48 u64 parent_transid);
49void readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize); 49void readahead_tree_block(struct btrfs_root *root, u64 bytenr);
50int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize, 50int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr,
51 int mirror_num, struct extent_buffer **eb); 51 int mirror_num, struct extent_buffer **eb);
52struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, 52struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
53 u64 bytenr, u32 blocksize); 53 u64 bytenr);
54void clean_tree_block(struct btrfs_trans_handle *trans, 54void clean_tree_block(struct btrfs_trans_handle *trans,
55 struct btrfs_root *root, struct extent_buffer *buf); 55 struct btrfs_root *root, struct extent_buffer *buf);
56int open_ctree(struct super_block *sb, 56int open_ctree(struct super_block *sb,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index a80b97100d90..571f402d3fc4 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -74,8 +74,9 @@ enum {
74 RESERVE_ALLOC_NO_ACCOUNT = 2, 74 RESERVE_ALLOC_NO_ACCOUNT = 2,
75}; 75};
76 76
77static int update_block_group(struct btrfs_root *root, 77static int update_block_group(struct btrfs_trans_handle *trans,
78 u64 bytenr, u64 num_bytes, int alloc); 78 struct btrfs_root *root, u64 bytenr,
79 u64 num_bytes, int alloc);
79static int __btrfs_free_extent(struct btrfs_trans_handle *trans, 80static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
80 struct btrfs_root *root, 81 struct btrfs_root *root,
81 u64 bytenr, u64 num_bytes, u64 parent, 82 u64 bytenr, u64 num_bytes, u64 parent,
@@ -1925,7 +1926,7 @@ int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1925 */ 1926 */
1926 ret = 0; 1927 ret = 0;
1927 } 1928 }
1928 kfree(bbio); 1929 btrfs_put_bbio(bbio);
1929 } 1930 }
1930 1931
1931 if (actual_bytes) 1932 if (actual_bytes)
@@ -2768,7 +2769,6 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2768 struct btrfs_delayed_ref_head *head; 2769 struct btrfs_delayed_ref_head *head;
2769 int ret; 2770 int ret;
2770 int run_all = count == (unsigned long)-1; 2771 int run_all = count == (unsigned long)-1;
2771 int run_most = 0;
2772 2772
2773 /* We'll clean this up in btrfs_cleanup_transaction */ 2773 /* We'll clean this up in btrfs_cleanup_transaction */
2774 if (trans->aborted) 2774 if (trans->aborted)
@@ -2778,10 +2778,8 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2778 root = root->fs_info->tree_root; 2778 root = root->fs_info->tree_root;
2779 2779
2780 delayed_refs = &trans->transaction->delayed_refs; 2780 delayed_refs = &trans->transaction->delayed_refs;
2781 if (count == 0) { 2781 if (count == 0)
2782 count = atomic_read(&delayed_refs->num_entries) * 2; 2782 count = atomic_read(&delayed_refs->num_entries) * 2;
2783 run_most = 1;
2784 }
2785 2783
2786again: 2784again:
2787#ifdef SCRAMBLE_DELAYED_REFS 2785#ifdef SCRAMBLE_DELAYED_REFS
@@ -3139,9 +3137,11 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans,
3139 struct extent_buffer *leaf; 3137 struct extent_buffer *leaf;
3140 3138
3141 ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1); 3139 ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
3142 if (ret < 0) 3140 if (ret) {
3141 if (ret > 0)
3142 ret = -ENOENT;
3143 goto fail; 3143 goto fail;
3144 BUG_ON(ret); /* Corruption */ 3144 }
3145 3145
3146 leaf = path->nodes[0]; 3146 leaf = path->nodes[0];
3147 bi = btrfs_item_ptr_offset(leaf, path->slots[0]); 3147 bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
@@ -3149,11 +3149,9 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans,
3149 btrfs_mark_buffer_dirty(leaf); 3149 btrfs_mark_buffer_dirty(leaf);
3150 btrfs_release_path(path); 3150 btrfs_release_path(path);
3151fail: 3151fail:
3152 if (ret) { 3152 if (ret)
3153 btrfs_abort_transaction(trans, root, ret); 3153 btrfs_abort_transaction(trans, root, ret);
3154 return ret; 3154 return ret;
3155 }
3156 return 0;
3157 3155
3158} 3156}
3159 3157
@@ -3315,120 +3313,42 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
3315 struct btrfs_root *root) 3313 struct btrfs_root *root)
3316{ 3314{
3317 struct btrfs_block_group_cache *cache; 3315 struct btrfs_block_group_cache *cache;
3318 int err = 0; 3316 struct btrfs_transaction *cur_trans = trans->transaction;
3317 int ret = 0;
3319 struct btrfs_path *path; 3318 struct btrfs_path *path;
3320 u64 last = 0; 3319
3320 if (list_empty(&cur_trans->dirty_bgs))
3321 return 0;
3321 3322
3322 path = btrfs_alloc_path(); 3323 path = btrfs_alloc_path();
3323 if (!path) 3324 if (!path)
3324 return -ENOMEM; 3325 return -ENOMEM;
3325 3326
3326again: 3327 /*
3327 while (1) { 3328 * We don't need the lock here since we are protected by the transaction
3328 cache = btrfs_lookup_first_block_group(root->fs_info, last); 3329 * commit. We want to do the cache_save_setup first and then run the
3329 while (cache) { 3330 * delayed refs to make sure we have the best chance at doing this all
3330 if (cache->disk_cache_state == BTRFS_DC_CLEAR) 3331 * in one shot.
3331 break; 3332 */
3332 cache = next_block_group(root, cache); 3333 while (!list_empty(&cur_trans->dirty_bgs)) {
3333 } 3334 cache = list_first_entry(&cur_trans->dirty_bgs,
3334 if (!cache) { 3335 struct btrfs_block_group_cache,
3335 if (last == 0) 3336 dirty_list);
3336 break; 3337 list_del_init(&cache->dirty_list);
3337 last = 0; 3338 if (cache->disk_cache_state == BTRFS_DC_CLEAR)
3338 continue; 3339 cache_save_setup(cache, trans, path);
3339 } 3340 if (!ret)
3340 err = cache_save_setup(cache, trans, path); 3341 ret = btrfs_run_delayed_refs(trans, root,
3341 last = cache->key.objectid + cache->key.offset; 3342 (unsigned long) -1);
3342 btrfs_put_block_group(cache); 3343 if (!ret && cache->disk_cache_state == BTRFS_DC_SETUP)
3343 } 3344 btrfs_write_out_cache(root, trans, cache, path);
3344 3345 if (!ret)
3345 while (1) { 3346 ret = write_one_cache_group(trans, root, path, cache);
3346 if (last == 0) {
3347 err = btrfs_run_delayed_refs(trans, root,
3348 (unsigned long)-1);
3349 if (err) /* File system offline */
3350 goto out;
3351 }
3352
3353 cache = btrfs_lookup_first_block_group(root->fs_info, last);
3354 while (cache) {
3355 if (cache->disk_cache_state == BTRFS_DC_CLEAR) {
3356 btrfs_put_block_group(cache);
3357 goto again;
3358 }
3359
3360 if (cache->dirty)
3361 break;
3362 cache = next_block_group(root, cache);
3363 }
3364 if (!cache) {
3365 if (last == 0)
3366 break;
3367 last = 0;
3368 continue;
3369 }
3370
3371 if (cache->disk_cache_state == BTRFS_DC_SETUP)
3372 cache->disk_cache_state = BTRFS_DC_NEED_WRITE;
3373 cache->dirty = 0;
3374 last = cache->key.objectid + cache->key.offset;
3375
3376 err = write_one_cache_group(trans, root, path, cache);
3377 btrfs_put_block_group(cache);
3378 if (err) /* File system offline */
3379 goto out;
3380 }
3381
3382 while (1) {
3383 /*
3384 * I don't think this is needed since we're just marking our
3385 * preallocated extent as written, but just in case it can't
3386 * hurt.
3387 */
3388 if (last == 0) {
3389 err = btrfs_run_delayed_refs(trans, root,
3390 (unsigned long)-1);
3391 if (err) /* File system offline */
3392 goto out;
3393 }
3394
3395 cache = btrfs_lookup_first_block_group(root->fs_info, last);
3396 while (cache) {
3397 /*
3398 * Really this shouldn't happen, but it could if we
3399 * couldn't write the entire preallocated extent and
3400 * splitting the extent resulted in a new block.
3401 */
3402 if (cache->dirty) {
3403 btrfs_put_block_group(cache);
3404 goto again;
3405 }
3406 if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
3407 break;
3408 cache = next_block_group(root, cache);
3409 }
3410 if (!cache) {
3411 if (last == 0)
3412 break;
3413 last = 0;
3414 continue;
3415 }
3416
3417 err = btrfs_write_out_cache(root, trans, cache, path);
3418
3419 /*
3420 * If we didn't have an error then the cache state is still
3421 * NEED_WRITE, so we can set it to WRITTEN.
3422 */
3423 if (!err && cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
3424 cache->disk_cache_state = BTRFS_DC_WRITTEN;
3425 last = cache->key.objectid + cache->key.offset;
3426 btrfs_put_block_group(cache); 3347 btrfs_put_block_group(cache);
3427 } 3348 }
3428out:
3429 3349
3430 btrfs_free_path(path); 3350 btrfs_free_path(path);
3431 return err; 3351 return ret;
3432} 3352}
3433 3353
3434int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr) 3354int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
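
The rewrite turns three full scans of every block group into a single drain of the per-transaction dirty list; update_block_group() (further down) queues a group once and takes a reference that the drain loop drops. A minimal sketch of the drain idiom, with illustrative types:

#include <linux/list.h>

struct example_group {
	struct list_head dirty_list;
};

static void example_drain_dirty(struct list_head *dirty)
{
	struct example_group *g;

	while (!list_empty(dirty)) {
		g = list_first_entry(dirty, struct example_group, dirty_list);
		/* list_del_init() leaves the node safe to re-queue later. */
		list_del_init(&g->dirty_list);
		/* ... write the group out, then drop the list's reference. */
	}
}
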
@@ -5043,19 +4963,25 @@ void btrfs_subvolume_release_metadata(struct btrfs_root *root,
5043/** 4963/**
5044 * drop_outstanding_extent - drop an outstanding extent 4964 * drop_outstanding_extent - drop an outstanding extent
5045 * @inode: the inode we're dropping the extent for 4965 * @inode: the inode we're dropping the extent for
 4966 * @num_bytes: the number of bytes we're releasing.
5046 * 4967 *
5047 * This is called when we are freeing up an outstanding extent, either called 4968 * This is called when we are freeing up an outstanding extent, either called
5048 * after an error or after an extent is written. This will return the number of 4969 * after an error or after an extent is written. This will return the number of
5049 * reserved extents that need to be freed. This must be called with 4970 * reserved extents that need to be freed. This must be called with
5050 * BTRFS_I(inode)->lock held. 4971 * BTRFS_I(inode)->lock held.
5051 */ 4972 */
5052static unsigned drop_outstanding_extent(struct inode *inode) 4973static unsigned drop_outstanding_extent(struct inode *inode, u64 num_bytes)
5053{ 4974{
5054 unsigned drop_inode_space = 0; 4975 unsigned drop_inode_space = 0;
5055 unsigned dropped_extents = 0; 4976 unsigned dropped_extents = 0;
4977 unsigned num_extents = 0;
5056 4978
5057 BUG_ON(!BTRFS_I(inode)->outstanding_extents); 4979 num_extents = (unsigned)div64_u64(num_bytes +
5058 BTRFS_I(inode)->outstanding_extents--; 4980 BTRFS_MAX_EXTENT_SIZE - 1,
4981 BTRFS_MAX_EXTENT_SIZE);
4982 ASSERT(num_extents);
4983 ASSERT(BTRFS_I(inode)->outstanding_extents >= num_extents);
4984 BTRFS_I(inode)->outstanding_extents -= num_extents;
5059 4985
5060 if (BTRFS_I(inode)->outstanding_extents == 0 && 4986 if (BTRFS_I(inode)->outstanding_extents == 0 &&
5061 test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED, 4987 test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
@@ -5226,7 +5152,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
5226 5152
5227out_fail: 5153out_fail:
5228 spin_lock(&BTRFS_I(inode)->lock); 5154 spin_lock(&BTRFS_I(inode)->lock);
5229 dropped = drop_outstanding_extent(inode); 5155 dropped = drop_outstanding_extent(inode, num_bytes);
5230 /* 5156 /*
5231 * If the inodes csum_bytes is the same as the original 5157 * If the inodes csum_bytes is the same as the original
5232 * csum_bytes then we know we haven't raced with any free()ers 5158 * csum_bytes then we know we haven't raced with any free()ers
@@ -5305,7 +5231,7 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
5305 5231
5306 num_bytes = ALIGN(num_bytes, root->sectorsize); 5232 num_bytes = ALIGN(num_bytes, root->sectorsize);
5307 spin_lock(&BTRFS_I(inode)->lock); 5233 spin_lock(&BTRFS_I(inode)->lock);
5308 dropped = drop_outstanding_extent(inode); 5234 dropped = drop_outstanding_extent(inode, num_bytes);
5309 5235
5310 if (num_bytes) 5236 if (num_bytes)
5311 to_free = calc_csum_metadata_size(inode, num_bytes, 0); 5237 to_free = calc_csum_metadata_size(inode, num_bytes, 0);
@@ -5375,8 +5301,9 @@ void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
5375 btrfs_free_reserved_data_space(inode, num_bytes); 5301 btrfs_free_reserved_data_space(inode, num_bytes);
5376} 5302}
5377 5303
5378static int update_block_group(struct btrfs_root *root, 5304static int update_block_group(struct btrfs_trans_handle *trans,
5379 u64 bytenr, u64 num_bytes, int alloc) 5305 struct btrfs_root *root, u64 bytenr,
5306 u64 num_bytes, int alloc)
5380{ 5307{
5381 struct btrfs_block_group_cache *cache = NULL; 5308 struct btrfs_block_group_cache *cache = NULL;
5382 struct btrfs_fs_info *info = root->fs_info; 5309 struct btrfs_fs_info *info = root->fs_info;
@@ -5414,6 +5341,14 @@ static int update_block_group(struct btrfs_root *root,
5414 if (!alloc && cache->cached == BTRFS_CACHE_NO) 5341 if (!alloc && cache->cached == BTRFS_CACHE_NO)
5415 cache_block_group(cache, 1); 5342 cache_block_group(cache, 1);
5416 5343
5344 spin_lock(&trans->transaction->dirty_bgs_lock);
5345 if (list_empty(&cache->dirty_list)) {
5346 list_add_tail(&cache->dirty_list,
5347 &trans->transaction->dirty_bgs);
5348 btrfs_get_block_group(cache);
5349 }
5350 spin_unlock(&trans->transaction->dirty_bgs_lock);
5351
5417 byte_in_group = bytenr - cache->key.objectid; 5352 byte_in_group = bytenr - cache->key.objectid;
5418 WARN_ON(byte_in_group > cache->key.offset); 5353 WARN_ON(byte_in_group > cache->key.offset);
5419 5354
@@ -5424,7 +5359,6 @@ static int update_block_group(struct btrfs_root *root,
5424 cache->disk_cache_state < BTRFS_DC_CLEAR) 5359 cache->disk_cache_state < BTRFS_DC_CLEAR)
5425 cache->disk_cache_state = BTRFS_DC_CLEAR; 5360 cache->disk_cache_state = BTRFS_DC_CLEAR;
5426 5361
5427 cache->dirty = 1;
5428 old_val = btrfs_block_group_used(&cache->item); 5362 old_val = btrfs_block_group_used(&cache->item);
5429 num_bytes = min(total, cache->key.offset - byte_in_group); 5363 num_bytes = min(total, cache->key.offset - byte_in_group);
5430 if (alloc) { 5364 if (alloc) {
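
This hunk drops the cache->dirty flag (removed a few lines above) in favor of membership on the transaction's dirty_bgs list: a block group is queued at most once, guarded by the list_empty() check, and pinned with an extra reference while queued. A rough userspace model of that idempotent enqueue, with a pthread mutex standing in for dirty_bgs_lock and a plain counter for the block group reference:

#include <pthread.h>
#include <stdio.h>

struct node {
        struct node *prev, *next;   /* self-linked when off-list */
        int refs;
};

static pthread_mutex_t dirty_lock = PTHREAD_MUTEX_INITIALIZER;
static struct node dirty_bgs = { &dirty_bgs, &dirty_bgs, 0 };

static int list_empty(const struct node *n) { return n->next == n; }

static void list_add_tail(struct node *n, struct node *head)
{
        n->prev = head->prev;
        n->next = head;
        head->prev->next = n;
        head->prev = n;
}

/* Mirrors the update_block_group() hunk: enqueue once, take a ref. */
static void mark_dirty(struct node *cache)
{
        pthread_mutex_lock(&dirty_lock);
        if (list_empty(cache)) {
                list_add_tail(cache, &dirty_bgs);
                cache->refs++;          /* btrfs_get_block_group() */
        }
        pthread_mutex_unlock(&dirty_lock);
}

int main(void)
{
        struct node bg = { &bg, &bg, 1 };
        mark_dirty(&bg);
        mark_dirty(&bg);                /* second call is a no-op */
        printf("refs=%d queued=%d\n", bg.refs, !list_empty(&bg)); /* 2 1 */
        return 0;
}
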
@@ -5807,10 +5741,13 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
5807 unpin = &fs_info->freed_extents[0]; 5741 unpin = &fs_info->freed_extents[0];
5808 5742
5809 while (1) { 5743 while (1) {
5744 mutex_lock(&fs_info->unused_bg_unpin_mutex);
5810 ret = find_first_extent_bit(unpin, 0, &start, &end, 5745 ret = find_first_extent_bit(unpin, 0, &start, &end,
5811 EXTENT_DIRTY, NULL); 5746 EXTENT_DIRTY, NULL);
5812 if (ret) 5747 if (ret) {
5748 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
5813 break; 5749 break;
5750 }
5814 5751
5815 if (btrfs_test_opt(root, DISCARD)) 5752 if (btrfs_test_opt(root, DISCARD))
5816 ret = btrfs_discard_extent(root, start, 5753 ret = btrfs_discard_extent(root, start,
@@ -5818,6 +5755,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
5818 5755
5819 clear_extent_dirty(unpin, start, end, GFP_NOFS); 5756 clear_extent_dirty(unpin, start, end, GFP_NOFS);
5820 unpin_extent_range(root, start, end, true); 5757 unpin_extent_range(root, start, end, true);
5758 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
5821 cond_resched(); 5759 cond_resched();
5822 } 5760 }
5823 5761
@@ -6103,7 +6041,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
6103 } 6041 }
6104 } 6042 }
6105 6043
6106 ret = update_block_group(root, bytenr, num_bytes, 0); 6044 ret = update_block_group(trans, root, bytenr, num_bytes, 0);
6107 if (ret) { 6045 if (ret) {
6108 btrfs_abort_transaction(trans, extent_root, ret); 6046 btrfs_abort_transaction(trans, extent_root, ret);
6109 goto out; 6047 goto out;
@@ -6205,7 +6143,6 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
6205 struct extent_buffer *buf, 6143 struct extent_buffer *buf,
6206 u64 parent, int last_ref) 6144 u64 parent, int last_ref)
6207{ 6145{
6208 struct btrfs_block_group_cache *cache = NULL;
6209 int pin = 1; 6146 int pin = 1;
6210 int ret; 6147 int ret;
6211 6148
@@ -6221,17 +6158,20 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
6221 if (!last_ref) 6158 if (!last_ref)
6222 return; 6159 return;
6223 6160
6224 cache = btrfs_lookup_block_group(root->fs_info, buf->start);
6225
6226 if (btrfs_header_generation(buf) == trans->transid) { 6161 if (btrfs_header_generation(buf) == trans->transid) {
6162 struct btrfs_block_group_cache *cache;
6163
6227 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { 6164 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
6228 ret = check_ref_cleanup(trans, root, buf->start); 6165 ret = check_ref_cleanup(trans, root, buf->start);
6229 if (!ret) 6166 if (!ret)
6230 goto out; 6167 goto out;
6231 } 6168 }
6232 6169
6170 cache = btrfs_lookup_block_group(root->fs_info, buf->start);
6171
6233 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { 6172 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
6234 pin_down_extent(root, cache, buf->start, buf->len, 1); 6173 pin_down_extent(root, cache, buf->start, buf->len, 1);
6174 btrfs_put_block_group(cache);
6235 goto out; 6175 goto out;
6236 } 6176 }
6237 6177
@@ -6239,6 +6179,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
6239 6179
6240 btrfs_add_free_space(cache, buf->start, buf->len); 6180 btrfs_add_free_space(cache, buf->start, buf->len);
6241 btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE, 0); 6181 btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE, 0);
6182 btrfs_put_block_group(cache);
6242 trace_btrfs_reserved_extent_free(root, buf->start, buf->len); 6183 trace_btrfs_reserved_extent_free(root, buf->start, buf->len);
6243 pin = 0; 6184 pin = 0;
6244 } 6185 }
@@ -6253,7 +6194,6 @@ out:
6253 * anymore. 6194 * anymore.
6254 */ 6195 */
6255 clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags); 6196 clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
6256 btrfs_put_block_group(cache);
6257} 6197}
6258 6198
6259/* Can return -ENOMEM */ 6199/* Can return -ENOMEM */
@@ -7063,7 +7003,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
7063 if (ret) 7003 if (ret)
7064 return ret; 7004 return ret;
7065 7005
7066 ret = update_block_group(root, ins->objectid, ins->offset, 1); 7006 ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
7067 if (ret) { /* -ENOENT, logic error */ 7007 if (ret) { /* -ENOENT, logic error */
7068 btrfs_err(fs_info, "update block group failed for %llu %llu", 7008 btrfs_err(fs_info, "update block group failed for %llu %llu",
7069 ins->objectid, ins->offset); 7009 ins->objectid, ins->offset);
@@ -7152,7 +7092,8 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
7152 return ret; 7092 return ret;
7153 } 7093 }
7154 7094
7155 ret = update_block_group(root, ins->objectid, root->nodesize, 1); 7095 ret = update_block_group(trans, root, ins->objectid, root->nodesize,
7096 1);
7156 if (ret) { /* -ENOENT, logic error */ 7097 if (ret) { /* -ENOENT, logic error */
7157 btrfs_err(fs_info, "update block group failed for %llu %llu", 7098 btrfs_err(fs_info, "update block group failed for %llu %llu",
7158 ins->objectid, ins->offset); 7099 ins->objectid, ins->offset);
@@ -7217,11 +7158,11 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
7217 7158
7218static struct extent_buffer * 7159static struct extent_buffer *
7219btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, 7160btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
7220 u64 bytenr, u32 blocksize, int level) 7161 u64 bytenr, int level)
7221{ 7162{
7222 struct extent_buffer *buf; 7163 struct extent_buffer *buf;
7223 7164
7224 buf = btrfs_find_create_tree_block(root, bytenr, blocksize); 7165 buf = btrfs_find_create_tree_block(root, bytenr);
7225 if (!buf) 7166 if (!buf)
7226 return ERR_PTR(-ENOMEM); 7167 return ERR_PTR(-ENOMEM);
7227 btrfs_set_header_generation(buf, trans->transid); 7168 btrfs_set_header_generation(buf, trans->transid);
@@ -7340,7 +7281,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
7340 7281
7341 if (btrfs_test_is_dummy_root(root)) { 7282 if (btrfs_test_is_dummy_root(root)) {
7342 buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr, 7283 buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
7343 blocksize, level); 7284 level);
7344 if (!IS_ERR(buf)) 7285 if (!IS_ERR(buf))
7345 root->alloc_bytenr += blocksize; 7286 root->alloc_bytenr += blocksize;
7346 return buf; 7287 return buf;
@@ -7357,8 +7298,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
7357 return ERR_PTR(ret); 7298 return ERR_PTR(ret);
7358 } 7299 }
7359 7300
7360 buf = btrfs_init_new_buffer(trans, root, ins.objectid, 7301 buf = btrfs_init_new_buffer(trans, root, ins.objectid, level);
7361 blocksize, level);
7362 BUG_ON(IS_ERR(buf)); /* -ENOMEM */ 7302 BUG_ON(IS_ERR(buf)); /* -ENOMEM */
7363 7303
7364 if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { 7304 if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
@@ -7487,7 +7427,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
7487 continue; 7427 continue;
7488 } 7428 }
7489reada: 7429reada:
7490 readahead_tree_block(root, bytenr, blocksize); 7430 readahead_tree_block(root, bytenr);
7491 nread++; 7431 nread++;
7492 } 7432 }
7493 wc->reada_slot = slot; 7433 wc->reada_slot = slot;
@@ -7828,7 +7768,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
7828 7768
7829 next = btrfs_find_tree_block(root, bytenr); 7769 next = btrfs_find_tree_block(root, bytenr);
7830 if (!next) { 7770 if (!next) {
7831 next = btrfs_find_create_tree_block(root, bytenr, blocksize); 7771 next = btrfs_find_create_tree_block(root, bytenr);
7832 if (!next) 7772 if (!next)
7833 return -ENOMEM; 7773 return -ENOMEM;
7834 btrfs_set_buffer_lockdep_class(root->root_key.objectid, next, 7774 btrfs_set_buffer_lockdep_class(root->root_key.objectid, next,
@@ -8548,14 +8488,6 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
8548 if (IS_ERR(trans)) 8488 if (IS_ERR(trans))
8549 return PTR_ERR(trans); 8489 return PTR_ERR(trans);
8550 8490
8551 alloc_flags = update_block_group_flags(root, cache->flags);
8552 if (alloc_flags != cache->flags) {
8553 ret = do_chunk_alloc(trans, root, alloc_flags,
8554 CHUNK_ALLOC_FORCE);
8555 if (ret < 0)
8556 goto out;
8557 }
8558
8559 ret = set_block_group_ro(cache, 0); 8491 ret = set_block_group_ro(cache, 0);
8560 if (!ret) 8492 if (!ret)
8561 goto out; 8493 goto out;
@@ -8566,6 +8498,11 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
8566 goto out; 8498 goto out;
8567 ret = set_block_group_ro(cache, 0); 8499 ret = set_block_group_ro(cache, 0);
8568out: 8500out:
8501 if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
8502 alloc_flags = update_block_group_flags(root, cache->flags);
8503 check_system_chunk(trans, root, alloc_flags);
8504 }
8505
8569 btrfs_end_transaction(trans, root); 8506 btrfs_end_transaction(trans, root);
8570 return ret; 8507 return ret;
8571} 8508}
@@ -9005,6 +8942,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
9005 INIT_LIST_HEAD(&cache->cluster_list); 8942 INIT_LIST_HEAD(&cache->cluster_list);
9006 INIT_LIST_HEAD(&cache->bg_list); 8943 INIT_LIST_HEAD(&cache->bg_list);
9007 INIT_LIST_HEAD(&cache->ro_list); 8944 INIT_LIST_HEAD(&cache->ro_list);
8945 INIT_LIST_HEAD(&cache->dirty_list);
9008 btrfs_init_free_space_ctl(cache); 8946 btrfs_init_free_space_ctl(cache);
9009 atomic_set(&cache->trimming, 0); 8947 atomic_set(&cache->trimming, 0);
9010 8948
@@ -9068,9 +9006,8 @@ int btrfs_read_block_groups(struct btrfs_root *root)
9068 * b) Setting 'dirty flag' makes sure that we flush 9006 * b) Setting 'dirty flag' makes sure that we flush
9069 * the new space cache info onto disk. 9007 * the new space cache info onto disk.
9070 */ 9008 */
9071 cache->disk_cache_state = BTRFS_DC_CLEAR;
9072 if (btrfs_test_opt(root, SPACE_CACHE)) 9009 if (btrfs_test_opt(root, SPACE_CACHE))
9073 cache->dirty = 1; 9010 cache->disk_cache_state = BTRFS_DC_CLEAR;
9074 } 9011 }
9075 9012
9076 read_extent_buffer(leaf, &cache->item, 9013 read_extent_buffer(leaf, &cache->item,
@@ -9422,7 +9359,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
9422 * are still on the list after taking the semaphore 9359 * are still on the list after taking the semaphore
9423 */ 9360 */
9424 list_del_init(&block_group->list); 9361 list_del_init(&block_group->list);
9425 list_del_init(&block_group->ro_list);
9426 if (list_empty(&block_group->space_info->block_groups[index])) { 9362 if (list_empty(&block_group->space_info->block_groups[index])) {
9427 kobj = block_group->space_info->block_group_kobjs[index]; 9363 kobj = block_group->space_info->block_group_kobjs[index];
9428 block_group->space_info->block_group_kobjs[index] = NULL; 9364 block_group->space_info->block_group_kobjs[index] = NULL;
@@ -9461,9 +9397,17 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
9461 } 9397 }
9462 } 9398 }
9463 9399
9400 spin_lock(&trans->transaction->dirty_bgs_lock);
9401 if (!list_empty(&block_group->dirty_list)) {
9402 list_del_init(&block_group->dirty_list);
9403 btrfs_put_block_group(block_group);
9404 }
9405 spin_unlock(&trans->transaction->dirty_bgs_lock);
9406
9464 btrfs_remove_free_space_cache(block_group); 9407 btrfs_remove_free_space_cache(block_group);
9465 9408
9466 spin_lock(&block_group->space_info->lock); 9409 spin_lock(&block_group->space_info->lock);
9410 list_del_init(&block_group->ro_list);
9467 block_group->space_info->total_bytes -= block_group->key.offset; 9411 block_group->space_info->total_bytes -= block_group->key.offset;
9468 block_group->space_info->bytes_readonly -= block_group->key.offset; 9412 block_group->space_info->bytes_readonly -= block_group->key.offset;
9469 block_group->space_info->disk_total -= block_group->key.offset * factor; 9413 block_group->space_info->disk_total -= block_group->key.offset * factor;
@@ -9611,7 +9555,8 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
9611 * Want to do this before we do anything else so we can recover 9555 * Want to do this before we do anything else so we can recover
9612 * properly if we fail to join the transaction. 9556 * properly if we fail to join the transaction.
9613 */ 9557 */
9614 trans = btrfs_join_transaction(root); 9558 /* 1 for btrfs_orphan_reserve_metadata() */
9559 trans = btrfs_start_transaction(root, 1);
9615 if (IS_ERR(trans)) { 9560 if (IS_ERR(trans)) {
9616 btrfs_set_block_group_rw(root, block_group); 9561 btrfs_set_block_group_rw(root, block_group);
9617 ret = PTR_ERR(trans); 9562 ret = PTR_ERR(trans);
@@ -9624,18 +9569,33 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
9624 */ 9569 */
9625 start = block_group->key.objectid; 9570 start = block_group->key.objectid;
9626 end = start + block_group->key.offset - 1; 9571 end = start + block_group->key.offset - 1;
9572 /*
9573 * Hold the unused_bg_unpin_mutex lock to avoid racing with
9574 * btrfs_finish_extent_commit(). If we are at transaction N,
9575 * another task might be running finish_extent_commit() for the
9576 * previous transaction N - 1, and have seen a range belonging
9577 * to the block group in freed_extents[] before we were able to
9578 * clear the whole block group range from freed_extents[]. This
9579 * means that task can look up the block group after we
9580 * unpinned it from freed_extents[] and removed it, leading to
9581 * a BUG_ON() at btrfs_unpin_extent_range().
9582 */
9583 mutex_lock(&fs_info->unused_bg_unpin_mutex);
9627 ret = clear_extent_bits(&fs_info->freed_extents[0], start, end, 9584 ret = clear_extent_bits(&fs_info->freed_extents[0], start, end,
9628 EXTENT_DIRTY, GFP_NOFS); 9585 EXTENT_DIRTY, GFP_NOFS);
9629 if (ret) { 9586 if (ret) {
9587 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
9630 btrfs_set_block_group_rw(root, block_group); 9588 btrfs_set_block_group_rw(root, block_group);
9631 goto end_trans; 9589 goto end_trans;
9632 } 9590 }
9633 ret = clear_extent_bits(&fs_info->freed_extents[1], start, end, 9591 ret = clear_extent_bits(&fs_info->freed_extents[1], start, end,
9634 EXTENT_DIRTY, GFP_NOFS); 9592 EXTENT_DIRTY, GFP_NOFS);
9635 if (ret) { 9593 if (ret) {
9594 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
9636 btrfs_set_block_group_rw(root, block_group); 9595 btrfs_set_block_group_rw(root, block_group);
9637 goto end_trans; 9596 goto end_trans;
9638 } 9597 }
9598 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
9639 9599
9640 /* Reset pinned so btrfs_put_block_group doesn't complain */ 9600 /* Reset pinned so btrfs_put_block_group doesn't complain */
9641 block_group->pinned = 0; 9601 block_group->pinned = 0;
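
Per the comment above, both paths that touch freed_extents[] now serialize on unused_bg_unpin_mutex: the commit path unpins one range per lock hold, while the cleaner clears a whole block group's range in one critical section, so the cleaner can never observe a half-cleared group. A compressed two-thread sketch of just the locking shape (everything inside the critical sections is a placeholder):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t unused_bg_unpin_mutex = PTHREAD_MUTEX_INITIALIZER;

/* btrfs_finish_extent_commit(): one unpinned range per lock hold */
static void *finish_extent_commit(void *arg)
{
        (void)arg;
        pthread_mutex_lock(&unused_bg_unpin_mutex);
        puts("commit: find_first_extent_bit + unpin_extent_range");
        pthread_mutex_unlock(&unused_bg_unpin_mutex);
        return NULL;
}

/* btrfs_delete_unused_bgs(): clear the whole group's range atomically */
static void *delete_unused_bg(void *arg)
{
        (void)arg;
        pthread_mutex_lock(&unused_bg_unpin_mutex);
        puts("cleaner: clear_extent_bits on freed_extents[0] and [1]");
        pthread_mutex_unlock(&unused_bg_unpin_mutex);
        return NULL;
}

int main(void)
{
        pthread_t a, b;
        pthread_create(&a, NULL, finish_extent_commit, NULL);
        pthread_create(&b, NULL, delete_unused_bg, NULL);
        pthread_join(a, NULL);
        pthread_join(b, NULL);
        return 0;
}
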
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 4ebabd237153..c7233ff1d533 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -64,7 +64,7 @@ void btrfs_leak_debug_check(void)
64 64
65 while (!list_empty(&states)) { 65 while (!list_empty(&states)) {
66 state = list_entry(states.next, struct extent_state, leak_list); 66 state = list_entry(states.next, struct extent_state, leak_list);
67 pr_err("BTRFS: state leak: start %llu end %llu state %lu in tree %d refs %d\n", 67 pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n",
68 state->start, state->end, state->state, 68 state->start, state->end, state->state,
69 extent_state_in_tree(state), 69 extent_state_in_tree(state),
70 atomic_read(&state->refs)); 70 atomic_read(&state->refs));
@@ -396,21 +396,21 @@ static void merge_state(struct extent_io_tree *tree,
396} 396}
397 397
398static void set_state_cb(struct extent_io_tree *tree, 398static void set_state_cb(struct extent_io_tree *tree,
399 struct extent_state *state, unsigned long *bits) 399 struct extent_state *state, unsigned *bits)
400{ 400{
401 if (tree->ops && tree->ops->set_bit_hook) 401 if (tree->ops && tree->ops->set_bit_hook)
402 tree->ops->set_bit_hook(tree->mapping->host, state, bits); 402 tree->ops->set_bit_hook(tree->mapping->host, state, bits);
403} 403}
404 404
405static void clear_state_cb(struct extent_io_tree *tree, 405static void clear_state_cb(struct extent_io_tree *tree,
406 struct extent_state *state, unsigned long *bits) 406 struct extent_state *state, unsigned *bits)
407{ 407{
408 if (tree->ops && tree->ops->clear_bit_hook) 408 if (tree->ops && tree->ops->clear_bit_hook)
409 tree->ops->clear_bit_hook(tree->mapping->host, state, bits); 409 tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
410} 410}
411 411
412static void set_state_bits(struct extent_io_tree *tree, 412static void set_state_bits(struct extent_io_tree *tree,
413 struct extent_state *state, unsigned long *bits); 413 struct extent_state *state, unsigned *bits);
414 414
415/* 415/*
416 * insert an extent_state struct into the tree. 'bits' are set on the 416 * insert an extent_state struct into the tree. 'bits' are set on the
@@ -426,7 +426,7 @@ static int insert_state(struct extent_io_tree *tree,
426 struct extent_state *state, u64 start, u64 end, 426 struct extent_state *state, u64 start, u64 end,
427 struct rb_node ***p, 427 struct rb_node ***p,
428 struct rb_node **parent, 428 struct rb_node **parent,
429 unsigned long *bits) 429 unsigned *bits)
430{ 430{
431 struct rb_node *node; 431 struct rb_node *node;
432 432
@@ -511,10 +511,10 @@ static struct extent_state *next_state(struct extent_state *state)
511 */ 511 */
512static struct extent_state *clear_state_bit(struct extent_io_tree *tree, 512static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
513 struct extent_state *state, 513 struct extent_state *state,
514 unsigned long *bits, int wake) 514 unsigned *bits, int wake)
515{ 515{
516 struct extent_state *next; 516 struct extent_state *next;
517 unsigned long bits_to_clear = *bits & ~EXTENT_CTLBITS; 517 unsigned bits_to_clear = *bits & ~EXTENT_CTLBITS;
518 518
519 if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { 519 if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
520 u64 range = state->end - state->start + 1; 520 u64 range = state->end - state->start + 1;
@@ -570,7 +570,7 @@ static void extent_io_tree_panic(struct extent_io_tree *tree, int err)
570 * This takes the tree lock, and returns 0 on success and < 0 on error. 570 * This takes the tree lock, and returns 0 on success and < 0 on error.
571 */ 571 */
572int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 572int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
573 unsigned long bits, int wake, int delete, 573 unsigned bits, int wake, int delete,
574 struct extent_state **cached_state, 574 struct extent_state **cached_state,
575 gfp_t mask) 575 gfp_t mask)
576{ 576{
@@ -789,9 +789,9 @@ out:
789 789
790static void set_state_bits(struct extent_io_tree *tree, 790static void set_state_bits(struct extent_io_tree *tree,
791 struct extent_state *state, 791 struct extent_state *state,
792 unsigned long *bits) 792 unsigned *bits)
793{ 793{
794 unsigned long bits_to_set = *bits & ~EXTENT_CTLBITS; 794 unsigned bits_to_set = *bits & ~EXTENT_CTLBITS;
795 795
796 set_state_cb(tree, state, bits); 796 set_state_cb(tree, state, bits);
797 if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { 797 if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
@@ -803,7 +803,7 @@ static void set_state_bits(struct extent_io_tree *tree,
803 803
804static void cache_state_if_flags(struct extent_state *state, 804static void cache_state_if_flags(struct extent_state *state,
805 struct extent_state **cached_ptr, 805 struct extent_state **cached_ptr,
806 const u64 flags) 806 unsigned flags)
807{ 807{
808 if (cached_ptr && !(*cached_ptr)) { 808 if (cached_ptr && !(*cached_ptr)) {
809 if (!flags || (state->state & flags)) { 809 if (!flags || (state->state & flags)) {
@@ -833,7 +833,7 @@ static void cache_state(struct extent_state *state,
833 833
834static int __must_check 834static int __must_check
835__set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 835__set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
836 unsigned long bits, unsigned long exclusive_bits, 836 unsigned bits, unsigned exclusive_bits,
837 u64 *failed_start, struct extent_state **cached_state, 837 u64 *failed_start, struct extent_state **cached_state,
838 gfp_t mask) 838 gfp_t mask)
839{ 839{
@@ -1034,7 +1034,7 @@ search_again:
1034} 1034}
1035 1035
1036int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 1036int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
1037 unsigned long bits, u64 * failed_start, 1037 unsigned bits, u64 * failed_start,
1038 struct extent_state **cached_state, gfp_t mask) 1038 struct extent_state **cached_state, gfp_t mask)
1039{ 1039{
1040 return __set_extent_bit(tree, start, end, bits, 0, failed_start, 1040 return __set_extent_bit(tree, start, end, bits, 0, failed_start,
@@ -1060,7 +1060,7 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
1060 * boundary bits like LOCK. 1060 * boundary bits like LOCK.
1061 */ 1061 */
1062int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 1062int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
1063 unsigned long bits, unsigned long clear_bits, 1063 unsigned bits, unsigned clear_bits,
1064 struct extent_state **cached_state, gfp_t mask) 1064 struct extent_state **cached_state, gfp_t mask)
1065{ 1065{
1066 struct extent_state *state; 1066 struct extent_state *state;
@@ -1268,14 +1268,14 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
1268} 1268}
1269 1269
1270int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 1270int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1271 unsigned long bits, gfp_t mask) 1271 unsigned bits, gfp_t mask)
1272{ 1272{
1273 return set_extent_bit(tree, start, end, bits, NULL, 1273 return set_extent_bit(tree, start, end, bits, NULL,
1274 NULL, mask); 1274 NULL, mask);
1275} 1275}
1276 1276
1277int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 1277int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1278 unsigned long bits, gfp_t mask) 1278 unsigned bits, gfp_t mask)
1279{ 1279{
1280 return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask); 1280 return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask);
1281} 1281}
@@ -1330,10 +1330,11 @@ int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
1330 * us if waiting is desired. 1330 * us if waiting is desired.
1331 */ 1331 */
1332int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 1332int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
1333 unsigned long bits, struct extent_state **cached_state) 1333 unsigned bits, struct extent_state **cached_state)
1334{ 1334{
1335 int err; 1335 int err;
1336 u64 failed_start; 1336 u64 failed_start;
1337
1337 while (1) { 1338 while (1) {
1338 err = __set_extent_bit(tree, start, end, EXTENT_LOCKED | bits, 1339 err = __set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
1339 EXTENT_LOCKED, &failed_start, 1340 EXTENT_LOCKED, &failed_start,
@@ -1407,8 +1408,8 @@ int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
1407 while (index <= end_index) { 1408 while (index <= end_index) {
1408 page = find_get_page(inode->i_mapping, index); 1409 page = find_get_page(inode->i_mapping, index);
1409 BUG_ON(!page); /* Pages should be in the extent_io_tree */ 1410 BUG_ON(!page); /* Pages should be in the extent_io_tree */
1410 account_page_redirty(page);
1411 __set_page_dirty_nobuffers(page); 1411 __set_page_dirty_nobuffers(page);
1412 account_page_redirty(page);
1412 page_cache_release(page); 1413 page_cache_release(page);
1413 index++; 1414 index++;
1414 } 1415 }
@@ -1440,7 +1441,7 @@ static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
1440 */ 1441 */
1441static struct extent_state * 1442static struct extent_state *
1442find_first_extent_bit_state(struct extent_io_tree *tree, 1443find_first_extent_bit_state(struct extent_io_tree *tree,
1443 u64 start, unsigned long bits) 1444 u64 start, unsigned bits)
1444{ 1445{
1445 struct rb_node *node; 1446 struct rb_node *node;
1446 struct extent_state *state; 1447 struct extent_state *state;
@@ -1474,7 +1475,7 @@ out:
1474 * If nothing was found, 1 is returned. If found something, return 0. 1475 * If nothing was found, 1 is returned. If found something, return 0.
1475 */ 1476 */
1476int find_first_extent_bit(struct extent_io_tree *tree, u64 start, 1477int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
1477 u64 *start_ret, u64 *end_ret, unsigned long bits, 1478 u64 *start_ret, u64 *end_ret, unsigned bits,
1478 struct extent_state **cached_state) 1479 struct extent_state **cached_state)
1479{ 1480{
1480 struct extent_state *state; 1481 struct extent_state *state;
@@ -1753,7 +1754,7 @@ out_failed:
1753 1754
1754int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, 1755int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
1755 struct page *locked_page, 1756 struct page *locked_page,
1756 unsigned long clear_bits, 1757 unsigned clear_bits,
1757 unsigned long page_ops) 1758 unsigned long page_ops)
1758{ 1759{
1759 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; 1760 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
@@ -1810,7 +1811,7 @@ int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
1810 */ 1811 */
1811u64 count_range_bits(struct extent_io_tree *tree, 1812u64 count_range_bits(struct extent_io_tree *tree,
1812 u64 *start, u64 search_end, u64 max_bytes, 1813 u64 *start, u64 search_end, u64 max_bytes,
1813 unsigned long bits, int contig) 1814 unsigned bits, int contig)
1814{ 1815{
1815 struct rb_node *node; 1816 struct rb_node *node;
1816 struct extent_state *state; 1817 struct extent_state *state;
@@ -1928,7 +1929,7 @@ out:
1928 * range is found set. 1929 * range is found set.
1929 */ 1930 */
1930int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, 1931int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
1931 unsigned long bits, int filled, struct extent_state *cached) 1932 unsigned bits, int filled, struct extent_state *cached)
1932{ 1933{
1933 struct extent_state *state = NULL; 1934 struct extent_state *state = NULL;
1934 struct rb_node *node; 1935 struct rb_node *node;
@@ -2057,7 +2058,7 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical,
2057 sector = bbio->stripes[mirror_num-1].physical >> 9; 2058 sector = bbio->stripes[mirror_num-1].physical >> 9;
2058 bio->bi_iter.bi_sector = sector; 2059 bio->bi_iter.bi_sector = sector;
2059 dev = bbio->stripes[mirror_num-1].dev; 2060 dev = bbio->stripes[mirror_num-1].dev;
2060 kfree(bbio); 2061 btrfs_put_bbio(bbio);
2061 if (!dev || !dev->bdev || !dev->writeable) { 2062 if (!dev || !dev->bdev || !dev->writeable) {
2062 bio_put(bio); 2063 bio_put(bio);
2063 return -EIO; 2064 return -EIO;
@@ -2190,7 +2191,7 @@ void btrfs_free_io_failure_record(struct inode *inode, u64 start, u64 end)
2190 2191
2191 next = next_state(state); 2192 next = next_state(state);
2192 2193
2193 failrec = (struct io_failure_record *)state->private; 2194 failrec = (struct io_failure_record *)(unsigned long)state->private;
2194 free_extent_state(state); 2195 free_extent_state(state);
2195 kfree(failrec); 2196 kfree(failrec);
2196 2197
@@ -2816,8 +2817,10 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
2816 bio_add_page(bio, page, page_size, offset) < page_size) { 2817 bio_add_page(bio, page, page_size, offset) < page_size) {
2817 ret = submit_one_bio(rw, bio, mirror_num, 2818 ret = submit_one_bio(rw, bio, mirror_num,
2818 prev_bio_flags); 2819 prev_bio_flags);
2819 if (ret < 0) 2820 if (ret < 0) {
2821 *bio_ret = NULL;
2820 return ret; 2822 return ret;
2823 }
2821 bio = NULL; 2824 bio = NULL;
2822 } else { 2825 } else {
2823 return 0; 2826 return 0;
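
Nulling *bio_ret on the failure path matters because submit_one_bio() consumes the bio either way; leaving the caller's cached pointer intact would invite a use-after-free on retry. A self-contained sketch of that out-parameter hygiene (names are illustrative, not btrfs APIs):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

struct bio { int dummy; };

/* Consumes the bio on success *and* failure, like submit_one_bio(). */
static int submit_one(struct bio *bio, int fail)
{
        free(bio);
        return fail ? -EIO : 0;
}

static int submit_page(struct bio **bio_ret, int fail)
{
        if (*bio_ret) {
                int ret = submit_one(*bio_ret, fail);
                *bio_ret = NULL;        /* never leave it dangling */
                if (ret < 0)
                        return ret;
        }
        *bio_ret = malloc(sizeof(**bio_ret));   /* start a fresh bio */
        return *bio_ret ? 0 : -ENOMEM;
}

int main(void)
{
        struct bio *cached = malloc(sizeof(*cached));
        int ret = submit_page(&cached, 1);
        printf("ret=%d cached=%p\n", ret, (void *)cached); /* cached == NULL */
        free(cached);   /* safe: free(NULL) is a no-op */
        return 0;
}
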
@@ -3239,7 +3242,7 @@ static noinline_for_stack int writepage_delalloc(struct inode *inode,
3239 page, 3242 page,
3240 &delalloc_start, 3243 &delalloc_start,
3241 &delalloc_end, 3244 &delalloc_end,
3242 128 * 1024 * 1024); 3245 BTRFS_MAX_EXTENT_SIZE);
3243 if (nr_delalloc == 0) { 3246 if (nr_delalloc == 0) {
3244 delalloc_start = delalloc_end + 1; 3247 delalloc_start = delalloc_end + 1;
3245 continue; 3248 continue;
@@ -4598,11 +4601,11 @@ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
4598 4601
4599static struct extent_buffer * 4602static struct extent_buffer *
4600__alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, 4603__alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start,
4601 unsigned long len, gfp_t mask) 4604 unsigned long len)
4602{ 4605{
4603 struct extent_buffer *eb = NULL; 4606 struct extent_buffer *eb = NULL;
4604 4607
4605 eb = kmem_cache_zalloc(extent_buffer_cache, mask); 4608 eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS);
4606 if (eb == NULL) 4609 if (eb == NULL)
4607 return NULL; 4610 return NULL;
4608 eb->start = start; 4611 eb->start = start;
@@ -4643,7 +4646,7 @@ struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src)
4643 struct extent_buffer *new; 4646 struct extent_buffer *new;
4644 unsigned long num_pages = num_extent_pages(src->start, src->len); 4647 unsigned long num_pages = num_extent_pages(src->start, src->len);
4645 4648
4646 new = __alloc_extent_buffer(NULL, src->start, src->len, GFP_NOFS); 4649 new = __alloc_extent_buffer(src->fs_info, src->start, src->len);
4647 if (new == NULL) 4650 if (new == NULL)
4648 return NULL; 4651 return NULL;
4649 4652
@@ -4666,13 +4669,26 @@ struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src)
4666 return new; 4669 return new;
4667} 4670}
4668 4671
4669struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len) 4672struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
4673 u64 start)
4670{ 4674{
4671 struct extent_buffer *eb; 4675 struct extent_buffer *eb;
4672 unsigned long num_pages = num_extent_pages(0, len); 4676 unsigned long len;
4677 unsigned long num_pages;
4673 unsigned long i; 4678 unsigned long i;
4674 4679
4675 eb = __alloc_extent_buffer(NULL, start, len, GFP_NOFS); 4680 if (!fs_info) {
4681 /*
4682 * Called only from tests that don't always have a fs_info
4683 * available, but we know that nodesize is 4096
4684 */
4685 len = 4096;
4686 } else {
4687 len = fs_info->tree_root->nodesize;
4688 }
4689 num_pages = num_extent_pages(0, len);
4690
4691 eb = __alloc_extent_buffer(fs_info, start, len);
4676 if (!eb) 4692 if (!eb)
4677 return NULL; 4693 return NULL;
4678 4694
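
The length argument is gone: the buffer size now comes from the nodesize, with a hardcoded 4096 fallback for the sanity tests that pass a NULL fs_info. The shape of that fallback, collapsing fs_info->tree_root->nodesize to a single field for brevity:

#include <stdio.h>
#include <stdint.h>

struct fs_info { uint32_t nodesize; };

/* Length selection from alloc_dummy_extent_buffer(): tests may pass
 * a NULL fs_info, in which case a 4K nodesize is assumed. */
static uint32_t dummy_eb_len(const struct fs_info *fs_info)
{
        return fs_info ? fs_info->nodesize : 4096;
}

int main(void)
{
        struct fs_info fi = { .nodesize = 16384 };
        printf("%u %u\n", dummy_eb_len(&fi), dummy_eb_len(NULL)); /* 16384 4096 */
        return 0;
}
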
@@ -4762,7 +4778,7 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
4762 4778
4763#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS 4779#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
4764struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, 4780struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
4765 u64 start, unsigned long len) 4781 u64 start)
4766{ 4782{
4767 struct extent_buffer *eb, *exists = NULL; 4783 struct extent_buffer *eb, *exists = NULL;
4768 int ret; 4784 int ret;
@@ -4770,7 +4786,7 @@ struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
4770 eb = find_extent_buffer(fs_info, start); 4786 eb = find_extent_buffer(fs_info, start);
4771 if (eb) 4787 if (eb)
4772 return eb; 4788 return eb;
4773 eb = alloc_dummy_extent_buffer(start, len); 4789 eb = alloc_dummy_extent_buffer(fs_info, start);
4774 if (!eb) 4790 if (!eb)
4775 return NULL; 4791 return NULL;
4776 eb->fs_info = fs_info; 4792 eb->fs_info = fs_info;
@@ -4808,8 +4824,9 @@ free_eb:
4808#endif 4824#endif
4809 4825
4810struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, 4826struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
4811 u64 start, unsigned long len) 4827 u64 start)
4812{ 4828{
4829 unsigned long len = fs_info->tree_root->nodesize;
4813 unsigned long num_pages = num_extent_pages(start, len); 4830 unsigned long num_pages = num_extent_pages(start, len);
4814 unsigned long i; 4831 unsigned long i;
4815 unsigned long index = start >> PAGE_CACHE_SHIFT; 4832 unsigned long index = start >> PAGE_CACHE_SHIFT;
@@ -4824,7 +4841,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
4824 if (eb) 4841 if (eb)
4825 return eb; 4842 return eb;
4826 4843
4827 eb = __alloc_extent_buffer(fs_info, start, len, GFP_NOFS); 4844 eb = __alloc_extent_buffer(fs_info, start, len);
4828 if (!eb) 4845 if (!eb)
4829 return NULL; 4846 return NULL;
4830 4847
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index ece9ce87edff..695b0ccfb755 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -4,22 +4,22 @@
4#include <linux/rbtree.h> 4#include <linux/rbtree.h>
5 5
6/* bits for the extent state */ 6/* bits for the extent state */
7#define EXTENT_DIRTY 1 7#define EXTENT_DIRTY (1U << 0)
8#define EXTENT_WRITEBACK (1 << 1) 8#define EXTENT_WRITEBACK (1U << 1)
9#define EXTENT_UPTODATE (1 << 2) 9#define EXTENT_UPTODATE (1U << 2)
10#define EXTENT_LOCKED (1 << 3) 10#define EXTENT_LOCKED (1U << 3)
11#define EXTENT_NEW (1 << 4) 11#define EXTENT_NEW (1U << 4)
12#define EXTENT_DELALLOC (1 << 5) 12#define EXTENT_DELALLOC (1U << 5)
13#define EXTENT_DEFRAG (1 << 6) 13#define EXTENT_DEFRAG (1U << 6)
14#define EXTENT_BOUNDARY (1 << 9) 14#define EXTENT_BOUNDARY (1U << 9)
15#define EXTENT_NODATASUM (1 << 10) 15#define EXTENT_NODATASUM (1U << 10)
16#define EXTENT_DO_ACCOUNTING (1 << 11) 16#define EXTENT_DO_ACCOUNTING (1U << 11)
17#define EXTENT_FIRST_DELALLOC (1 << 12) 17#define EXTENT_FIRST_DELALLOC (1U << 12)
18#define EXTENT_NEED_WAIT (1 << 13) 18#define EXTENT_NEED_WAIT (1U << 13)
19#define EXTENT_DAMAGED (1 << 14) 19#define EXTENT_DAMAGED (1U << 14)
20#define EXTENT_NORESERVE (1 << 15) 20#define EXTENT_NORESERVE (1U << 15)
21#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) 21#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
22#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) 22#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
23 23
24/* 24/*
25 * flags for bio submission. The high bits indicate the compression 25 * flags for bio submission. The high bits indicate the compression
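
Restating the masks as (1U << n) matches the state fields narrowing from unsigned long to unsigned elsewhere in this patch; an explicitly unsigned 32-bit mask also cannot sign-extend if a future bit lands on position 31. A short illustration of the pitfall being ruled out, assuming a typical LP64 target:

#include <stdio.h>

int main(void)
{
        /* An int carrying bit 31 sign-extends when widened
         * (the int conversion is implementation-defined, but wraps
         * on the usual targets)... */
        long bad  = (long)(int)(1U << 31);
        /* ...an unsigned 1U << 31 widens with zero fill. */
        long good = (long)(1U << 31);

        printf("%lx\n%lx\n", (unsigned long)bad, (unsigned long)good);
        /* ffffffff80000000 vs 80000000 */
        return 0;
}
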
@@ -81,9 +81,9 @@ struct extent_io_ops {
81 int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end, 81 int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
82 struct extent_state *state, int uptodate); 82 struct extent_state *state, int uptodate);
83 void (*set_bit_hook)(struct inode *inode, struct extent_state *state, 83 void (*set_bit_hook)(struct inode *inode, struct extent_state *state,
84 unsigned long *bits); 84 unsigned *bits);
85 void (*clear_bit_hook)(struct inode *inode, struct extent_state *state, 85 void (*clear_bit_hook)(struct inode *inode, struct extent_state *state,
86 unsigned long *bits); 86 unsigned *bits);
87 void (*merge_extent_hook)(struct inode *inode, 87 void (*merge_extent_hook)(struct inode *inode,
88 struct extent_state *new, 88 struct extent_state *new,
89 struct extent_state *other); 89 struct extent_state *other);
@@ -108,7 +108,7 @@ struct extent_state {
108 /* ADD NEW ELEMENTS AFTER THIS */ 108 /* ADD NEW ELEMENTS AFTER THIS */
109 wait_queue_head_t wq; 109 wait_queue_head_t wq;
110 atomic_t refs; 110 atomic_t refs;
111 unsigned long state; 111 unsigned state;
112 112
113 /* for use by the FS */ 113 /* for use by the FS */
114 u64 private; 114 u64 private;
@@ -188,7 +188,7 @@ int try_release_extent_mapping(struct extent_map_tree *map,
188int try_release_extent_buffer(struct page *page); 188int try_release_extent_buffer(struct page *page);
189int lock_extent(struct extent_io_tree *tree, u64 start, u64 end); 189int lock_extent(struct extent_io_tree *tree, u64 start, u64 end);
190int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 190int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
191 unsigned long bits, struct extent_state **cached); 191 unsigned bits, struct extent_state **cached);
192int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end); 192int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end);
193int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end, 193int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
194 struct extent_state **cached, gfp_t mask); 194 struct extent_state **cached, gfp_t mask);
@@ -202,21 +202,21 @@ void extent_io_exit(void);
202 202
203u64 count_range_bits(struct extent_io_tree *tree, 203u64 count_range_bits(struct extent_io_tree *tree,
204 u64 *start, u64 search_end, 204 u64 *start, u64 search_end,
205 u64 max_bytes, unsigned long bits, int contig); 205 u64 max_bytes, unsigned bits, int contig);
206 206
207void free_extent_state(struct extent_state *state); 207void free_extent_state(struct extent_state *state);
208int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, 208int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
209 unsigned long bits, int filled, 209 unsigned bits, int filled,
210 struct extent_state *cached_state); 210 struct extent_state *cached_state);
211int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 211int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
212 unsigned long bits, gfp_t mask); 212 unsigned bits, gfp_t mask);
213int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 213int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
214 unsigned long bits, int wake, int delete, 214 unsigned bits, int wake, int delete,
215 struct extent_state **cached, gfp_t mask); 215 struct extent_state **cached, gfp_t mask);
216int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, 216int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
217 unsigned long bits, gfp_t mask); 217 unsigned bits, gfp_t mask);
218int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 218int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
219 unsigned long bits, u64 *failed_start, 219 unsigned bits, u64 *failed_start,
220 struct extent_state **cached_state, gfp_t mask); 220 struct extent_state **cached_state, gfp_t mask);
221int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, 221int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
222 struct extent_state **cached_state, gfp_t mask); 222 struct extent_state **cached_state, gfp_t mask);
@@ -229,14 +229,14 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
229int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, 229int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
230 gfp_t mask); 230 gfp_t mask);
231int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 231int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
232 unsigned long bits, unsigned long clear_bits, 232 unsigned bits, unsigned clear_bits,
233 struct extent_state **cached_state, gfp_t mask); 233 struct extent_state **cached_state, gfp_t mask);
234int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, 234int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
235 struct extent_state **cached_state, gfp_t mask); 235 struct extent_state **cached_state, gfp_t mask);
236int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end, 236int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end,
237 struct extent_state **cached_state, gfp_t mask); 237 struct extent_state **cached_state, gfp_t mask);
238int find_first_extent_bit(struct extent_io_tree *tree, u64 start, 238int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
239 u64 *start_ret, u64 *end_ret, unsigned long bits, 239 u64 *start_ret, u64 *end_ret, unsigned bits,
240 struct extent_state **cached_state); 240 struct extent_state **cached_state);
241int extent_invalidatepage(struct extent_io_tree *tree, 241int extent_invalidatepage(struct extent_io_tree *tree,
242 struct page *page, unsigned long offset); 242 struct page *page, unsigned long offset);
@@ -262,8 +262,9 @@ int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private);
262void set_page_extent_mapped(struct page *page); 262void set_page_extent_mapped(struct page *page);
263 263
264struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, 264struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
265 u64 start, unsigned long len); 265 u64 start);
266struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len); 266struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
267 u64 start);
267struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src); 268struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src);
268struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, 269struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
269 u64 start); 270 u64 start);
@@ -322,7 +323,7 @@ int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end);
322int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end); 323int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end);
323int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, 324int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
324 struct page *locked_page, 325 struct page *locked_page,
325 unsigned long bits_to_clear, 326 unsigned bits_to_clear,
326 unsigned long page_ops); 327 unsigned long page_ops);
327struct bio * 328struct bio *
328btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, 329btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
@@ -377,5 +378,5 @@ noinline u64 find_lock_delalloc_range(struct inode *inode,
377 u64 *end, u64 max_bytes); 378 u64 *end, u64 max_bytes);
378#endif 379#endif
379struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, 380struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info,
380 u64 start, unsigned long len); 381 u64 start);
381#endif 382#endif
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index e4090259569b..b78bbbac900d 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1746,7 +1746,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
1746 1746
1747 mutex_lock(&inode->i_mutex); 1747 mutex_lock(&inode->i_mutex);
1748 1748
1749 current->backing_dev_info = inode->i_mapping->backing_dev_info; 1749 current->backing_dev_info = inode_to_bdi(inode);
1750 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); 1750 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
1751 if (err) { 1751 if (err) {
1752 mutex_unlock(&inode->i_mutex); 1752 mutex_unlock(&inode->i_mutex);
@@ -2081,7 +2081,6 @@ static const struct vm_operations_struct btrfs_file_vm_ops = {
2081 .fault = filemap_fault, 2081 .fault = filemap_fault,
2082 .map_pages = filemap_map_pages, 2082 .map_pages = filemap_map_pages,
2083 .page_mkwrite = btrfs_page_mkwrite, 2083 .page_mkwrite = btrfs_page_mkwrite,
2084 .remap_pages = generic_file_remap_pages,
2085}; 2084};
2086 2085
2087static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) 2086static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index d6c03f7f136b..a71978578fa7 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -651,15 +651,13 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
651 struct io_ctl io_ctl; 651 struct io_ctl io_ctl;
652 struct btrfs_key key; 652 struct btrfs_key key;
653 struct btrfs_free_space *e, *n; 653 struct btrfs_free_space *e, *n;
654 struct list_head bitmaps; 654 LIST_HEAD(bitmaps);
655 u64 num_entries; 655 u64 num_entries;
656 u64 num_bitmaps; 656 u64 num_bitmaps;
657 u64 generation; 657 u64 generation;
658 u8 type; 658 u8 type;
659 int ret = 0; 659 int ret = 0;
660 660
661 INIT_LIST_HEAD(&bitmaps);
662
663 /* Nothing in the space cache, goodbye */ 661 /* Nothing in the space cache, goodbye */
664 if (!i_size_read(inode)) 662 if (!i_size_read(inode))
665 return 0; 663 return 0;
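
LIST_HEAD(bitmaps) initializes the list at its definition, so the runtime INIT_LIST_HEAD() call removed above was redundant (as, presumably, is the one dropped from btrfs_find_space_cluster() below, where the declaration already used LIST_HEAD). The two forms side by side, with the kernel macros re-created for a standalone build:

#include <stdio.h>

struct list_head { struct list_head *prev, *next; };

/* Kernel-style: initialized at the point of definition... */
#define LIST_HEAD_INIT(name) { &(name), &(name) }
#define LIST_HEAD(name) struct list_head name = LIST_HEAD_INIT(name)

/* ...versus runtime initialization. */
static void INIT_LIST_HEAD(struct list_head *h) { h->prev = h->next = h; }

int main(void)
{
        LIST_HEAD(a);                   /* usable immediately */
        struct list_head b;
        INIT_LIST_HEAD(&b);             /* needs the explicit call */
        printf("%d %d\n", a.next == &a, b.next == &b);  /* 1 1 */
        return 0;
}
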
@@ -1243,6 +1241,7 @@ int btrfs_write_out_cache(struct btrfs_root *root,
1243 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; 1241 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
1244 struct inode *inode; 1242 struct inode *inode;
1245 int ret = 0; 1243 int ret = 0;
1244 enum btrfs_disk_cache_state dcs = BTRFS_DC_WRITTEN;
1246 1245
1247 root = root->fs_info->tree_root; 1246 root = root->fs_info->tree_root;
1248 1247
@@ -1266,9 +1265,7 @@ int btrfs_write_out_cache(struct btrfs_root *root,
1266 ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans, 1265 ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans,
1267 path, block_group->key.objectid); 1266 path, block_group->key.objectid);
1268 if (ret) { 1267 if (ret) {
1269 spin_lock(&block_group->lock); 1268 dcs = BTRFS_DC_ERROR;
1270 block_group->disk_cache_state = BTRFS_DC_ERROR;
1271 spin_unlock(&block_group->lock);
1272 ret = 0; 1269 ret = 0;
1273#ifdef DEBUG 1270#ifdef DEBUG
1274 btrfs_err(root->fs_info, 1271 btrfs_err(root->fs_info,
@@ -1277,6 +1274,9 @@ int btrfs_write_out_cache(struct btrfs_root *root,
1277#endif 1274#endif
1278 } 1275 }
1279 1276
1277 spin_lock(&block_group->lock);
1278 block_group->disk_cache_state = dcs;
1279 spin_unlock(&block_group->lock);
1280 iput(inode); 1280 iput(inode);
1281 return ret; 1281 return ret;
1282} 1282}
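
The rewrite computes the final cache state in a local (dcs starts as BTRFS_DC_WRITTEN and flips to BTRFS_DC_ERROR on failure) and publishes it under block_group->lock exactly once on the way out, instead of locking only on the error path. The publish-once shape in miniature:

#include <pthread.h>
#include <stdio.h>

enum disk_cache_state { BTRFS_DC_WRITTEN, BTRFS_DC_ERROR };

static pthread_mutex_t bg_lock = PTHREAD_MUTEX_INITIALIZER;
static enum disk_cache_state disk_cache_state;

static int write_out_cache(int io_fails)
{
        enum disk_cache_state dcs = BTRFS_DC_WRITTEN;   /* assume success */

        if (io_fails)
                dcs = BTRFS_DC_ERROR;   /* record, don't publish yet */

        pthread_mutex_lock(&bg_lock);   /* single publish point */
        disk_cache_state = dcs;
        pthread_mutex_unlock(&bg_lock);
        return 0;
}

int main(void)
{
        write_out_cache(1);
        printf("state=%d\n", disk_cache_state); /* 1 == BTRFS_DC_ERROR */
        return 0;
}
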
@@ -2903,7 +2903,6 @@ int btrfs_find_space_cluster(struct btrfs_root *root,
2903 trace_btrfs_find_cluster(block_group, offset, bytes, empty_size, 2903 trace_btrfs_find_cluster(block_group, offset, bytes, empty_size,
2904 min_bytes); 2904 min_bytes);
2905 2905
2906 INIT_LIST_HEAD(&bitmaps);
2907 ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset, 2906 ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset,
2908 bytes + empty_size, 2907 bytes + empty_size,
2909 cont1_bytes, min_bytes); 2908 cont1_bytes, min_bytes);
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 8ffa4783cbf4..265e03c73f4d 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -344,6 +344,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
344 return -ENOMEM; 344 return -ENOMEM;
345 345
346 path->leave_spinning = 1; 346 path->leave_spinning = 1;
347 path->skip_release_on_error = 1;
347 ret = btrfs_insert_empty_item(trans, root, path, &key, 348 ret = btrfs_insert_empty_item(trans, root, path, &key,
348 ins_len); 349 ins_len);
349 if (ret == -EEXIST) { 350 if (ret == -EEXIST) {
@@ -362,8 +363,12 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
362 ptr = (unsigned long)(ref + 1); 363 ptr = (unsigned long)(ref + 1);
363 ret = 0; 364 ret = 0;
364 } else if (ret < 0) { 365 } else if (ret < 0) {
365 if (ret == -EOVERFLOW) 366 if (ret == -EOVERFLOW) {
366 ret = -EMLINK; 367 if (find_name_in_backref(path, name, name_len, &ref))
368 ret = -EEXIST;
369 else
370 ret = -EMLINK;
371 }
367 goto out; 372 goto out;
368 } else { 373 } else {
369 ref = btrfs_item_ptr(path->nodes[0], path->slots[0], 374 ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
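
With path->skip_release_on_error set, a failed insert leaves the path positioned, so the -EOVERFLOW case can ask whether the ref being inserted already exists: if find_name_in_backref() finds the name, the caller gets -EEXIST (the link is already there) instead of -EMLINK (no room for another). The error mapping distilled, with the backref lookup stubbed out:

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

/* Stand-in for find_name_in_backref(): is this name already present
 * in the inode ref item? (hypothetical fixed answer here) */
static bool name_already_in_backref(void) { return true; }

/* Error mapping from the btrfs_insert_inode_ref() hunk. */
static int map_insert_error(int ret)
{
        if (ret == -EOVERFLOW)
                ret = name_already_in_backref() ? -EEXIST : -EMLINK;
        return ret;
}

int main(void)
{
        printf("%d %d\n", map_insert_error(-EOVERFLOW),
               map_insert_error(-ENOSPC));      /* -EEXIST, -ENOSPC */
        return 0;
}
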
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index e687bb0dc73a..a85c23dfcddb 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1530,10 +1530,45 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1530static void btrfs_split_extent_hook(struct inode *inode, 1530static void btrfs_split_extent_hook(struct inode *inode,
1531 struct extent_state *orig, u64 split) 1531 struct extent_state *orig, u64 split)
1532{ 1532{
1533 u64 size;
1534
1533 /* not delalloc, ignore it */ 1535 /* not delalloc, ignore it */
1534 if (!(orig->state & EXTENT_DELALLOC)) 1536 if (!(orig->state & EXTENT_DELALLOC))
1535 return; 1537 return;
1536 1538
1539 size = orig->end - orig->start + 1;
1540 if (size > BTRFS_MAX_EXTENT_SIZE) {
1541 u64 num_extents;
1542 u64 new_size;
1543
1544 /*
1545 * We need the largest size of the remaining extent to see if we
1546 * need to add a new outstanding extent. Think of the following
1547 * case
1548 *
1549 * [MAX_EXTENT_SIZE x 2 - 4k][4k]
1550 *
1551 * The new_size would just be 4k and we'd think we had enough
1552 * outstanding extents for this if we only took one side of the
1553 * split, same goes for the other direction. We need to see if
1554 * the larger size still amounts to the same number of extents as the
1555 * original size, because if it is we need to add a new
1556 * outstanding extent. But if we split up and the larger size
1557 * is less than the original then we are good to go since we've
1558 * already accounted for the extra extent in our original
1559 * accounting.
1560 */
1561 new_size = orig->end - split + 1;
1562 if ((split - orig->start) > new_size)
1563 new_size = split - orig->start;
1564
1565 num_extents = div64_u64(size + BTRFS_MAX_EXTENT_SIZE - 1,
1566 BTRFS_MAX_EXTENT_SIZE);
1567 if (div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1,
1568 BTRFS_MAX_EXTENT_SIZE) < num_extents)
1569 return;
1570 }
1571
1537 spin_lock(&BTRFS_I(inode)->lock); 1572 spin_lock(&BTRFS_I(inode)->lock);
1538 BTRFS_I(inode)->outstanding_extents++; 1573 BTRFS_I(inode)->outstanding_extents++;
1539 spin_unlock(&BTRFS_I(inode)->lock); 1574 spin_unlock(&BTRFS_I(inode)->lock);
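
Plugging the comment's own example into the math: an extent of size 2 * BTRFS_MAX_EXTENT_SIZE split 4k from one end leaves a largest piece that still rounds up to two extents, the same as the whole, so the small piece costs one additional outstanding extent. Checked numerically under the 128 MiB assumption:

#include <stdio.h>
#include <stdint.h>

#define MAX_EXTENT (128ULL * 1024 * 1024)  /* BTRFS_MAX_EXTENT_SIZE, assumed */

static uint64_t extents(uint64_t size)
{
        return (size + MAX_EXTENT - 1) / MAX_EXTENT;
}

int main(void)
{
        uint64_t whole   = 2 * MAX_EXTENT;      /* original delalloc extent */
        uint64_t largest = whole - 4096;        /* bigger piece after split */

        printf("whole: %llu extents\n", (unsigned long long)extents(whole));
        printf("largest piece: %llu extents\n",
               (unsigned long long)extents(largest));
        /* Both print 2: the larger piece alone needs as many extents as
         * the whole did, so the split must add one outstanding extent. */
        return 0;
}
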
@@ -1549,10 +1584,34 @@ static void btrfs_merge_extent_hook(struct inode *inode,
1549 struct extent_state *new, 1584 struct extent_state *new,
1550 struct extent_state *other) 1585 struct extent_state *other)
1551{ 1586{
1587 u64 new_size, old_size;
1588 u64 num_extents;
1589
1552 /* not delalloc, ignore it */ 1590 /* not delalloc, ignore it */
1553 if (!(other->state & EXTENT_DELALLOC)) 1591 if (!(other->state & EXTENT_DELALLOC))
1554 return; 1592 return;
1555 1593
1594 old_size = other->end - other->start + 1;
1595 new_size = old_size + (new->end - new->start + 1);
1596
1597 /* we're not bigger than the max, unreserve the space and go */
1598 if (new_size <= BTRFS_MAX_EXTENT_SIZE) {
1599 spin_lock(&BTRFS_I(inode)->lock);
1600 BTRFS_I(inode)->outstanding_extents--;
1601 spin_unlock(&BTRFS_I(inode)->lock);
1602 return;
1603 }
1604
1605 /*
1606 * If we grew by another max_extent, just return, we want to keep that
1607 * reserved amount.
1608 */
1609 num_extents = div64_u64(old_size + BTRFS_MAX_EXTENT_SIZE - 1,
1610 BTRFS_MAX_EXTENT_SIZE);
1611 if (div64_u64(new_size + BTRFS_MAX_EXTENT_SIZE - 1,
1612 BTRFS_MAX_EXTENT_SIZE) > num_extents)
1613 return;
1614
1556 spin_lock(&BTRFS_I(inode)->lock); 1615 spin_lock(&BTRFS_I(inode)->lock);
1557 BTRFS_I(inode)->outstanding_extents--; 1616 BTRFS_I(inode)->outstanding_extents--;
1558 spin_unlock(&BTRFS_I(inode)->lock); 1617 spin_unlock(&BTRFS_I(inode)->lock);
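
The merge hook is the mirror image: merging frees one reservation when the combined range fits in a single BTRFS_MAX_EXTENT_SIZE, or when the merged size does not round up to more extents than the existing piece already carried; otherwise the extra reservation is kept. The decision as a predicate, under the same 128 MiB assumption:

#include <stdio.h>
#include <stdint.h>

#define MAX_EXTENT (128ULL * 1024 * 1024)  /* BTRFS_MAX_EXTENT_SIZE, assumed */

static uint64_t extents(uint64_t size)
{
        return (size + MAX_EXTENT - 1) / MAX_EXTENT;
}

/* Logic of btrfs_merge_extent_hook(): does merging `added` bytes into
 * an existing `old_size` delalloc extent free one reservation? */
static int merge_frees_one(uint64_t old_size, uint64_t added)
{
        uint64_t new_size = old_size + added;

        if (new_size <= MAX_EXTENT)
                return 1;               /* still a single extent */
        return extents(new_size) <= extents(old_size);
}

int main(void)
{
        printf("%d\n", merge_frees_one(4096, 4096));             /* 1 */
        printf("%d\n", merge_frees_one(MAX_EXTENT, MAX_EXTENT)); /* 0 */
        return 0;
}
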
@@ -1604,7 +1663,7 @@ static void btrfs_del_delalloc_inode(struct btrfs_root *root,
1604 * have pending delalloc work to be done. 1663 * have pending delalloc work to be done.
1605 */ 1664 */
1606static void btrfs_set_bit_hook(struct inode *inode, 1665static void btrfs_set_bit_hook(struct inode *inode,
1607 struct extent_state *state, unsigned long *bits) 1666 struct extent_state *state, unsigned *bits)
1608{ 1667{
1609 1668
1610 if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC)) 1669 if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC))
@@ -1645,9 +1704,11 @@ static void btrfs_set_bit_hook(struct inode *inode,
1645 */ 1704 */
1646static void btrfs_clear_bit_hook(struct inode *inode, 1705static void btrfs_clear_bit_hook(struct inode *inode,
1647 struct extent_state *state, 1706 struct extent_state *state,
1648 unsigned long *bits) 1707 unsigned *bits)
1649{ 1708{
1650 u64 len = state->end + 1 - state->start; 1709 u64 len = state->end + 1 - state->start;
1710 u64 num_extents = div64_u64(len + BTRFS_MAX_EXTENT_SIZE - 1,
1711 BTRFS_MAX_EXTENT_SIZE);
1651 1712
1652 spin_lock(&BTRFS_I(inode)->lock); 1713 spin_lock(&BTRFS_I(inode)->lock);
1653 if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG)) 1714 if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG))
@@ -1667,7 +1728,7 @@ static void btrfs_clear_bit_hook(struct inode *inode,
1667 *bits &= ~EXTENT_FIRST_DELALLOC; 1728 *bits &= ~EXTENT_FIRST_DELALLOC;
1668 } else if (!(*bits & EXTENT_DO_ACCOUNTING)) { 1729 } else if (!(*bits & EXTENT_DO_ACCOUNTING)) {
1669 spin_lock(&BTRFS_I(inode)->lock); 1730 spin_lock(&BTRFS_I(inode)->lock);
1670 BTRFS_I(inode)->outstanding_extents--; 1731 BTRFS_I(inode)->outstanding_extents -= num_extents;
1671 spin_unlock(&BTRFS_I(inode)->lock); 1732 spin_unlock(&BTRFS_I(inode)->lock);
1672 } 1733 }
1673 1734
@@ -2945,7 +3006,7 @@ static int __readpage_endio_check(struct inode *inode,
2945 return 0; 3006 return 0;
2946zeroit: 3007zeroit:
2947 if (__ratelimit(&_rs)) 3008 if (__ratelimit(&_rs))
2948 btrfs_info(BTRFS_I(inode)->root->fs_info, 3009 btrfs_warn(BTRFS_I(inode)->root->fs_info,
2949 "csum failed ino %llu off %llu csum %u expected csum %u", 3010 "csum failed ino %llu off %llu csum %u expected csum %u",
2950 btrfs_ino(inode), start, csum, csum_expected); 3011 btrfs_ino(inode), start, csum, csum_expected);
2951 memset(kaddr + pgoff, 1, len); 3012 memset(kaddr + pgoff, 1, len);
@@ -3407,7 +3468,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
3407 3468
3408out: 3469out:
3409 if (ret) 3470 if (ret)
3410 btrfs_crit(root->fs_info, 3471 btrfs_err(root->fs_info,
3411 "could not do orphan cleanup %d", ret); 3472 "could not do orphan cleanup %d", ret);
3412 btrfs_free_path(path); 3473 btrfs_free_path(path);
3413 return ret; 3474 return ret;
@@ -3490,7 +3551,6 @@ static void btrfs_read_locked_inode(struct inode *inode)
3490 struct btrfs_path *path; 3551 struct btrfs_path *path;
3491 struct extent_buffer *leaf; 3552 struct extent_buffer *leaf;
3492 struct btrfs_inode_item *inode_item; 3553 struct btrfs_inode_item *inode_item;
3493 struct btrfs_timespec *tspec;
3494 struct btrfs_root *root = BTRFS_I(inode)->root; 3554 struct btrfs_root *root = BTRFS_I(inode)->root;
3495 struct btrfs_key location; 3555 struct btrfs_key location;
3496 unsigned long ptr; 3556 unsigned long ptr;
@@ -3527,17 +3587,19 @@ static void btrfs_read_locked_inode(struct inode *inode)
3527 i_gid_write(inode, btrfs_inode_gid(leaf, inode_item)); 3587 i_gid_write(inode, btrfs_inode_gid(leaf, inode_item));
3528 btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item)); 3588 btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item));
3529 3589
3530 tspec = btrfs_inode_atime(inode_item); 3590 inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->atime);
3531 inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, tspec); 3591 inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->atime);
3532 inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, tspec); 3592
3593 inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->mtime);
3594 inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->mtime);
3533 3595
3534 tspec = btrfs_inode_mtime(inode_item); 3596 inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, &inode_item->ctime);
3535 inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, tspec); 3597 inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, &inode_item->ctime);
3536 inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, tspec);
3537 3598
3538 tspec = btrfs_inode_ctime(inode_item); 3599 BTRFS_I(inode)->i_otime.tv_sec =
3539 inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, tspec); 3600 btrfs_timespec_sec(leaf, &inode_item->otime);
3540 inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, tspec); 3601 BTRFS_I(inode)->i_otime.tv_nsec =
3602 btrfs_timespec_nsec(leaf, &inode_item->otime);
3541 3603
3542 inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item)); 3604 inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item));
3543 BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item); 3605 BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item);
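
The timestamp rework reads each on-disk pair directly through the item's embedded fields and additionally loads the new otime (creation time). A rough userspace sketch of decoding such a little-endian sec/nsec pair, assuming the btrfs_timespec layout of a 64-bit seconds field followed by a 32-bit nanoseconds field (the struct and helper names here are illustrative, not the kernel's):

    #include <stdio.h>
    #include <stdint.h>
    #include <string.h>

    /* illustrative mirror of the on-disk layout: __le64 sec, __le32 nsec */
    struct disk_timespec {
            uint8_t sec[8];
            uint8_t nsec[4];
    };

    static uint64_t le64_to_host(const uint8_t *p)
    {
            uint64_t v = 0;
            for (int i = 7; i >= 0; i--)
                    v = (v << 8) | p[i];
            return v;
    }

    static uint32_t le32_to_host(const uint8_t *p)
    {
            return (uint32_t)p[0] | ((uint32_t)p[1] << 8) |
                   ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
    }

    int main(void)
    {
            struct disk_timespec otime;

            memset(&otime, 0, sizeof(otime));
            otime.sec[0] = 0x40;    /* 64 seconds, little endian */
            otime.nsec[0] = 0x07;   /* 7 nanoseconds */

            printf("otime: %llu.%09u\n",
                   (unsigned long long)le64_to_host(otime.sec),
                   le32_to_host(otime.nsec));
            return 0;
    }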
@@ -3608,7 +3670,6 @@ cache_acl:
3608 switch (inode->i_mode & S_IFMT) { 3670 switch (inode->i_mode & S_IFMT) {
3609 case S_IFREG: 3671 case S_IFREG:
3610 inode->i_mapping->a_ops = &btrfs_aops; 3672 inode->i_mapping->a_ops = &btrfs_aops;
3611 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
3612 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 3673 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
3613 inode->i_fop = &btrfs_file_operations; 3674 inode->i_fop = &btrfs_file_operations;
3614 inode->i_op = &btrfs_file_inode_operations; 3675 inode->i_op = &btrfs_file_inode_operations;
@@ -3623,7 +3684,6 @@ cache_acl:
3623 case S_IFLNK: 3684 case S_IFLNK:
3624 inode->i_op = &btrfs_symlink_inode_operations; 3685 inode->i_op = &btrfs_symlink_inode_operations;
3625 inode->i_mapping->a_ops = &btrfs_symlink_aops; 3686 inode->i_mapping->a_ops = &btrfs_symlink_aops;
3626 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
3627 break; 3687 break;
3628 default: 3688 default:
3629 inode->i_op = &btrfs_special_inode_operations; 3689 inode->i_op = &btrfs_special_inode_operations;
@@ -3658,21 +3718,26 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
3658 btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token); 3718 btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
3659 btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token); 3719 btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
3660 3720
3661 btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item), 3721 btrfs_set_token_timespec_sec(leaf, &item->atime,
3662 inode->i_atime.tv_sec, &token); 3722 inode->i_atime.tv_sec, &token);
3663 btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item), 3723 btrfs_set_token_timespec_nsec(leaf, &item->atime,
3664 inode->i_atime.tv_nsec, &token); 3724 inode->i_atime.tv_nsec, &token);
3665 3725
3666 btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item), 3726 btrfs_set_token_timespec_sec(leaf, &item->mtime,
3667 inode->i_mtime.tv_sec, &token); 3727 inode->i_mtime.tv_sec, &token);
3668 btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item), 3728 btrfs_set_token_timespec_nsec(leaf, &item->mtime,
3669 inode->i_mtime.tv_nsec, &token); 3729 inode->i_mtime.tv_nsec, &token);
3670 3730
3671 btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item), 3731 btrfs_set_token_timespec_sec(leaf, &item->ctime,
3672 inode->i_ctime.tv_sec, &token); 3732 inode->i_ctime.tv_sec, &token);
3673 btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item), 3733 btrfs_set_token_timespec_nsec(leaf, &item->ctime,
3674 inode->i_ctime.tv_nsec, &token); 3734 inode->i_ctime.tv_nsec, &token);
3675 3735
3736 btrfs_set_token_timespec_sec(leaf, &item->otime,
3737 BTRFS_I(inode)->i_otime.tv_sec, &token);
3738 btrfs_set_token_timespec_nsec(leaf, &item->otime,
3739 BTRFS_I(inode)->i_otime.tv_nsec, &token);
3740
3676 btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode), 3741 btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
3677 &token); 3742 &token);
3678 btrfs_set_token_inode_generation(leaf, item, BTRFS_I(inode)->generation, 3743 btrfs_set_token_inode_generation(leaf, item, BTRFS_I(inode)->generation,
@@ -5009,6 +5074,7 @@ static int fixup_tree_root_location(struct btrfs_root *root,
5009 struct btrfs_root *new_root; 5074 struct btrfs_root *new_root;
5010 struct btrfs_root_ref *ref; 5075 struct btrfs_root_ref *ref;
5011 struct extent_buffer *leaf; 5076 struct extent_buffer *leaf;
5077 struct btrfs_key key;
5012 int ret; 5078 int ret;
5013 int err = 0; 5079 int err = 0;
5014 5080
@@ -5019,9 +5085,12 @@ static int fixup_tree_root_location(struct btrfs_root *root,
5019 } 5085 }
5020 5086
5021 err = -ENOENT; 5087 err = -ENOENT;
5022 ret = btrfs_find_item(root->fs_info->tree_root, path, 5088 key.objectid = BTRFS_I(dir)->root->root_key.objectid;
5023 BTRFS_I(dir)->root->root_key.objectid, 5089 key.type = BTRFS_ROOT_REF_KEY;
5024 location->objectid, BTRFS_ROOT_REF_KEY, NULL); 5090 key.offset = location->objectid;
5091
5092 ret = btrfs_search_slot(NULL, root->fs_info->tree_root, &key, path,
5093 0, 0);
5025 if (ret) { 5094 if (ret) {
5026 if (ret < 0) 5095 if (ret < 0)
5027 err = ret; 5096 err = ret;
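
The replacement spells the search key out instead of going through the btrfs_find_item() wrapper: a btrfs key is the (objectid, type, offset) triple, ordered field by field. A small sketch of that ordering, with stand-in types rather than the kernel's struct btrfs_key, and an illustrative value standing in for BTRFS_ROOT_REF_KEY:

    #include <stdio.h>
    #include <stdint.h>

    /* stand-in for struct btrfs_key: compared field by field, in this order */
    struct key {
            uint64_t objectid;
            uint8_t  type;
            uint64_t offset;
    };

    static int key_cmp(const struct key *a, const struct key *b)
    {
            if (a->objectid != b->objectid)
                    return a->objectid < b->objectid ? -1 : 1;
            if (a->type != b->type)
                    return a->type < b->type ? -1 : 1;
            if (a->offset != b->offset)
                    return a->offset < b->offset ? -1 : 1;
            return 0;
    }

    int main(void)
    {
            /* the hunk above builds: (subvol root id, ROOT_REF, target objectid) */
            struct key a = { 5, 156, 257 };     /* 156 stands in for BTRFS_ROOT_REF_KEY */
            struct key b = { 5, 156, 258 };

            printf("cmp = %d\n", key_cmp(&a, &b));  /* -1: a sorts first */
            return 0;
    }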
@@ -5260,7 +5329,10 @@ static struct inode *new_simple_dir(struct super_block *s,
5260 inode->i_op = &btrfs_dir_ro_inode_operations; 5329 inode->i_op = &btrfs_dir_ro_inode_operations;
5261 inode->i_fop = &simple_dir_operations; 5330 inode->i_fop = &simple_dir_operations;
5262 inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO; 5331 inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO;
5263 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 5332 inode->i_mtime = CURRENT_TIME;
5333 inode->i_atime = inode->i_mtime;
5334 inode->i_ctime = inode->i_mtime;
5335 BTRFS_I(inode)->i_otime = inode->i_mtime;
5264 5336
5265 return inode; 5337 return inode;
5266} 5338}
@@ -5828,7 +5900,12 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
5828 5900
5829 inode_init_owner(inode, dir, mode); 5901 inode_init_owner(inode, dir, mode);
5830 inode_set_bytes(inode, 0); 5902 inode_set_bytes(inode, 0);
5831 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 5903
5904 inode->i_mtime = CURRENT_TIME;
5905 inode->i_atime = inode->i_mtime;
5906 inode->i_ctime = inode->i_mtime;
5907 BTRFS_I(inode)->i_otime = inode->i_mtime;
5908
5832 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], 5909 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
5833 struct btrfs_inode_item); 5910 struct btrfs_inode_item);
5834 memset_extent_buffer(path->nodes[0], 0, (unsigned long)inode_item, 5911 memset_extent_buffer(path->nodes[0], 0, (unsigned long)inode_item,
@@ -6088,7 +6165,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
6088 inode->i_fop = &btrfs_file_operations; 6165 inode->i_fop = &btrfs_file_operations;
6089 inode->i_op = &btrfs_file_inode_operations; 6166 inode->i_op = &btrfs_file_inode_operations;
6090 inode->i_mapping->a_ops = &btrfs_aops; 6167 inode->i_mapping->a_ops = &btrfs_aops;
6091 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
6092 6168
6093 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); 6169 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
6094 if (err) 6170 if (err)
@@ -6255,8 +6331,10 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
6255 6331
6256out_fail: 6332out_fail:
6257 btrfs_end_transaction(trans, root); 6333 btrfs_end_transaction(trans, root);
6258 if (drop_on_err) 6334 if (drop_on_err) {
6335 inode_dec_link_count(inode);
6259 iput(inode); 6336 iput(inode);
6337 }
6260 btrfs_balance_delayed_items(root); 6338 btrfs_balance_delayed_items(root);
6261 btrfs_btree_balance_dirty(root); 6339 btrfs_btree_balance_dirty(root);
6262 return err; 6340 return err;
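
The mkdir fix pairs the final iput() with an inode_dec_link_count(), so a failed mkdir does not evict the never-linked inode while it still carries a stale link count. A toy sketch of why the pairing matters, with hypothetical stand-ins for the inode and its helpers:

    #include <stdio.h>
    #include <stdlib.h>

    struct toy_inode {
            int i_nlink;    /* directory entries pointing at the inode */
            int i_count;    /* in-memory references */
    };

    static void toy_iput(struct toy_inode *inode)
    {
            if (--inode->i_count == 0) {
                    /* eviction: a never-linked inode must be back at nlink 0 */
                    if (inode->i_nlink != 0)
                            fprintf(stderr, "evicting inode with nlink %d\n",
                                    inode->i_nlink);
                    free(inode);
            }
    }

    int main(void)
    {
            struct toy_inode *inode = calloc(1, sizeof(*inode));

            if (!inode)
                    return 1;
            inode->i_count = 1;
            inode->i_nlink = 1;     /* inode creation starts the link count */

            /* mkdir failed before a directory entry was created, so undo the
             * provisional link before dropping the last reference */
            inode->i_nlink--;
            toy_iput(inode);
            return 0;
    }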
@@ -7135,11 +7213,12 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
7135 u64 start = iblock << inode->i_blkbits; 7213 u64 start = iblock << inode->i_blkbits;
7136 u64 lockstart, lockend; 7214 u64 lockstart, lockend;
7137 u64 len = bh_result->b_size; 7215 u64 len = bh_result->b_size;
7216 u64 orig_len = len;
7138 int unlock_bits = EXTENT_LOCKED; 7217 int unlock_bits = EXTENT_LOCKED;
7139 int ret = 0; 7218 int ret = 0;
7140 7219
7141 if (create) 7220 if (create)
7142 unlock_bits |= EXTENT_DELALLOC | EXTENT_DIRTY; 7221 unlock_bits |= EXTENT_DIRTY;
7143 else 7222 else
7144 len = min_t(u64, len, root->sectorsize); 7223 len = min_t(u64, len, root->sectorsize);
7145 7224
@@ -7270,14 +7349,12 @@ unlock:
7270 if (start + len > i_size_read(inode)) 7349 if (start + len > i_size_read(inode))
7271 i_size_write(inode, start + len); 7350 i_size_write(inode, start + len);
7272 7351
7273 spin_lock(&BTRFS_I(inode)->lock); 7352 if (len < orig_len) {
7274 BTRFS_I(inode)->outstanding_extents++; 7353 spin_lock(&BTRFS_I(inode)->lock);
7275 spin_unlock(&BTRFS_I(inode)->lock); 7354 BTRFS_I(inode)->outstanding_extents++;
7276 7355 spin_unlock(&BTRFS_I(inode)->lock);
7277 ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, 7356 }
7278 lockstart + len - 1, EXTENT_DELALLOC, NULL, 7357 btrfs_free_reserved_data_space(inode, len);
7279 &cached_state, GFP_NOFS);
7280 BUG_ON(ret);
7281 } 7358 }
7282 7359
7283 /* 7360 /*
@@ -7806,8 +7883,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
7806 } 7883 }
7807 7884
7808 /* async crcs make it difficult to collect full stripe writes. */ 7885 /* async crcs make it difficult to collect full stripe writes. */
7809 if (btrfs_get_alloc_profile(root, 1) & 7886 if (btrfs_get_alloc_profile(root, 1) & BTRFS_BLOCK_GROUP_RAID56_MASK)
7810 (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6))
7811 async_submit = 0; 7887 async_submit = 0;
7812 else 7888 else
7813 async_submit = 1; 7889 async_submit = 1;
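
The test above collapses the RAID5|RAID6 pair into the BTRFS_BLOCK_GROUP_RAID56_MASK helper, a pattern several later hunks reuse. A sketch of the idea with illustrative flag values (the real block-group flag bits differ):

    #include <stdio.h>

    /* illustrative flag bits, not the kernel's block group values */
    #define BG_RAID5        (1u << 7)
    #define BG_RAID6        (1u << 8)
    #define BG_RAID56_MASK  (BG_RAID5 | BG_RAID6)

    int main(void)
    {
            unsigned profile = BG_RAID6;

            /* one mask test replaces checking the two flags separately */
            if (profile & BG_RAID56_MASK)
                    printf("parity raid: avoid async checksum submission\n");
            else
                    printf("async checksum submission is fine\n");
            return 0;
    }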
@@ -8054,8 +8130,6 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
8054 else if (ret >= 0 && (size_t)ret < count) 8130 else if (ret >= 0 && (size_t)ret < count)
8055 btrfs_delalloc_release_space(inode, 8131 btrfs_delalloc_release_space(inode,
8056 count - (size_t)ret); 8132 count - (size_t)ret);
8057 else
8058 btrfs_delalloc_release_metadata(inode, 0);
8059 } 8133 }
8060out: 8134out:
8061 if (wakeup) 8135 if (wakeup)
@@ -8576,6 +8650,9 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
8576 8650
8577 ei->delayed_node = NULL; 8651 ei->delayed_node = NULL;
8578 8652
8653 ei->i_otime.tv_sec = 0;
8654 ei->i_otime.tv_nsec = 0;
8655
8579 inode = &ei->vfs_inode; 8656 inode = &ei->vfs_inode;
8580 extent_map_tree_init(&ei->extent_tree); 8657 extent_map_tree_init(&ei->extent_tree);
8581 extent_io_tree_init(&ei->io_tree, &inode->i_data); 8658 extent_io_tree_init(&ei->io_tree, &inode->i_data);
@@ -9201,7 +9278,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
9201 inode->i_fop = &btrfs_file_operations; 9278 inode->i_fop = &btrfs_file_operations;
9202 inode->i_op = &btrfs_file_inode_operations; 9279 inode->i_op = &btrfs_file_inode_operations;
9203 inode->i_mapping->a_ops = &btrfs_aops; 9280 inode->i_mapping->a_ops = &btrfs_aops;
9204 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
9205 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 9281 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
9206 9282
9207 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); 9283 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
@@ -9245,7 +9321,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
9245 9321
9246 inode->i_op = &btrfs_symlink_inode_operations; 9322 inode->i_op = &btrfs_symlink_inode_operations;
9247 inode->i_mapping->a_ops = &btrfs_symlink_aops; 9323 inode->i_mapping->a_ops = &btrfs_symlink_aops;
9248 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
9249 inode_set_bytes(inode, name_len); 9324 inode_set_bytes(inode, name_len);
9250 btrfs_i_size_write(inode, name_len); 9325 btrfs_i_size_write(inode, name_len);
9251 err = btrfs_update_inode(trans, root, inode); 9326 err = btrfs_update_inode(trans, root, inode);
@@ -9457,7 +9532,6 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
9457 inode->i_op = &btrfs_file_inode_operations; 9532 inode->i_op = &btrfs_file_inode_operations;
9458 9533
9459 inode->i_mapping->a_ops = &btrfs_aops; 9534 inode->i_mapping->a_ops = &btrfs_aops;
9460 inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
9461 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 9535 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
9462 9536
9463 ret = btrfs_init_inode_security(trans, inode, dir, NULL); 9537 ret = btrfs_init_inode_security(trans, inode, dir, NULL);
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 48b60dbf807f..97159a8e91d4 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1431,9 +1431,8 @@ static int qgroup_excl_accounting(struct btrfs_fs_info *fs_info,
1431 qgroup = u64_to_ptr(unode->aux); 1431 qgroup = u64_to_ptr(unode->aux);
1432 qgroup->rfer += sign * oper->num_bytes; 1432 qgroup->rfer += sign * oper->num_bytes;
1433 qgroup->rfer_cmpr += sign * oper->num_bytes; 1433 qgroup->rfer_cmpr += sign * oper->num_bytes;
1434 WARN_ON(sign < 0 && qgroup->excl < oper->num_bytes);
1434 qgroup->excl += sign * oper->num_bytes; 1435 qgroup->excl += sign * oper->num_bytes;
1435 if (sign < 0)
1436 WARN_ON(qgroup->excl < oper->num_bytes);
1437 qgroup->excl_cmpr += sign * oper->num_bytes; 1436 qgroup->excl_cmpr += sign * oper->num_bytes;
1438 qgroup_dirty(fs_info, qgroup); 1437 qgroup_dirty(fs_info, qgroup);
1439 1438
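
The qgroup change folds the two-line underflow check into a single WARN_ON placed before the update, so the invariant is tested against the value about to be decremented rather than the already-modified one. A standalone sketch of the difference, using assert() in place of WARN_ON:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    static uint64_t excl = 4096;

    static void account(int sign, uint64_t num_bytes)
    {
            /* check before the update, as the patched code does: a removal
             * must never take the exclusive count below zero */
            assert(!(sign < 0 && excl < num_bytes));
            excl += (int64_t)sign * (int64_t)num_bytes;
    }

    int main(void)
    {
            account(-1, 4096);      /* fine: excl drops exactly to 0 */
            printf("excl = %llu\n", (unsigned long long)excl);
            /* checking only after the update compares the wrapped-around
             * unsigned value and can miss the underflow it is meant to catch */
            return 0;
    }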
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 8ab2a17bbba8..5264858ed768 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -58,15 +58,6 @@
58 */ 58 */
59#define RBIO_CACHE_READY_BIT 3 59#define RBIO_CACHE_READY_BIT 3
60 60
61/*
62 * bbio and raid_map are managed by the caller, so we shouldn't free
63 * them here. Besides that, rbios with this flag set should not be
64 * cached, because we need raid_map to check whether two rbios cover
65 * the same stripe, but it is very likely that the caller has already
66 * freed the raid_map, so don't cache those rbios.
67 */
68#define RBIO_HOLD_BBIO_MAP_BIT 4
69
70#define RBIO_CACHE_SIZE 1024 61#define RBIO_CACHE_SIZE 1024
71 62
72enum btrfs_rbio_ops { 63enum btrfs_rbio_ops {
@@ -79,13 +70,6 @@ struct btrfs_raid_bio {
79 struct btrfs_fs_info *fs_info; 70 struct btrfs_fs_info *fs_info;
80 struct btrfs_bio *bbio; 71 struct btrfs_bio *bbio;
81 72
82 /*
83 * logical block numbers for the start of each stripe
84 * The last one or two are p/q. These are sorted,
85 * so raid_map[0] is the start of our full stripe
86 */
87 u64 *raid_map;
88
89 /* while we're doing rmw on a stripe 73 /* while we're doing rmw on a stripe
90 * we put it into a hash table so we can 74 * we put it into a hash table so we can
91 * lock the stripe and merge more rbios 75 * lock the stripe and merge more rbios
@@ -303,7 +287,7 @@ static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
303 */ 287 */
304static int rbio_bucket(struct btrfs_raid_bio *rbio) 288static int rbio_bucket(struct btrfs_raid_bio *rbio)
305{ 289{
306 u64 num = rbio->raid_map[0]; 290 u64 num = rbio->bbio->raid_map[0];
307 291
308 /* 292 /*
309 * we shift down quite a bit. We're using byte 293 * we shift down quite a bit. We're using byte
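
With raid_map now embedded in the btrfs_bio, the bucket is still derived from raid_map[0], the logical start of the full stripe. A sketch of bucketing on that start address, assuming a power-of-two table and a shift that drops the low, always-aligned offset bits (the real function uses a kernel hash helper and the kernel's table size):

    #include <stdio.h>
    #include <stdint.h>

    #define TABLE_BITS 8
    #define TABLE_SIZE (1u << TABLE_BITS)

    static unsigned bucket_for(uint64_t full_stripe_start)
    {
            /* stripe starts are large, aligned values; shift the low bits
             * away before folding into the table */
            return (unsigned)((full_stripe_start >> 16) & (TABLE_SIZE - 1));
    }

    int main(void)
    {
            uint64_t raid_map0 = 123ULL << 30;  /* hypothetical stripe start */

            printf("bucket %u of %u\n", bucket_for(raid_map0), TABLE_SIZE);
            return 0;
    }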
@@ -606,8 +590,8 @@ static int rbio_can_merge(struct btrfs_raid_bio *last,
606 test_bit(RBIO_CACHE_BIT, &cur->flags)) 590 test_bit(RBIO_CACHE_BIT, &cur->flags))
607 return 0; 591 return 0;
608 592
609 if (last->raid_map[0] != 593 if (last->bbio->raid_map[0] !=
610 cur->raid_map[0]) 594 cur->bbio->raid_map[0])
611 return 0; 595 return 0;
612 596
613 /* we can't merge with different operations */ 597 /* we can't merge with different operations */
@@ -689,7 +673,7 @@ static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
689 spin_lock_irqsave(&h->lock, flags); 673 spin_lock_irqsave(&h->lock, flags);
690 list_for_each_entry(cur, &h->hash_list, hash_list) { 674 list_for_each_entry(cur, &h->hash_list, hash_list) {
691 walk++; 675 walk++;
692 if (cur->raid_map[0] == rbio->raid_map[0]) { 676 if (cur->bbio->raid_map[0] == rbio->bbio->raid_map[0]) {
693 spin_lock(&cur->bio_list_lock); 677 spin_lock(&cur->bio_list_lock);
694 678
695 /* can we steal this cached rbio's pages? */ 679 /* can we steal this cached rbio's pages? */
@@ -841,21 +825,6 @@ done_nolock:
841 remove_rbio_from_cache(rbio); 825 remove_rbio_from_cache(rbio);
842} 826}
843 827
844static inline void
845__free_bbio_and_raid_map(struct btrfs_bio *bbio, u64 *raid_map, int need)
846{
847 if (need) {
848 kfree(raid_map);
849 kfree(bbio);
850 }
851}
852
853static inline void free_bbio_and_raid_map(struct btrfs_raid_bio *rbio)
854{
855 __free_bbio_and_raid_map(rbio->bbio, rbio->raid_map,
856 !test_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags));
857}
858
859static void __free_raid_bio(struct btrfs_raid_bio *rbio) 828static void __free_raid_bio(struct btrfs_raid_bio *rbio)
860{ 829{
861 int i; 830 int i;
@@ -875,8 +844,7 @@ static void __free_raid_bio(struct btrfs_raid_bio *rbio)
875 } 844 }
876 } 845 }
877 846
878 free_bbio_and_raid_map(rbio); 847 btrfs_put_bbio(rbio->bbio);
879
880 kfree(rbio); 848 kfree(rbio);
881} 849}
882 850
@@ -985,8 +953,7 @@ static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
985 * this does not allocate any pages for rbio->pages. 953 * this does not allocate any pages for rbio->pages.
986 */ 954 */
987static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root, 955static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
988 struct btrfs_bio *bbio, u64 *raid_map, 956 struct btrfs_bio *bbio, u64 stripe_len)
989 u64 stripe_len)
990{ 957{
991 struct btrfs_raid_bio *rbio; 958 struct btrfs_raid_bio *rbio;
992 int nr_data = 0; 959 int nr_data = 0;
@@ -1007,7 +974,6 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
1007 INIT_LIST_HEAD(&rbio->stripe_cache); 974 INIT_LIST_HEAD(&rbio->stripe_cache);
1008 INIT_LIST_HEAD(&rbio->hash_list); 975 INIT_LIST_HEAD(&rbio->hash_list);
1009 rbio->bbio = bbio; 976 rbio->bbio = bbio;
1010 rbio->raid_map = raid_map;
1011 rbio->fs_info = root->fs_info; 977 rbio->fs_info = root->fs_info;
1012 rbio->stripe_len = stripe_len; 978 rbio->stripe_len = stripe_len;
1013 rbio->nr_pages = num_pages; 979 rbio->nr_pages = num_pages;
@@ -1028,10 +994,12 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
1028 rbio->bio_pages = p + sizeof(struct page *) * num_pages; 994 rbio->bio_pages = p + sizeof(struct page *) * num_pages;
1029 rbio->dbitmap = p + sizeof(struct page *) * num_pages * 2; 995 rbio->dbitmap = p + sizeof(struct page *) * num_pages * 2;
1030 996
1031 if (raid_map[real_stripes - 1] == RAID6_Q_STRIPE) 997 if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
998 nr_data = real_stripes - 1;
999 else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6)
1032 nr_data = real_stripes - 2; 1000 nr_data = real_stripes - 2;
1033 else 1001 else
1034 nr_data = real_stripes - 1; 1002 BUG();
1035 1003
1036 rbio->nr_data = nr_data; 1004 rbio->nr_data = nr_data;
1037 return rbio; 1005 return rbio;
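
Dropping raid_map's Q-stripe sentinel means the data stripe count is now derived from the block group type: RAID5 reserves one parity stripe, RAID6 two. A compact sketch of that branch, again with illustrative flag values:

    #include <stdio.h>
    #include <stdlib.h>

    #define BG_RAID5 (1u << 7)  /* illustrative, not the kernel bits */
    #define BG_RAID6 (1u << 8)

    static int nr_data_stripes(unsigned map_type, int real_stripes)
    {
            if (map_type & BG_RAID5)
                    return real_stripes - 1;    /* P only */
            if (map_type & BG_RAID6)
                    return real_stripes - 2;    /* P and Q */
            abort();                            /* caller guarantees RAID56 */
    }

    int main(void)
    {
            printf("raid6, 6 stripes -> %d data\n", nr_data_stripes(BG_RAID6, 6));
            return 0;
    }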
@@ -1182,7 +1150,7 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio)
1182 spin_lock_irq(&rbio->bio_list_lock); 1150 spin_lock_irq(&rbio->bio_list_lock);
1183 bio_list_for_each(bio, &rbio->bio_list) { 1151 bio_list_for_each(bio, &rbio->bio_list) {
1184 start = (u64)bio->bi_iter.bi_sector << 9; 1152 start = (u64)bio->bi_iter.bi_sector << 9;
1185 stripe_offset = start - rbio->raid_map[0]; 1153 stripe_offset = start - rbio->bbio->raid_map[0];
1186 page_index = stripe_offset >> PAGE_CACHE_SHIFT; 1154 page_index = stripe_offset >> PAGE_CACHE_SHIFT;
1187 1155
1188 for (i = 0; i < bio->bi_vcnt; i++) { 1156 for (i = 0; i < bio->bi_vcnt; i++) {
@@ -1402,7 +1370,7 @@ static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
1402 logical <<= 9; 1370 logical <<= 9;
1403 1371
1404 for (i = 0; i < rbio->nr_data; i++) { 1372 for (i = 0; i < rbio->nr_data; i++) {
1405 stripe_start = rbio->raid_map[i]; 1373 stripe_start = rbio->bbio->raid_map[i];
1406 if (logical >= stripe_start && 1374 if (logical >= stripe_start &&
1407 logical < stripe_start + rbio->stripe_len) { 1375 logical < stripe_start + rbio->stripe_len) {
1408 return i; 1376 return i;
@@ -1776,17 +1744,16 @@ static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
1776 * our main entry point for writes from the rest of the FS. 1744 * our main entry point for writes from the rest of the FS.
1777 */ 1745 */
1778int raid56_parity_write(struct btrfs_root *root, struct bio *bio, 1746int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
1779 struct btrfs_bio *bbio, u64 *raid_map, 1747 struct btrfs_bio *bbio, u64 stripe_len)
1780 u64 stripe_len)
1781{ 1748{
1782 struct btrfs_raid_bio *rbio; 1749 struct btrfs_raid_bio *rbio;
1783 struct btrfs_plug_cb *plug = NULL; 1750 struct btrfs_plug_cb *plug = NULL;
1784 struct blk_plug_cb *cb; 1751 struct blk_plug_cb *cb;
1785 int ret; 1752 int ret;
1786 1753
1787 rbio = alloc_rbio(root, bbio, raid_map, stripe_len); 1754 rbio = alloc_rbio(root, bbio, stripe_len);
1788 if (IS_ERR(rbio)) { 1755 if (IS_ERR(rbio)) {
1789 __free_bbio_and_raid_map(bbio, raid_map, 1); 1756 btrfs_put_bbio(bbio);
1790 return PTR_ERR(rbio); 1757 return PTR_ERR(rbio);
1791 } 1758 }
1792 bio_list_add(&rbio->bio_list, bio); 1759 bio_list_add(&rbio->bio_list, bio);
@@ -1885,9 +1852,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
1885 } 1852 }
1886 1853
1887 /* all raid6 handling here */ 1854 /* all raid6 handling here */
1888 if (rbio->raid_map[rbio->real_stripes - 1] == 1855 if (rbio->bbio->map_type & BTRFS_BLOCK_GROUP_RAID6) {
1889 RAID6_Q_STRIPE) {
1890
1891 /* 1856 /*
1892 * single failure, rebuild from parity raid5 1857 * single failure, rebuild from parity raid5
1893 * style 1858 * style
@@ -1922,8 +1887,9 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
1922 * here due to a crc mismatch and we can't give them the 1887 * here due to a crc mismatch and we can't give them the
1923 * data they want 1888 * data they want
1924 */ 1889 */
1925 if (rbio->raid_map[failb] == RAID6_Q_STRIPE) { 1890 if (rbio->bbio->raid_map[failb] == RAID6_Q_STRIPE) {
1926 if (rbio->raid_map[faila] == RAID5_P_STRIPE) { 1891 if (rbio->bbio->raid_map[faila] ==
1892 RAID5_P_STRIPE) {
1927 err = -EIO; 1893 err = -EIO;
1928 goto cleanup; 1894 goto cleanup;
1929 } 1895 }
@@ -1934,7 +1900,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
1934 goto pstripe; 1900 goto pstripe;
1935 } 1901 }
1936 1902
1937 if (rbio->raid_map[failb] == RAID5_P_STRIPE) { 1903 if (rbio->bbio->raid_map[failb] == RAID5_P_STRIPE) {
1938 raid6_datap_recov(rbio->real_stripes, 1904 raid6_datap_recov(rbio->real_stripes,
1939 PAGE_SIZE, faila, pointers); 1905 PAGE_SIZE, faila, pointers);
1940 } else { 1906 } else {
@@ -2001,8 +1967,7 @@ cleanup:
2001 1967
2002cleanup_io: 1968cleanup_io:
2003 if (rbio->operation == BTRFS_RBIO_READ_REBUILD) { 1969 if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
2004 if (err == 0 && 1970 if (err == 0)
2005 !test_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags))
2006 cache_rbio_pages(rbio); 1971 cache_rbio_pages(rbio);
2007 else 1972 else
2008 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); 1973 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
@@ -2156,15 +2121,16 @@ cleanup:
2156 * of the drive. 2121 * of the drive.
2157 */ 2122 */
2158int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, 2123int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
2159 struct btrfs_bio *bbio, u64 *raid_map, 2124 struct btrfs_bio *bbio, u64 stripe_len,
2160 u64 stripe_len, int mirror_num, int generic_io) 2125 int mirror_num, int generic_io)
2161{ 2126{
2162 struct btrfs_raid_bio *rbio; 2127 struct btrfs_raid_bio *rbio;
2163 int ret; 2128 int ret;
2164 2129
2165 rbio = alloc_rbio(root, bbio, raid_map, stripe_len); 2130 rbio = alloc_rbio(root, bbio, stripe_len);
2166 if (IS_ERR(rbio)) { 2131 if (IS_ERR(rbio)) {
2167 __free_bbio_and_raid_map(bbio, raid_map, generic_io); 2132 if (generic_io)
2133 btrfs_put_bbio(bbio);
2168 return PTR_ERR(rbio); 2134 return PTR_ERR(rbio);
2169 } 2135 }
2170 2136
@@ -2175,7 +2141,8 @@ int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
2175 rbio->faila = find_logical_bio_stripe(rbio, bio); 2141 rbio->faila = find_logical_bio_stripe(rbio, bio);
2176 if (rbio->faila == -1) { 2142 if (rbio->faila == -1) {
2177 BUG(); 2143 BUG();
2178 __free_bbio_and_raid_map(bbio, raid_map, generic_io); 2144 if (generic_io)
2145 btrfs_put_bbio(bbio);
2179 kfree(rbio); 2146 kfree(rbio);
2180 return -EIO; 2147 return -EIO;
2181 } 2148 }
@@ -2184,7 +2151,7 @@ int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
2184 btrfs_bio_counter_inc_noblocked(root->fs_info); 2151 btrfs_bio_counter_inc_noblocked(root->fs_info);
2185 rbio->generic_bio_cnt = 1; 2152 rbio->generic_bio_cnt = 1;
2186 } else { 2153 } else {
2187 set_bit(RBIO_HOLD_BBIO_MAP_BIT, &rbio->flags); 2154 btrfs_get_bbio(bbio);
2188 } 2155 }
2189 2156
2190 /* 2157 /*
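
Replacing the RBIO_HOLD_BBIO_MAP_BIT bookkeeping with btrfs_get_bbio()/btrfs_put_bbio() turns ownership into plain reference counting: whoever may still touch the bbio holds a reference, and the last put frees it. A self-contained sketch of that pattern with C11 atomics (the struct and helpers are stand-ins, not the kernel API):

    #include <stdatomic.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct bbio {
            atomic_int refs;
            /* ... mapping data, raid_map, etc. ... */
    };

    static struct bbio *bbio_alloc(void)
    {
            struct bbio *b = calloc(1, sizeof(*b));

            if (b)
                    atomic_init(&b->refs, 1);   /* caller owns one reference */
            return b;
    }

    static void bbio_get(struct bbio *b)
    {
            atomic_fetch_add(&b->refs, 1);
    }

    static void bbio_put(struct bbio *b)
    {
            /* fetch_sub returns the old value: 1 means we were the last user */
            if (b && atomic_fetch_sub(&b->refs, 1) == 1)
                    free(b);
    }

    int main(void)
    {
            struct bbio *b = bbio_alloc();

            if (!b)
                    return 1;
            bbio_get(b);    /* e.g. a non-generic rbio keeping the bbio alive */
            bbio_put(b);    /* rbio done */
            bbio_put(b);    /* original owner done: freed here */
            printf("bbio lifetime handled by refcount\n");
            return 0;
    }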
@@ -2240,14 +2207,14 @@ static void read_rebuild_work(struct btrfs_work *work)
2240 2207
2241struct btrfs_raid_bio * 2208struct btrfs_raid_bio *
2242raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio, 2209raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio,
2243 struct btrfs_bio *bbio, u64 *raid_map, 2210 struct btrfs_bio *bbio, u64 stripe_len,
2244 u64 stripe_len, struct btrfs_device *scrub_dev, 2211 struct btrfs_device *scrub_dev,
2245 unsigned long *dbitmap, int stripe_nsectors) 2212 unsigned long *dbitmap, int stripe_nsectors)
2246{ 2213{
2247 struct btrfs_raid_bio *rbio; 2214 struct btrfs_raid_bio *rbio;
2248 int i; 2215 int i;
2249 2216
2250 rbio = alloc_rbio(root, bbio, raid_map, stripe_len); 2217 rbio = alloc_rbio(root, bbio, stripe_len);
2251 if (IS_ERR(rbio)) 2218 if (IS_ERR(rbio))
2252 return NULL; 2219 return NULL;
2253 bio_list_add(&rbio->bio_list, bio); 2220 bio_list_add(&rbio->bio_list, bio);
@@ -2279,10 +2246,10 @@ void raid56_parity_add_scrub_pages(struct btrfs_raid_bio *rbio,
2279 int stripe_offset; 2246 int stripe_offset;
2280 int index; 2247 int index;
2281 2248
2282 ASSERT(logical >= rbio->raid_map[0]); 2249 ASSERT(logical >= rbio->bbio->raid_map[0]);
2283 ASSERT(logical + PAGE_SIZE <= rbio->raid_map[0] + 2250 ASSERT(logical + PAGE_SIZE <= rbio->bbio->raid_map[0] +
2284 rbio->stripe_len * rbio->nr_data); 2251 rbio->stripe_len * rbio->nr_data);
2285 stripe_offset = (int)(logical - rbio->raid_map[0]); 2252 stripe_offset = (int)(logical - rbio->bbio->raid_map[0]);
2286 index = stripe_offset >> PAGE_CACHE_SHIFT; 2253 index = stripe_offset >> PAGE_CACHE_SHIFT;
2287 rbio->bio_pages[index] = page; 2254 rbio->bio_pages[index] = page;
2288} 2255}
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h
index 31d4a157b5e3..2b5d7977d83b 100644
--- a/fs/btrfs/raid56.h
+++ b/fs/btrfs/raid56.h
@@ -43,16 +43,15 @@ struct btrfs_raid_bio;
43struct btrfs_device; 43struct btrfs_device;
44 44
45int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, 45int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
46 struct btrfs_bio *bbio, u64 *raid_map, 46 struct btrfs_bio *bbio, u64 stripe_len,
47 u64 stripe_len, int mirror_num, int generic_io); 47 int mirror_num, int generic_io);
48int raid56_parity_write(struct btrfs_root *root, struct bio *bio, 48int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
49 struct btrfs_bio *bbio, u64 *raid_map, 49 struct btrfs_bio *bbio, u64 stripe_len);
50 u64 stripe_len);
51 50
52struct btrfs_raid_bio * 51struct btrfs_raid_bio *
53raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio, 52raid56_parity_alloc_scrub_rbio(struct btrfs_root *root, struct bio *bio,
54 struct btrfs_bio *bbio, u64 *raid_map, 53 struct btrfs_bio *bbio, u64 stripe_len,
55 u64 stripe_len, struct btrfs_device *scrub_dev, 54 struct btrfs_device *scrub_dev,
56 unsigned long *dbitmap, int stripe_nsectors); 55 unsigned long *dbitmap, int stripe_nsectors);
57void raid56_parity_add_scrub_pages(struct btrfs_raid_bio *rbio, 56void raid56_parity_add_scrub_pages(struct btrfs_raid_bio *rbio,
58 struct page *page, u64 logical); 57 struct page *page, u64 logical);
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index b63ae20618fb..0e7beea92b4c 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -66,7 +66,6 @@ struct reada_extctl {
66struct reada_extent { 66struct reada_extent {
67 u64 logical; 67 u64 logical;
68 struct btrfs_key top; 68 struct btrfs_key top;
69 u32 blocksize;
70 int err; 69 int err;
71 struct list_head extctl; 70 struct list_head extctl;
72 int refcnt; 71 int refcnt;
@@ -349,7 +348,6 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
349 348
350 blocksize = root->nodesize; 349 blocksize = root->nodesize;
351 re->logical = logical; 350 re->logical = logical;
352 re->blocksize = blocksize;
353 re->top = *top; 351 re->top = *top;
354 INIT_LIST_HEAD(&re->extctl); 352 INIT_LIST_HEAD(&re->extctl);
355 spin_lock_init(&re->lock); 353 spin_lock_init(&re->lock);
@@ -463,7 +461,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
463 spin_unlock(&fs_info->reada_lock); 461 spin_unlock(&fs_info->reada_lock);
464 btrfs_dev_replace_unlock(&fs_info->dev_replace); 462 btrfs_dev_replace_unlock(&fs_info->dev_replace);
465 463
466 kfree(bbio); 464 btrfs_put_bbio(bbio);
467 return re; 465 return re;
468 466
469error: 467error:
@@ -488,7 +486,7 @@ error:
488 kref_put(&zone->refcnt, reada_zone_release); 486 kref_put(&zone->refcnt, reada_zone_release);
489 spin_unlock(&fs_info->reada_lock); 487 spin_unlock(&fs_info->reada_lock);
490 } 488 }
491 kfree(bbio); 489 btrfs_put_bbio(bbio);
492 kfree(re); 490 kfree(re);
493 return re_exist; 491 return re_exist;
494} 492}
@@ -660,7 +658,6 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
660 int mirror_num = 0; 658 int mirror_num = 0;
661 struct extent_buffer *eb = NULL; 659 struct extent_buffer *eb = NULL;
662 u64 logical; 660 u64 logical;
663 u32 blocksize;
664 int ret; 661 int ret;
665 int i; 662 int i;
666 int need_kick = 0; 663 int need_kick = 0;
@@ -694,7 +691,7 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
694 spin_unlock(&fs_info->reada_lock); 691 spin_unlock(&fs_info->reada_lock);
695 return 0; 692 return 0;
696 } 693 }
697 dev->reada_next = re->logical + re->blocksize; 694 dev->reada_next = re->logical + fs_info->tree_root->nodesize;
698 re->refcnt++; 695 re->refcnt++;
699 696
700 spin_unlock(&fs_info->reada_lock); 697 spin_unlock(&fs_info->reada_lock);
@@ -709,7 +706,6 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
709 } 706 }
710 } 707 }
711 logical = re->logical; 708 logical = re->logical;
712 blocksize = re->blocksize;
713 709
714 spin_lock(&re->lock); 710 spin_lock(&re->lock);
715 if (re->scheduled_for == NULL) { 711 if (re->scheduled_for == NULL) {
@@ -724,8 +720,8 @@ static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
724 return 0; 720 return 0;
725 721
726 atomic_inc(&dev->reada_in_flight); 722 atomic_inc(&dev->reada_in_flight);
727 ret = reada_tree_block_flagged(fs_info->extent_root, logical, blocksize, 723 ret = reada_tree_block_flagged(fs_info->extent_root, logical,
728 mirror_num, &eb); 724 mirror_num, &eb);
729 if (ret) 725 if (ret)
730 __readahead_hook(fs_info->extent_root, NULL, logical, ret); 726 __readahead_hook(fs_info->extent_root, NULL, logical, ret);
731 else if (eb) 727 else if (eb)
@@ -851,7 +847,7 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all)
851 break; 847 break;
852 printk(KERN_DEBUG 848 printk(KERN_DEBUG
853 " re: logical %llu size %u empty %d for %lld", 849 " re: logical %llu size %u empty %d for %lld",
854 re->logical, re->blocksize, 850 re->logical, fs_info->tree_root->nodesize,
855 list_empty(&re->extctl), re->scheduled_for ? 851 list_empty(&re->extctl), re->scheduled_for ?
856 re->scheduled_for->devid : -1); 852 re->scheduled_for->devid : -1);
857 853
@@ -886,7 +882,8 @@ static void dump_devs(struct btrfs_fs_info *fs_info, int all)
886 } 882 }
887 printk(KERN_DEBUG 883 printk(KERN_DEBUG
888 "re: logical %llu size %u list empty %d for %lld", 884 "re: logical %llu size %u list empty %d for %lld",
889 re->logical, re->blocksize, list_empty(&re->extctl), 885 re->logical, fs_info->tree_root->nodesize,
886 list_empty(&re->extctl),
890 re->scheduled_for ? re->scheduled_for->devid : -1); 887 re->scheduled_for ? re->scheduled_for->devid : -1);
891 for (i = 0; i < re->nzones; ++i) { 888 for (i = 0; i < re->nzones; ++i) {
892 printk(KERN_CONT " zone %llu-%llu devs", 889 printk(KERN_CONT " zone %llu-%llu devs",
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 74257d6436ad..d83085381bcc 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2855,9 +2855,10 @@ static void update_processed_blocks(struct reloc_control *rc,
2855 } 2855 }
2856} 2856}
2857 2857
2858static int tree_block_processed(u64 bytenr, u32 blocksize, 2858static int tree_block_processed(u64 bytenr, struct reloc_control *rc)
2859 struct reloc_control *rc)
2860{ 2859{
2860 u32 blocksize = rc->extent_root->nodesize;
2861
2861 if (test_range_bit(&rc->processed_blocks, bytenr, 2862 if (test_range_bit(&rc->processed_blocks, bytenr,
2862 bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL)) 2863 bytenr + blocksize - 1, EXTENT_DIRTY, 1, NULL))
2863 return 1; 2864 return 1;
@@ -2965,8 +2966,7 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
2965 while (rb_node) { 2966 while (rb_node) {
2966 block = rb_entry(rb_node, struct tree_block, rb_node); 2967 block = rb_entry(rb_node, struct tree_block, rb_node);
2967 if (!block->key_ready) 2968 if (!block->key_ready)
2968 readahead_tree_block(rc->extent_root, block->bytenr, 2969 readahead_tree_block(rc->extent_root, block->bytenr);
2969 block->key.objectid);
2970 rb_node = rb_next(rb_node); 2970 rb_node = rb_next(rb_node);
2971 } 2971 }
2972 2972
@@ -3353,7 +3353,7 @@ static int __add_tree_block(struct reloc_control *rc,
3353 bool skinny = btrfs_fs_incompat(rc->extent_root->fs_info, 3353 bool skinny = btrfs_fs_incompat(rc->extent_root->fs_info,
3354 SKINNY_METADATA); 3354 SKINNY_METADATA);
3355 3355
3356 if (tree_block_processed(bytenr, blocksize, rc)) 3356 if (tree_block_processed(bytenr, rc))
3357 return 0; 3357 return 0;
3358 3358
3359 if (tree_search(blocks, bytenr)) 3359 if (tree_search(blocks, bytenr))
@@ -3611,7 +3611,7 @@ static int find_data_references(struct reloc_control *rc,
3611 if (added) 3611 if (added)
3612 goto next; 3612 goto next;
3613 3613
3614 if (!tree_block_processed(leaf->start, leaf->len, rc)) { 3614 if (!tree_block_processed(leaf->start, rc)) {
3615 block = kmalloc(sizeof(*block), GFP_NOFS); 3615 block = kmalloc(sizeof(*block), GFP_NOFS);
3616 if (!block) { 3616 if (!block) {
3617 err = -ENOMEM; 3617 err = -ENOMEM;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index f2bb13a23f86..ec57687c9a4d 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -66,7 +66,6 @@ struct scrub_ctx;
66struct scrub_recover { 66struct scrub_recover {
67 atomic_t refs; 67 atomic_t refs;
68 struct btrfs_bio *bbio; 68 struct btrfs_bio *bbio;
69 u64 *raid_map;
70 u64 map_length; 69 u64 map_length;
71}; 70};
72 71
@@ -80,7 +79,7 @@ struct scrub_page {
80 u64 logical; 79 u64 logical;
81 u64 physical; 80 u64 physical;
82 u64 physical_for_dev_replace; 81 u64 physical_for_dev_replace;
83 atomic_t ref_count; 82 atomic_t refs;
84 struct { 83 struct {
85 unsigned int mirror_num:8; 84 unsigned int mirror_num:8;
86 unsigned int have_csum:1; 85 unsigned int have_csum:1;
@@ -113,7 +112,7 @@ struct scrub_block {
113 struct scrub_page *pagev[SCRUB_MAX_PAGES_PER_BLOCK]; 112 struct scrub_page *pagev[SCRUB_MAX_PAGES_PER_BLOCK];
114 int page_count; 113 int page_count;
115 atomic_t outstanding_pages; 114 atomic_t outstanding_pages;
116 atomic_t ref_count; /* free mem on transition to zero */ 115 atomic_t refs; /* free mem on transition to zero */
117 struct scrub_ctx *sctx; 116 struct scrub_ctx *sctx;
118 struct scrub_parity *sparity; 117 struct scrub_parity *sparity;
119 struct { 118 struct {
@@ -142,7 +141,7 @@ struct scrub_parity {
142 141
143 int stripe_len; 142 int stripe_len;
144 143
145 atomic_t ref_count; 144 atomic_t refs;
146 145
147 struct list_head spages; 146 struct list_head spages;
148 147
@@ -194,6 +193,15 @@ struct scrub_ctx {
194 */ 193 */
195 struct btrfs_scrub_progress stat; 194 struct btrfs_scrub_progress stat;
196 spinlock_t stat_lock; 195 spinlock_t stat_lock;
196
197 /*
198 * Use a ref counter to avoid use-after-free issues. Scrub workers
199 * decrement bios_in_flight and workers_pending and then do a wakeup
200 * on the list_wait wait queue. We must ensure the main scrub task
201 * doesn't free the scrub context before or while the workers are
202 * doing the wakeup() call.
203 */
204 atomic_t refs;
197}; 205};
198 206
199struct scrub_fixup_nodatasum { 207struct scrub_fixup_nodatasum {
@@ -236,10 +244,7 @@ static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
236static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx); 244static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx);
237static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx); 245static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx);
238static int scrub_handle_errored_block(struct scrub_block *sblock_to_check); 246static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
239static int scrub_setup_recheck_block(struct scrub_ctx *sctx, 247static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
240 struct btrfs_fs_info *fs_info,
241 struct scrub_block *original_sblock,
242 u64 length, u64 logical,
243 struct scrub_block *sblocks_for_recheck); 248 struct scrub_block *sblocks_for_recheck);
244static void scrub_recheck_block(struct btrfs_fs_info *fs_info, 249static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
245 struct scrub_block *sblock, int is_metadata, 250 struct scrub_block *sblock, int is_metadata,
@@ -251,8 +256,7 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
251 const u8 *csum, u64 generation, 256 const u8 *csum, u64 generation,
252 u16 csum_size); 257 u16 csum_size);
253static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad, 258static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
254 struct scrub_block *sblock_good, 259 struct scrub_block *sblock_good);
255 int force_write);
256static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, 260static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
257 struct scrub_block *sblock_good, 261 struct scrub_block *sblock_good,
258 int page_num, int force_write); 262 int page_num, int force_write);
@@ -302,10 +306,12 @@ static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
302static void copy_nocow_pages_worker(struct btrfs_work *work); 306static void copy_nocow_pages_worker(struct btrfs_work *work);
303static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info); 307static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
304static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info); 308static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
309static void scrub_put_ctx(struct scrub_ctx *sctx);
305 310
306 311
307static void scrub_pending_bio_inc(struct scrub_ctx *sctx) 312static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
308{ 313{
314 atomic_inc(&sctx->refs);
309 atomic_inc(&sctx->bios_in_flight); 315 atomic_inc(&sctx->bios_in_flight);
310} 316}
311 317
@@ -313,6 +319,7 @@ static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
313{ 319{
314 atomic_dec(&sctx->bios_in_flight); 320 atomic_dec(&sctx->bios_in_flight);
315 wake_up(&sctx->list_wait); 321 wake_up(&sctx->list_wait);
322 scrub_put_ctx(sctx);
316} 323}
317 324
318static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info) 325static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
@@ -346,6 +353,7 @@ static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
346{ 353{
347 struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info; 354 struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
348 355
356 atomic_inc(&sctx->refs);
349 /* 357 /*
350 * increment scrubs_running to prevent cancel requests from 358 * increment scrubs_running to prevent cancel requests from
351 * completing as long as a worker is running. we must also 359 * completing as long as a worker is running. we must also
@@ -388,6 +396,7 @@ static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx)
388 atomic_dec(&sctx->workers_pending); 396 atomic_dec(&sctx->workers_pending);
389 wake_up(&fs_info->scrub_pause_wait); 397 wake_up(&fs_info->scrub_pause_wait);
390 wake_up(&sctx->list_wait); 398 wake_up(&sctx->list_wait);
399 scrub_put_ctx(sctx);
391} 400}
392 401
393static void scrub_free_csums(struct scrub_ctx *sctx) 402static void scrub_free_csums(struct scrub_ctx *sctx)
@@ -433,6 +442,12 @@ static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
433 kfree(sctx); 442 kfree(sctx);
434} 443}
435 444
445static void scrub_put_ctx(struct scrub_ctx *sctx)
446{
447 if (atomic_dec_and_test(&sctx->refs))
448 scrub_free_ctx(sctx);
449}
450
436static noinline_for_stack 451static noinline_for_stack
437struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) 452struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
438{ 453{
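
scrub_put_ctx() above gives the scrub context the same treatment: the next hunk initializes the count to one for the main task, and the inc/dec paths shown earlier pin it per in-flight bio and per pending worker, which is what makes the wake_up() in the dec paths safe. A condensed sketch of the dec-wake-put ordering (names are stand-ins):

    #include <stdatomic.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct sctx {
            atomic_int refs;
            atomic_int bios_in_flight;
    };

    static void sctx_put(struct sctx *s)
    {
            if (atomic_fetch_sub(&s->refs, 1) == 1)
                    free(s);
    }

    static void pending_bio_inc(struct sctx *s)
    {
            atomic_fetch_add(&s->refs, 1);          /* pin the context */
            atomic_fetch_add(&s->bios_in_flight, 1);
    }

    static void pending_bio_dec(struct sctx *s)
    {
            atomic_fetch_sub(&s->bios_in_flight, 1);
            /* wake_up(&s->list_wait) would go here; the context is still
             * guaranteed alive because this path owns a reference */
            sctx_put(s);                            /* may free, but only now */
    }

    int main(void)
    {
            struct sctx *s = calloc(1, sizeof(*s));

            if (!s)
                    return 1;
            atomic_init(&s->refs, 1);               /* main scrub task */
            pending_bio_inc(s);
            pending_bio_dec(s);                     /* worker finishes */
            sctx_put(s);                            /* main task done: freed */
            printf("no use-after-free window around the wakeup\n");
            return 0;
    }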
@@ -457,6 +472,7 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
457 sctx = kzalloc(sizeof(*sctx), GFP_NOFS); 472 sctx = kzalloc(sizeof(*sctx), GFP_NOFS);
458 if (!sctx) 473 if (!sctx)
459 goto nomem; 474 goto nomem;
475 atomic_set(&sctx->refs, 1);
460 sctx->is_dev_replace = is_dev_replace; 476 sctx->is_dev_replace = is_dev_replace;
461 sctx->pages_per_rd_bio = pages_per_rd_bio; 477 sctx->pages_per_rd_bio = pages_per_rd_bio;
462 sctx->curr = -1; 478 sctx->curr = -1;
@@ -520,6 +536,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
520 struct inode_fs_paths *ipath = NULL; 536 struct inode_fs_paths *ipath = NULL;
521 struct btrfs_root *local_root; 537 struct btrfs_root *local_root;
522 struct btrfs_key root_key; 538 struct btrfs_key root_key;
539 struct btrfs_key key;
523 540
524 root_key.objectid = root; 541 root_key.objectid = root;
525 root_key.type = BTRFS_ROOT_ITEM_KEY; 542 root_key.type = BTRFS_ROOT_ITEM_KEY;
@@ -530,7 +547,14 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
530 goto err; 547 goto err;
531 } 548 }
532 549
533 ret = inode_item_info(inum, 0, local_root, swarn->path); 550 /*
551 * this makes the path point to (inum INODE_ITEM ioff)
552 */
553 key.objectid = inum;
554 key.type = BTRFS_INODE_ITEM_KEY;
555 key.offset = 0;
556
557 ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0);
534 if (ret) { 558 if (ret) {
535 btrfs_release_path(swarn->path); 559 btrfs_release_path(swarn->path);
536 goto err; 560 goto err;
@@ -848,8 +872,7 @@ static inline void scrub_get_recover(struct scrub_recover *recover)
848static inline void scrub_put_recover(struct scrub_recover *recover) 872static inline void scrub_put_recover(struct scrub_recover *recover)
849{ 873{
850 if (atomic_dec_and_test(&recover->refs)) { 874 if (atomic_dec_and_test(&recover->refs)) {
851 kfree(recover->bbio); 875 btrfs_put_bbio(recover->bbio);
852 kfree(recover->raid_map);
853 kfree(recover); 876 kfree(recover);
854 } 877 }
855} 878}
@@ -955,8 +978,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
955 } 978 }
956 979
957 /* setup the context, map the logical blocks and alloc the pages */ 980 /* setup the context, map the logical blocks and alloc the pages */
958 ret = scrub_setup_recheck_block(sctx, fs_info, sblock_to_check, length, 981 ret = scrub_setup_recheck_block(sblock_to_check, sblocks_for_recheck);
959 logical, sblocks_for_recheck);
960 if (ret) { 982 if (ret) {
961 spin_lock(&sctx->stat_lock); 983 spin_lock(&sctx->stat_lock);
962 sctx->stat.read_errors++; 984 sctx->stat.read_errors++;
@@ -1030,9 +1052,10 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
1030 if (!is_metadata && !have_csum) { 1052 if (!is_metadata && !have_csum) {
1031 struct scrub_fixup_nodatasum *fixup_nodatasum; 1053 struct scrub_fixup_nodatasum *fixup_nodatasum;
1032 1054
1033nodatasum_case:
1034 WARN_ON(sctx->is_dev_replace); 1055 WARN_ON(sctx->is_dev_replace);
1035 1056
1057nodatasum_case:
1058
1036 /* 1059 /*
1037 * !is_metadata and !have_csum, this means that the data 1060 * !is_metadata and !have_csum, this means that the data
1038 * might not be COW'ed, that it might be modified 1061 * might not be COW'ed, that it might be modified
@@ -1091,76 +1114,20 @@ nodatasum_case:
1091 sblock_other->no_io_error_seen) { 1114 sblock_other->no_io_error_seen) {
1092 if (sctx->is_dev_replace) { 1115 if (sctx->is_dev_replace) {
1093 scrub_write_block_to_dev_replace(sblock_other); 1116 scrub_write_block_to_dev_replace(sblock_other);
1117 goto corrected_error;
1094 } else { 1118 } else {
1095 int force_write = is_metadata || have_csum;
1096
1097 ret = scrub_repair_block_from_good_copy( 1119 ret = scrub_repair_block_from_good_copy(
1098 sblock_bad, sblock_other, 1120 sblock_bad, sblock_other);
1099 force_write); 1121 if (!ret)
1122 goto corrected_error;
1100 } 1123 }
1101 if (0 == ret)
1102 goto corrected_error;
1103 } 1124 }
1104 } 1125 }
1105 1126
1106 /* 1127 if (sblock_bad->no_io_error_seen && !sctx->is_dev_replace)
1107 * for dev_replace, pick good pages and write to the target device. 1128 goto did_not_correct_error;
1108 */
1109 if (sctx->is_dev_replace) {
1110 success = 1;
1111 for (page_num = 0; page_num < sblock_bad->page_count;
1112 page_num++) {
1113 int sub_success;
1114
1115 sub_success = 0;
1116 for (mirror_index = 0;
1117 mirror_index < BTRFS_MAX_MIRRORS &&
1118 sblocks_for_recheck[mirror_index].page_count > 0;
1119 mirror_index++) {
1120 struct scrub_block *sblock_other =
1121 sblocks_for_recheck + mirror_index;
1122 struct scrub_page *page_other =
1123 sblock_other->pagev[page_num];
1124
1125 if (!page_other->io_error) {
1126 ret = scrub_write_page_to_dev_replace(
1127 sblock_other, page_num);
1128 if (ret == 0) {
1129 /* succeeded for this page */
1130 sub_success = 1;
1131 break;
1132 } else {
1133 btrfs_dev_replace_stats_inc(
1134 &sctx->dev_root->
1135 fs_info->dev_replace.
1136 num_write_errors);
1137 }
1138 }
1139 }
1140
1141 if (!sub_success) {
1142 /*
1143 * did not find a mirror to fetch the page
1144 * from. scrub_write_page_to_dev_replace()
1145 * handles this case (page->io_error), by
1146 * filling the block with zeros before
1147 * submitting the write request
1148 */
1149 success = 0;
1150 ret = scrub_write_page_to_dev_replace(
1151 sblock_bad, page_num);
1152 if (ret)
1153 btrfs_dev_replace_stats_inc(
1154 &sctx->dev_root->fs_info->
1155 dev_replace.num_write_errors);
1156 }
1157 }
1158
1159 goto out;
1160 }
1161 1129
1162 /* 1130 /*
1163 * for regular scrub, repair those pages that are errored.
1164 * In case of I/O errors in the area that is supposed to be 1131 * In case of I/O errors in the area that is supposed to be
1165 * repaired, continue by picking good copies of those pages. 1132 * repaired, continue by picking good copies of those pages.
1166 * Select the good pages from mirrors to rewrite bad pages from 1133 * Select the good pages from mirrors to rewrite bad pages from
@@ -1184,44 +1151,64 @@ nodatasum_case:
1184 * mirror, even if other 512 byte sectors in the same PAGE_SIZE 1151 * mirror, even if other 512 byte sectors in the same PAGE_SIZE
1185 * area are unreadable. 1152 * area are unreadable.
1186 */ 1153 */
1187
1188 /* can only fix I/O errors from here on */
1189 if (sblock_bad->no_io_error_seen)
1190 goto did_not_correct_error;
1191
1192 success = 1; 1154 success = 1;
1193 for (page_num = 0; page_num < sblock_bad->page_count; page_num++) { 1155 for (page_num = 0; page_num < sblock_bad->page_count;
1156 page_num++) {
1194 struct scrub_page *page_bad = sblock_bad->pagev[page_num]; 1157 struct scrub_page *page_bad = sblock_bad->pagev[page_num];
1158 struct scrub_block *sblock_other = NULL;
1195 1159
1196 if (!page_bad->io_error) 1160 /* skip no-io-error page in scrub */
1161 if (!page_bad->io_error && !sctx->is_dev_replace)
1197 continue; 1162 continue;
1198 1163
1199 for (mirror_index = 0; 1164 /* try to find no-io-error page in mirrors */
1200 mirror_index < BTRFS_MAX_MIRRORS && 1165 if (page_bad->io_error) {
1201 sblocks_for_recheck[mirror_index].page_count > 0; 1166 for (mirror_index = 0;
1202 mirror_index++) { 1167 mirror_index < BTRFS_MAX_MIRRORS &&
1203 struct scrub_block *sblock_other = sblocks_for_recheck + 1168 sblocks_for_recheck[mirror_index].page_count > 0;
1204 mirror_index; 1169 mirror_index++) {
1205 struct scrub_page *page_other = sblock_other->pagev[ 1170 if (!sblocks_for_recheck[mirror_index].
1206 page_num]; 1171 pagev[page_num]->io_error) {
1207 1172 sblock_other = sblocks_for_recheck +
1208 if (!page_other->io_error) { 1173 mirror_index;
1209 ret = scrub_repair_page_from_good_copy( 1174 break;
1210 sblock_bad, sblock_other, page_num, 0);
1211 if (0 == ret) {
1212 page_bad->io_error = 0;
1213 break; /* succeeded for this page */
1214 } 1175 }
1215 } 1176 }
1177 if (!sblock_other)
1178 success = 0;
1216 } 1179 }
1217 1180
1218 if (page_bad->io_error) { 1181 if (sctx->is_dev_replace) {
1219 /* did not find a mirror to copy the page from */ 1182 /*
1220 success = 0; 1183 * did not find a mirror to fetch the page
1184 * from. scrub_write_page_to_dev_replace()
1185 * handles this case (page->io_error), by
1186 * filling the block with zeros before
1187 * submitting the write request
1188 */
1189 if (!sblock_other)
1190 sblock_other = sblock_bad;
1191
1192 if (scrub_write_page_to_dev_replace(sblock_other,
1193 page_num) != 0) {
1194 btrfs_dev_replace_stats_inc(
1195 &sctx->dev_root->
1196 fs_info->dev_replace.
1197 num_write_errors);
1198 success = 0;
1199 }
1200 } else if (sblock_other) {
1201 ret = scrub_repair_page_from_good_copy(sblock_bad,
1202 sblock_other,
1203 page_num, 0);
1204 if (0 == ret)
1205 page_bad->io_error = 0;
1206 else
1207 success = 0;
1221 } 1208 }
1222 } 1209 }
1223 1210
1224 if (success) { 1211 if (success && !sctx->is_dev_replace) {
1225 if (is_metadata || have_csum) { 1212 if (is_metadata || have_csum) {
1226 /* 1213 /*
1227 * need to verify the checksum now that all 1214 * need to verify the checksum now that all
@@ -1288,19 +1275,18 @@ out:
1288 return 0; 1275 return 0;
1289} 1276}
1290 1277
1291static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio, u64 *raid_map) 1278static inline int scrub_nr_raid_mirrors(struct btrfs_bio *bbio)
1292{ 1279{
1293 if (raid_map) { 1280 if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
1294 if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE) 1281 return 2;
1295 return 3; 1282 else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6)
1296 else 1283 return 3;
1297 return 2; 1284 else
1298 } else {
1299 return (int)bbio->num_stripes; 1285 return (int)bbio->num_stripes;
1300 }
1301} 1286}
1302 1287
1303static inline void scrub_stripe_index_and_offset(u64 logical, u64 *raid_map, 1288static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
1289 u64 *raid_map,
1304 u64 mapped_length, 1290 u64 mapped_length,
1305 int nstripes, int mirror, 1291 int nstripes, int mirror,
1306 int *stripe_index, 1292 int *stripe_index,
@@ -1308,7 +1294,7 @@ static inline void scrub_stripe_index_and_offset(u64 logical, u64 *raid_map,
1308{ 1294{
1309 int i; 1295 int i;
1310 1296
1311 if (raid_map) { 1297 if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
1312 /* RAID5/6 */ 1298 /* RAID5/6 */
1313 for (i = 0; i < nstripes; i++) { 1299 for (i = 0; i < nstripes; i++) {
1314 if (raid_map[i] == RAID6_Q_STRIPE || 1300 if (raid_map[i] == RAID6_Q_STRIPE ||
@@ -1329,72 +1315,65 @@ static inline void scrub_stripe_index_and_offset(u64 logical, u64 *raid_map,
1329 } 1315 }
1330} 1316}
1331 1317
1332static int scrub_setup_recheck_block(struct scrub_ctx *sctx, 1318static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
1333 struct btrfs_fs_info *fs_info,
1334 struct scrub_block *original_sblock,
1335 u64 length, u64 logical,
1336 struct scrub_block *sblocks_for_recheck) 1319 struct scrub_block *sblocks_for_recheck)
1337{ 1320{
1321 struct scrub_ctx *sctx = original_sblock->sctx;
1322 struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
1323 u64 length = original_sblock->page_count * PAGE_SIZE;
1324 u64 logical = original_sblock->pagev[0]->logical;
1338 struct scrub_recover *recover; 1325 struct scrub_recover *recover;
1339 struct btrfs_bio *bbio; 1326 struct btrfs_bio *bbio;
1340 u64 *raid_map;
1341 u64 sublen; 1327 u64 sublen;
1342 u64 mapped_length; 1328 u64 mapped_length;
1343 u64 stripe_offset; 1329 u64 stripe_offset;
1344 int stripe_index; 1330 int stripe_index;
1345 int page_index; 1331 int page_index = 0;
1346 int mirror_index; 1332 int mirror_index;
1347 int nmirrors; 1333 int nmirrors;
1348 int ret; 1334 int ret;
1349 1335
1350 /* 1336 /*
1351 * note: the two members ref_count and outstanding_pages 1337 * note: the two members refs and outstanding_pages
1352 * are not used (and not set) in the blocks that are used for 1338 * are not used (and not set) in the blocks that are used for
1353 * the recheck procedure 1339 * the recheck procedure
1354 */ 1340 */
1355 1341
1356 page_index = 0;
1357 while (length > 0) { 1342 while (length > 0) {
1358 sublen = min_t(u64, length, PAGE_SIZE); 1343 sublen = min_t(u64, length, PAGE_SIZE);
1359 mapped_length = sublen; 1344 mapped_length = sublen;
1360 bbio = NULL; 1345 bbio = NULL;
1361 raid_map = NULL;
1362 1346
1363 /* 1347 /*
1364 * with a length of PAGE_SIZE, each returned stripe 1348 * with a length of PAGE_SIZE, each returned stripe
1365 * represents one mirror 1349 * represents one mirror
1366 */ 1350 */
1367 ret = btrfs_map_sblock(fs_info, REQ_GET_READ_MIRRORS, logical, 1351 ret = btrfs_map_sblock(fs_info, REQ_GET_READ_MIRRORS, logical,
1368 &mapped_length, &bbio, 0, &raid_map); 1352 &mapped_length, &bbio, 0, 1);
1369 if (ret || !bbio || mapped_length < sublen) { 1353 if (ret || !bbio || mapped_length < sublen) {
1370 kfree(bbio); 1354 btrfs_put_bbio(bbio);
1371 kfree(raid_map);
1372 return -EIO; 1355 return -EIO;
1373 } 1356 }
1374 1357
1375 recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS); 1358 recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
1376 if (!recover) { 1359 if (!recover) {
1377 kfree(bbio); 1360 btrfs_put_bbio(bbio);
1378 kfree(raid_map);
1379 return -ENOMEM; 1361 return -ENOMEM;
1380 } 1362 }
1381 1363
1382 atomic_set(&recover->refs, 1); 1364 atomic_set(&recover->refs, 1);
1383 recover->bbio = bbio; 1365 recover->bbio = bbio;
1384 recover->raid_map = raid_map;
1385 recover->map_length = mapped_length; 1366 recover->map_length = mapped_length;
1386 1367
1387 BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO); 1368 BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO);
1388 1369
1389 nmirrors = scrub_nr_raid_mirrors(bbio, raid_map); 1370 nmirrors = min(scrub_nr_raid_mirrors(bbio), BTRFS_MAX_MIRRORS);
1371
1390 for (mirror_index = 0; mirror_index < nmirrors; 1372 for (mirror_index = 0; mirror_index < nmirrors;
1391 mirror_index++) { 1373 mirror_index++) {
1392 struct scrub_block *sblock; 1374 struct scrub_block *sblock;
1393 struct scrub_page *page; 1375 struct scrub_page *page;
1394 1376
1395 if (mirror_index >= BTRFS_MAX_MIRRORS)
1396 continue;
1397
1398 sblock = sblocks_for_recheck + mirror_index; 1377 sblock = sblocks_for_recheck + mirror_index;
1399 sblock->sctx = sctx; 1378 sblock->sctx = sctx;
1400 page = kzalloc(sizeof(*page), GFP_NOFS); 1379 page = kzalloc(sizeof(*page), GFP_NOFS);
@@ -1410,9 +1389,12 @@ leave_nomem:
1410 sblock->pagev[page_index] = page; 1389 sblock->pagev[page_index] = page;
1411 page->logical = logical; 1390 page->logical = logical;
1412 1391
1413 scrub_stripe_index_and_offset(logical, raid_map, 1392 scrub_stripe_index_and_offset(logical,
1393 bbio->map_type,
1394 bbio->raid_map,
1414 mapped_length, 1395 mapped_length,
1415 bbio->num_stripes, 1396 bbio->num_stripes -
1397 bbio->num_tgtdevs,
1416 mirror_index, 1398 mirror_index,
1417 &stripe_index, 1399 &stripe_index,
1418 &stripe_offset); 1400 &stripe_offset);
@@ -1458,7 +1440,8 @@ static void scrub_bio_wait_endio(struct bio *bio, int error)
1458 1440
1459static inline int scrub_is_page_on_raid56(struct scrub_page *page) 1441static inline int scrub_is_page_on_raid56(struct scrub_page *page)
1460{ 1442{
1461 return page->recover && page->recover->raid_map; 1443 return page->recover &&
1444 (page->recover->bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
1462} 1445}
1463 1446
1464static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info, 1447static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
@@ -1475,7 +1458,6 @@ static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
1475 bio->bi_end_io = scrub_bio_wait_endio; 1458 bio->bi_end_io = scrub_bio_wait_endio;
1476 1459
1477 ret = raid56_parity_recover(fs_info->fs_root, bio, page->recover->bbio, 1460 ret = raid56_parity_recover(fs_info->fs_root, bio, page->recover->bbio,
1478 page->recover->raid_map,
1479 page->recover->map_length, 1461 page->recover->map_length,
1480 page->mirror_num, 0); 1462 page->mirror_num, 0);
1481 if (ret) 1463 if (ret)
@@ -1615,8 +1597,7 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
1615} 1597}
1616 1598
1617static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad, 1599static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
1618 struct scrub_block *sblock_good, 1600 struct scrub_block *sblock_good)
1619 int force_write)
1620{ 1601{
1621 int page_num; 1602 int page_num;
1622 int ret = 0; 1603 int ret = 0;
@@ -1626,8 +1607,7 @@ static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
1626 1607
1627 ret_sub = scrub_repair_page_from_good_copy(sblock_bad, 1608 ret_sub = scrub_repair_page_from_good_copy(sblock_bad,
1628 sblock_good, 1609 sblock_good,
1629 page_num, 1610 page_num, 1);
1630 force_write);
1631 if (ret_sub) 1611 if (ret_sub)
1632 ret = ret_sub; 1612 ret = ret_sub;
1633 } 1613 }
@@ -2067,12 +2047,12 @@ static int scrub_checksum_super(struct scrub_block *sblock)
2067 2047
2068static void scrub_block_get(struct scrub_block *sblock) 2048static void scrub_block_get(struct scrub_block *sblock)
2069{ 2049{
2070 atomic_inc(&sblock->ref_count); 2050 atomic_inc(&sblock->refs);
2071} 2051}
2072 2052
2073static void scrub_block_put(struct scrub_block *sblock) 2053static void scrub_block_put(struct scrub_block *sblock)
2074{ 2054{
2075 if (atomic_dec_and_test(&sblock->ref_count)) { 2055 if (atomic_dec_and_test(&sblock->refs)) {
2076 int i; 2056 int i;
2077 2057
2078 if (sblock->sparity) 2058 if (sblock->sparity)
@@ -2086,12 +2066,12 @@ static void scrub_block_put(struct scrub_block *sblock)
2086 2066
2087static void scrub_page_get(struct scrub_page *spage) 2067static void scrub_page_get(struct scrub_page *spage)
2088{ 2068{
2089 atomic_inc(&spage->ref_count); 2069 atomic_inc(&spage->refs);
2090} 2070}
2091 2071
2092static void scrub_page_put(struct scrub_page *spage) 2072static void scrub_page_put(struct scrub_page *spage)
2093{ 2073{
2094 if (atomic_dec_and_test(&spage->ref_count)) { 2074 if (atomic_dec_and_test(&spage->refs)) {
2095 if (spage->page) 2075 if (spage->page)
2096 __free_page(spage->page); 2076 __free_page(spage->page);
2097 kfree(spage); 2077 kfree(spage);
@@ -2217,7 +2197,7 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
2217 2197
2218 /* one ref inside this function, plus one for each page added to 2198 /* one ref inside this function, plus one for each page added to
2219 * a bio later on */ 2199 * a bio later on */
2220 atomic_set(&sblock->ref_count, 1); 2200 atomic_set(&sblock->refs, 1);
2221 sblock->sctx = sctx; 2201 sblock->sctx = sctx;
2222 sblock->no_io_error_seen = 1; 2202 sblock->no_io_error_seen = 1;
2223 2203
@@ -2510,7 +2490,7 @@ static int scrub_pages_for_parity(struct scrub_parity *sparity,
2510 2490
2511 /* one ref inside this function, plus one for each page added to 2491 /* one ref inside this function, plus one for each page added to
2512 * a bio later on */ 2492 * a bio later on */
2513 atomic_set(&sblock->ref_count, 1); 2493 atomic_set(&sblock->refs, 1);
2514 sblock->sctx = sctx; 2494 sblock->sctx = sctx;
2515 sblock->no_io_error_seen = 1; 2495 sblock->no_io_error_seen = 1;
2516 sblock->sparity = sparity; 2496 sblock->sparity = sparity;
@@ -2607,9 +2587,9 @@ static int scrub_extent_for_parity(struct scrub_parity *sparity,
2607 ret = scrub_pages_for_parity(sparity, logical, l, physical, dev, 2587 ret = scrub_pages_for_parity(sparity, logical, l, physical, dev,
2608 flags, gen, mirror_num, 2588 flags, gen, mirror_num,
2609 have_csum ? csum : NULL); 2589 have_csum ? csum : NULL);
2610skip:
2611 if (ret) 2590 if (ret)
2612 return ret; 2591 return ret;
2592skip:
2613 len -= l; 2593 len -= l;
2614 logical += l; 2594 logical += l;
2615 physical += l; 2595 physical += l;
@@ -2705,7 +2685,6 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
2705 struct btrfs_raid_bio *rbio; 2685 struct btrfs_raid_bio *rbio;
2706 struct scrub_page *spage; 2686 struct scrub_page *spage;
2707 struct btrfs_bio *bbio = NULL; 2687 struct btrfs_bio *bbio = NULL;
2708 u64 *raid_map = NULL;
2709 u64 length; 2688 u64 length;
2710 int ret; 2689 int ret;
2711 2690
@@ -2716,8 +2695,8 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
2716 length = sparity->logic_end - sparity->logic_start + 1; 2695 length = sparity->logic_end - sparity->logic_start + 1;
2717 ret = btrfs_map_sblock(sctx->dev_root->fs_info, WRITE, 2696 ret = btrfs_map_sblock(sctx->dev_root->fs_info, WRITE,
2718 sparity->logic_start, 2697 sparity->logic_start,
2719 &length, &bbio, 0, &raid_map); 2698 &length, &bbio, 0, 1);
2720 if (ret || !bbio || !raid_map) 2699 if (ret || !bbio || !bbio->raid_map)
2721 goto bbio_out; 2700 goto bbio_out;
2722 2701
2723 bio = btrfs_io_bio_alloc(GFP_NOFS, 0); 2702 bio = btrfs_io_bio_alloc(GFP_NOFS, 0);
@@ -2729,8 +2708,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
2729 bio->bi_end_io = scrub_parity_bio_endio; 2708 bio->bi_end_io = scrub_parity_bio_endio;
2730 2709
2731 rbio = raid56_parity_alloc_scrub_rbio(sctx->dev_root, bio, bbio, 2710 rbio = raid56_parity_alloc_scrub_rbio(sctx->dev_root, bio, bbio,
2732 raid_map, length, 2711 length, sparity->scrub_dev,
2733 sparity->scrub_dev,
2734 sparity->dbitmap, 2712 sparity->dbitmap,
2735 sparity->nsectors); 2713 sparity->nsectors);
2736 if (!rbio) 2714 if (!rbio)
@@ -2747,8 +2725,7 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
2747rbio_out: 2725rbio_out:
2748 bio_put(bio); 2726 bio_put(bio);
2749bbio_out: 2727bbio_out:
2750 kfree(bbio); 2728 btrfs_put_bbio(bbio);
2751 kfree(raid_map);
2752 bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap, 2729 bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
2753 sparity->nsectors); 2730 sparity->nsectors);
2754 spin_lock(&sctx->stat_lock); 2731 spin_lock(&sctx->stat_lock);
@@ -2765,12 +2742,12 @@ static inline int scrub_calc_parity_bitmap_len(int nsectors)
2765 2742
2766static void scrub_parity_get(struct scrub_parity *sparity) 2743static void scrub_parity_get(struct scrub_parity *sparity)
2767{ 2744{
2768 atomic_inc(&sparity->ref_count); 2745 atomic_inc(&sparity->refs);
2769} 2746}
2770 2747
2771static void scrub_parity_put(struct scrub_parity *sparity) 2748static void scrub_parity_put(struct scrub_parity *sparity)
2772{ 2749{
2773 if (!atomic_dec_and_test(&sparity->ref_count)) 2750 if (!atomic_dec_and_test(&sparity->refs))
2774 return; 2751 return;
2775 2752
2776 scrub_parity_check_and_repair(sparity); 2753 scrub_parity_check_and_repair(sparity);
@@ -2820,7 +2797,7 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
2820 sparity->scrub_dev = sdev; 2797 sparity->scrub_dev = sdev;
2821 sparity->logic_start = logic_start; 2798 sparity->logic_start = logic_start;
2822 sparity->logic_end = logic_end; 2799 sparity->logic_end = logic_end;
2823 atomic_set(&sparity->ref_count, 1); 2800 atomic_set(&sparity->refs, 1);
2824 INIT_LIST_HEAD(&sparity->spages); 2801 INIT_LIST_HEAD(&sparity->spages);
2825 sparity->dbitmap = sparity->bitmap; 2802 sparity->dbitmap = sparity->bitmap;
2826 sparity->ebitmap = (void *)sparity->bitmap + bitmap_len; 2803 sparity->ebitmap = (void *)sparity->bitmap + bitmap_len;
@@ -3037,8 +3014,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
3037 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 3014 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
3038 increment = map->stripe_len; 3015 increment = map->stripe_len;
3039 mirror_num = num % map->num_stripes + 1; 3016 mirror_num = num % map->num_stripes + 1;
3040 } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | 3017 } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3041 BTRFS_BLOCK_GROUP_RAID6)) {
3042 get_raid56_logic_offset(physical, num, map, &offset, NULL); 3018 get_raid56_logic_offset(physical, num, map, &offset, NULL);
3043 increment = map->stripe_len * nr_data_stripes(map); 3019 increment = map->stripe_len * nr_data_stripes(map);
3044 mirror_num = 1; 3020 mirror_num = 1;
@@ -3053,7 +3029,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
3053 3029
3054 ppath = btrfs_alloc_path(); 3030 ppath = btrfs_alloc_path();
3055 if (!ppath) { 3031 if (!ppath) {
3056 btrfs_free_path(ppath); 3032 btrfs_free_path(path);
3057 return -ENOMEM; 3033 return -ENOMEM;
3058 } 3034 }
3059 3035
@@ -3065,6 +3041,8 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
3065 path->search_commit_root = 1; 3041 path->search_commit_root = 1;
3066 path->skip_locking = 1; 3042 path->skip_locking = 1;
3067 3043
3044 ppath->search_commit_root = 1;
3045 ppath->skip_locking = 1;
3068 /* 3046 /*
3069 * trigger the readahead for extent tree csum tree and wait for 3047 * trigger the readahead for extent tree csum tree and wait for
3070 * completion. During readahead, the scrub is officially paused 3048 * completion. During readahead, the scrub is officially paused
@@ -3072,8 +3050,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
3072 */ 3050 */
3073 logical = base + offset; 3051 logical = base + offset;
3074 physical_end = physical + nstripes * map->stripe_len; 3052 physical_end = physical + nstripes * map->stripe_len;
3075 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | 3053 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3076 BTRFS_BLOCK_GROUP_RAID6)) {
3077 get_raid56_logic_offset(physical_end, num, 3054 get_raid56_logic_offset(physical_end, num,
3078 map, &logic_end, NULL); 3055 map, &logic_end, NULL);
3079 logic_end += base; 3056 logic_end += base;
@@ -3119,8 +3096,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
3119 ret = 0; 3096 ret = 0;
3120 while (physical < physical_end) { 3097 while (physical < physical_end) {
3121 /* for raid56, we skip parity stripe */ 3098 /* for raid56, we skip parity stripe */
3122 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | 3099 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3123 BTRFS_BLOCK_GROUP_RAID6)) {
3124 ret = get_raid56_logic_offset(physical, num, 3100 ret = get_raid56_logic_offset(physical, num,
3125 map, &logical, &stripe_logical); 3101 map, &logical, &stripe_logical);
3126 logical += base; 3102 logical += base;
@@ -3278,8 +3254,7 @@ again:
3278 scrub_free_csums(sctx); 3254 scrub_free_csums(sctx);
3279 if (extent_logical + extent_len < 3255 if (extent_logical + extent_len <
3280 key.objectid + bytes) { 3256 key.objectid + bytes) {
3281 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | 3257 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
3282 BTRFS_BLOCK_GROUP_RAID6)) {
3283 /* 3258 /*
3284 * loop until we find next data stripe 3259 * loop until we find next data stripe
3285 * or we have finished all stripes. 3260 * or we have finished all stripes.
@@ -3773,7 +3748,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
3773 scrub_workers_put(fs_info); 3748 scrub_workers_put(fs_info);
3774 mutex_unlock(&fs_info->scrub_lock); 3749 mutex_unlock(&fs_info->scrub_lock);
3775 3750
3776 scrub_free_ctx(sctx); 3751 scrub_put_ctx(sctx);
3777 3752
3778 return ret; 3753 return ret;
3779} 3754}
@@ -3879,14 +3854,14 @@ static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
3879 &mapped_length, &bbio, 0); 3854 &mapped_length, &bbio, 0);
3880 if (ret || !bbio || mapped_length < extent_len || 3855 if (ret || !bbio || mapped_length < extent_len ||
3881 !bbio->stripes[0].dev->bdev) { 3856 !bbio->stripes[0].dev->bdev) {
3882 kfree(bbio); 3857 btrfs_put_bbio(bbio);
3883 return; 3858 return;
3884 } 3859 }
3885 3860
3886 *extent_physical = bbio->stripes[0].physical; 3861 *extent_physical = bbio->stripes[0].physical;
3887 *extent_mirror_num = bbio->mirror_num; 3862 *extent_mirror_num = bbio->mirror_num;
3888 *extent_dev = bbio->stripes[0].dev; 3863 *extent_dev = bbio->stripes[0].dev;
3889 kfree(bbio); 3864 btrfs_put_bbio(bbio);
3890} 3865}
3891 3866
3892static int scrub_setup_wr_ctx(struct scrub_ctx *sctx, 3867static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
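
For context on the error paths above: with the raid map now embedded in
struct btrfs_bio, every bail-out in the scrub code releases a single
refcounted object instead of a kfree(bbio)/kfree(raid_map) pair. A minimal
sketch of that discipline, assuming the btrfs_map_sblock() signature used
in this patch (map_one_mirror_page() is a hypothetical helper, not part of
the patch):

	static int map_one_mirror_page(struct btrfs_fs_info *fs_info, u64 logical,
				       u64 *mapped_length, struct btrfs_bio **bbio)
	{
		int ret;

		*bbio = NULL;
		*mapped_length = PAGE_SIZE;
		/* with a length of PAGE_SIZE, each returned stripe is one mirror */
		ret = btrfs_map_sblock(fs_info, REQ_GET_READ_MIRRORS, logical,
				       mapped_length, bbio, 0, 1);
		if (ret || !*bbio || *mapped_length < PAGE_SIZE) {
			btrfs_put_bbio(*bbio);	/* NULL-safe; frees the whole blob */
			*bbio = NULL;
			return -EIO;
		}
		return 0;
	}
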
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 804432dbc351..fe5857223515 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -2471,12 +2471,9 @@ verbose_printk("btrfs: send_utimes %llu\n", ino);
2471 if (ret < 0) 2471 if (ret < 0)
2472 goto out; 2472 goto out;
2473 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); 2473 TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p);
2474 TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_ATIME, eb, 2474 TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_ATIME, eb, &ii->atime);
2475 btrfs_inode_atime(ii)); 2475 TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_MTIME, eb, &ii->mtime);
2476 TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_MTIME, eb, 2476 TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_CTIME, eb, &ii->ctime);
2477 btrfs_inode_mtime(ii));
2478 TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_CTIME, eb,
2479 btrfs_inode_ctime(ii));
2480 /* TODO Add otime support when the otime patches get into upstream */ 2477 /* TODO Add otime support when the otime patches get into upstream */
2481 2478
2482 ret = send_cmd(sctx); 2479 ret = send_cmd(sctx);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 60f7cbe815e9..05fef198ff94 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1000,10 +1000,20 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
1000 */ 1000 */
1001 if (fs_info->pending_changes == 0) 1001 if (fs_info->pending_changes == 0)
1002 return 0; 1002 return 0;
1003 /*
1004 * A non-blocking test if the fs is frozen. We must not
1005 * start a new transaction here otherwise a deadlock
1006 * happens. The pending operations are delayed to the
1007 * next commit after thawing.
1008 */
1009 if (__sb_start_write(sb, SB_FREEZE_WRITE, false))
1010 __sb_end_write(sb, SB_FREEZE_WRITE);
1011 else
1012 return 0;
1003 trans = btrfs_start_transaction(root, 0); 1013 trans = btrfs_start_transaction(root, 0);
1004 } else {
1005 return PTR_ERR(trans);
1006 } 1014 }
1015 if (IS_ERR(trans))
1016 return PTR_ERR(trans);
1007 } 1017 }
1008 return btrfs_commit_transaction(trans, root); 1018 return btrfs_commit_transaction(trans, root);
1009} 1019}
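
The added hunk is a trylock-style freeze test: __sb_start_write() with
wait == false returns nonzero only when the superblock is not frozen, and
on success it takes a freeze reference that must be dropped immediately.
Condensed, using only the calls shown above:

	/* non-blocking: never start a transaction on a frozen fs */
	if (__sb_start_write(sb, SB_FREEZE_WRITE, false))
		__sb_end_write(sb, SB_FREEZE_WRITE);	/* not frozen; drop the ref */
	else
		return 0;	/* frozen; the pending changes wait for thaw */
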
@@ -1948,11 +1958,6 @@ static int btrfs_freeze(struct super_block *sb)
1948 return btrfs_commit_transaction(trans, root); 1958 return btrfs_commit_transaction(trans, root);
1949} 1959}
1950 1960
1951static int btrfs_unfreeze(struct super_block *sb)
1952{
1953 return 0;
1954}
1955
1956static int btrfs_show_devname(struct seq_file *m, struct dentry *root) 1961static int btrfs_show_devname(struct seq_file *m, struct dentry *root)
1957{ 1962{
1958 struct btrfs_fs_info *fs_info = btrfs_sb(root->d_sb); 1963 struct btrfs_fs_info *fs_info = btrfs_sb(root->d_sb);
@@ -2001,7 +2006,6 @@ static const struct super_operations btrfs_super_ops = {
2001 .statfs = btrfs_statfs, 2006 .statfs = btrfs_statfs,
2002 .remount_fs = btrfs_remount, 2007 .remount_fs = btrfs_remount,
2003 .freeze_fs = btrfs_freeze, 2008 .freeze_fs = btrfs_freeze,
2004 .unfreeze_fs = btrfs_unfreeze,
2005}; 2009};
2006 2010
2007static const struct file_operations btrfs_ctl_fops = { 2011static const struct file_operations btrfs_ctl_fops = {
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 92db3f648df4..94edb0a2a026 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -733,10 +733,18 @@ int btrfs_init_sysfs(void)
733 733
734 ret = btrfs_init_debugfs(); 734 ret = btrfs_init_debugfs();
735 if (ret) 735 if (ret)
736 return ret; 736 goto out1;
737 737
738 init_feature_attrs(); 738 init_feature_attrs();
739 ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_feature_attr_group); 739 ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_feature_attr_group);
740 if (ret)
741 goto out2;
742
743 return 0;
744out2:
745 debugfs_remove_recursive(btrfs_debugfs_root_dentry);
746out1:
747 kset_unregister(btrfs_kset);
740 748
741 return ret; 749 return ret;
742} 750}
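
The sysfs init path now unwinds on failure instead of leaking the kset or
the debugfs tree. A minimal sketch of the label-per-stage idiom it adopts
(setup_a()/teardown_a() and friends are hypothetical stand-ins for
kset_create_and_add(), btrfs_init_debugfs() and sysfs_create_group()):

	int demo_init(void)
	{
		int ret;

		ret = setup_a();
		if (ret)
			return ret;	/* nothing to undo yet */
		ret = setup_b();
		if (ret)
			goto undo_a;
		ret = setup_c();
		if (ret)
			goto undo_b;
		return 0;

	undo_b:
		teardown_b();
	undo_a:
		teardown_a();
		return ret;	/* each label undoes everything set up before it */
	}
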
diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c
index cc286ce97d1e..f51963a8f929 100644
--- a/fs/btrfs/tests/extent-buffer-tests.c
+++ b/fs/btrfs/tests/extent-buffer-tests.c
@@ -53,7 +53,7 @@ static int test_btrfs_split_item(void)
53 return -ENOMEM; 53 return -ENOMEM;
54 } 54 }
55 55
56 path->nodes[0] = eb = alloc_dummy_extent_buffer(0, 4096); 56 path->nodes[0] = eb = alloc_dummy_extent_buffer(NULL, 4096);
57 if (!eb) { 57 if (!eb) {
58 test_msg("Could not allocate dummy buffer\n"); 58 test_msg("Could not allocate dummy buffer\n");
59 ret = -ENOMEM; 59 ret = -ENOMEM;
diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c
index 7e99c2f98dd0..9e9f2368177d 100644
--- a/fs/btrfs/tests/extent-io-tests.c
+++ b/fs/btrfs/tests/extent-io-tests.c
@@ -258,8 +258,7 @@ static int test_find_delalloc(void)
258 } 258 }
259 ret = 0; 259 ret = 0;
260out_bits: 260out_bits:
261 clear_extent_bits(&tmp, 0, total_dirty - 1, 261 clear_extent_bits(&tmp, 0, total_dirty - 1, (unsigned)-1, GFP_NOFS);
262 (unsigned long)-1, GFP_NOFS);
263out: 262out:
264 if (locked_page) 263 if (locked_page)
265 page_cache_release(locked_page); 264 page_cache_release(locked_page);
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index 3ae0f5b8bb80..a116b55ce788 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -255,7 +255,7 @@ static noinline int test_btrfs_get_extent(void)
255 goto out; 255 goto out;
256 } 256 }
257 257
258 root->node = alloc_dummy_extent_buffer(0, 4096); 258 root->node = alloc_dummy_extent_buffer(NULL, 4096);
259 if (!root->node) { 259 if (!root->node) {
260 test_msg("Couldn't allocate dummy buffer\n"); 260 test_msg("Couldn't allocate dummy buffer\n");
261 goto out; 261 goto out;
@@ -843,7 +843,7 @@ static int test_hole_first(void)
843 goto out; 843 goto out;
844 } 844 }
845 845
846 root->node = alloc_dummy_extent_buffer(0, 4096); 846 root->node = alloc_dummy_extent_buffer(NULL, 4096);
847 if (!root->node) { 847 if (!root->node) {
848 test_msg("Couldn't allocate dummy buffer\n"); 848 test_msg("Couldn't allocate dummy buffer\n");
849 goto out; 849 goto out;
diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c
index ec3dcb202357..73f299ebdabb 100644
--- a/fs/btrfs/tests/qgroup-tests.c
+++ b/fs/btrfs/tests/qgroup-tests.c
@@ -404,12 +404,22 @@ int btrfs_test_qgroups(void)
404 ret = -ENOMEM; 404 ret = -ENOMEM;
405 goto out; 405 goto out;
406 } 406 }
407 /* We are using this root as our extent root */
408 root->fs_info->extent_root = root;
409
410 /*
411 * Some of the paths we test assume we have a filled out fs_info, so we
412 * just need to add the root in there so we don't panic.
413 */
414 root->fs_info->tree_root = root;
415 root->fs_info->quota_root = root;
416 root->fs_info->quota_enabled = 1;
407 417
408 /* 418 /*
409 * Can't use bytenr 0, some things freak out 419 * Can't use bytenr 0, some things freak out
410 * *cough*backref walking code*cough* 420 * *cough*backref walking code*cough*
411 */ 421 */
412 root->node = alloc_test_extent_buffer(root->fs_info, 4096, 4096); 422 root->node = alloc_test_extent_buffer(root->fs_info, 4096);
413 if (!root->node) { 423 if (!root->node) {
414 test_msg("Couldn't allocate dummy buffer\n"); 424 test_msg("Couldn't allocate dummy buffer\n");
415 ret = -ENOMEM; 425 ret = -ENOMEM;
@@ -448,17 +458,6 @@ int btrfs_test_qgroups(void)
448 goto out; 458 goto out;
449 } 459 }
450 460
451 /* We are using this root as our extent root */
452 root->fs_info->extent_root = root;
453
454 /*
455 * Some of the paths we test assume we have a filled out fs_info, so we
456 * just need to addt he root in there so we don't panic.
457 */
458 root->fs_info->tree_root = root;
459 root->fs_info->quota_root = root;
460 root->fs_info->quota_enabled = 1;
461
462 test_msg("Running qgroup tests\n"); 461 test_msg("Running qgroup tests\n");
463 ret = test_no_shared_qgroup(root); 462 ret = test_no_shared_qgroup(root);
464 if (ret) 463 if (ret)
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index a605d4e2f2bc..7e80f32550a6 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -220,6 +220,7 @@ loop:
220 * commit the transaction. 220 * commit the transaction.
221 */ 221 */
222 atomic_set(&cur_trans->use_count, 2); 222 atomic_set(&cur_trans->use_count, 2);
223 cur_trans->have_free_bgs = 0;
223 cur_trans->start_time = get_seconds(); 224 cur_trans->start_time = get_seconds();
224 225
225 cur_trans->delayed_refs.href_root = RB_ROOT; 226 cur_trans->delayed_refs.href_root = RB_ROOT;
@@ -248,6 +249,8 @@ loop:
248 INIT_LIST_HEAD(&cur_trans->pending_chunks); 249 INIT_LIST_HEAD(&cur_trans->pending_chunks);
249 INIT_LIST_HEAD(&cur_trans->switch_commits); 250 INIT_LIST_HEAD(&cur_trans->switch_commits);
250 INIT_LIST_HEAD(&cur_trans->pending_ordered); 251 INIT_LIST_HEAD(&cur_trans->pending_ordered);
252 INIT_LIST_HEAD(&cur_trans->dirty_bgs);
253 spin_lock_init(&cur_trans->dirty_bgs_lock);
251 list_add_tail(&cur_trans->list, &fs_info->trans_list); 254 list_add_tail(&cur_trans->list, &fs_info->trans_list);
252 extent_io_tree_init(&cur_trans->dirty_pages, 255 extent_io_tree_init(&cur_trans->dirty_pages,
253 fs_info->btree_inode->i_mapping); 256 fs_info->btree_inode->i_mapping);
@@ -1020,6 +1023,7 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
1020 u64 old_root_bytenr; 1023 u64 old_root_bytenr;
1021 u64 old_root_used; 1024 u64 old_root_used;
1022 struct btrfs_root *tree_root = root->fs_info->tree_root; 1025 struct btrfs_root *tree_root = root->fs_info->tree_root;
1026 bool extent_root = (root->objectid == BTRFS_EXTENT_TREE_OBJECTID);
1023 1027
1024 old_root_used = btrfs_root_used(&root->root_item); 1028 old_root_used = btrfs_root_used(&root->root_item);
1025 btrfs_write_dirty_block_groups(trans, root); 1029 btrfs_write_dirty_block_groups(trans, root);
@@ -1027,7 +1031,9 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
1027 while (1) { 1031 while (1) {
1028 old_root_bytenr = btrfs_root_bytenr(&root->root_item); 1032 old_root_bytenr = btrfs_root_bytenr(&root->root_item);
1029 if (old_root_bytenr == root->node->start && 1033 if (old_root_bytenr == root->node->start &&
1030 old_root_used == btrfs_root_used(&root->root_item)) 1034 old_root_used == btrfs_root_used(&root->root_item) &&
1035 (!extent_root ||
1036 list_empty(&trans->transaction->dirty_bgs)))
1031 break; 1037 break;
1032 1038
1033 btrfs_set_root_node(&root->root_item, root->node); 1039 btrfs_set_root_node(&root->root_item, root->node);
@@ -1038,7 +1044,15 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
1038 return ret; 1044 return ret;
1039 1045
1040 old_root_used = btrfs_root_used(&root->root_item); 1046 old_root_used = btrfs_root_used(&root->root_item);
1041 ret = btrfs_write_dirty_block_groups(trans, root); 1047 if (extent_root) {
1048 ret = btrfs_write_dirty_block_groups(trans, root);
1049 if (ret)
1050 return ret;
1051 }
1052 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
1053 if (ret)
1054 return ret;
1055 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
1042 if (ret) 1056 if (ret)
1043 return ret; 1057 return ret;
1044 } 1058 }
@@ -1061,10 +1075,6 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
1061 struct extent_buffer *eb; 1075 struct extent_buffer *eb;
1062 int ret; 1076 int ret;
1063 1077
1064 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
1065 if (ret)
1066 return ret;
1067
1068 eb = btrfs_lock_root_node(fs_info->tree_root); 1078 eb = btrfs_lock_root_node(fs_info->tree_root);
1069 ret = btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 1079 ret = btrfs_cow_block(trans, fs_info->tree_root, eb, NULL,
1070 0, &eb); 1080 0, &eb);
@@ -1097,6 +1107,7 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
1097 next = fs_info->dirty_cowonly_roots.next; 1107 next = fs_info->dirty_cowonly_roots.next;
1098 list_del_init(next); 1108 list_del_init(next);
1099 root = list_entry(next, struct btrfs_root, dirty_list); 1109 root = list_entry(next, struct btrfs_root, dirty_list);
1110 clear_bit(BTRFS_ROOT_DIRTY, &root->state);
1100 1111
1101 if (root != fs_info->extent_root) 1112 if (root != fs_info->extent_root)
1102 list_add_tail(&root->dirty_list, 1113 list_add_tail(&root->dirty_list,
@@ -1983,6 +1994,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1983 switch_commit_roots(cur_trans, root->fs_info); 1994 switch_commit_roots(cur_trans, root->fs_info);
1984 1995
1985 assert_qgroups_uptodate(trans); 1996 assert_qgroups_uptodate(trans);
1997 ASSERT(list_empty(&cur_trans->dirty_bgs));
1986 update_super_roots(root); 1998 update_super_roots(root);
1987 1999
1988 btrfs_set_super_log_root(root->fs_info->super_copy, 0); 2000 btrfs_set_super_log_root(root->fs_info->super_copy, 0);
@@ -2026,6 +2038,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
2026 2038
2027 btrfs_finish_extent_commit(trans, root); 2039 btrfs_finish_extent_commit(trans, root);
2028 2040
2041 if (cur_trans->have_free_bgs)
2042 btrfs_clear_space_info_full(root->fs_info);
2043
2029 root->fs_info->last_trans_committed = cur_trans->transid; 2044 root->fs_info->last_trans_committed = cur_trans->transid;
2030 /* 2045 /*
2031 * We needn't acquire the lock here because there is no other task 2046 * We needn't acquire the lock here because there is no other task
@@ -2118,7 +2133,7 @@ void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info)
2118 unsigned long prev; 2133 unsigned long prev;
2119 unsigned long bit; 2134 unsigned long bit;
2120 2135
2121 prev = cmpxchg(&fs_info->pending_changes, 0, 0); 2136 prev = xchg(&fs_info->pending_changes, 0);
2122 if (!prev) 2137 if (!prev)
2123 return; 2138 return;
2124 2139
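
The cmpxchg() to xchg() switch is a behavior fix, not a cleanup:
cmpxchg(&fs_info->pending_changes, 0, 0) returns the old value but only
stores when that value was already 0, so non-zero pending bits were read
yet never cleared. xchg(..., 0) atomically fetches the bits and resets the
word. A stand-alone sketch of the distinction, with C11 atomics standing
in for the kernel primitives:

	#include <stdatomic.h>

	/* like the fixed code: consume all pending bits in one step */
	static unsigned long take_pending(atomic_ulong *pending)
	{
		return atomic_exchange(pending, 0);
	}

	/* like the old code: returns the bits, but a non-zero word is
	 * never cleared, so the same bits are reported forever */
	static unsigned long peek_pending(atomic_ulong *pending)
	{
		unsigned long expected = 0;

		atomic_compare_exchange_strong(pending, &expected, 0);
		return expected;
	}
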
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 00ed29c4b3f9..937050a2b68e 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -47,6 +47,11 @@ struct btrfs_transaction {
47 atomic_t num_writers; 47 atomic_t num_writers;
48 atomic_t use_count; 48 atomic_t use_count;
49 49
50 /*
51 * true if there are free block group operations in this transaction
52 */
53 int have_free_bgs;
54
50 /* Be protected by fs_info->trans_lock when we want to change it. */ 55 /* Be protected by fs_info->trans_lock when we want to change it. */
51 enum btrfs_trans_state state; 56 enum btrfs_trans_state state;
52 struct list_head list; 57 struct list_head list;
@@ -58,6 +63,8 @@ struct btrfs_transaction {
58 struct list_head pending_chunks; 63 struct list_head pending_chunks;
59 struct list_head pending_ordered; 64 struct list_head pending_ordered;
60 struct list_head switch_commits; 65 struct list_head switch_commits;
66 struct list_head dirty_bgs;
67 spinlock_t dirty_bgs_lock;
61 struct btrfs_delayed_ref_root delayed_refs; 68 struct btrfs_delayed_ref_root delayed_refs;
62 int aborted; 69 int aborted;
63}; 70};
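
The new dirty_bgs list is per-transaction and guarded by dirty_bgs_lock;
the transaction.c hunks above drain it before update_cowonly_root() is
allowed to converge and assert it empty at commit time. A hedged sketch of
the expected enqueue pattern (cache is a hypothetical block-group cache
with a dirty_list link; the actual caller sits in extent-tree.c, outside
this section):

	spin_lock(&trans->transaction->dirty_bgs_lock);
	if (list_empty(&cache->dirty_list))
		list_add_tail(&cache->dirty_list,
			      &trans->transaction->dirty_bgs);
	spin_unlock(&trans->transaction->dirty_bgs_lock);
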
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 9a02da16f2be..9a37f8b39bae 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -453,11 +453,13 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans,
453insert: 453insert:
454 btrfs_release_path(path); 454 btrfs_release_path(path);
455 /* try to insert the key into the destination tree */ 455 /* try to insert the key into the destination tree */
456 path->skip_release_on_error = 1;
456 ret = btrfs_insert_empty_item(trans, root, path, 457 ret = btrfs_insert_empty_item(trans, root, path,
457 key, item_size); 458 key, item_size);
459 path->skip_release_on_error = 0;
458 460
459 /* make sure any existing item is the correct size */ 461 /* make sure any existing item is the correct size */
460 if (ret == -EEXIST) { 462 if (ret == -EEXIST || ret == -EOVERFLOW) {
461 u32 found_size; 463 u32 found_size;
462 found_size = btrfs_item_size_nr(path->nodes[0], 464 found_size = btrfs_item_size_nr(path->nodes[0],
463 path->slots[0]); 465 path->slots[0]);
@@ -488,8 +490,20 @@ insert:
488 src_item = (struct btrfs_inode_item *)src_ptr; 490 src_item = (struct btrfs_inode_item *)src_ptr;
489 dst_item = (struct btrfs_inode_item *)dst_ptr; 491 dst_item = (struct btrfs_inode_item *)dst_ptr;
490 492
491 if (btrfs_inode_generation(eb, src_item) == 0) 493 if (btrfs_inode_generation(eb, src_item) == 0) {
494 struct extent_buffer *dst_eb = path->nodes[0];
495
496 if (S_ISREG(btrfs_inode_mode(eb, src_item)) &&
497 S_ISREG(btrfs_inode_mode(dst_eb, dst_item))) {
498 struct btrfs_map_token token;
499 u64 ino_size = btrfs_inode_size(eb, src_item);
500
501 btrfs_init_map_token(&token);
502 btrfs_set_token_inode_size(dst_eb, dst_item,
503 ino_size, &token);
504 }
492 goto no_copy; 505 goto no_copy;
506 }
493 507
494 if (overwrite_root && 508 if (overwrite_root &&
495 S_ISDIR(btrfs_inode_mode(eb, src_item)) && 509 S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
@@ -844,7 +858,7 @@ out:
844static noinline int backref_in_log(struct btrfs_root *log, 858static noinline int backref_in_log(struct btrfs_root *log,
845 struct btrfs_key *key, 859 struct btrfs_key *key,
846 u64 ref_objectid, 860 u64 ref_objectid,
847 char *name, int namelen) 861 const char *name, int namelen)
848{ 862{
849 struct btrfs_path *path; 863 struct btrfs_path *path;
850 struct btrfs_inode_ref *ref; 864 struct btrfs_inode_ref *ref;
@@ -1254,13 +1268,14 @@ out:
1254} 1268}
1255 1269
1256static int insert_orphan_item(struct btrfs_trans_handle *trans, 1270static int insert_orphan_item(struct btrfs_trans_handle *trans,
1257 struct btrfs_root *root, u64 offset) 1271 struct btrfs_root *root, u64 ino)
1258{ 1272{
1259 int ret; 1273 int ret;
1260 ret = btrfs_find_item(root, NULL, BTRFS_ORPHAN_OBJECTID, 1274
1261 offset, BTRFS_ORPHAN_ITEM_KEY, NULL); 1275 ret = btrfs_insert_orphan_item(trans, root, ino);
1262 if (ret > 0) 1276 if (ret == -EEXIST)
1263 ret = btrfs_insert_orphan_item(trans, root, offset); 1277 ret = 0;
1278
1264 return ret; 1279 return ret;
1265} 1280}
1266 1281
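
The rewritten helper replaces two tree searches (btrfs_find_item() followed
by a conditional insert) with a single try-insert that folds -EEXIST into
success; besides saving a search, there is no longer a window between the
lookup and the insert. The idiom in isolation:

	ret = btrfs_insert_orphan_item(trans, root, ino);
	if (ret == -EEXIST)
		ret = 0;	/* already recorded as orphan; not an error */
	return ret;
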
@@ -1287,6 +1302,7 @@ static int count_inode_extrefs(struct btrfs_root *root,
1287 leaf = path->nodes[0]; 1302 leaf = path->nodes[0];
1288 item_size = btrfs_item_size_nr(leaf, path->slots[0]); 1303 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1289 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); 1304 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
1305 cur_offset = 0;
1290 1306
1291 while (cur_offset < item_size) { 1307 while (cur_offset < item_size) {
1292 extref = (struct btrfs_inode_extref *) (ptr + cur_offset); 1308 extref = (struct btrfs_inode_extref *) (ptr + cur_offset);
@@ -1302,7 +1318,7 @@ static int count_inode_extrefs(struct btrfs_root *root,
1302 } 1318 }
1303 btrfs_release_path(path); 1319 btrfs_release_path(path);
1304 1320
1305 if (ret < 0) 1321 if (ret < 0 && ret != -ENOENT)
1306 return ret; 1322 return ret;
1307 return nlink; 1323 return nlink;
1308} 1324}
@@ -1394,9 +1410,6 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
1394 nlink = ret; 1410 nlink = ret;
1395 1411
1396 ret = count_inode_extrefs(root, inode, path); 1412 ret = count_inode_extrefs(root, inode, path);
1397 if (ret == -ENOENT)
1398 ret = 0;
1399
1400 if (ret < 0) 1413 if (ret < 0)
1401 goto out; 1414 goto out;
1402 1415
@@ -1557,6 +1570,30 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans,
1557} 1570}
1558 1571
1559/* 1572/*
1573 * Return true if an inode reference exists in the log for the given name,
1574 * inode and parent inode.
1575 */
1576static bool name_in_log_ref(struct btrfs_root *log_root,
1577 const char *name, const int name_len,
1578 const u64 dirid, const u64 ino)
1579{
1580 struct btrfs_key search_key;
1581
1582 search_key.objectid = ino;
1583 search_key.type = BTRFS_INODE_REF_KEY;
1584 search_key.offset = dirid;
1585 if (backref_in_log(log_root, &search_key, dirid, name, name_len))
1586 return true;
1587
1588 search_key.type = BTRFS_INODE_EXTREF_KEY;
1589 search_key.offset = btrfs_extref_hash(dirid, name, name_len);
1590 if (backref_in_log(log_root, &search_key, dirid, name, name_len))
1591 return true;
1592
1593 return false;
1594}
1595
1596/*
1560 * take a single entry in a log directory item and replay it into 1597 * take a single entry in a log directory item and replay it into
1561 * the subvolume. 1598 * the subvolume.
1562 * 1599 *
@@ -1666,10 +1703,17 @@ out:
1666 return ret; 1703 return ret;
1667 1704
1668insert: 1705insert:
1706 if (name_in_log_ref(root->log_root, name, name_len,
1707 key->objectid, log_key.objectid)) {
1708 /* The dentry will be added later. */
1709 ret = 0;
1710 update_size = false;
1711 goto out;
1712 }
1669 btrfs_release_path(path); 1713 btrfs_release_path(path);
1670 ret = insert_one_name(trans, root, path, key->objectid, key->offset, 1714 ret = insert_one_name(trans, root, path, key->objectid, key->offset,
1671 name, name_len, log_type, &log_key); 1715 name, name_len, log_type, &log_key);
1672 if (ret && ret != -ENOENT) 1716 if (ret && ret != -ENOENT && ret != -EEXIST)
1673 goto out; 1717 goto out;
1674 update_size = false; 1718 update_size = false;
1675 ret = 0; 1719 ret = 0;
@@ -2164,7 +2208,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
2164 parent = path->nodes[*level]; 2208 parent = path->nodes[*level];
2165 root_owner = btrfs_header_owner(parent); 2209 root_owner = btrfs_header_owner(parent);
2166 2210
2167 next = btrfs_find_create_tree_block(root, bytenr, blocksize); 2211 next = btrfs_find_create_tree_block(root, bytenr);
2168 if (!next) 2212 if (!next)
2169 return -ENOMEM; 2213 return -ENOMEM;
2170 2214
@@ -2416,8 +2460,8 @@ static void wait_for_writer(struct btrfs_trans_handle *trans,
2416 mutex_unlock(&root->log_mutex); 2460 mutex_unlock(&root->log_mutex);
2417 if (atomic_read(&root->log_writers)) 2461 if (atomic_read(&root->log_writers))
2418 schedule(); 2462 schedule();
2419 mutex_lock(&root->log_mutex);
2420 finish_wait(&root->log_writer_wait, &wait); 2463 finish_wait(&root->log_writer_wait, &wait);
2464 mutex_lock(&root->log_mutex);
2421 } 2465 }
2422} 2466}
2423 2467
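
The reorder above matters: finish_wait() restores the task state and
unlinks the on-stack wait entry, and doing that before re-taking log_mutex
means the task never blocks on the mutex while still queued in
TASK_UNINTERRUPTIBLE. The full corrected sequence, reconstructed from this
hunk plus the prepare_to_wait() call earlier in wait_for_writer() (not
shown in the hunk):

	prepare_to_wait(&root->log_writer_wait, &wait,
			TASK_UNINTERRUPTIBLE);
	mutex_unlock(&root->log_mutex);
	if (atomic_read(&root->log_writers))
		schedule();
	finish_wait(&root->log_writer_wait, &wait);
	mutex_lock(&root->log_mutex);
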
@@ -2591,6 +2635,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2591 } 2635 }
2592 2636
2593 if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) { 2637 if (log_root_tree->log_transid_committed >= root_log_ctx.log_transid) {
2638 blk_finish_plug(&plug);
2594 mutex_unlock(&log_root_tree->log_mutex); 2639 mutex_unlock(&log_root_tree->log_mutex);
2595 ret = root_log_ctx.log_ret; 2640 ret = root_log_ctx.log_ret;
2596 goto out; 2641 goto out;
@@ -3218,7 +3263,8 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
3218static void fill_inode_item(struct btrfs_trans_handle *trans, 3263static void fill_inode_item(struct btrfs_trans_handle *trans,
3219 struct extent_buffer *leaf, 3264 struct extent_buffer *leaf,
3220 struct btrfs_inode_item *item, 3265 struct btrfs_inode_item *item,
3221 struct inode *inode, int log_inode_only) 3266 struct inode *inode, int log_inode_only,
3267 u64 logged_isize)
3222{ 3268{
3223 struct btrfs_map_token token; 3269 struct btrfs_map_token token;
3224 3270
@@ -3231,7 +3277,7 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
3231 * to say 'update this inode with these values' 3277 * to say 'update this inode with these values'
3232 */ 3278 */
3233 btrfs_set_token_inode_generation(leaf, item, 0, &token); 3279 btrfs_set_token_inode_generation(leaf, item, 0, &token);
3234 btrfs_set_token_inode_size(leaf, item, 0, &token); 3280 btrfs_set_token_inode_size(leaf, item, logged_isize, &token);
3235 } else { 3281 } else {
3236 btrfs_set_token_inode_generation(leaf, item, 3282 btrfs_set_token_inode_generation(leaf, item,
3237 BTRFS_I(inode)->generation, 3283 BTRFS_I(inode)->generation,
@@ -3244,19 +3290,19 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
3244 btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token); 3290 btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token);
3245 btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token); 3291 btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token);
3246 3292
3247 btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item), 3293 btrfs_set_token_timespec_sec(leaf, &item->atime,
3248 inode->i_atime.tv_sec, &token); 3294 inode->i_atime.tv_sec, &token);
3249 btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item), 3295 btrfs_set_token_timespec_nsec(leaf, &item->atime,
3250 inode->i_atime.tv_nsec, &token); 3296 inode->i_atime.tv_nsec, &token);
3251 3297
3252 btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item), 3298 btrfs_set_token_timespec_sec(leaf, &item->mtime,
3253 inode->i_mtime.tv_sec, &token); 3299 inode->i_mtime.tv_sec, &token);
3254 btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item), 3300 btrfs_set_token_timespec_nsec(leaf, &item->mtime,
3255 inode->i_mtime.tv_nsec, &token); 3301 inode->i_mtime.tv_nsec, &token);
3256 3302
3257 btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item), 3303 btrfs_set_token_timespec_sec(leaf, &item->ctime,
3258 inode->i_ctime.tv_sec, &token); 3304 inode->i_ctime.tv_sec, &token);
3259 btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item), 3305 btrfs_set_token_timespec_nsec(leaf, &item->ctime,
3260 inode->i_ctime.tv_nsec, &token); 3306 inode->i_ctime.tv_nsec, &token);
3261 3307
3262 btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode), 3308 btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode),
@@ -3283,7 +3329,7 @@ static int log_inode_item(struct btrfs_trans_handle *trans,
3283 return ret; 3329 return ret;
3284 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], 3330 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3285 struct btrfs_inode_item); 3331 struct btrfs_inode_item);
3286 fill_inode_item(trans, path->nodes[0], inode_item, inode, 0); 3332 fill_inode_item(trans, path->nodes[0], inode_item, inode, 0, 0);
3287 btrfs_release_path(path); 3333 btrfs_release_path(path);
3288 return 0; 3334 return 0;
3289} 3335}
@@ -3292,7 +3338,8 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
3292 struct inode *inode, 3338 struct inode *inode,
3293 struct btrfs_path *dst_path, 3339 struct btrfs_path *dst_path,
3294 struct btrfs_path *src_path, u64 *last_extent, 3340 struct btrfs_path *src_path, u64 *last_extent,
3295 int start_slot, int nr, int inode_only) 3341 int start_slot, int nr, int inode_only,
3342 u64 logged_isize)
3296{ 3343{
3297 unsigned long src_offset; 3344 unsigned long src_offset;
3298 unsigned long dst_offset; 3345 unsigned long dst_offset;
@@ -3349,7 +3396,8 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
3349 dst_path->slots[0], 3396 dst_path->slots[0],
3350 struct btrfs_inode_item); 3397 struct btrfs_inode_item);
3351 fill_inode_item(trans, dst_path->nodes[0], inode_item, 3398 fill_inode_item(trans, dst_path->nodes[0], inode_item,
3352 inode, inode_only == LOG_INODE_EXISTS); 3399 inode, inode_only == LOG_INODE_EXISTS,
3400 logged_isize);
3353 } else { 3401 } else {
3354 copy_extent_buffer(dst_path->nodes[0], src, dst_offset, 3402 copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
3355 src_offset, ins_sizes[i]); 3403 src_offset, ins_sizes[i]);
@@ -3901,6 +3949,33 @@ process:
3901 return ret; 3949 return ret;
3902} 3950}
3903 3951
3952static int logged_inode_size(struct btrfs_root *log, struct inode *inode,
3953 struct btrfs_path *path, u64 *size_ret)
3954{
3955 struct btrfs_key key;
3956 int ret;
3957
3958 key.objectid = btrfs_ino(inode);
3959 key.type = BTRFS_INODE_ITEM_KEY;
3960 key.offset = 0;
3961
3962 ret = btrfs_search_slot(NULL, log, &key, path, 0, 0);
3963 if (ret < 0) {
3964 return ret;
3965 } else if (ret > 0) {
3966 *size_ret = i_size_read(inode);
3967 } else {
3968 struct btrfs_inode_item *item;
3969
3970 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
3971 struct btrfs_inode_item);
3972 *size_ret = btrfs_inode_size(path->nodes[0], item);
3973 }
3974
3975 btrfs_release_path(path);
3976 return 0;
3977}
3978
3904/* log a single inode in the tree log. 3979/* log a single inode in the tree log.
3905 * At least one parent directory for this inode must exist in the tree 3980 * At least one parent directory for this inode must exist in the tree
3906 * or be logged already. 3981 * or be logged already.
@@ -3938,6 +4013,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
3938 bool fast_search = false; 4013 bool fast_search = false;
3939 u64 ino = btrfs_ino(inode); 4014 u64 ino = btrfs_ino(inode);
3940 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 4015 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
4016 u64 logged_isize = 0;
3941 4017
3942 path = btrfs_alloc_path(); 4018 path = btrfs_alloc_path();
3943 if (!path) 4019 if (!path)
@@ -3965,15 +4041,22 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
3965 max_key.type = (u8)-1; 4041 max_key.type = (u8)-1;
3966 max_key.offset = (u64)-1; 4042 max_key.offset = (u64)-1;
3967 4043
3968 /* Only run delayed items if we are a dir or a new file */ 4044 /*
4045 * Only run delayed items if we are a dir or a new file.
4046 * Otherwise commit the delayed inode only, which is needed in
4047 * order for the log replay code to mark inodes for link count
4048 * fixup (create temporary BTRFS_TREE_LOG_FIXUP_OBJECTID items).
4049 */
3969 if (S_ISDIR(inode->i_mode) || 4050 if (S_ISDIR(inode->i_mode) ||
3970 BTRFS_I(inode)->generation > root->fs_info->last_trans_committed) { 4051 BTRFS_I(inode)->generation > root->fs_info->last_trans_committed)
3971 ret = btrfs_commit_inode_delayed_items(trans, inode); 4052 ret = btrfs_commit_inode_delayed_items(trans, inode);
3972 if (ret) { 4053 else
3973 btrfs_free_path(path); 4054 ret = btrfs_commit_inode_delayed_inode(inode);
3974 btrfs_free_path(dst_path); 4055
3975 return ret; 4056 if (ret) {
3976 } 4057 btrfs_free_path(path);
4058 btrfs_free_path(dst_path);
4059 return ret;
3977 } 4060 }
3978 4061
3979 mutex_lock(&BTRFS_I(inode)->log_mutex); 4062 mutex_lock(&BTRFS_I(inode)->log_mutex);
@@ -3987,22 +4070,56 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
3987 if (S_ISDIR(inode->i_mode)) { 4070 if (S_ISDIR(inode->i_mode)) {
3988 int max_key_type = BTRFS_DIR_LOG_INDEX_KEY; 4071 int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
3989 4072
3990 if (inode_only == LOG_INODE_EXISTS) 4073 if (inode_only == LOG_INODE_EXISTS) {
3991 max_key_type = BTRFS_XATTR_ITEM_KEY; 4074 max_key_type = BTRFS_INODE_EXTREF_KEY;
4075 max_key.type = max_key_type;
4076 }
3992 ret = drop_objectid_items(trans, log, path, ino, max_key_type); 4077 ret = drop_objectid_items(trans, log, path, ino, max_key_type);
3993 } else { 4078 } else {
3994 if (test_and_clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, 4079 if (inode_only == LOG_INODE_EXISTS) {
3995 &BTRFS_I(inode)->runtime_flags)) { 4080 /*
3996 clear_bit(BTRFS_INODE_COPY_EVERYTHING, 4081 * Make sure the new inode item we write to the log has
3997 &BTRFS_I(inode)->runtime_flags); 4082 * the same isize as the current one (if it exists).
3998 ret = btrfs_truncate_inode_items(trans, log, 4083 * This is necessary to prevent data loss after log
3999 inode, 0, 0); 4084 * replay, and also to prevent doing a wrong expanding
4000 } else if (test_and_clear_bit(BTRFS_INODE_COPY_EVERYTHING, 4085 * truncate - for e.g. create file, write 4K into offset
4001 &BTRFS_I(inode)->runtime_flags) || 4086 * 0, fsync, write 4K into offset 4096, add hard link,
4087 * fsync some other file (to sync log), power fail - if
4088 * we use the inode's current i_size, after log replay
4089 * we get a 8Kb file, with the last 4Kb extent as a hole
4090 * (zeroes), as if an expanding truncate happened,
4091 * instead of getting a file of 4Kb only.
4092 */
4093 err = logged_inode_size(log, inode, path,
4094 &logged_isize);
4095 if (err)
4096 goto out_unlock;
4097 }
4098 if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
4099 &BTRFS_I(inode)->runtime_flags)) {
4100 if (inode_only == LOG_INODE_EXISTS) {
4101 max_key.type = BTRFS_INODE_EXTREF_KEY;
4102 ret = drop_objectid_items(trans, log, path, ino,
4103 max_key.type);
4104 } else {
4105 clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
4106 &BTRFS_I(inode)->runtime_flags);
4107 clear_bit(BTRFS_INODE_COPY_EVERYTHING,
4108 &BTRFS_I(inode)->runtime_flags);
4109 ret = btrfs_truncate_inode_items(trans, log,
4110 inode, 0, 0);
4111 }
4112 } else if (test_bit(BTRFS_INODE_COPY_EVERYTHING,
4113 &BTRFS_I(inode)->runtime_flags) ||
4002 inode_only == LOG_INODE_EXISTS) { 4114 inode_only == LOG_INODE_EXISTS) {
4003 if (inode_only == LOG_INODE_ALL) 4115 if (inode_only == LOG_INODE_ALL) {
4116 clear_bit(BTRFS_INODE_COPY_EVERYTHING,
4117 &BTRFS_I(inode)->runtime_flags);
4004 fast_search = true; 4118 fast_search = true;
4005 max_key.type = BTRFS_XATTR_ITEM_KEY; 4119 max_key.type = BTRFS_XATTR_ITEM_KEY;
4120 } else {
4121 max_key.type = BTRFS_INODE_EXTREF_KEY;
4122 }
4006 ret = drop_objectid_items(trans, log, path, ino, 4123 ret = drop_objectid_items(trans, log, path, ino,
4007 max_key.type); 4124 max_key.type);
4008 } else { 4125 } else {
@@ -4046,7 +4163,8 @@ again:
4046 } 4163 }
4047 4164
4048 ret = copy_items(trans, inode, dst_path, path, &last_extent, 4165 ret = copy_items(trans, inode, dst_path, path, &last_extent,
4049 ins_start_slot, ins_nr, inode_only); 4166 ins_start_slot, ins_nr, inode_only,
4167 logged_isize);
4050 if (ret < 0) { 4168 if (ret < 0) {
4051 err = ret; 4169 err = ret;
4052 goto out_unlock; 4170 goto out_unlock;
@@ -4070,7 +4188,7 @@ next_slot:
4070 if (ins_nr) { 4188 if (ins_nr) {
4071 ret = copy_items(trans, inode, dst_path, path, 4189 ret = copy_items(trans, inode, dst_path, path,
4072 &last_extent, ins_start_slot, 4190 &last_extent, ins_start_slot,
4073 ins_nr, inode_only); 4191 ins_nr, inode_only, logged_isize);
4074 if (ret < 0) { 4192 if (ret < 0) {
4075 err = ret; 4193 err = ret;
4076 goto out_unlock; 4194 goto out_unlock;
@@ -4091,7 +4209,8 @@ next_slot:
4091 } 4209 }
4092 if (ins_nr) { 4210 if (ins_nr) {
4093 ret = copy_items(trans, inode, dst_path, path, &last_extent, 4211 ret = copy_items(trans, inode, dst_path, path, &last_extent,
4094 ins_start_slot, ins_nr, inode_only); 4212 ins_start_slot, ins_nr, inode_only,
4213 logged_isize);
4095 if (ret < 0) { 4214 if (ret < 0) {
4096 err = ret; 4215 err = ret;
4097 goto out_unlock; 4216 goto out_unlock;
@@ -4272,6 +4391,9 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
4272 struct dentry *old_parent = NULL; 4391 struct dentry *old_parent = NULL;
4273 int ret = 0; 4392 int ret = 0;
4274 u64 last_committed = root->fs_info->last_trans_committed; 4393 u64 last_committed = root->fs_info->last_trans_committed;
4394 const struct dentry * const first_parent = parent;
4395 const bool did_unlink = (BTRFS_I(inode)->last_unlink_trans >
4396 last_committed);
4275 4397
4276 sb = inode->i_sb; 4398 sb = inode->i_sb;
4277 4399
@@ -4327,7 +4449,6 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
4327 goto end_trans; 4449 goto end_trans;
4328 } 4450 }
4329 4451
4330 inode_only = LOG_INODE_EXISTS;
4331 while (1) { 4452 while (1) {
4332 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb) 4453 if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
4333 break; 4454 break;
@@ -4336,8 +4457,22 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
4336 if (root != BTRFS_I(inode)->root) 4457 if (root != BTRFS_I(inode)->root)
4337 break; 4458 break;
4338 4459
4460 /*
4461 * On unlink we must make sure our immediate parent directory
4462 * inode is fully logged. This is to prevent leaving dangling
4463 * directory index entries and a wrong directory inode's i_size.
4464 * Not doing so can result in a directory being impossible to
4465 * delete after log replay (rmdir will always fail with error
4466 * -ENOTEMPTY).
4467 */
4468 if (did_unlink && parent == first_parent)
4469 inode_only = LOG_INODE_ALL;
4470 else
4471 inode_only = LOG_INODE_EXISTS;
4472
4339 if (BTRFS_I(inode)->generation > 4473 if (BTRFS_I(inode)->generation >
4340 root->fs_info->last_trans_committed) { 4474 root->fs_info->last_trans_committed ||
4475 inode_only == LOG_INODE_ALL) {
4341 ret = btrfs_log_inode(trans, root, inode, inode_only, 4476 ret = btrfs_log_inode(trans, root, inode, inode_only,
4342 0, LLONG_MAX, ctx); 4477 0, LLONG_MAX, ctx);
4343 if (ret) 4478 if (ret)
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 50c5a8762aed..cd4d1315aaa9 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1310,6 +1310,8 @@ again:
1310 if (ret) { 1310 if (ret) {
1311 btrfs_error(root->fs_info, ret, 1311 btrfs_error(root->fs_info, ret,
1312 "Failed to remove dev extent item"); 1312 "Failed to remove dev extent item");
1313 } else {
1314 trans->transaction->have_free_bgs = 1;
1313 } 1315 }
1314out: 1316out:
1315 btrfs_free_path(path); 1317 btrfs_free_path(path);
@@ -4196,7 +4198,7 @@ static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target)
4196 4198
4197static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type) 4199static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
4198{ 4200{
4199 if (!(type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6))) 4201 if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK))
4200 return; 4202 return;
4201 4203
4202 btrfs_set_fs_incompat(info, RAID56); 4204 btrfs_set_fs_incompat(info, RAID56);
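
Every two-flag test collapsed throughout this patch relies on the same
definition: BTRFS_BLOCK_GROUP_RAID56_MASK is just the OR of the two profile
bits, so a single "type & mask" matches either RAID5 or RAID6. The define
lives in ctree.h, outside this diff, presumably as:

	#define BTRFS_BLOCK_GROUP_RAID56_MASK	(BTRFS_BLOCK_GROUP_RAID5 |	\
						 BTRFS_BLOCK_GROUP_RAID6)
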
@@ -4803,10 +4805,8 @@ unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
4803 4805
4804 BUG_ON(em->start > logical || em->start + em->len < logical); 4806 BUG_ON(em->start > logical || em->start + em->len < logical);
4805 map = (struct map_lookup *)em->bdev; 4807 map = (struct map_lookup *)em->bdev;
4806 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | 4808 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
4807 BTRFS_BLOCK_GROUP_RAID6)) {
4808 len = map->stripe_len * nr_data_stripes(map); 4809 len = map->stripe_len * nr_data_stripes(map);
4809 }
4810 free_extent_map(em); 4810 free_extent_map(em);
4811 return len; 4811 return len;
4812} 4812}
@@ -4826,8 +4826,7 @@ int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
4826 4826
4827 BUG_ON(em->start > logical || em->start + em->len < logical); 4827 BUG_ON(em->start > logical || em->start + em->len < logical);
4828 map = (struct map_lookup *)em->bdev; 4828 map = (struct map_lookup *)em->bdev;
4829 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | 4829 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
4830 BTRFS_BLOCK_GROUP_RAID6))
4831 ret = 1; 4830 ret = 1;
4832 free_extent_map(em); 4831 free_extent_map(em);
4833 return ret; 4832 return ret;
@@ -4876,32 +4875,24 @@ static inline int parity_smaller(u64 a, u64 b)
4876} 4875}
4877 4876
4878/* Bubble-sort the stripe set to put the parity/syndrome stripes last */ 4877/* Bubble-sort the stripe set to put the parity/syndrome stripes last */
4879static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map) 4878static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes)
4880{ 4879{
4881 struct btrfs_bio_stripe s; 4880 struct btrfs_bio_stripe s;
4882 int real_stripes = bbio->num_stripes - bbio->num_tgtdevs;
4883 int i; 4881 int i;
4884 u64 l; 4882 u64 l;
4885 int again = 1; 4883 int again = 1;
4886 int m;
4887 4884
4888 while (again) { 4885 while (again) {
4889 again = 0; 4886 again = 0;
4890 for (i = 0; i < real_stripes - 1; i++) { 4887 for (i = 0; i < num_stripes - 1; i++) {
4891 if (parity_smaller(raid_map[i], raid_map[i+1])) { 4888 if (parity_smaller(bbio->raid_map[i],
4889 bbio->raid_map[i+1])) {
4892 s = bbio->stripes[i]; 4890 s = bbio->stripes[i];
4893 l = raid_map[i]; 4891 l = bbio->raid_map[i];
4894 bbio->stripes[i] = bbio->stripes[i+1]; 4892 bbio->stripes[i] = bbio->stripes[i+1];
4895 raid_map[i] = raid_map[i+1]; 4893 bbio->raid_map[i] = bbio->raid_map[i+1];
4896 bbio->stripes[i+1] = s; 4894 bbio->stripes[i+1] = s;
4897 raid_map[i+1] = l; 4895 bbio->raid_map[i+1] = l;
4898
4899 if (bbio->tgtdev_map) {
4900 m = bbio->tgtdev_map[i];
4901 bbio->tgtdev_map[i] =
4902 bbio->tgtdev_map[i + 1];
4903 bbio->tgtdev_map[i + 1] = m;
4904 }
4905 4896
4906 again = 1; 4897 again = 1;
4907 } 4898 }
@@ -4909,10 +4900,41 @@ static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map)
4909 } 4900 }
4910} 4901}
4911 4902
4903static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes)
4904{
4905 struct btrfs_bio *bbio = kzalloc(
4906 sizeof(struct btrfs_bio) +
4907 sizeof(struct btrfs_bio_stripe) * (total_stripes) +
4908 sizeof(int) * (real_stripes) +
4909 sizeof(u64) * (real_stripes),
4910 GFP_NOFS);
4911 if (!bbio)
4912 return NULL;
4913
4914 atomic_set(&bbio->error, 0);
4915 atomic_set(&bbio->refs, 1);
4916
4917 return bbio;
4918}
4919
4920void btrfs_get_bbio(struct btrfs_bio *bbio)
4921{
4922 WARN_ON(!atomic_read(&bbio->refs));
4923 atomic_inc(&bbio->refs);
4924}
4925
4926void btrfs_put_bbio(struct btrfs_bio *bbio)
4927{
4928 if (!bbio)
4929 return;
4930 if (atomic_dec_and_test(&bbio->refs))
4931 kfree(bbio);
4932}
4933
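
With alloc_btrfs_bio() the stripe array, the target-device map and the raid
map live in one kzalloc'd blob behind a refcount, so ownership can be
shared (for example by scrub's recover structs) and the last
btrfs_put_bbio() frees everything at once; btrfs_put_bbio(NULL) is a no-op,
which is what lets the earlier error paths drop their paired kfree() calls.
A hedged usage sketch (the fill step is illustrative, not taken from this
diff):

	struct btrfs_bio *bbio;

	bbio = alloc_btrfs_bio(total_stripes, real_stripes);	/* refs == 1 */
	if (!bbio)
		return -ENOMEM;
	/* ... fill bbio->stripes[] as __btrfs_map_block() does ... */
	btrfs_get_bbio(bbio);	/* hand a second reference to another owner */
	btrfs_put_bbio(bbio);	/* drop ours; refs back to 1 */
	btrfs_put_bbio(bbio);	/* the other owner's drop frees the blob */
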
4912static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, 4934static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4913 u64 logical, u64 *length, 4935 u64 logical, u64 *length,
4914 struct btrfs_bio **bbio_ret, 4936 struct btrfs_bio **bbio_ret,
4915 int mirror_num, u64 **raid_map_ret) 4937 int mirror_num, int need_raid_map)
4916{ 4938{
4917 struct extent_map *em; 4939 struct extent_map *em;
4918 struct map_lookup *map; 4940 struct map_lookup *map;
@@ -4925,7 +4947,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4925 u64 stripe_nr_orig; 4947 u64 stripe_nr_orig;
4926 u64 stripe_nr_end; 4948 u64 stripe_nr_end;
4927 u64 stripe_len; 4949 u64 stripe_len;
4928 u64 *raid_map = NULL;
4929 int stripe_index; 4950 int stripe_index;
4930 int i; 4951 int i;
4931 int ret = 0; 4952 int ret = 0;
@@ -4976,7 +4997,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4976 stripe_offset = offset - stripe_offset; 4997 stripe_offset = offset - stripe_offset;
4977 4998
4978 /* if we're here for raid56, we need to know the stripe aligned start */ 4999 /* if we're here for raid56, we need to know the stripe aligned start */
4979 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) { 5000 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
4980 unsigned long full_stripe_len = stripe_len * nr_data_stripes(map); 5001 unsigned long full_stripe_len = stripe_len * nr_data_stripes(map);
4981 raid56_full_stripe_start = offset; 5002 raid56_full_stripe_start = offset;
4982 5003
@@ -4989,8 +5010,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
4989 5010
4990 if (rw & REQ_DISCARD) { 5011 if (rw & REQ_DISCARD) {
4991 /* we don't discard raid56 yet */ 5012 /* we don't discard raid56 yet */
4992 if (map->type & 5013 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
4993 (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) {
4994 ret = -EOPNOTSUPP; 5014 ret = -EOPNOTSUPP;
4995 goto out; 5015 goto out;
4996 } 5016 }
@@ -5000,7 +5020,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
5000 /* For writes to RAID[56], allow a full stripeset across all disks. 5020 /* For writes to RAID[56], allow a full stripeset across all disks.
5001 For other RAID types and for RAID[56] reads, just allow a single 5021 For other RAID types and for RAID[56] reads, just allow a single
5002 stripe (on a single disk). */ 5022 stripe (on a single disk). */
5003 if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6) && 5023 if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
5004 (rw & REQ_WRITE)) { 5024 (rw & REQ_WRITE)) {
5005 max_len = stripe_len * nr_data_stripes(map) - 5025 max_len = stripe_len * nr_data_stripes(map) -
5006 (offset - raid56_full_stripe_start); 5026 (offset - raid56_full_stripe_start);
@@ -5047,7 +5067,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
5047 u64 physical_of_found = 0; 5067 u64 physical_of_found = 0;
5048 5068
5049 ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, 5069 ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS,
5050 logical, &tmp_length, &tmp_bbio, 0, NULL); 5070 logical, &tmp_length, &tmp_bbio, 0, 0);
5051 if (ret) { 5071 if (ret) {
5052 WARN_ON(tmp_bbio != NULL); 5072 WARN_ON(tmp_bbio != NULL);
5053 goto out; 5073 goto out;
@@ -5061,7 +5081,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
5061 * is not left of the left cursor 5081 * is not left of the left cursor
5062 */ 5082 */
5063 ret = -EIO; 5083 ret = -EIO;
5064 kfree(tmp_bbio); 5084 btrfs_put_bbio(tmp_bbio);
5065 goto out; 5085 goto out;
5066 } 5086 }
5067 5087
@@ -5096,11 +5116,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
5096 } else { 5116 } else {
5097 WARN_ON(1); 5117 WARN_ON(1);
5098 ret = -EIO; 5118 ret = -EIO;
5099 kfree(tmp_bbio); 5119 btrfs_put_bbio(tmp_bbio);
5100 goto out; 5120 goto out;
5101 } 5121 }
5102 5122
5103 kfree(tmp_bbio); 5123 btrfs_put_bbio(tmp_bbio);
5104 } else if (mirror_num > map->num_stripes) { 5124 } else if (mirror_num > map->num_stripes) {
5105 mirror_num = 0; 5125 mirror_num = 0;
5106 } 5126 }
@@ -5166,15 +5186,10 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
5166 mirror_num = stripe_index - old_stripe_index + 1; 5186 mirror_num = stripe_index - old_stripe_index + 1;
5167 } 5187 }
5168 5188
5169 } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | 5189 } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
5170 BTRFS_BLOCK_GROUP_RAID6)) { 5190 if (need_raid_map &&
5171 u64 tmp;
5172
5173 if (raid_map_ret &&
5174 ((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) || 5191 ((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) ||
5175 mirror_num > 1)) { 5192 mirror_num > 1)) {
5176 int i, rot;
5177
5178 /* push stripe_nr back to the start of the full stripe */ 5193 /* push stripe_nr back to the start of the full stripe */
5179 stripe_nr = raid56_full_stripe_start; 5194 stripe_nr = raid56_full_stripe_start;
5180 do_div(stripe_nr, stripe_len * nr_data_stripes(map)); 5195 do_div(stripe_nr, stripe_len * nr_data_stripes(map));
@@ -5183,32 +5198,12 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
5183 num_stripes = map->num_stripes; 5198 num_stripes = map->num_stripes;
5184 max_errors = nr_parity_stripes(map); 5199 max_errors = nr_parity_stripes(map);
5185 5200
5186 raid_map = kmalloc_array(num_stripes, sizeof(u64),
5187 GFP_NOFS);
5188 if (!raid_map) {
5189 ret = -ENOMEM;
5190 goto out;
5191 }
5192
5193 /* Work out the disk rotation on this stripe-set */
5194 tmp = stripe_nr;
5195 rot = do_div(tmp, num_stripes);
5196
5197 /* Fill in the logical address of each stripe */
5198 tmp = stripe_nr * nr_data_stripes(map);
5199 for (i = 0; i < nr_data_stripes(map); i++)
5200 raid_map[(i+rot) % num_stripes] =
5201 em->start + (tmp + i) * map->stripe_len;
5202
5203 raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE;
5204 if (map->type & BTRFS_BLOCK_GROUP_RAID6)
5205 raid_map[(i+rot+1) % num_stripes] =
5206 RAID6_Q_STRIPE;
5207
5208 *length = map->stripe_len; 5201 *length = map->stripe_len;
5209 stripe_index = 0; 5202 stripe_index = 0;
5210 stripe_offset = 0; 5203 stripe_offset = 0;
5211 } else { 5204 } else {
5205 u64 tmp;
5206
5212 /* 5207 /*
5213 * Mirror #0 or #1 means the original data block. 5208 * Mirror #0 or #1 means the original data block.
5214 * Mirror #2 is RAID5 parity block. 5209 * Mirror #2 is RAID5 parity block.
@@ -5246,17 +5241,42 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
5246 tgtdev_indexes = num_stripes; 5241 tgtdev_indexes = num_stripes;
5247 } 5242 }
5248 5243
5249 bbio = kzalloc(btrfs_bio_size(num_alloc_stripes, tgtdev_indexes), 5244 bbio = alloc_btrfs_bio(num_alloc_stripes, tgtdev_indexes);
5250 GFP_NOFS);
5251 if (!bbio) { 5245 if (!bbio) {
5252 kfree(raid_map);
5253 ret = -ENOMEM; 5246 ret = -ENOMEM;
5254 goto out; 5247 goto out;
5255 } 5248 }
5256 atomic_set(&bbio->error, 0);
5257 if (dev_replace_is_ongoing) 5249 if (dev_replace_is_ongoing)
5258 bbio->tgtdev_map = (int *)(bbio->stripes + num_alloc_stripes); 5250 bbio->tgtdev_map = (int *)(bbio->stripes + num_alloc_stripes);
5259 5251
5252 /* build raid_map */
5253 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK &&
5254 need_raid_map && ((rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) ||
5255 mirror_num > 1)) {
5256 u64 tmp;
5257 int i, rot;
5258
5259 bbio->raid_map = (u64 *)((void *)bbio->stripes +
5260 sizeof(struct btrfs_bio_stripe) *
5261 num_alloc_stripes +
5262 sizeof(int) * tgtdev_indexes);
5263
5264 /* Work out the disk rotation on this stripe-set */
5265 tmp = stripe_nr;
5266 rot = do_div(tmp, num_stripes);
5267
5268 /* Fill in the logical address of each stripe */
5269 tmp = stripe_nr * nr_data_stripes(map);
5270 for (i = 0; i < nr_data_stripes(map); i++)
5271 bbio->raid_map[(i+rot) % num_stripes] =
5272 em->start + (tmp + i) * map->stripe_len;
5273
5274 bbio->raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE;
5275 if (map->type & BTRFS_BLOCK_GROUP_RAID6)
5276 bbio->raid_map[(i+rot+1) % num_stripes] =
5277 RAID6_Q_STRIPE;
5278 }
5279
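The raid_map built above records, for every on-disk stripe slot, the logical address it carries, rotating the parity position from one stripe-set to the next. The placement arithmetic in isolation, as a hedged sketch — nr_data, num_stripes, and stripe_len stand in for the map_lookup fields, and the sentinels mirror RAID5_P_STRIPE/RAID6_Q_STRIPE:

#include <stdint.h>

#define P_STRIPE ((uint64_t)-2)	/* parity sentinel, as RAID5_P_STRIPE */
#define Q_STRIPE ((uint64_t)-1)	/* second parity, as RAID6_Q_STRIPE */

static void build_raid_map(uint64_t *raid_map, uint64_t em_start,
			   uint64_t stripe_nr, uint64_t stripe_len,
			   int nr_data, int num_stripes, int is_raid6)
{
	/* disk rotation of the parity slot within this stripe-set */
	int rot = (int)(stripe_nr % num_stripes);
	uint64_t tmp = stripe_nr * (uint64_t)nr_data;
	int i;

	/* logical address served by each data slot, rotated by rot */
	for (i = 0; i < nr_data; i++)
		raid_map[(i + rot) % num_stripes] =
			em_start + (tmp + i) * stripe_len;

	raid_map[(i + rot) % num_stripes] = P_STRIPE;
	if (is_raid6)
		raid_map[(i + rot + 1) % num_stripes] = Q_STRIPE;
}

sort_parity_stripes() then bubbles the entries (together with the matching stripes and tgtdev_map slots) into ascending order, so raid_map[0] is the lowest logical address — the start of the full stripe — with P and Q, whose sentinel values compare highest, at the end.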
5260 if (rw & REQ_DISCARD) { 5280 if (rw & REQ_DISCARD) {
5261 int factor = 0; 5281 int factor = 0;
5262 int sub_stripes = 0; 5282 int sub_stripes = 0;
@@ -5340,6 +5360,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
5340 if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) 5360 if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS))
5341 max_errors = btrfs_chunk_max_errors(map); 5361 max_errors = btrfs_chunk_max_errors(map);
5342 5362
5363 if (bbio->raid_map)
5364 sort_parity_stripes(bbio, num_stripes);
5365
5343 tgtdev_indexes = 0; 5366 tgtdev_indexes = 0;
5344 if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) && 5367 if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) &&
5345 dev_replace->tgtdev != NULL) { 5368 dev_replace->tgtdev != NULL) {
@@ -5427,6 +5450,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
5427 } 5450 }
5428 5451
5429 *bbio_ret = bbio; 5452 *bbio_ret = bbio;
5453 bbio->map_type = map->type;
5430 bbio->num_stripes = num_stripes; 5454 bbio->num_stripes = num_stripes;
5431 bbio->max_errors = max_errors; 5455 bbio->max_errors = max_errors;
5432 bbio->mirror_num = mirror_num; 5456 bbio->mirror_num = mirror_num;
@@ -5443,10 +5467,6 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
5443 bbio->stripes[0].physical = physical_to_patch_in_first_stripe; 5467 bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
5444 bbio->mirror_num = map->num_stripes + 1; 5468 bbio->mirror_num = map->num_stripes + 1;
5445 } 5469 }
5446 if (raid_map) {
5447 sort_parity_stripes(bbio, raid_map);
5448 *raid_map_ret = raid_map;
5449 }
5450out: 5470out:
5451 if (dev_replace_is_ongoing) 5471 if (dev_replace_is_ongoing)
5452 btrfs_dev_replace_unlock(dev_replace); 5472 btrfs_dev_replace_unlock(dev_replace);
@@ -5459,17 +5479,17 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
5459 struct btrfs_bio **bbio_ret, int mirror_num) 5479 struct btrfs_bio **bbio_ret, int mirror_num)
5460{ 5480{
5461 return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret, 5481 return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret,
5462 mirror_num, NULL); 5482 mirror_num, 0);
5463} 5483}
5464 5484
5465/* For Scrub/replace */ 5485/* For Scrub/replace */
5466int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int rw, 5486int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int rw,
5467 u64 logical, u64 *length, 5487 u64 logical, u64 *length,
5468 struct btrfs_bio **bbio_ret, int mirror_num, 5488 struct btrfs_bio **bbio_ret, int mirror_num,
5469 u64 **raid_map_ret) 5489 int need_raid_map)
5470{ 5490{
5471 return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret, 5491 return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret,
5472 mirror_num, raid_map_ret); 5492 mirror_num, need_raid_map);
5473} 5493}
5474 5494
5475int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, 5495int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
@@ -5511,8 +5531,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
5511 do_div(length, map->num_stripes / map->sub_stripes); 5531 do_div(length, map->num_stripes / map->sub_stripes);
5512 else if (map->type & BTRFS_BLOCK_GROUP_RAID0) 5532 else if (map->type & BTRFS_BLOCK_GROUP_RAID0)
5513 do_div(length, map->num_stripes); 5533 do_div(length, map->num_stripes);
5514 else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | 5534 else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
5515 BTRFS_BLOCK_GROUP_RAID6)) {
5516 do_div(length, nr_data_stripes(map)); 5535 do_div(length, nr_data_stripes(map));
5517 rmap_len = map->stripe_len * nr_data_stripes(map); 5536 rmap_len = map->stripe_len * nr_data_stripes(map);
5518 } 5537 }
@@ -5565,7 +5584,7 @@ static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio, int e
5565 bio_endio_nodec(bio, err); 5584 bio_endio_nodec(bio, err);
5566 else 5585 else
5567 bio_endio(bio, err); 5586 bio_endio(bio, err);
5568 kfree(bbio); 5587 btrfs_put_bbio(bbio);
5569} 5588}
5570 5589
5571static void btrfs_end_bio(struct bio *bio, int err) 5590static void btrfs_end_bio(struct bio *bio, int err)
@@ -5808,7 +5827,6 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
5808 u64 logical = (u64)bio->bi_iter.bi_sector << 9; 5827 u64 logical = (u64)bio->bi_iter.bi_sector << 9;
5809 u64 length = 0; 5828 u64 length = 0;
5810 u64 map_length; 5829 u64 map_length;
5811 u64 *raid_map = NULL;
5812 int ret; 5830 int ret;
5813 int dev_nr = 0; 5831 int dev_nr = 0;
5814 int total_devs = 1; 5832 int total_devs = 1;
@@ -5819,7 +5837,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
5819 5837
5820 btrfs_bio_counter_inc_blocked(root->fs_info); 5838 btrfs_bio_counter_inc_blocked(root->fs_info);
5821 ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio, 5839 ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio,
5822 mirror_num, &raid_map); 5840 mirror_num, 1);
5823 if (ret) { 5841 if (ret) {
5824 btrfs_bio_counter_dec(root->fs_info); 5842 btrfs_bio_counter_dec(root->fs_info);
5825 return ret; 5843 return ret;
@@ -5832,15 +5850,13 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
5832 bbio->fs_info = root->fs_info; 5850 bbio->fs_info = root->fs_info;
5833 atomic_set(&bbio->stripes_pending, bbio->num_stripes); 5851 atomic_set(&bbio->stripes_pending, bbio->num_stripes);
5834 5852
5835 if (raid_map) { 5853 if (bbio->raid_map) {
5836 /* In this case, map_length has been set to the length of 5854 /* In this case, map_length has been set to the length of
5837 a single stripe, not the whole write */ 5855 a single stripe, not the whole write */
5838 if (rw & WRITE) { 5856 if (rw & WRITE) {
5839 ret = raid56_parity_write(root, bio, bbio, 5857 ret = raid56_parity_write(root, bio, bbio, map_length);
5840 raid_map, map_length);
5841 } else { 5858 } else {
5842 ret = raid56_parity_recover(root, bio, bbio, 5859 ret = raid56_parity_recover(root, bio, bbio, map_length,
5843 raid_map, map_length,
5844 mirror_num, 1); 5860 mirror_num, 1);
5845 } 5861 }
5846 5862
@@ -6238,17 +6254,22 @@ int btrfs_read_sys_array(struct btrfs_root *root)
6238 struct extent_buffer *sb; 6254 struct extent_buffer *sb;
6239 struct btrfs_disk_key *disk_key; 6255 struct btrfs_disk_key *disk_key;
6240 struct btrfs_chunk *chunk; 6256 struct btrfs_chunk *chunk;
6241 u8 *ptr; 6257 u8 *array_ptr;
6242 unsigned long sb_ptr; 6258 unsigned long sb_array_offset;
6243 int ret = 0; 6259 int ret = 0;
6244 u32 num_stripes; 6260 u32 num_stripes;
6245 u32 array_size; 6261 u32 array_size;
6246 u32 len = 0; 6262 u32 len = 0;
6247 u32 cur; 6263 u32 cur_offset;
6248 struct btrfs_key key; 6264 struct btrfs_key key;
6249 6265
6250 sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET, 6266 ASSERT(BTRFS_SUPER_INFO_SIZE <= root->nodesize);
6251 BTRFS_SUPER_INFO_SIZE); 6267 /*
6268 * This will create extent buffer of nodesize, superblock size is
6269 * fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will
6270 * overallocate but we can keep it as-is, only the first page is used.
6271 */
6272 sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET);
6252 if (!sb) 6273 if (!sb)
6253 return -ENOMEM; 6274 return -ENOMEM;
6254 btrfs_set_buffer_uptodate(sb); 6275 btrfs_set_buffer_uptodate(sb);
@@ -6271,35 +6292,56 @@ int btrfs_read_sys_array(struct btrfs_root *root)
6271 write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); 6292 write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
6272 array_size = btrfs_super_sys_array_size(super_copy); 6293 array_size = btrfs_super_sys_array_size(super_copy);
6273 6294
6274 ptr = super_copy->sys_chunk_array; 6295 array_ptr = super_copy->sys_chunk_array;
6275 sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array); 6296 sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array);
6276 cur = 0; 6297 cur_offset = 0;
6298
6299 while (cur_offset < array_size) {
6300 disk_key = (struct btrfs_disk_key *)array_ptr;
6301 len = sizeof(*disk_key);
6302 if (cur_offset + len > array_size)
6303 goto out_short_read;
6277 6304
6278 while (cur < array_size) {
6279 disk_key = (struct btrfs_disk_key *)ptr;
6280 btrfs_disk_key_to_cpu(&key, disk_key); 6305 btrfs_disk_key_to_cpu(&key, disk_key);
6281 6306
6282 len = sizeof(*disk_key); ptr += len; 6307 array_ptr += len;
6283 sb_ptr += len; 6308 sb_array_offset += len;
6284 cur += len; 6309 cur_offset += len;
6285 6310
6286 if (key.type == BTRFS_CHUNK_ITEM_KEY) { 6311 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
6287 chunk = (struct btrfs_chunk *)sb_ptr; 6312 chunk = (struct btrfs_chunk *)sb_array_offset;
6313 /*
6314 * At least one btrfs_chunk with one stripe must be
6315 * present, exact stripe count check comes afterwards
6316 */
6317 len = btrfs_chunk_item_size(1);
6318 if (cur_offset + len > array_size)
6319 goto out_short_read;
6320
6321 num_stripes = btrfs_chunk_num_stripes(sb, chunk);
6322 len = btrfs_chunk_item_size(num_stripes);
6323 if (cur_offset + len > array_size)
6324 goto out_short_read;
6325
6288 ret = read_one_chunk(root, &key, sb, chunk); 6326 ret = read_one_chunk(root, &key, sb, chunk);
6289 if (ret) 6327 if (ret)
6290 break; 6328 break;
6291 num_stripes = btrfs_chunk_num_stripes(sb, chunk);
6292 len = btrfs_chunk_item_size(num_stripes);
6293 } else { 6329 } else {
6294 ret = -EIO; 6330 ret = -EIO;
6295 break; 6331 break;
6296 } 6332 }
6297 ptr += len; 6333 array_ptr += len;
6298 sb_ptr += len; 6334 sb_array_offset += len;
6299 cur += len; 6335 cur_offset += len;
6300 } 6336 }
6301 free_extent_buffer(sb); 6337 free_extent_buffer(sb);
6302 return ret; 6338 return ret;
6339
6340out_short_read:
6341 printk(KERN_ERR "BTRFS: sys_array too short to read %u bytes at offset %u\n",
6342 len, cur_offset);
6343 free_extent_buffer(sb);
6344 return -EIO;
6303} 6345}
6304 6346
6305int btrfs_read_chunk_tree(struct btrfs_root *root) 6347int btrfs_read_chunk_tree(struct btrfs_root *root)
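The rewritten sys_chunk_array walk above validates the cursor three times before trusting anything it reads: a whole disk key must fit, then a minimal one-stripe chunk item, and only then the item at its declared stripe count. The shape of that pattern as a self-contained sketch — the struct layouts are simplified stand-ins for the btrfs on-disk format, and alignment/endianness handling is glossed over:

#include <stdint.h>
#include <stdio.h>

struct disk_key { uint64_t objectid; uint8_t type; uint64_t offset; };
struct chunk_stripe { uint64_t devid, offset; };
struct chunk_item { uint16_t num_stripes; struct chunk_stripe stripes[]; };

#define CHUNK_ITEM_KEY 228

static size_t chunk_item_size(int num_stripes)
{
	return sizeof(struct chunk_item) +
	       sizeof(struct chunk_stripe) * num_stripes;
}

/* returns 0 on success, -1 on a short or corrupt array */
static int walk_sys_array(const uint8_t *array, uint32_t array_size)
{
	uint32_t cur = 0, len;

	while (cur < array_size) {
		const struct disk_key *key;
		const struct chunk_item *chunk;

		len = sizeof(*key);
		if (cur + len > array_size)
			goto short_read;	/* the key would overrun */
		key = (const struct disk_key *)(array + cur);
		cur += len;

		if (key->type != CHUNK_ITEM_KEY)
			return -1;		/* kernel: -EIO */

		/* at least a one-stripe chunk must fit ... */
		len = chunk_item_size(1);
		if (cur + len > array_size)
			goto short_read;
		chunk = (const struct chunk_item *)(array + cur);

		/* ... then the item with its declared stripe count */
		len = chunk_item_size(chunk->num_stripes);
		if (cur + len > array_size)
			goto short_read;
		cur += len;			/* safe to consume the item */
	}
	return 0;

short_read:
	fprintf(stderr, "sys_array too short to read %u bytes at offset %u\n",
		len, cur);
	return -1;
}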
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index d6fe73c0f4a2..83069dec6898 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -295,8 +295,10 @@ typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err);
295#define BTRFS_BIO_ORIG_BIO_SUBMITTED (1 << 0) 295#define BTRFS_BIO_ORIG_BIO_SUBMITTED (1 << 0)
296 296
297struct btrfs_bio { 297struct btrfs_bio {
298 atomic_t refs;
298 atomic_t stripes_pending; 299 atomic_t stripes_pending;
299 struct btrfs_fs_info *fs_info; 300 struct btrfs_fs_info *fs_info;
301 u64 map_type; /* get from map_lookup->type */
300 bio_end_io_t *end_io; 302 bio_end_io_t *end_io;
301 struct bio *orig_bio; 303 struct bio *orig_bio;
302 unsigned long flags; 304 unsigned long flags;
@@ -307,6 +309,12 @@ struct btrfs_bio {
307 int mirror_num; 309 int mirror_num;
308 int num_tgtdevs; 310 int num_tgtdevs;
309 int *tgtdev_map; 311 int *tgtdev_map;
312 /*
 313 * Logical block numbers for the start of each stripe.
 314 * The last one or two are p/q. These are sorted,
315 * so raid_map[0] is the start of our full stripe
316 */
317 u64 *raid_map;
310 struct btrfs_bio_stripe stripes[]; 318 struct btrfs_bio_stripe stripes[];
311}; 319};
312 320
@@ -388,19 +396,15 @@ struct btrfs_balance_control {
388 396
389int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, 397int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
390 u64 end, u64 *length); 398 u64 end, u64 *length);
391 399void btrfs_get_bbio(struct btrfs_bio *bbio);
392#define btrfs_bio_size(total_stripes, real_stripes) \ 400void btrfs_put_bbio(struct btrfs_bio *bbio);
393 (sizeof(struct btrfs_bio) + \
394 (sizeof(struct btrfs_bio_stripe) * (total_stripes)) + \
395 (sizeof(int) * (real_stripes)))
396
397int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, 401int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
398 u64 logical, u64 *length, 402 u64 logical, u64 *length,
399 struct btrfs_bio **bbio_ret, int mirror_num); 403 struct btrfs_bio **bbio_ret, int mirror_num);
400int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int rw, 404int btrfs_map_sblock(struct btrfs_fs_info *fs_info, int rw,
401 u64 logical, u64 *length, 405 u64 logical, u64 *length,
402 struct btrfs_bio **bbio_ret, int mirror_num, 406 struct btrfs_bio **bbio_ret, int mirror_num,
403 u64 **raid_map_ret); 407 int need_raid_map);
404int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, 408int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
405 u64 chunk_start, u64 physical, u64 devid, 409 u64 chunk_start, u64 physical, u64 devid,
406 u64 **logical, int *naddrs, int *stripe_len); 410 u64 **logical, int *naddrs, int *stripe_len);
diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c
index 5bd853ba44ff..64fa248343f6 100644
--- a/fs/ceph/acl.c
+++ b/fs/ceph/acl.c
@@ -40,20 +40,6 @@ static inline void ceph_set_cached_acl(struct inode *inode,
40 spin_unlock(&ci->i_ceph_lock); 40 spin_unlock(&ci->i_ceph_lock);
41} 41}
42 42
43static inline struct posix_acl *ceph_get_cached_acl(struct inode *inode,
44 int type)
45{
46 struct ceph_inode_info *ci = ceph_inode(inode);
47 struct posix_acl *acl = ACL_NOT_CACHED;
48
49 spin_lock(&ci->i_ceph_lock);
50 if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 0))
51 acl = get_cached_acl(inode, type);
52 spin_unlock(&ci->i_ceph_lock);
53
54 return acl;
55}
56
57struct posix_acl *ceph_get_acl(struct inode *inode, int type) 43struct posix_acl *ceph_get_acl(struct inode *inode, int type)
58{ 44{
59 int size; 45 int size;
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index f5013d92a7e6..fd5599d32362 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -196,17 +196,22 @@ static int readpage_nounlock(struct file *filp, struct page *page)
196 u64 len = PAGE_CACHE_SIZE; 196 u64 len = PAGE_CACHE_SIZE;
197 197
198 if (off >= i_size_read(inode)) { 198 if (off >= i_size_read(inode)) {
199 zero_user_segment(page, err, PAGE_CACHE_SIZE); 199 zero_user_segment(page, 0, PAGE_CACHE_SIZE);
200 SetPageUptodate(page); 200 SetPageUptodate(page);
201 return 0; 201 return 0;
202 } 202 }
203 203
204 /* 204 if (ci->i_inline_version != CEPH_INLINE_NONE) {
205 * Uptodate inline data should have been added into page cache 205 /*
206 * while getting Fcr caps. 206 * Uptodate inline data should have been added
207 */ 207 * into page cache while getting Fcr caps.
208 if (ci->i_inline_version != CEPH_INLINE_NONE) 208 */
209 return -EINVAL; 209 if (off == 0)
210 return -EINVAL;
211 zero_user_segment(page, 0, PAGE_CACHE_SIZE);
212 SetPageUptodate(page);
213 return 0;
214 }
210 215
211 err = ceph_readpage_from_fscache(inode, page); 216 err = ceph_readpage_from_fscache(inode, page);
212 if (err == 0) 217 if (err == 0)
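The new branch above makes readpage work for inline files larger than one page: page 0 must already hold the inline data (it is populated while getting Fcr caps, so reaching readpage for it means -EINVAL), while every later page lies wholly beyond the inline region and can simply be zeroed and marked up to date. Reduced to a userspace model with assumed constants:

#include <stdint.h>
#include <string.h>

#define PAGE_CACHE_SIZE 4096
#define EINVAL 22

/* model of the inline-file readpage branch; page is one page's buffer */
static int readpage_inline(uint8_t *page, uint64_t off)
{
	if (off == 0)				/* should already be cached */
		return -EINVAL;
	memset(page, 0, PAGE_CACHE_SIZE);	/* zero_user_segment() */
	return 0;				/* SetPageUptodate() */
}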
@@ -1416,7 +1421,7 @@ void ceph_fill_inline_data(struct inode *inode, struct page *locked_page,
1416 } 1421 }
1417 } 1422 }
1418 1423
1419 dout("fill_inline_data %p %llx.%llx len %lu locked_page %p\n", 1424 dout("fill_inline_data %p %llx.%llx len %zu locked_page %p\n",
1420 inode, ceph_vinop(inode), len, locked_page); 1425 inode, ceph_vinop(inode), len, locked_page);
1421 1426
1422 if (len > 0) { 1427 if (len > 0) {
@@ -1569,7 +1574,6 @@ out:
1569static struct vm_operations_struct ceph_vmops = { 1574static struct vm_operations_struct ceph_vmops = {
1570 .fault = ceph_filemap_fault, 1575 .fault = ceph_filemap_fault,
1571 .page_mkwrite = ceph_page_mkwrite, 1576 .page_mkwrite = ceph_page_mkwrite,
1572 .remap_pages = generic_file_remap_pages,
1573}; 1577};
1574 1578
1575int ceph_mmap(struct file *file, struct vm_area_struct *vma) 1579int ceph_mmap(struct file *file, struct vm_area_struct *vma)
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index b93c631c6c87..8172775428a0 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -577,7 +577,6 @@ void ceph_add_cap(struct inode *inode,
577 struct ceph_snap_realm *realm = ceph_lookup_snap_realm(mdsc, 577 struct ceph_snap_realm *realm = ceph_lookup_snap_realm(mdsc,
578 realmino); 578 realmino);
579 if (realm) { 579 if (realm) {
580 ceph_get_snap_realm(mdsc, realm);
581 spin_lock(&realm->inodes_with_caps_lock); 580 spin_lock(&realm->inodes_with_caps_lock);
582 ci->i_snap_realm = realm; 581 ci->i_snap_realm = realm;
583 list_add(&ci->i_snap_realm_item, 582 list_add(&ci->i_snap_realm_item,
@@ -1451,8 +1450,8 @@ static int __mark_caps_flushing(struct inode *inode,
1451 spin_lock(&mdsc->cap_dirty_lock); 1450 spin_lock(&mdsc->cap_dirty_lock);
1452 list_del_init(&ci->i_dirty_item); 1451 list_del_init(&ci->i_dirty_item);
1453 1452
1454 ci->i_cap_flush_seq = ++mdsc->cap_flush_seq;
1455 if (list_empty(&ci->i_flushing_item)) { 1453 if (list_empty(&ci->i_flushing_item)) {
1454 ci->i_cap_flush_seq = ++mdsc->cap_flush_seq;
1456 list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing); 1455 list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing);
1457 mdsc->num_cap_flushing++; 1456 mdsc->num_cap_flushing++;
1458 dout(" inode %p now flushing seq %lld\n", inode, 1457 dout(" inode %p now flushing seq %lld\n", inode,
@@ -2073,17 +2072,16 @@ static void __take_cap_refs(struct ceph_inode_info *ci, int got)
2073 * requested from the MDS. 2072 * requested from the MDS.
2074 */ 2073 */
2075static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, 2074static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
2076 loff_t endoff, int *got, struct page **pinned_page, 2075 loff_t endoff, int *got, int *check_max, int *err)
2077 int *check_max, int *err)
2078{ 2076{
2079 struct inode *inode = &ci->vfs_inode; 2077 struct inode *inode = &ci->vfs_inode;
2080 int ret = 0; 2078 int ret = 0;
2081 int have, implemented, _got = 0; 2079 int have, implemented;
2082 int file_wanted; 2080 int file_wanted;
2083 2081
2084 dout("get_cap_refs %p need %s want %s\n", inode, 2082 dout("get_cap_refs %p need %s want %s\n", inode,
2085 ceph_cap_string(need), ceph_cap_string(want)); 2083 ceph_cap_string(need), ceph_cap_string(want));
2086again: 2084
2087 spin_lock(&ci->i_ceph_lock); 2085 spin_lock(&ci->i_ceph_lock);
2088 2086
2089 /* make sure file is actually open */ 2087 /* make sure file is actually open */
@@ -2138,50 +2136,34 @@ again:
2138 inode, ceph_cap_string(have), ceph_cap_string(not), 2136 inode, ceph_cap_string(have), ceph_cap_string(not),
2139 ceph_cap_string(revoking)); 2137 ceph_cap_string(revoking));
2140 if ((revoking & not) == 0) { 2138 if ((revoking & not) == 0) {
2141 _got = need | (have & want); 2139 *got = need | (have & want);
2142 __take_cap_refs(ci, _got); 2140 __take_cap_refs(ci, *got);
2143 ret = 1; 2141 ret = 1;
2144 } 2142 }
2145 } else { 2143 } else {
2144 int session_readonly = false;
2145 if ((need & CEPH_CAP_FILE_WR) && ci->i_auth_cap) {
2146 struct ceph_mds_session *s = ci->i_auth_cap->session;
2147 spin_lock(&s->s_cap_lock);
2148 session_readonly = s->s_readonly;
2149 spin_unlock(&s->s_cap_lock);
2150 }
2151 if (session_readonly) {
2152 dout("get_cap_refs %p needed %s but mds%d readonly\n",
2153 inode, ceph_cap_string(need), ci->i_auth_cap->mds);
2154 *err = -EROFS;
2155 ret = 1;
2156 goto out_unlock;
2157 }
2158
2146 dout("get_cap_refs %p have %s needed %s\n", inode, 2159 dout("get_cap_refs %p have %s needed %s\n", inode,
2147 ceph_cap_string(have), ceph_cap_string(need)); 2160 ceph_cap_string(have), ceph_cap_string(need));
2148 } 2161 }
2149out_unlock: 2162out_unlock:
2150 spin_unlock(&ci->i_ceph_lock); 2163 spin_unlock(&ci->i_ceph_lock);
2151 2164
2152 if (ci->i_inline_version != CEPH_INLINE_NONE &&
2153 (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
2154 i_size_read(inode) > 0) {
2155 int ret1;
2156 struct page *page = find_get_page(inode->i_mapping, 0);
2157 if (page) {
2158 if (PageUptodate(page)) {
2159 *pinned_page = page;
2160 goto out;
2161 }
2162 page_cache_release(page);
2163 }
2164 /*
2165 * drop cap refs first because getattr while holding
2166 * caps refs can cause deadlock.
2167 */
2168 ceph_put_cap_refs(ci, _got);
2169 _got = 0;
2170
2171 /* getattr request will bring inline data into page cache */
2172 ret1 = __ceph_do_getattr(inode, NULL,
2173 CEPH_STAT_CAP_INLINE_DATA, true);
2174 if (ret1 >= 0) {
2175 ret = 0;
2176 goto again;
2177 }
2178 *err = ret1;
2179 ret = 1;
2180 }
2181out:
2182 dout("get_cap_refs %p ret %d got %s\n", inode, 2165 dout("get_cap_refs %p ret %d got %s\n", inode,
2183 ret, ceph_cap_string(_got)); 2166 ret, ceph_cap_string(*got));
2184 *got = _got;
2185 return ret; 2167 return ret;
2186} 2168}
2187 2169
@@ -2221,22 +2203,52 @@ static void check_max_size(struct inode *inode, loff_t endoff)
2221int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, 2203int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
2222 loff_t endoff, int *got, struct page **pinned_page) 2204 loff_t endoff, int *got, struct page **pinned_page)
2223{ 2205{
2224 int check_max, ret, err; 2206 int _got, check_max, ret, err = 0;
2225 2207
2226retry: 2208retry:
2227 if (endoff > 0) 2209 if (endoff > 0)
2228 check_max_size(&ci->vfs_inode, endoff); 2210 check_max_size(&ci->vfs_inode, endoff);
2211 _got = 0;
2229 check_max = 0; 2212 check_max = 0;
2230 err = 0;
2231 ret = wait_event_interruptible(ci->i_cap_wq, 2213 ret = wait_event_interruptible(ci->i_cap_wq,
2232 try_get_cap_refs(ci, need, want, endoff, 2214 try_get_cap_refs(ci, need, want, endoff,
2233 got, pinned_page, 2215 &_got, &check_max, &err));
2234 &check_max, &err));
2235 if (err) 2216 if (err)
2236 ret = err; 2217 ret = err;
2218 if (ret < 0)
2219 return ret;
2220
2237 if (check_max) 2221 if (check_max)
2238 goto retry; 2222 goto retry;
2239 return ret; 2223
2224 if (ci->i_inline_version != CEPH_INLINE_NONE &&
2225 (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) &&
2226 i_size_read(&ci->vfs_inode) > 0) {
2227 struct page *page = find_get_page(ci->vfs_inode.i_mapping, 0);
2228 if (page) {
2229 if (PageUptodate(page)) {
2230 *pinned_page = page;
2231 goto out;
2232 }
2233 page_cache_release(page);
2234 }
2235 /*
2236 * drop cap refs first because getattr while holding
2237 * caps refs can cause deadlock.
2238 */
2239 ceph_put_cap_refs(ci, _got);
2240 _got = 0;
2241
2242 /* getattr request will bring inline data into page cache */
2243 ret = __ceph_do_getattr(&ci->vfs_inode, NULL,
2244 CEPH_STAT_CAP_INLINE_DATA, true);
2245 if (ret < 0)
2246 return ret;
2247 goto retry;
2248 }
2249out:
2250 *got = _got;
2251 return 0;
2240} 2252}
2241 2253
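Relocating the inline-data check from try_get_cap_refs() to ceph_get_caps() moves the getattr out of the wait_event_interruptible() condition: the caps taken inside the wait are dropped again before __ceph_do_getattr() runs, since issuing a getattr while holding cap references can deadlock. The control flow, reduced to a runnable skeleton whose helper names only mimic the kernel's (all bodies are stubs):

#include <stdio.h>

/* stubbed stand-ins for the kernel helpers this path depends on */
static int have_inline = 1;			/* i_inline_version state */
static int try_get_refs(int *got) { *got = 1; return 1; }
static void put_refs(int got) { (void)got; }
static int inline_page_uptodate(void) { return !have_inline; }
static int do_getattr(void) { have_inline = 0; return 0; }	/* fills cache */

static int get_caps(int *got)
{
	int _got;
retry:
	_got = 0;
	if (!try_get_refs(&_got))	/* kernel: wait_event_interruptible() */
		return -1;

	if (have_inline && !inline_page_uptodate()) {
		/* drop cap refs first: getattr while holding them deadlocks */
		put_refs(_got);
		if (do_getattr() < 0)
			return -1;
		goto retry;		/* take the caps again from scratch */
	}
	*got = _got;
	return 0;
}

int main(void)
{
	int got;
	return get_caps(&got);
}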
2242/* 2254/*
@@ -2432,13 +2444,13 @@ static void invalidate_aliases(struct inode *inode)
2432 */ 2444 */
2433static void handle_cap_grant(struct ceph_mds_client *mdsc, 2445static void handle_cap_grant(struct ceph_mds_client *mdsc,
2434 struct inode *inode, struct ceph_mds_caps *grant, 2446 struct inode *inode, struct ceph_mds_caps *grant,
2435 void *snaptrace, int snaptrace_len,
2436 u64 inline_version, 2447 u64 inline_version,
2437 void *inline_data, int inline_len, 2448 void *inline_data, int inline_len,
2438 struct ceph_buffer *xattr_buf, 2449 struct ceph_buffer *xattr_buf,
2439 struct ceph_mds_session *session, 2450 struct ceph_mds_session *session,
2440 struct ceph_cap *cap, int issued) 2451 struct ceph_cap *cap, int issued)
2441 __releases(ci->i_ceph_lock) 2452 __releases(ci->i_ceph_lock)
2453 __releases(mdsc->snap_rwsem)
2442{ 2454{
2443 struct ceph_inode_info *ci = ceph_inode(inode); 2455 struct ceph_inode_info *ci = ceph_inode(inode);
2444 int mds = session->s_mds; 2456 int mds = session->s_mds;
@@ -2639,10 +2651,6 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
2639 spin_unlock(&ci->i_ceph_lock); 2651 spin_unlock(&ci->i_ceph_lock);
2640 2652
2641 if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) { 2653 if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) {
2642 down_write(&mdsc->snap_rwsem);
2643 ceph_update_snap_trace(mdsc, snaptrace,
2644 snaptrace + snaptrace_len, false);
2645 downgrade_write(&mdsc->snap_rwsem);
2646 kick_flushing_inode_caps(mdsc, session, inode); 2654 kick_flushing_inode_caps(mdsc, session, inode);
2647 up_read(&mdsc->snap_rwsem); 2655 up_read(&mdsc->snap_rwsem);
2648 if (newcaps & ~issued) 2656 if (newcaps & ~issued)
@@ -3052,6 +3060,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
3052 struct ceph_cap *cap; 3060 struct ceph_cap *cap;
3053 struct ceph_mds_caps *h; 3061 struct ceph_mds_caps *h;
3054 struct ceph_mds_cap_peer *peer = NULL; 3062 struct ceph_mds_cap_peer *peer = NULL;
3063 struct ceph_snap_realm *realm;
3055 int mds = session->s_mds; 3064 int mds = session->s_mds;
3056 int op, issued; 3065 int op, issued;
3057 u32 seq, mseq; 3066 u32 seq, mseq;
@@ -3153,11 +3162,23 @@ void ceph_handle_caps(struct ceph_mds_session *session,
3153 goto done_unlocked; 3162 goto done_unlocked;
3154 3163
3155 case CEPH_CAP_OP_IMPORT: 3164 case CEPH_CAP_OP_IMPORT:
3165 realm = NULL;
3166 if (snaptrace_len) {
3167 down_write(&mdsc->snap_rwsem);
3168 ceph_update_snap_trace(mdsc, snaptrace,
3169 snaptrace + snaptrace_len,
3170 false, &realm);
3171 downgrade_write(&mdsc->snap_rwsem);
3172 } else {
3173 down_read(&mdsc->snap_rwsem);
3174 }
3156 handle_cap_import(mdsc, inode, h, peer, session, 3175 handle_cap_import(mdsc, inode, h, peer, session,
3157 &cap, &issued); 3176 &cap, &issued);
3158 handle_cap_grant(mdsc, inode, h, snaptrace, snaptrace_len, 3177 handle_cap_grant(mdsc, inode, h,
3159 inline_version, inline_data, inline_len, 3178 inline_version, inline_data, inline_len,
3160 msg->middle, session, cap, issued); 3179 msg->middle, session, cap, issued);
3180 if (realm)
3181 ceph_put_snap_realm(mdsc, realm);
3161 goto done_unlocked; 3182 goto done_unlocked;
3162 } 3183 }
3163 3184
@@ -3177,7 +3198,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
3177 case CEPH_CAP_OP_GRANT: 3198 case CEPH_CAP_OP_GRANT:
3178 __ceph_caps_issued(ci, &issued); 3199 __ceph_caps_issued(ci, &issued);
3179 issued |= __ceph_caps_dirty(ci); 3200 issued |= __ceph_caps_dirty(ci);
3180 handle_cap_grant(mdsc, inode, h, NULL, 0, 3201 handle_cap_grant(mdsc, inode, h,
3181 inline_version, inline_data, inline_len, 3202 inline_version, inline_data, inline_len,
3182 msg->middle, session, cap, issued); 3203 msg->middle, session, cap, issued);
3183 goto done_unlocked; 3204 goto done_unlocked;
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index c241603764fd..0411dbb15815 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -26,8 +26,6 @@
26 * point by name. 26 * point by name.
27 */ 27 */
28 28
29const struct inode_operations ceph_dir_iops;
30const struct file_operations ceph_dir_fops;
31const struct dentry_operations ceph_dentry_ops; 29const struct dentry_operations ceph_dentry_ops;
32 30
33/* 31/*
@@ -672,13 +670,17 @@ int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry)
672 /* 670 /*
673 * We created the item, then did a lookup, and found 671 * We created the item, then did a lookup, and found
674 * it was already linked to another inode we already 672 * it was already linked to another inode we already
 675 * had in our cache (and thus got spliced). Link our 673 * had in our cache (and thus got spliced). To avoid
 676 * dentry to that inode, but don't hash it, just in 674 * confusing the VFS (especially when the inode is a
 677 * case the VFS wants to dereference it. 675 * directory), we don't link our dentry to that
 676 * inode; we return an error instead.
 677 *
 678 * This event should be rare; it happens only when we
 679 * talk to an old MDS. A recent MDS does not send a
 680 * traceless reply for a request that creates a new inode.
678 */ 681 */
679 BUG_ON(!result->d_inode); 682 d_drop(result);
680 d_instantiate(dentry, result->d_inode); 683 return -ESTALE;
681 return 0;
682 } 684 }
683 return PTR_ERR(result); 685 return PTR_ERR(result);
684} 686}
@@ -1335,6 +1337,13 @@ const struct file_operations ceph_dir_fops = {
1335 .fsync = ceph_dir_fsync, 1337 .fsync = ceph_dir_fsync,
1336}; 1338};
1337 1339
1340const struct file_operations ceph_snapdir_fops = {
1341 .iterate = ceph_readdir,
1342 .llseek = ceph_dir_llseek,
1343 .open = ceph_open,
1344 .release = ceph_release,
1345};
1346
1338const struct inode_operations ceph_dir_iops = { 1347const struct inode_operations ceph_dir_iops = {
1339 .lookup = ceph_lookup, 1348 .lookup = ceph_lookup,
1340 .permission = ceph_permission, 1349 .permission = ceph_permission,
@@ -1357,6 +1366,14 @@ const struct inode_operations ceph_dir_iops = {
1357 .atomic_open = ceph_atomic_open, 1366 .atomic_open = ceph_atomic_open,
1358}; 1367};
1359 1368
1369const struct inode_operations ceph_snapdir_iops = {
1370 .lookup = ceph_lookup,
1371 .permission = ceph_permission,
1372 .getattr = ceph_getattr,
1373 .mkdir = ceph_mkdir,
1374 .rmdir = ceph_unlink,
1375};
1376
1360const struct dentry_operations ceph_dentry_ops = { 1377const struct dentry_operations ceph_dentry_ops = {
1361 .d_revalidate = ceph_d_revalidate, 1378 .d_revalidate = ceph_d_revalidate,
1362 .d_release = ceph_d_release, 1379 .d_release = ceph_d_release,
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index ce74b394b49d..a3d774b35149 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -275,10 +275,10 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
275 err = ceph_mdsc_do_request(mdsc, 275 err = ceph_mdsc_do_request(mdsc,
276 (flags & (O_CREAT|O_TRUNC)) ? dir : NULL, 276 (flags & (O_CREAT|O_TRUNC)) ? dir : NULL,
277 req); 277 req);
278 err = ceph_handle_snapdir(req, dentry, err);
278 if (err) 279 if (err)
279 goto out_req; 280 goto out_req;
280 281
281 err = ceph_handle_snapdir(req, dentry, err);
282 if (err == 0 && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry) 282 if (err == 0 && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
283 err = ceph_handle_notrace_create(dir, dentry); 283 err = ceph_handle_notrace_create(dir, dentry);
284 284
@@ -392,13 +392,14 @@ more:
392 if (ret >= 0) { 392 if (ret >= 0) {
393 int didpages; 393 int didpages;
394 if (was_short && (pos + ret < inode->i_size)) { 394 if (was_short && (pos + ret < inode->i_size)) {
395 u64 tmp = min(this_len - ret, 395 int zlen = min(this_len - ret,
396 inode->i_size - pos - ret); 396 inode->i_size - pos - ret);
397 int zoff = (o_direct ? buf_align : io_align) +
398 read + ret;
397 dout(" zero gap %llu to %llu\n", 399 dout(" zero gap %llu to %llu\n",
398 pos + ret, pos + ret + tmp); 400 pos + ret, pos + ret + zlen);
399 ceph_zero_page_vector_range(page_align + read + ret, 401 ceph_zero_page_vector_range(zoff, zlen, pages);
400 tmp, pages); 402 ret += zlen;
401 ret += tmp;
402 } 403 }
403 404
404 didpages = (page_align + ret) >> PAGE_CACHE_SHIFT; 405 didpages = (page_align + ret) >> PAGE_CACHE_SHIFT;
@@ -878,28 +879,34 @@ again:
878 879
879 i_size = i_size_read(inode); 880 i_size = i_size_read(inode);
880 if (retry_op == READ_INLINE) { 881 if (retry_op == READ_INLINE) {
881 /* does not support inline data > PAGE_SIZE */ 882 BUG_ON(ret > 0 || read > 0);
882 if (i_size > PAGE_CACHE_SIZE) { 883 if (iocb->ki_pos < i_size &&
883 ret = -EIO; 884 iocb->ki_pos < PAGE_CACHE_SIZE) {
884 } else if (iocb->ki_pos < i_size) {
885 loff_t end = min_t(loff_t, i_size, 885 loff_t end = min_t(loff_t, i_size,
886 iocb->ki_pos + len); 886 iocb->ki_pos + len);
887 end = min_t(loff_t, end, PAGE_CACHE_SIZE);
887 if (statret < end) 888 if (statret < end)
888 zero_user_segment(page, statret, end); 889 zero_user_segment(page, statret, end);
889 ret = copy_page_to_iter(page, 890 ret = copy_page_to_iter(page,
890 iocb->ki_pos & ~PAGE_MASK, 891 iocb->ki_pos & ~PAGE_MASK,
891 end - iocb->ki_pos, to); 892 end - iocb->ki_pos, to);
892 iocb->ki_pos += ret; 893 iocb->ki_pos += ret;
893 } else { 894 read += ret;
894 ret = 0; 895 }
896 if (iocb->ki_pos < i_size && read < len) {
897 size_t zlen = min_t(size_t, len - read,
898 i_size - iocb->ki_pos);
899 ret = iov_iter_zero(zlen, to);
900 iocb->ki_pos += ret;
901 read += ret;
895 } 902 }
896 __free_pages(page, 0); 903 __free_pages(page, 0);
897 return ret; 904 return read;
898 } 905 }
899 906
900 /* hit EOF or hole? */ 907 /* hit EOF or hole? */
901 if (retry_op == CHECK_EOF && iocb->ki_pos < i_size && 908 if (retry_op == CHECK_EOF && iocb->ki_pos < i_size &&
902 ret < len) { 909 ret < len) {
903 dout("sync_read hit hole, ppos %lld < size %lld" 910 dout("sync_read hit hole, ppos %lld < size %lld"
904 ", reading more\n", iocb->ki_pos, 911 ", reading more\n", iocb->ki_pos,
905 inode->i_size); 912 inode->i_size);
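The READ_INLINE path above now supports inline files larger than a page: bytes inside the first page are copied from the cached inline page, the stretch between the page boundary and i_size is satisfied by iov_iter_zero(), and the function returns the accumulated read count rather than the last ret. The accounting on its own, as a runnable model (PAGE_SIZE is an assumed constant):

#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 4096

/*
 * Model of the READ_INLINE accounting: bytes inside the first page come
 * from the cached inline page; anything between the page boundary and
 * i_size reads as zeroes; nothing is returned past i_size.
 */
static size_t inline_read(uint64_t pos, size_t len, uint64_t i_size)
{
	size_t read = 0;

	if (pos < i_size && pos < PAGE_SIZE) {
		uint64_t end = pos + len;
		if (end > i_size)
			end = i_size;
		if (end > PAGE_SIZE)
			end = PAGE_SIZE;
		read += end - pos;		/* copy_page_to_iter() */
		pos = end;
	}
	if (pos < i_size && read < len) {
		size_t zlen = len - read;
		if (zlen > i_size - pos)
			zlen = i_size - pos;
		read += zlen;			/* iov_iter_zero() */
	}
	return read;
}

int main(void)
{
	/* 10 KiB inline file: an 8 KiB read at offset 0 gets 4 KiB of data
	 * from the cached page plus 4 KiB of zeroes */
	printf("%zu\n", inline_read(0, 8192, 10240));
	return 0;
}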
@@ -945,7 +952,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
945 mutex_lock(&inode->i_mutex); 952 mutex_lock(&inode->i_mutex);
946 953
947 /* We can write back this queue in page reclaim */ 954 /* We can write back this queue in page reclaim */
948 current->backing_dev_info = file->f_mapping->backing_dev_info; 955 current->backing_dev_info = inode_to_bdi(inode);
949 956
950 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); 957 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
951 if (err) 958 if (err)
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index f61a74115beb..119c43c80638 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -82,8 +82,8 @@ struct inode *ceph_get_snapdir(struct inode *parent)
82 inode->i_mode = parent->i_mode; 82 inode->i_mode = parent->i_mode;
83 inode->i_uid = parent->i_uid; 83 inode->i_uid = parent->i_uid;
84 inode->i_gid = parent->i_gid; 84 inode->i_gid = parent->i_gid;
85 inode->i_op = &ceph_dir_iops; 85 inode->i_op = &ceph_snapdir_iops;
86 inode->i_fop = &ceph_dir_fops; 86 inode->i_fop = &ceph_snapdir_fops;
87 ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */ 87 ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */
88 ci->i_rbytes = 0; 88 ci->i_rbytes = 0;
89 return inode; 89 return inode;
@@ -783,8 +783,6 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
783 } 783 }
784 784
785 inode->i_mapping->a_ops = &ceph_aops; 785 inode->i_mapping->a_ops = &ceph_aops;
786 inode->i_mapping->backing_dev_info =
787 &ceph_sb_to_client(inode->i_sb)->backing_dev_info;
788 786
789 switch (inode->i_mode & S_IFMT) { 787 switch (inode->i_mode & S_IFMT) {
790 case S_IFIFO: 788 case S_IFIFO:
@@ -840,30 +838,31 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
840 ceph_vinop(inode), inode->i_mode); 838 ceph_vinop(inode), inode->i_mode);
841 } 839 }
842 840
843 /* set dir completion flag? */
844 if (S_ISDIR(inode->i_mode) &&
845 ci->i_files == 0 && ci->i_subdirs == 0 &&
846 ceph_snap(inode) == CEPH_NOSNAP &&
847 (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) &&
848 (issued & CEPH_CAP_FILE_EXCL) == 0 &&
849 !__ceph_dir_is_complete(ci)) {
850 dout(" marking %p complete (empty)\n", inode);
851 __ceph_dir_set_complete(ci, atomic_read(&ci->i_release_count),
852 ci->i_ordered_count);
853 }
854
855 /* were we issued a capability? */ 841 /* were we issued a capability? */
856 if (info->cap.caps) { 842 if (info->cap.caps) {
857 if (ceph_snap(inode) == CEPH_NOSNAP) { 843 if (ceph_snap(inode) == CEPH_NOSNAP) {
844 unsigned caps = le32_to_cpu(info->cap.caps);
858 ceph_add_cap(inode, session, 845 ceph_add_cap(inode, session,
859 le64_to_cpu(info->cap.cap_id), 846 le64_to_cpu(info->cap.cap_id),
860 cap_fmode, 847 cap_fmode, caps,
861 le32_to_cpu(info->cap.caps),
862 le32_to_cpu(info->cap.wanted), 848 le32_to_cpu(info->cap.wanted),
863 le32_to_cpu(info->cap.seq), 849 le32_to_cpu(info->cap.seq),
864 le32_to_cpu(info->cap.mseq), 850 le32_to_cpu(info->cap.mseq),
865 le64_to_cpu(info->cap.realm), 851 le64_to_cpu(info->cap.realm),
866 info->cap.flags, &new_cap); 852 info->cap.flags, &new_cap);
853
854 /* set dir completion flag? */
855 if (S_ISDIR(inode->i_mode) &&
856 ci->i_files == 0 && ci->i_subdirs == 0 &&
857 (caps & CEPH_CAP_FILE_SHARED) &&
858 (issued & CEPH_CAP_FILE_EXCL) == 0 &&
859 !__ceph_dir_is_complete(ci)) {
860 dout(" marking %p complete (empty)\n", inode);
861 __ceph_dir_set_complete(ci,
862 atomic_read(&ci->i_release_count),
863 ci->i_ordered_count);
864 }
865
867 wake = true; 866 wake = true;
868 } else { 867 } else {
869 dout(" %p got snap_caps %s\n", inode, 868 dout(" %p got snap_caps %s\n", inode,
@@ -1448,12 +1447,14 @@ retry_lookup:
1448 } 1447 }
1449 1448
1450 if (!dn->d_inode) { 1449 if (!dn->d_inode) {
1451 dn = splice_dentry(dn, in, NULL); 1450 struct dentry *realdn = splice_dentry(dn, in, NULL);
1452 if (IS_ERR(dn)) { 1451 if (IS_ERR(realdn)) {
1453 err = PTR_ERR(dn); 1452 err = PTR_ERR(realdn);
1453 d_drop(dn);
1454 dn = NULL; 1454 dn = NULL;
1455 goto next_item; 1455 goto next_item;
1456 } 1456 }
1457 dn = realdn;
1457 } 1458 }
1458 1459
1459 di = dn->d_fsdata; 1460 di = dn->d_fsdata;
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index c35c5c614e38..4347039ecc18 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -239,23 +239,26 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl)
239 return err; 239 return err;
240} 240}
241 241
242/** 242/*
243 * Must be called with lock_flocks() already held. Fills in the passed 243 * Fills in the passed counter variables, so you can prepare pagelist metadata
244 * counter variables, so you can prepare pagelist metadata before calling 244 * before calling ceph_encode_locks.
245 * ceph_encode_locks.
246 */ 245 */
247void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count) 246void ceph_count_locks(struct inode *inode, int *fcntl_count, int *flock_count)
248{ 247{
249 struct file_lock *lock; 248 struct file_lock *lock;
249 struct file_lock_context *ctx;
250 250
251 *fcntl_count = 0; 251 *fcntl_count = 0;
252 *flock_count = 0; 252 *flock_count = 0;
253 253
254 for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) { 254 ctx = inode->i_flctx;
255 if (lock->fl_flags & FL_POSIX) 255 if (ctx) {
256 spin_lock(&ctx->flc_lock);
257 list_for_each_entry(lock, &ctx->flc_posix, fl_list)
256 ++(*fcntl_count); 258 ++(*fcntl_count);
257 else if (lock->fl_flags & FL_FLOCK) 259 list_for_each_entry(lock, &ctx->flc_flock, fl_list)
258 ++(*flock_count); 260 ++(*flock_count);
261 spin_unlock(&ctx->flc_lock);
259 } 262 }
260 dout("counted %d flock locks and %d fcntl locks", 263 dout("counted %d flock locks and %d fcntl locks",
261 *flock_count, *fcntl_count); 264 *flock_count, *fcntl_count);
@@ -271,6 +274,7 @@ int ceph_encode_locks_to_buffer(struct inode *inode,
271 int num_fcntl_locks, int num_flock_locks) 274 int num_fcntl_locks, int num_flock_locks)
272{ 275{
273 struct file_lock *lock; 276 struct file_lock *lock;
277 struct file_lock_context *ctx = inode->i_flctx;
274 int err = 0; 278 int err = 0;
275 int seen_fcntl = 0; 279 int seen_fcntl = 0;
276 int seen_flock = 0; 280 int seen_flock = 0;
@@ -279,33 +283,34 @@ int ceph_encode_locks_to_buffer(struct inode *inode,
279 dout("encoding %d flock and %d fcntl locks", num_flock_locks, 283 dout("encoding %d flock and %d fcntl locks", num_flock_locks,
280 num_fcntl_locks); 284 num_fcntl_locks);
281 285
282 for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) { 286 if (!ctx)
283 if (lock->fl_flags & FL_POSIX) { 287 return 0;
284 ++seen_fcntl; 288
285 if (seen_fcntl > num_fcntl_locks) { 289 spin_lock(&ctx->flc_lock);
 286 err = -ENOSPC; 290 list_for_each_entry(lock, &ctx->flc_posix, fl_list) {
287 goto fail; 291 ++seen_fcntl;
288 } 292 if (seen_fcntl > num_fcntl_locks) {
289 err = lock_to_ceph_filelock(lock, &flocks[l]); 293 err = -ENOSPC;
290 if (err) 294 goto fail;
291 goto fail;
292 ++l;
293 } 295 }
296 err = lock_to_ceph_filelock(lock, &flocks[l]);
297 if (err)
298 goto fail;
299 ++l;
294 } 300 }
295 for (lock = inode->i_flock; lock != NULL; lock = lock->fl_next) { 301 list_for_each_entry(lock, &ctx->flc_flock, fl_list) {
296 if (lock->fl_flags & FL_FLOCK) { 302 ++seen_flock;
297 ++seen_flock; 303 if (seen_flock > num_flock_locks) {
298 if (seen_flock > num_flock_locks) { 304 err = -ENOSPC;
299 err = -ENOSPC; 305 goto fail;
300 goto fail;
301 }
302 err = lock_to_ceph_filelock(lock, &flocks[l]);
303 if (err)
304 goto fail;
305 ++l;
306 } 306 }
307 err = lock_to_ceph_filelock(lock, &flocks[l]);
308 if (err)
309 goto fail;
310 ++l;
307 } 311 }
308fail: 312fail:
313 spin_unlock(&ctx->flc_lock);
309 return err; 314 return err;
310} 315}
311 316
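With lock state moved off inode->i_flock into a per-inode file_lock_context, both helpers walk two dedicated lists — flc_posix for fcntl locks, flc_flock for flock locks — under ctx->flc_lock instead of filtering one mixed list by fl_flags. A userspace model of the counting half, using plain singly linked lists rather than the kernel's list_head:

#include <stddef.h>

struct file_lock { struct file_lock *next; };

struct file_lock_context {
	/* one dedicated list per lock type, replacing the mixed i_flock list */
	struct file_lock *flc_posix;	/* fcntl()-style POSIX locks */
	struct file_lock *flc_flock;	/* flock()-style locks */
};

static void count_locks(const struct file_lock_context *ctx,
			int *fcntl_count, int *flock_count)
{
	const struct file_lock *l;

	*fcntl_count = 0;
	*flock_count = 0;
	if (!ctx)		/* no context means no locks were ever taken */
		return;
	/* the kernel takes ctx->flc_lock around both walks */
	for (l = ctx->flc_posix; l; l = l->next)
		++(*fcntl_count);
	for (l = ctx->flc_flock; l; l = l->next)
		++(*flock_count);
}

The encode side has to walk the same two lists in the same order, so the counts prepared here line up with the buffer slots written there.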
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index d2171f4a6980..71c073f38e54 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -480,6 +480,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
480 mdsc->max_sessions = newmax; 480 mdsc->max_sessions = newmax;
481 } 481 }
482 mdsc->sessions[mds] = s; 482 mdsc->sessions[mds] = s;
483 atomic_inc(&mdsc->num_sessions);
483 atomic_inc(&s->s_ref); /* one ref to sessions[], one to caller */ 484 atomic_inc(&s->s_ref); /* one ref to sessions[], one to caller */
484 485
485 ceph_con_open(&s->s_con, CEPH_ENTITY_TYPE_MDS, mds, 486 ceph_con_open(&s->s_con, CEPH_ENTITY_TYPE_MDS, mds,
@@ -503,6 +504,7 @@ static void __unregister_session(struct ceph_mds_client *mdsc,
503 mdsc->sessions[s->s_mds] = NULL; 504 mdsc->sessions[s->s_mds] = NULL;
504 ceph_con_close(&s->s_con); 505 ceph_con_close(&s->s_con);
505 ceph_put_mds_session(s); 506 ceph_put_mds_session(s);
507 atomic_dec(&mdsc->num_sessions);
506} 508}
507 509
508/* 510/*
@@ -842,8 +844,9 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6
842 struct ceph_options *opt = mdsc->fsc->client->options; 844 struct ceph_options *opt = mdsc->fsc->client->options;
843 void *p; 845 void *p;
844 846
845 const char* metadata[3][2] = { 847 const char* metadata[][2] = {
846 {"hostname", utsname()->nodename}, 848 {"hostname", utsname()->nodename},
849 {"kernel_version", utsname()->release},
847 {"entity_id", opt->name ? opt->name : ""}, 850 {"entity_id", opt->name ? opt->name : ""},
848 {NULL, NULL} 851 {NULL, NULL}
849 }; 852 };
@@ -1464,19 +1467,33 @@ out_unlocked:
1464 return err; 1467 return err;
1465} 1468}
1466 1469
1470static int check_cap_flush(struct inode *inode, u64 want_flush_seq)
1471{
1472 struct ceph_inode_info *ci = ceph_inode(inode);
1473 int ret;
1474 spin_lock(&ci->i_ceph_lock);
1475 if (ci->i_flushing_caps)
1476 ret = ci->i_cap_flush_seq >= want_flush_seq;
1477 else
1478 ret = 1;
1479 spin_unlock(&ci->i_ceph_lock);
1480 return ret;
1481}
1482
1467/* 1483/*
1468 * flush all dirty inode data to disk. 1484 * flush all dirty inode data to disk.
1469 * 1485 *
1470 * returns true if we've flushed through want_flush_seq 1486 * returns true if we've flushed through want_flush_seq
1471 */ 1487 */
1472static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq) 1488static void wait_caps_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq)
1473{ 1489{
1474 int mds, ret = 1; 1490 int mds;
1475 1491
1476 dout("check_cap_flush want %lld\n", want_flush_seq); 1492 dout("check_cap_flush want %lld\n", want_flush_seq);
1477 mutex_lock(&mdsc->mutex); 1493 mutex_lock(&mdsc->mutex);
1478 for (mds = 0; ret && mds < mdsc->max_sessions; mds++) { 1494 for (mds = 0; mds < mdsc->max_sessions; mds++) {
1479 struct ceph_mds_session *session = mdsc->sessions[mds]; 1495 struct ceph_mds_session *session = mdsc->sessions[mds];
1496 struct inode *inode = NULL;
1480 1497
1481 if (!session) 1498 if (!session)
1482 continue; 1499 continue;
@@ -1489,29 +1506,29 @@ static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq)
1489 list_entry(session->s_cap_flushing.next, 1506 list_entry(session->s_cap_flushing.next,
1490 struct ceph_inode_info, 1507 struct ceph_inode_info,
1491 i_flushing_item); 1508 i_flushing_item);
1492 struct inode *inode = &ci->vfs_inode;
1493 1509
1494 spin_lock(&ci->i_ceph_lock); 1510 if (!check_cap_flush(&ci->vfs_inode, want_flush_seq)) {
1495 if (ci->i_cap_flush_seq <= want_flush_seq) {
1496 dout("check_cap_flush still flushing %p " 1511 dout("check_cap_flush still flushing %p "
1497 "seq %lld <= %lld to mds%d\n", inode, 1512 "seq %lld <= %lld to mds%d\n",
1498 ci->i_cap_flush_seq, want_flush_seq, 1513 &ci->vfs_inode, ci->i_cap_flush_seq,
1499 session->s_mds); 1514 want_flush_seq, session->s_mds);
1500 ret = 0; 1515 inode = igrab(&ci->vfs_inode);
1501 } 1516 }
1502 spin_unlock(&ci->i_ceph_lock);
1503 } 1517 }
1504 mutex_unlock(&session->s_mutex); 1518 mutex_unlock(&session->s_mutex);
1505 ceph_put_mds_session(session); 1519 ceph_put_mds_session(session);
1506 1520
1507 if (!ret) 1521 if (inode) {
1508 return ret; 1522 wait_event(mdsc->cap_flushing_wq,
1523 check_cap_flush(inode, want_flush_seq));
1524 iput(inode);
1525 }
1526
1509 mutex_lock(&mdsc->mutex); 1527 mutex_lock(&mdsc->mutex);
1510 } 1528 }
1511 1529
1512 mutex_unlock(&mdsc->mutex); 1530 mutex_unlock(&mdsc->mutex);
1513 dout("check_cap_flush ok, flushed thru %lld\n", want_flush_seq); 1531 dout("check_cap_flush ok, flushed thru %lld\n", want_flush_seq);
1514 return ret;
1515} 1532}
1516 1533
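The reworked loop no longer bails out of the scan when an inode is still flushing: it pins the inode with igrab() while the session mutex is held, drops both mutexes, sleeps on cap_flushing_wq until the per-inode predicate holds, then iput()s and resumes. A reduced pthread model of that wait-outside-the-locks shape (all names here are illustrative, not the kernel's):

#include <pthread.h>

static pthread_mutex_t mdsc_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t ci_lock    = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  flush_wq   = PTHREAD_COND_INITIALIZER;
static long cap_flush_seq;		/* advanced by the flusher side */

static void wait_caps_flush(long want)
{
	pthread_mutex_lock(&mdsc_mutex);
	/* ... scan sessions; pin (igrab) the first still-flushing inode ... */
	pthread_mutex_unlock(&mdsc_mutex);	/* never sleep holding this */

	pthread_mutex_lock(&ci_lock);
	while (cap_flush_seq < want)		/* wait_event() equivalent */
		pthread_cond_wait(&flush_wq, &ci_lock);
	pthread_mutex_unlock(&ci_lock);
	/* ... iput() the inode, retake mdsc_mutex, resume the scan ... */
}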
1517/* 1534/*
@@ -1923,7 +1940,11 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
1923 head->num_releases = cpu_to_le16(releases); 1940 head->num_releases = cpu_to_le16(releases);
1924 1941
1925 /* time stamp */ 1942 /* time stamp */
1926 ceph_encode_copy(&p, &req->r_stamp, sizeof(req->r_stamp)); 1943 {
1944 struct ceph_timespec ts;
1945 ceph_encode_timespec(&ts, &req->r_stamp);
1946 ceph_encode_copy(&p, &ts, sizeof(ts));
1947 }
1927 1948
1928 BUG_ON(p > end); 1949 BUG_ON(p > end);
1929 msg->front.iov_len = p - msg->front.iov_base; 1950 msg->front.iov_len = p - msg->front.iov_base;
@@ -2012,7 +2033,11 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
2012 2033
2013 /* time stamp */ 2034 /* time stamp */
2014 p = msg->front.iov_base + req->r_request_release_offset; 2035 p = msg->front.iov_base + req->r_request_release_offset;
2015 ceph_encode_copy(&p, &req->r_stamp, sizeof(req->r_stamp)); 2036 {
2037 struct ceph_timespec ts;
2038 ceph_encode_timespec(&ts, &req->r_stamp);
2039 ceph_encode_copy(&p, &ts, sizeof(ts));
2040 }
2016 2041
2017 msg->front.iov_len = p - msg->front.iov_base; 2042 msg->front.iov_len = p - msg->front.iov_base;
2018 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); 2043 msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
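Both hunks stop memcpy()ing a raw struct timespec into the message: its fields are 64-bit on 64-bit kernels, while the wire expects the fixed-size little-endian struct ceph_timespec, so the value is converted first. A self-contained illustration — the two-field packed layout mirrors the real wire format, but the le32 helper is open-coded here:

#include <stdint.h>
#include <string.h>
#include <time.h>

/* wire format: two little-endian 32-bit fields, as struct ceph_timespec */
struct ceph_timespec {
	uint32_t tv_sec;
	uint32_t tv_nsec;
} __attribute__((packed));

static uint32_t cpu_to_le32(uint32_t v)
{
	/* store the little-endian byte order regardless of host order */
	const uint8_t b[4] = { v, v >> 8, v >> 16, v >> 24 };
	uint32_t out;
	memcpy(&out, b, 4);
	return out;
}

static void encode_timespec(uint8_t **p, const struct timespec *ts)
{
	struct ceph_timespec cts = {
		.tv_sec  = cpu_to_le32((uint32_t)ts->tv_sec),
		.tv_nsec = cpu_to_le32((uint32_t)ts->tv_nsec),
	};
	/* copying the converted struct, never the native timespec,
	 * keeps the message layout identical on all architectures */
	memcpy(*p, &cts, sizeof(cts));
	*p += sizeof(cts);
}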
@@ -2159,6 +2184,8 @@ static void kick_requests(struct ceph_mds_client *mdsc, int mds)
2159 p = rb_next(p); 2184 p = rb_next(p);
2160 if (req->r_got_unsafe) 2185 if (req->r_got_unsafe)
2161 continue; 2186 continue;
2187 if (req->r_attempts > 0)
2188 continue; /* only new requests */
2162 if (req->r_session && 2189 if (req->r_session &&
2163 req->r_session->s_mds == mds) { 2190 req->r_session->s_mds == mds) {
2164 dout(" kicking tid %llu\n", req->r_tid); 2191 dout(" kicking tid %llu\n", req->r_tid);
@@ -2286,6 +2313,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
2286 struct ceph_mds_request *req; 2313 struct ceph_mds_request *req;
2287 struct ceph_mds_reply_head *head = msg->front.iov_base; 2314 struct ceph_mds_reply_head *head = msg->front.iov_base;
2288 struct ceph_mds_reply_info_parsed *rinfo; /* parsed reply info */ 2315 struct ceph_mds_reply_info_parsed *rinfo; /* parsed reply info */
2316 struct ceph_snap_realm *realm;
2289 u64 tid; 2317 u64 tid;
2290 int err, result; 2318 int err, result;
2291 int mds = session->s_mds; 2319 int mds = session->s_mds;
@@ -2401,11 +2429,13 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
2401 } 2429 }
2402 2430
2403 /* snap trace */ 2431 /* snap trace */
2432 realm = NULL;
2404 if (rinfo->snapblob_len) { 2433 if (rinfo->snapblob_len) {
2405 down_write(&mdsc->snap_rwsem); 2434 down_write(&mdsc->snap_rwsem);
2406 ceph_update_snap_trace(mdsc, rinfo->snapblob, 2435 ceph_update_snap_trace(mdsc, rinfo->snapblob,
2407 rinfo->snapblob + rinfo->snapblob_len, 2436 rinfo->snapblob + rinfo->snapblob_len,
2408 le32_to_cpu(head->op) == CEPH_MDS_OP_RMSNAP); 2437 le32_to_cpu(head->op) == CEPH_MDS_OP_RMSNAP,
2438 &realm);
2409 downgrade_write(&mdsc->snap_rwsem); 2439 downgrade_write(&mdsc->snap_rwsem);
2410 } else { 2440 } else {
2411 down_read(&mdsc->snap_rwsem); 2441 down_read(&mdsc->snap_rwsem);
@@ -2423,6 +2453,8 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
2423 mutex_unlock(&req->r_fill_mutex); 2453 mutex_unlock(&req->r_fill_mutex);
2424 2454
2425 up_read(&mdsc->snap_rwsem); 2455 up_read(&mdsc->snap_rwsem);
2456 if (realm)
2457 ceph_put_snap_realm(mdsc, realm);
2426out_err: 2458out_err:
2427 mutex_lock(&mdsc->mutex); 2459 mutex_lock(&mdsc->mutex);
2428 if (!req->r_aborted) { 2460 if (!req->r_aborted) {
@@ -2487,6 +2519,7 @@ static void handle_forward(struct ceph_mds_client *mdsc,
2487 dout("forward tid %llu to mds%d (we resend)\n", tid, next_mds); 2519 dout("forward tid %llu to mds%d (we resend)\n", tid, next_mds);
2488 BUG_ON(req->r_err); 2520 BUG_ON(req->r_err);
2489 BUG_ON(req->r_got_result); 2521 BUG_ON(req->r_got_result);
2522 req->r_attempts = 0;
2490 req->r_num_fwd = fwd_seq; 2523 req->r_num_fwd = fwd_seq;
2491 req->r_resend_mds = next_mds; 2524 req->r_resend_mds = next_mds;
2492 put_request_session(req); 2525 put_request_session(req);
@@ -2580,6 +2613,14 @@ static void handle_session(struct ceph_mds_session *session,
2580 send_flushmsg_ack(mdsc, session, seq); 2613 send_flushmsg_ack(mdsc, session, seq);
2581 break; 2614 break;
2582 2615
2616 case CEPH_SESSION_FORCE_RO:
2617 dout("force_session_readonly %p\n", session);
2618 spin_lock(&session->s_cap_lock);
2619 session->s_readonly = true;
2620 spin_unlock(&session->s_cap_lock);
2621 wake_up_session_caps(session, 0);
2622 break;
2623
2583 default: 2624 default:
2584 pr_err("mdsc_handle_session bad op %d mds%d\n", op, mds); 2625 pr_err("mdsc_handle_session bad op %d mds%d\n", op, mds);
2585 WARN_ON(1); 2626 WARN_ON(1);
@@ -2610,6 +2651,7 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
2610 struct ceph_mds_session *session) 2651 struct ceph_mds_session *session)
2611{ 2652{
2612 struct ceph_mds_request *req, *nreq; 2653 struct ceph_mds_request *req, *nreq;
2654 struct rb_node *p;
2613 int err; 2655 int err;
2614 2656
2615 dout("replay_unsafe_requests mds%d\n", session->s_mds); 2657 dout("replay_unsafe_requests mds%d\n", session->s_mds);
@@ -2622,6 +2664,28 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
2622 ceph_con_send(&session->s_con, req->r_request); 2664 ceph_con_send(&session->s_con, req->r_request);
2623 } 2665 }
2624 } 2666 }
2667
2668 /*
 2669 * Also re-send old requests when the MDS enters the reconnect stage, so
 2670 * that the MDS can process completed requests in the clientreplay stage.
2671 */
2672 p = rb_first(&mdsc->request_tree);
2673 while (p) {
2674 req = rb_entry(p, struct ceph_mds_request, r_node);
2675 p = rb_next(p);
2676 if (req->r_got_unsafe)
2677 continue;
2678 if (req->r_attempts == 0)
2679 continue; /* only old requests */
2680 if (req->r_session &&
2681 req->r_session->s_mds == session->s_mds) {
2682 err = __prepare_send_request(mdsc, req, session->s_mds);
2683 if (!err) {
2684 ceph_msg_get(req->r_request);
2685 ceph_con_send(&session->s_con, req->r_request);
2686 }
2687 }
2688 }
2625 mutex_unlock(&mdsc->mutex); 2689 mutex_unlock(&mdsc->mutex);
2626} 2690}
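
The new tail of replay_unsafe_requests() walks the whole request tree and re-sends anything that had already been attempted. A minimal sketch of the traversal idiom, with hypothetical names (struct item, resend_old_items); note the next pointer is fetched before the current node is acted on:

#include <linux/rbtree.h>

struct item {
	struct rb_node node;
	int attempts;
};

static void resend_old_items(struct rb_root *root,
			     void (*resend)(struct item *))
{
	struct rb_node *p = rb_first(root);

	while (p) {
		struct item *it = rb_entry(p, struct item, node);

		p = rb_next(p);			/* advance before acting on 'it' */
		if (it->attempts == 0)
			continue;		/* skip requests never sent yet */
		resend(it);
	}
}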
2627 2691
@@ -2700,20 +2764,16 @@ static int encode_caps_cb(struct inode *inode, struct ceph_cap *cap,
2700 struct ceph_filelock *flocks; 2764 struct ceph_filelock *flocks;
2701 2765
2702encode_again: 2766encode_again:
2703 spin_lock(&inode->i_lock);
2704 ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks); 2767 ceph_count_locks(inode, &num_fcntl_locks, &num_flock_locks);
2705 spin_unlock(&inode->i_lock);
2706 flocks = kmalloc((num_fcntl_locks+num_flock_locks) * 2768 flocks = kmalloc((num_fcntl_locks+num_flock_locks) *
2707 sizeof(struct ceph_filelock), GFP_NOFS); 2769 sizeof(struct ceph_filelock), GFP_NOFS);
2708 if (!flocks) { 2770 if (!flocks) {
2709 err = -ENOMEM; 2771 err = -ENOMEM;
2710 goto out_free; 2772 goto out_free;
2711 } 2773 }
2712 spin_lock(&inode->i_lock);
2713 err = ceph_encode_locks_to_buffer(inode, flocks, 2774 err = ceph_encode_locks_to_buffer(inode, flocks,
2714 num_fcntl_locks, 2775 num_fcntl_locks,
2715 num_flock_locks); 2776 num_flock_locks);
2716 spin_unlock(&inode->i_lock);
2717 if (err) { 2777 if (err) {
2718 kfree(flocks); 2778 kfree(flocks);
2719 if (err == -ENOSPC) 2779 if (err == -ENOSPC)
@@ -2791,6 +2851,8 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
2791 spin_unlock(&session->s_gen_ttl_lock); 2851 spin_unlock(&session->s_gen_ttl_lock);
2792 2852
2793 spin_lock(&session->s_cap_lock); 2853 spin_lock(&session->s_cap_lock);
2854 /* don't know if session is readonly */
2855 session->s_readonly = 0;
2794 /* 2856 /*
2795 * notify __ceph_remove_cap() that we are composing cap reconnect. 2857 * notify __ceph_remove_cap() that we are composing cap reconnect.
 2796	 * If a cap gets released before being added to the cap reconnect, 2858	 * If a cap gets released before being added to the cap reconnect,
@@ -2937,9 +2999,6 @@ static void check_new_map(struct ceph_mds_client *mdsc,
2937 mutex_unlock(&s->s_mutex); 2999 mutex_unlock(&s->s_mutex);
2938 s->s_state = CEPH_MDS_SESSION_RESTARTING; 3000 s->s_state = CEPH_MDS_SESSION_RESTARTING;
2939 } 3001 }
2940
2941 /* kick any requests waiting on the recovering mds */
2942 kick_requests(mdsc, i);
2943 } else if (oldstate == newstate) { 3002 } else if (oldstate == newstate) {
2944 continue; /* nothing new with this mds */ 3003 continue; /* nothing new with this mds */
2945 } 3004 }
@@ -3299,6 +3358,7 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
3299 init_waitqueue_head(&mdsc->session_close_wq); 3358 init_waitqueue_head(&mdsc->session_close_wq);
3300 INIT_LIST_HEAD(&mdsc->waiting_for_map); 3359 INIT_LIST_HEAD(&mdsc->waiting_for_map);
3301 mdsc->sessions = NULL; 3360 mdsc->sessions = NULL;
3361 atomic_set(&mdsc->num_sessions, 0);
3302 mdsc->max_sessions = 0; 3362 mdsc->max_sessions = 0;
3303 mdsc->stopping = 0; 3363 mdsc->stopping = 0;
3304 init_rwsem(&mdsc->snap_rwsem); 3364 init_rwsem(&mdsc->snap_rwsem);
@@ -3432,14 +3492,17 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
3432 dout("sync\n"); 3492 dout("sync\n");
3433 mutex_lock(&mdsc->mutex); 3493 mutex_lock(&mdsc->mutex);
3434 want_tid = mdsc->last_tid; 3494 want_tid = mdsc->last_tid;
3435 want_flush = mdsc->cap_flush_seq;
3436 mutex_unlock(&mdsc->mutex); 3495 mutex_unlock(&mdsc->mutex);
3437 dout("sync want tid %lld flush_seq %lld\n", want_tid, want_flush);
3438 3496
3439 ceph_flush_dirty_caps(mdsc); 3497 ceph_flush_dirty_caps(mdsc);
3498 spin_lock(&mdsc->cap_dirty_lock);
3499 want_flush = mdsc->cap_flush_seq;
3500 spin_unlock(&mdsc->cap_dirty_lock);
3501
3502 dout("sync want tid %lld flush_seq %lld\n", want_tid, want_flush);
3440 3503
3441 wait_unsafe_requests(mdsc, want_tid); 3504 wait_unsafe_requests(mdsc, want_tid);
3442 wait_event(mdsc->cap_flushing_wq, check_cap_flush(mdsc, want_flush)); 3505 wait_caps_flush(mdsc, want_flush);
3443} 3506}
3444 3507
3445/* 3508/*
@@ -3447,17 +3510,9 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
3447 */ 3510 */
3448static bool done_closing_sessions(struct ceph_mds_client *mdsc) 3511static bool done_closing_sessions(struct ceph_mds_client *mdsc)
3449{ 3512{
3450 int i, n = 0;
3451
3452 if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN) 3513 if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN)
3453 return true; 3514 return true;
3454 3515 return atomic_read(&mdsc->num_sessions) == 0;
3455 mutex_lock(&mdsc->mutex);
3456 for (i = 0; i < mdsc->max_sessions; i++)
3457 if (mdsc->sessions[i])
3458 n++;
3459 mutex_unlock(&mdsc->mutex);
3460 return n == 0;
3461} 3516}
3462 3517
3463/* 3518/*
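
done_closing_sessions() previously took mdsc->mutex and scanned the sessions[] array; the new num_sessions atomic makes the check lock-free. A sketch of the general pattern, assuming the counter is bumped wherever sessions are registered and unregistered (those hunks are not shown in this section):

#include <linux/atomic.h>
#include <linux/types.h>

static atomic_t num_sessions = ATOMIC_INIT(0);

static void session_registered(void)
{
	atomic_inc(&num_sessions);	/* on session creation */
}

static void session_unregistered(void)
{
	atomic_dec(&num_sessions);	/* on session teardown */
}

static bool all_sessions_closed(void)
{
	/* no mutex, no array scan: one atomic load */
	return atomic_read(&num_sessions) == 0;
}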
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index e2817d00f7d9..1875b5d985c6 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -137,6 +137,7 @@ struct ceph_mds_session {
137 int s_nr_caps, s_trim_caps; 137 int s_nr_caps, s_trim_caps;
138 int s_num_cap_releases; 138 int s_num_cap_releases;
139 int s_cap_reconnect; 139 int s_cap_reconnect;
140 int s_readonly;
140 struct list_head s_cap_releases; /* waiting cap_release messages */ 141 struct list_head s_cap_releases; /* waiting cap_release messages */
141 struct list_head s_cap_releases_done; /* ready to send */ 142 struct list_head s_cap_releases_done; /* ready to send */
142 struct ceph_cap *s_cap_iterator; 143 struct ceph_cap *s_cap_iterator;
@@ -272,6 +273,7 @@ struct ceph_mds_client {
272 struct list_head waiting_for_map; 273 struct list_head waiting_for_map;
273 274
274 struct ceph_mds_session **sessions; /* NULL for mds if no session */ 275 struct ceph_mds_session **sessions; /* NULL for mds if no session */
276 atomic_t num_sessions;
275 int max_sessions; /* len of s_mds_sessions */ 277 int max_sessions; /* len of s_mds_sessions */
276 int stopping; /* true if shutting down */ 278 int stopping; /* true if shutting down */
277 279
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index ce35fbd4ba5d..a97e39f09ba6 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -70,13 +70,11 @@ void ceph_get_snap_realm(struct ceph_mds_client *mdsc,
70 * safe. we do need to protect against concurrent empty list 70 * safe. we do need to protect against concurrent empty list
71 * additions, however. 71 * additions, however.
72 */ 72 */
73 if (atomic_read(&realm->nref) == 0) { 73 if (atomic_inc_return(&realm->nref) == 1) {
74 spin_lock(&mdsc->snap_empty_lock); 74 spin_lock(&mdsc->snap_empty_lock);
75 list_del_init(&realm->empty_item); 75 list_del_init(&realm->empty_item);
76 spin_unlock(&mdsc->snap_empty_lock); 76 spin_unlock(&mdsc->snap_empty_lock);
77 } 77 }
78
79 atomic_inc(&realm->nref);
80} 78}
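
The old ceph_get_snap_realm() read nref and incremented it in two separate steps, so two racing callers could both observe zero before either incremented. atomic_inc_return() folds the increment and the test into one atomic operation, so exactly one caller sees the 0 -> 1 transition. A self-contained sketch of the fixed pattern, with hypothetical obj/empty-list names:

#include <linux/atomic.h>
#include <linux/list.h>
#include <linux/spinlock.h>

struct obj {
	atomic_t nref;
	struct list_head empty_item;	/* on the "unreferenced" list when nref == 0 */
};

static DEFINE_SPINLOCK(empty_lock);

/*
 * Racy (old) shape:
 *	if (atomic_read(&o->nref) == 0) { ...remove from empty list... }
 *	atomic_inc(&o->nref);
 * Two callers can both read 0 before either increments. The fixed form
 * below lets exactly one caller observe the 0 -> 1 transition.
 */
static void obj_get(struct obj *o)
{
	if (atomic_inc_return(&o->nref) == 1) {
		spin_lock(&empty_lock);
		list_del_init(&o->empty_item);
		spin_unlock(&empty_lock);
	}
}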
81 79
82static void __insert_snap_realm(struct rb_root *root, 80static void __insert_snap_realm(struct rb_root *root,
@@ -116,7 +114,7 @@ static struct ceph_snap_realm *ceph_create_snap_realm(
116 if (!realm) 114 if (!realm)
117 return ERR_PTR(-ENOMEM); 115 return ERR_PTR(-ENOMEM);
118 116
119 atomic_set(&realm->nref, 0); /* tree does not take a ref */ 117 atomic_set(&realm->nref, 1); /* for caller */
120 realm->ino = ino; 118 realm->ino = ino;
121 INIT_LIST_HEAD(&realm->children); 119 INIT_LIST_HEAD(&realm->children);
122 INIT_LIST_HEAD(&realm->child_item); 120 INIT_LIST_HEAD(&realm->child_item);
@@ -134,8 +132,8 @@ static struct ceph_snap_realm *ceph_create_snap_realm(
134 * 132 *
135 * caller must hold snap_rwsem for write. 133 * caller must hold snap_rwsem for write.
136 */ 134 */
137struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc, 135static struct ceph_snap_realm *__lookup_snap_realm(struct ceph_mds_client *mdsc,
138 u64 ino) 136 u64 ino)
139{ 137{
140 struct rb_node *n = mdsc->snap_realms.rb_node; 138 struct rb_node *n = mdsc->snap_realms.rb_node;
141 struct ceph_snap_realm *r; 139 struct ceph_snap_realm *r;
@@ -154,6 +152,16 @@ struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc,
154 return NULL; 152 return NULL;
155} 153}
156 154
155struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc,
156 u64 ino)
157{
158 struct ceph_snap_realm *r;
159 r = __lookup_snap_realm(mdsc, ino);
160 if (r)
161 ceph_get_snap_realm(mdsc, r);
162 return r;
163}
164
157static void __put_snap_realm(struct ceph_mds_client *mdsc, 165static void __put_snap_realm(struct ceph_mds_client *mdsc,
158 struct ceph_snap_realm *realm); 166 struct ceph_snap_realm *realm);
159 167
@@ -273,7 +281,6 @@ static int adjust_snap_realm_parent(struct ceph_mds_client *mdsc,
273 } 281 }
274 realm->parent_ino = parentino; 282 realm->parent_ino = parentino;
275 realm->parent = parent; 283 realm->parent = parent;
276 ceph_get_snap_realm(mdsc, parent);
277 list_add(&realm->child_item, &parent->children); 284 list_add(&realm->child_item, &parent->children);
278 return 1; 285 return 1;
279} 286}
@@ -631,12 +638,14 @@ static void queue_realm_cap_snaps(struct ceph_snap_realm *realm)
631 * Caller must hold snap_rwsem for write. 638 * Caller must hold snap_rwsem for write.
632 */ 639 */
633int ceph_update_snap_trace(struct ceph_mds_client *mdsc, 640int ceph_update_snap_trace(struct ceph_mds_client *mdsc,
634 void *p, void *e, bool deletion) 641 void *p, void *e, bool deletion,
642 struct ceph_snap_realm **realm_ret)
635{ 643{
636 struct ceph_mds_snap_realm *ri; /* encoded */ 644 struct ceph_mds_snap_realm *ri; /* encoded */
637 __le64 *snaps; /* encoded */ 645 __le64 *snaps; /* encoded */
638 __le64 *prior_parent_snaps; /* encoded */ 646 __le64 *prior_parent_snaps; /* encoded */
639 struct ceph_snap_realm *realm; 647 struct ceph_snap_realm *realm = NULL;
648 struct ceph_snap_realm *first_realm = NULL;
640 int invalidate = 0; 649 int invalidate = 0;
641 int err = -ENOMEM; 650 int err = -ENOMEM;
642 LIST_HEAD(dirty_realms); 651 LIST_HEAD(dirty_realms);
@@ -704,13 +713,18 @@ more:
704 dout("done with %llx %p, invalidated=%d, %p %p\n", realm->ino, 713 dout("done with %llx %p, invalidated=%d, %p %p\n", realm->ino,
705 realm, invalidate, p, e); 714 realm, invalidate, p, e);
706 715
707 if (p < e)
708 goto more;
709
710 /* invalidate when we reach the _end_ (root) of the trace */ 716 /* invalidate when we reach the _end_ (root) of the trace */
711 if (invalidate) 717 if (invalidate && p >= e)
712 rebuild_snap_realms(realm); 718 rebuild_snap_realms(realm);
713 719
720 if (!first_realm)
721 first_realm = realm;
722 else
723 ceph_put_snap_realm(mdsc, realm);
724
725 if (p < e)
726 goto more;
727
714 /* 728 /*
715 * queue cap snaps _after_ we've built the new snap contexts, 729 * queue cap snaps _after_ we've built the new snap contexts,
716 * so that i_head_snapc can be set appropriately. 730 * so that i_head_snapc can be set appropriately.
@@ -721,12 +735,21 @@ more:
721 queue_realm_cap_snaps(realm); 735 queue_realm_cap_snaps(realm);
722 } 736 }
723 737
738 if (realm_ret)
739 *realm_ret = first_realm;
740 else
741 ceph_put_snap_realm(mdsc, first_realm);
742
724 __cleanup_empty_realms(mdsc); 743 __cleanup_empty_realms(mdsc);
725 return 0; 744 return 0;
726 745
727bad: 746bad:
728 err = -EINVAL; 747 err = -EINVAL;
729fail: 748fail:
749 if (realm && !IS_ERR(realm))
750 ceph_put_snap_realm(mdsc, realm);
751 if (first_realm)
752 ceph_put_snap_realm(mdsc, first_realm);
730 pr_err("update_snap_trace error %d\n", err); 753 pr_err("update_snap_trace error %d\n", err);
731 return err; 754 return err;
732} 755}
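
ceph_update_snap_trace() now optionally hands the first realm back to the caller with a reference held; callers that pass NULL get the old drop-it-internally behaviour. A generic sketch of this reference hand-off, all names illustrative:

struct obj;
void obj_get(struct obj *o);		/* hypothetical refcount helpers */
void obj_put(struct obj *o);
struct obj *first_obj(void);		/* returns an object with a reference held */

static int do_work(struct obj **out)
{
	struct obj *first = first_obj();	/* we hold one reference */

	/* ... process the trace, taking/dropping other references ... */

	if (out)
		*out = first;		/* hand our reference to the caller */
	else
		obj_put(first);		/* caller doesn't want it: drop it */
	return 0;
}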
@@ -844,7 +867,6 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
844 if (IS_ERR(realm)) 867 if (IS_ERR(realm))
845 goto out; 868 goto out;
846 } 869 }
847 ceph_get_snap_realm(mdsc, realm);
848 870
849 dout("splitting snap_realm %llx %p\n", realm->ino, realm); 871 dout("splitting snap_realm %llx %p\n", realm->ino, realm);
850 for (i = 0; i < num_split_inos; i++) { 872 for (i = 0; i < num_split_inos; i++) {
@@ -905,7 +927,7 @@ skip_inode:
905 /* we may have taken some of the old realm's children. */ 927 /* we may have taken some of the old realm's children. */
906 for (i = 0; i < num_split_realms; i++) { 928 for (i = 0; i < num_split_realms; i++) {
907 struct ceph_snap_realm *child = 929 struct ceph_snap_realm *child =
908 ceph_lookup_snap_realm(mdsc, 930 __lookup_snap_realm(mdsc,
909 le64_to_cpu(split_realms[i])); 931 le64_to_cpu(split_realms[i]));
910 if (!child) 932 if (!child)
911 continue; 933 continue;
@@ -918,7 +940,7 @@ skip_inode:
918 * snap, we can avoid queueing cap_snaps. 940 * snap, we can avoid queueing cap_snaps.
919 */ 941 */
920 ceph_update_snap_trace(mdsc, p, e, 942 ceph_update_snap_trace(mdsc, p, e,
921 op == CEPH_SNAP_OP_DESTROY); 943 op == CEPH_SNAP_OP_DESTROY, NULL);
922 944
923 if (op == CEPH_SNAP_OP_SPLIT) 945 if (op == CEPH_SNAP_OP_SPLIT)
924 /* we took a reference when we created the realm, above */ 946 /* we took a reference when we created the realm, above */
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 50f06cddc94b..a63997b8bcff 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -40,17 +40,6 @@ static void ceph_put_super(struct super_block *s)
40 40
41 dout("put_super\n"); 41 dout("put_super\n");
42 ceph_mdsc_close_sessions(fsc->mdsc); 42 ceph_mdsc_close_sessions(fsc->mdsc);
43
44 /*
45 * ensure we release the bdi before put_anon_super releases
46 * the device name.
47 */
48 if (s->s_bdi == &fsc->backing_dev_info) {
49 bdi_unregister(&fsc->backing_dev_info);
50 s->s_bdi = NULL;
51 }
52
53 return;
54} 43}
55 44
56static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) 45static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
@@ -425,6 +414,10 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
425 seq_puts(m, ",noshare"); 414 seq_puts(m, ",noshare");
426 if (opt->flags & CEPH_OPT_NOCRC) 415 if (opt->flags & CEPH_OPT_NOCRC)
427 seq_puts(m, ",nocrc"); 416 seq_puts(m, ",nocrc");
417 if (opt->flags & CEPH_OPT_NOMSGAUTH)
418 seq_puts(m, ",nocephx_require_signatures");
419 if ((opt->flags & CEPH_OPT_TCP_NODELAY) == 0)
420 seq_puts(m, ",notcp_nodelay");
428 421
429 if (opt->name) 422 if (opt->name)
430 seq_printf(m, ",name=%s", opt->name); 423 seq_printf(m, ",name=%s", opt->name);
@@ -910,7 +903,7 @@ static int ceph_register_bdi(struct super_block *sb,
910 >> PAGE_SHIFT; 903 >> PAGE_SHIFT;
911 else 904 else
912 fsc->backing_dev_info.ra_pages = 905 fsc->backing_dev_info.ra_pages =
913 default_backing_dev_info.ra_pages; 906 VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE;
914 907
915 err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%ld", 908 err = bdi_register(&fsc->backing_dev_info, NULL, "ceph-%ld",
916 atomic_long_inc_return(&bdi_seq)); 909 atomic_long_inc_return(&bdi_seq));
@@ -1002,11 +995,16 @@ out_final:
1002static void ceph_kill_sb(struct super_block *s) 995static void ceph_kill_sb(struct super_block *s)
1003{ 996{
1004 struct ceph_fs_client *fsc = ceph_sb_to_client(s); 997 struct ceph_fs_client *fsc = ceph_sb_to_client(s);
998 dev_t dev = s->s_dev;
999
1005 dout("kill_sb %p\n", s); 1000 dout("kill_sb %p\n", s);
1001
1006 ceph_mdsc_pre_umount(fsc->mdsc); 1002 ceph_mdsc_pre_umount(fsc->mdsc);
1007 kill_anon_super(s); /* will call put_super after sb is r/o */ 1003 generic_shutdown_super(s);
1008 ceph_mdsc_destroy(fsc); 1004 ceph_mdsc_destroy(fsc);
1005
1009 destroy_fs_client(fsc); 1006 destroy_fs_client(fsc);
1007 free_anon_bdev(dev);
1010} 1008}
1011 1009
1012static struct file_system_type ceph_fs_type = { 1010static struct file_system_type ceph_fs_type = {
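
kill_anon_super() is essentially generic_shutdown_super() followed by free_anon_bdev(s->s_dev), so open-coding the two halves lets ceph destroy the mds client and fs client while the anonymous dev_t is still reserved. A sketch of the ordering, using a hypothetical example_kill_sb():

#include <linux/fs.h>

static void example_kill_sb(struct super_block *s)
{
	dev_t dev = s->s_dev;	/* the sb is gone after shutdown: save it */

	generic_shutdown_super(s);
	/* ... tear down fs-private state that must outlive the sb ... */
	free_anon_bdev(dev);	/* release the anonymous dev_t last */
}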
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index e1aa32d0759d..04c8124ed30e 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -693,7 +693,8 @@ extern void ceph_get_snap_realm(struct ceph_mds_client *mdsc,
693extern void ceph_put_snap_realm(struct ceph_mds_client *mdsc, 693extern void ceph_put_snap_realm(struct ceph_mds_client *mdsc,
694 struct ceph_snap_realm *realm); 694 struct ceph_snap_realm *realm);
695extern int ceph_update_snap_trace(struct ceph_mds_client *m, 695extern int ceph_update_snap_trace(struct ceph_mds_client *m,
696 void *p, void *e, bool deletion); 696 void *p, void *e, bool deletion,
697 struct ceph_snap_realm **realm_ret);
697extern void ceph_handle_snap(struct ceph_mds_client *mdsc, 698extern void ceph_handle_snap(struct ceph_mds_client *mdsc,
698 struct ceph_mds_session *session, 699 struct ceph_mds_session *session,
699 struct ceph_msg *msg); 700 struct ceph_msg *msg);
@@ -892,7 +893,9 @@ extern void ceph_fill_inline_data(struct inode *inode, struct page *locked_page,
892int ceph_uninline_data(struct file *filp, struct page *locked_page); 893int ceph_uninline_data(struct file *filp, struct page *locked_page);
893/* dir.c */ 894/* dir.c */
894extern const struct file_operations ceph_dir_fops; 895extern const struct file_operations ceph_dir_fops;
896extern const struct file_operations ceph_snapdir_fops;
895extern const struct inode_operations ceph_dir_iops; 897extern const struct inode_operations ceph_dir_iops;
898extern const struct inode_operations ceph_snapdir_iops;
896extern const struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops, 899extern const struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops,
897 ceph_snapdir_dentry_ops; 900 ceph_snapdir_dentry_ops;
898 901
diff --git a/fs/char_dev.c b/fs/char_dev.c
index 67b2007f10fe..ea06a3d0364c 100644
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -24,27 +24,6 @@
24 24
25#include "internal.h" 25#include "internal.h"
26 26
27/*
28 * capabilities for /dev/mem, /dev/kmem and similar directly mappable character
29 * devices
30 * - permits shared-mmap for read, write and/or exec
31 * - does not permit private mmap in NOMMU mode (can't do COW)
32 * - no readahead or I/O queue unplugging required
33 */
34struct backing_dev_info directly_mappable_cdev_bdi = {
35 .name = "char",
36 .capabilities = (
37#ifdef CONFIG_MMU
38 /* permit private copies of the data to be taken */
39 BDI_CAP_MAP_COPY |
40#endif
41 /* permit direct mmap, for read, write or exec */
42 BDI_CAP_MAP_DIRECT |
43 BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP |
44 /* no writeback happens */
45 BDI_CAP_NO_ACCT_AND_WRITEBACK),
46};
47
48static struct kobj_map *cdev_map; 27static struct kobj_map *cdev_map;
49 28
50static DEFINE_MUTEX(chrdevs_lock); 29static DEFINE_MUTEX(chrdevs_lock);
@@ -575,8 +554,6 @@ static struct kobject *base_probe(dev_t dev, int *part, void *data)
575void __init chrdev_init(void) 554void __init chrdev_init(void)
576{ 555{
577 cdev_map = kobj_map_init(base_probe, &chrdevs_lock); 556 cdev_map = kobj_map_init(base_probe, &chrdevs_lock);
578 if (bdi_init(&directly_mappable_cdev_bdi))
579 panic("Failed to init directly mappable cdev bdi");
580} 557}
581 558
582 559
@@ -590,4 +567,3 @@ EXPORT_SYMBOL(cdev_del);
590EXPORT_SYMBOL(cdev_add); 567EXPORT_SYMBOL(cdev_add);
591EXPORT_SYMBOL(__register_chrdev); 568EXPORT_SYMBOL(__register_chrdev);
592EXPORT_SYMBOL(__unregister_chrdev); 569EXPORT_SYMBOL(__unregister_chrdev);
593EXPORT_SYMBOL(directly_mappable_cdev_bdi);
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c
index 9c56ef776407..7febcf2475c5 100644
--- a/fs/cifs/cifs_debug.c
+++ b/fs/cifs/cifs_debug.c
@@ -606,9 +606,11 @@ cifs_security_flags_handle_must_flags(unsigned int *flags)
606 *flags = CIFSSEC_MUST_NTLMV2; 606 *flags = CIFSSEC_MUST_NTLMV2;
607 else if ((*flags & CIFSSEC_MUST_NTLM) == CIFSSEC_MUST_NTLM) 607 else if ((*flags & CIFSSEC_MUST_NTLM) == CIFSSEC_MUST_NTLM)
608 *flags = CIFSSEC_MUST_NTLM; 608 *flags = CIFSSEC_MUST_NTLM;
609 else if ((*flags & CIFSSEC_MUST_LANMAN) == CIFSSEC_MUST_LANMAN) 609 else if (CIFSSEC_MUST_LANMAN &&
610 (*flags & CIFSSEC_MUST_LANMAN) == CIFSSEC_MUST_LANMAN)
610 *flags = CIFSSEC_MUST_LANMAN; 611 *flags = CIFSSEC_MUST_LANMAN;
611 else if ((*flags & CIFSSEC_MUST_PLNTXT) == CIFSSEC_MUST_PLNTXT) 612 else if (CIFSSEC_MUST_PLNTXT &&
613 (*flags & CIFSSEC_MUST_PLNTXT) == CIFSSEC_MUST_PLNTXT)
612 *flags = CIFSSEC_MUST_PLNTXT; 614 *flags = CIFSSEC_MUST_PLNTXT;
613 615
614 *flags |= signflags; 616 *flags |= signflags;
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 6e139111fdb2..22b289a3b1c4 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -661,16 +661,16 @@ set_credits(struct TCP_Server_Info *server, const int val)
661 server->ops->set_credits(server, val); 661 server->ops->set_credits(server, val);
662} 662}
663 663
664static inline __u64 664static inline __le64
665get_next_mid64(struct TCP_Server_Info *server) 665get_next_mid64(struct TCP_Server_Info *server)
666{ 666{
667 return server->ops->get_next_mid(server); 667 return cpu_to_le64(server->ops->get_next_mid(server));
668} 668}
669 669
670static inline __le16 670static inline __le16
671get_next_mid(struct TCP_Server_Info *server) 671get_next_mid(struct TCP_Server_Info *server)
672{ 672{
673 __u16 mid = get_next_mid64(server); 673 __u16 mid = server->ops->get_next_mid(server);
674 /* 674 /*
675 * The value in the SMB header should be little endian for easy 675 * The value in the SMB header should be little endian for easy
676 * on-the-wire decoding. 676 * on-the-wire decoding.
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 2a772da16b83..d3aa999ab785 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -3446,7 +3446,7 @@ cifs_mount(struct cifs_sb_info *cifs_sb, struct smb_vol *volume_info)
3446 int referral_walks_count = 0; 3446 int referral_walks_count = 0;
3447#endif 3447#endif
3448 3448
3449 rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs", BDI_CAP_MAP_COPY); 3449 rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs");
3450 if (rc) 3450 if (rc)
3451 return rc; 3451 return rc;
3452 3452
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 96b7e9b7706d..a94b3e673182 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -366,6 +366,7 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
366 struct cifsLockInfo *li, *tmp; 366 struct cifsLockInfo *li, *tmp;
367 struct cifs_fid fid; 367 struct cifs_fid fid;
368 struct cifs_pending_open open; 368 struct cifs_pending_open open;
369 bool oplock_break_cancelled;
369 370
370 spin_lock(&cifs_file_list_lock); 371 spin_lock(&cifs_file_list_lock);
371 if (--cifs_file->count > 0) { 372 if (--cifs_file->count > 0) {
@@ -397,7 +398,7 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
397 } 398 }
398 spin_unlock(&cifs_file_list_lock); 399 spin_unlock(&cifs_file_list_lock);
399 400
400 cancel_work_sync(&cifs_file->oplock_break); 401 oplock_break_cancelled = cancel_work_sync(&cifs_file->oplock_break);
401 402
402 if (!tcon->need_reconnect && !cifs_file->invalidHandle) { 403 if (!tcon->need_reconnect && !cifs_file->invalidHandle) {
403 struct TCP_Server_Info *server = tcon->ses->server; 404 struct TCP_Server_Info *server = tcon->ses->server;
@@ -409,6 +410,9 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file)
409 _free_xid(xid); 410 _free_xid(xid);
410 } 411 }
411 412
413 if (oplock_break_cancelled)
414 cifs_done_oplock_break(cifsi);
415
412 cifs_del_pending_open(&open); 416 cifs_del_pending_open(&open);
413 417
414 /* 418 /*
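
cancel_work_sync() returns true when the work item was still pending, i.e. its handler never ran; in that case the handler's completion step (here cifs_done_oplock_break()) becomes the canceller's responsibility. A minimal sketch with a hypothetical finish callback:

#include <linux/workqueue.h>

static void finish_oplock_break(void *ctx);	/* hypothetical completion step */

static void teardown(struct work_struct *work, void *ctx)
{
	/*
	 * true return: the handler was cancelled before it ran, so the
	 * completion it would have performed is now on us.
	 */
	if (cancel_work_sync(work))
		finish_oplock_break(ctx);
}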
@@ -1109,11 +1113,6 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile)
1109 return rc; 1113 return rc;
1110} 1114}
1111 1115
1112/* copied from fs/locks.c with a name change */
1113#define cifs_for_each_lock(inode, lockp) \
1114 for (lockp = &inode->i_flock; *lockp != NULL; \
1115 lockp = &(*lockp)->fl_next)
1116
1117struct lock_to_push { 1116struct lock_to_push {
1118 struct list_head llist; 1117 struct list_head llist;
1119 __u64 offset; 1118 __u64 offset;
@@ -1128,8 +1127,9 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
1128{ 1127{
1129 struct inode *inode = cfile->dentry->d_inode; 1128 struct inode *inode = cfile->dentry->d_inode;
1130 struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); 1129 struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
1131 struct file_lock *flock, **before; 1130 struct file_lock *flock;
1132 unsigned int count = 0, i = 0; 1131 struct file_lock_context *flctx = inode->i_flctx;
1132 unsigned int count = 0, i;
1133 int rc = 0, xid, type; 1133 int rc = 0, xid, type;
1134 struct list_head locks_to_send, *el; 1134 struct list_head locks_to_send, *el;
1135 struct lock_to_push *lck, *tmp; 1135 struct lock_to_push *lck, *tmp;
@@ -1137,12 +1137,14 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
1137 1137
1138 xid = get_xid(); 1138 xid = get_xid();
1139 1139
1140 spin_lock(&inode->i_lock); 1140 if (!flctx)
1141 cifs_for_each_lock(inode, before) { 1141 goto out;
1142 if ((*before)->fl_flags & FL_POSIX) 1142
1143 count++; 1143 spin_lock(&flctx->flc_lock);
1144 list_for_each(el, &flctx->flc_posix) {
1145 count++;
1144 } 1146 }
1145 spin_unlock(&inode->i_lock); 1147 spin_unlock(&flctx->flc_lock);
1146 1148
1147 INIT_LIST_HEAD(&locks_to_send); 1149 INIT_LIST_HEAD(&locks_to_send);
1148 1150
@@ -1151,7 +1153,7 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
1151 * added to the list while we are holding cinode->lock_sem that 1153 * added to the list while we are holding cinode->lock_sem that
1152 * protects locking operations of this inode. 1154 * protects locking operations of this inode.
1153 */ 1155 */
1154 for (; i < count; i++) { 1156 for (i = 0; i < count; i++) {
1155 lck = kmalloc(sizeof(struct lock_to_push), GFP_KERNEL); 1157 lck = kmalloc(sizeof(struct lock_to_push), GFP_KERNEL);
1156 if (!lck) { 1158 if (!lck) {
1157 rc = -ENOMEM; 1159 rc = -ENOMEM;
@@ -1161,11 +1163,8 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
1161 } 1163 }
1162 1164
1163 el = locks_to_send.next; 1165 el = locks_to_send.next;
1164 spin_lock(&inode->i_lock); 1166 spin_lock(&flctx->flc_lock);
1165 cifs_for_each_lock(inode, before) { 1167 list_for_each_entry(flock, &flctx->flc_posix, fl_list) {
1166 flock = *before;
1167 if ((flock->fl_flags & FL_POSIX) == 0)
1168 continue;
1169 if (el == &locks_to_send) { 1168 if (el == &locks_to_send) {
1170 /* 1169 /*
1171 * The list ended. We don't have enough allocated 1170 * The list ended. We don't have enough allocated
@@ -1185,9 +1184,8 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile)
1185 lck->length = length; 1184 lck->length = length;
1186 lck->type = type; 1185 lck->type = type;
1187 lck->offset = flock->fl_start; 1186 lck->offset = flock->fl_start;
1188 el = el->next;
1189 } 1187 }
1190 spin_unlock(&inode->i_lock); 1188 spin_unlock(&flctx->flc_lock);
1191 1189
1192 list_for_each_entry_safe(lck, tmp, &locks_to_send, llist) { 1190 list_for_each_entry_safe(lck, tmp, &locks_to_send, llist) {
1193 int stored_rc; 1191 int stored_rc;
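
These hunks move cifs off the old inode->i_flock list (walked under i_lock) onto the per-inode file_lock_context: POSIX locks live on flctx->flc_posix, linked by fl_list and protected by flc_lock, so the FL_POSIX filter disappears. A sketch of the new iteration, assuming the field names of this kernel series:

#include <linux/fs.h>

static unsigned int count_posix_locks(struct inode *inode)
{
	struct file_lock_context *flctx = inode->i_flctx;
	struct file_lock *fl;
	unsigned int count = 0;

	if (!flctx)		/* no lock context: no locks to count */
		return 0;

	spin_lock(&flctx->flc_lock);
	list_for_each_entry(fl, &flctx->flc_posix, fl_list)
		count++;	/* flc_posix holds only POSIX locks */
	spin_unlock(&flctx->flc_lock);
	return count;
}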
@@ -3244,7 +3242,6 @@ static struct vm_operations_struct cifs_file_vm_ops = {
3244 .fault = filemap_fault, 3242 .fault = filemap_fault,
3245 .map_pages = filemap_map_pages, 3243 .map_pages = filemap_map_pages,
3246 .page_mkwrite = cifs_page_mkwrite, 3244 .page_mkwrite = cifs_page_mkwrite,
3247 .remap_pages = generic_file_remap_pages,
3248}; 3245};
3249 3246
3250int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma) 3247int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma)
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 0c3ce464cae4..2d4f37235ed0 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -937,8 +937,6 @@ retry_iget5_locked:
937 inode->i_flags |= S_NOATIME | S_NOCMTIME; 937 inode->i_flags |= S_NOATIME | S_NOCMTIME;
938 if (inode->i_state & I_NEW) { 938 if (inode->i_state & I_NEW) {
939 inode->i_ino = hash; 939 inode->i_ino = hash;
940 if (S_ISREG(inode->i_mode))
941 inode->i_data.backing_dev_info = sb->s_bdi;
942#ifdef CONFIG_CIFS_FSCACHE 940#ifdef CONFIG_CIFS_FSCACHE
943 /* initialize per-inode cache cookie pointer */ 941 /* initialize per-inode cache cookie pointer */
944 CIFS_I(inode)->fscache = NULL; 942 CIFS_I(inode)->fscache = NULL;
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index 45cb59bcc791..8b7898b7670f 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -86,21 +86,16 @@ static long cifs_ioctl_clone(unsigned int xid, struct file *dst_file,
86 } 86 }
87 87
88 src_inode = file_inode(src_file.file); 88 src_inode = file_inode(src_file.file);
89 rc = -EINVAL;
90 if (S_ISDIR(src_inode->i_mode))
91 goto out_fput;
89 92
90 /* 93 /*
91 * Note: cifs case is easier than btrfs since server responsible for 94 * Note: cifs case is easier than btrfs since server responsible for
92 * checks for proper open modes and file type and if it wants 95 * checks for proper open modes and file type and if it wants
93 * server could even support copy of range where source = target 96 * server could even support copy of range where source = target
94 */ 97 */
95 98 lock_two_nondirectories(target_inode, src_inode);
96 /* so we do not deadlock racing two ioctls on same files */
97 if (target_inode < src_inode) {
98 mutex_lock_nested(&target_inode->i_mutex, I_MUTEX_PARENT);
99 mutex_lock_nested(&src_inode->i_mutex, I_MUTEX_CHILD);
100 } else {
101 mutex_lock_nested(&src_inode->i_mutex, I_MUTEX_PARENT);
102 mutex_lock_nested(&target_inode->i_mutex, I_MUTEX_CHILD);
103 }
104 99
105 /* determine range to clone */ 100 /* determine range to clone */
106 rc = -EINVAL; 101 rc = -EINVAL;
@@ -124,13 +119,7 @@ static long cifs_ioctl_clone(unsigned int xid, struct file *dst_file,
124out_unlock: 119out_unlock:
125 /* although unlocking in the reverse order from locking is not 120 /* although unlocking in the reverse order from locking is not
 126	 strictly necessary here, it is a little cleaner to be consistent */ 121	 strictly necessary here, it is a little cleaner to be consistent */
127 if (target_inode < src_inode) { 122 unlock_two_nondirectories(src_inode, target_inode);
128 mutex_unlock(&src_inode->i_mutex);
129 mutex_unlock(&target_inode->i_mutex);
130 } else {
131 mutex_unlock(&target_inode->i_mutex);
132 mutex_unlock(&src_inode->i_mutex);
133 }
134out_fput: 123out_fput:
135 fdput(src_file); 124 fdput(src_file);
136out_drop_write: 125out_drop_write:
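
lock_two_nondirectories() replaces the hand-rolled "compare pointers, then mutex_lock_nested() in parent/child order" dance with the VFS helper that already implements a stable lock order; hence the new S_ISDIR guard, since the helper is only valid for non-directories. A hedged sketch:

#include <linux/fs.h>

static int clone_range(struct inode *src, struct inode *dst)
{
	if (S_ISDIR(src->i_mode) || S_ISDIR(dst->i_mode))
		return -EINVAL;		/* helper is for non-directories only */

	lock_two_nondirectories(src, dst);	/* stable order, deadlock-free */
	/* ... perform the clone while both inodes are locked ... */
	unlock_two_nondirectories(src, dst);
	return 0;
}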
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index b333ff60781d..abae6dd2c6b9 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -926,6 +926,7 @@ cifs_NTtimeToUnix(__le64 ntutc)
926 926
927 /* Subtract the NTFS time offset, then convert to 1s intervals. */ 927 /* Subtract the NTFS time offset, then convert to 1s intervals. */
928 s64 t = le64_to_cpu(ntutc) - NTFS_TIME_OFFSET; 928 s64 t = le64_to_cpu(ntutc) - NTFS_TIME_OFFSET;
929 u64 abs_t;
929 930
930 /* 931 /*
931 * Unfortunately can not use normal 64 bit division on 32 bit arch, but 932 * Unfortunately can not use normal 64 bit division on 32 bit arch, but
@@ -933,13 +934,14 @@ cifs_NTtimeToUnix(__le64 ntutc)
933 * to special case them 934 * to special case them
934 */ 935 */
935 if (t < 0) { 936 if (t < 0) {
936 t = -t; 937 abs_t = -t;
937 ts.tv_nsec = (long)(do_div(t, 10000000) * 100); 938 ts.tv_nsec = (long)(do_div(abs_t, 10000000) * 100);
938 ts.tv_nsec = -ts.tv_nsec; 939 ts.tv_nsec = -ts.tv_nsec;
939 ts.tv_sec = -t; 940 ts.tv_sec = -abs_t;
940 } else { 941 } else {
941 ts.tv_nsec = (long)do_div(t, 10000000) * 100; 942 abs_t = t;
942 ts.tv_sec = t; 943 ts.tv_nsec = (long)do_div(abs_t, 10000000) * 100;
944 ts.tv_sec = abs_t;
943 } 945 }
944 946
945 return ts; 947 return ts;
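
do_div() takes an unsigned 64-bit dividend (it is a macro that rewrites its first argument), so the old code's "t = -t; do_div(t, ...)" on an s64 relied on an implicit reinterpretation; the new abs_t makes the signed/unsigned split explicit. A sketch of the corrected split, mirroring the logic above:

#include <linux/math64.h>
#include <linux/types.h>

static void split_ns(s64 t, s64 *sec, long *nsec)
{
	u64 abs_t;

	if (t < 0) {
		abs_t = -t;		/* divide the magnitude, unsigned */
		*nsec = -(long)(do_div(abs_t, 10000000) * 100);
		*sec = -(s64)abs_t;
	} else {
		abs_t = t;
		*nsec = (long)do_div(abs_t, 10000000) * 100;
		*sec = abs_t;
	}
}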
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 8eaf20a80649..c295338e0a98 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -69,7 +69,8 @@ static inline void dump_cifs_file_struct(struct file *file, char *label)
69 * Attempt to preload the dcache with the results from the FIND_FIRST/NEXT 69 * Attempt to preload the dcache with the results from the FIND_FIRST/NEXT
70 * 70 *
71 * Find the dentry that matches "name". If there isn't one, create one. If it's 71 * Find the dentry that matches "name". If there isn't one, create one. If it's
 72	 * a negative dentry or the uniqueid changed, then drop it and recreate it. 72	 * a negative dentry or the uniqueid or filetype (mode) changed,
73 * then drop it and recreate it.
73 */ 74 */
74static void 75static void
75cifs_prime_dcache(struct dentry *parent, struct qstr *name, 76cifs_prime_dcache(struct dentry *parent, struct qstr *name,
@@ -97,8 +98,11 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name,
97 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)) 98 if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM))
98 fattr->cf_uniqueid = CIFS_I(inode)->uniqueid; 99 fattr->cf_uniqueid = CIFS_I(inode)->uniqueid;
99 100
100 /* update inode in place if i_ino didn't change */ 101 /* update inode in place
 101	 if (CIFS_I(inode)->uniqueid == fattr->cf_uniqueid) { 102	 * if neither i_ino nor i_mode changed */
103 if (CIFS_I(inode)->uniqueid == fattr->cf_uniqueid &&
104 (inode->i_mode & S_IFMT) ==
105 (fattr->cf_mode & S_IFMT)) {
102 cifs_fattr_to_inode(inode, fattr); 106 cifs_fattr_to_inode(inode, fattr);
103 goto out; 107 goto out;
104 } 108 }
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index f1cefc9763ed..689f035915cf 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -32,12 +32,14 @@
32static int 32static int
33check_smb2_hdr(struct smb2_hdr *hdr, __u64 mid) 33check_smb2_hdr(struct smb2_hdr *hdr, __u64 mid)
34{ 34{
35 __u64 wire_mid = le64_to_cpu(hdr->MessageId);
36
35 /* 37 /*
36 * Make sure that this really is an SMB, that it is a response, 38 * Make sure that this really is an SMB, that it is a response,
37 * and that the message ids match. 39 * and that the message ids match.
38 */ 40 */
39 if ((*(__le32 *)hdr->ProtocolId == SMB2_PROTO_NUMBER) && 41 if ((*(__le32 *)hdr->ProtocolId == SMB2_PROTO_NUMBER) &&
40 (mid == hdr->MessageId)) { 42 (mid == wire_mid)) {
41 if (hdr->Flags & SMB2_FLAGS_SERVER_TO_REDIR) 43 if (hdr->Flags & SMB2_FLAGS_SERVER_TO_REDIR)
42 return 0; 44 return 0;
43 else { 45 else {
@@ -51,11 +53,11 @@ check_smb2_hdr(struct smb2_hdr *hdr, __u64 mid)
51 if (*(__le32 *)hdr->ProtocolId != SMB2_PROTO_NUMBER) 53 if (*(__le32 *)hdr->ProtocolId != SMB2_PROTO_NUMBER)
52 cifs_dbg(VFS, "Bad protocol string signature header %x\n", 54 cifs_dbg(VFS, "Bad protocol string signature header %x\n",
53 *(unsigned int *) hdr->ProtocolId); 55 *(unsigned int *) hdr->ProtocolId);
54 if (mid != hdr->MessageId) 56 if (mid != wire_mid)
55 cifs_dbg(VFS, "Mids do not match: %llu and %llu\n", 57 cifs_dbg(VFS, "Mids do not match: %llu and %llu\n",
56 mid, hdr->MessageId); 58 mid, wire_mid);
57 } 59 }
58 cifs_dbg(VFS, "Bad SMB detected. The Mid=%llu\n", hdr->MessageId); 60 cifs_dbg(VFS, "Bad SMB detected. The Mid=%llu\n", wire_mid);
59 return 1; 61 return 1;
60} 62}
61 63
@@ -95,7 +97,7 @@ smb2_check_message(char *buf, unsigned int length)
95{ 97{
96 struct smb2_hdr *hdr = (struct smb2_hdr *)buf; 98 struct smb2_hdr *hdr = (struct smb2_hdr *)buf;
97 struct smb2_pdu *pdu = (struct smb2_pdu *)hdr; 99 struct smb2_pdu *pdu = (struct smb2_pdu *)hdr;
98 __u64 mid = hdr->MessageId; 100 __u64 mid = le64_to_cpu(hdr->MessageId);
99 __u32 len = get_rfc1002_length(buf); 101 __u32 len = get_rfc1002_length(buf);
100 __u32 clc_len; /* calculated length */ 102 __u32 clc_len; /* calculated length */
101 int command; 103 int command;
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 93fd0586f9ec..96b5d40a2ece 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -176,10 +176,11 @@ smb2_find_mid(struct TCP_Server_Info *server, char *buf)
176{ 176{
177 struct mid_q_entry *mid; 177 struct mid_q_entry *mid;
178 struct smb2_hdr *hdr = (struct smb2_hdr *)buf; 178 struct smb2_hdr *hdr = (struct smb2_hdr *)buf;
179 __u64 wire_mid = le64_to_cpu(hdr->MessageId);
179 180
180 spin_lock(&GlobalMid_Lock); 181 spin_lock(&GlobalMid_Lock);
181 list_for_each_entry(mid, &server->pending_mid_q, qhead) { 182 list_for_each_entry(mid, &server->pending_mid_q, qhead) {
182 if ((mid->mid == hdr->MessageId) && 183 if ((mid->mid == wire_mid) &&
183 (mid->mid_state == MID_REQUEST_SUBMITTED) && 184 (mid->mid_state == MID_REQUEST_SUBMITTED) &&
184 (mid->command == hdr->Command)) { 185 (mid->command == hdr->Command)) {
185 spin_unlock(&GlobalMid_Lock); 186 spin_unlock(&GlobalMid_Lock);
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index ce858477002a..70867d54fb8b 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -110,7 +110,7 @@ struct smb2_hdr {
110 __le16 CreditRequest; /* CreditResponse */ 110 __le16 CreditRequest; /* CreditResponse */
111 __le32 Flags; 111 __le32 Flags;
112 __le32 NextCommand; 112 __le32 NextCommand;
113 __u64 MessageId; /* opaque - so can stay little endian */ 113 __le64 MessageId;
114 __le32 ProcessId; 114 __le32 ProcessId;
115 __u32 TreeId; /* opaque - so do not make little endian */ 115 __u32 TreeId; /* opaque - so do not make little endian */
116 __u64 SessionId; /* opaque - so do not make little endian */ 116 __u64 SessionId; /* opaque - so do not make little endian */
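
Declaring MessageId as an opaque __u64 only works as long as nothing compares or orders it in host byte order; smb2_find_mid() and check_smb2_hdr() do exactly that, which breaks on big-endian hosts. Marking the field __le64 and converting with le64_to_cpu()/cpu_to_le64() at the boundaries keeps the wire format fixed and the in-memory mid native. A minimal sketch:

#include <linux/types.h>
#include <asm/byteorder.h>

struct wire_hdr {
	__le64 message_id;	/* fixed little-endian on the wire */
};

static bool mid_matches(const struct wire_hdr *hdr, u64 mid)
{
	/* convert once at the boundary, compare in host order */
	return mid == le64_to_cpu(hdr->message_id);
}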
diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c
index 5111e7272db6..d4c5b6f109a7 100644
--- a/fs/cifs/smb2transport.c
+++ b/fs/cifs/smb2transport.c
@@ -490,7 +490,7 @@ smb2_mid_entry_alloc(const struct smb2_hdr *smb_buffer,
490 return temp; 490 return temp;
491 else { 491 else {
492 memset(temp, 0, sizeof(struct mid_q_entry)); 492 memset(temp, 0, sizeof(struct mid_q_entry));
493 temp->mid = smb_buffer->MessageId; /* always LE */ 493 temp->mid = le64_to_cpu(smb_buffer->MessageId);
494 temp->pid = current->pid; 494 temp->pid = current->pid;
495 temp->command = smb_buffer->Command; /* Always LE */ 495 temp->command = smb_buffer->Command; /* Always LE */
496 temp->when_alloc = jiffies; 496 temp->when_alloc = jiffies;
diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c
index 6c1566366a66..a4232ec4f2ba 100644
--- a/fs/cifs/smbencrypt.c
+++ b/fs/cifs/smbencrypt.c
@@ -221,7 +221,7 @@ E_md4hash(const unsigned char *passwd, unsigned char *p16,
221 } 221 }
222 222
223 rc = mdfour(p16, (unsigned char *) wpwd, len * sizeof(__le16)); 223 rc = mdfour(p16, (unsigned char *) wpwd, len * sizeof(__le16));
224 memset(wpwd, 0, 129 * sizeof(__le16)); 224 memzero_explicit(wpwd, sizeof(wpwd));
225 225
226 return rc; 226 return rc;
227} 227}
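
A plain memset() of a buffer that is dead afterwards may be optimized away as a dead store; memzero_explicit() is guaranteed to perform the write, which matters for password material like wpwd. Using sizeof(wpwd) instead of a hand-written 129 * sizeof(__le16) also keeps the size in sync with the declaration. A sketch:

#include <linux/string.h>
#include <linux/types.h>

static void use_secret(void)
{
	__le16 wpwd[129];

	/* ... fill and use the password buffer ... */

	memzero_explicit(wpwd, sizeof(wpwd));	/* store cannot be elided */
}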
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index 86c893884eb9..281ee011bb6a 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -28,29 +28,6 @@
28 28
29#include "coda_int.h" 29#include "coda_int.h"
30 30
31/* dir inode-ops */
32static int coda_create(struct inode *dir, struct dentry *new, umode_t mode, bool excl);
33static struct dentry *coda_lookup(struct inode *dir, struct dentry *target, unsigned int flags);
34static int coda_link(struct dentry *old_dentry, struct inode *dir_inode,
35 struct dentry *entry);
36static int coda_unlink(struct inode *dir_inode, struct dentry *entry);
37static int coda_symlink(struct inode *dir_inode, struct dentry *entry,
38 const char *symname);
39static int coda_mkdir(struct inode *dir_inode, struct dentry *entry, umode_t mode);
40static int coda_rmdir(struct inode *dir_inode, struct dentry *entry);
41static int coda_rename(struct inode *old_inode, struct dentry *old_dentry,
42 struct inode *new_inode, struct dentry *new_dentry);
43
44/* dir file-ops */
45static int coda_readdir(struct file *file, struct dir_context *ctx);
46
47/* dentry ops */
48static int coda_dentry_revalidate(struct dentry *de, unsigned int flags);
49static int coda_dentry_delete(const struct dentry *);
50
51/* support routines */
52static int coda_venus_readdir(struct file *, struct dir_context *);
53
54/* same as fs/bad_inode.c */ 31/* same as fs/bad_inode.c */
55static int coda_return_EIO(void) 32static int coda_return_EIO(void)
56{ 33{
@@ -58,38 +35,6 @@ static int coda_return_EIO(void)
58} 35}
59#define CODA_EIO_ERROR ((void *) (coda_return_EIO)) 36#define CODA_EIO_ERROR ((void *) (coda_return_EIO))
60 37
61const struct dentry_operations coda_dentry_operations =
62{
63 .d_revalidate = coda_dentry_revalidate,
64 .d_delete = coda_dentry_delete,
65};
66
67const struct inode_operations coda_dir_inode_operations =
68{
69 .create = coda_create,
70 .lookup = coda_lookup,
71 .link = coda_link,
72 .unlink = coda_unlink,
73 .symlink = coda_symlink,
74 .mkdir = coda_mkdir,
75 .rmdir = coda_rmdir,
76 .mknod = CODA_EIO_ERROR,
77 .rename = coda_rename,
78 .permission = coda_permission,
79 .getattr = coda_getattr,
80 .setattr = coda_setattr,
81};
82
83const struct file_operations coda_dir_operations = {
84 .llseek = generic_file_llseek,
85 .read = generic_read_dir,
86 .iterate = coda_readdir,
87 .open = coda_open,
88 .release = coda_release,
89 .fsync = coda_fsync,
90};
91
92
93/* inode operations for directories */ 38/* inode operations for directories */
94/* access routines: lookup, readlink, permission */ 39/* access routines: lookup, readlink, permission */
95static struct dentry *coda_lookup(struct inode *dir, struct dentry *entry, unsigned int flags) 40static struct dentry *coda_lookup(struct inode *dir, struct dentry *entry, unsigned int flags)
@@ -374,33 +319,6 @@ static int coda_rename(struct inode *old_dir, struct dentry *old_dentry,
374 return error; 319 return error;
375} 320}
376 321
377
378/* file operations for directories */
379static int coda_readdir(struct file *coda_file, struct dir_context *ctx)
380{
381 struct coda_file_info *cfi;
382 struct file *host_file;
383 int ret;
384
385 cfi = CODA_FTOC(coda_file);
386 BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
387 host_file = cfi->cfi_container;
388
389 if (host_file->f_op->iterate) {
390 struct inode *host_inode = file_inode(host_file);
391 mutex_lock(&host_inode->i_mutex);
392 ret = -ENOENT;
393 if (!IS_DEADDIR(host_inode)) {
394 ret = host_file->f_op->iterate(host_file, ctx);
395 file_accessed(host_file);
396 }
397 mutex_unlock(&host_inode->i_mutex);
398 return ret;
399 }
400 /* Venus: we must read Venus dirents from a file */
401 return coda_venus_readdir(coda_file, ctx);
402}
403
404static inline unsigned int CDT2DT(unsigned char cdt) 322static inline unsigned int CDT2DT(unsigned char cdt)
405{ 323{
406 unsigned int dt; 324 unsigned int dt;
@@ -495,6 +413,33 @@ out:
495 return 0; 413 return 0;
496} 414}
497 415
416/* file operations for directories */
417static int coda_readdir(struct file *coda_file, struct dir_context *ctx)
418{
419 struct coda_file_info *cfi;
420 struct file *host_file;
421 int ret;
422
423 cfi = CODA_FTOC(coda_file);
424 BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
425 host_file = cfi->cfi_container;
426
427 if (host_file->f_op->iterate) {
428 struct inode *host_inode = file_inode(host_file);
429
430 mutex_lock(&host_inode->i_mutex);
431 ret = -ENOENT;
432 if (!IS_DEADDIR(host_inode)) {
433 ret = host_file->f_op->iterate(host_file, ctx);
434 file_accessed(host_file);
435 }
436 mutex_unlock(&host_inode->i_mutex);
437 return ret;
438 }
439 /* Venus: we must read Venus dirents from a file */
440 return coda_venus_readdir(coda_file, ctx);
441}
442
498/* called when a cache lookup succeeds */ 443/* called when a cache lookup succeeds */
499static int coda_dentry_revalidate(struct dentry *de, unsigned int flags) 444static int coda_dentry_revalidate(struct dentry *de, unsigned int flags)
500{ 445{
@@ -603,3 +548,32 @@ int coda_revalidate_inode(struct inode *inode)
603 } 548 }
604 return 0; 549 return 0;
605} 550}
551
552const struct dentry_operations coda_dentry_operations = {
553 .d_revalidate = coda_dentry_revalidate,
554 .d_delete = coda_dentry_delete,
555};
556
557const struct inode_operations coda_dir_inode_operations = {
558 .create = coda_create,
559 .lookup = coda_lookup,
560 .link = coda_link,
561 .unlink = coda_unlink,
562 .symlink = coda_symlink,
563 .mkdir = coda_mkdir,
564 .rmdir = coda_rmdir,
565 .mknod = CODA_EIO_ERROR,
566 .rename = coda_rename,
567 .permission = coda_permission,
568 .getattr = coda_getattr,
569 .setattr = coda_setattr,
570};
571
572const struct file_operations coda_dir_operations = {
573 .llseek = generic_file_llseek,
574 .read = generic_read_dir,
575 .iterate = coda_readdir,
576 .open = coda_open,
577 .release = coda_release,
578 .fsync = coda_fsync,
579};
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index b945410bfcd5..82ec68b59208 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -183,7 +183,7 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent)
183 goto unlock_out; 183 goto unlock_out;
184 } 184 }
185 185
186 error = bdi_setup_and_register(&vc->bdi, "coda", BDI_CAP_MAP_COPY); 186 error = bdi_setup_and_register(&vc->bdi, "coda");
187 if (error) 187 if (error)
188 goto unlock_out; 188 goto unlock_out;
189 189
diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h
index bd4a3c167091..a315677e44d3 100644
--- a/fs/configfs/configfs_internal.h
+++ b/fs/configfs/configfs_internal.h
@@ -70,8 +70,6 @@ extern int configfs_is_root(struct config_item *item);
70 70
71extern struct inode * configfs_new_inode(umode_t mode, struct configfs_dirent *, struct super_block *); 71extern struct inode * configfs_new_inode(umode_t mode, struct configfs_dirent *, struct super_block *);
72extern int configfs_create(struct dentry *, umode_t mode, int (*init)(struct inode *)); 72extern int configfs_create(struct dentry *, umode_t mode, int (*init)(struct inode *));
73extern int configfs_inode_init(void);
74extern void configfs_inode_exit(void);
75 73
76extern int configfs_create_file(struct config_item *, const struct configfs_attribute *); 74extern int configfs_create_file(struct config_item *, const struct configfs_attribute *);
77extern int configfs_make_dirent(struct configfs_dirent *, 75extern int configfs_make_dirent(struct configfs_dirent *,
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index 5946ad98053f..65af86147154 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -50,12 +50,6 @@ static const struct address_space_operations configfs_aops = {
50 .write_end = simple_write_end, 50 .write_end = simple_write_end,
51}; 51};
52 52
53static struct backing_dev_info configfs_backing_dev_info = {
54 .name = "configfs",
55 .ra_pages = 0, /* No readahead */
56 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
57};
58
59static const struct inode_operations configfs_inode_operations ={ 53static const struct inode_operations configfs_inode_operations ={
60 .setattr = configfs_setattr, 54 .setattr = configfs_setattr,
61}; 55};
@@ -137,7 +131,6 @@ struct inode *configfs_new_inode(umode_t mode, struct configfs_dirent *sd,
137 if (inode) { 131 if (inode) {
138 inode->i_ino = get_next_ino(); 132 inode->i_ino = get_next_ino();
139 inode->i_mapping->a_ops = &configfs_aops; 133 inode->i_mapping->a_ops = &configfs_aops;
140 inode->i_mapping->backing_dev_info = &configfs_backing_dev_info;
141 inode->i_op = &configfs_inode_operations; 134 inode->i_op = &configfs_inode_operations;
142 135
143 if (sd->s_iattr) { 136 if (sd->s_iattr) {
@@ -283,13 +276,3 @@ void configfs_hash_and_remove(struct dentry * dir, const char * name)
283 } 276 }
284 mutex_unlock(&dir->d_inode->i_mutex); 277 mutex_unlock(&dir->d_inode->i_mutex);
285} 278}
286
287int __init configfs_inode_init(void)
288{
289 return bdi_init(&configfs_backing_dev_info);
290}
291
292void configfs_inode_exit(void)
293{
294 bdi_destroy(&configfs_backing_dev_info);
295}
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index f6c285833390..da94e41bdbf6 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -145,19 +145,13 @@ static int __init configfs_init(void)
145 if (!config_kobj) 145 if (!config_kobj)
146 goto out2; 146 goto out2;
147 147
148 err = configfs_inode_init();
149 if (err)
150 goto out3;
151
152 err = register_filesystem(&configfs_fs_type); 148 err = register_filesystem(&configfs_fs_type);
153 if (err) 149 if (err)
154 goto out4; 150 goto out3;
155 151
156 return 0; 152 return 0;
157out4:
158 pr_err("Unable to register filesystem!\n");
159 configfs_inode_exit();
160out3: 153out3:
154 pr_err("Unable to register filesystem!\n");
161 kobject_put(config_kobj); 155 kobject_put(config_kobj);
162out2: 156out2:
163 kmem_cache_destroy(configfs_dir_cachep); 157 kmem_cache_destroy(configfs_dir_cachep);
@@ -172,7 +166,6 @@ static void __exit configfs_exit(void)
172 kobject_put(config_kobj); 166 kobject_put(config_kobj);
173 kmem_cache_destroy(configfs_dir_cachep); 167 kmem_cache_destroy(configfs_dir_cachep);
174 configfs_dir_cachep = NULL; 168 configfs_dir_cachep = NULL;
175 configfs_inode_exit();
176} 169}
177 170
178MODULE_AUTHOR("Oracle"); 171MODULE_AUTHOR("Oracle");
diff --git a/fs/dax.c b/fs/dax.c
new file mode 100644
index 000000000000..ed1619ec6537
--- /dev/null
+++ b/fs/dax.c
@@ -0,0 +1,534 @@
1/*
2 * fs/dax.c - Direct Access filesystem code
3 * Copyright (c) 2013-2014 Intel Corporation
4 * Author: Matthew Wilcox <matthew.r.wilcox@intel.com>
5 * Author: Ross Zwisler <ross.zwisler@linux.intel.com>
6 *
7 * This program is free software; you can redistribute it and/or modify it
8 * under the terms and conditions of the GNU General Public License,
9 * version 2, as published by the Free Software Foundation.
10 *
11 * This program is distributed in the hope it will be useful, but WITHOUT
12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
14 * more details.
15 */
16
17#include <linux/atomic.h>
18#include <linux/blkdev.h>
19#include <linux/buffer_head.h>
20#include <linux/fs.h>
21#include <linux/genhd.h>
22#include <linux/highmem.h>
23#include <linux/memcontrol.h>
24#include <linux/mm.h>
25#include <linux/mutex.h>
26#include <linux/sched.h>
27#include <linux/uio.h>
28#include <linux/vmstat.h>
29
30int dax_clear_blocks(struct inode *inode, sector_t block, long size)
31{
32 struct block_device *bdev = inode->i_sb->s_bdev;
33 sector_t sector = block << (inode->i_blkbits - 9);
34
35 might_sleep();
36 do {
37 void *addr;
38 unsigned long pfn;
39 long count;
40
41 count = bdev_direct_access(bdev, sector, &addr, &pfn, size);
42 if (count < 0)
43 return count;
44 BUG_ON(size < count);
45 while (count > 0) {
46 unsigned pgsz = PAGE_SIZE - offset_in_page(addr);
47 if (pgsz > count)
48 pgsz = count;
49 if (pgsz < PAGE_SIZE)
50 memset(addr, 0, pgsz);
51 else
52 clear_page(addr);
53 addr += pgsz;
54 size -= pgsz;
55 count -= pgsz;
56 BUG_ON(pgsz & 511);
57 sector += pgsz / 512;
58 cond_resched();
59 }
60 } while (size);
61
62 return 0;
63}
64EXPORT_SYMBOL_GPL(dax_clear_blocks);
65
66static long dax_get_addr(struct buffer_head *bh, void **addr, unsigned blkbits)
67{
68 unsigned long pfn;
69 sector_t sector = bh->b_blocknr << (blkbits - 9);
70 return bdev_direct_access(bh->b_bdev, sector, addr, &pfn, bh->b_size);
71}
72
73static void dax_new_buf(void *addr, unsigned size, unsigned first, loff_t pos,
74 loff_t end)
75{
76 loff_t final = end - pos + first; /* The final byte of the buffer */
77
78 if (first > 0)
79 memset(addr, 0, first);
80 if (final < size)
81 memset(addr + final, 0, size - final);
82}
83
84static bool buffer_written(struct buffer_head *bh)
85{
86 return buffer_mapped(bh) && !buffer_unwritten(bh);
87}
88
89/*
90 * When ext4 encounters a hole, it returns without modifying the buffer_head
91 * which means that we can't trust b_size. To cope with this, we set b_state
92 * to 0 before calling get_block and, if any bit is set, we know we can trust
93 * b_size. Unfortunate, really, since ext4 knows precisely how long a hole is
94 * and would save us time calling get_block repeatedly.
95 */
96static bool buffer_size_valid(struct buffer_head *bh)
97{
98 return bh->b_state != 0;
99}
100
101static ssize_t dax_io(int rw, struct inode *inode, struct iov_iter *iter,
102 loff_t start, loff_t end, get_block_t get_block,
103 struct buffer_head *bh)
104{
105 ssize_t retval = 0;
106 loff_t pos = start;
107 loff_t max = start;
108 loff_t bh_max = start;
109 void *addr;
110 bool hole = false;
111
112 if (rw != WRITE)
113 end = min(end, i_size_read(inode));
114
115 while (pos < end) {
116 unsigned len;
117 if (pos == max) {
118 unsigned blkbits = inode->i_blkbits;
119 sector_t block = pos >> blkbits;
120 unsigned first = pos - (block << blkbits);
121 long size;
122
123 if (pos == bh_max) {
124 bh->b_size = PAGE_ALIGN(end - pos);
125 bh->b_state = 0;
126 retval = get_block(inode, block, bh,
127 rw == WRITE);
128 if (retval)
129 break;
130 if (!buffer_size_valid(bh))
131 bh->b_size = 1 << blkbits;
132 bh_max = pos - first + bh->b_size;
133 } else {
134 unsigned done = bh->b_size -
135 (bh_max - (pos - first));
136 bh->b_blocknr += done >> blkbits;
137 bh->b_size -= done;
138 }
139
140 hole = (rw != WRITE) && !buffer_written(bh);
141 if (hole) {
142 addr = NULL;
143 size = bh->b_size - first;
144 } else {
145 retval = dax_get_addr(bh, &addr, blkbits);
146 if (retval < 0)
147 break;
148 if (buffer_unwritten(bh) || buffer_new(bh))
149 dax_new_buf(addr, retval, first, pos,
150 end);
151 addr += first;
152 size = retval - first;
153 }
154 max = min(pos + size, end);
155 }
156
157 if (rw == WRITE)
158 len = copy_from_iter(addr, max - pos, iter);
159 else if (!hole)
160 len = copy_to_iter(addr, max - pos, iter);
161 else
162 len = iov_iter_zero(max - pos, iter);
163
164 if (!len)
165 break;
166
167 pos += len;
168 addr += len;
169 }
170
171 return (pos == start) ? retval : pos - start;
172}
173
174/**
175 * dax_do_io - Perform I/O to a DAX file
176 * @rw: READ to read or WRITE to write
177 * @iocb: The control block for this I/O
178 * @inode: The file which the I/O is directed at
179 * @iter: The addresses to do I/O from or to
180 * @pos: The file offset where the I/O starts
181 * @get_block: The filesystem method used to translate file offsets to blocks
182 * @end_io: A filesystem callback for I/O completion
183 * @flags: See below
184 *
185 * This function uses the same locking scheme as do_blockdev_direct_IO:
186 * If @flags has DIO_LOCKING set, we assume that the i_mutex is held by the
187 * caller for writes. For reads, we take and release the i_mutex ourselves.
188 * If DIO_LOCKING is not set, the filesystem takes care of its own locking.
189 * As with do_blockdev_direct_IO(), we increment i_dio_count while the I/O
190 * is in progress.
191 */
192ssize_t dax_do_io(int rw, struct kiocb *iocb, struct inode *inode,
193 struct iov_iter *iter, loff_t pos,
194 get_block_t get_block, dio_iodone_t end_io, int flags)
195{
196 struct buffer_head bh;
197 ssize_t retval = -EINVAL;
198 loff_t end = pos + iov_iter_count(iter);
199
200 memset(&bh, 0, sizeof(bh));
201
202 if ((flags & DIO_LOCKING) && (rw == READ)) {
203 struct address_space *mapping = inode->i_mapping;
204 mutex_lock(&inode->i_mutex);
205 retval = filemap_write_and_wait_range(mapping, pos, end - 1);
206 if (retval) {
207 mutex_unlock(&inode->i_mutex);
208 goto out;
209 }
210 }
211
212 /* Protects against truncate */
213 atomic_inc(&inode->i_dio_count);
214
215 retval = dax_io(rw, inode, iter, pos, end, get_block, &bh);
216
217 if ((flags & DIO_LOCKING) && (rw == READ))
218 mutex_unlock(&inode->i_mutex);
219
220 if ((retval > 0) && end_io)
221 end_io(iocb, pos, retval, bh.b_private);
222
223 inode_dio_done(inode);
224 out:
225 return retval;
226}
227EXPORT_SYMBOL_GPL(dax_do_io);
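
A hedged sketch of how a filesystem could wire its ->direct_IO method to dax_do_io() on this kernel's address_space_operations signature; my_direct_IO and my_get_block are hypothetical stand-ins for the fs's real callbacks, and no end_io completion is used:

#include <linux/fs.h>
#include <linux/uio.h>

static int my_get_block(struct inode *inode, sector_t block,
			struct buffer_head *bh, int create);	/* fs-provided */

static ssize_t my_direct_IO(int rw, struct kiocb *iocb,
			    struct iov_iter *iter, loff_t offset)
{
	struct inode *inode = file_inode(iocb->ki_filp);

	/* no end_io callback; dax_do_io handles i_mutex per DIO_LOCKING */
	return dax_do_io(rw, iocb, inode, iter, offset,
			 my_get_block, NULL, DIO_LOCKING);
}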
228
229/*
230 * The user has performed a load from a hole in the file. Allocating
231 * a new page in the file would cause excessive storage usage for
232 * workloads with sparse files. We allocate a page cache page instead.
233 * We'll kick it out of the page cache if it's ever written to,
234 * otherwise it will simply fall out of the page cache under memory
235 * pressure without ever having been dirtied.
236 */
237static int dax_load_hole(struct address_space *mapping, struct page *page,
238 struct vm_fault *vmf)
239{
240 unsigned long size;
241 struct inode *inode = mapping->host;
242 if (!page)
243 page = find_or_create_page(mapping, vmf->pgoff,
244 GFP_KERNEL | __GFP_ZERO);
245 if (!page)
246 return VM_FAULT_OOM;
247 /* Recheck i_size under page lock to avoid truncate race */
248 size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
249 if (vmf->pgoff >= size) {
250 unlock_page(page);
251 page_cache_release(page);
252 return VM_FAULT_SIGBUS;
253 }
254
255 vmf->page = page;
256 return VM_FAULT_LOCKED;
257}
258
259static int copy_user_bh(struct page *to, struct buffer_head *bh,
260 unsigned blkbits, unsigned long vaddr)
261{
262 void *vfrom, *vto;
263 if (dax_get_addr(bh, &vfrom, blkbits) < 0)
264 return -EIO;
265 vto = kmap_atomic(to);
266 copy_user_page(vto, vfrom, vaddr, to);
267 kunmap_atomic(vto);
268 return 0;
269}
270
271static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
272 struct vm_area_struct *vma, struct vm_fault *vmf)
273{
274 struct address_space *mapping = inode->i_mapping;
275 sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);
276 unsigned long vaddr = (unsigned long)vmf->virtual_address;
277 void *addr;
278 unsigned long pfn;
279 pgoff_t size;
280 int error;
281
282 i_mmap_lock_read(mapping);
283
284 /*
285 * Check truncate didn't happen while we were allocating a block.
 286	 * If it did, this block may or may not still be allocated to the
287 * file. We can't tell the filesystem to free it because we can't
288 * take i_mutex here. In the worst case, the file still has blocks
289 * allocated past the end of the file.
290 */
291 size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
292 if (unlikely(vmf->pgoff >= size)) {
293 error = -EIO;
294 goto out;
295 }
296
297 error = bdev_direct_access(bh->b_bdev, sector, &addr, &pfn, bh->b_size);
298 if (error < 0)
299 goto out;
300 if (error < PAGE_SIZE) {
301 error = -EIO;
302 goto out;
303 }
304
305 if (buffer_unwritten(bh) || buffer_new(bh))
306 clear_page(addr);
307
308 error = vm_insert_mixed(vma, vaddr, pfn);
309
310 out:
311 i_mmap_unlock_read(mapping);
312
313 if (bh->b_end_io)
314 bh->b_end_io(bh, 1);
315
316 return error;
317}
318
319static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
320 get_block_t get_block)
321{
322 struct file *file = vma->vm_file;
323 struct address_space *mapping = file->f_mapping;
324 struct inode *inode = mapping->host;
325 struct page *page;
326 struct buffer_head bh;
327 unsigned long vaddr = (unsigned long)vmf->virtual_address;
328 unsigned blkbits = inode->i_blkbits;
329 sector_t block;
330 pgoff_t size;
331 int error;
332 int major = 0;
333
334 size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
335 if (vmf->pgoff >= size)
336 return VM_FAULT_SIGBUS;
337
338 memset(&bh, 0, sizeof(bh));
339 block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits);
340 bh.b_size = PAGE_SIZE;
341
342 repeat:
343 page = find_get_page(mapping, vmf->pgoff);
344 if (page) {
345 if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) {
346 page_cache_release(page);
347 return VM_FAULT_RETRY;
348 }
349 if (unlikely(page->mapping != mapping)) {
350 unlock_page(page);
351 page_cache_release(page);
352 goto repeat;
353 }
354 size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
355 if (unlikely(vmf->pgoff >= size)) {
356 /*
357 * We have a struct page covering a hole in the file
358 * from a read fault and we've raced with a truncate
359 */
360 error = -EIO;
361 goto unlock_page;
362 }
363 }
364
365 error = get_block(inode, block, &bh, 0);
366 if (!error && (bh.b_size < PAGE_SIZE))
367 error = -EIO; /* fs corruption? */
368 if (error)
369 goto unlock_page;
370
371 if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) {
372 if (vmf->flags & FAULT_FLAG_WRITE) {
373 error = get_block(inode, block, &bh, 1);
374 count_vm_event(PGMAJFAULT);
375 mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
376 major = VM_FAULT_MAJOR;
377 if (!error && (bh.b_size < PAGE_SIZE))
378 error = -EIO;
379 if (error)
380 goto unlock_page;
381 } else {
382 return dax_load_hole(mapping, page, vmf);
383 }
384 }
385
386 if (vmf->cow_page) {
387 struct page *new_page = vmf->cow_page;
388 if (buffer_written(&bh))
389 error = copy_user_bh(new_page, &bh, blkbits, vaddr);
390 else
391 clear_user_highpage(new_page, vaddr);
392 if (error)
393 goto unlock_page;
394 vmf->page = page;
395 if (!page) {
396 i_mmap_lock_read(mapping);
397 /* Check we didn't race with truncate */
398 size = (i_size_read(inode) + PAGE_SIZE - 1) >>
399 PAGE_SHIFT;
400 if (vmf->pgoff >= size) {
401 i_mmap_unlock_read(mapping);
402 error = -EIO;
403 goto out;
404 }
405 }
406 return VM_FAULT_LOCKED;
407 }
408
409 /* Check we didn't race with a read fault installing a new page */
410 if (!page && major)
411 page = find_lock_page(mapping, vmf->pgoff);
412
413 if (page) {
414 unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT,
415 PAGE_CACHE_SIZE, 0);
416 delete_from_page_cache(page);
417 unlock_page(page);
418 page_cache_release(page);
419 }
420
421 error = dax_insert_mapping(inode, &bh, vma, vmf);
422
423 out:
424 if (error == -ENOMEM)
425 return VM_FAULT_OOM | major;
426 /* -EBUSY is fine, somebody else faulted on the same PTE */
427 if ((error < 0) && (error != -EBUSY))
428 return VM_FAULT_SIGBUS | major;
429 return VM_FAULT_NOPAGE | major;
430
431 unlock_page:
432 if (page) {
433 unlock_page(page);
434 page_cache_release(page);
435 }
436 goto out;
437}
438
439/**
440 * dax_fault - handle a page fault on a DAX file
441 * @vma: The virtual memory area where the fault occurred
442 * @vmf: The description of the fault
443 * @get_block: The filesystem method used to translate file offsets to blocks
444 *
445 * When a page fault occurs, filesystems may call this helper in their
446 * fault handler for DAX files.
447 */
448int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
449 get_block_t get_block)
450{
451 int result;
452 struct super_block *sb = file_inode(vma->vm_file)->i_sb;
453
454 if (vmf->flags & FAULT_FLAG_WRITE) {
455 sb_start_pagefault(sb);
456 file_update_time(vma->vm_file);
457 }
458 result = do_dax_fault(vma, vmf, get_block);
459 if (vmf->flags & FAULT_FLAG_WRITE)
460 sb_end_pagefault(sb);
461
462 return result;
463}
464EXPORT_SYMBOL_GPL(dax_fault);
465
466/**
467 * dax_zero_page_range - zero a range within a page of a DAX file
468 * @inode: The file being truncated
469 * @from: The file offset that is being truncated to
470 * @length: The number of bytes to zero
471 * @get_block: The filesystem method used to translate file offsets to blocks
472 *
473 * This function can be called by a filesystem when it is zeroing part of a
474 * page in a DAX file. This is intended for hole-punch operations. If
475 * you are truncating a file, the helper function dax_truncate_page() may be
476 * more convenient.
477 *
478 * We work in terms of PAGE_CACHE_SIZE here for commonality with
479 * block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem
480 * took care of disposing of the unnecessary blocks. Even if the filesystem
481 * block size is smaller than PAGE_SIZE, we have to zero the rest of the page
482 * since the file might be mmapped.
483 */
484int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
485 get_block_t get_block)
486{
487 struct buffer_head bh;
488 pgoff_t index = from >> PAGE_CACHE_SHIFT;
489 unsigned offset = from & (PAGE_CACHE_SIZE-1);
490 int err;
491
492 /* Block boundary? Nothing to do */
493 if (!length)
494 return 0;
495 BUG_ON((offset + length) > PAGE_CACHE_SIZE);
496
497 memset(&bh, 0, sizeof(bh));
498 bh.b_size = PAGE_CACHE_SIZE;
499 err = get_block(inode, index, &bh, 0);
500 if (err < 0)
501 return err;
502 if (buffer_written(&bh)) {
503 void *addr;
504 err = dax_get_addr(&bh, &addr, inode->i_blkbits);
505 if (err < 0)
506 return err;
507 memset(addr + offset, 0, length);
508 }
509
510 return 0;
511}
512EXPORT_SYMBOL_GPL(dax_zero_page_range);
513
514/**
515 * dax_truncate_page - handle a partial page being truncated in a DAX file
516 * @inode: The file being truncated
517 * @from: The file offset that is being truncated to
518 * @get_block: The filesystem method used to translate file offsets to blocks
519 *
520 * Similar to block_truncate_page(), this function can be called by a
521 * filesystem when it is truncating a DAX file to handle the partial page.
522 *
523 * We work in terms of PAGE_CACHE_SIZE here for commonality with
524 * block_truncate_page(), but we could go down to PAGE_SIZE if the filesystem
525 * took care of disposing of the unnecessary blocks. Even if the filesystem
526 * block size is smaller than PAGE_SIZE, we have to zero the rest of the page
527 * since the file might be mmapped.
528 */
529int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block)
530{
531 unsigned length = PAGE_CACHE_ALIGN(from) - from;
532 return dax_zero_page_range(inode, from, length, get_block);
533}
534EXPORT_SYMBOL_GPL(dax_truncate_page);
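
For orientation, the ext2 conversion later in this series (fs/ext2/inode.c below) wires these helpers up as follows; this is a condensed sketch using ext2's names, not an excerpt of any single hunk:

	/* direct I/O: route DAX inodes through dax_do_io() (cf. ext2_direct_IO) */
	if (IS_DAX(inode))
		ret = dax_do_io(rw, iocb, inode, iter, offset, ext2_get_block,
				NULL, DIO_LOCKING);
	else
		ret = blockdev_direct_IO(rw, iocb, inode, iter, offset,
					 ext2_get_block);

	/* truncate: zero the partial tail page in place (cf. ext2_setsize) */
	if (IS_DAX(inode))
		error = dax_truncate_page(inode, newsize, ext2_get_block);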
diff --git a/fs/dcache.c b/fs/dcache.c
index e368d4f412f9..dc400fd29f4d 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -38,6 +38,8 @@
 #include <linux/prefetch.h>
 #include <linux/ratelimit.h>
 #include <linux/list_lru.h>
+#include <linux/kasan.h>
+
 #include "internal.h"
 #include "mount.h"
 
@@ -400,19 +402,20 @@ static void d_shrink_add(struct dentry *dentry, struct list_head *list)
  * LRU lists entirely, while shrink_move moves it to the indicated
  * private list.
  */
-static void d_lru_isolate(struct dentry *dentry)
+static void d_lru_isolate(struct list_lru_one *lru, struct dentry *dentry)
 {
 	D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST);
 	dentry->d_flags &= ~DCACHE_LRU_LIST;
 	this_cpu_dec(nr_dentry_unused);
-	list_del_init(&dentry->d_lru);
+	list_lru_isolate(lru, &dentry->d_lru);
 }
 
-static void d_lru_shrink_move(struct dentry *dentry, struct list_head *list)
+static void d_lru_shrink_move(struct list_lru_one *lru, struct dentry *dentry,
+			      struct list_head *list)
 {
 	D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST);
 	dentry->d_flags |= DCACHE_SHRINK_LIST;
-	list_move_tail(&dentry->d_lru, list);
+	list_lru_isolate_move(lru, &dentry->d_lru, list);
 }
 
 /*
@@ -508,7 +511,7 @@ static void __dentry_kill(struct dentry *dentry)
 	 * dentry_iput drops the locks, at which point nobody (except
 	 * transient RCU lookups) can reach this dentry.
 	 */
-	BUG_ON((int)dentry->d_lockref.count > 0);
+	BUG_ON(dentry->d_lockref.count > 0);
 	this_cpu_dec(nr_dentry);
 	if (dentry->d_op && dentry->d_op->d_release)
 		dentry->d_op->d_release(dentry);
@@ -561,7 +564,7 @@ static inline struct dentry *lock_parent(struct dentry *dentry)
 	struct dentry *parent = dentry->d_parent;
 	if (IS_ROOT(dentry))
 		return NULL;
-	if (unlikely((int)dentry->d_lockref.count < 0))
+	if (unlikely(dentry->d_lockref.count < 0))
 		return NULL;
 	if (likely(spin_trylock(&parent->d_lock)))
 		return parent;
@@ -590,6 +593,110 @@ again:
 	return parent;
 }
 
+/*
+ * Try to do a lockless dput(), and return whether that was successful.
+ *
+ * If unsuccessful, we return false, having already taken the dentry lock.
+ *
+ * The caller needs to hold the RCU read lock, so that the dentry is
+ * guaranteed to stay around even if the refcount goes down to zero!
+ */
+static inline bool fast_dput(struct dentry *dentry)
+{
+	int ret;
+	unsigned int d_flags;
+
+	/*
+	 * If we have a d_op->d_delete() operation, we should not
+	 * let the dentry count go to zero, so use "put_or_lock".
+	 */
+	if (unlikely(dentry->d_flags & DCACHE_OP_DELETE))
+		return lockref_put_or_lock(&dentry->d_lockref);
+
+	/*
+	 * .. otherwise, we can try to just decrement the
+	 * lockref optimistically.
+	 */
+	ret = lockref_put_return(&dentry->d_lockref);
+
+	/*
+	 * If the lockref_put_return() failed due to the lock being held
+	 * by somebody else, the fast path has failed. We will need to
+	 * get the lock, and then check the count again.
+	 */
+	if (unlikely(ret < 0)) {
+		spin_lock(&dentry->d_lock);
+		if (dentry->d_lockref.count > 1) {
+			dentry->d_lockref.count--;
+			spin_unlock(&dentry->d_lock);
+			return 1;
+		}
+		return 0;
+	}
+
+	/*
+	 * If we weren't the last ref, we're done.
+	 */
+	if (ret)
+		return 1;
+
+	/*
+	 * Careful, careful. The reference count went down
+	 * to zero, but we don't hold the dentry lock, so
+	 * somebody else could get it again, and do another
+	 * dput(), and we need to not race with that.
+	 *
+	 * However, there is a very special and common case
+	 * where we don't care, because there is nothing to
+	 * do: the dentry is still hashed, it does not have
+	 * a 'delete' op, and it's referenced and already on
+	 * the LRU list.
+	 *
+	 * NOTE! Since we aren't locked, these values are
+	 * not "stable". However, it is sufficient that at
+	 * some point after we dropped the reference the
+	 * dentry was hashed and the flags had the proper
+	 * value. Other dentry users may have re-gotten
+	 * a reference to the dentry and change that, but
+	 * our work is done - we can leave the dentry
+	 * around with a zero refcount.
+	 */
+	smp_rmb();
+	d_flags = ACCESS_ONCE(dentry->d_flags);
+	d_flags &= DCACHE_REFERENCED | DCACHE_LRU_LIST;
+
+	/* Nothing to do? Dropping the reference was all we needed? */
+	if (d_flags == (DCACHE_REFERENCED | DCACHE_LRU_LIST) && !d_unhashed(dentry))
+		return 1;
+
+	/*
+	 * Not the fast normal case? Get the lock. We've already decremented
+	 * the refcount, but we'll need to re-check the situation after
+	 * getting the lock.
+	 */
+	spin_lock(&dentry->d_lock);
+
+	/*
+	 * Did somebody else grab a reference to it in the meantime, and
+	 * we're no longer the last user after all? Alternatively, somebody
+	 * else could have killed it and marked it dead. Either way, we
+	 * don't need to do anything else.
+	 */
+	if (dentry->d_lockref.count) {
+		spin_unlock(&dentry->d_lock);
+		return 1;
+	}
+
+	/*
+	 * Re-get the reference we optimistically dropped. We hold the
+	 * lock, and we just tested that it was zero, so we can just
+	 * set it to 1.
+	 */
+	dentry->d_lockref.count = 1;
+	return 0;
+}
+
+
 /*
  * This is dput
  *
@@ -622,8 +729,14 @@ void dput(struct dentry *dentry)
 		return;
 
 repeat:
-	if (lockref_put_or_lock(&dentry->d_lockref))
+	rcu_read_lock();
+	if (likely(fast_dput(dentry))) {
+		rcu_read_unlock();
 		return;
+	}
+
+	/* Slow case: now with the dentry lock held */
+	rcu_read_unlock();
 
 	/* Unreachable? Get rid of it */
 	if (unlikely(d_unhashed(dentry)))
@@ -810,7 +923,7 @@ static void shrink_dentry_list(struct list_head *list)
 		 * We found an inuse dentry which was not removed from
 		 * the LRU because of laziness during lookup. Do not free it.
 		 */
-		if ((int)dentry->d_lockref.count > 0) {
+		if (dentry->d_lockref.count > 0) {
 			spin_unlock(&dentry->d_lock);
 			if (parent)
 				spin_unlock(&parent->d_lock);
@@ -869,8 +982,8 @@ static void shrink_dentry_list(struct list_head *list)
 	}
 }
 
-static enum lru_status
-dentry_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg)
+static enum lru_status dentry_lru_isolate(struct list_head *item,
+		struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
 {
 	struct list_head *freeable = arg;
 	struct dentry	*dentry = container_of(item, struct dentry, d_lru);
@@ -890,7 +1003,7 @@ dentry_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg)
 	 * another pass through the LRU.
 	 */
 	if (dentry->d_lockref.count) {
-		d_lru_isolate(dentry);
+		d_lru_isolate(lru, dentry);
 		spin_unlock(&dentry->d_lock);
 		return LRU_REMOVED;
 	}
@@ -921,7 +1034,7 @@ dentry_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg)
 		return LRU_ROTATE;
 	}
 
-	d_lru_shrink_move(dentry, freeable);
+	d_lru_shrink_move(lru, dentry, freeable);
 	spin_unlock(&dentry->d_lock);
 
 	return LRU_REMOVED;
@@ -930,30 +1043,28 @@ dentry_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg)
 /**
  * prune_dcache_sb - shrink the dcache
  * @sb: superblock
- * @nr_to_scan : number of entries to try to free
- * @nid: which node to scan for freeable entities
+ * @sc: shrink control, passed to list_lru_shrink_walk()
  *
- * Attempt to shrink the superblock dcache LRU by @nr_to_scan entries. This is
- * done when we need more memory an called from the superblock shrinker
- * function.
+ * Attempt to shrink the superblock dcache LRU by @sc->nr_to_scan entries. This
+ * is done when we need more memory and called from the superblock shrinker
+ * function.
  *
  * This function may fail to free any resources if all the dentries are in
  * use.
  */
-long prune_dcache_sb(struct super_block *sb, unsigned long nr_to_scan,
-		     int nid)
+long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc)
 {
 	LIST_HEAD(dispose);
 	long freed;
 
-	freed = list_lru_walk_node(&sb->s_dentry_lru, nid, dentry_lru_isolate,
-				   &dispose, &nr_to_scan);
+	freed = list_lru_shrink_walk(&sb->s_dentry_lru, sc,
+				     dentry_lru_isolate, &dispose);
 	shrink_dentry_list(&dispose);
 	return freed;
 }
 
 static enum lru_status dentry_lru_isolate_shrink(struct list_head *item,
-		spinlock_t *lru_lock, void *arg)
+		struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
 {
 	struct list_head *freeable = arg;
 	struct dentry	*dentry = container_of(item, struct dentry, d_lru);
@@ -966,7 +1077,7 @@ static enum lru_status dentry_lru_isolate_shrink(struct list_head *item,
 	if (!spin_trylock(&dentry->d_lock))
 		return LRU_SKIP;
 
-	d_lru_shrink_move(dentry, freeable);
+	d_lru_shrink_move(lru, dentry, freeable);
 	spin_unlock(&dentry->d_lock);
 
 	return LRU_REMOVED;
@@ -1430,6 +1541,9 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
 		}
 		atomic_set(&p->u.count, 1);
 		dname = p->name;
+		if (IS_ENABLED(CONFIG_DCACHE_WORD_ACCESS))
+			kasan_unpoison_shadow(dname,
+				round_up(name->len + 1, sizeof(unsigned long)));
 	} else {
 		dname = dentry->d_iname;
 	}
@@ -2187,37 +2301,6 @@ struct dentry *d_hash_and_lookup(struct dentry *dir, struct qstr *name)
 }
 EXPORT_SYMBOL(d_hash_and_lookup);
 
-/**
- * d_validate - verify dentry provided from insecure source (deprecated)
- * @dentry: The dentry alleged to be valid child of @dparent
- * @dparent: The parent dentry (known to be valid)
- *
- * An insecure source has sent us a dentry, here we verify it and dget() it.
- * This is used by ncpfs in its readdir implementation.
- * Zero is returned in the dentry is invalid.
- *
- * This function is slow for big directories, and deprecated, do not use it.
- */
-int d_validate(struct dentry *dentry, struct dentry *dparent)
-{
-	struct dentry *child;
-
-	spin_lock(&dparent->d_lock);
-	list_for_each_entry(child, &dparent->d_subdirs, d_child) {
-		if (dentry == child) {
-			spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
-			__dget_dlock(dentry);
-			spin_unlock(&dentry->d_lock);
-			spin_unlock(&dparent->d_lock);
-			return 1;
-		}
-	}
-	spin_unlock(&dparent->d_lock);
-
-	return 0;
-}
-EXPORT_SYMBOL(d_validate);
-
 /*
  * When a file is deleted, we have two options:
  * - turn this dentry into a negative dentry
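
The prune_dcache_sb() signature change above means the superblock shrinker now hands its shrink_control straight through to the LRU walk. A minimal sketch of the caller side (the real caller is super_cache_scan() in fs/super.c, which is not part of this hunk):

static unsigned long super_cache_scan_sketch(struct shrinker *shrink,
					     struct shrink_control *sc)
{
	struct super_block *sb = container_of(shrink, struct super_block,
					      s_shrink);

	/* sc->nr_to_scan plus the node/memcg information all flow
	 * through to list_lru_shrink_walk() inside prune_dcache_sb() */
	return prune_dcache_sb(sb, sc);
}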
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 05f2960ed7c3..45b18a5e225c 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -34,93 +34,16 @@ static struct vfsmount *debugfs_mount;
 static int debugfs_mount_count;
 static bool debugfs_registered;
 
-static struct inode *debugfs_get_inode(struct super_block *sb, umode_t mode, dev_t dev,
-				       void *data, const struct file_operations *fops)
-
+static struct inode *debugfs_get_inode(struct super_block *sb)
 {
 	struct inode *inode = new_inode(sb);
-
 	if (inode) {
 		inode->i_ino = get_next_ino();
-		inode->i_mode = mode;
 		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
-		switch (mode & S_IFMT) {
-		default:
-			init_special_inode(inode, mode, dev);
-			break;
-		case S_IFREG:
-			inode->i_fop = fops ? fops : &debugfs_file_operations;
-			inode->i_private = data;
-			break;
-		case S_IFLNK:
-			inode->i_op = &debugfs_link_operations;
-			inode->i_private = data;
-			break;
-		case S_IFDIR:
-			inode->i_op = &simple_dir_inode_operations;
-			inode->i_fop = &simple_dir_operations;
-
-			/* directory inodes start off with i_nlink == 2
-			 * (for "." entry) */
-			inc_nlink(inode);
-			break;
-		}
 	}
 	return inode;
 }
 
-/* SMP-safe */
-static int debugfs_mknod(struct inode *dir, struct dentry *dentry,
-			 umode_t mode, dev_t dev, void *data,
-			 const struct file_operations *fops)
-{
-	struct inode *inode;
-	int error = -EPERM;
-
-	if (dentry->d_inode)
-		return -EEXIST;
-
-	inode = debugfs_get_inode(dir->i_sb, mode, dev, data, fops);
-	if (inode) {
-		d_instantiate(dentry, inode);
-		dget(dentry);
-		error = 0;
-	}
-	return error;
-}
-
-static int debugfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
-{
-	int res;
-
-	mode = (mode & (S_IRWXUGO | S_ISVTX)) | S_IFDIR;
-	res = debugfs_mknod(dir, dentry, mode, 0, NULL, NULL);
-	if (!res) {
-		inc_nlink(dir);
-		fsnotify_mkdir(dir, dentry);
-	}
-	return res;
-}
-
-static int debugfs_link(struct inode *dir, struct dentry *dentry, umode_t mode,
-			void *data)
-{
-	mode = (mode & S_IALLUGO) | S_IFLNK;
-	return debugfs_mknod(dir, dentry, mode, 0, data, NULL);
-}
-
-static int debugfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
-			  void *data, const struct file_operations *fops)
-{
-	int res;
-
-	mode = (mode & S_IALLUGO) | S_IFREG;
-	res = debugfs_mknod(dir, dentry, mode, 0, data, fops);
-	if (!res)
-		fsnotify_create(dir, dentry);
-	return res;
-}
-
 static inline int debugfs_positive(struct dentry *dentry)
 {
 	return dentry->d_inode && !d_unhashed(dentry);
@@ -252,6 +175,18 @@ static const struct super_operations debugfs_super_operations = {
 	.show_options	= debugfs_show_options,
 };
 
+static struct vfsmount *debugfs_automount(struct path *path)
+{
+	struct vfsmount *(*f)(void *);
+	f = (struct vfsmount *(*)(void *))path->dentry->d_fsdata;
+	return f(path->dentry->d_inode->i_private);
+}
+
+static const struct dentry_operations debugfs_dops = {
+	.d_delete = always_delete_dentry,
+	.d_automount = debugfs_automount,
+};
+
 static int debug_fill_super(struct super_block *sb, void *data, int silent)
 {
 	static struct tree_descr debug_files[] = {{""}};
@@ -276,6 +211,7 @@ static int debug_fill_super(struct super_block *sb, void *data, int silent)
 		goto fail;
 
 	sb->s_op = &debugfs_super_operations;
+	sb->s_d_op = &debugfs_dops;
 
 	debugfs_apply_options(sb);
 
@@ -302,11 +238,9 @@ static struct file_system_type debug_fs_type = {
 };
 MODULE_ALIAS_FS("debugfs");
 
-static struct dentry *__create_file(const char *name, umode_t mode,
-				    struct dentry *parent, void *data,
-				    const struct file_operations *fops)
+static struct dentry *start_creating(const char *name, struct dentry *parent)
 {
-	struct dentry *dentry = NULL;
+	struct dentry *dentry;
 	int error;
 
 	pr_debug("debugfs: creating file '%s'\n",name);
@@ -314,7 +248,7 @@ static struct dentry *__create_file(const char *name, umode_t mode,
 	error = simple_pin_fs(&debug_fs_type, &debugfs_mount,
 			      &debugfs_mount_count);
 	if (error)
-		goto exit;
+		return ERR_PTR(error);
 
 	/* If the parent is not specified, we create it in the root.
 	 * We need the root dentry to do this, which is in the super
@@ -326,31 +260,26 @@ static struct dentry *__create_file(const char *name, umode_t mode,
 
 	mutex_lock(&parent->d_inode->i_mutex);
 	dentry = lookup_one_len(name, parent, strlen(name));
-	if (!IS_ERR(dentry)) {
-		switch (mode & S_IFMT) {
-		case S_IFDIR:
-			error = debugfs_mkdir(parent->d_inode, dentry, mode);
-
-			break;
-		case S_IFLNK:
-			error = debugfs_link(parent->d_inode, dentry, mode,
-					     data);
-			break;
-		default:
-			error = debugfs_create(parent->d_inode, dentry, mode,
-					       data, fops);
-			break;
-		}
+	if (!IS_ERR(dentry) && dentry->d_inode) {
 		dput(dentry);
-	} else
-		error = PTR_ERR(dentry);
-	mutex_unlock(&parent->d_inode->i_mutex);
-
-	if (error) {
-		dentry = NULL;
-		simple_release_fs(&debugfs_mount, &debugfs_mount_count);
+		dentry = ERR_PTR(-EEXIST);
 	}
-exit:
+	if (IS_ERR(dentry))
+		mutex_unlock(&parent->d_inode->i_mutex);
+	return dentry;
+}
+
+static struct dentry *failed_creating(struct dentry *dentry)
+{
+	mutex_unlock(&dentry->d_parent->d_inode->i_mutex);
+	dput(dentry);
+	simple_release_fs(&debugfs_mount, &debugfs_mount_count);
+	return NULL;
+}
+
+static struct dentry *end_creating(struct dentry *dentry)
+{
+	mutex_unlock(&dentry->d_parent->d_inode->i_mutex);
 	return dentry;
 }
 
@@ -384,19 +313,71 @@ struct dentry *debugfs_create_file(const char *name, umode_t mode,
 				   struct dentry *parent, void *data,
 				   const struct file_operations *fops)
 {
-	switch (mode & S_IFMT) {
-	case S_IFREG:
-	case 0:
-		break;
-	default:
-		BUG();
-	}
+	struct dentry *dentry;
+	struct inode *inode;
+
+	if (!(mode & S_IFMT))
+		mode |= S_IFREG;
+	BUG_ON(!S_ISREG(mode));
+	dentry = start_creating(name, parent);
+
+	if (IS_ERR(dentry))
+		return NULL;
 
-	return __create_file(name, mode, parent, data, fops);
+	inode = debugfs_get_inode(dentry->d_sb);
+	if (unlikely(!inode))
+		return failed_creating(dentry);
+
+	inode->i_mode = mode;
+	inode->i_fop = fops ? fops : &debugfs_file_operations;
+	inode->i_private = data;
+	d_instantiate(dentry, inode);
+	fsnotify_create(dentry->d_parent->d_inode, dentry);
+	return end_creating(dentry);
 }
 EXPORT_SYMBOL_GPL(debugfs_create_file);
 
 /**
+ * debugfs_create_file_size - create a file in the debugfs filesystem
+ * @name: a pointer to a string containing the name of the file to create.
+ * @mode: the permission that the file should have.
+ * @parent: a pointer to the parent dentry for this file.  This should be a
+ *	    directory dentry if set.  If this parameter is NULL, then the
+ *	    file will be created in the root of the debugfs filesystem.
+ * @data: a pointer to something that the caller will want to get to later
+ *	  on.  The inode.i_private pointer will point to this value on
+ *	  the open() call.
+ * @fops: a pointer to a struct file_operations that should be used for
+ *	  this file.
+ * @file_size: initial file size
+ *
+ * This is the basic "create a file" function for debugfs.  It allows for a
+ * wide range of flexibility in creating a file, or a directory (if you want
+ * to create a directory, the debugfs_create_dir() function is
+ * recommended to be used instead.)
+ *
+ * This function will return a pointer to a dentry if it succeeds.  This
+ * pointer must be passed to the debugfs_remove() function when the file is
+ * to be removed (no automatic cleanup happens if your module is unloaded,
+ * you are responsible here.)  If an error occurs, %NULL will be returned.
+ *
+ * If debugfs is not enabled in the kernel, the value -%ENODEV will be
+ * returned.
+ */
+struct dentry *debugfs_create_file_size(const char *name, umode_t mode,
+					struct dentry *parent, void *data,
+					const struct file_operations *fops,
+					loff_t file_size)
+{
+	struct dentry *de = debugfs_create_file(name, mode, parent, data, fops);
+
+	if (de)
+		de->d_inode->i_size = file_size;
+	return de;
+}
+EXPORT_SYMBOL_GPL(debugfs_create_file_size);
+
+/**
  * debugfs_create_dir - create a directory in the debugfs filesystem
  * @name: a pointer to a string containing the name of the directory to
  *	  create.
@@ -416,12 +397,65 @@ EXPORT_SYMBOL_GPL(debugfs_create_file);
  */
 struct dentry *debugfs_create_dir(const char *name, struct dentry *parent)
 {
-	return __create_file(name, S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO,
-			     parent, NULL, NULL);
+	struct dentry *dentry = start_creating(name, parent);
+	struct inode *inode;
+
+	if (IS_ERR(dentry))
+		return NULL;
+
+	inode = debugfs_get_inode(dentry->d_sb);
+	if (unlikely(!inode))
+		return failed_creating(dentry);
+
+	inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
+	inode->i_op = &simple_dir_inode_operations;
+	inode->i_fop = &simple_dir_operations;
+
+	/* directory inodes start off with i_nlink == 2 (for "." entry) */
+	inc_nlink(inode);
+	d_instantiate(dentry, inode);
+	inc_nlink(dentry->d_parent->d_inode);
+	fsnotify_mkdir(dentry->d_parent->d_inode, dentry);
+	return end_creating(dentry);
 }
 EXPORT_SYMBOL_GPL(debugfs_create_dir);
 
 /**
+ * debugfs_create_automount - create automount point in the debugfs filesystem
+ * @name: a pointer to a string containing the name of the file to create.
+ * @parent: a pointer to the parent dentry for this file.  This should be a
+ *	    directory dentry if set.  If this parameter is NULL, then the
+ *	    file will be created in the root of the debugfs filesystem.
+ * @f: function to be called when pathname resolution steps on that one.
+ * @data: opaque argument to pass to f().
+ *
+ * @f should return what ->d_automount() would.
+ */
+struct dentry *debugfs_create_automount(const char *name,
+					struct dentry *parent,
+					struct vfsmount *(*f)(void *),
+					void *data)
+{
+	struct dentry *dentry = start_creating(name, parent);
+	struct inode *inode;
+
+	if (IS_ERR(dentry))
+		return NULL;
+
+	inode = debugfs_get_inode(dentry->d_sb);
+	if (unlikely(!inode))
+		return failed_creating(dentry);
+
+	inode->i_mode = S_IFDIR | S_IRWXU | S_IRUGO | S_IXUGO;
+	inode->i_flags |= S_AUTOMOUNT;
+	inode->i_private = data;
+	dentry->d_fsdata = (void *)f;
+	d_instantiate(dentry, inode);
+	return end_creating(dentry);
+}
+EXPORT_SYMBOL(debugfs_create_automount);
+
+/**
  * debugfs_create_symlink- create a symbolic link in the debugfs filesystem
  * @name: a pointer to a string containing the name of the symbolic link to
  *	  create.
@@ -447,17 +481,28 @@ EXPORT_SYMBOL_GPL(debugfs_create_dir);
 struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent,
 				      const char *target)
 {
-	struct dentry *result;
-	char *link;
-
-	link = kstrdup(target, GFP_KERNEL);
+	struct dentry *dentry;
+	struct inode *inode;
+	char *link = kstrdup(target, GFP_KERNEL);
 	if (!link)
 		return NULL;
 
-	result = __create_file(name, S_IFLNK | S_IRWXUGO, parent, link, NULL);
-	if (!result)
+	dentry = start_creating(name, parent);
+	if (IS_ERR(dentry)) {
 		kfree(link);
-	return result;
+		return NULL;
+	}
+
+	inode = debugfs_get_inode(dentry->d_sb);
+	if (unlikely(!inode)) {
+		kfree(link);
+		return failed_creating(dentry);
+	}
+	inode->i_mode = S_IFLNK | S_IRWXUGO;
+	inode->i_op = &debugfs_link_operations;
+	inode->i_private = link;
+	d_instantiate(dentry, inode);
+	return end_creating(dentry);
 }
 EXPORT_SYMBOL_GPL(debugfs_create_symlink);
 
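A usage sketch for the two new debugfs entry points; every mydrv_* name below is invented for illustration and is not part of the patch:

static struct dentry *mydrv_dir;

static int __init mydrv_debugfs_init(void)
{
	mydrv_dir = debugfs_create_dir("mydrv", NULL);
	if (!mydrv_dir)
		return -ENODEV;

	/* fixed-size file: i_size is set up front, so stat() reports the
	 * real size instead of 0 */
	debugfs_create_file_size("regs", 0444, mydrv_dir, &mydrv_state,
				 &mydrv_regs_fops, 4096);

	/* automount point: mydrv_automount() must return a vfsmount, as
	 * ->d_automount() would */
	debugfs_create_automount("trace", mydrv_dir, mydrv_automount,
				 &mydrv_state);
	return 0;
}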
diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c
index e7cfbaf8d0e2..1e6e227134d7 100644
--- a/fs/dlm/netlink.c
+++ b/fs/dlm/netlink.c
@@ -56,13 +56,8 @@ static int send_data(struct sk_buff *skb)
 {
 	struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data);
 	void *data = genlmsg_data(genlhdr);
-	int rv;
 
-	rv = genlmsg_end(skb, data);
-	if (rv < 0) {
-		nlmsg_free(skb);
-		return rv;
-	}
+	genlmsg_end(skb, data);
 
 	return genlmsg_unicast(&init_net, skb, listener_nlportid);
 }
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index 2bc2c87f35e7..5718cb9f7273 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -37,20 +37,6 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused)
 	iput(toput_inode);
 }
 
-static void drop_slab(void)
-{
-	int nr_objects;
-
-	do {
-		int nid;
-
-		nr_objects = 0;
-		for_each_online_node(nid)
-			nr_objects += shrink_node_slabs(GFP_KERNEL, nid,
-							1000, 1000);
-	} while (nr_objects > 10);
-}
-
 int drop_caches_sysctl_handler(struct ctl_table *table, int write,
 	void __user *buffer, size_t *length, loff_t *ppos)
 {
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 1686dc2da9fd..34b36a504059 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -67,7 +67,6 @@ static int ecryptfs_inode_set(struct inode *inode, void *opaque)
 	inode->i_ino = lower_inode->i_ino;
 	inode->i_version++;
 	inode->i_mapping->a_ops = &ecryptfs_aops;
-	inode->i_mapping->backing_dev_info = inode->i_sb->s_bdi;
 
 	if (S_ISLNK(inode->i_mode))
 		inode->i_op = &ecryptfs_symlink_iops;
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index d9eb84bda559..1895d60f4122 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -520,7 +520,7 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags
 		goto out;
 	}
 
-	rc = bdi_setup_and_register(&sbi->bdi, "ecryptfs", BDI_CAP_MAP_COPY);
+	rc = bdi_setup_and_register(&sbi->bdi, "ecryptfs");
 	if (rc)
 		goto out1;
 
diff --git a/fs/efivarfs/Kconfig b/fs/efivarfs/Kconfig
index 367bbb10c543..c2499ef174a2 100644
--- a/fs/efivarfs/Kconfig
+++ b/fs/efivarfs/Kconfig
@@ -1,6 +1,7 @@
 config EFIVAR_FS
 	tristate "EFI Variable filesystem"
 	depends on EFI
+	default m
 	help
 	  efivarfs is a replacement filesystem for the old EFI
 	  variable support via sysfs, as it doesn't suffer from the
diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c
index 6dad1176ec52..ddbce42548c9 100644
--- a/fs/efivarfs/super.c
+++ b/fs/efivarfs/super.c
@@ -140,7 +140,7 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor,
 
 	name[len] = '-';
 
-	efi_guid_unparse(&entry->var.VendorGuid, name + len + 1);
+	efi_guid_to_str(&entry->var.VendorGuid, name + len + 1);
 
 	name[len + EFI_VARIABLE_GUID_LEN+1] = '\0';
 
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 4b0a226024fa..8d0c0df01854 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -118,18 +118,18 @@ static unsigned int eventfd_poll(struct file *file, poll_table *wait)
 {
 	struct eventfd_ctx *ctx = file->private_data;
 	unsigned int events = 0;
-	unsigned long flags;
+	u64 count;
 
 	poll_wait(file, &ctx->wqh, wait);
+	smp_rmb();
+	count = ctx->count;
 
-	spin_lock_irqsave(&ctx->wqh.lock, flags);
-	if (ctx->count > 0)
+	if (count > 0)
 		events |= POLLIN;
-	if (ctx->count == ULLONG_MAX)
+	if (count == ULLONG_MAX)
 		events |= POLLERR;
-	if (ULLONG_MAX - 1 > ctx->count)
+	if (ULLONG_MAX - 1 > count)
 		events |= POLLOUT;
-	spin_unlock_irqrestore(&ctx->wqh.lock, flags);
 
 	return events;
 }
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index d77f94491352..1e009cad8d5c 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1639,9 +1639,9 @@ fetch_events:
 
 			spin_lock_irqsave(&ep->lock, flags);
 		}
-		__remove_wait_queue(&ep->wq, &wait);
 
-		set_current_state(TASK_RUNNING);
+		__remove_wait_queue(&ep->wq, &wait);
+		__set_current_state(TASK_RUNNING);
 	}
 check_events:
 	/* Is it worth to try to dig for events ? */
diff --git a/fs/exec.c b/fs/exec.c
index ad8798e26be9..c7f9b733406d 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -794,8 +794,14 @@ exit:
 
 struct file *open_exec(const char *name)
 {
-	struct filename tmp = { .name = name };
-	return do_open_execat(AT_FDCWD, &tmp, 0);
+	struct filename *filename = getname_kernel(name);
+	struct file *f = ERR_CAST(filename);
+
+	if (!IS_ERR(filename)) {
+		f = do_open_execat(AT_FDCWD, filename, 0);
+		putname(filename);
+	}
+	return f;
 }
 EXPORT_SYMBOL(open_exec);
 
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index f1d3d4eb8c4f..a198e94813fe 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -985,7 +985,6 @@ const struct address_space_operations exofs_aops = {
 	.direct_IO	= exofs_direct_IO,
 
 	/* With these NULL has special meaning or default is not exported */
-	.get_xip_mem	= NULL,
 	.migratepage	= NULL,
 	.launder_page	= NULL,
 	.is_partially_uptodate = NULL,
@@ -1214,7 +1213,6 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
 		memcpy(oi->i_data, fcb.i_data, sizeof(fcb.i_data));
 	}
 
-	inode->i_mapping->backing_dev_info = sb->s_bdi;
 	if (S_ISREG(inode->i_mode)) {
 		inode->i_op = &exofs_file_inode_operations;
 		inode->i_fop = &exofs_file_operations;
@@ -1314,7 +1312,6 @@ struct inode *exofs_new_inode(struct inode *dir, umode_t mode)
 
 	set_obj_2bcreated(oi);
 
-	inode->i_mapping->backing_dev_info = sb->s_bdi;
 	inode_init_owner(inode, dir, mode);
 	inode->i_ino = sbi->s_nextid++;
 	inode->i_blkbits = EXOFS_BLKSHIFT;
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index 95965503afcb..fcc2e565f540 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -836,7 +836,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
 		goto free_sbi;
 	}
 
-	ret = bdi_setup_and_register(&sbi->bdi, "exofs", BDI_CAP_MAP_COPY);
+	ret = bdi_setup_and_register(&sbi->bdi, "exofs");
 	if (ret) {
 		EXOFS_DBGMSG("Failed to bdi_setup_and_register\n");
 		dput(sb->s_root);
diff --git a/fs/ext2/Kconfig b/fs/ext2/Kconfig
index 14a6780fd034..c634874e12d9 100644
--- a/fs/ext2/Kconfig
+++ b/fs/ext2/Kconfig
@@ -42,14 +42,3 @@ config EXT2_FS_SECURITY
 
 	  If you are not using a security module that requires using
 	  extended attributes for file security labels, say N.
-
-config EXT2_FS_XIP
-	bool "Ext2 execute in place support"
-	depends on EXT2_FS && MMU
-	help
-	  Execute in place can be used on memory-backed block devices. If you
-	  enable this option, you can select to mount block devices which are
-	  capable of this feature without using the page cache.
-
-	  If you do not use a block device that is capable of using this,
-	  or if unsure, say N.
diff --git a/fs/ext2/Makefile b/fs/ext2/Makefile
index f42af45cfd88..445b0e996a12 100644
--- a/fs/ext2/Makefile
+++ b/fs/ext2/Makefile
@@ -10,4 +10,3 @@ ext2-y := balloc.o dir.o file.o ialloc.o inode.o \
 ext2-$(CONFIG_EXT2_FS_XATTR)	 += xattr.o xattr_user.o xattr_trusted.o
 ext2-$(CONFIG_EXT2_FS_POSIX_ACL) += acl.o
 ext2-$(CONFIG_EXT2_FS_SECURITY)	 += xattr_security.o
-ext2-$(CONFIG_EXT2_FS_XIP)	 += xip.o
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index e4279ead4a05..678f9ab08c48 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -380,10 +380,15 @@ struct ext2_inode {
 #define EXT2_MOUNT_NO_UID32		0x000200  /* Disable 32-bit UIDs */
 #define EXT2_MOUNT_XATTR_USER		0x004000  /* Extended user attributes */
 #define EXT2_MOUNT_POSIX_ACL		0x008000  /* POSIX Access Control Lists */
-#define EXT2_MOUNT_XIP			0x010000  /* Execute in place */
+#define EXT2_MOUNT_XIP			0x010000  /* Obsolete, use DAX */
 #define EXT2_MOUNT_USRQUOTA		0x020000  /* user quota */
 #define EXT2_MOUNT_GRPQUOTA		0x040000  /* group quota */
 #define EXT2_MOUNT_RESERVATION		0x080000  /* Preallocation */
+#ifdef CONFIG_FS_DAX
+#define EXT2_MOUNT_DAX			0x100000  /* Direct Access */
+#else
+#define EXT2_MOUNT_DAX			0
+#endif
 
 
 #define clear_opt(o, opt)		o &= ~EXT2_MOUNT_##opt
@@ -788,11 +793,10 @@ extern int ext2_fsync(struct file *file, loff_t start, loff_t end,
 		      int datasync);
 extern const struct inode_operations ext2_file_inode_operations;
 extern const struct file_operations ext2_file_operations;
-extern const struct file_operations ext2_xip_file_operations;
+extern const struct file_operations ext2_dax_file_operations;
 
 /* inode.c */
 extern const struct address_space_operations ext2_aops;
-extern const struct address_space_operations ext2_aops_xip;
 extern const struct address_space_operations ext2_nobh_aops;
 
 /* namei.c */
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 7c87b22a7228..e31701713516 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -25,6 +25,36 @@
 #include "xattr.h"
 #include "acl.h"
 
+#ifdef CONFIG_FS_DAX
+static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	return dax_fault(vma, vmf, ext2_get_block);
+}
+
+static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	return dax_mkwrite(vma, vmf, ext2_get_block);
+}
+
+static const struct vm_operations_struct ext2_dax_vm_ops = {
+	.fault		= ext2_dax_fault,
+	.page_mkwrite	= ext2_dax_mkwrite,
+};
+
+static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	if (!IS_DAX(file_inode(file)))
+		return generic_file_mmap(file, vma);
+
+	file_accessed(file);
+	vma->vm_ops = &ext2_dax_vm_ops;
+	vma->vm_flags |= VM_MIXEDMAP;
+	return 0;
+}
+#else
+#define ext2_file_mmap	generic_file_mmap
+#endif
+
 /*
  * Called when filp is released. This happens when all file descriptors
  * for a single struct file are closed. Note that different open() calls
@@ -70,7 +100,7 @@ const struct file_operations ext2_file_operations = {
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= ext2_compat_ioctl,
 #endif
-	.mmap		= generic_file_mmap,
+	.mmap		= ext2_file_mmap,
 	.open		= dquot_file_open,
 	.release	= ext2_release_file,
 	.fsync		= ext2_fsync,
@@ -78,16 +108,18 @@ const struct file_operations ext2_file_operations = {
 	.splice_write	= iter_file_splice_write,
 };
 
-#ifdef CONFIG_EXT2_FS_XIP
-const struct file_operations ext2_xip_file_operations = {
+#ifdef CONFIG_FS_DAX
+const struct file_operations ext2_dax_file_operations = {
 	.llseek		= generic_file_llseek,
-	.read		= xip_file_read,
-	.write		= xip_file_write,
+	.read		= new_sync_read,
+	.write		= new_sync_write,
+	.read_iter	= generic_file_read_iter,
+	.write_iter	= generic_file_write_iter,
 	.unlocked_ioctl = ext2_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= ext2_compat_ioctl,
 #endif
-	.mmap		= xip_file_mmap,
+	.mmap		= ext2_file_mmap,
 	.open		= dquot_file_open,
 	.release	= ext2_release_file,
 	.fsync		= ext2_fsync,
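With ext2_file_mmap() installed, a file on a dax-mounted ext2 gets ordinary mmap() semantics but no page cache behind the mapping. A hypothetical userspace view (invented paths; not part of the patch):

	int fd = open("/mnt/pmem/file", O_RDWR);	/* fs mounted with -o dax */
	char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);

	p[0] = 1;	/*
			 * first touch faults: .fault == ext2_dax_fault()
			 * -> dax_fault() -> vm_insert_mixed(), which maps
			 * the device pfn directly; no struct page backs it
			 */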
diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c
index 7d66fb0e4cca..6c14bb8322fa 100644
--- a/fs/ext2/ialloc.c
+++ b/fs/ext2/ialloc.c
@@ -170,7 +170,7 @@ static void ext2_preread_inode(struct inode *inode)
 	struct ext2_group_desc * gdp;
 	struct backing_dev_info *bdi;
 
-	bdi = inode->i_mapping->backing_dev_info;
+	bdi = inode_to_bdi(inode);
 	if (bdi_read_congested(bdi))
 		return;
 	if (bdi_write_congested(bdi))
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 36d35c36311d..6434bc000125 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -34,7 +34,6 @@
 #include <linux/aio.h>
 #include "ext2.h"
 #include "acl.h"
-#include "xip.h"
 #include "xattr.h"
 
 static int __ext2_write_inode(struct inode *inode, int do_sync);
@@ -731,12 +730,14 @@ static int ext2_get_blocks(struct inode *inode,
 			goto cleanup;
 	}
 
-	if (ext2_use_xip(inode->i_sb)) {
+	if (IS_DAX(inode)) {
 		/*
-		 * we need to clear the block
+		 * block must be initialised before we put it in the tree
+		 * so that it's not found by another thread before it's
+		 * initialised
 		 */
-		err = ext2_clear_xip_target (inode,
-			le32_to_cpu(chain[depth-1].key));
+		err = dax_clear_blocks(inode, le32_to_cpu(chain[depth-1].key),
+				1 << inode->i_blkbits);
 		if (err) {
 			mutex_unlock(&ei->truncate_mutex);
 			goto cleanup;
@@ -859,7 +860,12 @@ ext2_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter,
 	size_t count = iov_iter_count(iter);
 	ssize_t ret;
 
-	ret = blockdev_direct_IO(rw, iocb, inode, iter, offset, ext2_get_block);
+	if (IS_DAX(inode))
+		ret = dax_do_io(rw, iocb, inode, iter, offset, ext2_get_block,
+				NULL, DIO_LOCKING);
+	else
+		ret = blockdev_direct_IO(rw, iocb, inode, iter, offset,
+					 ext2_get_block);
 	if (ret < 0 && (rw & WRITE))
 		ext2_write_failed(mapping, offset + count);
 	return ret;
@@ -885,11 +891,6 @@ const struct address_space_operations ext2_aops = {
 	.error_remove_page	= generic_error_remove_page,
 };
 
-const struct address_space_operations ext2_aops_xip = {
-	.bmap			= ext2_bmap,
-	.get_xip_mem		= ext2_get_xip_mem,
-};
-
 const struct address_space_operations ext2_nobh_aops = {
 	.readpage		= ext2_readpage,
 	.readpages		= ext2_readpages,
@@ -1201,8 +1202,8 @@ static int ext2_setsize(struct inode *inode, loff_t newsize)
 
 	inode_dio_wait(inode);
 
-	if (mapping_is_xip(inode->i_mapping))
-		error = xip_truncate_page(inode->i_mapping, newsize);
+	if (IS_DAX(inode))
+		error = dax_truncate_page(inode, newsize, ext2_get_block);
 	else if (test_opt(inode->i_sb, NOBH))
 		error = nobh_truncate_page(inode->i_mapping,
 					   newsize, ext2_get_block);
@@ -1273,7 +1274,8 @@ void ext2_set_inode_flags(struct inode *inode)
 {
 	unsigned int flags = EXT2_I(inode)->i_flags;
 
-	inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
+	inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME |
+				S_DIRSYNC | S_DAX);
 	if (flags & EXT2_SYNC_FL)
 		inode->i_flags |= S_SYNC;
 	if (flags & EXT2_APPEND_FL)
@@ -1284,6 +1286,8 @@ void ext2_set_inode_flags(struct inode *inode)
 		inode->i_flags |= S_NOATIME;
 	if (flags & EXT2_DIRSYNC_FL)
 		inode->i_flags |= S_DIRSYNC;
+	if (test_opt(inode->i_sb, DAX))
+		inode->i_flags |= S_DAX;
 }
 
 /* Propagate flags from i_flags to EXT2_I(inode)->i_flags */
@@ -1384,9 +1388,9 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino)
 
 	if (S_ISREG(inode->i_mode)) {
 		inode->i_op = &ext2_file_inode_operations;
-		if (ext2_use_xip(inode->i_sb)) {
-			inode->i_mapping->a_ops = &ext2_aops_xip;
-			inode->i_fop = &ext2_xip_file_operations;
+		if (test_opt(inode->i_sb, DAX)) {
+			inode->i_mapping->a_ops = &ext2_aops;
+			inode->i_fop = &ext2_dax_file_operations;
 		} else if (test_opt(inode->i_sb, NOBH)) {
 			inode->i_mapping->a_ops = &ext2_nobh_aops;
 			inode->i_fop = &ext2_file_operations;
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index c268d0af1db9..148f6e3789ea 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -35,7 +35,6 @@
 #include "ext2.h"
 #include "xattr.h"
 #include "acl.h"
-#include "xip.h"
 
 static inline int ext2_add_nondir(struct dentry *dentry, struct inode *inode)
 {
@@ -105,9 +104,9 @@ static int ext2_create (struct inode * dir, struct dentry * dentry, umode_t mode
 		return PTR_ERR(inode);
 
 	inode->i_op = &ext2_file_inode_operations;
-	if (ext2_use_xip(inode->i_sb)) {
-		inode->i_mapping->a_ops = &ext2_aops_xip;
-		inode->i_fop = &ext2_xip_file_operations;
+	if (test_opt(inode->i_sb, DAX)) {
+		inode->i_mapping->a_ops = &ext2_aops;
+		inode->i_fop = &ext2_dax_file_operations;
 	} else if (test_opt(inode->i_sb, NOBH)) {
 		inode->i_mapping->a_ops = &ext2_nobh_aops;
 		inode->i_fop = &ext2_file_operations;
@@ -126,9 +125,9 @@ static int ext2_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
 		return PTR_ERR(inode);
 
 	inode->i_op = &ext2_file_inode_operations;
-	if (ext2_use_xip(inode->i_sb)) {
-		inode->i_mapping->a_ops = &ext2_aops_xip;
-		inode->i_fop = &ext2_xip_file_operations;
+	if (test_opt(inode->i_sb, DAX)) {
+		inode->i_mapping->a_ops = &ext2_aops;
+		inode->i_fop = &ext2_dax_file_operations;
 	} else if (test_opt(inode->i_sb, NOBH)) {
 		inode->i_mapping->a_ops = &ext2_nobh_aops;
 		inode->i_fop = &ext2_file_operations;
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index ae55fddc26a9..d0e746e96511 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -35,7 +35,6 @@
35#include "ext2.h" 35#include "ext2.h"
36#include "xattr.h" 36#include "xattr.h"
37#include "acl.h" 37#include "acl.h"
38#include "xip.h"
39 38
40static void ext2_sync_super(struct super_block *sb, 39static void ext2_sync_super(struct super_block *sb,
41 struct ext2_super_block *es, int wait); 40 struct ext2_super_block *es, int wait);
@@ -292,9 +291,11 @@ static int ext2_show_options(struct seq_file *seq, struct dentry *root)
292 seq_puts(seq, ",grpquota"); 291 seq_puts(seq, ",grpquota");
293#endif 292#endif
294 293
295#if defined(CONFIG_EXT2_FS_XIP) 294#ifdef CONFIG_FS_DAX
296 if (sbi->s_mount_opt & EXT2_MOUNT_XIP) 295 if (sbi->s_mount_opt & EXT2_MOUNT_XIP)
297 seq_puts(seq, ",xip"); 296 seq_puts(seq, ",xip");
297 if (sbi->s_mount_opt & EXT2_MOUNT_DAX)
298 seq_puts(seq, ",dax");
298#endif 299#endif
299 300
300 if (!test_opt(sb, RESERVATION)) 301 if (!test_opt(sb, RESERVATION))
@@ -403,7 +404,7 @@ enum {
403 Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, 404 Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic,
404 Opt_err_ro, Opt_nouid32, Opt_nocheck, Opt_debug, 405 Opt_err_ro, Opt_nouid32, Opt_nocheck, Opt_debug,
405 Opt_oldalloc, Opt_orlov, Opt_nobh, Opt_user_xattr, Opt_nouser_xattr, 406 Opt_oldalloc, Opt_orlov, Opt_nobh, Opt_user_xattr, Opt_nouser_xattr,
406 Opt_acl, Opt_noacl, Opt_xip, Opt_ignore, Opt_err, Opt_quota, 407 Opt_acl, Opt_noacl, Opt_xip, Opt_dax, Opt_ignore, Opt_err, Opt_quota,
407 Opt_usrquota, Opt_grpquota, Opt_reservation, Opt_noreservation 408 Opt_usrquota, Opt_grpquota, Opt_reservation, Opt_noreservation
408}; 409};
409 410
@@ -432,6 +433,7 @@ static const match_table_t tokens = {
432 {Opt_acl, "acl"}, 433 {Opt_acl, "acl"},
433 {Opt_noacl, "noacl"}, 434 {Opt_noacl, "noacl"},
434 {Opt_xip, "xip"}, 435 {Opt_xip, "xip"},
436 {Opt_dax, "dax"},
435 {Opt_grpquota, "grpquota"}, 437 {Opt_grpquota, "grpquota"},
436 {Opt_ignore, "noquota"}, 438 {Opt_ignore, "noquota"},
437 {Opt_quota, "quota"}, 439 {Opt_quota, "quota"},
@@ -559,10 +561,14 @@ static int parse_options(char *options, struct super_block *sb)
559 break; 561 break;
560#endif 562#endif
561 case Opt_xip: 563 case Opt_xip:
562#ifdef CONFIG_EXT2_FS_XIP 564 ext2_msg(sb, KERN_INFO, "use dax instead of xip");
563 set_opt (sbi->s_mount_opt, XIP); 565 set_opt(sbi->s_mount_opt, XIP);
566 /* Fall through */
567 case Opt_dax:
568#ifdef CONFIG_FS_DAX
569 set_opt(sbi->s_mount_opt, DAX);
564#else 570#else
565 ext2_msg(sb, KERN_INFO, "xip option not supported"); 571 ext2_msg(sb, KERN_INFO, "dax option not supported");
566#endif 572#endif
567 break; 573 break;
568 574
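
The hunk above keeps the old "xip" token alive as a deprecated alias: it prints a hint, records the legacy bit for show_options, and then falls through into the "dax" case so both tokens enable the new behaviour. A minimal userspace sketch of that pattern (token and flag names here are illustrative, not the kernel's):

#include <stdio.h>

#define MOUNT_XIP 0x1UL
#define MOUNT_DAX 0x2UL

enum { OPT_XIP, OPT_DAX };

/* Deprecated-alias pattern: the old token warns, sets its legacy
 * bit (kept only so show_options can still print ",xip"), then
 * falls through so both tokens set the new flag. */
static unsigned long parse_token(int token, unsigned long opts)
{
	switch (token) {
	case OPT_XIP:
		fprintf(stderr, "use dax instead of xip\n");
		opts |= MOUNT_XIP;
		/* fall through */
	case OPT_DAX:
		opts |= MOUNT_DAX;
		break;
	}
	return opts;
}

int main(void)
{
	printf("xip -> %#lx\n", parse_token(OPT_XIP, 0));	/* 0x3 */
	printf("dax -> %#lx\n", parse_token(OPT_DAX, 0));	/* 0x2 */
	return 0;
}
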
@@ -877,9 +883,6 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
877 ((EXT2_SB(sb)->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ? 883 ((EXT2_SB(sb)->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ?
878 MS_POSIXACL : 0); 884 MS_POSIXACL : 0);
879 885
880 ext2_xip_verify_sb(sb); /* see if bdev supports xip, unset
881 EXT2_MOUNT_XIP if not */
882
883 if (le32_to_cpu(es->s_rev_level) == EXT2_GOOD_OLD_REV && 886 if (le32_to_cpu(es->s_rev_level) == EXT2_GOOD_OLD_REV &&
884 (EXT2_HAS_COMPAT_FEATURE(sb, ~0U) || 887 (EXT2_HAS_COMPAT_FEATURE(sb, ~0U) ||
885 EXT2_HAS_RO_COMPAT_FEATURE(sb, ~0U) || 888 EXT2_HAS_RO_COMPAT_FEATURE(sb, ~0U) ||
@@ -909,11 +912,17 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
909 912
910 blocksize = BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size); 913 blocksize = BLOCK_SIZE << le32_to_cpu(sbi->s_es->s_log_block_size);
911 914
912 if (ext2_use_xip(sb) && blocksize != PAGE_SIZE) { 915 if (sbi->s_mount_opt & EXT2_MOUNT_DAX) {
913 if (!silent) 916 if (blocksize != PAGE_SIZE) {
914 ext2_msg(sb, KERN_ERR, 917 ext2_msg(sb, KERN_ERR,
915 "error: unsupported blocksize for xip"); 918 "error: unsupported blocksize for dax");
916 goto failed_mount; 919 goto failed_mount;
920 }
921 if (!sb->s_bdev->bd_disk->fops->direct_access) {
922 ext2_msg(sb, KERN_ERR,
923 "error: device does not support dax");
924 goto failed_mount;
925 }
917 } 926 }
918 927
919 /* If the blocksize doesn't match, re-read the thing.. */ 928 /* If the blocksize doesn't match, re-read the thing.. */
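
Rather than silently downgrading the mount as ext2_xip_verify_sb() used to, the new fill_super code fails the mount outright when the preconditions for DAX are not met. A compilable model of the two checks, with stand-in types (the real test inspects the superblock block size and sb->s_bdev->bd_disk->fops->direct_access):

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SIZE 4096UL

/* Stand-in for bdev->bd_disk->fops; only the presence of the
 * ->direct_access method matters for the check being modelled. */
struct fake_disk_ops {
	long (*direct_access)(void);
};

static long pmem_direct_access(void) { return 0; }

static bool dax_mount_allowed(unsigned long blocksize,
			      const struct fake_disk_ops *ops)
{
	if (blocksize != PAGE_SIZE)
		return false;	/* "unsupported blocksize for dax" */
	if (!ops->direct_access)
		return false;	/* "device does not support dax" */
	return true;
}

int main(void)
{
	struct fake_disk_ops pmem = { .direct_access = pmem_direct_access };
	struct fake_disk_ops hdd  = { .direct_access = NULL };

	printf("%d %d %d\n",
	       dax_mount_allowed(4096, &pmem),	/* 1 */
	       dax_mount_allowed(1024, &pmem),	/* 0 */
	       dax_mount_allowed(4096, &hdd));	/* 0 */
	return 0;
}
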
@@ -1259,7 +1268,6 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
1259{ 1268{
1260 struct ext2_sb_info * sbi = EXT2_SB(sb); 1269 struct ext2_sb_info * sbi = EXT2_SB(sb);
1261 struct ext2_super_block * es; 1270 struct ext2_super_block * es;
1262 unsigned long old_mount_opt = sbi->s_mount_opt;
1263 struct ext2_mount_options old_opts; 1271 struct ext2_mount_options old_opts;
1264 unsigned long old_sb_flags; 1272 unsigned long old_sb_flags;
1265 int err; 1273 int err;
@@ -1284,22 +1292,11 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
1284 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | 1292 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) |
1285 ((sbi->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); 1293 ((sbi->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
1286 1294
1287 ext2_xip_verify_sb(sb); /* see if bdev supports xip, unset
1288 EXT2_MOUNT_XIP if not */
1289
1290 if ((ext2_use_xip(sb)) && (sb->s_blocksize != PAGE_SIZE)) {
1291 ext2_msg(sb, KERN_WARNING,
1292 "warning: unsupported blocksize for xip");
1293 err = -EINVAL;
1294 goto restore_opts;
1295 }
1296
1297 es = sbi->s_es; 1295 es = sbi->s_es;
1298 if ((sbi->s_mount_opt ^ old_mount_opt) & EXT2_MOUNT_XIP) { 1296 if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT2_MOUNT_DAX) {
1299 ext2_msg(sb, KERN_WARNING, "warning: refusing change of " 1297 ext2_msg(sb, KERN_WARNING, "warning: refusing change of "
1300 "xip flag with busy inodes while remounting"); 1298 "dax flag with busy inodes while remounting");
1301 sbi->s_mount_opt &= ~EXT2_MOUNT_XIP; 1299 sbi->s_mount_opt ^= EXT2_MOUNT_DAX;
1302 sbi->s_mount_opt |= old_mount_opt & EXT2_MOUNT_XIP;
1303 } 1300 }
1304 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) { 1301 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) {
1305 spin_unlock(&sbi->s_lock); 1302 spin_unlock(&sbi->s_lock);
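
The remount path only reaches the XOR when the DAX bit in the new options differs from the saved old ones, so a single sbi->s_mount_opt ^= EXT2_MOUNT_DAX restores the previous state in either direction, replacing the old two-statement clear-then-restore. A small self-check of that property:

#include <assert.h>

#define MOUNT_DAX 0x200UL

/* Called only when (new ^ old) & MOUNT_DAX is set, i.e. the user
 * tried to toggle dax on remount; XOR-ing the bit flips it back
 * whichever direction the change went. */
static unsigned long revert_dax(unsigned long new_opts)
{
	return new_opts ^ MOUNT_DAX;
}

int main(void)
{
	unsigned long old1 = 0, new1 = MOUNT_DAX;	/* tried to enable */
	unsigned long old2 = MOUNT_DAX, new2 = 0;	/* tried to disable */

	if ((new1 ^ old1) & MOUNT_DAX)
		new1 = revert_dax(new1);
	if ((new2 ^ old2) & MOUNT_DAX)
		new2 = revert_dax(new2);

	assert(new1 == old1 && new2 == old2);
	return 0;
}
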
diff --git a/fs/ext2/xip.c b/fs/ext2/xip.c
deleted file mode 100644
index e98171a11cfe..000000000000
--- a/fs/ext2/xip.c
+++ /dev/null
@@ -1,91 +0,0 @@
1/*
2 * linux/fs/ext2/xip.c
3 *
4 * Copyright (C) 2005 IBM Corporation
5 * Author: Carsten Otte (cotte@de.ibm.com)
6 */
7
8#include <linux/mm.h>
9#include <linux/fs.h>
10#include <linux/genhd.h>
11#include <linux/buffer_head.h>
12#include <linux/blkdev.h>
13#include "ext2.h"
14#include "xip.h"
15
16static inline int
17__inode_direct_access(struct inode *inode, sector_t block,
18 void **kaddr, unsigned long *pfn)
19{
20 struct block_device *bdev = inode->i_sb->s_bdev;
21 const struct block_device_operations *ops = bdev->bd_disk->fops;
22 sector_t sector;
23
24 sector = block * (PAGE_SIZE / 512); /* ext2 block to bdev sector */
25
26 BUG_ON(!ops->direct_access);
27 return ops->direct_access(bdev, sector, kaddr, pfn);
28}
29
30static inline int
31__ext2_get_block(struct inode *inode, pgoff_t pgoff, int create,
32 sector_t *result)
33{
34 struct buffer_head tmp;
35 int rc;
36
37 memset(&tmp, 0, sizeof(struct buffer_head));
38 tmp.b_size = 1 << inode->i_blkbits;
39 rc = ext2_get_block(inode, pgoff, &tmp, create);
40 *result = tmp.b_blocknr;
41
42 /* did we get a sparse block (hole in the file)? */
43 if (!tmp.b_blocknr && !rc) {
44 BUG_ON(create);
45 rc = -ENODATA;
46 }
47
48 return rc;
49}
50
51int
52ext2_clear_xip_target(struct inode *inode, sector_t block)
53{
54 void *kaddr;
55 unsigned long pfn;
56 int rc;
57
58 rc = __inode_direct_access(inode, block, &kaddr, &pfn);
59 if (!rc)
60 clear_page(kaddr);
61 return rc;
62}
63
64void ext2_xip_verify_sb(struct super_block *sb)
65{
66 struct ext2_sb_info *sbi = EXT2_SB(sb);
67
68 if ((sbi->s_mount_opt & EXT2_MOUNT_XIP) &&
69 !sb->s_bdev->bd_disk->fops->direct_access) {
70 sbi->s_mount_opt &= (~EXT2_MOUNT_XIP);
71 ext2_msg(sb, KERN_WARNING,
72 "warning: ignoring xip option - "
73 "not supported by bdev");
74 }
75}
76
77int ext2_get_xip_mem(struct address_space *mapping, pgoff_t pgoff, int create,
78 void **kmem, unsigned long *pfn)
79{
80 int rc;
81 sector_t block;
82
83 /* first, retrieve the sector number */
84 rc = __ext2_get_block(mapping->host, pgoff, create, &block);
85 if (rc)
86 return rc;
87
88 /* retrieve address of the target data */
89 rc = __inode_direct_access(mapping->host, block, kmem, pfn);
90 return rc;
91}
diff --git a/fs/ext2/xip.h b/fs/ext2/xip.h
deleted file mode 100644
index 18b34d2f31b3..000000000000
--- a/fs/ext2/xip.h
+++ /dev/null
@@ -1,26 +0,0 @@
1/*
2 * linux/fs/ext2/xip.h
3 *
4 * Copyright (C) 2005 IBM Corporation
5 * Author: Carsten Otte (cotte@de.ibm.com)
6 */
7
8#ifdef CONFIG_EXT2_FS_XIP
9extern void ext2_xip_verify_sb (struct super_block *);
10extern int ext2_clear_xip_target (struct inode *, sector_t);
11
12static inline int ext2_use_xip (struct super_block *sb)
13{
14 struct ext2_sb_info *sbi = EXT2_SB(sb);
15 return (sbi->s_mount_opt & EXT2_MOUNT_XIP);
16}
17int ext2_get_xip_mem(struct address_space *, pgoff_t, int,
18 void **, unsigned long *);
19#define mapping_is_xip(map) unlikely(map->a_ops->get_xip_mem)
20#else
21#define mapping_is_xip(map) 0
22#define ext2_xip_verify_sb(sb) do { } while (0)
23#define ext2_use_xip(sb) 0
24#define ext2_clear_xip_target(inode, chain) 0
25#define ext2_get_xip_mem NULL
26#endif
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 9b4e7d750d4f..d4dbf3c259b3 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -466,6 +466,8 @@ static void ext3_put_super (struct super_block * sb)
466 } 466 }
467 sb->s_fs_info = NULL; 467 sb->s_fs_info = NULL;
468 kfree(sbi->s_blockgroup_lock); 468 kfree(sbi->s_blockgroup_lock);
469 mutex_destroy(&sbi->s_orphan_lock);
470 mutex_destroy(&sbi->s_resize_lock);
469 kfree(sbi); 471 kfree(sbi);
470} 472}
471 473
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index a75fba67bb1f..982d934fd9ac 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -965,6 +965,11 @@ struct ext4_inode_info {
965#define EXT4_MOUNT_ERRORS_MASK 0x00070 965#define EXT4_MOUNT_ERRORS_MASK 0x00070
966#define EXT4_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */ 966#define EXT4_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */
967#define EXT4_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/ 967#define EXT4_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/
968#ifdef CONFIG_FS_DAX
969#define EXT4_MOUNT_DAX 0x00200 /* Direct Access */
970#else
971#define EXT4_MOUNT_DAX 0
972#endif
968#define EXT4_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */ 973#define EXT4_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */
969#define EXT4_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */ 974#define EXT4_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */
970#define EXT4_MOUNT_ORDERED_DATA 0x00800 /* Flush data before commit */ 975#define EXT4_MOUNT_ORDERED_DATA 0x00800 /* Flush data before commit */
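
Defining EXT4_MOUNT_DAX to 0 when CONFIG_FS_DAX is off lets every test_opt(sb, DAX) site compile unconditionally while the optimizer discards the dead branch, since (opts & 0) is a constant false. A sketch of the trick outside the kernel (test_opt modelled as a plain macro):

#include <stdio.h>

/* Uncomment to see the dax branch come back to life. */
/* #define CONFIG_FS_DAX 1 */

#ifdef CONFIG_FS_DAX
#define MOUNT_DAX 0x200
#else
#define MOUNT_DAX 0	/* (opts & 0) is always false; branch folds away */
#endif

#define test_opt(opts, flag) ((opts) & (flag))

int main(void)
{
	unsigned long opts = 0x200;

	if (test_opt(opts, MOUNT_DAX))
		puts("dax path");
	else
		puts("page-cache path");
	return 0;
}
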
@@ -2578,6 +2583,7 @@ extern const struct file_operations ext4_dir_operations;
2578/* file.c */ 2583/* file.c */
2579extern const struct inode_operations ext4_file_inode_operations; 2584extern const struct inode_operations ext4_file_inode_operations;
2580extern const struct file_operations ext4_file_operations; 2585extern const struct file_operations ext4_file_operations;
2586extern const struct file_operations ext4_dax_file_operations;
2581extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin); 2587extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin);
2582 2588
2583/* inline.c */ 2589/* inline.c */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index e5d3eadf47b1..bed43081720f 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -5166,8 +5166,8 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
5166 5166
5167 /* fallback to generic here if not in extents fmt */ 5167 /* fallback to generic here if not in extents fmt */
5168 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) 5168 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
5169 return __generic_block_fiemap(inode, fieinfo, start, len, 5169 return generic_block_fiemap(inode, fieinfo, start, len,
5170 ext4_get_block); 5170 ext4_get_block);
5171 5171
5172 if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS)) 5172 if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS))
5173 return -EBADR; 5173 return -EBADR;
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 513c12cf444c..33a09da16c9c 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -95,7 +95,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
95 struct inode *inode = file_inode(iocb->ki_filp); 95 struct inode *inode = file_inode(iocb->ki_filp);
96 struct mutex *aio_mutex = NULL; 96 struct mutex *aio_mutex = NULL;
97 struct blk_plug plug; 97 struct blk_plug plug;
98 int o_direct = file->f_flags & O_DIRECT; 98 int o_direct = io_is_direct(file);
99 int overwrite = 0; 99 int overwrite = 0;
100 size_t length = iov_iter_count(from); 100 size_t length = iov_iter_count(from);
101 ssize_t ret; 101 ssize_t ret;
@@ -191,17 +191,41 @@ errout:
191 return ret; 191 return ret;
192} 192}
193 193
194#ifdef CONFIG_FS_DAX
195static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
196{
197 return dax_fault(vma, vmf, ext4_get_block);
198 /* Is this the right get_block? */
199}
200
201static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
202{
203 return dax_mkwrite(vma, vmf, ext4_get_block);
204}
205
206static const struct vm_operations_struct ext4_dax_vm_ops = {
207 .fault = ext4_dax_fault,
208 .page_mkwrite = ext4_dax_mkwrite,
209};
210#else
211#define ext4_dax_vm_ops ext4_file_vm_ops
212#endif
213
194static const struct vm_operations_struct ext4_file_vm_ops = { 214static const struct vm_operations_struct ext4_file_vm_ops = {
195 .fault = filemap_fault, 215 .fault = filemap_fault,
196 .map_pages = filemap_map_pages, 216 .map_pages = filemap_map_pages,
197 .page_mkwrite = ext4_page_mkwrite, 217 .page_mkwrite = ext4_page_mkwrite,
198 .remap_pages = generic_file_remap_pages,
199}; 218};
200 219
201static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) 220static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
202{ 221{
203 file_accessed(file); 222 file_accessed(file);
204 vma->vm_ops = &ext4_file_vm_ops; 223 if (IS_DAX(file_inode(file))) {
224 vma->vm_ops = &ext4_dax_vm_ops;
225 vma->vm_flags |= VM_MIXEDMAP;
226 } else {
227 vma->vm_ops = &ext4_file_vm_ops;
228 }
205 return 0; 229 return 0;
206} 230}
207 231
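
ext4_file_mmap() now chooses the vma operations at mmap time; a DAX mapping additionally needs VM_MIXEDMAP because it inserts raw pfns that have no struct page behind them. A condensed model of the dispatch (types and the flag value are stand-ins):

#include <stdio.h>

#define VM_MIXEDMAP 0x1UL

struct vm_ops { const char *name; };

static const struct vm_ops dax_vm_ops  = { "dax" };
static const struct vm_ops file_vm_ops = { "page cache" };

struct fake_vma {
	const struct vm_ops *ops;
	unsigned long flags;
};

/* Model of the mmap-time dispatch: the inode's DAX-ness decides
 * which fault handlers the vma gets, and DAX vmas also carry
 * VM_MIXEDMAP because they map pfns without struct pages. */
static void fake_mmap(struct fake_vma *vma, int inode_is_dax)
{
	if (inode_is_dax) {
		vma->ops = &dax_vm_ops;
		vma->flags |= VM_MIXEDMAP;
	} else {
		vma->ops = &file_vm_ops;
	}
}

int main(void)
{
	struct fake_vma a = { 0 }, b = { 0 };

	fake_mmap(&a, 1);
	fake_mmap(&b, 0);
	printf("%s/%lx %s/%lx\n", a.ops->name, a.flags, b.ops->name, b.flags);
	return 0;
}
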
@@ -273,19 +297,24 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
273 * we determine whether this extent is data or a hole according to 297 * we determine whether this extent is data or a hole according to
274 * whether the page cache has data or not. 298 * whether the page cache has data or not.
275 */ 299 */
276static int ext4_find_unwritten_pgoff(struct inode *inode, int whence, 300static int ext4_find_unwritten_pgoff(struct inode *inode,
277 loff_t endoff, loff_t *offset) 301 int whence,
302 struct ext4_map_blocks *map,
303 loff_t *offset)
278{ 304{
279 struct pagevec pvec; 305 struct pagevec pvec;
306 unsigned int blkbits;
280 pgoff_t index; 307 pgoff_t index;
281 pgoff_t end; 308 pgoff_t end;
309 loff_t endoff;
282 loff_t startoff; 310 loff_t startoff;
283 loff_t lastoff; 311 loff_t lastoff;
284 int found = 0; 312 int found = 0;
285 313
314 blkbits = inode->i_sb->s_blocksize_bits;
286 startoff = *offset; 315 startoff = *offset;
287 lastoff = startoff; 316 lastoff = startoff;
288 317 endoff = (loff_t)(map->m_lblk + map->m_len) << blkbits;
289 318
290 index = startoff >> PAGE_CACHE_SHIFT; 319 index = startoff >> PAGE_CACHE_SHIFT;
291 end = endoff >> PAGE_CACHE_SHIFT; 320 end = endoff >> PAGE_CACHE_SHIFT;
@@ -403,144 +432,147 @@ out:
403static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize) 432static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
404{ 433{
405 struct inode *inode = file->f_mapping->host; 434 struct inode *inode = file->f_mapping->host;
406 struct fiemap_extent_info fie; 435 struct ext4_map_blocks map;
407 struct fiemap_extent ext[2]; 436 struct extent_status es;
408 loff_t next; 437 ext4_lblk_t start, last, end;
409 int i, ret = 0; 438 loff_t dataoff, isize;
439 int blkbits;
440 int ret = 0;
410 441
411 mutex_lock(&inode->i_mutex); 442 mutex_lock(&inode->i_mutex);
412 if (offset >= inode->i_size) { 443
444 isize = i_size_read(inode);
445 if (offset >= isize) {
413 mutex_unlock(&inode->i_mutex); 446 mutex_unlock(&inode->i_mutex);
414 return -ENXIO; 447 return -ENXIO;
415 } 448 }
416 fie.fi_flags = 0; 449
417 fie.fi_extents_max = 2; 450 blkbits = inode->i_sb->s_blocksize_bits;
418 fie.fi_extents_start = (struct fiemap_extent __user *) &ext; 451 start = offset >> blkbits;
419 while (1) { 452 last = start;
420 mm_segment_t old_fs = get_fs(); 453 end = isize >> blkbits;
421 454 dataoff = offset;
422 fie.fi_extents_mapped = 0; 455
423 memset(ext, 0, sizeof(*ext) * fie.fi_extents_max); 456 do {
424 457 map.m_lblk = last;
425 set_fs(get_ds()); 458 map.m_len = end - last + 1;
426 ret = ext4_fiemap(inode, &fie, offset, maxsize - offset); 459 ret = ext4_map_blocks(NULL, inode, &map, 0);
427 set_fs(old_fs); 460 if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
428 if (ret) 461 if (last != start)
462 dataoff = (loff_t)last << blkbits;
429 break; 463 break;
464 }
430 465
431 /* No extents found, EOF */ 466 /*
432 if (!fie.fi_extents_mapped) { 467 * If there is a delayed extent at this offset,
433 ret = -ENXIO; 468 * it will be treated as data.
469 */
470 ext4_es_find_delayed_extent_range(inode, last, last, &es);
471 if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
472 if (last != start)
473 dataoff = (loff_t)last << blkbits;
434 break; 474 break;
435 } 475 }
436 for (i = 0; i < fie.fi_extents_mapped; i++) {
437 next = (loff_t)(ext[i].fe_length + ext[i].fe_logical);
438 476
439 if (offset < (loff_t)ext[i].fe_logical) 477 /*
440 offset = (loff_t)ext[i].fe_logical; 478 * If there is an unwritten extent at this offset,
441 /* 479 * it will be treated as data or a hole according to
442 * If extent is not unwritten, then it contains valid 480 * whether the page cache has data.
443 * data, mapped or delayed. 481 */
444 */ 482 if (map.m_flags & EXT4_MAP_UNWRITTEN) {
445 if (!(ext[i].fe_flags & FIEMAP_EXTENT_UNWRITTEN)) 483 int unwritten;
446 goto out; 484 unwritten = ext4_find_unwritten_pgoff(inode, SEEK_DATA,
485 &map, &dataoff);
486 if (unwritten)
487 break;
488 }
447 489
448 /* 490 last++;
449 * If there is a unwritten extent at this offset, 491 dataoff = (loff_t)last << blkbits;
450 * it will be as a data or a hole according to page 492 } while (last <= end);
451 * cache that has data or not.
452 */
453 if (ext4_find_unwritten_pgoff(inode, SEEK_DATA,
454 next, &offset))
455 goto out;
456 493
457 if (ext[i].fe_flags & FIEMAP_EXTENT_LAST) {
458 ret = -ENXIO;
459 goto out;
460 }
461 offset = next;
462 }
463 }
464 if (offset > inode->i_size)
465 offset = inode->i_size;
466out:
467 mutex_unlock(&inode->i_mutex); 494 mutex_unlock(&inode->i_mutex);
468 if (ret)
469 return ret;
470 495
471 return vfs_setpos(file, offset, maxsize); 496 if (dataoff > isize)
497 return -ENXIO;
498
499 return vfs_setpos(file, dataoff, maxsize);
472} 500}
473 501
474/* 502/*
475 * ext4_seek_hole() retrieves the offset for SEEK_HOLE 503 * ext4_seek_hole() retrieves the offset for SEEK_HOLE.
476 */ 504 */
477static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize) 505static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
478{ 506{
479 struct inode *inode = file->f_mapping->host; 507 struct inode *inode = file->f_mapping->host;
480 struct fiemap_extent_info fie; 508 struct ext4_map_blocks map;
481 struct fiemap_extent ext[2]; 509 struct extent_status es;
482 loff_t next; 510 ext4_lblk_t start, last, end;
483 int i, ret = 0; 511 loff_t holeoff, isize;
512 int blkbits;
513 int ret = 0;
484 514
485 mutex_lock(&inode->i_mutex); 515 mutex_lock(&inode->i_mutex);
486 if (offset >= inode->i_size) { 516
517 isize = i_size_read(inode);
518 if (offset >= isize) {
487 mutex_unlock(&inode->i_mutex); 519 mutex_unlock(&inode->i_mutex);
488 return -ENXIO; 520 return -ENXIO;
489 } 521 }
490 522
491 fie.fi_flags = 0; 523 blkbits = inode->i_sb->s_blocksize_bits;
492 fie.fi_extents_max = 2; 524 start = offset >> blkbits;
493 fie.fi_extents_start = (struct fiemap_extent __user *)&ext; 525 last = start;
494 while (1) { 526 end = isize >> blkbits;
495 mm_segment_t old_fs = get_fs(); 527 holeoff = offset;
496 528
497 fie.fi_extents_mapped = 0; 529 do {
498 memset(ext, 0, sizeof(*ext)); 530 map.m_lblk = last;
499 531 map.m_len = end - last + 1;
500 set_fs(get_ds()); 532 ret = ext4_map_blocks(NULL, inode, &map, 0);
501 ret = ext4_fiemap(inode, &fie, offset, maxsize - offset); 533 if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
502 set_fs(old_fs); 534 last += ret;
503 if (ret) 535 holeoff = (loff_t)last << blkbits;
504 break; 536 continue;
537 }
505 538
506 /* No extents found */ 539 /*
507 if (!fie.fi_extents_mapped) 540 * If there is a delayed extent at this offset,
508 break; 541 * we will skip this extent.
542 */
543 ext4_es_find_delayed_extent_range(inode, last, last, &es);
544 if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
545 last = es.es_lblk + es.es_len;
546 holeoff = (loff_t)last << blkbits;
547 continue;
548 }
509 549
510 for (i = 0; i < fie.fi_extents_mapped; i++) { 550 /*
511 next = (loff_t)(ext[i].fe_logical + ext[i].fe_length); 551 * If there is an unwritten extent at this offset,
512 /* 552 * it will be treated as data or a hole according to
513 * If extent is not unwritten, then it contains valid 553 * whether the page cache has data.
514 * data, mapped or delayed. 554 */
515 */ 555 if (map.m_flags & EXT4_MAP_UNWRITTEN) {
516 if (!(ext[i].fe_flags & FIEMAP_EXTENT_UNWRITTEN)) { 556 int unwritten;
517 if (offset < (loff_t)ext[i].fe_logical) 557 unwritten = ext4_find_unwritten_pgoff(inode, SEEK_HOLE,
518 goto out; 558 &map, &holeoff);
519 offset = next; 559 if (!unwritten) {
560 last += ret;
561 holeoff = (loff_t)last << blkbits;
520 continue; 562 continue;
521 } 563 }
522 /*
523 * If there is a unwritten extent at this offset,
524 * it will be as a data or a hole according to page
525 * cache that has data or not.
526 */
527 if (ext4_find_unwritten_pgoff(inode, SEEK_HOLE,
528 next, &offset))
529 goto out;
530
531 offset = next;
532 if (ext[i].fe_flags & FIEMAP_EXTENT_LAST)
533 goto out;
534 } 564 }
535 } 565
536 if (offset > inode->i_size) 566 /* find a hole */
537 offset = inode->i_size; 567 break;
538out: 568 } while (last <= end);
569
539 mutex_unlock(&inode->i_mutex); 570 mutex_unlock(&inode->i_mutex);
540 if (ret)
541 return ret;
542 571
543 return vfs_setpos(file, offset, maxsize); 572 if (holeoff > isize)
573 holeoff = isize;
574
575 return vfs_setpos(file, holeoff, maxsize);
544} 576}
545 577
546/* 578/*
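
The rewritten SEEK_DATA path walks logical block ranges with ext4_map_blocks() instead of calling ext4_fiemap() under set_fs(): written and delayed extents terminate the scan as data, while unwritten extents defer to the page cache. A toy model of the loop over a 16-block file (classification simplified; the unwritten/page-cache case is omitted):

#include <stdio.h>

enum ext_kind { HOLE, WRITTEN, DELAYED, UNWRITTEN };

/* Per-block classification of a toy file; unlisted entries
 * default to HOLE (enum value 0). */
static const enum ext_kind blocks[16] = { HOLE, HOLE, DELAYED, WRITTEN };

/* Shape of the SEEK_DATA loop: advance block by block until
 * something that counts as data is found. */
static long seek_data_blk(long start, long nblocks)
{
	long blk;

	for (blk = start; blk < nblocks; blk++)
		if (blocks[blk] == WRITTEN || blocks[blk] == DELAYED)
			return blk;
	return -1;	/* maps to -ENXIO: no data at or past start */
}

int main(void)
{
	printf("%ld\n", seek_data_blk(0, 16));	/* 2: first delayed block */
	printf("%ld\n", seek_data_blk(4, 16));	/* -1: only holes remain */
	return 0;
}
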
@@ -592,6 +624,26 @@ const struct file_operations ext4_file_operations = {
592 .fallocate = ext4_fallocate, 624 .fallocate = ext4_fallocate,
593}; 625};
594 626
627#ifdef CONFIG_FS_DAX
628const struct file_operations ext4_dax_file_operations = {
629 .llseek = ext4_llseek,
630 .read = new_sync_read,
631 .write = new_sync_write,
632 .read_iter = generic_file_read_iter,
633 .write_iter = ext4_file_write_iter,
634 .unlocked_ioctl = ext4_ioctl,
635#ifdef CONFIG_COMPAT
636 .compat_ioctl = ext4_compat_ioctl,
637#endif
638 .mmap = ext4_file_mmap,
639 .open = ext4_file_open,
640 .release = ext4_release_file,
641 .fsync = ext4_sync_file,
642 /* Splice not yet supported with DAX */
643 .fallocate = ext4_fallocate,
644};
645#endif
646
595const struct inode_operations ext4_file_inode_operations = { 647const struct inode_operations ext4_file_inode_operations = {
596 .setattr = ext4_setattr, 648 .setattr = ext4_setattr,
597 .getattr = ext4_getattr, 649 .getattr = ext4_getattr,
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index 36b369697a13..6b9878a24182 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -689,14 +689,22 @@ retry:
689 inode_dio_done(inode); 689 inode_dio_done(inode);
690 goto locked; 690 goto locked;
691 } 691 }
692 ret = __blockdev_direct_IO(rw, iocb, inode, 692 if (IS_DAX(inode))
693 inode->i_sb->s_bdev, iter, offset, 693 ret = dax_do_io(rw, iocb, inode, iter, offset,
694 ext4_get_block, NULL, NULL, 0); 694 ext4_get_block, NULL, 0);
695 else
696 ret = __blockdev_direct_IO(rw, iocb, inode,
697 inode->i_sb->s_bdev, iter, offset,
698 ext4_get_block, NULL, NULL, 0);
695 inode_dio_done(inode); 699 inode_dio_done(inode);
696 } else { 700 } else {
697locked: 701locked:
698 ret = blockdev_direct_IO(rw, iocb, inode, iter, 702 if (IS_DAX(inode))
699 offset, ext4_get_block); 703 ret = dax_do_io(rw, iocb, inode, iter, offset,
704 ext4_get_block, NULL, DIO_LOCKING);
705 else
706 ret = blockdev_direct_IO(rw, iocb, inode, iter,
707 offset, ext4_get_block);
700 708
701 if (unlikely((rw & WRITE) && ret < 0)) { 709 if (unlikely((rw & WRITE) && ret < 0)) {
702 loff_t isize = i_size_read(inode); 710 loff_t isize = i_size_read(inode);
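
Both branches of ext4_ind_direct_IO() now share the same shape: probe IS_DAX(inode) and route the request either to dax_do_io(), which copies through the persistent-memory mapping without building bios, or to the ordinary blockdev direct-IO engine. The dispatch in isolation, with stub backends standing in for the kernel functions:

#include <stdio.h>

struct fake_inode { int s_dax; };
#define IS_DAX(i) ((i)->s_dax)

static long dax_do_io_stub(const char *what)
{ printf("dax_do_io: %s\n", what); return 0; }

static long blockdev_dio_stub(const char *what)
{ printf("__blockdev_direct_IO: %s\n", what); return 0; }

/* The patch repeats this two-way split in both the locked and
 * unlocked branches: same get_block callback, different engine. */
static long direct_io(struct fake_inode *inode, const char *what)
{
	if (IS_DAX(inode))
		return dax_do_io_stub(what);
	return blockdev_dio_stub(what);
}

int main(void)
{
	struct fake_inode pmem = { 1 }, disk = { 0 };

	direct_io(&pmem, "read");
	direct_io(&disk, "read");
	return 0;
}
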
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 5653fa42930b..85404f15e53a 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -657,6 +657,18 @@ has_zeroout:
657 return retval; 657 return retval;
658} 658}
659 659
660static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
661{
662 struct inode *inode = bh->b_assoc_map->host;
663 /* XXX: breaks on 32-bit > 16GB. Is that even supported? */
664 loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits;
665 int err;
666 if (!uptodate)
667 return;
668 WARN_ON(!buffer_unwritten(bh));
669 err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size);
670}
671
660/* Maximum number of blocks we map for direct IO at once. */ 672/* Maximum number of blocks we map for direct IO at once. */
661#define DIO_MAX_BLOCKS 4096 673#define DIO_MAX_BLOCKS 4096
662 674
@@ -694,6 +706,11 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
694 706
695 map_bh(bh, inode->i_sb, map.m_pblk); 707 map_bh(bh, inode->i_sb, map.m_pblk);
696 bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; 708 bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
709 if (IS_DAX(inode) && buffer_unwritten(bh) && !io_end) {
710 bh->b_assoc_map = inode->i_mapping;
711 bh->b_private = (void *)(unsigned long)iblock;
712 bh->b_end_io = ext4_end_io_unwritten;
713 }
697 if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN) 714 if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN)
698 set_buffer_defer_completion(bh); 715 set_buffer_defer_completion(bh);
699 bh->b_size = inode->i_sb->s_blocksize * map.m_len; 716 bh->b_size = inode->i_sb->s_blocksize * map.m_len;
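
To let ext4_end_io_unwritten() recover the byte offset, _ext4_get_block() smuggles the logical block number through bh->b_private as a cast integer; the XXX above notes that this truncates where pointers are narrower than the block range. The round-trip idiom by itself:

#include <stdio.h>

/* Round-tripping an integer through a void * works only while the
 * value fits in a pointer -- hence the patch's XXX about 32-bit
 * kernels with offsets past what 32 bits of blocks can express. */
int main(void)
{
	unsigned long iblock = 123456;
	unsigned int blkbits = 12;	/* 4K blocks */
	void *cookie;
	long long offset;

	cookie = (void *)iblock;				/* store */
	offset = (long long)(unsigned long)cookie << blkbits;	/* load */

	printf("offset = %lld\n", offset);	/* 505675776 */
	return 0;
}
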
@@ -3010,13 +3027,14 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb,
3010 get_block_func = ext4_get_block_write; 3027 get_block_func = ext4_get_block_write;
3011 dio_flags = DIO_LOCKING; 3028 dio_flags = DIO_LOCKING;
3012 } 3029 }
3013 ret = __blockdev_direct_IO(rw, iocb, inode, 3030 if (IS_DAX(inode))
3014 inode->i_sb->s_bdev, iter, 3031 ret = dax_do_io(rw, iocb, inode, iter, offset, get_block_func,
3015 offset, 3032 ext4_end_io_dio, dio_flags);
3016 get_block_func, 3033 else
3017 ext4_end_io_dio, 3034 ret = __blockdev_direct_IO(rw, iocb, inode,
3018 NULL, 3035 inode->i_sb->s_bdev, iter, offset,
3019 dio_flags); 3036 get_block_func,
3037 ext4_end_io_dio, NULL, dio_flags);
3020 3038
3021 /* 3039 /*
3022 * Put our reference to io_end. This can free the io_end structure e.g. 3040 * Put our reference to io_end. This can free the io_end structure e.g.
@@ -3180,19 +3198,12 @@ void ext4_set_aops(struct inode *inode)
3180 inode->i_mapping->a_ops = &ext4_aops; 3198 inode->i_mapping->a_ops = &ext4_aops;
3181} 3199}
3182 3200
3183/* 3201static int __ext4_block_zero_page_range(handle_t *handle,
3184 * ext4_block_zero_page_range() zeros out a mapping of length 'length'
3185 * starting from file offset 'from'. The range to be zero'd must
3186 * be contained within one block. If the specified range exceeds
3187 * the end of the block, it will be shortened to the end of the block
3188 * that corresponds to 'from'.
3189 */
3190static int ext4_block_zero_page_range(handle_t *handle,
3191 struct address_space *mapping, loff_t from, loff_t length) 3202 struct address_space *mapping, loff_t from, loff_t length)
3192{ 3203{
3193 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; 3204 ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT;
3194 unsigned offset = from & (PAGE_CACHE_SIZE-1); 3205 unsigned offset = from & (PAGE_CACHE_SIZE-1);
3195 unsigned blocksize, max, pos; 3206 unsigned blocksize, pos;
3196 ext4_lblk_t iblock; 3207 ext4_lblk_t iblock;
3197 struct inode *inode = mapping->host; 3208 struct inode *inode = mapping->host;
3198 struct buffer_head *bh; 3209 struct buffer_head *bh;
@@ -3205,14 +3216,6 @@ static int ext4_block_zero_page_range(handle_t *handle,
3205 return -ENOMEM; 3216 return -ENOMEM;
3206 3217
3207 blocksize = inode->i_sb->s_blocksize; 3218 blocksize = inode->i_sb->s_blocksize;
3208 max = blocksize - (offset & (blocksize - 1));
3209
3210 /*
3211 * correct length if it does not fall between
3212 * 'from' and the end of the block
3213 */
3214 if (length > max || length < 0)
3215 length = max;
3216 3219
3217 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); 3220 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
3218 3221
@@ -3278,6 +3281,33 @@ unlock:
3278} 3281}
3279 3282
3280/* 3283/*
3284 * ext4_block_zero_page_range() zeros out a mapping of length 'length'
3285 * starting from file offset 'from'. The range to be zero'd must
3286 * be contained within one block. If the specified range exceeds
3287 * the end of the block, it will be shortened to the end of the block
3288 * that corresponds to 'from'.
3289 */
3290static int ext4_block_zero_page_range(handle_t *handle,
3291 struct address_space *mapping, loff_t from, loff_t length)
3292{
3293 struct inode *inode = mapping->host;
3294 unsigned offset = from & (PAGE_CACHE_SIZE-1);
3295 unsigned blocksize = inode->i_sb->s_blocksize;
3296 unsigned max = blocksize - (offset & (blocksize - 1));
3297
3298 /*
3299 * correct length if it does not fall between
3300 * 'from' and the end of the block
3301 */
3302 if (length > max || length < 0)
3303 length = max;
3304
3305 if (IS_DAX(inode))
3306 return dax_zero_page_range(inode, from, length, ext4_get_block);
3307 return __ext4_block_zero_page_range(handle, mapping, from, length);
3308}
3309
3310/*
3281 * ext4_block_truncate_page() zeroes out a mapping from file offset `from' 3311 * ext4_block_truncate_page() zeroes out a mapping from file offset `from'
3282 * up to the end of the block which corresponds to `from'. 3312 * up to the end of the block which corresponds to `from'.
3283 * This is required during truncate. We need to physically zero the tail end 3313 * This is required during truncate. We need to physically zero the tail end
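
The clamp that used to live inside the zeroing helper moves into the new ext4_block_zero_page_range() wrapper so the DAX and buffered paths share it. What it computes, in isolation (assumes a power-of-two block size, as in the kernel):

#include <stdio.h>

/* 'max' is the room left between 'from' and the end of its block;
 * out-of-range lengths (too long, or negative) are clamped to it. */
static long long clamp_length(long long from, long long length,
			      unsigned blocksize)
{
	unsigned offset = from & (blocksize - 1);	/* blocksize is 2^n */
	unsigned max = blocksize - offset;

	if (length > max || length < 0)
		length = max;
	return length;
}

int main(void)
{
	printf("%lld\n", clamp_length(4000, 9999, 4096));	/* 96 */
	printf("%lld\n", clamp_length(4000, 50, 4096));		/* 50 */
	printf("%lld\n", clamp_length(4000, -1, 4096));		/* 96 */
	return 0;
}
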
@@ -3798,8 +3828,10 @@ void ext4_set_inode_flags(struct inode *inode)
3798 new_fl |= S_NOATIME; 3828 new_fl |= S_NOATIME;
3799 if (flags & EXT4_DIRSYNC_FL) 3829 if (flags & EXT4_DIRSYNC_FL)
3800 new_fl |= S_DIRSYNC; 3830 new_fl |= S_DIRSYNC;
3831 if (test_opt(inode->i_sb, DAX))
3832 new_fl |= S_DAX;
3801 inode_set_flags(inode, new_fl, 3833 inode_set_flags(inode, new_fl,
3802 S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); 3834 S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX);
3803} 3835}
3804 3836
3805/* Propagate flags from i_flags to EXT4_I(inode)->i_flags */ 3837/* Propagate flags from i_flags to EXT4_I(inode)->i_flags */
@@ -4052,7 +4084,10 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
4052 4084
4053 if (S_ISREG(inode->i_mode)) { 4085 if (S_ISREG(inode->i_mode)) {
4054 inode->i_op = &ext4_file_inode_operations; 4086 inode->i_op = &ext4_file_inode_operations;
4055 inode->i_fop = &ext4_file_operations; 4087 if (test_opt(inode->i_sb, DAX))
4088 inode->i_fop = &ext4_dax_file_operations;
4089 else
4090 inode->i_fop = &ext4_file_operations;
4056 ext4_set_aops(inode); 4091 ext4_set_aops(inode);
4057 } else if (S_ISDIR(inode->i_mode)) { 4092 } else if (S_ISDIR(inode->i_mode)) {
4058 inode->i_op = &ext4_dir_inode_operations; 4093 inode->i_op = &ext4_dir_inode_operations;
@@ -4139,6 +4174,65 @@ static int ext4_inode_blocks_set(handle_t *handle,
4139 return 0; 4174 return 0;
4140} 4175}
4141 4176
4177struct other_inode {
4178 unsigned long orig_ino;
4179 struct ext4_inode *raw_inode;
4180};
4181
4182static int other_inode_match(struct inode * inode, unsigned long ino,
4183 void *data)
4184{
4185 struct other_inode *oi = (struct other_inode *) data;
4186
4187 if ((inode->i_ino != ino) ||
4188 (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW |
4189 I_DIRTY_SYNC | I_DIRTY_DATASYNC)) ||
4190 ((inode->i_state & I_DIRTY_TIME) == 0))
4191 return 0;
4192 spin_lock(&inode->i_lock);
4193 if (((inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW |
4194 I_DIRTY_SYNC | I_DIRTY_DATASYNC)) == 0) &&
4195 (inode->i_state & I_DIRTY_TIME)) {
4196 struct ext4_inode_info *ei = EXT4_I(inode);
4197
4198 inode->i_state &= ~(I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED);
4199 spin_unlock(&inode->i_lock);
4200
4201 spin_lock(&ei->i_raw_lock);
4202 EXT4_INODE_SET_XTIME(i_ctime, inode, oi->raw_inode);
4203 EXT4_INODE_SET_XTIME(i_mtime, inode, oi->raw_inode);
4204 EXT4_INODE_SET_XTIME(i_atime, inode, oi->raw_inode);
4205 ext4_inode_csum_set(inode, oi->raw_inode, ei);
4206 spin_unlock(&ei->i_raw_lock);
4207 trace_ext4_other_inode_update_time(inode, oi->orig_ino);
4208 return -1;
4209 }
4210 spin_unlock(&inode->i_lock);
4211 return -1;
4212}
4213
4214/*
4215 * Opportunistically update the other time fields for other inodes in
4216 * the same inode table block.
4217 */
4218static void ext4_update_other_inodes_time(struct super_block *sb,
4219 unsigned long orig_ino, char *buf)
4220{
4221 struct other_inode oi;
4222 unsigned long ino;
4223 int i, inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
4224 int inode_size = EXT4_INODE_SIZE(sb);
4225
4226 oi.orig_ino = orig_ino;
4227 ino = orig_ino & ~(inodes_per_block - 1);
4228 for (i = 0; i < inodes_per_block; i++, ino++, buf += inode_size) {
4229 if (ino == orig_ino)
4230 continue;
4231 oi.raw_inode = (struct ext4_inode *) buf;
4232 (void) find_inode_nowait(sb, ino, other_inode_match, &oi);
4233 }
4234}
4235
4142/* 4236/*
4143 * Post the struct inode info into an on-disk inode location in the 4237 * Post the struct inode info into an on-disk inode location in the
4144 * buffer-cache. This gobbles the caller's reference to the 4238 * buffer-cache. This gobbles the caller's reference to the
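
ext4_update_other_inodes_time() locates the first inode of the containing table block by masking off the low bits, which relies on inodes_per_block being a power of two, and then visits every sibling slot except the one just written. The indexing on its own:

#include <stdio.h>

int main(void)
{
	unsigned long orig_ino = 1234;
	int inodes_per_block = 16;	/* must be a power of two */
	unsigned long ino = orig_ino & ~(unsigned long)(inodes_per_block - 1);
	int i;

	/* Visit every inode sharing orig_ino's table block, skipping
	 * orig_ino itself -- the shape of the loop in the patch. */
	for (i = 0; i < inodes_per_block; i++, ino++) {
		if (ino == orig_ino)
			continue;
		printf("%lu ", ino);	/* 1232 1233 1235 ... 1247 */
	}
	printf("\n");
	return 0;
}
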
@@ -4248,10 +4342,11 @@ static int ext4_do_update_inode(handle_t *handle,
4248 cpu_to_le16(ei->i_extra_isize); 4342 cpu_to_le16(ei->i_extra_isize);
4249 } 4343 }
4250 } 4344 }
4251
4252 ext4_inode_csum_set(inode, raw_inode, ei); 4345 ext4_inode_csum_set(inode, raw_inode, ei);
4253
4254 spin_unlock(&ei->i_raw_lock); 4346 spin_unlock(&ei->i_raw_lock);
4347 if (inode->i_sb->s_flags & MS_LAZYTIME)
4348 ext4_update_other_inodes_time(inode->i_sb, inode->i_ino,
4349 bh->b_data);
4255 4350
4256 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); 4351 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
4257 rc = ext4_handle_dirty_metadata(handle, NULL, bh); 4352 rc = ext4_handle_dirty_metadata(handle, NULL, bh);
@@ -4534,7 +4629,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
4534 * Truncate pagecache after we've waited for commit 4629 * Truncate pagecache after we've waited for commit
4535 * in data=journal mode to make pages freeable. 4630 * in data=journal mode to make pages freeable.
4536 */ 4631 */
4537 truncate_pagecache(inode, inode->i_size); 4632 truncate_pagecache(inode, inode->i_size);
4538 } 4633 }
4539 /* 4634 /*
4540 * We want to call ext4_truncate() even if attr->ia_size == 4635 * We want to call ext4_truncate() even if attr->ia_size ==
@@ -4840,11 +4935,17 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
4840 * If the inode is marked synchronous, we don't honour that here - doing 4935 * If the inode is marked synchronous, we don't honour that here - doing
4841 * so would cause a commit on atime updates, which we don't bother doing. 4936 * so would cause a commit on atime updates, which we don't bother doing.
4842 * We handle synchronous inodes at the highest possible level. 4937 * We handle synchronous inodes at the highest possible level.
4938 *
4939 * If only the I_DIRTY_TIME flag is set, we can skip everything. If
4940 * I_DIRTY_TIME and I_DIRTY_SYNC are set, the only inode fields we need
4941 * to copy into the on-disk inode structure are the timestamp fields.
4843 */ 4942 */
4844void ext4_dirty_inode(struct inode *inode, int flags) 4943void ext4_dirty_inode(struct inode *inode, int flags)
4845{ 4944{
4846 handle_t *handle; 4945 handle_t *handle;
4847 4946
4947 if (flags == I_DIRTY_TIME)
4948 return;
4848 handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); 4949 handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
4849 if (IS_ERR(handle)) 4950 if (IS_ERR(handle))
4850 goto out; 4951 goto out;
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 2291923dae4e..28fe71a2904c 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -2235,7 +2235,10 @@ retry:
2235 err = PTR_ERR(inode); 2235 err = PTR_ERR(inode);
2236 if (!IS_ERR(inode)) { 2236 if (!IS_ERR(inode)) {
2237 inode->i_op = &ext4_file_inode_operations; 2237 inode->i_op = &ext4_file_inode_operations;
2238 inode->i_fop = &ext4_file_operations; 2238 if (test_opt(inode->i_sb, DAX))
2239 inode->i_fop = &ext4_dax_file_operations;
2240 else
2241 inode->i_fop = &ext4_file_operations;
2239 ext4_set_aops(inode); 2242 ext4_set_aops(inode);
2240 err = ext4_add_nondir(handle, dentry, inode); 2243 err = ext4_add_nondir(handle, dentry, inode);
2241 if (!err && IS_DIRSYNC(dir)) 2244 if (!err && IS_DIRSYNC(dir))
@@ -2299,7 +2302,10 @@ retry:
2299 err = PTR_ERR(inode); 2302 err = PTR_ERR(inode);
2300 if (!IS_ERR(inode)) { 2303 if (!IS_ERR(inode)) {
2301 inode->i_op = &ext4_file_inode_operations; 2304 inode->i_op = &ext4_file_inode_operations;
2302 inode->i_fop = &ext4_file_operations; 2305 if (test_opt(inode->i_sb, DAX))
2306 inode->i_fop = &ext4_dax_file_operations;
2307 else
2308 inode->i_fop = &ext4_file_operations;
2303 ext4_set_aops(inode); 2309 ext4_set_aops(inode);
2304 d_tmpfile(dentry, inode); 2310 d_tmpfile(dentry, inode);
2305 err = ext4_orphan_add(handle, inode); 2311 err = ext4_orphan_add(handle, inode);
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index bf76f405a5f9..8a8ec6293b19 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -24,6 +24,18 @@ int ext4_resize_begin(struct super_block *sb)
24 return -EPERM; 24 return -EPERM;
25 25
26 /* 26 /*
27 * If we are not using the primary superblock/GDT copy don't resize,
28 * because the user tools have no way of handling this. Probably a
29 * bad time to do it anyway.
30 */
31 if (EXT4_SB(sb)->s_sbh->b_blocknr !=
32 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) {
33 ext4_warning(sb, "won't resize using backup superblock at %llu",
34 (unsigned long long)EXT4_SB(sb)->s_sbh->b_blocknr);
35 return -EPERM;
36 }
37
38 /*
27 * We are not allowed to do online-resizing on a filesystem mounted 39 * We are not allowed to do online-resizing on a filesystem mounted
28 * with error, because it can destroy the filesystem easily. 40 * with error, because it can destroy the filesystem easily.
29 */ 41 */
@@ -758,18 +770,6 @@ static int add_new_gdb(handle_t *handle, struct inode *inode,
758 "EXT4-fs: ext4_add_new_gdb: adding group block %lu\n", 770 "EXT4-fs: ext4_add_new_gdb: adding group block %lu\n",
759 gdb_num); 771 gdb_num);
760 772
761 /*
762 * If we are not using the primary superblock/GDT copy don't resize,
763 * because the user tools have no way of handling this. Probably a
764 * bad time to do it anyway.
765 */
766 if (EXT4_SB(sb)->s_sbh->b_blocknr !=
767 le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) {
768 ext4_warning(sb, "won't resize using backup superblock at %llu",
769 (unsigned long long)EXT4_SB(sb)->s_sbh->b_blocknr);
770 return -EPERM;
771 }
772
773 gdb_bh = sb_bread(sb, gdblock); 773 gdb_bh = sb_bread(sb, gdblock);
774 if (!gdb_bh) 774 if (!gdb_bh)
775 return -EIO; 775 return -EIO;
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 43c92b1685cb..1adac6868e6f 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -334,7 +334,7 @@ static void save_error_info(struct super_block *sb, const char *func,
334static int block_device_ejected(struct super_block *sb) 334static int block_device_ejected(struct super_block *sb)
335{ 335{
336 struct inode *bd_inode = sb->s_bdev->bd_inode; 336 struct inode *bd_inode = sb->s_bdev->bd_inode;
337 struct backing_dev_info *bdi = bd_inode->i_mapping->backing_dev_info; 337 struct backing_dev_info *bdi = inode_to_bdi(bd_inode);
338 338
339 return bdi->dev == NULL; 339 return bdi->dev == NULL;
340} 340}
@@ -1046,10 +1046,7 @@ static int ext4_mark_dquot_dirty(struct dquot *dquot);
1046static int ext4_write_info(struct super_block *sb, int type); 1046static int ext4_write_info(struct super_block *sb, int type);
1047static int ext4_quota_on(struct super_block *sb, int type, int format_id, 1047static int ext4_quota_on(struct super_block *sb, int type, int format_id,
1048 struct path *path); 1048 struct path *path);
1049static int ext4_quota_on_sysfile(struct super_block *sb, int type,
1050 int format_id);
1051static int ext4_quota_off(struct super_block *sb, int type); 1049static int ext4_quota_off(struct super_block *sb, int type);
1052static int ext4_quota_off_sysfile(struct super_block *sb, int type);
1053static int ext4_quota_on_mount(struct super_block *sb, int type); 1050static int ext4_quota_on_mount(struct super_block *sb, int type);
1054static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, 1051static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
1055 size_t len, loff_t off); 1052 size_t len, loff_t off);
@@ -1084,16 +1081,6 @@ static const struct quotactl_ops ext4_qctl_operations = {
1084 .get_dqblk = dquot_get_dqblk, 1081 .get_dqblk = dquot_get_dqblk,
1085 .set_dqblk = dquot_set_dqblk 1082 .set_dqblk = dquot_set_dqblk
1086}; 1083};
1087
1088static const struct quotactl_ops ext4_qctl_sysfile_operations = {
1089 .quota_on_meta = ext4_quota_on_sysfile,
1090 .quota_off = ext4_quota_off_sysfile,
1091 .quota_sync = dquot_quota_sync,
1092 .get_info = dquot_get_dqinfo,
1093 .set_info = dquot_set_dqinfo,
1094 .get_dqblk = dquot_get_dqblk,
1095 .set_dqblk = dquot_set_dqblk
1096};
1097#endif 1084#endif
1098 1085
1099static const struct super_operations ext4_sops = { 1086static const struct super_operations ext4_sops = {
@@ -1137,8 +1124,9 @@ enum {
1137 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, 1124 Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
1138 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, 1125 Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota,
1139 Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err, 1126 Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err,
1140 Opt_usrquota, Opt_grpquota, Opt_i_version, 1127 Opt_usrquota, Opt_grpquota, Opt_i_version, Opt_dax,
1141 Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit, 1128 Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit,
1129 Opt_lazytime, Opt_nolazytime,
1142 Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity, 1130 Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity,
1143 Opt_inode_readahead_blks, Opt_journal_ioprio, 1131 Opt_inode_readahead_blks, Opt_journal_ioprio,
1144 Opt_dioread_nolock, Opt_dioread_lock, 1132 Opt_dioread_nolock, Opt_dioread_lock,
@@ -1200,8 +1188,11 @@ static const match_table_t tokens = {
1200 {Opt_barrier, "barrier"}, 1188 {Opt_barrier, "barrier"},
1201 {Opt_nobarrier, "nobarrier"}, 1189 {Opt_nobarrier, "nobarrier"},
1202 {Opt_i_version, "i_version"}, 1190 {Opt_i_version, "i_version"},
1191 {Opt_dax, "dax"},
1203 {Opt_stripe, "stripe=%u"}, 1192 {Opt_stripe, "stripe=%u"},
1204 {Opt_delalloc, "delalloc"}, 1193 {Opt_delalloc, "delalloc"},
1194 {Opt_lazytime, "lazytime"},
1195 {Opt_nolazytime, "nolazytime"},
1205 {Opt_nodelalloc, "nodelalloc"}, 1196 {Opt_nodelalloc, "nodelalloc"},
1206 {Opt_removed, "mblk_io_submit"}, 1197 {Opt_removed, "mblk_io_submit"},
1207 {Opt_removed, "nomblk_io_submit"}, 1198 {Opt_removed, "nomblk_io_submit"},
@@ -1384,6 +1375,7 @@ static const struct mount_opts {
1384 {Opt_min_batch_time, 0, MOPT_GTE0}, 1375 {Opt_min_batch_time, 0, MOPT_GTE0},
1385 {Opt_inode_readahead_blks, 0, MOPT_GTE0}, 1376 {Opt_inode_readahead_blks, 0, MOPT_GTE0},
1386 {Opt_init_itable, 0, MOPT_GTE0}, 1377 {Opt_init_itable, 0, MOPT_GTE0},
1378 {Opt_dax, EXT4_MOUNT_DAX, MOPT_SET},
1387 {Opt_stripe, 0, MOPT_GTE0}, 1379 {Opt_stripe, 0, MOPT_GTE0},
1388 {Opt_resuid, 0, MOPT_GTE0}, 1380 {Opt_resuid, 0, MOPT_GTE0},
1389 {Opt_resgid, 0, MOPT_GTE0}, 1381 {Opt_resgid, 0, MOPT_GTE0},
@@ -1459,6 +1451,12 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
1459 case Opt_i_version: 1451 case Opt_i_version:
1460 sb->s_flags |= MS_I_VERSION; 1452 sb->s_flags |= MS_I_VERSION;
1461 return 1; 1453 return 1;
1454 case Opt_lazytime:
1455 sb->s_flags |= MS_LAZYTIME;
1456 return 1;
1457 case Opt_nolazytime:
1458 sb->s_flags &= ~MS_LAZYTIME;
1459 return 1;
1462 } 1460 }
1463 1461
1464 for (m = ext4_mount_opts; m->token != Opt_err; m++) 1462 for (m = ext4_mount_opts; m->token != Opt_err; m++)
@@ -1620,6 +1618,11 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
1620 } 1618 }
1621 sbi->s_jquota_fmt = m->mount_opt; 1619 sbi->s_jquota_fmt = m->mount_opt;
1622#endif 1620#endif
1621#ifndef CONFIG_FS_DAX
1622 } else if (token == Opt_dax) {
1623 ext4_msg(sb, KERN_INFO, "dax option not supported");
1624 return -1;
1625#endif
1623 } else { 1626 } else {
1624 if (!args->from) 1627 if (!args->from)
1625 arg = 1; 1628 arg = 1;
@@ -3482,7 +3485,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3482 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, 3485 if (EXT4_HAS_RO_COMPAT_FEATURE(sb,
3483 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) && 3486 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) &&
3484 EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) 3487 EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM))
3485 ext4_warning(sb, KERN_INFO "metadata_csum and uninit_bg are " 3488 ext4_warning(sb, "metadata_csum and uninit_bg are "
3486 "redundant flags; please run fsck."); 3489 "redundant flags; please run fsck.");
3487 3490
3488 /* Check for a known checksum algorithm */ 3491 /* Check for a known checksum algorithm */
@@ -3602,6 +3605,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3602 "both data=journal and dioread_nolock"); 3605 "both data=journal and dioread_nolock");
3603 goto failed_mount; 3606 goto failed_mount;
3604 } 3607 }
3608 if (test_opt(sb, DAX)) {
3609 ext4_msg(sb, KERN_ERR, "can't mount with "
3610 "both data=journal and dax");
3611 goto failed_mount;
3612 }
3605 if (test_opt(sb, DELALLOC)) 3613 if (test_opt(sb, DELALLOC))
3606 clear_opt(sb, DELALLOC); 3614 clear_opt(sb, DELALLOC);
3607 } 3615 }
@@ -3665,6 +3673,19 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3665 goto failed_mount; 3673 goto failed_mount;
3666 } 3674 }
3667 3675
3676 if (sbi->s_mount_opt & EXT4_MOUNT_DAX) {
3677 if (blocksize != PAGE_SIZE) {
3678 ext4_msg(sb, KERN_ERR,
3679 "error: unsupported blocksize for dax");
3680 goto failed_mount;
3681 }
3682 if (!sb->s_bdev->bd_disk->fops->direct_access) {
3683 ext4_msg(sb, KERN_ERR,
3684 "error: device does not support dax");
3685 goto failed_mount;
3686 }
3687 }
3688
3668 if (sb->s_blocksize != blocksize) { 3689 if (sb->s_blocksize != blocksize) {
3669 /* Validate the filesystem blocksize */ 3690 /* Validate the filesystem blocksize */
3670 if (!sb_set_blocksize(sb, blocksize)) { 3691 if (!sb_set_blocksize(sb, blocksize)) {
@@ -3935,7 +3956,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
3935#ifdef CONFIG_QUOTA 3956#ifdef CONFIG_QUOTA
3936 sb->dq_op = &ext4_quota_operations; 3957 sb->dq_op = &ext4_quota_operations;
3937 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) 3958 if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA))
3938 sb->s_qcop = &ext4_qctl_sysfile_operations; 3959 sb->s_qcop = &dquot_quotactl_sysfile_ops;
3939 else 3960 else
3940 sb->s_qcop = &ext4_qctl_operations; 3961 sb->s_qcop = &ext4_qctl_operations;
3941 sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP; 3962 sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP;
@@ -4882,6 +4903,18 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
4882 err = -EINVAL; 4903 err = -EINVAL;
4883 goto restore_opts; 4904 goto restore_opts;
4884 } 4905 }
4906 if (test_opt(sb, DAX)) {
4907 ext4_msg(sb, KERN_ERR, "can't mount with "
4908 "both data=journal and dax");
4909 err = -EINVAL;
4910 goto restore_opts;
4911 }
4912 }
4913
4914 if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_DAX) {
4915 ext4_msg(sb, KERN_WARNING, "warning: refusing change of "
4916 "dax flag with busy inodes while remounting");
4917 sbi->s_mount_opt ^= EXT4_MOUNT_DAX;
4885 } 4918 }
4886 4919
4887 if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) 4920 if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED)
@@ -5020,6 +5053,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
5020 } 5053 }
5021#endif 5054#endif
5022 5055
5056 *flags = (*flags & ~MS_LAZYTIME) | (sb->s_flags & MS_LAZYTIME);
5023 ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s", orig_data); 5057 ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s", orig_data);
5024 kfree(orig_data); 5058 kfree(orig_data);
5025 return 0; 5059 return 0;
@@ -5288,21 +5322,6 @@ static int ext4_enable_quotas(struct super_block *sb)
5288 return 0; 5322 return 0;
5289} 5323}
5290 5324
5291/*
5292 * quota_on function that is used when QUOTA feature is set.
5293 */
5294static int ext4_quota_on_sysfile(struct super_block *sb, int type,
5295 int format_id)
5296{
5297 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA))
5298 return -EINVAL;
5299
5300 /*
5301 * USAGE was enabled at mount time. Only need to enable LIMITS now.
5302 */
5303 return ext4_quota_enable(sb, type, format_id, DQUOT_LIMITS_ENABLED);
5304}
5305
5306static int ext4_quota_off(struct super_block *sb, int type) 5325static int ext4_quota_off(struct super_block *sb, int type)
5307{ 5326{
5308 struct inode *inode = sb_dqopt(sb)->files[type]; 5327 struct inode *inode = sb_dqopt(sb)->files[type];
@@ -5329,18 +5348,6 @@ out:
5329 return dquot_quota_off(sb, type); 5348 return dquot_quota_off(sb, type);
5330} 5349}
5331 5350
5332/*
5333 * quota_off function that is used when QUOTA feature is set.
5334 */
5335static int ext4_quota_off_sysfile(struct super_block *sb, int type)
5336{
5337 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA))
5338 return -EINVAL;
5339
5340 /* Disable only the limits. */
5341 return dquot_disable(sb, type, DQUOT_LIMITS_ENABLED);
5342}
5343
5344/* Read data from quotafile - avoid pagecache and such because we cannot afford 5351/* Read data from quotafile - avoid pagecache and such because we cannot afford
5345 * acquiring the locks... As quota files are never truncated and quota code 5352 * acquiring the locks... As quota files are never truncated and quota code
5346 * itself serializes the operations (and no one else should touch the files) 5353 * itself serializes the operations (and no one else should touch the files)
diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig
index 736a348509f7..94e2d2ffabe1 100644
--- a/fs/f2fs/Kconfig
+++ b/fs/f2fs/Kconfig
@@ -71,3 +71,13 @@ config F2FS_CHECK_FS
71 Enables BUG_ONs which check the filesystem consistency in runtime. 71 Enables BUG_ONs which check the filesystem consistency in runtime.
72 72
73 If you want to improve the performance, say N. 73 If you want to improve the performance, say N.
74
75config F2FS_IO_TRACE
76 bool "F2FS IO tracer"
77 depends on F2FS_FS
78 depends on FUNCTION_TRACER
79 help
80 F2FS IO trace is based on the function tracer, which gathers process
81 information and block IO patterns at the filesystem level.
82
83 If unsure, say N.
diff --git a/fs/f2fs/Makefile b/fs/f2fs/Makefile
index 2e35da12d292..d92397731db8 100644
--- a/fs/f2fs/Makefile
+++ b/fs/f2fs/Makefile
@@ -5,3 +5,4 @@ f2fs-y += checkpoint.o gc.o data.o node.o segment.o recovery.o
5f2fs-$(CONFIG_F2FS_STAT_FS) += debug.o 5f2fs-$(CONFIG_F2FS_STAT_FS) += debug.o
6f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o 6f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o
7f2fs-$(CONFIG_F2FS_FS_POSIX_ACL) += acl.o 7f2fs-$(CONFIG_F2FS_FS_POSIX_ACL) += acl.o
8f2fs-$(CONFIG_F2FS_IO_TRACE) += trace.o
diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c
index 1ccb26bc2a0b..742202779bd5 100644
--- a/fs/f2fs/acl.c
+++ b/fs/f2fs/acl.c
@@ -62,7 +62,7 @@ static struct posix_acl *f2fs_acl_from_disk(const char *value, size_t size)
62 if (count == 0) 62 if (count == 0)
63 return NULL; 63 return NULL;
64 64
65 acl = posix_acl_alloc(count, GFP_KERNEL); 65 acl = posix_acl_alloc(count, GFP_NOFS);
66 if (!acl) 66 if (!acl)
67 return ERR_PTR(-ENOMEM); 67 return ERR_PTR(-ENOMEM);
68 68
@@ -116,7 +116,7 @@ static void *f2fs_acl_to_disk(const struct posix_acl *acl, size_t *size)
116 int i; 116 int i;
117 117
118 f2fs_acl = kmalloc(sizeof(struct f2fs_acl_header) + acl->a_count * 118 f2fs_acl = kmalloc(sizeof(struct f2fs_acl_header) + acl->a_count *
119 sizeof(struct f2fs_acl_entry), GFP_KERNEL); 119 sizeof(struct f2fs_acl_entry), GFP_NOFS);
120 if (!f2fs_acl) 120 if (!f2fs_acl)
121 return ERR_PTR(-ENOMEM); 121 return ERR_PTR(-ENOMEM);
122 122
@@ -396,7 +396,7 @@ int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage,
396 posix_acl_release(default_acl); 396 posix_acl_release(default_acl);
397 } 397 }
398 if (acl) { 398 if (acl) {
399 if (error) 399 if (!error)
400 error = __f2fs_set_acl(inode, ACL_TYPE_ACCESS, acl, 400 error = __f2fs_set_acl(inode, ACL_TYPE_ACCESS, acl,
401 ipage); 401 ipage);
402 posix_acl_release(acl); 402 posix_acl_release(acl);
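
The acl.c hunk above is a straight logic fix: the ACL_TYPE_ACCESS ACL was only being written when the preceding default-ACL update had already failed. The corrected chaining pattern, reduced to its skeleton:

#include <stdio.h>

static int step(const char *name, int fail)
{
	printf("%s\n", name);
	return fail ? -1 : 0;
}

int main(void)
{
	int error = 0;

	/* Chain the two ACL writes: the second runs only if the
	 * first succeeded -- 'if (!error)', not 'if (error)'. */
	error = step("set default acl", 0);
	if (!error)
		error = step("set access acl", 0);
	return error ? 1 : 0;
}
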
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index e6c271fefaca..7f794b72b3b7 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -20,10 +20,11 @@
 #include "f2fs.h"
 #include "node.h"
 #include "segment.h"
+#include "trace.h"
 #include <trace/events/f2fs.h>
 
 static struct kmem_cache *ino_entry_slab;
-static struct kmem_cache *inode_entry_slab;
+struct kmem_cache *inode_entry_slab;
 
 /*
  * We guarantee no failure on the returned page.
@@ -50,6 +51,11 @@ struct page *get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index)
 {
 	struct address_space *mapping = META_MAPPING(sbi);
 	struct page *page;
+	struct f2fs_io_info fio = {
+		.type = META,
+		.rw = READ_SYNC | REQ_META | REQ_PRIO,
+		.blk_addr = index,
+	};
 repeat:
 	page = grab_cache_page(mapping, index);
 	if (!page) {
@@ -59,8 +65,7 @@ repeat:
 	if (PageUptodate(page))
 		goto out;
 
-	if (f2fs_submit_page_bio(sbi, page, index,
-				READ_SYNC | REQ_META | REQ_PRIO))
+	if (f2fs_submit_page_bio(sbi, page, &fio))
 		goto repeat;
 
 	lock_page(page);
@@ -112,14 +117,12 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type
 	block_t prev_blk_addr = 0;
 	struct page *page;
 	block_t blkno = start;
-
 	struct f2fs_io_info fio = {
 		.type = META,
 		.rw = READ_SYNC | REQ_META | REQ_PRIO
 	};
 
 	for (; nrpages-- > 0; blkno++) {
-		block_t blk_addr;
 
 		if (!is_valid_blkaddr(sbi, blkno, type))
 			goto out;
@@ -130,27 +133,27 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type
 					NAT_BLOCK_OFFSET(NM_I(sbi)->max_nid)))
 				blkno = 0;
 			/* get nat block addr */
-			blk_addr = current_nat_addr(sbi,
+			fio.blk_addr = current_nat_addr(sbi,
 					blkno * NAT_ENTRY_PER_BLOCK);
 			break;
 		case META_SIT:
 			/* get sit block addr */
-			blk_addr = current_sit_addr(sbi,
+			fio.blk_addr = current_sit_addr(sbi,
 					blkno * SIT_ENTRY_PER_BLOCK);
-			if (blkno != start && prev_blk_addr + 1 != blk_addr)
+			if (blkno != start && prev_blk_addr + 1 != fio.blk_addr)
 				goto out;
-			prev_blk_addr = blk_addr;
+			prev_blk_addr = fio.blk_addr;
 			break;
 		case META_SSA:
 		case META_CP:
 		case META_POR:
-			blk_addr = blkno;
+			fio.blk_addr = blkno;
 			break;
 		default:
 			BUG();
 		}
 
-		page = grab_cache_page(META_MAPPING(sbi), blk_addr);
+		page = grab_cache_page(META_MAPPING(sbi), fio.blk_addr);
 		if (!page)
 			continue;
 		if (PageUptodate(page)) {
@@ -158,7 +161,7 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type
 			continue;
 		}
 
-		f2fs_submit_page_mbio(sbi, page, blk_addr, &fio);
+		f2fs_submit_page_mbio(sbi, page, &fio);
 		f2fs_put_page(page, 0);
 	}
 out:
@@ -187,7 +190,7 @@ static int f2fs_write_meta_page(struct page *page,
 
 	trace_f2fs_writepage(page, META);
 
-	if (unlikely(sbi->por_doing))
+	if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
 		goto redirty_out;
 	if (wbc->for_reclaim && page->index < GET_SUM_BLOCK(sbi, 0))
 		goto redirty_out;
@@ -299,6 +302,8 @@ static int f2fs_set_meta_page_dirty(struct page *page)
 	if (!PageDirty(page)) {
 		__set_page_dirty_nobuffers(page);
 		inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_META);
+		SetPagePrivate(page);
+		f2fs_trace_pid(page);
 		return 1;
 	}
 	return 0;
@@ -308,6 +313,8 @@ const struct address_space_operations f2fs_meta_aops = {
 	.writepage	= f2fs_write_meta_page,
 	.writepages	= f2fs_write_meta_pages,
 	.set_page_dirty	= f2fs_set_meta_page_dirty,
+	.invalidatepage	= f2fs_invalidate_page,
+	.releasepage	= f2fs_release_page,
 };
 
 static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type)
@@ -462,7 +469,7 @@ void recover_orphan_inodes(struct f2fs_sb_info *sbi)
 	if (!is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG))
 		return;
 
-	sbi->por_doing = true;
+	set_sbi_flag(sbi, SBI_POR_DOING);
 
 	start_blk = __start_cp_addr(sbi) + 1 +
 		le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_payload);
@@ -483,7 +490,7 @@ void recover_orphan_inodes(struct f2fs_sb_info *sbi)
 	}
 	/* clear Orphan Flag */
 	clear_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG);
-	sbi->por_doing = false;
+	clear_sbi_flag(sbi, SBI_POR_DOING);
 	return;
 }
 
@@ -567,7 +574,7 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
 	if (crc_offset >= blk_size)
 		goto invalid_cp1;
 
-	crc = le32_to_cpu(*((__u32 *)((unsigned char *)cp_block + crc_offset)));
+	crc = le32_to_cpu(*((__le32 *)((unsigned char *)cp_block + crc_offset)));
 	if (!f2fs_crc_valid(crc, cp_block, crc_offset))
 		goto invalid_cp1;
 
@@ -582,7 +589,7 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi,
 	if (crc_offset >= blk_size)
 		goto invalid_cp2;
 
-	crc = le32_to_cpu(*((__u32 *)((unsigned char *)cp_block + crc_offset)));
+	crc = le32_to_cpu(*((__le32 *)((unsigned char *)cp_block + crc_offset)));
 	if (!f2fs_crc_valid(crc, cp_block, crc_offset))
 		goto invalid_cp2;
 
@@ -669,7 +676,7 @@ fail_no_cp:
 	return -EINVAL;
 }
 
-static int __add_dirty_inode(struct inode *inode, struct dir_inode_entry *new)
+static int __add_dirty_inode(struct inode *inode, struct inode_entry *new)
 {
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 
@@ -686,7 +693,7 @@ static int __add_dirty_inode(struct inode *inode, struct dir_inode_entry *new)
 void update_dirty_page(struct inode *inode, struct page *page)
 {
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
-	struct dir_inode_entry *new;
+	struct inode_entry *new;
 	int ret = 0;
 
 	if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode))
@@ -710,12 +717,13 @@ void update_dirty_page(struct inode *inode, struct page *page)
 		kmem_cache_free(inode_entry_slab, new);
 out:
 	SetPagePrivate(page);
+	f2fs_trace_pid(page);
 }
 
 void add_dirty_dir_inode(struct inode *inode)
 {
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
-	struct dir_inode_entry *new =
+	struct inode_entry *new =
 			f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS);
 	int ret = 0;
 
@@ -733,7 +741,7 @@ void add_dirty_dir_inode(struct inode *inode)
 void remove_dirty_dir_inode(struct inode *inode)
 {
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
-	struct dir_inode_entry *entry;
+	struct inode_entry *entry;
 
 	if (!S_ISDIR(inode->i_mode))
 		return;
@@ -763,7 +771,7 @@ void remove_dirty_dir_inode(struct inode *inode)
 void sync_dirty_dir_inodes(struct f2fs_sb_info *sbi)
 {
 	struct list_head *head;
-	struct dir_inode_entry *entry;
+	struct inode_entry *entry;
 	struct inode *inode;
retry:
 	if (unlikely(f2fs_cp_error(sbi)))
@@ -776,7 +784,7 @@ retry:
 		spin_unlock(&sbi->dir_inode_lock);
 		return;
 	}
-	entry = list_entry(head->next, struct dir_inode_entry, list);
+	entry = list_entry(head->next, struct inode_entry, list);
 	inode = igrab(entry->inode);
 	spin_unlock(&sbi->dir_inode_lock);
 	if (inode) {
@@ -922,7 +930,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 	ckpt->next_free_nid = cpu_to_le32(last_nid);
 
 	/* 2 cp + n data seg summary + orphan inode blocks */
-	data_sum_blocks = npages_for_summary_flush(sbi);
+	data_sum_blocks = npages_for_summary_flush(sbi, false);
 	if (data_sum_blocks < NR_CURSEG_DATA_TYPE)
 		set_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG);
 	else
@@ -932,24 +940,31 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 	ckpt->cp_pack_start_sum = cpu_to_le32(1 + cp_payload_blks +
 			orphan_blocks);
 
-	if (cpc->reason == CP_UMOUNT) {
-		set_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
+	if (__remain_node_summaries(cpc->reason))
 		ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS+
 				cp_payload_blks + data_sum_blocks +
 				orphan_blocks + NR_CURSEG_NODE_TYPE);
-	} else {
-		clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
+	else
 		ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS +
 				cp_payload_blks + data_sum_blocks +
 				orphan_blocks);
-	}
+
+	if (cpc->reason == CP_UMOUNT)
+		set_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
+	else
+		clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG);
+
+	if (cpc->reason == CP_FASTBOOT)
+		set_ckpt_flags(ckpt, CP_FASTBOOT_FLAG);
+	else
+		clear_ckpt_flags(ckpt, CP_FASTBOOT_FLAG);
 
 	if (orphan_num)
 		set_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG);
 	else
 		clear_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG);
 
-	if (sbi->need_fsck)
+	if (is_sbi_flag_set(sbi, SBI_NEED_FSCK))
 		set_ckpt_flags(ckpt, CP_FSCK_FLAG);
 
 	/* update SIT/NAT bitmap */
@@ -966,15 +981,14 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 	/* write out checkpoint buffer at block 0 */
 	cp_page = grab_meta_page(sbi, start_blk++);
 	kaddr = page_address(cp_page);
-	memcpy(kaddr, ckpt, (1 << sbi->log_blocksize));
+	memcpy(kaddr, ckpt, F2FS_BLKSIZE);
 	set_page_dirty(cp_page);
 	f2fs_put_page(cp_page, 1);
 
 	for (i = 1; i < 1 + cp_payload_blks; i++) {
 		cp_page = grab_meta_page(sbi, start_blk++);
 		kaddr = page_address(cp_page);
-		memcpy(kaddr, (char *)ckpt + i * F2FS_BLKSIZE,
-				(1 << sbi->log_blocksize));
+		memcpy(kaddr, (char *)ckpt + i * F2FS_BLKSIZE, F2FS_BLKSIZE);
 		set_page_dirty(cp_page);
 		f2fs_put_page(cp_page, 1);
 	}
@@ -986,7 +1000,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 
 	write_data_summaries(sbi, start_blk);
 	start_blk += data_sum_blocks;
-	if (cpc->reason == CP_UMOUNT) {
+	if (__remain_node_summaries(cpc->reason)) {
 		write_node_summaries(sbi, start_blk);
 		start_blk += NR_CURSEG_NODE_TYPE;
 	}
@@ -994,7 +1008,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 	/* writeout checkpoint block */
 	cp_page = grab_meta_page(sbi, start_blk);
 	kaddr = page_address(cp_page);
-	memcpy(kaddr, ckpt, (1 << sbi->log_blocksize));
+	memcpy(kaddr, ckpt, F2FS_BLKSIZE);
 	set_page_dirty(cp_page);
 	f2fs_put_page(cp_page, 1);
 
@@ -1023,7 +1037,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 		return;
 
 	clear_prefree_segments(sbi);
-	F2FS_RESET_SB_DIRT(sbi);
+	clear_sbi_flag(sbi, SBI_IS_DIRTY);
 }
 
 /*
@@ -1038,10 +1052,13 @@ void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 
 	mutex_lock(&sbi->cp_mutex);
 
-	if (!sbi->s_dirty && cpc->reason != CP_DISCARD)
+	if (!is_sbi_flag_set(sbi, SBI_IS_DIRTY) &&
+			cpc->reason != CP_DISCARD && cpc->reason != CP_UMOUNT)
 		goto out;
 	if (unlikely(f2fs_cp_error(sbi)))
 		goto out;
+	if (f2fs_readonly(sbi->sb))
+		goto out;
 	if (block_operations(sbi))
 		goto out;
 
@@ -1102,8 +1119,8 @@ int __init create_checkpoint_caches(void)
 			sizeof(struct ino_entry));
 	if (!ino_entry_slab)
 		return -ENOMEM;
-	inode_entry_slab = f2fs_kmem_cache_create("f2fs_dirty_dir_entry",
-			sizeof(struct dir_inode_entry));
+	inode_entry_slab = f2fs_kmem_cache_create("f2fs_inode_entry",
+			sizeof(struct inode_entry));
 	if (!inode_entry_slab) {
 		kmem_cache_destroy(ino_entry_slab);
 		return -ENOMEM;
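
Note on the checkpoint.c changes above: the loose per-sbi booleans (s_dirty,
por_doing, need_fsck) collapse into a single s_flag bitfield manipulated
through helpers, whose definitions appear in the f2fs.h hunks later in this
diff. A hedged usage sketch mirroring the conversions above:

	/* before: sbi->por_doing = true; ... sbi->por_doing = false; */
	set_sbi_flag(sbi, SBI_POR_DOING);	/* enter power-off recovery */
	/* ... recovery work ... */
	clear_sbi_flag(sbi, SBI_POR_DOING);	/* leave recovery */

	/* before: if (sbi->s_dirty) */
	if (is_sbi_flag_set(sbi, SBI_IS_DIRTY))
		write_checkpoint(sbi, &cpc);	/* checkpoint is needed */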
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 7ec697b37f19..985ed023a750 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -22,6 +22,7 @@
 #include "f2fs.h"
 #include "node.h"
 #include "segment.h"
+#include "trace.h"
 #include <trace/events/f2fs.h>
 
 static void f2fs_read_end_io(struct bio *bio, int err)
@@ -95,11 +96,9 @@ static void __submit_merged_bio(struct f2fs_bio_info *io)
 		return;
 
 	if (is_read_io(fio->rw))
-		trace_f2fs_submit_read_bio(io->sbi->sb, fio->rw,
-						fio->type, io->bio);
+		trace_f2fs_submit_read_bio(io->sbi->sb, fio, io->bio);
 	else
-		trace_f2fs_submit_write_bio(io->sbi->sb, fio->rw,
-						fio->type, io->bio);
+		trace_f2fs_submit_write_bio(io->sbi->sb, fio, io->bio);
 
 	submit_bio(fio->rw, io->bio);
 	io->bio = NULL;
@@ -132,14 +131,15 @@ void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi,
  * Return unlocked page.
  */
 int f2fs_submit_page_bio(struct f2fs_sb_info *sbi, struct page *page,
-					block_t blk_addr, int rw)
+					struct f2fs_io_info *fio)
 {
 	struct bio *bio;
 
-	trace_f2fs_submit_page_bio(page, blk_addr, rw);
+	trace_f2fs_submit_page_bio(page, fio);
+	f2fs_trace_ios(page, fio, 0);
 
 	/* Allocate a new bio */
-	bio = __bio_alloc(sbi, blk_addr, 1, is_read_io(rw));
+	bio = __bio_alloc(sbi, fio->blk_addr, 1, is_read_io(fio->rw));
 
 	if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) {
 		bio_put(bio);
@@ -147,12 +147,12 @@ int f2fs_submit_page_bio(struct f2fs_sb_info *sbi, struct page *page,
 		return -EFAULT;
 	}
 
-	submit_bio(rw, bio);
+	submit_bio(fio->rw, bio);
 	return 0;
 }
 
 void f2fs_submit_page_mbio(struct f2fs_sb_info *sbi, struct page *page,
-			block_t blk_addr, struct f2fs_io_info *fio)
+			struct f2fs_io_info *fio)
 {
 	enum page_type btype = PAGE_TYPE_OF_BIO(fio->type);
 	struct f2fs_bio_info *io;
@@ -160,21 +160,21 @@ void f2fs_submit_page_mbio(struct f2fs_sb_info *sbi, struct page *page,
 
 	io = is_read ? &sbi->read_io : &sbi->write_io[btype];
 
-	verify_block_addr(sbi, blk_addr);
+	verify_block_addr(sbi, fio->blk_addr);
 
 	down_write(&io->io_rwsem);
 
 	if (!is_read)
 		inc_page_count(sbi, F2FS_WRITEBACK);
 
-	if (io->bio && (io->last_block_in_bio != blk_addr - 1 ||
+	if (io->bio && (io->last_block_in_bio != fio->blk_addr - 1 ||
 						io->fio.rw != fio->rw))
 		__submit_merged_bio(io);
alloc_new:
 	if (io->bio == NULL) {
 		int bio_blocks = MAX_BIO_BLOCKS(sbi);
 
-		io->bio = __bio_alloc(sbi, blk_addr, bio_blocks, is_read);
+		io->bio = __bio_alloc(sbi, fio->blk_addr, bio_blocks, is_read);
 		io->fio = *fio;
 	}
 
@@ -184,10 +184,11 @@ alloc_new:
 		goto alloc_new;
 	}
 
-	io->last_block_in_bio = blk_addr;
+	io->last_block_in_bio = fio->blk_addr;
+	f2fs_trace_ios(page, fio, 0);
 
 	up_write(&io->io_rwsem);
-	trace_f2fs_submit_page_mbio(page, fio->rw, fio->type, blk_addr);
+	trace_f2fs_submit_page_mbio(page, fio);
 }
 
 /*
@@ -196,7 +197,7 @@ alloc_new:
  *  ->node_page
  *    update block addresses in the node page
  */
-static void __set_data_blkaddr(struct dnode_of_data *dn, block_t new_addr)
+static void __set_data_blkaddr(struct dnode_of_data *dn)
 {
 	struct f2fs_node *rn;
 	__le32 *addr_array;
@@ -209,7 +210,7 @@ static void __set_data_blkaddr(struct dnode_of_data *dn, block_t new_addr)
 
 	/* Get physical address of data block */
 	addr_array = blkaddr_in_node(rn);
-	addr_array[ofs_in_node] = cpu_to_le32(new_addr);
+	addr_array[ofs_in_node] = cpu_to_le32(dn->data_blkaddr);
 	set_page_dirty(node_page);
 }
 
@@ -224,8 +225,8 @@ int reserve_new_block(struct dnode_of_data *dn)
 
 	trace_f2fs_reserve_new_block(dn->inode, dn->nid, dn->ofs_in_node);
 
-	__set_data_blkaddr(dn, NEW_ADDR);
 	dn->data_blkaddr = NEW_ADDR;
+	__set_data_blkaddr(dn);
 	mark_inode_dirty(dn->inode);
 	sync_inode_page(dn);
 	return 0;
@@ -273,7 +274,7 @@ static int check_extent_cache(struct inode *inode, pgoff_t pgofs,
 		unsigned int blkbits = inode->i_sb->s_blocksize_bits;
 		size_t count;
 
-		clear_buffer_new(bh_result);
+		set_buffer_new(bh_result);
 		map_bh(bh_result, inode->i_sb,
 				start_blkaddr + pgofs - start_fofs);
 		count = end_fofs - pgofs + 1;
@@ -290,23 +291,24 @@ static int check_extent_cache(struct inode *inode, pgoff_t pgofs,
 	return 0;
 }
 
-void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn)
+void update_extent_cache(struct dnode_of_data *dn)
 {
 	struct f2fs_inode_info *fi = F2FS_I(dn->inode);
 	pgoff_t fofs, start_fofs, end_fofs;
 	block_t start_blkaddr, end_blkaddr;
 	int need_update = true;
 
-	f2fs_bug_on(F2FS_I_SB(dn->inode), blk_addr == NEW_ADDR);
-	fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) +
-							dn->ofs_in_node;
+	f2fs_bug_on(F2FS_I_SB(dn->inode), dn->data_blkaddr == NEW_ADDR);
 
 	/* Update the page address in the parent node */
-	__set_data_blkaddr(dn, blk_addr);
+	__set_data_blkaddr(dn);
 
 	if (is_inode_flag_set(fi, FI_NO_EXTENT))
 		return;
 
+	fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) +
+							dn->ofs_in_node;
+
 	write_lock(&fi->ext.ext_lock);
 
 	start_fofs = fi->ext.fofs;
@@ -320,16 +322,16 @@ void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn)
 
 	/* Initial extent */
 	if (fi->ext.len == 0) {
-		if (blk_addr != NULL_ADDR) {
+		if (dn->data_blkaddr != NULL_ADDR) {
 			fi->ext.fofs = fofs;
-			fi->ext.blk_addr = blk_addr;
+			fi->ext.blk_addr = dn->data_blkaddr;
 			fi->ext.len = 1;
 		}
 		goto end_update;
 	}
 
 	/* Front merge */
-	if (fofs == start_fofs - 1 && blk_addr == start_blkaddr - 1) {
+	if (fofs == start_fofs - 1 && dn->data_blkaddr == start_blkaddr - 1) {
 		fi->ext.fofs--;
 		fi->ext.blk_addr--;
 		fi->ext.len++;
@@ -337,7 +339,7 @@ void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn)
 	}
 
 	/* Back merge */
-	if (fofs == end_fofs + 1 && blk_addr == end_blkaddr + 1) {
+	if (fofs == end_fofs + 1 && dn->data_blkaddr == end_blkaddr + 1) {
 		fi->ext.len++;
 		goto end_update;
 	}
@@ -376,6 +378,10 @@ struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync)
 	struct dnode_of_data dn;
 	struct page *page;
 	int err;
+	struct f2fs_io_info fio = {
+		.type = DATA,
+		.rw = sync ? READ_SYNC : READA,
+	};
 
 	page = find_get_page(mapping, index);
 	if (page && PageUptodate(page))
@@ -404,8 +410,8 @@ struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync)
 		return page;
 	}
 
-	err = f2fs_submit_page_bio(F2FS_I_SB(inode), page, dn.data_blkaddr,
-					sync ? READ_SYNC : READA);
+	fio.blk_addr = dn.data_blkaddr;
+	err = f2fs_submit_page_bio(F2FS_I_SB(inode), page, &fio);
 	if (err)
 		return ERR_PTR(err);
 
@@ -430,7 +436,10 @@ struct page *get_lock_data_page(struct inode *inode, pgoff_t index)
 	struct dnode_of_data dn;
 	struct page *page;
 	int err;
-
+	struct f2fs_io_info fio = {
+		.type = DATA,
+		.rw = READ_SYNC,
+	};
repeat:
 	page = grab_cache_page(mapping, index);
 	if (!page)
@@ -464,8 +473,8 @@ repeat:
 		return page;
 	}
 
-	err = f2fs_submit_page_bio(F2FS_I_SB(inode), page,
-					dn.data_blkaddr, READ_SYNC);
+	fio.blk_addr = dn.data_blkaddr;
+	err = f2fs_submit_page_bio(F2FS_I_SB(inode), page, &fio);
 	if (err)
 		return ERR_PTR(err);
 
@@ -515,8 +524,12 @@ repeat:
 		zero_user_segment(page, 0, PAGE_CACHE_SIZE);
 		SetPageUptodate(page);
 	} else {
-		err = f2fs_submit_page_bio(F2FS_I_SB(inode), page,
-						dn.data_blkaddr, READ_SYNC);
+		struct f2fs_io_info fio = {
+			.type = DATA,
+			.rw = READ_SYNC,
+			.blk_addr = dn.data_blkaddr,
+		};
+		err = f2fs_submit_page_bio(F2FS_I_SB(inode), page, &fio);
 		if (err)
 			goto put_err;
 
@@ -550,30 +563,25 @@ static int __allocate_data_block(struct dnode_of_data *dn)
 	struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
 	struct f2fs_inode_info *fi = F2FS_I(dn->inode);
 	struct f2fs_summary sum;
-	block_t new_blkaddr;
 	struct node_info ni;
+	int seg = CURSEG_WARM_DATA;
 	pgoff_t fofs;
-	int type;
 
 	if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC)))
 		return -EPERM;
 	if (unlikely(!inc_valid_block_count(sbi, dn->inode, 1)))
 		return -ENOSPC;
 
-	__set_data_blkaddr(dn, NEW_ADDR);
-	dn->data_blkaddr = NEW_ADDR;
-
 	get_node_info(sbi, dn->nid, &ni);
 	set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version);
 
-	type = CURSEG_WARM_DATA;
+	if (dn->ofs_in_node == 0 && dn->inode_page == dn->node_page)
+		seg = CURSEG_DIRECT_IO;
 
-	allocate_data_block(sbi, NULL, NULL_ADDR, &new_blkaddr, &sum, type);
+	allocate_data_block(sbi, NULL, NULL_ADDR, &dn->data_blkaddr, &sum, seg);
 
-	/* direct IO doesn't use extent cache to maximize the performance */
-	set_inode_flag(F2FS_I(dn->inode), FI_NO_EXTENT);
-	update_extent_cache(new_blkaddr, dn);
-	clear_inode_flag(F2FS_I(dn->inode), FI_NO_EXTENT);
+	__set_data_blkaddr(dn);
 
 	/* update i_size */
 	fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) +
@@ -581,10 +589,59 @@ static int __allocate_data_block(struct dnode_of_data *dn)
 	if (i_size_read(dn->inode) < ((fofs + 1) << PAGE_CACHE_SHIFT))
 		i_size_write(dn->inode, ((fofs + 1) << PAGE_CACHE_SHIFT));
 
-	dn->data_blkaddr = new_blkaddr;
 	return 0;
 }
 
+static void __allocate_data_blocks(struct inode *inode, loff_t offset,
+							size_t count)
+{
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	struct dnode_of_data dn;
+	u64 start = F2FS_BYTES_TO_BLK(offset);
+	u64 len = F2FS_BYTES_TO_BLK(count);
+	bool allocated;
+	u64 end_offset;
+
+	while (len) {
+		f2fs_balance_fs(sbi);
+		f2fs_lock_op(sbi);
+
+		/* When reading holes, we need its node page */
+		set_new_dnode(&dn, inode, NULL, NULL, 0);
+		if (get_dnode_of_data(&dn, start, ALLOC_NODE))
+			goto out;
+
+		allocated = false;
+		end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode));
+
+		while (dn.ofs_in_node < end_offset && len) {
+			if (dn.data_blkaddr == NULL_ADDR) {
+				if (__allocate_data_block(&dn))
+					goto sync_out;
+				allocated = true;
+			}
+			len--;
+			start++;
+			dn.ofs_in_node++;
+		}
+
+		if (allocated)
+			sync_inode_page(&dn);
+
+		f2fs_put_dnode(&dn);
+		f2fs_unlock_op(sbi);
+	}
+	return;
+
+sync_out:
+	if (allocated)
+		sync_inode_page(&dn);
+	f2fs_put_dnode(&dn);
+out:
+	f2fs_unlock_op(sbi);
+	return;
+}
+
 /*
  * get_data_block() now supported readahead/bmap/rw direct_IO with mapped bh.
  * If original data blocks are allocated, then give them to blockdev.
@@ -610,10 +667,8 @@ static int __get_data_block(struct inode *inode, sector_t iblock,
 	if (check_extent_cache(inode, pgofs, bh_result))
 		goto out;
 
-	if (create) {
-		f2fs_balance_fs(F2FS_I_SB(inode));
+	if (create)
 		f2fs_lock_op(F2FS_I_SB(inode));
-	}
 
 	/* When reading holes, we need its node page */
 	set_new_dnode(&dn, inode, NULL, NULL, 0);
@@ -627,12 +682,14 @@ static int __get_data_block(struct inode *inode, sector_t iblock,
 		goto put_out;
 
 	if (dn.data_blkaddr != NULL_ADDR) {
+		set_buffer_new(bh_result);
 		map_bh(bh_result, inode->i_sb, dn.data_blkaddr);
 	} else if (create) {
 		err = __allocate_data_block(&dn);
 		if (err)
 			goto put_out;
 		allocated = true;
+		set_buffer_new(bh_result);
 		map_bh(bh_result, inode->i_sb, dn.data_blkaddr);
 	} else {
 		goto put_out;
@@ -745,7 +802,6 @@ static int f2fs_read_data_pages(struct file *file,
 int do_write_data_page(struct page *page, struct f2fs_io_info *fio)
 {
 	struct inode *inode = page->mapping->host;
-	block_t old_blkaddr, new_blkaddr;
 	struct dnode_of_data dn;
 	int err = 0;
 
@@ -754,10 +810,10 @@ int do_write_data_page(struct page *page, struct f2fs_io_info *fio)
 	if (err)
 		return err;
 
-	old_blkaddr = dn.data_blkaddr;
+	fio->blk_addr = dn.data_blkaddr;
 
 	/* This page is already truncated */
-	if (old_blkaddr == NULL_ADDR)
+	if (fio->blk_addr == NULL_ADDR)
 		goto out_writepage;
 
 	set_page_writeback(page);
@@ -766,14 +822,14 @@ int do_write_data_page(struct page *page, struct f2fs_io_info *fio)
 	 * If current allocation needs SSR,
 	 * it had better in-place writes for updated data.
	 */
-	if (unlikely(old_blkaddr != NEW_ADDR &&
+	if (unlikely(fio->blk_addr != NEW_ADDR &&
 			!is_cold_data(page) &&
 			need_inplace_update(inode))) {
-		rewrite_data_page(page, old_blkaddr, fio);
+		rewrite_data_page(page, fio);
 		set_inode_flag(F2FS_I(inode), FI_UPDATE_WRITE);
 	} else {
-		write_data_page(page, &dn, &new_blkaddr, fio);
-		update_extent_cache(new_blkaddr, &dn);
+		write_data_page(page, &dn, fio);
+		update_extent_cache(&dn);
 		set_inode_flag(F2FS_I(inode), FI_APPEND_WRITE);
 	}
out_writepage:
@@ -812,7 +868,12 @@ static int f2fs_write_data_page(struct page *page,
 
 	zero_user_segment(page, offset, PAGE_CACHE_SIZE);
write:
-	if (unlikely(sbi->por_doing))
+	if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
+		goto redirty_out;
+	if (f2fs_is_drop_cache(inode))
+		goto out;
+	if (f2fs_is_volatile_file(inode) && !wbc->for_reclaim &&
+			available_free_memory(sbi, BASE_CHECK))
 		goto redirty_out;
 
 	/* Dentry blocks are controlled by checkpoint */
@@ -826,7 +887,6 @@ write:
 	/* we should bypass data pages to proceed the kworkder jobs */
 	if (unlikely(f2fs_cp_error(sbi))) {
 		SetPageError(page);
-		unlock_page(page);
 		goto out;
 	}
 
@@ -1002,8 +1062,12 @@ put_next:
 	if (dn.data_blkaddr == NEW_ADDR) {
 		zero_user_segment(page, 0, PAGE_CACHE_SIZE);
 	} else {
-		err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr,
-					READ_SYNC);
+		struct f2fs_io_info fio = {
+			.type = DATA,
+			.rw = READ_SYNC,
+			.blk_addr = dn.data_blkaddr,
+		};
+		err = f2fs_submit_page_bio(sbi, page, &fio);
 		if (err)
 			goto fail;
 
@@ -1092,6 +1156,9 @@ static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb,
 
 	trace_f2fs_direct_IO_enter(inode, offset, count, rw);
 
+	if (rw & WRITE)
+		__allocate_data_blocks(inode, offset, count);
+
 	err = blockdev_direct_IO(rw, iocb, inode, iter, offset, get_data_block);
 	if (err < 0 && (rw & WRITE))
 		f2fs_write_failed(mapping, offset + count);
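
Note on the direct-IO hunk above: for writes, every target block is now
reserved in one locked pass (__allocate_data_blocks()) before the request is
handed to blockdev_direct_IO(), so the per-block get_data_block() callback
normally only maps already-allocated addresses; this is also why
f2fs_balance_fs() moved out of __get_data_block(). The resulting call order,
as the hunk shows:

	if (rw & WRITE)
		__allocate_data_blocks(inode, offset, count); /* reserve up front */
	err = blockdev_direct_IO(rw, iocb, inode, iter, offset,
					get_data_block);
	if (err < 0 && (rw & WRITE))
		f2fs_write_failed(mapping, offset + count); /* trim on failure */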
@@ -1101,24 +1168,33 @@ static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb,
 	return err;
 }
 
-static void f2fs_invalidate_data_page(struct page *page, unsigned int offset,
-				      unsigned int length)
+void f2fs_invalidate_page(struct page *page, unsigned int offset,
+							unsigned int length)
 {
 	struct inode *inode = page->mapping->host;
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 
-	if (offset % PAGE_CACHE_SIZE || length != PAGE_CACHE_SIZE)
+	if (inode->i_ino >= F2FS_ROOT_INO(sbi) &&
+		(offset % PAGE_CACHE_SIZE || length != PAGE_CACHE_SIZE))
 		return;
 
-	if (f2fs_is_atomic_file(inode) || f2fs_is_volatile_file(inode))
-		invalidate_inmem_page(inode, page);
-
-	if (PageDirty(page))
-		inode_dec_dirty_pages(inode);
+	if (PageDirty(page)) {
+		if (inode->i_ino == F2FS_META_INO(sbi))
+			dec_page_count(sbi, F2FS_DIRTY_META);
+		else if (inode->i_ino == F2FS_NODE_INO(sbi))
+			dec_page_count(sbi, F2FS_DIRTY_NODES);
+		else
+			inode_dec_dirty_pages(inode);
+	}
 	ClearPagePrivate(page);
 }
 
-static int f2fs_release_data_page(struct page *page, gfp_t wait)
+int f2fs_release_page(struct page *page, gfp_t wait)
 {
+	/* If this is dirty page, keep PagePrivate */
+	if (PageDirty(page))
+		return 0;
+
 	ClearPagePrivate(page);
 	return 1;
 }
@@ -1132,7 +1208,7 @@ static int f2fs_set_data_page_dirty(struct page *page)
 
 	SetPageUptodate(page);
 
-	if (f2fs_is_atomic_file(inode) || f2fs_is_volatile_file(inode)) {
+	if (f2fs_is_atomic_file(inode)) {
 		register_inmem_page(inode, page);
 		return 1;
 	}
@@ -1168,8 +1244,8 @@ const struct address_space_operations f2fs_dblock_aops = {
 	.write_begin	= f2fs_write_begin,
 	.write_end	= f2fs_write_end,
 	.set_page_dirty	= f2fs_set_data_page_dirty,
-	.invalidatepage	= f2fs_invalidate_data_page,
-	.releasepage	= f2fs_release_data_page,
+	.invalidatepage	= f2fs_invalidate_page,
+	.releasepage	= f2fs_release_page,
 	.direct_IO	= f2fs_direct_IO,
 	.bmap		= f2fs_bmap,
 };
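
Note on the shared page hooks above: the meta, node and data mappings now use
the same f2fs_invalidate_page()/f2fs_release_page() pair, and every dirty f2fs
page carries PagePrivate (set in the set_page_dirty handlers). A sketch of
the invariant these hunks appear to maintain (my reading, stated as comments):

	/* Invariant (sketch):
	 *   set_page_dirty -> inc_page_count(...) + SetPagePrivate(page)
	 *   invalidatepage -> dec_page_count(...) + ClearPagePrivate(page)
	 *   releasepage    -> refuse dirty pages, else ClearPagePrivate
	 * so the Private bit tracks "this page is counted as dirty". */
	int f2fs_release_page(struct page *page, gfp_t wait)
	{
		if (PageDirty(page))
			return 0;	/* VM must not reclaim it yet */
		ClearPagePrivate(page);
		return 1;
	}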
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index 91e8f699ab30..e671373cc8ab 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -40,6 +40,7 @@ static void update_general_status(struct f2fs_sb_info *sbi)
 	si->ndirty_dirs = sbi->n_dirty_dirs;
 	si->ndirty_meta = get_pages(sbi, F2FS_DIRTY_META);
 	si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES);
+	si->wb_pages = get_pages(sbi, F2FS_WRITEBACK);
 	si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg;
 	si->rsvd_segs = reserved_segments(sbi);
 	si->overp_segs = overprovision_segments(sbi);
@@ -57,7 +58,9 @@ static void update_general_status(struct f2fs_sb_info *sbi)
 	si->node_pages = NODE_MAPPING(sbi)->nrpages;
 	si->meta_pages = META_MAPPING(sbi)->nrpages;
 	si->nats = NM_I(sbi)->nat_cnt;
-	si->sits = SIT_I(sbi)->dirty_sentries;
+	si->dirty_nats = NM_I(sbi)->dirty_nat_cnt;
+	si->sits = MAIN_SEGS(sbi);
+	si->dirty_sits = SIT_I(sbi)->dirty_sentries;
 	si->fnids = NM_I(sbi)->fcnt;
 	si->bg_gc = sbi->bg_gc;
 	si->util_free = (int)(free_user_blocks(sbi) >> sbi->log_blocks_per_seg)
@@ -79,6 +82,8 @@ static void update_general_status(struct f2fs_sb_info *sbi)
 		si->segment_count[i] = sbi->segment_count[i];
 		si->block_count[i] = sbi->block_count[i];
 	}
+
+	si->inplace_count = atomic_read(&sbi->inplace_count);
 }
 
 /*
@@ -137,6 +142,7 @@ static void update_mem_info(struct f2fs_sb_info *sbi)
 	si->base_mem += MAIN_SEGS(sbi) * sizeof(struct seg_entry);
 	si->base_mem += f2fs_bitmap_size(MAIN_SEGS(sbi));
 	si->base_mem += 2 * SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi);
+	si->base_mem += SIT_VBLOCK_MAP_SIZE;
 	if (sbi->segs_per_sec > 1)
 		si->base_mem += MAIN_SECS(sbi) * sizeof(struct sec_entry);
 	si->base_mem += __bitmap_size(sbi, SIT_BITMAP);
@@ -159,20 +165,32 @@ static void update_mem_info(struct f2fs_sb_info *sbi)
 	si->base_mem += sizeof(struct f2fs_nm_info);
 	si->base_mem += __bitmap_size(sbi, NAT_BITMAP);
 
+get_cache:
+	si->cache_mem = 0;
+
 	/* build gc */
-	si->base_mem += sizeof(struct f2fs_gc_kthread);
+	if (sbi->gc_thread)
+		si->cache_mem += sizeof(struct f2fs_gc_kthread);
+
+	/* build merge flush thread */
+	if (SM_I(sbi)->cmd_control_info)
+		si->cache_mem += sizeof(struct flush_cmd_control);
 
-get_cache:
 	/* free nids */
-	si->cache_mem = NM_I(sbi)->fcnt;
-	si->cache_mem += NM_I(sbi)->nat_cnt;
-	npages = NODE_MAPPING(sbi)->nrpages;
-	si->cache_mem += npages << PAGE_CACHE_SHIFT;
-	npages = META_MAPPING(sbi)->nrpages;
-	si->cache_mem += npages << PAGE_CACHE_SHIFT;
-	si->cache_mem += sbi->n_dirty_dirs * sizeof(struct dir_inode_entry);
+	si->cache_mem += NM_I(sbi)->fcnt * sizeof(struct free_nid);
+	si->cache_mem += NM_I(sbi)->nat_cnt * sizeof(struct nat_entry);
+	si->cache_mem += NM_I(sbi)->dirty_nat_cnt *
+					sizeof(struct nat_entry_set);
+	si->cache_mem += si->inmem_pages * sizeof(struct inmem_pages);
+	si->cache_mem += sbi->n_dirty_dirs * sizeof(struct inode_entry);
 	for (i = 0; i <= UPDATE_INO; i++)
 		si->cache_mem += sbi->im[i].ino_num * sizeof(struct ino_entry);
+
+	si->page_mem = 0;
+	npages = NODE_MAPPING(sbi)->nrpages;
+	si->page_mem += npages << PAGE_CACHE_SHIFT;
+	npages = META_MAPPING(sbi)->nrpages;
+	si->page_mem += npages << PAGE_CACHE_SHIFT;
 }
 
 static int stat_show(struct seq_file *s, void *v)
@@ -250,16 +268,16 @@ static int stat_show(struct seq_file *s, void *v)
 	seq_printf(s, "\nExtent Hit Ratio: %d / %d\n",
 			si->hit_ext, si->total_ext);
 	seq_puts(s, "\nBalancing F2FS Async:\n");
-	seq_printf(s, "  - inmem: %4d\n",
-		   si->inmem_pages);
+	seq_printf(s, "  - inmem: %4d, wb: %4d\n",
+		   si->inmem_pages, si->wb_pages);
 	seq_printf(s, "  - nodes: %4d in %4d\n",
 		   si->ndirty_node, si->node_pages);
 	seq_printf(s, "  - dents: %4d in dirs:%4d\n",
 		   si->ndirty_dent, si->ndirty_dirs);
 	seq_printf(s, "  - meta: %4d in %4d\n",
 		   si->ndirty_meta, si->meta_pages);
-	seq_printf(s, "  - NATs: %9d\n  - SITs: %9d\n",
-		   si->nats, si->sits);
+	seq_printf(s, "  - NATs: %9d/%9d\n  - SITs: %9d/%9d\n",
+		   si->dirty_nats, si->nats, si->dirty_sits, si->sits);
 	seq_printf(s, "  - free_nids: %9d\n",
 		   si->fnids);
 	seq_puts(s, "\nDistribution of User Blocks:");
@@ -277,6 +295,7 @@ static int stat_show(struct seq_file *s, void *v)
 		for (j = 0; j < si->util_free; j++)
 			seq_putc(s, '-');
 		seq_puts(s, "]\n\n");
+		seq_printf(s, "IPU: %u blocks\n", si->inplace_count);
 		seq_printf(s, "SSR: %u blocks in %u segments\n",
 			   si->block_count[SSR], si->segment_count[SSR]);
 		seq_printf(s, "LFS: %u blocks in %u segments\n",
@@ -289,9 +308,14 @@ static int stat_show(struct seq_file *s, void *v)
 
 		/* memory footprint */
 		update_mem_info(si->sbi);
-		seq_printf(s, "\nMemory: %u KB = static: %u + cached: %u\n",
-				(si->base_mem + si->cache_mem) >> 10,
-				si->base_mem >> 10, si->cache_mem >> 10);
+		seq_printf(s, "\nMemory: %u KB\n",
+			(si->base_mem + si->cache_mem + si->page_mem) >> 10);
+		seq_printf(s, "  - static: %u KB\n",
+				si->base_mem >> 10);
+		seq_printf(s, "  - cached: %u KB\n",
+				si->cache_mem >> 10);
+		seq_printf(s, "  - paged : %u KB\n",
+				si->page_mem >> 10);
 	}
 	mutex_unlock(&f2fs_stat_mutex);
 	return 0;
@@ -331,6 +355,7 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi)
 
 	atomic_set(&sbi->inline_inode, 0);
 	atomic_set(&sbi->inline_dir, 0);
+	atomic_set(&sbi->inplace_count, 0);
 
 	mutex_lock(&f2fs_stat_mutex);
 	list_add_tail(&si->stat_list, &f2fs_stat_list);
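
Note on the debug.c changes above: the memory footprint is now reported in
three buckets (static structures, slab-cached objects, cached node/meta
pages). Going by the new format strings, the tail of the debugfs status
report should look roughly like the following; the figures are invented
purely for illustration:

	IPU: 128 blocks
	SSR: 4096 blocks in 16 segments
	LFS: 81920 blocks in 160 segments

	Memory: 1824 KB
	  - static: 1208 KB
	  - cached: 104 KB
	  - paged : 512 KB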
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index b1a7d5737cd0..b74097a7f6d9 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -286,8 +286,7 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de,
 	f2fs_wait_on_page_writeback(page, type);
 	de->ino = cpu_to_le32(inode->i_ino);
 	set_de_type(de, inode);
-	if (!f2fs_has_inline_dentry(dir))
-		kunmap(page);
+	f2fs_dentry_kunmap(dir, page);
 	set_page_dirty(page);
 	dir->i_mtime = dir->i_ctime = CURRENT_TIME;
 	mark_inode_dirty(dir);
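
Note on the dir.c hunk above: f2fs_dentry_kunmap() folds the open-coded
"kunmap unless the directory stores an inline dentry block" test into one
helper so all callers stay consistent. Its presumed shape, reconstructed from
the two removed lines (assumption: the actual definition lives in f2fs.h,
outside this diff):

	static inline void f2fs_dentry_kunmap(struct inode *dir,
						struct page *page)
	{
		if (!f2fs_has_inline_dentry(dir))
			kunmap(page);
	}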
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index ec58bb2373fc..7fa3313ab0e2 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -28,7 +28,7 @@
 	do {								\
 		if (unlikely(condition)) {				\
 			WARN_ON(1);					\
-			sbi->need_fsck = true;				\
+			set_sbi_flag(sbi, SBI_NEED_FSCK);		\
 		}							\
 	} while (0)
 #define f2fs_down_write(x, y)	down_write(x)
@@ -100,10 +100,15 @@ enum {
 
 enum {
 	CP_UMOUNT,
+	CP_FASTBOOT,
 	CP_SYNC,
 	CP_DISCARD,
 };
 
+#define DEF_BATCHED_TRIM_SECTIONS	32
+#define BATCHED_TRIM_SEGMENTS(sbi)	\
+		(SM_I(sbi)->trim_sections * (sbi)->segs_per_sec)
+
 struct cp_control {
 	int reason;
 	__u64 trim_start;
@@ -136,8 +141,14 @@ struct ino_entry {
 	nid_t ino;		/* inode number */
 };
 
-/* for the list of directory inodes */
-struct dir_inode_entry {
+/*
+ * for the list of directory inodes or gc inodes.
+ * NOTE: there are two slab users for this structure, if we add/modify/delete
+ * fields in structure for one of slab users, it may affect fields or size of
+ * other one, in this condition, it's better to split both of slab and related
+ * data structure.
+ */
+struct inode_entry {
 	struct list_head list;	/* list head */
 	struct inode *inode;	/* vfs inode pointer */
 };
@@ -196,11 +207,14 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size,
 */
 #define F2FS_IOC_GETFLAGS		FS_IOC_GETFLAGS
 #define F2FS_IOC_SETFLAGS		FS_IOC_SETFLAGS
+#define F2FS_IOC_GETVERSION		FS_IOC_GETVERSION
 
 #define F2FS_IOCTL_MAGIC		0xf5
 #define F2FS_IOC_START_ATOMIC_WRITE	_IO(F2FS_IOCTL_MAGIC, 1)
 #define F2FS_IOC_COMMIT_ATOMIC_WRITE	_IO(F2FS_IOCTL_MAGIC, 2)
 #define F2FS_IOC_START_VOLATILE_WRITE	_IO(F2FS_IOCTL_MAGIC, 3)
+#define F2FS_IOC_RELEASE_VOLATILE_WRITE	_IO(F2FS_IOCTL_MAGIC, 4)
+#define F2FS_IOC_ABORT_VOLATILE_WRITE	_IO(F2FS_IOCTL_MAGIC, 5)
 
 #if defined(__KERNEL__) && defined(CONFIG_COMPAT)
 /*
@@ -295,7 +309,7 @@ struct f2fs_inode_info {
 	nid_t i_xattr_nid;		/* node id that contains xattrs */
 	unsigned long long xattr_ver;	/* cp version of xattr modification */
 	struct extent_info ext;		/* in-memory extent cache entry */
-	struct dir_inode_entry *dirty_dir;	/* the pointer of dirty dir */
+	struct inode_entry *dirty_dir;	/* the pointer of dirty dir */
 
 	struct radix_tree_root inmem_root;	/* radix tree for inmem pages */
 	struct list_head inmem_pages;	/* inmemory pages managed by f2fs */
@@ -398,7 +412,8 @@ enum {
 	CURSEG_HOT_NODE,	/* direct node blocks of directory files */
 	CURSEG_WARM_NODE,	/* direct node blocks of normal files */
 	CURSEG_COLD_NODE,	/* indirect node blocks */
-	NO_CHECK_TYPE
+	NO_CHECK_TYPE,
+	CURSEG_DIRECT_IO,	/* to use for the direct IO path */
 };
 
 struct flush_cmd {
@@ -437,6 +452,9 @@ struct f2fs_sm_info {
 	int nr_discards;			/* # of discards in the list */
 	int max_discards;			/* max. discards to be issued */
 
+	/* for batched trimming */
+	unsigned int trim_sections;		/* # of sections to trim */
+
 	struct list_head sit_entry_set;	/* sit entry set list */
 
 	unsigned int ipu_policy;	/* in-place-update policy */
@@ -489,6 +507,7 @@ enum page_type {
 struct f2fs_io_info {
 	enum page_type type;	/* contains DATA/NODE/META/META_FLUSH */
 	int rw;			/* contains R/RS/W/WS with REQ_META/REQ_PRIO */
+	block_t blk_addr;	/* block address to be written */
 };
 
 #define is_read_io(rw)	(((rw) & 1) == READ)
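
Note on struct f2fs_io_info above: the new blk_addr member is what lets the
submit paths earlier in this diff take a single descriptor instead of loose
(blk_addr, rw) parameter pairs. Typical on-stack construction, as in the
get_meta_page() hunk from checkpoint.c:

	struct f2fs_io_info fio = {
		.type = META,				/* target mapping */
		.rw = READ_SYNC | REQ_META | REQ_PRIO,	/* request flags */
		.blk_addr = index,			/* block to read */
	};
	err = f2fs_submit_page_bio(sbi, page, &fio);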
@@ -508,13 +527,20 @@ struct inode_management {
 	unsigned long ino_num;			/* number of entries */
 };
 
+/* For s_flag in struct f2fs_sb_info */
+enum {
+	SBI_IS_DIRTY,			/* dirty flag for checkpoint */
+	SBI_IS_CLOSE,			/* specify unmounting */
+	SBI_NEED_FSCK,			/* need fsck.f2fs to fix */
+	SBI_POR_DOING,			/* recovery is doing or not */
+};
+
 struct f2fs_sb_info {
 	struct super_block *sb;			/* pointer to VFS super block */
 	struct proc_dir_entry *s_proc;		/* proc entry */
 	struct buffer_head *raw_super_buf;	/* buffer head of raw sb */
 	struct f2fs_super_block *raw_super;	/* raw super block pointer */
-	int s_dirty;				/* dirty flag for checkpoint */
-	bool need_fsck;				/* need fsck.f2fs to fix */
+	int s_flag;				/* flags for sbi */
 
 	/* for node-related operations */
 	struct f2fs_nm_info *nm_info;		/* node manager */
@@ -534,7 +560,6 @@ struct f2fs_sb_info {
 	struct rw_semaphore cp_rwsem;		/* blocking FS operations */
 	struct rw_semaphore node_write;		/* locking node writes */
 	struct mutex writepages;		/* mutex for writepages() */
-	bool por_doing;				/* recovery is doing or not */
 	wait_queue_head_t cp_wait;
 
 	struct inode_management im[MAX_INO_ENTRY];	/* manage inode cache */
@@ -589,6 +614,7 @@ struct f2fs_sb_info {
 	struct f2fs_stat_info *stat_info;	/* FS status information */
 	unsigned int segment_count[2];		/* # of allocated segments */
 	unsigned int block_count[2];		/* # of allocated blocks */
+	atomic_t inplace_count;		/* # of inplace update */
 	int total_hit_ext, read_hit_ext;	/* extent cache hit ratio */
 	atomic_t inline_inode;			/* # of inline_data inodes */
 	atomic_t inline_dir;			/* # of inline_dentry inodes */
@@ -686,14 +712,19 @@ static inline struct address_space *NODE_MAPPING(struct f2fs_sb_info *sbi)
 	return sbi->node_inode->i_mapping;
 }
 
-static inline void F2FS_SET_SB_DIRT(struct f2fs_sb_info *sbi)
+static inline bool is_sbi_flag_set(struct f2fs_sb_info *sbi, unsigned int type)
 {
-	sbi->s_dirty = 1;
+	return sbi->s_flag & (0x01 << type);
 }
 
-static inline void F2FS_RESET_SB_DIRT(struct f2fs_sb_info *sbi)
+static inline void set_sbi_flag(struct f2fs_sb_info *sbi, unsigned int type)
 {
-	sbi->s_dirty = 0;
+	sbi->s_flag |= (0x01 << type);
+}
+
+static inline void clear_sbi_flag(struct f2fs_sb_info *sbi, unsigned int type)
+{
+	sbi->s_flag &= ~(0x01 << type);
 }
 
 static inline unsigned long long cur_cp_version(struct f2fs_checkpoint *cp)
@@ -741,6 +772,28 @@ static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi)
 	up_write(&sbi->cp_rwsem);
 }
 
+static inline int __get_cp_reason(struct f2fs_sb_info *sbi)
+{
+	int reason = CP_SYNC;
+
+	if (test_opt(sbi, FASTBOOT))
+		reason = CP_FASTBOOT;
+	if (is_sbi_flag_set(sbi, SBI_IS_CLOSE))
+		reason = CP_UMOUNT;
+	return reason;
+}
+
+static inline bool __remain_node_summaries(int reason)
+{
+	return (reason == CP_UMOUNT || reason == CP_FASTBOOT);
+}
+
+static inline bool __exist_node_summaries(struct f2fs_sb_info *sbi)
+{
+	return (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_UMOUNT_FLAG) ||
+			is_set_ckpt_flags(F2FS_CKPT(sbi), CP_FASTBOOT_FLAG));
+}
+
 /*
 * Check whether the given nid is within node id range.
 */
@@ -805,7 +858,7 @@ static inline void dec_valid_block_count(struct f2fs_sb_info *sbi,
 static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type)
 {
 	atomic_inc(&sbi->nr_pages[count_type]);
-	F2FS_SET_SB_DIRT(sbi);
+	set_sbi_flag(sbi, SBI_IS_DIRTY);
 }
 
 static inline void inode_inc_dirty_pages(struct inode *inode)
@@ -1113,6 +1166,7 @@ enum {
 	FI_NEED_IPU,		/* used for ipu per file */
 	FI_ATOMIC_FILE,		/* indicate atomic file */
 	FI_VOLATILE_FILE,	/* indicate volatile file */
+	FI_DROP_CACHE,		/* drop dirty page cache */
 	FI_DATA_EXIST,		/* indicate data exists */
 };
 
@@ -1220,6 +1274,11 @@ static inline bool f2fs_is_volatile_file(struct inode *inode)
 	return is_inode_flag_set(F2FS_I(inode), FI_VOLATILE_FILE);
 }
 
+static inline bool f2fs_is_drop_cache(struct inode *inode)
+{
+	return is_inode_flag_set(F2FS_I(inode), FI_DROP_CACHE);
+}
+
 static inline void *inline_data_addr(struct page *page)
 {
 	struct f2fs_inode *ri = F2FS_INODE(page);
@@ -1389,7 +1448,6 @@ void destroy_node_manager_caches(void);
 * segment.c
 */
 void register_inmem_page(struct inode *, struct page *);
-void invalidate_inmem_page(struct inode *, struct page *);
 void commit_inmem_pages(struct inode *, bool);
 void f2fs_balance_fs(struct f2fs_sb_info *);
 void f2fs_balance_fs_bg(struct f2fs_sb_info *);
@@ -1401,16 +1459,16 @@ void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t);
 void clear_prefree_segments(struct f2fs_sb_info *);
 void release_discard_addrs(struct f2fs_sb_info *);
 void discard_next_dnode(struct f2fs_sb_info *, block_t);
-int npages_for_summary_flush(struct f2fs_sb_info *);
+int npages_for_summary_flush(struct f2fs_sb_info *, bool);
 void allocate_new_segments(struct f2fs_sb_info *);
 int f2fs_trim_fs(struct f2fs_sb_info *, struct fstrim_range *);
 struct page *get_sum_page(struct f2fs_sb_info *, unsigned int);
 void write_meta_page(struct f2fs_sb_info *, struct page *);
 void write_node_page(struct f2fs_sb_info *, struct page *,
-		struct f2fs_io_info *, unsigned int, block_t, block_t *);
-void write_data_page(struct page *, struct dnode_of_data *, block_t *,
+		unsigned int, struct f2fs_io_info *);
+void write_data_page(struct page *, struct dnode_of_data *,
 		struct f2fs_io_info *);
-void rewrite_data_page(struct page *, block_t, struct f2fs_io_info *);
+void rewrite_data_page(struct page *, struct f2fs_io_info *);
 void recover_data_page(struct f2fs_sb_info *, struct page *,
 			struct f2fs_summary *, block_t, block_t);
 void allocate_data_block(struct f2fs_sb_info *, struct page *,
@@ -1457,17 +1515,20 @@ void destroy_checkpoint_caches(void);
 * data.c
 */
 void f2fs_submit_merged_bio(struct f2fs_sb_info *, enum page_type, int);
-int f2fs_submit_page_bio(struct f2fs_sb_info *, struct page *, block_t, int);
-void f2fs_submit_page_mbio(struct f2fs_sb_info *, struct page *, block_t,
+int f2fs_submit_page_bio(struct f2fs_sb_info *, struct page *,
+			struct f2fs_io_info *);
+void f2fs_submit_page_mbio(struct f2fs_sb_info *, struct page *,
 			struct f2fs_io_info *);
 int reserve_new_block(struct dnode_of_data *);
 int f2fs_reserve_block(struct dnode_of_data *, pgoff_t);
-void update_extent_cache(block_t, struct dnode_of_data *);
+void update_extent_cache(struct dnode_of_data *);
 struct page *find_data_page(struct inode *, pgoff_t, bool);
 struct page *get_lock_data_page(struct inode *, pgoff_t);
 struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool);
 int do_write_data_page(struct page *, struct f2fs_io_info *);
 int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *, u64, u64);
1530void f2fs_invalidate_page(struct page *, unsigned int, unsigned int);
1531int f2fs_release_page(struct page *, gfp_t);
1471 1532
1472/* 1533/*
1473 * gc.c 1534 * gc.c
@@ -1477,8 +1538,6 @@ void stop_gc_thread(struct f2fs_sb_info *);
1477block_t start_bidx_of_node(unsigned int, struct f2fs_inode_info *); 1538block_t start_bidx_of_node(unsigned int, struct f2fs_inode_info *);
1478int f2fs_gc(struct f2fs_sb_info *); 1539int f2fs_gc(struct f2fs_sb_info *);
1479void build_gc_manager(struct f2fs_sb_info *); 1540void build_gc_manager(struct f2fs_sb_info *);
1480int __init create_gc_caches(void);
1481void destroy_gc_caches(void);
1482 1541
1483/* 1542/*
1484 * recovery.c 1543 * recovery.c
@@ -1497,9 +1556,9 @@ struct f2fs_stat_info {
1497 int main_area_segs, main_area_sections, main_area_zones; 1556 int main_area_segs, main_area_sections, main_area_zones;
1498 int hit_ext, total_ext; 1557 int hit_ext, total_ext;
1499 int ndirty_node, ndirty_dent, ndirty_dirs, ndirty_meta; 1558 int ndirty_node, ndirty_dent, ndirty_dirs, ndirty_meta;
1500 int nats, sits, fnids; 1559 int nats, dirty_nats, sits, dirty_sits, fnids;
1501 int total_count, utilization; 1560 int total_count, utilization;
1502 int bg_gc, inline_inode, inline_dir, inmem_pages; 1561 int bg_gc, inline_inode, inline_dir, inmem_pages, wb_pages;
1503 unsigned int valid_count, valid_node_count, valid_inode_count; 1562 unsigned int valid_count, valid_node_count, valid_inode_count;
1504 unsigned int bimodal, avg_vblocks; 1563 unsigned int bimodal, avg_vblocks;
1505 int util_free, util_valid, util_invalid; 1564 int util_free, util_valid, util_invalid;
@@ -1514,7 +1573,8 @@ struct f2fs_stat_info {
1514 1573
1515 unsigned int segment_count[2]; 1574 unsigned int segment_count[2];
1516 unsigned int block_count[2]; 1575 unsigned int block_count[2];
1517 unsigned base_mem, cache_mem; 1576 unsigned int inplace_count;
1577 unsigned base_mem, cache_mem, page_mem;
1518}; 1578};
1519 1579
1520static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) 1580static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
@@ -1553,7 +1613,8 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
1553 ((sbi)->segment_count[(curseg)->alloc_type]++) 1613 ((sbi)->segment_count[(curseg)->alloc_type]++)
1554#define stat_inc_block_count(sbi, curseg) \ 1614#define stat_inc_block_count(sbi, curseg) \
1555 ((sbi)->block_count[(curseg)->alloc_type]++) 1615 ((sbi)->block_count[(curseg)->alloc_type]++)
1556 1616#define stat_inc_inplace_blocks(sbi) \
1617 (atomic_inc(&(sbi)->inplace_count))
1557#define stat_inc_seg_count(sbi, type) \ 1618#define stat_inc_seg_count(sbi, type) \
1558 do { \ 1619 do { \
1559 struct f2fs_stat_info *si = F2FS_STAT(sbi); \ 1620 struct f2fs_stat_info *si = F2FS_STAT(sbi); \
@@ -1599,6 +1660,7 @@ void f2fs_destroy_root_stats(void);
1599#define stat_dec_inline_dir(inode) 1660#define stat_dec_inline_dir(inode)
1600#define stat_inc_seg_type(sbi, curseg) 1661#define stat_inc_seg_type(sbi, curseg)
1601#define stat_inc_block_count(sbi, curseg) 1662#define stat_inc_block_count(sbi, curseg)
1663#define stat_inc_inplace_blocks(sbi)
1602#define stat_inc_seg_count(si, type) 1664#define stat_inc_seg_count(si, type)
1603#define stat_inc_tot_blk_count(si, blks) 1665#define stat_inc_tot_blk_count(si, blks)
1604#define stat_inc_data_blk_count(si, blks) 1666#define stat_inc_data_blk_count(si, blks)
@@ -1619,6 +1681,7 @@ extern const struct address_space_operations f2fs_meta_aops;
1619extern const struct inode_operations f2fs_dir_inode_operations; 1681extern const struct inode_operations f2fs_dir_inode_operations;
1620extern const struct inode_operations f2fs_symlink_inode_operations; 1682extern const struct inode_operations f2fs_symlink_inode_operations;
1621extern const struct inode_operations f2fs_special_inode_operations; 1683extern const struct inode_operations f2fs_special_inode_operations;
1684extern struct kmem_cache *inode_entry_slab;
1622 1685
1623/* 1686/*
1624 * inline.c 1687 * inline.c
@@ -1629,7 +1692,6 @@ int f2fs_read_inline_data(struct inode *, struct page *);
1629int f2fs_convert_inline_page(struct dnode_of_data *, struct page *); 1692int f2fs_convert_inline_page(struct dnode_of_data *, struct page *);
1630int f2fs_convert_inline_inode(struct inode *); 1693int f2fs_convert_inline_inode(struct inode *);
1631int f2fs_write_inline_data(struct inode *, struct page *); 1694int f2fs_write_inline_data(struct inode *, struct page *);
1632void truncate_inline_data(struct page *, u64);
1633bool recover_inline_data(struct inode *, struct page *); 1695bool recover_inline_data(struct inode *, struct page *);
1634struct f2fs_dir_entry *find_in_inline_dir(struct inode *, struct qstr *, 1696struct f2fs_dir_entry *find_in_inline_dir(struct inode *, struct qstr *,
1635 struct page **); 1697 struct page **);
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 3c27e0ecb3bc..98dac27bc3f7 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -26,6 +26,7 @@
26#include "segment.h" 26#include "segment.h"
27#include "xattr.h" 27#include "xattr.h"
28#include "acl.h" 28#include "acl.h"
29#include "trace.h"
29#include <trace/events/f2fs.h> 30#include <trace/events/f2fs.h>
30 31
31static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, 32static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
@@ -92,7 +93,6 @@ static const struct vm_operations_struct f2fs_file_vm_ops = {
92 .fault = filemap_fault, 93 .fault = filemap_fault,
93 .map_pages = filemap_map_pages, 94 .map_pages = filemap_map_pages,
94 .page_mkwrite = f2fs_vm_page_mkwrite, 95 .page_mkwrite = f2fs_vm_page_mkwrite,
95 .remap_pages = generic_file_remap_pages,
96}; 96};
97 97
98static int get_parent_ino(struct inode *inode, nid_t *pino) 98static int get_parent_ino(struct inode *inode, nid_t *pino)
@@ -246,6 +246,10 @@ go_write:
246sync_nodes: 246sync_nodes:
247 sync_node_pages(sbi, ino, &wbc); 247 sync_node_pages(sbi, ino, &wbc);
248 248
249 /* if cp_error is set, we should avoid an infinite loop */
250 if (unlikely(f2fs_cp_error(sbi)))
251 goto out;
252
249 if (need_inode_block_update(sbi, ino)) { 253 if (need_inode_block_update(sbi, ino)) {
250 mark_inode_dirty_sync(inode); 254 mark_inode_dirty_sync(inode);
251 f2fs_write_inode(inode, NULL); 255 f2fs_write_inode(inode, NULL);
@@ -265,6 +269,7 @@ flush_out:
265 ret = f2fs_issue_flush(sbi); 269 ret = f2fs_issue_flush(sbi);
266out: 270out:
267 trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret); 271 trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret);
272 f2fs_trace_ios(NULL, NULL, 1);
268 return ret; 273 return ret;
269} 274}
270 275
@@ -351,7 +356,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence)
351 /* find data/hole in dnode block */ 356 /* find data/hole in dnode block */
352 for (; dn.ofs_in_node < end_offset; 357 for (; dn.ofs_in_node < end_offset;
353 dn.ofs_in_node++, pgofs++, 358 dn.ofs_in_node++, pgofs++,
354 data_ofs = pgofs << PAGE_CACHE_SHIFT) { 359 data_ofs = (loff_t)pgofs << PAGE_CACHE_SHIFT) {
355 block_t blkaddr; 360 block_t blkaddr;
356 blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); 361 blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node);
357 362
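The added (loff_t) cast fixes a widening bug: on 32-bit builds pgoff_t is 32 bits wide, so the shift wraps before the 64-bit assignment to data_ofs and large file offsets come out wrong. A compilable demonstration of the failure mode (assuming the usual PAGE_SHIFT of 12):

#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12

int main(void)
{
    uint32_t pgofs = 0x00200000;    /* page index of an 8 GiB offset */

    /* the 32-bit shift wraps to zero before the widening assignment... */
    int64_t bad  = pgofs << PAGE_SHIFT;
    /* ...whereas widening first preserves the full byte offset */
    int64_t good = (int64_t)pgofs << PAGE_SHIFT;

    printf("bad=%lld good=%lld\n", (long long)bad, (long long)good);
    /* prints: bad=0 good=8589934592 */
    return 0;
}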
@@ -427,7 +432,8 @@ int truncate_data_blocks_range(struct dnode_of_data *dn, int count)
427 if (blkaddr == NULL_ADDR) 432 if (blkaddr == NULL_ADDR)
428 continue; 433 continue;
429 434
430 update_extent_cache(NULL_ADDR, dn); 435 dn->data_blkaddr = NULL_ADDR;
436 update_extent_cache(dn);
431 invalidate_blocks(sbi, blkaddr); 437 invalidate_blocks(sbi, blkaddr);
432 nr_free++; 438 nr_free++;
433 } 439 }
@@ -484,8 +490,7 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock)
484 490
485 trace_f2fs_truncate_blocks_enter(inode, from); 491 trace_f2fs_truncate_blocks_enter(inode, from);
486 492
487 free_from = (pgoff_t) 493 free_from = (pgoff_t)F2FS_BYTES_TO_BLK(from + blocksize - 1);
488 ((from + blocksize - 1) >> (sbi->log_blocksize));
489 494
490 if (lock) 495 if (lock)
491 f2fs_lock_op(sbi); 496 f2fs_lock_op(sbi);
@@ -836,6 +841,19 @@ static long f2fs_fallocate(struct file *file, int mode,
836 return ret; 841 return ret;
837} 842}
838 843
844static int f2fs_release_file(struct inode *inode, struct file *filp)
845{
846 /* any remaining atomic pages should be discarded */
847 if (f2fs_is_atomic_file(inode))
848 commit_inmem_pages(inode, true);
849 if (f2fs_is_volatile_file(inode)) {
850 set_inode_flag(F2FS_I(inode), FI_DROP_CACHE);
851 filemap_fdatawrite(inode->i_mapping);
852 clear_inode_flag(F2FS_I(inode), FI_DROP_CACHE);
853 }
854 return 0;
855}
856
839#define F2FS_REG_FLMASK (~(FS_DIRSYNC_FL | FS_TOPDIR_FL)) 857#define F2FS_REG_FLMASK (~(FS_DIRSYNC_FL | FS_TOPDIR_FL))
840#define F2FS_OTHER_FLMASK (FS_NODUMP_FL | FS_NOATIME_FL) 858#define F2FS_OTHER_FLMASK (FS_NODUMP_FL | FS_NOATIME_FL)
841 859
@@ -906,29 +924,30 @@ out:
906 return ret; 924 return ret;
907} 925}
908 926
927static int f2fs_ioc_getversion(struct file *filp, unsigned long arg)
928{
929 struct inode *inode = file_inode(filp);
930
931 return put_user(inode->i_generation, (int __user *)arg);
932}
933
909static int f2fs_ioc_start_atomic_write(struct file *filp) 934static int f2fs_ioc_start_atomic_write(struct file *filp)
910{ 935{
911 struct inode *inode = file_inode(filp); 936 struct inode *inode = file_inode(filp);
912 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
913 937
914 if (!inode_owner_or_capable(inode)) 938 if (!inode_owner_or_capable(inode))
915 return -EACCES; 939 return -EACCES;
916 940
917 f2fs_balance_fs(sbi); 941 f2fs_balance_fs(F2FS_I_SB(inode));
942
943 if (f2fs_is_atomic_file(inode))
944 return 0;
918 945
919 set_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); 946 set_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
920 947
921 return f2fs_convert_inline_inode(inode); 948 return f2fs_convert_inline_inode(inode);
922} 949}
923 950
924static int f2fs_release_file(struct inode *inode, struct file *filp)
925{
926 /* any remaining atomic pages should be discarded */
927 if (f2fs_is_atomic_file(inode) || f2fs_is_volatile_file(inode))
928 commit_inmem_pages(inode, true);
929 return 0;
930}
931
932static int f2fs_ioc_commit_atomic_write(struct file *filp) 951static int f2fs_ioc_commit_atomic_write(struct file *filp)
933{ 952{
934 struct inode *inode = file_inode(filp); 953 struct inode *inode = file_inode(filp);
@@ -949,6 +968,7 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp)
949 968
950 ret = f2fs_sync_file(filp, 0, LONG_MAX, 0); 969 ret = f2fs_sync_file(filp, 0, LONG_MAX, 0);
951 mnt_drop_write_file(filp); 970 mnt_drop_write_file(filp);
971 clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
952 return ret; 972 return ret;
953} 973}
954 974
@@ -959,11 +979,56 @@ static int f2fs_ioc_start_volatile_write(struct file *filp)
959 if (!inode_owner_or_capable(inode)) 979 if (!inode_owner_or_capable(inode))
960 return -EACCES; 980 return -EACCES;
961 981
982 if (f2fs_is_volatile_file(inode))
983 return 0;
984
962 set_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE); 985 set_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
963 986
964 return f2fs_convert_inline_inode(inode); 987 return f2fs_convert_inline_inode(inode);
965} 988}
966 989
990static int f2fs_ioc_release_volatile_write(struct file *filp)
991{
992 struct inode *inode = file_inode(filp);
993
994 if (!inode_owner_or_capable(inode))
995 return -EACCES;
996
997 if (!f2fs_is_volatile_file(inode))
998 return 0;
999
1000 punch_hole(inode, 0, F2FS_BLKSIZE);
1001 return 0;
1002}
1003
1004static int f2fs_ioc_abort_volatile_write(struct file *filp)
1005{
1006 struct inode *inode = file_inode(filp);
1007 int ret;
1008
1009 if (!inode_owner_or_capable(inode))
1010 return -EACCES;
1011
1012 ret = mnt_want_write_file(filp);
1013 if (ret)
1014 return ret;
1015
1016 f2fs_balance_fs(F2FS_I_SB(inode));
1017
1018 if (f2fs_is_atomic_file(inode)) {
1019 commit_inmem_pages(inode, false);
1020 clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
1021 }
1022
1023 if (f2fs_is_volatile_file(inode)) {
1024 clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
1025 filemap_fdatawrite(inode->i_mapping);
1026 set_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
1027 }
1028 mnt_drop_write_file(filp);
1029 return ret;
1030}
1031
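From userspace, the atomic-write session added here is driven entirely through ioctls on a file descriptor. A hedged usage sketch: the F2FS_IOC_* values below mirror the f2fs header of this era, but real code should include the kernel header rather than redefining them:

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>

/* assumed to match fs/f2fs/f2fs.h of this period; verify before use */
#define F2FS_IOCTL_MAGIC             0xf5
#define F2FS_IOC_START_ATOMIC_WRITE  _IO(F2FS_IOCTL_MAGIC, 1)
#define F2FS_IOC_COMMIT_ATOMIC_WRITE _IO(F2FS_IOCTL_MAGIC, 2)

int main(int argc, char **argv)
{
    if (argc != 2) {
        fprintf(stderr, "usage: %s <file-on-f2fs>\n", argv[0]);
        return 1;
    }

    int fd = open(argv[1], O_RDWR);
    if (fd < 0) { perror("open"); return 1; }

    /* begin an atomic write session; dirty pages stay in memory */
    if (ioctl(fd, F2FS_IOC_START_ATOMIC_WRITE) < 0)
        perror("start atomic write");
    else if (write(fd, "data", 4) != 4)
        perror("write");
    /* commit flushes the in-memory pages and fsyncs the file */
    else if (ioctl(fd, F2FS_IOC_COMMIT_ATOMIC_WRITE) < 0)
        perror("commit atomic write");

    close(fd);
    return 0;
}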
967static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg) 1032static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg)
968{ 1033{
969 struct inode *inode = file_inode(filp); 1034 struct inode *inode = file_inode(filp);
@@ -1001,12 +1066,18 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
1001 return f2fs_ioc_getflags(filp, arg); 1066 return f2fs_ioc_getflags(filp, arg);
1002 case F2FS_IOC_SETFLAGS: 1067 case F2FS_IOC_SETFLAGS:
1003 return f2fs_ioc_setflags(filp, arg); 1068 return f2fs_ioc_setflags(filp, arg);
1069 case F2FS_IOC_GETVERSION:
1070 return f2fs_ioc_getversion(filp, arg);
1004 case F2FS_IOC_START_ATOMIC_WRITE: 1071 case F2FS_IOC_START_ATOMIC_WRITE:
1005 return f2fs_ioc_start_atomic_write(filp); 1072 return f2fs_ioc_start_atomic_write(filp);
1006 case F2FS_IOC_COMMIT_ATOMIC_WRITE: 1073 case F2FS_IOC_COMMIT_ATOMIC_WRITE:
1007 return f2fs_ioc_commit_atomic_write(filp); 1074 return f2fs_ioc_commit_atomic_write(filp);
1008 case F2FS_IOC_START_VOLATILE_WRITE: 1075 case F2FS_IOC_START_VOLATILE_WRITE:
1009 return f2fs_ioc_start_volatile_write(filp); 1076 return f2fs_ioc_start_volatile_write(filp);
1077 case F2FS_IOC_RELEASE_VOLATILE_WRITE:
1078 return f2fs_ioc_release_volatile_write(filp);
1079 case F2FS_IOC_ABORT_VOLATILE_WRITE:
1080 return f2fs_ioc_abort_volatile_write(filp);
1010 case FITRIM: 1081 case FITRIM:
1011 return f2fs_ioc_fitrim(filp, arg); 1082 return f2fs_ioc_fitrim(filp, arg);
1012 default: 1083 default:
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index eec0933a4819..76adbc3641f1 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -24,8 +24,6 @@
24#include "gc.h" 24#include "gc.h"
25#include <trace/events/f2fs.h> 25#include <trace/events/f2fs.h>
26 26
27static struct kmem_cache *winode_slab;
28
29static int gc_thread_func(void *data) 27static int gc_thread_func(void *data)
30{ 28{
31 struct f2fs_sb_info *sbi = data; 29 struct f2fs_sb_info *sbi = data;
@@ -46,7 +44,7 @@ static int gc_thread_func(void *data)
46 break; 44 break;
47 45
48 if (sbi->sb->s_writers.frozen >= SB_FREEZE_WRITE) { 46 if (sbi->sb->s_writers.frozen >= SB_FREEZE_WRITE) {
49 wait_ms = increase_sleep_time(gc_th, wait_ms); 47 increase_sleep_time(gc_th, &wait_ms);
50 continue; 48 continue;
51 } 49 }
52 50
@@ -67,15 +65,15 @@ static int gc_thread_func(void *data)
67 continue; 65 continue;
68 66
69 if (!is_idle(sbi)) { 67 if (!is_idle(sbi)) {
70 wait_ms = increase_sleep_time(gc_th, wait_ms); 68 increase_sleep_time(gc_th, &wait_ms);
71 mutex_unlock(&sbi->gc_mutex); 69 mutex_unlock(&sbi->gc_mutex);
72 continue; 70 continue;
73 } 71 }
74 72
75 if (has_enough_invalid_blocks(sbi)) 73 if (has_enough_invalid_blocks(sbi))
76 wait_ms = decrease_sleep_time(gc_th, wait_ms); 74 decrease_sleep_time(gc_th, &wait_ms);
77 else 75 else
78 wait_ms = increase_sleep_time(gc_th, wait_ms); 76 increase_sleep_time(gc_th, &wait_ms);
79 77
80 stat_inc_bggc_count(sbi); 78 stat_inc_bggc_count(sbi);
81 79
@@ -356,13 +354,10 @@ static void add_gc_inode(struct gc_inode_list *gc_list, struct inode *inode)
356 iput(inode); 354 iput(inode);
357 return; 355 return;
358 } 356 }
359 new_ie = f2fs_kmem_cache_alloc(winode_slab, GFP_NOFS); 357 new_ie = f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS);
360 new_ie->inode = inode; 358 new_ie->inode = inode;
361retry: 359
362 if (radix_tree_insert(&gc_list->iroot, inode->i_ino, new_ie)) { 360 f2fs_radix_tree_insert(&gc_list->iroot, inode->i_ino, new_ie);
363 cond_resched();
364 goto retry;
365 }
366 list_add_tail(&new_ie->list, &gc_list->ilist); 361 list_add_tail(&new_ie->list, &gc_list->ilist);
367} 362}
368 363
@@ -373,7 +368,7 @@ static void put_gc_inode(struct gc_inode_list *gc_list)
373 radix_tree_delete(&gc_list->iroot, ie->inode->i_ino); 368 radix_tree_delete(&gc_list->iroot, ie->inode->i_ino);
374 iput(ie->inode); 369 iput(ie->inode);
375 list_del(&ie->list); 370 list_del(&ie->list);
376 kmem_cache_free(winode_slab, ie); 371 kmem_cache_free(inode_entry_slab, ie);
377 } 372 }
378} 373}
379 374
@@ -703,8 +698,7 @@ int f2fs_gc(struct f2fs_sb_info *sbi)
703 .iroot = RADIX_TREE_INIT(GFP_NOFS), 698 .iroot = RADIX_TREE_INIT(GFP_NOFS),
704 }; 699 };
705 700
706 cpc.reason = test_opt(sbi, FASTBOOT) ? CP_UMOUNT : CP_SYNC; 701 cpc.reason = __get_cp_reason(sbi);
707
708gc_more: 702gc_more:
709 if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE))) 703 if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE)))
710 goto stop; 704 goto stop;
@@ -750,17 +744,3 @@ void build_gc_manager(struct f2fs_sb_info *sbi)
750{ 744{
751 DIRTY_I(sbi)->v_ops = &default_v_ops; 745 DIRTY_I(sbi)->v_ops = &default_v_ops;
752} 746}
753
754int __init create_gc_caches(void)
755{
756 winode_slab = f2fs_kmem_cache_create("f2fs_gc_inodes",
757 sizeof(struct inode_entry));
758 if (!winode_slab)
759 return -ENOMEM;
760 return 0;
761}
762
763void destroy_gc_caches(void)
764{
765 kmem_cache_destroy(winode_slab);
766}
diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h
index 6ff7ad38463e..b4a65be9f7d3 100644
--- a/fs/f2fs/gc.h
+++ b/fs/f2fs/gc.h
@@ -35,11 +35,6 @@ struct f2fs_gc_kthread {
35 unsigned int gc_idle; 35 unsigned int gc_idle;
36}; 36};
37 37
38struct inode_entry {
39 struct list_head list;
40 struct inode *inode;
41};
42
43struct gc_inode_list { 38struct gc_inode_list {
44 struct list_head ilist; 39 struct list_head ilist;
45 struct radix_tree_root iroot; 40 struct radix_tree_root iroot;
@@ -69,26 +64,26 @@ static inline block_t limit_free_user_blocks(struct f2fs_sb_info *sbi)
69 return (long)(reclaimable_user_blocks * LIMIT_FREE_BLOCK) / 100; 64 return (long)(reclaimable_user_blocks * LIMIT_FREE_BLOCK) / 100;
70} 65}
71 66
72static inline long increase_sleep_time(struct f2fs_gc_kthread *gc_th, long wait) 67static inline void increase_sleep_time(struct f2fs_gc_kthread *gc_th,
68 long *wait)
73{ 69{
74 if (wait == gc_th->no_gc_sleep_time) 70 if (*wait == gc_th->no_gc_sleep_time)
75 return wait; 71 return;
76 72
77 wait += gc_th->min_sleep_time; 73 *wait += gc_th->min_sleep_time;
78 if (wait > gc_th->max_sleep_time) 74 if (*wait > gc_th->max_sleep_time)
79 wait = gc_th->max_sleep_time; 75 *wait = gc_th->max_sleep_time;
80 return wait;
81} 76}
82 77
83static inline long decrease_sleep_time(struct f2fs_gc_kthread *gc_th, long wait) 78static inline void decrease_sleep_time(struct f2fs_gc_kthread *gc_th,
79 long *wait)
84{ 80{
85 if (wait == gc_th->no_gc_sleep_time) 81 if (*wait == gc_th->no_gc_sleep_time)
86 wait = gc_th->max_sleep_time; 82 *wait = gc_th->max_sleep_time;
87 83
88 wait -= gc_th->min_sleep_time; 84 *wait -= gc_th->min_sleep_time;
89 if (wait <= gc_th->min_sleep_time) 85 if (*wait <= gc_th->min_sleep_time)
90 wait = gc_th->min_sleep_time; 86 *wait = gc_th->min_sleep_time;
91 return wait;
92} 87}
93 88
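The sleep-time helpers now adjust the caller's wait value through a pointer instead of returning it, which lets the no-GC case bail out without touching the value. An illustrative userspace model of the clamping logic (the threshold constants are made up, not the gc_th defaults):

#include <stdio.h>

#define MIN_SLEEP   30000L
#define MAX_SLEEP   60000L
#define NO_GC_SLEEP 300000L

static void increase_sleep_time(long *wait)
{
    if (*wait == NO_GC_SLEEP)
        return;                  /* already parked at the no-GC interval */
    *wait += MIN_SLEEP;
    if (*wait > MAX_SLEEP)
        *wait = MAX_SLEEP;
}

static void decrease_sleep_time(long *wait)
{
    if (*wait == NO_GC_SLEEP)
        *wait = MAX_SLEEP;       /* leave the parked state first */
    *wait -= MIN_SLEEP;
    if (*wait <= MIN_SLEEP)
        *wait = MIN_SLEEP;
}

int main(void)
{
    long wait = MIN_SLEEP;

    increase_sleep_time(&wait);
    printf("after increase: %ld\n", wait);   /* clamped to MAX_SLEEP */
    decrease_sleep_time(&wait);
    printf("after decrease: %ld\n", wait);   /* back to MIN_SLEEP */
    return 0;
}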
94static inline bool has_enough_invalid_blocks(struct f2fs_sb_info *sbi) 89static inline bool has_enough_invalid_blocks(struct f2fs_sb_info *sbi)
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index f2d3c581e776..1484c00133cd 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -50,6 +50,12 @@ void read_inline_data(struct page *page, struct page *ipage)
50 SetPageUptodate(page); 50 SetPageUptodate(page);
51} 51}
52 52
53static void truncate_inline_data(struct page *ipage)
54{
55 f2fs_wait_on_page_writeback(ipage, NODE);
56 memset(inline_data_addr(ipage), 0, MAX_INLINE_DATA);
57}
58
53int f2fs_read_inline_data(struct inode *inode, struct page *page) 59int f2fs_read_inline_data(struct inode *inode, struct page *page)
54{ 60{
55 struct page *ipage; 61 struct page *ipage;
@@ -79,7 +85,6 @@ int f2fs_read_inline_data(struct inode *inode, struct page *page)
79int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) 85int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page)
80{ 86{
81 void *src_addr, *dst_addr; 87 void *src_addr, *dst_addr;
82 block_t new_blk_addr;
83 struct f2fs_io_info fio = { 88 struct f2fs_io_info fio = {
84 .type = DATA, 89 .type = DATA,
85 .rw = WRITE_SYNC | REQ_PRIO, 90 .rw = WRITE_SYNC | REQ_PRIO,
@@ -115,9 +120,9 @@ no_update:
115 120
116 /* write data page to try to make data consistent */ 121 /* write data page to try to make data consistent */
117 set_page_writeback(page); 122 set_page_writeback(page);
118 123 fio.blk_addr = dn->data_blkaddr;
119 write_data_page(page, dn, &new_blk_addr, &fio); 124 write_data_page(page, dn, &fio);
120 update_extent_cache(new_blk_addr, dn); 125 update_extent_cache(dn);
121 f2fs_wait_on_page_writeback(page, DATA); 126 f2fs_wait_on_page_writeback(page, DATA);
122 if (dirty) 127 if (dirty)
123 inode_dec_dirty_pages(dn->inode); 128 inode_dec_dirty_pages(dn->inode);
@@ -126,7 +131,7 @@ no_update:
126 set_inode_flag(F2FS_I(dn->inode), FI_APPEND_WRITE); 131 set_inode_flag(F2FS_I(dn->inode), FI_APPEND_WRITE);
127 132
128 /* clear inline data and flag after data writeback */ 133 /* clear inline data and flag after data writeback */
129 truncate_inline_data(dn->inode_page, 0); 134 truncate_inline_data(dn->inode_page);
130clear_out: 135clear_out:
131 stat_dec_inline_inode(dn->inode); 136 stat_dec_inline_inode(dn->inode);
132 f2fs_clear_inline_inode(dn->inode); 137 f2fs_clear_inline_inode(dn->inode);
@@ -199,19 +204,6 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page)
199 return 0; 204 return 0;
200} 205}
201 206
202void truncate_inline_data(struct page *ipage, u64 from)
203{
204 void *addr;
205
206 if (from >= MAX_INLINE_DATA)
207 return;
208
209 f2fs_wait_on_page_writeback(ipage, NODE);
210
211 addr = inline_data_addr(ipage);
212 memset(addr + from, 0, MAX_INLINE_DATA - from);
213}
214
215bool recover_inline_data(struct inode *inode, struct page *npage) 207bool recover_inline_data(struct inode *inode, struct page *npage)
216{ 208{
217 struct f2fs_sb_info *sbi = F2FS_I_SB(inode); 209 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
@@ -253,7 +245,7 @@ process_inline:
253 if (f2fs_has_inline_data(inode)) { 245 if (f2fs_has_inline_data(inode)) {
254 ipage = get_node_page(sbi, inode->i_ino); 246 ipage = get_node_page(sbi, inode->i_ino);
255 f2fs_bug_on(sbi, IS_ERR(ipage)); 247 f2fs_bug_on(sbi, IS_ERR(ipage));
256 truncate_inline_data(ipage, 0); 248 truncate_inline_data(ipage);
257 f2fs_clear_inline_inode(inode); 249 f2fs_clear_inline_inode(inode);
258 update_inode(inode, ipage); 250 update_inode(inode, ipage);
259 f2fs_put_page(ipage, 1); 251 f2fs_put_page(ipage, 1);
@@ -371,7 +363,7 @@ static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage,
371 set_page_dirty(page); 363 set_page_dirty(page);
372 364
373 /* clear inline dir and flag after data writeback */ 365 /* clear inline dir and flag after data writeback */
374 truncate_inline_data(ipage, 0); 366 truncate_inline_data(ipage);
375 367
376 stat_dec_inline_dir(dir); 368 stat_dec_inline_dir(dir);
377 clear_inode_flag(F2FS_I(dir), FI_INLINE_DENTRY); 369 clear_inode_flag(F2FS_I(dir), FI_INLINE_DENTRY);
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 196cc7843aaf..2d002e3738a7 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -67,29 +67,23 @@ static void __set_inode_rdev(struct inode *inode, struct f2fs_inode *ri)
67 } 67 }
68} 68}
69 69
70static int __recover_inline_status(struct inode *inode, struct page *ipage) 70static void __recover_inline_status(struct inode *inode, struct page *ipage)
71{ 71{
72 void *inline_data = inline_data_addr(ipage); 72 void *inline_data = inline_data_addr(ipage);
73 struct f2fs_inode *ri; 73 __le32 *start = inline_data;
74 void *zbuf; 74 __le32 *end = start + MAX_INLINE_DATA / sizeof(__le32);
75 75
76 zbuf = kzalloc(MAX_INLINE_DATA, GFP_NOFS); 76 while (start < end) {
77 if (!zbuf) 77 if (*start++) {
78 return -ENOMEM; 78 f2fs_wait_on_page_writeback(ipage, NODE);
79 79
80 if (!memcmp(zbuf, inline_data, MAX_INLINE_DATA)) { 80 set_inode_flag(F2FS_I(inode), FI_DATA_EXIST);
81 kfree(zbuf); 81 set_raw_inline(F2FS_I(inode), F2FS_INODE(ipage));
82 return 0; 82 set_page_dirty(ipage);
83 return;
84 }
83 } 85 }
84 kfree(zbuf); 86 return;
85
86 f2fs_wait_on_page_writeback(ipage, NODE);
87 set_inode_flag(F2FS_I(inode), FI_DATA_EXIST);
88
89 ri = F2FS_INODE(ipage);
90 set_raw_inline(F2FS_I(inode), ri);
91 set_page_dirty(ipage);
92 return 0;
93} 87}
94 88
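__recover_inline_status() previously kzalloc'ed a zero buffer just to memcmp it against the inline area; the rewrite scans the area as __le32 words, so the helper can no longer fail with -ENOMEM and drops its int return type. A standalone model of the allocation-free zero scan (the buffer size mirrors MAX_INLINE_DATA of this era, which is an assumption here):

#include <stdio.h>
#include <stdint.h>

#define INLINE_BYTES 3488    /* stand-in for MAX_INLINE_DATA */

/* returns 1 if the buffer contains any nonzero word — the same test
 * the patched helper performs without a temporary allocation */
static int has_data(const void *buf)
{
    const uint32_t *start = buf;
    const uint32_t *end = start + INLINE_BYTES / sizeof(uint32_t);

    while (start < end)
        if (*start++)
            return 1;
    return 0;
}

int main(void)
{
    static uint32_t inline_area[INLINE_BYTES / sizeof(uint32_t)];

    printf("%d\n", has_data(inline_area));   /* 0: all zero */
    inline_area[10] = 0xdeadbeef;
    printf("%d\n", has_data(inline_area));   /* 1: data exists */
    return 0;
}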
95static int do_read_inode(struct inode *inode) 89static int do_read_inode(struct inode *inode)
@@ -98,7 +92,6 @@ static int do_read_inode(struct inode *inode)
98 struct f2fs_inode_info *fi = F2FS_I(inode); 92 struct f2fs_inode_info *fi = F2FS_I(inode);
99 struct page *node_page; 93 struct page *node_page;
100 struct f2fs_inode *ri; 94 struct f2fs_inode *ri;
101 int err = 0;
102 95
103 /* Check if ino is within scope */ 96 /* Check if ino is within scope */
104 if (check_nid_range(sbi, inode->i_ino)) { 97 if (check_nid_range(sbi, inode->i_ino)) {
@@ -142,7 +135,7 @@ static int do_read_inode(struct inode *inode)
142 135
143 /* check data exist */ 136 /* check data exist */
144 if (f2fs_has_inline_data(inode) && !f2fs_exist_data(inode)) 137 if (f2fs_has_inline_data(inode) && !f2fs_exist_data(inode))
145 err = __recover_inline_status(inode, node_page); 138 __recover_inline_status(inode, node_page);
146 139
147 /* get rdev by using inline_info */ 140 /* get rdev by using inline_info */
148 __get_inode_rdev(inode, ri); 141 __get_inode_rdev(inode, ri);
@@ -152,7 +145,7 @@ static int do_read_inode(struct inode *inode)
152 stat_inc_inline_inode(inode); 145 stat_inc_inline_inode(inode);
153 stat_inc_inline_dir(inode); 146 stat_inc_inline_dir(inode);
154 147
155 return err; 148 return 0;
156} 149}
157 150
158struct inode *f2fs_iget(struct super_block *sb, unsigned long ino) 151struct inode *f2fs_iget(struct super_block *sb, unsigned long ino)
@@ -304,7 +297,7 @@ void f2fs_evict_inode(struct inode *inode)
304 nid_t xnid = F2FS_I(inode)->i_xattr_nid; 297 nid_t xnid = F2FS_I(inode)->i_xattr_nid;
305 298
306 /* any remaining atomic pages should be discarded */ 299 /* any remaining atomic pages should be discarded */
307 if (f2fs_is_atomic_file(inode) || f2fs_is_volatile_file(inode)) 300 if (f2fs_is_atomic_file(inode))
308 commit_inmem_pages(inode, true); 301 commit_inmem_pages(inode, true);
309 302
310 trace_f2fs_evict_inode(inode); 303 trace_f2fs_evict_inode(inode);
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 547a2deeb1ac..e79639a9787a 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -299,7 +299,7 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
299 inode->i_op = &f2fs_dir_inode_operations; 299 inode->i_op = &f2fs_dir_inode_operations;
300 inode->i_fop = &f2fs_dir_operations; 300 inode->i_fop = &f2fs_dir_operations;
301 inode->i_mapping->a_ops = &f2fs_dblock_aops; 301 inode->i_mapping->a_ops = &f2fs_dblock_aops;
302 mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_ZERO); 302 mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_HIGH_ZERO);
303 303
304 set_inode_flag(F2FS_I(inode), FI_INC_LINK); 304 set_inode_flag(F2FS_I(inode), FI_INC_LINK);
305 f2fs_lock_op(sbi); 305 f2fs_lock_op(sbi);
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index f83326ca32ef..97bd9d3db882 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -19,6 +19,7 @@
19#include "f2fs.h" 19#include "f2fs.h"
20#include "node.h" 20#include "node.h"
21#include "segment.h" 21#include "segment.h"
22#include "trace.h"
22#include <trace/events/f2fs.h> 23#include <trace/events/f2fs.h>
23 24
24#define on_build_free_nids(nmi) mutex_is_locked(&nm_i->build_lock) 25#define on_build_free_nids(nmi) mutex_is_locked(&nm_i->build_lock)
@@ -57,12 +58,13 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type)
57 } else if (type == INO_ENTRIES) { 58 } else if (type == INO_ENTRIES) {
58 int i; 59 int i;
59 60
60 if (sbi->sb->s_bdi->dirty_exceeded)
61 return false;
62 for (i = 0; i <= UPDATE_INO; i++) 61 for (i = 0; i <= UPDATE_INO; i++)
63 mem_size += (sbi->im[i].ino_num * 62 mem_size += (sbi->im[i].ino_num *
64 sizeof(struct ino_entry)) >> PAGE_CACHE_SHIFT; 63 sizeof(struct ino_entry)) >> PAGE_CACHE_SHIFT;
65 res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); 64 res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
65 } else {
66 if (sbi->sb->s_bdi->dirty_exceeded)
67 return false;
66 } 68 }
67 return res; 69 return res;
68} 70}
@@ -268,7 +270,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
268 e = __lookup_nat_cache(nm_i, ni->nid); 270 e = __lookup_nat_cache(nm_i, ni->nid);
269 if (!e) { 271 if (!e) {
270 e = grab_nat_entry(nm_i, ni->nid); 272 e = grab_nat_entry(nm_i, ni->nid);
271 e->ni = *ni; 273 copy_node_info(&e->ni, ni);
272 f2fs_bug_on(sbi, ni->blk_addr == NEW_ADDR); 274 f2fs_bug_on(sbi, ni->blk_addr == NEW_ADDR);
273 } else if (new_blkaddr == NEW_ADDR) { 275 } else if (new_blkaddr == NEW_ADDR) {
274 /* 276 /*
@@ -276,7 +278,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
276 * previous nat entry can remain in the nat cache. 278 * previous nat entry can remain in the nat cache.
277 * So, reinitialize it with new information. 279 * So, reinitialize it with new information.
278 */ 280 */
279 e->ni = *ni; 281 copy_node_info(&e->ni, ni);
280 f2fs_bug_on(sbi, ni->blk_addr != NULL_ADDR); 282 f2fs_bug_on(sbi, ni->blk_addr != NULL_ADDR);
281 } 283 }
282 284
@@ -346,7 +348,6 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni)
346 struct nat_entry *e; 348 struct nat_entry *e;
347 int i; 349 int i;
348 350
349 memset(&ne, 0, sizeof(struct f2fs_nat_entry));
350 ni->nid = nid; 351 ni->nid = nid;
351 352
352 /* Check nat cache */ 353 /* Check nat cache */
@@ -361,6 +362,8 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni)
361 if (e) 362 if (e)
362 return; 363 return;
363 364
365 memset(&ne, 0, sizeof(struct f2fs_nat_entry));
366
364 /* Check current segment summary */ 367 /* Check current segment summary */
365 mutex_lock(&curseg->curseg_mutex); 368 mutex_lock(&curseg->curseg_mutex);
366 i = lookup_journal_in_cursum(sum, NAT_JOURNAL, nid, 0); 369 i = lookup_journal_in_cursum(sum, NAT_JOURNAL, nid, 0);
@@ -471,7 +474,7 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
471{ 474{
472 struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); 475 struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
473 struct page *npage[4]; 476 struct page *npage[4];
474 struct page *parent; 477 struct page *parent = NULL;
475 int offset[4]; 478 int offset[4];
476 unsigned int noffset[4]; 479 unsigned int noffset[4];
477 nid_t nids[4]; 480 nid_t nids[4];
@@ -488,6 +491,14 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
488 if (IS_ERR(npage[0])) 491 if (IS_ERR(npage[0]))
489 return PTR_ERR(npage[0]); 492 return PTR_ERR(npage[0]);
490 } 493 }
494
495 /* if inline_data is set, we should not report any block indices */
496 if (f2fs_has_inline_data(dn->inode) && index) {
497 err = -EINVAL;
498 f2fs_put_page(npage[0], 1);
499 goto release_out;
500 }
501
491 parent = npage[0]; 502 parent = npage[0];
492 if (level != 0) 503 if (level != 0)
493 nids[1] = get_nid(parent, offset[0], true); 504 nids[1] = get_nid(parent, offset[0], true);
@@ -585,7 +596,7 @@ static void truncate_node(struct dnode_of_data *dn)
585 } 596 }
586invalidate: 597invalidate:
587 clear_node_page_dirty(dn->node_page); 598 clear_node_page_dirty(dn->node_page);
588 F2FS_SET_SB_DIRT(sbi); 599 set_sbi_flag(sbi, SBI_IS_DIRTY);
589 600
590 f2fs_put_page(dn->node_page, 1); 601 f2fs_put_page(dn->node_page, 1);
591 602
@@ -976,6 +987,10 @@ static int read_node_page(struct page *page, int rw)
976{ 987{
977 struct f2fs_sb_info *sbi = F2FS_P_SB(page); 988 struct f2fs_sb_info *sbi = F2FS_P_SB(page);
978 struct node_info ni; 989 struct node_info ni;
990 struct f2fs_io_info fio = {
991 .type = NODE,
992 .rw = rw,
993 };
979 994
980 get_node_info(sbi, page->index, &ni); 995 get_node_info(sbi, page->index, &ni);
981 996
@@ -987,7 +1002,8 @@ static int read_node_page(struct page *page, int rw)
987 if (PageUptodate(page)) 1002 if (PageUptodate(page))
988 return LOCKED_PAGE; 1003 return LOCKED_PAGE;
989 1004
990 return f2fs_submit_page_bio(sbi, page, ni.blk_addr, rw); 1005 fio.blk_addr = ni.blk_addr;
1006 return f2fs_submit_page_bio(sbi, page, &fio);
991} 1007}
992 1008
993/* 1009/*
@@ -1028,11 +1044,11 @@ repeat:
1028 err = read_node_page(page, READ_SYNC); 1044 err = read_node_page(page, READ_SYNC);
1029 if (err < 0) 1045 if (err < 0)
1030 return ERR_PTR(err); 1046 return ERR_PTR(err);
1031 else if (err == LOCKED_PAGE) 1047 else if (err != LOCKED_PAGE)
1032 goto got_it; 1048 lock_page(page);
1033 1049
1034 lock_page(page);
1035 if (unlikely(!PageUptodate(page) || nid != nid_of_node(page))) { 1050 if (unlikely(!PageUptodate(page) || nid != nid_of_node(page))) {
1051 ClearPageUptodate(page);
1036 f2fs_put_page(page, 1); 1052 f2fs_put_page(page, 1);
1037 return ERR_PTR(-EIO); 1053 return ERR_PTR(-EIO);
1038 } 1054 }
@@ -1040,7 +1056,6 @@ repeat:
1040 f2fs_put_page(page, 1); 1056 f2fs_put_page(page, 1);
1041 goto repeat; 1057 goto repeat;
1042 } 1058 }
1043got_it:
1044 return page; 1059 return page;
1045} 1060}
1046 1061
@@ -1268,7 +1283,6 @@ static int f2fs_write_node_page(struct page *page,
1268{ 1283{
1269 struct f2fs_sb_info *sbi = F2FS_P_SB(page); 1284 struct f2fs_sb_info *sbi = F2FS_P_SB(page);
1270 nid_t nid; 1285 nid_t nid;
1271 block_t new_addr;
1272 struct node_info ni; 1286 struct node_info ni;
1273 struct f2fs_io_info fio = { 1287 struct f2fs_io_info fio = {
1274 .type = NODE, 1288 .type = NODE,
@@ -1277,7 +1291,7 @@ static int f2fs_write_node_page(struct page *page,
1277 1291
1278 trace_f2fs_writepage(page, NODE); 1292 trace_f2fs_writepage(page, NODE);
1279 1293
1280 if (unlikely(sbi->por_doing)) 1294 if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
1281 goto redirty_out; 1295 goto redirty_out;
1282 if (unlikely(f2fs_cp_error(sbi))) 1296 if (unlikely(f2fs_cp_error(sbi)))
1283 goto redirty_out; 1297 goto redirty_out;
@@ -1303,9 +1317,11 @@ static int f2fs_write_node_page(struct page *page,
1303 } else { 1317 } else {
1304 down_read(&sbi->node_write); 1318 down_read(&sbi->node_write);
1305 } 1319 }
1320
1306 set_page_writeback(page); 1321 set_page_writeback(page);
1307 write_node_page(sbi, page, &fio, nid, ni.blk_addr, &new_addr); 1322 fio.blk_addr = ni.blk_addr;
1308 set_node_addr(sbi, &ni, new_addr, is_fsync_dnode(page)); 1323 write_node_page(sbi, page, nid, &fio);
1324 set_node_addr(sbi, &ni, fio.blk_addr, is_fsync_dnode(page));
1309 dec_page_count(sbi, F2FS_DIRTY_NODES); 1325 dec_page_count(sbi, F2FS_DIRTY_NODES);
1310 up_read(&sbi->node_write); 1326 up_read(&sbi->node_write);
1311 unlock_page(page); 1327 unlock_page(page);
@@ -1355,26 +1371,12 @@ static int f2fs_set_node_page_dirty(struct page *page)
1355 __set_page_dirty_nobuffers(page); 1371 __set_page_dirty_nobuffers(page);
1356 inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES); 1372 inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES);
1357 SetPagePrivate(page); 1373 SetPagePrivate(page);
1374 f2fs_trace_pid(page);
1358 return 1; 1375 return 1;
1359 } 1376 }
1360 return 0; 1377 return 0;
1361} 1378}
1362 1379
1363static void f2fs_invalidate_node_page(struct page *page, unsigned int offset,
1364 unsigned int length)
1365{
1366 struct inode *inode = page->mapping->host;
1367 if (PageDirty(page))
1368 dec_page_count(F2FS_I_SB(inode), F2FS_DIRTY_NODES);
1369 ClearPagePrivate(page);
1370}
1371
1372static int f2fs_release_node_page(struct page *page, gfp_t wait)
1373{
1374 ClearPagePrivate(page);
1375 return 1;
1376}
1377
1378/* 1380/*
1379 * Structure of the f2fs node operations 1381 * Structure of the f2fs node operations
1380 */ 1382 */
@@ -1382,8 +1384,8 @@ const struct address_space_operations f2fs_node_aops = {
1382 .writepage = f2fs_write_node_page, 1384 .writepage = f2fs_write_node_page,
1383 .writepages = f2fs_write_node_pages, 1385 .writepages = f2fs_write_node_pages,
1384 .set_page_dirty = f2fs_set_node_page_dirty, 1386 .set_page_dirty = f2fs_set_node_page_dirty,
1385 .invalidatepage = f2fs_invalidate_node_page, 1387 .invalidatepage = f2fs_invalidate_page,
1386 .releasepage = f2fs_release_node_page, 1388 .releasepage = f2fs_release_page,
1387}; 1389};
1388 1390
1389static struct free_nid *__lookup_free_nid_list(struct f2fs_nm_info *nm_i, 1391static struct free_nid *__lookup_free_nid_list(struct f2fs_nm_info *nm_i,
@@ -1726,80 +1728,41 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
1726 return 0; 1728 return 0;
1727} 1729}
1728 1730
1729/*
1730 * ra_sum_pages() merges contiguous pages into one bio and submits it.
1731 * these pre-read pages are allocated in bd_inode's mapping tree.
1732 */
1733static int ra_sum_pages(struct f2fs_sb_info *sbi, struct page **pages,
1734 int start, int nrpages)
1735{
1736 struct inode *inode = sbi->sb->s_bdev->bd_inode;
1737 struct address_space *mapping = inode->i_mapping;
1738 int i, page_idx = start;
1739 struct f2fs_io_info fio = {
1740 .type = META,
1741 .rw = READ_SYNC | REQ_META | REQ_PRIO
1742 };
1743
1744 for (i = 0; page_idx < start + nrpages; page_idx++, i++) {
1745 /* alloc page in bd_inode for reading node summary info */
1746 pages[i] = grab_cache_page(mapping, page_idx);
1747 if (!pages[i])
1748 break;
1749 f2fs_submit_page_mbio(sbi, pages[i], page_idx, &fio);
1750 }
1751
1752 f2fs_submit_merged_bio(sbi, META, READ);
1753 return i;
1754}
1755
1756int restore_node_summary(struct f2fs_sb_info *sbi, 1731int restore_node_summary(struct f2fs_sb_info *sbi,
1757 unsigned int segno, struct f2fs_summary_block *sum) 1732 unsigned int segno, struct f2fs_summary_block *sum)
1758{ 1733{
1759 struct f2fs_node *rn; 1734 struct f2fs_node *rn;
1760 struct f2fs_summary *sum_entry; 1735 struct f2fs_summary *sum_entry;
1761 struct inode *inode = sbi->sb->s_bdev->bd_inode;
1762 block_t addr; 1736 block_t addr;
1763 int bio_blocks = MAX_BIO_BLOCKS(sbi); 1737 int bio_blocks = MAX_BIO_BLOCKS(sbi);
1764 struct page *pages[bio_blocks]; 1738 int i, idx, last_offset, nrpages;
1765 int i, idx, last_offset, nrpages, err = 0;
1766 1739
1767 /* scan the node segment */ 1740 /* scan the node segment */
1768 last_offset = sbi->blocks_per_seg; 1741 last_offset = sbi->blocks_per_seg;
1769 addr = START_BLOCK(sbi, segno); 1742 addr = START_BLOCK(sbi, segno);
1770 sum_entry = &sum->entries[0]; 1743 sum_entry = &sum->entries[0];
1771 1744
1772 for (i = 0; !err && i < last_offset; i += nrpages, addr += nrpages) { 1745 for (i = 0; i < last_offset; i += nrpages, addr += nrpages) {
1773 nrpages = min(last_offset - i, bio_blocks); 1746 nrpages = min(last_offset - i, bio_blocks);
1774 1747
1775 /* readahead node pages */ 1748 /* readahead node pages */
1776 nrpages = ra_sum_pages(sbi, pages, addr, nrpages); 1749 ra_meta_pages(sbi, addr, nrpages, META_POR);
1777 if (!nrpages)
1778 return -ENOMEM;
1779 1750
1780 for (idx = 0; idx < nrpages; idx++) { 1751 for (idx = addr; idx < addr + nrpages; idx++) {
1781 if (err) 1752 struct page *page = get_meta_page(sbi, idx);
1782 goto skip;
1783 1753
1784 lock_page(pages[idx]); 1754 rn = F2FS_NODE(page);
1785 if (unlikely(!PageUptodate(pages[idx]))) { 1755 sum_entry->nid = rn->footer.nid;
1786 err = -EIO; 1756 sum_entry->version = 0;
1787 } else { 1757 sum_entry->ofs_in_node = 0;
1788 rn = F2FS_NODE(pages[idx]); 1758 sum_entry++;
1789 sum_entry->nid = rn->footer.nid; 1759 f2fs_put_page(page, 1);
1790 sum_entry->version = 0;
1791 sum_entry->ofs_in_node = 0;
1792 sum_entry++;
1793 }
1794 unlock_page(pages[idx]);
1795skip:
1796 page_cache_release(pages[idx]);
1797 } 1760 }
1798 1761
1799 invalidate_mapping_pages(inode->i_mapping, addr, 1762 invalidate_mapping_pages(META_MAPPING(sbi), addr,
1800 addr + nrpages); 1763 addr + nrpages);
1801 } 1764 }
1802 return err; 1765 return 0;
1803} 1766}
1804 1767
1805static void remove_nats_in_journal(struct f2fs_sb_info *sbi) 1768static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
@@ -1923,7 +1886,7 @@ void flush_nat_entries(struct f2fs_sb_info *sbi)
1923 struct f2fs_nm_info *nm_i = NM_I(sbi); 1886 struct f2fs_nm_info *nm_i = NM_I(sbi);
1924 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); 1887 struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
1925 struct f2fs_summary_block *sum = curseg->sum_blk; 1888 struct f2fs_summary_block *sum = curseg->sum_blk;
1926 struct nat_entry_set *setvec[NATVEC_SIZE]; 1889 struct nat_entry_set *setvec[SETVEC_SIZE];
1927 struct nat_entry_set *set, *tmp; 1890 struct nat_entry_set *set, *tmp;
1928 unsigned int found; 1891 unsigned int found;
1929 nid_t set_idx = 0; 1892 nid_t set_idx = 0;
@@ -1940,7 +1903,7 @@ void flush_nat_entries(struct f2fs_sb_info *sbi)
1940 remove_nats_in_journal(sbi); 1903 remove_nats_in_journal(sbi);
1941 1904
1942 while ((found = __gang_lookup_nat_set(nm_i, 1905 while ((found = __gang_lookup_nat_set(nm_i,
1943 set_idx, NATVEC_SIZE, setvec))) { 1906 set_idx, SETVEC_SIZE, setvec))) {
1944 unsigned idx; 1907 unsigned idx;
1945 set_idx = setvec[found - 1]->set + 1; 1908 set_idx = setvec[found - 1]->set + 1;
1946 for (idx = 0; idx < found; idx++) 1909 for (idx = 0; idx < found; idx++)
@@ -2020,6 +1983,7 @@ void destroy_node_manager(struct f2fs_sb_info *sbi)
2020 struct f2fs_nm_info *nm_i = NM_I(sbi); 1983 struct f2fs_nm_info *nm_i = NM_I(sbi);
2021 struct free_nid *i, *next_i; 1984 struct free_nid *i, *next_i;
2022 struct nat_entry *natvec[NATVEC_SIZE]; 1985 struct nat_entry *natvec[NATVEC_SIZE];
1986 struct nat_entry_set *setvec[SETVEC_SIZE];
2023 nid_t nid = 0; 1987 nid_t nid = 0;
2024 unsigned int found; 1988 unsigned int found;
2025 1989
@@ -2044,11 +2008,27 @@ void destroy_node_manager(struct f2fs_sb_info *sbi)
2044 while ((found = __gang_lookup_nat_cache(nm_i, 2008 while ((found = __gang_lookup_nat_cache(nm_i,
2045 nid, NATVEC_SIZE, natvec))) { 2009 nid, NATVEC_SIZE, natvec))) {
2046 unsigned idx; 2010 unsigned idx;
2011
2047 nid = nat_get_nid(natvec[found - 1]) + 1; 2012 nid = nat_get_nid(natvec[found - 1]) + 1;
2048 for (idx = 0; idx < found; idx++) 2013 for (idx = 0; idx < found; idx++)
2049 __del_from_nat_cache(nm_i, natvec[idx]); 2014 __del_from_nat_cache(nm_i, natvec[idx]);
2050 } 2015 }
2051 f2fs_bug_on(sbi, nm_i->nat_cnt); 2016 f2fs_bug_on(sbi, nm_i->nat_cnt);
2017
2018 /* destroy nat set cache */
2019 nid = 0;
2020 while ((found = __gang_lookup_nat_set(nm_i,
2021 nid, SETVEC_SIZE, setvec))) {
2022 unsigned idx;
2023
2024 nid = setvec[found - 1]->set + 1;
2025 for (idx = 0; idx < found; idx++) {
2026 /* entry_cnt may be nonzero when a cp_error has occurred */
2027 f2fs_bug_on(sbi, !list_empty(&setvec[idx]->entry_list));
2028 radix_tree_delete(&nm_i->nat_set_root, setvec[idx]->set);
2029 kmem_cache_free(nat_entry_set_slab, setvec[idx]);
2030 }
2031 }
2052 up_write(&nm_i->nat_tree_lock); 2032 up_write(&nm_i->nat_tree_lock);
2053 2033
2054 kfree(nm_i->nat_bitmap); 2034 kfree(nm_i->nat_bitmap);
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index d10b6448a671..f405bbf2435a 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -25,10 +25,19 @@
25 25
26/* vector size for gang look-up from nat cache that consists of radix tree */ 26/* vector size for gang look-up from nat cache that consists of radix tree */
27#define NATVEC_SIZE 64 27#define NATVEC_SIZE 64
28#define SETVEC_SIZE 32
28 29
29/* return value for read_node_page */ 30/* return value for read_node_page */
30#define LOCKED_PAGE 1 31#define LOCKED_PAGE 1
31 32
33/* For flag in struct node_info */
34enum {
35 IS_CHECKPOINTED, /* is it checkpointed before? */
36 HAS_FSYNCED_INODE, /* is the inode fsynced before? */
37 HAS_LAST_FSYNC, /* has the latest node fsync mark? */
38 IS_DIRTY, /* this nat entry is dirty? */
39};
40
32/* 41/*
33 * For node information 42 * For node information
34 */ 43 */
@@ -37,18 +46,11 @@ struct node_info {
37 nid_t ino; /* inode number of the node's owner */ 46 nid_t ino; /* inode number of the node's owner */
38 block_t blk_addr; /* block address of the node */ 47 block_t blk_addr; /* block address of the node */
39 unsigned char version; /* version of the node */ 48 unsigned char version; /* version of the node */
40}; 49 unsigned char flag; /* for node information bits */
41
42enum {
43 IS_CHECKPOINTED, /* is it checkpointed before? */
44 HAS_FSYNCED_INODE, /* is the inode fsynced before? */
45 HAS_LAST_FSYNC, /* has the latest node fsync mark? */
46 IS_DIRTY, /* this nat entry is dirty? */
47}; 50};
48 51
49struct nat_entry { 52struct nat_entry {
50 struct list_head list; /* for clean or dirty nat list */ 53 struct list_head list; /* for clean or dirty nat list */
51 unsigned char flag; /* for node information bits */
52 struct node_info ni; /* in-memory node information */ 54 struct node_info ni; /* in-memory node information */
53}; 55};
54 56
@@ -63,20 +65,30 @@ struct nat_entry {
63 65
64#define inc_node_version(version) (++version) 66#define inc_node_version(version) (++version)
65 67
68static inline void copy_node_info(struct node_info *dst,
69 struct node_info *src)
70{
71 dst->nid = src->nid;
72 dst->ino = src->ino;
73 dst->blk_addr = src->blk_addr;
74 dst->version = src->version;
75 /* should not copy flag here */
76}
77
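Because the flag byte now lives inside struct node_info, a plain struct assignment (the old e->ni = *ni) would wipe the cache-private flag bits, which is why copy_node_info() copies field by field and deliberately skips flag. A toy demonstration of the difference:

#include <stdio.h>

struct node_info {
    unsigned int nid;
    unsigned int blk_addr;
    unsigned char flag;    /* cache-private bits that must survive updates */
};

/* field-wise copy mirroring copy_node_info(): everything but flag */
static void copy_node_info(struct node_info *dst, const struct node_info *src)
{
    dst->nid = src->nid;
    dst->blk_addr = src->blk_addr;
    /* deliberately not copying dst->flag */
}

int main(void)
{
    struct node_info cached = { .nid = 1, .blk_addr = 100, .flag = 0x04 };
    struct node_info fresh  = { .nid = 1, .blk_addr = 200, .flag = 0 };

    copy_node_info(&cached, &fresh);   /* flag survives the update */
    printf("blk=%u flag=0x%02x\n", cached.blk_addr, cached.flag);

    cached = fresh;                    /* struct assignment: flag is lost */
    printf("blk=%u flag=0x%02x\n", cached.blk_addr, cached.flag);
    return 0;
}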
66static inline void set_nat_flag(struct nat_entry *ne, 78static inline void set_nat_flag(struct nat_entry *ne,
67 unsigned int type, bool set) 79 unsigned int type, bool set)
68{ 80{
69 unsigned char mask = 0x01 << type; 81 unsigned char mask = 0x01 << type;
70 if (set) 82 if (set)
71 ne->flag |= mask; 83 ne->ni.flag |= mask;
72 else 84 else
73 ne->flag &= ~mask; 85 ne->ni.flag &= ~mask;
74} 86}
75 87
76static inline bool get_nat_flag(struct nat_entry *ne, unsigned int type) 88static inline bool get_nat_flag(struct nat_entry *ne, unsigned int type)
77{ 89{
78 unsigned char mask = 0x01 << type; 90 unsigned char mask = 0x01 << type;
79 return ne->flag & mask; 91 return ne->ni.flag & mask;
80} 92}
81 93
82static inline void nat_reset_flag(struct nat_entry *ne) 94static inline void nat_reset_flag(struct nat_entry *ne)
@@ -108,6 +120,7 @@ enum mem_type {
108 NAT_ENTRIES, /* indicates the cached nat entry */ 120 NAT_ENTRIES, /* indicates the cached nat entry */
109 DIRTY_DENTS, /* indicates dirty dentry pages */ 121 DIRTY_DENTS, /* indicates dirty dentry pages */
110 INO_ENTRIES, /* indicates inode entries */ 122 INO_ENTRIES, /* indicates inode entries */
123 BASE_CHECK, /* check kernel status */
111}; 124};
112 125
113struct nat_entry_set { 126struct nat_entry_set {
@@ -200,11 +213,19 @@ static inline void fill_node_footer(struct page *page, nid_t nid,
200 nid_t ino, unsigned int ofs, bool reset) 213 nid_t ino, unsigned int ofs, bool reset)
201{ 214{
202 struct f2fs_node *rn = F2FS_NODE(page); 215 struct f2fs_node *rn = F2FS_NODE(page);
216 unsigned int old_flag = 0;
217
203 if (reset) 218 if (reset)
204 memset(rn, 0, sizeof(*rn)); 219 memset(rn, 0, sizeof(*rn));
220 else
221 old_flag = le32_to_cpu(rn->footer.flag);
222
205 rn->footer.nid = cpu_to_le32(nid); 223 rn->footer.nid = cpu_to_le32(nid);
206 rn->footer.ino = cpu_to_le32(ino); 224 rn->footer.ino = cpu_to_le32(ino);
207 rn->footer.flag = cpu_to_le32(ofs << OFFSET_BIT_SHIFT); 225
226 /* should remain old flag bits such as COLD_BIT_SHIFT */
227 rn->footer.flag = cpu_to_le32((ofs << OFFSET_BIT_SHIFT) |
228 (old_flag & OFFSET_BIT_MASK));
208} 229}
209 230
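fill_node_footer() now preserves the low flag bits (such as the cold bit) when rewriting the offset field, instead of clobbering the whole word. A sketch of the mask-and-merge; the shift and mask values are illustrative, not the kernel's:

#include <stdio.h>
#include <stdint.h>

/* low bits carry per-node state, high bits carry the node offset */
#define OFFSET_BIT_SHIFT 4
#define OFFSET_BIT_MASK  ((1u << OFFSET_BIT_SHIFT) - 1)

/* replace the offset field but keep the existing low flag bits */
static uint32_t fill_footer_flag(uint32_t old_flag, uint32_t ofs)
{
    return (ofs << OFFSET_BIT_SHIFT) | (old_flag & OFFSET_BIT_MASK);
}

int main(void)
{
    uint32_t flag = fill_footer_flag(0, 7);   /* offset 7, no flag bits */
    printf("0x%08x\n", flag);                 /* 0x00000070 */

    flag |= 0x2;                              /* pretend the cold bit is set */
    flag = fill_footer_flag(flag, 9);         /* re-fill with offset 9 */
    printf("0x%08x\n", flag);                 /* 0x00000092: cold bit kept */
    return 0;
}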
210static inline void copy_node_footer(struct page *dst, struct page *src) 231static inline void copy_node_footer(struct page *dst, struct page *src)
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 9160a37e1c7a..41afb9534bbd 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -346,6 +346,10 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
346 if (IS_INODE(page)) { 346 if (IS_INODE(page)) {
347 recover_inline_xattr(inode, page); 347 recover_inline_xattr(inode, page);
348 } else if (f2fs_has_xattr_block(ofs_of_node(page))) { 348 } else if (f2fs_has_xattr_block(ofs_of_node(page))) {
349 /*
350 * Deprecated; xattr blocks should be found in the cold log.
351 * But we should keep this for backward compatibility.
352 */
349 recover_xattr_data(inode, page, blkaddr); 353 recover_xattr_data(inode, page, blkaddr);
350 goto out; 354 goto out;
351 } 355 }
@@ -396,7 +400,8 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
396 400
397 /* write dummy data page */ 401 /* write dummy data page */
398 recover_data_page(sbi, NULL, &sum, src, dest); 402 recover_data_page(sbi, NULL, &sum, src, dest);
399 update_extent_cache(dest, &dn); 403 dn.data_blkaddr = dest;
404 update_extent_cache(&dn);
400 recovered++; 405 recovered++;
401 } 406 }
402 dn.ofs_in_node++; 407 dn.ofs_in_node++;
@@ -503,7 +508,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi)
503 INIT_LIST_HEAD(&inode_list); 508 INIT_LIST_HEAD(&inode_list);
504 509
505 /* step #1: find fsynced inode numbers */ 510 /* step #1: find fsynced inode numbers */
506 sbi->por_doing = true; 511 set_sbi_flag(sbi, SBI_POR_DOING);
507 512
508 /* prevent checkpoint */ 513 /* prevent checkpoint */
509 mutex_lock(&sbi->cp_mutex); 514 mutex_lock(&sbi->cp_mutex);
@@ -536,7 +541,7 @@ out:
536 truncate_inode_pages_final(META_MAPPING(sbi)); 541 truncate_inode_pages_final(META_MAPPING(sbi));
537 } 542 }
538 543
539 sbi->por_doing = false; 544 clear_sbi_flag(sbi, SBI_POR_DOING);
540 if (err) { 545 if (err) {
541 discard_next_dnode(sbi, blkaddr); 546 discard_next_dnode(sbi, blkaddr);
542 547
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 42607a679923..daee4ab913da 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -20,6 +20,7 @@
20#include "f2fs.h" 20#include "f2fs.h"
21#include "segment.h" 21#include "segment.h"
22#include "node.h" 22#include "node.h"
23#include "trace.h"
23#include <trace/events/f2fs.h> 24#include <trace/events/f2fs.h>
24 25
25#define __reverse_ffz(x) __reverse_ffs(~(x)) 26#define __reverse_ffz(x) __reverse_ffs(~(x))
@@ -181,6 +182,7 @@ void register_inmem_page(struct inode *inode, struct page *page)
181 int err; 182 int err;
182 183
183 SetPagePrivate(page); 184 SetPagePrivate(page);
185 f2fs_trace_pid(page);
184 186
185 new = f2fs_kmem_cache_alloc(inmem_entry_slab, GFP_NOFS); 187 new = f2fs_kmem_cache_alloc(inmem_entry_slab, GFP_NOFS);
186 188
@@ -205,23 +207,6 @@ retry:
205 mutex_unlock(&fi->inmem_lock); 207 mutex_unlock(&fi->inmem_lock);
206} 208}
207 209
208void invalidate_inmem_page(struct inode *inode, struct page *page)
209{
210 struct f2fs_inode_info *fi = F2FS_I(inode);
211 struct inmem_pages *cur;
212
213 mutex_lock(&fi->inmem_lock);
214 cur = radix_tree_lookup(&fi->inmem_root, page->index);
215 if (cur) {
216 radix_tree_delete(&fi->inmem_root, cur->page->index);
217 f2fs_put_page(cur->page, 0);
218 list_del(&cur->list);
219 kmem_cache_free(inmem_entry_slab, cur);
220 dec_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES);
221 }
222 mutex_unlock(&fi->inmem_lock);
223}
224
225void commit_inmem_pages(struct inode *inode, bool abort) 210void commit_inmem_pages(struct inode *inode, bool abort)
226{ 211{
227 struct f2fs_sb_info *sbi = F2FS_I_SB(inode); 212 struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
@@ -230,7 +215,7 @@ void commit_inmem_pages(struct inode *inode, bool abort)
230 bool submit_bio = false; 215 bool submit_bio = false;
231 struct f2fs_io_info fio = { 216 struct f2fs_io_info fio = {
232 .type = DATA, 217 .type = DATA,
233 .rw = WRITE_SYNC, 218 .rw = WRITE_SYNC | REQ_PRIO,
234 }; 219 };
235 220
236 /* 221 /*
@@ -240,33 +225,38 @@ void commit_inmem_pages(struct inode *inode, bool abort)
240 * Otherwise, f2fs_gc in f2fs_balance_fs can wait forever until this 225 * Otherwise, f2fs_gc in f2fs_balance_fs can wait forever until this
241 * inode is freed by iget_locked in f2fs_iget. 226 * inode is freed by iget_locked in f2fs_iget.
242 */ 227 */
243 if (!abort) 228 if (!abort) {
244 f2fs_balance_fs(sbi); 229 f2fs_balance_fs(sbi);
245 230 f2fs_lock_op(sbi);
246 f2fs_lock_op(sbi); 231 }
247 232
248 mutex_lock(&fi->inmem_lock); 233 mutex_lock(&fi->inmem_lock);
249 list_for_each_entry_safe(cur, tmp, &fi->inmem_pages, list) { 234 list_for_each_entry_safe(cur, tmp, &fi->inmem_pages, list) {
250 lock_page(cur->page); 235 if (!abort) {
251 if (!abort && cur->page->mapping == inode->i_mapping) { 236 lock_page(cur->page);
252 f2fs_wait_on_page_writeback(cur->page, DATA); 237 if (cur->page->mapping == inode->i_mapping) {
253 if (clear_page_dirty_for_io(cur->page)) 238 f2fs_wait_on_page_writeback(cur->page, DATA);
254 inode_dec_dirty_pages(inode); 239 if (clear_page_dirty_for_io(cur->page))
255 do_write_data_page(cur->page, &fio); 240 inode_dec_dirty_pages(inode);
256 submit_bio = true; 241 do_write_data_page(cur->page, &fio);
242 submit_bio = true;
243 }
244 f2fs_put_page(cur->page, 1);
245 } else {
246 put_page(cur->page);
257 } 247 }
258 radix_tree_delete(&fi->inmem_root, cur->page->index); 248 radix_tree_delete(&fi->inmem_root, cur->page->index);
259 f2fs_put_page(cur->page, 1);
260 list_del(&cur->list); 249 list_del(&cur->list);
261 kmem_cache_free(inmem_entry_slab, cur); 250 kmem_cache_free(inmem_entry_slab, cur);
262 dec_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES); 251 dec_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES);
263 } 252 }
264 if (submit_bio)
265 f2fs_submit_merged_bio(sbi, DATA, WRITE);
266 mutex_unlock(&fi->inmem_lock); 253 mutex_unlock(&fi->inmem_lock);
267 254
268 filemap_fdatawait_range(inode->i_mapping, 0, LLONG_MAX); 255 if (!abort) {
269 f2fs_unlock_op(sbi); 256 f2fs_unlock_op(sbi);
257 if (submit_bio)
258 f2fs_submit_merged_bio(sbi, DATA, WRITE);
259 }
270} 260}
271 261
272/* 262/*
@@ -290,7 +280,7 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi)
290 /* check the # of cached NAT entries and prefree segments */ 280 /* check the # of cached NAT entries and prefree segments */
291 if (try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK) || 281 if (try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK) ||
292 excess_prefree_segs(sbi) || 282 excess_prefree_segs(sbi) ||
293 available_free_memory(sbi, INO_ENTRIES)) 283 !available_free_memory(sbi, INO_ENTRIES))
294 f2fs_sync_fs(sbi->sb, true); 284 f2fs_sync_fs(sbi->sb, true);
295} 285}
296 286
@@ -515,12 +505,13 @@ static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc)
515 struct seg_entry *se = get_seg_entry(sbi, cpc->trim_start); 505 struct seg_entry *se = get_seg_entry(sbi, cpc->trim_start);
516 unsigned long *cur_map = (unsigned long *)se->cur_valid_map; 506 unsigned long *cur_map = (unsigned long *)se->cur_valid_map;
517 unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map; 507 unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map;
518 unsigned long dmap[entries]; 508 unsigned long *dmap = SIT_I(sbi)->tmp_map;
519 unsigned int start = 0, end = -1; 509 unsigned int start = 0, end = -1;
520 bool force = (cpc->reason == CP_DISCARD); 510 bool force = (cpc->reason == CP_DISCARD);
521 int i; 511 int i;
522 512
523 if (!force && !test_opt(sbi, DISCARD)) 513 if (!force && (!test_opt(sbi, DISCARD) ||
514 SM_I(sbi)->nr_discards >= SM_I(sbi)->max_discards))
524 return; 515 return;
525 516
526 if (force && !se->valid_blocks) { 517 if (force && !se->valid_blocks) {
@@ -548,7 +539,8 @@ static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc)
548 539
549 /* SIT_VBLOCK_MAP_SIZE should be multiple of sizeof(unsigned long) */ 540 /* SIT_VBLOCK_MAP_SIZE should be multiple of sizeof(unsigned long) */
550 for (i = 0; i < entries; i++) 541 for (i = 0; i < entries; i++)
551 dmap[i] = ~(cur_map[i] | ckpt_map[i]); 542 dmap[i] = force ? ~ckpt_map[i] :
543 (cur_map[i] ^ ckpt_map[i]) & ckpt_map[i];
552 544
553 while (force || SM_I(sbi)->nr_discards <= SM_I(sbi)->max_discards) { 545 while (force || SM_I(sbi)->nr_discards <= SM_I(sbi)->max_discards) {
554 start = __find_rev_next_bit(dmap, max_blocks, end + 1); 546 start = __find_rev_next_bit(dmap, max_blocks, end + 1);
@@ -735,7 +727,7 @@ static void __add_sum_entry(struct f2fs_sb_info *sbi, int type,
735/* 727/*
736 * Calculate the number of current summary pages for writing 728 * Calculate the number of current summary pages for writing
737 */ 729 */
738int npages_for_summary_flush(struct f2fs_sb_info *sbi) 730int npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra)
739{ 731{
740 int valid_sum_count = 0; 732 int valid_sum_count = 0;
741 int i, sum_in_page; 733 int i, sum_in_page;
@@ -743,8 +735,13 @@ int npages_for_summary_flush(struct f2fs_sb_info *sbi)
743 for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { 735 for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) {
744 if (sbi->ckpt->alloc_type[i] == SSR) 736 if (sbi->ckpt->alloc_type[i] == SSR)
745 valid_sum_count += sbi->blocks_per_seg; 737 valid_sum_count += sbi->blocks_per_seg;
746 else 738 else {
747 valid_sum_count += curseg_blkoff(sbi, i); 739 if (for_ra)
740 valid_sum_count += le16_to_cpu(
741 F2FS_CKPT(sbi)->cur_data_blkoff[i]);
742 else
743 valid_sum_count += curseg_blkoff(sbi, i);
744 }
748 } 745 }
749 746
750 sum_in_page = (PAGE_CACHE_SIZE - 2 * SUM_JOURNAL_SIZE - 747 sum_in_page = (PAGE_CACHE_SIZE - 2 * SUM_JOURNAL_SIZE -
@@ -803,7 +800,7 @@ static void get_new_segment(struct f2fs_sb_info *sbi,
803 int go_left = 0; 800 int go_left = 0;
804 int i; 801 int i;
805 802
806 write_lock(&free_i->segmap_lock); 803 spin_lock(&free_i->segmap_lock);
807 804
808 if (!new_sec && ((*newseg + 1) % sbi->segs_per_sec)) { 805 if (!new_sec && ((*newseg + 1) % sbi->segs_per_sec)) {
809 segno = find_next_zero_bit(free_i->free_segmap, 806 segno = find_next_zero_bit(free_i->free_segmap,
@@ -876,7 +873,7 @@ got_it:
876 f2fs_bug_on(sbi, test_bit(segno, free_i->free_segmap)); 873 f2fs_bug_on(sbi, test_bit(segno, free_i->free_segmap));
877 __set_inuse(sbi, segno); 874 __set_inuse(sbi, segno);
878 *newseg = segno; 875 *newseg = segno;
879 write_unlock(&free_i->segmap_lock); 876 spin_unlock(&free_i->segmap_lock);
880} 877}
881 878
882static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified) 879static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified)
@@ -927,7 +924,7 @@ static void __next_free_blkoff(struct f2fs_sb_info *sbi,
927{ 924{
928 struct seg_entry *se = get_seg_entry(sbi, seg->segno); 925 struct seg_entry *se = get_seg_entry(sbi, seg->segno);
929 int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long); 926 int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long);
930 unsigned long target_map[entries]; 927 unsigned long *target_map = SIT_I(sbi)->tmp_map;
931 unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map; 928 unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map;
932 unsigned long *cur_map = (unsigned long *)se->cur_valid_map; 929 unsigned long *cur_map = (unsigned long *)se->cur_valid_map;
933 int i, pos; 930 int i, pos;
@@ -1027,18 +1024,22 @@ static void allocate_segment_by_default(struct f2fs_sb_info *sbi,
1027 stat_inc_seg_type(sbi, curseg); 1024 stat_inc_seg_type(sbi, curseg);
1028} 1025}
1029 1026
1027static void __allocate_new_segments(struct f2fs_sb_info *sbi, int type)
1028{
1029 struct curseg_info *curseg = CURSEG_I(sbi, type);
1030 unsigned int old_segno;
1031
1032 old_segno = curseg->segno;
1033 SIT_I(sbi)->s_ops->allocate_segment(sbi, type, true);
1034 locate_dirty_segment(sbi, old_segno);
1035}
1036
1030void allocate_new_segments(struct f2fs_sb_info *sbi) 1037void allocate_new_segments(struct f2fs_sb_info *sbi)
1031{ 1038{
1032 struct curseg_info *curseg;
1033 unsigned int old_curseg;
1034 int i; 1039 int i;
1035 1040
1036 for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { 1041 for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++)
1037 curseg = CURSEG_I(sbi, i); 1042 __allocate_new_segments(sbi, i);
1038 old_curseg = curseg->segno;
1039 SIT_I(sbi)->s_ops->allocate_segment(sbi, i, true);
1040 locate_dirty_segment(sbi, old_curseg);
1041 }
1042} 1043}
1043 1044
1044static const struct segment_allocation default_salloc_ops = { 1045static const struct segment_allocation default_salloc_ops = {
@@ -1047,8 +1048,8 @@ static const struct segment_allocation default_salloc_ops = {
1047 1048
1048int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) 1049int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range)
1049{ 1050{
1050 __u64 start = range->start >> sbi->log_blocksize; 1051 __u64 start = F2FS_BYTES_TO_BLK(range->start);
1051 __u64 end = start + (range->len >> sbi->log_blocksize) - 1; 1052 __u64 end = start + F2FS_BYTES_TO_BLK(range->len) - 1;
1052 unsigned int start_segno, end_segno; 1053 unsigned int start_segno, end_segno;
1053 struct cp_control cpc; 1054 struct cp_control cpc;
1054 1055
@@ -1065,16 +1066,21 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range)
1065 end_segno = (end >= MAX_BLKADDR(sbi)) ? MAIN_SEGS(sbi) - 1 : 1066 end_segno = (end >= MAX_BLKADDR(sbi)) ? MAIN_SEGS(sbi) - 1 :
1066 GET_SEGNO(sbi, end); 1067 GET_SEGNO(sbi, end);
1067 cpc.reason = CP_DISCARD; 1068 cpc.reason = CP_DISCARD;
1068 cpc.trim_start = start_segno; 1069 cpc.trim_minlen = F2FS_BYTES_TO_BLK(range->minlen);
1069 cpc.trim_end = end_segno;
1070 cpc.trim_minlen = range->minlen >> sbi->log_blocksize;
1071 1070
1072 /* do checkpoint to issue discard commands safely */ 1071 /* do checkpoint to issue discard commands safely */
1073 mutex_lock(&sbi->gc_mutex); 1072 for (; start_segno <= end_segno; start_segno = cpc.trim_end + 1) {
1074 write_checkpoint(sbi, &cpc); 1073 cpc.trim_start = start_segno;
1075 mutex_unlock(&sbi->gc_mutex); 1074 cpc.trim_end = min_t(unsigned int, rounddown(start_segno +
1075 BATCHED_TRIM_SEGMENTS(sbi),
1076 sbi->segs_per_sec) - 1, end_segno);
1077
1078 mutex_lock(&sbi->gc_mutex);
1079 write_checkpoint(sbi, &cpc);
1080 mutex_unlock(&sbi->gc_mutex);
1081 }
1076out: 1082out:
1077 range->len = cpc.trimmed << sbi->log_blocksize; 1083 range->len = F2FS_BLK_TO_BYTES(cpc.trimmed);
1078 return 0; 1084 return 0;
1079} 1085}
1080 1086
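
Note on the f2fs_trim_fs() rework above: instead of one checkpoint covering the
whole requested range (which could hold gc_mutex for a very long time on a large
device), the range is now split into batches of BATCHED_TRIM_SEGMENTS, aligned
down to a section boundary, with one checkpoint per batch. A minimal userspace
sketch of the range split; the geometry constants are illustrative, not taken
from the patch:

/* build: cc -std=c11 -Wall trim_batch.c */
#include <stdio.h>

#define SEGS_PER_SEC          4U  /* assumed geometry */
#define BATCHED_TRIM_SEGMENTS 8U  /* trim_sections * segs_per_sec in f2fs */

/* kernel's rounddown(): largest multiple of y that is <= x */
static unsigned int rounddown(unsigned int x, unsigned int y)
{
        return x - (x % y);
}

int main(void)
{
        unsigned int start = 3, end = 29;       /* segment range to trim */
        unsigned int trim_start, trim_end;

        /* assumes one batch covers at least a whole section */
        for (trim_start = start; trim_start <= end;
             trim_start = trim_end + 1) {
                trim_end = rounddown(trim_start + BATCHED_TRIM_SEGMENTS,
                                     SEGS_PER_SEC) - 1;
                if (trim_end > end)
                        trim_end = end;
                /* write_checkpoint() would run once per iteration here */
                printf("checkpoint trims segments %u..%u\n",
                       trim_start, trim_end);
        }
        return 0;
}
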
@@ -1151,11 +1157,18 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
1151{ 1157{
1152 struct sit_info *sit_i = SIT_I(sbi); 1158 struct sit_info *sit_i = SIT_I(sbi);
1153 struct curseg_info *curseg; 1159 struct curseg_info *curseg;
1160 bool direct_io = (type == CURSEG_DIRECT_IO);
1161
1162 type = direct_io ? CURSEG_WARM_DATA : type;
1154 1163
1155 curseg = CURSEG_I(sbi, type); 1164 curseg = CURSEG_I(sbi, type);
1156 1165
1157 mutex_lock(&curseg->curseg_mutex); 1166 mutex_lock(&curseg->curseg_mutex);
1158 1167
1168 /* direct_io'ed data is aligned to the segment for better performance */
1169 if (direct_io && curseg->next_blkoff)
1170 __allocate_new_segments(sbi, type);
1171
1159 *new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); 1172 *new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
1160 1173
1161 /* 1174 /*
@@ -1187,39 +1200,39 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
1187} 1200}
1188 1201
1189static void do_write_page(struct f2fs_sb_info *sbi, struct page *page, 1202static void do_write_page(struct f2fs_sb_info *sbi, struct page *page,
1190 block_t old_blkaddr, block_t *new_blkaddr, 1203 struct f2fs_summary *sum,
1191 struct f2fs_summary *sum, struct f2fs_io_info *fio) 1204 struct f2fs_io_info *fio)
1192{ 1205{
1193 int type = __get_segment_type(page, fio->type); 1206 int type = __get_segment_type(page, fio->type);
1194 1207
1195 allocate_data_block(sbi, page, old_blkaddr, new_blkaddr, sum, type); 1208 allocate_data_block(sbi, page, fio->blk_addr, &fio->blk_addr, sum, type);
1196 1209
1197 /* writeout dirty page into bdev */ 1210 /* writeout dirty page into bdev */
1198 f2fs_submit_page_mbio(sbi, page, *new_blkaddr, fio); 1211 f2fs_submit_page_mbio(sbi, page, fio);
1199} 1212}
1200 1213
1201void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) 1214void write_meta_page(struct f2fs_sb_info *sbi, struct page *page)
1202{ 1215{
1203 struct f2fs_io_info fio = { 1216 struct f2fs_io_info fio = {
1204 .type = META, 1217 .type = META,
1205 .rw = WRITE_SYNC | REQ_META | REQ_PRIO 1218 .rw = WRITE_SYNC | REQ_META | REQ_PRIO,
1219 .blk_addr = page->index,
1206 }; 1220 };
1207 1221
1208 set_page_writeback(page); 1222 set_page_writeback(page);
1209 f2fs_submit_page_mbio(sbi, page, page->index, &fio); 1223 f2fs_submit_page_mbio(sbi, page, &fio);
1210} 1224}
1211 1225
1212void write_node_page(struct f2fs_sb_info *sbi, struct page *page, 1226void write_node_page(struct f2fs_sb_info *sbi, struct page *page,
1213 struct f2fs_io_info *fio, 1227 unsigned int nid, struct f2fs_io_info *fio)
1214 unsigned int nid, block_t old_blkaddr, block_t *new_blkaddr)
1215{ 1228{
1216 struct f2fs_summary sum; 1229 struct f2fs_summary sum;
1217 set_summary(&sum, nid, 0, 0); 1230 set_summary(&sum, nid, 0, 0);
1218 do_write_page(sbi, page, old_blkaddr, new_blkaddr, &sum, fio); 1231 do_write_page(sbi, page, &sum, fio);
1219} 1232}
1220 1233
1221void write_data_page(struct page *page, struct dnode_of_data *dn, 1234void write_data_page(struct page *page, struct dnode_of_data *dn,
1222 block_t *new_blkaddr, struct f2fs_io_info *fio) 1235 struct f2fs_io_info *fio)
1223{ 1236{
1224 struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); 1237 struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
1225 struct f2fs_summary sum; 1238 struct f2fs_summary sum;
@@ -1228,14 +1241,14 @@ void write_data_page(struct page *page, struct dnode_of_data *dn,
1228 f2fs_bug_on(sbi, dn->data_blkaddr == NULL_ADDR); 1241 f2fs_bug_on(sbi, dn->data_blkaddr == NULL_ADDR);
1229 get_node_info(sbi, dn->nid, &ni); 1242 get_node_info(sbi, dn->nid, &ni);
1230 set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); 1243 set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version);
1231 1244 do_write_page(sbi, page, &sum, fio);
1232 do_write_page(sbi, page, dn->data_blkaddr, new_blkaddr, &sum, fio); 1245 dn->data_blkaddr = fio->blk_addr;
1233} 1246}
1234 1247
1235void rewrite_data_page(struct page *page, block_t old_blkaddr, 1248void rewrite_data_page(struct page *page, struct f2fs_io_info *fio)
1236 struct f2fs_io_info *fio)
1237{ 1249{
1238 f2fs_submit_page_mbio(F2FS_P_SB(page), page, old_blkaddr, fio); 1250 stat_inc_inplace_blocks(F2FS_P_SB(page));
1251 f2fs_submit_page_mbio(F2FS_P_SB(page), page, fio);
1239} 1252}
1240 1253
1241void recover_data_page(struct f2fs_sb_info *sbi, 1254void recover_data_page(struct f2fs_sb_info *sbi,
@@ -1393,7 +1406,7 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type)
1393 segno = le32_to_cpu(ckpt->cur_data_segno[type]); 1406 segno = le32_to_cpu(ckpt->cur_data_segno[type]);
1394 blk_off = le16_to_cpu(ckpt->cur_data_blkoff[type - 1407 blk_off = le16_to_cpu(ckpt->cur_data_blkoff[type -
1395 CURSEG_HOT_DATA]); 1408 CURSEG_HOT_DATA]);
1396 if (is_set_ckpt_flags(ckpt, CP_UMOUNT_FLAG)) 1409 if (__exist_node_summaries(sbi))
1397 blk_addr = sum_blk_addr(sbi, NR_CURSEG_TYPE, type); 1410 blk_addr = sum_blk_addr(sbi, NR_CURSEG_TYPE, type);
1398 else 1411 else
1399 blk_addr = sum_blk_addr(sbi, NR_CURSEG_DATA_TYPE, type); 1412 blk_addr = sum_blk_addr(sbi, NR_CURSEG_DATA_TYPE, type);
@@ -1402,7 +1415,7 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type)
1402 CURSEG_HOT_NODE]); 1415 CURSEG_HOT_NODE]);
1403 blk_off = le16_to_cpu(ckpt->cur_node_blkoff[type - 1416 blk_off = le16_to_cpu(ckpt->cur_node_blkoff[type -
1404 CURSEG_HOT_NODE]); 1417 CURSEG_HOT_NODE]);
1405 if (is_set_ckpt_flags(ckpt, CP_UMOUNT_FLAG)) 1418 if (__exist_node_summaries(sbi))
1406 blk_addr = sum_blk_addr(sbi, NR_CURSEG_NODE_TYPE, 1419 blk_addr = sum_blk_addr(sbi, NR_CURSEG_NODE_TYPE,
1407 type - CURSEG_HOT_NODE); 1420 type - CURSEG_HOT_NODE);
1408 else 1421 else
@@ -1413,7 +1426,7 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type)
1413 sum = (struct f2fs_summary_block *)page_address(new); 1426 sum = (struct f2fs_summary_block *)page_address(new);
1414 1427
1415 if (IS_NODESEG(type)) { 1428 if (IS_NODESEG(type)) {
1416 if (is_set_ckpt_flags(ckpt, CP_UMOUNT_FLAG)) { 1429 if (__exist_node_summaries(sbi)) {
1417 struct f2fs_summary *ns = &sum->entries[0]; 1430 struct f2fs_summary *ns = &sum->entries[0];
1418 int i; 1431 int i;
1419 for (i = 0; i < sbi->blocks_per_seg; i++, ns++) { 1432 for (i = 0; i < sbi->blocks_per_seg; i++, ns++) {
@@ -1450,12 +1463,22 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi)
1450 int err; 1463 int err;
1451 1464
1452 if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_COMPACT_SUM_FLAG)) { 1465 if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_COMPACT_SUM_FLAG)) {
1466 int npages = npages_for_summary_flush(sbi, true);
1467
1468 if (npages >= 2)
1469 ra_meta_pages(sbi, start_sum_block(sbi), npages,
1470 META_CP);
1471
1453 /* restore for compacted data summary */ 1472 /* restore for compacted data summary */
1454 if (read_compacted_summaries(sbi)) 1473 if (read_compacted_summaries(sbi))
1455 return -EINVAL; 1474 return -EINVAL;
1456 type = CURSEG_HOT_NODE; 1475 type = CURSEG_HOT_NODE;
1457 } 1476 }
1458 1477
1478 if (__exist_node_summaries(sbi))
1479 ra_meta_pages(sbi, sum_blk_addr(sbi, NR_CURSEG_TYPE, type),
1480 NR_CURSEG_TYPE - type, META_CP);
1481
1459 for (; type <= CURSEG_COLD_NODE; type++) { 1482 for (; type <= CURSEG_COLD_NODE; type++) {
1460 err = read_normal_summaries(sbi, type); 1483 err = read_normal_summaries(sbi, type);
1461 if (err) 1484 if (err)
@@ -1549,8 +1572,7 @@ void write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk)
1549 1572
1550void write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk) 1573void write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk)
1551{ 1574{
1552 if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_UMOUNT_FLAG)) 1575 write_normal_summaries(sbi, start_blk, CURSEG_HOT_NODE);
1553 write_normal_summaries(sbi, start_blk, CURSEG_HOT_NODE);
1554} 1576}
1555 1577
1556int lookup_journal_in_cursum(struct f2fs_summary_block *sum, int type, 1578int lookup_journal_in_cursum(struct f2fs_summary_block *sum, int type,
@@ -1754,7 +1776,7 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
1754 se = get_seg_entry(sbi, segno); 1776 se = get_seg_entry(sbi, segno);
1755 1777
1756 /* add discard candidates */ 1778 /* add discard candidates */
1757 if (SM_I(sbi)->nr_discards < SM_I(sbi)->max_discards) { 1779 if (cpc->reason != CP_DISCARD) {
1758 cpc->trim_start = segno; 1780 cpc->trim_start = segno;
1759 add_discard_addrs(sbi, cpc); 1781 add_discard_addrs(sbi, cpc);
1760 } 1782 }
@@ -1833,6 +1855,10 @@ static int build_sit_info(struct f2fs_sb_info *sbi)
1833 return -ENOMEM; 1855 return -ENOMEM;
1834 } 1856 }
1835 1857
1858 sit_i->tmp_map = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL);
1859 if (!sit_i->tmp_map)
1860 return -ENOMEM;
1861
1836 if (sbi->segs_per_sec > 1) { 1862 if (sbi->segs_per_sec > 1) {
1837 sit_i->sec_entries = vzalloc(MAIN_SECS(sbi) * 1863 sit_i->sec_entries = vzalloc(MAIN_SECS(sbi) *
1838 sizeof(struct sec_entry)); 1864 sizeof(struct sec_entry));
@@ -1897,7 +1923,7 @@ static int build_free_segmap(struct f2fs_sb_info *sbi)
1897 free_i->start_segno = GET_SEGNO_FROM_SEG0(sbi, MAIN_BLKADDR(sbi)); 1923 free_i->start_segno = GET_SEGNO_FROM_SEG0(sbi, MAIN_BLKADDR(sbi));
1898 free_i->free_segments = 0; 1924 free_i->free_segments = 0;
1899 free_i->free_sections = 0; 1925 free_i->free_sections = 0;
1900 rwlock_init(&free_i->segmap_lock); 1926 spin_lock_init(&free_i->segmap_lock);
1901 return 0; 1927 return 0;
1902} 1928}
1903 1929
@@ -2110,6 +2136,8 @@ int build_segment_manager(struct f2fs_sb_info *sbi)
2110 sm_info->nr_discards = 0; 2136 sm_info->nr_discards = 0;
2111 sm_info->max_discards = 0; 2137 sm_info->max_discards = 0;
2112 2138
2139 sm_info->trim_sections = DEF_BATCHED_TRIM_SECTIONS;
2140
2113 INIT_LIST_HEAD(&sm_info->sit_entry_set); 2141 INIT_LIST_HEAD(&sm_info->sit_entry_set);
2114 2142
2115 if (test_opt(sbi, FLUSH_MERGE) && !f2fs_readonly(sbi->sb)) { 2143 if (test_opt(sbi, FLUSH_MERGE) && !f2fs_readonly(sbi->sb)) {
@@ -2212,6 +2240,8 @@ static void destroy_sit_info(struct f2fs_sb_info *sbi)
2212 kfree(sit_i->sentries[start].ckpt_valid_map); 2240 kfree(sit_i->sentries[start].ckpt_valid_map);
2213 } 2241 }
2214 } 2242 }
2243 kfree(sit_i->tmp_map);
2244
2215 vfree(sit_i->sentries); 2245 vfree(sit_i->sentries);
2216 vfree(sit_i->sec_entries); 2246 vfree(sit_i->sec_entries);
2217 kfree(sit_i->dirty_sentries_bitmap); 2247 kfree(sit_i->dirty_sentries_bitmap);
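
Note on the two hunks above that drop `unsigned long dmap[entries]` and
`unsigned long target_map[entries]`: kernel stacks are small and a
variable-length array makes the frame size depend on runtime data, so both are
replaced by a single tmp_map buffer allocated once in build_sit_info() and
freed in destroy_sit_info(). A sketch of the pattern, assuming (as in f2fs,
where these paths are serialized) that only one user touches the scratch
buffer at a time; the 64-byte size mirrors SIT_VBLOCK_MAP_SIZE:

#include <stdio.h>
#include <stdlib.h>

#define SIT_VBLOCK_MAP_SIZE 64
#define ENTRIES (SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long))

struct sit_info {
        unsigned long *tmp_map;         /* scratch bitmap, allocated once */
};

static int sit_info_init(struct sit_info *sit)
{
        sit->tmp_map = calloc(1, SIT_VBLOCK_MAP_SIZE);
        return sit->tmp_map ? 0 : -1;
}

static void build_discard_map(struct sit_info *sit,
                              const unsigned long *cur_map,
                              const unsigned long *ckpt_map)
{
        /* was: unsigned long dmap[ENTRIES]; -- a VLA in the original */
        unsigned long *dmap = sit->tmp_map;
        size_t i;

        for (i = 0; i < ENTRIES; i++)
                dmap[i] = ~(cur_map[i] | ckpt_map[i]);
}

int main(void)
{
        struct sit_info sit;
        unsigned long cur[ENTRIES] = {0}, ckpt[ENTRIES] = {0};

        if (sit_info_init(&sit))
                return 1;
        build_discard_map(&sit, cur, ckpt);
        printf("dmap[0] = %#lx\n", sit.tmp_map[0]);
        free(sit.tmp_map);
        return 0;
}
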
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 7f327c0ba4e3..7fd35111cf62 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -189,6 +189,7 @@ struct sit_info {
189 char *sit_bitmap; /* SIT bitmap pointer */ 189 char *sit_bitmap; /* SIT bitmap pointer */
190 unsigned int bitmap_size; /* SIT bitmap size */ 190 unsigned int bitmap_size; /* SIT bitmap size */
191 191
 192 unsigned long *tmp_map; /* bitmap for temporary use */
192 unsigned long *dirty_sentries_bitmap; /* bitmap for dirty sentries */ 193 unsigned long *dirty_sentries_bitmap; /* bitmap for dirty sentries */
193 unsigned int dirty_sentries; /* # of dirty sentries */ 194 unsigned int dirty_sentries; /* # of dirty sentries */
194 unsigned int sents_per_block; /* # of SIT entries per block */ 195 unsigned int sents_per_block; /* # of SIT entries per block */
@@ -207,7 +208,7 @@ struct free_segmap_info {
207 unsigned int start_segno; /* start segment number logically */ 208 unsigned int start_segno; /* start segment number logically */
208 unsigned int free_segments; /* # of free segments */ 209 unsigned int free_segments; /* # of free segments */
209 unsigned int free_sections; /* # of free sections */ 210 unsigned int free_sections; /* # of free sections */
210 rwlock_t segmap_lock; /* free segmap lock */ 211 spinlock_t segmap_lock; /* free segmap lock */
211 unsigned long *free_segmap; /* free segment bitmap */ 212 unsigned long *free_segmap; /* free segment bitmap */
212 unsigned long *free_secmap; /* free section bitmap */ 213 unsigned long *free_secmap; /* free section bitmap */
213}; 214};
@@ -318,9 +319,9 @@ static inline unsigned int find_next_inuse(struct free_segmap_info *free_i,
318 unsigned int max, unsigned int segno) 319 unsigned int max, unsigned int segno)
319{ 320{
320 unsigned int ret; 321 unsigned int ret;
321 read_lock(&free_i->segmap_lock); 322 spin_lock(&free_i->segmap_lock);
322 ret = find_next_bit(free_i->free_segmap, max, segno); 323 ret = find_next_bit(free_i->free_segmap, max, segno);
323 read_unlock(&free_i->segmap_lock); 324 spin_unlock(&free_i->segmap_lock);
324 return ret; 325 return ret;
325} 326}
326 327
@@ -331,7 +332,7 @@ static inline void __set_free(struct f2fs_sb_info *sbi, unsigned int segno)
331 unsigned int start_segno = secno * sbi->segs_per_sec; 332 unsigned int start_segno = secno * sbi->segs_per_sec;
332 unsigned int next; 333 unsigned int next;
333 334
334 write_lock(&free_i->segmap_lock); 335 spin_lock(&free_i->segmap_lock);
335 clear_bit(segno, free_i->free_segmap); 336 clear_bit(segno, free_i->free_segmap);
336 free_i->free_segments++; 337 free_i->free_segments++;
337 338
@@ -340,7 +341,7 @@ static inline void __set_free(struct f2fs_sb_info *sbi, unsigned int segno)
340 clear_bit(secno, free_i->free_secmap); 341 clear_bit(secno, free_i->free_secmap);
341 free_i->free_sections++; 342 free_i->free_sections++;
342 } 343 }
343 write_unlock(&free_i->segmap_lock); 344 spin_unlock(&free_i->segmap_lock);
344} 345}
345 346
346static inline void __set_inuse(struct f2fs_sb_info *sbi, 347static inline void __set_inuse(struct f2fs_sb_info *sbi,
@@ -362,7 +363,7 @@ static inline void __set_test_and_free(struct f2fs_sb_info *sbi,
362 unsigned int start_segno = secno * sbi->segs_per_sec; 363 unsigned int start_segno = secno * sbi->segs_per_sec;
363 unsigned int next; 364 unsigned int next;
364 365
365 write_lock(&free_i->segmap_lock); 366 spin_lock(&free_i->segmap_lock);
366 if (test_and_clear_bit(segno, free_i->free_segmap)) { 367 if (test_and_clear_bit(segno, free_i->free_segmap)) {
367 free_i->free_segments++; 368 free_i->free_segments++;
368 369
@@ -373,7 +374,7 @@ static inline void __set_test_and_free(struct f2fs_sb_info *sbi,
373 free_i->free_sections++; 374 free_i->free_sections++;
374 } 375 }
375 } 376 }
376 write_unlock(&free_i->segmap_lock); 377 spin_unlock(&free_i->segmap_lock);
377} 378}
378 379
379static inline void __set_test_and_inuse(struct f2fs_sb_info *sbi, 380static inline void __set_test_and_inuse(struct f2fs_sb_info *sbi,
@@ -381,13 +382,13 @@ static inline void __set_test_and_inuse(struct f2fs_sb_info *sbi,
381{ 382{
382 struct free_segmap_info *free_i = FREE_I(sbi); 383 struct free_segmap_info *free_i = FREE_I(sbi);
383 unsigned int secno = segno / sbi->segs_per_sec; 384 unsigned int secno = segno / sbi->segs_per_sec;
384 write_lock(&free_i->segmap_lock); 385 spin_lock(&free_i->segmap_lock);
385 if (!test_and_set_bit(segno, free_i->free_segmap)) { 386 if (!test_and_set_bit(segno, free_i->free_segmap)) {
386 free_i->free_segments--; 387 free_i->free_segments--;
387 if (!test_and_set_bit(secno, free_i->free_secmap)) 388 if (!test_and_set_bit(secno, free_i->free_secmap))
388 free_i->free_sections--; 389 free_i->free_sections--;
389 } 390 }
390 write_unlock(&free_i->segmap_lock); 391 spin_unlock(&free_i->segmap_lock);
391} 392}
392 393
393static inline void get_sit_bitmap(struct f2fs_sb_info *sbi, 394static inline void get_sit_bitmap(struct f2fs_sb_info *sbi,
@@ -460,7 +461,7 @@ static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed)
460 int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); 461 int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES);
461 int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); 462 int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS);
462 463
463 if (unlikely(sbi->por_doing)) 464 if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
464 return false; 465 return false;
465 466
466 return (free_sections(sbi) + freed) <= (node_secs + 2 * dent_secs + 467 return (free_sections(sbi) + freed) <= (node_secs + 2 * dent_secs +
@@ -599,13 +600,13 @@ static inline void check_block_count(struct f2fs_sb_info *sbi,
599static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno) 600static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno)
600{ 601{
601 if (segno > TOTAL_SEGS(sbi) - 1) 602 if (segno > TOTAL_SEGS(sbi) - 1)
602 sbi->need_fsck = true; 603 set_sbi_flag(sbi, SBI_NEED_FSCK);
603} 604}
604 605
605static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr) 606static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr)
606{ 607{
607 if (blk_addr < SEG0_BLKADDR(sbi) || blk_addr >= MAX_BLKADDR(sbi)) 608 if (blk_addr < SEG0_BLKADDR(sbi) || blk_addr >= MAX_BLKADDR(sbi))
608 sbi->need_fsck = true; 609 set_sbi_flag(sbi, SBI_NEED_FSCK);
609} 610}
610 611
611/* 612/*
@@ -616,11 +617,11 @@ static inline void check_block_count(struct f2fs_sb_info *sbi,
616{ 617{
617 /* check segment usage */ 618 /* check segment usage */
618 if (GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg) 619 if (GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg)
619 sbi->need_fsck = true; 620 set_sbi_flag(sbi, SBI_NEED_FSCK);
620 621
621 /* check boundary of a given segment number */ 622 /* check boundary of a given segment number */
622 if (segno > TOTAL_SEGS(sbi) - 1) 623 if (segno > TOTAL_SEGS(sbi) - 1)
623 sbi->need_fsck = true; 624 set_sbi_flag(sbi, SBI_NEED_FSCK);
624} 625}
625#endif 626#endif
626 627
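
Note on the segmap_lock conversion running through this header: every user
except find_next_inuse() took the lock for writing, and each critical section
is only a few bitmap operations, so the reader/writer machinery of an rwlock_t
bought nothing over a plain spinlock_t. A userspace analogue of the resulting
locking, with a pthread spinlock standing in for the kernel primitive:

/* build: cc -std=gnu11 -Wall -pthread segmap.c */
#include <pthread.h>
#include <stdio.h>

#define NSEGS 128

static pthread_spinlock_t segmap_lock;
static unsigned char free_segmap[NSEGS];        /* 1 = segment in use */
static unsigned int free_segments = NSEGS;

static void set_inuse(unsigned int segno)
{
        pthread_spin_lock(&segmap_lock);
        if (!free_segmap[segno]) {              /* test_and_set_bit() */
                free_segmap[segno] = 1;
                free_segments--;
        }
        pthread_spin_unlock(&segmap_lock);
}

int main(void)
{
        pthread_spin_init(&segmap_lock, PTHREAD_PROCESS_PRIVATE);
        set_inuse(7);
        printf("free segments: %u\n", free_segments);
        pthread_spin_destroy(&segmap_lock);
        return 0;
}
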
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index f71421d70475..f2fe666a6ea9 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -30,6 +30,7 @@
30#include "segment.h" 30#include "segment.h"
31#include "xattr.h" 31#include "xattr.h"
32#include "gc.h" 32#include "gc.h"
33#include "trace.h"
33 34
34#define CREATE_TRACE_POINTS 35#define CREATE_TRACE_POINTS
35#include <trace/events/f2fs.h> 36#include <trace/events/f2fs.h>
@@ -41,6 +42,7 @@ static struct kset *f2fs_kset;
41enum { 42enum {
42 Opt_gc_background, 43 Opt_gc_background,
43 Opt_disable_roll_forward, 44 Opt_disable_roll_forward,
45 Opt_norecovery,
44 Opt_discard, 46 Opt_discard,
45 Opt_noheap, 47 Opt_noheap,
46 Opt_user_xattr, 48 Opt_user_xattr,
@@ -61,6 +63,7 @@ enum {
61static match_table_t f2fs_tokens = { 63static match_table_t f2fs_tokens = {
62 {Opt_gc_background, "background_gc=%s"}, 64 {Opt_gc_background, "background_gc=%s"},
63 {Opt_disable_roll_forward, "disable_roll_forward"}, 65 {Opt_disable_roll_forward, "disable_roll_forward"},
66 {Opt_norecovery, "norecovery"},
64 {Opt_discard, "discard"}, 67 {Opt_discard, "discard"},
65 {Opt_noheap, "no_heap"}, 68 {Opt_noheap, "no_heap"},
66 {Opt_user_xattr, "user_xattr"}, 69 {Opt_user_xattr, "user_xattr"},
@@ -192,6 +195,7 @@ F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time);
192F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_idle, gc_idle); 195F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_idle, gc_idle);
193F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments); 196F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments);
194F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, max_small_discards, max_discards); 197F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, max_small_discards, max_discards);
198F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, batched_trim_sections, trim_sections);
195F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); 199F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy);
196F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); 200F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util);
197F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks); 201F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks);
@@ -207,6 +211,7 @@ static struct attribute *f2fs_attrs[] = {
207 ATTR_LIST(gc_idle), 211 ATTR_LIST(gc_idle),
208 ATTR_LIST(reclaim_segments), 212 ATTR_LIST(reclaim_segments),
209 ATTR_LIST(max_small_discards), 213 ATTR_LIST(max_small_discards),
214 ATTR_LIST(batched_trim_sections),
210 ATTR_LIST(ipu_policy), 215 ATTR_LIST(ipu_policy),
211 ATTR_LIST(min_ipu_util), 216 ATTR_LIST(min_ipu_util),
212 ATTR_LIST(min_fsync_blocks), 217 ATTR_LIST(min_fsync_blocks),
@@ -286,6 +291,12 @@ static int parse_options(struct super_block *sb, char *options)
286 case Opt_disable_roll_forward: 291 case Opt_disable_roll_forward:
287 set_opt(sbi, DISABLE_ROLL_FORWARD); 292 set_opt(sbi, DISABLE_ROLL_FORWARD);
288 break; 293 break;
294 case Opt_norecovery:
 295 /* this option is only allowed on a read-only mount */
296 set_opt(sbi, DISABLE_ROLL_FORWARD);
297 if (!f2fs_readonly(sb))
298 return -EINVAL;
299 break;
289 case Opt_discard: 300 case Opt_discard:
290 set_opt(sbi, DISCARD); 301 set_opt(sbi, DISCARD);
291 break; 302 break;
@@ -446,8 +457,13 @@ static void f2fs_put_super(struct super_block *sb)
446 f2fs_destroy_stats(sbi); 457 f2fs_destroy_stats(sbi);
447 stop_gc_thread(sbi); 458 stop_gc_thread(sbi);
448 459
449 /* We don't need to do checkpoint when it's clean */ 460 /*
450 if (sbi->s_dirty) { 461 * We don't need to do checkpoint when superblock is clean.
 462 * But if the previous checkpoint was not done by umount, we need to do
 463 * a clean checkpoint again.
464 */
465 if (is_sbi_flag_set(sbi, SBI_IS_DIRTY) ||
466 !is_set_ckpt_flags(F2FS_CKPT(sbi), CP_UMOUNT_FLAG)) {
451 struct cp_control cpc = { 467 struct cp_control cpc = {
452 .reason = CP_UMOUNT, 468 .reason = CP_UMOUNT,
453 }; 469 };
@@ -486,13 +502,15 @@ int f2fs_sync_fs(struct super_block *sb, int sync)
486 if (sync) { 502 if (sync) {
487 struct cp_control cpc; 503 struct cp_control cpc;
488 504
489 cpc.reason = test_opt(sbi, FASTBOOT) ? CP_UMOUNT : CP_SYNC; 505 cpc.reason = __get_cp_reason(sbi);
506
490 mutex_lock(&sbi->gc_mutex); 507 mutex_lock(&sbi->gc_mutex);
491 write_checkpoint(sbi, &cpc); 508 write_checkpoint(sbi, &cpc);
492 mutex_unlock(&sbi->gc_mutex); 509 mutex_unlock(&sbi->gc_mutex);
493 } else { 510 } else {
494 f2fs_balance_fs(sbi); 511 f2fs_balance_fs(sbi);
495 } 512 }
513 f2fs_trace_ios(NULL, NULL, 1);
496 514
497 return 0; 515 return 0;
498} 516}
@@ -887,7 +905,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
887 atomic_set(&sbi->nr_pages[i], 0); 905 atomic_set(&sbi->nr_pages[i], 0);
888 906
889 sbi->dir_level = DEF_DIR_LEVEL; 907 sbi->dir_level = DEF_DIR_LEVEL;
890 sbi->need_fsck = false; 908 clear_sbi_flag(sbi, SBI_NEED_FSCK);
891} 909}
892 910
893/* 911/*
@@ -942,6 +960,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
942 struct inode *root; 960 struct inode *root;
943 long err = -EINVAL; 961 long err = -EINVAL;
944 bool retry = true; 962 bool retry = true;
963 char *options = NULL;
945 int i; 964 int i;
946 965
947try_onemore: 966try_onemore:
@@ -973,9 +992,15 @@ try_onemore:
973 set_opt(sbi, POSIX_ACL); 992 set_opt(sbi, POSIX_ACL);
974#endif 993#endif
975 /* parse mount options */ 994 /* parse mount options */
976 err = parse_options(sb, (char *)data); 995 options = kstrdup((const char *)data, GFP_KERNEL);
977 if (err) 996 if (data && !options) {
997 err = -ENOMEM;
978 goto free_sb_buf; 998 goto free_sb_buf;
999 }
1000
1001 err = parse_options(sb, options);
1002 if (err)
1003 goto free_options;
979 1004
980 sb->s_maxbytes = max_file_size(le32_to_cpu(raw_super->log_blocksize)); 1005 sb->s_maxbytes = max_file_size(le32_to_cpu(raw_super->log_blocksize));
981 sb->s_max_links = F2FS_LINK_MAX; 1006 sb->s_max_links = F2FS_LINK_MAX;
@@ -998,7 +1023,7 @@ try_onemore:
998 mutex_init(&sbi->writepages); 1023 mutex_init(&sbi->writepages);
999 mutex_init(&sbi->cp_mutex); 1024 mutex_init(&sbi->cp_mutex);
1000 init_rwsem(&sbi->node_write); 1025 init_rwsem(&sbi->node_write);
1001 sbi->por_doing = false; 1026 clear_sbi_flag(sbi, SBI_POR_DOING);
1002 spin_lock_init(&sbi->stat_lock); 1027 spin_lock_init(&sbi->stat_lock);
1003 1028
1004 init_rwsem(&sbi->read_io.io_rwsem); 1029 init_rwsem(&sbi->read_io.io_rwsem);
@@ -1019,7 +1044,7 @@ try_onemore:
1019 if (IS_ERR(sbi->meta_inode)) { 1044 if (IS_ERR(sbi->meta_inode)) {
1020 f2fs_msg(sb, KERN_ERR, "Failed to read F2FS meta data inode"); 1045 f2fs_msg(sb, KERN_ERR, "Failed to read F2FS meta data inode");
1021 err = PTR_ERR(sbi->meta_inode); 1046 err = PTR_ERR(sbi->meta_inode);
1022 goto free_sb_buf; 1047 goto free_options;
1023 } 1048 }
1024 1049
1025 err = get_valid_checkpoint(sbi); 1050 err = get_valid_checkpoint(sbi);
@@ -1122,10 +1147,19 @@ try_onemore:
1122 goto free_proc; 1147 goto free_proc;
1123 1148
1124 if (!retry) 1149 if (!retry)
1125 sbi->need_fsck = true; 1150 set_sbi_flag(sbi, SBI_NEED_FSCK);
1126 1151
1127 /* recover fsynced data */ 1152 /* recover fsynced data */
1128 if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) { 1153 if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) {
1154 /*
 1155 * the mount should fail when the device is read-only and the
 1156 * previous checkpoint was not done by a clean system shutdown.
1157 */
1158 if (bdev_read_only(sb->s_bdev) &&
1159 !is_set_ckpt_flags(sbi->ckpt, CP_UMOUNT_FLAG)) {
1160 err = -EROFS;
1161 goto free_kobj;
1162 }
1129 err = recover_fsync_data(sbi); 1163 err = recover_fsync_data(sbi);
1130 if (err) { 1164 if (err) {
1131 f2fs_msg(sb, KERN_ERR, 1165 f2fs_msg(sb, KERN_ERR,
@@ -1144,6 +1178,7 @@ try_onemore:
1144 if (err) 1178 if (err)
1145 goto free_kobj; 1179 goto free_kobj;
1146 } 1180 }
1181 kfree(options);
1147 return 0; 1182 return 0;
1148 1183
1149free_kobj: 1184free_kobj:
@@ -1168,6 +1203,8 @@ free_cp:
1168free_meta_inode: 1203free_meta_inode:
1169 make_bad_inode(sbi->meta_inode); 1204 make_bad_inode(sbi->meta_inode);
1170 iput(sbi->meta_inode); 1205 iput(sbi->meta_inode);
1206free_options:
1207 kfree(options);
1171free_sb_buf: 1208free_sb_buf:
1172 brelse(raw_super_buf); 1209 brelse(raw_super_buf);
1173free_sbi: 1210free_sbi:
@@ -1188,11 +1225,18 @@ static struct dentry *f2fs_mount(struct file_system_type *fs_type, int flags,
1188 return mount_bdev(fs_type, flags, dev_name, data, f2fs_fill_super); 1225 return mount_bdev(fs_type, flags, dev_name, data, f2fs_fill_super);
1189} 1226}
1190 1227
1228static void kill_f2fs_super(struct super_block *sb)
1229{
1230 if (sb->s_root)
1231 set_sbi_flag(F2FS_SB(sb), SBI_IS_CLOSE);
1232 kill_block_super(sb);
1233}
1234
1191static struct file_system_type f2fs_fs_type = { 1235static struct file_system_type f2fs_fs_type = {
1192 .owner = THIS_MODULE, 1236 .owner = THIS_MODULE,
1193 .name = "f2fs", 1237 .name = "f2fs",
1194 .mount = f2fs_mount, 1238 .mount = f2fs_mount,
1195 .kill_sb = kill_block_super, 1239 .kill_sb = kill_f2fs_super,
1196 .fs_flags = FS_REQUIRES_DEV, 1240 .fs_flags = FS_REQUIRES_DEV,
1197}; 1241};
1198MODULE_ALIAS_FS("f2fs"); 1242MODULE_ALIAS_FS("f2fs");
@@ -1220,6 +1264,8 @@ static int __init init_f2fs_fs(void)
1220{ 1264{
1221 int err; 1265 int err;
1222 1266
1267 f2fs_build_trace_ios();
1268
1223 err = init_inodecache(); 1269 err = init_inodecache();
1224 if (err) 1270 if (err)
1225 goto fail; 1271 goto fail;
@@ -1229,12 +1275,9 @@ static int __init init_f2fs_fs(void)
1229 err = create_segment_manager_caches(); 1275 err = create_segment_manager_caches();
1230 if (err) 1276 if (err)
1231 goto free_node_manager_caches; 1277 goto free_node_manager_caches;
1232 err = create_gc_caches();
1233 if (err)
1234 goto free_segment_manager_caches;
1235 err = create_checkpoint_caches(); 1278 err = create_checkpoint_caches();
1236 if (err) 1279 if (err)
1237 goto free_gc_caches; 1280 goto free_segment_manager_caches;
1238 f2fs_kset = kset_create_and_add("f2fs", NULL, fs_kobj); 1281 f2fs_kset = kset_create_and_add("f2fs", NULL, fs_kobj);
1239 if (!f2fs_kset) { 1282 if (!f2fs_kset) {
1240 err = -ENOMEM; 1283 err = -ENOMEM;
@@ -1251,8 +1294,6 @@ free_kset:
1251 kset_unregister(f2fs_kset); 1294 kset_unregister(f2fs_kset);
1252free_checkpoint_caches: 1295free_checkpoint_caches:
1253 destroy_checkpoint_caches(); 1296 destroy_checkpoint_caches();
1254free_gc_caches:
1255 destroy_gc_caches();
1256free_segment_manager_caches: 1297free_segment_manager_caches:
1257 destroy_segment_manager_caches(); 1298 destroy_segment_manager_caches();
1258free_node_manager_caches: 1299free_node_manager_caches:
@@ -1269,11 +1310,11 @@ static void __exit exit_f2fs_fs(void)
1269 f2fs_destroy_root_stats(); 1310 f2fs_destroy_root_stats();
1270 unregister_filesystem(&f2fs_fs_type); 1311 unregister_filesystem(&f2fs_fs_type);
1271 destroy_checkpoint_caches(); 1312 destroy_checkpoint_caches();
1272 destroy_gc_caches();
1273 destroy_segment_manager_caches(); 1313 destroy_segment_manager_caches();
1274 destroy_node_manager_caches(); 1314 destroy_node_manager_caches();
1275 destroy_inodecache(); 1315 destroy_inodecache();
1276 kset_unregister(f2fs_kset); 1316 kset_unregister(f2fs_kset);
1317 f2fs_destroy_trace_ios();
1277} 1318}
1278 1319
1279module_init(init_f2fs_fs) 1320module_init(init_f2fs_fs)
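
Note on the kstrdup() dance above: parse_options() tokenizes its argument in
place (match_token()/strsep() mutate the buffer), and f2fs_fill_super() may
retry the whole fill via the try_onemore: label, so the VFS-supplied data
string has to survive intact; the private copy is freed on success and on
every error path through the new free_options: label. The same idea in a
userspace sketch:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int parse_options(char *options)
{
        char *p;

        /* stands in for the match_token() loop; destroys `options` */
        while ((p = strsep(&options, ",")) != NULL) {
                if (!*p)
                        continue;
                printf("option: %s\n", p);
        }
        return 0;
}

int main(void)
{
        const char *data = "background_gc=on,discard,inline_data";
        char *options = strdup(data);   /* kstrdup(data, GFP_KERNEL) */
        int err;

        if (!options)
                return 1;
        err = parse_options(options);
        free(options);
        /* `data` is still intact if the mount has to be retried */
        return err;
}
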
diff --git a/fs/f2fs/trace.c b/fs/f2fs/trace.c
new file mode 100644
index 000000000000..875aa8179bc1
--- /dev/null
+++ b/fs/f2fs/trace.c
@@ -0,0 +1,159 @@
1/*
2 * f2fs IO tracer
3 *
4 * Copyright (c) 2014 Motorola Mobility
5 * Copyright (c) 2014 Jaegeuk Kim <jaegeuk@kernel.org>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11#include <linux/fs.h>
12#include <linux/f2fs_fs.h>
13#include <linux/sched.h>
14#include <linux/radix-tree.h>
15
16#include "f2fs.h"
17#include "trace.h"
18
19static RADIX_TREE(pids, GFP_ATOMIC);
20static spinlock_t pids_lock;
21static struct last_io_info last_io;
22
23static inline void __print_last_io(void)
24{
25 if (!last_io.len)
26 return;
27
28 trace_printk("%3x:%3x %4x %-16s %2x %5x %12x %4x\n",
29 last_io.major, last_io.minor,
30 last_io.pid, "----------------",
31 last_io.type,
32 last_io.fio.rw, last_io.fio.blk_addr,
33 last_io.len);
34 memset(&last_io, 0, sizeof(last_io));
35}
36
37static int __file_type(struct inode *inode, pid_t pid)
38{
39 if (f2fs_is_atomic_file(inode))
40 return __ATOMIC_FILE;
41 else if (f2fs_is_volatile_file(inode))
42 return __VOLATILE_FILE;
43 else if (S_ISDIR(inode->i_mode))
44 return __DIR_FILE;
45 else if (inode->i_ino == F2FS_NODE_INO(F2FS_I_SB(inode)))
46 return __NODE_FILE;
47 else if (inode->i_ino == F2FS_META_INO(F2FS_I_SB(inode)))
48 return __META_FILE;
49 else if (pid)
50 return __NORMAL_FILE;
51 else
52 return __MISC_FILE;
53}
54
55void f2fs_trace_pid(struct page *page)
56{
57 struct inode *inode = page->mapping->host;
58 pid_t pid = task_pid_nr(current);
59 void *p;
60
61 page->private = pid;
62
63 if (radix_tree_preload(GFP_NOFS))
64 return;
65
66 spin_lock(&pids_lock);
67 p = radix_tree_lookup(&pids, pid);
68 if (p == current)
69 goto out;
70 if (p)
71 radix_tree_delete(&pids, pid);
72
73 f2fs_radix_tree_insert(&pids, pid, current);
74
75 trace_printk("%3x:%3x %4x %-16s\n",
76 MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev),
77 pid, current->comm);
78out:
79 spin_unlock(&pids_lock);
80 radix_tree_preload_end();
81}
82
83void f2fs_trace_ios(struct page *page, struct f2fs_io_info *fio, int flush)
84{
85 struct inode *inode;
86 pid_t pid;
87 int major, minor;
88
89 if (flush) {
90 __print_last_io();
91 return;
92 }
93
94 inode = page->mapping->host;
95 pid = page_private(page);
96
97 major = MAJOR(inode->i_sb->s_dev);
98 minor = MINOR(inode->i_sb->s_dev);
99
100 if (last_io.major == major && last_io.minor == minor &&
101 last_io.pid == pid &&
102 last_io.type == __file_type(inode, pid) &&
103 last_io.fio.rw == fio->rw &&
104 last_io.fio.blk_addr + last_io.len == fio->blk_addr) {
105 last_io.len++;
106 return;
107 }
108
109 __print_last_io();
110
111 last_io.major = major;
112 last_io.minor = minor;
113 last_io.pid = pid;
114 last_io.type = __file_type(inode, pid);
115 last_io.fio = *fio;
116 last_io.len = 1;
117 return;
118}
119
120void f2fs_build_trace_ios(void)
121{
122 spin_lock_init(&pids_lock);
123}
124
125#define PIDVEC_SIZE 128
126static unsigned int gang_lookup_pids(pid_t *results, unsigned long first_index,
127 unsigned int max_items)
128{
129 struct radix_tree_iter iter;
130 void **slot;
131 unsigned int ret = 0;
132
133 if (unlikely(!max_items))
134 return 0;
135
136 radix_tree_for_each_slot(slot, &pids, &iter, first_index) {
137 results[ret] = iter.index;
138 if (++ret == PIDVEC_SIZE)
139 break;
140 }
141 return ret;
142}
143
144void f2fs_destroy_trace_ios(void)
145{
146 pid_t pid[PIDVEC_SIZE];
147 pid_t next_pid = 0;
148 unsigned int found;
149
150 spin_lock(&pids_lock);
151 while ((found = gang_lookup_pids(pid, next_pid, PIDVEC_SIZE))) {
152 unsigned idx;
153
154 next_pid = pid[found - 1] + 1;
155 for (idx = 0; idx < found; idx++)
156 radix_tree_delete(&pids, pid[idx]);
157 }
158 spin_unlock(&pids_lock);
159}
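
Note on f2fs_trace_pid() above: the pids radix tree maps pid -> task_struct so
the "pid -> comm" header is printed only when a pid is first seen or has been
recycled by a different task, and f2fs_destroy_trace_ios() later empties the
tree with gang lookups. A much-simplified sketch of the dedup idea; a
direct-mapped table stands in for the radix tree, and a slot collision simply
causes a reprint:

#include <stdio.h>

#define PID_SLOTS 1024

struct task {
        int pid;
        char comm[16];
};

static const struct task *pids[PID_SLOTS];

static void trace_pid(const struct task *t)
{
        const struct task **slot = &pids[t->pid % PID_SLOTS];

        if (*slot == t)
                return;                 /* this task was already announced */
        *slot = t;                      /* remember it, then print once */
        printf("%4x %-16s\n", t->pid, t->comm);
}

int main(void)
{
        struct task a = { 42, "fsstress" };

        trace_pid(&a);
        trace_pid(&a);                  /* deduplicated: prints only once */
        return 0;
}
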
diff --git a/fs/f2fs/trace.h b/fs/f2fs/trace.h
new file mode 100644
index 000000000000..1041dbeb52ae
--- /dev/null
+++ b/fs/f2fs/trace.h
@@ -0,0 +1,46 @@
1/*
2 * f2fs IO tracer
3 *
4 * Copyright (c) 2014 Motorola Mobility
5 * Copyright (c) 2014 Jaegeuk Kim <jaegeuk@kernel.org>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation.
10 */
11#ifndef __F2FS_TRACE_H__
12#define __F2FS_TRACE_H__
13
14#ifdef CONFIG_F2FS_IO_TRACE
15#include <trace/events/f2fs.h>
16
17enum file_type {
18 __NORMAL_FILE,
19 __DIR_FILE,
20 __NODE_FILE,
21 __META_FILE,
22 __ATOMIC_FILE,
23 __VOLATILE_FILE,
24 __MISC_FILE,
25};
26
27struct last_io_info {
28 int major, minor;
29 pid_t pid;
30 enum file_type type;
31 struct f2fs_io_info fio;
32 block_t len;
33};
34
35extern void f2fs_trace_pid(struct page *);
36extern void f2fs_trace_ios(struct page *, struct f2fs_io_info *, int);
37extern void f2fs_build_trace_ios(void);
38extern void f2fs_destroy_trace_ios(void);
39#else
40#define f2fs_trace_pid(p)
41#define f2fs_trace_ios(p, i, n)
42#define f2fs_build_trace_ios()
43#define f2fs_destroy_trace_ios()
44
45#endif
46#endif /* __F2FS_TRACE_H__ */
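
Note on the #else branch above: with CONFIG_F2FS_IO_TRACE off, the four hooks
become empty macros and every call site compiles away without per-caller
#ifdefs. One caveat about the general pattern (not something this header
happens to trip over): writing the stub as do { } while (0) keeps both
configurations demanding the same trailing semicolon, so a call that compiles
with tracing off cannot break the build with tracing on. A tiny illustration:

#include <stdio.h>

/* #define TRACE_ENABLED */

#ifdef TRACE_ENABLED
#define trace_event(msg) printf("trace: %s\n", msg)
#else
#define trace_event(msg) do { } while (0)       /* still needs a ';' */
#endif

int main(void)
{
        if (1)
                trace_event("hello");   /* a single statement either way */
        else
                printf("never\n");
        return 0;
}
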
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 7b41a2dcdd76..497c7c5263c7 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -580,7 +580,7 @@ static void fat_set_state(struct super_block *sb,
580{ 580{
581 struct buffer_head *bh; 581 struct buffer_head *bh;
582 struct fat_boot_sector *b; 582 struct fat_boot_sector *b;
583 struct msdos_sb_info *sbi = sb->s_fs_info; 583 struct msdos_sb_info *sbi = MSDOS_SB(sb);
584 584
585 /* do not change any thing if mounted read only */ 585 /* do not change any thing if mounted read only */
586 if ((sb->s_flags & MS_RDONLY) && !force) 586 if ((sb->s_flags & MS_RDONLY) && !force)
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 99d440a4a6ba..ee85cd4e136a 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -740,14 +740,15 @@ static int __init fcntl_init(void)
740 * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY 740 * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY
741 * is defined as O_NONBLOCK on some platforms and not on others. 741 * is defined as O_NONBLOCK on some platforms and not on others.
742 */ 742 */
743 BUILD_BUG_ON(20 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32( 743 BUILD_BUG_ON(21 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32(
744 O_RDONLY | O_WRONLY | O_RDWR | 744 O_RDONLY | O_WRONLY | O_RDWR |
745 O_CREAT | O_EXCL | O_NOCTTY | 745 O_CREAT | O_EXCL | O_NOCTTY |
746 O_TRUNC | O_APPEND | /* O_NONBLOCK | */ 746 O_TRUNC | O_APPEND | /* O_NONBLOCK | */
747 __O_SYNC | O_DSYNC | FASYNC | 747 __O_SYNC | O_DSYNC | FASYNC |
748 O_DIRECT | O_LARGEFILE | O_DIRECTORY | 748 O_DIRECT | O_LARGEFILE | O_DIRECTORY |
749 O_NOFOLLOW | O_NOATIME | O_CLOEXEC | 749 O_NOFOLLOW | O_NOATIME | O_CLOEXEC |
750 __FMODE_EXEC | O_PATH | __O_TMPFILE 750 __FMODE_EXEC | O_PATH | __O_TMPFILE |
751 __FMODE_NONOTIFY
751 )); 752 ));
752 753
753 fasync_cache = kmem_cache_create("fasync_cache", 754 fasync_cache = kmem_cache_create("fasync_cache",
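
Note on the fcntl.c hunk: the expected count goes from 20 to 21 because
__FMODE_NONOTIFY joins the set. The BUILD_BUG_ON checks at compile time that
the popcount (HWEIGHT32) of all the O_* flags OR'ed together equals the number
of flags minus one (O_RDONLY is 0 and contributes no bit), i.e. that no two
flags share a bit. A userspace analogue with C11 static_assert; GCC and Clang
fold __builtin_popcount() of a constant at compile time:

#include <assert.h>

#define FLAG_A 0x1
#define FLAG_B 0x2
#define FLAG_C 0x4

/* 3 distinct flags (plus a zero-valued one) => popcount must be 3 */
static_assert(__builtin_popcount(FLAG_A | FLAG_B | FLAG_C) == 3,
              "O_*-style flags must not share bits");

int main(void)
{
        return 0;
}
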
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 2d609a5fbfea..073657f755d4 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -66,15 +66,21 @@ int writeback_in_progress(struct backing_dev_info *bdi)
66} 66}
67EXPORT_SYMBOL(writeback_in_progress); 67EXPORT_SYMBOL(writeback_in_progress);
68 68
69static inline struct backing_dev_info *inode_to_bdi(struct inode *inode) 69struct backing_dev_info *inode_to_bdi(struct inode *inode)
70{ 70{
71 struct super_block *sb = inode->i_sb; 71 struct super_block *sb;
72 72
73 if (sb_is_blkdev_sb(sb)) 73 if (!inode)
74 return inode->i_mapping->backing_dev_info; 74 return &noop_backing_dev_info;
75 75
76 sb = inode->i_sb;
77#ifdef CONFIG_BLOCK
78 if (sb_is_blkdev_sb(sb))
79 return blk_get_backing_dev_info(I_BDEV(inode));
80#endif
76 return sb->s_bdi; 81 return sb->s_bdi;
77} 82}
83EXPORT_SYMBOL_GPL(inode_to_bdi);
78 84
79static inline struct inode *wb_inode(struct list_head *head) 85static inline struct inode *wb_inode(struct list_head *head)
80{ 86{
@@ -247,14 +253,19 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t)
247 return ret; 253 return ret;
248} 254}
249 255
256#define EXPIRE_DIRTY_ATIME 0x0001
257
250/* 258/*
251 * Move expired (dirtied before work->older_than_this) dirty inodes from 259 * Move expired (dirtied before work->older_than_this) dirty inodes from
252 * @delaying_queue to @dispatch_queue. 260 * @delaying_queue to @dispatch_queue.
253 */ 261 */
254static int move_expired_inodes(struct list_head *delaying_queue, 262static int move_expired_inodes(struct list_head *delaying_queue,
255 struct list_head *dispatch_queue, 263 struct list_head *dispatch_queue,
264 int flags,
256 struct wb_writeback_work *work) 265 struct wb_writeback_work *work)
257{ 266{
267 unsigned long *older_than_this = NULL;
268 unsigned long expire_time;
258 LIST_HEAD(tmp); 269 LIST_HEAD(tmp);
259 struct list_head *pos, *node; 270 struct list_head *pos, *node;
260 struct super_block *sb = NULL; 271 struct super_block *sb = NULL;
@@ -262,13 +273,21 @@ static int move_expired_inodes(struct list_head *delaying_queue,
262 int do_sb_sort = 0; 273 int do_sb_sort = 0;
263 int moved = 0; 274 int moved = 0;
264 275
276 if ((flags & EXPIRE_DIRTY_ATIME) == 0)
277 older_than_this = work->older_than_this;
278 else if ((work->reason == WB_REASON_SYNC) == 0) {
279 expire_time = jiffies - (HZ * 86400);
280 older_than_this = &expire_time;
281 }
265 while (!list_empty(delaying_queue)) { 282 while (!list_empty(delaying_queue)) {
266 inode = wb_inode(delaying_queue->prev); 283 inode = wb_inode(delaying_queue->prev);
267 if (work->older_than_this && 284 if (older_than_this &&
268 inode_dirtied_after(inode, *work->older_than_this)) 285 inode_dirtied_after(inode, *older_than_this))
269 break; 286 break;
270 list_move(&inode->i_wb_list, &tmp); 287 list_move(&inode->i_wb_list, &tmp);
271 moved++; 288 moved++;
289 if (flags & EXPIRE_DIRTY_ATIME)
290 set_bit(__I_DIRTY_TIME_EXPIRED, &inode->i_state);
272 if (sb_is_blkdev_sb(inode->i_sb)) 291 if (sb_is_blkdev_sb(inode->i_sb))
273 continue; 292 continue;
274 if (sb && sb != inode->i_sb) 293 if (sb && sb != inode->i_sb)
@@ -309,9 +328,12 @@ out:
309static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work) 328static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work)
310{ 329{
311 int moved; 330 int moved;
331
312 assert_spin_locked(&wb->list_lock); 332 assert_spin_locked(&wb->list_lock);
313 list_splice_init(&wb->b_more_io, &wb->b_io); 333 list_splice_init(&wb->b_more_io, &wb->b_io);
314 moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, work); 334 moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, 0, work);
335 moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io,
336 EXPIRE_DIRTY_ATIME, work);
315 trace_writeback_queue_io(wb, work, moved); 337 trace_writeback_queue_io(wb, work, moved);
316} 338}
317 339
@@ -435,6 +457,8 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
435 * updates after data IO completion. 457 * updates after data IO completion.
436 */ 458 */
437 redirty_tail(inode, wb); 459 redirty_tail(inode, wb);
460 } else if (inode->i_state & I_DIRTY_TIME) {
461 list_move(&inode->i_wb_list, &wb->b_dirty_time);
438 } else { 462 } else {
439 /* The inode is clean. Remove from writeback lists. */ 463 /* The inode is clean. Remove from writeback lists. */
440 list_del_init(&inode->i_wb_list); 464 list_del_init(&inode->i_wb_list);
@@ -481,7 +505,13 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
481 spin_lock(&inode->i_lock); 505 spin_lock(&inode->i_lock);
482 506
483 dirty = inode->i_state & I_DIRTY; 507 dirty = inode->i_state & I_DIRTY;
484 inode->i_state &= ~I_DIRTY; 508 if (((dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) &&
509 (inode->i_state & I_DIRTY_TIME)) ||
510 (inode->i_state & I_DIRTY_TIME_EXPIRED)) {
511 dirty |= I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED;
512 trace_writeback_lazytime(inode);
513 }
514 inode->i_state &= ~dirty;
485 515
486 /* 516 /*
487 * Paired with smp_mb() in __mark_inode_dirty(). This allows 517 * Paired with smp_mb() in __mark_inode_dirty(). This allows
@@ -501,8 +531,10 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
501 531
502 spin_unlock(&inode->i_lock); 532 spin_unlock(&inode->i_lock);
503 533
534 if (dirty & I_DIRTY_TIME)
535 mark_inode_dirty_sync(inode);
504 /* Don't write the inode if only I_DIRTY_PAGES was set */ 536 /* Don't write the inode if only I_DIRTY_PAGES was set */
505 if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { 537 if (dirty & ~I_DIRTY_PAGES) {
506 int err = write_inode(inode, wbc); 538 int err = write_inode(inode, wbc);
507 if (ret == 0) 539 if (ret == 0)
508 ret = err; 540 ret = err;
@@ -550,7 +582,7 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
550 * make sure inode is on some writeback list and leave it there unless 582 * make sure inode is on some writeback list and leave it there unless
551 * we have completely cleaned the inode. 583 * we have completely cleaned the inode.
552 */ 584 */
553 if (!(inode->i_state & I_DIRTY) && 585 if (!(inode->i_state & I_DIRTY_ALL) &&
554 (wbc->sync_mode != WB_SYNC_ALL || 586 (wbc->sync_mode != WB_SYNC_ALL ||
555 !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK))) 587 !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK)))
556 goto out; 588 goto out;
@@ -565,7 +597,7 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
565 * If inode is clean, remove it from writeback lists. Otherwise don't 597 * If inode is clean, remove it from writeback lists. Otherwise don't
566 * touch it. See comment above for explanation. 598 * touch it. See comment above for explanation.
567 */ 599 */
568 if (!(inode->i_state & I_DIRTY)) 600 if (!(inode->i_state & I_DIRTY_ALL))
569 list_del_init(&inode->i_wb_list); 601 list_del_init(&inode->i_wb_list);
570 spin_unlock(&wb->list_lock); 602 spin_unlock(&wb->list_lock);
571 inode_sync_complete(inode); 603 inode_sync_complete(inode);
@@ -707,7 +739,7 @@ static long writeback_sb_inodes(struct super_block *sb,
707 wrote += write_chunk - wbc.nr_to_write; 739 wrote += write_chunk - wbc.nr_to_write;
708 spin_lock(&wb->list_lock); 740 spin_lock(&wb->list_lock);
709 spin_lock(&inode->i_lock); 741 spin_lock(&inode->i_lock);
710 if (!(inode->i_state & I_DIRTY)) 742 if (!(inode->i_state & I_DIRTY_ALL))
711 wrote++; 743 wrote++;
712 requeue_inode(inode, wb, &wbc); 744 requeue_inode(inode, wb, &wbc);
713 inode_sync_complete(inode); 745 inode_sync_complete(inode);
@@ -1145,16 +1177,20 @@ static noinline void block_dump___mark_inode_dirty(struct inode *inode)
1145 * page->mapping->host, so the page-dirtying time is recorded in the internal 1177 * page->mapping->host, so the page-dirtying time is recorded in the internal
1146 * blockdev inode. 1178 * blockdev inode.
1147 */ 1179 */
1180#define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC)
1148void __mark_inode_dirty(struct inode *inode, int flags) 1181void __mark_inode_dirty(struct inode *inode, int flags)
1149{ 1182{
1150 struct super_block *sb = inode->i_sb; 1183 struct super_block *sb = inode->i_sb;
1151 struct backing_dev_info *bdi = NULL; 1184 struct backing_dev_info *bdi = NULL;
1185 int dirtytime;
1186
1187 trace_writeback_mark_inode_dirty(inode, flags);
1152 1188
1153 /* 1189 /*
1154 * Don't do this for I_DIRTY_PAGES - that doesn't actually 1190 * Don't do this for I_DIRTY_PAGES - that doesn't actually
1155 * dirty the inode itself 1191 * dirty the inode itself
1156 */ 1192 */
1157 if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { 1193 if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_TIME)) {
1158 trace_writeback_dirty_inode_start(inode, flags); 1194 trace_writeback_dirty_inode_start(inode, flags);
1159 1195
1160 if (sb->s_op->dirty_inode) 1196 if (sb->s_op->dirty_inode)
@@ -1162,6 +1198,9 @@ void __mark_inode_dirty(struct inode *inode, int flags)
1162 1198
1163 trace_writeback_dirty_inode(inode, flags); 1199 trace_writeback_dirty_inode(inode, flags);
1164 } 1200 }
1201 if (flags & I_DIRTY_INODE)
1202 flags &= ~I_DIRTY_TIME;
1203 dirtytime = flags & I_DIRTY_TIME;
1165 1204
1166 /* 1205 /*
1167 * Paired with smp_mb() in __writeback_single_inode() for the 1206 * Paired with smp_mb() in __writeback_single_inode() for the
@@ -1169,16 +1208,21 @@ void __mark_inode_dirty(struct inode *inode, int flags)
1169 */ 1208 */
1170 smp_mb(); 1209 smp_mb();
1171 1210
1172 if ((inode->i_state & flags) == flags) 1211 if (((inode->i_state & flags) == flags) ||
1212 (dirtytime && (inode->i_state & I_DIRTY_INODE)))
1173 return; 1213 return;
1174 1214
1175 if (unlikely(block_dump)) 1215 if (unlikely(block_dump))
1176 block_dump___mark_inode_dirty(inode); 1216 block_dump___mark_inode_dirty(inode);
1177 1217
1178 spin_lock(&inode->i_lock); 1218 spin_lock(&inode->i_lock);
1219 if (dirtytime && (inode->i_state & I_DIRTY_INODE))
1220 goto out_unlock_inode;
1179 if ((inode->i_state & flags) != flags) { 1221 if ((inode->i_state & flags) != flags) {
1180 const int was_dirty = inode->i_state & I_DIRTY; 1222 const int was_dirty = inode->i_state & I_DIRTY;
1181 1223
1224 if (flags & I_DIRTY_INODE)
1225 inode->i_state &= ~I_DIRTY_TIME;
1182 inode->i_state |= flags; 1226 inode->i_state |= flags;
1183 1227
1184 /* 1228 /*
@@ -1225,8 +1269,10 @@ void __mark_inode_dirty(struct inode *inode, int flags)
1225 } 1269 }
1226 1270
1227 inode->dirtied_when = jiffies; 1271 inode->dirtied_when = jiffies;
1228 list_move(&inode->i_wb_list, &bdi->wb.b_dirty); 1272 list_move(&inode->i_wb_list, dirtytime ?
1273 &bdi->wb.b_dirty_time : &bdi->wb.b_dirty);
1229 spin_unlock(&bdi->wb.list_lock); 1274 spin_unlock(&bdi->wb.list_lock);
1275 trace_writeback_dirty_inode_enqueue(inode);
1230 1276
1231 if (wakeup_bdi) 1277 if (wakeup_bdi)
1232 bdi_wakeup_thread_delayed(bdi); 1278 bdi_wakeup_thread_delayed(bdi);
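
Note on the lazytime plumbing above: a pure timestamp update dirties the inode
with only I_DIRTY_TIME and parks it on the new b_dirty_time list; expiry (or a
sync) promotes it via I_DIRTY_TIME_EXPIRED, and any real metadata dirtying
simply clears I_DIRTY_TIME, since the timestamps ride along with a full inode
write anyway. A minimal model of the flag rules in __mark_inode_dirty(); the
flag values are illustrative:

#include <stdio.h>

#define I_DIRTY_SYNC     0x01
#define I_DIRTY_DATASYNC 0x02
#define I_DIRTY_TIME     0x04
#define I_DIRTY_INODE    (I_DIRTY_SYNC | I_DIRTY_DATASYNC)

static unsigned int mark_inode_dirty(unsigned int i_state, unsigned int flags)
{
        if (flags & I_DIRTY_INODE)
                flags &= ~I_DIRTY_TIME;         /* real dirt wins */
        if ((flags & I_DIRTY_TIME) && (i_state & I_DIRTY_INODE))
                return i_state;                 /* already fully dirty */
        if (flags & I_DIRTY_INODE)
                i_state &= ~I_DIRTY_TIME;       /* timestamps ride along */
        return i_state | flags;
}

int main(void)
{
        unsigned int st = 0;

        st = mark_inode_dirty(st, I_DIRTY_TIME);
        printf("after atime update: %#x\n", st); /* 0x4: dirty time only */
        st = mark_inode_dirty(st, I_DIRTY_SYNC);
        printf("after chmod:        %#x\n", st); /* 0x1: I_DIRTY_TIME gone */
        return 0;
}
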
diff --git a/fs/fs_pin.c b/fs/fs_pin.c
index 9368236ca100..b06c98796afb 100644
--- a/fs/fs_pin.c
+++ b/fs/fs_pin.c
@@ -1,78 +1,102 @@
1#include <linux/fs.h> 1#include <linux/fs.h>
2#include <linux/sched.h>
2#include <linux/slab.h> 3#include <linux/slab.h>
3#include <linux/fs_pin.h>
4#include "internal.h" 4#include "internal.h"
5#include "mount.h" 5#include "mount.h"
6 6
7static void pin_free_rcu(struct rcu_head *head)
8{
9 kfree(container_of(head, struct fs_pin, rcu));
10}
11
12static DEFINE_SPINLOCK(pin_lock); 7static DEFINE_SPINLOCK(pin_lock);
13 8
14void pin_put(struct fs_pin *p)
15{
16 if (atomic_long_dec_and_test(&p->count))
17 call_rcu(&p->rcu, pin_free_rcu);
18}
19
20void pin_remove(struct fs_pin *pin) 9void pin_remove(struct fs_pin *pin)
21{ 10{
22 spin_lock(&pin_lock); 11 spin_lock(&pin_lock);
23 hlist_del(&pin->m_list); 12 hlist_del(&pin->m_list);
24 hlist_del(&pin->s_list); 13 hlist_del(&pin->s_list);
25 spin_unlock(&pin_lock); 14 spin_unlock(&pin_lock);
15 spin_lock_irq(&pin->wait.lock);
16 pin->done = 1;
17 wake_up_locked(&pin->wait);
18 spin_unlock_irq(&pin->wait.lock);
26} 19}
27 20
28void pin_insert(struct fs_pin *pin, struct vfsmount *m) 21void pin_insert_group(struct fs_pin *pin, struct vfsmount *m, struct hlist_head *p)
29{ 22{
30 spin_lock(&pin_lock); 23 spin_lock(&pin_lock);
31 hlist_add_head(&pin->s_list, &m->mnt_sb->s_pins); 24 if (p)
25 hlist_add_head(&pin->s_list, p);
32 hlist_add_head(&pin->m_list, &real_mount(m)->mnt_pins); 26 hlist_add_head(&pin->m_list, &real_mount(m)->mnt_pins);
33 spin_unlock(&pin_lock); 27 spin_unlock(&pin_lock);
34} 28}
35 29
30void pin_insert(struct fs_pin *pin, struct vfsmount *m)
31{
32 pin_insert_group(pin, m, &m->mnt_sb->s_pins);
33}
34
35void pin_kill(struct fs_pin *p)
36{
37 wait_queue_t wait;
38
39 if (!p) {
40 rcu_read_unlock();
41 return;
42 }
43 init_wait(&wait);
44 spin_lock_irq(&p->wait.lock);
45 if (likely(!p->done)) {
46 p->done = -1;
47 spin_unlock_irq(&p->wait.lock);
48 rcu_read_unlock();
49 p->kill(p);
50 return;
51 }
52 if (p->done > 0) {
53 spin_unlock_irq(&p->wait.lock);
54 rcu_read_unlock();
55 return;
56 }
57 __add_wait_queue(&p->wait, &wait);
58 while (1) {
59 set_current_state(TASK_UNINTERRUPTIBLE);
60 spin_unlock_irq(&p->wait.lock);
61 rcu_read_unlock();
62 schedule();
63 rcu_read_lock();
64 if (likely(list_empty(&wait.task_list)))
65 break;
66 /* OK, we know p couldn't have been freed yet */
67 spin_lock_irq(&p->wait.lock);
68 if (p->done > 0) {
69 spin_unlock_irq(&p->wait.lock);
70 break;
71 }
72 }
73 rcu_read_unlock();
74}
75
36void mnt_pin_kill(struct mount *m) 76void mnt_pin_kill(struct mount *m)
37{ 77{
38 while (1) { 78 while (1) {
39 struct hlist_node *p; 79 struct hlist_node *p;
40 struct fs_pin *pin;
41 rcu_read_lock(); 80 rcu_read_lock();
42 p = ACCESS_ONCE(m->mnt_pins.first); 81 p = ACCESS_ONCE(m->mnt_pins.first);
43 if (!p) { 82 if (!p) {
44 rcu_read_unlock(); 83 rcu_read_unlock();
45 break; 84 break;
46 } 85 }
47 pin = hlist_entry(p, struct fs_pin, m_list); 86 pin_kill(hlist_entry(p, struct fs_pin, m_list));
48 if (!atomic_long_inc_not_zero(&pin->count)) {
49 rcu_read_unlock();
50 cpu_relax();
51 continue;
52 }
53 rcu_read_unlock();
54 pin->kill(pin);
55 } 87 }
56} 88}
57 89
58void sb_pin_kill(struct super_block *sb) 90void group_pin_kill(struct hlist_head *p)
59{ 91{
60 while (1) { 92 while (1) {
61 struct hlist_node *p; 93 struct hlist_node *q;
62 struct fs_pin *pin;
63 rcu_read_lock(); 94 rcu_read_lock();
64 p = ACCESS_ONCE(sb->s_pins.first); 95 q = ACCESS_ONCE(p->first);
65 if (!p) { 96 if (!q) {
66 rcu_read_unlock(); 97 rcu_read_unlock();
67 break; 98 break;
68 } 99 }
69 pin = hlist_entry(p, struct fs_pin, s_list); 100 pin_kill(hlist_entry(q, struct fs_pin, s_list));
70 if (!atomic_long_inc_not_zero(&pin->count)) {
71 rcu_read_unlock();
72 cpu_relax();
73 continue;
74 }
75 rcu_read_unlock();
76 pin->kill(pin);
77 } 101 }
78} 102}
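
The rewrite above replaces the per-pin refcount and RCU free with a wait-queue handshake: pin_kill() marks the pin busy (done = -1) and invokes ->kill(), whose teardown ends in pin_remove() setting done = 1 and waking any concurrent killers; those waiters now sleep on pin->wait instead of spinning on atomic_long_inc_not_zero(). A sketch of a pin user under the new scheme (field and helper names follow this patch; how the containing object is ultimately freed is the user's arrangement and is elided here). Attachment is via pin_insert(), or the new pin_insert_group() when the s_list should hang off a caller-supplied head:

	struct my_pin {
		struct fs_pin pin;	/* embeds ->wait, ->done, ->kill */
		/* ... resources tied to the vfsmount ... */
	};

	static void my_pin_kill(struct fs_pin *p)
	{
		struct my_pin *mp = container_of(p, struct my_pin, pin);

		/* release mp's resources here ... */
		pin_remove(p);	/* unhook the pin, set ->done, wake waiters */
		/* ... then dispose of mp by whatever scheme the user chose */
	}
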
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index ba1107977f2e..ed19a7d622fa 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -131,6 +131,13 @@ static void fuse_req_init_context(struct fuse_req *req)
131 req->in.h.pid = current->pid; 131 req->in.h.pid = current->pid;
132} 132}
133 133
134void fuse_set_initialized(struct fuse_conn *fc)
135{
136 /* Make sure stores before this are seen on another CPU */
137 smp_wmb();
138 fc->initialized = 1;
139}
140
134static bool fuse_block_alloc(struct fuse_conn *fc, bool for_background) 141static bool fuse_block_alloc(struct fuse_conn *fc, bool for_background)
135{ 142{
136 return !fc->initialized || (for_background && fc->blocked); 143 return !fc->initialized || (for_background && fc->blocked);
@@ -155,6 +162,8 @@ static struct fuse_req *__fuse_get_req(struct fuse_conn *fc, unsigned npages,
155 if (intr) 162 if (intr)
156 goto out; 163 goto out;
157 } 164 }
165 /* Matches smp_wmb() in fuse_set_initialized() */
166 smp_rmb();
158 167
159 err = -ENOTCONN; 168 err = -ENOTCONN;
160 if (!fc->connected) 169 if (!fc->connected)
@@ -253,6 +262,8 @@ struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc,
253 262
254 atomic_inc(&fc->num_waiting); 263 atomic_inc(&fc->num_waiting);
255 wait_event(fc->blocked_waitq, fc->initialized); 264 wait_event(fc->blocked_waitq, fc->initialized);
265 /* Matches smp_wmb() in fuse_set_initialized() */
266 smp_rmb();
256 req = fuse_request_alloc(0); 267 req = fuse_request_alloc(0);
257 if (!req) 268 if (!req)
258 req = get_reserved_req(fc, file); 269 req = get_reserved_req(fc, file);
@@ -511,6 +522,39 @@ void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
511} 522}
512EXPORT_SYMBOL_GPL(fuse_request_send); 523EXPORT_SYMBOL_GPL(fuse_request_send);
513 524
525static void fuse_adjust_compat(struct fuse_conn *fc, struct fuse_args *args)
526{
527 if (fc->minor < 4 && args->in.h.opcode == FUSE_STATFS)
528 args->out.args[0].size = FUSE_COMPAT_STATFS_SIZE;
529
530 if (fc->minor < 9) {
531 switch (args->in.h.opcode) {
532 case FUSE_LOOKUP:
533 case FUSE_CREATE:
534 case FUSE_MKNOD:
535 case FUSE_MKDIR:
536 case FUSE_SYMLINK:
537 case FUSE_LINK:
538 args->out.args[0].size = FUSE_COMPAT_ENTRY_OUT_SIZE;
539 break;
540 case FUSE_GETATTR:
541 case FUSE_SETATTR:
542 args->out.args[0].size = FUSE_COMPAT_ATTR_OUT_SIZE;
543 break;
544 }
545 }
546 if (fc->minor < 12) {
547 switch (args->in.h.opcode) {
548 case FUSE_CREATE:
549 args->in.args[0].size = sizeof(struct fuse_open_in);
550 break;
551 case FUSE_MKNOD:
552 args->in.args[0].size = FUSE_COMPAT_MKNOD_IN_SIZE;
553 break;
554 }
555 }
556}
557
514ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args) 558ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args)
515{ 559{
516 struct fuse_req *req; 560 struct fuse_req *req;
@@ -520,6 +564,9 @@ ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args)
520 if (IS_ERR(req)) 564 if (IS_ERR(req))
521 return PTR_ERR(req); 565 return PTR_ERR(req);
522 566
567 /* Needs to be done after fuse_get_req() so that fc->minor is valid */
568 fuse_adjust_compat(fc, args);
569
523 req->in.h.opcode = args->in.h.opcode; 570 req->in.h.opcode = args->in.h.opcode;
524 req->in.h.nodeid = args->in.h.nodeid; 571 req->in.h.nodeid = args->in.h.nodeid;
525 req->in.numargs = args->in.numargs; 572 req->in.numargs = args->in.numargs;
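
fuse_adjust_compat() centralizes the protocol-minor downgrades that every caller of fuse_simple_request() previously open-coded; the fs/fuse/dir.c and fs/fuse/inode.c hunks below delete those copies. The pattern in miniature (sizes and cut-offs mirror the hunk above): clamp a reply buffer to what a server speaking an old protocol minor actually sends, so the kernel never trusts uninitialized tail bytes:

	static size_t entry_out_size(unsigned minor)
	{
		return minor < 9 ? FUSE_COMPAT_ENTRY_OUT_SIZE
				 : sizeof(struct fuse_entry_out);
	}
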
@@ -2127,7 +2174,7 @@ void fuse_abort_conn(struct fuse_conn *fc)
2127 if (fc->connected) { 2174 if (fc->connected) {
2128 fc->connected = 0; 2175 fc->connected = 0;
2129 fc->blocked = 0; 2176 fc->blocked = 0;
2130 fc->initialized = 1; 2177 fuse_set_initialized(fc);
2131 end_io_requests(fc); 2178 end_io_requests(fc);
2132 end_queued_requests(fc); 2179 end_queued_requests(fc);
2133 end_polls(fc); 2180 end_polls(fc);
@@ -2146,7 +2193,7 @@ int fuse_dev_release(struct inode *inode, struct file *file)
2146 spin_lock(&fc->lock); 2193 spin_lock(&fc->lock);
2147 fc->connected = 0; 2194 fc->connected = 0;
2148 fc->blocked = 0; 2195 fc->blocked = 0;
2149 fc->initialized = 1; 2196 fuse_set_initialized(fc);
2150 end_queued_requests(fc); 2197 end_queued_requests(fc);
2151 end_polls(fc); 2198 end_polls(fc);
2152 wake_up_all(&fc->blocked_waitq); 2199 wake_up_all(&fc->blocked_waitq);
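
fuse_set_initialized() pairs its smp_wmb() with the smp_rmb() added after each fc->initialized check, so a request allocator that observes initialized == 1 is also guaranteed to observe every connection field stored before the flag. The publish/consume shape in isolation (illustration only, not fuse code):

	static int payload;
	static int ready;

	static void publish(int v)
	{
		payload = v;
		smp_wmb();	/* order the data store before the flag store */
		ready = 1;
	}

	static int consume(void)
	{
		if (!ready)
			return -EAGAIN;
		smp_rmb();	/* pairs with smp_wmb() in publish() */
		return payload;	/* sees v, never stale data */
	}
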
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 252b8a5de8b5..08e7b1a9d5d0 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -156,10 +156,7 @@ static void fuse_lookup_init(struct fuse_conn *fc, struct fuse_args *args,
156 args->in.args[0].size = name->len + 1; 156 args->in.args[0].size = name->len + 1;
157 args->in.args[0].value = name->name; 157 args->in.args[0].value = name->name;
158 args->out.numargs = 1; 158 args->out.numargs = 1;
159 if (fc->minor < 9) 159 args->out.args[0].size = sizeof(struct fuse_entry_out);
160 args->out.args[0].size = FUSE_COMPAT_ENTRY_OUT_SIZE;
161 else
162 args->out.args[0].size = sizeof(struct fuse_entry_out);
163 args->out.args[0].value = outarg; 160 args->out.args[0].value = outarg;
164} 161}
165 162
@@ -422,16 +419,12 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
422 args.in.h.opcode = FUSE_CREATE; 419 args.in.h.opcode = FUSE_CREATE;
423 args.in.h.nodeid = get_node_id(dir); 420 args.in.h.nodeid = get_node_id(dir);
424 args.in.numargs = 2; 421 args.in.numargs = 2;
425 args.in.args[0].size = fc->minor < 12 ? sizeof(struct fuse_open_in) : 422 args.in.args[0].size = sizeof(inarg);
426 sizeof(inarg);
427 args.in.args[0].value = &inarg; 423 args.in.args[0].value = &inarg;
428 args.in.args[1].size = entry->d_name.len + 1; 424 args.in.args[1].size = entry->d_name.len + 1;
429 args.in.args[1].value = entry->d_name.name; 425 args.in.args[1].value = entry->d_name.name;
430 args.out.numargs = 2; 426 args.out.numargs = 2;
431 if (fc->minor < 9) 427 args.out.args[0].size = sizeof(outentry);
432 args.out.args[0].size = FUSE_COMPAT_ENTRY_OUT_SIZE;
433 else
434 args.out.args[0].size = sizeof(outentry);
435 args.out.args[0].value = &outentry; 428 args.out.args[0].value = &outentry;
436 args.out.args[1].size = sizeof(outopen); 429 args.out.args[1].size = sizeof(outopen);
437 args.out.args[1].value = &outopen; 430 args.out.args[1].value = &outopen;
@@ -539,10 +532,7 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_args *args,
539 memset(&outarg, 0, sizeof(outarg)); 532 memset(&outarg, 0, sizeof(outarg));
540 args->in.h.nodeid = get_node_id(dir); 533 args->in.h.nodeid = get_node_id(dir);
541 args->out.numargs = 1; 534 args->out.numargs = 1;
542 if (fc->minor < 9) 535 args->out.args[0].size = sizeof(outarg);
543 args->out.args[0].size = FUSE_COMPAT_ENTRY_OUT_SIZE;
544 else
545 args->out.args[0].size = sizeof(outarg);
546 args->out.args[0].value = &outarg; 536 args->out.args[0].value = &outarg;
547 err = fuse_simple_request(fc, args); 537 err = fuse_simple_request(fc, args);
548 if (err) 538 if (err)
@@ -592,8 +582,7 @@ static int fuse_mknod(struct inode *dir, struct dentry *entry, umode_t mode,
592 inarg.umask = current_umask(); 582 inarg.umask = current_umask();
593 args.in.h.opcode = FUSE_MKNOD; 583 args.in.h.opcode = FUSE_MKNOD;
594 args.in.numargs = 2; 584 args.in.numargs = 2;
595 args.in.args[0].size = fc->minor < 12 ? FUSE_COMPAT_MKNOD_IN_SIZE : 585 args.in.args[0].size = sizeof(inarg);
596 sizeof(inarg);
597 args.in.args[0].value = &inarg; 586 args.in.args[0].value = &inarg;
598 args.in.args[1].size = entry->d_name.len + 1; 587 args.in.args[1].size = entry->d_name.len + 1;
599 args.in.args[1].value = entry->d_name.name; 588 args.in.args[1].value = entry->d_name.name;
@@ -899,10 +888,7 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat,
899 args.in.args[0].size = sizeof(inarg); 888 args.in.args[0].size = sizeof(inarg);
900 args.in.args[0].value = &inarg; 889 args.in.args[0].value = &inarg;
901 args.out.numargs = 1; 890 args.out.numargs = 1;
902 if (fc->minor < 9) 891 args.out.args[0].size = sizeof(outarg);
903 args.out.args[0].size = FUSE_COMPAT_ATTR_OUT_SIZE;
904 else
905 args.out.args[0].size = sizeof(outarg);
906 args.out.args[0].value = &outarg; 892 args.out.args[0].value = &outarg;
907 err = fuse_simple_request(fc, &args); 893 err = fuse_simple_request(fc, &args);
908 if (!err) { 894 if (!err) {
@@ -1574,10 +1560,7 @@ static void fuse_setattr_fill(struct fuse_conn *fc, struct fuse_args *args,
1574 args->in.args[0].size = sizeof(*inarg_p); 1560 args->in.args[0].size = sizeof(*inarg_p);
1575 args->in.args[0].value = inarg_p; 1561 args->in.args[0].value = inarg_p;
1576 args->out.numargs = 1; 1562 args->out.numargs = 1;
1577 if (fc->minor < 9) 1563 args->out.args[0].size = sizeof(*outarg_p);
1578 args->out.args[0].size = FUSE_COMPAT_ATTR_OUT_SIZE;
1579 else
1580 args->out.args[0].size = sizeof(*outarg_p);
1581 args->out.args[0].value = outarg_p; 1564 args->out.args[0].value = outarg_p;
1582} 1565}
1583 1566
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 760b2c552197..c01ec3bdcfd8 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1159,7 +1159,7 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1159 mutex_lock(&inode->i_mutex); 1159 mutex_lock(&inode->i_mutex);
1160 1160
1161 /* We can write back this queue in page reclaim */ 1161 /* We can write back this queue in page reclaim */
1162 current->backing_dev_info = mapping->backing_dev_info; 1162 current->backing_dev_info = inode_to_bdi(inode);
1163 1163
1164 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); 1164 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
1165 if (err) 1165 if (err)
@@ -1464,7 +1464,7 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
1464{ 1464{
1465 struct inode *inode = req->inode; 1465 struct inode *inode = req->inode;
1466 struct fuse_inode *fi = get_fuse_inode(inode); 1466 struct fuse_inode *fi = get_fuse_inode(inode);
1467 struct backing_dev_info *bdi = inode->i_mapping->backing_dev_info; 1467 struct backing_dev_info *bdi = inode_to_bdi(inode);
1468 int i; 1468 int i;
1469 1469
1470 list_del(&req->writepages_entry); 1470 list_del(&req->writepages_entry);
@@ -1658,7 +1658,7 @@ static int fuse_writepage_locked(struct page *page)
1658 req->end = fuse_writepage_end; 1658 req->end = fuse_writepage_end;
1659 req->inode = inode; 1659 req->inode = inode;
1660 1660
1661 inc_bdi_stat(mapping->backing_dev_info, BDI_WRITEBACK); 1661 inc_bdi_stat(inode_to_bdi(inode), BDI_WRITEBACK);
1662 inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP); 1662 inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP);
1663 1663
1664 spin_lock(&fc->lock); 1664 spin_lock(&fc->lock);
@@ -1768,7 +1768,7 @@ static bool fuse_writepage_in_flight(struct fuse_req *new_req,
1768 1768
1769 if (old_req->num_pages == 1 && (old_req->state == FUSE_REQ_INIT || 1769 if (old_req->num_pages == 1 && (old_req->state == FUSE_REQ_INIT ||
1770 old_req->state == FUSE_REQ_PENDING)) { 1770 old_req->state == FUSE_REQ_PENDING)) {
1771 struct backing_dev_info *bdi = page->mapping->backing_dev_info; 1771 struct backing_dev_info *bdi = inode_to_bdi(page->mapping->host);
1772 1772
1773 copy_highpage(old_req->pages[0], page); 1773 copy_highpage(old_req->pages[0], page);
1774 spin_unlock(&fc->lock); 1774 spin_unlock(&fc->lock);
@@ -1872,7 +1872,7 @@ static int fuse_writepages_fill(struct page *page,
1872 req->page_descs[req->num_pages].offset = 0; 1872 req->page_descs[req->num_pages].offset = 0;
1873 req->page_descs[req->num_pages].length = PAGE_SIZE; 1873 req->page_descs[req->num_pages].length = PAGE_SIZE;
1874 1874
1875 inc_bdi_stat(page->mapping->backing_dev_info, BDI_WRITEBACK); 1875 inc_bdi_stat(inode_to_bdi(inode), BDI_WRITEBACK);
1876 inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP); 1876 inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP);
1877 1877
1878 err = 0; 1878 err = 0;
@@ -2062,7 +2062,6 @@ static const struct vm_operations_struct fuse_file_vm_ops = {
2062 .fault = filemap_fault, 2062 .fault = filemap_fault,
2063 .map_pages = filemap_map_pages, 2063 .map_pages = filemap_map_pages,
2064 .page_mkwrite = fuse_page_mkwrite, 2064 .page_mkwrite = fuse_page_mkwrite,
2065 .remap_pages = generic_file_remap_pages,
2066}; 2065};
2067 2066
2068static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) 2067static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
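
These fs/fuse/file.c hunks belong to the series that removes mapping->backing_dev_info: the bdi is now always derived from the inode. Roughly what the inode_to_bdi() helper introduced by that series resolves to (a sketch, modulo CONFIG_BLOCK ifdefs):

	static inline struct backing_dev_info *inode_to_bdi(struct inode *inode)
	{
		struct super_block *sb;

		if (!inode)
			return &noop_backing_dev_info;
		sb = inode->i_sb;
		if (sb_is_blkdev_sb(sb))	/* blockdev inodes keep the queue's bdi */
			return blk_get_backing_dev_info(I_BDEV(inode));
		return sb->s_bdi;		/* everyone else: the superblock's bdi */
	}
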
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index e0fc6725d1d0..1cdfb07c1376 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -906,4 +906,6 @@ int fuse_write_inode(struct inode *inode, struct writeback_control *wbc);
906int fuse_do_setattr(struct inode *inode, struct iattr *attr, 906int fuse_do_setattr(struct inode *inode, struct iattr *attr,
907 struct file *file); 907 struct file *file);
908 908
909void fuse_set_initialized(struct fuse_conn *fc);
910
909#endif /* _FS_FUSE_I_H */ 911#endif /* _FS_FUSE_I_H */
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 6749109f255d..e8799c11424b 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -308,7 +308,6 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
308 if (!fc->writeback_cache || !S_ISREG(attr->mode)) 308 if (!fc->writeback_cache || !S_ISREG(attr->mode))
309 inode->i_flags |= S_NOCMTIME; 309 inode->i_flags |= S_NOCMTIME;
310 inode->i_generation = generation; 310 inode->i_generation = generation;
311 inode->i_data.backing_dev_info = &fc->bdi;
312 fuse_init_inode(inode, attr); 311 fuse_init_inode(inode, attr);
313 unlock_new_inode(inode); 312 unlock_new_inode(inode);
314 } else if ((inode->i_mode ^ attr->mode) & S_IFMT) { 313 } else if ((inode->i_mode ^ attr->mode) & S_IFMT) {
@@ -424,8 +423,7 @@ static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf)
424 args.in.h.opcode = FUSE_STATFS; 423 args.in.h.opcode = FUSE_STATFS;
425 args.in.h.nodeid = get_node_id(dentry->d_inode); 424 args.in.h.nodeid = get_node_id(dentry->d_inode);
426 args.out.numargs = 1; 425 args.out.numargs = 1;
427 args.out.args[0].size = 426 args.out.args[0].size = sizeof(outarg);
428 fc->minor < 4 ? FUSE_COMPAT_STATFS_SIZE : sizeof(outarg);
429 args.out.args[0].value = &outarg; 427 args.out.args[0].value = &outarg;
430 err = fuse_simple_request(fc, &args); 428 err = fuse_simple_request(fc, &args);
431 if (!err) 429 if (!err)
@@ -898,7 +896,7 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
898 fc->max_write = max_t(unsigned, 4096, fc->max_write); 896 fc->max_write = max_t(unsigned, 4096, fc->max_write);
899 fc->conn_init = 1; 897 fc->conn_init = 1;
900 } 898 }
901 fc->initialized = 1; 899 fuse_set_initialized(fc);
902 wake_up_all(&fc->blocked_waitq); 900 wake_up_all(&fc->blocked_waitq);
903} 901}
904 902
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 3088e2a38e30..7b3143064af1 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -73,7 +73,7 @@ int gfs2_set_acl(struct inode *inode, struct posix_acl *acl, int type)
73 73
74 BUG_ON(name == NULL); 74 BUG_ON(name == NULL);
75 75
76 if (acl->a_count > GFS2_ACL_MAX_ENTRIES(GFS2_SB(inode))) 76 if (acl && acl->a_count > GFS2_ACL_MAX_ENTRIES(GFS2_SB(inode)))
77 return -E2BIG; 77 return -E2BIG;
78 78
79 if (type == ACL_TYPE_ACCESS) { 79 if (type == ACL_TYPE_ACCESS) {
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 805b37fed638..4ad4f94edebe 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -289,7 +289,7 @@ continue_unlock:
289 if (!clear_page_dirty_for_io(page)) 289 if (!clear_page_dirty_for_io(page))
290 goto continue_unlock; 290 goto continue_unlock;
291 291
292 trace_wbc_writepage(wbc, mapping->backing_dev_info); 292 trace_wbc_writepage(wbc, inode_to_bdi(inode));
293 293
294 ret = __gfs2_jdata_writepage(page, wbc); 294 ret = __gfs2_jdata_writepage(page, wbc);
295 if (unlikely(ret)) { 295 if (unlikely(ret)) {
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index c5a34f09e228..6371192961e2 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -1896,7 +1896,8 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len,
1896 1896
1897 ht = kzalloc(size, GFP_NOFS | __GFP_NOWARN); 1897 ht = kzalloc(size, GFP_NOFS | __GFP_NOWARN);
1898 if (ht == NULL) 1898 if (ht == NULL)
1899 ht = vzalloc(size); 1899 ht = __vmalloc(size, GFP_NOFS | __GFP_NOWARN | __GFP_ZERO,
1900 PAGE_KERNEL);
1900 if (!ht) 1901 if (!ht)
1901 return -ENOMEM; 1902 return -ENOMEM;
1902 1903
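
vzalloc() hard-codes GFP_KERNEL, which can recurse into filesystem reclaim at an unsafe point; the fix spells out __vmalloc() so the GFP_NOFS of the kmalloc attempt carries over to the fallback. The pattern as a helper (sketch; the result would be freed with kvfree()):

	static void *nofs_zalloc_big(size_t size)
	{
		void *p = kzalloc(size, GFP_NOFS | __GFP_NOWARN);

		if (!p)	/* too big or too fragmented: vmalloc space, still NOFS */
			p = __vmalloc(size, GFP_NOFS | __GFP_NOWARN | __GFP_ZERO,
				      PAGE_KERNEL);
		return p;
	}
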
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 6e600abf694a..3e32bb8e2d7e 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -498,7 +498,6 @@ static const struct vm_operations_struct gfs2_vm_ops = {
498 .fault = filemap_fault, 498 .fault = filemap_fault,
499 .map_pages = filemap_map_pages, 499 .map_pages = filemap_map_pages,
500 .page_mkwrite = gfs2_page_mkwrite, 500 .page_mkwrite = gfs2_page_mkwrite,
501 .remap_pages = generic_file_remap_pages,
502}; 501};
503 502
504/** 503/**
@@ -655,7 +654,7 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end,
655{ 654{
656 struct address_space *mapping = file->f_mapping; 655 struct address_space *mapping = file->f_mapping;
657 struct inode *inode = mapping->host; 656 struct inode *inode = mapping->host;
658 int sync_state = inode->i_state & I_DIRTY; 657 int sync_state = inode->i_state & I_DIRTY_ALL;
659 struct gfs2_inode *ip = GFS2_I(inode); 658 struct gfs2_inode *ip = GFS2_I(inode);
660 int ret = 0, ret1 = 0; 659 int ret = 0, ret1 = 0;
661 660
@@ -668,7 +667,7 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end,
668 if (!gfs2_is_jdata(ip)) 667 if (!gfs2_is_jdata(ip))
669 sync_state &= ~I_DIRTY_PAGES; 668 sync_state &= ~I_DIRTY_PAGES;
670 if (datasync) 669 if (datasync)
671 sync_state &= ~I_DIRTY_SYNC; 670 sync_state &= ~(I_DIRTY_SYNC | I_DIRTY_TIME);
672 671
673 if (sync_state) { 672 if (sync_state) {
674 ret = sync_inode_metadata(inode, 1); 673 ret = sync_inode_metadata(inode, 1);
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index a23524aa3eac..f42dffba056a 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -173,19 +173,14 @@ void gfs2_glock_add_to_lru(struct gfs2_glock *gl)
173 spin_unlock(&lru_lock); 173 spin_unlock(&lru_lock);
174} 174}
175 175
176static void __gfs2_glock_remove_from_lru(struct gfs2_glock *gl) 176static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl)
177{ 177{
178 spin_lock(&lru_lock);
178 if (!list_empty(&gl->gl_lru)) { 179 if (!list_empty(&gl->gl_lru)) {
179 list_del_init(&gl->gl_lru); 180 list_del_init(&gl->gl_lru);
180 atomic_dec(&lru_count); 181 atomic_dec(&lru_count);
181 clear_bit(GLF_LRU, &gl->gl_flags); 182 clear_bit(GLF_LRU, &gl->gl_flags);
182 } 183 }
183}
184
185static void gfs2_glock_remove_from_lru(struct gfs2_glock *gl)
186{
187 spin_lock(&lru_lock);
188 __gfs2_glock_remove_from_lru(gl);
189 spin_unlock(&lru_lock); 184 spin_unlock(&lru_lock);
190} 185}
191 186
@@ -205,9 +200,7 @@ void gfs2_glock_put(struct gfs2_glock *gl)
205 200
206 lockref_mark_dead(&gl->gl_lockref); 201 lockref_mark_dead(&gl->gl_lockref);
207 202
208 spin_lock(&lru_lock); 203 gfs2_glock_remove_from_lru(gl);
209 __gfs2_glock_remove_from_lru(gl);
210 spin_unlock(&lru_lock);
211 spin_unlock(&gl->gl_lockref.lock); 204 spin_unlock(&gl->gl_lockref.lock);
212 spin_lock_bucket(gl->gl_hash); 205 spin_lock_bucket(gl->gl_hash);
213 hlist_bl_del_rcu(&gl->gl_list); 206 hlist_bl_del_rcu(&gl->gl_list);
@@ -775,7 +768,6 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
775 mapping->flags = 0; 768 mapping->flags = 0;
776 mapping_set_gfp_mask(mapping, GFP_NOFS); 769 mapping_set_gfp_mask(mapping, GFP_NOFS);
777 mapping->private_data = NULL; 770 mapping->private_data = NULL;
778 mapping->backing_dev_info = s->s_bdi;
779 mapping->writeback_index = 0; 771 mapping->writeback_index = 0;
780 } 772 }
781 773
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 9054002ebe70..73c72253faac 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -543,10 +543,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name,
543 } 543 }
544 544
545 error = gfs2_dir_add(&dip->i_inode, name, ip, da); 545 error = gfs2_dir_add(&dip->i_inode, name, ip, da);
546 if (error)
547 goto fail_end_trans;
548 546
549fail_end_trans:
550 gfs2_trans_end(sdp); 547 gfs2_trans_end(sdp);
551fail_ipreserv: 548fail_ipreserv:
552 gfs2_inplace_release(dip); 549 gfs2_inplace_release(dip);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 8633ad328ee2..efc8e254787c 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -112,7 +112,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
112 mapping->flags = 0; 112 mapping->flags = 0;
113 mapping_set_gfp_mask(mapping, GFP_NOFS); 113 mapping_set_gfp_mask(mapping, GFP_NOFS);
114 mapping->private_data = NULL; 114 mapping->private_data = NULL;
115 mapping->backing_dev_info = sb->s_bdi;
116 mapping->writeback_index = 0; 115 mapping->writeback_index = 0;
117 116
118 spin_lock_init(&sdp->sd_log_lock); 117 spin_lock_init(&sdp->sd_log_lock);
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index c8b148bbdc8b..3aa17d4d1cfc 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -145,7 +145,8 @@ static void gfs2_qd_dispose(struct list_head *list)
145} 145}
146 146
147 147
148static enum lru_status gfs2_qd_isolate(struct list_head *item, spinlock_t *lock, void *arg) 148static enum lru_status gfs2_qd_isolate(struct list_head *item,
149 struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
149{ 150{
150 struct list_head *dispose = arg; 151 struct list_head *dispose = arg;
151 struct gfs2_quota_data *qd = list_entry(item, struct gfs2_quota_data, qd_lru); 152 struct gfs2_quota_data *qd = list_entry(item, struct gfs2_quota_data, qd_lru);
@@ -155,7 +156,7 @@ static enum lru_status gfs2_qd_isolate(struct list_head *item, spinlock_t *lock,
155 156
156 if (qd->qd_lockref.count == 0) { 157 if (qd->qd_lockref.count == 0) {
157 lockref_mark_dead(&qd->qd_lockref); 158 lockref_mark_dead(&qd->qd_lockref);
158 list_move(&qd->qd_lru, dispose); 159 list_lru_isolate_move(lru, &qd->qd_lru, dispose);
159 } 160 }
160 161
161 spin_unlock(&qd->qd_lockref.lock); 162 spin_unlock(&qd->qd_lockref.lock);
@@ -171,8 +172,8 @@ static unsigned long gfs2_qd_shrink_scan(struct shrinker *shrink,
171 if (!(sc->gfp_mask & __GFP_FS)) 172 if (!(sc->gfp_mask & __GFP_FS))
172 return SHRINK_STOP; 173 return SHRINK_STOP;
173 174
174 freed = list_lru_walk_node(&gfs2_qd_lru, sc->nid, gfs2_qd_isolate, 175 freed = list_lru_shrink_walk(&gfs2_qd_lru, sc,
175 &dispose, &sc->nr_to_scan); 176 gfs2_qd_isolate, &dispose);
176 177
177 gfs2_qd_dispose(&dispose); 178 gfs2_qd_dispose(&dispose);
178 179
@@ -182,7 +183,7 @@ static unsigned long gfs2_qd_shrink_scan(struct shrinker *shrink,
182static unsigned long gfs2_qd_shrink_count(struct shrinker *shrink, 183static unsigned long gfs2_qd_shrink_count(struct shrinker *shrink,
183 struct shrink_control *sc) 184 struct shrink_control *sc)
184{ 185{
185 return vfs_pressure_ratio(list_lru_count_node(&gfs2_qd_lru, sc->nid)); 186 return vfs_pressure_ratio(list_lru_shrink_count(&gfs2_qd_lru, sc));
186} 187}
187 188
188struct shrinker gfs2_qd_shrinker = { 189struct shrinker gfs2_qd_shrinker = {
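
This follows the list_lru API rework: walk callbacks now receive the per-node (and per-memcg) struct list_lru_one and must detach items through list_lru_isolate()/list_lru_isolate_move() so the internal counters stay balanced, while shrinkers hand their shrink_control straight to list_lru_shrink_count()/list_lru_shrink_walk(). A minimal isolate callback under the new signature (sketch):

	static enum lru_status my_isolate(struct list_head *item,
			struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
	{
		struct list_head *dispose = arg;

		/* trylock/refcount checks on the owning object go here */
		list_lru_isolate_move(lru, item, dispose);	/* not list_move() */
		return LRU_REMOVED;
	}
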
@@ -667,7 +668,7 @@ static void do_qc(struct gfs2_quota_data *qd, s64 change)
667 668
668static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, 669static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
669 s64 change, struct gfs2_quota_data *qd, 670 s64 change, struct gfs2_quota_data *qd,
670 struct fs_disk_quota *fdq) 671 struct qc_dqblk *fdq)
671{ 672{
672 struct inode *inode = &ip->i_inode; 673 struct inode *inode = &ip->i_inode;
673 struct gfs2_sbd *sdp = GFS2_SB(inode); 674 struct gfs2_sbd *sdp = GFS2_SB(inode);
@@ -697,16 +698,16 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc,
697 be64_add_cpu(&q.qu_value, change); 698 be64_add_cpu(&q.qu_value, change);
698 qd->qd_qb.qb_value = q.qu_value; 699 qd->qd_qb.qb_value = q.qu_value;
699 if (fdq) { 700 if (fdq) {
700 if (fdq->d_fieldmask & FS_DQ_BSOFT) { 701 if (fdq->d_fieldmask & QC_SPC_SOFT) {
701 q.qu_warn = cpu_to_be64(fdq->d_blk_softlimit >> sdp->sd_fsb2bb_shift); 702 q.qu_warn = cpu_to_be64(fdq->d_spc_softlimit >> sdp->sd_sb.sb_bsize_shift);
702 qd->qd_qb.qb_warn = q.qu_warn; 703 qd->qd_qb.qb_warn = q.qu_warn;
703 } 704 }
704 if (fdq->d_fieldmask & FS_DQ_BHARD) { 705 if (fdq->d_fieldmask & QC_SPC_HARD) {
705 q.qu_limit = cpu_to_be64(fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift); 706 q.qu_limit = cpu_to_be64(fdq->d_spc_hardlimit >> sdp->sd_sb.sb_bsize_shift);
706 qd->qd_qb.qb_limit = q.qu_limit; 707 qd->qd_qb.qb_limit = q.qu_limit;
707 } 708 }
708 if (fdq->d_fieldmask & FS_DQ_BCOUNT) { 709 if (fdq->d_fieldmask & QC_SPACE) {
709 q.qu_value = cpu_to_be64(fdq->d_bcount >> sdp->sd_fsb2bb_shift); 710 q.qu_value = cpu_to_be64(fdq->d_space >> sdp->sd_sb.sb_bsize_shift);
710 qd->qd_qb.qb_value = q.qu_value; 711 qd->qd_qb.qb_value = q.qu_value;
711 } 712 }
712 } 713 }
@@ -1497,7 +1498,7 @@ static int gfs2_quota_get_xstate(struct super_block *sb,
1497} 1498}
1498 1499
1499static int gfs2_get_dqblk(struct super_block *sb, struct kqid qid, 1500static int gfs2_get_dqblk(struct super_block *sb, struct kqid qid,
1500 struct fs_disk_quota *fdq) 1501 struct qc_dqblk *fdq)
1501{ 1502{
1502 struct gfs2_sbd *sdp = sb->s_fs_info; 1503 struct gfs2_sbd *sdp = sb->s_fs_info;
1503 struct gfs2_quota_lvb *qlvb; 1504 struct gfs2_quota_lvb *qlvb;
@@ -1505,7 +1506,7 @@ static int gfs2_get_dqblk(struct super_block *sb, struct kqid qid,
1505 struct gfs2_holder q_gh; 1506 struct gfs2_holder q_gh;
1506 int error; 1507 int error;
1507 1508
1508 memset(fdq, 0, sizeof(struct fs_disk_quota)); 1509 memset(fdq, 0, sizeof(*fdq));
1509 1510
1510 if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF) 1511 if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
1511 return -ESRCH; /* Crazy XFS error code */ 1512 return -ESRCH; /* Crazy XFS error code */
@@ -1522,12 +1523,9 @@ static int gfs2_get_dqblk(struct super_block *sb, struct kqid qid,
1522 goto out; 1523 goto out;
1523 1524
1524 qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lksb.sb_lvbptr; 1525 qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lksb.sb_lvbptr;
1525 fdq->d_version = FS_DQUOT_VERSION; 1526 fdq->d_spc_hardlimit = be64_to_cpu(qlvb->qb_limit) << sdp->sd_sb.sb_bsize_shift;
1526 fdq->d_flags = (qid.type == USRQUOTA) ? FS_USER_QUOTA : FS_GROUP_QUOTA; 1527 fdq->d_spc_softlimit = be64_to_cpu(qlvb->qb_warn) << sdp->sd_sb.sb_bsize_shift;
1527 fdq->d_id = from_kqid_munged(current_user_ns(), qid); 1528 fdq->d_space = be64_to_cpu(qlvb->qb_value) << sdp->sd_sb.sb_bsize_shift;
1528 fdq->d_blk_hardlimit = be64_to_cpu(qlvb->qb_limit) << sdp->sd_fsb2bb_shift;
1529 fdq->d_blk_softlimit = be64_to_cpu(qlvb->qb_warn) << sdp->sd_fsb2bb_shift;
1530 fdq->d_bcount = be64_to_cpu(qlvb->qb_value) << sdp->sd_fsb2bb_shift;
1531 1529
1532 gfs2_glock_dq_uninit(&q_gh); 1530 gfs2_glock_dq_uninit(&q_gh);
1533out: 1531out:
@@ -1536,10 +1534,10 @@ out:
1536} 1534}
1537 1535
1538/* GFS2 only supports a subset of the XFS fields */ 1536/* GFS2 only supports a subset of the XFS fields */
1539#define GFS2_FIELDMASK (FS_DQ_BSOFT|FS_DQ_BHARD|FS_DQ_BCOUNT) 1537#define GFS2_FIELDMASK (QC_SPC_SOFT|QC_SPC_HARD|QC_SPACE)
1540 1538
1541static int gfs2_set_dqblk(struct super_block *sb, struct kqid qid, 1539static int gfs2_set_dqblk(struct super_block *sb, struct kqid qid,
1542 struct fs_disk_quota *fdq) 1540 struct qc_dqblk *fdq)
1543{ 1541{
1544 struct gfs2_sbd *sdp = sb->s_fs_info; 1542 struct gfs2_sbd *sdp = sb->s_fs_info;
1545 struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode); 1543 struct gfs2_inode *ip = GFS2_I(sdp->sd_quota_inode);
@@ -1583,17 +1581,17 @@ static int gfs2_set_dqblk(struct super_block *sb, struct kqid qid,
1583 goto out_i; 1581 goto out_i;
1584 1582
1585 /* If nothing has changed, this is a no-op */ 1583 /* If nothing has changed, this is a no-op */
1586 if ((fdq->d_fieldmask & FS_DQ_BSOFT) && 1584 if ((fdq->d_fieldmask & QC_SPC_SOFT) &&
1587 ((fdq->d_blk_softlimit >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_warn))) 1585 ((fdq->d_spc_softlimit >> sdp->sd_sb.sb_bsize_shift) == be64_to_cpu(qd->qd_qb.qb_warn)))
1588 fdq->d_fieldmask ^= FS_DQ_BSOFT; 1586 fdq->d_fieldmask ^= QC_SPC_SOFT;
1589 1587
1590 if ((fdq->d_fieldmask & FS_DQ_BHARD) && 1588 if ((fdq->d_fieldmask & QC_SPC_HARD) &&
1591 ((fdq->d_blk_hardlimit >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_limit))) 1589 ((fdq->d_spc_hardlimit >> sdp->sd_sb.sb_bsize_shift) == be64_to_cpu(qd->qd_qb.qb_limit)))
1592 fdq->d_fieldmask ^= FS_DQ_BHARD; 1590 fdq->d_fieldmask ^= QC_SPC_HARD;
1593 1591
1594 if ((fdq->d_fieldmask & FS_DQ_BCOUNT) && 1592 if ((fdq->d_fieldmask & QC_SPACE) &&
1595 ((fdq->d_bcount >> sdp->sd_fsb2bb_shift) == be64_to_cpu(qd->qd_qb.qb_value))) 1593 ((fdq->d_space >> sdp->sd_sb.sb_bsize_shift) == be64_to_cpu(qd->qd_qb.qb_value)))
1596 fdq->d_fieldmask ^= FS_DQ_BCOUNT; 1594 fdq->d_fieldmask ^= QC_SPACE;
1597 1595
1598 if (fdq->d_fieldmask == 0) 1596 if (fdq->d_fieldmask == 0)
1599 goto out_i; 1597 goto out_i;
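
The quotactl conversion swaps struct fs_disk_quota (XFS-defined, limits in 512-byte basic blocks) for the VFS-neutral struct qc_dqblk, whose QC_SPC_*/QC_SPACE fields carry plain bytes; hence the shifts change from sd_fsb2bb_shift to the filesystem block-size shift. The unit change in miniature (helper names are illustrative):

	/* bytes <-> fs blocks is now a single shift by the block-size log2 */
	static u64 bytes_to_fs_blocks(u64 bytes, unsigned int bsize_shift)
	{
		return bytes >> bsize_shift;	/* e.g. 1 MiB >> 12 = 256 4 KiB blocks */
	}

	static u64 fs_blocks_to_bytes(u64 blocks, unsigned int bsize_shift)
	{
		return blocks << bsize_shift;
	}
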
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index 573bd3b758fa..1b645773c98e 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -439,7 +439,7 @@ static void gfs2_recovery_done(struct gfs2_sbd *sdp, unsigned int jid,
439 439
440 ls->ls_recover_jid_done = jid; 440 ls->ls_recover_jid_done = jid;
441 ls->ls_recover_jid_status = message; 441 ls->ls_recover_jid_status = message;
442 sprintf(env_jid, "JID=%d", jid); 442 sprintf(env_jid, "JID=%u", jid);
443 sprintf(env_status, "RECOVERY=%s", 443 sprintf(env_status, "RECOVERY=%s",
444 message == LM_RD_SUCCESS ? "Done" : "Failed"); 444 message == LM_RD_SUCCESS ? "Done" : "Failed");
445 kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp); 445 kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp);
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 5b327f837de7..1666382b198d 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -743,7 +743,7 @@ static int gfs2_write_inode(struct inode *inode, struct writeback_control *wbc)
743 struct gfs2_inode *ip = GFS2_I(inode); 743 struct gfs2_inode *ip = GFS2_I(inode);
744 struct gfs2_sbd *sdp = GFS2_SB(inode); 744 struct gfs2_sbd *sdp = GFS2_SB(inode);
745 struct address_space *metamapping = gfs2_glock2aspace(ip->i_gl); 745 struct address_space *metamapping = gfs2_glock2aspace(ip->i_gl);
746 struct backing_dev_info *bdi = metamapping->backing_dev_info; 746 struct backing_dev_info *bdi = inode_to_bdi(metamapping->host);
747 int ret = 0; 747 int ret = 0;
748 748
749 if (wbc->sync_mode == WB_SYNC_ALL) 749 if (wbc->sync_mode == WB_SYNC_ALL)
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index 3ab566ba5696..ae8e8811f0e8 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -96,7 +96,7 @@ static ssize_t freeze_show(struct gfs2_sbd *sdp, char *buf)
96 struct super_block *sb = sdp->sd_vfs; 96 struct super_block *sb = sdp->sd_vfs;
97 int frozen = (sb->s_writers.frozen == SB_UNFROZEN) ? 0 : 1; 97 int frozen = (sb->s_writers.frozen == SB_UNFROZEN) ? 0 : 1;
98 98
99 return snprintf(buf, PAGE_SIZE, "%u\n", frozen); 99 return snprintf(buf, PAGE_SIZE, "%d\n", frozen);
100} 100}
101 101
102static ssize_t freeze_store(struct gfs2_sbd *sdp, const char *buf, size_t len) 102static ssize_t freeze_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 5eba47f593f8..c274aca8e8dc 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -62,12 +62,6 @@ static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode)
62 return container_of(inode, struct hugetlbfs_inode_info, vfs_inode); 62 return container_of(inode, struct hugetlbfs_inode_info, vfs_inode);
63} 63}
64 64
65static struct backing_dev_info hugetlbfs_backing_dev_info = {
66 .name = "hugetlbfs",
67 .ra_pages = 0, /* No readahead */
68 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
69};
70
71int sysctl_hugetlb_shm_group; 65int sysctl_hugetlb_shm_group;
72 66
73enum { 67enum {
@@ -498,7 +492,6 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
498 lockdep_set_class(&inode->i_mapping->i_mmap_rwsem, 492 lockdep_set_class(&inode->i_mapping->i_mmap_rwsem,
499 &hugetlbfs_i_mmap_rwsem_key); 493 &hugetlbfs_i_mmap_rwsem_key);
500 inode->i_mapping->a_ops = &hugetlbfs_aops; 494 inode->i_mapping->a_ops = &hugetlbfs_aops;
501 inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info;
502 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 495 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
503 inode->i_mapping->private_data = resv_map; 496 inode->i_mapping->private_data = resv_map;
504 info = HUGETLBFS_I(inode); 497 info = HUGETLBFS_I(inode);
@@ -1032,10 +1025,6 @@ static int __init init_hugetlbfs_fs(void)
1032 return -ENOTSUPP; 1025 return -ENOTSUPP;
1033 } 1026 }
1034 1027
1035 error = bdi_init(&hugetlbfs_backing_dev_info);
1036 if (error)
1037 return error;
1038
1039 error = -ENOMEM; 1028 error = -ENOMEM;
1040 hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache", 1029 hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache",
1041 sizeof(struct hugetlbfs_inode_info), 1030 sizeof(struct hugetlbfs_inode_info),
@@ -1071,7 +1060,6 @@ static int __init init_hugetlbfs_fs(void)
1071 out: 1060 out:
1072 kmem_cache_destroy(hugetlbfs_inode_cachep); 1061 kmem_cache_destroy(hugetlbfs_inode_cachep);
1073 out2: 1062 out2:
1074 bdi_destroy(&hugetlbfs_backing_dev_info);
1075 return error; 1063 return error;
1076} 1064}
1077 1065
@@ -1091,7 +1079,6 @@ static void __exit exit_hugetlbfs_fs(void)
1091 for_each_hstate(h) 1079 for_each_hstate(h)
1092 kern_unmount(hugetlbfs_vfsmount[i++]); 1080 kern_unmount(hugetlbfs_vfsmount[i++]);
1093 unregister_filesystem(&hugetlbfs_fs_type); 1081 unregister_filesystem(&hugetlbfs_fs_type);
1094 bdi_destroy(&hugetlbfs_backing_dev_info);
1095} 1082}
1096 1083
1097module_init(init_hugetlbfs_fs) 1084module_init(init_hugetlbfs_fs)
diff --git a/fs/inode.c b/fs/inode.c
index aa149e7262ac..f00b16f45507 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -18,6 +18,7 @@
18#include <linux/buffer_head.h> /* for inode_has_buffers */ 18#include <linux/buffer_head.h> /* for inode_has_buffers */
19#include <linux/ratelimit.h> 19#include <linux/ratelimit.h>
20#include <linux/list_lru.h> 20#include <linux/list_lru.h>
21#include <trace/events/writeback.h>
21#include "internal.h" 22#include "internal.h"
22 23
23/* 24/*
@@ -30,7 +31,7 @@
30 * inode_sb_list_lock protects: 31 * inode_sb_list_lock protects:
31 * sb->s_inodes, inode->i_sb_list 32 * sb->s_inodes, inode->i_sb_list
32 * bdi->wb.list_lock protects: 33 * bdi->wb.list_lock protects:
33 * bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list 34 * bdi->wb.b_{dirty,io,more_io,dirty_time}, inode->i_wb_list
34 * inode_hash_lock protects: 35 * inode_hash_lock protects:
35 * inode_hashtable, inode->i_hash 36 * inode_hashtable, inode->i_hash
36 * 37 *
@@ -170,20 +171,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
170 atomic_set(&mapping->i_mmap_writable, 0); 171 atomic_set(&mapping->i_mmap_writable, 0);
171 mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE); 172 mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE);
172 mapping->private_data = NULL; 173 mapping->private_data = NULL;
173 mapping->backing_dev_info = &default_backing_dev_info;
174 mapping->writeback_index = 0; 174 mapping->writeback_index = 0;
175
176 /*
177 * If the block_device provides a backing_dev_info for client
178 * inodes then use that. Otherwise the inode shares the bdev's
179 * backing_dev_info.
180 */
181 if (sb->s_bdev) {
182 struct backing_dev_info *bdi;
183
184 bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info;
185 mapping->backing_dev_info = bdi;
186 }
187 inode->i_private = NULL; 175 inode->i_private = NULL;
188 inode->i_mapping = mapping; 176 inode->i_mapping = mapping;
189 INIT_HLIST_HEAD(&inode->i_dentry); /* buggered by rcu freeing */ 177 INIT_HLIST_HEAD(&inode->i_dentry); /* buggered by rcu freeing */
@@ -194,7 +182,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
194#ifdef CONFIG_FSNOTIFY 182#ifdef CONFIG_FSNOTIFY
195 inode->i_fsnotify_mask = 0; 183 inode->i_fsnotify_mask = 0;
196#endif 184#endif
197 185 inode->i_flctx = NULL;
198 this_cpu_inc(nr_inodes); 186 this_cpu_inc(nr_inodes);
199 187
200 return 0; 188 return 0;
@@ -237,6 +225,7 @@ void __destroy_inode(struct inode *inode)
237 BUG_ON(inode_has_buffers(inode)); 225 BUG_ON(inode_has_buffers(inode));
238 security_inode_free(inode); 226 security_inode_free(inode);
239 fsnotify_inode_delete(inode); 227 fsnotify_inode_delete(inode);
228 locks_free_lock_context(inode->i_flctx);
240 if (!inode->i_nlink) { 229 if (!inode->i_nlink) {
241 WARN_ON(atomic_long_read(&inode->i_sb->s_remove_count) == 0); 230 WARN_ON(atomic_long_read(&inode->i_sb->s_remove_count) == 0);
242 atomic_long_dec(&inode->i_sb->s_remove_count); 231 atomic_long_dec(&inode->i_sb->s_remove_count);
@@ -355,7 +344,6 @@ void address_space_init_once(struct address_space *mapping)
355 INIT_LIST_HEAD(&mapping->private_list); 344 INIT_LIST_HEAD(&mapping->private_list);
356 spin_lock_init(&mapping->private_lock); 345 spin_lock_init(&mapping->private_lock);
357 mapping->i_mmap = RB_ROOT; 346 mapping->i_mmap = RB_ROOT;
358 INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
359} 347}
360EXPORT_SYMBOL(address_space_init_once); 348EXPORT_SYMBOL(address_space_init_once);
361 349
@@ -416,7 +404,8 @@ static void inode_lru_list_add(struct inode *inode)
416 */ 404 */
417void inode_add_lru(struct inode *inode) 405void inode_add_lru(struct inode *inode)
418{ 406{
419 if (!(inode->i_state & (I_DIRTY | I_SYNC | I_FREEING | I_WILL_FREE)) && 407 if (!(inode->i_state & (I_DIRTY_ALL | I_SYNC |
408 I_FREEING | I_WILL_FREE)) &&
420 !atomic_read(&inode->i_count) && inode->i_sb->s_flags & MS_ACTIVE) 409 !atomic_read(&inode->i_count) && inode->i_sb->s_flags & MS_ACTIVE)
421 inode_lru_list_add(inode); 410 inode_lru_list_add(inode);
422} 411}
@@ -647,7 +636,7 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty)
647 spin_unlock(&inode->i_lock); 636 spin_unlock(&inode->i_lock);
648 continue; 637 continue;
649 } 638 }
650 if (inode->i_state & I_DIRTY && !kill_dirty) { 639 if (inode->i_state & I_DIRTY_ALL && !kill_dirty) {
651 spin_unlock(&inode->i_lock); 640 spin_unlock(&inode->i_lock);
652 busy = 1; 641 busy = 1;
653 continue; 642 continue;
@@ -685,8 +674,8 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty)
685 * LRU does not have strict ordering. Hence we don't want to reclaim inodes 674 * LRU does not have strict ordering. Hence we don't want to reclaim inodes
686 * with this flag set because they are the inodes that are out of order. 675 * with this flag set because they are the inodes that are out of order.
687 */ 676 */
688static enum lru_status 677static enum lru_status inode_lru_isolate(struct list_head *item,
689inode_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg) 678 struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
690{ 679{
691 struct list_head *freeable = arg; 680 struct list_head *freeable = arg;
692 struct inode *inode = container_of(item, struct inode, i_lru); 681 struct inode *inode = container_of(item, struct inode, i_lru);
@@ -704,7 +693,7 @@ inode_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg)
704 */ 693 */
705 if (atomic_read(&inode->i_count) || 694 if (atomic_read(&inode->i_count) ||
706 (inode->i_state & ~I_REFERENCED)) { 695 (inode->i_state & ~I_REFERENCED)) {
707 list_del_init(&inode->i_lru); 696 list_lru_isolate(lru, &inode->i_lru);
708 spin_unlock(&inode->i_lock); 697 spin_unlock(&inode->i_lock);
709 this_cpu_dec(nr_unused); 698 this_cpu_dec(nr_unused);
710 return LRU_REMOVED; 699 return LRU_REMOVED;
@@ -738,7 +727,7 @@ inode_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg)
738 727
739 WARN_ON(inode->i_state & I_NEW); 728 WARN_ON(inode->i_state & I_NEW);
740 inode->i_state |= I_FREEING; 729 inode->i_state |= I_FREEING;
741 list_move(&inode->i_lru, freeable); 730 list_lru_isolate_move(lru, &inode->i_lru, freeable);
742 spin_unlock(&inode->i_lock); 731 spin_unlock(&inode->i_lock);
743 732
744 this_cpu_dec(nr_unused); 733 this_cpu_dec(nr_unused);
@@ -751,14 +740,13 @@ inode_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg)
751 * to trim from the LRU. Inodes to be freed are moved to a temporary list and 740 * to trim from the LRU. Inodes to be freed are moved to a temporary list and
752 * then are freed outside inode_lock by dispose_list(). 741 * then are freed outside inode_lock by dispose_list().
753 */ 742 */
754long prune_icache_sb(struct super_block *sb, unsigned long nr_to_scan, 743long prune_icache_sb(struct super_block *sb, struct shrink_control *sc)
755 int nid)
756{ 744{
757 LIST_HEAD(freeable); 745 LIST_HEAD(freeable);
758 long freed; 746 long freed;
759 747
760 freed = list_lru_walk_node(&sb->s_inode_lru, nid, inode_lru_isolate, 748 freed = list_lru_shrink_walk(&sb->s_inode_lru, sc,
761 &freeable, &nr_to_scan); 749 inode_lru_isolate, &freeable);
762 dispose_list(&freeable); 750 dispose_list(&freeable);
763 return freed; 751 return freed;
764} 752}
@@ -1282,6 +1270,56 @@ struct inode *ilookup(struct super_block *sb, unsigned long ino)
1282} 1270}
1283EXPORT_SYMBOL(ilookup); 1271EXPORT_SYMBOL(ilookup);
1284 1272
1273/**
1274 * find_inode_nowait - find an inode in the inode cache
1275 * @sb: super block of file system to search
1276 * @hashval: hash value (usually inode number) to search for
1277 * @match: callback used for comparisons between inodes
1278 * @data: opaque data pointer to pass to @match
1279 *
1280 * Search for the inode specified by @hashval and @data in the inode
1281 * cache, where the helper function @match will return 0 if the inode
1282 * does not match, 1 if the inode does match, and -1 if the search
1283 * should be stopped. The @match function must be responsible for
1284 * taking the i_lock spin_lock and checking i_state for an inode being
1285 * freed or being initialized, and incrementing the reference count
1286 * before returning 1. It also must not sleep, since it is called with
1287 * the inode_hash_lock spinlock held.
1288 *
 1289 * This is an even more generalized version of ilookup5() when the
 1290 * function must never block --- find_inode() can block in
 1291 * __wait_on_freeing_inode() --- or when the caller cannot increment
 1292 * the reference count because the resulting iput() might cause an
 1293 * inode eviction. The tradeoff is that the @match function must be
1294 * very carefully implemented.
1295 */
1296struct inode *find_inode_nowait(struct super_block *sb,
1297 unsigned long hashval,
1298 int (*match)(struct inode *, unsigned long,
1299 void *),
1300 void *data)
1301{
1302 struct hlist_head *head = inode_hashtable + hash(sb, hashval);
1303 struct inode *inode, *ret_inode = NULL;
1304 int mval;
1305
1306 spin_lock(&inode_hash_lock);
1307 hlist_for_each_entry(inode, head, i_hash) {
1308 if (inode->i_sb != sb)
1309 continue;
1310 mval = match(inode, hashval, data);
1311 if (mval == 0)
1312 continue;
1313 if (mval == 1)
1314 ret_inode = inode;
1315 goto out;
1316 }
1317out:
1318 spin_unlock(&inode_hash_lock);
1319 return ret_inode;
1320}
1321EXPORT_SYMBOL(find_inode_nowait);
1322
1285int insert_inode_locked(struct inode *inode) 1323int insert_inode_locked(struct inode *inode)
1286{ 1324{
1287 struct super_block *sb = inode->i_sb; 1325 struct super_block *sb = inode->i_sb;
@@ -1432,11 +1470,20 @@ static void iput_final(struct inode *inode)
1432 */ 1470 */
1433void iput(struct inode *inode) 1471void iput(struct inode *inode)
1434{ 1472{
1435 if (inode) { 1473 if (!inode)
1436 BUG_ON(inode->i_state & I_CLEAR); 1474 return;
1437 1475 BUG_ON(inode->i_state & I_CLEAR);
1438 if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) 1476retry:
1439 iput_final(inode); 1477 if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) {
1478 if (inode->i_nlink && (inode->i_state & I_DIRTY_TIME)) {
1479 atomic_inc(&inode->i_count);
1480 inode->i_state &= ~I_DIRTY_TIME;
1481 spin_unlock(&inode->i_lock);
1482 trace_writeback_lazytime_iput(inode);
1483 mark_inode_dirty_sync(inode);
1484 goto retry;
1485 }
1486 iput_final(inode);
1440 } 1487 }
1441} 1488}
1442EXPORT_SYMBOL(iput); 1489EXPORT_SYMBOL(iput);
@@ -1495,14 +1542,9 @@ static int relatime_need_update(struct vfsmount *mnt, struct inode *inode,
1495 return 0; 1542 return 0;
1496} 1543}
1497 1544
1498/* 1545int generic_update_time(struct inode *inode, struct timespec *time, int flags)
1499 * This does the actual work of updating an inodes time or version. Must have
1500 * had called mnt_want_write() before calling this.
1501 */
1502static int update_time(struct inode *inode, struct timespec *time, int flags)
1503{ 1546{
1504 if (inode->i_op->update_time) 1547 int iflags = I_DIRTY_TIME;
1505 return inode->i_op->update_time(inode, time, flags);
1506 1548
1507 if (flags & S_ATIME) 1549 if (flags & S_ATIME)
1508 inode->i_atime = *time; 1550 inode->i_atime = *time;
@@ -1512,9 +1554,27 @@ static int update_time(struct inode *inode, struct timespec *time, int flags)
1512 inode->i_ctime = *time; 1554 inode->i_ctime = *time;
1513 if (flags & S_MTIME) 1555 if (flags & S_MTIME)
1514 inode->i_mtime = *time; 1556 inode->i_mtime = *time;
1515 mark_inode_dirty_sync(inode); 1557
1558 if (!(inode->i_sb->s_flags & MS_LAZYTIME) || (flags & S_VERSION))
1559 iflags |= I_DIRTY_SYNC;
1560 __mark_inode_dirty(inode, iflags);
1516 return 0; 1561 return 0;
1517} 1562}
1563EXPORT_SYMBOL(generic_update_time);
1564
1565/*
 1566 * This does the actual work of updating an inode's time or version. Must
 1567 * have called mnt_want_write() before calling this.
1568 */
1569static int update_time(struct inode *inode, struct timespec *time, int flags)
1570{
1571 int (*update_time)(struct inode *, struct timespec *, int);
1572
1573 update_time = inode->i_op->update_time ? inode->i_op->update_time :
1574 generic_update_time;
1575
1576 return update_time(inode, time, flags);
1577}
1518 1578
1519/** 1579/**
1520 * touch_atime - update the access time 1580 * touch_atime - update the access time
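
With generic_update_time() exported, update_time() becomes a pure dispatcher: a filesystem that only needs extra bookkeeping can now wrap the generic helper from its own ->update_time instead of duplicating the timestamp stores. A hypothetical example:

	static int myfs_update_time(struct inode *inode, struct timespec *time,
				    int flags)
	{
		/* filesystem-specific work (journaling, notification, ...) here */
		return generic_update_time(inode, time, flags);
	}
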
diff --git a/fs/internal.h b/fs/internal.h
index e9a61fe67575..30459dab409d 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -14,6 +14,7 @@ struct file_system_type;
14struct linux_binprm; 14struct linux_binprm;
15struct path; 15struct path;
16struct mount; 16struct mount;
17struct shrink_control;
17 18
18/* 19/*
19 * block_dev.c 20 * block_dev.c
@@ -111,8 +112,7 @@ extern int open_check_o_direct(struct file *f);
111 * inode.c 112 * inode.c
112 */ 113 */
113extern spinlock_t inode_sb_list_lock; 114extern spinlock_t inode_sb_list_lock;
114extern long prune_icache_sb(struct super_block *sb, unsigned long nr_to_scan, 115extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc);
115 int nid);
116extern void inode_add_lru(struct inode *inode); 116extern void inode_add_lru(struct inode *inode);
117 117
118/* 118/*
@@ -129,8 +129,7 @@ extern int invalidate_inodes(struct super_block *, bool);
129 */ 129 */
130extern struct dentry *__d_alloc(struct super_block *, const struct qstr *); 130extern struct dentry *__d_alloc(struct super_block *, const struct qstr *);
131extern int d_set_mounted(struct dentry *dentry); 131extern int d_set_mounted(struct dentry *dentry);
132extern long prune_dcache_sb(struct super_block *sb, unsigned long nr_to_scan, 132extern long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc);
133 int nid);
134 133
135/* 134/*
136 * read_write.c 135 * read_write.c
@@ -145,7 +144,7 @@ extern const struct file_operations pipefifo_fops;
145/* 144/*
146 * fs_pin.c 145 * fs_pin.c
147 */ 146 */
148extern void sb_pin_kill(struct super_block *sb); 147extern void group_pin_kill(struct hlist_head *p);
149extern void mnt_pin_kill(struct mount *m); 148extern void mnt_pin_kill(struct mount *m);
150 149
151/* 150/*
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 214c3c11fbc2..5d01d2638ca5 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -379,6 +379,11 @@ int __generic_block_fiemap(struct inode *inode,
379 past_eof = true; 379 past_eof = true;
380 } 380 }
381 cond_resched(); 381 cond_resched();
382 if (fatal_signal_pending(current)) {
383 ret = -EINTR;
384 break;
385 }
386
382 } while (1); 387 } while (1);
383 388
384 /* If ret is 1 then we just hit the end of the extent array */ 389 /* If ret is 1 then we just hit the end of the extent array */
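
The fiemap loop gains the standard long-loop courtesy pair: cond_resched() to yield the CPU, plus a fatal_signal_pending() check so a killed task does not grind through the whole extent array. The shape in isolation (more_work/do_one_extent are placeholders):

	while (more_work) {
		do_one_extent();
		cond_resched();			/* give the scheduler a chance */
		if (fatal_signal_pending(current)) {
			ret = -EINTR;		/* unwind; the task is dying anyway */
			break;
		}
	}
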
diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c
index bb63254ed848..735d7522a3a9 100644
--- a/fs/isofs/rock.c
+++ b/fs/isofs/rock.c
@@ -362,6 +362,9 @@ repeat:
362 rs.cont_size = isonum_733(rr->u.CE.size); 362 rs.cont_size = isonum_733(rr->u.CE.size);
363 break; 363 break;
364 case SIG('E', 'R'): 364 case SIG('E', 'R'):
365 /* Invalid length of ER tag id? */
366 if (rr->u.ER.len_id + offsetof(struct rock_ridge, u.ER.data) > rr->len)
367 goto out;
365 ISOFS_SB(inode->i_sb)->s_rock = 1; 368 ISOFS_SB(inode->i_sb)->s_rock = 1;
366 printk(KERN_DEBUG "ISO 9660 Extensions: "); 369 printk(KERN_DEBUG "ISO 9660 Extensions: ");
367 { 370 {
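
The ER record carries its own id-length byte; the new check rejects a record whose claimed id would extend past the record's overall length, the standard guard before parsing attacker-controlled variable-length fields. Generic shape (names are illustrative, not isofs code):

	/* reject a record whose variable part overruns its own length field */
	if (rec->id_len + offsetof(struct record, data) > rec->len)
		goto bad_record;
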
diff --git a/fs/isofs/util.c b/fs/isofs/util.c
index 01e1ee7a998b..005a15cfd30a 100644
--- a/fs/isofs/util.c
+++ b/fs/isofs/util.c
@@ -2,6 +2,7 @@
2 * linux/fs/isofs/util.c 2 * linux/fs/isofs/util.c
3 */ 3 */
4 4
5#include <linux/time.h>
5#include "isofs.h" 6#include "isofs.h"
6 7
7/* 8/*
@@ -17,9 +18,9 @@
17int iso_date(char * p, int flag) 18int iso_date(char * p, int flag)
18{ 19{
19 int year, month, day, hour, minute, second, tz; 20 int year, month, day, hour, minute, second, tz;
20 int crtime, days, i; 21 int crtime;
21 22
22 year = p[0] - 70; 23 year = p[0];
23 month = p[1]; 24 month = p[1];
24 day = p[2]; 25 day = p[2];
25 hour = p[3]; 26 hour = p[3];
@@ -31,18 +32,7 @@ int iso_date(char * p, int flag)
31 if (year < 0) { 32 if (year < 0) {
32 crtime = 0; 33 crtime = 0;
33 } else { 34 } else {
34 int monlen[12] = {31,28,31,30,31,30,31,31,30,31,30,31}; 35 crtime = mktime64(year+1900, month, day, hour, minute, second);
35
36 days = year * 365;
37 if (year > 2)
38 days += (year+1) / 4;
39 for (i = 1; i < month; i++)
40 days += monlen[i-1];
41 if (((year+2) % 4) == 0 && month > 2)
42 days++;
43 days += day - 1;
44 crtime = ((((days * 24) + hour) * 60 + minute) * 60)
45 + second;
46 36
47 /* sign extend */ 37 /* sign extend */
48 if (tz & 0x80) 38 if (tz & 0x80)
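
The hand-rolled day arithmetic (with its own leap-year handling) is replaced by mktime64(), which turns a broken-down civil date into seconds since the Unix epoch; the year is now passed through raw and rebased with +1900 at the call site, since ISO 9660 stores years as offsets from 1900. Usage in miniature:

	static time64_t example(void)
	{
		/* 2015-02-11 12:00:00 UTC; the month argument is 1-based */
		return mktime64(2015, 2, 11, 12, 0, 0);
	}
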
diff --git a/fs/jffs2/compr_rubin.c b/fs/jffs2/compr_rubin.c
index 92e0644bf867..556de100ebd5 100644
--- a/fs/jffs2/compr_rubin.c
+++ b/fs/jffs2/compr_rubin.c
@@ -84,11 +84,6 @@ static inline int pullbit(struct pushpull *pp)
84 return bit; 84 return bit;
85} 85}
86 86
87static inline int pulledbits(struct pushpull *pp)
88{
89 return pp->ofs;
90}
91
92 87
93static void init_rubin(struct rubin_state *rs, int div, int *bits) 88static void init_rubin(struct rubin_state *rs, int div, int *bits)
94{ 89{
diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index 7654e87b0428..9ad5ba4b299b 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -510,6 +510,10 @@ static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblo
510 sumlen = c->sector_size - je32_to_cpu(sm->offset); 510 sumlen = c->sector_size - je32_to_cpu(sm->offset);
511 sumptr = buf + buf_size - sumlen; 511 sumptr = buf + buf_size - sumlen;
512 512
 513 /* sm->offset may be wrong but MAGIC may be right */
514 if (sumlen > c->sector_size)
515 goto full_scan;
516
513 /* Now, make sure the summary itself is available */ 517 /* Now, make sure the summary itself is available */
514 if (sumlen > buf_size) { 518 if (sumlen > buf_size) {
515 /* Need to kmalloc for this. */ 519 /* Need to kmalloc for this. */
@@ -544,6 +548,7 @@ static int jffs2_scan_eraseblock (struct jffs2_sb_info *c, struct jffs2_eraseblo
544 } 548 }
545 } 549 }
546 550
551full_scan:
547 buf_ofs = jeb->offset; 552 buf_ofs = jeb->offset;
548 553
549 if (!buf_size) { 554 if (!buf_size) {
diff --git a/fs/jfs/endian24.h b/fs/jfs/endian24.h
deleted file mode 100644
index fa92f7f1d0d0..000000000000
--- a/fs/jfs/endian24.h
+++ /dev/null
@@ -1,49 +0,0 @@
1/*
2 * Copyright (C) International Business Machines Corp., 2001
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
12 * the GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 */
18#ifndef _H_ENDIAN24
19#define _H_ENDIAN24
20
21/*
22 * endian24.h:
23 *
24 * Endian conversion for 24-byte data
25 *
26 */
27#define __swab24(x) \
28({ \
29 __u32 __x = (x); \
30 ((__u32)( \
31 ((__x & (__u32)0x000000ffUL) << 16) | \
32 (__x & (__u32)0x0000ff00UL) | \
33 ((__x & (__u32)0x00ff0000UL) >> 16) )); \
34})
35
36#if (defined(__KERNEL__) && defined(__LITTLE_ENDIAN)) || (defined(__BYTE_ORDER) && (__BYTE_ORDER == __LITTLE_ENDIAN))
37 #define __cpu_to_le24(x) ((__u32)(x))
38 #define __le24_to_cpu(x) ((__u32)(x))
39#else
40 #define __cpu_to_le24(x) __swab24(x)
41 #define __le24_to_cpu(x) __swab24(x)
42#endif
43
44#ifdef __KERNEL__
45 #define cpu_to_le24 __cpu_to_le24
46 #define le24_to_cpu __le24_to_cpu
47#endif
48
49#endif /* !_H_ENDIAN24 */
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index 33aa0cc1f8b8..10815f8dfd8b 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -39,7 +39,7 @@ int jfs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
39 return rc; 39 return rc;
40 40
41 mutex_lock(&inode->i_mutex); 41 mutex_lock(&inode->i_mutex);
42 if (!(inode->i_state & I_DIRTY) || 42 if (!(inode->i_state & I_DIRTY_ALL) ||
43 (datasync && !(inode->i_state & I_DIRTY_DATASYNC))) { 43 (datasync && !(inode->i_state & I_DIRTY_DATASYNC))) {
44 /* Make sure committed changes hit the disk */ 44 /* Make sure committed changes hit the disk */
45 jfs_flush_journal(JFS_SBI(inode->i_sb)->log, 1); 45 jfs_flush_journal(JFS_SBI(inode->i_sb)->log, 1);
diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c
index 984c2bbf4f61..d88576e23fe4 100644
--- a/fs/jfs/jfs_dtree.c
+++ b/fs/jfs/jfs_dtree.c
@@ -1040,8 +1040,8 @@ static int dtSplitUp(tid_t tid,
1040 pxdlist.maxnpxd = 1; 1040 pxdlist.maxnpxd = 1;
1041 pxdlist.npxd = 0; 1041 pxdlist.npxd = 0;
1042 pxd = &pxdlist.pxd[0]; 1042 pxd = &pxdlist.pxd[0];
1043 PXDaddress(pxd, nxaddr) 1043 PXDaddress(pxd, nxaddr);
1044 PXDlength(pxd, xlen + n); 1044 PXDlength(pxd, xlen + n);
1045 split->pxdlist = &pxdlist; 1045 split->pxdlist = &pxdlist;
1046 if ((rc = dtExtendPage(tid, ip, split, btstack))) { 1046 if ((rc = dtExtendPage(tid, ip, split, btstack))) {
1047 nxaddr = addressPXD(pxd); 1047 nxaddr = addressPXD(pxd);
diff --git a/fs/jfs/jfs_types.h b/fs/jfs/jfs_types.h
index 43ea3713c083..8f602dcb51fa 100644
--- a/fs/jfs/jfs_types.h
+++ b/fs/jfs/jfs_types.h
@@ -30,8 +30,6 @@
30#include <linux/types.h> 30#include <linux/types.h>
31#include <linux/nls.h> 31#include <linux/nls.h>
32 32
33#include "endian24.h"
34
35/* 33/*
36 * transaction and lock id's 34 * transaction and lock id's
37 * 35 *
@@ -59,26 +57,42 @@ struct timestruc_t {
59 57
60/* 58/*
61 * physical xd (pxd) 59 * physical xd (pxd)
60 *
61 * The leftmost 24 bits of len_addr are the extent length.
 62 * The rightmost 8 bits of len_addr are the most significant bits of
63 * the extent address
62 */ 64 */
63typedef struct { 65typedef struct {
64 unsigned len:24; 66 __le32 len_addr;
65 unsigned addr1:8;
66 __le32 addr2; 67 __le32 addr2;
67} pxd_t; 68} pxd_t;
68 69
69/* xd_t field construction */ 70/* xd_t field construction */
70 71
71#define PXDlength(pxd, length32) ((pxd)->len = __cpu_to_le24(length32)) 72static inline void PXDlength(pxd_t *pxd, __u32 len)
72#define PXDaddress(pxd, address64)\ 73{
73{\ 74 pxd->len_addr = (pxd->len_addr & cpu_to_le32(~0xffffff)) |
74 (pxd)->addr1 = ((s64)address64) >> 32;\ 75 cpu_to_le32(len & 0xffffff);
75 (pxd)->addr2 = __cpu_to_le32((address64) & 0xffffffff);\ 76}
77
78static inline void PXDaddress(pxd_t *pxd, __u64 addr)
79{
80 pxd->len_addr = (pxd->len_addr & cpu_to_le32(0xffffff)) |
81 cpu_to_le32((addr >> 32)<<24);
82 pxd->addr2 = cpu_to_le32(addr & 0xffffffff);
76} 83}
77 84
78/* xd_t field extraction */ 85/* xd_t field extraction */
79#define lengthPXD(pxd) __le24_to_cpu((pxd)->len) 86static inline __u32 lengthPXD(pxd_t *pxd)
80#define addressPXD(pxd)\ 87{
81 ( ((s64)((pxd)->addr1)) << 32 | __le32_to_cpu((pxd)->addr2)) 88 return le32_to_cpu((pxd)->len_addr) & 0xffffff;
89}
90
91static inline __u64 addressPXD(pxd_t *pxd)
92{
93 __u64 n = le32_to_cpu(pxd->len_addr) & ~0xffffff;
94 return (n << 8) + le32_to_cpu(pxd->addr2);
95}
82 96
83#define MAXTREEHEIGHT 8 97#define MAXTREEHEIGHT 8
84/* pxd list */ 98/* pxd list */
@@ -93,12 +107,10 @@ struct pxdlist {
93 * data extent descriptor (dxd) 107 * data extent descriptor (dxd)
94 */ 108 */
95typedef struct { 109typedef struct {
96 unsigned flag:8; /* 1: flags */ 110 __u8 flag; /* 1: flags */
97 unsigned rsrvd:24; 111 __u8 rsrvd[3];
98 __le32 size; /* 4: size in byte */ 112 __le32 size; /* 4: size in byte */
99 unsigned len:24; /* 3: length in unit of fsblksize */ 113 pxd_t loc; /* 8: address and length in unit of fsblksize */
100 unsigned addr1:8; /* 1: address in unit of fsblksize */
101 __le32 addr2; /* 4: address in unit of fsblksize */
102} dxd_t; /* - 16 - */ 114} dxd_t; /* - 16 - */
103 115
104/* dxd_t flags */ 116/* dxd_t flags */
@@ -109,12 +121,11 @@ typedef struct {
109#define DXD_CORRUPT 0x08 /* Inconsistency detected */ 121#define DXD_CORRUPT 0x08 /* Inconsistency detected */
110 122
111/* dxd_t field construction 123/* dxd_t field construction
112 * Conveniently, the PXD macros work for DXD
113 */ 124 */
114#define DXDlength PXDlength 125#define DXDlength(dxd, len) PXDlength(&(dxd)->loc, len)
115#define DXDaddress PXDaddress 126#define DXDaddress(dxd, addr) PXDaddress(&(dxd)->loc, addr)
116#define lengthDXD lengthPXD 127#define lengthDXD(dxd) lengthPXD(&(dxd)->loc)
117#define addressDXD addressPXD 128#define addressDXD(dxd) addressPXD(&(dxd)->loc)
118#define DXDsize(dxd, size32) ((dxd)->size = cpu_to_le32(size32)) 129#define DXDsize(dxd, size32) ((dxd)->size = cpu_to_le32(size32))
119#define sizeDXD(dxd) le32_to_cpu((dxd)->size) 130#define sizeDXD(dxd) le32_to_cpu((dxd)->size)
120 131
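
The jfs_types.h conversion above encodes two fields in one word: the low 24 bits of len_addr carry the extent length and the high 8 bits carry bits 32-39 of the extent address. A small userspace model (little-endian host assumed, so the le32 conversions are omitted; values are illustrative) shows the new helpers round-tripping:

	#include <assert.h>
	#include <stdint.h>

	struct pxd { uint32_t len_addr; uint32_t addr2; };

	static void pxd_set(struct pxd *p, uint64_t addr, uint32_t len)
	{
		p->len_addr = ((uint32_t)(addr >> 32) << 24) | (len & 0xffffff);
		p->addr2 = (uint32_t)addr;
	}

	static uint64_t pxd_addr(const struct pxd *p)
	{
		uint64_t n = p->len_addr & ~0xffffffu; /* address bits 32-39, at bits 24-31 */
		return (n << 8) + p->addr2;
	}

	static uint32_t pxd_len(const struct pxd *p)
	{
		return p->len_addr & 0xffffff;
	}

	int main(void)
	{
		struct pxd p;

		pxd_set(&p, 0x12345678abULL, 0x0eface);
		assert(pxd_addr(&p) == 0x12345678abULL);
		assert(pxd_len(&p) == 0x0eface);
		return 0;
	}
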
diff --git a/fs/jfs/jfs_xtree.h b/fs/jfs/jfs_xtree.h
index 08c0c749b986..1e0987986d5f 100644
--- a/fs/jfs/jfs_xtree.h
+++ b/fs/jfs/jfs_xtree.h
@@ -29,13 +29,11 @@
29 * extent allocation descriptor (xad) 29 * extent allocation descriptor (xad)
30 */ 30 */
31typedef struct xad { 31typedef struct xad {
32 unsigned flag:8; /* 1: flag */ 32 __u8 flag; /* 1: flag */
33 unsigned rsvrd:16; /* 2: reserved */ 33 __u8 rsvrd[2]; /* 2: reserved */
34 unsigned off1:8; /* 1: offset in unit of fsblksize */ 34 __u8 off1; /* 1: offset in unit of fsblksize */
35 __le32 off2; /* 4: offset in unit of fsblksize */ 35 __le32 off2; /* 4: offset in unit of fsblksize */
36 unsigned len:24; /* 3: length in unit of fsblksize */ 36 pxd_t loc; /* 8: length and address in unit of fsblksize */
37 unsigned addr1:8; /* 1: address in unit of fsblksize */
38 __le32 addr2; /* 4: address in unit of fsblksize */
39} xad_t; /* (16) */ 37} xad_t; /* (16) */
40 38
41#define MAXXLEN ((1 << 24) - 1) 39#define MAXXLEN ((1 << 24) - 1)
@@ -49,19 +47,14 @@ typedef struct xad {
49 (xad)->off1 = ((u64)offset64) >> 32;\ 47 (xad)->off1 = ((u64)offset64) >> 32;\
50 (xad)->off2 = __cpu_to_le32((offset64) & 0xffffffff);\ 48 (xad)->off2 = __cpu_to_le32((offset64) & 0xffffffff);\
51} 49}
52#define XADaddress(xad, address64)\ 50#define XADaddress(xad, address64) PXDaddress(&(xad)->loc, address64)
53{\ 51#define XADlength(xad, length32) PXDlength(&(xad)->loc, length32)
54 (xad)->addr1 = ((u64)address64) >> 32;\
55 (xad)->addr2 = __cpu_to_le32((address64) & 0xffffffff);\
56}
57#define XADlength(xad, length32) (xad)->len = __cpu_to_le24(length32)
58 52
59/* xad_t field extraction */ 53/* xad_t field extraction */
60#define offsetXAD(xad)\ 54#define offsetXAD(xad)\
61 ( ((s64)((xad)->off1)) << 32 | __le32_to_cpu((xad)->off2)) 55 ( ((s64)((xad)->off1)) << 32 | __le32_to_cpu((xad)->off2))
62#define addressXAD(xad)\ 56#define addressXAD(xad) addressPXD(&(xad)->loc)
63 ( ((s64)((xad)->addr1)) << 32 | __le32_to_cpu((xad)->addr2)) 57#define lengthXAD(xad) lengthPXD(&(xad)->loc)
64#define lengthXAD(xad) __le24_to_cpu((xad)->len)
65 58
66/* xad list */ 59/* xad list */
67struct xadlist { 60struct xadlist {
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 16c3a9556634..5d30c56ae075 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -619,8 +619,7 @@ out_mount_failed:
619 iput(sbi->direct_inode); 619 iput(sbi->direct_inode);
620 sbi->direct_inode = NULL; 620 sbi->direct_inode = NULL;
621out_unload: 621out_unload:
622 if (sbi->nls_tab) 622 unload_nls(sbi->nls_tab);
623 unload_nls(sbi->nls_tab);
624out_kfree: 623out_kfree:
625 kfree(sbi); 624 kfree(sbi);
626 return ret; 625 return ret;
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 37989f02a226..6acc9648f986 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -201,10 +201,14 @@ static unsigned int kernfs_name_hash(const char *name, const void *ns)
201static int kernfs_name_compare(unsigned int hash, const char *name, 201static int kernfs_name_compare(unsigned int hash, const char *name,
202 const void *ns, const struct kernfs_node *kn) 202 const void *ns, const struct kernfs_node *kn)
203{ 203{
204 if (hash != kn->hash) 204 if (hash < kn->hash)
205 return hash - kn->hash; 205 return -1;
206 if (ns != kn->ns) 206 if (hash > kn->hash)
207 return ns - kn->ns; 207 return 1;
208 if (ns < kn->ns)
209 return -1;
210 if (ns > kn->ns)
211 return 1;
208 return strcmp(name, kn->name); 212 return strcmp(name, kn->name);
209} 213}
210 214
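
The old subtraction-based compare was subtly wrong: hash and kn->hash are unsigned ints, so hash - kn->hash wraps modulo 2^32 and the truncated signed result can carry the wrong sign when the two hashes are more than 2^31 apart (the ns pointer subtraction had the same problem). A minimal demonstration with illustrative values:

	#include <stdio.h>

	int main(void)
	{
		unsigned int a = 0x00000001, b = 0x90000000;

		/* Subtraction-based compare: wraps to a positive int,
		 * wrongly reporting a > b. */
		int bad = (int)(a - b);

		/* Explicit three-way compare: always correct. */
		int good = (a < b) ? -1 : (a > b) ? 1 : 0;

		printf("bad=%d good=%d\n", bad, good); /* bad > 0, good == -1 */
		return 0;
	}
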
@@ -407,8 +411,9 @@ void kernfs_put(struct kernfs_node *kn)
407 411
408 if (kernfs_type(kn) == KERNFS_LINK) 412 if (kernfs_type(kn) == KERNFS_LINK)
409 kernfs_put(kn->symlink.target_kn); 413 kernfs_put(kn->symlink.target_kn);
410 if (!(kn->flags & KERNFS_STATIC_NAME)) 414
411 kfree(kn->name); 415 kfree_const(kn->name);
416
412 if (kn->iattr) { 417 if (kn->iattr) {
413 if (kn->iattr->ia_secdata) 418 if (kn->iattr->ia_secdata)
414 security_release_secctx(kn->iattr->ia_secdata, 419 security_release_secctx(kn->iattr->ia_secdata,
@@ -502,15 +507,12 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
502 const char *name, umode_t mode, 507 const char *name, umode_t mode,
503 unsigned flags) 508 unsigned flags)
504{ 509{
505 char *dup_name = NULL;
506 struct kernfs_node *kn; 510 struct kernfs_node *kn;
507 int ret; 511 int ret;
508 512
509 if (!(flags & KERNFS_STATIC_NAME)) { 513 name = kstrdup_const(name, GFP_KERNEL);
510 name = dup_name = kstrdup(name, GFP_KERNEL); 514 if (!name)
511 if (!name) 515 return NULL;
512 return NULL;
513 }
514 516
515 kn = kmem_cache_zalloc(kernfs_node_cache, GFP_KERNEL); 517 kn = kmem_cache_zalloc(kernfs_node_cache, GFP_KERNEL);
516 if (!kn) 518 if (!kn)
@@ -534,7 +536,7 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
534 err_out2: 536 err_out2:
535 kmem_cache_free(kernfs_node_cache, kn); 537 kmem_cache_free(kernfs_node_cache, kn);
536 err_out1: 538 err_out1:
537 kfree(dup_name); 539 kfree_const(name);
538 return NULL; 540 return NULL;
539} 541}
540 542
@@ -1260,7 +1262,7 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
1260 /* rename kernfs_node */ 1262 /* rename kernfs_node */
1261 if (strcmp(kn->name, new_name) != 0) { 1263 if (strcmp(kn->name, new_name) != 0) {
1262 error = -ENOMEM; 1264 error = -ENOMEM;
1263 new_name = kstrdup(new_name, GFP_KERNEL); 1265 new_name = kstrdup_const(new_name, GFP_KERNEL);
1264 if (!new_name) 1266 if (!new_name)
1265 goto out; 1267 goto out;
1266 } else { 1268 } else {
@@ -1281,9 +1283,7 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
1281 1283
1282 kn->ns = new_ns; 1284 kn->ns = new_ns;
1283 if (new_name) { 1285 if (new_name) {
1284 if (!(kn->flags & KERNFS_STATIC_NAME)) 1286 old_name = kn->name;
1285 old_name = kn->name;
1286 kn->flags &= ~KERNFS_STATIC_NAME;
1287 kn->name = new_name; 1287 kn->name = new_name;
1288 } 1288 }
1289 1289
@@ -1293,7 +1293,7 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
1293 kernfs_link_sibling(kn); 1293 kernfs_link_sibling(kn);
1294 1294
1295 kernfs_put(old_parent); 1295 kernfs_put(old_parent);
1296 kfree(old_name); 1296 kfree_const(old_name);
1297 1297
1298 error = 0; 1298 error = 0;
1299 out: 1299 out:
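
The kstrdup_const()/kfree_const() pair makes the KERNFS_STATIC_NAME flag unnecessary: instead of callers declaring that a name is a string literal, the helper detects at run time whether the pointer lies in the kernel's read-only data and, if so, returns it unchanged, while kfree_const() frees only pointers that were actually duplicated. A userspace model of the idea (the kernel checks the .rodata section bounds; the single static_name marker here is a stand-in):

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	/* Stand-in for "is this pointer inside .rodata?". */
	static const char static_name[] = "cgroup.procs";

	static int is_static(const char *s)
	{
		return s == static_name;
	}

	/* Model of kstrdup_const(): share constant strings, copy the rest. */
	static const char *strdup_const_model(const char *s)
	{
		return is_static(s) ? s : strdup(s);
	}

	/* Model of kfree_const(): free only what was actually copied. */
	static void free_const_model(const char *s)
	{
		if (!is_static(s))
			free((void *)s);
	}

	int main(void)
	{
		const char *a = strdup_const_model(static_name);   /* shared */
		const char *b = strdup_const_model("ad-hoc name"); /* copied */

		printf("shared: %d\n", a == static_name); /* prints 1 */
		free_const_model(a);
		free_const_model(b);
		return 0;
	}
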
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index ddc9f9612f16..b684e8a132e6 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -901,7 +901,6 @@ const struct file_operations kernfs_file_fops = {
901 * @ops: kernfs operations for the file 901 * @ops: kernfs operations for the file
902 * @priv: private data for the file 902 * @priv: private data for the file
903 * @ns: optional namespace tag of the file 903 * @ns: optional namespace tag of the file
904 * @name_is_static: don't copy file name
905 * @key: lockdep key for the file's active_ref, %NULL to disable lockdep 904 * @key: lockdep key for the file's active_ref, %NULL to disable lockdep
906 * 905 *
907 * Returns the created node on success, ERR_PTR() value on error. 906 * Returns the created node on success, ERR_PTR() value on error.
@@ -911,7 +910,6 @@ struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent,
911 umode_t mode, loff_t size, 910 umode_t mode, loff_t size,
912 const struct kernfs_ops *ops, 911 const struct kernfs_ops *ops,
913 void *priv, const void *ns, 912 void *priv, const void *ns,
914 bool name_is_static,
915 struct lock_class_key *key) 913 struct lock_class_key *key)
916{ 914{
917 struct kernfs_node *kn; 915 struct kernfs_node *kn;
@@ -919,8 +917,6 @@ struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent,
919 int rc; 917 int rc;
920 918
921 flags = KERNFS_FILE; 919 flags = KERNFS_FILE;
922 if (name_is_static)
923 flags |= KERNFS_STATIC_NAME;
924 920
925 kn = kernfs_new_node(parent, name, (mode & S_IALLUGO) | S_IFREG, flags); 921 kn = kernfs_new_node(parent, name, (mode & S_IALLUGO) | S_IFREG, flags);
926 if (!kn) 922 if (!kn)
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index 985217626e66..9000874a945b 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -24,12 +24,6 @@ static const struct address_space_operations kernfs_aops = {
24 .write_end = simple_write_end, 24 .write_end = simple_write_end,
25}; 25};
26 26
27static struct backing_dev_info kernfs_bdi = {
28 .name = "kernfs",
29 .ra_pages = 0, /* No readahead */
30 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
31};
32
33static const struct inode_operations kernfs_iops = { 27static const struct inode_operations kernfs_iops = {
34 .permission = kernfs_iop_permission, 28 .permission = kernfs_iop_permission,
35 .setattr = kernfs_iop_setattr, 29 .setattr = kernfs_iop_setattr,
@@ -40,12 +34,6 @@ static const struct inode_operations kernfs_iops = {
40 .listxattr = kernfs_iop_listxattr, 34 .listxattr = kernfs_iop_listxattr,
41}; 35};
42 36
43void __init kernfs_inode_init(void)
44{
45 if (bdi_init(&kernfs_bdi))
46 panic("failed to init kernfs_bdi");
47}
48
49static struct kernfs_iattrs *kernfs_iattrs(struct kernfs_node *kn) 37static struct kernfs_iattrs *kernfs_iattrs(struct kernfs_node *kn)
50{ 38{
51 static DEFINE_MUTEX(iattr_mutex); 39 static DEFINE_MUTEX(iattr_mutex);
@@ -298,7 +286,6 @@ static void kernfs_init_inode(struct kernfs_node *kn, struct inode *inode)
298 kernfs_get(kn); 286 kernfs_get(kn);
299 inode->i_private = kn; 287 inode->i_private = kn;
300 inode->i_mapping->a_ops = &kernfs_aops; 288 inode->i_mapping->a_ops = &kernfs_aops;
301 inode->i_mapping->backing_dev_info = &kernfs_bdi;
302 inode->i_op = &kernfs_iops; 289 inode->i_op = &kernfs_iops;
303 290
304 set_default_inode_attr(inode, kn->mode); 291 set_default_inode_attr(inode, kn->mode);
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
index dc84a3ef9ca2..af9fa7499919 100644
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -88,7 +88,6 @@ int kernfs_iop_removexattr(struct dentry *dentry, const char *name);
88ssize_t kernfs_iop_getxattr(struct dentry *dentry, const char *name, void *buf, 88ssize_t kernfs_iop_getxattr(struct dentry *dentry, const char *name, void *buf,
89 size_t size); 89 size_t size);
90ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size); 90ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size);
91void kernfs_inode_init(void);
92 91
93/* 92/*
94 * dir.c 93 * dir.c
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index f973ae9b05f1..8eaf417187f1 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -246,5 +246,4 @@ void __init kernfs_init(void)
246 kernfs_node_cache = kmem_cache_create("kernfs_node_cache", 246 kernfs_node_cache = kmem_cache_create("kernfs_node_cache",
247 sizeof(struct kernfs_node), 247 sizeof(struct kernfs_node),
248 0, SLAB_PANIC, NULL); 248 0, SLAB_PANIC, NULL);
249 kernfs_inode_init();
250} 249}
diff --git a/fs/libfs.c b/fs/libfs.c
index 005843ce5dbd..b2ffdb045be4 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -948,7 +948,7 @@ int __generic_file_fsync(struct file *file, loff_t start, loff_t end,
948 948
949 mutex_lock(&inode->i_mutex); 949 mutex_lock(&inode->i_mutex);
950 ret = sync_mapping_buffers(inode->i_mapping); 950 ret = sync_mapping_buffers(inode->i_mapping);
951 if (!(inode->i_state & I_DIRTY)) 951 if (!(inode->i_state & I_DIRTY_ALL))
952 goto out; 952 goto out;
953 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) 953 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
954 goto out; 954 goto out;
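
This hunk belongs to the lazytime series: assuming I_DIRTY_ALL is defined as I_DIRTY | I_DIRTY_TIME, the fsync paths now also treat an inode as dirty when its only pending change is a deferred timestamp update. A sketch of the assumed flag relationship (bit values illustrative):

	#define I_DIRTY_SYNC     (1 << 0)  /* inode itself needs writeback */
	#define I_DIRTY_DATASYNC (1 << 1)  /* data-related inode fields changed */
	#define I_DIRTY_PAGES    (1 << 2)  /* dirty pagecache pages */
	#define I_DIRTY_TIME     (1 << 11) /* timestamps changed, write deferred */

	#define I_DIRTY     (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES)
	#define I_DIRTY_ALL (I_DIRTY | I_DIRTY_TIME)
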
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 1cc6ec51e6b1..47a32b6d9b90 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -65,7 +65,7 @@ static inline struct sockaddr *nsm_addr(const struct nsm_handle *nsm)
65 return (struct sockaddr *)&nsm->sm_addr; 65 return (struct sockaddr *)&nsm->sm_addr;
66} 66}
67 67
68static struct rpc_clnt *nsm_create(struct net *net) 68static struct rpc_clnt *nsm_create(struct net *net, const char *nodename)
69{ 69{
70 struct sockaddr_in sin = { 70 struct sockaddr_in sin = {
71 .sin_family = AF_INET, 71 .sin_family = AF_INET,
@@ -77,6 +77,7 @@ static struct rpc_clnt *nsm_create(struct net *net)
77 .address = (struct sockaddr *)&sin, 77 .address = (struct sockaddr *)&sin,
78 .addrsize = sizeof(sin), 78 .addrsize = sizeof(sin),
79 .servername = "rpc.statd", 79 .servername = "rpc.statd",
80 .nodename = nodename,
80 .program = &nsm_program, 81 .program = &nsm_program,
81 .version = NSM_VERSION, 82 .version = NSM_VERSION,
82 .authflavor = RPC_AUTH_NULL, 83 .authflavor = RPC_AUTH_NULL,
@@ -102,7 +103,7 @@ out:
102 return clnt; 103 return clnt;
103} 104}
104 105
105static struct rpc_clnt *nsm_client_get(struct net *net) 106static struct rpc_clnt *nsm_client_get(struct net *net, const char *nodename)
106{ 107{
107 struct rpc_clnt *clnt, *new; 108 struct rpc_clnt *clnt, *new;
108 struct lockd_net *ln = net_generic(net, lockd_net_id); 109 struct lockd_net *ln = net_generic(net, lockd_net_id);
@@ -111,7 +112,7 @@ static struct rpc_clnt *nsm_client_get(struct net *net)
111 if (clnt != NULL) 112 if (clnt != NULL)
112 goto out; 113 goto out;
113 114
114 clnt = new = nsm_create(net); 115 clnt = new = nsm_create(net, nodename);
115 if (IS_ERR(clnt)) 116 if (IS_ERR(clnt))
116 goto out; 117 goto out;
117 118
@@ -190,19 +191,23 @@ int nsm_monitor(const struct nlm_host *host)
190 struct nsm_res res; 191 struct nsm_res res;
191 int status; 192 int status;
192 struct rpc_clnt *clnt; 193 struct rpc_clnt *clnt;
194 const char *nodename = NULL;
193 195
194 dprintk("lockd: nsm_monitor(%s)\n", nsm->sm_name); 196 dprintk("lockd: nsm_monitor(%s)\n", nsm->sm_name);
195 197
196 if (nsm->sm_monitored) 198 if (nsm->sm_monitored)
197 return 0; 199 return 0;
198 200
201 if (host->h_rpcclnt)
202 nodename = host->h_rpcclnt->cl_nodename;
203
199 /* 204 /*
200 * Choose whether to record the caller_name or IP address of 205 * Choose whether to record the caller_name or IP address of
201 * this peer in the local rpc.statd's database. 206 * this peer in the local rpc.statd's database.
202 */ 207 */
203 nsm->sm_mon_name = nsm_use_hostnames ? nsm->sm_name : nsm->sm_addrbuf; 208 nsm->sm_mon_name = nsm_use_hostnames ? nsm->sm_name : nsm->sm_addrbuf;
204 209
205 clnt = nsm_client_get(host->net); 210 clnt = nsm_client_get(host->net, nodename);
206 if (IS_ERR(clnt)) { 211 if (IS_ERR(clnt)) {
207 status = PTR_ERR(clnt); 212 status = PTR_ERR(clnt);
208 dprintk("lockd: failed to create NSM upcall transport, " 213 dprintk("lockd: failed to create NSM upcall transport, "
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index e94c887da2d7..55505cbe11af 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -138,10 +138,6 @@ lockd(void *vrqstp)
138 138
139 dprintk("NFS locking service started (ver " LOCKD_VERSION ").\n"); 139 dprintk("NFS locking service started (ver " LOCKD_VERSION ").\n");
140 140
141 if (!nlm_timeout)
142 nlm_timeout = LOCKD_DFLT_TIMEO;
143 nlmsvc_timeout = nlm_timeout * HZ;
144
145 /* 141 /*
146 * The main request loop. We don't terminate until the last 142 * The main request loop. We don't terminate until the last
147 * NFS mount or NFS daemon has gone away. 143 * NFS mount or NFS daemon has gone away.
@@ -350,6 +346,10 @@ static struct svc_serv *lockd_create_svc(void)
350 printk(KERN_WARNING 346 printk(KERN_WARNING
351 "lockd_up: no pid, %d users??\n", nlmsvc_users); 347 "lockd_up: no pid, %d users??\n", nlmsvc_users);
352 348
349 if (!nlm_timeout)
350 nlm_timeout = LOCKD_DFLT_TIMEO;
351 nlmsvc_timeout = nlm_timeout * HZ;
352
353 serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, svc_rpcb_cleanup); 353 serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, svc_rpcb_cleanup);
354 if (!serv) { 354 if (!serv) {
355 printk(KERN_WARNING "lockd_up: create service failed\n"); 355 printk(KERN_WARNING "lockd_up: create service failed\n");
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 56598742dde4..5581e020644b 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -57,8 +57,8 @@ static DEFINE_SPINLOCK(nlm_blocked_lock);
57static const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie) 57static const char *nlmdbg_cookie2a(const struct nlm_cookie *cookie)
58{ 58{
59 /* 59 /*
60 * We can get away with a static buffer because we're only 60 * We can get away with a static buffer because this is only called
61 * called with BKL held. 61 * from lockd, which is single-threaded.
62 */ 62 */
63 static char buf[2*NLM_MAXCOOKIELEN+1]; 63 static char buf[2*NLM_MAXCOOKIELEN+1];
64 unsigned int i, len = sizeof(buf); 64 unsigned int i, len = sizeof(buf);
diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c
index d12ff4e2dbe7..665ef5a05183 100644
--- a/fs/lockd/svcsubs.c
+++ b/fs/lockd/svcsubs.c
@@ -164,12 +164,15 @@ nlm_traverse_locks(struct nlm_host *host, struct nlm_file *file,
164{ 164{
165 struct inode *inode = nlmsvc_file_inode(file); 165 struct inode *inode = nlmsvc_file_inode(file);
166 struct file_lock *fl; 166 struct file_lock *fl;
167 struct file_lock_context *flctx = inode->i_flctx;
167 struct nlm_host *lockhost; 168 struct nlm_host *lockhost;
168 169
170 if (!flctx || list_empty_careful(&flctx->flc_posix))
171 return 0;
169again: 172again:
170 file->f_locks = 0; 173 file->f_locks = 0;
171 spin_lock(&inode->i_lock); 174 spin_lock(&flctx->flc_lock);
172 for (fl = inode->i_flock; fl; fl = fl->fl_next) { 175 list_for_each_entry(fl, &flctx->flc_posix, fl_list) {
173 if (fl->fl_lmops != &nlmsvc_lock_operations) 176 if (fl->fl_lmops != &nlmsvc_lock_operations)
174 continue; 177 continue;
175 178
@@ -180,7 +183,7 @@ again:
180 if (match(lockhost, host)) { 183 if (match(lockhost, host)) {
181 struct file_lock lock = *fl; 184 struct file_lock lock = *fl;
182 185
183 spin_unlock(&inode->i_lock); 186 spin_unlock(&flctx->flc_lock);
184 lock.fl_type = F_UNLCK; 187 lock.fl_type = F_UNLCK;
185 lock.fl_start = 0; 188 lock.fl_start = 0;
186 lock.fl_end = OFFSET_MAX; 189 lock.fl_end = OFFSET_MAX;
@@ -192,7 +195,7 @@ again:
192 goto again; 195 goto again;
193 } 196 }
194 } 197 }
195 spin_unlock(&inode->i_lock); 198 spin_unlock(&flctx->flc_lock);
196 199
197 return 0; 200 return 0;
198} 201}
@@ -223,18 +226,21 @@ nlm_file_inuse(struct nlm_file *file)
223{ 226{
224 struct inode *inode = nlmsvc_file_inode(file); 227 struct inode *inode = nlmsvc_file_inode(file);
225 struct file_lock *fl; 228 struct file_lock *fl;
229 struct file_lock_context *flctx = inode->i_flctx;
226 230
227 if (file->f_count || !list_empty(&file->f_blocks) || file->f_shares) 231 if (file->f_count || !list_empty(&file->f_blocks) || file->f_shares)
228 return 1; 232 return 1;
229 233
230 spin_lock(&inode->i_lock); 234 if (flctx && !list_empty_careful(&flctx->flc_posix)) {
231 for (fl = inode->i_flock; fl; fl = fl->fl_next) { 235 spin_lock(&flctx->flc_lock);
232 if (fl->fl_lmops == &nlmsvc_lock_operations) { 236 list_for_each_entry(fl, &flctx->flc_posix, fl_list) {
233 spin_unlock(&inode->i_lock); 237 if (fl->fl_lmops == &nlmsvc_lock_operations) {
234 return 1; 238 spin_unlock(&flctx->flc_lock);
239 return 1;
240 }
235 } 241 }
242 spin_unlock(&flctx->flc_lock);
236 } 243 }
237 spin_unlock(&inode->i_lock);
238 file->f_locks = 0; 244 file->f_locks = 0;
239 return 0; 245 return 0;
240} 246}
diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c
index 9340e7e10ef6..5b651daad518 100644
--- a/fs/lockd/xdr.c
+++ b/fs/lockd/xdr.c
@@ -95,14 +95,6 @@ nlm_decode_fh(__be32 *p, struct nfs_fh *f)
95 return p + XDR_QUADLEN(NFS2_FHSIZE); 95 return p + XDR_QUADLEN(NFS2_FHSIZE);
96} 96}
97 97
98static inline __be32 *
99nlm_encode_fh(__be32 *p, struct nfs_fh *f)
100{
101 *p++ = htonl(NFS2_FHSIZE);
102 memcpy(p, f->data, NFS2_FHSIZE);
103 return p + XDR_QUADLEN(NFS2_FHSIZE);
104}
105
106/* 98/*
107 * Encode and decode owner handle 99 * Encode and decode owner handle
108 */ 100 */
diff --git a/fs/locks.c b/fs/locks.c
index 735b8d3fa78c..365c82e1b3a9 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -137,7 +137,7 @@
137 137
138#define IS_POSIX(fl) (fl->fl_flags & FL_POSIX) 138#define IS_POSIX(fl) (fl->fl_flags & FL_POSIX)
139#define IS_FLOCK(fl) (fl->fl_flags & FL_FLOCK) 139#define IS_FLOCK(fl) (fl->fl_flags & FL_FLOCK)
140#define IS_LEASE(fl) (fl->fl_flags & (FL_LEASE|FL_DELEG)) 140#define IS_LEASE(fl) (fl->fl_flags & (FL_LEASE|FL_DELEG|FL_LAYOUT))
141#define IS_OFDLCK(fl) (fl->fl_flags & FL_OFDLCK) 141#define IS_OFDLCK(fl) (fl->fl_flags & FL_OFDLCK)
142 142
143static bool lease_breaking(struct file_lock *fl) 143static bool lease_breaking(struct file_lock *fl)
@@ -157,14 +157,11 @@ static int target_leasetype(struct file_lock *fl)
157int leases_enable = 1; 157int leases_enable = 1;
158int lease_break_time = 45; 158int lease_break_time = 45;
159 159
160#define for_each_lock(inode, lockp) \
161 for (lockp = &inode->i_flock; *lockp != NULL; lockp = &(*lockp)->fl_next)
162
163/* 160/*
164 * The global file_lock_list is only used for displaying /proc/locks, so we 161 * The global file_lock_list is only used for displaying /proc/locks, so we
165 * keep a list on each CPU, with each list protected by its own spinlock via 162 * keep a list on each CPU, with each list protected by its own spinlock via
166 * the file_lock_lglock. Note that alterations to the list also require that 163 * the file_lock_lglock. Note that alterations to the list also require that
167 * the relevant i_lock is held. 164 * the relevant flc_lock is held.
168 */ 165 */
169DEFINE_STATIC_LGLOCK(file_lock_lglock); 166DEFINE_STATIC_LGLOCK(file_lock_lglock);
170static DEFINE_PER_CPU(struct hlist_head, file_lock_list); 167static DEFINE_PER_CPU(struct hlist_head, file_lock_list);
@@ -192,21 +189,68 @@ static DEFINE_HASHTABLE(blocked_hash, BLOCKED_HASH_BITS);
192 * contrast to those that are acting as records of acquired locks). 189 * contrast to those that are acting as records of acquired locks).
193 * 190 *
194 * Note that when we acquire this lock in order to change the above fields, 191 * Note that when we acquire this lock in order to change the above fields,
195 * we often hold the i_lock as well. In certain cases, when reading the fields 192 * we often hold the flc_lock as well. In certain cases, when reading the fields
196 * protected by this lock, we can skip acquiring it iff we already hold the 193 * protected by this lock, we can skip acquiring it iff we already hold the
197 * i_lock. 194 * flc_lock.
198 * 195 *
199 * In particular, adding an entry to the fl_block list requires that you hold 196 * In particular, adding an entry to the fl_block list requires that you hold
200 * both the i_lock and the blocked_lock_lock (acquired in that order). Deleting 197 * both the flc_lock and the blocked_lock_lock (acquired in that order).
201 * an entry from the list however only requires the file_lock_lock. 198 * Deleting an entry from the list however only requires the file_lock_lock.
202 */ 199 */
203static DEFINE_SPINLOCK(blocked_lock_lock); 200static DEFINE_SPINLOCK(blocked_lock_lock);
204 201
202static struct kmem_cache *flctx_cache __read_mostly;
205static struct kmem_cache *filelock_cache __read_mostly; 203static struct kmem_cache *filelock_cache __read_mostly;
206 204
205static struct file_lock_context *
206locks_get_lock_context(struct inode *inode)
207{
208 struct file_lock_context *new;
209
210 if (likely(inode->i_flctx))
211 goto out;
212
213 new = kmem_cache_alloc(flctx_cache, GFP_KERNEL);
214 if (!new)
215 goto out;
216
217 spin_lock_init(&new->flc_lock);
218 INIT_LIST_HEAD(&new->flc_flock);
219 INIT_LIST_HEAD(&new->flc_posix);
220 INIT_LIST_HEAD(&new->flc_lease);
221
222 /*
223 * Assign the pointer if it's not already assigned. If it is, then
224 * free the context we just allocated.
225 */
226 spin_lock(&inode->i_lock);
227 if (likely(!inode->i_flctx)) {
228 inode->i_flctx = new;
229 new = NULL;
230 }
231 spin_unlock(&inode->i_lock);
232
233 if (new)
234 kmem_cache_free(flctx_cache, new);
235out:
236 return inode->i_flctx;
237}
238
239void
240locks_free_lock_context(struct file_lock_context *ctx)
241{
242 if (ctx) {
243 WARN_ON_ONCE(!list_empty(&ctx->flc_flock));
244 WARN_ON_ONCE(!list_empty(&ctx->flc_posix));
245 WARN_ON_ONCE(!list_empty(&ctx->flc_lease));
246 kmem_cache_free(flctx_cache, ctx);
247 }
248}
249
207static void locks_init_lock_heads(struct file_lock *fl) 250static void locks_init_lock_heads(struct file_lock *fl)
208{ 251{
209 INIT_HLIST_NODE(&fl->fl_link); 252 INIT_HLIST_NODE(&fl->fl_link);
253 INIT_LIST_HEAD(&fl->fl_list);
210 INIT_LIST_HEAD(&fl->fl_block); 254 INIT_LIST_HEAD(&fl->fl_block);
211 init_waitqueue_head(&fl->fl_wait); 255 init_waitqueue_head(&fl->fl_wait);
212} 256}
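
locks_get_lock_context() above uses the standard optimistic-allocation pattern: allocate with no locks held (a GFP_KERNEL allocation may sleep, so it cannot happen under a spinlock), then install the pointer under inode->i_lock, and free the allocation if another thread got there first. A distilled userspace sketch of the pattern (names hypothetical; the kernel relies on the spinlock for ordering, so a portable version would need atomics on the fast path):

	#include <pthread.h>
	#include <stdlib.h>

	struct ctx { int state; /* ... */ };

	struct obj {
		pthread_mutex_t lock;
		struct ctx *ctx; /* lazily allocated, written once */
	};

	static struct ctx *get_ctx(struct obj *o)
	{
		struct ctx *new;

		if (o->ctx)                    /* fast path: already set */
			return o->ctx;

		new = calloc(1, sizeof(*new)); /* allocate with no locks held */
		if (!new)
			return NULL;

		pthread_mutex_lock(&o->lock);
		if (!o->ctx) {                 /* we won the race: install */
			o->ctx = new;
			new = NULL;
		}
		pthread_mutex_unlock(&o->lock);

		free(new);                     /* race loser frees its copy */
		return o->ctx;
	}
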
@@ -243,6 +287,7 @@ EXPORT_SYMBOL_GPL(locks_release_private);
243void locks_free_lock(struct file_lock *fl) 287void locks_free_lock(struct file_lock *fl)
244{ 288{
245 BUG_ON(waitqueue_active(&fl->fl_wait)); 289 BUG_ON(waitqueue_active(&fl->fl_wait));
290 BUG_ON(!list_empty(&fl->fl_list));
246 BUG_ON(!list_empty(&fl->fl_block)); 291 BUG_ON(!list_empty(&fl->fl_block));
247 BUG_ON(!hlist_unhashed(&fl->fl_link)); 292 BUG_ON(!hlist_unhashed(&fl->fl_link));
248 293
@@ -257,8 +302,8 @@ locks_dispose_list(struct list_head *dispose)
257 struct file_lock *fl; 302 struct file_lock *fl;
258 303
259 while (!list_empty(dispose)) { 304 while (!list_empty(dispose)) {
260 fl = list_first_entry(dispose, struct file_lock, fl_block); 305 fl = list_first_entry(dispose, struct file_lock, fl_list);
261 list_del_init(&fl->fl_block); 306 list_del_init(&fl->fl_list);
262 locks_free_lock(fl); 307 locks_free_lock(fl);
263 } 308 }
264} 309}
@@ -513,7 +558,7 @@ static int posix_same_owner(struct file_lock *fl1, struct file_lock *fl2)
513 return fl1->fl_owner == fl2->fl_owner; 558 return fl1->fl_owner == fl2->fl_owner;
514} 559}
515 560
516/* Must be called with the i_lock held! */ 561/* Must be called with the flc_lock held! */
517static void locks_insert_global_locks(struct file_lock *fl) 562static void locks_insert_global_locks(struct file_lock *fl)
518{ 563{
519 lg_local_lock(&file_lock_lglock); 564 lg_local_lock(&file_lock_lglock);
@@ -522,12 +567,12 @@ static void locks_insert_global_locks(struct file_lock *fl)
522 lg_local_unlock(&file_lock_lglock); 567 lg_local_unlock(&file_lock_lglock);
523} 568}
524 569
525/* Must be called with the i_lock held! */ 570/* Must be called with the flc_lock held! */
526static void locks_delete_global_locks(struct file_lock *fl) 571static void locks_delete_global_locks(struct file_lock *fl)
527{ 572{
528 /* 573 /*
529 * Avoid taking lock if already unhashed. This is safe since this check 574 * Avoid taking lock if already unhashed. This is safe since this check
530 * is done while holding the i_lock, and new insertions into the list 575 * is done while holding the flc_lock, and new insertions into the list
531 * also require that it be held. 576 * also require that it be held.
532 */ 577 */
533 if (hlist_unhashed(&fl->fl_link)) 578 if (hlist_unhashed(&fl->fl_link))
@@ -579,10 +624,10 @@ static void locks_delete_block(struct file_lock *waiter)
579 * the order they blocked. The documentation doesn't require this but 624 * the order they blocked. The documentation doesn't require this but
580 * it seems like the reasonable thing to do. 625 * it seems like the reasonable thing to do.
581 * 626 *
582 * Must be called with both the i_lock and blocked_lock_lock held. The fl_block 627 * Must be called with both the flc_lock and blocked_lock_lock held. The
583 * list itself is protected by the blocked_lock_lock, but by ensuring that the 628 * fl_block list itself is protected by the blocked_lock_lock, but by ensuring
584 * i_lock is also held on insertions we can avoid taking the blocked_lock_lock 629 * that the flc_lock is also held on insertions we can avoid taking the
585 * in some cases when we see that the fl_block list is empty. 630 * blocked_lock_lock in some cases when we see that the fl_block list is empty.
586 */ 631 */
587static void __locks_insert_block(struct file_lock *blocker, 632static void __locks_insert_block(struct file_lock *blocker,
588 struct file_lock *waiter) 633 struct file_lock *waiter)
@@ -594,7 +639,7 @@ static void __locks_insert_block(struct file_lock *blocker,
594 locks_insert_global_blocked(waiter); 639 locks_insert_global_blocked(waiter);
595} 640}
596 641
597/* Must be called with i_lock held. */ 642/* Must be called with flc_lock held. */
598static void locks_insert_block(struct file_lock *blocker, 643static void locks_insert_block(struct file_lock *blocker,
599 struct file_lock *waiter) 644 struct file_lock *waiter)
600{ 645{
@@ -606,15 +651,15 @@ static void locks_insert_block(struct file_lock *blocker,
606/* 651/*
607 * Wake up processes blocked waiting for blocker. 652 * Wake up processes blocked waiting for blocker.
608 * 653 *
609 * Must be called with the inode->i_lock held! 654 * Must be called with the inode->flc_lock held!
610 */ 655 */
611static void locks_wake_up_blocks(struct file_lock *blocker) 656static void locks_wake_up_blocks(struct file_lock *blocker)
612{ 657{
613 /* 658 /*
614 * Avoid taking global lock if list is empty. This is safe since new 659 * Avoid taking global lock if list is empty. This is safe since new
615 * blocked requests are only added to the list under the i_lock, and 660 * blocked requests are only added to the list under the flc_lock, and
616 * the i_lock is always held here. Note that removal from the fl_block 661 * the flc_lock is always held here. Note that removal from the fl_block
617 * list does not require the i_lock, so we must recheck list_empty() 662 * list does not require the flc_lock, so we must recheck list_empty()
618 * after acquiring the blocked_lock_lock. 663 * after acquiring the blocked_lock_lock.
619 */ 664 */
620 if (list_empty(&blocker->fl_block)) 665 if (list_empty(&blocker->fl_block))
@@ -635,63 +680,32 @@ static void locks_wake_up_blocks(struct file_lock *blocker)
635 spin_unlock(&blocked_lock_lock); 680 spin_unlock(&blocked_lock_lock);
636} 681}
637 682
638/* Insert file lock fl into an inode's lock list at the position indicated 683static void
639 * by pos. At the same time add the lock to the global file lock list. 684locks_insert_lock_ctx(struct file_lock *fl, struct list_head *before)
640 *
641 * Must be called with the i_lock held!
642 */
643static void locks_insert_lock(struct file_lock **pos, struct file_lock *fl)
644{ 685{
645 fl->fl_nspid = get_pid(task_tgid(current)); 686 fl->fl_nspid = get_pid(task_tgid(current));
646 687 list_add_tail(&fl->fl_list, before);
647 /* insert into file's list */
648 fl->fl_next = *pos;
649 *pos = fl;
650
651 locks_insert_global_locks(fl); 688 locks_insert_global_locks(fl);
652} 689}
653 690
654/** 691static void
655 * locks_delete_lock - Delete a lock and then free it. 692locks_unlink_lock_ctx(struct file_lock *fl)
656 * @thisfl_p: pointer that points to the fl_next field of the previous
657 * inode->i_flock list entry
658 *
659 * Unlink a lock from all lists and free the namespace reference, but don't
660 * free it yet. Wake up processes that are blocked waiting for this lock and
661 * notify the FS that the lock has been cleared.
662 *
663 * Must be called with the i_lock held!
664 */
665static void locks_unlink_lock(struct file_lock **thisfl_p)
666{ 693{
667 struct file_lock *fl = *thisfl_p;
668
669 locks_delete_global_locks(fl); 694 locks_delete_global_locks(fl);
670 695 list_del_init(&fl->fl_list);
671 *thisfl_p = fl->fl_next;
672 fl->fl_next = NULL;
673
674 if (fl->fl_nspid) { 696 if (fl->fl_nspid) {
675 put_pid(fl->fl_nspid); 697 put_pid(fl->fl_nspid);
676 fl->fl_nspid = NULL; 698 fl->fl_nspid = NULL;
677 } 699 }
678
679 locks_wake_up_blocks(fl); 700 locks_wake_up_blocks(fl);
680} 701}
681 702
682/* 703static void
683 * Unlink a lock from all lists and free it. 704locks_delete_lock_ctx(struct file_lock *fl, struct list_head *dispose)
684 *
685 * Must be called with i_lock held!
686 */
687static void locks_delete_lock(struct file_lock **thisfl_p,
688 struct list_head *dispose)
689{ 705{
690 struct file_lock *fl = *thisfl_p; 706 locks_unlink_lock_ctx(fl);
691
692 locks_unlink_lock(thisfl_p);
693 if (dispose) 707 if (dispose)
694 list_add(&fl->fl_block, dispose); 708 list_add(&fl->fl_list, dispose);
695 else 709 else
696 locks_free_lock(fl); 710 locks_free_lock(fl);
697} 711}
@@ -746,22 +760,27 @@ void
746posix_test_lock(struct file *filp, struct file_lock *fl) 760posix_test_lock(struct file *filp, struct file_lock *fl)
747{ 761{
748 struct file_lock *cfl; 762 struct file_lock *cfl;
763 struct file_lock_context *ctx;
749 struct inode *inode = file_inode(filp); 764 struct inode *inode = file_inode(filp);
750 765
751 spin_lock(&inode->i_lock); 766 ctx = inode->i_flctx;
752 for (cfl = file_inode(filp)->i_flock; cfl; cfl = cfl->fl_next) { 767 if (!ctx || list_empty_careful(&ctx->flc_posix)) {
753 if (!IS_POSIX(cfl))
754 continue;
755 if (posix_locks_conflict(fl, cfl))
756 break;
757 }
758 if (cfl) {
759 locks_copy_conflock(fl, cfl);
760 if (cfl->fl_nspid)
761 fl->fl_pid = pid_vnr(cfl->fl_nspid);
762 } else
763 fl->fl_type = F_UNLCK; 768 fl->fl_type = F_UNLCK;
764 spin_unlock(&inode->i_lock); 769 return;
770 }
771
772 spin_lock(&ctx->flc_lock);
773 list_for_each_entry(cfl, &ctx->flc_posix, fl_list) {
774 if (posix_locks_conflict(fl, cfl)) {
775 locks_copy_conflock(fl, cfl);
776 if (cfl->fl_nspid)
777 fl->fl_pid = pid_vnr(cfl->fl_nspid);
778 goto out;
779 }
780 }
781 fl->fl_type = F_UNLCK;
782out:
783 spin_unlock(&ctx->flc_lock);
765 return; 784 return;
766} 785}
767EXPORT_SYMBOL(posix_test_lock); 786EXPORT_SYMBOL(posix_test_lock);
@@ -845,34 +864,34 @@ static int posix_locks_deadlock(struct file_lock *caller_fl,
845static int flock_lock_file(struct file *filp, struct file_lock *request) 864static int flock_lock_file(struct file *filp, struct file_lock *request)
846{ 865{
847 struct file_lock *new_fl = NULL; 866 struct file_lock *new_fl = NULL;
848 struct file_lock **before; 867 struct file_lock *fl;
849 struct inode * inode = file_inode(filp); 868 struct file_lock_context *ctx;
869 struct inode *inode = file_inode(filp);
850 int error = 0; 870 int error = 0;
851 int found = 0; 871 bool found = false;
852 LIST_HEAD(dispose); 872 LIST_HEAD(dispose);
853 873
874 ctx = locks_get_lock_context(inode);
875 if (!ctx)
876 return -ENOMEM;
877
854 if (!(request->fl_flags & FL_ACCESS) && (request->fl_type != F_UNLCK)) { 878 if (!(request->fl_flags & FL_ACCESS) && (request->fl_type != F_UNLCK)) {
855 new_fl = locks_alloc_lock(); 879 new_fl = locks_alloc_lock();
856 if (!new_fl) 880 if (!new_fl)
857 return -ENOMEM; 881 return -ENOMEM;
858 } 882 }
859 883
860 spin_lock(&inode->i_lock); 884 spin_lock(&ctx->flc_lock);
861 if (request->fl_flags & FL_ACCESS) 885 if (request->fl_flags & FL_ACCESS)
862 goto find_conflict; 886 goto find_conflict;
863 887
864 for_each_lock(inode, before) { 888 list_for_each_entry(fl, &ctx->flc_flock, fl_list) {
865 struct file_lock *fl = *before;
866 if (IS_POSIX(fl))
867 break;
868 if (IS_LEASE(fl))
869 continue;
870 if (filp != fl->fl_file) 889 if (filp != fl->fl_file)
871 continue; 890 continue;
872 if (request->fl_type == fl->fl_type) 891 if (request->fl_type == fl->fl_type)
873 goto out; 892 goto out;
874 found = 1; 893 found = true;
875 locks_delete_lock(before, &dispose); 894 locks_delete_lock_ctx(fl, &dispose);
876 break; 895 break;
877 } 896 }
878 897
@@ -882,23 +901,8 @@ static int flock_lock_file(struct file *filp, struct file_lock *request)
882 goto out; 901 goto out;
883 } 902 }
884 903
885 /*
886 * If a higher-priority process was blocked on the old file lock,
887 * give it the opportunity to lock the file.
888 */
889 if (found) {
890 spin_unlock(&inode->i_lock);
891 cond_resched();
892 spin_lock(&inode->i_lock);
893 }
894
895find_conflict: 904find_conflict:
896 for_each_lock(inode, before) { 905 list_for_each_entry(fl, &ctx->flc_flock, fl_list) {
897 struct file_lock *fl = *before;
898 if (IS_POSIX(fl))
899 break;
900 if (IS_LEASE(fl))
901 continue;
902 if (!flock_locks_conflict(request, fl)) 906 if (!flock_locks_conflict(request, fl))
903 continue; 907 continue;
904 error = -EAGAIN; 908 error = -EAGAIN;
@@ -911,12 +915,12 @@ find_conflict:
911 if (request->fl_flags & FL_ACCESS) 915 if (request->fl_flags & FL_ACCESS)
912 goto out; 916 goto out;
913 locks_copy_lock(new_fl, request); 917 locks_copy_lock(new_fl, request);
914 locks_insert_lock(before, new_fl); 918 locks_insert_lock_ctx(new_fl, &ctx->flc_flock);
915 new_fl = NULL; 919 new_fl = NULL;
916 error = 0; 920 error = 0;
917 921
918out: 922out:
919 spin_unlock(&inode->i_lock); 923 spin_unlock(&ctx->flc_lock);
920 if (new_fl) 924 if (new_fl)
921 locks_free_lock(new_fl); 925 locks_free_lock(new_fl);
922 locks_dispose_list(&dispose); 926 locks_dispose_list(&dispose);
@@ -925,16 +929,20 @@ out:
925 929
926static int __posix_lock_file(struct inode *inode, struct file_lock *request, struct file_lock *conflock) 930static int __posix_lock_file(struct inode *inode, struct file_lock *request, struct file_lock *conflock)
927{ 931{
928 struct file_lock *fl; 932 struct file_lock *fl, *tmp;
929 struct file_lock *new_fl = NULL; 933 struct file_lock *new_fl = NULL;
930 struct file_lock *new_fl2 = NULL; 934 struct file_lock *new_fl2 = NULL;
931 struct file_lock *left = NULL; 935 struct file_lock *left = NULL;
932 struct file_lock *right = NULL; 936 struct file_lock *right = NULL;
933 struct file_lock **before; 937 struct file_lock_context *ctx;
934 int error; 938 int error;
935 bool added = false; 939 bool added = false;
936 LIST_HEAD(dispose); 940 LIST_HEAD(dispose);
937 941
942 ctx = locks_get_lock_context(inode);
943 if (!ctx)
944 return -ENOMEM;
945
938 /* 946 /*
939 * We may need two file_lock structures for this operation, 947 * We may need two file_lock structures for this operation,
940 * so we get them in advance to avoid races. 948 * so we get them in advance to avoid races.
@@ -948,15 +956,14 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
948 new_fl2 = locks_alloc_lock(); 956 new_fl2 = locks_alloc_lock();
949 } 957 }
950 958
951 spin_lock(&inode->i_lock); 959 spin_lock(&ctx->flc_lock);
952 /* 960 /*
953 * New lock request. Walk all POSIX locks and look for conflicts. If 961 * New lock request. Walk all POSIX locks and look for conflicts. If
954 * there are any, either return error or put the request on the 962 * there are any, either return error or put the request on the
955 * blocker's list of waiters and the global blocked_hash. 963 * blocker's list of waiters and the global blocked_hash.
956 */ 964 */
957 if (request->fl_type != F_UNLCK) { 965 if (request->fl_type != F_UNLCK) {
958 for_each_lock(inode, before) { 966 list_for_each_entry(fl, &ctx->flc_posix, fl_list) {
959 fl = *before;
960 if (!IS_POSIX(fl)) 967 if (!IS_POSIX(fl))
961 continue; 968 continue;
962 if (!posix_locks_conflict(request, fl)) 969 if (!posix_locks_conflict(request, fl))
@@ -986,29 +993,25 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
986 if (request->fl_flags & FL_ACCESS) 993 if (request->fl_flags & FL_ACCESS)
987 goto out; 994 goto out;
988 995
989 /* 996 /* Find the first old lock with the same owner as the new lock */
990 * Find the first old lock with the same owner as the new lock. 997 list_for_each_entry(fl, &ctx->flc_posix, fl_list) {
991 */ 998 if (posix_same_owner(request, fl))
992 999 break;
993 before = &inode->i_flock;
994
995 /* First skip locks owned by other processes. */
996 while ((fl = *before) && (!IS_POSIX(fl) ||
997 !posix_same_owner(request, fl))) {
998 before = &fl->fl_next;
999 } 1000 }
1000 1001
1001 /* Process locks with this owner. */ 1002 /* Process locks with this owner. */
1002 while ((fl = *before) && posix_same_owner(request, fl)) { 1003 list_for_each_entry_safe_from(fl, tmp, &ctx->flc_posix, fl_list) {
1003 /* Detect adjacent or overlapping regions (if same lock type) 1004 if (!posix_same_owner(request, fl))
1004 */ 1005 break;
1006
1007 /* Detect adjacent or overlapping regions (if same lock type) */
1005 if (request->fl_type == fl->fl_type) { 1008 if (request->fl_type == fl->fl_type) {
1006 /* In all comparisons of start vs end, use 1009 /* In all comparisons of start vs end, use
1007 * "start - 1" rather than "end + 1". If end 1010 * "start - 1" rather than "end + 1". If end
1008 * is OFFSET_MAX, end + 1 will become negative. 1011 * is OFFSET_MAX, end + 1 will become negative.
1009 */ 1012 */
1010 if (fl->fl_end < request->fl_start - 1) 1013 if (fl->fl_end < request->fl_start - 1)
1011 goto next_lock; 1014 continue;
1012 /* If the next lock in the list has entirely bigger 1015 /* If the next lock in the list has entirely bigger
1013 * addresses than the new one, insert the lock here. 1016 * addresses than the new one, insert the lock here.
1014 */ 1017 */
@@ -1029,18 +1032,17 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
1029 else 1032 else
1030 request->fl_end = fl->fl_end; 1033 request->fl_end = fl->fl_end;
1031 if (added) { 1034 if (added) {
1032 locks_delete_lock(before, &dispose); 1035 locks_delete_lock_ctx(fl, &dispose);
1033 continue; 1036 continue;
1034 } 1037 }
1035 request = fl; 1038 request = fl;
1036 added = true; 1039 added = true;
1037 } 1040 } else {
1038 else {
1039 /* Processing for different lock types is a bit 1041 /* Processing for different lock types is a bit
1040 * more complex. 1042 * more complex.
1041 */ 1043 */
1042 if (fl->fl_end < request->fl_start) 1044 if (fl->fl_end < request->fl_start)
1043 goto next_lock; 1045 continue;
1044 if (fl->fl_start > request->fl_end) 1046 if (fl->fl_start > request->fl_end)
1045 break; 1047 break;
1046 if (request->fl_type == F_UNLCK) 1048 if (request->fl_type == F_UNLCK)
@@ -1059,7 +1061,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
1059 * one (This may happen several times). 1061 * one (This may happen several times).
1060 */ 1062 */
1061 if (added) { 1063 if (added) {
1062 locks_delete_lock(before, &dispose); 1064 locks_delete_lock_ctx(fl, &dispose);
1063 continue; 1065 continue;
1064 } 1066 }
1065 /* 1067 /*
@@ -1075,15 +1077,11 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
1075 locks_copy_lock(new_fl, request); 1077 locks_copy_lock(new_fl, request);
1076 request = new_fl; 1078 request = new_fl;
1077 new_fl = NULL; 1079 new_fl = NULL;
1078 locks_delete_lock(before, &dispose); 1080 locks_insert_lock_ctx(request, &fl->fl_list);
1079 locks_insert_lock(before, request); 1081 locks_delete_lock_ctx(fl, &dispose);
1080 added = true; 1082 added = true;
1081 } 1083 }
1082 } 1084 }
1083 /* Go on to next lock.
1084 */
1085 next_lock:
1086 before = &fl->fl_next;
1087 } 1085 }
1088 1086
1089 /* 1087 /*
@@ -1108,7 +1106,8 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
1108 goto out; 1106 goto out;
1109 } 1107 }
1110 locks_copy_lock(new_fl, request); 1108 locks_copy_lock(new_fl, request);
1111 locks_insert_lock(before, new_fl); 1109 locks_insert_lock_ctx(new_fl, &fl->fl_list);
1110 fl = new_fl;
1112 new_fl = NULL; 1111 new_fl = NULL;
1113 } 1112 }
1114 if (right) { 1113 if (right) {
@@ -1119,7 +1118,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
1119 left = new_fl2; 1118 left = new_fl2;
1120 new_fl2 = NULL; 1119 new_fl2 = NULL;
1121 locks_copy_lock(left, right); 1120 locks_copy_lock(left, right);
1122 locks_insert_lock(before, left); 1121 locks_insert_lock_ctx(left, &fl->fl_list);
1123 } 1122 }
1124 right->fl_start = request->fl_end + 1; 1123 right->fl_start = request->fl_end + 1;
1125 locks_wake_up_blocks(right); 1124 locks_wake_up_blocks(right);
@@ -1129,7 +1128,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
1129 locks_wake_up_blocks(left); 1128 locks_wake_up_blocks(left);
1130 } 1129 }
1131 out: 1130 out:
1132 spin_unlock(&inode->i_lock); 1131 spin_unlock(&ctx->flc_lock);
1133 /* 1132 /*
1134 * Free any unused locks. 1133 * Free any unused locks.
1135 */ 1134 */
@@ -1199,22 +1198,29 @@ EXPORT_SYMBOL(posix_lock_file_wait);
1199 */ 1198 */
1200int locks_mandatory_locked(struct file *file) 1199int locks_mandatory_locked(struct file *file)
1201{ 1200{
1201 int ret;
1202 struct inode *inode = file_inode(file); 1202 struct inode *inode = file_inode(file);
1203 struct file_lock_context *ctx;
1203 struct file_lock *fl; 1204 struct file_lock *fl;
1204 1205
1206 ctx = inode->i_flctx;
1207 if (!ctx || list_empty_careful(&ctx->flc_posix))
1208 return 0;
1209
1205 /* 1210 /*
1206 * Search the lock list for this inode for any POSIX locks. 1211 * Search the lock list for this inode for any POSIX locks.
1207 */ 1212 */
1208 spin_lock(&inode->i_lock); 1213 spin_lock(&ctx->flc_lock);
1209 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { 1214 ret = 0;
1210 if (!IS_POSIX(fl)) 1215 list_for_each_entry(fl, &ctx->flc_posix, fl_list) {
1211 continue;
1212 if (fl->fl_owner != current->files && 1216 if (fl->fl_owner != current->files &&
1213 fl->fl_owner != file) 1217 fl->fl_owner != file) {
1218 ret = -EAGAIN;
1214 break; 1219 break;
1220 }
1215 } 1221 }
1216 spin_unlock(&inode->i_lock); 1222 spin_unlock(&ctx->flc_lock);
1217 return fl ? -EAGAIN : 0; 1223 return ret;
1218} 1224}
1219 1225
1220/** 1226/**
@@ -1294,9 +1300,8 @@ static void lease_clear_pending(struct file_lock *fl, int arg)
1294} 1300}
1295 1301
1296/* We already had a lease on this file; just change its type */ 1302/* We already had a lease on this file; just change its type */
1297int lease_modify(struct file_lock **before, int arg, struct list_head *dispose) 1303int lease_modify(struct file_lock *fl, int arg, struct list_head *dispose)
1298{ 1304{
1299 struct file_lock *fl = *before;
1300 int error = assign_type(fl, arg); 1305 int error = assign_type(fl, arg);
1301 1306
1302 if (error) 1307 if (error)
@@ -1313,7 +1318,7 @@ int lease_modify(struct file_lock **before, int arg, struct list_head *dispose)
1313 printk(KERN_ERR "locks_delete_lock: fasync == %p\n", fl->fl_fasync); 1318 printk(KERN_ERR "locks_delete_lock: fasync == %p\n", fl->fl_fasync);
1314 fl->fl_fasync = NULL; 1319 fl->fl_fasync = NULL;
1315 } 1320 }
1316 locks_delete_lock(before, dispose); 1321 locks_delete_lock_ctx(fl, dispose);
1317 } 1322 }
1318 return 0; 1323 return 0;
1319} 1324}
@@ -1329,25 +1334,24 @@ static bool past_time(unsigned long then)
1329 1334
1330static void time_out_leases(struct inode *inode, struct list_head *dispose) 1335static void time_out_leases(struct inode *inode, struct list_head *dispose)
1331{ 1336{
1332 struct file_lock **before; 1337 struct file_lock_context *ctx = inode->i_flctx;
1333 struct file_lock *fl; 1338 struct file_lock *fl, *tmp;
1334 1339
1335 lockdep_assert_held(&inode->i_lock); 1340 lockdep_assert_held(&ctx->flc_lock);
1336 1341
1337 before = &inode->i_flock; 1342 list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list) {
1338 while ((fl = *before) && IS_LEASE(fl) && lease_breaking(fl)) {
1339 trace_time_out_leases(inode, fl); 1343 trace_time_out_leases(inode, fl);
1340 if (past_time(fl->fl_downgrade_time)) 1344 if (past_time(fl->fl_downgrade_time))
1341 lease_modify(before, F_RDLCK, dispose); 1345 lease_modify(fl, F_RDLCK, dispose);
1342 if (past_time(fl->fl_break_time)) 1346 if (past_time(fl->fl_break_time))
1343 lease_modify(before, F_UNLCK, dispose); 1347 lease_modify(fl, F_UNLCK, dispose);
1344 if (fl == *before) /* lease_modify may have freed fl */
1345 before = &fl->fl_next;
1346 } 1348 }
1347} 1349}
1348 1350
1349static bool leases_conflict(struct file_lock *lease, struct file_lock *breaker) 1351static bool leases_conflict(struct file_lock *lease, struct file_lock *breaker)
1350{ 1352{
1353 if ((breaker->fl_flags & FL_LAYOUT) != (lease->fl_flags & FL_LAYOUT))
1354 return false;
1351 if ((breaker->fl_flags & FL_DELEG) && (lease->fl_flags & FL_LEASE)) 1355 if ((breaker->fl_flags & FL_DELEG) && (lease->fl_flags & FL_LEASE))
1352 return false; 1356 return false;
1353 return locks_conflict(breaker, lease); 1357 return locks_conflict(breaker, lease);
@@ -1356,11 +1360,12 @@ static bool leases_conflict(struct file_lock *lease, struct file_lock *breaker)
1356static bool 1360static bool
1357any_leases_conflict(struct inode *inode, struct file_lock *breaker) 1361any_leases_conflict(struct inode *inode, struct file_lock *breaker)
1358{ 1362{
1363 struct file_lock_context *ctx = inode->i_flctx;
1359 struct file_lock *fl; 1364 struct file_lock *fl;
1360 1365
1361 lockdep_assert_held(&inode->i_lock); 1366 lockdep_assert_held(&ctx->flc_lock);
1362 1367
1363 for (fl = inode->i_flock ; fl && IS_LEASE(fl); fl = fl->fl_next) { 1368 list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
1364 if (leases_conflict(fl, breaker)) 1369 if (leases_conflict(fl, breaker))
1365 return true; 1370 return true;
1366 } 1371 }
@@ -1384,7 +1389,8 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
1384{ 1389{
1385 int error = 0; 1390 int error = 0;
1386 struct file_lock *new_fl; 1391 struct file_lock *new_fl;
1387 struct file_lock *fl, **before; 1392 struct file_lock_context *ctx = inode->i_flctx;
1393 struct file_lock *fl;
1388 unsigned long break_time; 1394 unsigned long break_time;
1389 int want_write = (mode & O_ACCMODE) != O_RDONLY; 1395 int want_write = (mode & O_ACCMODE) != O_RDONLY;
1390 LIST_HEAD(dispose); 1396 LIST_HEAD(dispose);
@@ -1394,7 +1400,13 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
1394 return PTR_ERR(new_fl); 1400 return PTR_ERR(new_fl);
1395 new_fl->fl_flags = type; 1401 new_fl->fl_flags = type;
1396 1402
1397 spin_lock(&inode->i_lock); 1403 /* typically we will check that ctx is non-NULL before calling */
1404 if (!ctx) {
1405 WARN_ON_ONCE(1);
1406 return error;
1407 }
1408
1409 spin_lock(&ctx->flc_lock);
1398 1410
1399 time_out_leases(inode, &dispose); 1411 time_out_leases(inode, &dispose);
1400 1412
@@ -1408,9 +1420,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
1408 break_time++; /* so that 0 means no break time */ 1420 break_time++; /* so that 0 means no break time */
1409 } 1421 }
1410 1422
1411 for (before = &inode->i_flock; 1423 list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
1412 ((fl = *before) != NULL) && IS_LEASE(fl);
1413 before = &fl->fl_next) {
1414 if (!leases_conflict(fl, new_fl)) 1424 if (!leases_conflict(fl, new_fl))
1415 continue; 1425 continue;
1416 if (want_write) { 1426 if (want_write) {
@@ -1419,17 +1429,16 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
1419 fl->fl_flags |= FL_UNLOCK_PENDING; 1429 fl->fl_flags |= FL_UNLOCK_PENDING;
1420 fl->fl_break_time = break_time; 1430 fl->fl_break_time = break_time;
1421 } else { 1431 } else {
1422 if (lease_breaking(inode->i_flock)) 1432 if (lease_breaking(fl))
1423 continue; 1433 continue;
1424 fl->fl_flags |= FL_DOWNGRADE_PENDING; 1434 fl->fl_flags |= FL_DOWNGRADE_PENDING;
1425 fl->fl_downgrade_time = break_time; 1435 fl->fl_downgrade_time = break_time;
1426 } 1436 }
1427 if (fl->fl_lmops->lm_break(fl)) 1437 if (fl->fl_lmops->lm_break(fl))
1428 locks_delete_lock(before, &dispose); 1438 locks_delete_lock_ctx(fl, &dispose);
1429 } 1439 }
1430 1440
1431 fl = inode->i_flock; 1441 if (list_empty(&ctx->flc_lease))
1432 if (!fl || !IS_LEASE(fl))
1433 goto out; 1442 goto out;
1434 1443
1435 if (mode & O_NONBLOCK) { 1444 if (mode & O_NONBLOCK) {
@@ -1439,18 +1448,19 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
1439 } 1448 }
1440 1449
1441restart: 1450restart:
1442 break_time = inode->i_flock->fl_break_time; 1451 fl = list_first_entry(&ctx->flc_lease, struct file_lock, fl_list);
1452 break_time = fl->fl_break_time;
1443 if (break_time != 0) 1453 if (break_time != 0)
1444 break_time -= jiffies; 1454 break_time -= jiffies;
1445 if (break_time == 0) 1455 if (break_time == 0)
1446 break_time++; 1456 break_time++;
1447 locks_insert_block(inode->i_flock, new_fl); 1457 locks_insert_block(fl, new_fl);
1448 trace_break_lease_block(inode, new_fl); 1458 trace_break_lease_block(inode, new_fl);
1449 spin_unlock(&inode->i_lock); 1459 spin_unlock(&ctx->flc_lock);
1450 locks_dispose_list(&dispose); 1460 locks_dispose_list(&dispose);
1451 error = wait_event_interruptible_timeout(new_fl->fl_wait, 1461 error = wait_event_interruptible_timeout(new_fl->fl_wait,
1452 !new_fl->fl_next, break_time); 1462 !new_fl->fl_next, break_time);
1453 spin_lock(&inode->i_lock); 1463 spin_lock(&ctx->flc_lock);
1454 trace_break_lease_unblock(inode, new_fl); 1464 trace_break_lease_unblock(inode, new_fl);
1455 locks_delete_block(new_fl); 1465 locks_delete_block(new_fl);
1456 if (error >= 0) { 1466 if (error >= 0) {
@@ -1462,12 +1472,10 @@ restart:
1462 time_out_leases(inode, &dispose); 1472 time_out_leases(inode, &dispose);
1463 if (any_leases_conflict(inode, new_fl)) 1473 if (any_leases_conflict(inode, new_fl))
1464 goto restart; 1474 goto restart;
1465
1466 error = 0; 1475 error = 0;
1467 } 1476 }
1468
1469out: 1477out:
1470 spin_unlock(&inode->i_lock); 1478 spin_unlock(&ctx->flc_lock);
1471 locks_dispose_list(&dispose); 1479 locks_dispose_list(&dispose);
1472 locks_free_lock(new_fl); 1480 locks_free_lock(new_fl);
1473 return error; 1481 return error;
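
For readers tracking the conversion above: the old code kept every lock and lease on the single inode->i_flock list, chained through fl_next and protected by inode->i_lock. The new code hangs a struct file_lock_context off the inode, with one list per lock type and its own spinlock. A minimal sketch of the shape __break_lease() now relies on (illustrative only; the real definition lives in include/linux/fs.h and may carry additional fields):

        struct file_lock_context {
                spinlock_t              flc_lock;       /* guards all three lists */
                struct list_head        flc_flock;      /* flock(2) locks */
                struct list_head        flc_posix;      /* POSIX byte-range locks */
                struct list_head        flc_lease;      /* leases and delegations */
        };

        /* the lease-break scan becomes a plain list walk under flc_lock: */
        spin_lock(&ctx->flc_lock);
        list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
                if (leases_conflict(fl, new_fl))
                        fl->fl_flags |= FL_UNLOCK_PENDING;      /* or downgrade */
        }
        spin_unlock(&ctx->flc_lock);
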
@@ -1487,14 +1495,18 @@ EXPORT_SYMBOL(__break_lease);
1487void lease_get_mtime(struct inode *inode, struct timespec *time) 1495void lease_get_mtime(struct inode *inode, struct timespec *time)
1488{ 1496{
1489 bool has_lease = false; 1497 bool has_lease = false;
1490 struct file_lock *flock; 1498 struct file_lock_context *ctx = inode->i_flctx;
1499 struct file_lock *fl;
1491 1500
1492 if (inode->i_flock) { 1501 if (ctx && !list_empty_careful(&ctx->flc_lease)) {
1493 spin_lock(&inode->i_lock); 1502 spin_lock(&ctx->flc_lock);
1494 flock = inode->i_flock; 1503 if (!list_empty(&ctx->flc_lease)) {
1495 if (flock && IS_LEASE(flock) && (flock->fl_type == F_WRLCK)) 1504 fl = list_first_entry(&ctx->flc_lease,
1496 has_lease = true; 1505 struct file_lock, fl_list);
1497 spin_unlock(&inode->i_lock); 1506 if (fl->fl_type == F_WRLCK)
1507 has_lease = true;
1508 }
1509 spin_unlock(&ctx->flc_lock);
1498 } 1510 }
1499 1511
1500 if (has_lease) 1512 if (has_lease)
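
The lease_get_mtime() rewrite shows the double-check idiom used throughout this series: list_empty_careful() is a lockless peek that lets the common no-leases case skip the spinlock entirely, and emptiness is re-tested under flc_lock before the first entry is dereferenced. Condensed, with ctx read from inode->i_flctx:

        if (ctx && !list_empty_careful(&ctx->flc_lease)) {
                spin_lock(&ctx->flc_lock);
                if (!list_empty(&ctx->flc_lease)) {
                        /* the first entry is now stable and safe to inspect */
                }
                spin_unlock(&ctx->flc_lock);
        }
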
@@ -1532,20 +1544,22 @@ int fcntl_getlease(struct file *filp)
1532{ 1544{
1533 struct file_lock *fl; 1545 struct file_lock *fl;
1534 struct inode *inode = file_inode(filp); 1546 struct inode *inode = file_inode(filp);
1547 struct file_lock_context *ctx = inode->i_flctx;
1535 int type = F_UNLCK; 1548 int type = F_UNLCK;
1536 LIST_HEAD(dispose); 1549 LIST_HEAD(dispose);
1537 1550
1538 spin_lock(&inode->i_lock); 1551 if (ctx && !list_empty_careful(&ctx->flc_lease)) {
1539 time_out_leases(file_inode(filp), &dispose); 1552 spin_lock(&ctx->flc_lock);
1540 for (fl = file_inode(filp)->i_flock; fl && IS_LEASE(fl); 1553 time_out_leases(file_inode(filp), &dispose);
1541 fl = fl->fl_next) { 1554 list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
1542 if (fl->fl_file == filp) { 1555 if (fl->fl_file != filp)
1556 continue;
1543 type = target_leasetype(fl); 1557 type = target_leasetype(fl);
1544 break; 1558 break;
1545 } 1559 }
1560 spin_unlock(&ctx->flc_lock);
1561 locks_dispose_list(&dispose);
1546 } 1562 }
1547 spin_unlock(&inode->i_lock);
1548 locks_dispose_list(&dispose);
1549 return type; 1563 return type;
1550} 1564}
1551 1565
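
fcntl_getlease() also illustrates the dispose-list pattern used in these hunks: time_out_leases() unlinks expired leases onto a caller-provided list while flc_lock is held, and locks_dispose_list() frees them only after the lock is dropped, presumably because the locks' release callbacks must not run under the spinlock. Roughly:

        LIST_HEAD(dispose);

        spin_lock(&ctx->flc_lock);
        time_out_leases(inode, &dispose);       /* unlink expired leases */
        /* ... inspect the surviving leases ... */
        spin_unlock(&ctx->flc_lock);
        locks_dispose_list(&dispose);           /* free outside the lock */
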
@@ -1560,11 +1574,14 @@ int fcntl_getlease(struct file *filp)
1560 * conflict with the lease we're trying to set. 1574 * conflict with the lease we're trying to set.
1561 */ 1575 */
1562static int 1576static int
1563check_conflicting_open(const struct dentry *dentry, const long arg) 1577check_conflicting_open(const struct dentry *dentry, const long arg, int flags)
1564{ 1578{
1565 int ret = 0; 1579 int ret = 0;
1566 struct inode *inode = dentry->d_inode; 1580 struct inode *inode = dentry->d_inode;
1567 1581
1582 if (flags & FL_LAYOUT)
1583 return 0;
1584
1568 if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0)) 1585 if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0))
1569 return -EAGAIN; 1586 return -EAGAIN;
1570 1587
@@ -1578,9 +1595,10 @@ check_conflicting_open(const struct dentry *dentry, const long arg)
1578static int 1595static int
1579generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **priv) 1596generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **priv)
1580{ 1597{
1581 struct file_lock *fl, **before, **my_before = NULL, *lease; 1598 struct file_lock *fl, *my_fl = NULL, *lease;
1582 struct dentry *dentry = filp->f_path.dentry; 1599 struct dentry *dentry = filp->f_path.dentry;
1583 struct inode *inode = dentry->d_inode; 1600 struct inode *inode = dentry->d_inode;
1601 struct file_lock_context *ctx;
1584 bool is_deleg = (*flp)->fl_flags & FL_DELEG; 1602 bool is_deleg = (*flp)->fl_flags & FL_DELEG;
1585 int error; 1603 int error;
1586 LIST_HEAD(dispose); 1604 LIST_HEAD(dispose);
@@ -1588,6 +1606,10 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **priv)
1588 lease = *flp; 1606 lease = *flp;
1589 trace_generic_add_lease(inode, lease); 1607 trace_generic_add_lease(inode, lease);
1590 1608
1609 ctx = locks_get_lock_context(inode);
1610 if (!ctx)
1611 return -ENOMEM;
1612
1591 /* 1613 /*
1592 * In the delegation case we need mutual exclusion with 1614 * In the delegation case we need mutual exclusion with
1593 * a number of operations that take the i_mutex. We trylock 1615 * a number of operations that take the i_mutex. We trylock
@@ -1606,9 +1628,9 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **priv)
1606 return -EINVAL; 1628 return -EINVAL;
1607 } 1629 }
1608 1630
1609 spin_lock(&inode->i_lock); 1631 spin_lock(&ctx->flc_lock);
1610 time_out_leases(inode, &dispose); 1632 time_out_leases(inode, &dispose);
1611 error = check_conflicting_open(dentry, arg); 1633 error = check_conflicting_open(dentry, arg, lease->fl_flags);
1612 if (error) 1634 if (error)
1613 goto out; 1635 goto out;
1614 1636
@@ -1621,13 +1643,13 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **priv)
1621 * except for this filp. 1643 * except for this filp.
1622 */ 1644 */
1623 error = -EAGAIN; 1645 error = -EAGAIN;
1624 for (before = &inode->i_flock; 1646 list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
1625 ((fl = *before) != NULL) && IS_LEASE(fl); 1647 if (fl->fl_file == filp &&
1626 before = &fl->fl_next) { 1648 fl->fl_owner == lease->fl_owner) {
1627 if (fl->fl_file == filp) { 1649 my_fl = fl;
1628 my_before = before;
1629 continue; 1650 continue;
1630 } 1651 }
1652
1631 /* 1653 /*
1632 * No exclusive leases if someone else has a lease on 1654 * No exclusive leases if someone else has a lease on
1633 * this file: 1655 * this file:
@@ -1642,9 +1664,8 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **priv)
1642 goto out; 1664 goto out;
1643 } 1665 }
1644 1666
1645 if (my_before != NULL) { 1667 if (my_fl != NULL) {
1646 lease = *my_before; 1668 error = lease->fl_lmops->lm_change(my_fl, arg, &dispose);
1647 error = lease->fl_lmops->lm_change(my_before, arg, &dispose);
1648 if (error) 1669 if (error)
1649 goto out; 1670 goto out;
1650 goto out_setup; 1671 goto out_setup;
@@ -1654,7 +1675,7 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **priv)
1654 if (!leases_enable) 1675 if (!leases_enable)
1655 goto out; 1676 goto out;
1656 1677
1657 locks_insert_lock(before, lease); 1678 locks_insert_lock_ctx(lease, &ctx->flc_lease);
1658 /* 1679 /*
1659 * The check in break_lease() is lockless. It's possible for another 1680 * The check in break_lease() is lockless. It's possible for another
1660 * open to race in after we did the earlier check for a conflicting 1681 * open to race in after we did the earlier check for a conflicting
@@ -1665,46 +1686,51 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **priv)
1665 * precedes these checks. 1686 * precedes these checks.
1666 */ 1687 */
1667 smp_mb(); 1688 smp_mb();
1668 error = check_conflicting_open(dentry, arg); 1689 error = check_conflicting_open(dentry, arg, lease->fl_flags);
1669 if (error) 1690 if (error) {
1670 goto out_unlink; 1691 locks_unlink_lock_ctx(lease);
1692 goto out;
1693 }
1671 1694
1672out_setup: 1695out_setup:
1673 if (lease->fl_lmops->lm_setup) 1696 if (lease->fl_lmops->lm_setup)
1674 lease->fl_lmops->lm_setup(lease, priv); 1697 lease->fl_lmops->lm_setup(lease, priv);
1675out: 1698out:
1676 spin_unlock(&inode->i_lock); 1699 spin_unlock(&ctx->flc_lock);
1677 locks_dispose_list(&dispose); 1700 locks_dispose_list(&dispose);
1678 if (is_deleg) 1701 if (is_deleg)
1679 mutex_unlock(&inode->i_mutex); 1702 mutex_unlock(&inode->i_mutex);
1680 if (!error && !my_before) 1703 if (!error && !my_fl)
1681 *flp = NULL; 1704 *flp = NULL;
1682 return error; 1705 return error;
1683out_unlink:
1684 locks_unlink_lock(before);
1685 goto out;
1686} 1706}
1687 1707
1688static int generic_delete_lease(struct file *filp) 1708static int generic_delete_lease(struct file *filp, void *owner)
1689{ 1709{
1690 int error = -EAGAIN; 1710 int error = -EAGAIN;
1691 struct file_lock *fl, **before; 1711 struct file_lock *fl, *victim = NULL;
1692 struct dentry *dentry = filp->f_path.dentry; 1712 struct dentry *dentry = filp->f_path.dentry;
1693 struct inode *inode = dentry->d_inode; 1713 struct inode *inode = dentry->d_inode;
1714 struct file_lock_context *ctx = inode->i_flctx;
1694 LIST_HEAD(dispose); 1715 LIST_HEAD(dispose);
1695 1716
1696 spin_lock(&inode->i_lock); 1717 if (!ctx) {
1697 time_out_leases(inode, &dispose); 1718 trace_generic_delete_lease(inode, NULL);
1698 for (before = &inode->i_flock; 1719 return error;
1699 ((fl = *before) != NULL) && IS_LEASE(fl); 1720 }
1700 before = &fl->fl_next) { 1721
1701 if (fl->fl_file == filp) 1722 spin_lock(&ctx->flc_lock);
1723 list_for_each_entry(fl, &ctx->flc_lease, fl_list) {
1724 if (fl->fl_file == filp &&
1725 fl->fl_owner == owner) {
1726 victim = fl;
1702 break; 1727 break;
1728 }
1703 } 1729 }
1704 trace_generic_delete_lease(inode, fl); 1730 trace_generic_delete_lease(inode, fl);
1705 if (fl) 1731 if (victim)
1706 error = fl->fl_lmops->lm_change(before, F_UNLCK, &dispose); 1732 error = fl->fl_lmops->lm_change(victim, F_UNLCK, &dispose);
1707 spin_unlock(&inode->i_lock); 1733 spin_unlock(&ctx->flc_lock);
1708 locks_dispose_list(&dispose); 1734 locks_dispose_list(&dispose);
1709 return error; 1735 return error;
1710} 1736}
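
generic_delete_lease() now takes an owner cookie and matches on fl_file and fl_owner together, so two lease-like objects on the same struct file can be torn down independently (the FL_LAYOUT exemption added to check_conflicting_open() points at the intended user: pNFS layouts riding on the lease machinery). The fcntl() path below supplies the file itself as the cookie:

        /* from fcntl_setlease(): unlock identifies the lease by filp */
        if (arg == F_UNLCK)
                return vfs_setlease(filp, F_UNLCK, NULL, (void **)&filp);
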
@@ -1737,13 +1763,14 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp,
1737 1763
1738 switch (arg) { 1764 switch (arg) {
1739 case F_UNLCK: 1765 case F_UNLCK:
1740 return generic_delete_lease(filp); 1766 return generic_delete_lease(filp, *priv);
1741 case F_RDLCK: 1767 case F_RDLCK:
1742 case F_WRLCK: 1768 case F_WRLCK:
1743 if (!(*flp)->fl_lmops->lm_break) { 1769 if (!(*flp)->fl_lmops->lm_break) {
1744 WARN_ON_ONCE(1); 1770 WARN_ON_ONCE(1);
1745 return -ENOLCK; 1771 return -ENOLCK;
1746 } 1772 }
1773
1747 return generic_add_lease(filp, arg, flp, priv); 1774 return generic_add_lease(filp, arg, flp, priv);
1748 default: 1775 default:
1749 return -EINVAL; 1776 return -EINVAL;
@@ -1816,7 +1843,7 @@ static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg)
1816int fcntl_setlease(unsigned int fd, struct file *filp, long arg) 1843int fcntl_setlease(unsigned int fd, struct file *filp, long arg)
1817{ 1844{
1818 if (arg == F_UNLCK) 1845 if (arg == F_UNLCK)
1819 return vfs_setlease(filp, F_UNLCK, NULL, NULL); 1846 return vfs_setlease(filp, F_UNLCK, NULL, (void **)&filp);
1820 return do_fcntl_add_lease(fd, filp, arg); 1847 return do_fcntl_add_lease(fd, filp, arg);
1821} 1848}
1822 1849
@@ -2171,7 +2198,7 @@ again:
2171 */ 2198 */
2172 /* 2199 /*
2173 * we need that spin_lock here - it prevents reordering between 2200 * we need that spin_lock here - it prevents reordering between
2174 * update of inode->i_flock and check for it done in close(). 2201 * update of i_flctx->flc_posix and check for it done in close().
2175 * rcu_read_lock() wouldn't do. 2202 * rcu_read_lock() wouldn't do.
2176 */ 2203 */
2177 spin_lock(&current->files->file_lock); 2204 spin_lock(&current->files->file_lock);
@@ -2331,13 +2358,14 @@ out:
2331void locks_remove_posix(struct file *filp, fl_owner_t owner) 2358void locks_remove_posix(struct file *filp, fl_owner_t owner)
2332{ 2359{
2333 struct file_lock lock; 2360 struct file_lock lock;
2361 struct file_lock_context *ctx = file_inode(filp)->i_flctx;
2334 2362
2335 /* 2363 /*
2336 * If there are no locks held on this file, we don't need to call 2364 * If there are no locks held on this file, we don't need to call
2337 * posix_lock_file(). Another process could be setting a lock on this 2365 * posix_lock_file(). Another process could be setting a lock on this
2338 * file at the same time, but we wouldn't remove that lock anyway. 2366 * file at the same time, but we wouldn't remove that lock anyway.
2339 */ 2367 */
2340 if (!file_inode(filp)->i_flock) 2368 if (!ctx || list_empty(&ctx->flc_posix))
2341 return; 2369 return;
2342 2370
2343 lock.fl_type = F_UNLCK; 2371 lock.fl_type = F_UNLCK;
@@ -2358,67 +2386,68 @@ void locks_remove_posix(struct file *filp, fl_owner_t owner)
2358 2386
2359EXPORT_SYMBOL(locks_remove_posix); 2387EXPORT_SYMBOL(locks_remove_posix);
2360 2388
2389/* The i_flctx must be valid when calling into here */
2390static void
2391locks_remove_flock(struct file *filp)
2392{
2393 struct file_lock fl = {
2394 .fl_owner = filp,
2395 .fl_pid = current->tgid,
2396 .fl_file = filp,
2397 .fl_flags = FL_FLOCK,
2398 .fl_type = F_UNLCK,
2399 .fl_end = OFFSET_MAX,
2400 };
2401 struct file_lock_context *flctx = file_inode(filp)->i_flctx;
2402
2403 if (list_empty(&flctx->flc_flock))
2404 return;
2405
2406 if (filp->f_op->flock)
2407 filp->f_op->flock(filp, F_SETLKW, &fl);
2408 else
2409 flock_lock_file(filp, &fl);
2410
2411 if (fl.fl_ops && fl.fl_ops->fl_release_private)
2412 fl.fl_ops->fl_release_private(&fl);
2413}
2414
2415/* The i_flctx must be valid when calling into here */
2416static void
2417locks_remove_lease(struct file *filp)
2418{
2419 struct inode *inode = file_inode(filp);
2420 struct file_lock_context *ctx = inode->i_flctx;
2421 struct file_lock *fl, *tmp;
2422 LIST_HEAD(dispose);
2423
2424 if (list_empty(&ctx->flc_lease))
2425 return;
2426
2427 spin_lock(&ctx->flc_lock);
2428 list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list)
2429 if (filp == fl->fl_file)
2430 lease_modify(fl, F_UNLCK, &dispose);
2431 spin_unlock(&ctx->flc_lock);
2432 locks_dispose_list(&dispose);
2433}
2434
2361/* 2435/*
2362 * This function is called on the last close of an open file. 2436 * This function is called on the last close of an open file.
2363 */ 2437 */
2364void locks_remove_file(struct file *filp) 2438void locks_remove_file(struct file *filp)
2365{ 2439{
2366 struct inode * inode = file_inode(filp); 2440 if (!file_inode(filp)->i_flctx)
2367 struct file_lock *fl;
2368 struct file_lock **before;
2369 LIST_HEAD(dispose);
2370
2371 if (!inode->i_flock)
2372 return; 2441 return;
2373 2442
2443 /* remove any OFD locks */
2374 locks_remove_posix(filp, filp); 2444 locks_remove_posix(filp, filp);
2375 2445
2376 if (filp->f_op->flock) { 2446 /* remove flock locks */
2377 struct file_lock fl = { 2447 locks_remove_flock(filp);
2378 .fl_owner = filp,
2379 .fl_pid = current->tgid,
2380 .fl_file = filp,
2381 .fl_flags = FL_FLOCK,
2382 .fl_type = F_UNLCK,
2383 .fl_end = OFFSET_MAX,
2384 };
2385 filp->f_op->flock(filp, F_SETLKW, &fl);
2386 if (fl.fl_ops && fl.fl_ops->fl_release_private)
2387 fl.fl_ops->fl_release_private(&fl);
2388 }
2389
2390 spin_lock(&inode->i_lock);
2391 before = &inode->i_flock;
2392 2448
2393 while ((fl = *before) != NULL) { 2449 /* remove any leases */
2394 if (fl->fl_file == filp) { 2450 locks_remove_lease(filp);
2395 if (IS_LEASE(fl)) {
2396 lease_modify(before, F_UNLCK, &dispose);
2397 continue;
2398 }
2399
2400 /*
2401 * There's a leftover lock on the list of a type that
2402 * we didn't expect to see. Most likely a classic
2403 * POSIX lock that ended up not getting released
2404 * properly, or that raced onto the list somehow. Log
2405 * some info about it and then just remove it from
2406 * the list.
2407 */
2408 WARN(!IS_FLOCK(fl),
2409 "leftover lock: dev=%u:%u ino=%lu type=%hhd flags=0x%x start=%lld end=%lld\n",
2410 MAJOR(inode->i_sb->s_dev),
2411 MINOR(inode->i_sb->s_dev), inode->i_ino,
2412 fl->fl_type, fl->fl_flags,
2413 fl->fl_start, fl->fl_end);
2414
2415 locks_delete_lock(before, &dispose);
2416 continue;
2417 }
2418 before = &fl->fl_next;
2419 }
2420 spin_unlock(&inode->i_lock);
2421 locks_dispose_list(&dispose);
2422} 2451}
2423 2452
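
With flock locks, POSIX locks and leases now living on separate typed lists, locks_remove_file() no longer needs the old catch-all walk and its "leftover lock" WARN: each helper only ever sees its own lock type. Note that locks_remove_lease() uses the _safe list iterator, since lease_modify() may unlink the entry being visited:

        list_for_each_entry_safe(fl, tmp, &ctx->flc_lease, fl_list)
                if (filp == fl->fl_file)
                        lease_modify(fl, F_UNLCK, &dispose);    /* may delete fl */
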
2424/** 2453/**
@@ -2621,6 +2650,9 @@ static int __init filelock_init(void)
2621{ 2650{
2622 int i; 2651 int i;
2623 2652
2653 flctx_cache = kmem_cache_create("file_lock_ctx",
2654 sizeof(struct file_lock_context), 0, SLAB_PANIC, NULL);
2655
2624 filelock_cache = kmem_cache_create("file_lock_cache", 2656 filelock_cache = kmem_cache_create("file_lock_cache",
2625 sizeof(struct file_lock), 0, SLAB_PANIC, NULL); 2657 sizeof(struct file_lock), 0, SLAB_PANIC, NULL);
2626 2658
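
filelock_init() now carves out a second slab for the contexts; SLAB_PANIC means boot fails loudly if either cache cannot be created, which is why no error check follows. The context itself is allocated lazily on first use. A sketch of what locks_get_lock_context(), which sits outside these hunks, plausibly does (the cmpxchg publication step is an assumption here):

        ctx = kmem_cache_alloc(flctx_cache, GFP_KERNEL);
        if (ctx) {
                spin_lock_init(&ctx->flc_lock);
                INIT_LIST_HEAD(&ctx->flc_flock);
                INIT_LIST_HEAD(&ctx->flc_posix);
                INIT_LIST_HEAD(&ctx->flc_lease);
                /* publish to inode->i_flctx with cmpxchg(); a racing
                 * allocator frees its copy and adopts the winner's */
        }
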
diff --git a/fs/mount.h b/fs/mount.h
index 0ad6f760ce52..6a61c2b3e385 100644
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -2,6 +2,7 @@
2#include <linux/seq_file.h> 2#include <linux/seq_file.h>
3#include <linux/poll.h> 3#include <linux/poll.h>
4#include <linux/ns_common.h> 4#include <linux/ns_common.h>
5#include <linux/fs_pin.h>
5 6
6struct mnt_namespace { 7struct mnt_namespace {
7 atomic_t count; 8 atomic_t count;
@@ -62,7 +63,8 @@ struct mount {
62 int mnt_group_id; /* peer group identifier */ 63 int mnt_group_id; /* peer group identifier */
63 int mnt_expiry_mark; /* true if marked for expiry */ 64 int mnt_expiry_mark; /* true if marked for expiry */
64 struct hlist_head mnt_pins; 65 struct hlist_head mnt_pins;
65 struct path mnt_ex_mountpoint; 66 struct fs_pin mnt_umount;
67 struct dentry *mnt_ex_mountpoint;
66}; 68};
67 69
68#define MNT_NS_INTERNAL ERR_PTR(-EINVAL) /* distinct from any mnt_namespace */ 70#define MNT_NS_INTERNAL ERR_PTR(-EINVAL) /* distinct from any mnt_namespace */
diff --git a/fs/namei.c b/fs/namei.c
index bc35b02883bb..96ca11dea4a2 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -118,15 +118,6 @@
118 * POSIX.1 2.4: an empty pathname is invalid (ENOENT). 118 * POSIX.1 2.4: an empty pathname is invalid (ENOENT).
119 * PATH_MAX includes the nul terminator --RR. 119 * PATH_MAX includes the nul terminator --RR.
120 */ 120 */
121void final_putname(struct filename *name)
122{
123 if (name->separate) {
124 __putname(name->name);
125 kfree(name);
126 } else {
127 __putname(name);
128 }
129}
130 121
131#define EMBEDDED_NAME_MAX (PATH_MAX - sizeof(struct filename)) 122#define EMBEDDED_NAME_MAX (PATH_MAX - sizeof(struct filename))
132 123
@@ -145,6 +136,7 @@ getname_flags(const char __user *filename, int flags, int *empty)
145 result = __getname(); 136 result = __getname();
146 if (unlikely(!result)) 137 if (unlikely(!result))
147 return ERR_PTR(-ENOMEM); 138 return ERR_PTR(-ENOMEM);
139 result->refcnt = 1;
148 140
149 /* 141 /*
150 * First, try to embed the struct filename inside the names_cache 142 * First, try to embed the struct filename inside the names_cache
@@ -179,6 +171,7 @@ recopy:
179 } 171 }
180 result->name = kname; 172 result->name = kname;
181 result->separate = true; 173 result->separate = true;
174 result->refcnt = 1;
182 max = PATH_MAX; 175 max = PATH_MAX;
183 goto recopy; 176 goto recopy;
184 } 177 }
@@ -202,7 +195,7 @@ recopy:
202 return result; 195 return result;
203 196
204error: 197error:
205 final_putname(result); 198 putname(result);
206 return err; 199 return err;
207} 200}
208 201
@@ -212,43 +205,56 @@ getname(const char __user * filename)
212 return getname_flags(filename, 0, NULL); 205 return getname_flags(filename, 0, NULL);
213} 206}
214 207
215/*
216 * The "getname_kernel()" interface doesn't do pathnames longer
217 * than EMBEDDED_NAME_MAX. Deal with it - you're a kernel user.
218 */
219struct filename * 208struct filename *
220getname_kernel(const char * filename) 209getname_kernel(const char * filename)
221{ 210{
222 struct filename *result; 211 struct filename *result;
223 char *kname; 212 int len = strlen(filename) + 1;
224 int len;
225
226 len = strlen(filename);
227 if (len >= EMBEDDED_NAME_MAX)
228 return ERR_PTR(-ENAMETOOLONG);
229 213
230 result = __getname(); 214 result = __getname();
231 if (unlikely(!result)) 215 if (unlikely(!result))
232 return ERR_PTR(-ENOMEM); 216 return ERR_PTR(-ENOMEM);
233 217
234 kname = (char *)result + sizeof(*result); 218 if (len <= EMBEDDED_NAME_MAX) {
235 result->name = kname; 219 result->name = (char *)(result) + sizeof(*result);
220 result->separate = false;
221 } else if (len <= PATH_MAX) {
222 struct filename *tmp;
223
224 tmp = kmalloc(sizeof(*tmp), GFP_KERNEL);
225 if (unlikely(!tmp)) {
226 __putname(result);
227 return ERR_PTR(-ENOMEM);
228 }
229 tmp->name = (char *)result;
230 tmp->separate = true;
231 result = tmp;
232 } else {
233 __putname(result);
234 return ERR_PTR(-ENAMETOOLONG);
235 }
236 memcpy((char *)result->name, filename, len);
236 result->uptr = NULL; 237 result->uptr = NULL;
237 result->aname = NULL; 238 result->aname = NULL;
238 result->separate = false; 239 result->refcnt = 1;
240 audit_getname(result);
239 241
240 strlcpy(kname, filename, EMBEDDED_NAME_MAX);
241 return result; 242 return result;
242} 243}
243 244
244#ifdef CONFIG_AUDITSYSCALL
245void putname(struct filename *name) 245void putname(struct filename *name)
246{ 246{
247 if (unlikely(!audit_dummy_context())) 247 BUG_ON(name->refcnt <= 0);
248 return audit_putname(name); 248
249 final_putname(name); 249 if (--name->refcnt > 0)
250 return;
251
252 if (name->separate) {
253 __putname(name->name);
254 kfree(name);
255 } else
256 __putname(name);
250} 257}
251#endif
252 258
253static int check_acl(struct inode *inode, int mask) 259static int check_acl(struct inode *inode, int mask)
254{ 260{
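
The struct filename changes above replace the old getname()/final_putname() split with plain reference counting: every getname variant returns the name with refcnt == 1, audit_getname() takes an extra reference instead of stealing the object, and putname() frees only on the final drop. Conceptually (refcnt is a bare int at this point and only core VFS/audit code should touch it; the increment below just stands in for what audit_getname() does):

        struct filename *name = getname_kernel("/etc/fstab");   /* refcnt == 1 */
        if (!IS_ERR(name)) {
                name->refcnt++;         /* as audit_getname() would */
                putname(name);          /* 2 -> 1, object survives */
                putname(name);          /* 1 -> 0, memory released */
        }

getname_kernel() also stops rejecting names longer than EMBEDDED_NAME_MAX: up to PATH_MAX it allocates a separate struct filename and reuses the __getname() buffer purely for the string.
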
@@ -2036,31 +2042,47 @@ static int filename_lookup(int dfd, struct filename *name,
2036static int do_path_lookup(int dfd, const char *name, 2042static int do_path_lookup(int dfd, const char *name,
2037 unsigned int flags, struct nameidata *nd) 2043 unsigned int flags, struct nameidata *nd)
2038{ 2044{
2039 struct filename filename = { .name = name }; 2045 struct filename *filename = getname_kernel(name);
2046 int retval = PTR_ERR(filename);
2040 2047
2041 return filename_lookup(dfd, &filename, flags, nd); 2048 if (!IS_ERR(filename)) {
2049 retval = filename_lookup(dfd, filename, flags, nd);
2050 putname(filename);
2051 }
2052 return retval;
2042} 2053}
2043 2054
2044/* does lookup, returns the object with parent locked */ 2055/* does lookup, returns the object with parent locked */
2045struct dentry *kern_path_locked(const char *name, struct path *path) 2056struct dentry *kern_path_locked(const char *name, struct path *path)
2046{ 2057{
2058 struct filename *filename = getname_kernel(name);
2047 struct nameidata nd; 2059 struct nameidata nd;
2048 struct dentry *d; 2060 struct dentry *d;
2049 int err = do_path_lookup(AT_FDCWD, name, LOOKUP_PARENT, &nd); 2061 int err;
2050 if (err) 2062
2051 return ERR_PTR(err); 2063 if (IS_ERR(filename))
2064 return ERR_CAST(filename);
2065
2066 err = filename_lookup(AT_FDCWD, filename, LOOKUP_PARENT, &nd);
2067 if (err) {
2068 d = ERR_PTR(err);
2069 goto out;
2070 }
2052 if (nd.last_type != LAST_NORM) { 2071 if (nd.last_type != LAST_NORM) {
2053 path_put(&nd.path); 2072 path_put(&nd.path);
2054 return ERR_PTR(-EINVAL); 2073 d = ERR_PTR(-EINVAL);
2074 goto out;
2055 } 2075 }
2056 mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); 2076 mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
2057 d = __lookup_hash(&nd.last, nd.path.dentry, 0); 2077 d = __lookup_hash(&nd.last, nd.path.dentry, 0);
2058 if (IS_ERR(d)) { 2078 if (IS_ERR(d)) {
2059 mutex_unlock(&nd.path.dentry->d_inode->i_mutex); 2079 mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
2060 path_put(&nd.path); 2080 path_put(&nd.path);
2061 return d; 2081 goto out;
2062 } 2082 }
2063 *path = nd.path; 2083 *path = nd.path;
2084out:
2085 putname(filename);
2064 return d; 2086 return d;
2065} 2087}
2066 2088
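
do_path_lookup() and kern_path_locked() stop faking an on-stack struct filename and go through getname_kernel() like every other caller, presumably because a refcounted name that audit may hold onto cannot live on the stack. The resulting pattern is uniform:

        struct filename *filename = getname_kernel(name);

        if (IS_ERR(filename))
                return ERR_CAST(filename);
        /* ... filename_lookup(dfd, filename, flags, &nd) ... */
        putname(filename);
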
@@ -2351,13 +2373,17 @@ static int
2351filename_mountpoint(int dfd, struct filename *s, struct path *path, 2373filename_mountpoint(int dfd, struct filename *s, struct path *path,
2352 unsigned int flags) 2374 unsigned int flags)
2353{ 2375{
2354 int error = path_mountpoint(dfd, s->name, path, flags | LOOKUP_RCU); 2376 int error;
2377 if (IS_ERR(s))
2378 return PTR_ERR(s);
2379 error = path_mountpoint(dfd, s->name, path, flags | LOOKUP_RCU);
2355 if (unlikely(error == -ECHILD)) 2380 if (unlikely(error == -ECHILD))
2356 error = path_mountpoint(dfd, s->name, path, flags); 2381 error = path_mountpoint(dfd, s->name, path, flags);
2357 if (unlikely(error == -ESTALE)) 2382 if (unlikely(error == -ESTALE))
2358 error = path_mountpoint(dfd, s->name, path, flags | LOOKUP_REVAL); 2383 error = path_mountpoint(dfd, s->name, path, flags | LOOKUP_REVAL);
2359 if (likely(!error)) 2384 if (likely(!error))
2360 audit_inode(s, path->dentry, 0); 2385 audit_inode(s, path->dentry, 0);
2386 putname(s);
2361 return error; 2387 return error;
2362} 2388}
2363 2389
@@ -2379,21 +2405,14 @@ int
2379user_path_mountpoint_at(int dfd, const char __user *name, unsigned int flags, 2405user_path_mountpoint_at(int dfd, const char __user *name, unsigned int flags,
2380 struct path *path) 2406 struct path *path)
2381{ 2407{
2382 struct filename *s = getname(name); 2408 return filename_mountpoint(dfd, getname(name), path, flags);
2383 int error;
2384 if (IS_ERR(s))
2385 return PTR_ERR(s);
2386 error = filename_mountpoint(dfd, s, path, flags);
2387 putname(s);
2388 return error;
2389} 2409}
2390 2410
2391int 2411int
2392kern_path_mountpoint(int dfd, const char *name, struct path *path, 2412kern_path_mountpoint(int dfd, const char *name, struct path *path,
2393 unsigned int flags) 2413 unsigned int flags)
2394{ 2414{
2395 struct filename s = {.name = name}; 2415 return filename_mountpoint(dfd, getname_kernel(name), path, flags);
2396 return filename_mountpoint(dfd, &s, path, flags);
2397} 2416}
2398EXPORT_SYMBOL(kern_path_mountpoint); 2417EXPORT_SYMBOL(kern_path_mountpoint);
2399 2418
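
filename_mountpoint() adopts a consume-on-call convention: it accepts even an ERR_PTR and always releases the name before returning, so both callers collapse to one-liners with no local cleanup:

        return filename_mountpoint(dfd, getname(name), path, flags);
        return filename_mountpoint(dfd, getname_kernel(name), path, flags);
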
@@ -3273,7 +3292,7 @@ struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
3273{ 3292{
3274 struct nameidata nd; 3293 struct nameidata nd;
3275 struct file *file; 3294 struct file *file;
3276 struct filename filename = { .name = name }; 3295 struct filename *filename;
3277 int flags = op->lookup_flags | LOOKUP_ROOT; 3296 int flags = op->lookup_flags | LOOKUP_ROOT;
3278 3297
3279 nd.root.mnt = mnt; 3298 nd.root.mnt = mnt;
@@ -3282,15 +3301,20 @@ struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt,
3282 if (d_is_symlink(dentry) && op->intent & LOOKUP_OPEN) 3301 if (d_is_symlink(dentry) && op->intent & LOOKUP_OPEN)
3283 return ERR_PTR(-ELOOP); 3302 return ERR_PTR(-ELOOP);
3284 3303
3285 file = path_openat(-1, &filename, &nd, op, flags | LOOKUP_RCU); 3304 filename = getname_kernel(name);
3305 if (unlikely(IS_ERR(filename)))
3306 return ERR_CAST(filename);
3307
3308 file = path_openat(-1, filename, &nd, op, flags | LOOKUP_RCU);
3286 if (unlikely(file == ERR_PTR(-ECHILD))) 3309 if (unlikely(file == ERR_PTR(-ECHILD)))
3287 file = path_openat(-1, &filename, &nd, op, flags); 3310 file = path_openat(-1, filename, &nd, op, flags);
3288 if (unlikely(file == ERR_PTR(-ESTALE))) 3311 if (unlikely(file == ERR_PTR(-ESTALE)))
3289 file = path_openat(-1, &filename, &nd, op, flags | LOOKUP_REVAL); 3312 file = path_openat(-1, filename, &nd, op, flags | LOOKUP_REVAL);
3313 putname(filename);
3290 return file; 3314 return file;
3291} 3315}
3292 3316
3293struct dentry *kern_path_create(int dfd, const char *pathname, 3317static struct dentry *filename_create(int dfd, struct filename *name,
3294 struct path *path, unsigned int lookup_flags) 3318 struct path *path, unsigned int lookup_flags)
3295{ 3319{
3296 struct dentry *dentry = ERR_PTR(-EEXIST); 3320 struct dentry *dentry = ERR_PTR(-EEXIST);
@@ -3305,7 +3329,7 @@ struct dentry *kern_path_create(int dfd, const char *pathname,
3305 */ 3329 */
3306 lookup_flags &= LOOKUP_REVAL; 3330 lookup_flags &= LOOKUP_REVAL;
3307 3331
3308 error = do_path_lookup(dfd, pathname, LOOKUP_PARENT|lookup_flags, &nd); 3332 error = filename_lookup(dfd, name, LOOKUP_PARENT|lookup_flags, &nd);
3309 if (error) 3333 if (error)
3310 return ERR_PTR(error); 3334 return ERR_PTR(error);
3311 3335
@@ -3359,6 +3383,19 @@ out:
3359 path_put(&nd.path); 3383 path_put(&nd.path);
3360 return dentry; 3384 return dentry;
3361} 3385}
3386
3387struct dentry *kern_path_create(int dfd, const char *pathname,
3388 struct path *path, unsigned int lookup_flags)
3389{
3390 struct filename *filename = getname_kernel(pathname);
3391 struct dentry *res;
3392
3393 if (IS_ERR(filename))
3394 return ERR_CAST(filename);
3395 res = filename_create(dfd, filename, path, lookup_flags);
3396 putname(filename);
3397 return res;
3398}
3362EXPORT_SYMBOL(kern_path_create); 3399EXPORT_SYMBOL(kern_path_create);
3363 3400
3364void done_path_create(struct path *path, struct dentry *dentry) 3401void done_path_create(struct path *path, struct dentry *dentry)
@@ -3377,7 +3414,7 @@ struct dentry *user_path_create(int dfd, const char __user *pathname,
3377 struct dentry *res; 3414 struct dentry *res;
3378 if (IS_ERR(tmp)) 3415 if (IS_ERR(tmp))
3379 return ERR_CAST(tmp); 3416 return ERR_CAST(tmp);
3380 res = kern_path_create(dfd, tmp->name, path, lookup_flags); 3417 res = filename_create(dfd, tmp, path, lookup_flags);
3381 putname(tmp); 3418 putname(tmp);
3382 return res; 3419 return res;
3383} 3420}
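
kern_path_create() becomes a thin wrapper around the new filename_create(), so user_path_create() can hand over its already-built struct filename directly; routing it through kern_path_create(dfd, tmp->name, ...) would now trigger a second, redundant getname_kernel() allocation:

        res = filename_create(dfd, tmp, path, lookup_flags);
        putname(tmp);
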
diff --git a/fs/namespace.c b/fs/namespace.c
index cd1e9681a0cf..72a286e0d33e 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -190,6 +190,14 @@ unsigned int mnt_get_count(struct mount *mnt)
190#endif 190#endif
191} 191}
192 192
193static void drop_mountpoint(struct fs_pin *p)
194{
195 struct mount *m = container_of(p, struct mount, mnt_umount);
196 dput(m->mnt_ex_mountpoint);
197 pin_remove(p);
198 mntput(&m->mnt);
199}
200
193static struct mount *alloc_vfsmnt(const char *name) 201static struct mount *alloc_vfsmnt(const char *name)
194{ 202{
195 struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL); 203 struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
@@ -201,7 +209,7 @@ static struct mount *alloc_vfsmnt(const char *name)
201 goto out_free_cache; 209 goto out_free_cache;
202 210
203 if (name) { 211 if (name) {
204 mnt->mnt_devname = kstrdup(name, GFP_KERNEL); 212 mnt->mnt_devname = kstrdup_const(name, GFP_KERNEL);
205 if (!mnt->mnt_devname) 213 if (!mnt->mnt_devname)
206 goto out_free_id; 214 goto out_free_id;
207 } 215 }
@@ -229,12 +237,13 @@ static struct mount *alloc_vfsmnt(const char *name)
229#ifdef CONFIG_FSNOTIFY 237#ifdef CONFIG_FSNOTIFY
230 INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks); 238 INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks);
231#endif 239#endif
240 init_fs_pin(&mnt->mnt_umount, drop_mountpoint);
232 } 241 }
233 return mnt; 242 return mnt;
234 243
235#ifdef CONFIG_SMP 244#ifdef CONFIG_SMP
236out_free_devname: 245out_free_devname:
237 kfree(mnt->mnt_devname); 246 kfree_const(mnt->mnt_devname);
238#endif 247#endif
239out_free_id: 248out_free_id:
240 mnt_free_id(mnt); 249 mnt_free_id(mnt);
@@ -568,7 +577,7 @@ int sb_prepare_remount_readonly(struct super_block *sb)
568 577
569static void free_vfsmnt(struct mount *mnt) 578static void free_vfsmnt(struct mount *mnt)
570{ 579{
571 kfree(mnt->mnt_devname); 580 kfree_const(mnt->mnt_devname);
572#ifdef CONFIG_SMP 581#ifdef CONFIG_SMP
573 free_percpu(mnt->mnt_pcp); 582 free_percpu(mnt->mnt_pcp);
574#endif 583#endif
@@ -1289,7 +1298,6 @@ static HLIST_HEAD(unmounted); /* protected by namespace_sem */
1289 1298
1290static void namespace_unlock(void) 1299static void namespace_unlock(void)
1291{ 1300{
1292 struct mount *mnt;
1293 struct hlist_head head = unmounted; 1301 struct hlist_head head = unmounted;
1294 1302
1295 if (likely(hlist_empty(&head))) { 1303 if (likely(hlist_empty(&head))) {
@@ -1299,23 +1307,11 @@ static void namespace_unlock(void)
1299 1307
1300 head.first->pprev = &head.first; 1308 head.first->pprev = &head.first;
1301 INIT_HLIST_HEAD(&unmounted); 1309 INIT_HLIST_HEAD(&unmounted);
1302
1303 /* undo decrements we'd done in umount_tree() */
1304 hlist_for_each_entry(mnt, &head, mnt_hash)
1305 if (mnt->mnt_ex_mountpoint.mnt)
1306 mntget(mnt->mnt_ex_mountpoint.mnt);
1307
1308 up_write(&namespace_sem); 1310 up_write(&namespace_sem);
1309 1311
1310 synchronize_rcu(); 1312 synchronize_rcu();
1311 1313
1312 while (!hlist_empty(&head)) { 1314 group_pin_kill(&head);
1313 mnt = hlist_entry(head.first, struct mount, mnt_hash);
1314 hlist_del_init(&mnt->mnt_hash);
1315 if (mnt->mnt_ex_mountpoint.mnt)
1316 path_put(&mnt->mnt_ex_mountpoint);
1317 mntput(&mnt->mnt);
1318 }
1319} 1315}
1320 1316
1321static inline void namespace_lock(void) 1317static inline void namespace_lock(void)
@@ -1334,7 +1330,6 @@ void umount_tree(struct mount *mnt, int how)
1334{ 1330{
1335 HLIST_HEAD(tmp_list); 1331 HLIST_HEAD(tmp_list);
1336 struct mount *p; 1332 struct mount *p;
1337 struct mount *last = NULL;
1338 1333
1339 for (p = mnt; p; p = next_mnt(p, mnt)) { 1334 for (p = mnt; p; p = next_mnt(p, mnt)) {
1340 hlist_del_init_rcu(&p->mnt_hash); 1335 hlist_del_init_rcu(&p->mnt_hash);
@@ -1347,33 +1342,28 @@ void umount_tree(struct mount *mnt, int how)
1347 if (how) 1342 if (how)
1348 propagate_umount(&tmp_list); 1343 propagate_umount(&tmp_list);
1349 1344
1350 hlist_for_each_entry(p, &tmp_list, mnt_hash) { 1345 while (!hlist_empty(&tmp_list)) {
1346 p = hlist_entry(tmp_list.first, struct mount, mnt_hash);
1347 hlist_del_init_rcu(&p->mnt_hash);
1351 list_del_init(&p->mnt_expire); 1348 list_del_init(&p->mnt_expire);
1352 list_del_init(&p->mnt_list); 1349 list_del_init(&p->mnt_list);
1353 __touch_mnt_namespace(p->mnt_ns); 1350 __touch_mnt_namespace(p->mnt_ns);
1354 p->mnt_ns = NULL; 1351 p->mnt_ns = NULL;
1355 if (how < 2) 1352 if (how < 2)
1356 p->mnt.mnt_flags |= MNT_SYNC_UMOUNT; 1353 p->mnt.mnt_flags |= MNT_SYNC_UMOUNT;
1354
1355 pin_insert_group(&p->mnt_umount, &p->mnt_parent->mnt, &unmounted);
1357 if (mnt_has_parent(p)) { 1356 if (mnt_has_parent(p)) {
1358 hlist_del_init(&p->mnt_mp_list); 1357 hlist_del_init(&p->mnt_mp_list);
1359 put_mountpoint(p->mnt_mp); 1358 put_mountpoint(p->mnt_mp);
1360 mnt_add_count(p->mnt_parent, -1); 1359 mnt_add_count(p->mnt_parent, -1);
1361 /* move the reference to mountpoint into ->mnt_ex_mountpoint */ 1360 /* old mountpoint will be dropped when we can do that */
1362 p->mnt_ex_mountpoint.dentry = p->mnt_mountpoint; 1361 p->mnt_ex_mountpoint = p->mnt_mountpoint;
1363 p->mnt_ex_mountpoint.mnt = &p->mnt_parent->mnt;
1364 p->mnt_mountpoint = p->mnt.mnt_root; 1362 p->mnt_mountpoint = p->mnt.mnt_root;
1365 p->mnt_parent = p; 1363 p->mnt_parent = p;
1366 p->mnt_mp = NULL; 1364 p->mnt_mp = NULL;
1367 } 1365 }
1368 change_mnt_propagation(p, MS_PRIVATE); 1366 change_mnt_propagation(p, MS_PRIVATE);
1369 last = p;
1370 }
1371 if (last) {
1372 last->mnt_hash.next = unmounted.first;
1373 if (unmounted.first)
1374 unmounted.first->pprev = &last->mnt_hash.next;
1375 unmounted.first = tmp_list.first;
1376 unmounted.first->pprev = &unmounted.first;
1377 } 1367 }
1378} 1368}
1379 1369
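
The namespace.c side replaces the hand-rolled splicing of unmounted mounts onto the global unmounted hlist with the generic fs_pin machinery: each mount carries an fs_pin initialized with a kill callback, umount_tree() parks it on the group, and namespace_unlock() kills the whole group after the RCU grace period:

        init_fs_pin(&mnt->mnt_umount, drop_mountpoint); /* at allocation */
        ...
        pin_insert_group(&p->mnt_umount, &p->mnt_parent->mnt, &unmounted);
        ...
        group_pin_kill(&head);  /* runs drop_mountpoint() for every pin */

drop_mountpoint() performs the dput()/mntput() the old loop did by hand, and mnt_ex_mountpoint shrinks from a struct path to a bare dentry pointer because the mount half of the old path is what the pin now holds.
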
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index 008960101520..e7ca827d7694 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -77,6 +77,7 @@ static int ncp_hash_dentry(const struct dentry *, struct qstr *);
77static int ncp_compare_dentry(const struct dentry *, const struct dentry *, 77static int ncp_compare_dentry(const struct dentry *, const struct dentry *,
78 unsigned int, const char *, const struct qstr *); 78 unsigned int, const char *, const struct qstr *);
79static int ncp_delete_dentry(const struct dentry *); 79static int ncp_delete_dentry(const struct dentry *);
80static void ncp_d_prune(struct dentry *dentry);
80 81
81const struct dentry_operations ncp_dentry_operations = 82const struct dentry_operations ncp_dentry_operations =
82{ 83{
@@ -84,6 +85,7 @@ const struct dentry_operations ncp_dentry_operations =
84 .d_hash = ncp_hash_dentry, 85 .d_hash = ncp_hash_dentry,
85 .d_compare = ncp_compare_dentry, 86 .d_compare = ncp_compare_dentry,
86 .d_delete = ncp_delete_dentry, 87 .d_delete = ncp_delete_dentry,
88 .d_prune = ncp_d_prune,
87}; 89};
88 90
89#define ncp_namespace(i) (NCP_SERVER(i)->name_space[NCP_FINFO(i)->volNumber]) 91#define ncp_namespace(i) (NCP_SERVER(i)->name_space[NCP_FINFO(i)->volNumber])
@@ -384,42 +386,6 @@ finished:
384 return val; 386 return val;
385} 387}
386 388
387static struct dentry *
388ncp_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos)
389{
390 struct dentry *dent = dentry;
391
392 if (d_validate(dent, parent)) {
393 if (dent->d_name.len <= NCP_MAXPATHLEN &&
394 (unsigned long)dent->d_fsdata == fpos) {
395 if (!dent->d_inode) {
396 dput(dent);
397 dent = NULL;
398 }
399 return dent;
400 }
401 dput(dent);
402 }
403
404 /* If a pointer is invalid, we search the dentry. */
405 spin_lock(&parent->d_lock);
406 list_for_each_entry(dent, &parent->d_subdirs, d_child) {
407 if ((unsigned long)dent->d_fsdata == fpos) {
408 if (dent->d_inode)
409 dget(dent);
410 else
411 dent = NULL;
412 spin_unlock(&parent->d_lock);
413 goto out;
414 }
415 }
416 spin_unlock(&parent->d_lock);
417 return NULL;
418
419out:
420 return dent;
421}
422
423static time_t ncp_obtain_mtime(struct dentry *dentry) 389static time_t ncp_obtain_mtime(struct dentry *dentry)
424{ 390{
425 struct inode *inode = dentry->d_inode; 391 struct inode *inode = dentry->d_inode;
@@ -435,6 +401,20 @@ static time_t ncp_obtain_mtime(struct dentry *dentry)
435 return ncp_date_dos2unix(i.modifyTime, i.modifyDate); 401 return ncp_date_dos2unix(i.modifyTime, i.modifyDate);
436} 402}
437 403
404static inline void
405ncp_invalidate_dircache_entries(struct dentry *parent)
406{
407 struct ncp_server *server = NCP_SERVER(parent->d_inode);
408 struct dentry *dentry;
409
410 spin_lock(&parent->d_lock);
411 list_for_each_entry(dentry, &parent->d_subdirs, d_child) {
412 dentry->d_fsdata = NULL;
413 ncp_age_dentry(server, dentry);
414 }
415 spin_unlock(&parent->d_lock);
416}
417
438static int ncp_readdir(struct file *file, struct dir_context *ctx) 418static int ncp_readdir(struct file *file, struct dir_context *ctx)
439{ 419{
440 struct dentry *dentry = file->f_path.dentry; 420 struct dentry *dentry = file->f_path.dentry;
@@ -500,10 +480,21 @@ static int ncp_readdir(struct file *file, struct dir_context *ctx)
500 struct dentry *dent; 480 struct dentry *dent;
501 bool over; 481 bool over;
502 482
503 dent = ncp_dget_fpos(ctl.cache->dentry[ctl.idx], 483 spin_lock(&dentry->d_lock);
504 dentry, ctx->pos); 484 if (!(NCP_FINFO(inode)->flags & NCPI_DIR_CACHE)) {
505 if (!dent) 485 spin_unlock(&dentry->d_lock);
486 goto invalid_cache;
487 }
488 dent = ctl.cache->dentry[ctl.idx];
489 if (unlikely(!lockref_get_not_dead(&dent->d_lockref))) {
490 spin_unlock(&dentry->d_lock);
491 goto invalid_cache;
492 }
493 spin_unlock(&dentry->d_lock);
494 if (!dent->d_inode) {
495 dput(dent);
506 goto invalid_cache; 496 goto invalid_cache;
497 }
507 over = !dir_emit(ctx, dent->d_name.name, 498 over = !dir_emit(ctx, dent->d_name.name,
508 dent->d_name.len, 499 dent->d_name.len,
509 dent->d_inode->i_ino, DT_UNKNOWN); 500 dent->d_inode->i_ino, DT_UNKNOWN);
@@ -548,6 +539,9 @@ init_cache:
548 ctl.filled = 0; 539 ctl.filled = 0;
549 ctl.valid = 1; 540 ctl.valid = 1;
550read_really: 541read_really:
542 spin_lock(&dentry->d_lock);
543 NCP_FINFO(inode)->flags |= NCPI_DIR_CACHE;
544 spin_unlock(&dentry->d_lock);
551 if (ncp_is_server_root(inode)) { 545 if (ncp_is_server_root(inode)) {
552 ncp_read_volume_list(file, ctx, &ctl); 546 ncp_read_volume_list(file, ctx, &ctl);
553 } else { 547 } else {
@@ -573,6 +567,13 @@ out:
573 return result; 567 return result;
574} 568}
575 569
570static void ncp_d_prune(struct dentry *dentry)
571{
572 if (!dentry->d_fsdata) /* not referenced from page cache */
573 return;
574 NCP_FINFO(dentry->d_parent->d_inode)->flags &= ~NCPI_DIR_CACHE;
575}
576
576static int 577static int
577ncp_fill_cache(struct file *file, struct dir_context *ctx, 578ncp_fill_cache(struct file *file, struct dir_context *ctx,
578 struct ncp_cache_control *ctrl, struct ncp_entry_info *entry, 579 struct ncp_cache_control *ctrl, struct ncp_entry_info *entry,
@@ -630,6 +631,10 @@ ncp_fill_cache(struct file *file, struct dir_context *ctx,
630 d_instantiate(newdent, inode); 631 d_instantiate(newdent, inode);
631 if (!hashed) 632 if (!hashed)
632 d_rehash(newdent); 633 d_rehash(newdent);
634 } else {
635 spin_lock(&dentry->d_lock);
636 NCP_FINFO(inode)->flags &= ~NCPI_DIR_CACHE;
637 spin_unlock(&dentry->d_lock);
633 } 638 }
634 } else { 639 } else {
635 struct inode *inode = newdent->d_inode; 640 struct inode *inode = newdent->d_inode;
@@ -639,12 +644,6 @@ ncp_fill_cache(struct file *file, struct dir_context *ctx,
639 mutex_unlock(&inode->i_mutex); 644 mutex_unlock(&inode->i_mutex);
640 } 645 }
641 646
642 if (newdent->d_inode) {
643 ino = newdent->d_inode->i_ino;
644 newdent->d_fsdata = (void *) ctl.fpos;
645 ncp_new_dentry(newdent);
646 }
647
648 if (ctl.idx >= NCP_DIRCACHE_SIZE) { 647 if (ctl.idx >= NCP_DIRCACHE_SIZE) {
649 if (ctl.page) { 648 if (ctl.page) {
650 kunmap(ctl.page); 649 kunmap(ctl.page);
@@ -660,8 +659,13 @@ ncp_fill_cache(struct file *file, struct dir_context *ctx,
660 ctl.cache = kmap(ctl.page); 659 ctl.cache = kmap(ctl.page);
661 } 660 }
662 if (ctl.cache) { 661 if (ctl.cache) {
663 ctl.cache->dentry[ctl.idx] = newdent; 662 if (newdent->d_inode) {
664 valid = 1; 663 newdent->d_fsdata = newdent;
664 ctl.cache->dentry[ctl.idx] = newdent;
665 ino = newdent->d_inode->i_ino;
666 ncp_new_dentry(newdent);
667 }
668 valid = 1;
665 } 669 }
666 dput(newdent); 670 dput(newdent);
667end_advance: 671end_advance:
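
The ncpfs readdir cache previously leaned on d_validate() to decide whether a possibly stale dentry pointer could still be trusted, an approach that was hard to make safe. The new scheme makes invalidation explicit: a per-directory NCPI_DIR_CACHE flag is set when the cache is refilled, pruning any cached child clears it through the new d_prune hook, and readers take their reference with lockref_get_not_dead() so a dying dentry is detected rather than resurrected:

        static void ncp_d_prune(struct dentry *dentry)
        {
                if (!dentry->d_fsdata)  /* not referenced from the cache */
                        return;
                NCP_FINFO(dentry->d_parent->d_inode)->flags &= ~NCPI_DIR_CACHE;
        }
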
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index e31e589369a4..01a9e16e9782 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -267,7 +267,6 @@ ncp_iget(struct super_block *sb, struct ncp_entry_info *info)
267 if (inode) { 267 if (inode) {
268 atomic_set(&NCP_FINFO(inode)->opened, info->opened); 268 atomic_set(&NCP_FINFO(inode)->opened, info->opened);
269 269
270 inode->i_mapping->backing_dev_info = sb->s_bdi;
271 inode->i_ino = info->ino; 270 inode->i_ino = info->ino;
272 ncp_set_attr(inode, info); 271 ncp_set_attr(inode, info);
273 if (S_ISREG(inode->i_mode)) { 272 if (S_ISREG(inode->i_mode)) {
@@ -560,7 +559,7 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent)
560 server = NCP_SBP(sb); 559 server = NCP_SBP(sb);
561 memset(server, 0, sizeof(*server)); 560 memset(server, 0, sizeof(*server));
562 561
563 error = bdi_setup_and_register(&server->bdi, "ncpfs", BDI_CAP_MAP_COPY); 562 error = bdi_setup_and_register(&server->bdi, "ncpfs");
564 if (error) 563 if (error)
565 goto out_fput; 564 goto out_fput;
566 565
diff --git a/fs/ncpfs/ncp_fs_i.h b/fs/ncpfs/ncp_fs_i.h
index 4b0bec477846..c4794504f843 100644
--- a/fs/ncpfs/ncp_fs_i.h
+++ b/fs/ncpfs/ncp_fs_i.h
@@ -22,6 +22,7 @@ struct ncp_inode_info {
22 int access; 22 int access;
23 int flags; 23 int flags;
24#define NCPI_KLUDGE_SYMLINK 0x0001 24#define NCPI_KLUDGE_SYMLINK 0x0001
25#define NCPI_DIR_CACHE 0x0002
25 __u8 file_handle[6]; 26 __u8 file_handle[6];
26 struct inode vfs_inode; 27 struct inode vfs_inode;
27}; 28};
diff --git a/fs/ncpfs/ncplib_kernel.h b/fs/ncpfs/ncplib_kernel.h
index b785f74bfe3c..250e443a07f3 100644
--- a/fs/ncpfs/ncplib_kernel.h
+++ b/fs/ncpfs/ncplib_kernel.h
@@ -184,36 +184,6 @@ ncp_new_dentry(struct dentry* dentry)
184 dentry->d_time = jiffies; 184 dentry->d_time = jiffies;
185} 185}
186 186
187static inline void
188ncp_renew_dentries(struct dentry *parent)
189{
190 struct ncp_server *server = NCP_SERVER(parent->d_inode);
191 struct dentry *dentry;
192
193 spin_lock(&parent->d_lock);
194 list_for_each_entry(dentry, &parent->d_subdirs, d_child) {
195 if (dentry->d_fsdata == NULL)
196 ncp_age_dentry(server, dentry);
197 else
198 ncp_new_dentry(dentry);
199 }
200 spin_unlock(&parent->d_lock);
201}
202
203static inline void
204ncp_invalidate_dircache_entries(struct dentry *parent)
205{
206 struct ncp_server *server = NCP_SERVER(parent->d_inode);
207 struct dentry *dentry;
208
209 spin_lock(&parent->d_lock);
210 list_for_each_entry(dentry, &parent->d_subdirs, d_child) {
211 dentry->d_fsdata = NULL;
212 ncp_age_dentry(server, dentry);
213 }
214 spin_unlock(&parent->d_lock);
215}
216
217struct ncp_cache_head { 187struct ncp_cache_head {
218 time_t mtime; 188 time_t mtime;
219 unsigned long time; /* cache age */ 189 unsigned long time; /* cache age */
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index 3dece03f2fc8..c7abc10279af 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -128,6 +128,11 @@ config PNFS_OBJLAYOUT
128 depends on NFS_V4_1 && SCSI_OSD_ULD 128 depends on NFS_V4_1 && SCSI_OSD_ULD
129 default NFS_V4 129 default NFS_V4
130 130
131config PNFS_FLEXFILE_LAYOUT
132 tristate
133 depends on NFS_V4_1 && NFS_V3
134 default m
135
131config NFS_V4_1_IMPLEMENTATION_ID_DOMAIN 136config NFS_V4_1_IMPLEMENTATION_ID_DOMAIN
132 string "NFSv4.1 Implementation ID Domain" 137 string "NFSv4.1 Implementation ID Domain"
133 depends on NFS_V4_1 138 depends on NFS_V4_1
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile
index 04cb830fa09f..1e987acf20c9 100644
--- a/fs/nfs/Makefile
+++ b/fs/nfs/Makefile
@@ -27,9 +27,10 @@ nfsv4-y := nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o nfs4super.o nfs4file.o
27 dns_resolve.o nfs4trace.o 27 dns_resolve.o nfs4trace.o
28nfsv4-$(CONFIG_NFS_USE_LEGACY_DNS) += cache_lib.o 28nfsv4-$(CONFIG_NFS_USE_LEGACY_DNS) += cache_lib.o
29nfsv4-$(CONFIG_SYSCTL) += nfs4sysctl.o 29nfsv4-$(CONFIG_SYSCTL) += nfs4sysctl.o
30nfsv4-$(CONFIG_NFS_V4_1) += pnfs.o pnfs_dev.o 30nfsv4-$(CONFIG_NFS_V4_1) += pnfs.o pnfs_dev.o pnfs_nfs.o
31nfsv4-$(CONFIG_NFS_V4_2) += nfs42proc.o 31nfsv4-$(CONFIG_NFS_V4_2) += nfs42proc.o
32 32
33obj-$(CONFIG_PNFS_FILE_LAYOUT) += filelayout/ 33obj-$(CONFIG_PNFS_FILE_LAYOUT) += filelayout/
34obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/ 34obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/
35obj-$(CONFIG_PNFS_BLOCK) += blocklayout/ 35obj-$(CONFIG_PNFS_BLOCK) += blocklayout/
36obj-$(CONFIG_PNFS_FLEXFILE_LAYOUT) += flexfilelayout/
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 77fec6a55f57..1cac3c175d18 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -860,12 +860,14 @@ static const struct nfs_pageio_ops bl_pg_read_ops = {
860 .pg_init = bl_pg_init_read, 860 .pg_init = bl_pg_init_read,
861 .pg_test = bl_pg_test_read, 861 .pg_test = bl_pg_test_read,
862 .pg_doio = pnfs_generic_pg_readpages, 862 .pg_doio = pnfs_generic_pg_readpages,
863 .pg_cleanup = pnfs_generic_pg_cleanup,
863}; 864};
864 865
865static const struct nfs_pageio_ops bl_pg_write_ops = { 866static const struct nfs_pageio_ops bl_pg_write_ops = {
866 .pg_init = bl_pg_init_write, 867 .pg_init = bl_pg_init_write,
867 .pg_test = bl_pg_test_write, 868 .pg_test = bl_pg_test_write,
868 .pg_doio = pnfs_generic_pg_writepages, 869 .pg_doio = pnfs_generic_pg_writepages,
870 .pg_cleanup = pnfs_generic_pg_cleanup,
869}; 871};
870 872
871static struct pnfs_layoutdriver_type blocklayout_type = { 873static struct pnfs_layoutdriver_type blocklayout_type = {
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index b8fb3a4ef649..351be9205bf8 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -128,22 +128,24 @@ nfs41_callback_svc(void *vrqstp)
128 if (try_to_freeze()) 128 if (try_to_freeze())
129 continue; 129 continue;
130 130
131 prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE); 131 prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_UNINTERRUPTIBLE);
132 spin_lock_bh(&serv->sv_cb_lock); 132 spin_lock_bh(&serv->sv_cb_lock);
133 if (!list_empty(&serv->sv_cb_list)) { 133 if (!list_empty(&serv->sv_cb_list)) {
134 req = list_first_entry(&serv->sv_cb_list, 134 req = list_first_entry(&serv->sv_cb_list,
135 struct rpc_rqst, rq_bc_list); 135 struct rpc_rqst, rq_bc_list);
136 list_del(&req->rq_bc_list); 136 list_del(&req->rq_bc_list);
137 spin_unlock_bh(&serv->sv_cb_lock); 137 spin_unlock_bh(&serv->sv_cb_lock);
138 finish_wait(&serv->sv_cb_waitq, &wq);
138 dprintk("Invoking bc_svc_process()\n"); 139 dprintk("Invoking bc_svc_process()\n");
139 error = bc_svc_process(serv, req, rqstp); 140 error = bc_svc_process(serv, req, rqstp);
140 dprintk("bc_svc_process() returned w/ error code= %d\n", 141 dprintk("bc_svc_process() returned w/ error code= %d\n",
141 error); 142 error);
142 } else { 143 } else {
143 spin_unlock_bh(&serv->sv_cb_lock); 144 spin_unlock_bh(&serv->sv_cb_lock);
144 schedule(); 145 /* schedule_timeout to game the hung task watchdog */
146 schedule_timeout(60 * HZ);
147 finish_wait(&serv->sv_cb_waitq, &wq);
145 } 148 }
146 finish_wait(&serv->sv_cb_waitq, &wq);
147 } 149 }
148 return 0; 150 return 0;
149} 151}
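
The nfs41_callback_svc() change is a workaround for the hung-task watchdog: the thread now waits in TASK_UNINTERRUPTIBLE (it has no signals to service), but khungtaskd flags any uninterruptible task that has not scheduled for two minutes by default, so the open-ended schedule() becomes a bounded sleep that wakes at least once a minute:

        /* idle path: bounded sleep instead of schedule() */
        schedule_timeout(60 * HZ);
        finish_wait(&serv->sv_cb_waitq, &wq);

The in-tree comment ("game the hung task watchdog") says as much; the only requirement on the value is that it stay below the watchdog threshold.
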
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index e36a9d78ea49..197806fb87ff 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -427,6 +427,8 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args,
427 if (clp == NULL) 427 if (clp == NULL)
428 goto out; 428 goto out;
429 429
430 if (!(clp->cl_session->flags & SESSION4_BACK_CHAN))
431 goto out;
430 tbl = &clp->cl_session->bc_slot_table; 432 tbl = &clp->cl_session->bc_slot_table;
431 433
432 spin_lock(&tbl->slot_tbl_lock); 434 spin_lock(&tbl->slot_tbl_lock);
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index f4ccfe6521ec..19ca95cdfd9b 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -313,7 +313,7 @@ __be32 decode_devicenotify_args(struct svc_rqst *rqstp,
313 goto out; 313 goto out;
314 } 314 }
315 315
316 args->devs = kmalloc(n * sizeof(*args->devs), GFP_KERNEL); 316 args->devs = kmalloc_array(n, sizeof(*args->devs), GFP_KERNEL);
317 if (!args->devs) { 317 if (!args->devs) {
318 status = htonl(NFS4ERR_DELAY); 318 status = htonl(NFS4ERR_DELAY);
319 goto out; 319 goto out;
@@ -415,7 +415,7 @@ static __be32 decode_rc_list(struct xdr_stream *xdr,
415 rc_list->rcl_nrefcalls * 2 * sizeof(uint32_t)); 415 rc_list->rcl_nrefcalls * 2 * sizeof(uint32_t));
416 if (unlikely(p == NULL)) 416 if (unlikely(p == NULL))
417 goto out; 417 goto out;
418 rc_list->rcl_refcalls = kmalloc(rc_list->rcl_nrefcalls * 418 rc_list->rcl_refcalls = kmalloc_array(rc_list->rcl_nrefcalls,
419 sizeof(*rc_list->rcl_refcalls), 419 sizeof(*rc_list->rcl_refcalls),
420 GFP_KERNEL); 420 GFP_KERNEL);
421 if (unlikely(rc_list->rcl_refcalls == NULL)) 421 if (unlikely(rc_list->rcl_refcalls == NULL))
@@ -464,8 +464,10 @@ static __be32 decode_cb_sequence_args(struct svc_rqst *rqstp,
464 464
465 for (i = 0; i < args->csa_nrclists; i++) { 465 for (i = 0; i < args->csa_nrclists; i++) {
466 status = decode_rc_list(xdr, &args->csa_rclists[i]); 466 status = decode_rc_list(xdr, &args->csa_rclists[i]);
467 if (status) 467 if (status) {
468 args->csa_nrclists = i;
468 goto out_free; 469 goto out_free;
470 }
469 } 471 }
470 } 472 }
471 status = 0; 473 status = 0;
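
Two small hardening fixes in the callback XDR code: the open-coded kmalloc(n * size) allocations become kmalloc_array(), which returns NULL instead of a short buffer when the multiplication would overflow, and decode_cb_sequence_args() now records how many rc_lists were actually decoded before bailing out, so the error path frees only initialized entries:

        args->devs = kmalloc_array(n, sizeof(*args->devs), GFP_KERNEL);
        ...
        if (status) {
                args->csa_nrclists = i; /* free only the i decoded lists */
                goto out_free;
        }
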
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 7f3f60641344..a1f0685b42ff 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -85,25 +85,30 @@ static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_
85{ 85{
86 struct inode *inode = state->inode; 86 struct inode *inode = state->inode;
87 struct file_lock *fl; 87 struct file_lock *fl;
88 struct file_lock_context *flctx = inode->i_flctx;
89 struct list_head *list;
88 int status = 0; 90 int status = 0;
89 91
90 if (inode->i_flock == NULL) 92 if (flctx == NULL)
91 goto out; 93 goto out;
92 94
93 /* Protect inode->i_flock using the i_lock */ 95 list = &flctx->flc_posix;
94 spin_lock(&inode->i_lock); 96 spin_lock(&flctx->flc_lock);
95 for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { 97restart:
96 if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK))) 98 list_for_each_entry(fl, list, fl_list) {
97 continue;
98 if (nfs_file_open_context(fl->fl_file) != ctx) 99 if (nfs_file_open_context(fl->fl_file) != ctx)
99 continue; 100 continue;
100 spin_unlock(&inode->i_lock); 101 spin_unlock(&flctx->flc_lock);
101 status = nfs4_lock_delegation_recall(fl, state, stateid); 102 status = nfs4_lock_delegation_recall(fl, state, stateid);
102 if (status < 0) 103 if (status < 0)
103 goto out; 104 goto out;
104 spin_lock(&inode->i_lock); 105 spin_lock(&flctx->flc_lock);
105 } 106 }
106 spin_unlock(&inode->i_lock); 107 if (list == &flctx->flc_posix) {
108 list = &flctx->flc_flock;
109 goto restart;
110 }
111 spin_unlock(&flctx->flc_lock);
107out: 112out:
108 return status; 113 return status;
109} 114}
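
nfs_delegation_claim_locks() picks up the same file_lock_context conversion as fs/locks.c: instead of filtering FL_POSIX and FL_FLOCK entries out of one mixed list, it walks flc_posix and then restarts the same loop body over flc_flock. flc_lock is dropped around each nfs4_lock_delegation_recall() call, exactly as i_lock was before, so the walk tolerates concurrent list changes; the restart label keeps one copy of the loop for both passes:

        list = &flctx->flc_posix;
        spin_lock(&flctx->flc_lock);
restart:
        list_for_each_entry(fl, list, fl_list) {
                /* drop flc_lock, recall the delegation, retake flc_lock */
        }
        if (list == &flctx->flc_posix) {
                list = &flctx->flc_flock;
                goto restart;
        }
        spin_unlock(&flctx->flc_lock);
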
@@ -175,7 +180,6 @@ void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred,
175 delegation->cred = get_rpccred(cred); 180 delegation->cred = get_rpccred(cred);
176 clear_bit(NFS_DELEGATION_NEED_RECLAIM, 181 clear_bit(NFS_DELEGATION_NEED_RECLAIM,
177 &delegation->flags); 182 &delegation->flags);
178 NFS_I(inode)->delegation_state = delegation->type;
179 spin_unlock(&delegation->lock); 183 spin_unlock(&delegation->lock);
180 put_rpccred(oldcred); 184 put_rpccred(oldcred);
181 rcu_read_unlock(); 185 rcu_read_unlock();
@@ -270,7 +274,6 @@ nfs_detach_delegation_locked(struct nfs_inode *nfsi,
270 set_bit(NFS_DELEGATION_RETURNING, &delegation->flags); 274 set_bit(NFS_DELEGATION_RETURNING, &delegation->flags);
271 list_del_rcu(&delegation->super_list); 275 list_del_rcu(&delegation->super_list);
272 delegation->inode = NULL; 276 delegation->inode = NULL;
273 nfsi->delegation_state = 0;
274 rcu_assign_pointer(nfsi->delegation, NULL); 277 rcu_assign_pointer(nfsi->delegation, NULL);
275 spin_unlock(&delegation->lock); 278 spin_unlock(&delegation->lock);
276 return delegation; 279 return delegation;
@@ -301,6 +304,17 @@ nfs_inode_detach_delegation(struct inode *inode)
301 return nfs_detach_delegation(nfsi, delegation, server); 304 return nfs_detach_delegation(nfsi, delegation, server);
302} 305}
303 306
307static void
308nfs_update_inplace_delegation(struct nfs_delegation *delegation,
309 const struct nfs_delegation *update)
310{
311 if (nfs4_stateid_is_newer(&update->stateid, &delegation->stateid)) {
312 delegation->stateid.seqid = update->stateid.seqid;
313 smp_wmb();
314 delegation->type = update->type;
315 }
316}
317
304/** 318/**
305 * nfs_inode_set_delegation - set up a delegation on an inode 319 * nfs_inode_set_delegation - set up a delegation on an inode
306 * @inode: inode to which delegation applies 320 * @inode: inode to which delegation applies
@@ -334,9 +348,11 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
334 old_delegation = rcu_dereference_protected(nfsi->delegation, 348 old_delegation = rcu_dereference_protected(nfsi->delegation,
335 lockdep_is_held(&clp->cl_lock)); 349 lockdep_is_held(&clp->cl_lock));
336 if (old_delegation != NULL) { 350 if (old_delegation != NULL) {
337 if (nfs4_stateid_match(&delegation->stateid, 351 /* Is this an update of the existing delegation? */
338 &old_delegation->stateid) && 352 if (nfs4_stateid_match_other(&old_delegation->stateid,
339 delegation->type == old_delegation->type) { 353 &delegation->stateid)) {
354 nfs_update_inplace_delegation(old_delegation,
355 delegation);
340 goto out; 356 goto out;
341 } 357 }
342 /* 358 /*
@@ -360,7 +376,6 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct
360 goto out; 376 goto out;
361 } 377 }
362 list_add_rcu(&delegation->super_list, &server->delegations); 378 list_add_rcu(&delegation->super_list, &server->delegations);
363 nfsi->delegation_state = delegation->type;
364 rcu_assign_pointer(nfsi->delegation, delegation); 379 rcu_assign_pointer(nfsi->delegation, delegation);
365 delegation = NULL; 380 delegation = NULL;
366 381
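
nfs_inode_set_delegation() now recognizes a re-issued delegation for the same stateid (matching on the stateid's 'other' field rather than the whole thing) and updates it in place. The smp_wmb() orders the seqid store before the type store, presumably so that a lockless reader observing the new type is guaranteed to see at least that seqid as well:

        if (nfs4_stateid_is_newer(&update->stateid, &delegation->stateid)) {
                delegation->stateid.seqid = update->stateid.seqid;
                smp_wmb();      /* seqid visible before type */
                delegation->type = update->type;
        }

The delegation_state mirror field in nfs_inode is dropped at the same time, which removes one more datum the update path would otherwise have to keep coherent.
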
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 10bf07280f4a..e907c8cf732e 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -66,6 +66,10 @@ static struct kmem_cache *nfs_direct_cachep;
66/* 66/*
67 * This represents a set of asynchronous requests that we're waiting on 67 * This represents a set of asynchronous requests that we're waiting on
68 */ 68 */
69struct nfs_direct_mirror {
70 ssize_t count;
71};
72
69struct nfs_direct_req { 73struct nfs_direct_req {
70 struct kref kref; /* release manager */ 74 struct kref kref; /* release manager */
71 75
@@ -78,8 +82,13 @@ struct nfs_direct_req {
78 /* completion state */ 82 /* completion state */
79 atomic_t io_count; /* i/os we're waiting for */ 83 atomic_t io_count; /* i/os we're waiting for */
80 spinlock_t lock; /* protect completion state */ 84 spinlock_t lock; /* protect completion state */
85
86 struct nfs_direct_mirror mirrors[NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX];
87 int mirror_count;
88
81 ssize_t count, /* bytes actually processed */ 89 ssize_t count, /* bytes actually processed */
82 bytes_left, /* bytes left to be sent */ 90 bytes_left, /* bytes left to be sent */
91 io_start, /* start of IO */
83 error; /* any reported error */ 92 error; /* any reported error */
84 struct completion completion; /* wait for i/o completion */ 93 struct completion completion; /* wait for i/o completion */
85 94
@@ -108,26 +117,56 @@ static inline int put_dreq(struct nfs_direct_req *dreq)
108 return atomic_dec_and_test(&dreq->io_count); 117 return atomic_dec_and_test(&dreq->io_count);
109} 118}
110 119
120void nfs_direct_set_resched_writes(struct nfs_direct_req *dreq)
121{
122 dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
123}
124EXPORT_SYMBOL_GPL(nfs_direct_set_resched_writes);
125
126static void
127nfs_direct_good_bytes(struct nfs_direct_req *dreq, struct nfs_pgio_header *hdr)
128{
129 int i;
130 ssize_t count;
131
132 WARN_ON_ONCE(hdr->pgio_mirror_idx >= dreq->mirror_count);
133
134 count = dreq->mirrors[hdr->pgio_mirror_idx].count;
135 if (count + dreq->io_start < hdr->io_start + hdr->good_bytes) {
136 count = hdr->io_start + hdr->good_bytes - dreq->io_start;
137 dreq->mirrors[hdr->pgio_mirror_idx].count = count;
138 }
139
140 /* update the dreq->count by finding the minimum agreed count from all
141 * mirrors */
142 count = dreq->mirrors[0].count;
143
144 for (i = 1; i < dreq->mirror_count; i++)
145 count = min(count, dreq->mirrors[i].count);
146
147 dreq->count = count;
148}
149
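
nfs_direct_good_bytes() above first extends the completing mirror's
byte count, then folds dreq->count down to the minimum agreed count, as
the comment says: a mirrored direct write is only complete up to the
offset that every mirror has reached. A small standalone illustration
of that reduction (plain C, not the kernel code):

	#include <stdio.h>

	/* The agreed count is the minimum across all mirrors. */
	static long min_agreed_count(const long *mirror_count, int nmirrors)
	{
		long count = mirror_count[0];
		int i;

		for (i = 1; i < nmirrors; i++)
			if (mirror_count[i] < count)
				count = mirror_count[i];
		return count;
	}

	int main(void)
	{
		long mirrors[] = { 8192, 4096, 8192 };	/* one mirror lags */

		/* Prints 4096: data is only stable up to the slowest mirror. */
		printf("%ld\n", min_agreed_count(mirrors, 3));
		return 0;
	}
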
111/* 150/*
112 * nfs_direct_select_verf - select the right verifier 151 * nfs_direct_select_verf - select the right verifier
113 * @dreq - direct request possibly spanning multiple servers 152 * @dreq - direct request possibly spanning multiple servers
114 * @ds_clp - nfs_client of data server or NULL if MDS / non-pnfs 153 * @ds_clp - nfs_client of data server or NULL if MDS / non-pnfs
115 * @ds_idx - index of data server in data server list, only valid if ds_clp set 154 * @commit_idx - commit bucket index for the DS
116 * 155 *
117 * returns the correct verifier to use given the role of the server 156 * returns the correct verifier to use given the role of the server
118 */ 157 */
119static struct nfs_writeverf * 158static struct nfs_writeverf *
120nfs_direct_select_verf(struct nfs_direct_req *dreq, 159nfs_direct_select_verf(struct nfs_direct_req *dreq,
121 struct nfs_client *ds_clp, 160 struct nfs_client *ds_clp,
122 int ds_idx) 161 int commit_idx)
123{ 162{
124 struct nfs_writeverf *verfp = &dreq->verf; 163 struct nfs_writeverf *verfp = &dreq->verf;
125 164
126#ifdef CONFIG_NFS_V4_1 165#ifdef CONFIG_NFS_V4_1
127 if (ds_clp) { 166 if (ds_clp) {
128 /* pNFS is in use, use the DS verf */ 167 /* pNFS is in use, use the DS verf */
129 if (ds_idx >= 0 && ds_idx < dreq->ds_cinfo.nbuckets) 168 if (commit_idx >= 0 && commit_idx < dreq->ds_cinfo.nbuckets)
130 verfp = &dreq->ds_cinfo.buckets[ds_idx].direct_verf; 169 verfp = &dreq->ds_cinfo.buckets[commit_idx].direct_verf;
131 else 170 else
132 WARN_ON_ONCE(1); 171 WARN_ON_ONCE(1);
133 } 172 }
@@ -148,8 +187,7 @@ static void nfs_direct_set_hdr_verf(struct nfs_direct_req *dreq,
148{ 187{
149 struct nfs_writeverf *verfp; 188 struct nfs_writeverf *verfp;
150 189
151 verfp = nfs_direct_select_verf(dreq, hdr->ds_clp, 190 verfp = nfs_direct_select_verf(dreq, hdr->ds_clp, hdr->ds_commit_idx);
152 hdr->ds_idx);
153 WARN_ON_ONCE(verfp->committed >= 0); 191 WARN_ON_ONCE(verfp->committed >= 0);
154 memcpy(verfp, &hdr->verf, sizeof(struct nfs_writeverf)); 192 memcpy(verfp, &hdr->verf, sizeof(struct nfs_writeverf));
155 WARN_ON_ONCE(verfp->committed < 0); 193 WARN_ON_ONCE(verfp->committed < 0);
@@ -169,8 +207,7 @@ static int nfs_direct_set_or_cmp_hdr_verf(struct nfs_direct_req *dreq,
169{ 207{
170 struct nfs_writeverf *verfp; 208 struct nfs_writeverf *verfp;
171 209
172 verfp = nfs_direct_select_verf(dreq, hdr->ds_clp, 210 verfp = nfs_direct_select_verf(dreq, hdr->ds_clp, hdr->ds_commit_idx);
173 hdr->ds_idx);
174 if (verfp->committed < 0) { 211 if (verfp->committed < 0) {
175 nfs_direct_set_hdr_verf(dreq, hdr); 212 nfs_direct_set_hdr_verf(dreq, hdr);
176 return 0; 213 return 0;
@@ -193,7 +230,11 @@ static int nfs_direct_cmp_commit_data_verf(struct nfs_direct_req *dreq,
193 230
194 verfp = nfs_direct_select_verf(dreq, data->ds_clp, 231 verfp = nfs_direct_select_verf(dreq, data->ds_clp,
195 data->ds_commit_index); 232 data->ds_commit_index);
196 WARN_ON_ONCE(verfp->committed < 0); 233
234 /* verifier not set so always fail */
235 if (verfp->committed < 0)
236 return 1;
237
197 return memcmp(verfp, &data->verf, sizeof(struct nfs_writeverf)); 238 return memcmp(verfp, &data->verf, sizeof(struct nfs_writeverf));
198} 239}
199 240
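
The hunk above replaces the WARN_ON_ONCE() with an explicit check: if
no verifier was ever recorded for this commit bucket, the comparison
now fails closed (returns 1). A nonzero result from
nfs_direct_cmp_commit_data_verf() is treated as a verifier mismatch, so
the affected writes get resent rather than being falsely reported
stable. A sketch of the comparison semantics, with simplified stand-ins
for the kernel types:

	#include <string.h>

	struct verf_sketch {
		int		committed;	/* < 0 means "never set" */
		unsigned char	data[8];	/* opaque server verifier */
	};

	/* Nonzero means "mismatch: resend the writes". */
	static int cmp_commit_verf(const struct verf_sketch *stored,
				   const struct verf_sketch *from_commit)
	{
		if (stored->committed < 0)
			return 1;	/* never set: fail closed */
		return memcmp(stored->data, from_commit->data,
			      sizeof(stored->data));
	}
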
@@ -212,6 +253,12 @@ static int nfs_direct_cmp_commit_data_verf(struct nfs_direct_req *dreq,
212 */ 253 */
213ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, loff_t pos) 254ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, loff_t pos)
214{ 255{
256 struct inode *inode = iocb->ki_filp->f_mapping->host;
257
 258 /* we only support swap files calling nfs_direct_IO */
259 if (!IS_SWAPFILE(inode))
260 return 0;
261
215#ifndef CONFIG_NFS_SWAP 262#ifndef CONFIG_NFS_SWAP
216 dprintk("NFS: nfs_direct_IO (%pD) off/no(%Ld/%lu) EINVAL\n", 263 dprintk("NFS: nfs_direct_IO (%pD) off/no(%Ld/%lu) EINVAL\n",
217 iocb->ki_filp, (long long) pos, iter->nr_segs); 264 iocb->ki_filp, (long long) pos, iter->nr_segs);
@@ -236,13 +283,25 @@ static void nfs_direct_release_pages(struct page **pages, unsigned int npages)
236void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo, 283void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo,
237 struct nfs_direct_req *dreq) 284 struct nfs_direct_req *dreq)
238{ 285{
239 cinfo->lock = &dreq->lock; 286 cinfo->lock = &dreq->inode->i_lock;
240 cinfo->mds = &dreq->mds_cinfo; 287 cinfo->mds = &dreq->mds_cinfo;
241 cinfo->ds = &dreq->ds_cinfo; 288 cinfo->ds = &dreq->ds_cinfo;
242 cinfo->dreq = dreq; 289 cinfo->dreq = dreq;
243 cinfo->completion_ops = &nfs_direct_commit_completion_ops; 290 cinfo->completion_ops = &nfs_direct_commit_completion_ops;
244} 291}
245 292
293static inline void nfs_direct_setup_mirroring(struct nfs_direct_req *dreq,
294 struct nfs_pageio_descriptor *pgio,
295 struct nfs_page *req)
296{
297 int mirror_count = 1;
298
299 if (pgio->pg_ops->pg_get_mirror_count)
300 mirror_count = pgio->pg_ops->pg_get_mirror_count(pgio, req);
301
302 dreq->mirror_count = mirror_count;
303}
304
246static inline struct nfs_direct_req *nfs_direct_req_alloc(void) 305static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
247{ 306{
248 struct nfs_direct_req *dreq; 307 struct nfs_direct_req *dreq;
@@ -257,6 +316,7 @@ static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
257 INIT_LIST_HEAD(&dreq->mds_cinfo.list); 316 INIT_LIST_HEAD(&dreq->mds_cinfo.list);
258 dreq->verf.committed = NFS_INVALID_STABLE_HOW; /* not set yet */ 317 dreq->verf.committed = NFS_INVALID_STABLE_HOW; /* not set yet */
259 INIT_WORK(&dreq->work, nfs_direct_write_schedule_work); 318 INIT_WORK(&dreq->work, nfs_direct_write_schedule_work);
319 dreq->mirror_count = 1;
260 spin_lock_init(&dreq->lock); 320 spin_lock_init(&dreq->lock);
261 321
262 return dreq; 322 return dreq;
@@ -363,7 +423,8 @@ static void nfs_direct_read_completion(struct nfs_pgio_header *hdr)
363 if (test_bit(NFS_IOHDR_ERROR, &hdr->flags) && (hdr->good_bytes == 0)) 423 if (test_bit(NFS_IOHDR_ERROR, &hdr->flags) && (hdr->good_bytes == 0))
364 dreq->error = hdr->error; 424 dreq->error = hdr->error;
365 else 425 else
366 dreq->count += hdr->good_bytes; 426 nfs_direct_good_bytes(dreq, hdr);
427
367 spin_unlock(&dreq->lock); 428 spin_unlock(&dreq->lock);
368 429
369 while (!list_empty(&hdr->pages)) { 430 while (!list_empty(&hdr->pages)) {
@@ -541,6 +602,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter,
541 602
542 dreq->inode = inode; 603 dreq->inode = inode;
543 dreq->bytes_left = count; 604 dreq->bytes_left = count;
605 dreq->io_start = pos;
544 dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); 606 dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
545 l_ctx = nfs_get_lock_context(dreq->ctx); 607 l_ctx = nfs_get_lock_context(dreq->ctx);
546 if (IS_ERR(l_ctx)) { 608 if (IS_ERR(l_ctx)) {
@@ -573,6 +635,20 @@ out:
573 return result; 635 return result;
574} 636}
575 637
638static void
639nfs_direct_write_scan_commit_list(struct inode *inode,
640 struct list_head *list,
641 struct nfs_commit_info *cinfo)
642{
643 spin_lock(cinfo->lock);
644#ifdef CONFIG_NFS_V4_1
645 if (cinfo->ds != NULL && cinfo->ds->nwritten != 0)
646 NFS_SERVER(inode)->pnfs_curr_ld->recover_commit_reqs(list, cinfo);
647#endif
648 nfs_scan_commit_list(&cinfo->mds->list, list, cinfo, 0);
649 spin_unlock(cinfo->lock);
650}
651
576static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) 652static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
577{ 653{
578 struct nfs_pageio_descriptor desc; 654 struct nfs_pageio_descriptor desc;
@@ -580,20 +656,23 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
580 LIST_HEAD(reqs); 656 LIST_HEAD(reqs);
581 struct nfs_commit_info cinfo; 657 struct nfs_commit_info cinfo;
582 LIST_HEAD(failed); 658 LIST_HEAD(failed);
659 int i;
583 660
584 nfs_init_cinfo_from_dreq(&cinfo, dreq); 661 nfs_init_cinfo_from_dreq(&cinfo, dreq);
585 pnfs_recover_commit_reqs(dreq->inode, &reqs, &cinfo); 662 nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo);
586 spin_lock(cinfo.lock);
587 nfs_scan_commit_list(&cinfo.mds->list, &reqs, &cinfo, 0);
588 spin_unlock(cinfo.lock);
589 663
590 dreq->count = 0; 664 dreq->count = 0;
665 for (i = 0; i < dreq->mirror_count; i++)
666 dreq->mirrors[i].count = 0;
591 get_dreq(dreq); 667 get_dreq(dreq);
592 668
593 nfs_pageio_init_write(&desc, dreq->inode, FLUSH_STABLE, false, 669 nfs_pageio_init_write(&desc, dreq->inode, FLUSH_STABLE, false,
594 &nfs_direct_write_completion_ops); 670 &nfs_direct_write_completion_ops);
595 desc.pg_dreq = dreq; 671 desc.pg_dreq = dreq;
596 672
673 req = nfs_list_entry(reqs.next);
674 nfs_direct_setup_mirroring(dreq, &desc, req);
675
597 list_for_each_entry_safe(req, tmp, &reqs, wb_list) { 676 list_for_each_entry_safe(req, tmp, &reqs, wb_list) {
598 if (!nfs_pageio_add_request(&desc, req)) { 677 if (!nfs_pageio_add_request(&desc, req)) {
599 nfs_list_remove_request(req); 678 nfs_list_remove_request(req);
@@ -640,7 +719,7 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data)
640 nfs_list_remove_request(req); 719 nfs_list_remove_request(req);
641 if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES) { 720 if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES) {
642 /* Note the rewrite will go through mds */ 721 /* Note the rewrite will go through mds */
643 nfs_mark_request_commit(req, NULL, &cinfo); 722 nfs_mark_request_commit(req, NULL, &cinfo, 0);
644 } else 723 } else
645 nfs_release_request(req); 724 nfs_release_request(req);
646 nfs_unlock_and_release_request(req); 725 nfs_unlock_and_release_request(req);
@@ -715,7 +794,7 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
715 dreq->error = hdr->error; 794 dreq->error = hdr->error;
716 } 795 }
717 if (dreq->error == 0) { 796 if (dreq->error == 0) {
718 dreq->count += hdr->good_bytes; 797 nfs_direct_good_bytes(dreq, hdr);
719 if (nfs_write_need_commit(hdr)) { 798 if (nfs_write_need_commit(hdr)) {
720 if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES) 799 if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES)
721 request_commit = true; 800 request_commit = true;
@@ -739,7 +818,8 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
739 nfs_list_remove_request(req); 818 nfs_list_remove_request(req);
740 if (request_commit) { 819 if (request_commit) {
741 kref_get(&req->wb_kref); 820 kref_get(&req->wb_kref);
742 nfs_mark_request_commit(req, hdr->lseg, &cinfo); 821 nfs_mark_request_commit(req, hdr->lseg, &cinfo,
822 hdr->ds_commit_idx);
743 } 823 }
744 nfs_unlock_and_release_request(req); 824 nfs_unlock_and_release_request(req);
745 } 825 }
@@ -820,6 +900,9 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
820 result = PTR_ERR(req); 900 result = PTR_ERR(req);
821 break; 901 break;
822 } 902 }
903
904 nfs_direct_setup_mirroring(dreq, &desc, req);
905
823 nfs_lock_request(req); 906 nfs_lock_request(req);
824 req->wb_index = pos >> PAGE_SHIFT; 907 req->wb_index = pos >> PAGE_SHIFT;
825 req->wb_offset = pos & ~PAGE_MASK; 908 req->wb_offset = pos & ~PAGE_MASK;
@@ -928,6 +1011,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter,
928 1011
929 dreq->inode = inode; 1012 dreq->inode = inode;
930 dreq->bytes_left = count; 1013 dreq->bytes_left = count;
1014 dreq->io_start = pos;
931 dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); 1015 dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp));
932 l_ctx = nfs_get_lock_context(dreq->ctx); 1016 l_ctx = nfs_get_lock_context(dreq->ctx);
933 if (IS_ERR(l_ctx)) { 1017 if (IS_ERR(l_ctx)) {
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 2ab6f00dba5b..94712fc781fa 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -646,7 +646,6 @@ static const struct vm_operations_struct nfs_file_vm_ops = {
646 .fault = filemap_fault, 646 .fault = filemap_fault,
647 .map_pages = filemap_map_pages, 647 .map_pages = filemap_map_pages,
648 .page_mkwrite = nfs_vm_page_mkwrite, 648 .page_mkwrite = nfs_vm_page_mkwrite,
649 .remap_pages = generic_file_remap_pages,
650}; 649};
651 650
652static int nfs_need_sync_write(struct file *filp, struct inode *inode) 651static int nfs_need_sync_write(struct file *filp, struct inode *inode)
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index 7afb52f6a25a..91e88a7ecef0 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -118,13 +118,6 @@ static void filelayout_reset_read(struct nfs_pgio_header *hdr)
118 } 118 }
119} 119}
120 120
121static void filelayout_fenceme(struct inode *inode, struct pnfs_layout_hdr *lo)
122{
123 if (!test_and_clear_bit(NFS_LAYOUT_RETURN, &lo->plh_flags))
124 return;
125 pnfs_return_layout(inode);
126}
127
128static int filelayout_async_handle_error(struct rpc_task *task, 121static int filelayout_async_handle_error(struct rpc_task *task,
129 struct nfs4_state *state, 122 struct nfs4_state *state,
130 struct nfs_client *clp, 123 struct nfs_client *clp,
@@ -207,7 +200,7 @@ static int filelayout_async_handle_error(struct rpc_task *task,
207 dprintk("%s DS connection error %d\n", __func__, 200 dprintk("%s DS connection error %d\n", __func__,
208 task->tk_status); 201 task->tk_status);
209 nfs4_mark_deviceid_unavailable(devid); 202 nfs4_mark_deviceid_unavailable(devid);
210 set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags); 203 pnfs_error_mark_layout_for_return(inode, lseg);
211 rpc_wake_up(&tbl->slot_tbl_waitq); 204 rpc_wake_up(&tbl->slot_tbl_waitq);
212 /* fall through */ 205 /* fall through */
213 default: 206 default:
@@ -339,16 +332,6 @@ static void filelayout_read_count_stats(struct rpc_task *task, void *data)
339 rpc_count_iostats(task, NFS_SERVER(hdr->inode)->client->cl_metrics); 332 rpc_count_iostats(task, NFS_SERVER(hdr->inode)->client->cl_metrics);
340} 333}
341 334
342static void filelayout_read_release(void *data)
343{
344 struct nfs_pgio_header *hdr = data;
345 struct pnfs_layout_hdr *lo = hdr->lseg->pls_layout;
346
347 filelayout_fenceme(lo->plh_inode, lo);
348 nfs_put_client(hdr->ds_clp);
349 hdr->mds_ops->rpc_release(data);
350}
351
352static int filelayout_write_done_cb(struct rpc_task *task, 335static int filelayout_write_done_cb(struct rpc_task *task,
353 struct nfs_pgio_header *hdr) 336 struct nfs_pgio_header *hdr)
354{ 337{
@@ -371,17 +354,6 @@ static int filelayout_write_done_cb(struct rpc_task *task,
371 return 0; 354 return 0;
372} 355}
373 356
374/* Fake up some data that will cause nfs_commit_release to retry the writes. */
375static void prepare_to_resend_writes(struct nfs_commit_data *data)
376{
377 struct nfs_page *first = nfs_list_entry(data->pages.next);
378
379 data->task.tk_status = 0;
380 memcpy(&data->verf.verifier, &first->wb_verf,
381 sizeof(data->verf.verifier));
382 data->verf.verifier.data[0]++; /* ensure verifier mismatch */
383}
384
385static int filelayout_commit_done_cb(struct rpc_task *task, 357static int filelayout_commit_done_cb(struct rpc_task *task,
386 struct nfs_commit_data *data) 358 struct nfs_commit_data *data)
387{ 359{
@@ -393,7 +365,7 @@ static int filelayout_commit_done_cb(struct rpc_task *task,
393 365
394 switch (err) { 366 switch (err) {
395 case -NFS4ERR_RESET_TO_MDS: 367 case -NFS4ERR_RESET_TO_MDS:
396 prepare_to_resend_writes(data); 368 pnfs_generic_prepare_to_resend_writes(data);
397 return -EAGAIN; 369 return -EAGAIN;
398 case -EAGAIN: 370 case -EAGAIN:
399 rpc_restart_call_prepare(task); 371 rpc_restart_call_prepare(task);
@@ -451,16 +423,6 @@ static void filelayout_write_count_stats(struct rpc_task *task, void *data)
451 rpc_count_iostats(task, NFS_SERVER(hdr->inode)->client->cl_metrics); 423 rpc_count_iostats(task, NFS_SERVER(hdr->inode)->client->cl_metrics);
452} 424}
453 425
454static void filelayout_write_release(void *data)
455{
456 struct nfs_pgio_header *hdr = data;
457 struct pnfs_layout_hdr *lo = hdr->lseg->pls_layout;
458
459 filelayout_fenceme(lo->plh_inode, lo);
460 nfs_put_client(hdr->ds_clp);
461 hdr->mds_ops->rpc_release(data);
462}
463
464static void filelayout_commit_prepare(struct rpc_task *task, void *data) 426static void filelayout_commit_prepare(struct rpc_task *task, void *data)
465{ 427{
466 struct nfs_commit_data *wdata = data; 428 struct nfs_commit_data *wdata = data;
@@ -471,14 +433,6 @@ static void filelayout_commit_prepare(struct rpc_task *task, void *data)
471 task); 433 task);
472} 434}
473 435
474static void filelayout_write_commit_done(struct rpc_task *task, void *data)
475{
476 struct nfs_commit_data *wdata = data;
477
478 /* Note this may cause RPC to be resent */
479 wdata->mds_ops->rpc_call_done(task, data);
480}
481
482static void filelayout_commit_count_stats(struct rpc_task *task, void *data) 436static void filelayout_commit_count_stats(struct rpc_task *task, void *data)
483{ 437{
484 struct nfs_commit_data *cdata = data; 438 struct nfs_commit_data *cdata = data;
@@ -486,35 +440,25 @@ static void filelayout_commit_count_stats(struct rpc_task *task, void *data)
486 rpc_count_iostats(task, NFS_SERVER(cdata->inode)->client->cl_metrics); 440 rpc_count_iostats(task, NFS_SERVER(cdata->inode)->client->cl_metrics);
487} 441}
488 442
489static void filelayout_commit_release(void *calldata)
490{
491 struct nfs_commit_data *data = calldata;
492
493 data->completion_ops->completion(data);
494 pnfs_put_lseg(data->lseg);
495 nfs_put_client(data->ds_clp);
496 nfs_commitdata_release(data);
497}
498
499static const struct rpc_call_ops filelayout_read_call_ops = { 443static const struct rpc_call_ops filelayout_read_call_ops = {
500 .rpc_call_prepare = filelayout_read_prepare, 444 .rpc_call_prepare = filelayout_read_prepare,
501 .rpc_call_done = filelayout_read_call_done, 445 .rpc_call_done = filelayout_read_call_done,
502 .rpc_count_stats = filelayout_read_count_stats, 446 .rpc_count_stats = filelayout_read_count_stats,
503 .rpc_release = filelayout_read_release, 447 .rpc_release = pnfs_generic_rw_release,
504}; 448};
505 449
506static const struct rpc_call_ops filelayout_write_call_ops = { 450static const struct rpc_call_ops filelayout_write_call_ops = {
507 .rpc_call_prepare = filelayout_write_prepare, 451 .rpc_call_prepare = filelayout_write_prepare,
508 .rpc_call_done = filelayout_write_call_done, 452 .rpc_call_done = filelayout_write_call_done,
509 .rpc_count_stats = filelayout_write_count_stats, 453 .rpc_count_stats = filelayout_write_count_stats,
510 .rpc_release = filelayout_write_release, 454 .rpc_release = pnfs_generic_rw_release,
511}; 455};
512 456
513static const struct rpc_call_ops filelayout_commit_call_ops = { 457static const struct rpc_call_ops filelayout_commit_call_ops = {
514 .rpc_call_prepare = filelayout_commit_prepare, 458 .rpc_call_prepare = filelayout_commit_prepare,
515 .rpc_call_done = filelayout_write_commit_done, 459 .rpc_call_done = pnfs_generic_write_commit_done,
516 .rpc_count_stats = filelayout_commit_count_stats, 460 .rpc_count_stats = filelayout_commit_count_stats,
517 .rpc_release = filelayout_commit_release, 461 .rpc_release = pnfs_generic_commit_release,
518}; 462};
519 463
520static enum pnfs_try_status 464static enum pnfs_try_status
@@ -548,7 +492,7 @@ filelayout_read_pagelist(struct nfs_pgio_header *hdr)
548 /* No multipath support. Use first DS */ 492 /* No multipath support. Use first DS */
549 atomic_inc(&ds->ds_clp->cl_count); 493 atomic_inc(&ds->ds_clp->cl_count);
550 hdr->ds_clp = ds->ds_clp; 494 hdr->ds_clp = ds->ds_clp;
551 hdr->ds_idx = idx; 495 hdr->ds_commit_idx = idx;
552 fh = nfs4_fl_select_ds_fh(lseg, j); 496 fh = nfs4_fl_select_ds_fh(lseg, j);
553 if (fh) 497 if (fh)
554 hdr->args.fh = fh; 498 hdr->args.fh = fh;
@@ -557,8 +501,9 @@ filelayout_read_pagelist(struct nfs_pgio_header *hdr)
557 hdr->mds_offset = offset; 501 hdr->mds_offset = offset;
558 502
559 /* Perform an asynchronous read to ds */ 503 /* Perform an asynchronous read to ds */
560 nfs_initiate_pgio(ds_clnt, hdr, 504 nfs_initiate_pgio(ds_clnt, hdr, hdr->cred,
561 &filelayout_read_call_ops, 0, RPC_TASK_SOFTCONN); 505 NFS_PROTO(hdr->inode), &filelayout_read_call_ops,
506 0, RPC_TASK_SOFTCONN);
562 return PNFS_ATTEMPTED; 507 return PNFS_ATTEMPTED;
563} 508}
564 509
@@ -591,16 +536,16 @@ filelayout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
591 hdr->pgio_done_cb = filelayout_write_done_cb; 536 hdr->pgio_done_cb = filelayout_write_done_cb;
592 atomic_inc(&ds->ds_clp->cl_count); 537 atomic_inc(&ds->ds_clp->cl_count);
593 hdr->ds_clp = ds->ds_clp; 538 hdr->ds_clp = ds->ds_clp;
594 hdr->ds_idx = idx; 539 hdr->ds_commit_idx = idx;
595 fh = nfs4_fl_select_ds_fh(lseg, j); 540 fh = nfs4_fl_select_ds_fh(lseg, j);
596 if (fh) 541 if (fh)
597 hdr->args.fh = fh; 542 hdr->args.fh = fh;
598 hdr->args.offset = filelayout_get_dserver_offset(lseg, offset); 543 hdr->args.offset = filelayout_get_dserver_offset(lseg, offset);
599 544
600 /* Perform an asynchronous write */ 545 /* Perform an asynchronous write */
601 nfs_initiate_pgio(ds_clnt, hdr, 546 nfs_initiate_pgio(ds_clnt, hdr, hdr->cred,
602 &filelayout_write_call_ops, sync, 547 NFS_PROTO(hdr->inode), &filelayout_write_call_ops,
603 RPC_TASK_SOFTCONN); 548 sync, RPC_TASK_SOFTCONN);
604 return PNFS_ATTEMPTED; 549 return PNFS_ATTEMPTED;
605} 550}
606 551
@@ -988,12 +933,14 @@ static const struct nfs_pageio_ops filelayout_pg_read_ops = {
988 .pg_init = filelayout_pg_init_read, 933 .pg_init = filelayout_pg_init_read,
989 .pg_test = filelayout_pg_test, 934 .pg_test = filelayout_pg_test,
990 .pg_doio = pnfs_generic_pg_readpages, 935 .pg_doio = pnfs_generic_pg_readpages,
936 .pg_cleanup = pnfs_generic_pg_cleanup,
991}; 937};
992 938
993static const struct nfs_pageio_ops filelayout_pg_write_ops = { 939static const struct nfs_pageio_ops filelayout_pg_write_ops = {
994 .pg_init = filelayout_pg_init_write, 940 .pg_init = filelayout_pg_init_write,
995 .pg_test = filelayout_pg_test, 941 .pg_test = filelayout_pg_test,
996 .pg_doio = pnfs_generic_pg_writepages, 942 .pg_doio = pnfs_generic_pg_writepages,
943 .pg_cleanup = pnfs_generic_pg_cleanup,
997}; 944};
998 945
999static u32 select_bucket_index(struct nfs4_filelayout_segment *fl, u32 j) 946static u32 select_bucket_index(struct nfs4_filelayout_segment *fl, u32 j)
@@ -1004,87 +951,28 @@ static u32 select_bucket_index(struct nfs4_filelayout_segment *fl, u32 j)
1004 return j; 951 return j;
1005} 952}
1006 953
1007/* The generic layer is about to remove the req from the commit list.
1008 * If this will make the bucket empty, it will need to put the lseg reference.
1009 * Note this must be called holding the inode (/cinfo) lock
1010 */
1011static void
1012filelayout_clear_request_commit(struct nfs_page *req,
1013 struct nfs_commit_info *cinfo)
1014{
1015 struct pnfs_layout_segment *freeme = NULL;
1016
1017 if (!test_and_clear_bit(PG_COMMIT_TO_DS, &req->wb_flags))
1018 goto out;
1019 cinfo->ds->nwritten--;
1020 if (list_is_singular(&req->wb_list)) {
1021 struct pnfs_commit_bucket *bucket;
1022
1023 bucket = list_first_entry(&req->wb_list,
1024 struct pnfs_commit_bucket,
1025 written);
1026 freeme = bucket->wlseg;
1027 bucket->wlseg = NULL;
1028 }
1029out:
1030 nfs_request_remove_commit_list(req, cinfo);
1031 pnfs_put_lseg_locked(freeme);
1032}
1033
1034static void 954static void
1035filelayout_mark_request_commit(struct nfs_page *req, 955filelayout_mark_request_commit(struct nfs_page *req,
1036 struct pnfs_layout_segment *lseg, 956 struct pnfs_layout_segment *lseg,
1037 struct nfs_commit_info *cinfo) 957 struct nfs_commit_info *cinfo,
958 u32 ds_commit_idx)
1038 959
1039{ 960{
1040 struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg); 961 struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
1041 u32 i, j; 962 u32 i, j;
1042 struct list_head *list;
1043 struct pnfs_commit_bucket *buckets;
1044 963
1045 if (fl->commit_through_mds) { 964 if (fl->commit_through_mds) {
1046 list = &cinfo->mds->list; 965 nfs_request_add_commit_list(req, &cinfo->mds->list, cinfo);
1047 spin_lock(cinfo->lock); 966 } else {
1048 goto mds_commit; 967 /* Note that we are calling nfs4_fl_calc_j_index on each page
1049 } 968 * that ends up being committed to a data server. An attractive
1050 969 * alternative is to add a field to nfs_write_data and nfs_page
1051 /* Note that we are calling nfs4_fl_calc_j_index on each page 970 * to store the value calculated in filelayout_write_pagelist
1052 * that ends up being committed to a data server. An attractive 971 * and just use that here.
1053 * alternative is to add a field to nfs_write_data and nfs_page
1054 * to store the value calculated in filelayout_write_pagelist
1055 * and just use that here.
1056 */
1057 j = nfs4_fl_calc_j_index(lseg, req_offset(req));
1058 i = select_bucket_index(fl, j);
1059 spin_lock(cinfo->lock);
1060 buckets = cinfo->ds->buckets;
1061 list = &buckets[i].written;
1062 if (list_empty(list)) {
1063 /* Non-empty buckets hold a reference on the lseg. That ref
1064 * is normally transferred to the COMMIT call and released
1065 * there. It could also be released if the last req is pulled
1066 * off due to a rewrite, in which case it will be done in
1067 * filelayout_clear_request_commit
1068 */ 972 */
1069 buckets[i].wlseg = pnfs_get_lseg(lseg); 973 j = nfs4_fl_calc_j_index(lseg, req_offset(req));
1070 } 974 i = select_bucket_index(fl, j);
1071 set_bit(PG_COMMIT_TO_DS, &req->wb_flags); 975 pnfs_layout_mark_request_commit(req, lseg, cinfo, i);
1072 cinfo->ds->nwritten++;
1073
1074mds_commit:
1075 /* nfs_request_add_commit_list(). We need to add req to list without
1076 * dropping cinfo lock.
1077 */
1078 set_bit(PG_CLEAN, &(req)->wb_flags);
1079 nfs_list_add_request(req, list);
1080 cinfo->mds->ncommit++;
1081 spin_unlock(cinfo->lock);
1082 if (!cinfo->dreq) {
1083 inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS);
1084 inc_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info,
1085 BDI_RECLAIMABLE);
1086 __mark_inode_dirty(req->wb_context->dentry->d_inode,
1087 I_DIRTY_DATASYNC);
1088 } 976 }
1089} 977}
1090 978
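
With the open-coded bucket bookkeeping gone, the driver-specific part
of filelayout_mark_request_commit() reduces to picking the commit
bucket: nfs4_fl_calc_j_index() maps the request's file offset to a
stripe index, select_bucket_index() folds that into a bucket, and
pnfs_layout_mark_request_commit() does the shared work. As a rough
model only (the real calculation also handles the layout's pattern
offset and the STRIPE_SPARSE vs STRIPE_DENSE cases):

	static unsigned int bucket_for_offset(unsigned long long offset,
					      unsigned long long stripe_unit,
					      unsigned int stripe_count)
	{
		/* which stripe does this offset fall in? */
		unsigned long long j = offset / stripe_unit;

		return (unsigned int)(j % stripe_count);	/* commit bucket */
	}
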
@@ -1138,101 +1026,15 @@ static int filelayout_initiate_commit(struct nfs_commit_data *data, int how)
1138 fh = select_ds_fh_from_commit(lseg, data->ds_commit_index); 1026 fh = select_ds_fh_from_commit(lseg, data->ds_commit_index);
1139 if (fh) 1027 if (fh)
1140 data->args.fh = fh; 1028 data->args.fh = fh;
1141 return nfs_initiate_commit(ds_clnt, data, 1029 return nfs_initiate_commit(ds_clnt, data, NFS_PROTO(data->inode),
1142 &filelayout_commit_call_ops, how, 1030 &filelayout_commit_call_ops, how,
1143 RPC_TASK_SOFTCONN); 1031 RPC_TASK_SOFTCONN);
1144out_err: 1032out_err:
1145 prepare_to_resend_writes(data); 1033 pnfs_generic_prepare_to_resend_writes(data);
1146 filelayout_commit_release(data); 1034 pnfs_generic_commit_release(data);
1147 return -EAGAIN; 1035 return -EAGAIN;
1148} 1036}
1149 1037
1150static int
1151transfer_commit_list(struct list_head *src, struct list_head *dst,
1152 struct nfs_commit_info *cinfo, int max)
1153{
1154 struct nfs_page *req, *tmp;
1155 int ret = 0;
1156
1157 list_for_each_entry_safe(req, tmp, src, wb_list) {
1158 if (!nfs_lock_request(req))
1159 continue;
1160 kref_get(&req->wb_kref);
1161 if (cond_resched_lock(cinfo->lock))
1162 list_safe_reset_next(req, tmp, wb_list);
1163 nfs_request_remove_commit_list(req, cinfo);
1164 clear_bit(PG_COMMIT_TO_DS, &req->wb_flags);
1165 nfs_list_add_request(req, dst);
1166 ret++;
1167 if ((ret == max) && !cinfo->dreq)
1168 break;
1169 }
1170 return ret;
1171}
1172
1173/* Note called with cinfo->lock held. */
1174static int
1175filelayout_scan_ds_commit_list(struct pnfs_commit_bucket *bucket,
1176 struct nfs_commit_info *cinfo,
1177 int max)
1178{
1179 struct list_head *src = &bucket->written;
1180 struct list_head *dst = &bucket->committing;
1181 int ret;
1182
1183 ret = transfer_commit_list(src, dst, cinfo, max);
1184 if (ret) {
1185 cinfo->ds->nwritten -= ret;
1186 cinfo->ds->ncommitting += ret;
1187 bucket->clseg = bucket->wlseg;
1188 if (list_empty(src))
1189 bucket->wlseg = NULL;
1190 else
1191 pnfs_get_lseg(bucket->clseg);
1192 }
1193 return ret;
1194}
1195
1196/* Move reqs from written to committing lists, returning count of number moved.
1197 * Note called with cinfo->lock held.
1198 */
1199static int filelayout_scan_commit_lists(struct nfs_commit_info *cinfo,
1200 int max)
1201{
1202 int i, rv = 0, cnt;
1203
1204 for (i = 0; i < cinfo->ds->nbuckets && max != 0; i++) {
1205 cnt = filelayout_scan_ds_commit_list(&cinfo->ds->buckets[i],
1206 cinfo, max);
1207 max -= cnt;
1208 rv += cnt;
1209 }
1210 return rv;
1211}
1212
1213/* Pull everything off the committing lists and dump into @dst */
1214static void filelayout_recover_commit_reqs(struct list_head *dst,
1215 struct nfs_commit_info *cinfo)
1216{
1217 struct pnfs_commit_bucket *b;
1218 struct pnfs_layout_segment *freeme;
1219 int i;
1220
1221restart:
1222 spin_lock(cinfo->lock);
1223 for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) {
1224 if (transfer_commit_list(&b->written, dst, cinfo, 0)) {
1225 freeme = b->wlseg;
1226 b->wlseg = NULL;
1227 spin_unlock(cinfo->lock);
1228 pnfs_put_lseg(freeme);
1229 goto restart;
1230 }
1231 }
1232 cinfo->ds->nwritten = 0;
1233 spin_unlock(cinfo->lock);
1234}
1235
1236/* filelayout_search_commit_reqs - Search lists in @cinfo for the head request 1038/* filelayout_search_commit_reqs - Search lists in @cinfo for the head request
1237 * for @page 1039 * for @page
1238 * @cinfo - commit info for current inode 1040 * @cinfo - commit info for current inode
@@ -1263,108 +1065,14 @@ filelayout_search_commit_reqs(struct nfs_commit_info *cinfo, struct page *page)
1263 return NULL; 1065 return NULL;
1264} 1066}
1265 1067
1266static void filelayout_retry_commit(struct nfs_commit_info *cinfo, int idx)
1267{
1268 struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
1269 struct pnfs_commit_bucket *bucket;
1270 struct pnfs_layout_segment *freeme;
1271 int i;
1272
1273 for (i = idx; i < fl_cinfo->nbuckets; i++) {
1274 bucket = &fl_cinfo->buckets[i];
1275 if (list_empty(&bucket->committing))
1276 continue;
1277 nfs_retry_commit(&bucket->committing, bucket->clseg, cinfo);
1278 spin_lock(cinfo->lock);
1279 freeme = bucket->clseg;
1280 bucket->clseg = NULL;
1281 spin_unlock(cinfo->lock);
1282 pnfs_put_lseg(freeme);
1283 }
1284}
1285
1286static unsigned int
1287alloc_ds_commits(struct nfs_commit_info *cinfo, struct list_head *list)
1288{
1289 struct pnfs_ds_commit_info *fl_cinfo;
1290 struct pnfs_commit_bucket *bucket;
1291 struct nfs_commit_data *data;
1292 int i;
1293 unsigned int nreq = 0;
1294
1295 fl_cinfo = cinfo->ds;
1296 bucket = fl_cinfo->buckets;
1297 for (i = 0; i < fl_cinfo->nbuckets; i++, bucket++) {
1298 if (list_empty(&bucket->committing))
1299 continue;
1300 data = nfs_commitdata_alloc();
1301 if (!data)
1302 break;
1303 data->ds_commit_index = i;
1304 spin_lock(cinfo->lock);
1305 data->lseg = bucket->clseg;
1306 bucket->clseg = NULL;
1307 spin_unlock(cinfo->lock);
1308 list_add(&data->pages, list);
1309 nreq++;
1310 }
1311
1312 /* Clean up on error */
1313 filelayout_retry_commit(cinfo, i);
1314 /* Caller will clean up entries put on list */
1315 return nreq;
1316}
1317
1318/* This follows nfs_commit_list pretty closely */
1319static int 1068static int
1320filelayout_commit_pagelist(struct inode *inode, struct list_head *mds_pages, 1069filelayout_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
1321 int how, struct nfs_commit_info *cinfo) 1070 int how, struct nfs_commit_info *cinfo)
1322{ 1071{
1323 struct nfs_commit_data *data, *tmp; 1072 return pnfs_generic_commit_pagelist(inode, mds_pages, how, cinfo,
1324 LIST_HEAD(list); 1073 filelayout_initiate_commit);
1325 unsigned int nreq = 0;
1326
1327 if (!list_empty(mds_pages)) {
1328 data = nfs_commitdata_alloc();
1329 if (data != NULL) {
1330 data->lseg = NULL;
1331 list_add(&data->pages, &list);
1332 nreq++;
1333 } else {
1334 nfs_retry_commit(mds_pages, NULL, cinfo);
1335 filelayout_retry_commit(cinfo, 0);
1336 cinfo->completion_ops->error_cleanup(NFS_I(inode));
1337 return -ENOMEM;
1338 }
1339 }
1340
1341 nreq += alloc_ds_commits(cinfo, &list);
1342
1343 if (nreq == 0) {
1344 cinfo->completion_ops->error_cleanup(NFS_I(inode));
1345 goto out;
1346 }
1347
1348 atomic_add(nreq, &cinfo->mds->rpcs_out);
1349
1350 list_for_each_entry_safe(data, tmp, &list, pages) {
1351 list_del_init(&data->pages);
1352 if (!data->lseg) {
1353 nfs_init_commit(data, mds_pages, NULL, cinfo);
1354 nfs_initiate_commit(NFS_CLIENT(inode), data,
1355 data->mds_ops, how, 0);
1356 } else {
1357 struct pnfs_commit_bucket *buckets;
1358
1359 buckets = cinfo->ds->buckets;
1360 nfs_init_commit(data, &buckets[data->ds_commit_index].committing, data->lseg, cinfo);
1361 filelayout_initiate_commit(data, how);
1362 }
1363 }
1364out:
1365 cinfo->ds->ncommitting = 0;
1366 return PNFS_ATTEMPTED;
1367} 1074}
1075
1368static struct nfs4_deviceid_node * 1076static struct nfs4_deviceid_node *
1369filelayout_alloc_deviceid_node(struct nfs_server *server, 1077filelayout_alloc_deviceid_node(struct nfs_server *server,
1370 struct pnfs_device *pdev, gfp_t gfp_flags) 1078 struct pnfs_device *pdev, gfp_t gfp_flags)
@@ -1421,9 +1129,9 @@ static struct pnfs_layoutdriver_type filelayout_type = {
1421 .pg_write_ops = &filelayout_pg_write_ops, 1129 .pg_write_ops = &filelayout_pg_write_ops,
1422 .get_ds_info = &filelayout_get_ds_info, 1130 .get_ds_info = &filelayout_get_ds_info,
1423 .mark_request_commit = filelayout_mark_request_commit, 1131 .mark_request_commit = filelayout_mark_request_commit,
1424 .clear_request_commit = filelayout_clear_request_commit, 1132 .clear_request_commit = pnfs_generic_clear_request_commit,
1425 .scan_commit_lists = filelayout_scan_commit_lists, 1133 .scan_commit_lists = pnfs_generic_scan_commit_lists,
1426 .recover_commit_reqs = filelayout_recover_commit_reqs, 1134 .recover_commit_reqs = pnfs_generic_recover_commit_reqs,
1427 .search_commit_reqs = filelayout_search_commit_reqs, 1135 .search_commit_reqs = filelayout_search_commit_reqs,
1428 .commit_pagelist = filelayout_commit_pagelist, 1136 .commit_pagelist = filelayout_commit_pagelist,
1429 .read_pagelist = filelayout_read_pagelist, 1137 .read_pagelist = filelayout_read_pagelist,
diff --git a/fs/nfs/filelayout/filelayout.h b/fs/nfs/filelayout/filelayout.h
index 7c9f800c49d7..2896cb833a11 100644
--- a/fs/nfs/filelayout/filelayout.h
+++ b/fs/nfs/filelayout/filelayout.h
@@ -33,13 +33,6 @@
33#include "../pnfs.h" 33#include "../pnfs.h"
34 34
35/* 35/*
 36 * Default data server connection timeout and retrans values.
 37 * Set by module parameters dataserver_timeo and dataserver_retrans.
38 */
39#define NFS4_DEF_DS_TIMEO 600 /* in tenths of a second */
40#define NFS4_DEF_DS_RETRANS 5
41
42/*
43 * Field testing shows we need to support up to 4096 stripe indices. 36 * Field testing shows we need to support up to 4096 stripe indices.
44 * We store each index as a u8 (u32 on the wire) to keep the memory footprint 37 * We store each index as a u8 (u32 on the wire) to keep the memory footprint
45 * reasonable. This in turn means we support a maximum of 256 38 * reasonable. This in turn means we support a maximum of 256
@@ -48,32 +41,11 @@
48#define NFS4_PNFS_MAX_STRIPE_CNT 4096 41#define NFS4_PNFS_MAX_STRIPE_CNT 4096
49#define NFS4_PNFS_MAX_MULTI_CNT 256 /* 256 fit into a u8 stripe_index */ 42#define NFS4_PNFS_MAX_MULTI_CNT 256 /* 256 fit into a u8 stripe_index */
50 43
51/* error codes for internal use */
52#define NFS4ERR_RESET_TO_MDS 12001
53
54enum stripetype4 { 44enum stripetype4 {
55 STRIPE_SPARSE = 1, 45 STRIPE_SPARSE = 1,
56 STRIPE_DENSE = 2 46 STRIPE_DENSE = 2
57}; 47};
58 48
59/* Individual ip address */
60struct nfs4_pnfs_ds_addr {
61 struct sockaddr_storage da_addr;
62 size_t da_addrlen;
63 struct list_head da_node; /* nfs4_pnfs_dev_hlist dev_dslist */
64 char *da_remotestr; /* human readable addr+port */
65};
66
67struct nfs4_pnfs_ds {
68 struct list_head ds_node; /* nfs4_pnfs_dev_hlist dev_dslist */
69 char *ds_remotestr; /* comma sep list of addrs */
70 struct list_head ds_addrs;
71 struct nfs_client *ds_clp;
72 atomic_t ds_count;
73 unsigned long ds_state;
74#define NFS4DS_CONNECTING 0 /* ds is establishing connection */
75};
76
77struct nfs4_file_layout_dsaddr { 49struct nfs4_file_layout_dsaddr {
78 struct nfs4_deviceid_node id_node; 50 struct nfs4_deviceid_node id_node;
79 u32 stripe_count; 51 u32 stripe_count;
@@ -119,17 +91,6 @@ FILELAYOUT_DEVID_NODE(struct pnfs_layout_segment *lseg)
119 return &FILELAYOUT_LSEG(lseg)->dsaddr->id_node; 91 return &FILELAYOUT_LSEG(lseg)->dsaddr->id_node;
120} 92}
121 93
122static inline void
123filelayout_mark_devid_invalid(struct nfs4_deviceid_node *node)
124{
125 u32 *p = (u32 *)&node->deviceid;
126
127 printk(KERN_WARNING "NFS: Deviceid [%x%x%x%x] marked out of use.\n",
128 p[0], p[1], p[2], p[3]);
129
130 set_bit(NFS_DEVICEID_INVALID, &node->flags);
131}
132
133static inline bool 94static inline bool
134filelayout_test_devid_invalid(struct nfs4_deviceid_node *node) 95filelayout_test_devid_invalid(struct nfs4_deviceid_node *node)
135{ 96{
@@ -142,7 +103,6 @@ filelayout_test_devid_unavailable(struct nfs4_deviceid_node *node);
142extern struct nfs_fh * 103extern struct nfs_fh *
143nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j); 104nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j);
144 105
145extern void print_ds(struct nfs4_pnfs_ds *ds);
146u32 nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset); 106u32 nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset);
147u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j); 107u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j);
148struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, 108struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg,
diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c
index bfecac781f19..4f372e224603 100644
--- a/fs/nfs/filelayout/filelayoutdev.c
+++ b/fs/nfs/filelayout/filelayoutdev.c
@@ -31,7 +31,6 @@
31#include <linux/nfs_fs.h> 31#include <linux/nfs_fs.h>
32#include <linux/vmalloc.h> 32#include <linux/vmalloc.h>
33#include <linux/module.h> 33#include <linux/module.h>
34#include <linux/sunrpc/addr.h>
35 34
36#include "../internal.h" 35#include "../internal.h"
37#include "../nfs4session.h" 36#include "../nfs4session.h"
@@ -42,183 +41,6 @@
42static unsigned int dataserver_timeo = NFS4_DEF_DS_TIMEO; 41static unsigned int dataserver_timeo = NFS4_DEF_DS_TIMEO;
43static unsigned int dataserver_retrans = NFS4_DEF_DS_RETRANS; 42static unsigned int dataserver_retrans = NFS4_DEF_DS_RETRANS;
44 43
45/*
46 * Data server cache
47 *
48 * Data servers can be mapped to different device ids.
49 * nfs4_pnfs_ds reference counting
50 * - set to 1 on allocation
51 * - incremented when a device id maps a data server already in the cache.
52 * - decremented when deviceid is removed from the cache.
53 */
54static DEFINE_SPINLOCK(nfs4_ds_cache_lock);
55static LIST_HEAD(nfs4_data_server_cache);
56
57/* Debug routines */
58void
59print_ds(struct nfs4_pnfs_ds *ds)
60{
61 if (ds == NULL) {
62 printk("%s NULL device\n", __func__);
63 return;
64 }
65 printk(" ds %s\n"
66 " ref count %d\n"
67 " client %p\n"
68 " cl_exchange_flags %x\n",
69 ds->ds_remotestr,
70 atomic_read(&ds->ds_count), ds->ds_clp,
71 ds->ds_clp ? ds->ds_clp->cl_exchange_flags : 0);
72}
73
74static bool
75same_sockaddr(struct sockaddr *addr1, struct sockaddr *addr2)
76{
77 struct sockaddr_in *a, *b;
78 struct sockaddr_in6 *a6, *b6;
79
80 if (addr1->sa_family != addr2->sa_family)
81 return false;
82
83 switch (addr1->sa_family) {
84 case AF_INET:
85 a = (struct sockaddr_in *)addr1;
86 b = (struct sockaddr_in *)addr2;
87
88 if (a->sin_addr.s_addr == b->sin_addr.s_addr &&
89 a->sin_port == b->sin_port)
90 return true;
91 break;
92
93 case AF_INET6:
94 a6 = (struct sockaddr_in6 *)addr1;
95 b6 = (struct sockaddr_in6 *)addr2;
96
97 /* LINKLOCAL addresses must have matching scope_id */
98 if (ipv6_addr_src_scope(&a6->sin6_addr) ==
99 IPV6_ADDR_SCOPE_LINKLOCAL &&
100 a6->sin6_scope_id != b6->sin6_scope_id)
101 return false;
102
103 if (ipv6_addr_equal(&a6->sin6_addr, &b6->sin6_addr) &&
104 a6->sin6_port == b6->sin6_port)
105 return true;
106 break;
107
108 default:
109 dprintk("%s: unhandled address family: %u\n",
110 __func__, addr1->sa_family);
111 return false;
112 }
113
114 return false;
115}
116
117static bool
118_same_data_server_addrs_locked(const struct list_head *dsaddrs1,
119 const struct list_head *dsaddrs2)
120{
121 struct nfs4_pnfs_ds_addr *da1, *da2;
122
123 /* step through both lists, comparing as we go */
124 for (da1 = list_first_entry(dsaddrs1, typeof(*da1), da_node),
125 da2 = list_first_entry(dsaddrs2, typeof(*da2), da_node);
126 da1 != NULL && da2 != NULL;
127 da1 = list_entry(da1->da_node.next, typeof(*da1), da_node),
128 da2 = list_entry(da2->da_node.next, typeof(*da2), da_node)) {
129 if (!same_sockaddr((struct sockaddr *)&da1->da_addr,
130 (struct sockaddr *)&da2->da_addr))
131 return false;
132 }
133 if (da1 == NULL && da2 == NULL)
134 return true;
135
136 return false;
137}
138
139/*
140 * Lookup DS by addresses. nfs4_ds_cache_lock is held
141 */
142static struct nfs4_pnfs_ds *
143_data_server_lookup_locked(const struct list_head *dsaddrs)
144{
145 struct nfs4_pnfs_ds *ds;
146
147 list_for_each_entry(ds, &nfs4_data_server_cache, ds_node)
148 if (_same_data_server_addrs_locked(&ds->ds_addrs, dsaddrs))
149 return ds;
150 return NULL;
151}
152
153/*
154 * Create an rpc connection to the nfs4_pnfs_ds data server
155 * Currently only supports IPv4 and IPv6 addresses
156 */
157static int
158nfs4_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds)
159{
160 struct nfs_client *clp = ERR_PTR(-EIO);
161 struct nfs4_pnfs_ds_addr *da;
162 int status = 0;
163
164 dprintk("--> %s DS %s au_flavor %d\n", __func__, ds->ds_remotestr,
165 mds_srv->nfs_client->cl_rpcclient->cl_auth->au_flavor);
166
167 list_for_each_entry(da, &ds->ds_addrs, da_node) {
168 dprintk("%s: DS %s: trying address %s\n",
169 __func__, ds->ds_remotestr, da->da_remotestr);
170
171 clp = nfs4_set_ds_client(mds_srv->nfs_client,
172 (struct sockaddr *)&da->da_addr,
173 da->da_addrlen, IPPROTO_TCP,
174 dataserver_timeo, dataserver_retrans);
175 if (!IS_ERR(clp))
176 break;
177 }
178
179 if (IS_ERR(clp)) {
180 status = PTR_ERR(clp);
181 goto out;
182 }
183
184 status = nfs4_init_ds_session(clp, mds_srv->nfs_client->cl_lease_time);
185 if (status)
186 goto out_put;
187
188 smp_wmb();
189 ds->ds_clp = clp;
190 dprintk("%s [new] addr: %s\n", __func__, ds->ds_remotestr);
191out:
192 return status;
193out_put:
194 nfs_put_client(clp);
195 goto out;
196}
197
198static void
199destroy_ds(struct nfs4_pnfs_ds *ds)
200{
201 struct nfs4_pnfs_ds_addr *da;
202
203 dprintk("--> %s\n", __func__);
204 ifdebug(FACILITY)
205 print_ds(ds);
206
207 nfs_put_client(ds->ds_clp);
208
209 while (!list_empty(&ds->ds_addrs)) {
210 da = list_first_entry(&ds->ds_addrs,
211 struct nfs4_pnfs_ds_addr,
212 da_node);
213 list_del_init(&da->da_node);
214 kfree(da->da_remotestr);
215 kfree(da);
216 }
217
218 kfree(ds->ds_remotestr);
219 kfree(ds);
220}
221
222void 44void
223nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr) 45nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
224{ 46{
@@ -229,259 +51,13 @@ nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr)
229 51
230 for (i = 0; i < dsaddr->ds_num; i++) { 52 for (i = 0; i < dsaddr->ds_num; i++) {
231 ds = dsaddr->ds_list[i]; 53 ds = dsaddr->ds_list[i];
232 if (ds != NULL) { 54 if (ds != NULL)
233 if (atomic_dec_and_lock(&ds->ds_count, 55 nfs4_pnfs_ds_put(ds);
234 &nfs4_ds_cache_lock)) {
235 list_del_init(&ds->ds_node);
236 spin_unlock(&nfs4_ds_cache_lock);
237 destroy_ds(ds);
238 }
239 }
240 } 56 }
241 kfree(dsaddr->stripe_indices); 57 kfree(dsaddr->stripe_indices);
242 kfree(dsaddr); 58 kfree(dsaddr);
243} 59}
244 60
245/*
246 * Create a string with a human readable address and port to avoid
 247 * complicated setup around many dprintks.
248 */
249static char *
250nfs4_pnfs_remotestr(struct list_head *dsaddrs, gfp_t gfp_flags)
251{
252 struct nfs4_pnfs_ds_addr *da;
253 char *remotestr;
254 size_t len;
255 char *p;
256
257 len = 3; /* '{', '}' and eol */
258 list_for_each_entry(da, dsaddrs, da_node) {
259 len += strlen(da->da_remotestr) + 1; /* string plus comma */
260 }
261
262 remotestr = kzalloc(len, gfp_flags);
263 if (!remotestr)
264 return NULL;
265
266 p = remotestr;
267 *(p++) = '{';
268 len--;
269 list_for_each_entry(da, dsaddrs, da_node) {
270 size_t ll = strlen(da->da_remotestr);
271
272 if (ll > len)
273 goto out_err;
274
275 memcpy(p, da->da_remotestr, ll);
276 p += ll;
277 len -= ll;
278
279 if (len < 1)
280 goto out_err;
281 (*p++) = ',';
282 len--;
283 }
284 if (len < 2)
285 goto out_err;
286 *(p++) = '}';
287 *p = '\0';
288 return remotestr;
289out_err:
290 kfree(remotestr);
291 return NULL;
292}
293
294static struct nfs4_pnfs_ds *
295nfs4_pnfs_ds_add(struct list_head *dsaddrs, gfp_t gfp_flags)
296{
297 struct nfs4_pnfs_ds *tmp_ds, *ds = NULL;
298 char *remotestr;
299
300 if (list_empty(dsaddrs)) {
301 dprintk("%s: no addresses defined\n", __func__);
302 goto out;
303 }
304
305 ds = kzalloc(sizeof(*ds), gfp_flags);
306 if (!ds)
307 goto out;
308
 309 /* this is only used for debugging, so it's ok if it's NULL */
310 remotestr = nfs4_pnfs_remotestr(dsaddrs, gfp_flags);
311
312 spin_lock(&nfs4_ds_cache_lock);
313 tmp_ds = _data_server_lookup_locked(dsaddrs);
314 if (tmp_ds == NULL) {
315 INIT_LIST_HEAD(&ds->ds_addrs);
316 list_splice_init(dsaddrs, &ds->ds_addrs);
317 ds->ds_remotestr = remotestr;
318 atomic_set(&ds->ds_count, 1);
319 INIT_LIST_HEAD(&ds->ds_node);
320 ds->ds_clp = NULL;
321 list_add(&ds->ds_node, &nfs4_data_server_cache);
322 dprintk("%s add new data server %s\n", __func__,
323 ds->ds_remotestr);
324 } else {
325 kfree(remotestr);
326 kfree(ds);
327 atomic_inc(&tmp_ds->ds_count);
328 dprintk("%s data server %s found, inc'ed ds_count to %d\n",
329 __func__, tmp_ds->ds_remotestr,
330 atomic_read(&tmp_ds->ds_count));
331 ds = tmp_ds;
332 }
333 spin_unlock(&nfs4_ds_cache_lock);
334out:
335 return ds;
336}
337
338/*
339 * Currently only supports ipv4, ipv6 and one multi-path address.
340 */
341static struct nfs4_pnfs_ds_addr *
342decode_ds_addr(struct net *net, struct xdr_stream *streamp, gfp_t gfp_flags)
343{
344 struct nfs4_pnfs_ds_addr *da = NULL;
345 char *buf, *portstr;
346 __be16 port;
347 int nlen, rlen;
348 int tmp[2];
349 __be32 *p;
350 char *netid, *match_netid;
351 size_t len, match_netid_len;
352 char *startsep = "";
353 char *endsep = "";
354
355
356 /* r_netid */
357 p = xdr_inline_decode(streamp, 4);
358 if (unlikely(!p))
359 goto out_err;
360 nlen = be32_to_cpup(p++);
361
362 p = xdr_inline_decode(streamp, nlen);
363 if (unlikely(!p))
364 goto out_err;
365
366 netid = kmalloc(nlen+1, gfp_flags);
367 if (unlikely(!netid))
368 goto out_err;
369
370 netid[nlen] = '\0';
371 memcpy(netid, p, nlen);
372
373 /* r_addr: ip/ip6addr with port in dec octets - see RFC 5665 */
374 p = xdr_inline_decode(streamp, 4);
375 if (unlikely(!p))
376 goto out_free_netid;
377 rlen = be32_to_cpup(p);
378
379 p = xdr_inline_decode(streamp, rlen);
380 if (unlikely(!p))
381 goto out_free_netid;
382
383 /* port is ".ABC.DEF", 8 chars max */
384 if (rlen > INET6_ADDRSTRLEN + IPV6_SCOPE_ID_LEN + 8) {
385 dprintk("%s: Invalid address, length %d\n", __func__,
386 rlen);
387 goto out_free_netid;
388 }
389 buf = kmalloc(rlen + 1, gfp_flags);
390 if (!buf) {
391 dprintk("%s: Not enough memory\n", __func__);
392 goto out_free_netid;
393 }
394 buf[rlen] = '\0';
395 memcpy(buf, p, rlen);
396
397 /* replace port '.' with '-' */
398 portstr = strrchr(buf, '.');
399 if (!portstr) {
400 dprintk("%s: Failed finding expected dot in port\n",
401 __func__);
402 goto out_free_buf;
403 }
404 *portstr = '-';
405
406 /* find '.' between address and port */
407 portstr = strrchr(buf, '.');
408 if (!portstr) {
409 dprintk("%s: Failed finding expected dot between address and "
410 "port\n", __func__);
411 goto out_free_buf;
412 }
413 *portstr = '\0';
414
415 da = kzalloc(sizeof(*da), gfp_flags);
416 if (unlikely(!da))
417 goto out_free_buf;
418
419 INIT_LIST_HEAD(&da->da_node);
420
421 if (!rpc_pton(net, buf, portstr-buf, (struct sockaddr *)&da->da_addr,
422 sizeof(da->da_addr))) {
423 dprintk("%s: error parsing address %s\n", __func__, buf);
424 goto out_free_da;
425 }
426
427 portstr++;
428 sscanf(portstr, "%d-%d", &tmp[0], &tmp[1]);
429 port = htons((tmp[0] << 8) | (tmp[1]));
430
431 switch (da->da_addr.ss_family) {
432 case AF_INET:
433 ((struct sockaddr_in *)&da->da_addr)->sin_port = port;
434 da->da_addrlen = sizeof(struct sockaddr_in);
435 match_netid = "tcp";
436 match_netid_len = 3;
437 break;
438
439 case AF_INET6:
440 ((struct sockaddr_in6 *)&da->da_addr)->sin6_port = port;
441 da->da_addrlen = sizeof(struct sockaddr_in6);
442 match_netid = "tcp6";
443 match_netid_len = 4;
444 startsep = "[";
445 endsep = "]";
446 break;
447
448 default:
449 dprintk("%s: unsupported address family: %u\n",
450 __func__, da->da_addr.ss_family);
451 goto out_free_da;
452 }
453
454 if (nlen != match_netid_len || strncmp(netid, match_netid, nlen)) {
455 dprintk("%s: ERROR: r_netid \"%s\" != \"%s\"\n",
456 __func__, netid, match_netid);
457 goto out_free_da;
458 }
459
460 /* save human readable address */
461 len = strlen(startsep) + strlen(buf) + strlen(endsep) + 7;
462 da->da_remotestr = kzalloc(len, gfp_flags);
463
464 /* NULL is ok, only used for dprintk */
465 if (da->da_remotestr)
466 snprintf(da->da_remotestr, len, "%s%s%s:%u", startsep,
467 buf, endsep, ntohs(port));
468
469 dprintk("%s: Parsed DS addr %s\n", __func__, da->da_remotestr);
470 kfree(buf);
471 kfree(netid);
472 return da;
473
474out_free_da:
475 kfree(da);
476out_free_buf:
477 dprintk("%s: Error parsing DS addr: %s\n", __func__, buf);
478 kfree(buf);
479out_free_netid:
480 kfree(netid);
481out_err:
482 return NULL;
483}
484
485/* Decode opaque device data and return the result */ 61/* Decode opaque device data and return the result */
486struct nfs4_file_layout_dsaddr * 62struct nfs4_file_layout_dsaddr *
487nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, 63nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
@@ -584,8 +160,8 @@ nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
584 160
585 mp_count = be32_to_cpup(p); /* multipath count */ 161 mp_count = be32_to_cpup(p); /* multipath count */
586 for (j = 0; j < mp_count; j++) { 162 for (j = 0; j < mp_count; j++) {
587 da = decode_ds_addr(server->nfs_client->cl_net, 163 da = nfs4_decode_mp_ds_addr(server->nfs_client->cl_net,
588 &stream, gfp_flags); 164 &stream, gfp_flags);
589 if (da) 165 if (da)
590 list_add_tail(&da->da_node, &dsaddrs); 166 list_add_tail(&da->da_node, &dsaddrs);
591 } 167 }
@@ -681,22 +257,7 @@ nfs4_fl_select_ds_fh(struct pnfs_layout_segment *lseg, u32 j)
681 return flseg->fh_array[i]; 257 return flseg->fh_array[i];
682} 258}
683 259
684static void nfs4_wait_ds_connect(struct nfs4_pnfs_ds *ds) 260/* Upon return, either ds is connected, or ds is NULL */
685{
686 might_sleep();
687 wait_on_bit_action(&ds->ds_state, NFS4DS_CONNECTING,
688 nfs_wait_bit_killable, TASK_KILLABLE);
689}
690
691static void nfs4_clear_ds_conn_bit(struct nfs4_pnfs_ds *ds)
692{
693 smp_mb__before_atomic();
694 clear_bit(NFS4DS_CONNECTING, &ds->ds_state);
695 smp_mb__after_atomic();
696 wake_up_bit(&ds->ds_state, NFS4DS_CONNECTING);
697}
698
699
700struct nfs4_pnfs_ds * 261struct nfs4_pnfs_ds *
701nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx) 262nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx)
702{ 263{
@@ -704,29 +265,23 @@ nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx)
704 struct nfs4_pnfs_ds *ds = dsaddr->ds_list[ds_idx]; 265 struct nfs4_pnfs_ds *ds = dsaddr->ds_list[ds_idx];
705 struct nfs4_deviceid_node *devid = FILELAYOUT_DEVID_NODE(lseg); 266 struct nfs4_deviceid_node *devid = FILELAYOUT_DEVID_NODE(lseg);
706 struct nfs4_pnfs_ds *ret = ds; 267 struct nfs4_pnfs_ds *ret = ds;
268 struct nfs_server *s = NFS_SERVER(lseg->pls_layout->plh_inode);
707 269
708 if (ds == NULL) { 270 if (ds == NULL) {
709 printk(KERN_ERR "NFS: %s: No data server for offset index %d\n", 271 printk(KERN_ERR "NFS: %s: No data server for offset index %d\n",
710 __func__, ds_idx); 272 __func__, ds_idx);
711 filelayout_mark_devid_invalid(devid); 273 pnfs_generic_mark_devid_invalid(devid);
712 goto out; 274 goto out;
713 } 275 }
714 smp_rmb(); 276 smp_rmb();
715 if (ds->ds_clp) 277 if (ds->ds_clp)
716 goto out_test_devid; 278 goto out_test_devid;
717 279
718 if (test_and_set_bit(NFS4DS_CONNECTING, &ds->ds_state) == 0) { 280 nfs4_pnfs_ds_connect(s, ds, devid, dataserver_timeo,
719 struct nfs_server *s = NFS_SERVER(lseg->pls_layout->plh_inode); 281 dataserver_retrans, 4,
720 int err; 282 s->nfs_client->cl_minorversion,
721 283 s->nfs_client->cl_rpcclient->cl_auth->au_flavor);
722 err = nfs4_ds_connect(s, ds); 284
723 if (err)
724 nfs4_mark_deviceid_unavailable(devid);
725 nfs4_clear_ds_conn_bit(ds);
726 } else {
727 /* Either ds is connected, or ds is NULL */
728 nfs4_wait_ds_connect(ds);
729 }
730out_test_devid: 285out_test_devid:
731 if (filelayout_test_devid_unavailable(devid)) 286 if (filelayout_test_devid_unavailable(devid))
732 ret = NULL; 287 ret = NULL;
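
The NFS4DS_CONNECTING handshake deleted above (test_and_set_bit() to
elect a connecting thread, nfs4_wait_ds_connect() for everyone else)
now lives behind the generic nfs4_pnfs_ds_connect(), which also takes
the timeouts, NFS version and auth flavor. A sketch of the underlying
"first caller connects, latecomers wait" idiom, with illustrative
names around the kernel's bitop and wait-bit primitives:

	#define DS_SKETCH_CONNECTING	0

	static void ds_connect_once(unsigned long *state,
				    void (*do_connect)(void))
	{
		if (test_and_set_bit(DS_SKETCH_CONNECTING, state) == 0) {
			do_connect();		/* we won the race */
			smp_mb__before_atomic();
			clear_bit(DS_SKETCH_CONNECTING, state);
			smp_mb__after_atomic();
			wake_up_bit(state, DS_SKETCH_CONNECTING);
		} else {
			/* Someone else is connecting: sleep until done. */
			wait_on_bit(state, DS_SKETCH_CONNECTING,
				    TASK_KILLABLE);
		}
	}
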
diff --git a/fs/nfs/flexfilelayout/Makefile b/fs/nfs/flexfilelayout/Makefile
new file mode 100644
index 000000000000..1d2c9f6bbcd4
--- /dev/null
+++ b/fs/nfs/flexfilelayout/Makefile
@@ -0,0 +1,5 @@
1#
2# Makefile for the pNFS Flexfile Layout Driver kernel module
3#
4obj-$(CONFIG_PNFS_FLEXFILE_LAYOUT) += nfs_layout_flexfiles.o
5nfs_layout_flexfiles-y := flexfilelayout.o flexfilelayoutdev.o
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
new file mode 100644
index 000000000000..315cc68945b9
--- /dev/null
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -0,0 +1,1533 @@
1/*
2 * Module for pnfs flexfile layout driver.
3 *
4 * Copyright (c) 2014, Primary Data, Inc. All rights reserved.
5 *
6 * Tao Peng <bergwolf@primarydata.com>
7 */
8
9#include <linux/nfs_fs.h>
10#include <linux/nfs_page.h>
11#include <linux/module.h>
12
13#include <linux/sunrpc/metrics.h>
14#include <linux/nfs_idmap.h>
15
16#include "flexfilelayout.h"
17#include "../nfs4session.h"
18#include "../internal.h"
19#include "../delegation.h"
20#include "../nfs4trace.h"
21#include "../iostat.h"
22#include "../nfs.h"
23
24#define NFSDBG_FACILITY NFSDBG_PNFS_LD
25
26#define FF_LAYOUT_POLL_RETRY_MAX (15*HZ)
27
28static struct pnfs_layout_hdr *
29ff_layout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags)
30{
31 struct nfs4_flexfile_layout *ffl;
32
33 ffl = kzalloc(sizeof(*ffl), gfp_flags);
34 if (ffl) {
35 INIT_LIST_HEAD(&ffl->error_list);
36 return &ffl->generic_hdr;
37 } else
38 return NULL;
39}
40
41static void
42ff_layout_free_layout_hdr(struct pnfs_layout_hdr *lo)
43{
44 struct nfs4_ff_layout_ds_err *err, *n;
45
46 list_for_each_entry_safe(err, n, &FF_LAYOUT_FROM_HDR(lo)->error_list,
47 list) {
48 list_del(&err->list);
49 kfree(err);
50 }
51 kfree(FF_LAYOUT_FROM_HDR(lo));
52}
53
54static int decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
55{
56 __be32 *p;
57
58 p = xdr_inline_decode(xdr, NFS4_STATEID_SIZE);
59 if (unlikely(p == NULL))
60 return -ENOBUFS;
61 memcpy(stateid, p, NFS4_STATEID_SIZE);
62 dprintk("%s: stateid id= [%x%x%x%x]\n", __func__,
63 p[0], p[1], p[2], p[3]);
64 return 0;
65}
66
67static int decode_deviceid(struct xdr_stream *xdr, struct nfs4_deviceid *devid)
68{
69 __be32 *p;
70
71 p = xdr_inline_decode(xdr, NFS4_DEVICEID4_SIZE);
72 if (unlikely(!p))
73 return -ENOBUFS;
74 memcpy(devid, p, NFS4_DEVICEID4_SIZE);
75 nfs4_print_deviceid(devid);
76 return 0;
77}
78
79static int decode_nfs_fh(struct xdr_stream *xdr, struct nfs_fh *fh)
80{
81 __be32 *p;
82
83 p = xdr_inline_decode(xdr, 4);
84 if (unlikely(!p))
85 return -ENOBUFS;
86 fh->size = be32_to_cpup(p++);
87 if (fh->size > sizeof(struct nfs_fh)) {
88 printk(KERN_ERR "NFS flexfiles: Too big fh received %d\n",
89 fh->size);
90 return -EOVERFLOW;
91 }
92 /* fh.data */
93 p = xdr_inline_decode(xdr, fh->size);
94 if (unlikely(!p))
95 return -ENOBUFS;
96 memcpy(&fh->data, p, fh->size);
97 dprintk("%s: fh len %d\n", __func__, fh->size);
98
99 return 0;
100}
101
102/*
103 * Currently only stringified uids and gids are accepted.
104 * I.e., kerberos is not supported to the DSes, so no principals.
105 *
106 * That means that one common function will suffice, but when
107 * principals are added, this should be split to accommodate
108 * calls to both nfs_map_name_to_uid() and nfs_map_group_to_gid().
109 */
110static int
111decode_name(struct xdr_stream *xdr, u32 *id)
112{
113 __be32 *p;
114 int len;
115
116 /* opaque_length(4)*/
117 p = xdr_inline_decode(xdr, 4);
118 if (unlikely(!p))
119 return -ENOBUFS;
120 len = be32_to_cpup(p++);
121 if (len < 0)
122 return -EINVAL;
123
124 dprintk("%s: len %u\n", __func__, len);
125
126 /* opaque body */
127 p = xdr_inline_decode(xdr, len);
128 if (unlikely(!p))
129 return -ENOBUFS;
130
131 if (!nfs_map_string_to_numeric((char *)p, len, id))
132 return -EINVAL;
133
134 return 0;
135}
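decode_name() walks a plain XDR opaque: a 4-byte big-endian length followed by that many bytes, which here are ASCII digits. A minimal user-space sketch of the same wire format, assuming a raw buffer (parse_name() is hypothetical and purely illustrative):

	#include <stdint.h>
	#include <stdlib.h>
	#include <string.h>
	#include <arpa/inet.h>

	static int parse_name(const uint8_t *buf, size_t buflen, uint32_t *id)
	{
		uint32_t len;
		char tmp[16];

		if (buflen < 4)
			return -1;
		memcpy(&len, buf, 4);
		len = ntohl(len);			/* opaque_length(4) */
		if (len == 0 || len >= sizeof(tmp) || buflen - 4 < len)
			return -1;
		memcpy(tmp, buf + 4, len);		/* opaque body, e.g. "1000" */
		tmp[len] = '\0';
		*id = (uint32_t)strtoul(tmp, NULL, 10);	/* stringified uid/gid */
		return 0;
	}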
136
137static void ff_layout_free_mirror_array(struct nfs4_ff_layout_segment *fls)
138{
139 int i;
140
141 if (fls->mirror_array) {
142 for (i = 0; i < fls->mirror_array_cnt; i++) {
143 /* normally mirror_ds is freed in
144 * .free_deviceid_node but we still do it here
145			 * for the .alloc_lseg error path */
146 if (fls->mirror_array[i]) {
147 kfree(fls->mirror_array[i]->fh_versions);
148 nfs4_ff_layout_put_deviceid(fls->mirror_array[i]->mirror_ds);
149 kfree(fls->mirror_array[i]);
150 }
151 }
152 kfree(fls->mirror_array);
153 fls->mirror_array = NULL;
154 }
155}
156
157static int ff_layout_check_layout(struct nfs4_layoutget_res *lgr)
158{
159 int ret = 0;
160
161 dprintk("--> %s\n", __func__);
162
163 /* FIXME: remove this check when layout segment support is added */
164 if (lgr->range.offset != 0 ||
165 lgr->range.length != NFS4_MAX_UINT64) {
166 dprintk("%s Only whole file layouts supported. Use MDS i/o\n",
167 __func__);
168 ret = -EINVAL;
169 }
170
171 dprintk("--> %s returns %d\n", __func__, ret);
172 return ret;
173}
174
175static void _ff_layout_free_lseg(struct nfs4_ff_layout_segment *fls)
176{
177 if (fls) {
178 ff_layout_free_mirror_array(fls);
179 kfree(fls);
180 }
181}
182
183static void ff_layout_sort_mirrors(struct nfs4_ff_layout_segment *fls)
184{
185 struct nfs4_ff_layout_mirror *tmp;
186 int i, j;
187
188 for (i = 0; i < fls->mirror_array_cnt - 1; i++) {
189 for (j = i + 1; j < fls->mirror_array_cnt; j++)
190 if (fls->mirror_array[i]->efficiency <
191 fls->mirror_array[j]->efficiency) {
192 tmp = fls->mirror_array[i];
193 fls->mirror_array[i] = fls->mirror_array[j];
194 fls->mirror_array[j] = tmp;
195 }
196 }
197}
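ff_layout_sort_mirrors() is an open-coded O(n^2) exchange sort, descending by efficiency, which is fine for the small mirror counts expected here. A sketch of the same ordering using the kernel's generic sort() helper from <linux/sort.h>, shown only as an alternative (ff_layout_sort_mirrors_alt() is hypothetical):

	#include <linux/sort.h>

	static int mirror_cmp(const void *a, const void *b)
	{
		const struct nfs4_ff_layout_mirror *ma =
			*(const struct nfs4_ff_layout_mirror **)a;
		const struct nfs4_ff_layout_mirror *mb =
			*(const struct nfs4_ff_layout_mirror **)b;

		/* higher efficiency sorts first */
		if (ma->efficiency == mb->efficiency)
			return 0;
		return mb->efficiency > ma->efficiency ? 1 : -1;
	}

	static void ff_layout_sort_mirrors_alt(struct nfs4_ff_layout_segment *fls)
	{
		sort(fls->mirror_array, fls->mirror_array_cnt,
		     sizeof(fls->mirror_array[0]), mirror_cmp, NULL);
	}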
198
199static struct pnfs_layout_segment *
200ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
201 struct nfs4_layoutget_res *lgr,
202 gfp_t gfp_flags)
203{
204 struct pnfs_layout_segment *ret;
205 struct nfs4_ff_layout_segment *fls = NULL;
206 struct xdr_stream stream;
207 struct xdr_buf buf;
208 struct page *scratch;
209 u64 stripe_unit;
210 u32 mirror_array_cnt;
211 __be32 *p;
212 int i, rc;
213
214 dprintk("--> %s\n", __func__);
215 scratch = alloc_page(gfp_flags);
216 if (!scratch)
217 return ERR_PTR(-ENOMEM);
218
219 xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages,
220 lgr->layoutp->len);
221 xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
222
223 /* stripe unit and mirror_array_cnt */
224 rc = -EIO;
225 p = xdr_inline_decode(&stream, 8 + 4);
226 if (!p)
227 goto out_err_free;
228
229 p = xdr_decode_hyper(p, &stripe_unit);
230 mirror_array_cnt = be32_to_cpup(p++);
231 dprintk("%s: stripe_unit=%llu mirror_array_cnt=%u\n", __func__,
232 stripe_unit, mirror_array_cnt);
233
234 if (mirror_array_cnt > NFS4_FLEXFILE_LAYOUT_MAX_MIRROR_CNT ||
235 mirror_array_cnt == 0)
236 goto out_err_free;
237
238 rc = -ENOMEM;
239 fls = kzalloc(sizeof(*fls), gfp_flags);
240 if (!fls)
241 goto out_err_free;
242
243 fls->mirror_array_cnt = mirror_array_cnt;
244 fls->stripe_unit = stripe_unit;
245 fls->mirror_array = kcalloc(fls->mirror_array_cnt,
246 sizeof(fls->mirror_array[0]), gfp_flags);
247 if (fls->mirror_array == NULL)
248 goto out_err_free;
249
250 for (i = 0; i < fls->mirror_array_cnt; i++) {
251 struct nfs4_deviceid devid;
252 struct nfs4_deviceid_node *idnode;
253 u32 ds_count;
254 u32 fh_count;
255 int j;
256
257 rc = -EIO;
258 p = xdr_inline_decode(&stream, 4);
259 if (!p)
260 goto out_err_free;
261 ds_count = be32_to_cpup(p);
262
263 /* FIXME: allow for striping? */
264 if (ds_count != 1)
265 goto out_err_free;
266
267 fls->mirror_array[i] =
268 kzalloc(sizeof(struct nfs4_ff_layout_mirror),
269 gfp_flags);
270 if (fls->mirror_array[i] == NULL) {
271 rc = -ENOMEM;
272 goto out_err_free;
273 }
274
275 spin_lock_init(&fls->mirror_array[i]->lock);
276 fls->mirror_array[i]->ds_count = ds_count;
277
278 /* deviceid */
279 rc = decode_deviceid(&stream, &devid);
280 if (rc)
281 goto out_err_free;
282
283 idnode = nfs4_find_get_deviceid(NFS_SERVER(lh->plh_inode),
284 &devid, lh->plh_lc_cred,
285 gfp_flags);
286		/*
287		 * Upon success, mirror_ds was allocated by a previous
288		 * getdeviceinfo, or newly by .alloc_deviceid_node;
289		 * nfs4_find_get_deviceid() failure is in fact a getdeviceinfo failure.
290		 */
291 if (idnode)
292 fls->mirror_array[i]->mirror_ds =
293 FF_LAYOUT_MIRROR_DS(idnode);
294 else
295 goto out_err_free;
296
297 /* efficiency */
298 rc = -EIO;
299 p = xdr_inline_decode(&stream, 4);
300 if (!p)
301 goto out_err_free;
302 fls->mirror_array[i]->efficiency = be32_to_cpup(p);
303
304 /* stateid */
305 rc = decode_stateid(&stream, &fls->mirror_array[i]->stateid);
306 if (rc)
307 goto out_err_free;
308
309 /* fh */
310 p = xdr_inline_decode(&stream, 4);
311 if (!p)
312 goto out_err_free;
313 fh_count = be32_to_cpup(p);
314
315 fls->mirror_array[i]->fh_versions =
316 kzalloc(fh_count * sizeof(struct nfs_fh),
317 gfp_flags);
318 if (fls->mirror_array[i]->fh_versions == NULL) {
319 rc = -ENOMEM;
320 goto out_err_free;
321 }
322
323 for (j = 0; j < fh_count; j++) {
324 rc = decode_nfs_fh(&stream,
325 &fls->mirror_array[i]->fh_versions[j]);
326 if (rc)
327 goto out_err_free;
328 }
329
330 fls->mirror_array[i]->fh_versions_cnt = fh_count;
331
332 /* user */
333 rc = decode_name(&stream, &fls->mirror_array[i]->uid);
334 if (rc)
335 goto out_err_free;
336
337 /* group */
338 rc = decode_name(&stream, &fls->mirror_array[i]->gid);
339 if (rc)
340 goto out_err_free;
341
342 dprintk("%s: uid %d gid %d\n", __func__,
343 fls->mirror_array[i]->uid,
344 fls->mirror_array[i]->gid);
345 }
346
347 ff_layout_sort_mirrors(fls);
348 rc = ff_layout_check_layout(lgr);
349 if (rc)
350 goto out_err_free;
351
352 ret = &fls->generic_hdr;
353 dprintk("<-- %s (success)\n", __func__);
354out_free_page:
355 __free_page(scratch);
356 return ret;
357out_err_free:
358 _ff_layout_free_lseg(fls);
359 ret = ERR_PTR(rc);
360 dprintk("<-- %s (%d)\n", __func__, rc);
361 goto out_free_page;
362}
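For reference, the opaque layout body that ff_layout_alloc_lseg() just decoded has the following shape (field names are informal, reconstructed from the reads above):

	stripe_unit          hyper (8 bytes)
	mirror_array_cnt     uint32
	mirror_array_cnt times:
	    ds_count         uint32 (must currently be 1)
	    deviceid         16 bytes (NFS4_DEVICEID4_SIZE)
	    efficiency       uint32
	    stateid          16 bytes (NFS4_STATEID_SIZE)
	    fh_count         uint32, then fh_count opaque filehandles
	    user             XDR opaque (stringified uid)
	    group            XDR opaque (stringified gid)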
363
364static bool ff_layout_has_rw_segments(struct pnfs_layout_hdr *layout)
365{
366 struct pnfs_layout_segment *lseg;
367
368 list_for_each_entry(lseg, &layout->plh_segs, pls_list)
369 if (lseg->pls_range.iomode == IOMODE_RW)
370 return true;
371
372 return false;
373}
374
375static void
376ff_layout_free_lseg(struct pnfs_layout_segment *lseg)
377{
378 struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg);
379 int i;
380
381 dprintk("--> %s\n", __func__);
382
383 for (i = 0; i < fls->mirror_array_cnt; i++) {
384 if (fls->mirror_array[i]) {
385 nfs4_ff_layout_put_deviceid(fls->mirror_array[i]->mirror_ds);
386 fls->mirror_array[i]->mirror_ds = NULL;
387 if (fls->mirror_array[i]->cred) {
388 put_rpccred(fls->mirror_array[i]->cred);
389 fls->mirror_array[i]->cred = NULL;
390 }
391 }
392 }
393
394 if (lseg->pls_range.iomode == IOMODE_RW) {
395 struct nfs4_flexfile_layout *ffl;
396 struct inode *inode;
397
398 ffl = FF_LAYOUT_FROM_HDR(lseg->pls_layout);
399 inode = ffl->generic_hdr.plh_inode;
400 spin_lock(&inode->i_lock);
401 if (!ff_layout_has_rw_segments(lseg->pls_layout)) {
402 ffl->commit_info.nbuckets = 0;
403 kfree(ffl->commit_info.buckets);
404 ffl->commit_info.buckets = NULL;
405 }
406 spin_unlock(&inode->i_lock);
407 }
408 _ff_layout_free_lseg(fls);
409}
410
411/* Return 1 until we have support for multiple lsegs */
412static int
413ff_layout_get_lseg_count(struct nfs4_ff_layout_segment *fls)
414{
415 return 1;
416}
417
418static int
419ff_layout_alloc_commit_info(struct pnfs_layout_segment *lseg,
420 struct nfs_commit_info *cinfo,
421 gfp_t gfp_flags)
422{
423 struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg);
424 struct pnfs_commit_bucket *buckets;
425 int size;
426
427 if (cinfo->ds->nbuckets != 0) {
428		/* This assumes there is only one RW lseg per file.
429		 * To support multiple lsegs per file, we need to
430		 * change struct pnfs_commit_bucket to allow
431		 * dynamically increasing nbuckets.
432		 */
433 return 0;
434 }
435
436 size = ff_layout_get_lseg_count(fls) * FF_LAYOUT_MIRROR_COUNT(lseg);
437
438 buckets = kcalloc(size, sizeof(struct pnfs_commit_bucket),
439 gfp_flags);
440 if (!buckets)
441 return -ENOMEM;
442 else {
443 int i;
444
445 spin_lock(cinfo->lock);
446 if (cinfo->ds->nbuckets != 0)
447 kfree(buckets);
448 else {
449 cinfo->ds->buckets = buckets;
450 cinfo->ds->nbuckets = size;
451 for (i = 0; i < size; i++) {
452 INIT_LIST_HEAD(&buckets[i].written);
453 INIT_LIST_HEAD(&buckets[i].committing);
454 /* mark direct verifier as unset */
455 buckets[i].direct_verf.committed =
456 NFS_INVALID_STABLE_HOW;
457 }
458 }
459 spin_unlock(cinfo->lock);
460 return 0;
461 }
462}
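With the single-lseg restriction above, the bucket count reduces to the mirror count; for a 3-way mirrored file, size = 1 * 3 = 3 commit buckets. The recheck of cinfo->ds->nbuckets under cinfo->lock handles the race where two writers allocate concurrently: the loser simply frees its array and reuses the installed one.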
463
464static struct nfs4_pnfs_ds *
465ff_layout_choose_best_ds_for_read(struct nfs_pageio_descriptor *pgio,
466 int *best_idx)
467{
468 struct nfs4_ff_layout_segment *fls;
469 struct nfs4_pnfs_ds *ds;
470 int idx;
471
472 fls = FF_LAYOUT_LSEG(pgio->pg_lseg);
473 /* mirrors are sorted by efficiency */
474 for (idx = 0; idx < fls->mirror_array_cnt; idx++) {
475 ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, idx, false);
476 if (ds) {
477 *best_idx = idx;
478 return ds;
479 }
480 }
481
482 return NULL;
483}
484
485static void
486ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio,
487 struct nfs_page *req)
488{
489 struct nfs_pgio_mirror *pgm;
490 struct nfs4_ff_layout_mirror *mirror;
491 struct nfs4_pnfs_ds *ds;
492 int ds_idx;
493
494 /* Use full layout for now */
495 if (!pgio->pg_lseg)
496 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
497 req->wb_context,
498 0,
499 NFS4_MAX_UINT64,
500 IOMODE_READ,
501 GFP_KERNEL);
502 /* If no lseg, fall back to read through mds */
503 if (pgio->pg_lseg == NULL)
504 goto out_mds;
505
506 ds = ff_layout_choose_best_ds_for_read(pgio, &ds_idx);
507 if (!ds)
508 goto out_mds;
509 mirror = FF_LAYOUT_COMP(pgio->pg_lseg, ds_idx);
510
511 pgio->pg_mirror_idx = ds_idx;
512
513 /* read always uses only one mirror - idx 0 for pgio layer */
514 pgm = &pgio->pg_mirrors[0];
515 pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].rsize;
516
517 return;
518out_mds:
519 pnfs_put_lseg(pgio->pg_lseg);
520 pgio->pg_lseg = NULL;
521 nfs_pageio_reset_read_mds(pgio);
522}
523
524static void
525ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio,
526 struct nfs_page *req)
527{
528 struct nfs4_ff_layout_mirror *mirror;
529 struct nfs_pgio_mirror *pgm;
530 struct nfs_commit_info cinfo;
531 struct nfs4_pnfs_ds *ds;
532 int i;
533 int status;
534
535 if (!pgio->pg_lseg)
536 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
537 req->wb_context,
538 0,
539 NFS4_MAX_UINT64,
540 IOMODE_RW,
541 GFP_NOFS);
542 /* If no lseg, fall back to write through mds */
543 if (pgio->pg_lseg == NULL)
544 goto out_mds;
545
546 nfs_init_cinfo(&cinfo, pgio->pg_inode, pgio->pg_dreq);
547 status = ff_layout_alloc_commit_info(pgio->pg_lseg, &cinfo, GFP_NOFS);
548 if (status < 0)
549 goto out_mds;
550
551 /* Use a direct mapping of ds_idx to pgio mirror_idx */
552 if (WARN_ON_ONCE(pgio->pg_mirror_count !=
553 FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg)))
554 goto out_mds;
555
556 for (i = 0; i < pgio->pg_mirror_count; i++) {
557 ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, i, true);
558 if (!ds)
559 goto out_mds;
560 pgm = &pgio->pg_mirrors[i];
561 mirror = FF_LAYOUT_COMP(pgio->pg_lseg, i);
562 pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].wsize;
563 }
564
565 return;
566
567out_mds:
568 pnfs_put_lseg(pgio->pg_lseg);
569 pgio->pg_lseg = NULL;
570 nfs_pageio_reset_write_mds(pgio);
571}
572
573static unsigned int
574ff_layout_pg_get_mirror_count_write(struct nfs_pageio_descriptor *pgio,
575 struct nfs_page *req)
576{
577 if (!pgio->pg_lseg)
578 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
579 req->wb_context,
580 0,
581 NFS4_MAX_UINT64,
582 IOMODE_RW,
583 GFP_NOFS);
584 if (pgio->pg_lseg)
585 return FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg);
586
587 /* no lseg means that pnfs is not in use, so no mirroring here */
588 pnfs_put_lseg(pgio->pg_lseg);
589 pgio->pg_lseg = NULL;
590 nfs_pageio_reset_write_mds(pgio);
591 return 1;
592}
593
594static const struct nfs_pageio_ops ff_layout_pg_read_ops = {
595 .pg_init = ff_layout_pg_init_read,
596 .pg_test = pnfs_generic_pg_test,
597 .pg_doio = pnfs_generic_pg_readpages,
598 .pg_cleanup = pnfs_generic_pg_cleanup,
599};
600
601static const struct nfs_pageio_ops ff_layout_pg_write_ops = {
602 .pg_init = ff_layout_pg_init_write,
603 .pg_test = pnfs_generic_pg_test,
604 .pg_doio = pnfs_generic_pg_writepages,
605 .pg_get_mirror_count = ff_layout_pg_get_mirror_count_write,
606 .pg_cleanup = pnfs_generic_pg_cleanup,
607};
608
609static void ff_layout_reset_write(struct nfs_pgio_header *hdr, bool retry_pnfs)
610{
611 struct rpc_task *task = &hdr->task;
612
613 pnfs_layoutcommit_inode(hdr->inode, false);
614
615 if (retry_pnfs) {
616 dprintk("%s Reset task %5u for i/o through pNFS "
617 "(req %s/%llu, %u bytes @ offset %llu)\n", __func__,
618 hdr->task.tk_pid,
619 hdr->inode->i_sb->s_id,
620 (unsigned long long)NFS_FILEID(hdr->inode),
621 hdr->args.count,
622 (unsigned long long)hdr->args.offset);
623
624 if (!hdr->dreq) {
625 struct nfs_open_context *ctx;
626
627 ctx = nfs_list_entry(hdr->pages.next)->wb_context;
628 set_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags);
629 hdr->completion_ops->error_cleanup(&hdr->pages);
630 } else {
631 nfs_direct_set_resched_writes(hdr->dreq);
632 /* fake unstable write to let common nfs resend pages */
633 hdr->verf.committed = NFS_UNSTABLE;
634 hdr->good_bytes = 0;
635 }
636 return;
637 }
638
639 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
640 dprintk("%s Reset task %5u for i/o through MDS "
641 "(req %s/%llu, %u bytes @ offset %llu)\n", __func__,
642 hdr->task.tk_pid,
643 hdr->inode->i_sb->s_id,
644 (unsigned long long)NFS_FILEID(hdr->inode),
645 hdr->args.count,
646 (unsigned long long)hdr->args.offset);
647
648 task->tk_status = pnfs_write_done_resend_to_mds(hdr);
649 }
650}
651
652static void ff_layout_reset_read(struct nfs_pgio_header *hdr)
653{
654 struct rpc_task *task = &hdr->task;
655
656 pnfs_layoutcommit_inode(hdr->inode, false);
657
658 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
659 dprintk("%s Reset task %5u for i/o through MDS "
660 "(req %s/%llu, %u bytes @ offset %llu)\n", __func__,
661 hdr->task.tk_pid,
662 hdr->inode->i_sb->s_id,
663 (unsigned long long)NFS_FILEID(hdr->inode),
664 hdr->args.count,
665 (unsigned long long)hdr->args.offset);
666
667 task->tk_status = pnfs_read_done_resend_to_mds(hdr);
668 }
669}
670
671static int ff_layout_async_handle_error_v4(struct rpc_task *task,
672 struct nfs4_state *state,
673 struct nfs_client *clp,
674 struct pnfs_layout_segment *lseg,
675 int idx)
676{
677 struct pnfs_layout_hdr *lo = lseg->pls_layout;
678 struct inode *inode = lo->plh_inode;
679 struct nfs_server *mds_server = NFS_SERVER(inode);
680
681 struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx);
682 struct nfs_client *mds_client = mds_server->nfs_client;
683 struct nfs4_slot_table *tbl = &clp->cl_session->fc_slot_table;
684
685 if (task->tk_status >= 0)
686 return 0;
687
688 switch (task->tk_status) {
689 /* MDS state errors */
690 case -NFS4ERR_DELEG_REVOKED:
691 case -NFS4ERR_ADMIN_REVOKED:
692 case -NFS4ERR_BAD_STATEID:
693 if (state == NULL)
694 break;
695 nfs_remove_bad_delegation(state->inode);
696 case -NFS4ERR_OPENMODE:
697 if (state == NULL)
698 break;
699 if (nfs4_schedule_stateid_recovery(mds_server, state) < 0)
700 goto out_bad_stateid;
701 goto wait_on_recovery;
702 case -NFS4ERR_EXPIRED:
703 if (state != NULL) {
704 if (nfs4_schedule_stateid_recovery(mds_server, state) < 0)
705 goto out_bad_stateid;
706 }
707 nfs4_schedule_lease_recovery(mds_client);
708 goto wait_on_recovery;
709 /* DS session errors */
710 case -NFS4ERR_BADSESSION:
711 case -NFS4ERR_BADSLOT:
712 case -NFS4ERR_BAD_HIGH_SLOT:
713 case -NFS4ERR_DEADSESSION:
714 case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
715 case -NFS4ERR_SEQ_FALSE_RETRY:
716 case -NFS4ERR_SEQ_MISORDERED:
717 dprintk("%s ERROR %d, Reset session. Exchangeid "
718 "flags 0x%x\n", __func__, task->tk_status,
719 clp->cl_exchange_flags);
720 nfs4_schedule_session_recovery(clp->cl_session, task->tk_status);
721 break;
722 case -NFS4ERR_DELAY:
723 case -NFS4ERR_GRACE:
724 rpc_delay(task, FF_LAYOUT_POLL_RETRY_MAX);
725 break;
726 case -NFS4ERR_RETRY_UNCACHED_REP:
727 break;
728 /* Invalidate Layout errors */
729 case -NFS4ERR_PNFS_NO_LAYOUT:
730 case -ESTALE: /* mapped NFS4ERR_STALE */
731 case -EBADHANDLE: /* mapped NFS4ERR_BADHANDLE */
732 case -EISDIR: /* mapped NFS4ERR_ISDIR */
733 case -NFS4ERR_FHEXPIRED:
734 case -NFS4ERR_WRONG_TYPE:
735 dprintk("%s Invalid layout error %d\n", __func__,
736 task->tk_status);
737 /*
738 * Destroy layout so new i/o will get a new layout.
739 * Layout will not be destroyed until all current lseg
740 * references are put. Mark layout as invalid to resend failed
741 * i/o and all i/o waiting on the slot table to the MDS until
742 * layout is destroyed and a new valid layout is obtained.
743 */
744 pnfs_destroy_layout(NFS_I(inode));
745 rpc_wake_up(&tbl->slot_tbl_waitq);
746 goto reset;
747 /* RPC connection errors */
748 case -ECONNREFUSED:
749 case -EHOSTDOWN:
750 case -EHOSTUNREACH:
751 case -ENETUNREACH:
752 case -EIO:
753 case -ETIMEDOUT:
754 case -EPIPE:
755 dprintk("%s DS connection error %d\n", __func__,
756 task->tk_status);
757 nfs4_mark_deviceid_unavailable(devid);
758 rpc_wake_up(&tbl->slot_tbl_waitq);
759 /* fall through */
760 default:
761 if (ff_layout_has_available_ds(lseg))
762 return -NFS4ERR_RESET_TO_PNFS;
763reset:
764 dprintk("%s Retry through MDS. Error %d\n", __func__,
765 task->tk_status);
766 return -NFS4ERR_RESET_TO_MDS;
767 }
768out:
769 task->tk_status = 0;
770 return -EAGAIN;
771out_bad_stateid:
772 task->tk_status = -EIO;
773 return 0;
774wait_on_recovery:
775 rpc_sleep_on(&mds_client->cl_rpcwaitq, task, NULL);
776 if (test_bit(NFS4CLNT_MANAGER_RUNNING, &mds_client->cl_state) == 0)
777 rpc_wake_up_queued_task(&mds_client->cl_rpcwaitq, task);
778 goto out;
779}
780
781/* Retry all errors through either pNFS or MDS except for -EJUKEBOX */
782static int ff_layout_async_handle_error_v3(struct rpc_task *task,
783 struct pnfs_layout_segment *lseg,
784 int idx)
785{
786 struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx);
787
788 if (task->tk_status >= 0)
789 return 0;
790
791 if (task->tk_status != -EJUKEBOX) {
792 dprintk("%s DS connection error %d\n", __func__,
793 task->tk_status);
794 nfs4_mark_deviceid_unavailable(devid);
795 if (ff_layout_has_available_ds(lseg))
796 return -NFS4ERR_RESET_TO_PNFS;
797 else
798 return -NFS4ERR_RESET_TO_MDS;
799 }
800
801 if (task->tk_status == -EJUKEBOX)
802 nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY);
803 task->tk_status = 0;
804 rpc_restart_call(task);
805 rpc_delay(task, NFS_JUKEBOX_RETRY_TIME);
806 return -EAGAIN;
807}
808
809static int ff_layout_async_handle_error(struct rpc_task *task,
810 struct nfs4_state *state,
811 struct nfs_client *clp,
812 struct pnfs_layout_segment *lseg,
813 int idx)
814{
815 int vers = clp->cl_nfs_mod->rpc_vers->number;
816
817 switch (vers) {
818 case 3:
819 return ff_layout_async_handle_error_v3(task, lseg, idx);
820 case 4:
821 return ff_layout_async_handle_error_v4(task, state, clp,
822 lseg, idx);
823 default:
824 /* should never happen */
825 WARN_ON_ONCE(1);
826 return 0;
827 }
828}
829
830static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
831 int idx, u64 offset, u64 length,
832 u32 status, int opnum)
833{
834 struct nfs4_ff_layout_mirror *mirror;
835 int err;
836
837 mirror = FF_LAYOUT_COMP(lseg, idx);
838 err = ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout),
839 mirror, offset, length, status, opnum,
840 GFP_NOIO);
841 dprintk("%s: err %d op %d status %u\n", __func__, err, opnum, status);
842}
843
844/* NFS_PROTO call done callback routines */
845
846static int ff_layout_read_done_cb(struct rpc_task *task,
847 struct nfs_pgio_header *hdr)
848{
849 struct inode *inode;
850 int err;
851
852 trace_nfs4_pnfs_read(hdr, task->tk_status);
853 if (task->tk_status == -ETIMEDOUT && !hdr->res.op_status)
854 hdr->res.op_status = NFS4ERR_NXIO;
855 if (task->tk_status < 0 && hdr->res.op_status)
856 ff_layout_io_track_ds_error(hdr->lseg, hdr->pgio_mirror_idx,
857 hdr->args.offset, hdr->args.count,
858 hdr->res.op_status, OP_READ);
859 err = ff_layout_async_handle_error(task, hdr->args.context->state,
860 hdr->ds_clp, hdr->lseg,
861 hdr->pgio_mirror_idx);
862
863 switch (err) {
864 case -NFS4ERR_RESET_TO_PNFS:
865 set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
866 &hdr->lseg->pls_layout->plh_flags);
867 pnfs_read_resend_pnfs(hdr);
868 return task->tk_status;
869 case -NFS4ERR_RESET_TO_MDS:
870 inode = hdr->lseg->pls_layout->plh_inode;
871 pnfs_error_mark_layout_for_return(inode, hdr->lseg);
872 ff_layout_reset_read(hdr);
873 return task->tk_status;
874 case -EAGAIN:
875 rpc_restart_call_prepare(task);
876 return -EAGAIN;
877 }
878
879 return 0;
880}
881
882/*
883 * We reference the rpc_cred of the first WRITE that triggers the need for
884 * a LAYOUTCOMMIT, and use it to send the layoutcommit compound.
885 * RFC 5661 is not clear about which credential should be used.
886 *
887 * The flexfile client should treat a FILE_SYNC reply from the DS as DATA_SYNC,
888 * so to follow http://www.rfc-editor.org/errata_search.php?rfc=5661&eid=2751
889 * we always send a layoutcommit after DS writes.
890 */
891static void
892ff_layout_set_layoutcommit(struct nfs_pgio_header *hdr)
893{
894 pnfs_set_layoutcommit(hdr);
895 dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino,
896 (unsigned long) NFS_I(hdr->inode)->layout->plh_lwb);
897}
898
899static bool
900ff_layout_reset_to_mds(struct pnfs_layout_segment *lseg, int idx)
901{
902 /* No mirroring for now */
903 struct nfs4_deviceid_node *node = FF_LAYOUT_DEVID_NODE(lseg, idx);
904
905 return ff_layout_test_devid_unavailable(node);
906}
907
908static int ff_layout_read_prepare_common(struct rpc_task *task,
909 struct nfs_pgio_header *hdr)
910{
911 if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
912 rpc_exit(task, -EIO);
913 return -EIO;
914 }
915 if (ff_layout_reset_to_mds(hdr->lseg, hdr->pgio_mirror_idx)) {
916 dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid);
917 if (ff_layout_has_available_ds(hdr->lseg))
918 pnfs_read_resend_pnfs(hdr);
919 else
920 ff_layout_reset_read(hdr);
921 rpc_exit(task, 0);
922 return -EAGAIN;
923 }
924 hdr->pgio_done_cb = ff_layout_read_done_cb;
925
926 return 0;
927}
928
929/*
930 * Call ops for the async read/write cases
931 * In the case of dense layouts, the offset needs to be reset to its
932 * original value.
933 */
934static void ff_layout_read_prepare_v3(struct rpc_task *task, void *data)
935{
936 struct nfs_pgio_header *hdr = data;
937
938 if (ff_layout_read_prepare_common(task, hdr))
939 return;
940
941 rpc_call_start(task);
942}
943
944static int ff_layout_setup_sequence(struct nfs_client *ds_clp,
945 struct nfs4_sequence_args *args,
946 struct nfs4_sequence_res *res,
947 struct rpc_task *task)
948{
949 if (ds_clp->cl_session)
950 return nfs41_setup_sequence(ds_clp->cl_session,
951 args,
952 res,
953 task);
954 return nfs40_setup_sequence(ds_clp->cl_slot_tbl,
955 args,
956 res,
957 task);
958}
959
960static void ff_layout_read_prepare_v4(struct rpc_task *task, void *data)
961{
962 struct nfs_pgio_header *hdr = data;
963
964 if (ff_layout_read_prepare_common(task, hdr))
965 return;
966
967 if (ff_layout_setup_sequence(hdr->ds_clp,
968 &hdr->args.seq_args,
969 &hdr->res.seq_res,
970 task))
971 return;
972
973 if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context,
974 hdr->args.lock_context, FMODE_READ) == -EIO)
975 rpc_exit(task, -EIO); /* lost lock, terminate I/O */
976}
977
978static void ff_layout_read_call_done(struct rpc_task *task, void *data)
979{
980 struct nfs_pgio_header *hdr = data;
981
982 dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status);
983
984 if (test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
985 task->tk_status == 0) {
986 nfs4_sequence_done(task, &hdr->res.seq_res);
987 return;
988 }
989
990 /* Note this may cause RPC to be resent */
991 hdr->mds_ops->rpc_call_done(task, hdr);
992}
993
994static void ff_layout_read_count_stats(struct rpc_task *task, void *data)
995{
996 struct nfs_pgio_header *hdr = data;
997
998 rpc_count_iostats_metrics(task,
999 &NFS_CLIENT(hdr->inode)->cl_metrics[NFSPROC4_CLNT_READ]);
1000}
1001
1002static int ff_layout_write_done_cb(struct rpc_task *task,
1003 struct nfs_pgio_header *hdr)
1004{
1005 struct inode *inode;
1006 int err;
1007
1008 trace_nfs4_pnfs_write(hdr, task->tk_status);
1009 if (task->tk_status == -ETIMEDOUT && !hdr->res.op_status)
1010 hdr->res.op_status = NFS4ERR_NXIO;
1011 if (task->tk_status < 0 && hdr->res.op_status)
1012 ff_layout_io_track_ds_error(hdr->lseg, hdr->pgio_mirror_idx,
1013 hdr->args.offset, hdr->args.count,
1014 hdr->res.op_status, OP_WRITE);
1015 err = ff_layout_async_handle_error(task, hdr->args.context->state,
1016 hdr->ds_clp, hdr->lseg,
1017 hdr->pgio_mirror_idx);
1018
1019 switch (err) {
1020 case -NFS4ERR_RESET_TO_PNFS:
1021 case -NFS4ERR_RESET_TO_MDS:
1022 inode = hdr->lseg->pls_layout->plh_inode;
1023 pnfs_error_mark_layout_for_return(inode, hdr->lseg);
1024 if (err == -NFS4ERR_RESET_TO_PNFS) {
1025 pnfs_set_retry_layoutget(hdr->lseg->pls_layout);
1026 ff_layout_reset_write(hdr, true);
1027 } else {
1028 pnfs_clear_retry_layoutget(hdr->lseg->pls_layout);
1029 ff_layout_reset_write(hdr, false);
1030 }
1031 return task->tk_status;
1032 case -EAGAIN:
1033 rpc_restart_call_prepare(task);
1034 return -EAGAIN;
1035 }
1036
1037 if (hdr->res.verf->committed == NFS_FILE_SYNC ||
1038 hdr->res.verf->committed == NFS_DATA_SYNC)
1039 ff_layout_set_layoutcommit(hdr);
1040
1041 return 0;
1042}
1043
1044static int ff_layout_commit_done_cb(struct rpc_task *task,
1045 struct nfs_commit_data *data)
1046{
1047 struct inode *inode;
1048 int err;
1049
1050 trace_nfs4_pnfs_commit_ds(data, task->tk_status);
1051 if (task->tk_status == -ETIMEDOUT && !data->res.op_status)
1052 data->res.op_status = NFS4ERR_NXIO;
1053 if (task->tk_status < 0 && data->res.op_status)
1054 ff_layout_io_track_ds_error(data->lseg, data->ds_commit_index,
1055 data->args.offset, data->args.count,
1056 data->res.op_status, OP_COMMIT);
1057 err = ff_layout_async_handle_error(task, NULL, data->ds_clp,
1058 data->lseg, data->ds_commit_index);
1059
1060 switch (err) {
1061 case -NFS4ERR_RESET_TO_PNFS:
1062 case -NFS4ERR_RESET_TO_MDS:
1063 inode = data->lseg->pls_layout->plh_inode;
1064 pnfs_error_mark_layout_for_return(inode, data->lseg);
1065 if (err == -NFS4ERR_RESET_TO_PNFS)
1066 pnfs_set_retry_layoutget(data->lseg->pls_layout);
1067 else
1068 pnfs_clear_retry_layoutget(data->lseg->pls_layout);
1069 pnfs_generic_prepare_to_resend_writes(data);
1070 return -EAGAIN;
1071 case -EAGAIN:
1072 rpc_restart_call_prepare(task);
1073 return -EAGAIN;
1074 }
1075
1076 if (data->verf.committed == NFS_UNSTABLE)
1077 pnfs_commit_set_layoutcommit(data);
1078
1079 return 0;
1080}
1081
1082static int ff_layout_write_prepare_common(struct rpc_task *task,
1083 struct nfs_pgio_header *hdr)
1084{
1085 if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
1086 rpc_exit(task, -EIO);
1087 return -EIO;
1088 }
1089
1090 if (ff_layout_reset_to_mds(hdr->lseg, hdr->pgio_mirror_idx)) {
1091 bool retry_pnfs;
1092
1093 retry_pnfs = ff_layout_has_available_ds(hdr->lseg);
1094 dprintk("%s task %u reset io to %s\n", __func__,
1095 task->tk_pid, retry_pnfs ? "pNFS" : "MDS");
1096 ff_layout_reset_write(hdr, retry_pnfs);
1097 rpc_exit(task, 0);
1098 return -EAGAIN;
1099 }
1100
1101 return 0;
1102}
1103
1104static void ff_layout_write_prepare_v3(struct rpc_task *task, void *data)
1105{
1106 struct nfs_pgio_header *hdr = data;
1107
1108 if (ff_layout_write_prepare_common(task, hdr))
1109 return;
1110
1111 rpc_call_start(task);
1112}
1113
1114static void ff_layout_write_prepare_v4(struct rpc_task *task, void *data)
1115{
1116 struct nfs_pgio_header *hdr = data;
1117
1118 if (ff_layout_write_prepare_common(task, hdr))
1119 return;
1120
1121 if (ff_layout_setup_sequence(hdr->ds_clp,
1122 &hdr->args.seq_args,
1123 &hdr->res.seq_res,
1124 task))
1125 return;
1126
1127 if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context,
1128 hdr->args.lock_context, FMODE_WRITE) == -EIO)
1129 rpc_exit(task, -EIO); /* lost lock, terminate I/O */
1130}
1131
1132static void ff_layout_write_call_done(struct rpc_task *task, void *data)
1133{
1134 struct nfs_pgio_header *hdr = data;
1135
1136 if (test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
1137 task->tk_status == 0) {
1138 nfs4_sequence_done(task, &hdr->res.seq_res);
1139 return;
1140 }
1141
1142 /* Note this may cause RPC to be resent */
1143 hdr->mds_ops->rpc_call_done(task, hdr);
1144}
1145
1146static void ff_layout_write_count_stats(struct rpc_task *task, void *data)
1147{
1148 struct nfs_pgio_header *hdr = data;
1149
1150 rpc_count_iostats_metrics(task,
1151 &NFS_CLIENT(hdr->inode)->cl_metrics[NFSPROC4_CLNT_WRITE]);
1152}
1153
1154static void ff_layout_commit_prepare_v3(struct rpc_task *task, void *data)
1155{
1156 rpc_call_start(task);
1157}
1158
1159static void ff_layout_commit_prepare_v4(struct rpc_task *task, void *data)
1160{
1161 struct nfs_commit_data *wdata = data;
1162
1163 ff_layout_setup_sequence(wdata->ds_clp,
1164 &wdata->args.seq_args,
1165 &wdata->res.seq_res,
1166 task);
1167}
1168
1169static void ff_layout_commit_count_stats(struct rpc_task *task, void *data)
1170{
1171 struct nfs_commit_data *cdata = data;
1172
1173 rpc_count_iostats_metrics(task,
1174 &NFS_CLIENT(cdata->inode)->cl_metrics[NFSPROC4_CLNT_COMMIT]);
1175}
1176
1177static const struct rpc_call_ops ff_layout_read_call_ops_v3 = {
1178 .rpc_call_prepare = ff_layout_read_prepare_v3,
1179 .rpc_call_done = ff_layout_read_call_done,
1180 .rpc_count_stats = ff_layout_read_count_stats,
1181 .rpc_release = pnfs_generic_rw_release,
1182};
1183
1184static const struct rpc_call_ops ff_layout_read_call_ops_v4 = {
1185 .rpc_call_prepare = ff_layout_read_prepare_v4,
1186 .rpc_call_done = ff_layout_read_call_done,
1187 .rpc_count_stats = ff_layout_read_count_stats,
1188 .rpc_release = pnfs_generic_rw_release,
1189};
1190
1191static const struct rpc_call_ops ff_layout_write_call_ops_v3 = {
1192 .rpc_call_prepare = ff_layout_write_prepare_v3,
1193 .rpc_call_done = ff_layout_write_call_done,
1194 .rpc_count_stats = ff_layout_write_count_stats,
1195 .rpc_release = pnfs_generic_rw_release,
1196};
1197
1198static const struct rpc_call_ops ff_layout_write_call_ops_v4 = {
1199 .rpc_call_prepare = ff_layout_write_prepare_v4,
1200 .rpc_call_done = ff_layout_write_call_done,
1201 .rpc_count_stats = ff_layout_write_count_stats,
1202 .rpc_release = pnfs_generic_rw_release,
1203};
1204
1205static const struct rpc_call_ops ff_layout_commit_call_ops_v3 = {
1206 .rpc_call_prepare = ff_layout_commit_prepare_v3,
1207 .rpc_call_done = pnfs_generic_write_commit_done,
1208 .rpc_count_stats = ff_layout_commit_count_stats,
1209 .rpc_release = pnfs_generic_commit_release,
1210};
1211
1212static const struct rpc_call_ops ff_layout_commit_call_ops_v4 = {
1213 .rpc_call_prepare = ff_layout_commit_prepare_v4,
1214 .rpc_call_done = pnfs_generic_write_commit_done,
1215 .rpc_count_stats = ff_layout_commit_count_stats,
1216 .rpc_release = pnfs_generic_commit_release,
1217};
1218
1219static enum pnfs_try_status
1220ff_layout_read_pagelist(struct nfs_pgio_header *hdr)
1221{
1222 struct pnfs_layout_segment *lseg = hdr->lseg;
1223 struct nfs4_pnfs_ds *ds;
1224 struct rpc_clnt *ds_clnt;
1225 struct rpc_cred *ds_cred;
1226 loff_t offset = hdr->args.offset;
1227 u32 idx = hdr->pgio_mirror_idx;
1228 int vers;
1229 struct nfs_fh *fh;
1230
1231 dprintk("--> %s ino %lu pgbase %u req %Zu@%llu\n",
1232 __func__, hdr->inode->i_ino,
1233 hdr->args.pgbase, (size_t)hdr->args.count, offset);
1234
1235 ds = nfs4_ff_layout_prepare_ds(lseg, idx, false);
1236 if (!ds)
1237 goto out_failed;
1238
1239 ds_clnt = nfs4_ff_find_or_create_ds_client(lseg, idx, ds->ds_clp,
1240 hdr->inode);
1241 if (IS_ERR(ds_clnt))
1242 goto out_failed;
1243
1244 ds_cred = ff_layout_get_ds_cred(lseg, idx, hdr->cred);
1245 if (IS_ERR(ds_cred))
1246 goto out_failed;
1247
1248 vers = nfs4_ff_layout_ds_version(lseg, idx);
1249
1250 dprintk("%s USE DS: %s cl_count %d vers %d\n", __func__,
1251 ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count), vers);
1252
1253 atomic_inc(&ds->ds_clp->cl_count);
1254 hdr->ds_clp = ds->ds_clp;
1255 fh = nfs4_ff_layout_select_ds_fh(lseg, idx);
1256 if (fh)
1257 hdr->args.fh = fh;
1258
1259 /*
1260 * Note that if we ever decide to split across DSes,
1261 * then we may need to handle dense-like offsets.
1262 */
1263 hdr->args.offset = offset;
1264 hdr->mds_offset = offset;
1265
1266 /* Perform an asynchronous read to ds */
1267 nfs_initiate_pgio(ds_clnt, hdr, ds_cred, ds->ds_clp->rpc_ops,
1268 vers == 3 ? &ff_layout_read_call_ops_v3 :
1269 &ff_layout_read_call_ops_v4,
1270 0, RPC_TASK_SOFTCONN);
1271
1272 return PNFS_ATTEMPTED;
1273
1274out_failed:
1275 if (ff_layout_has_available_ds(lseg))
1276 return PNFS_TRY_AGAIN;
1277 return PNFS_NOT_ATTEMPTED;
1278}
1279
1280/* Perform async writes. */
1281static enum pnfs_try_status
1282ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync)
1283{
1284 struct pnfs_layout_segment *lseg = hdr->lseg;
1285 struct nfs4_pnfs_ds *ds;
1286 struct rpc_clnt *ds_clnt;
1287 struct rpc_cred *ds_cred;
1288 loff_t offset = hdr->args.offset;
1289 int vers;
1290 struct nfs_fh *fh;
1291 int idx = hdr->pgio_mirror_idx;
1292
1293 ds = nfs4_ff_layout_prepare_ds(lseg, idx, true);
1294 if (!ds)
1295 return PNFS_NOT_ATTEMPTED;
1296
1297 ds_clnt = nfs4_ff_find_or_create_ds_client(lseg, idx, ds->ds_clp,
1298 hdr->inode);
1299 if (IS_ERR(ds_clnt))
1300 return PNFS_NOT_ATTEMPTED;
1301
1302 ds_cred = ff_layout_get_ds_cred(lseg, idx, hdr->cred);
1303 if (IS_ERR(ds_cred))
1304 return PNFS_NOT_ATTEMPTED;
1305
1306 vers = nfs4_ff_layout_ds_version(lseg, idx);
1307
1308 dprintk("%s ino %lu sync %d req %Zu@%llu DS: %s cl_count %d vers %d\n",
1309 __func__, hdr->inode->i_ino, sync, (size_t) hdr->args.count,
1310 offset, ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count),
1311 vers);
1312
1313 hdr->pgio_done_cb = ff_layout_write_done_cb;
1314 atomic_inc(&ds->ds_clp->cl_count);
1315 hdr->ds_clp = ds->ds_clp;
1316 hdr->ds_commit_idx = idx;
1317 fh = nfs4_ff_layout_select_ds_fh(lseg, idx);
1318 if (fh)
1319 hdr->args.fh = fh;
1320
1321 /*
1322 * Note that if we ever decide to split across DSes,
1323 * then we may need to handle dense-like offsets.
1324 */
1325 hdr->args.offset = offset;
1326
1327 /* Perform an asynchronous write */
1328 nfs_initiate_pgio(ds_clnt, hdr, ds_cred, ds->ds_clp->rpc_ops,
1329 vers == 3 ? &ff_layout_write_call_ops_v3 :
1330 &ff_layout_write_call_ops_v4,
1331 sync, RPC_TASK_SOFTCONN);
1332 return PNFS_ATTEMPTED;
1333}
1334
1335static u32 calc_ds_index_from_commit(struct pnfs_layout_segment *lseg, u32 i)
1336{
1337 return i;
1338}
1339
1340static struct nfs_fh *
1341select_ds_fh_from_commit(struct pnfs_layout_segment *lseg, u32 i)
1342{
1343 struct nfs4_ff_layout_segment *flseg = FF_LAYOUT_LSEG(lseg);
1344
1345 /* FIXME: Assume that there is only one NFS version available
1346 * for the DS.
1347 */
1348 return &flseg->mirror_array[i]->fh_versions[0];
1349}
1350
1351static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how)
1352{
1353 struct pnfs_layout_segment *lseg = data->lseg;
1354 struct nfs4_pnfs_ds *ds;
1355 struct rpc_clnt *ds_clnt;
1356 struct rpc_cred *ds_cred;
1357 u32 idx;
1358 int vers;
1359 struct nfs_fh *fh;
1360
1361 idx = calc_ds_index_from_commit(lseg, data->ds_commit_index);
1362 ds = nfs4_ff_layout_prepare_ds(lseg, idx, true);
1363 if (!ds)
1364 goto out_err;
1365
1366 ds_clnt = nfs4_ff_find_or_create_ds_client(lseg, idx, ds->ds_clp,
1367 data->inode);
1368 if (IS_ERR(ds_clnt))
1369 goto out_err;
1370
1371 ds_cred = ff_layout_get_ds_cred(lseg, idx, data->cred);
1372 if (IS_ERR(ds_cred))
1373 goto out_err;
1374
1375 vers = nfs4_ff_layout_ds_version(lseg, idx);
1376
1377 dprintk("%s ino %lu, how %d cl_count %d vers %d\n", __func__,
1378 data->inode->i_ino, how, atomic_read(&ds->ds_clp->cl_count),
1379 vers);
1380 data->commit_done_cb = ff_layout_commit_done_cb;
1381 data->cred = ds_cred;
1382 atomic_inc(&ds->ds_clp->cl_count);
1383 data->ds_clp = ds->ds_clp;
1384 fh = select_ds_fh_from_commit(lseg, data->ds_commit_index);
1385 if (fh)
1386 data->args.fh = fh;
1387 return nfs_initiate_commit(ds_clnt, data, ds->ds_clp->rpc_ops,
1388 vers == 3 ? &ff_layout_commit_call_ops_v3 :
1389 &ff_layout_commit_call_ops_v4,
1390 how, RPC_TASK_SOFTCONN);
1391out_err:
1392 pnfs_generic_prepare_to_resend_writes(data);
1393 pnfs_generic_commit_release(data);
1394 return -EAGAIN;
1395}
1396
1397static int
1398ff_layout_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
1399 int how, struct nfs_commit_info *cinfo)
1400{
1401 return pnfs_generic_commit_pagelist(inode, mds_pages, how, cinfo,
1402 ff_layout_initiate_commit);
1403}
1404
1405static struct pnfs_ds_commit_info *
1406ff_layout_get_ds_info(struct inode *inode)
1407{
1408 struct pnfs_layout_hdr *layout = NFS_I(inode)->layout;
1409
1410 if (layout == NULL)
1411 return NULL;
1412
1413 return &FF_LAYOUT_FROM_HDR(layout)->commit_info;
1414}
1415
1416static void
1417ff_layout_free_deviceid_node(struct nfs4_deviceid_node *d)
1418{
1419 nfs4_ff_layout_free_deviceid(container_of(d, struct nfs4_ff_layout_ds,
1420 id_node));
1421}
1422
1423static int ff_layout_encode_ioerr(struct nfs4_flexfile_layout *flo,
1424 struct xdr_stream *xdr,
1425 const struct nfs4_layoutreturn_args *args)
1426{
1427 struct pnfs_layout_hdr *hdr = &flo->generic_hdr;
1428 __be32 *start;
1429 int count = 0, ret = 0;
1430
1431 start = xdr_reserve_space(xdr, 4);
1432 if (unlikely(!start))
1433 return -E2BIG;
1434
1435	/* This assumes we always return _ALL_ layouts */
1436 spin_lock(&hdr->plh_inode->i_lock);
1437 ret = ff_layout_encode_ds_ioerr(flo, xdr, &count, &args->range);
1438 spin_unlock(&hdr->plh_inode->i_lock);
1439
1440 *start = cpu_to_be32(count);
1441
1442 return ret;
1443}
1444
1445/* report nothing for now */
1446static void ff_layout_encode_iostats(struct nfs4_flexfile_layout *flo,
1447 struct xdr_stream *xdr,
1448 const struct nfs4_layoutreturn_args *args)
1449{
1450 __be32 *p;
1451
1452 p = xdr_reserve_space(xdr, 4);
1453 if (likely(p))
1454 *p = cpu_to_be32(0);
1455}
1456
1457static struct nfs4_deviceid_node *
1458ff_layout_alloc_deviceid_node(struct nfs_server *server,
1459 struct pnfs_device *pdev, gfp_t gfp_flags)
1460{
1461 struct nfs4_ff_layout_ds *dsaddr;
1462
1463 dsaddr = nfs4_ff_alloc_deviceid_node(server, pdev, gfp_flags);
1464 if (!dsaddr)
1465 return NULL;
1466 return &dsaddr->id_node;
1467}
1468
1469static void
1470ff_layout_encode_layoutreturn(struct pnfs_layout_hdr *lo,
1471 struct xdr_stream *xdr,
1472 const struct nfs4_layoutreturn_args *args)
1473{
1474 struct nfs4_flexfile_layout *flo = FF_LAYOUT_FROM_HDR(lo);
1475 __be32 *start;
1476
1477 dprintk("%s: Begin\n", __func__);
1478 start = xdr_reserve_space(xdr, 4);
1479 BUG_ON(!start);
1480
1481 if (ff_layout_encode_ioerr(flo, xdr, args))
1482 goto out;
1483
1484 ff_layout_encode_iostats(flo, xdr, args);
1485out:
1486 *start = cpu_to_be32((xdr->p - start - 1) * 4);
1487 dprintk("%s: Return\n", __func__);
1488}
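The *start assignment above is the standard XDR back-patch idiom: reserve the 4-byte length word, encode the variable-size body, then compute the body length from the stream pointer. Because xdr->p and start are __be32 pointers, (xdr->p - start - 1) counts the 32-bit words written after the length word, and the * 4 converts words to bytes; a body of five words, for example, yields cpu_to_be32(20).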
1489
1490static struct pnfs_layoutdriver_type flexfilelayout_type = {
1491 .id = LAYOUT_FLEX_FILES,
1492 .name = "LAYOUT_FLEX_FILES",
1493 .owner = THIS_MODULE,
1494 .alloc_layout_hdr = ff_layout_alloc_layout_hdr,
1495 .free_layout_hdr = ff_layout_free_layout_hdr,
1496 .alloc_lseg = ff_layout_alloc_lseg,
1497 .free_lseg = ff_layout_free_lseg,
1498 .pg_read_ops = &ff_layout_pg_read_ops,
1499 .pg_write_ops = &ff_layout_pg_write_ops,
1500 .get_ds_info = ff_layout_get_ds_info,
1501	.free_deviceid_node	= ff_layout_free_deviceid_node,
1502 .mark_request_commit = pnfs_layout_mark_request_commit,
1503 .clear_request_commit = pnfs_generic_clear_request_commit,
1504 .scan_commit_lists = pnfs_generic_scan_commit_lists,
1505 .recover_commit_reqs = pnfs_generic_recover_commit_reqs,
1506 .commit_pagelist = ff_layout_commit_pagelist,
1507 .read_pagelist = ff_layout_read_pagelist,
1508 .write_pagelist = ff_layout_write_pagelist,
1509 .alloc_deviceid_node = ff_layout_alloc_deviceid_node,
1510 .encode_layoutreturn = ff_layout_encode_layoutreturn,
1511};
1512
1513static int __init nfs4flexfilelayout_init(void)
1514{
1515 printk(KERN_INFO "%s: NFSv4 Flexfile Layout Driver Registering...\n",
1516 __func__);
1517 return pnfs_register_layoutdriver(&flexfilelayout_type);
1518}
1519
1520static void __exit nfs4flexfilelayout_exit(void)
1521{
1522 printk(KERN_INFO "%s: NFSv4 Flexfile Layout Driver Unregistering...\n",
1523 __func__);
1524 pnfs_unregister_layoutdriver(&flexfilelayout_type);
1525}
1526
1527MODULE_ALIAS("nfs-layouttype4-4");
1528
1529MODULE_LICENSE("GPL");
1530MODULE_DESCRIPTION("The NFSv4 flexfile layout driver");
1531
1532module_init(nfs4flexfilelayout_init);
1533module_exit(nfs4flexfilelayout_exit);
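The "nfs-layouttype4-4" alias corresponds to the LAYOUT_FLEX_FILES layout type number (4), which lets the generic pNFS core demand-load this driver when a server advertises the flexfile layout. Roughly, the core does the following (a sketch of the request_module() call in fs/nfs/pnfs.c; details may differ):

	/* LAYOUT_NFSV4_1_MODULE_PREFIX is "nfs-layouttype4" */
	request_module("%s-%u", LAYOUT_NFSV4_1_MODULE_PREFIX, id);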
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h
new file mode 100644
index 000000000000..070f20445b2d
--- /dev/null
+++ b/fs/nfs/flexfilelayout/flexfilelayout.h
@@ -0,0 +1,155 @@
1/*
2 * NFSv4 flexfile layout driver data structures.
3 *
4 * Copyright (c) 2014, Primary Data, Inc. All rights reserved.
5 *
6 * Tao Peng <bergwolf@primarydata.com>
7 */
8
9#ifndef FS_NFS_NFS4FLEXFILELAYOUT_H
10#define FS_NFS_NFS4FLEXFILELAYOUT_H
11
12#include "../pnfs.h"
13
14/* XXX: Let's filter out insanely large mirror counts for now to avoid OOM
15 * due to network errors etc. */
16#define NFS4_FLEXFILE_LAYOUT_MAX_MIRROR_CNT 4096
17
18struct nfs4_ff_ds_version {
19 u32 version;
20 u32 minor_version;
21 u32 rsize;
22 u32 wsize;
23 bool tightly_coupled;
24};
25
26/* chained in global deviceid hlist */
27struct nfs4_ff_layout_ds {
28 struct nfs4_deviceid_node id_node;
29 u32 ds_versions_cnt;
30 struct nfs4_ff_ds_version *ds_versions;
31 struct nfs4_pnfs_ds *ds;
32};
33
34struct nfs4_ff_layout_ds_err {
35 struct list_head list; /* linked in mirror error_list */
36 u64 offset;
37 u64 length;
38 int status;
39 enum nfs_opnum4 opnum;
40 nfs4_stateid stateid;
41 struct nfs4_deviceid deviceid;
42};
43
44struct nfs4_ff_layout_mirror {
45 u32 ds_count;
46 u32 efficiency;
47 struct nfs4_ff_layout_ds *mirror_ds;
48 u32 fh_versions_cnt;
49 struct nfs_fh *fh_versions;
50 nfs4_stateid stateid;
51 struct nfs4_string user_name;
52 struct nfs4_string group_name;
53 u32 uid;
54 u32 gid;
55 struct rpc_cred *cred;
56 spinlock_t lock;
57};
58
59struct nfs4_ff_layout_segment {
60 struct pnfs_layout_segment generic_hdr;
61 u64 stripe_unit;
62 u32 mirror_array_cnt;
63 struct nfs4_ff_layout_mirror **mirror_array;
64};
65
66struct nfs4_flexfile_layout {
67 struct pnfs_layout_hdr generic_hdr;
68 struct pnfs_ds_commit_info commit_info;
69 struct list_head error_list; /* nfs4_ff_layout_ds_err */
70};
71
72static inline struct nfs4_flexfile_layout *
73FF_LAYOUT_FROM_HDR(struct pnfs_layout_hdr *lo)
74{
75 return container_of(lo, struct nfs4_flexfile_layout, generic_hdr);
76}
77
78static inline struct nfs4_ff_layout_segment *
79FF_LAYOUT_LSEG(struct pnfs_layout_segment *lseg)
80{
81 return container_of(lseg,
82 struct nfs4_ff_layout_segment,
83 generic_hdr);
84}
85
86static inline struct nfs4_deviceid_node *
87FF_LAYOUT_DEVID_NODE(struct pnfs_layout_segment *lseg, u32 idx)
88{
89 if (idx >= FF_LAYOUT_LSEG(lseg)->mirror_array_cnt ||
90 FF_LAYOUT_LSEG(lseg)->mirror_array[idx] == NULL ||
91 FF_LAYOUT_LSEG(lseg)->mirror_array[idx]->mirror_ds == NULL)
92 return NULL;
93 return &FF_LAYOUT_LSEG(lseg)->mirror_array[idx]->mirror_ds->id_node;
94}
95
96static inline struct nfs4_ff_layout_ds *
97FF_LAYOUT_MIRROR_DS(struct nfs4_deviceid_node *node)
98{
99 return container_of(node, struct nfs4_ff_layout_ds, id_node);
100}
101
102static inline struct nfs4_ff_layout_mirror *
103FF_LAYOUT_COMP(struct pnfs_layout_segment *lseg, u32 idx)
104{
105 if (idx >= FF_LAYOUT_LSEG(lseg)->mirror_array_cnt)
106 return NULL;
107 return FF_LAYOUT_LSEG(lseg)->mirror_array[idx];
108}
109
110static inline u32
111FF_LAYOUT_MIRROR_COUNT(struct pnfs_layout_segment *lseg)
112{
113 return FF_LAYOUT_LSEG(lseg)->mirror_array_cnt;
114}
115
116static inline bool
117ff_layout_test_devid_unavailable(struct nfs4_deviceid_node *node)
118{
119 return nfs4_test_deviceid_unavailable(node);
120}
121
122static inline int
123nfs4_ff_layout_ds_version(struct pnfs_layout_segment *lseg, u32 ds_idx)
124{
125 return FF_LAYOUT_COMP(lseg, ds_idx)->mirror_ds->ds_versions[0].version;
126}
127
128struct nfs4_ff_layout_ds *
129nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
130 gfp_t gfp_flags);
131void nfs4_ff_layout_put_deviceid(struct nfs4_ff_layout_ds *mirror_ds);
132void nfs4_ff_layout_free_deviceid(struct nfs4_ff_layout_ds *mirror_ds);
133int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo,
134 struct nfs4_ff_layout_mirror *mirror, u64 offset,
135 u64 length, int status, enum nfs_opnum4 opnum,
136 gfp_t gfp_flags);
137int ff_layout_encode_ds_ioerr(struct nfs4_flexfile_layout *flo,
138 struct xdr_stream *xdr, int *count,
139 const struct pnfs_layout_range *range);
140struct nfs_fh *
141nfs4_ff_layout_select_ds_fh(struct pnfs_layout_segment *lseg, u32 mirror_idx);
142
143struct nfs4_pnfs_ds *
144nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
145 bool fail_return);
146
147struct rpc_clnt *
148nfs4_ff_find_or_create_ds_client(struct pnfs_layout_segment *lseg,
149 u32 ds_idx,
150 struct nfs_client *ds_clp,
151 struct inode *inode);
152struct rpc_cred *ff_layout_get_ds_cred(struct pnfs_layout_segment *lseg,
153 u32 ds_idx, struct rpc_cred *mdscred);
154bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg);
155#endif /* FS_NFS_NFS4FLEXFILELAYOUT_H */
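A minimal sketch of how the container_of() accessors above chain from a generic layout segment down to a data server (example_lookup_ds() is hypothetical and omits locking and refcounting):

	static struct nfs4_pnfs_ds *
	example_lookup_ds(struct pnfs_layout_segment *lseg, u32 idx)
	{
		struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, idx);

		if (!mirror || !mirror->mirror_ds)
			return NULL;
		return mirror->mirror_ds->ds;	/* may still be unconnected */
	}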
diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
new file mode 100644
index 000000000000..e2c01f204a95
--- /dev/null
+++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
@@ -0,0 +1,552 @@
1/*
2 * Device operations for the pnfs nfs4 file layout driver.
3 *
4 * Copyright (c) 2014, Primary Data, Inc. All rights reserved.
5 *
6 * Tao Peng <bergwolf@primarydata.com>
7 */
8
9#include <linux/nfs_fs.h>
10#include <linux/vmalloc.h>
11#include <linux/module.h>
12#include <linux/sunrpc/addr.h>
13
14#include "../internal.h"
15#include "../nfs4session.h"
16#include "flexfilelayout.h"
17
18#define NFSDBG_FACILITY NFSDBG_PNFS_LD
19
20static unsigned int dataserver_timeo = NFS4_DEF_DS_TIMEO;
21static unsigned int dataserver_retrans = NFS4_DEF_DS_RETRANS;
22
23void nfs4_ff_layout_put_deviceid(struct nfs4_ff_layout_ds *mirror_ds)
24{
25 if (mirror_ds)
26 nfs4_put_deviceid_node(&mirror_ds->id_node);
27}
28
29void nfs4_ff_layout_free_deviceid(struct nfs4_ff_layout_ds *mirror_ds)
30{
31 nfs4_print_deviceid(&mirror_ds->id_node.deviceid);
32 nfs4_pnfs_ds_put(mirror_ds->ds);
33 kfree(mirror_ds);
34}
35
36/* Decode opaque device data and construct new_ds using it */
37struct nfs4_ff_layout_ds *
38nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
39 gfp_t gfp_flags)
40{
41 struct xdr_stream stream;
42 struct xdr_buf buf;
43 struct page *scratch;
44 struct list_head dsaddrs;
45 struct nfs4_pnfs_ds_addr *da;
46 struct nfs4_ff_layout_ds *new_ds = NULL;
47 struct nfs4_ff_ds_version *ds_versions = NULL;
48 u32 mp_count;
49 u32 version_count;
50 __be32 *p;
51 int i, ret = -ENOMEM;
52
53 /* set up xdr stream */
54 scratch = alloc_page(gfp_flags);
55 if (!scratch)
56 goto out_err;
57
58 new_ds = kzalloc(sizeof(struct nfs4_ff_layout_ds), gfp_flags);
59 if (!new_ds)
60 goto out_scratch;
61
62 nfs4_init_deviceid_node(&new_ds->id_node,
63 server,
64 &pdev->dev_id);
65 INIT_LIST_HEAD(&dsaddrs);
66
67 xdr_init_decode_pages(&stream, &buf, pdev->pages, pdev->pglen);
68 xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE);
69
70 /* multipath count */
71 p = xdr_inline_decode(&stream, 4);
72 if (unlikely(!p))
73 goto out_err_drain_dsaddrs;
74 mp_count = be32_to_cpup(p);
75 dprintk("%s: multipath ds count %d\n", __func__, mp_count);
76
77 for (i = 0; i < mp_count; i++) {
78 /* multipath ds */
79 da = nfs4_decode_mp_ds_addr(server->nfs_client->cl_net,
80 &stream, gfp_flags);
81 if (da)
82 list_add_tail(&da->da_node, &dsaddrs);
83 }
84 if (list_empty(&dsaddrs)) {
85 dprintk("%s: no suitable DS addresses found\n",
86 __func__);
87 ret = -ENOMEDIUM;
88 goto out_err_drain_dsaddrs;
89 }
90
91 /* version count */
92 p = xdr_inline_decode(&stream, 4);
93 if (unlikely(!p))
94 goto out_err_drain_dsaddrs;
95 version_count = be32_to_cpup(p);
96 dprintk("%s: version count %d\n", __func__, version_count);
97
98 ds_versions = kzalloc(version_count * sizeof(struct nfs4_ff_ds_version),
99 gfp_flags);
100 if (!ds_versions)
101 goto out_scratch;
102
103 for (i = 0; i < version_count; i++) {
104 /* 20 = version(4) + minor_version(4) + rsize(4) + wsize(4) +
105 * tightly_coupled(4) */
106 p = xdr_inline_decode(&stream, 20);
107 if (unlikely(!p))
108 goto out_err_drain_dsaddrs;
109 ds_versions[i].version = be32_to_cpup(p++);
110 ds_versions[i].minor_version = be32_to_cpup(p++);
111 ds_versions[i].rsize = nfs_block_size(be32_to_cpup(p++), NULL);
112 ds_versions[i].wsize = nfs_block_size(be32_to_cpup(p++), NULL);
113 ds_versions[i].tightly_coupled = be32_to_cpup(p);
114
115 if (ds_versions[i].rsize > NFS_MAX_FILE_IO_SIZE)
116 ds_versions[i].rsize = NFS_MAX_FILE_IO_SIZE;
117 if (ds_versions[i].wsize > NFS_MAX_FILE_IO_SIZE)
118 ds_versions[i].wsize = NFS_MAX_FILE_IO_SIZE;
119
120 if (ds_versions[i].version != 3 || ds_versions[i].minor_version != 0) {
121 dprintk("%s: [%d] unsupported ds version %d-%d\n", __func__,
122 i, ds_versions[i].version,
123 ds_versions[i].minor_version);
124 ret = -EPROTONOSUPPORT;
125 goto out_err_drain_dsaddrs;
126 }
127
128 dprintk("%s: [%d] vers %u minor_ver %u rsize %u wsize %u coupled %d\n",
129 __func__, i, ds_versions[i].version,
130 ds_versions[i].minor_version,
131 ds_versions[i].rsize,
132 ds_versions[i].wsize,
133 ds_versions[i].tightly_coupled);
134 }
135
136 new_ds->ds_versions = ds_versions;
137 new_ds->ds_versions_cnt = version_count;
138
139 new_ds->ds = nfs4_pnfs_ds_add(&dsaddrs, gfp_flags);
140 if (!new_ds->ds)
141 goto out_err_drain_dsaddrs;
142
143 /* If DS was already in cache, free ds addrs */
144 while (!list_empty(&dsaddrs)) {
145 da = list_first_entry(&dsaddrs,
146 struct nfs4_pnfs_ds_addr,
147 da_node);
148 list_del_init(&da->da_node);
149 kfree(da->da_remotestr);
150 kfree(da);
151 }
152
153 __free_page(scratch);
154 return new_ds;
155
156out_err_drain_dsaddrs:
157 while (!list_empty(&dsaddrs)) {
158 da = list_first_entry(&dsaddrs, struct nfs4_pnfs_ds_addr,
159 da_node);
160 list_del_init(&da->da_node);
161 kfree(da->da_remotestr);
162 kfree(da);
163 }
164
165 kfree(ds_versions);
166out_scratch:
167 __free_page(scratch);
168out_err:
169 kfree(new_ds);
170
171 dprintk("%s ERROR: returning %d\n", __func__, ret);
172 return NULL;
173}
174
175static u64
176end_offset(u64 start, u64 len)
177{
178 u64 end;
179
180 end = start + len;
181 return end >= start ? end : NFS4_MAX_UINT64;
182}
183
184static void extend_ds_error(struct nfs4_ff_layout_ds_err *err,
185 u64 offset, u64 length)
186{
187 u64 end;
188
189 end = max_t(u64, end_offset(err->offset, err->length),
190 end_offset(offset, length));
191 err->offset = min_t(u64, err->offset, offset);
192 err->length = end - err->offset;
193}
194
195static bool ds_error_can_merge(struct nfs4_ff_layout_ds_err *err, u64 offset,
196 u64 length, int status, enum nfs_opnum4 opnum,
197 nfs4_stateid *stateid,
198 struct nfs4_deviceid *deviceid)
199{
200 return err->status == status && err->opnum == opnum &&
201 nfs4_stateid_match(&err->stateid, stateid) &&
202 !memcmp(&err->deviceid, deviceid, sizeof(*deviceid)) &&
203 end_offset(err->offset, err->length) >= offset &&
204 err->offset <= end_offset(offset, length);
205}
206
207static bool merge_ds_error(struct nfs4_ff_layout_ds_err *old,
208 struct nfs4_ff_layout_ds_err *new)
209{
210 if (!ds_error_can_merge(old, new->offset, new->length, new->status,
211 new->opnum, &new->stateid, &new->deviceid))
212 return false;
213
214 extend_ds_error(old, new->offset, new->length);
215 return true;
216}
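A worked example of the merge rules: given an existing error { offset = 0, length = 100 } and a new, otherwise-matching error { offset = 50, length = 200 }, the ranges overlap, so extend_ds_error() yields

	end    = max(end_offset(0, 100), end_offset(50, 200)) = 250
	offset = min(0, 50)                                   = 0
	length = 250 - 0                                      = 250

Adjacent ranges merge too, since ds_error_can_merge() compares the end points with >= and <=.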
217
218static bool
219ff_layout_add_ds_error_locked(struct nfs4_flexfile_layout *flo,
		struct nfs4_ff_layout_ds_err *dserr)
{
	struct nfs4_ff_layout_ds_err *err;

	list_for_each_entry(err, &flo->error_list, list) {
		if (merge_ds_error(err, dserr)) {
			return true;
		}
	}

	list_add(&dserr->list, &flo->error_list);
	return false;
}

static bool
ff_layout_update_ds_error(struct nfs4_flexfile_layout *flo, u64 offset,
			  u64 length, int status, enum nfs_opnum4 opnum,
			  nfs4_stateid *stateid, struct nfs4_deviceid *deviceid)
{
	bool found = false;
	struct nfs4_ff_layout_ds_err *err;

	list_for_each_entry(err, &flo->error_list, list) {
		if (ds_error_can_merge(err, offset, length, status, opnum,
				       stateid, deviceid)) {
			found = true;
			extend_ds_error(err, offset, length);
			break;
		}
	}

	return found;
}

int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo,
			     struct nfs4_ff_layout_mirror *mirror, u64 offset,
			     u64 length, int status, enum nfs_opnum4 opnum,
			     gfp_t gfp_flags)
{
	struct nfs4_ff_layout_ds_err *dserr;
	bool needfree;

	if (status == 0)
		return 0;

	if (mirror->mirror_ds == NULL)
		return -EINVAL;

	spin_lock(&flo->generic_hdr.plh_inode->i_lock);
	if (ff_layout_update_ds_error(flo, offset, length, status, opnum,
				      &mirror->stateid,
				      &mirror->mirror_ds->id_node.deviceid)) {
		spin_unlock(&flo->generic_hdr.plh_inode->i_lock);
		return 0;
	}
	spin_unlock(&flo->generic_hdr.plh_inode->i_lock);
	dserr = kmalloc(sizeof(*dserr), gfp_flags);
	if (!dserr)
		return -ENOMEM;

	INIT_LIST_HEAD(&dserr->list);
	dserr->offset = offset;
	dserr->length = length;
	dserr->status = status;
	dserr->opnum = opnum;
	nfs4_stateid_copy(&dserr->stateid, &mirror->stateid);
	memcpy(&dserr->deviceid, &mirror->mirror_ds->id_node.deviceid,
	       NFS4_DEVICEID4_SIZE);

	spin_lock(&flo->generic_hdr.plh_inode->i_lock);
	needfree = ff_layout_add_ds_error_locked(flo, dserr);
	spin_unlock(&flo->generic_hdr.plh_inode->i_lock);
	if (needfree)
		kfree(dserr);

	return 0;
}

/* currently we only support AUTH_NONE and AUTH_SYS */
static rpc_authflavor_t
nfs4_ff_layout_choose_authflavor(struct nfs4_ff_layout_mirror *mirror)
{
	if (mirror->uid == (u32)-1)
		return RPC_AUTH_NULL;
	return RPC_AUTH_UNIX;
}

/* fetch cred for NFSv3 DS */
static int ff_layout_update_mirror_cred(struct nfs4_ff_layout_mirror *mirror,
					struct nfs4_pnfs_ds *ds)
{
	if (ds->ds_clp && !mirror->cred &&
	    mirror->mirror_ds->ds_versions[0].version == 3) {
		struct rpc_auth *auth = ds->ds_clp->cl_rpcclient->cl_auth;
		struct rpc_cred *cred;
		struct auth_cred acred = {
			.uid = make_kuid(&init_user_ns, mirror->uid),
			.gid = make_kgid(&init_user_ns, mirror->gid),
		};

		/* AUTH_NULL ignores acred */
		cred = auth->au_ops->lookup_cred(auth, &acred, 0);
		if (IS_ERR(cred)) {
			dprintk("%s: lookup_cred failed with %ld\n",
				__func__, PTR_ERR(cred));
			return PTR_ERR(cred);
		} else {
			mirror->cred = cred;
		}
	}
	return 0;
}

struct nfs_fh *
nfs4_ff_layout_select_ds_fh(struct pnfs_layout_segment *lseg, u32 mirror_idx)
{
	struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, mirror_idx);
	struct nfs_fh *fh = NULL;
	struct nfs4_deviceid_node *devid;

	if (mirror == NULL || mirror->mirror_ds == NULL ||
	    mirror->mirror_ds->ds == NULL) {
		printk(KERN_ERR "NFS: %s: No data server for mirror offset index %d\n",
			__func__, mirror_idx);
		if (mirror && mirror->mirror_ds) {
			devid = &mirror->mirror_ds->id_node;
			pnfs_generic_mark_devid_invalid(devid);
		}
		goto out;
	}

	/* FIXME: For now assume there is only 1 version available for the DS */
	fh = &mirror->fh_versions[0];
out:
	return fh;
}

/* Upon return, either ds is connected, or ds is NULL */
struct nfs4_pnfs_ds *
nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
			  bool fail_return)
{
	struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx);
	struct nfs4_pnfs_ds *ds = NULL;
	struct nfs4_deviceid_node *devid;
	struct inode *ino = lseg->pls_layout->plh_inode;
	struct nfs_server *s = NFS_SERVER(ino);
	unsigned int max_payload;
	rpc_authflavor_t flavor;

	if (mirror == NULL || mirror->mirror_ds == NULL ||
	    mirror->mirror_ds->ds == NULL) {
		printk(KERN_ERR "NFS: %s: No data server for offset index %d\n",
			__func__, ds_idx);
		if (mirror && mirror->mirror_ds) {
			devid = &mirror->mirror_ds->id_node;
			pnfs_generic_mark_devid_invalid(devid);
		}
		goto out;
	}

	devid = &mirror->mirror_ds->id_node;
	if (ff_layout_test_devid_unavailable(devid))
		goto out;

	ds = mirror->mirror_ds->ds;
	/* matching smp_wmb() in _nfs4_pnfs_v3/4_ds_connect */
	smp_rmb();
	if (ds->ds_clp)
		goto out;

	flavor = nfs4_ff_layout_choose_authflavor(mirror);

	/* FIXME: For now we assume the server sent only one version of NFS
	 * to use for the DS.
	 */
	nfs4_pnfs_ds_connect(s, ds, devid, dataserver_timeo,
			     dataserver_retrans,
			     mirror->mirror_ds->ds_versions[0].version,
			     mirror->mirror_ds->ds_versions[0].minor_version,
			     flavor);

	/* connect success, check rsize/wsize limit */
	if (ds->ds_clp) {
		max_payload =
			nfs_block_size(rpc_max_payload(ds->ds_clp->cl_rpcclient),
				       NULL);
		if (mirror->mirror_ds->ds_versions[0].rsize > max_payload)
			mirror->mirror_ds->ds_versions[0].rsize = max_payload;
		if (mirror->mirror_ds->ds_versions[0].wsize > max_payload)
			mirror->mirror_ds->ds_versions[0].wsize = max_payload;
	} else {
		ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout),
					 mirror, lseg->pls_range.offset,
					 lseg->pls_range.length, NFS4ERR_NXIO,
					 OP_ILLEGAL, GFP_NOIO);
		if (fail_return) {
			pnfs_error_mark_layout_for_return(ino, lseg);
			if (ff_layout_has_available_ds(lseg))
				pnfs_set_retry_layoutget(lseg->pls_layout);
			else
				pnfs_clear_retry_layoutget(lseg->pls_layout);

		} else {
			if (ff_layout_has_available_ds(lseg))
				set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
					&lseg->pls_layout->plh_flags);
			else {
				pnfs_error_mark_layout_for_return(ino, lseg);
				pnfs_clear_retry_layoutget(lseg->pls_layout);
			}
		}
	}

	if (ff_layout_update_mirror_cred(mirror, ds))
		ds = NULL;
out:
	return ds;
}

struct rpc_cred *
ff_layout_get_ds_cred(struct pnfs_layout_segment *lseg, u32 ds_idx,
		      struct rpc_cred *mdscred)
{
	struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx);
	struct rpc_cred *cred = ERR_PTR(-EINVAL);

	if (!nfs4_ff_layout_prepare_ds(lseg, ds_idx, true))
		goto out;

	if (mirror && mirror->cred)
		cred = mirror->cred;
	else
		cred = mdscred;
out:
	return cred;
}

458/**
459* Find or create a DS rpc client with th MDS server rpc client auth flavor
460* in the nfs_client cl_ds_clients list.
461*/
struct rpc_clnt *
nfs4_ff_find_or_create_ds_client(struct pnfs_layout_segment *lseg, u32 ds_idx,
				 struct nfs_client *ds_clp, struct inode *inode)
{
	struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx);

	switch (mirror->mirror_ds->ds_versions[0].version) {
	case 3:
		/* For NFSv3 DS, flavor is set when creating DS connections */
		return ds_clp->cl_rpcclient;
	case 4:
		return nfs4_find_or_create_ds_client(ds_clp, inode);
	default:
		BUG();
	}
}

static bool is_range_intersecting(u64 offset1, u64 length1,
				  u64 offset2, u64 length2)
{
	u64 end1 = end_offset(offset1, length1);
	u64 end2 = end_offset(offset2, length2);

	return (end1 == NFS4_MAX_UINT64 || end1 > offset2) &&
	       (end2 == NFS4_MAX_UINT64 || end2 > offset1);
}

/* called with inode i_lock held */
int ff_layout_encode_ds_ioerr(struct nfs4_flexfile_layout *flo,
			      struct xdr_stream *xdr, int *count,
			      const struct pnfs_layout_range *range)
{
	struct nfs4_ff_layout_ds_err *err, *n;
	__be32 *p;

	list_for_each_entry_safe(err, n, &flo->error_list, list) {
		if (!is_range_intersecting(err->offset, err->length,
					   range->offset, range->length))
			continue;
		/* offset(8) + length(8) + stateid(NFS4_STATEID_SIZE)
		 * + deviceid(NFS4_DEVICEID4_SIZE) + status(4) + opnum(4)
		 */
		p = xdr_reserve_space(xdr,
				24 + NFS4_STATEID_SIZE + NFS4_DEVICEID4_SIZE);
		if (unlikely(!p))
			return -ENOBUFS;
		p = xdr_encode_hyper(p, err->offset);
		p = xdr_encode_hyper(p, err->length);
		p = xdr_encode_opaque_fixed(p, &err->stateid,
					    NFS4_STATEID_SIZE);
		p = xdr_encode_opaque_fixed(p, &err->deviceid,
					    NFS4_DEVICEID4_SIZE);
		*p++ = cpu_to_be32(err->status);
		*p++ = cpu_to_be32(err->opnum);
		*count += 1;
		list_del(&err->list);
		dprintk("%s: offset %llu length %llu status %d op %d count %d\n",
			__func__, err->offset, err->length, err->status,
			err->opnum, *count);
		kfree(err);
	}

	return 0;
}

bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg)
{
	struct nfs4_ff_layout_mirror *mirror;
	struct nfs4_deviceid_node *devid;
	int idx;

	for (idx = 0; idx < FF_LAYOUT_MIRROR_COUNT(lseg); idx++) {
		mirror = FF_LAYOUT_COMP(lseg, idx);
		if (mirror && mirror->mirror_ds) {
			devid = &mirror->mirror_ds->id_node;
			if (!ff_layout_test_devid_unavailable(devid))
				return true;
		}
	}

	return false;
}

module_param(dataserver_retrans, uint, 0644);
MODULE_PARM_DESC(dataserver_retrans, "The number of times the NFSv4.1 client "
		 "retries a request before it attempts further "
		 "recovery action.");
module_param(dataserver_timeo, uint, 0644);
MODULE_PARM_DESC(dataserver_timeo, "The time (in tenths of a second) the "
		 "NFSv4.1 client waits for a response from a "
		 "data server before it retries an NFS request.");
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c
index 2f5db844c172..857e2a99acc8 100644
--- a/fs/nfs/idmap.c
+++ b/fs/nfs/idmap.c
@@ -152,7 +152,7 @@ void nfs_fattr_map_and_free_names(struct nfs_server *server, struct nfs_fattr *f
 	nfs_fattr_free_group_name(fattr);
 }
 
-static int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *res)
+int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *res)
 {
 	unsigned long val;
 	char buf[16];
@@ -166,6 +166,7 @@ static int nfs_map_string_to_numeric(const char *name, size_t namelen, __u32 *re
 	*res = val;
 	return 1;
 }
+EXPORT_SYMBOL_GPL(nfs_map_string_to_numeric);
 
 static int nfs_map_numeric_to_string(__u32 id, char *buf, size_t buflen)
 {
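The hunk above un-statics and exports nfs_map_string_to_numeric() so other NFS modules can reuse it. A hedged sketch of the call pattern (the wrapper and its fallback value are illustrative):

/*
 * nfs_map_string_to_numeric() returns 1 and fills *id when the name is
 * a plain decimal string, 0 otherwise.
 */
static u32 example_parse_numeric_owner(const char *name, size_t namelen)
{
	__u32 id;

	if (nfs_map_string_to_numeric(name, namelen, &id))
		return id;
	return (u32)-1;	/* sentinel: not a numeric owner string */
}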
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 4bffe637ea32..83107be3dd01 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -352,8 +352,9 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st
 
 	nfs_attr_check_mountpoint(sb, fattr);
 
-	if (((fattr->valid & NFS_ATTR_FATTR_FILEID) == 0) &&
-	    !nfs_attr_use_mounted_on_fileid(fattr))
+	if (nfs_attr_use_mounted_on_fileid(fattr))
+		fattr->fileid = fattr->mounted_on_fileid;
+	else if ((fattr->valid & NFS_ATTR_FATTR_FILEID) == 0)
 		goto out_no_inode;
 	if ((fattr->valid & NFS_ATTR_FATTR_TYPE) == 0)
 		goto out_no_inode;
@@ -387,7 +388,6 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st
 	if (S_ISREG(inode->i_mode)) {
 		inode->i_fop = NFS_SB(sb)->nfs_client->rpc_ops->file_ops;
 		inode->i_data.a_ops = &nfs_file_aops;
-		inode->i_data.backing_dev_info = &NFS_SB(sb)->backing_dev_info;
 	} else if (S_ISDIR(inode->i_mode)) {
 		inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->dir_inode_ops;
 		inode->i_fop = &nfs_dir_operations;
@@ -506,10 +506,15 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr)
 		attr->ia_valid &= ~ATTR_MODE;
 
 	if (attr->ia_valid & ATTR_SIZE) {
+		loff_t i_size;
+
 		BUG_ON(!S_ISREG(inode->i_mode));
 
-		if (attr->ia_size == i_size_read(inode))
+		i_size = i_size_read(inode);
+		if (attr->ia_size == i_size)
 			attr->ia_valid &= ~ATTR_SIZE;
+		else if (attr->ia_size < i_size && IS_SWAPFILE(inode))
+			return -ETXTBSY;
 	}
 
 	/* Optimization: if the end result is no change, don't RPC */
@@ -1770,7 +1775,6 @@ static inline void nfs4_init_once(struct nfs_inode *nfsi)
 #if IS_ENABLED(CONFIG_NFS_V4)
 	INIT_LIST_HEAD(&nfsi->open_states);
 	nfsi->delegation = NULL;
-	nfsi->delegation_state = 0;
 	init_rwsem(&nfsi->rwsem);
 	nfsi->layout = NULL;
 #endif
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index efaa31c70fbe..b802fb3a2d99 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -6,6 +6,7 @@
 #include <linux/mount.h>
 #include <linux/security.h>
 #include <linux/crc32.h>
+#include <linux/nfs_page.h>
 
 #define NFS_MS_MASK (MS_RDONLY|MS_NOSUID|MS_NODEV|MS_NOEXEC|MS_SYNCHRONOUS)
 
@@ -31,8 +32,6 @@ static inline int nfs_attr_use_mounted_on_fileid(struct nfs_fattr *fattr)
 	    (((fattr->valid & NFS_ATTR_FATTR_MOUNTPOINT) == 0) &&
 	     ((fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) == 0)))
 		return 0;
-
-	fattr->fileid = fattr->mounted_on_fileid;
 	return 1;
 }
 
@@ -189,9 +188,15 @@ extern struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
 			const struct sockaddr *ds_addr,
 			int ds_addrlen, int ds_proto,
 			unsigned int ds_timeo,
-			unsigned int ds_retrans);
+			unsigned int ds_retrans,
+			u32 minor_version,
+			rpc_authflavor_t au_flavor);
 extern struct rpc_clnt *nfs4_find_or_create_ds_client(struct nfs_client *,
 			struct inode *);
+extern struct nfs_client *nfs3_set_ds_client(struct nfs_client *mds_clp,
+			const struct sockaddr *ds_addr, int ds_addrlen,
+			int ds_proto, unsigned int ds_timeo,
+			unsigned int ds_retrans, rpc_authflavor_t au_flavor);
 #ifdef CONFIG_PROC_FS
 extern int __init nfs_fs_proc_init(void);
 extern void nfs_fs_proc_exit(void);
@@ -244,9 +249,12 @@ struct nfs_pgio_header *nfs_pgio_header_alloc(const struct nfs_rw_ops *);
 void nfs_pgio_header_free(struct nfs_pgio_header *);
 void nfs_pgio_data_destroy(struct nfs_pgio_header *);
 int nfs_generic_pgio(struct nfs_pageio_descriptor *, struct nfs_pgio_header *);
-int nfs_initiate_pgio(struct rpc_clnt *, struct nfs_pgio_header *,
-		      const struct rpc_call_ops *, int, int);
+int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr,
+		      struct rpc_cred *cred, const struct nfs_rpc_ops *rpc_ops,
+		      const struct rpc_call_ops *call_ops, int how, int flags);
 void nfs_free_request(struct nfs_page *req);
+struct nfs_pgio_mirror *
+nfs_pgio_current_mirror(struct nfs_pageio_descriptor *desc);
 
 static inline void nfs_iocounter_init(struct nfs_io_counter *c)
 {
@@ -254,6 +262,12 @@ static inline void nfs_iocounter_init(struct nfs_io_counter *c)
 	atomic_set(&c->io_count, 0);
 }
 
+static inline bool nfs_pgio_has_mirroring(struct nfs_pageio_descriptor *desc)
+{
+	WARN_ON_ONCE(desc->pg_mirror_count < 1);
+	return desc->pg_mirror_count > 1;
+}
+
 /* nfs2xdr.c */
 extern struct rpc_procinfo nfs_procedures[];
 extern int nfs2_decode_dirent(struct xdr_stream *,
@@ -377,7 +391,7 @@ extern struct rpc_stat nfs_rpcstat;
 
 extern int __init register_nfs_fs(void);
 extern void __exit unregister_nfs_fs(void);
-extern void nfs_sb_active(struct super_block *sb);
+extern bool nfs_sb_active(struct super_block *sb);
 extern void nfs_sb_deactive(struct super_block *sb);
 
 /* namespace.c */
@@ -416,7 +430,6 @@ int nfs_show_options(struct seq_file *, struct dentry *);
 int nfs_show_devname(struct seq_file *, struct dentry *);
 int nfs_show_path(struct seq_file *, struct dentry *);
 int nfs_show_stats(struct seq_file *, struct dentry *);
-void nfs_put_super(struct super_block *);
 int nfs_remount(struct super_block *sb, int *flags, char *raw_data);
 
 /* write.c */
@@ -429,6 +442,7 @@ extern void nfs_write_prepare(struct rpc_task *task, void *calldata);
 extern void nfs_commit_prepare(struct rpc_task *task, void *calldata);
 extern int nfs_initiate_commit(struct rpc_clnt *clnt,
 			       struct nfs_commit_data *data,
+			       const struct nfs_rpc_ops *nfs_ops,
 			       const struct rpc_call_ops *call_ops,
 			       int how, int flags);
 extern void nfs_init_commit(struct nfs_commit_data *data,
@@ -442,13 +456,15 @@ int nfs_scan_commit(struct inode *inode, struct list_head *dst,
 		    struct nfs_commit_info *cinfo);
 void nfs_mark_request_commit(struct nfs_page *req,
 			     struct pnfs_layout_segment *lseg,
-			     struct nfs_commit_info *cinfo);
+			     struct nfs_commit_info *cinfo,
+			     u32 ds_commit_idx);
 int nfs_write_need_commit(struct nfs_pgio_header *);
 int nfs_generic_commit_list(struct inode *inode, struct list_head *head,
 			    int how, struct nfs_commit_info *cinfo);
 void nfs_retry_commit(struct list_head *page_list,
 		      struct pnfs_layout_segment *lseg,
-		      struct nfs_commit_info *cinfo);
+		      struct nfs_commit_info *cinfo,
+		      u32 ds_commit_idx);
 void nfs_commitdata_release(struct nfs_commit_data *data);
 void nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst,
 				 struct nfs_commit_info *cinfo);
@@ -459,6 +475,7 @@ void nfs_init_cinfo(struct nfs_commit_info *cinfo,
 		    struct nfs_direct_req *dreq);
 int nfs_key_timeout_notify(struct file *filp, struct inode *inode);
 bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx);
+void nfs_pageio_stop_mirroring(struct nfs_pageio_descriptor *pgio);
 
 #ifdef CONFIG_MIGRATION
 extern int nfs_migrate_page(struct address_space *,
@@ -482,6 +499,7 @@ static inline void nfs_inode_dio_wait(struct inode *inode)
 	inode_dio_wait(inode);
 }
 extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq);
+extern void nfs_direct_set_resched_writes(struct nfs_direct_req *dreq);
 
 /* nfs4proc.c */
 extern void __nfs4_read_done_cb(struct nfs_pgio_header *);
@@ -495,6 +513,26 @@ extern int nfs41_walk_client_list(struct nfs_client *clp,
 				  struct nfs_client **result,
 				  struct rpc_cred *cred);
 
+static inline struct inode *nfs_igrab_and_active(struct inode *inode)
+{
+	inode = igrab(inode);
+	if (inode != NULL && !nfs_sb_active(inode->i_sb)) {
+		iput(inode);
+		inode = NULL;
+	}
+	return inode;
+}
+
+static inline void nfs_iput_and_deactive(struct inode *inode)
+{
+	if (inode != NULL) {
+		struct super_block *sb = inode->i_sb;
+
+		iput(inode);
+		nfs_sb_deactive(sb);
+	}
+}
+
 /*
  * Determine the device name as a string
  */
@@ -560,6 +598,19 @@ void nfs_super_set_maxbytes(struct super_block *sb, __u64 maxfilesize)
 }
 
 /*
+ * Record the page as unstable and mark its inode as dirty.
+ */
+static inline
+void nfs_mark_page_unstable(struct page *page)
+{
+	struct inode *inode = page_file_mapping(page)->host;
+
+	inc_zone_page_state(page, NR_UNSTABLE_NFS);
+	inc_bdi_stat(inode_to_bdi(inode), BDI_RECLAIMABLE);
+	__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
+}
+
+/*
  * Determine the number of bytes of data the page contains
  */
 static inline
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 5f61b83f4a1c..b4e03ed8599d 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -481,7 +481,8 @@ out_overflow:
  *		void;
  *	};
  */
-static int decode_attrstat(struct xdr_stream *xdr, struct nfs_fattr *result)
+static int decode_attrstat(struct xdr_stream *xdr, struct nfs_fattr *result,
+			   __u32 *op_status)
 {
 	enum nfs_stat status;
 	int error;
@@ -489,6 +490,8 @@ static int decode_attrstat(struct xdr_stream *xdr, struct nfs_fattr *result)
 	error = decode_stat(xdr, &status);
 	if (unlikely(error))
 		goto out;
+	if (op_status)
+		*op_status = status;
 	if (status != NFS_OK)
 		goto out_default;
 	error = decode_fattr(xdr, result);
@@ -808,7 +811,7 @@ out_default:
 static int nfs2_xdr_dec_attrstat(struct rpc_rqst *req, struct xdr_stream *xdr,
				 struct nfs_fattr *result)
 {
-	return decode_attrstat(xdr, result);
+	return decode_attrstat(xdr, result, NULL);
 }
 
 static int nfs2_xdr_dec_diropres(struct rpc_rqst *req, struct xdr_stream *xdr,
@@ -865,6 +868,7 @@ static int nfs2_xdr_dec_readres(struct rpc_rqst *req, struct xdr_stream *xdr,
 	error = decode_stat(xdr, &status);
 	if (unlikely(error))
 		goto out;
+	result->op_status = status;
 	if (status != NFS_OK)
 		goto out_default;
 	error = decode_fattr(xdr, result->fattr);
@@ -882,7 +886,7 @@ static int nfs2_xdr_dec_writeres(struct rpc_rqst *req, struct xdr_stream *xdr,
 {
 	/* All NFSv2 writes are "file sync" writes */
 	result->verf->committed = NFS_FILE_SYNC;
-	return decode_attrstat(xdr, result->fattr);
+	return decode_attrstat(xdr, result->fattr, &result->op_status);
 }
 
 /**
diff --git a/fs/nfs/nfs3_fs.h b/fs/nfs/nfs3_fs.h
index 333ae4068506..e134d6548ab7 100644
--- a/fs/nfs/nfs3_fs.h
+++ b/fs/nfs/nfs3_fs.h
@@ -30,5 +30,7 @@ struct nfs_server *nfs3_create_server(struct nfs_mount_info *, struct nfs_subver
 struct nfs_server *nfs3_clone_server(struct nfs_server *, struct nfs_fh *,
				     struct nfs_fattr *, rpc_authflavor_t);
 
+/* nfs3super.c */
+extern struct nfs_subversion nfs_v3;
 
 #endif /* __LINUX_FS_NFS_NFS3_FS_H */
diff --git a/fs/nfs/nfs3client.c b/fs/nfs/nfs3client.c
index 8c1b437c5403..9e9fa347a948 100644
--- a/fs/nfs/nfs3client.c
+++ b/fs/nfs/nfs3client.c
@@ -1,5 +1,6 @@
 #include <linux/nfs_fs.h>
 #include <linux/nfs_mount.h>
+#include <linux/sunrpc/addr.h>
 #include "internal.h"
 #include "nfs3_fs.h"
 
@@ -64,3 +65,43 @@ struct nfs_server *nfs3_clone_server(struct nfs_server *source,
		nfs_init_server_aclclient(server);
	return server;
 }
+
+/*
+ * Set up a pNFS Data Server client over NFSv3.
+ *
+ * Return any existing nfs_client that matches server address,port,version
+ * and minorversion.
+ *
+ * For a new nfs_client, use a soft mount (default), a low retrans and a
+ * low timeout interval so that if a connection is lost, we retry through
+ * the MDS.
+ */
+struct nfs_client *nfs3_set_ds_client(struct nfs_client *mds_clp,
+		const struct sockaddr *ds_addr, int ds_addrlen,
+		int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans,
+		rpc_authflavor_t au_flavor)
+{
+	struct nfs_client_initdata cl_init = {
+		.addr = ds_addr,
+		.addrlen = ds_addrlen,
+		.nfs_mod = &nfs_v3,
+		.proto = ds_proto,
+		.net = mds_clp->cl_net,
+	};
+	struct rpc_timeout ds_timeout;
+	struct nfs_client *clp;
+	char buf[INET6_ADDRSTRLEN + 1];
+
+	/* fake a hostname because lockd wants it */
+	if (rpc_ntop(ds_addr, buf, sizeof(buf)) <= 0)
+		return ERR_PTR(-EINVAL);
+	cl_init.hostname = buf;
+
+	/* Use the MDS nfs_client cl_ipaddr. */
+	nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans);
+	clp = nfs_get_client(&cl_init, &ds_timeout, mds_clp->cl_ipaddr,
+			     au_flavor);
+
+	return clp;
+}
+EXPORT_SYMBOL_GPL(nfs3_set_ds_client);
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 524f9f837408..78e557c3ab87 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -800,6 +800,9 @@ static int nfs3_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
 {
	struct inode *inode = hdr->inode;
 
+	if (hdr->pgio_done_cb != NULL)
+		return hdr->pgio_done_cb(task, hdr);
+
	if (nfs3_async_handle_jukebox(task, inode))
		return -EAGAIN;
 
@@ -825,6 +828,9 @@ static int nfs3_write_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
 {
	struct inode *inode = hdr->inode;
 
+	if (hdr->pgio_done_cb != NULL)
+		return hdr->pgio_done_cb(task, hdr);
+
	if (nfs3_async_handle_jukebox(task, inode))
		return -EAGAIN;
	if (task->tk_status >= 0)
@@ -845,6 +851,9 @@ static void nfs3_proc_commit_rpc_prepare(struct rpc_task *task, struct nfs_commi
 
 static int nfs3_commit_done(struct rpc_task *task, struct nfs_commit_data *data)
 {
+	if (data->commit_done_cb != NULL)
+		return data->commit_done_cb(task, data);
+
	if (nfs3_async_handle_jukebox(task, data->inode))
		return -EAGAIN;
	nfs_refresh_inode(data->inode, data->res.fattr);
diff --git a/fs/nfs/nfs3super.c b/fs/nfs/nfs3super.c
index 6af29c2da352..5c4394e4656b 100644
--- a/fs/nfs/nfs3super.c
+++ b/fs/nfs/nfs3super.c
@@ -7,7 +7,7 @@
 #include "nfs3_fs.h"
 #include "nfs.h"
 
-static struct nfs_subversion nfs_v3 = {
+struct nfs_subversion nfs_v3 = {
	.owner = THIS_MODULE,
	.nfs_fs = &nfs_fs_type,
	.rpc_vers = &nfs_version3,
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 8f4cbe7f4aa8..2a932fdc57cb 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -1636,6 +1636,7 @@ static int nfs3_xdr_dec_read3res(struct rpc_rqst *req, struct xdr_stream *xdr,
	error = decode_post_op_attr(xdr, result->fattr);
	if (unlikely(error))
		goto out;
+	result->op_status = status;
	if (status != NFS3_OK)
		goto out_status;
	error = decode_read3resok(xdr, result);
@@ -1708,6 +1709,7 @@ static int nfs3_xdr_dec_write3res(struct rpc_rqst *req, struct xdr_stream *xdr,
	error = decode_wcc_data(xdr, result->fattr);
	if (unlikely(error))
		goto out;
+	result->op_status = status;
	if (status != NFS3_OK)
		goto out_status;
	error = decode_write3resok(xdr, result);
@@ -2323,6 +2325,7 @@ static int nfs3_xdr_dec_commit3res(struct rpc_rqst *req,
	error = decode_wcc_data(xdr, result->fattr);
	if (unlikely(error))
		goto out;
+	result->op_status = status;
	if (status != NFS3_OK)
		goto out_status;
	error = decode_writeverf3(xdr, &result->verf->verifier);
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index a08178764cf9..fdef424b0cd3 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -44,6 +44,7 @@ enum nfs4_client_state {
 #define NFS4_RENEW_TIMEOUT		0x01
 #define NFS4_RENEW_DELEGATION_CB	0x02
 
+struct nfs_seqid_counter;
 struct nfs4_minor_version_ops {
	u32	minor_version;
	unsigned init_caps;
@@ -56,6 +57,8 @@ struct nfs4_minor_version_ops {
			struct nfs_fsinfo *);
	void	(*free_lock_state)(struct nfs_server *,
			struct nfs4_lock_state *);
+	struct nfs_seqid *
+		(*alloc_seqid)(struct nfs_seqid_counter *, gfp_t);
	const struct rpc_call_ops *call_sync_ops;
	const struct nfs4_state_recovery_ops *reboot_recovery_ops;
	const struct nfs4_state_recovery_ops *nograce_recovery_ops;
@@ -443,6 +446,12 @@ extern void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid);
 extern void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid);
 extern void nfs_release_seqid(struct nfs_seqid *seqid);
 extern void nfs_free_seqid(struct nfs_seqid *seqid);
+extern int nfs40_setup_sequence(struct nfs4_slot_table *tbl,
+				struct nfs4_sequence_args *args,
+				struct nfs4_sequence_res *res,
+				struct rpc_task *task);
+extern int nfs4_sequence_done(struct rpc_task *task,
+			      struct nfs4_sequence_res *res);
 
 extern void nfs4_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp);
 
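The new alloc_seqid op lets each minor version decide how (and whether) to allocate open/lock seqids; later hunks in this series switch callers from NULL checks to IS_ERR(). A hedged sketch of the calling convention (the wrapper itself is illustrative):

static struct nfs_seqid *example_alloc_seqid(struct nfs_server *server,
					     struct nfs_seqid_counter *counter)
{
	struct nfs_seqid *(*alloc_seqid)(struct nfs_seqid_counter *, gfp_t) =
		server->nfs_client->cl_mvops->alloc_seqid;

	/* NFSv4.1 is expected to return an ERR_PTR rather than allocate */
	return alloc_seqid(counter, GFP_KERNEL);
}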
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 03311259b0c4..8646af9b11d2 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -228,6 +228,7 @@ static void nfs4_shutdown_client(struct nfs_client *clp)
	kfree(clp->cl_serverowner);
	kfree(clp->cl_serverscope);
	kfree(clp->cl_implid);
+	kfree(clp->cl_owner_id);
 }
 
 void nfs4_free_client(struct nfs_client *clp)
@@ -452,6 +453,14 @@ static void nfs4_swap_callback_idents(struct nfs_client *keep,
	spin_unlock(&nn->nfs_client_lock);
 }
 
+static bool nfs4_match_client_owner_id(const struct nfs_client *clp1,
+		const struct nfs_client *clp2)
+{
+	if (clp1->cl_owner_id == NULL || clp2->cl_owner_id == NULL)
+		return true;
+	return strcmp(clp1->cl_owner_id, clp2->cl_owner_id) == 0;
+}
+
 /**
  * nfs40_walk_client_list - Find server that recognizes a client ID
  *
@@ -483,9 +492,6 @@ int nfs40_walk_client_list(struct nfs_client *new,
		if (pos->rpc_ops != new->rpc_ops)
			continue;
 
-		if (pos->cl_proto != new->cl_proto)
-			continue;
-
		if (pos->cl_minorversion != new->cl_minorversion)
			continue;
 
@@ -510,6 +516,9 @@ int nfs40_walk_client_list(struct nfs_client *new,
		if (pos->cl_clientid != new->cl_clientid)
			continue;
 
+		if (!nfs4_match_client_owner_id(pos, new))
+			continue;
+
		atomic_inc(&pos->cl_count);
		spin_unlock(&nn->nfs_client_lock);
 
@@ -566,20 +575,14 @@ static bool nfs4_match_clientids(struct nfs_client *a, struct nfs_client *b)
 }
 
 /*
- * Returns true if the server owners match
+ * Returns true if the server major ids match
 */
 static bool
-nfs4_match_serverowners(struct nfs_client *a, struct nfs_client *b)
+nfs4_check_clientid_trunking(struct nfs_client *a, struct nfs_client *b)
 {
	struct nfs41_server_owner *o1 = a->cl_serverowner;
	struct nfs41_server_owner *o2 = b->cl_serverowner;
 
-	if (o1->minor_id != o2->minor_id) {
-		dprintk("NFS: --> %s server owner minor IDs do not match\n",
-			__func__);
-		return false;
-	}
-
	if (o1->major_id_sz != o2->major_id_sz)
		goto out_major_mismatch;
	if (memcmp(o1->major_id, o2->major_id, o1->major_id_sz) != 0)
@@ -621,9 +624,6 @@ int nfs41_walk_client_list(struct nfs_client *new,
		if (pos->rpc_ops != new->rpc_ops)
			continue;
 
-		if (pos->cl_proto != new->cl_proto)
-			continue;
-
		if (pos->cl_minorversion != new->cl_minorversion)
			continue;
 
@@ -639,7 +639,7 @@ int nfs41_walk_client_list(struct nfs_client *new,
		prev = pos;
 
		status = nfs_wait_client_init_complete(pos);
-		if (status == 0) {
+		if (pos->cl_cons_state == NFS_CS_SESSION_INITING) {
			nfs4_schedule_lease_recovery(pos);
			status = nfs4_wait_clnt_recover(pos);
		}
@@ -654,7 +654,19 @@ int nfs41_walk_client_list(struct nfs_client *new,
		if (!nfs4_match_clientids(pos, new))
			continue;
 
-		if (!nfs4_match_serverowners(pos, new))
+		/*
+		 * Note that session trunking is just a special subcase of
+		 * client id trunking. In either case, we want to fall back
+		 * to using the existing nfs_client.
+		 */
+		if (!nfs4_check_clientid_trunking(pos, new))
+			continue;
+
+		/* Unlike NFSv4.0, we know that NFSv4.1 always uses the
+		 * uniform string, however someone might switch the
+		 * uniquifier string on us.
+		 */
+		if (!nfs4_match_client_owner_id(pos, new))
			continue;
 
		atomic_inc(&pos->cl_count);
@@ -837,14 +849,15 @@ error:
 */
 struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
		const struct sockaddr *ds_addr, int ds_addrlen,
-		int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans)
+		int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans,
+		u32 minor_version, rpc_authflavor_t au_flavor)
 {
	struct nfs_client_initdata cl_init = {
		.addr = ds_addr,
		.addrlen = ds_addrlen,
		.nfs_mod = &nfs_v4,
		.proto = ds_proto,
-		.minorversion = mds_clp->cl_minorversion,
+		.minorversion = minor_version,
		.net = mds_clp->cl_net,
	};
	struct rpc_timeout ds_timeout;
@@ -862,7 +875,7 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_client* mds_clp,
	 */
	nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans);
	clp = nfs_get_client(&cl_init, &ds_timeout, mds_clp->cl_ipaddr,
-			     mds_clp->cl_rpcclient->cl_auth->au_flavor);
+			     au_flavor);
 
	dprintk("<-- %s %p\n", __func__, clp);
	return clp;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index e7f8d5ff2581..88180ac5ea0e 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -495,12 +495,11 @@ static void nfs4_set_sequence_privileged(struct nfs4_sequence_args *args)
495 args->sa_privileged = 1; 495 args->sa_privileged = 1;
496} 496}
497 497
498static int nfs40_setup_sequence(const struct nfs_server *server, 498int nfs40_setup_sequence(struct nfs4_slot_table *tbl,
499 struct nfs4_sequence_args *args, 499 struct nfs4_sequence_args *args,
500 struct nfs4_sequence_res *res, 500 struct nfs4_sequence_res *res,
501 struct rpc_task *task) 501 struct rpc_task *task)
502{ 502{
503 struct nfs4_slot_table *tbl = server->nfs_client->cl_slot_tbl;
504 struct nfs4_slot *slot; 503 struct nfs4_slot *slot;
505 504
506 /* slot already allocated? */ 505 /* slot already allocated? */
@@ -535,6 +534,7 @@ out_sleep:
535 spin_unlock(&tbl->slot_tbl_lock); 534 spin_unlock(&tbl->slot_tbl_lock);
536 return -EAGAIN; 535 return -EAGAIN;
537} 536}
537EXPORT_SYMBOL_GPL(nfs40_setup_sequence);
538 538
539static int nfs40_sequence_done(struct rpc_task *task, 539static int nfs40_sequence_done(struct rpc_task *task,
540 struct nfs4_sequence_res *res) 540 struct nfs4_sequence_res *res)
@@ -694,8 +694,7 @@ out_retry:
694} 694}
695EXPORT_SYMBOL_GPL(nfs41_sequence_done); 695EXPORT_SYMBOL_GPL(nfs41_sequence_done);
696 696
697static int nfs4_sequence_done(struct rpc_task *task, 697int nfs4_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res)
698 struct nfs4_sequence_res *res)
699{ 698{
700 if (res->sr_slot == NULL) 699 if (res->sr_slot == NULL)
701 return 1; 700 return 1;
@@ -703,6 +702,7 @@ static int nfs4_sequence_done(struct rpc_task *task,
703 return nfs40_sequence_done(task, res); 702 return nfs40_sequence_done(task, res);
704 return nfs41_sequence_done(task, res); 703 return nfs41_sequence_done(task, res);
705} 704}
705EXPORT_SYMBOL_GPL(nfs4_sequence_done);
706 706
707int nfs41_setup_sequence(struct nfs4_session *session, 707int nfs41_setup_sequence(struct nfs4_session *session,
708 struct nfs4_sequence_args *args, 708 struct nfs4_sequence_args *args,
@@ -777,7 +777,8 @@ static int nfs4_setup_sequence(const struct nfs_server *server,
777 int ret = 0; 777 int ret = 0;
778 778
779 if (!session) 779 if (!session)
780 return nfs40_setup_sequence(server, args, res, task); 780 return nfs40_setup_sequence(server->nfs_client->cl_slot_tbl,
781 args, res, task);
781 782
782 dprintk("--> %s clp %p session %p sr_slot %u\n", 783 dprintk("--> %s clp %p session %p sr_slot %u\n",
783 __func__, session->clp, session, res->sr_slot ? 784 __func__, session->clp, session, res->sr_slot ?
@@ -818,14 +819,16 @@ static int nfs4_setup_sequence(const struct nfs_server *server,
818 struct nfs4_sequence_res *res, 819 struct nfs4_sequence_res *res,
819 struct rpc_task *task) 820 struct rpc_task *task)
820{ 821{
821 return nfs40_setup_sequence(server, args, res, task); 822 return nfs40_setup_sequence(server->nfs_client->cl_slot_tbl,
823 args, res, task);
822} 824}
823 825
824static int nfs4_sequence_done(struct rpc_task *task, 826int nfs4_sequence_done(struct rpc_task *task,
825 struct nfs4_sequence_res *res) 827 struct nfs4_sequence_res *res)
826{ 828{
827 return nfs40_sequence_done(task, res); 829 return nfs40_sequence_done(task, res);
828} 830}
831EXPORT_SYMBOL_GPL(nfs4_sequence_done);
829 832
830#endif /* !CONFIG_NFS_V4_1 */ 833#endif /* !CONFIG_NFS_V4_1 */
831 834
@@ -937,6 +940,31 @@ static bool nfs4_clear_cap_atomic_open_v1(struct nfs_server *server,
937 return true; 940 return true;
938} 941}
939 942
943static u32
944nfs4_map_atomic_open_share(struct nfs_server *server,
945 fmode_t fmode, int openflags)
946{
947 u32 res = 0;
948
949 switch (fmode & (FMODE_READ | FMODE_WRITE)) {
950 case FMODE_READ:
951 res = NFS4_SHARE_ACCESS_READ;
952 break;
953 case FMODE_WRITE:
954 res = NFS4_SHARE_ACCESS_WRITE;
955 break;
956 case FMODE_READ|FMODE_WRITE:
957 res = NFS4_SHARE_ACCESS_BOTH;
958 }
959 if (!(server->caps & NFS_CAP_ATOMIC_OPEN_V1))
960 goto out;
961 /* Want no delegation if we're using O_DIRECT */
962 if (openflags & O_DIRECT)
963 res |= NFS4_SHARE_WANT_NO_DELEG;
964out:
965 return res;
966}
967
940static enum open_claim_type4 968static enum open_claim_type4
941nfs4_map_atomic_open_claim(struct nfs_server *server, 969nfs4_map_atomic_open_claim(struct nfs_server *server,
942 enum open_claim_type4 claim) 970 enum open_claim_type4 claim)
@@ -977,6 +1005,7 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
977 struct dentry *parent = dget_parent(dentry); 1005 struct dentry *parent = dget_parent(dentry);
978 struct inode *dir = parent->d_inode; 1006 struct inode *dir = parent->d_inode;
979 struct nfs_server *server = NFS_SERVER(dir); 1007 struct nfs_server *server = NFS_SERVER(dir);
1008 struct nfs_seqid *(*alloc_seqid)(struct nfs_seqid_counter *, gfp_t);
980 struct nfs4_opendata *p; 1009 struct nfs4_opendata *p;
981 1010
982 p = kzalloc(sizeof(*p), gfp_mask); 1011 p = kzalloc(sizeof(*p), gfp_mask);
@@ -987,8 +1016,9 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
987 if (IS_ERR(p->f_label)) 1016 if (IS_ERR(p->f_label))
988 goto err_free_p; 1017 goto err_free_p;
989 1018
990 p->o_arg.seqid = nfs_alloc_seqid(&sp->so_seqid, gfp_mask); 1019 alloc_seqid = server->nfs_client->cl_mvops->alloc_seqid;
991 if (p->o_arg.seqid == NULL) 1020 p->o_arg.seqid = alloc_seqid(&sp->so_seqid, gfp_mask);
1021 if (IS_ERR(p->o_arg.seqid))
992 goto err_free_label; 1022 goto err_free_label;
993 nfs_sb_active(dentry->d_sb); 1023 nfs_sb_active(dentry->d_sb);
994 p->dentry = dget(dentry); 1024 p->dentry = dget(dentry);
@@ -997,6 +1027,8 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry,
997 atomic_inc(&sp->so_count); 1027 atomic_inc(&sp->so_count);
998 p->o_arg.open_flags = flags; 1028 p->o_arg.open_flags = flags;
999 p->o_arg.fmode = fmode & (FMODE_READ|FMODE_WRITE); 1029 p->o_arg.fmode = fmode & (FMODE_READ|FMODE_WRITE);
1030 p->o_arg.share_access = nfs4_map_atomic_open_share(server,
1031 fmode, flags);
1000 /* don't put an ACCESS op in OPEN compound if O_EXCL, because ACCESS 1032 /* don't put an ACCESS op in OPEN compound if O_EXCL, because ACCESS
1001 * will return permission denied for all bits until close */ 1033 * will return permission denied for all bits until close */
1002 if (!(flags & O_EXCL)) { 1034 if (!(flags & O_EXCL)) {
@@ -1117,8 +1149,6 @@ static int can_open_delegated(struct nfs_delegation *delegation, fmode_t fmode)
1117 return 0; 1149 return 0;
1118 if ((delegation->type & fmode) != fmode) 1150 if ((delegation->type & fmode) != fmode)
1119 return 0; 1151 return 0;
1120 if (test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags))
1121 return 0;
1122 if (test_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) 1152 if (test_bit(NFS_DELEGATION_RETURNING, &delegation->flags))
1123 return 0; 1153 return 0;
1124 nfs_mark_delegation_referenced(delegation); 1154 nfs_mark_delegation_referenced(delegation);
@@ -1169,6 +1199,16 @@ static bool nfs_need_update_open_stateid(struct nfs4_state *state,
1169 return false; 1199 return false;
1170} 1200}
1171 1201
1202static void nfs_resync_open_stateid_locked(struct nfs4_state *state)
1203{
1204 if (state->n_wronly)
1205 set_bit(NFS_O_WRONLY_STATE, &state->flags);
1206 if (state->n_rdonly)
1207 set_bit(NFS_O_RDONLY_STATE, &state->flags);
1208 if (state->n_rdwr)
1209 set_bit(NFS_O_RDWR_STATE, &state->flags);
1210}
1211
1172static void nfs_clear_open_stateid_locked(struct nfs4_state *state, 1212static void nfs_clear_open_stateid_locked(struct nfs4_state *state,
1173 nfs4_stateid *stateid, fmode_t fmode) 1213 nfs4_stateid *stateid, fmode_t fmode)
1174{ 1214{
@@ -1187,8 +1227,12 @@ static void nfs_clear_open_stateid_locked(struct nfs4_state *state,
1187 } 1227 }
1188 if (stateid == NULL) 1228 if (stateid == NULL)
1189 return; 1229 return;
1190 if (!nfs_need_update_open_stateid(state, stateid)) 1230 /* Handle races with OPEN */
1231 if (!nfs4_stateid_match_other(stateid, &state->open_stateid) ||
1232 !nfs4_stateid_is_newer(stateid, &state->open_stateid)) {
1233 nfs_resync_open_stateid_locked(state);
1191 return; 1234 return;
1235 }
1192 if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0) 1236 if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0)
1193 nfs4_stateid_copy(&state->stateid, stateid); 1237 nfs4_stateid_copy(&state->stateid, stateid);
1194 nfs4_stateid_copy(&state->open_stateid, stateid); 1238 nfs4_stateid_copy(&state->open_stateid, stateid);
@@ -1283,6 +1327,23 @@ no_delegation:
1283 return ret; 1327 return ret;
1284} 1328}
1285 1329
1330static bool nfs4_update_lock_stateid(struct nfs4_lock_state *lsp,
1331 const nfs4_stateid *stateid)
1332{
1333 struct nfs4_state *state = lsp->ls_state;
1334 bool ret = false;
1335
1336 spin_lock(&state->state_lock);
1337 if (!nfs4_stateid_match_other(stateid, &lsp->ls_stateid))
1338 goto out_noupdate;
1339 if (!nfs4_stateid_is_newer(stateid, &lsp->ls_stateid))
1340 goto out_noupdate;
1341 nfs4_stateid_copy(&lsp->ls_stateid, stateid);
1342 ret = true;
1343out_noupdate:
1344 spin_unlock(&state->state_lock);
1345 return ret;
1346}
1286 1347
1287static void nfs4_return_incompatible_delegation(struct inode *inode, fmode_t fmode) 1348static void nfs4_return_incompatible_delegation(struct inode *inode, fmode_t fmode)
1288{ 1349{
@@ -1681,8 +1742,8 @@ static void nfs4_open_confirm_prepare(struct rpc_task *task, void *calldata)
1681{ 1742{
1682 struct nfs4_opendata *data = calldata; 1743 struct nfs4_opendata *data = calldata;
1683 1744
1684 nfs40_setup_sequence(data->o_arg.server, &data->c_arg.seq_args, 1745 nfs40_setup_sequence(data->o_arg.server->nfs_client->cl_slot_tbl,
1685 &data->c_res.seq_res, task); 1746 &data->c_arg.seq_args, &data->c_res.seq_res, task);
1686} 1747}
1687 1748
1688static void nfs4_open_confirm_done(struct rpc_task *task, void *calldata) 1749static void nfs4_open_confirm_done(struct rpc_task *task, void *calldata)
@@ -2589,6 +2650,11 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
2589 case -NFS4ERR_OLD_STATEID: 2650 case -NFS4ERR_OLD_STATEID:
2590 case -NFS4ERR_BAD_STATEID: 2651 case -NFS4ERR_BAD_STATEID:
2591 case -NFS4ERR_EXPIRED: 2652 case -NFS4ERR_EXPIRED:
2653 if (!nfs4_stateid_match(&calldata->arg.stateid,
2654 &state->stateid)) {
2655 rpc_restart_call_prepare(task);
2656 goto out_release;
2657 }
2592 if (calldata->arg.fmode == 0) 2658 if (calldata->arg.fmode == 0)
2593 break; 2659 break;
2594 default: 2660 default:
@@ -2621,6 +2687,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
2621 is_rdwr = test_bit(NFS_O_RDWR_STATE, &state->flags); 2687 is_rdwr = test_bit(NFS_O_RDWR_STATE, &state->flags);
2622 is_rdonly = test_bit(NFS_O_RDONLY_STATE, &state->flags); 2688 is_rdonly = test_bit(NFS_O_RDONLY_STATE, &state->flags);
2623 is_wronly = test_bit(NFS_O_WRONLY_STATE, &state->flags); 2689 is_wronly = test_bit(NFS_O_WRONLY_STATE, &state->flags);
2690 nfs4_stateid_copy(&calldata->arg.stateid, &state->stateid);
2624 /* Calculate the change in open mode */ 2691 /* Calculate the change in open mode */
2625 calldata->arg.fmode = 0; 2692 calldata->arg.fmode = 0;
2626 if (state->n_rdwr == 0) { 2693 if (state->n_rdwr == 0) {
@@ -2655,6 +2722,9 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
2655 goto out_wait; 2722 goto out_wait;
2656 } 2723 }
2657 } 2724 }
2725 calldata->arg.share_access =
2726 nfs4_map_atomic_open_share(NFS_SERVER(inode),
2727 calldata->arg.fmode, 0);
2658 2728
2659 nfs_fattr_init(calldata->res.fattr); 2729 nfs_fattr_init(calldata->res.fattr);
2660 calldata->timestamp = jiffies; 2730 calldata->timestamp = jiffies;
@@ -2677,45 +2747,10 @@ static const struct rpc_call_ops nfs4_close_ops = {
2677 .rpc_release = nfs4_free_closedata, 2747 .rpc_release = nfs4_free_closedata,
2678}; 2748};
2679 2749
2680static bool nfs4_state_has_opener(struct nfs4_state *state)
2681{
2682 /* first check existing openers */
2683 if (test_bit(NFS_O_RDONLY_STATE, &state->flags) != 0 &&
2684 state->n_rdonly != 0)
2685 return true;
2686
2687 if (test_bit(NFS_O_WRONLY_STATE, &state->flags) != 0 &&
2688 state->n_wronly != 0)
2689 return true;
2690
2691 if (test_bit(NFS_O_RDWR_STATE, &state->flags) != 0 &&
2692 state->n_rdwr != 0)
2693 return true;
2694
2695 return false;
2696}
2697
2698static bool nfs4_roc(struct inode *inode) 2750static bool nfs4_roc(struct inode *inode)
2699{ 2751{
2700 struct nfs_inode *nfsi = NFS_I(inode); 2752 if (!nfs_have_layout(inode))
2701 struct nfs_open_context *ctx;
2702 struct nfs4_state *state;
2703
2704 spin_lock(&inode->i_lock);
2705 list_for_each_entry(ctx, &nfsi->open_files, list) {
2706 state = ctx->state;
2707 if (state == NULL)
2708 continue;
2709 if (nfs4_state_has_opener(state)) {
2710 spin_unlock(&inode->i_lock);
2711 return false;
2712 }
2713 }
2714 spin_unlock(&inode->i_lock);
2715
2716 if (nfs4_check_delegation(inode, FMODE_READ))
2717 return false; 2753 return false;
2718
2719 return pnfs_roc(inode); 2754 return pnfs_roc(inode);
2720} 2755}
2721 2756
@@ -2733,6 +2768,7 @@ static bool nfs4_roc(struct inode *inode)
2733int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait) 2768int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait)
2734{ 2769{
2735 struct nfs_server *server = NFS_SERVER(state->inode); 2770 struct nfs_server *server = NFS_SERVER(state->inode);
2771 struct nfs_seqid *(*alloc_seqid)(struct nfs_seqid_counter *, gfp_t);
2736 struct nfs4_closedata *calldata; 2772 struct nfs4_closedata *calldata;
2737 struct nfs4_state_owner *sp = state->owner; 2773 struct nfs4_state_owner *sp = state->owner;
2738 struct rpc_task *task; 2774 struct rpc_task *task;
@@ -2759,10 +2795,10 @@ int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait)
2759 calldata->inode = state->inode; 2795 calldata->inode = state->inode;
2760 calldata->state = state; 2796 calldata->state = state;
2761 calldata->arg.fh = NFS_FH(state->inode); 2797 calldata->arg.fh = NFS_FH(state->inode);
2762 calldata->arg.stateid = &state->open_stateid;
2763 /* Serialization for the sequence id */ 2798 /* Serialization for the sequence id */
2764 calldata->arg.seqid = nfs_alloc_seqid(&state->owner->so_seqid, gfp_mask); 2799 alloc_seqid = server->nfs_client->cl_mvops->alloc_seqid;
2765 if (calldata->arg.seqid == NULL) 2800 calldata->arg.seqid = alloc_seqid(&state->owner->so_seqid, gfp_mask);
2801 if (IS_ERR(calldata->arg.seqid))
2766 goto out_free_calldata; 2802 goto out_free_calldata;
2767 calldata->arg.fmode = 0; 2803 calldata->arg.fmode = 0;
2768 calldata->arg.bitmask = server->cache_consistency_bitmask; 2804 calldata->arg.bitmask = server->cache_consistency_bitmask;
@@ -4917,11 +4953,14 @@ static void nfs4_init_boot_verifier(const struct nfs_client *clp,
4917} 4953}
4918 4954
4919static unsigned int 4955static unsigned int
4920nfs4_init_nonuniform_client_string(const struct nfs_client *clp, 4956nfs4_init_nonuniform_client_string(struct nfs_client *clp,
4921 char *buf, size_t len) 4957 char *buf, size_t len)
4922{ 4958{
4923 unsigned int result; 4959 unsigned int result;
4924 4960
4961 if (clp->cl_owner_id != NULL)
4962 return strlcpy(buf, clp->cl_owner_id, len);
4963
4925 rcu_read_lock(); 4964 rcu_read_lock();
4926 result = scnprintf(buf, len, "Linux NFSv4.0 %s/%s %s", 4965 result = scnprintf(buf, len, "Linux NFSv4.0 %s/%s %s",
4927 clp->cl_ipaddr, 4966 clp->cl_ipaddr,
@@ -4930,24 +4969,32 @@ nfs4_init_nonuniform_client_string(const struct nfs_client *clp,
4930 rpc_peeraddr2str(clp->cl_rpcclient, 4969 rpc_peeraddr2str(clp->cl_rpcclient,
4931 RPC_DISPLAY_PROTO)); 4970 RPC_DISPLAY_PROTO));
4932 rcu_read_unlock(); 4971 rcu_read_unlock();
4972 clp->cl_owner_id = kstrdup(buf, GFP_KERNEL);
4933 return result; 4973 return result;
4934} 4974}
4935 4975
4936static unsigned int 4976static unsigned int
4937nfs4_init_uniform_client_string(const struct nfs_client *clp, 4977nfs4_init_uniform_client_string(struct nfs_client *clp,
4938 char *buf, size_t len) 4978 char *buf, size_t len)
4939{ 4979{
4940 const char *nodename = clp->cl_rpcclient->cl_nodename; 4980 const char *nodename = clp->cl_rpcclient->cl_nodename;
4981 unsigned int result;
4982
4983 if (clp->cl_owner_id != NULL)
4984 return strlcpy(buf, clp->cl_owner_id, len);
4941 4985
4942 if (nfs4_client_id_uniquifier[0] != '\0') 4986 if (nfs4_client_id_uniquifier[0] != '\0')
4943 return scnprintf(buf, len, "Linux NFSv%u.%u %s/%s", 4987 result = scnprintf(buf, len, "Linux NFSv%u.%u %s/%s",
4944 clp->rpc_ops->version, 4988 clp->rpc_ops->version,
4945 clp->cl_minorversion, 4989 clp->cl_minorversion,
4946 nfs4_client_id_uniquifier, 4990 nfs4_client_id_uniquifier,
4947 nodename); 4991 nodename);
4948 return scnprintf(buf, len, "Linux NFSv%u.%u %s", 4992 else
4993 result = scnprintf(buf, len, "Linux NFSv%u.%u %s",
4949 clp->rpc_ops->version, clp->cl_minorversion, 4994 clp->rpc_ops->version, clp->cl_minorversion,
4950 nodename); 4995 nodename);
4996 clp->cl_owner_id = kstrdup(buf, GFP_KERNEL);
4997 return result;
4951} 4998}
4952 4999
4953/* 5000/*
@@ -5128,9 +5175,13 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
5128static void nfs4_delegreturn_release(void *calldata) 5175static void nfs4_delegreturn_release(void *calldata)
5129{ 5176{
5130 struct nfs4_delegreturndata *data = calldata; 5177 struct nfs4_delegreturndata *data = calldata;
5178 struct inode *inode = data->inode;
5131 5179
5132 if (data->roc) 5180 if (inode) {
5133 pnfs_roc_release(data->inode); 5181 if (data->roc)
5182 pnfs_roc_release(inode);
5183 nfs_iput_and_deactive(inode);
5184 }
5134 kfree(calldata); 5185 kfree(calldata);
5135} 5186}
5136 5187
@@ -5187,9 +5238,9 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co
5187 nfs_fattr_init(data->res.fattr); 5238 nfs_fattr_init(data->res.fattr);
5188 data->timestamp = jiffies; 5239 data->timestamp = jiffies;
5189 data->rpc_status = 0; 5240 data->rpc_status = 0;
5190 data->inode = inode; 5241 data->inode = nfs_igrab_and_active(inode);
5191 data->roc = list_empty(&NFS_I(inode)->open_files) ? 5242 if (data->inode)
5192 pnfs_roc(inode) : false; 5243 data->roc = nfs4_roc(inode);
5193 5244
5194 task_setup_data.callback_data = data; 5245 task_setup_data.callback_data = data;
5195 msg.rpc_argp = &data->args; 5246 msg.rpc_argp = &data->args;
@@ -5344,7 +5395,6 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl,
5344 p->arg.fl = &p->fl; 5395 p->arg.fl = &p->fl;
5345 p->arg.seqid = seqid; 5396 p->arg.seqid = seqid;
5346 p->res.seqid = seqid; 5397 p->res.seqid = seqid;
5347 p->arg.stateid = &lsp->ls_stateid;
5348 p->lsp = lsp; 5398 p->lsp = lsp;
5349 atomic_inc(&lsp->ls_count); 5399 atomic_inc(&lsp->ls_count);
5350 /* Ensure we don't close file until we're done freeing locks! */ 5400 /* Ensure we don't close file until we're done freeing locks! */
@@ -5371,14 +5421,18 @@ static void nfs4_locku_done(struct rpc_task *task, void *data)
5371 return; 5421 return;
5372 switch (task->tk_status) { 5422 switch (task->tk_status) {
5373 case 0: 5423 case 0:
5374 nfs4_stateid_copy(&calldata->lsp->ls_stateid,
5375 &calldata->res.stateid);
5376 renew_lease(calldata->server, calldata->timestamp); 5424 renew_lease(calldata->server, calldata->timestamp);
5377 break; 5425 do_vfs_lock(calldata->fl.fl_file, &calldata->fl);
5426 if (nfs4_update_lock_stateid(calldata->lsp,
5427 &calldata->res.stateid))
5428 break;
5378 case -NFS4ERR_BAD_STATEID: 5429 case -NFS4ERR_BAD_STATEID:
5379 case -NFS4ERR_OLD_STATEID: 5430 case -NFS4ERR_OLD_STATEID:
5380 case -NFS4ERR_STALE_STATEID: 5431 case -NFS4ERR_STALE_STATEID:
5381 case -NFS4ERR_EXPIRED: 5432 case -NFS4ERR_EXPIRED:
5433 if (!nfs4_stateid_match(&calldata->arg.stateid,
5434 &calldata->lsp->ls_stateid))
5435 rpc_restart_call_prepare(task);
5382 break; 5436 break;
5383 default: 5437 default:
5384 if (nfs4_async_handle_error(task, calldata->server, 5438 if (nfs4_async_handle_error(task, calldata->server,
@@ -5394,6 +5448,7 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data)
 
 	if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0)
 		goto out_wait;
+	nfs4_stateid_copy(&calldata->arg.stateid, &calldata->lsp->ls_stateid);
 	if (test_bit(NFS_LOCK_INITIALIZED, &calldata->lsp->ls_flags) == 0) {
 		/* Note: exit _without_ running nfs4_locku_done */
 		goto out_no_action;
@@ -5464,6 +5519,7 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *
 	struct nfs_seqid *seqid;
 	struct nfs4_lock_state *lsp;
 	struct rpc_task *task;
+	struct nfs_seqid *(*alloc_seqid)(struct nfs_seqid_counter *, gfp_t);
 	int status = 0;
 	unsigned char fl_flags = request->fl_flags;
 
@@ -5487,9 +5543,10 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *
 	lsp = request->fl_u.nfs4_fl.owner;
 	if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) == 0)
 		goto out;
-	seqid = nfs_alloc_seqid(&lsp->ls_seqid, GFP_KERNEL);
+	alloc_seqid = NFS_SERVER(inode)->nfs_client->cl_mvops->alloc_seqid;
+	seqid = alloc_seqid(&lsp->ls_seqid, GFP_KERNEL);
 	status = -ENOMEM;
-	if (seqid == NULL)
+	if (IS_ERR(seqid))
 		goto out;
 	task = nfs4_do_unlck(request, nfs_file_open_context(request->fl_file), lsp, seqid);
 	status = PTR_ERR(task);
@@ -5522,6 +5579,7 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl,
 	struct nfs4_lockdata *p;
 	struct inode *inode = lsp->ls_state->inode;
 	struct nfs_server *server = NFS_SERVER(inode);
+	struct nfs_seqid *(*alloc_seqid)(struct nfs_seqid_counter *, gfp_t);
 
 	p = kzalloc(sizeof(*p), gfp_mask);
 	if (p == NULL)
@@ -5530,12 +5588,12 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl,
 	p->arg.fh = NFS_FH(inode);
 	p->arg.fl = &p->fl;
 	p->arg.open_seqid = nfs_alloc_seqid(&lsp->ls_state->owner->so_seqid, gfp_mask);
-	if (p->arg.open_seqid == NULL)
+	if (IS_ERR(p->arg.open_seqid))
 		goto out_free;
-	p->arg.lock_seqid = nfs_alloc_seqid(&lsp->ls_seqid, gfp_mask);
-	if (p->arg.lock_seqid == NULL)
+	alloc_seqid = server->nfs_client->cl_mvops->alloc_seqid;
+	p->arg.lock_seqid = alloc_seqid(&lsp->ls_seqid, gfp_mask);
+	if (IS_ERR(p->arg.lock_seqid))
 		goto out_free_seqid;
-	p->arg.lock_stateid = &lsp->ls_stateid;
 	p->arg.lock_owner.clientid = server->nfs_client->cl_clientid;
 	p->arg.lock_owner.id = lsp->ls_seqid.owner_id;
 	p->arg.lock_owner.s_dev = server->s_dev;
@@ -5562,15 +5620,19 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata)
 	if (nfs_wait_on_sequence(data->arg.lock_seqid, task) != 0)
 		goto out_wait;
 	/* Do we need to do an open_to_lock_owner? */
-	if (!(data->arg.lock_seqid->sequence->flags & NFS_SEQID_CONFIRMED)) {
+	if (!test_bit(NFS_LOCK_INITIALIZED, &data->lsp->ls_flags)) {
 		if (nfs_wait_on_sequence(data->arg.open_seqid, task) != 0) {
 			goto out_release_lock_seqid;
 		}
-		data->arg.open_stateid = &state->open_stateid;
+		nfs4_stateid_copy(&data->arg.open_stateid,
+				&state->open_stateid);
 		data->arg.new_lock_owner = 1;
 		data->res.open_seqid = data->arg.open_seqid;
-	} else
+	} else {
 		data->arg.new_lock_owner = 0;
+		nfs4_stateid_copy(&data->arg.lock_stateid,
+				&data->lsp->ls_stateid);
+	}
 	if (!nfs4_valid_open_stateid(state)) {
 		data->rpc_status = -EBADF;
 		task->tk_action = NULL;
@@ -5594,6 +5656,7 @@ out_wait:
 static void nfs4_lock_done(struct rpc_task *task, void *calldata)
 {
 	struct nfs4_lockdata *data = calldata;
+	struct nfs4_lock_state *lsp = data->lsp;
 
 	dprintk("%s: begin!\n", __func__);
 
@@ -5601,18 +5664,36 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata)
 		return;
 
 	data->rpc_status = task->tk_status;
-	if (data->arg.new_lock_owner != 0) {
-		if (data->rpc_status == 0)
-			nfs_confirm_seqid(&data->lsp->ls_seqid, 0);
-		else
-			goto out;
-	}
-	if (data->rpc_status == 0) {
-		nfs4_stateid_copy(&data->lsp->ls_stateid, &data->res.stateid);
-		set_bit(NFS_LOCK_INITIALIZED, &data->lsp->ls_flags);
-		renew_lease(NFS_SERVER(data->ctx->dentry->d_inode), data->timestamp);
+	switch (task->tk_status) {
+	case 0:
+		renew_lease(NFS_SERVER(data->ctx->dentry->d_inode),
+				data->timestamp);
+		if (data->arg.new_lock) {
+			data->fl.fl_flags &= ~(FL_SLEEP | FL_ACCESS);
+			if (do_vfs_lock(data->fl.fl_file, &data->fl) < 0) {
+				rpc_restart_call_prepare(task);
+				break;
+			}
+		}
+		if (data->arg.new_lock_owner != 0) {
+			nfs_confirm_seqid(&lsp->ls_seqid, 0);
+			nfs4_stateid_copy(&lsp->ls_stateid, &data->res.stateid);
+			set_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags);
+		} else if (!nfs4_update_lock_stateid(lsp, &data->res.stateid))
+			rpc_restart_call_prepare(task);
+		break;
+	case -NFS4ERR_BAD_STATEID:
+	case -NFS4ERR_OLD_STATEID:
+	case -NFS4ERR_STALE_STATEID:
+	case -NFS4ERR_EXPIRED:
+		if (data->arg.new_lock_owner != 0) {
+			if (!nfs4_stateid_match(&data->arg.open_stateid,
+						&lsp->ls_state->open_stateid))
+				rpc_restart_call_prepare(task);
+		} else if (!nfs4_stateid_match(&data->arg.lock_stateid,
+						&lsp->ls_stateid))
+			rpc_restart_call_prepare(task);
 	}
-out:
 	dprintk("%s: done, ret = %d!\n", __func__, data->rpc_status);
 }
 
@@ -5693,7 +5774,8 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
 		if (recovery_type == NFS_LOCK_RECLAIM)
 			data->arg.reclaim = NFS_LOCK_RECLAIM;
 		nfs4_set_sequence_privileged(&data->arg.seq_args);
-	}
+	} else
+		data->arg.new_lock = 1;
 	task = rpc_run_task(&task_setup_data);
 	if (IS_ERR(task))
 		return PTR_ERR(task);
@@ -5817,10 +5899,8 @@ static int nfs41_lock_expired(struct nfs4_state *state, struct file_lock *reques
 
 static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request)
 {
-	struct nfs4_state_owner *sp = state->owner;
 	struct nfs_inode *nfsi = NFS_I(state->inode);
 	unsigned char fl_flags = request->fl_flags;
-	unsigned int seq;
 	int status = -ENOLCK;
 
 	if ((fl_flags & FL_POSIX) &&
@@ -5840,25 +5920,11 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock
 		/* ...but avoid races with delegation recall... */
 		request->fl_flags = fl_flags & ~FL_SLEEP;
 		status = do_vfs_lock(request->fl_file, request);
-		goto out_unlock;
-	}
-	seq = raw_seqcount_begin(&sp->so_reclaim_seqcount);
-	up_read(&nfsi->rwsem);
-	status = _nfs4_do_setlk(state, cmd, request, NFS_LOCK_NEW);
-	if (status != 0)
+		up_read(&nfsi->rwsem);
 		goto out;
-	down_read(&nfsi->rwsem);
-	if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq)) {
-		status = -NFS4ERR_DELAY;
-		goto out_unlock;
 	}
-	/* Note: we always want to sleep here! */
-	request->fl_flags = fl_flags | FL_SLEEP;
-	if (do_vfs_lock(request->fl_file, request) < 0)
-		printk(KERN_WARNING "NFS: %s: VFS is out of sync with lock "
-			"manager!\n", __func__);
-out_unlock:
 	up_read(&nfsi->rwsem);
+	status = _nfs4_do_setlk(state, cmd, request, NFS_LOCK_NEW);
 out:
 	request->fl_flags = fl_flags;
 	return status;
@@ -5965,8 +6031,8 @@ static void nfs4_release_lockowner_prepare(struct rpc_task *task, void *calldata
 {
 	struct nfs_release_lockowner_data *data = calldata;
 	struct nfs_server *server = data->server;
-	nfs40_setup_sequence(server, &data->args.seq_args,
-			&data->res.seq_res, task);
+	nfs40_setup_sequence(server->nfs_client->cl_slot_tbl,
+			&data->args.seq_args, &data->res.seq_res, task);
 	data->args.lock_owner.clientid = server->nfs_client->cl_clientid;
 	data->timestamp = jiffies;
 }
@@ -6582,47 +6648,47 @@ nfs41_same_server_scope(struct nfs41_server_scope *a,
 int nfs4_proc_bind_conn_to_session(struct nfs_client *clp, struct rpc_cred *cred)
 {
 	int status;
+	struct nfs41_bind_conn_to_session_args args = {
+		.client = clp,
+		.dir = NFS4_CDFC4_FORE_OR_BOTH,
+	};
 	struct nfs41_bind_conn_to_session_res res;
 	struct rpc_message msg = {
 		.rpc_proc =
 			&nfs4_procedures[NFSPROC4_CLNT_BIND_CONN_TO_SESSION],
-		.rpc_argp = clp,
+		.rpc_argp = &args,
 		.rpc_resp = &res,
 		.rpc_cred = cred,
 	};
 
 	dprintk("--> %s\n", __func__);
 
-	res.session = kzalloc(sizeof(struct nfs4_session), GFP_NOFS);
-	if (unlikely(res.session == NULL)) {
-		status = -ENOMEM;
-		goto out;
-	}
+	nfs4_copy_sessionid(&args.sessionid, &clp->cl_session->sess_id);
+	if (!(clp->cl_session->flags & SESSION4_BACK_CHAN))
+		args.dir = NFS4_CDFC4_FORE;
 
 	status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
 	trace_nfs4_bind_conn_to_session(clp, status);
 	if (status == 0) {
-		if (memcmp(res.session->sess_id.data,
+		if (memcmp(res.sessionid.data,
 		    clp->cl_session->sess_id.data, NFS4_MAX_SESSIONID_LEN)) {
 			dprintk("NFS: %s: Session ID mismatch\n", __func__);
 			status = -EIO;
-			goto out_session;
+			goto out;
 		}
-		if (res.dir != NFS4_CDFS4_BOTH) {
+		if ((res.dir & args.dir) != res.dir || res.dir == 0) {
 			dprintk("NFS: %s: Unexpected direction from server\n",
 				__func__);
 			status = -EIO;
-			goto out_session;
+			goto out;
 		}
-		if (res.use_conn_in_rdma_mode) {
+		if (res.use_conn_in_rdma_mode != args.use_conn_in_rdma_mode) {
 			dprintk("NFS: %s: Server returned RDMA mode = true\n",
 				__func__);
 			status = -EIO;
-			goto out_session;
+			goto out;
 		}
 	}
-out_session:
-	kfree(res.session);
 out:
 	dprintk("<-- %s status= %d\n", __func__, status);
 	return status;
@@ -7100,10 +7166,11 @@ static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args)
 			args->bc_attrs.max_reqs);
 }
 
-static int nfs4_verify_fore_channel_attrs(struct nfs41_create_session_args *args, struct nfs4_session *session)
+static int nfs4_verify_fore_channel_attrs(struct nfs41_create_session_args *args,
+		struct nfs41_create_session_res *res)
 {
 	struct nfs4_channel_attrs *sent = &args->fc_attrs;
-	struct nfs4_channel_attrs *rcvd = &session->fc_attrs;
+	struct nfs4_channel_attrs *rcvd = &res->fc_attrs;
 
 	if (rcvd->max_resp_sz > sent->max_resp_sz)
 		return -EINVAL;
@@ -7122,11 +7189,14 @@ static int nfs4_verify_fore_channel_attrs(struct nfs41_create_session_args *args
 	return 0;
 }
 
-static int nfs4_verify_back_channel_attrs(struct nfs41_create_session_args *args, struct nfs4_session *session)
+static int nfs4_verify_back_channel_attrs(struct nfs41_create_session_args *args,
+		struct nfs41_create_session_res *res)
 {
 	struct nfs4_channel_attrs *sent = &args->bc_attrs;
-	struct nfs4_channel_attrs *rcvd = &session->bc_attrs;
+	struct nfs4_channel_attrs *rcvd = &res->bc_attrs;
 
+	if (!(res->flags & SESSION4_BACK_CHAN))
+		goto out;
 	if (rcvd->max_rqst_sz > sent->max_rqst_sz)
 		return -EINVAL;
 	if (rcvd->max_resp_sz < sent->max_resp_sz)
@@ -7138,18 +7208,30 @@ static int nfs4_verify_back_channel_attrs(struct nfs41_create_session_args *args
 		return -EINVAL;
 	if (rcvd->max_reqs != sent->max_reqs)
 		return -EINVAL;
+out:
 	return 0;
 }
 
 static int nfs4_verify_channel_attrs(struct nfs41_create_session_args *args,
-				     struct nfs4_session *session)
+				     struct nfs41_create_session_res *res)
 {
 	int ret;
 
-	ret = nfs4_verify_fore_channel_attrs(args, session);
+	ret = nfs4_verify_fore_channel_attrs(args, res);
 	if (ret)
 		return ret;
-	return nfs4_verify_back_channel_attrs(args, session);
+	return nfs4_verify_back_channel_attrs(args, res);
+}
+
+static void nfs4_update_session(struct nfs4_session *session,
+		struct nfs41_create_session_res *res)
+{
+	nfs4_copy_sessionid(&session->sess_id, &res->sessionid);
+	session->flags = res->flags;
+	memcpy(&session->fc_attrs, &res->fc_attrs, sizeof(session->fc_attrs));
+	if (res->flags & SESSION4_BACK_CHAN)
+		memcpy(&session->bc_attrs, &res->bc_attrs,
+				sizeof(session->bc_attrs));
 }
 
 static int _nfs4_proc_create_session(struct nfs_client *clp,
@@ -7158,11 +7240,12 @@ static int _nfs4_proc_create_session(struct nfs_client *clp,
 	struct nfs4_session *session = clp->cl_session;
 	struct nfs41_create_session_args args = {
 		.client = clp,
+		.clientid = clp->cl_clientid,
+		.seqid = clp->cl_seqid,
 		.cb_program = NFS4_CALLBACK,
 	};
-	struct nfs41_create_session_res res = {
-		.client = clp,
-	};
+	struct nfs41_create_session_res res;
+
 	struct rpc_message msg = {
 		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CREATE_SESSION],
 		.rpc_argp = &args,
@@ -7179,11 +7262,15 @@ static int _nfs4_proc_create_session(struct nfs_client *clp,
 
 	if (!status) {
 		/* Verify the session's negotiated channel_attrs values */
-		status = nfs4_verify_channel_attrs(&args, session);
+		status = nfs4_verify_channel_attrs(&args, &res);
 		/* Increment the clientid slot sequence id */
-		clp->cl_seqid++;
+		if (clp->cl_seqid == res.seqid)
+			clp->cl_seqid++;
+		if (status)
+			goto out;
+		nfs4_update_session(session, &res);
 	}
-
+out:
 	return status;
 }
 
@@ -7528,6 +7615,7 @@ nfs4_layoutget_prepare(struct rpc_task *task, void *calldata)
 		return;
 	if (pnfs_choose_layoutget_stateid(&lgp->args.stateid,
 					NFS_I(lgp->args.inode)->layout,
+					&lgp->args.range,
 					lgp->args.ctx->state)) {
 		rpc_exit(task, NFS4_OK);
 	}
@@ -7783,9 +7871,13 @@ static void nfs4_layoutreturn_release(void *calldata)
 	spin_lock(&lo->plh_inode->i_lock);
 	if (lrp->res.lrs_present)
 		pnfs_set_layout_stateid(lo, &lrp->res.stateid, true);
+	pnfs_clear_layoutreturn_waitbit(lo);
+	clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, &lo->plh_flags);
+	rpc_wake_up(&NFS_SERVER(lo->plh_inode)->roc_rpcwaitq);
 	lo->plh_block_lgets--;
 	spin_unlock(&lo->plh_inode->i_lock);
 	pnfs_put_layout_hdr(lrp->args.layout);
+	nfs_iput_and_deactive(lrp->inode);
 	kfree(calldata);
 	dprintk("<-- %s\n", __func__);
 }
@@ -7796,7 +7888,7 @@ static const struct rpc_call_ops nfs4_layoutreturn_call_ops = {
 	.rpc_release = nfs4_layoutreturn_release,
 };
 
-int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp)
+int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync)
 {
 	struct rpc_task *task;
 	struct rpc_message msg = {
@@ -7811,14 +7903,23 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp)
 		.callback_ops = &nfs4_layoutreturn_call_ops,
 		.callback_data = lrp,
 	};
-	int status;
+	int status = 0;
 
 	dprintk("--> %s\n", __func__);
+	if (!sync) {
+		lrp->inode = nfs_igrab_and_active(lrp->args.inode);
+		if (!lrp->inode) {
+			nfs4_layoutreturn_release(lrp);
+			return -EAGAIN;
+		}
+		task_setup_data.flags |= RPC_TASK_ASYNC;
+	}
 	nfs4_init_sequence(&lrp->args.seq_args, &lrp->res.seq_res, 1);
 	task = rpc_run_task(&task_setup_data);
 	if (IS_ERR(task))
 		return PTR_ERR(task);
-	status = task->tk_status;
+	if (sync)
+		status = task->tk_status;
 	trace_nfs4_layoutreturn(lrp->args.inode, status);
 	dprintk("<-- %s status=%d\n", __func__, status);
 	rpc_put_task(task);
@@ -7912,6 +8013,7 @@ static void nfs4_layoutcommit_release(void *calldata)
 		nfs_post_op_update_inode_force_wcc(data->args.inode,
 						   data->res.fattr);
 	put_rpccred(data->cred);
+	nfs_iput_and_deactive(data->inode);
 	kfree(data);
 }
7917 8019
@@ -7936,7 +8038,6 @@ nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync)
7936 .rpc_message = &msg, 8038 .rpc_message = &msg,
7937 .callback_ops = &nfs4_layoutcommit_ops, 8039 .callback_ops = &nfs4_layoutcommit_ops,
7938 .callback_data = data, 8040 .callback_data = data,
7939 .flags = RPC_TASK_ASYNC,
7940 }; 8041 };
7941 struct rpc_task *task; 8042 struct rpc_task *task;
7942 int status = 0; 8043 int status = 0;
@@ -7947,18 +8048,21 @@ nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync)
7947 data->args.lastbytewritten, 8048 data->args.lastbytewritten,
7948 data->args.inode->i_ino); 8049 data->args.inode->i_ino);
7949 8050
8051 if (!sync) {
8052 data->inode = nfs_igrab_and_active(data->args.inode);
8053 if (data->inode == NULL) {
8054 nfs4_layoutcommit_release(data);
8055 return -EAGAIN;
8056 }
8057 task_setup_data.flags = RPC_TASK_ASYNC;
8058 }
7950 nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1); 8059 nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1);
7951 task = rpc_run_task(&task_setup_data); 8060 task = rpc_run_task(&task_setup_data);
7952 if (IS_ERR(task)) 8061 if (IS_ERR(task))
7953 return PTR_ERR(task); 8062 return PTR_ERR(task);
7954 if (sync == false) 8063 if (sync)
7955 goto out; 8064 status = task->tk_status;
7956 status = nfs4_wait_for_completion_rpc_task(task);
7957 if (status != 0)
7958 goto out;
7959 status = task->tk_status;
7960 trace_nfs4_layoutcommit(data->args.inode, status); 8065 trace_nfs4_layoutcommit(data->args.inode, status);
7961out:
7962 dprintk("%s: status %d\n", __func__, status); 8066 dprintk("%s: status %d\n", __func__, status);
7963 rpc_put_task(task); 8067 rpc_put_task(task);
7964 return status; 8068 return status;
@@ -8386,6 +8490,7 @@ static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = {
 	.match_stateid = nfs4_match_stateid,
 	.find_root_sec = nfs4_find_root_sec,
 	.free_lock_state = nfs4_release_lockowner,
+	.alloc_seqid = nfs_alloc_seqid,
 	.call_sync_ops = &nfs40_call_sync_ops,
 	.reboot_recovery_ops = &nfs40_reboot_recovery_ops,
 	.nograce_recovery_ops = &nfs40_nograce_recovery_ops,
@@ -8394,6 +8499,12 @@ static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = {
 };
 
 #if defined(CONFIG_NFS_V4_1)
+static struct nfs_seqid *
+nfs_alloc_no_seqid(struct nfs_seqid_counter *arg1, gfp_t arg2)
+{
+	return NULL;
+}
+
 static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = {
 	.minor_version = 1,
 	.init_caps = NFS_CAP_READDIRPLUS
@@ -8407,6 +8518,7 @@ static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = {
 	.match_stateid = nfs41_match_stateid,
 	.find_root_sec = nfs41_find_root_sec,
 	.free_lock_state = nfs41_free_lock_state,
+	.alloc_seqid = nfs_alloc_no_seqid,
 	.call_sync_ops = &nfs41_call_sync_ops,
 	.reboot_recovery_ops = &nfs41_reboot_recovery_ops,
 	.nograce_recovery_ops = &nfs41_nograce_recovery_ops,
@@ -8433,6 +8545,7 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = {
 	.find_root_sec = nfs41_find_root_sec,
 	.free_lock_state = nfs41_free_lock_state,
 	.call_sync_ops = &nfs41_call_sync_ops,
+	.alloc_seqid = nfs_alloc_no_seqid,
 	.reboot_recovery_ops = &nfs41_reboot_recovery_ops,
 	.nograce_recovery_ops = &nfs41_nograce_recovery_ops,
 	.state_renewal_ops = &nfs41_state_renewal_ops,
diff --git a/fs/nfs/nfs4session.c b/fs/nfs/nfs4session.c
index e799dc3c3b1d..e23366effcfb 100644
--- a/fs/nfs/nfs4session.c
+++ b/fs/nfs/nfs4session.c
@@ -450,7 +450,7 @@ int nfs4_setup_session_slot_tables(struct nfs4_session *ses)
 	tbl = &ses->fc_slot_table;
 	tbl->session = ses;
 	status = nfs4_realloc_slot_table(tbl, ses->fc_attrs.max_reqs, 1);
-	if (status) /* -ENOMEM */
+	if (status || !(ses->flags & SESSION4_BACK_CHAN)) /* -ENOMEM */
 		return status;
 	/* Back channel */
 	tbl = &ses->bc_slot_table;
diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h
index b34ada9bc6a2..fc46c7455898 100644
--- a/fs/nfs/nfs4session.h
+++ b/fs/nfs/nfs4session.h
@@ -118,6 +118,12 @@ static inline int nfs4_has_persistent_session(const struct nfs_client *clp)
 	return 0;
 }
 
+static inline void nfs4_copy_sessionid(struct nfs4_sessionid *dst,
+		const struct nfs4_sessionid *src)
+{
+	memcpy(dst->data, src->data, NFS4_MAX_SESSIONID_LEN);
+}
+
 #ifdef CONFIG_CRC32
 /*
  * nfs_session_id_hash - calculate the crc32 hash for the session id
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 5194933ed419..5ad908e9ce9c 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1003,11 +1003,11 @@ struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_m
 	struct nfs_seqid *new;
 
 	new = kmalloc(sizeof(*new), gfp_mask);
-	if (new != NULL) {
-		new->sequence = counter;
-		INIT_LIST_HEAD(&new->list);
-		new->task = NULL;
-	}
+	if (new == NULL)
+		return ERR_PTR(-ENOMEM);
+	new->sequence = counter;
+	INIT_LIST_HEAD(&new->list);
+	new->task = NULL;
 	return new;
 }
 
@@ -1015,7 +1015,7 @@ void nfs_release_seqid(struct nfs_seqid *seqid)
 {
 	struct nfs_seqid_counter *sequence;
 
-	if (list_empty(&seqid->list))
+	if (seqid == NULL || list_empty(&seqid->list))
 		return;
 	sequence = seqid->sequence;
 	spin_lock(&sequence->lock);
@@ -1071,13 +1071,15 @@ static void nfs_increment_seqid(int status, struct nfs_seqid *seqid)
 
 void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid)
 {
-	struct nfs4_state_owner *sp = container_of(seqid->sequence,
-					struct nfs4_state_owner, so_seqid);
-	struct nfs_server *server = sp->so_server;
+	struct nfs4_state_owner *sp;
+
+	if (seqid == NULL)
+		return;
 
+	sp = container_of(seqid->sequence, struct nfs4_state_owner, so_seqid);
 	if (status == -NFS4ERR_BAD_SEQID)
 		nfs4_drop_state_owner(sp);
-	if (!nfs4_has_session(server->nfs_client))
+	if (!nfs4_has_session(sp->so_server->nfs_client))
 		nfs_increment_seqid(status, seqid);
 }
 
@@ -1088,14 +1090,18 @@ void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid)
  */
 void nfs_increment_lock_seqid(int status, struct nfs_seqid *seqid)
 {
-	nfs_increment_seqid(status, seqid);
+	if (seqid != NULL)
+		nfs_increment_seqid(status, seqid);
 }
 
 int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task)
 {
-	struct nfs_seqid_counter *sequence = seqid->sequence;
+	struct nfs_seqid_counter *sequence;
 	int status = 0;
 
+	if (seqid == NULL)
+		goto out;
+	sequence = seqid->sequence;
 	spin_lock(&sequence->lock);
 	seqid->task = task;
 	if (list_empty(&seqid->list))
@@ -1106,6 +1112,7 @@ int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task)
 		status = -EAGAIN;
 unlock:
 	spin_unlock(&sequence->lock);
+out:
 	return status;
 }
 
@@ -1366,49 +1373,55 @@ static int nfs4_reclaim_locks(struct nfs4_state *state, const struct nfs4_state_
 	struct nfs_inode *nfsi = NFS_I(inode);
 	struct file_lock *fl;
 	int status = 0;
+	struct file_lock_context *flctx = inode->i_flctx;
+	struct list_head *list;
 
-	if (inode->i_flock == NULL)
+	if (flctx == NULL)
 		return 0;
 
+	list = &flctx->flc_posix;
+
 	/* Guard against delegation returns and new lock/unlock calls */
 	down_write(&nfsi->rwsem);
-	/* Protect inode->i_flock using the BKL */
-	spin_lock(&inode->i_lock);
-	for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) {
-		if (!(fl->fl_flags & (FL_POSIX|FL_FLOCK)))
-			continue;
+	spin_lock(&flctx->flc_lock);
+restart:
+	list_for_each_entry(fl, list, fl_list) {
 		if (nfs_file_open_context(fl->fl_file)->state != state)
 			continue;
-		spin_unlock(&inode->i_lock);
+		spin_unlock(&flctx->flc_lock);
 		status = ops->recover_lock(state, fl);
 		switch (status) {
 		case 0:
 			break;
 		case -ESTALE:
 		case -NFS4ERR_ADMIN_REVOKED:
 		case -NFS4ERR_STALE_STATEID:
 		case -NFS4ERR_BAD_STATEID:
 		case -NFS4ERR_EXPIRED:
 		case -NFS4ERR_NO_GRACE:
 		case -NFS4ERR_STALE_CLIENTID:
 		case -NFS4ERR_BADSESSION:
 		case -NFS4ERR_BADSLOT:
 		case -NFS4ERR_BAD_HIGH_SLOT:
 		case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
 			goto out;
 		default:
-			printk(KERN_ERR "NFS: %s: unhandled error %d\n",
+			pr_err("NFS: %s: unhandled error %d\n",
 					__func__, status);
 		case -ENOMEM:
 		case -NFS4ERR_DENIED:
 		case -NFS4ERR_RECLAIM_BAD:
 		case -NFS4ERR_RECLAIM_CONFLICT:
 			/* kill_proc(fl->fl_pid, SIGLOST, 1); */
 			status = 0;
 		}
-		spin_lock(&inode->i_lock);
+		spin_lock(&flctx->flc_lock);
 	}
-	spin_unlock(&inode->i_lock);
+	if (list == &flctx->flc_posix) {
+		list = &flctx->flc_flock;
+		goto restart;
+	}
+	spin_unlock(&flctx->flc_lock);
 out:
 	up_write(&nfsi->rwsem);
 	return status;
diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c
index 6f340f02f2ba..75090feeafad 100644
--- a/fs/nfs/nfs4super.c
+++ b/fs/nfs/nfs4super.c
@@ -53,7 +53,6 @@ static const struct super_operations nfs4_sops = {
 	.destroy_inode	= nfs_destroy_inode,
 	.write_inode	= nfs4_write_inode,
 	.drop_inode	= nfs_drop_inode,
-	.put_super	= nfs_put_super,
 	.statfs		= nfs_statfs,
 	.evict_inode	= nfs4_evict_inode,
 	.umount_begin	= nfs_umount_begin,
@@ -346,6 +345,9 @@ out:
 
 static void __exit exit_nfs_v4(void)
 {
+	/* Not called in the _init(), conditionally loaded */
+	nfs4_pnfs_v3_ds_connect_unload();
+
 	unregister_nfs_version(&nfs_v4);
 	nfs4_unregister_sysctl();
 	nfs_idmap_quit();
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index cb4376b78ed9..5c399ec41079 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -946,7 +946,10 @@ static void encode_uint64(struct xdr_stream *xdr, u64 n)
 static void encode_nfs4_seqid(struct xdr_stream *xdr,
 		const struct nfs_seqid *seqid)
 {
-	encode_uint32(xdr, seqid->sequence->counter);
+	if (seqid != NULL)
+		encode_uint32(xdr, seqid->sequence->counter);
+	else
+		encode_uint32(xdr, 0);
 }
 
 static void encode_compound_hdr(struct xdr_stream *xdr,
@@ -1125,7 +1128,7 @@ static void encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg
 {
 	encode_op_hdr(xdr, OP_CLOSE, decode_close_maxsz, hdr);
 	encode_nfs4_seqid(xdr, arg->seqid);
-	encode_nfs4_stateid(xdr, arg->stateid);
+	encode_nfs4_stateid(xdr, &arg->stateid);
 }
 
 static void encode_commit(struct xdr_stream *xdr, const struct nfs_commitargs *args, struct compound_hdr *hdr)
@@ -1301,12 +1304,12 @@ static void encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args
 	*p = cpu_to_be32(args->new_lock_owner);
 	if (args->new_lock_owner){
 		encode_nfs4_seqid(xdr, args->open_seqid);
-		encode_nfs4_stateid(xdr, args->open_stateid);
+		encode_nfs4_stateid(xdr, &args->open_stateid);
 		encode_nfs4_seqid(xdr, args->lock_seqid);
 		encode_lockowner(xdr, &args->lock_owner);
 	}
 	else {
-		encode_nfs4_stateid(xdr, args->lock_stateid);
+		encode_nfs4_stateid(xdr, &args->lock_stateid);
 		encode_nfs4_seqid(xdr, args->lock_seqid);
 	}
 }
@@ -1330,7 +1333,7 @@ static void encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *ar
 	encode_op_hdr(xdr, OP_LOCKU, decode_locku_maxsz, hdr);
 	encode_uint32(xdr, nfs4_lock_type(args->fl, 0));
 	encode_nfs4_seqid(xdr, args->seqid);
-	encode_nfs4_stateid(xdr, args->stateid);
+	encode_nfs4_stateid(xdr, &args->stateid);
 	p = reserve_space(xdr, 16);
 	p = xdr_encode_hyper(p, args->fl->fl_start);
 	xdr_encode_hyper(p, nfs4_lock_length(args->fl));
@@ -1348,24 +1351,12 @@ static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struc
 	encode_string(xdr, name->len, name->name);
 }
 
-static void encode_share_access(struct xdr_stream *xdr, fmode_t fmode)
+static void encode_share_access(struct xdr_stream *xdr, u32 share_access)
 {
 	__be32 *p;
 
 	p = reserve_space(xdr, 8);
-	switch (fmode & (FMODE_READ|FMODE_WRITE)) {
-	case FMODE_READ:
-		*p++ = cpu_to_be32(NFS4_SHARE_ACCESS_READ);
-		break;
-	case FMODE_WRITE:
-		*p++ = cpu_to_be32(NFS4_SHARE_ACCESS_WRITE);
-		break;
-	case FMODE_READ|FMODE_WRITE:
-		*p++ = cpu_to_be32(NFS4_SHARE_ACCESS_BOTH);
-		break;
-	default:
-		*p++ = cpu_to_be32(0);
-	}
+	*p++ = cpu_to_be32(share_access);
 	*p = cpu_to_be32(0); /* for linux, share_deny = 0 always */
 }
 
@@ -1377,7 +1368,7 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena
 	 * owner 4 = 32
 	 */
 	encode_nfs4_seqid(xdr, arg->seqid);
-	encode_share_access(xdr, arg->fmode);
+	encode_share_access(xdr, arg->share_access);
 	p = reserve_space(xdr, 36);
 	p = xdr_encode_hyper(p, arg->clientid);
 	*p++ = cpu_to_be32(24);
@@ -1530,9 +1521,9 @@ static void encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_co
 static void encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_closeargs *arg, struct compound_hdr *hdr)
 {
 	encode_op_hdr(xdr, OP_OPEN_DOWNGRADE, decode_open_downgrade_maxsz, hdr);
-	encode_nfs4_stateid(xdr, arg->stateid);
+	encode_nfs4_stateid(xdr, &arg->stateid);
 	encode_nfs4_seqid(xdr, arg->seqid);
-	encode_share_access(xdr, arg->fmode);
+	encode_share_access(xdr, arg->share_access);
 }
 
 static void
@@ -1724,17 +1715,17 @@ static void encode_secinfo(struct xdr_stream *xdr, const struct qstr *name, stru
 #if defined(CONFIG_NFS_V4_1)
 /* NFSv4.1 operations */
 static void encode_bind_conn_to_session(struct xdr_stream *xdr,
-				struct nfs4_session *session,
+				struct nfs41_bind_conn_to_session_args *args,
 				struct compound_hdr *hdr)
 {
 	__be32 *p;
 
 	encode_op_hdr(xdr, OP_BIND_CONN_TO_SESSION,
 		decode_bind_conn_to_session_maxsz, hdr);
-	encode_opaque_fixed(xdr, session->sess_id.data, NFS4_MAX_SESSIONID_LEN);
+	encode_opaque_fixed(xdr, args->sessionid.data, NFS4_MAX_SESSIONID_LEN);
 	p = xdr_reserve_space(xdr, 8);
-	*p++ = cpu_to_be32(NFS4_CDFC4_BACK_OR_BOTH);
-	*p = 0;	/* use_conn_in_rdma_mode = False */
+	*p++ = cpu_to_be32(args->dir);
+	*p = (args->use_conn_in_rdma_mode) ? cpu_to_be32(1) : cpu_to_be32(0);
 }
 
 static void encode_op_map(struct xdr_stream *xdr, struct nfs4_op_map *op_map)
@@ -1801,9 +1792,8 @@ static void encode_create_session(struct xdr_stream *xdr,
 		struct compound_hdr *hdr)
 {
 	__be32 *p;
-	char machine_name[NFS4_MAX_MACHINE_NAME_LEN];
-	uint32_t len;
 	struct nfs_client *clp = args->client;
+	struct rpc_clnt *clnt = clp->cl_rpcclient;
 	struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id);
 	u32 max_resp_sz_cached;
 
@@ -1814,13 +1804,10 @@ static void encode_create_session(struct xdr_stream *xdr,
 	max_resp_sz_cached = (NFS4_dec_open_sz + RPC_REPHDRSIZE +
 				RPC_MAX_AUTH_SIZE + 2) * XDR_UNIT;
 
-	len = scnprintf(machine_name, sizeof(machine_name), "%s",
-			clp->cl_ipaddr);
-
 	encode_op_hdr(xdr, OP_CREATE_SESSION, decode_create_session_maxsz, hdr);
-	p = reserve_space(xdr, 16 + 2*28 + 20 + len + 12);
-	p = xdr_encode_hyper(p, clp->cl_clientid);
-	*p++ = cpu_to_be32(clp->cl_seqid); /*Sequence id */
+	p = reserve_space(xdr, 16 + 2*28 + 20 + clnt->cl_nodelen + 12);
+	p = xdr_encode_hyper(p, args->clientid);
+	*p++ = cpu_to_be32(args->seqid); /*Sequence id */
 	*p++ = cpu_to_be32(args->flags); /*flags */
 
 	/* Fore Channel */
@@ -1847,7 +1834,7 @@ static void encode_create_session(struct xdr_stream *xdr,
 
 	/* authsys_parms rfc1831 */
 	*p++ = cpu_to_be32(nn->boot_time.tv_nsec); /* stamp */
-	p = xdr_encode_opaque(p, machine_name, len);
+	p = xdr_encode_array(p, clnt->cl_nodename, clnt->cl_nodelen);
 	*p++ = cpu_to_be32(0); /* UID */
 	*p++ = cpu_to_be32(0); /* GID */
 	*p = cpu_to_be32(0); /* No more gids */
@@ -2012,11 +1999,11 @@ encode_layoutreturn(struct xdr_stream *xdr,
 	p = reserve_space(xdr, 16);
 	*p++ = cpu_to_be32(0); /* reclaim. always 0 for now */
 	*p++ = cpu_to_be32(args->layout_type);
-	*p++ = cpu_to_be32(IOMODE_ANY);
+	*p++ = cpu_to_be32(args->range.iomode);
 	*p = cpu_to_be32(RETURN_FILE);
 	p = reserve_space(xdr, 16);
-	p = xdr_encode_hyper(p, 0);
-	p = xdr_encode_hyper(p, NFS4_MAX_UINT64);
+	p = xdr_encode_hyper(p, args->range.offset);
+	p = xdr_encode_hyper(p, args->range.length);
 	spin_lock(&args->inode->i_lock);
 	encode_nfs4_stateid(xdr, &args->stateid);
 	spin_unlock(&args->inode->i_lock);
@@ -2747,14 +2734,14 @@ static void nfs4_xdr_enc_fsid_present(struct rpc_rqst *req,
  */
 static void nfs4_xdr_enc_bind_conn_to_session(struct rpc_rqst *req,
 				struct xdr_stream *xdr,
-				struct nfs_client *clp)
+				struct nfs41_bind_conn_to_session_args *args)
 {
 	struct compound_hdr hdr = {
-		.minorversion = clp->cl_mvops->minor_version,
+		.minorversion = args->client->cl_mvops->minor_version,
 	};
 
 	encode_compound_hdr(xdr, req, &hdr);
-	encode_bind_conn_to_session(xdr, clp->cl_session, &hdr);
+	encode_bind_conn_to_session(xdr, args, &hdr);
 	encode_nops(&hdr);
 }
 
@@ -4936,20 +4923,13 @@ out_overflow:
 	return -EIO;
 }
 
-static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
+static int decode_rw_delegation(struct xdr_stream *xdr,
+		uint32_t delegation_type,
+		struct nfs_openres *res)
 {
 	__be32 *p;
-	uint32_t delegation_type;
 	int status;
 
-	p = xdr_inline_decode(xdr, 4);
-	if (unlikely(!p))
-		goto out_overflow;
-	delegation_type = be32_to_cpup(p);
-	if (delegation_type == NFS4_OPEN_DELEGATE_NONE) {
-		res->delegation_type = 0;
-		return 0;
-	}
 	status = decode_stateid(xdr, &res->delegation);
 	if (unlikely(status))
 		return status;
@@ -4973,6 +4953,52 @@ out_overflow:
 	return -EIO;
 }
 
+static int decode_no_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
+{
+	__be32 *p;
+	uint32_t why_no_delegation;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
+	why_no_delegation = be32_to_cpup(p);
+	switch (why_no_delegation) {
+	case WND4_CONTENTION:
+	case WND4_RESOURCE:
+		xdr_inline_decode(xdr, 4);
+		/* Ignore for now */
+	}
+	return 0;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
+static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res)
+{
+	__be32 *p;
+	uint32_t delegation_type;
+
+	p = xdr_inline_decode(xdr, 4);
+	if (unlikely(!p))
+		goto out_overflow;
+	delegation_type = be32_to_cpup(p);
+	res->delegation_type = 0;
+	switch (delegation_type) {
+	case NFS4_OPEN_DELEGATE_NONE:
+		return 0;
+	case NFS4_OPEN_DELEGATE_READ:
+	case NFS4_OPEN_DELEGATE_WRITE:
+		return decode_rw_delegation(xdr, delegation_type, res);
+	case NFS4_OPEN_DELEGATE_NONE_EXT:
+		return decode_no_delegation(xdr, res);
+	}
+	return -EIO;
+out_overflow:
+	print_overflow_msg(__func__, xdr);
+	return -EIO;
+}
+
 static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res)
 {
 	__be32 *p;
@@ -5587,7 +5613,7 @@ static int decode_bind_conn_to_session(struct xdr_stream *xdr,
 
 	status = decode_op_hdr(xdr, OP_BIND_CONN_TO_SESSION);
 	if (!status)
-		status = decode_sessionid(xdr, &res->session->sess_id);
+		status = decode_sessionid(xdr, &res->sessionid);
 	if (unlikely(status))
 		return status;
 
@@ -5615,12 +5641,10 @@ static int decode_create_session(struct xdr_stream *xdr,
 {
 	__be32 *p;
 	int status;
-	struct nfs_client *clp = res->client;
-	struct nfs4_session *session = clp->cl_session;
 
 	status = decode_op_hdr(xdr, OP_CREATE_SESSION);
 	if (!status)
-		status = decode_sessionid(xdr, &session->sess_id);
+		status = decode_sessionid(xdr, &res->sessionid);
 	if (unlikely(status))
 		return status;
 
@@ -5628,13 +5652,13 @@ static int decode_create_session(struct xdr_stream *xdr,
 	p = xdr_inline_decode(xdr, 8);
 	if (unlikely(!p))
 		goto out_overflow;
-	clp->cl_seqid = be32_to_cpup(p++);
-	session->flags = be32_to_cpup(p);
+	res->seqid = be32_to_cpup(p++);
+	res->flags = be32_to_cpup(p);
 
 	/* Channel attributes */
-	status = decode_chan_attrs(xdr, &session->fc_attrs);
+	status = decode_chan_attrs(xdr, &res->fc_attrs);
 	if (!status)
-		status = decode_chan_attrs(xdr, &session->bc_attrs);
+		status = decode_chan_attrs(xdr, &res->bc_attrs);
 	return status;
 out_overflow:
 	print_overflow_msg(__func__, xdr);
@@ -6567,6 +6591,7 @@ static int nfs4_xdr_dec_read(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
 	int status;
 
 	status = decode_compound_hdr(xdr, &hdr);
+	res->op_status = hdr.status;
 	if (status)
 		goto out;
 	status = decode_sequence(xdr, &res->seq_res, rqstp);
@@ -6592,6 +6617,7 @@ static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
 	int status;
 
 	status = decode_compound_hdr(xdr, &hdr);
+	res->op_status = hdr.status;
 	if (status)
 		goto out;
 	status = decode_sequence(xdr, &res->seq_res, rqstp);
@@ -6621,6 +6647,7 @@ static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, struct xdr_stream *xdr,
 	int status;
 
 	status = decode_compound_hdr(xdr, &hdr);
+	res->op_status = hdr.status;
 	if (status)
 		goto out;
 	status = decode_sequence(xdr, &res->seq_res, rqstp);
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index cd3c910d2d12..9bc9f04fb7f6 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -261,11 +261,11 @@ static int __init root_nfs_data(char *cmdline)
 	 */
 	len = snprintf(nfs_export_path, sizeof(nfs_export_path),
 				tmp, utsname()->nodename);
-	if (len > (int)sizeof(nfs_export_path))
+	if (len >= (int)sizeof(nfs_export_path))
 		goto out_devnametoolong;
 	len = snprintf(nfs_root_device, sizeof(nfs_root_device),
 				"%pI4:%s", &servaddr, nfs_export_path);
-	if (len > (int)sizeof(nfs_root_device))
+	if (len >= (int)sizeof(nfs_root_device))
 		goto out_devnametoolong;
 
 	retval = 0;
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c
index 9e5bc42180e4..24e1d7403c0b 100644
--- a/fs/nfs/objlayout/objio_osd.c
+++ b/fs/nfs/objlayout/objio_osd.c
@@ -537,11 +537,12 @@ int objio_write_pagelist(struct nfs_pgio_header *hdr, int how)
 static size_t objio_pg_test(struct nfs_pageio_descriptor *pgio,
 			struct nfs_page *prev, struct nfs_page *req)
 {
+	struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(pgio);
 	unsigned int size;
 
 	size = pnfs_generic_pg_test(pgio, prev, req);
 
-	if (!size || pgio->pg_count + req->wb_bytes >
+	if (!size || mirror->pg_count + req->wb_bytes >
 	    (unsigned long)pgio->pg_layout_private)
 		return 0;
 
@@ -607,12 +608,14 @@ static const struct nfs_pageio_ops objio_pg_read_ops = {
 	.pg_init = objio_init_read,
 	.pg_test = objio_pg_test,
 	.pg_doio = pnfs_generic_pg_readpages,
+	.pg_cleanup = pnfs_generic_pg_cleanup,
 };
 
 static const struct nfs_pageio_ops objio_pg_write_ops = {
 	.pg_init = objio_init_write,
 	.pg_test = objio_pg_test,
 	.pg_doio = pnfs_generic_pg_writepages,
+	.pg_cleanup = pnfs_generic_pg_cleanup,
 };
 
 static struct pnfs_layoutdriver_type objlayout_type = {
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 2b5e769beb16..d57190a0d533 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -42,21 +42,35 @@ static bool nfs_pgarray_set(struct nfs_page_array *p, unsigned int pagecount)
 	return p->pagevec != NULL;
 }
 
+struct nfs_pgio_mirror *
+nfs_pgio_current_mirror(struct nfs_pageio_descriptor *desc)
+{
+	return nfs_pgio_has_mirroring(desc) ?
+		&desc->pg_mirrors[desc->pg_mirror_idx] :
+		&desc->pg_mirrors[0];
+}
+EXPORT_SYMBOL_GPL(nfs_pgio_current_mirror);
+
 void nfs_pgheader_init(struct nfs_pageio_descriptor *desc,
 		       struct nfs_pgio_header *hdr,
 		       void (*release)(struct nfs_pgio_header *hdr))
 {
-	hdr->req = nfs_list_entry(desc->pg_list.next);
+	struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
+
+
+	hdr->req = nfs_list_entry(mirror->pg_list.next);
 	hdr->inode = desc->pg_inode;
 	hdr->cred = hdr->req->wb_context->cred;
 	hdr->io_start = req_offset(hdr->req);
-	hdr->good_bytes = desc->pg_count;
+	hdr->good_bytes = mirror->pg_count;
 	hdr->dreq = desc->pg_dreq;
 	hdr->layout_private = desc->pg_layout_private;
 	hdr->release = release;
 	hdr->completion_ops = desc->pg_completion_ops;
 	if (hdr->completion_ops->init_hdr)
 		hdr->completion_ops->init_hdr(hdr);
+
+	hdr->pgio_mirror_idx = desc->pg_mirror_idx;
 }
 EXPORT_SYMBOL_GPL(nfs_pgheader_init);
 
@@ -480,7 +494,10 @@ nfs_wait_on_request(struct nfs_page *req)
 size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc,
 			   struct nfs_page *prev, struct nfs_page *req)
 {
-	if (desc->pg_count > desc->pg_bsize) {
+	struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
+
+
+	if (mirror->pg_count > mirror->pg_bsize) {
 		/* should never happen */
 		WARN_ON_ONCE(1);
 		return 0;
@@ -490,11 +507,11 @@ size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc,
490 * Limit the request size so that we can still allocate a page array 507 * Limit the request size so that we can still allocate a page array
491 * for it without upsetting the slab allocator. 508 * for it without upsetting the slab allocator.
492 */ 509 */
493 if (((desc->pg_count + req->wb_bytes) >> PAGE_SHIFT) * 510 if (((mirror->pg_count + req->wb_bytes) >> PAGE_SHIFT) *
 494 sizeof(struct page *) > PAGE_SIZE) 511 sizeof(struct page *) > PAGE_SIZE)
495 return 0; 512 return 0;
496 513
497 return min(desc->pg_bsize - desc->pg_count, (size_t)req->wb_bytes); 514 return min(mirror->pg_bsize - mirror->pg_count, (size_t)req->wb_bytes);
498} 515}
499EXPORT_SYMBOL_GPL(nfs_generic_pg_test); 516EXPORT_SYMBOL_GPL(nfs_generic_pg_test);
500 517
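
For scale, a worked example of the cap enforced above, assuming 4 KiB pages and 8-byte pointers (typical on 64-bit, but an assumption):

	/* page-pointer array must fit in one slab page:              */
	/*   PAGE_SIZE / sizeof(struct page *) = 4096 / 8 = 512 pages */
	/* so one coalesced request tops out at 512 * 4 KiB = 2 MiB   */

which keeps the request's page-pointer array within a single page of memory.
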
@@ -597,13 +614,14 @@ static void nfs_pgio_prepare(struct rpc_task *task, void *calldata)
597} 614}
598 615
599int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr, 616int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr,
617 struct rpc_cred *cred, const struct nfs_rpc_ops *rpc_ops,
600 const struct rpc_call_ops *call_ops, int how, int flags) 618 const struct rpc_call_ops *call_ops, int how, int flags)
601{ 619{
602 struct rpc_task *task; 620 struct rpc_task *task;
603 struct rpc_message msg = { 621 struct rpc_message msg = {
604 .rpc_argp = &hdr->args, 622 .rpc_argp = &hdr->args,
605 .rpc_resp = &hdr->res, 623 .rpc_resp = &hdr->res,
606 .rpc_cred = hdr->cred, 624 .rpc_cred = cred,
607 }; 625 };
608 struct rpc_task_setup task_setup_data = { 626 struct rpc_task_setup task_setup_data = {
609 .rpc_client = clnt, 627 .rpc_client = clnt,
@@ -616,7 +634,7 @@ int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr,
616 }; 634 };
617 int ret = 0; 635 int ret = 0;
618 636
619 hdr->rw_ops->rw_initiate(hdr, &msg, &task_setup_data, how); 637 hdr->rw_ops->rw_initiate(hdr, &msg, rpc_ops, &task_setup_data, how);
620 638
621 dprintk("NFS: %5u initiated pgio call " 639 dprintk("NFS: %5u initiated pgio call "
622 "(req %s/%llu, %u bytes @ offset %llu)\n", 640 "(req %s/%llu, %u bytes @ offset %llu)\n",
@@ -650,10 +668,18 @@ EXPORT_SYMBOL_GPL(nfs_initiate_pgio);
650static int nfs_pgio_error(struct nfs_pageio_descriptor *desc, 668static int nfs_pgio_error(struct nfs_pageio_descriptor *desc,
651 struct nfs_pgio_header *hdr) 669 struct nfs_pgio_header *hdr)
652{ 670{
671 struct nfs_pgio_mirror *mirror;
672 u32 midx;
673
653 set_bit(NFS_IOHDR_REDO, &hdr->flags); 674 set_bit(NFS_IOHDR_REDO, &hdr->flags);
654 nfs_pgio_data_destroy(hdr); 675 nfs_pgio_data_destroy(hdr);
655 hdr->completion_ops->completion(hdr); 676 hdr->completion_ops->completion(hdr);
656 desc->pg_completion_ops->error_cleanup(&desc->pg_list); 677 /* TODO: Make sure it's right to clean up all mirrors here
678 * and not just hdr->pgio_mirror_idx */
679 for (midx = 0; midx < desc->pg_mirror_count; midx++) {
680 mirror = &desc->pg_mirrors[midx];
681 desc->pg_completion_ops->error_cleanup(&mirror->pg_list);
682 }
657 return -ENOMEM; 683 return -ENOMEM;
658} 684}
659 685
@@ -670,6 +696,17 @@ static void nfs_pgio_release(void *calldata)
670 hdr->completion_ops->completion(hdr); 696 hdr->completion_ops->completion(hdr);
671} 697}
672 698
699static void nfs_pageio_mirror_init(struct nfs_pgio_mirror *mirror,
700 unsigned int bsize)
701{
702 INIT_LIST_HEAD(&mirror->pg_list);
703 mirror->pg_bytes_written = 0;
704 mirror->pg_count = 0;
705 mirror->pg_bsize = bsize;
706 mirror->pg_base = 0;
707 mirror->pg_recoalesce = 0;
708}
709
673/** 710/**
674 * nfs_pageio_init - initialise a page io descriptor 711 * nfs_pageio_init - initialise a page io descriptor
675 * @desc: pointer to descriptor 712 * @desc: pointer to descriptor
@@ -686,13 +723,10 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
686 size_t bsize, 723 size_t bsize,
687 int io_flags) 724 int io_flags)
688{ 725{
689 INIT_LIST_HEAD(&desc->pg_list); 726 struct nfs_pgio_mirror *new;
690 desc->pg_bytes_written = 0; 727 int i;
691 desc->pg_count = 0; 728
692 desc->pg_bsize = bsize;
693 desc->pg_base = 0;
694 desc->pg_moreio = 0; 729 desc->pg_moreio = 0;
695 desc->pg_recoalesce = 0;
696 desc->pg_inode = inode; 730 desc->pg_inode = inode;
697 desc->pg_ops = pg_ops; 731 desc->pg_ops = pg_ops;
698 desc->pg_completion_ops = compl_ops; 732 desc->pg_completion_ops = compl_ops;
@@ -702,6 +736,26 @@ void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
702 desc->pg_lseg = NULL; 736 desc->pg_lseg = NULL;
703 desc->pg_dreq = NULL; 737 desc->pg_dreq = NULL;
704 desc->pg_layout_private = NULL; 738 desc->pg_layout_private = NULL;
739 desc->pg_bsize = bsize;
740
741 desc->pg_mirror_count = 1;
742 desc->pg_mirror_idx = 0;
743
744 if (pg_ops->pg_get_mirror_count) {
 745 /* until we have a request, we don't have an lseg and hence no
 746 * idea how many mirrors there will be */
747 new = kcalloc(NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX,
748 sizeof(struct nfs_pgio_mirror), GFP_KERNEL);
749 desc->pg_mirrors_dynamic = new;
750 desc->pg_mirrors = new;
751
752 for (i = 0; i < NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX; i++)
753 nfs_pageio_mirror_init(&desc->pg_mirrors[i], bsize);
754 } else {
755 desc->pg_mirrors_dynamic = NULL;
756 desc->pg_mirrors = desc->pg_mirrors_static;
757 nfs_pageio_mirror_init(&desc->pg_mirrors[0], bsize);
758 }
705} 759}
706EXPORT_SYMBOL_GPL(nfs_pageio_init); 760EXPORT_SYMBOL_GPL(nfs_pageio_init);
707 761
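
The branch above chooses between a mirror array embedded in the descriptor and a kcalloc'd one. A sketch of the fields this code assumes (illustrative; the real structure is declared in include/linux/nfs_page.h):

	struct nfs_pageio_descriptor {
		/* ... pre-existing fields ... */
		struct nfs_pgio_mirror	pg_mirrors_static[1];	/* common, unmirrored case */
		struct nfs_pgio_mirror	*pg_mirrors;		/* -> static or dynamic array */
		struct nfs_pgio_mirror	*pg_mirrors_dynamic;	/* kcalloc'd if pg_get_mirror_count set */
		u32			pg_mirror_count;
		u32			pg_mirror_idx;		/* mirror currently coalescing */
	};

Only a layout driver that implements pg_get_mirror_count ever sees more than one mirror; everyone else keeps the old single-queue behaviour at the cost of one extra pointer hop.
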
@@ -737,14 +791,16 @@ static void nfs_pgio_result(struct rpc_task *task, void *calldata)
737int nfs_generic_pgio(struct nfs_pageio_descriptor *desc, 791int nfs_generic_pgio(struct nfs_pageio_descriptor *desc,
738 struct nfs_pgio_header *hdr) 792 struct nfs_pgio_header *hdr)
739{ 793{
794 struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
795
740 struct nfs_page *req; 796 struct nfs_page *req;
741 struct page **pages, 797 struct page **pages,
742 *last_page; 798 *last_page;
743 struct list_head *head = &desc->pg_list; 799 struct list_head *head = &mirror->pg_list;
744 struct nfs_commit_info cinfo; 800 struct nfs_commit_info cinfo;
745 unsigned int pagecount, pageused; 801 unsigned int pagecount, pageused;
746 802
747 pagecount = nfs_page_array_len(desc->pg_base, desc->pg_count); 803 pagecount = nfs_page_array_len(mirror->pg_base, mirror->pg_count);
748 if (!nfs_pgarray_set(&hdr->page_array, pagecount)) 804 if (!nfs_pgarray_set(&hdr->page_array, pagecount))
749 return nfs_pgio_error(desc, hdr); 805 return nfs_pgio_error(desc, hdr);
750 806
@@ -772,7 +828,7 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc,
772 desc->pg_ioflags &= ~FLUSH_COND_STABLE; 828 desc->pg_ioflags &= ~FLUSH_COND_STABLE;
773 829
774 /* Set up the argument struct */ 830 /* Set up the argument struct */
775 nfs_pgio_rpcsetup(hdr, desc->pg_count, 0, desc->pg_ioflags, &cinfo); 831 nfs_pgio_rpcsetup(hdr, mirror->pg_count, 0, desc->pg_ioflags, &cinfo);
776 desc->pg_rpc_callops = &nfs_pgio_common_ops; 832 desc->pg_rpc_callops = &nfs_pgio_common_ops;
777 return 0; 833 return 0;
778} 834}
@@ -780,23 +836,74 @@ EXPORT_SYMBOL_GPL(nfs_generic_pgio);
780 836
781static int nfs_generic_pg_pgios(struct nfs_pageio_descriptor *desc) 837static int nfs_generic_pg_pgios(struct nfs_pageio_descriptor *desc)
782{ 838{
839 struct nfs_pgio_mirror *mirror;
783 struct nfs_pgio_header *hdr; 840 struct nfs_pgio_header *hdr;
784 int ret; 841 int ret;
785 842
843 mirror = nfs_pgio_current_mirror(desc);
844
786 hdr = nfs_pgio_header_alloc(desc->pg_rw_ops); 845 hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
787 if (!hdr) { 846 if (!hdr) {
788 desc->pg_completion_ops->error_cleanup(&desc->pg_list); 847 /* TODO: make sure this is right with mirroring - or
848 * should it back out all mirrors? */
849 desc->pg_completion_ops->error_cleanup(&mirror->pg_list);
789 return -ENOMEM; 850 return -ENOMEM;
790 } 851 }
791 nfs_pgheader_init(desc, hdr, nfs_pgio_header_free); 852 nfs_pgheader_init(desc, hdr, nfs_pgio_header_free);
792 ret = nfs_generic_pgio(desc, hdr); 853 ret = nfs_generic_pgio(desc, hdr);
793 if (ret == 0) 854 if (ret == 0)
794 ret = nfs_initiate_pgio(NFS_CLIENT(hdr->inode), 855 ret = nfs_initiate_pgio(NFS_CLIENT(hdr->inode),
795 hdr, desc->pg_rpc_callops, 856 hdr,
857 hdr->cred,
858 NFS_PROTO(hdr->inode),
859 desc->pg_rpc_callops,
796 desc->pg_ioflags, 0); 860 desc->pg_ioflags, 0);
797 return ret; 861 return ret;
798} 862}
799 863
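
Threading cred and rpc_ops through nfs_initiate_pgio() (instead of deriving both from hdr) is what lets a pNFS layout driver aim this RPC at a data server rather than the MDS. A hypothetical data-server caller, with illustrative ds_* names that are not part of this patch:

	/* sketch: substitute the data server's client, cred and ops */
	ret = nfs_initiate_pgio(ds_clnt, hdr, ds_cred, ds_rpc_ops,
				call_ops, how, RPC_TASK_SOFTCONN);

The generic path above passes hdr->cred and NFS_PROTO(hdr->inode), so MDS I/O behaves exactly as before.
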
864/*
865 * nfs_pageio_setup_mirroring - determine if mirroring is to be used
866 * by calling the pg_get_mirror_count op
867 */
868static int nfs_pageio_setup_mirroring(struct nfs_pageio_descriptor *pgio,
869 struct nfs_page *req)
870{
871 int mirror_count = 1;
872
873 if (!pgio->pg_ops->pg_get_mirror_count)
874 return 0;
875
876 mirror_count = pgio->pg_ops->pg_get_mirror_count(pgio, req);
877
878 if (!mirror_count || mirror_count > NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX)
879 return -EINVAL;
880
881 if (WARN_ON_ONCE(!pgio->pg_mirrors_dynamic))
882 return -EINVAL;
883
884 pgio->pg_mirror_count = mirror_count;
885
886 return 0;
887}
888
889/*
890 * nfs_pageio_stop_mirroring - stop using mirroring (set mirror count to 1)
891 */
892void nfs_pageio_stop_mirroring(struct nfs_pageio_descriptor *pgio)
893{
894 pgio->pg_mirror_count = 1;
895 pgio->pg_mirror_idx = 0;
896}
897
898static void nfs_pageio_cleanup_mirroring(struct nfs_pageio_descriptor *pgio)
899{
900 pgio->pg_mirror_count = 1;
901 pgio->pg_mirror_idx = 0;
902 pgio->pg_mirrors = pgio->pg_mirrors_static;
903 kfree(pgio->pg_mirrors_dynamic);
904 pgio->pg_mirrors_dynamic = NULL;
905}
906
800static bool nfs_match_open_context(const struct nfs_open_context *ctx1, 907static bool nfs_match_open_context(const struct nfs_open_context *ctx1,
801 const struct nfs_open_context *ctx2) 908 const struct nfs_open_context *ctx2)
802{ 909{
@@ -826,11 +933,15 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev,
826 struct nfs_pageio_descriptor *pgio) 933 struct nfs_pageio_descriptor *pgio)
827{ 934{
828 size_t size; 935 size_t size;
936 struct file_lock_context *flctx;
829 937
830 if (prev) { 938 if (prev) {
831 if (!nfs_match_open_context(req->wb_context, prev->wb_context)) 939 if (!nfs_match_open_context(req->wb_context, prev->wb_context))
832 return false; 940 return false;
833 if (req->wb_context->dentry->d_inode->i_flock != NULL && 941 flctx = req->wb_context->dentry->d_inode->i_flctx;
942 if (flctx != NULL &&
943 !(list_empty_careful(&flctx->flc_posix) &&
944 list_empty_careful(&flctx->flc_flock)) &&
834 !nfs_match_lock_context(req->wb_lock_context, 945 !nfs_match_lock_context(req->wb_lock_context,
835 prev->wb_lock_context)) 946 prev->wb_lock_context))
836 return false; 947 return false;
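
The old i_flock pointer test becomes a check against the new per-inode file_lock_context, which splits lock types onto separate lists. A sketch of the structure assumed here (the real definition lives in include/linux/fs.h):

	struct file_lock_context {
		spinlock_t		flc_lock;
		struct list_head	flc_flock;	/* BSD-style flock locks */
		struct list_head	flc_posix;	/* POSIX byte-range locks */
		struct list_head	flc_lease;	/* leases */
	};

list_empty_careful() is used because this path peeks at the lists without taking flc_lock; only when some lock actually exists does the lock-context comparison run.
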
@@ -863,19 +974,22 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev,
863static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc, 974static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
864 struct nfs_page *req) 975 struct nfs_page *req)
865{ 976{
977 struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
978
866 struct nfs_page *prev = NULL; 979 struct nfs_page *prev = NULL;
867 if (desc->pg_count != 0) { 980
868 prev = nfs_list_entry(desc->pg_list.prev); 981 if (mirror->pg_count != 0) {
982 prev = nfs_list_entry(mirror->pg_list.prev);
869 } else { 983 } else {
870 if (desc->pg_ops->pg_init) 984 if (desc->pg_ops->pg_init)
871 desc->pg_ops->pg_init(desc, req); 985 desc->pg_ops->pg_init(desc, req);
872 desc->pg_base = req->wb_pgbase; 986 mirror->pg_base = req->wb_pgbase;
873 } 987 }
874 if (!nfs_can_coalesce_requests(prev, req, desc)) 988 if (!nfs_can_coalesce_requests(prev, req, desc))
875 return 0; 989 return 0;
876 nfs_list_remove_request(req); 990 nfs_list_remove_request(req);
877 nfs_list_add_request(req, &desc->pg_list); 991 nfs_list_add_request(req, &mirror->pg_list);
878 desc->pg_count += req->wb_bytes; 992 mirror->pg_count += req->wb_bytes;
879 return 1; 993 return 1;
880} 994}
881 995
@@ -884,16 +998,19 @@ static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
884 */ 998 */
885static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc) 999static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc)
886{ 1000{
887 if (!list_empty(&desc->pg_list)) { 1001 struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
1002
1003
1004 if (!list_empty(&mirror->pg_list)) {
888 int error = desc->pg_ops->pg_doio(desc); 1005 int error = desc->pg_ops->pg_doio(desc);
889 if (error < 0) 1006 if (error < 0)
890 desc->pg_error = error; 1007 desc->pg_error = error;
891 else 1008 else
892 desc->pg_bytes_written += desc->pg_count; 1009 mirror->pg_bytes_written += mirror->pg_count;
893 } 1010 }
894 if (list_empty(&desc->pg_list)) { 1011 if (list_empty(&mirror->pg_list)) {
895 desc->pg_count = 0; 1012 mirror->pg_count = 0;
896 desc->pg_base = 0; 1013 mirror->pg_base = 0;
897 } 1014 }
898} 1015}
899 1016
@@ -911,6 +1028,8 @@ static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc)
911static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, 1028static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
912 struct nfs_page *req) 1029 struct nfs_page *req)
913{ 1030{
1031 struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
1032
914 struct nfs_page *subreq; 1033 struct nfs_page *subreq;
915 unsigned int bytes_left = 0; 1034 unsigned int bytes_left = 0;
916 unsigned int offset, pgbase; 1035 unsigned int offset, pgbase;
@@ -934,7 +1053,7 @@ static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
934 nfs_pageio_doio(desc); 1053 nfs_pageio_doio(desc);
935 if (desc->pg_error < 0) 1054 if (desc->pg_error < 0)
936 return 0; 1055 return 0;
937 if (desc->pg_recoalesce) 1056 if (mirror->pg_recoalesce)
938 return 0; 1057 return 0;
939 /* retry add_request for this subreq */ 1058 /* retry add_request for this subreq */
940 nfs_page_group_lock(req, false); 1059 nfs_page_group_lock(req, false);
@@ -972,14 +1091,16 @@ err_ptr:
972 1091
973static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc) 1092static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc)
974{ 1093{
1094 struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
975 LIST_HEAD(head); 1095 LIST_HEAD(head);
976 1096
977 do { 1097 do {
978 list_splice_init(&desc->pg_list, &head); 1098 list_splice_init(&mirror->pg_list, &head);
979 desc->pg_bytes_written -= desc->pg_count; 1099 mirror->pg_bytes_written -= mirror->pg_count;
980 desc->pg_count = 0; 1100 mirror->pg_count = 0;
981 desc->pg_base = 0; 1101 mirror->pg_base = 0;
982 desc->pg_recoalesce = 0; 1102 mirror->pg_recoalesce = 0;
1103
983 desc->pg_moreio = 0; 1104 desc->pg_moreio = 0;
984 1105
985 while (!list_empty(&head)) { 1106 while (!list_empty(&head)) {
@@ -993,11 +1114,11 @@ static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc)
993 return 0; 1114 return 0;
994 break; 1115 break;
995 } 1116 }
996 } while (desc->pg_recoalesce); 1117 } while (mirror->pg_recoalesce);
997 return 1; 1118 return 1;
998} 1119}
999 1120
1000int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, 1121static int nfs_pageio_add_request_mirror(struct nfs_pageio_descriptor *desc,
1001 struct nfs_page *req) 1122 struct nfs_page *req)
1002{ 1123{
1003 int ret; 1124 int ret;
@@ -1010,9 +1131,80 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
1010 break; 1131 break;
1011 ret = nfs_do_recoalesce(desc); 1132 ret = nfs_do_recoalesce(desc);
1012 } while (ret); 1133 } while (ret);
1134
1013 return ret; 1135 return ret;
1014} 1136}
1015 1137
1138int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
1139 struct nfs_page *req)
1140{
1141 u32 midx;
1142 unsigned int pgbase, offset, bytes;
1143 struct nfs_page *dupreq, *lastreq;
1144
1145 pgbase = req->wb_pgbase;
1146 offset = req->wb_offset;
1147 bytes = req->wb_bytes;
1148
1149 nfs_pageio_setup_mirroring(desc, req);
1150
1151 for (midx = 0; midx < desc->pg_mirror_count; midx++) {
1152 if (midx) {
1153 nfs_page_group_lock(req, false);
1154
1155 /* find the last request */
1156 for (lastreq = req->wb_head;
1157 lastreq->wb_this_page != req->wb_head;
1158 lastreq = lastreq->wb_this_page)
1159 ;
1160
1161 dupreq = nfs_create_request(req->wb_context,
1162 req->wb_page, lastreq, pgbase, bytes);
1163
1164 if (IS_ERR(dupreq)) {
1165 nfs_page_group_unlock(req);
1166 return 0;
1167 }
1168
1169 nfs_lock_request(dupreq);
1170 nfs_page_group_unlock(req);
1171 dupreq->wb_offset = offset;
1172 dupreq->wb_index = req->wb_index;
1173 } else
1174 dupreq = req;
1175
1176 if (nfs_pgio_has_mirroring(desc))
1177 desc->pg_mirror_idx = midx;
1178 if (!nfs_pageio_add_request_mirror(desc, dupreq))
1179 return 0;
1180 }
1181
1182 return 1;
1183}
1184
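
Net effect: with pg_mirror_count == 2, every request is queued once per mirror; mirror 0 consumes the original and each further mirror receives a duplicate chained onto the request's page group. A condensed trace of one request (sketch):

	nfs_pageio_add_request(desc, req)
	    nfs_pageio_setup_mirroring(desc, req)	/* pg_mirror_count = 2 */
	    midx 0: dupreq = req			/* -> mirror 0 pg_list */
	    midx 1: dupreq = nfs_create_request(...)	/* -> mirror 1 pg_list */

The explicit copy-back of wb_offset and wb_index is needed because nfs_create_request() derives them from the page, which may not match the original request.
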
1185/*
 1186 * nfs_pageio_complete_mirror - Complete I/O on one mirror of an
 1187 * nfs_pageio_descriptor
 1188 * @desc: pointer to io descriptor; @mirror_idx: index of the mirror to complete
1189 */
1190static void nfs_pageio_complete_mirror(struct nfs_pageio_descriptor *desc,
1191 u32 mirror_idx)
1192{
1193 struct nfs_pgio_mirror *mirror = &desc->pg_mirrors[mirror_idx];
1194 u32 restore_idx = desc->pg_mirror_idx;
1195
1196 if (nfs_pgio_has_mirroring(desc))
1197 desc->pg_mirror_idx = mirror_idx;
1198 for (;;) {
1199 nfs_pageio_doio(desc);
1200 if (!mirror->pg_recoalesce)
1201 break;
1202 if (!nfs_do_recoalesce(desc))
1203 break;
1204 }
1205 desc->pg_mirror_idx = restore_idx;
1206}
1207
1016/* 1208/*
1017 * nfs_pageio_resend - Transfer requests to new descriptor and resend 1209 * nfs_pageio_resend - Transfer requests to new descriptor and resend
1018 * @hdr - the pgio header to move request from 1210 * @hdr - the pgio header to move request from
@@ -1046,18 +1238,19 @@ int nfs_pageio_resend(struct nfs_pageio_descriptor *desc,
1046EXPORT_SYMBOL_GPL(nfs_pageio_resend); 1238EXPORT_SYMBOL_GPL(nfs_pageio_resend);
1047 1239
1048/** 1240/**
1049 * nfs_pageio_complete - Complete I/O on an nfs_pageio_descriptor 1241 * nfs_pageio_complete - Complete I/O then cleanup an nfs_pageio_descriptor
1050 * @desc: pointer to io descriptor 1242 * @desc: pointer to io descriptor
1051 */ 1243 */
1052void nfs_pageio_complete(struct nfs_pageio_descriptor *desc) 1244void nfs_pageio_complete(struct nfs_pageio_descriptor *desc)
1053{ 1245{
1054 for (;;) { 1246 u32 midx;
1055 nfs_pageio_doio(desc); 1247
1056 if (!desc->pg_recoalesce) 1248 for (midx = 0; midx < desc->pg_mirror_count; midx++)
1057 break; 1249 nfs_pageio_complete_mirror(desc, midx);
1058 if (!nfs_do_recoalesce(desc)) 1250
1059 break; 1251 if (desc->pg_ops->pg_cleanup)
1060 } 1252 desc->pg_ops->pg_cleanup(desc);
1253 nfs_pageio_cleanup_mirroring(desc);
1061} 1254}
1062 1255
1063/** 1256/**
@@ -1073,10 +1266,17 @@ void nfs_pageio_complete(struct nfs_pageio_descriptor *desc)
1073 */ 1266 */
1074void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *desc, pgoff_t index) 1267void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *desc, pgoff_t index)
1075{ 1268{
1076 if (!list_empty(&desc->pg_list)) { 1269 struct nfs_pgio_mirror *mirror;
1077 struct nfs_page *prev = nfs_list_entry(desc->pg_list.prev); 1270 struct nfs_page *prev;
1078 if (index != prev->wb_index + 1) 1271 u32 midx;
1079 nfs_pageio_complete(desc); 1272
1273 for (midx = 0; midx < desc->pg_mirror_count; midx++) {
1274 mirror = &desc->pg_mirrors[midx];
1275 if (!list_empty(&mirror->pg_list)) {
1276 prev = nfs_list_entry(mirror->pg_list.prev);
1277 if (index != prev->wb_index + 1)
1278 nfs_pageio_complete_mirror(desc, midx);
1279 }
1080 } 1280 }
1081} 1281}
1082 1282
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 0a5dda4d85c2..4f802b02fbb9 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -34,6 +34,7 @@
34#include "pnfs.h" 34#include "pnfs.h"
35#include "iostat.h" 35#include "iostat.h"
36#include "nfs4trace.h" 36#include "nfs4trace.h"
37#include "delegation.h"
37 38
38#define NFSDBG_FACILITY NFSDBG_PNFS 39#define NFSDBG_FACILITY NFSDBG_PNFS
39#define PNFS_LAYOUTGET_RETRY_TIMEOUT (120*HZ) 40#define PNFS_LAYOUTGET_RETRY_TIMEOUT (120*HZ)
@@ -50,6 +51,10 @@ static DEFINE_SPINLOCK(pnfs_spinlock);
50 */ 51 */
51static LIST_HEAD(pnfs_modules_tbl); 52static LIST_HEAD(pnfs_modules_tbl);
52 53
54static int
55pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid stateid,
56 enum pnfs_iomode iomode, bool sync);
57
53/* Return the registered pnfs layout driver module matching given id */ 58/* Return the registered pnfs layout driver module matching given id */
54static struct pnfs_layoutdriver_type * 59static struct pnfs_layoutdriver_type *
55find_pnfs_driver_locked(u32 id) 60find_pnfs_driver_locked(u32 id)
@@ -238,6 +243,8 @@ pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo)
238 struct inode *inode = lo->plh_inode; 243 struct inode *inode = lo->plh_inode;
239 244
240 if (atomic_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) { 245 if (atomic_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) {
246 if (!list_empty(&lo->plh_segs))
247 WARN_ONCE(1, "NFS: BUG unfreed layout segments.\n");
241 pnfs_detach_layout_hdr(lo); 248 pnfs_detach_layout_hdr(lo);
242 spin_unlock(&inode->i_lock); 249 spin_unlock(&inode->i_lock);
243 pnfs_free_layout_hdr(lo); 250 pnfs_free_layout_hdr(lo);
@@ -337,6 +344,48 @@ pnfs_layout_remove_lseg(struct pnfs_layout_hdr *lo,
337 rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq); 344 rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq);
338} 345}
339 346
347/* Return true if layoutreturn is needed */
348static bool
349pnfs_layout_need_return(struct pnfs_layout_hdr *lo,
350 struct pnfs_layout_segment *lseg)
351{
352 struct pnfs_layout_segment *s;
353
354 if (!test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags))
355 return false;
356
357 list_for_each_entry(s, &lo->plh_segs, pls_list)
358 if (s != lseg && test_bit(NFS_LSEG_LAYOUTRETURN, &s->pls_flags))
359 return false;
360
361 return true;
362}
363
364static void pnfs_layoutreturn_before_put_lseg(struct pnfs_layout_segment *lseg,
365 struct pnfs_layout_hdr *lo, struct inode *inode)
366{
367 lo = lseg->pls_layout;
368 inode = lo->plh_inode;
369
370 spin_lock(&inode->i_lock);
371 if (pnfs_layout_need_return(lo, lseg)) {
372 nfs4_stateid stateid;
373 enum pnfs_iomode iomode;
374
375 stateid = lo->plh_stateid;
376 iomode = lo->plh_return_iomode;
377 /* decreased in pnfs_send_layoutreturn() */
378 lo->plh_block_lgets++;
379 lo->plh_return_iomode = 0;
380 spin_unlock(&inode->i_lock);
381 pnfs_get_layout_hdr(lo);
382
383 /* Send an async layoutreturn so we dont deadlock */
384 pnfs_send_layoutreturn(lo, stateid, iomode, false);
385 } else
386 spin_unlock(&inode->i_lock);
387}
388
340void 389void
341pnfs_put_lseg(struct pnfs_layout_segment *lseg) 390pnfs_put_lseg(struct pnfs_layout_segment *lseg)
342{ 391{
@@ -349,8 +398,17 @@ pnfs_put_lseg(struct pnfs_layout_segment *lseg)
349 dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg, 398 dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg,
350 atomic_read(&lseg->pls_refcount), 399 atomic_read(&lseg->pls_refcount),
351 test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); 400 test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
401
402 /* Handle the case where refcount != 1 */
403 if (atomic_add_unless(&lseg->pls_refcount, -1, 1))
404 return;
405
352 lo = lseg->pls_layout; 406 lo = lseg->pls_layout;
353 inode = lo->plh_inode; 407 inode = lo->plh_inode;
408 /* Do we need a layoutreturn? */
409 if (test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags))
410 pnfs_layoutreturn_before_put_lseg(lseg, lo, inode);
411
354 if (atomic_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) { 412 if (atomic_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) {
355 pnfs_get_layout_hdr(lo); 413 pnfs_get_layout_hdr(lo);
356 pnfs_layout_remove_lseg(lo, lseg); 414 pnfs_layout_remove_lseg(lo, lseg);
@@ -543,6 +601,7 @@ pnfs_destroy_layout(struct nfs_inode *nfsi)
543 pnfs_get_layout_hdr(lo); 601 pnfs_get_layout_hdr(lo);
544 pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RO_FAILED); 602 pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RO_FAILED);
545 pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RW_FAILED); 603 pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RW_FAILED);
604 pnfs_clear_retry_layoutget(lo);
546 spin_unlock(&nfsi->vfs_inode.i_lock); 605 spin_unlock(&nfsi->vfs_inode.i_lock);
547 pnfs_free_lseg_list(&tmp_list); 606 pnfs_free_lseg_list(&tmp_list);
548 pnfs_put_layout_hdr(lo); 607 pnfs_put_layout_hdr(lo);
@@ -740,25 +799,37 @@ pnfs_layout_stateid_blocked(const struct pnfs_layout_hdr *lo,
740 return !pnfs_seqid_is_newer(seqid, lo->plh_barrier); 799 return !pnfs_seqid_is_newer(seqid, lo->plh_barrier);
741} 800}
742 801
802static bool
803pnfs_layout_returning(const struct pnfs_layout_hdr *lo,
804 struct pnfs_layout_range *range)
805{
806 return test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags) &&
807 (lo->plh_return_iomode == IOMODE_ANY ||
808 lo->plh_return_iomode == range->iomode);
809}
810
743/* lget is set to 1 if called from inside send_layoutget call chain */ 811/* lget is set to 1 if called from inside send_layoutget call chain */
744static bool 812static bool
745pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo, int lget) 813pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo,
814 struct pnfs_layout_range *range, int lget)
746{ 815{
747 return lo->plh_block_lgets || 816 return lo->plh_block_lgets ||
748 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) || 817 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
749 (list_empty(&lo->plh_segs) && 818 (list_empty(&lo->plh_segs) &&
750 (atomic_read(&lo->plh_outstanding) > lget)); 819 (atomic_read(&lo->plh_outstanding) > lget)) ||
820 pnfs_layout_returning(lo, range);
751} 821}
752 822
753int 823int
754pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo, 824pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
825 struct pnfs_layout_range *range,
755 struct nfs4_state *open_state) 826 struct nfs4_state *open_state)
756{ 827{
757 int status = 0; 828 int status = 0;
758 829
759 dprintk("--> %s\n", __func__); 830 dprintk("--> %s\n", __func__);
760 spin_lock(&lo->plh_inode->i_lock); 831 spin_lock(&lo->plh_inode->i_lock);
761 if (pnfs_layoutgets_blocked(lo, 1)) { 832 if (pnfs_layoutgets_blocked(lo, range, 1)) {
762 status = -EAGAIN; 833 status = -EAGAIN;
763 } else if (!nfs4_valid_open_stateid(open_state)) { 834 } else if (!nfs4_valid_open_stateid(open_state)) {
764 status = -EBADF; 835 status = -EBADF;
@@ -825,7 +896,9 @@ send_layoutget(struct pnfs_layout_hdr *lo,
825 pnfs_layout_io_set_failed(lo, range->iomode); 896 pnfs_layout_io_set_failed(lo, range->iomode);
826 } 897 }
827 return NULL; 898 return NULL;
828 } 899 } else
900 pnfs_layout_clear_fail_bit(lo,
901 pnfs_iomode_to_fail_bit(range->iomode));
829 902
830 return lseg; 903 return lseg;
831} 904}
@@ -845,6 +918,49 @@ static void pnfs_clear_layoutcommit(struct inode *inode,
845 } 918 }
846} 919}
847 920
921void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo)
922{
923 clear_bit_unlock(NFS_LAYOUT_RETURN, &lo->plh_flags);
924 smp_mb__after_atomic();
925 wake_up_bit(&lo->plh_flags, NFS_LAYOUT_RETURN);
926}
927
928static int
929pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid stateid,
930 enum pnfs_iomode iomode, bool sync)
931{
932 struct inode *ino = lo->plh_inode;
933 struct nfs4_layoutreturn *lrp;
934 int status = 0;
935
936 lrp = kzalloc(sizeof(*lrp), GFP_NOFS);
937 if (unlikely(lrp == NULL)) {
938 status = -ENOMEM;
939 spin_lock(&ino->i_lock);
940 lo->plh_block_lgets--;
941 pnfs_clear_layoutreturn_waitbit(lo);
942 rpc_wake_up(&NFS_SERVER(ino)->roc_rpcwaitq);
943 spin_unlock(&ino->i_lock);
944 pnfs_put_layout_hdr(lo);
945 goto out;
946 }
947
948 lrp->args.stateid = stateid;
949 lrp->args.layout_type = NFS_SERVER(ino)->pnfs_curr_ld->id;
950 lrp->args.inode = ino;
951 lrp->args.range.iomode = iomode;
952 lrp->args.range.offset = 0;
953 lrp->args.range.length = NFS4_MAX_UINT64;
954 lrp->args.layout = lo;
955 lrp->clp = NFS_SERVER(ino)->nfs_client;
956 lrp->cred = lo->plh_lc_cred;
957
958 status = nfs4_proc_layoutreturn(lrp, sync);
959out:
960 dprintk("<-- %s status: %d\n", __func__, status);
961 return status;
962}
963
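
One helper, two calling conventions: the synchronous form blocks on the RPC, while sync = false fires it asynchronously so the caller cannot deadlock against the very lseg it is releasing:

	pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true);	/* _pnfs_return_layout */
	pnfs_send_layoutreturn(lo, stateid, iomode, false);	/* put_lseg path, no waiting */

On allocation failure the helper unwinds plh_block_lgets, clears the NFS_LAYOUT_RETURN wait bit and drops the header reference itself, so callers need no extra error path.
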
848/* 964/*
849 * Initiates a LAYOUTRETURN(FILE), and removes the pnfs_layout_hdr 965 * Initiates a LAYOUTRETURN(FILE), and removes the pnfs_layout_hdr
850 * when the layout segment list is empty. 966 * when the layout segment list is empty.
@@ -859,7 +975,6 @@ _pnfs_return_layout(struct inode *ino)
859 struct pnfs_layout_hdr *lo = NULL; 975 struct pnfs_layout_hdr *lo = NULL;
860 struct nfs_inode *nfsi = NFS_I(ino); 976 struct nfs_inode *nfsi = NFS_I(ino);
861 LIST_HEAD(tmp_list); 977 LIST_HEAD(tmp_list);
862 struct nfs4_layoutreturn *lrp;
863 nfs4_stateid stateid; 978 nfs4_stateid stateid;
864 int status = 0, empty; 979 int status = 0, empty;
865 980
@@ -901,24 +1016,7 @@ _pnfs_return_layout(struct inode *ino)
901 spin_unlock(&ino->i_lock); 1016 spin_unlock(&ino->i_lock);
902 pnfs_free_lseg_list(&tmp_list); 1017 pnfs_free_lseg_list(&tmp_list);
903 1018
904 lrp = kzalloc(sizeof(*lrp), GFP_KERNEL); 1019 status = pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true);
905 if (unlikely(lrp == NULL)) {
906 status = -ENOMEM;
907 spin_lock(&ino->i_lock);
908 lo->plh_block_lgets--;
909 spin_unlock(&ino->i_lock);
910 pnfs_put_layout_hdr(lo);
911 goto out;
912 }
913
914 lrp->args.stateid = stateid;
915 lrp->args.layout_type = NFS_SERVER(ino)->pnfs_curr_ld->id;
916 lrp->args.inode = ino;
917 lrp->args.layout = lo;
918 lrp->clp = NFS_SERVER(ino)->nfs_client;
919 lrp->cred = lo->plh_lc_cred;
920
921 status = nfs4_proc_layoutreturn(lrp);
922out: 1020out:
923 dprintk("<-- %s status: %d\n", __func__, status); 1021 dprintk("<-- %s status: %d\n", __func__, status);
924 return status; 1022 return status;
@@ -954,31 +1052,60 @@ pnfs_commit_and_return_layout(struct inode *inode)
954 1052
955bool pnfs_roc(struct inode *ino) 1053bool pnfs_roc(struct inode *ino)
956{ 1054{
1055 struct nfs_inode *nfsi = NFS_I(ino);
1056 struct nfs_open_context *ctx;
1057 struct nfs4_state *state;
957 struct pnfs_layout_hdr *lo; 1058 struct pnfs_layout_hdr *lo;
958 struct pnfs_layout_segment *lseg, *tmp; 1059 struct pnfs_layout_segment *lseg, *tmp;
1060 nfs4_stateid stateid;
959 LIST_HEAD(tmp_list); 1061 LIST_HEAD(tmp_list);
960 bool found = false; 1062 bool found = false, layoutreturn = false;
961 1063
962 spin_lock(&ino->i_lock); 1064 spin_lock(&ino->i_lock);
963 lo = NFS_I(ino)->layout; 1065 lo = nfsi->layout;
964 if (!lo || !test_and_clear_bit(NFS_LAYOUT_ROC, &lo->plh_flags) || 1066 if (!lo || !test_and_clear_bit(NFS_LAYOUT_ROC, &lo->plh_flags) ||
965 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) 1067 test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags))
966 goto out_nolayout; 1068 goto out_noroc;
1069
1070 /* Don't return layout if we hold a delegation */
1071 if (nfs4_check_delegation(ino, FMODE_READ))
1072 goto out_noroc;
1073
1074 list_for_each_entry(ctx, &nfsi->open_files, list) {
1075 state = ctx->state;
1076 /* Don't return layout if there is open file state */
1077 if (state != NULL && state->state != 0)
1078 goto out_noroc;
1079 }
1080
1081 pnfs_clear_retry_layoutget(lo);
967 list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list) 1082 list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list)
968 if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) { 1083 if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) {
969 mark_lseg_invalid(lseg, &tmp_list); 1084 mark_lseg_invalid(lseg, &tmp_list);
970 found = true; 1085 found = true;
971 } 1086 }
972 if (!found) 1087 if (!found)
973 goto out_nolayout; 1088 goto out_noroc;
974 lo->plh_block_lgets++; 1089 lo->plh_block_lgets++;
975 pnfs_get_layout_hdr(lo); /* matched in pnfs_roc_release */ 1090 pnfs_get_layout_hdr(lo); /* matched in pnfs_roc_release */
976 spin_unlock(&ino->i_lock); 1091 spin_unlock(&ino->i_lock);
977 pnfs_free_lseg_list(&tmp_list); 1092 pnfs_free_lseg_list(&tmp_list);
978 return true; 1093 return true;
979 1094
980out_nolayout: 1095out_noroc:
1096 if (lo) {
1097 stateid = lo->plh_stateid;
1098 layoutreturn =
1099 test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
1100 &lo->plh_flags);
1101 if (layoutreturn) {
1102 lo->plh_block_lgets++;
1103 pnfs_get_layout_hdr(lo);
1104 }
1105 }
981 spin_unlock(&ino->i_lock); 1106 spin_unlock(&ino->i_lock);
1107 if (layoutreturn)
1108 pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true);
982 return false; 1109 return false;
983} 1110}
984 1111
@@ -1013,8 +1140,9 @@ bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task)
1013 struct nfs_inode *nfsi = NFS_I(ino); 1140 struct nfs_inode *nfsi = NFS_I(ino);
1014 struct pnfs_layout_hdr *lo; 1141 struct pnfs_layout_hdr *lo;
1015 struct pnfs_layout_segment *lseg; 1142 struct pnfs_layout_segment *lseg;
1143 nfs4_stateid stateid;
1016 u32 current_seqid; 1144 u32 current_seqid;
1017 bool found = false; 1145 bool found = false, layoutreturn = false;
1018 1146
1019 spin_lock(&ino->i_lock); 1147 spin_lock(&ino->i_lock);
1020 list_for_each_entry(lseg, &nfsi->layout->plh_segs, pls_list) 1148 list_for_each_entry(lseg, &nfsi->layout->plh_segs, pls_list)
@@ -1031,7 +1159,21 @@ bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task)
1031 */ 1159 */
1032 *barrier = current_seqid + atomic_read(&lo->plh_outstanding); 1160 *barrier = current_seqid + atomic_read(&lo->plh_outstanding);
1033out: 1161out:
1162 if (!found) {
1163 stateid = lo->plh_stateid;
1164 layoutreturn =
1165 test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
1166 &lo->plh_flags);
1167 if (layoutreturn) {
1168 lo->plh_block_lgets++;
1169 pnfs_get_layout_hdr(lo);
1170 }
1171 }
1034 spin_unlock(&ino->i_lock); 1172 spin_unlock(&ino->i_lock);
1173 if (layoutreturn) {
1174 rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL);
1175 pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, false);
1176 }
1035 return found; 1177 return found;
1036} 1178}
1037 1179
@@ -1178,6 +1320,7 @@ pnfs_find_lseg(struct pnfs_layout_hdr *lo,
1178 1320
1179 list_for_each_entry(lseg, &lo->plh_segs, pls_list) { 1321 list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
1180 if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) && 1322 if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags) &&
1323 !test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags) &&
1181 pnfs_lseg_range_match(&lseg->pls_range, range)) { 1324 pnfs_lseg_range_match(&lseg->pls_range, range)) {
1182 ret = pnfs_get_lseg(lseg); 1325 ret = pnfs_get_lseg(lseg);
1183 break; 1326 break;
@@ -1266,6 +1409,35 @@ static bool pnfs_within_mdsthreshold(struct nfs_open_context *ctx,
1266 return ret; 1409 return ret;
1267} 1410}
1268 1411
 1412/* stop waiting if someone clears the NFS_LAYOUT_RETRY_LAYOUTGET bit. */
1413static int pnfs_layoutget_retry_bit_wait(struct wait_bit_key *key)
1414{
1415 if (!test_bit(NFS_LAYOUT_RETRY_LAYOUTGET, key->flags))
1416 return 1;
1417 return nfs_wait_bit_killable(key);
1418}
1419
1420static bool pnfs_prepare_to_retry_layoutget(struct pnfs_layout_hdr *lo)
1421{
1422 /*
 1423 * send layoutcommit first, as an outstanding layoutcommit can hold
 1424 * up layoutreturn by pinning an lseg reference
1425 */
1426 pnfs_layoutcommit_inode(lo->plh_inode, false);
1427 return !wait_on_bit_action(&lo->plh_flags, NFS_LAYOUT_RETURN,
1428 pnfs_layoutget_retry_bit_wait,
1429 TASK_UNINTERRUPTIBLE);
1430}
1431
1432static void pnfs_clear_first_layoutget(struct pnfs_layout_hdr *lo)
1433{
1434 unsigned long *bitlock = &lo->plh_flags;
1435
1436 clear_bit_unlock(NFS_LAYOUT_FIRST_LAYOUTGET, bitlock);
1437 smp_mb__after_atomic();
1438 wake_up_bit(bitlock, NFS_LAYOUT_FIRST_LAYOUTGET);
1439}
1440
1269/* 1441/*
 1270 * Layout segment is retrieved from the server if not cached. 1442 * Layout segment is retrieved from the server if not cached.
1271 * The appropriate layout segment is referenced and returned to the caller. 1443 * The appropriate layout segment is referenced and returned to the caller.
@@ -1296,6 +1468,8 @@ pnfs_update_layout(struct inode *ino,
1296 if (pnfs_within_mdsthreshold(ctx, ino, iomode)) 1468 if (pnfs_within_mdsthreshold(ctx, ino, iomode))
1297 goto out; 1469 goto out;
1298 1470
1471lookup_again:
1472 first = false;
1299 spin_lock(&ino->i_lock); 1473 spin_lock(&ino->i_lock);
1300 lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags); 1474 lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags);
1301 if (lo == NULL) { 1475 if (lo == NULL) {
@@ -1310,27 +1484,62 @@ pnfs_update_layout(struct inode *ino,
1310 } 1484 }
1311 1485
1312 /* if LAYOUTGET already failed once we don't try again */ 1486 /* if LAYOUTGET already failed once we don't try again */
1313 if (pnfs_layout_io_test_failed(lo, iomode)) 1487 if (pnfs_layout_io_test_failed(lo, iomode) &&
1488 !pnfs_should_retry_layoutget(lo))
1314 goto out_unlock; 1489 goto out_unlock;
1315 1490
1316 /* Check to see if the layout for the given range already exists */ 1491 first = list_empty(&lo->plh_segs);
1317 lseg = pnfs_find_lseg(lo, &arg); 1492 if (first) {
1318 if (lseg) 1493 /* The first layoutget for the file. Need to serialize per
1319 goto out_unlock; 1494 * RFC 5661 Errata 3208.
1495 */
1496 if (test_and_set_bit(NFS_LAYOUT_FIRST_LAYOUTGET,
1497 &lo->plh_flags)) {
1498 spin_unlock(&ino->i_lock);
1499 wait_on_bit(&lo->plh_flags, NFS_LAYOUT_FIRST_LAYOUTGET,
1500 TASK_UNINTERRUPTIBLE);
1501 pnfs_put_layout_hdr(lo);
1502 goto lookup_again;
1503 }
1504 } else {
1505 /* Check to see if the layout for the given range
1506 * already exists
1507 */
1508 lseg = pnfs_find_lseg(lo, &arg);
1509 if (lseg)
1510 goto out_unlock;
1511 }
1512
1513 /*
1514 * Because we free lsegs before sending LAYOUTRETURN, we need to wait
1515 * for LAYOUTRETURN even if first is true.
1516 */
1517 if (!lseg && pnfs_should_retry_layoutget(lo) &&
1518 test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) {
1519 spin_unlock(&ino->i_lock);
1520 dprintk("%s wait for layoutreturn\n", __func__);
1521 if (pnfs_prepare_to_retry_layoutget(lo)) {
1522 if (first)
1523 pnfs_clear_first_layoutget(lo);
1524 pnfs_put_layout_hdr(lo);
1525 dprintk("%s retrying\n", __func__);
1526 goto lookup_again;
1527 }
1528 goto out_put_layout_hdr;
1529 }
1320 1530
1321 if (pnfs_layoutgets_blocked(lo, 0)) 1531 if (pnfs_layoutgets_blocked(lo, &arg, 0))
1322 goto out_unlock; 1532 goto out_unlock;
1323 atomic_inc(&lo->plh_outstanding); 1533 atomic_inc(&lo->plh_outstanding);
1324
1325 first = list_empty(&lo->plh_layouts) ? true : false;
1326 spin_unlock(&ino->i_lock); 1534 spin_unlock(&ino->i_lock);
1327 1535
1328 if (first) { 1536 if (list_empty(&lo->plh_layouts)) {
1329 /* The lo must be on the clp list if there is any 1537 /* The lo must be on the clp list if there is any
1330 * chance of a CB_LAYOUTRECALL(FILE) coming in. 1538 * chance of a CB_LAYOUTRECALL(FILE) coming in.
1331 */ 1539 */
1332 spin_lock(&clp->cl_lock); 1540 spin_lock(&clp->cl_lock);
1333 list_add_tail(&lo->plh_layouts, &server->layouts); 1541 if (list_empty(&lo->plh_layouts))
1542 list_add_tail(&lo->plh_layouts, &server->layouts);
1334 spin_unlock(&clp->cl_lock); 1543 spin_unlock(&clp->cl_lock);
1335 } 1544 }
1336 1545
@@ -1343,8 +1552,11 @@ pnfs_update_layout(struct inode *ino,
1343 arg.length = PAGE_CACHE_ALIGN(arg.length); 1552 arg.length = PAGE_CACHE_ALIGN(arg.length);
1344 1553
1345 lseg = send_layoutget(lo, ctx, &arg, gfp_flags); 1554 lseg = send_layoutget(lo, ctx, &arg, gfp_flags);
1555 pnfs_clear_retry_layoutget(lo);
1346 atomic_dec(&lo->plh_outstanding); 1556 atomic_dec(&lo->plh_outstanding);
1347out_put_layout_hdr: 1557out_put_layout_hdr:
1558 if (first)
1559 pnfs_clear_first_layoutget(lo);
1348 pnfs_put_layout_hdr(lo); 1560 pnfs_put_layout_hdr(lo);
1349out: 1561out:
1350 dprintk("%s: inode %s/%llu pNFS layout segment %s for " 1562 dprintk("%s: inode %s/%llu pNFS layout segment %s for "
@@ -1393,7 +1605,7 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
1393 goto out_forget_reply; 1605 goto out_forget_reply;
1394 } 1606 }
1395 1607
1396 if (pnfs_layoutgets_blocked(lo, 1)) { 1608 if (pnfs_layoutgets_blocked(lo, &lgp->args.range, 1)) {
1397 dprintk("%s forget reply due to state\n", __func__); 1609 dprintk("%s forget reply due to state\n", __func__);
1398 goto out_forget_reply; 1610 goto out_forget_reply;
1399 } 1611 }
@@ -1440,24 +1652,79 @@ out_forget_reply:
1440 goto out; 1652 goto out;
1441} 1653}
1442 1654
1655static void
1656pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
1657 struct list_head *tmp_list,
1658 struct pnfs_layout_range *return_range)
1659{
1660 struct pnfs_layout_segment *lseg, *next;
1661
1662 dprintk("%s:Begin lo %p\n", __func__, lo);
1663
1664 if (list_empty(&lo->plh_segs))
1665 return;
1666
1667 list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
1668 if (should_free_lseg(&lseg->pls_range, return_range)) {
1669 dprintk("%s: marking lseg %p iomode %d "
1670 "offset %llu length %llu\n", __func__,
1671 lseg, lseg->pls_range.iomode,
1672 lseg->pls_range.offset,
1673 lseg->pls_range.length);
1674 set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags);
1675 mark_lseg_invalid(lseg, tmp_list);
1676 }
1677}
1678
1679void pnfs_error_mark_layout_for_return(struct inode *inode,
1680 struct pnfs_layout_segment *lseg)
1681{
1682 struct pnfs_layout_hdr *lo = NFS_I(inode)->layout;
1683 int iomode = pnfs_iomode_to_fail_bit(lseg->pls_range.iomode);
1684 struct pnfs_layout_range range = {
1685 .iomode = lseg->pls_range.iomode,
1686 .offset = 0,
1687 .length = NFS4_MAX_UINT64,
1688 };
1689 LIST_HEAD(free_me);
1690
1691 spin_lock(&inode->i_lock);
1692 /* set failure bit so that pnfs path will be retried later */
1693 pnfs_layout_set_fail_bit(lo, iomode);
1694 set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags);
1695 if (lo->plh_return_iomode == 0)
1696 lo->plh_return_iomode = range.iomode;
1697 else if (lo->plh_return_iomode != range.iomode)
1698 lo->plh_return_iomode = IOMODE_ANY;
1699 /*
1700 * mark all matching lsegs so that we are sure to have no live
1701 * segments at hand when sending layoutreturn. See pnfs_put_lseg()
1702 * for how it works.
1703 */
1704 pnfs_mark_matching_lsegs_return(lo, &free_me, &range);
1705 spin_unlock(&inode->i_lock);
1706 pnfs_free_lseg_list(&free_me);
1707}
1708EXPORT_SYMBOL_GPL(pnfs_error_mark_layout_for_return);
1709
1443void 1710void
1444pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) 1711pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
1445{ 1712{
1446 u64 rd_size = req->wb_bytes; 1713 u64 rd_size = req->wb_bytes;
1447 1714
1448 WARN_ON_ONCE(pgio->pg_lseg != NULL); 1715 if (pgio->pg_lseg == NULL) {
1449 1716 if (pgio->pg_dreq == NULL)
1450 if (pgio->pg_dreq == NULL) 1717 rd_size = i_size_read(pgio->pg_inode) - req_offset(req);
1451 rd_size = i_size_read(pgio->pg_inode) - req_offset(req); 1718 else
1452 else 1719 rd_size = nfs_dreq_bytes_left(pgio->pg_dreq);
1453 rd_size = nfs_dreq_bytes_left(pgio->pg_dreq); 1720
1454 1721 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
1455 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, 1722 req->wb_context,
1456 req->wb_context, 1723 req_offset(req),
1457 req_offset(req), 1724 rd_size,
1458 rd_size, 1725 IOMODE_READ,
1459 IOMODE_READ, 1726 GFP_KERNEL);
1460 GFP_KERNEL); 1727 }
1461 /* If no lseg, fall back to read through mds */ 1728 /* If no lseg, fall back to read through mds */
1462 if (pgio->pg_lseg == NULL) 1729 if (pgio->pg_lseg == NULL)
1463 nfs_pageio_reset_read_mds(pgio); 1730 nfs_pageio_reset_read_mds(pgio);
@@ -1469,27 +1736,36 @@ void
1469pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, 1736pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio,
1470 struct nfs_page *req, u64 wb_size) 1737 struct nfs_page *req, u64 wb_size)
1471{ 1738{
1472 WARN_ON_ONCE(pgio->pg_lseg != NULL); 1739 if (pgio->pg_lseg == NULL)
1473 1740 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
1474 pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, 1741 req->wb_context,
1475 req->wb_context, 1742 req_offset(req),
1476 req_offset(req), 1743 wb_size,
1477 wb_size, 1744 IOMODE_RW,
1478 IOMODE_RW, 1745 GFP_NOFS);
1479 GFP_NOFS);
1480 /* If no lseg, fall back to write through mds */ 1746 /* If no lseg, fall back to write through mds */
1481 if (pgio->pg_lseg == NULL) 1747 if (pgio->pg_lseg == NULL)
1482 nfs_pageio_reset_write_mds(pgio); 1748 nfs_pageio_reset_write_mds(pgio);
1483} 1749}
1484EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_write); 1750EXPORT_SYMBOL_GPL(pnfs_generic_pg_init_write);
1485 1751
1752void
1753pnfs_generic_pg_cleanup(struct nfs_pageio_descriptor *desc)
1754{
1755 if (desc->pg_lseg) {
1756 pnfs_put_lseg(desc->pg_lseg);
1757 desc->pg_lseg = NULL;
1758 }
1759}
1760EXPORT_SYMBOL_GPL(pnfs_generic_pg_cleanup);
1761
1486/* 1762/*
1487 * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number 1763 * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number
1488 * of bytes (maximum @req->wb_bytes) that can be coalesced. 1764 * of bytes (maximum @req->wb_bytes) that can be coalesced.
1489 */ 1765 */
1490size_t 1766size_t
1491pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, 1767pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio,
1492 struct nfs_page *req) 1768 struct nfs_page *prev, struct nfs_page *req)
1493{ 1769{
1494 unsigned int size; 1770 unsigned int size;
1495 u64 seg_end, req_start, seg_left; 1771 u64 seg_end, req_start, seg_left;
@@ -1513,10 +1789,16 @@ pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev,
1513 seg_end = end_offset(pgio->pg_lseg->pls_range.offset, 1789 seg_end = end_offset(pgio->pg_lseg->pls_range.offset,
1514 pgio->pg_lseg->pls_range.length); 1790 pgio->pg_lseg->pls_range.length);
1515 req_start = req_offset(req); 1791 req_start = req_offset(req);
1516 WARN_ON_ONCE(req_start > seg_end); 1792 WARN_ON_ONCE(req_start >= seg_end);
1517 /* start of request is past the last byte of this segment */ 1793 /* start of request is past the last byte of this segment */
1518 if (req_start >= seg_end) 1794 if (req_start >= seg_end) {
1795 /* reference the new lseg */
1796 if (pgio->pg_ops->pg_cleanup)
1797 pgio->pg_ops->pg_cleanup(pgio);
1798 if (pgio->pg_ops->pg_init)
1799 pgio->pg_ops->pg_init(pgio, req);
1519 return 0; 1800 return 0;
1801 }
1520 1802
1521 /* adjust 'size' iff there are fewer bytes left in the 1803 /* adjust 'size' iff there are fewer bytes left in the
1522 * segment than what nfs_generic_pg_test returned */ 1804 * segment than what nfs_generic_pg_test returned */
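
A worked boundary case for the re-init added above, with illustrative numbers: if the cached lseg covers [0, 1 MiB), then

	seg_end   = end_offset(0, 1 MiB) = 1 MiB
	req_start = 1 MiB >= seg_end  ->  pg_cleanup + pg_init, return 0

so instead of merely returning 0, pg_cleanup drops the stale lseg reference and pg_init runs the layout lookup for the new offset; the 0 return still makes the caller flush what it has coalesced, after which the request is retried under the fresh segment.
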
@@ -1571,10 +1853,12 @@ static void
1571pnfs_write_through_mds(struct nfs_pageio_descriptor *desc, 1853pnfs_write_through_mds(struct nfs_pageio_descriptor *desc,
1572 struct nfs_pgio_header *hdr) 1854 struct nfs_pgio_header *hdr)
1573{ 1855{
1856 struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
1857
1574 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { 1858 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
1575 list_splice_tail_init(&hdr->pages, &desc->pg_list); 1859 list_splice_tail_init(&hdr->pages, &mirror->pg_list);
1576 nfs_pageio_reset_write_mds(desc); 1860 nfs_pageio_reset_write_mds(desc);
1577 desc->pg_recoalesce = 1; 1861 mirror->pg_recoalesce = 1;
1578 } 1862 }
1579 nfs_pgio_data_destroy(hdr); 1863 nfs_pgio_data_destroy(hdr);
1580} 1864}
@@ -1608,11 +1892,9 @@ pnfs_do_write(struct nfs_pageio_descriptor *desc,
1608 struct pnfs_layout_segment *lseg = desc->pg_lseg; 1892 struct pnfs_layout_segment *lseg = desc->pg_lseg;
1609 enum pnfs_try_status trypnfs; 1893 enum pnfs_try_status trypnfs;
1610 1894
1611 desc->pg_lseg = NULL;
1612 trypnfs = pnfs_try_to_write_data(hdr, call_ops, lseg, how); 1895 trypnfs = pnfs_try_to_write_data(hdr, call_ops, lseg, how);
1613 if (trypnfs == PNFS_NOT_ATTEMPTED) 1896 if (trypnfs == PNFS_NOT_ATTEMPTED)
1614 pnfs_write_through_mds(desc, hdr); 1897 pnfs_write_through_mds(desc, hdr);
1615 pnfs_put_lseg(lseg);
1616} 1898}
1617 1899
1618static void pnfs_writehdr_free(struct nfs_pgio_header *hdr) 1900static void pnfs_writehdr_free(struct nfs_pgio_header *hdr)
@@ -1625,24 +1907,23 @@ EXPORT_SYMBOL_GPL(pnfs_writehdr_free);
1625int 1907int
1626pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc) 1908pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
1627{ 1909{
1910 struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
1911
1628 struct nfs_pgio_header *hdr; 1912 struct nfs_pgio_header *hdr;
1629 int ret; 1913 int ret;
1630 1914
1631 hdr = nfs_pgio_header_alloc(desc->pg_rw_ops); 1915 hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
1632 if (!hdr) { 1916 if (!hdr) {
1633 desc->pg_completion_ops->error_cleanup(&desc->pg_list); 1917 desc->pg_completion_ops->error_cleanup(&mirror->pg_list);
1634 pnfs_put_lseg(desc->pg_lseg);
1635 desc->pg_lseg = NULL;
1636 return -ENOMEM; 1918 return -ENOMEM;
1637 } 1919 }
1638 nfs_pgheader_init(desc, hdr, pnfs_writehdr_free); 1920 nfs_pgheader_init(desc, hdr, pnfs_writehdr_free);
1921
1639 hdr->lseg = pnfs_get_lseg(desc->pg_lseg); 1922 hdr->lseg = pnfs_get_lseg(desc->pg_lseg);
1640 ret = nfs_generic_pgio(desc, hdr); 1923 ret = nfs_generic_pgio(desc, hdr);
1641 if (ret != 0) { 1924 if (!ret)
1642 pnfs_put_lseg(desc->pg_lseg);
1643 desc->pg_lseg = NULL;
1644 } else
1645 pnfs_do_write(desc, hdr, desc->pg_ioflags); 1925 pnfs_do_write(desc, hdr, desc->pg_ioflags);
1926
1646 return ret; 1927 return ret;
1647} 1928}
1648EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages); 1929EXPORT_SYMBOL_GPL(pnfs_generic_pg_writepages);
@@ -1687,10 +1968,12 @@ static void
1687pnfs_read_through_mds(struct nfs_pageio_descriptor *desc, 1968pnfs_read_through_mds(struct nfs_pageio_descriptor *desc,
1688 struct nfs_pgio_header *hdr) 1969 struct nfs_pgio_header *hdr)
1689{ 1970{
1971 struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
1972
1690 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { 1973 if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) {
1691 list_splice_tail_init(&hdr->pages, &desc->pg_list); 1974 list_splice_tail_init(&hdr->pages, &mirror->pg_list);
1692 nfs_pageio_reset_read_mds(desc); 1975 nfs_pageio_reset_read_mds(desc);
1693 desc->pg_recoalesce = 1; 1976 mirror->pg_recoalesce = 1;
1694 } 1977 }
1695 nfs_pgio_data_destroy(hdr); 1978 nfs_pgio_data_destroy(hdr);
1696} 1979}
@@ -1719,18 +2002,29 @@ pnfs_try_to_read_data(struct nfs_pgio_header *hdr,
1719 return trypnfs; 2002 return trypnfs;
1720} 2003}
1721 2004
2005/* Resend all requests through pnfs. */
2006int pnfs_read_resend_pnfs(struct nfs_pgio_header *hdr)
2007{
2008 struct nfs_pageio_descriptor pgio;
2009
2010 nfs_pageio_init_read(&pgio, hdr->inode, false, hdr->completion_ops);
2011 return nfs_pageio_resend(&pgio, hdr);
2012}
2013EXPORT_SYMBOL_GPL(pnfs_read_resend_pnfs);
2014
1722static void 2015static void
1723pnfs_do_read(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr) 2016pnfs_do_read(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr)
1724{ 2017{
1725 const struct rpc_call_ops *call_ops = desc->pg_rpc_callops; 2018 const struct rpc_call_ops *call_ops = desc->pg_rpc_callops;
1726 struct pnfs_layout_segment *lseg = desc->pg_lseg; 2019 struct pnfs_layout_segment *lseg = desc->pg_lseg;
1727 enum pnfs_try_status trypnfs; 2020 enum pnfs_try_status trypnfs;
2021 int err = 0;
1728 2022
1729 desc->pg_lseg = NULL;
1730 trypnfs = pnfs_try_to_read_data(hdr, call_ops, lseg); 2023 trypnfs = pnfs_try_to_read_data(hdr, call_ops, lseg);
1731 if (trypnfs == PNFS_NOT_ATTEMPTED) 2024 if (trypnfs == PNFS_TRY_AGAIN)
2025 err = pnfs_read_resend_pnfs(hdr);
2026 if (trypnfs == PNFS_NOT_ATTEMPTED || err)
1732 pnfs_read_through_mds(desc, hdr); 2027 pnfs_read_through_mds(desc, hdr);
1733 pnfs_put_lseg(lseg);
1734} 2028}
1735 2029
1736static void pnfs_readhdr_free(struct nfs_pgio_header *hdr) 2030static void pnfs_readhdr_free(struct nfs_pgio_header *hdr)
@@ -1743,24 +2037,20 @@ EXPORT_SYMBOL_GPL(pnfs_readhdr_free);
1743int 2037int
1744pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc) 2038pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
1745{ 2039{
2040 struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
2041
1746 struct nfs_pgio_header *hdr; 2042 struct nfs_pgio_header *hdr;
1747 int ret; 2043 int ret;
1748 2044
1749 hdr = nfs_pgio_header_alloc(desc->pg_rw_ops); 2045 hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
1750 if (!hdr) { 2046 if (!hdr) {
1751 desc->pg_completion_ops->error_cleanup(&desc->pg_list); 2047 desc->pg_completion_ops->error_cleanup(&mirror->pg_list);
1752 ret = -ENOMEM; 2048 return -ENOMEM;
1753 pnfs_put_lseg(desc->pg_lseg);
1754 desc->pg_lseg = NULL;
1755 return ret;
1756 } 2049 }
1757 nfs_pgheader_init(desc, hdr, pnfs_readhdr_free); 2050 nfs_pgheader_init(desc, hdr, pnfs_readhdr_free);
1758 hdr->lseg = pnfs_get_lseg(desc->pg_lseg); 2051 hdr->lseg = pnfs_get_lseg(desc->pg_lseg);
1759 ret = nfs_generic_pgio(desc, hdr); 2052 ret = nfs_generic_pgio(desc, hdr);
1760 if (ret != 0) { 2053 if (!ret)
1761 pnfs_put_lseg(desc->pg_lseg);
1762 desc->pg_lseg = NULL;
1763 } else
1764 pnfs_do_read(desc, hdr); 2054 pnfs_do_read(desc, hdr);
1765 return ret; 2055 return ret;
1766} 2056}
@@ -1966,6 +2256,7 @@ clear_layoutcommitting:
1966 pnfs_clear_layoutcommitting(inode); 2256 pnfs_clear_layoutcommitting(inode);
1967 goto out; 2257 goto out;
1968} 2258}
2259EXPORT_SYMBOL_GPL(pnfs_layoutcommit_inode);
1969 2260
1970struct nfs4_threshold *pnfs_mdsthreshold_alloc(void) 2261struct nfs4_threshold *pnfs_mdsthreshold_alloc(void)
1971{ 2262{
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 9ae5b765b073..635f0865671c 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -38,6 +38,25 @@ enum {
38 NFS_LSEG_VALID = 0, /* cleared when lseg is recalled/returned */ 38 NFS_LSEG_VALID = 0, /* cleared when lseg is recalled/returned */
39 NFS_LSEG_ROC, /* roc bit received from server */ 39 NFS_LSEG_ROC, /* roc bit received from server */
40 NFS_LSEG_LAYOUTCOMMIT, /* layoutcommit bit set for layoutcommit */ 40 NFS_LSEG_LAYOUTCOMMIT, /* layoutcommit bit set for layoutcommit */
41 NFS_LSEG_LAYOUTRETURN, /* layoutreturn bit set for layoutreturn */
42};
43
44/* Individual ip address */
45struct nfs4_pnfs_ds_addr {
46 struct sockaddr_storage da_addr;
47 size_t da_addrlen;
48 struct list_head da_node; /* nfs4_pnfs_dev_hlist dev_dslist */
49 char *da_remotestr; /* human readable addr+port */
50};
51
52struct nfs4_pnfs_ds {
53 struct list_head ds_node; /* nfs4_pnfs_dev_hlist dev_dslist */
54 char *ds_remotestr; /* comma sep list of addrs */
55 struct list_head ds_addrs;
56 struct nfs_client *ds_clp;
57 atomic_t ds_count;
58 unsigned long ds_state;
59#define NFS4DS_CONNECTING 0 /* ds is establishing connection */
41}; 60};
42 61
43struct pnfs_layout_segment { 62struct pnfs_layout_segment {
@@ -53,19 +72,34 @@ struct pnfs_layout_segment {
53enum pnfs_try_status { 72enum pnfs_try_status {
54 PNFS_ATTEMPTED = 0, 73 PNFS_ATTEMPTED = 0,
55 PNFS_NOT_ATTEMPTED = 1, 74 PNFS_NOT_ATTEMPTED = 1,
75 PNFS_TRY_AGAIN = 2,
56}; 76};
57 77
58#ifdef CONFIG_NFS_V4_1 78#ifdef CONFIG_NFS_V4_1
59 79
60#define LAYOUT_NFSV4_1_MODULE_PREFIX "nfs-layouttype4" 80#define LAYOUT_NFSV4_1_MODULE_PREFIX "nfs-layouttype4"
61 81
82/*
 83 * Default data server connection timeout and retrans values.
84 * Set by module parameters dataserver_timeo and dataserver_retrans.
85 */
86#define NFS4_DEF_DS_TIMEO 600 /* in tenths of a second */
87#define NFS4_DEF_DS_RETRANS 5
88
89/* error codes for internal use */
90#define NFS4ERR_RESET_TO_MDS 12001
91#define NFS4ERR_RESET_TO_PNFS 12002
92
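
Both values sit deliberately outside the RFC 5661 error space: they are client-internal and never go on the wire. The intent is for a layout driver's error handler to map real RPC/I/O failures onto one of them to pick a resend strategy; a hypothetical mapping (not part of this patch):

	/* sketch: driver-internal policy, names illustrative */
	if (ds_is_retryable(err))
		return NFS4ERR_RESET_TO_PNFS;	/* retry via pNFS (another DS/mirror) */
	return NFS4ERR_RESET_TO_MDS;		/* give up on pNFS, use the MDS */
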
62enum { 93enum {
63 NFS_LAYOUT_RO_FAILED = 0, /* get ro layout failed stop trying */ 94 NFS_LAYOUT_RO_FAILED = 0, /* get ro layout failed stop trying */
64 NFS_LAYOUT_RW_FAILED, /* get rw layout failed stop trying */ 95 NFS_LAYOUT_RW_FAILED, /* get rw layout failed stop trying */
65 NFS_LAYOUT_BULK_RECALL, /* bulk recall affecting layout */ 96 NFS_LAYOUT_BULK_RECALL, /* bulk recall affecting layout */
66 NFS_LAYOUT_ROC, /* some lseg had roc bit set */ 97 NFS_LAYOUT_ROC, /* some lseg had roc bit set */
67 NFS_LAYOUT_RETURN, /* Return this layout ASAP */ 98 NFS_LAYOUT_RETURN, /* Return this layout ASAP */
99 NFS_LAYOUT_RETURN_BEFORE_CLOSE, /* Return this layout before close */
68 NFS_LAYOUT_INVALID_STID, /* layout stateid id is invalid */ 100 NFS_LAYOUT_INVALID_STID, /* layout stateid id is invalid */
101 NFS_LAYOUT_FIRST_LAYOUTGET, /* Serialize first layoutget */
102 NFS_LAYOUT_RETRY_LAYOUTGET, /* Retry layoutget */
69}; 103};
70 104
71enum layoutdriver_policy_flags { 105enum layoutdriver_policy_flags {
@@ -106,7 +140,8 @@ struct pnfs_layoutdriver_type {
106 struct pnfs_ds_commit_info *(*get_ds_info) (struct inode *inode); 140 struct pnfs_ds_commit_info *(*get_ds_info) (struct inode *inode);
107 void (*mark_request_commit) (struct nfs_page *req, 141 void (*mark_request_commit) (struct nfs_page *req,
108 struct pnfs_layout_segment *lseg, 142 struct pnfs_layout_segment *lseg,
109 struct nfs_commit_info *cinfo); 143 struct nfs_commit_info *cinfo,
144 u32 ds_commit_idx);
110 void (*clear_request_commit) (struct nfs_page *req, 145 void (*clear_request_commit) (struct nfs_page *req,
111 struct nfs_commit_info *cinfo); 146 struct nfs_commit_info *cinfo);
112 int (*scan_commit_lists) (struct nfs_commit_info *cinfo, 147 int (*scan_commit_lists) (struct nfs_commit_info *cinfo,
@@ -154,6 +189,7 @@ struct pnfs_layout_hdr {
154 u32 plh_barrier; /* ignore lower seqids */ 189 u32 plh_barrier; /* ignore lower seqids */
155 unsigned long plh_retry_timestamp; 190 unsigned long plh_retry_timestamp;
156 unsigned long plh_flags; 191 unsigned long plh_flags;
192 enum pnfs_iomode plh_return_iomode;
157 loff_t plh_lwb; /* last write byte for layoutcommit */ 193 loff_t plh_lwb; /* last write byte for layoutcommit */
158 struct rpc_cred *plh_lc_cred; /* layoutcommit cred */ 194 struct rpc_cred *plh_lc_cred; /* layoutcommit cred */
159 struct inode *plh_inode; 195 struct inode *plh_inode;
@@ -185,7 +221,7 @@ extern int nfs4_proc_getdeviceinfo(struct nfs_server *server,
185 struct pnfs_device *dev, 221 struct pnfs_device *dev,
186 struct rpc_cred *cred); 222 struct rpc_cred *cred);
187extern struct pnfs_layout_segment* nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags); 223extern struct pnfs_layout_segment* nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags);
188extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp); 224extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync);
189 225
190/* pnfs.c */ 226/* pnfs.c */
191void pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo); 227void pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo);
@@ -198,6 +234,7 @@ void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *, struct nfs_page *
198int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc); 234int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc);
199void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, 235void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio,
200 struct nfs_page *req, u64 wb_size); 236 struct nfs_page *req, u64 wb_size);
237void pnfs_generic_pg_cleanup(struct nfs_pageio_descriptor *);
201int pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc); 238int pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc);
202size_t pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, 239size_t pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio,
203 struct nfs_page *prev, struct nfs_page *req); 240 struct nfs_page *prev, struct nfs_page *req);
@@ -217,6 +254,7 @@ void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
217 bool update_barrier); 254 bool update_barrier);
218int pnfs_choose_layoutget_stateid(nfs4_stateid *dst, 255int pnfs_choose_layoutget_stateid(nfs4_stateid *dst,
219 struct pnfs_layout_hdr *lo, 256 struct pnfs_layout_hdr *lo,
257 struct pnfs_layout_range *range,
220 struct nfs4_state *open_state); 258 struct nfs4_state *open_state);
221int pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, 259int pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
222 struct list_head *tmp_list, 260 struct list_head *tmp_list,
@@ -233,17 +271,21 @@ int _pnfs_return_layout(struct inode *);
233int pnfs_commit_and_return_layout(struct inode *); 271int pnfs_commit_and_return_layout(struct inode *);
234void pnfs_ld_write_done(struct nfs_pgio_header *); 272void pnfs_ld_write_done(struct nfs_pgio_header *);
235void pnfs_ld_read_done(struct nfs_pgio_header *); 273void pnfs_ld_read_done(struct nfs_pgio_header *);
274int pnfs_read_resend_pnfs(struct nfs_pgio_header *);
236struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino, 275struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino,
237 struct nfs_open_context *ctx, 276 struct nfs_open_context *ctx,
238 loff_t pos, 277 loff_t pos,
239 u64 count, 278 u64 count,
240 enum pnfs_iomode iomode, 279 enum pnfs_iomode iomode,
241 gfp_t gfp_flags); 280 gfp_t gfp_flags);
281void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo);
242 282
243void nfs4_deviceid_mark_client_invalid(struct nfs_client *clp); 283void nfs4_deviceid_mark_client_invalid(struct nfs_client *clp);
244int pnfs_read_done_resend_to_mds(struct nfs_pgio_header *); 284int pnfs_read_done_resend_to_mds(struct nfs_pgio_header *);
245int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *); 285int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *);
246struct nfs4_threshold *pnfs_mdsthreshold_alloc(void); 286struct nfs4_threshold *pnfs_mdsthreshold_alloc(void);
287void pnfs_error_mark_layout_for_return(struct inode *inode,
288 struct pnfs_layout_segment *lseg);
247 289
248/* nfs4_deviceid_flags */ 290/* nfs4_deviceid_flags */
249enum { 291enum {
@@ -275,6 +317,43 @@ void nfs4_mark_deviceid_unavailable(struct nfs4_deviceid_node *node);
275bool nfs4_test_deviceid_unavailable(struct nfs4_deviceid_node *node); 317bool nfs4_test_deviceid_unavailable(struct nfs4_deviceid_node *node);
276void nfs4_deviceid_purge_client(const struct nfs_client *); 318void nfs4_deviceid_purge_client(const struct nfs_client *);
277 319
320/* pnfs_nfs.c */
321void pnfs_generic_clear_request_commit(struct nfs_page *req,
322 struct nfs_commit_info *cinfo);
323void pnfs_generic_commit_release(void *calldata);
324void pnfs_generic_prepare_to_resend_writes(struct nfs_commit_data *data);
325void pnfs_generic_rw_release(void *data);
326void pnfs_generic_recover_commit_reqs(struct list_head *dst,
327 struct nfs_commit_info *cinfo);
328int pnfs_generic_commit_pagelist(struct inode *inode,
329 struct list_head *mds_pages,
330 int how,
331 struct nfs_commit_info *cinfo,
332 int (*initiate_commit)(struct nfs_commit_data *data,
333 int how));
334int pnfs_generic_scan_commit_lists(struct nfs_commit_info *cinfo, int max);
335void pnfs_generic_write_commit_done(struct rpc_task *task, void *data);
336void nfs4_pnfs_ds_put(struct nfs4_pnfs_ds *ds);
337struct nfs4_pnfs_ds *nfs4_pnfs_ds_add(struct list_head *dsaddrs,
338 gfp_t gfp_flags);
339void nfs4_pnfs_v3_ds_connect_unload(void);
340void nfs4_pnfs_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds,
341 struct nfs4_deviceid_node *devid, unsigned int timeo,
342 unsigned int retrans, u32 version, u32 minor_version,
343 rpc_authflavor_t au_flavor);
344struct nfs4_pnfs_ds_addr *nfs4_decode_mp_ds_addr(struct net *net,
345 struct xdr_stream *xdr,
346 gfp_t gfp_flags);
347void pnfs_layout_mark_request_commit(struct nfs_page *req,
348 struct pnfs_layout_segment *lseg,
349 struct nfs_commit_info *cinfo,
350 u32 ds_commit_idx);
351
352static inline bool nfs_have_layout(struct inode *inode)
353{
354 return NFS_I(inode)->layout != NULL;
355}
356
278static inline struct nfs4_deviceid_node * 357static inline struct nfs4_deviceid_node *
279nfs4_get_deviceid(struct nfs4_deviceid_node *d) 358nfs4_get_deviceid(struct nfs4_deviceid_node *d)
280{ 359{
@@ -282,6 +361,26 @@ nfs4_get_deviceid(struct nfs4_deviceid_node *d)
282 return d; 361 return d;
283} 362}
284 363
364static inline void pnfs_set_retry_layoutget(struct pnfs_layout_hdr *lo)
365{
366 if (!test_and_set_bit(NFS_LAYOUT_RETRY_LAYOUTGET, &lo->plh_flags))
367 atomic_inc(&lo->plh_refcount);
368}
369
370static inline void pnfs_clear_retry_layoutget(struct pnfs_layout_hdr *lo)
371{
372 if (test_and_clear_bit(NFS_LAYOUT_RETRY_LAYOUTGET, &lo->plh_flags)) {
373 atomic_dec(&lo->plh_refcount);
374 /* wake up waiters for LAYOUTRETURN as that is not needed */
375 wake_up_bit(&lo->plh_flags, NFS_LAYOUT_RETURN);
376 }
377}
378
379static inline bool pnfs_should_retry_layoutget(struct pnfs_layout_hdr *lo)
380{
381 return test_bit(NFS_LAYOUT_RETRY_LAYOUTGET, &lo->plh_flags);
382}
383
285static inline struct pnfs_layout_segment * 384static inline struct pnfs_layout_segment *
286pnfs_get_lseg(struct pnfs_layout_segment *lseg) 385pnfs_get_lseg(struct pnfs_layout_segment *lseg)
287{ 386{
@@ -317,16 +416,22 @@ pnfs_get_ds_info(struct inode *inode)
317 return ld->get_ds_info(inode); 416 return ld->get_ds_info(inode);
318} 417}
319 418
419static inline void
420pnfs_generic_mark_devid_invalid(struct nfs4_deviceid_node *node)
421{
422 set_bit(NFS_DEVICEID_INVALID, &node->flags);
423}
424
320static inline bool 425static inline bool
321pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg, 426pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg,
322 struct nfs_commit_info *cinfo) 427 struct nfs_commit_info *cinfo, u32 ds_commit_idx)
323{ 428{
324 struct inode *inode = req->wb_context->dentry->d_inode; 429 struct inode *inode = req->wb_context->dentry->d_inode;
325 struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld; 430 struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
326 431
327 if (lseg == NULL || ld->mark_request_commit == NULL) 432 if (lseg == NULL || ld->mark_request_commit == NULL)
328 return false; 433 return false;
329 ld->mark_request_commit(req, lseg, cinfo); 434 ld->mark_request_commit(req, lseg, cinfo, ds_commit_idx);
330 return true; 435 return true;
331} 436}
332 437
@@ -352,15 +457,6 @@ pnfs_scan_commit_lists(struct inode *inode, struct nfs_commit_info *cinfo,
352 return NFS_SERVER(inode)->pnfs_curr_ld->scan_commit_lists(cinfo, max); 457 return NFS_SERVER(inode)->pnfs_curr_ld->scan_commit_lists(cinfo, max);
353} 458}
354 459
355static inline void
356pnfs_recover_commit_reqs(struct inode *inode, struct list_head *list,
357 struct nfs_commit_info *cinfo)
358{
359 if (cinfo->ds == NULL || cinfo->ds->nwritten == 0)
360 return;
361 NFS_SERVER(inode)->pnfs_curr_ld->recover_commit_reqs(list, cinfo);
362}
363
364static inline struct nfs_page * 460static inline struct nfs_page *
365pnfs_search_commit_reqs(struct inode *inode, struct nfs_commit_info *cinfo, 461pnfs_search_commit_reqs(struct inode *inode, struct nfs_commit_info *cinfo,
366 struct page *page) 462 struct page *page)
@@ -427,6 +523,11 @@ static inline void nfs4_print_deviceid(const struct nfs4_deviceid *dev_id)
427#endif /* NFS_DEBUG */ 523#endif /* NFS_DEBUG */
428#else /* CONFIG_NFS_V4_1 */ 524#else /* CONFIG_NFS_V4_1 */
429 525
526static inline bool nfs_have_layout(struct inode *inode)
527{
528 return false;
529}
530
430static inline void pnfs_destroy_all_layouts(struct nfs_client *clp) 531static inline void pnfs_destroy_all_layouts(struct nfs_client *clp)
431{ 532{
432} 533}
@@ -513,7 +614,7 @@ pnfs_get_ds_info(struct inode *inode)
513 614
514static inline bool 615static inline bool
515pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg, 616pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg,
516 struct nfs_commit_info *cinfo) 617 struct nfs_commit_info *cinfo, u32 ds_commit_idx)
517{ 618{
518 return false; 619 return false;
519} 620}
@@ -531,12 +632,6 @@ pnfs_scan_commit_lists(struct inode *inode, struct nfs_commit_info *cinfo,
531 return 0; 632 return 0;
532} 633}
533 634
534static inline void
535pnfs_recover_commit_reqs(struct inode *inode, struct list_head *list,
536 struct nfs_commit_info *cinfo)
537{
538}
539
540static inline struct nfs_page * 635static inline struct nfs_page *
541pnfs_search_commit_reqs(struct inode *inode, struct nfs_commit_info *cinfo, 636pnfs_search_commit_reqs(struct inode *inode, struct nfs_commit_info *cinfo,
542 struct page *page) 637 struct page *page)
@@ -568,6 +663,10 @@ static inline struct nfs4_threshold *pnfs_mdsthreshold_alloc(void)
568 return NULL; 663 return NULL;
569} 664}
570 665
666static inline void nfs4_pnfs_v3_ds_connect_unload(void)
667{
668}
669
571#endif /* CONFIG_NFS_V4_1 */ 670#endif /* CONFIG_NFS_V4_1 */
572 671
573#endif /* FS_NFS_PNFS_H */ 672#endif /* FS_NFS_PNFS_H */
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
new file mode 100644
index 000000000000..54e36b38fb5f
--- /dev/null
+++ b/fs/nfs/pnfs_nfs.c
@@ -0,0 +1,870 @@
1/*
 2 * Common NFS I/O operations for the pnfs file-based
3 * layout drivers.
4 *
5 * Copyright (c) 2014, Primary Data, Inc. All rights reserved.
6 *
7 * Tom Haynes <loghyr@primarydata.com>
8 */
9
10#include <linux/nfs_fs.h>
11#include <linux/nfs_page.h>
12#include <linux/sunrpc/addr.h>
13#include <linux/module.h>
14
15#include "nfs4session.h"
16#include "internal.h"
17#include "pnfs.h"
18
19#define NFSDBG_FACILITY NFSDBG_PNFS
20
21void pnfs_generic_rw_release(void *data)
22{
23 struct nfs_pgio_header *hdr = data;
24
25 nfs_put_client(hdr->ds_clp);
26 hdr->mds_ops->rpc_release(data);
27}
28EXPORT_SYMBOL_GPL(pnfs_generic_rw_release);
29
30/* Fake up some data that will cause nfs_commit_release to retry the writes. */
31void pnfs_generic_prepare_to_resend_writes(struct nfs_commit_data *data)
32{
33 struct nfs_page *first = nfs_list_entry(data->pages.next);
34
35 data->task.tk_status = 0;
36 memcpy(&data->verf.verifier, &first->wb_verf,
37 sizeof(data->verf.verifier));
38 data->verf.verifier.data[0]++; /* ensure verifier mismatch */
39}
40EXPORT_SYMBOL_GPL(pnfs_generic_prepare_to_resend_writes);
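The verifier bump above works because the generic commit completion path compares each request's cached write verifier against the verifier returned by COMMIT and re-dirties the page on a mismatch. A condensed sketch of the check this function deliberately defeats (hedged and illustrative, not verbatim kernel code):

	/* Sketch of the comparison in the commit completion path: */
	if (memcmp(&req->wb_verf, &data->verf.verifier,
		   sizeof(req->wb_verf)) == 0)
		nfs_inode_remove_request(req);	/* commit was stable; done */
	else
		nfs_mark_request_dirty(req);	/* mismatch: resend the write */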
41
42void pnfs_generic_write_commit_done(struct rpc_task *task, void *data)
43{
44 struct nfs_commit_data *wdata = data;
45
46 /* Note this may cause RPC to be resent */
47 wdata->mds_ops->rpc_call_done(task, data);
48}
49EXPORT_SYMBOL_GPL(pnfs_generic_write_commit_done);
50
51void pnfs_generic_commit_release(void *calldata)
52{
53 struct nfs_commit_data *data = calldata;
54
55 data->completion_ops->completion(data);
56 pnfs_put_lseg(data->lseg);
57 nfs_put_client(data->ds_clp);
58 nfs_commitdata_release(data);
59}
60EXPORT_SYMBOL_GPL(pnfs_generic_commit_release);
61
62/* The generic layer is about to remove the req from the commit list.
63 * If this will make the bucket empty, it will need to put the lseg reference.
64 * Note this must be called holding the inode (/cinfo) lock
65 */
66void
67pnfs_generic_clear_request_commit(struct nfs_page *req,
68 struct nfs_commit_info *cinfo)
69{
70 struct pnfs_layout_segment *freeme = NULL;
71
72 if (!test_and_clear_bit(PG_COMMIT_TO_DS, &req->wb_flags))
73 goto out;
74 cinfo->ds->nwritten--;
75 if (list_is_singular(&req->wb_list)) {
76 struct pnfs_commit_bucket *bucket;
77
78 bucket = list_first_entry(&req->wb_list,
79 struct pnfs_commit_bucket,
80 written);
81 freeme = bucket->wlseg;
82 bucket->wlseg = NULL;
83 }
84out:
85 nfs_request_remove_commit_list(req, cinfo);
86 pnfs_put_lseg_locked(freeme);
87}
88EXPORT_SYMBOL_GPL(pnfs_generic_clear_request_commit);
89
90static int
91pnfs_generic_transfer_commit_list(struct list_head *src, struct list_head *dst,
92 struct nfs_commit_info *cinfo, int max)
93{
94 struct nfs_page *req, *tmp;
95 int ret = 0;
96
97 list_for_each_entry_safe(req, tmp, src, wb_list) {
98 if (!nfs_lock_request(req))
99 continue;
100 kref_get(&req->wb_kref);
101 if (cond_resched_lock(cinfo->lock))
102 list_safe_reset_next(req, tmp, wb_list);
103 nfs_request_remove_commit_list(req, cinfo);
104 clear_bit(PG_COMMIT_TO_DS, &req->wb_flags);
105 nfs_list_add_request(req, dst);
106 ret++;
107 if ((ret == max) && !cinfo->dreq)
108 break;
109 }
110 return ret;
111}
112
113static int
114pnfs_generic_scan_ds_commit_list(struct pnfs_commit_bucket *bucket,
115 struct nfs_commit_info *cinfo,
116 int max)
117{
118 struct list_head *src = &bucket->written;
119 struct list_head *dst = &bucket->committing;
120 int ret;
121
122 lockdep_assert_held(cinfo->lock);
123 ret = pnfs_generic_transfer_commit_list(src, dst, cinfo, max);
124 if (ret) {
125 cinfo->ds->nwritten -= ret;
126 cinfo->ds->ncommitting += ret;
127 bucket->clseg = bucket->wlseg;
128 if (list_empty(src))
129 bucket->wlseg = NULL;
130 else
131 pnfs_get_lseg(bucket->clseg);
132 }
133 return ret;
134}
135
136/* Move reqs from written to committing lists, returning count
137 * of number moved.
138 */
139int pnfs_generic_scan_commit_lists(struct nfs_commit_info *cinfo,
140 int max)
141{
142 int i, rv = 0, cnt;
143
144 lockdep_assert_held(cinfo->lock);
145 for (i = 0; i < cinfo->ds->nbuckets && max != 0; i++) {
146 cnt = pnfs_generic_scan_ds_commit_list(&cinfo->ds->buckets[i],
147 cinfo, max);
148 max -= cnt;
149 rv += cnt;
150 }
151 return rv;
152}
153EXPORT_SYMBOL_GPL(pnfs_generic_scan_commit_lists);
154
155/* Pull everything off the committing lists and dump into @dst. */
156void pnfs_generic_recover_commit_reqs(struct list_head *dst,
157 struct nfs_commit_info *cinfo)
158{
159 struct pnfs_commit_bucket *b;
160 struct pnfs_layout_segment *freeme;
161 int i;
162
163 lockdep_assert_held(cinfo->lock);
164restart:
165 for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) {
166 if (pnfs_generic_transfer_commit_list(&b->written, dst,
167 cinfo, 0)) {
168 freeme = b->wlseg;
169 b->wlseg = NULL;
170 spin_unlock(cinfo->lock);
171 pnfs_put_lseg(freeme);
172 spin_lock(cinfo->lock);
173 goto restart;
174 }
175 }
176 cinfo->ds->nwritten = 0;
177}
178EXPORT_SYMBOL_GPL(pnfs_generic_recover_commit_reqs);
179
180static void pnfs_generic_retry_commit(struct nfs_commit_info *cinfo, int idx)
181{
182 struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
183 struct pnfs_commit_bucket *bucket;
184 struct pnfs_layout_segment *freeme;
185 int i;
186
187 for (i = idx; i < fl_cinfo->nbuckets; i++) {
188 bucket = &fl_cinfo->buckets[i];
189 if (list_empty(&bucket->committing))
190 continue;
191 nfs_retry_commit(&bucket->committing, bucket->clseg, cinfo, i);
192 spin_lock(cinfo->lock);
193 freeme = bucket->clseg;
194 bucket->clseg = NULL;
195 spin_unlock(cinfo->lock);
196 pnfs_put_lseg(freeme);
197 }
198}
199
200static unsigned int
201pnfs_generic_alloc_ds_commits(struct nfs_commit_info *cinfo,
202 struct list_head *list)
203{
204 struct pnfs_ds_commit_info *fl_cinfo;
205 struct pnfs_commit_bucket *bucket;
206 struct nfs_commit_data *data;
207 int i;
208 unsigned int nreq = 0;
209
210 fl_cinfo = cinfo->ds;
211 bucket = fl_cinfo->buckets;
212 for (i = 0; i < fl_cinfo->nbuckets; i++, bucket++) {
213 if (list_empty(&bucket->committing))
214 continue;
215 data = nfs_commitdata_alloc();
216 if (!data)
217 break;
218 data->ds_commit_index = i;
219 spin_lock(cinfo->lock);
220 data->lseg = bucket->clseg;
221 bucket->clseg = NULL;
222 spin_unlock(cinfo->lock);
223 list_add(&data->pages, list);
224 nreq++;
225 }
226
227 /* Clean up on error */
228 pnfs_generic_retry_commit(cinfo, i);
229 return nreq;
230}
231
232/* This follows nfs_commit_list pretty closely */
233int
234pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
235 int how, struct nfs_commit_info *cinfo,
236 int (*initiate_commit)(struct nfs_commit_data *data,
237 int how))
238{
239 struct nfs_commit_data *data, *tmp;
240 LIST_HEAD(list);
241 unsigned int nreq = 0;
242
243 if (!list_empty(mds_pages)) {
244 data = nfs_commitdata_alloc();
245 if (data != NULL) {
246 data->lseg = NULL;
247 list_add(&data->pages, &list);
248 nreq++;
249 } else {
250 nfs_retry_commit(mds_pages, NULL, cinfo, 0);
251 pnfs_generic_retry_commit(cinfo, 0);
252 cinfo->completion_ops->error_cleanup(NFS_I(inode));
253 return -ENOMEM;
254 }
255 }
256
257 nreq += pnfs_generic_alloc_ds_commits(cinfo, &list);
258
259 if (nreq == 0) {
260 cinfo->completion_ops->error_cleanup(NFS_I(inode));
261 goto out;
262 }
263
264 atomic_add(nreq, &cinfo->mds->rpcs_out);
265
266 list_for_each_entry_safe(data, tmp, &list, pages) {
267 list_del_init(&data->pages);
268 if (!data->lseg) {
269 nfs_init_commit(data, mds_pages, NULL, cinfo);
270 nfs_initiate_commit(NFS_CLIENT(inode), data,
271 NFS_PROTO(data->inode),
272 data->mds_ops, how, 0);
273 } else {
274 struct pnfs_commit_bucket *buckets;
275
276 buckets = cinfo->ds->buckets;
277 nfs_init_commit(data,
278 &buckets[data->ds_commit_index].committing,
279 data->lseg,
280 cinfo);
281 initiate_commit(data, how);
282 }
283 }
284out:
285 cinfo->ds->ncommitting = 0;
286 return PNFS_ATTEMPTED;
287}
288EXPORT_SYMBOL_GPL(pnfs_generic_commit_pagelist);
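A file-based layout driver is expected to supply only the per-data-server COMMIT initiator and let this helper do the bucket bookkeeping. A minimal sketch of such a caller (the my_layout_* names are hypothetical, not part of this patch):

	static int my_layout_initiate_commit(struct nfs_commit_data *data, int how)
	{
		/* Select the DS from data->ds_commit_index and fire the
		 * COMMIT RPC against it; sketched as immediate success. */
		return 0;
	}

	static int my_layout_commit_pagelist(struct inode *inode,
					     struct list_head *mds_pages,
					     int how, struct nfs_commit_info *cinfo)
	{
		return pnfs_generic_commit_pagelist(inode, mds_pages, how, cinfo,
						    my_layout_initiate_commit);
	}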
289
290/*
291 * Data server cache
292 *
293 * Data servers can be mapped to different device ids.
294 * nfs4_pnfs_ds reference counting
295 * - set to 1 on allocation
296 * - incremented when a device id maps a data server already in the cache.
297 * - decremented when deviceid is removed from the cache.
298 */
299static DEFINE_SPINLOCK(nfs4_ds_cache_lock);
300static LIST_HEAD(nfs4_data_server_cache);
301
302/* Debug routines */
303static void
304print_ds(struct nfs4_pnfs_ds *ds)
305{
306 if (ds == NULL) {
307 printk(KERN_WARNING "%s NULL device\n", __func__);
308 return;
309 }
310 printk(KERN_WARNING " ds %s\n"
311 " ref count %d\n"
312 " client %p\n"
313 " cl_exchange_flags %x\n",
314 ds->ds_remotestr,
315 atomic_read(&ds->ds_count), ds->ds_clp,
316 ds->ds_clp ? ds->ds_clp->cl_exchange_flags : 0);
317}
318
319static bool
320same_sockaddr(struct sockaddr *addr1, struct sockaddr *addr2)
321{
322 struct sockaddr_in *a, *b;
323 struct sockaddr_in6 *a6, *b6;
324
325 if (addr1->sa_family != addr2->sa_family)
326 return false;
327
328 switch (addr1->sa_family) {
329 case AF_INET:
330 a = (struct sockaddr_in *)addr1;
331 b = (struct sockaddr_in *)addr2;
332
333 if (a->sin_addr.s_addr == b->sin_addr.s_addr &&
334 a->sin_port == b->sin_port)
335 return true;
336 break;
337
338 case AF_INET6:
339 a6 = (struct sockaddr_in6 *)addr1;
340 b6 = (struct sockaddr_in6 *)addr2;
341
342 /* LINKLOCAL addresses must have matching scope_id */
343 if (ipv6_addr_src_scope(&a6->sin6_addr) ==
344 IPV6_ADDR_SCOPE_LINKLOCAL &&
345 a6->sin6_scope_id != b6->sin6_scope_id)
346 return false;
347
348 if (ipv6_addr_equal(&a6->sin6_addr, &b6->sin6_addr) &&
349 a6->sin6_port == b6->sin6_port)
350 return true;
351 break;
352
353 default:
354 dprintk("%s: unhandled address family: %u\n",
355 __func__, addr1->sa_family);
356 return false;
357 }
358
359 return false;
360}
361
362static bool
363_same_data_server_addrs_locked(const struct list_head *dsaddrs1,
364 const struct list_head *dsaddrs2)
365{
366 struct nfs4_pnfs_ds_addr *da1, *da2;
367
368 /* step through both lists, comparing as we go */
369 for (da1 = list_first_entry(dsaddrs1, typeof(*da1), da_node),
370 da2 = list_first_entry(dsaddrs2, typeof(*da2), da_node);
371 da1 != NULL && da2 != NULL;
372 da1 = list_entry(da1->da_node.next, typeof(*da1), da_node),
373 da2 = list_entry(da2->da_node.next, typeof(*da2), da_node)) {
374 if (!same_sockaddr((struct sockaddr *)&da1->da_addr,
375 (struct sockaddr *)&da2->da_addr))
376 return false;
377 }
378 if (da1 == NULL && da2 == NULL)
379 return true;
380
381 return false;
382}
383
384/*
385 * Lookup DS by addresses. nfs4_ds_cache_lock is held
386 */
387static struct nfs4_pnfs_ds *
388_data_server_lookup_locked(const struct list_head *dsaddrs)
389{
390 struct nfs4_pnfs_ds *ds;
391
392 list_for_each_entry(ds, &nfs4_data_server_cache, ds_node)
393 if (_same_data_server_addrs_locked(&ds->ds_addrs, dsaddrs))
394 return ds;
395 return NULL;
396}
397
398static void destroy_ds(struct nfs4_pnfs_ds *ds)
399{
400 struct nfs4_pnfs_ds_addr *da;
401
402 dprintk("--> %s\n", __func__);
403 ifdebug(FACILITY)
404 print_ds(ds);
405
406 nfs_put_client(ds->ds_clp);
407
408 while (!list_empty(&ds->ds_addrs)) {
409 da = list_first_entry(&ds->ds_addrs,
410 struct nfs4_pnfs_ds_addr,
411 da_node);
412 list_del_init(&da->da_node);
413 kfree(da->da_remotestr);
414 kfree(da);
415 }
416
417 kfree(ds->ds_remotestr);
418 kfree(ds);
419}
420
421void nfs4_pnfs_ds_put(struct nfs4_pnfs_ds *ds)
422{
423 if (atomic_dec_and_lock(&ds->ds_count,
424 &nfs4_ds_cache_lock)) {
425 list_del_init(&ds->ds_node);
426 spin_unlock(&nfs4_ds_cache_lock);
427 destroy_ds(ds);
428 }
429}
430EXPORT_SYMBOL_GPL(nfs4_pnfs_ds_put);
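Putting the reference-counting rules above together, a layout driver's device-id code pairs one nfs4_pnfs_ds_add() per device id with one nfs4_pnfs_ds_put() at teardown. A hedged sketch of that lifecycle:

	LIST_HEAD(dsaddrs);
	struct nfs4_pnfs_ds *ds;

	/* decode each multipath address with nfs4_decode_mp_ds_addr()
	 * and chain it onto dsaddrs via da_node, then: */
	ds = nfs4_pnfs_ds_add(&dsaddrs, GFP_KERNEL); /* new entry, or cached + ref */
	if (!ds)
		return -ENOMEM;
	/* ... hand ds to the device id; when the device id is destroyed: */
	nfs4_pnfs_ds_put(ds);	/* final put unhashes and frees the entry */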
431
432/*
433 * Create a string with a human readable address and port to avoid
 434 * complicated setup around many dprintks.
435 */
436static char *
437nfs4_pnfs_remotestr(struct list_head *dsaddrs, gfp_t gfp_flags)
438{
439 struct nfs4_pnfs_ds_addr *da;
440 char *remotestr;
441 size_t len;
442 char *p;
443
 444	len = 3;        /* '{', '}' and terminating NUL */
445 list_for_each_entry(da, dsaddrs, da_node) {
446 len += strlen(da->da_remotestr) + 1; /* string plus comma */
447 }
448
449 remotestr = kzalloc(len, gfp_flags);
450 if (!remotestr)
451 return NULL;
452
453 p = remotestr;
454 *(p++) = '{';
455 len--;
456 list_for_each_entry(da, dsaddrs, da_node) {
457 size_t ll = strlen(da->da_remotestr);
458
459 if (ll > len)
460 goto out_err;
461
462 memcpy(p, da->da_remotestr, ll);
463 p += ll;
464 len -= ll;
465
466 if (len < 1)
467 goto out_err;
468 (*p++) = ',';
469 len--;
470 }
471 if (len < 2)
472 goto out_err;
473 *(p++) = '}';
474 *p = '\0';
475 return remotestr;
476out_err:
477 kfree(remotestr);
478 return NULL;
479}
480
481/*
482 * Given a list of multipath struct nfs4_pnfs_ds_addr, add it to ds cache if
483 * uncached and return cached struct nfs4_pnfs_ds.
484 */
485struct nfs4_pnfs_ds *
486nfs4_pnfs_ds_add(struct list_head *dsaddrs, gfp_t gfp_flags)
487{
488 struct nfs4_pnfs_ds *tmp_ds, *ds = NULL;
489 char *remotestr;
490
491 if (list_empty(dsaddrs)) {
492 dprintk("%s: no addresses defined\n", __func__);
493 goto out;
494 }
495
496 ds = kzalloc(sizeof(*ds), gfp_flags);
497 if (!ds)
498 goto out;
499
 500	/* this is only used for debugging, so it's OK if it's NULL */
501 remotestr = nfs4_pnfs_remotestr(dsaddrs, gfp_flags);
502
503 spin_lock(&nfs4_ds_cache_lock);
504 tmp_ds = _data_server_lookup_locked(dsaddrs);
505 if (tmp_ds == NULL) {
506 INIT_LIST_HEAD(&ds->ds_addrs);
507 list_splice_init(dsaddrs, &ds->ds_addrs);
508 ds->ds_remotestr = remotestr;
509 atomic_set(&ds->ds_count, 1);
510 INIT_LIST_HEAD(&ds->ds_node);
511 ds->ds_clp = NULL;
512 list_add(&ds->ds_node, &nfs4_data_server_cache);
513 dprintk("%s add new data server %s\n", __func__,
514 ds->ds_remotestr);
515 } else {
516 kfree(remotestr);
517 kfree(ds);
518 atomic_inc(&tmp_ds->ds_count);
519 dprintk("%s data server %s found, inc'ed ds_count to %d\n",
520 __func__, tmp_ds->ds_remotestr,
521 atomic_read(&tmp_ds->ds_count));
522 ds = tmp_ds;
523 }
524 spin_unlock(&nfs4_ds_cache_lock);
525out:
526 return ds;
527}
528EXPORT_SYMBOL_GPL(nfs4_pnfs_ds_add);
529
530static void nfs4_wait_ds_connect(struct nfs4_pnfs_ds *ds)
531{
532 might_sleep();
533 wait_on_bit(&ds->ds_state, NFS4DS_CONNECTING,
534 TASK_KILLABLE);
535}
536
537static void nfs4_clear_ds_conn_bit(struct nfs4_pnfs_ds *ds)
538{
539 smp_mb__before_atomic();
540 clear_bit(NFS4DS_CONNECTING, &ds->ds_state);
541 smp_mb__after_atomic();
542 wake_up_bit(&ds->ds_state, NFS4DS_CONNECTING);
543}
544
545static struct nfs_client *(*get_v3_ds_connect)(
546 struct nfs_client *mds_clp,
547 const struct sockaddr *ds_addr,
548 int ds_addrlen,
549 int ds_proto,
550 unsigned int ds_timeo,
551 unsigned int ds_retrans,
552 rpc_authflavor_t au_flavor);
553
554static bool load_v3_ds_connect(void)
555{
556 if (!get_v3_ds_connect) {
557 get_v3_ds_connect = symbol_request(nfs3_set_ds_client);
558 WARN_ON_ONCE(!get_v3_ds_connect);
559 }
560
561 return(get_v3_ds_connect != NULL);
562}
563
564void __exit nfs4_pnfs_v3_ds_connect_unload(void)
565{
566 if (get_v3_ds_connect) {
567 symbol_put(nfs3_set_ds_client);
568 get_v3_ds_connect = NULL;
569 }
570}
571EXPORT_SYMBOL_GPL(nfs4_pnfs_v3_ds_connect_unload);
572
573static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv,
574 struct nfs4_pnfs_ds *ds,
575 unsigned int timeo,
576 unsigned int retrans,
577 rpc_authflavor_t au_flavor)
578{
579 struct nfs_client *clp = ERR_PTR(-EIO);
580 struct nfs4_pnfs_ds_addr *da;
581 int status = 0;
582
583 dprintk("--> %s DS %s au_flavor %d\n", __func__,
584 ds->ds_remotestr, au_flavor);
585
586 if (!load_v3_ds_connect())
587 goto out;
588
589 list_for_each_entry(da, &ds->ds_addrs, da_node) {
590 dprintk("%s: DS %s: trying address %s\n",
591 __func__, ds->ds_remotestr, da->da_remotestr);
592
593 clp = get_v3_ds_connect(mds_srv->nfs_client,
594 (struct sockaddr *)&da->da_addr,
595 da->da_addrlen, IPPROTO_TCP,
596 timeo, retrans, au_flavor);
597 if (!IS_ERR(clp))
598 break;
599 }
600
601 if (IS_ERR(clp)) {
602 status = PTR_ERR(clp);
603 goto out;
604 }
605
606 smp_wmb();
607 ds->ds_clp = clp;
608 dprintk("%s [new] addr: %s\n", __func__, ds->ds_remotestr);
609out:
610 return status;
611}
612
613static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv,
614 struct nfs4_pnfs_ds *ds,
615 unsigned int timeo,
616 unsigned int retrans,
617 u32 minor_version,
618 rpc_authflavor_t au_flavor)
619{
620 struct nfs_client *clp = ERR_PTR(-EIO);
621 struct nfs4_pnfs_ds_addr *da;
622 int status = 0;
623
624 dprintk("--> %s DS %s au_flavor %d\n", __func__, ds->ds_remotestr,
625 au_flavor);
626
627 list_for_each_entry(da, &ds->ds_addrs, da_node) {
628 dprintk("%s: DS %s: trying address %s\n",
629 __func__, ds->ds_remotestr, da->da_remotestr);
630
631 clp = nfs4_set_ds_client(mds_srv->nfs_client,
632 (struct sockaddr *)&da->da_addr,
633 da->da_addrlen, IPPROTO_TCP,
634 timeo, retrans, minor_version,
635 au_flavor);
636 if (!IS_ERR(clp))
637 break;
638 }
639
640 if (IS_ERR(clp)) {
641 status = PTR_ERR(clp);
642 goto out;
643 }
644
645 status = nfs4_init_ds_session(clp, mds_srv->nfs_client->cl_lease_time);
646 if (status)
647 goto out_put;
648
649 smp_wmb();
650 ds->ds_clp = clp;
651 dprintk("%s [new] addr: %s\n", __func__, ds->ds_remotestr);
652out:
653 return status;
654out_put:
655 nfs_put_client(clp);
656 goto out;
657}
658
659/*
660 * Create an rpc connection to the nfs4_pnfs_ds data server.
661 * Currently only supports IPv4 and IPv6 addresses.
662 * If connection fails, make devid unavailable.
663 */
664void nfs4_pnfs_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds,
665 struct nfs4_deviceid_node *devid, unsigned int timeo,
666 unsigned int retrans, u32 version,
667 u32 minor_version, rpc_authflavor_t au_flavor)
668{
669 if (test_and_set_bit(NFS4DS_CONNECTING, &ds->ds_state) == 0) {
670 int err = 0;
671
672 if (version == 3) {
673 err = _nfs4_pnfs_v3_ds_connect(mds_srv, ds, timeo,
674 retrans, au_flavor);
675 } else if (version == 4) {
676 err = _nfs4_pnfs_v4_ds_connect(mds_srv, ds, timeo,
677 retrans, minor_version,
678 au_flavor);
679 } else {
680 dprintk("%s: unsupported DS version %d\n", __func__,
681 version);
682 err = -EPROTONOSUPPORT;
683 }
684
685 if (err)
686 nfs4_mark_deviceid_unavailable(devid);
687 nfs4_clear_ds_conn_bit(ds);
688 } else {
689 nfs4_wait_ds_connect(ds);
690 }
691}
692EXPORT_SYMBOL_GPL(nfs4_pnfs_ds_connect);
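Since the NFS4DS_CONNECTING bit serializes racing callers (losers simply block in nfs4_wait_ds_connect()), a driver may call this unconditionally and test the outcome afterwards. A sketch of the expected calling pattern, using the default timeout parameters defined earlier in pnfs.h (illustrative, not lifted from a particular driver):

	nfs4_pnfs_ds_connect(mds_srv, ds, devid, NFS4_DEF_DS_TIMEO,
			     NFS4_DEF_DS_RETRANS, 3 /* NFSv3 DS */, 0,
			     au_flavor);
	smp_rmb();	/* pairs with the smp_wmb() before ds->ds_clp is set */
	if (!ds->ds_clp || nfs4_test_deviceid_unavailable(devid))
		goto fall_back_to_mds;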
693
694/*
 695 * Currently only supports IPv4, IPv6 and one multi-path address.
696 */
697struct nfs4_pnfs_ds_addr *
698nfs4_decode_mp_ds_addr(struct net *net, struct xdr_stream *xdr, gfp_t gfp_flags)
699{
700 struct nfs4_pnfs_ds_addr *da = NULL;
701 char *buf, *portstr;
702 __be16 port;
703 int nlen, rlen;
704 int tmp[2];
705 __be32 *p;
706 char *netid, *match_netid;
707 size_t len, match_netid_len;
708 char *startsep = "";
709 char *endsep = "";
710
711
712 /* r_netid */
713 p = xdr_inline_decode(xdr, 4);
714 if (unlikely(!p))
715 goto out_err;
716 nlen = be32_to_cpup(p++);
717
718 p = xdr_inline_decode(xdr, nlen);
719 if (unlikely(!p))
720 goto out_err;
721
722 netid = kmalloc(nlen+1, gfp_flags);
723 if (unlikely(!netid))
724 goto out_err;
725
726 netid[nlen] = '\0';
727 memcpy(netid, p, nlen);
728
729 /* r_addr: ip/ip6addr with port in dec octets - see RFC 5665 */
730 p = xdr_inline_decode(xdr, 4);
731 if (unlikely(!p))
732 goto out_free_netid;
733 rlen = be32_to_cpup(p);
734
735 p = xdr_inline_decode(xdr, rlen);
736 if (unlikely(!p))
737 goto out_free_netid;
738
739 /* port is ".ABC.DEF", 8 chars max */
740 if (rlen > INET6_ADDRSTRLEN + IPV6_SCOPE_ID_LEN + 8) {
741 dprintk("%s: Invalid address, length %d\n", __func__,
742 rlen);
743 goto out_free_netid;
744 }
745 buf = kmalloc(rlen + 1, gfp_flags);
746 if (!buf) {
747 dprintk("%s: Not enough memory\n", __func__);
748 goto out_free_netid;
749 }
750 buf[rlen] = '\0';
751 memcpy(buf, p, rlen);
752
753 /* replace port '.' with '-' */
754 portstr = strrchr(buf, '.');
755 if (!portstr) {
756 dprintk("%s: Failed finding expected dot in port\n",
757 __func__);
758 goto out_free_buf;
759 }
760 *portstr = '-';
761
762 /* find '.' between address and port */
763 portstr = strrchr(buf, '.');
764 if (!portstr) {
765 dprintk("%s: Failed finding expected dot between address and "
766 "port\n", __func__);
767 goto out_free_buf;
768 }
769 *portstr = '\0';
770
771 da = kzalloc(sizeof(*da), gfp_flags);
772 if (unlikely(!da))
773 goto out_free_buf;
774
775 INIT_LIST_HEAD(&da->da_node);
776
777 if (!rpc_pton(net, buf, portstr-buf, (struct sockaddr *)&da->da_addr,
778 sizeof(da->da_addr))) {
779 dprintk("%s: error parsing address %s\n", __func__, buf);
780 goto out_free_da;
781 }
782
783 portstr++;
784 sscanf(portstr, "%d-%d", &tmp[0], &tmp[1]);
785 port = htons((tmp[0] << 8) | (tmp[1]));
786
787 switch (da->da_addr.ss_family) {
788 case AF_INET:
789 ((struct sockaddr_in *)&da->da_addr)->sin_port = port;
790 da->da_addrlen = sizeof(struct sockaddr_in);
791 match_netid = "tcp";
792 match_netid_len = 3;
793 break;
794
795 case AF_INET6:
796 ((struct sockaddr_in6 *)&da->da_addr)->sin6_port = port;
797 da->da_addrlen = sizeof(struct sockaddr_in6);
798 match_netid = "tcp6";
799 match_netid_len = 4;
800 startsep = "[";
801 endsep = "]";
802 break;
803
804 default:
805 dprintk("%s: unsupported address family: %u\n",
806 __func__, da->da_addr.ss_family);
807 goto out_free_da;
808 }
809
810 if (nlen != match_netid_len || strncmp(netid, match_netid, nlen)) {
811 dprintk("%s: ERROR: r_netid \"%s\" != \"%s\"\n",
812 __func__, netid, match_netid);
813 goto out_free_da;
814 }
815
816 /* save human readable address */
817 len = strlen(startsep) + strlen(buf) + strlen(endsep) + 7;
818 da->da_remotestr = kzalloc(len, gfp_flags);
819
820 /* NULL is ok, only used for dprintk */
821 if (da->da_remotestr)
822 snprintf(da->da_remotestr, len, "%s%s%s:%u", startsep,
823 buf, endsep, ntohs(port));
824
825 dprintk("%s: Parsed DS addr %s\n", __func__, da->da_remotestr);
826 kfree(buf);
827 kfree(netid);
828 return da;
829
830out_free_da:
831 kfree(da);
832out_free_buf:
833 dprintk("%s: Error parsing DS addr: %s\n", __func__, buf);
834 kfree(buf);
835out_free_netid:
836 kfree(netid);
837out_err:
838 return NULL;
839}
840EXPORT_SYMBOL_GPL(nfs4_decode_mp_ds_addr);
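Worked example of the r_addr handling above: an r_addr of "10.1.2.3.8.1" has its last dot rewritten to '-' and its second-to-last dot replaced by a NUL, so rpc_pton() parses "10.1.2.3" while sscanf() reads "8-1" and computes the port as (8 << 8) | 1 = 2049, the standard NFS port.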
841
842void
843pnfs_layout_mark_request_commit(struct nfs_page *req,
844 struct pnfs_layout_segment *lseg,
845 struct nfs_commit_info *cinfo,
846 u32 ds_commit_idx)
847{
848 struct list_head *list;
849 struct pnfs_commit_bucket *buckets;
850
851 spin_lock(cinfo->lock);
852 buckets = cinfo->ds->buckets;
853 list = &buckets[ds_commit_idx].written;
854 if (list_empty(list)) {
855 /* Non-empty buckets hold a reference on the lseg. That ref
856 * is normally transferred to the COMMIT call and released
857 * there. It could also be released if the last req is pulled
858 * off due to a rewrite, in which case it will be done in
 859	 * pnfs_generic_clear_request_commit
860 */
861 WARN_ON_ONCE(buckets[ds_commit_idx].wlseg != NULL);
862 buckets[ds_commit_idx].wlseg = pnfs_get_lseg(lseg);
863 }
864 set_bit(PG_COMMIT_TO_DS, &req->wb_flags);
865 cinfo->ds->nwritten++;
866 spin_unlock(cinfo->lock);
867
868 nfs_request_add_commit_list(req, list, cinfo);
869}
870EXPORT_SYMBOL_GPL(pnfs_layout_mark_request_commit);
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index c91a4799c562..568ecf0a880f 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -70,8 +70,15 @@ EXPORT_SYMBOL_GPL(nfs_pageio_init_read);
70 70
71void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio) 71void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio)
72{ 72{
73 struct nfs_pgio_mirror *mirror;
74
73 pgio->pg_ops = &nfs_pgio_rw_ops; 75 pgio->pg_ops = &nfs_pgio_rw_ops;
74 pgio->pg_bsize = NFS_SERVER(pgio->pg_inode)->rsize; 76
77 /* read path should never have more than one mirror */
78 WARN_ON_ONCE(pgio->pg_mirror_count != 1);
79
80 mirror = &pgio->pg_mirrors[0];
81 mirror->pg_bsize = NFS_SERVER(pgio->pg_inode)->rsize;
75} 82}
76EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds); 83EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds);
77 84
@@ -81,6 +88,7 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
81 struct nfs_page *new; 88 struct nfs_page *new;
82 unsigned int len; 89 unsigned int len;
83 struct nfs_pageio_descriptor pgio; 90 struct nfs_pageio_descriptor pgio;
91 struct nfs_pgio_mirror *pgm;
84 92
85 len = nfs_page_length(page); 93 len = nfs_page_length(page);
86 if (len == 0) 94 if (len == 0)
@@ -97,7 +105,13 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
97 &nfs_async_read_completion_ops); 105 &nfs_async_read_completion_ops);
98 nfs_pageio_add_request(&pgio, new); 106 nfs_pageio_add_request(&pgio, new);
99 nfs_pageio_complete(&pgio); 107 nfs_pageio_complete(&pgio);
100 NFS_I(inode)->read_io += pgio.pg_bytes_written; 108
109 /* It doesn't make sense to do mirrored reads! */
110 WARN_ON_ONCE(pgio.pg_mirror_count != 1);
111
112 pgm = &pgio.pg_mirrors[0];
113 NFS_I(inode)->read_io += pgm->pg_bytes_written;
114
101 return 0; 115 return 0;
102} 116}
103 117
@@ -168,13 +182,14 @@ out:
168 182
169static void nfs_initiate_read(struct nfs_pgio_header *hdr, 183static void nfs_initiate_read(struct nfs_pgio_header *hdr,
170 struct rpc_message *msg, 184 struct rpc_message *msg,
185 const struct nfs_rpc_ops *rpc_ops,
171 struct rpc_task_setup *task_setup_data, int how) 186 struct rpc_task_setup *task_setup_data, int how)
172{ 187{
173 struct inode *inode = hdr->inode; 188 struct inode *inode = hdr->inode;
174 int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0; 189 int swap_flags = IS_SWAPFILE(inode) ? NFS_RPC_SWAPFLAGS : 0;
175 190
176 task_setup_data->flags |= swap_flags; 191 task_setup_data->flags |= swap_flags;
177 NFS_PROTO(inode)->read_setup(hdr, msg); 192 rpc_ops->read_setup(hdr, msg);
178} 193}
179 194
180static void 195static void
@@ -351,6 +366,7 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
351 struct list_head *pages, unsigned nr_pages) 366 struct list_head *pages, unsigned nr_pages)
352{ 367{
353 struct nfs_pageio_descriptor pgio; 368 struct nfs_pageio_descriptor pgio;
369 struct nfs_pgio_mirror *pgm;
354 struct nfs_readdesc desc = { 370 struct nfs_readdesc desc = {
355 .pgio = &pgio, 371 .pgio = &pgio,
356 }; 372 };
@@ -386,10 +402,15 @@ int nfs_readpages(struct file *filp, struct address_space *mapping,
386 &nfs_async_read_completion_ops); 402 &nfs_async_read_completion_ops);
387 403
388 ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc); 404 ret = read_cache_pages(mapping, pages, readpage_async_filler, &desc);
389
390 nfs_pageio_complete(&pgio); 405 nfs_pageio_complete(&pgio);
391 NFS_I(inode)->read_io += pgio.pg_bytes_written; 406
392 npages = (pgio.pg_bytes_written + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; 407 /* It doesn't make sense to do mirrored reads! */
408 WARN_ON_ONCE(pgio.pg_mirror_count != 1);
409
410 pgm = &pgio.pg_mirrors[0];
411 NFS_I(inode)->read_io += pgm->pg_bytes_written;
412 npages = (pgm->pg_bytes_written + PAGE_CACHE_SIZE - 1) >>
413 PAGE_CACHE_SHIFT;
393 nfs_add_stats(inode, NFSIOS_READPAGES, npages); 414 nfs_add_stats(inode, NFSIOS_READPAGES, npages);
394read_complete: 415read_complete:
395 put_nfs_open_context(desc.ctx); 416 put_nfs_open_context(desc.ctx);
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 31a11b0e885d..322b2de02988 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -311,7 +311,6 @@ const struct super_operations nfs_sops = {
311 .destroy_inode = nfs_destroy_inode, 311 .destroy_inode = nfs_destroy_inode,
312 .write_inode = nfs_write_inode, 312 .write_inode = nfs_write_inode,
313 .drop_inode = nfs_drop_inode, 313 .drop_inode = nfs_drop_inode,
314 .put_super = nfs_put_super,
315 .statfs = nfs_statfs, 314 .statfs = nfs_statfs,
316 .evict_inode = nfs_evict_inode, 315 .evict_inode = nfs_evict_inode,
317 .umount_begin = nfs_umount_begin, 316 .umount_begin = nfs_umount_begin,
@@ -405,12 +404,15 @@ void __exit unregister_nfs_fs(void)
405 unregister_filesystem(&nfs_fs_type); 404 unregister_filesystem(&nfs_fs_type);
406} 405}
407 406
408void nfs_sb_active(struct super_block *sb) 407bool nfs_sb_active(struct super_block *sb)
409{ 408{
410 struct nfs_server *server = NFS_SB(sb); 409 struct nfs_server *server = NFS_SB(sb);
411 410
412 if (atomic_inc_return(&server->active) == 1) 411 if (!atomic_inc_not_zero(&sb->s_active))
413 atomic_inc(&sb->s_active); 412 return false;
413 if (atomic_inc_return(&server->active) != 1)
414 atomic_dec(&sb->s_active);
415 return true;
414} 416}
415EXPORT_SYMBOL_GPL(nfs_sb_active); 417EXPORT_SYMBOL_GPL(nfs_sb_active);
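The reworked nfs_sb_active() can now fail: atomic_inc_not_zero() refuses to pin a superblock whose s_active count has already dropped to zero during umount. Callers therefore must check the return value before touching the superblock; a sketch of the expected pattern, assuming the existing nfs_sb_deactive() counterpart:

	if (!nfs_sb_active(sb))
		return;			/* sb is already being torn down */
	/* ... the superblock is now safely pinned ... */
	nfs_sb_deactive(sb);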
416 418
@@ -2569,7 +2571,7 @@ struct dentry *nfs_fs_mount_common(struct nfs_server *server,
2569 error = nfs_bdi_register(server); 2571 error = nfs_bdi_register(server);
2570 if (error) { 2572 if (error) {
2571 mntroot = ERR_PTR(error); 2573 mntroot = ERR_PTR(error);
2572 goto error_splat_bdi; 2574 goto error_splat_super;
2573 } 2575 }
2574 server->super = s; 2576 server->super = s;
2575 } 2577 }
@@ -2601,9 +2603,6 @@ error_splat_root:
2601 dput(mntroot); 2603 dput(mntroot);
2602 mntroot = ERR_PTR(error); 2604 mntroot = ERR_PTR(error);
2603error_splat_super: 2605error_splat_super:
2604 if (server && !s->s_root)
2605 bdi_unregister(&server->backing_dev_info);
2606error_splat_bdi:
2607 deactivate_locked_super(s); 2606 deactivate_locked_super(s);
2608 goto out; 2607 goto out;
2609} 2608}
@@ -2651,27 +2650,19 @@ out:
2651EXPORT_SYMBOL_GPL(nfs_fs_mount); 2650EXPORT_SYMBOL_GPL(nfs_fs_mount);
2652 2651
2653/* 2652/*
2654 * Ensure that we unregister the bdi before kill_anon_super
2655 * releases the device name
2656 */
2657void nfs_put_super(struct super_block *s)
2658{
2659 struct nfs_server *server = NFS_SB(s);
2660
2661 bdi_unregister(&server->backing_dev_info);
2662}
2663EXPORT_SYMBOL_GPL(nfs_put_super);
2664
2665/*
2666 * Destroy an NFS2/3 superblock 2653 * Destroy an NFS2/3 superblock
2667 */ 2654 */
2668void nfs_kill_super(struct super_block *s) 2655void nfs_kill_super(struct super_block *s)
2669{ 2656{
2670 struct nfs_server *server = NFS_SB(s); 2657 struct nfs_server *server = NFS_SB(s);
2658 dev_t dev = s->s_dev;
2659
2660 generic_shutdown_super(s);
2671 2661
2672 kill_anon_super(s);
2673 nfs_fscache_release_super_cookie(s); 2662 nfs_fscache_release_super_cookie(s);
2663
2674 nfs_free_server(server); 2664 nfs_free_server(server);
2665 free_anon_bdev(dev);
2675} 2666}
2676EXPORT_SYMBOL_GPL(nfs_kill_super); 2667EXPORT_SYMBOL_GPL(nfs_kill_super);
2677 2668
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index af3af685a9e3..595d81e354d1 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -473,13 +473,18 @@ try_again:
473 do { 473 do {
474 /* 474 /*
475 * Subrequests are always contiguous, non overlapping 475 * Subrequests are always contiguous, non overlapping
476 * and in order. If not, it's a programming error. 476 * and in order - but may be repeated (mirrored writes).
477 */ 477 */
478 WARN_ON_ONCE(subreq->wb_offset != 478 if (subreq->wb_offset == (head->wb_offset + total_bytes)) {
479 (head->wb_offset + total_bytes)); 479 /* keep track of how many bytes this group covers */
480 480 total_bytes += subreq->wb_bytes;
481 /* keep track of how many bytes this group covers */ 481 } else if (WARN_ON_ONCE(subreq->wb_offset < head->wb_offset ||
482 total_bytes += subreq->wb_bytes; 482 ((subreq->wb_offset + subreq->wb_bytes) >
483 (head->wb_offset + total_bytes)))) {
484 nfs_page_group_unlock(head);
485 spin_unlock(&inode->i_lock);
486 return ERR_PTR(-EIO);
487 }
483 488
484 if (!nfs_lock_request(subreq)) { 489 if (!nfs_lock_request(subreq)) {
485 /* releases page group bit lock and 490 /* releases page group bit lock and
@@ -784,13 +789,8 @@ nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst,
784 nfs_list_add_request(req, dst); 789 nfs_list_add_request(req, dst);
785 cinfo->mds->ncommit++; 790 cinfo->mds->ncommit++;
786 spin_unlock(cinfo->lock); 791 spin_unlock(cinfo->lock);
787 if (!cinfo->dreq) { 792 if (!cinfo->dreq)
788 inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); 793 nfs_mark_page_unstable(req->wb_page);
789 inc_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info,
790 BDI_RECLAIMABLE);
791 __mark_inode_dirty(req->wb_context->dentry->d_inode,
792 I_DIRTY_DATASYNC);
793 }
794} 794}
795EXPORT_SYMBOL_GPL(nfs_request_add_commit_list); 795EXPORT_SYMBOL_GPL(nfs_request_add_commit_list);
796 796
@@ -842,9 +842,9 @@ EXPORT_SYMBOL_GPL(nfs_init_cinfo);
842 */ 842 */
843void 843void
844nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg, 844nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg,
845 struct nfs_commit_info *cinfo) 845 struct nfs_commit_info *cinfo, u32 ds_commit_idx)
846{ 846{
847 if (pnfs_mark_request_commit(req, lseg, cinfo)) 847 if (pnfs_mark_request_commit(req, lseg, cinfo, ds_commit_idx))
848 return; 848 return;
849 nfs_request_add_commit_list(req, &cinfo->mds->list, cinfo); 849 nfs_request_add_commit_list(req, &cinfo->mds->list, cinfo);
850} 850}
@@ -853,7 +853,7 @@ static void
853nfs_clear_page_commit(struct page *page) 853nfs_clear_page_commit(struct page *page)
854{ 854{
855 dec_zone_page_state(page, NR_UNSTABLE_NFS); 855 dec_zone_page_state(page, NR_UNSTABLE_NFS);
856 dec_bdi_stat(page_file_mapping(page)->backing_dev_info, BDI_RECLAIMABLE); 856 dec_bdi_stat(inode_to_bdi(page_file_mapping(page)->host), BDI_RECLAIMABLE);
857} 857}
858 858
859/* Called holding inode (/cinfo) lock */ 859/* Called holding inode (/cinfo) lock */
@@ -900,7 +900,8 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr)
900 } 900 }
901 if (nfs_write_need_commit(hdr)) { 901 if (nfs_write_need_commit(hdr)) {
902 memcpy(&req->wb_verf, &hdr->verf.verifier, sizeof(req->wb_verf)); 902 memcpy(&req->wb_verf, &hdr->verf.verifier, sizeof(req->wb_verf));
903 nfs_mark_request_commit(req, hdr->lseg, &cinfo); 903 nfs_mark_request_commit(req, hdr->lseg, &cinfo,
904 hdr->pgio_mirror_idx);
904 goto next; 905 goto next;
905 } 906 }
906remove_req: 907remove_req:
@@ -1091,6 +1092,7 @@ int nfs_flush_incompatible(struct file *file, struct page *page)
1091{ 1092{
1092 struct nfs_open_context *ctx = nfs_file_open_context(file); 1093 struct nfs_open_context *ctx = nfs_file_open_context(file);
1093 struct nfs_lock_context *l_ctx; 1094 struct nfs_lock_context *l_ctx;
1095 struct file_lock_context *flctx = file_inode(file)->i_flctx;
1094 struct nfs_page *req; 1096 struct nfs_page *req;
1095 int do_flush, status; 1097 int do_flush, status;
1096 /* 1098 /*
@@ -1109,7 +1111,9 @@ int nfs_flush_incompatible(struct file *file, struct page *page)
1109 do_flush = req->wb_page != page || req->wb_context != ctx; 1111 do_flush = req->wb_page != page || req->wb_context != ctx;
1110 /* for now, flush if more than 1 request in page_group */ 1112 /* for now, flush if more than 1 request in page_group */
1111 do_flush |= req->wb_this_page != req; 1113 do_flush |= req->wb_this_page != req;
1112 if (l_ctx && ctx->dentry->d_inode->i_flock != NULL) { 1114 if (l_ctx && flctx &&
1115 !(list_empty_careful(&flctx->flc_posix) &&
1116 list_empty_careful(&flctx->flc_flock))) {
1113 do_flush |= l_ctx->lockowner.l_owner != current->files 1117 do_flush |= l_ctx->lockowner.l_owner != current->files
1114 || l_ctx->lockowner.l_pid != current->tgid; 1118 || l_ctx->lockowner.l_pid != current->tgid;
1115 } 1119 }
@@ -1170,6 +1174,13 @@ out:
1170 return PageUptodate(page) != 0; 1174 return PageUptodate(page) != 0;
1171} 1175}
1172 1176
1177static bool
1178is_whole_file_wrlock(struct file_lock *fl)
1179{
1180 return fl->fl_start == 0 && fl->fl_end == OFFSET_MAX &&
1181 fl->fl_type == F_WRLCK;
1182}
1183
1173/* If we know the page is up to date, and we're not using byte range locks (or 1184/* If we know the page is up to date, and we're not using byte range locks (or
1174 * if we have the whole file locked for writing), it may be more efficient to 1185 * if we have the whole file locked for writing), it may be more efficient to
1175 * extend the write to cover the entire page in order to avoid fragmentation 1186 * extend the write to cover the entire page in order to avoid fragmentation
@@ -1180,17 +1191,36 @@ out:
1180 */ 1191 */
1181static int nfs_can_extend_write(struct file *file, struct page *page, struct inode *inode) 1192static int nfs_can_extend_write(struct file *file, struct page *page, struct inode *inode)
1182{ 1193{
1194 int ret;
1195 struct file_lock_context *flctx = inode->i_flctx;
1196 struct file_lock *fl;
1197
1183 if (file->f_flags & O_DSYNC) 1198 if (file->f_flags & O_DSYNC)
1184 return 0; 1199 return 0;
1185 if (!nfs_write_pageuptodate(page, inode)) 1200 if (!nfs_write_pageuptodate(page, inode))
1186 return 0; 1201 return 0;
1187 if (NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE)) 1202 if (NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE))
1188 return 1; 1203 return 1;
1189 if (inode->i_flock == NULL || (inode->i_flock->fl_start == 0 && 1204 if (!flctx || (list_empty_careful(&flctx->flc_flock) &&
1190 inode->i_flock->fl_end == OFFSET_MAX && 1205 list_empty_careful(&flctx->flc_posix)))
1191 inode->i_flock->fl_type != F_RDLCK)) 1206 return 0;
1192 return 1; 1207
1193 return 0; 1208 /* Check to see if there are whole file write locks */
1209 ret = 0;
1210 spin_lock(&flctx->flc_lock);
1211 if (!list_empty(&flctx->flc_posix)) {
1212 fl = list_first_entry(&flctx->flc_posix, struct file_lock,
1213 fl_list);
1214 if (is_whole_file_wrlock(fl))
1215 ret = 1;
1216 } else if (!list_empty(&flctx->flc_flock)) {
1217 fl = list_first_entry(&flctx->flc_flock, struct file_lock,
1218 fl_list);
1219 if (fl->fl_type == F_WRLCK)
1220 ret = 1;
1221 }
1222 spin_unlock(&flctx->flc_lock);
1223 return ret;
1194} 1224}
1195 1225
1196/* 1226/*
@@ -1240,15 +1270,15 @@ static int flush_task_priority(int how)
1240 1270
1241static void nfs_initiate_write(struct nfs_pgio_header *hdr, 1271static void nfs_initiate_write(struct nfs_pgio_header *hdr,
1242 struct rpc_message *msg, 1272 struct rpc_message *msg,
1273 const struct nfs_rpc_ops *rpc_ops,
1243 struct rpc_task_setup *task_setup_data, int how) 1274 struct rpc_task_setup *task_setup_data, int how)
1244{ 1275{
1245 struct inode *inode = hdr->inode;
1246 int priority = flush_task_priority(how); 1276 int priority = flush_task_priority(how);
1247 1277
1248 task_setup_data->priority = priority; 1278 task_setup_data->priority = priority;
1249 NFS_PROTO(inode)->write_setup(hdr, msg); 1279 rpc_ops->write_setup(hdr, msg);
1250 1280
1251 nfs4_state_protect_write(NFS_SERVER(inode)->nfs_client, 1281 nfs4_state_protect_write(NFS_SERVER(hdr->inode)->nfs_client,
1252 &task_setup_data->rpc_client, msg, hdr); 1282 &task_setup_data->rpc_client, msg, hdr);
1253} 1283}
1254 1284
@@ -1298,8 +1328,14 @@ EXPORT_SYMBOL_GPL(nfs_pageio_init_write);
1298 1328
1299void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio) 1329void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio)
1300{ 1330{
1331 struct nfs_pgio_mirror *mirror;
1332
1301 pgio->pg_ops = &nfs_pgio_rw_ops; 1333 pgio->pg_ops = &nfs_pgio_rw_ops;
1302 pgio->pg_bsize = NFS_SERVER(pgio->pg_inode)->wsize; 1334
1335 nfs_pageio_stop_mirroring(pgio);
1336
1337 mirror = &pgio->pg_mirrors[0];
1338 mirror->pg_bsize = NFS_SERVER(pgio->pg_inode)->wsize;
1303} 1339}
1304EXPORT_SYMBOL_GPL(nfs_pageio_reset_write_mds); 1340EXPORT_SYMBOL_GPL(nfs_pageio_reset_write_mds);
1305 1341
@@ -1465,6 +1501,7 @@ void nfs_commitdata_release(struct nfs_commit_data *data)
1465EXPORT_SYMBOL_GPL(nfs_commitdata_release); 1501EXPORT_SYMBOL_GPL(nfs_commitdata_release);
1466 1502
1467int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data, 1503int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data,
1504 const struct nfs_rpc_ops *nfs_ops,
1468 const struct rpc_call_ops *call_ops, 1505 const struct rpc_call_ops *call_ops,
1469 int how, int flags) 1506 int how, int flags)
1470{ 1507{
@@ -1486,7 +1523,7 @@ int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data,
1486 .priority = priority, 1523 .priority = priority,
1487 }; 1524 };
1488 /* Set up the initial task struct. */ 1525 /* Set up the initial task struct. */
1489 NFS_PROTO(data->inode)->commit_setup(data, &msg); 1526 nfs_ops->commit_setup(data, &msg);
1490 1527
1491 dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); 1528 dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid);
1492 1529
@@ -1554,19 +1591,17 @@ EXPORT_SYMBOL_GPL(nfs_init_commit);
1554 1591
1555void nfs_retry_commit(struct list_head *page_list, 1592void nfs_retry_commit(struct list_head *page_list,
1556 struct pnfs_layout_segment *lseg, 1593 struct pnfs_layout_segment *lseg,
1557 struct nfs_commit_info *cinfo) 1594 struct nfs_commit_info *cinfo,
1595 u32 ds_commit_idx)
1558{ 1596{
1559 struct nfs_page *req; 1597 struct nfs_page *req;
1560 1598
1561 while (!list_empty(page_list)) { 1599 while (!list_empty(page_list)) {
1562 req = nfs_list_entry(page_list->next); 1600 req = nfs_list_entry(page_list->next);
1563 nfs_list_remove_request(req); 1601 nfs_list_remove_request(req);
1564 nfs_mark_request_commit(req, lseg, cinfo); 1602 nfs_mark_request_commit(req, lseg, cinfo, ds_commit_idx);
1565 if (!cinfo->dreq) { 1603 if (!cinfo->dreq)
1566 dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); 1604 nfs_clear_page_commit(req->wb_page);
1567 dec_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info,
1568 BDI_RECLAIMABLE);
1569 }
1570 nfs_unlock_and_release_request(req); 1605 nfs_unlock_and_release_request(req);
1571 } 1606 }
1572} 1607}
@@ -1589,10 +1624,10 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how,
1589 /* Set up the argument struct */ 1624 /* Set up the argument struct */
1590 nfs_init_commit(data, head, NULL, cinfo); 1625 nfs_init_commit(data, head, NULL, cinfo);
1591 atomic_inc(&cinfo->mds->rpcs_out); 1626 atomic_inc(&cinfo->mds->rpcs_out);
1592 return nfs_initiate_commit(NFS_CLIENT(inode), data, data->mds_ops, 1627 return nfs_initiate_commit(NFS_CLIENT(inode), data, NFS_PROTO(inode),
1593 how, 0); 1628 data->mds_ops, how, 0);
1594 out_bad: 1629 out_bad:
1595 nfs_retry_commit(head, NULL, cinfo); 1630 nfs_retry_commit(head, NULL, cinfo, 0);
1596 cinfo->completion_ops->error_cleanup(NFS_I(inode)); 1631 cinfo->completion_ops->error_cleanup(NFS_I(inode));
1597 return -ENOMEM; 1632 return -ENOMEM;
1598} 1633}
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index 73395156bdb4..683bf718aead 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -82,6 +82,16 @@ config NFSD_V4
82 82
83 If unsure, say N. 83 If unsure, say N.
84 84
85config NFSD_PNFS
86 bool "NFSv4.1 server support for Parallel NFS (pNFS)"
87 depends on NFSD_V4
88 help
89 This option enables support for the parallel NFS features of the
90 minor version 1 of the NFSv4 protocol (RFC5661) in the kernel's NFS
91 server.
92
93 If unsure, say N.
94
85config NFSD_V4_SECURITY_LABEL 95config NFSD_V4_SECURITY_LABEL
86 bool "Provide Security Label support for NFSv4 server" 96 bool "Provide Security Label support for NFSv4 server"
87 depends on NFSD_V4 && SECURITY 97 depends on NFSD_V4 && SECURITY
diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile
index af32ef06b4fe..9a6028e120c6 100644
--- a/fs/nfsd/Makefile
+++ b/fs/nfsd/Makefile
@@ -2,9 +2,14 @@
2# Makefile for the Linux nfs server 2# Makefile for the Linux nfs server
3# 3#
4 4
5ccflags-y += -I$(src) # needed for trace events
6
5obj-$(CONFIG_NFSD) += nfsd.o 7obj-$(CONFIG_NFSD) += nfsd.o
6 8
7nfsd-y := nfssvc.o nfsctl.o nfsproc.o nfsfh.o vfs.o \ 9# this one should be compiled first, as the tracing macros can easily blow up
10nfsd-y += trace.o
11
12nfsd-y += nfssvc.o nfsctl.o nfsproc.o nfsfh.o vfs.o \
8 export.o auth.o lockd.o nfscache.o nfsxdr.o stats.o 13 export.o auth.o lockd.o nfscache.o nfsxdr.o stats.o
9nfsd-$(CONFIG_NFSD_FAULT_INJECTION) += fault_inject.o 14nfsd-$(CONFIG_NFSD_FAULT_INJECTION) += fault_inject.o
10nfsd-$(CONFIG_NFSD_V2_ACL) += nfs2acl.o 15nfsd-$(CONFIG_NFSD_V2_ACL) += nfs2acl.o
@@ -12,3 +17,4 @@ nfsd-$(CONFIG_NFSD_V3) += nfs3proc.o nfs3xdr.o
12nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o 17nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o
13nfsd-$(CONFIG_NFSD_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \ 18nfsd-$(CONFIG_NFSD_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \
14 nfs4acl.o nfs4callback.o nfs4recover.o 19 nfs4acl.o nfs4callback.o nfs4recover.o
20nfsd-$(CONFIG_NFSD_PNFS) += nfs4layouts.o blocklayout.o blocklayoutxdr.o
diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
new file mode 100644
index 000000000000..cdbc78c72542
--- /dev/null
+++ b/fs/nfsd/blocklayout.c
@@ -0,0 +1,189 @@
1/*
2 * Copyright (c) 2014 Christoph Hellwig.
3 */
4#include <linux/exportfs.h>
5#include <linux/genhd.h>
6#include <linux/slab.h>
7
8#include <linux/nfsd/debug.h>
9
10#include "blocklayoutxdr.h"
11#include "pnfs.h"
12
13#define NFSDDBG_FACILITY NFSDDBG_PNFS
14
15
16static int
17nfsd4_block_get_device_info_simple(struct super_block *sb,
18 struct nfsd4_getdeviceinfo *gdp)
19{
20 struct pnfs_block_deviceaddr *dev;
21 struct pnfs_block_volume *b;
22
23 dev = kzalloc(sizeof(struct pnfs_block_deviceaddr) +
24 sizeof(struct pnfs_block_volume), GFP_KERNEL);
25 if (!dev)
26 return -ENOMEM;
27 gdp->gd_device = dev;
28
29 dev->nr_volumes = 1;
30 b = &dev->volumes[0];
31
32 b->type = PNFS_BLOCK_VOLUME_SIMPLE;
33 b->simple.sig_len = PNFS_BLOCK_UUID_LEN;
34 return sb->s_export_op->get_uuid(sb, b->simple.sig, &b->simple.sig_len,
35 &b->simple.offset);
36}
37
38static __be32
39nfsd4_block_proc_getdeviceinfo(struct super_block *sb,
40 struct nfsd4_getdeviceinfo *gdp)
41{
42 if (sb->s_bdev != sb->s_bdev->bd_contains)
43 return nfserr_inval;
44 return nfserrno(nfsd4_block_get_device_info_simple(sb, gdp));
45}
46
47static __be32
48nfsd4_block_proc_layoutget(struct inode *inode, const struct svc_fh *fhp,
49 struct nfsd4_layoutget *args)
50{
51 struct nfsd4_layout_seg *seg = &args->lg_seg;
52 struct super_block *sb = inode->i_sb;
53 u32 block_size = (1 << inode->i_blkbits);
54 struct pnfs_block_extent *bex;
55 struct iomap iomap;
56 u32 device_generation = 0;
57 int error;
58
59 /*
60 * We do not attempt to support I/O smaller than the fs block size,
61 * or not aligned to it.
62 */
63 if (args->lg_minlength < block_size) {
64 dprintk("pnfsd: I/O too small\n");
65 goto out_layoutunavailable;
66 }
67 if (seg->offset & (block_size - 1)) {
68 dprintk("pnfsd: I/O misaligned\n");
69 goto out_layoutunavailable;
70 }
71
72 /*
73 * Some clients barf on non-zero block numbers for NONE or INVALID
74 * layouts, so make sure to zero the whole structure.
75 */
76 error = -ENOMEM;
77 bex = kzalloc(sizeof(*bex), GFP_KERNEL);
78 if (!bex)
79 goto out_error;
80 args->lg_content = bex;
81
82 error = sb->s_export_op->map_blocks(inode, seg->offset, seg->length,
83 &iomap, seg->iomode != IOMODE_READ,
84 &device_generation);
85 if (error) {
86 if (error == -ENXIO)
87 goto out_layoutunavailable;
88 goto out_error;
89 }
90
91 if (iomap.length < args->lg_minlength) {
92 dprintk("pnfsd: extent smaller than minlength\n");
93 goto out_layoutunavailable;
94 }
95
96 switch (iomap.type) {
97 case IOMAP_MAPPED:
98 if (seg->iomode == IOMODE_READ)
99 bex->es = PNFS_BLOCK_READ_DATA;
100 else
101 bex->es = PNFS_BLOCK_READWRITE_DATA;
102 bex->soff = (iomap.blkno << 9);
103 break;
104 case IOMAP_UNWRITTEN:
105 if (seg->iomode & IOMODE_RW) {
106 /*
 107 * Crack monkey special case from RFC 5663 section 2.3.1.
108 */
109 if (args->lg_minlength == 0) {
110 dprintk("pnfsd: no soup for you!\n");
111 goto out_layoutunavailable;
112 }
113
114 bex->es = PNFS_BLOCK_INVALID_DATA;
115 bex->soff = (iomap.blkno << 9);
116 break;
117 }
118 /*FALLTHRU*/
119 case IOMAP_HOLE:
120 if (seg->iomode == IOMODE_READ) {
121 bex->es = PNFS_BLOCK_NONE_DATA;
122 break;
123 }
124 /*FALLTHRU*/
125 case IOMAP_DELALLOC:
126 default:
127 WARN(1, "pnfsd: filesystem returned %d extent\n", iomap.type);
128 goto out_layoutunavailable;
129 }
130
131 error = nfsd4_set_deviceid(&bex->vol_id, fhp, device_generation);
132 if (error)
133 goto out_error;
134 bex->foff = iomap.offset;
135 bex->len = iomap.length;
136
137 seg->offset = iomap.offset;
138 seg->length = iomap.length;
139
140 dprintk("GET: %lld:%lld %d\n", bex->foff, bex->len, bex->es);
141 return 0;
142
143out_error:
144 seg->length = 0;
145 return nfserrno(error);
146out_layoutunavailable:
147 seg->length = 0;
148 return nfserr_layoutunavailable;
149}
150
151static __be32
152nfsd4_block_proc_layoutcommit(struct inode *inode,
153 struct nfsd4_layoutcommit *lcp)
154{
155 loff_t new_size = lcp->lc_last_wr + 1;
156 struct iattr iattr = { .ia_valid = 0 };
157 struct iomap *iomaps;
158 int nr_iomaps;
159 int error;
160
161 nr_iomaps = nfsd4_block_decode_layoutupdate(lcp->lc_up_layout,
162 lcp->lc_up_len, &iomaps, 1 << inode->i_blkbits);
163 if (nr_iomaps < 0)
164 return nfserrno(nr_iomaps);
165
166 if (lcp->lc_mtime.tv_nsec == UTIME_NOW ||
167 timespec_compare(&lcp->lc_mtime, &inode->i_mtime) < 0)
168 lcp->lc_mtime = current_fs_time(inode->i_sb);
169 iattr.ia_valid |= ATTR_ATIME | ATTR_CTIME | ATTR_MTIME;
170 iattr.ia_atime = iattr.ia_ctime = iattr.ia_mtime = lcp->lc_mtime;
171
172 if (new_size > i_size_read(inode)) {
173 iattr.ia_valid |= ATTR_SIZE;
174 iattr.ia_size = new_size;
175 }
176
177 error = inode->i_sb->s_export_op->commit_blocks(inode, iomaps,
178 nr_iomaps, &iattr);
179 kfree(iomaps);
180 return nfserrno(error);
181}
182
183const struct nfsd4_layout_ops bl_layout_ops = {
184 .proc_getdeviceinfo = nfsd4_block_proc_getdeviceinfo,
185 .encode_getdeviceinfo = nfsd4_block_encode_getdeviceinfo,
186 .proc_layoutget = nfsd4_block_proc_layoutget,
187 .encode_layoutget = nfsd4_block_encode_layoutget,
188 .proc_layoutcommit = nfsd4_block_proc_layoutcommit,
189};
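
The three export_operations methods called above (get_uuid, map_blocks and
commit_blocks) are exactly what nfsd4_setup_layout_type() checks for when
deciding whether an export can offer the block layout type. A minimal sketch
of what an exporting filesystem would need to provide; the foofs_* names are
hypothetical and the prototypes are inferred from the call sites in this file:

    /* sketch only: assumes the pNFS export hooks in <linux/exportfs.h> */
    static int foofs_get_uuid(struct super_block *sb, u8 *buf, u32 *len,
    		u64 *offset)
    {
    	/* copy a stable volume signature into buf, report its length
    	 * in *len and the byte offset where clients can find it */
    	return 0;
    }

    static int foofs_map_blocks(struct inode *inode, loff_t offset,
    		u64 length, struct iomap *iomap, bool write,
    		u32 *device_generation)
    {
    	/* fill *iomap with a mapped (or, for writes, allocated but
    	 * unwritten) extent covering offset, bumping
    	 * *device_generation whenever the device mapping changes */
    	return 0;
    }

    static int foofs_commit_blocks(struct inode *inode, struct iomap *iomaps,
    		int nr_iomaps, struct iattr *iattr)
    {
    	/* mark the client-written unwritten extents as written and
    	 * apply the size/timestamp updates from *iattr */
    	return 0;
    }

    static const struct export_operations foofs_export_ops = {
    	/* ... the usual fh_to_dentry etc. ... */
    	.get_uuid	= foofs_get_uuid,
    	.map_blocks	= foofs_map_blocks,
    	.commit_blocks	= foofs_commit_blocks,
    };
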
diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c
new file mode 100644
index 000000000000..9da89fddab33
--- /dev/null
+++ b/fs/nfsd/blocklayoutxdr.c
@@ -0,0 +1,157 @@
1/*
2 * Copyright (c) 2014 Christoph Hellwig.
3 */
4#include <linux/sunrpc/svc.h>
5#include <linux/exportfs.h>
6#include <linux/nfs4.h>
7
8#include "nfsd.h"
9#include "blocklayoutxdr.h"
10
11#define NFSDDBG_FACILITY NFSDDBG_PNFS
12
13
14__be32
15nfsd4_block_encode_layoutget(struct xdr_stream *xdr,
16 struct nfsd4_layoutget *lgp)
17{
18 struct pnfs_block_extent *b = lgp->lg_content;
19 int len = sizeof(__be32) + 5 * sizeof(__be64) + sizeof(__be32);
20 __be32 *p;
21
22 p = xdr_reserve_space(xdr, sizeof(__be32) + len);
23 if (!p)
24 return nfserr_toosmall;
25
26 *p++ = cpu_to_be32(len);
27 *p++ = cpu_to_be32(1); /* we always return a single extent */
28
29 p = xdr_encode_opaque_fixed(p, &b->vol_id,
30 sizeof(struct nfsd4_deviceid));
31 p = xdr_encode_hyper(p, b->foff);
32 p = xdr_encode_hyper(p, b->len);
33 p = xdr_encode_hyper(p, b->soff);
34 *p++ = cpu_to_be32(b->es);
35 return 0;
36}
37
38static int
39nfsd4_block_encode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
40{
41 __be32 *p;
42 int len;
43
44 switch (b->type) {
45 case PNFS_BLOCK_VOLUME_SIMPLE:
46 len = 4 + 4 + 8 + 4 + b->simple.sig_len;
47 p = xdr_reserve_space(xdr, len);
48 if (!p)
49 return -ETOOSMALL;
50
51 *p++ = cpu_to_be32(b->type);
52 *p++ = cpu_to_be32(1); /* single signature */
53 p = xdr_encode_hyper(p, b->simple.offset);
54 p = xdr_encode_opaque(p, b->simple.sig, b->simple.sig_len);
55 break;
56 default:
57 return -ENOTSUPP;
58 }
59
60 return len;
61}
62
63__be32
64nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr,
65 struct nfsd4_getdeviceinfo *gdp)
66{
67 struct pnfs_block_deviceaddr *dev = gdp->gd_device;
68 int len = sizeof(__be32), ret, i;
69 __be32 *p;
70
71 p = xdr_reserve_space(xdr, len + sizeof(__be32));
72 if (!p)
73 return nfserr_resource;
74
75 for (i = 0; i < dev->nr_volumes; i++) {
76 ret = nfsd4_block_encode_volume(xdr, &dev->volumes[i]);
77 if (ret < 0)
78 return nfserrno(ret);
79 len += ret;
80 }
81
82 /*
83 * Fill in the overall length and number of volumes at the beginning
 84 * of the device address.
85 */
86 *p++ = cpu_to_be32(len);
87 *p++ = cpu_to_be32(dev->nr_volumes);
88 return 0;
89}
90
91int
92nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
93 u32 block_size)
94{
95 struct iomap *iomaps;
96 u32 nr_iomaps, expected, i;
97
98 if (len < sizeof(u32)) {
99 dprintk("%s: extent array too small: %u\n", __func__, len);
100 return -EINVAL;
101 }
102
103 nr_iomaps = be32_to_cpup(p++);
104 expected = sizeof(__be32) + nr_iomaps * NFS4_BLOCK_EXTENT_SIZE;
105 if (len != expected) {
106 dprintk("%s: extent array size mismatch: %u/%u\n",
107 __func__, len, expected);
108 return -EINVAL;
109 }
110
111 iomaps = kcalloc(nr_iomaps, sizeof(*iomaps), GFP_KERNEL);
112 if (!iomaps) {
113 dprintk("%s: failed to allocate extent array\n", __func__);
114 return -ENOMEM;
115 }
116
117 for (i = 0; i < nr_iomaps; i++) {
118 struct pnfs_block_extent bex;
119
120 memcpy(&bex.vol_id, p, sizeof(struct nfsd4_deviceid));
121 p += XDR_QUADLEN(sizeof(struct nfsd4_deviceid));
122
123 p = xdr_decode_hyper(p, &bex.foff);
124 if (bex.foff & (block_size - 1)) {
125 dprintk("%s: unaligned offset %lld\n",
126 __func__, bex.foff);
127 goto fail;
128 }
129 p = xdr_decode_hyper(p, &bex.len);
130 if (bex.len & (block_size - 1)) {
131 dprintk("%s: unaligned length %lld\n",
 132 __func__, bex.len);
133 goto fail;
134 }
135 p = xdr_decode_hyper(p, &bex.soff);
136 if (bex.soff & (block_size - 1)) {
137 dprintk("%s: unaligned disk offset %lld\n",
138 __func__, bex.soff);
139 goto fail;
140 }
141 bex.es = be32_to_cpup(p++);
142 if (bex.es != PNFS_BLOCK_READWRITE_DATA) {
143 dprintk("%s: incorrect extent state %d\n",
144 __func__, bex.es);
145 goto fail;
146 }
147
148 iomaps[i].offset = bex.foff;
149 iomaps[i].length = bex.len;
150 }
151
152 *iomapp = iomaps;
153 return nr_iomaps;
154fail:
155 kfree(iomaps);
156 return -EINVAL;
157}
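
The NFS4_BLOCK_EXTENT_SIZE constant checked against above follows directly
from the wire format the decode loop consumes (the deviceid layout, a u64
fsid index plus u32 generation and pad, is visible in nfs4layouts.c below):

    deviceid (u64 fsid_idx + u32 generation + u32 pad)   16 bytes
    foff, len, soff (three XDR hypers)                   24 bytes
    es (extent state word)                                4 bytes
                                                  total   44 bytes

so a well-formed LAYOUTCOMMIT update body is exactly the 4-byte extent count
plus nr_iomaps * 44 bytes, which is the "expected" value computed before the
loop.
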
diff --git a/fs/nfsd/blocklayoutxdr.h b/fs/nfsd/blocklayoutxdr.h
new file mode 100644
index 000000000000..fdc79037c0e7
--- /dev/null
+++ b/fs/nfsd/blocklayoutxdr.h
@@ -0,0 +1,62 @@
1#ifndef _NFSD_BLOCKLAYOUTXDR_H
2#define _NFSD_BLOCKLAYOUTXDR_H 1
3
4#include <linux/blkdev.h>
5#include "xdr4.h"
6
7struct iomap;
8struct xdr_stream;
9
10enum pnfs_block_extent_state {
11 PNFS_BLOCK_READWRITE_DATA = 0,
12 PNFS_BLOCK_READ_DATA = 1,
13 PNFS_BLOCK_INVALID_DATA = 2,
14 PNFS_BLOCK_NONE_DATA = 3,
15};
16
17struct pnfs_block_extent {
18 struct nfsd4_deviceid vol_id;
19 u64 foff;
20 u64 len;
21 u64 soff;
22 enum pnfs_block_extent_state es;
23};
24#define NFS4_BLOCK_EXTENT_SIZE 44
25
26enum pnfs_block_volume_type {
27 PNFS_BLOCK_VOLUME_SIMPLE = 0,
28 PNFS_BLOCK_VOLUME_SLICE = 1,
29 PNFS_BLOCK_VOLUME_CONCAT = 2,
30 PNFS_BLOCK_VOLUME_STRIPE = 3,
31};
32
33/*
 34 * Arbitrary upper cap for the uuid length to avoid unbounded
 35 * allocation; the protocol itself imposes no limit.
36 */
37#define PNFS_BLOCK_UUID_LEN 128
38
39struct pnfs_block_volume {
40 enum pnfs_block_volume_type type;
41 union {
42 struct {
43 u64 offset;
44 u32 sig_len;
45 u8 sig[PNFS_BLOCK_UUID_LEN];
46 } simple;
47 };
48};
49
50struct pnfs_block_deviceaddr {
51 u32 nr_volumes;
52 struct pnfs_block_volume volumes[];
53};
54
55__be32 nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr,
56 struct nfsd4_getdeviceinfo *gdp);
57__be32 nfsd4_block_encode_layoutget(struct xdr_stream *xdr,
58 struct nfsd4_layoutget *lgp);
59int nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
60 u32 block_size);
61
62#endif /* _NFSD_BLOCKLAYOUTXDR_H */
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 30a739d896ff..c3e3b6e55ae2 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -20,6 +20,7 @@
20#include "nfsd.h" 20#include "nfsd.h"
21#include "nfsfh.h" 21#include "nfsfh.h"
22#include "netns.h" 22#include "netns.h"
23#include "pnfs.h"
23 24
24#define NFSDDBG_FACILITY NFSDDBG_EXPORT 25#define NFSDDBG_FACILITY NFSDDBG_EXPORT
25 26
@@ -545,6 +546,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
545 546
546 exp.ex_client = dom; 547 exp.ex_client = dom;
547 exp.cd = cd; 548 exp.cd = cd;
549 exp.ex_devid_map = NULL;
548 550
549 /* expiry */ 551 /* expiry */
550 err = -EINVAL; 552 err = -EINVAL;
@@ -621,6 +623,8 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen)
621 if (!gid_valid(exp.ex_anon_gid)) 623 if (!gid_valid(exp.ex_anon_gid))
622 goto out4; 624 goto out4;
623 err = 0; 625 err = 0;
626
627 nfsd4_setup_layout_type(&exp);
624 } 628 }
625 629
626 expp = svc_export_lookup(&exp); 630 expp = svc_export_lookup(&exp);
@@ -703,6 +707,7 @@ static void svc_export_init(struct cache_head *cnew, struct cache_head *citem)
703 new->ex_fslocs.locations = NULL; 707 new->ex_fslocs.locations = NULL;
704 new->ex_fslocs.locations_count = 0; 708 new->ex_fslocs.locations_count = 0;
705 new->ex_fslocs.migrated = 0; 709 new->ex_fslocs.migrated = 0;
710 new->ex_layout_type = 0;
706 new->ex_uuid = NULL; 711 new->ex_uuid = NULL;
707 new->cd = item->cd; 712 new->cd = item->cd;
708} 713}
@@ -717,6 +722,8 @@ static void export_update(struct cache_head *cnew, struct cache_head *citem)
717 new->ex_anon_uid = item->ex_anon_uid; 722 new->ex_anon_uid = item->ex_anon_uid;
718 new->ex_anon_gid = item->ex_anon_gid; 723 new->ex_anon_gid = item->ex_anon_gid;
719 new->ex_fsid = item->ex_fsid; 724 new->ex_fsid = item->ex_fsid;
725 new->ex_devid_map = item->ex_devid_map;
726 item->ex_devid_map = NULL;
720 new->ex_uuid = item->ex_uuid; 727 new->ex_uuid = item->ex_uuid;
721 item->ex_uuid = NULL; 728 item->ex_uuid = NULL;
722 new->ex_fslocs.locations = item->ex_fslocs.locations; 729 new->ex_fslocs.locations = item->ex_fslocs.locations;
@@ -725,6 +732,7 @@ static void export_update(struct cache_head *cnew, struct cache_head *citem)
725 item->ex_fslocs.locations_count = 0; 732 item->ex_fslocs.locations_count = 0;
726 new->ex_fslocs.migrated = item->ex_fslocs.migrated; 733 new->ex_fslocs.migrated = item->ex_fslocs.migrated;
727 item->ex_fslocs.migrated = 0; 734 item->ex_fslocs.migrated = 0;
735 new->ex_layout_type = item->ex_layout_type;
728 new->ex_nflavors = item->ex_nflavors; 736 new->ex_nflavors = item->ex_nflavors;
729 for (i = 0; i < MAX_SECINFO_LIST; i++) { 737 for (i = 0; i < MAX_SECINFO_LIST; i++) {
730 new->ex_flavors[i] = item->ex_flavors[i]; 738 new->ex_flavors[i] = item->ex_flavors[i];
diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h
index 04dc8c167b0c..1f52bfcc436f 100644
--- a/fs/nfsd/export.h
+++ b/fs/nfsd/export.h
@@ -56,6 +56,8 @@ struct svc_export {
56 struct nfsd4_fs_locations ex_fslocs; 56 struct nfsd4_fs_locations ex_fslocs;
57 uint32_t ex_nflavors; 57 uint32_t ex_nflavors;
58 struct exp_flavor_info ex_flavors[MAX_SECINFO_LIST]; 58 struct exp_flavor_info ex_flavors[MAX_SECINFO_LIST];
59 enum pnfs_layouttype ex_layout_type;
60 struct nfsd4_deviceid_map *ex_devid_map;
59 struct cache_detail *cd; 61 struct cache_detail *cd;
60}; 62};
61 63
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 7cbdf1b2e4ab..58277859a467 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -546,6 +546,102 @@ out:
546 return status; 546 return status;
547} 547}
548 548
549#ifdef CONFIG_NFSD_PNFS
550/*
551 * CB_LAYOUTRECALL4args
552 *
553 * struct layoutrecall_file4 {
554 * nfs_fh4 lor_fh;
555 * offset4 lor_offset;
556 * length4 lor_length;
557 * stateid4 lor_stateid;
558 * };
559 *
560 * union layoutrecall4 switch(layoutrecall_type4 lor_recalltype) {
561 * case LAYOUTRECALL4_FILE:
562 * layoutrecall_file4 lor_layout;
563 * case LAYOUTRECALL4_FSID:
564 * fsid4 lor_fsid;
565 * case LAYOUTRECALL4_ALL:
566 * void;
567 * };
568 *
569 * struct CB_LAYOUTRECALL4args {
570 * layouttype4 clora_type;
571 * layoutiomode4 clora_iomode;
572 * bool clora_changed;
573 * layoutrecall4 clora_recall;
574 * };
575 */
576static void encode_cb_layout4args(struct xdr_stream *xdr,
577 const struct nfs4_layout_stateid *ls,
578 struct nfs4_cb_compound_hdr *hdr)
579{
580 __be32 *p;
581
582 BUG_ON(hdr->minorversion == 0);
583
584 p = xdr_reserve_space(xdr, 5 * 4);
585 *p++ = cpu_to_be32(OP_CB_LAYOUTRECALL);
586 *p++ = cpu_to_be32(ls->ls_layout_type);
587 *p++ = cpu_to_be32(IOMODE_ANY);
588 *p++ = cpu_to_be32(1);
589 *p = cpu_to_be32(RETURN_FILE);
590
591 encode_nfs_fh4(xdr, &ls->ls_stid.sc_file->fi_fhandle);
592
593 p = xdr_reserve_space(xdr, 2 * 8);
594 p = xdr_encode_hyper(p, 0);
595 xdr_encode_hyper(p, NFS4_MAX_UINT64);
596
597 encode_stateid4(xdr, &ls->ls_recall_sid);
598
599 hdr->nops++;
600}
601
602static void nfs4_xdr_enc_cb_layout(struct rpc_rqst *req,
603 struct xdr_stream *xdr,
604 const struct nfsd4_callback *cb)
605{
606 const struct nfs4_layout_stateid *ls =
607 container_of(cb, struct nfs4_layout_stateid, ls_recall);
608 struct nfs4_cb_compound_hdr hdr = {
609 .ident = 0,
610 .minorversion = cb->cb_minorversion,
611 };
612
613 encode_cb_compound4args(xdr, &hdr);
614 encode_cb_sequence4args(xdr, cb, &hdr);
615 encode_cb_layout4args(xdr, ls, &hdr);
616 encode_cb_nops(&hdr);
617}
618
619static int nfs4_xdr_dec_cb_layout(struct rpc_rqst *rqstp,
620 struct xdr_stream *xdr,
621 struct nfsd4_callback *cb)
622{
623 struct nfs4_cb_compound_hdr hdr;
624 enum nfsstat4 nfserr;
625 int status;
626
627 status = decode_cb_compound4res(xdr, &hdr);
628 if (unlikely(status))
629 goto out;
630 if (cb) {
631 status = decode_cb_sequence4res(xdr, cb);
632 if (unlikely(status))
633 goto out;
634 }
635 status = decode_cb_op_status(xdr, OP_CB_LAYOUTRECALL, &nfserr);
636 if (unlikely(status))
637 goto out;
638 if (unlikely(nfserr != NFS4_OK))
639 status = nfs_cb_stat_to_errno(nfserr);
640out:
641 return status;
642}
643#endif /* CONFIG_NFSD_PNFS */
644
549/* 645/*
550 * RPC procedure tables 646 * RPC procedure tables
551 */ 647 */
@@ -563,6 +659,9 @@ out:
563static struct rpc_procinfo nfs4_cb_procedures[] = { 659static struct rpc_procinfo nfs4_cb_procedures[] = {
564 PROC(CB_NULL, NULL, cb_null, cb_null), 660 PROC(CB_NULL, NULL, cb_null, cb_null),
565 PROC(CB_RECALL, COMPOUND, cb_recall, cb_recall), 661 PROC(CB_RECALL, COMPOUND, cb_recall, cb_recall),
662#ifdef CONFIG_NFSD_PNFS
663 PROC(CB_LAYOUT, COMPOUND, cb_layout, cb_layout),
664#endif
566}; 665};
567 666
568static struct rpc_version nfs_cb_version4 = { 667static struct rpc_version nfs_cb_version4 = {
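
Putting the reservations in encode_cb_layout4args() together, the
CB_LAYOUTRECALL operation body it emits looks like this (sizes in bytes;
a stateid4 is a 4-byte seqid plus a 12-byte opaque):

    OP_CB_LAYOUTRECALL    4    opcode
    clora_type            4    ls->ls_layout_type
    clora_iomode          4    always IOMODE_ANY
    clora_changed         4    always 1
    lor_recalltype        4    always RETURN_FILE
    lor_fh                4 + padded filehandle length
    lor_offset            8    always 0
    lor_length            8    always NFS4_MAX_UINT64
    lor_stateid          16    ls->ls_recall_sid

In other words, the server always recalls the entire layout range of a
single file rather than individual segments.
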
diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
new file mode 100644
index 000000000000..3c1bfa155571
--- /dev/null
+++ b/fs/nfsd/nfs4layouts.c
@@ -0,0 +1,721 @@
1/*
2 * Copyright (c) 2014 Christoph Hellwig.
3 */
4#include <linux/kmod.h>
5#include <linux/file.h>
6#include <linux/jhash.h>
7#include <linux/sched.h>
8#include <linux/sunrpc/addr.h>
9
10#include "pnfs.h"
11#include "netns.h"
12#include "trace.h"
13
14#define NFSDDBG_FACILITY NFSDDBG_PNFS
15
16struct nfs4_layout {
17 struct list_head lo_perstate;
18 struct nfs4_layout_stateid *lo_state;
19 struct nfsd4_layout_seg lo_seg;
20};
21
22static struct kmem_cache *nfs4_layout_cache;
23static struct kmem_cache *nfs4_layout_stateid_cache;
24
25static struct nfsd4_callback_ops nfsd4_cb_layout_ops;
26static const struct lock_manager_operations nfsd4_layouts_lm_ops;
27
28const struct nfsd4_layout_ops *nfsd4_layout_ops[LAYOUT_TYPE_MAX] = {
29 [LAYOUT_BLOCK_VOLUME] = &bl_layout_ops,
30};
31
32/* pNFS device ID to export fsid mapping */
33#define DEVID_HASH_BITS 8
34#define DEVID_HASH_SIZE (1 << DEVID_HASH_BITS)
35#define DEVID_HASH_MASK (DEVID_HASH_SIZE - 1)
36static u64 nfsd_devid_seq = 1;
37static struct list_head nfsd_devid_hash[DEVID_HASH_SIZE];
38static DEFINE_SPINLOCK(nfsd_devid_lock);
39
40static inline u32 devid_hashfn(u64 idx)
41{
42 return jhash_2words(idx, idx >> 32, 0) & DEVID_HASH_MASK;
43}
44
45static void
46nfsd4_alloc_devid_map(const struct svc_fh *fhp)
47{
48 const struct knfsd_fh *fh = &fhp->fh_handle;
49 size_t fsid_len = key_len(fh->fh_fsid_type);
50 struct nfsd4_deviceid_map *map, *old;
51 int i;
52
53 map = kzalloc(sizeof(*map) + fsid_len, GFP_KERNEL);
54 if (!map)
55 return;
56
57 map->fsid_type = fh->fh_fsid_type;
58 memcpy(&map->fsid, fh->fh_fsid, fsid_len);
59
60 spin_lock(&nfsd_devid_lock);
61 if (fhp->fh_export->ex_devid_map)
62 goto out_unlock;
63
64 for (i = 0; i < DEVID_HASH_SIZE; i++) {
65 list_for_each_entry(old, &nfsd_devid_hash[i], hash) {
66 if (old->fsid_type != fh->fh_fsid_type)
67 continue;
68 if (memcmp(old->fsid, fh->fh_fsid,
69 key_len(old->fsid_type)))
70 continue;
71
72 fhp->fh_export->ex_devid_map = old;
73 goto out_unlock;
74 }
75 }
76
77 map->idx = nfsd_devid_seq++;
78 list_add_tail_rcu(&map->hash, &nfsd_devid_hash[devid_hashfn(map->idx)]);
79 fhp->fh_export->ex_devid_map = map;
80 map = NULL;
81
82out_unlock:
83 spin_unlock(&nfsd_devid_lock);
84 kfree(map);
85}
86
87struct nfsd4_deviceid_map *
88nfsd4_find_devid_map(int idx)
89{
90 struct nfsd4_deviceid_map *map, *ret = NULL;
91
92 rcu_read_lock();
93 list_for_each_entry_rcu(map, &nfsd_devid_hash[devid_hashfn(idx)], hash)
94 if (map->idx == idx)
95 ret = map;
96 rcu_read_unlock();
97
98 return ret;
99}
100
101int
102nfsd4_set_deviceid(struct nfsd4_deviceid *id, const struct svc_fh *fhp,
103 u32 device_generation)
104{
105 if (!fhp->fh_export->ex_devid_map) {
106 nfsd4_alloc_devid_map(fhp);
107 if (!fhp->fh_export->ex_devid_map)
108 return -ENOMEM;
109 }
110
111 id->fsid_idx = fhp->fh_export->ex_devid_map->idx;
112 id->generation = device_generation;
113 id->pad = 0;
114 return 0;
115}
116
117void nfsd4_setup_layout_type(struct svc_export *exp)
118{
119 struct super_block *sb = exp->ex_path.mnt->mnt_sb;
120
121 if (exp->ex_flags & NFSEXP_NOPNFS)
122 return;
123
124 if (sb->s_export_op->get_uuid &&
125 sb->s_export_op->map_blocks &&
126 sb->s_export_op->commit_blocks)
127 exp->ex_layout_type = LAYOUT_BLOCK_VOLUME;
128}
129
130static void
131nfsd4_free_layout_stateid(struct nfs4_stid *stid)
132{
133 struct nfs4_layout_stateid *ls = layoutstateid(stid);
134 struct nfs4_client *clp = ls->ls_stid.sc_client;
135 struct nfs4_file *fp = ls->ls_stid.sc_file;
136
137 trace_layoutstate_free(&ls->ls_stid.sc_stateid);
138
139 spin_lock(&clp->cl_lock);
140 list_del_init(&ls->ls_perclnt);
141 spin_unlock(&clp->cl_lock);
142
143 spin_lock(&fp->fi_lock);
144 list_del_init(&ls->ls_perfile);
145 spin_unlock(&fp->fi_lock);
146
147 vfs_setlease(ls->ls_file, F_UNLCK, NULL, (void **)&ls);
148 fput(ls->ls_file);
149
150 if (ls->ls_recalled)
151 atomic_dec(&ls->ls_stid.sc_file->fi_lo_recalls);
152
153 kmem_cache_free(nfs4_layout_stateid_cache, ls);
154}
155
156static int
157nfsd4_layout_setlease(struct nfs4_layout_stateid *ls)
158{
159 struct file_lock *fl;
160 int status;
161
162 fl = locks_alloc_lock();
163 if (!fl)
164 return -ENOMEM;
165 locks_init_lock(fl);
166 fl->fl_lmops = &nfsd4_layouts_lm_ops;
167 fl->fl_flags = FL_LAYOUT;
168 fl->fl_type = F_RDLCK;
169 fl->fl_end = OFFSET_MAX;
170 fl->fl_owner = ls;
171 fl->fl_pid = current->tgid;
172 fl->fl_file = ls->ls_file;
173
174 status = vfs_setlease(fl->fl_file, fl->fl_type, &fl, NULL);
175 if (status) {
176 locks_free_lock(fl);
177 return status;
178 }
179 BUG_ON(fl != NULL);
180 return 0;
181}
182
183static struct nfs4_layout_stateid *
184nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate,
185 struct nfs4_stid *parent, u32 layout_type)
186{
187 struct nfs4_client *clp = cstate->clp;
188 struct nfs4_file *fp = parent->sc_file;
189 struct nfs4_layout_stateid *ls;
190 struct nfs4_stid *stp;
191
192 stp = nfs4_alloc_stid(cstate->clp, nfs4_layout_stateid_cache);
193 if (!stp)
194 return NULL;
195 stp->sc_free = nfsd4_free_layout_stateid;
196 get_nfs4_file(fp);
197 stp->sc_file = fp;
198
199 ls = layoutstateid(stp);
200 INIT_LIST_HEAD(&ls->ls_perclnt);
201 INIT_LIST_HEAD(&ls->ls_perfile);
202 spin_lock_init(&ls->ls_lock);
203 INIT_LIST_HEAD(&ls->ls_layouts);
204 ls->ls_layout_type = layout_type;
205 nfsd4_init_cb(&ls->ls_recall, clp, &nfsd4_cb_layout_ops,
206 NFSPROC4_CLNT_CB_LAYOUT);
207
208 if (parent->sc_type == NFS4_DELEG_STID)
209 ls->ls_file = get_file(fp->fi_deleg_file);
210 else
211 ls->ls_file = find_any_file(fp);
212 BUG_ON(!ls->ls_file);
213
214 if (nfsd4_layout_setlease(ls)) {
215 put_nfs4_file(fp);
216 kmem_cache_free(nfs4_layout_stateid_cache, ls);
217 return NULL;
218 }
219
220 spin_lock(&clp->cl_lock);
221 stp->sc_type = NFS4_LAYOUT_STID;
222 list_add(&ls->ls_perclnt, &clp->cl_lo_states);
223 spin_unlock(&clp->cl_lock);
224
225 spin_lock(&fp->fi_lock);
226 list_add(&ls->ls_perfile, &fp->fi_lo_states);
227 spin_unlock(&fp->fi_lock);
228
229 trace_layoutstate_alloc(&ls->ls_stid.sc_stateid);
230 return ls;
231}
232
233__be32
234nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp,
235 struct nfsd4_compound_state *cstate, stateid_t *stateid,
236 bool create, u32 layout_type, struct nfs4_layout_stateid **lsp)
237{
238 struct nfs4_layout_stateid *ls;
239 struct nfs4_stid *stid;
240 unsigned char typemask = NFS4_LAYOUT_STID;
241 __be32 status;
242
243 if (create)
244 typemask |= (NFS4_OPEN_STID | NFS4_LOCK_STID | NFS4_DELEG_STID);
245
246 status = nfsd4_lookup_stateid(cstate, stateid, typemask, &stid,
247 net_generic(SVC_NET(rqstp), nfsd_net_id));
248 if (status)
249 goto out;
250
251 if (!fh_match(&cstate->current_fh.fh_handle,
252 &stid->sc_file->fi_fhandle)) {
253 status = nfserr_bad_stateid;
254 goto out_put_stid;
255 }
256
257 if (stid->sc_type != NFS4_LAYOUT_STID) {
258 ls = nfsd4_alloc_layout_stateid(cstate, stid, layout_type);
259 nfs4_put_stid(stid);
260
261 status = nfserr_jukebox;
262 if (!ls)
263 goto out;
264 } else {
265 ls = container_of(stid, struct nfs4_layout_stateid, ls_stid);
266
267 status = nfserr_bad_stateid;
268 if (stateid->si_generation > stid->sc_stateid.si_generation)
269 goto out_put_stid;
270 if (layout_type != ls->ls_layout_type)
271 goto out_put_stid;
272 }
273
274 *lsp = ls;
275 return 0;
276
277out_put_stid:
278 nfs4_put_stid(stid);
279out:
280 return status;
281}
282
283static void
284nfsd4_recall_file_layout(struct nfs4_layout_stateid *ls)
285{
286 spin_lock(&ls->ls_lock);
287 if (ls->ls_recalled)
288 goto out_unlock;
289
290 ls->ls_recalled = true;
291 atomic_inc(&ls->ls_stid.sc_file->fi_lo_recalls);
292 if (list_empty(&ls->ls_layouts))
293 goto out_unlock;
294
295 trace_layout_recall(&ls->ls_stid.sc_stateid);
296
297 atomic_inc(&ls->ls_stid.sc_count);
298 update_stateid(&ls->ls_stid.sc_stateid);
299 memcpy(&ls->ls_recall_sid, &ls->ls_stid.sc_stateid, sizeof(stateid_t));
300 nfsd4_run_cb(&ls->ls_recall);
301
302out_unlock:
303 spin_unlock(&ls->ls_lock);
304}
305
306static inline u64
307layout_end(struct nfsd4_layout_seg *seg)
308{
309 u64 end = seg->offset + seg->length;
310 return end >= seg->offset ? end : NFS4_MAX_UINT64;
311}
312
313static void
314layout_update_len(struct nfsd4_layout_seg *lo, u64 end)
315{
316 if (end == NFS4_MAX_UINT64)
317 lo->length = NFS4_MAX_UINT64;
318 else
319 lo->length = end - lo->offset;
320}
321
322static bool
323layouts_overlapping(struct nfs4_layout *lo, struct nfsd4_layout_seg *s)
324{
325 if (s->iomode != IOMODE_ANY && s->iomode != lo->lo_seg.iomode)
326 return false;
327 if (layout_end(&lo->lo_seg) <= s->offset)
328 return false;
329 if (layout_end(s) <= lo->lo_seg.offset)
330 return false;
331 return true;
332}
333
334static bool
335layouts_try_merge(struct nfsd4_layout_seg *lo, struct nfsd4_layout_seg *new)
336{
337 if (lo->iomode != new->iomode)
338 return false;
339 if (layout_end(new) < lo->offset)
340 return false;
341 if (layout_end(lo) < new->offset)
342 return false;
343
344 lo->offset = min(lo->offset, new->offset);
345 layout_update_len(lo, max(layout_end(lo), layout_end(new)));
346 return true;
347}
348
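	/*
	 * Worked example of the helpers above (illustrative numbers, not
	 * part of the patch), assuming equal iomodes:
	 *
	 *   lo  = { offset = 0,       length = 1048576 }  layout_end(lo) = 1048576
	 *   new = { offset = 1048576, length = ~0ULL }     offset + length wraps,
	 *                                                  so layout_end(new)
	 *                                                  clamps to NFS4_MAX_UINT64
	 *
	 * The segments are adjacent (layout_end(lo) == new->offset, which
	 * does not fail the "< new->offset" test), so layouts_try_merge()
	 * extends lo to { offset = 0, length = NFS4_MAX_UINT64 }, and
	 * layout_update_len() keeps length == NFS4_MAX_UINT64 rather than
	 * computing end - offset, preserving the "to end of file" encoding.
	 */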
349static __be32
350nfsd4_recall_conflict(struct nfs4_layout_stateid *ls)
351{
352 struct nfs4_file *fp = ls->ls_stid.sc_file;
353 struct nfs4_layout_stateid *l, *n;
354 __be32 nfserr = nfs_ok;
355
356 assert_spin_locked(&fp->fi_lock);
357
358 list_for_each_entry_safe(l, n, &fp->fi_lo_states, ls_perfile) {
359 if (l != ls) {
360 nfsd4_recall_file_layout(l);
361 nfserr = nfserr_recallconflict;
362 }
363 }
364
365 return nfserr;
366}
367
368__be32
369nfsd4_insert_layout(struct nfsd4_layoutget *lgp, struct nfs4_layout_stateid *ls)
370{
371 struct nfsd4_layout_seg *seg = &lgp->lg_seg;
372 struct nfs4_file *fp = ls->ls_stid.sc_file;
373 struct nfs4_layout *lp, *new = NULL;
374 __be32 nfserr;
375
376 spin_lock(&fp->fi_lock);
377 nfserr = nfsd4_recall_conflict(ls);
378 if (nfserr)
379 goto out;
380 spin_lock(&ls->ls_lock);
381 list_for_each_entry(lp, &ls->ls_layouts, lo_perstate) {
382 if (layouts_try_merge(&lp->lo_seg, seg))
383 goto done;
384 }
385 spin_unlock(&ls->ls_lock);
386 spin_unlock(&fp->fi_lock);
387
388 new = kmem_cache_alloc(nfs4_layout_cache, GFP_KERNEL);
389 if (!new)
390 return nfserr_jukebox;
391 memcpy(&new->lo_seg, seg, sizeof(lp->lo_seg));
392 new->lo_state = ls;
393
394 spin_lock(&fp->fi_lock);
395 nfserr = nfsd4_recall_conflict(ls);
396 if (nfserr)
397 goto out;
398 spin_lock(&ls->ls_lock);
399 list_for_each_entry(lp, &ls->ls_layouts, lo_perstate) {
400 if (layouts_try_merge(&lp->lo_seg, seg))
401 goto done;
402 }
403
404 atomic_inc(&ls->ls_stid.sc_count);
405 list_add_tail(&new->lo_perstate, &ls->ls_layouts);
406 new = NULL;
407done:
408 update_stateid(&ls->ls_stid.sc_stateid);
409 memcpy(&lgp->lg_sid, &ls->ls_stid.sc_stateid, sizeof(stateid_t));
410 spin_unlock(&ls->ls_lock);
411out:
412 spin_unlock(&fp->fi_lock);
413 if (new)
414 kmem_cache_free(nfs4_layout_cache, new);
415 return nfserr;
416}
417
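	/*
	 * Note the unlock/allocate/relock shape of nfsd4_insert_layout()
	 * above: the first pass under fi_lock and ls_lock tries to merge
	 * into an existing segment; if that fails, both locks are dropped
	 * so the GFP_KERNEL allocation may sleep, and after re-acquiring
	 * the locks the recall-conflict check and the merge scan are
	 * repeated, since another thread may have changed the lists in
	 * the meantime.  Only if the re-check still finds nothing to
	 * merge with is the new entry linked in; otherwise it is freed
	 * once the locks are dropped.
	 */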
418static void
419nfsd4_free_layouts(struct list_head *reaplist)
420{
421 while (!list_empty(reaplist)) {
422 struct nfs4_layout *lp = list_first_entry(reaplist,
423 struct nfs4_layout, lo_perstate);
424
425 list_del(&lp->lo_perstate);
426 nfs4_put_stid(&lp->lo_state->ls_stid);
427 kmem_cache_free(nfs4_layout_cache, lp);
428 }
429}
430
431static void
432nfsd4_return_file_layout(struct nfs4_layout *lp, struct nfsd4_layout_seg *seg,
433 struct list_head *reaplist)
434{
435 struct nfsd4_layout_seg *lo = &lp->lo_seg;
436 u64 end = layout_end(lo);
437
438 if (seg->offset <= lo->offset) {
439 if (layout_end(seg) >= end) {
440 list_move_tail(&lp->lo_perstate, reaplist);
441 return;
442 }
443 end = seg->offset;
444 } else {
445 /* retain the whole layout segment on a split. */
446 if (layout_end(seg) < end) {
447 dprintk("%s: split not supported\n", __func__);
448 return;
449 }
450
451 lo->offset = layout_end(seg);
452 }
453
454 layout_update_len(lo, end);
455}
456
457__be32
458nfsd4_return_file_layouts(struct svc_rqst *rqstp,
459 struct nfsd4_compound_state *cstate,
460 struct nfsd4_layoutreturn *lrp)
461{
462 struct nfs4_layout_stateid *ls;
463 struct nfs4_layout *lp, *n;
464 LIST_HEAD(reaplist);
465 __be32 nfserr;
466 int found = 0;
467
468 nfserr = nfsd4_preprocess_layout_stateid(rqstp, cstate, &lrp->lr_sid,
469 false, lrp->lr_layout_type,
470 &ls);
471 if (nfserr) {
472 trace_layout_return_lookup_fail(&lrp->lr_sid);
473 return nfserr;
474 }
475
476 spin_lock(&ls->ls_lock);
477 list_for_each_entry_safe(lp, n, &ls->ls_layouts, lo_perstate) {
478 if (layouts_overlapping(lp, &lrp->lr_seg)) {
479 nfsd4_return_file_layout(lp, &lrp->lr_seg, &reaplist);
480 found++;
481 }
482 }
483 if (!list_empty(&ls->ls_layouts)) {
484 if (found) {
485 update_stateid(&ls->ls_stid.sc_stateid);
486 memcpy(&lrp->lr_sid, &ls->ls_stid.sc_stateid,
487 sizeof(stateid_t));
488 }
489 lrp->lrs_present = 1;
490 } else {
491 trace_layoutstate_unhash(&ls->ls_stid.sc_stateid);
492 nfs4_unhash_stid(&ls->ls_stid);
493 lrp->lrs_present = 0;
494 }
495 spin_unlock(&ls->ls_lock);
496
497 nfs4_put_stid(&ls->ls_stid);
498 nfsd4_free_layouts(&reaplist);
499 return nfs_ok;
500}
501
502__be32
503nfsd4_return_client_layouts(struct svc_rqst *rqstp,
504 struct nfsd4_compound_state *cstate,
505 struct nfsd4_layoutreturn *lrp)
506{
507 struct nfs4_layout_stateid *ls, *n;
508 struct nfs4_client *clp = cstate->clp;
509 struct nfs4_layout *lp, *t;
510 LIST_HEAD(reaplist);
511
512 lrp->lrs_present = 0;
513
514 spin_lock(&clp->cl_lock);
515 list_for_each_entry_safe(ls, n, &clp->cl_lo_states, ls_perclnt) {
516 if (lrp->lr_return_type == RETURN_FSID &&
517 !fh_fsid_match(&ls->ls_stid.sc_file->fi_fhandle,
518 &cstate->current_fh.fh_handle))
519 continue;
520
521 spin_lock(&ls->ls_lock);
522 list_for_each_entry_safe(lp, t, &ls->ls_layouts, lo_perstate) {
523 if (lrp->lr_seg.iomode == IOMODE_ANY ||
524 lrp->lr_seg.iomode == lp->lo_seg.iomode)
525 list_move_tail(&lp->lo_perstate, &reaplist);
526 }
527 spin_unlock(&ls->ls_lock);
528 }
529 spin_unlock(&clp->cl_lock);
530
531 nfsd4_free_layouts(&reaplist);
532 return 0;
533}
534
535static void
536nfsd4_return_all_layouts(struct nfs4_layout_stateid *ls,
537 struct list_head *reaplist)
538{
539 spin_lock(&ls->ls_lock);
540 list_splice_init(&ls->ls_layouts, reaplist);
541 spin_unlock(&ls->ls_lock);
542}
543
544void
545nfsd4_return_all_client_layouts(struct nfs4_client *clp)
546{
547 struct nfs4_layout_stateid *ls, *n;
548 LIST_HEAD(reaplist);
549
550 spin_lock(&clp->cl_lock);
551 list_for_each_entry_safe(ls, n, &clp->cl_lo_states, ls_perclnt)
552 nfsd4_return_all_layouts(ls, &reaplist);
553 spin_unlock(&clp->cl_lock);
554
555 nfsd4_free_layouts(&reaplist);
556}
557
558void
559nfsd4_return_all_file_layouts(struct nfs4_client *clp, struct nfs4_file *fp)
560{
561 struct nfs4_layout_stateid *ls, *n;
562 LIST_HEAD(reaplist);
563
564 spin_lock(&fp->fi_lock);
565 list_for_each_entry_safe(ls, n, &fp->fi_lo_states, ls_perfile) {
566 if (ls->ls_stid.sc_client == clp)
567 nfsd4_return_all_layouts(ls, &reaplist);
568 }
569 spin_unlock(&fp->fi_lock);
570
571 nfsd4_free_layouts(&reaplist);
572}
573
574static void
575nfsd4_cb_layout_fail(struct nfs4_layout_stateid *ls)
576{
577 struct nfs4_client *clp = ls->ls_stid.sc_client;
578 char addr_str[INET6_ADDRSTRLEN];
579 static char *envp[] = {
580 "HOME=/",
581 "TERM=linux",
582 "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
583 NULL
584 };
585 char *argv[8];
586 int error;
587
588 rpc_ntop((struct sockaddr *)&clp->cl_addr, addr_str, sizeof(addr_str));
589
 590 trace_layout_recall_fail(&ls->ls_stid.sc_stateid);
591
592 printk(KERN_WARNING
593 "nfsd: client %s failed to respond to layout recall. "
594 " Fencing..\n", addr_str);
595
596 argv[0] = "/sbin/nfsd-recall-failed";
597 argv[1] = addr_str;
598 argv[2] = ls->ls_file->f_path.mnt->mnt_sb->s_id;
599 argv[3] = NULL;
600
601 error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
602 if (error) {
603 printk(KERN_ERR "nfsd: fence failed for client %s: %d!\n",
604 addr_str, error);
605 }
606}
607
608static int
609nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task)
610{
611 struct nfs4_layout_stateid *ls =
612 container_of(cb, struct nfs4_layout_stateid, ls_recall);
613 LIST_HEAD(reaplist);
614
615 switch (task->tk_status) {
616 case 0:
617 return 1;
618 case -NFS4ERR_NOMATCHING_LAYOUT:
619 trace_layout_recall_done(&ls->ls_stid.sc_stateid);
620 task->tk_status = 0;
621 return 1;
622 case -NFS4ERR_DELAY:
623 /* Poll the client until it's done with the layout */
 624 /* FIXME: cap the number of retries.
 625 * The pNFS standard states that we need only expire
 626 * the client after at least "lease time", e.g. lease-time * 2,
 627 * when failing to communicate a recall.
 628 */
 629 rpc_delay(task, HZ/100); /* 10 milliseconds */
630 return 0;
631 default:
632 /*
 633 * Unknown error or non-responding client; we'll need to fence.
634 */
635 nfsd4_cb_layout_fail(ls);
636 return -1;
637 }
638}
639
640static void
641nfsd4_cb_layout_release(struct nfsd4_callback *cb)
642{
643 struct nfs4_layout_stateid *ls =
644 container_of(cb, struct nfs4_layout_stateid, ls_recall);
645 LIST_HEAD(reaplist);
646
647 trace_layout_recall_release(&ls->ls_stid.sc_stateid);
648
649 nfsd4_return_all_layouts(ls, &reaplist);
650 nfsd4_free_layouts(&reaplist);
651 nfs4_put_stid(&ls->ls_stid);
652}
653
654static struct nfsd4_callback_ops nfsd4_cb_layout_ops = {
655 .done = nfsd4_cb_layout_done,
656 .release = nfsd4_cb_layout_release,
657};
658
659static bool
660nfsd4_layout_lm_break(struct file_lock *fl)
661{
662 /*
 663 * We don't want the locks code to time out the lease for us;
 664 * we'll remove it ourselves if a layout isn't returned
665 * in time:
666 */
667 fl->fl_break_time = 0;
668 nfsd4_recall_file_layout(fl->fl_owner);
669 return false;
670}
671
672static int
673nfsd4_layout_lm_change(struct file_lock *onlist, int arg,
674 struct list_head *dispose)
675{
676 BUG_ON(!(arg & F_UNLCK));
677 return lease_modify(onlist, arg, dispose);
678}
679
680static const struct lock_manager_operations nfsd4_layouts_lm_ops = {
681 .lm_break = nfsd4_layout_lm_break,
682 .lm_change = nfsd4_layout_lm_change,
683};
684
685int
686nfsd4_init_pnfs(void)
687{
688 int i;
689
690 for (i = 0; i < DEVID_HASH_SIZE; i++)
691 INIT_LIST_HEAD(&nfsd_devid_hash[i]);
692
693 nfs4_layout_cache = kmem_cache_create("nfs4_layout",
694 sizeof(struct nfs4_layout), 0, 0, NULL);
695 if (!nfs4_layout_cache)
696 return -ENOMEM;
697
698 nfs4_layout_stateid_cache = kmem_cache_create("nfs4_layout_stateid",
699 sizeof(struct nfs4_layout_stateid), 0, 0, NULL);
700 if (!nfs4_layout_stateid_cache) {
701 kmem_cache_destroy(nfs4_layout_cache);
702 return -ENOMEM;
703 }
704 return 0;
705}
706
707void
708nfsd4_exit_pnfs(void)
709{
710 int i;
711
712 kmem_cache_destroy(nfs4_layout_cache);
713 kmem_cache_destroy(nfs4_layout_stateid_cache);
714
715 for (i = 0; i < DEVID_HASH_SIZE; i++) {
716 struct nfsd4_deviceid_map *map, *n;
717
718 list_for_each_entry_safe(map, n, &nfsd_devid_hash[i], hash)
719 kfree(map);
720 }
721}
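
Taken together, the pieces of this file give the following recall lifecycle
(a simplified sketch; error paths omitted):

    LAYOUTGET
      -> nfsd4_preprocess_layout_stateid()
           -> nfsd4_alloc_layout_stateid()
                -> nfsd4_layout_setlease()      takes an FL_LAYOUT lease
    conflicting local access breaks the lease
      -> nfsd4_layout_lm_break()
           -> nfsd4_recall_file_layout()        sends CB_LAYOUTRECALL
    client answers
      -> nfsd4_cb_layout_done():
           0 or NFS4ERR_NOMATCHING_LAYOUT       done
           NFS4ERR_DELAY                        retry after 10 ms
           anything else                        nfsd4_cb_layout_fail(),
                                                which fences the client via
                                                /sbin/nfsd-recall-failed
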
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index ac71d13c69ef..d30bea8d0277 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -43,6 +43,8 @@
43#include "current_stateid.h" 43#include "current_stateid.h"
44#include "netns.h" 44#include "netns.h"
45#include "acl.h" 45#include "acl.h"
46#include "pnfs.h"
47#include "trace.h"
46 48
47#ifdef CONFIG_NFSD_V4_SECURITY_LABEL 49#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
48#include <linux/security.h> 50#include <linux/security.h>
@@ -1178,6 +1180,259 @@ nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
1178 return status == nfserr_same ? nfs_ok : status; 1180 return status == nfserr_same ? nfs_ok : status;
1179} 1181}
1180 1182
1183#ifdef CONFIG_NFSD_PNFS
1184static const struct nfsd4_layout_ops *
1185nfsd4_layout_verify(struct svc_export *exp, unsigned int layout_type)
1186{
1187 if (!exp->ex_layout_type) {
1188 dprintk("%s: export does not support pNFS\n", __func__);
1189 return NULL;
1190 }
1191
1192 if (exp->ex_layout_type != layout_type) {
1193 dprintk("%s: layout type %d not supported\n",
1194 __func__, layout_type);
1195 return NULL;
1196 }
1197
1198 return nfsd4_layout_ops[layout_type];
1199}
1200
1201static __be32
1202nfsd4_getdeviceinfo(struct svc_rqst *rqstp,
1203 struct nfsd4_compound_state *cstate,
1204 struct nfsd4_getdeviceinfo *gdp)
1205{
1206 const struct nfsd4_layout_ops *ops;
1207 struct nfsd4_deviceid_map *map;
1208 struct svc_export *exp;
1209 __be32 nfserr;
1210
1211 dprintk("%s: layout_type %u dev_id [0x%llx:0x%x] maxcnt %u\n",
1212 __func__,
1213 gdp->gd_layout_type,
1214 gdp->gd_devid.fsid_idx, gdp->gd_devid.generation,
1215 gdp->gd_maxcount);
1216
1217 map = nfsd4_find_devid_map(gdp->gd_devid.fsid_idx);
1218 if (!map) {
1219 dprintk("%s: couldn't find device ID to export mapping!\n",
1220 __func__);
1221 return nfserr_noent;
1222 }
1223
1224 exp = rqst_exp_find(rqstp, map->fsid_type, map->fsid);
1225 if (IS_ERR(exp)) {
1226 dprintk("%s: could not find device id\n", __func__);
1227 return nfserr_noent;
1228 }
1229
1230 nfserr = nfserr_layoutunavailable;
1231 ops = nfsd4_layout_verify(exp, gdp->gd_layout_type);
1232 if (!ops)
1233 goto out;
1234
1235 nfserr = nfs_ok;
1236 if (gdp->gd_maxcount != 0)
1237 nfserr = ops->proc_getdeviceinfo(exp->ex_path.mnt->mnt_sb, gdp);
1238
1239 gdp->gd_notify_types &= ops->notify_types;
1240 exp_put(exp);
1241out:
1242 return nfserr;
1243}
1244
1245static __be32
1246nfsd4_layoutget(struct svc_rqst *rqstp,
1247 struct nfsd4_compound_state *cstate,
1248 struct nfsd4_layoutget *lgp)
1249{
1250 struct svc_fh *current_fh = &cstate->current_fh;
1251 const struct nfsd4_layout_ops *ops;
1252 struct nfs4_layout_stateid *ls;
1253 __be32 nfserr;
1254 int accmode;
1255
1256 switch (lgp->lg_seg.iomode) {
1257 case IOMODE_READ:
1258 accmode = NFSD_MAY_READ;
1259 break;
1260 case IOMODE_RW:
1261 accmode = NFSD_MAY_READ | NFSD_MAY_WRITE;
1262 break;
1263 default:
1264 dprintk("%s: invalid iomode %d\n",
1265 __func__, lgp->lg_seg.iomode);
1266 nfserr = nfserr_badiomode;
1267 goto out;
1268 }
1269
1270 nfserr = fh_verify(rqstp, current_fh, 0, accmode);
1271 if (nfserr)
1272 goto out;
1273
1274 nfserr = nfserr_layoutunavailable;
1275 ops = nfsd4_layout_verify(current_fh->fh_export, lgp->lg_layout_type);
1276 if (!ops)
1277 goto out;
1278
1279 /*
1280 * Verify minlength and range as per RFC5661:
1281 * o If loga_length is less than loga_minlength,
1282 * the metadata server MUST return NFS4ERR_INVAL.
1283 * o If the sum of loga_offset and loga_minlength exceeds
1284 * NFS4_UINT64_MAX, and loga_minlength is not
1285 * NFS4_UINT64_MAX, the error NFS4ERR_INVAL MUST result.
1286 * o If the sum of loga_offset and loga_length exceeds
1287 * NFS4_UINT64_MAX, and loga_length is not NFS4_UINT64_MAX,
1288 * the error NFS4ERR_INVAL MUST result.
1289 */
1290 nfserr = nfserr_inval;
1291 if (lgp->lg_seg.length < lgp->lg_minlength ||
1292 (lgp->lg_minlength != NFS4_MAX_UINT64 &&
1293 lgp->lg_minlength > NFS4_MAX_UINT64 - lgp->lg_seg.offset) ||
1294 (lgp->lg_seg.length != NFS4_MAX_UINT64 &&
1295 lgp->lg_seg.length > NFS4_MAX_UINT64 - lgp->lg_seg.offset))
1296 goto out;
1297 if (lgp->lg_seg.length == 0)
1298 goto out;
1299
1300 nfserr = nfsd4_preprocess_layout_stateid(rqstp, cstate, &lgp->lg_sid,
1301 true, lgp->lg_layout_type, &ls);
1302 if (nfserr) {
1303 trace_layout_get_lookup_fail(&lgp->lg_sid);
1304 goto out;
1305 }
1306
1307 nfserr = nfserr_recallconflict;
1308 if (atomic_read(&ls->ls_stid.sc_file->fi_lo_recalls))
1309 goto out_put_stid;
1310
1311 nfserr = ops->proc_layoutget(current_fh->fh_dentry->d_inode,
1312 current_fh, lgp);
1313 if (nfserr)
1314 goto out_put_stid;
1315
1316 nfserr = nfsd4_insert_layout(lgp, ls);
1317
1318out_put_stid:
1319 nfs4_put_stid(&ls->ls_stid);
1320out:
1321 return nfserr;
1322}
1323
1324static __be32
1325nfsd4_layoutcommit(struct svc_rqst *rqstp,
1326 struct nfsd4_compound_state *cstate,
1327 struct nfsd4_layoutcommit *lcp)
1328{
1329 const struct nfsd4_layout_seg *seg = &lcp->lc_seg;
1330 struct svc_fh *current_fh = &cstate->current_fh;
1331 const struct nfsd4_layout_ops *ops;
1332 loff_t new_size = lcp->lc_last_wr + 1;
1333 struct inode *inode;
1334 struct nfs4_layout_stateid *ls;
1335 __be32 nfserr;
1336
1337 nfserr = fh_verify(rqstp, current_fh, 0, NFSD_MAY_WRITE);
1338 if (nfserr)
1339 goto out;
1340
1341 nfserr = nfserr_layoutunavailable;
1342 ops = nfsd4_layout_verify(current_fh->fh_export, lcp->lc_layout_type);
1343 if (!ops)
1344 goto out;
1345 inode = current_fh->fh_dentry->d_inode;
1346
1347 nfserr = nfserr_inval;
1348 if (new_size <= seg->offset) {
1349 dprintk("pnfsd: last write before layout segment\n");
1350 goto out;
1351 }
1352 if (new_size > seg->offset + seg->length) {
1353 dprintk("pnfsd: last write beyond layout segment\n");
1354 goto out;
1355 }
1356 if (!lcp->lc_newoffset && new_size > i_size_read(inode)) {
1357 dprintk("pnfsd: layoutcommit beyond EOF\n");
1358 goto out;
1359 }
1360
1361 nfserr = nfsd4_preprocess_layout_stateid(rqstp, cstate, &lcp->lc_sid,
1362 false, lcp->lc_layout_type,
1363 &ls);
1364 if (nfserr) {
1365 trace_layout_commit_lookup_fail(&lcp->lc_sid);
1366 /* fixup error code as per RFC5661 */
1367 if (nfserr == nfserr_bad_stateid)
1368 nfserr = nfserr_badlayout;
1369 goto out;
1370 }
1371
1372 nfserr = ops->proc_layoutcommit(inode, lcp);
1373 if (nfserr)
1374 goto out_put_stid;
1375
1376 if (new_size > i_size_read(inode)) {
1377 lcp->lc_size_chg = 1;
1378 lcp->lc_newsize = new_size;
1379 } else {
1380 lcp->lc_size_chg = 0;
1381 }
1382
1383out_put_stid:
1384 nfs4_put_stid(&ls->ls_stid);
1385out:
1386 return nfserr;
1387}
1388
1389static __be32
1390nfsd4_layoutreturn(struct svc_rqst *rqstp,
1391 struct nfsd4_compound_state *cstate,
1392 struct nfsd4_layoutreturn *lrp)
1393{
1394 struct svc_fh *current_fh = &cstate->current_fh;
1395 __be32 nfserr;
1396
1397 nfserr = fh_verify(rqstp, current_fh, 0, NFSD_MAY_NOP);
1398 if (nfserr)
1399 goto out;
1400
1401 nfserr = nfserr_layoutunavailable;
1402 if (!nfsd4_layout_verify(current_fh->fh_export, lrp->lr_layout_type))
1403 goto out;
1404
1405 switch (lrp->lr_seg.iomode) {
1406 case IOMODE_READ:
1407 case IOMODE_RW:
1408 case IOMODE_ANY:
1409 break;
1410 default:
1411 dprintk("%s: invalid iomode %d\n", __func__,
1412 lrp->lr_seg.iomode);
1413 nfserr = nfserr_inval;
1414 goto out;
1415 }
1416
1417 switch (lrp->lr_return_type) {
1418 case RETURN_FILE:
1419 nfserr = nfsd4_return_file_layouts(rqstp, cstate, lrp);
1420 break;
1421 case RETURN_FSID:
1422 case RETURN_ALL:
1423 nfserr = nfsd4_return_client_layouts(rqstp, cstate, lrp);
1424 break;
1425 default:
1426 dprintk("%s: invalid return_type %d\n", __func__,
1427 lrp->lr_return_type);
1428 nfserr = nfserr_inval;
1429 break;
1430 }
1431out:
1432 return nfserr;
1433}
1434#endif /* CONFIG_NFSD_PNFS */
1435
1181/* 1436/*
1182 * NULL call. 1437 * NULL call.
1183 */ 1438 */
@@ -1679,6 +1934,36 @@ static inline u32 nfsd4_create_session_rsize(struct svc_rqst *rqstp, struct nfsd
1679 op_encode_channel_attrs_maxsz) * sizeof(__be32); 1934 op_encode_channel_attrs_maxsz) * sizeof(__be32);
1680} 1935}
1681 1936
1937#ifdef CONFIG_NFSD_PNFS
1938/*
1939 * At this stage we don't really know what layout driver will handle the request,
1940 * so we need to define an arbitrary upper bound here.
1941 */
1942#define MAX_LAYOUT_SIZE 128
1943static inline u32 nfsd4_layoutget_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
1944{
1945 return (op_encode_hdr_size +
1946 1 /* logr_return_on_close */ +
1947 op_encode_stateid_maxsz +
1948 1 /* nr of layouts */ +
1949 MAX_LAYOUT_SIZE) * sizeof(__be32);
1950}
1951
1952static inline u32 nfsd4_layoutcommit_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
1953{
1954 return (op_encode_hdr_size +
1955 1 /* locr_newsize */ +
1956 2 /* ns_size */) * sizeof(__be32);
1957}
1958
1959static inline u32 nfsd4_layoutreturn_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
1960{
1961 return (op_encode_hdr_size +
1962 1 /* lrs_stateid */ +
1963 op_encode_stateid_maxsz) * sizeof(__be32);
1964}
1965#endif /* CONFIG_NFSD_PNFS */
1966
1682static struct nfsd4_operation nfsd4_ops[] = { 1967static struct nfsd4_operation nfsd4_ops[] = {
1683 [OP_ACCESS] = { 1968 [OP_ACCESS] = {
1684 .op_func = (nfsd4op_func)nfsd4_access, 1969 .op_func = (nfsd4op_func)nfsd4_access,
@@ -1966,6 +2251,31 @@ static struct nfsd4_operation nfsd4_ops[] = {
1966 .op_get_currentstateid = (stateid_getter)nfsd4_get_freestateid, 2251 .op_get_currentstateid = (stateid_getter)nfsd4_get_freestateid,
1967 .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, 2252 .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
1968 }, 2253 },
2254#ifdef CONFIG_NFSD_PNFS
2255 [OP_GETDEVICEINFO] = {
2256 .op_func = (nfsd4op_func)nfsd4_getdeviceinfo,
2257 .op_flags = ALLOWED_WITHOUT_FH,
2258 .op_name = "OP_GETDEVICEINFO",
2259 },
2260 [OP_LAYOUTGET] = {
2261 .op_func = (nfsd4op_func)nfsd4_layoutget,
2262 .op_flags = OP_MODIFIES_SOMETHING,
2263 .op_name = "OP_LAYOUTGET",
2264 .op_rsize_bop = (nfsd4op_rsize)nfsd4_layoutget_rsize,
2265 },
2266 [OP_LAYOUTCOMMIT] = {
2267 .op_func = (nfsd4op_func)nfsd4_layoutcommit,
2268 .op_flags = OP_MODIFIES_SOMETHING,
2269 .op_name = "OP_LAYOUTCOMMIT",
2270 .op_rsize_bop = (nfsd4op_rsize)nfsd4_layoutcommit_rsize,
2271 },
2272 [OP_LAYOUTRETURN] = {
2273 .op_func = (nfsd4op_func)nfsd4_layoutreturn,
2274 .op_flags = OP_MODIFIES_SOMETHING,
2275 .op_name = "OP_LAYOUTRETURN",
2276 .op_rsize_bop = (nfsd4op_rsize)nfsd4_layoutreturn_rsize,
2277 },
2278#endif /* CONFIG_NFSD_PNFS */
1969 2279
1970 /* NFSv4.2 operations */ 2280 /* NFSv4.2 operations */
1971 [OP_ALLOCATE] = { 2281 [OP_ALLOCATE] = {
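
For a sense of scale, and assuming op_encode_hdr_size is the usual two XDR
words (opcode plus status) and op_encode_stateid_maxsz four words for a
16-byte stateid, as defined earlier in this file, the reply-size estimates
above work out to:

    LAYOUTGET     (2 + 1 + 4 + 1 + 128) * 4 = 544 bytes
    LAYOUTCOMMIT  (2 + 1 + 2) * 4           =  20 bytes
    LAYOUTRETURN  (2 + 1 + 4) * 4           =  28 bytes

with the deliberately generous 128-word MAX_LAYOUT_SIZE dominating the
LAYOUTGET estimate.
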
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 3550a9c87616..f6b2a09f793f 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -48,6 +48,7 @@
48#include "current_stateid.h" 48#include "current_stateid.h"
49 49
50#include "netns.h" 50#include "netns.h"
51#include "pnfs.h"
51 52
52#define NFSDDBG_FACILITY NFSDDBG_PROC 53#define NFSDDBG_FACILITY NFSDDBG_PROC
53 54
@@ -150,16 +151,6 @@ renew_client_locked(struct nfs4_client *clp)
150 clp->cl_time = get_seconds(); 151 clp->cl_time = get_seconds();
151} 152}
152 153
153static inline void
154renew_client(struct nfs4_client *clp)
155{
156 struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
157
158 spin_lock(&nn->client_lock);
159 renew_client_locked(clp);
160 spin_unlock(&nn->client_lock);
161}
162
163static void put_client_renew_locked(struct nfs4_client *clp) 154static void put_client_renew_locked(struct nfs4_client *clp)
164{ 155{
165 struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); 156 struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
@@ -282,7 +273,7 @@ static void nfsd4_free_file_rcu(struct rcu_head *rcu)
282 kmem_cache_free(file_slab, fp); 273 kmem_cache_free(file_slab, fp);
283} 274}
284 275
285static inline void 276void
286put_nfs4_file(struct nfs4_file *fi) 277put_nfs4_file(struct nfs4_file *fi)
287{ 278{
288 might_lock(&state_lock); 279 might_lock(&state_lock);
@@ -295,12 +286,6 @@ put_nfs4_file(struct nfs4_file *fi)
295 } 286 }
296} 287}
297 288
298static inline void
299get_nfs4_file(struct nfs4_file *fi)
300{
301 atomic_inc(&fi->fi_ref);
302}
303
304static struct file * 289static struct file *
305__nfs4_get_fd(struct nfs4_file *f, int oflag) 290__nfs4_get_fd(struct nfs4_file *f, int oflag)
306{ 291{
@@ -358,7 +343,7 @@ find_readable_file(struct nfs4_file *f)
358 return ret; 343 return ret;
359} 344}
360 345
361static struct file * 346struct file *
362find_any_file(struct nfs4_file *f) 347find_any_file(struct nfs4_file *f)
363{ 348{
364 struct file *ret; 349 struct file *ret;
@@ -408,14 +393,6 @@ static unsigned int file_hashval(struct knfsd_fh *fh)
408 return nfsd_fh_hashval(fh) & (FILE_HASH_SIZE - 1); 393 return nfsd_fh_hashval(fh) & (FILE_HASH_SIZE - 1);
409} 394}
410 395
411static bool nfsd_fh_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2)
412{
413 return fh1->fh_size == fh2->fh_size &&
414 !memcmp(fh1->fh_base.fh_pad,
415 fh2->fh_base.fh_pad,
416 fh1->fh_size);
417}
418
419static struct hlist_head file_hashtbl[FILE_HASH_SIZE]; 396static struct hlist_head file_hashtbl[FILE_HASH_SIZE];
420 397
421static void 398static void
@@ -494,7 +471,7 @@ static void nfs4_file_put_access(struct nfs4_file *fp, u32 access)
494 __nfs4_file_put_access(fp, O_RDONLY); 471 __nfs4_file_put_access(fp, O_RDONLY);
495} 472}
496 473
497static struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, 474struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl,
498 struct kmem_cache *slab) 475 struct kmem_cache *slab)
499{ 476{
500 struct nfs4_stid *stid; 477 struct nfs4_stid *stid;
@@ -688,17 +665,17 @@ static void nfs4_put_deleg_lease(struct nfs4_file *fp)
688 struct file *filp = NULL; 665 struct file *filp = NULL;
689 666
690 spin_lock(&fp->fi_lock); 667 spin_lock(&fp->fi_lock);
691 if (fp->fi_deleg_file && atomic_dec_and_test(&fp->fi_delegees)) 668 if (fp->fi_deleg_file && --fp->fi_delegees == 0)
692 swap(filp, fp->fi_deleg_file); 669 swap(filp, fp->fi_deleg_file);
693 spin_unlock(&fp->fi_lock); 670 spin_unlock(&fp->fi_lock);
694 671
695 if (filp) { 672 if (filp) {
696 vfs_setlease(filp, F_UNLCK, NULL, NULL); 673 vfs_setlease(filp, F_UNLCK, NULL, (void **)&fp);
697 fput(filp); 674 fput(filp);
698 } 675 }
699} 676}
700 677
701static void unhash_stid(struct nfs4_stid *s) 678void nfs4_unhash_stid(struct nfs4_stid *s)
702{ 679{
703 s->sc_type = 0; 680 s->sc_type = 0;
704} 681}
@@ -1006,7 +983,7 @@ static void unhash_lock_stateid(struct nfs4_ol_stateid *stp)
1006 983
1007 list_del_init(&stp->st_locks); 984 list_del_init(&stp->st_locks);
1008 unhash_ol_stateid(stp); 985 unhash_ol_stateid(stp);
1009 unhash_stid(&stp->st_stid); 986 nfs4_unhash_stid(&stp->st_stid);
1010} 987}
1011 988
1012static void release_lock_stateid(struct nfs4_ol_stateid *stp) 989static void release_lock_stateid(struct nfs4_ol_stateid *stp)
@@ -1518,7 +1495,12 @@ unhash_session(struct nfsd4_session *ses)
1518static int 1495static int
1519STALE_CLIENTID(clientid_t *clid, struct nfsd_net *nn) 1496STALE_CLIENTID(clientid_t *clid, struct nfsd_net *nn)
1520{ 1497{
1521 if (clid->cl_boot == nn->boot_time) 1498 /*
1499 * We're assuming the clid was not given out from a boot
1500 * precisely 2^32 (about 136 years) before this one. That seems
1501 * a safe assumption:
1502 */
1503 if (clid->cl_boot == (u32)nn->boot_time)
1522 return 0; 1504 return 0;
1523 dprintk("NFSD stale clientid (%08x/%08x) boot_time %08lx\n", 1505 dprintk("NFSD stale clientid (%08x/%08x) boot_time %08lx\n",
1524 clid->cl_boot, clid->cl_id, nn->boot_time); 1506 clid->cl_boot, clid->cl_id, nn->boot_time);
@@ -1558,6 +1540,9 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name)
1558 INIT_LIST_HEAD(&clp->cl_lru); 1540 INIT_LIST_HEAD(&clp->cl_lru);
1559 INIT_LIST_HEAD(&clp->cl_callbacks); 1541 INIT_LIST_HEAD(&clp->cl_callbacks);
1560 INIT_LIST_HEAD(&clp->cl_revoked); 1542 INIT_LIST_HEAD(&clp->cl_revoked);
1543#ifdef CONFIG_NFSD_PNFS
1544 INIT_LIST_HEAD(&clp->cl_lo_states);
1545#endif
1561 spin_lock_init(&clp->cl_lock); 1546 spin_lock_init(&clp->cl_lock);
1562 rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table"); 1547 rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table");
1563 return clp; 1548 return clp;
@@ -1662,6 +1647,7 @@ __destroy_client(struct nfs4_client *clp)
1662 nfs4_get_stateowner(&oo->oo_owner); 1647 nfs4_get_stateowner(&oo->oo_owner);
1663 release_openowner(oo); 1648 release_openowner(oo);
1664 } 1649 }
1650 nfsd4_return_all_client_layouts(clp);
1665 nfsd4_shutdown_callback(clp); 1651 nfsd4_shutdown_callback(clp);
1666 if (clp->cl_cb_conn.cb_xprt) 1652 if (clp->cl_cb_conn.cb_xprt)
1667 svc_xprt_put(clp->cl_cb_conn.cb_xprt); 1653 svc_xprt_put(clp->cl_cb_conn.cb_xprt);
@@ -2145,8 +2131,11 @@ nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp,
2145static void 2131static void
2146nfsd4_set_ex_flags(struct nfs4_client *new, struct nfsd4_exchange_id *clid) 2132nfsd4_set_ex_flags(struct nfs4_client *new, struct nfsd4_exchange_id *clid)
2147{ 2133{
2148 /* pNFS is not supported */ 2134#ifdef CONFIG_NFSD_PNFS
2135 new->cl_exchange_flags |= EXCHGID4_FLAG_USE_PNFS_MDS;
2136#else
2149 new->cl_exchange_flags |= EXCHGID4_FLAG_USE_NON_PNFS; 2137 new->cl_exchange_flags |= EXCHGID4_FLAG_USE_NON_PNFS;
2138#endif
2150 2139
2151 /* Referrals are supported, Migration is not. */ 2140 /* Referrals are supported, Migration is not. */
2152 new->cl_exchange_flags |= EXCHGID4_FLAG_SUPP_MOVED_REFER; 2141 new->cl_exchange_flags |= EXCHGID4_FLAG_SUPP_MOVED_REFER;
@@ -3074,6 +3063,10 @@ static void nfsd4_init_file(struct knfsd_fh *fh, unsigned int hashval,
3074 fp->fi_share_deny = 0; 3063 fp->fi_share_deny = 0;
3075 memset(fp->fi_fds, 0, sizeof(fp->fi_fds)); 3064 memset(fp->fi_fds, 0, sizeof(fp->fi_fds));
3076 memset(fp->fi_access, 0, sizeof(fp->fi_access)); 3065 memset(fp->fi_access, 0, sizeof(fp->fi_access));
3066#ifdef CONFIG_NFSD_PNFS
3067 INIT_LIST_HEAD(&fp->fi_lo_states);
3068 atomic_set(&fp->fi_lo_recalls, 0);
3069#endif
3077 hlist_add_head_rcu(&fp->fi_hash, &file_hashtbl[hashval]); 3070 hlist_add_head_rcu(&fp->fi_hash, &file_hashtbl[hashval]);
3078} 3071}
3079 3072
@@ -3300,7 +3293,7 @@ find_file_locked(struct knfsd_fh *fh, unsigned int hashval)
3300 struct nfs4_file *fp; 3293 struct nfs4_file *fp;
3301 3294
3302 hlist_for_each_entry_rcu(fp, &file_hashtbl[hashval], fi_hash) { 3295 hlist_for_each_entry_rcu(fp, &file_hashtbl[hashval], fi_hash) {
3303 if (nfsd_fh_match(&fp->fi_fhandle, fh)) { 3296 if (fh_match(&fp->fi_fhandle, fh)) {
3304 if (atomic_inc_not_zero(&fp->fi_ref)) 3297 if (atomic_inc_not_zero(&fp->fi_ref))
3305 return fp; 3298 return fp;
3306 } 3299 }
@@ -3308,7 +3301,7 @@ find_file_locked(struct knfsd_fh *fh, unsigned int hashval)
3308 return NULL; 3301 return NULL;
3309} 3302}
3310 3303
3311static struct nfs4_file * 3304struct nfs4_file *
3312find_file(struct knfsd_fh *fh) 3305find_file(struct knfsd_fh *fh)
3313{ 3306{
3314 struct nfs4_file *fp; 3307 struct nfs4_file *fp;
@@ -3477,7 +3470,8 @@ nfsd_break_deleg_cb(struct file_lock *fl)
3477} 3470}
3478 3471
3479static int 3472static int
3480nfsd_change_deleg_cb(struct file_lock **onlist, int arg, struct list_head *dispose) 3473nfsd_change_deleg_cb(struct file_lock *onlist, int arg,
3474 struct list_head *dispose)
3481{ 3475{
3482 if (arg & F_UNLCK) 3476 if (arg & F_UNLCK)
3483 return lease_modify(onlist, arg, dispose); 3477 return lease_modify(onlist, arg, dispose);
@@ -3855,12 +3849,12 @@ static int nfs4_setlease(struct nfs4_delegation *dp)
3855 /* Race breaker */ 3849 /* Race breaker */
3856 if (fp->fi_deleg_file) { 3850 if (fp->fi_deleg_file) {
3857 status = 0; 3851 status = 0;
3858 atomic_inc(&fp->fi_delegees); 3852 ++fp->fi_delegees;
3859 hash_delegation_locked(dp, fp); 3853 hash_delegation_locked(dp, fp);
3860 goto out_unlock; 3854 goto out_unlock;
3861 } 3855 }
3862 fp->fi_deleg_file = filp; 3856 fp->fi_deleg_file = filp;
3863 atomic_set(&fp->fi_delegees, 1); 3857 fp->fi_delegees = 1;
3864 hash_delegation_locked(dp, fp); 3858 hash_delegation_locked(dp, fp);
3865 spin_unlock(&fp->fi_lock); 3859 spin_unlock(&fp->fi_lock);
3866 spin_unlock(&state_lock); 3860 spin_unlock(&state_lock);
@@ -3897,11 +3891,11 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh,
3897 status = nfs4_setlease(dp); 3891 status = nfs4_setlease(dp);
3898 goto out; 3892 goto out;
3899 } 3893 }
3900 atomic_inc(&fp->fi_delegees);
3901 if (fp->fi_had_conflict) { 3894 if (fp->fi_had_conflict) {
3902 status = -EAGAIN; 3895 status = -EAGAIN;
3903 goto out_unlock; 3896 goto out_unlock;
3904 } 3897 }
3898 ++fp->fi_delegees;
3905 hash_delegation_locked(dp, fp); 3899 hash_delegation_locked(dp, fp);
3906 status = 0; 3900 status = 0;
3907out_unlock: 3901out_unlock:
@@ -4294,7 +4288,7 @@ laundromat_main(struct work_struct *laundry)
4294 4288
4295static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_ol_stateid *stp) 4289static inline __be32 nfs4_check_fh(struct svc_fh *fhp, struct nfs4_ol_stateid *stp)
4296{ 4290{
4297 if (!nfsd_fh_match(&fhp->fh_handle, &stp->st_stid.sc_file->fi_fhandle)) 4291 if (!fh_match(&fhp->fh_handle, &stp->st_stid.sc_file->fi_fhandle))
4298 return nfserr_bad_stateid; 4292 return nfserr_bad_stateid;
4299 return nfs_ok; 4293 return nfs_ok;
4300} 4294}
@@ -4445,7 +4439,7 @@ out_unlock:
4445 return status; 4439 return status;
4446} 4440}
4447 4441
4448static __be32 4442__be32
4449nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, 4443nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate,
4450 stateid_t *stateid, unsigned char typemask, 4444 stateid_t *stateid, unsigned char typemask,
4451 struct nfs4_stid **s, struct nfsd_net *nn) 4445 struct nfs4_stid **s, struct nfsd_net *nn)
@@ -4859,6 +4853,9 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
4859 update_stateid(&stp->st_stid.sc_stateid); 4853 update_stateid(&stp->st_stid.sc_stateid);
4860 memcpy(&close->cl_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t)); 4854 memcpy(&close->cl_stateid, &stp->st_stid.sc_stateid, sizeof(stateid_t));
4861 4855
4856 nfsd4_return_all_file_layouts(stp->st_stateowner->so_client,
4857 stp->st_stid.sc_file);
4858
4862 nfsd4_close_open_stateid(stp); 4859 nfsd4_close_open_stateid(stp);
4863 4860
4864 /* put reference from nfs4_preprocess_seqid_op */ 4861 /* put reference from nfs4_preprocess_seqid_op */
@@ -5556,10 +5553,11 @@ out_nfserr:
5556static bool 5553static bool
5557check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner) 5554check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner)
5558{ 5555{
5559 struct file_lock **flpp; 5556 struct file_lock *fl;
5560 int status = false; 5557 int status = false;
5561 struct file *filp = find_any_file(fp); 5558 struct file *filp = find_any_file(fp);
5562 struct inode *inode; 5559 struct inode *inode;
5560 struct file_lock_context *flctx;
5563 5561
5564 if (!filp) { 5562 if (!filp) {
5565 /* Any valid lock stateid should have some sort of access */ 5563 /* Any valid lock stateid should have some sort of access */
@@ -5568,15 +5566,18 @@ check_for_locks(struct nfs4_file *fp, struct nfs4_lockowner *lowner)
5568 } 5566 }
5569 5567
5570 inode = file_inode(filp); 5568 inode = file_inode(filp);
5569 flctx = inode->i_flctx;
5571 5570
5572 spin_lock(&inode->i_lock); 5571 if (flctx && !list_empty_careful(&flctx->flc_posix)) {
5573 for (flpp = &inode->i_flock; *flpp != NULL; flpp = &(*flpp)->fl_next) { 5572 spin_lock(&flctx->flc_lock);
5574 if ((*flpp)->fl_owner == (fl_owner_t)lowner) { 5573 list_for_each_entry(fl, &flctx->flc_posix, fl_list) {
5575 status = true; 5574 if (fl->fl_owner == (fl_owner_t)lowner) {
5576 break; 5575 status = true;
5576 break;
5577 }
5577 } 5578 }
5579 spin_unlock(&flctx->flc_lock);
5578 } 5580 }
5579 spin_unlock(&inode->i_lock);
5580 fput(filp); 5581 fput(filp);
5581 return status; 5582 return status;
5582} 5583}
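
The check_for_locks() rewrite above follows the v3.20 file-locking rework: per-inode locks moved off the old inode->i_flock chain (guarded by i_lock) onto a dedicated struct file_lock_context with separate flc_posix/flc_flock/flc_lease lists under their own flc_lock spinlock. A minimal sketch of the new traversal pattern, using the field names of that era (fl_list was renamed in later kernels); this is an illustration, not part of the patch:

/*
 * Sketch: does any POSIX lock on this inode belong to the given owner?
 * Assumes the 3.20-era layout (inode->i_flctx, file_lock.fl_list).
 */
#include <linux/fs.h>
#include <linux/spinlock.h>

static bool owner_has_posix_lock(struct inode *inode, fl_owner_t owner)
{
	struct file_lock_context *flctx = inode->i_flctx;
	struct file_lock *fl;
	bool found = false;

	/* list_empty_careful() lets callers skip taking flc_lock at all */
	if (!flctx || list_empty_careful(&flctx->flc_posix))
		return false;

	spin_lock(&flctx->flc_lock);
	list_for_each_entry(fl, &flctx->flc_posix, fl_list) {
		if (fl->fl_owner == owner) {
			found = true;
			break;
		}
	}
	spin_unlock(&flctx->flc_lock);
	return found;
}
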
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 15f7b73e0c0f..df5e66caf100 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -47,6 +47,7 @@
47#include "state.h" 47#include "state.h"
48#include "cache.h" 48#include "cache.h"
49#include "netns.h" 49#include "netns.h"
50#include "pnfs.h"
50 51
51#ifdef CONFIG_NFSD_V4_SECURITY_LABEL 52#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
52#include <linux/security.h> 53#include <linux/security.h>
@@ -234,6 +235,26 @@ static char *savemem(struct nfsd4_compoundargs *argp, __be32 *p, int nbytes)
234 return ret; 235 return ret;
235} 236}
236 237
238/*
239 * We require the high 32 bits of 'seconds' to be 0, and
240 * we ignore all 32 bits of 'nseconds'.
241 */
242static __be32
243nfsd4_decode_time(struct nfsd4_compoundargs *argp, struct timespec *tv)
244{
245 DECODE_HEAD;
246 u64 sec;
247
248 READ_BUF(12);
249 p = xdr_decode_hyper(p, &sec);
250 tv->tv_sec = sec;
251 tv->tv_nsec = be32_to_cpup(p++);
252 if (tv->tv_nsec >= (u32)1000000000)
253 return nfserr_inval;
254
255 DECODE_TAIL;
256}
257
237static __be32 258static __be32
238nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval) 259nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval)
239{ 260{
@@ -267,7 +288,6 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
267{ 288{
268 int expected_len, len = 0; 289 int expected_len, len = 0;
269 u32 dummy32; 290 u32 dummy32;
270 u64 sec;
271 char *buf; 291 char *buf;
272 292
273 DECODE_HEAD; 293 DECODE_HEAD;
@@ -358,15 +378,10 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
358 dummy32 = be32_to_cpup(p++); 378 dummy32 = be32_to_cpup(p++);
359 switch (dummy32) { 379 switch (dummy32) {
360 case NFS4_SET_TO_CLIENT_TIME: 380 case NFS4_SET_TO_CLIENT_TIME:
361 /* We require the high 32 bits of 'seconds' to be 0, and we ignore
362 all 32 bits of 'nseconds'. */
363 READ_BUF(12);
364 len += 12; 381 len += 12;
365 p = xdr_decode_hyper(p, &sec); 382 status = nfsd4_decode_time(argp, &iattr->ia_atime);
366 iattr->ia_atime.tv_sec = (time_t)sec; 383 if (status)
367 iattr->ia_atime.tv_nsec = be32_to_cpup(p++); 384 return status;
368 if (iattr->ia_atime.tv_nsec >= (u32)1000000000)
369 return nfserr_inval;
370 iattr->ia_valid |= (ATTR_ATIME | ATTR_ATIME_SET); 385 iattr->ia_valid |= (ATTR_ATIME | ATTR_ATIME_SET);
371 break; 386 break;
372 case NFS4_SET_TO_SERVER_TIME: 387 case NFS4_SET_TO_SERVER_TIME:
@@ -382,15 +397,10 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval,
382 dummy32 = be32_to_cpup(p++); 397 dummy32 = be32_to_cpup(p++);
383 switch (dummy32) { 398 switch (dummy32) {
384 case NFS4_SET_TO_CLIENT_TIME: 399 case NFS4_SET_TO_CLIENT_TIME:
385 /* We require the high 32 bits of 'seconds' to be 0, and we ignore
386 all 32 bits of 'nseconds'. */
387 READ_BUF(12);
388 len += 12; 400 len += 12;
389 p = xdr_decode_hyper(p, &sec); 401 status = nfsd4_decode_time(argp, &iattr->ia_mtime);
390 iattr->ia_mtime.tv_sec = sec; 402 if (status)
391 iattr->ia_mtime.tv_nsec = be32_to_cpup(p++); 403 return status;
392 if (iattr->ia_mtime.tv_nsec >= (u32)1000000000)
393 return nfserr_inval;
394 iattr->ia_valid |= (ATTR_MTIME | ATTR_MTIME_SET); 404 iattr->ia_valid |= (ATTR_MTIME | ATTR_MTIME_SET);
395 break; 405 break;
396 case NFS4_SET_TO_SERVER_TIME: 406 case NFS4_SET_TO_SERVER_TIME:
@@ -1513,6 +1523,127 @@ static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, str
1513 DECODE_TAIL; 1523 DECODE_TAIL;
1514} 1524}
1515 1525
1526#ifdef CONFIG_NFSD_PNFS
1527static __be32
1528nfsd4_decode_getdeviceinfo(struct nfsd4_compoundargs *argp,
1529 struct nfsd4_getdeviceinfo *gdev)
1530{
1531 DECODE_HEAD;
1532 u32 num, i;
1533
1534 READ_BUF(sizeof(struct nfsd4_deviceid) + 3 * 4);
1535 COPYMEM(&gdev->gd_devid, sizeof(struct nfsd4_deviceid));
1536 gdev->gd_layout_type = be32_to_cpup(p++);
1537 gdev->gd_maxcount = be32_to_cpup(p++);
1538 num = be32_to_cpup(p++);
1539 if (num) {
1540 READ_BUF(4 * num);
1541 gdev->gd_notify_types = be32_to_cpup(p++);
1542 for (i = 1; i < num; i++) {
1543 if (be32_to_cpup(p++)) {
1544 status = nfserr_inval;
1545 goto out;
1546 }
1547 }
1548 }
1549 DECODE_TAIL;
1550}
1551
1552static __be32
1553nfsd4_decode_layoutget(struct nfsd4_compoundargs *argp,
1554 struct nfsd4_layoutget *lgp)
1555{
1556 DECODE_HEAD;
1557
1558 READ_BUF(36);
1559 lgp->lg_signal = be32_to_cpup(p++);
1560 lgp->lg_layout_type = be32_to_cpup(p++);
1561 lgp->lg_seg.iomode = be32_to_cpup(p++);
1562 p = xdr_decode_hyper(p, &lgp->lg_seg.offset);
1563 p = xdr_decode_hyper(p, &lgp->lg_seg.length);
1564 p = xdr_decode_hyper(p, &lgp->lg_minlength);
1565 nfsd4_decode_stateid(argp, &lgp->lg_sid);
1566 READ_BUF(4);
1567 lgp->lg_maxcount = be32_to_cpup(p++);
1568
1569 DECODE_TAIL;
1570}
1571
1572static __be32
1573nfsd4_decode_layoutcommit(struct nfsd4_compoundargs *argp,
1574 struct nfsd4_layoutcommit *lcp)
1575{
1576 DECODE_HEAD;
1577 u32 timechange;
1578
1579 READ_BUF(20);
1580 p = xdr_decode_hyper(p, &lcp->lc_seg.offset);
1581 p = xdr_decode_hyper(p, &lcp->lc_seg.length);
1582 lcp->lc_reclaim = be32_to_cpup(p++);
1583 nfsd4_decode_stateid(argp, &lcp->lc_sid);
1584 READ_BUF(4);
1585 lcp->lc_newoffset = be32_to_cpup(p++);
1586 if (lcp->lc_newoffset) {
1587 READ_BUF(8);
1588 p = xdr_decode_hyper(p, &lcp->lc_last_wr);
1589 } else
1590 lcp->lc_last_wr = 0;
1591 READ_BUF(4);
1592 timechange = be32_to_cpup(p++);
1593 if (timechange) {
1594 status = nfsd4_decode_time(argp, &lcp->lc_mtime);
1595 if (status)
1596 return status;
1597 } else {
1598 lcp->lc_mtime.tv_nsec = UTIME_NOW;
1599 }
1600 READ_BUF(8);
1601 lcp->lc_layout_type = be32_to_cpup(p++);
1602
1603 /*
1604 * Save the layout update in XDR format and let the layout driver deal
1605 * with it later.
1606 */
1607 lcp->lc_up_len = be32_to_cpup(p++);
1608 if (lcp->lc_up_len > 0) {
1609 READ_BUF(lcp->lc_up_len);
1610 READMEM(lcp->lc_up_layout, lcp->lc_up_len);
1611 }
1612
1613 DECODE_TAIL;
1614}
1615
1616static __be32
1617nfsd4_decode_layoutreturn(struct nfsd4_compoundargs *argp,
1618 struct nfsd4_layoutreturn *lrp)
1619{
1620 DECODE_HEAD;
1621
1622 READ_BUF(16);
1623 lrp->lr_reclaim = be32_to_cpup(p++);
1624 lrp->lr_layout_type = be32_to_cpup(p++);
1625 lrp->lr_seg.iomode = be32_to_cpup(p++);
1626 lrp->lr_return_type = be32_to_cpup(p++);
1627 if (lrp->lr_return_type == RETURN_FILE) {
1628 READ_BUF(16);
1629 p = xdr_decode_hyper(p, &lrp->lr_seg.offset);
1630 p = xdr_decode_hyper(p, &lrp->lr_seg.length);
1631 nfsd4_decode_stateid(argp, &lrp->lr_sid);
1632 READ_BUF(4);
1633 lrp->lrf_body_len = be32_to_cpup(p++);
1634 if (lrp->lrf_body_len > 0) {
1635 READ_BUF(lrp->lrf_body_len);
1636 READMEM(lrp->lrf_body, lrp->lrf_body_len);
1637 }
1638 } else {
1639 lrp->lr_seg.offset = 0;
1640 lrp->lr_seg.length = NFS4_MAX_UINT64;
1641 }
1642
1643 DECODE_TAIL;
1644}
1645#endif /* CONFIG_NFSD_PNFS */
1646
1516static __be32 1647static __be32
1517nfsd4_decode_fallocate(struct nfsd4_compoundargs *argp, 1648nfsd4_decode_fallocate(struct nfsd4_compoundargs *argp,
1518 struct nfsd4_fallocate *fallocate) 1649 struct nfsd4_fallocate *fallocate)
@@ -1607,11 +1738,19 @@ static nfsd4_dec nfsd4_dec_ops[] = {
1607 [OP_DESTROY_SESSION] = (nfsd4_dec)nfsd4_decode_destroy_session, 1738 [OP_DESTROY_SESSION] = (nfsd4_dec)nfsd4_decode_destroy_session,
1608 [OP_FREE_STATEID] = (nfsd4_dec)nfsd4_decode_free_stateid, 1739 [OP_FREE_STATEID] = (nfsd4_dec)nfsd4_decode_free_stateid,
1609 [OP_GET_DIR_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp, 1740 [OP_GET_DIR_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp,
1741#ifdef CONFIG_NFSD_PNFS
1742 [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_getdeviceinfo,
1743 [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_notsupp,
1744 [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_layoutcommit,
1745 [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_layoutget,
1746 [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_layoutreturn,
1747#else
1610 [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_notsupp, 1748 [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_notsupp,
1611 [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_notsupp, 1749 [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_notsupp,
1612 [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_notsupp, 1750 [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_notsupp,
1613 [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_notsupp, 1751 [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_notsupp,
1614 [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_notsupp, 1752 [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_notsupp,
1753#endif
1615 [OP_SECINFO_NO_NAME] = (nfsd4_dec)nfsd4_decode_secinfo_no_name, 1754 [OP_SECINFO_NO_NAME] = (nfsd4_dec)nfsd4_decode_secinfo_no_name,
1616 [OP_SEQUENCE] = (nfsd4_dec)nfsd4_decode_sequence, 1755 [OP_SEQUENCE] = (nfsd4_dec)nfsd4_decode_sequence,
1617 [OP_SET_SSV] = (nfsd4_dec)nfsd4_decode_notsupp, 1756 [OP_SET_SSV] = (nfsd4_dec)nfsd4_decode_notsupp,
@@ -2539,6 +2678,30 @@ out_acl:
2539 get_parent_attributes(exp, &stat); 2678 get_parent_attributes(exp, &stat);
2540 p = xdr_encode_hyper(p, stat.ino); 2679 p = xdr_encode_hyper(p, stat.ino);
2541 } 2680 }
2681#ifdef CONFIG_NFSD_PNFS
2682 if ((bmval1 & FATTR4_WORD1_FS_LAYOUT_TYPES) ||
2683 (bmval2 & FATTR4_WORD2_LAYOUT_TYPES)) {
2684 if (exp->ex_layout_type) {
2685 p = xdr_reserve_space(xdr, 8);
2686 if (!p)
2687 goto out_resource;
2688 *p++ = cpu_to_be32(1);
2689 *p++ = cpu_to_be32(exp->ex_layout_type);
2690 } else {
2691 p = xdr_reserve_space(xdr, 4);
2692 if (!p)
2693 goto out_resource;
2694 *p++ = cpu_to_be32(0);
2695 }
2696 }
2697
2698 if (bmval2 & FATTR4_WORD2_LAYOUT_BLKSIZE) {
2699 p = xdr_reserve_space(xdr, 4);
2700 if (!p)
2701 goto out_resource;
2702 *p++ = cpu_to_be32(stat.blksize);
2703 }
2704#endif /* CONFIG_NFSD_PNFS */
2542 if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) { 2705 if (bmval2 & FATTR4_WORD2_SECURITY_LABEL) {
2543 status = nfsd4_encode_security_label(xdr, rqstp, context, 2706 status = nfsd4_encode_security_label(xdr, rqstp, context,
2544 contextlen); 2707 contextlen);
@@ -2768,16 +2931,17 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
2768 if (entry_bytes > cd->rd_maxcount) 2931 if (entry_bytes > cd->rd_maxcount)
2769 goto fail; 2932 goto fail;
2770 cd->rd_maxcount -= entry_bytes; 2933 cd->rd_maxcount -= entry_bytes;
2771 if (!cd->rd_dircount)
2772 goto fail;
2773 /* 2934 /*
2774 * RFC 3530 14.2.24 describes rd_dircount as only a "hint", so 2935 * RFC 3530 14.2.24 describes rd_dircount as only a "hint", so
2775 * let's always let through the first entry, at least: 2936 * let's always let through the first entry, at least:
2776 */ 2937 */
2777 name_and_cookie = 4 * XDR_QUADLEN(namlen) + 8; 2938 if (!cd->rd_dircount)
2939 goto fail;
2940 name_and_cookie = 4 + 4 * XDR_QUADLEN(namlen) + 8;
2778 if (name_and_cookie > cd->rd_dircount && cd->cookie_offset) 2941 if (name_and_cookie > cd->rd_dircount && cd->cookie_offset)
2779 goto fail; 2942 goto fail;
2780 cd->rd_dircount -= min(cd->rd_dircount, name_and_cookie); 2943 cd->rd_dircount -= min(cd->rd_dircount, name_and_cookie);
2944
2781 cd->cookie_offset = cookie_offset; 2945 cd->cookie_offset = cookie_offset;
2782skip_entry: 2946skip_entry:
2783 cd->common.err = nfs_ok; 2947 cd->common.err = nfs_ok;
@@ -3814,6 +3978,156 @@ nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, __be32 nfserr,
3814 return nfserr; 3978 return nfserr;
3815} 3979}
3816 3980
3981#ifdef CONFIG_NFSD_PNFS
3982static __be32
3983nfsd4_encode_getdeviceinfo(struct nfsd4_compoundres *resp, __be32 nfserr,
3984 struct nfsd4_getdeviceinfo *gdev)
3985{
3986 struct xdr_stream *xdr = &resp->xdr;
3987 const struct nfsd4_layout_ops *ops =
3988 nfsd4_layout_ops[gdev->gd_layout_type];
3989 u32 starting_len = xdr->buf->len, needed_len;
3990 __be32 *p;
3991
3992 dprintk("%s: err %d\n", __func__, nfserr);
3993 if (nfserr)
3994 goto out;
3995
3996 nfserr = nfserr_resource;
3997 p = xdr_reserve_space(xdr, 4);
3998 if (!p)
3999 goto out;
4000
4001 *p++ = cpu_to_be32(gdev->gd_layout_type);
4002
4003 /* If maxcount is 0 then just update notifications */
4004 if (gdev->gd_maxcount != 0) {
4005 nfserr = ops->encode_getdeviceinfo(xdr, gdev);
4006 if (nfserr) {
4007 /*
4008 * We don't bother to burden the layout drivers with
4009 * enforcing gd_maxcount, just tell the client to
4010 * come back with a bigger buffer if it's not enough.
4011 */
4012 if (xdr->buf->len + 4 > gdev->gd_maxcount)
4013 goto toosmall;
4014 goto out;
4015 }
4016 }
4017
4018 nfserr = nfserr_resource;
4019 if (gdev->gd_notify_types) {
4020 p = xdr_reserve_space(xdr, 4 + 4);
4021 if (!p)
4022 goto out;
4023 *p++ = cpu_to_be32(1); /* bitmap length */
4024 *p++ = cpu_to_be32(gdev->gd_notify_types);
4025 } else {
4026 p = xdr_reserve_space(xdr, 4);
4027 if (!p)
4028 goto out;
4029 *p++ = 0;
4030 }
4031
4032 nfserr = 0;
4033out:
4034 kfree(gdev->gd_device);
4035 dprintk("%s: done: %d\n", __func__, be32_to_cpu(nfserr));
4036 return nfserr;
4037
4038toosmall:
4039 dprintk("%s: maxcount too small\n", __func__);
4040 needed_len = xdr->buf->len + 4 /* notifications */;
4041 xdr_truncate_encode(xdr, starting_len);
4042 p = xdr_reserve_space(xdr, 4);
4043 if (!p) {
4044 nfserr = nfserr_resource;
4045 } else {
4046 *p++ = cpu_to_be32(needed_len);
4047 nfserr = nfserr_toosmall;
4048 }
4049 goto out;
4050}
4051
4052static __be32
4053nfsd4_encode_layoutget(struct nfsd4_compoundres *resp, __be32 nfserr,
4054 struct nfsd4_layoutget *lgp)
4055{
4056 struct xdr_stream *xdr = &resp->xdr;
4057 const struct nfsd4_layout_ops *ops =
4058 nfsd4_layout_ops[lgp->lg_layout_type];
4059 __be32 *p;
4060
4061 dprintk("%s: err %d\n", __func__, nfserr);
4062 if (nfserr)
4063 goto out;
4064
4065 nfserr = nfserr_resource;
4066 p = xdr_reserve_space(xdr, 36 + sizeof(stateid_opaque_t));
4067 if (!p)
4068 goto out;
4069
4070 *p++ = cpu_to_be32(1); /* we always set return-on-close */
4071 *p++ = cpu_to_be32(lgp->lg_sid.si_generation);
4072 p = xdr_encode_opaque_fixed(p, &lgp->lg_sid.si_opaque,
4073 sizeof(stateid_opaque_t));
4074
4075 *p++ = cpu_to_be32(1); /* we always return a single layout */
4076 p = xdr_encode_hyper(p, lgp->lg_seg.offset);
4077 p = xdr_encode_hyper(p, lgp->lg_seg.length);
4078 *p++ = cpu_to_be32(lgp->lg_seg.iomode);
4079 *p++ = cpu_to_be32(lgp->lg_layout_type);
4080
4081 nfserr = ops->encode_layoutget(xdr, lgp);
4082out:
4083 kfree(lgp->lg_content);
4084 return nfserr;
4085}
4086
4087static __be32
4088nfsd4_encode_layoutcommit(struct nfsd4_compoundres *resp, __be32 nfserr,
4089 struct nfsd4_layoutcommit *lcp)
4090{
4091 struct xdr_stream *xdr = &resp->xdr;
4092 __be32 *p;
4093
4094 if (nfserr)
4095 return nfserr;
4096
4097 p = xdr_reserve_space(xdr, 4);
4098 if (!p)
4099 return nfserr_resource;
4100 *p++ = cpu_to_be32(lcp->lc_size_chg);
4101 if (lcp->lc_size_chg) {
4102 p = xdr_reserve_space(xdr, 8);
4103 if (!p)
4104 return nfserr_resource;
4105 p = xdr_encode_hyper(p, lcp->lc_newsize);
4106 }
4107
4108 return nfs_ok;
4109}
4110
4111static __be32
4112nfsd4_encode_layoutreturn(struct nfsd4_compoundres *resp, __be32 nfserr,
4113 struct nfsd4_layoutreturn *lrp)
4114{
4115 struct xdr_stream *xdr = &resp->xdr;
4116 __be32 *p;
4117
4118 if (nfserr)
4119 return nfserr;
4120
4121 p = xdr_reserve_space(xdr, 4);
4122 if (!p)
4123 return nfserr_resource;
4124 *p++ = cpu_to_be32(lrp->lrs_present);
4125 if (lrp->lrs_present)
4126 nfsd4_encode_stateid(xdr, &lrp->lr_sid);
4127 return nfs_ok;
4128}
4129#endif /* CONFIG_NFSD_PNFS */
4130
3817static __be32 4131static __be32
3818nfsd4_encode_seek(struct nfsd4_compoundres *resp, __be32 nfserr, 4132nfsd4_encode_seek(struct nfsd4_compoundres *resp, __be32 nfserr,
3819 struct nfsd4_seek *seek) 4133 struct nfsd4_seek *seek)
@@ -3890,11 +4204,19 @@ static nfsd4_enc nfsd4_enc_ops[] = {
3890 [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_noop, 4204 [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_noop,
3891 [OP_FREE_STATEID] = (nfsd4_enc)nfsd4_encode_noop, 4205 [OP_FREE_STATEID] = (nfsd4_enc)nfsd4_encode_noop,
3892 [OP_GET_DIR_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop, 4206 [OP_GET_DIR_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop,
4207#ifdef CONFIG_NFSD_PNFS
4208 [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_getdeviceinfo,
4209 [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop,
4210 [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_layoutcommit,
4211 [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_layoutget,
4212 [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_layoutreturn,
4213#else
3893 [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_noop, 4214 [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_noop,
3894 [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop, 4215 [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop,
3895 [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_noop, 4216 [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_noop,
3896 [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_noop, 4217 [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_noop,
3897 [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_noop, 4218 [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_noop,
4219#endif
3898 [OP_SECINFO_NO_NAME] = (nfsd4_enc)nfsd4_encode_secinfo_no_name, 4220 [OP_SECINFO_NO_NAME] = (nfsd4_enc)nfsd4_encode_secinfo_no_name,
3899 [OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence, 4221 [OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence,
3900 [OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop, 4222 [OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop,
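
The new nfsd4_decode_time() helper above deduplicates the nfstime4 decode that was previously open-coded for both atime and mtime: on the wire the value is a big-endian 64-bit seconds count followed by a 32-bit nanoseconds field, and the server rejects nanoseconds >= 10^9. An illustrative stand-alone decoder of the same wire format, written as a userspace sketch without the nfsd DECODE_HEAD/READ_BUF machinery:

/*
 * Stand-alone nfstime4 decoder sketch: 8 bytes big-endian seconds,
 * then 4 bytes nanoseconds.  'p' must point at 12 valid bytes.
 */
#include <stdint.h>
#include <stdbool.h>
#include <arpa/inet.h>		/* ntohl() */

struct xdr_time {
	int64_t  sec;
	uint32_t nsec;
};

static bool decode_nfstime4(const uint32_t *p, struct xdr_time *tv)
{
	uint64_t sec = ((uint64_t)ntohl(p[0]) << 32) | ntohl(p[1]);

	tv->sec  = (int64_t)sec;
	tv->nsec = ntohl(p[2]);
	/* mirror the server's sanity check on nanoseconds */
	return tv->nsec < 1000000000u;
}
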
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 19ace74d35f6..aa47d75ddb26 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -21,6 +21,7 @@
21#include "cache.h" 21#include "cache.h"
22#include "state.h" 22#include "state.h"
23#include "netns.h" 23#include "netns.h"
24#include "pnfs.h"
24 25
25/* 26/*
26 * We have a single directory with several nodes in it. 27 * We have a single directory with several nodes in it.
@@ -1258,9 +1259,12 @@ static int __init init_nfsd(void)
1258 retval = nfsd4_init_slabs(); 1259 retval = nfsd4_init_slabs();
1259 if (retval) 1260 if (retval)
1260 goto out_unregister_pernet; 1261 goto out_unregister_pernet;
1261 retval = nfsd_fault_inject_init(); /* nfsd fault injection controls */ 1262 retval = nfsd4_init_pnfs();
1262 if (retval) 1263 if (retval)
1263 goto out_free_slabs; 1264 goto out_free_slabs;
1265 retval = nfsd_fault_inject_init(); /* nfsd fault injection controls */
1266 if (retval)
1267 goto out_exit_pnfs;
1264 nfsd_stat_init(); /* Statistics */ 1268 nfsd_stat_init(); /* Statistics */
1265 retval = nfsd_reply_cache_init(); 1269 retval = nfsd_reply_cache_init();
1266 if (retval) 1270 if (retval)
@@ -1282,6 +1286,8 @@ out_free_lockd:
1282out_free_stat: 1286out_free_stat:
1283 nfsd_stat_shutdown(); 1287 nfsd_stat_shutdown();
1284 nfsd_fault_inject_cleanup(); 1288 nfsd_fault_inject_cleanup();
1289out_exit_pnfs:
1290 nfsd4_exit_pnfs();
1285out_free_slabs: 1291out_free_slabs:
1286 nfsd4_free_slabs(); 1292 nfsd4_free_slabs();
1287out_unregister_pernet: 1293out_unregister_pernet:
@@ -1299,6 +1305,7 @@ static void __exit exit_nfsd(void)
1299 nfsd_stat_shutdown(); 1305 nfsd_stat_shutdown();
1300 nfsd_lockd_shutdown(); 1306 nfsd_lockd_shutdown();
1301 nfsd4_free_slabs(); 1307 nfsd4_free_slabs();
1308 nfsd4_exit_pnfs();
1302 nfsd_fault_inject_cleanup(); 1309 nfsd_fault_inject_cleanup();
1303 unregister_filesystem(&nfsd_fs_type); 1310 unregister_filesystem(&nfsd_fs_type);
1304 unregister_pernet_subsys(&nfsd_net_ops); 1311 unregister_pernet_subsys(&nfsd_net_ops);
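
The init_nfsd() hunk above threads nfsd4_init_pnfs() into the existing goto-unwind chain: each setup step gets a cleanup label positioned so failures tear down in exactly the reverse order of initialization, and exit_nfsd() mirrors that order. A generic sketch of the idiom; every step_* name here is hypothetical:

/*
 * Goto-unwind idiom: inserting step_b also inserts the matching
 * out_exit_b target, keeping teardown the exact reverse of setup.
 */
#include <linux/init.h>

int step_a_init(void); void step_a_exit(void);	/* hypothetical steps */
int step_b_init(void); void step_b_exit(void);
int step_c_init(void);

static int __init init_example(void)
{
	int ret;

	ret = step_a_init();
	if (ret)
		return ret;
	ret = step_b_init();		/* the newly inserted step */
	if (ret)
		goto out_exit_a;
	ret = step_c_init();
	if (ret)
		goto out_exit_b;	/* unwind the new step first */
	return 0;

out_exit_b:
	step_b_exit();
out_exit_a:
	step_a_exit();
	return ret;
}
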
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index 33a46a8dfaf7..565c4da1a9eb 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -325,15 +325,27 @@ void nfsd_lockd_shutdown(void);
325 325
326#define NFSD4_SUPPORTED_ATTRS_WORD2 0 326#define NFSD4_SUPPORTED_ATTRS_WORD2 0
327 327
328/* 4.1 */
329#ifdef CONFIG_NFSD_PNFS
330#define PNFSD_SUPPORTED_ATTRS_WORD1 FATTR4_WORD1_FS_LAYOUT_TYPES
331#define PNFSD_SUPPORTED_ATTRS_WORD2 \
332(FATTR4_WORD2_LAYOUT_BLKSIZE | FATTR4_WORD2_LAYOUT_TYPES)
333#else
334#define PNFSD_SUPPORTED_ATTRS_WORD1 0
335#define PNFSD_SUPPORTED_ATTRS_WORD2 0
336#endif /* CONFIG_NFSD_PNFS */
337
328#define NFSD4_1_SUPPORTED_ATTRS_WORD0 \ 338#define NFSD4_1_SUPPORTED_ATTRS_WORD0 \
329 NFSD4_SUPPORTED_ATTRS_WORD0 339 NFSD4_SUPPORTED_ATTRS_WORD0
330 340
331#define NFSD4_1_SUPPORTED_ATTRS_WORD1 \ 341#define NFSD4_1_SUPPORTED_ATTRS_WORD1 \
332 NFSD4_SUPPORTED_ATTRS_WORD1 342 (NFSD4_SUPPORTED_ATTRS_WORD1 | PNFSD_SUPPORTED_ATTRS_WORD1)
333 343
334#define NFSD4_1_SUPPORTED_ATTRS_WORD2 \ 344#define NFSD4_1_SUPPORTED_ATTRS_WORD2 \
335 (NFSD4_SUPPORTED_ATTRS_WORD2 | FATTR4_WORD2_SUPPATTR_EXCLCREAT) 345 (NFSD4_SUPPORTED_ATTRS_WORD2 | PNFSD_SUPPORTED_ATTRS_WORD2 | \
346 FATTR4_WORD2_SUPPATTR_EXCLCREAT)
336 347
348/* 4.2 */
337#ifdef CONFIG_NFSD_V4_SECURITY_LABEL 349#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
338#define NFSD4_2_SECURITY_ATTRS FATTR4_WORD2_SECURITY_LABEL 350#define NFSD4_2_SECURITY_ATTRS FATTR4_WORD2_SECURITY_LABEL
339#else 351#else
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index 08236d70c667..f22920442172 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -187,6 +187,24 @@ fh_init(struct svc_fh *fhp, int maxsize)
187 return fhp; 187 return fhp;
188} 188}
189 189
190static inline bool fh_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2)
191{
192 if (fh1->fh_size != fh2->fh_size)
193 return false;
194 if (memcmp(fh1->fh_base.fh_pad, fh2->fh_base.fh_pad, fh1->fh_size) != 0)
195 return false;
196 return true;
197}
198
199static inline bool fh_fsid_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2)
200{
201 if (fh1->fh_fsid_type != fh2->fh_fsid_type)
202 return false;
203 if (memcmp(fh1->fh_fsid, fh2->fh_fsid, key_len(fh1->fh_fsid_type)) != 0)
204 return false;
205 return true;
206}
207
190#ifdef CONFIG_NFSD_V3 208#ifdef CONFIG_NFSD_V3
191/* 209/*
192 * The wcc data stored in current_fh should be cleared 210 * The wcc data stored in current_fh should be cleared
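
The new fh_match() and fh_fsid_match() inlines above centralize file-handle comparison that callers such as find_file_locked() and nfs4_check_fh() previously did through a file-local nfsd_fh_match(). A sketch of the shared lookup pattern as the nfs4state.c hunks now express it; this mirrors the diff rather than adding behavior, and the caller must hold rcu_read_lock():

/*
 * Sketch: hash-bucket lookup via fh_match(), as in find_file_locked().
 */
#include <linux/rculist.h>
#include "nfsfh.h"
#include "state.h"

static struct nfs4_file *lookup_example(struct hlist_head *bucket,
					struct knfsd_fh *fh)
{
	struct nfs4_file *fp;

	hlist_for_each_entry_rcu(fp, bucket, fi_hash) {
		if (fh_match(&fp->fi_fhandle, fh) &&
		    atomic_inc_not_zero(&fp->fi_ref))
			return fp;	/* returned with a reference held */
	}
	return NULL;
}
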
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 314f5c8f8f1a..9277cc91c21b 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -119,6 +119,7 @@ struct svc_program nfsd_program = {
119static bool nfsd_supported_minorversions[NFSD_SUPPORTED_MINOR_VERSION + 1] = { 119static bool nfsd_supported_minorversions[NFSD_SUPPORTED_MINOR_VERSION + 1] = {
120 [0] = 1, 120 [0] = 1,
121 [1] = 1, 121 [1] = 1,
122 [2] = 1,
122}; 123};
123 124
124int nfsd_vers(int vers, enum vers_op change) 125int nfsd_vers(int vers, enum vers_op change)
diff --git a/fs/nfsd/pnfs.h b/fs/nfsd/pnfs.h
new file mode 100644
index 000000000000..d4c4453674c6
--- /dev/null
+++ b/fs/nfsd/pnfs.h
@@ -0,0 +1,86 @@
1#ifndef _FS_NFSD_PNFS_H
2#define _FS_NFSD_PNFS_H 1
3
4#ifdef CONFIG_NFSD_V4
5#include <linux/exportfs.h>
6#include <linux/nfsd/export.h>
7
8#include "state.h"
9#include "xdr4.h"
10
11struct xdr_stream;
12
13struct nfsd4_deviceid_map {
14 struct list_head hash;
15 u64 idx;
16 int fsid_type;
17 u32 fsid[];
18};
19
20struct nfsd4_layout_ops {
21 u32 notify_types;
22
23 __be32 (*proc_getdeviceinfo)(struct super_block *sb,
24 struct nfsd4_getdeviceinfo *gdevp);
25 __be32 (*encode_getdeviceinfo)(struct xdr_stream *xdr,
26 struct nfsd4_getdeviceinfo *gdevp);
27
28 __be32 (*proc_layoutget)(struct inode *, const struct svc_fh *fhp,
29 struct nfsd4_layoutget *lgp);
30 __be32 (*encode_layoutget)(struct xdr_stream *,
31 struct nfsd4_layoutget *lgp);
32
33 __be32 (*proc_layoutcommit)(struct inode *inode,
34 struct nfsd4_layoutcommit *lcp);
35};
36
37extern const struct nfsd4_layout_ops *nfsd4_layout_ops[];
38extern const struct nfsd4_layout_ops bl_layout_ops;
39
40__be32 nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp,
41 struct nfsd4_compound_state *cstate, stateid_t *stateid,
42 bool create, u32 layout_type, struct nfs4_layout_stateid **lsp);
43__be32 nfsd4_insert_layout(struct nfsd4_layoutget *lgp,
44 struct nfs4_layout_stateid *ls);
45__be32 nfsd4_return_file_layouts(struct svc_rqst *rqstp,
46 struct nfsd4_compound_state *cstate,
47 struct nfsd4_layoutreturn *lrp);
48__be32 nfsd4_return_client_layouts(struct svc_rqst *rqstp,
49 struct nfsd4_compound_state *cstate,
50 struct nfsd4_layoutreturn *lrp);
51int nfsd4_set_deviceid(struct nfsd4_deviceid *id, const struct svc_fh *fhp,
52 u32 device_generation);
53struct nfsd4_deviceid_map *nfsd4_find_devid_map(int idx);
54#endif /* CONFIG_NFSD_V4 */
55
56#ifdef CONFIG_NFSD_PNFS
57void nfsd4_setup_layout_type(struct svc_export *exp);
58void nfsd4_return_all_client_layouts(struct nfs4_client *);
59void nfsd4_return_all_file_layouts(struct nfs4_client *clp,
60 struct nfs4_file *fp);
61int nfsd4_init_pnfs(void);
62void nfsd4_exit_pnfs(void);
63#else
64struct nfs4_client;
65struct nfs4_file;
66
67static inline void nfsd4_setup_layout_type(struct svc_export *exp)
68{
69}
70
71static inline void nfsd4_return_all_client_layouts(struct nfs4_client *clp)
72{
73}
74static inline void nfsd4_return_all_file_layouts(struct nfs4_client *clp,
75 struct nfs4_file *fp)
76{
77}
78static inline void nfsd4_exit_pnfs(void)
79{
80}
81static inline int nfsd4_init_pnfs(void)
82{
83 return 0;
84}
85#endif /* CONFIG_NFSD_PNFS */
86#endif /* _FS_NFSD_PNFS_H */
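
nfsd4_layout_ops[] above is a table of per-layout-type vtables indexed by the NFSv4.1 layout type number; a driver supplies the proc_*/encode_* pairs and is selected by the type the client sends (lg_layout_type and friends). The registration itself lives in nfs4layouts.c, which is not part of this hunk, so the following is only a sketch of how the declared bl_layout_ops would plausibly be wired; LAYOUT_BLOCK_VOLUME and LAYOUT_TYPE_MAX come from include/linux/nfs4.h, and the config symbol is assumed:

/* Sketch only: the real table definition is in nfs4layouts.c. */
#include <linux/nfs4.h>		/* LAYOUT_BLOCK_VOLUME, LAYOUT_TYPE_MAX */
#include "pnfs.h"

const struct nfsd4_layout_ops *nfsd4_layout_ops[LAYOUT_TYPE_MAX] = {
#ifdef CONFIG_NFSD_BLOCKLAYOUT	/* assumed config symbol */
	[LAYOUT_BLOCK_VOLUME] = &bl_layout_ops,
#endif
};
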
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 9d3be371240a..4f3bfeb11766 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -92,6 +92,7 @@ struct nfs4_stid {
92/* For a deleg stateid kept around only to process free_stateid's: */ 92/* For a deleg stateid kept around only to process free_stateid's: */
93#define NFS4_REVOKED_DELEG_STID 16 93#define NFS4_REVOKED_DELEG_STID 16
94#define NFS4_CLOSED_DELEG_STID 32 94#define NFS4_CLOSED_DELEG_STID 32
95#define NFS4_LAYOUT_STID 64
95 unsigned char sc_type; 96 unsigned char sc_type;
96 stateid_t sc_stateid; 97 stateid_t sc_stateid;
97 struct nfs4_client *sc_client; 98 struct nfs4_client *sc_client;
@@ -297,6 +298,9 @@ struct nfs4_client {
297 struct list_head cl_delegations; 298 struct list_head cl_delegations;
298 struct list_head cl_revoked; /* unacknowledged, revoked 4.1 state */ 299 struct list_head cl_revoked; /* unacknowledged, revoked 4.1 state */
299 struct list_head cl_lru; /* tail queue */ 300 struct list_head cl_lru; /* tail queue */
301#ifdef CONFIG_NFSD_PNFS
302 struct list_head cl_lo_states; /* outstanding layout states */
303#endif
300 struct xdr_netobj cl_name; /* id generated by client */ 304 struct xdr_netobj cl_name; /* id generated by client */
301 nfs4_verifier cl_verifier; /* generated by client */ 305 nfs4_verifier cl_verifier; /* generated by client */
302 time_t cl_time; /* time of last lease renewal */ 306 time_t cl_time; /* time of last lease renewal */
@@ -493,9 +497,13 @@ struct nfs4_file {
493 atomic_t fi_access[2]; 497 atomic_t fi_access[2];
494 u32 fi_share_deny; 498 u32 fi_share_deny;
495 struct file *fi_deleg_file; 499 struct file *fi_deleg_file;
496 atomic_t fi_delegees; 500 int fi_delegees;
497 struct knfsd_fh fi_fhandle; 501 struct knfsd_fh fi_fhandle;
498 bool fi_had_conflict; 502 bool fi_had_conflict;
503#ifdef CONFIG_NFSD_PNFS
504 struct list_head fi_lo_states;
505 atomic_t fi_lo_recalls;
506#endif
499}; 507};
500 508
501/* 509/*
@@ -528,6 +536,24 @@ static inline struct nfs4_ol_stateid *openlockstateid(struct nfs4_stid *s)
528 return container_of(s, struct nfs4_ol_stateid, st_stid); 536 return container_of(s, struct nfs4_ol_stateid, st_stid);
529} 537}
530 538
539struct nfs4_layout_stateid {
540 struct nfs4_stid ls_stid;
541 struct list_head ls_perclnt;
542 struct list_head ls_perfile;
543 spinlock_t ls_lock;
544 struct list_head ls_layouts;
545 u32 ls_layout_type;
546 struct file *ls_file;
547 struct nfsd4_callback ls_recall;
548 stateid_t ls_recall_sid;
549 bool ls_recalled;
550};
551
552static inline struct nfs4_layout_stateid *layoutstateid(struct nfs4_stid *s)
553{
554 return container_of(s, struct nfs4_layout_stateid, ls_stid);
555}
556
531/* flags for preprocess_seqid_op() */ 557/* flags for preprocess_seqid_op() */
532#define RD_STATE 0x00000010 558#define RD_STATE 0x00000010
533#define WR_STATE 0x00000020 559#define WR_STATE 0x00000020
@@ -535,6 +561,7 @@ static inline struct nfs4_ol_stateid *openlockstateid(struct nfs4_stid *s)
535enum nfsd4_cb_op { 561enum nfsd4_cb_op {
536 NFSPROC4_CLNT_CB_NULL = 0, 562 NFSPROC4_CLNT_CB_NULL = 0,
537 NFSPROC4_CLNT_CB_RECALL, 563 NFSPROC4_CLNT_CB_RECALL,
564 NFSPROC4_CLNT_CB_LAYOUT,
538 NFSPROC4_CLNT_CB_SEQUENCE, 565 NFSPROC4_CLNT_CB_SEQUENCE,
539}; 566};
540 567
@@ -545,6 +572,12 @@ struct nfsd_net;
545extern __be32 nfs4_preprocess_stateid_op(struct net *net, 572extern __be32 nfs4_preprocess_stateid_op(struct net *net,
546 struct nfsd4_compound_state *cstate, 573 struct nfsd4_compound_state *cstate,
547 stateid_t *stateid, int flags, struct file **filp); 574 stateid_t *stateid, int flags, struct file **filp);
575__be32 nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate,
576 stateid_t *stateid, unsigned char typemask,
577 struct nfs4_stid **s, struct nfsd_net *nn);
578struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl,
579 struct kmem_cache *slab);
580void nfs4_unhash_stid(struct nfs4_stid *s);
548void nfs4_put_stid(struct nfs4_stid *s); 581void nfs4_put_stid(struct nfs4_stid *s);
549void nfs4_remove_reclaim_record(struct nfs4_client_reclaim *, struct nfsd_net *); 582void nfs4_remove_reclaim_record(struct nfs4_client_reclaim *, struct nfsd_net *);
550extern void nfs4_release_reclaim(struct nfsd_net *); 583extern void nfs4_release_reclaim(struct nfsd_net *);
@@ -567,6 +600,14 @@ extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(const char *name,
567 struct nfsd_net *nn); 600 struct nfsd_net *nn);
568extern bool nfs4_has_reclaimed_state(const char *name, struct nfsd_net *nn); 601extern bool nfs4_has_reclaimed_state(const char *name, struct nfsd_net *nn);
569 602
603struct nfs4_file *find_file(struct knfsd_fh *fh);
604void put_nfs4_file(struct nfs4_file *fi);
605static inline void get_nfs4_file(struct nfs4_file *fi)
606{
607 atomic_inc(&fi->fi_ref);
608}
609struct file *find_any_file(struct nfs4_file *f);
610
570/* grace period management */ 611/* grace period management */
571void nfsd4_end_grace(struct nfsd_net *nn); 612void nfsd4_end_grace(struct nfsd_net *nn);
572 613
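
struct nfs4_layout_stateid above embeds the generic nfs4_stid, so layout state reuses the common stateid machinery: a stateid is resolved by typemask through the newly exported nfsd4_lookup_stateid() and then downcast with the layoutstateid() container_of() accessor. A trimmed sketch of that flow; the full logic, including on-demand creation, is in nfsd4_preprocess_layout_stateid(), which is not shown in this diff:

/*
 * Sketch: resolve a layout stateid via the generic stid machinery,
 * then downcast.  Error handling trimmed for brevity.
 */
#include "state.h"

static struct nfs4_layout_stateid *
lookup_layout_stateid(struct nfsd4_compound_state *cstate,
		      stateid_t *stateid, struct nfsd_net *nn)
{
	struct nfs4_stid *s;

	if (nfsd4_lookup_stateid(cstate, stateid, NFS4_LAYOUT_STID, &s, nn))
		return NULL;
	return layoutstateid(s);	/* container_of() downcast */
}
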
diff --git a/fs/nfsd/trace.c b/fs/nfsd/trace.c
new file mode 100644
index 000000000000..82f89070594c
--- /dev/null
+++ b/fs/nfsd/trace.c
@@ -0,0 +1,5 @@
1
2#include "state.h"
3
4#define CREATE_TRACE_POINTS
5#include "trace.h"
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
new file mode 100644
index 000000000000..c668520c344b
--- /dev/null
+++ b/fs/nfsd/trace.h
@@ -0,0 +1,54 @@
1/*
2 * Copyright (c) 2014 Christoph Hellwig.
3 */
4#undef TRACE_SYSTEM
5#define TRACE_SYSTEM nfsd
6
7#if !defined(_NFSD_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
8#define _NFSD_TRACE_H
9
10#include <linux/tracepoint.h>
11
12DECLARE_EVENT_CLASS(nfsd_stateid_class,
13 TP_PROTO(stateid_t *stp),
14 TP_ARGS(stp),
15 TP_STRUCT__entry(
16 __field(u32, cl_boot)
17 __field(u32, cl_id)
18 __field(u32, si_id)
19 __field(u32, si_generation)
20 ),
21 TP_fast_assign(
22 __entry->cl_boot = stp->si_opaque.so_clid.cl_boot;
23 __entry->cl_id = stp->si_opaque.so_clid.cl_id;
24 __entry->si_id = stp->si_opaque.so_id;
25 __entry->si_generation = stp->si_generation;
26 ),
27 TP_printk("client %08x:%08x stateid %08x:%08x",
28 __entry->cl_boot,
29 __entry->cl_id,
30 __entry->si_id,
31 __entry->si_generation)
32)
33
34#define DEFINE_STATEID_EVENT(name) \
35DEFINE_EVENT(nfsd_stateid_class, name, \
36 TP_PROTO(stateid_t *stp), \
37 TP_ARGS(stp))
38DEFINE_STATEID_EVENT(layoutstate_alloc);
39DEFINE_STATEID_EVENT(layoutstate_unhash);
40DEFINE_STATEID_EVENT(layoutstate_free);
41DEFINE_STATEID_EVENT(layout_get_lookup_fail);
42DEFINE_STATEID_EVENT(layout_commit_lookup_fail);
43DEFINE_STATEID_EVENT(layout_return_lookup_fail);
44DEFINE_STATEID_EVENT(layout_recall);
45DEFINE_STATEID_EVENT(layout_recall_done);
46DEFINE_STATEID_EVENT(layout_recall_fail);
47DEFINE_STATEID_EVENT(layout_recall_release);
48
49#endif /* _NFSD_TRACE_H */
50
51#undef TRACE_INCLUDE_PATH
52#define TRACE_INCLUDE_PATH .
53#define TRACE_INCLUDE_FILE trace
54#include <trace/define_trace.h>
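
Each DEFINE_STATEID_EVENT() above expands to a trace_<name>() call point taking a stateid_t * and recording the client id plus stateid under the nfsd trace system. The actual call sites live in nfs4layouts.c rather than in this diff, so the following is a hypothetical example of firing one:

/*
 * Hypothetical tracepoint call site; real ones are in nfs4layouts.c.
 */
#include "state.h"
#include "trace.h"

static void example_trace(struct nfs4_layout_stateid *ls)
{
	trace_layoutstate_alloc(&ls->ls_stid.sc_stateid);
}
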
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index 90a5925bd6ab..0bda93e58e1b 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -428,6 +428,61 @@ struct nfsd4_reclaim_complete {
428 u32 rca_one_fs; 428 u32 rca_one_fs;
429}; 429};
430 430
431struct nfsd4_deviceid {
432 u64 fsid_idx;
433 u32 generation;
434 u32 pad;
435};
436
437struct nfsd4_layout_seg {
438 u32 iomode;
439 u64 offset;
440 u64 length;
441};
442
443struct nfsd4_getdeviceinfo {
444 struct nfsd4_deviceid gd_devid; /* request */
445 u32 gd_layout_type; /* request */
446 u32 gd_maxcount; /* request */
447 u32 gd_notify_types;/* request - response */
448 void *gd_device; /* response */
449};
450
451struct nfsd4_layoutget {
452 u64 lg_minlength; /* request */
453 u32 lg_signal; /* request */
454 u32 lg_layout_type; /* request */
455 u32 lg_maxcount; /* request */
456 stateid_t lg_sid; /* request/response */
457 struct nfsd4_layout_seg lg_seg; /* request/response */
458 void *lg_content; /* response */
459};
460
461struct nfsd4_layoutcommit {
462 stateid_t lc_sid; /* request */
463 struct nfsd4_layout_seg lc_seg; /* request */
464 u32 lc_reclaim; /* request */
465 u32 lc_newoffset; /* request */
466 u64 lc_last_wr; /* request */
467 struct timespec lc_mtime; /* request */
468 u32 lc_layout_type; /* request */
469 u32 lc_up_len; /* layout length */
470 void *lc_up_layout; /* decoded by callback */
471 u32 lc_size_chg; /* boolean for response */
472 u64 lc_newsize; /* response */
473};
474
475struct nfsd4_layoutreturn {
476 u32 lr_return_type; /* request */
477 u32 lr_layout_type; /* request */
478 struct nfsd4_layout_seg lr_seg; /* request */
479 u32 lr_reclaim; /* request */
480 u32 lrf_body_len; /* request */
481 void *lrf_body; /* request */
482 stateid_t lr_sid; /* request/response */
483 u32 lrs_present; /* response */
484};
485
431struct nfsd4_fallocate { 486struct nfsd4_fallocate {
432 /* request */ 487 /* request */
433 stateid_t falloc_stateid; 488 stateid_t falloc_stateid;
@@ -491,6 +546,10 @@ struct nfsd4_op {
491 struct nfsd4_reclaim_complete reclaim_complete; 546 struct nfsd4_reclaim_complete reclaim_complete;
492 struct nfsd4_test_stateid test_stateid; 547 struct nfsd4_test_stateid test_stateid;
493 struct nfsd4_free_stateid free_stateid; 548 struct nfsd4_free_stateid free_stateid;
549 struct nfsd4_getdeviceinfo getdeviceinfo;
550 struct nfsd4_layoutget layoutget;
551 struct nfsd4_layoutcommit layoutcommit;
552 struct nfsd4_layoutreturn layoutreturn;
494 553
495 /* NFSv4.2 */ 554 /* NFSv4.2 */
496 struct nfsd4_fallocate allocate; 555 struct nfsd4_fallocate allocate;
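
One detail worth noting in the structs above: nfsd4_deviceid packs fsid_idx, generation, and pad into exactly the 16 bytes of an NFSv4.1 deviceid4, which is why nfsd4_decode_getdeviceinfo() can COPYMEM() it straight off the XDR stream. A compile-time check along these lines (a sketch, not part of the patch) would document that assumption:

#include <linux/bug.h>
#include "xdr4.h"

static inline void nfsd4_deviceid_size_check(void)
{
	/* deviceid4 is a fixed 16-byte opaque on the wire */
	BUILD_BUG_ON(sizeof(struct nfsd4_deviceid) != 16);
}
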
diff --git a/fs/nfsd/xdr4cb.h b/fs/nfsd/xdr4cb.h
index c5c55dfb91a9..c47f6fdb111a 100644
--- a/fs/nfsd/xdr4cb.h
+++ b/fs/nfsd/xdr4cb.h
@@ -21,3 +21,10 @@
21#define NFS4_dec_cb_recall_sz (cb_compound_dec_hdr_sz + \ 21#define NFS4_dec_cb_recall_sz (cb_compound_dec_hdr_sz + \
22 cb_sequence_dec_sz + \ 22 cb_sequence_dec_sz + \
23 op_dec_sz) 23 op_dec_sz)
24#define NFS4_enc_cb_layout_sz (cb_compound_enc_hdr_sz + \
25 cb_sequence_enc_sz + \
26 1 + 3 + \
27 enc_nfs4_fh_sz + 4)
28#define NFS4_dec_cb_layout_sz (cb_compound_dec_hdr_sz + \
29 cb_sequence_dec_sz + \
30 op_dec_sz)
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 3a03e0aea1fb..a8c728acb7a8 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -128,7 +128,6 @@ static const struct vm_operations_struct nilfs_file_vm_ops = {
128 .fault = filemap_fault, 128 .fault = filemap_fault,
129 .map_pages = filemap_map_pages, 129 .map_pages = filemap_map_pages,
130 .page_mkwrite = nilfs_page_mkwrite, 130 .page_mkwrite = nilfs_page_mkwrite,
131 .remap_pages = generic_file_remap_pages,
132}; 131};
133 132
134static int nilfs_file_mmap(struct file *file, struct vm_area_struct *vma) 133static int nilfs_file_mmap(struct file *file, struct vm_area_struct *vma)
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index 57ceaf33d177..748ca238915a 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -172,7 +172,6 @@ int nilfs_init_gcinode(struct inode *inode)
172 inode->i_mode = S_IFREG; 172 inode->i_mode = S_IFREG;
173 mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); 173 mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS);
174 inode->i_mapping->a_ops = &empty_aops; 174 inode->i_mapping->a_ops = &empty_aops;
175 inode->i_mapping->backing_dev_info = inode->i_sb->s_bdi;
176 175
177 ii->i_flags = 0; 176 ii->i_flags = 0;
178 nilfs_bmap_init_gc(ii->i_bmap); 177 nilfs_bmap_init_gc(ii->i_bmap);
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index c4dcd1db57ee..892cf5ffdb8e 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -429,7 +429,6 @@ int nilfs_mdt_init(struct inode *inode, gfp_t gfp_mask, size_t objsz)
429 429
430 inode->i_mode = S_IFREG; 430 inode->i_mode = S_IFREG;
431 mapping_set_gfp_mask(inode->i_mapping, gfp_mask); 431 mapping_set_gfp_mask(inode->i_mapping, gfp_mask);
432 inode->i_mapping->backing_dev_info = inode->i_sb->s_bdi;
433 432
434 inode->i_op = &def_mdt_iops; 433 inode->i_op = &def_mdt_iops;
435 inode->i_fop = &def_mdt_fops; 434 inode->i_fop = &def_mdt_fops;
@@ -457,13 +456,12 @@ int nilfs_mdt_setup_shadow_map(struct inode *inode,
457 struct nilfs_shadow_map *shadow) 456 struct nilfs_shadow_map *shadow)
458{ 457{
459 struct nilfs_mdt_info *mi = NILFS_MDT(inode); 458 struct nilfs_mdt_info *mi = NILFS_MDT(inode);
460 struct backing_dev_info *bdi = inode->i_sb->s_bdi;
461 459
462 INIT_LIST_HEAD(&shadow->frozen_buffers); 460 INIT_LIST_HEAD(&shadow->frozen_buffers);
463 address_space_init_once(&shadow->frozen_data); 461 address_space_init_once(&shadow->frozen_data);
464 nilfs_mapping_init(&shadow->frozen_data, inode, bdi); 462 nilfs_mapping_init(&shadow->frozen_data, inode);
465 address_space_init_once(&shadow->frozen_btnodes); 463 address_space_init_once(&shadow->frozen_btnodes);
466 nilfs_mapping_init(&shadow->frozen_btnodes, inode, bdi); 464 nilfs_mapping_init(&shadow->frozen_btnodes, inode);
467 mi->mi_shadow = shadow; 465 mi->mi_shadow = shadow;
468 return 0; 466 return 0;
469} 467}
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 91093cd74f0d..385704027575 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -141,7 +141,6 @@ enum {
141 * @ti_save: Backup of journal_info field of task_struct 141 * @ti_save: Backup of journal_info field of task_struct
142 * @ti_flags: Flags 142 * @ti_flags: Flags
143 * @ti_count: Nest level 143 * @ti_count: Nest level
144 * @ti_garbage: List of inode to be put when releasing semaphore
145 */ 144 */
146struct nilfs_transaction_info { 145struct nilfs_transaction_info {
147 u32 ti_magic; 146 u32 ti_magic;
@@ -150,7 +149,6 @@ struct nilfs_transaction_info {
150 one of other filesystems has a bug. */ 149 one of other filesystems has a bug. */
151 unsigned short ti_flags; 150 unsigned short ti_flags;
152 unsigned short ti_count; 151 unsigned short ti_count;
153 struct list_head ti_garbage;
154}; 152};
155 153
156/* ti_magic */ 154/* ti_magic */
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index da276640f776..700ecbcca55d 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -461,14 +461,12 @@ unsigned nilfs_page_count_clean_buffers(struct page *page,
461 return nc; 461 return nc;
462} 462}
463 463
464void nilfs_mapping_init(struct address_space *mapping, struct inode *inode, 464void nilfs_mapping_init(struct address_space *mapping, struct inode *inode)
465 struct backing_dev_info *bdi)
466{ 465{
467 mapping->host = inode; 466 mapping->host = inode;
468 mapping->flags = 0; 467 mapping->flags = 0;
469 mapping_set_gfp_mask(mapping, GFP_NOFS); 468 mapping_set_gfp_mask(mapping, GFP_NOFS);
470 mapping->private_data = NULL; 469 mapping->private_data = NULL;
471 mapping->backing_dev_info = bdi;
472 mapping->a_ops = &empty_aops; 470 mapping->a_ops = &empty_aops;
473} 471}
474 472
diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h
index ef30c5c2426f..a43b8287d012 100644
--- a/fs/nilfs2/page.h
+++ b/fs/nilfs2/page.h
@@ -57,8 +57,7 @@ int nilfs_copy_dirty_pages(struct address_space *, struct address_space *);
57void nilfs_copy_back_pages(struct address_space *, struct address_space *); 57void nilfs_copy_back_pages(struct address_space *, struct address_space *);
58void nilfs_clear_dirty_page(struct page *, bool); 58void nilfs_clear_dirty_page(struct page *, bool);
59void nilfs_clear_dirty_pages(struct address_space *, bool); 59void nilfs_clear_dirty_pages(struct address_space *, bool);
60void nilfs_mapping_init(struct address_space *mapping, struct inode *inode, 60void nilfs_mapping_init(struct address_space *mapping, struct inode *inode);
61 struct backing_dev_info *bdi);
62unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned); 61unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned);
63unsigned long nilfs_find_uncommitted_extent(struct inode *inode, 62unsigned long nilfs_find_uncommitted_extent(struct inode *inode,
64 sector_t start_blk, 63 sector_t start_blk,
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index 7ef18fc656c2..469086b9f99b 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -305,7 +305,6 @@ static void nilfs_transaction_lock(struct super_block *sb,
305 ti->ti_count = 0; 305 ti->ti_count = 0;
306 ti->ti_save = cur_ti; 306 ti->ti_save = cur_ti;
307 ti->ti_magic = NILFS_TI_MAGIC; 307 ti->ti_magic = NILFS_TI_MAGIC;
308 INIT_LIST_HEAD(&ti->ti_garbage);
309 current->journal_info = ti; 308 current->journal_info = ti;
310 309
311 for (;;) { 310 for (;;) {
@@ -332,8 +331,6 @@ static void nilfs_transaction_unlock(struct super_block *sb)
332 331
333 up_write(&nilfs->ns_segctor_sem); 332 up_write(&nilfs->ns_segctor_sem);
334 current->journal_info = ti->ti_save; 333 current->journal_info = ti->ti_save;
335 if (!list_empty(&ti->ti_garbage))
336 nilfs_dispose_list(nilfs, &ti->ti_garbage, 0);
337} 334}
338 335
339static void *nilfs_segctor_map_segsum_entry(struct nilfs_sc_info *sci, 336static void *nilfs_segctor_map_segsum_entry(struct nilfs_sc_info *sci,
@@ -746,6 +743,15 @@ static void nilfs_dispose_list(struct the_nilfs *nilfs,
746 } 743 }
747} 744}
748 745
746static void nilfs_iput_work_func(struct work_struct *work)
747{
748 struct nilfs_sc_info *sci = container_of(work, struct nilfs_sc_info,
749 sc_iput_work);
750 struct the_nilfs *nilfs = sci->sc_super->s_fs_info;
751
752 nilfs_dispose_list(nilfs, &sci->sc_iput_queue, 0);
753}
754
749static int nilfs_test_metadata_dirty(struct the_nilfs *nilfs, 755static int nilfs_test_metadata_dirty(struct the_nilfs *nilfs,
750 struct nilfs_root *root) 756 struct nilfs_root *root)
751{ 757{
@@ -1900,8 +1906,8 @@ static int nilfs_segctor_collect_dirty_files(struct nilfs_sc_info *sci,
1900static void nilfs_segctor_drop_written_files(struct nilfs_sc_info *sci, 1906static void nilfs_segctor_drop_written_files(struct nilfs_sc_info *sci,
1901 struct the_nilfs *nilfs) 1907 struct the_nilfs *nilfs)
1902{ 1908{
1903 struct nilfs_transaction_info *ti = current->journal_info;
1904 struct nilfs_inode_info *ii, *n; 1909 struct nilfs_inode_info *ii, *n;
1910 int defer_iput = false;
1905 1911
1906 spin_lock(&nilfs->ns_inode_lock); 1912 spin_lock(&nilfs->ns_inode_lock);
1907 list_for_each_entry_safe(ii, n, &sci->sc_dirty_files, i_dirty) { 1913 list_for_each_entry_safe(ii, n, &sci->sc_dirty_files, i_dirty) {
@@ -1912,9 +1918,24 @@ static void nilfs_segctor_drop_written_files(struct nilfs_sc_info *sci,
1912 clear_bit(NILFS_I_BUSY, &ii->i_state); 1918 clear_bit(NILFS_I_BUSY, &ii->i_state);
1913 brelse(ii->i_bh); 1919 brelse(ii->i_bh);
1914 ii->i_bh = NULL; 1920 ii->i_bh = NULL;
1915 list_move_tail(&ii->i_dirty, &ti->ti_garbage); 1921 list_del_init(&ii->i_dirty);
1922 if (!ii->vfs_inode.i_nlink) {
1923 /*
1924 * Defer calling iput() to avoid a deadlock
1925 * over I_SYNC flag for inodes with i_nlink == 0
1926 */
1927 list_add_tail(&ii->i_dirty, &sci->sc_iput_queue);
1928 defer_iput = true;
1929 } else {
1930 spin_unlock(&nilfs->ns_inode_lock);
1931 iput(&ii->vfs_inode);
1932 spin_lock(&nilfs->ns_inode_lock);
1933 }
1916 } 1934 }
1917 spin_unlock(&nilfs->ns_inode_lock); 1935 spin_unlock(&nilfs->ns_inode_lock);
1936
1937 if (defer_iput)
1938 schedule_work(&sci->sc_iput_work);
1918} 1939}
1919 1940
1920/* 1941/*
@@ -2583,6 +2604,8 @@ static struct nilfs_sc_info *nilfs_segctor_new(struct super_block *sb,
2583 INIT_LIST_HEAD(&sci->sc_segbufs); 2604 INIT_LIST_HEAD(&sci->sc_segbufs);
2584 INIT_LIST_HEAD(&sci->sc_write_logs); 2605 INIT_LIST_HEAD(&sci->sc_write_logs);
2585 INIT_LIST_HEAD(&sci->sc_gc_inodes); 2606 INIT_LIST_HEAD(&sci->sc_gc_inodes);
2607 INIT_LIST_HEAD(&sci->sc_iput_queue);
2608 INIT_WORK(&sci->sc_iput_work, nilfs_iput_work_func);
2586 init_timer(&sci->sc_timer); 2609 init_timer(&sci->sc_timer);
2587 2610
2588 sci->sc_interval = HZ * NILFS_SC_DEFAULT_TIMEOUT; 2611 sci->sc_interval = HZ * NILFS_SC_DEFAULT_TIMEOUT;
@@ -2609,6 +2632,8 @@ static void nilfs_segctor_write_out(struct nilfs_sc_info *sci)
2609 ret = nilfs_segctor_construct(sci, SC_LSEG_SR); 2632 ret = nilfs_segctor_construct(sci, SC_LSEG_SR);
2610 nilfs_transaction_unlock(sci->sc_super); 2633 nilfs_transaction_unlock(sci->sc_super);
2611 2634
2635 flush_work(&sci->sc_iput_work);
2636
2612 } while (ret && retrycount-- > 0); 2637 } while (ret && retrycount-- > 0);
2613} 2638}
2614 2639
@@ -2633,6 +2658,9 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
2633 || sci->sc_seq_request != sci->sc_seq_done); 2658 || sci->sc_seq_request != sci->sc_seq_done);
2634 spin_unlock(&sci->sc_state_lock); 2659 spin_unlock(&sci->sc_state_lock);
2635 2660
2661 if (flush_work(&sci->sc_iput_work))
2662 flag = true;
2663
2636 if (flag || !nilfs_segctor_confirm(sci)) 2664 if (flag || !nilfs_segctor_confirm(sci))
2637 nilfs_segctor_write_out(sci); 2665 nilfs_segctor_write_out(sci);
2638 2666
@@ -2642,6 +2670,12 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
2642 nilfs_dispose_list(nilfs, &sci->sc_dirty_files, 1); 2670 nilfs_dispose_list(nilfs, &sci->sc_dirty_files, 1);
2643 } 2671 }
2644 2672
2673 if (!list_empty(&sci->sc_iput_queue)) {
2674 nilfs_warning(sci->sc_super, __func__,
2675 "iput queue is not empty\n");
2676 nilfs_dispose_list(nilfs, &sci->sc_iput_queue, 1);
2677 }
2678
2645 WARN_ON(!list_empty(&sci->sc_segbufs)); 2679 WARN_ON(!list_empty(&sci->sc_segbufs));
2646 WARN_ON(!list_empty(&sci->sc_write_logs)); 2680 WARN_ON(!list_empty(&sci->sc_write_logs));
2647 2681
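
The segment.c hunks above replace the per-transaction ti_garbage list with a deferred-iput queue: calling iput() on an i_nlink == 0 inode from the segment constructor can re-enter writeback and deadlock on the I_SYNC flag, so such inodes are parked on sc_iput_queue and released later from sc_iput_work. A generic sketch of this defer-to-workqueue pattern; the names below are illustrative, not the nilfs2 ones:

/*
 * Defer-to-workqueue pattern: park entries that cannot be released in
 * the current context, drain them later from process context.
 */
#include <linux/kernel.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/workqueue.h>

struct deferred_release {
	spinlock_t		lock;
	struct list_head	queue;
	struct work_struct	work;
};

static void release_work_fn(struct work_struct *work)
{
	struct deferred_release *dr =
		container_of(work, struct deferred_release, work);

	/* drain dr->queue under dr->lock, releasing each entry */
}

static void defer_release(struct deferred_release *dr, struct list_head *item)
{
	spin_lock(&dr->lock);
	list_add_tail(item, &dr->queue);
	spin_unlock(&dr->lock);
	schedule_work(&dr->work);	/* release_work_fn() runs later */
}
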
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index 38a1d0013314..a48d6de1e02c 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -26,6 +26,7 @@
26#include <linux/types.h> 26#include <linux/types.h>
27#include <linux/fs.h> 27#include <linux/fs.h>
28#include <linux/buffer_head.h> 28#include <linux/buffer_head.h>
29#include <linux/workqueue.h>
29#include <linux/nilfs2_fs.h> 30#include <linux/nilfs2_fs.h>
30#include "nilfs.h" 31#include "nilfs.h"
31 32
@@ -92,6 +93,8 @@ struct nilfs_segsum_pointer {
92 * @sc_nblk_inc: Block count of current generation 93 * @sc_nblk_inc: Block count of current generation
93 * @sc_dirty_files: List of files to be written 94 * @sc_dirty_files: List of files to be written
94 * @sc_gc_inodes: List of GC inodes having blocks to be written 95 * @sc_gc_inodes: List of GC inodes having blocks to be written
96 * @sc_iput_queue: list of inodes for which iput should be done
97 * @sc_iput_work: work struct to defer iput call
95 * @sc_freesegs: array of segment numbers to be freed 98 * @sc_freesegs: array of segment numbers to be freed
96 * @sc_nfreesegs: number of segments on @sc_freesegs 99 * @sc_nfreesegs: number of segments on @sc_freesegs
97 * @sc_dsync_inode: inode whose data pages are written for a sync operation 100 * @sc_dsync_inode: inode whose data pages are written for a sync operation
@@ -135,6 +138,8 @@ struct nilfs_sc_info {
135 138
136 struct list_head sc_dirty_files; 139 struct list_head sc_dirty_files;
137 struct list_head sc_gc_inodes; 140 struct list_head sc_gc_inodes;
141 struct list_head sc_iput_queue;
142 struct work_struct sc_iput_work;
138 143
139 __u64 *sc_freesegs; 144 __u64 *sc_freesegs;
140 size_t sc_nfreesegs; 145 size_t sc_nfreesegs;
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 2e5b3ec85b8f..5bc2a1cf73c3 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -166,7 +166,7 @@ struct inode *nilfs_alloc_inode(struct super_block *sb)
166 ii->i_state = 0; 166 ii->i_state = 0;
167 ii->i_cno = 0; 167 ii->i_cno = 0;
168 ii->vfs_inode.i_version = 1; 168 ii->vfs_inode.i_version = 1;
169 nilfs_mapping_init(&ii->i_btnode_cache, &ii->vfs_inode, sb->s_bdi); 169 nilfs_mapping_init(&ii->i_btnode_cache, &ii->vfs_inode);
170 return &ii->vfs_inode; 170 return &ii->vfs_inode;
171} 171}
172 172
@@ -1057,7 +1057,6 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent)
1057{ 1057{
1058 struct the_nilfs *nilfs; 1058 struct the_nilfs *nilfs;
1059 struct nilfs_root *fsroot; 1059 struct nilfs_root *fsroot;
1060 struct backing_dev_info *bdi;
1061 __u64 cno; 1060 __u64 cno;
1062 int err; 1061 int err;
1063 1062
@@ -1077,8 +1076,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent)
1077 sb->s_time_gran = 1; 1076 sb->s_time_gran = 1;
1078 sb->s_max_links = NILFS_LINK_MAX; 1077 sb->s_max_links = NILFS_LINK_MAX;
1079 1078
1080 bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info; 1079 sb->s_bdi = &bdev_get_queue(sb->s_bdev)->backing_dev_info;
1081 sb->s_bdi = bdi ? : &default_backing_dev_info;
1082 1080
1083 err = load_nilfs(nilfs, sb); 1081 err = load_nilfs(nilfs, sb);
1084 if (err) 1082 if (err)
diff --git a/fs/notify/Kconfig b/fs/notify/Kconfig
index 22c629eedd82..2a24249b30af 100644
--- a/fs/notify/Kconfig
+++ b/fs/notify/Kconfig
@@ -1,5 +1,6 @@
1config FSNOTIFY 1config FSNOTIFY
2 def_bool n 2 def_bool n
3 select SRCU
3 4
4source "fs/notify/dnotify/Kconfig" 5source "fs/notify/dnotify/Kconfig"
5source "fs/notify/inotify/Kconfig" 6source "fs/notify/inotify/Kconfig"
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 30d3addfad75..51ceb8107284 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -140,7 +140,7 @@ static bool fanotify_should_send_event(struct fsnotify_mark *inode_mark,
140 } 140 }
141 141
142 if (S_ISDIR(path->dentry->d_inode->i_mode) && 142 if (S_ISDIR(path->dentry->d_inode->i_mode) &&
143 (marks_ignored_mask & FS_ISDIR)) 143 !(marks_mask & FS_ISDIR & ~marks_ignored_mask))
144 return false; 144 return false;
145 145
146 if (event_mask & marks_mask & ~marks_ignored_mask) 146 if (event_mask & marks_mask & ~marks_ignored_mask)
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index c991616acca9..cf275500a665 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -259,16 +259,15 @@ static ssize_t fanotify_read(struct file *file, char __user *buf,
259 struct fsnotify_event *kevent; 259 struct fsnotify_event *kevent;
260 char __user *start; 260 char __user *start;
261 int ret; 261 int ret;
262 DEFINE_WAIT(wait); 262 DEFINE_WAIT_FUNC(wait, woken_wake_function);
263 263
264 start = buf; 264 start = buf;
265 group = file->private_data; 265 group = file->private_data;
266 266
267 pr_debug("%s: group=%p\n", __func__, group); 267 pr_debug("%s: group=%p\n", __func__, group);
268 268
269 add_wait_queue(&group->notification_waitq, &wait);
269 while (1) { 270 while (1) {
270 prepare_to_wait(&group->notification_waitq, &wait, TASK_INTERRUPTIBLE);
271
272 mutex_lock(&group->notification_mutex); 271 mutex_lock(&group->notification_mutex);
273 kevent = get_one_event(group, count); 272 kevent = get_one_event(group, count);
274 mutex_unlock(&group->notification_mutex); 273 mutex_unlock(&group->notification_mutex);
@@ -289,7 +288,8 @@ static ssize_t fanotify_read(struct file *file, char __user *buf,
289 288
290 if (start != buf) 289 if (start != buf)
291 break; 290 break;
292 schedule(); 291
292 wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
293 continue; 293 continue;
294 } 294 }
295 295
@@ -318,8 +318,8 @@ static ssize_t fanotify_read(struct file *file, char __user *buf,
318 buf += ret; 318 buf += ret;
319 count -= ret; 319 count -= ret;
320 } 320 }
321 remove_wait_queue(&group->notification_waitq, &wait);
321 322
322 finish_wait(&group->notification_waitq, &wait);
323 if (start != buf && ret != -EFAULT) 323 if (start != buf && ret != -EFAULT)
324 ret = buf - start; 324 ret = buf - start;
325 return ret; 325 return ret;
@@ -487,20 +487,27 @@ static __u32 fanotify_mark_remove_from_mask(struct fsnotify_mark *fsn_mark,
					    unsigned int flags,
					    int *destroy)
 {
-	__u32 oldmask;
+	__u32 oldmask = 0;
 
 	spin_lock(&fsn_mark->lock);
 	if (!(flags & FAN_MARK_IGNORED_MASK)) {
+		__u32 tmask = fsn_mark->mask & ~mask;
+
+		if (flags & FAN_MARK_ONDIR)
+			tmask &= ~FAN_ONDIR;
+
 		oldmask = fsn_mark->mask;
-		fsnotify_set_mark_mask_locked(fsn_mark, (oldmask & ~mask));
+		fsnotify_set_mark_mask_locked(fsn_mark, tmask);
 	} else {
-		oldmask = fsn_mark->ignored_mask;
-		fsnotify_set_mark_ignored_mask_locked(fsn_mark, (oldmask & ~mask));
+		__u32 tmask = fsn_mark->ignored_mask & ~mask;
+		if (flags & FAN_MARK_ONDIR)
+			tmask &= ~FAN_ONDIR;
+
+		fsnotify_set_mark_ignored_mask_locked(fsn_mark, tmask);
 	}
+	*destroy = !(fsn_mark->mask | fsn_mark->ignored_mask);
 	spin_unlock(&fsn_mark->lock);
 
-	*destroy = !(oldmask & ~mask);
-
 	return mask & oldmask;
 }
 
@@ -569,20 +576,22 @@ static __u32 fanotify_mark_add_to_mask(struct fsnotify_mark *fsn_mark,
 
 	spin_lock(&fsn_mark->lock);
 	if (!(flags & FAN_MARK_IGNORED_MASK)) {
+		__u32 tmask = fsn_mark->mask | mask;
+
+		if (flags & FAN_MARK_ONDIR)
+			tmask |= FAN_ONDIR;
+
 		oldmask = fsn_mark->mask;
-		fsnotify_set_mark_mask_locked(fsn_mark, (oldmask | mask));
+		fsnotify_set_mark_mask_locked(fsn_mark, tmask);
 	} else {
 		__u32 tmask = fsn_mark->ignored_mask | mask;
+		if (flags & FAN_MARK_ONDIR)
+			tmask |= FAN_ONDIR;
+
 		fsnotify_set_mark_ignored_mask_locked(fsn_mark, tmask);
 		if (flags & FAN_MARK_IGNORED_SURV_MODIFY)
 			fsn_mark->flags |= FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY;
 	}
-
-	if (!(flags & FAN_MARK_ONDIR)) {
-		__u32 tmask = fsn_mark->ignored_mask | FAN_ONDIR;
-		fsnotify_set_mark_ignored_mask_locked(fsn_mark, tmask);
-	}
-
 	spin_unlock(&fsn_mark->lock);
 
 	return mask & ~oldmask;
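With this change FAN_ONDIR is carried directly in the mark's own mask (or ignored mask) and set only when the caller passes FAN_MARK_ONDIR, instead of the old behaviour of implicitly ignoring directory events whenever the flag was absent. A minimal userspace sketch of how a listener opts into directory events:

#include <fcntl.h>
#include <stdio.h>
#include <sys/fanotify.h>

int main(void)
{
	int fd = fanotify_init(FAN_CLASS_NOTIF, O_RDONLY);
	if (fd < 0) {
		perror("fanotify_init");
		return 1;
	}
	/* Without FAN_MARK_ONDIR, FAN_OPEN fires for files only;
	 * adding it requests the event for directory opens as well. */
	if (fanotify_mark(fd, FAN_MARK_ADD | FAN_MARK_ONDIR,
			  FAN_OPEN, AT_FDCWD, "/tmp") < 0)
		perror("fanotify_mark");
	return 0;
}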
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 643faa44f22b..1da9b2d184dc 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -19,6 +19,7 @@
  * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 
+#include <linux/backing-dev.h>
 #include <linux/buffer_head.h>
 #include <linux/gfp.h>
 #include <linux/pagemap.h>
@@ -2091,7 +2092,7 @@ static ssize_t ntfs_file_aio_write_nolock(struct kiocb *iocb,
 	count = iov_length(iov, nr_segs);
 	pos = *ppos;
 	/* We can write back this queue in page reclaim. */
-	current->backing_dev_info = mapping->backing_dev_info;
+	current->backing_dev_info = inode_to_bdi(inode);
 	written = 0;
 	err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
 	if (err)
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index 7e8282dcea2a..c58a1bcfda0f 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -245,16 +245,14 @@ int ocfs2_set_acl(handle_t *handle,
 		ret = posix_acl_equiv_mode(acl, &mode);
 		if (ret < 0)
 			return ret;
-		else {
-			if (ret == 0)
-				acl = NULL;
 
-			ret = ocfs2_acl_set_mode(inode, di_bh,
-						 handle, mode);
-			if (ret)
-				return ret;
+		if (ret == 0)
+			acl = NULL;
 
-		}
+		ret = ocfs2_acl_set_mode(inode, di_bh,
+					 handle, mode);
+		if (ret)
+			return ret;
 	}
 	break;
 case ACL_TYPE_DEFAULT:
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index fcae9ef1a328..044158bd22be 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -6873,7 +6873,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
 	if (IS_ERR(handle)) {
 		ret = PTR_ERR(handle);
 		mlog_errno(ret);
-		goto out_unlock;
+		goto out;
 	}
 
 	ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
@@ -6931,7 +6931,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
 	if (ret) {
 		mlog_errno(ret);
 		need_free = 1;
-		goto out_commit;
+		goto out_unlock;
 	}
 
 	page_end = PAGE_CACHE_SIZE;
@@ -6964,12 +6964,16 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
 		if (ret) {
 			mlog_errno(ret);
 			need_free = 1;
-			goto out_commit;
+			goto out_unlock;
 		}
 
 		inode->i_blocks = ocfs2_inode_sector_count(inode);
 	}
 
+out_unlock:
+	if (pages)
+		ocfs2_unlock_and_free_pages(pages, num_pages);
+
 out_commit:
 	if (ret < 0 && did_quota)
 		dquot_free_space_nodirty(inode,
@@ -6989,15 +6993,11 @@ out_commit:
 
 	ocfs2_commit_trans(osb, handle);
 
-out_unlock:
+out:
 	if (data_ac)
 		ocfs2_free_alloc_context(data_ac);
-
-out:
-	if (pages) {
-		ocfs2_unlock_and_free_pages(pages, num_pages);
+	if (pages)
 		kfree(pages);
-	}
 
 	return ret;
 }
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 46d93e941f3d..44db1808cdb5 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -28,6 +28,7 @@
 #include <linux/pipe_fs_i.h>
 #include <linux/mpage.h>
 #include <linux/quotaops.h>
+#include <linux/blkdev.h>
 
 #include <cluster/masklog.h>
 
@@ -47,6 +48,9 @@
 #include "ocfs2_trace.h"
 
 #include "buffer_head_io.h"
+#include "dir.h"
+#include "namei.h"
+#include "sysfile.h"
 
 static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock,
				   struct buffer_head *bh_result, int create)
@@ -506,18 +510,21 @@ bail:
  *
  * called like this: dio->get_blocks(dio->inode, fs_startblk,
  *					fs_count, map_bh, dio->rw == WRITE);
- *
- * Note that we never bother to allocate blocks here, and thus ignore the
- * create argument.
  */
 static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
				     struct buffer_head *bh_result, int create)
 {
 	int ret;
+	u32 cpos = 0;
+	int alloc_locked = 0;
 	u64 p_blkno, inode_blocks, contig_blocks;
 	unsigned int ext_flags;
 	unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
 	unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits;
+	unsigned long len = bh_result->b_size;
+	unsigned int clusters_to_alloc = 0;
+
+	cpos = ocfs2_blocks_to_clusters(inode->i_sb, iblock);
 
 	/* This function won't even be called if the request isn't all
 	 * nicely aligned and of the right size, so there's no need
@@ -539,6 +546,40 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
 	/* We should already CoW the refcounted extent in case of create. */
 	BUG_ON(create && (ext_flags & OCFS2_EXT_REFCOUNTED));
 
+	/* allocate blocks if no p_blkno is found, and create == 1 */
+	if (!p_blkno && create) {
+		ret = ocfs2_inode_lock(inode, NULL, 1);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto bail;
+		}
+
+		alloc_locked = 1;
+
+		/* fill hole, allocate blocks can't be larger than the size
+		 * of the hole */
+		clusters_to_alloc = ocfs2_clusters_for_bytes(inode->i_sb, len);
+		if (clusters_to_alloc > contig_blocks)
+			clusters_to_alloc = contig_blocks;
+
+		/* allocate extent and insert them into the extent tree */
+		ret = ocfs2_extend_allocation(inode, cpos,
+				clusters_to_alloc, 0);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto bail;
+		}
+
+		ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
+				&contig_blocks, &ext_flags);
+		if (ret < 0) {
+			mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
+					(unsigned long long)iblock);
+			ret = -EIO;
+			goto bail;
+		}
+	}
+
 	/*
 	 * get_more_blocks() expects us to describe a hole by clearing
 	 * the mapped bit on bh_result().
@@ -556,6 +597,8 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
 		contig_blocks = max_blocks;
 	bh_result->b_size = contig_blocks << blocksize_bits;
 bail:
+	if (alloc_locked)
+		ocfs2_inode_unlock(inode, 1);
 	return ret;
 }
 
@@ -597,6 +640,184 @@ static int ocfs2_releasepage(struct page *page, gfp_t wait)
 	return try_to_free_buffers(page);
 }
 
+static int ocfs2_is_overwrite(struct ocfs2_super *osb,
+		struct inode *inode, loff_t offset)
+{
+	int ret = 0;
+	u32 v_cpos = 0;
+	u32 p_cpos = 0;
+	unsigned int num_clusters = 0;
+	unsigned int ext_flags = 0;
+
+	v_cpos = ocfs2_bytes_to_clusters(osb->sb, offset);
+	ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos,
+			&num_clusters, &ext_flags);
+	if (ret < 0) {
+		mlog_errno(ret);
+		return ret;
+	}
+
+	if (p_cpos && !(ext_flags & OCFS2_EXT_UNWRITTEN))
+		return 1;
+
+	return 0;
+}
+
+static ssize_t ocfs2_direct_IO_write(struct kiocb *iocb,
+		struct iov_iter *iter,
+		loff_t offset)
+{
+	ssize_t ret = 0;
+	ssize_t written = 0;
+	bool orphaned = false;
+	int is_overwrite = 0;
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file_inode(file)->i_mapping->host;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct buffer_head *di_bh = NULL;
+	size_t count = iter->count;
+	journal_t *journal = osb->journal->j_journal;
+	u32 zero_len;
+	int cluster_align;
+	loff_t final_size = offset + count;
+	int append_write = offset >= i_size_read(inode) ? 1 : 0;
+	unsigned int num_clusters = 0;
+	unsigned int ext_flags = 0;
+
+	{
+		u64 o = offset;
+
+		zero_len = do_div(o, 1 << osb->s_clustersize_bits);
+		cluster_align = !zero_len;
+	}
+
+	/*
+	 * when final_size > inode->i_size, inode->i_size will be
+	 * updated after direct write, so add the inode to orphan
+	 * dir first.
+	 */
+	if (final_size > i_size_read(inode)) {
+		ret = ocfs2_add_inode_to_orphan(osb, inode);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+		orphaned = true;
+	}
+
+	if (append_write) {
+		ret = ocfs2_inode_lock(inode, &di_bh, 1);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto clean_orphan;
+		}
+
+		if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
+			ret = ocfs2_zero_extend(inode, di_bh, offset);
+		else
+			ret = ocfs2_extend_no_holes(inode, di_bh, offset,
+					offset);
+		if (ret < 0) {
+			mlog_errno(ret);
+			ocfs2_inode_unlock(inode, 1);
+			brelse(di_bh);
+			goto clean_orphan;
+		}
+
+		is_overwrite = ocfs2_is_overwrite(osb, inode, offset);
+		if (is_overwrite < 0) {
+			mlog_errno(is_overwrite);
+			ocfs2_inode_unlock(inode, 1);
+			brelse(di_bh);
+			goto clean_orphan;
+		}
+
+		ocfs2_inode_unlock(inode, 1);
+		brelse(di_bh);
+		di_bh = NULL;
+	}
+
+	written = __blockdev_direct_IO(WRITE, iocb, inode, inode->i_sb->s_bdev,
+			iter, offset,
+			ocfs2_direct_IO_get_blocks,
+			ocfs2_dio_end_io, NULL, 0);
+	if (unlikely(written < 0)) {
+		loff_t i_size = i_size_read(inode);
+
+		if (offset + count > i_size) {
+			ret = ocfs2_inode_lock(inode, &di_bh, 1);
+			if (ret < 0) {
+				mlog_errno(ret);
+				goto clean_orphan;
+			}
+
+			if (i_size == i_size_read(inode)) {
+				ret = ocfs2_truncate_file(inode, di_bh,
+						i_size);
+				if (ret < 0) {
+					if (ret != -ENOSPC)
+						mlog_errno(ret);
+
+					ocfs2_inode_unlock(inode, 1);
+					brelse(di_bh);
+					goto clean_orphan;
+				}
+			}
+
+			ocfs2_inode_unlock(inode, 1);
+			brelse(di_bh);
+
+			ret = jbd2_journal_force_commit(journal);
+			if (ret < 0)
+				mlog_errno(ret);
+		}
+	} else if (written < 0 && append_write && !is_overwrite &&
+			!cluster_align) {
+		u32 p_cpos = 0;
+		u32 v_cpos = ocfs2_bytes_to_clusters(osb->sb, offset);
+
+		ret = ocfs2_get_clusters(inode, v_cpos, &p_cpos,
+				&num_clusters, &ext_flags);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto clean_orphan;
+		}
+
+		BUG_ON(!p_cpos || (ext_flags & OCFS2_EXT_UNWRITTEN));
+
+		ret = blkdev_issue_zeroout(osb->sb->s_bdev,
+				p_cpos << (osb->s_clustersize_bits - 9),
+				zero_len >> 9, GFP_KERNEL, false);
+		if (ret < 0)
+			mlog_errno(ret);
+	}
+
+clean_orphan:
+	if (orphaned) {
+		int tmp_ret;
+		int update_isize = written > 0 ? 1 : 0;
+		loff_t end = update_isize ? offset + written : 0;
+
+		tmp_ret = ocfs2_del_inode_from_orphan(osb, inode,
+				update_isize, end);
+		if (tmp_ret < 0) {
+			ret = tmp_ret;
+			goto out;
+		}
+
+		tmp_ret = jbd2_journal_force_commit(journal);
+		if (tmp_ret < 0) {
+			ret = tmp_ret;
+			mlog_errno(tmp_ret);
+		}
+	}
+
+out:
+	if (ret >= 0)
+		ret = written;
+	return ret;
+}
+
 static ssize_t ocfs2_direct_IO(int rw,
			       struct kiocb *iocb,
			       struct iov_iter *iter,
@@ -604,6 +825,9 @@ static ssize_t ocfs2_direct_IO(int rw,
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file_inode(file)->i_mapping->host;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	int full_coherency = !(osb->s_mount_opt &
+		OCFS2_MOUNT_COHERENCY_BUFFERED);
 
 	/*
 	 * Fallback to buffered I/O if we see an inode without
@@ -612,14 +836,20 @@ static ssize_t ocfs2_direct_IO(int rw,
 	if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)
 		return 0;
 
-	/* Fallback to buffered I/O if we are appending. */
-	if (i_size_read(inode) <= offset)
+	/* Fallback to buffered I/O if we are appending and
+	 * concurrent O_DIRECT writes are allowed.
+	 */
+	if (i_size_read(inode) <= offset && !full_coherency)
 		return 0;
 
-	return __blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev,
+	if (rw == READ)
+		return __blockdev_direct_IO(rw, iocb, inode,
+					    inode->i_sb->s_bdev,
				    iter, offset,
				    ocfs2_direct_IO_get_blocks,
				    ocfs2_dio_end_io, NULL, 0);
+	else
+		return ocfs2_direct_IO_write(iocb, iter, offset);
 }
 
 static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb,
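blkdev_issue_zeroout() in the append path above works in 512-byte sectors, while ocfs2 tracks physical clusters, hence the shift by (s_clustersize_bits - 9). A small illustrative helper (not ocfs2 code) showing the conversion:

/* Sketch of the unit conversion used above (values are illustrative).
 * A sector is 2^9 = 512 bytes, so a cluster number becomes a sector
 * number by shifting left (clustersize_bits - 9) places. */
static inline u64 cluster_to_sector(u32 p_cpos, unsigned int s_clustersize_bits)
{
	return (u64)p_cpos << (s_clustersize_bits - 9);
}
/* e.g. with 4 KB clusters, s_clustersize_bits = 12, so cluster N starts
 * at sector N << 3; a zero_len of 2048 bytes covers 2048 >> 9 = 4 sectors. */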
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 2e355e0f8335..56c403a563bc 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -1016,7 +1016,8 @@ void o2net_fill_node_map(unsigned long *map, unsigned bytes)
 
 	memset(map, 0, bytes);
 	for (node = 0; node < O2NM_MAX_NODES; ++node) {
-		o2net_tx_can_proceed(o2net_nn_from_num(node), &sc, &ret);
+		if (!o2net_tx_can_proceed(o2net_nn_from_num(node), &sc, &ret))
+			continue;
 		if (!ret) {
 			set_bit(node, map);
 			sc_put(sc);
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index dc024367110a..b95e7df5b76a 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -107,12 +107,12 @@ struct o2net_node {
 	struct list_head		nn_status_list;
 
 	/* connects are attempted from when heartbeat comes up until either hb
-	 * goes down, the node is unconfigured, no connect attempts succeed
-	 * before O2NET_CONN_IDLE_DELAY, or a connect succeeds. connect_work
-	 * is queued from set_nn_state both from hb up and from itself if a
-	 * connect attempt fails and so can be self-arming. shutdown is
-	 * careful to first mark the nn such that no connects will be attempted
-	 * before canceling delayed connect work and flushing the queue. */
+	 * goes down, the node is unconfigured, or a connect succeeds.
+	 * connect_work is queued from set_nn_state both from hb up and from
+	 * itself if a connect attempt fails and so can be self-arming.
+	 * shutdown is careful to first mark the nn such that no connects will
+	 * be attempted before canceling delayed connect work and flushing the
+	 * queue. */
 	struct delayed_work		nn_connect_work;
 	unsigned long			nn_last_connect_attempt;
 
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 319e786175af..b08050bd3f2e 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -3456,10 +3456,8 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name,
 	int blocksize = dir->i_sb->s_blocksize;
 
 	status = ocfs2_read_dir_block(dir, 0, &bh, 0);
-	if (status) {
-		mlog_errno(status);
+	if (status)
 		goto bail;
-	}
 
 	rec_len = OCFS2_DIR_REC_LEN(namelen);
 	offset = 0;
@@ -3480,10 +3478,9 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name,
 			status = ocfs2_read_dir_block(dir,
					     offset >> sb->s_blocksize_bits,
					     &bh, 0);
-			if (status) {
-				mlog_errno(status);
+			if (status)
 				goto bail;
-			}
+
 			/* move to next block */
 			de = (struct ocfs2_dir_entry *) bh->b_data;
 		}
@@ -3513,7 +3510,6 @@ next:
 		de = (struct ocfs2_dir_entry *)((char *) de + le16_to_cpu(de->rec_len));
 	}
 
-	status = 0;
 bail:
 	brelse(bh);
 	if (status)
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index b46278f9ae44..fd6bbbbd7d78 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -385,8 +385,12 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data,
 		head = &res->granted;
 
 	list_for_each_entry(lock, head, list) {
-		if (lock->ml.cookie == cookie)
+		/* if lock is found but unlock is pending ignore the bast */
+		if (lock->ml.cookie == cookie) {
+			if (lock->unlock_pending)
+				break;
 			goto do_ast;
+		}
 	}
 
 	mlog(0, "Got %sast for unknown lock! cookie=%u:%llu, name=%.*s, "
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index 149eb556b8c6..825136070d2c 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -406,7 +406,7 @@ static int debug_purgelist_print(struct dlm_ctxt *dlm, char *buf, int len)
 	}
 	spin_unlock(&dlm->spinlock);
 
-	out += snprintf(buf + out, len - out, "Total on list: %ld\n", total);
+	out += snprintf(buf + out, len - out, "Total on list: %lu\n", total);
 
 	return out;
 }
@@ -464,7 +464,7 @@ static int debug_mle_print(struct dlm_ctxt *dlm, char *buf, int len)
 	spin_unlock(&dlm->master_lock);
 
 	out += snprintf(buf + out, len - out,
-			"Total: %ld, Longest: %ld\n", total, longest);
+			"Total: %lu, Longest: %lu\n", total, longest);
 	return out;
 }
 
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 50a59d2337b2..7df88a6dd626 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -674,20 +674,6 @@ static void dlm_leave_domain(struct dlm_ctxt *dlm)
 	spin_unlock(&dlm->spinlock);
 }
 
-int dlm_joined(struct dlm_ctxt *dlm)
-{
-	int ret = 0;
-
-	spin_lock(&dlm_domain_lock);
-
-	if (dlm->dlm_state == DLM_CTXT_JOINED)
-		ret = 1;
-
-	spin_unlock(&dlm_domain_lock);
-
-	return ret;
-}
-
 int dlm_shutting_down(struct dlm_ctxt *dlm)
 {
 	int ret = 0;
diff --git a/fs/ocfs2/dlm/dlmdomain.h b/fs/ocfs2/dlm/dlmdomain.h
index 2f7f60bfeb3b..fd6122a38dbd 100644
--- a/fs/ocfs2/dlm/dlmdomain.h
+++ b/fs/ocfs2/dlm/dlmdomain.h
@@ -28,7 +28,6 @@
 extern spinlock_t dlm_domain_lock;
 extern struct list_head dlm_domains;
 
-int dlm_joined(struct dlm_ctxt *dlm);
 int dlm_shutting_down(struct dlm_ctxt *dlm);
 void dlm_fire_domain_eviction_callbacks(struct dlm_ctxt *dlm,
					int node_num);
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 79b5af5e6a7b..ce12e0b1a31f 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -1070,6 +1070,9 @@ static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm,
					dead_node, dlm->name);
				list_del_init(&lock->list);
				dlm_lock_put(lock);
+				/* Can't schedule DLM_UNLOCK_FREE_LOCK
+				 * - do manually */
+				dlm_lock_put(lock);
				break;
			}
		}
@@ -2023,11 +2026,8 @@ leave:
 		dlm_lockres_drop_inflight_ref(dlm, res);
 	spin_unlock(&res->spinlock);
 
-	if (ret < 0) {
+	if (ret < 0)
 		mlog_errno(ret);
-		if (newlock)
-			dlm_lock_put(newlock);
-	}
 
 	return ret;
 }
@@ -2349,6 +2349,10 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
						dead_node, dlm->name);
					list_del_init(&lock->list);
					dlm_lock_put(lock);
+					/* Can't schedule
+					 * DLM_UNLOCK_FREE_LOCK
+					 * - do manually */
+					dlm_lock_put(lock);
					break;
				}
			}
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index 57c40e34f56f..061ba6a91bf2 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -390,12 +390,6 @@ clear_fields:
 	ip->ip_conn = NULL;
 }
 
-static struct backing_dev_info dlmfs_backing_dev_info = {
-	.name = "ocfs2-dlmfs",
-	.ra_pages = 0,	/* No readahead */
-	.capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
-};
-
 static struct inode *dlmfs_get_root_inode(struct super_block *sb)
 {
 	struct inode *inode = new_inode(sb);
@@ -404,7 +398,6 @@ static struct inode *dlmfs_get_root_inode(struct super_block *sb)
 	if (inode) {
 		inode->i_ino = get_next_ino();
 		inode_init_owner(inode, NULL, mode);
-		inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info;
 		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 		inc_nlink(inode);
 
@@ -428,7 +421,6 @@ static struct inode *dlmfs_get_inode(struct inode *parent,
 
 	inode->i_ino = get_next_ino();
 	inode_init_owner(inode, parent, mode);
-	inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info;
 	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 
 	ip = DLMFS_I(inode);
@@ -643,10 +635,6 @@ static int __init init_dlmfs_fs(void)
 	int status;
 	int cleanup_inode = 0, cleanup_worker = 0;
 
-	status = bdi_init(&dlmfs_backing_dev_info);
-	if (status)
-		return status;
-
 	dlmfs_inode_cache = kmem_cache_create("dlmfs_inode_cache",
				sizeof(struct dlmfs_inode_private),
				0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
@@ -673,7 +661,6 @@ bail:
			kmem_cache_destroy(dlmfs_inode_cache);
		if (cleanup_worker)
			destroy_workqueue(user_dlm_worker);
-		bdi_destroy(&dlmfs_backing_dev_info);
	} else
		printk("OCFS2 User DLM kernel interface loaded\n");
	return status;
@@ -693,7 +680,6 @@ static void __exit exit_dlmfs_fs(void)
 	rcu_barrier();
 	kmem_cache_destroy(dlmfs_inode_cache);
 
-	bdi_destroy(&dlmfs_backing_dev_info);
 }
 
 MODULE_AUTHOR("Oracle");
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index 1c423af04c69..11849a44dc5a 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -3750,6 +3750,9 @@ static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres,
			break;
		spin_unlock(&dentry_attach_lock);
 
+		if (S_ISDIR(dl->dl_inode->i_mode))
+			shrink_dcache_parent(dentry);
+
		mlog(0, "d_delete(%pd);\n", dentry);
 
		/*
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 3950693dd0f6..46e0d4e857c7 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -295,7 +295,7 @@ out:
 	return ret;
 }
 
-static int ocfs2_set_inode_size(handle_t *handle,
+int ocfs2_set_inode_size(handle_t *handle,
				struct inode *inode,
				struct buffer_head *fe_bh,
				u64 new_i_size)
@@ -441,7 +441,7 @@ out:
 	return status;
 }
 
-static int ocfs2_truncate_file(struct inode *inode,
+int ocfs2_truncate_file(struct inode *inode,
			       struct buffer_head *di_bh,
			       u64 new_i_size)
 {
@@ -569,7 +569,7 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
 	handle_t *handle = NULL;
 	struct ocfs2_alloc_context *data_ac = NULL;
 	struct ocfs2_alloc_context *meta_ac = NULL;
-	enum ocfs2_alloc_restarted why;
+	enum ocfs2_alloc_restarted why = RESTART_NONE;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	struct ocfs2_extent_tree et;
 	int did_quota = 0;
@@ -709,6 +709,13 @@ leave:
 	return status;
 }
 
+int ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
+		u32 clusters_to_add, int mark_unwritten)
+{
+	return __ocfs2_extend_allocation(inode, logical_start,
+			clusters_to_add, mark_unwritten);
+}
+
 /*
  * While a write will already be ordering the data, a truncate will not.
  * Thus, we need to explicitly order the zeroed pages.
@@ -2109,6 +2116,9 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
 	struct dentry *dentry = file->f_path.dentry;
 	struct inode *inode = dentry->d_inode;
 	loff_t saved_pos = 0, end;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	int full_coherency = !(osb->s_mount_opt &
+		OCFS2_MOUNT_COHERENCY_BUFFERED);
 
 	/*
 	 * We start with a read level meta lock and only jump to an ex
@@ -2197,7 +2207,16 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
			 * one node could wind up truncating another
			 * nodes writes.
			 */
-			if (end > i_size_read(inode)) {
+			if (end > i_size_read(inode) && !full_coherency) {
+				*direct_io = 0;
+				break;
+			}
+
+			/*
+			 * Fallback to old way if the feature bit is not set.
+			 */
+			if (end > i_size_read(inode) &&
+					!ocfs2_supports_append_dio(osb)) {
				*direct_io = 0;
				break;
			}
@@ -2210,7 +2229,13 @@ static int ocfs2_prepare_inode_for_write(struct file *file,
			 */
			ret = ocfs2_check_range_for_holes(inode, saved_pos, count);
			if (ret == 1) {
-				*direct_io = 0;
+				/*
+				 * Fallback to old way if the feature bit is not set.
+				 * Otherwise try dio first and then complete the rest
+				 * request through buffer io.
+				 */
+				if (!ocfs2_supports_append_dio(osb))
+					*direct_io = 0;
				ret = 0;
			} else if (ret < 0)
				mlog_errno(ret);
@@ -2243,6 +2268,7 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
 	u32 old_clusters;
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file_inode(file);
+	struct address_space *mapping = file->f_mapping;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	int full_coherency = !(osb->s_mount_opt &
			       OCFS2_MOUNT_COHERENCY_BUFFERED);
@@ -2357,13 +2383,53 @@ relock:
 
 	iov_iter_truncate(from, count);
 	if (direct_io) {
+		loff_t endbyte;
+		ssize_t written_buffered;
 		written = generic_file_direct_write(iocb, from, *ppos);
-		if (written < 0) {
+		if (written < 0 || written == count) {
			ret = written;
			goto out_dio;
		}
+
+		/*
+		 * for completing the rest of the request.
+		 */
+		*ppos += written;
+		count -= written;
+		written_buffered = generic_perform_write(file, from, *ppos);
+		/*
+		 * If generic_file_buffered_write() returned a synchronous error
+		 * then we want to return the number of bytes which were
+		 * direct-written, or the error code if that was zero. Note
+		 * that this differs from normal direct-io semantics, which
+		 * will return -EFOO even if some bytes were written.
+		 */
+		if (written_buffered < 0) {
+			ret = written_buffered;
+			goto out_dio;
+		}
+
+		iocb->ki_pos = *ppos + written_buffered;
+		/* We need to ensure that the page cache pages are written to
+		 * disk and invalidated to preserve the expected O_DIRECT
+		 * semantics.
+		 */
+		endbyte = *ppos + written_buffered - 1;
+		ret = filemap_write_and_wait_range(file->f_mapping, *ppos,
+				endbyte);
+		if (ret == 0) {
+			written += written_buffered;
+			invalidate_mapping_pages(mapping,
+					*ppos >> PAGE_CACHE_SHIFT,
+					endbyte >> PAGE_CACHE_SHIFT);
+		} else {
+			/*
+			 * We don't know how much we wrote, so just return
+			 * the number of bytes which were direct-written
+			 */
+		}
 	} else {
-		current->backing_dev_info = file->f_mapping->backing_dev_info;
+		current->backing_dev_info = inode_to_bdi(inode);
 		written = generic_perform_write(file, from, *ppos);
 		if (likely(written >= 0))
 			iocb->ki_pos = *ppos + written;
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index 97bf761c9e7c..e8c62f22215c 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -51,13 +51,22 @@ int ocfs2_add_inode_data(struct ocfs2_super *osb,
			 struct ocfs2_alloc_context *data_ac,
			 struct ocfs2_alloc_context *meta_ac,
			 enum ocfs2_alloc_restarted *reason_ret);
+int ocfs2_set_inode_size(handle_t *handle,
+		struct inode *inode,
+		struct buffer_head *fe_bh,
+		u64 new_i_size);
 int ocfs2_simple_size_update(struct inode *inode,
			     struct buffer_head *di_bh,
			     u64 new_i_size);
+int ocfs2_truncate_file(struct inode *inode,
+			struct buffer_head *di_bh,
+			u64 new_i_size);
 int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh,
			  u64 new_i_size, u64 zero_to);
 int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
		      loff_t zero_to);
+int ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
+		u32 clusters_to_add, int mark_unwritten);
 int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
 int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
		  struct kstat *stat);
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index c8b25de9efbb..3025c0da6b8a 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -648,7 +648,7 @@ static int ocfs2_remove_inode(struct inode *inode,
 
 	if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)) {
 		status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode,
-					  orphan_dir_bh);
+					  orphan_dir_bh, false);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail_commit;
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index ca3431ee7f24..5e86b247c821 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -81,6 +81,8 @@ struct ocfs2_inode_info
 	tid_t i_sync_tid;
 	tid_t i_datasync_tid;
 
+	wait_queue_head_t append_dio_wq;
+
 	struct dquot *i_dquot[MAXQUOTAS];
 };
 
86 88
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 4f502382180f..ff531928269e 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -50,6 +50,8 @@
 #include "sysfile.h"
 #include "uptodate.h"
 #include "quota.h"
+#include "file.h"
+#include "namei.h"
 
 #include "buffer_head_io.h"
 #include "ocfs2_trace.h"
@@ -69,13 +71,15 @@ static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb,
 static int ocfs2_trylock_journal(struct ocfs2_super *osb,
				 int slot_num);
 static int ocfs2_recover_orphans(struct ocfs2_super *osb,
-				 int slot);
+				 int slot,
+				 enum ocfs2_orphan_reco_type orphan_reco_type);
 static int ocfs2_commit_thread(void *arg);
 static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
					    int slot_num,
					    struct ocfs2_dinode *la_dinode,
					    struct ocfs2_dinode *tl_dinode,
-					    struct ocfs2_quota_recovery *qrec);
+					    struct ocfs2_quota_recovery *qrec,
+					    enum ocfs2_orphan_reco_type orphan_reco_type);
 
 static inline int ocfs2_wait_on_mount(struct ocfs2_super *osb)
 {
@@ -149,7 +153,8 @@ int ocfs2_compute_replay_slots(struct ocfs2_super *osb)
 	return 0;
 }
 
-void ocfs2_queue_replay_slots(struct ocfs2_super *osb)
+void ocfs2_queue_replay_slots(struct ocfs2_super *osb,
+		enum ocfs2_orphan_reco_type orphan_reco_type)
 {
 	struct ocfs2_replay_map *replay_map = osb->replay_map;
 	int i;
@@ -163,7 +168,8 @@ void ocfs2_queue_replay_slots(struct ocfs2_super *osb)
 	for (i = 0; i < replay_map->rm_slots; i++)
 		if (replay_map->rm_replay_slots[i])
 			ocfs2_queue_recovery_completion(osb->journal, i, NULL,
-							NULL, NULL);
+							NULL, NULL,
+							orphan_reco_type);
 	replay_map->rm_state = REPLAY_DONE;
 }
 
@@ -1174,6 +1180,7 @@ struct ocfs2_la_recovery_item {
 	struct ocfs2_dinode *lri_la_dinode;
 	struct ocfs2_dinode *lri_tl_dinode;
 	struct ocfs2_quota_recovery *lri_qrec;
+	enum ocfs2_orphan_reco_type lri_orphan_reco_type;
 };
 
 /* Does the second half of the recovery process. By this point, the
@@ -1195,6 +1202,7 @@ void ocfs2_complete_recovery(struct work_struct *work)
 	struct ocfs2_dinode *la_dinode, *tl_dinode;
 	struct ocfs2_la_recovery_item *item, *n;
 	struct ocfs2_quota_recovery *qrec;
+	enum ocfs2_orphan_reco_type orphan_reco_type;
 	LIST_HEAD(tmp_la_list);
 
 	trace_ocfs2_complete_recovery(
@@ -1212,6 +1220,7 @@ void ocfs2_complete_recovery(struct work_struct *work)
 		la_dinode = item->lri_la_dinode;
 		tl_dinode = item->lri_tl_dinode;
 		qrec = item->lri_qrec;
+		orphan_reco_type = item->lri_orphan_reco_type;
 
 		trace_ocfs2_complete_recovery_slot(item->lri_slot,
			la_dinode ? le64_to_cpu(la_dinode->i_blkno) : 0,
@@ -1236,7 +1245,8 @@ void ocfs2_complete_recovery(struct work_struct *work)
			kfree(tl_dinode);
		}
 
-		ret = ocfs2_recover_orphans(osb, item->lri_slot);
+		ret = ocfs2_recover_orphans(osb, item->lri_slot,
+				orphan_reco_type);
		if (ret < 0)
			mlog_errno(ret);
 
@@ -1261,7 +1271,8 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
					    int slot_num,
					    struct ocfs2_dinode *la_dinode,
					    struct ocfs2_dinode *tl_dinode,
-					    struct ocfs2_quota_recovery *qrec)
+					    struct ocfs2_quota_recovery *qrec,
+					    enum ocfs2_orphan_reco_type orphan_reco_type)
 {
 	struct ocfs2_la_recovery_item *item;
 
@@ -1285,6 +1296,7 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal,
 	item->lri_slot = slot_num;
 	item->lri_tl_dinode = tl_dinode;
 	item->lri_qrec = qrec;
+	item->lri_orphan_reco_type = orphan_reco_type;
 
 	spin_lock(&journal->j_lock);
 	list_add_tail(&item->lri_list, &journal->j_la_cleanups);
@@ -1304,7 +1316,8 @@ void ocfs2_complete_mount_recovery(struct ocfs2_super *osb)
 	/* No need to queue up our truncate_log as regular cleanup will catch
	 * that */
 	ocfs2_queue_recovery_completion(journal, osb->slot_num,
-					osb->local_alloc_copy, NULL, NULL);
+					osb->local_alloc_copy, NULL, NULL,
+					ORPHAN_NEED_TRUNCATE);
 	ocfs2_schedule_truncate_log_flush(osb, 0);
 
 	osb->local_alloc_copy = NULL;
@@ -1312,7 +1325,7 @@ void ocfs2_complete_mount_recovery(struct ocfs2_super *osb)
 
 	/* queue to recover orphan slots for all offline slots */
 	ocfs2_replay_map_set_state(osb, REPLAY_NEEDED);
-	ocfs2_queue_replay_slots(osb);
+	ocfs2_queue_replay_slots(osb, ORPHAN_NEED_TRUNCATE);
 	ocfs2_free_replay_slots(osb);
 }
 
@@ -1323,7 +1336,8 @@ void ocfs2_complete_quota_recovery(struct ocfs2_super *osb)
						osb->slot_num,
						NULL,
						NULL,
-						osb->quota_rec);
+						osb->quota_rec,
+						ORPHAN_NEED_TRUNCATE);
 		osb->quota_rec = NULL;
 	}
 }
@@ -1360,7 +1374,7 @@ restart:
 
 	/* queue recovery for our own slot */
 	ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL,
-					NULL, NULL);
+					NULL, NULL, ORPHAN_NO_NEED_TRUNCATE);
 
 	spin_lock(&osb->osb_lock);
 	while (rm->rm_used) {
@@ -1419,13 +1433,14 @@ skip_recovery:
			continue;
		}
		ocfs2_queue_recovery_completion(osb->journal, rm_quota[i],
-						NULL, NULL, qrec);
+						NULL, NULL, qrec,
+						ORPHAN_NEED_TRUNCATE);
	}
 
 	ocfs2_super_unlock(osb, 1);
 
 	/* queue recovery for offline slots */
-	ocfs2_queue_replay_slots(osb);
+	ocfs2_queue_replay_slots(osb, ORPHAN_NEED_TRUNCATE);
 
 bail:
 	mutex_lock(&osb->recovery_lock);
@@ -1447,7 +1462,6 @@ bail:
	 * requires that we call do_exit(). And it isn't exported, but
	 * complete_and_exit() seems to be a minimal wrapper around it. */
 	complete_and_exit(NULL, status);
-	return status;
 }
 
 void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num)
@@ -1712,7 +1726,7 @@ static int ocfs2_recover_node(struct ocfs2_super *osb,
 
 	/* This will kfree the memory pointed to by la_copy and tl_copy */
 	ocfs2_queue_recovery_completion(osb->journal, slot_num, la_copy,
-					tl_copy, NULL);
+					tl_copy, NULL, ORPHAN_NEED_TRUNCATE);
 
 	status = 0;
 done:
@@ -1902,7 +1916,7 @@ void ocfs2_queue_orphan_scan(struct ocfs2_super *osb)
 
 	for (i = 0; i < osb->max_slots; i++)
 		ocfs2_queue_recovery_completion(osb->journal, i, NULL, NULL,
-						NULL);
+						NULL, ORPHAN_NO_NEED_TRUNCATE);
 	/*
	 * We queued a recovery on orphan slots, increment the sequence
	 * number and update LVB so other node will skip the scan for a while
@@ -2001,6 +2015,13 @@ static int ocfs2_orphan_filldir(struct dir_context *ctx, const char *name,
 	if (IS_ERR(iter))
 		return 0;
 
+	/* Skip inodes which are already added to recover list, since dio may
+	 * happen concurrently with unlink/rename */
+	if (OCFS2_I(iter)->ip_next_orphan) {
+		iput(iter);
+		return 0;
+	}
+
 	trace_ocfs2_orphan_filldir((unsigned long long)OCFS2_I(iter)->ip_blkno);
 	/* No locking is required for the next_orphan queue as there
	 * is only ever a single process doing orphan recovery. */
@@ -2109,7 +2130,8 @@ static void ocfs2_clear_recovering_orphan_dir(struct ocfs2_super *osb,
  * advertising our state to ocfs2_delete_inode().
  */
 static int ocfs2_recover_orphans(struct ocfs2_super *osb,
-				 int slot)
+				 int slot,
+				 enum ocfs2_orphan_reco_type orphan_reco_type)
 {
 	int ret = 0;
 	struct inode *inode = NULL;
@@ -2133,13 +2155,60 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
				(unsigned long long)oi->ip_blkno);
 
 		iter = oi->ip_next_orphan;
+		oi->ip_next_orphan = NULL;
+
+		/*
+		 * We need to take and drop the inode lock to
+		 * force read inode from disk.
+		 */
+		ret = ocfs2_inode_lock(inode, NULL, 0);
+		if (ret) {
+			mlog_errno(ret);
+			goto next;
+		}
+		ocfs2_inode_unlock(inode, 0);
+
+		if (inode->i_nlink == 0) {
+			spin_lock(&oi->ip_lock);
+			/* Set the proper information to get us going into
+			 * ocfs2_delete_inode. */
+			oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;
+			spin_unlock(&oi->ip_lock);
+		} else if (orphan_reco_type == ORPHAN_NEED_TRUNCATE) {
+			struct buffer_head *di_bh = NULL;
+
+			ret = ocfs2_rw_lock(inode, 1);
+			if (ret) {
+				mlog_errno(ret);
+				goto next;
+			}
+
+			ret = ocfs2_inode_lock(inode, &di_bh, 1);
+			if (ret < 0) {
+				ocfs2_rw_unlock(inode, 1);
+				mlog_errno(ret);
+				goto next;
+			}
+
+			ret = ocfs2_truncate_file(inode, di_bh,
+					i_size_read(inode));
+			ocfs2_inode_unlock(inode, 1);
+			ocfs2_rw_unlock(inode, 1);
+			brelse(di_bh);
+			if (ret < 0) {
+				if (ret != -ENOSPC)
+					mlog_errno(ret);
+				goto next;
+			}
+
+			ret = ocfs2_del_inode_from_orphan(osb, inode, 0, 0);
+			if (ret)
+				mlog_errno(ret);
 
-		spin_lock(&oi->ip_lock);
-		/* Set the proper information to get us going into
-		 * ocfs2_delete_inode. */
-		oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;
-		spin_unlock(&oi->ip_lock);
+			wake_up(&OCFS2_I(inode)->append_dio_wq);
+		} /* else if ORPHAN_NO_NEED_TRUNCATE, do nothing */
 
+next:
 		iput(inode);
 
 		inode = iter;
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index 7f8cde94abfe..f4cd3c3e9fb7 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -472,6 +472,11 @@ static inline int ocfs2_unlink_credits(struct super_block *sb)
  * orphan dir index leaf */
 #define OCFS2_DELETE_INODE_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 4)
 
+/* dinode + orphan dir dinode + extent tree leaf block + orphan dir entry +
+ * orphan dir index root + orphan dir index leaf */
+#define OCFS2_INODE_ADD_TO_ORPHAN_CREDITS  (2 * OCFS2_INODE_UPDATE_CREDITS + 4)
+#define OCFS2_INODE_DEL_FROM_ORPHAN_CREDITS  OCFS2_INODE_ADD_TO_ORPHAN_CREDITS
+
 /* dinode update, old dir dinode update, new dir dinode update, old
  * dir dir entry, new dir dir entry, dir entry update for renaming
  * directory + target unlink + 3 x dir index leaves */
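The new credit macros follow the block list in their comment: two inode updates plus four other blocks. A worked expansion, assuming OCFS2_INODE_UPDATE_CREDITS is 1 (an assumption for illustration, not taken from this patch):

/* OCFS2_INODE_UPDATE_CREDITS assumed to be 1 here, for illustration only */
/* dinode + orphan dir dinode               -> 2 * 1 = 2 credits */
/* extent tree leaf + orphan dir entry +
 * orphan dir index root + orphan dir index leaf -> 4 credits */
/* OCFS2_INODE_ADD_TO_ORPHAN_CREDITS = 2 * 1 + 4 = 6 journal credits */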
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 10d66c75cecb..9581d190f6e1 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -173,7 +173,6 @@ out:
 static const struct vm_operations_struct ocfs2_file_vm_ops = {
 	.fault		= ocfs2_fault,
 	.page_mkwrite	= ocfs2_page_mkwrite,
-	.remap_pages	= generic_file_remap_pages,
 };
 
 int ocfs2_mmap(struct file *file, struct vm_area_struct *vma)
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index b931e04e3388..b5c3a5ea3ee6 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -79,7 +79,8 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
				    struct inode **ret_orphan_dir,
				    u64 blkno,
				    char *name,
-				    struct ocfs2_dir_lookup_result *lookup);
+				    struct ocfs2_dir_lookup_result *lookup,
+				    bool dio);
 
 static int ocfs2_orphan_add(struct ocfs2_super *osb,
			    handle_t *handle,
@@ -87,15 +88,26 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
			    struct buffer_head *fe_bh,
			    char *name,
			    struct ocfs2_dir_lookup_result *lookup,
-			    struct inode *orphan_dir_inode);
+			    struct inode *orphan_dir_inode,
+			    bool dio);
 
 static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
				     handle_t *handle,
				     struct inode *inode,
				     const char *symname);
 
+static int ocfs2_double_lock(struct ocfs2_super *osb,
+			     struct buffer_head **bh1,
+			     struct inode *inode1,
+			     struct buffer_head **bh2,
+			     struct inode *inode2,
+			     int rename);
+
+static void ocfs2_double_unlock(struct inode *inode1, struct inode *inode2);
 /* An orphan dir name is an 8 byte value, printed as a hex string */
 #define OCFS2_ORPHAN_NAMELEN ((int)(2 * sizeof(u64)))
+#define OCFS2_DIO_ORPHAN_PREFIX "dio-"
+#define OCFS2_DIO_ORPHAN_PREFIX_LEN 4
 
 static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
				   unsigned int flags)
@@ -678,8 +690,10 @@ static int ocfs2_link(struct dentry *old_dentry,
 {
 	handle_t *handle;
 	struct inode *inode = old_dentry->d_inode;
+	struct inode *old_dir = old_dentry->d_parent->d_inode;
 	int err;
 	struct buffer_head *fe_bh = NULL;
+	struct buffer_head *old_dir_bh = NULL;
 	struct buffer_head *parent_fe_bh = NULL;
 	struct ocfs2_dinode *fe = NULL;
 	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
@@ -696,19 +710,33 @@ static int ocfs2_link(struct dentry *old_dentry,
 
 	dquot_initialize(dir);
 
-	err = ocfs2_inode_lock_nested(dir, &parent_fe_bh, 1, OI_LS_PARENT);
+	err = ocfs2_double_lock(osb, &old_dir_bh, old_dir,
+			&parent_fe_bh, dir, 0);
 	if (err < 0) {
 		if (err != -ENOENT)
 			mlog_errno(err);
 		return err;
 	}
 
+	/* make sure both dirs have bhs
+	 * get an extra ref on old_dir_bh if old==new */
+	if (!parent_fe_bh) {
+		if (old_dir_bh) {
+			parent_fe_bh = old_dir_bh;
+			get_bh(parent_fe_bh);
+		} else {
+			mlog(ML_ERROR, "%s: no old_dir_bh!\n", osb->uuid_str);
+			err = -EIO;
+			goto out;
+		}
+	}
+
 	if (!dir->i_nlink) {
 		err = -ENOENT;
 		goto out;
 	}
 
-	err = ocfs2_lookup_ino_from_name(dir, old_dentry->d_name.name,
+	err = ocfs2_lookup_ino_from_name(old_dir, old_dentry->d_name.name,
			old_dentry->d_name.len, &old_de_ino);
 	if (err) {
 		err = -ENOENT;
@@ -801,10 +829,11 @@ out_unlock_inode:
 	ocfs2_inode_unlock(inode, 1);
 
 out:
-	ocfs2_inode_unlock(dir, 1);
+	ocfs2_double_unlock(old_dir, dir);
 
 	brelse(fe_bh);
 	brelse(parent_fe_bh);
+	brelse(old_dir_bh);
 
 	ocfs2_free_dir_lookup_result(&lookup);
 
@@ -927,7 +956,8 @@ static int ocfs2_unlink(struct inode *dir,
 	if (ocfs2_inode_is_unlinkable(inode)) {
 		status = ocfs2_prepare_orphan_dir(osb, &orphan_dir,
						  OCFS2_I(inode)->ip_blkno,
-						  orphan_name, &orphan_insert);
+						  orphan_name, &orphan_insert,
+						  false);
 		if (status < 0) {
 			mlog_errno(status);
 			goto leave;
@@ -979,7 +1009,7 @@ static int ocfs2_unlink(struct inode *dir,
 
 	if (is_unlinkable) {
 		status = ocfs2_orphan_add(osb, handle, inode, fe_bh,
-					  orphan_name, &orphan_insert, orphan_dir);
+					  orphan_name, &orphan_insert, orphan_dir, false);
 		if (status < 0)
 			mlog_errno(status);
 	}
@@ -1072,14 +1102,15 @@ static int ocfs2_check_if_ancestor(struct ocfs2_super *osb,
 }
 
 /*
- * The only place this should be used is rename!
+ * The only place this should be used is rename and link!
  * if they have the same id, then the 1st one is the only one locked.
  */
 static int ocfs2_double_lock(struct ocfs2_super *osb,
			     struct buffer_head **bh1,
			     struct inode *inode1,
			     struct buffer_head **bh2,
-			     struct inode *inode2)
+			     struct inode *inode2,
+			     int rename)
 {
 	int status;
 	int inode1_is_ancestor, inode2_is_ancestor;
@@ -1127,7 +1158,7 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
		}
		/* lock id2 */
		status = ocfs2_inode_lock_nested(inode2, bh2, 1,
-						 OI_LS_RENAME1);
+				rename == 1 ? OI_LS_RENAME1 : OI_LS_PARENT);
		if (status < 0) {
			if (status != -ENOENT)
				mlog_errno(status);
@@ -1136,7 +1167,8 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
 	}
 
 	/* lock id1 */
-	status = ocfs2_inode_lock_nested(inode1, bh1, 1, OI_LS_RENAME2);
+	status = ocfs2_inode_lock_nested(inode1, bh1, 1,
+			rename == 1 ? OI_LS_RENAME2 : OI_LS_PARENT);
 	if (status < 0) {
 		/*
		 * An error return must mean that no cluster locks
@@ -1252,7 +1284,7 @@ static int ocfs2_rename(struct inode *old_dir,
1252 1284
1253 /* if old and new are the same, this'll just do one lock. */ 1285 /* if old and new are the same, this'll just do one lock. */
1254 status = ocfs2_double_lock(osb, &old_dir_bh, old_dir, 1286 status = ocfs2_double_lock(osb, &old_dir_bh, old_dir,
1255 &new_dir_bh, new_dir); 1287 &new_dir_bh, new_dir, 1);
1256 if (status < 0) { 1288 if (status < 0) {
1257 mlog_errno(status); 1289 mlog_errno(status);
1258 goto bail; 1290 goto bail;
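
A sketch of the double-lock discipline ocfs2_double_lock implements: take the two locks in a canonical order so concurrent rename/link callers cannot deadlock, and take only one lock when both inodes are the same. pthread mutexes stand in for cluster locks here; the real ordering in ocfs2 is by inode block number.

/* Build: cc -o dlock dlock.c -lpthread */
#include <pthread.h>
#include <stdio.h>

struct node { unsigned long id; pthread_mutex_t lock; };

static void double_lock(struct node *a, struct node *b)
{
    if (a == b) {               /* same object: single lock */
        pthread_mutex_lock(&a->lock);
        return;
    }
    if (a->id > b->id) {        /* canonical order kills ABBA */
        struct node *t = a; a = b; b = t;
    }
    pthread_mutex_lock(&a->lock);
    pthread_mutex_lock(&b->lock);
}

static void double_unlock(struct node *a, struct node *b)
{
    pthread_mutex_unlock(&a->lock);
    if (a != b)
        pthread_mutex_unlock(&b->lock);
}

int main(void)
{
    struct node x = { 1, PTHREAD_MUTEX_INITIALIZER };
    struct node y = { 2, PTHREAD_MUTEX_INITIALIZER };

    double_lock(&y, &x);        /* any argument order is safe */
    double_unlock(&y, &x);
    puts("ok");
    return 0;
}
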
@@ -1413,7 +1445,8 @@ static int ocfs2_rename(struct inode *old_dir,
1413 if (S_ISDIR(new_inode->i_mode) || (new_inode->i_nlink == 1)) { 1445 if (S_ISDIR(new_inode->i_mode) || (new_inode->i_nlink == 1)) {
1414 status = ocfs2_prepare_orphan_dir(osb, &orphan_dir, 1446 status = ocfs2_prepare_orphan_dir(osb, &orphan_dir,
1415 OCFS2_I(new_inode)->ip_blkno, 1447 OCFS2_I(new_inode)->ip_blkno,
1416 orphan_name, &orphan_insert); 1448 orphan_name, &orphan_insert,
1449 false);
1417 if (status < 0) { 1450 if (status < 0) {
1418 mlog_errno(status); 1451 mlog_errno(status);
1419 goto bail; 1452 goto bail;
@@ -1480,7 +1513,7 @@ static int ocfs2_rename(struct inode *old_dir,
1480 if (should_add_orphan) { 1513 if (should_add_orphan) {
1481 status = ocfs2_orphan_add(osb, handle, new_inode, 1514 status = ocfs2_orphan_add(osb, handle, new_inode,
1482 newfe_bh, orphan_name, 1515 newfe_bh, orphan_name,
1483 &orphan_insert, orphan_dir); 1516 &orphan_insert, orphan_dir, false);
1484 if (status < 0) { 1517 if (status < 0) {
1485 mlog_errno(status); 1518 mlog_errno(status);
1486 goto bail; 1519 goto bail;
@@ -2061,12 +2094,28 @@ static int __ocfs2_prepare_orphan_dir(struct inode *orphan_dir_inode,
2061 struct buffer_head *orphan_dir_bh, 2094 struct buffer_head *orphan_dir_bh,
2062 u64 blkno, 2095 u64 blkno,
2063 char *name, 2096 char *name,
2064 struct ocfs2_dir_lookup_result *lookup) 2097 struct ocfs2_dir_lookup_result *lookup,
2098 bool dio)
2065{ 2099{
2066 int ret; 2100 int ret;
2067 struct ocfs2_super *osb = OCFS2_SB(orphan_dir_inode->i_sb); 2101 struct ocfs2_super *osb = OCFS2_SB(orphan_dir_inode->i_sb);
2102 int namelen = dio ?
2103 (OCFS2_DIO_ORPHAN_PREFIX_LEN + OCFS2_ORPHAN_NAMELEN) :
2104 OCFS2_ORPHAN_NAMELEN;
2105
2106 if (dio) {
2107 ret = snprintf(name, OCFS2_DIO_ORPHAN_PREFIX_LEN + 1, "%s",
2108 OCFS2_DIO_ORPHAN_PREFIX);
2109 if (ret != OCFS2_DIO_ORPHAN_PREFIX_LEN) {
2110 ret = -EINVAL;
2111 mlog_errno(ret);
2112 return ret;
2113 }
2068 2114
2069 ret = ocfs2_blkno_stringify(blkno, name); 2115 ret = ocfs2_blkno_stringify(blkno,
2116 name + OCFS2_DIO_ORPHAN_PREFIX_LEN);
2117 } else
2118 ret = ocfs2_blkno_stringify(blkno, name);
2070 if (ret < 0) { 2119 if (ret < 0) {
2071 mlog_errno(ret); 2120 mlog_errno(ret);
2072 return ret; 2121 return ret;
@@ -2074,7 +2123,7 @@ static int __ocfs2_prepare_orphan_dir(struct inode *orphan_dir_inode,
2074 2123
2075 ret = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode, 2124 ret = ocfs2_prepare_dir_for_insert(osb, orphan_dir_inode,
2076 orphan_dir_bh, name, 2125 orphan_dir_bh, name,
2077 OCFS2_ORPHAN_NAMELEN, lookup); 2126 namelen, lookup);
2078 if (ret < 0) { 2127 if (ret < 0) {
2079 mlog_errno(ret); 2128 mlog_errno(ret);
2080 return ret; 2129 return ret;
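
A stand-alone sketch of the two-step orphan-name construction above: write the "dio-" prefix, then the zero-padded block number behind it. It assumes the block number renders as 16 hex digits, which is what OCFS2_ORPHAN_NAMELEN ((int)(2 * sizeof(u64))) implies; the helper name is hypothetical.

/* Build: cc -o dioname dioname.c */
#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define DIO_PREFIX      "dio-"
#define DIO_PREFIX_LEN  4
#define ORPHAN_NAMELEN  ((int)(2 * sizeof(uint64_t)))

static int build_dio_orphan_name(char *name, size_t size, uint64_t blkno)
{
    /* prefix first, exactly as __ocfs2_prepare_orphan_dir() does */
    int ret = snprintf(name, DIO_PREFIX_LEN + 1, "%s", DIO_PREFIX);
    if (ret != DIO_PREFIX_LEN)
        return -1;
    /* then the zero-padded block number behind the prefix */
    ret = snprintf(name + DIO_PREFIX_LEN, size - DIO_PREFIX_LEN,
                   "%016llx", (unsigned long long)blkno);
    return ret == ORPHAN_NAMELEN ? 0 : -1;
}

int main(void)
{
    char name[DIO_PREFIX_LEN + ORPHAN_NAMELEN + 1];

    if (!build_dio_orphan_name(name, sizeof(name), 0x1234abcdULL))
        printf("%s (len %zu)\n", name, strlen(name));
        /* prints: dio-000000001234abcd (len 20) */
    return 0;
}
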
@@ -2101,7 +2150,8 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
2101 struct inode **ret_orphan_dir, 2150 struct inode **ret_orphan_dir,
2102 u64 blkno, 2151 u64 blkno,
2103 char *name, 2152 char *name,
2104 struct ocfs2_dir_lookup_result *lookup) 2153 struct ocfs2_dir_lookup_result *lookup,
2154 bool dio)
2105{ 2155{
2106 struct inode *orphan_dir_inode = NULL; 2156 struct inode *orphan_dir_inode = NULL;
2107 struct buffer_head *orphan_dir_bh = NULL; 2157 struct buffer_head *orphan_dir_bh = NULL;
@@ -2115,7 +2165,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb,
2115 } 2165 }
2116 2166
2117 ret = __ocfs2_prepare_orphan_dir(orphan_dir_inode, orphan_dir_bh, 2167 ret = __ocfs2_prepare_orphan_dir(orphan_dir_inode, orphan_dir_bh,
2118 blkno, name, lookup); 2168 blkno, name, lookup, dio);
2119 if (ret < 0) { 2169 if (ret < 0) {
2120 mlog_errno(ret); 2170 mlog_errno(ret);
2121 goto out; 2171 goto out;
@@ -2143,12 +2193,16 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
2143 struct buffer_head *fe_bh, 2193 struct buffer_head *fe_bh,
2144 char *name, 2194 char *name,
2145 struct ocfs2_dir_lookup_result *lookup, 2195 struct ocfs2_dir_lookup_result *lookup,
2146 struct inode *orphan_dir_inode) 2196 struct inode *orphan_dir_inode,
2197 bool dio)
2147{ 2198{
2148 struct buffer_head *orphan_dir_bh = NULL; 2199 struct buffer_head *orphan_dir_bh = NULL;
2149 int status = 0; 2200 int status = 0;
2150 struct ocfs2_dinode *orphan_fe; 2201 struct ocfs2_dinode *orphan_fe;
2151 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data; 2202 struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data;
2203 int namelen = dio ?
2204 (OCFS2_DIO_ORPHAN_PREFIX_LEN + OCFS2_ORPHAN_NAMELEN) :
2205 OCFS2_ORPHAN_NAMELEN;
2152 2206
2153 trace_ocfs2_orphan_add_begin( 2207 trace_ocfs2_orphan_add_begin(
2154 (unsigned long long)OCFS2_I(inode)->ip_blkno); 2208 (unsigned long long)OCFS2_I(inode)->ip_blkno);
@@ -2192,7 +2246,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
2192 ocfs2_journal_dirty(handle, orphan_dir_bh); 2246 ocfs2_journal_dirty(handle, orphan_dir_bh);
2193 2247
2194 status = __ocfs2_add_entry(handle, orphan_dir_inode, name, 2248 status = __ocfs2_add_entry(handle, orphan_dir_inode, name,
2195 OCFS2_ORPHAN_NAMELEN, inode, 2249 namelen, inode,
2196 OCFS2_I(inode)->ip_blkno, 2250 OCFS2_I(inode)->ip_blkno,
2197 orphan_dir_bh, lookup); 2251 orphan_dir_bh, lookup);
2198 if (status < 0) { 2252 if (status < 0) {
@@ -2200,13 +2254,21 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
2200 goto rollback; 2254 goto rollback;
2201 } 2255 }
2202 2256
2203 fe->i_flags |= cpu_to_le32(OCFS2_ORPHANED_FL); 2257 if (dio) {
2204 OCFS2_I(inode)->ip_flags &= ~OCFS2_INODE_SKIP_ORPHAN_DIR; 2258 /* Update flag OCFS2_DIO_ORPHANED_FL and record the orphan
2259 * slot.
2260 */
2261 fe->i_flags |= cpu_to_le32(OCFS2_DIO_ORPHANED_FL);
2262 fe->i_dio_orphaned_slot = cpu_to_le16(osb->slot_num);
2263 } else {
2264 fe->i_flags |= cpu_to_le32(OCFS2_ORPHANED_FL);
2265 OCFS2_I(inode)->ip_flags &= ~OCFS2_INODE_SKIP_ORPHAN_DIR;
2205 2266
2206 /* Record which orphan dir our inode now resides 2267 /* Record which orphan dir our inode now resides
2207 * in. delete_inode will use this to determine which orphan 2268 * in. delete_inode will use this to determine which orphan
2208 * dir to lock. */ 2269 * dir to lock. */
2209 fe->i_orphaned_slot = cpu_to_le16(osb->slot_num); 2270 fe->i_orphaned_slot = cpu_to_le16(osb->slot_num);
2271 }
2210 2272
2211 ocfs2_journal_dirty(handle, fe_bh); 2273 ocfs2_journal_dirty(handle, fe_bh);
2212 2274
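
The flag update above ORs a CPU-order constant, converted to little-endian, into the on-disk i_flags, and records the slot the same way. A hedged user-space equivalent, with <endian.h> standing in for the kernel's cpu_to_le32()/le16_to_cpu() family:

/* Build: cc -o leflags leflags.c */
#include <endian.h>
#include <stdint.h>
#include <stdio.h>

#define DIO_ORPHANED_FL 0x00002000u

struct disk_inode {
    uint32_t i_flags;              /* little-endian on disk */
    uint16_t i_dio_orphaned_slot;  /* little-endian on disk */
};

int main(void)
{
    struct disk_inode di = { 0 };
    uint16_t slot = 3;

    /* convert before storing, so the byte order is fixed on disk */
    di.i_flags |= htole32(DIO_ORPHANED_FL);
    di.i_dio_orphaned_slot = htole16(slot);

    printf("flag set: %s, slot: %u\n",
           (le32toh(di.i_flags) & DIO_ORPHANED_FL) ? "yes" : "no",
           le16toh(di.i_dio_orphaned_slot));
    return 0;
}
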
@@ -2231,14 +2293,28 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
2231 handle_t *handle, 2293 handle_t *handle,
2232 struct inode *orphan_dir_inode, 2294 struct inode *orphan_dir_inode,
2233 struct inode *inode, 2295 struct inode *inode,
2234 struct buffer_head *orphan_dir_bh) 2296 struct buffer_head *orphan_dir_bh,
2297 bool dio)
2235{ 2298{
2236 char name[OCFS2_ORPHAN_NAMELEN + 1]; 2299 const int namelen = OCFS2_DIO_ORPHAN_PREFIX_LEN + OCFS2_ORPHAN_NAMELEN;
2300 char name[namelen + 1];
2237 struct ocfs2_dinode *orphan_fe; 2301 struct ocfs2_dinode *orphan_fe;
2238 int status = 0; 2302 int status = 0;
2239 struct ocfs2_dir_lookup_result lookup = { NULL, }; 2303 struct ocfs2_dir_lookup_result lookup = { NULL, };
2240 2304
2241 status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name); 2305 if (dio) {
2306 status = snprintf(name, OCFS2_DIO_ORPHAN_PREFIX_LEN + 1, "%s",
2307 OCFS2_DIO_ORPHAN_PREFIX);
2308 if (status != OCFS2_DIO_ORPHAN_PREFIX_LEN) {
2309 status = -EINVAL;
2310 mlog_errno(status);
2311 return status;
2312 }
2313
2314 status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno,
2315 name + OCFS2_DIO_ORPHAN_PREFIX_LEN);
2316 } else
2317 status = ocfs2_blkno_stringify(OCFS2_I(inode)->ip_blkno, name);
2242 if (status < 0) { 2318 if (status < 0) {
2243 mlog_errno(status); 2319 mlog_errno(status);
2244 goto leave; 2320 goto leave;
@@ -2246,10 +2322,10 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
2246 2322
2247 trace_ocfs2_orphan_del( 2323 trace_ocfs2_orphan_del(
2248 (unsigned long long)OCFS2_I(orphan_dir_inode)->ip_blkno, 2324 (unsigned long long)OCFS2_I(orphan_dir_inode)->ip_blkno,
2249 name, OCFS2_ORPHAN_NAMELEN); 2325 name, namelen);
2250 2326
2251 /* find it's spot in the orphan directory */ 2327 /* find it's spot in the orphan directory */
2251 /* find its spot in the orphan directory */ 2327
2252 status = ocfs2_find_entry(name, OCFS2_ORPHAN_NAMELEN, orphan_dir_inode, 2328 status = ocfs2_find_entry(name, namelen, orphan_dir_inode,
2253 &lookup); 2329 &lookup);
2254 if (status) { 2330 if (status) {
2255 mlog_errno(status); 2331 mlog_errno(status);
@@ -2349,7 +2425,8 @@ static int ocfs2_prep_new_orphaned_file(struct inode *dir,
2349 } 2425 }
2350 2426
2351 ret = __ocfs2_prepare_orphan_dir(orphan_dir, orphan_dir_bh, 2427 ret = __ocfs2_prepare_orphan_dir(orphan_dir, orphan_dir_bh,
2352 di_blkno, orphan_name, orphan_insert); 2428 di_blkno, orphan_name, orphan_insert,
2429 false);
2353 if (ret < 0) { 2430 if (ret < 0) {
2354 mlog_errno(ret); 2431 mlog_errno(ret);
2355 goto out; 2432 goto out;
@@ -2455,7 +2532,7 @@ int ocfs2_create_inode_in_orphan(struct inode *dir,
2455 2532
2456 di = (struct ocfs2_dinode *)new_di_bh->b_data; 2533 di = (struct ocfs2_dinode *)new_di_bh->b_data;
2457 status = ocfs2_orphan_add(osb, handle, inode, new_di_bh, orphan_name, 2534 status = ocfs2_orphan_add(osb, handle, inode, new_di_bh, orphan_name,
2458 &orphan_insert, orphan_dir); 2535 &orphan_insert, orphan_dir, false);
2459 if (status < 0) { 2536 if (status < 0) {
2460 mlog_errno(status); 2537 mlog_errno(status);
2461 goto leave; 2538 goto leave;
@@ -2500,6 +2577,186 @@ leave:
2500 return status; 2577 return status;
2501} 2578}
2502 2579
2580static int ocfs2_dio_orphan_recovered(struct inode *inode)
2581{
2582 int ret;
2583 struct buffer_head *di_bh = NULL;
2584 struct ocfs2_dinode *di = NULL;
2585
2586 ret = ocfs2_inode_lock(inode, &di_bh, 1);
2587 if (ret < 0) {
2588 mlog_errno(ret);
2589 return 0;
2590 }
2591
2592 di = (struct ocfs2_dinode *) di_bh->b_data;
2593 ret = !(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL));
2594 ocfs2_inode_unlock(inode, 1);
2595 brelse(di_bh);
2596
2597 return ret;
2598}
2599
2600#define OCFS2_DIO_ORPHANED_FL_CHECK_INTERVAL 10000
2601int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb,
2602 struct inode *inode)
2603{
2604 char orphan_name[OCFS2_DIO_ORPHAN_PREFIX_LEN + OCFS2_ORPHAN_NAMELEN + 1];
2605 struct inode *orphan_dir_inode = NULL;
2606 struct ocfs2_dir_lookup_result orphan_insert = { NULL, };
2607 struct buffer_head *di_bh = NULL;
2608 int status = 0;
2609 handle_t *handle = NULL;
2610 struct ocfs2_dinode *di = NULL;
2611
2612restart:
2613 status = ocfs2_inode_lock(inode, &di_bh, 1);
2614 if (status < 0) {
2615 mlog_errno(status);
2616 goto bail;
2617 }
2618
2619 di = (struct ocfs2_dinode *) di_bh->b_data;
2620 /*
2621 * Another append dio crashed?
2622 * If so, wait for recovery first.
2623 */
2624 if (unlikely(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) {
2625 ocfs2_inode_unlock(inode, 1);
2626 brelse(di_bh);
2627 wait_event_interruptible_timeout(OCFS2_I(inode)->append_dio_wq,
2628 ocfs2_dio_orphan_recovered(inode),
2629 msecs_to_jiffies(OCFS2_DIO_ORPHANED_FL_CHECK_INTERVAL));
2630 goto restart;
2631 }
2632
2633 status = ocfs2_prepare_orphan_dir(osb, &orphan_dir_inode,
2634 OCFS2_I(inode)->ip_blkno,
2635 orphan_name,
2636 &orphan_insert,
2637 true);
2638 if (status < 0) {
2639 mlog_errno(status);
2640 goto bail_unlock_inode;
2641 }
2642
2643 handle = ocfs2_start_trans(osb,
2644 OCFS2_INODE_ADD_TO_ORPHAN_CREDITS);
2645 if (IS_ERR(handle)) {
2646 status = PTR_ERR(handle);
2647 goto bail_unlock_orphan;
2648 }
2649
2650 status = ocfs2_orphan_add(osb, handle, inode, di_bh, orphan_name,
2651 &orphan_insert, orphan_dir_inode, true);
2652 if (status)
2653 mlog_errno(status);
2654
2655 ocfs2_commit_trans(osb, handle);
2656
2657bail_unlock_orphan:
2658 ocfs2_inode_unlock(orphan_dir_inode, 1);
2659 mutex_unlock(&orphan_dir_inode->i_mutex);
2660 iput(orphan_dir_inode);
2661
2662 ocfs2_free_dir_lookup_result(&orphan_insert);
2663
2664bail_unlock_inode:
2665 ocfs2_inode_unlock(inode, 1);
2666 brelse(di_bh);
2667
2668bail:
2669 return status;
2670}
2671
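
ocfs2_add_inode_to_orphan keeps restarting until a prior crash's OCFS2_DIO_ORPHANED_FL has been cleared by recovery: check under the inode lock, drop the lock, wait with a bounded 10 s timeout, re-check. A sketch of that loop, with a pthread condition variable standing in for the kernel waitqueue:

/* Build: cc -o diowait diowait.c -lpthread */
#include <pthread.h>
#include <stdio.h>
#include <time.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t recovered = PTHREAD_COND_INITIALIZER;
static int dio_orphaned = 1;    /* pretend a crash left it set */

static void *recovery_thread(void *arg)
{
    (void)arg;
    pthread_mutex_lock(&lock);
    dio_orphaned = 0;           /* orphan recovery clears the flag */
    pthread_cond_broadcast(&recovered);
    pthread_mutex_unlock(&lock);
    return NULL;
}

int main(void)
{
    pthread_t t;
    pthread_create(&t, NULL, recovery_thread, NULL);

    pthread_mutex_lock(&lock);
    while (dio_orphaned) {      /* the "restart:" label above */
        struct timespec ts;
        clock_gettime(CLOCK_REALTIME, &ts);
        ts.tv_sec += 10;        /* bounded wait, then re-check */
        pthread_cond_timedwait(&recovered, &lock, &ts);
    }
    /* flag clear: safe to add the inode to the orphan dir */
    pthread_mutex_unlock(&lock);
    pthread_join(t, NULL);
    puts("recovered, proceeding");
    return 0;
}
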
2672int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb,
2673 struct inode *inode, int update_isize,
2674 loff_t end)
2675{
2676 struct inode *orphan_dir_inode = NULL;
2677 struct buffer_head *orphan_dir_bh = NULL;
2678 struct buffer_head *di_bh = NULL;
2679 struct ocfs2_dinode *di = NULL;
2680 handle_t *handle = NULL;
2681 int status = 0;
2682
2683 status = ocfs2_inode_lock(inode, &di_bh, 1);
2684 if (status < 0) {
2685 mlog_errno(status);
2686 goto bail;
2687 }
2688 di = (struct ocfs2_dinode *) di_bh->b_data;
2689
2690 orphan_dir_inode = ocfs2_get_system_file_inode(osb,
2691 ORPHAN_DIR_SYSTEM_INODE,
2692 le16_to_cpu(di->i_dio_orphaned_slot));
2693 if (!orphan_dir_inode) {
2694 status = -ENOENT;
2695 mlog_errno(status);
2696 goto bail_unlock_inode;
2697 }
2698
2699 mutex_lock(&orphan_dir_inode->i_mutex);
2700 status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
2701 if (status < 0) {
2702 mutex_unlock(&orphan_dir_inode->i_mutex);
2703 iput(orphan_dir_inode);
2704 mlog_errno(status);
2705 goto bail_unlock_inode;
2706 }
2707
2708 handle = ocfs2_start_trans(osb,
2709 OCFS2_INODE_DEL_FROM_ORPHAN_CREDITS);
2710 if (IS_ERR(handle)) {
2711 status = PTR_ERR(handle);
2712 goto bail_unlock_orphan;
2713 }
2714
2715 BUG_ON(!(di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL)));
2716
2717 status = ocfs2_orphan_del(osb, handle, orphan_dir_inode,
2718 inode, orphan_dir_bh, true);
2719 if (status < 0) {
2720 mlog_errno(status);
2721 goto bail_commit;
2722 }
2723
2724 status = ocfs2_journal_access_di(handle,
2725 INODE_CACHE(inode),
2726 di_bh,
2727 OCFS2_JOURNAL_ACCESS_WRITE);
2728 if (status < 0) {
2729 mlog_errno(status);
2730 goto bail_commit;
2731 }
2732
2733 di->i_flags &= ~cpu_to_le32(OCFS2_DIO_ORPHANED_FL);
2734 di->i_dio_orphaned_slot = 0;
2735
2736 if (update_isize) {
2737 status = ocfs2_set_inode_size(handle, inode, di_bh, end);
2738 if (status)
2739 mlog_errno(status);
2740 } else
2741 ocfs2_journal_dirty(handle, di_bh);
2742
2743bail_commit:
2744 ocfs2_commit_trans(osb, handle);
2745
2746bail_unlock_orphan:
2747 ocfs2_inode_unlock(orphan_dir_inode, 1);
2748 mutex_unlock(&orphan_dir_inode->i_mutex);
2749 brelse(orphan_dir_bh);
2750 iput(orphan_dir_inode);
2751
2752bail_unlock_inode:
2753 ocfs2_inode_unlock(inode, 1);
2754 brelse(di_bh);
2755
2756bail:
2757 return status;
2758}
2759
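
ocfs2_del_inode_from_orphan above uses the kernel's layered goto-unwinding: each bail_* label releases exactly the resources acquired before the failing step, in reverse order. A toy, runnable version of the same shape (the acquire/release helpers are invented for illustration):

/* Build: cc -o bail bail.c */
#include <stdio.h>

static int acquire(const char *what, int fail)
{
    if (fail) { printf("failed to acquire %s\n", what); return -1; }
    printf("acquired %s\n", what);
    return 0;
}

static void release(const char *what) { printf("released %s\n", what); }

int do_work(int fail_at)
{
    int status = 0;

    if ((status = acquire("inode lock", fail_at == 1)))
        goto bail;
    if ((status = acquire("orphan dir lock", fail_at == 2)))
        goto bail_unlock_inode;
    if ((status = acquire("transaction", fail_at == 3)))
        goto bail_unlock_orphan;

    /* ... actual work ... */
    release("transaction");

bail_unlock_orphan:
    release("orphan dir lock");
bail_unlock_inode:
    release("inode lock");
bail:
    return status;
}

int main(void) { return do_work(3) ? 1 : 0; }
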
2503int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, 2760int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
2504 struct inode *inode, 2761 struct inode *inode,
2505 struct dentry *dentry) 2762 struct dentry *dentry)
@@ -2588,7 +2845,7 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
2588 } 2845 }
2589 2846
2590 status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode, 2847 status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode,
2591 orphan_dir_bh); 2848 orphan_dir_bh, false);
2592 if (status < 0) { 2849 if (status < 0) {
2593 mlog_errno(status); 2850 mlog_errno(status);
2594 goto out_commit; 2851 goto out_commit;
diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h
index e5d059d4f115..5ddecce172fa 100644
--- a/fs/ocfs2/namei.h
+++ b/fs/ocfs2/namei.h
@@ -34,10 +34,16 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
34 handle_t *handle, 34 handle_t *handle,
35 struct inode *orphan_dir_inode, 35 struct inode *orphan_dir_inode,
36 struct inode *inode, 36 struct inode *inode,
37 struct buffer_head *orphan_dir_bh); 37 struct buffer_head *orphan_dir_bh,
38 bool dio);
38int ocfs2_create_inode_in_orphan(struct inode *dir, 39int ocfs2_create_inode_in_orphan(struct inode *dir,
39 int mode, 40 int mode,
40 struct inode **new_inode); 41 struct inode **new_inode);
42int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb,
43 struct inode *inode);
44int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb,
45 struct inode *inode, int update_isize,
46 loff_t end);
41int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, 47int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
42 struct inode *new_inode, 48 struct inode *new_inode,
43 struct dentry *new_dentry); 49 struct dentry *new_dentry);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 7d6b7d090452..8490c64d34fe 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -209,6 +209,11 @@ struct ocfs2_lock_res {
209#endif 209#endif
210}; 210};
211 211
212enum ocfs2_orphan_reco_type {
213 ORPHAN_NO_NEED_TRUNCATE = 0,
214 ORPHAN_NEED_TRUNCATE,
215};
216
212enum ocfs2_orphan_scan_state { 217enum ocfs2_orphan_scan_state {
213 ORPHAN_SCAN_ACTIVE, 218 ORPHAN_SCAN_ACTIVE,
214 ORPHAN_SCAN_INACTIVE 219 ORPHAN_SCAN_INACTIVE
@@ -279,6 +284,8 @@ enum ocfs2_mount_options
279 writes */ 284 writes */
280 OCFS2_MOUNT_HB_NONE = 1 << 13, /* No heartbeat */ 285 OCFS2_MOUNT_HB_NONE = 1 << 13, /* No heartbeat */
281 OCFS2_MOUNT_HB_GLOBAL = 1 << 14, /* Global heartbeat */ 286 OCFS2_MOUNT_HB_GLOBAL = 1 << 14, /* Global heartbeat */
287
288 OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT = 1 << 15, /* Journal Async Commit */
282}; 289};
283 290
284#define OCFS2_OSB_SOFT_RO 0x0001 291#define OCFS2_OSB_SOFT_RO 0x0001
@@ -493,6 +500,14 @@ static inline int ocfs2_writes_unwritten_extents(struct ocfs2_super *osb)
493 return 0; 500 return 0;
494} 501}
495 502
503static inline int ocfs2_supports_append_dio(struct ocfs2_super *osb)
504{
505 if (osb->s_feature_ro_compat & OCFS2_FEATURE_RO_COMPAT_APPEND_DIO)
506 return 1;
507 return 0;
508}
509
510
496static inline int ocfs2_supports_inline_data(struct ocfs2_super *osb) 511static inline int ocfs2_supports_inline_data(struct ocfs2_super *osb)
497{ 512{
498 if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_INLINE_DATA) 513 if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_INLINE_DATA)
@@ -724,6 +739,16 @@ static inline unsigned int ocfs2_clusters_for_bytes(struct super_block *sb,
724 return clusters; 739 return clusters;
725} 740}
726 741
742static inline unsigned int ocfs2_bytes_to_clusters(struct super_block *sb,
743 u64 bytes)
744{
745 int cl_bits = OCFS2_SB(sb)->s_clustersize_bits;
746 unsigned int clusters;
747
748 clusters = (unsigned int)(bytes >> cl_bits);
749 return clusters;
750}
751
727static inline u64 ocfs2_blocks_for_bytes(struct super_block *sb, 752static inline u64 ocfs2_blocks_for_bytes(struct super_block *sb,
728 u64 bytes) 753 u64 bytes)
729{ 754{
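
The new ocfs2_bytes_to_clusters() truncates with a plain right shift by s_clustersize_bits, unlike the round-up done by ocfs2_clusters_for_bytes() just above it. A small worked example, assuming a 64 KB cluster (clustersize_bits = 16):

/* Build: cc -o clusters clusters.c */
#include <stdio.h>
#include <stdint.h>

#define CLUSTERSIZE_BITS 16

static unsigned int bytes_to_clusters(uint64_t bytes)
{
    return (unsigned int)(bytes >> CLUSTERSIZE_BITS);   /* truncate */
}

static unsigned int clusters_for_bytes(uint64_t bytes)
{
    bytes += (1ULL << CLUSTERSIZE_BITS) - 1;            /* round up */
    return (unsigned int)(bytes >> CLUSTERSIZE_BITS);
}

int main(void)
{
    uint64_t bytes = 100000;    /* ~1.5 clusters */

    printf("truncated: %u, rounded up: %u\n",
           bytes_to_clusters(bytes), clusters_for_bytes(bytes));
    /* prints "truncated: 1, rounded up: 2" */
    return 0;
}
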
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index 938387a10d5d..20e37a3ed26f 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -105,7 +105,8 @@
105 | OCFS2_FEATURE_INCOMPAT_CLUSTERINFO) 105 | OCFS2_FEATURE_INCOMPAT_CLUSTERINFO)
106#define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \ 106#define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \
107 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \ 107 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \
108 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA) 108 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA \
109 | OCFS2_FEATURE_RO_COMPAT_APPEND_DIO)
109 110
110/* 111/*
111 * Heartbeat-only devices are missing journals and other files. The 112 * Heartbeat-only devices are missing journals and other files. The
@@ -199,6 +200,11 @@
199#define OCFS2_FEATURE_RO_COMPAT_USRQUOTA 0x0002 200#define OCFS2_FEATURE_RO_COMPAT_USRQUOTA 0x0002
200#define OCFS2_FEATURE_RO_COMPAT_GRPQUOTA 0x0004 201#define OCFS2_FEATURE_RO_COMPAT_GRPQUOTA 0x0004
201 202
203/*
204 * Append Direct IO support
205 */
206#define OCFS2_FEATURE_RO_COMPAT_APPEND_DIO 0x0008
207
202/* The byte offset of the first backup block will be 1G. 208/* The byte offset of the first backup block will be 1G.
203 * The following will be 4G, 16G, 64G, 256G and 1T. 209 * The following will be 4G, 16G, 64G, 256G and 1T.
204 */ 210 */
@@ -229,6 +235,8 @@
229#define OCFS2_CHAIN_FL (0x00000400) /* Chain allocator */ 235#define OCFS2_CHAIN_FL (0x00000400) /* Chain allocator */
230#define OCFS2_DEALLOC_FL (0x00000800) /* Truncate log */ 236#define OCFS2_DEALLOC_FL (0x00000800) /* Truncate log */
231#define OCFS2_QUOTA_FL (0x00001000) /* Quota file */ 237#define OCFS2_QUOTA_FL (0x00001000) /* Quota file */
238#define OCFS2_DIO_ORPHANED_FL (0X00002000) /* On the orphan list especially
239 * for dio */
232 240
233/* 241/*
234 * Flags on ocfs2_dinode.i_dyn_features 242 * Flags on ocfs2_dinode.i_dyn_features
@@ -729,7 +737,9 @@ struct ocfs2_dinode {
729 inode belongs to. Only valid 737 inode belongs to. Only valid
730 if allocated from a 738 if allocated from a
731 discontiguous block group */ 739 discontiguous block group */
732/*A0*/ __le64 i_reserved2[3]; 740/*A0*/ __le16 i_dio_orphaned_slot; /* only used for append dio write */
741 __le16 i_reserved1[3];
742 __le64 i_reserved2[2];
733/*B8*/ union { 743/*B8*/ union {
734 __le64 i_pad1; /* Generic way to refer to this 744 __le64 i_pad1; /* Generic way to refer to this
735 64bit union */ 745 64bit union */
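
The dinode change carves i_dio_orphaned_slot out of i_reserved2, so the /*A0*/ region stays 24 bytes and the /*B8*/ union keeps its offset. A quick stand-alone layout check, with plain integer types in place of __le16/__le64:

/* Build: cc -std=c11 -o layout layout.c */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct old_tail {           /* was: __le64 i_reserved2[3] at 0xA0 */
    uint64_t i_reserved2[3];
};

struct new_tail {           /* now: slot + shrunk reserves at 0xA0 */
    uint16_t i_dio_orphaned_slot;
    uint16_t i_reserved1[3];
    uint64_t i_reserved2[2];
};

int main(void)
{
    /* both layouts must cover exactly 0xA0..0xB7 (24 bytes) */
    static_assert(sizeof(struct old_tail) == 24, "old tail size");
    static_assert(sizeof(struct new_tail) == 24, "new tail size");
    printf("old %zu bytes, new %zu bytes\n",
           sizeof(struct old_tail), sizeof(struct new_tail));
    return 0;
}
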
diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h
index 1eae330193a6..b6d51333ad02 100644
--- a/fs/ocfs2/quota.h
+++ b/fs/ocfs2/quota.h
@@ -48,6 +48,7 @@ struct ocfs2_quota_recovery {
48/* In-memory structure with quota header information */ 48/* In-memory structure with quota header information */
49struct ocfs2_mem_dqinfo { 49struct ocfs2_mem_dqinfo {
50 unsigned int dqi_type; /* Quota type this structure describes */ 50 unsigned int dqi_type; /* Quota type this structure describes */
51 unsigned int dqi_flags; /* Flags OLQF_* */
51 unsigned int dqi_chunks; /* Number of chunks in local quota file */ 52 unsigned int dqi_chunks; /* Number of chunks in local quota file */
52 unsigned int dqi_blocks; /* Number of blocks allocated for local quota file */ 53 unsigned int dqi_blocks; /* Number of blocks allocated for local quota file */
53 unsigned int dqi_syncms; /* How often should we sync with other nodes */ 54 unsigned int dqi_syncms; /* How often should we sync with other nodes */
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index 10b653930ee2..3d0b63d34225 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -73,12 +73,6 @@ static loff_t ol_dqblk_off(struct super_block *sb, int c, int off)
73 ol_dqblk_block_off(sb, c, off); 73 ol_dqblk_block_off(sb, c, off);
74} 74}
75 75
76/* Compute block number from given offset */
77static inline unsigned int ol_dqblk_file_block(struct super_block *sb, loff_t off)
78{
79 return off >> sb->s_blocksize_bits;
80}
81
82static inline unsigned int ol_dqblk_block_offset(struct super_block *sb, loff_t off) 76static inline unsigned int ol_dqblk_block_offset(struct super_block *sb, loff_t off)
83{ 77{
84 return off & ((1 << sb->s_blocksize_bits) - 1); 78 return off & ((1 << sb->s_blocksize_bits) - 1);
@@ -292,7 +286,7 @@ static void olq_update_info(struct buffer_head *bh, void *private)
292 ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data + 286 ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data +
293 OCFS2_LOCAL_INFO_OFF); 287 OCFS2_LOCAL_INFO_OFF);
294 spin_lock(&dq_data_lock); 288 spin_lock(&dq_data_lock);
295 ldinfo->dqi_flags = cpu_to_le32(info->dqi_flags & DQF_MASK); 289 ldinfo->dqi_flags = cpu_to_le32(oinfo->dqi_flags);
296 ldinfo->dqi_chunks = cpu_to_le32(oinfo->dqi_chunks); 290 ldinfo->dqi_chunks = cpu_to_le32(oinfo->dqi_chunks);
297 ldinfo->dqi_blocks = cpu_to_le32(oinfo->dqi_blocks); 291 ldinfo->dqi_blocks = cpu_to_le32(oinfo->dqi_blocks);
298 spin_unlock(&dq_data_lock); 292 spin_unlock(&dq_data_lock);
@@ -701,8 +695,8 @@ static int ocfs2_local_read_info(struct super_block *sb, int type)
701 /* We don't need the lock and we have to acquire quota file locks 695 /* We don't need the lock and we have to acquire quota file locks
702 * which will later depend on this lock */ 696 * which will later depend on this lock */
703 mutex_unlock(&sb_dqopt(sb)->dqio_mutex); 697 mutex_unlock(&sb_dqopt(sb)->dqio_mutex);
704 info->dqi_maxblimit = 0x7fffffffffffffffLL; 698 info->dqi_max_spc_limit = 0x7fffffffffffffffLL;
705 info->dqi_maxilimit = 0x7fffffffffffffffLL; 699 info->dqi_max_ino_limit = 0x7fffffffffffffffLL;
706 oinfo = kmalloc(sizeof(struct ocfs2_mem_dqinfo), GFP_NOFS); 700 oinfo = kmalloc(sizeof(struct ocfs2_mem_dqinfo), GFP_NOFS);
707 if (!oinfo) { 701 if (!oinfo) {
708 mlog(ML_ERROR, "failed to allocate memory for ocfs2 quota" 702 mlog(ML_ERROR, "failed to allocate memory for ocfs2 quota"
@@ -737,13 +731,13 @@ static int ocfs2_local_read_info(struct super_block *sb, int type)
737 } 731 }
738 ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data + 732 ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data +
739 OCFS2_LOCAL_INFO_OFF); 733 OCFS2_LOCAL_INFO_OFF);
740 info->dqi_flags = le32_to_cpu(ldinfo->dqi_flags); 734 oinfo->dqi_flags = le32_to_cpu(ldinfo->dqi_flags);
741 oinfo->dqi_chunks = le32_to_cpu(ldinfo->dqi_chunks); 735 oinfo->dqi_chunks = le32_to_cpu(ldinfo->dqi_chunks);
742 oinfo->dqi_blocks = le32_to_cpu(ldinfo->dqi_blocks); 736 oinfo->dqi_blocks = le32_to_cpu(ldinfo->dqi_blocks);
743 oinfo->dqi_libh = bh; 737 oinfo->dqi_libh = bh;
744 738
745 /* We crashed when using local quota file? */ 739 /* We crashed when using local quota file? */
746 if (!(info->dqi_flags & OLQF_CLEAN)) { 740 if (!(oinfo->dqi_flags & OLQF_CLEAN)) {
747 rec = OCFS2_SB(sb)->quota_rec; 741 rec = OCFS2_SB(sb)->quota_rec;
748 if (!rec) { 742 if (!rec) {
749 rec = ocfs2_alloc_quota_recovery(); 743 rec = ocfs2_alloc_quota_recovery();
@@ -772,7 +766,7 @@ static int ocfs2_local_read_info(struct super_block *sb, int type)
772 } 766 }
773 767
774 /* Now mark quota file as used */ 768 /* Now mark quota file as used */
775 info->dqi_flags &= ~OLQF_CLEAN; 769 oinfo->dqi_flags &= ~OLQF_CLEAN;
776 status = ocfs2_modify_bh(lqinode, bh, olq_update_info, info); 770 status = ocfs2_modify_bh(lqinode, bh, olq_update_info, info);
777 if (status < 0) { 771 if (status < 0) {
778 mlog_errno(status); 772 mlog_errno(status);
@@ -857,7 +851,7 @@ static int ocfs2_local_free_info(struct super_block *sb, int type)
857 goto out; 851 goto out;
858 852
859 /* Mark local file as clean */ 853 /* Mark local file as clean */
860 info->dqi_flags |= OLQF_CLEAN; 854 oinfo->dqi_flags |= OLQF_CLEAN;
861 status = ocfs2_modify_bh(sb_dqopt(sb)->files[type], 855 status = ocfs2_modify_bh(sb_dqopt(sb)->files[type],
862 oinfo->dqi_libh, 856 oinfo->dqi_libh,
863 olq_update_info, 857 olq_update_info,
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index d81f6e2a97f5..ee541f92dab4 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -2428,8 +2428,6 @@ static int ocfs2_calc_refcount_meta_credits(struct super_block *sb,
2428 get_bh(prev_bh); 2428 get_bh(prev_bh);
2429 } 2429 }
2430 2430
2431 rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
2432
2433 trace_ocfs2_calc_refcount_meta_credits_iterate( 2431 trace_ocfs2_calc_refcount_meta_credits_iterate(
2434 recs_add, (unsigned long long)cpos, clusters, 2432 recs_add, (unsigned long long)cpos, clusters,
2435 (unsigned long long)le64_to_cpu(rec.r_cpos), 2433 (unsigned long long)le64_to_cpu(rec.r_cpos),
diff --git a/fs/ocfs2/reservations.c b/fs/ocfs2/reservations.c
index 41ffd36c689c..6a348b0294ab 100644
--- a/fs/ocfs2/reservations.c
+++ b/fs/ocfs2/reservations.c
@@ -39,7 +39,7 @@
39#define OCFS2_CHECK_RESERVATIONS 39#define OCFS2_CHECK_RESERVATIONS
40#endif 40#endif
41 41
42DEFINE_SPINLOCK(resv_lock); 42static DEFINE_SPINLOCK(resv_lock);
43 43
44#define OCFS2_MIN_RESV_WINDOW_BITS 8 44#define OCFS2_MIN_RESV_WINDOW_BITS 8
45#define OCFS2_MAX_RESV_WINDOW_BITS 1024 45#define OCFS2_MAX_RESV_WINDOW_BITS 1024
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 83723179e1ec..26675185b886 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -191,6 +191,7 @@ enum {
191 Opt_coherency_full, 191 Opt_coherency_full,
192 Opt_resv_level, 192 Opt_resv_level,
193 Opt_dir_resv_level, 193 Opt_dir_resv_level,
194 Opt_journal_async_commit,
194 Opt_err, 195 Opt_err,
195}; 196};
196 197
@@ -222,6 +223,7 @@ static const match_table_t tokens = {
222 {Opt_coherency_full, "coherency=full"}, 223 {Opt_coherency_full, "coherency=full"},
223 {Opt_resv_level, "resv_level=%u"}, 224 {Opt_resv_level, "resv_level=%u"},
224 {Opt_dir_resv_level, "dir_resv_level=%u"}, 225 {Opt_dir_resv_level, "dir_resv_level=%u"},
226 {Opt_journal_async_commit, "journal_async_commit"},
225 {Opt_err, NULL} 227 {Opt_err, NULL}
226}; 228};
227 229
@@ -1000,36 +1002,6 @@ static void ocfs2_disable_quotas(struct ocfs2_super *osb)
1000 } 1002 }
1001} 1003}
1002 1004
1003/* Handle quota on quotactl */
1004static int ocfs2_quota_on(struct super_block *sb, int type, int format_id)
1005{
1006 unsigned int feature[OCFS2_MAXQUOTAS] = {
1007 OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
1008 OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
1009
1010 if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
1011 return -EINVAL;
1012
1013 return dquot_enable(sb_dqopt(sb)->files[type], type,
1014 format_id, DQUOT_LIMITS_ENABLED);
1015}
1016
1017/* Handle quota off quotactl */
1018static int ocfs2_quota_off(struct super_block *sb, int type)
1019{
1020 return dquot_disable(sb, type, DQUOT_LIMITS_ENABLED);
1021}
1022
1023static const struct quotactl_ops ocfs2_quotactl_ops = {
1024 .quota_on_meta = ocfs2_quota_on,
1025 .quota_off = ocfs2_quota_off,
1026 .quota_sync = dquot_quota_sync,
1027 .get_info = dquot_get_dqinfo,
1028 .set_info = dquot_set_dqinfo,
1029 .get_dqblk = dquot_get_dqblk,
1030 .set_dqblk = dquot_set_dqblk,
1031};
1032
1033static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) 1005static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
1034{ 1006{
1035 struct dentry *root; 1007 struct dentry *root;
@@ -1500,6 +1472,9 @@ static int ocfs2_parse_options(struct super_block *sb,
1500 option < OCFS2_MAX_RESV_LEVEL) 1472 option < OCFS2_MAX_RESV_LEVEL)
1501 mopt->dir_resv_level = option; 1473 mopt->dir_resv_level = option;
1502 break; 1474 break;
1475 case Opt_journal_async_commit:
1476 mopt->mount_opt |= OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT;
1477 break;
1503 default: 1478 default:
1504 mlog(ML_ERROR, 1479 mlog(ML_ERROR,
1505 "Unrecognized mount option \"%s\" " 1480 "Unrecognized mount option \"%s\" "
@@ -1606,6 +1581,9 @@ static int ocfs2_show_options(struct seq_file *s, struct dentry *root)
1606 if (osb->osb_dir_resv_level != osb->osb_resv_level) 1581 if (osb->osb_dir_resv_level != osb->osb_resv_level)
1607 seq_printf(s, ",dir_resv_level=%d", osb->osb_resv_level); 1582 seq_printf(s, ",dir_resv_level=%d", osb->osb_resv_level);
1608 1583
1584 if (opts & OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT)
1585 seq_printf(s, ",journal_async_commit");
1586
1609 return 0; 1587 return 0;
1610} 1588}
1611 1589
@@ -1768,6 +1746,8 @@ static void ocfs2_inode_init_once(void *data)
1768 ocfs2_lock_res_init_once(&oi->ip_inode_lockres); 1746 ocfs2_lock_res_init_once(&oi->ip_inode_lockres);
1769 ocfs2_lock_res_init_once(&oi->ip_open_lockres); 1747 ocfs2_lock_res_init_once(&oi->ip_open_lockres);
1770 1748
1749 init_waitqueue_head(&oi->append_dio_wq);
1750
1771 ocfs2_metadata_cache_init(INODE_CACHE(&oi->vfs_inode), 1751 ocfs2_metadata_cache_init(INODE_CACHE(&oi->vfs_inode),
1772 &ocfs2_inode_caching_ops); 1752 &ocfs2_inode_caching_ops);
1773 1753
@@ -2079,7 +2059,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
2079 sb->s_op = &ocfs2_sops; 2059 sb->s_op = &ocfs2_sops;
2080 sb->s_d_op = &ocfs2_dentry_ops; 2060 sb->s_d_op = &ocfs2_dentry_ops;
2081 sb->s_export_op = &ocfs2_export_ops; 2061 sb->s_export_op = &ocfs2_export_ops;
2082 sb->s_qcop = &ocfs2_quotactl_ops; 2062 sb->s_qcop = &dquot_quotactl_sysfile_ops;
2083 sb->dq_op = &ocfs2_quota_operations; 2063 sb->dq_op = &ocfs2_quota_operations;
2084 sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP; 2064 sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP;
2085 sb->s_xattr = ocfs2_xattr_handlers; 2065 sb->s_xattr = ocfs2_xattr_handlers;
@@ -2475,6 +2455,15 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
2475 goto finally; 2455 goto finally;
2476 } 2456 }
2477 2457
2458 if (osb->s_mount_opt & OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT)
2459 jbd2_journal_set_features(osb->journal->j_journal,
2460 JBD2_FEATURE_COMPAT_CHECKSUM, 0,
2461 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
2462 else
2463 jbd2_journal_clear_features(osb->journal->j_journal,
2464 JBD2_FEATURE_COMPAT_CHECKSUM, 0,
2465 JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT);
2466
2478 if (dirty) { 2467 if (dirty) {
2479 /* recover my local alloc if we didn't unmount cleanly. */ 2468 /* recover my local alloc if we didn't unmount cleanly. */
2480 status = ocfs2_begin_local_alloc_recovery(osb, 2469 status = ocfs2_begin_local_alloc_recovery(osb,
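
The journal_async_commit plumbing is a standard match-table option: recognize the token, set a bit in mount_opt, then branch on that bit at mount time to set or clear the JBD2 features. A sketch of the flow, printing where the kernel calls jbd2_journal_set_features()/jbd2_journal_clear_features():

/* Build: cc -o mopt mopt.c */
#include <stdio.h>
#include <string.h>

#define MOUNT_JOURNAL_ASYNC_COMMIT (1 << 15)

static const struct { const char *pattern; unsigned int flag; } tokens[] = {
    { "journal_async_commit", MOUNT_JOURNAL_ASYNC_COMMIT },
};

static int parse_option(const char *opt, unsigned int *mount_opt)
{
    for (size_t i = 0; i < sizeof(tokens) / sizeof(tokens[0]); i++) {
        if (!strcmp(opt, tokens[i].pattern)) {
            *mount_opt |= tokens[i].flag;
            return 0;
        }
    }
    return -1;  /* unrecognized option */
}

int main(void)
{
    unsigned int mount_opt = 0;

    parse_option("journal_async_commit", &mount_opt);
    if (mount_opt & MOUNT_JOURNAL_ASYNC_COMMIT)
        puts("would set JBD2 async-commit + checksum features");
    else
        puts("would clear JBD2 async-commit feature");
    return 0;
}
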
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index 662f8dee149f..85b190dc132f 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -5334,16 +5334,6 @@ out:
5334 return ret; 5334 return ret;
5335} 5335}
5336 5336
5337static inline char *ocfs2_xattr_bucket_get_val(struct inode *inode,
5338 struct ocfs2_xattr_bucket *bucket,
5339 int offs)
5340{
5341 int block_off = offs >> inode->i_sb->s_blocksize_bits;
5342
5343 offs = offs % inode->i_sb->s_blocksize;
5344 return bucket_block(bucket, block_off) + offs;
5345}
5346
5347/* 5337/*
5348 * Truncate the specified xe_off entry in xattr bucket. 5338 * Truncate the specified xe_off entry in xattr bucket.
5349 * bucket is indicated by header_bh and len is the new length. 5339 * bucket is indicated by header_bh and len is the new length.
diff --git a/fs/open.c b/fs/open.c
index 813be037b412..33f9cbf2610b 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -667,11 +667,8 @@ int open_check_o_direct(struct file *f)
667{ 667{
668 /* NB: we're sure to have correct a_ops only after f_op->open */ 668 /* NB: we're sure to have correct a_ops only after f_op->open */
669 if (f->f_flags & O_DIRECT) { 669 if (f->f_flags & O_DIRECT) {
670 if (!f->f_mapping->a_ops || 670 if (!f->f_mapping->a_ops || !f->f_mapping->a_ops->direct_IO)
671 ((!f->f_mapping->a_ops->direct_IO) &&
672 (!f->f_mapping->a_ops->get_xip_mem))) {
673 return -EINVAL; 671 return -EINVAL;
674 }
675 } 672 }
676 return 0; 673 return 0;
677} 674}
@@ -971,8 +968,14 @@ struct file *file_open_name(struct filename *name, int flags, umode_t mode)
971 */ 968 */
972struct file *filp_open(const char *filename, int flags, umode_t mode) 969struct file *filp_open(const char *filename, int flags, umode_t mode)
973{ 970{
974 struct filename name = {.name = filename}; 971 struct filename *name = getname_kernel(filename);
975 return file_open_name(&name, flags, mode); 972 struct file *file = ERR_CAST(name);
973
974 if (!IS_ERR(name)) {
975 file = file_open_name(name, flags, mode);
976 putname(name);
977 }
978 return file;
976} 979}
977EXPORT_SYMBOL(filp_open); 980EXPORT_SYMBOL(filp_open);
978 981
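
The reworked filp_open() leans on the error-pointer idiom: getname_kernel() returns either a valid pointer or an errno encoded in the pointer value, and ERR_CAST forwards the failure unchanged. A user-space sketch of that encoding; the getname_kernel() here is a toy stand-in, not the kernel's:

/* Build: cc -o errptr errptr.c */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAX_ERRNO 4095

static void *err_ptr(long err) { return (void *)err; }
static long ptr_err(const void *p) { return (long)p; }
static int is_err(const void *p)
{
    /* the top MAX_ERRNO addresses are reserved for error codes */
    return (unsigned long)p >= (unsigned long)-MAX_ERRNO;
}

static char *getname_kernel(const char *s)  /* toy stand-in */
{
    char *name = strdup(s);
    return name ? name : err_ptr(-ENOMEM);
}

int main(void)
{
    char *name = getname_kernel("/tmp/file");
    if (is_err(name)) {
        fprintf(stderr, "error %ld\n", -ptr_err(name));
        return 1;
    }
    printf("got name: %s\n", name);
    free(name);
    return 0;
}
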
diff --git a/fs/proc/array.c b/fs/proc/array.c
index bd117d065b82..1295a00ca316 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -81,6 +81,7 @@
81#include <linux/pid_namespace.h> 81#include <linux/pid_namespace.h>
82#include <linux/ptrace.h> 82#include <linux/ptrace.h>
83#include <linux/tracehook.h> 83#include <linux/tracehook.h>
84#include <linux/string_helpers.h>
84#include <linux/user_namespace.h> 85#include <linux/user_namespace.h>
85 86
86#include <asm/pgtable.h> 87#include <asm/pgtable.h>
@@ -89,39 +90,18 @@
89 90
90static inline void task_name(struct seq_file *m, struct task_struct *p) 91static inline void task_name(struct seq_file *m, struct task_struct *p)
91{ 92{
92 int i; 93 char *buf;
93 char *buf, *end;
94 char *name;
95 char tcomm[sizeof(p->comm)]; 94 char tcomm[sizeof(p->comm)];
96 95
97 get_task_comm(tcomm, p); 96 get_task_comm(tcomm, p);
98 97
99 seq_puts(m, "Name:\t"); 98 seq_puts(m, "Name:\t");
100 end = m->buf + m->size;
101 buf = m->buf + m->count; 99 buf = m->buf + m->count;
102 name = tcomm; 100
103 i = sizeof(tcomm); 101 /* Ignore error for now */
104 while (i && (buf < end)) { 102 string_escape_str(tcomm, &buf, m->size - m->count,
105 unsigned char c = *name; 103 ESCAPE_SPACE | ESCAPE_SPECIAL, "\n\\");
106 name++; 104
107 i--;
108 *buf = c;
109 if (!c)
110 break;
111 if (c == '\\') {
112 buf++;
113 if (buf < end)
114 *buf++ = c;
115 continue;
116 }
117 if (c == '\n') {
118 *buf++ = '\\';
119 if (buf < end)
120 *buf++ = 'n';
121 continue;
122 }
123 buf++;
124 }
125 m->count = buf - m->buf; 105 m->count = buf - m->buf;
126 seq_putc(m, '\n'); 106 seq_putc(m, '\n');
127} 107}
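
string_escape_str() replaces the old open-coded loop above. A minimal stand-in showing the same transformation, '\n' to "\n" and '\\' doubled, into a bounded buffer:

/* Build: cc -o escape escape.c */
#include <stdio.h>
#include <string.h>

static size_t escape_name(const char *src, char *dst, size_t size)
{
    size_t n = 0;

    for (; *src; src++) {
        if (*src == '\n' || *src == '\\') {
            if (n + 2 > size)
                break;              /* stop before overflowing */
            dst[n++] = '\\';
            dst[n++] = (*src == '\n') ? 'n' : '\\';
        } else {
            if (n + 1 > size)
                break;
            dst[n++] = *src;
        }
    }
    if (n < size)
        dst[n] = '\0';
    return n;
}

int main(void)
{
    char buf[64];

    escape_name("bad\nname\\here", buf, sizeof(buf));
    printf("Name:\t%s\n", buf);     /* Name:   bad\nname\\here */
    return 0;
}
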
@@ -336,12 +316,10 @@ static inline void task_context_switch_counts(struct seq_file *m,
336 316
337static void task_cpus_allowed(struct seq_file *m, struct task_struct *task) 317static void task_cpus_allowed(struct seq_file *m, struct task_struct *task)
338{ 318{
339 seq_puts(m, "Cpus_allowed:\t"); 319 seq_printf(m, "Cpus_allowed:\t%*pb\n",
340 seq_cpumask(m, &task->cpus_allowed); 320 cpumask_pr_args(&task->cpus_allowed));
341 seq_putc(m, '\n'); 321 seq_printf(m, "Cpus_allowed_list:\t%*pbl\n",
342 seq_puts(m, "Cpus_allowed_list:\t"); 322 cpumask_pr_args(&task->cpus_allowed));
343 seq_cpumask_list(m, &task->cpus_allowed);
344 seq_putc(m, '\n');
345} 323}
346 324
347int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, 325int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
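
The %*pb and %*pbl specifiers print a bitmap as hex words and as a range list respectively. A sketch of the range-list form, with a plain integer standing in for struct cpumask:

/* Build: cc -o cpus cpus.c */
#include <stdio.h>

static void print_ranges(unsigned long long mask)
{
    const char *sep = "";

    for (int bit = 0; bit < 64; bit++) {
        if (!(mask & (1ULL << bit)))
            continue;
        int start = bit;
        /* extend the run while consecutive bits are set */
        while (bit + 1 < 64 && (mask & (1ULL << (bit + 1))))
            bit++;
        if (bit > start)
            printf("%s%d-%d", sep, start, bit);
        else
            printf("%s%d", sep, start);
        sep = ",";
    }
    putchar('\n');
}

int main(void)
{
    printf("Cpus_allowed:\t%llx\n", 0x10fULL);  /* like %*pb  */
    printf("Cpus_allowed_list:\t");
    print_ranges(0x10f);                        /* like %*pbl: 0-3,8 */
    return 0;
}
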
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 7fea13229f33..3309f59d421b 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -122,7 +122,7 @@ static int proc_getattr(struct vfsmount *mnt, struct dentry *dentry,
122 struct kstat *stat) 122 struct kstat *stat)
123{ 123{
124 struct inode *inode = dentry->d_inode; 124 struct inode *inode = dentry->d_inode;
125 struct proc_dir_entry *de = PROC_I(inode)->pde; 125 struct proc_dir_entry *de = PDE(inode);
126 if (de && de->nlink) 126 if (de && de->nlink)
127 set_nlink(inode, de->nlink); 127 set_nlink(inode, de->nlink);
128 128
@@ -350,29 +350,12 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp
350 if (ret) 350 if (ret)
351 return ret; 351 return ret;
352 352
353 if (S_ISDIR(dp->mode)) {
354 dp->proc_fops = &proc_dir_operations;
355 dp->proc_iops = &proc_dir_inode_operations;
356 dir->nlink++;
357 } else if (S_ISLNK(dp->mode)) {
358 dp->proc_iops = &proc_link_inode_operations;
359 } else if (S_ISREG(dp->mode)) {
360 BUG_ON(dp->proc_fops == NULL);
361 dp->proc_iops = &proc_file_inode_operations;
362 } else {
363 WARN_ON(1);
364 proc_free_inum(dp->low_ino);
365 return -EINVAL;
366 }
367
368 spin_lock(&proc_subdir_lock); 353 spin_lock(&proc_subdir_lock);
369 dp->parent = dir; 354 dp->parent = dir;
370 if (pde_subdir_insert(dir, dp) == false) { 355 if (pde_subdir_insert(dir, dp) == false) {
371 WARN(1, "proc_dir_entry '%s/%s' already registered\n", 356 WARN(1, "proc_dir_entry '%s/%s' already registered\n",
372 dir->name, dp->name); 357 dir->name, dp->name);
373 spin_unlock(&proc_subdir_lock); 358 spin_unlock(&proc_subdir_lock);
374 if (S_ISDIR(dp->mode))
375 dir->nlink--;
376 proc_free_inum(dp->low_ino); 359 proc_free_inum(dp->low_ino);
377 return -EEXIST; 360 return -EEXIST;
378 } 361 }
@@ -431,6 +414,7 @@ struct proc_dir_entry *proc_symlink(const char *name,
431 ent->data = kmalloc((ent->size=strlen(dest))+1, GFP_KERNEL); 414 ent->data = kmalloc((ent->size=strlen(dest))+1, GFP_KERNEL);
432 if (ent->data) { 415 if (ent->data) {
433 strcpy((char*)ent->data,dest); 416 strcpy((char*)ent->data,dest);
417 ent->proc_iops = &proc_link_inode_operations;
434 if (proc_register(parent, ent) < 0) { 418 if (proc_register(parent, ent) < 0) {
435 kfree(ent->data); 419 kfree(ent->data);
436 kfree(ent); 420 kfree(ent);
@@ -456,8 +440,12 @@ struct proc_dir_entry *proc_mkdir_data(const char *name, umode_t mode,
456 ent = __proc_create(&parent, name, S_IFDIR | mode, 2); 440 ent = __proc_create(&parent, name, S_IFDIR | mode, 2);
457 if (ent) { 441 if (ent) {
458 ent->data = data; 442 ent->data = data;
443 ent->proc_fops = &proc_dir_operations;
444 ent->proc_iops = &proc_dir_inode_operations;
445 parent->nlink++;
459 if (proc_register(parent, ent) < 0) { 446 if (proc_register(parent, ent) < 0) {
460 kfree(ent); 447 kfree(ent);
448 parent->nlink--;
461 ent = NULL; 449 ent = NULL;
462 } 450 }
463 } 451 }
@@ -493,6 +481,8 @@ struct proc_dir_entry *proc_create_data(const char *name, umode_t mode,
493 return NULL; 481 return NULL;
494 } 482 }
495 483
484 BUG_ON(proc_fops == NULL);
485
496 if ((mode & S_IALLUGO) == 0) 486 if ((mode & S_IALLUGO) == 0)
497 mode |= S_IRUGO; 487 mode |= S_IRUGO;
498 pde = __proc_create(&parent, name, mode, 1); 488 pde = __proc_create(&parent, name, mode, 1);
@@ -500,6 +490,7 @@ struct proc_dir_entry *proc_create_data(const char *name, umode_t mode,
500 goto out; 490 goto out;
501 pde->proc_fops = proc_fops; 491 pde->proc_fops = proc_fops;
502 pde->data = data; 492 pde->data = data;
493 pde->proc_iops = &proc_file_inode_operations;
503 if (proc_register(parent, pde) < 0) 494 if (proc_register(parent, pde) < 0)
504 goto out_free; 495 goto out_free;
505 return pde; 496 return pde;
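
The proc_register() rework moves the proc_fops/proc_iops assignment to the creation sites, so an entry is fully initialized before it is published in the subdir tree and a concurrent lookup can never see it half-built. The shape of that rule, with a toy registry in place of the pde subdir tree:

/* Build: cc -o publish publish.c */
#include <stdio.h>

struct entry {
    const char *name;
    const void *fops;   /* must be valid before publishing */
    struct entry *next;
};

static struct entry *tree;  /* shared structure (toy registry) */

static int reg(struct entry *e)
{
    if (!e->fops)       /* mirrors BUG_ON(proc_fops == NULL) */
        return -1;
    e->next = tree;     /* publish: visible from here on */
    tree = e;
    return 0;
}

int main(void)
{
    static const int dummy_fops;
    struct entry e = { .name = "stat" };

    e.fops = &dummy_fops;   /* initialize first... */
    if (reg(&e) == 0)       /* ...then publish */
        printf("registered %s\n", e.name);
    return 0;
}
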
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 8420a2f80811..13a50a32652d 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -40,7 +40,7 @@ static void proc_evict_inode(struct inode *inode)
40 put_pid(PROC_I(inode)->pid); 40 put_pid(PROC_I(inode)->pid);
41 41
42 /* Let go of any associated proc directory entry */ 42 /* Let go of any associated proc directory entry */
43 de = PROC_I(inode)->pde; 43 de = PDE(inode);
44 if (de) 44 if (de)
45 pde_put(de); 45 pde_put(de);
46 head = PROC_I(inode)->sysctl; 46 head = PROC_I(inode)->sysctl;
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 1e3187da1fed..7eee2d8b97d9 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -5,6 +5,7 @@
5#include <linux/ksm.h> 5#include <linux/ksm.h>
6#include <linux/mm.h> 6#include <linux/mm.h>
7#include <linux/mmzone.h> 7#include <linux/mmzone.h>
8#include <linux/huge_mm.h>
8#include <linux/proc_fs.h> 9#include <linux/proc_fs.h>
9#include <linux/seq_file.h> 10#include <linux/seq_file.h>
10#include <linux/hugetlb.h> 11#include <linux/hugetlb.h>
@@ -121,9 +122,18 @@ u64 stable_page_flags(struct page *page)
121 * just checks PG_head/PG_tail, so we need to check PageLRU/PageAnon 122 * just checks PG_head/PG_tail, so we need to check PageLRU/PageAnon
122 * to make sure a given page is a thp, not a non-huge compound page. 123 * to make sure a given page is a thp, not a non-huge compound page.
123 */ 124 */
124 else if (PageTransCompound(page) && (PageLRU(compound_head(page)) || 125 else if (PageTransCompound(page)) {
125 PageAnon(compound_head(page)))) 126 struct page *head = compound_head(page);
126 u |= 1 << KPF_THP; 127
128 if (PageLRU(head) || PageAnon(head))
129 u |= 1 << KPF_THP;
130 else if (is_huge_zero_page(head)) {
131 u |= 1 << KPF_ZERO_PAGE;
132 u |= 1 << KPF_THP;
133 }
134 } else if (is_zero_pfn(page_to_pfn(page)))
135 u |= 1 << KPF_ZERO_PAGE;
136
127 137
128 /* 138 /*
129 * Caveats on high order pages: page->_count will only be set 139 * Caveats on high order pages: page->_count will only be set
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 246eae84b13b..956b75d61809 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -21,7 +21,7 @@
21 21
22void task_mem(struct seq_file *m, struct mm_struct *mm) 22void task_mem(struct seq_file *m, struct mm_struct *mm)
23{ 23{
24 unsigned long data, text, lib, swap; 24 unsigned long data, text, lib, swap, ptes, pmds;
25 unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss; 25 unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
26 26
27 /* 27 /*
@@ -42,6 +42,8 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
42 text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10; 42 text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10;
43 lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text; 43 lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text;
44 swap = get_mm_counter(mm, MM_SWAPENTS); 44 swap = get_mm_counter(mm, MM_SWAPENTS);
45 ptes = PTRS_PER_PTE * sizeof(pte_t) * atomic_long_read(&mm->nr_ptes);
46 pmds = PTRS_PER_PMD * sizeof(pmd_t) * mm_nr_pmds(mm);
45 seq_printf(m, 47 seq_printf(m,
46 "VmPeak:\t%8lu kB\n" 48 "VmPeak:\t%8lu kB\n"
47 "VmSize:\t%8lu kB\n" 49 "VmSize:\t%8lu kB\n"
@@ -54,6 +56,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
54 "VmExe:\t%8lu kB\n" 56 "VmExe:\t%8lu kB\n"
55 "VmLib:\t%8lu kB\n" 57 "VmLib:\t%8lu kB\n"
56 "VmPTE:\t%8lu kB\n" 58 "VmPTE:\t%8lu kB\n"
59 "VmPMD:\t%8lu kB\n"
57 "VmSwap:\t%8lu kB\n", 60 "VmSwap:\t%8lu kB\n",
58 hiwater_vm << (PAGE_SHIFT-10), 61 hiwater_vm << (PAGE_SHIFT-10),
59 total_vm << (PAGE_SHIFT-10), 62 total_vm << (PAGE_SHIFT-10),
@@ -63,8 +66,8 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
63 total_rss << (PAGE_SHIFT-10), 66 total_rss << (PAGE_SHIFT-10),
64 data << (PAGE_SHIFT-10), 67 data << (PAGE_SHIFT-10),
65 mm->stack_vm << (PAGE_SHIFT-10), text, lib, 68 mm->stack_vm << (PAGE_SHIFT-10), text, lib,
66 (PTRS_PER_PTE * sizeof(pte_t) * 69 ptes >> 10,
67 atomic_long_read(&mm->nr_ptes)) >> 10, 70 pmds >> 10,
68 swap << (PAGE_SHIFT-10)); 71 swap << (PAGE_SHIFT-10));
69} 72}
70 73
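
The new VmPTE/VmPMD lines are straight arithmetic: entries per table, times entry size, times table count, shifted into kB. Worked numbers below assume x86-64, where both levels hold 512 entries of 8 bytes:

/* Build: cc -o vmpte vmpte.c */
#include <stdio.h>

#define PTRS_PER_TABLE 512
#define ENTRY_SIZE     8    /* sizeof(pte_t) == sizeof(pmd_t) here */

int main(void)
{
    unsigned long nr_ptes = 40;     /* sample counter values */
    unsigned long nr_pmds = 3;

    unsigned long ptes = PTRS_PER_TABLE * ENTRY_SIZE * nr_ptes;
    unsigned long pmds = PTRS_PER_TABLE * ENTRY_SIZE * nr_pmds;

    printf("VmPTE:\t%8lu kB\n", ptes >> 10);    /* 160 kB */
    printf("VmPMD:\t%8lu kB\n", pmds >> 10);    /*  12 kB */
    return 0;
}
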
@@ -433,7 +436,6 @@ const struct file_operations proc_tid_maps_operations = {
433 436
434#ifdef CONFIG_PROC_PAGE_MONITOR 437#ifdef CONFIG_PROC_PAGE_MONITOR
435struct mem_size_stats { 438struct mem_size_stats {
436 struct vm_area_struct *vma;
437 unsigned long resident; 439 unsigned long resident;
438 unsigned long shared_clean; 440 unsigned long shared_clean;
439 unsigned long shared_dirty; 441 unsigned long shared_dirty;
@@ -443,7 +445,6 @@ struct mem_size_stats {
443 unsigned long anonymous; 445 unsigned long anonymous;
444 unsigned long anonymous_thp; 446 unsigned long anonymous_thp;
445 unsigned long swap; 447 unsigned long swap;
446 unsigned long nonlinear;
447 u64 pss; 448 u64 pss;
448}; 449};
449 450
@@ -483,8 +484,7 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
483 struct mm_walk *walk) 484 struct mm_walk *walk)
484{ 485{
485 struct mem_size_stats *mss = walk->private; 486 struct mem_size_stats *mss = walk->private;
486 struct vm_area_struct *vma = mss->vma; 487 struct vm_area_struct *vma = walk->vma;
487 pgoff_t pgoff = linear_page_index(vma, addr);
488 struct page *page = NULL; 488 struct page *page = NULL;
489 489
490 if (pte_present(*pte)) { 490 if (pte_present(*pte)) {
@@ -496,17 +496,10 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
496 mss->swap += PAGE_SIZE; 496 mss->swap += PAGE_SIZE;
497 else if (is_migration_entry(swpent)) 497 else if (is_migration_entry(swpent))
498 page = migration_entry_to_page(swpent); 498 page = migration_entry_to_page(swpent);
499 } else if (pte_file(*pte)) {
500 if (pte_to_pgoff(*pte) != pgoff)
501 mss->nonlinear += PAGE_SIZE;
502 } 499 }
503 500
504 if (!page) 501 if (!page)
505 return; 502 return;
506
507 if (page->index != pgoff)
508 mss->nonlinear += PAGE_SIZE;
509
510 smaps_account(mss, page, PAGE_SIZE, pte_young(*pte), pte_dirty(*pte)); 503 smaps_account(mss, page, PAGE_SIZE, pte_young(*pte), pte_dirty(*pte));
511} 504}
512 505
@@ -515,7 +508,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
515 struct mm_walk *walk) 508 struct mm_walk *walk)
516{ 509{
517 struct mem_size_stats *mss = walk->private; 510 struct mem_size_stats *mss = walk->private;
518 struct vm_area_struct *vma = mss->vma; 511 struct vm_area_struct *vma = walk->vma;
519 struct page *page; 512 struct page *page;
520 513
521 /* FOLL_DUMP will return -EFAULT on huge zero page */ 514 /* FOLL_DUMP will return -EFAULT on huge zero page */
@@ -536,8 +529,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
536static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, 529static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
537 struct mm_walk *walk) 530 struct mm_walk *walk)
538{ 531{
539 struct mem_size_stats *mss = walk->private; 532 struct vm_area_struct *vma = walk->vma;
540 struct vm_area_struct *vma = mss->vma;
541 pte_t *pte; 533 pte_t *pte;
542 spinlock_t *ptl; 534 spinlock_t *ptl;
543 535
@@ -596,7 +588,6 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
596 [ilog2(VM_ACCOUNT)] = "ac", 588 [ilog2(VM_ACCOUNT)] = "ac",
597 [ilog2(VM_NORESERVE)] = "nr", 589 [ilog2(VM_NORESERVE)] = "nr",
598 [ilog2(VM_HUGETLB)] = "ht", 590 [ilog2(VM_HUGETLB)] = "ht",
599 [ilog2(VM_NONLINEAR)] = "nl",
600 [ilog2(VM_ARCH_1)] = "ar", 591 [ilog2(VM_ARCH_1)] = "ar",
601 [ilog2(VM_DONTDUMP)] = "dd", 592 [ilog2(VM_DONTDUMP)] = "dd",
602#ifdef CONFIG_MEM_SOFT_DIRTY 593#ifdef CONFIG_MEM_SOFT_DIRTY
@@ -630,10 +621,8 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
630 }; 621 };
631 622
632 memset(&mss, 0, sizeof mss); 623 memset(&mss, 0, sizeof mss);
633 mss.vma = vma;
634 /* mmap_sem is held in m_start */ 624 /* mmap_sem is held in m_start */
635 if (vma->vm_mm && !is_vm_hugetlb_page(vma)) 625 walk_page_vma(vma, &smaps_walk);
636 walk_page_range(vma->vm_start, vma->vm_end, &smaps_walk);
637 626
638 show_map_vma(m, vma, is_pid); 627 show_map_vma(m, vma, is_pid);
639 628
@@ -668,10 +657,6 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
668 (vma->vm_flags & VM_LOCKED) ? 657 (vma->vm_flags & VM_LOCKED) ?
669 (unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0); 658 (unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0);
670 659
671 if (vma->vm_flags & VM_NONLINEAR)
672 seq_printf(m, "Nonlinear: %8lu kB\n",
673 mss.nonlinear >> 10);
674
675 show_smap_vma_flags(m, vma); 660 show_smap_vma_flags(m, vma);
676 m_cache_vma(m, vma); 661 m_cache_vma(m, vma);
677 return 0; 662 return 0;
@@ -747,18 +732,18 @@ enum clear_refs_types {
747 CLEAR_REFS_ANON, 732 CLEAR_REFS_ANON,
748 CLEAR_REFS_MAPPED, 733 CLEAR_REFS_MAPPED,
749 CLEAR_REFS_SOFT_DIRTY, 734 CLEAR_REFS_SOFT_DIRTY,
735 CLEAR_REFS_MM_HIWATER_RSS,
750 CLEAR_REFS_LAST, 736 CLEAR_REFS_LAST,
751}; 737};
752 738
753struct clear_refs_private { 739struct clear_refs_private {
754 struct vm_area_struct *vma;
755 enum clear_refs_types type; 740 enum clear_refs_types type;
756}; 741};
757 742
743#ifdef CONFIG_MEM_SOFT_DIRTY
758static inline void clear_soft_dirty(struct vm_area_struct *vma, 744static inline void clear_soft_dirty(struct vm_area_struct *vma,
759 unsigned long addr, pte_t *pte) 745 unsigned long addr, pte_t *pte)
760{ 746{
761#ifdef CONFIG_MEM_SOFT_DIRTY
762 /* 747 /*
763 * The soft-dirty tracker uses #PF-s to catch writes 748 * The soft-dirty tracker uses #PF-s to catch writes
764 * to pages, so write-protect the pte as well. See the 749 * to pages, so write-protect the pte as well. See the
@@ -772,24 +757,63 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma,
772 ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY); 757 ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY);
773 } else if (is_swap_pte(ptent)) { 758 } else if (is_swap_pte(ptent)) {
774 ptent = pte_swp_clear_soft_dirty(ptent); 759 ptent = pte_swp_clear_soft_dirty(ptent);
775 } else if (pte_file(ptent)) {
776 ptent = pte_file_clear_soft_dirty(ptent);
777 } 760 }
778 761
779 set_pte_at(vma->vm_mm, addr, pte, ptent); 762 set_pte_at(vma->vm_mm, addr, pte, ptent);
780#endif
781} 763}
782 764
765static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
766 unsigned long addr, pmd_t *pmdp)
767{
768 pmd_t pmd = *pmdp;
769
770 pmd = pmd_wrprotect(pmd);
771 pmd = pmd_clear_flags(pmd, _PAGE_SOFT_DIRTY);
772
773 if (vma->vm_flags & VM_SOFTDIRTY)
774 vma->vm_flags &= ~VM_SOFTDIRTY;
775
776 set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
777}
778
779#else
780
781static inline void clear_soft_dirty(struct vm_area_struct *vma,
782 unsigned long addr, pte_t *pte)
783{
784}
785
786static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
787 unsigned long addr, pmd_t *pmdp)
788{
789}
790#endif
791
783static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, 792static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
784 unsigned long end, struct mm_walk *walk) 793 unsigned long end, struct mm_walk *walk)
785{ 794{
786 struct clear_refs_private *cp = walk->private; 795 struct clear_refs_private *cp = walk->private;
787 struct vm_area_struct *vma = cp->vma; 796 struct vm_area_struct *vma = walk->vma;
788 pte_t *pte, ptent; 797 pte_t *pte, ptent;
789 spinlock_t *ptl; 798 spinlock_t *ptl;
790 struct page *page; 799 struct page *page;
791 800
792 split_huge_page_pmd(vma, addr, pmd); 801 if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
802 if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
803 clear_soft_dirty_pmd(vma, addr, pmd);
804 goto out;
805 }
806
807 page = pmd_page(*pmd);
808
809 /* Clear accessed and referenced bits. */
810 pmdp_test_and_clear_young(vma, addr, pmd);
811 ClearPageReferenced(page);
812out:
813 spin_unlock(ptl);
814 return 0;
815 }
816
793 if (pmd_trans_unstable(pmd)) 817 if (pmd_trans_unstable(pmd))
794 return 0; 818 return 0;
795 819
@@ -818,6 +842,28 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
818 return 0; 842 return 0;
819} 843}
820 844
845static int clear_refs_test_walk(unsigned long start, unsigned long end,
846 struct mm_walk *walk)
847{
848 struct clear_refs_private *cp = walk->private;
849 struct vm_area_struct *vma = walk->vma;
850
851 if (vma->vm_flags & VM_PFNMAP)
852 return 1;
853
854 /*
855 * Writing 1 to /proc/pid/clear_refs affects all pages.
856 * Writing 2 to /proc/pid/clear_refs only affects anonymous pages.
857 * Writing 3 to /proc/pid/clear_refs only affects file mapped pages.
858 * Writing 4 to /proc/pid/clear_refs affects all pages.
859 */
860 if (cp->type == CLEAR_REFS_ANON && vma->vm_file)
861 return 1;
862 if (cp->type == CLEAR_REFS_MAPPED && !vma->vm_file)
863 return 1;
864 return 0;
865}
866
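
clear_refs_test_walk() follows the walker's test_walk protocol: return non-zero to skip the current VMA, zero to descend into it. A toy walker with the same contract, using a region list in place of real VMAs:

/* Build: cc -o walker walker.c */
#include <stdio.h>

struct region { const char *name; int file_backed; };

static int test_walk(const struct region *r, int anon_only)
{
    if (anon_only && r->file_backed)
        return 1;   /* skip, like CLEAR_REFS_ANON on file VMAs */
    return 0;       /* walk this region */
}

static void walk(const struct region *r, int n, int anon_only)
{
    for (int i = 0; i < n; i++) {
        if (test_walk(&r[i], anon_only))
            continue;
        printf("visiting %s\n", r[i].name);
    }
}

int main(void)
{
    struct region regions[] = {
        { "heap",  0 },
        { "libc",  1 },
        { "stack", 0 },
    };

    walk(regions, 3, 1);    /* visits heap and stack only */
    return 0;
}
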
821static ssize_t clear_refs_write(struct file *file, const char __user *buf, 867static ssize_t clear_refs_write(struct file *file, const char __user *buf,
822 size_t count, loff_t *ppos) 868 size_t count, loff_t *ppos)
823{ 869{
@@ -858,9 +904,22 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
858 }; 904 };
859 struct mm_walk clear_refs_walk = { 905 struct mm_walk clear_refs_walk = {
860 .pmd_entry = clear_refs_pte_range, 906 .pmd_entry = clear_refs_pte_range,
907 .test_walk = clear_refs_test_walk,
861 .mm = mm, 908 .mm = mm,
862 .private = &cp, 909 .private = &cp,
863 }; 910 };
911
912 if (type == CLEAR_REFS_MM_HIWATER_RSS) {
913 /*
914 * Writing 5 to /proc/pid/clear_refs resets the peak
915 * resident set size to this mm's current rss value.
916 */
917 down_write(&mm->mmap_sem);
918 reset_mm_hiwater_rss(mm);
919 up_write(&mm->mmap_sem);
920 goto out_mm;
921 }
922
864 down_read(&mm->mmap_sem); 923 down_read(&mm->mmap_sem);
865 if (type == CLEAR_REFS_SOFT_DIRTY) { 924 if (type == CLEAR_REFS_SOFT_DIRTY) {
866 for (vma = mm->mmap; vma; vma = vma->vm_next) { 925 for (vma = mm->mmap; vma; vma = vma->vm_next) {
@@ -877,32 +936,12 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
877 } 936 }
878 mmu_notifier_invalidate_range_start(mm, 0, -1); 937 mmu_notifier_invalidate_range_start(mm, 0, -1);
879 } 938 }
880 for (vma = mm->mmap; vma; vma = vma->vm_next) { 939 walk_page_range(0, ~0UL, &clear_refs_walk);
881 cp.vma = vma;
882 if (is_vm_hugetlb_page(vma))
883 continue;
884 /*
885 * Writing 1 to /proc/pid/clear_refs affects all pages.
886 *
887 * Writing 2 to /proc/pid/clear_refs only affects
888 * Anonymous pages.
889 *
890 * Writing 3 to /proc/pid/clear_refs only affects file
891 * mapped pages.
892 *
893 * Writing 4 to /proc/pid/clear_refs affects all pages.
894 */
895 if (type == CLEAR_REFS_ANON && vma->vm_file)
896 continue;
897 if (type == CLEAR_REFS_MAPPED && !vma->vm_file)
898 continue;
899 walk_page_range(vma->vm_start, vma->vm_end,
900 &clear_refs_walk);
901 }
902 if (type == CLEAR_REFS_SOFT_DIRTY) 940 if (type == CLEAR_REFS_SOFT_DIRTY)
903 mmu_notifier_invalidate_range_end(mm, 0, -1); 941 mmu_notifier_invalidate_range_end(mm, 0, -1);
904 flush_tlb_mm(mm); 942 flush_tlb_mm(mm);
905 up_read(&mm->mmap_sem); 943 up_read(&mm->mmap_sem);
944out_mm:
906 mmput(mm); 945 mmput(mm);
907 } 946 }
908 put_task_struct(task); 947 put_task_struct(task);
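The CLEAR_REFS_MM_HIWATER_RSS branch takes mmap_sem for writing so the watermark reset cannot race with concurrent mappings. A user-space sketch of the new value, assuming a kernel with the extension shown above:

/* Reset the peak-RSS watermark (value 5); the effect is visible as
 * a lowered VmHWM line in /proc/self/status. */
#include <stdio.h>

static void reset_hiwater_rss(void)
{
	FILE *f = fopen("/proc/self/clear_refs", "w");

	if (!f)
		return;
	fputs("5", f);	/* resets mm->hiwater_rss to the current RSS */
	fclose(f);
}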
@@ -1066,15 +1105,13 @@ static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemap
1066static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, 1105static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
1067 struct mm_walk *walk) 1106 struct mm_walk *walk)
1068{ 1107{
1069 struct vm_area_struct *vma; 1108 struct vm_area_struct *vma = walk->vma;
1070 struct pagemapread *pm = walk->private; 1109 struct pagemapread *pm = walk->private;
1071 spinlock_t *ptl; 1110 spinlock_t *ptl;
1072 pte_t *pte; 1111 pte_t *pte, *orig_pte;
1073 int err = 0; 1112 int err = 0;
1074 1113
1075 /* find the first VMA at or above 'addr' */ 1114 if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
1076 vma = find_vma(walk->mm, addr);
1077 if (vma && pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
1078 int pmd_flags2; 1115 int pmd_flags2;
1079 1116
1080 if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(*pmd)) 1117 if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(*pmd))
@@ -1100,51 +1137,20 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
1100 if (pmd_trans_unstable(pmd)) 1137 if (pmd_trans_unstable(pmd))
1101 return 0; 1138 return 0;
1102 1139
1103 while (1) { 1140 /*
1104 /* End of address space hole, which we mark as non-present. */ 1141 * We can assume that @vma always points to a valid one and @end never
1105 unsigned long hole_end; 1142 * goes beyond vma->vm_end.
1106 1143 */
1107 if (vma) 1144 orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
1108 hole_end = min(end, vma->vm_start); 1145 for (; addr < end; pte++, addr += PAGE_SIZE) {
1109 else 1146 pagemap_entry_t pme;
1110 hole_end = end;
1111
1112 for (; addr < hole_end; addr += PAGE_SIZE) {
1113 pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2));
1114
1115 err = add_to_pagemap(addr, &pme, pm);
1116 if (err)
1117 return err;
1118 }
1119
1120 if (!vma || vma->vm_start >= end)
1121 break;
1122 /*
1123 * We can't possibly be in a hugetlb VMA. In general,
1124 * for a mm_walk with a pmd_entry and a hugetlb_entry,
1125 * the pmd_entry can only be called on addresses in a
1126 * hugetlb if the walk starts in a non-hugetlb VMA and
1127 * spans a hugepage VMA. Since pagemap_read walks are
1128 * PMD-sized and PMD-aligned, this will never be true.
1129 */
1130 BUG_ON(is_vm_hugetlb_page(vma));
1131
1132 /* Addresses in the VMA. */
1133 for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) {
1134 pagemap_entry_t pme;
1135 pte = pte_offset_map(pmd, addr);
1136 pte_to_pagemap_entry(&pme, pm, vma, addr, *pte);
1137 pte_unmap(pte);
1138 err = add_to_pagemap(addr, &pme, pm);
1139 if (err)
1140 return err;
1141 }
1142 1147
1143 if (addr == end) 1148 pte_to_pagemap_entry(&pme, pm, vma, addr, *pte);
1149 err = add_to_pagemap(addr, &pme, pm);
1150 if (err)
1144 break; 1151 break;
1145
1146 vma = find_vma(walk->mm, addr);
1147 } 1152 }
1153 pte_unmap_unlock(orig_pte, ptl);
1148 1154
1149 cond_resched(); 1155 cond_resched();
1150 1156
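With the per-VMA walk, pagemap_pte_range no longer has to synthesize "hole" entries itself, but the file format users see is unchanged: one little-endian 64-bit record per page. A user-space sketch of reading it:

/* Fetch the pagemap entry for one virtual address.  Each page has a
 * 64-bit record at offset (addr / PAGE_SIZE) * 8; bit 63 is
 * "present" and, on soft-dirty kernels, bit 55 is "soft dirty". */
#include <fcntl.h>
#include <stdint.h>
#include <unistd.h>

static int pagemap_entry(void *addr, uint64_t *ent)
{
	int fd = open("/proc/self/pagemap", O_RDONLY);
	off_t off = ((uintptr_t)addr / sysconf(_SC_PAGESIZE)) * 8;
	ssize_t n;

	if (fd < 0)
		return -1;
	n = pread(fd, ent, sizeof(*ent), off);
	close(fd);
	return n == (ssize_t)sizeof(*ent) ? 0 : -1;
}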
@@ -1170,15 +1176,12 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
1170 struct mm_walk *walk) 1176 struct mm_walk *walk)
1171{ 1177{
1172 struct pagemapread *pm = walk->private; 1178 struct pagemapread *pm = walk->private;
1173 struct vm_area_struct *vma; 1179 struct vm_area_struct *vma = walk->vma;
1174 int err = 0; 1180 int err = 0;
1175 int flags2; 1181 int flags2;
1176 pagemap_entry_t pme; 1182 pagemap_entry_t pme;
1177 1183
1178 vma = find_vma(walk->mm, addr); 1184 if (vma->vm_flags & VM_SOFTDIRTY)
1179 WARN_ON_ONCE(!vma);
1180
1181 if (vma && (vma->vm_flags & VM_SOFTDIRTY))
1182 flags2 = __PM_SOFT_DIRTY; 1185 flags2 = __PM_SOFT_DIRTY;
1183 else 1186 else
1184 flags2 = 0; 1187 flags2 = 0;
@@ -1338,7 +1341,6 @@ const struct file_operations proc_pagemap_operations = {
1338#ifdef CONFIG_NUMA 1341#ifdef CONFIG_NUMA
1339 1342
1340struct numa_maps { 1343struct numa_maps {
1341 struct vm_area_struct *vma;
1342 unsigned long pages; 1344 unsigned long pages;
1343 unsigned long anon; 1345 unsigned long anon;
1344 unsigned long active; 1346 unsigned long active;
@@ -1407,18 +1409,17 @@ static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma,
1407static int gather_pte_stats(pmd_t *pmd, unsigned long addr, 1409static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
1408 unsigned long end, struct mm_walk *walk) 1410 unsigned long end, struct mm_walk *walk)
1409{ 1411{
1410 struct numa_maps *md; 1412 struct numa_maps *md = walk->private;
1413 struct vm_area_struct *vma = walk->vma;
1411 spinlock_t *ptl; 1414 spinlock_t *ptl;
1412 pte_t *orig_pte; 1415 pte_t *orig_pte;
1413 pte_t *pte; 1416 pte_t *pte;
1414 1417
1415 md = walk->private; 1418 if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
1416
1417 if (pmd_trans_huge_lock(pmd, md->vma, &ptl) == 1) {
1418 pte_t huge_pte = *(pte_t *)pmd; 1419 pte_t huge_pte = *(pte_t *)pmd;
1419 struct page *page; 1420 struct page *page;
1420 1421
1421 page = can_gather_numa_stats(huge_pte, md->vma, addr); 1422 page = can_gather_numa_stats(huge_pte, vma, addr);
1422 if (page) 1423 if (page)
1423 gather_stats(page, md, pte_dirty(huge_pte), 1424 gather_stats(page, md, pte_dirty(huge_pte),
1424 HPAGE_PMD_SIZE/PAGE_SIZE); 1425 HPAGE_PMD_SIZE/PAGE_SIZE);
@@ -1430,7 +1431,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
1430 return 0; 1431 return 0;
1431 orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); 1432 orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
1432 do { 1433 do {
1433 struct page *page = can_gather_numa_stats(*pte, md->vma, addr); 1434 struct page *page = can_gather_numa_stats(*pte, vma, addr);
1434 if (!page) 1435 if (!page)
1435 continue; 1436 continue;
1436 gather_stats(page, md, pte_dirty(*pte), 1); 1437 gather_stats(page, md, pte_dirty(*pte), 1);
@@ -1440,7 +1441,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
1440 return 0; 1441 return 0;
1441} 1442}
1442#ifdef CONFIG_HUGETLB_PAGE 1443#ifdef CONFIG_HUGETLB_PAGE
1443static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask, 1444static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
1444 unsigned long addr, unsigned long end, struct mm_walk *walk) 1445 unsigned long addr, unsigned long end, struct mm_walk *walk)
1445{ 1446{
1446 struct numa_maps *md; 1447 struct numa_maps *md;
@@ -1459,7 +1460,7 @@ static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask,
1459} 1460}
1460 1461
1461#else 1462#else
1462static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask, 1463static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
1463 unsigned long addr, unsigned long end, struct mm_walk *walk) 1464 unsigned long addr, unsigned long end, struct mm_walk *walk)
1464{ 1465{
1465 return 0; 1466 return 0;
@@ -1477,7 +1478,12 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
1477 struct numa_maps *md = &numa_priv->md; 1478 struct numa_maps *md = &numa_priv->md;
1478 struct file *file = vma->vm_file; 1479 struct file *file = vma->vm_file;
1479 struct mm_struct *mm = vma->vm_mm; 1480 struct mm_struct *mm = vma->vm_mm;
1480 struct mm_walk walk = {}; 1481 struct mm_walk walk = {
1482 .hugetlb_entry = gather_hugetlb_stats,
1483 .pmd_entry = gather_pte_stats,
1484 .private = md,
1485 .mm = mm,
1486 };
1481 struct mempolicy *pol; 1487 struct mempolicy *pol;
1482 char buffer[64]; 1488 char buffer[64];
1483 int nid; 1489 int nid;
@@ -1488,13 +1494,6 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
1488 /* Ensure we start with an empty set of numa_maps statistics. */ 1494 /* Ensure we start with an empty set of numa_maps statistics. */
1489 memset(md, 0, sizeof(*md)); 1495 memset(md, 0, sizeof(*md));
1490 1496
1491 md->vma = vma;
1492
1493 walk.hugetlb_entry = gather_hugetbl_stats;
1494 walk.pmd_entry = gather_pte_stats;
1495 walk.private = md;
1496 walk.mm = mm;
1497
1498 pol = __get_vma_policy(vma, vma->vm_start); 1497 pol = __get_vma_policy(vma, vma->vm_start);
1499 if (pol) { 1498 if (pol) {
1500 mpol_to_str(buffer, sizeof(buffer), pol); 1499 mpol_to_str(buffer, sizeof(buffer), pol);
@@ -1528,7 +1527,8 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
1528 if (is_vm_hugetlb_page(vma)) 1527 if (is_vm_hugetlb_page(vma))
1529 seq_puts(m, " huge"); 1528 seq_puts(m, " huge");
1530 1529
1531 walk_page_range(vma->vm_start, vma->vm_end, &walk); 1530 /* mmap_sem is held by m_start */
1531 walk_page_vma(vma, &walk);
1532 1532
1533 if (!md->pages) 1533 if (!md->pages)
1534 goto out; 1534 goto out;
@@ -1557,6 +1557,8 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
1557 for_each_node_state(nid, N_MEMORY) 1557 for_each_node_state(nid, N_MEMORY)
1558 if (md->node[nid]) 1558 if (md->node[nid])
1559 seq_printf(m, " N%d=%lu", nid, md->node[nid]); 1559 seq_printf(m, " N%d=%lu", nid, md->node[nid]);
1560
1561 seq_printf(m, " kernelpagesize_kB=%lu", vma_kernel_pagesize(vma) >> 10);
1560out: 1562out:
1561 seq_putc(m, '\n'); 1563 seq_putc(m, '\n');
1562 m_cache_vma(m, vma); 1564 m_cache_vma(m, vma);
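The common thread in the task_mmu.c hunks is that walkers now take the VMA from walk->vma instead of smuggling it through walk->private or re-deriving it with find_vma(). A kernel-style sketch of the pattern, assuming the mm_walk API of this series (the function names are illustrative, not part of the patch):

static int example_pmd_entry(pmd_t *pmd, unsigned long addr,
			     unsigned long end, struct mm_walk *walk)
{
	struct vm_area_struct *vma = walk->vma;	/* set by the core walker */

	/* ... per-PMD work; no find_vma() and no private vma field ... */
	return 0;
}

static void example_walk(struct vm_area_struct *vma)
{
	struct mm_walk walk = {
		.pmd_entry = example_pmd_entry,
		.mm = vma->vm_mm,
	};

	/* caller holds mmap_sem, as the show_numa_map() comment notes */
	walk_page_vma(vma, &walk);
}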
diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c
index a90d6d354199..4e61388ec03d 100644
--- a/fs/proc/vmcore.c
+++ b/fs/proc/vmcore.c
@@ -546,8 +546,8 @@ static int __init update_note_header_size_elf64(const Elf64_Ehdr *ehdr_ptr)
546 nhdr_ptr = notes_section; 546 nhdr_ptr = notes_section;
547 while (nhdr_ptr->n_namesz != 0) { 547 while (nhdr_ptr->n_namesz != 0) {
548 sz = sizeof(Elf64_Nhdr) + 548 sz = sizeof(Elf64_Nhdr) +
549 ((nhdr_ptr->n_namesz + 3) & ~3) + 549 (((u64)nhdr_ptr->n_namesz + 3) & ~3) +
550 ((nhdr_ptr->n_descsz + 3) & ~3); 550 (((u64)nhdr_ptr->n_descsz + 3) & ~3);
551 if ((real_sz + sz) > max_sz) { 551 if ((real_sz + sz) > max_sz) {
552 pr_warn("Warning: Exceeded p_memsz, dropping PT_NOTE entry n_namesz=0x%x, n_descsz=0x%x\n", 552 pr_warn("Warning: Exceeded p_memsz, dropping PT_NOTE entry n_namesz=0x%x, n_descsz=0x%x\n",
553 nhdr_ptr->n_namesz, nhdr_ptr->n_descsz); 553 nhdr_ptr->n_namesz, nhdr_ptr->n_descsz);
@@ -732,8 +732,8 @@ static int __init update_note_header_size_elf32(const Elf32_Ehdr *ehdr_ptr)
732 nhdr_ptr = notes_section; 732 nhdr_ptr = notes_section;
733 while (nhdr_ptr->n_namesz != 0) { 733 while (nhdr_ptr->n_namesz != 0) {
734 sz = sizeof(Elf32_Nhdr) + 734 sz = sizeof(Elf32_Nhdr) +
735 ((nhdr_ptr->n_namesz + 3) & ~3) + 735 (((u64)nhdr_ptr->n_namesz + 3) & ~3) +
736 ((nhdr_ptr->n_descsz + 3) & ~3); 736 (((u64)nhdr_ptr->n_descsz + 3) & ~3);
737 if ((real_sz + sz) > max_sz) { 737 if ((real_sz + sz) > max_sz) {
738 pr_warn("Warning: Exceeded p_memsz, dropping PT_NOTE entry n_namesz=0x%x, n_descsz=0x%x\n", 738 pr_warn("Warning: Exceeded p_memsz, dropping PT_NOTE entry n_namesz=0x%x, n_descsz=0x%x\n",
739 nhdr_ptr->n_namesz, nhdr_ptr->n_descsz); 739 nhdr_ptr->n_namesz, nhdr_ptr->n_descsz);
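The (u64) casts matter because n_namesz and n_descsz are 32-bit fields read from an untrusted crash image: without widening, a value near UINT32_MAX makes (n + 3) & ~3 wrap to a tiny number and slip past the max_sz check. A small user-space demonstration of the wraparound:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t n_namesz = 0xfffffffd;
	uint32_t narrow = (n_namesz + 3) & ~3;		    /* wraps to 0 */
	uint64_t wide = ((uint64_t)n_namesz + 3) & ~3;	    /* 0x100000000 */

	printf("%u vs %llu\n", narrow, (unsigned long long)wide);
	return 0;
}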
diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
index 0f96f71ab32b..8db932da4009 100644
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -44,6 +44,7 @@ static int show_sb_opts(struct seq_file *m, struct super_block *sb)
44 { MS_SYNCHRONOUS, ",sync" }, 44 { MS_SYNCHRONOUS, ",sync" },
45 { MS_DIRSYNC, ",dirsync" }, 45 { MS_DIRSYNC, ",dirsync" },
46 { MS_MANDLOCK, ",mand" }, 46 { MS_MANDLOCK, ",mand" },
47 { MS_LAZYTIME, ",lazytime" },
47 { 0, NULL } 48 { 0, NULL }
48 }; 49 };
49 const struct proc_fs_info *fs_infop; 50 const struct proc_fs_info *fs_infop;
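Adding MS_LAZYTIME to the show_sb_opts table means a lazytime mount is now reported in /proc/mounts. A user-space sketch; MS_LAZYTIME may need a manual define on older userspace headers:

#include <sys/mount.h>

#ifndef MS_LAZYTIME
#define MS_LAZYTIME (1 << 25)
#endif

int mount_lazytime(const char *dev, const char *dir, const char *fstype)
{
	/* inode timestamps are kept in memory and written back lazily */
	return mount(dev, dir, fstype, MS_LAZYTIME, NULL);
}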
diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig
index 983d9510becc..916b8e23d968 100644
--- a/fs/pstore/Kconfig
+++ b/fs/pstore/Kconfig
@@ -21,6 +21,16 @@ config PSTORE_CONSOLE
21 When the option is enabled, pstore will log all kernel 21 When the option is enabled, pstore will log all kernel
22 messages, even if no oops or panic happened. 22 messages, even if no oops or panic happened.
23 23
24config PSTORE_PMSG
25 bool "Log user space messages"
26 depends on PSTORE
27 help
28 When the option is enabled, pstore will export a character
29 interface /dev/pmsg0 to log user space messages. On reboot
30 data can be retrieved from /sys/fs/pstore/pmsg-ramoops-[ID].
31
32 If unsure, say N.
33
24config PSTORE_FTRACE 34config PSTORE_FTRACE
25 bool "Persistent function tracer" 35 bool "Persistent function tracer"
26 depends on PSTORE 36 depends on PSTORE
diff --git a/fs/pstore/Makefile b/fs/pstore/Makefile
index 4c9095c2781e..e647d8e81712 100644
--- a/fs/pstore/Makefile
+++ b/fs/pstore/Makefile
@@ -7,5 +7,7 @@ obj-y += pstore.o
7pstore-objs += inode.o platform.o 7pstore-objs += inode.o platform.o
8obj-$(CONFIG_PSTORE_FTRACE) += ftrace.o 8obj-$(CONFIG_PSTORE_FTRACE) += ftrace.o
9 9
10obj-$(CONFIG_PSTORE_PMSG) += pmsg.o
11
10ramoops-objs += ram.o ram_core.o 12ramoops-objs += ram.o ram_core.o
11obj-$(CONFIG_PSTORE_RAM) += ramoops.o 13obj-$(CONFIG_PSTORE_RAM) += ramoops.o
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index 50416602774d..b32ce53d24ee 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -338,32 +338,38 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count,
338 338
339 switch (type) { 339 switch (type) {
340 case PSTORE_TYPE_DMESG: 340 case PSTORE_TYPE_DMESG:
341 sprintf(name, "dmesg-%s-%lld%s", psname, id, 341 scnprintf(name, sizeof(name), "dmesg-%s-%lld%s",
342 compressed ? ".enc.z" : ""); 342 psname, id, compressed ? ".enc.z" : "");
343 break; 343 break;
344 case PSTORE_TYPE_CONSOLE: 344 case PSTORE_TYPE_CONSOLE:
345 sprintf(name, "console-%s-%lld", psname, id); 345 scnprintf(name, sizeof(name), "console-%s-%lld", psname, id);
346 break; 346 break;
347 case PSTORE_TYPE_FTRACE: 347 case PSTORE_TYPE_FTRACE:
348 sprintf(name, "ftrace-%s-%lld", psname, id); 348 scnprintf(name, sizeof(name), "ftrace-%s-%lld", psname, id);
349 break; 349 break;
350 case PSTORE_TYPE_MCE: 350 case PSTORE_TYPE_MCE:
351 sprintf(name, "mce-%s-%lld", psname, id); 351 scnprintf(name, sizeof(name), "mce-%s-%lld", psname, id);
352 break; 352 break;
353 case PSTORE_TYPE_PPC_RTAS: 353 case PSTORE_TYPE_PPC_RTAS:
354 sprintf(name, "rtas-%s-%lld", psname, id); 354 scnprintf(name, sizeof(name), "rtas-%s-%lld", psname, id);
355 break; 355 break;
356 case PSTORE_TYPE_PPC_OF: 356 case PSTORE_TYPE_PPC_OF:
357 sprintf(name, "powerpc-ofw-%s-%lld", psname, id); 357 scnprintf(name, sizeof(name), "powerpc-ofw-%s-%lld",
358 psname, id);
358 break; 359 break;
359 case PSTORE_TYPE_PPC_COMMON: 360 case PSTORE_TYPE_PPC_COMMON:
360 sprintf(name, "powerpc-common-%s-%lld", psname, id); 361 scnprintf(name, sizeof(name), "powerpc-common-%s-%lld",
362 psname, id);
363 break;
364 case PSTORE_TYPE_PMSG:
365 scnprintf(name, sizeof(name), "pmsg-%s-%lld", psname, id);
361 break; 366 break;
362 case PSTORE_TYPE_UNKNOWN: 367 case PSTORE_TYPE_UNKNOWN:
363 sprintf(name, "unknown-%s-%lld", psname, id); 368 scnprintf(name, sizeof(name), "unknown-%s-%lld", psname, id);
364 break; 369 break;
365 default: 370 default:
366 sprintf(name, "type%d-%s-%lld", type, psname, id); 371 scnprintf(name, sizeof(name), "type%d-%s-%lld",
372 type, psname, id);
367 break; 373 break;
368 } 374 }
369 375
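The point of the sprintf-to-scnprintf conversion is bounds: sprintf can overrun the name buffer, and snprintf's return value can exceed the buffer size, while scnprintf caps the write and returns the bytes actually stored. A kernel-style sketch of the idiom; the parameters mirror pstore_mkfile() but the helper itself is illustrative:

#include <linux/kernel.h>	/* scnprintf() */
#include <linux/types.h>

static int format_record_name(char *name, size_t size, const char *psname,
			      long long id, bool compressed)
{
	/* scnprintf() never writes past @size and returns the number of
	 * characters actually stored (excluding the NUL), so the result
	 * is always safe to use as an append offset. */
	return scnprintf(name, size, "dmesg-%s-%lld%s",
			 psname, id, compressed ? ".enc.z" : "");
}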
diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h
index 3b3d305277c4..c36ba2cd0b5d 100644
--- a/fs/pstore/internal.h
+++ b/fs/pstore/internal.h
@@ -45,6 +45,12 @@ extern void pstore_register_ftrace(void);
45static inline void pstore_register_ftrace(void) {} 45static inline void pstore_register_ftrace(void) {}
46#endif 46#endif
47 47
48#ifdef CONFIG_PSTORE_PMSG
49extern void pstore_register_pmsg(void);
50#else
51static inline void pstore_register_pmsg(void) {}
52#endif
53
48extern struct pstore_info *psinfo; 54extern struct pstore_info *psinfo;
49 55
50extern void pstore_set_kmsg_bytes(int); 56extern void pstore_set_kmsg_bytes(int);
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index 0a9b72cdfeca..c4c9a10c5760 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -301,7 +301,7 @@ static void pstore_dump(struct kmsg_dumper *dumper,
301 301
302 if (big_oops_buf) { 302 if (big_oops_buf) {
303 dst = big_oops_buf; 303 dst = big_oops_buf;
304 hsize = sprintf(dst, "%s#%d Part%d\n", why, 304 hsize = sprintf(dst, "%s#%d Part%u\n", why,
305 oopscount, part); 305 oopscount, part);
306 size = big_oops_buf_sz - hsize; 306 size = big_oops_buf_sz - hsize;
307 307
@@ -321,7 +321,7 @@ static void pstore_dump(struct kmsg_dumper *dumper,
321 } 321 }
322 } else { 322 } else {
323 dst = psinfo->buf; 323 dst = psinfo->buf;
324 hsize = sprintf(dst, "%s#%d Part%d\n", why, oopscount, 324 hsize = sprintf(dst, "%s#%d Part%u\n", why, oopscount,
325 part); 325 part);
326 size = psinfo->bufsize - hsize; 326 size = psinfo->bufsize - hsize;
327 dst += hsize; 327 dst += hsize;
@@ -447,6 +447,7 @@ int pstore_register(struct pstore_info *psi)
447 if ((psi->flags & PSTORE_FLAGS_FRAGILE) == 0) { 447 if ((psi->flags & PSTORE_FLAGS_FRAGILE) == 0) {
448 pstore_register_console(); 448 pstore_register_console();
449 pstore_register_ftrace(); 449 pstore_register_ftrace();
450 pstore_register_pmsg();
450 } 451 }
451 452
452 if (pstore_update_ms >= 0) { 453 if (pstore_update_ms >= 0) {
diff --git a/fs/pstore/pmsg.c b/fs/pstore/pmsg.c
new file mode 100644
index 000000000000..feb5dd2948b4
--- /dev/null
+++ b/fs/pstore/pmsg.c
@@ -0,0 +1,114 @@
1/*
2 * Copyright 2014 Google, Inc.
3 *
4 * This software is licensed under the terms of the GNU General Public
5 * License version 2, as published by the Free Software Foundation, and
6 * may be copied, distributed, and modified under those terms.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details.
12 */
13
14#include <linux/cdev.h>
15#include <linux/device.h>
16#include <linux/fs.h>
17#include <linux/uaccess.h>
18#include <linux/vmalloc.h>
19#include "internal.h"
20
21static DEFINE_MUTEX(pmsg_lock);
22#define PMSG_MAX_BOUNCE_BUFFER_SIZE (2*PAGE_SIZE)
23
24static ssize_t write_pmsg(struct file *file, const char __user *buf,
25 size_t count, loff_t *ppos)
26{
27 size_t i, buffer_size;
28 char *buffer;
29
30 if (!count)
31 return 0;
32
33 if (!access_ok(VERIFY_READ, buf, count))
34 return -EFAULT;
35
36 buffer_size = count;
37 if (buffer_size > PMSG_MAX_BOUNCE_BUFFER_SIZE)
38 buffer_size = PMSG_MAX_BOUNCE_BUFFER_SIZE;
39 buffer = vmalloc(buffer_size);
40
41 mutex_lock(&pmsg_lock);
42 for (i = 0; i < count; ) {
43 size_t c = min(count - i, buffer_size);
44 u64 id;
45 long ret;
46
47 ret = __copy_from_user(buffer, buf + i, c);
48 if (unlikely(ret != 0)) {
49 mutex_unlock(&pmsg_lock);
50 vfree(buffer);
51 return -EFAULT;
52 }
53 psinfo->write_buf(PSTORE_TYPE_PMSG, 0, &id, 0, buffer, 0, c,
54 psinfo);
55
56 i += c;
57 }
58
59 mutex_unlock(&pmsg_lock);
60 vfree(buffer);
61 return count;
62}
63
64static const struct file_operations pmsg_fops = {
65 .owner = THIS_MODULE,
66 .llseek = noop_llseek,
67 .write = write_pmsg,
68};
69
70static struct class *pmsg_class;
71static int pmsg_major;
72#define PMSG_NAME "pmsg"
73#undef pr_fmt
74#define pr_fmt(fmt) PMSG_NAME ": " fmt
75
76static char *pmsg_devnode(struct device *dev, umode_t *mode)
77{
78 if (mode)
79 *mode = 0220;
80 return NULL;
81}
82
83void pstore_register_pmsg(void)
84{
85 struct device *pmsg_device;
86
87 pmsg_major = register_chrdev(0, PMSG_NAME, &pmsg_fops);
88 if (pmsg_major < 0) {
89 pr_err("register_chrdev failed\n");
90 goto err;
91 }
92
93 pmsg_class = class_create(THIS_MODULE, PMSG_NAME);
94 if (IS_ERR(pmsg_class)) {
95 pr_err("device class file already in use\n");
96 goto err_class;
97 }
98 pmsg_class->devnode = pmsg_devnode;
99
100 pmsg_device = device_create(pmsg_class, NULL, MKDEV(pmsg_major, 0),
101 NULL, "%s%d", PMSG_NAME, 0);
102 if (IS_ERR(pmsg_device)) {
103 pr_err("failed to create device\n");
104 goto err_device;
105 }
106 return;
107
108err_device:
109 class_destroy(pmsg_class);
110err_class:
111 unregister_chrdev(pmsg_major, PMSG_NAME);
112err:
113 return;
114}
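The new file registers a write-only (mode 0220) character device, so logging from user space is a plain write; with the ramoops backend below, the data reappears after reboot as /sys/fs/pstore/pmsg-ramoops-[ID]. A user-space sketch:

#include <fcntl.h>
#include <string.h>
#include <unistd.h>

int pmsg_log(const char *msg)
{
	int fd = open("/dev/pmsg0", O_WRONLY);	/* device is write-only */
	ssize_t n;

	if (fd < 0)
		return -1;
	n = write(fd, msg, strlen(msg));
	close(fd);
	return n < 0 ? -1 : 0;
}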
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c
index 8613e5b35c22..39d1373128e9 100644
--- a/fs/pstore/ram.c
+++ b/fs/pstore/ram.c
@@ -51,6 +51,10 @@ static ulong ramoops_ftrace_size = MIN_MEM_SIZE;
51module_param_named(ftrace_size, ramoops_ftrace_size, ulong, 0400); 51module_param_named(ftrace_size, ramoops_ftrace_size, ulong, 0400);
52MODULE_PARM_DESC(ftrace_size, "size of ftrace log"); 52MODULE_PARM_DESC(ftrace_size, "size of ftrace log");
53 53
54static ulong ramoops_pmsg_size = MIN_MEM_SIZE;
55module_param_named(pmsg_size, ramoops_pmsg_size, ulong, 0400);
56MODULE_PARM_DESC(pmsg_size, "size of user space message log");
57
54static ulong mem_address; 58static ulong mem_address;
55module_param(mem_address, ulong, 0400); 59module_param(mem_address, ulong, 0400);
56MODULE_PARM_DESC(mem_address, 60MODULE_PARM_DESC(mem_address,
@@ -82,12 +86,14 @@ struct ramoops_context {
82 struct persistent_ram_zone **przs; 86 struct persistent_ram_zone **przs;
83 struct persistent_ram_zone *cprz; 87 struct persistent_ram_zone *cprz;
84 struct persistent_ram_zone *fprz; 88 struct persistent_ram_zone *fprz;
89 struct persistent_ram_zone *mprz;
85 phys_addr_t phys_addr; 90 phys_addr_t phys_addr;
86 unsigned long size; 91 unsigned long size;
87 unsigned int memtype; 92 unsigned int memtype;
88 size_t record_size; 93 size_t record_size;
89 size_t console_size; 94 size_t console_size;
90 size_t ftrace_size; 95 size_t ftrace_size;
96 size_t pmsg_size;
91 int dump_oops; 97 int dump_oops;
92 struct persistent_ram_ecc_info ecc_info; 98 struct persistent_ram_ecc_info ecc_info;
93 unsigned int max_dump_cnt; 99 unsigned int max_dump_cnt;
@@ -96,6 +102,7 @@ struct ramoops_context {
96 unsigned int dump_read_cnt; 102 unsigned int dump_read_cnt;
97 unsigned int console_read_cnt; 103 unsigned int console_read_cnt;
98 unsigned int ftrace_read_cnt; 104 unsigned int ftrace_read_cnt;
105 unsigned int pmsg_read_cnt;
99 struct pstore_info pstore; 106 struct pstore_info pstore;
100}; 107};
101 108
@@ -109,6 +116,7 @@ static int ramoops_pstore_open(struct pstore_info *psi)
109 cxt->dump_read_cnt = 0; 116 cxt->dump_read_cnt = 0;
110 cxt->console_read_cnt = 0; 117 cxt->console_read_cnt = 0;
111 cxt->ftrace_read_cnt = 0; 118 cxt->ftrace_read_cnt = 0;
119 cxt->pmsg_read_cnt = 0;
112 return 0; 120 return 0;
113} 121}
114 122
@@ -164,6 +172,12 @@ static int ramoops_read_kmsg_hdr(char *buffer, struct timespec *time,
164 return header_length; 172 return header_length;
165} 173}
166 174
175static bool prz_ok(struct persistent_ram_zone *prz)
176{
177 return !!prz && !!(persistent_ram_old_size(prz) +
178 persistent_ram_ecc_string(prz, NULL, 0));
179}
180
167static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type, 181static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type,
168 int *count, struct timespec *time, 182 int *count, struct timespec *time,
169 char **buf, bool *compressed, 183 char **buf, bool *compressed,
@@ -178,13 +192,16 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type,
178 prz = ramoops_get_next_prz(cxt->przs, &cxt->dump_read_cnt, 192 prz = ramoops_get_next_prz(cxt->przs, &cxt->dump_read_cnt,
179 cxt->max_dump_cnt, id, type, 193 cxt->max_dump_cnt, id, type,
180 PSTORE_TYPE_DMESG, 1); 194 PSTORE_TYPE_DMESG, 1);
181 if (!prz) 195 if (!prz_ok(prz))
182 prz = ramoops_get_next_prz(&cxt->cprz, &cxt->console_read_cnt, 196 prz = ramoops_get_next_prz(&cxt->cprz, &cxt->console_read_cnt,
183 1, id, type, PSTORE_TYPE_CONSOLE, 0); 197 1, id, type, PSTORE_TYPE_CONSOLE, 0);
184 if (!prz) 198 if (!prz_ok(prz))
185 prz = ramoops_get_next_prz(&cxt->fprz, &cxt->ftrace_read_cnt, 199 prz = ramoops_get_next_prz(&cxt->fprz, &cxt->ftrace_read_cnt,
186 1, id, type, PSTORE_TYPE_FTRACE, 0); 200 1, id, type, PSTORE_TYPE_FTRACE, 0);
187 if (!prz) 201 if (!prz_ok(prz))
202 prz = ramoops_get_next_prz(&cxt->mprz, &cxt->pmsg_read_cnt,
203 1, id, type, PSTORE_TYPE_PMSG, 0);
204 if (!prz_ok(prz))
188 return 0; 205 return 0;
189 206
190 if (!persistent_ram_old(prz)) 207 if (!persistent_ram_old(prz))
@@ -252,6 +269,11 @@ static int notrace ramoops_pstore_write_buf(enum pstore_type_id type,
252 return -ENOMEM; 269 return -ENOMEM;
253 persistent_ram_write(cxt->fprz, buf, size); 270 persistent_ram_write(cxt->fprz, buf, size);
254 return 0; 271 return 0;
272 } else if (type == PSTORE_TYPE_PMSG) {
273 if (!cxt->mprz)
274 return -ENOMEM;
275 persistent_ram_write(cxt->mprz, buf, size);
276 return 0;
255 } 277 }
256 278
257 if (type != PSTORE_TYPE_DMESG) 279 if (type != PSTORE_TYPE_DMESG)
@@ -309,6 +331,9 @@ static int ramoops_pstore_erase(enum pstore_type_id type, u64 id, int count,
309 case PSTORE_TYPE_FTRACE: 331 case PSTORE_TYPE_FTRACE:
310 prz = cxt->fprz; 332 prz = cxt->fprz;
311 break; 333 break;
334 case PSTORE_TYPE_PMSG:
335 prz = cxt->mprz;
336 break;
312 default: 337 default:
313 return -EINVAL; 338 return -EINVAL;
314 } 339 }
@@ -435,7 +460,7 @@ static int ramoops_probe(struct platform_device *pdev)
435 goto fail_out; 460 goto fail_out;
436 461
437 if (!pdata->mem_size || (!pdata->record_size && !pdata->console_size && 462 if (!pdata->mem_size || (!pdata->record_size && !pdata->console_size &&
438 !pdata->ftrace_size)) { 463 !pdata->ftrace_size && !pdata->pmsg_size)) {
439 pr_err("The memory size and the record/console size must be " 464 pr_err("The memory size and the record/console size must be "
440 "non-zero\n"); 465 "non-zero\n");
441 goto fail_out; 466 goto fail_out;
@@ -447,6 +472,8 @@ static int ramoops_probe(struct platform_device *pdev)
447 pdata->console_size = rounddown_pow_of_two(pdata->console_size); 472 pdata->console_size = rounddown_pow_of_two(pdata->console_size);
448 if (pdata->ftrace_size && !is_power_of_2(pdata->ftrace_size)) 473 if (pdata->ftrace_size && !is_power_of_2(pdata->ftrace_size))
449 pdata->ftrace_size = rounddown_pow_of_two(pdata->ftrace_size); 474 pdata->ftrace_size = rounddown_pow_of_two(pdata->ftrace_size);
475 if (pdata->pmsg_size && !is_power_of_2(pdata->pmsg_size))
476 pdata->pmsg_size = rounddown_pow_of_two(pdata->pmsg_size);
450 477
451 cxt->size = pdata->mem_size; 478 cxt->size = pdata->mem_size;
452 cxt->phys_addr = pdata->mem_address; 479 cxt->phys_addr = pdata->mem_address;
@@ -454,12 +481,14 @@ static int ramoops_probe(struct platform_device *pdev)
454 cxt->record_size = pdata->record_size; 481 cxt->record_size = pdata->record_size;
455 cxt->console_size = pdata->console_size; 482 cxt->console_size = pdata->console_size;
456 cxt->ftrace_size = pdata->ftrace_size; 483 cxt->ftrace_size = pdata->ftrace_size;
484 cxt->pmsg_size = pdata->pmsg_size;
457 cxt->dump_oops = pdata->dump_oops; 485 cxt->dump_oops = pdata->dump_oops;
458 cxt->ecc_info = pdata->ecc_info; 486 cxt->ecc_info = pdata->ecc_info;
459 487
460 paddr = cxt->phys_addr; 488 paddr = cxt->phys_addr;
461 489
462 dump_mem_sz = cxt->size - cxt->console_size - cxt->ftrace_size; 490 dump_mem_sz = cxt->size - cxt->console_size - cxt->ftrace_size
491 - cxt->pmsg_size;
463 err = ramoops_init_przs(dev, cxt, &paddr, dump_mem_sz); 492 err = ramoops_init_przs(dev, cxt, &paddr, dump_mem_sz);
464 if (err) 493 if (err)
465 goto fail_out; 494 goto fail_out;
@@ -474,13 +503,9 @@ static int ramoops_probe(struct platform_device *pdev)
474 if (err) 503 if (err)
475 goto fail_init_fprz; 504 goto fail_init_fprz;
476 505
477 if (!cxt->przs && !cxt->cprz && !cxt->fprz) { 506 err = ramoops_init_prz(dev, cxt, &cxt->mprz, &paddr, cxt->pmsg_size, 0);
478 pr_err("memory size too small, minimum is %zu\n", 507 if (err)
479 cxt->console_size + cxt->record_size + 508 goto fail_init_mprz;
480 cxt->ftrace_size);
481 err = -EINVAL;
482 goto fail_cnt;
483 }
484 509
485 cxt->pstore.data = cxt; 510 cxt->pstore.data = cxt;
486 /* 511 /*
@@ -525,7 +550,8 @@ fail_buf:
525 kfree(cxt->pstore.buf); 550 kfree(cxt->pstore.buf);
526fail_clear: 551fail_clear:
527 cxt->pstore.bufsize = 0; 552 cxt->pstore.bufsize = 0;
528fail_cnt: 553 kfree(cxt->mprz);
554fail_init_mprz:
529 kfree(cxt->fprz); 555 kfree(cxt->fprz);
530fail_init_fprz: 556fail_init_fprz:
531 kfree(cxt->cprz); 557 kfree(cxt->cprz);
@@ -583,6 +609,7 @@ static void ramoops_register_dummy(void)
583 dummy_data->record_size = record_size; 609 dummy_data->record_size = record_size;
584 dummy_data->console_size = ramoops_console_size; 610 dummy_data->console_size = ramoops_console_size;
585 dummy_data->ftrace_size = ramoops_ftrace_size; 611 dummy_data->ftrace_size = ramoops_ftrace_size;
612 dummy_data->pmsg_size = ramoops_pmsg_size;
586 dummy_data->dump_oops = dump_oops; 613 dummy_data->dump_oops = dump_oops;
587 /* 614 /*
588 * For backwards compatibility ramoops.ecc=1 means 16 bytes ECC 615 * For backwards compatibility ramoops.ecc=1 means 16 bytes ECC
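Besides the ramoops.pmsg_size module parameter handled by the dummy device, board code can reserve pmsg space through platform data. A sketch under stated assumptions: the reserved address is hypothetical, and the sizes must be powers of two, as ramoops_probe() enforces above:

#include <linux/pstore_ram.h>
#include <linux/sizes.h>

static struct ramoops_platform_data ramoops_data = {
	.mem_size	= SZ_1M,
	.mem_address	= 0x8f000000,	/* hypothetical reserved RAM */
	.record_size	= SZ_128K,
	.console_size	= SZ_128K,
	.ftrace_size	= SZ_64K,
	.pmsg_size	= SZ_64K,	/* new: backs /dev/pmsg0 */
	.dump_oops	= 1,
};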
diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig
index c51df1dd237e..4a09975aac90 100644
--- a/fs/quota/Kconfig
+++ b/fs/quota/Kconfig
@@ -5,6 +5,7 @@
5config QUOTA 5config QUOTA
6 bool "Quota support" 6 bool "Quota support"
7 select QUOTACTL 7 select QUOTACTL
8 select SRCU
8 help 9 help
9 If you say Y here, you will be able to set per user limits for disk 10 If you say Y here, you will be able to set per user limits for disk
10 usage (also called disk quotas). Currently, it works for the 11 usage (also called disk quotas). Currently, it works for the
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 8f0acef3d184..0ccd4ba3a246 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -1248,7 +1248,7 @@ static int ignore_hardlimit(struct dquot *dquot)
1248 1248
1249 return capable(CAP_SYS_RESOURCE) && 1249 return capable(CAP_SYS_RESOURCE) &&
1250 (info->dqi_format->qf_fmt_id != QFMT_VFS_OLD || 1250 (info->dqi_format->qf_fmt_id != QFMT_VFS_OLD ||
1251 !(info->dqi_flags & V1_DQF_RSQUASH)); 1251 !(info->dqi_flags & DQF_ROOT_SQUASH));
1252} 1252}
1253 1253
1254/* needs dq_data_lock */ 1254/* needs dq_data_lock */
@@ -2385,41 +2385,106 @@ out:
2385} 2385}
2386EXPORT_SYMBOL(dquot_quota_on_mount); 2386EXPORT_SYMBOL(dquot_quota_on_mount);
2387 2387
2388static inline qsize_t qbtos(qsize_t blocks) 2388static int dquot_quota_enable(struct super_block *sb, unsigned int flags)
2389{ 2389{
2390 return blocks << QIF_DQBLKSIZE_BITS; 2390 int ret;
2391 int type;
2392 struct quota_info *dqopt = sb_dqopt(sb);
2393
2394 if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE))
2395 return -ENOSYS;
2396 /* Accounting cannot be turned on while fs is mounted */
2397 flags &= ~(FS_QUOTA_UDQ_ACCT | FS_QUOTA_GDQ_ACCT | FS_QUOTA_PDQ_ACCT);
2398 if (!flags)
2399 return -EINVAL;
2400 for (type = 0; type < MAXQUOTAS; type++) {
2401 if (!(flags & qtype_enforce_flag(type)))
2402 continue;
2403 /* Can't enforce without accounting */
2404 if (!sb_has_quota_usage_enabled(sb, type))
2405 return -EINVAL;
2406 ret = dquot_enable(dqopt->files[type], type,
2407 dqopt->info[type].dqi_fmt_id,
2408 DQUOT_LIMITS_ENABLED);
2409 if (ret < 0)
2410 goto out_err;
2411 }
2412 return 0;
2413out_err:
2414 /* Backout enforcement enablement we already did */
2415 for (type--; type >= 0; type--) {
2416 if (flags & qtype_enforce_flag(type))
2417 dquot_disable(sb, type, DQUOT_LIMITS_ENABLED);
2418 }
2419 /* Error code translation for better compatibility with XFS */
2420 if (ret == -EBUSY)
2421 ret = -EEXIST;
2422 return ret;
2391} 2423}
2392 2424
2393static inline qsize_t stoqb(qsize_t space) 2425static int dquot_quota_disable(struct super_block *sb, unsigned int flags)
2394{ 2426{
2395 return (space + QIF_DQBLKSIZE - 1) >> QIF_DQBLKSIZE_BITS; 2427 int ret;
2428 int type;
2429 struct quota_info *dqopt = sb_dqopt(sb);
2430
2431 if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE))
2432 return -ENOSYS;
2433 /*
2434 * We don't support turning off accounting via quotactl. In principle
2435 * quota infrastructure can do this but filesystems don't expect
2436 * userspace to be able to do it.
2437 */
2438 if (flags &
2439 (FS_QUOTA_UDQ_ACCT | FS_QUOTA_GDQ_ACCT | FS_QUOTA_PDQ_ACCT))
2440 return -EOPNOTSUPP;
2441
2442 /* Filter out limits not enabled */
2443 for (type = 0; type < MAXQUOTAS; type++)
2444 if (!sb_has_quota_limits_enabled(sb, type))
2445 flags &= ~qtype_enforce_flag(type);
2446 /* Nothing left? */
2447 if (!flags)
2448 return -EEXIST;
2449 for (type = 0; type < MAXQUOTAS; type++) {
2450 if (flags & qtype_enforce_flag(type)) {
2451 ret = dquot_disable(sb, type, DQUOT_LIMITS_ENABLED);
2452 if (ret < 0)
2453 goto out_err;
2454 }
2455 }
2456 return 0;
2457out_err:
2458 /* Backout enforcement disabling we already did */
2459 for (type--; type >= 0; type--) {
2460 if (flags & qtype_enforce_flag(type))
2461 dquot_enable(dqopt->files[type], type,
2462 dqopt->info[type].dqi_fmt_id,
2463 DQUOT_LIMITS_ENABLED);
2464 }
2465 return ret;
2396} 2466}
2397 2467
2398/* Generic routine for getting common part of quota structure */ 2468/* Generic routine for getting common part of quota structure */
2399static void do_get_dqblk(struct dquot *dquot, struct fs_disk_quota *di) 2469static void do_get_dqblk(struct dquot *dquot, struct qc_dqblk *di)
2400{ 2470{
2401 struct mem_dqblk *dm = &dquot->dq_dqb; 2471 struct mem_dqblk *dm = &dquot->dq_dqb;
2402 2472
2403 memset(di, 0, sizeof(*di)); 2473 memset(di, 0, sizeof(*di));
2404 di->d_version = FS_DQUOT_VERSION;
2405 di->d_flags = dquot->dq_id.type == USRQUOTA ?
2406 FS_USER_QUOTA : FS_GROUP_QUOTA;
2407 di->d_id = from_kqid_munged(current_user_ns(), dquot->dq_id);
2408
2409 spin_lock(&dq_data_lock); 2474 spin_lock(&dq_data_lock);
2410 di->d_blk_hardlimit = stoqb(dm->dqb_bhardlimit); 2475 di->d_spc_hardlimit = dm->dqb_bhardlimit;
2411 di->d_blk_softlimit = stoqb(dm->dqb_bsoftlimit); 2476 di->d_spc_softlimit = dm->dqb_bsoftlimit;
2412 di->d_ino_hardlimit = dm->dqb_ihardlimit; 2477 di->d_ino_hardlimit = dm->dqb_ihardlimit;
2413 di->d_ino_softlimit = dm->dqb_isoftlimit; 2478 di->d_ino_softlimit = dm->dqb_isoftlimit;
2414 di->d_bcount = dm->dqb_curspace + dm->dqb_rsvspace; 2479 di->d_space = dm->dqb_curspace + dm->dqb_rsvspace;
2415 di->d_icount = dm->dqb_curinodes; 2480 di->d_ino_count = dm->dqb_curinodes;
2416 di->d_btimer = dm->dqb_btime; 2481 di->d_spc_timer = dm->dqb_btime;
2417 di->d_itimer = dm->dqb_itime; 2482 di->d_ino_timer = dm->dqb_itime;
2418 spin_unlock(&dq_data_lock); 2483 spin_unlock(&dq_data_lock);
2419} 2484}
2420 2485
2421int dquot_get_dqblk(struct super_block *sb, struct kqid qid, 2486int dquot_get_dqblk(struct super_block *sb, struct kqid qid,
2422 struct fs_disk_quota *di) 2487 struct qc_dqblk *di)
2423{ 2488{
2424 struct dquot *dquot; 2489 struct dquot *dquot;
2425 2490
@@ -2433,70 +2498,70 @@ int dquot_get_dqblk(struct super_block *sb, struct kqid qid,
2433} 2498}
2434EXPORT_SYMBOL(dquot_get_dqblk); 2499EXPORT_SYMBOL(dquot_get_dqblk);
2435 2500
2436#define VFS_FS_DQ_MASK \ 2501#define VFS_QC_MASK \
2437 (FS_DQ_BCOUNT | FS_DQ_BSOFT | FS_DQ_BHARD | \ 2502 (QC_SPACE | QC_SPC_SOFT | QC_SPC_HARD | \
2438 FS_DQ_ICOUNT | FS_DQ_ISOFT | FS_DQ_IHARD | \ 2503 QC_INO_COUNT | QC_INO_SOFT | QC_INO_HARD | \
2439 FS_DQ_BTIMER | FS_DQ_ITIMER) 2504 QC_SPC_TIMER | QC_INO_TIMER)
2440 2505
2441/* Generic routine for setting common part of quota structure */ 2506/* Generic routine for setting common part of quota structure */
2442static int do_set_dqblk(struct dquot *dquot, struct fs_disk_quota *di) 2507static int do_set_dqblk(struct dquot *dquot, struct qc_dqblk *di)
2443{ 2508{
2444 struct mem_dqblk *dm = &dquot->dq_dqb; 2509 struct mem_dqblk *dm = &dquot->dq_dqb;
2445 int check_blim = 0, check_ilim = 0; 2510 int check_blim = 0, check_ilim = 0;
2446 struct mem_dqinfo *dqi = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type]; 2511 struct mem_dqinfo *dqi = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_id.type];
2447 2512
2448 if (di->d_fieldmask & ~VFS_FS_DQ_MASK) 2513 if (di->d_fieldmask & ~VFS_QC_MASK)
2449 return -EINVAL; 2514 return -EINVAL;
2450 2515
2451 if (((di->d_fieldmask & FS_DQ_BSOFT) && 2516 if (((di->d_fieldmask & QC_SPC_SOFT) &&
2452 (di->d_blk_softlimit > dqi->dqi_maxblimit)) || 2517 di->d_spc_softlimit > dqi->dqi_max_spc_limit) ||
2453 ((di->d_fieldmask & FS_DQ_BHARD) && 2518 ((di->d_fieldmask & QC_SPC_HARD) &&
2454 (di->d_blk_hardlimit > dqi->dqi_maxblimit)) || 2519 di->d_spc_hardlimit > dqi->dqi_max_spc_limit) ||
2455 ((di->d_fieldmask & FS_DQ_ISOFT) && 2520 ((di->d_fieldmask & QC_INO_SOFT) &&
2456 (di->d_ino_softlimit > dqi->dqi_maxilimit)) || 2521 (di->d_ino_softlimit > dqi->dqi_max_ino_limit)) ||
2457 ((di->d_fieldmask & FS_DQ_IHARD) && 2522 ((di->d_fieldmask & QC_INO_HARD) &&
2458 (di->d_ino_hardlimit > dqi->dqi_maxilimit))) 2523 (di->d_ino_hardlimit > dqi->dqi_max_ino_limit)))
2459 return -ERANGE; 2524 return -ERANGE;
2460 2525
2461 spin_lock(&dq_data_lock); 2526 spin_lock(&dq_data_lock);
2462 if (di->d_fieldmask & FS_DQ_BCOUNT) { 2527 if (di->d_fieldmask & QC_SPACE) {
2463 dm->dqb_curspace = di->d_bcount - dm->dqb_rsvspace; 2528 dm->dqb_curspace = di->d_space - dm->dqb_rsvspace;
2464 check_blim = 1; 2529 check_blim = 1;
2465 set_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags); 2530 set_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags);
2466 } 2531 }
2467 2532
2468 if (di->d_fieldmask & FS_DQ_BSOFT) 2533 if (di->d_fieldmask & QC_SPC_SOFT)
2469 dm->dqb_bsoftlimit = qbtos(di->d_blk_softlimit); 2534 dm->dqb_bsoftlimit = di->d_spc_softlimit;
2470 if (di->d_fieldmask & FS_DQ_BHARD) 2535 if (di->d_fieldmask & QC_SPC_HARD)
2471 dm->dqb_bhardlimit = qbtos(di->d_blk_hardlimit); 2536 dm->dqb_bhardlimit = di->d_spc_hardlimit;
2472 if (di->d_fieldmask & (FS_DQ_BSOFT | FS_DQ_BHARD)) { 2537 if (di->d_fieldmask & (QC_SPC_SOFT | QC_SPC_HARD)) {
2473 check_blim = 1; 2538 check_blim = 1;
2474 set_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags); 2539 set_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags);
2475 } 2540 }
2476 2541
2477 if (di->d_fieldmask & FS_DQ_ICOUNT) { 2542 if (di->d_fieldmask & QC_INO_COUNT) {
2478 dm->dqb_curinodes = di->d_icount; 2543 dm->dqb_curinodes = di->d_ino_count;
2479 check_ilim = 1; 2544 check_ilim = 1;
2480 set_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags); 2545 set_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags);
2481 } 2546 }
2482 2547
2483 if (di->d_fieldmask & FS_DQ_ISOFT) 2548 if (di->d_fieldmask & QC_INO_SOFT)
2484 dm->dqb_isoftlimit = di->d_ino_softlimit; 2549 dm->dqb_isoftlimit = di->d_ino_softlimit;
2485 if (di->d_fieldmask & FS_DQ_IHARD) 2550 if (di->d_fieldmask & QC_INO_HARD)
2486 dm->dqb_ihardlimit = di->d_ino_hardlimit; 2551 dm->dqb_ihardlimit = di->d_ino_hardlimit;
2487 if (di->d_fieldmask & (FS_DQ_ISOFT | FS_DQ_IHARD)) { 2552 if (di->d_fieldmask & (QC_INO_SOFT | QC_INO_HARD)) {
2488 check_ilim = 1; 2553 check_ilim = 1;
2489 set_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags); 2554 set_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags);
2490 } 2555 }
2491 2556
2492 if (di->d_fieldmask & FS_DQ_BTIMER) { 2557 if (di->d_fieldmask & QC_SPC_TIMER) {
2493 dm->dqb_btime = di->d_btimer; 2558 dm->dqb_btime = di->d_spc_timer;
2494 check_blim = 1; 2559 check_blim = 1;
2495 set_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags); 2560 set_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags);
2496 } 2561 }
2497 2562
2498 if (di->d_fieldmask & FS_DQ_ITIMER) { 2563 if (di->d_fieldmask & QC_INO_TIMER) {
2499 dm->dqb_itime = di->d_itimer; 2564 dm->dqb_itime = di->d_ino_timer;
2500 check_ilim = 1; 2565 check_ilim = 1;
2501 set_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags); 2566 set_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags);
2502 } 2567 }
@@ -2506,7 +2571,7 @@ static int do_set_dqblk(struct dquot *dquot, struct fs_disk_quota *di)
2506 dm->dqb_curspace < dm->dqb_bsoftlimit) { 2571 dm->dqb_curspace < dm->dqb_bsoftlimit) {
2507 dm->dqb_btime = 0; 2572 dm->dqb_btime = 0;
2508 clear_bit(DQ_BLKS_B, &dquot->dq_flags); 2573 clear_bit(DQ_BLKS_B, &dquot->dq_flags);
2509 } else if (!(di->d_fieldmask & FS_DQ_BTIMER)) 2574 } else if (!(di->d_fieldmask & QC_SPC_TIMER))
2510 /* Set grace only if user hasn't provided his own... */ 2575 /* Set grace only if user hasn't provided his own... */
2511 dm->dqb_btime = get_seconds() + dqi->dqi_bgrace; 2576 dm->dqb_btime = get_seconds() + dqi->dqi_bgrace;
2512 } 2577 }
@@ -2515,7 +2580,7 @@ static int do_set_dqblk(struct dquot *dquot, struct fs_disk_quota *di)
2515 dm->dqb_curinodes < dm->dqb_isoftlimit) { 2580 dm->dqb_curinodes < dm->dqb_isoftlimit) {
2516 dm->dqb_itime = 0; 2581 dm->dqb_itime = 0;
2517 clear_bit(DQ_INODES_B, &dquot->dq_flags); 2582 clear_bit(DQ_INODES_B, &dquot->dq_flags);
2518 } else if (!(di->d_fieldmask & FS_DQ_ITIMER)) 2583 } else if (!(di->d_fieldmask & QC_INO_TIMER))
2519 /* Set grace only if user hasn't provided his own... */ 2584 /* Set grace only if user hasn't provided his own... */
2520 dm->dqb_itime = get_seconds() + dqi->dqi_igrace; 2585 dm->dqb_itime = get_seconds() + dqi->dqi_igrace;
2521 } 2586 }
@@ -2531,7 +2596,7 @@ static int do_set_dqblk(struct dquot *dquot, struct fs_disk_quota *di)
2531} 2596}
2532 2597
2533int dquot_set_dqblk(struct super_block *sb, struct kqid qid, 2598int dquot_set_dqblk(struct super_block *sb, struct kqid qid,
2534 struct fs_disk_quota *di) 2599 struct qc_dqblk *di)
2535{ 2600{
2536 struct dquot *dquot; 2601 struct dquot *dquot;
2537 int rc; 2602 int rc;
@@ -2582,6 +2647,14 @@ int dquot_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
2582 goto out; 2647 goto out;
2583 } 2648 }
2584 mi = sb_dqopt(sb)->info + type; 2649 mi = sb_dqopt(sb)->info + type;
2650 if (ii->dqi_valid & IIF_FLAGS) {
2651 if (ii->dqi_flags & ~DQF_SETINFO_MASK ||
2652 (ii->dqi_flags & DQF_ROOT_SQUASH &&
2653 mi->dqi_format->qf_fmt_id != QFMT_VFS_OLD)) {
2654 err = -EINVAL;
2655 goto out;
2656 }
2657 }
2585 spin_lock(&dq_data_lock); 2658 spin_lock(&dq_data_lock);
2586 if (ii->dqi_valid & IIF_BGRACE) 2659 if (ii->dqi_valid & IIF_BGRACE)
2587 mi->dqi_bgrace = ii->dqi_bgrace; 2660 mi->dqi_bgrace = ii->dqi_bgrace;
@@ -2611,6 +2684,17 @@ const struct quotactl_ops dquot_quotactl_ops = {
2611}; 2684};
2612EXPORT_SYMBOL(dquot_quotactl_ops); 2685EXPORT_SYMBOL(dquot_quotactl_ops);
2613 2686
2687const struct quotactl_ops dquot_quotactl_sysfile_ops = {
2688 .quota_enable = dquot_quota_enable,
2689 .quota_disable = dquot_quota_disable,
2690 .quota_sync = dquot_quota_sync,
2691 .get_info = dquot_get_dqinfo,
2692 .set_info = dquot_set_dqinfo,
2693 .get_dqblk = dquot_get_dqblk,
2694 .set_dqblk = dquot_set_dqblk
2695};
2696EXPORT_SYMBOL(dquot_quotactl_sysfile_ops);
2697
2614static int do_proc_dqstats(struct ctl_table *table, int write, 2698static int do_proc_dqstats(struct ctl_table *table, int write,
2615 void __user *buffer, size_t *lenp, loff_t *ppos) 2699 void __user *buffer, size_t *lenp, loff_t *ppos)
2616{ 2700{
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index 2aa4151f99d2..d14a799c7785 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -66,18 +66,40 @@ static int quota_sync_all(int type)
66 return ret; 66 return ret;
67} 67}
68 68
69unsigned int qtype_enforce_flag(int type)
70{
71 switch (type) {
72 case USRQUOTA:
73 return FS_QUOTA_UDQ_ENFD;
74 case GRPQUOTA:
75 return FS_QUOTA_GDQ_ENFD;
76 case PRJQUOTA:
77 return FS_QUOTA_PDQ_ENFD;
78 }
79 return 0;
80}
81
69static int quota_quotaon(struct super_block *sb, int type, int cmd, qid_t id, 82static int quota_quotaon(struct super_block *sb, int type, int cmd, qid_t id,
70 struct path *path) 83 struct path *path)
71{ 84{
72 if (!sb->s_qcop->quota_on && !sb->s_qcop->quota_on_meta) 85 if (!sb->s_qcop->quota_on && !sb->s_qcop->quota_enable)
73 return -ENOSYS; 86 return -ENOSYS;
74 if (sb->s_qcop->quota_on_meta) 87 if (sb->s_qcop->quota_enable)
75 return sb->s_qcop->quota_on_meta(sb, type, id); 88 return sb->s_qcop->quota_enable(sb, qtype_enforce_flag(type));
76 if (IS_ERR(path)) 89 if (IS_ERR(path))
77 return PTR_ERR(path); 90 return PTR_ERR(path);
78 return sb->s_qcop->quota_on(sb, type, id, path); 91 return sb->s_qcop->quota_on(sb, type, id, path);
79} 92}
80 93
94static int quota_quotaoff(struct super_block *sb, int type)
95{
96 if (!sb->s_qcop->quota_off && !sb->s_qcop->quota_disable)
97 return -ENOSYS;
98 if (sb->s_qcop->quota_disable)
99 return sb->s_qcop->quota_disable(sb, qtype_enforce_flag(type));
100 return sb->s_qcop->quota_off(sb, type);
101}
102
81static int quota_getfmt(struct super_block *sb, int type, void __user *addr) 103static int quota_getfmt(struct super_block *sb, int type, void __user *addr)
82{ 104{
83 __u32 fmt; 105 __u32 fmt;
@@ -118,17 +140,27 @@ static int quota_setinfo(struct super_block *sb, int type, void __user *addr)
118 return sb->s_qcop->set_info(sb, type, &info); 140 return sb->s_qcop->set_info(sb, type, &info);
119} 141}
120 142
121static void copy_to_if_dqblk(struct if_dqblk *dst, struct fs_disk_quota *src) 143static inline qsize_t qbtos(qsize_t blocks)
144{
145 return blocks << QIF_DQBLKSIZE_BITS;
146}
147
148static inline qsize_t stoqb(qsize_t space)
149{
150 return (space + QIF_DQBLKSIZE - 1) >> QIF_DQBLKSIZE_BITS;
151}
152
153static void copy_to_if_dqblk(struct if_dqblk *dst, struct qc_dqblk *src)
122{ 154{
123 memset(dst, 0, sizeof(*dst)); 155 memset(dst, 0, sizeof(*dst));
124 dst->dqb_bhardlimit = src->d_blk_hardlimit; 156 dst->dqb_bhardlimit = stoqb(src->d_spc_hardlimit);
125 dst->dqb_bsoftlimit = src->d_blk_softlimit; 157 dst->dqb_bsoftlimit = stoqb(src->d_spc_softlimit);
126 dst->dqb_curspace = src->d_bcount; 158 dst->dqb_curspace = src->d_space;
127 dst->dqb_ihardlimit = src->d_ino_hardlimit; 159 dst->dqb_ihardlimit = src->d_ino_hardlimit;
128 dst->dqb_isoftlimit = src->d_ino_softlimit; 160 dst->dqb_isoftlimit = src->d_ino_softlimit;
129 dst->dqb_curinodes = src->d_icount; 161 dst->dqb_curinodes = src->d_ino_count;
130 dst->dqb_btime = src->d_btimer; 162 dst->dqb_btime = src->d_spc_timer;
131 dst->dqb_itime = src->d_itimer; 163 dst->dqb_itime = src->d_ino_timer;
132 dst->dqb_valid = QIF_ALL; 164 dst->dqb_valid = QIF_ALL;
133} 165}
134 166
@@ -136,7 +168,7 @@ static int quota_getquota(struct super_block *sb, int type, qid_t id,
136 void __user *addr) 168 void __user *addr)
137{ 169{
138 struct kqid qid; 170 struct kqid qid;
139 struct fs_disk_quota fdq; 171 struct qc_dqblk fdq;
140 struct if_dqblk idq; 172 struct if_dqblk idq;
141 int ret; 173 int ret;
142 174
@@ -154,36 +186,36 @@ static int quota_getquota(struct super_block *sb, int type, qid_t id,
154 return 0; 186 return 0;
155} 187}
156 188
157static void copy_from_if_dqblk(struct fs_disk_quota *dst, struct if_dqblk *src) 189static void copy_from_if_dqblk(struct qc_dqblk *dst, struct if_dqblk *src)
158{ 190{
159 dst->d_blk_hardlimit = src->dqb_bhardlimit; 191 dst->d_spc_hardlimit = qbtos(src->dqb_bhardlimit);
160 dst->d_blk_softlimit = src->dqb_bsoftlimit; 192 dst->d_spc_softlimit = qbtos(src->dqb_bsoftlimit);
161 dst->d_bcount = src->dqb_curspace; 193 dst->d_space = src->dqb_curspace;
162 dst->d_ino_hardlimit = src->dqb_ihardlimit; 194 dst->d_ino_hardlimit = src->dqb_ihardlimit;
163 dst->d_ino_softlimit = src->dqb_isoftlimit; 195 dst->d_ino_softlimit = src->dqb_isoftlimit;
164 dst->d_icount = src->dqb_curinodes; 196 dst->d_ino_count = src->dqb_curinodes;
165 dst->d_btimer = src->dqb_btime; 197 dst->d_spc_timer = src->dqb_btime;
166 dst->d_itimer = src->dqb_itime; 198 dst->d_ino_timer = src->dqb_itime;
167 199
168 dst->d_fieldmask = 0; 200 dst->d_fieldmask = 0;
169 if (src->dqb_valid & QIF_BLIMITS) 201 if (src->dqb_valid & QIF_BLIMITS)
170 dst->d_fieldmask |= FS_DQ_BSOFT | FS_DQ_BHARD; 202 dst->d_fieldmask |= QC_SPC_SOFT | QC_SPC_HARD;
171 if (src->dqb_valid & QIF_SPACE) 203 if (src->dqb_valid & QIF_SPACE)
172 dst->d_fieldmask |= FS_DQ_BCOUNT; 204 dst->d_fieldmask |= QC_SPACE;
173 if (src->dqb_valid & QIF_ILIMITS) 205 if (src->dqb_valid & QIF_ILIMITS)
174 dst->d_fieldmask |= FS_DQ_ISOFT | FS_DQ_IHARD; 206 dst->d_fieldmask |= QC_INO_SOFT | QC_INO_HARD;
175 if (src->dqb_valid & QIF_INODES) 207 if (src->dqb_valid & QIF_INODES)
176 dst->d_fieldmask |= FS_DQ_ICOUNT; 208 dst->d_fieldmask |= QC_INO_COUNT;
177 if (src->dqb_valid & QIF_BTIME) 209 if (src->dqb_valid & QIF_BTIME)
178 dst->d_fieldmask |= FS_DQ_BTIMER; 210 dst->d_fieldmask |= QC_SPC_TIMER;
179 if (src->dqb_valid & QIF_ITIME) 211 if (src->dqb_valid & QIF_ITIME)
180 dst->d_fieldmask |= FS_DQ_ITIMER; 212 dst->d_fieldmask |= QC_INO_TIMER;
181} 213}
182 214
183static int quota_setquota(struct super_block *sb, int type, qid_t id, 215static int quota_setquota(struct super_block *sb, int type, qid_t id,
184 void __user *addr) 216 void __user *addr)
185{ 217{
186 struct fs_disk_quota fdq; 218 struct qc_dqblk fdq;
187 struct if_dqblk idq; 219 struct if_dqblk idq;
188 struct kqid qid; 220 struct kqid qid;
189 221
@@ -198,15 +230,26 @@ static int quota_setquota(struct super_block *sb, int type, qid_t id,
198 return sb->s_qcop->set_dqblk(sb, qid, &fdq); 230 return sb->s_qcop->set_dqblk(sb, qid, &fdq);
199} 231}
200 232
201static int quota_setxstate(struct super_block *sb, int cmd, void __user *addr) 233static int quota_enable(struct super_block *sb, void __user *addr)
234{
235 __u32 flags;
236
237 if (copy_from_user(&flags, addr, sizeof(flags)))
238 return -EFAULT;
239 if (!sb->s_qcop->quota_enable)
240 return -ENOSYS;
241 return sb->s_qcop->quota_enable(sb, flags);
242}
243
244static int quota_disable(struct super_block *sb, void __user *addr)
202{ 245{
203 __u32 flags; 246 __u32 flags;
204 247
205 if (copy_from_user(&flags, addr, sizeof(flags))) 248 if (copy_from_user(&flags, addr, sizeof(flags)))
206 return -EFAULT; 249 return -EFAULT;
207 if (!sb->s_qcop->set_xstate) 250 if (!sb->s_qcop->quota_disable)
208 return -ENOSYS; 251 return -ENOSYS;
209 return sb->s_qcop->set_xstate(sb, flags, cmd); 252 return sb->s_qcop->quota_disable(sb, flags);
210} 253}
211 254
212static int quota_getxstate(struct super_block *sb, void __user *addr) 255static int quota_getxstate(struct super_block *sb, void __user *addr)
@@ -247,10 +290,78 @@ static int quota_getxstatev(struct super_block *sb, void __user *addr)
247 return ret; 290 return ret;
248} 291}
249 292
293/*
294 * XFS defines BBTOB and BTOBB macros inside fs/xfs/ and we cannot move them
295 * out of there as xfsprogs rely on definitions being in that header file. So
296 * just define same functions here for quota purposes.
297 */
298#define XFS_BB_SHIFT 9
299
300static inline u64 quota_bbtob(u64 blocks)
301{
302 return blocks << XFS_BB_SHIFT;
303}
304
305static inline u64 quota_btobb(u64 bytes)
306{
307 return (bytes + (1 << XFS_BB_SHIFT) - 1) >> XFS_BB_SHIFT;
308}
309
310static void copy_from_xfs_dqblk(struct qc_dqblk *dst, struct fs_disk_quota *src)
311{
312 dst->d_spc_hardlimit = quota_bbtob(src->d_blk_hardlimit);
313 dst->d_spc_softlimit = quota_bbtob(src->d_blk_softlimit);
314 dst->d_ino_hardlimit = src->d_ino_hardlimit;
315 dst->d_ino_softlimit = src->d_ino_softlimit;
316 dst->d_space = quota_bbtob(src->d_bcount);
317 dst->d_ino_count = src->d_icount;
318 dst->d_ino_timer = src->d_itimer;
319 dst->d_spc_timer = src->d_btimer;
320 dst->d_ino_warns = src->d_iwarns;
321 dst->d_spc_warns = src->d_bwarns;
322 dst->d_rt_spc_hardlimit = quota_bbtob(src->d_rtb_hardlimit);
323 dst->d_rt_spc_softlimit = quota_bbtob(src->d_rtb_softlimit);
324 dst->d_rt_space = quota_bbtob(src->d_rtbcount);
325 dst->d_rt_spc_timer = src->d_rtbtimer;
326 dst->d_rt_spc_warns = src->d_rtbwarns;
327 dst->d_fieldmask = 0;
328 if (src->d_fieldmask & FS_DQ_ISOFT)
329 dst->d_fieldmask |= QC_INO_SOFT;
330 if (src->d_fieldmask & FS_DQ_IHARD)
331 dst->d_fieldmask |= QC_INO_HARD;
332 if (src->d_fieldmask & FS_DQ_BSOFT)
333 dst->d_fieldmask |= QC_SPC_SOFT;
334 if (src->d_fieldmask & FS_DQ_BHARD)
335 dst->d_fieldmask |= QC_SPC_HARD;
336 if (src->d_fieldmask & FS_DQ_RTBSOFT)
337 dst->d_fieldmask |= QC_RT_SPC_SOFT;
338 if (src->d_fieldmask & FS_DQ_RTBHARD)
339 dst->d_fieldmask |= QC_RT_SPC_HARD;
340 if (src->d_fieldmask & FS_DQ_BTIMER)
341 dst->d_fieldmask |= QC_SPC_TIMER;
342 if (src->d_fieldmask & FS_DQ_ITIMER)
343 dst->d_fieldmask |= QC_INO_TIMER;
344 if (src->d_fieldmask & FS_DQ_RTBTIMER)
345 dst->d_fieldmask |= QC_RT_SPC_TIMER;
346 if (src->d_fieldmask & FS_DQ_BWARNS)
347 dst->d_fieldmask |= QC_SPC_WARNS;
348 if (src->d_fieldmask & FS_DQ_IWARNS)
349 dst->d_fieldmask |= QC_INO_WARNS;
350 if (src->d_fieldmask & FS_DQ_RTBWARNS)
351 dst->d_fieldmask |= QC_RT_SPC_WARNS;
352 if (src->d_fieldmask & FS_DQ_BCOUNT)
353 dst->d_fieldmask |= QC_SPACE;
354 if (src->d_fieldmask & FS_DQ_ICOUNT)
355 dst->d_fieldmask |= QC_INO_COUNT;
356 if (src->d_fieldmask & FS_DQ_RTBCOUNT)
357 dst->d_fieldmask |= QC_RT_SPACE;
358}
359
250static int quota_setxquota(struct super_block *sb, int type, qid_t id, 360static int quota_setxquota(struct super_block *sb, int type, qid_t id,
251 void __user *addr) 361 void __user *addr)
252{ 362{
253 struct fs_disk_quota fdq; 363 struct fs_disk_quota fdq;
364 struct qc_dqblk qdq;
254 struct kqid qid; 365 struct kqid qid;
255 366
256 if (copy_from_user(&fdq, addr, sizeof(fdq))) 367 if (copy_from_user(&fdq, addr, sizeof(fdq)))
@@ -260,13 +371,44 @@ static int quota_setxquota(struct super_block *sb, int type, qid_t id,
260 qid = make_kqid(current_user_ns(), type, id); 371 qid = make_kqid(current_user_ns(), type, id);
261 if (!qid_valid(qid)) 372 if (!qid_valid(qid))
262 return -EINVAL; 373 return -EINVAL;
263 return sb->s_qcop->set_dqblk(sb, qid, &fdq); 374 copy_from_xfs_dqblk(&qdq, &fdq);
375 return sb->s_qcop->set_dqblk(sb, qid, &qdq);
376}
377
378static void copy_to_xfs_dqblk(struct fs_disk_quota *dst, struct qc_dqblk *src,
379 int type, qid_t id)
380{
381 memset(dst, 0, sizeof(*dst));
382 dst->d_version = FS_DQUOT_VERSION;
383 dst->d_id = id;
384 if (type == USRQUOTA)
385 dst->d_flags = FS_USER_QUOTA;
386 else if (type == PRJQUOTA)
387 dst->d_flags = FS_PROJ_QUOTA;
388 else
389 dst->d_flags = FS_GROUP_QUOTA;
390 dst->d_blk_hardlimit = quota_btobb(src->d_spc_hardlimit);
391 dst->d_blk_softlimit = quota_btobb(src->d_spc_softlimit);
392 dst->d_ino_hardlimit = src->d_ino_hardlimit;
393 dst->d_ino_softlimit = src->d_ino_softlimit;
394 dst->d_bcount = quota_btobb(src->d_space);
395 dst->d_icount = src->d_ino_count;
396 dst->d_itimer = src->d_ino_timer;
397 dst->d_btimer = src->d_spc_timer;
398 dst->d_iwarns = src->d_ino_warns;
399 dst->d_bwarns = src->d_spc_warns;
400 dst->d_rtb_hardlimit = quota_btobb(src->d_rt_spc_hardlimit);
401 dst->d_rtb_softlimit = quota_btobb(src->d_rt_spc_softlimit);
402 dst->d_rtbcount = quota_btobb(src->d_rt_space);
403 dst->d_rtbtimer = src->d_rt_spc_timer;
404 dst->d_rtbwarns = src->d_rt_spc_warns;
264} 405}
265 406
266static int quota_getxquota(struct super_block *sb, int type, qid_t id, 407static int quota_getxquota(struct super_block *sb, int type, qid_t id,
267 void __user *addr) 408 void __user *addr)
268{ 409{
269 struct fs_disk_quota fdq; 410 struct fs_disk_quota fdq;
411 struct qc_dqblk qdq;
270 struct kqid qid; 412 struct kqid qid;
271 int ret; 413 int ret;
272 414
@@ -275,8 +417,11 @@ static int quota_getxquota(struct super_block *sb, int type, qid_t id,
275 qid = make_kqid(current_user_ns(), type, id); 417 qid = make_kqid(current_user_ns(), type, id);
276 if (!qid_valid(qid)) 418 if (!qid_valid(qid))
277 return -EINVAL; 419 return -EINVAL;
278 ret = sb->s_qcop->get_dqblk(sb, qid, &fdq); 420 ret = sb->s_qcop->get_dqblk(sb, qid, &qdq);
279 if (!ret && copy_to_user(addr, &fdq, sizeof(fdq))) 421 if (ret)
422 return ret;
423 copy_to_xfs_dqblk(&fdq, &qdq, type, id);
424 if (copy_to_user(addr, &fdq, sizeof(fdq)))
280 return -EFAULT; 425 return -EFAULT;
281 return ret; 426 return ret;
282} 427}
@@ -317,9 +462,7 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
317 case Q_QUOTAON: 462 case Q_QUOTAON:
318 return quota_quotaon(sb, type, cmd, id, path); 463 return quota_quotaon(sb, type, cmd, id, path);
319 case Q_QUOTAOFF: 464 case Q_QUOTAOFF:
320 if (!sb->s_qcop->quota_off) 465 return quota_quotaoff(sb, type);
321 return -ENOSYS;
322 return sb->s_qcop->quota_off(sb, type);
323 case Q_GETFMT: 466 case Q_GETFMT:
324 return quota_getfmt(sb, type, addr); 467 return quota_getfmt(sb, type, addr);
325 case Q_GETINFO: 468 case Q_GETINFO:
@@ -335,8 +478,9 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
335 return -ENOSYS; 478 return -ENOSYS;
336 return sb->s_qcop->quota_sync(sb, type); 479 return sb->s_qcop->quota_sync(sb, type);
337 case Q_XQUOTAON: 480 case Q_XQUOTAON:
481 return quota_enable(sb, addr);
338 case Q_XQUOTAOFF: 482 case Q_XQUOTAOFF:
339 return quota_setxstate(sb, cmd, addr); 483 return quota_disable(sb, addr);
340 case Q_XQUOTARM: 484 case Q_XQUOTARM:
341 return quota_rmxquota(sb, addr); 485 return quota_rmxquota(sb, addr);
342 case Q_XGETQSTAT: 486 case Q_XGETQSTAT:
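The copy_{to,from}_xfs_dqblk() helpers above bridge two unit systems:
struct qc_dqblk counts space in bytes, while struct fs_disk_quota keeps
the XFS convention of 512-byte "basic blocks". A minimal sketch of the
conversion they rely on; the XQM_BBSHIFT name and the round-up direction
follow the quota_btobb()/quota_bbtob() macros in fs/quota/quota.c, so
treat this as illustrative rather than a drop-in:

	#define XQM_BBSHIFT 9			/* 512-byte basic blocks */

	static u64 sketch_btobb(u64 bytes)	/* bytes -> blocks, round up */
	{
		return (bytes + (1ULL << XQM_BBSHIFT) - 1) >> XQM_BBSHIFT;
	}

	static u64 sketch_bbtob(u64 blocks)	/* blocks -> bytes */
	{
		return blocks << XQM_BBSHIFT;
	}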
diff --git a/fs/quota/quota_v1.c b/fs/quota/quota_v1.c
index 469c6848b322..8fe79beced5c 100644
--- a/fs/quota/quota_v1.c
+++ b/fs/quota/quota_v1.c
@@ -169,8 +169,8 @@ static int v1_read_file_info(struct super_block *sb, int type)
169 } 169 }
170 ret = 0; 170 ret = 0;
171 /* limits are stored as unsigned 32-bit data */ 171 /* limits are stored as unsigned 32-bit data */
172 dqopt->info[type].dqi_maxblimit = 0xffffffff; 172 dqopt->info[type].dqi_max_spc_limit = 0xffffffffULL << QUOTABLOCK_BITS;
173 dqopt->info[type].dqi_maxilimit = 0xffffffff; 173 dqopt->info[type].dqi_max_ino_limit = 0xffffffff;
174 dqopt->info[type].dqi_igrace = 174 dqopt->info[type].dqi_igrace =
175 dqblk.dqb_itime ? dqblk.dqb_itime : MAX_IQ_TIME; 175 dqblk.dqb_itime ? dqblk.dqb_itime : MAX_IQ_TIME;
176 dqopt->info[type].dqi_bgrace = 176 dqopt->info[type].dqi_bgrace =
diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c
index 02751ec695c5..9cb10d7197f7 100644
--- a/fs/quota/quota_v2.c
+++ b/fs/quota/quota_v2.c
@@ -117,16 +117,17 @@ static int v2_read_file_info(struct super_block *sb, int type)
117 qinfo = info->dqi_priv; 117 qinfo = info->dqi_priv;
118 if (version == 0) { 118 if (version == 0) {
119 /* limits are stored as unsigned 32-bit data */ 119 /* limits are stored as unsigned 32-bit data */
120 info->dqi_maxblimit = 0xffffffff; 120 info->dqi_max_spc_limit = 0xffffffffULL << QUOTABLOCK_BITS;
121 info->dqi_maxilimit = 0xffffffff; 121 info->dqi_max_ino_limit = 0xffffffff;
122 } else { 122 } else {
123 /* used space is stored as unsigned 64-bit value */ 123 /* used space is stored as unsigned 64-bit value in bytes */
124 info->dqi_maxblimit = 0xffffffffffffffffULL; /* 2^64-1 */ 124 info->dqi_max_spc_limit = 0xffffffffffffffffULL; /* 2^64-1 */
125 info->dqi_maxilimit = 0xffffffffffffffffULL; 125 info->dqi_max_ino_limit = 0xffffffffffffffffULL;
126 } 126 }
127 info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace); 127 info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace);
128 info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace); 128 info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace);
129 info->dqi_flags = le32_to_cpu(dinfo.dqi_flags); 129 /* No flags currently supported */
130 info->dqi_flags = 0;
130 qinfo->dqi_sb = sb; 131 qinfo->dqi_sb = sb;
131 qinfo->dqi_type = type; 132 qinfo->dqi_type = type;
132 qinfo->dqi_blocks = le32_to_cpu(dinfo.dqi_blocks); 133 qinfo->dqi_blocks = le32_to_cpu(dinfo.dqi_blocks);
@@ -157,7 +158,8 @@ static int v2_write_file_info(struct super_block *sb, int type)
157 info->dqi_flags &= ~DQF_INFO_DIRTY; 158 info->dqi_flags &= ~DQF_INFO_DIRTY;
158 dinfo.dqi_bgrace = cpu_to_le32(info->dqi_bgrace); 159 dinfo.dqi_bgrace = cpu_to_le32(info->dqi_bgrace);
159 dinfo.dqi_igrace = cpu_to_le32(info->dqi_igrace); 160 dinfo.dqi_igrace = cpu_to_le32(info->dqi_igrace);
160 dinfo.dqi_flags = cpu_to_le32(info->dqi_flags & DQF_MASK); 161 /* No flags currently supported */
162 dinfo.dqi_flags = cpu_to_le32(0);
161 spin_unlock(&dq_data_lock); 163 spin_unlock(&dq_data_lock);
162 dinfo.dqi_blocks = cpu_to_le32(qinfo->dqi_blocks); 164 dinfo.dqi_blocks = cpu_to_le32(qinfo->dqi_blocks);
163 dinfo.dqi_free_blk = cpu_to_le32(qinfo->dqi_free_blk); 165 dinfo.dqi_free_blk = cpu_to_le32(qinfo->dqi_free_blk);
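In both format versions above the maximum space limit changes meaning:
dqi_max_spc_limit is expressed in bytes, where the old dqi_maxblimit
counted quota blocks. With QUOTABLOCK_BITS = 10 (1 KiB quota blocks), a
quick check of the new v1/v2r0 cap:

	/* old cap: 0xffffffff quota blocks; new cap, in bytes: */
	u64 max_spc = 0xffffffffULL << 10;
	/* == 4398046510080 == 4 TiB - 1 KiB; the inode cap stays 32-bit */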
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index bbafbde3471a..f6ab41b39612 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -34,7 +34,14 @@ static unsigned long ramfs_nommu_get_unmapped_area(struct file *file,
34 unsigned long flags); 34 unsigned long flags);
35static int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma); 35static int ramfs_nommu_mmap(struct file *file, struct vm_area_struct *vma);
36 36
37static unsigned ramfs_mmap_capabilities(struct file *file)
38{
39 return NOMMU_MAP_DIRECT | NOMMU_MAP_COPY | NOMMU_MAP_READ |
40 NOMMU_MAP_WRITE | NOMMU_MAP_EXEC;
41}
42
37const struct file_operations ramfs_file_operations = { 43const struct file_operations ramfs_file_operations = {
44 .mmap_capabilities = ramfs_mmap_capabilities,
38 .mmap = ramfs_nommu_mmap, 45 .mmap = ramfs_nommu_mmap,
39 .get_unmapped_area = ramfs_nommu_get_unmapped_area, 46 .get_unmapped_area = ramfs_nommu_get_unmapped_area,
40 .read = new_sync_read, 47 .read = new_sync_read,
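The new ->mmap_capabilities hook moves what used to be advertised
through a static backing_dev_info (removed in the fs/ramfs/inode.c hunk
below) onto the file itself. A hedged sketch of how a nommu caller
might consult it; the helper name is hypothetical and the fallback
value an assumption:

	static unsigned sketch_query_caps(struct file *file)
	{
		/* prefer the per-file hook when the fs provides one */
		if (file->f_op->mmap_capabilities)
			return file->f_op->mmap_capabilities(file);
		/* otherwise assume only private copies can be mapped */
		return NOMMU_MAP_COPY;
	}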
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index d365b1c4eb3c..889d558b4e05 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -50,14 +50,6 @@ static const struct address_space_operations ramfs_aops = {
50 .set_page_dirty = __set_page_dirty_no_writeback, 50 .set_page_dirty = __set_page_dirty_no_writeback,
51}; 51};
52 52
53static struct backing_dev_info ramfs_backing_dev_info = {
54 .name = "ramfs",
55 .ra_pages = 0, /* No readahead */
56 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK |
57 BDI_CAP_MAP_DIRECT | BDI_CAP_MAP_COPY |
58 BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP,
59};
60
61struct inode *ramfs_get_inode(struct super_block *sb, 53struct inode *ramfs_get_inode(struct super_block *sb,
62 const struct inode *dir, umode_t mode, dev_t dev) 54 const struct inode *dir, umode_t mode, dev_t dev)
63{ 55{
@@ -67,7 +59,6 @@ struct inode *ramfs_get_inode(struct super_block *sb,
67 inode->i_ino = get_next_ino(); 59 inode->i_ino = get_next_ino();
68 inode_init_owner(inode, dir, mode); 60 inode_init_owner(inode, dir, mode);
69 inode->i_mapping->a_ops = &ramfs_aops; 61 inode->i_mapping->a_ops = &ramfs_aops;
70 inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info;
71 mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); 62 mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER);
72 mapping_set_unevictable(inode->i_mapping); 63 mapping_set_unevictable(inode->i_mapping);
73 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 64 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
@@ -267,19 +258,9 @@ static struct file_system_type ramfs_fs_type = {
267int __init init_ramfs_fs(void) 258int __init init_ramfs_fs(void)
268{ 259{
269 static unsigned long once; 260 static unsigned long once;
270 int err;
271 261
272 if (test_and_set_bit(0, &once)) 262 if (test_and_set_bit(0, &once))
273 return 0; 263 return 0;
274 264 return register_filesystem(&ramfs_fs_type);
275 err = bdi_init(&ramfs_backing_dev_info);
276 if (err)
277 return err;
278
279 err = register_filesystem(&ramfs_fs_type);
280 if (err)
281 bdi_destroy(&ramfs_backing_dev_info);
282
283 return err;
284} 265}
285fs_initcall(init_ramfs_fs); 266fs_initcall(init_ramfs_fs);
diff --git a/fs/read_write.c b/fs/read_write.c
index c0805c93b6fa..8e1b68786d66 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -333,6 +333,52 @@ out_putf:
333} 333}
334#endif 334#endif
335 335
336ssize_t vfs_iter_read(struct file *file, struct iov_iter *iter, loff_t *ppos)
337{
338 struct kiocb kiocb;
339 ssize_t ret;
340
341 if (!file->f_op->read_iter)
342 return -EINVAL;
343
344 init_sync_kiocb(&kiocb, file);
345 kiocb.ki_pos = *ppos;
346 kiocb.ki_nbytes = iov_iter_count(iter);
347
348 iter->type |= READ;
349 ret = file->f_op->read_iter(&kiocb, iter);
350 if (ret == -EIOCBQUEUED)
351 ret = wait_on_sync_kiocb(&kiocb);
352
353 if (ret > 0)
354 *ppos = kiocb.ki_pos;
355 return ret;
356}
357EXPORT_SYMBOL(vfs_iter_read);
358
359ssize_t vfs_iter_write(struct file *file, struct iov_iter *iter, loff_t *ppos)
360{
361 struct kiocb kiocb;
362 ssize_t ret;
363
364 if (!file->f_op->write_iter)
365 return -EINVAL;
366
367 init_sync_kiocb(&kiocb, file);
368 kiocb.ki_pos = *ppos;
369 kiocb.ki_nbytes = iov_iter_count(iter);
370
371 iter->type |= WRITE;
372 ret = file->f_op->write_iter(&kiocb, iter);
373 if (ret == -EIOCBQUEUED)
374 ret = wait_on_sync_kiocb(&kiocb);
375
376 if (ret > 0)
377 *ppos = kiocb.ki_pos;
378 return ret;
379}
380EXPORT_SYMBOL(vfs_iter_write);
381
336/* 382/*
337 * rw_verify_area doesn't like huge counts. We limit 383 * rw_verify_area doesn't like huge counts. We limit
338 * them to something that fits in "int" so that others 384 * them to something that fits in "int" so that others
@@ -358,7 +404,7 @@ int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t
358 return retval; 404 return retval;
359 } 405 }
360 406
361 if (unlikely(inode->i_flock && mandatory_lock(inode))) { 407 if (unlikely(inode->i_flctx && mandatory_lock(inode))) {
362 retval = locks_mandatory_area( 408 retval = locks_mandatory_area(
363 read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE, 409 read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE,
364 inode, file, pos, count); 410 inode, file, pos, count);
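A usage sketch for the freshly exported vfs_iter_read()/vfs_iter_write()
pair (the fs/splice.c conversion below is the in-tree consumer): wrap a
kernel buffer in an iov_iter and let the helper drive ->write_iter(),
including the -EIOCBQUEUED wait. The kvec setup assumes the
iov_iter_kvec() helper of this era; the function name is illustrative:

	static ssize_t sketch_kernel_write(struct file *file, void *buf,
					   size_t len, loff_t *pos)
	{
		struct kvec kv = { .iov_base = buf, .iov_len = len };
		struct iov_iter iter;

		iov_iter_kvec(&iter, ITER_KVEC | WRITE, &kv, 1, len);
		return vfs_iter_write(file, &iter, pos);
	}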
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index a7eec9888f10..e72401e1f995 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -2766,7 +2766,7 @@ static int reiserfs_write_begin(struct file *file,
2766 int old_ref = 0; 2766 int old_ref = 0;
2767 2767
2768 inode = mapping->host; 2768 inode = mapping->host;
2769 *fsdata = 0; 2769 *fsdata = NULL;
2770 if (flags & AOP_FLAG_CONT_EXPAND && 2770 if (flags & AOP_FLAG_CONT_EXPAND &&
2771 (pos & (inode->i_sb->s_blocksize - 1)) == 0) { 2771 (pos & (inode->i_sb->s_blocksize - 1)) == 0) {
2772 pos ++; 2772 pos ++;
diff --git a/fs/romfs/mmap-nommu.c b/fs/romfs/mmap-nommu.c
index ea06c7554860..7da9e2153953 100644
--- a/fs/romfs/mmap-nommu.c
+++ b/fs/romfs/mmap-nommu.c
@@ -70,6 +70,15 @@ static int romfs_mmap(struct file *file, struct vm_area_struct *vma)
70 return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -ENOSYS; 70 return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -ENOSYS;
71} 71}
72 72
73static unsigned romfs_mmap_capabilities(struct file *file)
74{
75 struct mtd_info *mtd = file_inode(file)->i_sb->s_mtd;
76
77 if (!mtd)
78 return NOMMU_MAP_COPY;
79 return mtd_mmap_capabilities(mtd);
80}
81
73const struct file_operations romfs_ro_fops = { 82const struct file_operations romfs_ro_fops = {
74 .llseek = generic_file_llseek, 83 .llseek = generic_file_llseek,
75 .read = new_sync_read, 84 .read = new_sync_read,
@@ -77,4 +86,5 @@ const struct file_operations romfs_ro_fops = {
77 .splice_read = generic_file_splice_read, 86 .splice_read = generic_file_splice_read,
78 .mmap = romfs_mmap, 87 .mmap = romfs_mmap,
79 .get_unmapped_area = romfs_get_unmapped_area, 88 .get_unmapped_area = romfs_get_unmapped_area,
89 .mmap_capabilities = romfs_mmap_capabilities,
80}; 90};
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index e98dd88197d5..268733cda397 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -355,9 +355,6 @@ static struct inode *romfs_iget(struct super_block *sb, unsigned long pos)
355 case ROMFH_REG: 355 case ROMFH_REG:
356 i->i_fop = &romfs_ro_fops; 356 i->i_fop = &romfs_ro_fops;
357 i->i_data.a_ops = &romfs_aops; 357 i->i_data.a_ops = &romfs_aops;
358 if (i->i_sb->s_mtd)
359 i->i_data.backing_dev_info =
360 i->i_sb->s_mtd->backing_dev_info;
361 if (nextfh & ROMFH_EXEC) 358 if (nextfh & ROMFH_EXEC)
362 mode |= S_IXUGO; 359 mode |= S_IXUGO;
363 break; 360 break;
diff --git a/fs/select.c b/fs/select.c
index 467bb1cb3ea5..f684c750e08a 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -971,7 +971,7 @@ SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds,
971 if (ret == -EINTR) { 971 if (ret == -EINTR) {
972 struct restart_block *restart_block; 972 struct restart_block *restart_block;
973 973
974 restart_block = &current_thread_info()->restart_block; 974 restart_block = &current->restart_block;
975 restart_block->fn = do_restart_poll; 975 restart_block->fn = do_restart_poll;
976 restart_block->poll.ufds = ufds; 976 restart_block->poll.ufds = ufds;
977 restart_block->poll.nfds = nfds; 977 restart_block->poll.nfds = nfds;
diff --git a/fs/seq_file.c b/fs/seq_file.c
index dbf3a59c86bb..555f82155be8 100644
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -539,38 +539,6 @@ int seq_dentry(struct seq_file *m, struct dentry *dentry, const char *esc)
539 return res; 539 return res;
540} 540}
541 541
542int seq_bitmap(struct seq_file *m, const unsigned long *bits,
543 unsigned int nr_bits)
544{
545 if (m->count < m->size) {
546 int len = bitmap_scnprintf(m->buf + m->count,
547 m->size - m->count, bits, nr_bits);
548 if (m->count + len < m->size) {
549 m->count += len;
550 return 0;
551 }
552 }
553 seq_set_overflow(m);
554 return -1;
555}
556EXPORT_SYMBOL(seq_bitmap);
557
558int seq_bitmap_list(struct seq_file *m, const unsigned long *bits,
559 unsigned int nr_bits)
560{
561 if (m->count < m->size) {
562 int len = bitmap_scnlistprintf(m->buf + m->count,
563 m->size - m->count, bits, nr_bits);
564 if (m->count + len < m->size) {
565 m->count += len;
566 return 0;
567 }
568 }
569 seq_set_overflow(m);
570 return -1;
571}
572EXPORT_SYMBOL(seq_bitmap_list);
573
574static void *single_start(struct seq_file *p, loff_t *pos) 542static void *single_start(struct seq_file *p, loff_t *pos)
575{ 543{
576 return NULL + (*pos == 0); 544 return NULL + (*pos == 0);
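seq_bitmap() and seq_bitmap_list() are deleted outright here; a
migration sketch for former callers, assuming the "%*pb"/"%*pbl"
vsnprintf extensions from the same development cycle are available:

	seq_printf(m, "%*pb",  nr_bits, bits);	/* hex mask, e.g. "f0f"     */
	seq_printf(m, "%*pbl", nr_bits, bits);	/* range list, e.g. "0-3,8" */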
diff --git a/fs/splice.c b/fs/splice.c
index 75c6058eabf2..7968da96bebb 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -961,7 +961,6 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
961 splice_from_pipe_begin(&sd); 961 splice_from_pipe_begin(&sd);
962 while (sd.total_len) { 962 while (sd.total_len) {
963 struct iov_iter from; 963 struct iov_iter from;
964 struct kiocb kiocb;
965 size_t left; 964 size_t left;
966 int n, idx; 965 int n, idx;
967 966
@@ -1005,29 +1004,15 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
1005 left -= this_len; 1004 left -= this_len;
1006 } 1005 }
1007 1006
1008 /* ... iov_iter */ 1007 iov_iter_bvec(&from, ITER_BVEC | WRITE, array, n,
1009 from.type = ITER_BVEC | WRITE; 1008 sd.total_len - left);
1010 from.bvec = array; 1009 ret = vfs_iter_write(out, &from, &sd.pos);
1011 from.nr_segs = n;
1012 from.count = sd.total_len - left;
1013 from.iov_offset = 0;
1014
1015 /* ... and iocb */
1016 init_sync_kiocb(&kiocb, out);
1017 kiocb.ki_pos = sd.pos;
1018 kiocb.ki_nbytes = sd.total_len - left;
1019
1020 /* now, send it */
1021 ret = out->f_op->write_iter(&kiocb, &from);
1022 if (-EIOCBQUEUED == ret)
1023 ret = wait_on_sync_kiocb(&kiocb);
1024
1025 if (ret <= 0) 1010 if (ret <= 0)
1026 break; 1011 break;
1027 1012
1028 sd.num_spliced += ret; 1013 sd.num_spliced += ret;
1029 sd.total_len -= ret; 1014 sd.total_len -= ret;
1030 *ppos = sd.pos = kiocb.ki_pos; 1015 *ppos = sd.pos;
1031 1016
1032 /* dismiss the fully eaten buffers, adjust the partial one */ 1017 /* dismiss the fully eaten buffers, adjust the partial one */
1033 while (ret) { 1018 while (ret) {
diff --git a/fs/super.c b/fs/super.c
index eae088f6aaae..65a53efc1cf4 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -36,8 +36,8 @@
36#include "internal.h" 36#include "internal.h"
37 37
38 38
39LIST_HEAD(super_blocks); 39static LIST_HEAD(super_blocks);
40DEFINE_SPINLOCK(sb_lock); 40static DEFINE_SPINLOCK(sb_lock);
41 41
42static char *sb_writers_name[SB_FREEZE_LEVELS] = { 42static char *sb_writers_name[SB_FREEZE_LEVELS] = {
43 "sb_writers", 43 "sb_writers",
@@ -75,10 +75,10 @@ static unsigned long super_cache_scan(struct shrinker *shrink,
75 return SHRINK_STOP; 75 return SHRINK_STOP;
76 76
77 if (sb->s_op->nr_cached_objects) 77 if (sb->s_op->nr_cached_objects)
78 fs_objects = sb->s_op->nr_cached_objects(sb, sc->nid); 78 fs_objects = sb->s_op->nr_cached_objects(sb, sc);
79 79
80 inodes = list_lru_count_node(&sb->s_inode_lru, sc->nid); 80 inodes = list_lru_shrink_count(&sb->s_inode_lru, sc);
81 dentries = list_lru_count_node(&sb->s_dentry_lru, sc->nid); 81 dentries = list_lru_shrink_count(&sb->s_dentry_lru, sc);
82 total_objects = dentries + inodes + fs_objects + 1; 82 total_objects = dentries + inodes + fs_objects + 1;
83 if (!total_objects) 83 if (!total_objects)
84 total_objects = 1; 84 total_objects = 1;
@@ -86,19 +86,23 @@ static unsigned long super_cache_scan(struct shrinker *shrink,
86 /* proportion the scan between the caches */ 86 /* proportion the scan between the caches */
87 dentries = mult_frac(sc->nr_to_scan, dentries, total_objects); 87 dentries = mult_frac(sc->nr_to_scan, dentries, total_objects);
88 inodes = mult_frac(sc->nr_to_scan, inodes, total_objects); 88 inodes = mult_frac(sc->nr_to_scan, inodes, total_objects);
89 fs_objects = mult_frac(sc->nr_to_scan, fs_objects, total_objects);
89 90
90 /* 91 /*
91 * prune the dcache first as the icache is pinned by it, then 92 * prune the dcache first as the icache is pinned by it, then
92 * prune the icache, followed by the filesystem specific caches 93 * prune the icache, followed by the filesystem specific caches
94 *
95 * Ensure that we always scan at least one object - memcg kmem
96 * accounting uses this to fully empty the caches.
93 */ 97 */
94 freed = prune_dcache_sb(sb, dentries, sc->nid); 98 sc->nr_to_scan = dentries + 1;
95 freed += prune_icache_sb(sb, inodes, sc->nid); 99 freed = prune_dcache_sb(sb, sc);
100 sc->nr_to_scan = inodes + 1;
101 freed += prune_icache_sb(sb, sc);
96 102
97 if (fs_objects) { 103 if (fs_objects) {
98 fs_objects = mult_frac(sc->nr_to_scan, fs_objects, 104 sc->nr_to_scan = fs_objects + 1;
99 total_objects); 105 freed += sb->s_op->free_cached_objects(sb, sc);
100 freed += sb->s_op->free_cached_objects(sb, fs_objects,
101 sc->nid);
102 } 106 }
103 107
104 drop_super(sb); 108 drop_super(sb);
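A worked instance of the proportioning in super_cache_scan() above,
with made-up counts (dentries = 600, inodes = 300, fs_objects = 100,
sc->nr_to_scan = 128, so total_objects = 1001):

	/*
	 * dentry target = mult_frac(128, 600, 1001) = 76   (then + 1)
	 * inode  target = mult_frac(128, 300, 1001) = 38   (then + 1)
	 * fs     target = mult_frac(128, 100, 1001) = 12   (then + 1)
	 *
	 * The "+ 1" guarantees at least one object is scanned per
	 * cache, which memcg kmem accounting relies on to be able to
	 * drain a cache completely.
	 */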
@@ -118,17 +122,14 @@ static unsigned long super_cache_count(struct shrinker *shrink,
118 * scalability bottleneck. The counts could get updated 122 * scalability bottleneck. The counts could get updated
119 * between super_cache_count and super_cache_scan anyway. 123 * between super_cache_count and super_cache_scan anyway.
120 * Call to super_cache_count with shrinker_rwsem held 124 * Call to super_cache_count with shrinker_rwsem held
121 * ensures the safety of call to list_lru_count_node() and 125 * ensures the safety of call to list_lru_shrink_count() and
122 * s_op->nr_cached_objects(). 126 * s_op->nr_cached_objects().
123 */ 127 */
124 if (sb->s_op && sb->s_op->nr_cached_objects) 128 if (sb->s_op && sb->s_op->nr_cached_objects)
125 total_objects = sb->s_op->nr_cached_objects(sb, 129 total_objects = sb->s_op->nr_cached_objects(sb, sc);
126 sc->nid);
127 130
128 total_objects += list_lru_count_node(&sb->s_dentry_lru, 131 total_objects += list_lru_shrink_count(&sb->s_dentry_lru, sc);
129 sc->nid); 132 total_objects += list_lru_shrink_count(&sb->s_inode_lru, sc);
130 total_objects += list_lru_count_node(&sb->s_inode_lru,
131 sc->nid);
132 133
133 total_objects = vfs_pressure_ratio(total_objects); 134 total_objects = vfs_pressure_ratio(total_objects);
134 return total_objects; 135 return total_objects;
@@ -185,15 +186,15 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
185 } 186 }
186 init_waitqueue_head(&s->s_writers.wait); 187 init_waitqueue_head(&s->s_writers.wait);
187 init_waitqueue_head(&s->s_writers.wait_unfrozen); 188 init_waitqueue_head(&s->s_writers.wait_unfrozen);
189 s->s_bdi = &noop_backing_dev_info;
188 s->s_flags = flags; 190 s->s_flags = flags;
189 s->s_bdi = &default_backing_dev_info;
190 INIT_HLIST_NODE(&s->s_instances); 191 INIT_HLIST_NODE(&s->s_instances);
191 INIT_HLIST_BL_HEAD(&s->s_anon); 192 INIT_HLIST_BL_HEAD(&s->s_anon);
192 INIT_LIST_HEAD(&s->s_inodes); 193 INIT_LIST_HEAD(&s->s_inodes);
193 194
194 if (list_lru_init(&s->s_dentry_lru)) 195 if (list_lru_init_memcg(&s->s_dentry_lru))
195 goto fail; 196 goto fail;
196 if (list_lru_init(&s->s_inode_lru)) 197 if (list_lru_init_memcg(&s->s_inode_lru))
197 goto fail; 198 goto fail;
198 199
199 init_rwsem(&s->s_umount); 200 init_rwsem(&s->s_umount);
@@ -229,7 +230,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
229 s->s_shrink.scan_objects = super_cache_scan; 230 s->s_shrink.scan_objects = super_cache_scan;
230 s->s_shrink.count_objects = super_cache_count; 231 s->s_shrink.count_objects = super_cache_count;
231 s->s_shrink.batch = 1024; 232 s->s_shrink.batch = 1024;
232 s->s_shrink.flags = SHRINKER_NUMA_AWARE; 233 s->s_shrink.flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE;
233 return s; 234 return s;
234 235
235fail: 236fail:
@@ -284,6 +285,14 @@ void deactivate_locked_super(struct super_block *s)
284 unregister_shrinker(&s->s_shrink); 285 unregister_shrinker(&s->s_shrink);
285 fs->kill_sb(s); 286 fs->kill_sb(s);
286 287
288 /*
289 * Since list_lru_destroy() may sleep, we cannot call it from
290 * put_super(), where we hold the sb_lock. Therefore we destroy
291 * the lru lists right now.
292 */
293 list_lru_destroy(&s->s_dentry_lru);
294 list_lru_destroy(&s->s_inode_lru);
295
287 put_filesystem(fs); 296 put_filesystem(fs);
288 put_super(s); 297 put_super(s);
289 } else { 298 } else {
@@ -706,9 +715,9 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
706 remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY); 715 remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY);
707 716
708 if (remount_ro) { 717 if (remount_ro) {
709 if (sb->s_pins.first) { 718 if (!hlist_empty(&sb->s_pins)) {
710 up_write(&sb->s_umount); 719 up_write(&sb->s_umount);
711 sb_pin_kill(sb); 720 group_pin_kill(&sb->s_pins);
712 down_write(&sb->s_umount); 721 down_write(&sb->s_umount);
713 if (!sb->s_root) 722 if (!sb->s_root)
714 return 0; 723 return 0;
@@ -863,10 +872,7 @@ EXPORT_SYMBOL(free_anon_bdev);
863 872
864int set_anon_super(struct super_block *s, void *data) 873int set_anon_super(struct super_block *s, void *data)
865{ 874{
866 int error = get_anon_bdev(&s->s_dev); 875 return get_anon_bdev(&s->s_dev);
867 if (!error)
868 s->s_bdi = &noop_backing_dev_info;
869 return error;
870} 876}
871 877
872EXPORT_SYMBOL(set_anon_super); 878EXPORT_SYMBOL(set_anon_super);
@@ -1111,7 +1117,6 @@ mount_fs(struct file_system_type *type, int flags, const char *name, void *data)
1111 sb = root->d_sb; 1117 sb = root->d_sb;
1112 BUG_ON(!sb); 1118 BUG_ON(!sb);
1113 WARN_ON(!sb->s_bdi); 1119 WARN_ON(!sb->s_bdi);
1114 WARN_ON(sb->s_bdi == &default_backing_dev_info);
1115 sb->s_flags |= MS_BORN; 1120 sb->s_flags |= MS_BORN;
1116 1121
1117 error = security_sb_kern_mount(sb, flags, secdata); 1122 error = security_sb_kern_mount(sb, flags, secdata);
diff --git a/fs/sync.c b/fs/sync.c
index 01d9f18a70b5..fbc98ee62044 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -177,8 +177,16 @@ SYSCALL_DEFINE1(syncfs, int, fd)
177 */ 177 */
178int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync) 178int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync)
179{ 179{
180 struct inode *inode = file->f_mapping->host;
181
180 if (!file->f_op->fsync) 182 if (!file->f_op->fsync)
181 return -EINVAL; 183 return -EINVAL;
184 if (!datasync && (inode->i_state & I_DIRTY_TIME)) {
185 spin_lock(&inode->i_lock);
186 inode->i_state &= ~I_DIRTY_TIME;
187 spin_unlock(&inode->i_lock);
188 mark_inode_dirty_sync(inode);
189 }
182 return file->f_op->fsync(file, start, end, datasync); 190 return file->f_op->fsync(file, start, end, datasync);
183} 191}
184EXPORT_SYMBOL(vfs_fsync_range); 192EXPORT_SYMBOL(vfs_fsync_range);
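The new branch in vfs_fsync_range() covers lazytime inodes: I_DIRTY_TIME
marks an inode whose only dirt is its timestamps, so fsync() must
promote it to a truly dirty state before calling ->fsync(), while
fdatasync() may skip the promotion. The same test as a one-line
predicate (helper name hypothetical):

	static bool sketch_needs_time_writeback(struct inode *inode,
						int datasync)
	{
		return !datasync && (inode->i_state & I_DIRTY_TIME);
	}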
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index dfe928a9540f..7c2867b44141 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -295,7 +295,7 @@ int sysfs_add_file_mode_ns(struct kernfs_node *parent,
295 key = attr->key ?: (struct lock_class_key *)&attr->skey; 295 key = attr->key ?: (struct lock_class_key *)&attr->skey;
296#endif 296#endif
297 kn = __kernfs_create_file(parent, attr->name, mode & 0777, size, ops, 297 kn = __kernfs_create_file(parent, attr->name, mode & 0777, size, ops,
298 (void *)attr, ns, true, key); 298 (void *)attr, ns, key);
299 if (IS_ERR(kn)) { 299 if (IS_ERR(kn)) {
300 if (PTR_ERR(kn) == -EEXIST) 300 if (PTR_ERR(kn) == -EEXIST)
301 sysfs_warn_dup(parent, attr->name); 301 sysfs_warn_dup(parent, attr->name);
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index 7d2a860ba788..2554d8835b48 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -99,7 +99,7 @@ static int internal_create_group(struct kobject *kobj, int update,
99 return -EINVAL; 99 return -EINVAL;
100 if (!grp->attrs && !grp->bin_attrs) { 100 if (!grp->attrs && !grp->bin_attrs) {
101 WARN(1, "sysfs: (bin_)attrs not set by subsystem for group: %s/%s\n", 101 WARN(1, "sysfs: (bin_)attrs not set by subsystem for group: %s/%s\n",
102 kobj->name, grp->name ? "" : grp->name); 102 kobj->name, grp->name ?: "");
103 return -EINVAL; 103 return -EINVAL;
104 } 104 }
105 if (grp->name) { 105 if (grp->name) {
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 7ed13e1e216a..4cfb3e82c56f 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -2032,6 +2032,8 @@ static int check_leaf(struct ubifs_info *c, struct ubifs_zbranch *zbr,
2032 long long blk_offs; 2032 long long blk_offs;
2033 struct ubifs_data_node *dn = node; 2033 struct ubifs_data_node *dn = node;
2034 2034
2035 ubifs_assert(zbr->len >= UBIFS_DATA_NODE_SZ);
2036
2035 /* 2037 /*
2036 * Search the inode node this data node belongs to and insert 2038 * Search the inode node this data node belongs to and insert
2037 * it to the RB-tree of inodes. 2039 * it to the RB-tree of inodes.
@@ -2060,6 +2062,8 @@ static int check_leaf(struct ubifs_info *c, struct ubifs_zbranch *zbr,
2060 struct ubifs_dent_node *dent = node; 2062 struct ubifs_dent_node *dent = node;
2061 struct fsck_inode *fscki1; 2063 struct fsck_inode *fscki1;
2062 2064
2065 ubifs_assert(zbr->len >= UBIFS_DENT_NODE_SZ);
2066
2063 err = ubifs_validate_entry(c, dent); 2067 err = ubifs_validate_entry(c, dent);
2064 if (err) 2068 if (err)
2065 goto out_dump; 2069 goto out_dump;
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index ea41649e4ca5..0fa6c803992e 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -108,8 +108,6 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir,
108 inode->i_mtime = inode->i_atime = inode->i_ctime = 108 inode->i_mtime = inode->i_atime = inode->i_ctime =
109 ubifs_current_time(inode); 109 ubifs_current_time(inode);
110 inode->i_mapping->nrpages = 0; 110 inode->i_mapping->nrpages = 0;
111 /* Disable readahead */
112 inode->i_mapping->backing_dev_info = &c->bdi;
113 111
114 switch (mode & S_IFMT) { 112 switch (mode & S_IFMT) {
115 case S_IFREG: 113 case S_IFREG:
@@ -272,6 +270,10 @@ static int ubifs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
272 goto out_budg; 270 goto out_budg;
273 } 271 }
274 272
273 err = ubifs_init_security(dir, inode, &dentry->d_name);
274 if (err)
275 goto out_cancel;
276
275 mutex_lock(&dir_ui->ui_mutex); 277 mutex_lock(&dir_ui->ui_mutex);
276 dir->i_size += sz_change; 278 dir->i_size += sz_change;
277 dir_ui->ui_size = dir->i_size; 279 dir_ui->ui_size = dir->i_size;
@@ -728,6 +730,10 @@ static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
728 goto out_budg; 730 goto out_budg;
729 } 731 }
730 732
733 err = ubifs_init_security(dir, inode, &dentry->d_name);
734 if (err)
735 goto out_cancel;
736
731 mutex_lock(&dir_ui->ui_mutex); 737 mutex_lock(&dir_ui->ui_mutex);
732 insert_inode_hash(inode); 738 insert_inode_hash(inode);
733 inc_nlink(inode); 739 inc_nlink(inode);
@@ -808,6 +814,10 @@ static int ubifs_mknod(struct inode *dir, struct dentry *dentry,
808 ui->data = dev; 814 ui->data = dev;
809 ui->data_len = devlen; 815 ui->data_len = devlen;
810 816
817 err = ubifs_init_security(dir, inode, &dentry->d_name);
818 if (err)
819 goto out_cancel;
820
811 mutex_lock(&dir_ui->ui_mutex); 821 mutex_lock(&dir_ui->ui_mutex);
812 dir->i_size += sz_change; 822 dir->i_size += sz_change;
813 dir_ui->ui_size = dir->i_size; 823 dir_ui->ui_size = dir->i_size;
@@ -884,6 +894,10 @@ static int ubifs_symlink(struct inode *dir, struct dentry *dentry,
884 ui->data_len = len; 894 ui->data_len = len;
885 inode->i_size = ubifs_inode(inode)->ui_size = len; 895 inode->i_size = ubifs_inode(inode)->ui_size = len;
886 896
897 err = ubifs_init_security(dir, inode, &dentry->d_name);
898 if (err)
899 goto out_cancel;
900
887 mutex_lock(&dir_ui->ui_mutex); 901 mutex_lock(&dir_ui->ui_mutex);
888 dir->i_size += sz_change; 902 dir->i_size += sz_change;
889 dir_ui->ui_size = dir->i_size; 903 dir_ui->ui_size = dir->i_size;
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 538519ee37d9..e627c0acf626 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1536,7 +1536,6 @@ static const struct vm_operations_struct ubifs_file_vm_ops = {
1536 .fault = filemap_fault, 1536 .fault = filemap_fault,
1537 .map_pages = filemap_map_pages, 1537 .map_pages = filemap_map_pages,
1538 .page_mkwrite = ubifs_vm_page_mkwrite, 1538 .page_mkwrite = ubifs_vm_page_mkwrite,
1539 .remap_pages = generic_file_remap_pages,
1540}; 1539};
1541 1540
1542static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma) 1541static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma)
@@ -1574,6 +1573,10 @@ const struct inode_operations ubifs_symlink_inode_operations = {
1574 .follow_link = ubifs_follow_link, 1573 .follow_link = ubifs_follow_link,
1575 .setattr = ubifs_setattr, 1574 .setattr = ubifs_setattr,
1576 .getattr = ubifs_getattr, 1575 .getattr = ubifs_getattr,
1576 .setxattr = ubifs_setxattr,
1577 .getxattr = ubifs_getxattr,
1578 .listxattr = ubifs_listxattr,
1579 .removexattr = ubifs_removexattr,
1577}; 1580};
1578 1581
1579const struct file_operations ubifs_file_operations = { 1582const struct file_operations ubifs_file_operations = {
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index 3187925e9879..9b40a1c5e160 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -1028,9 +1028,22 @@ int ubifs_replay_journal(struct ubifs_info *c)
1028 1028
1029 do { 1029 do {
1030 err = replay_log_leb(c, lnum, 0, c->sbuf); 1030 err = replay_log_leb(c, lnum, 0, c->sbuf);
1031 if (err == 1) 1031 if (err == 1) {
1032 /* We hit the end of the log */ 1032 if (lnum != c->lhead_lnum)
1033 break; 1033 /* We hit the end of the log */
1034 break;
1035
1036 /*
1037 * The head of the log must always start with the
1038 * "commit start" node on a properly formatted UBIFS.
1039 * But we found no nodes at all, which means that
 1040 * something went wrong and we cannot proceed mounting
1041 * the file-system.
1042 */
1043 ubifs_err("no UBIFS nodes found at the log head LEB %d:%d, possibly corrupted",
1044 lnum, 0);
1045 err = -EINVAL;
1046 }
1034 if (err) 1047 if (err)
1035 goto out; 1048 goto out;
1036 lnum = ubifs_next_log_lnum(c, lnum); 1049 lnum = ubifs_next_log_lnum(c, lnum);
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 106bf20629ce..93e946561c5c 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -156,9 +156,6 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum)
156 if (err) 156 if (err)
157 goto out_invalid; 157 goto out_invalid;
158 158
159 /* Disable read-ahead */
160 inode->i_mapping->backing_dev_info = &c->bdi;
161
162 switch (inode->i_mode & S_IFMT) { 159 switch (inode->i_mode & S_IFMT) {
163 case S_IFREG: 160 case S_IFREG:
164 inode->i_mapping->a_ops = &ubifs_file_address_operations; 161 inode->i_mapping->a_ops = &ubifs_file_address_operations;
@@ -2017,7 +2014,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
2017 * Read-ahead will be disabled because @c->bdi.ra_pages is 0. 2014 * Read-ahead will be disabled because @c->bdi.ra_pages is 0.
2018 */ 2015 */
2019 c->bdi.name = "ubifs", 2016 c->bdi.name = "ubifs",
2020 c->bdi.capabilities = BDI_CAP_MAP_COPY; 2017 c->bdi.capabilities = 0;
2021 err = bdi_init(&c->bdi); 2018 err = bdi_init(&c->bdi);
2022 if (err) 2019 if (err)
2023 goto out_close; 2020 goto out_close;
@@ -2039,6 +2036,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
2039 if (c->max_inode_sz > MAX_LFS_FILESIZE) 2036 if (c->max_inode_sz > MAX_LFS_FILESIZE)
2040 sb->s_maxbytes = c->max_inode_sz = MAX_LFS_FILESIZE; 2037 sb->s_maxbytes = c->max_inode_sz = MAX_LFS_FILESIZE;
2041 sb->s_op = &ubifs_super_operations; 2038 sb->s_op = &ubifs_super_operations;
2039 sb->s_xattr = ubifs_xattr_handlers;
2042 2040
2043 mutex_lock(&c->umount_mutex); 2041 mutex_lock(&c->umount_mutex);
2044 err = mount_ubifs(c); 2042 err = mount_ubifs(c);
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index c4fe900c67ab..bc04b9c69891 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -36,6 +36,7 @@
36#include <linux/mtd/ubi.h> 36#include <linux/mtd/ubi.h>
37#include <linux/pagemap.h> 37#include <linux/pagemap.h>
38#include <linux/backing-dev.h> 38#include <linux/backing-dev.h>
39#include <linux/security.h>
39#include "ubifs-media.h" 40#include "ubifs-media.h"
40 41
41/* Version of this UBIFS implementation */ 42/* Version of this UBIFS implementation */
@@ -1465,6 +1466,7 @@ extern spinlock_t ubifs_infos_lock;
1465extern atomic_long_t ubifs_clean_zn_cnt; 1466extern atomic_long_t ubifs_clean_zn_cnt;
1466extern struct kmem_cache *ubifs_inode_slab; 1467extern struct kmem_cache *ubifs_inode_slab;
1467extern const struct super_operations ubifs_super_operations; 1468extern const struct super_operations ubifs_super_operations;
1469extern const struct xattr_handler *ubifs_xattr_handlers[];
1468extern const struct address_space_operations ubifs_file_address_operations; 1470extern const struct address_space_operations ubifs_file_address_operations;
1469extern const struct file_operations ubifs_file_operations; 1471extern const struct file_operations ubifs_file_operations;
1470extern const struct inode_operations ubifs_file_inode_operations; 1472extern const struct inode_operations ubifs_file_inode_operations;
@@ -1754,6 +1756,8 @@ ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf,
1754 size_t size); 1756 size_t size);
1755ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size); 1757ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size);
1756int ubifs_removexattr(struct dentry *dentry, const char *name); 1758int ubifs_removexattr(struct dentry *dentry, const char *name);
1759int ubifs_init_security(struct inode *dentry, struct inode *inode,
1760 const struct qstr *qstr);
1757 1761
1758/* super.c */ 1762/* super.c */
1759struct inode *ubifs_iget(struct super_block *sb, unsigned long inum); 1763struct inode *ubifs_iget(struct super_block *sb, unsigned long inum);
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index 5e0a63b1b0d5..a92be244a6fb 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -100,24 +100,30 @@ static const struct file_operations empty_fops;
100static int create_xattr(struct ubifs_info *c, struct inode *host, 100static int create_xattr(struct ubifs_info *c, struct inode *host,
101 const struct qstr *nm, const void *value, int size) 101 const struct qstr *nm, const void *value, int size)
102{ 102{
103 int err; 103 int err, names_len;
104 struct inode *inode; 104 struct inode *inode;
105 struct ubifs_inode *ui, *host_ui = ubifs_inode(host); 105 struct ubifs_inode *ui, *host_ui = ubifs_inode(host);
106 struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, 106 struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
107 .new_ino_d = ALIGN(size, 8), .dirtied_ino = 1, 107 .new_ino_d = ALIGN(size, 8), .dirtied_ino = 1,
108 .dirtied_ino_d = ALIGN(host_ui->data_len, 8) }; 108 .dirtied_ino_d = ALIGN(host_ui->data_len, 8) };
109 109
110 if (host_ui->xattr_cnt >= MAX_XATTRS_PER_INODE) 110 if (host_ui->xattr_cnt >= MAX_XATTRS_PER_INODE) {
111 ubifs_err("inode %lu already has too many xattrs (%d), cannot create more",
112 host->i_ino, host_ui->xattr_cnt);
111 return -ENOSPC; 113 return -ENOSPC;
114 }
112 /* 115 /*
113 * Linux limits the maximum size of the extended attribute names list 116 * Linux limits the maximum size of the extended attribute names list
114 * to %XATTR_LIST_MAX. This means we should not allow creating more 117 * to %XATTR_LIST_MAX. This means we should not allow creating more
115 * extended attributes if the name list becomes larger. This limitation 118 * extended attributes if the name list becomes larger. This limitation
116 * is artificial for UBIFS, though. 119 * is artificial for UBIFS, though.
117 */ 120 */
118 if (host_ui->xattr_names + host_ui->xattr_cnt + 121 names_len = host_ui->xattr_names + host_ui->xattr_cnt + nm->len + 1;
119 nm->len + 1 > XATTR_LIST_MAX) 122 if (names_len > XATTR_LIST_MAX) {
123 ubifs_err("cannot add one more xattr name to inode %lu, total names length would become %d, max. is %d",
124 host->i_ino, names_len, XATTR_LIST_MAX);
120 return -ENOSPC; 125 return -ENOSPC;
126 }
121 127
122 err = ubifs_budget_space(c, &req); 128 err = ubifs_budget_space(c, &req);
123 if (err) 129 if (err)
@@ -293,18 +299,16 @@ static struct inode *iget_xattr(struct ubifs_info *c, ino_t inum)
293 return ERR_PTR(-EINVAL); 299 return ERR_PTR(-EINVAL);
294} 300}
295 301
296int ubifs_setxattr(struct dentry *dentry, const char *name, 302static int setxattr(struct inode *host, const char *name, const void *value,
297 const void *value, size_t size, int flags) 303 size_t size, int flags)
298{ 304{
299 struct inode *inode, *host = dentry->d_inode; 305 struct inode *inode;
300 struct ubifs_info *c = host->i_sb->s_fs_info; 306 struct ubifs_info *c = host->i_sb->s_fs_info;
301 struct qstr nm = QSTR_INIT(name, strlen(name)); 307 struct qstr nm = QSTR_INIT(name, strlen(name));
302 struct ubifs_dent_node *xent; 308 struct ubifs_dent_node *xent;
303 union ubifs_key key; 309 union ubifs_key key;
304 int err, type; 310 int err, type;
305 311
306 dbg_gen("xattr '%s', host ino %lu ('%pd'), size %zd", name,
307 host->i_ino, dentry, size);
308 ubifs_assert(mutex_is_locked(&host->i_mutex)); 312 ubifs_assert(mutex_is_locked(&host->i_mutex));
309 313
310 if (size > UBIFS_MAX_INO_DATA) 314 if (size > UBIFS_MAX_INO_DATA)
@@ -356,6 +360,15 @@ out_free:
356 return err; 360 return err;
357} 361}
358 362
363int ubifs_setxattr(struct dentry *dentry, const char *name,
364 const void *value, size_t size, int flags)
365{
366 dbg_gen("xattr '%s', host ino %lu ('%pd'), size %zd",
367 name, dentry->d_inode->i_ino, dentry, size);
368
369 return setxattr(dentry->d_inode, name, value, size, flags);
370}
371
359ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf, 372ssize_t ubifs_getxattr(struct dentry *dentry, const char *name, void *buf,
360 size_t size) 373 size_t size)
361{ 374{
@@ -568,3 +581,84 @@ out_free:
568 kfree(xent); 581 kfree(xent);
569 return err; 582 return err;
570} 583}
584
585static size_t security_listxattr(struct dentry *d, char *list, size_t list_size,
586 const char *name, size_t name_len, int flags)
587{
588 const int prefix_len = XATTR_SECURITY_PREFIX_LEN;
589 const size_t total_len = prefix_len + name_len + 1;
590
591 if (list && total_len <= list_size) {
592 memcpy(list, XATTR_SECURITY_PREFIX, prefix_len);
593 memcpy(list + prefix_len, name, name_len);
594 list[prefix_len + name_len] = '\0';
595 }
596
597 return total_len;
598}
599
600static int security_getxattr(struct dentry *d, const char *name, void *buffer,
601 size_t size, int flags)
602{
603 return ubifs_getxattr(d, name, buffer, size);
604}
605
606static int security_setxattr(struct dentry *d, const char *name,
607 const void *value, size_t size, int flags,
608 int handler_flags)
609{
610 return ubifs_setxattr(d, name, value, size, flags);
611}
612
613static const struct xattr_handler ubifs_xattr_security_handler = {
614 .prefix = XATTR_SECURITY_PREFIX,
615 .list = security_listxattr,
616 .get = security_getxattr,
617 .set = security_setxattr,
618};
619
620const struct xattr_handler *ubifs_xattr_handlers[] = {
621 &ubifs_xattr_security_handler,
622 NULL,
623};
624
625static int init_xattrs(struct inode *inode, const struct xattr *xattr_array,
626 void *fs_info)
627{
628 const struct xattr *xattr;
629 char *name;
630 int err = 0;
631
632 for (xattr = xattr_array; xattr->name != NULL; xattr++) {
633 name = kmalloc(XATTR_SECURITY_PREFIX_LEN +
634 strlen(xattr->name) + 1, GFP_NOFS);
635 if (!name) {
636 err = -ENOMEM;
637 break;
638 }
639 strcpy(name, XATTR_SECURITY_PREFIX);
640 strcpy(name + XATTR_SECURITY_PREFIX_LEN, xattr->name);
641 err = setxattr(inode, name, xattr->value, xattr->value_len, 0);
642 kfree(name);
643 if (err < 0)
644 break;
645 }
646
647 return err;
648}
649
650int ubifs_init_security(struct inode *dentry, struct inode *inode,
651 const struct qstr *qstr)
652{
653 int err;
654
655 mutex_lock(&inode->i_mutex);
656 err = security_inode_init_security(inode, dentry, qstr,
657 &init_xattrs, 0);
658 mutex_unlock(&inode->i_mutex);
659
660 if (err)
661 ubifs_err("cannot initialize security for inode %lu, error %d",
662 inode->i_ino, err);
663 return err;
664}
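Call-flow sketch for the security-label support added above: at
create/mkdir/mknod/symlink time, ubifs_init_security() asks the active
LSM for the inode's initial xattrs and init_xattrs() stores each entry
under the "security." prefix via setxattr(). The array below mimics
what security_inode_init_security() might hand over; the label value is
purely illustrative:

	static const struct xattr demo[] = {
		{
			/* stored as "security.selinux" */
			.name      = "selinux",
			.value     = (void *)"system_u:object_r:unlabeled_t:s0",
			.value_len = sizeof("system_u:object_r:unlabeled_t:s0"),
		},
		{ .name = NULL },	/* array is NULL-terminated */
	};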
diff --git a/fs/udf/Kconfig b/fs/udf/Kconfig
index 0e0e99bd6bce..c6e17a744c3b 100644
--- a/fs/udf/Kconfig
+++ b/fs/udf/Kconfig
@@ -2,10 +2,12 @@ config UDF_FS
2 tristate "UDF file system support" 2 tristate "UDF file system support"
3 select CRC_ITU_T 3 select CRC_ITU_T
4 help 4 help
5 This is the new file system used on some CD-ROMs and DVDs. Say Y if 5 This is a file system used on some CD-ROMs and DVDs. Since the
6 you intend to mount DVD discs or CDRW's written in packet mode, or 6 file system is supported by multiple operating systems and is more
7 if written to by other UDF utilities, such as DirectCD. 7 compatible with standard unix file systems, it is also suitable for
8 Please read <file:Documentation/filesystems/udf.txt>. 8 removable USB disks. Say Y if you intend to mount DVD discs or CDRW's
9 written in packet mode, or if you want to use UDF for removable USB
10 disks. Please read <file:Documentation/filesystems/udf.txt>.
9 11
10 To compile this file system support as a module, choose M here: the 12 To compile this file system support as a module, choose M here: the
11 module will be called udf. 13 module will be called udf.
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index a012c51caffd..05e90edd1992 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -57,6 +57,7 @@ static int udf_readdir(struct file *file, struct dir_context *ctx)
57 sector_t offset; 57 sector_t offset;
58 int i, num, ret = 0; 58 int i, num, ret = 0;
59 struct extent_position epos = { NULL, 0, {0, 0} }; 59 struct extent_position epos = { NULL, 0, {0, 0} };
60 struct super_block *sb = dir->i_sb;
60 61
61 if (ctx->pos == 0) { 62 if (ctx->pos == 0) {
62 if (!dir_emit_dot(file, ctx)) 63 if (!dir_emit_dot(file, ctx))
@@ -76,16 +77,16 @@ static int udf_readdir(struct file *file, struct dir_context *ctx)
76 if (nf_pos == 0) 77 if (nf_pos == 0)
77 nf_pos = udf_ext0_offset(dir); 78 nf_pos = udf_ext0_offset(dir);
78 79
79 fibh.soffset = fibh.eoffset = nf_pos & (dir->i_sb->s_blocksize - 1); 80 fibh.soffset = fibh.eoffset = nf_pos & (sb->s_blocksize - 1);
80 if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { 81 if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
81 if (inode_bmap(dir, nf_pos >> dir->i_sb->s_blocksize_bits, 82 if (inode_bmap(dir, nf_pos >> sb->s_blocksize_bits,
82 &epos, &eloc, &elen, &offset) 83 &epos, &eloc, &elen, &offset)
83 != (EXT_RECORDED_ALLOCATED >> 30)) { 84 != (EXT_RECORDED_ALLOCATED >> 30)) {
84 ret = -ENOENT; 85 ret = -ENOENT;
85 goto out; 86 goto out;
86 } 87 }
87 block = udf_get_lb_pblock(dir->i_sb, &eloc, offset); 88 block = udf_get_lb_pblock(sb, &eloc, offset);
88 if ((++offset << dir->i_sb->s_blocksize_bits) < elen) { 89 if ((++offset << sb->s_blocksize_bits) < elen) {
89 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) 90 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
90 epos.offset -= sizeof(struct short_ad); 91 epos.offset -= sizeof(struct short_ad);
91 else if (iinfo->i_alloc_type == 92 else if (iinfo->i_alloc_type ==
@@ -95,18 +96,18 @@ static int udf_readdir(struct file *file, struct dir_context *ctx)
95 offset = 0; 96 offset = 0;
96 } 97 }
97 98
98 if (!(fibh.sbh = fibh.ebh = udf_tread(dir->i_sb, block))) { 99 if (!(fibh.sbh = fibh.ebh = udf_tread(sb, block))) {
99 ret = -EIO; 100 ret = -EIO;
100 goto out; 101 goto out;
101 } 102 }
102 103
103 if (!(offset & ((16 >> (dir->i_sb->s_blocksize_bits - 9)) - 1))) { 104 if (!(offset & ((16 >> (sb->s_blocksize_bits - 9)) - 1))) {
104 i = 16 >> (dir->i_sb->s_blocksize_bits - 9); 105 i = 16 >> (sb->s_blocksize_bits - 9);
105 if (i + offset > (elen >> dir->i_sb->s_blocksize_bits)) 106 if (i + offset > (elen >> sb->s_blocksize_bits))
106 i = (elen >> dir->i_sb->s_blocksize_bits) - offset; 107 i = (elen >> sb->s_blocksize_bits) - offset;
107 for (num = 0; i > 0; i--) { 108 for (num = 0; i > 0; i--) {
108 block = udf_get_lb_pblock(dir->i_sb, &eloc, offset + i); 109 block = udf_get_lb_pblock(sb, &eloc, offset + i);
109 tmp = udf_tgetblk(dir->i_sb, block); 110 tmp = udf_tgetblk(sb, block);
110 if (tmp && !buffer_uptodate(tmp) && !buffer_locked(tmp)) 111 if (tmp && !buffer_uptodate(tmp) && !buffer_locked(tmp))
111 bha[num++] = tmp; 112 bha[num++] = tmp;
112 else 113 else
@@ -152,12 +153,12 @@ static int udf_readdir(struct file *file, struct dir_context *ctx)
152 } 153 }
153 154
154 if ((cfi.fileCharacteristics & FID_FILE_CHAR_DELETED) != 0) { 155 if ((cfi.fileCharacteristics & FID_FILE_CHAR_DELETED) != 0) {
155 if (!UDF_QUERY_FLAG(dir->i_sb, UDF_FLAG_UNDELETE)) 156 if (!UDF_QUERY_FLAG(sb, UDF_FLAG_UNDELETE))
156 continue; 157 continue;
157 } 158 }
158 159
159 if ((cfi.fileCharacteristics & FID_FILE_CHAR_HIDDEN) != 0) { 160 if ((cfi.fileCharacteristics & FID_FILE_CHAR_HIDDEN) != 0) {
160 if (!UDF_QUERY_FLAG(dir->i_sb, UDF_FLAG_UNHIDE)) 161 if (!UDF_QUERY_FLAG(sb, UDF_FLAG_UNHIDE))
161 continue; 162 continue;
162 } 163 }
163 164
@@ -167,12 +168,12 @@ static int udf_readdir(struct file *file, struct dir_context *ctx)
167 continue; 168 continue;
168 } 169 }
169 170
170 flen = udf_get_filename(dir->i_sb, nameptr, fname, lfi); 171 flen = udf_get_filename(sb, nameptr, lfi, fname, UDF_NAME_LEN);
171 if (!flen) 172 if (!flen)
172 continue; 173 continue;
173 174
174 tloc = lelb_to_cpu(cfi.icb.extLocation); 175 tloc = lelb_to_cpu(cfi.icb.extLocation);
175 iblock = udf_get_lb_pblock(dir->i_sb, &tloc, 0); 176 iblock = udf_get_lb_pblock(sb, &tloc, 0);
176 if (!dir_emit(ctx, fname, flen, iblock, DT_UNKNOWN)) 177 if (!dir_emit(ctx, fname, flen, iblock, DT_UNKNOWN))
177 goto out; 178 goto out;
178 } /* end while */ 179 } /* end while */
diff --git a/fs/udf/file.c b/fs/udf/file.c
index bb15771b92ae..08f3555fbeac 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -224,7 +224,7 @@ out:
224static int udf_release_file(struct inode *inode, struct file *filp) 224static int udf_release_file(struct inode *inode, struct file *filp)
225{ 225{
226 if (filp->f_mode & FMODE_WRITE && 226 if (filp->f_mode & FMODE_WRITE &&
227 atomic_read(&inode->i_writecount) > 1) { 227 atomic_read(&inode->i_writecount) == 1) {
228 /* 228 /*
229 * Grab i_mutex to avoid races with writes changing i_size 229 * Grab i_mutex to avoid races with writes changing i_size
230 * while we are running. 230 * while we are running.
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index c9b4df5810d5..a445d599098d 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -750,7 +750,7 @@ static sector_t inode_getblk(struct inode *inode, sector_t block,
750 /* Are we beyond EOF? */ 750 /* Are we beyond EOF? */
751 if (etype == -1) { 751 if (etype == -1) {
752 int ret; 752 int ret;
753 isBeyondEOF = 1; 753 isBeyondEOF = true;
754 if (count) { 754 if (count) {
755 if (c) 755 if (c)
756 laarr[0] = laarr[1]; 756 laarr[0] = laarr[1];
@@ -792,7 +792,7 @@ static sector_t inode_getblk(struct inode *inode, sector_t block,
792 endnum = c + 1; 792 endnum = c + 1;
793 lastblock = 1; 793 lastblock = 1;
794 } else { 794 } else {
795 isBeyondEOF = 0; 795 isBeyondEOF = false;
796 endnum = startnum = ((count > 2) ? 2 : count); 796 endnum = startnum = ((count > 2) ? 2 : count);
797 797
798 /* if the current extent is in position 0, 798 /* if the current extent is in position 0,
@@ -1288,6 +1288,7 @@ static int udf_read_inode(struct inode *inode, bool hidden_inode)
1288 struct kernel_lb_addr *iloc = &iinfo->i_location; 1288 struct kernel_lb_addr *iloc = &iinfo->i_location;
1289 unsigned int link_count; 1289 unsigned int link_count;
1290 unsigned int indirections = 0; 1290 unsigned int indirections = 0;
1291 int bs = inode->i_sb->s_blocksize;
1291 int ret = -EIO; 1292 int ret = -EIO;
1292 1293
1293reread: 1294reread:
@@ -1374,38 +1375,35 @@ reread:
1374 if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_EFE)) { 1375 if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_EFE)) {
1375 iinfo->i_efe = 1; 1376 iinfo->i_efe = 1;
1376 iinfo->i_use = 0; 1377 iinfo->i_use = 0;
1377 ret = udf_alloc_i_data(inode, inode->i_sb->s_blocksize - 1378 ret = udf_alloc_i_data(inode, bs -
1378 sizeof(struct extendedFileEntry)); 1379 sizeof(struct extendedFileEntry));
1379 if (ret) 1380 if (ret)
1380 goto out; 1381 goto out;
1381 memcpy(iinfo->i_ext.i_data, 1382 memcpy(iinfo->i_ext.i_data,
1382 bh->b_data + sizeof(struct extendedFileEntry), 1383 bh->b_data + sizeof(struct extendedFileEntry),
1383 inode->i_sb->s_blocksize - 1384 bs - sizeof(struct extendedFileEntry));
1384 sizeof(struct extendedFileEntry));
1385 } else if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_FE)) { 1385 } else if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_FE)) {
1386 iinfo->i_efe = 0; 1386 iinfo->i_efe = 0;
1387 iinfo->i_use = 0; 1387 iinfo->i_use = 0;
1388 ret = udf_alloc_i_data(inode, inode->i_sb->s_blocksize - 1388 ret = udf_alloc_i_data(inode, bs - sizeof(struct fileEntry));
1389 sizeof(struct fileEntry));
1390 if (ret) 1389 if (ret)
1391 goto out; 1390 goto out;
1392 memcpy(iinfo->i_ext.i_data, 1391 memcpy(iinfo->i_ext.i_data,
1393 bh->b_data + sizeof(struct fileEntry), 1392 bh->b_data + sizeof(struct fileEntry),
1394 inode->i_sb->s_blocksize - sizeof(struct fileEntry)); 1393 bs - sizeof(struct fileEntry));
1395 } else if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_USE)) { 1394 } else if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_USE)) {
1396 iinfo->i_efe = 0; 1395 iinfo->i_efe = 0;
1397 iinfo->i_use = 1; 1396 iinfo->i_use = 1;
1398 iinfo->i_lenAlloc = le32_to_cpu( 1397 iinfo->i_lenAlloc = le32_to_cpu(
1399 ((struct unallocSpaceEntry *)bh->b_data)-> 1398 ((struct unallocSpaceEntry *)bh->b_data)->
1400 lengthAllocDescs); 1399 lengthAllocDescs);
1401 ret = udf_alloc_i_data(inode, inode->i_sb->s_blocksize - 1400 ret = udf_alloc_i_data(inode, bs -
1402 sizeof(struct unallocSpaceEntry)); 1401 sizeof(struct unallocSpaceEntry));
1403 if (ret) 1402 if (ret)
1404 goto out; 1403 goto out;
1405 memcpy(iinfo->i_ext.i_data, 1404 memcpy(iinfo->i_ext.i_data,
1406 bh->b_data + sizeof(struct unallocSpaceEntry), 1405 bh->b_data + sizeof(struct unallocSpaceEntry),
1407 inode->i_sb->s_blocksize - 1406 bs - sizeof(struct unallocSpaceEntry));
1408 sizeof(struct unallocSpaceEntry));
1409 return 0; 1407 return 0;
1410 } 1408 }
1411 1409
@@ -1489,6 +1487,28 @@ reread:
1489 } 1487 }
1490 inode->i_generation = iinfo->i_unique; 1488 inode->i_generation = iinfo->i_unique;
1491 1489
1490 /*
1491 * Sanity check length of allocation descriptors and extended attrs to
1492 * avoid integer overflows
1493 */
1494 if (iinfo->i_lenEAttr > bs || iinfo->i_lenAlloc > bs)
1495 goto out;
1496 /* Now do exact checks */
1497 if (udf_file_entry_alloc_offset(inode) + iinfo->i_lenAlloc > bs)
1498 goto out;
1499 /* Sanity checks for files in ICB so that we don't get confused later */
1500 if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) {
1501 /*
1502 * For file in ICB data is stored in allocation descriptor
1503 * so sizes should match
1504 */
1505 if (iinfo->i_lenAlloc != inode->i_size)
1506 goto out;
1507 /* File in ICB has to fit in there... */
1508 if (inode->i_size > bs - udf_file_entry_alloc_offset(inode))
1509 goto out;
1510 }
1511
1492 switch (fe->icbTag.fileType) { 1512 switch (fe->icbTag.fileType) {
1493 case ICBTAG_FILE_TYPE_DIRECTORY: 1513 case ICBTAG_FILE_TYPE_DIRECTORY:
1494 inode->i_op = &udf_dir_inode_operations; 1514 inode->i_op = &udf_dir_inode_operations;
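Worked numbers for the new in-ICB sanity checks (illustrative; the real
offset depends on the entry type and EA length): with a 2048-byte block
and udf_file_entry_alloc_offset(inode) == 176,

	/*
	 * max in-ICB file size = bs - alloc_offset = 2048 - 176 = 1872
	 * require: i_lenAlloc == i_size  &&  i_size <= 1872
	 * anything else marks a corrupted (or hostile) image, and the
	 * inode is rejected before the sizes can confuse later math.
	 */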
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index c12e260fd6c4..33b246b82c98 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -159,18 +159,19 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir,
159 struct udf_inode_info *dinfo = UDF_I(dir); 159 struct udf_inode_info *dinfo = UDF_I(dir);
160 int isdotdot = child->len == 2 && 160 int isdotdot = child->len == 2 &&
161 child->name[0] == '.' && child->name[1] == '.'; 161 child->name[0] == '.' && child->name[1] == '.';
162 struct super_block *sb = dir->i_sb;
162 163
163 size = udf_ext0_offset(dir) + dir->i_size; 164 size = udf_ext0_offset(dir) + dir->i_size;
164 f_pos = udf_ext0_offset(dir); 165 f_pos = udf_ext0_offset(dir);
165 166
166 fibh->sbh = fibh->ebh = NULL; 167 fibh->sbh = fibh->ebh = NULL;
167 fibh->soffset = fibh->eoffset = f_pos & (dir->i_sb->s_blocksize - 1); 168 fibh->soffset = fibh->eoffset = f_pos & (sb->s_blocksize - 1);
168 if (dinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { 169 if (dinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
169 if (inode_bmap(dir, f_pos >> dir->i_sb->s_blocksize_bits, &epos, 170 if (inode_bmap(dir, f_pos >> sb->s_blocksize_bits, &epos,
170 &eloc, &elen, &offset) != (EXT_RECORDED_ALLOCATED >> 30)) 171 &eloc, &elen, &offset) != (EXT_RECORDED_ALLOCATED >> 30))
171 goto out_err; 172 goto out_err;
172 block = udf_get_lb_pblock(dir->i_sb, &eloc, offset); 173 block = udf_get_lb_pblock(sb, &eloc, offset);
173 if ((++offset << dir->i_sb->s_blocksize_bits) < elen) { 174 if ((++offset << sb->s_blocksize_bits) < elen) {
174 if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) 175 if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
175 epos.offset -= sizeof(struct short_ad); 176 epos.offset -= sizeof(struct short_ad);
176 else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG) 177 else if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_LONG)
@@ -178,7 +179,7 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir,
178 } else 179 } else
179 offset = 0; 180 offset = 0;
180 181
181 fibh->sbh = fibh->ebh = udf_tread(dir->i_sb, block); 182 fibh->sbh = fibh->ebh = udf_tread(sb, block);
182 if (!fibh->sbh) 183 if (!fibh->sbh)
183 goto out_err; 184 goto out_err;
184 } 185 }
@@ -217,12 +218,12 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir,
217 } 218 }
218 219
219 if ((cfi->fileCharacteristics & FID_FILE_CHAR_DELETED) != 0) { 220 if ((cfi->fileCharacteristics & FID_FILE_CHAR_DELETED) != 0) {
220 if (!UDF_QUERY_FLAG(dir->i_sb, UDF_FLAG_UNDELETE)) 221 if (!UDF_QUERY_FLAG(sb, UDF_FLAG_UNDELETE))
221 continue; 222 continue;
222 } 223 }
223 224
224 if ((cfi->fileCharacteristics & FID_FILE_CHAR_HIDDEN) != 0) { 225 if ((cfi->fileCharacteristics & FID_FILE_CHAR_HIDDEN) != 0) {
225 if (!UDF_QUERY_FLAG(dir->i_sb, UDF_FLAG_UNHIDE)) 226 if (!UDF_QUERY_FLAG(sb, UDF_FLAG_UNHIDE))
226 continue; 227 continue;
227 } 228 }
228 229
@@ -233,7 +234,7 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir,
233 if (!lfi) 234 if (!lfi)
234 continue; 235 continue;
235 236
236 flen = udf_get_filename(dir->i_sb, nameptr, fname, lfi); 237 flen = udf_get_filename(sb, nameptr, lfi, fname, UDF_NAME_LEN);
237 if (flen && udf_match(flen, fname, child->len, child->name)) 238 if (flen && udf_match(flen, fname, child->len, child->name))
238 goto out_ok; 239 goto out_ok;
239 } 240 }
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 3ccb2f11fc76..f169411c4ea0 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -1599,7 +1599,7 @@ static noinline int udf_process_sequence(
1599 struct udf_vds_record *curr; 1599 struct udf_vds_record *curr;
1600 struct generic_desc *gd; 1600 struct generic_desc *gd;
1601 struct volDescPtr *vdp; 1601 struct volDescPtr *vdp;
1602 int done = 0; 1602 bool done = false;
1603 uint32_t vdsn; 1603 uint32_t vdsn;
1604 uint16_t ident; 1604 uint16_t ident;
1605 long next_s = 0, next_e = 0; 1605 long next_s = 0, next_e = 0;
@@ -1680,7 +1680,7 @@ static noinline int udf_process_sequence(
1680 lastblock = next_e; 1680 lastblock = next_e;
1681 next_s = next_e = 0; 1681 next_s = next_e = 0;
1682 } else 1682 } else
1683 done = 1; 1683 done = true;
1684 break; 1684 break;
1685 } 1685 }
1686 brelse(bh); 1686 brelse(bh);
@@ -2300,6 +2300,7 @@ static void udf_put_super(struct super_block *sb)
 	udf_close_lvid(sb);
 	brelse(sbi->s_lvid_bh);
 	udf_sb_free_partitions(sb);
+	mutex_destroy(&sbi->s_alloc_mutex);
 	kfree(sb->s_fs_info);
 	sb->s_fs_info = NULL;
 }
diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c
index 6fb7945c1e6e..ac10ca939f26 100644
--- a/fs/udf/symlink.c
+++ b/fs/udf/symlink.c
@@ -30,49 +30,73 @@
 #include <linux/buffer_head.h>
 #include "udf_i.h"
 
-static void udf_pc_to_char(struct super_block *sb, unsigned char *from,
-			   int fromlen, unsigned char *to)
+static int udf_pc_to_char(struct super_block *sb, unsigned char *from,
+			  int fromlen, unsigned char *to, int tolen)
 {
 	struct pathComponent *pc;
 	int elen = 0;
+	int comp_len;
 	unsigned char *p = to;
 
+	/* Reserve one byte for terminating \0 */
+	tolen--;
 	while (elen < fromlen) {
 		pc = (struct pathComponent *)(from + elen);
+		elen += sizeof(struct pathComponent);
 		switch (pc->componentType) {
 		case 1:
 			/*
 			 * Symlink points to some place which should be agreed
 			 * upon between originator and receiver of the media. Ignore.
 			 */
-			if (pc->lengthComponentIdent > 0)
+			if (pc->lengthComponentIdent > 0) {
+				elen += pc->lengthComponentIdent;
 				break;
+			}
 			/* Fall through */
 		case 2:
+			if (tolen == 0)
+				return -ENAMETOOLONG;
 			p = to;
 			*p++ = '/';
+			tolen--;
 			break;
 		case 3:
+			if (tolen < 3)
+				return -ENAMETOOLONG;
 			memcpy(p, "../", 3);
 			p += 3;
+			tolen -= 3;
 			break;
 		case 4:
+			if (tolen < 2)
+				return -ENAMETOOLONG;
 			memcpy(p, "./", 2);
 			p += 2;
+			tolen -= 2;
 			/* that would be . - just ignore */
 			break;
 		case 5:
-			p += udf_get_filename(sb, pc->componentIdent, p,
-					      pc->lengthComponentIdent);
+			elen += pc->lengthComponentIdent;
+			if (elen > fromlen)
+				return -EIO;
+			comp_len = udf_get_filename(sb, pc->componentIdent,
+						    pc->lengthComponentIdent,
+						    p, tolen);
+			p += comp_len;
+			tolen -= comp_len;
+			if (tolen == 0)
+				return -ENAMETOOLONG;
 			*p++ = '/';
+			tolen--;
 			break;
 		}
-		elen += sizeof(struct pathComponent) + pc->lengthComponentIdent;
 	}
 	if (p > to + 1)
 		p[-1] = '\0';
 	else
 		p[0] = '\0';
+	return 0;
 }
 
 static int udf_symlink_filler(struct file *file, struct page *page)
@@ -80,11 +104,17 @@ static int udf_symlink_filler(struct file *file, struct page *page)
 	struct inode *inode = page->mapping->host;
 	struct buffer_head *bh = NULL;
 	unsigned char *symlink;
-	int err = -EIO;
+	int err;
 	unsigned char *p = kmap(page);
 	struct udf_inode_info *iinfo;
 	uint32_t pos;
 
+	/* We don't support symlinks longer than one block */
+	if (inode->i_size > inode->i_sb->s_blocksize) {
+		err = -ENAMETOOLONG;
+		goto out_unmap;
+	}
+
 	iinfo = UDF_I(inode);
 	pos = udf_block_map(inode, 0);
 
@@ -94,14 +124,18 @@ static int udf_symlink_filler(struct file *file, struct page *page)
 	} else {
 		bh = sb_bread(inode->i_sb, pos);
 
-		if (!bh)
-			goto out;
+		if (!bh) {
+			err = -EIO;
+			goto out_unlock_inode;
+		}
 
 		symlink = bh->b_data;
 	}
 
-	udf_pc_to_char(inode->i_sb, symlink, inode->i_size, p);
+	err = udf_pc_to_char(inode->i_sb, symlink, inode->i_size, p, PAGE_SIZE);
 	brelse(bh);
+	if (err)
+		goto out_unlock_inode;
 
 	up_read(&iinfo->i_data_sem);
 	SetPageUptodate(page);
@@ -109,9 +143,10 @@ static int udf_symlink_filler(struct file *file, struct page *page)
 	unlock_page(page);
 	return 0;
 
-out:
+out_unlock_inode:
 	up_read(&iinfo->i_data_sem);
 	SetPageError(page);
+out_unmap:
 	kunmap(page);
 	unlock_page(page);
 	return err;
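The symlink.c hunks above convert udf_pc_to_char() from an unchecked copy into a bounds-tracked translation that returns -ENAMETOOLONG (or -EIO for a corrupt source) instead of overrunning the destination page. A minimal userspace sketch of the same accounting pattern; append_component() and its parameters are hypothetical, not the kernel API:

#include <errno.h>
#include <string.h>

/*
 * Append a path component plus a '/' separator, decrementing *avail the
 * way the patched udf_pc_to_char() decrements tolen: every write is
 * preceded by a space check, so the destination buffer can never overflow.
 */
static int append_component(char **dst, int *avail,
			    const char *src, int srclen)
{
	if (*avail < srclen + 1)	/* component + '/' */
		return -ENAMETOOLONG;
	memcpy(*dst, src, srclen);
	*dst += srclen;
	*(*dst)++ = '/';
	*avail -= srclen + 1;
	return 0;
}

The caller seeds *avail with the buffer size minus one, mirroring the "reserve one byte for the terminating \0" step in the patch.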
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index 1cc3c993ebd0..47bb3f5ca360 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -211,7 +211,8 @@ udf_get_lb_pblock(struct super_block *sb, struct kernel_lb_addr *loc,
 }
 
 /* unicode.c */
-extern int udf_get_filename(struct super_block *, uint8_t *, uint8_t *, int);
+extern int udf_get_filename(struct super_block *, uint8_t *, int, uint8_t *,
+			int);
 extern int udf_put_filename(struct super_block *, const uint8_t *, uint8_t *,
 			int);
 extern int udf_build_ustr(struct ustr *, dstring *, int);
diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c
index afd470e588ff..b84fee372734 100644
--- a/fs/udf/unicode.c
+++ b/fs/udf/unicode.c
@@ -28,7 +28,8 @@
 
 #include "udf_sb.h"
 
-static int udf_translate_to_linux(uint8_t *, uint8_t *, int, uint8_t *, int);
+static int udf_translate_to_linux(uint8_t *, int, uint8_t *, int, uint8_t *,
+				  int);
 
 static int udf_char_to_ustr(struct ustr *dest, const uint8_t *src, int strlen)
 {
@@ -333,8 +334,8 @@ try_again:
 	return u_len + 1;
 }
 
-int udf_get_filename(struct super_block *sb, uint8_t *sname, uint8_t *dname,
-		     int flen)
+int udf_get_filename(struct super_block *sb, uint8_t *sname, int slen,
+		     uint8_t *dname, int dlen)
 {
 	struct ustr *filename, *unifilename;
 	int len = 0;
@@ -347,7 +348,7 @@ int udf_get_filename(struct super_block *sb, uint8_t *sname, uint8_t *dname,
 	if (!unifilename)
 		goto out1;
 
-	if (udf_build_ustr_exact(unifilename, sname, flen))
+	if (udf_build_ustr_exact(unifilename, sname, slen))
 		goto out2;
 
 	if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) {
@@ -366,7 +367,8 @@ int udf_get_filename(struct super_block *sb, uint8_t *sname, uint8_t *dname,
 	} else
 		goto out2;
 
-	len = udf_translate_to_linux(dname, filename->u_name, filename->u_len,
+	len = udf_translate_to_linux(dname, dlen,
+				     filename->u_name, filename->u_len,
 				     unifilename->u_name, unifilename->u_len);
 out2:
 	kfree(unifilename);
@@ -403,10 +405,12 @@ int udf_put_filename(struct super_block *sb, const uint8_t *sname,
 #define EXT_MARK '.'
 #define CRC_MARK '#'
 #define EXT_SIZE 5
+/* Number of chars we need to store generated CRC to make filename unique */
+#define CRC_LEN 5
 
-static int udf_translate_to_linux(uint8_t *newName, uint8_t *udfName,
-				  int udfLen, uint8_t *fidName,
-				  int fidNameLen)
+static int udf_translate_to_linux(uint8_t *newName, int newLen,
+				  uint8_t *udfName, int udfLen,
+				  uint8_t *fidName, int fidNameLen)
 {
 	int index, newIndex = 0, needsCRC = 0;
 	int extIndex = 0, newExtIndex = 0, hasExt = 0;
@@ -439,7 +443,7 @@ static int udf_translate_to_linux(uint8_t *newName, uint8_t *udfName,
 				newExtIndex = newIndex;
 			}
 		}
-		if (newIndex < 256)
+		if (newIndex < newLen)
 			newName[newIndex++] = curr;
 		else
 			needsCRC = 1;
@@ -467,13 +471,13 @@ static int udf_translate_to_linux(uint8_t *newName, uint8_t *udfName,
 			}
 			ext[localExtIndex++] = curr;
 		}
-		maxFilenameLen = 250 - localExtIndex;
+		maxFilenameLen = newLen - CRC_LEN - localExtIndex;
 		if (newIndex > maxFilenameLen)
 			newIndex = maxFilenameLen;
 		else
 			newIndex = newExtIndex;
-	} else if (newIndex > 250)
-		newIndex = 250;
+	} else if (newIndex > newLen - CRC_LEN)
+		newIndex = newLen - CRC_LEN;
 	newName[newIndex++] = CRC_MARK;
 	valueCRC = crc_itu_t(0, fidName, fidNameLen);
 	newName[newIndex++] = hex_asc_upper_hi(valueCRC >> 8);
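The unicode.c hunks parameterise udf_translate_to_linux() on the real destination size (newLen) in place of the hard-coded 256/250 limits, reserving CRC_LEN characters for the '#' marker plus a four-digit checksum that keeps truncated names unique. A self-contained sketch of that truncate-and-tag arithmetic; the toy crc16() merely stands in for the kernel's crc_itu_t(), and all names are hypothetical:

#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define CRC_MARK '#'
#define CRC_LEN  5	/* '#' plus four hex digits, as in the patch */

static uint16_t crc16(const uint8_t *p, size_t len)
{
	uint16_t crc = 0;

	while (len--)
		crc = (crc << 5) ^ (crc >> 11) ^ *p++;	/* toy mix, not ITU-T */
	return crc;
}

/*
 * Copy src into dst (dstlen bytes, dstlen > CRC_LEN + 1). If it does not
 * fit, keep a prefix and append CRC_MARK plus a checksum of the original,
 * so two long names that truncate identically still end up distinct.
 */
static void mangle_name(char *dst, int dstlen, const char *src)
{
	int len = (int)strlen(src);

	if (len < dstlen) {
		strcpy(dst, src);
		return;
	}
	len = dstlen - CRC_LEN - 1;		/* room for suffix + NUL */
	memcpy(dst, src, len);
	sprintf(dst + len, "%c%04X", CRC_MARK,
		(unsigned)crc16((const uint8_t *)src, strlen(src)));
}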
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index da73801301d5..8092d3759a5e 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -95,22 +95,18 @@
 
 void lock_ufs(struct super_block *sb)
 {
-#if defined(CONFIG_SMP) || defined (CONFIG_PREEMPT)
 	struct ufs_sb_info *sbi = UFS_SB(sb);
 
 	mutex_lock(&sbi->mutex);
 	sbi->mutex_owner = current;
-#endif
 }
 
 void unlock_ufs(struct super_block *sb)
 {
-#if defined(CONFIG_SMP) || defined (CONFIG_PREEMPT)
 	struct ufs_sb_info *sbi = UFS_SB(sb);
 
 	sbi->mutex_owner = NULL;
 	mutex_unlock(&sbi->mutex);
-#endif
 }
 
 static struct inode *ufs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation)
@@ -1415,9 +1411,11 @@ static struct kmem_cache * ufs_inode_cachep;
 static struct inode *ufs_alloc_inode(struct super_block *sb)
 {
 	struct ufs_inode_info *ei;
-	ei = (struct ufs_inode_info *)kmem_cache_alloc(ufs_inode_cachep, GFP_NOFS);
+
+	ei = kmem_cache_alloc(ufs_inode_cachep, GFP_NOFS);
 	if (!ei)
 		return NULL;
+
 	ei->vfs_inode.i_version = 1;
 	return &ei->vfs_inode;
 }
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index d61799949580..df6828570e87 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -121,3 +121,4 @@ xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o
 xfs-$(CONFIG_PROC_FS)		+= xfs_stats.o
 xfs-$(CONFIG_SYSCTL)		+= xfs_sysctl.o
 xfs-$(CONFIG_COMPAT)		+= xfs_ioctl32.o
+xfs-$(CONFIG_NFSD_PNFS)		+= xfs_pnfs.o
diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c
index 53e95b2a1369..a7a3a63bb360 100644
--- a/fs/xfs/kmem.c
+++ b/fs/xfs/kmem.c
@@ -91,16 +91,6 @@ kmem_zalloc_large(size_t size, xfs_km_flags_t flags)
 	return ptr;
 }
 
-void
-kmem_free(const void *ptr)
-{
-	if (!is_vmalloc_addr(ptr)) {
-		kfree(ptr);
-	} else {
-		vfree(ptr);
-	}
-}
-
 void *
 kmem_realloc(const void *ptr, size_t newsize, size_t oldsize,
 	     xfs_km_flags_t flags)
diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h
index 64db0e53edea..cc6b768fc068 100644
--- a/fs/xfs/kmem.h
+++ b/fs/xfs/kmem.h
@@ -63,7 +63,10 @@ kmem_flags_convert(xfs_km_flags_t flags)
 extern void *kmem_alloc(size_t, xfs_km_flags_t);
 extern void *kmem_zalloc_large(size_t size, xfs_km_flags_t);
 extern void *kmem_realloc(const void *, size_t, size_t, xfs_km_flags_t);
-extern void  kmem_free(const void *);
+static inline void  kmem_free(const void *ptr)
+{
+	kvfree(ptr);
+}
 
 
 extern void *kmem_zalloc_greedy(size_t *, size_t, size_t);
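kmem_free() can collapse to a one-line kvfree() wrapper because the kernel classifies a pointer by its address and routes it to whichever allocator owns it. A userspace sketch of that single-entry-point dispatch, assuming a toy arena whose range test stands in for is_vmalloc_addr(); every name here is hypothetical:

#include <stdbool.h>
#include <stdlib.h>

static char arena[1 << 16];	/* pretend "vmalloc" region */
static size_t arena_used;

static void *arena_alloc(size_t size)	/* bump allocator, freed wholesale */
{
	void *p = &arena[arena_used];

	arena_used += size;
	return p;
}

/* Address-range test, the moral equivalent of is_vmalloc_addr(). */
static bool from_arena(const void *ptr)
{
	return (const char *)ptr >= arena &&
	       (const char *)ptr < arena + sizeof(arena);
}

/* One free for both allocators, the role kvfree() plays in the hunk. */
static void mem_free(void *ptr)
{
	if (!from_arena(ptr))
		free(ptr);	/* ordinary heap allocation */
	/* arena memory is reclaimed wholesale; nothing to do here */
}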
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index 5d38e8b8a913..15105dbc9e28 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -403,7 +403,7 @@ xfs_sbversion_add_attr2(xfs_mount_t *mp, xfs_trans_t *tp)
 		if (!xfs_sb_version_hasattr2(&mp->m_sb)) {
 			xfs_sb_version_addattr2(&mp->m_sb);
 			spin_unlock(&mp->m_sb_lock);
-			xfs_mod_sb(tp, XFS_SB_VERSIONNUM | XFS_SB_FEATURES2);
+			xfs_log_sb(tp);
 		} else
 			spin_unlock(&mp->m_sb_lock);
 	}
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index b5eb4743f75a..61ec015dca16 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -973,7 +973,11 @@ xfs_bmap_local_to_extents(
 	*firstblock = args.fsbno;
 	bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0);
 
-	/* initialise the block and copy the data */
+	/*
+	 * Initialise the block and copy the data
+	 *
+	 * Note: init_fn must set the buffer log item type correctly!
+	 */
 	init_fn(tp, bp, ip, ifp);
 
 	/* account for the change in fork size and log everything */
@@ -1221,22 +1225,20 @@ xfs_bmap_add_attrfork(
 			goto bmap_cancel;
 		if (!xfs_sb_version_hasattr(&mp->m_sb) ||
 		   (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2)) {
-			__int64_t sbfields = 0;
+			bool log_sb = false;
 
 			spin_lock(&mp->m_sb_lock);
 			if (!xfs_sb_version_hasattr(&mp->m_sb)) {
 				xfs_sb_version_addattr(&mp->m_sb);
-				sbfields |= XFS_SB_VERSIONNUM;
+				log_sb = true;
 			}
 			if (!xfs_sb_version_hasattr2(&mp->m_sb) && version == 2) {
 				xfs_sb_version_addattr2(&mp->m_sb);
-				sbfields |= (XFS_SB_VERSIONNUM | XFS_SB_FEATURES2);
+				log_sb = true;
 			}
-			if (sbfields) {
-				spin_unlock(&mp->m_sb_lock);
-				xfs_mod_sb(tp, sbfields);
-			} else
-				spin_unlock(&mp->m_sb_lock);
+			spin_unlock(&mp->m_sb_lock);
+			if (log_sb)
+				xfs_log_sb(tp);
 		}
 
 	error = xfs_bmap_finish(&tp, &flist, &committed);
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index 44db6db86402..b9d8a499d2c4 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -28,6 +28,37 @@ struct xfs_trans;
 extern kmem_zone_t	*xfs_bmap_free_item_zone;
 
 /*
+ * Argument structure for xfs_bmap_alloc.
+ */
+struct xfs_bmalloca {
+	xfs_fsblock_t		*firstblock; /* i/o first block allocated */
+	struct xfs_bmap_free	*flist;	/* bmap freelist */
+	struct xfs_trans	*tp;	/* transaction pointer */
+	struct xfs_inode	*ip;	/* incore inode pointer */
+	struct xfs_bmbt_irec	prev;	/* extent before the new one */
+	struct xfs_bmbt_irec	got;	/* extent after, or delayed */
+
+	xfs_fileoff_t		offset;	/* offset in file filling in */
+	xfs_extlen_t		length;	/* i/o length asked/allocated */
+	xfs_fsblock_t		blkno;	/* starting block of new extent */
+
+	struct xfs_btree_cur	*cur;	/* btree cursor */
+	xfs_extnum_t		idx;	/* current extent index */
+	int			nallocs;/* number of extents alloc'd */
+	int			logflags;/* flags for transaction logging */
+
+	xfs_extlen_t		total;	/* total blocks needed for xaction */
+	xfs_extlen_t		minlen;	/* minimum allocation size (blocks) */
+	xfs_extlen_t		minleft; /* amount must be left after alloc */
+	bool			eof;	/* set if allocating past last extent */
+	bool			wasdel;	/* replacing a delayed allocation */
+	bool			userdata;/* set if is user data */
+	bool			aeof;	/* allocated space at eof */
+	bool			conv;	/* overwriting unwritten extents */
+	int			flags;
+};
+
+/*
  * List of extents to be free "later".
  * The list is kept sorted on xbf_startblock.
  */
@@ -149,6 +180,8 @@ void xfs_bmap_local_to_extents_empty(struct xfs_inode *ip, int whichfork);
 void	xfs_bmap_add_free(xfs_fsblock_t bno, xfs_filblks_t len,
 		struct xfs_bmap_free *flist, struct xfs_mount *mp);
 void	xfs_bmap_cancel(struct xfs_bmap_free *flist);
+int	xfs_bmap_finish(struct xfs_trans **tp, struct xfs_bmap_free *flist,
+			int *committed);
 void	xfs_bmap_compute_maxlevels(struct xfs_mount *mp, int whichfork);
 int	xfs_bmap_first_unused(struct xfs_trans *tp, struct xfs_inode *ip,
 		xfs_extlen_t len, xfs_fileoff_t *unused, int whichfork);
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index fbd6da263571..8eb718979383 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -151,10 +151,13 @@ typedef struct xfs_sb {
 	__uint32_t	sb_features2;	/* additional feature bits */
 
 	/*
-	 * bad features2 field as a result of failing to pad the sb
-	 * structure to 64 bits. Some machines will be using this field
-	 * for features2 bits. Easiest just to mark it bad and not use
-	 * it for anything else.
+	 * bad features2 field as a result of failing to pad the sb structure to
+	 * 64 bits. Some machines will be using this field for features2 bits.
+	 * Easiest just to mark it bad and not use it for anything else.
+	 *
+	 * This is not kept up to date in memory; it is always overwritten by
+	 * the value in sb_features2 when formatting the incore superblock to
+	 * the disk buffer.
 	 */
 	__uint32_t	sb_bad_features2;
 
@@ -304,8 +307,8 @@ typedef enum {
 #define XFS_SB_ICOUNT		XFS_SB_MVAL(ICOUNT)
 #define XFS_SB_IFREE		XFS_SB_MVAL(IFREE)
 #define XFS_SB_FDBLOCKS		XFS_SB_MVAL(FDBLOCKS)
-#define XFS_SB_FEATURES2	XFS_SB_MVAL(FEATURES2)
-#define XFS_SB_BAD_FEATURES2	XFS_SB_MVAL(BAD_FEATURES2)
+#define XFS_SB_FEATURES2	(XFS_SB_MVAL(FEATURES2) | \
+				 XFS_SB_MVAL(BAD_FEATURES2))
 #define XFS_SB_FEATURES_COMPAT	XFS_SB_MVAL(FEATURES_COMPAT)
 #define XFS_SB_FEATURES_RO_COMPAT XFS_SB_MVAL(FEATURES_RO_COMPAT)
 #define XFS_SB_FEATURES_INCOMPAT XFS_SB_MVAL(FEATURES_INCOMPAT)
@@ -319,9 +322,9 @@ typedef enum {
 	 XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO | XFS_SB_GQUOTINO | \
 	 XFS_SB_QFLAGS | XFS_SB_SHARED_VN | XFS_SB_UNIT | XFS_SB_WIDTH | \
 	 XFS_SB_ICOUNT | XFS_SB_IFREE | XFS_SB_FDBLOCKS | XFS_SB_FEATURES2 | \
-	 XFS_SB_BAD_FEATURES2 | XFS_SB_FEATURES_COMPAT | \
-	 XFS_SB_FEATURES_RO_COMPAT | XFS_SB_FEATURES_INCOMPAT | \
-	 XFS_SB_FEATURES_LOG_INCOMPAT | XFS_SB_PQUOTINO)
+	 XFS_SB_FEATURES_COMPAT | XFS_SB_FEATURES_RO_COMPAT | \
+	 XFS_SB_FEATURES_INCOMPAT | XFS_SB_FEATURES_LOG_INCOMPAT | \
+	 XFS_SB_PQUOTINO)
 
 
 /*
@@ -453,13 +456,11 @@ static inline void xfs_sb_version_addattr2(struct xfs_sb *sbp)
 {
 	sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT;
 	sbp->sb_features2 |= XFS_SB_VERSION2_ATTR2BIT;
-	sbp->sb_bad_features2 |= XFS_SB_VERSION2_ATTR2BIT;
 }
 
 static inline void xfs_sb_version_removeattr2(struct xfs_sb *sbp)
 {
 	sbp->sb_features2 &= ~XFS_SB_VERSION2_ATTR2BIT;
-	sbp->sb_bad_features2 &= ~XFS_SB_VERSION2_ATTR2BIT;
 	if (!sbp->sb_features2)
 		sbp->sb_versionnum &= ~XFS_SB_VERSION_MOREBITSBIT;
 }
@@ -475,7 +476,6 @@ static inline void xfs_sb_version_addprojid32bit(struct xfs_sb *sbp)
 {
 	sbp->sb_versionnum |= XFS_SB_VERSION_MOREBITSBIT;
 	sbp->sb_features2 |= XFS_SB_VERSION2_PROJID32BIT;
-	sbp->sb_bad_features2 |= XFS_SB_VERSION2_PROJID32BIT;
 }
 
 /*
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 18dc721ca19f..18dc721ca19f 100644
--- a/fs/xfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 752915fa775a..b0a5fe95a3e2 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -40,69 +40,6 @@
  * Physical superblock buffer manipulations. Shared with libxfs in userspace.
  */
 
-static const struct {
-	short offset;
-	short type;	/* 0 = integer
-			 * 1 = binary / string (no translation)
-			 */
-} xfs_sb_info[] = {
-	{ offsetof(xfs_sb_t, sb_magicnum),	0 },
-	{ offsetof(xfs_sb_t, sb_blocksize),	0 },
-	{ offsetof(xfs_sb_t, sb_dblocks),	0 },
-	{ offsetof(xfs_sb_t, sb_rblocks),	0 },
-	{ offsetof(xfs_sb_t, sb_rextents),	0 },
-	{ offsetof(xfs_sb_t, sb_uuid),		1 },
-	{ offsetof(xfs_sb_t, sb_logstart),	0 },
-	{ offsetof(xfs_sb_t, sb_rootino),	0 },
-	{ offsetof(xfs_sb_t, sb_rbmino),	0 },
-	{ offsetof(xfs_sb_t, sb_rsumino),	0 },
-	{ offsetof(xfs_sb_t, sb_rextsize),	0 },
-	{ offsetof(xfs_sb_t, sb_agblocks),	0 },
-	{ offsetof(xfs_sb_t, sb_agcount),	0 },
-	{ offsetof(xfs_sb_t, sb_rbmblocks),	0 },
-	{ offsetof(xfs_sb_t, sb_logblocks),	0 },
-	{ offsetof(xfs_sb_t, sb_versionnum),	0 },
-	{ offsetof(xfs_sb_t, sb_sectsize),	0 },
-	{ offsetof(xfs_sb_t, sb_inodesize),	0 },
-	{ offsetof(xfs_sb_t, sb_inopblock),	0 },
-	{ offsetof(xfs_sb_t, sb_fname[0]),	1 },
-	{ offsetof(xfs_sb_t, sb_blocklog),	0 },
-	{ offsetof(xfs_sb_t, sb_sectlog),	0 },
-	{ offsetof(xfs_sb_t, sb_inodelog),	0 },
-	{ offsetof(xfs_sb_t, sb_inopblog),	0 },
-	{ offsetof(xfs_sb_t, sb_agblklog),	0 },
-	{ offsetof(xfs_sb_t, sb_rextslog),	0 },
-	{ offsetof(xfs_sb_t, sb_inprogress),	0 },
-	{ offsetof(xfs_sb_t, sb_imax_pct),	0 },
-	{ offsetof(xfs_sb_t, sb_icount),	0 },
-	{ offsetof(xfs_sb_t, sb_ifree),		0 },
-	{ offsetof(xfs_sb_t, sb_fdblocks),	0 },
-	{ offsetof(xfs_sb_t, sb_frextents),	0 },
-	{ offsetof(xfs_sb_t, sb_uquotino),	0 },
-	{ offsetof(xfs_sb_t, sb_gquotino),	0 },
-	{ offsetof(xfs_sb_t, sb_qflags),	0 },
-	{ offsetof(xfs_sb_t, sb_flags),		0 },
-	{ offsetof(xfs_sb_t, sb_shared_vn),	0 },
-	{ offsetof(xfs_sb_t, sb_inoalignmt),	0 },
-	{ offsetof(xfs_sb_t, sb_unit),		0 },
-	{ offsetof(xfs_sb_t, sb_width),		0 },
-	{ offsetof(xfs_sb_t, sb_dirblklog),	0 },
-	{ offsetof(xfs_sb_t, sb_logsectlog),	0 },
-	{ offsetof(xfs_sb_t, sb_logsectsize),	0 },
-	{ offsetof(xfs_sb_t, sb_logsunit),	0 },
-	{ offsetof(xfs_sb_t, sb_features2),	0 },
-	{ offsetof(xfs_sb_t, sb_bad_features2),	0 },
-	{ offsetof(xfs_sb_t, sb_features_compat),	0 },
-	{ offsetof(xfs_sb_t, sb_features_ro_compat),	0 },
-	{ offsetof(xfs_sb_t, sb_features_incompat),	0 },
-	{ offsetof(xfs_sb_t, sb_features_log_incompat),	0 },
-	{ offsetof(xfs_sb_t, sb_crc),		0 },
-	{ offsetof(xfs_sb_t, sb_pad),		0 },
-	{ offsetof(xfs_sb_t, sb_pquotino),	0 },
-	{ offsetof(xfs_sb_t, sb_lsn),		0 },
-	{ sizeof(xfs_sb_t),			0 }
-};
-
 /*
  * Reference counting access wrappers to the perag structures.
  * Because we never free per-ag structures, the only thing we
@@ -461,58 +398,49 @@ xfs_sb_from_disk(
 	__xfs_sb_from_disk(to, from, true);
 }
 
-static inline void
+static void
 xfs_sb_quota_to_disk(
-	xfs_dsb_t	*to,
-	xfs_sb_t	*from,
-	__int64_t	*fields)
+	struct xfs_dsb	*to,
+	struct xfs_sb	*from)
 {
 	__uint16_t	qflags = from->sb_qflags;
 
+	to->sb_uquotino = cpu_to_be64(from->sb_uquotino);
+	if (xfs_sb_version_has_pquotino(from)) {
+		to->sb_qflags = cpu_to_be16(from->sb_qflags);
+		to->sb_gquotino = cpu_to_be64(from->sb_gquotino);
+		to->sb_pquotino = cpu_to_be64(from->sb_pquotino);
+		return;
+	}
+
 	/*
-	 * We need to do these manipilations only if we are working
-	 * with an older version of on-disk superblock.
+	 * The in-core version of sb_qflags do not have XFS_OQUOTA_*
+	 * flags, whereas the on-disk version does. So, convert incore
+	 * XFS_{PG}QUOTA_* flags to on-disk XFS_OQUOTA_* flags.
 	 */
-	if (xfs_sb_version_has_pquotino(from))
-		return;
+	qflags &= ~(XFS_PQUOTA_ENFD | XFS_PQUOTA_CHKD |
+			XFS_GQUOTA_ENFD | XFS_GQUOTA_CHKD);
 
-	if (*fields & XFS_SB_QFLAGS) {
-		/*
-		 * The in-core version of sb_qflags do not have
-		 * XFS_OQUOTA_* flags, whereas the on-disk version
-		 * does. So, convert incore XFS_{PG}QUOTA_* flags
-		 * to on-disk XFS_OQUOTA_* flags.
-		 */
-		qflags &= ~(XFS_PQUOTA_ENFD | XFS_PQUOTA_CHKD |
-				XFS_GQUOTA_ENFD | XFS_GQUOTA_CHKD);
-
-		if (from->sb_qflags &
-				(XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD))
-			qflags |= XFS_OQUOTA_ENFD;
-		if (from->sb_qflags &
-				(XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD))
-			qflags |= XFS_OQUOTA_CHKD;
-		to->sb_qflags = cpu_to_be16(qflags);
-		*fields &= ~XFS_SB_QFLAGS;
-	}
+	if (from->sb_qflags &
+			(XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD))
+		qflags |= XFS_OQUOTA_ENFD;
+	if (from->sb_qflags &
+			(XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD))
+		qflags |= XFS_OQUOTA_CHKD;
+	to->sb_qflags = cpu_to_be16(qflags);
 
 	/*
-	 * GQUOTINO and PQUOTINO cannot be used together in versions of
-	 * superblock that do not have pquotino. from->sb_flags tells us which
-	 * quota is active and should be copied to disk. If neither are active,
-	 * make sure we write NULLFSINO to the sb_gquotino field as a quota
-	 * inode value of "0" is invalid when the XFS_SB_VERSION_QUOTA feature
-	 * bit is set.
+	 * GQUOTINO and PQUOTINO cannot be used together in versions
+	 * of superblock that do not have pquotino. from->sb_flags
+	 * tells us which quota is active and should be copied to
+	 * disk. If neither are active, we should NULL the inode.
 	 *
-	 * Note that we don't need to handle the sb_uquotino or sb_pquotino here
-	 * as they do not require any translation. Hence the main sb field loop
-	 * will write them appropriately from the in-core superblock.
+	 * In all cases, the separate pquotino must remain 0 because it
+	 * it beyond the "end" of the valid non-pquotino superblock.
 	 */
-	if ((*fields & XFS_SB_GQUOTINO) &&
-				(from->sb_qflags & XFS_GQUOTA_ACCT))
+	if (from->sb_qflags & XFS_GQUOTA_ACCT)
 		to->sb_gquotino = cpu_to_be64(from->sb_gquotino);
-	else if ((*fields & XFS_SB_PQUOTINO) &&
-				(from->sb_qflags & XFS_PQUOTA_ACCT))
+	else if (from->sb_qflags & XFS_PQUOTA_ACCT)
 		to->sb_gquotino = cpu_to_be64(from->sb_pquotino);
 	else {
 		/*
@@ -526,63 +454,78 @@ xfs_sb_quota_to_disk(
 		to->sb_gquotino = cpu_to_be64(NULLFSINO);
 	}
 
-	*fields &= ~(XFS_SB_PQUOTINO | XFS_SB_GQUOTINO);
+	to->sb_pquotino = 0;
 }
 
-/*
- * Copy in core superblock to ondisk one.
- *
- * The fields argument is mask of superblock fields to copy.
- */
 void
 xfs_sb_to_disk(
-	xfs_dsb_t	*to,
-	xfs_sb_t	*from,
-	__int64_t	fields)
+	struct xfs_dsb	*to,
+	struct xfs_sb	*from)
 {
-	xfs_caddr_t	to_ptr = (xfs_caddr_t)to;
-	xfs_caddr_t	from_ptr = (xfs_caddr_t)from;
-	xfs_sb_field_t	f;
-	int		first;
-	int		size;
-
-	ASSERT(fields);
-	if (!fields)
-		return;
+	xfs_sb_quota_to_disk(to, from);
 
-	/* We should never write the crc here, it's updated in the IO path */
-	fields &= ~XFS_SB_CRC;
-
-	xfs_sb_quota_to_disk(to, from, &fields);
-	while (fields) {
-		f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields);
-		first = xfs_sb_info[f].offset;
-		size = xfs_sb_info[f + 1].offset - first;
-
-		ASSERT(xfs_sb_info[f].type == 0 || xfs_sb_info[f].type == 1);
-
-		if (size == 1 || xfs_sb_info[f].type == 1) {
-			memcpy(to_ptr + first, from_ptr + first, size);
-		} else {
-			switch (size) {
-			case 2:
-				*(__be16 *)(to_ptr + first) =
-					cpu_to_be16(*(__u16 *)(from_ptr + first));
-				break;
-			case 4:
-				*(__be32 *)(to_ptr + first) =
-					cpu_to_be32(*(__u32 *)(from_ptr + first));
-				break;
-			case 8:
-				*(__be64 *)(to_ptr + first) =
-					cpu_to_be64(*(__u64 *)(from_ptr + first));
-				break;
-			default:
-				ASSERT(0);
-			}
-		}
-
-		fields &= ~(1LL << f);
+	to->sb_magicnum = cpu_to_be32(from->sb_magicnum);
+	to->sb_blocksize = cpu_to_be32(from->sb_blocksize);
+	to->sb_dblocks = cpu_to_be64(from->sb_dblocks);
+	to->sb_rblocks = cpu_to_be64(from->sb_rblocks);
+	to->sb_rextents = cpu_to_be64(from->sb_rextents);
+	memcpy(&to->sb_uuid, &from->sb_uuid, sizeof(to->sb_uuid));
+	to->sb_logstart = cpu_to_be64(from->sb_logstart);
+	to->sb_rootino = cpu_to_be64(from->sb_rootino);
+	to->sb_rbmino = cpu_to_be64(from->sb_rbmino);
+	to->sb_rsumino = cpu_to_be64(from->sb_rsumino);
+	to->sb_rextsize = cpu_to_be32(from->sb_rextsize);
+	to->sb_agblocks = cpu_to_be32(from->sb_agblocks);
+	to->sb_agcount = cpu_to_be32(from->sb_agcount);
+	to->sb_rbmblocks = cpu_to_be32(from->sb_rbmblocks);
+	to->sb_logblocks = cpu_to_be32(from->sb_logblocks);
+	to->sb_versionnum = cpu_to_be16(from->sb_versionnum);
+	to->sb_sectsize = cpu_to_be16(from->sb_sectsize);
+	to->sb_inodesize = cpu_to_be16(from->sb_inodesize);
+	to->sb_inopblock = cpu_to_be16(from->sb_inopblock);
+	memcpy(&to->sb_fname, &from->sb_fname, sizeof(to->sb_fname));
+	to->sb_blocklog = from->sb_blocklog;
+	to->sb_sectlog = from->sb_sectlog;
+	to->sb_inodelog = from->sb_inodelog;
+	to->sb_inopblog = from->sb_inopblog;
+	to->sb_agblklog = from->sb_agblklog;
+	to->sb_rextslog = from->sb_rextslog;
+	to->sb_inprogress = from->sb_inprogress;
+	to->sb_imax_pct = from->sb_imax_pct;
+	to->sb_icount = cpu_to_be64(from->sb_icount);
+	to->sb_ifree = cpu_to_be64(from->sb_ifree);
+	to->sb_fdblocks = cpu_to_be64(from->sb_fdblocks);
+	to->sb_frextents = cpu_to_be64(from->sb_frextents);
+
+	to->sb_flags = from->sb_flags;
+	to->sb_shared_vn = from->sb_shared_vn;
+	to->sb_inoalignmt = cpu_to_be32(from->sb_inoalignmt);
+	to->sb_unit = cpu_to_be32(from->sb_unit);
+	to->sb_width = cpu_to_be32(from->sb_width);
+	to->sb_dirblklog = from->sb_dirblklog;
+	to->sb_logsectlog = from->sb_logsectlog;
+	to->sb_logsectsize = cpu_to_be16(from->sb_logsectsize);
+	to->sb_logsunit = cpu_to_be32(from->sb_logsunit);
+
+	/*
+	 * We need to ensure that bad_features2 always matches features2.
+	 * Hence we enforce that here rather than having to remember to do it
+	 * everywhere else that updates features2.
+	 */
+	from->sb_bad_features2 = from->sb_features2;
+	to->sb_features2 = cpu_to_be32(from->sb_features2);
+	to->sb_bad_features2 = cpu_to_be32(from->sb_bad_features2);
+
+	if (xfs_sb_version_hascrc(from)) {
+		to->sb_features_compat = cpu_to_be32(from->sb_features_compat);
+		to->sb_features_ro_compat =
+				cpu_to_be32(from->sb_features_ro_compat);
+		to->sb_features_incompat =
+				cpu_to_be32(from->sb_features_incompat);
+		to->sb_features_log_incompat =
+				cpu_to_be32(from->sb_features_log_incompat);
+		to->sb_pad = 0;
+		to->sb_lsn = cpu_to_be64(from->sb_lsn);
 	}
 }
 
@@ -816,42 +759,51 @@ xfs_initialize_perag_data(
 }
 
 /*
- * xfs_mod_sb() can be used to copy arbitrary changes to the
- * in-core superblock into the superblock buffer to be logged.
- * It does not provide the higher level of locking that is
- * needed to protect the in-core superblock from concurrent
- * access.
+ * xfs_log_sb() can be used to copy arbitrary changes to the in-core superblock
+ * into the superblock buffer to be logged.  It does not provide the higher
+ * level of locking that is needed to protect the in-core superblock from
+ * concurrent access.
  */
 void
-xfs_mod_sb(xfs_trans_t *tp, __int64_t fields)
+xfs_log_sb(
+	struct xfs_trans	*tp)
 {
-	xfs_buf_t	*bp;
-	int		first;
-	int		last;
-	xfs_mount_t	*mp;
-	xfs_sb_field_t	f;
-
-	ASSERT(fields);
-	if (!fields)
-		return;
-	mp = tp->t_mountp;
-	bp = xfs_trans_getsb(tp, mp, 0);
-	first = sizeof(xfs_sb_t);
-	last = 0;
-
-	/* translate/copy */
+	struct xfs_mount	*mp = tp->t_mountp;
+	struct xfs_buf		*bp = xfs_trans_getsb(tp, mp, 0);
 
-	xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb, fields);
+	xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb);
+	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF);
+	xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsb));
+}
 
-	/* find modified range */
-	f = (xfs_sb_field_t)xfs_highbit64((__uint64_t)fields);
-	ASSERT((1LL << f) & XFS_SB_MOD_BITS);
-	last = xfs_sb_info[f + 1].offset - 1;
+/*
+ * xfs_sync_sb
+ *
+ * Sync the superblock to disk.
+ *
+ * Note that the caller is responsible for checking the frozen state of the
+ * filesystem. This procedure uses the non-blocking transaction allocator and
+ * thus will allow modifications to a frozen fs. This is required because this
+ * code can be called during the process of freezing where use of the high-level
+ * allocator would deadlock.
+ */
+int
+xfs_sync_sb(
+	struct xfs_mount	*mp,
+	bool			wait)
+{
+	struct xfs_trans	*tp;
+	int			error;
 
-	f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields);
-	ASSERT((1LL << f) & XFS_SB_MOD_BITS);
-	first = xfs_sb_info[f].offset;
+	tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_CHANGE, KM_SLEEP);
+	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_sb, 0, 0);
+	if (error) {
+		xfs_trans_cancel(tp, 0);
+		return error;
+	}
 
-	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF);
-	xfs_trans_log_buf(tp, bp, first, last);
+	xfs_log_sb(tp);
+	if (wait)
+		xfs_trans_set_sync(tp);
+	return xfs_trans_commit(tp, 0);
 }
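The xfs_sb.c rewrite above replaces the offset/size table walk with one explicit endian conversion per field, which is also what lets the sb_bad_features2 mirroring be enforced in a single place at format time. A reduced, compilable sketch of that style; the two structures are hypothetical stand-ins and the htobe32()/htobe64() helpers are the glibc ones from <endian.h>:

#include <endian.h>
#include <stdint.h>

struct sb_incore {		/* native-endian, in memory */
	uint32_t	magic;
	uint64_t	blocks;
	uint32_t	features2;
	uint32_t	bad_features2;
};

struct sb_disk {		/* big-endian, on disk */
	uint32_t	magic;
	uint64_t	blocks;
	uint32_t	features2;
	uint32_t	bad_features2;
};

static void sb_to_disk(struct sb_disk *to, struct sb_incore *from)
{
	/* one explicit conversion per field; nothing is skipped by a mask */
	to->magic  = htobe32(from->magic);
	to->blocks = htobe64(from->blocks);

	/* enforce the mirror at format time, as the patched function does */
	from->bad_features2 = from->features2;
	to->features2      = htobe32(from->features2);
	to->bad_features2  = htobe32(from->bad_features2);
}

The trade-off is that the whole buffer is always rewritten and logged, rather than just the modified byte range; the commit accepts that cost for simplicity.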
diff --git a/fs/xfs/libxfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h
index 8eb1c54bafbf..b25bb9a343f3 100644
--- a/fs/xfs/libxfs/xfs_sb.h
+++ b/fs/xfs/libxfs/xfs_sb.h
@@ -27,11 +27,12 @@ extern struct xfs_perag *xfs_perag_get_tag(struct xfs_mount *, xfs_agnumber_t,
 extern void	xfs_perag_put(struct xfs_perag *pag);
 extern int	xfs_initialize_perag_data(struct xfs_mount *, xfs_agnumber_t);
 
-extern void	xfs_sb_calc_crc(struct xfs_buf *);
-extern void	xfs_mod_sb(struct xfs_trans *, __int64_t);
-extern void	xfs_sb_mount_common(struct xfs_mount *, struct xfs_sb *);
-extern void	xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *);
-extern void	xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t);
+extern void	xfs_sb_calc_crc(struct xfs_buf *bp);
+extern void	xfs_log_sb(struct xfs_trans *tp);
+extern int	xfs_sync_sb(struct xfs_mount *mp, bool wait);
+extern void	xfs_sb_mount_common(struct xfs_mount *mp, struct xfs_sb *sbp);
+extern void	xfs_sb_from_disk(struct xfs_sb *to, struct xfs_dsb *from);
+extern void	xfs_sb_to_disk(struct xfs_dsb *to, struct xfs_sb *from);
 extern void	xfs_sb_quota_from_disk(struct xfs_sb *sbp);
 
 #endif	/* __XFS_SB_H__ */
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index 82404da2ca67..8dda4b321343 100644
--- a/fs/xfs/libxfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -82,7 +82,7 @@ extern const struct xfs_buf_ops xfs_symlink_buf_ops;
 #define XFS_TRANS_ATTR_RM		23
 #define XFS_TRANS_ATTR_FLAG		24
 #define XFS_TRANS_CLEAR_AGI_BUCKET	25
-#define XFS_TRANS_QM_SBCHANGE		26
+#define XFS_TRANS_SB_CHANGE		26
 /*
  * Dummy entries since we use the transaction type to index into the
  * trans_type[] in xlog_recover_print_trans_head()
@@ -95,17 +95,15 @@ extern const struct xfs_buf_ops xfs_symlink_buf_ops;
 #define XFS_TRANS_QM_DQCLUSTER		32
 #define XFS_TRANS_QM_QINOCREATE		33
 #define XFS_TRANS_QM_QUOTAOFF_END	34
-#define XFS_TRANS_SB_UNIT		35
-#define XFS_TRANS_FSYNC_TS		36
-#define XFS_TRANS_GROWFSRT_ALLOC	37
-#define XFS_TRANS_GROWFSRT_ZERO		38
-#define XFS_TRANS_GROWFSRT_FREE		39
-#define XFS_TRANS_SWAPEXT		40
-#define XFS_TRANS_SB_COUNT		41
-#define XFS_TRANS_CHECKPOINT		42
-#define XFS_TRANS_ICREATE		43
-#define XFS_TRANS_CREATE_TMPFILE	44
-#define XFS_TRANS_TYPE_MAX		44
+#define XFS_TRANS_FSYNC_TS		35
+#define XFS_TRANS_GROWFSRT_ALLOC	36
+#define XFS_TRANS_GROWFSRT_ZERO		37
+#define XFS_TRANS_GROWFSRT_FREE		38
+#define XFS_TRANS_SWAPEXT		39
+#define XFS_TRANS_CHECKPOINT		40
+#define XFS_TRANS_ICREATE		41
+#define XFS_TRANS_CREATE_TMPFILE	42
+#define XFS_TRANS_TYPE_MAX		43
 /* new transaction types need to be reflected in xfs_logprint(8) */
 
 #define XFS_TRANS_TYPES \
@@ -113,7 +111,6 @@ extern const struct xfs_buf_ops xfs_symlink_buf_ops;
 	{ XFS_TRANS_SETATTR_SIZE,	"SETATTR_SIZE" }, \
 	{ XFS_TRANS_INACTIVE,		"INACTIVE" }, \
 	{ XFS_TRANS_CREATE,		"CREATE" }, \
-	{ XFS_TRANS_CREATE_TMPFILE,	"CREATE_TMPFILE" }, \
 	{ XFS_TRANS_CREATE_TRUNC,	"CREATE_TRUNC" }, \
 	{ XFS_TRANS_TRUNCATE_FILE,	"TRUNCATE_FILE" }, \
 	{ XFS_TRANS_REMOVE,		"REMOVE" }, \
@@ -134,23 +131,23 @@ extern const struct xfs_buf_ops xfs_symlink_buf_ops;
 	{ XFS_TRANS_ATTR_RM,		"ATTR_RM" }, \
 	{ XFS_TRANS_ATTR_FLAG,		"ATTR_FLAG" }, \
 	{ XFS_TRANS_CLEAR_AGI_BUCKET,	"CLEAR_AGI_BUCKET" }, \
-	{ XFS_TRANS_QM_SBCHANGE,	"QM_SBCHANGE" }, \
+	{ XFS_TRANS_SB_CHANGE,		"SBCHANGE" }, \
+	{ XFS_TRANS_DUMMY1,		"DUMMY1" }, \
+	{ XFS_TRANS_DUMMY2,		"DUMMY2" }, \
 	{ XFS_TRANS_QM_QUOTAOFF,	"QM_QUOTAOFF" }, \
 	{ XFS_TRANS_QM_DQALLOC,		"QM_DQALLOC" }, \
 	{ XFS_TRANS_QM_SETQLIM,		"QM_SETQLIM" }, \
 	{ XFS_TRANS_QM_DQCLUSTER,	"QM_DQCLUSTER" }, \
 	{ XFS_TRANS_QM_QINOCREATE,	"QM_QINOCREATE" }, \
 	{ XFS_TRANS_QM_QUOTAOFF_END,	"QM_QOFF_END" }, \
-	{ XFS_TRANS_SB_UNIT,		"SB_UNIT" }, \
 	{ XFS_TRANS_FSYNC_TS,		"FSYNC_TS" }, \
 	{ XFS_TRANS_GROWFSRT_ALLOC,	"GROWFSRT_ALLOC" }, \
 	{ XFS_TRANS_GROWFSRT_ZERO,	"GROWFSRT_ZERO" }, \
 	{ XFS_TRANS_GROWFSRT_FREE,	"GROWFSRT_FREE" }, \
 	{ XFS_TRANS_SWAPEXT,		"SWAPEXT" }, \
-	{ XFS_TRANS_SB_COUNT,		"SB_COUNT" }, \
 	{ XFS_TRANS_CHECKPOINT,		"CHECKPOINT" }, \
-	{ XFS_TRANS_DUMMY1,		"DUMMY1" }, \
-	{ XFS_TRANS_DUMMY2,		"DUMMY2" }, \
+	{ XFS_TRANS_ICREATE,		"ICREATE" }, \
+	{ XFS_TRANS_CREATE_TMPFILE,	"CREATE_TMPFILE" }, \
 	{ XLOG_UNMOUNT_REC_TYPE,	"UNMOUNT" }
 
 /*
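The xfs_shared.h hunks renumber the transaction-type constants and reorder the matching name table by hand; the two can silently drift apart. A small sketch of one way to keep such tables in lockstep, using designated initializers keyed by the enum (hypothetical names, not the fs/xfs layout):

#include <stdio.h>

enum trans_type { T_SB_CHANGE, T_FSYNC_TS, T_SWAPEXT, T_TYPE_MAX };

/* Keyed by enumerator, so renumbering can never misalign the strings. */
static const char *const trans_name[T_TYPE_MAX] = {
	[T_SB_CHANGE]	= "SBCHANGE",
	[T_FSYNC_TS]	= "FSYNC_TS",
	[T_SWAPEXT]	= "SWAPEXT",
};

int main(void)
{
	for (int i = 0; i < T_TYPE_MAX; i++)
		printf("%d -> %s\n", i, trans_name[i]);
	return 0;
}

With this layout, deleting an enumerator leaves at worst a NULL slot to assert against, instead of every later entry printing the wrong name.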
diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c
index c80c5236c3da..e7e26bd6468f 100644
--- a/fs/xfs/libxfs/xfs_symlink_remote.c
+++ b/fs/xfs/libxfs/xfs_symlink_remote.c
@@ -178,6 +178,8 @@ xfs_symlink_local_to_remote(
 	struct xfs_mount	*mp = ip->i_mount;
 	char			*buf;
 
+	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SYMLINK_BUF);
+
 	if (!xfs_sb_version_hascrc(&mp->m_sb)) {
 		bp->b_ops = NULL;
 		memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes);
diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c
index 6c1330f29050..68cb1e7bf2bb 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.c
+++ b/fs/xfs/libxfs/xfs_trans_resv.c
@@ -716,17 +716,6 @@ xfs_calc_clear_agi_bucket_reservation(
 }
 
 /*
- * Clearing the quotaflags in the superblock.
- * the super block for changing quota flags: sector size
- */
-STATIC uint
-xfs_calc_qm_sbchange_reservation(
-	struct xfs_mount	*mp)
-{
-	return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize);
-}
-
-/*
  * Adjusting quota limits.
  * the xfs_disk_dquot_t: sizeof(struct xfs_disk_dquot)
  */
@@ -864,9 +853,6 @@ xfs_trans_resv_calc(
 	 * The following transactions are logged in logical format with
 	 * a default log count.
 	 */
-	resp->tr_qm_sbchange.tr_logres = xfs_calc_qm_sbchange_reservation(mp);
-	resp->tr_qm_sbchange.tr_logcount = XFS_DEFAULT_LOG_COUNT;
-
 	resp->tr_qm_setqlim.tr_logres = xfs_calc_qm_setqlim_reservation(mp);
 	resp->tr_qm_setqlim.tr_logcount = XFS_DEFAULT_LOG_COUNT;
 
diff --git a/fs/xfs/libxfs/xfs_trans_resv.h b/fs/xfs/libxfs/xfs_trans_resv.h
index 1097d14cd583..2d5bdfce6d8f 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.h
+++ b/fs/xfs/libxfs/xfs_trans_resv.h
@@ -56,7 +56,6 @@ struct xfs_trans_resv {
 	struct xfs_trans_res	tr_growrtalloc;	/* grow realtime allocations */
 	struct xfs_trans_res	tr_growrtzero;	/* grow realtime zeroing */
 	struct xfs_trans_res	tr_growrtfree;	/* grow realtime freeing */
-	struct xfs_trans_res	tr_qm_sbchange;	/* change quota flags */
 	struct xfs_trans_res	tr_qm_setqlim;	/* adjust quota limits */
 	struct xfs_trans_res	tr_qm_dqalloc;	/* allocate quota on disk */
 	struct xfs_trans_res	tr_qm_quotaoff;	/* turn quota off */
diff --git a/fs/xfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
index b79dc66b2ecd..b79dc66b2ecd 100644
--- a/fs/xfs/xfs_types.h
+++ b/fs/xfs/libxfs/xfs_types.h
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 18e2f3bbae5e..3a9b7a1b8704 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -135,30 +135,22 @@ xfs_setfilesize_trans_alloc(
  */
 STATIC int
 xfs_setfilesize(
-	struct xfs_ioend	*ioend)
+	struct xfs_inode	*ip,
+	struct xfs_trans	*tp,
+	xfs_off_t		offset,
+	size_t			size)
 {
-	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
-	struct xfs_trans	*tp = ioend->io_append_trans;
 	xfs_fsize_t		isize;
 
-	/*
-	 * The transaction may have been allocated in the I/O submission thread,
-	 * thus we need to mark ourselves as beeing in a transaction manually.
-	 * Similarly for freeze protection.
-	 */
-	current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
-	rwsem_acquire_read(&VFS_I(ip)->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
-			   0, 1, _THIS_IP_);
-
 	xfs_ilock(ip, XFS_ILOCK_EXCL);
-	isize = xfs_new_eof(ip, ioend->io_offset + ioend->io_size);
+	isize = xfs_new_eof(ip, offset + size);
 	if (!isize) {
 		xfs_iunlock(ip, XFS_ILOCK_EXCL);
 		xfs_trans_cancel(tp, 0);
 		return 0;
 	}
 
-	trace_xfs_setfilesize(ip, ioend->io_offset, ioend->io_size);
+	trace_xfs_setfilesize(ip, offset, size);
 
 	ip->i_d.di_size = isize;
 	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
@@ -167,6 +159,25 @@ xfs_setfilesize(
 	return xfs_trans_commit(tp, 0);
 }
 
+STATIC int
+xfs_setfilesize_ioend(
+	struct xfs_ioend	*ioend)
+{
+	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
+	struct xfs_trans	*tp = ioend->io_append_trans;
+
+	/*
+	 * The transaction may have been allocated in the I/O submission thread,
+	 * thus we need to mark ourselves as being in a transaction manually.
+	 * Similarly for freeze protection.
+	 */
+	current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
+	rwsem_acquire_read(&VFS_I(ip)->i_sb->s_writers.lock_map[SB_FREEZE_FS-1],
+			   0, 1, _THIS_IP_);
+
+	return xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size);
+}
+
 /*
  * Schedule IO completion handling on the final put of an ioend.
  *
@@ -182,8 +193,7 @@ xfs_finish_ioend(
 
 	if (ioend->io_type == XFS_IO_UNWRITTEN)
 		queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
-	else if (ioend->io_append_trans ||
-		 (ioend->io_isdirect && xfs_ioend_is_append(ioend)))
+	else if (ioend->io_append_trans)
 		queue_work(mp->m_data_workqueue, &ioend->io_work);
 	else
 		xfs_destroy_ioend(ioend);
@@ -215,22 +225,8 @@ xfs_end_io(
 	if (ioend->io_type == XFS_IO_UNWRITTEN) {
 		error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
 						  ioend->io_size);
-	} else if (ioend->io_isdirect && xfs_ioend_is_append(ioend)) {
-		/*
-		 * For direct I/O we do not know if we need to allocate blocks
-		 * or not so we can't preallocate an append transaction as that
-		 * results in nested reservations and log space deadlocks. Hence
-		 * allocate the transaction here. While this is sub-optimal and
-		 * can block IO completion for some time, we're stuck with doing
-		 * it this way until we can pass the ioend to the direct IO
-		 * allocation callbacks and avoid nesting that way.
-		 */
-		error = xfs_setfilesize_trans_alloc(ioend);
-		if (error)
-			goto done;
-		error = xfs_setfilesize(ioend);
 	} else if (ioend->io_append_trans) {
-		error = xfs_setfilesize(ioend);
+		error = xfs_setfilesize_ioend(ioend);
 	} else {
 		ASSERT(!xfs_ioend_is_append(ioend));
 	}
@@ -242,17 +238,6 @@ done:
 }
 
 /*
- * Call IO completion handling in caller context on the final put of an ioend.
- */
-STATIC void
-xfs_finish_ioend_sync(
-	struct xfs_ioend	*ioend)
-{
-	if (atomic_dec_and_test(&ioend->io_remaining))
-		xfs_end_io(&ioend->io_work);
-}
-
-/*
  * Allocate and initialise an IO completion structure.
  * We need to track unwritten extent write completion here initially.
  * We'll need to extend this for updating the ondisk inode size later
@@ -273,7 +258,6 @@ xfs_alloc_ioend(
 	 * all the I/O from calling the completion routine too early.
 	 */
 	atomic_set(&ioend->io_remaining, 1);
-	ioend->io_isdirect = 0;
 	ioend->io_error = 0;
 	ioend->io_list = NULL;
 	ioend->io_type = type;
@@ -1459,11 +1443,7 @@ xfs_get_blocks_direct(
  *
  * If the private argument is non-NULL __xfs_get_blocks signals us that we
  * need to issue a transaction to convert the range from unwritten to written
- * extents. In case this is regular synchronous I/O we just call xfs_end_io
- * to do this and we are done. But in case this was a successful AIO
- * request this handler is called from interrupt context, from which we
- * can't start transactions. In that case offload the I/O completion to
- * the workqueues we also use for buffered I/O completion.
+ * extents.
  */
 STATIC void
 xfs_end_io_direct_write(
@@ -1472,7 +1452,12 @@ xfs_end_io_direct_write(
 	ssize_t			size,
 	void			*private)
 {
-	struct xfs_ioend	*ioend = iocb->private;
+	struct inode		*inode = file_inode(iocb->ki_filp);
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_mount	*mp = ip->i_mount;
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return;
 
 	/*
 	 * While the generic direct I/O code updates the inode size, it does
@@ -1480,22 +1465,33 @@ xfs_end_io_direct_write(
 	 * end_io handler thinks the on-disk size is outside the in-core
 	 * size. To prevent this just update it a little bit earlier here.
 	 */
-	if (offset + size > i_size_read(ioend->io_inode))
-		i_size_write(ioend->io_inode, offset + size);
+	if (offset + size > i_size_read(inode))
+		i_size_write(inode, offset + size);
 
 	/*
-	 * blockdev_direct_IO can return an error even after the I/O
-	 * completion handler was called. Thus we need to protect
-	 * against double-freeing.
+	 * For direct I/O we do not know if we need to allocate blocks or not,
+	 * so we can't preallocate an append transaction, as that results in
+	 * nested reservations and log space deadlocks. Hence allocate the
+	 * transaction here. While this is sub-optimal and can block IO
+	 * completion for some time, we're stuck with doing it this way until
+	 * we can pass the ioend to the direct IO allocation callbacks and
+	 * avoid nesting that way.
 	 */
-	iocb->private = NULL;
-
-	ioend->io_offset = offset;
-	ioend->io_size = size;
-	if (private && size > 0)
-		ioend->io_type = XFS_IO_UNWRITTEN;
+	if (private && size > 0) {
+		xfs_iomap_write_unwritten(ip, offset, size);
+	} else if (offset + size > ip->i_d.di_size) {
+		struct xfs_trans	*tp;
+		int			error;
+
+		tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS);
+		error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0);
+		if (error) {
+			xfs_trans_cancel(tp, 0);
+			return;
+		}
 
-	xfs_finish_ioend_sync(ioend);
+		xfs_setfilesize(ip, tp, offset, size);
+	}
 }
 
 STATIC ssize_t
@@ -1507,39 +1503,16 @@ xfs_vm_direct_IO(
 {
 	struct inode		*inode = iocb->ki_filp->f_mapping->host;
 	struct block_device	*bdev = xfs_find_bdev_for_inode(inode);
-	struct xfs_ioend	*ioend = NULL;
-	ssize_t			ret;
 
 	if (rw & WRITE) {
-		size_t size = iov_iter_count(iter);
-
-		/*
-		 * We cannot preallocate a size update transaction here as we
-		 * don't know whether allocation is necessary or not. Hence we
-		 * can only tell IO completion that one is necessary if we are
-		 * not doing unwritten extent conversion.
-		 */
-		iocb->private = ioend = xfs_alloc_ioend(inode, XFS_IO_DIRECT);
-		if (offset + size > XFS_I(inode)->i_d.di_size)
-			ioend->io_isdirect = 1;
-
-		ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iter,
+		return __blockdev_direct_IO(rw, iocb, inode, bdev, iter,
 					    offset, xfs_get_blocks_direct,
 					    xfs_end_io_direct_write, NULL,
 					    DIO_ASYNC_EXTEND);
-		if (ret != -EIOCBQUEUED && iocb->private)
-			goto out_destroy_ioend;
-	} else {
-		ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iter,
-					   offset, xfs_get_blocks_direct,
-					   NULL, NULL, 0);
 	}
-
-	return ret;
-
-out_destroy_ioend:
-	xfs_destroy_ioend(ioend);
-	return ret;
+	return __blockdev_direct_IO(rw, iocb, inode, bdev, iter,
+				    offset, xfs_get_blocks_direct,
+				    NULL, NULL, 0);
 }
 
 /*
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index f94dd459dff9..ac644e0137a4 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -24,14 +24,12 @@ extern mempool_t *xfs_ioend_pool;
  * Types of I/O for bmap clustering and I/O completion tracking.
  */
 enum {
-	XFS_IO_DIRECT = 0,	/* special case for direct I/O ioends */
 	XFS_IO_DELALLOC,	/* covers delalloc region */
 	XFS_IO_UNWRITTEN,	/* covers allocated but uninitialized data */
 	XFS_IO_OVERWRITE,	/* covers already allocated extent */
 };
 
 #define XFS_IO_TYPES \
-	{ 0,			"" }, \
 	{ XFS_IO_DELALLOC,		"delalloc" }, \
 	{ XFS_IO_UNWRITTEN,		"unwritten" }, \
 	{ XFS_IO_OVERWRITE,		"overwrite" }
@@ -45,7 +43,6 @@ typedef struct xfs_ioend {
 	unsigned int		io_type;	/* delalloc / unwritten */
 	int			io_error;	/* I/O error code */
 	atomic_t		io_remaining;	/* hold count */
-	unsigned int		io_isdirect : 1;/* direct I/O */
 	struct inode		*io_inode;	/* file being written to */
 	struct buffer_head	*io_buffer_head;/* buffer linked list head */
 	struct buffer_head	*io_buffer_tail;/* buffer linked list tail */
diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h
index 2fdb72d2c908..736429a72a12 100644
--- a/fs/xfs/xfs_bmap_util.h
+++ b/fs/xfs/xfs_bmap_util.h
@@ -26,43 +26,8 @@ struct xfs_ifork;
 struct xfs_inode;
 struct xfs_mount;
 struct xfs_trans;
+struct xfs_bmalloca;

-/*
- * Argument structure for xfs_bmap_alloc.
- */
-struct xfs_bmalloca {
-	xfs_fsblock_t		*firstblock; /* i/o first block allocated */
-	struct xfs_bmap_free	*flist;	/* bmap freelist */
-	struct xfs_trans	*tp;	/* transaction pointer */
-	struct xfs_inode	*ip;	/* incore inode pointer */
-	struct xfs_bmbt_irec	prev;	/* extent before the new one */
-	struct xfs_bmbt_irec	got;	/* extent after, or delayed */
-
-	xfs_fileoff_t		offset;	/* offset in file filling in */
-	xfs_extlen_t		length;	/* i/o length asked/allocated */
-	xfs_fsblock_t		blkno;	/* starting block of new extent */
-
-	struct xfs_btree_cur	*cur;	/* btree cursor */
-	xfs_extnum_t		idx;	/* current extent index */
-	int			nallocs;/* number of extents alloc'd */
-	int			logflags;/* flags for transaction logging */
-
-	xfs_extlen_t		total;	/* total blocks needed for xaction */
-	xfs_extlen_t		minlen;	/* minimum allocation size (blocks) */
-	xfs_extlen_t		minleft; /* amount must be left after alloc */
-	bool			eof;	/* set if allocating past last extent */
-	bool			wasdel;	/* replacing a delayed allocation */
-	bool			userdata;/* set if is user data */
-	bool			aeof;	/* allocated space at eof */
-	bool			conv;	/* overwriting unwritten extents */
-	int			flags;
-	struct completion	*done;
-	struct work_struct	work;
-	int			result;
-};
-
-int	xfs_bmap_finish(struct xfs_trans **tp, struct xfs_bmap_free *flist,
-			int *committed);
 int	xfs_bmap_rtalloc(struct xfs_bmalloca *ap);
 int	xfs_bmap_eof(struct xfs_inode *ip, xfs_fileoff_t endoff,
 		     int whichfork, int *eof);
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index bb502a391792..1790b00bea7a 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1488,6 +1488,7 @@ xfs_buf_iomove(
 static enum lru_status
 xfs_buftarg_wait_rele(
 	struct list_head	*item,
+	struct list_lru_one	*lru,
 	spinlock_t		*lru_lock,
 	void			*arg)

@@ -1509,7 +1510,7 @@ xfs_buftarg_wait_rele(
 	 */
 	atomic_set(&bp->b_lru_ref, 0);
 	bp->b_state |= XFS_BSTATE_DISPOSE;
-	list_move(item, dispose);
+	list_lru_isolate_move(lru, item, dispose);
 	spin_unlock(&bp->b_lock);
 	return LRU_REMOVED;
 }
@@ -1546,6 +1547,7 @@ xfs_wait_buftarg(
 static enum lru_status
 xfs_buftarg_isolate(
 	struct list_head	*item,
+	struct list_lru_one	*lru,
 	spinlock_t		*lru_lock,
 	void			*arg)
 {
@@ -1569,7 +1571,7 @@ xfs_buftarg_isolate(
 	}

 	bp->b_state |= XFS_BSTATE_DISPOSE;
-	list_move(item, dispose);
+	list_lru_isolate_move(lru, item, dispose);
 	spin_unlock(&bp->b_lock);
 	return LRU_REMOVED;
 }
@@ -1583,10 +1585,9 @@ xfs_buftarg_shrink_scan(
 					struct xfs_buftarg, bt_shrinker);
 	LIST_HEAD(dispose);
 	unsigned long		freed;
-	unsigned long		nr_to_scan = sc->nr_to_scan;

-	freed = list_lru_walk_node(&btp->bt_lru, sc->nid, xfs_buftarg_isolate,
-				   &dispose, &nr_to_scan);
+	freed = list_lru_shrink_walk(&btp->bt_lru, sc,
+				     xfs_buftarg_isolate, &dispose);

 	while (!list_empty(&dispose)) {
 		struct xfs_buf *bp;
@@ -1605,7 +1606,7 @@ xfs_buftarg_shrink_count(
 {
 	struct xfs_buftarg	*btp = container_of(shrink,
 					struct xfs_buftarg, bt_shrinker);
-	return list_lru_count_node(&btp->bt_lru, sc->nid);
+	return list_lru_shrink_count(&btp->bt_lru, sc);
 }

 void
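
The buftarg shrinker changes above track the memcg-aware list_lru rework in this cycle: isolate callbacks now receive the struct list_lru_one they are walking and must detach items through the list_lru_isolate helpers so the per-list item counts stay correct. A condensed sketch of the new callback contract (demo_isolate and the dispose-list argument are illustrative, not the XFS code itself):

#include <linux/list_lru.h>

static enum lru_status
demo_isolate(struct list_head *item, struct list_lru_one *lru,
	     spinlock_t *lru_lock, void *arg)
{
	struct list_head *dispose = arg;

	/* moves item off the LRU and decrements lru->nr_items */
	list_lru_isolate_move(lru, item, dispose);
	return LRU_REMOVED;
}
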
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 3f9bd58edec7..507d96a57ac7 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -319,6 +319,10 @@ xfs_buf_item_format(
 	ASSERT(atomic_read(&bip->bli_refcount) > 0);
 	ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
 	       (bip->bli_flags & XFS_BLI_STALE));
+	ASSERT((bip->bli_flags & XFS_BLI_STALE) ||
+	       (xfs_blft_from_flags(&bip->__bli_format) > XFS_BLFT_UNKNOWN_BUF
+	        && xfs_blft_from_flags(&bip->__bli_format) < XFS_BLFT_MAX_BUF));
+

 	/*
 	 * If it is an inode buffer, transfer the in-memory state to the
@@ -535,7 +539,7 @@ xfs_buf_item_push(
 	if ((bp->b_flags & XBF_WRITE_FAIL) &&
 	    ___ratelimit(&xfs_buf_write_fail_rl_state, "XFS:")) {
 		xfs_warn(bp->b_target->bt_mount,
-"Detected failing async write on buffer block 0x%llx. Retrying async write.\n",
+"Detected failing async write on buffer block 0x%llx. Retrying async write.",
 			 (long long)bp->b_bn);
 	}

diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h
index c24c67e22a2a..2f536f33cd26 100644
--- a/fs/xfs/xfs_dquot.h
+++ b/fs/xfs/xfs_dquot.h
@@ -86,7 +86,7 @@ static inline void xfs_dqflock(xfs_dquot_t *dqp)
 	wait_for_completion(&dqp->q_flush);
 }

-static inline int xfs_dqflock_nowait(xfs_dquot_t *dqp)
+static inline bool xfs_dqflock_nowait(xfs_dquot_t *dqp)
 {
 	return try_wait_for_completion(&dqp->q_flush);
 }
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
index 5eb4a14e0a0f..b97359ba2648 100644
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -30,6 +30,7 @@
 #include "xfs_trace.h"
 #include "xfs_icache.h"
 #include "xfs_log.h"
+#include "xfs_pnfs.h"

 /*
  * Note that we only accept fileids which are long enough rather than allow
35 * Note that we only accept fileids which are long enough rather than allow 36 * Note that we only accept fileids which are long enough rather than allow
@@ -245,4 +246,9 @@ const struct export_operations xfs_export_operations = {
 	.fh_to_parent		= xfs_fs_fh_to_parent,
 	.get_parent		= xfs_fs_get_parent,
 	.commit_metadata	= xfs_fs_nfs_commit_metadata,
+#ifdef CONFIG_NFSD_PNFS
+	.get_uuid		= xfs_fs_get_uuid,
+	.map_blocks		= xfs_fs_map_blocks,
+	.commit_blocks		= xfs_fs_commit_blocks,
+#endif
 };
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 13e974e6a889..ce615d12fb44 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -36,6 +36,7 @@
 #include "xfs_trace.h"
 #include "xfs_log.h"
 #include "xfs_icache.h"
+#include "xfs_pnfs.h"

 #include <linux/aio.h>
 #include <linux/dcache.h>
@@ -127,6 +128,42 @@ xfs_iozero(
 	return (-status);
 }

+int
+xfs_update_prealloc_flags(
+	struct xfs_inode	*ip,
+	enum xfs_prealloc_flags	flags)
+{
+	struct xfs_trans	*tp;
+	int			error;
+
+	tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_WRITEID);
+	error = xfs_trans_reserve(tp, &M_RES(ip->i_mount)->tr_writeid, 0, 0);
+	if (error) {
+		xfs_trans_cancel(tp, 0);
+		return error;
+	}
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+
+	if (!(flags & XFS_PREALLOC_INVISIBLE)) {
+		ip->i_d.di_mode &= ~S_ISUID;
+		if (ip->i_d.di_mode & S_IXGRP)
+			ip->i_d.di_mode &= ~S_ISGID;
+		xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+	}
+
+	if (flags & XFS_PREALLOC_SET)
+		ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
+	if (flags & XFS_PREALLOC_CLEAR)
+		ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;
+
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+	if (flags & XFS_PREALLOC_SYNC)
+		xfs_trans_set_sync(tp);
+	return xfs_trans_commit(tp, 0);
+}
+
 /*
  * Fsync operations on directories are much simpler than on regular files,
  * as there is no file data to flush, and thus also no need for explicit
@@ -518,6 +555,10 @@ restart:
 	if (error)
 		return error;

+	error = xfs_break_layouts(inode, iolock);
+	if (error)
+		return error;
+
 	/*
 	 * If the offset is beyond the size of the file, we need to zero any
 	 * blocks that fall between the existing EOF and the start of this
@@ -699,7 +740,7 @@ xfs_file_buffered_aio_write(

 	iov_iter_truncate(from, count);
 	/* We can write back this queue in page reclaim */
-	current->backing_dev_info = mapping->backing_dev_info;
+	current->backing_dev_info = inode_to_bdi(inode);

 write_retry:
 	trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0);
@@ -784,8 +825,9 @@ xfs_file_fallocate(
 {
 	struct inode		*inode = file_inode(file);
 	struct xfs_inode	*ip = XFS_I(inode);
-	struct xfs_trans	*tp;
 	long			error;
+	enum xfs_prealloc_flags	flags = 0;
+	uint			iolock = XFS_IOLOCK_EXCL;
 	loff_t			new_size = 0;

 	if (!S_ISREG(inode->i_mode))
@@ -794,7 +836,11 @@ xfs_file_fallocate(
 		     FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE))
 		return -EOPNOTSUPP;

-	xfs_ilock(ip, XFS_IOLOCK_EXCL);
+	xfs_ilock(ip, iolock);
+	error = xfs_break_layouts(inode, &iolock);
+	if (error)
+		goto out_unlock;
+
 	if (mode & FALLOC_FL_PUNCH_HOLE) {
 		error = xfs_free_file_space(ip, offset, len);
 		if (error)
@@ -822,6 +868,8 @@ xfs_file_fallocate(
 		if (error)
 			goto out_unlock;
 	} else {
+		flags |= XFS_PREALLOC_SET;
+
 		if (!(mode & FALLOC_FL_KEEP_SIZE) &&
 		    offset + len > i_size_read(inode)) {
 			new_size = offset + len;
@@ -839,28 +887,10 @@ xfs_file_fallocate(
 			goto out_unlock;
 	}

-	tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_WRITEID);
-	error = xfs_trans_reserve(tp, &M_RES(ip->i_mount)->tr_writeid, 0, 0);
-	if (error) {
-		xfs_trans_cancel(tp, 0);
-		goto out_unlock;
-	}
-
-	xfs_ilock(ip, XFS_ILOCK_EXCL);
-	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-	ip->i_d.di_mode &= ~S_ISUID;
-	if (ip->i_d.di_mode & S_IXGRP)
-		ip->i_d.di_mode &= ~S_ISGID;
-
-	if (!(mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_COLLAPSE_RANGE)))
-		ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
-
-	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
-	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-
 	if (file->f_flags & O_DSYNC)
-		xfs_trans_set_sync(tp);
-	error = xfs_trans_commit(tp, 0);
+		flags |= XFS_PREALLOC_SYNC;
+
+	error = xfs_update_prealloc_flags(ip, flags);
 	if (error)
 		goto out_unlock;

@@ -874,7 +904,7 @@ xfs_file_fallocate(
 	}

 out_unlock:
-	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+	xfs_iunlock(ip, iolock);
 	return error;
 }

@@ -1384,5 +1414,4 @@ static const struct vm_operations_struct xfs_file_vm_ops = {
 	.fault		= filemap_fault,
 	.map_pages	= filemap_map_pages,
 	.page_mkwrite	= xfs_vm_page_mkwrite,
-	.remap_pages	= generic_file_remap_pages,
 };
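
With the fallocate path now funnelled through xfs_update_prealloc_flags(), an O_DSYNC open translates into XFS_PREALLOC_SYNC and a synchronous prealloc transaction. A small userspace sketch of that path (the path and reservation size are hypothetical):

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int	fd = open("/mnt/xfs/prealloc.dat",
			  O_CREAT | O_WRONLY | O_DSYNC, 0644);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* KEEP_SIZE: reserve blocks, set the prealloc flag, leave i_size */
	if (fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 1 << 20) < 0) {
		perror("fallocate");
		return 1;
	}
	return close(fd);
}
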
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index fdc64220fcb0..74efe5b760dc 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -488,6 +488,7 @@ xfs_growfs_data_private(
 	xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, nfree);
 	if (dpct)
 		xfs_trans_mod_sb(tp, XFS_TRANS_SB_IMAXPCT, dpct);
+	xfs_trans_set_sync(tp);
 	error = xfs_trans_commit(tp, 0);
 	if (error)
 		return error;
@@ -541,7 +542,7 @@ xfs_growfs_data_private(
 			saved_error = error;
 			continue;
 		}
-		xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb, XFS_SB_ALL_BITS);
+		xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb);

 		error = xfs_bwrite(bp);
 		xfs_buf_relse(bp);
@@ -601,6 +602,12 @@ xfs_growfs_data(
 	if (!mutex_trylock(&mp->m_growlock))
 		return -EWOULDBLOCK;
 	error = xfs_growfs_data_private(mp, in);
+	/*
+	 * Increment the generation unconditionally, the error could be from
+	 * updating the secondary superblocks, in which case the new size
+	 * is live already.
+	 */
+	mp->m_generation++;
 	mutex_unlock(&mp->m_growlock);
 	return error;
 }
@@ -756,37 +763,6 @@ out:
 	return 0;
 }

-/*
- * Dump a transaction into the log that contains no real change. This is needed
- * to be able to make the log dirty or stamp the current tail LSN into the log
- * during the covering operation.
- *
- * We cannot use an inode here for this - that will push dirty state back up
- * into the VFS and then periodic inode flushing will prevent log covering from
- * making progress. Hence we log a field in the superblock instead and use a
- * synchronous transaction to ensure the superblock is immediately unpinned
- * and can be written back.
- */
-int
-xfs_fs_log_dummy(
-	xfs_mount_t	*mp)
-{
-	xfs_trans_t	*tp;
-	int		error;
-
-	tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1, KM_SLEEP);
-	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_sb, 0, 0);
-	if (error) {
-		xfs_trans_cancel(tp, 0);
-		return error;
-	}
-
-	/* log the UUID because it is an unchanging field */
-	xfs_mod_sb(tp, XFS_SB_UUID);
-	xfs_trans_set_sync(tp);
-	return xfs_trans_commit(tp, 0);
-}
-
 int
 xfs_fs_goingdown(
 	xfs_mount_t	*mp,
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 41f804e740d7..daafa1f6d260 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1995,6 +1995,7 @@ xfs_iunlink(
 	agi->agi_unlinked[bucket_index] = cpu_to_be32(agino);
 	offset = offsetof(xfs_agi_t, agi_unlinked) +
 		 (sizeof(xfs_agino_t) * bucket_index);
+	xfs_trans_buf_set_type(tp, agibp, XFS_BLFT_AGI_BUF);
 	xfs_trans_log_buf(tp, agibp, offset,
 			  (offset + sizeof(xfs_agino_t) - 1));
 	return 0;
@@ -2086,6 +2087,7 @@ xfs_iunlink_remove(
 		agi->agi_unlinked[bucket_index] = cpu_to_be32(next_agino);
 		offset = offsetof(xfs_agi_t, agi_unlinked) +
 			 (sizeof(xfs_agino_t) * bucket_index);
+		xfs_trans_buf_set_type(tp, agibp, XFS_BLFT_AGI_BUF);
 		xfs_trans_log_buf(tp, agibp, offset,
 				  (offset + sizeof(xfs_agino_t) - 1));
 	} else {
@@ -2656,6 +2658,124 @@ xfs_sort_for_rename(
 }

 /*
+ * xfs_cross_rename()
+ *
+ * responsible for handling the RENAME_EXCHANGE flag in the renameat2()
+ * system call
+ */
+STATIC int
+xfs_cross_rename(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*dp1,
+	struct xfs_name		*name1,
+	struct xfs_inode	*ip1,
+	struct xfs_inode	*dp2,
+	struct xfs_name		*name2,
+	struct xfs_inode	*ip2,
+	struct xfs_bmap_free	*free_list,
+	xfs_fsblock_t		*first_block,
+	int			spaceres)
+{
+	int		error = 0;
+	int		ip1_flags = 0;
+	int		ip2_flags = 0;
+	int		dp2_flags = 0;
+
+	/* Swap inode number for dirent in first parent */
+	error = xfs_dir_replace(tp, dp1, name1,
+				ip2->i_ino,
+				first_block, free_list, spaceres);
+	if (error)
+		goto out;
+
+	/* Swap inode number for dirent in second parent */
+	error = xfs_dir_replace(tp, dp2, name2,
+				ip1->i_ino,
+				first_block, free_list, spaceres);
+	if (error)
+		goto out;
+
+	/*
+	 * If we're renaming one or more directories across different parents,
+	 * update the respective ".." entries (and link counts) to match the new
+	 * parents.
+	 */
+	if (dp1 != dp2) {
+		dp2_flags = XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
+
+		if (S_ISDIR(ip2->i_d.di_mode)) {
+			error = xfs_dir_replace(tp, ip2, &xfs_name_dotdot,
+						dp1->i_ino, first_block,
+						free_list, spaceres);
+			if (error)
+				goto out;
+
+			/* transfer ip2 ".." reference to dp1 */
+			if (!S_ISDIR(ip1->i_d.di_mode)) {
+				error = xfs_droplink(tp, dp2);
+				if (error)
+					goto out;
+				error = xfs_bumplink(tp, dp1);
+				if (error)
+					goto out;
+			}
+
+			/*
+			 * Although ip1 isn't changed here, userspace needs
+			 * to be warned about the change, so that applications
+			 * relying on it (like backup ones) will be properly
+			 * notified of the change.
+			 */
+			ip1_flags |= XFS_ICHGTIME_CHG;
+			ip2_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
+		}
+
+		if (S_ISDIR(ip1->i_d.di_mode)) {
+			error = xfs_dir_replace(tp, ip1, &xfs_name_dotdot,
+						dp2->i_ino, first_block,
+						free_list, spaceres);
+			if (error)
+				goto out;
+
+			/* transfer ip1 ".." reference to dp2 */
+			if (!S_ISDIR(ip2->i_d.di_mode)) {
+				error = xfs_droplink(tp, dp1);
+				if (error)
+					goto out;
+				error = xfs_bumplink(tp, dp2);
+				if (error)
+					goto out;
+			}
+
+			/*
+			 * Although ip2 isn't changed here, userspace needs
+			 * to be warned about the change, so that applications
+			 * relying on it (like backup ones) will be properly
+			 * notified of the change.
+			 */
+			ip1_flags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG;
+			ip2_flags |= XFS_ICHGTIME_CHG;
+		}
+	}
+
+	if (ip1_flags) {
+		xfs_trans_ichgtime(tp, ip1, ip1_flags);
+		xfs_trans_log_inode(tp, ip1, XFS_ILOG_CORE);
+	}
+	if (ip2_flags) {
+		xfs_trans_ichgtime(tp, ip2, ip2_flags);
+		xfs_trans_log_inode(tp, ip2, XFS_ILOG_CORE);
+	}
+	if (dp2_flags) {
+		xfs_trans_ichgtime(tp, dp2, dp2_flags);
+		xfs_trans_log_inode(tp, dp2, XFS_ILOG_CORE);
+	}
+	xfs_trans_ichgtime(tp, dp1, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
+	xfs_trans_log_inode(tp, dp1, XFS_ILOG_CORE);
+out:
+	return error;
+}
+
+/*
  * xfs_rename
  */
 int
@@ -2665,7 +2785,8 @@ xfs_rename(
 	xfs_inode_t	*src_ip,
 	xfs_inode_t	*target_dp,
 	struct xfs_name	*target_name,
-	xfs_inode_t	*target_ip)
+	xfs_inode_t	*target_ip,
+	unsigned int	flags)
 {
 	xfs_trans_t	*tp = NULL;
 	xfs_mount_t	*mp = src_dp->i_mount;
@@ -2743,6 +2864,18 @@ xfs_rename(
 	}

 	/*
+	 * Handle RENAME_EXCHANGE flags
+	 */
+	if (flags & RENAME_EXCHANGE) {
+		error = xfs_cross_rename(tp, src_dp, src_name, src_ip,
+					 target_dp, target_name, target_ip,
+					 &free_list, &first_block, spaceres);
+		if (error)
+			goto abort_return;
+		goto finish_rename;
+	}
+
+	/*
 	 * Set up the target.
 	 */
 	if (target_ip == NULL) {
@@ -2881,6 +3014,7 @@ xfs_rename(
 	if (new_parent)
 		xfs_trans_log_inode(tp, target_dp, XFS_ILOG_CORE);

+finish_rename:
 	/*
 	 * If this is a synchronous mount, make sure that the
 	 * rename transaction goes to disk before returning to
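
xfs_cross_rename() is reached from the renameat2() system call with RENAME_EXCHANGE set. A minimal userspace sketch of triggering it (the paths are hypothetical, and glibc of this era has no renameat2 wrapper, hence the raw syscall):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef RENAME_EXCHANGE
#define RENAME_EXCHANGE	(1 << 1)	/* from linux/fs.h */
#endif

int main(void)
{
	/* both names must already exist; the swap is atomic */
	if (syscall(SYS_renameat2, AT_FDCWD, "/mnt/xfs/a",
		    AT_FDCWD, "/mnt/xfs/b", RENAME_EXCHANGE) < 0) {
		perror("renameat2");
		return 1;
	}
	return 0;
}
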
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 4ed2ba9342dc..86cd6b39bed7 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -338,7 +338,7 @@ int xfs_link(struct xfs_inode *tdp, struct xfs_inode *sip,
 int		xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name,
 			   struct xfs_inode *src_ip, struct xfs_inode *target_dp,
 			   struct xfs_name *target_name,
-			   struct xfs_inode *target_ip);
+			   struct xfs_inode *target_ip, unsigned int flags);

 void		xfs_ilock(xfs_inode_t *, uint);
 int		xfs_ilock_nowait(xfs_inode_t *, uint);
@@ -377,6 +377,15 @@ int xfs_droplink(struct xfs_trans *, struct xfs_inode *);
 int xfs_bumplink(struct xfs_trans *, struct xfs_inode *);

 /* from xfs_file.c */
+enum xfs_prealloc_flags {
+	XFS_PREALLOC_SET	= (1 << 1),
+	XFS_PREALLOC_CLEAR	= (1 << 2),
+	XFS_PREALLOC_SYNC	= (1 << 3),
+	XFS_PREALLOC_INVISIBLE	= (1 << 4),
+};
+
+int	xfs_update_prealloc_flags(struct xfs_inode *,
+		enum xfs_prealloc_flags);
 int	xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t);
 int	xfs_iozero(struct xfs_inode *, loff_t, size_t);

diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index a1831980a68e..bf70a2affb05 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -39,6 +39,7 @@
 #include "xfs_icache.h"
 #include "xfs_symlink.h"
 #include "xfs_trans.h"
+#include "xfs_pnfs.h"

 #include <linux/capability.h>
 #include <linux/dcache.h>
@@ -606,11 +607,9 @@ xfs_ioc_space(
 	unsigned int		cmd,
 	xfs_flock64_t		*bf)
 {
-	struct xfs_mount	*mp = ip->i_mount;
-	struct xfs_trans	*tp;
 	struct iattr		iattr;
-	bool			setprealloc = false;
-	bool			clrprealloc = false;
+	enum xfs_prealloc_flags	flags = 0;
+	uint			iolock = XFS_IOLOCK_EXCL;
 	int			error;

 	/*
@@ -630,11 +629,19 @@ xfs_ioc_space(
 	if (!S_ISREG(inode->i_mode))
 		return -EINVAL;

+	if (filp->f_flags & O_DSYNC)
+		flags |= XFS_PREALLOC_SYNC;
+	if (ioflags & XFS_IO_INVIS)
+		flags |= XFS_PREALLOC_INVISIBLE;
+
 	error = mnt_want_write_file(filp);
 	if (error)
 		return error;

-	xfs_ilock(ip, XFS_IOLOCK_EXCL);
+	xfs_ilock(ip, iolock);
+	error = xfs_break_layouts(inode, &iolock);
+	if (error)
+		goto out_unlock;

 	switch (bf->l_whence) {
 	case 0: /*SEEK_SET*/
@@ -673,25 +680,23 @@ xfs_ioc_space(
 	}

 	if (bf->l_start < 0 ||
-	    bf->l_start > mp->m_super->s_maxbytes ||
+	    bf->l_start > inode->i_sb->s_maxbytes ||
 	    bf->l_start + bf->l_len < 0 ||
-	    bf->l_start + bf->l_len >= mp->m_super->s_maxbytes) {
+	    bf->l_start + bf->l_len >= inode->i_sb->s_maxbytes) {
 		error = -EINVAL;
 		goto out_unlock;
 	}

 	switch (cmd) {
 	case XFS_IOC_ZERO_RANGE:
+		flags |= XFS_PREALLOC_SET;
 		error = xfs_zero_file_space(ip, bf->l_start, bf->l_len);
-		if (!error)
-			setprealloc = true;
 		break;
 	case XFS_IOC_RESVSP:
 	case XFS_IOC_RESVSP64:
+		flags |= XFS_PREALLOC_SET;
 		error = xfs_alloc_file_space(ip, bf->l_start, bf->l_len,
 						XFS_BMAPI_PREALLOC);
-		if (!error)
-			setprealloc = true;
 		break;
 	case XFS_IOC_UNRESVSP:
 	case XFS_IOC_UNRESVSP64:
@@ -701,6 +706,7 @@ xfs_ioc_space(
 	case XFS_IOC_ALLOCSP64:
 	case XFS_IOC_FREESP:
 	case XFS_IOC_FREESP64:
+		flags |= XFS_PREALLOC_CLEAR;
 		if (bf->l_start > XFS_ISIZE(ip)) {
 			error = xfs_alloc_file_space(ip, XFS_ISIZE(ip),
 					bf->l_start - XFS_ISIZE(ip), 0);
@@ -712,8 +718,6 @@ xfs_ioc_space(
 		iattr.ia_size = bf->l_start;

 		error = xfs_setattr_size(ip, &iattr);
-		if (!error)
-			clrprealloc = true;
 		break;
 	default:
 		ASSERT(0);
@@ -723,35 +727,10 @@ xfs_ioc_space(
 	if (error)
 		goto out_unlock;

-	tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID);
-	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_writeid, 0, 0);
-	if (error) {
-		xfs_trans_cancel(tp, 0);
-		goto out_unlock;
-	}
-
-	xfs_ilock(ip, XFS_ILOCK_EXCL);
-	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
-
-	if (!(ioflags & XFS_IO_INVIS)) {
-		ip->i_d.di_mode &= ~S_ISUID;
-		if (ip->i_d.di_mode & S_IXGRP)
-			ip->i_d.di_mode &= ~S_ISGID;
-		xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG);
-	}
-
-	if (setprealloc)
-		ip->i_d.di_flags |= XFS_DIFLAG_PREALLOC;
-	else if (clrprealloc)
-		ip->i_d.di_flags &= ~XFS_DIFLAG_PREALLOC;
-
-	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-	if (filp->f_flags & O_DSYNC)
-		xfs_trans_set_sync(tp);
-	error = xfs_trans_commit(tp, 0);
+	error = xfs_update_prealloc_flags(ip, flags);

 out_unlock:
-	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+	xfs_iunlock(ip, iolock);
 	mnt_drop_write_file(filp);
 	return error;
 }
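
xfs_ioc_space() services the preallocation ioctls named in the switch above. A sketch of the caller side of XFS_IOC_RESVSP64, assuming the xfsprogs headers are installed and a hypothetical file (the reservation leaves i_size unchanged, so XFS_PREALLOC_SET is taken):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <xfs/xfs.h>

int main(void)
{
	struct xfs_flock64	fl;
	int			fd = open("/mnt/xfs/resv.dat",
					  O_CREAT | O_RDWR, 0644);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	memset(&fl, 0, sizeof(fl));
	fl.l_whence = 0;	/* SEEK_SET: l_start is absolute */
	fl.l_start = 0;
	fl.l_len = 16 << 20;	/* reserve 16 MiB */
	if (ioctl(fd, XFS_IOC_RESVSP64, &fl) < 0) {
		perror("XFS_IOC_RESVSP64");
		return 1;
	}
	return close(fd);
}
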
@@ -1013,20 +992,182 @@ xfs_diflags_to_linux(
 	inode->i_flags &= ~S_NOATIME;
 }

-#define FSX_PROJID	1
-#define FSX_EXTSIZE	2
-#define FSX_XFLAGS	4
-#define FSX_NONBLOCK	8
+static int
+xfs_ioctl_setattr_xflags(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip,
+	struct fsxattr		*fa)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+
+	/* Can't change realtime flag if any extents are allocated. */
+	if ((ip->i_d.di_nextents || ip->i_delayed_blks) &&
+	    XFS_IS_REALTIME_INODE(ip) != (fa->fsx_xflags & XFS_XFLAG_REALTIME))
+		return -EINVAL;
+
+	/* If realtime flag is set then must have realtime device */
+	if (fa->fsx_xflags & XFS_XFLAG_REALTIME) {
+		if (mp->m_sb.sb_rblocks == 0 || mp->m_sb.sb_rextsize == 0 ||
+		    (ip->i_d.di_extsize % mp->m_sb.sb_rextsize))
+			return -EINVAL;
+	}
+
+	/*
+	 * Can't modify an immutable/append-only file unless
+	 * we have appropriate permission.
+	 */
+	if (((ip->i_d.di_flags & (XFS_DIFLAG_IMMUTABLE | XFS_DIFLAG_APPEND)) ||
+	     (fa->fsx_xflags & (XFS_XFLAG_IMMUTABLE | XFS_XFLAG_APPEND))) &&
+	    !capable(CAP_LINUX_IMMUTABLE))
+		return -EPERM;
+
+	xfs_set_diflags(ip, fa->fsx_xflags);
+	xfs_diflags_to_linux(ip);
+	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
+	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+	XFS_STATS_INC(xs_ig_attrchg);
+	return 0;
+}
+
+/*
+ * Set up the transaction structure for the setattr operation, checking that we
+ * have permission to do so. On success, return a clean transaction and the
+ * inode locked exclusively ready for further operation specific checks. On
+ * failure, return an error without modifying or locking the inode.
+ */
+static struct xfs_trans *
+xfs_ioctl_setattr_get_trans(
+	struct xfs_inode	*ip)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_trans	*tp;
+	int			error;
+
+	if (mp->m_flags & XFS_MOUNT_RDONLY)
+		return ERR_PTR(-EROFS);
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return ERR_PTR(-EIO);
+
+	tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
+	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
+	if (error)
+		goto out_cancel;
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
+
+	/*
+	 * CAP_FOWNER overrides the following restrictions:
+	 *
+	 * The user ID of the calling process must be equal to the file owner
+	 * ID, except in cases where the CAP_FSETID capability is applicable.
+	 */
+	if (!inode_owner_or_capable(VFS_I(ip))) {
+		error = -EPERM;
+		goto out_cancel;
+	}
+
+	if (mp->m_flags & XFS_MOUNT_WSYNC)
+		xfs_trans_set_sync(tp);
+
+	return tp;
+
+out_cancel:
+	xfs_trans_cancel(tp, 0);
+	return ERR_PTR(error);
+}
+
+/*
+ * extent size hint validation is somewhat cumbersome. Rules are:
+ *
+ * 1. extent size hint is only valid for directories and regular files
+ * 2. XFS_XFLAG_EXTSIZE is only valid for regular files
+ * 3. XFS_XFLAG_EXTSZINHERIT is only valid for directories.
+ * 4. can only be changed on regular files if no extents are allocated
+ * 5. can be changed on directories at any time
+ * 6. extsize hint of 0 turns off hints, clears inode flags.
+ * 7. Extent size must be a multiple of the appropriate block size.
+ * 8. for non-realtime files, the extent size hint must be limited
+ *    to half the AG size to avoid alignment extending the extent beyond the
+ *    limits of the AG.
+ */
+static int
+xfs_ioctl_setattr_check_extsize(
+	struct xfs_inode	*ip,
+	struct fsxattr		*fa)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+
+	if ((fa->fsx_xflags & XFS_XFLAG_EXTSIZE) && !S_ISREG(ip->i_d.di_mode))
+		return -EINVAL;
+
+	if ((fa->fsx_xflags & XFS_XFLAG_EXTSZINHERIT) &&
+	    !S_ISDIR(ip->i_d.di_mode))
+		return -EINVAL;
+
+	if (S_ISREG(ip->i_d.di_mode) && ip->i_d.di_nextents &&
+	    ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) != fa->fsx_extsize))
+		return -EINVAL;
+
+	if (fa->fsx_extsize != 0) {
+		xfs_extlen_t	size;
+		xfs_fsblock_t	extsize_fsb;
+
+		extsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_extsize);
+		if (extsize_fsb > MAXEXTLEN)
+			return -EINVAL;
+
+		if (XFS_IS_REALTIME_INODE(ip) ||
+		    (fa->fsx_xflags & XFS_XFLAG_REALTIME)) {
+			size = mp->m_sb.sb_rextsize << mp->m_sb.sb_blocklog;
+		} else {
+			size = mp->m_sb.sb_blocksize;
+			if (extsize_fsb > mp->m_sb.sb_agblocks / 2)
+				return -EINVAL;
+		}
+
+		if (fa->fsx_extsize % size)
+			return -EINVAL;
+	} else
+		fa->fsx_xflags &= ~(XFS_XFLAG_EXTSIZE | XFS_XFLAG_EXTSZINHERIT);
+
+	return 0;
+}
+
+static int
+xfs_ioctl_setattr_check_projid(
+	struct xfs_inode	*ip,
+	struct fsxattr		*fa)
+{
+	/* Disallow 32bit project ids if projid32bit feature is not enabled. */
+	if (fa->fsx_projid > (__uint16_t)-1 &&
+	    !xfs_sb_version_hasprojid32bit(&ip->i_mount->m_sb))
+		return -EINVAL;
+
+	/*
+	 * Project Quota ID state is only allowed to change from within the init
+	 * namespace. Enforce that restriction only if we are trying to change
+	 * the quota ID state. Everything else is allowed in user namespaces.
+	 */
+	if (current_user_ns() == &init_user_ns)
+		return 0;
+
+	if (xfs_get_projid(ip) != fa->fsx_projid)
+		return -EINVAL;
+	if ((fa->fsx_xflags & XFS_XFLAG_PROJINHERIT) !=
+	    (ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT))
+		return -EINVAL;
+
+	return 0;
+}

 STATIC int
 xfs_ioctl_setattr(
 	xfs_inode_t		*ip,
-	struct fsxattr		*fa,
-	int			mask)
+	struct fsxattr		*fa)
 {
 	struct xfs_mount	*mp = ip->i_mount;
 	struct xfs_trans	*tp;
-	unsigned int		lock_flags = 0;
 	struct xfs_dquot	*udqp = NULL;
 	struct xfs_dquot	*pdqp = NULL;
 	struct xfs_dquot	*olddquot = NULL;
@@ -1034,17 +1175,9 @@ xfs_ioctl_setattr(

 	trace_xfs_ioctl_setattr(ip);

-	if (mp->m_flags & XFS_MOUNT_RDONLY)
-		return -EROFS;
-	if (XFS_FORCED_SHUTDOWN(mp))
-		return -EIO;
-
-	/*
-	 * Disallow 32bit project ids when projid32bit feature is not enabled.
-	 */
-	if ((mask & FSX_PROJID) && (fa->fsx_projid > (__uint16_t)-1) &&
-	    !xfs_sb_version_hasprojid32bit(&ip->i_mount->m_sb))
-		return -EINVAL;
+	code = xfs_ioctl_setattr_check_projid(ip, fa);
+	if (code)
+		return code;

 	/*
 	 * If disk quotas is on, we make sure that the dquots do exist on disk,
@@ -1054,7 +1187,7 @@ xfs_ioctl_setattr(
 	 * If the IDs do change before we take the ilock, we're covered
 	 * because the i_*dquot fields will get updated anyway.
 	 */
-	if (XFS_IS_QUOTA_ON(mp) && (mask & FSX_PROJID)) {
+	if (XFS_IS_QUOTA_ON(mp)) {
 		code = xfs_qm_vop_dqalloc(ip, ip->i_d.di_uid,
 					 ip->i_d.di_gid, fa->fsx_projid,
 					 XFS_QMOPT_PQUOTA, &udqp, NULL, &pdqp);
@@ -1062,175 +1195,49 @@ xfs_ioctl_setattr(
 			return code;
 	}

-	/*
-	 * For the other attributes, we acquire the inode lock and
-	 * first do an error checking pass.
-	 */
-	tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
-	code = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
-	if (code)
-		goto error_return;
-
-	lock_flags = XFS_ILOCK_EXCL;
-	xfs_ilock(ip, lock_flags);
-
-	/*
-	 * CAP_FOWNER overrides the following restrictions:
-	 *
-	 * The user ID of the calling process must be equal
-	 * to the file owner ID, except in cases where the
-	 * CAP_FSETID capability is applicable.
-	 */
-	if (!inode_owner_or_capable(VFS_I(ip))) {
-		code = -EPERM;
-		goto error_return;
-	}
-
-	/*
-	 * Do a quota reservation only if projid is actually going to change.
-	 * Only allow changing of projid from init_user_ns since it is a
-	 * non user namespace aware identifier.
-	 */
-	if (mask & FSX_PROJID) {
-		if (current_user_ns() != &init_user_ns) {
-			code = -EINVAL;
-			goto error_return;
-		}
-
-		if (XFS_IS_QUOTA_RUNNING(mp) &&
-		    XFS_IS_PQUOTA_ON(mp) &&
-		    xfs_get_projid(ip) != fa->fsx_projid) {
-			ASSERT(tp);
-			code = xfs_qm_vop_chown_reserve(tp, ip, udqp, NULL,
-						pdqp, capable(CAP_FOWNER) ?
-						XFS_QMOPT_FORCE_RES : 0);
-			if (code)	/* out of quota */
-				goto error_return;
-		}
+	tp = xfs_ioctl_setattr_get_trans(ip);
+	if (IS_ERR(tp)) {
+		code = PTR_ERR(tp);
+		goto error_free_dquots;
 	}

-	if (mask & FSX_EXTSIZE) {
-		/*
-		 * Can't change extent size if any extents are allocated.
-		 */
-		if (ip->i_d.di_nextents &&
-		    ((ip->i_d.di_extsize << mp->m_sb.sb_blocklog) !=
-		     fa->fsx_extsize)) {
-			code = -EINVAL;	/* EFBIG? */
-			goto error_return;
-		}
-
-		/*
-		 * Extent size must be a multiple of the appropriate block
-		 * size, if set at all. It must also be smaller than the
-		 * maximum extent size supported by the filesystem.
-		 *
-		 * Also, for non-realtime files, limit the extent size hint to
-		 * half the size of the AGs in the filesystem so alignment
-		 * doesn't result in extents larger than an AG.
-		 */
-		if (fa->fsx_extsize != 0) {
-			xfs_extlen_t	size;
-			xfs_fsblock_t	extsize_fsb;
-
-			extsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_extsize);
-			if (extsize_fsb > MAXEXTLEN) {
-				code = -EINVAL;
-				goto error_return;
-			}
-
-			if (XFS_IS_REALTIME_INODE(ip) ||
-			    ((mask & FSX_XFLAGS) &&
-			    (fa->fsx_xflags & XFS_XFLAG_REALTIME))) {
-				size = mp->m_sb.sb_rextsize <<
-				       mp->m_sb.sb_blocklog;
-			} else {
-				size = mp->m_sb.sb_blocksize;
-				if (extsize_fsb > mp->m_sb.sb_agblocks / 2) {
-					code = -EINVAL;
-					goto error_return;
-				}
-			}
-
-			if (fa->fsx_extsize % size) {
-				code = -EINVAL;
-				goto error_return;
-			}
-		}
+	if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp) &&
+	    xfs_get_projid(ip) != fa->fsx_projid) {
+		code = xfs_qm_vop_chown_reserve(tp, ip, udqp, NULL, pdqp,
+				capable(CAP_FOWNER) ? XFS_QMOPT_FORCE_RES : 0);
+		if (code)	/* out of quota */
+			goto error_trans_cancel;
 	}

+	code = xfs_ioctl_setattr_check_extsize(ip, fa);
+	if (code)
+		goto error_trans_cancel;

-	if (mask & FSX_XFLAGS) {
-		/*
-		 * Can't change realtime flag if any extents are allocated.
-		 */
-		if ((ip->i_d.di_nextents || ip->i_delayed_blks) &&
-		    (XFS_IS_REALTIME_INODE(ip)) !=
-		    (fa->fsx_xflags & XFS_XFLAG_REALTIME)) {
-			code = -EINVAL;	/* EFBIG? */
-			goto error_return;
-		}
-
-		/*
-		 * If realtime flag is set then must have realtime data.
-		 */
-		if ((fa->fsx_xflags & XFS_XFLAG_REALTIME)) {
-			if ((mp->m_sb.sb_rblocks == 0) ||
-			    (mp->m_sb.sb_rextsize == 0) ||
-			    (ip->i_d.di_extsize % mp->m_sb.sb_rextsize)) {
-				code = -EINVAL;
-				goto error_return;
-			}
-		}
-
-		/*
-		 * Can't modify an immutable/append-only file unless
-		 * we have appropriate permission.
-		 */
-		if ((ip->i_d.di_flags &
-				(XFS_DIFLAG_IMMUTABLE|XFS_DIFLAG_APPEND) ||
-		     (fa->fsx_xflags &
-				(XFS_XFLAG_IMMUTABLE | XFS_XFLAG_APPEND))) &&
-		    !capable(CAP_LINUX_IMMUTABLE)) {
-			code = -EPERM;
-			goto error_return;
-		}
-	}
-
-	xfs_trans_ijoin(tp, ip, 0);
+	code = xfs_ioctl_setattr_xflags(tp, ip, fa);
+	if (code)
+		goto error_trans_cancel;

 	/*
-	 * Change file ownership.  Must be the owner or privileged.
+	 * Change file ownership.  Must be the owner or privileged.  CAP_FSETID
+	 * overrides the following restrictions:
+	 *
+	 * The set-user-ID and set-group-ID bits of a file will be cleared upon
+	 * successful return from chown()
 	 */
-	if (mask & FSX_PROJID) {
-		/*
-		 * CAP_FSETID overrides the following restrictions:
-		 *
-		 * The set-user-ID and set-group-ID bits of a file will be
-		 * cleared upon successful return from chown()
-		 */
-		if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) &&
-		    !capable_wrt_inode_uidgid(VFS_I(ip), CAP_FSETID))
-			ip->i_d.di_mode &= ~(S_ISUID|S_ISGID);
-
-		/*
-		 * Change the ownerships and register quota modifications
-		 * in the transaction.
-		 */
-		if (xfs_get_projid(ip) != fa->fsx_projid) {
-			if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp)) {
-				olddquot = xfs_qm_vop_chown(tp, ip,
-							&ip->i_pdquot, pdqp);
-			}
-			ASSERT(ip->i_d.di_version > 1);
-			xfs_set_projid(ip, fa->fsx_projid);
-		}
-	}

-	if (mask & FSX_XFLAGS) {
-		xfs_set_diflags(ip, fa->fsx_xflags);
-		xfs_diflags_to_linux(ip);
+	if ((ip->i_d.di_mode & (S_ISUID|S_ISGID)) &&
+	    !capable_wrt_inode_uidgid(VFS_I(ip), CAP_FSETID))
+		ip->i_d.di_mode &= ~(S_ISUID|S_ISGID);
+
+	/* Change the ownerships and register project quota modifications */
+	if (xfs_get_projid(ip) != fa->fsx_projid) {
+		if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp)) {
+			olddquot = xfs_qm_vop_chown(tp, ip,
+						&ip->i_pdquot, pdqp);
+		}
+		ASSERT(ip->i_d.di_version > 1);
+		xfs_set_projid(ip, fa->fsx_projid);
 	}

 	/*
@@ -1238,34 +1245,12 @@ xfs_ioctl_setattr(
 	 * extent size hint should be set on the inode. If no extent size flags
 	 * are set on the inode then unconditionally clear the extent size hint.
 	 */
-	if (mask & FSX_EXTSIZE) {
-		int	extsize = 0;
-
-		if (ip->i_d.di_flags &
-				(XFS_DIFLAG_EXTSIZE | XFS_DIFLAG_EXTSZINHERIT))
-			extsize = fa->fsx_extsize >> mp->m_sb.sb_blocklog;
-		ip->i_d.di_extsize = extsize;
-	}
-
-	xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG);
-	xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
-
-	XFS_STATS_INC(xs_ig_attrchg);
+	if (ip->i_d.di_flags & (XFS_DIFLAG_EXTSIZE | XFS_DIFLAG_EXTSZINHERIT))
+		ip->i_d.di_extsize = fa->fsx_extsize >> mp->m_sb.sb_blocklog;
+	else
+		ip->i_d.di_extsize = 0;

-	/*
-	 * If this is a synchronous mount, make sure that the
-	 * transaction goes to disk before returning to the user.
-	 * This is slightly sub-optimal in that truncates require
-	 * two sync transactions instead of one for wsync filesystems.
-	 * One for the truncate and one for the timestamps since we
-	 * don't want to change the timestamps unless we're sure the
-	 * truncate worked. Truncates are less than 1% of the laddis
-	 * mix so this probably isn't worth the trouble to optimize.
-	 */
-	if (mp->m_flags & XFS_MOUNT_WSYNC)
-		xfs_trans_set_sync(tp);
 	code = xfs_trans_commit(tp, 0);
-	xfs_iunlock(ip, lock_flags);

 	/*
 	 * Release any dquot(s) the inode had kept before chown.
@@ -1276,12 +1261,11 @@ xfs_ioctl_setattr(

 	return code;

-error_return:
+error_trans_cancel:
+	xfs_trans_cancel(tp, 0);
+error_free_dquots:
 	xfs_qm_dqrele(udqp);
 	xfs_qm_dqrele(pdqp);
-	xfs_trans_cancel(tp, 0);
-	if (lock_flags)
-		xfs_iunlock(ip, lock_flags);
 	return code;
 }

@@ -1292,20 +1276,15 @@ xfs_ioc_fssetxattr(
 	void			__user *arg)
 {
 	struct fsxattr		fa;
-	unsigned int		mask;
 	int			error;

 	if (copy_from_user(&fa, arg, sizeof(fa)))
 		return -EFAULT;

-	mask = FSX_XFLAGS | FSX_EXTSIZE | FSX_PROJID;
-	if (filp->f_flags & (O_NDELAY|O_NONBLOCK))
-		mask |= FSX_NONBLOCK;
-
 	error = mnt_want_write_file(filp);
 	if (error)
 		return error;
-	error = xfs_ioctl_setattr(ip, &fa, mask);
+	error = xfs_ioctl_setattr(ip, &fa);
 	mnt_drop_write_file(filp);
 	return error;
 }
@@ -1325,14 +1304,14 @@ xfs_ioc_getxflags(

 STATIC int
 xfs_ioc_setxflags(
-	xfs_inode_t		*ip,
+	struct xfs_inode	*ip,
 	struct file		*filp,
 	void			__user *arg)
 {
+	struct xfs_trans	*tp;
 	struct fsxattr		fa;
 	unsigned int		flags;
-	unsigned int		mask;
 	int			error;

 	if (copy_from_user(&flags, arg, sizeof(flags)))
 		return -EFAULT;
@@ -1342,15 +1321,26 @@ xfs_ioc_setxflags(
 			  FS_SYNC_FL))
 		return -EOPNOTSUPP;

-	mask = FSX_XFLAGS;
-	if (filp->f_flags & (O_NDELAY|O_NONBLOCK))
-		mask |= FSX_NONBLOCK;
 	fa.fsx_xflags = xfs_merge_ioc_xflags(flags, xfs_ip2xflags(ip));

 	error = mnt_want_write_file(filp);
 	if (error)
 		return error;
-	error = xfs_ioctl_setattr(ip, &fa, mask);
+
+	tp = xfs_ioctl_setattr_get_trans(ip);
+	if (IS_ERR(tp)) {
+		error = PTR_ERR(tp);
+		goto out_drop_write;
+	}
+
+	error = xfs_ioctl_setattr_xflags(tp, ip, &fa);
+	if (error) {
+		xfs_trans_cancel(tp, 0);
+		goto out_drop_write;
+	}
+
+	error = xfs_trans_commit(tp, 0);
+out_drop_write:
 	mnt_drop_write_file(filp);
 	return error;
 }
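
The extent size hint rules enforced by xfs_ioctl_setattr_check_extsize() are reachable from userspace through the FSGETXATTR/FSSETXATTR ioctls. A sketch of setting a 1 MiB hint on an empty regular file (the path is hypothetical), which must satisfy rules 2, 4 and 7 from the comment above:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <xfs/xfs.h>

int main(void)
{
	struct fsxattr	fa;
	int		fd = open("/mnt/xfs/hinted.dat",
				  O_CREAT | O_RDWR, 0644);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (ioctl(fd, XFS_IOC_FSGETXATTR, &fa) < 0) {
		perror("FSGETXATTR");
		return 1;
	}
	fa.fsx_xflags |= XFS_XFLAG_EXTSIZE;	/* rule 2: regular file */
	fa.fsx_extsize = 1 << 20;		/* rule 7: blocksize multiple */
	if (ioctl(fd, XFS_IOC_FSSETXATTR, &fa) < 0) {
		perror("FSSETXATTR");
		return 1;
	}
	return close(fd);
}
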
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index ec6772866f3d..bfc7c7c8a0c8 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -423,7 +423,7 @@ xfs_compat_attrmulti_by_handle(

 	ops = memdup_user(compat_ptr(am_hreq.ops), size);
 	if (IS_ERR(ops)) {
-		error = -PTR_ERR(ops);
+		error = PTR_ERR(ops);
 		goto out_dput;
 	}

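
The one-character fix above matters because PTR_ERR() already yields a negative errno, so the extra negation turned, for example, -ENOMEM into a positive value and the failure went unreported. A condensed sketch of the intended idiom (demo_copy is illustrative, not kernel code):

#include <linux/err.h>
#include <linux/slab.h>
#include <linux/string.h>

static int demo_copy(const void __user *uptr, size_t size)
{
	void *buf = memdup_user(uptr, size);

	if (IS_ERR(buf))
		return PTR_ERR(buf);	/* already negative: -EFAULT/-ENOMEM */
	/* ... consume buf ... */
	kfree(buf);
	return 0;
}
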
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index c980e2a5086b..ccb1dd0d509e 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -802,7 +802,7 @@ int
 xfs_iomap_write_unwritten(
 	xfs_inode_t	*ip,
 	xfs_off_t	offset,
-	size_t		count)
+	xfs_off_t	count)
 {
 	xfs_mount_t	*mp = ip->i_mount;
 	xfs_fileoff_t	offset_fsb;
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index 411fbb8919ef..8688e663d744 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -27,6 +27,6 @@ int xfs_iomap_write_delay(struct xfs_inode *, xfs_off_t, size_t,
 			struct xfs_bmbt_irec *);
 int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t,
 			struct xfs_bmbt_irec *);
-int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, size_t);
+int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t);

 #endif /* __XFS_IOMAP_H__*/
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index c50311cae1b1..d919ad7b16bf 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -37,6 +37,7 @@
 #include "xfs_da_btree.h"
 #include "xfs_dir2.h"
 #include "xfs_trans_space.h"
+#include "xfs_pnfs.h"

 #include <linux/capability.h>
 #include <linux/xattr.h>
@@ -380,18 +381,27 @@ xfs_vn_rename(
 	struct inode	*odir,
 	struct dentry	*odentry,
 	struct inode	*ndir,
-	struct dentry	*ndentry)
+	struct dentry	*ndentry,
+	unsigned int	flags)
 {
 	struct inode	*new_inode = ndentry->d_inode;
+	int		omode = 0;
 	struct xfs_name	oname;
 	struct xfs_name	nname;

-	xfs_dentry_to_name(&oname, odentry, 0);
+	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
+		return -EINVAL;
+
+	/* if we are exchanging files, we need to set i_mode of both files */
+	if (flags & RENAME_EXCHANGE)
+		omode = ndentry->d_inode->i_mode;
+
+	xfs_dentry_to_name(&oname, odentry, omode);
 	xfs_dentry_to_name(&nname, ndentry, odentry->d_inode->i_mode);

 	return xfs_rename(XFS_I(odir), &oname, XFS_I(odentry->d_inode),
-			  XFS_I(ndir), &nname, new_inode ?
-						XFS_I(new_inode) : NULL);
+			  XFS_I(ndir), &nname,
+			  new_inode ? XFS_I(new_inode) : NULL, flags);
 }

 /*
@@ -496,7 +506,7 @@ xfs_setattr_mode(
 	inode->i_mode |= mode & ~S_IFMT;
 }

-static void
+void
 xfs_setattr_time(
 	struct xfs_inode	*ip,
 	struct iattr		*iattr)
@@ -970,9 +980,13 @@ xfs_vn_setattr(
 	int		error;

 	if (iattr->ia_valid & ATTR_SIZE) {
-		xfs_ilock(ip, XFS_IOLOCK_EXCL);
-		error = xfs_setattr_size(ip, iattr);
-		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+		uint	iolock = XFS_IOLOCK_EXCL;
+
+		xfs_ilock(ip, iolock);
+		error = xfs_break_layouts(dentry->d_inode, &iolock);
+		if (!error)
+			error = xfs_setattr_size(ip, iattr);
+		xfs_iunlock(ip, iolock);
 	} else {
 		error = xfs_setattr_nonsize(ip, iattr, 0);
 	}
@@ -1144,7 +1158,7 @@ static const struct inode_operations xfs_dir_inode_operations = {
 	 */
 	.rmdir			= xfs_vn_unlink,
 	.mknod			= xfs_vn_mknod,
-	.rename			= xfs_vn_rename,
+	.rename2		= xfs_vn_rename,
 	.get_acl		= xfs_get_acl,
 	.set_acl		= xfs_set_acl,
 	.getattr		= xfs_vn_getattr,
@@ -1172,7 +1186,7 @@ static const struct inode_operations xfs_dir_ci_inode_operations = {
 	 */
 	.rmdir			= xfs_vn_unlink,
 	.mknod			= xfs_vn_mknod,
-	.rename			= xfs_vn_rename,
+	.rename2		= xfs_vn_rename,
 	.get_acl		= xfs_get_acl,
 	.set_acl		= xfs_set_acl,
 	.getattr		= xfs_vn_getattr,
diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h
index 1c34e4335920..ea7a98e9cb70 100644
--- a/fs/xfs/xfs_iops.h
+++ b/fs/xfs/xfs_iops.h
@@ -32,6 +32,7 @@ extern void xfs_setup_inode(struct xfs_inode *);
  */
 #define XFS_ATTR_NOACL		0x01	/* Don't call posix_acl_chmod */

+extern void xfs_setattr_time(struct xfs_inode *ip, struct iattr *iattr);
 extern int xfs_setattr_nonsize(struct xfs_inode *ip, struct iattr *vap,
 			       int flags);
 extern int xfs_setattr_size(struct xfs_inode *ip, struct iattr *vap);
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index e408bf5a3ff7..bcc7cfabb787 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -33,6 +33,7 @@
 #include "xfs_fsops.h"
 #include "xfs_cksum.h"
 #include "xfs_sysfs.h"
+#include "xfs_sb.h"

 kmem_zone_t	*xfs_log_ticket_zone;

@@ -1290,9 +1291,20 @@ xfs_log_worker(
 	struct xfs_mount	*mp = log->l_mp;

 	/* dgc: errors ignored - not fatal and nowhere to report them */
-	if (xfs_log_need_covered(mp))
-		xfs_fs_log_dummy(mp);
-	else
+	if (xfs_log_need_covered(mp)) {
+		/*
+		 * Dump a transaction into the log that contains no real change.
+		 * This is needed to stamp the current tail LSN into the log
+		 * during the covering operation.
+		 *
+		 * We cannot use an inode here for this - that will push dirty
+		 * state back up into the VFS and then periodic inode flushing
+		 * will prevent log covering from making progress. Hence we
+		 * synchronously log the superblock instead to ensure the
+		 * superblock is immediately unpinned and can be written back.
+		 */
+		xfs_sync_sb(mp, true);
+	} else
 		xfs_log_force(mp, 0);

 	/* start pushing all the metadata that is currently dirty */
@@ -1395,6 +1407,8 @@ xlog_alloc_log(
 	ASSERT(xfs_buf_islocked(bp));
 	xfs_buf_unlock(bp);

+	/* use high priority wq for log I/O completion */
+	bp->b_ioend_wq = mp->m_log_workqueue;
 	bp->b_iodone = xlog_iodone;
 	log->l_xbuf = bp;

@@ -1427,6 +1441,8 @@ xlog_alloc_log(
1427 ASSERT(xfs_buf_islocked(bp)); 1441 ASSERT(xfs_buf_islocked(bp));
1428 xfs_buf_unlock(bp); 1442 xfs_buf_unlock(bp);
1429 1443
1444 /* use high priority wq for log I/O completion */
1445 bp->b_ioend_wq = mp->m_log_workqueue;
1430 bp->b_iodone = xlog_iodone; 1446 bp->b_iodone = xlog_iodone;
1431 iclog->ic_bp = bp; 1447 iclog->ic_bp = bp;
1432 iclog->ic_data = bp->b_addr; 1448 iclog->ic_data = bp->b_addr;
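
/*
 * The workqueue assigned above is created at mount time; a sketch of
 * the allocation this relies on (exact flags are an assumption, but
 * WQ_HIGHPRI is the point of routing log buffer completion here):
 */
mp->m_log_workqueue = alloc_workqueue("xfs-log/%s",
                WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_HIGHPRI, 0,
                mp->m_fsname);
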
@@ -1806,8 +1822,6 @@ xlog_sync(
1806 XFS_BUF_ZEROFLAGS(bp); 1822 XFS_BUF_ZEROFLAGS(bp);
1807 XFS_BUF_ASYNC(bp); 1823 XFS_BUF_ASYNC(bp);
1808 bp->b_flags |= XBF_SYNCIO; 1824 bp->b_flags |= XBF_SYNCIO;
1809 /* use high priority completion wq */
1810 bp->b_ioend_wq = log->l_mp->m_log_workqueue;
1811 1825
1812 if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) { 1826 if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) {
1813 bp->b_flags |= XBF_FUA; 1827 bp->b_flags |= XBF_FUA;
@@ -1856,8 +1870,6 @@ xlog_sync(
1856 bp->b_flags |= XBF_SYNCIO; 1870 bp->b_flags |= XBF_SYNCIO;
1857 if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) 1871 if (log->l_mp->m_flags & XFS_MOUNT_BARRIER)
1858 bp->b_flags |= XBF_FUA; 1872 bp->b_flags |= XBF_FUA;
1859 /* use high priority completion wq */
1860 bp->b_ioend_wq = log->l_mp->m_log_workqueue;
1861 1873
1862 ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1); 1874 ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1);
1863 ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize); 1875 ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize);
@@ -2027,7 +2039,7 @@ xlog_print_tic_res(
2027 " total reg = %u bytes (o/flow = %u bytes)\n" 2039 " total reg = %u bytes (o/flow = %u bytes)\n"
2028 " ophdrs = %u (ophdr space = %u bytes)\n" 2040 " ophdrs = %u (ophdr space = %u bytes)\n"
2029 " ophdr + reg = %u bytes\n" 2041 " ophdr + reg = %u bytes\n"
2030 " num regions = %u\n", 2042 " num regions = %u",
2031 ((ticket->t_trans_type <= 0 || 2043 ((ticket->t_trans_type <= 0 ||
2032 ticket->t_trans_type > XFS_TRANS_TYPE_MAX) ? 2044 ticket->t_trans_type > XFS_TRANS_TYPE_MAX) ?
2033 "bad-trans-type" : trans_type_str[ticket->t_trans_type-1]), 2045 "bad-trans-type" : trans_type_str[ticket->t_trans_type-1]),
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index d3d38836f87f..4fa80e63eea2 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -408,11 +408,11 @@ xfs_update_alignment(xfs_mount_t *mp)
408 if (xfs_sb_version_hasdalign(sbp)) { 408 if (xfs_sb_version_hasdalign(sbp)) {
409 if (sbp->sb_unit != mp->m_dalign) { 409 if (sbp->sb_unit != mp->m_dalign) {
410 sbp->sb_unit = mp->m_dalign; 410 sbp->sb_unit = mp->m_dalign;
411 mp->m_update_flags |= XFS_SB_UNIT; 411 mp->m_update_sb = true;
412 } 412 }
413 if (sbp->sb_width != mp->m_swidth) { 413 if (sbp->sb_width != mp->m_swidth) {
414 sbp->sb_width = mp->m_swidth; 414 sbp->sb_width = mp->m_swidth;
415 mp->m_update_flags |= XFS_SB_WIDTH; 415 mp->m_update_sb = true;
416 } 416 }
417 } else { 417 } else {
418 xfs_warn(mp, 418 xfs_warn(mp,
@@ -583,38 +583,19 @@ int
583xfs_mount_reset_sbqflags( 583xfs_mount_reset_sbqflags(
584 struct xfs_mount *mp) 584 struct xfs_mount *mp)
585{ 585{
586 int error;
587 struct xfs_trans *tp;
588
589 mp->m_qflags = 0; 586 mp->m_qflags = 0;
590 587
591 /* 588 /* It is OK to look at sb_qflags in the mount path without m_sb_lock. */
592 * It is OK to look at sb_qflags here in mount path,
593 * without m_sb_lock.
594 */
595 if (mp->m_sb.sb_qflags == 0) 589 if (mp->m_sb.sb_qflags == 0)
596 return 0; 590 return 0;
597 spin_lock(&mp->m_sb_lock); 591 spin_lock(&mp->m_sb_lock);
598 mp->m_sb.sb_qflags = 0; 592 mp->m_sb.sb_qflags = 0;
599 spin_unlock(&mp->m_sb_lock); 593 spin_unlock(&mp->m_sb_lock);
600 594
601 /* 595 if (!xfs_fs_writable(mp, SB_FREEZE_WRITE))
602 * If the fs is readonly, let the incore superblock run
603 * with quotas off but don't flush the update out to disk
604 */
605 if (mp->m_flags & XFS_MOUNT_RDONLY)
606 return 0; 596 return 0;
607 597
608 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE); 598 return xfs_sync_sb(mp, false);
609 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_sbchange, 0, 0);
610 if (error) {
611 xfs_trans_cancel(tp, 0);
612 xfs_alert(mp, "%s: Superblock update failed!", __func__);
613 return error;
614 }
615
616 xfs_mod_sb(tp, XFS_SB_QFLAGS);
617 return xfs_trans_commit(tp, 0);
618} 599}
619 600
620__uint64_t 601__uint64_t
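
/*
 * Sketch of the xfs_fs_writable() contract assumed by the call above:
 * the freeze level argument lets callers state how deep into the
 * freeze sequence writes are still acceptable. Body inferred, not
 * shown in this diff:
 */
bool
xfs_fs_writable(
        struct xfs_mount        *mp,
        int                     level)
{
        ASSERT(level > SB_UNFROZEN);
        if ((mp->m_super->s_writers.frozen >= level) ||
            XFS_FORCED_SHUTDOWN(mp) || (mp->m_flags & XFS_MOUNT_RDONLY))
                return false;
        return true;
}
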
@@ -659,26 +640,25 @@ xfs_mountfs(
659 xfs_sb_mount_common(mp, sbp); 640 xfs_sb_mount_common(mp, sbp);
660 641
661 /* 642 /*
662 * Check for mismatched features2 values. Older kernels 643 * Check for mismatched features2 values. Older kernels read & wrote
663 * read & wrote into the wrong sb offset for sb_features2 644 * into the wrong sb offset for sb_features2 on some platforms due to
664 * on some platforms due to xfs_sb_t not being 64bit size aligned 645 * xfs_sb_t not being 64bit size aligned when sb_features2 was added,
665 * when sb_features2 was added, which made older superblock 646 * which made older superblock reading/writing routines swap it as a
666 * reading/writing routines swap it as a 64-bit value. 647 * 64-bit value.
667 * 648 *
668 * For backwards compatibility, we make both slots equal. 649 * For backwards compatibility, we make both slots equal.
669 * 650 *
670 * If we detect a mismatched field, we OR the set bits into the 651 * If we detect a mismatched field, we OR the set bits into the existing
671 * existing features2 field in case it has already been modified; we 652 * features2 field in case it has already been modified; we don't want
672 * don't want to lose any features. We then update the bad location 653 * to lose any features. We then update the bad location with the ORed
673 * with the ORed value so that older kernels will see any features2 654 * value so that older kernels will see any features2 flags. The
674 * flags, and mark the two fields as needing updates once the 655 * superblock writeback code ensures the new sb_features2 is copied to
675 * transaction subsystem is online. 656 * sb_bad_features2 before it is logged or written to disk.
676 */ 657 */
677 if (xfs_sb_has_mismatched_features2(sbp)) { 658 if (xfs_sb_has_mismatched_features2(sbp)) {
678 xfs_warn(mp, "correcting sb_features alignment problem"); 659 xfs_warn(mp, "correcting sb_features alignment problem");
679 sbp->sb_features2 |= sbp->sb_bad_features2; 660 sbp->sb_features2 |= sbp->sb_bad_features2;
680 sbp->sb_bad_features2 = sbp->sb_features2; 661 mp->m_update_sb = true;
681 mp->m_update_flags |= XFS_SB_FEATURES2 | XFS_SB_BAD_FEATURES2;
682 662
683 /* 663 /*
684 * Re-check for ATTR2 in case it was found in bad_features2 664 * Re-check for ATTR2 in case it was found in bad_features2
@@ -692,17 +672,17 @@ xfs_mountfs(
692 if (xfs_sb_version_hasattr2(&mp->m_sb) && 672 if (xfs_sb_version_hasattr2(&mp->m_sb) &&
693 (mp->m_flags & XFS_MOUNT_NOATTR2)) { 673 (mp->m_flags & XFS_MOUNT_NOATTR2)) {
694 xfs_sb_version_removeattr2(&mp->m_sb); 674 xfs_sb_version_removeattr2(&mp->m_sb);
695 mp->m_update_flags |= XFS_SB_FEATURES2; 675 mp->m_update_sb = true;
696 676
697 /* update sb_versionnum for the clearing of the morebits */ 677 /* update sb_versionnum for the clearing of the morebits */
698 if (!sbp->sb_features2) 678 if (!sbp->sb_features2)
699 mp->m_update_flags |= XFS_SB_VERSIONNUM; 679 mp->m_update_sb = true;
700 } 680 }
701 681
702 /* always use v2 inodes by default now */ 682 /* always use v2 inodes by default now */
703 if (!(mp->m_sb.sb_versionnum & XFS_SB_VERSION_NLINKBIT)) { 683 if (!(mp->m_sb.sb_versionnum & XFS_SB_VERSION_NLINKBIT)) {
704 mp->m_sb.sb_versionnum |= XFS_SB_VERSION_NLINKBIT; 684 mp->m_sb.sb_versionnum |= XFS_SB_VERSION_NLINKBIT;
705 mp->m_update_flags |= XFS_SB_VERSIONNUM; 685 mp->m_update_sb = true;
706 } 686 }
707 687
708 /* 688 /*
@@ -895,8 +875,8 @@ xfs_mountfs(
895 * the next remount into writeable mode. Otherwise we would never 875 * the next remount into writeable mode. Otherwise we would never
896 * perform the update e.g. for the root filesystem. 876 * perform the update e.g. for the root filesystem.
897 */ 877 */
898 if (mp->m_update_flags && !(mp->m_flags & XFS_MOUNT_RDONLY)) { 878 if (mp->m_update_sb && !(mp->m_flags & XFS_MOUNT_RDONLY)) {
899 error = xfs_mount_log_sb(mp, mp->m_update_flags); 879 error = xfs_sync_sb(mp, false);
900 if (error) { 880 if (error) {
901 xfs_warn(mp, "failed to write sb changes"); 881 xfs_warn(mp, "failed to write sb changes");
902 goto out_rtunmount; 882 goto out_rtunmount;
@@ -1103,9 +1083,6 @@ xfs_fs_writable(
1103int 1083int
1104xfs_log_sbcount(xfs_mount_t *mp) 1084xfs_log_sbcount(xfs_mount_t *mp)
1105{ 1085{
1106 xfs_trans_t *tp;
1107 int error;
1108
1109 /* allow this to proceed during the freeze sequence... */ 1086 /* allow this to proceed during the freeze sequence... */
1110 if (!xfs_fs_writable(mp, SB_FREEZE_COMPLETE)) 1087 if (!xfs_fs_writable(mp, SB_FREEZE_COMPLETE))
1111 return 0; 1088 return 0;
@@ -1119,17 +1096,7 @@ xfs_log_sbcount(xfs_mount_t *mp)
1119 if (!xfs_sb_version_haslazysbcount(&mp->m_sb)) 1096 if (!xfs_sb_version_haslazysbcount(&mp->m_sb))
1120 return 0; 1097 return 0;
1121 1098
1122 tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_COUNT, KM_SLEEP); 1099 return xfs_sync_sb(mp, true);
1123 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_sb, 0, 0);
1124 if (error) {
1125 xfs_trans_cancel(tp, 0);
1126 return error;
1127 }
1128
1129 xfs_mod_sb(tp, XFS_SB_IFREE | XFS_SB_ICOUNT | XFS_SB_FDBLOCKS);
1130 xfs_trans_set_sync(tp);
1131 error = xfs_trans_commit(tp, 0);
1132 return error;
1133} 1100}
1134 1101
1135/* 1102/*
@@ -1423,34 +1390,6 @@ xfs_freesb(
1423} 1390}
1424 1391
1425/* 1392/*
1426 * Used to log changes to the superblock unit and width fields which could
1427 * be altered by the mount options, as well as any potential sb_features2
1428 * fixup. Only the first superblock is updated.
1429 */
1430int
1431xfs_mount_log_sb(
1432 xfs_mount_t *mp,
1433 __int64_t fields)
1434{
1435 xfs_trans_t *tp;
1436 int error;
1437
1438 ASSERT(fields & (XFS_SB_UNIT | XFS_SB_WIDTH | XFS_SB_UUID |
1439 XFS_SB_FEATURES2 | XFS_SB_BAD_FEATURES2 |
1440 XFS_SB_VERSIONNUM));
1441
1442 tp = xfs_trans_alloc(mp, XFS_TRANS_SB_UNIT);
1443 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_sb, 0, 0);
1444 if (error) {
1445 xfs_trans_cancel(tp, 0);
1446 return error;
1447 }
1448 xfs_mod_sb(tp, fields);
1449 error = xfs_trans_commit(tp, 0);
1450 return error;
1451}
1452
1453/*
1454 * If the underlying (data/log/rt) device is readonly, there are some 1393 * If the underlying (data/log/rt) device is readonly, there are some
1455 * operations that cannot proceed. 1394 * operations that cannot proceed.
1456 */ 1395 */
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 22ccf69d4d3c..0d8abd6364d9 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -162,8 +162,7 @@ typedef struct xfs_mount {
162 struct delayed_work m_reclaim_work; /* background inode reclaim */ 162 struct delayed_work m_reclaim_work; /* background inode reclaim */
163 struct delayed_work m_eofblocks_work; /* background eof blocks 163 struct delayed_work m_eofblocks_work; /* background eof blocks
164 trimming */ 164 trimming */
165 __int64_t m_update_flags; /* sb flags we need to update 165 bool m_update_sb; /* sb needs update in mount */
166 on the next remount,rw */
167 int64_t m_low_space[XFS_LOWSP_MAX]; 166 int64_t m_low_space[XFS_LOWSP_MAX];
168 /* low free space thresholds */ 167 /* low free space thresholds */
169 struct xfs_kobj m_kobj; 168 struct xfs_kobj m_kobj;
@@ -175,6 +174,17 @@ typedef struct xfs_mount {
175 struct workqueue_struct *m_reclaim_workqueue; 174 struct workqueue_struct *m_reclaim_workqueue;
176 struct workqueue_struct *m_log_workqueue; 175 struct workqueue_struct *m_log_workqueue;
177 struct workqueue_struct *m_eofblocks_workqueue; 176 struct workqueue_struct *m_eofblocks_workqueue;
177
178 /*
179 * Generation of the filesystem layout. This is incremented by each
180 * growfs, and used by the pNFS server to ensure the client updates
181 * its view of the block device once it gets a layout that might
182 * reference the newly added blocks. Does not need to be persistent
183 * as long as we only allow file system size increments, but if we
184 * ever support shrinks it would have to be persisted in addition
185 * to various other kinds of pain inflicted on the pNFS server.
186 */
187 __uint32_t m_generation;
178} xfs_mount_t; 188} xfs_mount_t;
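
/*
 * Sketch of the producer side, assuming the growfs hook added
 * elsewhere in this series: the counter only needs to change across a
 * grow, so a bare increment under the grow lock is enough.
 */
int
xfs_growfs_data(
        struct xfs_mount        *mp,
        xfs_growfs_data_t       *in)
{
        int                     error;

        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
        if (!mutex_trylock(&mp->m_growlock))
                return -EWOULDBLOCK;
        error = xfs_growfs_data_private(mp, in);
        /* bump unconditionally; a layout may already be outstanding */
        mp->m_generation++;
        mutex_unlock(&mp->m_growlock);
        return error;
}
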
179 189
180/* 190/*
@@ -378,7 +388,7 @@ extern void xfs_unmountfs(xfs_mount_t *);
378extern int xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int64_t, int); 388extern int xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int64_t, int);
379extern int xfs_mod_incore_sb_batch(xfs_mount_t *, xfs_mod_sb_t *, 389extern int xfs_mod_incore_sb_batch(xfs_mount_t *, xfs_mod_sb_t *,
380 uint, int); 390 uint, int);
381extern int xfs_mount_log_sb(xfs_mount_t *, __int64_t); 391extern int xfs_mount_log_sb(xfs_mount_t *);
382extern struct xfs_buf *xfs_getsb(xfs_mount_t *, int); 392extern struct xfs_buf *xfs_getsb(xfs_mount_t *, int);
383extern int xfs_readsb(xfs_mount_t *, int); 393extern int xfs_readsb(xfs_mount_t *, int);
384extern void xfs_freesb(xfs_mount_t *); 394extern void xfs_freesb(xfs_mount_t *);
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
new file mode 100644
index 000000000000..4b33ef112400
--- /dev/null
+++ b/fs/xfs/xfs_pnfs.c
@@ -0,0 +1,322 @@
1/*
2 * Copyright (c) 2014 Christoph Hellwig.
3 */
4#include "xfs.h"
5#include "xfs_format.h"
6#include "xfs_log_format.h"
7#include "xfs_trans_resv.h"
8#include "xfs_sb.h"
9#include "xfs_mount.h"
10#include "xfs_inode.h"
11#include "xfs_trans.h"
12#include "xfs_log.h"
13#include "xfs_bmap.h"
14#include "xfs_bmap_util.h"
15#include "xfs_error.h"
16#include "xfs_iomap.h"
17#include "xfs_shared.h"
18#include "xfs_bit.h"
19#include "xfs_pnfs.h"
20
21/*
22 * Ensure that we do not have any outstanding pNFS layouts that can be used by
23 * clients to directly read from or write to this inode. This must be called
24 * before every operation that can remove blocks from the extent map.
25 * Additionally we call it during the write operation, where we aren't concerned
26 * about exposing unallocated blocks but just want to provide basic
27 * synchronization between a local writer and pNFS clients. mmap writes would
28 * also benefit from this sort of synchronization, but due to the tricky locking
29 * rules in the page fault path we don't bother.
30 */
31int
32xfs_break_layouts(
33 struct inode *inode,
34 uint *iolock)
35{
36 struct xfs_inode *ip = XFS_I(inode);
37 int error;
38
39 ASSERT(xfs_isilocked(ip, XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL));
40
41 while ((error = break_layout(inode, false)) == -EWOULDBLOCK) {
42 xfs_iunlock(ip, *iolock);
43 error = break_layout(inode, true);
44 *iolock = XFS_IOLOCK_EXCL;
45 xfs_ilock(ip, *iolock);
46 }
47
48 return error;
49}
50
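
/*
 * Hypothetical caller sketch (ip and error are stand-ins): paths that
 * remove blocks take the iolock first, then break layouts before
 * touching the extent map. Note the helper may drop and retake the
 * lock, upgrading it to exclusive.
 */
uint    iolock = XFS_IOLOCK_EXCL;

xfs_ilock(ip, iolock);
error = xfs_break_layouts(VFS_I(ip), &iolock);
if (error) {
        xfs_iunlock(ip, iolock);
        return error;
}
/* now safe to punch holes or truncate extents */
xfs_iunlock(ip, iolock);
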
51/*
52 * Get a unique ID including its location so that the client can identify
53 * the exported device.
54 */
55int
56xfs_fs_get_uuid(
57 struct super_block *sb,
58 u8 *buf,
59 u32 *len,
60 u64 *offset)
61{
62 struct xfs_mount *mp = XFS_M(sb);
63
64 printk_once(KERN_NOTICE
65"XFS (%s): using experimental pNFS feature, use at your own risk!\n",
66 mp->m_fsname);
67
68 if (*len < sizeof(uuid_t))
69 return -EINVAL;
70
71 memcpy(buf, &mp->m_sb.sb_uuid, sizeof(uuid_t));
72 *len = sizeof(uuid_t);
73 *offset = offsetof(struct xfs_dsb, sb_uuid);
74 return 0;
75}
76
77static void
78xfs_bmbt_to_iomap(
79 struct xfs_inode *ip,
80 struct iomap *iomap,
81 struct xfs_bmbt_irec *imap)
82{
83 struct xfs_mount *mp = ip->i_mount;
84
85 if (imap->br_startblock == HOLESTARTBLOCK) {
86 iomap->blkno = IOMAP_NULL_BLOCK;
87 iomap->type = IOMAP_HOLE;
88 } else if (imap->br_startblock == DELAYSTARTBLOCK) {
89 iomap->blkno = IOMAP_NULL_BLOCK;
90 iomap->type = IOMAP_DELALLOC;
91 } else {
92 iomap->blkno =
93 XFS_FSB_TO_DADDR(ip->i_mount, imap->br_startblock);
94 if (imap->br_state == XFS_EXT_UNWRITTEN)
95 iomap->type = IOMAP_UNWRITTEN;
96 else
97 iomap->type = IOMAP_MAPPED;
98 }
99 iomap->offset = XFS_FSB_TO_B(mp, imap->br_startoff);
100 iomap->length = XFS_FSB_TO_B(mp, imap->br_blockcount);
101}
102
103/*
104 * Get a layout for the pNFS client.
105 */
106int
107xfs_fs_map_blocks(
108 struct inode *inode,
109 loff_t offset,
110 u64 length,
111 struct iomap *iomap,
112 bool write,
113 u32 *device_generation)
114{
115 struct xfs_inode *ip = XFS_I(inode);
116 struct xfs_mount *mp = ip->i_mount;
117 struct xfs_bmbt_irec imap;
118 xfs_fileoff_t offset_fsb, end_fsb;
119 loff_t limit;
120 int bmapi_flags = XFS_BMAPI_ENTIRE;
121 int nimaps = 1;
122 uint lock_flags;
123 int error = 0;
124
125 if (XFS_FORCED_SHUTDOWN(mp))
126 return -EIO;
127
128 /*
129 * We can't export inodes residing on the realtime device. The realtime
130 * device doesn't have a UUID to identify it, so the client has no way
131 * to find it.
132 */
133 if (XFS_IS_REALTIME_INODE(ip))
134 return -ENXIO;
135
136 /*
137 * Lock out any other I/O before we flush and invalidate the pagecache,
138 * and then hand out a layout to the remote system. This is very
139 * similar to direct I/O, except that the synchronization is much more
140 * complicated. See the comment near xfs_break_layouts for a detailed
141 * explanation.
142 */
143 xfs_ilock(ip, XFS_IOLOCK_EXCL);
144
145 error = -EINVAL;
146 limit = mp->m_super->s_maxbytes;
147 if (!write)
148 limit = max(limit, round_up(i_size_read(inode),
149 inode->i_sb->s_blocksize));
150 if (offset > limit)
151 goto out_unlock;
152 if (offset > limit - length)
153 length = limit - offset;
154
155 error = filemap_write_and_wait(inode->i_mapping);
156 if (error)
157 goto out_unlock;
158 error = invalidate_inode_pages2(inode->i_mapping);
159 if (WARN_ON_ONCE(error))
160 goto out_unlock;
161
162 end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + length);
163 offset_fsb = XFS_B_TO_FSBT(mp, offset);
164
165 lock_flags = xfs_ilock_data_map_shared(ip);
166 error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
167 &imap, &nimaps, bmapi_flags);
168 xfs_iunlock(ip, lock_flags);
169
170 if (error)
171 goto out_unlock;
172
173 if (write) {
174 enum xfs_prealloc_flags flags = 0;
175
176 ASSERT(imap.br_startblock != DELAYSTARTBLOCK);
177
178 if (!nimaps || imap.br_startblock == HOLESTARTBLOCK) {
179 error = xfs_iomap_write_direct(ip, offset, length,
180 &imap, nimaps);
181 if (error)
182 goto out_unlock;
183
184 /*
185 * Ensure the next transaction is committed
186 * synchronously so that the blocks allocated and
187 * handed out to the client are guaranteed to be
188 * present even after a server crash.
189 */
190 flags |= XFS_PREALLOC_SET | XFS_PREALLOC_SYNC;
191 }
192
193 error = xfs_update_prealloc_flags(ip, flags);
194 if (error)
195 goto out_unlock;
196 }
197 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
198
199 xfs_bmbt_to_iomap(ip, iomap, &imap);
200 *device_generation = mp->m_generation;
201 return error;
202out_unlock:
203 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
204 return error;
205}
206
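
/*
 * How these entry points are expected to be wired up (sketch; the
 * export_operations extension lives on the nfsd side of this series,
 * not in this file):
 */
static const struct export_operations xfs_export_operations = {
        /* ... existing fh_to_dentry/fh_to_parent/commit_metadata ... */
#ifdef CONFIG_NFSD_PNFS
        .get_uuid               = xfs_fs_get_uuid,
        .map_blocks             = xfs_fs_map_blocks,
        .commit_blocks          = xfs_fs_commit_blocks,
#endif
};
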
207/*
208 * Ensure the size update falls into a valid allocated block.
209 */
210static int
211xfs_pnfs_validate_isize(
212 struct xfs_inode *ip,
213 xfs_off_t isize)
214{
215 struct xfs_bmbt_irec imap;
216 int nimaps = 1;
217 int error = 0;
218
219 xfs_ilock(ip, XFS_ILOCK_SHARED);
220 error = xfs_bmapi_read(ip, XFS_B_TO_FSBT(ip->i_mount, isize - 1), 1,
221 &imap, &nimaps, 0);
222 xfs_iunlock(ip, XFS_ILOCK_SHARED);
223 if (error)
224 return error;
225
226 if (imap.br_startblock == HOLESTARTBLOCK ||
227 imap.br_startblock == DELAYSTARTBLOCK ||
228 imap.br_state == XFS_EXT_UNWRITTEN)
229 return -EIO;
230 return 0;
231}
232
233/*
234 * Make sure the blocks described by maps are stable on disk. This includes
235 * converting any unwritten extents, flushing the disk cache and updating the
236 * time stamps.
237 *
238 * Note that we rely on the caller to always send us a timestamp update so that
239 * we always commit a transaction here. If that stops being true we will have
240 * to manually flush the cache here similar to what the fsync code path does
241 * for datasyncs on files that have no dirty metadata.
242 */
243int
244xfs_fs_commit_blocks(
245 struct inode *inode,
246 struct iomap *maps,
247 int nr_maps,
248 struct iattr *iattr)
249{
250 struct xfs_inode *ip = XFS_I(inode);
251 struct xfs_mount *mp = ip->i_mount;
252 struct xfs_trans *tp;
253 bool update_isize = false;
254 int error, i;
255 loff_t size;
256
257 ASSERT(iattr->ia_valid & (ATTR_ATIME|ATTR_CTIME|ATTR_MTIME));
258
259 xfs_ilock(ip, XFS_IOLOCK_EXCL);
260
261 size = i_size_read(inode);
262 if ((iattr->ia_valid & ATTR_SIZE) && iattr->ia_size > size) {
263 update_isize = true;
264 size = iattr->ia_size;
265 }
266
267 for (i = 0; i < nr_maps; i++) {
268 u64 start, length, end;
269
270 start = maps[i].offset;
271 if (start > size)
272 continue;
273
274 end = start + maps[i].length;
275 if (end > size)
276 end = size;
277
278 length = end - start;
279 if (!length)
280 continue;
281
282 /*
283 * Make sure reads through the pagecache see the new data.
284 */
285 error = invalidate_inode_pages2_range(inode->i_mapping,
286 start >> PAGE_CACHE_SHIFT,
287 (end - 1) >> PAGE_CACHE_SHIFT);
288 WARN_ON_ONCE(error);
289
290 error = xfs_iomap_write_unwritten(ip, start, length);
291 if (error)
292 goto out_drop_iolock;
293 }
294
295 if (update_isize) {
296 error = xfs_pnfs_validate_isize(ip, size);
297 if (error)
298 goto out_drop_iolock;
299 }
300
301 tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE);
302 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0);
303 if (error)
304 goto out_drop_iolock;
305
306 xfs_ilock(ip, XFS_ILOCK_EXCL);
307 xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
308 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
309
310 xfs_setattr_time(ip, iattr);
311 if (update_isize) {
312 i_size_write(inode, iattr->ia_size);
313 ip->i_d.di_size = iattr->ia_size;
314 }
315
316 xfs_trans_set_sync(tp);
317 error = xfs_trans_commit(tp, 0);
318
319out_drop_iolock:
320 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
321 return error;
322}
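
/*
 * Sketch of the assumed nfsd-side usage (new_size, maps, nr_maps are
 * stand-ins): LAYOUTCOMMIT passes the committed ranges together with a
 * timestamp (and possibly size) update, which is what lets this
 * function always commit a transaction.
 */
struct iattr    iattr = { .ia_valid = ATTR_ATIME | ATTR_CTIME | ATTR_MTIME };

iattr.ia_atime = iattr.ia_ctime = iattr.ia_mtime =
        current_fs_time(inode->i_sb);
if (new_size > i_size_read(inode)) {
        iattr.ia_valid |= ATTR_SIZE;
        iattr.ia_size = new_size;
}
error = xfs_fs_commit_blocks(inode, maps, nr_maps, &iattr);
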
diff --git a/fs/xfs/xfs_pnfs.h b/fs/xfs/xfs_pnfs.h
new file mode 100644
index 000000000000..b7fbfce660f6
--- /dev/null
+++ b/fs/xfs/xfs_pnfs.h
@@ -0,0 +1,18 @@
1#ifndef _XFS_PNFS_H
2#define _XFS_PNFS_H 1
3
4#ifdef CONFIG_NFSD_PNFS
5int xfs_fs_get_uuid(struct super_block *sb, u8 *buf, u32 *len, u64 *offset);
6int xfs_fs_map_blocks(struct inode *inode, loff_t offset, u64 length,
7 struct iomap *iomap, bool write, u32 *device_generation);
8int xfs_fs_commit_blocks(struct inode *inode, struct iomap *maps, int nr_maps,
9 struct iattr *iattr);
10
11int xfs_break_layouts(struct inode *inode, uint *iolock);
12#else
13static inline int xfs_break_layouts(struct inode *inode, uint *iolock)
14{
15 return 0;
16}
17#endif /* CONFIG_NFSD_PNFS */
18#endif /* _XFS_PNFS_H */
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 79fb19dd9c83..53cc2aaf8d2b 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -430,6 +430,7 @@ struct xfs_qm_isolate {
430static enum lru_status 430static enum lru_status
431xfs_qm_dquot_isolate( 431xfs_qm_dquot_isolate(
432 struct list_head *item, 432 struct list_head *item,
433 struct list_lru_one *lru,
433 spinlock_t *lru_lock, 434 spinlock_t *lru_lock,
434 void *arg) 435 void *arg)
435 __releases(lru_lock) __acquires(lru_lock) 436 __releases(lru_lock) __acquires(lru_lock)
@@ -450,7 +451,7 @@ xfs_qm_dquot_isolate(
450 XFS_STATS_INC(xs_qm_dqwants); 451 XFS_STATS_INC(xs_qm_dqwants);
451 452
452 trace_xfs_dqreclaim_want(dqp); 453 trace_xfs_dqreclaim_want(dqp);
453 list_del_init(&dqp->q_lru); 454 list_lru_isolate(lru, &dqp->q_lru);
454 XFS_STATS_DEC(xs_qm_dquot_unused); 455 XFS_STATS_DEC(xs_qm_dquot_unused);
455 return LRU_REMOVED; 456 return LRU_REMOVED;
456 } 457 }
@@ -494,7 +495,7 @@ xfs_qm_dquot_isolate(
494 xfs_dqunlock(dqp); 495 xfs_dqunlock(dqp);
495 496
496 ASSERT(dqp->q_nrefs == 0); 497 ASSERT(dqp->q_nrefs == 0);
497 list_move_tail(&dqp->q_lru, &isol->dispose); 498 list_lru_isolate_move(lru, &dqp->q_lru, &isol->dispose);
498 XFS_STATS_DEC(xs_qm_dquot_unused); 499 XFS_STATS_DEC(xs_qm_dquot_unused);
499 trace_xfs_dqreclaim_done(dqp); 500 trace_xfs_dqreclaim_done(dqp);
500 XFS_STATS_INC(xs_qm_dqreclaims); 501 XFS_STATS_INC(xs_qm_dqreclaims);
@@ -523,7 +524,6 @@ xfs_qm_shrink_scan(
523 struct xfs_qm_isolate isol; 524 struct xfs_qm_isolate isol;
524 unsigned long freed; 525 unsigned long freed;
525 int error; 526 int error;
526 unsigned long nr_to_scan = sc->nr_to_scan;
527 527
528 if ((sc->gfp_mask & (__GFP_FS|__GFP_WAIT)) != (__GFP_FS|__GFP_WAIT)) 528 if ((sc->gfp_mask & (__GFP_FS|__GFP_WAIT)) != (__GFP_FS|__GFP_WAIT))
529 return 0; 529 return 0;
@@ -531,8 +531,8 @@ xfs_qm_shrink_scan(
531 INIT_LIST_HEAD(&isol.buffers); 531 INIT_LIST_HEAD(&isol.buffers);
532 INIT_LIST_HEAD(&isol.dispose); 532 INIT_LIST_HEAD(&isol.dispose);
533 533
534 freed = list_lru_walk_node(&qi->qi_lru, sc->nid, xfs_qm_dquot_isolate, &isol, 534 freed = list_lru_shrink_walk(&qi->qi_lru, sc,
535 &nr_to_scan); 535 xfs_qm_dquot_isolate, &isol);
536 536
537 error = xfs_buf_delwri_submit(&isol.buffers); 537 error = xfs_buf_delwri_submit(&isol.buffers);
538 if (error) 538 if (error)
@@ -557,7 +557,7 @@ xfs_qm_shrink_count(
557 struct xfs_quotainfo *qi = container_of(shrink, 557 struct xfs_quotainfo *qi = container_of(shrink,
558 struct xfs_quotainfo, qi_shrinker); 558 struct xfs_quotainfo, qi_shrinker);
559 559
560 return list_lru_count_node(&qi->qi_lru, sc->nid); 560 return list_lru_shrink_count(&qi->qi_lru, sc);
561} 561}
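
/*
 * The memcg-aware list_lru interfaces this file converts to;
 * signatures as assumed from the call sites above:
 */
unsigned long list_lru_shrink_count(struct list_lru *lru,
                                    struct shrink_control *sc);
unsigned long list_lru_shrink_walk(struct list_lru *lru,
                                   struct shrink_control *sc,
                                   list_lru_walk_cb isolate, void *cb_arg);
void list_lru_isolate(struct list_lru_one *list, struct list_head *item);
void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item,
                           struct list_head *head);
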
562 562
563/* 563/*
@@ -714,7 +714,6 @@ STATIC int
714xfs_qm_qino_alloc( 714xfs_qm_qino_alloc(
715 xfs_mount_t *mp, 715 xfs_mount_t *mp,
716 xfs_inode_t **ip, 716 xfs_inode_t **ip,
717 __int64_t sbfields,
718 uint flags) 717 uint flags)
719{ 718{
720 xfs_trans_t *tp; 719 xfs_trans_t *tp;
@@ -777,11 +776,6 @@ xfs_qm_qino_alloc(
777 spin_lock(&mp->m_sb_lock); 776 spin_lock(&mp->m_sb_lock);
778 if (flags & XFS_QMOPT_SBVERSION) { 777 if (flags & XFS_QMOPT_SBVERSION) {
779 ASSERT(!xfs_sb_version_hasquota(&mp->m_sb)); 778 ASSERT(!xfs_sb_version_hasquota(&mp->m_sb));
780 ASSERT((sbfields & (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO |
781 XFS_SB_GQUOTINO | XFS_SB_PQUOTINO | XFS_SB_QFLAGS)) ==
782 (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO |
783 XFS_SB_GQUOTINO | XFS_SB_PQUOTINO |
784 XFS_SB_QFLAGS));
785 779
786 xfs_sb_version_addquota(&mp->m_sb); 780 xfs_sb_version_addquota(&mp->m_sb);
787 mp->m_sb.sb_uquotino = NULLFSINO; 781 mp->m_sb.sb_uquotino = NULLFSINO;
@@ -798,7 +792,7 @@ xfs_qm_qino_alloc(
798 else 792 else
799 mp->m_sb.sb_pquotino = (*ip)->i_ino; 793 mp->m_sb.sb_pquotino = (*ip)->i_ino;
800 spin_unlock(&mp->m_sb_lock); 794 spin_unlock(&mp->m_sb_lock);
801 xfs_mod_sb(tp, sbfields); 795 xfs_log_sb(tp);
802 796
803 if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES))) { 797 if ((error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES))) {
804 xfs_alert(mp, "%s failed (error %d)!", __func__, error); 798 xfs_alert(mp, "%s failed (error %d)!", __func__, error);
@@ -1451,7 +1445,7 @@ xfs_qm_mount_quotas(
1451 spin_unlock(&mp->m_sb_lock); 1445 spin_unlock(&mp->m_sb_lock);
1452 1446
1453 if (sbf != (mp->m_qflags & XFS_MOUNT_QUOTA_ALL)) { 1447 if (sbf != (mp->m_qflags & XFS_MOUNT_QUOTA_ALL)) {
1454 if (xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS)) { 1448 if (xfs_sync_sb(mp, false)) {
1455 /* 1449 /*
1456 * We could only have been turning quotas off. 1450 * We could only have been turning quotas off.
1457 * We aren't in very good shape actually because 1451 * We aren't in very good shape actually because
@@ -1482,7 +1476,6 @@ xfs_qm_init_quotainos(
1482 struct xfs_inode *gip = NULL; 1476 struct xfs_inode *gip = NULL;
1483 struct xfs_inode *pip = NULL; 1477 struct xfs_inode *pip = NULL;
1484 int error; 1478 int error;
1485 __int64_t sbflags = 0;
1486 uint flags = 0; 1479 uint flags = 0;
1487 1480
1488 ASSERT(mp->m_quotainfo); 1481 ASSERT(mp->m_quotainfo);
@@ -1517,9 +1510,6 @@ xfs_qm_init_quotainos(
1517 } 1510 }
1518 } else { 1511 } else {
1519 flags |= XFS_QMOPT_SBVERSION; 1512 flags |= XFS_QMOPT_SBVERSION;
1520 sbflags |= (XFS_SB_VERSIONNUM | XFS_SB_UQUOTINO |
1521 XFS_SB_GQUOTINO | XFS_SB_PQUOTINO |
1522 XFS_SB_QFLAGS);
1523 } 1513 }
1524 1514
1525 /* 1515 /*
@@ -1530,7 +1520,6 @@ xfs_qm_init_quotainos(
1530 */ 1520 */
1531 if (XFS_IS_UQUOTA_ON(mp) && uip == NULL) { 1521 if (XFS_IS_UQUOTA_ON(mp) && uip == NULL) {
1532 error = xfs_qm_qino_alloc(mp, &uip, 1522 error = xfs_qm_qino_alloc(mp, &uip,
1533 sbflags | XFS_SB_UQUOTINO,
1534 flags | XFS_QMOPT_UQUOTA); 1523 flags | XFS_QMOPT_UQUOTA);
1535 if (error) 1524 if (error)
1536 goto error_rele; 1525 goto error_rele;
@@ -1539,7 +1528,6 @@ xfs_qm_init_quotainos(
1539 } 1528 }
1540 if (XFS_IS_GQUOTA_ON(mp) && gip == NULL) { 1529 if (XFS_IS_GQUOTA_ON(mp) && gip == NULL) {
1541 error = xfs_qm_qino_alloc(mp, &gip, 1530 error = xfs_qm_qino_alloc(mp, &gip,
1542 sbflags | XFS_SB_GQUOTINO,
1543 flags | XFS_QMOPT_GQUOTA); 1531 flags | XFS_QMOPT_GQUOTA);
1544 if (error) 1532 if (error)
1545 goto error_rele; 1533 goto error_rele;
@@ -1548,7 +1536,6 @@ xfs_qm_init_quotainos(
1548 } 1536 }
1549 if (XFS_IS_PQUOTA_ON(mp) && pip == NULL) { 1537 if (XFS_IS_PQUOTA_ON(mp) && pip == NULL) {
1550 error = xfs_qm_qino_alloc(mp, &pip, 1538 error = xfs_qm_qino_alloc(mp, &pip,
1551 sbflags | XFS_SB_PQUOTINO,
1552 flags | XFS_QMOPT_PQUOTA); 1539 flags | XFS_QMOPT_PQUOTA);
1553 if (error) 1540 if (error)
1554 goto error_rele; 1541 goto error_rele;
@@ -1587,32 +1574,6 @@ xfs_qm_dqfree_one(
1587 xfs_qm_dqdestroy(dqp); 1574 xfs_qm_dqdestroy(dqp);
1588} 1575}
1589 1576
1590/*
1591 * Start a transaction and write the incore superblock changes to
1592 * disk. flags parameter indicates which fields have changed.
1593 */
1594int
1595xfs_qm_write_sb_changes(
1596 xfs_mount_t *mp,
1597 __int64_t flags)
1598{
1599 xfs_trans_t *tp;
1600 int error;
1601
1602 tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE);
1603 error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_sbchange, 0, 0);
1604 if (error) {
1605 xfs_trans_cancel(tp, 0);
1606 return error;
1607 }
1608
1609 xfs_mod_sb(tp, flags);
1610 error = xfs_trans_commit(tp, 0);
1611
1612 return error;
1613}
1614
1615
1616/* --------------- utility functions for vnodeops ---------------- */ 1577/* --------------- utility functions for vnodeops ---------------- */
1617 1578
1618 1579
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h
index 3a07a937e232..0d4d3590cf85 100644
--- a/fs/xfs/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
@@ -157,7 +157,6 @@ struct xfs_dquot_acct {
157#define XFS_QM_RTBWARNLIMIT 5 157#define XFS_QM_RTBWARNLIMIT 5
158 158
159extern void xfs_qm_destroy_quotainfo(struct xfs_mount *); 159extern void xfs_qm_destroy_quotainfo(struct xfs_mount *);
160extern int xfs_qm_write_sb_changes(struct xfs_mount *, __int64_t);
161 160
162/* dquot stuff */ 161/* dquot stuff */
163extern void xfs_qm_dqpurge_all(struct xfs_mount *, uint); 162extern void xfs_qm_dqpurge_all(struct xfs_mount *, uint);
@@ -166,9 +165,9 @@ extern void xfs_qm_dqrele_all_inodes(struct xfs_mount *, uint);
166/* quota ops */ 165/* quota ops */
167extern int xfs_qm_scall_trunc_qfiles(struct xfs_mount *, uint); 166extern int xfs_qm_scall_trunc_qfiles(struct xfs_mount *, uint);
168extern int xfs_qm_scall_getquota(struct xfs_mount *, xfs_dqid_t, 167extern int xfs_qm_scall_getquota(struct xfs_mount *, xfs_dqid_t,
169 uint, struct fs_disk_quota *); 168 uint, struct qc_dqblk *);
170extern int xfs_qm_scall_setqlim(struct xfs_mount *, xfs_dqid_t, uint, 169extern int xfs_qm_scall_setqlim(struct xfs_mount *, xfs_dqid_t, uint,
171 struct fs_disk_quota *); 170 struct qc_dqblk *);
172extern int xfs_qm_scall_getqstat(struct xfs_mount *, 171extern int xfs_qm_scall_getqstat(struct xfs_mount *,
173 struct fs_quota_stat *); 172 struct fs_quota_stat *);
174extern int xfs_qm_scall_getqstatv(struct xfs_mount *, 173extern int xfs_qm_scall_getqstatv(struct xfs_mount *,
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c
index 74fca68e43b6..9b965db45800 100644
--- a/fs/xfs/xfs_qm_syscalls.c
+++ b/fs/xfs/xfs_qm_syscalls.c
@@ -39,7 +39,6 @@ STATIC int xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint);
39STATIC int xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *, 39STATIC int xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *,
40 uint); 40 uint);
41STATIC uint xfs_qm_export_flags(uint); 41STATIC uint xfs_qm_export_flags(uint);
42STATIC uint xfs_qm_export_qtype_flags(uint);
43 42
44/* 43/*
45 * Turn off quota accounting and/or enforcement for all udquots and/or 44 * Turn off quota accounting and/or enforcement for all udquots and/or
@@ -92,8 +91,7 @@ xfs_qm_scall_quotaoff(
92 mutex_unlock(&q->qi_quotaofflock); 91 mutex_unlock(&q->qi_quotaofflock);
93 92
94 /* XXX what to do if error ? Revert back to old vals incore ? */ 93 /* XXX what to do if error ? Revert back to old vals incore ? */
95 error = xfs_qm_write_sb_changes(mp, XFS_SB_QFLAGS); 94 return xfs_sync_sb(mp, false);
96 return error;
97 } 95 }
98 96
99 dqtype = 0; 97 dqtype = 0;
@@ -314,7 +312,6 @@ xfs_qm_scall_quotaon(
314{ 312{
315 int error; 313 int error;
316 uint qf; 314 uint qf;
317 __int64_t sbflags;
318 315
319 flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD); 316 flags &= (XFS_ALL_QUOTA_ACCT | XFS_ALL_QUOTA_ENFD);
320 /* 317 /*
@@ -322,30 +319,22 @@ xfs_qm_scall_quotaon(
322 */ 319 */
323 flags &= ~(XFS_ALL_QUOTA_ACCT); 320 flags &= ~(XFS_ALL_QUOTA_ACCT);
324 321
325 sbflags = 0;
326
327 if (flags == 0) { 322 if (flags == 0) {
328 xfs_debug(mp, "%s: zero flags, m_qflags=%x", 323 xfs_debug(mp, "%s: zero flags, m_qflags=%x",
329 __func__, mp->m_qflags); 324 __func__, mp->m_qflags);
330 return -EINVAL; 325 return -EINVAL;
331 } 326 }
332 327
333 /* No fs can turn on quotas with a delayed effect */
334 ASSERT((flags & XFS_ALL_QUOTA_ACCT) == 0);
335
336 /* 328 /*
337 * Can't enforce without accounting. We check the superblock 329 * Can't enforce without accounting. We check the superblock
338 * qflags here instead of m_qflags because rootfs can have 330 * qflags here instead of m_qflags because rootfs can have
339 * quota acct on disk without m_qflags knowing. 331 * quota acct on disk without m_qflags knowing.
340 */ 332 */
341 if (((flags & XFS_UQUOTA_ACCT) == 0 && 333 if (((mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT) == 0 &&
342 (mp->m_sb.sb_qflags & XFS_UQUOTA_ACCT) == 0 &&
343 (flags & XFS_UQUOTA_ENFD)) || 334 (flags & XFS_UQUOTA_ENFD)) ||
344 ((flags & XFS_GQUOTA_ACCT) == 0 && 335 ((mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) == 0 &&
345 (mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) == 0 &&
346 (flags & XFS_GQUOTA_ENFD)) || 336 (flags & XFS_GQUOTA_ENFD)) ||
347 ((flags & XFS_PQUOTA_ACCT) == 0 && 337 ((mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT) == 0 &&
348 (mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT) == 0 &&
349 (flags & XFS_PQUOTA_ENFD))) { 338 (flags & XFS_PQUOTA_ENFD))) {
350 xfs_debug(mp, 339 xfs_debug(mp,
351 "%s: Can't enforce without acct, flags=%x sbflags=%x", 340 "%s: Can't enforce without acct, flags=%x sbflags=%x",
@@ -370,11 +359,11 @@ xfs_qm_scall_quotaon(
370 /* 359 /*
371 * There's nothing to change if it's the same. 360 * There's nothing to change if it's the same.
372 */ 361 */
373 if ((qf & flags) == flags && sbflags == 0) 362 if ((qf & flags) == flags)
374 return -EEXIST; 363 return -EEXIST;
375 sbflags |= XFS_SB_QFLAGS;
376 364
377 if ((error = xfs_qm_write_sb_changes(mp, sbflags))) 365 error = xfs_sync_sb(mp, false);
366 if (error)
378 return error; 367 return error;
379 /* 368 /*
380 * If we aren't trying to switch on quota enforcement, we are done. 369 * If we aren't trying to switch on quota enforcement, we are done.
@@ -384,8 +373,7 @@ xfs_qm_scall_quotaon(
384 ((mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT) != 373 ((mp->m_sb.sb_qflags & XFS_PQUOTA_ACCT) !=
385 (mp->m_qflags & XFS_PQUOTA_ACCT)) || 374 (mp->m_qflags & XFS_PQUOTA_ACCT)) ||
386 ((mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) != 375 ((mp->m_sb.sb_qflags & XFS_GQUOTA_ACCT) !=
387 (mp->m_qflags & XFS_GQUOTA_ACCT)) || 376 (mp->m_qflags & XFS_GQUOTA_ACCT)))
388 (flags & XFS_ALL_QUOTA_ENFD) == 0)
389 return 0; 377 return 0;
390 378
391 if (! XFS_IS_QUOTA_RUNNING(mp)) 379 if (! XFS_IS_QUOTA_RUNNING(mp))
@@ -422,20 +410,12 @@ xfs_qm_scall_getqstat(
422 memset(out, 0, sizeof(fs_quota_stat_t)); 410 memset(out, 0, sizeof(fs_quota_stat_t));
423 411
424 out->qs_version = FS_QSTAT_VERSION; 412 out->qs_version = FS_QSTAT_VERSION;
425 if (!xfs_sb_version_hasquota(&mp->m_sb)) {
426 out->qs_uquota.qfs_ino = NULLFSINO;
427 out->qs_gquota.qfs_ino = NULLFSINO;
428 return 0;
429 }
430
431 out->qs_flags = (__uint16_t) xfs_qm_export_flags(mp->m_qflags & 413 out->qs_flags = (__uint16_t) xfs_qm_export_flags(mp->m_qflags &
432 (XFS_ALL_QUOTA_ACCT| 414 (XFS_ALL_QUOTA_ACCT|
433 XFS_ALL_QUOTA_ENFD)); 415 XFS_ALL_QUOTA_ENFD));
434 if (q) { 416 uip = q->qi_uquotaip;
435 uip = q->qi_uquotaip; 417 gip = q->qi_gquotaip;
436 gip = q->qi_gquotaip; 418 pip = q->qi_pquotaip;
437 pip = q->qi_pquotaip;
438 }
439 if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) { 419 if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) {
440 if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, 420 if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
441 0, 0, &uip) == 0) 421 0, 0, &uip) == 0)
@@ -481,14 +461,13 @@ xfs_qm_scall_getqstat(
481 if (temppqip) 461 if (temppqip)
482 IRELE(pip); 462 IRELE(pip);
483 } 463 }
484 if (q) { 464 out->qs_incoredqs = q->qi_dquots;
485 out->qs_incoredqs = q->qi_dquots; 465 out->qs_btimelimit = q->qi_btimelimit;
486 out->qs_btimelimit = q->qi_btimelimit; 466 out->qs_itimelimit = q->qi_itimelimit;
487 out->qs_itimelimit = q->qi_itimelimit; 467 out->qs_rtbtimelimit = q->qi_rtbtimelimit;
488 out->qs_rtbtimelimit = q->qi_rtbtimelimit; 468 out->qs_bwarnlimit = q->qi_bwarnlimit;
489 out->qs_bwarnlimit = q->qi_bwarnlimit; 469 out->qs_iwarnlimit = q->qi_iwarnlimit;
490 out->qs_iwarnlimit = q->qi_iwarnlimit; 470
491 }
492 return 0; 471 return 0;
493} 472}
494 473
@@ -509,13 +488,6 @@ xfs_qm_scall_getqstatv(
509 bool tempgqip = false; 488 bool tempgqip = false;
510 bool temppqip = false; 489 bool temppqip = false;
511 490
512 if (!xfs_sb_version_hasquota(&mp->m_sb)) {
513 out->qs_uquota.qfs_ino = NULLFSINO;
514 out->qs_gquota.qfs_ino = NULLFSINO;
515 out->qs_pquota.qfs_ino = NULLFSINO;
516 return 0;
517 }
518
519 out->qs_flags = (__uint16_t) xfs_qm_export_flags(mp->m_qflags & 491 out->qs_flags = (__uint16_t) xfs_qm_export_flags(mp->m_qflags &
520 (XFS_ALL_QUOTA_ACCT| 492 (XFS_ALL_QUOTA_ACCT|
521 XFS_ALL_QUOTA_ENFD)); 493 XFS_ALL_QUOTA_ENFD));
@@ -523,11 +495,9 @@ xfs_qm_scall_getqstatv(
523 out->qs_gquota.qfs_ino = mp->m_sb.sb_gquotino; 495 out->qs_gquota.qfs_ino = mp->m_sb.sb_gquotino;
524 out->qs_pquota.qfs_ino = mp->m_sb.sb_pquotino; 496 out->qs_pquota.qfs_ino = mp->m_sb.sb_pquotino;
525 497
526 if (q) { 498 uip = q->qi_uquotaip;
527 uip = q->qi_uquotaip; 499 gip = q->qi_gquotaip;
528 gip = q->qi_gquotaip; 500 pip = q->qi_pquotaip;
529 pip = q->qi_pquotaip;
530 }
531 if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) { 501 if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) {
532 if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, 502 if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino,
533 0, 0, &uip) == 0) 503 0, 0, &uip) == 0)
@@ -562,19 +532,18 @@ xfs_qm_scall_getqstatv(
562 if (temppqip) 532 if (temppqip)
563 IRELE(pip); 533 IRELE(pip);
564 } 534 }
565 if (q) { 535 out->qs_incoredqs = q->qi_dquots;
566 out->qs_incoredqs = q->qi_dquots; 536 out->qs_btimelimit = q->qi_btimelimit;
567 out->qs_btimelimit = q->qi_btimelimit; 537 out->qs_itimelimit = q->qi_itimelimit;
568 out->qs_itimelimit = q->qi_itimelimit; 538 out->qs_rtbtimelimit = q->qi_rtbtimelimit;
569 out->qs_rtbtimelimit = q->qi_rtbtimelimit; 539 out->qs_bwarnlimit = q->qi_bwarnlimit;
570 out->qs_bwarnlimit = q->qi_bwarnlimit; 540 out->qs_iwarnlimit = q->qi_iwarnlimit;
571 out->qs_iwarnlimit = q->qi_iwarnlimit; 541
572 }
573 return 0; 542 return 0;
574} 543}
575 544
576#define XFS_DQ_MASK \ 545#define XFS_QC_MASK \
577 (FS_DQ_LIMIT_MASK | FS_DQ_TIMER_MASK | FS_DQ_WARNS_MASK) 546 (QC_LIMIT_MASK | QC_TIMER_MASK | QC_WARNS_MASK)
578 547
579/* 548/*
580 * Adjust quota limits, and start/stop timers accordingly. 549 * Adjust quota limits, and start/stop timers accordingly.
@@ -584,7 +553,7 @@ xfs_qm_scall_setqlim(
584 struct xfs_mount *mp, 553 struct xfs_mount *mp,
585 xfs_dqid_t id, 554 xfs_dqid_t id,
586 uint type, 555 uint type,
587 fs_disk_quota_t *newlim) 556 struct qc_dqblk *newlim)
588{ 557{
589 struct xfs_quotainfo *q = mp->m_quotainfo; 558 struct xfs_quotainfo *q = mp->m_quotainfo;
590 struct xfs_disk_dquot *ddq; 559 struct xfs_disk_dquot *ddq;
@@ -593,9 +562,9 @@ xfs_qm_scall_setqlim(
593 int error; 562 int error;
594 xfs_qcnt_t hard, soft; 563 xfs_qcnt_t hard, soft;
595 564
596 if (newlim->d_fieldmask & ~XFS_DQ_MASK) 565 if (newlim->d_fieldmask & ~XFS_QC_MASK)
597 return -EINVAL; 566 return -EINVAL;
598 if ((newlim->d_fieldmask & XFS_DQ_MASK) == 0) 567 if ((newlim->d_fieldmask & XFS_QC_MASK) == 0)
599 return 0; 568 return 0;
600 569
601 /* 570 /*
@@ -633,11 +602,11 @@ xfs_qm_scall_setqlim(
633 /* 602 /*
634 * Make sure that hardlimits are >= soft limits before changing. 603 * Make sure that hardlimits are >= soft limits before changing.
635 */ 604 */
636 hard = (newlim->d_fieldmask & FS_DQ_BHARD) ? 605 hard = (newlim->d_fieldmask & QC_SPC_HARD) ?
637 (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_blk_hardlimit) : 606 (xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_spc_hardlimit) :
638 be64_to_cpu(ddq->d_blk_hardlimit); 607 be64_to_cpu(ddq->d_blk_hardlimit);
639 soft = (newlim->d_fieldmask & FS_DQ_BSOFT) ? 608 soft = (newlim->d_fieldmask & QC_SPC_SOFT) ?
640 (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_blk_softlimit) : 609 (xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_spc_softlimit) :
641 be64_to_cpu(ddq->d_blk_softlimit); 610 be64_to_cpu(ddq->d_blk_softlimit);
642 if (hard == 0 || hard >= soft) { 611 if (hard == 0 || hard >= soft) {
643 ddq->d_blk_hardlimit = cpu_to_be64(hard); 612 ddq->d_blk_hardlimit = cpu_to_be64(hard);
@@ -650,11 +619,11 @@ xfs_qm_scall_setqlim(
650 } else { 619 } else {
651 xfs_debug(mp, "blkhard %Ld < blksoft %Ld", hard, soft); 620 xfs_debug(mp, "blkhard %Ld < blksoft %Ld", hard, soft);
652 } 621 }
653 hard = (newlim->d_fieldmask & FS_DQ_RTBHARD) ? 622 hard = (newlim->d_fieldmask & QC_RT_SPC_HARD) ?
654 (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_rtb_hardlimit) : 623 (xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_rt_spc_hardlimit) :
655 be64_to_cpu(ddq->d_rtb_hardlimit); 624 be64_to_cpu(ddq->d_rtb_hardlimit);
656 soft = (newlim->d_fieldmask & FS_DQ_RTBSOFT) ? 625 soft = (newlim->d_fieldmask & QC_RT_SPC_SOFT) ?
657 (xfs_qcnt_t) XFS_BB_TO_FSB(mp, newlim->d_rtb_softlimit) : 626 (xfs_qcnt_t) XFS_B_TO_FSB(mp, newlim->d_rt_spc_softlimit) :
658 be64_to_cpu(ddq->d_rtb_softlimit); 627 be64_to_cpu(ddq->d_rtb_softlimit);
659 if (hard == 0 || hard >= soft) { 628 if (hard == 0 || hard >= soft) {
660 ddq->d_rtb_hardlimit = cpu_to_be64(hard); 629 ddq->d_rtb_hardlimit = cpu_to_be64(hard);
@@ -667,10 +636,10 @@ xfs_qm_scall_setqlim(
667 xfs_debug(mp, "rtbhard %Ld < rtbsoft %Ld", hard, soft); 636 xfs_debug(mp, "rtbhard %Ld < rtbsoft %Ld", hard, soft);
668 } 637 }
669 638
670 hard = (newlim->d_fieldmask & FS_DQ_IHARD) ? 639 hard = (newlim->d_fieldmask & QC_INO_HARD) ?
671 (xfs_qcnt_t) newlim->d_ino_hardlimit : 640 (xfs_qcnt_t) newlim->d_ino_hardlimit :
672 be64_to_cpu(ddq->d_ino_hardlimit); 641 be64_to_cpu(ddq->d_ino_hardlimit);
673 soft = (newlim->d_fieldmask & FS_DQ_ISOFT) ? 642 soft = (newlim->d_fieldmask & QC_INO_SOFT) ?
674 (xfs_qcnt_t) newlim->d_ino_softlimit : 643 (xfs_qcnt_t) newlim->d_ino_softlimit :
675 be64_to_cpu(ddq->d_ino_softlimit); 644 be64_to_cpu(ddq->d_ino_softlimit);
676 if (hard == 0 || hard >= soft) { 645 if (hard == 0 || hard >= soft) {
@@ -687,12 +656,12 @@ xfs_qm_scall_setqlim(
687 /* 656 /*
688 * Update warnings counter(s) if requested 657 * Update warnings counter(s) if requested
689 */ 658 */
690 if (newlim->d_fieldmask & FS_DQ_BWARNS) 659 if (newlim->d_fieldmask & QC_SPC_WARNS)
691 ddq->d_bwarns = cpu_to_be16(newlim->d_bwarns); 660 ddq->d_bwarns = cpu_to_be16(newlim->d_spc_warns);
692 if (newlim->d_fieldmask & FS_DQ_IWARNS) 661 if (newlim->d_fieldmask & QC_INO_WARNS)
693 ddq->d_iwarns = cpu_to_be16(newlim->d_iwarns); 662 ddq->d_iwarns = cpu_to_be16(newlim->d_ino_warns);
694 if (newlim->d_fieldmask & FS_DQ_RTBWARNS) 663 if (newlim->d_fieldmask & QC_RT_SPC_WARNS)
695 ddq->d_rtbwarns = cpu_to_be16(newlim->d_rtbwarns); 664 ddq->d_rtbwarns = cpu_to_be16(newlim->d_rt_spc_warns);
696 665
697 if (id == 0) { 666 if (id == 0) {
698 /* 667 /*
@@ -702,24 +671,24 @@ xfs_qm_scall_setqlim(
702 * soft and hard limit values (already done, above), and 671 * soft and hard limit values (already done, above), and
703 * for warnings. 672 * for warnings.
704 */ 673 */
705 if (newlim->d_fieldmask & FS_DQ_BTIMER) { 674 if (newlim->d_fieldmask & QC_SPC_TIMER) {
706 q->qi_btimelimit = newlim->d_btimer; 675 q->qi_btimelimit = newlim->d_spc_timer;
707 ddq->d_btimer = cpu_to_be32(newlim->d_btimer); 676 ddq->d_btimer = cpu_to_be32(newlim->d_spc_timer);
708 } 677 }
709 if (newlim->d_fieldmask & FS_DQ_ITIMER) { 678 if (newlim->d_fieldmask & QC_INO_TIMER) {
710 q->qi_itimelimit = newlim->d_itimer; 679 q->qi_itimelimit = newlim->d_ino_timer;
711 ddq->d_itimer = cpu_to_be32(newlim->d_itimer); 680 ddq->d_itimer = cpu_to_be32(newlim->d_ino_timer);
712 } 681 }
713 if (newlim->d_fieldmask & FS_DQ_RTBTIMER) { 682 if (newlim->d_fieldmask & QC_RT_SPC_TIMER) {
714 q->qi_rtbtimelimit = newlim->d_rtbtimer; 683 q->qi_rtbtimelimit = newlim->d_rt_spc_timer;
715 ddq->d_rtbtimer = cpu_to_be32(newlim->d_rtbtimer); 684 ddq->d_rtbtimer = cpu_to_be32(newlim->d_rt_spc_timer);
716 } 685 }
717 if (newlim->d_fieldmask & FS_DQ_BWARNS) 686 if (newlim->d_fieldmask & QC_SPC_WARNS)
718 q->qi_bwarnlimit = newlim->d_bwarns; 687 q->qi_bwarnlimit = newlim->d_spc_warns;
719 if (newlim->d_fieldmask & FS_DQ_IWARNS) 688 if (newlim->d_fieldmask & QC_INO_WARNS)
720 q->qi_iwarnlimit = newlim->d_iwarns; 689 q->qi_iwarnlimit = newlim->d_ino_warns;
721 if (newlim->d_fieldmask & FS_DQ_RTBWARNS) 690 if (newlim->d_fieldmask & QC_RT_SPC_WARNS)
722 q->qi_rtbwarnlimit = newlim->d_rtbwarns; 691 q->qi_rtbwarnlimit = newlim->d_rt_spc_warns;
723 } else { 692 } else {
724 /* 693 /*
725 * If the user is now over quota, start the timelimit. 694 * If the user is now over quota, start the timelimit.
@@ -801,7 +770,7 @@ xfs_qm_log_quotaoff(
801 mp->m_sb.sb_qflags = (mp->m_qflags & ~(flags)) & XFS_MOUNT_QUOTA_ALL; 770 mp->m_sb.sb_qflags = (mp->m_qflags & ~(flags)) & XFS_MOUNT_QUOTA_ALL;
802 spin_unlock(&mp->m_sb_lock); 771 spin_unlock(&mp->m_sb_lock);
803 772
804 xfs_mod_sb(tp, XFS_SB_QFLAGS); 773 xfs_log_sb(tp);
805 774
806 /* 775 /*
807 * We have to make sure that the transaction is secure on disk before we 776 * We have to make sure that the transaction is secure on disk before we
@@ -824,7 +793,7 @@ xfs_qm_scall_getquota(
824 struct xfs_mount *mp, 793 struct xfs_mount *mp,
825 xfs_dqid_t id, 794 xfs_dqid_t id,
826 uint type, 795 uint type,
827 struct fs_disk_quota *dst) 796 struct qc_dqblk *dst)
828{ 797{
829 struct xfs_dquot *dqp; 798 struct xfs_dquot *dqp;
830 int error; 799 int error;
@@ -848,28 +817,25 @@ xfs_qm_scall_getquota(
848 } 817 }
849 818
850 memset(dst, 0, sizeof(*dst)); 819 memset(dst, 0, sizeof(*dst));
851 dst->d_version = FS_DQUOT_VERSION; 820 dst->d_spc_hardlimit =
852 dst->d_flags = xfs_qm_export_qtype_flags(dqp->q_core.d_flags); 821 XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_blk_hardlimit));
853 dst->d_id = be32_to_cpu(dqp->q_core.d_id); 822 dst->d_spc_softlimit =
854 dst->d_blk_hardlimit = 823 XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_blk_softlimit));
855 XFS_FSB_TO_BB(mp, be64_to_cpu(dqp->q_core.d_blk_hardlimit));
856 dst->d_blk_softlimit =
857 XFS_FSB_TO_BB(mp, be64_to_cpu(dqp->q_core.d_blk_softlimit));
858 dst->d_ino_hardlimit = be64_to_cpu(dqp->q_core.d_ino_hardlimit); 824 dst->d_ino_hardlimit = be64_to_cpu(dqp->q_core.d_ino_hardlimit);
859 dst->d_ino_softlimit = be64_to_cpu(dqp->q_core.d_ino_softlimit); 825 dst->d_ino_softlimit = be64_to_cpu(dqp->q_core.d_ino_softlimit);
860 dst->d_bcount = XFS_FSB_TO_BB(mp, dqp->q_res_bcount); 826 dst->d_space = XFS_FSB_TO_B(mp, dqp->q_res_bcount);
861 dst->d_icount = dqp->q_res_icount; 827 dst->d_ino_count = dqp->q_res_icount;
862 dst->d_btimer = be32_to_cpu(dqp->q_core.d_btimer); 828 dst->d_spc_timer = be32_to_cpu(dqp->q_core.d_btimer);
863 dst->d_itimer = be32_to_cpu(dqp->q_core.d_itimer); 829 dst->d_ino_timer = be32_to_cpu(dqp->q_core.d_itimer);
864 dst->d_iwarns = be16_to_cpu(dqp->q_core.d_iwarns); 830 dst->d_ino_warns = be16_to_cpu(dqp->q_core.d_iwarns);
865 dst->d_bwarns = be16_to_cpu(dqp->q_core.d_bwarns); 831 dst->d_spc_warns = be16_to_cpu(dqp->q_core.d_bwarns);
866 dst->d_rtb_hardlimit = 832 dst->d_rt_spc_hardlimit =
867 XFS_FSB_TO_BB(mp, be64_to_cpu(dqp->q_core.d_rtb_hardlimit)); 833 XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_rtb_hardlimit));
868 dst->d_rtb_softlimit = 834 dst->d_rt_spc_softlimit =
869 XFS_FSB_TO_BB(mp, be64_to_cpu(dqp->q_core.d_rtb_softlimit)); 835 XFS_FSB_TO_B(mp, be64_to_cpu(dqp->q_core.d_rtb_softlimit));
870 dst->d_rtbcount = XFS_FSB_TO_BB(mp, dqp->q_res_rtbcount); 836 dst->d_rt_space = XFS_FSB_TO_B(mp, dqp->q_res_rtbcount);
871 dst->d_rtbtimer = be32_to_cpu(dqp->q_core.d_rtbtimer); 837 dst->d_rt_spc_timer = be32_to_cpu(dqp->q_core.d_rtbtimer);
872 dst->d_rtbwarns = be16_to_cpu(dqp->q_core.d_rtbwarns); 838 dst->d_rt_spc_warns = be16_to_cpu(dqp->q_core.d_rtbwarns);
873 839
874 /* 840 /*
875 * Internally, we don't reset all the timers when quota enforcement 841 * Internally, we don't reset all the timers when quota enforcement
@@ -882,23 +848,23 @@ xfs_qm_scall_getquota(
882 dqp->q_core.d_flags == XFS_DQ_GROUP) || 848 dqp->q_core.d_flags == XFS_DQ_GROUP) ||
883 (!XFS_IS_PQUOTA_ENFORCED(mp) && 849 (!XFS_IS_PQUOTA_ENFORCED(mp) &&
884 dqp->q_core.d_flags == XFS_DQ_PROJ)) { 850 dqp->q_core.d_flags == XFS_DQ_PROJ)) {
885 dst->d_btimer = 0; 851 dst->d_spc_timer = 0;
886 dst->d_itimer = 0; 852 dst->d_ino_timer = 0;
887 dst->d_rtbtimer = 0; 853 dst->d_rt_spc_timer = 0;
888 } 854 }
889 855
890#ifdef DEBUG 856#ifdef DEBUG
891 if (((XFS_IS_UQUOTA_ENFORCED(mp) && dst->d_flags == FS_USER_QUOTA) || 857 if (((XFS_IS_UQUOTA_ENFORCED(mp) && type == XFS_DQ_USER) ||
892 (XFS_IS_GQUOTA_ENFORCED(mp) && dst->d_flags == FS_GROUP_QUOTA) || 858 (XFS_IS_GQUOTA_ENFORCED(mp) && type == XFS_DQ_GROUP) ||
893 (XFS_IS_PQUOTA_ENFORCED(mp) && dst->d_flags == FS_PROJ_QUOTA)) && 859 (XFS_IS_PQUOTA_ENFORCED(mp) && type == XFS_DQ_PROJ)) &&
894 dst->d_id != 0) { 860 id != 0) {
895 if ((dst->d_bcount > dst->d_blk_softlimit) && 861 if ((dst->d_space > dst->d_spc_softlimit) &&
896 (dst->d_blk_softlimit > 0)) { 862 (dst->d_spc_softlimit > 0)) {
897 ASSERT(dst->d_btimer != 0); 863 ASSERT(dst->d_spc_timer != 0);
898 } 864 }
899 if ((dst->d_icount > dst->d_ino_softlimit) && 865 if ((dst->d_ino_count > dst->d_ino_softlimit) &&
900 (dst->d_ino_softlimit > 0)) { 866 (dst->d_ino_softlimit > 0)) {
901 ASSERT(dst->d_itimer != 0); 867 ASSERT(dst->d_ino_timer != 0);
902 } 868 }
903 } 869 }
904#endif 870#endif
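
/*
 * Unit-change worked example (hypothetical 4k-block filesystem):
 * fs_disk_quota carried space values in 512-byte basic blocks, while
 * qc_dqblk carries plain bytes, hence XFS_FSB_TO_BB -> XFS_FSB_TO_B:
 *
 *      hard = 16 filesystem blocks
 *      XFS_FSB_TO_BB(mp, 16) == 128    (512-byte blocks, old interface)
 *      XFS_FSB_TO_B(mp, 16)  == 65536  (bytes, new interface)
 */
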
@@ -908,26 +874,6 @@ out_put:
908} 874}
909 875
910STATIC uint 876STATIC uint
911xfs_qm_export_qtype_flags(
912 uint flags)
913{
914 /*
915 * Can't be more than one, or none.
916 */
917 ASSERT((flags & (FS_PROJ_QUOTA | FS_USER_QUOTA)) !=
918 (FS_PROJ_QUOTA | FS_USER_QUOTA));
919 ASSERT((flags & (FS_PROJ_QUOTA | FS_GROUP_QUOTA)) !=
920 (FS_PROJ_QUOTA | FS_GROUP_QUOTA));
921 ASSERT((flags & (FS_USER_QUOTA | FS_GROUP_QUOTA)) !=
922 (FS_USER_QUOTA | FS_GROUP_QUOTA));
923 ASSERT((flags & (FS_PROJ_QUOTA|FS_USER_QUOTA|FS_GROUP_QUOTA)) != 0);
924
925 return (flags & XFS_DQ_USER) ?
926 FS_USER_QUOTA : (flags & XFS_DQ_PROJ) ?
927 FS_PROJ_QUOTA : FS_GROUP_QUOTA;
928}
929
930STATIC uint
931xfs_qm_export_flags( 877xfs_qm_export_flags(
932 uint flags) 878 uint flags)
933{ 879{
diff --git a/fs/xfs/xfs_quotaops.c b/fs/xfs/xfs_quotaops.c
index 7542bbeca6a1..6923905ab33d 100644
--- a/fs/xfs/xfs_quotaops.c
+++ b/fs/xfs/xfs_quotaops.c
@@ -64,19 +64,10 @@ xfs_fs_get_xstatev(
64 return xfs_qm_scall_getqstatv(mp, fqs); 64 return xfs_qm_scall_getqstatv(mp, fqs);
65} 65}
66 66
67STATIC int 67static unsigned int
68xfs_fs_set_xstate( 68xfs_quota_flags(unsigned int uflags)
69 struct super_block *sb,
70 unsigned int uflags,
71 int op)
72{ 69{
73 struct xfs_mount *mp = XFS_M(sb); 70 unsigned int flags = 0;
74 unsigned int flags = 0;
75
76 if (sb->s_flags & MS_RDONLY)
77 return -EROFS;
78 if (op != Q_XQUOTARM && !XFS_IS_QUOTA_RUNNING(mp))
79 return -ENOSYS;
80 71
81 if (uflags & FS_QUOTA_UDQ_ACCT) 72 if (uflags & FS_QUOTA_UDQ_ACCT)
82 flags |= XFS_UQUOTA_ACCT; 73 flags |= XFS_UQUOTA_ACCT;
@@ -91,16 +82,39 @@ xfs_fs_set_xstate(
91 if (uflags & FS_QUOTA_PDQ_ENFD) 82 if (uflags & FS_QUOTA_PDQ_ENFD)
92 flags |= XFS_PQUOTA_ENFD; 83 flags |= XFS_PQUOTA_ENFD;
93 84
94 switch (op) { 85 return flags;
95 case Q_XQUOTAON: 86}
96 return xfs_qm_scall_quotaon(mp, flags); 87
97 case Q_XQUOTAOFF: 88STATIC int
98 if (!XFS_IS_QUOTA_ON(mp)) 89xfs_quota_enable(
99 return -EINVAL; 90 struct super_block *sb,
100 return xfs_qm_scall_quotaoff(mp, flags); 91 unsigned int uflags)
101 } 92{
93 struct xfs_mount *mp = XFS_M(sb);
94
95 if (sb->s_flags & MS_RDONLY)
96 return -EROFS;
97 if (!XFS_IS_QUOTA_RUNNING(mp))
98 return -ENOSYS;
99
100 return xfs_qm_scall_quotaon(mp, xfs_quota_flags(uflags));
101}
102
103STATIC int
104xfs_quota_disable(
105 struct super_block *sb,
106 unsigned int uflags)
107{
108 struct xfs_mount *mp = XFS_M(sb);
109
110 if (sb->s_flags & MS_RDONLY)
111 return -EROFS;
112 if (!XFS_IS_QUOTA_RUNNING(mp))
113 return -ENOSYS;
114 if (!XFS_IS_QUOTA_ON(mp))
115 return -EINVAL;
102 116
103 return -EINVAL; 117 return xfs_qm_scall_quotaoff(mp, xfs_quota_flags(uflags));
104} 118}
105 119
106STATIC int 120STATIC int
@@ -131,7 +145,7 @@ STATIC int
131xfs_fs_get_dqblk( 145xfs_fs_get_dqblk(
132 struct super_block *sb, 146 struct super_block *sb,
133 struct kqid qid, 147 struct kqid qid,
134 struct fs_disk_quota *fdq) 148 struct qc_dqblk *qdq)
135{ 149{
136 struct xfs_mount *mp = XFS_M(sb); 150 struct xfs_mount *mp = XFS_M(sb);
137 151
@@ -141,14 +155,14 @@ xfs_fs_get_dqblk(
141 return -ESRCH; 155 return -ESRCH;
142 156
143 return xfs_qm_scall_getquota(mp, from_kqid(&init_user_ns, qid), 157 return xfs_qm_scall_getquota(mp, from_kqid(&init_user_ns, qid),
144 xfs_quota_type(qid.type), fdq); 158 xfs_quota_type(qid.type), qdq);
145} 159}
146 160
147STATIC int 161STATIC int
148xfs_fs_set_dqblk( 162xfs_fs_set_dqblk(
149 struct super_block *sb, 163 struct super_block *sb,
150 struct kqid qid, 164 struct kqid qid,
151 struct fs_disk_quota *fdq) 165 struct qc_dqblk *qdq)
152{ 166{
153 struct xfs_mount *mp = XFS_M(sb); 167 struct xfs_mount *mp = XFS_M(sb);
154 168
@@ -160,13 +174,14 @@ xfs_fs_set_dqblk(
160 return -ESRCH; 174 return -ESRCH;
161 175
162 return xfs_qm_scall_setqlim(mp, from_kqid(&init_user_ns, qid), 176 return xfs_qm_scall_setqlim(mp, from_kqid(&init_user_ns, qid),
163 xfs_quota_type(qid.type), fdq); 177 xfs_quota_type(qid.type), qdq);
164} 178}
165 179
166const struct quotactl_ops xfs_quotactl_operations = { 180const struct quotactl_ops xfs_quotactl_operations = {
167 .get_xstatev = xfs_fs_get_xstatev, 181 .get_xstatev = xfs_fs_get_xstatev,
168 .get_xstate = xfs_fs_get_xstate, 182 .get_xstate = xfs_fs_get_xstate,
169 .set_xstate = xfs_fs_set_xstate, 183 .quota_enable = xfs_quota_enable,
184 .quota_disable = xfs_quota_disable,
170 .rm_xquota = xfs_fs_rm_xquota, 185 .rm_xquota = xfs_fs_rm_xquota,
171 .get_dqblk = xfs_fs_get_dqblk, 186 .get_dqblk = xfs_fs_get_dqblk,
172 .set_dqblk = xfs_fs_set_dqblk, 187 .set_dqblk = xfs_fs_set_dqblk,
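
/*
 * Sketch of the VFS side this targets (assumed): the quotactl core now
 * dispatches Q_XQUOTAON/Q_XQUOTAOFF to the new callbacks instead of
 * funnelling both through ->set_xstate with an op code.
 */
static int quota_enable(struct super_block *sb, void __user *addr)
{
        unsigned int flags;

        if (copy_from_user(&flags, addr, sizeof(flags)))
                return -EFAULT;
        if (!sb->s_qcop->quota_enable)
                return -ENOSYS;
        return sb->s_qcop->quota_enable(sb, flags);
}
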
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 19cbda196369..8fcc4ccc5c79 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -685,7 +685,7 @@ xfs_blkdev_get(
685 mp); 685 mp);
686 if (IS_ERR(*bdevp)) { 686 if (IS_ERR(*bdevp)) {
687 error = PTR_ERR(*bdevp); 687 error = PTR_ERR(*bdevp);
688 xfs_warn(mp, "Invalid device [%s], error=%d\n", name, error); 688 xfs_warn(mp, "Invalid device [%s], error=%d", name, error);
689 } 689 }
690 690
691 return error; 691 return error;
@@ -1111,6 +1111,11 @@ xfs_fs_statfs(
1111 statp->f_files, 1111 statp->f_files,
1112 mp->m_maxicount); 1112 mp->m_maxicount);
1113 1113
1114 /* If sb_icount overshot maxicount, report actual allocation */
1115 statp->f_files = max_t(typeof(statp->f_files),
1116 statp->f_files,
1117 sbp->sb_icount);
1118
1114 /* make sure statp->f_ffree does not underflow */ 1119 /* make sure statp->f_ffree does not underflow */
1115 ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree); 1120 ffree = statp->f_files - (sbp->sb_icount - sbp->sb_ifree);
1116 statp->f_ffree = max_t(__int64_t, ffree, 0); 1121 statp->f_ffree = max_t(__int64_t, ffree, 0);
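
/*
 * Worked example with hypothetical numbers: m_maxicount = 100,
 * sb_icount = 120 after an overshoot, sb_ifree = 5:
 *
 *      f_files = max(100, 120) = 120
 *      f_ffree = 120 - (120 - 5) = 5   (instead of clamping -15 to 0)
 */
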
@@ -1257,13 +1262,13 @@ xfs_fs_remount(
 		 * If this is the first remount to writeable state we
 		 * might have some superblock changes to update.
 		 */
-		if (mp->m_update_flags) {
-			error = xfs_mount_log_sb(mp, mp->m_update_flags);
+		if (mp->m_update_sb) {
+			error = xfs_sync_sb(mp, false);
 			if (error) {
 				xfs_warn(mp, "failed to write sb changes");
 				return error;
 			}
-			mp->m_update_flags = 0;
+			mp->m_update_sb = false;
 		}
 
 		/*
@@ -1293,8 +1298,9 @@ xfs_fs_remount(
 
 /*
  * Second stage of a freeze. The data is already frozen so we only
- * need to take care of the metadata. Once that's done write a dummy
- * record to dirty the log in case of a crash while frozen.
+ * need to take care of the metadata. Once that's done sync the superblock
+ * to the log to dirty it in case of a crash while frozen. This ensures that we
+ * will recover the unlinked inode lists on the next mount.
  */
 STATIC int
 xfs_fs_freeze(
@@ -1304,7 +1310,7 @@ xfs_fs_freeze(
 
 	xfs_save_resvblks(mp);
 	xfs_quiesce_attr(mp);
-	return xfs_fs_log_dummy(mp);
+	return xfs_sync_sb(mp, true);
 }
 
 STATIC int
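
xfs_sync_sb() now serves both the remount path (asynchronously, above) and the freeze path (synchronously, here), replacing the dummy log record. A plausible shape for such a helper, hedged: the reservation name (tr_sb) and the flag arguments to cancel/commit below are assumptions in the style of the era's internal APIs, and the real implementation lives outside this diff:

int
xfs_sync_sb(
	struct xfs_mount	*mp,
	bool			wait)
{
	struct xfs_trans	*tp;
	int			error;

	tp = xfs_trans_alloc(mp, XFS_TRANS_SB_CHANGE);
	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_sb, 0, 0);
	if (error) {
		xfs_trans_cancel(tp, 0);
		return error;
	}

	xfs_log_sb(tp);			/* log the whole superblock */
	if (wait)			/* freeze wants it on disk */
		xfs_trans_set_sync(tp);
	return xfs_trans_commit(tp, 0);
}
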
@@ -1531,7 +1537,7 @@ xfs_fs_mount(
 static long
 xfs_fs_nr_cached_objects(
 	struct super_block	*sb,
-	int			nid)
+	struct shrink_control	*sc)
 {
 	return xfs_reclaim_inodes_count(XFS_M(sb));
 }
@@ -1539,10 +1545,9 @@ xfs_fs_nr_cached_objects(
 static long
 xfs_fs_free_cached_objects(
 	struct super_block	*sb,
-	long			nr_to_scan,
-	int			nid)
+	struct shrink_control	*sc)
 {
-	return xfs_reclaim_inodes_nr(XFS_M(sb), nr_to_scan);
+	return xfs_reclaim_inodes_nr(XFS_M(sb), sc->nr_to_scan);
 }
 
 static const struct super_operations xfs_super_operations = {
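
The two hunks above switch the per-sb cache shrinker hooks from the old (nr_to_scan, nid) calling convention to taking struct shrink_control directly, which bundles nr_to_scan, the NUMA node id and the allocation context into one argument. A minimal sketch of a filesystem wiring up the same pair of super_operations under the new convention; all examplefs_* names are hypothetical stand-ins:

#include <linux/fs.h>
#include <linux/shrinker.h>

/* Hypothetical per-fs cache accounting, declared for the sketch. */
extern long examplefs_count_reclaimable(struct super_block *sb);
extern long examplefs_reclaim(struct super_block *sb, long nr);

static long
examplefs_nr_cached_objects(
	struct super_block	*sb,
	struct shrink_control	*sc)
{
	/* Report how many objects could be freed; sc->nid is available
	 * here if the cache keeps per-node counts. */
	return examplefs_count_reclaimable(sb);
}

static long
examplefs_free_cached_objects(
	struct super_block	*sb,
	struct shrink_control	*sc)
{
	/* Free at most sc->nr_to_scan objects, return the number freed. */
	return examplefs_reclaim(sb, sc->nr_to_scan);
}

static const struct super_operations examplefs_super_operations = {
	.nr_cached_objects	= examplefs_nr_cached_objects,
	.free_cached_objects	= examplefs_free_cached_objects,
};
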
diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c
index 1743b9f8e23d..a0c8067cea6f 100644
--- a/fs/xfs/xfs_sysctl.c
+++ b/fs/xfs/xfs_sysctl.c
@@ -149,24 +149,6 @@ static struct ctl_table xfs_table[] = {
 		.extra2		= &xfs_params.inherit_noatim.max
 	},
 	{
-		.procname	= "xfsbufd_centisecs",
-		.data		= &xfs_params.xfs_buf_timer.val,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &xfs_params.xfs_buf_timer.min,
-		.extra2		= &xfs_params.xfs_buf_timer.max
-	},
-	{
-		.procname	= "age_buffer_centisecs",
-		.data		= &xfs_params.xfs_buf_age.val,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
-		.extra1		= &xfs_params.xfs_buf_age.min,
-		.extra2		= &xfs_params.xfs_buf_age.max
-	},
-	{
 		.procname	= "inherit_nosymlinks",
 		.data		= &xfs_params.inherit_nosym.val,
 		.maxlen		= sizeof(int),
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index fa3135b9bf04..eb90cd59a0ec 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -472,6 +472,7 @@ xfs_trans_apply_sb_deltas(
 		whole = 1;
 	}
 
+	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF);
 	if (whole)
 		/*
 		 * Log the whole thing, the fields are noncontiguous.
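
The added xfs_trans_buf_set_type() call tags the superblock buffer as XFS_BLFT_SB_BUF before it is logged, so log recovery knows which verifier to apply when replaying the buffer. The usual tag-before-log pattern, sketched with the same calls this function uses (the whole-buffer byte range mirrors the "log the whole thing" branch above):

	struct xfs_buf	*bp;

	bp = xfs_trans_getsb(tp, mp, 0);	/* grab the sb buffer */
	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF);
	/* log the whole on-disk superblock; the fields are noncontiguous */
	xfs_trans_log_buf(tp, bp, 0, sizeof(struct xfs_dsb) - 1);
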
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 0a4d4ab6d9a9..75798412859a 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -327,9 +327,10 @@ xfs_trans_read_buf_map(
 		return -EIO;
 	}
 
-	if (tp)
+	if (tp) {
 		_xfs_trans_bjoin(tp, bp, 1);
-	trace_xfs_trans_read_buf(bp->b_fspriv);
+		trace_xfs_trans_read_buf(bp->b_fspriv);
+	}
 	*bpp = bp;
 	return 0;
 