Diffstat (limited to 'fs')
202 files changed, 3165 insertions, 2122 deletions
diff --git a/fs/9p/Kconfig b/fs/9p/Kconfig
index 0a93dc1cb4ac..55abfd62654a 100644
--- a/fs/9p/Kconfig
+++ b/fs/9p/Kconfig
@@ -11,8 +11,7 @@ config 9P_FS
 
 if 9P_FS
 config 9P_FSCACHE
-	bool "Enable 9P client caching support (EXPERIMENTAL)"
-	depends on EXPERIMENTAL
+	bool "Enable 9P client caching support"
 	depends on 9P_FS=m && FSCACHE || 9P_FS=y && FSCACHE=y
 	help
 	  Choose Y here to enable persistent, read-only local
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index ff911e779651..be1e34adc3c6 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -52,10 +52,9 @@
  */
 
 struct p9_rdir {
-	struct mutex mutex;
 	int head;
 	int tail;
-	uint8_t *buf;
+	uint8_t buf[];
 };
 
 /**
@@ -93,33 +92,12 @@ static void p9stat_init(struct p9_wstat *stbuf)
  *
  */
 
-static int v9fs_alloc_rdir_buf(struct file *filp, int buflen)
+static struct p9_rdir *v9fs_alloc_rdir_buf(struct file *filp, int buflen)
 {
-	struct p9_rdir *rdir;
-	struct p9_fid *fid;
-	int err = 0;
-
-	fid = filp->private_data;
-	if (!fid->rdir) {
-		rdir = kmalloc(sizeof(struct p9_rdir) + buflen, GFP_KERNEL);
-
-		if (rdir == NULL) {
-			err = -ENOMEM;
-			goto exit;
-		}
-		spin_lock(&filp->f_dentry->d_lock);
-		if (!fid->rdir) {
-			rdir->buf = (uint8_t *)rdir + sizeof(struct p9_rdir);
-			mutex_init(&rdir->mutex);
-			rdir->head = rdir->tail = 0;
-			fid->rdir = (void *) rdir;
-			rdir = NULL;
-		}
-		spin_unlock(&filp->f_dentry->d_lock);
-		kfree(rdir);
-	}
-exit:
-	return err;
+	struct p9_fid *fid = filp->private_data;
+	if (!fid->rdir)
+		fid->rdir = kzalloc(sizeof(struct p9_rdir) + buflen, GFP_KERNEL);
+	return fid->rdir;
 }
 
 /**
@@ -145,20 +123,16 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
 
 	buflen = fid->clnt->msize - P9_IOHDRSZ;
 
-	err = v9fs_alloc_rdir_buf(filp, buflen);
-	if (err)
-		goto exit;
-	rdir = (struct p9_rdir *) fid->rdir;
+	rdir = v9fs_alloc_rdir_buf(filp, buflen);
+	if (!rdir)
+		return -ENOMEM;
 
-	err = mutex_lock_interruptible(&rdir->mutex);
-	if (err)
-		return err;
-	while (err == 0) {
+	while (1) {
 		if (rdir->tail == rdir->head) {
 			err = v9fs_file_readn(filp, rdir->buf, NULL,
 					      buflen, filp->f_pos);
 			if (err <= 0)
-				goto unlock_and_exit;
+				return err;
 
 			rdir->head = 0;
 			rdir->tail = err;
@@ -169,9 +143,8 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
 					rdir->tail - rdir->head, &st);
 			if (err) {
 				p9_debug(P9_DEBUG_VFS, "returned %d\n", err);
-				err = -EIO;
 				p9stat_free(&st);
-				goto unlock_and_exit;
+				return -EIO;
 			}
 			reclen = st.size+2;
 
@@ -180,19 +153,13 @@ static int v9fs_dir_readdir(struct file *filp, void *dirent, filldir_t filldir)
 
 			p9stat_free(&st);
 
-			if (over) {
-				err = 0;
-				goto unlock_and_exit;
-			}
+			if (over)
+				return 0;
+
 			rdir->head += reclen;
 			filp->f_pos += reclen;
 		}
 	}
-
-unlock_and_exit:
-	mutex_unlock(&rdir->mutex);
-exit:
-	return err;
 }
 
 /**
@@ -218,21 +185,16 @@ static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent,
 
 	buflen = fid->clnt->msize - P9_READDIRHDRSZ;
 
-	err = v9fs_alloc_rdir_buf(filp, buflen);
-	if (err)
-		goto exit;
-	rdir = (struct p9_rdir *) fid->rdir;
+	rdir = v9fs_alloc_rdir_buf(filp, buflen);
+	if (!rdir)
+		return -ENOMEM;
 
-	err = mutex_lock_interruptible(&rdir->mutex);
-	if (err)
-		return err;
-
-	while (err == 0) {
+	while (1) {
 		if (rdir->tail == rdir->head) {
 			err = p9_client_readdir(fid, rdir->buf, buflen,
 						filp->f_pos);
 			if (err <= 0)
-				goto unlock_and_exit;
+				return err;
 
 			rdir->head = 0;
 			rdir->tail = err;
@@ -245,8 +207,7 @@ static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent,
 						&curdirent);
 			if (err < 0) {
 				p9_debug(P9_DEBUG_VFS, "returned %d\n", err);
-				err = -EIO;
-				goto unlock_and_exit;
+				return -EIO;
 			}
 
 			/* d_off in dirent structure tracks the offset into
@@ -261,20 +222,13 @@ static int v9fs_dir_readdir_dotl(struct file *filp, void *dirent,
 				       curdirent.d_type);
 			oldoffset = curdirent.d_off;
 
-			if (over) {
-				err = 0;
-				goto unlock_and_exit;
-			}
+			if (over)
+				return 0;
 
 			filp->f_pos = curdirent.d_off;
 			rdir->head += err;
 		}
 	}
-
-unlock_and_exit:
-	mutex_unlock(&rdir->mutex);
-exit:
-	return err;
 }
 
 
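The struct p9_rdir change above drops the separately tracked buffer pointer (and the mutex) in favour of a C99 flexible array member, so the header and its read buffer come from one kzalloc() and are freed together. A minimal userspace sketch of that allocation pattern, using hypothetical names rather than the real 9p types:

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

/* Header followed by its payload in a single allocation, as in the new p9_rdir. */
struct rdir_buf {
	int head;
	int tail;
	uint8_t buf[];		/* flexible array member; no extra pointer to fix up */
};

static struct rdir_buf *rdir_buf_alloc(size_t buflen)
{
	/* One zeroing allocation covers the header plus buflen payload bytes,
	 * mirroring the kzalloc(sizeof(struct p9_rdir) + buflen) in the patch. */
	return calloc(1, sizeof(struct rdir_buf) + buflen);
}

int main(void)
{
	struct rdir_buf *r = rdir_buf_alloc(4096);
	if (!r)
		return 1;
	memset(r->buf, 0xff, 4096);	/* the trailing buffer is directly addressable */
	free(r);			/* a single free releases header and buffer */
	return 0;
}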
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index c2483e97beee..c921ac92ea4c 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -80,10 +80,6 @@ int v9fs_file_open(struct inode *inode, struct file *file)
 			p9_client_clunk(fid);
 			return err;
 		}
-		if (file->f_flags & O_TRUNC) {
-			i_size_write(inode, 0);
-			inode->i_blocks = 0;
-		}
 		if ((file->f_flags & O_APPEND) &&
 		    (!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses)))
 			generic_file_llseek(file, 0, SEEK_END);
@@ -620,6 +616,7 @@ v9fs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	lock_page(page);
 	if (page->mapping != inode->i_mapping)
 		goto out_unlock;
+	wait_for_stable_page(page);
 
 	return VM_FAULT_LOCKED;
 out_unlock:
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 890bed538f9b..57d017ac68e4 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -192,9 +192,6 @@ int v9fs_uflags2omode(int uflags, int extended)
 		break;
 	}
 
-	if (uflags & O_TRUNC)
-		ret |= P9_OTRUNC;
-
 	if (extended) {
 		if (uflags & O_EXCL)
 			ret |= P9_OEXCL;
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 40895546e103..8d24ad66dfb8 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -186,7 +186,6 @@ static int v9fs_mapped_dotl_flags(int flags)
 		{ O_CREAT, P9_DOTL_CREATE },
 		{ O_EXCL, P9_DOTL_EXCL },
 		{ O_NOCTTY, P9_DOTL_NOCTTY },
-		{ O_TRUNC, P9_DOTL_TRUNC },
 		{ O_APPEND, P9_DOTL_APPEND },
 		{ O_NONBLOCK, P9_DOTL_NONBLOCK },
 		{ O_DSYNC, P9_DOTL_DSYNC },
@@ -268,8 +267,14 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
 	}
 
 	/* Only creates */
-	if (!(flags & O_CREAT) || dentry->d_inode)
+	if (!(flags & O_CREAT))
 		return finish_no_open(file, res);
+	else if (dentry->d_inode) {
+		if ((flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL))
+			return -EEXIST;
+		else
+			return finish_no_open(file, res);
+	}
 
 	v9ses = v9fs_inode2v9ses(dir);
 
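The atomic_open change above returns -EEXIST for an existing dentry only when the caller asked for both O_CREAT and O_EXCL, and otherwise falls back to finish_no_open(), which matches the regular open(2) contract. A small userspace check of that behaviour (the path below is just an example, not from the patch):

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/tmp/o_excl_demo";		/* example path */
	int fd;

	fd = open(path, O_CREAT | O_WRONLY, 0644);	/* plain O_CREAT: create or open */
	if (fd < 0) {
		perror("create");
		return 1;
	}
	close(fd);

	/* O_CREAT | O_EXCL on an existing file must fail with EEXIST. */
	fd = open(path, O_CREAT | O_EXCL | O_WRONLY, 0644);
	if (fd < 0 && errno == EEXIST)
		printf("got EEXIST, as the dotl atomic_open path now reports\n");
	else if (fd >= 0)
		close(fd);

	unlink(path);
	return 0;
}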
diff --git a/fs/Kconfig b/fs/Kconfig
index cfe512fd1caf..780725a463b1 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -68,16 +68,6 @@ source "fs/quota/Kconfig"
 source "fs/autofs4/Kconfig"
 source "fs/fuse/Kconfig"
 
-config CUSE
-	tristate "Character device in Userspace support"
-	depends on FUSE_FS
-	help
-	  This FUSE extension allows character devices to be
-	  implemented in userspace.
-
-	  If you want to develop or use userspace character device
-	  based on CUSE, answer Y or M.
-
 config GENERIC_ACL
 	bool
 	select FS_POSIX_ACL
diff --git a/fs/adfs/Kconfig b/fs/adfs/Kconfig
index e55182a74605..c5a7787dd5e9 100644
--- a/fs/adfs/Kconfig
+++ b/fs/adfs/Kconfig
@@ -1,6 +1,6 @@
 config ADFS_FS
-	tristate "ADFS file system support (EXPERIMENTAL)"
-	depends on BLOCK && EXPERIMENTAL
+	tristate "ADFS file system support"
+	depends on BLOCK
 	help
 	  The Acorn Disc Filing System is the standard file system of the
 	  RiscOS operating system which runs on Acorn's ARM-based Risc PC
diff --git a/fs/affs/Kconfig b/fs/affs/Kconfig
index cfad9afb4762..a04d9e848d05 100644
--- a/fs/affs/Kconfig
+++ b/fs/affs/Kconfig
@@ -1,6 +1,6 @@
 config AFFS_FS
-	tristate "Amiga FFS file system support (EXPERIMENTAL)"
-	depends on BLOCK && EXPERIMENTAL
+	tristate "Amiga FFS file system support"
+	depends on BLOCK
 	help
 	  The Fast File System (FFS) is the common file system used on hard
 	  disks by Amiga(tm) systems since AmigaOS Version 1.3 (34.20). Say Y
diff --git a/fs/afs/Kconfig b/fs/afs/Kconfig
index 8f975f25b486..ebba3b18e5da 100644
--- a/fs/afs/Kconfig
+++ b/fs/afs/Kconfig
@@ -1,6 +1,6 @@
 config AFS_FS
-	tristate "Andrew File System support (AFS) (EXPERIMENTAL)"
-	depends on INET && EXPERIMENTAL
+	tristate "Andrew File System support (AFS)"
+	depends on INET
 	select AF_RXRPC
 	select DNS_RESOLVER
 	help
@@ -22,8 +22,7 @@ config AFS_DEBUG
 	  If unsure, say N.
 
 config AFS_FSCACHE
-	bool "Provide AFS client caching support (EXPERIMENTAL)"
-	depends on EXPERIMENTAL
+	bool "Provide AFS client caching support"
 	depends on AFS_FS=m && FSCACHE || AFS_FS=y && FSCACHE=y
 	help
 	  Say Y here if you want AFS data to be cached locally on disk through
diff --git a/fs/befs/Kconfig b/fs/befs/Kconfig
index 7835d30f211f..edc5cc2aefad 100644
--- a/fs/befs/Kconfig
+++ b/fs/befs/Kconfig
@@ -1,6 +1,6 @@
 config BEFS_FS
-	tristate "BeOS file system (BeFS) support (read only) (EXPERIMENTAL)"
-	depends on BLOCK && EXPERIMENTAL
+	tristate "BeOS file system (BeFS) support (read only)"
+	depends on BLOCK
 	select NLS
 	help
 	  The BeOS File System (BeFS) is the native file system of Be, Inc's
diff --git a/fs/bfs/Kconfig b/fs/bfs/Kconfig
index c2336c62024f..3728a6479c64 100644
--- a/fs/bfs/Kconfig
+++ b/fs/bfs/Kconfig
@@ -1,6 +1,6 @@
 config BFS_FS
-	tristate "BFS file system support (EXPERIMENTAL)"
-	depends on BLOCK && EXPERIMENTAL
+	tristate "BFS file system support"
+	depends on BLOCK
 	help
 	  Boot File System (BFS) is a file system used under SCO UnixWare to
 	  allow the bootloader access to the kernel image and other important
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 0c42cdbabecf..ff9dbc630efa 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -33,6 +33,7 @@
 #include <linux/elf.h>
 #include <linux/utsname.h>
 #include <linux/coredump.h>
+#include <linux/sched.h>
 #include <asm/uaccess.h>
 #include <asm/param.h>
 #include <asm/page.h>
@@ -1248,7 +1249,7 @@ static int writenote(struct memelfnote *men, struct file *file,
 #undef DUMP_WRITE
 
 static void fill_elf_header(struct elfhdr *elf, int segs,
-			    u16 machine, u32 flags, u8 osabi)
+			    u16 machine, u32 flags)
 {
 	memset(elf, 0, sizeof(*elf));
 
@@ -1320,8 +1321,11 @@ static void fill_prstatus(struct elf_prstatus *prstatus,
 		cputime_to_timeval(cputime.utime, &prstatus->pr_utime);
 		cputime_to_timeval(cputime.stime, &prstatus->pr_stime);
 	} else {
-		cputime_to_timeval(p->utime, &prstatus->pr_utime);
-		cputime_to_timeval(p->stime, &prstatus->pr_stime);
+		cputime_t utime, stime;
+
+		task_cputime(p, &utime, &stime);
+		cputime_to_timeval(utime, &prstatus->pr_utime);
+		cputime_to_timeval(stime, &prstatus->pr_stime);
 	}
 	cputime_to_timeval(p->signal->cutime, &prstatus->pr_cutime);
 	cputime_to_timeval(p->signal->cstime, &prstatus->pr_cstime);
@@ -1630,7 +1634,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
 	 * Initialize the ELF file header.
 	 */
 	fill_elf_header(elf, phdrs,
-			view->e_machine, view->e_flags, view->ei_osabi);
+			view->e_machine, view->e_flags);
 
 	/*
 	 * Allocate a structure for each thread.
@@ -1870,7 +1874,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs,
 	elf_core_copy_regs(&info->prstatus->pr_reg, regs);
 
 	/* Set up header */
-	fill_elf_header(elf, phdrs, ELF_ARCH, ELF_CORE_EFLAGS, ELF_OSABI);
+	fill_elf_header(elf, phdrs, ELF_ARCH, ELF_CORE_EFLAGS);
 
 	/*
 	 * Set up the notes in similar form to SVR4 core dumps made
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index dc84732e554f..cb240dd3b402 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -1375,8 +1375,11 @@ static void fill_prstatus(struct elf_prstatus *prstatus,
 		cputime_to_timeval(cputime.utime, &prstatus->pr_utime);
 		cputime_to_timeval(cputime.stime, &prstatus->pr_stime);
 	} else {
-		cputime_to_timeval(p->utime, &prstatus->pr_utime);
-		cputime_to_timeval(p->stime, &prstatus->pr_stime);
+		cputime_t utime, stime;
+
+		task_cputime(p, &utime, &stime);
+		cputime_to_timeval(utime, &prstatus->pr_utime);
+		cputime_to_timeval(stime, &prstatus->pr_stime);
 	}
 	cputime_to_timeval(p->signal->cutime, &prstatus->pr_cutime);
 	cputime_to_timeval(p->signal->cstime, &prstatus->pr_cstime);
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 172f8491a2bd..78333a37f49d 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -994,6 +994,7 @@ int revalidate_disk(struct gendisk *disk)
 
 	mutex_lock(&bdev->bd_mutex);
 	check_disk_size_change(disk, bdev);
+	bdev->bd_invalidated = 0;
 	mutex_unlock(&bdev->bd_mutex);
 	bdput(bdev);
 	return ret;
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index d33f01c08b60..ccd25ba7a9ac 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -1,6 +1,5 @@
 config BTRFS_FS
-	tristate "Btrfs filesystem (EXPERIMENTAL) Unstable disk format"
-	depends on EXPERIMENTAL
+	tristate "Btrfs filesystem Unstable disk format"
 	select LIBCRC32C
 	select ZLIB_INFLATE
 	select ZLIB_DEFLATE
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 521e9d4424f6..1e59ed575cc9 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3997,7 +3997,7 @@ again:
 	 * We make the other tasks wait for the flush only when we can flush
 	 * all things.
 	 */
-	if (ret && flush == BTRFS_RESERVE_FLUSH_ALL) {
+	if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
 		flushing = true;
 		space_info->flush = 1;
 	}
@@ -4534,7 +4534,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
 	unsigned nr_extents = 0;
 	int extra_reserve = 0;
 	enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
-	int ret;
+	int ret = 0;
 	bool delalloc_lock = true;
 
 	/* If we are a free space inode we need to not flush since we will be in
@@ -4579,20 +4579,18 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
 	csum_bytes = BTRFS_I(inode)->csum_bytes;
 	spin_unlock(&BTRFS_I(inode)->lock);
 
-	if (root->fs_info->quota_enabled) {
+	if (root->fs_info->quota_enabled)
 		ret = btrfs_qgroup_reserve(root, num_bytes +
 					   nr_extents * root->leafsize);
-		if (ret) {
-			spin_lock(&BTRFS_I(inode)->lock);
-			calc_csum_metadata_size(inode, num_bytes, 0);
-			spin_unlock(&BTRFS_I(inode)->lock);
-			if (delalloc_lock)
-				mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
-			return ret;
-		}
-	}
 
-	ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
+	/*
+	 * ret != 0 here means the qgroup reservation failed, we go straight to
+	 * the shared error handling then.
+	 */
+	if (ret == 0)
+		ret = reserve_metadata_bytes(root, block_rsv,
+					     to_reserve, flush);
+
 	if (ret) {
 		u64 to_free = 0;
 		unsigned dropped;
@@ -5560,7 +5558,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
 	int empty_cluster = 2 * 1024 * 1024;
 	struct btrfs_space_info *space_info;
 	int loop = 0;
-	int index = 0;
+	int index = __get_raid_index(data);
 	int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ?
 		RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC;
 	bool found_uncached_bg = false;
@@ -6524,7 +6522,7 @@ reada:
 }
 
 /*
- * hepler to process tree block while walking down the tree.
+ * helper to process tree block while walking down the tree.
  *
  * when wc->stage == UPDATE_BACKREF, this function updates
  * back refs for pointers in the block.
@@ -6599,7 +6597,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
 }
 
 /*
- * hepler to process tree block pointer.
+ * helper to process tree block pointer.
  *
  * when wc->stage == DROP_REFERENCE, this function checks
 * reference count of the block pointed to. if the block
@@ -6737,7 +6735,7 @@ skip:
 }
 
 /*
- * hepler to process tree block while walking up the tree.
+ * helper to process tree block while walking up the tree.
 *
 * when wc->stage == DROP_REFERENCE, this function drops
 * reference count on the block.
@@ -6788,11 +6786,13 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
 					       &wc->flags[level]);
 		if (ret < 0) {
 			btrfs_tree_unlock_rw(eb, path->locks[level]);
+			path->locks[level] = 0;
 			return ret;
 		}
 		BUG_ON(wc->refs[level] == 0);
 		if (wc->refs[level] == 1) {
 			btrfs_tree_unlock_rw(eb, path->locks[level]);
+			path->locks[level] = 0;
 			return 1;
 		}
 	}
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index f169d6b11d7f..fdb7a8db3b57 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -171,6 +171,10 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next)
 	if (test_bit(EXTENT_FLAG_COMPRESSED, &prev->flags))
 		return 0;
 
+	if (test_bit(EXTENT_FLAG_LOGGING, &prev->flags) ||
+	    test_bit(EXTENT_FLAG_LOGGING, &next->flags))
+		return 0;
+
 	if (extent_map_end(prev) == next->start &&
 	    prev->flags == next->flags &&
 	    prev->bdev == next->bdev &&
@@ -255,7 +259,8 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len,
 	if (!em)
 		goto out;
 
-	list_move(&em->list, &tree->modified_extents);
+	if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags))
+		list_move(&em->list, &tree->modified_extents);
 	em->generation = gen;
 	clear_bit(EXTENT_FLAG_PINNED, &em->flags);
 	em->mod_start = em->start;
@@ -280,6 +285,13 @@ out:
 
 }
 
+void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em)
+{
+	clear_bit(EXTENT_FLAG_LOGGING, &em->flags);
+	if (em->in_tree)
+		try_merge_map(tree, em);
+}
+
 /**
  * add_extent_mapping - add new extent map to the extent tree
  * @tree: tree to insert new map in
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index 922943ce29e8..c6598c89cff8 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -69,6 +69,7 @@ void free_extent_map(struct extent_map *em);
 int __init extent_map_init(void);
 void extent_map_exit(void);
 int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, u64 gen);
+void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em);
 struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
 					 u64 start, u64 len);
 #endif
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index bd38cef42358..94aa53b38721 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -460,8 +460,8 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
 		if (!contig)
 			offset = page_offset(bvec->bv_page) + bvec->bv_offset;
 
-		if (!contig && (offset >= ordered->file_offset + ordered->len ||
-		    offset < ordered->file_offset)) {
+		if (offset >= ordered->file_offset + ordered->len ||
+		    offset < ordered->file_offset) {
 			unsigned long bytes_left;
 			sums->len = this_sum_bytes;
 			this_sum_bytes = 0;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 77061bf43edb..aeb84469d2c4 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -293,15 +293,24 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
 	struct btrfs_key key;
 	struct btrfs_ioctl_defrag_range_args range;
 	int num_defrag;
+	int index;
+	int ret;
 
 	/* get the inode */
 	key.objectid = defrag->root;
 	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
 	key.offset = (u64)-1;
+
+	index = srcu_read_lock(&fs_info->subvol_srcu);
+
 	inode_root = btrfs_read_fs_root_no_name(fs_info, &key);
 	if (IS_ERR(inode_root)) {
-		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
-		return PTR_ERR(inode_root);
+		ret = PTR_ERR(inode_root);
+		goto cleanup;
+	}
+	if (btrfs_root_refs(&inode_root->root_item) == 0) {
+		ret = -ENOENT;
+		goto cleanup;
 	}
 
 	key.objectid = defrag->ino;
@@ -309,9 +318,10 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
 	key.offset = 0;
 	inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
 	if (IS_ERR(inode)) {
-		kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
-		return PTR_ERR(inode);
+		ret = PTR_ERR(inode);
+		goto cleanup;
 	}
+	srcu_read_unlock(&fs_info->subvol_srcu, index);
 
 	/* do a chunk of defrag */
 	clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
@@ -346,6 +356,10 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
 
 	iput(inode);
 	return 0;
+cleanup:
+	srcu_read_unlock(&fs_info->subvol_srcu, index);
+	kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
+	return ret;
 }
 
 /*
@@ -1594,9 +1608,10 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
 		if (err < 0 && num_written > 0)
 			num_written = err;
 	}
-out:
+
 	if (sync)
 		atomic_dec(&BTRFS_I(inode)->sync_writers);
+out:
 	sb_end_write(inode->i_sb);
 	current->backing_dev_info = NULL;
 	return num_written ? num_written : err;
@@ -2241,6 +2256,7 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
 	if (lockend <= lockstart)
 		lockend = lockstart + root->sectorsize;
 
+	lockend--;
 	len = lockend - lockstart + 1;
 
 	len = max_t(u64, len, root->sectorsize);
@@ -2307,9 +2323,12 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
 			}
 		}
 
-		*offset = start;
-		free_extent_map(em);
-		break;
+		if (!test_bit(EXTENT_FLAG_PREALLOC,
+			      &em->flags)) {
+			*offset = start;
+			free_extent_map(em);
+			break;
+		}
 	}
 
 
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 59ea2e4349c9..0be7a8742a43 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -1862,11 +1862,13 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
 {
 	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
 	struct btrfs_free_space *info;
-	int ret = 0;
+	int ret;
+	bool re_search = false;
 
 	spin_lock(&ctl->tree_lock);
 
 again:
+	ret = 0;
 	if (!bytes)
 		goto out_lock;
 
@@ -1879,17 +1881,17 @@ again:
 		info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
 					  1, 0);
 		if (!info) {
-			/* the tree logging code might be calling us before we
-			 * have fully loaded the free space rbtree for this
-			 * block group. So it is possible the entry won't
-			 * be in the rbtree yet at all. The caching code
-			 * will make sure not to put it in the rbtree if
-			 * the logging code has pinned it.
+			/*
+			 * If we found a partial bit of our free space in a
+			 * bitmap but then couldn't find the other part this may
+			 * be a problem, so WARN about it.
 			 */
+			WARN_ON(re_search);
 			goto out_lock;
 		}
 	}
 
+	re_search = false;
 	if (!info->bitmap) {
 		unlink_free_space(ctl, info);
 		if (offset == info->offset) {
@@ -1935,8 +1937,10 @@ again:
 	}
 
 	ret = remove_from_bitmap(ctl, info, &offset, &bytes);
-	if (ret == -EAGAIN)
+	if (ret == -EAGAIN) {
+		re_search = true;
 		goto again;
+	}
 	BUG_ON(ret); /* logic error */
 out_lock:
 	spin_unlock(&ctl->tree_lock);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 16d9e8e191e6..cc93b23ca352 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -88,7 +88,7 @@ static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
 	[S_IFLNK >> S_SHIFT] = BTRFS_FT_SYMLINK,
 };
 
-static int btrfs_setsize(struct inode *inode, loff_t newsize);
+static int btrfs_setsize(struct inode *inode, struct iattr *attr);
 static int btrfs_truncate(struct inode *inode);
 static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
 static noinline int cow_file_range(struct inode *inode,
@@ -2478,6 +2478,18 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
 				continue;
 			}
 			nr_truncate++;
+
+			/* 1 for the orphan item deletion. */
+			trans = btrfs_start_transaction(root, 1);
+			if (IS_ERR(trans)) {
+				ret = PTR_ERR(trans);
+				goto out;
+			}
+			ret = btrfs_orphan_add(trans, inode);
+			btrfs_end_transaction(trans, root);
+			if (ret)
+				goto out;
+
 			ret = btrfs_truncate(inode);
 		} else {
 			nr_unlink++;
@@ -3665,6 +3677,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
 				block_end - cur_offset, 0);
 		if (IS_ERR(em)) {
 			err = PTR_ERR(em);
+			em = NULL;
 			break;
 		}
 		last_byte = min(extent_map_end(em), block_end);
@@ -3748,16 +3761,27 @@ next:
 	return err;
 }
 
-static int btrfs_setsize(struct inode *inode, loff_t newsize)
+static int btrfs_setsize(struct inode *inode, struct iattr *attr)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_trans_handle *trans;
 	loff_t oldsize = i_size_read(inode);
+	loff_t newsize = attr->ia_size;
+	int mask = attr->ia_valid;
 	int ret;
 
 	if (newsize == oldsize)
 		return 0;
 
+	/*
+	 * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a
+	 * special case where we need to update the times despite not having
+	 * these flags set.  For all other operations the VFS set these flags
+	 * explicitly if it wants a timestamp update.
+	 */
+	if (newsize != oldsize && (!(mask & (ATTR_CTIME | ATTR_MTIME))))
+		inode->i_ctime = inode->i_mtime = current_fs_time(inode->i_sb);
+
 	if (newsize > oldsize) {
 		truncate_pagecache(inode, oldsize, newsize);
 		ret = btrfs_cont_expand(inode, oldsize, newsize);
@@ -3783,9 +3807,34 @@ static int btrfs_setsize(struct inode *inode, loff_t newsize)
 		set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE,
 			&BTRFS_I(inode)->runtime_flags);
 
+		/*
+		 * 1 for the orphan item we're going to add
+		 * 1 for the orphan item deletion.
+		 */
+		trans = btrfs_start_transaction(root, 2);
+		if (IS_ERR(trans))
+			return PTR_ERR(trans);
+
+		/*
+		 * We need to do this in case we fail at _any_ point during the
+		 * actual truncate.  Once we do the truncate_setsize we could
+		 * invalidate pages which forces any outstanding ordered io to
+		 * be instantly completed which will give us extents that need
+		 * to be truncated.  If we fail to get an orphan inode down we
+		 * could have left over extents that were never meant to live,
+		 * so we need to garuntee from this point on that everything
+		 * will be consistent.
+		 */
+		ret = btrfs_orphan_add(trans, inode);
+		btrfs_end_transaction(trans, root);
+		if (ret)
+			return ret;
+
 		/* we don't support swapfiles, so vmtruncate shouldn't fail */
 		truncate_setsize(inode, newsize);
 		ret = btrfs_truncate(inode);
+		if (ret && inode->i_nlink)
+			btrfs_orphan_del(NULL, inode);
 	}
 
 	return ret;
@@ -3805,7 +3854,7 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
 		return err;
 
 	if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
-		err = btrfs_setsize(inode, attr->ia_size);
+		err = btrfs_setsize(inode, attr);
 		if (err)
 			return err;
 	}
@@ -5572,10 +5621,13 @@ struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *pag
 		return em;
 	if (em) {
 		/*
-		 * if our em maps to a hole, there might
-		 * actually be delalloc bytes behind it
+		 * if our em maps to
+		 * - a hole or
+		 * - a pre-alloc extent,
+		 * there might actually be delalloc bytes behind it.
 		 */
-		if (em->block_start != EXTENT_MAP_HOLE)
+		if (em->block_start != EXTENT_MAP_HOLE &&
+		    !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
 			return em;
 		else
 			hole_em = em;
@@ -5657,6 +5709,8 @@ struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *pag
 			 */
 			em->block_start = hole_em->block_start;
 			em->block_len = hole_len;
+			if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags))
+				set_bit(EXTENT_FLAG_PREALLOC, &em->flags);
 		} else {
 			em->start = range_start;
 			em->len = found;
@@ -6915,11 +6969,9 @@ static int btrfs_truncate(struct inode *inode)
 
 	/*
 	 * 1 for the truncate slack space
-	 * 1 for the orphan item we're going to add
-	 * 1 for the orphan item deletion
 	 * 1 for updating the inode.
 	 */
-	trans = btrfs_start_transaction(root, 4);
+	trans = btrfs_start_transaction(root, 2);
 	if (IS_ERR(trans)) {
 		err = PTR_ERR(trans);
 		goto out;
@@ -6930,12 +6982,6 @@ static int btrfs_truncate(struct inode *inode)
 				      min_size);
 	BUG_ON(ret);
 
-	ret = btrfs_orphan_add(trans, inode);
-	if (ret) {
-		btrfs_end_transaction(trans, root);
-		goto out;
-	}
-
 	/*
 	 * setattr is responsible for setting the ordered_data_close flag,
 	 * but that is only tested during the last file release.  That
@@ -7004,12 +7050,6 @@ static int btrfs_truncate(struct inode *inode)
 		ret = btrfs_orphan_del(trans, inode);
 		if (ret)
 			err = ret;
-	} else if (ret && inode->i_nlink > 0) {
-		/*
-		 * Failed to do the truncate, remove us from the in memory
-		 * orphan list.
-		 */
-		ret = btrfs_orphan_del(NULL, inode);
 	}
 
 	if (trans) {
@@ -7531,41 +7571,61 @@ void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
  */
 int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
 {
-	struct list_head *head = &root->fs_info->delalloc_inodes;
 	struct btrfs_inode *binode;
 	struct inode *inode;
 	struct btrfs_delalloc_work *work, *next;
 	struct list_head works;
+	struct list_head splice;
 	int ret = 0;
 
 	if (root->fs_info->sb->s_flags & MS_RDONLY)
 		return -EROFS;
 
 	INIT_LIST_HEAD(&works);
-
+	INIT_LIST_HEAD(&splice);
+again:
 	spin_lock(&root->fs_info->delalloc_lock);
-	while (!list_empty(head)) {
-		binode = list_entry(head->next, struct btrfs_inode,
+	list_splice_init(&root->fs_info->delalloc_inodes, &splice);
+	while (!list_empty(&splice)) {
+		binode = list_entry(splice.next, struct btrfs_inode,
 				    delalloc_inodes);
+
+		list_del_init(&binode->delalloc_inodes);
+
 		inode = igrab(&binode->vfs_inode);
 		if (!inode)
-			list_del_init(&binode->delalloc_inodes);
+			continue;
+
+		list_add_tail(&binode->delalloc_inodes,
+			      &root->fs_info->delalloc_inodes);
 		spin_unlock(&root->fs_info->delalloc_lock);
-		if (inode) {
-			work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
-			if (!work) {
-				ret = -ENOMEM;
-				goto out;
-			}
-			list_add_tail(&work->list, &works);
-			btrfs_queue_worker(&root->fs_info->flush_workers,
-					   &work->work);
-		}
+
+		work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
+		if (unlikely(!work)) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		list_add_tail(&work->list, &works);
+		btrfs_queue_worker(&root->fs_info->flush_workers,
+				   &work->work);
+
 		cond_resched();
 		spin_lock(&root->fs_info->delalloc_lock);
 	}
 	spin_unlock(&root->fs_info->delalloc_lock);
 
+	list_for_each_entry_safe(work, next, &works, list) {
+		list_del_init(&work->list);
+		btrfs_wait_and_free_delalloc_work(work);
+	}
+
+	spin_lock(&root->fs_info->delalloc_lock);
+	if (!list_empty(&root->fs_info->delalloc_inodes)) {
+		spin_unlock(&root->fs_info->delalloc_lock);
+		goto again;
+	}
+	spin_unlock(&root->fs_info->delalloc_lock);
+
 	/* the filemap_flush will queue IO into the worker threads, but
 	 * we have to make sure the IO is actually started and that
 	 * ordered extents get created before we return
@@ -7578,11 +7638,18 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
 		    atomic_read(&root->fs_info->async_delalloc_pages) == 0));
 	}
 	atomic_dec(&root->fs_info->async_submit_draining);
+	return 0;
 out:
 	list_for_each_entry_safe(work, next, &works, list) {
 		list_del_init(&work->list);
 		btrfs_wait_and_free_delalloc_work(work);
 	}
+
+	if (!list_empty_careful(&splice)) {
+		spin_lock(&root->fs_info->delalloc_lock);
+		list_splice_tail(&splice, &root->fs_info->delalloc_inodes);
+		spin_unlock(&root->fs_info->delalloc_lock);
+	}
 	return ret;
 }
 
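The btrfs_start_delalloc_inodes() rework above drains the shared delalloc list by splicing it onto a private list under the lock, queueing work from the private copy, and splicing anything left back onto the shared list in the error path. A rough standalone sketch of that splice-and-restore idea, using a plain singly linked list and a pthread mutex; the names here are illustrative, not btrfs or kernel APIs:

#include <pthread.h>
#include <stddef.h>

struct item {
	struct item *next;
};

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct item *pending;	/* shared work list, protected by lock */

/* Stand-in for queueing one unit of work; nonzero means failure. */
static int process(struct item *it)
{
	(void)it;
	return 0;
}

static int drain_pending(void)
{
	struct item *splice, *it;
	int ret = 0;

	/* Steal the whole shared list while holding the lock... */
	pthread_mutex_lock(&lock);
	splice = pending;
	pending = NULL;
	pthread_mutex_unlock(&lock);

	/* ...then walk the private copy without the lock held. */
	while ((it = splice) != NULL) {
		splice = it->next;
		ret = process(it);
		if (ret)
			break;
	}

	/* On failure, push the unprocessed tail back so nothing is lost,
	 * much like the list_splice_tail() cleanup in the patch. */
	if (splice) {
		pthread_mutex_lock(&lock);
		while ((it = splice) != NULL) {
			splice = it->next;
			it->next = pending;
			pending = it;
		}
		pthread_mutex_unlock(&lock);
	}
	return ret;
}

int main(void)
{
	return drain_pending();
}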
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 4b4516770f05..338f2597bf7f 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -515,7 +515,6 @@ static noinline int create_subvol(struct btrfs_root *root,
 
 	BUG_ON(ret);
 
-	d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry));
 fail:
 	if (async_transid) {
 		*async_transid = trans->transid;
@@ -525,6 +524,10 @@ fail:
 	}
 	if (err && !ret)
 		ret = err;
+
+	if (!ret)
+		d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry));
+
 	return ret;
 }
 
@@ -1339,7 +1342,8 @@ static noinline int btrfs_ioctl_resize(struct file *file,
 	if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
 			1)) {
 		pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
-		return -EINPROGRESS;
+		mnt_drop_write_file(file);
+		return -EINVAL;
 	}
 
 	mutex_lock(&root->fs_info->volume_mutex);
@@ -1362,6 +1366,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
 		printk(KERN_INFO "btrfs: resizing devid %llu\n",
 		       (unsigned long long)devid);
 	}
+
 	device = btrfs_find_device(root->fs_info, devid, NULL, NULL);
 	if (!device) {
 		printk(KERN_INFO "btrfs: resizer unable to find device %llu\n",
@@ -1369,9 +1374,10 @@ static noinline int btrfs_ioctl_resize(struct file *file,
 		ret = -EINVAL;
 		goto out_free;
 	}
-	if (device->fs_devices && device->fs_devices->seeding) {
+
+	if (!device->writeable) {
 		printk(KERN_INFO "btrfs: resizer unable to apply on "
-		       "seeding device %llu\n",
+		       "readonly device %llu\n",
 		       (unsigned long long)devid);
 		ret = -EINVAL;
 		goto out_free;
@@ -1443,8 +1449,8 @@ out_free:
 	kfree(vol_args);
 out:
 	mutex_unlock(&root->fs_info->volume_mutex);
-	mnt_drop_write_file(file);
 	atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
+	mnt_drop_write_file(file);
 	return ret;
 }
 
@@ -2095,13 +2101,13 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
 		err = inode_permission(inode, MAY_WRITE | MAY_EXEC);
 		if (err)
 			goto out_dput;
-
-		/* check if subvolume may be deleted by a non-root user */
-		err = btrfs_may_delete(dir, dentry, 1);
-		if (err)
-			goto out_dput;
 	}
 
+	/* check if subvolume may be deleted by a user */
+	err = btrfs_may_delete(dir, dentry, 1);
+	if (err)
+		goto out_dput;
+
 	if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) {
 		err = -EINVAL;
 		goto out_dput;
@@ -2183,19 +2189,20 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
 	struct btrfs_ioctl_defrag_range_args *range;
 	int ret;
 
-	if (btrfs_root_readonly(root))
-		return -EROFS;
+	ret = mnt_want_write_file(file);
+	if (ret)
+		return ret;
 
 	if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
 			1)) {
 		pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
-		return -EINPROGRESS;
+		mnt_drop_write_file(file);
+		return -EINVAL;
 	}
-	ret = mnt_want_write_file(file);
-	if (ret) {
-		atomic_set(&root->fs_info->mutually_exclusive_operation_running,
-			   0);
-		return ret;
+
+	if (btrfs_root_readonly(root)) {
+		ret = -EROFS;
+		goto out;
 	}
 
 	switch (inode->i_mode & S_IFMT) {
@@ -2247,8 +2254,8 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
 		ret = -EINVAL;
 	}
 out:
-	mnt_drop_write_file(file);
 	atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
+	mnt_drop_write_file(file);
 	return ret;
 }
 
@@ -2263,7 +2270,7 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
 	if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
 			1)) {
 		pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
-		return -EINPROGRESS;
+		return -EINVAL;
 	}
 
 	mutex_lock(&root->fs_info->volume_mutex);
@@ -2300,7 +2307,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
 			1)) {
 		pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
 		mnt_drop_write_file(file);
-		return -EINPROGRESS;
+		return -EINVAL;
 	}
 
 	mutex_lock(&root->fs_info->volume_mutex);
@@ -2316,8 +2323,8 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
 	kfree(vol_args);
 out:
 	mutex_unlock(&root->fs_info->volume_mutex);
-	mnt_drop_write_file(file);
 	atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0);
+	mnt_drop_write_file(file);
 	return ret;
 }
 
@@ -3437,8 +3444,8 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)
| 3437 | struct btrfs_fs_info *fs_info = root->fs_info; | 3444 | struct btrfs_fs_info *fs_info = root->fs_info; |
| 3438 | struct btrfs_ioctl_balance_args *bargs; | 3445 | struct btrfs_ioctl_balance_args *bargs; |
| 3439 | struct btrfs_balance_control *bctl; | 3446 | struct btrfs_balance_control *bctl; |
| 3447 | bool need_unlock; /* for mut. excl. ops lock */ | ||
| 3440 | int ret; | 3448 | int ret; |
| 3441 | int need_to_clear_lock = 0; | ||
| 3442 | 3449 | ||
| 3443 | if (!capable(CAP_SYS_ADMIN)) | 3450 | if (!capable(CAP_SYS_ADMIN)) |
| 3444 | return -EPERM; | 3451 | return -EPERM; |
| @@ -3447,14 +3454,61 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg) | |||
| 3447 | if (ret) | 3454 | if (ret) |
| 3448 | return ret; | 3455 | return ret; |
| 3449 | 3456 | ||
| 3450 | mutex_lock(&fs_info->volume_mutex); | 3457 | again: |
| 3458 | if (!atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) { | ||
| 3459 | mutex_lock(&fs_info->volume_mutex); | ||
| 3460 | mutex_lock(&fs_info->balance_mutex); | ||
| 3461 | need_unlock = true; | ||
| 3462 | goto locked; | ||
| 3463 | } | ||
| 3464 | |||
| 3465 | /* | ||
| 3466 | * mut. excl. ops lock is locked. Three possibilities: | ||
| 3467 | * (1) some other op is running | ||
| 3468 | * (2) balance is running | ||
| 3469 | * (3) balance is paused -- special case (think resume) | ||
| 3470 | */ | ||
| 3451 | mutex_lock(&fs_info->balance_mutex); | 3471 | mutex_lock(&fs_info->balance_mutex); |
| 3472 | if (fs_info->balance_ctl) { | ||
| 3473 | /* this is either (2) or (3) */ | ||
| 3474 | if (!atomic_read(&fs_info->balance_running)) { | ||
| 3475 | mutex_unlock(&fs_info->balance_mutex); | ||
| 3476 | if (!mutex_trylock(&fs_info->volume_mutex)) | ||
| 3477 | goto again; | ||
| 3478 | mutex_lock(&fs_info->balance_mutex); | ||
| 3479 | |||
| 3480 | if (fs_info->balance_ctl && | ||
| 3481 | !atomic_read(&fs_info->balance_running)) { | ||
| 3482 | /* this is (3) */ | ||
| 3483 | need_unlock = false; | ||
| 3484 | goto locked; | ||
| 3485 | } | ||
| 3486 | |||
| 3487 | mutex_unlock(&fs_info->balance_mutex); | ||
| 3488 | mutex_unlock(&fs_info->volume_mutex); | ||
| 3489 | goto again; | ||
| 3490 | } else { | ||
| 3491 | /* this is (2) */ | ||
| 3492 | mutex_unlock(&fs_info->balance_mutex); | ||
| 3493 | ret = -EINPROGRESS; | ||
| 3494 | goto out; | ||
| 3495 | } | ||
| 3496 | } else { | ||
| 3497 | /* this is (1) */ | ||
| 3498 | mutex_unlock(&fs_info->balance_mutex); | ||
| 3499 | pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); | ||
| 3500 | ret = -EINVAL; | ||
| 3501 | goto out; | ||
| 3502 | } | ||
| 3503 | |||
| 3504 | locked: | ||
| 3505 | BUG_ON(!atomic_read(&fs_info->mutually_exclusive_operation_running)); | ||
| 3452 | 3506 | ||
| 3453 | if (arg) { | 3507 | if (arg) { |
| 3454 | bargs = memdup_user(arg, sizeof(*bargs)); | 3508 | bargs = memdup_user(arg, sizeof(*bargs)); |
| 3455 | if (IS_ERR(bargs)) { | 3509 | if (IS_ERR(bargs)) { |
| 3456 | ret = PTR_ERR(bargs); | 3510 | ret = PTR_ERR(bargs); |
| 3457 | goto out; | 3511 | goto out_unlock; |
| 3458 | } | 3512 | } |
| 3459 | 3513 | ||
| 3460 | if (bargs->flags & BTRFS_BALANCE_RESUME) { | 3514 | if (bargs->flags & BTRFS_BALANCE_RESUME) { |
| @@ -3474,13 +3528,10 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg) | |||
| 3474 | bargs = NULL; | 3528 | bargs = NULL; |
| 3475 | } | 3529 | } |
| 3476 | 3530 | ||
| 3477 | if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, | 3531 | if (fs_info->balance_ctl) { |
| 3478 | 1)) { | ||
| 3479 | pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); | ||
| 3480 | ret = -EINPROGRESS; | 3532 | ret = -EINPROGRESS; |
| 3481 | goto out_bargs; | 3533 | goto out_bargs; |
| 3482 | } | 3534 | } |
| 3483 | need_to_clear_lock = 1; | ||
| 3484 | 3535 | ||
| 3485 | bctl = kzalloc(sizeof(*bctl), GFP_NOFS); | 3536 | bctl = kzalloc(sizeof(*bctl), GFP_NOFS); |
| 3486 | if (!bctl) { | 3537 | if (!bctl) { |
| @@ -3501,11 +3552,17 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg) | |||
| 3501 | } | 3552 | } |
| 3502 | 3553 | ||
| 3503 | do_balance: | 3554 | do_balance: |
| 3504 | ret = btrfs_balance(bctl, bargs); | ||
| 3505 | /* | 3555 | /* |
| 3506 | * bctl is freed in __cancel_balance or in free_fs_info if | 3556 | * Ownership of bctl and mutually_exclusive_operation_running |
| 3507 | * restriper was paused all the way until unmount | 3557 | * goes to btrfs_balance. bctl is freed in __cancel_balance, |
| 3558 | * or, if restriper was paused all the way until unmount, in | ||
| 3559 | * free_fs_info. mutually_exclusive_operation_running is | ||
| 3560 | * cleared in __cancel_balance. | ||
| 3508 | */ | 3561 | */ |
| 3562 | need_unlock = false; | ||
| 3563 | |||
| 3564 | ret = btrfs_balance(bctl, bargs); | ||
| 3565 | |||
| 3509 | if (arg) { | 3566 | if (arg) { |
| 3510 | if (copy_to_user(arg, bargs, sizeof(*bargs))) | 3567 | if (copy_to_user(arg, bargs, sizeof(*bargs))) |
| 3511 | ret = -EFAULT; | 3568 | ret = -EFAULT; |
| @@ -3513,12 +3570,12 @@ do_balance: | |||
| 3513 | 3570 | ||
| 3514 | out_bargs: | 3571 | out_bargs: |
| 3515 | kfree(bargs); | 3572 | kfree(bargs); |
| 3516 | out: | 3573 | out_unlock: |
| 3517 | if (need_to_clear_lock) | ||
| 3518 | atomic_set(&root->fs_info->mutually_exclusive_operation_running, | ||
| 3519 | 0); | ||
| 3520 | mutex_unlock(&fs_info->balance_mutex); | 3574 | mutex_unlock(&fs_info->balance_mutex); |
| 3521 | mutex_unlock(&fs_info->volume_mutex); | 3575 | mutex_unlock(&fs_info->volume_mutex); |
| 3576 | if (need_unlock) | ||
| 3577 | atomic_set(&fs_info->mutually_exclusive_operation_running, 0); | ||
| 3578 | out: | ||
| 3522 | mnt_drop_write_file(file); | 3579 | mnt_drop_write_file(file); |
| 3523 | return ret; | 3580 | return ret; |
| 3524 | } | 3581 | } |
| @@ -3698,6 +3755,11 @@ static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg) | |||
| 3698 | goto drop_write; | 3755 | goto drop_write; |
| 3699 | } | 3756 | } |
| 3700 | 3757 | ||
| 3758 | if (!sa->qgroupid) { | ||
| 3759 | ret = -EINVAL; | ||
| 3760 | goto out; | ||
| 3761 | } | ||
| 3762 | |||
| 3701 | trans = btrfs_join_transaction(root); | 3763 | trans = btrfs_join_transaction(root); |
| 3702 | if (IS_ERR(trans)) { | 3764 | if (IS_ERR(trans)) { |
| 3703 | ret = PTR_ERR(trans); | 3765 | ret = PTR_ERR(trans); |
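The ioctl.c hunks above keep converging on one ordering rule: claim the mutually_exclusive_operation_running flag with atomic_xchg() only after mnt_want_write_file() has succeeded, and release the two in reverse order on every exit path. A minimal sketch of that claim/release pattern, assuming a kernel context; the excl_running flag and do_volume_work() are hypothetical stand-ins, not the real btrfs handlers:

/* Sketch only: not the btrfs ioctl code, just the pattern it follows. */
static long sketch_exclusive_ioctl(struct file *file, atomic_t *excl_running)
{
	long ret;

	ret = mnt_want_write_file(file);
	if (ret)
		return ret;

	/*
	 * atomic_xchg() returns the previous value: non-zero means another
	 * add/delete/balance/replace/resize operation already owns the flag.
	 */
	if (atomic_xchg(excl_running, 1)) {
		mnt_drop_write_file(file);
		return -EINVAL;
	}

	ret = do_volume_work(file);	/* hypothetical payload */

	/* release in reverse order of acquisition, as the out: labels do */
	atomic_set(excl_running, 0);
	mnt_drop_write_file(file);
	return ret;
}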
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index f10731297040..e5ed56729607 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c | |||
| @@ -836,9 +836,16 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, | |||
| 836 | * if the disk i_size is already at the inode->i_size, or | 836 | * if the disk i_size is already at the inode->i_size, or |
| 837 | * this ordered extent is inside the disk i_size, we're done | 837 | * this ordered extent is inside the disk i_size, we're done |
| 838 | */ | 838 | */ |
| 839 | if (disk_i_size == i_size || offset <= disk_i_size) { | 839 | if (disk_i_size == i_size) |
| 840 | goto out; | ||
| 841 | |||
| 842 | /* | ||
| 843 | * We still need to update disk_i_size if outstanding_isize is greater | ||
| 844 | * than disk_i_size. | ||
| 845 | */ | ||
| 846 | if (offset <= disk_i_size && | ||
| 847 | (!ordered || ordered->outstanding_isize <= disk_i_size)) | ||
| 840 | goto out; | 848 | goto out; |
| 841 | } | ||
| 842 | 849 | ||
| 843 | /* | 850 | /* |
| 844 | * walk backward from this ordered extent to disk_i_size. | 851 | * walk backward from this ordered extent to disk_i_size. |
| @@ -870,7 +877,7 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, | |||
| 870 | break; | 877 | break; |
| 871 | if (test->file_offset >= i_size) | 878 | if (test->file_offset >= i_size) |
| 872 | break; | 879 | break; |
| 873 | if (test->file_offset >= disk_i_size) { | 880 | if (entry_end(test) > disk_i_size) { |
| 874 | /* | 881 | /* |
| 875 | * we don't update disk_i_size now, so record this | 882 | * we don't update disk_i_size now, so record this |
| 876 | * undealt i_size. Or we will not know the real | 883 | * undealt i_size. Or we will not know the real |
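Taken together, the two ordered-data.c hunks narrow the early exit in btrfs_ordered_update_i_size(): the on-disk i_size update may only be skipped when the ordered extent lies inside disk_i_size and no larger outstanding_isize is still pending. A condensed sketch of just that test; the wrapper function is hypothetical, the field names come from the hunk:

/* Sketch of the early-exit condition; not the real function. */
static bool sketch_can_skip_isize_update(u64 disk_i_size, u64 i_size, u64 offset,
					 struct btrfs_ordered_extent *ordered)
{
	if (disk_i_size == i_size)
		return true;
	/* still update disk_i_size when an outstanding_isize beyond it exists */
	return offset <= disk_i_size &&
	       (!ordered || ordered->outstanding_isize <= disk_i_size);
}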
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index fe9d02c45f8e..a5c856234323 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c | |||
| @@ -379,6 +379,13 @@ next1: | |||
| 379 | 379 | ||
| 380 | ret = add_relation_rb(fs_info, found_key.objectid, | 380 | ret = add_relation_rb(fs_info, found_key.objectid, |
| 381 | found_key.offset); | 381 | found_key.offset); |
| 382 | if (ret == -ENOENT) { | ||
| 383 | printk(KERN_WARNING | ||
| 384 | "btrfs: orphan qgroup relation 0x%llx->0x%llx\n", | ||
| 385 | (unsigned long long)found_key.objectid, | ||
| 386 | (unsigned long long)found_key.offset); | ||
| 387 | ret = 0; /* ignore the error */ | ||
| 388 | } | ||
| 382 | if (ret) | 389 | if (ret) |
| 383 | goto out; | 390 | goto out; |
| 384 | next2: | 391 | next2: |
| @@ -956,17 +963,28 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, | |||
| 956 | struct btrfs_fs_info *fs_info, u64 qgroupid) | 963 | struct btrfs_fs_info *fs_info, u64 qgroupid) |
| 957 | { | 964 | { |
| 958 | struct btrfs_root *quota_root; | 965 | struct btrfs_root *quota_root; |
| 966 | struct btrfs_qgroup *qgroup; | ||
| 959 | int ret = 0; | 967 | int ret = 0; |
| 960 | 968 | ||
| 961 | quota_root = fs_info->quota_root; | 969 | quota_root = fs_info->quota_root; |
| 962 | if (!quota_root) | 970 | if (!quota_root) |
| 963 | return -EINVAL; | 971 | return -EINVAL; |
| 964 | 972 | ||
| 973 | /* check if there are no relations to this qgroup */ | ||
| 974 | spin_lock(&fs_info->qgroup_lock); | ||
| 975 | qgroup = find_qgroup_rb(fs_info, qgroupid); | ||
| 976 | if (qgroup) { | ||
| 977 | if (!list_empty(&qgroup->groups) || !list_empty(&qgroup->members)) { | ||
| 978 | spin_unlock(&fs_info->qgroup_lock); | ||
| 979 | return -EBUSY; | ||
| 980 | } | ||
| 981 | } | ||
| 982 | spin_unlock(&fs_info->qgroup_lock); | ||
| 983 | |||
| 965 | ret = del_qgroup_item(trans, quota_root, qgroupid); | 984 | ret = del_qgroup_item(trans, quota_root, qgroupid); |
| 966 | 985 | ||
| 967 | spin_lock(&fs_info->qgroup_lock); | 986 | spin_lock(&fs_info->qgroup_lock); |
| 968 | del_qgroup_rb(quota_root->fs_info, qgroupid); | 987 | del_qgroup_rb(quota_root->fs_info, qgroupid); |
| 969 | |||
| 970 | spin_unlock(&fs_info->qgroup_lock); | 988 | spin_unlock(&fs_info->qgroup_lock); |
| 971 | 989 | ||
| 972 | return ret; | 990 | return ret; |
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 300e09ac3659..17c306bf177a 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c | |||
| @@ -3472,7 +3472,7 @@ out: | |||
| 3472 | } | 3472 | } |
| 3473 | 3473 | ||
| 3474 | /* | 3474 | /* |
| 3475 | * hepler to find all tree blocks that reference a given data extent | 3475 | * helper to find all tree blocks that reference a given data extent |
| 3476 | */ | 3476 | */ |
| 3477 | static noinline_for_stack | 3477 | static noinline_for_stack |
| 3478 | int add_data_references(struct reloc_control *rc, | 3478 | int add_data_references(struct reloc_control *rc, |
| @@ -3566,7 +3566,7 @@ int add_data_references(struct reloc_control *rc, | |||
| 3566 | } | 3566 | } |
| 3567 | 3567 | ||
| 3568 | /* | 3568 | /* |
| 3569 | * hepler to find next unprocessed extent | 3569 | * helper to find next unprocessed extent |
| 3570 | */ | 3570 | */ |
| 3571 | static noinline_for_stack | 3571 | static noinline_for_stack |
| 3572 | int find_next_extent(struct btrfs_trans_handle *trans, | 3572 | int find_next_extent(struct btrfs_trans_handle *trans, |
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index bdbb94f245c9..67783e03d121 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c | |||
| @@ -580,20 +580,29 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx) | |||
| 580 | int corrected = 0; | 580 | int corrected = 0; |
| 581 | struct btrfs_key key; | 581 | struct btrfs_key key; |
| 582 | struct inode *inode = NULL; | 582 | struct inode *inode = NULL; |
| 583 | struct btrfs_fs_info *fs_info; | ||
| 583 | u64 end = offset + PAGE_SIZE - 1; | 584 | u64 end = offset + PAGE_SIZE - 1; |
| 584 | struct btrfs_root *local_root; | 585 | struct btrfs_root *local_root; |
| 586 | int srcu_index; | ||
| 585 | 587 | ||
| 586 | key.objectid = root; | 588 | key.objectid = root; |
| 587 | key.type = BTRFS_ROOT_ITEM_KEY; | 589 | key.type = BTRFS_ROOT_ITEM_KEY; |
| 588 | key.offset = (u64)-1; | 590 | key.offset = (u64)-1; |
| 589 | local_root = btrfs_read_fs_root_no_name(fixup->root->fs_info, &key); | 591 | |
| 590 | if (IS_ERR(local_root)) | 592 | fs_info = fixup->root->fs_info; |
| 593 | srcu_index = srcu_read_lock(&fs_info->subvol_srcu); | ||
| 594 | |||
| 595 | local_root = btrfs_read_fs_root_no_name(fs_info, &key); | ||
| 596 | if (IS_ERR(local_root)) { | ||
| 597 | srcu_read_unlock(&fs_info->subvol_srcu, srcu_index); | ||
| 591 | return PTR_ERR(local_root); | 598 | return PTR_ERR(local_root); |
| 599 | } | ||
| 592 | 600 | ||
| 593 | key.type = BTRFS_INODE_ITEM_KEY; | 601 | key.type = BTRFS_INODE_ITEM_KEY; |
| 594 | key.objectid = inum; | 602 | key.objectid = inum; |
| 595 | key.offset = 0; | 603 | key.offset = 0; |
| 596 | inode = btrfs_iget(fixup->root->fs_info->sb, &key, local_root, NULL); | 604 | inode = btrfs_iget(fs_info->sb, &key, local_root, NULL); |
| 605 | srcu_read_unlock(&fs_info->subvol_srcu, srcu_index); | ||
| 597 | if (IS_ERR(inode)) | 606 | if (IS_ERR(inode)) |
| 598 | return PTR_ERR(inode); | 607 | return PTR_ERR(inode); |
| 599 | 608 | ||
| @@ -606,7 +615,6 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx) | |||
| 606 | } | 615 | } |
| 607 | 616 | ||
| 608 | if (PageUptodate(page)) { | 617 | if (PageUptodate(page)) { |
| 609 | struct btrfs_fs_info *fs_info; | ||
| 610 | if (PageDirty(page)) { | 618 | if (PageDirty(page)) { |
| 611 | /* | 619 | /* |
| 612 | * we need to write the data to the defect sector. the | 620 | * we need to write the data to the defect sector. the |
| @@ -3180,18 +3188,25 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx) | |||
| 3180 | u64 physical_for_dev_replace; | 3188 | u64 physical_for_dev_replace; |
| 3181 | u64 len; | 3189 | u64 len; |
| 3182 | struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info; | 3190 | struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info; |
| 3191 | int srcu_index; | ||
| 3183 | 3192 | ||
| 3184 | key.objectid = root; | 3193 | key.objectid = root; |
| 3185 | key.type = BTRFS_ROOT_ITEM_KEY; | 3194 | key.type = BTRFS_ROOT_ITEM_KEY; |
| 3186 | key.offset = (u64)-1; | 3195 | key.offset = (u64)-1; |
| 3196 | |||
| 3197 | srcu_index = srcu_read_lock(&fs_info->subvol_srcu); | ||
| 3198 | |||
| 3187 | local_root = btrfs_read_fs_root_no_name(fs_info, &key); | 3199 | local_root = btrfs_read_fs_root_no_name(fs_info, &key); |
| 3188 | if (IS_ERR(local_root)) | 3200 | if (IS_ERR(local_root)) { |
| 3201 | srcu_read_unlock(&fs_info->subvol_srcu, srcu_index); | ||
| 3189 | return PTR_ERR(local_root); | 3202 | return PTR_ERR(local_root); |
| 3203 | } | ||
| 3190 | 3204 | ||
| 3191 | key.type = BTRFS_INODE_ITEM_KEY; | 3205 | key.type = BTRFS_INODE_ITEM_KEY; |
| 3192 | key.objectid = inum; | 3206 | key.objectid = inum; |
| 3193 | key.offset = 0; | 3207 | key.offset = 0; |
| 3194 | inode = btrfs_iget(fs_info->sb, &key, local_root, NULL); | 3208 | inode = btrfs_iget(fs_info->sb, &key, local_root, NULL); |
| 3209 | srcu_read_unlock(&fs_info->subvol_srcu, srcu_index); | ||
| 3195 | if (IS_ERR(inode)) | 3210 | if (IS_ERR(inode)) |
| 3196 | return PTR_ERR(inode); | 3211 | return PTR_ERR(inode); |
| 3197 | 3212 | ||
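Both scrub.c call sites now hold the subvol_srcu read lock across the root lookup and the btrfs_iget() call, so the subvolume root cannot be dropped between the two steps. A stripped-down sketch of that locking shape; the wrapper function is hypothetical, the locking calls mirror the hunks above:

/* Sketch: SRCU-protected subvolume inode lookup. */
static struct inode *sketch_iget_in_subvol(struct btrfs_fs_info *fs_info,
					   u64 root_objectid, u64 inum)
{
	struct btrfs_root *local_root;
	struct btrfs_key key;
	struct inode *inode;
	int srcu_index;

	key.objectid = root_objectid;
	key.type = BTRFS_ROOT_ITEM_KEY;
	key.offset = (u64)-1;

	srcu_index = srcu_read_lock(&fs_info->subvol_srcu);

	local_root = btrfs_read_fs_root_no_name(fs_info, &key);
	if (IS_ERR(local_root)) {
		srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
		return ERR_CAST(local_root);
	}

	key.type = BTRFS_INODE_ITEM_KEY;
	key.objectid = inum;
	key.offset = 0;
	inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);

	/* drop the read lock only after btrfs_iget() has taken its reference */
	srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
	return inode;
}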
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 54454542ad40..321b7fb4e441 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c | |||
| @@ -1814,8 +1814,10 @@ static int name_cache_insert(struct send_ctx *sctx, | |||
| 1814 | (unsigned long)nce->ino); | 1814 | (unsigned long)nce->ino); |
| 1815 | if (!nce_head) { | 1815 | if (!nce_head) { |
| 1816 | nce_head = kmalloc(sizeof(*nce_head), GFP_NOFS); | 1816 | nce_head = kmalloc(sizeof(*nce_head), GFP_NOFS); |
| 1817 | if (!nce_head) | 1817 | if (!nce_head) { |
| 1818 | kfree(nce); | ||
| 1818 | return -ENOMEM; | 1819 | return -ENOMEM; |
| 1820 | } | ||
| 1819 | INIT_LIST_HEAD(nce_head); | 1821 | INIT_LIST_HEAD(nce_head); |
| 1820 | 1822 | ||
| 1821 | ret = radix_tree_insert(&sctx->name_cache, nce->ino, nce_head); | 1823 | ret = radix_tree_insert(&sctx->name_cache, nce->ino, nce_head); |
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 99545df1b86c..d8982e9601d3 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c | |||
| @@ -267,7 +267,7 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, | |||
| 267 | function, line, errstr); | 267 | function, line, errstr); |
| 268 | return; | 268 | return; |
| 269 | } | 269 | } |
| 270 | trans->transaction->aborted = errno; | 270 | ACCESS_ONCE(trans->transaction->aborted) = errno; |
| 271 | __btrfs_std_error(root->fs_info, function, line, errno, NULL); | 271 | __btrfs_std_error(root->fs_info, function, line, errno, NULL); |
| 272 | } | 272 | } |
| 273 | /* | 273 | /* |
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 87fac9a21ea5..4c0067c4f76d 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c | |||
| @@ -112,7 +112,6 @@ loop: | |||
| 112 | * to redo the trans_no_join checks above | 112 | * to redo the trans_no_join checks above |
| 113 | */ | 113 | */ |
| 114 | kmem_cache_free(btrfs_transaction_cachep, cur_trans); | 114 | kmem_cache_free(btrfs_transaction_cachep, cur_trans); |
| 115 | cur_trans = fs_info->running_transaction; | ||
| 116 | goto loop; | 115 | goto loop; |
| 117 | } else if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { | 116 | } else if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { |
| 118 | spin_unlock(&fs_info->trans_lock); | 117 | spin_unlock(&fs_info->trans_lock); |
| @@ -333,12 +332,14 @@ start_transaction(struct btrfs_root *root, u64 num_items, int type, | |||
| 333 | &root->fs_info->trans_block_rsv, | 332 | &root->fs_info->trans_block_rsv, |
| 334 | num_bytes, flush); | 333 | num_bytes, flush); |
| 335 | if (ret) | 334 | if (ret) |
| 336 | return ERR_PTR(ret); | 335 | goto reserve_fail; |
| 337 | } | 336 | } |
| 338 | again: | 337 | again: |
| 339 | h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); | 338 | h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); |
| 340 | if (!h) | 339 | if (!h) { |
| 341 | return ERR_PTR(-ENOMEM); | 340 | ret = -ENOMEM; |
| 341 | goto alloc_fail; | ||
| 342 | } | ||
| 342 | 343 | ||
| 343 | /* | 344 | /* |
| 344 | * If we are JOIN_NOLOCK we're already committing a transaction and | 345 | * If we are JOIN_NOLOCK we're already committing a transaction and |
| @@ -365,11 +366,7 @@ again: | |||
| 365 | if (ret < 0) { | 366 | if (ret < 0) { |
| 366 | /* We must get the transaction if we are JOIN_NOLOCK. */ | 367 | /* We must get the transaction if we are JOIN_NOLOCK. */ |
| 367 | BUG_ON(type == TRANS_JOIN_NOLOCK); | 368 | BUG_ON(type == TRANS_JOIN_NOLOCK); |
| 368 | 369 | goto join_fail; | |
| 369 | if (type < TRANS_JOIN_NOLOCK) | ||
| 370 | sb_end_intwrite(root->fs_info->sb); | ||
| 371 | kmem_cache_free(btrfs_trans_handle_cachep, h); | ||
| 372 | return ERR_PTR(ret); | ||
| 373 | } | 370 | } |
| 374 | 371 | ||
| 375 | cur_trans = root->fs_info->running_transaction; | 372 | cur_trans = root->fs_info->running_transaction; |
| @@ -410,6 +407,19 @@ got_it: | |||
| 410 | if (!current->journal_info && type != TRANS_USERSPACE) | 407 | if (!current->journal_info && type != TRANS_USERSPACE) |
| 411 | current->journal_info = h; | 408 | current->journal_info = h; |
| 412 | return h; | 409 | return h; |
| 410 | |||
| 411 | join_fail: | ||
| 412 | if (type < TRANS_JOIN_NOLOCK) | ||
| 413 | sb_end_intwrite(root->fs_info->sb); | ||
| 414 | kmem_cache_free(btrfs_trans_handle_cachep, h); | ||
| 415 | alloc_fail: | ||
| 416 | if (num_bytes) | ||
| 417 | btrfs_block_rsv_release(root, &root->fs_info->trans_block_rsv, | ||
| 418 | num_bytes); | ||
| 419 | reserve_fail: | ||
| 420 | if (qgroup_reserved) | ||
| 421 | btrfs_qgroup_free(root, qgroup_reserved); | ||
| 422 | return ERR_PTR(ret); | ||
| 413 | } | 423 | } |
| 414 | 424 | ||
| 415 | struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, | 425 | struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, |
| @@ -1468,7 +1478,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, | |||
| 1468 | goto cleanup_transaction; | 1478 | goto cleanup_transaction; |
| 1469 | } | 1479 | } |
| 1470 | 1480 | ||
| 1471 | if (cur_trans->aborted) { | 1481 | /* Stop the commit early if ->aborted is set */ |
| 1482 | if (unlikely(ACCESS_ONCE(cur_trans->aborted))) { | ||
| 1472 | ret = cur_trans->aborted; | 1483 | ret = cur_trans->aborted; |
| 1473 | goto cleanup_transaction; | 1484 | goto cleanup_transaction; |
| 1474 | } | 1485 | } |
| @@ -1574,6 +1585,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, | |||
| 1574 | wait_event(cur_trans->writer_wait, | 1585 | wait_event(cur_trans->writer_wait, |
| 1575 | atomic_read(&cur_trans->num_writers) == 1); | 1586 | atomic_read(&cur_trans->num_writers) == 1); |
| 1576 | 1587 | ||
| 1588 | /* ->aborted might be set after the previous check, so check it */ | ||
| 1589 | if (unlikely(ACCESS_ONCE(cur_trans->aborted))) { | ||
| 1590 | ret = cur_trans->aborted; | ||
| 1591 | goto cleanup_transaction; | ||
| 1592 | } | ||
| 1577 | /* | 1593 | /* |
| 1578 | * the reloc mutex makes sure that we stop | 1594 | * the reloc mutex makes sure that we stop |
| 1579 | * the balancing code from coming in and moving | 1595 | * the balancing code from coming in and moving |
| @@ -1657,6 +1673,17 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, | |||
| 1657 | goto cleanup_transaction; | 1673 | goto cleanup_transaction; |
| 1658 | } | 1674 | } |
| 1659 | 1675 | ||
| 1676 | /* | ||
| 1677 | * The tasks which save the space cache and inode cache may also | ||
| 1678 | * update ->aborted, check it. | ||
| 1679 | */ | ||
| 1680 | if (unlikely(ACCESS_ONCE(cur_trans->aborted))) { | ||
| 1681 | ret = cur_trans->aborted; | ||
| 1682 | mutex_unlock(&root->fs_info->tree_log_mutex); | ||
| 1683 | mutex_unlock(&root->fs_info->reloc_mutex); | ||
| 1684 | goto cleanup_transaction; | ||
| 1685 | } | ||
| 1686 | |||
| 1660 | btrfs_prepare_extent_commit(trans, root); | 1687 | btrfs_prepare_extent_commit(trans, root); |
| 1661 | 1688 | ||
| 1662 | cur_trans = root->fs_info->running_transaction; | 1689 | cur_trans = root->fs_info->running_transaction; |
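The start_transaction() hunk replaces scattered early returns with a single unwind ladder so the block reservation and the qgroup reservation taken at the top are always released on failure. A generic sketch of that goto-ladder idiom, using placeholder resources rather than the btrfs ones:

/* Sketch: acquire in order, release in reverse order via fall-through labels. */
static int sketch_start(struct sketch_ctx *c)
{
	int ret;

	ret = sketch_reserve_space(c);		/* placeholder for the block rsv */
	if (ret)
		goto reserve_fail;

	c->handle = sketch_alloc_handle();	/* placeholder for the trans handle */
	if (!c->handle) {
		ret = -ENOMEM;
		goto alloc_fail;
	}

	ret = sketch_join_transaction(c);	/* placeholder for join_transaction() */
	if (ret < 0)
		goto join_fail;

	return 0;

join_fail:
	sketch_free_handle(c->handle);
alloc_fail:
	sketch_release_space(c);
reserve_fail:
	sketch_free_qgroup_reservation(c);	/* mirrors btrfs_qgroup_free() */
	return ret;
}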
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 83186c7e45d4..9027bb1e7466 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c | |||
| @@ -3357,6 +3357,11 @@ static int log_one_extent(struct btrfs_trans_handle *trans, | |||
| 3357 | if (skip_csum) | 3357 | if (skip_csum) |
| 3358 | return 0; | 3358 | return 0; |
| 3359 | 3359 | ||
| 3360 | if (em->compress_type) { | ||
| 3361 | csum_offset = 0; | ||
| 3362 | csum_len = block_len; | ||
| 3363 | } | ||
| 3364 | |||
| 3360 | /* block start is already adjusted for the file extent offset. */ | 3365 | /* block start is already adjusted for the file extent offset. */ |
| 3361 | ret = btrfs_lookup_csums_range(log->fs_info->csum_root, | 3366 | ret = btrfs_lookup_csums_range(log->fs_info->csum_root, |
| 3362 | em->block_start + csum_offset, | 3367 | em->block_start + csum_offset, |
| @@ -3410,13 +3415,13 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, | |||
| 3410 | em = list_entry(extents.next, struct extent_map, list); | 3415 | em = list_entry(extents.next, struct extent_map, list); |
| 3411 | 3416 | ||
| 3412 | list_del_init(&em->list); | 3417 | list_del_init(&em->list); |
| 3413 | clear_bit(EXTENT_FLAG_LOGGING, &em->flags); | ||
| 3414 | 3418 | ||
| 3415 | /* | 3419 | /* |
| 3416 | * If we had an error we just need to delete everybody from our | 3420 | * If we had an error we just need to delete everybody from our |
| 3417 | * private list. | 3421 | * private list. |
| 3418 | */ | 3422 | */ |
| 3419 | if (ret) { | 3423 | if (ret) { |
| 3424 | clear_em_logging(tree, em); | ||
| 3420 | free_extent_map(em); | 3425 | free_extent_map(em); |
| 3421 | continue; | 3426 | continue; |
| 3422 | } | 3427 | } |
| @@ -3424,8 +3429,9 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, | |||
| 3424 | write_unlock(&tree->lock); | 3429 | write_unlock(&tree->lock); |
| 3425 | 3430 | ||
| 3426 | ret = log_one_extent(trans, inode, root, em, path); | 3431 | ret = log_one_extent(trans, inode, root, em, path); |
| 3427 | free_extent_map(em); | ||
| 3428 | write_lock(&tree->lock); | 3432 | write_lock(&tree->lock); |
| 3433 | clear_em_logging(tree, em); | ||
| 3434 | free_extent_map(em); | ||
| 3429 | } | 3435 | } |
| 3430 | WARN_ON(!list_empty(&extents)); | 3436 | WARN_ON(!list_empty(&extents)); |
| 3431 | write_unlock(&tree->lock); | 3437 | write_unlock(&tree->lock); |
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 5cce6aa74012..5cbb7f4b1672 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c | |||
| @@ -1431,7 +1431,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) | |||
| 1431 | } | 1431 | } |
| 1432 | } else { | 1432 | } else { |
| 1433 | ret = btrfs_get_bdev_and_sb(device_path, | 1433 | ret = btrfs_get_bdev_and_sb(device_path, |
| 1434 | FMODE_READ | FMODE_EXCL, | 1434 | FMODE_WRITE | FMODE_EXCL, |
| 1435 | root->fs_info->bdev_holder, 0, | 1435 | root->fs_info->bdev_holder, 0, |
| 1436 | &bdev, &bh); | 1436 | &bdev, &bh); |
| 1437 | if (ret) | 1437 | if (ret) |
| @@ -1556,7 +1556,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) | |||
| 1556 | ret = 0; | 1556 | ret = 0; |
| 1557 | 1557 | ||
| 1558 | /* Notify udev that device has changed */ | 1558 | /* Notify udev that device has changed */ |
| 1559 | btrfs_kobject_uevent(bdev, KOBJ_CHANGE); | 1559 | if (bdev) |
| 1560 | btrfs_kobject_uevent(bdev, KOBJ_CHANGE); | ||
| 1560 | 1561 | ||
| 1561 | error_brelse: | 1562 | error_brelse: |
| 1562 | brelse(bh); | 1563 | brelse(bh); |
| @@ -2614,7 +2615,14 @@ static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, | |||
| 2614 | cache = btrfs_lookup_block_group(fs_info, chunk_offset); | 2615 | cache = btrfs_lookup_block_group(fs_info, chunk_offset); |
| 2615 | chunk_used = btrfs_block_group_used(&cache->item); | 2616 | chunk_used = btrfs_block_group_used(&cache->item); |
| 2616 | 2617 | ||
| 2617 | user_thresh = div_factor_fine(cache->key.offset, bargs->usage); | 2618 | if (bargs->usage == 0) |
| 2619 | user_thresh = 0; | ||
| 2620 | else if (bargs->usage > 100) | ||
| 2621 | user_thresh = cache->key.offset; | ||
| 2622 | else | ||
| 2623 | user_thresh = div_factor_fine(cache->key.offset, | ||
| 2624 | bargs->usage); | ||
| 2625 | |||
| 2618 | if (chunk_used < user_thresh) | 2626 | if (chunk_used < user_thresh) |
| 2619 | ret = 0; | 2627 | ret = 0; |
| 2620 | 2628 | ||
| @@ -2959,6 +2967,8 @@ static void __cancel_balance(struct btrfs_fs_info *fs_info) | |||
| 2959 | unset_balance_control(fs_info); | 2967 | unset_balance_control(fs_info); |
| 2960 | ret = del_balance_item(fs_info->tree_root); | 2968 | ret = del_balance_item(fs_info->tree_root); |
| 2961 | BUG_ON(ret); | 2969 | BUG_ON(ret); |
| 2970 | |||
| 2971 | atomic_set(&fs_info->mutually_exclusive_operation_running, 0); | ||
| 2962 | } | 2972 | } |
| 2963 | 2973 | ||
| 2964 | void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock, | 2974 | void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock, |
| @@ -3138,8 +3148,10 @@ int btrfs_balance(struct btrfs_balance_control *bctl, | |||
| 3138 | out: | 3148 | out: |
| 3139 | if (bctl->flags & BTRFS_BALANCE_RESUME) | 3149 | if (bctl->flags & BTRFS_BALANCE_RESUME) |
| 3140 | __cancel_balance(fs_info); | 3150 | __cancel_balance(fs_info); |
| 3141 | else | 3151 | else { |
| 3142 | kfree(bctl); | 3152 | kfree(bctl); |
| 3153 | atomic_set(&fs_info->mutually_exclusive_operation_running, 0); | ||
| 3154 | } | ||
| 3143 | return ret; | 3155 | return ret; |
| 3144 | } | 3156 | } |
| 3145 | 3157 | ||
| @@ -3156,7 +3168,6 @@ static int balance_kthread(void *data) | |||
| 3156 | ret = btrfs_balance(fs_info->balance_ctl, NULL); | 3168 | ret = btrfs_balance(fs_info->balance_ctl, NULL); |
| 3157 | } | 3169 | } |
| 3158 | 3170 | ||
| 3159 | atomic_set(&fs_info->mutually_exclusive_operation_running, 0); | ||
| 3160 | mutex_unlock(&fs_info->balance_mutex); | 3171 | mutex_unlock(&fs_info->balance_mutex); |
| 3161 | mutex_unlock(&fs_info->volume_mutex); | 3172 | mutex_unlock(&fs_info->volume_mutex); |
| 3162 | 3173 | ||
| @@ -3179,7 +3190,6 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info) | |||
| 3179 | return 0; | 3190 | return 0; |
| 3180 | } | 3191 | } |
| 3181 | 3192 | ||
| 3182 | WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)); | ||
| 3183 | tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance"); | 3193 | tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance"); |
| 3184 | if (IS_ERR(tsk)) | 3194 | if (IS_ERR(tsk)) |
| 3185 | return PTR_ERR(tsk); | 3195 | return PTR_ERR(tsk); |
| @@ -3233,6 +3243,8 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info) | |||
| 3233 | btrfs_balance_sys(leaf, item, &disk_bargs); | 3243 | btrfs_balance_sys(leaf, item, &disk_bargs); |
| 3234 | btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs); | 3244 | btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs); |
| 3235 | 3245 | ||
| 3246 | WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)); | ||
| 3247 | |||
| 3236 | mutex_lock(&fs_info->volume_mutex); | 3248 | mutex_lock(&fs_info->volume_mutex); |
| 3237 | mutex_lock(&fs_info->balance_mutex); | 3249 | mutex_lock(&fs_info->balance_mutex); |
| 3238 | 3250 | ||
| @@ -3496,7 +3508,7 @@ struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { | |||
| 3496 | { 1, 1, 2, 2, 2, 2 /* raid1 */ }, | 3508 | { 1, 1, 2, 2, 2, 2 /* raid1 */ }, |
| 3497 | { 1, 2, 1, 1, 1, 2 /* dup */ }, | 3509 | { 1, 2, 1, 1, 1, 2 /* dup */ }, |
| 3498 | { 1, 1, 0, 2, 1, 1 /* raid0 */ }, | 3510 | { 1, 1, 0, 2, 1, 1 /* raid0 */ }, |
| 3499 | { 1, 1, 0, 1, 1, 1 /* single */ }, | 3511 | { 1, 1, 1, 1, 1, 1 /* single */ }, |
| 3500 | }; | 3512 | }; |
| 3501 | 3513 | ||
| 3502 | static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, | 3514 | static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, |
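The chunk_usage_filter() hunk above clamps the user-supplied usage percentage before it becomes a byte threshold: 0 yields a zero threshold and anything above 100 caps at the full chunk size. A small sketch of that clamping, with plain integer math standing in for div_factor_fine():

/* Sketch: byte threshold for the balance usage filter (boundary handling only). */
static u64 sketch_usage_threshold(u64 chunk_size, u32 usage_pct)
{
	if (usage_pct == 0)
		return 0;
	if (usage_pct > 100)
		return chunk_size;
	return chunk_size * usage_pct / 100;	/* rough stand-in for div_factor_fine() */
}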
diff --git a/fs/buffer.c b/fs/buffer.c index c017a2dfb909..2ea9cd44aeae 100644 --- a/fs/buffer.c +++ b/fs/buffer.c | |||
| @@ -2359,7 +2359,7 @@ int __block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, | |||
| 2359 | if (unlikely(ret < 0)) | 2359 | if (unlikely(ret < 0)) |
| 2360 | goto out_unlock; | 2360 | goto out_unlock; |
| 2361 | set_page_dirty(page); | 2361 | set_page_dirty(page); |
| 2362 | wait_on_page_writeback(page); | 2362 | wait_for_stable_page(page); |
| 2363 | return 0; | 2363 | return 0; |
| 2364 | out_unlock: | 2364 | out_unlock: |
| 2365 | unlock_page(page); | 2365 | unlock_page(page); |
| @@ -2935,6 +2935,7 @@ static void guard_bh_eod(int rw, struct bio *bio, struct buffer_head *bh) | |||
| 2935 | void *kaddr = kmap_atomic(bh->b_page); | 2935 | void *kaddr = kmap_atomic(bh->b_page); |
| 2936 | memset(kaddr + bh_offset(bh) + bytes, 0, bh->b_size - bytes); | 2936 | memset(kaddr + bh_offset(bh) + bytes, 0, bh->b_size - bytes); |
| 2937 | kunmap_atomic(kaddr); | 2937 | kunmap_atomic(kaddr); |
| 2938 | flush_dcache_page(bh->b_page); | ||
| 2938 | } | 2939 | } |
| 2939 | } | 2940 | } |
| 2940 | 2941 | ||
diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig index 9eb134ea6eb2..49bc78243db9 100644 --- a/fs/ceph/Kconfig +++ b/fs/ceph/Kconfig | |||
| @@ -1,6 +1,6 @@ | |||
| 1 | config CEPH_FS | 1 | config CEPH_FS |
| 2 | tristate "Ceph distributed file system (EXPERIMENTAL)" | 2 | tristate "Ceph distributed file system" |
| 3 | depends on INET && EXPERIMENTAL | 3 | depends on INET |
| 4 | select CEPH_LIB | 4 | select CEPH_LIB |
| 5 | select LIBCRC32C | 5 | select LIBCRC32C |
| 6 | select CRYPTO_AES | 6 | select CRYPTO_AES |
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig index 21ff76c22a17..2906ee276408 100644 --- a/fs/cifs/Kconfig +++ b/fs/cifs/Kconfig | |||
| @@ -155,14 +155,14 @@ config CIFS_DFS_UPCALL | |||
| 155 | points. If unsure, say N. | 155 | points. If unsure, say N. |
| 156 | 156 | ||
| 157 | config CIFS_NFSD_EXPORT | 157 | config CIFS_NFSD_EXPORT |
| 158 | bool "Allow nfsd to export CIFS file system (EXPERIMENTAL)" | 158 | bool "Allow nfsd to export CIFS file system" |
| 159 | depends on CIFS && EXPERIMENTAL && BROKEN | 159 | depends on CIFS && BROKEN |
| 160 | help | 160 | help |
| 161 | Allows NFS server to export a CIFS mounted share (nfsd over cifs) | 161 | Allows NFS server to export a CIFS mounted share (nfsd over cifs) |
| 162 | 162 | ||
| 163 | config CIFS_SMB2 | 163 | config CIFS_SMB2 |
| 164 | bool "SMB2 network file system support (EXPERIMENTAL)" | 164 | bool "SMB2 network file system support" |
| 165 | depends on CIFS && EXPERIMENTAL && INET | 165 | depends on CIFS && INET |
| 166 | select NLS | 166 | select NLS |
| 167 | select KEYS | 167 | select KEYS |
| 168 | select FSCACHE | 168 | select FSCACHE |
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index ce5cbd717bfc..210fce2df308 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c | |||
| @@ -226,6 +226,8 @@ compose_mount_options_out: | |||
| 226 | compose_mount_options_err: | 226 | compose_mount_options_err: |
| 227 | kfree(mountdata); | 227 | kfree(mountdata); |
| 228 | mountdata = ERR_PTR(rc); | 228 | mountdata = ERR_PTR(rc); |
| 229 | kfree(*devname); | ||
| 230 | *devname = NULL; | ||
| 229 | goto compose_mount_options_out; | 231 | goto compose_mount_options_out; |
| 230 | } | 232 | } |
| 231 | 233 | ||
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index f653835d067b..de7f9168a118 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c | |||
| @@ -228,7 +228,6 @@ cifs_alloc_inode(struct super_block *sb) | |||
| 228 | cifs_set_oplock_level(cifs_inode, 0); | 228 | cifs_set_oplock_level(cifs_inode, 0); |
| 229 | cifs_inode->delete_pending = false; | 229 | cifs_inode->delete_pending = false; |
| 230 | cifs_inode->invalid_mapping = false; | 230 | cifs_inode->invalid_mapping = false; |
| 231 | cifs_inode->leave_pages_clean = false; | ||
| 232 | cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */ | 231 | cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */ |
| 233 | cifs_inode->server_eof = 0; | 232 | cifs_inode->server_eof = 0; |
| 234 | cifs_inode->uniqueid = 0; | 233 | cifs_inode->uniqueid = 0; |
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index aea1eec64911..e6899cea1c35 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h | |||
| @@ -386,6 +386,7 @@ struct smb_version_values { | |||
| 386 | unsigned int cap_unix; | 386 | unsigned int cap_unix; |
| 387 | unsigned int cap_nt_find; | 387 | unsigned int cap_nt_find; |
| 388 | unsigned int cap_large_files; | 388 | unsigned int cap_large_files; |
| 389 | unsigned int oplock_read; | ||
| 389 | }; | 390 | }; |
| 390 | 391 | ||
| 391 | #define HEADER_SIZE(server) (server->vals->header_size) | 392 | #define HEADER_SIZE(server) (server->vals->header_size) |
| @@ -1030,7 +1031,6 @@ struct cifsInodeInfo { | |||
| 1030 | bool clientCanCacheAll; /* read and writebehind oplock */ | 1031 | bool clientCanCacheAll; /* read and writebehind oplock */ |
| 1031 | bool delete_pending; /* DELETE_ON_CLOSE is set */ | 1032 | bool delete_pending; /* DELETE_ON_CLOSE is set */ |
| 1032 | bool invalid_mapping; /* pagecache is invalid */ | 1033 | bool invalid_mapping; /* pagecache is invalid */ |
| 1033 | bool leave_pages_clean; /* protected by i_mutex, not set pages dirty */ | ||
| 1034 | unsigned long time; /* jiffies of last update of inode */ | 1034 | unsigned long time; /* jiffies of last update of inode */ |
| 1035 | u64 server_eof; /* current file size on server -- protected by i_lock */ | 1035 | u64 server_eof; /* current file size on server -- protected by i_lock */ |
| 1036 | u64 uniqueid; /* server inode number */ | 1036 | u64 uniqueid; /* server inode number */ |
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 17c3643e5950..12b3da39733b 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c | |||
| @@ -1917,7 +1917,7 @@ srcip_matches(struct sockaddr *srcaddr, struct sockaddr *rhs) | |||
| 1917 | } | 1917 | } |
| 1918 | case AF_INET6: { | 1918 | case AF_INET6: { |
| 1919 | struct sockaddr_in6 *saddr6 = (struct sockaddr_in6 *)srcaddr; | 1919 | struct sockaddr_in6 *saddr6 = (struct sockaddr_in6 *)srcaddr; |
| 1920 | struct sockaddr_in6 *vaddr6 = (struct sockaddr_in6 *)&rhs; | 1920 | struct sockaddr_in6 *vaddr6 = (struct sockaddr_in6 *)rhs; |
| 1921 | return ipv6_addr_equal(&saddr6->sin6_addr, &vaddr6->sin6_addr); | 1921 | return ipv6_addr_equal(&saddr6->sin6_addr, &vaddr6->sin6_addr); |
| 1922 | } | 1922 | } |
| 1923 | default: | 1923 | default: |
diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 0a6677ba212b..8ea6ca50a665 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c | |||
| @@ -238,6 +238,23 @@ out: | |||
| 238 | return rc; | 238 | return rc; |
| 239 | } | 239 | } |
| 240 | 240 | ||
| 241 | static bool | ||
| 242 | cifs_has_mand_locks(struct cifsInodeInfo *cinode) | ||
| 243 | { | ||
| 244 | struct cifs_fid_locks *cur; | ||
| 245 | bool has_locks = false; | ||
| 246 | |||
| 247 | down_read(&cinode->lock_sem); | ||
| 248 | list_for_each_entry(cur, &cinode->llist, llist) { | ||
| 249 | if (!list_empty(&cur->locks)) { | ||
| 250 | has_locks = true; | ||
| 251 | break; | ||
| 252 | } | ||
| 253 | } | ||
| 254 | up_read(&cinode->lock_sem); | ||
| 255 | return has_locks; | ||
| 256 | } | ||
| 257 | |||
| 241 | struct cifsFileInfo * | 258 | struct cifsFileInfo * |
| 242 | cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, | 259 | cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, |
| 243 | struct tcon_link *tlink, __u32 oplock) | 260 | struct tcon_link *tlink, __u32 oplock) |
| @@ -248,6 +265,7 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, | |||
| 248 | struct cifsFileInfo *cfile; | 265 | struct cifsFileInfo *cfile; |
| 249 | struct cifs_fid_locks *fdlocks; | 266 | struct cifs_fid_locks *fdlocks; |
| 250 | struct cifs_tcon *tcon = tlink_tcon(tlink); | 267 | struct cifs_tcon *tcon = tlink_tcon(tlink); |
| 268 | struct TCP_Server_Info *server = tcon->ses->server; | ||
| 251 | 269 | ||
| 252 | cfile = kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL); | 270 | cfile = kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL); |
| 253 | if (cfile == NULL) | 271 | if (cfile == NULL) |
| @@ -276,12 +294,22 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, | |||
| 276 | INIT_WORK(&cfile->oplock_break, cifs_oplock_break); | 294 | INIT_WORK(&cfile->oplock_break, cifs_oplock_break); |
| 277 | mutex_init(&cfile->fh_mutex); | 295 | mutex_init(&cfile->fh_mutex); |
| 278 | 296 | ||
| 297 | /* | ||
| 298 | * If the server returned a read oplock and we have mandatory brlocks, | ||
| 299 | * set oplock level to None. | ||
| 300 | */ | ||
| 301 | if (oplock == server->vals->oplock_read && | ||
| 302 | cifs_has_mand_locks(cinode)) { | ||
| 303 | cFYI(1, "Reset oplock val from read to None due to mand locks"); | ||
| 304 | oplock = 0; | ||
| 305 | } | ||
| 306 | |||
| 279 | spin_lock(&cifs_file_list_lock); | 307 | spin_lock(&cifs_file_list_lock); |
| 280 | if (fid->pending_open->oplock != CIFS_OPLOCK_NO_CHANGE) | 308 | if (fid->pending_open->oplock != CIFS_OPLOCK_NO_CHANGE && oplock) |
| 281 | oplock = fid->pending_open->oplock; | 309 | oplock = fid->pending_open->oplock; |
| 282 | list_del(&fid->pending_open->olist); | 310 | list_del(&fid->pending_open->olist); |
| 283 | 311 | ||
| 284 | tlink_tcon(tlink)->ses->server->ops->set_fid(cfile, fid, oplock); | 312 | server->ops->set_fid(cfile, fid, oplock); |
| 285 | 313 | ||
| 286 | list_add(&cfile->tlist, &tcon->openFileList); | 314 | list_add(&cfile->tlist, &tcon->openFileList); |
| 287 | /* if readable file instance put first in list*/ | 315 | /* if readable file instance put first in list*/ |
| @@ -1422,6 +1450,7 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type, | |||
| 1422 | struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data; | 1450 | struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data; |
| 1423 | struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); | 1451 | struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); |
| 1424 | struct TCP_Server_Info *server = tcon->ses->server; | 1452 | struct TCP_Server_Info *server = tcon->ses->server; |
| 1453 | struct inode *inode = cfile->dentry->d_inode; | ||
| 1425 | 1454 | ||
| 1426 | if (posix_lck) { | 1455 | if (posix_lck) { |
| 1427 | int posix_lock_type; | 1456 | int posix_lock_type; |
| @@ -1459,6 +1488,21 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type, | |||
| 1459 | if (!rc) | 1488 | if (!rc) |
| 1460 | goto out; | 1489 | goto out; |
| 1461 | 1490 | ||
| 1491 | /* | ||
| 1492 | * Windows 7 server can delay breaking lease from read to None | ||
| 1493 | * if we set a byte-range lock on a file - break it explicitly | ||
| 1494 | * before sending the lock to the server to be sure the next | ||
| 1495 | * read won't conflict with non-overlapped locks due to | ||
| 1496 | * page reading. | ||
| 1497 | */ | ||
| 1498 | if (!CIFS_I(inode)->clientCanCacheAll && | ||
| 1499 | CIFS_I(inode)->clientCanCacheRead) { | ||
| 1500 | cifs_invalidate_mapping(inode); | ||
| 1501 | cFYI(1, "Set no oplock for inode=%p due to mand locks", | ||
| 1502 | inode); | ||
| 1503 | CIFS_I(inode)->clientCanCacheRead = false; | ||
| 1504 | } | ||
| 1505 | |||
| 1462 | rc = server->ops->mand_lock(xid, cfile, flock->fl_start, length, | 1506 | rc = server->ops->mand_lock(xid, cfile, flock->fl_start, length, |
| 1463 | type, 1, 0, wait_flag); | 1507 | type, 1, 0, wait_flag); |
| 1464 | if (rc) { | 1508 | if (rc) { |
| @@ -2103,15 +2147,7 @@ static int cifs_write_end(struct file *file, struct address_space *mapping, | |||
| 2103 | } else { | 2147 | } else { |
| 2104 | rc = copied; | 2148 | rc = copied; |
| 2105 | pos += copied; | 2149 | pos += copied; |
| 2106 | /* | 2150 | set_page_dirty(page); |
| 2107 | * When we use strict cache mode and cifs_strict_writev was run | ||
| 2108 | * with level II oplock (indicated by leave_pages_clean field of | ||
| 2109 | * CIFS_I(inode)), we can leave pages clean - cifs_strict_writev | ||
| 2110 | * sent the data to the server itself. | ||
| 2111 | */ | ||
| 2112 | if (!CIFS_I(inode)->leave_pages_clean || | ||
| 2113 | !(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO)) | ||
| 2114 | set_page_dirty(page); | ||
| 2115 | } | 2151 | } |
| 2116 | 2152 | ||
| 2117 | if (rc > 0) { | 2153 | if (rc > 0) { |
| @@ -2462,8 +2498,8 @@ ssize_t cifs_user_writev(struct kiocb *iocb, const struct iovec *iov, | |||
| 2462 | } | 2498 | } |
| 2463 | 2499 | ||
| 2464 | static ssize_t | 2500 | static ssize_t |
| 2465 | cifs_pagecache_writev(struct kiocb *iocb, const struct iovec *iov, | 2501 | cifs_writev(struct kiocb *iocb, const struct iovec *iov, |
| 2466 | unsigned long nr_segs, loff_t pos, bool cache_ex) | 2502 | unsigned long nr_segs, loff_t pos) |
| 2467 | { | 2503 | { |
| 2468 | struct file *file = iocb->ki_filp; | 2504 | struct file *file = iocb->ki_filp; |
| 2469 | struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data; | 2505 | struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data; |
| @@ -2485,12 +2521,8 @@ cifs_pagecache_writev(struct kiocb *iocb, const struct iovec *iov, | |||
| 2485 | server->vals->exclusive_lock_type, NULL, | 2521 | server->vals->exclusive_lock_type, NULL, |
| 2486 | CIFS_WRITE_OP)) { | 2522 | CIFS_WRITE_OP)) { |
| 2487 | mutex_lock(&inode->i_mutex); | 2523 | mutex_lock(&inode->i_mutex); |
| 2488 | if (!cache_ex) | ||
| 2489 | cinode->leave_pages_clean = true; | ||
| 2490 | rc = __generic_file_aio_write(iocb, iov, nr_segs, | 2524 | rc = __generic_file_aio_write(iocb, iov, nr_segs, |
| 2491 | &iocb->ki_pos); | 2525 | &iocb->ki_pos); |
| 2492 | if (!cache_ex) | ||
| 2493 | cinode->leave_pages_clean = false; | ||
| 2494 | mutex_unlock(&inode->i_mutex); | 2526 | mutex_unlock(&inode->i_mutex); |
| 2495 | } | 2527 | } |
| 2496 | 2528 | ||
| @@ -2517,60 +2549,32 @@ cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov, | |||
| 2517 | struct cifsFileInfo *cfile = (struct cifsFileInfo *) | 2549 | struct cifsFileInfo *cfile = (struct cifsFileInfo *) |
| 2518 | iocb->ki_filp->private_data; | 2550 | iocb->ki_filp->private_data; |
| 2519 | struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); | 2551 | struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); |
| 2520 | ssize_t written, written2; | 2552 | ssize_t written; |
| 2521 | /* | ||
| 2522 | * We need to store clientCanCacheAll here to prevent race | ||
| 2523 | * conditions - this value can be changed during an execution | ||
| 2524 | * of generic_file_aio_write. For CIFS it can be changed from | ||
| 2525 | * true to false only, but for SMB2 it can be changed both from | ||
| 2526 | * true to false and vice versa. So, we can end up with a data | ||
| 2527 | * stored in the cache, not marked dirty and not sent to the | ||
| 2528 | * server if this value changes its state from false to true | ||
| 2529 | * after cifs_write_end. | ||
| 2530 | */ | ||
| 2531 | bool cache_ex = cinode->clientCanCacheAll; | ||
| 2532 | bool cache_read = cinode->clientCanCacheRead; | ||
| 2533 | int rc; | ||
| 2534 | loff_t saved_pos; | ||
| 2535 | 2553 | ||
| 2536 | if (cache_ex) { | 2554 | if (cinode->clientCanCacheAll) { |
| 2537 | if (cap_unix(tcon->ses) && | 2555 | if (cap_unix(tcon->ses) && |
| 2538 | ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0) && | 2556 | (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) |
| 2539 | (CIFS_UNIX_FCNTL_CAP & le64_to_cpu( | 2557 | && ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) |
| 2540 | tcon->fsUnixInfo.Capability))) | ||
| 2541 | return generic_file_aio_write(iocb, iov, nr_segs, pos); | 2558 | return generic_file_aio_write(iocb, iov, nr_segs, pos); |
| 2542 | return cifs_pagecache_writev(iocb, iov, nr_segs, pos, cache_ex); | 2559 | return cifs_writev(iocb, iov, nr_segs, pos); |
| 2543 | } | 2560 | } |
| 2544 | |||
| 2545 | /* | 2561 | /* |
| 2546 | * For files without exclusive oplock in strict cache mode we need to | 2562 | * For non-oplocked files in strict cache mode we need to write the data |
| 2547 | * write the data to the server exactly from the pos to pos+len-1 rather | 2563 | * to the server exactly from the pos to pos+len-1 rather than flush all |
| 2548 | * than flush all affected pages because it may cause a error with | 2564 | * affected pages because it may cause an error with mandatory locks on |
| 2549 | * mandatory locks on these pages but not on the region from pos to | 2565 | * these pages but not on the region from pos to pos+len-1. |
| 2550 | * ppos+len-1. | ||
| 2551 | */ | 2566 | */ |
| 2552 | written = cifs_user_writev(iocb, iov, nr_segs, pos); | 2567 | written = cifs_user_writev(iocb, iov, nr_segs, pos); |
| 2553 | if (!cache_read || written <= 0) | 2568 | if (written > 0 && cinode->clientCanCacheRead) { |
| 2554 | return written; | 2569 | /* |
| 2555 | 2570 | * Windows 7 server can delay breaking level2 oplock if a write | |
| 2556 | saved_pos = iocb->ki_pos; | 2571 | * request comes - break it on the client to prevent reading |
| 2557 | iocb->ki_pos = pos; | 2572 | * old data. |
| 2558 | /* we have a read oplock - need to store a data in the page cache */ | 2573 | */ |
| 2559 | if (cap_unix(tcon->ses) && | 2574 | cifs_invalidate_mapping(inode); |
| 2560 | ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0) && | 2575 | cFYI(1, "Set no oplock for inode=%p after a write operation", |
| 2561 | (CIFS_UNIX_FCNTL_CAP & le64_to_cpu( | 2576 | inode); |
| 2562 | tcon->fsUnixInfo.Capability))) | 2577 | cinode->clientCanCacheRead = false; |
| 2563 | written2 = generic_file_aio_write(iocb, iov, nr_segs, pos); | ||
| 2564 | else | ||
| 2565 | written2 = cifs_pagecache_writev(iocb, iov, nr_segs, pos, | ||
| 2566 | cache_ex); | ||
| 2567 | /* errors occured during writing - invalidate the page cache */ | ||
| 2568 | if (written2 < 0) { | ||
| 2569 | rc = cifs_invalidate_mapping(inode); | ||
| 2570 | if (rc) | ||
| 2571 | written = (ssize_t)rc; | ||
| 2572 | else | ||
| 2573 | iocb->ki_pos = saved_pos; | ||
| 2574 | } | 2578 | } |
| 2575 | return written; | 2579 | return written; |
| 2576 | } | 2580 | } |
| @@ -3577,6 +3581,13 @@ void cifs_oplock_break(struct work_struct *work) | |||
| 3577 | struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); | 3581 | struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); |
| 3578 | int rc = 0; | 3582 | int rc = 0; |
| 3579 | 3583 | ||
| 3584 | if (!cinode->clientCanCacheAll && cinode->clientCanCacheRead && | ||
| 3585 | cifs_has_mand_locks(cinode)) { | ||
| 3586 | cFYI(1, "Reset oplock to None for inode=%p due to mand locks", | ||
| 3587 | inode); | ||
| 3588 | cinode->clientCanCacheRead = false; | ||
| 3589 | } | ||
| 3590 | |||
| 3580 | if (inode && S_ISREG(inode->i_mode)) { | 3591 | if (inode && S_ISREG(inode->i_mode)) { |
| 3581 | if (cinode->clientCanCacheRead) | 3592 | if (cinode->clientCanCacheRead) |
| 3582 | break_lease(inode, O_RDONLY); | 3593 | break_lease(inode, O_RDONLY); |
diff --git a/fs/cifs/link.c b/fs/cifs/link.c index 51dc2fb6e854..9f6c4c45d21e 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c | |||
| @@ -76,7 +76,7 @@ symlink_hash(unsigned int link_len, const char *link_str, u8 *md5_hash) | |||
| 76 | } | 76 | } |
| 77 | rc = crypto_shash_update(&sdescmd5->shash, link_str, link_len); | 77 | rc = crypto_shash_update(&sdescmd5->shash, link_str, link_len); |
| 78 | if (rc) { | 78 | if (rc) { |
| 79 | cERROR(1, "%s: Could not update iwth link_str", __func__); | 79 | cERROR(1, "%s: Could not update with link_str", __func__); |
| 80 | goto symlink_hash_err; | 80 | goto symlink_hash_err; |
| 81 | } | 81 | } |
| 82 | rc = crypto_shash_final(&sdescmd5->shash, md5_hash); | 82 | rc = crypto_shash_final(&sdescmd5->shash, md5_hash); |
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index a5d234c8d5d9..47bc5a87f94e 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c | |||
| @@ -53,6 +53,13 @@ send_nt_cancel(struct TCP_Server_Info *server, void *buf, | |||
| 53 | mutex_unlock(&server->srv_mutex); | 53 | mutex_unlock(&server->srv_mutex); |
| 54 | return rc; | 54 | return rc; |
| 55 | } | 55 | } |
| 56 | |||
| 57 | /* | ||
| 58 | * The response to this call was already factored into the sequence | ||
| 59 | * number when the call went out, so we must adjust it back downward | ||
| 60 | * after signing here. | ||
| 61 | */ | ||
| 62 | --server->sequence_number; | ||
| 56 | rc = smb_send(server, in_buf, be32_to_cpu(in_buf->smb_buf_length)); | 63 | rc = smb_send(server, in_buf, be32_to_cpu(in_buf->smb_buf_length)); |
| 57 | mutex_unlock(&server->srv_mutex); | 64 | mutex_unlock(&server->srv_mutex); |
| 58 | 65 | ||
| @@ -952,4 +959,5 @@ struct smb_version_values smb1_values = { | |||
| 952 | .cap_unix = CAP_UNIX, | 959 | .cap_unix = CAP_UNIX, |
| 953 | .cap_nt_find = CAP_NT_SMBS | CAP_NT_FIND, | 960 | .cap_nt_find = CAP_NT_SMBS | CAP_NT_FIND, |
| 954 | .cap_large_files = CAP_LARGE_FILES, | 961 | .cap_large_files = CAP_LARGE_FILES, |
| 962 | .oplock_read = OPLOCK_READ, | ||
| 955 | }; | 963 | }; |
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index d79de7bc4435..c9c7aa7ed966 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c | |||
| @@ -708,6 +708,7 @@ struct smb_version_values smb20_values = { | |||
| 708 | .cap_unix = 0, | 708 | .cap_unix = 0, |
| 709 | .cap_nt_find = SMB2_NT_FIND, | 709 | .cap_nt_find = SMB2_NT_FIND, |
| 710 | .cap_large_files = SMB2_LARGE_FILES, | 710 | .cap_large_files = SMB2_LARGE_FILES, |
| 711 | .oplock_read = SMB2_OPLOCK_LEVEL_II, | ||
| 711 | }; | 712 | }; |
| 712 | 713 | ||
| 713 | struct smb_version_values smb21_values = { | 714 | struct smb_version_values smb21_values = { |
| @@ -725,6 +726,7 @@ struct smb_version_values smb21_values = { | |||
| 725 | .cap_unix = 0, | 726 | .cap_unix = 0, |
| 726 | .cap_nt_find = SMB2_NT_FIND, | 727 | .cap_nt_find = SMB2_NT_FIND, |
| 727 | .cap_large_files = SMB2_LARGE_FILES, | 728 | .cap_large_files = SMB2_LARGE_FILES, |
| 729 | .oplock_read = SMB2_OPLOCK_LEVEL_II, | ||
| 728 | }; | 730 | }; |
| 729 | 731 | ||
| 730 | struct smb_version_values smb30_values = { | 732 | struct smb_version_values smb30_values = { |
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 76d974c952fe..1a528680ec5a 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c | |||
| @@ -144,9 +144,6 @@ smb_send_kvec(struct TCP_Server_Info *server, struct kvec *iov, size_t n_vec, | |||
| 144 | 144 | ||
| 145 | *sent = 0; | 145 | *sent = 0; |
| 146 | 146 | ||
| 147 | if (ssocket == NULL) | ||
| 148 | return -ENOTSOCK; /* BB eventually add reconnect code here */ | ||
| 149 | |||
| 150 | smb_msg.msg_name = (struct sockaddr *) &server->dstaddr; | 147 | smb_msg.msg_name = (struct sockaddr *) &server->dstaddr; |
| 151 | smb_msg.msg_namelen = sizeof(struct sockaddr); | 148 | smb_msg.msg_namelen = sizeof(struct sockaddr); |
| 152 | smb_msg.msg_control = NULL; | 149 | smb_msg.msg_control = NULL; |
| @@ -291,6 +288,9 @@ smb_send_rqst(struct TCP_Server_Info *server, struct smb_rqst *rqst) | |||
| 291 | struct socket *ssocket = server->ssocket; | 288 | struct socket *ssocket = server->ssocket; |
| 292 | int val = 1; | 289 | int val = 1; |
| 293 | 290 | ||
| 291 | if (ssocket == NULL) | ||
| 292 | return -ENOTSOCK; | ||
| 293 | |||
| 294 | cFYI(1, "Sending smb: smb_len=%u", smb_buf_length); | 294 | cFYI(1, "Sending smb: smb_len=%u", smb_buf_length); |
| 295 | dump_smb(iov[0].iov_base, iov[0].iov_len); | 295 | dump_smb(iov[0].iov_base, iov[0].iov_len); |
| 296 | 296 | ||
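The two transport.c hunks hoist the NULL-socket check out of the per-kvec helper and into the top of smb_send_rqst(), so a dead connection fails fast before any message is assembled. A rough sketch of that hoisting pattern with illustrative names only (assumes <linux/net.h> for struct socket; this is not the actual cifs code):

static int example_send_vec(struct socket *sock)
{
	/* caller guarantees sock != NULL; the kernel_sendmsg() loop would go here */
	return 0;
}

static int example_send_rqst(struct socket *sock)
{
	if (sock == NULL)
		return -ENOTSOCK;	/* checked once, up front */
	return example_send_vec(sock);
}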
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 712b10f64c70..e9dcfa3c208c 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c | |||
| @@ -1037,10 +1037,11 @@ static int configfs_dump(struct configfs_dirent *sd, int level) | |||
| 1037 | static int configfs_depend_prep(struct dentry *origin, | 1037 | static int configfs_depend_prep(struct dentry *origin, |
| 1038 | struct config_item *target) | 1038 | struct config_item *target) |
| 1039 | { | 1039 | { |
| 1040 | struct configfs_dirent *child_sd, *sd = origin->d_fsdata; | 1040 | struct configfs_dirent *child_sd, *sd; |
| 1041 | int ret = 0; | 1041 | int ret = 0; |
| 1042 | 1042 | ||
| 1043 | BUG_ON(!origin || !sd); | 1043 | BUG_ON(!origin || !origin->d_fsdata); |
| 1044 | sd = origin->d_fsdata; | ||
| 1044 | 1045 | ||
| 1045 | if (sd->s_element == target) /* Boo-yah */ | 1046 | if (sd->s_element == target) /* Boo-yah */ |
| 1046 | goto out; | 1047 | goto out; |
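The configfs hunk fixes an ordering problem: the old initializer read origin->d_fsdata before the BUG_ON() that is supposed to guard against a NULL origin could run. A standalone sketch of the anti-pattern and the fix, using illustrative types rather than the real configfs ones:

struct ex_dirent { void *fsdata; };

static int ex_prep_buggy(struct ex_dirent *origin)
{
	void *sd = origin->fsdata;	/* dereferences 'origin' here ...     */

	if (!origin || !sd)		/* ... so this check arrives too late */
		return -1;
	return 0;
}

static int ex_prep_fixed(struct ex_dirent *origin)
{
	void *sd;

	if (!origin || !origin->fsdata)	/* validate first */
		return -1;
	sd = origin->fsdata;		/* then read the field */
	(void)sd;
	return 0;
}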
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 153bb1e42e63..0c4f80b447fb 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c | |||
| @@ -176,7 +176,7 @@ static int debugfs_parse_options(char *data, struct debugfs_mount_opts *opts) | |||
| 176 | opts->uid = uid; | 176 | opts->uid = uid; |
| 177 | break; | 177 | break; |
| 178 | case Opt_gid: | 178 | case Opt_gid: |
| 179 | if (match_octal(&args[0], &option)) | 179 | if (match_int(&args[0], &option)) |
| 180 | return -EINVAL; | 180 | return -EINVAL; |
| 181 | gid = make_kgid(current_user_ns(), option); | 181 | gid = make_kgid(current_user_ns(), option); |
| 182 | if (!gid_valid(gid)) | 182 | if (!gid_valid(gid)) |
| @@ -322,7 +322,6 @@ static struct dentry *__create_file(const char *name, umode_t mode, | |||
| 322 | if (!parent) | 322 | if (!parent) |
| 323 | parent = debugfs_mount->mnt_root; | 323 | parent = debugfs_mount->mnt_root; |
| 324 | 324 | ||
| 325 | dentry = NULL; | ||
| 326 | mutex_lock(&parent->d_inode->i_mutex); | 325 | mutex_lock(&parent->d_inode->i_mutex); |
| 327 | dentry = lookup_one_len(name, parent, strlen(name)); | 326 | dentry = lookup_one_len(name, parent, strlen(name)); |
| 328 | if (!IS_ERR(dentry)) { | 327 | if (!IS_ERR(dentry)) { |
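The debugfs hunk switches the gid= option from match_octal() to match_int(), since group IDs are written in decimal, and drops a redundant dentry = NULL. A small sketch of the parser-table idiom involved (assumes <linux/parser.h>; the real code additionally converts the value with make_kgid(), which is omitted here):

enum { Opt_gid_ex, Opt_mode_ex, Opt_err_ex };

static const match_table_t example_tokens = {
	{Opt_gid_ex,  "gid=%u"},
	{Opt_mode_ex, "mode=%o"},
	{Opt_err_ex,  NULL}
};

static int example_parse_one(char *p, int *gid, umode_t *mode)
{
	substring_t args[MAX_OPT_ARGS];
	int option;

	switch (match_token(p, example_tokens, args)) {
	case Opt_gid_ex:
		if (match_int(&args[0], &option))	/* "1000" is decimal */
			return -EINVAL;
		*gid = option;
		break;
	case Opt_mode_ex:
		if (match_octal(&args[0], &option))	/* "0755" really is octal */
			return -EINVAL;
		*mode = option & S_IALLUGO;
		break;
	default:
		return -EINVAL;
	}
	return 0;
}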
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index 77c0f70f8fe8..e7665c31f7b1 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h | |||
| @@ -96,10 +96,13 @@ do { \ | |||
| 96 | } | 96 | } |
| 97 | 97 | ||
| 98 | 98 | ||
| 99 | #define DLM_RTF_SHRINK 0x00000001 | ||
| 100 | |||
| 99 | struct dlm_rsbtable { | 101 | struct dlm_rsbtable { |
| 100 | struct rb_root keep; | 102 | struct rb_root keep; |
| 101 | struct rb_root toss; | 103 | struct rb_root toss; |
| 102 | spinlock_t lock; | 104 | spinlock_t lock; |
| 105 | uint32_t flags; | ||
| 103 | }; | 106 | }; |
| 104 | 107 | ||
| 105 | 108 | ||
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index a579f30f237d..f7501651762d 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c | |||
| @@ -1132,6 +1132,7 @@ static void toss_rsb(struct kref *kref) | |||
| 1132 | rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[r->res_bucket].keep); | 1132 | rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[r->res_bucket].keep); |
| 1133 | rsb_insert(r, &ls->ls_rsbtbl[r->res_bucket].toss); | 1133 | rsb_insert(r, &ls->ls_rsbtbl[r->res_bucket].toss); |
| 1134 | r->res_toss_time = jiffies; | 1134 | r->res_toss_time = jiffies; |
| 1135 | ls->ls_rsbtbl[r->res_bucket].flags |= DLM_RTF_SHRINK; | ||
| 1135 | if (r->res_lvbptr) { | 1136 | if (r->res_lvbptr) { |
| 1136 | dlm_free_lvb(r->res_lvbptr); | 1137 | dlm_free_lvb(r->res_lvbptr); |
| 1137 | r->res_lvbptr = NULL; | 1138 | r->res_lvbptr = NULL; |
| @@ -1659,11 +1660,18 @@ static void shrink_bucket(struct dlm_ls *ls, int b) | |||
| 1659 | char *name; | 1660 | char *name; |
| 1660 | int our_nodeid = dlm_our_nodeid(); | 1661 | int our_nodeid = dlm_our_nodeid(); |
| 1661 | int remote_count = 0; | 1662 | int remote_count = 0; |
| 1663 | int need_shrink = 0; | ||
| 1662 | int i, len, rv; | 1664 | int i, len, rv; |
| 1663 | 1665 | ||
| 1664 | memset(&ls->ls_remove_lens, 0, sizeof(int) * DLM_REMOVE_NAMES_MAX); | 1666 | memset(&ls->ls_remove_lens, 0, sizeof(int) * DLM_REMOVE_NAMES_MAX); |
| 1665 | 1667 | ||
| 1666 | spin_lock(&ls->ls_rsbtbl[b].lock); | 1668 | spin_lock(&ls->ls_rsbtbl[b].lock); |
| 1669 | |||
| 1670 | if (!(ls->ls_rsbtbl[b].flags & DLM_RTF_SHRINK)) { | ||
| 1671 | spin_unlock(&ls->ls_rsbtbl[b].lock); | ||
| 1672 | return; | ||
| 1673 | } | ||
| 1674 | |||
| 1667 | for (n = rb_first(&ls->ls_rsbtbl[b].toss); n; n = next) { | 1675 | for (n = rb_first(&ls->ls_rsbtbl[b].toss); n; n = next) { |
| 1668 | next = rb_next(n); | 1676 | next = rb_next(n); |
| 1669 | r = rb_entry(n, struct dlm_rsb, res_hashnode); | 1677 | r = rb_entry(n, struct dlm_rsb, res_hashnode); |
| @@ -1679,6 +1687,8 @@ static void shrink_bucket(struct dlm_ls *ls, int b) | |||
| 1679 | continue; | 1687 | continue; |
| 1680 | } | 1688 | } |
| 1681 | 1689 | ||
| 1690 | need_shrink = 1; | ||
| 1691 | |||
| 1682 | if (!time_after_eq(jiffies, r->res_toss_time + | 1692 | if (!time_after_eq(jiffies, r->res_toss_time + |
| 1683 | dlm_config.ci_toss_secs * HZ)) { | 1693 | dlm_config.ci_toss_secs * HZ)) { |
| 1684 | continue; | 1694 | continue; |
| @@ -1710,6 +1720,11 @@ static void shrink_bucket(struct dlm_ls *ls, int b) | |||
| 1710 | rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss); | 1720 | rb_erase(&r->res_hashnode, &ls->ls_rsbtbl[b].toss); |
| 1711 | dlm_free_rsb(r); | 1721 | dlm_free_rsb(r); |
| 1712 | } | 1722 | } |
| 1723 | |||
| 1724 | if (need_shrink) | ||
| 1725 | ls->ls_rsbtbl[b].flags |= DLM_RTF_SHRINK; | ||
| 1726 | else | ||
| 1727 | ls->ls_rsbtbl[b].flags &= ~DLM_RTF_SHRINK; | ||
| 1713 | spin_unlock(&ls->ls_rsbtbl[b].lock); | 1728 | spin_unlock(&ls->ls_rsbtbl[b].lock); |
| 1714 | 1729 | ||
| 1715 | /* | 1730 | /* |
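Taken together, the dlm_internal.h and lock.c hunks add a per-bucket DLM_RTF_SHRINK flag so shrink_bucket() can skip buckets that have had nothing tossed since the last pass. A simplified sketch of that pattern with illustrative names (the real code keeps the flag set whenever it still finds entries it is responsible for; this sketch reduces that to an aging counter):

#define EX_RTF_SHRINK 0x00000001	/* stand-in for DLM_RTF_SHRINK */

struct ex_bucket {
	unsigned int flags;
	unsigned int tossed;		/* stand-in for the toss rb-tree */
};

static void ex_toss(struct ex_bucket *b)
{
	b->tossed++;
	b->flags |= EX_RTF_SHRINK;	/* producer marks the bucket dirty */
}

static void ex_shrink(struct ex_bucket *b, unsigned int old_enough)
{
	if (!(b->flags & EX_RTF_SHRINK))
		return;				/* nothing tossed since the last scan */

	while (b->tossed && old_enough) {	/* free only entries old enough */
		b->tossed--;
		old_enough--;
	}

	if (b->tossed)
		b->flags |= EX_RTF_SHRINK;	/* leftovers: worth scanning again */
	else
		b->flags &= ~EX_RTF_SHRINK;	/* bucket is clean */
}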
diff --git a/fs/dlm/user.c b/fs/dlm/user.c index 7ff49852b0cb..911649a47dd5 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c | |||
| @@ -503,11 +503,11 @@ static ssize_t device_write(struct file *file, const char __user *buf, | |||
| 503 | #endif | 503 | #endif |
| 504 | return -EINVAL; | 504 | return -EINVAL; |
| 505 | 505 | ||
| 506 | #ifdef CONFIG_COMPAT | 506 | /* |
| 507 | if (count > sizeof(struct dlm_write_request32) + DLM_RESNAME_MAXLEN) | 507 | * can't compare against COMPAT/dlm_write_request32 because |
| 508 | #else | 508 | * we don't yet know if is64bit is zero |
| 509 | */ | ||
| 509 | if (count > sizeof(struct dlm_write_request) + DLM_RESNAME_MAXLEN) | 510 | if (count > sizeof(struct dlm_write_request) + DLM_RESNAME_MAXLEN) |
| 510 | #endif | ||
| 511 | return -EINVAL; | 511 | return -EINVAL; |
| 512 | 512 | ||
| 513 | kbuf = kzalloc(count + 1, GFP_NOFS); | 513 | kbuf = kzalloc(count + 1, GFP_NOFS); |
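The dlm/user.c hunk replaces the #ifdef'd size check with a single bound based on the native struct, because at this point the header has not been copied in yet and is64bit is therefore unknown. An illustrative compile-time way to state the assumption that makes this safe (not part of the patch; assumes the same file scope where struct dlm_write_request32 is defined under CONFIG_COMPAT):

static size_t example_max_request_size(void)
{
#ifdef CONFIG_COMPAT
	/* the compat layout can only be smaller, never larger */
	BUILD_BUG_ON(sizeof(struct dlm_write_request32) >
		     sizeof(struct dlm_write_request));
#endif
	return sizeof(struct dlm_write_request) + DLM_RESNAME_MAXLEN;
}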
diff --git a/fs/ecryptfs/Kconfig b/fs/ecryptfs/Kconfig index cc16562654de..e15ef38c24fa 100644 --- a/fs/ecryptfs/Kconfig +++ b/fs/ecryptfs/Kconfig | |||
| @@ -1,6 +1,6 @@ | |||
| 1 | config ECRYPT_FS | 1 | config ECRYPT_FS |
| 2 | tristate "eCrypt filesystem layer support (EXPERIMENTAL)" | 2 | tristate "eCrypt filesystem layer support" |
| 3 | depends on EXPERIMENTAL && KEYS && CRYPTO && (ENCRYPTED_KEYS || ENCRYPTED_KEYS=n) | 3 | depends on KEYS && CRYPTO && (ENCRYPTED_KEYS || ENCRYPTED_KEYS=n) |
| 4 | select CRYPTO_ECB | 4 | select CRYPTO_ECB |
| 5 | select CRYPTO_CBC | 5 | select CRYPTO_CBC |
| 6 | select CRYPTO_MD5 | 6 | select CRYPTO_MD5 |
diff --git a/fs/efs/Kconfig b/fs/efs/Kconfig index 6ebfc1c207a8..d020e3c30fea 100644 --- a/fs/efs/Kconfig +++ b/fs/efs/Kconfig | |||
| @@ -1,6 +1,6 @@ | |||
| 1 | config EFS_FS | 1 | config EFS_FS |
| 2 | tristate "EFS file system support (read only) (EXPERIMENTAL)" | 2 | tristate "EFS file system support (read only)" |
| 3 | depends on BLOCK && EXPERIMENTAL | 3 | depends on BLOCK |
| 4 | help | 4 | help |
| 5 | EFS is an older file system used for non-ISO9660 CD-ROMs and hard | 5 | EFS is an older file system used for non-ISO9660 CD-ROMs and hard |
| 6 | disk partitions by SGI's IRIX operating system (IRIX 6.0 and newer | 6 | disk partitions by SGI's IRIX operating system (IRIX 6.0 and newer |
diff --git a/fs/exec.c b/fs/exec.c --- a/fs/exec.c +++ b/fs/exec.c | |||
| @@ -434,8 +434,9 @@ static int count(struct user_arg_ptr argv, int max) | |||
| 434 | if (IS_ERR(p)) | 434 | if (IS_ERR(p)) |
| 435 | return -EFAULT; | 435 | return -EFAULT; |
| 436 | 436 | ||
| 437 | if (i++ >= max) | 437 | if (i >= max) |
| 438 | return -E2BIG; | 438 | return -E2BIG; |
| 439 | ++i; | ||
| 439 | 440 | ||
| 440 | if (fatal_signal_pending(current)) | 441 | if (fatal_signal_pending(current)) |
| 441 | return -ERESTARTNOHAND; | 442 | return -ERESTARTNOHAND; |
diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 6e50223b3299..4ba2683c1d44 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c | |||
| @@ -2065,6 +2065,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) | |||
| 2065 | test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ? "journal": | 2065 | test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ? "journal": |
| 2066 | test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered": | 2066 | test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? "ordered": |
| 2067 | "writeback"); | 2067 | "writeback"); |
| 2068 | sb->s_flags |= MS_SNAP_STABLE; | ||
| 2068 | 2069 | ||
| 2069 | return 0; | 2070 | return 0; |
| 2070 | 2071 | ||
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index 0a475c881852..987358740cb9 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig | |||
| @@ -41,6 +41,7 @@ config EXT4_USE_FOR_EXT23 | |||
| 41 | 41 | ||
| 42 | config EXT4_FS_POSIX_ACL | 42 | config EXT4_FS_POSIX_ACL |
| 43 | bool "Ext4 POSIX Access Control Lists" | 43 | bool "Ext4 POSIX Access Control Lists" |
| 44 | depends on EXT4_FS | ||
| 44 | select FS_POSIX_ACL | 45 | select FS_POSIX_ACL |
| 45 | help | 46 | help |
| 46 | POSIX Access Control Lists (ACLs) support permissions for users and | 47 | POSIX Access Control Lists (ACLs) support permissions for users and |
| @@ -53,6 +54,7 @@ config EXT4_FS_POSIX_ACL | |||
| 53 | 54 | ||
| 54 | config EXT4_FS_SECURITY | 55 | config EXT4_FS_SECURITY |
| 55 | bool "Ext4 Security Labels" | 56 | bool "Ext4 Security Labels" |
| 57 | depends on EXT4_FS | ||
| 56 | help | 58 | help |
| 57 | Security labels support alternative access control models | 59 | Security labels support alternative access control models |
| 58 | implemented by security modules like SELinux. This option | 60 | implemented by security modules like SELinux. This option |
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index cbfe13bf5b2a..cd818d8bb221 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c | |||
| @@ -4968,7 +4968,7 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
| 4968 | 0, len, NULL, | 4968 | 0, len, NULL, |
| 4969 | ext4_bh_unmapped)) { | 4969 | ext4_bh_unmapped)) { |
| 4970 | /* Wait so that we don't change page under IO */ | 4970 | /* Wait so that we don't change page under IO */ |
| 4971 | wait_on_page_writeback(page); | 4971 | wait_for_stable_page(page); |
| 4972 | ret = VM_FAULT_LOCKED; | 4972 | ret = VM_FAULT_LOCKED; |
| 4973 | goto out; | 4973 | goto out; |
| 4974 | } | 4974 | } |
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 8990165346ee..f9ed946a448e 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c | |||
| @@ -722,7 +722,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir, | |||
| 722 | ext4_warning(dir->i_sb, "Node failed checksum"); | 722 | ext4_warning(dir->i_sb, "Node failed checksum"); |
| 723 | brelse(bh); | 723 | brelse(bh); |
| 724 | *err = ERR_BAD_DX_DIR; | 724 | *err = ERR_BAD_DX_DIR; |
| 725 | goto fail; | 725 | goto fail2; |
| 726 | } | 726 | } |
| 727 | set_buffer_verified(bh); | 727 | set_buffer_verified(bh); |
| 728 | 728 | ||
| @@ -2368,7 +2368,6 @@ static int ext4_init_new_dir(handle_t *handle, struct inode *dir, | |||
| 2368 | } | 2368 | } |
| 2369 | 2369 | ||
| 2370 | inode->i_size = EXT4_I(inode)->i_disksize = blocksize; | 2370 | inode->i_size = EXT4_I(inode)->i_disksize = blocksize; |
| 2371 | dir_block = ext4_bread(handle, inode, 0, 1, &err); | ||
| 2372 | if (!(dir_block = ext4_bread(handle, inode, 0, 1, &err))) { | 2371 | if (!(dir_block = ext4_bread(handle, inode, 0, 1, &err))) { |
| 2373 | if (!err) { | 2372 | if (!err) { |
| 2374 | err = -EIO; | 2373 | err = -EIO; |
diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index e95b94945d5f..137af4255da6 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c | |||
| @@ -191,15 +191,14 @@ struct posix_acl *f2fs_get_acl(struct inode *inode, int type) | |||
| 191 | retval = f2fs_getxattr(inode, name_index, "", value, retval); | 191 | retval = f2fs_getxattr(inode, name_index, "", value, retval); |
| 192 | } | 192 | } |
| 193 | 193 | ||
| 194 | if (retval < 0) { | 194 | if (retval > 0) |
| 195 | if (retval == -ENODATA) | ||
| 196 | acl = NULL; | ||
| 197 | else | ||
| 198 | acl = ERR_PTR(retval); | ||
| 199 | } else { | ||
| 200 | acl = f2fs_acl_from_disk(value, retval); | 195 | acl = f2fs_acl_from_disk(value, retval); |
| 201 | } | 196 | else if (retval == -ENODATA) |
| 197 | acl = NULL; | ||
| 198 | else | ||
| 199 | acl = ERR_PTR(retval); | ||
| 202 | kfree(value); | 200 | kfree(value); |
| 201 | |||
| 203 | if (!IS_ERR(acl)) | 202 | if (!IS_ERR(acl)) |
| 204 | set_cached_acl(inode, type, acl); | 203 | set_cached_acl(inode, type, acl); |
| 205 | 204 | ||
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 6ef36c37e2be..ff3c8439af87 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c | |||
| @@ -214,7 +214,6 @@ retry: | |||
| 214 | goto retry; | 214 | goto retry; |
| 215 | } | 215 | } |
| 216 | new->ino = ino; | 216 | new->ino = ino; |
| 217 | INIT_LIST_HEAD(&new->list); | ||
| 218 | 217 | ||
| 219 | /* add new_oentry into list which is sorted by inode number */ | 218 | /* add new_oentry into list which is sorted by inode number */ |
| 220 | if (orphan) { | 219 | if (orphan) { |
| @@ -772,7 +771,7 @@ void init_orphan_info(struct f2fs_sb_info *sbi) | |||
| 772 | sbi->n_orphans = 0; | 771 | sbi->n_orphans = 0; |
| 773 | } | 772 | } |
| 774 | 773 | ||
| 775 | int create_checkpoint_caches(void) | 774 | int __init create_checkpoint_caches(void) |
| 776 | { | 775 | { |
| 777 | orphan_entry_slab = f2fs_kmem_cache_create("f2fs_orphan_entry", | 776 | orphan_entry_slab = f2fs_kmem_cache_create("f2fs_orphan_entry", |
| 778 | sizeof(struct orphan_inode_entry), NULL); | 777 | sizeof(struct orphan_inode_entry), NULL); |
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 655aeabc1dd4..7bd22a201125 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c | |||
| @@ -16,6 +16,7 @@ | |||
| 16 | #include <linux/backing-dev.h> | 16 | #include <linux/backing-dev.h> |
| 17 | #include <linux/blkdev.h> | 17 | #include <linux/blkdev.h> |
| 18 | #include <linux/bio.h> | 18 | #include <linux/bio.h> |
| 19 | #include <linux/prefetch.h> | ||
| 19 | 20 | ||
| 20 | #include "f2fs.h" | 21 | #include "f2fs.h" |
| 21 | #include "node.h" | 22 | #include "node.h" |
| @@ -546,6 +547,15 @@ redirty_out: | |||
| 546 | 547 | ||
| 547 | #define MAX_DESIRED_PAGES_WP 4096 | 548 | #define MAX_DESIRED_PAGES_WP 4096 |
| 548 | 549 | ||
| 550 | static int __f2fs_writepage(struct page *page, struct writeback_control *wbc, | ||
| 551 | void *data) | ||
| 552 | { | ||
| 553 | struct address_space *mapping = data; | ||
| 554 | int ret = mapping->a_ops->writepage(page, wbc); | ||
| 555 | mapping_set_error(mapping, ret); | ||
| 556 | return ret; | ||
| 557 | } | ||
| 558 | |||
| 549 | static int f2fs_write_data_pages(struct address_space *mapping, | 559 | static int f2fs_write_data_pages(struct address_space *mapping, |
| 550 | struct writeback_control *wbc) | 560 | struct writeback_control *wbc) |
| 551 | { | 561 | { |
| @@ -562,7 +572,7 @@ static int f2fs_write_data_pages(struct address_space *mapping, | |||
| 562 | 572 | ||
| 563 | if (!S_ISDIR(inode->i_mode)) | 573 | if (!S_ISDIR(inode->i_mode)) |
| 564 | mutex_lock(&sbi->writepages); | 574 | mutex_lock(&sbi->writepages); |
| 565 | ret = generic_writepages(mapping, wbc); | 575 | ret = write_cache_pages(mapping, wbc, __f2fs_writepage, mapping); |
| 566 | if (!S_ISDIR(inode->i_mode)) | 576 | if (!S_ISDIR(inode->i_mode)) |
| 567 | mutex_unlock(&sbi->writepages); | 577 | mutex_unlock(&sbi->writepages); |
| 568 | f2fs_submit_bio(sbi, DATA, (wbc->sync_mode == WB_SYNC_ALL)); | 578 | f2fs_submit_bio(sbi, DATA, (wbc->sync_mode == WB_SYNC_ALL)); |
| @@ -688,6 +698,11 @@ static int f2fs_set_data_page_dirty(struct page *page) | |||
| 688 | return 0; | 698 | return 0; |
| 689 | } | 699 | } |
| 690 | 700 | ||
| 701 | static sector_t f2fs_bmap(struct address_space *mapping, sector_t block) | ||
| 702 | { | ||
| 703 | return generic_block_bmap(mapping, block, get_data_block_ro); | ||
| 704 | } | ||
| 705 | |||
| 691 | const struct address_space_operations f2fs_dblock_aops = { | 706 | const struct address_space_operations f2fs_dblock_aops = { |
| 692 | .readpage = f2fs_read_data_page, | 707 | .readpage = f2fs_read_data_page, |
| 693 | .readpages = f2fs_read_data_pages, | 708 | .readpages = f2fs_read_data_pages, |
| @@ -699,4 +714,5 @@ const struct address_space_operations f2fs_dblock_aops = { | |||
| 699 | .invalidatepage = f2fs_invalidate_data_page, | 714 | .invalidatepage = f2fs_invalidate_data_page, |
| 700 | .releasepage = f2fs_release_data_page, | 715 | .releasepage = f2fs_release_data_page, |
| 701 | .direct_IO = f2fs_direct_IO, | 716 | .direct_IO = f2fs_direct_IO, |
| 717 | .bmap = f2fs_bmap, | ||
| 702 | }; | 718 | }; |
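The new .bmap hook wires f2fs into generic_block_bmap() via get_data_block_ro(), which among other things makes the FIBMAP ioctl work on f2fs files. A userspace sketch of what that enables (not part of the patch; FIBMAP usually requires CAP_SYS_RAWIO and reports 0 for unmapped blocks):

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/fs.h>

int main(int argc, char **argv)
{
	int fd, blk = 0;	/* logical block 0 of the file */

	if (argc < 2)
		return 1;
	fd = open(argv[1], O_RDONLY);
	if (fd < 0 || ioctl(fd, FIBMAP, &blk) < 0) {
		perror("fibmap");
		return 1;
	}
	printf("logical block 0 of %s -> physical block %d\n", argv[1], blk);
	return 0;
}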
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 0e0380a588ad..c8c37307b326 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c | |||
| @@ -26,6 +26,7 @@ | |||
| 26 | 26 | ||
| 27 | static LIST_HEAD(f2fs_stat_list); | 27 | static LIST_HEAD(f2fs_stat_list); |
| 28 | static struct dentry *debugfs_root; | 28 | static struct dentry *debugfs_root; |
| 29 | static DEFINE_MUTEX(f2fs_stat_mutex); | ||
| 29 | 30 | ||
| 30 | static void update_general_status(struct f2fs_sb_info *sbi) | 31 | static void update_general_status(struct f2fs_sb_info *sbi) |
| 31 | { | 32 | { |
| @@ -180,18 +181,14 @@ static int stat_show(struct seq_file *s, void *v) | |||
| 180 | int i = 0; | 181 | int i = 0; |
| 181 | int j; | 182 | int j; |
| 182 | 183 | ||
| 184 | mutex_lock(&f2fs_stat_mutex); | ||
| 183 | list_for_each_entry_safe(si, next, &f2fs_stat_list, stat_list) { | 185 | list_for_each_entry_safe(si, next, &f2fs_stat_list, stat_list) { |
| 184 | 186 | ||
| 185 | mutex_lock(&si->stat_lock); | ||
| 186 | if (!si->sbi) { | ||
| 187 | mutex_unlock(&si->stat_lock); | ||
| 188 | continue; | ||
| 189 | } | ||
| 190 | update_general_status(si->sbi); | 187 | update_general_status(si->sbi); |
| 191 | 188 | ||
| 192 | seq_printf(s, "\n=====[ partition info. #%d ]=====\n", i++); | 189 | seq_printf(s, "\n=====[ partition info. #%d ]=====\n", i++); |
| 193 | seq_printf(s, "[SB: 1] [CP: 2] [NAT: %d] [SIT: %d] ", | 190 | seq_printf(s, "[SB: 1] [CP: 2] [SIT: %d] [NAT: %d] ", |
| 194 | si->nat_area_segs, si->sit_area_segs); | 191 | si->sit_area_segs, si->nat_area_segs); |
| 195 | seq_printf(s, "[SSA: %d] [MAIN: %d", | 192 | seq_printf(s, "[SSA: %d] [MAIN: %d", |
| 196 | si->ssa_area_segs, si->main_area_segs); | 193 | si->ssa_area_segs, si->main_area_segs); |
| 197 | seq_printf(s, "(OverProv:%d Resv:%d)]\n\n", | 194 | seq_printf(s, "(OverProv:%d Resv:%d)]\n\n", |
| @@ -286,8 +283,8 @@ static int stat_show(struct seq_file *s, void *v) | |||
| 286 | seq_printf(s, "\nMemory: %u KB = static: %u + cached: %u\n", | 283 | seq_printf(s, "\nMemory: %u KB = static: %u + cached: %u\n", |
| 287 | (si->base_mem + si->cache_mem) >> 10, | 284 | (si->base_mem + si->cache_mem) >> 10, |
| 288 | si->base_mem >> 10, si->cache_mem >> 10); | 285 | si->base_mem >> 10, si->cache_mem >> 10); |
| 289 | mutex_unlock(&si->stat_lock); | ||
| 290 | } | 286 | } |
| 287 | mutex_unlock(&f2fs_stat_mutex); | ||
| 291 | return 0; | 288 | return 0; |
| 292 | } | 289 | } |
| 293 | 290 | ||
| @@ -303,7 +300,7 @@ static const struct file_operations stat_fops = { | |||
| 303 | .release = single_release, | 300 | .release = single_release, |
| 304 | }; | 301 | }; |
| 305 | 302 | ||
| 306 | static int init_stats(struct f2fs_sb_info *sbi) | 303 | int f2fs_build_stats(struct f2fs_sb_info *sbi) |
| 307 | { | 304 | { |
| 308 | struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); | 305 | struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); |
| 309 | struct f2fs_stat_info *si; | 306 | struct f2fs_stat_info *si; |
| @@ -313,9 +310,6 @@ static int init_stats(struct f2fs_sb_info *sbi) | |||
| 313 | return -ENOMEM; | 310 | return -ENOMEM; |
| 314 | 311 | ||
| 315 | si = sbi->stat_info; | 312 | si = sbi->stat_info; |
| 316 | mutex_init(&si->stat_lock); | ||
| 317 | list_add_tail(&si->stat_list, &f2fs_stat_list); | ||
| 318 | |||
| 319 | si->all_area_segs = le32_to_cpu(raw_super->segment_count); | 313 | si->all_area_segs = le32_to_cpu(raw_super->segment_count); |
| 320 | si->sit_area_segs = le32_to_cpu(raw_super->segment_count_sit); | 314 | si->sit_area_segs = le32_to_cpu(raw_super->segment_count_sit); |
| 321 | si->nat_area_segs = le32_to_cpu(raw_super->segment_count_nat); | 315 | si->nat_area_segs = le32_to_cpu(raw_super->segment_count_nat); |
| @@ -325,21 +319,11 @@ static int init_stats(struct f2fs_sb_info *sbi) | |||
| 325 | si->main_area_zones = si->main_area_sections / | 319 | si->main_area_zones = si->main_area_sections / |
| 326 | le32_to_cpu(raw_super->secs_per_zone); | 320 | le32_to_cpu(raw_super->secs_per_zone); |
| 327 | si->sbi = sbi; | 321 | si->sbi = sbi; |
| 328 | return 0; | ||
| 329 | } | ||
| 330 | 322 | ||
| 331 | int f2fs_build_stats(struct f2fs_sb_info *sbi) | 323 | mutex_lock(&f2fs_stat_mutex); |
| 332 | { | 324 | list_add_tail(&si->stat_list, &f2fs_stat_list); |
| 333 | int retval; | 325 | mutex_unlock(&f2fs_stat_mutex); |
| 334 | |||
| 335 | retval = init_stats(sbi); | ||
| 336 | if (retval) | ||
| 337 | return retval; | ||
| 338 | |||
| 339 | if (!debugfs_root) | ||
| 340 | debugfs_root = debugfs_create_dir("f2fs", NULL); | ||
| 341 | 326 | ||
| 342 | debugfs_create_file("status", S_IRUGO, debugfs_root, NULL, &stat_fops); | ||
| 343 | return 0; | 327 | return 0; |
| 344 | } | 328 | } |
| 345 | 329 | ||
| @@ -347,14 +331,22 @@ void f2fs_destroy_stats(struct f2fs_sb_info *sbi) | |||
| 347 | { | 331 | { |
| 348 | struct f2fs_stat_info *si = sbi->stat_info; | 332 | struct f2fs_stat_info *si = sbi->stat_info; |
| 349 | 333 | ||
| 334 | mutex_lock(&f2fs_stat_mutex); | ||
| 350 | list_del(&si->stat_list); | 335 | list_del(&si->stat_list); |
| 351 | mutex_lock(&si->stat_lock); | 336 | mutex_unlock(&f2fs_stat_mutex); |
| 352 | si->sbi = NULL; | 337 | |
| 353 | mutex_unlock(&si->stat_lock); | ||
| 354 | kfree(sbi->stat_info); | 338 | kfree(sbi->stat_info); |
| 355 | } | 339 | } |
| 356 | 340 | ||
| 357 | void destroy_root_stats(void) | 341 | void __init f2fs_create_root_stats(void) |
| 342 | { | ||
| 343 | debugfs_root = debugfs_create_dir("f2fs", NULL); | ||
| 344 | if (debugfs_root) | ||
| 345 | debugfs_create_file("status", S_IRUGO, debugfs_root, | ||
| 346 | NULL, &stat_fops); | ||
| 347 | } | ||
| 348 | |||
| 349 | void f2fs_destroy_root_stats(void) | ||
| 358 | { | 350 | { |
| 359 | debugfs_remove_recursive(debugfs_root); | 351 | debugfs_remove_recursive(debugfs_root); |
| 360 | debugfs_root = NULL; | 352 | debugfs_root = NULL; |
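The debug.c rework replaces the per-entry stat_lock and the "->sbi = NULL" handshake with one global mutex that only guards membership of f2fs_stat_list; readers such as stat_show() hold the same mutex for the whole walk, so entries cannot vanish mid-iteration. The underlying registration pattern, reduced to its bones (illustrative names; assumes <linux/list.h> and <linux/mutex.h>):

static LIST_HEAD(example_stat_list);
static DEFINE_MUTEX(example_stat_mutex);

struct example_stat {
	struct list_head list;
	/* per-instance statistics would live here */
};

static void example_register(struct example_stat *si)
{
	mutex_lock(&example_stat_mutex);
	list_add_tail(&si->list, &example_stat_list);
	mutex_unlock(&example_stat_mutex);
}

static void example_unregister(struct example_stat *si)
{
	mutex_lock(&example_stat_mutex);
	list_del(&si->list);
	mutex_unlock(&example_stat_mutex);
}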
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index b4e24f32b54e..989980e16d0b 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c | |||
| @@ -11,6 +11,7 @@ | |||
| 11 | #include <linux/fs.h> | 11 | #include <linux/fs.h> |
| 12 | #include <linux/f2fs_fs.h> | 12 | #include <linux/f2fs_fs.h> |
| 13 | #include "f2fs.h" | 13 | #include "f2fs.h" |
| 14 | #include "node.h" | ||
| 14 | #include "acl.h" | 15 | #include "acl.h" |
| 15 | 16 | ||
| 16 | static unsigned long dir_blocks(struct inode *inode) | 17 | static unsigned long dir_blocks(struct inode *inode) |
| @@ -74,7 +75,7 @@ static unsigned long dir_block_index(unsigned int level, unsigned int idx) | |||
| 74 | return bidx; | 75 | return bidx; |
| 75 | } | 76 | } |
| 76 | 77 | ||
| 77 | static bool early_match_name(const char *name, int namelen, | 78 | static bool early_match_name(const char *name, size_t namelen, |
| 78 | f2fs_hash_t namehash, struct f2fs_dir_entry *de) | 79 | f2fs_hash_t namehash, struct f2fs_dir_entry *de) |
| 79 | { | 80 | { |
| 80 | if (le16_to_cpu(de->name_len) != namelen) | 81 | if (le16_to_cpu(de->name_len) != namelen) |
| @@ -87,7 +88,7 @@ static bool early_match_name(const char *name, int namelen, | |||
| 87 | } | 88 | } |
| 88 | 89 | ||
| 89 | static struct f2fs_dir_entry *find_in_block(struct page *dentry_page, | 90 | static struct f2fs_dir_entry *find_in_block(struct page *dentry_page, |
| 90 | const char *name, int namelen, int *max_slots, | 91 | const char *name, size_t namelen, int *max_slots, |
| 91 | f2fs_hash_t namehash, struct page **res_page) | 92 | f2fs_hash_t namehash, struct page **res_page) |
| 92 | { | 93 | { |
| 93 | struct f2fs_dir_entry *de; | 94 | struct f2fs_dir_entry *de; |
| @@ -126,7 +127,7 @@ found: | |||
| 126 | } | 127 | } |
| 127 | 128 | ||
| 128 | static struct f2fs_dir_entry *find_in_level(struct inode *dir, | 129 | static struct f2fs_dir_entry *find_in_level(struct inode *dir, |
| 129 | unsigned int level, const char *name, int namelen, | 130 | unsigned int level, const char *name, size_t namelen, |
| 130 | f2fs_hash_t namehash, struct page **res_page) | 131 | f2fs_hash_t namehash, struct page **res_page) |
| 131 | { | 132 | { |
| 132 | int s = GET_DENTRY_SLOTS(namelen); | 133 | int s = GET_DENTRY_SLOTS(namelen); |
| @@ -181,7 +182,7 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, | |||
| 181 | struct qstr *child, struct page **res_page) | 182 | struct qstr *child, struct page **res_page) |
| 182 | { | 183 | { |
| 183 | const char *name = child->name; | 184 | const char *name = child->name; |
| 184 | int namelen = child->len; | 185 | size_t namelen = child->len; |
| 185 | unsigned long npages = dir_blocks(dir); | 186 | unsigned long npages = dir_blocks(dir); |
| 186 | struct f2fs_dir_entry *de = NULL; | 187 | struct f2fs_dir_entry *de = NULL; |
| 187 | f2fs_hash_t name_hash; | 188 | f2fs_hash_t name_hash; |
| @@ -308,6 +309,7 @@ static int init_inode_metadata(struct inode *inode, struct dentry *dentry) | |||
| 308 | ipage = get_node_page(F2FS_SB(dir->i_sb), inode->i_ino); | 309 | ipage = get_node_page(F2FS_SB(dir->i_sb), inode->i_ino); |
| 309 | if (IS_ERR(ipage)) | 310 | if (IS_ERR(ipage)) |
| 310 | return PTR_ERR(ipage); | 311 | return PTR_ERR(ipage); |
| 312 | set_cold_node(inode, ipage); | ||
| 311 | init_dent_inode(dentry, ipage); | 313 | init_dent_inode(dentry, ipage); |
| 312 | f2fs_put_page(ipage, 1); | 314 | f2fs_put_page(ipage, 1); |
| 313 | } | 315 | } |
| @@ -381,7 +383,7 @@ int f2fs_add_link(struct dentry *dentry, struct inode *inode) | |||
| 381 | struct inode *dir = dentry->d_parent->d_inode; | 383 | struct inode *dir = dentry->d_parent->d_inode; |
| 382 | struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); | 384 | struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); |
| 383 | const char *name = dentry->d_name.name; | 385 | const char *name = dentry->d_name.name; |
| 384 | int namelen = dentry->d_name.len; | 386 | size_t namelen = dentry->d_name.len; |
| 385 | struct page *dentry_page = NULL; | 387 | struct page *dentry_page = NULL; |
| 386 | struct f2fs_dentry_block *dentry_blk = NULL; | 388 | struct f2fs_dentry_block *dentry_blk = NULL; |
| 387 | int slots = GET_DENTRY_SLOTS(namelen); | 389 | int slots = GET_DENTRY_SLOTS(namelen); |
| @@ -501,7 +503,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, | |||
| 501 | } | 503 | } |
| 502 | 504 | ||
| 503 | if (inode) { | 505 | if (inode) { |
| 504 | inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; | 506 | inode->i_ctime = CURRENT_TIME; |
| 505 | drop_nlink(inode); | 507 | drop_nlink(inode); |
| 506 | if (S_ISDIR(inode->i_mode)) { | 508 | if (S_ISDIR(inode->i_mode)) { |
| 507 | drop_nlink(inode); | 509 | drop_nlink(inode); |
| @@ -540,13 +542,13 @@ int f2fs_make_empty(struct inode *inode, struct inode *parent) | |||
| 540 | 542 | ||
| 541 | de = &dentry_blk->dentry[0]; | 543 | de = &dentry_blk->dentry[0]; |
| 542 | de->name_len = cpu_to_le16(1); | 544 | de->name_len = cpu_to_le16(1); |
| 543 | de->hash_code = 0; | 545 | de->hash_code = f2fs_dentry_hash(".", 1); |
| 544 | de->ino = cpu_to_le32(inode->i_ino); | 546 | de->ino = cpu_to_le32(inode->i_ino); |
| 545 | memcpy(dentry_blk->filename[0], ".", 1); | 547 | memcpy(dentry_blk->filename[0], ".", 1); |
| 546 | set_de_type(de, inode); | 548 | set_de_type(de, inode); |
| 547 | 549 | ||
| 548 | de = &dentry_blk->dentry[1]; | 550 | de = &dentry_blk->dentry[1]; |
| 549 | de->hash_code = 0; | 551 | de->hash_code = f2fs_dentry_hash("..", 2); |
| 550 | de->name_len = cpu_to_le16(2); | 552 | de->name_len = cpu_to_le16(2); |
| 551 | de->ino = cpu_to_le32(parent->i_ino); | 553 | de->ino = cpu_to_le32(parent->i_ino); |
| 552 | memcpy(dentry_blk->filename[1], "..", 2); | 554 | memcpy(dentry_blk->filename[1], "..", 2); |
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index a18d63db2fb6..c8e2d751ef9c 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h | |||
| @@ -211,11 +211,11 @@ struct dnode_of_data { | |||
| 211 | static inline void set_new_dnode(struct dnode_of_data *dn, struct inode *inode, | 211 | static inline void set_new_dnode(struct dnode_of_data *dn, struct inode *inode, |
| 212 | struct page *ipage, struct page *npage, nid_t nid) | 212 | struct page *ipage, struct page *npage, nid_t nid) |
| 213 | { | 213 | { |
| 214 | memset(dn, 0, sizeof(*dn)); | ||
| 214 | dn->inode = inode; | 215 | dn->inode = inode; |
| 215 | dn->inode_page = ipage; | 216 | dn->inode_page = ipage; |
| 216 | dn->node_page = npage; | 217 | dn->node_page = npage; |
| 217 | dn->nid = nid; | 218 | dn->nid = nid; |
| 218 | dn->inode_page_locked = 0; | ||
| 219 | } | 219 | } |
| 220 | 220 | ||
| 221 | /* | 221 | /* |
| @@ -877,11 +877,13 @@ bool f2fs_empty_dir(struct inode *); | |||
| 877 | * super.c | 877 | * super.c |
| 878 | */ | 878 | */ |
| 879 | int f2fs_sync_fs(struct super_block *, int); | 879 | int f2fs_sync_fs(struct super_block *, int); |
| 880 | extern __printf(3, 4) | ||
| 881 | void f2fs_msg(struct super_block *, const char *, const char *, ...); | ||
| 880 | 882 | ||
| 881 | /* | 883 | /* |
| 882 | * hash.c | 884 | * hash.c |
| 883 | */ | 885 | */ |
| 884 | f2fs_hash_t f2fs_dentry_hash(const char *, int); | 886 | f2fs_hash_t f2fs_dentry_hash(const char *, size_t); |
| 885 | 887 | ||
| 886 | /* | 888 | /* |
| 887 | * node.c | 889 | * node.c |
| @@ -912,7 +914,7 @@ int restore_node_summary(struct f2fs_sb_info *, unsigned int, | |||
| 912 | void flush_nat_entries(struct f2fs_sb_info *); | 914 | void flush_nat_entries(struct f2fs_sb_info *); |
| 913 | int build_node_manager(struct f2fs_sb_info *); | 915 | int build_node_manager(struct f2fs_sb_info *); |
| 914 | void destroy_node_manager(struct f2fs_sb_info *); | 916 | void destroy_node_manager(struct f2fs_sb_info *); |
| 915 | int create_node_manager_caches(void); | 917 | int __init create_node_manager_caches(void); |
| 916 | void destroy_node_manager_caches(void); | 918 | void destroy_node_manager_caches(void); |
| 917 | 919 | ||
| 918 | /* | 920 | /* |
| @@ -964,7 +966,7 @@ void sync_dirty_dir_inodes(struct f2fs_sb_info *); | |||
| 964 | void block_operations(struct f2fs_sb_info *); | 966 | void block_operations(struct f2fs_sb_info *); |
| 965 | void write_checkpoint(struct f2fs_sb_info *, bool, bool); | 967 | void write_checkpoint(struct f2fs_sb_info *, bool, bool); |
| 966 | void init_orphan_info(struct f2fs_sb_info *); | 968 | void init_orphan_info(struct f2fs_sb_info *); |
| 967 | int create_checkpoint_caches(void); | 969 | int __init create_checkpoint_caches(void); |
| 968 | void destroy_checkpoint_caches(void); | 970 | void destroy_checkpoint_caches(void); |
| 969 | 971 | ||
| 970 | /* | 972 | /* |
| @@ -984,9 +986,9 @@ int do_write_data_page(struct page *); | |||
| 984 | int start_gc_thread(struct f2fs_sb_info *); | 986 | int start_gc_thread(struct f2fs_sb_info *); |
| 985 | void stop_gc_thread(struct f2fs_sb_info *); | 987 | void stop_gc_thread(struct f2fs_sb_info *); |
| 986 | block_t start_bidx_of_node(unsigned int); | 988 | block_t start_bidx_of_node(unsigned int); |
| 987 | int f2fs_gc(struct f2fs_sb_info *, int); | 989 | int f2fs_gc(struct f2fs_sb_info *); |
| 988 | void build_gc_manager(struct f2fs_sb_info *); | 990 | void build_gc_manager(struct f2fs_sb_info *); |
| 989 | int create_gc_caches(void); | 991 | int __init create_gc_caches(void); |
| 990 | void destroy_gc_caches(void); | 992 | void destroy_gc_caches(void); |
| 991 | 993 | ||
| 992 | /* | 994 | /* |
| @@ -1058,7 +1060,8 @@ struct f2fs_stat_info { | |||
| 1058 | 1060 | ||
| 1059 | int f2fs_build_stats(struct f2fs_sb_info *); | 1061 | int f2fs_build_stats(struct f2fs_sb_info *); |
| 1060 | void f2fs_destroy_stats(struct f2fs_sb_info *); | 1062 | void f2fs_destroy_stats(struct f2fs_sb_info *); |
| 1061 | void destroy_root_stats(void); | 1063 | void __init f2fs_create_root_stats(void); |
| 1064 | void f2fs_destroy_root_stats(void); | ||
| 1062 | #else | 1065 | #else |
| 1063 | #define stat_inc_call_count(si) | 1066 | #define stat_inc_call_count(si) |
| 1064 | #define stat_inc_seg_count(si, type) | 1067 | #define stat_inc_seg_count(si, type) |
| @@ -1068,7 +1071,8 @@ void destroy_root_stats(void); | |||
| 1068 | 1071 | ||
| 1069 | static inline int f2fs_build_stats(struct f2fs_sb_info *sbi) { return 0; } | 1072 | static inline int f2fs_build_stats(struct f2fs_sb_info *sbi) { return 0; } |
| 1070 | static inline void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { } | 1073 | static inline void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { } |
| 1071 | static inline void destroy_root_stats(void) { } | 1074 | static inline void __init f2fs_create_root_stats(void) { } |
| 1075 | static inline void f2fs_destroy_root_stats(void) { } | ||
| 1072 | #endif | 1076 | #endif |
| 1073 | 1077 | ||
| 1074 | extern const struct file_operations f2fs_dir_operations; | 1078 | extern const struct file_operations f2fs_dir_operations; |
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index f9e085dfb1f0..3191b52aafb0 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c | |||
| @@ -96,8 +96,9 @@ out: | |||
| 96 | } | 96 | } |
| 97 | 97 | ||
| 98 | static const struct vm_operations_struct f2fs_file_vm_ops = { | 98 | static const struct vm_operations_struct f2fs_file_vm_ops = { |
| 99 | .fault = filemap_fault, | 99 | .fault = filemap_fault, |
| 100 | .page_mkwrite = f2fs_vm_page_mkwrite, | 100 | .page_mkwrite = f2fs_vm_page_mkwrite, |
| 101 | .remap_pages = generic_file_remap_pages, | ||
| 101 | }; | 102 | }; |
| 102 | 103 | ||
| 103 | static int need_to_sync_dir(struct f2fs_sb_info *sbi, struct inode *inode) | 104 | static int need_to_sync_dir(struct f2fs_sb_info *sbi, struct inode *inode) |
| @@ -137,6 +138,9 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) | |||
| 137 | if (ret) | 138 | if (ret) |
| 138 | return ret; | 139 | return ret; |
| 139 | 140 | ||
| 141 | /* guarantee free sections for fsync */ | ||
| 142 | f2fs_balance_fs(sbi); | ||
| 143 | |||
| 140 | mutex_lock(&inode->i_mutex); | 144 | mutex_lock(&inode->i_mutex); |
| 141 | 145 | ||
| 142 | if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) | 146 | if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) |
| @@ -160,15 +164,17 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) | |||
| 160 | if (need_to_sync_dir(sbi, inode)) | 164 | if (need_to_sync_dir(sbi, inode)) |
| 161 | need_cp = true; | 165 | need_cp = true; |
| 162 | 166 | ||
| 163 | f2fs_write_inode(inode, NULL); | ||
| 164 | |||
| 165 | if (need_cp) { | 167 | if (need_cp) { |
| 166 | /* all the dirty node pages should be flushed for POR */ | 168 | /* all the dirty node pages should be flushed for POR */ |
| 167 | ret = f2fs_sync_fs(inode->i_sb, 1); | 169 | ret = f2fs_sync_fs(inode->i_sb, 1); |
| 168 | clear_inode_flag(F2FS_I(inode), FI_NEED_CP); | 170 | clear_inode_flag(F2FS_I(inode), FI_NEED_CP); |
| 169 | } else { | 171 | } else { |
| 170 | while (sync_node_pages(sbi, inode->i_ino, &wbc) == 0) | 172 | /* if there is no written node page, write its inode page */ |
| 171 | f2fs_write_inode(inode, NULL); | 173 | while (!sync_node_pages(sbi, inode->i_ino, &wbc)) { |
| 174 | ret = f2fs_write_inode(inode, NULL); | ||
| 175 | if (ret) | ||
| 176 | goto out; | ||
| 177 | } | ||
| 172 | filemap_fdatawait_range(sbi->node_inode->i_mapping, | 178 | filemap_fdatawait_range(sbi->node_inode->i_mapping, |
| 173 | 0, LONG_MAX); | 179 | 0, LONG_MAX); |
| 174 | } | 180 | } |
| @@ -405,6 +411,8 @@ int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end) | |||
| 405 | struct dnode_of_data dn; | 411 | struct dnode_of_data dn; |
| 406 | struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); | 412 | struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); |
| 407 | 413 | ||
| 414 | f2fs_balance_fs(sbi); | ||
| 415 | |||
| 408 | mutex_lock_op(sbi, DATA_TRUNC); | 416 | mutex_lock_op(sbi, DATA_TRUNC); |
| 409 | set_new_dnode(&dn, inode, NULL, NULL, 0); | 417 | set_new_dnode(&dn, inode, NULL, NULL, 0); |
| 410 | err = get_dnode_of_data(&dn, index, RDONLY_NODE); | 418 | err = get_dnode_of_data(&dn, index, RDONLY_NODE); |
| @@ -532,7 +540,6 @@ static long f2fs_fallocate(struct file *file, int mode, | |||
| 532 | loff_t offset, loff_t len) | 540 | loff_t offset, loff_t len) |
| 533 | { | 541 | { |
| 534 | struct inode *inode = file->f_path.dentry->d_inode; | 542 | struct inode *inode = file->f_path.dentry->d_inode; |
| 535 | struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); | ||
| 536 | long ret; | 543 | long ret; |
| 537 | 544 | ||
| 538 | if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) | 545 | if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) |
| @@ -543,7 +550,10 @@ static long f2fs_fallocate(struct file *file, int mode, | |||
| 543 | else | 550 | else |
| 544 | ret = expand_inode_data(inode, offset, len, mode); | 551 | ret = expand_inode_data(inode, offset, len, mode); |
| 545 | 552 | ||
| 546 | f2fs_balance_fs(sbi); | 553 | if (!ret) { |
| 554 | inode->i_mtime = inode->i_ctime = CURRENT_TIME; | ||
| 555 | mark_inode_dirty(inode); | ||
| 556 | } | ||
| 547 | return ret; | 557 | return ret; |
| 548 | } | 558 | } |
| 549 | 559 | ||
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 644aa3808273..c386910dacc5 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c | |||
| @@ -78,7 +78,7 @@ static int gc_thread_func(void *data) | |||
| 78 | 78 | ||
| 79 | sbi->bg_gc++; | 79 | sbi->bg_gc++; |
| 80 | 80 | ||
| 81 | if (f2fs_gc(sbi, 1) == GC_NONE) | 81 | if (f2fs_gc(sbi) == GC_NONE) |
| 82 | wait_ms = GC_THREAD_NOGC_SLEEP_TIME; | 82 | wait_ms = GC_THREAD_NOGC_SLEEP_TIME; |
| 83 | else if (wait_ms == GC_THREAD_NOGC_SLEEP_TIME) | 83 | else if (wait_ms == GC_THREAD_NOGC_SLEEP_TIME) |
| 84 | wait_ms = GC_THREAD_MAX_SLEEP_TIME; | 84 | wait_ms = GC_THREAD_MAX_SLEEP_TIME; |
| @@ -390,9 +390,7 @@ next_step: | |||
| 390 | } | 390 | } |
| 391 | 391 | ||
| 392 | err = check_valid_map(sbi, segno, off); | 392 | err = check_valid_map(sbi, segno, off); |
| 393 | if (err == GC_ERROR) | 393 | if (err == GC_NEXT) |
| 394 | return err; | ||
| 395 | else if (err == GC_NEXT) | ||
| 396 | continue; | 394 | continue; |
| 397 | 395 | ||
| 398 | if (initial) { | 396 | if (initial) { |
| @@ -426,32 +424,30 @@ next_step: | |||
| 426 | } | 424 | } |
| 427 | 425 | ||
| 428 | /* | 426 | /* |
| 429 | * Calculate start block index that this node page contains | 427 | * Calculate start block index indicating the given node offset. |
| 428 | * Be careful, caller should give this node offset only indicating direct node | ||
| 429 | * blocks. If any node offsets, which point the other types of node blocks such | ||
| 430 | * as indirect or double indirect node blocks, are given, it must be a caller's | ||
| 431 | * bug. | ||
| 430 | */ | 432 | */ |
| 431 | block_t start_bidx_of_node(unsigned int node_ofs) | 433 | block_t start_bidx_of_node(unsigned int node_ofs) |
| 432 | { | 434 | { |
| 433 | block_t start_bidx; | 435 | unsigned int indirect_blks = 2 * NIDS_PER_BLOCK + 4; |
| 434 | unsigned int bidx, indirect_blks; | 436 | unsigned int bidx; |
| 435 | int dec; | ||
| 436 | 437 | ||
| 437 | indirect_blks = 2 * NIDS_PER_BLOCK + 4; | 438 | if (node_ofs == 0) |
| 439 | return 0; | ||
| 438 | 440 | ||
| 439 | start_bidx = 1; | 441 | if (node_ofs <= 2) { |
| 440 | if (node_ofs == 0) { | ||
| 441 | start_bidx = 0; | ||
| 442 | } else if (node_ofs <= 2) { | ||
| 443 | bidx = node_ofs - 1; | 442 | bidx = node_ofs - 1; |
| 444 | } else if (node_ofs <= indirect_blks) { | 443 | } else if (node_ofs <= indirect_blks) { |
| 445 | dec = (node_ofs - 4) / (NIDS_PER_BLOCK + 1); | 444 | int dec = (node_ofs - 4) / (NIDS_PER_BLOCK + 1); |
| 446 | bidx = node_ofs - 2 - dec; | 445 | bidx = node_ofs - 2 - dec; |
| 447 | } else { | 446 | } else { |
| 448 | dec = (node_ofs - indirect_blks - 3) / (NIDS_PER_BLOCK + 1); | 447 | int dec = (node_ofs - indirect_blks - 3) / (NIDS_PER_BLOCK + 1); |
| 449 | bidx = node_ofs - 5 - dec; | 448 | bidx = node_ofs - 5 - dec; |
| 450 | } | 449 | } |
| 451 | 450 | return bidx * ADDRS_PER_BLOCK + ADDRS_PER_INODE; | |
| 452 | if (start_bidx) | ||
| 453 | start_bidx = bidx * ADDRS_PER_BLOCK + ADDRS_PER_INODE; | ||
| 454 | return start_bidx; | ||
| 455 | } | 451 | } |
| 456 | 452 | ||
| 457 | static int check_dnode(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, | 453 | static int check_dnode(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, |
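A worked example of the rewritten start_bidx_of_node(), assuming the usual 4KB f2fs geometry (ADDRS_PER_INODE = 923, ADDRS_PER_BLOCK = NIDS_PER_BLOCK = 1018; these constants are assumptions here, check f2fs.h/node.h for the build at hand):

/*
 * node_ofs 0 -> 0                            (the inode block itself)
 * node_ofs 1 -> (1 - 1) * 1018 + 923 =  923  (first direct node)
 * node_ofs 2 -> (2 - 1) * 1018 + 923 = 1941  (second direct node)
 * node_ofs 4 -> (4 - 2 - 0) * 1018 + 923 = 2959
 *               (first direct node under the first indirect node)
 * node_ofs 3 -> an indirect node: per the new comment above, passing
 *               it would be a caller bug, not a supported input
 */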
| @@ -556,9 +552,7 @@ next_step: | |||
| 556 | } | 552 | } |
| 557 | 553 | ||
| 558 | err = check_valid_map(sbi, segno, off); | 554 | err = check_valid_map(sbi, segno, off); |
| 559 | if (err == GC_ERROR) | 555 | if (err == GC_NEXT) |
| 560 | goto stop; | ||
| 561 | else if (err == GC_NEXT) | ||
| 562 | continue; | 556 | continue; |
| 563 | 557 | ||
| 564 | if (phase == 0) { | 558 | if (phase == 0) { |
| @@ -568,9 +562,7 @@ next_step: | |||
| 568 | 562 | ||
| 569 | /* Get an inode by ino with checking validity */ | 563 | /* Get an inode by ino with checking validity */ |
| 570 | err = check_dnode(sbi, entry, &dni, start_addr + off, &nofs); | 564 | err = check_dnode(sbi, entry, &dni, start_addr + off, &nofs); |
| 571 | if (err == GC_ERROR) | 565 | if (err == GC_NEXT) |
| 572 | goto stop; | ||
| 573 | else if (err == GC_NEXT) | ||
| 574 | continue; | 566 | continue; |
| 575 | 567 | ||
| 576 | if (phase == 1) { | 568 | if (phase == 1) { |
| @@ -663,62 +655,44 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno, | |||
| 663 | return ret; | 655 | return ret; |
| 664 | } | 656 | } |
| 665 | 657 | ||
| 666 | int f2fs_gc(struct f2fs_sb_info *sbi, int nGC) | 658 | int f2fs_gc(struct f2fs_sb_info *sbi) |
| 667 | { | 659 | { |
| 668 | unsigned int segno; | ||
| 669 | int old_free_secs, cur_free_secs; | ||
| 670 | int gc_status, nfree; | ||
| 671 | struct list_head ilist; | 660 | struct list_head ilist; |
| 661 | unsigned int segno, i; | ||
| 672 | int gc_type = BG_GC; | 662 | int gc_type = BG_GC; |
| 663 | int gc_status = GC_NONE; | ||
| 673 | 664 | ||
| 674 | INIT_LIST_HEAD(&ilist); | 665 | INIT_LIST_HEAD(&ilist); |
| 675 | gc_more: | 666 | gc_more: |
| 676 | nfree = 0; | 667 | if (!(sbi->sb->s_flags & MS_ACTIVE)) |
| 677 | gc_status = GC_NONE; | 668 | goto stop; |
| 678 | 669 | ||
| 679 | if (has_not_enough_free_secs(sbi)) | 670 | if (has_not_enough_free_secs(sbi)) |
| 680 | old_free_secs = reserved_sections(sbi); | 671 | gc_type = FG_GC; |
| 681 | else | ||
| 682 | old_free_secs = free_sections(sbi); | ||
| 683 | |||
| 684 | while (sbi->sb->s_flags & MS_ACTIVE) { | ||
| 685 | int i; | ||
| 686 | if (has_not_enough_free_secs(sbi)) | ||
| 687 | gc_type = FG_GC; | ||
| 688 | 672 | ||
| 689 | cur_free_secs = free_sections(sbi) + nfree; | 673 | if (!__get_victim(sbi, &segno, gc_type, NO_CHECK_TYPE)) |
| 674 | goto stop; | ||
| 690 | 675 | ||
| 691 | /* We got free space successfully. */ | 676 | for (i = 0; i < sbi->segs_per_sec; i++) { |
| 692 | if (nGC < cur_free_secs - old_free_secs) | 677 | /* |
| 693 | break; | 678 | * do_garbage_collect will give us three gc_status: |
| 694 | 679 | * GC_ERROR, GC_DONE, and GC_BLOCKED. | |
| 695 | if (!__get_victim(sbi, &segno, gc_type, NO_CHECK_TYPE)) | 680 | * If GC is finished uncleanly, we have to return |
| 681 | * the victim to dirty segment list. | ||
| 682 | */ | ||
| 683 | gc_status = do_garbage_collect(sbi, segno + i, &ilist, gc_type); | ||
| 684 | if (gc_status != GC_DONE) | ||
| 696 | break; | 685 | break; |
| 697 | |||
| 698 | for (i = 0; i < sbi->segs_per_sec; i++) { | ||
| 699 | /* | ||
| 700 | * do_garbage_collect will give us three gc_status: | ||
| 701 | * GC_ERROR, GC_DONE, and GC_BLOCKED. | ||
| 702 | * If GC is finished uncleanly, we have to return | ||
| 703 | * the victim to dirty segment list. | ||
| 704 | */ | ||
| 705 | gc_status = do_garbage_collect(sbi, segno + i, | ||
| 706 | &ilist, gc_type); | ||
| 707 | if (gc_status != GC_DONE) | ||
| 708 | goto stop; | ||
| 709 | nfree++; | ||
| 710 | } | ||
| 711 | } | 686 | } |
| 712 | stop: | 687 | if (has_not_enough_free_secs(sbi)) { |
| 713 | if (has_not_enough_free_secs(sbi) || gc_status == GC_BLOCKED) { | ||
| 714 | write_checkpoint(sbi, (gc_status == GC_BLOCKED), false); | 688 | write_checkpoint(sbi, (gc_status == GC_BLOCKED), false); |
| 715 | if (nfree) | 689 | if (has_not_enough_free_secs(sbi)) |
| 716 | goto gc_more; | 690 | goto gc_more; |
| 717 | } | 691 | } |
| 692 | stop: | ||
| 718 | mutex_unlock(&sbi->gc_mutex); | 693 | mutex_unlock(&sbi->gc_mutex); |
| 719 | 694 | ||
| 720 | put_gc_inode(&ilist); | 695 | put_gc_inode(&ilist); |
| 721 | BUG_ON(!list_empty(&ilist)); | ||
| 722 | return gc_status; | 696 | return gc_status; |
| 723 | } | 697 | } |
| 724 | 698 | ||
| @@ -727,7 +701,7 @@ void build_gc_manager(struct f2fs_sb_info *sbi) | |||
| 727 | DIRTY_I(sbi)->v_ops = &default_v_ops; | 701 | DIRTY_I(sbi)->v_ops = &default_v_ops; |
| 728 | } | 702 | } |
| 729 | 703 | ||
| 730 | int create_gc_caches(void) | 704 | int __init create_gc_caches(void) |
| 731 | { | 705 | { |
| 732 | winode_slab = f2fs_kmem_cache_create("f2fs_gc_inodes", | 706 | winode_slab = f2fs_kmem_cache_create("f2fs_gc_inodes", |
| 733 | sizeof(struct inode_entry), NULL); | 707 | sizeof(struct inode_entry), NULL); |
diff --git a/fs/f2fs/hash.c b/fs/f2fs/hash.c index a60f04200f8b..6eb8d269b53b 100644 --- a/fs/f2fs/hash.c +++ b/fs/f2fs/hash.c | |||
| @@ -42,7 +42,7 @@ static void TEA_transform(unsigned int buf[4], unsigned int const in[]) | |||
| 42 | buf[1] += b1; | 42 | buf[1] += b1; |
| 43 | } | 43 | } |
| 44 | 44 | ||
| 45 | static void str2hashbuf(const char *msg, int len, unsigned int *buf, int num) | 45 | static void str2hashbuf(const char *msg, size_t len, unsigned int *buf, int num) |
| 46 | { | 46 | { |
| 47 | unsigned pad, val; | 47 | unsigned pad, val; |
| 48 | int i; | 48 | int i; |
| @@ -69,13 +69,17 @@ static void str2hashbuf(const char *msg, int len, unsigned int *buf, int num) | |||
| 69 | *buf++ = pad; | 69 | *buf++ = pad; |
| 70 | } | 70 | } |
| 71 | 71 | ||
| 72 | f2fs_hash_t f2fs_dentry_hash(const char *name, int len) | 72 | f2fs_hash_t f2fs_dentry_hash(const char *name, size_t len) |
| 73 | { | 73 | { |
| 74 | __u32 hash, minor_hash; | 74 | __u32 hash; |
| 75 | f2fs_hash_t f2fs_hash; | 75 | f2fs_hash_t f2fs_hash; |
| 76 | const char *p; | 76 | const char *p; |
| 77 | __u32 in[8], buf[4]; | 77 | __u32 in[8], buf[4]; |
| 78 | 78 | ||
| 79 | if ((len <= 2) && (name[0] == '.') && | ||
| 80 | (name[1] == '.' || name[1] == '\0')) | ||
| 81 | return 0; | ||
| 82 | |||
| 79 | /* Initialize the default seed for the hash checksum functions */ | 83 | /* Initialize the default seed for the hash checksum functions */ |
| 80 | buf[0] = 0x67452301; | 84 | buf[0] = 0x67452301; |
| 81 | buf[1] = 0xefcdab89; | 85 | buf[1] = 0xefcdab89; |
| @@ -83,15 +87,15 @@ f2fs_hash_t f2fs_dentry_hash(const char *name, int len) | |||
| 83 | buf[3] = 0x10325476; | 87 | buf[3] = 0x10325476; |
| 84 | 88 | ||
| 85 | p = name; | 89 | p = name; |
| 86 | while (len > 0) { | 90 | while (1) { |
| 87 | str2hashbuf(p, len, in, 4); | 91 | str2hashbuf(p, len, in, 4); |
| 88 | TEA_transform(buf, in); | 92 | TEA_transform(buf, in); |
| 89 | len -= 16; | ||
| 90 | p += 16; | 93 | p += 16; |
| 94 | if (len <= 16) | ||
| 95 | break; | ||
| 96 | len -= 16; | ||
| 91 | } | 97 | } |
| 92 | hash = buf[0]; | 98 | hash = buf[0]; |
| 93 | minor_hash = buf[1]; | ||
| 94 | |||
| 95 | f2fs_hash = cpu_to_le32(hash & ~F2FS_HASH_COL_BIT); | 99 | f2fs_hash = cpu_to_le32(hash & ~F2FS_HASH_COL_BIT); |
| 96 | return f2fs_hash; | 100 | return f2fs_hash; |
| 97 | } | 101 | } |
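The hash.c hunk adds an early return of 0 for "." and "..", and the dir.c hunk earlier in this series stores f2fs_dentry_hash() output for exactly those names, so the hash written by f2fs_make_empty() keeps matching what lookups compute. An illustrative check of that invariant (not in the patch):

static void example_dot_hash_invariant(void)
{
	f2fs_hash_t dot    = f2fs_dentry_hash(".", 1);
	f2fs_hash_t dotdot = f2fs_dentry_hash("..", 2);

	/* both are 0 after this change, matching the on-disk entries */
	WARN_ON(dot != 0 || dotdot != 0);
}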
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index df5fb381ebf1..794241777322 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c | |||
| @@ -203,6 +203,7 @@ void update_inode(struct inode *inode, struct page *node_page) | |||
| 203 | ri->i_flags = cpu_to_le32(F2FS_I(inode)->i_flags); | 203 | ri->i_flags = cpu_to_le32(F2FS_I(inode)->i_flags); |
| 204 | ri->i_pino = cpu_to_le32(F2FS_I(inode)->i_pino); | 204 | ri->i_pino = cpu_to_le32(F2FS_I(inode)->i_pino); |
| 205 | ri->i_generation = cpu_to_le32(inode->i_generation); | 205 | ri->i_generation = cpu_to_le32(inode->i_generation); |
| 206 | set_cold_node(inode, node_page); | ||
| 206 | set_page_dirty(node_page); | 207 | set_page_dirty(node_page); |
| 207 | } | 208 | } |
| 208 | 209 | ||
| @@ -216,6 +217,9 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) | |||
| 216 | inode->i_ino == F2FS_META_INO(sbi)) | 217 | inode->i_ino == F2FS_META_INO(sbi)) |
| 217 | return 0; | 218 | return 0; |
| 218 | 219 | ||
| 220 | if (wbc) | ||
| 221 | f2fs_balance_fs(sbi); | ||
| 222 | |||
| 219 | node_page = get_node_page(sbi, inode->i_ino); | 223 | node_page = get_node_page(sbi, inode->i_ino); |
| 220 | if (IS_ERR(node_page)) | 224 | if (IS_ERR(node_page)) |
| 221 | return PTR_ERR(node_page); | 225 | return PTR_ERR(node_page); |
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 89b7675dc377..1a49b881bac0 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c | |||
| @@ -77,8 +77,8 @@ fail: | |||
| 77 | 77 | ||
| 78 | static int is_multimedia_file(const unsigned char *s, const char *sub) | 78 | static int is_multimedia_file(const unsigned char *s, const char *sub) |
| 79 | { | 79 | { |
| 80 | int slen = strlen(s); | 80 | size_t slen = strlen(s); |
| 81 | int sublen = strlen(sub); | 81 | size_t sublen = strlen(sub); |
| 82 | int ret; | 82 | int ret; |
| 83 | 83 | ||
| 84 | if (sublen > slen) | 84 | if (sublen > slen) |
| @@ -123,6 +123,8 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, | |||
| 123 | nid_t ino = 0; | 123 | nid_t ino = 0; |
| 124 | int err; | 124 | int err; |
| 125 | 125 | ||
| 126 | f2fs_balance_fs(sbi); | ||
| 127 | |||
| 126 | inode = f2fs_new_inode(dir, mode); | 128 | inode = f2fs_new_inode(dir, mode); |
| 127 | if (IS_ERR(inode)) | 129 | if (IS_ERR(inode)) |
| 128 | return PTR_ERR(inode); | 130 | return PTR_ERR(inode); |
| @@ -144,8 +146,6 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, | |||
| 144 | if (!sbi->por_doing) | 146 | if (!sbi->por_doing) |
| 145 | d_instantiate(dentry, inode); | 147 | d_instantiate(dentry, inode); |
| 146 | unlock_new_inode(inode); | 148 | unlock_new_inode(inode); |
| 147 | |||
| 148 | f2fs_balance_fs(sbi); | ||
| 149 | return 0; | 149 | return 0; |
| 150 | out: | 150 | out: |
| 151 | clear_nlink(inode); | 151 | clear_nlink(inode); |
| @@ -163,6 +163,8 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, | |||
| 163 | struct f2fs_sb_info *sbi = F2FS_SB(sb); | 163 | struct f2fs_sb_info *sbi = F2FS_SB(sb); |
| 164 | int err; | 164 | int err; |
| 165 | 165 | ||
| 166 | f2fs_balance_fs(sbi); | ||
| 167 | |||
| 166 | inode->i_ctime = CURRENT_TIME; | 168 | inode->i_ctime = CURRENT_TIME; |
| 167 | atomic_inc(&inode->i_count); | 169 | atomic_inc(&inode->i_count); |
| 168 | 170 | ||
| @@ -172,8 +174,6 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, | |||
| 172 | goto out; | 174 | goto out; |
| 173 | 175 | ||
| 174 | d_instantiate(dentry, inode); | 176 | d_instantiate(dentry, inode); |
| 175 | |||
| 176 | f2fs_balance_fs(sbi); | ||
| 177 | return 0; | 177 | return 0; |
| 178 | out: | 178 | out: |
| 179 | clear_inode_flag(F2FS_I(inode), FI_INC_LINK); | 179 | clear_inode_flag(F2FS_I(inode), FI_INC_LINK); |
| @@ -223,6 +223,8 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) | |||
| 223 | struct page *page; | 223 | struct page *page; |
| 224 | int err = -ENOENT; | 224 | int err = -ENOENT; |
| 225 | 225 | ||
| 226 | f2fs_balance_fs(sbi); | ||
| 227 | |||
| 226 | de = f2fs_find_entry(dir, &dentry->d_name, &page); | 228 | de = f2fs_find_entry(dir, &dentry->d_name, &page); |
| 227 | if (!de) | 229 | if (!de) |
| 228 | goto fail; | 230 | goto fail; |
| @@ -238,7 +240,6 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) | |||
| 238 | 240 | ||
| 239 | /* In order to evict this inode, we set it dirty */ | 241 | /* In order to evict this inode, we set it dirty */ |
| 240 | mark_inode_dirty(inode); | 242 | mark_inode_dirty(inode); |
| 241 | f2fs_balance_fs(sbi); | ||
| 242 | fail: | 243 | fail: |
| 243 | return err; | 244 | return err; |
| 244 | } | 245 | } |
| @@ -249,9 +250,11 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, | |||
| 249 | struct super_block *sb = dir->i_sb; | 250 | struct super_block *sb = dir->i_sb; |
| 250 | struct f2fs_sb_info *sbi = F2FS_SB(sb); | 251 | struct f2fs_sb_info *sbi = F2FS_SB(sb); |
| 251 | struct inode *inode; | 252 | struct inode *inode; |
| 252 | unsigned symlen = strlen(symname) + 1; | 253 | size_t symlen = strlen(symname) + 1; |
| 253 | int err; | 254 | int err; |
| 254 | 255 | ||
| 256 | f2fs_balance_fs(sbi); | ||
| 257 | |||
| 255 | inode = f2fs_new_inode(dir, S_IFLNK | S_IRWXUGO); | 258 | inode = f2fs_new_inode(dir, S_IFLNK | S_IRWXUGO); |
| 256 | if (IS_ERR(inode)) | 259 | if (IS_ERR(inode)) |
| 257 | return PTR_ERR(inode); | 260 | return PTR_ERR(inode); |
| @@ -268,9 +271,6 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, | |||
| 268 | 271 | ||
| 269 | d_instantiate(dentry, inode); | 272 | d_instantiate(dentry, inode); |
| 270 | unlock_new_inode(inode); | 273 | unlock_new_inode(inode); |
| 271 | |||
| 272 | f2fs_balance_fs(sbi); | ||
| 273 | |||
| 274 | return err; | 274 | return err; |
| 275 | out: | 275 | out: |
| 276 | clear_nlink(inode); | 276 | clear_nlink(inode); |
| @@ -286,6 +286,8 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) | |||
| 286 | struct inode *inode; | 286 | struct inode *inode; |
| 287 | int err; | 287 | int err; |
| 288 | 288 | ||
| 289 | f2fs_balance_fs(sbi); | ||
| 290 | |||
| 289 | inode = f2fs_new_inode(dir, S_IFDIR | mode); | 291 | inode = f2fs_new_inode(dir, S_IFDIR | mode); |
| 290 | if (IS_ERR(inode)) | 292 | if (IS_ERR(inode)) |
| 291 | return PTR_ERR(inode); | 293 | return PTR_ERR(inode); |
| @@ -305,7 +307,6 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) | |||
| 305 | d_instantiate(dentry, inode); | 307 | d_instantiate(dentry, inode); |
| 306 | unlock_new_inode(inode); | 308 | unlock_new_inode(inode); |
| 307 | 309 | ||
| 308 | f2fs_balance_fs(sbi); | ||
| 309 | return 0; | 310 | return 0; |
| 310 | 311 | ||
| 311 | out_fail: | 312 | out_fail: |
| @@ -336,6 +337,8 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, | |||
| 336 | if (!new_valid_dev(rdev)) | 337 | if (!new_valid_dev(rdev)) |
| 337 | return -EINVAL; | 338 | return -EINVAL; |
| 338 | 339 | ||
| 340 | f2fs_balance_fs(sbi); | ||
| 341 | |||
| 339 | inode = f2fs_new_inode(dir, mode); | 342 | inode = f2fs_new_inode(dir, mode); |
| 340 | if (IS_ERR(inode)) | 343 | if (IS_ERR(inode)) |
| 341 | return PTR_ERR(inode); | 344 | return PTR_ERR(inode); |
| @@ -350,9 +353,6 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, | |||
| 350 | alloc_nid_done(sbi, inode->i_ino); | 353 | alloc_nid_done(sbi, inode->i_ino); |
| 351 | d_instantiate(dentry, inode); | 354 | d_instantiate(dentry, inode); |
| 352 | unlock_new_inode(inode); | 355 | unlock_new_inode(inode); |
| 353 | |||
| 354 | f2fs_balance_fs(sbi); | ||
| 355 | |||
| 356 | return 0; | 356 | return 0; |
| 357 | out: | 357 | out: |
| 358 | clear_nlink(inode); | 358 | clear_nlink(inode); |
| @@ -376,6 +376,8 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, | |||
| 376 | struct f2fs_dir_entry *new_entry; | 376 | struct f2fs_dir_entry *new_entry; |
| 377 | int err = -ENOENT; | 377 | int err = -ENOENT; |
| 378 | 378 | ||
| 379 | f2fs_balance_fs(sbi); | ||
| 380 | |||
| 379 | old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page); | 381 | old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page); |
| 380 | if (!old_entry) | 382 | if (!old_entry) |
| 381 | goto out; | 383 | goto out; |
| @@ -441,8 +443,6 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, | |||
| 441 | } | 443 | } |
| 442 | 444 | ||
| 443 | mutex_unlock_op(sbi, RENAME); | 445 | mutex_unlock_op(sbi, RENAME); |
| 444 | |||
| 445 | f2fs_balance_fs(sbi); | ||
| 446 | return 0; | 446 | return 0; |
| 447 | 447 | ||
| 448 | out_dir: | 448 | out_dir: |
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 19870361497e..9bda63c9c166 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c | |||
| @@ -484,12 +484,14 @@ static void truncate_node(struct dnode_of_data *dn) | |||
| 484 | struct node_info ni; | 484 | struct node_info ni; |
| 485 | 485 | ||
| 486 | get_node_info(sbi, dn->nid, &ni); | 486 | get_node_info(sbi, dn->nid, &ni); |
| 487 | if (dn->inode->i_blocks == 0) { | ||
| 488 | BUG_ON(ni.blk_addr != NULL_ADDR); | ||
| 489 | goto invalidate; | ||
| 490 | } | ||
| 487 | BUG_ON(ni.blk_addr == NULL_ADDR); | 491 | BUG_ON(ni.blk_addr == NULL_ADDR); |
| 488 | 492 | ||
| 489 | if (ni.blk_addr != NULL_ADDR) | ||
| 490 | invalidate_blocks(sbi, ni.blk_addr); | ||
| 491 | |||
| 492 | /* Deallocate node address */ | 493 | /* Deallocate node address */ |
| 494 | invalidate_blocks(sbi, ni.blk_addr); | ||
| 493 | dec_valid_node_count(sbi, dn->inode, 1); | 495 | dec_valid_node_count(sbi, dn->inode, 1); |
| 494 | set_node_addr(sbi, &ni, NULL_ADDR); | 496 | set_node_addr(sbi, &ni, NULL_ADDR); |
| 495 | 497 | ||
| @@ -499,7 +501,7 @@ static void truncate_node(struct dnode_of_data *dn) | |||
| 499 | } else { | 501 | } else { |
| 500 | sync_inode_page(dn); | 502 | sync_inode_page(dn); |
| 501 | } | 503 | } |
| 502 | 504 | invalidate: | |
| 503 | clear_node_page_dirty(dn->node_page); | 505 | clear_node_page_dirty(dn->node_page); |
| 504 | F2FS_SET_SB_DIRT(sbi); | 506 | F2FS_SET_SB_DIRT(sbi); |
| 505 | 507 | ||
| @@ -768,20 +770,12 @@ int remove_inode_page(struct inode *inode) | |||
| 768 | dn.inode_page_locked = 1; | 770 | dn.inode_page_locked = 1; |
| 769 | truncate_node(&dn); | 771 | truncate_node(&dn); |
| 770 | } | 772 | } |
| 771 | if (inode->i_blocks == 1) { | ||
| 772 | /* inernally call f2fs_put_page() */ | ||
| 773 | set_new_dnode(&dn, inode, page, page, ino); | ||
| 774 | truncate_node(&dn); | ||
| 775 | } else if (inode->i_blocks == 0) { | ||
| 776 | struct node_info ni; | ||
| 777 | get_node_info(sbi, inode->i_ino, &ni); | ||
| 778 | 773 | ||
| 779 | /* called after f2fs_new_inode() is failed */ | 774 | /* 0 is possible, after f2fs_new_inode() fails */ |
| 780 | BUG_ON(ni.blk_addr != NULL_ADDR); | 775 | BUG_ON(inode->i_blocks != 0 && inode->i_blocks != 1); |
| 781 | f2fs_put_page(page, 1); | 776 | set_new_dnode(&dn, inode, page, page, ino); |
| 782 | } else { | 777 | truncate_node(&dn); |
| 783 | BUG(); | 778 | |
| 784 | } | ||
| 785 | mutex_unlock_op(sbi, NODE_TRUNC); | 779 | mutex_unlock_op(sbi, NODE_TRUNC); |
| 786 | return 0; | 780 | return 0; |
| 787 | } | 781 | } |
| @@ -834,17 +828,18 @@ struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs) | |||
| 834 | goto fail; | 828 | goto fail; |
| 835 | } | 829 | } |
| 836 | set_node_addr(sbi, &new_ni, NEW_ADDR); | 830 | set_node_addr(sbi, &new_ni, NEW_ADDR); |
| 831 | set_cold_node(dn->inode, page); | ||
| 837 | 832 | ||
| 838 | dn->node_page = page; | 833 | dn->node_page = page; |
| 839 | sync_inode_page(dn); | 834 | sync_inode_page(dn); |
| 840 | set_page_dirty(page); | 835 | set_page_dirty(page); |
| 841 | set_cold_node(dn->inode, page); | ||
| 842 | if (ofs == 0) | 836 | if (ofs == 0) |
| 843 | inc_valid_inode_count(sbi); | 837 | inc_valid_inode_count(sbi); |
| 844 | 838 | ||
| 845 | return page; | 839 | return page; |
| 846 | 840 | ||
| 847 | fail: | 841 | fail: |
| 842 | clear_node_page_dirty(page); | ||
| 848 | f2fs_put_page(page, 1); | 843 | f2fs_put_page(page, 1); |
| 849 | return ERR_PTR(err); | 844 | return ERR_PTR(err); |
| 850 | } | 845 | } |
| @@ -1093,7 +1088,6 @@ static int f2fs_write_node_page(struct page *page, | |||
| 1093 | { | 1088 | { |
| 1094 | struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb); | 1089 | struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb); |
| 1095 | nid_t nid; | 1090 | nid_t nid; |
| 1096 | unsigned int nofs; | ||
| 1097 | block_t new_addr; | 1091 | block_t new_addr; |
| 1098 | struct node_info ni; | 1092 | struct node_info ni; |
| 1099 | 1093 | ||
| @@ -1110,7 +1104,6 @@ static int f2fs_write_node_page(struct page *page, | |||
| 1110 | 1104 | ||
| 1111 | /* get old block addr of this node page */ | 1105 | /* get old block addr of this node page */ |
| 1112 | nid = nid_of_node(page); | 1106 | nid = nid_of_node(page); |
| 1113 | nofs = ofs_of_node(page); | ||
| 1114 | BUG_ON(page->index != nid); | 1107 | BUG_ON(page->index != nid); |
| 1115 | 1108 | ||
| 1116 | get_node_info(sbi, nid, &ni); | 1109 | get_node_info(sbi, nid, &ni); |
| @@ -1131,6 +1124,12 @@ static int f2fs_write_node_page(struct page *page, | |||
| 1131 | return 0; | 1124 | return 0; |
| 1132 | } | 1125 | } |
| 1133 | 1126 | ||
| 1127 | /* | ||
| 1128 | * It is very important to gather dirty pages and write at once, so that we can | ||
| 1129 | * submit a big bio without interfering other data writes. | ||
| 1130 | * By default, 512 pages (2MB), a segment size, is quite reasonable. | ||
| 1131 | */ | ||
| 1132 | #define COLLECT_DIRTY_NODES 512 | ||
| 1134 | static int f2fs_write_node_pages(struct address_space *mapping, | 1133 | static int f2fs_write_node_pages(struct address_space *mapping, |
| 1135 | struct writeback_control *wbc) | 1134 | struct writeback_control *wbc) |
| 1136 | { | 1135 | { |
| @@ -1138,17 +1137,16 @@ static int f2fs_write_node_pages(struct address_space *mapping, | |||
| 1138 | struct block_device *bdev = sbi->sb->s_bdev; | 1137 | struct block_device *bdev = sbi->sb->s_bdev; |
| 1139 | long nr_to_write = wbc->nr_to_write; | 1138 | long nr_to_write = wbc->nr_to_write; |
| 1140 | 1139 | ||
| 1141 | if (wbc->for_kupdate) | 1140 | /* First check balancing cached NAT entries */ |
| 1142 | return 0; | ||
| 1143 | |||
| 1144 | if (get_pages(sbi, F2FS_DIRTY_NODES) == 0) | ||
| 1145 | return 0; | ||
| 1146 | |||
| 1147 | if (try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK)) { | 1141 | if (try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK)) { |
| 1148 | write_checkpoint(sbi, false, false); | 1142 | write_checkpoint(sbi, false, false); |
| 1149 | return 0; | 1143 | return 0; |
| 1150 | } | 1144 | } |
| 1151 | 1145 | ||
| 1146 | /* collect a number of dirty node pages and write together */ | ||
| 1147 | if (get_pages(sbi, F2FS_DIRTY_NODES) < COLLECT_DIRTY_NODES) | ||
| 1148 | return 0; | ||
| 1149 | |||
| 1152 | /* if mounting is failed, skip writing node pages */ | 1150 | /* if mounting is failed, skip writing node pages */ |
| 1153 | wbc->nr_to_write = bio_get_nr_vecs(bdev); | 1151 | wbc->nr_to_write = bio_get_nr_vecs(bdev); |
| 1154 | sync_node_pages(sbi, 0, wbc); | 1152 | sync_node_pages(sbi, 0, wbc); |
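The new COLLECT_DIRTY_NODES comment encodes a simple piece of arithmetic: with 4 KB blocks and 512 blocks per segment, 512 dirty node pages are exactly one 2 MB segment, so node writeback is deferred until a full segment can go out in one large bio (unless shrinking the NAT cache forces a checkpoint first). The sketch below is a minimal userspace model of that gating arithmetic, not the kernel function; the helper name and the printed values are illustrative only.

    #include <stdbool.h>
    #include <stdio.h>

    /* Illustrative only: models the batching gate described in the new
     * f2fs_write_node_pages() comment, assuming 4 KB blocks and 512
     * blocks per segment (so one segment == 2 MB). */
    #define F2FS_BLKSIZE          4096
    #define COLLECT_DIRTY_NODES    512   /* one segment's worth of node pages */

    static bool should_flush_node_pages(long dirty_node_pages)
    {
            /* defer writeback until a full segment can be written in one bio */
            return dirty_node_pages >= COLLECT_DIRTY_NODES;
    }

    int main(void)
    {
            printf("batch size = %d pages (%d bytes)\n",
                   COLLECT_DIRTY_NODES, COLLECT_DIRTY_NODES * F2FS_BLKSIZE);
            printf("300 dirty pages -> flush? %d\n", should_flush_node_pages(300));
            printf("512 dirty pages -> flush? %d\n", should_flush_node_pages(512));
            return 0;
    }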
| @@ -1571,7 +1569,7 @@ void flush_nat_entries(struct f2fs_sb_info *sbi) | |||
| 1571 | nid_t nid; | 1569 | nid_t nid; |
| 1572 | struct f2fs_nat_entry raw_ne; | 1570 | struct f2fs_nat_entry raw_ne; |
| 1573 | int offset = -1; | 1571 | int offset = -1; |
| 1574 | block_t old_blkaddr, new_blkaddr; | 1572 | block_t new_blkaddr; |
| 1575 | 1573 | ||
| 1576 | ne = list_entry(cur, struct nat_entry, list); | 1574 | ne = list_entry(cur, struct nat_entry, list); |
| 1577 | nid = nat_get_nid(ne); | 1575 | nid = nat_get_nid(ne); |
| @@ -1585,7 +1583,6 @@ void flush_nat_entries(struct f2fs_sb_info *sbi) | |||
| 1585 | offset = lookup_journal_in_cursum(sum, NAT_JOURNAL, nid, 1); | 1583 | offset = lookup_journal_in_cursum(sum, NAT_JOURNAL, nid, 1); |
| 1586 | if (offset >= 0) { | 1584 | if (offset >= 0) { |
| 1587 | raw_ne = nat_in_journal(sum, offset); | 1585 | raw_ne = nat_in_journal(sum, offset); |
| 1588 | old_blkaddr = le32_to_cpu(raw_ne.block_addr); | ||
| 1589 | goto flush_now; | 1586 | goto flush_now; |
| 1590 | } | 1587 | } |
| 1591 | to_nat_page: | 1588 | to_nat_page: |
| @@ -1607,7 +1604,6 @@ to_nat_page: | |||
| 1607 | 1604 | ||
| 1608 | BUG_ON(!nat_blk); | 1605 | BUG_ON(!nat_blk); |
| 1609 | raw_ne = nat_blk->entries[nid - start_nid]; | 1606 | raw_ne = nat_blk->entries[nid - start_nid]; |
| 1610 | old_blkaddr = le32_to_cpu(raw_ne.block_addr); | ||
| 1611 | flush_now: | 1607 | flush_now: |
| 1612 | new_blkaddr = nat_get_blkaddr(ne); | 1608 | new_blkaddr = nat_get_blkaddr(ne); |
| 1613 | 1609 | ||
| @@ -1741,7 +1737,7 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) | |||
| 1741 | kfree(nm_i); | 1737 | kfree(nm_i); |
| 1742 | } | 1738 | } |
| 1743 | 1739 | ||
| 1744 | int create_node_manager_caches(void) | 1740 | int __init create_node_manager_caches(void) |
| 1745 | { | 1741 | { |
| 1746 | nat_entry_slab = f2fs_kmem_cache_create("nat_entry", | 1742 | nat_entry_slab = f2fs_kmem_cache_create("nat_entry", |
| 1747 | sizeof(struct nat_entry), NULL); | 1743 | sizeof(struct nat_entry), NULL); |
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index b07e9b6ef376..f42e4060b399 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c | |||
| @@ -67,7 +67,7 @@ static int recover_dentry(struct page *ipage, struct inode *inode) | |||
| 67 | kunmap(page); | 67 | kunmap(page); |
| 68 | f2fs_put_page(page, 0); | 68 | f2fs_put_page(page, 0); |
| 69 | } else { | 69 | } else { |
| 70 | f2fs_add_link(&dent, inode); | 70 | err = f2fs_add_link(&dent, inode); |
| 71 | } | 71 | } |
| 72 | iput(dir); | 72 | iput(dir); |
| 73 | out: | 73 | out: |
| @@ -144,14 +144,14 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) | |||
| 144 | goto out; | 144 | goto out; |
| 145 | } | 145 | } |
| 146 | 146 | ||
| 147 | INIT_LIST_HEAD(&entry->list); | ||
| 148 | list_add_tail(&entry->list, head); | ||
| 149 | |||
| 150 | entry->inode = f2fs_iget(sbi->sb, ino_of_node(page)); | 147 | entry->inode = f2fs_iget(sbi->sb, ino_of_node(page)); |
| 151 | if (IS_ERR(entry->inode)) { | 148 | if (IS_ERR(entry->inode)) { |
| 152 | err = PTR_ERR(entry->inode); | 149 | err = PTR_ERR(entry->inode); |
| 150 | kmem_cache_free(fsync_entry_slab, entry); | ||
| 153 | goto out; | 151 | goto out; |
| 154 | } | 152 | } |
| 153 | |||
| 154 | list_add_tail(&entry->list, head); | ||
| 155 | entry->blkaddr = blkaddr; | 155 | entry->blkaddr = blkaddr; |
| 156 | } | 156 | } |
| 157 | if (IS_INODE(page)) { | 157 | if (IS_INODE(page)) { |
| @@ -173,10 +173,9 @@ out: | |||
| 173 | static void destroy_fsync_dnodes(struct f2fs_sb_info *sbi, | 173 | static void destroy_fsync_dnodes(struct f2fs_sb_info *sbi, |
| 174 | struct list_head *head) | 174 | struct list_head *head) |
| 175 | { | 175 | { |
| 176 | struct list_head *this; | 176 | struct fsync_inode_entry *entry, *tmp; |
| 177 | struct fsync_inode_entry *entry; | 177 | |
| 178 | list_for_each(this, head) { | 178 | list_for_each_entry_safe(entry, tmp, head, list) { |
| 179 | entry = list_entry(this, struct fsync_inode_entry, list); | ||
| 180 | iput(entry->inode); | 179 | iput(entry->inode); |
| 181 | list_del(&entry->list); | 180 | list_del(&entry->list); |
| 182 | kmem_cache_free(fsync_entry_slab, entry); | 181 | kmem_cache_free(fsync_entry_slab, entry); |
| @@ -228,6 +227,9 @@ static void check_index_in_prev_nodes(struct f2fs_sb_info *sbi, | |||
| 228 | 227 | ||
| 229 | /* Deallocate previous index in the node page */ | 228 | /* Deallocate previous index in the node page */ |
| 230 | inode = f2fs_iget_nowait(sbi->sb, ino); | 229 | inode = f2fs_iget_nowait(sbi->sb, ino); |
| 230 | if (IS_ERR(inode)) | ||
| 231 | return; | ||
| 232 | |||
| 231 | truncate_hole(inode, bidx, bidx + 1); | 233 | truncate_hole(inode, bidx, bidx + 1); |
| 232 | iput(inode); | 234 | iput(inode); |
| 233 | } | 235 | } |
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 1b26e4ea1016..4b0099066582 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c | |||
| @@ -12,57 +12,26 @@ | |||
| 12 | #include <linux/f2fs_fs.h> | 12 | #include <linux/f2fs_fs.h> |
| 13 | #include <linux/bio.h> | 13 | #include <linux/bio.h> |
| 14 | #include <linux/blkdev.h> | 14 | #include <linux/blkdev.h> |
| 15 | #include <linux/prefetch.h> | ||
| 15 | #include <linux/vmalloc.h> | 16 | #include <linux/vmalloc.h> |
| 16 | 17 | ||
| 17 | #include "f2fs.h" | 18 | #include "f2fs.h" |
| 18 | #include "segment.h" | 19 | #include "segment.h" |
| 19 | #include "node.h" | 20 | #include "node.h" |
| 20 | 21 | ||
| 21 | static int need_to_flush(struct f2fs_sb_info *sbi) | ||
| 22 | { | ||
| 23 | unsigned int pages_per_sec = (1 << sbi->log_blocks_per_seg) * | ||
| 24 | sbi->segs_per_sec; | ||
| 25 | int node_secs = ((get_pages(sbi, F2FS_DIRTY_NODES) + pages_per_sec - 1) | ||
| 26 | >> sbi->log_blocks_per_seg) / sbi->segs_per_sec; | ||
| 27 | int dent_secs = ((get_pages(sbi, F2FS_DIRTY_DENTS) + pages_per_sec - 1) | ||
| 28 | >> sbi->log_blocks_per_seg) / sbi->segs_per_sec; | ||
| 29 | |||
| 30 | if (sbi->por_doing) | ||
| 31 | return 0; | ||
| 32 | |||
| 33 | if (free_sections(sbi) <= (node_secs + 2 * dent_secs + | ||
| 34 | reserved_sections(sbi))) | ||
| 35 | return 1; | ||
| 36 | return 0; | ||
| 37 | } | ||
| 38 | |||
| 39 | /* | 22 | /* |
| 40 | * This function balances dirty node and dentry pages. | 23 | * This function balances dirty node and dentry pages. |
| 41 | * In addition, it controls garbage collection. | 24 | * In addition, it controls garbage collection. |
| 42 | */ | 25 | */ |
| 43 | void f2fs_balance_fs(struct f2fs_sb_info *sbi) | 26 | void f2fs_balance_fs(struct f2fs_sb_info *sbi) |
| 44 | { | 27 | { |
| 45 | struct writeback_control wbc = { | ||
| 46 | .sync_mode = WB_SYNC_ALL, | ||
| 47 | .nr_to_write = LONG_MAX, | ||
| 48 | .for_reclaim = 0, | ||
| 49 | }; | ||
| 50 | |||
| 51 | if (sbi->por_doing) | ||
| 52 | return; | ||
| 53 | |||
| 54 | /* | 28 | /* |
| 55 | * We should do checkpoint when there are so many dirty node pages | 29 | * We should do GC or end up with checkpoint, if there are so many dirty |
| 56 | * with enough free segments. After then, we should do GC. | 30 | * dir/node pages without enough free segments. |
| 57 | */ | 31 | */ |
| 58 | if (need_to_flush(sbi)) { | ||
| 59 | sync_dirty_dir_inodes(sbi); | ||
| 60 | sync_node_pages(sbi, 0, &wbc); | ||
| 61 | } | ||
| 62 | |||
| 63 | if (has_not_enough_free_secs(sbi)) { | 32 | if (has_not_enough_free_secs(sbi)) { |
| 64 | mutex_lock(&sbi->gc_mutex); | 33 | mutex_lock(&sbi->gc_mutex); |
| 65 | f2fs_gc(sbi, 1); | 34 | f2fs_gc(sbi); |
| 66 | } | 35 | } |
| 67 | } | 36 | } |
| 68 | 37 | ||
| @@ -631,7 +600,6 @@ static void f2fs_end_io_write(struct bio *bio, int err) | |||
| 631 | if (page->mapping) | 600 | if (page->mapping) |
| 632 | set_bit(AS_EIO, &page->mapping->flags); | 601 | set_bit(AS_EIO, &page->mapping->flags); |
| 633 | set_ckpt_flags(p->sbi->ckpt, CP_ERROR_FLAG); | 602 | set_ckpt_flags(p->sbi->ckpt, CP_ERROR_FLAG); |
| 634 | set_page_dirty(page); | ||
| 635 | } | 603 | } |
| 636 | end_page_writeback(page); | 604 | end_page_writeback(page); |
| 637 | dec_page_count(p->sbi, F2FS_WRITEBACK); | 605 | dec_page_count(p->sbi, F2FS_WRITEBACK); |
| @@ -791,11 +759,10 @@ static int __get_segment_type(struct page *page, enum page_type p_type) | |||
| 791 | return __get_segment_type_2(page, p_type); | 759 | return __get_segment_type_2(page, p_type); |
| 792 | case 4: | 760 | case 4: |
| 793 | return __get_segment_type_4(page, p_type); | 761 | return __get_segment_type_4(page, p_type); |
| 794 | case 6: | ||
| 795 | return __get_segment_type_6(page, p_type); | ||
| 796 | default: | ||
| 797 | BUG(); | ||
| 798 | } | 762 | } |
| 763 | /* NR_CURSEG_TYPE(6) logs by default */ | ||
| 764 | BUG_ON(sbi->active_logs != NR_CURSEG_TYPE); | ||
| 765 | return __get_segment_type_6(page, p_type); | ||
| 799 | } | 766 | } |
| 800 | 767 | ||
| 801 | static void do_write_page(struct f2fs_sb_info *sbi, struct page *page, | 768 | static void do_write_page(struct f2fs_sb_info *sbi, struct page *page, |
| @@ -1608,7 +1575,6 @@ static int build_dirty_segmap(struct f2fs_sb_info *sbi) | |||
| 1608 | 1575 | ||
| 1609 | for (i = 0; i < NR_DIRTY_TYPE; i++) { | 1576 | for (i = 0; i < NR_DIRTY_TYPE; i++) { |
| 1610 | dirty_i->dirty_segmap[i] = kzalloc(bitmap_size, GFP_KERNEL); | 1577 | dirty_i->dirty_segmap[i] = kzalloc(bitmap_size, GFP_KERNEL); |
| 1611 | dirty_i->nr_dirty[i] = 0; | ||
| 1612 | if (!dirty_i->dirty_segmap[i]) | 1578 | if (!dirty_i->dirty_segmap[i]) |
| 1613 | return -ENOMEM; | 1579 | return -ENOMEM; |
| 1614 | } | 1580 | } |
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 0948405af6f5..66a288a52fd3 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h | |||
| @@ -459,7 +459,20 @@ static inline int get_ssr_segment(struct f2fs_sb_info *sbi, int type) | |||
| 459 | 459 | ||
| 460 | static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi) | 460 | static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi) |
| 461 | { | 461 | { |
| 462 | return free_sections(sbi) <= reserved_sections(sbi); | 462 | unsigned int pages_per_sec = (1 << sbi->log_blocks_per_seg) * |
| 463 | sbi->segs_per_sec; | ||
| 464 | int node_secs = ((get_pages(sbi, F2FS_DIRTY_NODES) + pages_per_sec - 1) | ||
| 465 | >> sbi->log_blocks_per_seg) / sbi->segs_per_sec; | ||
| 466 | int dent_secs = ((get_pages(sbi, F2FS_DIRTY_DENTS) + pages_per_sec - 1) | ||
| 467 | >> sbi->log_blocks_per_seg) / sbi->segs_per_sec; | ||
| 468 | |||
| 469 | if (sbi->por_doing) | ||
| 470 | return false; | ||
| 471 | |||
| 472 | if (free_sections(sbi) <= (node_secs + 2 * dent_secs + | ||
| 473 | reserved_sections(sbi))) | ||
| 474 | return true; | ||
| 475 | return false; | ||
| 463 | } | 476 | } |
| 464 | 477 | ||
| 465 | static inline int utilization(struct f2fs_sb_info *sbi) | 478 | static inline int utilization(struct f2fs_sb_info *sbi) |
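The check moved into has_not_enough_free_secs() estimates how many whole sections the currently dirty node and dentry pages would consume (with dentry pages weighted double) and compares the free sections against that demand plus the reserve; the por_doing short-circuit simply disables the check during recovery. Below is a hedged userspace model of the same arithmetic; the geometry (log_blocks_per_seg = 9, segs_per_sec = 1) and the dirty-page counts are made-up example values, not taken from a real superblock, and the recovery short-circuit is omitted.

    #include <stdbool.h>
    #include <stdio.h>

    /* Simplified model of has_not_enough_free_secs(); geometry values are
     * examples, not read from a real f2fs superblock. */
    struct geom {
            unsigned log_blocks_per_seg;  /* e.g. 9 -> 512 blocks per segment */
            unsigned segs_per_sec;        /* e.g. 1 segment per section */
    };

    static unsigned secs_needed(const struct geom *g, unsigned dirty_pages)
    {
            unsigned pages_per_sec = (1u << g->log_blocks_per_seg) * g->segs_per_sec;

            /* round up to whole sections, as the kernel helper does */
            return ((dirty_pages + pages_per_sec - 1) >> g->log_blocks_per_seg) /
                    g->segs_per_sec;
    }

    static bool not_enough_free_secs(const struct geom *g, unsigned free_secs,
                                     unsigned reserved_secs,
                                     unsigned dirty_nodes, unsigned dirty_dents)
    {
            unsigned node_secs = secs_needed(g, dirty_nodes);
            unsigned dent_secs = secs_needed(g, dirty_dents);

            /* dentry pages are weighted double, matching the kernel check */
            return free_secs <= node_secs + 2 * dent_secs + reserved_secs;
    }

    int main(void)
    {
            struct geom g = { .log_blocks_per_seg = 9, .segs_per_sec = 1 };

            /* 600 dirty node pages -> 2 sections, 100 dirty dentry pages -> 1 */
            printf("%d\n", not_enough_free_secs(&g, 10, 6, 600, 100)); /* 10 <= 2+2+6 -> 1 */
            printf("%d\n", not_enough_free_secs(&g, 12, 6, 600, 100)); /* 12 > 10   -> 0 */
            return 0;
    }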
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 13867322cf5a..37fad04c8669 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c | |||
| @@ -53,6 +53,18 @@ static match_table_t f2fs_tokens = { | |||
| 53 | {Opt_err, NULL}, | 53 | {Opt_err, NULL}, |
| 54 | }; | 54 | }; |
| 55 | 55 | ||
| 56 | void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...) | ||
| 57 | { | ||
| 58 | struct va_format vaf; | ||
| 59 | va_list args; | ||
| 60 | |||
| 61 | va_start(args, fmt); | ||
| 62 | vaf.fmt = fmt; | ||
| 63 | vaf.va = &args; | ||
| 64 | printk("%sF2FS-fs (%s): %pV\n", level, sb->s_id, &vaf); | ||
| 65 | va_end(args); | ||
| 66 | } | ||
| 67 | |||
| 56 | static void init_once(void *foo) | 68 | static void init_once(void *foo) |
| 57 | { | 69 | { |
| 58 | struct f2fs_inode_info *fi = (struct f2fs_inode_info *) foo; | 70 | struct f2fs_inode_info *fi = (struct f2fs_inode_info *) foo; |
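The new f2fs_msg() helper exists so every mount-time diagnostic carries the same "F2FS-fs (&lt;device&gt;):" prefix; it forwards the caller's format string through a struct va_format and %pV rather than formatting twice. The following is a rough userspace stand-in, assuming vfprintf() in place of printk() and a plain device-name string in place of sb->s_id; the "<3>"/"<6>" strings merely stand in for the KERN_ERR/KERN_INFO level prefixes, and the second call mirrors the blocksize message added later in this patch.

    #include <stdarg.h>
    #include <stdio.h>

    /* Userspace stand-in for the new f2fs_msg() helper: same calling
     * convention, but forwards to vfprintf() instead of printk()/%pV,
     * and takes the device name directly instead of sb->s_id. */
    static void f2fs_msg_demo(const char *level, const char *dev,
                              const char *fmt, ...)
    {
            va_list args;

            va_start(args, fmt);
            fprintf(stderr, "%sF2FS-fs (%s): ", level, dev);
            vfprintf(stderr, fmt, args);
            fputc('\n', stderr);
            va_end(args);
    }

    int main(void)
    {
            f2fs_msg_demo("<3>", "sdb1", "unable to read superblock");
            f2fs_msg_demo("<6>", "sdb1",
                          "Invalid blocksize (%u), supports only 4KB", 8192u);
            return 0;
    }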
| @@ -119,15 +131,16 @@ static void f2fs_put_super(struct super_block *sb) | |||
| 119 | int f2fs_sync_fs(struct super_block *sb, int sync) | 131 | int f2fs_sync_fs(struct super_block *sb, int sync) |
| 120 | { | 132 | { |
| 121 | struct f2fs_sb_info *sbi = F2FS_SB(sb); | 133 | struct f2fs_sb_info *sbi = F2FS_SB(sb); |
| 122 | int ret = 0; | ||
| 123 | 134 | ||
| 124 | if (!sbi->s_dirty && !get_pages(sbi, F2FS_DIRTY_NODES)) | 135 | if (!sbi->s_dirty && !get_pages(sbi, F2FS_DIRTY_NODES)) |
| 125 | return 0; | 136 | return 0; |
| 126 | 137 | ||
| 127 | if (sync) | 138 | if (sync) |
| 128 | write_checkpoint(sbi, false, false); | 139 | write_checkpoint(sbi, false, false); |
| 140 | else | ||
| 141 | f2fs_balance_fs(sbi); | ||
| 129 | 142 | ||
| 130 | return ret; | 143 | return 0; |
| 131 | } | 144 | } |
| 132 | 145 | ||
| 133 | static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) | 146 | static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) |
| @@ -148,8 +161,8 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) | |||
| 148 | buf->f_bfree = buf->f_blocks - valid_user_blocks(sbi) - ovp_count; | 161 | buf->f_bfree = buf->f_blocks - valid_user_blocks(sbi) - ovp_count; |
| 149 | buf->f_bavail = user_block_count - valid_user_blocks(sbi); | 162 | buf->f_bavail = user_block_count - valid_user_blocks(sbi); |
| 150 | 163 | ||
| 151 | buf->f_files = valid_inode_count(sbi); | 164 | buf->f_files = sbi->total_node_count; |
| 152 | buf->f_ffree = sbi->total_node_count - valid_node_count(sbi); | 165 | buf->f_ffree = sbi->total_node_count - valid_inode_count(sbi); |
| 153 | 166 | ||
| 154 | buf->f_namelen = F2FS_MAX_NAME_LEN; | 167 | buf->f_namelen = F2FS_MAX_NAME_LEN; |
| 155 | buf->f_fsid.val[0] = (u32)id; | 168 | buf->f_fsid.val[0] = (u32)id; |
| @@ -248,7 +261,8 @@ static const struct export_operations f2fs_export_ops = { | |||
| 248 | .get_parent = f2fs_get_parent, | 261 | .get_parent = f2fs_get_parent, |
| 249 | }; | 262 | }; |
| 250 | 263 | ||
| 251 | static int parse_options(struct f2fs_sb_info *sbi, char *options) | 264 | static int parse_options(struct super_block *sb, struct f2fs_sb_info *sbi, |
| 265 | char *options) | ||
| 252 | { | 266 | { |
| 253 | substring_t args[MAX_OPT_ARGS]; | 267 | substring_t args[MAX_OPT_ARGS]; |
| 254 | char *p; | 268 | char *p; |
| @@ -287,7 +301,8 @@ static int parse_options(struct f2fs_sb_info *sbi, char *options) | |||
| 287 | break; | 301 | break; |
| 288 | #else | 302 | #else |
| 289 | case Opt_nouser_xattr: | 303 | case Opt_nouser_xattr: |
| 290 | pr_info("nouser_xattr options not supported\n"); | 304 | f2fs_msg(sb, KERN_INFO, |
| 305 | "nouser_xattr options not supported"); | ||
| 291 | break; | 306 | break; |
| 292 | #endif | 307 | #endif |
| 293 | #ifdef CONFIG_F2FS_FS_POSIX_ACL | 308 | #ifdef CONFIG_F2FS_FS_POSIX_ACL |
| @@ -296,13 +311,13 @@ static int parse_options(struct f2fs_sb_info *sbi, char *options) | |||
| 296 | break; | 311 | break; |
| 297 | #else | 312 | #else |
| 298 | case Opt_noacl: | 313 | case Opt_noacl: |
| 299 | pr_info("noacl options not supported\n"); | 314 | f2fs_msg(sb, KERN_INFO, "noacl options not supported"); |
| 300 | break; | 315 | break; |
| 301 | #endif | 316 | #endif |
| 302 | case Opt_active_logs: | 317 | case Opt_active_logs: |
| 303 | if (args->from && match_int(args, &arg)) | 318 | if (args->from && match_int(args, &arg)) |
| 304 | return -EINVAL; | 319 | return -EINVAL; |
| 305 | if (arg != 2 && arg != 4 && arg != 6) | 320 | if (arg != 2 && arg != 4 && arg != NR_CURSEG_TYPE) |
| 306 | return -EINVAL; | 321 | return -EINVAL; |
| 307 | sbi->active_logs = arg; | 322 | sbi->active_logs = arg; |
| 308 | break; | 323 | break; |
| @@ -310,8 +325,9 @@ static int parse_options(struct f2fs_sb_info *sbi, char *options) | |||
| 310 | set_opt(sbi, DISABLE_EXT_IDENTIFY); | 325 | set_opt(sbi, DISABLE_EXT_IDENTIFY); |
| 311 | break; | 326 | break; |
| 312 | default: | 327 | default: |
| 313 | pr_err("Unrecognized mount option \"%s\" or missing value\n", | 328 | f2fs_msg(sb, KERN_ERR, |
| 314 | p); | 329 | "Unrecognized mount option \"%s\" or missing value", |
| 330 | p); | ||
| 315 | return -EINVAL; | 331 | return -EINVAL; |
| 316 | } | 332 | } |
| 317 | } | 333 | } |
| @@ -338,23 +354,36 @@ static loff_t max_file_size(unsigned bits) | |||
| 338 | return result; | 354 | return result; |
| 339 | } | 355 | } |
| 340 | 356 | ||
| 341 | static int sanity_check_raw_super(struct f2fs_super_block *raw_super) | 357 | static int sanity_check_raw_super(struct super_block *sb, |
| 358 | struct f2fs_super_block *raw_super) | ||
| 342 | { | 359 | { |
| 343 | unsigned int blocksize; | 360 | unsigned int blocksize; |
| 344 | 361 | ||
| 345 | if (F2FS_SUPER_MAGIC != le32_to_cpu(raw_super->magic)) | 362 | if (F2FS_SUPER_MAGIC != le32_to_cpu(raw_super->magic)) { |
| 363 | f2fs_msg(sb, KERN_INFO, | ||
| 364 | "Magic Mismatch, valid(0x%x) - read(0x%x)", | ||
| 365 | F2FS_SUPER_MAGIC, le32_to_cpu(raw_super->magic)); | ||
| 346 | return 1; | 366 | return 1; |
| 367 | } | ||
| 347 | 368 | ||
| 348 | /* Currently, support only 4KB block size */ | 369 | /* Currently, support only 4KB block size */ |
| 349 | blocksize = 1 << le32_to_cpu(raw_super->log_blocksize); | 370 | blocksize = 1 << le32_to_cpu(raw_super->log_blocksize); |
| 350 | if (blocksize != PAGE_CACHE_SIZE) | 371 | if (blocksize != PAGE_CACHE_SIZE) { |
| 372 | f2fs_msg(sb, KERN_INFO, | ||
| 373 | "Invalid blocksize (%u), supports only 4KB\n", | ||
| 374 | blocksize); | ||
| 351 | return 1; | 375 | return 1; |
| 376 | } | ||
| 352 | if (le32_to_cpu(raw_super->log_sectorsize) != | 377 | if (le32_to_cpu(raw_super->log_sectorsize) != |
| 353 | F2FS_LOG_SECTOR_SIZE) | 378 | F2FS_LOG_SECTOR_SIZE) { |
| 379 | f2fs_msg(sb, KERN_INFO, "Invalid log sectorsize"); | ||
| 354 | return 1; | 380 | return 1; |
| 381 | } | ||
| 355 | if (le32_to_cpu(raw_super->log_sectors_per_block) != | 382 | if (le32_to_cpu(raw_super->log_sectors_per_block) != |
| 356 | F2FS_LOG_SECTORS_PER_BLOCK) | 383 | F2FS_LOG_SECTORS_PER_BLOCK) { |
| 384 | f2fs_msg(sb, KERN_INFO, "Invalid log sectors per block"); | ||
| 357 | return 1; | 385 | return 1; |
| 386 | } | ||
| 358 | return 0; | 387 | return 0; |
| 359 | } | 388 | } |
| 360 | 389 | ||
| @@ -414,14 +443,17 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) | |||
| 414 | if (!sbi) | 443 | if (!sbi) |
| 415 | return -ENOMEM; | 444 | return -ENOMEM; |
| 416 | 445 | ||
| 417 | /* set a temporary block size */ | 446 | /* set a block size */ |
| 418 | if (!sb_set_blocksize(sb, F2FS_BLKSIZE)) | 447 | if (!sb_set_blocksize(sb, F2FS_BLKSIZE)) { |
| 448 | f2fs_msg(sb, KERN_ERR, "unable to set blocksize"); | ||
| 419 | goto free_sbi; | 449 | goto free_sbi; |
| 450 | } | ||
| 420 | 451 | ||
| 421 | /* read f2fs raw super block */ | 452 | /* read f2fs raw super block */ |
| 422 | raw_super_buf = sb_bread(sb, 0); | 453 | raw_super_buf = sb_bread(sb, 0); |
| 423 | if (!raw_super_buf) { | 454 | if (!raw_super_buf) { |
| 424 | err = -EIO; | 455 | err = -EIO; |
| 456 | f2fs_msg(sb, KERN_ERR, "unable to read superblock"); | ||
| 425 | goto free_sbi; | 457 | goto free_sbi; |
| 426 | } | 458 | } |
| 427 | raw_super = (struct f2fs_super_block *) | 459 | raw_super = (struct f2fs_super_block *) |
| @@ -439,12 +471,14 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) | |||
| 439 | set_opt(sbi, POSIX_ACL); | 471 | set_opt(sbi, POSIX_ACL); |
| 440 | #endif | 472 | #endif |
| 441 | /* parse mount options */ | 473 | /* parse mount options */ |
| 442 | if (parse_options(sbi, (char *)data)) | 474 | if (parse_options(sb, sbi, (char *)data)) |
| 443 | goto free_sb_buf; | 475 | goto free_sb_buf; |
| 444 | 476 | ||
| 445 | /* sanity checking of raw super */ | 477 | /* sanity checking of raw super */ |
| 446 | if (sanity_check_raw_super(raw_super)) | 478 | if (sanity_check_raw_super(sb, raw_super)) { |
| 479 | f2fs_msg(sb, KERN_ERR, "Can't find a valid F2FS filesystem"); | ||
| 447 | goto free_sb_buf; | 480 | goto free_sb_buf; |
| 481 | } | ||
| 448 | 482 | ||
| 449 | sb->s_maxbytes = max_file_size(le32_to_cpu(raw_super->log_blocksize)); | 483 | sb->s_maxbytes = max_file_size(le32_to_cpu(raw_super->log_blocksize)); |
| 450 | sb->s_max_links = F2FS_LINK_MAX; | 484 | sb->s_max_links = F2FS_LINK_MAX; |
| @@ -478,18 +512,23 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) | |||
| 478 | /* get an inode for meta space */ | 512 | /* get an inode for meta space */ |
| 479 | sbi->meta_inode = f2fs_iget(sb, F2FS_META_INO(sbi)); | 513 | sbi->meta_inode = f2fs_iget(sb, F2FS_META_INO(sbi)); |
| 480 | if (IS_ERR(sbi->meta_inode)) { | 514 | if (IS_ERR(sbi->meta_inode)) { |
| 515 | f2fs_msg(sb, KERN_ERR, "Failed to read F2FS meta data inode"); | ||
| 481 | err = PTR_ERR(sbi->meta_inode); | 516 | err = PTR_ERR(sbi->meta_inode); |
| 482 | goto free_sb_buf; | 517 | goto free_sb_buf; |
| 483 | } | 518 | } |
| 484 | 519 | ||
| 485 | err = get_valid_checkpoint(sbi); | 520 | err = get_valid_checkpoint(sbi); |
| 486 | if (err) | 521 | if (err) { |
| 522 | f2fs_msg(sb, KERN_ERR, "Failed to get valid F2FS checkpoint"); | ||
| 487 | goto free_meta_inode; | 523 | goto free_meta_inode; |
| 524 | } | ||
| 488 | 525 | ||
| 489 | /* sanity checking of checkpoint */ | 526 | /* sanity checking of checkpoint */ |
| 490 | err = -EINVAL; | 527 | err = -EINVAL; |
| 491 | if (sanity_check_ckpt(raw_super, sbi->ckpt)) | 528 | if (sanity_check_ckpt(raw_super, sbi->ckpt)) { |
| 529 | f2fs_msg(sb, KERN_ERR, "Invalid F2FS checkpoint"); | ||
| 492 | goto free_cp; | 530 | goto free_cp; |
| 531 | } | ||
| 493 | 532 | ||
| 494 | sbi->total_valid_node_count = | 533 | sbi->total_valid_node_count = |
| 495 | le32_to_cpu(sbi->ckpt->valid_node_count); | 534 | le32_to_cpu(sbi->ckpt->valid_node_count); |
| @@ -503,38 +542,41 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) | |||
| 503 | INIT_LIST_HEAD(&sbi->dir_inode_list); | 542 | INIT_LIST_HEAD(&sbi->dir_inode_list); |
| 504 | spin_lock_init(&sbi->dir_inode_lock); | 543 | spin_lock_init(&sbi->dir_inode_lock); |
| 505 | 544 | ||
| 506 | /* init super block */ | ||
| 507 | if (!sb_set_blocksize(sb, sbi->blocksize)) | ||
| 508 | goto free_cp; | ||
| 509 | |||
| 510 | init_orphan_info(sbi); | 545 | init_orphan_info(sbi); |
| 511 | 546 | ||
| 512 | /* setup f2fs internal modules */ | 547 | /* setup f2fs internal modules */ |
| 513 | err = build_segment_manager(sbi); | 548 | err = build_segment_manager(sbi); |
| 514 | if (err) | 549 | if (err) { |
| 550 | f2fs_msg(sb, KERN_ERR, | ||
| 551 | "Failed to initialize F2FS segment manager"); | ||
| 515 | goto free_sm; | 552 | goto free_sm; |
| 553 | } | ||
| 516 | err = build_node_manager(sbi); | 554 | err = build_node_manager(sbi); |
| 517 | if (err) | 555 | if (err) { |
| 556 | f2fs_msg(sb, KERN_ERR, | ||
| 557 | "Failed to initialize F2FS node manager"); | ||
| 518 | goto free_nm; | 558 | goto free_nm; |
| 559 | } | ||
| 519 | 560 | ||
| 520 | build_gc_manager(sbi); | 561 | build_gc_manager(sbi); |
| 521 | 562 | ||
| 522 | /* get an inode for node space */ | 563 | /* get an inode for node space */ |
| 523 | sbi->node_inode = f2fs_iget(sb, F2FS_NODE_INO(sbi)); | 564 | sbi->node_inode = f2fs_iget(sb, F2FS_NODE_INO(sbi)); |
| 524 | if (IS_ERR(sbi->node_inode)) { | 565 | if (IS_ERR(sbi->node_inode)) { |
| 566 | f2fs_msg(sb, KERN_ERR, "Failed to read node inode"); | ||
| 525 | err = PTR_ERR(sbi->node_inode); | 567 | err = PTR_ERR(sbi->node_inode); |
| 526 | goto free_nm; | 568 | goto free_nm; |
| 527 | } | 569 | } |
| 528 | 570 | ||
| 529 | /* if there are any orphan nodes, free them */ | 571 | /* if there are any orphan nodes, free them */ |
| 530 | err = -EINVAL; | 572 | err = -EINVAL; |
| 531 | if (!is_set_ckpt_flags(F2FS_CKPT(sbi), CP_UMOUNT_FLAG) && | 573 | if (recover_orphan_inodes(sbi)) |
| 532 | recover_orphan_inodes(sbi)) | ||
| 533 | goto free_node_inode; | 574 | goto free_node_inode; |
| 534 | 575 | ||
| 535 | /* read root inode and dentry */ | 576 | /* read root inode and dentry */ |
| 536 | root = f2fs_iget(sb, F2FS_ROOT_INO(sbi)); | 577 | root = f2fs_iget(sb, F2FS_ROOT_INO(sbi)); |
| 537 | if (IS_ERR(root)) { | 578 | if (IS_ERR(root)) { |
| 579 | f2fs_msg(sb, KERN_ERR, "Failed to read root inode"); | ||
| 538 | err = PTR_ERR(root); | 580 | err = PTR_ERR(root); |
| 539 | goto free_node_inode; | 581 | goto free_node_inode; |
| 540 | } | 582 | } |
| @@ -548,8 +590,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) | |||
| 548 | } | 590 | } |
| 549 | 591 | ||
| 550 | /* recover fsynced data */ | 592 | /* recover fsynced data */ |
| 551 | if (!is_set_ckpt_flags(F2FS_CKPT(sbi), CP_UMOUNT_FLAG) && | 593 | if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) |
| 552 | !test_opt(sbi, DISABLE_ROLL_FORWARD)) | ||
| 553 | recover_fsync_data(sbi); | 594 | recover_fsync_data(sbi); |
| 554 | 595 | ||
| 555 | /* After POR, we can run background GC thread */ | 596 | /* After POR, we can run background GC thread */ |
| @@ -599,7 +640,7 @@ static struct file_system_type f2fs_fs_type = { | |||
| 599 | .fs_flags = FS_REQUIRES_DEV, | 640 | .fs_flags = FS_REQUIRES_DEV, |
| 600 | }; | 641 | }; |
| 601 | 642 | ||
| 602 | static int init_inodecache(void) | 643 | static int __init init_inodecache(void) |
| 603 | { | 644 | { |
| 604 | f2fs_inode_cachep = f2fs_kmem_cache_create("f2fs_inode_cache", | 645 | f2fs_inode_cachep = f2fs_kmem_cache_create("f2fs_inode_cache", |
| 605 | sizeof(struct f2fs_inode_info), NULL); | 646 | sizeof(struct f2fs_inode_info), NULL); |
| @@ -634,14 +675,17 @@ static int __init init_f2fs_fs(void) | |||
| 634 | err = create_checkpoint_caches(); | 675 | err = create_checkpoint_caches(); |
| 635 | if (err) | 676 | if (err) |
| 636 | goto fail; | 677 | goto fail; |
| 637 | return register_filesystem(&f2fs_fs_type); | 678 | err = register_filesystem(&f2fs_fs_type); |
| 679 | if (err) | ||
| 680 | goto fail; | ||
| 681 | f2fs_create_root_stats(); | ||
| 638 | fail: | 682 | fail: |
| 639 | return err; | 683 | return err; |
| 640 | } | 684 | } |
| 641 | 685 | ||
| 642 | static void __exit exit_f2fs_fs(void) | 686 | static void __exit exit_f2fs_fs(void) |
| 643 | { | 687 | { |
| 644 | destroy_root_stats(); | 688 | f2fs_destroy_root_stats(); |
| 645 | unregister_filesystem(&f2fs_fs_type); | 689 | unregister_filesystem(&f2fs_fs_type); |
| 646 | destroy_checkpoint_caches(); | 690 | destroy_checkpoint_caches(); |
| 647 | destroy_gc_caches(); | 691 | destroy_gc_caches(); |
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 7d52e8dc0c59..8038c0496504 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c | |||
| @@ -208,7 +208,7 @@ int f2fs_getxattr(struct inode *inode, int name_index, const char *name, | |||
| 208 | struct page *page; | 208 | struct page *page; |
| 209 | void *base_addr; | 209 | void *base_addr; |
| 210 | int error = 0, found = 0; | 210 | int error = 0, found = 0; |
| 211 | int value_len, name_len; | 211 | size_t value_len, name_len; |
| 212 | 212 | ||
| 213 | if (name == NULL) | 213 | if (name == NULL) |
| 214 | return -EINVAL; | 214 | return -EINVAL; |
| @@ -304,7 +304,8 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name, | |||
| 304 | struct f2fs_xattr_entry *here, *last; | 304 | struct f2fs_xattr_entry *here, *last; |
| 305 | struct page *page; | 305 | struct page *page; |
| 306 | void *base_addr; | 306 | void *base_addr; |
| 307 | int error, found, free, name_len, newsize; | 307 | int error, found, free, newsize; |
| 308 | size_t name_len; | ||
| 308 | char *pval; | 309 | char *pval; |
| 309 | 310 | ||
| 310 | if (name == NULL) | 311 | if (name == NULL) |
| @@ -317,6 +318,8 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name, | |||
| 317 | if (name_len > 255 || value_len > MAX_VALUE_LEN) | 318 | if (name_len > 255 || value_len > MAX_VALUE_LEN) |
| 318 | return -ERANGE; | 319 | return -ERANGE; |
| 319 | 320 | ||
| 321 | f2fs_balance_fs(sbi); | ||
| 322 | |||
| 320 | mutex_lock_op(sbi, NODE_NEW); | 323 | mutex_lock_op(sbi, NODE_NEW); |
| 321 | if (!fi->i_xattr_nid) { | 324 | if (!fi->i_xattr_nid) { |
| 322 | /* Allocate new attribute block */ | 325 | /* Allocate new attribute block */ |
| @@ -490,7 +490,7 @@ void exit_files(struct task_struct *tsk) | |||
| 490 | } | 490 | } |
| 491 | } | 491 | } |
| 492 | 492 | ||
| 493 | static void __devinit fdtable_defer_list_init(int cpu) | 493 | static void fdtable_defer_list_init(int cpu) |
| 494 | { | 494 | { |
| 495 | struct fdtable_defer *fddef = &per_cpu(fdtable_defer_list, cpu); | 495 | struct fdtable_defer *fddef = &per_cpu(fdtable_defer_list, cpu); |
| 496 | spin_lock_init(&fddef->lock); | 496 | spin_lock_init(&fddef->lock); |
diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig index 0cf160a94eda..1b2f6c2c3aaf 100644 --- a/fs/fuse/Kconfig +++ b/fs/fuse/Kconfig | |||
| @@ -4,12 +4,24 @@ config FUSE_FS | |||
| 4 | With FUSE it is possible to implement a fully functional filesystem | 4 | With FUSE it is possible to implement a fully functional filesystem |
| 5 | in a userspace program. | 5 | in a userspace program. |
| 6 | 6 | ||
| 7 | There's also companion library: libfuse. This library along with | 7 | There's also a companion library: libfuse2. This library is available |
| 8 | utilities is available from the FUSE homepage: | 8 | from the FUSE homepage: |
| 9 | <http://fuse.sourceforge.net/> | 9 | <http://fuse.sourceforge.net/> |
| 10 | although chances are your distribution already has that library | ||
| 11 | installed if you've installed the "fuse" package itself. | ||
| 10 | 12 | ||
| 11 | See <file:Documentation/filesystems/fuse.txt> for more information. | 13 | See <file:Documentation/filesystems/fuse.txt> for more information. |
| 12 | See <file:Documentation/Changes> for needed library/utility version. | 14 | See <file:Documentation/Changes> for needed library/utility version. |
| 13 | 15 | ||
| 14 | If you want to develop a userspace FS, or if you want to use | 16 | If you want to develop a userspace FS, or if you want to use |
| 15 | a filesystem based on FUSE, answer Y or M. | 17 | a filesystem based on FUSE, answer Y or M. |
| 18 | |||
| 19 | config CUSE | ||
| 20 | tristate "Character device in Userspace support" | ||
| 21 | depends on FUSE_FS | ||
| 22 | help | ||
| 23 | This FUSE extension allows character devices to be | ||
| 24 | implemented in userspace. | ||
| 25 | |||
| 26 | If you want to develop or use a userspace character device | ||
| 27 | based on CUSE, answer Y or M. | ||
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index ee8d55042298..6f96a8def147 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c | |||
| @@ -45,7 +45,6 @@ | |||
| 45 | #include <linux/miscdevice.h> | 45 | #include <linux/miscdevice.h> |
| 46 | #include <linux/mutex.h> | 46 | #include <linux/mutex.h> |
| 47 | #include <linux/slab.h> | 47 | #include <linux/slab.h> |
| 48 | #include <linux/spinlock.h> | ||
| 49 | #include <linux/stat.h> | 48 | #include <linux/stat.h> |
| 50 | #include <linux/module.h> | 49 | #include <linux/module.h> |
| 51 | 50 | ||
| @@ -63,7 +62,7 @@ struct cuse_conn { | |||
| 63 | bool unrestricted_ioctl; | 62 | bool unrestricted_ioctl; |
| 64 | }; | 63 | }; |
| 65 | 64 | ||
| 66 | static DEFINE_SPINLOCK(cuse_lock); /* protects cuse_conntbl */ | 65 | static DEFINE_MUTEX(cuse_lock); /* protects registration */ |
| 67 | static struct list_head cuse_conntbl[CUSE_CONNTBL_LEN]; | 66 | static struct list_head cuse_conntbl[CUSE_CONNTBL_LEN]; |
| 68 | static struct class *cuse_class; | 67 | static struct class *cuse_class; |
| 69 | 68 | ||
| @@ -92,19 +91,22 @@ static ssize_t cuse_read(struct file *file, char __user *buf, size_t count, | |||
| 92 | loff_t *ppos) | 91 | loff_t *ppos) |
| 93 | { | 92 | { |
| 94 | loff_t pos = 0; | 93 | loff_t pos = 0; |
| 94 | struct iovec iov = { .iov_base = buf, .iov_len = count }; | ||
| 95 | 95 | ||
| 96 | return fuse_direct_io(file, buf, count, &pos, 0); | 96 | return fuse_direct_io(file, &iov, 1, count, &pos, 0); |
| 97 | } | 97 | } |
| 98 | 98 | ||
| 99 | static ssize_t cuse_write(struct file *file, const char __user *buf, | 99 | static ssize_t cuse_write(struct file *file, const char __user *buf, |
| 100 | size_t count, loff_t *ppos) | 100 | size_t count, loff_t *ppos) |
| 101 | { | 101 | { |
| 102 | loff_t pos = 0; | 102 | loff_t pos = 0; |
| 103 | struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = count }; | ||
| 104 | |||
| 103 | /* | 105 | /* |
| 104 | * No locking or generic_write_checks(), the server is | 106 | * No locking or generic_write_checks(), the server is |
| 105 | * responsible for locking and sanity checks. | 107 | * responsible for locking and sanity checks. |
| 106 | */ | 108 | */ |
| 107 | return fuse_direct_io(file, buf, count, &pos, 1); | 109 | return fuse_direct_io(file, &iov, 1, count, &pos, 1); |
| 108 | } | 110 | } |
| 109 | 111 | ||
| 110 | static int cuse_open(struct inode *inode, struct file *file) | 112 | static int cuse_open(struct inode *inode, struct file *file) |
| @@ -114,14 +116,14 @@ static int cuse_open(struct inode *inode, struct file *file) | |||
| 114 | int rc; | 116 | int rc; |
| 115 | 117 | ||
| 116 | /* look up and get the connection */ | 118 | /* look up and get the connection */ |
| 117 | spin_lock(&cuse_lock); | 119 | mutex_lock(&cuse_lock); |
| 118 | list_for_each_entry(pos, cuse_conntbl_head(devt), list) | 120 | list_for_each_entry(pos, cuse_conntbl_head(devt), list) |
| 119 | if (pos->dev->devt == devt) { | 121 | if (pos->dev->devt == devt) { |
| 120 | fuse_conn_get(&pos->fc); | 122 | fuse_conn_get(&pos->fc); |
| 121 | cc = pos; | 123 | cc = pos; |
| 122 | break; | 124 | break; |
| 123 | } | 125 | } |
| 124 | spin_unlock(&cuse_lock); | 126 | mutex_unlock(&cuse_lock); |
| 125 | 127 | ||
| 126 | /* dead? */ | 128 | /* dead? */ |
| 127 | if (!cc) | 129 | if (!cc) |
| @@ -267,7 +269,7 @@ static int cuse_parse_one(char **pp, char *end, char **keyp, char **valp) | |||
| 267 | static int cuse_parse_devinfo(char *p, size_t len, struct cuse_devinfo *devinfo) | 269 | static int cuse_parse_devinfo(char *p, size_t len, struct cuse_devinfo *devinfo) |
| 268 | { | 270 | { |
| 269 | char *end = p + len; | 271 | char *end = p + len; |
| 270 | char *key, *val; | 272 | char *uninitialized_var(key), *uninitialized_var(val); |
| 271 | int rc; | 273 | int rc; |
| 272 | 274 | ||
| 273 | while (true) { | 275 | while (true) { |
| @@ -305,14 +307,14 @@ static void cuse_gendev_release(struct device *dev) | |||
| 305 | */ | 307 | */ |
| 306 | static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req) | 308 | static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req) |
| 307 | { | 309 | { |
| 308 | struct cuse_conn *cc = fc_to_cc(fc); | 310 | struct cuse_conn *cc = fc_to_cc(fc), *pos; |
| 309 | struct cuse_init_out *arg = req->out.args[0].value; | 311 | struct cuse_init_out *arg = req->out.args[0].value; |
| 310 | struct page *page = req->pages[0]; | 312 | struct page *page = req->pages[0]; |
| 311 | struct cuse_devinfo devinfo = { }; | 313 | struct cuse_devinfo devinfo = { }; |
| 312 | struct device *dev; | 314 | struct device *dev; |
| 313 | struct cdev *cdev; | 315 | struct cdev *cdev; |
| 314 | dev_t devt; | 316 | dev_t devt; |
| 315 | int rc; | 317 | int rc, i; |
| 316 | 318 | ||
| 317 | if (req->out.h.error || | 319 | if (req->out.h.error || |
| 318 | arg->major != FUSE_KERNEL_VERSION || arg->minor < 11) { | 320 | arg->major != FUSE_KERNEL_VERSION || arg->minor < 11) { |
| @@ -356,15 +358,24 @@ static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req) | |||
| 356 | dev_set_drvdata(dev, cc); | 358 | dev_set_drvdata(dev, cc); |
| 357 | dev_set_name(dev, "%s", devinfo.name); | 359 | dev_set_name(dev, "%s", devinfo.name); |
| 358 | 360 | ||
| 361 | mutex_lock(&cuse_lock); | ||
| 362 | |||
| 363 | /* make sure the device-name is unique */ | ||
| 364 | for (i = 0; i < CUSE_CONNTBL_LEN; ++i) { | ||
| 365 | list_for_each_entry(pos, &cuse_conntbl[i], list) | ||
| 366 | if (!strcmp(dev_name(pos->dev), dev_name(dev))) | ||
| 367 | goto err_unlock; | ||
| 368 | } | ||
| 369 | |||
| 359 | rc = device_add(dev); | 370 | rc = device_add(dev); |
| 360 | if (rc) | 371 | if (rc) |
| 361 | goto err_device; | 372 | goto err_unlock; |
| 362 | 373 | ||
| 363 | /* register cdev */ | 374 | /* register cdev */ |
| 364 | rc = -ENOMEM; | 375 | rc = -ENOMEM; |
| 365 | cdev = cdev_alloc(); | 376 | cdev = cdev_alloc(); |
| 366 | if (!cdev) | 377 | if (!cdev) |
| 367 | goto err_device; | 378 | goto err_unlock; |
| 368 | 379 | ||
| 369 | cdev->owner = THIS_MODULE; | 380 | cdev->owner = THIS_MODULE; |
| 370 | cdev->ops = &cuse_frontend_fops; | 381 | cdev->ops = &cuse_frontend_fops; |
| @@ -377,9 +388,8 @@ static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req) | |||
| 377 | cc->cdev = cdev; | 388 | cc->cdev = cdev; |
| 378 | 389 | ||
| 379 | /* make the device available */ | 390 | /* make the device available */ |
| 380 | spin_lock(&cuse_lock); | ||
| 381 | list_add(&cc->list, cuse_conntbl_head(devt)); | 391 | list_add(&cc->list, cuse_conntbl_head(devt)); |
| 382 | spin_unlock(&cuse_lock); | 392 | mutex_unlock(&cuse_lock); |
| 383 | 393 | ||
| 384 | /* announce device availability */ | 394 | /* announce device availability */ |
| 385 | dev_set_uevent_suppress(dev, 0); | 395 | dev_set_uevent_suppress(dev, 0); |
| @@ -391,7 +401,8 @@ out: | |||
| 391 | 401 | ||
| 392 | err_cdev: | 402 | err_cdev: |
| 393 | cdev_del(cdev); | 403 | cdev_del(cdev); |
| 394 | err_device: | 404 | err_unlock: |
| 405 | mutex_unlock(&cuse_lock); | ||
| 395 | put_device(dev); | 406 | put_device(dev); |
| 396 | err_region: | 407 | err_region: |
| 397 | unregister_chrdev_region(devt, 1); | 408 | unregister_chrdev_region(devt, 1); |
| @@ -411,7 +422,7 @@ static int cuse_send_init(struct cuse_conn *cc) | |||
| 411 | 422 | ||
| 412 | BUILD_BUG_ON(CUSE_INIT_INFO_MAX > PAGE_SIZE); | 423 | BUILD_BUG_ON(CUSE_INIT_INFO_MAX > PAGE_SIZE); |
| 413 | 424 | ||
| 414 | req = fuse_get_req(fc); | 425 | req = fuse_get_req(fc, 1); |
| 415 | if (IS_ERR(req)) { | 426 | if (IS_ERR(req)) { |
| 416 | rc = PTR_ERR(req); | 427 | rc = PTR_ERR(req); |
| 417 | goto err; | 428 | goto err; |
| @@ -441,6 +452,7 @@ static int cuse_send_init(struct cuse_conn *cc) | |||
| 441 | req->out.argvar = 1; | 452 | req->out.argvar = 1; |
| 442 | req->out.argpages = 1; | 453 | req->out.argpages = 1; |
| 443 | req->pages[0] = page; | 454 | req->pages[0] = page; |
| 455 | req->page_descs[0].length = req->out.args[1].size; | ||
| 444 | req->num_pages = 1; | 456 | req->num_pages = 1; |
| 445 | req->end = cuse_process_init_reply; | 457 | req->end = cuse_process_init_reply; |
| 446 | fuse_request_send_background(fc, req); | 458 | fuse_request_send_background(fc, req); |
| @@ -520,9 +532,9 @@ static int cuse_channel_release(struct inode *inode, struct file *file) | |||
| 520 | int rc; | 532 | int rc; |
| 521 | 533 | ||
| 522 | /* remove from the conntbl, no more access from this point on */ | 534 | /* remove from the conntbl, no more access from this point on */ |
| 523 | spin_lock(&cuse_lock); | 535 | mutex_lock(&cuse_lock); |
| 524 | list_del_init(&cc->list); | 536 | list_del_init(&cc->list); |
| 525 | spin_unlock(&cuse_lock); | 537 | mutex_unlock(&cuse_lock); |
| 526 | 538 | ||
| 527 | /* remove device */ | 539 | /* remove device */ |
| 528 | if (cc->dev) | 540 | if (cc->dev) |
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index c16335315e5d..e9bdec0b16d9 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c | |||
| @@ -34,34 +34,67 @@ static struct fuse_conn *fuse_get_conn(struct file *file) | |||
| 34 | return file->private_data; | 34 | return file->private_data; |
| 35 | } | 35 | } |
| 36 | 36 | ||
| 37 | static void fuse_request_init(struct fuse_req *req) | 37 | static void fuse_request_init(struct fuse_req *req, struct page **pages, |
| 38 | struct fuse_page_desc *page_descs, | ||
| 39 | unsigned npages) | ||
| 38 | { | 40 | { |
| 39 | memset(req, 0, sizeof(*req)); | 41 | memset(req, 0, sizeof(*req)); |
| 42 | memset(pages, 0, sizeof(*pages) * npages); | ||
| 43 | memset(page_descs, 0, sizeof(*page_descs) * npages); | ||
| 40 | INIT_LIST_HEAD(&req->list); | 44 | INIT_LIST_HEAD(&req->list); |
| 41 | INIT_LIST_HEAD(&req->intr_entry); | 45 | INIT_LIST_HEAD(&req->intr_entry); |
| 42 | init_waitqueue_head(&req->waitq); | 46 | init_waitqueue_head(&req->waitq); |
| 43 | atomic_set(&req->count, 1); | 47 | atomic_set(&req->count, 1); |
| 48 | req->pages = pages; | ||
| 49 | req->page_descs = page_descs; | ||
| 50 | req->max_pages = npages; | ||
| 44 | } | 51 | } |
| 45 | 52 | ||
| 46 | struct fuse_req *fuse_request_alloc(void) | 53 | static struct fuse_req *__fuse_request_alloc(unsigned npages, gfp_t flags) |
| 47 | { | 54 | { |
| 48 | struct fuse_req *req = kmem_cache_alloc(fuse_req_cachep, GFP_KERNEL); | 55 | struct fuse_req *req = kmem_cache_alloc(fuse_req_cachep, flags); |
| 49 | if (req) | 56 | if (req) { |
| 50 | fuse_request_init(req); | 57 | struct page **pages; |
| 58 | struct fuse_page_desc *page_descs; | ||
| 59 | |||
| 60 | if (npages <= FUSE_REQ_INLINE_PAGES) { | ||
| 61 | pages = req->inline_pages; | ||
| 62 | page_descs = req->inline_page_descs; | ||
| 63 | } else { | ||
| 64 | pages = kmalloc(sizeof(struct page *) * npages, flags); | ||
| 65 | page_descs = kmalloc(sizeof(struct fuse_page_desc) * | ||
| 66 | npages, flags); | ||
| 67 | } | ||
| 68 | |||
| 69 | if (!pages || !page_descs) { | ||
| 70 | kfree(pages); | ||
| 71 | kfree(page_descs); | ||
| 72 | kmem_cache_free(fuse_req_cachep, req); | ||
| 73 | return NULL; | ||
| 74 | } | ||
| 75 | |||
| 76 | fuse_request_init(req, pages, page_descs, npages); | ||
| 77 | } | ||
| 51 | return req; | 78 | return req; |
| 52 | } | 79 | } |
| 80 | |||
| 81 | struct fuse_req *fuse_request_alloc(unsigned npages) | ||
| 82 | { | ||
| 83 | return __fuse_request_alloc(npages, GFP_KERNEL); | ||
| 84 | } | ||
| 53 | EXPORT_SYMBOL_GPL(fuse_request_alloc); | 85 | EXPORT_SYMBOL_GPL(fuse_request_alloc); |
| 54 | 86 | ||
| 55 | struct fuse_req *fuse_request_alloc_nofs(void) | 87 | struct fuse_req *fuse_request_alloc_nofs(unsigned npages) |
| 56 | { | 88 | { |
| 57 | struct fuse_req *req = kmem_cache_alloc(fuse_req_cachep, GFP_NOFS); | 89 | return __fuse_request_alloc(npages, GFP_NOFS); |
| 58 | if (req) | ||
| 59 | fuse_request_init(req); | ||
| 60 | return req; | ||
| 61 | } | 90 | } |
| 62 | 91 | ||
| 63 | void fuse_request_free(struct fuse_req *req) | 92 | void fuse_request_free(struct fuse_req *req) |
| 64 | { | 93 | { |
| 94 | if (req->pages != req->inline_pages) { | ||
| 95 | kfree(req->pages); | ||
| 96 | kfree(req->page_descs); | ||
| 97 | } | ||
| 65 | kmem_cache_free(fuse_req_cachep, req); | 98 | kmem_cache_free(fuse_req_cachep, req); |
| 66 | } | 99 | } |
| 67 | 100 | ||
| @@ -97,7 +130,7 @@ static void fuse_req_init_context(struct fuse_req *req) | |||
| 97 | req->in.h.pid = current->pid; | 130 | req->in.h.pid = current->pid; |
| 98 | } | 131 | } |
| 99 | 132 | ||
| 100 | struct fuse_req *fuse_get_req(struct fuse_conn *fc) | 133 | struct fuse_req *fuse_get_req(struct fuse_conn *fc, unsigned npages) |
| 101 | { | 134 | { |
| 102 | struct fuse_req *req; | 135 | struct fuse_req *req; |
| 103 | sigset_t oldset; | 136 | sigset_t oldset; |
| @@ -116,7 +149,7 @@ struct fuse_req *fuse_get_req(struct fuse_conn *fc) | |||
| 116 | if (!fc->connected) | 149 | if (!fc->connected) |
| 117 | goto out; | 150 | goto out; |
| 118 | 151 | ||
| 119 | req = fuse_request_alloc(); | 152 | req = fuse_request_alloc(npages); |
| 120 | err = -ENOMEM; | 153 | err = -ENOMEM; |
| 121 | if (!req) | 154 | if (!req) |
| 122 | goto out; | 155 | goto out; |
| @@ -165,7 +198,7 @@ static void put_reserved_req(struct fuse_conn *fc, struct fuse_req *req) | |||
| 165 | struct fuse_file *ff = file->private_data; | 198 | struct fuse_file *ff = file->private_data; |
| 166 | 199 | ||
| 167 | spin_lock(&fc->lock); | 200 | spin_lock(&fc->lock); |
| 168 | fuse_request_init(req); | 201 | fuse_request_init(req, req->pages, req->page_descs, req->max_pages); |
| 169 | BUG_ON(ff->reserved_req); | 202 | BUG_ON(ff->reserved_req); |
| 170 | ff->reserved_req = req; | 203 | ff->reserved_req = req; |
| 171 | wake_up_all(&fc->reserved_req_waitq); | 204 | wake_up_all(&fc->reserved_req_waitq); |
| @@ -186,13 +219,14 @@ static void put_reserved_req(struct fuse_conn *fc, struct fuse_req *req) | |||
| 186 | * filesystem should not have it's own file open. If deadlock is | 219 | * filesystem should not have it's own file open. If deadlock is |
| 187 | * intentional, it can still be broken by "aborting" the filesystem. | 220 | * intentional, it can still be broken by "aborting" the filesystem. |
| 188 | */ | 221 | */ |
| 189 | struct fuse_req *fuse_get_req_nofail(struct fuse_conn *fc, struct file *file) | 222 | struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc, |
| 223 | struct file *file) | ||
| 190 | { | 224 | { |
| 191 | struct fuse_req *req; | 225 | struct fuse_req *req; |
| 192 | 226 | ||
| 193 | atomic_inc(&fc->num_waiting); | 227 | atomic_inc(&fc->num_waiting); |
| 194 | wait_event(fc->blocked_waitq, !fc->blocked); | 228 | wait_event(fc->blocked_waitq, !fc->blocked); |
| 195 | req = fuse_request_alloc(); | 229 | req = fuse_request_alloc(0); |
| 196 | if (!req) | 230 | if (!req) |
| 197 | req = get_reserved_req(fc, file); | 231 | req = get_reserved_req(fc, file); |
| 198 | 232 | ||
| @@ -406,9 +440,8 @@ __acquires(fc->lock) | |||
| 406 | } | 440 | } |
| 407 | } | 441 | } |
| 408 | 442 | ||
| 409 | void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req) | 443 | static void __fuse_request_send(struct fuse_conn *fc, struct fuse_req *req) |
| 410 | { | 444 | { |
| 411 | req->isreply = 1; | ||
| 412 | spin_lock(&fc->lock); | 445 | spin_lock(&fc->lock); |
| 413 | if (!fc->connected) | 446 | if (!fc->connected) |
| 414 | req->out.h.error = -ENOTCONN; | 447 | req->out.h.error = -ENOTCONN; |
| @@ -425,6 +458,12 @@ void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req) | |||
| 425 | } | 458 | } |
| 426 | spin_unlock(&fc->lock); | 459 | spin_unlock(&fc->lock); |
| 427 | } | 460 | } |
| 461 | |||
| 462 | void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req) | ||
| 463 | { | ||
| 464 | req->isreply = 1; | ||
| 465 | __fuse_request_send(fc, req); | ||
| 466 | } | ||
| 428 | EXPORT_SYMBOL_GPL(fuse_request_send); | 467 | EXPORT_SYMBOL_GPL(fuse_request_send); |
| 429 | 468 | ||
| 430 | static void fuse_request_send_nowait_locked(struct fuse_conn *fc, | 469 | static void fuse_request_send_nowait_locked(struct fuse_conn *fc, |
| @@ -491,6 +530,27 @@ void fuse_request_send_background_locked(struct fuse_conn *fc, | |||
| 491 | fuse_request_send_nowait_locked(fc, req); | 530 | fuse_request_send_nowait_locked(fc, req); |
| 492 | } | 531 | } |
| 493 | 532 | ||
| 533 | void fuse_force_forget(struct file *file, u64 nodeid) | ||
| 534 | { | ||
| 535 | struct inode *inode = file->f_path.dentry->d_inode; | ||
| 536 | struct fuse_conn *fc = get_fuse_conn(inode); | ||
| 537 | struct fuse_req *req; | ||
| 538 | struct fuse_forget_in inarg; | ||
| 539 | |||
| 540 | memset(&inarg, 0, sizeof(inarg)); | ||
| 541 | inarg.nlookup = 1; | ||
| 542 | req = fuse_get_req_nofail_nopages(fc, file); | ||
| 543 | req->in.h.opcode = FUSE_FORGET; | ||
| 544 | req->in.h.nodeid = nodeid; | ||
| 545 | req->in.numargs = 1; | ||
| 546 | req->in.args[0].size = sizeof(inarg); | ||
| 547 | req->in.args[0].value = &inarg; | ||
| 548 | req->isreply = 0; | ||
| 549 | __fuse_request_send(fc, req); | ||
| 550 | /* ignore errors */ | ||
| 551 | fuse_put_request(fc, req); | ||
| 552 | } | ||
| 553 | |||
| 494 | /* | 554 | /* |
| 495 | * Lock the request. Up to the next unlock_request() there mustn't be | 555 | * Lock the request. Up to the next unlock_request() there mustn't be |
| 496 | * anything that could cause a page-fault. If the request was already | 556 | * anything that could cause a page-fault. If the request was already |
| @@ -692,8 +752,6 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) | |||
| 692 | struct page *oldpage = *pagep; | 752 | struct page *oldpage = *pagep; |
| 693 | struct page *newpage; | 753 | struct page *newpage; |
| 694 | struct pipe_buffer *buf = cs->pipebufs; | 754 | struct pipe_buffer *buf = cs->pipebufs; |
| 695 | struct address_space *mapping; | ||
| 696 | pgoff_t index; | ||
| 697 | 755 | ||
| 698 | unlock_request(cs->fc, cs->req); | 756 | unlock_request(cs->fc, cs->req); |
| 699 | fuse_copy_finish(cs); | 757 | fuse_copy_finish(cs); |
| @@ -724,9 +782,6 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) | |||
| 724 | if (fuse_check_page(newpage) != 0) | 782 | if (fuse_check_page(newpage) != 0) |
| 725 | goto out_fallback_unlock; | 783 | goto out_fallback_unlock; |
| 726 | 784 | ||
| 727 | mapping = oldpage->mapping; | ||
| 728 | index = oldpage->index; | ||
| 729 | |||
| 730 | /* | 785 | /* |
| 731 | * This is a new and locked page, it shouldn't be mapped or | 786 | * This is a new and locked page, it shouldn't be mapped or |
| 732 | * have any special flags on it | 787 | * have any special flags on it |
| @@ -855,11 +910,11 @@ static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes, | |||
| 855 | { | 910 | { |
| 856 | unsigned i; | 911 | unsigned i; |
| 857 | struct fuse_req *req = cs->req; | 912 | struct fuse_req *req = cs->req; |
| 858 | unsigned offset = req->page_offset; | ||
| 859 | unsigned count = min(nbytes, (unsigned) PAGE_SIZE - offset); | ||
| 860 | 913 | ||
| 861 | for (i = 0; i < req->num_pages && (nbytes || zeroing); i++) { | 914 | for (i = 0; i < req->num_pages && (nbytes || zeroing); i++) { |
| 862 | int err; | 915 | int err; |
| 916 | unsigned offset = req->page_descs[i].offset; | ||
| 917 | unsigned count = min(nbytes, req->page_descs[i].length); | ||
| 863 | 918 | ||
| 864 | err = fuse_copy_page(cs, &req->pages[i], offset, count, | 919 | err = fuse_copy_page(cs, &req->pages[i], offset, count, |
| 865 | zeroing); | 920 | zeroing); |
| @@ -867,8 +922,6 @@ static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes, | |||
| 867 | return err; | 922 | return err; |
| 868 | 923 | ||
| 869 | nbytes -= count; | 924 | nbytes -= count; |
| 870 | count = min(nbytes, (unsigned) PAGE_SIZE); | ||
| 871 | offset = 0; | ||
| 872 | } | 925 | } |
| 873 | return 0; | 926 | return 0; |
| 874 | } | 927 | } |
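The fuse_copy_pages() hunks above drop the single req->page_offset and drive the copy loop from a per-page (offset, length) descriptor, so each page of a request can carry an independent extent. A standalone sketch of a copy loop consuming such a descriptor array, with simplified types and an assumed 4 KiB page size (not kernel code):

#include <stddef.h>
#include <stdio.h>

#define DEMO_PAGE_SIZE 4096u

struct demo_page_desc {
	unsigned int length;    /* bytes used within this page */
	unsigned int offset;    /* starting offset within this page */
};

/* Copy at most nbytes, consuming one per-page descriptor per iteration. */
static size_t demo_copy_pages(const struct demo_page_desc *descs,
			      size_t num_pages, size_t nbytes)
{
	size_t copied = 0;

	for (size_t i = 0; i < num_pages && nbytes; i++) {
		unsigned int count = descs[i].length;

		if (count > nbytes)
			count = (unsigned int)nbytes;
		printf("page %zu: copy %u bytes at offset %u\n",
		       i, count, descs[i].offset);
		copied += count;
		nbytes -= count;
	}
	return copied;
}

int main(void)
{
	/* First page starts mid-page; the others begin at offset 0. */
	struct demo_page_desc descs[] = {
		{ DEMO_PAGE_SIZE - 100, 100 },
		{ DEMO_PAGE_SIZE, 0 },
		{ 512, 0 },
	};

	printf("copied %zu bytes\n", demo_copy_pages(descs, 3, 6000));
	return 0;
}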
| @@ -1541,29 +1594,34 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, | |||
| 1541 | unsigned int num; | 1594 | unsigned int num; |
| 1542 | unsigned int offset; | 1595 | unsigned int offset; |
| 1543 | size_t total_len = 0; | 1596 | size_t total_len = 0; |
| 1597 | int num_pages; | ||
| 1598 | |||
| 1599 | offset = outarg->offset & ~PAGE_CACHE_MASK; | ||
| 1600 | file_size = i_size_read(inode); | ||
| 1601 | |||
| 1602 | num = outarg->size; | ||
| 1603 | if (outarg->offset > file_size) | ||
| 1604 | num = 0; | ||
| 1605 | else if (outarg->offset + num > file_size) | ||
| 1606 | num = file_size - outarg->offset; | ||
| 1544 | 1607 | ||
| 1545 | req = fuse_get_req(fc); | 1608 | num_pages = (num + offset + PAGE_SIZE - 1) >> PAGE_SHIFT; |
| 1609 | num_pages = min(num_pages, FUSE_MAX_PAGES_PER_REQ); | ||
| 1610 | |||
| 1611 | req = fuse_get_req(fc, num_pages); | ||
| 1546 | if (IS_ERR(req)) | 1612 | if (IS_ERR(req)) |
| 1547 | return PTR_ERR(req); | 1613 | return PTR_ERR(req); |
| 1548 | 1614 | ||
| 1549 | offset = outarg->offset & ~PAGE_CACHE_MASK; | ||
| 1550 | |||
| 1551 | req->in.h.opcode = FUSE_NOTIFY_REPLY; | 1615 | req->in.h.opcode = FUSE_NOTIFY_REPLY; |
| 1552 | req->in.h.nodeid = outarg->nodeid; | 1616 | req->in.h.nodeid = outarg->nodeid; |
| 1553 | req->in.numargs = 2; | 1617 | req->in.numargs = 2; |
| 1554 | req->in.argpages = 1; | 1618 | req->in.argpages = 1; |
| 1555 | req->page_offset = offset; | 1619 | req->page_descs[0].offset = offset; |
| 1556 | req->end = fuse_retrieve_end; | 1620 | req->end = fuse_retrieve_end; |
| 1557 | 1621 | ||
| 1558 | index = outarg->offset >> PAGE_CACHE_SHIFT; | 1622 | index = outarg->offset >> PAGE_CACHE_SHIFT; |
| 1559 | file_size = i_size_read(inode); | ||
| 1560 | num = outarg->size; | ||
| 1561 | if (outarg->offset > file_size) | ||
| 1562 | num = 0; | ||
| 1563 | else if (outarg->offset + num > file_size) | ||
| 1564 | num = file_size - outarg->offset; | ||
| 1565 | 1623 | ||
| 1566 | while (num && req->num_pages < FUSE_MAX_PAGES_PER_REQ) { | 1624 | while (num && req->num_pages < num_pages) { |
| 1567 | struct page *page; | 1625 | struct page *page; |
| 1568 | unsigned int this_num; | 1626 | unsigned int this_num; |
| 1569 | 1627 | ||
| @@ -1573,6 +1631,7 @@ static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, | |||
| 1573 | 1631 | ||
| 1574 | this_num = min_t(unsigned, num, PAGE_CACHE_SIZE - offset); | 1632 | this_num = min_t(unsigned, num, PAGE_CACHE_SIZE - offset); |
| 1575 | req->pages[req->num_pages] = page; | 1633 | req->pages[req->num_pages] = page; |
| 1634 | req->page_descs[req->num_pages].length = this_num; | ||
| 1576 | req->num_pages++; | 1635 | req->num_pages++; |
| 1577 | 1636 | ||
| 1578 | offset = 0; | 1637 | offset = 0; |
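The fuse_retrieve() hunks above clamp the requested size against i_size first, then derive the number of pages from the byte range before allocating the request, instead of always allocating one sized for the fixed maximum. The same arithmetic as a standalone helper; the constants here are illustrative stand-ins for PAGE_SHIFT and FUSE_MAX_PAGES_PER_REQ.

#include <stdint.h>
#include <stdio.h>

#define DEMO_PAGE_SHIFT 12
#define DEMO_PAGE_SIZE  (1u << DEMO_PAGE_SHIFT)
#define DEMO_MAX_PAGES  32

static unsigned retrieve_num_pages(uint64_t offset, uint64_t size,
				   uint64_t file_size)
{
	uint64_t num = size;
	unsigned in_page = offset & (DEMO_PAGE_SIZE - 1);  /* offset within first page */
	unsigned num_pages;

	if (offset > file_size)
		num = 0;                            /* request starts past EOF */
	else if (offset + num > file_size)
		num = file_size - offset;           /* trim to end of file */

	num_pages = (unsigned)((num + in_page + DEMO_PAGE_SIZE - 1) >> DEMO_PAGE_SHIFT);
	if (num_pages > DEMO_MAX_PAGES)
		num_pages = DEMO_MAX_PAGES;
	return num_pages;
}

int main(void)
{
	/* 10000 bytes starting 100 bytes into a page spans 3 pages. */
	printf("%u\n", retrieve_num_pages(4096 + 100, 10000, 1 << 20));
	/* A request entirely past EOF needs no pages at all. */
	printf("%u\n", retrieve_num_pages(2 << 20, 4096, 1 << 20));
	return 0;
}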
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index b7c09f9eb40c..85065221a58a 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c | |||
| @@ -14,6 +14,29 @@ | |||
| 14 | #include <linux/namei.h> | 14 | #include <linux/namei.h> |
| 15 | #include <linux/slab.h> | 15 | #include <linux/slab.h> |
| 16 | 16 | ||
| 17 | static bool fuse_use_readdirplus(struct inode *dir, struct file *filp) | ||
| 18 | { | ||
| 19 | struct fuse_conn *fc = get_fuse_conn(dir); | ||
| 20 | struct fuse_inode *fi = get_fuse_inode(dir); | ||
| 21 | |||
| 22 | if (!fc->do_readdirplus) | ||
| 23 | return false; | ||
| 24 | if (!fc->readdirplus_auto) | ||
| 25 | return true; | ||
| 26 | if (test_and_clear_bit(FUSE_I_ADVISE_RDPLUS, &fi->state)) | ||
| 27 | return true; | ||
| 28 | if (filp->f_pos == 0) | ||
| 29 | return true; | ||
| 30 | return false; | ||
| 31 | } | ||
| 32 | |||
| 33 | static void fuse_advise_use_readdirplus(struct inode *dir) | ||
| 34 | { | ||
| 35 | struct fuse_inode *fi = get_fuse_inode(dir); | ||
| 36 | |||
| 37 | set_bit(FUSE_I_ADVISE_RDPLUS, &fi->state); | ||
| 38 | } | ||
| 39 | |||
| 17 | #if BITS_PER_LONG >= 64 | 40 | #if BITS_PER_LONG >= 64 |
| 18 | static inline void fuse_dentry_settime(struct dentry *entry, u64 time) | 41 | static inline void fuse_dentry_settime(struct dentry *entry, u64 time) |
| 19 | { | 42 | { |
| @@ -178,7 +201,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) | |||
| 178 | return -ECHILD; | 201 | return -ECHILD; |
| 179 | 202 | ||
| 180 | fc = get_fuse_conn(inode); | 203 | fc = get_fuse_conn(inode); |
| 181 | req = fuse_get_req(fc); | 204 | req = fuse_get_req_nopages(fc); |
| 182 | if (IS_ERR(req)) | 205 | if (IS_ERR(req)) |
| 183 | return 0; | 206 | return 0; |
| 184 | 207 | ||
| @@ -219,6 +242,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) | |||
| 219 | attr_version); | 242 | attr_version); |
| 220 | fuse_change_entry_timeout(entry, &outarg); | 243 | fuse_change_entry_timeout(entry, &outarg); |
| 221 | } | 244 | } |
| 245 | fuse_advise_use_readdirplus(inode); | ||
| 222 | return 1; | 246 | return 1; |
| 223 | } | 247 | } |
| 224 | 248 | ||
| @@ -271,7 +295,7 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name, | |||
| 271 | if (name->len > FUSE_NAME_MAX) | 295 | if (name->len > FUSE_NAME_MAX) |
| 272 | goto out; | 296 | goto out; |
| 273 | 297 | ||
| 274 | req = fuse_get_req(fc); | 298 | req = fuse_get_req_nopages(fc); |
| 275 | err = PTR_ERR(req); | 299 | err = PTR_ERR(req); |
| 276 | if (IS_ERR(req)) | 300 | if (IS_ERR(req)) |
| 277 | goto out; | 301 | goto out; |
| @@ -355,6 +379,7 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, | |||
| 355 | else | 379 | else |
| 356 | fuse_invalidate_entry_cache(entry); | 380 | fuse_invalidate_entry_cache(entry); |
| 357 | 381 | ||
| 382 | fuse_advise_use_readdirplus(dir); | ||
| 358 | return newent; | 383 | return newent; |
| 359 | 384 | ||
| 360 | out_iput: | 385 | out_iput: |
| @@ -391,7 +416,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, | |||
| 391 | if (!forget) | 416 | if (!forget) |
| 392 | goto out_err; | 417 | goto out_err; |
| 393 | 418 | ||
| 394 | req = fuse_get_req(fc); | 419 | req = fuse_get_req_nopages(fc); |
| 395 | err = PTR_ERR(req); | 420 | err = PTR_ERR(req); |
| 396 | if (IS_ERR(req)) | 421 | if (IS_ERR(req)) |
| 397 | goto out_put_forget_req; | 422 | goto out_put_forget_req; |
| @@ -592,7 +617,7 @@ static int fuse_mknod(struct inode *dir, struct dentry *entry, umode_t mode, | |||
| 592 | { | 617 | { |
| 593 | struct fuse_mknod_in inarg; | 618 | struct fuse_mknod_in inarg; |
| 594 | struct fuse_conn *fc = get_fuse_conn(dir); | 619 | struct fuse_conn *fc = get_fuse_conn(dir); |
| 595 | struct fuse_req *req = fuse_get_req(fc); | 620 | struct fuse_req *req = fuse_get_req_nopages(fc); |
| 596 | if (IS_ERR(req)) | 621 | if (IS_ERR(req)) |
| 597 | return PTR_ERR(req); | 622 | return PTR_ERR(req); |
| 598 | 623 | ||
| @@ -623,7 +648,7 @@ static int fuse_mkdir(struct inode *dir, struct dentry *entry, umode_t mode) | |||
| 623 | { | 648 | { |
| 624 | struct fuse_mkdir_in inarg; | 649 | struct fuse_mkdir_in inarg; |
| 625 | struct fuse_conn *fc = get_fuse_conn(dir); | 650 | struct fuse_conn *fc = get_fuse_conn(dir); |
| 626 | struct fuse_req *req = fuse_get_req(fc); | 651 | struct fuse_req *req = fuse_get_req_nopages(fc); |
| 627 | if (IS_ERR(req)) | 652 | if (IS_ERR(req)) |
| 628 | return PTR_ERR(req); | 653 | return PTR_ERR(req); |
| 629 | 654 | ||
| @@ -647,7 +672,7 @@ static int fuse_symlink(struct inode *dir, struct dentry *entry, | |||
| 647 | { | 672 | { |
| 648 | struct fuse_conn *fc = get_fuse_conn(dir); | 673 | struct fuse_conn *fc = get_fuse_conn(dir); |
| 649 | unsigned len = strlen(link) + 1; | 674 | unsigned len = strlen(link) + 1; |
| 650 | struct fuse_req *req = fuse_get_req(fc); | 675 | struct fuse_req *req = fuse_get_req_nopages(fc); |
| 651 | if (IS_ERR(req)) | 676 | if (IS_ERR(req)) |
| 652 | return PTR_ERR(req); | 677 | return PTR_ERR(req); |
| 653 | 678 | ||
| @@ -664,7 +689,7 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry) | |||
| 664 | { | 689 | { |
| 665 | int err; | 690 | int err; |
| 666 | struct fuse_conn *fc = get_fuse_conn(dir); | 691 | struct fuse_conn *fc = get_fuse_conn(dir); |
| 667 | struct fuse_req *req = fuse_get_req(fc); | 692 | struct fuse_req *req = fuse_get_req_nopages(fc); |
| 668 | if (IS_ERR(req)) | 693 | if (IS_ERR(req)) |
| 669 | return PTR_ERR(req); | 694 | return PTR_ERR(req); |
| 670 | 695 | ||
| @@ -682,7 +707,14 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry) | |||
| 682 | 707 | ||
| 683 | spin_lock(&fc->lock); | 708 | spin_lock(&fc->lock); |
| 684 | fi->attr_version = ++fc->attr_version; | 709 | fi->attr_version = ++fc->attr_version; |
| 685 | drop_nlink(inode); | 710 | /* |
| 711 | * If i_nlink == 0 then unlink doesn't make sense, yet this can | ||
| 712 | * happen if userspace filesystem is careless. It would be | ||
| 712 | * happen if the userspace filesystem is careless. It would be | ||
| 713 | * difficult to enforce correct nlink usage so just ignore this | ||
| 714 | * condition here | ||
| 715 | */ | ||
| 716 | if (inode->i_nlink > 0) | ||
| 717 | drop_nlink(inode); | ||
| 686 | spin_unlock(&fc->lock); | 718 | spin_unlock(&fc->lock); |
| 687 | fuse_invalidate_attr(inode); | 719 | fuse_invalidate_attr(inode); |
| 688 | fuse_invalidate_attr(dir); | 720 | fuse_invalidate_attr(dir); |
| @@ -696,7 +728,7 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry) | |||
| 696 | { | 728 | { |
| 697 | int err; | 729 | int err; |
| 698 | struct fuse_conn *fc = get_fuse_conn(dir); | 730 | struct fuse_conn *fc = get_fuse_conn(dir); |
| 699 | struct fuse_req *req = fuse_get_req(fc); | 731 | struct fuse_req *req = fuse_get_req_nopages(fc); |
| 700 | if (IS_ERR(req)) | 732 | if (IS_ERR(req)) |
| 701 | return PTR_ERR(req); | 733 | return PTR_ERR(req); |
| 702 | 734 | ||
| @@ -723,7 +755,7 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent, | |||
| 723 | int err; | 755 | int err; |
| 724 | struct fuse_rename_in inarg; | 756 | struct fuse_rename_in inarg; |
| 725 | struct fuse_conn *fc = get_fuse_conn(olddir); | 757 | struct fuse_conn *fc = get_fuse_conn(olddir); |
| 726 | struct fuse_req *req = fuse_get_req(fc); | 758 | struct fuse_req *req = fuse_get_req_nopages(fc); |
| 727 | 759 | ||
| 728 | if (IS_ERR(req)) | 760 | if (IS_ERR(req)) |
| 729 | return PTR_ERR(req); | 761 | return PTR_ERR(req); |
| @@ -776,7 +808,7 @@ static int fuse_link(struct dentry *entry, struct inode *newdir, | |||
| 776 | struct fuse_link_in inarg; | 808 | struct fuse_link_in inarg; |
| 777 | struct inode *inode = entry->d_inode; | 809 | struct inode *inode = entry->d_inode; |
| 778 | struct fuse_conn *fc = get_fuse_conn(inode); | 810 | struct fuse_conn *fc = get_fuse_conn(inode); |
| 779 | struct fuse_req *req = fuse_get_req(fc); | 811 | struct fuse_req *req = fuse_get_req_nopages(fc); |
| 780 | if (IS_ERR(req)) | 812 | if (IS_ERR(req)) |
| 781 | return PTR_ERR(req); | 813 | return PTR_ERR(req); |
| 782 | 814 | ||
| @@ -848,7 +880,7 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat, | |||
| 848 | struct fuse_req *req; | 880 | struct fuse_req *req; |
| 849 | u64 attr_version; | 881 | u64 attr_version; |
| 850 | 882 | ||
| 851 | req = fuse_get_req(fc); | 883 | req = fuse_get_req_nopages(fc); |
| 852 | if (IS_ERR(req)) | 884 | if (IS_ERR(req)) |
| 853 | return PTR_ERR(req); | 885 | return PTR_ERR(req); |
| 854 | 886 | ||
| @@ -985,7 +1017,7 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, | |||
| 985 | 1017 | ||
| 986 | /* | 1018 | /* |
| 987 | * Calling into a user-controlled filesystem gives the filesystem | 1019 | * Calling into a user-controlled filesystem gives the filesystem |
| 988 | * daemon ptrace-like capabilities over the requester process. This | 1020 | * daemon ptrace-like capabilities over the current process. This |
| 989 | * means, that the filesystem daemon is able to record the exact | 1021 | * means, that the filesystem daemon is able to record the exact |
| 990 | * filesystem operations performed, and can also control the behavior | 1022 | * filesystem operations performed, and can also control the behavior |
| 991 | * of the requester process in otherwise impossible ways. For example | 1023 | * of the requester process in otherwise impossible ways. For example |
| @@ -996,27 +1028,23 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, | |||
| 996 | * for which the owner of the mount has ptrace privilege. This | 1028 | * for which the owner of the mount has ptrace privilege. This |
| 997 | * excludes processes started by other users, suid or sgid processes. | 1029 | * excludes processes started by other users, suid or sgid processes. |
| 998 | */ | 1030 | */ |
| 999 | int fuse_allow_task(struct fuse_conn *fc, struct task_struct *task) | 1031 | int fuse_allow_current_process(struct fuse_conn *fc) |
| 1000 | { | 1032 | { |
| 1001 | const struct cred *cred; | 1033 | const struct cred *cred; |
| 1002 | int ret; | ||
| 1003 | 1034 | ||
| 1004 | if (fc->flags & FUSE_ALLOW_OTHER) | 1035 | if (fc->flags & FUSE_ALLOW_OTHER) |
| 1005 | return 1; | 1036 | return 1; |
| 1006 | 1037 | ||
| 1007 | rcu_read_lock(); | 1038 | cred = current_cred(); |
| 1008 | ret = 0; | ||
| 1009 | cred = __task_cred(task); | ||
| 1010 | if (uid_eq(cred->euid, fc->user_id) && | 1039 | if (uid_eq(cred->euid, fc->user_id) && |
| 1011 | uid_eq(cred->suid, fc->user_id) && | 1040 | uid_eq(cred->suid, fc->user_id) && |
| 1012 | uid_eq(cred->uid, fc->user_id) && | 1041 | uid_eq(cred->uid, fc->user_id) && |
| 1013 | gid_eq(cred->egid, fc->group_id) && | 1042 | gid_eq(cred->egid, fc->group_id) && |
| 1014 | gid_eq(cred->sgid, fc->group_id) && | 1043 | gid_eq(cred->sgid, fc->group_id) && |
| 1015 | gid_eq(cred->gid, fc->group_id)) | 1044 | gid_eq(cred->gid, fc->group_id)) |
| 1016 | ret = 1; | 1045 | return 1; |
| 1017 | rcu_read_unlock(); | ||
| 1018 | 1046 | ||
| 1019 | return ret; | 1047 | return 0; |
| 1020 | } | 1048 | } |
| 1021 | 1049 | ||
| 1022 | static int fuse_access(struct inode *inode, int mask) | 1050 | static int fuse_access(struct inode *inode, int mask) |
| @@ -1029,7 +1057,7 @@ static int fuse_access(struct inode *inode, int mask) | |||
| 1029 | if (fc->no_access) | 1057 | if (fc->no_access) |
| 1030 | return 0; | 1058 | return 0; |
| 1031 | 1059 | ||
| 1032 | req = fuse_get_req(fc); | 1060 | req = fuse_get_req_nopages(fc); |
| 1033 | if (IS_ERR(req)) | 1061 | if (IS_ERR(req)) |
| 1034 | return PTR_ERR(req); | 1062 | return PTR_ERR(req); |
| 1035 | 1063 | ||
| @@ -1077,7 +1105,7 @@ static int fuse_permission(struct inode *inode, int mask) | |||
| 1077 | bool refreshed = false; | 1105 | bool refreshed = false; |
| 1078 | int err = 0; | 1106 | int err = 0; |
| 1079 | 1107 | ||
| 1080 | if (!fuse_allow_task(fc, current)) | 1108 | if (!fuse_allow_current_process(fc)) |
| 1081 | return -EACCES; | 1109 | return -EACCES; |
| 1082 | 1110 | ||
| 1083 | /* | 1111 | /* |
| @@ -1155,19 +1183,157 @@ static int parse_dirfile(char *buf, size_t nbytes, struct file *file, | |||
| 1155 | return 0; | 1183 | return 0; |
| 1156 | } | 1184 | } |
| 1157 | 1185 | ||
| 1158 | static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir) | 1186 | static int fuse_direntplus_link(struct file *file, |
| 1187 | struct fuse_direntplus *direntplus, | ||
| 1188 | u64 attr_version) | ||
| 1159 | { | 1189 | { |
| 1160 | int err; | 1190 | int err; |
| 1191 | struct fuse_entry_out *o = &direntplus->entry_out; | ||
| 1192 | struct fuse_dirent *dirent = &direntplus->dirent; | ||
| 1193 | struct dentry *parent = file->f_path.dentry; | ||
| 1194 | struct qstr name = QSTR_INIT(dirent->name, dirent->namelen); | ||
| 1195 | struct dentry *dentry; | ||
| 1196 | struct dentry *alias; | ||
| 1197 | struct inode *dir = parent->d_inode; | ||
| 1198 | struct fuse_conn *fc; | ||
| 1199 | struct inode *inode; | ||
| 1200 | |||
| 1201 | if (!o->nodeid) { | ||
| 1202 | /* | ||
| 1203 | * Unlike in the case of fuse_lookup, zero nodeid does not mean | ||
| 1204 | * ENOENT. Instead, it only means the userspace filesystem did | ||
| 1205 | * not want to return attributes/handle for this entry. | ||
| 1206 | * | ||
| 1207 | * So do nothing. | ||
| 1208 | */ | ||
| 1209 | return 0; | ||
| 1210 | } | ||
| 1211 | |||
| 1212 | if (name.name[0] == '.') { | ||
| 1213 | /* | ||
| 1214 | * We could potentially refresh the attributes of the directory | ||
| 1215 | * and its parent? | ||
| 1216 | */ | ||
| 1217 | if (name.len == 1) | ||
| 1218 | return 0; | ||
| 1219 | if (name.name[1] == '.' && name.len == 2) | ||
| 1220 | return 0; | ||
| 1221 | } | ||
| 1222 | fc = get_fuse_conn(dir); | ||
| 1223 | |||
| 1224 | name.hash = full_name_hash(name.name, name.len); | ||
| 1225 | dentry = d_lookup(parent, &name); | ||
| 1226 | if (dentry && dentry->d_inode) { | ||
| 1227 | inode = dentry->d_inode; | ||
| 1228 | if (get_node_id(inode) == o->nodeid) { | ||
| 1229 | struct fuse_inode *fi; | ||
| 1230 | fi = get_fuse_inode(inode); | ||
| 1231 | spin_lock(&fc->lock); | ||
| 1232 | fi->nlookup++; | ||
| 1233 | spin_unlock(&fc->lock); | ||
| 1234 | |||
| 1235 | /* | ||
| 1236 | * The other branch to 'found' comes via fuse_iget() | ||
| 1237 | * which bumps nlookup inside | ||
| 1238 | */ | ||
| 1239 | goto found; | ||
| 1240 | } | ||
| 1241 | err = d_invalidate(dentry); | ||
| 1242 | if (err) | ||
| 1243 | goto out; | ||
| 1244 | dput(dentry); | ||
| 1245 | dentry = NULL; | ||
| 1246 | } | ||
| 1247 | |||
| 1248 | dentry = d_alloc(parent, &name); | ||
| 1249 | err = -ENOMEM; | ||
| 1250 | if (!dentry) | ||
| 1251 | goto out; | ||
| 1252 | |||
| 1253 | inode = fuse_iget(dir->i_sb, o->nodeid, o->generation, | ||
| 1254 | &o->attr, entry_attr_timeout(o), attr_version); | ||
| 1255 | if (!inode) | ||
| 1256 | goto out; | ||
| 1257 | |||
| 1258 | alias = d_materialise_unique(dentry, inode); | ||
| 1259 | err = PTR_ERR(alias); | ||
| 1260 | if (IS_ERR(alias)) | ||
| 1261 | goto out; | ||
| 1262 | if (alias) { | ||
| 1263 | dput(dentry); | ||
| 1264 | dentry = alias; | ||
| 1265 | } | ||
| 1266 | |||
| 1267 | found: | ||
| 1268 | fuse_change_attributes(inode, &o->attr, entry_attr_timeout(o), | ||
| 1269 | attr_version); | ||
| 1270 | |||
| 1271 | fuse_change_entry_timeout(dentry, o); | ||
| 1272 | |||
| 1273 | err = 0; | ||
| 1274 | out: | ||
| 1275 | if (dentry) | ||
| 1276 | dput(dentry); | ||
| 1277 | return err; | ||
| 1278 | } | ||
| 1279 | |||
| 1280 | static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file, | ||
| 1281 | void *dstbuf, filldir_t filldir, u64 attr_version) | ||
| 1282 | { | ||
| 1283 | struct fuse_direntplus *direntplus; | ||
| 1284 | struct fuse_dirent *dirent; | ||
| 1285 | size_t reclen; | ||
| 1286 | int over = 0; | ||
| 1287 | int ret; | ||
| 1288 | |||
| 1289 | while (nbytes >= FUSE_NAME_OFFSET_DIRENTPLUS) { | ||
| 1290 | direntplus = (struct fuse_direntplus *) buf; | ||
| 1291 | dirent = &direntplus->dirent; | ||
| 1292 | reclen = FUSE_DIRENTPLUS_SIZE(direntplus); | ||
| 1293 | |||
| 1294 | if (!dirent->namelen || dirent->namelen > FUSE_NAME_MAX) | ||
| 1295 | return -EIO; | ||
| 1296 | if (reclen > nbytes) | ||
| 1297 | break; | ||
| 1298 | |||
| 1299 | if (!over) { | ||
| 1300 | /* We fill entries into dstbuf only as much as | ||
| 1301 | it can hold. But we still continue iterating | ||
| 1302 | over remaining entries to link them. If not, | ||
| 1303 | we need to send a FORGET for each of those | ||
| 1304 | which we did not link. | ||
| 1305 | */ | ||
| 1306 | over = filldir(dstbuf, dirent->name, dirent->namelen, | ||
| 1307 | file->f_pos, dirent->ino, | ||
| 1308 | dirent->type); | ||
| 1309 | file->f_pos = dirent->off; | ||
| 1310 | } | ||
| 1311 | |||
| 1312 | buf += reclen; | ||
| 1313 | nbytes -= reclen; | ||
| 1314 | |||
| 1315 | ret = fuse_direntplus_link(file, direntplus, attr_version); | ||
| 1316 | if (ret) | ||
| 1317 | fuse_force_forget(file, direntplus->entry_out.nodeid); | ||
| 1318 | } | ||
| 1319 | |||
| 1320 | return 0; | ||
| 1321 | } | ||
| 1322 | |||
| 1323 | static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir) | ||
| 1324 | { | ||
| 1325 | int plus, err; | ||
| 1161 | size_t nbytes; | 1326 | size_t nbytes; |
| 1162 | struct page *page; | 1327 | struct page *page; |
| 1163 | struct inode *inode = file->f_path.dentry->d_inode; | 1328 | struct inode *inode = file->f_path.dentry->d_inode; |
| 1164 | struct fuse_conn *fc = get_fuse_conn(inode); | 1329 | struct fuse_conn *fc = get_fuse_conn(inode); |
| 1165 | struct fuse_req *req; | 1330 | struct fuse_req *req; |
| 1331 | u64 attr_version = 0; | ||
| 1166 | 1332 | ||
| 1167 | if (is_bad_inode(inode)) | 1333 | if (is_bad_inode(inode)) |
| 1168 | return -EIO; | 1334 | return -EIO; |
| 1169 | 1335 | ||
| 1170 | req = fuse_get_req(fc); | 1336 | req = fuse_get_req(fc, 1); |
| 1171 | if (IS_ERR(req)) | 1337 | if (IS_ERR(req)) |
| 1172 | return PTR_ERR(req); | 1338 | return PTR_ERR(req); |
| 1173 | 1339 | ||
| @@ -1176,17 +1342,34 @@ static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir) | |||
| 1176 | fuse_put_request(fc, req); | 1342 | fuse_put_request(fc, req); |
| 1177 | return -ENOMEM; | 1343 | return -ENOMEM; |
| 1178 | } | 1344 | } |
| 1345 | |||
| 1346 | plus = fuse_use_readdirplus(inode, file); | ||
| 1179 | req->out.argpages = 1; | 1347 | req->out.argpages = 1; |
| 1180 | req->num_pages = 1; | 1348 | req->num_pages = 1; |
| 1181 | req->pages[0] = page; | 1349 | req->pages[0] = page; |
| 1182 | fuse_read_fill(req, file, file->f_pos, PAGE_SIZE, FUSE_READDIR); | 1350 | req->page_descs[0].length = PAGE_SIZE; |
| 1351 | if (plus) { | ||
| 1352 | attr_version = fuse_get_attr_version(fc); | ||
| 1353 | fuse_read_fill(req, file, file->f_pos, PAGE_SIZE, | ||
| 1354 | FUSE_READDIRPLUS); | ||
| 1355 | } else { | ||
| 1356 | fuse_read_fill(req, file, file->f_pos, PAGE_SIZE, | ||
| 1357 | FUSE_READDIR); | ||
| 1358 | } | ||
| 1183 | fuse_request_send(fc, req); | 1359 | fuse_request_send(fc, req); |
| 1184 | nbytes = req->out.args[0].size; | 1360 | nbytes = req->out.args[0].size; |
| 1185 | err = req->out.h.error; | 1361 | err = req->out.h.error; |
| 1186 | fuse_put_request(fc, req); | 1362 | fuse_put_request(fc, req); |
| 1187 | if (!err) | 1363 | if (!err) { |
| 1188 | err = parse_dirfile(page_address(page), nbytes, file, dstbuf, | 1364 | if (plus) { |
| 1189 | filldir); | 1365 | err = parse_dirplusfile(page_address(page), nbytes, |
| 1366 | file, dstbuf, filldir, | ||
| 1367 | attr_version); | ||
| 1368 | } else { | ||
| 1369 | err = parse_dirfile(page_address(page), nbytes, file, | ||
| 1370 | dstbuf, filldir); | ||
| 1371 | } | ||
| 1372 | } | ||
| 1190 | 1373 | ||
| 1191 | __free_page(page); | 1374 | __free_page(page); |
| 1192 | fuse_invalidate_attr(inode); /* atime changed */ | 1375 | fuse_invalidate_attr(inode); /* atime changed */ |
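parse_dirplusfile() above walks a buffer of variable-length FUSE_READDIRPLUS records: read the fixed header, compute the aligned record length, stop on a truncated tail, and keep linking entries even after filldir() reports the destination buffer full so none are left without a FORGET. The sketch below only illustrates that reclen/nbytes walk; its record layout is deliberately simplified and is not the FUSE ABI's struct fuse_direntplus.

#include <stddef.h>
#include <stdio.h>
#include <string.h>

struct demo_dirent {
	unsigned int namelen;
	char name[];            /* not NUL-terminated */
};

#define DEMO_ALIGN(x)   (((x) + 7u) & ~7u)          /* records are 8-byte aligned */
#define DEMO_RECLEN(nl) DEMO_ALIGN(offsetof(struct demo_dirent, name) + (nl))

/* Walk the buffer record by record, stopping on a truncated tail. */
static int demo_parse(const char *buf, size_t nbytes)
{
	while (nbytes >= sizeof(struct demo_dirent)) {
		const struct demo_dirent *d = (const void *)buf;
		size_t reclen = DEMO_RECLEN(d->namelen);

		if (!d->namelen)
			return -1;      /* corrupt record */
		if (reclen > nbytes)
			break;          /* partial record at the end of the buffer */

		printf("entry: %.*s\n", (int)d->namelen, d->name);
		buf += reclen;
		nbytes -= reclen;
	}
	return 0;
}

int main(void)
{
	_Alignas(8) char buf[64] = { 0 };
	struct demo_dirent *d = (void *)buf;

	d->namelen = 5;
	memcpy(d->name, "hello", 5);
	return demo_parse(buf, DEMO_RECLEN(5));
}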
| @@ -1197,7 +1380,7 @@ static char *read_link(struct dentry *dentry) | |||
| 1197 | { | 1380 | { |
| 1198 | struct inode *inode = dentry->d_inode; | 1381 | struct inode *inode = dentry->d_inode; |
| 1199 | struct fuse_conn *fc = get_fuse_conn(inode); | 1382 | struct fuse_conn *fc = get_fuse_conn(inode); |
| 1200 | struct fuse_req *req = fuse_get_req(fc); | 1383 | struct fuse_req *req = fuse_get_req_nopages(fc); |
| 1201 | char *link; | 1384 | char *link; |
| 1202 | 1385 | ||
| 1203 | if (IS_ERR(req)) | 1386 | if (IS_ERR(req)) |
| @@ -1391,7 +1574,7 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr, | |||
| 1391 | loff_t oldsize; | 1574 | loff_t oldsize; |
| 1392 | int err; | 1575 | int err; |
| 1393 | 1576 | ||
| 1394 | if (!fuse_allow_task(fc, current)) | 1577 | if (!fuse_allow_current_process(fc)) |
| 1395 | return -EACCES; | 1578 | return -EACCES; |
| 1396 | 1579 | ||
| 1397 | if (!(fc->flags & FUSE_DEFAULT_PERMISSIONS)) | 1580 | if (!(fc->flags & FUSE_DEFAULT_PERMISSIONS)) |
| @@ -1410,7 +1593,7 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr, | |||
| 1410 | if (attr->ia_valid & ATTR_SIZE) | 1593 | if (attr->ia_valid & ATTR_SIZE) |
| 1411 | is_truncate = true; | 1594 | is_truncate = true; |
| 1412 | 1595 | ||
| 1413 | req = fuse_get_req(fc); | 1596 | req = fuse_get_req_nopages(fc); |
| 1414 | if (IS_ERR(req)) | 1597 | if (IS_ERR(req)) |
| 1415 | return PTR_ERR(req); | 1598 | return PTR_ERR(req); |
| 1416 | 1599 | ||
| @@ -1500,7 +1683,7 @@ static int fuse_getattr(struct vfsmount *mnt, struct dentry *entry, | |||
| 1500 | struct inode *inode = entry->d_inode; | 1683 | struct inode *inode = entry->d_inode; |
| 1501 | struct fuse_conn *fc = get_fuse_conn(inode); | 1684 | struct fuse_conn *fc = get_fuse_conn(inode); |
| 1502 | 1685 | ||
| 1503 | if (!fuse_allow_task(fc, current)) | 1686 | if (!fuse_allow_current_process(fc)) |
| 1504 | return -EACCES; | 1687 | return -EACCES; |
| 1505 | 1688 | ||
| 1506 | return fuse_update_attributes(inode, stat, NULL, NULL); | 1689 | return fuse_update_attributes(inode, stat, NULL, NULL); |
| @@ -1518,7 +1701,7 @@ static int fuse_setxattr(struct dentry *entry, const char *name, | |||
| 1518 | if (fc->no_setxattr) | 1701 | if (fc->no_setxattr) |
| 1519 | return -EOPNOTSUPP; | 1702 | return -EOPNOTSUPP; |
| 1520 | 1703 | ||
| 1521 | req = fuse_get_req(fc); | 1704 | req = fuse_get_req_nopages(fc); |
| 1522 | if (IS_ERR(req)) | 1705 | if (IS_ERR(req)) |
| 1523 | return PTR_ERR(req); | 1706 | return PTR_ERR(req); |
| 1524 | 1707 | ||
| @@ -1557,7 +1740,7 @@ static ssize_t fuse_getxattr(struct dentry *entry, const char *name, | |||
| 1557 | if (fc->no_getxattr) | 1740 | if (fc->no_getxattr) |
| 1558 | return -EOPNOTSUPP; | 1741 | return -EOPNOTSUPP; |
| 1559 | 1742 | ||
| 1560 | req = fuse_get_req(fc); | 1743 | req = fuse_get_req_nopages(fc); |
| 1561 | if (IS_ERR(req)) | 1744 | if (IS_ERR(req)) |
| 1562 | return PTR_ERR(req); | 1745 | return PTR_ERR(req); |
| 1563 | 1746 | ||
| @@ -1603,13 +1786,13 @@ static ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size) | |||
| 1603 | struct fuse_getxattr_out outarg; | 1786 | struct fuse_getxattr_out outarg; |
| 1604 | ssize_t ret; | 1787 | ssize_t ret; |
| 1605 | 1788 | ||
| 1606 | if (!fuse_allow_task(fc, current)) | 1789 | if (!fuse_allow_current_process(fc)) |
| 1607 | return -EACCES; | 1790 | return -EACCES; |
| 1608 | 1791 | ||
| 1609 | if (fc->no_listxattr) | 1792 | if (fc->no_listxattr) |
| 1610 | return -EOPNOTSUPP; | 1793 | return -EOPNOTSUPP; |
| 1611 | 1794 | ||
| 1612 | req = fuse_get_req(fc); | 1795 | req = fuse_get_req_nopages(fc); |
| 1613 | if (IS_ERR(req)) | 1796 | if (IS_ERR(req)) |
| 1614 | return PTR_ERR(req); | 1797 | return PTR_ERR(req); |
| 1615 | 1798 | ||
| @@ -1654,7 +1837,7 @@ static int fuse_removexattr(struct dentry *entry, const char *name) | |||
| 1654 | if (fc->no_removexattr) | 1837 | if (fc->no_removexattr) |
| 1655 | return -EOPNOTSUPP; | 1838 | return -EOPNOTSUPP; |
| 1656 | 1839 | ||
| 1657 | req = fuse_get_req(fc); | 1840 | req = fuse_get_req_nopages(fc); |
| 1658 | if (IS_ERR(req)) | 1841 | if (IS_ERR(req)) |
| 1659 | return PTR_ERR(req); | 1842 | return PTR_ERR(req); |
| 1660 | 1843 | ||
diff --git a/fs/fuse/file.c b/fs/fuse/file.c index e21d4d8f87e3..c8071768b950 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c | |||
| @@ -25,7 +25,7 @@ static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file, | |||
| 25 | struct fuse_req *req; | 25 | struct fuse_req *req; |
| 26 | int err; | 26 | int err; |
| 27 | 27 | ||
| 28 | req = fuse_get_req(fc); | 28 | req = fuse_get_req_nopages(fc); |
| 29 | if (IS_ERR(req)) | 29 | if (IS_ERR(req)) |
| 30 | return PTR_ERR(req); | 30 | return PTR_ERR(req); |
| 31 | 31 | ||
| @@ -57,7 +57,7 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc) | |||
| 57 | return NULL; | 57 | return NULL; |
| 58 | 58 | ||
| 59 | ff->fc = fc; | 59 | ff->fc = fc; |
| 60 | ff->reserved_req = fuse_request_alloc(); | 60 | ff->reserved_req = fuse_request_alloc(0); |
| 61 | if (unlikely(!ff->reserved_req)) { | 61 | if (unlikely(!ff->reserved_req)) { |
| 62 | kfree(ff); | 62 | kfree(ff); |
| 63 | return NULL; | 63 | return NULL; |
| @@ -368,7 +368,7 @@ static int fuse_flush(struct file *file, fl_owner_t id) | |||
| 368 | if (fc->no_flush) | 368 | if (fc->no_flush) |
| 369 | return 0; | 369 | return 0; |
| 370 | 370 | ||
| 371 | req = fuse_get_req_nofail(fc, file); | 371 | req = fuse_get_req_nofail_nopages(fc, file); |
| 372 | memset(&inarg, 0, sizeof(inarg)); | 372 | memset(&inarg, 0, sizeof(inarg)); |
| 373 | inarg.fh = ff->fh; | 373 | inarg.fh = ff->fh; |
| 374 | inarg.lock_owner = fuse_lock_owner_id(fc, id); | 374 | inarg.lock_owner = fuse_lock_owner_id(fc, id); |
| @@ -436,7 +436,7 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end, | |||
| 436 | 436 | ||
| 437 | fuse_sync_writes(inode); | 437 | fuse_sync_writes(inode); |
| 438 | 438 | ||
| 439 | req = fuse_get_req(fc); | 439 | req = fuse_get_req_nopages(fc); |
| 440 | if (IS_ERR(req)) { | 440 | if (IS_ERR(req)) { |
| 441 | err = PTR_ERR(req); | 441 | err = PTR_ERR(req); |
| 442 | goto out; | 442 | goto out; |
| @@ -544,7 +544,7 @@ static int fuse_readpage(struct file *file, struct page *page) | |||
| 544 | */ | 544 | */ |
| 545 | fuse_wait_on_page_writeback(inode, page->index); | 545 | fuse_wait_on_page_writeback(inode, page->index); |
| 546 | 546 | ||
| 547 | req = fuse_get_req(fc); | 547 | req = fuse_get_req(fc, 1); |
| 548 | err = PTR_ERR(req); | 548 | err = PTR_ERR(req); |
| 549 | if (IS_ERR(req)) | 549 | if (IS_ERR(req)) |
| 550 | goto out; | 550 | goto out; |
| @@ -555,6 +555,7 @@ static int fuse_readpage(struct file *file, struct page *page) | |||
| 555 | req->out.argpages = 1; | 555 | req->out.argpages = 1; |
| 556 | req->num_pages = 1; | 556 | req->num_pages = 1; |
| 557 | req->pages[0] = page; | 557 | req->pages[0] = page; |
| 558 | req->page_descs[0].length = count; | ||
| 558 | num_read = fuse_send_read(req, file, pos, count, NULL); | 559 | num_read = fuse_send_read(req, file, pos, count, NULL); |
| 559 | err = req->out.h.error; | 560 | err = req->out.h.error; |
| 560 | fuse_put_request(fc, req); | 561 | fuse_put_request(fc, req); |
| @@ -641,6 +642,7 @@ struct fuse_fill_data { | |||
| 641 | struct fuse_req *req; | 642 | struct fuse_req *req; |
| 642 | struct file *file; | 643 | struct file *file; |
| 643 | struct inode *inode; | 644 | struct inode *inode; |
| 645 | unsigned nr_pages; | ||
| 644 | }; | 646 | }; |
| 645 | 647 | ||
| 646 | static int fuse_readpages_fill(void *_data, struct page *page) | 648 | static int fuse_readpages_fill(void *_data, struct page *page) |
| @@ -656,16 +658,26 @@ static int fuse_readpages_fill(void *_data, struct page *page) | |||
| 656 | (req->num_pages == FUSE_MAX_PAGES_PER_REQ || | 658 | (req->num_pages == FUSE_MAX_PAGES_PER_REQ || |
| 657 | (req->num_pages + 1) * PAGE_CACHE_SIZE > fc->max_read || | 659 | (req->num_pages + 1) * PAGE_CACHE_SIZE > fc->max_read || |
| 658 | req->pages[req->num_pages - 1]->index + 1 != page->index)) { | 660 | req->pages[req->num_pages - 1]->index + 1 != page->index)) { |
| 661 | int nr_alloc = min_t(unsigned, data->nr_pages, | ||
| 662 | FUSE_MAX_PAGES_PER_REQ); | ||
| 659 | fuse_send_readpages(req, data->file); | 663 | fuse_send_readpages(req, data->file); |
| 660 | data->req = req = fuse_get_req(fc); | 664 | data->req = req = fuse_get_req(fc, nr_alloc); |
| 661 | if (IS_ERR(req)) { | 665 | if (IS_ERR(req)) { |
| 662 | unlock_page(page); | 666 | unlock_page(page); |
| 663 | return PTR_ERR(req); | 667 | return PTR_ERR(req); |
| 664 | } | 668 | } |
| 665 | } | 669 | } |
| 670 | |||
| 671 | if (WARN_ON(req->num_pages >= req->max_pages)) { | ||
| 672 | fuse_put_request(fc, req); | ||
| 673 | return -EIO; | ||
| 674 | } | ||
| 675 | |||
| 666 | page_cache_get(page); | 676 | page_cache_get(page); |
| 667 | req->pages[req->num_pages] = page; | 677 | req->pages[req->num_pages] = page; |
| 678 | req->page_descs[req->num_pages].length = PAGE_SIZE; | ||
| 668 | req->num_pages++; | 679 | req->num_pages++; |
| 680 | data->nr_pages--; | ||
| 669 | return 0; | 681 | return 0; |
| 670 | } | 682 | } |
| 671 | 683 | ||
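fuse_readpages_fill() above now sizes each request to the number of pages still expected (data->nr_pages, capped at FUSE_MAX_PAGES_PER_REQ) and flushes the current request when it is full or the next page is not contiguous. The batching rule on its own, as a user-space sketch with an assumed cap of four pages per request:

#include <stddef.h>
#include <stdio.h>

#define DEMO_MAX_PAGES_PER_REQ 4

static void demo_batch(const unsigned long *index, size_t nr_pages)
{
	size_t start = 0, count = 0;

	for (size_t i = 0; i < nr_pages; i++) {
		int full = count == DEMO_MAX_PAGES_PER_REQ;
		int gap = count && index[i] != index[i - 1] + 1;

		if (full || gap) {              /* send what we have, start anew */
			printf("request: %zu page(s) starting at index %lu\n",
			       count, index[start]);
			start = i;
			count = 0;
		}
		count++;                        /* add this page to the request */
	}
	if (count)
		printf("request: %zu page(s) starting at index %lu\n",
		       count, index[start]);
}

int main(void)
{
	unsigned long pages[] = { 0, 1, 2, 3, 4, 7, 8 };

	demo_batch(pages, sizeof(pages) / sizeof(pages[0]));
	return 0;
}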
| @@ -676,6 +688,7 @@ static int fuse_readpages(struct file *file, struct address_space *mapping, | |||
| 676 | struct fuse_conn *fc = get_fuse_conn(inode); | 688 | struct fuse_conn *fc = get_fuse_conn(inode); |
| 677 | struct fuse_fill_data data; | 689 | struct fuse_fill_data data; |
| 678 | int err; | 690 | int err; |
| 691 | int nr_alloc = min_t(unsigned, nr_pages, FUSE_MAX_PAGES_PER_REQ); | ||
| 679 | 692 | ||
| 680 | err = -EIO; | 693 | err = -EIO; |
| 681 | if (is_bad_inode(inode)) | 694 | if (is_bad_inode(inode)) |
| @@ -683,7 +696,8 @@ static int fuse_readpages(struct file *file, struct address_space *mapping, | |||
| 683 | 696 | ||
| 684 | data.file = file; | 697 | data.file = file; |
| 685 | data.inode = inode; | 698 | data.inode = inode; |
| 686 | data.req = fuse_get_req(fc); | 699 | data.req = fuse_get_req(fc, nr_alloc); |
| 700 | data.nr_pages = nr_pages; | ||
| 687 | err = PTR_ERR(data.req); | 701 | err = PTR_ERR(data.req); |
| 688 | if (IS_ERR(data.req)) | 702 | if (IS_ERR(data.req)) |
| 689 | goto out; | 703 | goto out; |
| @@ -786,7 +800,7 @@ static size_t fuse_send_write_pages(struct fuse_req *req, struct file *file, | |||
| 786 | 800 | ||
| 787 | res = fuse_send_write(req, file, pos, count, NULL); | 801 | res = fuse_send_write(req, file, pos, count, NULL); |
| 788 | 802 | ||
| 789 | offset = req->page_offset; | 803 | offset = req->page_descs[0].offset; |
| 790 | count = res; | 804 | count = res; |
| 791 | for (i = 0; i < req->num_pages; i++) { | 805 | for (i = 0; i < req->num_pages; i++) { |
| 792 | struct page *page = req->pages[i]; | 806 | struct page *page = req->pages[i]; |
| @@ -817,7 +831,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req, | |||
| 817 | int err; | 831 | int err; |
| 818 | 832 | ||
| 819 | req->in.argpages = 1; | 833 | req->in.argpages = 1; |
| 820 | req->page_offset = offset; | 834 | req->page_descs[0].offset = offset; |
| 821 | 835 | ||
| 822 | do { | 836 | do { |
| 823 | size_t tmp; | 837 | size_t tmp; |
| @@ -857,6 +871,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req, | |||
| 857 | 871 | ||
| 858 | err = 0; | 872 | err = 0; |
| 859 | req->pages[req->num_pages] = page; | 873 | req->pages[req->num_pages] = page; |
| 874 | req->page_descs[req->num_pages].length = tmp; | ||
| 860 | req->num_pages++; | 875 | req->num_pages++; |
| 861 | 876 | ||
| 862 | iov_iter_advance(ii, tmp); | 877 | iov_iter_advance(ii, tmp); |
| @@ -869,11 +884,19 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req, | |||
| 869 | if (!fc->big_writes) | 884 | if (!fc->big_writes) |
| 870 | break; | 885 | break; |
| 871 | } while (iov_iter_count(ii) && count < fc->max_write && | 886 | } while (iov_iter_count(ii) && count < fc->max_write && |
| 872 | req->num_pages < FUSE_MAX_PAGES_PER_REQ && offset == 0); | 887 | req->num_pages < req->max_pages && offset == 0); |
| 873 | 888 | ||
| 874 | return count > 0 ? count : err; | 889 | return count > 0 ? count : err; |
| 875 | } | 890 | } |
| 876 | 891 | ||
| 892 | static inline unsigned fuse_wr_pages(loff_t pos, size_t len) | ||
| 893 | { | ||
| 894 | return min_t(unsigned, | ||
| 895 | ((pos + len - 1) >> PAGE_CACHE_SHIFT) - | ||
| 896 | (pos >> PAGE_CACHE_SHIFT) + 1, | ||
| 897 | FUSE_MAX_PAGES_PER_REQ); | ||
| 898 | } | ||
| 899 | |||
| 877 | static ssize_t fuse_perform_write(struct file *file, | 900 | static ssize_t fuse_perform_write(struct file *file, |
| 878 | struct address_space *mapping, | 901 | struct address_space *mapping, |
| 879 | struct iov_iter *ii, loff_t pos) | 902 | struct iov_iter *ii, loff_t pos) |
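fuse_wr_pages() above computes how many page-cache pages the byte range [pos, pos + len) touches, capped at FUSE_MAX_PAGES_PER_REQ, so fuse_perform_write() can size the request it allocates. The same calculation as a standalone program; page size and cap are illustrative, and a non-zero length is assumed just as in the kernel helper.

#include <stdint.h>
#include <stdio.h>

#define DEMO_PAGE_SHIFT 12
#define DEMO_MAX_PAGES  32

/* Number of pages spanned by [pos, pos + len); assumes len > 0. */
static unsigned demo_wr_pages(uint64_t pos, size_t len)
{
	uint64_t first = pos >> DEMO_PAGE_SHIFT;             /* index of first page */
	uint64_t last = (pos + len - 1) >> DEMO_PAGE_SHIFT;  /* index of last page */
	uint64_t n = last - first + 1;

	return n < DEMO_MAX_PAGES ? (unsigned)n : DEMO_MAX_PAGES;
}

int main(void)
{
	/* A 5000-byte write starting 100 bytes into a page spans 2 pages. */
	printf("%u\n", demo_wr_pages(4096 + 100, 5000));
	/* A huge write is capped at the per-request maximum. */
	printf("%u\n", demo_wr_pages(0, 1u << 20));
	return 0;
}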
| @@ -889,8 +912,9 @@ static ssize_t fuse_perform_write(struct file *file, | |||
| 889 | do { | 912 | do { |
| 890 | struct fuse_req *req; | 913 | struct fuse_req *req; |
| 891 | ssize_t count; | 914 | ssize_t count; |
| 915 | unsigned nr_pages = fuse_wr_pages(pos, iov_iter_count(ii)); | ||
| 892 | 916 | ||
| 893 | req = fuse_get_req(fc); | 917 | req = fuse_get_req(fc, nr_pages); |
| 894 | if (IS_ERR(req)) { | 918 | if (IS_ERR(req)) { |
| 895 | err = PTR_ERR(req); | 919 | err = PTR_ERR(req); |
| 896 | break; | 920 | break; |
| @@ -1023,47 +1047,110 @@ static void fuse_release_user_pages(struct fuse_req *req, int write) | |||
| 1023 | } | 1047 | } |
| 1024 | } | 1048 | } |
| 1025 | 1049 | ||
| 1026 | static int fuse_get_user_pages(struct fuse_req *req, const char __user *buf, | 1050 | static inline void fuse_page_descs_length_init(struct fuse_req *req, |
| 1051 | unsigned index, unsigned nr_pages) | ||
| 1052 | { | ||
| 1053 | int i; | ||
| 1054 | |||
| 1055 | for (i = index; i < index + nr_pages; i++) | ||
| 1056 | req->page_descs[i].length = PAGE_SIZE - | ||
| 1057 | req->page_descs[i].offset; | ||
| 1058 | } | ||
| 1059 | |||
| 1060 | static inline unsigned long fuse_get_user_addr(const struct iov_iter *ii) | ||
| 1061 | { | ||
| 1062 | return (unsigned long)ii->iov->iov_base + ii->iov_offset; | ||
| 1063 | } | ||
| 1064 | |||
| 1065 | static inline size_t fuse_get_frag_size(const struct iov_iter *ii, | ||
| 1066 | size_t max_size) | ||
| 1067 | { | ||
| 1068 | return min(iov_iter_single_seg_count(ii), max_size); | ||
| 1069 | } | ||
| 1070 | |||
| 1071 | static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii, | ||
| 1027 | size_t *nbytesp, int write) | 1072 | size_t *nbytesp, int write) |
| 1028 | { | 1073 | { |
| 1029 | size_t nbytes = *nbytesp; | 1074 | size_t nbytes = 0; /* # bytes already packed in req */ |
| 1030 | unsigned long user_addr = (unsigned long) buf; | ||
| 1031 | unsigned offset = user_addr & ~PAGE_MASK; | ||
| 1032 | int npages; | ||
| 1033 | 1075 | ||
| 1034 | /* Special case for kernel I/O: can copy directly into the buffer */ | 1076 | /* Special case for kernel I/O: can copy directly into the buffer */ |
| 1035 | if (segment_eq(get_fs(), KERNEL_DS)) { | 1077 | if (segment_eq(get_fs(), KERNEL_DS)) { |
| 1078 | unsigned long user_addr = fuse_get_user_addr(ii); | ||
| 1079 | size_t frag_size = fuse_get_frag_size(ii, *nbytesp); | ||
| 1080 | |||
| 1036 | if (write) | 1081 | if (write) |
| 1037 | req->in.args[1].value = (void *) user_addr; | 1082 | req->in.args[1].value = (void *) user_addr; |
| 1038 | else | 1083 | else |
| 1039 | req->out.args[0].value = (void *) user_addr; | 1084 | req->out.args[0].value = (void *) user_addr; |
| 1040 | 1085 | ||
| 1086 | iov_iter_advance(ii, frag_size); | ||
| 1087 | *nbytesp = frag_size; | ||
| 1041 | return 0; | 1088 | return 0; |
| 1042 | } | 1089 | } |
| 1043 | 1090 | ||
| 1044 | nbytes = min_t(size_t, nbytes, FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT); | 1091 | while (nbytes < *nbytesp && req->num_pages < req->max_pages) { |
| 1045 | npages = (nbytes + offset + PAGE_SIZE - 1) >> PAGE_SHIFT; | 1092 | unsigned npages; |
| 1046 | npages = clamp(npages, 1, FUSE_MAX_PAGES_PER_REQ); | 1093 | unsigned long user_addr = fuse_get_user_addr(ii); |
| 1047 | npages = get_user_pages_fast(user_addr, npages, !write, req->pages); | 1094 | unsigned offset = user_addr & ~PAGE_MASK; |
| 1048 | if (npages < 0) | 1095 | size_t frag_size = fuse_get_frag_size(ii, *nbytesp - nbytes); |
| 1049 | return npages; | 1096 | int ret; |
| 1097 | |||
| 1098 | unsigned n = req->max_pages - req->num_pages; | ||
| 1099 | frag_size = min_t(size_t, frag_size, n << PAGE_SHIFT); | ||
| 1100 | |||
| 1101 | npages = (frag_size + offset + PAGE_SIZE - 1) >> PAGE_SHIFT; | ||
| 1102 | npages = clamp(npages, 1U, n); | ||
| 1103 | |||
| 1104 | ret = get_user_pages_fast(user_addr, npages, !write, | ||
| 1105 | &req->pages[req->num_pages]); | ||
| 1106 | if (ret < 0) | ||
| 1107 | return ret; | ||
| 1050 | 1108 | ||
| 1051 | req->num_pages = npages; | 1109 | npages = ret; |
| 1052 | req->page_offset = offset; | 1110 | frag_size = min_t(size_t, frag_size, |
| 1111 | (npages << PAGE_SHIFT) - offset); | ||
| 1112 | iov_iter_advance(ii, frag_size); | ||
| 1113 | |||
| 1114 | req->page_descs[req->num_pages].offset = offset; | ||
| 1115 | fuse_page_descs_length_init(req, req->num_pages, npages); | ||
| 1116 | |||
| 1117 | req->num_pages += npages; | ||
| 1118 | req->page_descs[req->num_pages - 1].length -= | ||
| 1119 | (npages << PAGE_SHIFT) - offset - frag_size; | ||
| 1120 | |||
| 1121 | nbytes += frag_size; | ||
| 1122 | } | ||
| 1053 | 1123 | ||
| 1054 | if (write) | 1124 | if (write) |
| 1055 | req->in.argpages = 1; | 1125 | req->in.argpages = 1; |
| 1056 | else | 1126 | else |
| 1057 | req->out.argpages = 1; | 1127 | req->out.argpages = 1; |
| 1058 | 1128 | ||
| 1059 | nbytes = (req->num_pages << PAGE_SHIFT) - req->page_offset; | 1129 | *nbytesp = nbytes; |
| 1060 | *nbytesp = min(*nbytesp, nbytes); | ||
| 1061 | 1130 | ||
| 1062 | return 0; | 1131 | return 0; |
| 1063 | } | 1132 | } |
| 1064 | 1133 | ||
| 1065 | ssize_t fuse_direct_io(struct file *file, const char __user *buf, | 1134 | static inline int fuse_iter_npages(const struct iov_iter *ii_p) |
| 1066 | size_t count, loff_t *ppos, int write) | 1135 | { |
| 1136 | struct iov_iter ii = *ii_p; | ||
| 1137 | int npages = 0; | ||
| 1138 | |||
| 1139 | while (iov_iter_count(&ii) && npages < FUSE_MAX_PAGES_PER_REQ) { | ||
| 1140 | unsigned long user_addr = fuse_get_user_addr(&ii); | ||
| 1141 | unsigned offset = user_addr & ~PAGE_MASK; | ||
| 1142 | size_t frag_size = iov_iter_single_seg_count(&ii); | ||
| 1143 | |||
| 1144 | npages += (frag_size + offset + PAGE_SIZE - 1) >> PAGE_SHIFT; | ||
| 1145 | iov_iter_advance(&ii, frag_size); | ||
| 1146 | } | ||
| 1147 | |||
| 1148 | return min(npages, FUSE_MAX_PAGES_PER_REQ); | ||
| 1149 | } | ||
| 1150 | |||
| 1151 | ssize_t fuse_direct_io(struct file *file, const struct iovec *iov, | ||
| 1152 | unsigned long nr_segs, size_t count, loff_t *ppos, | ||
| 1153 | int write) | ||
| 1067 | { | 1154 | { |
| 1068 | struct fuse_file *ff = file->private_data; | 1155 | struct fuse_file *ff = file->private_data; |
| 1069 | struct fuse_conn *fc = ff->fc; | 1156 | struct fuse_conn *fc = ff->fc; |
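fuse_iter_npages() above pre-sizes a request by summing, per iovec segment, the pages needed for that segment's length plus its in-page misalignment, capped at FUSE_MAX_PAGES_PER_REQ. A user-space version of that estimate, using the POSIX struct iovec and illustrative constants (not the kernel's iov_iter machinery):

#include <stdint.h>
#include <stdio.h>
#include <sys/uio.h>

#define DEMO_PAGE_SHIFT 12
#define DEMO_PAGE_SIZE  (1u << DEMO_PAGE_SHIFT)
#define DEMO_MAX_PAGES  32

static int demo_iov_npages(const struct iovec *iov, unsigned long nr_segs)
{
	int npages = 0;

	for (unsigned long i = 0; i < nr_segs && npages < DEMO_MAX_PAGES; i++) {
		uintptr_t addr = (uintptr_t)iov[i].iov_base;
		unsigned offset = addr & (DEMO_PAGE_SIZE - 1);  /* misalignment of segment start */
		size_t len = iov[i].iov_len;

		npages += (int)((len + offset + DEMO_PAGE_SIZE - 1) >> DEMO_PAGE_SHIFT);
	}
	return npages < DEMO_MAX_PAGES ? npages : DEMO_MAX_PAGES;
}

int main(void)
{
	char a[5000], b[100];
	struct iovec iov[2] = {
		{ .iov_base = a, .iov_len = sizeof(a) },
		{ .iov_base = b, .iov_len = sizeof(b) },
	};

	printf("up to %d pages\n", demo_iov_npages(iov, 2));
	return 0;
}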
| @@ -1071,8 +1158,11 @@ ssize_t fuse_direct_io(struct file *file, const char __user *buf, | |||
| 1071 | loff_t pos = *ppos; | 1158 | loff_t pos = *ppos; |
| 1072 | ssize_t res = 0; | 1159 | ssize_t res = 0; |
| 1073 | struct fuse_req *req; | 1160 | struct fuse_req *req; |
| 1161 | struct iov_iter ii; | ||
| 1162 | |||
| 1163 | iov_iter_init(&ii, iov, nr_segs, count, 0); | ||
| 1074 | 1164 | ||
| 1075 | req = fuse_get_req(fc); | 1165 | req = fuse_get_req(fc, fuse_iter_npages(&ii)); |
| 1076 | if (IS_ERR(req)) | 1166 | if (IS_ERR(req)) |
| 1077 | return PTR_ERR(req); | 1167 | return PTR_ERR(req); |
| 1078 | 1168 | ||
| @@ -1080,7 +1170,7 @@ ssize_t fuse_direct_io(struct file *file, const char __user *buf, | |||
| 1080 | size_t nres; | 1170 | size_t nres; |
| 1081 | fl_owner_t owner = current->files; | 1171 | fl_owner_t owner = current->files; |
| 1082 | size_t nbytes = min(count, nmax); | 1172 | size_t nbytes = min(count, nmax); |
| 1083 | int err = fuse_get_user_pages(req, buf, &nbytes, write); | 1173 | int err = fuse_get_user_pages(req, &ii, &nbytes, write); |
| 1084 | if (err) { | 1174 | if (err) { |
| 1085 | res = err; | 1175 | res = err; |
| 1086 | break; | 1176 | break; |
| @@ -1103,12 +1193,11 @@ ssize_t fuse_direct_io(struct file *file, const char __user *buf, | |||
| 1103 | count -= nres; | 1193 | count -= nres; |
| 1104 | res += nres; | 1194 | res += nres; |
| 1105 | pos += nres; | 1195 | pos += nres; |
| 1106 | buf += nres; | ||
| 1107 | if (nres != nbytes) | 1196 | if (nres != nbytes) |
| 1108 | break; | 1197 | break; |
| 1109 | if (count) { | 1198 | if (count) { |
| 1110 | fuse_put_request(fc, req); | 1199 | fuse_put_request(fc, req); |
| 1111 | req = fuse_get_req(fc); | 1200 | req = fuse_get_req(fc, fuse_iter_npages(&ii)); |
| 1112 | if (IS_ERR(req)) | 1201 | if (IS_ERR(req)) |
| 1113 | break; | 1202 | break; |
| 1114 | } | 1203 | } |
| @@ -1122,8 +1211,8 @@ ssize_t fuse_direct_io(struct file *file, const char __user *buf, | |||
| 1122 | } | 1211 | } |
| 1123 | EXPORT_SYMBOL_GPL(fuse_direct_io); | 1212 | EXPORT_SYMBOL_GPL(fuse_direct_io); |
| 1124 | 1213 | ||
| 1125 | static ssize_t fuse_direct_read(struct file *file, char __user *buf, | 1214 | static ssize_t __fuse_direct_read(struct file *file, const struct iovec *iov, |
| 1126 | size_t count, loff_t *ppos) | 1215 | unsigned long nr_segs, loff_t *ppos) |
| 1127 | { | 1216 | { |
| 1128 | ssize_t res; | 1217 | ssize_t res; |
| 1129 | struct inode *inode = file->f_path.dentry->d_inode; | 1218 | struct inode *inode = file->f_path.dentry->d_inode; |
| @@ -1131,22 +1220,31 @@ static ssize_t fuse_direct_read(struct file *file, char __user *buf, | |||
| 1131 | if (is_bad_inode(inode)) | 1220 | if (is_bad_inode(inode)) |
| 1132 | return -EIO; | 1221 | return -EIO; |
| 1133 | 1222 | ||
| 1134 | res = fuse_direct_io(file, buf, count, ppos, 0); | 1223 | res = fuse_direct_io(file, iov, nr_segs, iov_length(iov, nr_segs), |
| 1224 | ppos, 0); | ||
| 1135 | 1225 | ||
| 1136 | fuse_invalidate_attr(inode); | 1226 | fuse_invalidate_attr(inode); |
| 1137 | 1227 | ||
| 1138 | return res; | 1228 | return res; |
| 1139 | } | 1229 | } |
| 1140 | 1230 | ||
| 1141 | static ssize_t __fuse_direct_write(struct file *file, const char __user *buf, | 1231 | static ssize_t fuse_direct_read(struct file *file, char __user *buf, |
| 1142 | size_t count, loff_t *ppos) | 1232 | size_t count, loff_t *ppos) |
| 1233 | { | ||
| 1234 | struct iovec iov = { .iov_base = buf, .iov_len = count }; | ||
| 1235 | return __fuse_direct_read(file, &iov, 1, ppos); | ||
| 1236 | } | ||
| 1237 | |||
| 1238 | static ssize_t __fuse_direct_write(struct file *file, const struct iovec *iov, | ||
| 1239 | unsigned long nr_segs, loff_t *ppos) | ||
| 1143 | { | 1240 | { |
| 1144 | struct inode *inode = file->f_path.dentry->d_inode; | 1241 | struct inode *inode = file->f_path.dentry->d_inode; |
| 1242 | size_t count = iov_length(iov, nr_segs); | ||
| 1145 | ssize_t res; | 1243 | ssize_t res; |
| 1146 | 1244 | ||
| 1147 | res = generic_write_checks(file, ppos, &count, 0); | 1245 | res = generic_write_checks(file, ppos, &count, 0); |
| 1148 | if (!res) { | 1246 | if (!res) { |
| 1149 | res = fuse_direct_io(file, buf, count, ppos, 1); | 1247 | res = fuse_direct_io(file, iov, nr_segs, count, ppos, 1); |
| 1150 | if (res > 0) | 1248 | if (res > 0) |
| 1151 | fuse_write_update_size(inode, *ppos); | 1249 | fuse_write_update_size(inode, *ppos); |
| 1152 | } | 1250 | } |
| @@ -1159,6 +1257,7 @@ static ssize_t __fuse_direct_write(struct file *file, const char __user *buf, | |||
| 1159 | static ssize_t fuse_direct_write(struct file *file, const char __user *buf, | 1257 | static ssize_t fuse_direct_write(struct file *file, const char __user *buf, |
| 1160 | size_t count, loff_t *ppos) | 1258 | size_t count, loff_t *ppos) |
| 1161 | { | 1259 | { |
| 1260 | struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = count }; | ||
| 1162 | struct inode *inode = file->f_path.dentry->d_inode; | 1261 | struct inode *inode = file->f_path.dentry->d_inode; |
| 1163 | ssize_t res; | 1262 | ssize_t res; |
| 1164 | 1263 | ||
| @@ -1167,7 +1266,7 @@ static ssize_t fuse_direct_write(struct file *file, const char __user *buf, | |||
| 1167 | 1266 | ||
| 1168 | /* Don't allow parallel writes to the same file */ | 1267 | /* Don't allow parallel writes to the same file */ |
| 1169 | mutex_lock(&inode->i_mutex); | 1268 | mutex_lock(&inode->i_mutex); |
| 1170 | res = __fuse_direct_write(file, buf, count, ppos); | 1269 | res = __fuse_direct_write(file, &iov, 1, ppos); |
| 1171 | mutex_unlock(&inode->i_mutex); | 1270 | mutex_unlock(&inode->i_mutex); |
| 1172 | 1271 | ||
| 1173 | return res; | 1272 | return res; |
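With fuse_direct_io() now taking an iovec, the old flat-buffer entry points survive as thin wrappers that package (buf, count) into a single-element iovec, as fuse_direct_read() and fuse_direct_write() do above. A user-space sketch of that wrapper pattern; the demo_* names are invented and the worker only pretends to move data.

#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <sys/uio.h>

/* The iovec-based worker; stands in for the __fuse_direct_read() path. */
static ssize_t demo_direct_read(const struct iovec *iov, unsigned long nr_segs)
{
	size_t total = 0;

	for (unsigned long i = 0; i < nr_segs; i++) {
		memset(iov[i].iov_base, 'x', iov[i].iov_len);  /* pretend we read data */
		total += iov[i].iov_len;
	}
	return (ssize_t)total;
}

/* The old-style entry point, kept as a thin wrapper around the iovec path. */
static ssize_t demo_read(char *buf, size_t count)
{
	struct iovec iov = { .iov_base = buf, .iov_len = count };

	return demo_direct_read(&iov, 1);
}

int main(void)
{
	char buf[8];

	printf("read %zd bytes\n", demo_read(buf, sizeof(buf)));
	return 0;
}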
| @@ -1272,7 +1371,7 @@ static int fuse_writepage_locked(struct page *page) | |||
| 1272 | 1371 | ||
| 1273 | set_page_writeback(page); | 1372 | set_page_writeback(page); |
| 1274 | 1373 | ||
| 1275 | req = fuse_request_alloc_nofs(); | 1374 | req = fuse_request_alloc_nofs(1); |
| 1276 | if (!req) | 1375 | if (!req) |
| 1277 | goto err; | 1376 | goto err; |
| 1278 | 1377 | ||
| @@ -1293,7 +1392,8 @@ static int fuse_writepage_locked(struct page *page) | |||
| 1293 | req->in.argpages = 1; | 1392 | req->in.argpages = 1; |
| 1294 | req->num_pages = 1; | 1393 | req->num_pages = 1; |
| 1295 | req->pages[0] = tmp_page; | 1394 | req->pages[0] = tmp_page; |
| 1296 | req->page_offset = 0; | 1395 | req->page_descs[0].offset = 0; |
| 1396 | req->page_descs[0].length = PAGE_SIZE; | ||
| 1297 | req->end = fuse_writepage_end; | 1397 | req->end = fuse_writepage_end; |
| 1298 | req->inode = inode; | 1398 | req->inode = inode; |
| 1299 | 1399 | ||
| @@ -1471,7 +1571,7 @@ static int fuse_getlk(struct file *file, struct file_lock *fl) | |||
| 1471 | struct fuse_lk_out outarg; | 1571 | struct fuse_lk_out outarg; |
| 1472 | int err; | 1572 | int err; |
| 1473 | 1573 | ||
| 1474 | req = fuse_get_req(fc); | 1574 | req = fuse_get_req_nopages(fc); |
| 1475 | if (IS_ERR(req)) | 1575 | if (IS_ERR(req)) |
| 1476 | return PTR_ERR(req); | 1576 | return PTR_ERR(req); |
| 1477 | 1577 | ||
| @@ -1506,7 +1606,7 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock) | |||
| 1506 | if (fl->fl_flags & FL_CLOSE) | 1606 | if (fl->fl_flags & FL_CLOSE) |
| 1507 | return 0; | 1607 | return 0; |
| 1508 | 1608 | ||
| 1509 | req = fuse_get_req(fc); | 1609 | req = fuse_get_req_nopages(fc); |
| 1510 | if (IS_ERR(req)) | 1610 | if (IS_ERR(req)) |
| 1511 | return PTR_ERR(req); | 1611 | return PTR_ERR(req); |
| 1512 | 1612 | ||
| @@ -1575,7 +1675,7 @@ static sector_t fuse_bmap(struct address_space *mapping, sector_t block) | |||
| 1575 | if (!inode->i_sb->s_bdev || fc->no_bmap) | 1675 | if (!inode->i_sb->s_bdev || fc->no_bmap) |
| 1576 | return 0; | 1676 | return 0; |
| 1577 | 1677 | ||
| 1578 | req = fuse_get_req(fc); | 1678 | req = fuse_get_req_nopages(fc); |
| 1579 | if (IS_ERR(req)) | 1679 | if (IS_ERR(req)) |
| 1580 | return 0; | 1680 | return 0; |
| 1581 | 1681 | ||
| @@ -1873,7 +1973,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, | |||
| 1873 | num_pages++; | 1973 | num_pages++; |
| 1874 | } | 1974 | } |
| 1875 | 1975 | ||
| 1876 | req = fuse_get_req(fc); | 1976 | req = fuse_get_req(fc, num_pages); |
| 1877 | if (IS_ERR(req)) { | 1977 | if (IS_ERR(req)) { |
| 1878 | err = PTR_ERR(req); | 1978 | err = PTR_ERR(req); |
| 1879 | req = NULL; | 1979 | req = NULL; |
| @@ -1881,6 +1981,7 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, | |||
| 1881 | } | 1981 | } |
| 1882 | memcpy(req->pages, pages, sizeof(req->pages[0]) * num_pages); | 1982 | memcpy(req->pages, pages, sizeof(req->pages[0]) * num_pages); |
| 1883 | req->num_pages = num_pages; | 1983 | req->num_pages = num_pages; |
| 1984 | fuse_page_descs_length_init(req, 0, req->num_pages); | ||
| 1884 | 1985 | ||
| 1885 | /* okay, let's send it to the client */ | 1986 | /* okay, let's send it to the client */ |
| 1886 | req->in.h.opcode = FUSE_IOCTL; | 1987 | req->in.h.opcode = FUSE_IOCTL; |
| @@ -1981,7 +2082,7 @@ long fuse_ioctl_common(struct file *file, unsigned int cmd, | |||
| 1981 | struct inode *inode = file->f_dentry->d_inode; | 2082 | struct inode *inode = file->f_dentry->d_inode; |
| 1982 | struct fuse_conn *fc = get_fuse_conn(inode); | 2083 | struct fuse_conn *fc = get_fuse_conn(inode); |
| 1983 | 2084 | ||
| 1984 | if (!fuse_allow_task(fc, current)) | 2085 | if (!fuse_allow_current_process(fc)) |
| 1985 | return -EACCES; | 2086 | return -EACCES; |
| 1986 | 2087 | ||
| 1987 | if (is_bad_inode(inode)) | 2088 | if (is_bad_inode(inode)) |
| @@ -2066,6 +2167,7 @@ unsigned fuse_file_poll(struct file *file, poll_table *wait) | |||
| 2066 | return DEFAULT_POLLMASK; | 2167 | return DEFAULT_POLLMASK; |
| 2067 | 2168 | ||
| 2068 | poll_wait(file, &ff->poll_wait, wait); | 2169 | poll_wait(file, &ff->poll_wait, wait); |
| 2170 | inarg.events = (__u32)poll_requested_events(wait); | ||
| 2069 | 2171 | ||
| 2070 | /* | 2172 | /* |
| 2071 | * Ask for notification iff there's someone waiting for it. | 2173 | * Ask for notification iff there's someone waiting for it. |
| @@ -2076,7 +2178,7 @@ unsigned fuse_file_poll(struct file *file, poll_table *wait) | |||
| 2076 | fuse_register_polled_file(fc, ff); | 2178 | fuse_register_polled_file(fc, ff); |
| 2077 | } | 2179 | } |
| 2078 | 2180 | ||
| 2079 | req = fuse_get_req(fc); | 2181 | req = fuse_get_req_nopages(fc); |
| 2080 | if (IS_ERR(req)) | 2182 | if (IS_ERR(req)) |
| 2081 | return POLLERR; | 2183 | return POLLERR; |
| 2082 | 2184 | ||
| @@ -2126,41 +2228,6 @@ int fuse_notify_poll_wakeup(struct fuse_conn *fc, | |||
| 2126 | return 0; | 2228 | return 0; |
| 2127 | } | 2229 | } |
| 2128 | 2230 | ||
| 2129 | static ssize_t fuse_loop_dio(struct file *filp, const struct iovec *iov, | ||
| 2130 | unsigned long nr_segs, loff_t *ppos, int rw) | ||
| 2131 | { | ||
| 2132 | const struct iovec *vector = iov; | ||
| 2133 | ssize_t ret = 0; | ||
| 2134 | |||
| 2135 | while (nr_segs > 0) { | ||
| 2136 | void __user *base; | ||
| 2137 | size_t len; | ||
| 2138 | ssize_t nr; | ||
| 2139 | |||
| 2140 | base = vector->iov_base; | ||
| 2141 | len = vector->iov_len; | ||
| 2142 | vector++; | ||
| 2143 | nr_segs--; | ||
| 2144 | |||
| 2145 | if (rw == WRITE) | ||
| 2146 | nr = __fuse_direct_write(filp, base, len, ppos); | ||
| 2147 | else | ||
| 2148 | nr = fuse_direct_read(filp, base, len, ppos); | ||
| 2149 | |||
| 2150 | if (nr < 0) { | ||
| 2151 | if (!ret) | ||
| 2152 | ret = nr; | ||
| 2153 | break; | ||
| 2154 | } | ||
| 2155 | ret += nr; | ||
| 2156 | if (nr != len) | ||
| 2157 | break; | ||
| 2158 | } | ||
| 2159 | |||
| 2160 | return ret; | ||
| 2161 | } | ||
| 2162 | |||
| 2163 | |||
| 2164 | static ssize_t | 2231 | static ssize_t |
| 2165 | fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, | 2232 | fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, |
| 2166 | loff_t offset, unsigned long nr_segs) | 2233 | loff_t offset, unsigned long nr_segs) |
| @@ -2172,13 +2239,16 @@ fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, | |||
| 2172 | file = iocb->ki_filp; | 2239 | file = iocb->ki_filp; |
| 2173 | pos = offset; | 2240 | pos = offset; |
| 2174 | 2241 | ||
| 2175 | ret = fuse_loop_dio(file, iov, nr_segs, &pos, rw); | 2242 | if (rw == WRITE) |
| 2243 | ret = __fuse_direct_write(file, iov, nr_segs, &pos); | ||
| 2244 | else | ||
| 2245 | ret = __fuse_direct_read(file, iov, nr_segs, &pos); | ||
| 2176 | 2246 | ||
| 2177 | return ret; | 2247 | return ret; |
| 2178 | } | 2248 | } |
| 2179 | 2249 | ||
| 2180 | long fuse_file_fallocate(struct file *file, int mode, loff_t offset, | 2250 | static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, |
| 2181 | loff_t length) | 2251 | loff_t length) |
| 2182 | { | 2252 | { |
| 2183 | struct fuse_file *ff = file->private_data; | 2253 | struct fuse_file *ff = file->private_data; |
| 2184 | struct fuse_conn *fc = ff->fc; | 2254 | struct fuse_conn *fc = ff->fc; |
| @@ -2194,7 +2264,7 @@ long fuse_file_fallocate(struct file *file, int mode, loff_t offset, | |||
| 2194 | if (fc->no_fallocate) | 2264 | if (fc->no_fallocate) |
| 2195 | return -EOPNOTSUPP; | 2265 | return -EOPNOTSUPP; |
| 2196 | 2266 | ||
| 2197 | req = fuse_get_req(fc); | 2267 | req = fuse_get_req_nopages(fc); |
| 2198 | if (IS_ERR(req)) | 2268 | if (IS_ERR(req)) |
| 2199 | return PTR_ERR(req); | 2269 | return PTR_ERR(req); |
| 2200 | 2270 | ||
| @@ -2213,7 +2283,6 @@ long fuse_file_fallocate(struct file *file, int mode, loff_t offset, | |||
| 2213 | 2283 | ||
| 2214 | return err; | 2284 | return err; |
| 2215 | } | 2285 | } |
| 2216 | EXPORT_SYMBOL_GPL(fuse_file_fallocate); | ||
| 2217 | 2286 | ||
| 2218 | static const struct file_operations fuse_file_operations = { | 2287 | static const struct file_operations fuse_file_operations = { |
| 2219 | .llseek = fuse_file_llseek, | 2288 | .llseek = fuse_file_llseek, |
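For reference, the pattern that fuse_file_poll() and fuse_file_fallocate() switch to above (requests that carry no page payload) looks roughly like the sketch below. It is a minimal illustration, not code from this patch: fuse_request_send(), fuse_put_request() and the nodeid plumbing are assumptions based on the surrounding fuse code; only fuse_get_req_nopages() comes from this series.

/* Hedged sketch: issue a FUSE request that carries no page data, so no
 * pages[]/page_descs[] arrays need to be allocated for it. */
static int fuse_send_pageless(struct fuse_conn *fc, u64 nodeid, u32 opcode)
{
	struct fuse_req *req;
	int err;

	req = fuse_get_req_nopages(fc);	/* equivalent to fuse_get_req(fc, 0) */
	if (IS_ERR(req))
		return PTR_ERR(req);

	req->in.h.opcode = opcode;
	req->in.h.nodeid = nodeid;
	fuse_request_send(fc, req);	/* assumed helper, not in this hunk */
	err = req->out.h.error;
	fuse_put_request(fc, req);	/* assumed helper, not in this hunk */
	return err;
}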
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index e105a53fc72d..6aeba864f070 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h | |||
| @@ -44,6 +44,9 @@ | |||
| 44 | doing the mount will be allowed to access the filesystem */ | 44 | doing the mount will be allowed to access the filesystem */ |
| 45 | #define FUSE_ALLOW_OTHER (1 << 1) | 45 | #define FUSE_ALLOW_OTHER (1 << 1) |
| 46 | 46 | ||
| 47 | /** Number of page pointers embedded in fuse_req */ | ||
| 48 | #define FUSE_REQ_INLINE_PAGES 1 | ||
| 49 | |||
| 47 | /** List of active connections */ | 50 | /** List of active connections */ |
| 48 | extern struct list_head fuse_conn_list; | 51 | extern struct list_head fuse_conn_list; |
| 49 | 52 | ||
| @@ -103,6 +106,15 @@ struct fuse_inode { | |||
| 103 | 106 | ||
| 104 | /** List of writepage requests (pending or sent) */ | 107 | /** List of writepage requests (pending or sent) */ |
| 105 | struct list_head writepages; | 108 | struct list_head writepages; |
| 109 | |||
| 110 | /** Miscellaneous bits describing inode state */ | ||
| 111 | unsigned long state; | ||
| 112 | }; | ||
| 113 | |||
| 114 | /** FUSE inode state bits */ | ||
| 115 | enum { | ||
| 116 | /** Advise readdirplus */ | ||
| 117 | FUSE_I_ADVISE_RDPLUS, | ||
| 106 | }; | 118 | }; |
| 107 | 119 | ||
| 108 | struct fuse_conn; | 120 | struct fuse_conn; |
| @@ -200,6 +212,12 @@ struct fuse_out { | |||
| 200 | struct fuse_arg args[3]; | 212 | struct fuse_arg args[3]; |
| 201 | }; | 213 | }; |
| 202 | 214 | ||
| 215 | /** FUSE page descriptor */ | ||
| 216 | struct fuse_page_desc { | ||
| 217 | unsigned int length; | ||
| 218 | unsigned int offset; | ||
| 219 | }; | ||
| 220 | |||
| 203 | /** The request state */ | 221 | /** The request state */ |
| 204 | enum fuse_req_state { | 222 | enum fuse_req_state { |
| 205 | FUSE_REQ_INIT = 0, | 223 | FUSE_REQ_INIT = 0, |
| @@ -291,14 +309,23 @@ struct fuse_req { | |||
| 291 | } misc; | 309 | } misc; |
| 292 | 310 | ||
| 293 | /** page vector */ | 311 | /** page vector */ |
| 294 | struct page *pages[FUSE_MAX_PAGES_PER_REQ]; | 312 | struct page **pages; |
| 313 | |||
| 314 | /** page-descriptor vector */ | ||
| 315 | struct fuse_page_desc *page_descs; | ||
| 316 | |||
| 317 | /** size of the 'pages' array */ | ||
| 318 | unsigned max_pages; | ||
| 319 | |||
| 320 | /** inline page vector */ | ||
| 321 | struct page *inline_pages[FUSE_REQ_INLINE_PAGES]; | ||
| 322 | |||
| 323 | /** inline page-descriptor vector */ | ||
| 324 | struct fuse_page_desc inline_page_descs[FUSE_REQ_INLINE_PAGES]; | ||
| 295 | 325 | ||
| 296 | /** number of pages in vector */ | 326 | /** number of pages in vector */ |
| 297 | unsigned num_pages; | 327 | unsigned num_pages; |
| 298 | 328 | ||
| 299 | /** offset of data on first page */ | ||
| 300 | unsigned page_offset; | ||
| 301 | |||
| 302 | /** File used in the request (or NULL) */ | 329 | /** File used in the request (or NULL) */ |
| 303 | struct fuse_file *ff; | 330 | struct fuse_file *ff; |
| 304 | 331 | ||
| @@ -487,6 +514,12 @@ struct fuse_conn { | |||
| 487 | /** Use enhanced/automatic page cache invalidation. */ | 514 | /** Use enhanced/automatic page cache invalidation. */ |
| 488 | unsigned auto_inval_data:1; | 515 | unsigned auto_inval_data:1; |
| 489 | 516 | ||
| 517 | /** Does the filesystem support readdirplus? */ | ||
| 518 | unsigned do_readdirplus:1; | ||
| 519 | |||
| 520 | /** Does the filesystem want adaptive readdirplus? */ | ||
| 521 | unsigned readdirplus_auto:1; | ||
| 522 | |||
| 490 | /** The number of requests waiting for completion */ | 523 | /** The number of requests waiting for completion */ |
| 491 | atomic_t num_waiting; | 524 | atomic_t num_waiting; |
| 492 | 525 | ||
| @@ -578,6 +611,9 @@ void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget, | |||
| 578 | 611 | ||
| 579 | struct fuse_forget_link *fuse_alloc_forget(void); | 612 | struct fuse_forget_link *fuse_alloc_forget(void); |
| 580 | 613 | ||
| 614 | /* Used by READDIRPLUS */ | ||
| 615 | void fuse_force_forget(struct file *file, u64 nodeid); | ||
| 616 | |||
| 581 | /** | 617 | /** |
| 582 | * Initialize READ or READDIR request | 618 | * Initialize READ or READDIR request |
| 583 | */ | 619 | */ |
| @@ -658,9 +694,9 @@ void fuse_ctl_cleanup(void); | |||
| 658 | /** | 694 | /** |
| 659 | * Allocate a request | 695 | * Allocate a request |
| 660 | */ | 696 | */ |
| 661 | struct fuse_req *fuse_request_alloc(void); | 697 | struct fuse_req *fuse_request_alloc(unsigned npages); |
| 662 | 698 | ||
| 663 | struct fuse_req *fuse_request_alloc_nofs(void); | 699 | struct fuse_req *fuse_request_alloc_nofs(unsigned npages); |
| 664 | 700 | ||
| 665 | /** | 701 | /** |
| 666 | * Free a request | 702 | * Free a request |
| @@ -668,14 +704,25 @@ struct fuse_req *fuse_request_alloc_nofs(void); | |||
| 668 | void fuse_request_free(struct fuse_req *req); | 704 | void fuse_request_free(struct fuse_req *req); |
| 669 | 705 | ||
| 670 | /** | 706 | /** |
| 671 | * Get a request, may fail with -ENOMEM | 707 | * Get a request, may fail with -ENOMEM, |
| 708 | * the caller should specify the number of elements in req->pages[] explicitly | ||
| 672 | */ | 709 | */ |
| 673 | struct fuse_req *fuse_get_req(struct fuse_conn *fc); | 710 | struct fuse_req *fuse_get_req(struct fuse_conn *fc, unsigned npages); |
| 711 | |||
| 712 | /** | ||
| 713 | * Get a request, may fail with -ENOMEM, | ||
| 714 | * useful for callers that don't use req->pages[] | ||
| 715 | */ | ||
| 716 | static inline struct fuse_req *fuse_get_req_nopages(struct fuse_conn *fc) | ||
| 717 | { | ||
| 718 | return fuse_get_req(fc, 0); | ||
| 719 | } | ||
| 674 | 720 | ||
| 675 | /** | 721 | /** |
| 676 | * Gets a request for a file operation, always succeeds | 722 | * Gets a request for a file operation, always succeeds |
| 677 | */ | 723 | */ |
| 678 | struct fuse_req *fuse_get_req_nofail(struct fuse_conn *fc, struct file *file); | 724 | struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc, |
| 725 | struct file *file); | ||
| 679 | 726 | ||
| 680 | /** | 727 | /** |
| 681 | * Decrement reference count of a request. If count goes to zero free | 728 | * Decrement reference count of a request. If count goes to zero free |
| @@ -739,9 +786,9 @@ void fuse_ctl_remove_conn(struct fuse_conn *fc); | |||
| 739 | int fuse_valid_type(int m); | 786 | int fuse_valid_type(int m); |
| 740 | 787 | ||
| 741 | /** | 788 | /** |
| 742 | * Is task allowed to perform filesystem operation? | 789 | * Is current process allowed to perform filesystem operation? |
| 743 | */ | 790 | */ |
| 744 | int fuse_allow_task(struct fuse_conn *fc, struct task_struct *task); | 791 | int fuse_allow_current_process(struct fuse_conn *fc); |
| 745 | 792 | ||
| 746 | u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id); | 793 | u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id); |
| 747 | 794 | ||
| @@ -776,8 +823,9 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, | |||
| 776 | 823 | ||
| 777 | int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file, | 824 | int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file, |
| 778 | bool isdir); | 825 | bool isdir); |
| 779 | ssize_t fuse_direct_io(struct file *file, const char __user *buf, | 826 | ssize_t fuse_direct_io(struct file *file, const struct iovec *iov, |
| 780 | size_t count, loff_t *ppos, int write); | 827 | unsigned long nr_segs, size_t count, loff_t *ppos, |
| 828 | int write); | ||
| 781 | long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, | 829 | long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, |
| 782 | unsigned int flags); | 830 | unsigned int flags); |
| 783 | long fuse_ioctl_common(struct file *file, unsigned int cmd, | 831 | long fuse_ioctl_common(struct file *file, unsigned int cmd, |
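The declarations above size the request's page vector at allocation time, so a caller that does carry page data asks fuse_get_req() for the number of pages it needs and fills both pages[] and the new page_descs[]. Below is a rough sketch of that fill pattern; only fuse_get_req(), req->pages, req->page_descs, req->num_pages and FUSE_MAX_PAGES_PER_REQ are taken from this header, the wrapper itself is hypothetical.

/* Hedged sketch: build a request sized for 'npages' pages of payload. */
static struct fuse_req *fuse_req_for_pages(struct fuse_conn *fc,
					   struct page **pages,
					   unsigned npages, size_t count)
{
	struct fuse_req *req;
	unsigned i;

	req = fuse_get_req(fc, npages);	/* 0 <= npages <= FUSE_MAX_PAGES_PER_REQ */
	if (IS_ERR(req))
		return req;

	for (i = 0; i < npages; i++) {
		req->pages[i] = pages[i];
		req->page_descs[i].offset = 0;
		req->page_descs[i].length = min_t(size_t, count, PAGE_SIZE);
		count -= req->page_descs[i].length;
	}
	req->num_pages = npages;
	return req;
}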
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 73ca6b72beaf..01353ed75750 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c | |||
| @@ -92,6 +92,7 @@ static struct inode *fuse_alloc_inode(struct super_block *sb) | |||
| 92 | fi->attr_version = 0; | 92 | fi->attr_version = 0; |
| 93 | fi->writectr = 0; | 93 | fi->writectr = 0; |
| 94 | fi->orig_ino = 0; | 94 | fi->orig_ino = 0; |
| 95 | fi->state = 0; | ||
| 95 | INIT_LIST_HEAD(&fi->write_files); | 96 | INIT_LIST_HEAD(&fi->write_files); |
| 96 | INIT_LIST_HEAD(&fi->queued_writes); | 97 | INIT_LIST_HEAD(&fi->queued_writes); |
| 97 | INIT_LIST_HEAD(&fi->writepages); | 98 | INIT_LIST_HEAD(&fi->writepages); |
| @@ -408,12 +409,12 @@ static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf) | |||
| 408 | struct fuse_statfs_out outarg; | 409 | struct fuse_statfs_out outarg; |
| 409 | int err; | 410 | int err; |
| 410 | 411 | ||
| 411 | if (!fuse_allow_task(fc, current)) { | 412 | if (!fuse_allow_current_process(fc)) { |
| 412 | buf->f_type = FUSE_SUPER_MAGIC; | 413 | buf->f_type = FUSE_SUPER_MAGIC; |
| 413 | return 0; | 414 | return 0; |
| 414 | } | 415 | } |
| 415 | 416 | ||
| 416 | req = fuse_get_req(fc); | 417 | req = fuse_get_req_nopages(fc); |
| 417 | if (IS_ERR(req)) | 418 | if (IS_ERR(req)) |
| 418 | return PTR_ERR(req); | 419 | return PTR_ERR(req); |
| 419 | 420 | ||
| @@ -863,6 +864,10 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) | |||
| 863 | fc->dont_mask = 1; | 864 | fc->dont_mask = 1; |
| 864 | if (arg->flags & FUSE_AUTO_INVAL_DATA) | 865 | if (arg->flags & FUSE_AUTO_INVAL_DATA) |
| 865 | fc->auto_inval_data = 1; | 866 | fc->auto_inval_data = 1; |
| 867 | if (arg->flags & FUSE_DO_READDIRPLUS) | ||
| 868 | fc->do_readdirplus = 1; | ||
| 869 | if (arg->flags & FUSE_READDIRPLUS_AUTO) | ||
| 870 | fc->readdirplus_auto = 1; | ||
| 866 | } else { | 871 | } else { |
| 867 | ra_pages = fc->max_read / PAGE_CACHE_SIZE; | 872 | ra_pages = fc->max_read / PAGE_CACHE_SIZE; |
| 868 | fc->no_lock = 1; | 873 | fc->no_lock = 1; |
| @@ -889,7 +894,8 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) | |||
| 889 | arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC | | 894 | arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC | |
| 890 | FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK | | 895 | FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK | |
| 891 | FUSE_SPLICE_WRITE | FUSE_SPLICE_MOVE | FUSE_SPLICE_READ | | 896 | FUSE_SPLICE_WRITE | FUSE_SPLICE_MOVE | FUSE_SPLICE_READ | |
| 892 | FUSE_FLOCK_LOCKS | FUSE_IOCTL_DIR | FUSE_AUTO_INVAL_DATA; | 897 | FUSE_FLOCK_LOCKS | FUSE_IOCTL_DIR | FUSE_AUTO_INVAL_DATA | |
| 898 | FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO; | ||
| 893 | req->in.h.opcode = FUSE_INIT; | 899 | req->in.h.opcode = FUSE_INIT; |
| 894 | req->in.numargs = 1; | 900 | req->in.numargs = 1; |
| 895 | req->in.args[0].size = sizeof(*arg); | 901 | req->in.args[0].size = sizeof(*arg); |
| @@ -1034,12 +1040,12 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) | |||
| 1034 | /* only now - we want root dentry with NULL ->d_op */ | 1040 | /* only now - we want root dentry with NULL ->d_op */ |
| 1035 | sb->s_d_op = &fuse_dentry_operations; | 1041 | sb->s_d_op = &fuse_dentry_operations; |
| 1036 | 1042 | ||
| 1037 | init_req = fuse_request_alloc(); | 1043 | init_req = fuse_request_alloc(0); |
| 1038 | if (!init_req) | 1044 | if (!init_req) |
| 1039 | goto err_put_root; | 1045 | goto err_put_root; |
| 1040 | 1046 | ||
| 1041 | if (is_bdev) { | 1047 | if (is_bdev) { |
| 1042 | fc->destroy_req = fuse_request_alloc(); | 1048 | fc->destroy_req = fuse_request_alloc(0); |
| 1043 | if (!fc->destroy_req) | 1049 | if (!fc->destroy_req) |
| 1044 | goto err_free_init_req; | 1050 | goto err_free_init_req; |
| 1045 | } | 1051 | } |
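The two INIT flags negotiated above separate "the server implements readdirplus" (do_readdirplus) from "the kernel should decide per directory" (readdirplus_auto), with the FUSE_I_ADVISE_RDPLUS inode state bit from fuse_i.h carrying the per-inode hint in the adaptive case. A hedged sketch of how a readdir path might consult them follows; get_fuse_inode() and the surrounding readdir code are assumptions, only the two connection bits and the inode state bit come from this diff.

/* Hedged sketch: choose between plain READDIR and READDIRPLUS. */
static bool fuse_use_readdirplus(struct fuse_conn *fc, struct inode *dir)
{
	struct fuse_inode *fi = get_fuse_inode(dir);	/* assumed container_of helper */

	if (!fc->do_readdirplus)
		return false;		/* server never offered it */
	if (!fc->readdirplus_auto)
		return true;		/* unconditional readdirplus */
	/* adaptive mode: only when a recent lookup advised it */
	return test_and_clear_bit(FUSE_I_ADVISE_RDPLUS, &fi->state);
}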
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 30de4f2a2ea9..24f414f0ce61 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c | |||
| @@ -51,7 +51,7 @@ static void gfs2_page_add_databufs(struct gfs2_inode *ip, struct page *page, | |||
| 51 | continue; | 51 | continue; |
| 52 | if (gfs2_is_jdata(ip)) | 52 | if (gfs2_is_jdata(ip)) |
| 53 | set_buffer_uptodate(bh); | 53 | set_buffer_uptodate(bh); |
| 54 | gfs2_trans_add_bh(ip->i_gl, bh, 0); | 54 | gfs2_trans_add_data(ip->i_gl, bh); |
| 55 | } | 55 | } |
| 56 | } | 56 | } |
| 57 | 57 | ||
| @@ -230,16 +230,14 @@ out_ignore: | |||
| 230 | } | 230 | } |
| 231 | 231 | ||
| 232 | /** | 232 | /** |
| 233 | * gfs2_writeback_writepages - Write a bunch of dirty pages back to disk | 233 | * gfs2_writepages - Write a bunch of dirty pages back to disk |
| 234 | * @mapping: The mapping to write | 234 | * @mapping: The mapping to write |
| 235 | * @wbc: Write-back control | 235 | * @wbc: Write-back control |
| 236 | * | 236 | * |
| 237 | * For the data=writeback case we can already ignore buffer heads | 237 | * Used for both ordered and writeback modes. |
| 238 | * and write whole extents at once. This is a big reduction in the | ||
| 239 | * number of I/O requests we send and the bmap calls we make in this case. | ||
| 240 | */ | 238 | */ |
| 241 | static int gfs2_writeback_writepages(struct address_space *mapping, | 239 | static int gfs2_writepages(struct address_space *mapping, |
| 242 | struct writeback_control *wbc) | 240 | struct writeback_control *wbc) |
| 243 | { | 241 | { |
| 244 | return mpage_writepages(mapping, wbc, gfs2_get_block_noalloc); | 242 | return mpage_writepages(mapping, wbc, gfs2_get_block_noalloc); |
| 245 | } | 243 | } |
| @@ -852,7 +850,7 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping, | |||
| 852 | goto failed; | 850 | goto failed; |
| 853 | } | 851 | } |
| 854 | 852 | ||
| 855 | gfs2_trans_add_bh(ip->i_gl, dibh, 1); | 853 | gfs2_trans_add_meta(ip->i_gl, dibh); |
| 856 | 854 | ||
| 857 | if (gfs2_is_stuffed(ip)) | 855 | if (gfs2_is_stuffed(ip)) |
| 858 | return gfs2_stuffed_write_end(inode, dibh, pos, len, copied, page); | 856 | return gfs2_stuffed_write_end(inode, dibh, pos, len, copied, page); |
| @@ -1102,7 +1100,7 @@ cannot_release: | |||
| 1102 | 1100 | ||
| 1103 | static const struct address_space_operations gfs2_writeback_aops = { | 1101 | static const struct address_space_operations gfs2_writeback_aops = { |
| 1104 | .writepage = gfs2_writeback_writepage, | 1102 | .writepage = gfs2_writeback_writepage, |
| 1105 | .writepages = gfs2_writeback_writepages, | 1103 | .writepages = gfs2_writepages, |
| 1106 | .readpage = gfs2_readpage, | 1104 | .readpage = gfs2_readpage, |
| 1107 | .readpages = gfs2_readpages, | 1105 | .readpages = gfs2_readpages, |
| 1108 | .write_begin = gfs2_write_begin, | 1106 | .write_begin = gfs2_write_begin, |
| @@ -1118,6 +1116,7 @@ static const struct address_space_operations gfs2_writeback_aops = { | |||
| 1118 | 1116 | ||
| 1119 | static const struct address_space_operations gfs2_ordered_aops = { | 1117 | static const struct address_space_operations gfs2_ordered_aops = { |
| 1120 | .writepage = gfs2_ordered_writepage, | 1118 | .writepage = gfs2_ordered_writepage, |
| 1119 | .writepages = gfs2_writepages, | ||
| 1121 | .readpage = gfs2_readpage, | 1120 | .readpage = gfs2_readpage, |
| 1122 | .readpages = gfs2_readpages, | 1121 | .readpages = gfs2_readpages, |
| 1123 | .write_begin = gfs2_write_begin, | 1122 | .write_begin = gfs2_write_begin, |
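The recurring change in the gfs2 hunks that follow is mechanical: gfs2_trans_add_bh(gl, bh, 1) becomes gfs2_trans_add_meta(gl, bh) for journaled metadata, and gfs2_trans_add_bh(gl, bh, 0) becomes gfs2_trans_add_data(gl, bh) for data buffers. A typical dinode update then reads roughly as in the sketch below; gfs2_trans_begin()/gfs2_trans_end() and RES_DINODE are assumed from the wider gfs2 transaction API and are not part of these hunks.

/* Hedged sketch of the post-change calling convention for a metadata
 * buffer; only gfs2_meta_inode_buffer(), gfs2_trans_add_meta(),
 * gfs2_dinode_out() and brelse() appear in the hunks here. */
static int gfs2_touch_dinode(struct gfs2_inode *ip)
{
	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
	struct buffer_head *dibh;
	int error;

	error = gfs2_trans_begin(sdp, RES_DINODE, 0);	/* assumed API */
	if (error)
		return error;

	error = gfs2_meta_inode_buffer(ip, &dibh);
	if (!error) {
		gfs2_trans_add_meta(ip->i_gl, dibh);	/* was gfs2_trans_add_bh(..., 1) */
		gfs2_dinode_out(ip, dibh->b_data);
		brelse(dibh);
	}

	gfs2_trans_end(sdp);	/* assumed API */
	return error;
}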
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index a68e91bcef3d..df686d13a7d2 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c | |||
| @@ -22,6 +22,7 @@ | |||
| 22 | #include "meta_io.h" | 22 | #include "meta_io.h" |
| 23 | #include "quota.h" | 23 | #include "quota.h" |
| 24 | #include "rgrp.h" | 24 | #include "rgrp.h" |
| 25 | #include "log.h" | ||
| 25 | #include "super.h" | 26 | #include "super.h" |
| 26 | #include "trans.h" | 27 | #include "trans.h" |
| 27 | #include "dir.h" | 28 | #include "dir.h" |
| @@ -93,7 +94,7 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh, | |||
| 93 | if (!gfs2_is_jdata(ip)) | 94 | if (!gfs2_is_jdata(ip)) |
| 94 | mark_buffer_dirty(bh); | 95 | mark_buffer_dirty(bh); |
| 95 | if (!gfs2_is_writeback(ip)) | 96 | if (!gfs2_is_writeback(ip)) |
| 96 | gfs2_trans_add_bh(ip->i_gl, bh, 0); | 97 | gfs2_trans_add_data(ip->i_gl, bh); |
| 97 | 98 | ||
| 98 | if (release) { | 99 | if (release) { |
| 99 | unlock_page(page); | 100 | unlock_page(page); |
| @@ -153,7 +154,7 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page) | |||
| 153 | 154 | ||
| 154 | /* Set up the pointer to the new block */ | 155 | /* Set up the pointer to the new block */ |
| 155 | 156 | ||
| 156 | gfs2_trans_add_bh(ip->i_gl, dibh, 1); | 157 | gfs2_trans_add_meta(ip->i_gl, dibh); |
| 157 | di = (struct gfs2_dinode *)dibh->b_data; | 158 | di = (struct gfs2_dinode *)dibh->b_data; |
| 158 | gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); | 159 | gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); |
| 159 | 160 | ||
| @@ -405,7 +406,7 @@ static inline __be64 *gfs2_indirect_init(struct metapath *mp, | |||
| 405 | BUG_ON(i < 1); | 406 | BUG_ON(i < 1); |
| 406 | BUG_ON(mp->mp_bh[i] != NULL); | 407 | BUG_ON(mp->mp_bh[i] != NULL); |
| 407 | mp->mp_bh[i] = gfs2_meta_new(gl, bn); | 408 | mp->mp_bh[i] = gfs2_meta_new(gl, bn); |
| 408 | gfs2_trans_add_bh(gl, mp->mp_bh[i], 1); | 409 | gfs2_trans_add_meta(gl, mp->mp_bh[i]); |
| 409 | gfs2_metatype_set(mp->mp_bh[i], GFS2_METATYPE_IN, GFS2_FORMAT_IN); | 410 | gfs2_metatype_set(mp->mp_bh[i], GFS2_METATYPE_IN, GFS2_FORMAT_IN); |
| 410 | gfs2_buffer_clear_tail(mp->mp_bh[i], sizeof(struct gfs2_meta_header)); | 411 | gfs2_buffer_clear_tail(mp->mp_bh[i], sizeof(struct gfs2_meta_header)); |
| 411 | ptr += offset; | 412 | ptr += offset; |
| @@ -468,7 +469,7 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock, | |||
| 468 | BUG_ON(sheight < 1); | 469 | BUG_ON(sheight < 1); |
| 469 | BUG_ON(dibh == NULL); | 470 | BUG_ON(dibh == NULL); |
| 470 | 471 | ||
| 471 | gfs2_trans_add_bh(ip->i_gl, dibh, 1); | 472 | gfs2_trans_add_meta(ip->i_gl, dibh); |
| 472 | 473 | ||
| 473 | if (height == sheight) { | 474 | if (height == sheight) { |
| 474 | struct buffer_head *bh; | 475 | struct buffer_head *bh; |
| @@ -544,7 +545,7 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock, | |||
| 544 | /* Branching from existing tree */ | 545 | /* Branching from existing tree */ |
| 545 | case ALLOC_GROW_DEPTH: | 546 | case ALLOC_GROW_DEPTH: |
| 546 | if (i > 1 && i < height) | 547 | if (i > 1 && i < height) |
| 547 | gfs2_trans_add_bh(ip->i_gl, mp->mp_bh[i-1], 1); | 548 | gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[i-1]); |
| 548 | for (; i < height && n > 0; i++, n--) | 549 | for (; i < height && n > 0; i++, n--) |
| 549 | gfs2_indirect_init(mp, ip->i_gl, i, | 550 | gfs2_indirect_init(mp, ip->i_gl, i, |
| 550 | mp->mp_list[i-1], bn++); | 551 | mp->mp_list[i-1], bn++); |
| @@ -556,7 +557,7 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock, | |||
| 556 | case ALLOC_DATA: | 557 | case ALLOC_DATA: |
| 557 | BUG_ON(n > dblks); | 558 | BUG_ON(n > dblks); |
| 558 | BUG_ON(mp->mp_bh[end_of_metadata] == NULL); | 559 | BUG_ON(mp->mp_bh[end_of_metadata] == NULL); |
| 559 | gfs2_trans_add_bh(ip->i_gl, mp->mp_bh[end_of_metadata], 1); | 560 | gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[end_of_metadata]); |
| 560 | dblks = n; | 561 | dblks = n; |
| 561 | ptr = metapointer(end_of_metadata, mp); | 562 | ptr = metapointer(end_of_metadata, mp); |
| 562 | dblock = bn; | 563 | dblock = bn; |
| @@ -796,8 +797,8 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, | |||
| 796 | 797 | ||
| 797 | down_write(&ip->i_rw_mutex); | 798 | down_write(&ip->i_rw_mutex); |
| 798 | 799 | ||
| 799 | gfs2_trans_add_bh(ip->i_gl, dibh, 1); | 800 | gfs2_trans_add_meta(ip->i_gl, dibh); |
| 800 | gfs2_trans_add_bh(ip->i_gl, bh, 1); | 801 | gfs2_trans_add_meta(ip->i_gl, bh); |
| 801 | 802 | ||
| 802 | bstart = 0; | 803 | bstart = 0; |
| 803 | blen = 0; | 804 | blen = 0; |
| @@ -981,7 +982,7 @@ static int gfs2_block_truncate_page(struct address_space *mapping, loff_t from) | |||
| 981 | } | 982 | } |
| 982 | 983 | ||
| 983 | if (!gfs2_is_writeback(ip)) | 984 | if (!gfs2_is_writeback(ip)) |
| 984 | gfs2_trans_add_bh(ip->i_gl, bh, 0); | 985 | gfs2_trans_add_data(ip->i_gl, bh); |
| 985 | 986 | ||
| 986 | zero_user(page, offset, length); | 987 | zero_user(page, offset, length); |
| 987 | mark_buffer_dirty(bh); | 988 | mark_buffer_dirty(bh); |
| @@ -1046,7 +1047,7 @@ static int trunc_start(struct inode *inode, u64 oldsize, u64 newsize) | |||
| 1046 | if (error) | 1047 | if (error) |
| 1047 | goto out; | 1048 | goto out; |
| 1048 | 1049 | ||
| 1049 | gfs2_trans_add_bh(ip->i_gl, dibh, 1); | 1050 | gfs2_trans_add_meta(ip->i_gl, dibh); |
| 1050 | 1051 | ||
| 1051 | if (gfs2_is_stuffed(ip)) { | 1052 | if (gfs2_is_stuffed(ip)) { |
| 1052 | gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + newsize); | 1053 | gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + newsize); |
| @@ -1137,11 +1138,12 @@ static int trunc_end(struct gfs2_inode *ip) | |||
| 1137 | ip->i_height = 0; | 1138 | ip->i_height = 0; |
| 1138 | ip->i_goal = ip->i_no_addr; | 1139 | ip->i_goal = ip->i_no_addr; |
| 1139 | gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); | 1140 | gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); |
| 1141 | gfs2_ordered_del_inode(ip); | ||
| 1140 | } | 1142 | } |
| 1141 | ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; | 1143 | ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; |
| 1142 | ip->i_diskflags &= ~GFS2_DIF_TRUNC_IN_PROG; | 1144 | ip->i_diskflags &= ~GFS2_DIF_TRUNC_IN_PROG; |
| 1143 | 1145 | ||
| 1144 | gfs2_trans_add_bh(ip->i_gl, dibh, 1); | 1146 | gfs2_trans_add_meta(ip->i_gl, dibh); |
| 1145 | gfs2_dinode_out(ip, dibh->b_data); | 1147 | gfs2_dinode_out(ip, dibh->b_data); |
| 1146 | brelse(dibh); | 1148 | brelse(dibh); |
| 1147 | 1149 | ||
| @@ -1246,7 +1248,7 @@ static int do_grow(struct inode *inode, u64 size) | |||
| 1246 | 1248 | ||
| 1247 | i_size_write(inode, size); | 1249 | i_size_write(inode, size); |
| 1248 | ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; | 1250 | ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; |
| 1249 | gfs2_trans_add_bh(ip->i_gl, dibh, 1); | 1251 | gfs2_trans_add_meta(ip->i_gl, dibh); |
| 1250 | gfs2_dinode_out(ip, dibh->b_data); | 1252 | gfs2_dinode_out(ip, dibh->b_data); |
| 1251 | brelse(dibh); | 1253 | brelse(dibh); |
| 1252 | 1254 | ||
| @@ -1286,6 +1288,10 @@ int gfs2_setattr_size(struct inode *inode, u64 newsize) | |||
| 1286 | 1288 | ||
| 1287 | inode_dio_wait(inode); | 1289 | inode_dio_wait(inode); |
| 1288 | 1290 | ||
| 1291 | ret = gfs2_rs_alloc(GFS2_I(inode)); | ||
| 1292 | if (ret) | ||
| 1293 | return ret; | ||
| 1294 | |||
| 1289 | oldsize = inode->i_size; | 1295 | oldsize = inode->i_size; |
| 1290 | if (newsize >= oldsize) | 1296 | if (newsize >= oldsize) |
| 1291 | return do_grow(inode, newsize); | 1297 | return do_grow(inode, newsize); |
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 9a35670fdc38..7179478e5a28 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c | |||
| @@ -93,7 +93,7 @@ int gfs2_dir_get_new_buffer(struct gfs2_inode *ip, u64 block, | |||
| 93 | struct buffer_head *bh; | 93 | struct buffer_head *bh; |
| 94 | 94 | ||
| 95 | bh = gfs2_meta_new(ip->i_gl, block); | 95 | bh = gfs2_meta_new(ip->i_gl, block); |
| 96 | gfs2_trans_add_bh(ip->i_gl, bh, 1); | 96 | gfs2_trans_add_meta(ip->i_gl, bh); |
| 97 | gfs2_metatype_set(bh, GFS2_METATYPE_JD, GFS2_FORMAT_JD); | 97 | gfs2_metatype_set(bh, GFS2_METATYPE_JD, GFS2_FORMAT_JD); |
| 98 | gfs2_buffer_clear_tail(bh, sizeof(struct gfs2_meta_header)); | 98 | gfs2_buffer_clear_tail(bh, sizeof(struct gfs2_meta_header)); |
| 99 | *bhp = bh; | 99 | *bhp = bh; |
| @@ -127,7 +127,7 @@ static int gfs2_dir_write_stuffed(struct gfs2_inode *ip, const char *buf, | |||
| 127 | if (error) | 127 | if (error) |
| 128 | return error; | 128 | return error; |
| 129 | 129 | ||
| 130 | gfs2_trans_add_bh(ip->i_gl, dibh, 1); | 130 | gfs2_trans_add_meta(ip->i_gl, dibh); |
| 131 | memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size); | 131 | memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size); |
| 132 | if (ip->i_inode.i_size < offset + size) | 132 | if (ip->i_inode.i_size < offset + size) |
| 133 | i_size_write(&ip->i_inode, offset + size); | 133 | i_size_write(&ip->i_inode, offset + size); |
| @@ -209,7 +209,7 @@ static int gfs2_dir_write_data(struct gfs2_inode *ip, const char *buf, | |||
| 209 | if (error) | 209 | if (error) |
| 210 | goto fail; | 210 | goto fail; |
| 211 | 211 | ||
| 212 | gfs2_trans_add_bh(ip->i_gl, bh, 1); | 212 | gfs2_trans_add_meta(ip->i_gl, bh); |
| 213 | memcpy(bh->b_data + o, buf, amount); | 213 | memcpy(bh->b_data + o, buf, amount); |
| 214 | brelse(bh); | 214 | brelse(bh); |
| 215 | 215 | ||
| @@ -231,7 +231,7 @@ out: | |||
| 231 | i_size_write(&ip->i_inode, offset + copied); | 231 | i_size_write(&ip->i_inode, offset + copied); |
| 232 | ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; | 232 | ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; |
| 233 | 233 | ||
| 234 | gfs2_trans_add_bh(ip->i_gl, dibh, 1); | 234 | gfs2_trans_add_meta(ip->i_gl, dibh); |
| 235 | gfs2_dinode_out(ip, dibh->b_data); | 235 | gfs2_dinode_out(ip, dibh->b_data); |
| 236 | brelse(dibh); | 236 | brelse(dibh); |
| 237 | 237 | ||
| @@ -647,7 +647,7 @@ static void dirent_del(struct gfs2_inode *dip, struct buffer_head *bh, | |||
| 647 | return; | 647 | return; |
| 648 | } | 648 | } |
| 649 | 649 | ||
| 650 | gfs2_trans_add_bh(dip->i_gl, bh, 1); | 650 | gfs2_trans_add_meta(dip->i_gl, bh); |
| 651 | 651 | ||
| 652 | /* If there is no prev entry, this is the first entry in the block. | 652 | /* If there is no prev entry, this is the first entry in the block. |
| 653 | The de_rec_len is already as big as it needs to be. Just zero | 653 | The de_rec_len is already as big as it needs to be. Just zero |
| @@ -690,7 +690,7 @@ static struct gfs2_dirent *gfs2_init_dirent(struct inode *inode, | |||
| 690 | offset = GFS2_DIRENT_SIZE(be16_to_cpu(dent->de_name_len)); | 690 | offset = GFS2_DIRENT_SIZE(be16_to_cpu(dent->de_name_len)); |
| 691 | totlen = be16_to_cpu(dent->de_rec_len); | 691 | totlen = be16_to_cpu(dent->de_rec_len); |
| 692 | BUG_ON(offset + name->len > totlen); | 692 | BUG_ON(offset + name->len > totlen); |
| 693 | gfs2_trans_add_bh(ip->i_gl, bh, 1); | 693 | gfs2_trans_add_meta(ip->i_gl, bh); |
| 694 | ndent = (struct gfs2_dirent *)((char *)dent + offset); | 694 | ndent = (struct gfs2_dirent *)((char *)dent + offset); |
| 695 | dent->de_rec_len = cpu_to_be16(offset); | 695 | dent->de_rec_len = cpu_to_be16(offset); |
| 696 | gfs2_qstr2dirent(name, totlen - offset, ndent); | 696 | gfs2_qstr2dirent(name, totlen - offset, ndent); |
| @@ -831,7 +831,7 @@ static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh, | |||
| 831 | return NULL; | 831 | return NULL; |
| 832 | 832 | ||
| 833 | gfs2_trans_add_unrevoke(GFS2_SB(inode), bn, 1); | 833 | gfs2_trans_add_unrevoke(GFS2_SB(inode), bn, 1); |
| 834 | gfs2_trans_add_bh(ip->i_gl, bh, 1); | 834 | gfs2_trans_add_meta(ip->i_gl, bh); |
| 835 | gfs2_metatype_set(bh, GFS2_METATYPE_LF, GFS2_FORMAT_LF); | 835 | gfs2_metatype_set(bh, GFS2_METATYPE_LF, GFS2_FORMAT_LF); |
| 836 | leaf = (struct gfs2_leaf *)bh->b_data; | 836 | leaf = (struct gfs2_leaf *)bh->b_data; |
| 837 | leaf->lf_depth = cpu_to_be16(depth); | 837 | leaf->lf_depth = cpu_to_be16(depth); |
| @@ -916,7 +916,7 @@ static int dir_make_exhash(struct inode *inode) | |||
| 916 | /* We're done with the new leaf block, now setup the new | 916 | /* We're done with the new leaf block, now setup the new |
| 917 | hash table. */ | 917 | hash table. */ |
| 918 | 918 | ||
| 919 | gfs2_trans_add_bh(dip->i_gl, dibh, 1); | 919 | gfs2_trans_add_meta(dip->i_gl, dibh); |
| 920 | gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); | 920 | gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); |
| 921 | 921 | ||
| 922 | lp = (__be64 *)(dibh->b_data + sizeof(struct gfs2_dinode)); | 922 | lp = (__be64 *)(dibh->b_data + sizeof(struct gfs2_dinode)); |
| @@ -976,7 +976,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name) | |||
| 976 | return 1; /* can't split */ | 976 | return 1; /* can't split */ |
| 977 | } | 977 | } |
| 978 | 978 | ||
| 979 | gfs2_trans_add_bh(dip->i_gl, obh, 1); | 979 | gfs2_trans_add_meta(dip->i_gl, obh); |
| 980 | 980 | ||
| 981 | nleaf = new_leaf(inode, &nbh, be16_to_cpu(oleaf->lf_depth) + 1); | 981 | nleaf = new_leaf(inode, &nbh, be16_to_cpu(oleaf->lf_depth) + 1); |
| 982 | if (!nleaf) { | 982 | if (!nleaf) { |
| @@ -1069,7 +1069,7 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name) | |||
| 1069 | 1069 | ||
| 1070 | error = gfs2_meta_inode_buffer(dip, &dibh); | 1070 | error = gfs2_meta_inode_buffer(dip, &dibh); |
| 1071 | if (!gfs2_assert_withdraw(GFS2_SB(&dip->i_inode), !error)) { | 1071 | if (!gfs2_assert_withdraw(GFS2_SB(&dip->i_inode), !error)) { |
| 1072 | gfs2_trans_add_bh(dip->i_gl, dibh, 1); | 1072 | gfs2_trans_add_meta(dip->i_gl, dibh); |
| 1073 | gfs2_add_inode_blocks(&dip->i_inode, 1); | 1073 | gfs2_add_inode_blocks(&dip->i_inode, 1); |
| 1074 | gfs2_dinode_out(dip, dibh->b_data); | 1074 | gfs2_dinode_out(dip, dibh->b_data); |
| 1075 | brelse(dibh); | 1075 | brelse(dibh); |
| @@ -1622,7 +1622,7 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name) | |||
| 1622 | return error; | 1622 | return error; |
| 1623 | } while(1); | 1623 | } while(1); |
| 1624 | 1624 | ||
| 1625 | gfs2_trans_add_bh(ip->i_gl, obh, 1); | 1625 | gfs2_trans_add_meta(ip->i_gl, obh); |
| 1626 | 1626 | ||
| 1627 | leaf = new_leaf(inode, &bh, be16_to_cpu(oleaf->lf_depth)); | 1627 | leaf = new_leaf(inode, &bh, be16_to_cpu(oleaf->lf_depth)); |
| 1628 | if (!leaf) { | 1628 | if (!leaf) { |
| @@ -1636,7 +1636,7 @@ static int dir_new_leaf(struct inode *inode, const struct qstr *name) | |||
| 1636 | error = gfs2_meta_inode_buffer(ip, &bh); | 1636 | error = gfs2_meta_inode_buffer(ip, &bh); |
| 1637 | if (error) | 1637 | if (error) |
| 1638 | return error; | 1638 | return error; |
| 1639 | gfs2_trans_add_bh(ip->i_gl, bh, 1); | 1639 | gfs2_trans_add_meta(ip->i_gl, bh); |
| 1640 | gfs2_add_inode_blocks(&ip->i_inode, 1); | 1640 | gfs2_add_inode_blocks(&ip->i_inode, 1); |
| 1641 | gfs2_dinode_out(ip, bh->b_data); | 1641 | gfs2_dinode_out(ip, bh->b_data); |
| 1642 | brelse(bh); | 1642 | brelse(bh); |
| @@ -1795,7 +1795,7 @@ int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, | |||
| 1795 | if (IS_ERR(dent)) | 1795 | if (IS_ERR(dent)) |
| 1796 | return PTR_ERR(dent); | 1796 | return PTR_ERR(dent); |
| 1797 | 1797 | ||
| 1798 | gfs2_trans_add_bh(dip->i_gl, bh, 1); | 1798 | gfs2_trans_add_meta(dip->i_gl, bh); |
| 1799 | gfs2_inum_out(nip, dent); | 1799 | gfs2_inum_out(nip, dent); |
| 1800 | dent->de_type = cpu_to_be16(new_type); | 1800 | dent->de_type = cpu_to_be16(new_type); |
| 1801 | 1801 | ||
| @@ -1804,7 +1804,7 @@ int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, | |||
| 1804 | error = gfs2_meta_inode_buffer(dip, &bh); | 1804 | error = gfs2_meta_inode_buffer(dip, &bh); |
| 1805 | if (error) | 1805 | if (error) |
| 1806 | return error; | 1806 | return error; |
| 1807 | gfs2_trans_add_bh(dip->i_gl, bh, 1); | 1807 | gfs2_trans_add_meta(dip->i_gl, bh); |
| 1808 | } | 1808 | } |
| 1809 | 1809 | ||
| 1810 | dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME; | 1810 | dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME; |
| @@ -1917,7 +1917,7 @@ static int leaf_dealloc(struct gfs2_inode *dip, u32 index, u32 len, | |||
| 1917 | if (error) | 1917 | if (error) |
| 1918 | goto out_end_trans; | 1918 | goto out_end_trans; |
| 1919 | 1919 | ||
| 1920 | gfs2_trans_add_bh(dip->i_gl, dibh, 1); | 1920 | gfs2_trans_add_meta(dip->i_gl, dibh); |
| 1921 | /* On the last dealloc, make this a regular file in case we crash. | 1921 | /* On the last dealloc, make this a regular file in case we crash. |
| 1922 | (We don't want to free these blocks a second time.) */ | 1922 | (We don't want to free these blocks a second time.) */ |
| 1923 | if (last_dealloc) | 1923 | if (last_dealloc) |
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 991ab2d484dd..2687f50d98cb 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c | |||
| @@ -276,7 +276,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask) | |||
| 276 | error = gfs2_meta_inode_buffer(ip, &bh); | 276 | error = gfs2_meta_inode_buffer(ip, &bh); |
| 277 | if (error) | 277 | if (error) |
| 278 | goto out_trans_end; | 278 | goto out_trans_end; |
| 279 | gfs2_trans_add_bh(ip->i_gl, bh, 1); | 279 | gfs2_trans_add_meta(ip->i_gl, bh); |
| 280 | ip->i_diskflags = new_flags; | 280 | ip->i_diskflags = new_flags; |
| 281 | gfs2_dinode_out(ip, bh->b_data); | 281 | gfs2_dinode_out(ip, bh->b_data); |
| 282 | brelse(bh); | 282 | brelse(bh); |
| @@ -483,7 +483,7 @@ out: | |||
| 483 | gfs2_holder_uninit(&gh); | 483 | gfs2_holder_uninit(&gh); |
| 484 | if (ret == 0) { | 484 | if (ret == 0) { |
| 485 | set_page_dirty(page); | 485 | set_page_dirty(page); |
| 486 | wait_on_page_writeback(page); | 486 | wait_for_stable_page(page); |
| 487 | } | 487 | } |
| 488 | sb_end_pagefault(inode->i_sb); | 488 | sb_end_pagefault(inode->i_sb); |
| 489 | return block_page_mkwrite_return(ret); | 489 | return block_page_mkwrite_return(ret); |
| @@ -709,7 +709,7 @@ static int fallocate_chunk(struct inode *inode, loff_t offset, loff_t len, | |||
| 709 | if (unlikely(error)) | 709 | if (unlikely(error)) |
| 710 | return error; | 710 | return error; |
| 711 | 711 | ||
| 712 | gfs2_trans_add_bh(ip->i_gl, dibh, 1); | 712 | gfs2_trans_add_meta(ip->i_gl, dibh); |
| 713 | 713 | ||
| 714 | if (gfs2_is_stuffed(ip)) { | 714 | if (gfs2_is_stuffed(ip)) { |
| 715 | error = gfs2_unstuff_dinode(ip, NULL); | 715 | error = gfs2_unstuff_dinode(ip, NULL); |
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 992c5c0cb504..cf3515546739 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c | |||
| @@ -30,6 +30,7 @@ | |||
| 30 | #include <linux/rculist_bl.h> | 30 | #include <linux/rculist_bl.h> |
| 31 | #include <linux/bit_spinlock.h> | 31 | #include <linux/bit_spinlock.h> |
| 32 | #include <linux/percpu.h> | 32 | #include <linux/percpu.h> |
| 33 | #include <linux/list_sort.h> | ||
| 33 | 34 | ||
| 34 | #include "gfs2.h" | 35 | #include "gfs2.h" |
| 35 | #include "incore.h" | 36 | #include "incore.h" |
| @@ -1376,56 +1377,105 @@ void gfs2_glock_complete(struct gfs2_glock *gl, int ret) | |||
| 1376 | gfs2_glock_put(gl); | 1377 | gfs2_glock_put(gl); |
| 1377 | } | 1378 | } |
| 1378 | 1379 | ||
| 1380 | static int glock_cmp(void *priv, struct list_head *a, struct list_head *b) | ||
| 1381 | { | ||
| 1382 | struct gfs2_glock *gla, *glb; | ||
| 1379 | 1383 | ||
| 1380 | static int gfs2_shrink_glock_memory(struct shrinker *shrink, | 1384 | gla = list_entry(a, struct gfs2_glock, gl_lru); |
| 1381 | struct shrink_control *sc) | 1385 | glb = list_entry(b, struct gfs2_glock, gl_lru); |
| 1386 | |||
| 1387 | if (gla->gl_name.ln_number > glb->gl_name.ln_number) | ||
| 1388 | return 1; | ||
| 1389 | if (gla->gl_name.ln_number < glb->gl_name.ln_number) | ||
| 1390 | return -1; | ||
| 1391 | |||
| 1392 | return 0; | ||
| 1393 | } | ||
| 1394 | |||
| 1395 | /** | ||
| 1396 | * gfs2_dispose_glock_lru - Demote a list of glocks | ||
| 1397 | * @list: The list to dispose of | ||
| 1398 | * | ||
| 1399 | * Disposing of glocks may involve disk accesses, so here we sort | ||
| 1400 | * the glocks by number (i.e. disk location of the inodes) so that if | ||
| 1401 | * there are any such accesses, they'll be sent in order (mostly). | ||
| 1402 | * | ||
| 1403 | * Must be called under the lru_lock, but may drop and retake this | ||
| 1404 | * lock. While the lru_lock is dropped, entries may vanish from the | ||
| 1405 | * list, but no new entries will appear on the list (since it is | ||
| 1406 | * private) | ||
| 1407 | */ | ||
| 1408 | |||
| 1409 | static void gfs2_dispose_glock_lru(struct list_head *list) | ||
| 1410 | __releases(&lru_lock) | ||
| 1411 | __acquires(&lru_lock) | ||
| 1382 | { | 1412 | { |
| 1383 | struct gfs2_glock *gl; | 1413 | struct gfs2_glock *gl; |
| 1384 | int may_demote; | ||
| 1385 | int nr_skipped = 0; | ||
| 1386 | int nr = sc->nr_to_scan; | ||
| 1387 | gfp_t gfp_mask = sc->gfp_mask; | ||
| 1388 | LIST_HEAD(skipped); | ||
| 1389 | 1414 | ||
| 1390 | if (nr == 0) | 1415 | list_sort(NULL, list, glock_cmp); |
| 1391 | goto out; | ||
| 1392 | 1416 | ||
| 1393 | if (!(gfp_mask & __GFP_FS)) | 1417 | while(!list_empty(list)) { |
| 1394 | return -1; | 1418 | gl = list_entry(list->next, struct gfs2_glock, gl_lru); |
| 1419 | list_del_init(&gl->gl_lru); | ||
| 1420 | clear_bit(GLF_LRU, &gl->gl_flags); | ||
| 1421 | gfs2_glock_hold(gl); | ||
| 1422 | spin_unlock(&lru_lock); | ||
| 1423 | spin_lock(&gl->gl_spin); | ||
| 1424 | if (demote_ok(gl)) | ||
| 1425 | handle_callback(gl, LM_ST_UNLOCKED, 0); | ||
| 1426 | WARN_ON(!test_and_clear_bit(GLF_LOCK, &gl->gl_flags)); | ||
| 1427 | smp_mb__after_clear_bit(); | ||
| 1428 | if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) | ||
| 1429 | gfs2_glock_put_nolock(gl); | ||
| 1430 | spin_unlock(&gl->gl_spin); | ||
| 1431 | spin_lock(&lru_lock); | ||
| 1432 | } | ||
| 1433 | } | ||
| 1434 | |||
| 1435 | /** | ||
| 1436 | * gfs2_scan_glock_lru - Scan the LRU looking for locks to demote | ||
| 1437 | * @nr: The number of entries to scan | ||
| 1438 | * | ||
| 1439 | * This function selects the entries on the LRU which are able to | ||
| 1440 | * be demoted, and then kicks off the process by calling | ||
| 1441 | * gfs2_dispose_glock_lru() above. | ||
| 1442 | */ | ||
| 1443 | |||
| 1444 | static void gfs2_scan_glock_lru(int nr) | ||
| 1445 | { | ||
| 1446 | struct gfs2_glock *gl; | ||
| 1447 | LIST_HEAD(skipped); | ||
| 1448 | LIST_HEAD(dispose); | ||
| 1395 | 1449 | ||
| 1396 | spin_lock(&lru_lock); | 1450 | spin_lock(&lru_lock); |
| 1397 | while(nr && !list_empty(&lru_list)) { | 1451 | while(nr && !list_empty(&lru_list)) { |
| 1398 | gl = list_entry(lru_list.next, struct gfs2_glock, gl_lru); | 1452 | gl = list_entry(lru_list.next, struct gfs2_glock, gl_lru); |
| 1399 | list_del_init(&gl->gl_lru); | ||
| 1400 | clear_bit(GLF_LRU, &gl->gl_flags); | ||
| 1401 | atomic_dec(&lru_count); | ||
| 1402 | 1453 | ||
| 1403 | /* Test for being demotable */ | 1454 | /* Test for being demotable */ |
| 1404 | if (!test_and_set_bit(GLF_LOCK, &gl->gl_flags)) { | 1455 | if (!test_and_set_bit(GLF_LOCK, &gl->gl_flags)) { |
| 1405 | gfs2_glock_hold(gl); | 1456 | list_move(&gl->gl_lru, &dispose); |
| 1406 | spin_unlock(&lru_lock); | 1457 | atomic_dec(&lru_count); |
| 1407 | spin_lock(&gl->gl_spin); | 1458 | nr--; |
| 1408 | may_demote = demote_ok(gl); | ||
| 1409 | if (may_demote) { | ||
| 1410 | handle_callback(gl, LM_ST_UNLOCKED, 0); | ||
| 1411 | nr--; | ||
| 1412 | } | ||
| 1413 | clear_bit(GLF_LOCK, &gl->gl_flags); | ||
| 1414 | smp_mb__after_clear_bit(); | ||
| 1415 | if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) | ||
| 1416 | gfs2_glock_put_nolock(gl); | ||
| 1417 | spin_unlock(&gl->gl_spin); | ||
| 1418 | spin_lock(&lru_lock); | ||
| 1419 | continue; | 1459 | continue; |
| 1420 | } | 1460 | } |
| 1421 | nr_skipped++; | 1461 | |
| 1422 | list_add(&gl->gl_lru, &skipped); | 1462 | list_move(&gl->gl_lru, &skipped); |
| 1423 | set_bit(GLF_LRU, &gl->gl_flags); | ||
| 1424 | } | 1463 | } |
| 1425 | list_splice(&skipped, &lru_list); | 1464 | list_splice(&skipped, &lru_list); |
| 1426 | atomic_add(nr_skipped, &lru_count); | 1465 | if (!list_empty(&dispose)) |
| 1466 | gfs2_dispose_glock_lru(&dispose); | ||
| 1427 | spin_unlock(&lru_lock); | 1467 | spin_unlock(&lru_lock); |
| 1428 | out: | 1468 | } |
| 1469 | |||
| 1470 | static int gfs2_shrink_glock_memory(struct shrinker *shrink, | ||
| 1471 | struct shrink_control *sc) | ||
| 1472 | { | ||
| 1473 | if (sc->nr_to_scan) { | ||
| 1474 | if (!(sc->gfp_mask & __GFP_FS)) | ||
| 1475 | return -1; | ||
| 1476 | gfs2_scan_glock_lru(sc->nr_to_scan); | ||
| 1477 | } | ||
| 1478 | |||
| 1429 | return (atomic_read(&lru_count) / 100) * sysctl_vfs_cache_pressure; | 1479 | return (atomic_read(&lru_count) / 100) * sysctl_vfs_cache_pressure; |
| 1430 | } | 1480 | } |
| 1431 | 1481 | ||
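The new LRU handling above follows a collect, sort, dispose shape: candidates are moved to a private list under lru_lock, sorted by disk location with list_sort(), and then demoted one by one while the lock can safely be dropped (the list is private, so nothing new can appear on it). Below is a stripped-down, hypothetical illustration of that shape, with a made-up struct item standing in for the glock.

#include <linux/list.h>
#include <linux/list_sort.h>
#include <linux/types.h>

/* Hedged, generic sketch of the pattern; 'struct item' is hypothetical. */
struct item {
	struct list_head lru;
	u64 blocknr;	/* sort key: on-disk location */
};

static int item_cmp(void *priv, struct list_head *a, struct list_head *b)
{
	struct item *ia = list_entry(a, struct item, lru);
	struct item *ib = list_entry(b, struct item, lru);

	if (ia->blocknr > ib->blocknr)
		return 1;
	if (ia->blocknr < ib->blocknr)
		return -1;
	return 0;
}

static void dispose_in_disk_order(struct list_head *batch)
{
	struct item *it, *tmp;

	list_sort(NULL, batch, item_cmp);
	/* any I/O triggered while walking 'batch' is now issued in
	 * (mostly) ascending block order */
	list_for_each_entry_safe(it, tmp, batch, lru)
		list_del_init(&it->lru);
}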
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index c373a24fedd9..e2601ba38ef5 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h | |||
| @@ -52,7 +52,6 @@ struct gfs2_log_header_host { | |||
| 52 | */ | 52 | */ |
| 53 | 53 | ||
| 54 | struct gfs2_log_operations { | 54 | struct gfs2_log_operations { |
| 55 | void (*lo_add) (struct gfs2_sbd *sdp, struct gfs2_bufdata *bd); | ||
| 56 | void (*lo_before_commit) (struct gfs2_sbd *sdp); | 55 | void (*lo_before_commit) (struct gfs2_sbd *sdp); |
| 57 | void (*lo_after_commit) (struct gfs2_sbd *sdp, struct gfs2_ail *ai); | 56 | void (*lo_after_commit) (struct gfs2_sbd *sdp, struct gfs2_ail *ai); |
| 58 | void (*lo_before_scan) (struct gfs2_jdesc *jd, | 57 | void (*lo_before_scan) (struct gfs2_jdesc *jd, |
| @@ -341,6 +340,7 @@ enum { | |||
| 341 | GIF_QD_LOCKED = 1, | 340 | GIF_QD_LOCKED = 1, |
| 342 | GIF_ALLOC_FAILED = 2, | 341 | GIF_ALLOC_FAILED = 2, |
| 343 | GIF_SW_PAGED = 3, | 342 | GIF_SW_PAGED = 3, |
| 343 | GIF_ORDERED = 4, | ||
| 344 | }; | 344 | }; |
| 345 | 345 | ||
| 346 | struct gfs2_inode { | 346 | struct gfs2_inode { |
| @@ -357,6 +357,7 @@ struct gfs2_inode { | |||
| 357 | struct gfs2_rgrpd *i_rgd; | 357 | struct gfs2_rgrpd *i_rgd; |
| 358 | u64 i_goal; /* goal block for allocations */ | 358 | u64 i_goal; /* goal block for allocations */ |
| 359 | struct rw_semaphore i_rw_mutex; | 359 | struct rw_semaphore i_rw_mutex; |
| 360 | struct list_head i_ordered; | ||
| 360 | struct list_head i_trunc_list; | 361 | struct list_head i_trunc_list; |
| 361 | __be64 *i_hash_cache; | 362 | __be64 *i_hash_cache; |
| 362 | u32 i_entries; | 363 | u32 i_entries; |
| @@ -641,6 +642,7 @@ struct gfs2_sbd { | |||
| 641 | wait_queue_head_t sd_glock_wait; | 642 | wait_queue_head_t sd_glock_wait; |
| 642 | atomic_t sd_glock_disposal; | 643 | atomic_t sd_glock_disposal; |
| 643 | struct completion sd_locking_init; | 644 | struct completion sd_locking_init; |
| 645 | struct completion sd_wdack; | ||
| 644 | struct delayed_work sd_control_work; | 646 | struct delayed_work sd_control_work; |
| 645 | 647 | ||
| 646 | /* Inode Stuff */ | 648 | /* Inode Stuff */ |
| @@ -723,6 +725,7 @@ struct gfs2_sbd { | |||
| 723 | struct list_head sd_log_le_revoke; | 725 | struct list_head sd_log_le_revoke; |
| 724 | struct list_head sd_log_le_databuf; | 726 | struct list_head sd_log_le_databuf; |
| 725 | struct list_head sd_log_le_ordered; | 727 | struct list_head sd_log_le_ordered; |
| 728 | spinlock_t sd_ordered_lock; | ||
| 726 | 729 | ||
| 727 | atomic_t sd_log_thresh1; | 730 | atomic_t sd_log_thresh1; |
| 728 | atomic_t sd_log_thresh2; | 731 | atomic_t sd_log_thresh2; |
| @@ -758,10 +761,7 @@ struct gfs2_sbd { | |||
| 758 | unsigned int sd_replayed_blocks; | 761 | unsigned int sd_replayed_blocks; |
| 759 | 762 | ||
| 760 | /* For quiescing the filesystem */ | 763 | /* For quiescing the filesystem */ |
| 761 | |||
| 762 | struct gfs2_holder sd_freeze_gh; | 764 | struct gfs2_holder sd_freeze_gh; |
| 763 | struct mutex sd_freeze_lock; | ||
| 764 | unsigned int sd_freeze_count; | ||
| 765 | 765 | ||
| 766 | char sd_fsname[GFS2_FSNAME_LEN]; | 766 | char sd_fsname[GFS2_FSNAME_LEN]; |
| 767 | char sd_table_name[GFS2_FSNAME_LEN]; | 767 | char sd_table_name[GFS2_FSNAME_LEN]; |
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 2b6f5698ef18..db048a8ab6a8 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c | |||
| @@ -447,7 +447,7 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_inode *ip, | |||
| 447 | struct timespec tv = CURRENT_TIME; | 447 | struct timespec tv = CURRENT_TIME; |
| 448 | 448 | ||
| 449 | dibh = gfs2_meta_new(ip->i_gl, ip->i_no_addr); | 449 | dibh = gfs2_meta_new(ip->i_gl, ip->i_no_addr); |
| 450 | gfs2_trans_add_bh(ip->i_gl, dibh, 1); | 450 | gfs2_trans_add_meta(ip->i_gl, dibh); |
| 451 | gfs2_metatype_set(dibh, GFS2_METATYPE_DI, GFS2_FORMAT_DI); | 451 | gfs2_metatype_set(dibh, GFS2_METATYPE_DI, GFS2_FORMAT_DI); |
| 452 | gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); | 452 | gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); |
| 453 | di = (struct gfs2_dinode *)dibh->b_data; | 453 | di = (struct gfs2_dinode *)dibh->b_data; |
| @@ -584,7 +584,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name, | |||
| 584 | if (error) | 584 | if (error) |
| 585 | goto fail_end_trans; | 585 | goto fail_end_trans; |
| 586 | set_nlink(&ip->i_inode, S_ISDIR(ip->i_inode.i_mode) ? 2 : 1); | 586 | set_nlink(&ip->i_inode, S_ISDIR(ip->i_inode.i_mode) ? 2 : 1); |
| 587 | gfs2_trans_add_bh(ip->i_gl, dibh, 1); | 587 | gfs2_trans_add_meta(ip->i_gl, dibh); |
| 588 | gfs2_dinode_out(ip, dibh->b_data); | 588 | gfs2_dinode_out(ip, dibh->b_data); |
| 589 | brelse(dibh); | 589 | brelse(dibh); |
| 590 | return 0; | 590 | return 0; |
| @@ -931,7 +931,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, | |||
| 931 | if (error) | 931 | if (error) |
| 932 | goto out_brelse; | 932 | goto out_brelse; |
| 933 | 933 | ||
| 934 | gfs2_trans_add_bh(ip->i_gl, dibh, 1); | 934 | gfs2_trans_add_meta(ip->i_gl, dibh); |
| 935 | inc_nlink(&ip->i_inode); | 935 | inc_nlink(&ip->i_inode); |
| 936 | ip->i_inode.i_ctime = CURRENT_TIME; | 936 | ip->i_inode.i_ctime = CURRENT_TIME; |
| 937 | ihold(inode); | 937 | ihold(inode); |
| @@ -1412,7 +1412,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, | |||
| 1412 | if (error) | 1412 | if (error) |
| 1413 | goto out_end_trans; | 1413 | goto out_end_trans; |
| 1414 | ip->i_inode.i_ctime = CURRENT_TIME; | 1414 | ip->i_inode.i_ctime = CURRENT_TIME; |
| 1415 | gfs2_trans_add_bh(ip->i_gl, dibh, 1); | 1415 | gfs2_trans_add_meta(ip->i_gl, dibh); |
| 1416 | gfs2_dinode_out(ip, dibh->b_data); | 1416 | gfs2_dinode_out(ip, dibh->b_data); |
| 1417 | brelse(dibh); | 1417 | brelse(dibh); |
| 1418 | } | 1418 | } |
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index 8dad6b093716..9802de0f85e6 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c | |||
| @@ -241,6 +241,7 @@ static u32 make_flags(struct gfs2_glock *gl, const unsigned int gfs_flags, | |||
| 241 | 241 | ||
| 242 | static void gfs2_reverse_hex(char *c, u64 value) | 242 | static void gfs2_reverse_hex(char *c, u64 value) |
| 243 | { | 243 | { |
| 244 | *c = '0'; | ||
| 244 | while (value) { | 245 | while (value) { |
| 245 | *c-- = hex_asc[value & 0x0f]; | 246 | *c-- = hex_asc[value & 0x0f]; |
| 246 | value >>= 4; | 247 | value >>= 4; |
| @@ -280,6 +281,7 @@ static void gdlm_put_lock(struct gfs2_glock *gl) | |||
| 280 | { | 281 | { |
| 281 | struct gfs2_sbd *sdp = gl->gl_sbd; | 282 | struct gfs2_sbd *sdp = gl->gl_sbd; |
| 282 | struct lm_lockstruct *ls = &sdp->sd_lockstruct; | 283 | struct lm_lockstruct *ls = &sdp->sd_lockstruct; |
| 284 | int lvb_needs_unlock = 0; | ||
| 283 | int error; | 285 | int error; |
| 284 | 286 | ||
| 285 | if (gl->gl_lksb.sb_lkid == 0) { | 287 | if (gl->gl_lksb.sb_lkid == 0) { |
| @@ -293,8 +295,12 @@ static void gdlm_put_lock(struct gfs2_glock *gl) | |||
| 293 | gfs2_update_request_times(gl); | 295 | gfs2_update_request_times(gl); |
| 294 | 296 | ||
| 295 | /* don't want to skip dlm_unlock writing the lvb when lock is ex */ | 297 | /* don't want to skip dlm_unlock writing the lvb when lock is ex */ |
| 298 | |||
| 299 | if (gl->gl_lksb.sb_lvbptr && (gl->gl_state == LM_ST_EXCLUSIVE)) | ||
| 300 | lvb_needs_unlock = 1; | ||
| 301 | |||
| 296 | if (test_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags) && | 302 | if (test_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags) && |
| 297 | gl->gl_lksb.sb_lvbptr && (gl->gl_state != LM_ST_EXCLUSIVE)) { | 303 | !lvb_needs_unlock) { |
| 298 | gfs2_glock_free(gl); | 304 | gfs2_glock_free(gl); |
| 299 | return; | 305 | return; |
| 300 | } | 306 | } |
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index f4beeb9c81c1..9a2ca8be7647 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c | |||
| @@ -482,70 +482,66 @@ static void log_flush_wait(struct gfs2_sbd *sdp) | |||
| 482 | } | 482 | } |
| 483 | } | 483 | } |
| 484 | 484 | ||
| 485 | static int bd_cmp(void *priv, struct list_head *a, struct list_head *b) | 485 | static int ip_cmp(void *priv, struct list_head *a, struct list_head *b) |
| 486 | { | 486 | { |
| 487 | struct gfs2_bufdata *bda, *bdb; | 487 | struct gfs2_inode *ipa, *ipb; |
| 488 | 488 | ||
| 489 | bda = list_entry(a, struct gfs2_bufdata, bd_list); | 489 | ipa = list_entry(a, struct gfs2_inode, i_ordered); |
| 490 | bdb = list_entry(b, struct gfs2_bufdata, bd_list); | 490 | ipb = list_entry(b, struct gfs2_inode, i_ordered); |
| 491 | 491 | ||
| 492 | if (bda->bd_bh->b_blocknr < bdb->bd_bh->b_blocknr) | 492 | if (ipa->i_no_addr < ipb->i_no_addr) |
| 493 | return -1; | 493 | return -1; |
| 494 | if (bda->bd_bh->b_blocknr > bdb->bd_bh->b_blocknr) | 494 | if (ipa->i_no_addr > ipb->i_no_addr) |
| 495 | return 1; | 495 | return 1; |
| 496 | return 0; | 496 | return 0; |
| 497 | } | 497 | } |
| 498 | 498 | ||
| 499 | static void gfs2_ordered_write(struct gfs2_sbd *sdp) | 499 | static void gfs2_ordered_write(struct gfs2_sbd *sdp) |
| 500 | { | 500 | { |
| 501 | struct gfs2_bufdata *bd; | 501 | struct gfs2_inode *ip; |
| 502 | struct buffer_head *bh; | ||
| 503 | LIST_HEAD(written); | 502 | LIST_HEAD(written); |
| 504 | 503 | ||
| 505 | gfs2_log_lock(sdp); | 504 | spin_lock(&sdp->sd_ordered_lock); |
| 506 | list_sort(NULL, &sdp->sd_log_le_ordered, &bd_cmp); | 505 | list_sort(NULL, &sdp->sd_log_le_ordered, &ip_cmp); |
| 507 | while (!list_empty(&sdp->sd_log_le_ordered)) { | 506 | while (!list_empty(&sdp->sd_log_le_ordered)) { |
| 508 | bd = list_entry(sdp->sd_log_le_ordered.next, struct gfs2_bufdata, bd_list); | 507 | ip = list_entry(sdp->sd_log_le_ordered.next, struct gfs2_inode, i_ordered); |
| 509 | list_move(&bd->bd_list, &written); | 508 | list_move(&ip->i_ordered, &written); |
| 510 | bh = bd->bd_bh; | 509 | if (ip->i_inode.i_mapping->nrpages == 0) |
| 511 | if (!buffer_dirty(bh)) | ||
| 512 | continue; | 510 | continue; |
| 513 | get_bh(bh); | 511 | spin_unlock(&sdp->sd_ordered_lock); |
| 514 | gfs2_log_unlock(sdp); | 512 | filemap_fdatawrite(ip->i_inode.i_mapping); |
| 515 | lock_buffer(bh); | 513 | spin_lock(&sdp->sd_ordered_lock); |
| 516 | if (buffer_mapped(bh) && test_clear_buffer_dirty(bh)) { | ||
| 517 | bh->b_end_io = end_buffer_write_sync; | ||
| 518 | submit_bh(WRITE_SYNC, bh); | ||
| 519 | } else { | ||
| 520 | unlock_buffer(bh); | ||
| 521 | brelse(bh); | ||
| 522 | } | ||
| 523 | gfs2_log_lock(sdp); | ||
| 524 | } | 514 | } |
| 525 | list_splice(&written, &sdp->sd_log_le_ordered); | 515 | list_splice(&written, &sdp->sd_log_le_ordered); |
| 526 | gfs2_log_unlock(sdp); | 516 | spin_unlock(&sdp->sd_ordered_lock); |
| 527 | } | 517 | } |
| 528 | 518 | ||
| 529 | static void gfs2_ordered_wait(struct gfs2_sbd *sdp) | 519 | static void gfs2_ordered_wait(struct gfs2_sbd *sdp) |
| 530 | { | 520 | { |
| 531 | struct gfs2_bufdata *bd; | 521 | struct gfs2_inode *ip; |
| 532 | struct buffer_head *bh; | ||
| 533 | 522 | ||
| 534 | gfs2_log_lock(sdp); | 523 | spin_lock(&sdp->sd_ordered_lock); |
| 535 | while (!list_empty(&sdp->sd_log_le_ordered)) { | 524 | while (!list_empty(&sdp->sd_log_le_ordered)) { |
| 536 | bd = list_entry(sdp->sd_log_le_ordered.prev, struct gfs2_bufdata, bd_list); | 525 | ip = list_entry(sdp->sd_log_le_ordered.next, struct gfs2_inode, i_ordered); |
| 537 | bh = bd->bd_bh; | 526 | list_del(&ip->i_ordered); |
| 538 | if (buffer_locked(bh)) { | 527 | WARN_ON(!test_and_clear_bit(GIF_ORDERED, &ip->i_flags)); |
| 539 | get_bh(bh); | 528 | if (ip->i_inode.i_mapping->nrpages == 0) |
| 540 | gfs2_log_unlock(sdp); | ||
| 541 | wait_on_buffer(bh); | ||
| 542 | brelse(bh); | ||
| 543 | gfs2_log_lock(sdp); | ||
| 544 | continue; | 529 | continue; |
| 545 | } | 530 | spin_unlock(&sdp->sd_ordered_lock); |
| 546 | list_del_init(&bd->bd_list); | 531 | filemap_fdatawait(ip->i_inode.i_mapping); |
| 532 | spin_lock(&sdp->sd_ordered_lock); | ||
| 547 | } | 533 | } |
| 548 | gfs2_log_unlock(sdp); | 534 | spin_unlock(&sdp->sd_ordered_lock); |
| 535 | } | ||
| 536 | |||
| 537 | void gfs2_ordered_del_inode(struct gfs2_inode *ip) | ||
| 538 | { | ||
| 539 | struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); | ||
| 540 | |||
| 541 | spin_lock(&sdp->sd_ordered_lock); | ||
| 542 | if (test_and_clear_bit(GIF_ORDERED, &ip->i_flags)) | ||
| 543 | list_del(&ip->i_ordered); | ||
| 544 | spin_unlock(&sdp->sd_ordered_lock); | ||
| 549 | } | 545 | } |
| 550 | 546 | ||
| 551 | /** | 547 | /** |
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h index 3fd5215ea25f..3566f35915e0 100644 --- a/fs/gfs2/log.h +++ b/fs/gfs2/log.h | |||
| @@ -48,6 +48,18 @@ static inline void gfs2_log_pointers_init(struct gfs2_sbd *sdp, | |||
| 48 | sdp->sd_log_head = sdp->sd_log_tail = value; | 48 | sdp->sd_log_head = sdp->sd_log_tail = value; |
| 49 | } | 49 | } |
| 50 | 50 | ||
| 51 | static inline void gfs2_ordered_add_inode(struct gfs2_inode *ip) | ||
| 52 | { | ||
| 53 | struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); | ||
| 54 | |||
| 55 | if (!test_bit(GIF_ORDERED, &ip->i_flags)) { | ||
| 56 | spin_lock(&sdp->sd_ordered_lock); | ||
| 57 | if (!test_and_set_bit(GIF_ORDERED, &ip->i_flags)) | ||
| 58 | list_add(&ip->i_ordered, &sdp->sd_log_le_ordered); | ||
| 59 | spin_unlock(&sdp->sd_ordered_lock); | ||
| 60 | } | ||
| 61 | } | ||
| 62 | extern void gfs2_ordered_del_inode(struct gfs2_inode *ip); | ||
| 51 | extern unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct, | 63 | extern unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct, |
| 52 | unsigned int ssize); | 64 | unsigned int ssize); |
| 53 | 65 | ||
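With ordered writeback now tracked per inode instead of per buffer_head, a data-dirtying path adds the inode once through gfs2_ordered_add_inode() above (the GIF_ORDERED bit keeps it idempotent), and log flush later writes and waits on the whole mapping via the filemap_fdatawrite()/filemap_fdatawait() calls in the log.c hunks above. A hedged sketch of such a call site follows; gfs2_is_ordered() is an assumed mode check in the style of the gfs2_is_jdata()/gfs2_is_writeback() helpers used elsewhere in this diff.

/* Hedged sketch: mark freshly dirtied data as ordered. */
static void gfs2_mark_data_dirty(struct gfs2_inode *ip, struct page *page)
{
	set_page_dirty(page);
	if (gfs2_is_ordered(ip))		/* assumed predicate */
		gfs2_ordered_add_inode(ip);	/* no-op if GIF_ORDERED already set */
}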
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 9ceccb1595a3..a5055977a214 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c | |||
| @@ -37,7 +37,7 @@ | |||
| 37 | * | 37 | * |
| 38 | * The log lock must be held when calling this function | 38 | * The log lock must be held when calling this function |
| 39 | */ | 39 | */ |
| 40 | static void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh) | 40 | void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh) |
| 41 | { | 41 | { |
| 42 | struct gfs2_bufdata *bd; | 42 | struct gfs2_bufdata *bd; |
| 43 | 43 | ||
| @@ -388,32 +388,6 @@ static struct page *gfs2_get_log_desc(struct gfs2_sbd *sdp, u32 ld_type, | |||
| 388 | return page; | 388 | return page; |
| 389 | } | 389 | } |
| 390 | 390 | ||
| 391 | static void buf_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) | ||
| 392 | { | ||
| 393 | struct gfs2_meta_header *mh; | ||
| 394 | struct gfs2_trans *tr; | ||
| 395 | |||
| 396 | tr = current->journal_info; | ||
| 397 | tr->tr_touched = 1; | ||
| 398 | if (!list_empty(&bd->bd_list)) | ||
| 399 | return; | ||
| 400 | set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags); | ||
| 401 | set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags); | ||
| 402 | mh = (struct gfs2_meta_header *)bd->bd_bh->b_data; | ||
| 403 | if (unlikely(mh->mh_magic != cpu_to_be32(GFS2_MAGIC))) { | ||
| 404 | printk(KERN_ERR | ||
| 405 | "Attempting to add uninitialised block to journal (inplace block=%lld)\n", | ||
| 406 | (unsigned long long)bd->bd_bh->b_blocknr); | ||
| 407 | BUG(); | ||
| 408 | } | ||
| 409 | gfs2_pin(sdp, bd->bd_bh); | ||
| 410 | mh->__pad0 = cpu_to_be64(0); | ||
| 411 | mh->mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid); | ||
| 412 | sdp->sd_log_num_buf++; | ||
| 413 | list_add(&bd->bd_list, &sdp->sd_log_le_buf); | ||
| 414 | tr->tr_num_buf_new++; | ||
| 415 | } | ||
| 416 | |||
| 417 | static void gfs2_check_magic(struct buffer_head *bh) | 391 | static void gfs2_check_magic(struct buffer_head *bh) |
| 418 | { | 392 | { |
| 419 | void *kaddr; | 393 | void *kaddr; |
| @@ -600,20 +574,6 @@ static void buf_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass) | |||
| 600 | jd->jd_jid, sdp->sd_replayed_blocks, sdp->sd_found_blocks); | 574 | jd->jd_jid, sdp->sd_replayed_blocks, sdp->sd_found_blocks); |
| 601 | } | 575 | } |
| 602 | 576 | ||
| 603 | static void revoke_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) | ||
| 604 | { | ||
| 605 | struct gfs2_glock *gl = bd->bd_gl; | ||
| 606 | struct gfs2_trans *tr; | ||
| 607 | |||
| 608 | tr = current->journal_info; | ||
| 609 | tr->tr_touched = 1; | ||
| 610 | tr->tr_num_revoke++; | ||
| 611 | sdp->sd_log_num_revoke++; | ||
| 612 | atomic_inc(&gl->gl_revokes); | ||
| 613 | set_bit(GLF_LFLUSH, &gl->gl_flags); | ||
| 614 | list_add(&bd->bd_list, &sdp->sd_log_le_revoke); | ||
| 615 | } | ||
| 616 | |||
| 617 | static void revoke_lo_before_commit(struct gfs2_sbd *sdp) | 577 | static void revoke_lo_before_commit(struct gfs2_sbd *sdp) |
| 618 | { | 578 | { |
| 619 | struct gfs2_meta_header *mh; | 579 | struct gfs2_meta_header *mh; |
| @@ -749,44 +709,6 @@ static void revoke_lo_after_scan(struct gfs2_jdesc *jd, int error, int pass) | |||
| 749 | } | 709 | } |
| 750 | 710 | ||
| 751 | /** | 711 | /** |
| 752 | * databuf_lo_add - Add a databuf to the transaction. | ||
| 753 | * | ||
| 754 | * This is used in two distinct cases: | ||
| 755 | * i) In ordered write mode | ||
| 756 | * We put the data buffer on a list so that we can ensure that its | ||
| 757 | * synced to disk at the right time | ||
| 758 | * ii) In journaled data mode | ||
| 759 | * We need to journal the data block in the same way as metadata in | ||
| 760 | * the functions above. The difference is that here we have a tag | ||
| 761 | * which is two __be64's being the block number (as per meta data) | ||
| 762 | * and a flag which says whether the data block needs escaping or | ||
| 763 | * not. This means we need a new log entry for each 251 or so data | ||
| 764 | * blocks, which isn't an enormous overhead but twice as much as | ||
| 765 | * for normal metadata blocks. | ||
| 766 | */ | ||
| 767 | static void databuf_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) | ||
| 768 | { | ||
| 769 | struct gfs2_trans *tr = current->journal_info; | ||
| 770 | struct address_space *mapping = bd->bd_bh->b_page->mapping; | ||
| 771 | struct gfs2_inode *ip = GFS2_I(mapping->host); | ||
| 772 | |||
| 773 | if (tr) | ||
| 774 | tr->tr_touched = 1; | ||
| 775 | if (!list_empty(&bd->bd_list)) | ||
| 776 | return; | ||
| 777 | set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags); | ||
| 778 | set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags); | ||
| 779 | if (gfs2_is_jdata(ip)) { | ||
| 780 | gfs2_pin(sdp, bd->bd_bh); | ||
| 781 | tr->tr_num_databuf_new++; | ||
| 782 | sdp->sd_log_num_databuf++; | ||
| 783 | list_add_tail(&bd->bd_list, &sdp->sd_log_le_databuf); | ||
| 784 | } else { | ||
| 785 | list_add_tail(&bd->bd_list, &sdp->sd_log_le_ordered); | ||
| 786 | } | ||
| 787 | } | ||
| 788 | |||
| 789 | /** | ||
| 790 | * databuf_lo_before_commit - Scan the data buffers, writing as we go | 712 | * databuf_lo_before_commit - Scan the data buffers, writing as we go |
| 791 | * | 713 | * |
| 792 | */ | 714 | */ |
| @@ -885,7 +807,6 @@ static void databuf_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_ail *ai) | |||
| 885 | 807 | ||
| 886 | 808 | ||
| 887 | const struct gfs2_log_operations gfs2_buf_lops = { | 809 | const struct gfs2_log_operations gfs2_buf_lops = { |
| 888 | .lo_add = buf_lo_add, | ||
| 889 | .lo_before_commit = buf_lo_before_commit, | 810 | .lo_before_commit = buf_lo_before_commit, |
| 890 | .lo_after_commit = buf_lo_after_commit, | 811 | .lo_after_commit = buf_lo_after_commit, |
| 891 | .lo_before_scan = buf_lo_before_scan, | 812 | .lo_before_scan = buf_lo_before_scan, |
| @@ -895,7 +816,6 @@ const struct gfs2_log_operations gfs2_buf_lops = { | |||
| 895 | }; | 816 | }; |
| 896 | 817 | ||
| 897 | const struct gfs2_log_operations gfs2_revoke_lops = { | 818 | const struct gfs2_log_operations gfs2_revoke_lops = { |
| 898 | .lo_add = revoke_lo_add, | ||
| 899 | .lo_before_commit = revoke_lo_before_commit, | 819 | .lo_before_commit = revoke_lo_before_commit, |
| 900 | .lo_after_commit = revoke_lo_after_commit, | 820 | .lo_after_commit = revoke_lo_after_commit, |
| 901 | .lo_before_scan = revoke_lo_before_scan, | 821 | .lo_before_scan = revoke_lo_before_scan, |
| @@ -909,7 +829,6 @@ const struct gfs2_log_operations gfs2_rg_lops = { | |||
| 909 | }; | 829 | }; |
| 910 | 830 | ||
| 911 | const struct gfs2_log_operations gfs2_databuf_lops = { | 831 | const struct gfs2_log_operations gfs2_databuf_lops = { |
| 912 | .lo_add = databuf_lo_add, | ||
| 913 | .lo_before_commit = databuf_lo_before_commit, | 832 | .lo_before_commit = databuf_lo_before_commit, |
| 914 | .lo_after_commit = databuf_lo_after_commit, | 833 | .lo_after_commit = databuf_lo_after_commit, |
| 915 | .lo_scan_elements = databuf_lo_scan_elements, | 834 | .lo_scan_elements = databuf_lo_scan_elements, |
diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h index 954a330585f4..ba77b7da8325 100644 --- a/fs/gfs2/lops.h +++ b/fs/gfs2/lops.h | |||
| @@ -29,6 +29,7 @@ extern const struct gfs2_log_operations gfs2_databuf_lops; | |||
| 29 | extern const struct gfs2_log_operations *gfs2_log_ops[]; | 29 | extern const struct gfs2_log_operations *gfs2_log_ops[]; |
| 30 | extern void gfs2_log_write_page(struct gfs2_sbd *sdp, struct page *page); | 30 | extern void gfs2_log_write_page(struct gfs2_sbd *sdp, struct page *page); |
| 31 | extern void gfs2_log_flush_bio(struct gfs2_sbd *sdp, int rw); | 31 | extern void gfs2_log_flush_bio(struct gfs2_sbd *sdp, int rw); |
| 32 | extern void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh); | ||
| 32 | 33 | ||
| 33 | static inline unsigned int buf_limit(struct gfs2_sbd *sdp) | 34 | static inline unsigned int buf_limit(struct gfs2_sbd *sdp) |
| 34 | { | 35 | { |
| @@ -46,19 +47,6 @@ static inline unsigned int databuf_limit(struct gfs2_sbd *sdp) | |||
| 46 | return limit; | 47 | return limit; |
| 47 | } | 48 | } |
| 48 | 49 | ||
| 49 | static inline void lops_init_le(struct gfs2_bufdata *bd, | ||
| 50 | const struct gfs2_log_operations *lops) | ||
| 51 | { | ||
| 52 | INIT_LIST_HEAD(&bd->bd_list); | ||
| 53 | bd->bd_ops = lops; | ||
| 54 | } | ||
| 55 | |||
| 56 | static inline void lops_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) | ||
| 57 | { | ||
| 58 | if (bd->bd_ops->lo_add) | ||
| 59 | bd->bd_ops->lo_add(sdp, bd); | ||
| 60 | } | ||
| 61 | |||
| 62 | static inline void lops_before_commit(struct gfs2_sbd *sdp) | 50 | static inline void lops_before_commit(struct gfs2_sbd *sdp) |
| 63 | { | 51 | { |
| 64 | int x; | 52 | int x; |
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 22255d96b27e..b059bbb5059e 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c | |||
| @@ -271,41 +271,6 @@ int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh) | |||
| 271 | return 0; | 271 | return 0; |
| 272 | } | 272 | } |
| 273 | 273 | ||
| 274 | /** | ||
| 275 | * gfs2_attach_bufdata - attach a struct gfs2_bufdata structure to a buffer | ||
| 276 | * @gl: the glock the buffer belongs to | ||
| 277 | * @bh: The buffer to be attached to | ||
| 278 | * @meta: Flag to indicate whether its metadata or not | ||
| 279 | */ | ||
| 280 | |||
| 281 | void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh, | ||
| 282 | int meta) | ||
| 283 | { | ||
| 284 | struct gfs2_bufdata *bd; | ||
| 285 | |||
| 286 | if (meta) | ||
| 287 | lock_page(bh->b_page); | ||
| 288 | |||
| 289 | if (bh->b_private) { | ||
| 290 | if (meta) | ||
| 291 | unlock_page(bh->b_page); | ||
| 292 | return; | ||
| 293 | } | ||
| 294 | |||
| 295 | bd = kmem_cache_zalloc(gfs2_bufdata_cachep, GFP_NOFS | __GFP_NOFAIL); | ||
| 296 | bd->bd_bh = bh; | ||
| 297 | bd->bd_gl = gl; | ||
| 298 | |||
| 299 | if (meta) | ||
| 300 | lops_init_le(bd, &gfs2_buf_lops); | ||
| 301 | else | ||
| 302 | lops_init_le(bd, &gfs2_databuf_lops); | ||
| 303 | bh->b_private = bd; | ||
| 304 | |||
| 305 | if (meta) | ||
| 306 | unlock_page(bh->b_page); | ||
| 307 | } | ||
| 308 | |||
| 309 | void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int meta) | 274 | void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, int meta) |
| 310 | { | 275 | { |
| 311 | struct address_space *mapping = bh->b_page->mapping; | 276 | struct address_space *mapping = bh->b_page->mapping; |
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h index c30973b07a7c..0d4c843b6f8e 100644 --- a/fs/gfs2/meta_io.h +++ b/fs/gfs2/meta_io.h | |||
| @@ -56,9 +56,6 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, | |||
| 56 | int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh); | 56 | int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh); |
| 57 | struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create); | 57 | struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create); |
| 58 | 58 | ||
| 59 | void gfs2_attach_bufdata(struct gfs2_glock *gl, struct buffer_head *bh, | ||
| 60 | int meta); | ||
| 61 | |||
| 62 | void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, | 59 | void gfs2_remove_from_journal(struct buffer_head *bh, struct gfs2_trans *tr, |
| 63 | int meta); | 60 | int meta); |
| 64 | 61 | ||
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 0e3554edb8f2..1b612be4b873 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c | |||
| @@ -81,6 +81,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) | |||
| 81 | init_waitqueue_head(&sdp->sd_glock_wait); | 81 | init_waitqueue_head(&sdp->sd_glock_wait); |
| 82 | atomic_set(&sdp->sd_glock_disposal, 0); | 82 | atomic_set(&sdp->sd_glock_disposal, 0); |
| 83 | init_completion(&sdp->sd_locking_init); | 83 | init_completion(&sdp->sd_locking_init); |
| 84 | init_completion(&sdp->sd_wdack); | ||
| 84 | spin_lock_init(&sdp->sd_statfs_spin); | 85 | spin_lock_init(&sdp->sd_statfs_spin); |
| 85 | 86 | ||
| 86 | spin_lock_init(&sdp->sd_rindex_spin); | 87 | spin_lock_init(&sdp->sd_rindex_spin); |
| @@ -102,6 +103,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) | |||
| 102 | INIT_LIST_HEAD(&sdp->sd_log_le_revoke); | 103 | INIT_LIST_HEAD(&sdp->sd_log_le_revoke); |
| 103 | INIT_LIST_HEAD(&sdp->sd_log_le_databuf); | 104 | INIT_LIST_HEAD(&sdp->sd_log_le_databuf); |
| 104 | INIT_LIST_HEAD(&sdp->sd_log_le_ordered); | 105 | INIT_LIST_HEAD(&sdp->sd_log_le_ordered); |
| 106 | spin_lock_init(&sdp->sd_ordered_lock); | ||
| 105 | 107 | ||
| 106 | init_waitqueue_head(&sdp->sd_log_waitq); | 108 | init_waitqueue_head(&sdp->sd_log_waitq); |
| 107 | init_waitqueue_head(&sdp->sd_logd_waitq); | 109 | init_waitqueue_head(&sdp->sd_logd_waitq); |
| @@ -115,8 +117,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) | |||
| 115 | 117 | ||
| 116 | INIT_LIST_HEAD(&sdp->sd_revoke_list); | 118 | INIT_LIST_HEAD(&sdp->sd_revoke_list); |
| 117 | 119 | ||
| 118 | mutex_init(&sdp->sd_freeze_lock); | ||
| 119 | |||
| 120 | return sdp; | 120 | return sdp; |
| 121 | } | 121 | } |
| 122 | 122 | ||
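Note: init_sbd() now sets up two pieces of state that later hunks rely on: sd_ordered_lock, the spinlock protecting the ordered-inode list used by gfs2_ordered_add_inode(), and sd_wdack, the completion used for the withdraw acknowledgement (see the sys.c and util.c hunks below). As a reminder of the completion semantics those hunks depend on (standard kernel API, not part of this patch):

	struct completion ack;

	init_completion(&ack);	   /* starts out "not done"                       */
	complete(&ack);		   /* marks it done and wakes a sleeping waiter   */
	wait_for_completion(&ack); /* sleeps until complete(), or returns at once
				    * if the completion is already done           */

completion_done(), used by wdack_show() below, reports whether a completion is currently done without consuming it.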
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index ae55e248c3b7..06122d09c0d1 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c | |||
| @@ -590,7 +590,7 @@ static void do_qc(struct gfs2_quota_data *qd, s64 change) | |||
| 590 | s64 x; | 590 | s64 x; |
| 591 | 591 | ||
| 592 | mutex_lock(&sdp->sd_quota_mutex); | 592 | mutex_lock(&sdp->sd_quota_mutex); |
| 593 | gfs2_trans_add_bh(ip->i_gl, qd->qd_bh, 1); | 593 | gfs2_trans_add_meta(ip->i_gl, qd->qd_bh); |
| 594 | 594 | ||
| 595 | if (!test_bit(QDF_CHANGE, &qd->qd_flags)) { | 595 | if (!test_bit(QDF_CHANGE, &qd->qd_flags)) { |
| 596 | qc->qc_change = 0; | 596 | qc->qc_change = 0; |
| @@ -726,7 +726,7 @@ get_a_page: | |||
| 726 | goto unlock_out; | 726 | goto unlock_out; |
| 727 | } | 727 | } |
| 728 | 728 | ||
| 729 | gfs2_trans_add_bh(ip->i_gl, bh, 0); | 729 | gfs2_trans_add_meta(ip->i_gl, bh); |
| 730 | 730 | ||
| 731 | kaddr = kmap_atomic(page); | 731 | kaddr = kmap_atomic(page); |
| 732 | if (offset + sizeof(struct gfs2_quota) > PAGE_CACHE_SIZE) | 732 | if (offset + sizeof(struct gfs2_quota) > PAGE_CACHE_SIZE) |
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 37ee061d899e..52c2aeaf45ce 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c | |||
| @@ -350,10 +350,14 @@ static u32 gfs2_free_extlen(const struct gfs2_rbm *rrbm, u32 len) | |||
| 350 | BUG_ON(len < chunk_size); | 350 | BUG_ON(len < chunk_size); |
| 351 | len -= chunk_size; | 351 | len -= chunk_size; |
| 352 | block = gfs2_rbm_to_block(&rbm); | 352 | block = gfs2_rbm_to_block(&rbm); |
| 353 | gfs2_rbm_from_block(&rbm, block + chunk_size); | 353 | if (gfs2_rbm_from_block(&rbm, block + chunk_size)) { |
| 354 | n_unaligned = 3; | 354 | n_unaligned = 0; |
| 355 | if (ptr) | ||
| 356 | break; | 355 | break; |
| 356 | } | ||
| 357 | if (ptr) { | ||
| 358 | n_unaligned = 3; | ||
| 359 | break; | ||
| 360 | } | ||
| 357 | n_unaligned = len & 3; | 361 | n_unaligned = len & 3; |
| 358 | } | 362 | } |
| 359 | 363 | ||
| @@ -557,22 +561,20 @@ void gfs2_free_clones(struct gfs2_rgrpd *rgd) | |||
| 557 | */ | 561 | */ |
| 558 | int gfs2_rs_alloc(struct gfs2_inode *ip) | 562 | int gfs2_rs_alloc(struct gfs2_inode *ip) |
| 559 | { | 563 | { |
| 560 | struct gfs2_blkreserv *res; | 564 | int error = 0; |
| 561 | 565 | ||
| 566 | down_write(&ip->i_rw_mutex); | ||
| 562 | if (ip->i_res) | 567 | if (ip->i_res) |
| 563 | return 0; | 568 | goto out; |
| 564 | |||
| 565 | res = kmem_cache_zalloc(gfs2_rsrv_cachep, GFP_NOFS); | ||
| 566 | if (!res) | ||
| 567 | return -ENOMEM; | ||
| 568 | 569 | ||
| 569 | RB_CLEAR_NODE(&res->rs_node); | 570 | ip->i_res = kmem_cache_zalloc(gfs2_rsrv_cachep, GFP_NOFS); |
| 571 | if (!ip->i_res) { | ||
| 572 | error = -ENOMEM; | ||
| 573 | goto out; | ||
| 574 | } | ||
| 570 | 575 | ||
| 571 | down_write(&ip->i_rw_mutex); | 576 | RB_CLEAR_NODE(&ip->i_res->rs_node); |
| 572 | if (ip->i_res) | 577 | out: |
| 573 | kmem_cache_free(gfs2_rsrv_cachep, res); | ||
| 574 | else | ||
| 575 | ip->i_res = res; | ||
| 576 | up_write(&ip->i_rw_mutex); | 578 | up_write(&ip->i_rw_mutex); |
| 577 | return 0; | 579 | return 0; |
| 578 | } | 580 | } |
| @@ -1321,7 +1323,7 @@ int gfs2_fitrim(struct file *filp, void __user *argp) | |||
| 1321 | if (ret == 0) { | 1323 | if (ret == 0) { |
| 1322 | bh = rgd->rd_bits[0].bi_bh; | 1324 | bh = rgd->rd_bits[0].bi_bh; |
| 1323 | rgd->rd_flags |= GFS2_RGF_TRIMMED; | 1325 | rgd->rd_flags |= GFS2_RGF_TRIMMED; |
| 1324 | gfs2_trans_add_bh(rgd->rd_gl, bh, 1); | 1326 | gfs2_trans_add_meta(rgd->rd_gl, bh); |
| 1325 | gfs2_rgrp_out(rgd, bh->b_data); | 1327 | gfs2_rgrp_out(rgd, bh->b_data); |
| 1326 | gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, bh->b_data); | 1328 | gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, bh->b_data); |
| 1327 | gfs2_trans_end(sdp); | 1329 | gfs2_trans_end(sdp); |
| @@ -1424,6 +1426,9 @@ static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip, | |||
| 1424 | rs->rs_free = extlen; | 1426 | rs->rs_free = extlen; |
| 1425 | rs->rs_inum = ip->i_no_addr; | 1427 | rs->rs_inum = ip->i_no_addr; |
| 1426 | rs_insert(ip); | 1428 | rs_insert(ip); |
| 1429 | } else { | ||
| 1430 | if (goal == rgd->rd_last_alloc + rgd->rd_data0) | ||
| 1431 | rgd->rd_last_alloc = 0; | ||
| 1427 | } | 1432 | } |
| 1428 | } | 1433 | } |
| 1429 | 1434 | ||
| @@ -1963,14 +1968,14 @@ static void gfs2_alloc_extent(const struct gfs2_rbm *rbm, bool dinode, | |||
| 1963 | 1968 | ||
| 1964 | *n = 1; | 1969 | *n = 1; |
| 1965 | block = gfs2_rbm_to_block(rbm); | 1970 | block = gfs2_rbm_to_block(rbm); |
| 1966 | gfs2_trans_add_bh(rbm->rgd->rd_gl, rbm->bi->bi_bh, 1); | 1971 | gfs2_trans_add_meta(rbm->rgd->rd_gl, rbm->bi->bi_bh); |
| 1967 | gfs2_setbit(rbm, true, dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED); | 1972 | gfs2_setbit(rbm, true, dinode ? GFS2_BLKST_DINODE : GFS2_BLKST_USED); |
| 1968 | block++; | 1973 | block++; |
| 1969 | while (*n < elen) { | 1974 | while (*n < elen) { |
| 1970 | ret = gfs2_rbm_from_block(&pos, block); | 1975 | ret = gfs2_rbm_from_block(&pos, block); |
| 1971 | if (ret || gfs2_testbit(&pos) != GFS2_BLKST_FREE) | 1976 | if (ret || gfs2_testbit(&pos) != GFS2_BLKST_FREE) |
| 1972 | break; | 1977 | break; |
| 1973 | gfs2_trans_add_bh(pos.rgd->rd_gl, pos.bi->bi_bh, 1); | 1978 | gfs2_trans_add_meta(pos.rgd->rd_gl, pos.bi->bi_bh); |
| 1974 | gfs2_setbit(&pos, true, GFS2_BLKST_USED); | 1979 | gfs2_setbit(&pos, true, GFS2_BLKST_USED); |
| 1975 | (*n)++; | 1980 | (*n)++; |
| 1976 | block++; | 1981 | block++; |
| @@ -2009,7 +2014,7 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart, | |||
| 2009 | rbm.bi->bi_bh->b_data + rbm.bi->bi_offset, | 2014 | rbm.bi->bi_bh->b_data + rbm.bi->bi_offset, |
| 2010 | rbm.bi->bi_len); | 2015 | rbm.bi->bi_len); |
| 2011 | } | 2016 | } |
| 2012 | gfs2_trans_add_bh(rbm.rgd->rd_gl, rbm.bi->bi_bh, 1); | 2017 | gfs2_trans_add_meta(rbm.rgd->rd_gl, rbm.bi->bi_bh); |
| 2013 | gfs2_setbit(&rbm, false, new_state); | 2018 | gfs2_setbit(&rbm, false, new_state); |
| 2014 | } | 2019 | } |
| 2015 | 2020 | ||
| @@ -2152,7 +2157,7 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks, | |||
| 2152 | if (error == 0) { | 2157 | if (error == 0) { |
| 2153 | struct gfs2_dinode *di = | 2158 | struct gfs2_dinode *di = |
| 2154 | (struct gfs2_dinode *)dibh->b_data; | 2159 | (struct gfs2_dinode *)dibh->b_data; |
| 2155 | gfs2_trans_add_bh(ip->i_gl, dibh, 1); | 2160 | gfs2_trans_add_meta(ip->i_gl, dibh); |
| 2156 | di->di_goal_meta = di->di_goal_data = | 2161 | di->di_goal_meta = di->di_goal_data = |
| 2157 | cpu_to_be64(ip->i_goal); | 2162 | cpu_to_be64(ip->i_goal); |
| 2158 | brelse(dibh); | 2163 | brelse(dibh); |
| @@ -2171,7 +2176,7 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks, | |||
| 2171 | *generation = rbm.rgd->rd_igeneration++; | 2176 | *generation = rbm.rgd->rd_igeneration++; |
| 2172 | } | 2177 | } |
| 2173 | 2178 | ||
| 2174 | gfs2_trans_add_bh(rbm.rgd->rd_gl, rbm.rgd->rd_bits[0].bi_bh, 1); | 2179 | gfs2_trans_add_meta(rbm.rgd->rd_gl, rbm.rgd->rd_bits[0].bi_bh); |
| 2175 | gfs2_rgrp_out(rbm.rgd, rbm.rgd->rd_bits[0].bi_bh->b_data); | 2180 | gfs2_rgrp_out(rbm.rgd, rbm.rgd->rd_bits[0].bi_bh->b_data); |
| 2176 | gfs2_rgrp_ondisk2lvb(rbm.rgd->rd_rgl, rbm.rgd->rd_bits[0].bi_bh->b_data); | 2181 | gfs2_rgrp_ondisk2lvb(rbm.rgd->rd_rgl, rbm.rgd->rd_bits[0].bi_bh->b_data); |
| 2177 | 2182 | ||
| @@ -2218,7 +2223,7 @@ void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta) | |||
| 2218 | trace_gfs2_block_alloc(ip, rgd, bstart, blen, GFS2_BLKST_FREE); | 2223 | trace_gfs2_block_alloc(ip, rgd, bstart, blen, GFS2_BLKST_FREE); |
| 2219 | rgd->rd_free += blen; | 2224 | rgd->rd_free += blen; |
| 2220 | rgd->rd_flags &= ~GFS2_RGF_TRIMMED; | 2225 | rgd->rd_flags &= ~GFS2_RGF_TRIMMED; |
| 2221 | gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); | 2226 | gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh); |
| 2222 | gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); | 2227 | gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); |
| 2223 | gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, rgd->rd_bits[0].bi_bh->b_data); | 2228 | gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, rgd->rd_bits[0].bi_bh->b_data); |
| 2224 | 2229 | ||
| @@ -2255,7 +2260,7 @@ void gfs2_unlink_di(struct inode *inode) | |||
| 2255 | if (!rgd) | 2260 | if (!rgd) |
| 2256 | return; | 2261 | return; |
| 2257 | trace_gfs2_block_alloc(ip, rgd, blkno, 1, GFS2_BLKST_UNLINKED); | 2262 | trace_gfs2_block_alloc(ip, rgd, blkno, 1, GFS2_BLKST_UNLINKED); |
| 2258 | gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); | 2263 | gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh); |
| 2259 | gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); | 2264 | gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); |
| 2260 | gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, rgd->rd_bits[0].bi_bh->b_data); | 2265 | gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, rgd->rd_bits[0].bi_bh->b_data); |
| 2261 | update_rgrp_lvb_unlinked(rgd, 1); | 2266 | update_rgrp_lvb_unlinked(rgd, 1); |
| @@ -2276,7 +2281,7 @@ static void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, u64 blkno) | |||
| 2276 | rgd->rd_dinodes--; | 2281 | rgd->rd_dinodes--; |
| 2277 | rgd->rd_free++; | 2282 | rgd->rd_free++; |
| 2278 | 2283 | ||
| 2279 | gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); | 2284 | gfs2_trans_add_meta(rgd->rd_gl, rgd->rd_bits[0].bi_bh); |
| 2280 | gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); | 2285 | gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); |
| 2281 | gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, rgd->rd_bits[0].bi_bh->b_data); | 2286 | gfs2_rgrp_ondisk2lvb(rgd->rd_rgl, rgd->rd_bits[0].bi_bh->b_data); |
| 2282 | update_rgrp_lvb_unlinked(rgd, -1); | 2287 | update_rgrp_lvb_unlinked(rgd, -1); |
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index d6488674d916..a3b40eeaa6e2 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c | |||
| @@ -500,7 +500,7 @@ void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free, | |||
| 500 | if (error) | 500 | if (error) |
| 501 | return; | 501 | return; |
| 502 | 502 | ||
| 503 | gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1); | 503 | gfs2_trans_add_meta(l_ip->i_gl, l_bh); |
| 504 | 504 | ||
| 505 | spin_lock(&sdp->sd_statfs_spin); | 505 | spin_lock(&sdp->sd_statfs_spin); |
| 506 | l_sc->sc_total += total; | 506 | l_sc->sc_total += total; |
| @@ -528,7 +528,7 @@ void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh, | |||
| 528 | struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; | 528 | struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; |
| 529 | struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; | 529 | struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; |
| 530 | 530 | ||
| 531 | gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1); | 531 | gfs2_trans_add_meta(l_ip->i_gl, l_bh); |
| 532 | 532 | ||
| 533 | spin_lock(&sdp->sd_statfs_spin); | 533 | spin_lock(&sdp->sd_statfs_spin); |
| 534 | m_sc->sc_total += l_sc->sc_total; | 534 | m_sc->sc_total += l_sc->sc_total; |
| @@ -539,7 +539,7 @@ void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh, | |||
| 539 | 0, sizeof(struct gfs2_statfs_change)); | 539 | 0, sizeof(struct gfs2_statfs_change)); |
| 540 | spin_unlock(&sdp->sd_statfs_spin); | 540 | spin_unlock(&sdp->sd_statfs_spin); |
| 541 | 541 | ||
| 542 | gfs2_trans_add_bh(m_ip->i_gl, m_bh, 1); | 542 | gfs2_trans_add_meta(m_ip->i_gl, m_bh); |
| 543 | gfs2_statfs_change_out(m_sc, m_bh->b_data + sizeof(struct gfs2_dinode)); | 543 | gfs2_statfs_change_out(m_sc, m_bh->b_data + sizeof(struct gfs2_dinode)); |
| 544 | } | 544 | } |
| 545 | 545 | ||
| @@ -663,54 +663,6 @@ out: | |||
| 663 | return error; | 663 | return error; |
| 664 | } | 664 | } |
| 665 | 665 | ||
| 666 | /** | ||
| 667 | * gfs2_freeze_fs - freezes the file system | ||
| 668 | * @sdp: the file system | ||
| 669 | * | ||
| 670 | * This function flushes data and meta data for all machines by | ||
| 671 | * acquiring the transaction log exclusively. All journals are | ||
| 672 | * ensured to be in a clean state as well. | ||
| 673 | * | ||
| 674 | * Returns: errno | ||
| 675 | */ | ||
| 676 | |||
| 677 | int gfs2_freeze_fs(struct gfs2_sbd *sdp) | ||
| 678 | { | ||
| 679 | int error = 0; | ||
| 680 | |||
| 681 | mutex_lock(&sdp->sd_freeze_lock); | ||
| 682 | |||
| 683 | if (!sdp->sd_freeze_count++) { | ||
| 684 | error = gfs2_lock_fs_check_clean(sdp, &sdp->sd_freeze_gh); | ||
| 685 | if (error) | ||
| 686 | sdp->sd_freeze_count--; | ||
| 687 | } | ||
| 688 | |||
| 689 | mutex_unlock(&sdp->sd_freeze_lock); | ||
| 690 | |||
| 691 | return error; | ||
| 692 | } | ||
| 693 | |||
| 694 | /** | ||
| 695 | * gfs2_unfreeze_fs - unfreezes the file system | ||
| 696 | * @sdp: the file system | ||
| 697 | * | ||
| 698 | * This function allows the file system to proceed by unlocking | ||
| 699 | * the exclusively held transaction lock. Other GFS2 nodes are | ||
| 700 | * now free to acquire the lock shared and go on with their lives. | ||
| 701 | * | ||
| 702 | */ | ||
| 703 | |||
| 704 | void gfs2_unfreeze_fs(struct gfs2_sbd *sdp) | ||
| 705 | { | ||
| 706 | mutex_lock(&sdp->sd_freeze_lock); | ||
| 707 | |||
| 708 | if (sdp->sd_freeze_count && !--sdp->sd_freeze_count) | ||
| 709 | gfs2_glock_dq_uninit(&sdp->sd_freeze_gh); | ||
| 710 | |||
| 711 | mutex_unlock(&sdp->sd_freeze_lock); | ||
| 712 | } | ||
| 713 | |||
| 714 | void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf) | 666 | void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf) |
| 715 | { | 667 | { |
| 716 | struct gfs2_dinode *str = buf; | 668 | struct gfs2_dinode *str = buf; |
| @@ -824,7 +776,7 @@ static void gfs2_dirty_inode(struct inode *inode, int flags) | |||
| 824 | 776 | ||
| 825 | ret = gfs2_meta_inode_buffer(ip, &bh); | 777 | ret = gfs2_meta_inode_buffer(ip, &bh); |
| 826 | if (ret == 0) { | 778 | if (ret == 0) { |
| 827 | gfs2_trans_add_bh(ip->i_gl, bh, 1); | 779 | gfs2_trans_add_meta(ip->i_gl, bh); |
| 828 | gfs2_dinode_out(ip, bh->b_data); | 780 | gfs2_dinode_out(ip, bh->b_data); |
| 829 | brelse(bh); | 781 | brelse(bh); |
| 830 | } | 782 | } |
| @@ -888,13 +840,6 @@ static void gfs2_put_super(struct super_block *sb) | |||
| 888 | int error; | 840 | int error; |
| 889 | struct gfs2_jdesc *jd; | 841 | struct gfs2_jdesc *jd; |
| 890 | 842 | ||
| 891 | /* Unfreeze the filesystem, if we need to */ | ||
| 892 | |||
| 893 | mutex_lock(&sdp->sd_freeze_lock); | ||
| 894 | if (sdp->sd_freeze_count) | ||
| 895 | gfs2_glock_dq_uninit(&sdp->sd_freeze_gh); | ||
| 896 | mutex_unlock(&sdp->sd_freeze_lock); | ||
| 897 | |||
| 898 | /* No more recovery requests */ | 843 | /* No more recovery requests */ |
| 899 | set_bit(SDF_NORECOVERY, &sdp->sd_flags); | 844 | set_bit(SDF_NORECOVERY, &sdp->sd_flags); |
| 900 | smp_mb(); | 845 | smp_mb(); |
| @@ -985,7 +930,7 @@ static int gfs2_freeze(struct super_block *sb) | |||
| 985 | return -EINVAL; | 930 | return -EINVAL; |
| 986 | 931 | ||
| 987 | for (;;) { | 932 | for (;;) { |
| 988 | error = gfs2_freeze_fs(sdp); | 933 | error = gfs2_lock_fs_check_clean(sdp, &sdp->sd_freeze_gh); |
| 989 | if (!error) | 934 | if (!error) |
| 990 | break; | 935 | break; |
| 991 | 936 | ||
| @@ -1013,7 +958,9 @@ static int gfs2_freeze(struct super_block *sb) | |||
| 1013 | 958 | ||
| 1014 | static int gfs2_unfreeze(struct super_block *sb) | 959 | static int gfs2_unfreeze(struct super_block *sb) |
| 1015 | { | 960 | { |
| 1016 | gfs2_unfreeze_fs(sb->s_fs_info); | 961 | struct gfs2_sbd *sdp = sb->s_fs_info; |
| 962 | |||
| 963 | gfs2_glock_dq_uninit(&sdp->sd_freeze_gh); | ||
| 1017 | return 0; | 964 | return 0; |
| 1018 | } | 965 | } |
| 1019 | 966 | ||
| @@ -1577,6 +1524,7 @@ out: | |||
| 1577 | /* Case 3 starts here */ | 1524 | /* Case 3 starts here */ |
| 1578 | truncate_inode_pages(&inode->i_data, 0); | 1525 | truncate_inode_pages(&inode->i_data, 0); |
| 1579 | gfs2_rs_delete(ip); | 1526 | gfs2_rs_delete(ip); |
| 1527 | gfs2_ordered_del_inode(ip); | ||
| 1580 | clear_inode(inode); | 1528 | clear_inode(inode); |
| 1581 | gfs2_dir_hash_inval(ip); | 1529 | gfs2_dir_hash_inval(ip); |
| 1582 | ip->i_gl->gl_object = NULL; | 1530 | ip->i_gl->gl_object = NULL; |
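Note: with gfs2_freeze_fs()/gfs2_unfreeze_fs() gone, gfs2_freeze() and gfs2_unfreeze() are driven only through the VFS, and the freeze state lives in sb->s_writers rather than in the removed sd_freeze_count/sd_freeze_lock pair (hence the deleted cleanup in gfs2_put_super()). A sketch of the assumed wiring, consistent with how gfs2 registers its super_operations (fields not relevant here are elided):

	static const struct super_operations gfs2_super_ops = {
		/* ... */
		.freeze_fs   = gfs2_freeze,   /* called from freeze_super(sb) */
		.unfreeze_fs = gfs2_unfreeze, /* called from thaw_super(sb)   */
		/* ... */
	};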
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h index a0464680af0b..90e3322ffa10 100644 --- a/fs/gfs2/super.h +++ b/fs/gfs2/super.h | |||
| @@ -46,9 +46,6 @@ extern void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh, | |||
| 46 | struct buffer_head *l_bh); | 46 | struct buffer_head *l_bh); |
| 47 | extern int gfs2_statfs_sync(struct super_block *sb, int type); | 47 | extern int gfs2_statfs_sync(struct super_block *sb, int type); |
| 48 | 48 | ||
| 49 | extern int gfs2_freeze_fs(struct gfs2_sbd *sdp); | ||
| 50 | extern void gfs2_unfreeze_fs(struct gfs2_sbd *sdp); | ||
| 51 | |||
| 52 | extern struct file_system_type gfs2_fs_type; | 49 | extern struct file_system_type gfs2_fs_type; |
| 53 | extern struct file_system_type gfs2meta_fs_type; | 50 | extern struct file_system_type gfs2meta_fs_type; |
| 54 | extern const struct export_operations gfs2_export_ops; | 51 | extern const struct export_operations gfs2_export_ops; |
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index 8056b7b7238e..4fb9ad80d260 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c | |||
| @@ -91,19 +91,15 @@ static ssize_t uuid_show(struct gfs2_sbd *sdp, char *buf) | |||
| 91 | 91 | ||
| 92 | static ssize_t freeze_show(struct gfs2_sbd *sdp, char *buf) | 92 | static ssize_t freeze_show(struct gfs2_sbd *sdp, char *buf) |
| 93 | { | 93 | { |
| 94 | unsigned int count; | 94 | struct super_block *sb = sdp->sd_vfs; |
| 95 | 95 | int frozen = (sb->s_writers.frozen == SB_UNFROZEN) ? 0 : 1; | |
| 96 | mutex_lock(&sdp->sd_freeze_lock); | ||
| 97 | count = sdp->sd_freeze_count; | ||
| 98 | mutex_unlock(&sdp->sd_freeze_lock); | ||
| 99 | 96 | ||
| 100 | return snprintf(buf, PAGE_SIZE, "%u\n", count); | 97 | return snprintf(buf, PAGE_SIZE, "%u\n", frozen); |
| 101 | } | 98 | } |
| 102 | 99 | ||
| 103 | static ssize_t freeze_store(struct gfs2_sbd *sdp, const char *buf, size_t len) | 100 | static ssize_t freeze_store(struct gfs2_sbd *sdp, const char *buf, size_t len) |
| 104 | { | 101 | { |
| 105 | ssize_t ret = len; | 102 | int error; |
| 106 | int error = 0; | ||
| 107 | int n = simple_strtol(buf, NULL, 0); | 103 | int n = simple_strtol(buf, NULL, 0); |
| 108 | 104 | ||
| 109 | if (!capable(CAP_SYS_ADMIN)) | 105 | if (!capable(CAP_SYS_ADMIN)) |
| @@ -111,19 +107,21 @@ static ssize_t freeze_store(struct gfs2_sbd *sdp, const char *buf, size_t len) | |||
| 111 | 107 | ||
| 112 | switch (n) { | 108 | switch (n) { |
| 113 | case 0: | 109 | case 0: |
| 114 | gfs2_unfreeze_fs(sdp); | 110 | error = thaw_super(sdp->sd_vfs); |
| 115 | break; | 111 | break; |
| 116 | case 1: | 112 | case 1: |
| 117 | error = gfs2_freeze_fs(sdp); | 113 | error = freeze_super(sdp->sd_vfs); |
| 118 | break; | 114 | break; |
| 119 | default: | 115 | default: |
| 120 | ret = -EINVAL; | 116 | return -EINVAL; |
| 121 | } | 117 | } |
| 122 | 118 | ||
| 123 | if (error) | 119 | if (error) { |
| 124 | fs_warn(sdp, "freeze %d error %d", n, error); | 120 | fs_warn(sdp, "freeze %d error %d", n, error); |
| 121 | return error; | ||
| 122 | } | ||
| 125 | 123 | ||
| 126 | return ret; | 124 | return len; |
| 127 | } | 125 | } |
| 128 | 126 | ||
| 129 | static ssize_t withdraw_show(struct gfs2_sbd *sdp, char *buf) | 127 | static ssize_t withdraw_show(struct gfs2_sbd *sdp, char *buf) |
| @@ -332,6 +330,28 @@ static ssize_t block_store(struct gfs2_sbd *sdp, const char *buf, size_t len) | |||
| 332 | return ret; | 330 | return ret; |
| 333 | } | 331 | } |
| 334 | 332 | ||
| 333 | static ssize_t wdack_show(struct gfs2_sbd *sdp, char *buf) | ||
| 334 | { | ||
| 335 | int val = completion_done(&sdp->sd_wdack) ? 1 : 0; | ||
| 336 | |||
| 337 | return sprintf(buf, "%d\n", val); | ||
| 338 | } | ||
| 339 | |||
| 340 | static ssize_t wdack_store(struct gfs2_sbd *sdp, const char *buf, size_t len) | ||
| 341 | { | ||
| 342 | ssize_t ret = len; | ||
| 343 | int val; | ||
| 344 | |||
| 345 | val = simple_strtol(buf, NULL, 0); | ||
| 346 | |||
| 347 | if ((val == 1) && | ||
| 348 | !strcmp(sdp->sd_lockstruct.ls_ops->lm_proto_name, "lock_dlm")) | ||
| 349 | complete(&sdp->sd_wdack); | ||
| 350 | else | ||
| 351 | ret = -EINVAL; | ||
| 352 | return ret; | ||
| 353 | } | ||
| 354 | |||
| 335 | static ssize_t lkfirst_show(struct gfs2_sbd *sdp, char *buf) | 355 | static ssize_t lkfirst_show(struct gfs2_sbd *sdp, char *buf) |
| 336 | { | 356 | { |
| 337 | struct lm_lockstruct *ls = &sdp->sd_lockstruct; | 357 | struct lm_lockstruct *ls = &sdp->sd_lockstruct; |
| @@ -463,7 +483,7 @@ static struct gfs2_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store) | |||
| 463 | 483 | ||
| 464 | GDLM_ATTR(proto_name, 0444, proto_name_show, NULL); | 484 | GDLM_ATTR(proto_name, 0444, proto_name_show, NULL); |
| 465 | GDLM_ATTR(block, 0644, block_show, block_store); | 485 | GDLM_ATTR(block, 0644, block_show, block_store); |
| 466 | GDLM_ATTR(withdraw, 0644, withdraw_show, withdraw_store); | 486 | GDLM_ATTR(withdraw, 0644, wdack_show, wdack_store); |
| 467 | GDLM_ATTR(jid, 0644, jid_show, jid_store); | 487 | GDLM_ATTR(jid, 0644, jid_show, jid_store); |
| 468 | GDLM_ATTR(first, 0644, lkfirst_show, lkfirst_store); | 488 | GDLM_ATTR(first, 0644, lkfirst_show, lkfirst_store); |
| 469 | GDLM_ATTR(first_done, 0444, first_done_show, NULL); | 489 | GDLM_ATTR(first_done, 0444, first_done_show, NULL); |
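Note: the freeze sysfs file keeps its name but now reports 0 or 1 from sb->s_writers.frozen instead of the old sd_freeze_count, and writes are routed through freeze_super()/thaw_super(), so this file and the FIFREEZE/FITHAW ioctls share one freeze state. A hypothetical user-space helper showing the ioctl path that ends up in the same freeze_super() implementation:

	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/fs.h>

	/* Freeze or thaw the filesystem mounted at mntpoint; returns 0 on success. */
	static int set_frozen(const char *mntpoint, int freeze)
	{
		int fd = open(mntpoint, O_RDONLY);
		int ret;

		if (fd < 0)
			return -1;
		ret = ioctl(fd, freeze ? FIFREEZE : FITHAW, 0);
		close(fd);
		return ret;
	}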
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index 413627072f36..88162fae27a5 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c | |||
| @@ -18,6 +18,7 @@ | |||
| 18 | #include "gfs2.h" | 18 | #include "gfs2.h" |
| 19 | #include "incore.h" | 19 | #include "incore.h" |
| 20 | #include "glock.h" | 20 | #include "glock.h" |
| 21 | #include "inode.h" | ||
| 21 | #include "log.h" | 22 | #include "log.h" |
| 22 | #include "lops.h" | 23 | #include "lops.h" |
| 23 | #include "meta_io.h" | 24 | #include "meta_io.h" |
| @@ -142,44 +143,143 @@ void gfs2_trans_end(struct gfs2_sbd *sdp) | |||
| 142 | sb_end_intwrite(sdp->sd_vfs); | 143 | sb_end_intwrite(sdp->sd_vfs); |
| 143 | } | 144 | } |
| 144 | 145 | ||
| 146 | static struct gfs2_bufdata *gfs2_alloc_bufdata(struct gfs2_glock *gl, | ||
| 147 | struct buffer_head *bh, | ||
| 148 | const struct gfs2_log_operations *lops) | ||
| 149 | { | ||
| 150 | struct gfs2_bufdata *bd; | ||
| 151 | |||
| 152 | bd = kmem_cache_zalloc(gfs2_bufdata_cachep, GFP_NOFS | __GFP_NOFAIL); | ||
| 153 | bd->bd_bh = bh; | ||
| 154 | bd->bd_gl = gl; | ||
| 155 | bd->bd_ops = lops; | ||
| 156 | INIT_LIST_HEAD(&bd->bd_list); | ||
| 157 | bh->b_private = bd; | ||
| 158 | return bd; | ||
| 159 | } | ||
| 160 | |||
| 145 | /** | 161 | /** |
| 146 | * gfs2_trans_add_bh - Add a to-be-modified buffer to the current transaction | 162 | * gfs2_trans_add_data - Add a databuf to the transaction. |
| 147 | * @gl: the glock the buffer belongs to | 163 | * @gl: The inode glock associated with the buffer |
| 148 | * @bh: The buffer to add | 164 | * @bh: The buffer to add |
| 149 | * @meta: True in the case of adding metadata | ||
| 150 | * | 165 | * |
| 166 | * This is used in two distinct cases: | ||
| 167 | * i) In ordered write mode | ||
| 168 | * We put the data buffer on a list so that we can ensure that its | ||
| 169 | * synced to disk at the right time | ||
| 170 | * ii) In journaled data mode | ||
| 171 | * We need to journal the data block in the same way as metadata in | ||
| 172 | * the functions above. The difference is that here we have a tag | ||
| 173 | * which is two __be64's being the block number (as per meta data) | ||
| 174 | * and a flag which says whether the data block needs escaping or | ||
| 175 | * not. This means we need a new log entry for each 251 or so data | ||
| 176 | * blocks, which isn't an enormous overhead but twice as much as | ||
| 177 | * for normal metadata blocks. | ||
| 151 | */ | 178 | */ |
| 179 | void gfs2_trans_add_data(struct gfs2_glock *gl, struct buffer_head *bh) | ||
| 180 | { | ||
| 181 | struct gfs2_trans *tr = current->journal_info; | ||
| 182 | struct gfs2_sbd *sdp = gl->gl_sbd; | ||
| 183 | struct address_space *mapping = bh->b_page->mapping; | ||
| 184 | struct gfs2_inode *ip = GFS2_I(mapping->host); | ||
| 185 | struct gfs2_bufdata *bd; | ||
| 152 | 186 | ||
| 153 | void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta) | 187 | if (!gfs2_is_jdata(ip)) { |
| 188 | gfs2_ordered_add_inode(ip); | ||
| 189 | return; | ||
| 190 | } | ||
| 191 | |||
| 192 | lock_buffer(bh); | ||
| 193 | gfs2_log_lock(sdp); | ||
| 194 | bd = bh->b_private; | ||
| 195 | if (bd == NULL) { | ||
| 196 | gfs2_log_unlock(sdp); | ||
| 197 | unlock_buffer(bh); | ||
| 198 | if (bh->b_private == NULL) | ||
| 199 | bd = gfs2_alloc_bufdata(gl, bh, &gfs2_databuf_lops); | ||
| 200 | lock_buffer(bh); | ||
| 201 | gfs2_log_lock(sdp); | ||
| 202 | } | ||
| 203 | gfs2_assert(sdp, bd->bd_gl == gl); | ||
| 204 | tr->tr_touched = 1; | ||
| 205 | if (list_empty(&bd->bd_list)) { | ||
| 206 | set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags); | ||
| 207 | set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags); | ||
| 208 | gfs2_pin(sdp, bd->bd_bh); | ||
| 209 | tr->tr_num_databuf_new++; | ||
| 210 | sdp->sd_log_num_databuf++; | ||
| 211 | list_add_tail(&bd->bd_list, &sdp->sd_log_le_databuf); | ||
| 212 | } | ||
| 213 | gfs2_log_unlock(sdp); | ||
| 214 | unlock_buffer(bh); | ||
| 215 | } | ||
| 216 | |||
| 217 | static void meta_lo_add(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) | ||
| 154 | { | 218 | { |
| 219 | struct gfs2_meta_header *mh; | ||
| 220 | struct gfs2_trans *tr; | ||
| 221 | |||
| 222 | tr = current->journal_info; | ||
| 223 | tr->tr_touched = 1; | ||
| 224 | if (!list_empty(&bd->bd_list)) | ||
| 225 | return; | ||
| 226 | set_bit(GLF_LFLUSH, &bd->bd_gl->gl_flags); | ||
| 227 | set_bit(GLF_DIRTY, &bd->bd_gl->gl_flags); | ||
| 228 | mh = (struct gfs2_meta_header *)bd->bd_bh->b_data; | ||
| 229 | if (unlikely(mh->mh_magic != cpu_to_be32(GFS2_MAGIC))) { | ||
| 230 | printk(KERN_ERR | ||
| 231 | "Attempting to add uninitialised block to journal (inplace block=%lld)\n", | ||
| 232 | (unsigned long long)bd->bd_bh->b_blocknr); | ||
| 233 | BUG(); | ||
| 234 | } | ||
| 235 | gfs2_pin(sdp, bd->bd_bh); | ||
| 236 | mh->__pad0 = cpu_to_be64(0); | ||
| 237 | mh->mh_jid = cpu_to_be32(sdp->sd_jdesc->jd_jid); | ||
| 238 | sdp->sd_log_num_buf++; | ||
| 239 | list_add(&bd->bd_list, &sdp->sd_log_le_buf); | ||
| 240 | tr->tr_num_buf_new++; | ||
| 241 | } | ||
| 242 | |||
| 243 | void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh) | ||
| 244 | { | ||
| 245 | |||
| 155 | struct gfs2_sbd *sdp = gl->gl_sbd; | 246 | struct gfs2_sbd *sdp = gl->gl_sbd; |
| 156 | struct gfs2_bufdata *bd; | 247 | struct gfs2_bufdata *bd; |
| 157 | 248 | ||
| 158 | lock_buffer(bh); | 249 | lock_buffer(bh); |
| 159 | gfs2_log_lock(sdp); | 250 | gfs2_log_lock(sdp); |
| 160 | bd = bh->b_private; | 251 | bd = bh->b_private; |
| 161 | if (bd) | 252 | if (bd == NULL) { |
| 162 | gfs2_assert(sdp, bd->bd_gl == gl); | ||
| 163 | else { | ||
| 164 | gfs2_log_unlock(sdp); | 253 | gfs2_log_unlock(sdp); |
| 165 | unlock_buffer(bh); | 254 | unlock_buffer(bh); |
| 166 | gfs2_attach_bufdata(gl, bh, meta); | 255 | lock_page(bh->b_page); |
| 167 | bd = bh->b_private; | 256 | if (bh->b_private == NULL) |
| 257 | bd = gfs2_alloc_bufdata(gl, bh, &gfs2_buf_lops); | ||
| 258 | unlock_page(bh->b_page); | ||
| 168 | lock_buffer(bh); | 259 | lock_buffer(bh); |
| 169 | gfs2_log_lock(sdp); | 260 | gfs2_log_lock(sdp); |
| 170 | } | 261 | } |
| 171 | lops_add(sdp, bd); | 262 | gfs2_assert(sdp, bd->bd_gl == gl); |
| 263 | meta_lo_add(sdp, bd); | ||
| 172 | gfs2_log_unlock(sdp); | 264 | gfs2_log_unlock(sdp); |
| 173 | unlock_buffer(bh); | 265 | unlock_buffer(bh); |
| 174 | } | 266 | } |
| 175 | 267 | ||
| 176 | void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) | 268 | void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd) |
| 177 | { | 269 | { |
| 270 | struct gfs2_glock *gl = bd->bd_gl; | ||
| 271 | struct gfs2_trans *tr = current->journal_info; | ||
| 272 | |||
| 178 | BUG_ON(!list_empty(&bd->bd_list)); | 273 | BUG_ON(!list_empty(&bd->bd_list)); |
| 179 | BUG_ON(!list_empty(&bd->bd_ail_st_list)); | 274 | BUG_ON(!list_empty(&bd->bd_ail_st_list)); |
| 180 | BUG_ON(!list_empty(&bd->bd_ail_gl_list)); | 275 | BUG_ON(!list_empty(&bd->bd_ail_gl_list)); |
| 181 | lops_init_le(bd, &gfs2_revoke_lops); | 276 | bd->bd_ops = &gfs2_revoke_lops; |
| 182 | lops_add(sdp, bd); | 277 | tr->tr_touched = 1; |
| 278 | tr->tr_num_revoke++; | ||
| 279 | sdp->sd_log_num_revoke++; | ||
| 280 | atomic_inc(&gl->gl_revokes); | ||
| 281 | set_bit(GLF_LFLUSH, &gl->gl_flags); | ||
| 282 | list_add(&bd->bd_list, &sdp->sd_log_le_revoke); | ||
| 183 | } | 283 | } |
| 184 | 284 | ||
| 185 | void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len) | 285 | void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len) |
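Note: gfs2_trans_add_bh(gl, bh, meta) is replaced by two explicit entry points, with the lo_add indirection removed from lops.c above folded straight into this file: gfs2_trans_add_meta() keeps what used to be buf_lo_add() (as meta_lo_add()), and gfs2_trans_add_data() keeps what used to be databuf_lo_add(). For non-jdata inodes gfs2_trans_add_data() no longer allocates a bufdata at all; it just queues the owning inode via gfs2_ordered_add_inode(). A typical metadata caller after the split follows the gfs2_dirty_inode() hunk earlier in this series (error handling trimmed; shown as a pattern, not new code in this patch):

	struct buffer_head *dibh;

	if (gfs2_meta_inode_buffer(ip, &dibh) == 0) {
		gfs2_trans_add_meta(ip->i_gl, dibh);	/* was gfs2_trans_add_bh(gl, bh, 1) */
		gfs2_dinode_out(ip, dibh->b_data);
		brelse(dibh);
	}

Ordinary file data buffers go through gfs2_trans_add_data() instead, which for jdata files pins and journals the block much like metadata and otherwise only records the inode on the ordered-write list.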
diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h index bf2ae9aeee7a..1e6e7da25a17 100644 --- a/fs/gfs2/trans.h +++ b/fs/gfs2/trans.h | |||
| @@ -39,7 +39,8 @@ extern int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, | |||
| 39 | unsigned int revokes); | 39 | unsigned int revokes); |
| 40 | 40 | ||
| 41 | extern void gfs2_trans_end(struct gfs2_sbd *sdp); | 41 | extern void gfs2_trans_end(struct gfs2_sbd *sdp); |
| 42 | extern void gfs2_trans_add_bh(struct gfs2_glock *gl, struct buffer_head *bh, int meta); | 42 | extern void gfs2_trans_add_data(struct gfs2_glock *gl, struct buffer_head *bh); |
| 43 | extern void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh); | ||
| 43 | extern void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd); | 44 | extern void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd); |
| 44 | extern void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len); | 45 | extern void gfs2_trans_add_unrevoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len); |
| 45 | 46 | ||
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c index f00d7c5744f6..6402fb69d71b 100644 --- a/fs/gfs2/util.c +++ b/fs/gfs2/util.c | |||
| @@ -54,6 +54,9 @@ int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...) | |||
| 54 | 54 | ||
| 55 | kobject_uevent(&sdp->sd_kobj, KOBJ_OFFLINE); | 55 | kobject_uevent(&sdp->sd_kobj, KOBJ_OFFLINE); |
| 56 | 56 | ||
| 57 | if (!strcmp(sdp->sd_lockstruct.ls_ops->lm_proto_name, "lock_dlm")) | ||
| 58 | wait_for_completion(&sdp->sd_wdack); | ||
| 59 | |||
| 57 | if (lm->lm_unmount) { | 60 | if (lm->lm_unmount) { |
| 58 | fs_err(sdp, "telling LM to unmount\n"); | 61 | fs_err(sdp, "telling LM to unmount\n"); |
| 59 | lm->lm_unmount(sdp); | 62 | lm->lm_unmount(sdp); |
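Note: together with the wdack attribute added in sys.c above, this turns a withdraw into a two-way handshake when lock_dlm is in use: gfs2_lm_withdraw() raises the KOBJ_OFFLINE uevent and then blocks on sd_wdack until the cluster management side acknowledges by writing 1 to the withdraw sysfs file, whose store handler calls complete(). A condensed view of the two sides as they appear in the hunks above (simplified, not new code):

	/* kernel side, util.c: wait for the ack before asking the LM to unmount */
	kobject_uevent(&sdp->sd_kobj, KOBJ_OFFLINE);
	if (!strcmp(sdp->sd_lockstruct.ls_ops->lm_proto_name, "lock_dlm"))
		wait_for_completion(&sdp->sd_wdack);

	/* user-space ack, via sys.c (e.g. echo 1 > .../lock_module/withdraw) */
	if (val == 1 &&
	    !strcmp(sdp->sd_lockstruct.ls_ops->lm_proto_name, "lock_dlm"))
		complete(&sdp->sd_wdack);

The writer is presumably the dlm-based control daemon reacting to the offline uevent; the exact sysfs path (lock_module/withdraw) follows the existing GDLM_ATTR naming and is an assumption here.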
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index 76c144b3c9bb..cbb46c2baa69 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c | |||
| @@ -270,7 +270,7 @@ static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh, | |||
| 270 | if (error) | 270 | if (error) |
| 271 | goto out_gunlock; | 271 | goto out_gunlock; |
| 272 | 272 | ||
| 273 | gfs2_trans_add_bh(ip->i_gl, bh, 1); | 273 | gfs2_trans_add_meta(ip->i_gl, bh); |
| 274 | 274 | ||
| 275 | dataptrs = GFS2_EA2DATAPTRS(ea); | 275 | dataptrs = GFS2_EA2DATAPTRS(ea); |
| 276 | for (x = 0; x < ea->ea_num_ptrs; x++, dataptrs++) { | 276 | for (x = 0; x < ea->ea_num_ptrs; x++, dataptrs++) { |
| @@ -309,7 +309,7 @@ static int ea_dealloc_unstuffed(struct gfs2_inode *ip, struct buffer_head *bh, | |||
| 309 | error = gfs2_meta_inode_buffer(ip, &dibh); | 309 | error = gfs2_meta_inode_buffer(ip, &dibh); |
| 310 | if (!error) { | 310 | if (!error) { |
| 311 | ip->i_inode.i_ctime = CURRENT_TIME; | 311 | ip->i_inode.i_ctime = CURRENT_TIME; |
| 312 | gfs2_trans_add_bh(ip->i_gl, dibh, 1); | 312 | gfs2_trans_add_meta(ip->i_gl, dibh); |
| 313 | gfs2_dinode_out(ip, dibh->b_data); | 313 | gfs2_dinode_out(ip, dibh->b_data); |
| 314 | brelse(dibh); | 314 | brelse(dibh); |
| 315 | } | 315 | } |
| @@ -509,7 +509,7 @@ static int gfs2_iter_unstuffed(struct gfs2_inode *ip, struct gfs2_ea_header *ea, | |||
| 509 | } | 509 | } |
| 510 | 510 | ||
| 511 | if (din) { | 511 | if (din) { |
| 512 | gfs2_trans_add_bh(ip->i_gl, bh[x], 1); | 512 | gfs2_trans_add_meta(ip->i_gl, bh[x]); |
| 513 | memcpy(pos, din, cp_size); | 513 | memcpy(pos, din, cp_size); |
| 514 | din += sdp->sd_jbsize; | 514 | din += sdp->sd_jbsize; |
| 515 | } | 515 | } |
| @@ -629,7 +629,7 @@ static int ea_alloc_blk(struct gfs2_inode *ip, struct buffer_head **bhp) | |||
| 629 | return error; | 629 | return error; |
| 630 | gfs2_trans_add_unrevoke(sdp, block, 1); | 630 | gfs2_trans_add_unrevoke(sdp, block, 1); |
| 631 | *bhp = gfs2_meta_new(ip->i_gl, block); | 631 | *bhp = gfs2_meta_new(ip->i_gl, block); |
| 632 | gfs2_trans_add_bh(ip->i_gl, *bhp, 1); | 632 | gfs2_trans_add_meta(ip->i_gl, *bhp); |
| 633 | gfs2_metatype_set(*bhp, GFS2_METATYPE_EA, GFS2_FORMAT_EA); | 633 | gfs2_metatype_set(*bhp, GFS2_METATYPE_EA, GFS2_FORMAT_EA); |
| 634 | gfs2_buffer_clear_tail(*bhp, sizeof(struct gfs2_meta_header)); | 634 | gfs2_buffer_clear_tail(*bhp, sizeof(struct gfs2_meta_header)); |
| 635 | 635 | ||
| @@ -691,7 +691,7 @@ static int ea_write(struct gfs2_inode *ip, struct gfs2_ea_header *ea, | |||
| 691 | return error; | 691 | return error; |
| 692 | gfs2_trans_add_unrevoke(sdp, block, 1); | 692 | gfs2_trans_add_unrevoke(sdp, block, 1); |
| 693 | bh = gfs2_meta_new(ip->i_gl, block); | 693 | bh = gfs2_meta_new(ip->i_gl, block); |
| 694 | gfs2_trans_add_bh(ip->i_gl, bh, 1); | 694 | gfs2_trans_add_meta(ip->i_gl, bh); |
| 695 | gfs2_metatype_set(bh, GFS2_METATYPE_ED, GFS2_FORMAT_ED); | 695 | gfs2_metatype_set(bh, GFS2_METATYPE_ED, GFS2_FORMAT_ED); |
| 696 | 696 | ||
| 697 | gfs2_add_inode_blocks(&ip->i_inode, 1); | 697 | gfs2_add_inode_blocks(&ip->i_inode, 1); |
| @@ -751,7 +751,7 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er, | |||
| 751 | error = gfs2_meta_inode_buffer(ip, &dibh); | 751 | error = gfs2_meta_inode_buffer(ip, &dibh); |
| 752 | if (!error) { | 752 | if (!error) { |
| 753 | ip->i_inode.i_ctime = CURRENT_TIME; | 753 | ip->i_inode.i_ctime = CURRENT_TIME; |
| 754 | gfs2_trans_add_bh(ip->i_gl, dibh, 1); | 754 | gfs2_trans_add_meta(ip->i_gl, dibh); |
| 755 | gfs2_dinode_out(ip, dibh->b_data); | 755 | gfs2_dinode_out(ip, dibh->b_data); |
| 756 | brelse(dibh); | 756 | brelse(dibh); |
| 757 | } | 757 | } |
| @@ -834,7 +834,7 @@ static void ea_set_remove_stuffed(struct gfs2_inode *ip, | |||
| 834 | struct gfs2_ea_header *prev = el->el_prev; | 834 | struct gfs2_ea_header *prev = el->el_prev; |
| 835 | u32 len; | 835 | u32 len; |
| 836 | 836 | ||
| 837 | gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1); | 837 | gfs2_trans_add_meta(ip->i_gl, el->el_bh); |
| 838 | 838 | ||
| 839 | if (!prev || !GFS2_EA_IS_STUFFED(ea)) { | 839 | if (!prev || !GFS2_EA_IS_STUFFED(ea)) { |
| 840 | ea->ea_type = GFS2_EATYPE_UNUSED; | 840 | ea->ea_type = GFS2_EATYPE_UNUSED; |
| @@ -872,7 +872,7 @@ static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh, | |||
| 872 | if (error) | 872 | if (error) |
| 873 | return error; | 873 | return error; |
| 874 | 874 | ||
| 875 | gfs2_trans_add_bh(ip->i_gl, bh, 1); | 875 | gfs2_trans_add_meta(ip->i_gl, bh); |
| 876 | 876 | ||
| 877 | if (es->ea_split) | 877 | if (es->ea_split) |
| 878 | ea = ea_split_ea(ea); | 878 | ea = ea_split_ea(ea); |
| @@ -886,7 +886,7 @@ static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh, | |||
| 886 | if (error) | 886 | if (error) |
| 887 | goto out; | 887 | goto out; |
| 888 | ip->i_inode.i_ctime = CURRENT_TIME; | 888 | ip->i_inode.i_ctime = CURRENT_TIME; |
| 889 | gfs2_trans_add_bh(ip->i_gl, dibh, 1); | 889 | gfs2_trans_add_meta(ip->i_gl, dibh); |
| 890 | gfs2_dinode_out(ip, dibh->b_data); | 890 | gfs2_dinode_out(ip, dibh->b_data); |
| 891 | brelse(dibh); | 891 | brelse(dibh); |
| 892 | out: | 892 | out: |
| @@ -901,7 +901,7 @@ static int ea_set_simple_alloc(struct gfs2_inode *ip, | |||
| 901 | struct gfs2_ea_header *ea = es->es_ea; | 901 | struct gfs2_ea_header *ea = es->es_ea; |
| 902 | int error; | 902 | int error; |
| 903 | 903 | ||
| 904 | gfs2_trans_add_bh(ip->i_gl, es->es_bh, 1); | 904 | gfs2_trans_add_meta(ip->i_gl, es->es_bh); |
| 905 | 905 | ||
| 906 | if (es->ea_split) | 906 | if (es->ea_split) |
| 907 | ea = ea_split_ea(ea); | 907 | ea = ea_split_ea(ea); |
| @@ -997,7 +997,7 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er, | |||
| 997 | goto out; | 997 | goto out; |
| 998 | } | 998 | } |
| 999 | 999 | ||
| 1000 | gfs2_trans_add_bh(ip->i_gl, indbh, 1); | 1000 | gfs2_trans_add_meta(ip->i_gl, indbh); |
| 1001 | } else { | 1001 | } else { |
| 1002 | u64 blk; | 1002 | u64 blk; |
| 1003 | unsigned int n = 1; | 1003 | unsigned int n = 1; |
| @@ -1006,7 +1006,7 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er, | |||
| 1006 | return error; | 1006 | return error; |
| 1007 | gfs2_trans_add_unrevoke(sdp, blk, 1); | 1007 | gfs2_trans_add_unrevoke(sdp, blk, 1); |
| 1008 | indbh = gfs2_meta_new(ip->i_gl, blk); | 1008 | indbh = gfs2_meta_new(ip->i_gl, blk); |
| 1009 | gfs2_trans_add_bh(ip->i_gl, indbh, 1); | 1009 | gfs2_trans_add_meta(ip->i_gl, indbh); |
| 1010 | gfs2_metatype_set(indbh, GFS2_METATYPE_IN, GFS2_FORMAT_IN); | 1010 | gfs2_metatype_set(indbh, GFS2_METATYPE_IN, GFS2_FORMAT_IN); |
| 1011 | gfs2_buffer_clear_tail(indbh, mh_size); | 1011 | gfs2_buffer_clear_tail(indbh, mh_size); |
| 1012 | 1012 | ||
| @@ -1092,7 +1092,7 @@ static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el) | |||
| 1092 | if (error) | 1092 | if (error) |
| 1093 | return error; | 1093 | return error; |
| 1094 | 1094 | ||
| 1095 | gfs2_trans_add_bh(ip->i_gl, el->el_bh, 1); | 1095 | gfs2_trans_add_meta(ip->i_gl, el->el_bh); |
| 1096 | 1096 | ||
| 1097 | if (prev) { | 1097 | if (prev) { |
| 1098 | u32 len; | 1098 | u32 len; |
| @@ -1109,7 +1109,7 @@ static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el) | |||
| 1109 | error = gfs2_meta_inode_buffer(ip, &dibh); | 1109 | error = gfs2_meta_inode_buffer(ip, &dibh); |
| 1110 | if (!error) { | 1110 | if (!error) { |
| 1111 | ip->i_inode.i_ctime = CURRENT_TIME; | 1111 | ip->i_inode.i_ctime = CURRENT_TIME; |
| 1112 | gfs2_trans_add_bh(ip->i_gl, dibh, 1); | 1112 | gfs2_trans_add_meta(ip->i_gl, dibh); |
| 1113 | gfs2_dinode_out(ip, dibh->b_data); | 1113 | gfs2_dinode_out(ip, dibh->b_data); |
| 1114 | brelse(dibh); | 1114 | brelse(dibh); |
| 1115 | } | 1115 | } |
| @@ -1265,7 +1265,7 @@ int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data) | |||
| 1265 | if (GFS2_EA_IS_STUFFED(el.el_ea)) { | 1265 | if (GFS2_EA_IS_STUFFED(el.el_ea)) { |
| 1266 | error = gfs2_trans_begin(sdp, RES_DINODE + RES_EATTR, 0); | 1266 | error = gfs2_trans_begin(sdp, RES_DINODE + RES_EATTR, 0); |
| 1267 | if (error == 0) { | 1267 | if (error == 0) { |
| 1268 | gfs2_trans_add_bh(ip->i_gl, el.el_bh, 1); | 1268 | gfs2_trans_add_meta(ip->i_gl, el.el_bh); |
| 1269 | memcpy(GFS2_EA2DATA(el.el_ea), data, | 1269 | memcpy(GFS2_EA2DATA(el.el_ea), data, |
| 1270 | GFS2_EA_DATA_LEN(el.el_ea)); | 1270 | GFS2_EA_DATA_LEN(el.el_ea)); |
| 1271 | } | 1271 | } |
| @@ -1352,7 +1352,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip) | |||
| 1352 | if (error) | 1352 | if (error) |
| 1353 | goto out_gunlock; | 1353 | goto out_gunlock; |
| 1354 | 1354 | ||
| 1355 | gfs2_trans_add_bh(ip->i_gl, indbh, 1); | 1355 | gfs2_trans_add_meta(ip->i_gl, indbh); |
| 1356 | 1356 | ||
| 1357 | eablk = (__be64 *)(indbh->b_data + sizeof(struct gfs2_meta_header)); | 1357 | eablk = (__be64 *)(indbh->b_data + sizeof(struct gfs2_meta_header)); |
| 1358 | bstart = 0; | 1358 | bstart = 0; |
| @@ -1384,7 +1384,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip) | |||
| 1384 | 1384 | ||
| 1385 | error = gfs2_meta_inode_buffer(ip, &dibh); | 1385 | error = gfs2_meta_inode_buffer(ip, &dibh); |
| 1386 | if (!error) { | 1386 | if (!error) { |
| 1387 | gfs2_trans_add_bh(ip->i_gl, dibh, 1); | 1387 | gfs2_trans_add_meta(ip->i_gl, dibh); |
| 1388 | gfs2_dinode_out(ip, dibh->b_data); | 1388 | gfs2_dinode_out(ip, dibh->b_data); |
| 1389 | brelse(dibh); | 1389 | brelse(dibh); |
| 1390 | } | 1390 | } |
| @@ -1434,7 +1434,7 @@ static int ea_dealloc_block(struct gfs2_inode *ip) | |||
| 1434 | 1434 | ||
| 1435 | error = gfs2_meta_inode_buffer(ip, &dibh); | 1435 | error = gfs2_meta_inode_buffer(ip, &dibh); |
| 1436 | if (!error) { | 1436 | if (!error) { |
| 1437 | gfs2_trans_add_bh(ip->i_gl, dibh, 1); | 1437 | gfs2_trans_add_meta(ip->i_gl, dibh); |
| 1438 | gfs2_dinode_out(ip, dibh->b_data); | 1438 | gfs2_dinode_out(ip, dibh->b_data); |
| 1439 | brelse(dibh); | 1439 | brelse(dibh); |
| 1440 | } | 1440 | } |
diff --git a/fs/hfs/Kconfig b/fs/hfs/Kconfig index b77c5bc20f8a..998e3a6decf3 100644 --- a/fs/hfs/Kconfig +++ b/fs/hfs/Kconfig | |||
| @@ -1,6 +1,6 @@ | |||
| 1 | config HFS_FS | 1 | config HFS_FS |
| 2 | tristate "Apple Macintosh file system support (EXPERIMENTAL)" | 2 | tristate "Apple Macintosh file system support" |
| 3 | depends on BLOCK && EXPERIMENTAL | 3 | depends on BLOCK |
| 4 | select NLS | 4 | select NLS |
| 5 | help | 5 | help |
| 6 | If you say Y here, you will be able to mount Macintosh-formatted | 6 | If you say Y here, you will be able to mount Macintosh-formatted |
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c index 5dc06c837105..9edeeb0ea97e 100644 --- a/fs/hpfs/inode.c +++ b/fs/hpfs/inode.c | |||
| @@ -147,7 +147,7 @@ static void hpfs_write_inode_ea(struct inode *i, struct fnode *fnode) | |||
| 147 | /*if (le32_to_cpu(fnode->acl_size_l) || le16_to_cpu(fnode->acl_size_s)) { | 147 | /*if (le32_to_cpu(fnode->acl_size_l) || le16_to_cpu(fnode->acl_size_s)) { |
| 148 | Some unknown structures like ACL may be in fnode, | 148 | Some unknown structures like ACL may be in fnode, |
| 149 | we'd better not overwrite them | 149 | we'd better not overwrite them |
| 150 | hpfs_error(i->i_sb, "fnode %08x has some unknown HPFS386 stuctures", i->i_ino); | 150 | hpfs_error(i->i_sb, "fnode %08x has some unknown HPFS386 structures", i->i_ino); |
| 151 | } else*/ if (hpfs_sb(i->i_sb)->sb_eas >= 2) { | 151 | } else*/ if (hpfs_sb(i->i_sb)->sb_eas >= 2) { |
| 152 | __le32 ea; | 152 | __le32 ea; |
| 153 | if (!uid_eq(i->i_uid, hpfs_sb(i->i_sb)->sb_uid) || hpfs_inode->i_ea_uid) { | 153 | if (!uid_eq(i->i_uid, hpfs_sb(i->i_sb)->sb_uid) || hpfs_inode->i_ea_uid) { |
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index a2862339323b..81cc7eaff863 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c | |||
| @@ -446,7 +446,8 @@ int __log_start_commit(journal_t *journal, tid_t target) | |||
| 446 | * currently running transaction (if it exists). Otherwise, | 446 | * currently running transaction (if it exists). Otherwise, |
| 447 | * the target tid must be an old one. | 447 | * the target tid must be an old one. |
| 448 | */ | 448 | */ |
| 449 | if (journal->j_running_transaction && | 449 | if (journal->j_commit_request != target && |
| 450 | journal->j_running_transaction && | ||
| 450 | journal->j_running_transaction->t_tid == target) { | 451 | journal->j_running_transaction->t_tid == target) { |
| 451 | /* | 452 | /* |
| 452 | * We want a new commit: OK, mark the request and wakeup the | 453 | * We want a new commit: OK, mark the request and wakeup the |
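Note: the added j_commit_request != target check stops __log_start_commit() from re-marking a commit request (and re-waking kjournald) when the target transaction has already been asked for; previously every racing caller with the same tid woke the commit thread again. Simplified control flow after the change (the real function also has an assertion path for old tids):

	if (journal->j_commit_request != target &&	/* not already requested */
	    journal->j_running_transaction &&
	    journal->j_running_transaction->t_tid == target) {
		journal->j_commit_request = target;
		wake_up(&journal->j_wait_commit);
		return 1;	/* a new commit was requested */
	}
	return 0;		/* already requested, or target is an old tid */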
diff --git a/fs/jffs2/Kconfig b/fs/jffs2/Kconfig index 6ae169cd8faa..d8bb6c411e96 100644 --- a/fs/jffs2/Kconfig +++ b/fs/jffs2/Kconfig | |||
| @@ -50,8 +50,8 @@ config JFFS2_FS_WBUF_VERIFY | |||
| 50 | write-buffer, and check for errors. | 50 | write-buffer, and check for errors. |
| 51 | 51 | ||
| 52 | config JFFS2_SUMMARY | 52 | config JFFS2_SUMMARY |
| 53 | bool "JFFS2 summary support (EXPERIMENTAL)" | 53 | bool "JFFS2 summary support" |
| 54 | depends on JFFS2_FS && EXPERIMENTAL | 54 | depends on JFFS2_FS |
| 55 | default n | 55 | default n |
| 56 | help | 56 | help |
| 57 | This feature makes it possible to use summary information | 57 | This feature makes it possible to use summary information |
| @@ -63,8 +63,8 @@ config JFFS2_SUMMARY | |||
| 63 | If unsure, say 'N'. | 63 | If unsure, say 'N'. |
| 64 | 64 | ||
| 65 | config JFFS2_FS_XATTR | 65 | config JFFS2_FS_XATTR |
| 66 | bool "JFFS2 XATTR support (EXPERIMENTAL)" | 66 | bool "JFFS2 XATTR support" |
| 67 | depends on JFFS2_FS && EXPERIMENTAL | 67 | depends on JFFS2_FS |
| 68 | default n | 68 | default n |
| 69 | help | 69 | help |
| 70 | Extended attributes are name:value pairs associated with inodes by | 70 | Extended attributes are name:value pairs associated with inodes by |
| @@ -173,7 +173,7 @@ config JFFS2_CMODE_PRIORITY | |||
| 173 | successful one. | 173 | successful one. |
| 174 | 174 | ||
| 175 | config JFFS2_CMODE_SIZE | 175 | config JFFS2_CMODE_SIZE |
| 176 | bool "size (EXPERIMENTAL)" | 176 | bool "size" |
| 177 | help | 177 | help |
| 178 | Tries all compressors and chooses the one which has the smallest | 178 | Tries all compressors and chooses the one which has the smallest |
| 179 | result. | 179 | result. |
diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 1a543be09c79..060ba638becb 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c | |||
| @@ -154,7 +154,7 @@ static int jfs_statfs(struct dentry *dentry, struct kstatfs *buf) | |||
| 154 | /* | 154 | /* |
| 155 | * If we really return the number of allocated & free inodes, some | 155 | * If we really return the number of allocated & free inodes, some |
| 156 | * applications will fail because they won't see enough free inodes. | 156 | * applications will fail because they won't see enough free inodes. |
| 157 | * We'll try to calculate some guess as to how may inodes we can | 157 | * We'll try to calculate some guess as to how many inodes we can |
| 158 | * really allocate | 158 | * really allocate |
| 159 | * | 159 | * |
| 160 | * buf->f_files = atomic_read(&imap->im_numinos); | 160 | * buf->f_files = atomic_read(&imap->im_numinos); |
diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index 54f9e6ce0430..52e5120bb159 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c | |||
| @@ -550,6 +550,9 @@ again: | |||
| 550 | status = nlmclnt_block(block, req, NLMCLNT_POLL_TIMEOUT); | 550 | status = nlmclnt_block(block, req, NLMCLNT_POLL_TIMEOUT); |
| 551 | if (status < 0) | 551 | if (status < 0) |
| 552 | break; | 552 | break; |
| 553 | /* Resend the blocking lock request after a server reboot */ | ||
| 554 | if (resp->status == nlm_lck_denied_grace_period) | ||
| 555 | continue; | ||
| 553 | if (resp->status != nlm_lck_blocked) | 556 | if (resp->status != nlm_lck_blocked) |
| 554 | break; | 557 | break; |
| 555 | } | 558 | } |
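The lockd hunk above changes the polling loop so that nlm_lck_denied_grace_period no longer ends it: a rebooted server answers with that status during its grace period, and the client should simply resend the blocking lock request. A minimal user-space sketch of that retry policy (the enum values and poll_blocked_lock() stub are illustrative stand-ins, not the kernel's NLM definitions):

#include <stdio.h>

/* Illustrative stand-ins for the NLM status codes used in the hunk. */
enum nlm_stat { NLM_GRANTED, NLM_BLOCKED, NLM_DENIED_GRACE_PERIOD };

/* Stub for one poll of the blocked lock; a real client issues an NLM
 * call with a timeout here. */
static enum nlm_stat poll_blocked_lock(int attempt)
{
        return attempt == 0 ? NLM_DENIED_GRACE_PERIOD : NLM_GRANTED;
}

int main(void)
{
        for (int attempt = 0; ; attempt++) {
                enum nlm_stat st = poll_blocked_lock(attempt);
                if (st == NLM_DENIED_GRACE_PERIOD) {
                        /* Server rebooted: resend the blocking lock request. */
                        printf("grace period, resending request\n");
                        continue;
                }
                if (st != NLM_BLOCKED)
                        break;  /* granted or a hard error: stop polling */
        }
        return 0;
}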
diff --git a/fs/logfs/Kconfig b/fs/logfs/Kconfig index daf9a9b32dd3..09ed066c0221 100644 --- a/fs/logfs/Kconfig +++ b/fs/logfs/Kconfig | |||
| @@ -1,6 +1,6 @@ | |||
| 1 | config LOGFS | 1 | config LOGFS |
| 2 | tristate "LogFS file system (EXPERIMENTAL)" | 2 | tristate "LogFS file system" |
| 3 | depends on (MTD || BLOCK) && EXPERIMENTAL | 3 | depends on (MTD || BLOCK) |
| 4 | select ZLIB_INFLATE | 4 | select ZLIB_INFLATE |
| 5 | select ZLIB_DEFLATE | 5 | select ZLIB_DEFLATE |
| 6 | select CRC32 | 6 | select CRC32 |
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 4fa788c93f46..434b93ec0970 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c | |||
| @@ -1273,6 +1273,7 @@ static const struct nfs_pageio_ops bl_pg_write_ops = { | |||
| 1273 | static struct pnfs_layoutdriver_type blocklayout_type = { | 1273 | static struct pnfs_layoutdriver_type blocklayout_type = { |
| 1274 | .id = LAYOUT_BLOCK_VOLUME, | 1274 | .id = LAYOUT_BLOCK_VOLUME, |
| 1275 | .name = "LAYOUT_BLOCK_VOLUME", | 1275 | .name = "LAYOUT_BLOCK_VOLUME", |
| 1276 | .owner = THIS_MODULE, | ||
| 1276 | .read_pagelist = bl_read_pagelist, | 1277 | .read_pagelist = bl_read_pagelist, |
| 1277 | .write_pagelist = bl_write_pagelist, | 1278 | .write_pagelist = bl_write_pagelist, |
| 1278 | .alloc_layout_hdr = bl_alloc_layout_hdr, | 1279 | .alloc_layout_hdr = bl_alloc_layout_hdr, |
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index c89b26bc9759..2960512792c2 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c | |||
| @@ -183,60 +183,15 @@ static u32 initiate_file_draining(struct nfs_client *clp, | |||
| 183 | static u32 initiate_bulk_draining(struct nfs_client *clp, | 183 | static u32 initiate_bulk_draining(struct nfs_client *clp, |
| 184 | struct cb_layoutrecallargs *args) | 184 | struct cb_layoutrecallargs *args) |
| 185 | { | 185 | { |
| 186 | struct nfs_server *server; | 186 | int stat; |
| 187 | struct pnfs_layout_hdr *lo; | ||
| 188 | struct inode *ino; | ||
| 189 | u32 rv = NFS4ERR_NOMATCHING_LAYOUT; | ||
| 190 | struct pnfs_layout_hdr *tmp; | ||
| 191 | LIST_HEAD(recall_list); | ||
| 192 | LIST_HEAD(free_me_list); | ||
| 193 | struct pnfs_layout_range range = { | ||
| 194 | .iomode = IOMODE_ANY, | ||
| 195 | .offset = 0, | ||
| 196 | .length = NFS4_MAX_UINT64, | ||
| 197 | }; | ||
| 198 | |||
| 199 | spin_lock(&clp->cl_lock); | ||
| 200 | rcu_read_lock(); | ||
| 201 | list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { | ||
| 202 | if ((args->cbl_recall_type == RETURN_FSID) && | ||
| 203 | memcmp(&server->fsid, &args->cbl_fsid, | ||
| 204 | sizeof(struct nfs_fsid))) | ||
| 205 | continue; | ||
| 206 | |||
| 207 | list_for_each_entry(lo, &server->layouts, plh_layouts) { | ||
| 208 | ino = igrab(lo->plh_inode); | ||
| 209 | if (ino) | ||
| 210 | continue; | ||
| 211 | spin_lock(&ino->i_lock); | ||
| 212 | /* Is this layout in the process of being freed? */ | ||
| 213 | if (NFS_I(ino)->layout != lo) { | ||
| 214 | spin_unlock(&ino->i_lock); | ||
| 215 | iput(ino); | ||
| 216 | continue; | ||
| 217 | } | ||
| 218 | pnfs_get_layout_hdr(lo); | ||
| 219 | spin_unlock(&ino->i_lock); | ||
| 220 | list_add(&lo->plh_bulk_recall, &recall_list); | ||
| 221 | } | ||
| 222 | } | ||
| 223 | rcu_read_unlock(); | ||
| 224 | spin_unlock(&clp->cl_lock); | ||
| 225 | 187 | ||
| 226 | list_for_each_entry_safe(lo, tmp, | 188 | if (args->cbl_recall_type == RETURN_FSID) |
| 227 | &recall_list, plh_bulk_recall) { | 189 | stat = pnfs_destroy_layouts_byfsid(clp, &args->cbl_fsid, true); |
| 228 | ino = lo->plh_inode; | 190 | else |
| 229 | spin_lock(&ino->i_lock); | 191 | stat = pnfs_destroy_layouts_byclid(clp, true); |
| 230 | set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); | 192 | if (stat != 0) |
| 231 | if (pnfs_mark_matching_lsegs_invalid(lo, &free_me_list, &range)) | 193 | return NFS4ERR_DELAY; |
| 232 | rv = NFS4ERR_DELAY; | 194 | return NFS4ERR_NOMATCHING_LAYOUT; |
| 233 | list_del_init(&lo->plh_bulk_recall); | ||
| 234 | spin_unlock(&ino->i_lock); | ||
| 235 | pnfs_free_lseg_list(&free_me_list); | ||
| 236 | pnfs_put_layout_hdr(lo); | ||
| 237 | iput(ino); | ||
| 238 | } | ||
| 239 | return rv; | ||
| 240 | } | 195 | } |
| 241 | 196 | ||
| 242 | static u32 do_callback_layoutrecall(struct nfs_client *clp, | 197 | static u32 do_callback_layoutrecall(struct nfs_client *clp, |
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 81c5eec3cf38..6390a4b5fee7 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c | |||
| @@ -55,7 +55,8 @@ int nfs4_have_delegation(struct inode *inode, fmode_t flags) | |||
| 55 | flags &= FMODE_READ|FMODE_WRITE; | 55 | flags &= FMODE_READ|FMODE_WRITE; |
| 56 | rcu_read_lock(); | 56 | rcu_read_lock(); |
| 57 | delegation = rcu_dereference(NFS_I(inode)->delegation); | 57 | delegation = rcu_dereference(NFS_I(inode)->delegation); |
| 58 | if (delegation != NULL && (delegation->type & flags) == flags) { | 58 | if (delegation != NULL && (delegation->type & flags) == flags && |
| 59 | !test_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) { | ||
| 59 | nfs_mark_delegation_referenced(delegation); | 60 | nfs_mark_delegation_referenced(delegation); |
| 60 | ret = 1; | 61 | ret = 1; |
| 61 | } | 62 | } |
| @@ -70,8 +71,10 @@ static int nfs_delegation_claim_locks(struct nfs_open_context *ctx, struct nfs4_ | |||
| 70 | int status = 0; | 71 | int status = 0; |
| 71 | 72 | ||
| 72 | if (inode->i_flock == NULL) | 73 | if (inode->i_flock == NULL) |
| 73 | goto out; | 74 | return 0; |
| 74 | 75 | ||
| 76 | if (inode->i_flock == NULL) | ||
| 77 | goto out; | ||
| 75 | /* Protect inode->i_flock using the file locks lock */ | 78 | /* Protect inode->i_flock using the file locks lock */ |
| 76 | lock_flocks(); | 79 | lock_flocks(); |
| 77 | for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { | 80 | for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { |
| @@ -94,7 +97,9 @@ static int nfs_delegation_claim_opens(struct inode *inode, const nfs4_stateid *s | |||
| 94 | { | 97 | { |
| 95 | struct nfs_inode *nfsi = NFS_I(inode); | 98 | struct nfs_inode *nfsi = NFS_I(inode); |
| 96 | struct nfs_open_context *ctx; | 99 | struct nfs_open_context *ctx; |
| 100 | struct nfs4_state_owner *sp; | ||
| 97 | struct nfs4_state *state; | 101 | struct nfs4_state *state; |
| 102 | unsigned int seq; | ||
| 98 | int err; | 103 | int err; |
| 99 | 104 | ||
| 100 | again: | 105 | again: |
| @@ -109,9 +114,16 @@ again: | |||
| 109 | continue; | 114 | continue; |
| 110 | get_nfs_open_context(ctx); | 115 | get_nfs_open_context(ctx); |
| 111 | spin_unlock(&inode->i_lock); | 116 | spin_unlock(&inode->i_lock); |
| 117 | sp = state->owner; | ||
| 118 | /* Block nfs4_proc_unlck */ | ||
| 119 | mutex_lock(&sp->so_delegreturn_mutex); | ||
| 120 | seq = raw_seqcount_begin(&sp->so_reclaim_seqcount); | ||
| 112 | err = nfs4_open_delegation_recall(ctx, state, stateid); | 121 | err = nfs4_open_delegation_recall(ctx, state, stateid); |
| 113 | if (err >= 0) | 122 | if (!err) |
| 114 | err = nfs_delegation_claim_locks(ctx, state); | 123 | err = nfs_delegation_claim_locks(ctx, state); |
| 124 | if (!err && read_seqcount_retry(&sp->so_reclaim_seqcount, seq)) | ||
| 125 | err = -EAGAIN; | ||
| 126 | mutex_unlock(&sp->so_delegreturn_mutex); | ||
| 115 | put_nfs_open_context(ctx); | 127 | put_nfs_open_context(ctx); |
| 116 | if (err != 0) | 128 | if (err != 0) |
| 117 | return err; | 129 | return err; |
| @@ -182,39 +194,91 @@ static struct inode *nfs_delegation_grab_inode(struct nfs_delegation *delegation | |||
| 182 | } | 194 | } |
| 183 | 195 | ||
| 184 | static struct nfs_delegation * | 196 | static struct nfs_delegation * |
| 197 | nfs_start_delegation_return_locked(struct nfs_inode *nfsi) | ||
| 198 | { | ||
| 199 | struct nfs_delegation *ret = NULL; | ||
| 200 | struct nfs_delegation *delegation = rcu_dereference(nfsi->delegation); | ||
| 201 | |||
| 202 | if (delegation == NULL) | ||
| 203 | goto out; | ||
| 204 | spin_lock(&delegation->lock); | ||
| 205 | if (!test_and_set_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) | ||
| 206 | ret = delegation; | ||
| 207 | spin_unlock(&delegation->lock); | ||
| 208 | out: | ||
| 209 | return ret; | ||
| 210 | } | ||
| 211 | |||
| 212 | static struct nfs_delegation * | ||
| 213 | nfs_start_delegation_return(struct nfs_inode *nfsi) | ||
| 214 | { | ||
| 215 | struct nfs_delegation *delegation; | ||
| 216 | |||
| 217 | rcu_read_lock(); | ||
| 218 | delegation = nfs_start_delegation_return_locked(nfsi); | ||
| 219 | rcu_read_unlock(); | ||
| 220 | return delegation; | ||
| 221 | } | ||
| 222 | |||
| 223 | static void | ||
| 224 | nfs_abort_delegation_return(struct nfs_delegation *delegation, | ||
| 225 | struct nfs_client *clp) | ||
| 226 | { | ||
| 227 | |||
| 228 | spin_lock(&delegation->lock); | ||
| 229 | clear_bit(NFS_DELEGATION_RETURNING, &delegation->flags); | ||
| 230 | set_bit(NFS_DELEGATION_RETURN, &delegation->flags); | ||
| 231 | spin_unlock(&delegation->lock); | ||
| 232 | set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state); | ||
| 233 | } | ||
| 234 | |||
| 235 | static struct nfs_delegation * | ||
| 185 | nfs_detach_delegation_locked(struct nfs_inode *nfsi, | 236 | nfs_detach_delegation_locked(struct nfs_inode *nfsi, |
| 186 | struct nfs_server *server) | 237 | struct nfs_delegation *delegation, |
| 238 | struct nfs_client *clp) | ||
| 187 | { | 239 | { |
| 188 | struct nfs_delegation *delegation = | 240 | struct nfs_delegation *deleg_cur = |
| 189 | rcu_dereference_protected(nfsi->delegation, | 241 | rcu_dereference_protected(nfsi->delegation, |
| 190 | lockdep_is_held(&server->nfs_client->cl_lock)); | 242 | lockdep_is_held(&clp->cl_lock)); |
| 191 | 243 | ||
| 192 | if (delegation == NULL) | 244 | if (deleg_cur == NULL || delegation != deleg_cur) |
| 193 | goto nomatch; | 245 | return NULL; |
| 194 | 246 | ||
| 195 | spin_lock(&delegation->lock); | 247 | spin_lock(&delegation->lock); |
| 248 | set_bit(NFS_DELEGATION_RETURNING, &delegation->flags); | ||
| 196 | list_del_rcu(&delegation->super_list); | 249 | list_del_rcu(&delegation->super_list); |
| 197 | delegation->inode = NULL; | 250 | delegation->inode = NULL; |
| 198 | nfsi->delegation_state = 0; | 251 | nfsi->delegation_state = 0; |
| 199 | rcu_assign_pointer(nfsi->delegation, NULL); | 252 | rcu_assign_pointer(nfsi->delegation, NULL); |
| 200 | spin_unlock(&delegation->lock); | 253 | spin_unlock(&delegation->lock); |
| 201 | return delegation; | 254 | return delegation; |
| 202 | nomatch: | ||
| 203 | return NULL; | ||
| 204 | } | 255 | } |
| 205 | 256 | ||
| 206 | static struct nfs_delegation *nfs_detach_delegation(struct nfs_inode *nfsi, | 257 | static struct nfs_delegation *nfs_detach_delegation(struct nfs_inode *nfsi, |
| 207 | struct nfs_server *server) | 258 | struct nfs_delegation *delegation, |
| 259 | struct nfs_server *server) | ||
| 208 | { | 260 | { |
| 209 | struct nfs_client *clp = server->nfs_client; | 261 | struct nfs_client *clp = server->nfs_client; |
| 210 | struct nfs_delegation *delegation; | ||
| 211 | 262 | ||
| 212 | spin_lock(&clp->cl_lock); | 263 | spin_lock(&clp->cl_lock); |
| 213 | delegation = nfs_detach_delegation_locked(nfsi, server); | 264 | delegation = nfs_detach_delegation_locked(nfsi, delegation, clp); |
| 214 | spin_unlock(&clp->cl_lock); | 265 | spin_unlock(&clp->cl_lock); |
| 215 | return delegation; | 266 | return delegation; |
| 216 | } | 267 | } |
| 217 | 268 | ||
| 269 | static struct nfs_delegation * | ||
| 270 | nfs_inode_detach_delegation(struct inode *inode) | ||
| 271 | { | ||
| 272 | struct nfs_inode *nfsi = NFS_I(inode); | ||
| 273 | struct nfs_server *server = NFS_SERVER(inode); | ||
| 274 | struct nfs_delegation *delegation; | ||
| 275 | |||
| 276 | delegation = nfs_start_delegation_return(nfsi); | ||
| 277 | if (delegation == NULL) | ||
| 278 | return NULL; | ||
| 279 | return nfs_detach_delegation(nfsi, delegation, server); | ||
| 280 | } | ||
| 281 | |||
| 218 | /** | 282 | /** |
| 219 | * nfs_inode_set_delegation - set up a delegation on an inode | 283 | * nfs_inode_set_delegation - set up a delegation on an inode |
| 220 | * @inode: inode to which delegation applies | 284 | * @inode: inode to which delegation applies |
| @@ -268,7 +332,10 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct | |||
| 268 | delegation = NULL; | 332 | delegation = NULL; |
| 269 | goto out; | 333 | goto out; |
| 270 | } | 334 | } |
| 271 | freeme = nfs_detach_delegation_locked(nfsi, server); | 335 | freeme = nfs_detach_delegation_locked(nfsi, |
| 336 | old_delegation, clp); | ||
| 337 | if (freeme == NULL) | ||
| 338 | goto out; | ||
| 272 | } | 339 | } |
| 273 | list_add_rcu(&delegation->super_list, &server->delegations); | 340 | list_add_rcu(&delegation->super_list, &server->delegations); |
| 274 | nfsi->delegation_state = delegation->type; | 341 | nfsi->delegation_state = delegation->type; |
| @@ -292,19 +359,29 @@ out: | |||
| 292 | /* | 359 | /* |
| 293 | * Basic procedure for returning a delegation to the server | 360 | * Basic procedure for returning a delegation to the server |
| 294 | */ | 361 | */ |
| 295 | static int __nfs_inode_return_delegation(struct inode *inode, struct nfs_delegation *delegation, int issync) | 362 | static int nfs_end_delegation_return(struct inode *inode, struct nfs_delegation *delegation, int issync) |
| 296 | { | 363 | { |
| 364 | struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; | ||
| 297 | struct nfs_inode *nfsi = NFS_I(inode); | 365 | struct nfs_inode *nfsi = NFS_I(inode); |
| 298 | int err; | 366 | int err; |
| 299 | 367 | ||
| 300 | /* | 368 | if (delegation == NULL) |
| 301 | * Guard against new delegated open/lock/unlock calls and against | 369 | return 0; |
| 302 | * state recovery | 370 | do { |
| 303 | */ | 371 | err = nfs_delegation_claim_opens(inode, &delegation->stateid); |
| 304 | down_write(&nfsi->rwsem); | 372 | if (!issync || err != -EAGAIN) |
| 305 | err = nfs_delegation_claim_opens(inode, &delegation->stateid); | 373 | break; |
| 306 | up_write(&nfsi->rwsem); | 374 | /* |
| 307 | if (err) | 375 | * Guard against state recovery |
| 376 | */ | ||
| 377 | err = nfs4_wait_clnt_recover(clp); | ||
| 378 | } while (err == 0); | ||
| 379 | |||
| 380 | if (err) { | ||
| 381 | nfs_abort_delegation_return(delegation, clp); | ||
| 382 | goto out; | ||
| 383 | } | ||
| 384 | if (!nfs_detach_delegation(nfsi, delegation, NFS_SERVER(inode))) | ||
| 308 | goto out; | 385 | goto out; |
| 309 | 386 | ||
| 310 | err = nfs_do_return_delegation(inode, delegation, issync); | 387 | err = nfs_do_return_delegation(inode, delegation, issync); |
| @@ -340,13 +417,10 @@ restart: | |||
| 340 | inode = nfs_delegation_grab_inode(delegation); | 417 | inode = nfs_delegation_grab_inode(delegation); |
| 341 | if (inode == NULL) | 418 | if (inode == NULL) |
| 342 | continue; | 419 | continue; |
| 343 | delegation = nfs_detach_delegation(NFS_I(inode), | 420 | delegation = nfs_start_delegation_return_locked(NFS_I(inode)); |
| 344 | server); | ||
| 345 | rcu_read_unlock(); | 421 | rcu_read_unlock(); |
| 346 | 422 | ||
| 347 | if (delegation != NULL) | 423 | err = nfs_end_delegation_return(inode, delegation, 0); |
| 348 | err = __nfs_inode_return_delegation(inode, | ||
| 349 | delegation, 0); | ||
| 350 | iput(inode); | 424 | iput(inode); |
| 351 | if (!err) | 425 | if (!err) |
| 352 | goto restart; | 426 | goto restart; |
| @@ -367,15 +441,11 @@ restart: | |||
| 367 | */ | 441 | */ |
| 368 | void nfs_inode_return_delegation_noreclaim(struct inode *inode) | 442 | void nfs_inode_return_delegation_noreclaim(struct inode *inode) |
| 369 | { | 443 | { |
| 370 | struct nfs_server *server = NFS_SERVER(inode); | ||
| 371 | struct nfs_inode *nfsi = NFS_I(inode); | ||
| 372 | struct nfs_delegation *delegation; | 444 | struct nfs_delegation *delegation; |
| 373 | 445 | ||
| 374 | if (rcu_access_pointer(nfsi->delegation) != NULL) { | 446 | delegation = nfs_inode_detach_delegation(inode); |
| 375 | delegation = nfs_detach_delegation(nfsi, server); | 447 | if (delegation != NULL) |
| 376 | if (delegation != NULL) | 448 | nfs_do_return_delegation(inode, delegation, 0); |
| 377 | nfs_do_return_delegation(inode, delegation, 0); | ||
| 378 | } | ||
| 379 | } | 449 | } |
| 380 | 450 | ||
| 381 | /** | 451 | /** |
| @@ -390,18 +460,14 @@ void nfs_inode_return_delegation_noreclaim(struct inode *inode) | |||
| 390 | */ | 460 | */ |
| 391 | int nfs4_inode_return_delegation(struct inode *inode) | 461 | int nfs4_inode_return_delegation(struct inode *inode) |
| 392 | { | 462 | { |
| 393 | struct nfs_server *server = NFS_SERVER(inode); | ||
| 394 | struct nfs_inode *nfsi = NFS_I(inode); | 463 | struct nfs_inode *nfsi = NFS_I(inode); |
| 395 | struct nfs_delegation *delegation; | 464 | struct nfs_delegation *delegation; |
| 396 | int err = 0; | 465 | int err = 0; |
| 397 | 466 | ||
| 398 | nfs_wb_all(inode); | 467 | nfs_wb_all(inode); |
| 399 | if (rcu_access_pointer(nfsi->delegation) != NULL) { | 468 | delegation = nfs_start_delegation_return(nfsi); |
| 400 | delegation = nfs_detach_delegation(nfsi, server); | 469 | if (delegation != NULL) |
| 401 | if (delegation != NULL) { | 470 | err = nfs_end_delegation_return(inode, delegation, 1); |
| 402 | err = __nfs_inode_return_delegation(inode, delegation, 1); | ||
| 403 | } | ||
| 404 | } | ||
| 405 | return err; | 471 | return err; |
| 406 | } | 472 | } |
| 407 | 473 | ||
| @@ -471,7 +537,7 @@ void nfs_remove_bad_delegation(struct inode *inode) | |||
| 471 | { | 537 | { |
| 472 | struct nfs_delegation *delegation; | 538 | struct nfs_delegation *delegation; |
| 473 | 539 | ||
| 474 | delegation = nfs_detach_delegation(NFS_I(inode), NFS_SERVER(inode)); | 540 | delegation = nfs_inode_detach_delegation(inode); |
| 475 | if (delegation) { | 541 | if (delegation) { |
| 476 | nfs_inode_find_state_and_recover(inode, &delegation->stateid); | 542 | nfs_inode_find_state_and_recover(inode, &delegation->stateid); |
| 477 | nfs_free_delegation(delegation); | 543 | nfs_free_delegation(delegation); |
| @@ -649,7 +715,7 @@ restart: | |||
| 649 | if (inode == NULL) | 715 | if (inode == NULL) |
| 650 | continue; | 716 | continue; |
| 651 | delegation = nfs_detach_delegation(NFS_I(inode), | 717 | delegation = nfs_detach_delegation(NFS_I(inode), |
| 652 | server); | 718 | delegation, server); |
| 653 | rcu_read_unlock(); | 719 | rcu_read_unlock(); |
| 654 | 720 | ||
| 655 | if (delegation != NULL) | 721 | if (delegation != NULL) |
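The delegation.c rework above funnels every return through nfs_start_delegation_return_locked(), which claims the delegation by setting NFS_DELEGATION_RETURNING before anything is detached; a second caller sees the bit already set and backs off. A small, single-threaded sketch of that claim-by-flag idiom (the struct and flag names here are illustrative; the kernel uses test_and_set_bit() under delegation->lock):

#include <stdio.h>
#include <stdbool.h>

#define FLAG_RETURNING (1u << 0)        /* illustrative bit, not the kernel value */

struct delegation { unsigned long flags; };

/* Whoever first sets the RETURNING bit owns the delegation return. */
static bool start_return(struct delegation *d)
{
        if (d->flags & FLAG_RETURNING)
                return false;           /* someone else is already returning it */
        d->flags |= FLAG_RETURNING;     /* claim it */
        return true;
}

int main(void)
{
        struct delegation d = { 0 };

        printf("first caller claims return:  %s\n", start_return(&d) ? "yes" : "no");
        printf("second caller claims return: %s\n", start_return(&d) ? "yes" : "no");
        return 0;
}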
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h index bbc6a4dba0d8..d54d4fca6793 100644 --- a/fs/nfs/delegation.h +++ b/fs/nfs/delegation.h | |||
| @@ -29,6 +29,7 @@ enum { | |||
| 29 | NFS_DELEGATION_NEED_RECLAIM = 0, | 29 | NFS_DELEGATION_NEED_RECLAIM = 0, |
| 30 | NFS_DELEGATION_RETURN, | 30 | NFS_DELEGATION_RETURN, |
| 31 | NFS_DELEGATION_REFERENCED, | 31 | NFS_DELEGATION_REFERENCED, |
| 32 | NFS_DELEGATION_RETURNING, | ||
| 32 | }; | 33 | }; |
| 33 | 34 | ||
| 34 | int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); | 35 | int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); |
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 32e6c53520e2..1b2d7eb93796 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c | |||
| @@ -2153,12 +2153,16 @@ static int nfs_open_permission_mask(int openflags) | |||
| 2153 | { | 2153 | { |
| 2154 | int mask = 0; | 2154 | int mask = 0; |
| 2155 | 2155 | ||
| 2156 | if ((openflags & O_ACCMODE) != O_WRONLY) | 2156 | if (openflags & __FMODE_EXEC) { |
| 2157 | mask |= MAY_READ; | 2157 | /* ONLY check exec rights */ |
| 2158 | if ((openflags & O_ACCMODE) != O_RDONLY) | 2158 | mask = MAY_EXEC; |
| 2159 | mask |= MAY_WRITE; | 2159 | } else { |
| 2160 | if (openflags & __FMODE_EXEC) | 2160 | if ((openflags & O_ACCMODE) != O_WRONLY) |
| 2161 | mask |= MAY_EXEC; | 2161 | mask |= MAY_READ; |
| 2162 | if ((openflags & O_ACCMODE) != O_RDONLY) | ||
| 2163 | mask |= MAY_WRITE; | ||
| 2164 | } | ||
| 2165 | |||
| 2162 | return mask; | 2166 | return mask; |
| 2163 | } | 2167 | } |
| 2164 | 2168 | ||
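The dir.c hunk above makes an open-for-execute check only MAY_EXEC instead of also requiring read and write bits. A standalone restatement of the new mask logic, runnable in user space (the MAY_* values and FMODE_EXEC_FLAG below are illustrative stand-ins for the kernel-internal definitions):

#include <fcntl.h>
#include <stdio.h>

#define MAY_EXEC        0x01
#define MAY_WRITE       0x02
#define MAY_READ        0x04
#define FMODE_EXEC_FLAG 0x20    /* stands in for __FMODE_EXEC */

static int open_permission_mask(int openflags)
{
        int mask = 0;

        if (openflags & FMODE_EXEC_FLAG)
                return MAY_EXEC;        /* exec-only open: check exec rights only */
        if ((openflags & O_ACCMODE) != O_WRONLY)
                mask |= MAY_READ;
        if ((openflags & O_ACCMODE) != O_RDONLY)
                mask |= MAY_WRITE;
        return mask;
}

int main(void)
{
        printf("O_RDONLY  -> %#x\n", open_permission_mask(O_RDONLY));
        printf("O_WRONLY  -> %#x\n", open_permission_mask(O_WRONLY));
        printf("O_RDWR    -> %#x\n", open_permission_mask(O_RDWR));
        printf("exec-only -> %#x\n", open_permission_mask(FMODE_EXEC_FLAG));
        return 0;
}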
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c index 033803c36644..44efaa8c5f78 100644 --- a/fs/nfs/getroot.c +++ b/fs/nfs/getroot.c | |||
| @@ -126,8 +126,7 @@ struct dentry *nfs_get_root(struct super_block *sb, struct nfs_fh *mntfh, | |||
| 126 | } | 126 | } |
| 127 | spin_unlock(&ret->d_lock); | 127 | spin_unlock(&ret->d_lock); |
| 128 | out: | 128 | out: |
| 129 | if (name) | 129 | kfree(name); |
| 130 | kfree(name); | ||
| 131 | nfs_free_fattr(fsinfo.fattr); | 130 | nfs_free_fattr(fsinfo.fattr); |
| 132 | return ret; | 131 | return ret; |
| 133 | } | 132 | } |
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index ebeb94ce1b0b..6acc73c80d7f 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c | |||
| @@ -694,10 +694,7 @@ static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync) | |||
| 694 | if (ctx->cred != NULL) | 694 | if (ctx->cred != NULL) |
| 695 | put_rpccred(ctx->cred); | 695 | put_rpccred(ctx->cred); |
| 696 | dput(ctx->dentry); | 696 | dput(ctx->dentry); |
| 697 | if (is_sync) | 697 | nfs_sb_deactive(sb); |
| 698 | nfs_sb_deactive(sb); | ||
| 699 | else | ||
| 700 | nfs_sb_deactive_async(sb); | ||
| 701 | kfree(ctx->mdsthreshold); | 698 | kfree(ctx->mdsthreshold); |
| 702 | kfree(ctx); | 699 | kfree(ctx); |
| 703 | } | 700 | } |
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index f0e6c7df1a07..541c9ebdbc5a 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h | |||
| @@ -329,7 +329,6 @@ extern int __init register_nfs_fs(void); | |||
| 329 | extern void __exit unregister_nfs_fs(void); | 329 | extern void __exit unregister_nfs_fs(void); |
| 330 | extern void nfs_sb_active(struct super_block *sb); | 330 | extern void nfs_sb_active(struct super_block *sb); |
| 331 | extern void nfs_sb_deactive(struct super_block *sb); | 331 | extern void nfs_sb_deactive(struct super_block *sb); |
| 332 | extern void nfs_sb_deactive_async(struct super_block *sb); | ||
| 333 | 332 | ||
| 334 | /* namespace.c */ | 333 | /* namespace.c */ |
| 335 | #define NFS_PATH_CANONICAL 1 | 334 | #define NFS_PATH_CANONICAL 1 |
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index dd057bc6b65b..fc8dc20fdeb9 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c | |||
| @@ -177,11 +177,31 @@ out_nofree: | |||
| 177 | return mnt; | 177 | return mnt; |
| 178 | } | 178 | } |
| 179 | 179 | ||
| 180 | static int | ||
| 181 | nfs_namespace_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) | ||
| 182 | { | ||
| 183 | if (NFS_FH(dentry->d_inode)->size != 0) | ||
| 184 | return nfs_getattr(mnt, dentry, stat); | ||
| 185 | generic_fillattr(dentry->d_inode, stat); | ||
| 186 | return 0; | ||
| 187 | } | ||
| 188 | |||
| 189 | static int | ||
| 190 | nfs_namespace_setattr(struct dentry *dentry, struct iattr *attr) | ||
| 191 | { | ||
| 192 | if (NFS_FH(dentry->d_inode)->size != 0) | ||
| 193 | return nfs_setattr(dentry, attr); | ||
| 194 | return -EACCES; | ||
| 195 | } | ||
| 196 | |||
| 180 | const struct inode_operations nfs_mountpoint_inode_operations = { | 197 | const struct inode_operations nfs_mountpoint_inode_operations = { |
| 181 | .getattr = nfs_getattr, | 198 | .getattr = nfs_getattr, |
| 199 | .setattr = nfs_setattr, | ||
| 182 | }; | 200 | }; |
| 183 | 201 | ||
| 184 | const struct inode_operations nfs_referral_inode_operations = { | 202 | const struct inode_operations nfs_referral_inode_operations = { |
| 203 | .getattr = nfs_namespace_getattr, | ||
| 204 | .setattr = nfs_namespace_setattr, | ||
| 185 | }; | 205 | }; |
| 186 | 206 | ||
| 187 | static void nfs_expire_automounts(struct work_struct *work) | 207 | static void nfs_expire_automounts(struct work_struct *work) |
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index a3f488b074a2..944c9a5c1039 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h | |||
| @@ -13,6 +13,8 @@ | |||
| 13 | 13 | ||
| 14 | #define NFS4_MAX_LOOP_ON_RECOVER (10) | 14 | #define NFS4_MAX_LOOP_ON_RECOVER (10) |
| 15 | 15 | ||
| 16 | #include <linux/seqlock.h> | ||
| 17 | |||
| 16 | struct idmap; | 18 | struct idmap; |
| 17 | 19 | ||
| 18 | enum nfs4_client_state { | 20 | enum nfs4_client_state { |
| @@ -90,6 +92,8 @@ struct nfs4_state_owner { | |||
| 90 | unsigned long so_flags; | 92 | unsigned long so_flags; |
| 91 | struct list_head so_states; | 93 | struct list_head so_states; |
| 92 | struct nfs_seqid_counter so_seqid; | 94 | struct nfs_seqid_counter so_seqid; |
| 95 | seqcount_t so_reclaim_seqcount; | ||
| 96 | struct mutex so_delegreturn_mutex; | ||
| 93 | }; | 97 | }; |
| 94 | 98 | ||
| 95 | enum { | 99 | enum { |
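The two fields added to nfs4_state_owner above pair a mutex (so unlock and delegation return exclude each other) with a seqcount that the open, lock and delegation-recall paths sample and re-check, as the later nfs4proc.c and nfs4state.c hunks show. A single-threaded model of that retry protocol, assuming nothing beyond what the hunks use (the kernel itself relies on raw_seqcount_begin()/read_seqcount_retry() with proper memory barriers, not this toy counter):

#include <stdio.h>

static unsigned seq;

static void reclaim_begin(void)     { ++seq; }  /* odd: reclaim in progress */
static void reclaim_end(void)       { ++seq; }  /* even: reclaim finished   */

static unsigned reader_begin(void)  { return seq; }
static int reader_retry(unsigned s) { return (s & 1) || s != seq; }

int main(void)
{
        unsigned s = reader_begin();

        /* ...an open/lock call would run here... */
        reclaim_begin();
        reclaim_end();          /* a state reclaim ran concurrently */

        if (reader_retry(s))
                puts("state may have been reclaimed: retry or redrive recovery");
        return 0;
}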
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index acc347268124..2e9779b58b7a 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c | |||
| @@ -236,11 +236,10 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp, | |||
| 236 | error = nfs4_discover_server_trunking(clp, &old); | 236 | error = nfs4_discover_server_trunking(clp, &old); |
| 237 | if (error < 0) | 237 | if (error < 0) |
| 238 | goto error; | 238 | goto error; |
| 239 | nfs_put_client(clp); | ||
| 239 | if (clp != old) { | 240 | if (clp != old) { |
| 240 | clp->cl_preserve_clid = true; | 241 | clp->cl_preserve_clid = true; |
| 241 | nfs_put_client(clp); | ||
| 242 | clp = old; | 242 | clp = old; |
| 243 | atomic_inc(&clp->cl_count); | ||
| 244 | } | 243 | } |
| 245 | 244 | ||
| 246 | return clp; | 245 | return clp; |
| @@ -306,7 +305,7 @@ int nfs40_walk_client_list(struct nfs_client *new, | |||
| 306 | .clientid = new->cl_clientid, | 305 | .clientid = new->cl_clientid, |
| 307 | .confirm = new->cl_confirm, | 306 | .confirm = new->cl_confirm, |
| 308 | }; | 307 | }; |
| 309 | int status; | 308 | int status = -NFS4ERR_STALE_CLIENTID; |
| 310 | 309 | ||
| 311 | spin_lock(&nn->nfs_client_lock); | 310 | spin_lock(&nn->nfs_client_lock); |
| 312 | list_for_each_entry_safe(pos, n, &nn->nfs_client_list, cl_share_link) { | 311 | list_for_each_entry_safe(pos, n, &nn->nfs_client_list, cl_share_link) { |
| @@ -332,40 +331,33 @@ int nfs40_walk_client_list(struct nfs_client *new, | |||
| 332 | 331 | ||
| 333 | if (prev) | 332 | if (prev) |
| 334 | nfs_put_client(prev); | 333 | nfs_put_client(prev); |
| 334 | prev = pos; | ||
| 335 | 335 | ||
| 336 | status = nfs4_proc_setclientid_confirm(pos, &clid, cred); | 336 | status = nfs4_proc_setclientid_confirm(pos, &clid, cred); |
| 337 | if (status == 0) { | 337 | switch (status) { |
| 338 | case -NFS4ERR_STALE_CLIENTID: | ||
| 339 | break; | ||
| 340 | case 0: | ||
| 338 | nfs4_swap_callback_idents(pos, new); | 341 | nfs4_swap_callback_idents(pos, new); |
| 339 | 342 | ||
| 340 | nfs_put_client(pos); | 343 | prev = NULL; |
| 341 | *result = pos; | 344 | *result = pos; |
| 342 | dprintk("NFS: <-- %s using nfs_client = %p ({%d})\n", | 345 | dprintk("NFS: <-- %s using nfs_client = %p ({%d})\n", |
| 343 | __func__, pos, atomic_read(&pos->cl_count)); | 346 | __func__, pos, atomic_read(&pos->cl_count)); |
| 344 | return 0; | 347 | default: |
| 345 | } | 348 | goto out; |
| 346 | if (status != -NFS4ERR_STALE_CLIENTID) { | ||
| 347 | nfs_put_client(pos); | ||
| 348 | dprintk("NFS: <-- %s status = %d, no result\n", | ||
| 349 | __func__, status); | ||
| 350 | return status; | ||
| 351 | } | 349 | } |
| 352 | 350 | ||
| 353 | spin_lock(&nn->nfs_client_lock); | 351 | spin_lock(&nn->nfs_client_lock); |
| 354 | prev = pos; | ||
| 355 | } | 352 | } |
| 353 | spin_unlock(&nn->nfs_client_lock); | ||
| 356 | 354 | ||
| 357 | /* | 355 | /* No match found. The server lost our clientid */ |
| 358 | * No matching nfs_client found. This should be impossible, | 356 | out: |
| 359 | * because the new nfs_client has already been added to | ||
| 360 | * nfs_client_list by nfs_get_client(). | ||
| 361 | * | ||
| 362 | * Don't BUG(), since the caller is holding a mutex. | ||
| 363 | */ | ||
| 364 | if (prev) | 357 | if (prev) |
| 365 | nfs_put_client(prev); | 358 | nfs_put_client(prev); |
| 366 | spin_unlock(&nn->nfs_client_lock); | 359 | dprintk("NFS: <-- %s status = %d\n", __func__, status); |
| 367 | pr_err("NFS: %s Error: no matching nfs_client found\n", __func__); | 360 | return status; |
| 368 | return -NFS4ERR_STALE_CLIENTID; | ||
| 369 | } | 361 | } |
| 370 | 362 | ||
| 371 | #ifdef CONFIG_NFS_V4_1 | 363 | #ifdef CONFIG_NFS_V4_1 |
| @@ -432,7 +424,7 @@ int nfs41_walk_client_list(struct nfs_client *new, | |||
| 432 | { | 424 | { |
| 433 | struct nfs_net *nn = net_generic(new->cl_net, nfs_net_id); | 425 | struct nfs_net *nn = net_generic(new->cl_net, nfs_net_id); |
| 434 | struct nfs_client *pos, *n, *prev = NULL; | 426 | struct nfs_client *pos, *n, *prev = NULL; |
| 435 | int error; | 427 | int status = -NFS4ERR_STALE_CLIENTID; |
| 436 | 428 | ||
| 437 | spin_lock(&nn->nfs_client_lock); | 429 | spin_lock(&nn->nfs_client_lock); |
| 438 | list_for_each_entry_safe(pos, n, &nn->nfs_client_list, cl_share_link) { | 430 | list_for_each_entry_safe(pos, n, &nn->nfs_client_list, cl_share_link) { |
| @@ -448,14 +440,17 @@ int nfs41_walk_client_list(struct nfs_client *new, | |||
| 448 | nfs_put_client(prev); | 440 | nfs_put_client(prev); |
| 449 | prev = pos; | 441 | prev = pos; |
| 450 | 442 | ||
| 451 | error = nfs_wait_client_init_complete(pos); | 443 | nfs4_schedule_lease_recovery(pos); |
| 452 | if (error < 0) { | 444 | status = nfs_wait_client_init_complete(pos); |
| 445 | if (status < 0) { | ||
| 453 | nfs_put_client(pos); | 446 | nfs_put_client(pos); |
| 454 | spin_lock(&nn->nfs_client_lock); | 447 | spin_lock(&nn->nfs_client_lock); |
| 455 | continue; | 448 | continue; |
| 456 | } | 449 | } |
| 457 | 450 | status = pos->cl_cons_state; | |
| 458 | spin_lock(&nn->nfs_client_lock); | 451 | spin_lock(&nn->nfs_client_lock); |
| 452 | if (status < 0) | ||
| 453 | continue; | ||
| 459 | } | 454 | } |
| 460 | 455 | ||
| 461 | if (pos->rpc_ops != new->rpc_ops) | 456 | if (pos->rpc_ops != new->rpc_ops) |
| @@ -473,6 +468,7 @@ int nfs41_walk_client_list(struct nfs_client *new, | |||
| 473 | if (!nfs4_match_serverowners(pos, new)) | 468 | if (!nfs4_match_serverowners(pos, new)) |
| 474 | continue; | 469 | continue; |
| 475 | 470 | ||
| 471 | atomic_inc(&pos->cl_count); | ||
| 476 | spin_unlock(&nn->nfs_client_lock); | 472 | spin_unlock(&nn->nfs_client_lock); |
| 477 | dprintk("NFS: <-- %s using nfs_client = %p ({%d})\n", | 473 | dprintk("NFS: <-- %s using nfs_client = %p ({%d})\n", |
| 478 | __func__, pos, atomic_read(&pos->cl_count)); | 474 | __func__, pos, atomic_read(&pos->cl_count)); |
| @@ -481,16 +477,10 @@ int nfs41_walk_client_list(struct nfs_client *new, | |||
| 481 | return 0; | 477 | return 0; |
| 482 | } | 478 | } |
| 483 | 479 | ||
| 484 | /* | 480 | /* No matching nfs_client found. */ |
| 485 | * No matching nfs_client found. This should be impossible, | ||
| 486 | * because the new nfs_client has already been added to | ||
| 487 | * nfs_client_list by nfs_get_client(). | ||
| 488 | * | ||
| 489 | * Don't BUG(), since the caller is holding a mutex. | ||
| 490 | */ | ||
| 491 | spin_unlock(&nn->nfs_client_lock); | 481 | spin_unlock(&nn->nfs_client_lock); |
| 492 | pr_err("NFS: %s Error: no matching nfs_client found\n", __func__); | 482 | dprintk("NFS: <-- %s status = %d\n", __func__, status); |
| 493 | return -NFS4ERR_STALE_CLIENTID; | 483 | return status; |
| 494 | } | 484 | } |
| 495 | #endif /* CONFIG_NFS_V4_1 */ | 485 | #endif /* CONFIG_NFS_V4_1 */ |
| 496 | 486 | ||
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 5d864fb36578..eae83bf96c6d 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c | |||
| @@ -896,6 +896,8 @@ static int can_open_delegated(struct nfs_delegation *delegation, fmode_t fmode) | |||
| 896 | return 0; | 896 | return 0; |
| 897 | if (test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags)) | 897 | if (test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags)) |
| 898 | return 0; | 898 | return 0; |
| 899 | if (test_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) | ||
| 900 | return 0; | ||
| 899 | nfs_mark_delegation_referenced(delegation); | 901 | nfs_mark_delegation_referenced(delegation); |
| 900 | return 1; | 902 | return 1; |
| 901 | } | 903 | } |
| @@ -973,6 +975,7 @@ static int update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stat | |||
| 973 | 975 | ||
| 974 | spin_lock(&deleg_cur->lock); | 976 | spin_lock(&deleg_cur->lock); |
| 975 | if (nfsi->delegation != deleg_cur || | 977 | if (nfsi->delegation != deleg_cur || |
| 978 | test_bit(NFS_DELEGATION_RETURNING, &deleg_cur->flags) || | ||
| 976 | (deleg_cur->type & fmode) != fmode) | 979 | (deleg_cur->type & fmode) != fmode) |
| 977 | goto no_delegation_unlock; | 980 | goto no_delegation_unlock; |
| 978 | 981 | ||
| @@ -1352,19 +1355,18 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state | |||
| 1352 | case -NFS4ERR_BAD_HIGH_SLOT: | 1355 | case -NFS4ERR_BAD_HIGH_SLOT: |
| 1353 | case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: | 1356 | case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: |
| 1354 | case -NFS4ERR_DEADSESSION: | 1357 | case -NFS4ERR_DEADSESSION: |
| 1358 | set_bit(NFS_DELEGATED_STATE, &state->flags); | ||
| 1355 | nfs4_schedule_session_recovery(server->nfs_client->cl_session, err); | 1359 | nfs4_schedule_session_recovery(server->nfs_client->cl_session, err); |
| 1360 | err = -EAGAIN; | ||
| 1356 | goto out; | 1361 | goto out; |
| 1357 | case -NFS4ERR_STALE_CLIENTID: | 1362 | case -NFS4ERR_STALE_CLIENTID: |
| 1358 | case -NFS4ERR_STALE_STATEID: | 1363 | case -NFS4ERR_STALE_STATEID: |
| 1364 | set_bit(NFS_DELEGATED_STATE, &state->flags); | ||
| 1359 | case -NFS4ERR_EXPIRED: | 1365 | case -NFS4ERR_EXPIRED: |
| 1360 | /* Don't recall a delegation if it was lost */ | 1366 | /* Don't recall a delegation if it was lost */ |
| 1361 | nfs4_schedule_lease_recovery(server->nfs_client); | 1367 | nfs4_schedule_lease_recovery(server->nfs_client); |
| 1368 | err = -EAGAIN; | ||
| 1362 | goto out; | 1369 | goto out; |
| 1363 | case -ERESTARTSYS: | ||
| 1364 | /* | ||
| 1365 | * The show must go on: exit, but mark the | ||
| 1366 | * stateid as needing recovery. | ||
| 1367 | */ | ||
| 1368 | case -NFS4ERR_DELEG_REVOKED: | 1370 | case -NFS4ERR_DELEG_REVOKED: |
| 1369 | case -NFS4ERR_ADMIN_REVOKED: | 1371 | case -NFS4ERR_ADMIN_REVOKED: |
| 1370 | case -NFS4ERR_BAD_STATEID: | 1372 | case -NFS4ERR_BAD_STATEID: |
| @@ -1375,6 +1377,7 @@ int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state | |||
| 1375 | err = 0; | 1377 | err = 0; |
| 1376 | goto out; | 1378 | goto out; |
| 1377 | } | 1379 | } |
| 1380 | set_bit(NFS_DELEGATED_STATE, &state->flags); | ||
| 1378 | err = nfs4_handle_exception(server, err, &exception); | 1381 | err = nfs4_handle_exception(server, err, &exception); |
| 1379 | } while (exception.retry); | 1382 | } while (exception.retry); |
| 1380 | out: | 1383 | out: |
| @@ -1463,7 +1466,7 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata) | |||
| 1463 | struct nfs4_state_owner *sp = data->owner; | 1466 | struct nfs4_state_owner *sp = data->owner; |
| 1464 | 1467 | ||
| 1465 | if (nfs_wait_on_sequence(data->o_arg.seqid, task) != 0) | 1468 | if (nfs_wait_on_sequence(data->o_arg.seqid, task) != 0) |
| 1466 | return; | 1469 | goto out_wait; |
| 1467 | /* | 1470 | /* |
| 1468 | * Check if we still need to send an OPEN call, or if we can use | 1471 | * Check if we still need to send an OPEN call, or if we can use |
| 1469 | * a delegation instead. | 1472 | * a delegation instead. |
| @@ -1498,6 +1501,7 @@ unlock_no_action: | |||
| 1498 | rcu_read_unlock(); | 1501 | rcu_read_unlock(); |
| 1499 | out_no_action: | 1502 | out_no_action: |
| 1500 | task->tk_action = NULL; | 1503 | task->tk_action = NULL; |
| 1504 | out_wait: | ||
| 1501 | nfs4_sequence_done(task, &data->o_res.seq_res); | 1505 | nfs4_sequence_done(task, &data->o_res.seq_res); |
| 1502 | } | 1506 | } |
| 1503 | 1507 | ||
| @@ -1626,7 +1630,8 @@ static int _nfs4_recover_proc_open(struct nfs4_opendata *data) | |||
| 1626 | 1630 | ||
| 1627 | static int nfs4_opendata_access(struct rpc_cred *cred, | 1631 | static int nfs4_opendata_access(struct rpc_cred *cred, |
| 1628 | struct nfs4_opendata *opendata, | 1632 | struct nfs4_opendata *opendata, |
| 1629 | struct nfs4_state *state, fmode_t fmode) | 1633 | struct nfs4_state *state, fmode_t fmode, |
| 1634 | int openflags) | ||
| 1630 | { | 1635 | { |
| 1631 | struct nfs_access_entry cache; | 1636 | struct nfs_access_entry cache; |
| 1632 | u32 mask; | 1637 | u32 mask; |
| @@ -1638,11 +1643,14 @@ static int nfs4_opendata_access(struct rpc_cred *cred, | |||
| 1638 | 1643 | ||
| 1639 | mask = 0; | 1644 | mask = 0; |
| 1640 | /* don't check MAY_WRITE - a newly created file may not have | 1645 | /* don't check MAY_WRITE - a newly created file may not have |
| 1641 | * write mode bits, but POSIX allows the creating process to write */ | 1646 | * write mode bits, but POSIX allows the creating process to write. |
| 1642 | if (fmode & FMODE_READ) | 1647 | * use openflags to check for exec, because fmode won't |
| 1643 | mask |= MAY_READ; | 1648 | * always have FMODE_EXEC set when file open for exec. */ |
| 1644 | if (fmode & FMODE_EXEC) | 1649 | if (openflags & __FMODE_EXEC) { |
| 1645 | mask |= MAY_EXEC; | 1650 | /* ONLY check for exec rights */ |
| 1651 | mask = MAY_EXEC; | ||
| 1652 | } else if (fmode & FMODE_READ) | ||
| 1653 | mask = MAY_READ; | ||
| 1646 | 1654 | ||
| 1647 | cache.cred = cred; | 1655 | cache.cred = cred; |
| 1648 | cache.jiffies = jiffies; | 1656 | cache.jiffies = jiffies; |
| @@ -1841,6 +1849,43 @@ static inline void nfs4_exclusive_attrset(struct nfs4_opendata *opendata, struct | |||
| 1841 | sattr->ia_valid |= ATTR_MTIME; | 1849 | sattr->ia_valid |= ATTR_MTIME; |
| 1842 | } | 1850 | } |
| 1843 | 1851 | ||
| 1852 | static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, | ||
| 1853 | fmode_t fmode, | ||
| 1854 | int flags, | ||
| 1855 | struct nfs4_state **res) | ||
| 1856 | { | ||
| 1857 | struct nfs4_state_owner *sp = opendata->owner; | ||
| 1858 | struct nfs_server *server = sp->so_server; | ||
| 1859 | struct nfs4_state *state; | ||
| 1860 | unsigned int seq; | ||
| 1861 | int ret; | ||
| 1862 | |||
| 1863 | seq = raw_seqcount_begin(&sp->so_reclaim_seqcount); | ||
| 1864 | |||
| 1865 | ret = _nfs4_proc_open(opendata); | ||
| 1866 | if (ret != 0) | ||
| 1867 | goto out; | ||
| 1868 | |||
| 1869 | state = nfs4_opendata_to_nfs4_state(opendata); | ||
| 1870 | ret = PTR_ERR(state); | ||
| 1871 | if (IS_ERR(state)) | ||
| 1872 | goto out; | ||
| 1873 | if (server->caps & NFS_CAP_POSIX_LOCK) | ||
| 1874 | set_bit(NFS_STATE_POSIX_LOCKS, &state->flags); | ||
| 1875 | |||
| 1876 | ret = nfs4_opendata_access(sp->so_cred, opendata, state, fmode, flags); | ||
| 1877 | if (ret != 0) | ||
| 1878 | goto out; | ||
| 1879 | |||
| 1880 | if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq)) { | ||
| 1881 | nfs4_schedule_stateid_recovery(server, state); | ||
| 1882 | nfs4_wait_clnt_recover(server->nfs_client); | ||
| 1883 | } | ||
| 1884 | *res = state; | ||
| 1885 | out: | ||
| 1886 | return ret; | ||
| 1887 | } | ||
| 1888 | |||
| 1844 | /* | 1889 | /* |
| 1845 | * Returns a referenced nfs4_state | 1890 | * Returns a referenced nfs4_state |
| 1846 | */ | 1891 | */ |
| @@ -1885,18 +1930,7 @@ static int _nfs4_do_open(struct inode *dir, | |||
| 1885 | if (dentry->d_inode != NULL) | 1930 | if (dentry->d_inode != NULL) |
| 1886 | opendata->state = nfs4_get_open_state(dentry->d_inode, sp); | 1931 | opendata->state = nfs4_get_open_state(dentry->d_inode, sp); |
| 1887 | 1932 | ||
| 1888 | status = _nfs4_proc_open(opendata); | 1933 | status = _nfs4_open_and_get_state(opendata, fmode, flags, &state); |
| 1889 | if (status != 0) | ||
| 1890 | goto err_opendata_put; | ||
| 1891 | |||
| 1892 | state = nfs4_opendata_to_nfs4_state(opendata); | ||
| 1893 | status = PTR_ERR(state); | ||
| 1894 | if (IS_ERR(state)) | ||
| 1895 | goto err_opendata_put; | ||
| 1896 | if (server->caps & NFS_CAP_POSIX_LOCK) | ||
| 1897 | set_bit(NFS_STATE_POSIX_LOCKS, &state->flags); | ||
| 1898 | |||
| 1899 | status = nfs4_opendata_access(cred, opendata, state, fmode); | ||
| 1900 | if (status != 0) | 1934 | if (status != 0) |
| 1901 | goto err_opendata_put; | 1935 | goto err_opendata_put; |
| 1902 | 1936 | ||
| @@ -2084,7 +2118,7 @@ static void nfs4_free_closedata(void *data) | |||
| 2084 | nfs4_put_open_state(calldata->state); | 2118 | nfs4_put_open_state(calldata->state); |
| 2085 | nfs_free_seqid(calldata->arg.seqid); | 2119 | nfs_free_seqid(calldata->arg.seqid); |
| 2086 | nfs4_put_state_owner(sp); | 2120 | nfs4_put_state_owner(sp); |
| 2087 | nfs_sb_deactive_async(sb); | 2121 | nfs_sb_deactive(sb); |
| 2088 | kfree(calldata); | 2122 | kfree(calldata); |
| 2089 | } | 2123 | } |
| 2090 | 2124 | ||
| @@ -2146,7 +2180,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) | |||
| 2146 | 2180 | ||
| 2147 | dprintk("%s: begin!\n", __func__); | 2181 | dprintk("%s: begin!\n", __func__); |
| 2148 | if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0) | 2182 | if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0) |
| 2149 | return; | 2183 | goto out_wait; |
| 2150 | 2184 | ||
| 2151 | task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE]; | 2185 | task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_DOWNGRADE]; |
| 2152 | calldata->arg.fmode = FMODE_READ|FMODE_WRITE; | 2186 | calldata->arg.fmode = FMODE_READ|FMODE_WRITE; |
| @@ -2168,16 +2202,14 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) | |||
| 2168 | 2202 | ||
| 2169 | if (!call_close) { | 2203 | if (!call_close) { |
| 2170 | /* Note: exit _without_ calling nfs4_close_done */ | 2204 | /* Note: exit _without_ calling nfs4_close_done */ |
| 2171 | task->tk_action = NULL; | 2205 | goto out_no_action; |
| 2172 | nfs4_sequence_done(task, &calldata->res.seq_res); | ||
| 2173 | goto out; | ||
| 2174 | } | 2206 | } |
| 2175 | 2207 | ||
| 2176 | if (calldata->arg.fmode == 0) { | 2208 | if (calldata->arg.fmode == 0) { |
| 2177 | task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE]; | 2209 | task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE]; |
| 2178 | if (calldata->roc && | 2210 | if (calldata->roc && |
| 2179 | pnfs_roc_drain(inode, &calldata->roc_barrier, task)) | 2211 | pnfs_roc_drain(inode, &calldata->roc_barrier, task)) |
| 2180 | goto out; | 2212 | goto out_wait; |
| 2181 | } | 2213 | } |
| 2182 | 2214 | ||
| 2183 | nfs_fattr_init(calldata->res.fattr); | 2215 | nfs_fattr_init(calldata->res.fattr); |
| @@ -2187,8 +2219,12 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) | |||
| 2187 | &calldata->res.seq_res, | 2219 | &calldata->res.seq_res, |
| 2188 | task) != 0) | 2220 | task) != 0) |
| 2189 | nfs_release_seqid(calldata->arg.seqid); | 2221 | nfs_release_seqid(calldata->arg.seqid); |
| 2190 | out: | ||
| 2191 | dprintk("%s: done!\n", __func__); | 2222 | dprintk("%s: done!\n", __func__); |
| 2223 | return; | ||
| 2224 | out_no_action: | ||
| 2225 | task->tk_action = NULL; | ||
| 2226 | out_wait: | ||
| 2227 | nfs4_sequence_done(task, &calldata->res.seq_res); | ||
| 2192 | } | 2228 | } |
| 2193 | 2229 | ||
| 2194 | static const struct rpc_call_ops nfs4_close_ops = { | 2230 | static const struct rpc_call_ops nfs4_close_ops = { |
| @@ -4419,12 +4455,10 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data) | |||
| 4419 | struct nfs4_unlockdata *calldata = data; | 4455 | struct nfs4_unlockdata *calldata = data; |
| 4420 | 4456 | ||
| 4421 | if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0) | 4457 | if (nfs_wait_on_sequence(calldata->arg.seqid, task) != 0) |
| 4422 | return; | 4458 | goto out_wait; |
| 4423 | if (test_bit(NFS_LOCK_INITIALIZED, &calldata->lsp->ls_flags) == 0) { | 4459 | if (test_bit(NFS_LOCK_INITIALIZED, &calldata->lsp->ls_flags) == 0) { |
| 4424 | /* Note: exit _without_ running nfs4_locku_done */ | 4460 | /* Note: exit _without_ running nfs4_locku_done */ |
| 4425 | task->tk_action = NULL; | 4461 | goto out_no_action; |
| 4426 | nfs4_sequence_done(task, &calldata->res.seq_res); | ||
| 4427 | return; | ||
| 4428 | } | 4462 | } |
| 4429 | calldata->timestamp = jiffies; | 4463 | calldata->timestamp = jiffies; |
| 4430 | if (nfs4_setup_sequence(calldata->server, | 4464 | if (nfs4_setup_sequence(calldata->server, |
| @@ -4432,6 +4466,11 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data) | |||
| 4432 | &calldata->res.seq_res, | 4466 | &calldata->res.seq_res, |
| 4433 | task) != 0) | 4467 | task) != 0) |
| 4434 | nfs_release_seqid(calldata->arg.seqid); | 4468 | nfs_release_seqid(calldata->arg.seqid); |
| 4469 | return; | ||
| 4470 | out_no_action: | ||
| 4471 | task->tk_action = NULL; | ||
| 4472 | out_wait: | ||
| 4473 | nfs4_sequence_done(task, &calldata->res.seq_res); | ||
| 4435 | } | 4474 | } |
| 4436 | 4475 | ||
| 4437 | static const struct rpc_call_ops nfs4_locku_ops = { | 4476 | static const struct rpc_call_ops nfs4_locku_ops = { |
| @@ -4478,7 +4517,9 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl, | |||
| 4478 | 4517 | ||
| 4479 | static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *request) | 4518 | static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *request) |
| 4480 | { | 4519 | { |
| 4481 | struct nfs_inode *nfsi = NFS_I(state->inode); | 4520 | struct inode *inode = state->inode; |
| 4521 | struct nfs4_state_owner *sp = state->owner; | ||
| 4522 | struct nfs_inode *nfsi = NFS_I(inode); | ||
| 4482 | struct nfs_seqid *seqid; | 4523 | struct nfs_seqid *seqid; |
| 4483 | struct nfs4_lock_state *lsp; | 4524 | struct nfs4_lock_state *lsp; |
| 4484 | struct rpc_task *task; | 4525 | struct rpc_task *task; |
| @@ -4488,12 +4529,17 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock * | |||
| 4488 | status = nfs4_set_lock_state(state, request); | 4529 | status = nfs4_set_lock_state(state, request); |
| 4489 | /* Unlock _before_ we do the RPC call */ | 4530 | /* Unlock _before_ we do the RPC call */ |
| 4490 | request->fl_flags |= FL_EXISTS; | 4531 | request->fl_flags |= FL_EXISTS; |
| 4532 | /* Exclude nfs_delegation_claim_locks() */ | ||
| 4533 | mutex_lock(&sp->so_delegreturn_mutex); | ||
| 4534 | /* Exclude nfs4_reclaim_open_stateid() - note nesting! */ | ||
| 4491 | down_read(&nfsi->rwsem); | 4535 | down_read(&nfsi->rwsem); |
| 4492 | if (do_vfs_lock(request->fl_file, request) == -ENOENT) { | 4536 | if (do_vfs_lock(request->fl_file, request) == -ENOENT) { |
| 4493 | up_read(&nfsi->rwsem); | 4537 | up_read(&nfsi->rwsem); |
| 4538 | mutex_unlock(&sp->so_delegreturn_mutex); | ||
| 4494 | goto out; | 4539 | goto out; |
| 4495 | } | 4540 | } |
| 4496 | up_read(&nfsi->rwsem); | 4541 | up_read(&nfsi->rwsem); |
| 4542 | mutex_unlock(&sp->so_delegreturn_mutex); | ||
| 4497 | if (status != 0) | 4543 | if (status != 0) |
| 4498 | goto out; | 4544 | goto out; |
| 4499 | /* Is this a delegated lock? */ | 4545 | /* Is this a delegated lock? */ |
| @@ -4572,7 +4618,7 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata) | |||
| 4572 | 4618 | ||
| 4573 | dprintk("%s: begin!\n", __func__); | 4619 | dprintk("%s: begin!\n", __func__); |
| 4574 | if (nfs_wait_on_sequence(data->arg.lock_seqid, task) != 0) | 4620 | if (nfs_wait_on_sequence(data->arg.lock_seqid, task) != 0) |
| 4575 | return; | 4621 | goto out_wait; |
| 4576 | /* Do we need to do an open_to_lock_owner? */ | 4622 | /* Do we need to do an open_to_lock_owner? */ |
| 4577 | if (!(data->arg.lock_seqid->sequence->flags & NFS_SEQID_CONFIRMED)) { | 4623 | if (!(data->arg.lock_seqid->sequence->flags & NFS_SEQID_CONFIRMED)) { |
| 4578 | if (nfs_wait_on_sequence(data->arg.open_seqid, task) != 0) { | 4624 | if (nfs_wait_on_sequence(data->arg.open_seqid, task) != 0) { |
| @@ -4592,6 +4638,8 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata) | |||
| 4592 | nfs_release_seqid(data->arg.open_seqid); | 4638 | nfs_release_seqid(data->arg.open_seqid); |
| 4593 | out_release_lock_seqid: | 4639 | out_release_lock_seqid: |
| 4594 | nfs_release_seqid(data->arg.lock_seqid); | 4640 | nfs_release_seqid(data->arg.lock_seqid); |
| 4641 | out_wait: | ||
| 4642 | nfs4_sequence_done(task, &data->res.seq_res); | ||
| 4595 | dprintk("%s: done!, ret = %d\n", __func__, data->rpc_status); | 4643 | dprintk("%s: done!, ret = %d\n", __func__, data->rpc_status); |
| 4596 | } | 4644 | } |
| 4597 | 4645 | ||
| @@ -4809,8 +4857,10 @@ static int nfs41_lock_expired(struct nfs4_state *state, struct file_lock *reques | |||
| 4809 | 4857 | ||
| 4810 | static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) | 4858 | static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *request) |
| 4811 | { | 4859 | { |
| 4860 | struct nfs4_state_owner *sp = state->owner; | ||
| 4812 | struct nfs_inode *nfsi = NFS_I(state->inode); | 4861 | struct nfs_inode *nfsi = NFS_I(state->inode); |
| 4813 | unsigned char fl_flags = request->fl_flags; | 4862 | unsigned char fl_flags = request->fl_flags; |
| 4863 | unsigned int seq; | ||
| 4814 | int status = -ENOLCK; | 4864 | int status = -ENOLCK; |
| 4815 | 4865 | ||
| 4816 | if ((fl_flags & FL_POSIX) && | 4866 | if ((fl_flags & FL_POSIX) && |
| @@ -4832,9 +4882,16 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock | |||
| 4832 | status = do_vfs_lock(request->fl_file, request); | 4882 | status = do_vfs_lock(request->fl_file, request); |
| 4833 | goto out_unlock; | 4883 | goto out_unlock; |
| 4834 | } | 4884 | } |
| 4885 | seq = raw_seqcount_begin(&sp->so_reclaim_seqcount); | ||
| 4886 | up_read(&nfsi->rwsem); | ||
| 4835 | status = _nfs4_do_setlk(state, cmd, request, NFS_LOCK_NEW); | 4887 | status = _nfs4_do_setlk(state, cmd, request, NFS_LOCK_NEW); |
| 4836 | if (status != 0) | 4888 | if (status != 0) |
| 4889 | goto out; | ||
| 4890 | down_read(&nfsi->rwsem); | ||
| 4891 | if (read_seqcount_retry(&sp->so_reclaim_seqcount, seq)) { | ||
| 4892 | status = -NFS4ERR_DELAY; | ||
| 4837 | goto out_unlock; | 4893 | goto out_unlock; |
| 4894 | } | ||
| 4838 | /* Note: we always want to sleep here! */ | 4895 | /* Note: we always want to sleep here! */ |
| 4839 | request->fl_flags = fl_flags | FL_SLEEP; | 4896 | request->fl_flags = fl_flags | FL_SLEEP; |
| 4840 | if (do_vfs_lock(request->fl_file, request) < 0) | 4897 | if (do_vfs_lock(request->fl_file, request) < 0) |
| @@ -4941,24 +4998,22 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl) | |||
| 4941 | case 0: | 4998 | case 0: |
| 4942 | case -ESTALE: | 4999 | case -ESTALE: |
| 4943 | goto out; | 5000 | goto out; |
| 4944 | case -NFS4ERR_EXPIRED: | ||
| 4945 | nfs4_schedule_stateid_recovery(server, state); | ||
| 4946 | case -NFS4ERR_STALE_CLIENTID: | 5001 | case -NFS4ERR_STALE_CLIENTID: |
| 4947 | case -NFS4ERR_STALE_STATEID: | 5002 | case -NFS4ERR_STALE_STATEID: |
| 5003 | set_bit(NFS_DELEGATED_STATE, &state->flags); | ||
| 5004 | case -NFS4ERR_EXPIRED: | ||
| 4948 | nfs4_schedule_lease_recovery(server->nfs_client); | 5005 | nfs4_schedule_lease_recovery(server->nfs_client); |
| 5006 | err = -EAGAIN; | ||
| 4949 | goto out; | 5007 | goto out; |
| 4950 | case -NFS4ERR_BADSESSION: | 5008 | case -NFS4ERR_BADSESSION: |
| 4951 | case -NFS4ERR_BADSLOT: | 5009 | case -NFS4ERR_BADSLOT: |
| 4952 | case -NFS4ERR_BAD_HIGH_SLOT: | 5010 | case -NFS4ERR_BAD_HIGH_SLOT: |
| 4953 | case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: | 5011 | case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: |
| 4954 | case -NFS4ERR_DEADSESSION: | 5012 | case -NFS4ERR_DEADSESSION: |
| 5013 | set_bit(NFS_DELEGATED_STATE, &state->flags); | ||
| 4955 | nfs4_schedule_session_recovery(server->nfs_client->cl_session, err); | 5014 | nfs4_schedule_session_recovery(server->nfs_client->cl_session, err); |
| 5015 | err = -EAGAIN; | ||
| 4956 | goto out; | 5016 | goto out; |
| 4957 | case -ERESTARTSYS: | ||
| 4958 | /* | ||
| 4959 | * The show must go on: exit, but mark the | ||
| 4960 | * stateid as needing recovery. | ||
| 4961 | */ | ||
| 4962 | case -NFS4ERR_DELEG_REVOKED: | 5017 | case -NFS4ERR_DELEG_REVOKED: |
| 4963 | case -NFS4ERR_ADMIN_REVOKED: | 5018 | case -NFS4ERR_ADMIN_REVOKED: |
| 4964 | case -NFS4ERR_BAD_STATEID: | 5019 | case -NFS4ERR_BAD_STATEID: |
| @@ -4971,9 +5026,8 @@ int nfs4_lock_delegation_recall(struct nfs4_state *state, struct file_lock *fl) | |||
| 4971 | /* kill_proc(fl->fl_pid, SIGLOST, 1); */ | 5026 | /* kill_proc(fl->fl_pid, SIGLOST, 1); */ |
| 4972 | err = 0; | 5027 | err = 0; |
| 4973 | goto out; | 5028 | goto out; |
| 4974 | case -NFS4ERR_DELAY: | ||
| 4975 | break; | ||
| 4976 | } | 5029 | } |
| 5030 | set_bit(NFS_DELEGATED_STATE, &state->flags); | ||
| 4977 | err = nfs4_handle_exception(server, err, &exception); | 5031 | err = nfs4_handle_exception(server, err, &exception); |
| 4978 | } while (exception.retry); | 5032 | } while (exception.retry); |
| 4979 | out: | 5033 | out: |
| @@ -6130,7 +6184,8 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags) | |||
| 6130 | status = nfs4_wait_for_completion_rpc_task(task); | 6184 | status = nfs4_wait_for_completion_rpc_task(task); |
| 6131 | if (status == 0) | 6185 | if (status == 0) |
| 6132 | status = task->tk_status; | 6186 | status = task->tk_status; |
| 6133 | if (status == 0) | 6187 | /* if layoutp->len is 0, nfs4_layoutget_prepare called rpc_exit */ |
| 6188 | if (status == 0 && lgp->res.layoutp->len) | ||
| 6134 | lseg = pnfs_layout_process(lgp); | 6189 | lseg = pnfs_layout_process(lgp); |
| 6135 | rpc_put_task(task); | 6190 | rpc_put_task(task); |
| 6136 | dprintk("<-- %s status=%d\n", __func__, status); | 6191 | dprintk("<-- %s status=%d\n", __func__, status); |
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 9448c579d41a..6ace365c6334 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c | |||
| @@ -136,16 +136,11 @@ int nfs40_discover_server_trunking(struct nfs_client *clp, | |||
| 136 | clp->cl_confirm = clid.confirm; | 136 | clp->cl_confirm = clid.confirm; |
| 137 | 137 | ||
| 138 | status = nfs40_walk_client_list(clp, result, cred); | 138 | status = nfs40_walk_client_list(clp, result, cred); |
| 139 | switch (status) { | 139 | if (status == 0) { |
| 140 | case -NFS4ERR_STALE_CLIENTID: | ||
| 141 | set_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); | ||
| 142 | case 0: | ||
| 143 | /* Sustain the lease, even if it's empty. If the clientid4 | 140 | /* Sustain the lease, even if it's empty. If the clientid4 |
| 144 | * goes stale it's of no use for trunking discovery. */ | 141 | * goes stale it's of no use for trunking discovery. */ |
| 145 | nfs4_schedule_state_renewal(*result); | 142 | nfs4_schedule_state_renewal(*result); |
| 146 | break; | ||
| 147 | } | 143 | } |
| 148 | |||
| 149 | out: | 144 | out: |
| 150 | return status; | 145 | return status; |
| 151 | } | 146 | } |
| @@ -523,6 +518,8 @@ nfs4_alloc_state_owner(struct nfs_server *server, | |||
| 523 | nfs4_init_seqid_counter(&sp->so_seqid); | 518 | nfs4_init_seqid_counter(&sp->so_seqid); |
| 524 | atomic_set(&sp->so_count, 1); | 519 | atomic_set(&sp->so_count, 1); |
| 525 | INIT_LIST_HEAD(&sp->so_lru); | 520 | INIT_LIST_HEAD(&sp->so_lru); |
| 521 | seqcount_init(&sp->so_reclaim_seqcount); | ||
| 522 | mutex_init(&sp->so_delegreturn_mutex); | ||
| 526 | return sp; | 523 | return sp; |
| 527 | } | 524 | } |
| 528 | 525 | ||
| @@ -1395,8 +1392,9 @@ static int nfs4_reclaim_open_state(struct nfs4_state_owner *sp, const struct nfs | |||
| 1395 | * recovering after a network partition or a reboot from a | 1392 | * recovering after a network partition or a reboot from a |
| 1396 | * server that doesn't support a grace period. | 1393 | * server that doesn't support a grace period. |
| 1397 | */ | 1394 | */ |
| 1398 | restart: | ||
| 1399 | spin_lock(&sp->so_lock); | 1395 | spin_lock(&sp->so_lock); |
| 1396 | write_seqcount_begin(&sp->so_reclaim_seqcount); | ||
| 1397 | restart: | ||
| 1400 | list_for_each_entry(state, &sp->so_states, open_states) { | 1398 | list_for_each_entry(state, &sp->so_states, open_states) { |
| 1401 | if (!test_and_clear_bit(ops->state_flag_bit, &state->flags)) | 1399 | if (!test_and_clear_bit(ops->state_flag_bit, &state->flags)) |
| 1402 | continue; | 1400 | continue; |
| @@ -1417,6 +1415,7 @@ restart: | |||
| 1417 | } | 1415 | } |
| 1418 | spin_unlock(&state->state_lock); | 1416 | spin_unlock(&state->state_lock); |
| 1419 | nfs4_put_open_state(state); | 1417 | nfs4_put_open_state(state); |
| 1418 | spin_lock(&sp->so_lock); | ||
| 1420 | goto restart; | 1419 | goto restart; |
| 1421 | } | 1420 | } |
| 1422 | } | 1421 | } |
| @@ -1454,12 +1453,17 @@ restart: | |||
| 1454 | goto out_err; | 1453 | goto out_err; |
| 1455 | } | 1454 | } |
| 1456 | nfs4_put_open_state(state); | 1455 | nfs4_put_open_state(state); |
| 1456 | spin_lock(&sp->so_lock); | ||
| 1457 | goto restart; | 1457 | goto restart; |
| 1458 | } | 1458 | } |
| 1459 | write_seqcount_end(&sp->so_reclaim_seqcount); | ||
| 1459 | spin_unlock(&sp->so_lock); | 1460 | spin_unlock(&sp->so_lock); |
| 1460 | return 0; | 1461 | return 0; |
| 1461 | out_err: | 1462 | out_err: |
| 1462 | nfs4_put_open_state(state); | 1463 | nfs4_put_open_state(state); |
| 1464 | spin_lock(&sp->so_lock); | ||
| 1465 | write_seqcount_end(&sp->so_reclaim_seqcount); | ||
| 1466 | spin_unlock(&sp->so_lock); | ||
| 1463 | return status; | 1467 | return status; |
| 1464 | } | 1468 | } |
| 1465 | 1469 | ||
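The hunk above wraps the open-state reclaim loop in write_seqcount_begin()/write_seqcount_end() on the new so_reclaim_seqcount, always under sp->so_lock, so other paths can notice that reclaim ran underneath them. The reader side is not part of this hunk; the sketch below only illustrates the usual read_seqcount_begin()/read_seqcount_retry() pattern such a reader would follow, and nfs4_do_something_racy() is a made-up placeholder, not a real helper.

static void example_retry_across_reclaim(struct nfs4_state_owner *sp)
{
	unsigned int seq;

	do {
		/* Snapshot the reclaim sequence count without taking so_lock. */
		seq = read_seqcount_begin(&sp->so_reclaim_seqcount);
		nfs4_do_something_racy(sp);	/* placeholder for the racy work */
		/* Retry if nfs4_reclaim_open_state() ran in the meantime. */
	} while (read_seqcount_retry(&sp->so_reclaim_seqcount, seq));
}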
| @@ -1863,6 +1867,7 @@ again: | |||
| 1863 | case -ETIMEDOUT: | 1867 | case -ETIMEDOUT: |
| 1864 | case -EAGAIN: | 1868 | case -EAGAIN: |
| 1865 | ssleep(1); | 1869 | ssleep(1); |
| 1870 | case -NFS4ERR_STALE_CLIENTID: | ||
| 1866 | dprintk("NFS: %s after status %d, retrying\n", | 1871 | dprintk("NFS: %s after status %d, retrying\n", |
| 1867 | __func__, status); | 1872 | __func__, status); |
| 1868 | goto again; | 1873 | goto again; |
| @@ -2022,8 +2027,18 @@ static int nfs4_reset_session(struct nfs_client *clp) | |||
| 2022 | nfs4_begin_drain_session(clp); | 2027 | nfs4_begin_drain_session(clp); |
| 2023 | cred = nfs4_get_exchange_id_cred(clp); | 2028 | cred = nfs4_get_exchange_id_cred(clp); |
| 2024 | status = nfs4_proc_destroy_session(clp->cl_session, cred); | 2029 | status = nfs4_proc_destroy_session(clp->cl_session, cred); |
| 2025 | if (status && status != -NFS4ERR_BADSESSION && | 2030 | switch (status) { |
| 2026 | status != -NFS4ERR_DEADSESSION) { | 2031 | case 0: |
| 2032 | case -NFS4ERR_BADSESSION: | ||
| 2033 | case -NFS4ERR_DEADSESSION: | ||
| 2034 | break; | ||
| 2035 | case -NFS4ERR_BACK_CHAN_BUSY: | ||
| 2036 | case -NFS4ERR_DELAY: | ||
| 2037 | set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); | ||
| 2038 | status = 0; | ||
| 2039 | ssleep(1); | ||
| 2040 | goto out; | ||
| 2041 | default: | ||
| 2027 | status = nfs4_recovery_handle_error(clp, status); | 2042 | status = nfs4_recovery_handle_error(clp, status); |
| 2028 | goto out; | 2043 | goto out; |
| 2029 | } | 2044 | } |
diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index c6f990656f89..88f9611a945c 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c | |||
| @@ -647,6 +647,7 @@ static struct pnfs_layoutdriver_type objlayout_type = { | |||
| 647 | .flags = PNFS_LAYOUTRET_ON_SETATTR | | 647 | .flags = PNFS_LAYOUTRET_ON_SETATTR | |
| 648 | PNFS_LAYOUTRET_ON_ERROR, | 648 | PNFS_LAYOUTRET_ON_ERROR, |
| 649 | 649 | ||
| 650 | .owner = THIS_MODULE, | ||
| 650 | .alloc_layout_hdr = objlayout_alloc_layout_hdr, | 651 | .alloc_layout_hdr = objlayout_alloc_layout_hdr, |
| 651 | .free_layout_hdr = objlayout_free_layout_hdr, | 652 | .free_layout_hdr = objlayout_free_layout_hdr, |
| 652 | 653 | ||
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index e7165d915362..6be70f622b62 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c | |||
| @@ -254,7 +254,7 @@ static void | |||
| 254 | pnfs_layout_set_fail_bit(struct pnfs_layout_hdr *lo, int fail_bit) | 254 | pnfs_layout_set_fail_bit(struct pnfs_layout_hdr *lo, int fail_bit) |
| 255 | { | 255 | { |
| 256 | lo->plh_retry_timestamp = jiffies; | 256 | lo->plh_retry_timestamp = jiffies; |
| 257 | if (test_and_set_bit(fail_bit, &lo->plh_flags)) | 257 | if (!test_and_set_bit(fail_bit, &lo->plh_flags)) |
| 258 | atomic_inc(&lo->plh_refcount); | 258 | atomic_inc(&lo->plh_refcount); |
| 259 | } | 259 | } |
| 260 | 260 | ||
| @@ -505,37 +505,147 @@ pnfs_destroy_layout(struct nfs_inode *nfsi) | |||
| 505 | } | 505 | } |
| 506 | EXPORT_SYMBOL_GPL(pnfs_destroy_layout); | 506 | EXPORT_SYMBOL_GPL(pnfs_destroy_layout); |
| 507 | 507 | ||
| 508 | /* | 508 | static bool |
| 509 | * Called by the state manger to remove all layouts established under an | 509 | pnfs_layout_add_bulk_destroy_list(struct inode *inode, |
| 510 | * expired lease. | 510 | struct list_head *layout_list) |
| 511 | */ | ||
| 512 | void | ||
| 513 | pnfs_destroy_all_layouts(struct nfs_client *clp) | ||
| 514 | { | 511 | { |
| 515 | struct nfs_server *server; | ||
| 516 | struct pnfs_layout_hdr *lo; | 512 | struct pnfs_layout_hdr *lo; |
| 517 | LIST_HEAD(tmp_list); | 513 | bool ret = false; |
| 518 | 514 | ||
| 519 | nfs4_deviceid_mark_client_invalid(clp); | 515 | spin_lock(&inode->i_lock); |
| 520 | nfs4_deviceid_purge_client(clp); | 516 | lo = NFS_I(inode)->layout; |
| 517 | if (lo != NULL && list_empty(&lo->plh_bulk_destroy)) { | ||
| 518 | pnfs_get_layout_hdr(lo); | ||
| 519 | list_add(&lo->plh_bulk_destroy, layout_list); | ||
| 520 | ret = true; | ||
| 521 | } | ||
| 522 | spin_unlock(&inode->i_lock); | ||
| 523 | return ret; | ||
| 524 | } | ||
| 525 | |||
| 526 | /* Caller must hold rcu_read_lock and clp->cl_lock */ | ||
| 527 | static int | ||
| 528 | pnfs_layout_bulk_destroy_byserver_locked(struct nfs_client *clp, | ||
| 529 | struct nfs_server *server, | ||
| 530 | struct list_head *layout_list) | ||
| 531 | { | ||
| 532 | struct pnfs_layout_hdr *lo, *next; | ||
| 533 | struct inode *inode; | ||
| 534 | |||
| 535 | list_for_each_entry_safe(lo, next, &server->layouts, plh_layouts) { | ||
| 536 | inode = igrab(lo->plh_inode); | ||
| 537 | if (inode == NULL) | ||
| 538 | continue; | ||
| 539 | list_del_init(&lo->plh_layouts); | ||
| 540 | if (pnfs_layout_add_bulk_destroy_list(inode, layout_list)) | ||
| 541 | continue; | ||
| 542 | rcu_read_unlock(); | ||
| 543 | spin_unlock(&clp->cl_lock); | ||
| 544 | iput(inode); | ||
| 545 | spin_lock(&clp->cl_lock); | ||
| 546 | rcu_read_lock(); | ||
| 547 | return -EAGAIN; | ||
| 548 | } | ||
| 549 | return 0; | ||
| 550 | } | ||
| 551 | |||
| 552 | static int | ||
| 553 | pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list, | ||
| 554 | bool is_bulk_recall) | ||
| 555 | { | ||
| 556 | struct pnfs_layout_hdr *lo; | ||
| 557 | struct inode *inode; | ||
| 558 | struct pnfs_layout_range range = { | ||
| 559 | .iomode = IOMODE_ANY, | ||
| 560 | .offset = 0, | ||
| 561 | .length = NFS4_MAX_UINT64, | ||
| 562 | }; | ||
| 563 | LIST_HEAD(lseg_list); | ||
| 564 | int ret = 0; | ||
| 565 | |||
| 566 | while (!list_empty(layout_list)) { | ||
| 567 | lo = list_entry(layout_list->next, struct pnfs_layout_hdr, | ||
| 568 | plh_bulk_destroy); | ||
| 569 | dprintk("%s freeing layout for inode %lu\n", __func__, | ||
| 570 | lo->plh_inode->i_ino); | ||
| 571 | inode = lo->plh_inode; | ||
| 572 | spin_lock(&inode->i_lock); | ||
| 573 | list_del_init(&lo->plh_bulk_destroy); | ||
| 574 | lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */ | ||
| 575 | if (is_bulk_recall) | ||
| 576 | set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); | ||
| 577 | if (pnfs_mark_matching_lsegs_invalid(lo, &lseg_list, &range)) | ||
| 578 | ret = -EAGAIN; | ||
| 579 | spin_unlock(&inode->i_lock); | ||
| 580 | pnfs_free_lseg_list(&lseg_list); | ||
| 581 | pnfs_put_layout_hdr(lo); | ||
| 582 | iput(inode); | ||
| 583 | } | ||
| 584 | return ret; | ||
| 585 | } | ||
| 586 | |||
| 587 | int | ||
| 588 | pnfs_destroy_layouts_byfsid(struct nfs_client *clp, | ||
| 589 | struct nfs_fsid *fsid, | ||
| 590 | bool is_recall) | ||
| 591 | { | ||
| 592 | struct nfs_server *server; | ||
| 593 | LIST_HEAD(layout_list); | ||
| 521 | 594 | ||
| 522 | spin_lock(&clp->cl_lock); | 595 | spin_lock(&clp->cl_lock); |
| 523 | rcu_read_lock(); | 596 | rcu_read_lock(); |
| 597 | restart: | ||
| 524 | list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { | 598 | list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { |
| 525 | if (!list_empty(&server->layouts)) | 599 | if (memcmp(&server->fsid, fsid, sizeof(*fsid)) != 0) |
| 526 | list_splice_init(&server->layouts, &tmp_list); | 600 | continue; |
| 601 | if (pnfs_layout_bulk_destroy_byserver_locked(clp, | ||
| 602 | server, | ||
| 603 | &layout_list) != 0) | ||
| 604 | goto restart; | ||
| 527 | } | 605 | } |
| 528 | rcu_read_unlock(); | 606 | rcu_read_unlock(); |
| 529 | spin_unlock(&clp->cl_lock); | 607 | spin_unlock(&clp->cl_lock); |
| 530 | 608 | ||
| 531 | while (!list_empty(&tmp_list)) { | 609 | if (list_empty(&layout_list)) |
| 532 | lo = list_entry(tmp_list.next, struct pnfs_layout_hdr, | 610 | return 0; |
| 533 | plh_layouts); | 611 | return pnfs_layout_free_bulk_destroy_list(&layout_list, is_recall); |
| 534 | dprintk("%s freeing layout for inode %lu\n", __func__, | 612 | } |
| 535 | lo->plh_inode->i_ino); | 613 | |
| 536 | list_del_init(&lo->plh_layouts); | 614 | int |
| 537 | pnfs_destroy_layout(NFS_I(lo->plh_inode)); | 615 | pnfs_destroy_layouts_byclid(struct nfs_client *clp, |
| 616 | bool is_recall) | ||
| 617 | { | ||
| 618 | struct nfs_server *server; | ||
| 619 | LIST_HEAD(layout_list); | ||
| 620 | |||
| 621 | spin_lock(&clp->cl_lock); | ||
| 622 | rcu_read_lock(); | ||
| 623 | restart: | ||
| 624 | list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { | ||
| 625 | if (pnfs_layout_bulk_destroy_byserver_locked(clp, | ||
| 626 | server, | ||
| 627 | &layout_list) != 0) | ||
| 628 | goto restart; | ||
| 538 | } | 629 | } |
| 630 | rcu_read_unlock(); | ||
| 631 | spin_unlock(&clp->cl_lock); | ||
| 632 | |||
| 633 | if (list_empty(&layout_list)) | ||
| 634 | return 0; | ||
| 635 | return pnfs_layout_free_bulk_destroy_list(&layout_list, is_recall); | ||
| 636 | } | ||
| 637 | |||
| 638 | /* | ||
| 639 | * Called by the state manager to remove all layouts established under an | ||
| 640 | * expired lease. | ||
| 641 | */ | ||
| 642 | void | ||
| 643 | pnfs_destroy_all_layouts(struct nfs_client *clp) | ||
| 644 | { | ||
| 645 | nfs4_deviceid_mark_client_invalid(clp); | ||
| 646 | nfs4_deviceid_purge_client(clp); | ||
| 647 | |||
| 648 | pnfs_destroy_layouts_byclid(clp, false); | ||
| 539 | } | 649 | } |
| 540 | 650 | ||
| 541 | /* | 651 | /* |
| @@ -888,7 +998,7 @@ alloc_init_layout_hdr(struct inode *ino, | |||
| 888 | atomic_set(&lo->plh_refcount, 1); | 998 | atomic_set(&lo->plh_refcount, 1); |
| 889 | INIT_LIST_HEAD(&lo->plh_layouts); | 999 | INIT_LIST_HEAD(&lo->plh_layouts); |
| 890 | INIT_LIST_HEAD(&lo->plh_segs); | 1000 | INIT_LIST_HEAD(&lo->plh_segs); |
| 891 | INIT_LIST_HEAD(&lo->plh_bulk_recall); | 1001 | INIT_LIST_HEAD(&lo->plh_bulk_destroy); |
| 892 | lo->plh_inode = ino; | 1002 | lo->plh_inode = ino; |
| 893 | lo->plh_lc_cred = get_rpccred(ctx->state->owner->so_cred); | 1003 | lo->plh_lc_cred = get_rpccred(ctx->state->owner->so_cred); |
| 894 | return lo; | 1004 | return lo; |
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index dbf7bba52da0..97cb358bb882 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h | |||
| @@ -132,7 +132,7 @@ struct pnfs_layoutdriver_type { | |||
| 132 | struct pnfs_layout_hdr { | 132 | struct pnfs_layout_hdr { |
| 133 | atomic_t plh_refcount; | 133 | atomic_t plh_refcount; |
| 134 | struct list_head plh_layouts; /* other client layouts */ | 134 | struct list_head plh_layouts; /* other client layouts */ |
| 135 | struct list_head plh_bulk_recall; /* clnt list of bulk recalls */ | 135 | struct list_head plh_bulk_destroy; |
| 136 | struct list_head plh_segs; /* layout segments list */ | 136 | struct list_head plh_segs; /* layout segments list */ |
| 137 | nfs4_stateid plh_stateid; | 137 | nfs4_stateid plh_stateid; |
| 138 | atomic_t plh_outstanding; /* number of RPCs out */ | 138 | atomic_t plh_outstanding; /* number of RPCs out */ |
| @@ -196,6 +196,11 @@ struct pnfs_layout_segment *pnfs_layout_process(struct nfs4_layoutget *lgp); | |||
| 196 | void pnfs_free_lseg_list(struct list_head *tmp_list); | 196 | void pnfs_free_lseg_list(struct list_head *tmp_list); |
| 197 | void pnfs_destroy_layout(struct nfs_inode *); | 197 | void pnfs_destroy_layout(struct nfs_inode *); |
| 198 | void pnfs_destroy_all_layouts(struct nfs_client *); | 198 | void pnfs_destroy_all_layouts(struct nfs_client *); |
| 199 | int pnfs_destroy_layouts_byfsid(struct nfs_client *clp, | ||
| 200 | struct nfs_fsid *fsid, | ||
| 201 | bool is_recall); | ||
| 202 | int pnfs_destroy_layouts_byclid(struct nfs_client *clp, | ||
| 203 | bool is_recall); | ||
| 199 | void pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo); | 204 | void pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo); |
| 200 | void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, | 205 | void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, |
| 201 | const nfs4_stateid *new, | 206 | const nfs4_stateid *new, |
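The two declarations added above are the entry points for bulk layout teardown that pnfs.c now implements. A hedged sketch of how a caller might choose between them follows; where clp and fsid come from, and how an -EAGAIN return would be retried, is assumed rather than taken from this patch.

static int example_bulk_layout_recall(struct nfs_client *clp,
				      struct nfs_fsid *fsid)
{
	/* Recall layouts for one fsid when the server names a filesystem,
	 * otherwise recall everything held under this client ID. */
	if (fsid != NULL)
		return pnfs_destroy_layouts_byfsid(clp, fsid, true);
	return pnfs_destroy_layouts_byclid(clp, true);
}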
diff --git a/fs/nfs/read.c b/fs/nfs/read.c index b6bdb18e892c..a5e5d9899d56 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c | |||
| @@ -91,12 +91,16 @@ void nfs_readdata_release(struct nfs_read_data *rdata) | |||
| 91 | put_nfs_open_context(rdata->args.context); | 91 | put_nfs_open_context(rdata->args.context); |
| 92 | if (rdata->pages.pagevec != rdata->pages.page_array) | 92 | if (rdata->pages.pagevec != rdata->pages.page_array) |
| 93 | kfree(rdata->pages.pagevec); | 93 | kfree(rdata->pages.pagevec); |
| 94 | if (rdata != &read_header->rpc_data) | 94 | if (rdata == &read_header->rpc_data) { |
| 95 | kfree(rdata); | ||
| 96 | else | ||
| 97 | rdata->header = NULL; | 95 | rdata->header = NULL; |
| 96 | rdata = NULL; | ||
| 97 | } | ||
| 98 | if (atomic_dec_and_test(&hdr->refcnt)) | 98 | if (atomic_dec_and_test(&hdr->refcnt)) |
| 99 | hdr->completion_ops->completion(hdr); | 99 | hdr->completion_ops->completion(hdr); |
| 100 | /* Note: we only free the rpc_task after callbacks are done. | ||
| 101 | * See the comment in rpc_free_task() for why | ||
| 102 | */ | ||
| 103 | kfree(rdata); | ||
| 100 | } | 104 | } |
| 101 | EXPORT_SYMBOL_GPL(nfs_readdata_release); | 105 | EXPORT_SYMBOL_GPL(nfs_readdata_release); |
| 102 | 106 | ||
diff --git a/fs/nfs/super.c b/fs/nfs/super.c index c25cadf8f8c4..befbae0cce41 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c | |||
| @@ -54,7 +54,6 @@ | |||
| 54 | #include <linux/parser.h> | 54 | #include <linux/parser.h> |
| 55 | #include <linux/nsproxy.h> | 55 | #include <linux/nsproxy.h> |
| 56 | #include <linux/rcupdate.h> | 56 | #include <linux/rcupdate.h> |
| 57 | #include <linux/kthread.h> | ||
| 58 | 57 | ||
| 59 | #include <asm/uaccess.h> | 58 | #include <asm/uaccess.h> |
| 60 | 59 | ||
| @@ -418,54 +417,6 @@ void nfs_sb_deactive(struct super_block *sb) | |||
| 418 | } | 417 | } |
| 419 | EXPORT_SYMBOL_GPL(nfs_sb_deactive); | 418 | EXPORT_SYMBOL_GPL(nfs_sb_deactive); |
| 420 | 419 | ||
| 421 | static int nfs_deactivate_super_async_work(void *ptr) | ||
| 422 | { | ||
| 423 | struct super_block *sb = ptr; | ||
| 424 | |||
| 425 | deactivate_super(sb); | ||
| 426 | module_put_and_exit(0); | ||
| 427 | return 0; | ||
| 428 | } | ||
| 429 | |||
| 430 | /* | ||
| 431 | * same effect as deactivate_super, but will do final unmount in kthread | ||
| 432 | * context | ||
| 433 | */ | ||
| 434 | static void nfs_deactivate_super_async(struct super_block *sb) | ||
| 435 | { | ||
| 436 | struct task_struct *task; | ||
| 437 | char buf[INET6_ADDRSTRLEN + 1]; | ||
| 438 | struct nfs_server *server = NFS_SB(sb); | ||
| 439 | struct nfs_client *clp = server->nfs_client; | ||
| 440 | |||
| 441 | if (!atomic_add_unless(&sb->s_active, -1, 1)) { | ||
| 442 | rcu_read_lock(); | ||
| 443 | snprintf(buf, sizeof(buf), | ||
| 444 | rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)); | ||
| 445 | rcu_read_unlock(); | ||
| 446 | |||
| 447 | __module_get(THIS_MODULE); | ||
| 448 | task = kthread_run(nfs_deactivate_super_async_work, sb, | ||
| 449 | "%s-deactivate-super", buf); | ||
| 450 | if (IS_ERR(task)) { | ||
| 451 | pr_err("%s: kthread_run: %ld\n", | ||
| 452 | __func__, PTR_ERR(task)); | ||
| 453 | /* make synchronous call and hope for the best */ | ||
| 454 | deactivate_super(sb); | ||
| 455 | module_put(THIS_MODULE); | ||
| 456 | } | ||
| 457 | } | ||
| 458 | } | ||
| 459 | |||
| 460 | void nfs_sb_deactive_async(struct super_block *sb) | ||
| 461 | { | ||
| 462 | struct nfs_server *server = NFS_SB(sb); | ||
| 463 | |||
| 464 | if (atomic_dec_and_test(&server->active)) | ||
| 465 | nfs_deactivate_super_async(sb); | ||
| 466 | } | ||
| 467 | EXPORT_SYMBOL_GPL(nfs_sb_deactive_async); | ||
| 468 | |||
| 469 | /* | 420 | /* |
| 470 | * Deliver file system statistics to userspace | 421 | * Deliver file system statistics to userspace |
| 471 | */ | 422 | */ |
| @@ -1152,7 +1103,7 @@ static int nfs_get_option_str(substring_t args[], char **option) | |||
| 1152 | { | 1103 | { |
| 1153 | kfree(*option); | 1104 | kfree(*option); |
| 1154 | *option = match_strdup(args); | 1105 | *option = match_strdup(args); |
| 1155 | return !option; | 1106 | return !*option; |
| 1156 | } | 1107 | } |
| 1157 | 1108 | ||
| 1158 | static int nfs_get_option_ul(substring_t args[], unsigned long *option) | 1109 | static int nfs_get_option_ul(substring_t args[], unsigned long *option) |
| @@ -2589,27 +2540,23 @@ nfs_xdev_mount(struct file_system_type *fs_type, int flags, | |||
| 2589 | struct nfs_server *server; | 2540 | struct nfs_server *server; |
| 2590 | struct dentry *mntroot = ERR_PTR(-ENOMEM); | 2541 | struct dentry *mntroot = ERR_PTR(-ENOMEM); |
| 2591 | struct nfs_subversion *nfs_mod = NFS_SB(data->sb)->nfs_client->cl_nfs_mod; | 2542 | struct nfs_subversion *nfs_mod = NFS_SB(data->sb)->nfs_client->cl_nfs_mod; |
| 2592 | int error; | ||
| 2593 | 2543 | ||
| 2594 | dprintk("--> nfs_xdev_mount_common()\n"); | 2544 | dprintk("--> nfs_xdev_mount()\n"); |
| 2595 | 2545 | ||
| 2596 | mount_info.mntfh = mount_info.cloned->fh; | 2546 | mount_info.mntfh = mount_info.cloned->fh; |
| 2597 | 2547 | ||
| 2598 | /* create a new volume representation */ | 2548 | /* create a new volume representation */ |
| 2599 | server = nfs_mod->rpc_ops->clone_server(NFS_SB(data->sb), data->fh, data->fattr, data->authflavor); | 2549 | server = nfs_mod->rpc_ops->clone_server(NFS_SB(data->sb), data->fh, data->fattr, data->authflavor); |
| 2600 | if (IS_ERR(server)) { | ||
| 2601 | error = PTR_ERR(server); | ||
| 2602 | goto out_err; | ||
| 2603 | } | ||
| 2604 | 2550 | ||
| 2605 | mntroot = nfs_fs_mount_common(server, flags, dev_name, &mount_info, nfs_mod); | 2551 | if (IS_ERR(server)) |
| 2606 | dprintk("<-- nfs_xdev_mount_common() = 0\n"); | 2552 | mntroot = ERR_CAST(server); |
| 2607 | out: | 2553 | else |
| 2608 | return mntroot; | 2554 | mntroot = nfs_fs_mount_common(server, flags, |
| 2555 | dev_name, &mount_info, nfs_mod); | ||
| 2609 | 2556 | ||
| 2610 | out_err: | 2557 | dprintk("<-- nfs_xdev_mount() = %ld\n", |
| 2611 | dprintk("<-- nfs_xdev_mount_common() = %d [error]\n", error); | 2558 | IS_ERR(mntroot) ? PTR_ERR(mntroot) : 0L); |
| 2612 | goto out; | 2559 | return mntroot; |
| 2613 | } | 2560 | } |
| 2614 | 2561 | ||
| 2615 | #if IS_ENABLED(CONFIG_NFS_V4) | 2562 | #if IS_ENABLED(CONFIG_NFS_V4) |
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index 3f79c77153b8..d26a32f5b53b 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c | |||
| @@ -95,7 +95,7 @@ static void nfs_async_unlink_release(void *calldata) | |||
| 95 | 95 | ||
| 96 | nfs_dec_sillycount(data->dir); | 96 | nfs_dec_sillycount(data->dir); |
| 97 | nfs_free_unlinkdata(data); | 97 | nfs_free_unlinkdata(data); |
| 98 | nfs_sb_deactive_async(sb); | 98 | nfs_sb_deactive(sb); |
| 99 | } | 99 | } |
| 100 | 100 | ||
| 101 | static void nfs_unlink_prepare(struct rpc_task *task, void *calldata) | 101 | static void nfs_unlink_prepare(struct rpc_task *task, void *calldata) |
| @@ -268,8 +268,7 @@ nfs_async_unlink(struct inode *dir, struct dentry *dentry) | |||
| 268 | * point dentry is definitely not a root, so we won't need | 268 | * point dentry is definitely not a root, so we won't need |
| 269 | * that anymore. | 269 | * that anymore. |
| 270 | */ | 270 | */ |
| 271 | if (devname_garbage) | 271 | kfree(devname_garbage); |
| 272 | kfree(devname_garbage); | ||
| 273 | return 0; | 272 | return 0; |
| 274 | out_unlock: | 273 | out_unlock: |
| 275 | spin_unlock(&dentry->d_lock); | 274 | spin_unlock(&dentry->d_lock); |
diff --git a/fs/nfs/write.c b/fs/nfs/write.c index b673be31590e..c483cc50b82e 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c | |||
| @@ -126,12 +126,16 @@ void nfs_writedata_release(struct nfs_write_data *wdata) | |||
| 126 | put_nfs_open_context(wdata->args.context); | 126 | put_nfs_open_context(wdata->args.context); |
| 127 | if (wdata->pages.pagevec != wdata->pages.page_array) | 127 | if (wdata->pages.pagevec != wdata->pages.page_array) |
| 128 | kfree(wdata->pages.pagevec); | 128 | kfree(wdata->pages.pagevec); |
| 129 | if (wdata != &write_header->rpc_data) | 129 | if (wdata == &write_header->rpc_data) { |
| 130 | kfree(wdata); | ||
| 131 | else | ||
| 132 | wdata->header = NULL; | 130 | wdata->header = NULL; |
| 131 | wdata = NULL; | ||
| 132 | } | ||
| 133 | if (atomic_dec_and_test(&hdr->refcnt)) | 133 | if (atomic_dec_and_test(&hdr->refcnt)) |
| 134 | hdr->completion_ops->completion(hdr); | 134 | hdr->completion_ops->completion(hdr); |
| 135 | /* Note: we only free the rpc_task after callbacks are done. | ||
| 136 | * See the comment in rpc_free_task() for why | ||
| 137 | */ | ||
| 138 | kfree(wdata); | ||
| 135 | } | 139 | } |
| 136 | EXPORT_SYMBOL_GPL(nfs_writedata_release); | 140 | EXPORT_SYMBOL_GPL(nfs_writedata_release); |
| 137 | 141 | ||
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index 8df1ea4a6ff9..430b6872806f 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig | |||
| @@ -65,8 +65,8 @@ config NFSD_V3_ACL | |||
| 65 | If unsure, say N. | 65 | If unsure, say N. |
| 66 | 66 | ||
| 67 | config NFSD_V4 | 67 | config NFSD_V4 |
| 68 | bool "NFS server support for NFS version 4 (EXPERIMENTAL)" | 68 | bool "NFS server support for NFS version 4" |
| 69 | depends on NFSD && PROC_FS && EXPERIMENTAL | 69 | depends on NFSD && PROC_FS |
| 70 | select NFSD_V3 | 70 | select NFSD_V3 |
| 71 | select FS_POSIX_ACL | 71 | select FS_POSIX_ACL |
| 72 | select SUNRPC_GSS | 72 | select SUNRPC_GSS |
diff --git a/fs/nilfs2/Kconfig b/fs/nilfs2/Kconfig index 251da07b2a1d..80da8eb27393 100644 --- a/fs/nilfs2/Kconfig +++ b/fs/nilfs2/Kconfig | |||
| @@ -1,6 +1,5 @@ | |||
| 1 | config NILFS2_FS | 1 | config NILFS2_FS |
| 2 | tristate "NILFS2 file system support (EXPERIMENTAL)" | 2 | tristate "NILFS2 file system support" |
| 3 | depends on EXPERIMENTAL | ||
| 4 | select CRC32 | 3 | select CRC32 |
| 5 | help | 4 | help |
| 6 | NILFS2 is a log-structured file system (LFS) supporting continuous | 5 | NILFS2 is a log-structured file system (LFS) supporting continuous |
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c index 61946883025c..bec4af6eab13 100644 --- a/fs/nilfs2/file.c +++ b/fs/nilfs2/file.c | |||
| @@ -126,7 +126,7 @@ static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
| 126 | nilfs_transaction_commit(inode->i_sb); | 126 | nilfs_transaction_commit(inode->i_sb); |
| 127 | 127 | ||
| 128 | mapped: | 128 | mapped: |
| 129 | wait_on_page_writeback(page); | 129 | wait_for_stable_page(page); |
| 130 | out: | 130 | out: |
| 131 | sb_end_pagefault(inode->i_sb); | 131 | sb_end_pagefault(inode->i_sb); |
| 132 | return block_page_mkwrite_return(ret); | 132 | return block_page_mkwrite_return(ret); |
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index fdb180769485..f3859354e41a 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c | |||
| @@ -664,8 +664,11 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp, | |||
| 664 | if (ret < 0) | 664 | if (ret < 0) |
| 665 | printk(KERN_ERR "NILFS: GC failed during preparation: " | 665 | printk(KERN_ERR "NILFS: GC failed during preparation: " |
| 666 | "cannot read source blocks: err=%d\n", ret); | 666 | "cannot read source blocks: err=%d\n", ret); |
| 667 | else | 667 | else { |
| 668 | if (nilfs_sb_need_update(nilfs)) | ||
| 669 | set_nilfs_discontinued(nilfs); | ||
| 668 | ret = nilfs_clean_segments(inode->i_sb, argv, kbufs); | 670 | ret = nilfs_clean_segments(inode->i_sb, argv, kbufs); |
| 671 | } | ||
| 669 | 672 | ||
| 670 | nilfs_remove_all_gcinodes(nilfs); | 673 | nilfs_remove_all_gcinodes(nilfs); |
| 671 | clear_nilfs_gc_running(nilfs); | 674 | clear_nilfs_gc_running(nilfs); |
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 228a2c2ad8d7..07f7a92fe88e 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c | |||
| @@ -576,8 +576,6 @@ static int inotify_update_existing_watch(struct fsnotify_group *group, | |||
| 576 | 576 | ||
| 577 | /* don't allow invalid bits: we don't want flags set */ | 577 | /* don't allow invalid bits: we don't want flags set */ |
| 578 | mask = inotify_arg_to_mask(arg); | 578 | mask = inotify_arg_to_mask(arg); |
| 579 | if (unlikely(!(mask & IN_ALL_EVENTS))) | ||
| 580 | return -EINVAL; | ||
| 581 | 579 | ||
| 582 | fsn_mark = fsnotify_find_inode_mark(group, inode); | 580 | fsn_mark = fsnotify_find_inode_mark(group, inode); |
| 583 | if (!fsn_mark) | 581 | if (!fsn_mark) |
| @@ -629,8 +627,6 @@ static int inotify_new_watch(struct fsnotify_group *group, | |||
| 629 | 627 | ||
| 630 | /* don't allow invalid bits: we don't want flags set */ | 628 | /* don't allow invalid bits: we don't want flags set */ |
| 631 | mask = inotify_arg_to_mask(arg); | 629 | mask = inotify_arg_to_mask(arg); |
| 632 | if (unlikely(!(mask & IN_ALL_EVENTS))) | ||
| 633 | return -EINVAL; | ||
| 634 | 630 | ||
| 635 | tmp_i_mark = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL); | 631 | tmp_i_mark = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL); |
| 636 | if (unlikely(!tmp_i_mark)) | 632 | if (unlikely(!tmp_i_mark)) |
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 31b9463fba1f..b8a9d87231b1 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c | |||
| @@ -6751,8 +6751,7 @@ int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle, | |||
| 6751 | mlog_errno(ret); | 6751 | mlog_errno(ret); |
| 6752 | 6752 | ||
| 6753 | out: | 6753 | out: |
| 6754 | if (pages) | 6754 | kfree(pages); |
| 6755 | kfree(pages); | ||
| 6756 | 6755 | ||
| 6757 | return ret; | 6756 | return ret; |
| 6758 | } | 6757 | } |
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 657743254eb9..9796330d8f04 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c | |||
| @@ -1194,6 +1194,7 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping, | |||
| 1194 | goto out; | 1194 | goto out; |
| 1195 | } | 1195 | } |
| 1196 | } | 1196 | } |
| 1197 | wait_for_stable_page(wc->w_pages[i]); | ||
| 1197 | 1198 | ||
| 1198 | if (index == target_index) | 1199 | if (index == target_index) |
| 1199 | wc->w_target_page = wc->w_pages[i]; | 1200 | wc->w_target_page = wc->w_pages[i]; |
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index f7c648d7d6bf..42252bf64b51 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c | |||
| @@ -1471,8 +1471,7 @@ static void o2hb_region_release(struct config_item *item) | |||
| 1471 | 1471 | ||
| 1472 | mlog(ML_HEARTBEAT, "hb region release (%s)\n", reg->hr_dev_name); | 1472 | mlog(ML_HEARTBEAT, "hb region release (%s)\n", reg->hr_dev_name); |
| 1473 | 1473 | ||
| 1474 | if (reg->hr_tmp_block) | 1474 | kfree(reg->hr_tmp_block); |
| 1475 | kfree(reg->hr_tmp_block); | ||
| 1476 | 1475 | ||
| 1477 | if (reg->hr_slot_data) { | 1476 | if (reg->hr_slot_data) { |
| 1478 | for (i = 0; i < reg->hr_num_pages; i++) { | 1477 | for (i = 0; i < reg->hr_num_pages; i++) { |
| @@ -1486,8 +1485,7 @@ static void o2hb_region_release(struct config_item *item) | |||
| 1486 | if (reg->hr_bdev) | 1485 | if (reg->hr_bdev) |
| 1487 | blkdev_put(reg->hr_bdev, FMODE_READ|FMODE_WRITE); | 1486 | blkdev_put(reg->hr_bdev, FMODE_READ|FMODE_WRITE); |
| 1488 | 1487 | ||
| 1489 | if (reg->hr_slots) | 1488 | kfree(reg->hr_slots); |
| 1490 | kfree(reg->hr_slots); | ||
| 1491 | 1489 | ||
| 1492 | kfree(reg->hr_db_regnum); | 1490 | kfree(reg->hr_db_regnum); |
| 1493 | kfree(reg->hr_db_livenodes); | 1491 | kfree(reg->hr_db_livenodes); |
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 1bfe8802cc1e..0d2bf566e39a 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c | |||
| @@ -870,7 +870,7 @@ int o2net_register_handler(u32 msg_type, u32 key, u32 max_len, | |||
| 870 | /* we've had some trouble with handlers seemingly vanishing. */ | 870 | /* we've had some trouble with handlers seemingly vanishing. */ |
| 871 | mlog_bug_on_msg(o2net_handler_tree_lookup(msg_type, key, &p, | 871 | mlog_bug_on_msg(o2net_handler_tree_lookup(msg_type, key, &p, |
| 872 | &parent) == NULL, | 872 | &parent) == NULL, |
| 873 | "couldn't find handler we *just* registerd " | 873 | "couldn't find handler we *just* registered " |
| 874 | "for type %u key %08x\n", msg_type, key); | 874 | "for type %u key %08x\n", msg_type, key); |
| 875 | } | 875 | } |
| 876 | write_unlock(&o2net_handler_lock); | 876 | write_unlock(&o2net_handler_lock); |
| @@ -1165,10 +1165,8 @@ out: | |||
| 1165 | o2net_debug_del_nst(&nst); /* must be before dropping sc and node */ | 1165 | o2net_debug_del_nst(&nst); /* must be before dropping sc and node */ |
| 1166 | if (sc) | 1166 | if (sc) |
| 1167 | sc_put(sc); | 1167 | sc_put(sc); |
| 1168 | if (vec) | 1168 | kfree(vec); |
| 1169 | kfree(vec); | 1169 | kfree(msg); |
| 1170 | if (msg) | ||
| 1171 | kfree(msg); | ||
| 1172 | o2net_complete_nsw(nn, &nsw, 0, 0, 0); | 1170 | o2net_complete_nsw(nn, &nsw, 0, 0, 0); |
| 1173 | return ret; | 1171 | return ret; |
| 1174 | } | 1172 | } |
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 9e89d70df337..dbb17c07656a 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c | |||
| @@ -319,9 +319,7 @@ static void dlm_free_ctxt_mem(struct dlm_ctxt *dlm) | |||
| 319 | if (dlm->master_hash) | 319 | if (dlm->master_hash) |
| 320 | dlm_free_pagevec((void **)dlm->master_hash, DLM_HASH_PAGES); | 320 | dlm_free_pagevec((void **)dlm->master_hash, DLM_HASH_PAGES); |
| 321 | 321 | ||
| 322 | if (dlm->name) | 322 | kfree(dlm->name); |
| 323 | kfree(dlm->name); | ||
| 324 | |||
| 325 | kfree(dlm); | 323 | kfree(dlm); |
| 326 | } | 324 | } |
| 327 | 325 | ||
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 4f7795fb5fc0..88577eb5d712 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c | |||
| @@ -2545,6 +2545,7 @@ int ocfs2_super_lock(struct ocfs2_super *osb, | |||
| 2545 | * everything is up to the caller :) */ | 2545 | * everything is up to the caller :) */ |
| 2546 | status = ocfs2_should_refresh_lock_res(lockres); | 2546 | status = ocfs2_should_refresh_lock_res(lockres); |
| 2547 | if (status < 0) { | 2547 | if (status < 0) { |
| 2548 | ocfs2_cluster_unlock(osb, lockres, level); | ||
| 2548 | mlog_errno(status); | 2549 | mlog_errno(status); |
| 2549 | goto bail; | 2550 | goto bail; |
| 2550 | } | 2551 | } |
| @@ -2553,8 +2554,10 @@ int ocfs2_super_lock(struct ocfs2_super *osb, | |||
| 2553 | 2554 | ||
| 2554 | ocfs2_complete_lock_res_refresh(lockres, status); | 2555 | ocfs2_complete_lock_res_refresh(lockres, status); |
| 2555 | 2556 | ||
| 2556 | if (status < 0) | 2557 | if (status < 0) { |
| 2558 | ocfs2_cluster_unlock(osb, lockres, level); | ||
| 2557 | mlog_errno(status); | 2559 | mlog_errno(status); |
| 2560 | } | ||
| 2558 | ocfs2_track_lock_refresh(lockres); | 2561 | ocfs2_track_lock_refresh(lockres); |
| 2559 | } | 2562 | } |
| 2560 | bail: | 2563 | bail: |
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index f487aa343442..1c39efb71bab 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c | |||
| @@ -282,8 +282,7 @@ search: | |||
| 282 | spin_unlock(&oi->ip_lock); | 282 | spin_unlock(&oi->ip_lock); |
| 283 | 283 | ||
| 284 | out: | 284 | out: |
| 285 | if (new_emi) | 285 | kfree(new_emi); |
| 286 | kfree(new_emi); | ||
| 287 | } | 286 | } |
| 288 | 287 | ||
| 289 | static int ocfs2_last_eb_is_empty(struct inode *inode, | 288 | static int ocfs2_last_eb_is_empty(struct inode *inode, |
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 2dd36af79e26..8eccfabcd12e 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c | |||
| @@ -1234,11 +1234,8 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal, | |||
| 1234 | /* Though we wish to avoid it, we are in fact safe in | 1234 | /* Though we wish to avoid it, we are in fact safe in |
| 1235 | * skipping local alloc cleanup as fsck.ocfs2 is more | 1235 | * skipping local alloc cleanup as fsck.ocfs2 is more |
| 1236 | * than capable of reclaiming unused space. */ | 1236 | * than capable of reclaiming unused space. */ |
| 1237 | if (la_dinode) | 1237 | kfree(la_dinode); |
| 1238 | kfree(la_dinode); | 1238 | kfree(tl_dinode); |
| 1239 | |||
| 1240 | if (tl_dinode) | ||
| 1241 | kfree(tl_dinode); | ||
| 1242 | 1239 | ||
| 1243 | if (qrec) | 1240 | if (qrec) |
| 1244 | ocfs2_free_quota_recovery(qrec); | 1241 | ocfs2_free_quota_recovery(qrec); |
| @@ -1408,8 +1405,7 @@ bail: | |||
| 1408 | 1405 | ||
| 1409 | mutex_unlock(&osb->recovery_lock); | 1406 | mutex_unlock(&osb->recovery_lock); |
| 1410 | 1407 | ||
| 1411 | if (rm_quota) | 1408 | kfree(rm_quota); |
| 1412 | kfree(rm_quota); | ||
| 1413 | 1409 | ||
| 1414 | /* no one is calling kthread_stop() for us so the kthread() api | 1410 | /* no one is calling kthread_stop() for us so the kthread() api |
| 1415 | * requires that we call do_exit(). And it isn't exported, but | 1411 | * requires that we call do_exit(). And it isn't exported, but |
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index a9f78c74d687..aebeacd807c3 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c | |||
| @@ -476,8 +476,7 @@ out: | |||
| 476 | if (local_alloc_inode) | 476 | if (local_alloc_inode) |
| 477 | iput(local_alloc_inode); | 477 | iput(local_alloc_inode); |
| 478 | 478 | ||
| 479 | if (alloc_copy) | 479 | kfree(alloc_copy); |
| 480 | kfree(alloc_copy); | ||
| 481 | } | 480 | } |
| 482 | 481 | ||
| 483 | /* | 482 | /* |
| @@ -534,7 +533,7 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb, | |||
| 534 | mlog_errno(status); | 533 | mlog_errno(status); |
| 535 | 534 | ||
| 536 | bail: | 535 | bail: |
| 537 | if ((status < 0) && (*alloc_copy)) { | 536 | if (status < 0) { |
| 538 | kfree(*alloc_copy); | 537 | kfree(*alloc_copy); |
| 539 | *alloc_copy = NULL; | 538 | *alloc_copy = NULL; |
| 540 | } | 539 | } |
| @@ -1290,8 +1289,7 @@ bail: | |||
| 1290 | if (main_bm_inode) | 1289 | if (main_bm_inode) |
| 1291 | iput(main_bm_inode); | 1290 | iput(main_bm_inode); |
| 1292 | 1291 | ||
| 1293 | if (alloc_copy) | 1292 | kfree(alloc_copy); |
| 1294 | kfree(alloc_copy); | ||
| 1295 | 1293 | ||
| 1296 | if (ac) | 1294 | if (ac) |
| 1297 | ocfs2_free_alloc_context(ac); | 1295 | ocfs2_free_alloc_context(ac); |
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c index 94368017edb3..bf1f8930456f 100644 --- a/fs/ocfs2/stack_o2cb.c +++ b/fs/ocfs2/stack_o2cb.c | |||
| @@ -376,7 +376,7 @@ static int o2cb_cluster_connect(struct ocfs2_cluster_connection *conn) | |||
| 376 | dlm_register_eviction_cb(dlm, &priv->op_eviction_cb); | 376 | dlm_register_eviction_cb(dlm, &priv->op_eviction_cb); |
| 377 | 377 | ||
| 378 | out_free: | 378 | out_free: |
| 379 | if (rc && conn->cc_private) | 379 | if (rc) |
| 380 | kfree(conn->cc_private); | 380 | kfree(conn->cc_private); |
| 381 | 381 | ||
| 382 | out: | 382 | out: |
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 0e91ec22a940..9b6910dec4ba 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c | |||
| @@ -2525,8 +2525,7 @@ static int ocfs2_check_volume(struct ocfs2_super *osb) | |||
| 2525 | mlog_errno(status); | 2525 | mlog_errno(status); |
| 2526 | 2526 | ||
| 2527 | finally: | 2527 | finally: |
| 2528 | if (local_alloc) | 2528 | kfree(local_alloc); |
| 2529 | kfree(local_alloc); | ||
| 2530 | 2529 | ||
| 2531 | if (status) | 2530 | if (status) |
| 2532 | mlog_errno(status); | 2531 | mlog_errno(status); |
| @@ -2553,8 +2552,7 @@ static void ocfs2_delete_osb(struct ocfs2_super *osb) | |||
| 2553 | * we free it here. | 2552 | * we free it here. |
| 2554 | */ | 2553 | */ |
| 2555 | kfree(osb->journal); | 2554 | kfree(osb->journal); |
| 2556 | if (osb->local_alloc_copy) | 2555 | kfree(osb->local_alloc_copy); |
| 2557 | kfree(osb->local_alloc_copy); | ||
| 2558 | kfree(osb->uuid_str); | 2556 | kfree(osb->uuid_str); |
| 2559 | ocfs2_put_dlm_debug(osb->osb_dlm_debug); | 2557 | ocfs2_put_dlm_debug(osb->osb_dlm_debug); |
| 2560 | memset(osb, 0, sizeof(struct ocfs2_super)); | 2558 | memset(osb, 0, sizeof(struct ocfs2_super)); |
diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c index 3d635f4bbb20..f053688d22a3 100644 --- a/fs/ocfs2/sysfile.c +++ b/fs/ocfs2/sysfile.c | |||
| @@ -91,8 +91,7 @@ static struct inode **get_local_system_inode(struct ocfs2_super *osb, | |||
| 91 | } else | 91 | } else |
| 92 | osb->local_system_inodes = local_system_inodes; | 92 | osb->local_system_inodes = local_system_inodes; |
| 93 | spin_unlock(&osb->osb_lock); | 93 | spin_unlock(&osb->osb_lock); |
| 94 | if (unlikely(free)) | 94 | kfree(free); |
| 95 | kfree(free); | ||
| 96 | } | 95 | } |
| 97 | 96 | ||
| 98 | index = (slot * NUM_LOCAL_SYSTEM_INODES) + | 97 | index = (slot * NUM_LOCAL_SYSTEM_INODES) + |
diff --git a/fs/proc/Makefile b/fs/proc/Makefile index 981b05601931..712f24db9600 100644 --- a/fs/proc/Makefile +++ b/fs/proc/Makefile | |||
| @@ -8,7 +8,8 @@ proc-y := nommu.o task_nommu.o | |||
| 8 | proc-$(CONFIG_MMU) := mmu.o task_mmu.o | 8 | proc-$(CONFIG_MMU) := mmu.o task_mmu.o |
| 9 | 9 | ||
| 10 | proc-y += inode.o root.o base.o generic.o array.o \ | 10 | proc-y += inode.o root.o base.o generic.o array.o \ |
| 11 | proc_tty.o fd.o | 11 | fd.o |
| 12 | proc-$(CONFIG_TTY) += proc_tty.o | ||
| 12 | proc-y += cmdline.o | 13 | proc-y += cmdline.o |
| 13 | proc-y += consoles.o | 14 | proc-y += consoles.o |
| 14 | proc-y += cpuinfo.o | 15 | proc-y += cpuinfo.o |
diff --git a/fs/proc/array.c b/fs/proc/array.c index 6a91e6ffbcbd..f7ed9ee46eb9 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c | |||
| @@ -449,7 +449,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, | |||
| 449 | do { | 449 | do { |
| 450 | min_flt += t->min_flt; | 450 | min_flt += t->min_flt; |
| 451 | maj_flt += t->maj_flt; | 451 | maj_flt += t->maj_flt; |
| 452 | gtime += t->gtime; | 452 | gtime += task_gtime(t); |
| 453 | t = next_thread(t); | 453 | t = next_thread(t); |
| 454 | } while (t != task); | 454 | } while (t != task); |
| 455 | 455 | ||
| @@ -472,7 +472,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, | |||
| 472 | min_flt = task->min_flt; | 472 | min_flt = task->min_flt; |
| 473 | maj_flt = task->maj_flt; | 473 | maj_flt = task->maj_flt; |
| 474 | task_cputime_adjusted(task, &utime, &stime); | 474 | task_cputime_adjusted(task, &utime, &stime); |
| 475 | gtime = task->gtime; | 475 | gtime = task_gtime(task); |
| 476 | } | 476 | } |
| 477 | 477 | ||
| 478 | /* scale priority and nice values from timeslices to -20..20 */ | 478 | /* scale priority and nice values from timeslices to -20..20 */ |
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index fe72cd073dea..3131a03d7d37 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c | |||
| @@ -177,20 +177,6 @@ const struct file_operations proc_net_operations = { | |||
| 177 | .readdir = proc_tgid_net_readdir, | 177 | .readdir = proc_tgid_net_readdir, |
| 178 | }; | 178 | }; |
| 179 | 179 | ||
| 180 | |||
| 181 | struct proc_dir_entry *proc_net_fops_create(struct net *net, | ||
| 182 | const char *name, umode_t mode, const struct file_operations *fops) | ||
| 183 | { | ||
| 184 | return proc_create(name, mode, net->proc_net, fops); | ||
| 185 | } | ||
| 186 | EXPORT_SYMBOL_GPL(proc_net_fops_create); | ||
| 187 | |||
| 188 | void proc_net_remove(struct net *net, const char *name) | ||
| 189 | { | ||
| 190 | remove_proc_entry(name, net->proc_net); | ||
| 191 | } | ||
| 192 | EXPORT_SYMBOL_GPL(proc_net_remove); | ||
| 193 | |||
| 194 | static __net_init int proc_net_ns_init(struct net *net) | 180 | static __net_init int proc_net_ns_init(struct net *net) |
| 195 | { | 181 | { |
| 196 | struct proc_dir_entry *netd, *net_statd; | 182 | struct proc_dir_entry *netd, *net_statd; |
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 67de74ca85f4..e4bcb2cf055a 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c | |||
| @@ -418,9 +418,25 @@ static struct file_system_type pstore_fs_type = { | |||
| 418 | .kill_sb = pstore_kill_sb, | 418 | .kill_sb = pstore_kill_sb, |
| 419 | }; | 419 | }; |
| 420 | 420 | ||
| 421 | static struct kobject *pstore_kobj; | ||
| 422 | |||
| 421 | static int __init init_pstore_fs(void) | 423 | static int __init init_pstore_fs(void) |
| 422 | { | 424 | { |
| 423 | return register_filesystem(&pstore_fs_type); | 425 | int err = 0; |
| 426 | |||
| 427 | /* Create a convenient mount point for people to access pstore */ | ||
| 428 | pstore_kobj = kobject_create_and_add("pstore", fs_kobj); | ||
| 429 | if (!pstore_kobj) { | ||
| 430 | err = -ENOMEM; | ||
| 431 | goto out; | ||
| 432 | } | ||
| 433 | |||
| 434 | err = register_filesystem(&pstore_fs_type); | ||
| 435 | if (err < 0) | ||
| 436 | kobject_put(pstore_kobj); | ||
| 437 | |||
| 438 | out: | ||
| 439 | return err; | ||
| 424 | } | 440 | } |
| 425 | module_init(init_pstore_fs) | 441 | module_init(init_pstore_fs) |
| 426 | 442 | ||
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 5ea2e77ff023..86d1038b5a12 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c | |||
| @@ -96,6 +96,27 @@ static const char *get_reason_str(enum kmsg_dump_reason reason) | |||
| 96 | } | 96 | } |
| 97 | } | 97 | } |
| 98 | 98 | ||
| 99 | bool pstore_cannot_block_path(enum kmsg_dump_reason reason) | ||
| 100 | { | ||
| 101 | /* | ||
| 102 | * In case of NMI path, pstore shouldn't be blocked | ||
| 103 | * regardless of reason. | ||
| 104 | */ | ||
| 105 | if (in_nmi()) | ||
| 106 | return true; | ||
| 107 | |||
| 108 | switch (reason) { | ||
| 109 | /* In panic case, other cpus are stopped by smp_send_stop(). */ | ||
| 110 | case KMSG_DUMP_PANIC: | ||
| 111 | /* Emergency restart shouldn't be blocked by spin lock. */ | ||
| 112 | case KMSG_DUMP_EMERG: | ||
| 113 | return true; | ||
| 114 | default: | ||
| 115 | return false; | ||
| 116 | } | ||
| 117 | } | ||
| 118 | EXPORT_SYMBOL_GPL(pstore_cannot_block_path); | ||
| 119 | |||
| 99 | /* | 120 | /* |
| 100 | * callback from kmsg_dump. (s2,l2) has the most recently | 121 | * callback from kmsg_dump. (s2,l2) has the most recently |
| 101 | * written bytes, older bytes are in (s1,l1). Save as much | 122 | * written bytes, older bytes are in (s1,l1). Save as much |
| @@ -114,10 +135,12 @@ static void pstore_dump(struct kmsg_dumper *dumper, | |||
| 114 | 135 | ||
| 115 | why = get_reason_str(reason); | 136 | why = get_reason_str(reason); |
| 116 | 137 | ||
| 117 | if (in_nmi()) { | 138 | if (pstore_cannot_block_path(reason)) { |
| 118 | is_locked = spin_trylock(&psinfo->buf_lock); | 139 | is_locked = spin_trylock_irqsave(&psinfo->buf_lock, flags); |
| 119 | if (!is_locked) | 140 | if (!is_locked) { |
| 120 | pr_err("pstore dump routine blocked in NMI, may corrupt error record\n"); | 141 | pr_err("pstore dump routine blocked in %s path, may corrupt error record\n" |
| 142 | , in_nmi() ? "NMI" : why); | ||
| 143 | } | ||
| 121 | } else | 144 | } else |
| 122 | spin_lock_irqsave(&psinfo->buf_lock, flags); | 145 | spin_lock_irqsave(&psinfo->buf_lock, flags); |
| 123 | oopscount++; | 146 | oopscount++; |
| @@ -143,9 +166,9 @@ static void pstore_dump(struct kmsg_dumper *dumper, | |||
| 143 | total += hsize + len; | 166 | total += hsize + len; |
| 144 | part++; | 167 | part++; |
| 145 | } | 168 | } |
| 146 | if (in_nmi()) { | 169 | if (pstore_cannot_block_path(reason)) { |
| 147 | if (is_locked) | 170 | if (is_locked) |
| 148 | spin_unlock(&psinfo->buf_lock); | 171 | spin_unlock_irqrestore(&psinfo->buf_lock, flags); |
| 149 | } else | 172 | } else |
| 150 | spin_unlock_irqrestore(&psinfo->buf_lock, flags); | 173 | spin_unlock_irqrestore(&psinfo->buf_lock, flags); |
| 151 | } | 174 | } |
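pstore_cannot_block_path() is exported above so that backends can apply the same policy the dumper now uses: in NMI, panic, and emergency-restart paths, only try the lock and drop the record rather than risk blocking. A minimal sketch follows, assuming a hypothetical backend with its own my_backend_buf_lock and my_backend_do_write() helper; neither is part of this patch.

static DEFINE_SPINLOCK(my_backend_buf_lock);

static int my_backend_write(enum kmsg_dump_reason reason,
			    const char *buf, size_t size)
{
	unsigned long flags;

	if (pstore_cannot_block_path(reason)) {
		/* Dropping one record beats deadlocking on the way down. */
		if (!spin_trylock_irqsave(&my_backend_buf_lock, flags))
			return -EBUSY;
	} else {
		spin_lock_irqsave(&my_backend_buf_lock, flags);
	}

	my_backend_do_write(buf, size);	/* hypothetical backend-specific write */

	spin_unlock_irqrestore(&my_backend_buf_lock, flags);
	return 0;
}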
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index f883e7e74305..288f068740f6 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c | |||
| @@ -167,12 +167,16 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type, | |||
| 167 | static size_t ramoops_write_kmsg_hdr(struct persistent_ram_zone *prz) | 167 | static size_t ramoops_write_kmsg_hdr(struct persistent_ram_zone *prz) |
| 168 | { | 168 | { |
| 169 | char *hdr; | 169 | char *hdr; |
| 170 | struct timeval timestamp; | 170 | struct timespec timestamp; |
| 171 | size_t len; | 171 | size_t len; |
| 172 | 172 | ||
| 173 | do_gettimeofday(×tamp); | 173 | /* Report zeroed timestamp if called before timekeeping has resumed. */ |
| 174 | if (__getnstimeofday(×tamp)) { | ||
| 175 | timestamp.tv_sec = 0; | ||
| 176 | timestamp.tv_nsec = 0; | ||
| 177 | } | ||
| 174 | hdr = kasprintf(GFP_ATOMIC, RAMOOPS_KERNMSG_HDR "%lu.%lu\n", | 178 | hdr = kasprintf(GFP_ATOMIC, RAMOOPS_KERNMSG_HDR "%lu.%lu\n", |
| 175 | (long)timestamp.tv_sec, (long)timestamp.tv_usec); | 179 | (long)timestamp.tv_sec, (long)(timestamp.tv_nsec / 1000)); |
| 176 | WARN_ON_ONCE(!hdr); | 180 | WARN_ON_ONCE(!hdr); |
| 177 | len = hdr ? strlen(hdr) : 0; | 181 | len = hdr ? strlen(hdr) : 0; |
| 178 | persistent_ram_write(prz, hdr, len); | 182 | persistent_ram_write(prz, hdr, len); |
| @@ -291,9 +295,8 @@ static void ramoops_free_przs(struct ramoops_context *cxt) | |||
| 291 | kfree(cxt->przs); | 295 | kfree(cxt->przs); |
| 292 | } | 296 | } |
| 293 | 297 | ||
| 294 | static int __devinit ramoops_init_przs(struct device *dev, | 298 | static int ramoops_init_przs(struct device *dev, struct ramoops_context *cxt, |
| 295 | struct ramoops_context *cxt, | 299 | phys_addr_t *paddr, size_t dump_mem_sz) |
| 296 | phys_addr_t *paddr, size_t dump_mem_sz) | ||
| 297 | { | 300 | { |
| 298 | int err = -ENOMEM; | 301 | int err = -ENOMEM; |
| 299 | int i; | 302 | int i; |
| @@ -336,10 +339,9 @@ fail_prz: | |||
| 336 | return err; | 339 | return err; |
| 337 | } | 340 | } |
| 338 | 341 | ||
| 339 | static int __devinit ramoops_init_prz(struct device *dev, | 342 | static int ramoops_init_prz(struct device *dev, struct ramoops_context *cxt, |
| 340 | struct ramoops_context *cxt, | 343 | struct persistent_ram_zone **prz, |
| 341 | struct persistent_ram_zone **prz, | 344 | phys_addr_t *paddr, size_t sz, u32 sig) |
| 342 | phys_addr_t *paddr, size_t sz, u32 sig) | ||
| 343 | { | 345 | { |
| 344 | if (!sz) | 346 | if (!sz) |
| 345 | return 0; | 347 | return 0; |
| @@ -367,7 +369,7 @@ static int __devinit ramoops_init_prz(struct device *dev, | |||
| 367 | return 0; | 369 | return 0; |
| 368 | } | 370 | } |
| 369 | 371 | ||
| 370 | static int __devinit ramoops_probe(struct platform_device *pdev) | 372 | static int ramoops_probe(struct platform_device *pdev) |
| 371 | { | 373 | { |
| 372 | struct device *dev = &pdev->dev; | 374 | struct device *dev = &pdev->dev; |
| 373 | struct ramoops_platform_data *pdata = pdev->dev.platform_data; | 375 | struct ramoops_platform_data *pdata = pdev->dev.platform_data; |
diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c index eecd2a8a84dd..0306303be372 100644 --- a/fs/pstore/ram_core.c +++ b/fs/pstore/ram_core.c | |||
| @@ -390,8 +390,8 @@ static int persistent_ram_buffer_map(phys_addr_t start, phys_addr_t size, | |||
| 390 | return 0; | 390 | return 0; |
| 391 | } | 391 | } |
| 392 | 392 | ||
| 393 | static int __devinit persistent_ram_post_init(struct persistent_ram_zone *prz, | 393 | static int persistent_ram_post_init(struct persistent_ram_zone *prz, u32 sig, |
| 394 | u32 sig, int ecc_size) | 394 | int ecc_size) |
| 395 | { | 395 | { |
| 396 | int ret; | 396 | int ret; |
| 397 | 397 | ||
| @@ -443,9 +443,8 @@ void persistent_ram_free(struct persistent_ram_zone *prz) | |||
| 443 | kfree(prz); | 443 | kfree(prz); |
| 444 | } | 444 | } |
| 445 | 445 | ||
| 446 | struct persistent_ram_zone * __devinit persistent_ram_new(phys_addr_t start, | 446 | struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size, |
| 447 | size_t size, u32 sig, | 447 | u32 sig, int ecc_size) |
| 448 | int ecc_size) | ||
| 449 | { | 448 | { |
| 450 | struct persistent_ram_zone *prz; | 449 | struct persistent_ram_zone *prz; |
| 451 | int ret = -ENOMEM; | 450 | int ret = -ENOMEM; |
diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c index b6addf560483..57199a52a351 100644 --- a/fs/qnx6/inode.c +++ b/fs/qnx6/inode.c | |||
| @@ -285,7 +285,7 @@ static struct buffer_head *qnx6_check_first_superblock(struct super_block *s, | |||
| 285 | if (fs32_to_cpu(sbi, sb->sb_magic) == QNX6_SUPER_MAGIC) { | 285 | if (fs32_to_cpu(sbi, sb->sb_magic) == QNX6_SUPER_MAGIC) { |
| 286 | /* we got a big endian fs */ | 286 | /* we got a big endian fs */ |
| 287 | QNX6DEBUG((KERN_INFO "qnx6: fs got different" | 287 | QNX6DEBUG((KERN_INFO "qnx6: fs got different" |
| 288 | " endianess.\n")); | 288 | " endianness.\n")); |
| 289 | return bh; | 289 | return bh; |
| 290 | } else | 290 | } else |
| 291 | sbi->s_bytesex = BYTESEX_LE; | 291 | sbi->s_bytesex = BYTESEX_LE; |
diff --git a/fs/select.c b/fs/select.c index 2ef72d965036..8c1c96c27062 100644 --- a/fs/select.c +++ b/fs/select.c | |||
| @@ -26,6 +26,7 @@ | |||
| 26 | #include <linux/fs.h> | 26 | #include <linux/fs.h> |
| 27 | #include <linux/rcupdate.h> | 27 | #include <linux/rcupdate.h> |
| 28 | #include <linux/hrtimer.h> | 28 | #include <linux/hrtimer.h> |
| 29 | #include <linux/sched/rt.h> | ||
| 29 | 30 | ||
| 30 | #include <asm/uaccess.h> | 31 | #include <asm/uaccess.h> |
| 31 | 32 | ||
diff --git a/fs/seq_file.c b/fs/seq_file.c index 9d863fb501f9..f2bc3dfd0b88 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c | |||
| @@ -296,7 +296,7 @@ EXPORT_SYMBOL(seq_read); | |||
| 296 | * seq_lseek - ->llseek() method for sequential files. | 296 | * seq_lseek - ->llseek() method for sequential files. |
| 297 | * @file: the file in question | 297 | * @file: the file in question |
| 298 | * @offset: new position | 298 | * @offset: new position |
| 299 | * @origin: 0 for absolute, 1 for relative position | 299 | * @whence: 0 for absolute, 1 for relative position |
| 300 | * | 300 | * |
| 301 | * Ready-made ->f_op->llseek() | 301 | * Ready-made ->f_op->llseek() |
| 302 | */ | 302 | */ |
diff --git a/fs/splice.c b/fs/splice.c index 8890604e3fcd..6909d89d0da5 100644 --- a/fs/splice.c +++ b/fs/splice.c | |||
| @@ -696,8 +696,10 @@ static int pipe_to_sendpage(struct pipe_inode_info *pipe, | |||
| 696 | return -EINVAL; | 696 | return -EINVAL; |
| 697 | 697 | ||
| 698 | more = (sd->flags & SPLICE_F_MORE) ? MSG_MORE : 0; | 698 | more = (sd->flags & SPLICE_F_MORE) ? MSG_MORE : 0; |
| 699 | if (sd->len < sd->total_len) | 699 | |
| 700 | if (sd->len < sd->total_len && pipe->nrbufs > 1) | ||
| 700 | more |= MSG_SENDPAGE_NOTLAST; | 701 | more |= MSG_SENDPAGE_NOTLAST; |
| 702 | |||
| 701 | return file->f_op->sendpage(file, buf->page, buf->offset, | 703 | return file->f_op->sendpage(file, buf->page, buf->offset, |
| 702 | sd->len, &pos, more); | 704 | sd->len, &pos, more); |
| 703 | } | 705 | } |
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c index 2df555c66d57..aec3d5c98c94 100644 --- a/fs/sysfs/group.c +++ b/fs/sysfs/group.c | |||
| @@ -205,6 +205,48 @@ void sysfs_unmerge_group(struct kobject *kobj, | |||
| 205 | } | 205 | } |
| 206 | EXPORT_SYMBOL_GPL(sysfs_unmerge_group); | 206 | EXPORT_SYMBOL_GPL(sysfs_unmerge_group); |
| 207 | 207 | ||
| 208 | /** | ||
| 209 | * sysfs_add_link_to_group - add a symlink to an attribute group. | ||
| 210 | * @kobj: The kobject containing the group. | ||
| 211 | * @group_name: The name of the group. | ||
| 212 | * @target: The target kobject of the symlink to create. | ||
| 213 | * @link_name: The name of the symlink to create. | ||
| 214 | */ | ||
| 215 | int sysfs_add_link_to_group(struct kobject *kobj, const char *group_name, | ||
| 216 | struct kobject *target, const char *link_name) | ||
| 217 | { | ||
| 218 | struct sysfs_dirent *dir_sd; | ||
| 219 | int error = 0; | ||
| 220 | |||
| 221 | dir_sd = sysfs_get_dirent(kobj->sd, NULL, group_name); | ||
| 222 | if (!dir_sd) | ||
| 223 | return -ENOENT; | ||
| 224 | |||
| 225 | error = sysfs_create_link_sd(dir_sd, target, link_name); | ||
| 226 | sysfs_put(dir_sd); | ||
| 227 | |||
| 228 | return error; | ||
| 229 | } | ||
| 230 | EXPORT_SYMBOL_GPL(sysfs_add_link_to_group); | ||
| 231 | |||
| 232 | /** | ||
| 233 | * sysfs_remove_link_from_group - remove a symlink from an attribute group. | ||
| 234 | * @kobj: The kobject containing the group. | ||
| 235 | * @group_name: The name of the group. | ||
| 236 | * @link_name: The name of the symlink to remove. | ||
| 237 | */ | ||
| 238 | void sysfs_remove_link_from_group(struct kobject *kobj, const char *group_name, | ||
| 239 | const char *link_name) | ||
| 240 | { | ||
| 241 | struct sysfs_dirent *dir_sd; | ||
| 242 | |||
| 243 | dir_sd = sysfs_get_dirent(kobj->sd, NULL, group_name); | ||
| 244 | if (dir_sd) { | ||
| 245 | sysfs_hash_and_remove(dir_sd, NULL, link_name); | ||
| 246 | sysfs_put(dir_sd); | ||
| 247 | } | ||
| 248 | } | ||
| 249 | EXPORT_SYMBOL_GPL(sysfs_remove_link_from_group); | ||
| 208 | 250 | ||
| 209 | EXPORT_SYMBOL_GPL(sysfs_create_group); | 251 | EXPORT_SYMBOL_GPL(sysfs_create_group); |
| 210 | EXPORT_SYMBOL_GPL(sysfs_update_group); | 252 | EXPORT_SYMBOL_GPL(sysfs_update_group); |
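The new sysfs_add_link_to_group()/sysfs_remove_link_from_group() helpers above let a caller manage symlinks inside an attribute group it already owns. A hedged usage sketch follows; the "config" group, the "peer" link name, and the two kobjects are illustrative assumptions, not part of this patch.

/* Create <kobj>/config/peer pointing at the partner's sysfs directory. */
static int example_link_peer(struct kobject *kobj, struct kobject *peer)
{
	return sysfs_add_link_to_group(kobj, "config", peer, "peer");
}

static void example_unlink_peer(struct kobject *kobj)
{
	sysfs_remove_link_from_group(kobj, "config", "peer");
}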
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index db940a9be045..8d924b5ec733 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c | |||
| @@ -10,7 +10,7 @@ | |||
| 10 | * Please see Documentation/filesystems/sysfs.txt for more information. | 10 | * Please see Documentation/filesystems/sysfs.txt for more information. |
| 11 | */ | 11 | */ |
| 12 | 12 | ||
| 13 | #define DEBUG | 13 | #define DEBUG |
| 14 | 14 | ||
| 15 | #include <linux/fs.h> | 15 | #include <linux/fs.h> |
| 16 | #include <linux/mount.h> | 16 | #include <linux/mount.h> |
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c index 3c9eb5624f5e..8c940df97a52 100644 --- a/fs/sysfs/symlink.c +++ b/fs/sysfs/symlink.c | |||
| @@ -21,26 +21,17 @@ | |||
| 21 | 21 | ||
| 22 | #include "sysfs.h" | 22 | #include "sysfs.h" |
| 23 | 23 | ||
| 24 | static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target, | 24 | static int sysfs_do_create_link_sd(struct sysfs_dirent *parent_sd, |
| 25 | const char *name, int warn) | 25 | struct kobject *target, |
| 26 | const char *name, int warn) | ||
| 26 | { | 27 | { |
| 27 | struct sysfs_dirent *parent_sd = NULL; | ||
| 28 | struct sysfs_dirent *target_sd = NULL; | 28 | struct sysfs_dirent *target_sd = NULL; |
| 29 | struct sysfs_dirent *sd = NULL; | 29 | struct sysfs_dirent *sd = NULL; |
| 30 | struct sysfs_addrm_cxt acxt; | 30 | struct sysfs_addrm_cxt acxt; |
| 31 | enum kobj_ns_type ns_type; | 31 | enum kobj_ns_type ns_type; |
| 32 | int error; | 32 | int error; |
| 33 | 33 | ||
| 34 | BUG_ON(!name); | 34 | BUG_ON(!name || !parent_sd); |
| 35 | |||
| 36 | if (!kobj) | ||
| 37 | parent_sd = &sysfs_root; | ||
| 38 | else | ||
| 39 | parent_sd = kobj->sd; | ||
| 40 | |||
| 41 | error = -EFAULT; | ||
| 42 | if (!parent_sd) | ||
| 43 | goto out_put; | ||
| 44 | 35 | ||
| 45 | /* target->sd can go away beneath us but is protected with | 36 | /* target->sd can go away beneath us but is protected with |
| 46 | * sysfs_assoc_lock. Fetch target_sd from it. | 37 | * sysfs_assoc_lock. Fetch target_sd from it. |
| @@ -96,6 +87,34 @@ static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target, | |||
| 96 | } | 87 | } |
| 97 | 88 | ||
| 98 | /** | 89 | /** |
| 90 | * sysfs_create_link_sd - create symlink to a given object. | ||
| 91 | * @sd: directory we're creating the link in. | ||
| 92 | * @target: object we're pointing to. | ||
| 93 | * @name: name of the symlink. | ||
| 94 | */ | ||
| 95 | int sysfs_create_link_sd(struct sysfs_dirent *sd, struct kobject *target, | ||
| 96 | const char *name) | ||
| 97 | { | ||
| 98 | return sysfs_do_create_link_sd(sd, target, name, 1); | ||
| 99 | } | ||
| 100 | |||
| 101 | static int sysfs_do_create_link(struct kobject *kobj, struct kobject *target, | ||
| 102 | const char *name, int warn) | ||
| 103 | { | ||
| 104 | struct sysfs_dirent *parent_sd = NULL; | ||
| 105 | |||
| 106 | if (!kobj) | ||
| 107 | parent_sd = &sysfs_root; | ||
| 108 | else | ||
| 109 | parent_sd = kobj->sd; | ||
| 110 | |||
| 111 | if (!parent_sd) | ||
| 112 | return -EFAULT; | ||
| 113 | |||
| 114 | return sysfs_do_create_link_sd(parent_sd, target, name, warn); | ||
| 115 | } | ||
| 116 | |||
| 117 | /** | ||
| 99 | * sysfs_create_link - create symlink between two objects. | 118 | * sysfs_create_link - create symlink between two objects. |
| 100 | * @kobj: object whose directory we're creating the link in. | 119 | * @kobj: object whose directory we're creating the link in. |
| 101 | * @target: object we're pointing to. | 120 | * @target: object we're pointing to. |
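The refactor above splits symlink creation in two: a core helper that takes the parent sysfs_dirent explicitly (now also exported as sysfs_create_link_sd) and a thin wrapper that resolves the parent from the kobject, falling back to the sysfs root. The following is a rough user-space sketch of that resolve-then-delegate shape; all struct and function names below are invented for illustration, not the kernel's:

#include <stdio.h>

struct dirent_model { const char *name; };

static struct dirent_model model_root = { "root" };

struct kobj_model {
    const char *name;
    struct dirent_model *sd;
};

/* core helper: the caller must supply a valid parent */
static int do_create_link_sd(struct dirent_model *parent,
                             const char *target, const char *name)
{
    printf("link '%s' -> '%s' under '%s'\n", name, target, parent->name);
    return 0;
}

/* thin wrapper: resolve the parent, then delegate */
static int do_create_link(struct kobj_model *kobj,
                          const char *target, const char *name)
{
    struct dirent_model *parent = kobj ? kobj->sd : &model_root;

    if (!parent)
        return -1;      /* stands in for -EFAULT */
    return do_create_link_sd(parent, target, name);
}

int main(void)
{
    struct dirent_model dir = { "devices" };
    struct kobj_model kobj = { "dev0", &dir };

    do_create_link(&kobj, "target0", "subsystem");
    do_create_link(NULL, "target1", "class");   /* falls back to root */
    return 0;
}

Keeping the fallback and the NULL check in the wrapper lets internal callers that already hold a parent directory entry call the core helper directly.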
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index d73c0932bbd6..d1e4043eb0c3 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h | |||
| @@ -240,3 +240,5 @@ void unmap_bin_file(struct sysfs_dirent *attr_sd); | |||
| 240 | * symlink.c | 240 | * symlink.c |
| 241 | */ | 241 | */ |
| 242 | extern const struct inode_operations sysfs_symlink_inode_operations; | 242 | extern const struct inode_operations sysfs_symlink_inode_operations; |
| 243 | int sysfs_create_link_sd(struct sysfs_dirent *sd, struct kobject *target, | ||
| 244 | const char *name); | ||
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 5bc77817f382..4f6493c130e0 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c | |||
| @@ -1522,6 +1522,7 @@ static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, | |||
| 1522 | ubifs_release_dirty_inode_budget(c, ui); | 1522 | ubifs_release_dirty_inode_budget(c, ui); |
| 1523 | } | 1523 | } |
| 1524 | 1524 | ||
| 1525 | wait_for_stable_page(page); | ||
| 1525 | unlock_page(page); | 1526 | unlock_page(page); |
| 1526 | return 0; | 1527 | return 0; |
| 1527 | 1528 | ||
diff --git a/fs/udf/super.c b/fs/udf/super.c index d44fb568abe1..e9be396a558d 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c | |||
| @@ -307,7 +307,8 @@ static void udf_sb_free_partitions(struct super_block *sb) | |||
| 307 | { | 307 | { |
| 308 | struct udf_sb_info *sbi = UDF_SB(sb); | 308 | struct udf_sb_info *sbi = UDF_SB(sb); |
| 309 | int i; | 309 | int i; |
| 310 | 310 | if (sbi->s_partmaps == NULL) | |
| 311 | return; | ||
| 311 | for (i = 0; i < sbi->s_partitions; i++) | 312 | for (i = 0; i < sbi->s_partitions; i++) |
| 312 | udf_free_partition(&sbi->s_partmaps[i]); | 313 | udf_free_partition(&sbi->s_partmaps[i]); |
| 313 | kfree(sbi->s_partmaps); | 314 | kfree(sbi->s_partmaps); |
diff --git a/fs/ufs/Kconfig b/fs/ufs/Kconfig index e4f10a40768a..0bf6e16f8d79 100644 --- a/fs/ufs/Kconfig +++ b/fs/ufs/Kconfig | |||
| @@ -29,7 +29,7 @@ config UFS_FS | |||
| 29 | 29 | ||
| 30 | config UFS_FS_WRITE | 30 | config UFS_FS_WRITE |
| 31 | bool "UFS file system write support (DANGEROUS)" | 31 | bool "UFS file system write support (DANGEROUS)" |
| 32 | depends on UFS_FS && EXPERIMENTAL | 32 | depends on UFS_FS |
| 33 | help | 33 | help |
| 34 | Say Y here if you want to try writing to UFS partitions. This is | 34 | Say Y here if you want to try writing to UFS partitions. This is |
| 35 | experimental, so you should back up your UFS partitions beforehand. | 35 | experimental, so you should back up your UFS partitions beforehand. |
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig index 5a7ffe54f5d5..cc33aaf219f1 100644 --- a/fs/xfs/Kconfig +++ b/fs/xfs/Kconfig | |||
| @@ -70,8 +70,8 @@ config XFS_RT | |||
| 70 | If unsure, say N. | 70 | If unsure, say N. |
| 71 | 71 | ||
| 72 | config XFS_DEBUG | 72 | config XFS_DEBUG |
| 73 | bool "XFS Debugging support (EXPERIMENTAL)" | 73 | bool "XFS Debugging support" |
| 74 | depends on XFS_FS && EXPERIMENTAL | 74 | depends on XFS_FS |
| 75 | help | 75 | help |
| 76 | Say Y here to get an XFS build with many debugging features, | 76 | Say Y here to get an XFS build with many debugging features, |
| 77 | including ASSERT checks, function wrappers around macros, | 77 | including ASSERT checks, function wrappers around macros, |
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 393055fe3aef..0ad23253e8b1 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c | |||
| @@ -1925,8 +1925,6 @@ xfs_alloc_fix_freelist( | |||
| 1925 | targs.mp = mp; | 1925 | targs.mp = mp; |
| 1926 | targs.agbp = agbp; | 1926 | targs.agbp = agbp; |
| 1927 | targs.agno = args->agno; | 1927 | targs.agno = args->agno; |
| 1928 | targs.mod = targs.minleft = targs.wasdel = targs.userdata = | ||
| 1929 | targs.minalignslop = 0; | ||
| 1930 | targs.alignment = targs.minlen = targs.prod = targs.isfl = 1; | 1928 | targs.alignment = targs.minlen = targs.prod = targs.isfl = 1; |
| 1931 | targs.type = XFS_ALLOCTYPE_THIS_AG; | 1929 | targs.type = XFS_ALLOCTYPE_THIS_AG; |
| 1932 | targs.pag = pag; | 1930 | targs.pag = pag; |
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 4111a40ebe1a..5f707e537171 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c | |||
| @@ -86,11 +86,11 @@ xfs_destroy_ioend( | |||
| 86 | } | 86 | } |
| 87 | 87 | ||
| 88 | if (ioend->io_iocb) { | 88 | if (ioend->io_iocb) { |
| 89 | inode_dio_done(ioend->io_inode); | ||
| 89 | if (ioend->io_isasync) { | 90 | if (ioend->io_isasync) { |
| 90 | aio_complete(ioend->io_iocb, ioend->io_error ? | 91 | aio_complete(ioend->io_iocb, ioend->io_error ? |
| 91 | ioend->io_error : ioend->io_result, 0); | 92 | ioend->io_error : ioend->io_result, 0); |
| 92 | } | 93 | } |
| 93 | inode_dio_done(ioend->io_inode); | ||
| 94 | } | 94 | } |
| 95 | 95 | ||
| 96 | mempool_free(ioend, xfs_ioend_pool); | 96 | mempool_free(ioend, xfs_ioend_pool); |
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index aaf472532b3c..888683844d98 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c | |||
| @@ -300,9 +300,12 @@ xfs_attr_set_int( | |||
| 300 | if (rsvd) | 300 | if (rsvd) |
| 301 | args.trans->t_flags |= XFS_TRANS_RESERVE; | 301 | args.trans->t_flags |= XFS_TRANS_RESERVE; |
| 302 | 302 | ||
| 303 | if ((error = xfs_trans_reserve(args.trans, args.total, | 303 | error = xfs_trans_reserve(args.trans, args.total, |
| 304 | XFS_ATTRSET_LOG_RES(mp, args.total), 0, | 304 | XFS_ATTRSETM_LOG_RES(mp) + |
| 305 | XFS_TRANS_PERM_LOG_RES, XFS_ATTRSET_LOG_COUNT))) { | 305 | XFS_ATTRSETRT_LOG_RES(mp) * args.total, |
| 306 | 0, XFS_TRANS_PERM_LOG_RES, | ||
| 307 | XFS_ATTRSET_LOG_COUNT); | ||
| 308 | if (error) { | ||
| 306 | xfs_trans_cancel(args.trans, 0); | 309 | xfs_trans_cancel(args.trans, 0); |
| 307 | return(error); | 310 | return(error); |
| 308 | } | 311 | } |
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 0e92d12765d2..b44af9211bd9 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c | |||
| @@ -147,7 +147,10 @@ xfs_bmap_local_to_extents( | |||
| 147 | xfs_fsblock_t *firstblock, /* first block allocated in xaction */ | 147 | xfs_fsblock_t *firstblock, /* first block allocated in xaction */ |
| 148 | xfs_extlen_t total, /* total blocks needed by transaction */ | 148 | xfs_extlen_t total, /* total blocks needed by transaction */ |
| 149 | int *logflagsp, /* inode logging flags */ | 149 | int *logflagsp, /* inode logging flags */ |
| 150 | int whichfork); /* data or attr fork */ | 150 | int whichfork, /* data or attr fork */ |
| 151 | void (*init_fn)(struct xfs_buf *bp, | ||
| 152 | struct xfs_inode *ip, | ||
| 153 | struct xfs_ifork *ifp)); | ||
| 151 | 154 | ||
| 152 | /* | 155 | /* |
| 153 | * Search the extents list for the inode, for the extent containing bno. | 156 | * Search the extents list for the inode, for the extent containing bno. |
| @@ -357,7 +360,42 @@ xfs_bmap_add_attrfork_extents( | |||
| 357 | } | 360 | } |
| 358 | 361 | ||
| 359 | /* | 362 | /* |
| 360 | * Called from xfs_bmap_add_attrfork to handle local format files. | 363 | * Block initialisation functions for local to extent format conversion. |
| 364 | * As these get more complex, they will be moved to the relevant files, | ||
| 365 | * but for now they are too simple to worry about. | ||
| 366 | */ | ||
| 367 | STATIC void | ||
| 368 | xfs_bmap_local_to_extents_init_fn( | ||
| 369 | struct xfs_buf *bp, | ||
| 370 | struct xfs_inode *ip, | ||
| 371 | struct xfs_ifork *ifp) | ||
| 372 | { | ||
| 373 | bp->b_ops = &xfs_bmbt_buf_ops; | ||
| 374 | memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes); | ||
| 375 | } | ||
| 376 | |||
| 377 | STATIC void | ||
| 378 | xfs_symlink_local_to_remote( | ||
| 379 | struct xfs_buf *bp, | ||
| 380 | struct xfs_inode *ip, | ||
| 381 | struct xfs_ifork *ifp) | ||
| 382 | { | ||
| 383 | /* remote symlink blocks are not verifiable until CRCs come along */ | ||
| 384 | bp->b_ops = NULL; | ||
| 385 | memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes); | ||
| 386 | } | ||
| 387 | |||
| 388 | /* | ||
| 389 | * Called from xfs_bmap_add_attrfork to handle local format files. Each | ||
| 390 | * different data fork content type needs a different callout to do the | ||
| 391 | * conversion. Some are basic and only require special block initialisation | ||
| 392 | * callouts for the data formatting; others (directories) are so specialised they | ||
| 393 | * handle everything themselves. | ||
| 394 | * | ||
| 395 | * XXX (dgc): investigate whether directory conversion can use the generic | ||
| 396 | * formatting callout. It should be possible - it's just a very complex | ||
| 397 | * formatter. It would also require passing the transaction through to the init | ||
| 398 | * function. | ||
| 361 | */ | 399 | */ |
| 362 | STATIC int /* error */ | 400 | STATIC int /* error */ |
| 363 | xfs_bmap_add_attrfork_local( | 401 | xfs_bmap_add_attrfork_local( |
| @@ -368,25 +406,29 @@ xfs_bmap_add_attrfork_local( | |||
| 368 | int *flags) /* inode logging flags */ | 406 | int *flags) /* inode logging flags */ |
| 369 | { | 407 | { |
| 370 | xfs_da_args_t dargs; /* args for dir/attr code */ | 408 | xfs_da_args_t dargs; /* args for dir/attr code */ |
| 371 | int error; /* error return value */ | ||
| 372 | xfs_mount_t *mp; /* mount structure pointer */ | ||
| 373 | 409 | ||
| 374 | if (ip->i_df.if_bytes <= XFS_IFORK_DSIZE(ip)) | 410 | if (ip->i_df.if_bytes <= XFS_IFORK_DSIZE(ip)) |
| 375 | return 0; | 411 | return 0; |
| 412 | |||
| 376 | if (S_ISDIR(ip->i_d.di_mode)) { | 413 | if (S_ISDIR(ip->i_d.di_mode)) { |
| 377 | mp = ip->i_mount; | ||
| 378 | memset(&dargs, 0, sizeof(dargs)); | 414 | memset(&dargs, 0, sizeof(dargs)); |
| 379 | dargs.dp = ip; | 415 | dargs.dp = ip; |
| 380 | dargs.firstblock = firstblock; | 416 | dargs.firstblock = firstblock; |
| 381 | dargs.flist = flist; | 417 | dargs.flist = flist; |
| 382 | dargs.total = mp->m_dirblkfsbs; | 418 | dargs.total = ip->i_mount->m_dirblkfsbs; |
| 383 | dargs.whichfork = XFS_DATA_FORK; | 419 | dargs.whichfork = XFS_DATA_FORK; |
| 384 | dargs.trans = tp; | 420 | dargs.trans = tp; |
| 385 | error = xfs_dir2_sf_to_block(&dargs); | 421 | return xfs_dir2_sf_to_block(&dargs); |
| 386 | } else | 422 | } |
| 387 | error = xfs_bmap_local_to_extents(tp, ip, firstblock, 1, flags, | 423 | |
| 388 | XFS_DATA_FORK); | 424 | if (S_ISLNK(ip->i_d.di_mode)) |
| 389 | return error; | 425 | return xfs_bmap_local_to_extents(tp, ip, firstblock, 1, |
| 426 | flags, XFS_DATA_FORK, | ||
| 427 | xfs_symlink_local_to_remote); | ||
| 428 | |||
| 429 | return xfs_bmap_local_to_extents(tp, ip, firstblock, 1, flags, | ||
| 430 | XFS_DATA_FORK, | ||
| 431 | xfs_bmap_local_to_extents_init_fn); | ||
| 390 | } | 432 | } |
| 391 | 433 | ||
| 392 | /* | 434 | /* |
| @@ -3099,8 +3141,6 @@ xfs_bmap_extents_to_btree( | |||
| 3099 | args.fsbno = *firstblock; | 3141 | args.fsbno = *firstblock; |
| 3100 | } | 3142 | } |
| 3101 | args.minlen = args.maxlen = args.prod = 1; | 3143 | args.minlen = args.maxlen = args.prod = 1; |
| 3102 | args.total = args.minleft = args.alignment = args.mod = args.isfl = | ||
| 3103 | args.minalignslop = 0; | ||
| 3104 | args.wasdel = wasdel; | 3144 | args.wasdel = wasdel; |
| 3105 | *logflagsp = 0; | 3145 | *logflagsp = 0; |
| 3106 | if ((error = xfs_alloc_vextent(&args))) { | 3146 | if ((error = xfs_alloc_vextent(&args))) { |
| @@ -3221,7 +3261,10 @@ xfs_bmap_local_to_extents( | |||
| 3221 | xfs_fsblock_t *firstblock, /* first block allocated in xaction */ | 3261 | xfs_fsblock_t *firstblock, /* first block allocated in xaction */ |
| 3222 | xfs_extlen_t total, /* total blocks needed by transaction */ | 3262 | xfs_extlen_t total, /* total blocks needed by transaction */ |
| 3223 | int *logflagsp, /* inode logging flags */ | 3263 | int *logflagsp, /* inode logging flags */ |
| 3224 | int whichfork) /* data or attr fork */ | 3264 | int whichfork, |
| 3265 | void (*init_fn)(struct xfs_buf *bp, | ||
| 3266 | struct xfs_inode *ip, | ||
| 3267 | struct xfs_ifork *ifp)) | ||
| 3225 | { | 3268 | { |
| 3226 | int error; /* error return value */ | 3269 | int error; /* error return value */ |
| 3227 | int flags; /* logging flags returned */ | 3270 | int flags; /* logging flags returned */ |
| @@ -3241,12 +3284,12 @@ xfs_bmap_local_to_extents( | |||
| 3241 | xfs_buf_t *bp; /* buffer for extent block */ | 3284 | xfs_buf_t *bp; /* buffer for extent block */ |
| 3242 | xfs_bmbt_rec_host_t *ep;/* extent record pointer */ | 3285 | xfs_bmbt_rec_host_t *ep;/* extent record pointer */ |
| 3243 | 3286 | ||
| 3287 | ASSERT((ifp->if_flags & | ||
| 3288 | (XFS_IFINLINE|XFS_IFEXTENTS|XFS_IFEXTIREC)) == XFS_IFINLINE); | ||
| 3244 | memset(&args, 0, sizeof(args)); | 3289 | memset(&args, 0, sizeof(args)); |
| 3245 | args.tp = tp; | 3290 | args.tp = tp; |
| 3246 | args.mp = ip->i_mount; | 3291 | args.mp = ip->i_mount; |
| 3247 | args.firstblock = *firstblock; | 3292 | args.firstblock = *firstblock; |
| 3248 | ASSERT((ifp->if_flags & | ||
| 3249 | (XFS_IFINLINE|XFS_IFEXTENTS|XFS_IFEXTIREC)) == XFS_IFINLINE); | ||
| 3250 | /* | 3293 | /* |
| 3251 | * Allocate a block. We know we need only one, since the | 3294 | * Allocate a block. We know we need only one, since the |
| 3252 | * file currently fits in an inode. | 3295 | * file currently fits in an inode. |
| @@ -3259,20 +3302,21 @@ xfs_bmap_local_to_extents( | |||
| 3259 | args.type = XFS_ALLOCTYPE_NEAR_BNO; | 3302 | args.type = XFS_ALLOCTYPE_NEAR_BNO; |
| 3260 | } | 3303 | } |
| 3261 | args.total = total; | 3304 | args.total = total; |
| 3262 | args.mod = args.minleft = args.alignment = args.wasdel = | ||
| 3263 | args.isfl = args.minalignslop = 0; | ||
| 3264 | args.minlen = args.maxlen = args.prod = 1; | 3305 | args.minlen = args.maxlen = args.prod = 1; |
| 3265 | if ((error = xfs_alloc_vextent(&args))) | 3306 | error = xfs_alloc_vextent(&args); |
| 3307 | if (error) | ||
| 3266 | goto done; | 3308 | goto done; |
| 3267 | /* | 3309 | |
| 3268 | * Can't fail, the space was reserved. | 3310 | /* Can't fail, the space was reserved. */ |
| 3269 | */ | ||
| 3270 | ASSERT(args.fsbno != NULLFSBLOCK); | 3311 | ASSERT(args.fsbno != NULLFSBLOCK); |
| 3271 | ASSERT(args.len == 1); | 3312 | ASSERT(args.len == 1); |
| 3272 | *firstblock = args.fsbno; | 3313 | *firstblock = args.fsbno; |
| 3273 | bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0); | 3314 | bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0); |
| 3274 | bp->b_ops = &xfs_bmbt_buf_ops; | 3315 | |
| 3275 | memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes); | 3316 | /* initialise the block and copy the data */ |
| 3317 | init_fn(bp, ip, ifp); | ||
| 3318 | |||
| 3319 | /* account for the change in fork size and log everything */ | ||
| 3276 | xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1); | 3320 | xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1); |
| 3277 | xfs_bmap_forkoff_reset(args.mp, ip, whichfork); | 3321 | xfs_bmap_forkoff_reset(args.mp, ip, whichfork); |
| 3278 | xfs_idata_realloc(ip, -ifp->if_bytes, whichfork); | 3322 | xfs_idata_realloc(ip, -ifp->if_bytes, whichfork); |
| @@ -4680,9 +4724,6 @@ __xfs_bmapi_allocate( | |||
| 4680 | return error; | 4724 | return error; |
| 4681 | } | 4725 | } |
| 4682 | 4726 | ||
| 4683 | if (bma->flags & XFS_BMAPI_STACK_SWITCH) | ||
| 4684 | bma->stack_switch = 1; | ||
| 4685 | |||
| 4686 | error = xfs_bmap_alloc(bma); | 4727 | error = xfs_bmap_alloc(bma); |
| 4687 | if (error) | 4728 | if (error) |
| 4688 | return error; | 4729 | return error; |
| @@ -4922,8 +4963,32 @@ xfs_bmapi_write( | |||
| 4922 | XFS_STATS_INC(xs_blk_mapw); | 4963 | XFS_STATS_INC(xs_blk_mapw); |
| 4923 | 4964 | ||
| 4924 | if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { | 4965 | if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { |
| 4966 | /* | ||
| 4967 | * XXX (dgc): This assumes we are only called for inodes that | ||
| 4968 | * contain content neutral data in local format. Anything that | ||
| 4969 | * contains caller-specific data in local format that needs | ||
| 4970 | * transformation to move to a block format needs to do the | ||
| 4971 | * conversion to extent format itself. | ||
| 4972 | * | ||
| 4973 | * Directory data forks and attribute forks handle this | ||
| 4974 | * themselves, but with the addition of metadata verifiers every | ||
| 4975 | * data fork in local format now contains caller specific data | ||
| 4976 | * and as such conversion through this function is likely to be | ||
| 4977 | * broken. | ||
| 4978 | * | ||
| 4979 | * The only likely user of this branch is for remote symlinks, | ||
| 4980 | * but we cannot overwrite the data fork contents of the symlink | ||
| 4981 | * (EEXIST occurs higher up the stack) and so it will never go | ||
| 4982 | * from local format to extent format here. Hence I don't think | ||
| 4983 | * this branch is ever executed intentionally and we should | ||
| 4984 | * consider removing it and asserting that xfs_bmapi_write() | ||
| 4985 | * cannot be called directly on local format forks. i.e. callers | ||
| 4986 | * are completely responsible for local to extent format | ||
| 4987 | * conversion, not xfs_bmapi_write(). | ||
| 4988 | */ | ||
| 4925 | error = xfs_bmap_local_to_extents(tp, ip, firstblock, total, | 4989 | error = xfs_bmap_local_to_extents(tp, ip, firstblock, total, |
| 4926 | &bma.logflags, whichfork); | 4990 | &bma.logflags, whichfork, |
| 4991 | xfs_bmap_local_to_extents_init_fn); | ||
| 4927 | if (error) | 4992 | if (error) |
| 4928 | goto error0; | 4993 | goto error0; |
| 4929 | } | 4994 | } |
| @@ -4956,6 +5021,9 @@ xfs_bmapi_write( | |||
| 4956 | bma.flist = flist; | 5021 | bma.flist = flist; |
| 4957 | bma.firstblock = firstblock; | 5022 | bma.firstblock = firstblock; |
| 4958 | 5023 | ||
| 5024 | if (flags & XFS_BMAPI_STACK_SWITCH) | ||
| 5025 | bma.stack_switch = 1; | ||
| 5026 | |||
| 4959 | while (bno < end && n < *nmap) { | 5027 | while (bno < end && n < *nmap) { |
| 4960 | inhole = eof || bma.got.br_startoff > bno; | 5028 | inhole = eof || bma.got.br_startoff > bno; |
| 4961 | wasdelay = !inhole && isnullstartblock(bma.got.br_startblock); | 5029 | wasdelay = !inhole && isnullstartblock(bma.got.br_startblock); |
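The xfs_bmap.c changes above thread a per-content-type initialisation callback through xfs_bmap_local_to_extents(), so that plain data forks and remote symlink blocks can be formatted differently while the allocation and logging stay generic. Below is a minimal user-space sketch of that callback shape; the buffer and fork structures are invented stand-ins, not the real XFS types:

#include <stdio.h>
#include <string.h>

struct fake_buf  { char data[64]; const char *ops; };
struct fake_fork { const char *inline_data; size_t bytes; };

typedef void (*init_fn_t)(struct fake_buf *bp, struct fake_fork *ifp);

/* plain data forks get verified btree buffer ops */
static void data_fork_init(struct fake_buf *bp, struct fake_fork *ifp)
{
    bp->ops = "bmbt_buf_ops";
    memcpy(bp->data, ifp->inline_data, ifp->bytes);
}

/* remote symlink blocks are not verifiable (pre-CRC), so no ops */
static void symlink_init(struct fake_buf *bp, struct fake_fork *ifp)
{
    bp->ops = NULL;
    memcpy(bp->data, ifp->inline_data, ifp->bytes);
}

/* generic step: "allocate" the block, then let the callout format it */
static void local_to_extents(struct fake_fork *ifp, init_fn_t init_fn)
{
    struct fake_buf bp = { {0}, NULL };

    init_fn(&bp, ifp);
    printf("block ops=%s data=%.*s\n",
           bp.ops ? bp.ops : "(none)", (int)ifp->bytes, bp.data);
}

int main(void)
{
    struct fake_fork file = { "inline file data", 16 };
    struct fake_fork link = { "target/path", 11 };

    local_to_extents(&file, data_fork_init);
    local_to_extents(&link, symlink_init);
    return 0;
}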
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 26673a0b20e7..4e8f0df82d02 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c | |||
| @@ -175,7 +175,7 @@ xfs_buf_get_maps( | |||
| 175 | bp->b_map_count = map_count; | 175 | bp->b_map_count = map_count; |
| 176 | 176 | ||
| 177 | if (map_count == 1) { | 177 | if (map_count == 1) { |
| 178 | bp->b_maps = &bp->b_map; | 178 | bp->b_maps = &bp->__b_map; |
| 179 | return 0; | 179 | return 0; |
| 180 | } | 180 | } |
| 181 | 181 | ||
| @@ -193,7 +193,7 @@ static void | |||
| 193 | xfs_buf_free_maps( | 193 | xfs_buf_free_maps( |
| 194 | struct xfs_buf *bp) | 194 | struct xfs_buf *bp) |
| 195 | { | 195 | { |
| 196 | if (bp->b_maps != &bp->b_map) { | 196 | if (bp->b_maps != &bp->__b_map) { |
| 197 | kmem_free(bp->b_maps); | 197 | kmem_free(bp->b_maps); |
| 198 | bp->b_maps = NULL; | 198 | bp->b_maps = NULL; |
| 199 | } | 199 | } |
| @@ -377,8 +377,8 @@ xfs_buf_allocate_memory( | |||
| 377 | } | 377 | } |
| 378 | 378 | ||
| 379 | use_alloc_page: | 379 | use_alloc_page: |
| 380 | start = BBTOB(bp->b_map.bm_bn) >> PAGE_SHIFT; | 380 | start = BBTOB(bp->b_maps[0].bm_bn) >> PAGE_SHIFT; |
| 381 | end = (BBTOB(bp->b_map.bm_bn + bp->b_length) + PAGE_SIZE - 1) | 381 | end = (BBTOB(bp->b_maps[0].bm_bn + bp->b_length) + PAGE_SIZE - 1) |
| 382 | >> PAGE_SHIFT; | 382 | >> PAGE_SHIFT; |
| 383 | page_count = end - start; | 383 | page_count = end - start; |
| 384 | error = _xfs_buf_get_pages(bp, page_count, flags); | 384 | error = _xfs_buf_get_pages(bp, page_count, flags); |
| @@ -487,6 +487,7 @@ _xfs_buf_find( | |||
| 487 | struct rb_node *parent; | 487 | struct rb_node *parent; |
| 488 | xfs_buf_t *bp; | 488 | xfs_buf_t *bp; |
| 489 | xfs_daddr_t blkno = map[0].bm_bn; | 489 | xfs_daddr_t blkno = map[0].bm_bn; |
| 490 | xfs_daddr_t eofs; | ||
| 490 | int numblks = 0; | 491 | int numblks = 0; |
| 491 | int i; | 492 | int i; |
| 492 | 493 | ||
| @@ -498,6 +499,23 @@ _xfs_buf_find( | |||
| 498 | ASSERT(!(numbytes < (1 << btp->bt_sshift))); | 499 | ASSERT(!(numbytes < (1 << btp->bt_sshift))); |
| 499 | ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_smask)); | 500 | ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_smask)); |
| 500 | 501 | ||
| 502 | /* | ||
| 503 | * Corrupted block numbers can get through to here, unfortunately, so we | ||
| 504 | * have to check that the buffer falls within the filesystem bounds. | ||
| 505 | */ | ||
| 506 | eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks); | ||
| 507 | if (blkno >= eofs) { | ||
| 508 | /* | ||
| 509 | * XXX (dgc): we should really be returning EFSCORRUPTED here, | ||
| 510 | * but none of the higher level infrastructure supports | ||
| 511 | * returning a specific error on buffer lookup failures. | ||
| 512 | */ | ||
| 513 | xfs_alert(btp->bt_mount, | ||
| 514 | "%s: Block out of range: block 0x%llx, EOFS 0x%llx ", | ||
| 515 | __func__, blkno, eofs); | ||
| 516 | return NULL; | ||
| 517 | } | ||
| 518 | |||
| 501 | /* get tree root */ | 519 | /* get tree root */ |
| 502 | pag = xfs_perag_get(btp->bt_mount, | 520 | pag = xfs_perag_get(btp->bt_mount, |
| 503 | xfs_daddr_to_agno(btp->bt_mount, blkno)); | 521 | xfs_daddr_to_agno(btp->bt_mount, blkno)); |
| @@ -640,7 +658,7 @@ _xfs_buf_read( | |||
| 640 | xfs_buf_flags_t flags) | 658 | xfs_buf_flags_t flags) |
| 641 | { | 659 | { |
| 642 | ASSERT(!(flags & XBF_WRITE)); | 660 | ASSERT(!(flags & XBF_WRITE)); |
| 643 | ASSERT(bp->b_map.bm_bn != XFS_BUF_DADDR_NULL); | 661 | ASSERT(bp->b_maps[0].bm_bn != XFS_BUF_DADDR_NULL); |
| 644 | 662 | ||
| 645 | bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD); | 663 | bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD); |
| 646 | bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD); | 664 | bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD); |
| @@ -933,8 +951,6 @@ xfs_buf_trylock( | |||
| 933 | locked = down_trylock(&bp->b_sema) == 0; | 951 | locked = down_trylock(&bp->b_sema) == 0; |
| 934 | if (locked) | 952 | if (locked) |
| 935 | XB_SET_OWNER(bp); | 953 | XB_SET_OWNER(bp); |
| 936 | else if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE)) | ||
| 937 | xfs_log_force(bp->b_target->bt_mount, 0); | ||
| 938 | 954 | ||
| 939 | trace_xfs_buf_trylock(bp, _RET_IP_); | 955 | trace_xfs_buf_trylock(bp, _RET_IP_); |
| 940 | return locked; | 956 | return locked; |
| @@ -1487,6 +1503,8 @@ restart: | |||
| 1487 | while (!list_empty(&btp->bt_lru)) { | 1503 | while (!list_empty(&btp->bt_lru)) { |
| 1488 | bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru); | 1504 | bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru); |
| 1489 | if (atomic_read(&bp->b_hold) > 1) { | 1505 | if (atomic_read(&bp->b_hold) > 1) { |
| 1506 | trace_xfs_buf_wait_buftarg(bp, _RET_IP_); | ||
| 1507 | list_move_tail(&bp->b_lru, &btp->bt_lru); | ||
| 1490 | spin_unlock(&btp->bt_lru_lock); | 1508 | spin_unlock(&btp->bt_lru_lock); |
| 1491 | delay(100); | 1509 | delay(100); |
| 1492 | goto restart; | 1510 | goto restart; |
| @@ -1709,7 +1727,7 @@ xfs_buf_cmp( | |||
| 1709 | struct xfs_buf *bp = container_of(b, struct xfs_buf, b_list); | 1727 | struct xfs_buf *bp = container_of(b, struct xfs_buf, b_list); |
| 1710 | xfs_daddr_t diff; | 1728 | xfs_daddr_t diff; |
| 1711 | 1729 | ||
| 1712 | diff = ap->b_map.bm_bn - bp->b_map.bm_bn; | 1730 | diff = ap->b_maps[0].bm_bn - bp->b_maps[0].bm_bn; |
| 1713 | if (diff < 0) | 1731 | if (diff < 0) |
| 1714 | return -1; | 1732 | return -1; |
| 1715 | if (diff > 0) | 1733 | if (diff > 0) |
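The new check in _xfs_buf_find() rejects block numbers at or beyond the end of the filesystem before they can reach the buffer cache, returning NULL because the lookup path has no way to return EFSCORRUPTED. A simplified stand-alone model of that bounds test follows; the sectors-per-block factor and the sizes are illustrative assumptions only:

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

#define BLOCKS_PER_FSB 8    /* e.g. 4k filesystem blocks, 512-byte basic blocks */

static bool blkno_in_bounds(uint64_t blkno, uint64_t fs_blocks)
{
    uint64_t eofs = fs_blocks * BLOCKS_PER_FSB;  /* end of filesystem, in basic blocks */

    if (blkno >= eofs) {
        fprintf(stderr,
            "Block out of range: block 0x%llx, EOFS 0x%llx\n",
            (unsigned long long)blkno, (unsigned long long)eofs);
        return false;   /* caller treats this as a failed lookup */
    }
    return true;
}

int main(void)
{
    uint64_t fs_blocks = 1024;      /* toy filesystem size in FSBs */

    printf("in range:  %d\n", blkno_in_bounds(100, fs_blocks));
    printf("past EOFS: %d\n", blkno_in_bounds(9000, fs_blocks));
    return 0;
}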
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 23f5642480bb..433a12ed7b17 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h | |||
| @@ -151,7 +151,7 @@ typedef struct xfs_buf { | |||
| 151 | struct page **b_pages; /* array of page pointers */ | 151 | struct page **b_pages; /* array of page pointers */ |
| 152 | struct page *b_page_array[XB_PAGES]; /* inline pages */ | 152 | struct page *b_page_array[XB_PAGES]; /* inline pages */ |
| 153 | struct xfs_buf_map *b_maps; /* compound buffer map */ | 153 | struct xfs_buf_map *b_maps; /* compound buffer map */ |
| 154 | struct xfs_buf_map b_map; /* inline compound buffer map */ | 154 | struct xfs_buf_map __b_map; /* inline compound buffer map */ |
| 155 | int b_map_count; | 155 | int b_map_count; |
| 156 | int b_io_length; /* IO size in BBs */ | 156 | int b_io_length; /* IO size in BBs */ |
| 157 | atomic_t b_pin_count; /* pin count */ | 157 | atomic_t b_pin_count; /* pin count */ |
| @@ -330,8 +330,8 @@ void xfs_buf_stale(struct xfs_buf *bp); | |||
| 330 | * In future, uncached buffers will pass the block number directly to the io | 330 | * In future, uncached buffers will pass the block number directly to the io |
| 331 | * request function and hence these macros will go away at that point. | 331 | * request function and hence these macros will go away at that point. |
| 332 | */ | 332 | */ |
| 333 | #define XFS_BUF_ADDR(bp) ((bp)->b_map.bm_bn) | 333 | #define XFS_BUF_ADDR(bp) ((bp)->b_maps[0].bm_bn) |
| 334 | #define XFS_BUF_SET_ADDR(bp, bno) ((bp)->b_map.bm_bn = (xfs_daddr_t)(bno)) | 334 | #define XFS_BUF_SET_ADDR(bp, bno) ((bp)->b_maps[0].bm_bn = (xfs_daddr_t)(bno)) |
| 335 | 335 | ||
| 336 | static inline void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref) | 336 | static inline void xfs_buf_set_ref(struct xfs_buf *bp, int lru_ref) |
| 337 | { | 337 | { |
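Renaming b_map to __b_map (and pointing XFS_BUF_ADDR at b_maps[0]) pushes every user through the b_maps pointer, which refers either to the inline map or to a separately allocated array for compound buffers. A small user-space sketch of that inline-or-allocated pattern, with invented type names:

#include <stdio.h>
#include <stdlib.h>

struct map { long bn; int len; };
struct buf {
    struct map *maps;   /* always valid: inline element or allocated array */
    struct map __map;   /* inline storage, never used directly */
    int map_count;
};

static int buf_get_maps(struct buf *bp, int count)
{
    bp->map_count = count;
    if (count == 1) {
        bp->maps = &bp->__map;
        return 0;
    }
    bp->maps = calloc(count, sizeof(*bp->maps));
    return bp->maps ? 0 : -1;
}

static void buf_free_maps(struct buf *bp)
{
    if (bp->maps != &bp->__map)
        free(bp->maps);
    bp->maps = NULL;
}

#define BUF_ADDR(bp)    ((bp)->maps[0].bn)  /* like XFS_BUF_ADDR() */

int main(void)
{
    struct buf a = { 0 }, b = { 0 };

    buf_get_maps(&a, 1);
    buf_get_maps(&b, 3);
    a.maps[0].bn = 64;
    b.maps[0].bn = 128;
    printf("a starts at %ld, b starts at %ld\n", BUF_ADDR(&a), BUF_ADDR(&b));
    buf_free_maps(&a);
    buf_free_maps(&b);
    return 0;
}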
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index becf4a97efc6..cf263476d6b4 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c | |||
| @@ -37,109 +37,6 @@ static inline struct xfs_buf_log_item *BUF_ITEM(struct xfs_log_item *lip) | |||
| 37 | return container_of(lip, struct xfs_buf_log_item, bli_item); | 37 | return container_of(lip, struct xfs_buf_log_item, bli_item); |
| 38 | } | 38 | } |
| 39 | 39 | ||
| 40 | |||
| 41 | #ifdef XFS_TRANS_DEBUG | ||
| 42 | /* | ||
| 43 | * This function uses an alternate strategy for tracking the bytes | ||
| 44 | * that the user requests to be logged. This can then be used | ||
| 45 | * in conjunction with the bli_orig array in the buf log item to | ||
| 46 | * catch bugs in our callers' code. | ||
| 47 | * | ||
| 48 | * We also double check the bits set in xfs_buf_item_log using a | ||
| 49 | * simple algorithm to check that every byte is accounted for. | ||
| 50 | */ | ||
| 51 | STATIC void | ||
| 52 | xfs_buf_item_log_debug( | ||
| 53 | xfs_buf_log_item_t *bip, | ||
| 54 | uint first, | ||
| 55 | uint last) | ||
| 56 | { | ||
| 57 | uint x; | ||
| 58 | uint byte; | ||
| 59 | uint nbytes; | ||
| 60 | uint chunk_num; | ||
| 61 | uint word_num; | ||
| 62 | uint bit_num; | ||
| 63 | uint bit_set; | ||
| 64 | uint *wordp; | ||
| 65 | |||
| 66 | ASSERT(bip->bli_logged != NULL); | ||
| 67 | byte = first; | ||
| 68 | nbytes = last - first + 1; | ||
| 69 | bfset(bip->bli_logged, first, nbytes); | ||
| 70 | for (x = 0; x < nbytes; x++) { | ||
| 71 | chunk_num = byte >> XFS_BLF_SHIFT; | ||
| 72 | word_num = chunk_num >> BIT_TO_WORD_SHIFT; | ||
| 73 | bit_num = chunk_num & (NBWORD - 1); | ||
| 74 | wordp = &(bip->bli_format.blf_data_map[word_num]); | ||
| 75 | bit_set = *wordp & (1 << bit_num); | ||
| 76 | ASSERT(bit_set); | ||
| 77 | byte++; | ||
| 78 | } | ||
| 79 | } | ||
| 80 | |||
| 81 | /* | ||
| 82 | * This function is called when we flush something into a buffer without | ||
| 83 | * logging it. This happens for things like inodes which are logged | ||
| 84 | * separately from the buffer. | ||
| 85 | */ | ||
| 86 | void | ||
| 87 | xfs_buf_item_flush_log_debug( | ||
| 88 | xfs_buf_t *bp, | ||
| 89 | uint first, | ||
| 90 | uint last) | ||
| 91 | { | ||
| 92 | xfs_buf_log_item_t *bip = bp->b_fspriv; | ||
| 93 | uint nbytes; | ||
| 94 | |||
| 95 | if (bip == NULL || (bip->bli_item.li_type != XFS_LI_BUF)) | ||
| 96 | return; | ||
| 97 | |||
| 98 | ASSERT(bip->bli_logged != NULL); | ||
| 99 | nbytes = last - first + 1; | ||
| 100 | bfset(bip->bli_logged, first, nbytes); | ||
| 101 | } | ||
| 102 | |||
| 103 | /* | ||
| 104 | * This function is called to verify that our callers have logged | ||
| 105 | * all the bytes that they changed. | ||
| 106 | * | ||
| 107 | * It does this by comparing the original copy of the buffer stored in | ||
| 108 | * the buf log item's bli_orig array to the current copy of the buffer | ||
| 109 | * and ensuring that all bytes which mismatch are set in the bli_logged | ||
| 110 | * array of the buf log item. | ||
| 111 | */ | ||
| 112 | STATIC void | ||
| 113 | xfs_buf_item_log_check( | ||
| 114 | xfs_buf_log_item_t *bip) | ||
| 115 | { | ||
| 116 | char *orig; | ||
| 117 | char *buffer; | ||
| 118 | int x; | ||
| 119 | xfs_buf_t *bp; | ||
| 120 | |||
| 121 | ASSERT(bip->bli_orig != NULL); | ||
| 122 | ASSERT(bip->bli_logged != NULL); | ||
| 123 | |||
| 124 | bp = bip->bli_buf; | ||
| 125 | ASSERT(bp->b_length > 0); | ||
| 126 | ASSERT(bp->b_addr != NULL); | ||
| 127 | orig = bip->bli_orig; | ||
| 128 | buffer = bp->b_addr; | ||
| 129 | for (x = 0; x < BBTOB(bp->b_length); x++) { | ||
| 130 | if (orig[x] != buffer[x] && !btst(bip->bli_logged, x)) { | ||
| 131 | xfs_emerg(bp->b_mount, | ||
| 132 | "%s: bip %x buffer %x orig %x index %d", | ||
| 133 | __func__, bip, bp, orig, x); | ||
| 134 | ASSERT(0); | ||
| 135 | } | ||
| 136 | } | ||
| 137 | } | ||
| 138 | #else | ||
| 139 | #define xfs_buf_item_log_debug(x,y,z) | ||
| 140 | #define xfs_buf_item_log_check(x) | ||
| 141 | #endif | ||
| 142 | |||
| 143 | STATIC void xfs_buf_do_callbacks(struct xfs_buf *bp); | 40 | STATIC void xfs_buf_do_callbacks(struct xfs_buf *bp); |
| 144 | 41 | ||
| 145 | /* | 42 | /* |
| @@ -237,7 +134,7 @@ xfs_buf_item_size( | |||
| 237 | * cancel flag in it. | 134 | * cancel flag in it. |
| 238 | */ | 135 | */ |
| 239 | trace_xfs_buf_item_size_stale(bip); | 136 | trace_xfs_buf_item_size_stale(bip); |
| 240 | ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); | 137 | ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); |
| 241 | return bip->bli_format_count; | 138 | return bip->bli_format_count; |
| 242 | } | 139 | } |
| 243 | 140 | ||
| @@ -278,7 +175,7 @@ xfs_buf_item_format_segment( | |||
| 278 | uint buffer_offset; | 175 | uint buffer_offset; |
| 279 | 176 | ||
| 280 | /* copy the flags across from the base format item */ | 177 | /* copy the flags across from the base format item */ |
| 281 | blfp->blf_flags = bip->bli_format.blf_flags; | 178 | blfp->blf_flags = bip->__bli_format.blf_flags; |
| 282 | 179 | ||
| 283 | /* | 180 | /* |
| 284 | * Base size is the actual size of the ondisk structure - it reflects | 181 | * Base size is the actual size of the ondisk structure - it reflects |
| @@ -287,6 +184,17 @@ xfs_buf_item_format_segment( | |||
| 287 | */ | 184 | */ |
| 288 | base_size = offsetof(struct xfs_buf_log_format, blf_data_map) + | 185 | base_size = offsetof(struct xfs_buf_log_format, blf_data_map) + |
| 289 | (blfp->blf_map_size * sizeof(blfp->blf_data_map[0])); | 186 | (blfp->blf_map_size * sizeof(blfp->blf_data_map[0])); |
| 187 | |||
| 188 | nvecs = 0; | ||
| 189 | first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0); | ||
| 190 | if (!(bip->bli_flags & XFS_BLI_STALE) && first_bit == -1) { | ||
| 191 | /* | ||
| 192 | * If the map is not be dirty in the transaction, mark | ||
| 193 | * the size as zero and do not advance the vector pointer. | ||
| 194 | */ | ||
| 195 | goto out; | ||
| 196 | } | ||
| 197 | |||
| 290 | vecp->i_addr = blfp; | 198 | vecp->i_addr = blfp; |
| 291 | vecp->i_len = base_size; | 199 | vecp->i_len = base_size; |
| 292 | vecp->i_type = XLOG_REG_TYPE_BFORMAT; | 200 | vecp->i_type = XLOG_REG_TYPE_BFORMAT; |
| @@ -301,15 +209,13 @@ xfs_buf_item_format_segment( | |||
| 301 | */ | 209 | */ |
| 302 | trace_xfs_buf_item_format_stale(bip); | 210 | trace_xfs_buf_item_format_stale(bip); |
| 303 | ASSERT(blfp->blf_flags & XFS_BLF_CANCEL); | 211 | ASSERT(blfp->blf_flags & XFS_BLF_CANCEL); |
| 304 | blfp->blf_size = nvecs; | 212 | goto out; |
| 305 | return vecp; | ||
| 306 | } | 213 | } |
| 307 | 214 | ||
| 308 | /* | 215 | /* |
| 309 | * Fill in an iovec for each set of contiguous chunks. | 216 | * Fill in an iovec for each set of contiguous chunks. |
| 310 | */ | 217 | */ |
| 311 | first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0); | 218 | |
| 312 | ASSERT(first_bit != -1); | ||
| 313 | last_bit = first_bit; | 219 | last_bit = first_bit; |
| 314 | nbits = 1; | 220 | nbits = 1; |
| 315 | for (;;) { | 221 | for (;;) { |
| @@ -371,7 +277,8 @@ xfs_buf_item_format_segment( | |||
| 371 | nbits++; | 277 | nbits++; |
| 372 | } | 278 | } |
| 373 | } | 279 | } |
| 374 | bip->bli_format.blf_size = nvecs; | 280 | out: |
| 281 | blfp->blf_size = nvecs; | ||
| 375 | return vecp; | 282 | return vecp; |
| 376 | } | 283 | } |
| 377 | 284 | ||
| @@ -405,7 +312,7 @@ xfs_buf_item_format( | |||
| 405 | if (bip->bli_flags & XFS_BLI_INODE_BUF) { | 312 | if (bip->bli_flags & XFS_BLI_INODE_BUF) { |
| 406 | if (!((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && | 313 | if (!((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) && |
| 407 | xfs_log_item_in_current_chkpt(lip))) | 314 | xfs_log_item_in_current_chkpt(lip))) |
| 408 | bip->bli_format.blf_flags |= XFS_BLF_INODE_BUF; | 315 | bip->__bli_format.blf_flags |= XFS_BLF_INODE_BUF; |
| 409 | bip->bli_flags &= ~XFS_BLI_INODE_BUF; | 316 | bip->bli_flags &= ~XFS_BLI_INODE_BUF; |
| 410 | } | 317 | } |
| 411 | 318 | ||
| @@ -419,7 +326,6 @@ xfs_buf_item_format( | |||
| 419 | * Check to make sure everything is consistent. | 326 | * Check to make sure everything is consistent. |
| 420 | */ | 327 | */ |
| 421 | trace_xfs_buf_item_format(bip); | 328 | trace_xfs_buf_item_format(bip); |
| 422 | xfs_buf_item_log_check(bip); | ||
| 423 | } | 329 | } |
| 424 | 330 | ||
| 425 | /* | 331 | /* |
| @@ -485,7 +391,7 @@ xfs_buf_item_unpin( | |||
| 485 | ASSERT(bip->bli_flags & XFS_BLI_STALE); | 391 | ASSERT(bip->bli_flags & XFS_BLI_STALE); |
| 486 | ASSERT(xfs_buf_islocked(bp)); | 392 | ASSERT(xfs_buf_islocked(bp)); |
| 487 | ASSERT(XFS_BUF_ISSTALE(bp)); | 393 | ASSERT(XFS_BUF_ISSTALE(bp)); |
| 488 | ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); | 394 | ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); |
| 489 | 395 | ||
| 490 | trace_xfs_buf_item_unpin_stale(bip); | 396 | trace_xfs_buf_item_unpin_stale(bip); |
| 491 | 397 | ||
| @@ -563,8 +469,18 @@ xfs_buf_item_push( | |||
| 563 | 469 | ||
| 564 | if (xfs_buf_ispinned(bp)) | 470 | if (xfs_buf_ispinned(bp)) |
| 565 | return XFS_ITEM_PINNED; | 471 | return XFS_ITEM_PINNED; |
| 566 | if (!xfs_buf_trylock(bp)) | 472 | if (!xfs_buf_trylock(bp)) { |
| 473 | /* | ||
| 474 | * If we have just raced with a buffer being pinned and it has | ||
| 475 | * been marked stale, we could end up stalling until someone else | ||
| 476 | * issues a log force to unpin the stale buffer. Check for the | ||
| 477 | * race condition here so xfsaild recognizes the buffer is pinned | ||
| 478 | * and queues a log force to move it along. | ||
| 479 | */ | ||
| 480 | if (xfs_buf_ispinned(bp)) | ||
| 481 | return XFS_ITEM_PINNED; | ||
| 567 | return XFS_ITEM_LOCKED; | 482 | return XFS_ITEM_LOCKED; |
| 483 | } | ||
| 568 | 484 | ||
| 569 | ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); | 485 | ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); |
| 570 | 486 | ||
| @@ -601,7 +517,7 @@ xfs_buf_item_unlock( | |||
| 601 | { | 517 | { |
| 602 | struct xfs_buf_log_item *bip = BUF_ITEM(lip); | 518 | struct xfs_buf_log_item *bip = BUF_ITEM(lip); |
| 603 | struct xfs_buf *bp = bip->bli_buf; | 519 | struct xfs_buf *bp = bip->bli_buf; |
| 604 | int aborted; | 520 | int aborted, clean, i; |
| 605 | uint hold; | 521 | uint hold; |
| 606 | 522 | ||
| 607 | /* Clear the buffer's association with this transaction. */ | 523 | /* Clear the buffer's association with this transaction. */ |
| @@ -631,7 +547,7 @@ xfs_buf_item_unlock( | |||
| 631 | */ | 547 | */ |
| 632 | if (bip->bli_flags & XFS_BLI_STALE) { | 548 | if (bip->bli_flags & XFS_BLI_STALE) { |
| 633 | trace_xfs_buf_item_unlock_stale(bip); | 549 | trace_xfs_buf_item_unlock_stale(bip); |
| 634 | ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); | 550 | ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); |
| 635 | if (!aborted) { | 551 | if (!aborted) { |
| 636 | atomic_dec(&bip->bli_refcount); | 552 | atomic_dec(&bip->bli_refcount); |
| 637 | return; | 553 | return; |
| @@ -642,12 +558,27 @@ xfs_buf_item_unlock( | |||
| 642 | 558 | ||
| 643 | /* | 559 | /* |
| 644 | * If the buf item isn't tracking any data, free it, otherwise drop the | 560 | * If the buf item isn't tracking any data, free it, otherwise drop the |
| 645 | * reference we hold to it. | 561 | * reference we hold to it. If we are aborting the transaction, this may |
| 562 | * be the only reference to the buf item, so we free it anyway | ||
| 563 | * regardless of whether it is dirty or not. A dirty abort implies a | ||
| 564 | * shutdown, anyway. | ||
| 646 | */ | 565 | */ |
| 647 | if (xfs_bitmap_empty(bip->bli_format.blf_data_map, | 566 | clean = 1; |
| 648 | bip->bli_format.blf_map_size)) | 567 | for (i = 0; i < bip->bli_format_count; i++) { |
| 568 | if (!xfs_bitmap_empty(bip->bli_formats[i].blf_data_map, | ||
| 569 | bip->bli_formats[i].blf_map_size)) { | ||
| 570 | clean = 0; | ||
| 571 | break; | ||
| 572 | } | ||
| 573 | } | ||
| 574 | if (clean) | ||
| 649 | xfs_buf_item_relse(bp); | 575 | xfs_buf_item_relse(bp); |
| 650 | else | 576 | else if (aborted) { |
| 577 | if (atomic_dec_and_test(&bip->bli_refcount)) { | ||
| 578 | ASSERT(XFS_FORCED_SHUTDOWN(lip->li_mountp)); | ||
| 579 | xfs_buf_item_relse(bp); | ||
| 580 | } | ||
| 581 | } else | ||
| 651 | atomic_dec(&bip->bli_refcount); | 582 | atomic_dec(&bip->bli_refcount); |
| 652 | 583 | ||
| 653 | if (!hold) | 584 | if (!hold) |
| @@ -716,7 +647,7 @@ xfs_buf_item_get_format( | |||
| 716 | bip->bli_format_count = count; | 647 | bip->bli_format_count = count; |
| 717 | 648 | ||
| 718 | if (count == 1) { | 649 | if (count == 1) { |
| 719 | bip->bli_formats = &bip->bli_format; | 650 | bip->bli_formats = &bip->__bli_format; |
| 720 | return 0; | 651 | return 0; |
| 721 | } | 652 | } |
| 722 | 653 | ||
| @@ -731,7 +662,7 @@ STATIC void | |||
| 731 | xfs_buf_item_free_format( | 662 | xfs_buf_item_free_format( |
| 732 | struct xfs_buf_log_item *bip) | 663 | struct xfs_buf_log_item *bip) |
| 733 | { | 664 | { |
| 734 | if (bip->bli_formats != &bip->bli_format) { | 665 | if (bip->bli_formats != &bip->__bli_format) { |
| 735 | kmem_free(bip->bli_formats); | 666 | kmem_free(bip->bli_formats); |
| 736 | bip->bli_formats = NULL; | 667 | bip->bli_formats = NULL; |
| 737 | } | 668 | } |
| @@ -898,8 +829,6 @@ xfs_buf_item_log_segment( | |||
| 898 | mask = (1 << end_bit) - 1; | 829 | mask = (1 << end_bit) - 1; |
| 899 | *wordp |= mask; | 830 | *wordp |= mask; |
| 900 | } | 831 | } |
| 901 | |||
| 902 | xfs_buf_item_log_debug(bip, first, last); | ||
| 903 | } | 832 | } |
| 904 | 833 | ||
| 905 | /* | 834 | /* |
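The buf log item changes above make the format and unlock paths honour per-segment dirty bitmaps: a segment with no dirty bits emits no log vectors, and the item only counts as clean when every segment's bitmap is empty. A rough stand-alone model of that check follows; the structure names are made up:

#include <stdio.h>
#include <stdbool.h>

#define MAP_WORDS 4

struct seg { unsigned int data_map[MAP_WORDS]; };

static bool bitmap_empty(const unsigned int *map, int words)
{
    for (int i = 0; i < words; i++)
        if (map[i])
            return false;
    return true;
}

/* count how many segments would actually emit log vectors */
static int count_dirty_segments(const struct seg *segs, int nsegs)
{
    int dirty = 0;

    for (int i = 0; i < nsegs; i++)
        if (!bitmap_empty(segs[i].data_map, MAP_WORDS))
            dirty++;    /* clean segments are skipped entirely */
    return dirty;
}

int main(void)
{
    static struct seg segs[3];          /* zero-initialised: all clean */

    segs[2].data_map[1] = 0x10;         /* only the last segment is dirty */
    printf("dirty segments: %d\n", count_dirty_segments(segs, 3));
    printf("item clean: %s\n",
           count_dirty_segments(segs, 3) == 0 ? "yes" : "no");
    return 0;
}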
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index 6850f49f4af3..ee36c88ecfde 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h | |||
| @@ -98,13 +98,9 @@ typedef struct xfs_buf_log_item { | |||
| 98 | unsigned int bli_flags; /* misc flags */ | 98 | unsigned int bli_flags; /* misc flags */ |
| 99 | unsigned int bli_recur; /* lock recursion count */ | 99 | unsigned int bli_recur; /* lock recursion count */ |
| 100 | atomic_t bli_refcount; /* cnt of tp refs */ | 100 | atomic_t bli_refcount; /* cnt of tp refs */ |
| 101 | #ifdef XFS_TRANS_DEBUG | ||
| 102 | char *bli_orig; /* original buffer copy */ | ||
| 103 | char *bli_logged; /* bytes logged (bitmap) */ | ||
| 104 | #endif | ||
| 105 | int bli_format_count; /* count of headers */ | 101 | int bli_format_count; /* count of headers */ |
| 106 | struct xfs_buf_log_format *bli_formats; /* array of in-log header ptrs */ | 102 | struct xfs_buf_log_format *bli_formats; /* array of in-log header ptrs */ |
| 107 | struct xfs_buf_log_format bli_format; /* embedded in-log header */ | 103 | struct xfs_buf_log_format __bli_format; /* embedded in-log header */ |
| 108 | } xfs_buf_log_item_t; | 104 | } xfs_buf_log_item_t; |
| 109 | 105 | ||
| 110 | void xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *); | 106 | void xfs_buf_item_init(struct xfs_buf *, struct xfs_mount *); |
| @@ -117,16 +113,6 @@ void xfs_buf_attach_iodone(struct xfs_buf *, | |||
| 117 | void xfs_buf_iodone_callbacks(struct xfs_buf *); | 113 | void xfs_buf_iodone_callbacks(struct xfs_buf *); |
| 118 | void xfs_buf_iodone(struct xfs_buf *, struct xfs_log_item *); | 114 | void xfs_buf_iodone(struct xfs_buf *, struct xfs_log_item *); |
| 119 | 115 | ||
| 120 | #ifdef XFS_TRANS_DEBUG | ||
| 121 | void | ||
| 122 | xfs_buf_item_flush_log_debug( | ||
| 123 | struct xfs_buf *bp, | ||
| 124 | uint first, | ||
| 125 | uint last); | ||
| 126 | #else | ||
| 127 | #define xfs_buf_item_flush_log_debug(bp, first, last) | ||
| 128 | #endif | ||
| 129 | |||
| 130 | #endif /* __KERNEL__ */ | 116 | #endif /* __KERNEL__ */ |
| 131 | 117 | ||
| 132 | #endif /* __XFS_BUF_ITEM_H__ */ | 118 | #endif /* __XFS_BUF_ITEM_H__ */ |
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index d0e9c74d3d96..a8bd26b82ecb 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c | |||
| @@ -246,10 +246,10 @@ xfs_swap_extents( | |||
| 246 | goto out_unlock; | 246 | goto out_unlock; |
| 247 | } | 247 | } |
| 248 | 248 | ||
| 249 | error = -filemap_write_and_wait(VFS_I(ip)->i_mapping); | 249 | error = -filemap_write_and_wait(VFS_I(tip)->i_mapping); |
| 250 | if (error) | 250 | if (error) |
| 251 | goto out_unlock; | 251 | goto out_unlock; |
| 252 | truncate_pagecache_range(VFS_I(ip), 0, -1); | 252 | truncate_pagecache_range(VFS_I(tip), 0, -1); |
| 253 | 253 | ||
| 254 | /* Verify O_DIRECT for ftmp */ | 254 | /* Verify O_DIRECT for ftmp */ |
| 255 | if (VN_CACHED(VFS_I(tip)) != 0) { | 255 | if (VN_CACHED(VFS_I(tip)) != 0) { |
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c index 7536faaa61e7..12afe07a91d7 100644 --- a/fs/xfs/xfs_dir2_block.c +++ b/fs/xfs/xfs_dir2_block.c | |||
| @@ -355,10 +355,12 @@ xfs_dir2_block_addname( | |||
| 355 | /* | 355 | /* |
| 356 | * If need to compact the leaf entries, do it now. | 356 | * If need to compact the leaf entries, do it now. |
| 357 | */ | 357 | */ |
| 358 | if (compact) | 358 | if (compact) { |
| 359 | xfs_dir2_block_compact(tp, bp, hdr, btp, blp, &needlog, | 359 | xfs_dir2_block_compact(tp, bp, hdr, btp, blp, &needlog, |
| 360 | &lfloghigh, &lfloglow); | 360 | &lfloghigh, &lfloglow); |
| 361 | else if (btp->stale) { | 361 | /* recalculate blp post-compaction */ |
| 362 | blp = xfs_dir2_block_leaf_p(btp); | ||
| 363 | } else if (btp->stale) { | ||
| 362 | /* | 364 | /* |
| 363 | * Set leaf logging boundaries to impossible state. | 365 | * Set leaf logging boundaries to impossible state. |
| 364 | * For the no-stale case they're set explicitly. | 366 | * For the no-stale case they're set explicitly. |
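The dir2 fix recomputes blp after xfs_dir2_block_compact() because the leaf array is addressed relative to the block tail and its entry count, so compaction moves its start and invalidates the earlier pointer. A tiny illustration of that re-derivation, using a toy block layout rather than the real directory format:

#include <stdio.h>

#define BLOCK_INTS 16

static int block[BLOCK_INTS];

/* the leaf array occupies the last 'count' slots of the block */
static int *leaf_ptr(int count)
{
    return &block[BLOCK_INTS - count];
}

int main(void)
{
    int count = 6;
    int *blp = leaf_ptr(count);     /* computed before compaction */

    count = 4;                      /* compaction dropped two stale entries */
    /* the old pointer is now wrong: re-derive it, as the patch does */
    blp = leaf_ptr(count);

    printf("leaf array now starts at slot %d\n", (int)(blp - block));
    return 0;
}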
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 9e1bf5294c91..8025eb23ad72 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c | |||
| @@ -612,15 +612,9 @@ xfs_qm_dqread( | |||
| 612 | if (flags & XFS_QMOPT_DQALLOC) { | 612 | if (flags & XFS_QMOPT_DQALLOC) { |
| 613 | tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC); | 613 | tp = xfs_trans_alloc(mp, XFS_TRANS_QM_DQALLOC); |
| 614 | error = xfs_trans_reserve(tp, XFS_QM_DQALLOC_SPACE_RES(mp), | 614 | error = xfs_trans_reserve(tp, XFS_QM_DQALLOC_SPACE_RES(mp), |
| 615 | XFS_WRITE_LOG_RES(mp) + | 615 | XFS_QM_DQALLOC_LOG_RES(mp), 0, |
| 616 | /* | 616 | XFS_TRANS_PERM_LOG_RES, |
| 617 | * Round the chunklen up to the next multiple | 617 | XFS_WRITE_LOG_COUNT); |
| 618 | * of 128 (buf log item chunk size)). | ||
| 619 | */ | ||
| 620 | BBTOB(mp->m_quotainfo->qi_dqchunklen) - 1 + 128, | ||
| 621 | 0, | ||
| 622 | XFS_TRANS_PERM_LOG_RES, | ||
| 623 | XFS_WRITE_LOG_COUNT); | ||
| 624 | if (error) | 618 | if (error) |
| 625 | goto error1; | 619 | goto error1; |
| 626 | cancelflags = XFS_TRANS_RELEASE_LOG_RES; | 620 | cancelflags = XFS_TRANS_RELEASE_LOG_RES; |
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 94eaeedc5498..2866b8c78b7a 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c | |||
| @@ -709,8 +709,8 @@ xfs_fs_log_dummy( | |||
| 709 | int error; | 709 | int error; |
| 710 | 710 | ||
| 711 | tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1, KM_SLEEP); | 711 | tp = _xfs_trans_alloc(mp, XFS_TRANS_DUMMY1, KM_SLEEP); |
| 712 | error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0, | 712 | error = xfs_trans_reserve(tp, 0, XFS_SB_LOG_RES(mp), 0, 0, |
| 713 | XFS_DEFAULT_LOG_COUNT); | 713 | XFS_DEFAULT_LOG_COUNT); |
| 714 | if (error) { | 714 | if (error) { |
| 715 | xfs_trans_cancel(tp, 0); | 715 | xfs_trans_cancel(tp, 0); |
| 716 | return error; | 716 | return error; |
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index a815412eab80..515bf71ce01c 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c | |||
| @@ -279,8 +279,6 @@ xfs_ialloc_ag_alloc( | |||
| 279 | (args.agbno < be32_to_cpu(agi->agi_length)))) { | 279 | (args.agbno < be32_to_cpu(agi->agi_length)))) { |
| 280 | args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno); | 280 | args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno); |
| 281 | args.type = XFS_ALLOCTYPE_THIS_BNO; | 281 | args.type = XFS_ALLOCTYPE_THIS_BNO; |
| 282 | args.mod = args.total = args.wasdel = args.isfl = | ||
| 283 | args.userdata = args.minalignslop = 0; | ||
| 284 | args.prod = 1; | 282 | args.prod = 1; |
| 285 | 283 | ||
| 286 | /* | 284 | /* |
| @@ -333,8 +331,6 @@ xfs_ialloc_ag_alloc( | |||
| 333 | * Allocate a fixed-size extent of inodes. | 331 | * Allocate a fixed-size extent of inodes. |
| 334 | */ | 332 | */ |
| 335 | args.type = XFS_ALLOCTYPE_NEAR_BNO; | 333 | args.type = XFS_ALLOCTYPE_NEAR_BNO; |
| 336 | args.mod = args.total = args.wasdel = args.isfl = | ||
| 337 | args.userdata = args.minalignslop = 0; | ||
| 338 | args.prod = 1; | 334 | args.prod = 1; |
| 339 | /* | 335 | /* |
| 340 | * Allow space for the inode btree to split. | 336 | * Allow space for the inode btree to split. |
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 66282dcb821b..4f201656d2d9 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c | |||
| @@ -2379,9 +2379,6 @@ xfs_iflush_fork( | |||
| 2379 | char *cp; | 2379 | char *cp; |
| 2380 | xfs_ifork_t *ifp; | 2380 | xfs_ifork_t *ifp; |
| 2381 | xfs_mount_t *mp; | 2381 | xfs_mount_t *mp; |
| 2382 | #ifdef XFS_TRANS_DEBUG | ||
| 2383 | int first; | ||
| 2384 | #endif | ||
| 2385 | static const short brootflag[2] = | 2382 | static const short brootflag[2] = |
| 2386 | { XFS_ILOG_DBROOT, XFS_ILOG_ABROOT }; | 2383 | { XFS_ILOG_DBROOT, XFS_ILOG_ABROOT }; |
| 2387 | static const short dataflag[2] = | 2384 | static const short dataflag[2] = |
| @@ -2724,9 +2721,6 @@ xfs_iflush_int( | |||
| 2724 | xfs_inode_log_item_t *iip; | 2721 | xfs_inode_log_item_t *iip; |
| 2725 | xfs_dinode_t *dip; | 2722 | xfs_dinode_t *dip; |
| 2726 | xfs_mount_t *mp; | 2723 | xfs_mount_t *mp; |
| 2727 | #ifdef XFS_TRANS_DEBUG | ||
| 2728 | int first; | ||
| 2729 | #endif | ||
| 2730 | 2724 | ||
| 2731 | ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); | 2725 | ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); |
| 2732 | ASSERT(xfs_isiflocked(ip)); | 2726 | ASSERT(xfs_isiflocked(ip)); |
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 22baf6ea4fac..237e7f6f2ab3 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h | |||
| @@ -419,6 +419,7 @@ static inline void xfs_iflock(struct xfs_inode *ip) | |||
| 419 | static inline void xfs_ifunlock(struct xfs_inode *ip) | 419 | static inline void xfs_ifunlock(struct xfs_inode *ip) |
| 420 | { | 420 | { |
| 421 | xfs_iflags_clear(ip, XFS_IFLOCK); | 421 | xfs_iflags_clear(ip, XFS_IFLOCK); |
| 422 | smp_mb(); | ||
| 422 | wake_up_bit(&ip->i_flags, __XFS_IFLOCK_BIT); | 423 | wake_up_bit(&ip->i_flags, __XFS_IFLOCK_BIT); |
| 423 | } | 424 | } |
| 424 | 425 | ||
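The smp_mb() added to xfs_ifunlock() orders the flag clear against the waiter check performed inside wake_up_bit(), closing a missed-wakeup window. The sketch below is a user-space model of that clear/barrier/wake ordering built on C11 atomics and a spinning waiter; it is only an analogy for the pattern, not the kernel primitives themselves (compile with -pthread):

#include <stdatomic.h>
#include <pthread.h>
#include <sched.h>
#include <stdio.h>

static atomic_int flush_locked = 1;     /* stands in for the IFLOCK bit */
static atomic_int waiters;

static void model_ifunlock(void)
{
    atomic_store_explicit(&flush_locked, 0, memory_order_relaxed);
    /* analogue of smp_mb(): order the clear before the waiter check */
    atomic_thread_fence(memory_order_seq_cst);
    if (atomic_load_explicit(&waiters, memory_order_relaxed)) {
        /* a real implementation would issue the waitqueue wakeup here */
    }
}

static void *waiter(void *arg)
{
    atomic_fetch_add(&waiters, 1);      /* register as a waiter */
    atomic_thread_fence(memory_order_seq_cst);
    while (atomic_load_explicit(&flush_locked, memory_order_acquire))
        sched_yield();                  /* stands in for sleeping on the waitqueue */
    atomic_fetch_sub(&waiters, 1);
    return NULL;
}

int main(void)
{
    pthread_t t;

    pthread_create(&t, NULL, waiter, NULL);
    model_ifunlock();
    pthread_join(&t, NULL);
    printf("waiter observed the unlock\n");
    return 0;
}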
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index d041d47d9d86..f034bd1652f0 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c | |||
| @@ -269,17 +269,6 @@ xfs_inode_item_format( | |||
| 269 | } else { | 269 | } else { |
| 270 | ASSERT(!(iip->ili_fields & | 270 | ASSERT(!(iip->ili_fields & |
| 271 | XFS_ILOG_DBROOT)); | 271 | XFS_ILOG_DBROOT)); |
| 272 | #ifdef XFS_TRANS_DEBUG | ||
| 273 | if (iip->ili_root_size > 0) { | ||
| 274 | ASSERT(iip->ili_root_size == | ||
| 275 | ip->i_df.if_broot_bytes); | ||
| 276 | ASSERT(memcmp(iip->ili_orig_root, | ||
| 277 | ip->i_df.if_broot, | ||
| 278 | iip->ili_root_size) == 0); | ||
| 279 | } else { | ||
| 280 | ASSERT(ip->i_df.if_broot_bytes == 0); | ||
| 281 | } | ||
| 282 | #endif | ||
| 283 | iip->ili_fields &= ~XFS_ILOG_DBROOT; | 272 | iip->ili_fields &= ~XFS_ILOG_DBROOT; |
| 284 | } | 273 | } |
| 285 | break; | 274 | break; |
| @@ -678,11 +667,6 @@ void | |||
| 678 | xfs_inode_item_destroy( | 667 | xfs_inode_item_destroy( |
| 679 | xfs_inode_t *ip) | 668 | xfs_inode_t *ip) |
| 680 | { | 669 | { |
| 681 | #ifdef XFS_TRANS_DEBUG | ||
| 682 | if (ip->i_itemp->ili_root_size != 0) { | ||
| 683 | kmem_free(ip->i_itemp->ili_orig_root); | ||
| 684 | } | ||
| 685 | #endif | ||
| 686 | kmem_zone_free(xfs_ili_zone, ip->i_itemp); | 670 | kmem_zone_free(xfs_ili_zone, ip->i_itemp); |
| 687 | } | 671 | } |
| 688 | 672 | ||
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h index 376d4d0b2635..779812fb3d80 100644 --- a/fs/xfs/xfs_inode_item.h +++ b/fs/xfs/xfs_inode_item.h | |||
| @@ -148,10 +148,6 @@ typedef struct xfs_inode_log_item { | |||
| 148 | data exts */ | 148 | data exts */ |
| 149 | struct xfs_bmbt_rec *ili_aextents_buf; /* array of logged | 149 | struct xfs_bmbt_rec *ili_aextents_buf; /* array of logged |
| 150 | attr exts */ | 150 | attr exts */ |
| 151 | #ifdef XFS_TRANS_DEBUG | ||
| 152 | int ili_root_size; | ||
| 153 | char *ili_orig_root; | ||
| 154 | #endif | ||
| 155 | xfs_inode_log_format_t ili_format; /* logged structure */ | 151 | xfs_inode_log_format_t ili_format; /* logged structure */ |
| 156 | } xfs_inode_log_item_t; | 152 | } xfs_inode_log_item_t; |
| 157 | 153 | ||
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index add06b4e9a63..912d83d8860a 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c | |||
| @@ -311,6 +311,62 @@ xfs_iomap_eof_want_preallocate( | |||
| 311 | } | 311 | } |
| 312 | 312 | ||
| 313 | /* | 313 | /* |
| 314 | * Determine the initial size of the preallocation. We are beyond the current | ||
| 315 | * EOF here, but we need to take into account whether this is a sparse write or | ||
| 316 | * an extending write when determining the preallocation size. Hence we need to | ||
| 317 | * look up the extent that ends at the current write offset and use the result | ||
| 318 | * to determine the preallocation size. | ||
| 319 | * | ||
| 320 | * If the extent is a hole, then preallocation is essentially disabled. | ||
| 321 | * Otherwise we take the size of the preceding data extent as the basis for the | ||
| 322 | * preallocation size. If the size of the extent is greater than half the | ||
| 323 | * maximum extent length, then use the current offset as the basis. This ensures | ||
| 324 | * that for large files the preallocation size always extends to MAXEXTLEN | ||
| 325 | * rather than falling short due to things like stripe unit/width alignment of | ||
| 326 | * real extents. | ||
| 327 | */ | ||
| 328 | STATIC int | ||
| 329 | xfs_iomap_eof_prealloc_initial_size( | ||
| 330 | struct xfs_mount *mp, | ||
| 331 | struct xfs_inode *ip, | ||
| 332 | xfs_off_t offset, | ||
| 333 | xfs_bmbt_irec_t *imap, | ||
| 334 | int nimaps) | ||
| 335 | { | ||
| 336 | xfs_fileoff_t start_fsb; | ||
| 337 | int imaps = 1; | ||
| 338 | int error; | ||
| 339 | |||
| 340 | ASSERT(nimaps >= imaps); | ||
| 341 | |||
| 342 | /* if we are using a specific prealloc size, return now */ | ||
| 343 | if (mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) | ||
| 344 | return 0; | ||
| 345 | |||
| 346 | /* | ||
| 347 | * As we write multiple pages, the offset will always align to the | ||
| 348 | * start of a page and hence point to a hole at EOF. i.e. if the size is | ||
| 349 | * 4096 bytes, we only have one block at FSB 0, but XFS_B_TO_FSB(4096) | ||
| 350 | * will return FSB 1. Hence if there are blocks in the file, we want to | ||
| 351 | * point to the block prior to the EOF block and not the hole that maps | ||
| 352 | * directly at @offset. | ||
| 353 | */ | ||
| 354 | start_fsb = XFS_B_TO_FSB(mp, offset); | ||
| 355 | if (start_fsb) | ||
| 356 | start_fsb--; | ||
| 357 | error = xfs_bmapi_read(ip, start_fsb, 1, imap, &imaps, XFS_BMAPI_ENTIRE); | ||
| 358 | if (error) | ||
| 359 | return 0; | ||
| 360 | |||
| 361 | ASSERT(imaps == 1); | ||
| 362 | if (imap[0].br_startblock == HOLESTARTBLOCK) | ||
| 363 | return 0; | ||
| 364 | if (imap[0].br_blockcount <= (MAXEXTLEN >> 1)) | ||
| 365 | return imap[0].br_blockcount; | ||
| 366 | return XFS_B_TO_FSB(mp, offset); | ||
| 367 | } | ||
| 368 | |||
| 369 | /* | ||
| 314 | * If we don't have a user specified preallocation size, dynamically increase | 370 | * If we don't have a user specified preallocation size, dynamically increase |
| 315 | * the preallocation size as the size of the file grows. Cap the maximum size | 371 | * the preallocation size as the size of the file grows. Cap the maximum size |
| 316 | * at a single extent or less if the filesystem is near full. The closer the | 372 | * at a single extent or less if the filesystem is near full. The closer the |
| @@ -319,20 +375,19 @@ xfs_iomap_eof_want_preallocate( | |||
| 319 | STATIC xfs_fsblock_t | 375 | STATIC xfs_fsblock_t |
| 320 | xfs_iomap_prealloc_size( | 376 | xfs_iomap_prealloc_size( |
| 321 | struct xfs_mount *mp, | 377 | struct xfs_mount *mp, |
| 322 | struct xfs_inode *ip) | 378 | struct xfs_inode *ip, |
| 379 | xfs_off_t offset, | ||
| 380 | struct xfs_bmbt_irec *imap, | ||
| 381 | int nimaps) | ||
| 323 | { | 382 | { |
| 324 | xfs_fsblock_t alloc_blocks = 0; | 383 | xfs_fsblock_t alloc_blocks = 0; |
| 325 | 384 | ||
| 326 | if (!(mp->m_flags & XFS_MOUNT_DFLT_IOSIZE)) { | 385 | alloc_blocks = xfs_iomap_eof_prealloc_initial_size(mp, ip, offset, |
| 386 | imap, nimaps); | ||
| 387 | if (alloc_blocks > 0) { | ||
| 327 | int shift = 0; | 388 | int shift = 0; |
| 328 | int64_t freesp; | 389 | int64_t freesp; |
| 329 | 390 | ||
| 330 | /* | ||
| 331 | * rounddown_pow_of_two() returns an undefined result | ||
| 332 | * if we pass in alloc_blocks = 0. Hence the "+ 1" to | ||
| 333 | * ensure we always pass in a non-zero value. | ||
| 334 | */ | ||
| 335 | alloc_blocks = XFS_B_TO_FSB(mp, XFS_ISIZE(ip)) + 1; | ||
| 336 | alloc_blocks = XFS_FILEOFF_MIN(MAXEXTLEN, | 391 | alloc_blocks = XFS_FILEOFF_MIN(MAXEXTLEN, |
| 337 | rounddown_pow_of_two(alloc_blocks)); | 392 | rounddown_pow_of_two(alloc_blocks)); |
| 338 | 393 | ||
| @@ -351,6 +406,15 @@ xfs_iomap_prealloc_size( | |||
| 351 | } | 406 | } |
| 352 | if (shift) | 407 | if (shift) |
| 353 | alloc_blocks >>= shift; | 408 | alloc_blocks >>= shift; |
| 409 | |||
| 410 | /* | ||
| 411 | * If we are still trying to allocate more space than is | ||
| 412 | * available, squash the prealloc hard. This can happen if we | ||
| 413 | * have a large file on a small filesystem and the above | ||
| 414 | * lowspace thresholds are smaller than MAXEXTLEN. | ||
| 415 | */ | ||
| 416 | while (alloc_blocks >= freesp) | ||
| 417 | alloc_blocks >>= 4; | ||
| 354 | } | 418 | } |
| 355 | 419 | ||
| 356 | if (alloc_blocks < mp->m_writeio_blocks) | 420 | if (alloc_blocks < mp->m_writeio_blocks) |
| @@ -390,7 +454,6 @@ xfs_iomap_write_delay( | |||
| 390 | extsz = xfs_get_extsz_hint(ip); | 454 | extsz = xfs_get_extsz_hint(ip); |
| 391 | offset_fsb = XFS_B_TO_FSBT(mp, offset); | 455 | offset_fsb = XFS_B_TO_FSBT(mp, offset); |
| 392 | 456 | ||
| 393 | |||
| 394 | error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count, | 457 | error = xfs_iomap_eof_want_preallocate(mp, ip, offset, count, |
| 395 | imap, XFS_WRITE_IMAPS, &prealloc); | 458 | imap, XFS_WRITE_IMAPS, &prealloc); |
| 396 | if (error) | 459 | if (error) |
| @@ -398,7 +461,10 @@ xfs_iomap_write_delay( | |||
| 398 | 461 | ||
| 399 | retry: | 462 | retry: |
| 400 | if (prealloc) { | 463 | if (prealloc) { |
| 401 | xfs_fsblock_t alloc_blocks = xfs_iomap_prealloc_size(mp, ip); | 464 | xfs_fsblock_t alloc_blocks; |
| 465 | |||
| 466 | alloc_blocks = xfs_iomap_prealloc_size(mp, ip, offset, imap, | ||
| 467 | XFS_WRITE_IMAPS); | ||
| 402 | 468 | ||
| 403 | aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1)); | 469 | aligned_offset = XFS_WRITEIO_ALIGN(mp, (offset + count - 1)); |
| 404 | ioalign = XFS_B_TO_FSBT(mp, aligned_offset); | 470 | ioalign = XFS_B_TO_FSBT(mp, aligned_offset); |
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 46bd9d52ab51..eec226f78a40 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c | |||
| @@ -120,7 +120,7 @@ xlog_verify_iclog( | |||
| 120 | struct xlog *log, | 120 | struct xlog *log, |
| 121 | struct xlog_in_core *iclog, | 121 | struct xlog_in_core *iclog, |
| 122 | int count, | 122 | int count, |
| 123 | boolean_t syncing); | 123 | bool syncing); |
| 124 | STATIC void | 124 | STATIC void |
| 125 | xlog_verify_tail_lsn( | 125 | xlog_verify_tail_lsn( |
| 126 | struct xlog *log, | 126 | struct xlog *log, |
| @@ -1737,7 +1737,7 @@ xlog_sync( | |||
| 1737 | ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1); | 1737 | ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1); |
| 1738 | ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize); | 1738 | ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize); |
| 1739 | 1739 | ||
| 1740 | xlog_verify_iclog(log, iclog, count, B_TRUE); | 1740 | xlog_verify_iclog(log, iclog, count, true); |
| 1741 | 1741 | ||
| 1742 | /* account for log which doesn't start at block #0 */ | 1742 | /* account for log which doesn't start at block #0 */ |
| 1743 | XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart); | 1743 | XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart); |
| @@ -3611,7 +3611,7 @@ xlog_verify_iclog( | |||
| 3611 | struct xlog *log, | 3611 | struct xlog *log, |
| 3612 | struct xlog_in_core *iclog, | 3612 | struct xlog_in_core *iclog, |
| 3613 | int count, | 3613 | int count, |
| 3614 | boolean_t syncing) | 3614 | bool syncing) |
| 3615 | { | 3615 | { |
| 3616 | xlog_op_header_t *ophead; | 3616 | xlog_op_header_t *ophead; |
| 3617 | xlog_in_core_t *icptr; | 3617 | xlog_in_core_t *icptr; |
| @@ -3659,7 +3659,7 @@ xlog_verify_iclog( | |||
| 3659 | /* clientid is only 1 byte */ | 3659 | /* clientid is only 1 byte */ |
| 3660 | field_offset = (__psint_t) | 3660 | field_offset = (__psint_t) |
| 3661 | ((xfs_caddr_t)&(ophead->oh_clientid) - base_ptr); | 3661 | ((xfs_caddr_t)&(ophead->oh_clientid) - base_ptr); |
| 3662 | if (syncing == B_FALSE || (field_offset & 0x1ff)) { | 3662 | if (!syncing || (field_offset & 0x1ff)) { |
| 3663 | clientid = ophead->oh_clientid; | 3663 | clientid = ophead->oh_clientid; |
| 3664 | } else { | 3664 | } else { |
| 3665 | idx = BTOBBT((xfs_caddr_t)&(ophead->oh_clientid) - iclog->ic_datap); | 3665 | idx = BTOBBT((xfs_caddr_t)&(ophead->oh_clientid) - iclog->ic_datap); |
| @@ -3682,7 +3682,7 @@ xlog_verify_iclog( | |||
| 3682 | /* check length */ | 3682 | /* check length */ |
| 3683 | field_offset = (__psint_t) | 3683 | field_offset = (__psint_t) |
| 3684 | ((xfs_caddr_t)&(ophead->oh_len) - base_ptr); | 3684 | ((xfs_caddr_t)&(ophead->oh_len) - base_ptr); |
| 3685 | if (syncing == B_FALSE || (field_offset & 0x1ff)) { | 3685 | if (!syncing || (field_offset & 0x1ff)) { |
| 3686 | op_len = be32_to_cpu(ophead->oh_len); | 3686 | op_len = be32_to_cpu(ophead->oh_len); |
| 3687 | } else { | 3687 | } else { |
| 3688 | idx = BTOBBT((__psint_t)&ophead->oh_len - | 3688 | idx = BTOBBT((__psint_t)&ophead->oh_len - |
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index da508463ff10..3806088a8f77 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c | |||
| @@ -658,7 +658,7 @@ xfs_sb_quiet_read_verify( | |||
| 658 | return; | 658 | return; |
| 659 | } | 659 | } |
| 660 | /* quietly fail */ | 660 | /* quietly fail */ |
| 661 | xfs_buf_ioerror(bp, EFSCORRUPTED); | 661 | xfs_buf_ioerror(bp, EWRONGFS); |
| 662 | } | 662 | } |
| 663 | 663 | ||
| 664 | static void | 664 | static void |
| @@ -1109,8 +1109,8 @@ xfs_mount_reset_sbqflags( | |||
| 1109 | return 0; | 1109 | return 0; |
| 1110 | 1110 | ||
| 1111 | tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE); | 1111 | tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE); |
| 1112 | error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0, | 1112 | error = xfs_trans_reserve(tp, 0, XFS_QM_SBCHANGE_LOG_RES(mp), |
| 1113 | XFS_DEFAULT_LOG_COUNT); | 1113 | 0, 0, XFS_DEFAULT_LOG_COUNT); |
| 1114 | if (error) { | 1114 | if (error) { |
| 1115 | xfs_trans_cancel(tp, 0); | 1115 | xfs_trans_cancel(tp, 0); |
| 1116 | xfs_alert(mp, "%s: Superblock update failed!", __func__); | 1116 | xfs_alert(mp, "%s: Superblock update failed!", __func__); |
| @@ -1583,8 +1583,8 @@ xfs_log_sbcount(xfs_mount_t *mp) | |||
| 1583 | return 0; | 1583 | return 0; |
| 1584 | 1584 | ||
| 1585 | tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_COUNT, KM_SLEEP); | 1585 | tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_COUNT, KM_SLEEP); |
| 1586 | error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0, | 1586 | error = xfs_trans_reserve(tp, 0, XFS_SB_LOG_RES(mp), 0, 0, |
| 1587 | XFS_DEFAULT_LOG_COUNT); | 1587 | XFS_DEFAULT_LOG_COUNT); |
| 1588 | if (error) { | 1588 | if (error) { |
| 1589 | xfs_trans_cancel(tp, 0); | 1589 | xfs_trans_cancel(tp, 0); |
| 1590 | return error; | 1590 | return error; |
| @@ -1945,8 +1945,8 @@ xfs_mount_log_sb( | |||
| 1945 | XFS_SB_VERSIONNUM)); | 1945 | XFS_SB_VERSIONNUM)); |
| 1946 | 1946 | ||
| 1947 | tp = xfs_trans_alloc(mp, XFS_TRANS_SB_UNIT); | 1947 | tp = xfs_trans_alloc(mp, XFS_TRANS_SB_UNIT); |
| 1948 | error = xfs_trans_reserve(tp, 0, mp->m_sb.sb_sectsize + 128, 0, 0, | 1948 | error = xfs_trans_reserve(tp, 0, XFS_SB_LOG_RES(mp), 0, 0, |
| 1949 | XFS_DEFAULT_LOG_COUNT); | 1949 | XFS_DEFAULT_LOG_COUNT); |
| 1950 | if (error) { | 1950 | if (error) { |
| 1951 | xfs_trans_cancel(tp, 0); | 1951 | xfs_trans_cancel(tp, 0); |
| 1952 | return error; | 1952 | return error; |
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index bab8314507e4..bc907061d392 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h | |||
| @@ -34,12 +34,19 @@ typedef struct xfs_trans_reservations { | |||
| 34 | uint tr_addafork; /* cvt inode to attributed trans */ | 34 | uint tr_addafork; /* cvt inode to attributed trans */ |
| 35 | uint tr_writeid; /* write setuid/setgid file */ | 35 | uint tr_writeid; /* write setuid/setgid file */ |
| 36 | uint tr_attrinval; /* attr fork buffer invalidation */ | 36 | uint tr_attrinval; /* attr fork buffer invalidation */ |
| 37 | uint tr_attrset; /* set/create an attribute */ | 37 | uint tr_attrsetm; /* set/create an attribute at mount time */ |
| 38 | uint tr_attrsetrt; /* set/create an attribute at runtime */ | ||
| 38 | uint tr_attrrm; /* remove an attribute */ | 39 | uint tr_attrrm; /* remove an attribute */ |
| 39 | uint tr_clearagi; /* clear bad agi unlinked ino bucket */ | 40 | uint tr_clearagi; /* clear bad agi unlinked ino bucket */ |
| 40 | uint tr_growrtalloc; /* grow realtime allocations */ | 41 | uint tr_growrtalloc; /* grow realtime allocations */ |
| 41 | uint tr_growrtzero; /* grow realtime zeroing */ | 42 | uint tr_growrtzero; /* grow realtime zeroing */ |
| 42 | uint tr_growrtfree; /* grow realtime freeing */ | 43 | uint tr_growrtfree; /* grow realtime freeing */ |
| 44 | uint tr_qm_sbchange; /* change quota flags */ | ||
| 45 | uint tr_qm_setqlim; /* adjust quota limits */ | ||
| 46 | uint tr_qm_dqalloc; /* allocate quota on disk */ | ||
| 47 | uint tr_qm_quotaoff; /* turn quota off */ | ||
| 48 | uint tr_qm_equotaoff;/* end of turn quota off */ | ||
| 49 | uint tr_sb; /* modify superblock */ | ||
| 43 | } xfs_trans_reservations_t; | 50 | } xfs_trans_reservations_t; |
| 44 | 51 | ||
| 45 | #ifndef __KERNEL__ | 52 | #ifndef __KERNEL__ |
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 60eff4763156..e5b5cf973781 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c | |||
| @@ -1584,10 +1584,9 @@ xfs_qm_write_sb_changes( | |||
| 1584 | int error; | 1584 | int error; |
| 1585 | 1585 | ||
| 1586 | tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE); | 1586 | tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SBCHANGE); |
| 1587 | if ((error = xfs_trans_reserve(tp, 0, | 1587 | error = xfs_trans_reserve(tp, 0, XFS_QM_SBCHANGE_LOG_RES(mp), |
| 1588 | mp->m_sb.sb_sectsize + 128, 0, | 1588 | 0, 0, XFS_DEFAULT_LOG_COUNT); |
| 1589 | 0, | 1589 | if (error) { |
| 1590 | XFS_DEFAULT_LOG_COUNT))) { | ||
| 1591 | xfs_trans_cancel(tp, 0); | 1590 | xfs_trans_cancel(tp, 0); |
| 1592 | return error; | 1591 | return error; |
| 1593 | } | 1592 | } |
diff --git a/fs/xfs/xfs_qm_bhv.c b/fs/xfs/xfs_qm_bhv.c index 6b39115bf145..2d02eac1c9a8 100644 --- a/fs/xfs/xfs_qm_bhv.c +++ b/fs/xfs/xfs_qm_bhv.c | |||
| @@ -146,7 +146,7 @@ xfs_qm_newmount( | |||
| 146 | * inode goes inactive and wants to free blocks, | 146 | * inode goes inactive and wants to free blocks, |
| 147 | * or via xfs_log_mount_finish. | 147 | * or via xfs_log_mount_finish. |
| 148 | */ | 148 | */ |
| 149 | *needquotamount = B_TRUE; | 149 | *needquotamount = true; |
| 150 | *quotaflags = mp->m_qflags; | 150 | *quotaflags = mp->m_qflags; |
| 151 | mp->m_qflags = 0; | 151 | mp->m_qflags = 0; |
| 152 | } | 152 | } |
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 5f53e75409b8..cf9a34051e07 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c | |||
| @@ -408,10 +408,10 @@ xfs_qm_scall_getqstat( | |||
| 408 | { | 408 | { |
| 409 | struct xfs_quotainfo *q = mp->m_quotainfo; | 409 | struct xfs_quotainfo *q = mp->m_quotainfo; |
| 410 | struct xfs_inode *uip, *gip; | 410 | struct xfs_inode *uip, *gip; |
| 411 | boolean_t tempuqip, tempgqip; | 411 | bool tempuqip, tempgqip; |
| 412 | 412 | ||
| 413 | uip = gip = NULL; | 413 | uip = gip = NULL; |
| 414 | tempuqip = tempgqip = B_FALSE; | 414 | tempuqip = tempgqip = false; |
| 415 | memset(out, 0, sizeof(fs_quota_stat_t)); | 415 | memset(out, 0, sizeof(fs_quota_stat_t)); |
| 416 | 416 | ||
| 417 | out->qs_version = FS_QSTAT_VERSION; | 417 | out->qs_version = FS_QSTAT_VERSION; |
| @@ -434,12 +434,12 @@ xfs_qm_scall_getqstat( | |||
| 434 | if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) { | 434 | if (!uip && mp->m_sb.sb_uquotino != NULLFSINO) { |
| 435 | if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, | 435 | if (xfs_iget(mp, NULL, mp->m_sb.sb_uquotino, |
| 436 | 0, 0, &uip) == 0) | 436 | 0, 0, &uip) == 0) |
| 437 | tempuqip = B_TRUE; | 437 | tempuqip = true; |
| 438 | } | 438 | } |
| 439 | if (!gip && mp->m_sb.sb_gquotino != NULLFSINO) { | 439 | if (!gip && mp->m_sb.sb_gquotino != NULLFSINO) { |
| 440 | if (xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, | 440 | if (xfs_iget(mp, NULL, mp->m_sb.sb_gquotino, |
| 441 | 0, 0, &gip) == 0) | 441 | 0, 0, &gip) == 0) |
| 442 | tempgqip = B_TRUE; | 442 | tempgqip = true; |
| 443 | } | 443 | } |
| 444 | if (uip) { | 444 | if (uip) { |
| 445 | out->qs_uquota.qfs_nblks = uip->i_d.di_nblocks; | 445 | out->qs_uquota.qfs_nblks = uip->i_d.di_nblocks; |
| @@ -490,8 +490,9 @@ xfs_qm_scall_setqlim( | |||
| 490 | return 0; | 490 | return 0; |
| 491 | 491 | ||
| 492 | tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM); | 492 | tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM); |
| 493 | if ((error = xfs_trans_reserve(tp, 0, sizeof(xfs_disk_dquot_t) + 128, | 493 | error = xfs_trans_reserve(tp, 0, XFS_QM_SETQLIM_LOG_RES(mp), |
| 494 | 0, 0, XFS_DEFAULT_LOG_COUNT))) { | 494 | 0, 0, XFS_DEFAULT_LOG_COUNT); |
| 495 | if (error) { | ||
| 495 | xfs_trans_cancel(tp, 0); | 496 | xfs_trans_cancel(tp, 0); |
| 496 | return (error); | 497 | return (error); |
| 497 | } | 498 | } |
| @@ -638,8 +639,9 @@ xfs_qm_log_quotaoff_end( | |||
| 638 | 639 | ||
| 639 | tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF_END); | 640 | tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF_END); |
| 640 | 641 | ||
| 641 | if ((error = xfs_trans_reserve(tp, 0, sizeof(xfs_qoff_logitem_t) * 2, | 642 | error = xfs_trans_reserve(tp, 0, XFS_QM_QUOTAOFF_END_LOG_RES(mp), |
| 642 | 0, 0, XFS_DEFAULT_LOG_COUNT))) { | 643 | 0, 0, XFS_DEFAULT_LOG_COUNT); |
| 644 | if (error) { | ||
| 643 | xfs_trans_cancel(tp, 0); | 645 | xfs_trans_cancel(tp, 0); |
| 644 | return (error); | 646 | return (error); |
| 645 | } | 647 | } |
| @@ -671,14 +673,10 @@ xfs_qm_log_quotaoff( | |||
| 671 | uint oldsbqflag=0; | 673 | uint oldsbqflag=0; |
| 672 | 674 | ||
| 673 | tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF); | 675 | tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF); |
| 674 | if ((error = xfs_trans_reserve(tp, 0, | 676 | error = xfs_trans_reserve(tp, 0, XFS_QM_QUOTAOFF_LOG_RES(mp), |
| 675 | sizeof(xfs_qoff_logitem_t) * 2 + | 677 | 0, 0, XFS_DEFAULT_LOG_COUNT); |
| 676 | mp->m_sb.sb_sectsize + 128, | 678 | if (error) |
| 677 | 0, | ||
| 678 | 0, | ||
| 679 | XFS_DEFAULT_LOG_COUNT))) { | ||
| 680 | goto error0; | 679 | goto error0; |
| 681 | } | ||
| 682 | 680 | ||
| 683 | qoffi = xfs_trans_get_qoff_item(tp, NULL, flags & XFS_ALL_QUOTA_ACCT); | 681 | qoffi = xfs_trans_get_qoff_item(tp, NULL, flags & XFS_ALL_QUOTA_ACCT); |
| 684 | xfs_trans_log_quotaoff_item(tp, qoffi); | 682 | xfs_trans_log_quotaoff_item(tp, qoffi); |
| @@ -784,11 +782,11 @@ xfs_qm_scall_getquota( | |||
| 784 | (XFS_IS_OQUOTA_ENFORCED(mp) && | 782 | (XFS_IS_OQUOTA_ENFORCED(mp) && |
| 785 | (dst->d_flags & (FS_PROJ_QUOTA | FS_GROUP_QUOTA)))) && | 783 | (dst->d_flags & (FS_PROJ_QUOTA | FS_GROUP_QUOTA)))) && |
| 786 | dst->d_id != 0) { | 784 | dst->d_id != 0) { |
| 787 | if (((int) dst->d_bcount > (int) dst->d_blk_softlimit) && | 785 | if ((dst->d_bcount > dst->d_blk_softlimit) && |
| 788 | (dst->d_blk_softlimit > 0)) { | 786 | (dst->d_blk_softlimit > 0)) { |
| 789 | ASSERT(dst->d_btimer != 0); | 787 | ASSERT(dst->d_btimer != 0); |
| 790 | } | 788 | } |
| 791 | if (((int) dst->d_icount > (int) dst->d_ino_softlimit) && | 789 | if ((dst->d_icount > dst->d_ino_softlimit) && |
| 792 | (dst->d_ino_softlimit > 0)) { | 790 | (dst->d_ino_softlimit > 0)) { |
| 793 | ASSERT(dst->d_itimer != 0); | 791 | ASSERT(dst->d_itimer != 0); |
| 794 | } | 792 | } |
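The comparison change just above matters because the quota counters (d_bcount, d_icount) are 64-bit values; casting them to int before comparing truncates them and can flip the sign once a counter exceeds INT_MAX, so the softlimit check (and its ASSERT) could misfire. A tiny standalone illustration of the truncation, with values picked only to trigger it (the narrowing cast behaves as on typical two's-complement systems):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t bcount = 3000000000ULL;	/* plausible block count > INT_MAX */
	uint64_t softlimit = 1000000ULL;

	/* Old style: both sides forced to int, bcount wraps negative. */
	if ((int)bcount > (int)softlimit)
		printf("signed compare: over limit\n");
	else
		printf("signed compare: under limit (wrong)\n");

	/* New style: compare the unsigned 64-bit values directly. */
	if (bcount > softlimit)
		printf("unsigned compare: over limit (correct)\n");
	return 0;
}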
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index ab8839b26272..c407121873b4 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c | |||
| @@ -139,9 +139,9 @@ static const match_table_t tokens = { | |||
| 139 | 139 | ||
| 140 | 140 | ||
| 141 | STATIC unsigned long | 141 | STATIC unsigned long |
| 142 | suffix_strtoul(char *s, char **endp, unsigned int base) | 142 | suffix_kstrtoint(char *s, unsigned int base, int *res) |
| 143 | { | 143 | { |
| 144 | int last, shift_left_factor = 0; | 144 | int last, shift_left_factor = 0, _res; |
| 145 | char *value = s; | 145 | char *value = s; |
| 146 | 146 | ||
| 147 | last = strlen(value) - 1; | 147 | last = strlen(value) - 1; |
| @@ -158,7 +158,10 @@ suffix_strtoul(char *s, char **endp, unsigned int base) | |||
| 158 | value[last] = '\0'; | 158 | value[last] = '\0'; |
| 159 | } | 159 | } |
| 160 | 160 | ||
| 161 | return simple_strtoul((const char *)s, endp, base) << shift_left_factor; | 161 | if (kstrtoint(s, base, &_res)) |
| 162 | return -EINVAL; | ||
| 163 | *res = _res << shift_left_factor; | ||
| 164 | return 0; | ||
| 162 | } | 165 | } |
| 163 | 166 | ||
| 164 | /* | 167 | /* |
| @@ -174,7 +177,7 @@ xfs_parseargs( | |||
| 174 | char *options) | 177 | char *options) |
| 175 | { | 178 | { |
| 176 | struct super_block *sb = mp->m_super; | 179 | struct super_block *sb = mp->m_super; |
| 177 | char *this_char, *value, *eov; | 180 | char *this_char, *value; |
| 178 | int dsunit = 0; | 181 | int dsunit = 0; |
| 179 | int dswidth = 0; | 182 | int dswidth = 0; |
| 180 | int iosize = 0; | 183 | int iosize = 0; |
| @@ -230,14 +233,16 @@ xfs_parseargs( | |||
| 230 | this_char); | 233 | this_char); |
| 231 | return EINVAL; | 234 | return EINVAL; |
| 232 | } | 235 | } |
| 233 | mp->m_logbufs = simple_strtoul(value, &eov, 10); | 236 | if (kstrtoint(value, 10, &mp->m_logbufs)) |
| 237 | return EINVAL; | ||
| 234 | } else if (!strcmp(this_char, MNTOPT_LOGBSIZE)) { | 238 | } else if (!strcmp(this_char, MNTOPT_LOGBSIZE)) { |
| 235 | if (!value || !*value) { | 239 | if (!value || !*value) { |
| 236 | xfs_warn(mp, "%s option requires an argument", | 240 | xfs_warn(mp, "%s option requires an argument", |
| 237 | this_char); | 241 | this_char); |
| 238 | return EINVAL; | 242 | return EINVAL; |
| 239 | } | 243 | } |
| 240 | mp->m_logbsize = suffix_strtoul(value, &eov, 10); | 244 | if (suffix_kstrtoint(value, 10, &mp->m_logbsize)) |
| 245 | return EINVAL; | ||
| 241 | } else if (!strcmp(this_char, MNTOPT_LOGDEV)) { | 246 | } else if (!strcmp(this_char, MNTOPT_LOGDEV)) { |
| 242 | if (!value || !*value) { | 247 | if (!value || !*value) { |
| 243 | xfs_warn(mp, "%s option requires an argument", | 248 | xfs_warn(mp, "%s option requires an argument", |
| @@ -266,7 +271,8 @@ xfs_parseargs( | |||
| 266 | this_char); | 271 | this_char); |
| 267 | return EINVAL; | 272 | return EINVAL; |
| 268 | } | 273 | } |
| 269 | iosize = simple_strtoul(value, &eov, 10); | 274 | if (kstrtoint(value, 10, &iosize)) |
| 275 | return EINVAL; | ||
| 270 | iosizelog = ffs(iosize) - 1; | 276 | iosizelog = ffs(iosize) - 1; |
| 271 | } else if (!strcmp(this_char, MNTOPT_ALLOCSIZE)) { | 277 | } else if (!strcmp(this_char, MNTOPT_ALLOCSIZE)) { |
| 272 | if (!value || !*value) { | 278 | if (!value || !*value) { |
| @@ -274,7 +280,8 @@ xfs_parseargs( | |||
| 274 | this_char); | 280 | this_char); |
| 275 | return EINVAL; | 281 | return EINVAL; |
| 276 | } | 282 | } |
| 277 | iosize = suffix_strtoul(value, &eov, 10); | 283 | if (suffix_kstrtoint(value, 10, &iosize)) |
| 284 | return EINVAL; | ||
| 278 | iosizelog = ffs(iosize) - 1; | 285 | iosizelog = ffs(iosize) - 1; |
| 279 | } else if (!strcmp(this_char, MNTOPT_GRPID) || | 286 | } else if (!strcmp(this_char, MNTOPT_GRPID) || |
| 280 | !strcmp(this_char, MNTOPT_BSDGROUPS)) { | 287 | !strcmp(this_char, MNTOPT_BSDGROUPS)) { |
| @@ -296,14 +303,16 @@ xfs_parseargs( | |||
| 296 | this_char); | 303 | this_char); |
| 297 | return EINVAL; | 304 | return EINVAL; |
| 298 | } | 305 | } |
| 299 | dsunit = simple_strtoul(value, &eov, 10); | 306 | if (kstrtoint(value, 10, &dsunit)) |
| 307 | return EINVAL; | ||
| 300 | } else if (!strcmp(this_char, MNTOPT_SWIDTH)) { | 308 | } else if (!strcmp(this_char, MNTOPT_SWIDTH)) { |
| 301 | if (!value || !*value) { | 309 | if (!value || !*value) { |
| 302 | xfs_warn(mp, "%s option requires an argument", | 310 | xfs_warn(mp, "%s option requires an argument", |
| 303 | this_char); | 311 | this_char); |
| 304 | return EINVAL; | 312 | return EINVAL; |
| 305 | } | 313 | } |
| 306 | dswidth = simple_strtoul(value, &eov, 10); | 314 | if (kstrtoint(value, 10, &dswidth)) |
| 315 | return EINVAL; | ||
| 307 | } else if (!strcmp(this_char, MNTOPT_32BITINODE)) { | 316 | } else if (!strcmp(this_char, MNTOPT_32BITINODE)) { |
| 308 | mp->m_flags |= XFS_MOUNT_SMALL_INUMS; | 317 | mp->m_flags |= XFS_MOUNT_SMALL_INUMS; |
| 309 | } else if (!strcmp(this_char, MNTOPT_64BITINODE)) { | 318 | } else if (!strcmp(this_char, MNTOPT_64BITINODE)) { |
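The option-parsing hunks above replace simple_strtoul(), which silently ignores trailing junk, with kstrtoint(), and fold the k/m/g suffix handling into suffix_kstrtoint(). A hedged userspace sketch of the same pattern, using strtol() in place of the kernel helper, so the error handling is only approximate:

#include <ctype.h>
#include <errno.h>
#include <stdlib.h>
#include <string.h>

/* Parse "8", "128k", "4m" or "1g" into *res; 0 on success, -EINVAL on bad input. */
static int suffix_parse_int(char *s, int base, int *res)
{
	int shift = 0;
	size_t last;
	char *end;
	long val;

	if (!s || !*s)
		return -EINVAL;

	last = strlen(s) - 1;
	switch (tolower((unsigned char)s[last])) {
	case 'g': shift = 30; s[last] = '\0'; break;
	case 'm': shift = 20; s[last] = '\0'; break;
	case 'k': shift = 10; s[last] = '\0'; break;
	}

	errno = 0;
	val = strtol(s, &end, base);
	if (errno || end == s || *end != '\0')
		return -EINVAL;		/* reject trailing junk, as kstrtoint does */

	*res = (int)(val << shift);	/* sketch: no overflow check on the shift */
	return 0;
}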
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 2e137d4a85ae..16a812977eab 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h | |||
| @@ -341,6 +341,7 @@ DEFINE_BUF_EVENT(xfs_buf_item_relse); | |||
| 341 | DEFINE_BUF_EVENT(xfs_buf_item_iodone); | 341 | DEFINE_BUF_EVENT(xfs_buf_item_iodone); |
| 342 | DEFINE_BUF_EVENT(xfs_buf_item_iodone_async); | 342 | DEFINE_BUF_EVENT(xfs_buf_item_iodone_async); |
| 343 | DEFINE_BUF_EVENT(xfs_buf_error_relse); | 343 | DEFINE_BUF_EVENT(xfs_buf_error_relse); |
| 344 | DEFINE_BUF_EVENT(xfs_buf_wait_buftarg); | ||
| 344 | DEFINE_BUF_EVENT(xfs_trans_read_buf_io); | 345 | DEFINE_BUF_EVENT(xfs_trans_read_buf_io); |
| 345 | DEFINE_BUF_EVENT(xfs_trans_read_buf_shut); | 346 | DEFINE_BUF_EVENT(xfs_trans_read_buf_shut); |
| 346 | 347 | ||
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 06ed520a767f..2fd7c1ff1d21 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c | |||
| @@ -37,14 +37,45 @@ | |||
| 37 | #include "xfs_extent_busy.h" | 37 | #include "xfs_extent_busy.h" |
| 38 | #include "xfs_bmap.h" | 38 | #include "xfs_bmap.h" |
| 39 | #include "xfs_quota.h" | 39 | #include "xfs_quota.h" |
| 40 | #include "xfs_qm.h" | ||
| 40 | #include "xfs_trans_priv.h" | 41 | #include "xfs_trans_priv.h" |
| 41 | #include "xfs_trans_space.h" | 42 | #include "xfs_trans_space.h" |
| 42 | #include "xfs_inode_item.h" | 43 | #include "xfs_inode_item.h" |
| 44 | #include "xfs_log_priv.h" | ||
| 45 | #include "xfs_buf_item.h" | ||
| 43 | #include "xfs_trace.h" | 46 | #include "xfs_trace.h" |
| 44 | 47 | ||
| 45 | kmem_zone_t *xfs_trans_zone; | 48 | kmem_zone_t *xfs_trans_zone; |
| 46 | kmem_zone_t *xfs_log_item_desc_zone; | 49 | kmem_zone_t *xfs_log_item_desc_zone; |
| 47 | 50 | ||
| 51 | /* | ||
| 52 | * A buffer has a format structure overhead in the log in addition | ||
| 53 | * to the data, so we need to take this into account when reserving | ||
| 54 | * space in a transaction for a buffer. Round the space required up | ||
| 55 | * to a multiple of 128 bytes so that we don't change the historical | ||
| 56 | * reservation that has been used for this overhead. | ||
| 57 | */ | ||
| 58 | STATIC uint | ||
| 59 | xfs_buf_log_overhead(void) | ||
| 60 | { | ||
| 61 | return round_up(sizeof(struct xlog_op_header) + | ||
| 62 | sizeof(struct xfs_buf_log_format), 128); | ||
| 63 | } | ||
| 64 | |||
| 65 | /* | ||
| 66 | * Calculate the transaction log reservation per item in bytes. | ||
| 67 | * | ||
| 68 | * The nbufs argument is used to indicate the number of items that | ||
| 69 | * will be changed in a transaction. size is used to tell how many | ||
| 70 | * bytes should be reserved per item. | ||
| 71 | */ | ||
| 72 | STATIC uint | ||
| 73 | xfs_calc_buf_res( | ||
| 74 | uint nbufs, | ||
| 75 | uint size) | ||
| 76 | { | ||
| 77 | return nbufs * (size + xfs_buf_log_overhead()); | ||
| 78 | } | ||
| 48 | 79 | ||
| 49 | /* | 80 | /* |
| 50 | * Various log reservation values. | 81 | * Various log reservation values. |
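To make the new helpers concrete: with an assumed 512-byte sector and the rounded 128-byte per-buffer overhead, a term that used to be written "3 * sectsize + 3 * 128" becomes a single xfs_calc_buf_res(3, sectsize) call. A small standalone sketch of that arithmetic (sizes are examples, not read from a real superblock):

#include <stdio.h>

/* Stand-in for xfs_buf_log_overhead(): assume it rounds up to 128 bytes. */
static unsigned int buf_log_overhead(void)
{
	return 128;
}

/* Same shape as xfs_calc_buf_res() above. */
static unsigned int calc_buf_res(unsigned int nbufs, unsigned int size)
{
	return nbufs * (size + buf_log_overhead());
}

int main(void)
{
	/* e.g. three sector-sized buffers of 512 bytes each */
	printf("%u\n", calc_buf_res(3, 512));	/* 3 * (512 + 128) = 1920 */
	return 0;
}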
| @@ -85,18 +116,15 @@ xfs_calc_write_reservation( | |||
| 85 | struct xfs_mount *mp) | 116 | struct xfs_mount *mp) |
| 86 | { | 117 | { |
| 87 | return XFS_DQUOT_LOGRES(mp) + | 118 | return XFS_DQUOT_LOGRES(mp) + |
| 88 | MAX((mp->m_sb.sb_inodesize + | 119 | MAX((xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) + |
| 89 | XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) + | 120 | xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), |
| 90 | 2 * mp->m_sb.sb_sectsize + | 121 | XFS_FSB_TO_B(mp, 1)) + |
| 91 | mp->m_sb.sb_sectsize + | 122 | xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + |
| 92 | XFS_ALLOCFREE_LOG_RES(mp, 2) + | 123 | xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2), |
| 93 | 128 * (4 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + | 124 | XFS_FSB_TO_B(mp, 1))), |
| 94 | XFS_ALLOCFREE_LOG_COUNT(mp, 2))), | 125 | (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + |
| 95 | (2 * mp->m_sb.sb_sectsize + | 126 | xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2), |
| 96 | 2 * mp->m_sb.sb_sectsize + | 127 | XFS_FSB_TO_B(mp, 1)))); |
| 97 | mp->m_sb.sb_sectsize + | ||
| 98 | XFS_ALLOCFREE_LOG_RES(mp, 2) + | ||
| 99 | 128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2)))); | ||
| 100 | } | 128 | } |
| 101 | 129 | ||
| 102 | /* | 130 | /* |
| @@ -117,18 +145,17 @@ xfs_calc_itruncate_reservation( | |||
| 117 | struct xfs_mount *mp) | 145 | struct xfs_mount *mp) |
| 118 | { | 146 | { |
| 119 | return XFS_DQUOT_LOGRES(mp) + | 147 | return XFS_DQUOT_LOGRES(mp) + |
| 120 | MAX((mp->m_sb.sb_inodesize + | 148 | MAX((xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) + |
| 121 | XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1) + | 149 | xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1, |
| 122 | 128 * (2 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK))), | 150 | XFS_FSB_TO_B(mp, 1))), |
| 123 | (4 * mp->m_sb.sb_sectsize + | 151 | (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) + |
| 124 | 4 * mp->m_sb.sb_sectsize + | 152 | xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 4), |
| 125 | mp->m_sb.sb_sectsize + | 153 | XFS_FSB_TO_B(mp, 1)) + |
| 126 | XFS_ALLOCFREE_LOG_RES(mp, 4) + | 154 | xfs_calc_buf_res(5, 0) + |
| 127 | 128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4)) + | 155 | xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), |
| 128 | 128 * 5 + | 156 | XFS_FSB_TO_B(mp, 1)) + |
| 129 | XFS_ALLOCFREE_LOG_RES(mp, 1) + | 157 | xfs_calc_buf_res(2 + XFS_IALLOC_BLOCKS(mp) + |
| 130 | 128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels + | 158 | mp->m_in_maxlevels, 0))); |
| 131 | XFS_ALLOCFREE_LOG_COUNT(mp, 1)))); | ||
| 132 | } | 159 | } |
| 133 | 160 | ||
| 134 | /* | 161 | /* |
| @@ -148,14 +175,12 @@ xfs_calc_rename_reservation( | |||
| 148 | struct xfs_mount *mp) | 175 | struct xfs_mount *mp) |
| 149 | { | 176 | { |
| 150 | return XFS_DQUOT_LOGRES(mp) + | 177 | return XFS_DQUOT_LOGRES(mp) + |
| 151 | MAX((4 * mp->m_sb.sb_inodesize + | 178 | MAX((xfs_calc_buf_res(4, mp->m_sb.sb_inodesize) + |
| 152 | 2 * XFS_DIROP_LOG_RES(mp) + | 179 | xfs_calc_buf_res(2 * XFS_DIROP_LOG_COUNT(mp), |
| 153 | 128 * (4 + 2 * XFS_DIROP_LOG_COUNT(mp))), | 180 | XFS_FSB_TO_B(mp, 1))), |
| 154 | (3 * mp->m_sb.sb_sectsize + | 181 | (xfs_calc_buf_res(7, mp->m_sb.sb_sectsize) + |
| 155 | 3 * mp->m_sb.sb_sectsize + | 182 | xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 3), |
| 156 | mp->m_sb.sb_sectsize + | 183 | XFS_FSB_TO_B(mp, 1)))); |
| 157 | XFS_ALLOCFREE_LOG_RES(mp, 3) + | ||
| 158 | 128 * (7 + XFS_ALLOCFREE_LOG_COUNT(mp, 3)))); | ||
| 159 | } | 184 | } |
| 160 | 185 | ||
| 161 | /* | 186 | /* |
| @@ -175,15 +200,12 @@ xfs_calc_link_reservation( | |||
| 175 | struct xfs_mount *mp) | 200 | struct xfs_mount *mp) |
| 176 | { | 201 | { |
| 177 | return XFS_DQUOT_LOGRES(mp) + | 202 | return XFS_DQUOT_LOGRES(mp) + |
| 178 | MAX((mp->m_sb.sb_inodesize + | 203 | MAX((xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) + |
| 179 | mp->m_sb.sb_inodesize + | 204 | xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), |
| 180 | XFS_DIROP_LOG_RES(mp) + | 205 | XFS_FSB_TO_B(mp, 1))), |
| 181 | 128 * (2 + XFS_DIROP_LOG_COUNT(mp))), | 206 | (xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + |
| 182 | (mp->m_sb.sb_sectsize + | 207 | xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), |
| 183 | mp->m_sb.sb_sectsize + | 208 | XFS_FSB_TO_B(mp, 1)))); |
| 184 | mp->m_sb.sb_sectsize + | ||
| 185 | XFS_ALLOCFREE_LOG_RES(mp, 1) + | ||
| 186 | 128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1)))); | ||
| 187 | } | 209 | } |
| 188 | 210 | ||
| 189 | /* | 211 | /* |
| @@ -203,15 +225,12 @@ xfs_calc_remove_reservation( | |||
| 203 | struct xfs_mount *mp) | 225 | struct xfs_mount *mp) |
| 204 | { | 226 | { |
| 205 | return XFS_DQUOT_LOGRES(mp) + | 227 | return XFS_DQUOT_LOGRES(mp) + |
| 206 | MAX((mp->m_sb.sb_inodesize + | 228 | MAX((xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) + |
| 207 | mp->m_sb.sb_inodesize + | 229 | xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), |
| 208 | XFS_DIROP_LOG_RES(mp) + | 230 | XFS_FSB_TO_B(mp, 1))), |
| 209 | 128 * (2 + XFS_DIROP_LOG_COUNT(mp))), | 231 | (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + |
| 210 | (2 * mp->m_sb.sb_sectsize + | 232 | xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2), |
| 211 | 2 * mp->m_sb.sb_sectsize + | 233 | XFS_FSB_TO_B(mp, 1)))); |
| 212 | mp->m_sb.sb_sectsize + | ||
| 213 | XFS_ALLOCFREE_LOG_RES(mp, 2) + | ||
| 214 | 128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2)))); | ||
| 215 | } | 234 | } |
| 216 | 235 | ||
| 217 | /* | 236 | /* |
| @@ -233,18 +252,18 @@ xfs_calc_symlink_reservation( | |||
| 233 | struct xfs_mount *mp) | 252 | struct xfs_mount *mp) |
| 234 | { | 253 | { |
| 235 | return XFS_DQUOT_LOGRES(mp) + | 254 | return XFS_DQUOT_LOGRES(mp) + |
| 236 | MAX((mp->m_sb.sb_inodesize + | 255 | MAX((xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) + |
| 237 | mp->m_sb.sb_inodesize + | 256 | xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, 1)) + |
| 238 | XFS_FSB_TO_B(mp, 1) + | 257 | xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), |
| 239 | XFS_DIROP_LOG_RES(mp) + | 258 | XFS_FSB_TO_B(mp, 1)) + |
| 240 | 1024 + | 259 | xfs_calc_buf_res(1, 1024)), |
| 241 | 128 * (4 + XFS_DIROP_LOG_COUNT(mp))), | 260 | (xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + |
| 242 | (2 * mp->m_sb.sb_sectsize + | 261 | xfs_calc_buf_res(XFS_IALLOC_BLOCKS(mp), |
| 243 | XFS_FSB_TO_B(mp, XFS_IALLOC_BLOCKS(mp)) + | 262 | XFS_FSB_TO_B(mp, 1)) + |
| 244 | XFS_FSB_TO_B(mp, mp->m_in_maxlevels) + | 263 | xfs_calc_buf_res(mp->m_in_maxlevels, |
| 245 | XFS_ALLOCFREE_LOG_RES(mp, 1) + | 264 | XFS_FSB_TO_B(mp, 1)) + |
| 246 | 128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels + | 265 | xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), |
| 247 | XFS_ALLOCFREE_LOG_COUNT(mp, 1)))); | 266 | XFS_FSB_TO_B(mp, 1)))); |
| 248 | } | 267 | } |
| 249 | 268 | ||
| 250 | /* | 269 | /* |
| @@ -267,18 +286,19 @@ xfs_calc_create_reservation( | |||
| 267 | struct xfs_mount *mp) | 286 | struct xfs_mount *mp) |
| 268 | { | 287 | { |
| 269 | return XFS_DQUOT_LOGRES(mp) + | 288 | return XFS_DQUOT_LOGRES(mp) + |
| 270 | MAX((mp->m_sb.sb_inodesize + | 289 | MAX((xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) + |
| 271 | mp->m_sb.sb_inodesize + | 290 | xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + |
| 291 | (uint)XFS_FSB_TO_B(mp, 1) + | ||
| 292 | xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), | ||
| 293 | XFS_FSB_TO_B(mp, 1))), | ||
| 294 | (xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + | ||
| 272 | mp->m_sb.sb_sectsize + | 295 | mp->m_sb.sb_sectsize + |
| 273 | XFS_FSB_TO_B(mp, 1) + | 296 | xfs_calc_buf_res(XFS_IALLOC_BLOCKS(mp), |
| 274 | XFS_DIROP_LOG_RES(mp) + | 297 | XFS_FSB_TO_B(mp, 1)) + |
| 275 | 128 * (3 + XFS_DIROP_LOG_COUNT(mp))), | 298 | xfs_calc_buf_res(mp->m_in_maxlevels, |
| 276 | (3 * mp->m_sb.sb_sectsize + | 299 | XFS_FSB_TO_B(mp, 1)) + |
| 277 | XFS_FSB_TO_B(mp, XFS_IALLOC_BLOCKS(mp)) + | 300 | xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), |
| 278 | XFS_FSB_TO_B(mp, mp->m_in_maxlevels) + | 301 | XFS_FSB_TO_B(mp, 1)))); |
| 279 | XFS_ALLOCFREE_LOG_RES(mp, 1) + | ||
| 280 | 128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels + | ||
| 281 | XFS_ALLOCFREE_LOG_COUNT(mp, 1)))); | ||
| 282 | } | 302 | } |
| 283 | 303 | ||
| 284 | /* | 304 | /* |
| @@ -306,16 +326,16 @@ xfs_calc_ifree_reservation( | |||
| 306 | struct xfs_mount *mp) | 326 | struct xfs_mount *mp) |
| 307 | { | 327 | { |
| 308 | return XFS_DQUOT_LOGRES(mp) + | 328 | return XFS_DQUOT_LOGRES(mp) + |
| 309 | mp->m_sb.sb_inodesize + | 329 | xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) + |
| 310 | mp->m_sb.sb_sectsize + | 330 | xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + |
| 311 | mp->m_sb.sb_sectsize + | 331 | xfs_calc_buf_res(1, XFS_FSB_TO_B(mp, 1)) + |
| 312 | XFS_FSB_TO_B(mp, 1) + | ||
| 313 | MAX((__uint16_t)XFS_FSB_TO_B(mp, 1), | 332 | MAX((__uint16_t)XFS_FSB_TO_B(mp, 1), |
| 314 | XFS_INODE_CLUSTER_SIZE(mp)) + | 333 | XFS_INODE_CLUSTER_SIZE(mp)) + |
| 315 | 128 * 5 + | 334 | xfs_calc_buf_res(1, 0) + |
| 316 | XFS_ALLOCFREE_LOG_RES(mp, 1) + | 335 | xfs_calc_buf_res(2 + XFS_IALLOC_BLOCKS(mp) + |
| 317 | 128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels + | 336 | mp->m_in_maxlevels, 0) + |
| 318 | XFS_ALLOCFREE_LOG_COUNT(mp, 1)); | 337 | xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), |
| 338 | XFS_FSB_TO_B(mp, 1)); | ||
| 319 | } | 339 | } |
| 320 | 340 | ||
| 321 | /* | 341 | /* |
| @@ -343,9 +363,9 @@ STATIC uint | |||
| 343 | xfs_calc_growdata_reservation( | 363 | xfs_calc_growdata_reservation( |
| 344 | struct xfs_mount *mp) | 364 | struct xfs_mount *mp) |
| 345 | { | 365 | { |
| 346 | return mp->m_sb.sb_sectsize * 3 + | 366 | return xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + |
| 347 | XFS_ALLOCFREE_LOG_RES(mp, 1) + | 367 | xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), |
| 348 | 128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1)); | 368 | XFS_FSB_TO_B(mp, 1)); |
| 349 | } | 369 | } |
| 350 | 370 | ||
| 351 | /* | 371 | /* |
| @@ -362,12 +382,12 @@ STATIC uint | |||
| 362 | xfs_calc_growrtalloc_reservation( | 382 | xfs_calc_growrtalloc_reservation( |
| 363 | struct xfs_mount *mp) | 383 | struct xfs_mount *mp) |
| 364 | { | 384 | { |
| 365 | return 2 * mp->m_sb.sb_sectsize + | 385 | return xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + |
| 366 | XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) + | 386 | xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), |
| 367 | mp->m_sb.sb_inodesize + | 387 | XFS_FSB_TO_B(mp, 1)) + |
| 368 | XFS_ALLOCFREE_LOG_RES(mp, 1) + | 388 | xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) + |
| 369 | 128 * (3 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + | 389 | xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), |
| 370 | XFS_ALLOCFREE_LOG_COUNT(mp, 1)); | 390 | XFS_FSB_TO_B(mp, 1)); |
| 371 | } | 391 | } |
| 372 | 392 | ||
| 373 | /* | 393 | /* |
| @@ -379,7 +399,7 @@ STATIC uint | |||
| 379 | xfs_calc_growrtzero_reservation( | 399 | xfs_calc_growrtzero_reservation( |
| 380 | struct xfs_mount *mp) | 400 | struct xfs_mount *mp) |
| 381 | { | 401 | { |
| 382 | return mp->m_sb.sb_blocksize + 128; | 402 | return xfs_calc_buf_res(1, mp->m_sb.sb_blocksize); |
| 383 | } | 403 | } |
| 384 | 404 | ||
| 385 | /* | 405 | /* |
| @@ -396,11 +416,10 @@ STATIC uint | |||
| 396 | xfs_calc_growrtfree_reservation( | 416 | xfs_calc_growrtfree_reservation( |
| 397 | struct xfs_mount *mp) | 417 | struct xfs_mount *mp) |
| 398 | { | 418 | { |
| 399 | return mp->m_sb.sb_sectsize + | 419 | return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + |
| 400 | 2 * mp->m_sb.sb_inodesize + | 420 | xfs_calc_buf_res(2, mp->m_sb.sb_inodesize) + |
| 401 | mp->m_sb.sb_blocksize + | 421 | xfs_calc_buf_res(1, mp->m_sb.sb_blocksize) + |
| 402 | mp->m_rsumsize + | 422 | xfs_calc_buf_res(1, mp->m_rsumsize); |
| 403 | 128 * 5; | ||
| 404 | } | 423 | } |
| 405 | 424 | ||
| 406 | /* | 425 | /* |
| @@ -411,7 +430,7 @@ STATIC uint | |||
| 411 | xfs_calc_swrite_reservation( | 430 | xfs_calc_swrite_reservation( |
| 412 | struct xfs_mount *mp) | 431 | struct xfs_mount *mp) |
| 413 | { | 432 | { |
| 414 | return mp->m_sb.sb_inodesize + 128; | 433 | return xfs_calc_buf_res(1, mp->m_sb.sb_inodesize); |
| 415 | } | 434 | } |
| 416 | 435 | ||
| 417 | /* | 436 | /* |
| @@ -421,7 +440,7 @@ xfs_calc_swrite_reservation( | |||
| 421 | STATIC uint | 440 | STATIC uint |
| 422 | xfs_calc_writeid_reservation(xfs_mount_t *mp) | 441 | xfs_calc_writeid_reservation(xfs_mount_t *mp) |
| 423 | { | 442 | { |
| 424 | return mp->m_sb.sb_inodesize + 128; | 443 | return xfs_calc_buf_res(1, mp->m_sb.sb_inodesize); |
| 425 | } | 444 | } |
| 426 | 445 | ||
| 427 | /* | 446 | /* |
| @@ -437,13 +456,13 @@ xfs_calc_addafork_reservation( | |||
| 437 | struct xfs_mount *mp) | 456 | struct xfs_mount *mp) |
| 438 | { | 457 | { |
| 439 | return XFS_DQUOT_LOGRES(mp) + | 458 | return XFS_DQUOT_LOGRES(mp) + |
| 440 | mp->m_sb.sb_inodesize + | 459 | xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) + |
| 441 | mp->m_sb.sb_sectsize * 2 + | 460 | xfs_calc_buf_res(2, mp->m_sb.sb_sectsize) + |
| 442 | mp->m_dirblksize + | 461 | xfs_calc_buf_res(1, mp->m_dirblksize) + |
| 443 | XFS_FSB_TO_B(mp, XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1) + | 462 | xfs_calc_buf_res(XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1, |
| 444 | XFS_ALLOCFREE_LOG_RES(mp, 1) + | 463 | XFS_FSB_TO_B(mp, 1)) + |
| 445 | 128 * (4 + XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1 + | 464 | xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 1), |
| 446 | XFS_ALLOCFREE_LOG_COUNT(mp, 1)); | 465 | XFS_FSB_TO_B(mp, 1)); |
| 447 | } | 466 | } |
| 448 | 467 | ||
| 449 | /* | 468 | /* |
| @@ -461,35 +480,51 @@ STATIC uint | |||
| 461 | xfs_calc_attrinval_reservation( | 480 | xfs_calc_attrinval_reservation( |
| 462 | struct xfs_mount *mp) | 481 | struct xfs_mount *mp) |
| 463 | { | 482 | { |
| 464 | return MAX((mp->m_sb.sb_inodesize + | 483 | return MAX((xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) + |
| 465 | XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) + | 484 | xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK), |
| 466 | 128 * (1 + XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK))), | 485 | XFS_FSB_TO_B(mp, 1))), |
| 467 | (4 * mp->m_sb.sb_sectsize + | 486 | (xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) + |
| 468 | 4 * mp->m_sb.sb_sectsize + | 487 | xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 4), |
| 469 | mp->m_sb.sb_sectsize + | 488 | XFS_FSB_TO_B(mp, 1)))); |
| 470 | XFS_ALLOCFREE_LOG_RES(mp, 4) + | ||
| 471 | 128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4)))); | ||
| 472 | } | 489 | } |
| 473 | 490 | ||
| 474 | /* | 491 | /* |
| 475 | * Setting an attribute. | 492 | * Setting an attribute at mount time. |
| 476 | * the inode getting the attribute | 493 | * the inode getting the attribute |
| 477 | * the superblock for allocations | 494 | * the superblock for allocations |
| 478 | * the agfs extents are allocated from | 495 | * the agfs extents are allocated from |
| 479 | * the attribute btree * max depth | 496 | * the attribute btree * max depth |
| 480 | * the inode allocation btree | 497 | * the inode allocation btree |
| 481 | * Since attribute transaction space is dependent on the size of the attribute, | 498 | * Since attribute transaction space is dependent on the size of the attribute, |
| 482 | * the calculation is done partially at mount time and partially at runtime. | 499 | * the calculation is done partially at mount time and partially at runtime (see |
| 500 | * below). | ||
| 483 | */ | 501 | */ |
| 484 | STATIC uint | 502 | STATIC uint |
| 485 | xfs_calc_attrset_reservation( | 503 | xfs_calc_attrsetm_reservation( |
| 486 | struct xfs_mount *mp) | 504 | struct xfs_mount *mp) |
| 487 | { | 505 | { |
| 488 | return XFS_DQUOT_LOGRES(mp) + | 506 | return XFS_DQUOT_LOGRES(mp) + |
| 489 | mp->m_sb.sb_inodesize + | 507 | xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) + |
| 490 | mp->m_sb.sb_sectsize + | 508 | xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + |
| 491 | XFS_FSB_TO_B(mp, XFS_DA_NODE_MAXDEPTH) + | 509 | xfs_calc_buf_res(XFS_DA_NODE_MAXDEPTH, XFS_FSB_TO_B(mp, 1)); |
| 492 | 128 * (2 + XFS_DA_NODE_MAXDEPTH); | 510 | } |
| 511 | |||
| 512 | /* | ||
| 513 | * Setting an attribute at runtime, transaction space unit per block. | ||
| 514 | * the superblock for allocations: sector size | ||
| 515 | * the inode bmap btree could join or split: max depth * block size | ||
| 516 | * Since the runtime attribute transaction space is dependent on the total | ||
| 517 | * blocks needed for the 1st bmap, here we calculate the space unit for | ||
| 518 | * one block so that the caller could figure out the total space according | ||
| 519 | * to the attribute extent length in blocks by: ext * XFS_ATTRSETRT_LOG_RES(mp). | ||
| 520 | */ | ||
| 521 | STATIC uint | ||
| 522 | xfs_calc_attrsetrt_reservation( | ||
| 523 | struct xfs_mount *mp) | ||
| 524 | { | ||
| 525 | return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize) + | ||
| 526 | xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK), | ||
| 527 | XFS_FSB_TO_B(mp, 1)); | ||
| 493 | } | 528 | } |
| 494 | 529 | ||
| 495 | /* | 530 | /* |
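As the comment above says, the runtime value is a per-block unit that the attribute-set path is expected to multiply by the extent length and add to the mount-time part. A worked example with stand-in numbers (not taken from any real mount):

#include <stdio.h>

int main(void)
{
	unsigned int attrsetm_res = 5000;	/* assumed mount-time portion, bytes */
	unsigned int attrsetrt_res = 600;	/* assumed per-block runtime unit, bytes */
	unsigned int ext = 8;			/* attribute extent length in blocks */

	/* total = XFS_ATTRSETM_LOG_RES(mp) + ext * XFS_ATTRSETRT_LOG_RES(mp) */
	printf("%u\n", attrsetm_res + ext * attrsetrt_res);	/* 5000 + 8 * 600 = 9800 */
	return 0;
}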
| @@ -508,16 +543,15 @@ xfs_calc_attrrm_reservation( | |||
| 508 | struct xfs_mount *mp) | 543 | struct xfs_mount *mp) |
| 509 | { | 544 | { |
| 510 | return XFS_DQUOT_LOGRES(mp) + | 545 | return XFS_DQUOT_LOGRES(mp) + |
| 511 | MAX((mp->m_sb.sb_inodesize + | 546 | MAX((xfs_calc_buf_res(1, mp->m_sb.sb_inodesize) + |
| 512 | XFS_FSB_TO_B(mp, XFS_DA_NODE_MAXDEPTH) + | 547 | xfs_calc_buf_res(XFS_DA_NODE_MAXDEPTH, |
| 513 | XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) + | 548 | XFS_FSB_TO_B(mp, 1)) + |
| 514 | 128 * (1 + XFS_DA_NODE_MAXDEPTH + | 549 | (uint)XFS_FSB_TO_B(mp, |
| 515 | XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK))), | 550 | XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) + |
| 516 | (2 * mp->m_sb.sb_sectsize + | 551 | xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK), 0)), |
| 517 | 2 * mp->m_sb.sb_sectsize + | 552 | (xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + |
| 518 | mp->m_sb.sb_sectsize + | 553 | xfs_calc_buf_res(XFS_ALLOCFREE_LOG_COUNT(mp, 2), |
| 519 | XFS_ALLOCFREE_LOG_RES(mp, 2) + | 554 | XFS_FSB_TO_B(mp, 1)))); |
| 520 | 128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2)))); | ||
| 521 | } | 555 | } |
| 522 | 556 | ||
| 523 | /* | 557 | /* |
| @@ -527,7 +561,78 @@ STATIC uint | |||
| 527 | xfs_calc_clear_agi_bucket_reservation( | 561 | xfs_calc_clear_agi_bucket_reservation( |
| 528 | struct xfs_mount *mp) | 562 | struct xfs_mount *mp) |
| 529 | { | 563 | { |
| 530 | return mp->m_sb.sb_sectsize + 128; | 564 | return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize); |
| 565 | } | ||
| 566 | |||
| 567 | /* | ||
| 568 | * Clearing the quotaflags in the superblock. | ||
| 569 | * the super block for changing quota flags: sector size | ||
| 570 | */ | ||
| 571 | STATIC uint | ||
| 572 | xfs_calc_qm_sbchange_reservation( | ||
| 573 | struct xfs_mount *mp) | ||
| 574 | { | ||
| 575 | return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize); | ||
| 576 | } | ||
| 577 | |||
| 578 | /* | ||
| 579 | * Adjusting quota limits. | ||
| 580 | * the xfs_disk_dquot_t: sizeof(struct xfs_disk_dquot) | ||
| 581 | */ | ||
| 582 | STATIC uint | ||
| 583 | xfs_calc_qm_setqlim_reservation( | ||
| 584 | struct xfs_mount *mp) | ||
| 585 | { | ||
| 586 | return xfs_calc_buf_res(1, sizeof(struct xfs_disk_dquot)); | ||
| 587 | } | ||
| 588 | |||
| 589 | /* | ||
| 590 | * Allocating quota on disk if needed. | ||
| 591 | * the write transaction log space: XFS_WRITE_LOG_RES(mp) | ||
| 592 | * the unit of quota allocation: one system block size | ||
| 593 | */ | ||
| 594 | STATIC uint | ||
| 595 | xfs_calc_qm_dqalloc_reservation( | ||
| 596 | struct xfs_mount *mp) | ||
| 597 | { | ||
| 598 | return XFS_WRITE_LOG_RES(mp) + | ||
| 599 | xfs_calc_buf_res(1, | ||
| 600 | XFS_FSB_TO_B(mp, XFS_DQUOT_CLUSTER_SIZE_FSB) - 1); | ||
| 601 | } | ||
| 602 | |||
| 603 | /* | ||
| 604 | * Turning off quotas. | ||
| 605 | * the xfs_qoff_logitem_t: sizeof(struct xfs_qoff_logitem) * 2 | ||
| 606 | * the superblock for the quota flags: sector size | ||
| 607 | */ | ||
| 608 | STATIC uint | ||
| 609 | xfs_calc_qm_quotaoff_reservation( | ||
| 610 | struct xfs_mount *mp) | ||
| 611 | { | ||
| 612 | return sizeof(struct xfs_qoff_logitem) * 2 + | ||
| 613 | xfs_calc_buf_res(1, mp->m_sb.sb_sectsize); | ||
| 614 | } | ||
| 615 | |||
| 616 | /* | ||
| 617 | * End of turning off quotas. | ||
| 618 | * the xfs_qoff_logitem_t: sizeof(struct xfs_qoff_logitem) * 2 | ||
| 619 | */ | ||
| 620 | STATIC uint | ||
| 621 | xfs_calc_qm_quotaoff_end_reservation( | ||
| 622 | struct xfs_mount *mp) | ||
| 623 | { | ||
| 624 | return sizeof(struct xfs_qoff_logitem) * 2; | ||
| 625 | } | ||
| 626 | |||
| 627 | /* | ||
| 628 | * Syncing the incore super block changes to disk. | ||
| 629 | * the super block to reflect the changes: sector size | ||
| 630 | */ | ||
| 631 | STATIC uint | ||
| 632 | xfs_calc_sb_reservation( | ||
| 633 | struct xfs_mount *mp) | ||
| 634 | { | ||
| 635 | return xfs_calc_buf_res(1, mp->m_sb.sb_sectsize); | ||
| 531 | } | 636 | } |
| 532 | 637 | ||
| 533 | /* | 638 | /* |
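These new helpers follow the existing pattern: each reservation is computed once at mount time into the m_reservations table (see the xfs_trans_init hunk below) and call sites read the cached value through a macro, instead of re-deriving open-coded "sectsize + 128"-style constants as the old xfs_mount.c and xfs_qm.c code did. A stripped-down illustration of that pattern, with generic names rather than the kernel's types:

#include <stdio.h>

struct reservations {
	unsigned int tr_qm_sbchange;
	unsigned int tr_sb;
};

struct mount {
	unsigned int sectsize;
	struct reservations resv;
};

/* Assumed 128-byte per-buffer log overhead, as in the earlier sketch. */
static unsigned int calc_buf_res(unsigned int nbufs, unsigned int size)
{
	return nbufs * (size + 128);
}

/* Computed once at mount time... */
static void trans_init(struct mount *mp)
{
	mp->resv.tr_qm_sbchange = calc_buf_res(1, mp->sectsize);
	mp->resv.tr_sb = calc_buf_res(1, mp->sectsize);
}

/* ...and read back through an accessor at transaction-reserve time. */
#define QM_SBCHANGE_LOG_RES(mp)	((mp)->resv.tr_qm_sbchange)

int main(void)
{
	struct mount m = { .sectsize = 512 };

	trans_init(&m);
	printf("%u\n", QM_SBCHANGE_LOG_RES(&m));	/* 1 * (512 + 128) = 640 */
	return 0;
}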
| @@ -555,12 +660,19 @@ xfs_trans_init( | |||
| 555 | resp->tr_writeid = xfs_calc_writeid_reservation(mp); | 660 | resp->tr_writeid = xfs_calc_writeid_reservation(mp); |
| 556 | resp->tr_addafork = xfs_calc_addafork_reservation(mp); | 661 | resp->tr_addafork = xfs_calc_addafork_reservation(mp); |
| 557 | resp->tr_attrinval = xfs_calc_attrinval_reservation(mp); | 662 | resp->tr_attrinval = xfs_calc_attrinval_reservation(mp); |
| 558 | resp->tr_attrset = xfs_calc_attrset_reservation(mp); | 663 | resp->tr_attrsetm = xfs_calc_attrsetm_reservation(mp); |
| 664 | resp->tr_attrsetrt = xfs_calc_attrsetrt_reservation(mp); | ||
| 559 | resp->tr_attrrm = xfs_calc_attrrm_reservation(mp); | 665 | resp->tr_attrrm = xfs_calc_attrrm_reservation(mp); |
| 560 | resp->tr_clearagi = xfs_calc_clear_agi_bucket_reservation(mp); | 666 | resp->tr_clearagi = xfs_calc_clear_agi_bucket_reservation(mp); |
| 561 | resp->tr_growrtalloc = xfs_calc_growrtalloc_reservation(mp); | 667 | resp->tr_growrtalloc = xfs_calc_growrtalloc_reservation(mp); |
| 562 | resp->tr_growrtzero = xfs_calc_growrtzero_reservation(mp); | 668 | resp->tr_growrtzero = xfs_calc_growrtzero_reservation(mp); |
| 563 | resp->tr_growrtfree = xfs_calc_growrtfree_reservation(mp); | 669 | resp->tr_growrtfree = xfs_calc_growrtfree_reservation(mp); |
| 670 | resp->tr_qm_sbchange = xfs_calc_qm_sbchange_reservation(mp); | ||
| 671 | resp->tr_qm_setqlim = xfs_calc_qm_setqlim_reservation(mp); | ||
| 672 | resp->tr_qm_dqalloc = xfs_calc_qm_dqalloc_reservation(mp); | ||
| 673 | resp->tr_qm_quotaoff = xfs_calc_qm_quotaoff_reservation(mp); | ||
| 674 | resp->tr_qm_equotaoff = xfs_calc_qm_quotaoff_end_reservation(mp); | ||
| 675 | resp->tr_sb = xfs_calc_sb_reservation(mp); | ||
| 564 | } | 676 | } |
| 565 | 677 | ||
| 566 | /* | 678 | /* |
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index c6c0601abd7a..cd29f6171021 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h | |||
| @@ -252,17 +252,19 @@ struct xfs_log_item_desc { | |||
| 252 | * as long as SWRITE logs the entire inode core | 252 | * as long as SWRITE logs the entire inode core |
| 253 | */ | 253 | */ |
| 254 | #define XFS_FSYNC_TS_LOG_RES(mp) ((mp)->m_reservations.tr_swrite) | 254 | #define XFS_FSYNC_TS_LOG_RES(mp) ((mp)->m_reservations.tr_swrite) |
| 255 | #define XFS_WRITEID_LOG_RES(mp) ((mp)->m_reservations.tr_swrite) | 255 | #define XFS_WRITEID_LOG_RES(mp) ((mp)->m_reservations.tr_swrite) |
| 256 | #define XFS_ADDAFORK_LOG_RES(mp) ((mp)->m_reservations.tr_addafork) | 256 | #define XFS_ADDAFORK_LOG_RES(mp) ((mp)->m_reservations.tr_addafork) |
| 257 | #define XFS_ATTRINVAL_LOG_RES(mp) ((mp)->m_reservations.tr_attrinval) | 257 | #define XFS_ATTRINVAL_LOG_RES(mp) ((mp)->m_reservations.tr_attrinval) |
| 258 | #define XFS_ATTRSET_LOG_RES(mp, ext) \ | 258 | #define XFS_ATTRSETM_LOG_RES(mp) ((mp)->m_reservations.tr_attrsetm) |
| 259 | ((mp)->m_reservations.tr_attrset + \ | 259 | #define XFS_ATTRSETRT_LOG_RES(mp) ((mp)->m_reservations.tr_attrsetrt) |
| 260 | (ext * (mp)->m_sb.sb_sectsize) + \ | 260 | #define XFS_ATTRRM_LOG_RES(mp) ((mp)->m_reservations.tr_attrrm) |
| 261 | (ext * XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK))) + \ | ||
| 262 | (128 * (ext + (ext * XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK))))) | ||
| 263 | #define XFS_ATTRRM_LOG_RES(mp) ((mp)->m_reservations.tr_attrrm) | ||
| 264 | #define XFS_CLEAR_AGI_BUCKET_LOG_RES(mp) ((mp)->m_reservations.tr_clearagi) | 261 | #define XFS_CLEAR_AGI_BUCKET_LOG_RES(mp) ((mp)->m_reservations.tr_clearagi) |
| 265 | 262 | #define XFS_QM_SBCHANGE_LOG_RES(mp) ((mp)->m_reservations.tr_qm_sbchange) | |
| 263 | #define XFS_QM_SETQLIM_LOG_RES(mp) ((mp)->m_reservations.tr_qm_setqlim) | ||
| 264 | #define XFS_QM_DQALLOC_LOG_RES(mp) ((mp)->m_reservations.tr_qm_dqalloc) | ||
| 265 | #define XFS_QM_QUOTAOFF_LOG_RES(mp) ((mp)->m_reservations.tr_qm_quotaoff) | ||
| 266 | #define XFS_QM_QUOTAOFF_END_LOG_RES(mp) ((mp)->m_reservations.tr_qm_equotaoff) | ||
| 267 | #define XFS_SB_LOG_RES(mp) ((mp)->m_reservations.tr_sb) | ||
| 266 | 268 | ||
| 267 | /* | 269 | /* |
| 268 | * Various log count values. | 270 | * Various log count values. |
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 6011ee661339..0eda7254305f 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c | |||
| @@ -55,20 +55,6 @@ xfs_ail_check( | |||
| 55 | ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) >= 0); | 55 | ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) >= 0); |
| 56 | 56 | ||
| 57 | 57 | ||
| 58 | #ifdef XFS_TRANS_DEBUG | ||
| 59 | /* | ||
| 60 | * Walk the list checking lsn ordering, and that every entry has the | ||
| 61 | * XFS_LI_IN_AIL flag set. This is really expensive, so only do it | ||
| 62 | * when specifically debugging the transaction subsystem. | ||
| 63 | */ | ||
| 64 | prev_lip = list_entry(&ailp->xa_ail, xfs_log_item_t, li_ail); | ||
| 65 | list_for_each_entry(lip, &ailp->xa_ail, li_ail) { | ||
| 66 | if (&prev_lip->li_ail != &ailp->xa_ail) | ||
| 67 | ASSERT(XFS_LSN_CMP(prev_lip->li_lsn, lip->li_lsn) <= 0); | ||
| 68 | ASSERT((lip->li_flags & XFS_LI_IN_AIL) != 0); | ||
| 69 | prev_lip = lip; | ||
| 70 | } | ||
| 71 | #endif /* XFS_TRANS_DEBUG */ | ||
| 72 | } | 58 | } |
| 73 | #else /* !DEBUG */ | 59 | #else /* !DEBUG */ |
| 74 | #define xfs_ail_check(a,l) | 60 | #define xfs_ail_check(a,l) |
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 4fc17d479d42..3edf5dbee001 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c | |||
| @@ -93,7 +93,7 @@ _xfs_trans_bjoin( | |||
| 93 | xfs_buf_item_init(bp, tp->t_mountp); | 93 | xfs_buf_item_init(bp, tp->t_mountp); |
| 94 | bip = bp->b_fspriv; | 94 | bip = bp->b_fspriv; |
| 95 | ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); | 95 | ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); |
| 96 | ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL)); | 96 | ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_CANCEL)); |
| 97 | ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED)); | 97 | ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED)); |
| 98 | if (reset_recur) | 98 | if (reset_recur) |
| 99 | bip->bli_recur = 0; | 99 | bip->bli_recur = 0; |
| @@ -432,7 +432,7 @@ xfs_trans_brelse(xfs_trans_t *tp, | |||
| 432 | bip = bp->b_fspriv; | 432 | bip = bp->b_fspriv; |
| 433 | ASSERT(bip->bli_item.li_type == XFS_LI_BUF); | 433 | ASSERT(bip->bli_item.li_type == XFS_LI_BUF); |
| 434 | ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); | 434 | ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); |
| 435 | ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL)); | 435 | ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_CANCEL)); |
| 436 | ASSERT(atomic_read(&bip->bli_refcount) > 0); | 436 | ASSERT(atomic_read(&bip->bli_refcount) > 0); |
| 437 | 437 | ||
| 438 | trace_xfs_trans_brelse(bip); | 438 | trace_xfs_trans_brelse(bip); |
| @@ -519,7 +519,7 @@ xfs_trans_bhold(xfs_trans_t *tp, | |||
| 519 | ASSERT(bp->b_transp == tp); | 519 | ASSERT(bp->b_transp == tp); |
| 520 | ASSERT(bip != NULL); | 520 | ASSERT(bip != NULL); |
| 521 | ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); | 521 | ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); |
| 522 | ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL)); | 522 | ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_CANCEL)); |
| 523 | ASSERT(atomic_read(&bip->bli_refcount) > 0); | 523 | ASSERT(atomic_read(&bip->bli_refcount) > 0); |
| 524 | 524 | ||
| 525 | bip->bli_flags |= XFS_BLI_HOLD; | 525 | bip->bli_flags |= XFS_BLI_HOLD; |
| @@ -539,7 +539,7 @@ xfs_trans_bhold_release(xfs_trans_t *tp, | |||
| 539 | ASSERT(bp->b_transp == tp); | 539 | ASSERT(bp->b_transp == tp); |
| 540 | ASSERT(bip != NULL); | 540 | ASSERT(bip != NULL); |
| 541 | ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); | 541 | ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); |
| 542 | ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL)); | 542 | ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_CANCEL)); |
| 543 | ASSERT(atomic_read(&bip->bli_refcount) > 0); | 543 | ASSERT(atomic_read(&bip->bli_refcount) > 0); |
| 544 | ASSERT(bip->bli_flags & XFS_BLI_HOLD); | 544 | ASSERT(bip->bli_flags & XFS_BLI_HOLD); |
| 545 | 545 | ||
| @@ -598,7 +598,7 @@ xfs_trans_log_buf(xfs_trans_t *tp, | |||
| 598 | bip->bli_flags &= ~XFS_BLI_STALE; | 598 | bip->bli_flags &= ~XFS_BLI_STALE; |
| 599 | ASSERT(XFS_BUF_ISSTALE(bp)); | 599 | ASSERT(XFS_BUF_ISSTALE(bp)); |
| 600 | XFS_BUF_UNSTALE(bp); | 600 | XFS_BUF_UNSTALE(bp); |
| 601 | bip->bli_format.blf_flags &= ~XFS_BLF_CANCEL; | 601 | bip->__bli_format.blf_flags &= ~XFS_BLF_CANCEL; |
| 602 | } | 602 | } |
| 603 | 603 | ||
| 604 | tp->t_flags |= XFS_TRANS_DIRTY; | 604 | tp->t_flags |= XFS_TRANS_DIRTY; |
| @@ -643,6 +643,7 @@ xfs_trans_binval( | |||
| 643 | xfs_buf_t *bp) | 643 | xfs_buf_t *bp) |
| 644 | { | 644 | { |
| 645 | xfs_buf_log_item_t *bip = bp->b_fspriv; | 645 | xfs_buf_log_item_t *bip = bp->b_fspriv; |
| 646 | int i; | ||
| 646 | 647 | ||
| 647 | ASSERT(bp->b_transp == tp); | 648 | ASSERT(bp->b_transp == tp); |
| 648 | ASSERT(bip != NULL); | 649 | ASSERT(bip != NULL); |
| @@ -657,8 +658,8 @@ xfs_trans_binval( | |||
| 657 | */ | 658 | */ |
| 658 | ASSERT(XFS_BUF_ISSTALE(bp)); | 659 | ASSERT(XFS_BUF_ISSTALE(bp)); |
| 659 | ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY))); | 660 | ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY))); |
| 660 | ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_INODE_BUF)); | 661 | ASSERT(!(bip->__bli_format.blf_flags & XFS_BLF_INODE_BUF)); |
| 661 | ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); | 662 | ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL); |
| 662 | ASSERT(bip->bli_item.li_desc->lid_flags & XFS_LID_DIRTY); | 663 | ASSERT(bip->bli_item.li_desc->lid_flags & XFS_LID_DIRTY); |
| 663 | ASSERT(tp->t_flags & XFS_TRANS_DIRTY); | 664 | ASSERT(tp->t_flags & XFS_TRANS_DIRTY); |
| 664 | return; | 665 | return; |
| @@ -668,10 +669,12 @@ xfs_trans_binval( | |||
| 668 | 669 | ||
| 669 | bip->bli_flags |= XFS_BLI_STALE; | 670 | bip->bli_flags |= XFS_BLI_STALE; |
| 670 | bip->bli_flags &= ~(XFS_BLI_INODE_BUF | XFS_BLI_LOGGED | XFS_BLI_DIRTY); | 671 | bip->bli_flags &= ~(XFS_BLI_INODE_BUF | XFS_BLI_LOGGED | XFS_BLI_DIRTY); |
| 671 | bip->bli_format.blf_flags &= ~XFS_BLF_INODE_BUF; | 672 | bip->__bli_format.blf_flags &= ~XFS_BLF_INODE_BUF; |
| 672 | bip->bli_format.blf_flags |= XFS_BLF_CANCEL; | 673 | bip->__bli_format.blf_flags |= XFS_BLF_CANCEL; |
| 673 | memset((char *)(bip->bli_format.blf_data_map), 0, | 674 | for (i = 0; i < bip->bli_format_count; i++) { |
| 674 | (bip->bli_format.blf_map_size * sizeof(uint))); | 675 | memset(bip->bli_formats[i].blf_data_map, 0, |
| 676 | (bip->bli_formats[i].blf_map_size * sizeof(uint))); | ||
| 677 | } | ||
| 675 | bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY; | 678 | bip->bli_item.li_desc->lid_flags |= XFS_LID_DIRTY; |
| 676 | tp->t_flags |= XFS_TRANS_DIRTY; | 679 | tp->t_flags |= XFS_TRANS_DIRTY; |
| 677 | } | 680 | } |
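The invalidation hunk above now walks every format segment of the buffer log item rather than zeroing only a single dirty bitmap; this appears to account for buffer log items that carry multiple format segments (bli_formats[] with bli_format_count). A simplified sketch of that loop, with stand-in types:

#include <string.h>

struct buf_log_format {
	unsigned int	map_size;	/* dirty map size in words */
	unsigned int	data_map[16];	/* dirty bitmap */
};

struct buf_log_item {
	int			format_count;	/* number of segments */
	struct buf_log_format	*formats;	/* one entry per segment */
};

/* Zero the dirty bitmap of every segment, as the new loop does. */
static void clear_all_dirty_maps(struct buf_log_item *bip)
{
	int i;

	for (i = 0; i < bip->format_count; i++)
		memset(bip->formats[i].data_map, 0,
		       bip->formats[i].map_size * sizeof(unsigned int));
}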
| @@ -775,5 +778,5 @@ xfs_trans_dquot_buf( | |||
| 775 | type == XFS_BLF_GDQUOT_BUF); | 778 | type == XFS_BLF_GDQUOT_BUF); |
| 776 | ASSERT(atomic_read(&bip->bli_refcount) > 0); | 779 | ASSERT(atomic_read(&bip->bli_refcount) > 0); |
| 777 | 780 | ||
| 778 | bip->bli_format.blf_flags |= type; | 781 | bip->__bli_format.blf_flags |= type; |
| 779 | } | 782 | } |
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index 0c7fa54f309e..642c2d6e1db1 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c | |||
| @@ -516,7 +516,7 @@ xfs_trans_unreserve_and_mod_dquots( | |||
| 516 | int i, j; | 516 | int i, j; |
| 517 | xfs_dquot_t *dqp; | 517 | xfs_dquot_t *dqp; |
| 518 | xfs_dqtrx_t *qtrx, *qa; | 518 | xfs_dqtrx_t *qtrx, *qa; |
| 519 | boolean_t locked; | 519 | bool locked; |
| 520 | 520 | ||
| 521 | if (!tp->t_dqinfo || !(tp->t_flags & XFS_TRANS_DQ_DIRTY)) | 521 | if (!tp->t_dqinfo || !(tp->t_flags & XFS_TRANS_DQ_DIRTY)) |
| 522 | return; | 522 | return; |
| @@ -537,17 +537,17 @@ xfs_trans_unreserve_and_mod_dquots( | |||
| 537 | * about the number of blocks used field, or deltas. | 537 | * about the number of blocks used field, or deltas. |
| 538 | * Also we don't bother to zero the fields. | 538 | * Also we don't bother to zero the fields. |
| 539 | */ | 539 | */ |
| 540 | locked = B_FALSE; | 540 | locked = false; |
| 541 | if (qtrx->qt_blk_res) { | 541 | if (qtrx->qt_blk_res) { |
| 542 | xfs_dqlock(dqp); | 542 | xfs_dqlock(dqp); |
| 543 | locked = B_TRUE; | 543 | locked = true; |
| 544 | dqp->q_res_bcount -= | 544 | dqp->q_res_bcount -= |
| 545 | (xfs_qcnt_t)qtrx->qt_blk_res; | 545 | (xfs_qcnt_t)qtrx->qt_blk_res; |
| 546 | } | 546 | } |
| 547 | if (qtrx->qt_ino_res) { | 547 | if (qtrx->qt_ino_res) { |
| 548 | if (!locked) { | 548 | if (!locked) { |
| 549 | xfs_dqlock(dqp); | 549 | xfs_dqlock(dqp); |
| 550 | locked = B_TRUE; | 550 | locked = true; |
| 551 | } | 551 | } |
| 552 | dqp->q_res_icount -= | 552 | dqp->q_res_icount -= |
| 553 | (xfs_qcnt_t)qtrx->qt_ino_res; | 553 | (xfs_qcnt_t)qtrx->qt_ino_res; |
| @@ -556,7 +556,7 @@ xfs_trans_unreserve_and_mod_dquots( | |||
| 556 | if (qtrx->qt_rtblk_res) { | 556 | if (qtrx->qt_rtblk_res) { |
| 557 | if (!locked) { | 557 | if (!locked) { |
| 558 | xfs_dqlock(dqp); | 558 | xfs_dqlock(dqp); |
| 559 | locked = B_TRUE; | 559 | locked = true; |
| 560 | } | 560 | } |
| 561 | dqp->q_res_rtbcount -= | 561 | dqp->q_res_rtbcount -= |
| 562 | (xfs_qcnt_t)qtrx->qt_rtblk_res; | 562 | (xfs_qcnt_t)qtrx->qt_rtblk_res; |
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c index d2eee20d5f5b..ac6d567704db 100644 --- a/fs/xfs/xfs_trans_inode.c +++ b/fs/xfs/xfs_trans_inode.c | |||
| @@ -33,14 +33,6 @@ | |||
| 33 | #include "xfs_inode_item.h" | 33 | #include "xfs_inode_item.h" |
| 34 | #include "xfs_trace.h" | 34 | #include "xfs_trace.h" |
| 35 | 35 | ||
| 36 | #ifdef XFS_TRANS_DEBUG | ||
| 37 | STATIC void | ||
| 38 | xfs_trans_inode_broot_debug( | ||
| 39 | xfs_inode_t *ip); | ||
| 40 | #else | ||
| 41 | #define xfs_trans_inode_broot_debug(ip) | ||
| 42 | #endif | ||
| 43 | |||
| 44 | /* | 36 | /* |
| 45 | * Add a locked inode to the transaction. | 37 | * Add a locked inode to the transaction. |
| 46 | * | 38 | * |
| @@ -67,8 +59,6 @@ xfs_trans_ijoin( | |||
| 67 | * Get a log_item_desc to point at the new item. | 59 | * Get a log_item_desc to point at the new item. |
| 68 | */ | 60 | */ |
| 69 | xfs_trans_add_item(tp, &iip->ili_item); | 61 | xfs_trans_add_item(tp, &iip->ili_item); |
| 70 | |||
| 71 | xfs_trans_inode_broot_debug(ip); | ||
| 72 | } | 62 | } |
| 73 | 63 | ||
| 74 | /* | 64 | /* |
| @@ -135,34 +125,3 @@ xfs_trans_log_inode( | |||
| 135 | flags |= ip->i_itemp->ili_last_fields; | 125 | flags |= ip->i_itemp->ili_last_fields; |
| 136 | ip->i_itemp->ili_fields |= flags; | 126 | ip->i_itemp->ili_fields |= flags; |
| 137 | } | 127 | } |
| 138 | |||
| 139 | #ifdef XFS_TRANS_DEBUG | ||
| 140 | /* | ||
| 141 | * Keep track of the state of the inode btree root to make sure we | ||
| 142 | * log it properly. | ||
| 143 | */ | ||
| 144 | STATIC void | ||
| 145 | xfs_trans_inode_broot_debug( | ||
| 146 | xfs_inode_t *ip) | ||
| 147 | { | ||
| 148 | xfs_inode_log_item_t *iip; | ||
| 149 | |||
| 150 | ASSERT(ip->i_itemp != NULL); | ||
| 151 | iip = ip->i_itemp; | ||
| 152 | if (iip->ili_root_size != 0) { | ||
| 153 | ASSERT(iip->ili_orig_root != NULL); | ||
| 154 | kmem_free(iip->ili_orig_root); | ||
| 155 | iip->ili_root_size = 0; | ||
| 156 | iip->ili_orig_root = NULL; | ||
| 157 | } | ||
| 158 | if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE) { | ||
| 159 | ASSERT((ip->i_df.if_broot != NULL) && | ||
| 160 | (ip->i_df.if_broot_bytes > 0)); | ||
| 161 | iip->ili_root_size = ip->i_df.if_broot_bytes; | ||
| 162 | iip->ili_orig_root = | ||
| 163 | (char*)kmem_alloc(iip->ili_root_size, KM_SLEEP); | ||
| 164 | memcpy(iip->ili_orig_root, (char*)(ip->i_df.if_broot), | ||
| 165 | iip->ili_root_size); | ||
| 166 | } | ||
| 167 | } | ||
| 168 | #endif | ||
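
The xfs_trans_inode.c hunks above delete an XFS_TRANS_DEBUG-only helper (xfs_trans_inode_broot_debug) along with its forward declaration and call site. The idiom being removed is the usual compile-time debug hook: the real function exists only when the debug option is defined, and otherwise the call expands to nothing. A small sketch of that idiom with made-up names:

    #include <assert.h>

    struct thing { int refcount; };

    #ifdef MY_DEBUG_OPTION
    /* Debug builds: the checks are compiled in. */
    static void debug_check_state(const struct thing *t)
    {
            assert(t->refcount > 0);
    }
    #else
    /* Release builds: the call site compiles away entirely. */
    #define debug_check_state(t)    do { (void)(t); } while (0)
    #endif
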
diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h index 7a41874f4c20..61ba1cfa974c 100644 --- a/fs/xfs/xfs_types.h +++ b/fs/xfs/xfs_types.h | |||
| @@ -32,7 +32,6 @@ typedef unsigned int __uint32_t; | |||
| 32 | typedef signed long long int __int64_t; | 32 | typedef signed long long int __int64_t; |
| 33 | typedef unsigned long long int __uint64_t; | 33 | typedef unsigned long long int __uint64_t; |
| 34 | 34 | ||
| 35 | typedef enum { B_FALSE,B_TRUE } boolean_t; | ||
| 36 | typedef __uint32_t prid_t; /* project ID */ | 35 | typedef __uint32_t prid_t; /* project ID */ |
| 37 | typedef __uint32_t inst_t; /* an instruction */ | 36 | typedef __uint32_t inst_t; /* an instruction */ |
| 38 | 37 | ||
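
With its users converted, the xfs_types.h hunk drops the filesystem-local boolean_t enum entirely; callers now rely on the kernel's native bool/true/false. As a stand-in illustration (userspace stdbool shown where kernel code gets bool from its own headers):

    #include <stdbool.h>

    /* old, removed above:  boolean_t locked = B_FALSE;  */
    /* new: */
    bool locked = false;
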
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index d95f565a390e..77ad74834baa 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c | |||
| @@ -725,7 +725,7 @@ xfs_create( | |||
| 725 | int error; | 725 | int error; |
| 726 | xfs_bmap_free_t free_list; | 726 | xfs_bmap_free_t free_list; |
| 727 | xfs_fsblock_t first_block; | 727 | xfs_fsblock_t first_block; |
| 728 | boolean_t unlock_dp_on_error = B_FALSE; | 728 | bool unlock_dp_on_error = false; |
| 729 | uint cancel_flags; | 729 | uint cancel_flags; |
| 730 | int committed; | 730 | int committed; |
| 731 | prid_t prid; | 731 | prid_t prid; |
| @@ -794,7 +794,7 @@ xfs_create( | |||
| 794 | } | 794 | } |
| 795 | 795 | ||
| 796 | xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); | 796 | xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); |
| 797 | unlock_dp_on_error = B_TRUE; | 797 | unlock_dp_on_error = true; |
| 798 | 798 | ||
| 799 | xfs_bmap_init(&free_list, &first_block); | 799 | xfs_bmap_init(&free_list, &first_block); |
| 800 | 800 | ||
| @@ -830,7 +830,7 @@ xfs_create( | |||
| 830 | * error path. | 830 | * error path. |
| 831 | */ | 831 | */ |
| 832 | xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); | 832 | xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); |
| 833 | unlock_dp_on_error = B_FALSE; | 833 | unlock_dp_on_error = false; |
| 834 | 834 | ||
| 835 | error = xfs_dir_createname(tp, dp, name, ip->i_ino, | 835 | error = xfs_dir_createname(tp, dp, name, ip->i_ino, |
| 836 | &first_block, &free_list, resblks ? | 836 | &first_block, &free_list, resblks ? |
| @@ -1367,7 +1367,7 @@ xfs_symlink( | |||
| 1367 | int pathlen; | 1367 | int pathlen; |
| 1368 | xfs_bmap_free_t free_list; | 1368 | xfs_bmap_free_t free_list; |
| 1369 | xfs_fsblock_t first_block; | 1369 | xfs_fsblock_t first_block; |
| 1370 | boolean_t unlock_dp_on_error = B_FALSE; | 1370 | bool unlock_dp_on_error = false; |
| 1371 | uint cancel_flags; | 1371 | uint cancel_flags; |
| 1372 | int committed; | 1372 | int committed; |
| 1373 | xfs_fileoff_t first_fsb; | 1373 | xfs_fileoff_t first_fsb; |
| @@ -1438,7 +1438,7 @@ xfs_symlink( | |||
| 1438 | } | 1438 | } |
| 1439 | 1439 | ||
| 1440 | xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); | 1440 | xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); |
| 1441 | unlock_dp_on_error = B_TRUE; | 1441 | unlock_dp_on_error = true; |
| 1442 | 1442 | ||
| 1443 | /* | 1443 | /* |
| 1444 | * Check whether the directory allows new symlinks or not. | 1444 | * Check whether the directory allows new symlinks or not. |
| @@ -1484,7 +1484,7 @@ xfs_symlink( | |||
| 1484 | * error path. | 1484 | * error path. |
| 1485 | */ | 1485 | */ |
| 1486 | xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); | 1486 | xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); |
| 1487 | unlock_dp_on_error = B_FALSE; | 1487 | unlock_dp_on_error = false; |
| 1488 | 1488 | ||
| 1489 | /* | 1489 | /* |
| 1490 | * Also attach the dquot(s) to it, if applicable. | 1490 | * Also attach the dquot(s) to it, if applicable. |
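
The xfs_vnodeops.c hunks convert the unlock_dp_on_error flag in xfs_create() and xfs_symlink() to bool as well. The flag implements a lock-ownership hand-off: the error path must unlock the parent directory only until the inode is joined to the transaction, after which the transaction owns the unlock and the flag is cleared. A self-contained sketch of that pattern with hypothetical names, pthreads standing in for the XFS inode lock:

    #include <errno.h>
    #include <pthread.h>
    #include <stdbool.h>

    struct dir { pthread_mutex_t lock; };

    /* Stand-in for xfs_trans_ijoin(): in this sketch the "transaction"
     * simply releases the lock, so ownership visibly leaves the caller. */
    static void hand_lock_to_transaction(struct dir *dp)
    {
            pthread_mutex_unlock(&dp->lock);
    }

    int create_entry(struct dir *dp, bool fail_early)
    {
            bool unlock_dp_on_error = false;

            pthread_mutex_lock(&dp->lock);
            unlock_dp_on_error = true;      /* error path owns the unlock */

            if (fail_early)
                    goto out;

            hand_lock_to_transaction(dp);   /* ownership handed off ...   */
            unlock_dp_on_error = false;     /* ... so error path backs off */
            return 0;

    out:
            if (unlock_dp_on_error)
                    pthread_mutex_unlock(&dp->lock);
            return -ENOSPC;
    }
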
