Diffstat (limited to 'fs')
174 files changed, 5659 insertions, 4008 deletions
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 842d00048a65..01443ce43ee7 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -548,15 +548,6 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
 
 	spin_lock(&sbi->fs_lock);
 	ino->flags &= ~AUTOFS_INF_EXPIRING;
-	spin_lock(&dentry->d_lock);
-	if (!ret) {
-		if ((IS_ROOT(dentry) ||
-		    (autofs_type_indirect(sbi->type) &&
-		     IS_ROOT(dentry->d_parent))) &&
-		    !(dentry->d_flags & DCACHE_NEED_AUTOMOUNT))
-			__managed_dentry_set_automount(dentry);
-	}
-	spin_unlock(&dentry->d_lock);
 	complete_all(&ino->expire_complete);
 	spin_unlock(&sbi->fs_lock);
 	dput(dentry);
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 91b11650722e..c93447604da8 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -124,13 +124,10 @@ static int autofs4_dir_open(struct inode *inode, struct file *file)
 	 * it.
 	 */
 	spin_lock(&sbi->lookup_lock);
-	spin_lock(&dentry->d_lock);
-	if (!d_mountpoint(dentry) && list_empty(&dentry->d_subdirs)) {
-		spin_unlock(&dentry->d_lock);
+	if (!d_mountpoint(dentry) && simple_empty(dentry)) {
 		spin_unlock(&sbi->lookup_lock);
 		return -ENOENT;
 	}
-	spin_unlock(&dentry->d_lock);
 	spin_unlock(&sbi->lookup_lock);
 
 out:
@@ -355,7 +352,6 @@ static struct vfsmount *autofs4_d_automount(struct path *path)
 		status = autofs4_mount_wait(dentry);
 		if (status)
 			return ERR_PTR(status);
-		spin_lock(&sbi->fs_lock);
 		goto done;
 	}
 
@@ -364,8 +360,11 @@ static struct vfsmount *autofs4_d_automount(struct path *path)
 	 * having d_mountpoint() true, so there's no need to call back
 	 * to the daemon.
 	 */
-	if (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode))
+	if (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode)) {
+		spin_unlock(&sbi->fs_lock);
 		goto done;
+	}
+
 	if (!d_mountpoint(dentry)) {
 		/*
 		 * It's possible that user space hasn't removed directories
@@ -379,15 +378,13 @@ static struct vfsmount *autofs4_d_automount(struct path *path)
 		 * require user space behave.
 		 */
 		if (sbi->version > 4) {
-			if (have_submounts(dentry))
+			if (have_submounts(dentry)) {
+				spin_unlock(&sbi->fs_lock);
 				goto done;
+			}
 		} else {
-			spin_lock(&dentry->d_lock);
-			if (!list_empty(&dentry->d_subdirs)) {
-				spin_unlock(&dentry->d_lock);
+			if (!simple_empty(dentry))
 				goto done;
-			}
-			spin_unlock(&dentry->d_lock);
 		}
 		ino->flags |= AUTOFS_INF_PENDING;
 		spin_unlock(&sbi->fs_lock);
@@ -399,28 +396,8 @@ static struct vfsmount *autofs4_d_automount(struct path *path)
 			return ERR_PTR(status);
 		}
 	}
-done:
-	if (!(ino->flags & AUTOFS_INF_EXPIRING)) {
-		/*
-		 * Any needed mounting has been completed and the path
-		 * updated so clear DCACHE_NEED_AUTOMOUNT so we don't
-		 * call ->d_automount() on rootless multi-mounts since
-		 * it can lead to an incorrect ELOOP error return.
-		 *
-		 * Only clear DMANAGED_AUTOMOUNT for rootless multi-mounts and
-		 * symlinks as in all other cases the dentry will be covered by
-		 * an actual mount so ->d_automount() won't be called during
-		 * the follow.
-		 */
-		spin_lock(&dentry->d_lock);
-		if ((!d_mountpoint(dentry) &&
-		    !list_empty(&dentry->d_subdirs)) ||
-		    (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode)))
-			__managed_dentry_clear_automount(dentry);
-		spin_unlock(&dentry->d_lock);
-	}
 	spin_unlock(&sbi->fs_lock);
-
+done:
 	/* Mount succeeded, check if we ended up with a new dentry */
 	dentry = autofs4_mountpoint_changed(path);
 	if (!dentry)
@@ -432,6 +409,8 @@ done:
 int autofs4_d_manage(struct dentry *dentry, bool rcu_walk)
 {
 	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
+	struct autofs_info *ino = autofs4_dentry_ino(dentry);
+	int status;
 
 	DPRINTK("dentry=%p %.*s",
 		dentry, dentry->d_name.len, dentry->d_name.name);
@@ -456,7 +435,32 @@ int autofs4_d_manage(struct dentry *dentry, bool rcu_walk)
 	 * This dentry may be under construction so wait on mount
 	 * completion.
 	 */
-	return autofs4_mount_wait(dentry);
+	status = autofs4_mount_wait(dentry);
+	if (status)
+		return status;
+
+	spin_lock(&sbi->fs_lock);
+	/*
+	 * If the dentry has been selected for expire while we slept
+	 * on the lock then it might go away. We'll deal with that in
+	 * ->d_automount() and wait on a new mount if the expire
+	 * succeeds or return here if it doesn't (since there's no
+	 * mount to follow with a rootless multi-mount).
+	 */
+	if (!(ino->flags & AUTOFS_INF_EXPIRING)) {
+		/*
+		 * Any needed mounting has been completed and the path
+		 * updated so check if this is a rootless multi-mount so
+		 * we can avoid needless calls ->d_automount() and avoid
+		 * an incorrect ELOOP error return.
+		 */
+		if ((!d_mountpoint(dentry) && !simple_empty(dentry)) ||
+		    (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode)))
+			status = -EISDIR;
+	}
+	spin_unlock(&sbi->fs_lock);
+
+	return status;
 }
 
 /* Lookups in the root directory */
@@ -599,9 +603,7 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)
 
 	spin_lock(&sbi->lookup_lock);
 	__autofs4_add_expiring(dentry);
-	spin_lock(&dentry->d_lock);
-	__d_drop(dentry);
-	spin_unlock(&dentry->d_lock);
+	d_drop(dentry);
 	spin_unlock(&sbi->lookup_lock);
 
 	return 0;
@@ -672,15 +674,12 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry)
 		return -EACCES;
 
 	spin_lock(&sbi->lookup_lock);
-	spin_lock(&dentry->d_lock);
-	if (!list_empty(&dentry->d_subdirs)) {
-		spin_unlock(&dentry->d_lock);
+	if (!simple_empty(dentry)) {
 		spin_unlock(&sbi->lookup_lock);
 		return -ENOTEMPTY;
 	}
 	__autofs4_add_expiring(dentry);
-	__d_drop(dentry);
-	spin_unlock(&dentry->d_lock);
+	d_drop(dentry);
 	spin_unlock(&sbi->lookup_lock);
 
 	if (sbi->version < 5)
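A note on the autofs4 hunks above: every open-coded sequence of spin_lock(&dentry->d_lock), list_empty(&dentry->d_subdirs), spin_unlock(&dentry->d_lock) is replaced by a call to simple_empty(), which takes d_lock itself. Below is a minimal userspace sketch of that refactor shape using pthreads; the struct and helper names here are made up for illustration and are not kernel API.

    /*
     * Hedged userspace sketch of the pattern above: an open-coded
     * "lock, check emptiness, unlock" sequence is replaced by one
     * helper that hides the locking, the way simple_empty() hides
     * dentry->d_lock. Illustrative names only.
     */
    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>

    struct node {
            struct node *next;
    };

    struct dir {
            pthread_mutex_t lock;   /* stands in for dentry->d_lock */
            struct node *children;  /* stands in for d_subdirs */
    };

    /* One place owns the locking; callers just ask the question. */
    static bool dir_empty(struct dir *d)
    {
            bool empty;

            pthread_mutex_lock(&d->lock);
            empty = (d->children == NULL);
            pthread_mutex_unlock(&d->lock);
            return empty;
    }

    int main(void)
    {
            struct dir d = { PTHREAD_MUTEX_INITIALIZER, NULL };

            /* Before: every caller re-implemented lock/check/unlock.
             * After: the check reads as a single predicate. */
            printf("empty: %d\n", dir_empty(&d));
            return 0;
    }

The win is the same in both settings: the lock and the predicate it protects live in one place, so call sites cannot get the pairing wrong.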
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index 0e7a6f81ae36..6043567b95c2 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -30,7 +30,7 @@
 #include <asm/cacheflush.h>
 #include <asm/a.out-core.h>
 
-static int load_aout_binary(struct linux_binprm *, struct pt_regs * regs);
+static int load_aout_binary(struct linux_binprm *);
 static int load_aout_library(struct file*);
 
 #ifdef CONFIG_COREDUMP
@@ -201,8 +201,9 @@ static unsigned long __user *create_aout_tables(char __user *p, struct linux_bin
  * libraries.  There is no binary dependent code anywhere else.
  */
 
-static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs)
+static int load_aout_binary(struct linux_binprm * bprm)
 {
+	struct pt_regs *regs = current_pt_regs();
 	struct exec ex;
 	unsigned long error;
 	unsigned long fd_offset;
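The same mechanical change repeats in every binfmt loader below (binfmt_elf, binfmt_elf_fdpic, binfmt_em86, binfmt_flat, binfmt_misc, binfmt_script, binfmt_som): the struct pt_regs * argument disappears from the loader's signature and from search_binary_handler(), and each loader re-derives the register state with current_pt_regs(). A hedged userspace sketch of that refactor shape, using a thread-local variable as a stand-in for per-task state:

    /*
     * Hedged sketch of the refactor applied to every loader below:
     * instead of threading a per-task pointer (pt_regs) through the
     * call chain, each function re-derives it from the current
     * thread's context. The thread-local here is a stand-in, not the
     * kernel mechanism.
     */
    #include <stdio.h>

    static __thread int current_regs;  /* stands in for the task's pt_regs */

    static int *current_pt_regs_demo(void)
    {
            return &current_regs;
    }

    /* Before: static int load_binary(struct binprm *b, int *regs); */
    static int load_binary(const char *name)
    {
            int *regs = current_pt_regs_demo();  /* derived, not passed */

            printf("loading %s, regs at %p\n", name, (void *)regs);
            return 0;
    }

    int main(void)
    {
            return load_binary("a.out");
    }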
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index fbd9f60bd763..6d7d1647a68c 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -44,7 +44,7 @@
 #define user_siginfo_t siginfo_t
 #endif
 
-static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs);
+static int load_elf_binary(struct linux_binprm *bprm);
 static int load_elf_library(struct file *);
 static unsigned long elf_map(struct file *, unsigned long, struct elf_phdr *,
 				int, int, unsigned long);
@@ -558,7 +558,7 @@ static unsigned long randomize_stack_top(unsigned long stack_top)
 #endif
 }
 
-static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
+static int load_elf_binary(struct linux_binprm *bprm)
 {
 	struct file *interpreter = NULL; /* to shut gcc up */
 	unsigned long load_addr = 0, load_bias = 0;
@@ -575,6 +575,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 	unsigned long reloc_func_desc __maybe_unused = 0;
 	int executable_stack = EXSTACK_DEFAULT;
 	unsigned long def_flags = 0;
+	struct pt_regs *regs = current_pt_regs();
 	struct {
 		struct elfhdr elf_ex;
 		struct elfhdr interp_elf_ex;
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index a46049154107..dc84732e554f 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -56,7 +56,7 @@ typedef char *elf_caddr_t;
 
 MODULE_LICENSE("GPL");
 
-static int load_elf_fdpic_binary(struct linux_binprm *, struct pt_regs *);
+static int load_elf_fdpic_binary(struct linux_binprm *);
 static int elf_fdpic_fetch_phdrs(struct elf_fdpic_params *, struct file *);
 static int elf_fdpic_map_file(struct elf_fdpic_params *, struct file *,
 			      struct mm_struct *, const char *);
@@ -164,10 +164,10 @@ static int elf_fdpic_fetch_phdrs(struct elf_fdpic_params *params,
 /*
  * load an fdpic binary into various bits of memory
  */
-static int load_elf_fdpic_binary(struct linux_binprm *bprm,
-				 struct pt_regs *regs)
+static int load_elf_fdpic_binary(struct linux_binprm *bprm)
 {
 	struct elf_fdpic_params exec_params, interp_params;
+	struct pt_regs *regs = current_pt_regs();
 	struct elf_phdr *phdr;
 	unsigned long stack_size, entryaddr;
 #ifdef ELF_FDPIC_PLAT_INIT
diff --git a/fs/binfmt_em86.c b/fs/binfmt_em86.c
index 2790c7e1912e..4e6cce57d113 100644
--- a/fs/binfmt_em86.c
+++ b/fs/binfmt_em86.c
@@ -22,7 +22,7 @@
 #define EM86_INTERP	"/usr/bin/em86"
 #define EM86_I_NAME	"em86"
 
-static int load_em86(struct linux_binprm *bprm,struct pt_regs *regs)
+static int load_em86(struct linux_binprm *bprm)
 {
 	char *interp, *i_name, *i_arg;
 	struct file * file;
@@ -90,7 +90,7 @@ static int load_em86(struct linux_binprm *bprm,struct pt_regs *regs)
 	if (retval < 0)
 		return retval;
 
-	return search_binary_handler(bprm, regs);
+	return search_binary_handler(bprm);
 }
 
 static struct linux_binfmt em86_format = {
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index e280352b28f9..b56371981d16 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -88,7 +88,7 @@ struct lib_info {
 static int load_flat_shared_library(int id, struct lib_info *p);
 #endif
 
-static int load_flat_binary(struct linux_binprm *, struct pt_regs * regs);
+static int load_flat_binary(struct linux_binprm *);
 static int flat_core_dump(struct coredump_params *cprm);
 
 static struct linux_binfmt flat_format = {
@@ -858,9 +858,10 @@ out:
  * libraries.  There is no binary dependent code anywhere else.
  */
 
-static int load_flat_binary(struct linux_binprm * bprm, struct pt_regs * regs)
+static int load_flat_binary(struct linux_binprm * bprm)
 {
 	struct lib_info libinfo;
+	struct pt_regs *regs = current_pt_regs();
 	unsigned long p = bprm->p;
 	unsigned long stack_len;
 	unsigned long start_addr;
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 790b3cddca67..b0b70fbea06c 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -104,7 +104,7 @@ static Node *check_file(struct linux_binprm *bprm)
 /*
  * the loader itself
  */
-static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs)
+static int load_misc_binary(struct linux_binprm *bprm)
 {
 	Node *fmt;
 	struct file * interp_file = NULL;
@@ -199,7 +199,7 @@ static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs)
 
 	bprm->recursion_depth++;
 
-	retval = search_binary_handler (bprm, regs);
+	retval = search_binary_handler(bprm);
 	if (retval < 0)
 		goto _error;
 
diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c
index d3b8c1f63155..8c954997e7f7 100644
--- a/fs/binfmt_script.c
+++ b/fs/binfmt_script.c
@@ -14,7 +14,7 @@
 #include <linux/err.h>
 #include <linux/fs.h>
 
-static int load_script(struct linux_binprm *bprm,struct pt_regs *regs)
+static int load_script(struct linux_binprm *bprm)
 {
 	const char *i_arg, *i_name;
 	char *cp;
@@ -95,7 +95,7 @@ static int load_script(struct linux_binprm *bprm,struct pt_regs *regs)
 	retval = prepare_binprm(bprm);
 	if (retval < 0)
 		return retval;
-	return search_binary_handler(bprm,regs);
+	return search_binary_handler(bprm);
 }
 
 static struct linux_binfmt script_format = {
diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c
index 4517aaff61b4..4e00ed68d4a6 100644
--- a/fs/binfmt_som.c
+++ b/fs/binfmt_som.c
@@ -35,7 +35,7 @@
 
 #include <linux/elf.h>
 
-static int load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs);
+static int load_som_binary(struct linux_binprm * bprm);
 static int load_som_library(struct file *);
 
 /*
@@ -180,13 +180,14 @@ out:
  */
 
 static int
-load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs)
+load_som_binary(struct linux_binprm * bprm)
 {
 	int retval;
 	unsigned int size;
 	unsigned long som_entry;
 	struct som_hdr *som_ex;
 	struct som_exec_auxhdr *hpuxhdr;
+	struct pt_regs *regs = current_pt_regs();
 
 	/* Get the exec-header */
 	som_ex = (struct som_hdr *) bprm->buf;
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 1a1e5e3b1eaf..ab3a456f6650 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -70,19 +70,6 @@ static void bdev_inode_switch_bdi(struct inode *inode,
 	spin_unlock(&dst->wb.list_lock);
 }
 
-sector_t blkdev_max_block(struct block_device *bdev)
-{
-	sector_t retval = ~((sector_t)0);
-	loff_t sz = i_size_read(bdev->bd_inode);
-
-	if (sz) {
-		unsigned int size = block_size(bdev);
-		unsigned int sizebits = blksize_bits(size);
-		retval = (sz >> sizebits);
-	}
-	return retval;
-}
-
 /* Kill _all_ buffers and pagecache , dirty or not.. */
 void kill_bdev(struct block_device *bdev)
 {
@@ -116,8 +103,6 @@ EXPORT_SYMBOL(invalidate_bdev);
 
 int set_blocksize(struct block_device *bdev, int size)
 {
-	struct address_space *mapping;
-
 	/* Size must be a power of two, and between 512 and PAGE_SIZE */
 	if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size))
 		return -EINVAL;
@@ -126,19 +111,6 @@ int set_blocksize(struct block_device *bdev, int size)
 	if (size < bdev_logical_block_size(bdev))
 		return -EINVAL;
 
-	/* Prevent starting I/O or mapping the device */
-	percpu_down_write(&bdev->bd_block_size_semaphore);
-
-	/* Check that the block device is not memory mapped */
-	mapping = bdev->bd_inode->i_mapping;
-	mutex_lock(&mapping->i_mmap_mutex);
-	if (mapping_mapped(mapping)) {
-		mutex_unlock(&mapping->i_mmap_mutex);
-		percpu_up_write(&bdev->bd_block_size_semaphore);
-		return -EBUSY;
-	}
-	mutex_unlock(&mapping->i_mmap_mutex);
-
 	/* Don't change the size if it is same as current */
 	if (bdev->bd_block_size != size) {
 		sync_blockdev(bdev);
@@ -146,9 +118,6 @@ int set_blocksize(struct block_device *bdev, int size)
 		bdev->bd_inode->i_blkbits = blksize_bits(size);
 		kill_bdev(bdev);
 	}
-
-	percpu_up_write(&bdev->bd_block_size_semaphore);
-
 	return 0;
 }
 
@@ -181,52 +150,12 @@ static int
 blkdev_get_block(struct inode *inode, sector_t iblock,
 		struct buffer_head *bh, int create)
 {
-	if (iblock >= blkdev_max_block(I_BDEV(inode))) {
-		if (create)
-			return -EIO;
-
-		/*
-		 * for reads, we're just trying to fill a partial page.
-		 * return a hole, they will have to call get_block again
-		 * before they can fill it, and they will get -EIO at that
-		 * time
-		 */
-		return 0;
-	}
 	bh->b_bdev = I_BDEV(inode);
 	bh->b_blocknr = iblock;
 	set_buffer_mapped(bh);
 	return 0;
 }
 
-static int
-blkdev_get_blocks(struct inode *inode, sector_t iblock,
-		struct buffer_head *bh, int create)
-{
-	sector_t end_block = blkdev_max_block(I_BDEV(inode));
-	unsigned long max_blocks = bh->b_size >> inode->i_blkbits;
-
-	if ((iblock + max_blocks) > end_block) {
-		max_blocks = end_block - iblock;
-		if ((long)max_blocks <= 0) {
-			if (create)
-				return -EIO;	/* write fully beyond EOF */
-			/*
-			 * It is a read which is fully beyond EOF. We return
-			 * a !buffer_mapped buffer
-			 */
-			max_blocks = 0;
-		}
-	}
-
-	bh->b_bdev = I_BDEV(inode);
-	bh->b_blocknr = iblock;
-	bh->b_size = max_blocks << inode->i_blkbits;
-	if (max_blocks)
-		set_buffer_mapped(bh);
-	return 0;
-}
-
 static ssize_t
 blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 			loff_t offset, unsigned long nr_segs)
@@ -235,7 +164,7 @@ blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 	struct inode *inode = file->f_mapping->host;
 
 	return __blockdev_direct_IO(rw, iocb, inode, I_BDEV(inode), iov, offset,
-				    nr_segs, blkdev_get_blocks, NULL, NULL, 0);
+				    nr_segs, blkdev_get_block, NULL, NULL, 0);
 }
 
 int __sync_blockdev(struct block_device *bdev, int wait)
@@ -459,12 +388,6 @@ static struct inode *bdev_alloc_inode(struct super_block *sb)
 	struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL);
 	if (!ei)
 		return NULL;
-
-	if (unlikely(percpu_init_rwsem(&ei->bdev.bd_block_size_semaphore))) {
-		kmem_cache_free(bdev_cachep, ei);
-		return NULL;
-	}
-
 	return &ei->vfs_inode;
 }
 
@@ -473,8 +396,6 @@ static void bdev_i_callback(struct rcu_head *head)
 	struct inode *inode = container_of(head, struct inode, i_rcu);
 	struct bdev_inode *bdi = BDEV_I(inode);
 
-	percpu_free_rwsem(&bdi->bdev.bd_block_size_semaphore);
-
 	kmem_cache_free(bdev_cachep, bdi);
 }
 
@@ -1593,22 +1514,6 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 	return blkdev_ioctl(bdev, mode, cmd, arg);
 }
 
-ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov,
-			unsigned long nr_segs, loff_t pos)
-{
-	ssize_t ret;
-	struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
-
-	percpu_down_read(&bdev->bd_block_size_semaphore);
-
-	ret = generic_file_aio_read(iocb, iov, nr_segs, pos);
-
-	percpu_up_read(&bdev->bd_block_size_semaphore);
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(blkdev_aio_read);
-
 /*
  * Write data to the block device. Only intended for the block device itself
  * and the raw driver which basically is a fake block device.
@@ -1620,16 +1525,12 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
 		unsigned long nr_segs, loff_t pos)
 {
 	struct file *file = iocb->ki_filp;
-	struct block_device *bdev = I_BDEV(file->f_mapping->host);
 	struct blk_plug plug;
 	ssize_t ret;
 
 	BUG_ON(iocb->ki_pos != pos);
 
 	blk_start_plug(&plug);
-
-	percpu_down_read(&bdev->bd_block_size_semaphore);
-
 	ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
 	if (ret > 0 || ret == -EIOCBQUEUED) {
 		ssize_t err;
@@ -1638,62 +1539,27 @@ ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
 		if (err < 0 && ret > 0)
 			ret = err;
 	}
-
-	percpu_up_read(&bdev->bd_block_size_semaphore);
-
 	blk_finish_plug(&plug);
-
 	return ret;
 }
 EXPORT_SYMBOL_GPL(blkdev_aio_write);
 
-static int blkdev_mmap(struct file *file, struct vm_area_struct *vma)
-{
-	int ret;
-	struct block_device *bdev = I_BDEV(file->f_mapping->host);
-
-	percpu_down_read(&bdev->bd_block_size_semaphore);
-
-	ret = generic_file_mmap(file, vma);
-
-	percpu_up_read(&bdev->bd_block_size_semaphore);
-
-	return ret;
-}
-
-static ssize_t blkdev_splice_read(struct file *file, loff_t *ppos,
-				  struct pipe_inode_info *pipe, size_t len,
-				  unsigned int flags)
-{
-	ssize_t ret;
-	struct block_device *bdev = I_BDEV(file->f_mapping->host);
-
-	percpu_down_read(&bdev->bd_block_size_semaphore);
-
-	ret = generic_file_splice_read(file, ppos, pipe, len, flags);
-
-	percpu_up_read(&bdev->bd_block_size_semaphore);
-
-	return ret;
-}
-
-static ssize_t blkdev_splice_write(struct pipe_inode_info *pipe,
-				   struct file *file, loff_t *ppos, size_t len,
-				   unsigned int flags)
+static ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov,
+			unsigned long nr_segs, loff_t pos)
 {
-	ssize_t ret;
-	struct block_device *bdev = I_BDEV(file->f_mapping->host);
-
-	percpu_down_read(&bdev->bd_block_size_semaphore);
-
-	ret = generic_file_splice_write(pipe, file, ppos, len, flags);
+	struct file *file = iocb->ki_filp;
+	struct inode *bd_inode = file->f_mapping->host;
+	loff_t size = i_size_read(bd_inode);
 
-	percpu_up_read(&bdev->bd_block_size_semaphore);
+	if (pos >= size)
+		return 0;
 
-	return ret;
+	size -= pos;
+	if (size < INT_MAX)
+		nr_segs = iov_shorten((struct iovec *)iov, nr_segs, size);
+	return generic_file_aio_read(iocb, iov, nr_segs, pos);
 }
 
-
 /*
  * Try to release a page associated with block device when the system
  * is under memory pressure.
@@ -1724,16 +1590,16 @@ const struct file_operations def_blk_fops = {
 	.llseek		= block_llseek,
 	.read		= do_sync_read,
 	.write		= do_sync_write,
 	.aio_read	= blkdev_aio_read,
 	.aio_write	= blkdev_aio_write,
-	.mmap		= blkdev_mmap,
+	.mmap		= generic_file_mmap,
 	.fsync		= blkdev_fsync,
 	.unlocked_ioctl	= block_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= compat_blkdev_ioctl,
 #endif
-	.splice_read	= blkdev_splice_read,
-	.splice_write	= blkdev_splice_write,
+	.splice_read	= generic_file_splice_read,
+	.splice_write	= generic_file_splice_write,
 };
 
 int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg)
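With the per-bdev block_size_semaphore gone, the new blkdev_aio_read() above bounds the read itself: a read starting at or past the device size returns 0, and anything else is shortened with iov_shorten() so it cannot run past end-of-device. Below is a hedged userspace sketch of that clamping arithmetic; mini_iov_shorten() is an illustrative stand-in for the kernel helper, written against the POSIX struct iovec.

    /*
     * Hedged sketch of the read-clamping logic in blkdev_aio_read()
     * above: stop at end-of-device by shrinking the iovec array to a
     * byte budget. mini_iov_shorten() is an illustrative stand-in for
     * the kernel's iov_shorten(); the 700-byte "device" is made up.
     */
    #include <stdio.h>
    #include <sys/uio.h>

    static unsigned long mini_iov_shorten(struct iovec *iov,
                                          unsigned long nr_segs, size_t to)
    {
            unsigned long seg = 0;
            size_t len = 0;

            while (seg < nr_segs) {
                    seg++;
                    if (len + iov->iov_len >= to) {
                            iov->iov_len = to - len;  /* trim last segment */
                            break;
                    }
                    len += iov->iov_len;
                    iov++;
            }
            return seg;  /* number of segments still in use */
    }

    int main(void)
    {
            char a[512], b[512];
            struct iovec iov[2] = {
                    { .iov_base = a, .iov_len = sizeof(a) },
                    { .iov_base = b, .iov_len = sizeof(b) },
            };
            long long dev_size = 700, pos = 0;  /* 700-byte "device" */
            unsigned long nr = 2;

            if (pos < dev_size)
                    nr = mini_iov_shorten(iov, nr, dev_size - pos);
            printf("segs=%lu last_len=%zu\n", nr, iov[nr - 1].iov_len);
            return 0;
    }

Run as-is this prints segs=2 last_len=188: the second segment is trimmed so the total equals the 700 bytes remaining on the made-up device.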
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index c72ead869507..596617ecd329 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -413,7 +413,7 @@ struct btrfs_root_backup {
 	__le64 bytes_used;
 	__le64 num_devices;
 	/* future */
-	__le64 unsed_64[4];
+	__le64 unused_64[4];
 
 	u8 tree_root_level;
 	u8 chunk_root_level;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 7cda51995c1e..22a0439e5a86 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3416,8 +3416,8 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
 	num_dirty = root->fs_info->dirty_metadata_bytes;
 
 	if (num_dirty > thresh) {
-		balance_dirty_pages_ratelimited_nr(
-				   root->fs_info->btree_inode->i_mapping, 1);
+		balance_dirty_pages_ratelimited(
+				   root->fs_info->btree_inode->i_mapping);
 	}
 	return;
 }
@@ -3437,8 +3437,8 @@ void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
 	num_dirty = root->fs_info->dirty_metadata_bytes;
 
 	if (num_dirty > thresh) {
-		balance_dirty_pages_ratelimited_nr(
-				   root->fs_info->btree_inode->i_mapping, 1);
+		balance_dirty_pages_ratelimited(
+				   root->fs_info->btree_inode->i_mapping);
 	}
 	return;
 }
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 3d3e2c17d8d1..06b2635073f3 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3888,7 +3888,7 @@ static int flush_space(struct btrfs_root *root,
  * @root - the root we're allocating for
  * @block_rsv - the block_rsv we're allocating for
  * @orig_bytes - the number of bytes we want
- * @flush - wether or not we can flush to make our reservation
+ * @flush - whether or not we can flush to make our reservation
  *
  * This will reserve orgi_bytes number of bytes from the space info associated
  * with the block_rsv.  If there is not enough space it will make an attempt to
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index b8cbc8d5c7f7..ce9f79216723 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -234,12 +234,11 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
 }
 
 /**
- * unpint_extent_cache - unpin an extent from the cache
+ * unpin_extent_cache - unpin an extent from the cache
  * @tree:	tree to unpin the extent in
  * @start:	logical offset in the file
  * @len:	length of the extent
  * @gen:	generation that this extent has been modified in
- * @prealloc:	if this is set we need to clear the prealloc flag
  *
  * Called after an extent has been written to disk properly.  Set the generation
  * to the generation that actually added the file item to the inode so we know
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 9ab1bed88116..a8ee75cb96ee 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1346,8 +1346,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
 
 		cond_resched();
 
-		balance_dirty_pages_ratelimited_nr(inode->i_mapping,
-						   dirty_pages);
+		balance_dirty_pages_ratelimited(inode->i_mapping);
 		if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
 			btrfs_btree_balance_dirty(root, 1);
 
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 8fcf9a59c28d..5b3429ab8ec1 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1225,7 +1225,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 		}
 
 		defrag_count += ret;
-		balance_dirty_pages_ratelimited_nr(inode->i_mapping, ret);
+		balance_dirty_pages_ratelimited(inode->i_mapping);
 		mutex_unlock(&inode->i_mutex);
 
 		if (newer_than) {
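The disk-io.c, file.c and ioctl.c hunks above are call-site fallout from a writeback API change: balance_dirty_pages_ratelimited_nr(mapping, nr) goes away and callers use balance_dirty_pages_ratelimited(mapping), with the count of newly dirtied pages accounted per task rather than passed in by each caller. A hedged sketch of that interface shape, using a thread-local counter as a stand-in for the per-task bookkeeping; the names and threshold are invented for illustration, this is not the mm implementation.

    /*
     * Hedged sketch of the API shape behind the btrfs hunks above:
     * callers stop passing a page count and the rate limiter
     * accumulates per-task state itself. Illustrative names only.
     */
    #include <stdio.h>

    #define RATELIMIT_PAGES 32

    static __thread unsigned long nr_dirtied;  /* per-task accumulator */

    static void note_page_dirtied(void)
    {
            nr_dirtied++;
    }

    /* Old shape: balance_dirty_pages_ratelimited_nr(mapping, nr); */
    static void balance_dirty_pages_ratelimited_demo(void)
    {
            if (nr_dirtied < RATELIMIT_PAGES)
                    return;  /* cheap fast path most of the time */
            printf("throttling writer after %lu dirtied pages\n", nr_dirtied);
            nr_dirtied = 0;
    }

    int main(void)
    {
            for (int i = 0; i < 100; i++) {
                    note_page_dirtied();
                    balance_dirty_pages_ratelimited_demo();
            }
            return 0;
    }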
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index dd27a0b46a37..853fc7beedfa 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -76,7 +76,7 @@ struct btrfs_ordered_sum {
 
 #define BTRFS_ORDERED_IOERR 6 /* We had an io error when writing this out */
 
-#define BTRFS_ORDERED_UPDATED_ISIZE 7 /* indicates wether this ordered extent
+#define BTRFS_ORDERED_UPDATED_ISIZE 7 /* indicates whether this ordered extent
 				       * has done its due diligence in updating
 				       * the isize. */
 
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 0f5ebb72a5ea..e3c6ee3cc2ba 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -4294,7 +4294,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 
 	rcu_read_lock();
 	name = rcu_dereference(dev->name);
-	pr_debug("btrfs_map_bio: rw %d, secor=%llu, dev=%lu "
+	pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu "
 		 "(%s id %llu), size=%u\n", rw,
 		 (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev,
 		 name->str, dev->devid, bio->bi_size);
diff --git a/fs/buffer.c b/fs/buffer.c
index b5f044283edb..c017a2dfb909 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -46,8 +46,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
 
 #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)
 
-inline void
-init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
+void init_buffer(struct buffer_head *bh, bh_end_io_t *handler, void *private)
 {
 	bh->b_end_io = handler;
 	bh->b_private = private;
@@ -555,7 +554,7 @@ void emergency_thaw_all(void)
  */
 int sync_mapping_buffers(struct address_space *mapping)
 {
-	struct address_space *buffer_mapping = mapping->assoc_mapping;
+	struct address_space *buffer_mapping = mapping->private_data;
 
 	if (buffer_mapping == NULL || list_empty(&mapping->private_list))
 		return 0;
@@ -588,10 +587,10 @@ void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
 		struct address_space *buffer_mapping = bh->b_page->mapping;
 
 		mark_buffer_dirty(bh);
-		if (!mapping->assoc_mapping) {
-			mapping->assoc_mapping = buffer_mapping;
+		if (!mapping->private_data) {
+			mapping->private_data = buffer_mapping;
 		} else {
-			BUG_ON(mapping->assoc_mapping != buffer_mapping);
+			BUG_ON(mapping->private_data != buffer_mapping);
 		}
 		if (!bh->b_assoc_map) {
 			spin_lock(&buffer_mapping->private_lock);
@@ -788,7 +787,7 @@ void invalidate_inode_buffers(struct inode *inode)
 	if (inode_has_buffers(inode)) {
 		struct address_space *mapping = &inode->i_data;
 		struct list_head *list = &mapping->private_list;
-		struct address_space *buffer_mapping = mapping->assoc_mapping;
+		struct address_space *buffer_mapping = mapping->private_data;
 
 		spin_lock(&buffer_mapping->private_lock);
 		while (!list_empty(list))
@@ -811,7 +810,7 @@ int remove_inode_buffers(struct inode *inode)
 	if (inode_has_buffers(inode)) {
 		struct address_space *mapping = &inode->i_data;
 		struct list_head *list = &mapping->private_list;
-		struct address_space *buffer_mapping = mapping->assoc_mapping;
+		struct address_space *buffer_mapping = mapping->private_data;
 
 		spin_lock(&buffer_mapping->private_lock);
 		while (!list_empty(list)) {
@@ -850,13 +849,10 @@ try_again:
 		if (!bh)
 			goto no_grow;
 
-		bh->b_bdev = NULL;
 		bh->b_this_page = head;
 		bh->b_blocknr = -1;
 		head = bh;
 
-		bh->b_state = 0;
-		atomic_set(&bh->b_count, 0);
 		bh->b_size = size;
 
 		/* Link the buffer to its page */
@@ -911,6 +907,18 @@ link_dev_buffers(struct page *page, struct buffer_head *head)
 	attach_page_buffers(page, head);
 }
 
+static sector_t blkdev_max_block(struct block_device *bdev, unsigned int size)
+{
+	sector_t retval = ~((sector_t)0);
+	loff_t sz = i_size_read(bdev->bd_inode);
+
+	if (sz) {
+		unsigned int sizebits = blksize_bits(size);
+		retval = (sz >> sizebits);
+	}
+	return retval;
+}
+
 /*
  * Initialise the state of a blockdev page's buffers.
  */
@@ -921,7 +929,7 @@ init_page_buffers(struct page *page, struct block_device *bdev,
 	struct buffer_head *head = page_buffers(page);
 	struct buffer_head *bh = head;
 	int uptodate = PageUptodate(page);
-	sector_t end_block = blkdev_max_block(I_BDEV(bdev->bd_inode));
+	sector_t end_block = blkdev_max_block(I_BDEV(bdev->bd_inode), size);
 
 	do {
 		if (!buffer_mapped(bh)) {
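blkdev_max_block() reappears above as a static helper in buffer.c that takes the block size as an argument instead of reading it from the device, so init_page_buffers() measures end-of-device in the block size it is actually using. The computation itself is just the device byte size shifted down by log2(block size). A hedged userspace check of that arithmetic follows, with a made-up 10 GiB device; blksize_bits_demo() mirrors the loop the kernel's blksize_bits() uses.

    /*
     * Hedged userspace check of the blkdev_max_block() arithmetic
     * above: the number of addressable blocks on a device is
     * i_size >> log2(blocksize). The device size is made up.
     */
    #include <stdio.h>

    static unsigned int blksize_bits_demo(unsigned int size)
    {
            unsigned int bits = 8;

            do {
                    bits++;
                    size >>= 1;
            } while (size > 256);
            return bits;
    }

    int main(void)
    {
            unsigned long long sz = 10737418240ULL;  /* 10 GiB "device" */
            unsigned int size = 4096;

            printf("bits=%u max_block=%llu\n", blksize_bits_demo(size),
                   sz >> blksize_bits_demo(size));
            return 0;
    }

For 4096-byte blocks this prints bits=12 max_block=2621440.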
@@ -1553,6 +1561,28 @@ void unmap_underlying_metadata(struct block_device *bdev, sector_t block)
 EXPORT_SYMBOL(unmap_underlying_metadata);
 
 /*
+ * Size is a power-of-two in the range 512..PAGE_SIZE,
+ * and the case we care about most is PAGE_SIZE.
+ *
+ * So this *could* possibly be written with those
+ * constraints in mind (relevant mostly if some
+ * architecture has a slow bit-scan instruction)
+ */
+static inline int block_size_bits(unsigned int blocksize)
+{
+	return ilog2(blocksize);
+}
+
+static struct buffer_head *create_page_buffers(struct page *page, struct inode *inode, unsigned int b_state)
+{
+	BUG_ON(!PageLocked(page));
+
+	if (!page_has_buffers(page))
+		create_empty_buffers(page, 1 << ACCESS_ONCE(inode->i_blkbits), b_state);
+	return page_buffers(page);
+}
+
+/*
  * NOTE! All mapped/uptodate combinations are valid:
  *
  *	Mapped	Uptodate	Meaning
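The comment on block_size_bits() above notes that for a power-of-two block size the log is a single bit-scan. A userspace equivalent using a compiler builtin, illustrative only and not the kernel macro:

    /*
     * Hedged aside on block_size_bits() above: for a power of two,
     * log2 is a count of trailing zero bits. __builtin_ctz() is a
     * GCC/Clang builtin standing in for the kernel's ilog2().
     */
    #include <stdio.h>

    static inline int block_size_bits_demo(unsigned int blocksize)
    {
            return __builtin_ctz(blocksize);  /* log2 of a power of two */
    }

    int main(void)
    {
            for (unsigned int bs = 512; bs <= 4096; bs <<= 1)
                    printf("%u -> %d\n", bs, block_size_bits_demo(bs));
            return 0;
    }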
@@ -1589,19 +1619,13 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
 	sector_t block;
 	sector_t last_block;
 	struct buffer_head *bh, *head;
-	const unsigned blocksize = 1 << inode->i_blkbits;
+	unsigned int blocksize, bbits;
 	int nr_underway = 0;
 	int write_op = (wbc->sync_mode == WB_SYNC_ALL ?
 			WRITE_SYNC : WRITE);
 
-	BUG_ON(!PageLocked(page));
-
-	last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
-
-	if (!page_has_buffers(page)) {
-		create_empty_buffers(page, blocksize,
+	head = create_page_buffers(page, inode,
 			(1 << BH_Dirty)|(1 << BH_Uptodate));
-	}
 
 	/*
 	 * Be very careful.  We have no exclusion from __set_page_dirty_buffers
@@ -1613,9 +1637,12 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
 	 * handle that here by just cleaning them.
 	 */
 
-	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
-	head = page_buffers(page);
 	bh = head;
+	blocksize = bh->b_size;
+	bbits = block_size_bits(blocksize);
+
+	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
+	last_block = (i_size_read(inode) - 1) >> bbits;
 
 	/*
 	 * Get all the dirty buffers mapped to disk addresses and
@@ -1806,12 +1833,10 @@ int __block_write_begin(struct page *page, loff_t pos, unsigned len,
 	BUG_ON(to > PAGE_CACHE_SIZE);
 	BUG_ON(from > to);
 
-	blocksize = 1 << inode->i_blkbits;
-	if (!page_has_buffers(page))
-		create_empty_buffers(page, blocksize, 0);
-	head = page_buffers(page);
+	head = create_page_buffers(page, inode, 0);
+	blocksize = head->b_size;
+	bbits = block_size_bits(blocksize);
 
-	bbits = inode->i_blkbits;
 	block = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
 
 	for(bh = head, block_start = 0; bh != head || !block_start;
@@ -1881,11 +1906,11 @@ static int __block_commit_write(struct inode *inode, struct page *page,
 	unsigned blocksize;
 	struct buffer_head *bh, *head;
 
-	blocksize = 1 << inode->i_blkbits;
+	bh = head = page_buffers(page);
+	blocksize = bh->b_size;
 
-	for(bh = head = page_buffers(page), block_start = 0;
-	    bh != head || !block_start;
-	    block_start=block_end, bh = bh->b_this_page) {
+	block_start = 0;
+	do {
 		block_end = block_start + blocksize;
 		if (block_end <= from || block_start >= to) {
 			if (!buffer_uptodate(bh))
@@ -1895,7 +1920,10 @@ static int __block_commit_write(struct inode *inode, struct page *page,
 			mark_buffer_dirty(bh);
 		}
 		clear_buffer_new(bh);
-	}
+
+		block_start = block_end;
+		bh = bh->b_this_page;
+	} while (bh != head);
 
 	/*
 	 * If this is a partial write which happened to make all buffers
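__block_commit_write() above trades its three-clause for loop for a do/while, the natural traversal for the circular ring that page buffers form: start at the head, follow b_this_page, and stop when the walk returns to the head, which guarantees the head itself is visited exactly once. A small self-contained sketch of that traversal over a hand-built ring:

    /*
     * Hedged sketch of the loop reshaping above: page buffers form a
     * circular singly linked ring, so a do/while that stops when the
     * walk returns to the head visits every node exactly once. The
     * three-node ring here is built by hand for illustration.
     */
    #include <stdio.h>

    struct bh {
            int id;
            struct bh *b_this_page;  /* next buffer in the ring */
    };

    int main(void)
    {
            struct bh b2 = { 2, NULL }, b1 = { 1, &b2 }, b0 = { 0, &b1 };
            struct bh *head = &b0, *bh = head;

            b2.b_this_page = &b0;  /* close the ring */

            do {
                    printf("buffer %d\n", bh->id);
                    bh = bh->b_this_page;
            } while (bh != head);
            return 0;
    }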
@@ -2020,7 +2048,6 @@ EXPORT_SYMBOL(generic_write_end);
 int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc,
 					unsigned long from)
 {
-	struct inode *inode = page->mapping->host;
 	unsigned block_start, block_end, blocksize;
 	unsigned to;
 	struct buffer_head *bh, *head;
@@ -2029,13 +2056,13 @@ int block_is_partially_uptodate(struct page *page, read_descriptor_t *desc,
 	if (!page_has_buffers(page))
 		return 0;
 
-	blocksize = 1 << inode->i_blkbits;
+	head = page_buffers(page);
+	blocksize = head->b_size;
 	to = min_t(unsigned, PAGE_CACHE_SIZE - from, desc->count);
 	to = from + to;
 	if (from < blocksize && to > PAGE_CACHE_SIZE - blocksize)
 		return 0;
 
-	head = page_buffers(page);
 	bh = head;
 	block_start = 0;
 	do {
@@ -2068,18 +2095,16 @@ int block_read_full_page(struct page *page, get_block_t *get_block)
 	struct inode *inode = page->mapping->host;
 	sector_t iblock, lblock;
 	struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE];
-	unsigned int blocksize;
+	unsigned int blocksize, bbits;
 	int nr, i;
 	int fully_mapped = 1;
 
-	BUG_ON(!PageLocked(page));
-	blocksize = 1 << inode->i_blkbits;
-	if (!page_has_buffers(page))
-		create_empty_buffers(page, blocksize, 0);
-	head = page_buffers(page);
+	head = create_page_buffers(page, inode, 0);
+	blocksize = head->b_size;
+	bbits = block_size_bits(blocksize);
 
-	iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
-	lblock = (i_size_read(inode)+blocksize-1) >> inode->i_blkbits;
+	iblock = (sector_t)page->index << (PAGE_CACHE_SHIFT - bbits);
+	lblock = (i_size_read(inode)+blocksize-1) >> bbits;
 	bh = head;
 	nr = 0;
 	i = 0;
@@ -2864,6 +2889,55 @@ static void end_bio_bh_io_sync(struct bio *bio, int err)
 	bio_put(bio);
 }
 
+/*
+ * This allows us to do IO even on the odd last sectors
+ * of a device, even if the bh block size is some multiple
+ * of the physical sector size.
+ *
+ * We'll just truncate the bio to the size of the device,
+ * and clear the end of the buffer head manually.
+ *
+ * Truly out-of-range accesses will turn into actual IO
+ * errors, this only handles the "we need to be able to
+ * do IO at the final sector" case.
+ */
+static void guard_bh_eod(int rw, struct bio *bio, struct buffer_head *bh)
+{
+	sector_t maxsector;
+	unsigned bytes;
+
+	maxsector = i_size_read(bio->bi_bdev->bd_inode) >> 9;
+	if (!maxsector)
+		return;
+
+	/*
+	 * If the *whole* IO is past the end of the device,
+	 * let it through, and the IO layer will turn it into
+	 * an EIO.
+	 */
+	if (unlikely(bio->bi_sector >= maxsector))
+		return;
+
+	maxsector -= bio->bi_sector;
+	bytes = bio->bi_size;
+	if (likely((bytes >> 9) <= maxsector))
+		return;
+
+	/* Uhhuh. We've got a bh that straddles the device size! */
+	bytes = maxsector << 9;
+
+	/* Truncate the bio.. */
+	bio->bi_size = bytes;
+	bio->bi_io_vec[0].bv_len = bytes;
+
+	/* ..and clear the end of the buffer for reads */
+	if ((rw & RW_MASK) == READ) {
+		void *kaddr = kmap_atomic(bh->b_page);
+		memset(kaddr + bh_offset(bh) + bytes, 0, bh->b_size - bytes);
+		kunmap_atomic(kaddr);
+	}
+}
+
 int submit_bh(int rw, struct buffer_head * bh)
 {
 	struct bio *bio;
@@ -2900,6 +2974,9 @@ int submit_bh(int rw, struct buffer_head * bh)
 	bio->bi_end_io = end_bio_bh_io_sync;
 	bio->bi_private = bh;
 
+	/* Take care of bh's that straddle the end of the device */
+	guard_bh_eod(rw, bio, bh);
+
 	bio_get(bio);
 	submit_bio(rw, bio);
 
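submit_bh() now runs every bio through guard_bh_eod(), which truncates I/O that straddles the end of the device and zeroes the tail of the buffer on reads; paired with the simplified blkdev_get_block() earlier in this series, this is what lets the last partial block of an odd-sized device do I/O instead of failing with -EIO. A hedged userspace walk-through of the truncation arithmetic, with made-up numbers:

    /*
     * Hedged walk-through of the guard_bh_eod() arithmetic above: a
     * 4096-byte buffer starting at sector 8 on a 4608-byte (9-sector)
     * device gets truncated to the 512 bytes that actually exist.
     * All sizes here are invented for the example.
     */
    #include <stdio.h>

    int main(void)
    {
            unsigned long long dev_bytes = 4608;   /* 9 sectors */
            unsigned long long bi_sector = 8;      /* bio start sector */
            unsigned int bi_size = 4096;           /* bio length in bytes */
            unsigned long long maxsector = dev_bytes >> 9;

            if (bi_sector >= maxsector) {
                    puts("whole IO past EOD: let the IO layer fail it");
                    return 0;
            }

            maxsector -= bi_sector;                /* sectors remaining */
            if ((bi_size >> 9) > maxsector) {
                    unsigned int bytes = (unsigned int)(maxsector << 9);

                    printf("truncate bio from %u to %u bytes, "
                           "zero the last %u on reads\n",
                           bi_size, bytes, bi_size - bytes);
            }
            return 0;
    }

Here the buffer is cut down to 512 bytes, and a read would then zero the remaining 3584 bytes of the buffer head.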
diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index 2075ddfffa73..21ff76c22a17 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -122,9 +122,17 @@ config CIFS_ACL
 	  Allows fetching CIFS/NTFS ACL from the server. The DACL blob
 	  is handed over to the application/caller.
 
+config CIFS_DEBUG
+	bool "Enable CIFS debugging routines"
+	default y
+	depends on CIFS
+	help
+	   Enabling this option adds helpful debugging messages to
+	   the cifs code which increases the size of the cifs module.
+	   If unsure, say Y.
 config CIFS_DEBUG2
 	bool "Enable additional CIFS debugging routines"
-	depends on CIFS
+	depends on CIFS_DEBUG
 	help
 	  Enabling this option adds a few more debugging routines
 	  to the cifs code which slightly increases the size of
diff --git a/fs/cifs/README b/fs/cifs/README index 22ab7b5b8da7..2d5622f60e11 100644 --- a/fs/cifs/README +++ b/fs/cifs/README | |||
@@ -480,7 +480,7 @@ A partial list of the supported mount options follows: | |||
480 | Unicode on the wire. | 480 | Unicode on the wire. |
481 | nomapchars Do not translate any of these seven characters (default). | 481 | nomapchars Do not translate any of these seven characters (default). |
482 | nocase Request case insensitive path name matching (case | 482 | nocase Request case insensitive path name matching (case |
483 | sensitive is the default if the server suports it). | 483 | sensitive is the default if the server supports it). |
484 | (mount option "ignorecase" is identical to "nocase") | 484 | (mount option "ignorecase" is identical to "nocase") |
485 | posixpaths If CIFS Unix extensions are supported, attempt to | 485 | posixpaths If CIFS Unix extensions are supported, attempt to |
486 | negotiate posix path name support which allows certain | 486 | negotiate posix path name support which allows certain |
diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h index c0c68bb492d7..86e92ef2abc1 100644 --- a/fs/cifs/cifs_debug.h +++ b/fs/cifs/cifs_debug.h | |||
@@ -18,7 +18,6 @@ | |||
18 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA | 18 | * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
19 | * | 19 | * |
20 | */ | 20 | */ |
21 | #define CIFS_DEBUG /* BB temporary */ | ||
22 | 21 | ||
23 | #ifndef _H_CIFS_DEBUG | 22 | #ifndef _H_CIFS_DEBUG |
24 | #define _H_CIFS_DEBUG | 23 | #define _H_CIFS_DEBUG |
@@ -37,49 +36,43 @@ void dump_smb(void *, int); | |||
37 | #define CIFS_RC 0x02 | 36 | #define CIFS_RC 0x02 |
38 | #define CIFS_TIMER 0x04 | 37 | #define CIFS_TIMER 0x04 |
39 | 38 | ||
39 | extern int cifsFYI; | ||
40 | extern int cifsERROR; | ||
41 | |||
40 | /* | 42 | /* |
41 | * debug ON | 43 | * debug ON |
42 | * -------- | 44 | * -------- |
43 | */ | 45 | */ |
44 | #ifdef CIFS_DEBUG | 46 | #ifdef CONFIG_CIFS_DEBUG |
45 | 47 | ||
46 | /* information message: e.g., configuration, major event */ | 48 | /* information message: e.g., configuration, major event */ |
47 | extern int cifsFYI; | 49 | #define cifsfyi(fmt, ...) \ |
48 | #define cifsfyi(fmt, arg...) \ | ||
49 | do { \ | 50 | do { \ |
50 | if (cifsFYI & CIFS_INFO) \ | 51 | if (cifsFYI & CIFS_INFO) \ |
51 | printk(KERN_DEBUG "%s: " fmt "\n", __FILE__, ##arg); \ | 52 | printk(KERN_DEBUG "%s: " fmt "\n", \ |
53 | __FILE__, ##__VA_ARGS__); \ | ||
52 | } while (0) | 54 | } while (0) |
53 | 55 | ||
54 | #define cFYI(set, fmt, arg...) \ | 56 | #define cFYI(set, fmt, ...) \ |
55 | do { \ | 57 | do { \ |
56 | if (set) \ | 58 | if (set) \ |
57 | cifsfyi(fmt, ##arg); \ | 59 | cifsfyi(fmt, ##__VA_ARGS__); \ |
58 | } while (0) | 60 | } while (0) |
59 | 61 | ||
60 | #define cifswarn(fmt, arg...) \ | 62 | #define cifswarn(fmt, ...) \ |
61 | printk(KERN_WARNING fmt "\n", ##arg) | 63 | printk(KERN_WARNING fmt "\n", ##__VA_ARGS__) |
62 | 64 | ||
63 | /* debug event message: */ | 65 | /* error event message: e.g., i/o error */ |
64 | extern int cifsERROR; | 66 | #define cifserror(fmt, ...) \ |
65 | |||
66 | #define cEVENT(fmt, arg...) \ | ||
67 | do { \ | 67 | do { \ |
68 | if (cifsERROR) \ | 68 | if (cifsERROR) \ |
69 | printk(KERN_EVENT "%s: " fmt "\n", __FILE__, ##arg); \ | 69 | printk(KERN_ERR "CIFS VFS: " fmt "\n", ##__VA_ARGS__); \ |
70 | } while (0) | ||
71 | |||
72 | /* error event message: e.g., i/o error */ | ||
73 | #define cifserror(fmt, arg...) \ | ||
74 | do { \ | ||
75 | if (cifsERROR) \ | ||
76 | printk(KERN_ERR "CIFS VFS: " fmt "\n", ##arg); \ | ||
77 | } while (0) | 70 | } while (0) |
78 | 71 | ||
79 | #define cERROR(set, fmt, arg...) \ | 72 | #define cERROR(set, fmt, ...) \ |
80 | do { \ | 73 | do { \ |
81 | if (set) \ | 74 | if (set) \ |
82 | cifserror(fmt, ##arg); \ | 75 | cifserror(fmt, ##__VA_ARGS__); \ |
83 | } while (0) | 76 | } while (0) |
84 | 77 | ||
85 | /* | 78 | /* |
@@ -87,10 +80,27 @@ do { \ | |||
87 | * --------- | 80 | * --------- |
88 | */ | 81 | */ |
89 | #else /* _CIFS_DEBUG */ | 82 | #else /* _CIFS_DEBUG */ |
90 | #define cERROR(set, fmt, arg...) | 83 | #define cifsfyi(fmt, ...) \ |
91 | #define cEVENT(fmt, arg...) | 84 | do { \ |
92 | #define cFYI(set, fmt, arg...) | 85 | if (0) \ |
93 | #define cifserror(fmt, arg...) | 86 | printk(KERN_DEBUG "%s: " fmt "\n", \ |
87 | __FILE__, ##__VA_ARGS__); \ | ||
88 | } while (0) | ||
89 | #define cFYI(set, fmt, ...) \ | ||
90 | do { \ | ||
91 | if (0 && set) \ | ||
92 | cifsfyi(fmt, ##__VA_ARGS__); \ | ||
93 | } while (0) | ||
94 | #define cifserror(fmt, ...) \ | ||
95 | do { \ | ||
96 | if (0) \ | ||
97 | printk(KERN_ERR "CIFS VFS: " fmt "\n", ##__VA_ARGS__); \ | ||
98 | } while (0) | ||
99 | #define cERROR(set, fmt, ...) \ | ||
100 | do { \ | ||
101 | if (0 && set) \ | ||
102 | cifserror(fmt, ##__VA_ARGS__); \ | ||
103 | } while (0) | ||
94 | #endif /* _CIFS_DEBUG */ | 104 | #endif /* _CIFS_DEBUG */ |
95 | 105 | ||
96 | #endif /* _H_CIFS_DEBUG */ | 106 | #endif /* _H_CIFS_DEBUG */ |
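The new debug-OFF branch above replaces empty macro bodies with if (0) wrappers. A minimal sketch of why, using a hypothetical my_dbg() over stdio: the compiler type-checks the format string and arguments in both configurations and then discards the dead call, so a build without the debug option catches the same format-string mistakes as one with it:

#include <stdio.h>

#ifdef MY_DEBUG
#define my_dbg(fmt, ...) \
do { \
        fprintf(stderr, "%s: " fmt "\n", __FILE__, ##__VA_ARGS__); \
} while (0)
#else
#define my_dbg(fmt, ...) \
do { \
        if (0) \
                fprintf(stderr, "%s: " fmt "\n", __FILE__, ##__VA_ARGS__); \
} while (0)
#endif

int main(void)
{
        int rc = -5;

        my_dbg("rc=%d", rc);    /* format checked whether or not MY_DEBUG is set */
        return 0;
}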
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 0fb15bbbe43c..75c1ee699143 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c | |||
@@ -42,135 +42,27 @@ static const struct cifs_sid sid_authusers = { | |||
42 | /* group users */ | 42 | /* group users */ |
43 | static const struct cifs_sid sid_user = {1, 2 , {0, 0, 0, 0, 0, 5}, {} }; | 43 | static const struct cifs_sid sid_user = {1, 2 , {0, 0, 0, 0, 0, 5}, {} }; |
44 | 44 | ||
45 | const struct cred *root_cred; | 45 | static const struct cred *root_cred; |
46 | |||
47 | static void | ||
48 | shrink_idmap_tree(struct rb_root *root, int nr_to_scan, int *nr_rem, | ||
49 | int *nr_del) | ||
50 | { | ||
51 | struct rb_node *node; | ||
52 | struct rb_node *tmp; | ||
53 | struct cifs_sid_id *psidid; | ||
54 | |||
55 | node = rb_first(root); | ||
56 | while (node) { | ||
57 | tmp = node; | ||
58 | node = rb_next(tmp); | ||
59 | psidid = rb_entry(tmp, struct cifs_sid_id, rbnode); | ||
60 | if (nr_to_scan == 0 || *nr_del == nr_to_scan) | ||
61 | ++(*nr_rem); | ||
62 | else { | ||
63 | if (time_after(jiffies, psidid->time + SID_MAP_EXPIRE) | ||
64 | && psidid->refcount == 0) { | ||
65 | rb_erase(tmp, root); | ||
66 | ++(*nr_del); | ||
67 | } else | ||
68 | ++(*nr_rem); | ||
69 | } | ||
70 | } | ||
71 | } | ||
72 | |||
73 | /* | ||
74 | * Run idmap cache shrinker. | ||
75 | */ | ||
76 | static int | ||
77 | cifs_idmap_shrinker(struct shrinker *shrink, struct shrink_control *sc) | ||
78 | { | ||
79 | int nr_to_scan = sc->nr_to_scan; | ||
80 | int nr_del = 0; | ||
81 | int nr_rem = 0; | ||
82 | struct rb_root *root; | ||
83 | |||
84 | root = &uidtree; | ||
85 | spin_lock(&siduidlock); | ||
86 | shrink_idmap_tree(root, nr_to_scan, &nr_rem, &nr_del); | ||
87 | spin_unlock(&siduidlock); | ||
88 | |||
89 | root = &gidtree; | ||
90 | spin_lock(&sidgidlock); | ||
91 | shrink_idmap_tree(root, nr_to_scan, &nr_rem, &nr_del); | ||
92 | spin_unlock(&sidgidlock); | ||
93 | |||
94 | root = &siduidtree; | ||
95 | spin_lock(&uidsidlock); | ||
96 | shrink_idmap_tree(root, nr_to_scan, &nr_rem, &nr_del); | ||
97 | spin_unlock(&uidsidlock); | ||
98 | |||
99 | root = &sidgidtree; | ||
100 | spin_lock(&gidsidlock); | ||
101 | shrink_idmap_tree(root, nr_to_scan, &nr_rem, &nr_del); | ||
102 | spin_unlock(&gidsidlock); | ||
103 | |||
104 | return nr_rem; | ||
105 | } | ||
106 | |||
107 | static void | ||
108 | sid_rb_insert(struct rb_root *root, unsigned long cid, | ||
109 | struct cifs_sid_id **psidid, char *typestr) | ||
110 | { | ||
111 | char *strptr; | ||
112 | struct rb_node *node = root->rb_node; | ||
113 | struct rb_node *parent = NULL; | ||
114 | struct rb_node **linkto = &(root->rb_node); | ||
115 | struct cifs_sid_id *lsidid; | ||
116 | |||
117 | while (node) { | ||
118 | lsidid = rb_entry(node, struct cifs_sid_id, rbnode); | ||
119 | parent = node; | ||
120 | if (cid > lsidid->id) { | ||
121 | linkto = &(node->rb_left); | ||
122 | node = node->rb_left; | ||
123 | } | ||
124 | if (cid < lsidid->id) { | ||
125 | linkto = &(node->rb_right); | ||
126 | node = node->rb_right; | ||
127 | } | ||
128 | } | ||
129 | |||
130 | (*psidid)->id = cid; | ||
131 | (*psidid)->time = jiffies - (SID_MAP_RETRY + 1); | ||
132 | (*psidid)->refcount = 0; | ||
133 | |||
134 | sprintf((*psidid)->sidstr, "%s", typestr); | ||
135 | strptr = (*psidid)->sidstr + strlen((*psidid)->sidstr); | ||
136 | sprintf(strptr, "%ld", cid); | ||
137 | |||
138 | clear_bit(SID_ID_PENDING, &(*psidid)->state); | ||
139 | clear_bit(SID_ID_MAPPED, &(*psidid)->state); | ||
140 | |||
141 | rb_link_node(&(*psidid)->rbnode, parent, linkto); | ||
142 | rb_insert_color(&(*psidid)->rbnode, root); | ||
143 | } | ||
144 | |||
145 | static struct cifs_sid_id * | ||
146 | sid_rb_search(struct rb_root *root, unsigned long cid) | ||
147 | { | ||
148 | struct rb_node *node = root->rb_node; | ||
149 | struct cifs_sid_id *lsidid; | ||
150 | |||
151 | while (node) { | ||
152 | lsidid = rb_entry(node, struct cifs_sid_id, rbnode); | ||
153 | if (cid > lsidid->id) | ||
154 | node = node->rb_left; | ||
155 | else if (cid < lsidid->id) | ||
156 | node = node->rb_right; | ||
157 | else /* node found */ | ||
158 | return lsidid; | ||
159 | } | ||
160 | |||
161 | return NULL; | ||
162 | } | ||
163 | |||
164 | static struct shrinker cifs_shrinker = { | ||
165 | .shrink = cifs_idmap_shrinker, | ||
166 | .seeks = DEFAULT_SEEKS, | ||
167 | }; | ||
168 | 46 | ||
169 | static int | 47 | static int |
170 | cifs_idmap_key_instantiate(struct key *key, struct key_preparsed_payload *prep) | 48 | cifs_idmap_key_instantiate(struct key *key, struct key_preparsed_payload *prep) |
171 | { | 49 | { |
172 | char *payload; | 50 | char *payload; |
173 | 51 | ||
52 | /* | ||
53 | * If the payload is less than or equal to the size of a pointer, then | ||
54 | * an allocation here is wasteful. Just copy the data directly to the | ||
55 | * payload.value union member instead. | ||
56 | * | ||
57 | * With this however, you must check the datalen before trying to | ||
58 | * dereference payload.data! | ||
59 | */ | ||
60 | if (prep->datalen <= sizeof(key->payload)) { | ||
61 | key->payload.value = 0; | ||
62 | memcpy(&key->payload.value, prep->data, prep->datalen); | ||
63 | key->datalen = prep->datalen; | ||
64 | return 0; | ||
65 | } | ||
174 | payload = kmalloc(prep->datalen, GFP_KERNEL); | 66 | payload = kmalloc(prep->datalen, GFP_KERNEL); |
175 | if (!payload) | 67 | if (!payload) |
176 | return -ENOMEM; | 68 | return -ENOMEM; |
@@ -184,10 +76,11 @@ cifs_idmap_key_instantiate(struct key *key, struct key_preparsed_payload *prep) | |||
184 | static inline void | 76 | static inline void |
185 | cifs_idmap_key_destroy(struct key *key) | 77 | cifs_idmap_key_destroy(struct key *key) |
186 | { | 78 | { |
187 | kfree(key->payload.data); | 79 | if (key->datalen > sizeof(key->payload)) |
80 | kfree(key->payload.data); | ||
188 | } | 81 | } |
189 | 82 | ||
190 | struct key_type cifs_idmap_key_type = { | 83 | static struct key_type cifs_idmap_key_type = { |
191 | .name = "cifs.idmap", | 84 | .name = "cifs.idmap", |
192 | .instantiate = cifs_idmap_key_instantiate, | 85 | .instantiate = cifs_idmap_key_instantiate, |
193 | .destroy = cifs_idmap_key_destroy, | 86 | .destroy = cifs_idmap_key_destroy, |
@@ -195,221 +88,174 @@ struct key_type cifs_idmap_key_type = { | |||
195 | .match = user_match, | 88 | .match = user_match, |
196 | }; | 89 | }; |
197 | 90 | ||
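cifs_idmap_key_instantiate() above now embeds payloads that fit within the payload union instead of allocating. A user-space sketch of that trick with illustrative names (not the kernel key API); note the destructor must check datalen before treating the slot as a heap pointer:

#include <stdlib.h>
#include <string.h>

struct small_key {
        unsigned short datalen;
        union {
                void *data;             /* heap pointer for large payloads */
                unsigned long value;    /* inline storage for small ones */
        } payload;
};

static int key_set(struct small_key *k, const void *data, unsigned short len)
{
        if (len <= sizeof(k->payload)) {
                k->payload.value = 0;
                memcpy(&k->payload.value, data, len);   /* no allocation */
        } else {
                k->payload.data = malloc(len);
                if (!k->payload.data)
                        return -1;
                memcpy(k->payload.data, data, len);
        }
        k->datalen = len;
        return 0;
}

static void key_destroy(struct small_key *k)
{
        /* only free when the payload really is a heap pointer */
        if (k->datalen > sizeof(k->payload))
                free(k->payload.data);
}

int main(void)
{
        struct small_key k = { 0 };
        unsigned int uid = 1000;

        if (key_set(&k, &uid, sizeof(uid)))     /* 4 bytes: stored inline */
                return 1;
        key_destroy(&k);                        /* no free needed */
        return 0;
}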
198 | static void | 91 | static char * |
199 | sid_to_str(struct cifs_sid *sidptr, char *sidstr) | 92 | sid_to_key_str(struct cifs_sid *sidptr, unsigned int type) |
200 | { | 93 | { |
201 | int i; | 94 | int i, len; |
202 | unsigned long saval; | 95 | unsigned int saval; |
203 | char *strptr; | 96 | char *sidstr, *strptr; |
97 | unsigned long long id_auth_val; | ||
98 | |||
99 | /* 3 bytes for prefix */ | ||
100 | sidstr = kmalloc(3 + SID_STRING_BASE_SIZE + | ||
101 | (SID_STRING_SUBAUTH_SIZE * sidptr->num_subauth), | ||
102 | GFP_KERNEL); | ||
103 | if (!sidstr) | ||
104 | return sidstr; | ||
204 | 105 | ||
205 | strptr = sidstr; | 106 | strptr = sidstr; |
107 | len = sprintf(strptr, "%cs:S-%hhu", type == SIDOWNER ? 'o' : 'g', | ||
108 | sidptr->revision); | ||
109 | strptr += len; | ||
110 | |||
111 | /* The authority field is a single 48-bit number */ | ||
112 | id_auth_val = (unsigned long long)sidptr->authority[5]; | ||
113 | id_auth_val |= (unsigned long long)sidptr->authority[4] << 8; | ||
114 | id_auth_val |= (unsigned long long)sidptr->authority[3] << 16; | ||
115 | id_auth_val |= (unsigned long long)sidptr->authority[2] << 24; | ||
116 | id_auth_val |= (unsigned long long)sidptr->authority[1] << 32; | ||
117 | id_auth_val |= (unsigned long long)sidptr->authority[0] << 48; | ||
206 | 118 | ||
207 | sprintf(strptr, "%s", "S"); | 119 | /* |
208 | strptr = sidstr + strlen(sidstr); | 120 | * MS-DTYP states that if the authority is >= 2^32, then it should be |
209 | 121 | * expressed as a hex value. | |
210 | sprintf(strptr, "-%d", sidptr->revision); | 122 | */ |
211 | strptr = sidstr + strlen(sidstr); | 123 | if (id_auth_val <= UINT_MAX) |
124 | len = sprintf(strptr, "-%llu", id_auth_val); | ||
125 | else | ||
126 | len = sprintf(strptr, "-0x%llx", id_auth_val); | ||
212 | 127 | ||
213 | for (i = 0; i < 6; ++i) { | 128 | strptr += len; |
214 | if (sidptr->authority[i]) { | ||
215 | sprintf(strptr, "-%d", sidptr->authority[i]); | ||
216 | strptr = sidstr + strlen(sidstr); | ||
217 | } | ||
218 | } | ||
219 | 129 | ||
220 | for (i = 0; i < sidptr->num_subauth; ++i) { | 130 | for (i = 0; i < sidptr->num_subauth; ++i) { |
221 | saval = le32_to_cpu(sidptr->sub_auth[i]); | 131 | saval = le32_to_cpu(sidptr->sub_auth[i]); |
222 | sprintf(strptr, "-%ld", saval); | 132 | len = sprintf(strptr, "-%u", saval); |
223 | strptr = sidstr + strlen(sidstr); | 133 | strptr += len; |
224 | } | 134 | } |
225 | } | ||
226 | 135 | ||
227 | static void | 136 | return sidstr; |
228 | cifs_copy_sid(struct cifs_sid *dst, const struct cifs_sid *src) | ||
229 | { | ||
230 | memcpy(dst, src, sizeof(*dst)); | ||
231 | dst->num_subauth = min_t(u8, src->num_subauth, NUM_SUBAUTHS); | ||
232 | } | 137 | } |
233 | 138 | ||
234 | static void | 139 | /* |
235 | id_rb_insert(struct rb_root *root, struct cifs_sid *sidptr, | 140 | * if the two SIDs (roughly equivalent to a UUID for a user or group) are |
236 | struct cifs_sid_id **psidid, char *typestr) | 141 | * the same returns zero, if they do not match returns non-zero. |
142 | */ | ||
143 | static int | ||
144 | compare_sids(const struct cifs_sid *ctsid, const struct cifs_sid *cwsid) | ||
237 | { | 145 | { |
238 | int rc; | 146 | int i; |
239 | char *strptr; | 147 | int num_subauth, num_sat, num_saw; |
240 | struct rb_node *node = root->rb_node; | ||
241 | struct rb_node *parent = NULL; | ||
242 | struct rb_node **linkto = &(root->rb_node); | ||
243 | struct cifs_sid_id *lsidid; | ||
244 | |||
245 | while (node) { | ||
246 | lsidid = rb_entry(node, struct cifs_sid_id, rbnode); | ||
247 | parent = node; | ||
248 | rc = compare_sids(sidptr, &((lsidid)->sid)); | ||
249 | if (rc > 0) { | ||
250 | linkto = &(node->rb_left); | ||
251 | node = node->rb_left; | ||
252 | } else if (rc < 0) { | ||
253 | linkto = &(node->rb_right); | ||
254 | node = node->rb_right; | ||
255 | } | ||
256 | } | ||
257 | |||
258 | cifs_copy_sid(&(*psidid)->sid, sidptr); | ||
259 | (*psidid)->time = jiffies - (SID_MAP_RETRY + 1); | ||
260 | (*psidid)->refcount = 0; | ||
261 | 148 | ||
262 | sprintf((*psidid)->sidstr, "%s", typestr); | 149 | if ((!ctsid) || (!cwsid)) |
263 | strptr = (*psidid)->sidstr + strlen((*psidid)->sidstr); | 150 | return 1; |
264 | sid_to_str(&(*psidid)->sid, strptr); | ||
265 | 151 | ||
266 | clear_bit(SID_ID_PENDING, &(*psidid)->state); | 152 | /* compare the revision */ |
267 | clear_bit(SID_ID_MAPPED, &(*psidid)->state); | 153 | if (ctsid->revision != cwsid->revision) { |
154 | if (ctsid->revision > cwsid->revision) | ||
155 | return 1; | ||
156 | else | ||
157 | return -1; | ||
158 | } | ||
268 | 159 | ||
269 | rb_link_node(&(*psidid)->rbnode, parent, linkto); | 160 | /* compare all of the six auth values */ |
270 | rb_insert_color(&(*psidid)->rbnode, root); | 161 | for (i = 0; i < NUM_AUTHS; ++i) { |
271 | } | 162 | if (ctsid->authority[i] != cwsid->authority[i]) { |
163 | if (ctsid->authority[i] > cwsid->authority[i]) | ||
164 | return 1; | ||
165 | else | ||
166 | return -1; | ||
167 | } | ||
168 | } | ||
272 | 169 | ||
273 | static struct cifs_sid_id * | 170 | /* compare all of the subauth values if any */ |
274 | id_rb_search(struct rb_root *root, struct cifs_sid *sidptr) | 171 | num_sat = ctsid->num_subauth; |
275 | { | 172 | num_saw = cwsid->num_subauth; |
276 | int rc; | 173 | num_subauth = num_sat < num_saw ? num_sat : num_saw; |
277 | struct rb_node *node = root->rb_node; | 174 | if (num_subauth) { |
278 | struct cifs_sid_id *lsidid; | 175 | for (i = 0; i < num_subauth; ++i) { |
279 | 176 | if (ctsid->sub_auth[i] != cwsid->sub_auth[i]) { | |
280 | while (node) { | 177 | if (le32_to_cpu(ctsid->sub_auth[i]) > |
281 | lsidid = rb_entry(node, struct cifs_sid_id, rbnode); | 178 | le32_to_cpu(cwsid->sub_auth[i])) |
282 | rc = compare_sids(sidptr, &((lsidid)->sid)); | 179 | return 1; |
283 | if (rc > 0) { | 180 | else |
284 | node = node->rb_left; | 181 | return -1; |
285 | } else if (rc < 0) { | 182 | } |
286 | node = node->rb_right; | 183 | } |
287 | } else /* node found */ | ||
288 | return lsidid; | ||
289 | } | 184 | } |
290 | 185 | ||
291 | return NULL; | 186 | return 0; /* sids compare/match */ |
292 | } | 187 | } |
293 | 188 | ||
294 | static int | 189 | static void |
295 | sidid_pending_wait(void *unused) | 190 | cifs_copy_sid(struct cifs_sid *dst, const struct cifs_sid *src) |
296 | { | 191 | { |
297 | schedule(); | 192 | int i; |
298 | return signal_pending(current) ? -ERESTARTSYS : 0; | 193 | |
194 | dst->revision = src->revision; | ||
195 | dst->num_subauth = min_t(u8, src->num_subauth, SID_MAX_SUB_AUTHORITIES); | ||
196 | for (i = 0; i < NUM_AUTHS; ++i) | ||
197 | dst->authority[i] = src->authority[i]; | ||
198 | for (i = 0; i < dst->num_subauth; ++i) | ||
199 | dst->sub_auth[i] = src->sub_auth[i]; | ||
299 | } | 200 | } |
300 | 201 | ||
301 | static int | 202 | static int |
302 | id_to_sid(unsigned long cid, uint sidtype, struct cifs_sid *ssid) | 203 | id_to_sid(unsigned int cid, uint sidtype, struct cifs_sid *ssid) |
303 | { | 204 | { |
304 | int rc = 0; | 205 | int rc; |
305 | struct key *sidkey; | 206 | struct key *sidkey; |
207 | struct cifs_sid *ksid; | ||
208 | unsigned int ksid_size; | ||
209 | char desc[3 + 10 + 1]; /* 3 byte prefix + 10 bytes for value + NULL */ | ||
306 | const struct cred *saved_cred; | 210 | const struct cred *saved_cred; |
307 | struct cifs_sid *lsid; | ||
308 | struct cifs_sid_id *psidid, *npsidid; | ||
309 | struct rb_root *cidtree; | ||
310 | spinlock_t *cidlock; | ||
311 | |||
312 | if (sidtype == SIDOWNER) { | ||
313 | cidlock = &siduidlock; | ||
314 | cidtree = &uidtree; | ||
315 | } else if (sidtype == SIDGROUP) { | ||
316 | cidlock = &sidgidlock; | ||
317 | cidtree = &gidtree; | ||
318 | } else | ||
319 | return -EINVAL; | ||
320 | |||
321 | spin_lock(cidlock); | ||
322 | psidid = sid_rb_search(cidtree, cid); | ||
323 | 211 | ||
324 | if (!psidid) { /* node does not exist, allocate one & attempt adding */ | 212 | rc = snprintf(desc, sizeof(desc), "%ci:%u", |
325 | spin_unlock(cidlock); | 213 | sidtype == SIDOWNER ? 'o' : 'g', cid); |
326 | npsidid = kzalloc(sizeof(struct cifs_sid_id), GFP_KERNEL); | 214 | if (rc >= sizeof(desc)) |
327 | if (!npsidid) | 215 | return -EINVAL; |
328 | return -ENOMEM; | ||
329 | |||
330 | npsidid->sidstr = kmalloc(SIDLEN, GFP_KERNEL); | ||
331 | if (!npsidid->sidstr) { | ||
332 | kfree(npsidid); | ||
333 | return -ENOMEM; | ||
334 | } | ||
335 | 216 | ||
336 | spin_lock(cidlock); | 217 | rc = 0; |
337 | psidid = sid_rb_search(cidtree, cid); | 218 | saved_cred = override_creds(root_cred); |
338 | if (psidid) { /* node happened to get inserted meanwhile */ | 219 | sidkey = request_key(&cifs_idmap_key_type, desc, ""); |
339 | ++psidid->refcount; | 220 | if (IS_ERR(sidkey)) { |
340 | spin_unlock(cidlock); | 221 | rc = -EINVAL; |
341 | kfree(npsidid->sidstr); | 222 | cFYI(1, "%s: Can't map %cid %u to a SID", __func__, |
342 | kfree(npsidid); | 223 | sidtype == SIDOWNER ? 'u' : 'g', cid); |
343 | } else { | 224 | goto out_revert_creds; |
344 | psidid = npsidid; | 225 | } else if (sidkey->datalen < CIFS_SID_BASE_SIZE) { |
345 | sid_rb_insert(cidtree, cid, &psidid, | 226 | rc = -EIO; |
346 | sidtype == SIDOWNER ? "oi:" : "gi:"); | 227 | cFYI(1, "%s: Downcall contained malformed key " |
347 | ++psidid->refcount; | 228 | "(datalen=%hu)", __func__, sidkey->datalen); |
348 | spin_unlock(cidlock); | 229 | goto invalidate_key; |
349 | } | ||
350 | } else { | ||
351 | ++psidid->refcount; | ||
352 | spin_unlock(cidlock); | ||
353 | } | 230 | } |
354 | 231 | ||
355 | /* | 232 | /* |
356 | * If we are here, it is safe to access psidid and its fields | 233 | * A sid is usually too large to be embedded in payload.value, but if |
357 | * since a reference was taken earlier while holding the spinlock. | 234 | * there are no subauthorities and the host has 8-byte pointers, then |
358 | * A reference on the node is put without holding the spinlock | 235 | * it could be. |
359 | * and it is OK to do so in this case, shrinker will not erase | ||
360 | * this node until all references are put and we do not access | ||
361 | * any fields of the node after a reference is put . | ||
362 | */ | 236 | */ |
363 | if (test_bit(SID_ID_MAPPED, &psidid->state)) { | 237 | ksid = sidkey->datalen <= sizeof(sidkey->payload) ? |
364 | cifs_copy_sid(ssid, &psidid->sid); | 238 | (struct cifs_sid *)&sidkey->payload.value : |
365 | psidid->time = jiffies; /* update ts for accessing */ | 239 | (struct cifs_sid *)sidkey->payload.data; |
366 | goto id_sid_out; | 240 | |
241 | ksid_size = CIFS_SID_BASE_SIZE + (ksid->num_subauth * sizeof(__le32)); | ||
242 | if (ksid_size > sidkey->datalen) { | ||
243 | rc = -EIO; | ||
244 | cFYI(1, "%s: Downcall contained malformed key (datalen=%hu, " | ||
245 | "ksid_size=%u)", __func__, sidkey->datalen, ksid_size); | ||
246 | goto invalidate_key; | ||
367 | } | 247 | } |
368 | 248 | ||
369 | if (time_after(psidid->time + SID_MAP_RETRY, jiffies)) { | 249 | cifs_copy_sid(ssid, ksid); |
370 | rc = -EINVAL; | 250 | out_key_put: |
371 | goto id_sid_out; | 251 | key_put(sidkey); |
372 | } | 252 | out_revert_creds: |
373 | 253 | revert_creds(saved_cred); | |
374 | if (!test_and_set_bit(SID_ID_PENDING, &psidid->state)) { | ||
375 | saved_cred = override_creds(root_cred); | ||
376 | sidkey = request_key(&cifs_idmap_key_type, psidid->sidstr, ""); | ||
377 | if (IS_ERR(sidkey)) { | ||
378 | rc = -EINVAL; | ||
379 | cFYI(1, "%s: Can't map and id to a SID", __func__); | ||
380 | } else if (sidkey->datalen < sizeof(struct cifs_sid)) { | ||
381 | rc = -EIO; | ||
382 | cFYI(1, "%s: Downcall contained malformed key " | ||
383 | "(datalen=%hu)", __func__, sidkey->datalen); | ||
384 | } else { | ||
385 | lsid = (struct cifs_sid *)sidkey->payload.data; | ||
386 | cifs_copy_sid(&psidid->sid, lsid); | ||
387 | cifs_copy_sid(ssid, &psidid->sid); | ||
388 | set_bit(SID_ID_MAPPED, &psidid->state); | ||
389 | key_put(sidkey); | ||
390 | kfree(psidid->sidstr); | ||
391 | } | ||
392 | psidid->time = jiffies; /* update ts for accessing */ | ||
393 | revert_creds(saved_cred); | ||
394 | clear_bit(SID_ID_PENDING, &psidid->state); | ||
395 | wake_up_bit(&psidid->state, SID_ID_PENDING); | ||
396 | } else { | ||
397 | rc = wait_on_bit(&psidid->state, SID_ID_PENDING, | ||
398 | sidid_pending_wait, TASK_INTERRUPTIBLE); | ||
399 | if (rc) { | ||
400 | cFYI(1, "%s: sidid_pending_wait interrupted %d", | ||
401 | __func__, rc); | ||
402 | --psidid->refcount; | ||
403 | return rc; | ||
404 | } | ||
405 | if (test_bit(SID_ID_MAPPED, &psidid->state)) | ||
406 | cifs_copy_sid(ssid, &psidid->sid); | ||
407 | else | ||
408 | rc = -EINVAL; | ||
409 | } | ||
410 | id_sid_out: | ||
411 | --psidid->refcount; | ||
412 | return rc; | 254 | return rc; |
255 | |||
256 | invalidate_key: | ||
257 | key_invalidate(sidkey); | ||
258 | goto out_key_put; | ||
413 | } | 259 | } |
414 | 260 | ||
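sid_to_key_str() above folds the six authority bytes into a single 48-bit value and, per MS-DTYP, switches to hex once that value no longer fits in 32 bits. The same folding as a tiny stand-alone program; the kernel spells out the shifts explicitly, but a loop is equivalent:

#include <stdio.h>

int main(void)
{
        unsigned char authority[6] = { 0, 0, 0, 0, 0, 5 }; /* NT authority */
        unsigned long long id_auth_val = 0;
        int i;

        /* fold six big-endian bytes into one 48-bit value */
        for (i = 0; i < 6; i++)
                id_auth_val = (id_auth_val << 8) | authority[i];

        /* MS-DTYP: decimal if it fits in 32 bits, hex otherwise */
        if (id_auth_val <= 0xffffffffULL)
                printf("S-1-%llu\n", id_auth_val);      /* prints "S-1-5" */
        else
                printf("S-1-0x%llx\n", id_auth_val);
        return 0;
}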
415 | static int | 261 | static int |
@@ -417,111 +263,67 @@ sid_to_id(struct cifs_sb_info *cifs_sb, struct cifs_sid *psid, | |||
417 | struct cifs_fattr *fattr, uint sidtype) | 263 | struct cifs_fattr *fattr, uint sidtype) |
418 | { | 264 | { |
419 | int rc; | 265 | int rc; |
420 | unsigned long cid; | 266 | struct key *sidkey; |
421 | struct key *idkey; | 267 | char *sidstr; |
422 | const struct cred *saved_cred; | 268 | const struct cred *saved_cred; |
423 | struct cifs_sid_id *psidid, *npsidid; | 269 | uid_t fuid = cifs_sb->mnt_uid; |
424 | struct rb_root *cidtree; | 270 | gid_t fgid = cifs_sb->mnt_gid; |
425 | spinlock_t *cidlock; | ||
426 | |||
427 | if (sidtype == SIDOWNER) { | ||
428 | cid = cifs_sb->mnt_uid; /* default uid, in case upcall fails */ | ||
429 | cidlock = &siduidlock; | ||
430 | cidtree = &uidtree; | ||
431 | } else if (sidtype == SIDGROUP) { | ||
432 | cid = cifs_sb->mnt_gid; /* default gid, in case upcall fails */ | ||
433 | cidlock = &sidgidlock; | ||
434 | cidtree = &gidtree; | ||
435 | } else | ||
436 | return -ENOENT; | ||
437 | |||
438 | spin_lock(cidlock); | ||
439 | psidid = id_rb_search(cidtree, psid); | ||
440 | |||
441 | if (!psidid) { /* node does not exist, allocate one & attempt adding */ | ||
442 | spin_unlock(cidlock); | ||
443 | npsidid = kzalloc(sizeof(struct cifs_sid_id), GFP_KERNEL); | ||
444 | if (!npsidid) | ||
445 | return -ENOMEM; | ||
446 | |||
447 | npsidid->sidstr = kmalloc(SIDLEN, GFP_KERNEL); | ||
448 | if (!npsidid->sidstr) { | ||
449 | kfree(npsidid); | ||
450 | return -ENOMEM; | ||
451 | } | ||
452 | |||
453 | spin_lock(cidlock); | ||
454 | psidid = id_rb_search(cidtree, psid); | ||
455 | if (psidid) { /* node happened to get inserted meanwhile */ | ||
456 | ++psidid->refcount; | ||
457 | spin_unlock(cidlock); | ||
458 | kfree(npsidid->sidstr); | ||
459 | kfree(npsidid); | ||
460 | } else { | ||
461 | psidid = npsidid; | ||
462 | id_rb_insert(cidtree, psid, &psidid, | ||
463 | sidtype == SIDOWNER ? "os:" : "gs:"); | ||
464 | ++psidid->refcount; | ||
465 | spin_unlock(cidlock); | ||
466 | } | ||
467 | } else { | ||
468 | ++psidid->refcount; | ||
469 | spin_unlock(cidlock); | ||
470 | } | ||
471 | 271 | ||
472 | /* | 272 | /* |
473 | * If we are here, it is safe to access psidid and its fields | 273 | * If we have too many subauthorities, then something is really wrong. |
474 | * since a reference was taken earlier while holding the spinlock. | 274 | * Just return an error. |
475 | * A reference on the node is put without holding the spinlock | ||
476 | * and it is OK to do so in this case, shrinker will not erase | ||
477 | * this node until all references are put and we do not access | ||
478 | * any fields of the node after a reference is put . | ||
479 | */ | 275 | */ |
480 | if (test_bit(SID_ID_MAPPED, &psidid->state)) { | 276 | if (unlikely(psid->num_subauth > SID_MAX_SUB_AUTHORITIES)) { |
481 | cid = psidid->id; | 277 | cFYI(1, "%s: %u subauthorities is too many!", __func__, |
482 | psidid->time = jiffies; /* update ts for accessing */ | 278 | psid->num_subauth); |
483 | goto sid_to_id_out; | 279 | return -EIO; |
484 | } | 280 | } |
485 | 281 | ||
486 | if (time_after(psidid->time + SID_MAP_RETRY, jiffies)) | 282 | sidstr = sid_to_key_str(psid, sidtype); |
487 | goto sid_to_id_out; | 283 | if (!sidstr) |
488 | 284 | return -ENOMEM; | |
489 | if (!test_and_set_bit(SID_ID_PENDING, &psidid->state)) { | 285 | |
490 | saved_cred = override_creds(root_cred); | 286 | saved_cred = override_creds(root_cred); |
491 | idkey = request_key(&cifs_idmap_key_type, psidid->sidstr, ""); | 287 | sidkey = request_key(&cifs_idmap_key_type, sidstr, ""); |
492 | if (IS_ERR(idkey)) | 288 | if (IS_ERR(sidkey)) { |
493 | cFYI(1, "%s: Can't map SID to an id", __func__); | 289 | rc = -EINVAL; |
494 | else { | 290 | cFYI(1, "%s: Can't map SID %s to a %cid", __func__, sidstr, |
495 | cid = *(unsigned long *)idkey->payload.value; | 291 | sidtype == SIDOWNER ? 'u' : 'g'); |
496 | psidid->id = cid; | 292 | goto out_revert_creds; |
497 | set_bit(SID_ID_MAPPED, &psidid->state); | 293 | } |
498 | key_put(idkey); | 294 | |
499 | kfree(psidid->sidstr); | 295 | /* |
500 | } | 296 | * FIXME: Here we assume that uid_t and gid_t are same size. It's |
501 | revert_creds(saved_cred); | 297 | * probably a safe assumption but might be better to check based on |
502 | psidid->time = jiffies; /* update ts for accessing */ | 298 | * sidtype. |
503 | clear_bit(SID_ID_PENDING, &psidid->state); | 299 | */ |
504 | wake_up_bit(&psidid->state, SID_ID_PENDING); | 300 | if (sidkey->datalen != sizeof(uid_t)) { |
505 | } else { | 301 | rc = -EIO; |
506 | rc = wait_on_bit(&psidid->state, SID_ID_PENDING, | 302 | cFYI(1, "%s: Downcall contained malformed key " |
507 | sidid_pending_wait, TASK_INTERRUPTIBLE); | 303 | "(datalen=%hu)", __func__, sidkey->datalen); |
508 | if (rc) { | 304 | key_invalidate(sidkey); |
509 | cFYI(1, "%s: sidid_pending_wait interrupted %d", | 305 | goto out_key_put; |
510 | __func__, rc); | ||
511 | --psidid->refcount; /* decremented without spinlock */ | ||
512 | return rc; | ||
513 | } | ||
514 | if (test_bit(SID_ID_MAPPED, &psidid->state)) | ||
515 | cid = psidid->id; | ||
516 | } | 306 | } |
517 | 307 | ||
518 | sid_to_id_out: | ||
519 | --psidid->refcount; /* decremented without spinlock */ | ||
520 | if (sidtype == SIDOWNER) | 308 | if (sidtype == SIDOWNER) |
521 | fattr->cf_uid = cid; | 309 | memcpy(&fuid, &sidkey->payload.value, sizeof(uid_t)); |
522 | else | 310 | else |
523 | fattr->cf_gid = cid; | 311 | memcpy(&fgid, &sidkey->payload.value, sizeof(gid_t)); |
312 | |||
313 | out_key_put: | ||
314 | key_put(sidkey); | ||
315 | out_revert_creds: | ||
316 | revert_creds(saved_cred); | ||
317 | kfree(sidstr); | ||
524 | 318 | ||
319 | /* | ||
320 | * Note that we return 0 here unconditionally. If the mapping | ||
321 | * fails then we just fall back to using the mnt_uid/mnt_gid. | ||
322 | */ | ||
323 | if (sidtype == SIDOWNER) | ||
324 | fattr->cf_uid = fuid; | ||
325 | else | ||
326 | fattr->cf_gid = fgid; | ||
525 | return 0; | 327 | return 0; |
526 | } | 328 | } |
527 | 329 | ||
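Note the policy sid_to_id() lands on above: seed the result with the mount's mnt_uid/mnt_gid, overwrite only on a successful upcall, and return 0 regardless. A user-space sketch of that shape; resolve_sid() is a hypothetical stand-in for the request_key() upcall:

#include <stdbool.h>
#include <stdio.h>

struct fattr { unsigned int cf_uid, cf_gid; };

/* hypothetical resolver; returns false when no mapping exists */
static bool resolve_sid(const char *sidstr, unsigned int *id)
{
        (void)sidstr;
        (void)id;
        return false;   /* pretend the upcall failed */
}

static int sid_to_uid(const char *sidstr, unsigned int mnt_uid,
                      struct fattr *fattr)
{
        unsigned int fuid = mnt_uid;    /* default if mapping fails */

        resolve_sid(sidstr, &fuid);     /* may overwrite fuid on success */
        fattr->cf_uid = fuid;
        return 0;       /* unconditional: the default is acceptable */
}

int main(void)
{
        struct fattr fattr;

        sid_to_uid("S-1-5-21-1-2-3-1000", 1000, &fattr);
        printf("uid=%u\n", fattr.cf_uid);       /* 1000, the fallback */
        return 0;
}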
@@ -568,17 +370,6 @@ init_cifs_idmap(void) | |||
568 | cred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING; | 370 | cred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING; |
569 | root_cred = cred; | 371 | root_cred = cred; |
570 | 372 | ||
571 | spin_lock_init(&siduidlock); | ||
572 | uidtree = RB_ROOT; | ||
573 | spin_lock_init(&sidgidlock); | ||
574 | gidtree = RB_ROOT; | ||
575 | |||
576 | spin_lock_init(&uidsidlock); | ||
577 | siduidtree = RB_ROOT; | ||
578 | spin_lock_init(&gidsidlock); | ||
579 | sidgidtree = RB_ROOT; | ||
580 | register_shrinker(&cifs_shrinker); | ||
581 | |||
582 | cFYI(1, "cifs idmap keyring: %d", key_serial(keyring)); | 373 | cFYI(1, "cifs idmap keyring: %d", key_serial(keyring)); |
583 | return 0; | 374 | return 0; |
584 | 375 | ||
@@ -595,89 +386,9 @@ exit_cifs_idmap(void) | |||
595 | key_revoke(root_cred->thread_keyring); | 386 | key_revoke(root_cred->thread_keyring); |
596 | unregister_key_type(&cifs_idmap_key_type); | 387 | unregister_key_type(&cifs_idmap_key_type); |
597 | put_cred(root_cred); | 388 | put_cred(root_cred); |
598 | unregister_shrinker(&cifs_shrinker); | ||
599 | cFYI(1, "Unregistered %s key type", cifs_idmap_key_type.name); | 389 | cFYI(1, "Unregistered %s key type", cifs_idmap_key_type.name); |
600 | } | 390 | } |
601 | 391 | ||
602 | void | ||
603 | cifs_destroy_idmaptrees(void) | ||
604 | { | ||
605 | struct rb_root *root; | ||
606 | struct rb_node *node; | ||
607 | |||
608 | root = &uidtree; | ||
609 | spin_lock(&siduidlock); | ||
610 | while ((node = rb_first(root))) | ||
611 | rb_erase(node, root); | ||
612 | spin_unlock(&siduidlock); | ||
613 | |||
614 | root = &gidtree; | ||
615 | spin_lock(&sidgidlock); | ||
616 | while ((node = rb_first(root))) | ||
617 | rb_erase(node, root); | ||
618 | spin_unlock(&sidgidlock); | ||
619 | |||
620 | root = &siduidtree; | ||
621 | spin_lock(&uidsidlock); | ||
622 | while ((node = rb_first(root))) | ||
623 | rb_erase(node, root); | ||
624 | spin_unlock(&uidsidlock); | ||
625 | |||
626 | root = &sidgidtree; | ||
627 | spin_lock(&gidsidlock); | ||
628 | while ((node = rb_first(root))) | ||
629 | rb_erase(node, root); | ||
630 | spin_unlock(&gidsidlock); | ||
631 | } | ||
632 | |||
633 | /* if the two SIDs (roughly equivalent to a UUID for a user or group) are | ||
634 | the same returns 1, if they do not match returns 0 */ | ||
635 | int compare_sids(const struct cifs_sid *ctsid, const struct cifs_sid *cwsid) | ||
636 | { | ||
637 | int i; | ||
638 | int num_subauth, num_sat, num_saw; | ||
639 | |||
640 | if ((!ctsid) || (!cwsid)) | ||
641 | return 1; | ||
642 | |||
643 | /* compare the revision */ | ||
644 | if (ctsid->revision != cwsid->revision) { | ||
645 | if (ctsid->revision > cwsid->revision) | ||
646 | return 1; | ||
647 | else | ||
648 | return -1; | ||
649 | } | ||
650 | |||
651 | /* compare all of the six auth values */ | ||
652 | for (i = 0; i < 6; ++i) { | ||
653 | if (ctsid->authority[i] != cwsid->authority[i]) { | ||
654 | if (ctsid->authority[i] > cwsid->authority[i]) | ||
655 | return 1; | ||
656 | else | ||
657 | return -1; | ||
658 | } | ||
659 | } | ||
660 | |||
661 | /* compare all of the subauth values if any */ | ||
662 | num_sat = ctsid->num_subauth; | ||
663 | num_saw = cwsid->num_subauth; | ||
664 | num_subauth = num_sat < num_saw ? num_sat : num_saw; | ||
665 | if (num_subauth) { | ||
666 | for (i = 0; i < num_subauth; ++i) { | ||
667 | if (ctsid->sub_auth[i] != cwsid->sub_auth[i]) { | ||
668 | if (le32_to_cpu(ctsid->sub_auth[i]) > | ||
669 | le32_to_cpu(cwsid->sub_auth[i])) | ||
670 | return 1; | ||
671 | else | ||
672 | return -1; | ||
673 | } | ||
674 | } | ||
675 | } | ||
676 | |||
677 | return 0; /* sids compare/match */ | ||
678 | } | ||
679 | |||
680 | |||
681 | /* copy ntsd, owner sid, and group sid from a security descriptor to another */ | 392 | /* copy ntsd, owner sid, and group sid from a security descriptor to another */ |
682 | static void copy_sec_desc(const struct cifs_ntsd *pntsd, | 393 | static void copy_sec_desc(const struct cifs_ntsd *pntsd, |
683 | struct cifs_ntsd *pnntsd, __u32 sidsoffset) | 394 | struct cifs_ntsd *pnntsd, __u32 sidsoffset) |
@@ -811,7 +522,7 @@ static __u16 fill_ace_for_sid(struct cifs_ace *pntace, | |||
811 | 522 | ||
812 | pntace->sid.revision = psid->revision; | 523 | pntace->sid.revision = psid->revision; |
813 | pntace->sid.num_subauth = psid->num_subauth; | 524 | pntace->sid.num_subauth = psid->num_subauth; |
814 | for (i = 0; i < 6; i++) | 525 | for (i = 0; i < NUM_AUTHS; i++) |
815 | pntace->sid.authority[i] = psid->authority[i]; | 526 | pntace->sid.authority[i] = psid->authority[i]; |
816 | for (i = 0; i < psid->num_subauth; i++) | 527 | for (i = 0; i < psid->num_subauth; i++) |
817 | pntace->sid.sub_auth[i] = psid->sub_auth[i]; | 528 | pntace->sid.sub_auth[i] = psid->sub_auth[i]; |
@@ -987,8 +698,8 @@ static int parse_sid(struct cifs_sid *psid, char *end_of_acl) | |||
987 | return -EINVAL; | 698 | return -EINVAL; |
988 | } | 699 | } |
989 | 700 | ||
990 | if (psid->num_subauth) { | ||
991 | #ifdef CONFIG_CIFS_DEBUG2 | 701 | #ifdef CONFIG_CIFS_DEBUG2 |
702 | if (psid->num_subauth) { | ||
992 | int i; | 703 | int i; |
993 | cFYI(1, "SID revision %d num_auth %d", | 704 | cFYI(1, "SID revision %d num_auth %d", |
994 | psid->revision, psid->num_subauth); | 705 | psid->revision, psid->num_subauth); |
@@ -1002,8 +713,8 @@ static int parse_sid(struct cifs_sid *psid, char *end_of_acl) | |||
1002 | num auths and therefore go off the end */ | 713 | num auths and therefore go off the end */ |
1003 | cFYI(1, "RID 0x%x", | 714 | cFYI(1, "RID 0x%x", |
1004 | le32_to_cpu(psid->sub_auth[psid->num_subauth-1])); | 715 | le32_to_cpu(psid->sub_auth[psid->num_subauth-1])); |
1005 | #endif | ||
1006 | } | 716 | } |
717 | #endif | ||
1007 | 718 | ||
1008 | return 0; | 719 | return 0; |
1009 | } | 720 | } |
@@ -1307,42 +1018,39 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 nmode, | |||
1307 | 1018 | ||
1308 | /* Get the security descriptor */ | 1019 | /* Get the security descriptor */ |
1309 | pntsd = get_cifs_acl(CIFS_SB(inode->i_sb), inode, path, &secdesclen); | 1020 | pntsd = get_cifs_acl(CIFS_SB(inode->i_sb), inode, path, &secdesclen); |
1310 | |||
1311 | /* Add three ACEs for owner, group, everyone getting rid of | ||
1312 | other ACEs as chmod disables ACEs and set the security descriptor */ | ||
1313 | |||
1314 | if (IS_ERR(pntsd)) { | 1021 | if (IS_ERR(pntsd)) { |
1315 | rc = PTR_ERR(pntsd); | 1022 | rc = PTR_ERR(pntsd); |
1316 | cERROR(1, "%s: error %d getting sec desc", __func__, rc); | 1023 | cERROR(1, "%s: error %d getting sec desc", __func__, rc); |
1317 | } else { | 1024 | goto out; |
1318 | /* allocate memory for the smb header, | 1025 | } |
1319 | set security descriptor request security descriptor | ||
1320 | parameters, and secuirty descriptor itself */ | ||
1321 | |||
1322 | secdesclen = secdesclen < DEFSECDESCLEN ? | ||
1323 | DEFSECDESCLEN : secdesclen; | ||
1324 | pnntsd = kmalloc(secdesclen, GFP_KERNEL); | ||
1325 | if (!pnntsd) { | ||
1326 | cERROR(1, "Unable to allocate security descriptor"); | ||
1327 | kfree(pntsd); | ||
1328 | return -ENOMEM; | ||
1329 | } | ||
1330 | 1026 | ||
1331 | rc = build_sec_desc(pntsd, pnntsd, secdesclen, nmode, uid, gid, | 1027 | /* |
1332 | &aclflag); | 1028 | * Add three ACEs for owner, group, everyone getting rid of other ACEs |
1029 | * as chmod disables ACEs and set the security descriptor. Allocate | ||
1030 | * memory for the smb header, set security descriptor request security | ||
1031 | * descriptor parameters, and security descriptor itself | ||
1032 | */ | ||
1033 | secdesclen = max_t(u32, secdesclen, DEFAULT_SEC_DESC_LEN); | ||
1034 | pnntsd = kmalloc(secdesclen, GFP_KERNEL); | ||
1035 | if (!pnntsd) { | ||
1036 | cERROR(1, "Unable to allocate security descriptor"); | ||
1037 | kfree(pntsd); | ||
1038 | return -ENOMEM; | ||
1039 | } | ||
1333 | 1040 | ||
1334 | cFYI(DBG2, "build_sec_desc rc: %d", rc); | 1041 | rc = build_sec_desc(pntsd, pnntsd, secdesclen, nmode, uid, gid, |
1042 | &aclflag); | ||
1335 | 1043 | ||
1336 | if (!rc) { | 1044 | cFYI(DBG2, "build_sec_desc rc: %d", rc); |
1337 | /* Set the security descriptor */ | ||
1338 | rc = set_cifs_acl(pnntsd, secdesclen, inode, | ||
1339 | path, aclflag); | ||
1340 | cFYI(DBG2, "set_cifs_acl rc: %d", rc); | ||
1341 | } | ||
1342 | 1045 | ||
1343 | kfree(pnntsd); | 1046 | if (!rc) { |
1344 | kfree(pntsd); | 1047 | /* Set the security descriptor */ |
1048 | rc = set_cifs_acl(pnntsd, secdesclen, inode, path, aclflag); | ||
1049 | cFYI(DBG2, "set_cifs_acl rc: %d", rc); | ||
1345 | } | 1050 | } |
1346 | 1051 | ||
1052 | kfree(pnntsd); | ||
1053 | kfree(pntsd); | ||
1054 | out: | ||
1347 | return rc; | 1055 | return rc; |
1348 | } | 1056 | } |
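The id_mode_to_cifs_acl() hunk above flattens a deeply nested else branch into straight-line code with early exits, the usual kernel shape for functions with cleanup. A compilable sketch of the same structure, with hypothetical stand-ins for the get/build/set helpers:

#include <errno.h>
#include <stdlib.h>

/* hypothetical stand-ins for get_cifs_acl()/build_sec_desc()/set_cifs_acl() */
static int get_desc(void **desc)
{
        *desc = malloc(64);
        return *desc ? 0 : -ENOMEM;
}

static int build_and_set(void *old_desc, void *new_desc)
{
        (void)old_desc;
        (void)new_desc;
        return 0;
}

static int update_acl(void)
{
        void *pntsd = NULL, *pnntsd = NULL;
        int rc;

        rc = get_desc(&pntsd);
        if (rc)
                goto out;               /* error first, nothing to free yet */

        pnntsd = malloc(192);           /* stand-in for DEFAULT_SEC_DESC_LEN */
        if (!pnntsd) {
                free(pntsd);
                return -ENOMEM;
        }

        rc = build_and_set(pntsd, pnntsd);

        free(pnntsd);
        free(pntsd);
out:
        return rc;
}

int main(void)
{
        return update_acl() ? 1 : 0;
}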
diff --git a/fs/cifs/cifsacl.h b/fs/cifs/cifsacl.h index 5c902c7ce524..4f3884835267 100644 --- a/fs/cifs/cifsacl.h +++ b/fs/cifs/cifsacl.h | |||
@@ -23,11 +23,8 @@ | |||
23 | #define _CIFSACL_H | 23 | #define _CIFSACL_H |
24 | 24 | ||
25 | 25 | ||
26 | #define NUM_AUTHS 6 /* number of authority fields */ | 26 | #define NUM_AUTHS (6) /* number of authority fields */ |
27 | #define NUM_SUBAUTHS 5 /* number of sub authority fields */ | 27 | #define SID_MAX_SUB_AUTHORITIES (15) /* max number of sub authority fields */ |
28 | #define NUM_WK_SIDS 7 /* number of well known sids */ | ||
29 | #define SIDNAMELENGTH 20 /* long enough for the ones we care about */ | ||
30 | #define DEFSECDESCLEN 192 /* sec desc len contaiting a dacl with three aces */ | ||
31 | 28 | ||
32 | #define READ_BIT 0x4 | 29 | #define READ_BIT 0x4 |
33 | #define WRITE_BIT 0x2 | 30 | #define WRITE_BIT 0x2 |
@@ -41,12 +38,32 @@ | |||
41 | 38 | ||
42 | #define SIDOWNER 1 | 39 | #define SIDOWNER 1 |
43 | #define SIDGROUP 2 | 40 | #define SIDGROUP 2 |
44 | #define SIDLEN 150 /* S- 1 revision- 6 authorities- max 5 sub authorities */ | ||
45 | 41 | ||
46 | #define SID_ID_MAPPED 0 | 42 | /* |
47 | #define SID_ID_PENDING 1 | 43 | * Security Descriptor length containing DACL with 3 ACEs (one each for |
48 | #define SID_MAP_EXPIRE (3600 * HZ) /* map entry expires after one hour */ | 44 | * owner, group and world). |
49 | #define SID_MAP_RETRY (300 * HZ) /* wait 5 minutes for next attempt to map */ | 45 | */ |
46 | #define DEFAULT_SEC_DESC_LEN (sizeof(struct cifs_ntsd) + \ | ||
47 | sizeof(struct cifs_acl) + \ | ||
48 | (sizeof(struct cifs_ace) * 3)) | ||
49 | |||
50 | /* | ||
51 | * Maximum size of a string representation of a SID: | ||
52 | * | ||
53 | * The fields are unsigned values in decimal. So: | ||
54 | * | ||
55 | * u8: max 3 bytes in decimal | ||
56 | * u32: max 10 bytes in decimal | ||
57 | * | ||
58 | * "S-" + 3 bytes for version field + 15 for authority field + NULL terminator | ||
59 | * | ||
60 | * For authority field, max is when all 6 values are non-zero and it must be | ||
61 | * represented in hex. So "-0x" + 12 hex digits. | ||
62 | * | ||
63 | * Add 11 bytes for each subauthority field (10 bytes each + 1 for '-') | ||
64 | */ | ||
65 | #define SID_STRING_BASE_SIZE (2 + 3 + 15 + 1) | ||
66 | #define SID_STRING_SUBAUTH_SIZE (11) /* size of a single subauth string */ | ||
50 | 67 | ||
51 | struct cifs_ntsd { | 68 | struct cifs_ntsd { |
52 | __le16 revision; /* revision level */ | 69 | __le16 revision; /* revision level */ |
@@ -60,10 +77,13 @@ struct cifs_ntsd { | |||
60 | struct cifs_sid { | 77 | struct cifs_sid { |
61 | __u8 revision; /* revision level */ | 78 | __u8 revision; /* revision level */ |
62 | __u8 num_subauth; | 79 | __u8 num_subauth; |
63 | __u8 authority[6]; | 80 | __u8 authority[NUM_AUTHS]; |
64 | __le32 sub_auth[5]; /* sub_auth[num_subauth] */ | 81 | __le32 sub_auth[SID_MAX_SUB_AUTHORITIES]; /* sub_auth[num_subauth] */ |
65 | } __attribute__((packed)); | 82 | } __attribute__((packed)); |
66 | 83 | ||
84 | /* size of a struct cifs_sid, sans sub_auth array */ | ||
85 | #define CIFS_SID_BASE_SIZE (1 + 1 + NUM_AUTHS) | ||
86 | |||
67 | struct cifs_acl { | 87 | struct cifs_acl { |
68 | __le16 revision; /* revision level */ | 88 | __le16 revision; /* revision level */ |
69 | __le16 size; | 89 | __le16 size; |
@@ -78,26 +98,4 @@ struct cifs_ace { | |||
78 | struct cifs_sid sid; /* ie UUID of user or group who gets these perms */ | 98 | struct cifs_sid sid; /* ie UUID of user or group who gets these perms */ |
79 | } __attribute__((packed)); | 99 | } __attribute__((packed)); |
80 | 100 | ||
81 | struct cifs_wksid { | ||
82 | struct cifs_sid cifssid; | ||
83 | char sidname[SIDNAMELENGTH]; | ||
84 | } __attribute__((packed)); | ||
85 | |||
86 | struct cifs_sid_id { | ||
87 | unsigned int refcount; /* increment with spinlock, decrement without */ | ||
88 | unsigned long id; | ||
89 | unsigned long time; | ||
90 | unsigned long state; | ||
91 | char *sidstr; | ||
92 | struct rb_node rbnode; | ||
93 | struct cifs_sid sid; | ||
94 | }; | ||
95 | |||
96 | #ifdef __KERNEL__ | ||
97 | extern struct key_type cifs_idmap_key_type; | ||
98 | extern const struct cred *root_cred; | ||
99 | #endif /* KERNEL */ | ||
100 | |||
101 | extern int compare_sids(const struct cifs_sid *, const struct cifs_sid *); | ||
102 | |||
103 | #endif /* _CIFSACL_H */ | 101 | #endif /* _CIFSACL_H */ |
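A quick check of the SID_STRING_* sizing comment above: the worst case is "S-" plus a 3-digit revision, "-0x" plus 12 hex digits for the 48-bit authority, a NUL, and 11 bytes per subauthority. The sketch below builds a worst-case string and asserts it fits; sid_to_key_str() additionally reserves 3 bytes for its "os:"/"gs:" prefix:

#include <assert.h>
#include <stdio.h>

#define SID_MAX_SUB_AUTHORITIES 15
#define SID_STRING_BASE_SIZE    (2 + 3 + 15 + 1)
#define SID_STRING_SUBAUTH_SIZE 11

int main(void)
{
        char buf[SID_STRING_BASE_SIZE +
                 SID_STRING_SUBAUTH_SIZE * SID_MAX_SUB_AUTHORITIES];
        int len, i;

        /* max revision, max 48-bit authority in hex */
        len = sprintf(buf, "S-%hhu-0x%llx", (unsigned char)255,
                      0xffffffffffffULL);
        /* max 32-bit subauthorities, "-" + 10 digits each */
        for (i = 0; i < SID_MAX_SUB_AUTHORITIES; i++)
                len += sprintf(buf + len, "-%u", 0xffffffffu);
        assert(len < (int)sizeof(buf));
        printf("worst case %d bytes, buffer %zu\n", len, sizeof(buf));
        return 0;
}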
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index e7931cc55d0c..210f0af83fc4 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c | |||
@@ -64,24 +64,23 @@ unsigned int global_secflags = CIFSSEC_DEF; | |||
64 | unsigned int sign_CIFS_PDUs = 1; | 64 | unsigned int sign_CIFS_PDUs = 1; |
65 | static const struct super_operations cifs_super_ops; | 65 | static const struct super_operations cifs_super_ops; |
66 | unsigned int CIFSMaxBufSize = CIFS_MAX_MSGSIZE; | 66 | unsigned int CIFSMaxBufSize = CIFS_MAX_MSGSIZE; |
67 | module_param(CIFSMaxBufSize, int, 0); | 67 | module_param(CIFSMaxBufSize, uint, 0); |
68 | MODULE_PARM_DESC(CIFSMaxBufSize, "Network buffer size (not including header). " | 68 | MODULE_PARM_DESC(CIFSMaxBufSize, "Network buffer size (not including header). " |
69 | "Default: 16384 Range: 8192 to 130048"); | 69 | "Default: 16384 Range: 8192 to 130048"); |
70 | unsigned int cifs_min_rcv = CIFS_MIN_RCV_POOL; | 70 | unsigned int cifs_min_rcv = CIFS_MIN_RCV_POOL; |
71 | module_param(cifs_min_rcv, int, 0); | 71 | module_param(cifs_min_rcv, uint, 0); |
72 | MODULE_PARM_DESC(cifs_min_rcv, "Network buffers in pool. Default: 4 Range: " | 72 | MODULE_PARM_DESC(cifs_min_rcv, "Network buffers in pool. Default: 4 Range: " |
73 | "1 to 64"); | 73 | "1 to 64"); |
74 | unsigned int cifs_min_small = 30; | 74 | unsigned int cifs_min_small = 30; |
75 | module_param(cifs_min_small, int, 0); | 75 | module_param(cifs_min_small, uint, 0); |
76 | MODULE_PARM_DESC(cifs_min_small, "Small network buffers in pool. Default: 30 " | 76 | MODULE_PARM_DESC(cifs_min_small, "Small network buffers in pool. Default: 30 " |
77 | "Range: 2 to 256"); | 77 | "Range: 2 to 256"); |
78 | unsigned int cifs_max_pending = CIFS_MAX_REQ; | 78 | unsigned int cifs_max_pending = CIFS_MAX_REQ; |
79 | module_param(cifs_max_pending, int, 0444); | 79 | module_param(cifs_max_pending, uint, 0444); |
80 | MODULE_PARM_DESC(cifs_max_pending, "Simultaneous requests to server. " | 80 | MODULE_PARM_DESC(cifs_max_pending, "Simultaneous requests to server. " |
81 | "Default: 32767 Range: 2 to 32767."); | 81 | "Default: 32767 Range: 2 to 32767."); |
82 | module_param(enable_oplocks, bool, 0644); | 82 | module_param(enable_oplocks, bool, 0644); |
83 | MODULE_PARM_DESC(enable_oplocks, "Enable or disable oplocks (bool). Default:" | 83 | MODULE_PARM_DESC(enable_oplocks, "Enable or disable oplocks. Default: y/Y/1"); |
84 | "y/Y/1"); | ||
85 | 84 | ||
86 | extern mempool_t *cifs_sm_req_poolp; | 85 | extern mempool_t *cifs_sm_req_poolp; |
87 | extern mempool_t *cifs_req_poolp; | 86 | extern mempool_t *cifs_req_poolp; |
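The module_param() hunk above changes the type token from int to uint so it agrees with the unsigned int declarations; mismatched pairs draw a diagnostic from the kernel's param type-check machinery. A minimal module sketch of the matched form, with a hypothetical demo_bufsize parameter:

#include <linux/init.h>
#include <linux/module.h>
#include <linux/moduleparam.h>

static unsigned int demo_bufsize = 16384;
module_param(demo_bufsize, uint, 0444); /* token must match the C type */
MODULE_PARM_DESC(demo_bufsize, "Demo buffer size. Default: 16384");

static int __init demo_init(void)
{
        pr_info("demo_bufsize=%u\n", demo_bufsize);
        return 0;
}

static void __exit demo_exit(void)
{
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");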
@@ -230,6 +229,7 @@ cifs_alloc_inode(struct super_block *sb) | |||
230 | cifs_set_oplock_level(cifs_inode, 0); | 229 | cifs_set_oplock_level(cifs_inode, 0); |
231 | cifs_inode->delete_pending = false; | 230 | cifs_inode->delete_pending = false; |
232 | cifs_inode->invalid_mapping = false; | 231 | cifs_inode->invalid_mapping = false; |
232 | cifs_inode->leave_pages_clean = false; | ||
233 | cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */ | 233 | cifs_inode->vfs_inode.i_blkbits = 14; /* 2**14 = CIFS_MAX_MSGSIZE */ |
234 | cifs_inode->server_eof = 0; | 234 | cifs_inode->server_eof = 0; |
235 | cifs_inode->uniqueid = 0; | 235 | cifs_inode->uniqueid = 0; |
@@ -540,8 +540,8 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb) | |||
540 | char *s, *p; | 540 | char *s, *p; |
541 | char sep; | 541 | char sep; |
542 | 542 | ||
543 | full_path = build_path_to_root(vol, cifs_sb, | 543 | full_path = cifs_build_path_to_root(vol, cifs_sb, |
544 | cifs_sb_master_tcon(cifs_sb)); | 544 | cifs_sb_master_tcon(cifs_sb)); |
545 | if (full_path == NULL) | 545 | if (full_path == NULL) |
546 | return ERR_PTR(-ENOMEM); | 546 | return ERR_PTR(-ENOMEM); |
547 | 547 | ||
@@ -1205,7 +1205,6 @@ exit_cifs(void) | |||
1205 | unregister_filesystem(&cifs_fs_type); | 1205 | unregister_filesystem(&cifs_fs_type); |
1206 | cifs_dfs_release_automount_timer(); | 1206 | cifs_dfs_release_automount_timer(); |
1207 | #ifdef CONFIG_CIFS_ACL | 1207 | #ifdef CONFIG_CIFS_ACL |
1208 | cifs_destroy_idmaptrees(); | ||
1209 | exit_cifs_idmap(); | 1208 | exit_cifs_idmap(); |
1210 | #endif | 1209 | #endif |
1211 | #ifdef CONFIG_CIFS_UPCALL | 1210 | #ifdef CONFIG_CIFS_UPCALL |
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index f5af2527fc69..aea1eec64911 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h | |||
@@ -178,6 +178,7 @@ struct smb_rqst { | |||
178 | 178 | ||
179 | enum smb_version { | 179 | enum smb_version { |
180 | Smb_1 = 1, | 180 | Smb_1 = 1, |
181 | Smb_20, | ||
181 | Smb_21, | 182 | Smb_21, |
182 | Smb_30, | 183 | Smb_30, |
183 | }; | 184 | }; |
@@ -280,9 +281,6 @@ struct smb_version_operations { | |||
280 | /* set attributes */ | 281 | /* set attributes */ |
281 | int (*set_file_info)(struct inode *, const char *, FILE_BASIC_INFO *, | 282 | int (*set_file_info)(struct inode *, const char *, FILE_BASIC_INFO *, |
282 | const unsigned int); | 283 | const unsigned int); |
283 | /* build a full path to the root of the mount */ | ||
284 | char * (*build_path_to_root)(struct smb_vol *, struct cifs_sb_info *, | ||
285 | struct cifs_tcon *); | ||
287 | /* check if we can send an echo or not */ | 284 | /* check if we can send an echo or not */ |

287 | bool (*can_echo)(struct TCP_Server_Info *); | 285 | bool (*can_echo)(struct TCP_Server_Info *); |
288 | /* send echo request */ | 286 | /* send echo request */ |
@@ -369,6 +367,8 @@ struct smb_version_operations { | |||
369 | void (*set_lease_key)(struct inode *, struct cifs_fid *fid); | 367 | void (*set_lease_key)(struct inode *, struct cifs_fid *fid); |
370 | /* generate new lease key */ | 368 | /* generate new lease key */ |
371 | void (*new_lease_key)(struct cifs_fid *fid); | 369 | void (*new_lease_key)(struct cifs_fid *fid); |
370 | int (*calc_signature)(struct smb_rqst *rqst, | ||
371 | struct TCP_Server_Info *server); | ||
372 | }; | 372 | }; |
373 | 373 | ||
374 | struct smb_version_values { | 374 | struct smb_version_values { |
@@ -396,7 +396,6 @@ struct smb_vol { | |||
396 | char *password; | 396 | char *password; |
397 | char *domainname; | 397 | char *domainname; |
398 | char *UNC; | 398 | char *UNC; |
399 | char *UNCip; | ||
400 | char *iocharset; /* local code page for mapping to and from Unicode */ | 399 | char *iocharset; /* local code page for mapping to and from Unicode */ |
401 | char source_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* clnt nb name */ | 400 | char source_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* clnt nb name */ |
402 | char target_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* srvr nb name */ | 401 | char target_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* srvr nb name */ |
@@ -444,11 +443,11 @@ struct smb_vol { | |||
444 | unsigned int rsize; | 443 | unsigned int rsize; |
445 | unsigned int wsize; | 444 | unsigned int wsize; |
446 | bool sockopt_tcp_nodelay:1; | 445 | bool sockopt_tcp_nodelay:1; |
447 | unsigned short int port; | ||
448 | unsigned long actimeo; /* attribute cache timeout (jiffies) */ | 446 | unsigned long actimeo; /* attribute cache timeout (jiffies) */ |
449 | struct smb_version_operations *ops; | 447 | struct smb_version_operations *ops; |
450 | struct smb_version_values *vals; | 448 | struct smb_version_values *vals; |
451 | char *prepath; | 449 | char *prepath; |
450 | struct sockaddr_storage dstaddr; /* destination address */ | ||
452 | struct sockaddr_storage srcaddr; /* allow binding to a local IP */ | 451 | struct sockaddr_storage srcaddr; /* allow binding to a local IP */ |
453 | struct nls_table *local_nls; | 452 | struct nls_table *local_nls; |
454 | }; | 453 | }; |
@@ -1031,6 +1030,7 @@ struct cifsInodeInfo { | |||
1031 | bool clientCanCacheAll; /* read and writebehind oplock */ | 1030 | bool clientCanCacheAll; /* read and writebehind oplock */ |
1032 | bool delete_pending; /* DELETE_ON_CLOSE is set */ | 1031 | bool delete_pending; /* DELETE_ON_CLOSE is set */ |
1033 | bool invalid_mapping; /* pagecache is invalid */ | 1032 | bool invalid_mapping; /* pagecache is invalid */ |
1033 | bool leave_pages_clean; /* protected by i_mutex, not set pages dirty */ | ||
1034 | unsigned long time; /* jiffies of last update of inode */ | 1034 | unsigned long time; /* jiffies of last update of inode */ |
1035 | u64 server_eof; /* current file size on server -- protected by i_lock */ | 1035 | u64 server_eof; /* current file size on server -- protected by i_lock */ |
1036 | u64 uniqueid; /* server inode number */ | 1036 | u64 uniqueid; /* server inode number */ |
@@ -1067,30 +1067,16 @@ static inline char CIFS_DIR_SEP(const struct cifs_sb_info *cifs_sb) | |||
1067 | static inline void | 1067 | static inline void |
1068 | convert_delimiter(char *path, char delim) | 1068 | convert_delimiter(char *path, char delim) |
1069 | { | 1069 | { |
1070 | int i; | 1070 | char old_delim, *pos; |
1071 | char old_delim; | ||
1072 | |||
1073 | if (path == NULL) | ||
1074 | return; | ||
1075 | 1071 | ||
1076 | if (delim == '/') | 1072 | if (delim == '/') |
1077 | old_delim = '\\'; | 1073 | old_delim = '\\'; |
1078 | else | 1074 | else |
1079 | old_delim = '/'; | 1075 | old_delim = '/'; |
1080 | 1076 | ||
1081 | for (i = 0; path[i] != '\0'; i++) { | 1077 | pos = path; |
1082 | if (path[i] == old_delim) | 1078 | while ((pos = strchr(pos, old_delim))) |
1083 | path[i] = delim; | 1079 | *pos = delim; |
1084 | } | ||
1085 | } | ||
1086 | |||
1087 | static inline char * | ||
1088 | build_path_to_root(struct smb_vol *vol, struct cifs_sb_info *cifs_sb, | ||
1089 | struct cifs_tcon *tcon) | ||
1090 | { | ||
1091 | if (!vol->ops->build_path_to_root) | ||
1092 | return NULL; | ||
1093 | return vol->ops->build_path_to_root(vol, cifs_sb, tcon); | ||
1094 | } | 1080 | } |
1095 | 1081 | ||
1096 | #ifdef CONFIG_CIFS_STATS | 1082 | #ifdef CONFIG_CIFS_STATS |
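convert_delimiter() above now walks the string with strchr() instead of indexing every byte, and drops the NULL check since callers pass valid paths. The same loop as a stand-alone program:

#include <stdio.h>
#include <string.h>

static void convert_delimiter(char *path, char delim)
{
        char old_delim = (delim == '/') ? '\\' : '/';
        char *pos = path;

        /* strchr() returns NULL once no old delimiter remains */
        while ((pos = strchr(pos, old_delim)))
                *pos = delim;
}

int main(void)
{
        char path[] = "\\srv\\share\\dir";

        convert_delimiter(path, '/');
        printf("%s\n", path);   /* /srv/share/dir */
        return 0;
}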
@@ -1362,7 +1348,7 @@ require use of the stronger protocol */ | |||
1362 | #define CIFSSEC_MUST_SEAL 0x40040 /* not supported yet */ | 1348 | #define CIFSSEC_MUST_SEAL 0x40040 /* not supported yet */ |
1363 | #define CIFSSEC_MUST_NTLMSSP 0x80080 /* raw ntlmssp with ntlmv2 */ | 1349 | #define CIFSSEC_MUST_NTLMSSP 0x80080 /* raw ntlmssp with ntlmv2 */ |
1364 | 1350 | ||
1365 | #define CIFSSEC_DEF (CIFSSEC_MAY_SIGN | CIFSSEC_MAY_NTLM | CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_NTLMSSP) | 1351 | #define CIFSSEC_DEF (CIFSSEC_MAY_SIGN | CIFSSEC_MAY_NTLMSSP) |
1366 | #define CIFSSEC_MAX (CIFSSEC_MUST_SIGN | CIFSSEC_MUST_NTLMV2) | 1352 | #define CIFSSEC_MAX (CIFSSEC_MUST_SIGN | CIFSSEC_MUST_NTLMV2) |
1367 | #define CIFSSEC_AUTH_MASK (CIFSSEC_MAY_NTLM | CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_LANMAN | CIFSSEC_MAY_PLNTXT | CIFSSEC_MAY_KRB5 | CIFSSEC_MAY_NTLMSSP) | 1353 | #define CIFSSEC_AUTH_MASK (CIFSSEC_MAY_NTLM | CIFSSEC_MAY_NTLMV2 | CIFSSEC_MAY_LANMAN | CIFSSEC_MAY_PLNTXT | CIFSSEC_MAY_KRB5 | CIFSSEC_MAY_NTLMSSP) |
1368 | /* | 1354 | /* |
@@ -1506,6 +1492,6 @@ extern struct smb_version_values smb20_values; | |||
1506 | extern struct smb_version_operations smb21_operations; | 1492 | extern struct smb_version_operations smb21_operations; |
1507 | extern struct smb_version_values smb21_values; | 1493 | extern struct smb_version_values smb21_values; |
1508 | #define SMB30_VERSION_STRING "3.0" | 1494 | #define SMB30_VERSION_STRING "3.0" |
1509 | /*extern struct smb_version_operations smb30_operations; */ /* not needed yet */ | 1495 | extern struct smb_version_operations smb30_operations; |
1510 | extern struct smb_version_values smb30_values; | 1496 | extern struct smb_version_values smb30_values; |
1511 | #endif /* _CIFS_GLOB_H */ | 1497 | #endif /* _CIFS_GLOB_H */ |
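Adding the Smb_20 enumerator (cifsglob.h above) and its token (the connect.c table below) is all the vers= parser needs for a new dialect. A sketch of that table-driven mapping; plain strcmp() stands in for the kernel's match_token(), and the version strings mirror the *_VERSION_STRING macros:

#include <stdio.h>
#include <string.h>

enum smb_version { Smb_1 = 1, Smb_20, Smb_21, Smb_30 };

static const struct { enum smb_version ver; const char *str; } tokens[] = {
        { Smb_1,  "1" },
        { Smb_20, "2.0" },
        { Smb_21, "2.1" },
        { Smb_30, "3.0" },
};

static int parse_vers(const char *arg, enum smb_version *out)
{
        size_t i;

        for (i = 0; i < sizeof(tokens) / sizeof(tokens[0]); i++) {
                if (!strcmp(arg, tokens[i].str)) {
                        *out = tokens[i].ver;
                        return 0;
                }
        }
        return -1;      /* unknown dialect */
}

int main(void)
{
        enum smb_version v;

        if (!parse_vers("2.0", &v))
                printf("vers token -> %d\n", v);        /* 2 (Smb_20) */
        return 0;
}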
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 5144e9fbeb8c..1988c1baa224 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h | |||
@@ -58,8 +58,10 @@ do { \ | |||
58 | } while (0) | 58 | } while (0) |
59 | extern int init_cifs_idmap(void); | 59 | extern int init_cifs_idmap(void); |
60 | extern void exit_cifs_idmap(void); | 60 | extern void exit_cifs_idmap(void); |
61 | extern void cifs_destroy_idmaptrees(void); | ||
62 | extern char *build_path_from_dentry(struct dentry *); | 61 | extern char *build_path_from_dentry(struct dentry *); |
62 | extern char *cifs_build_path_to_root(struct smb_vol *vol, | ||
63 | struct cifs_sb_info *cifs_sb, | ||
64 | struct cifs_tcon *tcon); | ||
63 | extern char *build_wildcard_path_from_dentry(struct dentry *direntry); | 65 | extern char *build_wildcard_path_from_dentry(struct dentry *direntry); |
64 | extern char *cifs_compose_mount_options(const char *sb_mountdata, | 66 | extern char *cifs_compose_mount_options(const char *sb_mountdata, |
65 | const char *fullpath, const struct dfs_info3_param *ref, | 67 | const char *fullpath, const struct dfs_info3_param *ref, |
@@ -107,9 +109,7 @@ extern unsigned int smbCalcSize(void *buf); | |||
107 | extern int decode_negTokenInit(unsigned char *security_blob, int length, | 109 | extern int decode_negTokenInit(unsigned char *security_blob, int length, |
108 | struct TCP_Server_Info *server); | 110 | struct TCP_Server_Info *server); |
109 | extern int cifs_convert_address(struct sockaddr *dst, const char *src, int len); | 111 | extern int cifs_convert_address(struct sockaddr *dst, const char *src, int len); |
110 | extern int cifs_set_port(struct sockaddr *addr, const unsigned short int port); | 112 | extern void cifs_set_port(struct sockaddr *addr, const unsigned short int port); |
111 | extern int cifs_fill_sockaddr(struct sockaddr *dst, const char *src, int len, | ||
112 | const unsigned short int port); | ||
113 | extern int map_smb_to_linux_error(char *buf, bool logErr); | 113 | extern int map_smb_to_linux_error(char *buf, bool logErr); |
114 | extern void header_assemble(struct smb_hdr *, char /* command */ , | 114 | extern void header_assemble(struct smb_hdr *, char /* command */ , |
115 | const struct cifs_tcon *, int /* length of | 115 | const struct cifs_tcon *, int /* length of |
@@ -185,7 +185,7 @@ extern void cifs_mark_open_files_invalid(struct cifs_tcon *tcon); | |||
185 | extern bool cifs_find_lock_conflict(struct cifsFileInfo *cfile, __u64 offset, | 185 | extern bool cifs_find_lock_conflict(struct cifsFileInfo *cfile, __u64 offset, |
186 | __u64 length, __u8 type, | 186 | __u64 length, __u8 type, |
187 | struct cifsLockInfo **conf_lock, | 187 | struct cifsLockInfo **conf_lock, |
188 | bool rw_check); | 188 | int rw_check); |
189 | extern void cifs_add_pending_open(struct cifs_fid *fid, | 189 | extern void cifs_add_pending_open(struct cifs_fid *fid, |
190 | struct tcon_link *tlink, | 190 | struct tcon_link *tlink, |
191 | struct cifs_pending_open *open); | 191 | struct cifs_pending_open *open); |
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 5c670b998ffb..7635b5db26a7 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c | |||
@@ -186,6 +186,7 @@ static const match_table_t cifs_mount_option_tokens = { | |||
186 | { Opt_user, "user=%s" }, | 186 | { Opt_user, "user=%s" }, |
187 | { Opt_user, "username=%s" }, | 187 | { Opt_user, "username=%s" }, |
188 | { Opt_blank_pass, "pass=" }, | 188 | { Opt_blank_pass, "pass=" }, |
189 | { Opt_blank_pass, "password=" }, | ||
189 | { Opt_pass, "pass=%s" }, | 190 | { Opt_pass, "pass=%s" }, |
190 | { Opt_pass, "password=%s" }, | 191 | { Opt_pass, "password=%s" }, |
191 | { Opt_blank_ip, "ip=" }, | 192 | { Opt_blank_ip, "ip=" }, |
@@ -274,6 +275,7 @@ static const match_table_t cifs_cacheflavor_tokens = { | |||
274 | 275 | ||
275 | static const match_table_t cifs_smb_version_tokens = { | 276 | static const match_table_t cifs_smb_version_tokens = { |
276 | { Smb_1, SMB1_VERSION_STRING }, | 277 | { Smb_1, SMB1_VERSION_STRING }, |
278 | { Smb_20, SMB20_VERSION_STRING}, | ||
277 | { Smb_21, SMB21_VERSION_STRING }, | 279 | { Smb_21, SMB21_VERSION_STRING }, |
278 | { Smb_30, SMB30_VERSION_STRING }, | 280 | { Smb_30, SMB30_VERSION_STRING }, |
279 | }; | 281 | }; |
@@ -1074,12 +1076,16 @@ cifs_parse_smb_version(char *value, struct smb_vol *vol) | |||
1074 | vol->vals = &smb1_values; | 1076 | vol->vals = &smb1_values; |
1075 | break; | 1077 | break; |
1076 | #ifdef CONFIG_CIFS_SMB2 | 1078 | #ifdef CONFIG_CIFS_SMB2 |
1079 | case Smb_20: | ||
1080 | vol->ops = &smb21_operations; /* currently identical with 2.1 */ | ||
1081 | vol->vals = &smb20_values; | ||
1082 | break; | ||
1077 | case Smb_21: | 1083 | case Smb_21: |
1078 | vol->ops = &smb21_operations; | 1084 | vol->ops = &smb21_operations; |
1079 | vol->vals = &smb21_values; | 1085 | vol->vals = &smb21_values; |
1080 | break; | 1086 | break; |
1081 | case Smb_30: | 1087 | case Smb_30: |
1082 | vol->ops = &smb21_operations; /* currently identical with 2.1 */ | 1088 | vol->ops = &smb30_operations; |
1083 | vol->vals = &smb30_values; | 1089 | vol->vals = &smb30_values; |
1084 | break; | 1090 | break; |
1085 | #endif | 1091 | #endif |
@@ -1090,6 +1096,52 @@ cifs_parse_smb_version(char *value, struct smb_vol *vol) | |||
1090 | return 0; | 1096 | return 0; |
1091 | } | 1097 | } |
1092 | 1098 | ||
1099 | /* | ||
1100 | * Parse a devname into substrings and populate the vol->UNC and vol->prepath | ||
1101 | * fields with the result. Returns 0 on success and a negative error code otherwise. | ||
1102 | */ | ||
1103 | static int | ||
1104 | cifs_parse_devname(const char *devname, struct smb_vol *vol) | ||
1105 | { | ||
1106 | char *pos; | ||
1107 | const char *delims = "/\\"; | ||
1108 | size_t len; | ||
1109 | |||
1110 | /* make sure we have a valid UNC double delimiter prefix */ | ||
1111 | len = strspn(devname, delims); | ||
1112 | if (len != 2) | ||
1113 | return -EINVAL; | ||
1114 | |||
1115 | /* find delimiter between host and sharename */ | ||
1116 | pos = strpbrk(devname + 2, delims); | ||
1117 | if (!pos) | ||
1118 | return -EINVAL; | ||
1119 | |||
1120 | /* skip past delimiter */ | ||
1121 | ++pos; | ||
1122 | |||
1123 | /* now go until next delimiter or end of string */ | ||
1124 | len = strcspn(pos, delims); | ||
1125 | |||
1126 | /* move "pos" up to delimiter or NULL */ | ||
1127 | pos += len; | ||
1128 | vol->UNC = kstrndup(devname, pos - devname, GFP_KERNEL); | ||
1129 | if (!vol->UNC) | ||
1130 | return -ENOMEM; | ||
1131 | |||
1132 | convert_delimiter(vol->UNC, '\\'); | ||
1133 | |||
1134 | /* if we hit end of string or a bogus trailing delimiter, then no prepath */ | ||
1135 | if (!*pos++ || !*pos) | ||
1136 | return 0; | ||
1137 | |||
1138 | vol->prepath = kstrdup(pos, GFP_KERNEL); | ||
1139 | if (!vol->prepath) | ||
1140 | return -ENOMEM; | ||
1141 | |||
1142 | return 0; | ||
1143 | } | ||
1144 | |||
1093 | static int | 1145 | static int |
1094 | cifs_parse_mount_options(const char *mountdata, const char *devname, | 1146 | cifs_parse_mount_options(const char *mountdata, const char *devname, |
1095 | struct smb_vol *vol) | 1147 | struct smb_vol *vol) |
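The new cifs_parse_devname() above splits a device name such as //server/share/dir1/dir2 into the UNC and an optional prepath using nothing but strspn()/strpbrk()/strcspn(). A hedged userspace re-implementation of the same walk, with glibc strndup()/strdup() standing in for kstrndup()/kstrdup() and errno-style returns (the kernel additionally canonicalizes the UNC via convert_delimiter()):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int parse_devname(const char *devname, char **unc, char **prepath)
{
    const char *delims = "/\\";
    char *pos;
    size_t len;

    *unc = *prepath = NULL;

    /* require the "//" or "\\" UNC double-delimiter prefix */
    if (strspn(devname, delims) != 2)
        return -EINVAL;

    /* find the delimiter between host and share name */
    pos = strpbrk(devname + 2, delims);
    if (!pos)
        return -EINVAL;
    ++pos;

    /* the share name runs to the next delimiter or end of string */
    len = strcspn(pos, delims);
    pos += len;

    *unc = strndup(devname, pos - devname);
    if (!*unc)
        return -ENOMEM;

    /* nothing after the share, or only a bogus trailing delimiter */
    if (!*pos++ || !*pos)
        return 0;

    *prepath = strdup(pos);
    return *prepath ? 0 : -ENOMEM;
}

int main(void)
{
    char *unc, *prepath;

    if (!parse_devname("//server/share/dir1/dir2", &unc, &prepath)) {
        /* prints: UNC: //server/share prepath: dir1/dir2 */
        printf("UNC: %s prepath: %s\n", unc, prepath ? prepath : "");
        free(unc);
        free(prepath);
    }
    return 0;
}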
@@ -1108,11 +1160,17 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, | |||
1108 | char *string = NULL; | 1160 | char *string = NULL; |
1109 | char *tmp_end, *value; | 1161 | char *tmp_end, *value; |
1110 | char delim; | 1162 | char delim; |
1163 | bool got_ip = false; | ||
1164 | unsigned short port = 0; | ||
1165 | struct sockaddr *dstaddr = (struct sockaddr *)&vol->dstaddr; | ||
1111 | 1166 | ||
1112 | separator[0] = ','; | 1167 | separator[0] = ','; |
1113 | separator[1] = 0; | 1168 | separator[1] = 0; |
1114 | delim = separator[0]; | 1169 | delim = separator[0]; |
1115 | 1170 | ||
1171 | /* ensure we always start with zeroed-out smb_vol */ | ||
1172 | memset(vol, 0, sizeof(*vol)); | ||
1173 | |||
1116 | /* | 1174 | /* |
1117 | * does not have to be perfect mapping since field is | 1175 | * does not have to be perfect mapping since field is |
1118 | * informational, only used for servers that do not support | 1176 | * informational, only used for servers that do not support |
@@ -1169,6 +1227,16 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, | |||
1169 | vol->backupuid_specified = false; /* no backup intent for a user */ | 1227 | vol->backupuid_specified = false; /* no backup intent for a user */ |
1170 | vol->backupgid_specified = false; /* no backup intent for a group */ | 1228 | vol->backupgid_specified = false; /* no backup intent for a group */ |
1171 | 1229 | ||
1230 | /* | ||
1231 | * For now, we ignore -EINVAL errors under the assumption that the | ||
1232 | * unc= and prefixpath= options will be usable. | ||
1233 | */ | ||
1234 | if (cifs_parse_devname(devname, vol) == -ENOMEM) { | ||
1235 | printk(KERN_ERR "CIFS: Unable to allocate memory to parse " | ||
1236 | "device string.\n"); | ||
1237 | goto out_nomem; | ||
1238 | } | ||
1239 | |||
1172 | while ((data = strsep(&options, separator)) != NULL) { | 1240 | while ((data = strsep(&options, separator)) != NULL) { |
1173 | substring_t args[MAX_OPT_ARGS]; | 1241 | substring_t args[MAX_OPT_ARGS]; |
1174 | unsigned long option; | 1242 | unsigned long option; |
@@ -1416,12 +1484,12 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, | |||
1416 | vol->dir_mode = option; | 1484 | vol->dir_mode = option; |
1417 | break; | 1485 | break; |
1418 | case Opt_port: | 1486 | case Opt_port: |
1419 | if (get_option_ul(args, &option)) { | 1487 | if (get_option_ul(args, &option) || |
1420 | cERROR(1, "%s: Invalid port value", | 1488 | option > USHRT_MAX) { |
1421 | __func__); | 1489 | cERROR(1, "%s: Invalid port value", __func__); |
1422 | goto cifs_parse_mount_err; | 1490 | goto cifs_parse_mount_err; |
1423 | } | 1491 | } |
1424 | vol->port = option; | 1492 | port = (unsigned short)option; |
1425 | break; | 1493 | break; |
1426 | case Opt_rsize: | 1494 | case Opt_rsize: |
1427 | if (get_option_ul(args, &option)) { | 1495 | if (get_option_ul(args, &option)) { |
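Opt_port now rejects anything above USHRT_MAX before narrowing, where the old code silently truncated vol->port. A minimal sketch of the same check, assuming userspace strtoul() in place of the kernel's get_option_ul() helper:

#include <limits.h>
#include <stdio.h>
#include <stdlib.h>

static int parse_port(const char *value, unsigned short *port)
{
    char *end;
    unsigned long option = strtoul(value, &end, 10);

    /* reject trailing junk and values that cannot fit in 16 bits */
    if (*end != '\0' || option > USHRT_MAX)
        return -1;

    *port = (unsigned short)option;
    return 0;
}

int main(void)
{
    unsigned short port;

    if (!parse_port("445", &port))
        printf("port=%u\n", port);              /* port=445 */
    return parse_port("70000", &port) ? 0 : 1;  /* 70000 is rejected */
}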
@@ -1537,53 +1605,48 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, | |||
1537 | vol->password[j] = '\0'; | 1605 | vol->password[j] = '\0'; |
1538 | break; | 1606 | break; |
1539 | case Opt_blank_ip: | 1607 | case Opt_blank_ip: |
1540 | vol->UNCip = NULL; | 1608 | /* FIXME: should this be an error instead? */ |
1609 | got_ip = false; | ||
1541 | break; | 1610 | break; |
1542 | case Opt_ip: | 1611 | case Opt_ip: |
1543 | string = match_strdup(args); | 1612 | string = match_strdup(args); |
1544 | if (string == NULL) | 1613 | if (string == NULL) |
1545 | goto out_nomem; | 1614 | goto out_nomem; |
1546 | 1615 | ||
1547 | if (strnlen(string, INET6_ADDRSTRLEN) > | 1616 | if (!cifs_convert_address(dstaddr, string, |
1548 | INET6_ADDRSTRLEN) { | 1617 | strlen(string))) { |
1549 | printk(KERN_WARNING "CIFS: ip address " | 1618 | printk(KERN_ERR "CIFS: bad ip= option (%s).\n", |
1550 | "too long\n"); | 1619 | string); |
1551 | goto cifs_parse_mount_err; | ||
1552 | } | ||
1553 | vol->UNCip = kstrdup(string, GFP_KERNEL); | ||
1554 | if (!vol->UNCip) { | ||
1555 | printk(KERN_WARNING "CIFS: no memory " | ||
1556 | "for UNC IP\n"); | ||
1557 | goto cifs_parse_mount_err; | 1620 | goto cifs_parse_mount_err; |
1558 | } | 1621 | } |
1622 | got_ip = true; | ||
1559 | break; | 1623 | break; |
1560 | case Opt_unc: | 1624 | case Opt_unc: |
1561 | string = match_strdup(args); | 1625 | string = vol->UNC; |
1562 | if (string == NULL) | 1626 | vol->UNC = match_strdup(args); |
1627 | if (vol->UNC == NULL) { | ||
1628 | kfree(string); | ||
1563 | goto out_nomem; | 1629 | goto out_nomem; |
1564 | |||
1565 | temp_len = strnlen(string, 300); | ||
1566 | if (temp_len == 300) { | ||
1567 | printk(KERN_WARNING "CIFS: UNC name too long\n"); | ||
1568 | goto cifs_parse_mount_err; | ||
1569 | } | 1630 | } |
1570 | 1631 | ||
1571 | vol->UNC = kmalloc(temp_len+1, GFP_KERNEL); | 1632 | convert_delimiter(vol->UNC, '\\'); |
1572 | if (vol->UNC == NULL) { | 1633 | if (vol->UNC[0] != '\\' || vol->UNC[1] != '\\') { |
1573 | printk(KERN_WARNING "CIFS: no memory for UNC\n"); | 1634 | kfree(string); |
1574 | goto cifs_parse_mount_err; | 1635 | printk(KERN_ERR "CIFS: UNC Path does not " |
1575 | } | 1636 | "begin with // or \\\\\n"); |
1576 | strcpy(vol->UNC, string); | ||
1577 | |||
1578 | if (strncmp(string, "//", 2) == 0) { | ||
1579 | vol->UNC[0] = '\\'; | ||
1580 | vol->UNC[1] = '\\'; | ||
1581 | } else if (strncmp(string, "\\\\", 2) != 0) { | ||
1582 | printk(KERN_WARNING "CIFS: UNC Path does not " | ||
1583 | "begin with // or \\\\\n"); | ||
1584 | goto cifs_parse_mount_err; | 1637 | goto cifs_parse_mount_err; |
1585 | } | 1638 | } |
1586 | 1639 | ||
1640 | /* Compare old unc= option to new one */ | ||
1641 | if (!string || strcmp(string, vol->UNC)) | ||
1642 | printk(KERN_WARNING "CIFS: the value of the " | ||
1643 | "unc= mount option does not match the " | ||
1644 | "device string. Using the unc= option " | ||
1645 | "for now. In 3.10, that option will " | ||
1646 | "be ignored and the contents of the " | ||
1647 | "device string will be used " | ||
1648 | "instead. (%s != %s)\n", string, | ||
1649 | vol->UNC); | ||
1587 | break; | 1650 | break; |
1588 | case Opt_domain: | 1651 | case Opt_domain: |
1589 | string = match_strdup(args); | 1652 | string = match_strdup(args); |
@@ -1618,31 +1681,26 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, | |||
1618 | } | 1681 | } |
1619 | break; | 1682 | break; |
1620 | case Opt_prefixpath: | 1683 | case Opt_prefixpath: |
1621 | string = match_strdup(args); | 1684 | /* skip over any leading delimiter */ |
1622 | if (string == NULL) | 1685 | if (*args[0].from == '/' || *args[0].from == '\\') |
1623 | goto out_nomem; | 1686 | args[0].from++; |
1624 | |||
1625 | temp_len = strnlen(string, 1024); | ||
1626 | if (string[0] != '/') | ||
1627 | temp_len++; /* missing leading slash */ | ||
1628 | if (temp_len > 1024) { | ||
1629 | printk(KERN_WARNING "CIFS: prefix too long\n"); | ||
1630 | goto cifs_parse_mount_err; | ||
1631 | } | ||
1632 | 1687 | ||
1633 | vol->prepath = kmalloc(temp_len+1, GFP_KERNEL); | 1688 | string = vol->prepath; |
1689 | vol->prepath = match_strdup(args); | ||
1634 | if (vol->prepath == NULL) { | 1690 | if (vol->prepath == NULL) { |
1635 | printk(KERN_WARNING "CIFS: no memory " | 1691 | kfree(string); |
1636 | "for path prefix\n"); | 1692 | goto out_nomem; |
1637 | goto cifs_parse_mount_err; | ||
1638 | } | 1693 | } |
1639 | 1694 | /* Compare old prefixpath= option to new one */ | |
1640 | if (string[0] != '/') { | 1695 | if (!string || strcmp(string, vol->prepath)) |
1641 | vol->prepath[0] = '/'; | 1696 | printk(KERN_WARNING "CIFS: the value of the " |
1642 | strcpy(vol->prepath+1, string); | 1697 | "prefixpath= mount option does not " |
1643 | } else | 1698 | "match the device string. Using the " |
1644 | strcpy(vol->prepath, string); | 1699 | "prefixpath= option for now. In 3.10, " |
1645 | 1700 | "that option will be ignored and the " | |
1701 | "contents of the device string will be " | ||
1702 | "used instead. (%s != %s)\n", string, | ||
1703 | vol->prepath); | ||
1646 | break; | 1704 | break; |
1647 | case Opt_iocharset: | 1705 | case Opt_iocharset: |
1648 | string = match_strdup(args); | 1706 | string = match_strdup(args); |
@@ -1799,9 +1857,30 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, | |||
1799 | goto cifs_parse_mount_err; | 1857 | goto cifs_parse_mount_err; |
1800 | } | 1858 | } |
1801 | #endif | 1859 | #endif |
1860 | if (!vol->UNC) { | ||
1861 | cERROR(1, "CIFS mount error: No usable UNC path provided in " | ||
1862 | "device string or in unc= option!"); | ||
1863 | goto cifs_parse_mount_err; | ||
1864 | } | ||
1802 | 1865 | ||
1803 | if (vol->UNCip == NULL) | 1866 | /* make sure UNC has a share name */ |
1804 | vol->UNCip = &vol->UNC[2]; | 1867 | if (!strchr(vol->UNC + 3, '\\')) { |
1868 | cERROR(1, "Malformed UNC. Unable to find share name."); | ||
1869 | goto cifs_parse_mount_err; | ||
1870 | } | ||
1871 | |||
1872 | if (!got_ip) { | ||
1873 | /* No ip= option specified? Try to get it from UNC */ | ||
1874 | if (!cifs_convert_address(dstaddr, &vol->UNC[2], | ||
1875 | strlen(&vol->UNC[2]))) { | ||
1876 | printk(KERN_ERR "Unable to determine destination " | ||
1877 | "address.\n"); | ||
1878 | goto cifs_parse_mount_err; | ||
1879 | } | ||
1880 | } | ||
1881 | |||
1882 | /* set the port that we got earlier */ | ||
1883 | cifs_set_port(dstaddr, port); | ||
1805 | 1884 | ||
1806 | if (uid_specified) | 1885 | if (uid_specified) |
1807 | vol->override_uid = override_uid; | 1886 | vol->override_uid = override_uid; |
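With the UNCip field gone, a missing ip= option now falls back to the host component embedded in the canonicalized UNC (the bytes between the leading backslash pair and the next backslash). A userspace analog of that fallback, assuming numeric addresses only via inet_pton() (the kernel's cifs_convert_address() accepts a few more spellings):

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>

static int addr_from_unc(const char *unc, struct sockaddr_storage *ss)
{
    char host[256];
    size_t len = strcspn(unc + 2, "\\");   /* host ends at next '\' */

    if (len == 0 || len >= sizeof(host))
        return 0;
    memcpy(host, unc + 2, len);
    host[len] = '\0';

    if (inet_pton(AF_INET, host,
                  &((struct sockaddr_in *)ss)->sin_addr) == 1) {
        ss->ss_family = AF_INET;
        return 1;
    }
    if (inet_pton(AF_INET6, host,
                  &((struct sockaddr_in6 *)ss)->sin6_addr) == 1) {
        ss->ss_family = AF_INET6;
        return 1;
    }
    return 0;   /* no usable address - the mount fails, as above */
}

int main(void)
{
    struct sockaddr_storage ss;

    /* the UNC literal below is \\192.168.1.100\public */
    printf("%d\n", addr_from_unc("\\\\192.168.1.100\\public", &ss));
    return 0;
}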
@@ -1972,9 +2051,10 @@ match_security(struct TCP_Server_Info *server, struct smb_vol *vol) | |||
1972 | return true; | 2051 | return true; |
1973 | } | 2052 | } |
1974 | 2053 | ||
1975 | static int match_server(struct TCP_Server_Info *server, struct sockaddr *addr, | 2054 | static int match_server(struct TCP_Server_Info *server, struct smb_vol *vol) |
1976 | struct smb_vol *vol) | ||
1977 | { | 2055 | { |
2056 | struct sockaddr *addr = (struct sockaddr *)&vol->dstaddr; | ||
2057 | |||
1978 | if ((server->vals != vol->vals) || (server->ops != vol->ops)) | 2058 | if ((server->vals != vol->vals) || (server->ops != vol->ops)) |
1979 | return 0; | 2059 | return 0; |
1980 | 2060 | ||
@@ -1995,13 +2075,13 @@ static int match_server(struct TCP_Server_Info *server, struct sockaddr *addr, | |||
1995 | } | 2075 | } |
1996 | 2076 | ||
1997 | static struct TCP_Server_Info * | 2077 | static struct TCP_Server_Info * |
1998 | cifs_find_tcp_session(struct sockaddr *addr, struct smb_vol *vol) | 2078 | cifs_find_tcp_session(struct smb_vol *vol) |
1999 | { | 2079 | { |
2000 | struct TCP_Server_Info *server; | 2080 | struct TCP_Server_Info *server; |
2001 | 2081 | ||
2002 | spin_lock(&cifs_tcp_ses_lock); | 2082 | spin_lock(&cifs_tcp_ses_lock); |
2003 | list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) { | 2083 | list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) { |
2004 | if (!match_server(server, addr, vol)) | 2084 | if (!match_server(server, vol)) |
2005 | continue; | 2085 | continue; |
2006 | 2086 | ||
2007 | ++server->srv_count; | 2087 | ++server->srv_count; |
@@ -2051,40 +2131,12 @@ static struct TCP_Server_Info * | |||
2051 | cifs_get_tcp_session(struct smb_vol *volume_info) | 2131 | cifs_get_tcp_session(struct smb_vol *volume_info) |
2052 | { | 2132 | { |
2053 | struct TCP_Server_Info *tcp_ses = NULL; | 2133 | struct TCP_Server_Info *tcp_ses = NULL; |
2054 | struct sockaddr_storage addr; | ||
2055 | struct sockaddr_in *sin_server = (struct sockaddr_in *) &addr; | ||
2056 | struct sockaddr_in6 *sin_server6 = (struct sockaddr_in6 *) &addr; | ||
2057 | int rc; | 2134 | int rc; |
2058 | 2135 | ||
2059 | memset(&addr, 0, sizeof(struct sockaddr_storage)); | 2136 | cFYI(1, "UNC: %s", volume_info->UNC); |
2060 | |||
2061 | cFYI(1, "UNC: %s ip: %s", volume_info->UNC, volume_info->UNCip); | ||
2062 | |||
2063 | if (volume_info->UNCip && volume_info->UNC) { | ||
2064 | rc = cifs_fill_sockaddr((struct sockaddr *)&addr, | ||
2065 | volume_info->UNCip, | ||
2066 | strlen(volume_info->UNCip), | ||
2067 | volume_info->port); | ||
2068 | if (!rc) { | ||
2069 | /* we failed translating address */ | ||
2070 | rc = -EINVAL; | ||
2071 | goto out_err; | ||
2072 | } | ||
2073 | } else if (volume_info->UNCip) { | ||
2074 | /* BB using ip addr as tcp_ses name to connect to the | ||
2075 | DFS root below */ | ||
2076 | cERROR(1, "Connecting to DFS root not implemented yet"); | ||
2077 | rc = -EINVAL; | ||
2078 | goto out_err; | ||
2079 | } else /* which tcp_sess DFS root would we connect to */ { | ||
2080 | cERROR(1, "CIFS mount error: No UNC path (e.g. -o " | ||
2081 | "unc=//192.168.1.100/public) specified"); | ||
2082 | rc = -EINVAL; | ||
2083 | goto out_err; | ||
2084 | } | ||
2085 | 2137 | ||
2086 | /* see if we already have a matching tcp_ses */ | 2138 | /* see if we already have a matching tcp_ses */ |
2087 | tcp_ses = cifs_find_tcp_session((struct sockaddr *)&addr, volume_info); | 2139 | tcp_ses = cifs_find_tcp_session(volume_info); |
2088 | if (tcp_ses) | 2140 | if (tcp_ses) |
2089 | return tcp_ses; | 2141 | return tcp_ses; |
2090 | 2142 | ||
@@ -2129,27 +2181,18 @@ cifs_get_tcp_session(struct smb_vol *volume_info) | |||
2129 | INIT_LIST_HEAD(&tcp_ses->tcp_ses_list); | 2181 | INIT_LIST_HEAD(&tcp_ses->tcp_ses_list); |
2130 | INIT_LIST_HEAD(&tcp_ses->smb_ses_list); | 2182 | INIT_LIST_HEAD(&tcp_ses->smb_ses_list); |
2131 | INIT_DELAYED_WORK(&tcp_ses->echo, cifs_echo_request); | 2183 | INIT_DELAYED_WORK(&tcp_ses->echo, cifs_echo_request); |
2132 | 2184 | memcpy(&tcp_ses->srcaddr, &volume_info->srcaddr, | |
2185 | sizeof(tcp_ses->srcaddr)); | ||
2186 | memcpy(&tcp_ses->dstaddr, &volume_info->dstaddr, | ||
2187 | sizeof(tcp_ses->dstaddr)); | ||
2133 | /* | 2188 | /* |
2134 | * at this point we are the only ones with the pointer | 2189 | * at this point we are the only ones with the pointer |
2135 | * to the struct since the kernel thread not created yet | 2190 | * to the struct since the kernel thread not created yet |
2136 | * no need to spinlock this init of tcpStatus or srv_count | 2191 | * no need to spinlock this init of tcpStatus or srv_count |
2137 | */ | 2192 | */ |
2138 | tcp_ses->tcpStatus = CifsNew; | 2193 | tcp_ses->tcpStatus = CifsNew; |
2139 | memcpy(&tcp_ses->srcaddr, &volume_info->srcaddr, | ||
2140 | sizeof(tcp_ses->srcaddr)); | ||
2141 | ++tcp_ses->srv_count; | 2194 | ++tcp_ses->srv_count; |
2142 | 2195 | ||
2143 | if (addr.ss_family == AF_INET6) { | ||
2144 | cFYI(1, "attempting ipv6 connect"); | ||
2145 | /* BB should we allow ipv6 on port 139? */ | ||
2146 | /* no other OS observed in the wild doing 139 with v6 */ | ||
2147 | memcpy(&tcp_ses->dstaddr, sin_server6, | ||
2148 | sizeof(struct sockaddr_in6)); | ||
2149 | } else | ||
2150 | memcpy(&tcp_ses->dstaddr, sin_server, | ||
2151 | sizeof(struct sockaddr_in)); | ||
2152 | |||
2153 | rc = ip_connect(tcp_ses); | 2196 | rc = ip_connect(tcp_ses); |
2154 | if (rc < 0) { | 2197 | if (rc < 0) { |
2155 | cERROR(1, "Error connecting to socket. Aborting operation"); | 2198 | cERROR(1, "Error connecting to socket. Aborting operation"); |
@@ -2397,8 +2440,6 @@ cifs_set_cifscreds(struct smb_vol *vol __attribute__((unused)), | |||
2397 | } | 2440 | } |
2398 | #endif /* CONFIG_KEYS */ | 2441 | #endif /* CONFIG_KEYS */ |
2399 | 2442 | ||
2400 | static bool warned_on_ntlm; /* globals init to false automatically */ | ||
2401 | |||
2402 | static struct cifs_ses * | 2443 | static struct cifs_ses * |
2403 | cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info) | 2444 | cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info) |
2404 | { | 2445 | { |
@@ -2475,14 +2516,6 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info) | |||
2475 | ses->cred_uid = volume_info->cred_uid; | 2516 | ses->cred_uid = volume_info->cred_uid; |
2476 | ses->linux_uid = volume_info->linux_uid; | 2517 | ses->linux_uid = volume_info->linux_uid; |
2477 | 2518 | ||
2478 | /* ntlmv2 is much stronger than ntlm security, and has been broadly | ||
2479 | supported for many years, time to update default security mechanism */ | ||
2480 | if ((volume_info->secFlg == 0) && warned_on_ntlm == false) { | ||
2481 | warned_on_ntlm = true; | ||
2482 | cERROR(1, "default security mechanism requested. The default " | ||
2483 | "security mechanism will be upgraded from ntlm to " | ||
2484 | "ntlmv2 in kernel release 3.3"); | ||
2485 | } | ||
2486 | ses->overrideSecFlg = volume_info->secFlg; | 2519 | ses->overrideSecFlg = volume_info->secFlg; |
2487 | 2520 | ||
2488 | mutex_lock(&ses->session_mutex); | 2521 | mutex_lock(&ses->session_mutex); |
@@ -2598,13 +2631,6 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info) | |||
2598 | } | 2631 | } |
2599 | } | 2632 | } |
2600 | 2633 | ||
2601 | if (strchr(volume_info->UNC + 3, '\\') == NULL | ||
2602 | && strchr(volume_info->UNC + 3, '/') == NULL) { | ||
2603 | cERROR(1, "Missing share name"); | ||
2604 | rc = -ENODEV; | ||
2605 | goto out_fail; | ||
2606 | } | ||
2607 | |||
2608 | /* | 2634 | /* |
2609 | * BB Do we need to wrap session_mutex around this TCon call and Unix | 2635 | * BB Do we need to wrap session_mutex around this TCon call and Unix |
2610 | * SetFS as we do on SessSetup and reconnect? | 2636 | * SetFS as we do on SessSetup and reconnect? |
@@ -2718,11 +2744,8 @@ cifs_match_super(struct super_block *sb, void *data) | |||
2718 | struct cifs_ses *ses; | 2744 | struct cifs_ses *ses; |
2719 | struct cifs_tcon *tcon; | 2745 | struct cifs_tcon *tcon; |
2720 | struct tcon_link *tlink; | 2746 | struct tcon_link *tlink; |
2721 | struct sockaddr_storage addr; | ||
2722 | int rc = 0; | 2747 | int rc = 0; |
2723 | 2748 | ||
2724 | memset(&addr, 0, sizeof(struct sockaddr_storage)); | ||
2725 | |||
2726 | spin_lock(&cifs_tcp_ses_lock); | 2749 | spin_lock(&cifs_tcp_ses_lock); |
2727 | cifs_sb = CIFS_SB(sb); | 2750 | cifs_sb = CIFS_SB(sb); |
2728 | tlink = cifs_get_tlink(cifs_sb_master_tlink(cifs_sb)); | 2751 | tlink = cifs_get_tlink(cifs_sb_master_tlink(cifs_sb)); |
@@ -2736,17 +2759,7 @@ cifs_match_super(struct super_block *sb, void *data) | |||
2736 | 2759 | ||
2737 | volume_info = mnt_data->vol; | 2760 | volume_info = mnt_data->vol; |
2738 | 2761 | ||
2739 | if (!volume_info->UNCip || !volume_info->UNC) | 2762 | if (!match_server(tcp_srv, volume_info) || |
2740 | goto out; | ||
2741 | |||
2742 | rc = cifs_fill_sockaddr((struct sockaddr *)&addr, | ||
2743 | volume_info->UNCip, | ||
2744 | strlen(volume_info->UNCip), | ||
2745 | volume_info->port); | ||
2746 | if (!rc) | ||
2747 | goto out; | ||
2748 | |||
2749 | if (!match_server(tcp_srv, (struct sockaddr *)&addr, volume_info) || | ||
2750 | !match_session(ses, volume_info) || | 2763 | !match_session(ses, volume_info) || |
2751 | !match_tcon(tcon, volume_info->UNC)) { | 2764 | !match_tcon(tcon, volume_info->UNC)) { |
2752 | rc = 0; | 2765 | rc = 0; |
@@ -3261,8 +3274,6 @@ cleanup_volume_info_contents(struct smb_vol *volume_info) | |||
3261 | { | 3274 | { |
3262 | kfree(volume_info->username); | 3275 | kfree(volume_info->username); |
3263 | kzfree(volume_info->password); | 3276 | kzfree(volume_info->password); |
3264 | if (volume_info->UNCip != volume_info->UNC + 2) | ||
3265 | kfree(volume_info->UNCip); | ||
3266 | kfree(volume_info->UNC); | 3277 | kfree(volume_info->UNC); |
3267 | kfree(volume_info->domainname); | 3278 | kfree(volume_info->domainname); |
3268 | kfree(volume_info->iocharset); | 3279 | kfree(volume_info->iocharset); |
@@ -3280,14 +3291,16 @@ cifs_cleanup_volume_info(struct smb_vol *volume_info) | |||
3280 | 3291 | ||
3281 | 3292 | ||
3282 | #ifdef CONFIG_CIFS_DFS_UPCALL | 3293 | #ifdef CONFIG_CIFS_DFS_UPCALL |
3283 | /* build_path_to_root returns full path to root when | 3294 | /* |
3284 | * we do not have an existing connection (tcon) */ | 3295 | * cifs_build_path_to_root returns full path to root when we do not have an
3296 | * existing connection (tcon) | ||
3297 | */ | ||
3285 | static char * | 3298 | static char * |
3286 | build_unc_path_to_root(const struct smb_vol *vol, | 3299 | build_unc_path_to_root(const struct smb_vol *vol, |
3287 | const struct cifs_sb_info *cifs_sb) | 3300 | const struct cifs_sb_info *cifs_sb) |
3288 | { | 3301 | { |
3289 | char *full_path, *pos; | 3302 | char *full_path, *pos; |
3290 | unsigned int pplen = vol->prepath ? strlen(vol->prepath) : 0; | 3303 | unsigned int pplen = vol->prepath ? strlen(vol->prepath) + 1 : 0; |
3291 | unsigned int unc_len = strnlen(vol->UNC, MAX_TREE_SIZE + 1); | 3304 | unsigned int unc_len = strnlen(vol->UNC, MAX_TREE_SIZE + 1); |
3292 | 3305 | ||
3293 | full_path = kmalloc(unc_len + pplen + 1, GFP_KERNEL); | 3306 | full_path = kmalloc(unc_len + pplen + 1, GFP_KERNEL); |
@@ -3298,6 +3311,7 @@ build_unc_path_to_root(const struct smb_vol *vol, | |||
3298 | pos = full_path + unc_len; | 3311 | pos = full_path + unc_len; |
3299 | 3312 | ||
3300 | if (pplen) { | 3313 | if (pplen) { |
3314 | *pos++ = CIFS_DIR_SEP(cifs_sb); | ||
3301 | strncpy(pos, vol->prepath, pplen); | 3315 | strncpy(pos, vol->prepath, pplen); |
3302 | pos += pplen; | 3316 | pos += pplen; |
3303 | } | 3317 | } |
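The pplen change above reserves one extra byte because a directory separator is now written before the prepath; without the + 1 the buffer would end up one byte short. A small userspace sketch of the corrected buffer math (hypothetical helper, not the kernel function):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static char *build_unc_path(const char *unc, const char *prepath, char sep)
{
    size_t unc_len = strlen(unc);
    size_t pplen = prepath ? strlen(prepath) + 1 : 0;  /* +1 for sep */
    char *full = malloc(unc_len + pplen + 1);          /* +1 for NUL */
    char *pos;

    if (!full)
        return NULL;
    memcpy(full, unc, unc_len);
    pos = full + unc_len;
    if (pplen) {
        *pos++ = sep;                   /* the byte the +1 pays for */
        memcpy(pos, prepath, pplen - 1);
        pos += pplen - 1;
    }
    *pos = '\0';
    return full;
}

int main(void)
{
    char *p = build_unc_path("\\\\srv\\share", "dir1\\dir2", '\\');

    if (p) {
        puts(p);        /* \\srv\share\dir1\dir2 */
        free(p);
    }
    return 0;
}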
@@ -3353,7 +3367,6 @@ expand_dfs_referral(const unsigned int xid, struct cifs_ses *ses, | |||
3353 | mdata = NULL; | 3367 | mdata = NULL; |
3354 | } else { | 3368 | } else { |
3355 | cleanup_volume_info_contents(volume_info); | 3369 | cleanup_volume_info_contents(volume_info); |
3356 | memset(volume_info, '\0', sizeof(*volume_info)); | ||
3357 | rc = cifs_setup_volume_info(volume_info, mdata, | 3370 | rc = cifs_setup_volume_info(volume_info, mdata, |
3358 | fake_devname); | 3371 | fake_devname); |
3359 | } | 3372 | } |
@@ -3375,7 +3388,6 @@ cifs_setup_volume_info(struct smb_vol *volume_info, char *mount_data, | |||
3375 | if (cifs_parse_mount_options(mount_data, devname, volume_info)) | 3388 | if (cifs_parse_mount_options(mount_data, devname, volume_info)) |
3376 | return -EINVAL; | 3389 | return -EINVAL; |
3377 | 3390 | ||
3378 | |||
3379 | if (volume_info->nullauth) { | 3391 | if (volume_info->nullauth) { |
3380 | cFYI(1, "Anonymous login"); | 3392 | cFYI(1, "Anonymous login"); |
3381 | kfree(volume_info->username); | 3393 | kfree(volume_info->username); |
@@ -3412,7 +3424,7 @@ cifs_get_volume_info(char *mount_data, const char *devname) | |||
3412 | int rc; | 3424 | int rc; |
3413 | struct smb_vol *volume_info; | 3425 | struct smb_vol *volume_info; |
3414 | 3426 | ||
3415 | volume_info = kzalloc(sizeof(struct smb_vol), GFP_KERNEL); | 3427 | volume_info = kmalloc(sizeof(struct smb_vol), GFP_KERNEL); |
3416 | if (!volume_info) | 3428 | if (!volume_info) |
3417 | return ERR_PTR(-ENOMEM); | 3429 | return ERR_PTR(-ENOMEM); |
3418 | 3430 | ||
@@ -3537,8 +3549,10 @@ remote_path_check: | |||
3537 | rc = -ENOSYS; | 3549 | rc = -ENOSYS; |
3538 | goto mount_fail_check; | 3550 | goto mount_fail_check; |
3539 | } | 3551 | } |
3540 | /* build_path_to_root works only when we have a valid tcon */ | 3552 | /* |
3541 | full_path = build_path_to_root(volume_info, cifs_sb, tcon); | 3553 | * cifs_build_path_to_root works only when we have a valid tcon |
3554 | */ | ||
3555 | full_path = cifs_build_path_to_root(volume_info, cifs_sb, tcon); | ||
3542 | if (full_path == NULL) { | 3556 | if (full_path == NULL) { |
3543 | rc = -ENOMEM; | 3557 | rc = -ENOMEM; |
3544 | goto mount_fail_check; | 3558 | goto mount_fail_check; |
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index d3671f2acb29..8719bbe0dcc3 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c | |||
@@ -44,6 +44,38 @@ renew_parental_timestamps(struct dentry *direntry) | |||
44 | } while (!IS_ROOT(direntry)); | 44 | } while (!IS_ROOT(direntry)); |
45 | } | 45 | } |
46 | 46 | ||
47 | char * | ||
48 | cifs_build_path_to_root(struct smb_vol *vol, struct cifs_sb_info *cifs_sb, | ||
49 | struct cifs_tcon *tcon) | ||
50 | { | ||
51 | int pplen = vol->prepath ? strlen(vol->prepath) + 1 : 0; | ||
52 | int dfsplen; | ||
53 | char *full_path = NULL; | ||
54 | |||
55 | /* with no prefix path, the path to the root of the share is simply "" */ | ||
56 | if (pplen == 0) { | ||
57 | full_path = kzalloc(1, GFP_KERNEL); | ||
58 | return full_path; | ||
59 | } | ||
60 | |||
61 | if (tcon->Flags & SMB_SHARE_IS_IN_DFS) | ||
62 | dfsplen = strnlen(tcon->treeName, MAX_TREE_SIZE + 1); | ||
63 | else | ||
64 | dfsplen = 0; | ||
65 | |||
66 | full_path = kmalloc(dfsplen + pplen + 1, GFP_KERNEL); | ||
67 | if (full_path == NULL) | ||
68 | return full_path; | ||
69 | |||
70 | if (dfsplen) | ||
71 | strncpy(full_path, tcon->treeName, dfsplen); | ||
72 | full_path[dfsplen] = CIFS_DIR_SEP(cifs_sb); | ||
73 | strncpy(full_path + dfsplen + 1, vol->prepath, pplen); | ||
74 | convert_delimiter(full_path, CIFS_DIR_SEP(cifs_sb)); | ||
75 | full_path[dfsplen + pplen] = 0; /* add trailing null */ | ||
76 | return full_path; | ||
77 | } | ||
78 | |||
47 | /* Note: caller must free return buffer */ | 79 | /* Note: caller must free return buffer */ |
48 | char * | 80 | char * |
49 | build_path_from_dentry(struct dentry *direntry) | 81 | build_path_from_dentry(struct dentry *direntry) |
diff --git a/fs/cifs/file.c b/fs/cifs/file.c index edb25b4bbb95..0a6677ba212b 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c | |||
@@ -505,16 +505,36 @@ out: | |||
505 | return rc; | 505 | return rc; |
506 | } | 506 | } |
507 | 507 | ||
508 | static int cifs_push_posix_locks(struct cifsFileInfo *cfile); | ||
509 | |||
508 | /* | 510 | /* |
509 | * Try to reacquire byte range locks that were released when session | 511 | * Try to reacquire byte range locks that were released when session |
510 | * to server was lost | 512 | * to server was lost. |
511 | */ | 513 | */ |
512 | static int cifs_relock_file(struct cifsFileInfo *cifsFile) | 514 | static int |
515 | cifs_relock_file(struct cifsFileInfo *cfile) | ||
513 | { | 516 | { |
517 | struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->dentry->d_sb); | ||
518 | struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); | ||
519 | struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); | ||
514 | int rc = 0; | 520 | int rc = 0; |
515 | 521 | ||
516 | /* BB list all locks open on this file and relock */ | 522 | /* we are going to update can_cache_brlcks here - need write access */
523 | down_write(&cinode->lock_sem); | ||
524 | if (cinode->can_cache_brlcks) { | ||
525 | /* can cache locks - no need to push them */ | ||
526 | up_write(&cinode->lock_sem); | ||
527 | return rc; | ||
528 | } | ||
517 | 529 | ||
530 | if (cap_unix(tcon->ses) && | ||
531 | (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) && | ||
532 | ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) | ||
533 | rc = cifs_push_posix_locks(cfile); | ||
534 | else | ||
535 | rc = tcon->ses->server->ops->push_mand_locks(cfile); | ||
536 | |||
537 | up_write(&cinode->lock_sem); | ||
518 | return rc; | 538 | return rc; |
519 | } | 539 | } |
520 | 540 | ||
@@ -739,10 +759,15 @@ cifs_del_lock_waiters(struct cifsLockInfo *lock) | |||
739 | } | 759 | } |
740 | } | 760 | } |
741 | 761 | ||
762 | #define CIFS_LOCK_OP 0 | ||
763 | #define CIFS_READ_OP 1 | ||
764 | #define CIFS_WRITE_OP 2 | ||
765 | |||
766 | /* @rw_check : one of CIFS_LOCK_OP, CIFS_READ_OP or CIFS_WRITE_OP */ | ||
742 | static bool | 767 | static bool |
743 | cifs_find_fid_lock_conflict(struct cifs_fid_locks *fdlocks, __u64 offset, | 768 | cifs_find_fid_lock_conflict(struct cifs_fid_locks *fdlocks, __u64 offset, |
744 | __u64 length, __u8 type, struct cifsFileInfo *cfile, | 769 | __u64 length, __u8 type, struct cifsFileInfo *cfile, |
745 | struct cifsLockInfo **conf_lock, bool rw_check) | 770 | struct cifsLockInfo **conf_lock, int rw_check) |
746 | { | 771 | { |
747 | struct cifsLockInfo *li; | 772 | struct cifsLockInfo *li; |
748 | struct cifsFileInfo *cur_cfile = fdlocks->cfile; | 773 | struct cifsFileInfo *cur_cfile = fdlocks->cfile; |
@@ -752,9 +777,13 @@ cifs_find_fid_lock_conflict(struct cifs_fid_locks *fdlocks, __u64 offset, | |||
752 | if (offset + length <= li->offset || | 777 | if (offset + length <= li->offset || |
753 | offset >= li->offset + li->length) | 778 | offset >= li->offset + li->length) |
754 | continue; | 779 | continue; |
755 | if (rw_check && server->ops->compare_fids(cfile, cur_cfile) && | 780 | if (rw_check != CIFS_LOCK_OP && current->tgid == li->pid && |
756 | current->tgid == li->pid) | 781 | server->ops->compare_fids(cfile, cur_cfile)) { |
757 | continue; | 782 | /* shared lock prevents write op through the same fid */ |
783 | if (!(li->type & server->vals->shared_lock_type) || | ||
784 | rw_check != CIFS_WRITE_OP) | ||
785 | continue; | ||
786 | } | ||
758 | if ((type & server->vals->shared_lock_type) && | 787 | if ((type & server->vals->shared_lock_type) && |
759 | ((server->ops->compare_fids(cfile, cur_cfile) && | 788 | ((server->ops->compare_fids(cfile, cur_cfile) && |
760 | current->tgid == li->pid) || type == li->type)) | 789 | current->tgid == li->pid) || type == li->type)) |
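rw_check is now a tri-state rather than a bool: CIFS_LOCK_OP keeps the old conflict semantics, while CIFS_READ_OP and CIFS_WRITE_OP let a task skip its own locks, except that a shared (read) lock held through the same fid must still block a write. The skip decision, distilled into a standalone model (hypothetical struct held_lock; when this returns true the real code continues on to the type/owner checks):

#include <stdbool.h>
#include <stdio.h>

#define CIFS_LOCK_OP  0
#define CIFS_READ_OP  1
#define CIFS_WRITE_OP 2

struct held_lock { bool shared; int pid; int fid; };

/* true: the held lock must still be considered; false: skip it */
static bool still_considered(const struct held_lock *li, int my_pid,
                             int my_fid, int rw_check)
{
    if (rw_check != CIFS_LOCK_OP && my_pid == li->pid &&
        my_fid == li->fid) {
        /* a shared lock still blocks a write through the same fid */
        if (!li->shared || rw_check != CIFS_WRITE_OP)
            return false;
    }
    return true;
}

int main(void)
{
    struct held_lock li = { .shared = true, .pid = 1, .fid = 1 };

    /* a read through the owning fid skips the lock: prints 0 */
    printf("read:  %d\n", still_considered(&li, 1, 1, CIFS_READ_OP));
    /* a write through the same fid does not: prints 1 */
    printf("write: %d\n", still_considered(&li, 1, 1, CIFS_WRITE_OP));
    return 0;
}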
@@ -769,7 +798,7 @@ cifs_find_fid_lock_conflict(struct cifs_fid_locks *fdlocks, __u64 offset, | |||
769 | bool | 798 | bool |
770 | cifs_find_lock_conflict(struct cifsFileInfo *cfile, __u64 offset, __u64 length, | 799 | cifs_find_lock_conflict(struct cifsFileInfo *cfile, __u64 offset, __u64 length, |
771 | __u8 type, struct cifsLockInfo **conf_lock, | 800 | __u8 type, struct cifsLockInfo **conf_lock, |
772 | bool rw_check) | 801 | int rw_check) |
773 | { | 802 | { |
774 | bool rc = false; | 803 | bool rc = false; |
775 | struct cifs_fid_locks *cur; | 804 | struct cifs_fid_locks *cur; |
@@ -805,7 +834,7 @@ cifs_lock_test(struct cifsFileInfo *cfile, __u64 offset, __u64 length, | |||
805 | down_read(&cinode->lock_sem); | 834 | down_read(&cinode->lock_sem); |
806 | 835 | ||
807 | exist = cifs_find_lock_conflict(cfile, offset, length, type, | 836 | exist = cifs_find_lock_conflict(cfile, offset, length, type, |
808 | &conf_lock, false); | 837 | &conf_lock, CIFS_LOCK_OP); |
809 | if (exist) { | 838 | if (exist) { |
810 | flock->fl_start = conf_lock->offset; | 839 | flock->fl_start = conf_lock->offset; |
811 | flock->fl_end = conf_lock->offset + conf_lock->length - 1; | 840 | flock->fl_end = conf_lock->offset + conf_lock->length - 1; |
@@ -852,7 +881,7 @@ try_again: | |||
852 | down_write(&cinode->lock_sem); | 881 | down_write(&cinode->lock_sem); |
853 | 882 | ||
854 | exist = cifs_find_lock_conflict(cfile, lock->offset, lock->length, | 883 | exist = cifs_find_lock_conflict(cfile, lock->offset, lock->length, |
855 | lock->type, &conf_lock, false); | 884 | lock->type, &conf_lock, CIFS_LOCK_OP); |
856 | if (!exist && cinode->can_cache_brlcks) { | 885 | if (!exist && cinode->can_cache_brlcks) { |
857 | list_add_tail(&lock->llist, &cfile->llist->locks); | 886 | list_add_tail(&lock->llist, &cfile->llist->locks); |
858 | up_write(&cinode->lock_sem); | 887 | up_write(&cinode->lock_sem); |
@@ -948,7 +977,6 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile) | |||
948 | int rc = 0, stored_rc; | 977 | int rc = 0, stored_rc; |
949 | struct cifsLockInfo *li, *tmp; | 978 | struct cifsLockInfo *li, *tmp; |
950 | struct cifs_tcon *tcon; | 979 | struct cifs_tcon *tcon; |
951 | struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); | ||
952 | unsigned int num, max_num, max_buf; | 980 | unsigned int num, max_num, max_buf; |
953 | LOCKING_ANDX_RANGE *buf, *cur; | 981 | LOCKING_ANDX_RANGE *buf, *cur; |
954 | int types[] = {LOCKING_ANDX_LARGE_FILES, | 982 | int types[] = {LOCKING_ANDX_LARGE_FILES, |
@@ -958,21 +986,12 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile) | |||
958 | xid = get_xid(); | 986 | xid = get_xid(); |
959 | tcon = tlink_tcon(cfile->tlink); | 987 | tcon = tlink_tcon(cfile->tlink); |
960 | 988 | ||
961 | /* we are going to update can_cache_brlcks here - need write access */ | ||
962 | down_write(&cinode->lock_sem); | ||
963 | if (!cinode->can_cache_brlcks) { | ||
964 | up_write(&cinode->lock_sem); | ||
965 | free_xid(xid); | ||
966 | return rc; | ||
967 | } | ||
968 | |||
969 | /* | 989 | /* |
970 | * Accessing maxBuf is racy with cifs_reconnect - need to store value | 990 | * Accessing maxBuf is racy with cifs_reconnect - need to store value |
971 | * and check it for zero before using. | 991 | * and check it for zero before using. |
972 | */ | 992 | */ |
973 | max_buf = tcon->ses->server->maxBuf; | 993 | max_buf = tcon->ses->server->maxBuf; |
974 | if (!max_buf) { | 994 | if (!max_buf) { |
975 | up_write(&cinode->lock_sem); | ||
976 | free_xid(xid); | 995 | free_xid(xid); |
977 | return -EINVAL; | 996 | return -EINVAL; |
978 | } | 997 | } |
@@ -981,7 +1000,6 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile) | |||
981 | sizeof(LOCKING_ANDX_RANGE); | 1000 | sizeof(LOCKING_ANDX_RANGE); |
982 | buf = kzalloc(max_num * sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL); | 1001 | buf = kzalloc(max_num * sizeof(LOCKING_ANDX_RANGE), GFP_KERNEL); |
983 | if (!buf) { | 1002 | if (!buf) { |
984 | up_write(&cinode->lock_sem); | ||
985 | free_xid(xid); | 1003 | free_xid(xid); |
986 | return -ENOMEM; | 1004 | return -ENOMEM; |
987 | } | 1005 | } |
@@ -1018,9 +1036,6 @@ cifs_push_mandatory_locks(struct cifsFileInfo *cfile) | |||
1018 | } | 1036 | } |
1019 | } | 1037 | } |
1020 | 1038 | ||
1021 | cinode->can_cache_brlcks = false; | ||
1022 | up_write(&cinode->lock_sem); | ||
1023 | |||
1024 | kfree(buf); | 1039 | kfree(buf); |
1025 | free_xid(xid); | 1040 | free_xid(xid); |
1026 | return rc; | 1041 | return rc; |
@@ -1043,7 +1058,6 @@ struct lock_to_push { | |||
1043 | static int | 1058 | static int |
1044 | cifs_push_posix_locks(struct cifsFileInfo *cfile) | 1059 | cifs_push_posix_locks(struct cifsFileInfo *cfile) |
1045 | { | 1060 | { |
1046 | struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); | ||
1047 | struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); | 1061 | struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); |
1048 | struct file_lock *flock, **before; | 1062 | struct file_lock *flock, **before; |
1049 | unsigned int count = 0, i = 0; | 1063 | unsigned int count = 0, i = 0; |
@@ -1054,14 +1068,6 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile) | |||
1054 | 1068 | ||
1055 | xid = get_xid(); | 1069 | xid = get_xid(); |
1056 | 1070 | ||
1057 | /* we are going to update can_cache_brlcks here - need write access */ | ||
1058 | down_write(&cinode->lock_sem); | ||
1059 | if (!cinode->can_cache_brlcks) { | ||
1060 | up_write(&cinode->lock_sem); | ||
1061 | free_xid(xid); | ||
1062 | return rc; | ||
1063 | } | ||
1064 | |||
1065 | lock_flocks(); | 1071 | lock_flocks(); |
1066 | cifs_for_each_lock(cfile->dentry->d_inode, before) { | 1072 | cifs_for_each_lock(cfile->dentry->d_inode, before) { |
1067 | if ((*before)->fl_flags & FL_POSIX) | 1073 | if ((*before)->fl_flags & FL_POSIX) |
@@ -1127,9 +1133,6 @@ cifs_push_posix_locks(struct cifsFileInfo *cfile) | |||
1127 | } | 1133 | } |
1128 | 1134 | ||
1129 | out: | 1135 | out: |
1130 | cinode->can_cache_brlcks = false; | ||
1131 | up_write(&cinode->lock_sem); | ||
1132 | |||
1133 | free_xid(xid); | 1136 | free_xid(xid); |
1134 | return rc; | 1137 | return rc; |
1135 | err_out: | 1138 | err_out: |
@@ -1144,14 +1147,27 @@ static int | |||
1144 | cifs_push_locks(struct cifsFileInfo *cfile) | 1147 | cifs_push_locks(struct cifsFileInfo *cfile) |
1145 | { | 1148 | { |
1146 | struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->dentry->d_sb); | 1149 | struct cifs_sb_info *cifs_sb = CIFS_SB(cfile->dentry->d_sb); |
1150 | struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode); | ||
1147 | struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); | 1151 | struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); |
1152 | int rc = 0; | ||
1153 | |||
1154 | /* we are going to update can_cache_brlcks here - need write access */ | ||
1155 | down_write(&cinode->lock_sem); | ||
1156 | if (!cinode->can_cache_brlcks) { | ||
1157 | up_write(&cinode->lock_sem); | ||
1158 | return rc; | ||
1159 | } | ||
1148 | 1160 | ||
1149 | if (cap_unix(tcon->ses) && | 1161 | if (cap_unix(tcon->ses) && |
1150 | (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) && | 1162 | (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) && |
1151 | ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) | 1163 | ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) |
1152 | return cifs_push_posix_locks(cfile); | 1164 | rc = cifs_push_posix_locks(cfile); |
1165 | else | ||
1166 | rc = tcon->ses->server->ops->push_mand_locks(cfile); | ||
1153 | 1167 | ||
1154 | return tcon->ses->server->ops->push_mand_locks(cfile); | 1168 | cinode->can_cache_brlcks = false; |
1169 | up_write(&cinode->lock_sem); | ||
1170 | return rc; | ||
1155 | } | 1171 | } |
1156 | 1172 | ||
1157 | static void | 1173 | static void |
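The shape of this refactor: the can_cache_brlcks guard and the flag clear move out of both push helpers into the single caller, so the whole push runs under one writer-side critical section. A userspace sketch of the pattern, with a pthreads rwlock standing in for lock_sem and trivial stand-ins for the two helpers (build with -pthread):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_rwlock_t lock_sem = PTHREAD_RWLOCK_INITIALIZER;
static bool can_cache_brlcks = true;

static int push_posix_locks(void) { return 0; }   /* stand-in */
static int push_mand_locks(void)  { return 0; }   /* stand-in */

static int push_locks(bool use_posix)
{
    int rc = 0;

    /* one writer-side guard replaces the copy in each helper */
    pthread_rwlock_wrlock(&lock_sem);
    if (!can_cache_brlcks) {
        pthread_rwlock_unlock(&lock_sem);
        return rc;
    }
    rc = use_posix ? push_posix_locks() : push_mand_locks();
    can_cache_brlcks = false;   /* cleared exactly once, under the lock */
    pthread_rwlock_unlock(&lock_sem);
    return rc;
}

int main(void)
{
    printf("first push:  %d\n", push_locks(true));
    printf("second push: %d\n", push_locks(true)); /* guard short-circuits */
    return 0;
}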
@@ -1436,16 +1452,18 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type, | |||
1436 | return -ENOMEM; | 1452 | return -ENOMEM; |
1437 | 1453 | ||
1438 | rc = cifs_lock_add_if(cfile, lock, wait_flag); | 1454 | rc = cifs_lock_add_if(cfile, lock, wait_flag); |
1439 | if (rc < 0) | 1455 | if (rc < 0) { |
1440 | kfree(lock); | 1456 | kfree(lock); |
1441 | if (rc <= 0) | 1457 | return rc; |
1458 | } | ||
1459 | if (!rc) | ||
1442 | goto out; | 1460 | goto out; |
1443 | 1461 | ||
1444 | rc = server->ops->mand_lock(xid, cfile, flock->fl_start, length, | 1462 | rc = server->ops->mand_lock(xid, cfile, flock->fl_start, length, |
1445 | type, 1, 0, wait_flag); | 1463 | type, 1, 0, wait_flag); |
1446 | if (rc) { | 1464 | if (rc) { |
1447 | kfree(lock); | 1465 | kfree(lock); |
1448 | goto out; | 1466 | return rc; |
1449 | } | 1467 | } |
1450 | 1468 | ||
1451 | cifs_lock_add(cfile, lock); | 1469 | cifs_lock_add(cfile, lock); |
@@ -1794,7 +1812,6 @@ static int cifs_writepages(struct address_space *mapping, | |||
1794 | struct TCP_Server_Info *server; | 1812 | struct TCP_Server_Info *server; |
1795 | struct page *page; | 1813 | struct page *page; |
1796 | int rc = 0; | 1814 | int rc = 0; |
1797 | loff_t isize = i_size_read(mapping->host); | ||
1798 | 1815 | ||
1799 | /* | 1816 | /* |
1800 | * If wsize is smaller than the page cache size, default to writing | 1817 | * If wsize is smaller than the page cache size, default to writing |
@@ -1899,7 +1916,7 @@ retry: | |||
1899 | */ | 1916 | */ |
1900 | set_page_writeback(page); | 1917 | set_page_writeback(page); |
1901 | 1918 | ||
1902 | if (page_offset(page) >= isize) { | 1919 | if (page_offset(page) >= i_size_read(mapping->host)) { |
1903 | done = true; | 1920 | done = true; |
1904 | unlock_page(page); | 1921 | unlock_page(page); |
1905 | end_page_writeback(page); | 1922 | end_page_writeback(page); |
@@ -1932,7 +1949,8 @@ retry: | |||
1932 | wdata->offset = page_offset(wdata->pages[0]); | 1949 | wdata->offset = page_offset(wdata->pages[0]); |
1933 | wdata->pagesz = PAGE_CACHE_SIZE; | 1950 | wdata->pagesz = PAGE_CACHE_SIZE; |
1934 | wdata->tailsz = | 1951 | wdata->tailsz = |
1935 | min(isize - page_offset(wdata->pages[nr_pages - 1]), | 1952 | min(i_size_read(mapping->host) - |
1953 | page_offset(wdata->pages[nr_pages - 1]), | ||
1936 | (loff_t)PAGE_CACHE_SIZE); | 1954 | (loff_t)PAGE_CACHE_SIZE); |
1937 | wdata->bytes = ((nr_pages - 1) * PAGE_CACHE_SIZE) + | 1955 | wdata->bytes = ((nr_pages - 1) * PAGE_CACHE_SIZE) + |
1938 | wdata->tailsz; | 1956 | wdata->tailsz; |
@@ -2085,7 +2103,15 @@ static int cifs_write_end(struct file *file, struct address_space *mapping, | |||
2085 | } else { | 2103 | } else { |
2086 | rc = copied; | 2104 | rc = copied; |
2087 | pos += copied; | 2105 | pos += copied; |
2088 | set_page_dirty(page); | 2106 | /* |
2107 | * When we use strict cache mode and cifs_strict_writev was run | ||
2108 | * with level II oplock (indicated by leave_pages_clean field of | ||
2109 | * CIFS_I(inode)), we can leave pages clean - cifs_strict_writev | ||
2110 | * sent the data to the server itself. | ||
2111 | */ | ||
2112 | if (!CIFS_I(inode)->leave_pages_clean || | ||
2113 | !(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO)) | ||
2114 | set_page_dirty(page); | ||
2089 | } | 2115 | } |
2090 | 2116 | ||
2091 | if (rc > 0) { | 2117 | if (rc > 0) { |
@@ -2436,8 +2462,8 @@ ssize_t cifs_user_writev(struct kiocb *iocb, const struct iovec *iov, | |||
2436 | } | 2462 | } |
2437 | 2463 | ||
2438 | static ssize_t | 2464 | static ssize_t |
2439 | cifs_writev(struct kiocb *iocb, const struct iovec *iov, | 2465 | cifs_pagecache_writev(struct kiocb *iocb, const struct iovec *iov, |
2440 | unsigned long nr_segs, loff_t pos) | 2466 | unsigned long nr_segs, loff_t pos, bool cache_ex) |
2441 | { | 2467 | { |
2442 | struct file *file = iocb->ki_filp; | 2468 | struct file *file = iocb->ki_filp; |
2443 | struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data; | 2469 | struct cifsFileInfo *cfile = (struct cifsFileInfo *)file->private_data; |
@@ -2457,10 +2483,14 @@ cifs_writev(struct kiocb *iocb, const struct iovec *iov, | |||
2457 | down_read(&cinode->lock_sem); | 2483 | down_read(&cinode->lock_sem); |
2458 | if (!cifs_find_lock_conflict(cfile, pos, iov_length(iov, nr_segs), | 2484 | if (!cifs_find_lock_conflict(cfile, pos, iov_length(iov, nr_segs), |
2459 | server->vals->exclusive_lock_type, NULL, | 2485 | server->vals->exclusive_lock_type, NULL, |
2460 | true)) { | 2486 | CIFS_WRITE_OP)) { |
2461 | mutex_lock(&inode->i_mutex); | 2487 | mutex_lock(&inode->i_mutex); |
2488 | if (!cache_ex) | ||
2489 | cinode->leave_pages_clean = true; | ||
2462 | rc = __generic_file_aio_write(iocb, iov, nr_segs, | 2490 | rc = __generic_file_aio_write(iocb, iov, nr_segs, |
2463 | &iocb->ki_pos); | 2491 | &iocb->ki_pos); |
2492 | if (!cache_ex) | ||
2493 | cinode->leave_pages_clean = false; | ||
2464 | mutex_unlock(&inode->i_mutex); | 2494 | mutex_unlock(&inode->i_mutex); |
2465 | } | 2495 | } |
2466 | 2496 | ||
@@ -2487,42 +2517,62 @@ cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov, | |||
2487 | struct cifsFileInfo *cfile = (struct cifsFileInfo *) | 2517 | struct cifsFileInfo *cfile = (struct cifsFileInfo *) |
2488 | iocb->ki_filp->private_data; | 2518 | iocb->ki_filp->private_data; |
2489 | struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); | 2519 | struct cifs_tcon *tcon = tlink_tcon(cfile->tlink); |
2490 | 2520 | ssize_t written, written2; | |
2491 | #ifdef CONFIG_CIFS_SMB2 | ||
2492 | /* | 2521 | /* |
2493 | * If we have an oplock for read and want to write data to the file | 2522 | * We need to store clientCanCacheAll here to prevent race
2494 | * we need to store it in the page cache and then push it to the server | 2523 | * conditions - this value can be changed during the execution
2495 | * to be sure the next read will get valid data. | 2524 | * of generic_file_aio_write. For CIFS it can be changed from
2525 | * true to false only, but for SMB2 it can be changed both from | ||
2526 | * true to false and vice versa. So, we can end up with data | ||
2527 | * stored in the cache, not marked dirty and not sent to the | ||
2528 | * server if this value changes its state from false to true | ||
2529 | * after cifs_write_end. | ||
2496 | */ | 2530 | */ |
2497 | if (!cinode->clientCanCacheAll && cinode->clientCanCacheRead) { | 2531 | bool cache_ex = cinode->clientCanCacheAll; |
2498 | ssize_t written; | 2532 | bool cache_read = cinode->clientCanCacheRead; |
2499 | int rc; | 2533 | int rc; |
2500 | 2534 | loff_t saved_pos; | |
2501 | written = generic_file_aio_write(iocb, iov, nr_segs, pos); | ||
2502 | rc = filemap_fdatawrite(inode->i_mapping); | ||
2503 | if (rc) | ||
2504 | return (ssize_t)rc; | ||
2505 | 2535 | ||
2506 | return written; | 2536 | if (cache_ex) { |
2537 | if (cap_unix(tcon->ses) && | ||
2538 | ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0) && | ||
2539 | (CIFS_UNIX_FCNTL_CAP & le64_to_cpu( | ||
2540 | tcon->fsUnixInfo.Capability))) | ||
2541 | return generic_file_aio_write(iocb, iov, nr_segs, pos); | ||
2542 | return cifs_pagecache_writev(iocb, iov, nr_segs, pos, cache_ex); | ||
2507 | } | 2543 | } |
2508 | #endif | ||
2509 | 2544 | ||
2510 | /* | 2545 | /* |
2511 | * For non-oplocked files in strict cache mode we need to write the data | 2546 | * For files without exclusive oplock in strict cache mode we need to |
2512 | * to the server exactly from the pos to pos+len-1 rather than flush all | 2547 | * write the data to the server exactly from the pos to pos+len-1 rather |
2513 | * affected pages because it may cause an error with mandatory locks on | 2548 | * than flush all affected pages because it may cause an error with
2514 | * these pages but not on the region from pos to pos+len-1. | 2549 | * mandatory locks on these pages but not on the region from pos to
2550 | * pos+len-1. | ||
2515 | */ | 2551 | */ |
2552 | written = cifs_user_writev(iocb, iov, nr_segs, pos); | ||
2553 | if (!cache_read || written <= 0) | ||
2554 | return written; | ||
2516 | 2555 | ||
2517 | if (!cinode->clientCanCacheAll) | 2556 | saved_pos = iocb->ki_pos; |
2518 | return cifs_user_writev(iocb, iov, nr_segs, pos); | 2557 | iocb->ki_pos = pos; |
2519 | 2558 | /* we have a read oplock - need to store the data in the page cache */
2520 | if (cap_unix(tcon->ses) && | 2559 | if (cap_unix(tcon->ses) && |
2521 | (CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability)) && | 2560 | ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0) && |
2522 | ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0)) | 2561 | (CIFS_UNIX_FCNTL_CAP & le64_to_cpu( |
2523 | return generic_file_aio_write(iocb, iov, nr_segs, pos); | 2562 | tcon->fsUnixInfo.Capability))) |
2524 | 2563 | written2 = generic_file_aio_write(iocb, iov, nr_segs, pos); | |
2525 | return cifs_writev(iocb, iov, nr_segs, pos); | 2564 | else |
2565 | written2 = cifs_pagecache_writev(iocb, iov, nr_segs, pos, | ||
2566 | cache_ex); | ||
2567 | /* errors occurred during writing - invalidate the page cache */ | ||
2568 | if (written2 < 0) { | ||
2569 | rc = cifs_invalidate_mapping(inode); | ||
2570 | if (rc) | ||
2571 | written = (ssize_t)rc; | ||
2572 | else | ||
2573 | iocb->ki_pos = saved_pos; | ||
2574 | } | ||
2575 | return written; | ||
2526 | } | 2576 | } |
2527 | 2577 | ||
2528 | static struct cifs_readdata * | 2578 | static struct cifs_readdata * |
@@ -2892,7 +2942,7 @@ cifs_strict_readv(struct kiocb *iocb, const struct iovec *iov, | |||
2892 | down_read(&cinode->lock_sem); | 2942 | down_read(&cinode->lock_sem); |
2893 | if (!cifs_find_lock_conflict(cfile, pos, iov_length(iov, nr_segs), | 2943 | if (!cifs_find_lock_conflict(cfile, pos, iov_length(iov, nr_segs), |
2894 | tcon->ses->server->vals->shared_lock_type, | 2944 | tcon->ses->server->vals->shared_lock_type, |
2895 | NULL, true)) | 2945 | NULL, CIFS_READ_OP)) |
2896 | rc = generic_file_aio_read(iocb, iov, nr_segs, pos); | 2946 | rc = generic_file_aio_read(iocb, iov, nr_segs, pos); |
2897 | up_read(&cinode->lock_sem); | 2947 | up_read(&cinode->lock_sem); |
2898 | return rc; | 2948 | return rc; |
@@ -3536,7 +3586,7 @@ void cifs_oplock_break(struct work_struct *work) | |||
3536 | if (cinode->clientCanCacheRead == 0) { | 3586 | if (cinode->clientCanCacheRead == 0) { |
3537 | rc = filemap_fdatawait(inode->i_mapping); | 3587 | rc = filemap_fdatawait(inode->i_mapping); |
3538 | mapping_set_error(inode->i_mapping, rc); | 3588 | mapping_set_error(inode->i_mapping, rc); |
3539 | invalidate_remote_inode(inode); | 3589 | cifs_invalidate_mapping(inode); |
3540 | } | 3590 | } |
3541 | cFYI(1, "Oplock flush inode %p rc %d", inode, rc); | 3591 | cFYI(1, "Oplock flush inode %p rc %d", inode, rc); |
3542 | } | 3592 | } |
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index afdff79651f1..ed6208ff85a7 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c | |||
@@ -1791,11 +1791,12 @@ int cifs_getattr(struct vfsmount *mnt, struct dentry *dentry, | |||
1791 | stat->ino = CIFS_I(inode)->uniqueid; | 1791 | stat->ino = CIFS_I(inode)->uniqueid; |
1792 | 1792 | ||
1793 | /* | 1793 | /* |
1794 | * If on a multiuser mount without unix extensions, and the admin hasn't | 1794 | * If on a multiuser mount without unix extensions or cifsacl being |
1795 | * overridden them, set the ownership to the fsuid/fsgid of the current | 1795 | * enabled, and the admin hasn't overridden them, set the ownership |
1796 | * process. | 1796 | * to the fsuid/fsgid of the current process. |
1797 | */ | 1797 | */ |
1798 | if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER) && | 1798 | if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER) && |
1799 | !(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) && | ||
1799 | !tcon->unix_ext) { | 1800 | !tcon->unix_ext) { |
1800 | if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID)) | 1801 | if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID)) |
1801 | stat->uid = current_fsuid(); | 1802 | stat->uid = current_fsuid(); |
diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c index d5ce9e26696c..a82bc51fdc82 100644 --- a/fs/cifs/netmisc.c +++ b/fs/cifs/netmisc.c | |||
@@ -204,7 +204,7 @@ cifs_convert_address(struct sockaddr *dst, const char *src, int len) | |||
204 | return rc; | 204 | return rc; |
205 | } | 205 | } |
206 | 206 | ||
207 | int | 207 | void |
208 | cifs_set_port(struct sockaddr *addr, const unsigned short int port) | 208 | cifs_set_port(struct sockaddr *addr, const unsigned short int port) |
209 | { | 209 | { |
210 | switch (addr->sa_family) { | 210 | switch (addr->sa_family) { |
@@ -214,19 +214,7 @@ cifs_set_port(struct sockaddr *addr, const unsigned short int port) | |||
214 | case AF_INET6: | 214 | case AF_INET6: |
215 | ((struct sockaddr_in6 *)addr)->sin6_port = htons(port); | 215 | ((struct sockaddr_in6 *)addr)->sin6_port = htons(port); |
216 | break; | 216 | break; |
217 | default: | ||
218 | return 0; | ||
219 | } | 217 | } |
220 | return 1; | ||
221 | } | ||
222 | |||
223 | int | ||
224 | cifs_fill_sockaddr(struct sockaddr *dst, const char *src, int len, | ||
225 | const unsigned short int port) | ||
226 | { | ||
227 | if (!cifs_convert_address(dst, src, len)) | ||
228 | return 0; | ||
229 | return cifs_set_port(dst, port); | ||
230 | } | 218 | } |
231 | 219 | ||
232 | /***************************************************************************** | 220 | /***************************************************************************** |
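cifs_set_port() becoming void reflects that every caller now hands it an address already validated by cifs_convert_address(), so an unknown family is silently ignored instead of reported. A userspace mirror of the helper:

#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>

static void set_port(struct sockaddr *addr, unsigned short port)
{
    switch (addr->sa_family) {
    case AF_INET:
        ((struct sockaddr_in *)addr)->sin_port = htons(port);
        break;
    case AF_INET6:
        ((struct sockaddr_in6 *)addr)->sin6_port = htons(port);
        break;
    }
}

int main(void)
{
    struct sockaddr_in sin = { .sin_family = AF_INET };

    set_port((struct sockaddr *)&sin, 445);
    return sin.sin_port == htons(445) ? 0 : 1;
}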
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index f9b5d3d6cf33..6002fdc920ae 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c | |||
@@ -66,18 +66,20 @@ static inline void dump_cifs_file_struct(struct file *file, char *label) | |||
66 | #endif /* DEBUG2 */ | 66 | #endif /* DEBUG2 */ |
67 | 67 | ||
68 | /* | 68 | /* |
69 | * Attempt to preload the dcache with the results from the FIND_FIRST/NEXT | ||
70 | * | ||
69 | * Find the dentry that matches "name". If there isn't one, create one. If it's | 71 | * Find the dentry that matches "name". If there isn't one, create one. If it's |
70 | * a negative dentry or the uniqueid changed, then drop it and recreate it. | 72 | * a negative dentry or the uniqueid changed, then drop it and recreate it. |
71 | */ | 73 | */ |
72 | static struct dentry * | 74 | static void |
73 | cifs_readdir_lookup(struct dentry *parent, struct qstr *name, | 75 | cifs_prime_dcache(struct dentry *parent, struct qstr *name, |
74 | struct cifs_fattr *fattr) | 76 | struct cifs_fattr *fattr) |
75 | { | 77 | { |
76 | struct dentry *dentry, *alias; | 78 | struct dentry *dentry, *alias; |
77 | struct inode *inode; | 79 | struct inode *inode; |
78 | struct super_block *sb = parent->d_inode->i_sb; | 80 | struct super_block *sb = parent->d_inode->i_sb; |
79 | 81 | ||
80 | cFYI(1, "For %s", name->name); | 82 | cFYI(1, "%s: for %s", __func__, name->name); |
81 | 83 | ||
82 | if (parent->d_op && parent->d_op->d_hash) | 84 | if (parent->d_op && parent->d_op->d_hash) |
83 | parent->d_op->d_hash(parent, parent->d_inode, name); | 85 | parent->d_op->d_hash(parent, parent->d_inode, name); |
@@ -86,35 +88,33 @@ cifs_readdir_lookup(struct dentry *parent, struct qstr *name, | |||
86 | 88 | ||
87 | dentry = d_lookup(parent, name); | 89 | dentry = d_lookup(parent, name); |
88 | if (dentry) { | 90 | if (dentry) { |
91 | int err; | ||
92 | |||
89 | inode = dentry->d_inode; | 93 | inode = dentry->d_inode; |
90 | /* update inode in place if i_ino didn't change */ | 94 | /* update inode in place if i_ino didn't change */ |
91 | if (inode && CIFS_I(inode)->uniqueid == fattr->cf_uniqueid) { | 95 | if (inode && CIFS_I(inode)->uniqueid == fattr->cf_uniqueid) { |
92 | cifs_fattr_to_inode(inode, fattr); | 96 | cifs_fattr_to_inode(inode, fattr); |
93 | return dentry; | 97 | goto out; |
94 | } | 98 | } |
95 | d_drop(dentry); | 99 | err = d_invalidate(dentry); |
96 | dput(dentry); | 100 | dput(dentry); |
101 | if (err) | ||
102 | return; | ||
97 | } | 103 | } |
98 | 104 | ||
99 | dentry = d_alloc(parent, name); | 105 | dentry = d_alloc(parent, name); |
100 | if (dentry == NULL) | 106 | if (!dentry) |
101 | return NULL; | 107 | return; |
102 | 108 | ||
103 | inode = cifs_iget(sb, fattr); | 109 | inode = cifs_iget(sb, fattr); |
104 | if (!inode) { | 110 | if (!inode) |
105 | dput(dentry); | 111 | goto out; |
106 | return NULL; | ||
107 | } | ||
108 | 112 | ||
109 | alias = d_materialise_unique(dentry, inode); | 113 | alias = d_materialise_unique(dentry, inode); |
110 | if (alias != NULL) { | 114 | if (alias && !IS_ERR(alias)) |
111 | dput(dentry); | 115 | dput(alias); |
112 | if (IS_ERR(alias)) | 116 | out: |
113 | return NULL; | 117 | dput(dentry); |
114 | dentry = alias; | ||
115 | } | ||
116 | |||
117 | return dentry; | ||
118 | } | 118 | } |
119 | 119 | ||
120 | static void | 120 | static void |
@@ -134,6 +134,16 @@ cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb) | |||
134 | if (fattr->cf_cifsattrs & ATTR_READONLY) | 134 | if (fattr->cf_cifsattrs & ATTR_READONLY) |
135 | fattr->cf_mode &= ~S_IWUGO; | 135 | fattr->cf_mode &= ~S_IWUGO; |
136 | 136 | ||
137 | /* | ||
138 | * We of course don't get ACL info in FIND_FIRST/NEXT results, so | ||
139 | * mark it for revalidation so that "ls -l" will look right. It might | ||
140 | * be super-slow, but if we don't do this then the ownership of files | ||
141 | * may look wrong since the inodes may not have timed out by the time | ||
142 | * "ls" does a stat() call on them. | ||
143 | */ | ||
144 | if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) | ||
145 | fattr->cf_flags |= CIFS_FATTR_NEED_REVAL; | ||
146 | |||
137 | if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL && | 147 | if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL && |
138 | fattr->cf_cifsattrs & ATTR_SYSTEM) { | 148 | fattr->cf_cifsattrs & ATTR_SYSTEM) { |
139 | if (fattr->cf_eof == 0) { | 149 | if (fattr->cf_eof == 0) { |
@@ -649,7 +659,6 @@ static int cifs_filldir(char *find_entry, struct file *file, filldir_t filldir, | |||
649 | struct cifs_sb_info *cifs_sb = CIFS_SB(sb); | 659 | struct cifs_sb_info *cifs_sb = CIFS_SB(sb); |
650 | struct cifs_dirent de = { NULL, }; | 660 | struct cifs_dirent de = { NULL, }; |
651 | struct cifs_fattr fattr; | 661 | struct cifs_fattr fattr; |
652 | struct dentry *dentry; | ||
653 | struct qstr name; | 662 | struct qstr name; |
654 | int rc = 0; | 663 | int rc = 0; |
655 | ino_t ino; | 664 | ino_t ino; |
@@ -720,13 +729,11 @@ static int cifs_filldir(char *find_entry, struct file *file, filldir_t filldir, | |||
720 | */ | 729 | */ |
721 | fattr.cf_flags |= CIFS_FATTR_NEED_REVAL; | 730 | fattr.cf_flags |= CIFS_FATTR_NEED_REVAL; |
722 | 731 | ||
723 | ino = cifs_uniqueid_to_ino_t(fattr.cf_uniqueid); | 732 | cifs_prime_dcache(file->f_dentry, &name, &fattr); |
724 | dentry = cifs_readdir_lookup(file->f_dentry, &name, &fattr); | ||
725 | 733 | ||
734 | ino = cifs_uniqueid_to_ino_t(fattr.cf_uniqueid); | ||
726 | rc = filldir(dirent, name.name, name.len, file->f_pos, ino, | 735 | rc = filldir(dirent, name.name, name.len, file->f_pos, ino, |
727 | fattr.cf_dtype); | 736 | fattr.cf_dtype); |
728 | |||
729 | dput(dentry); | ||
730 | return rc; | 737 | return rc; |
731 | } | 738 | } |
732 | 739 | ||
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index 56cc4be87807..a5d234c8d5d9 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c | |||
@@ -575,37 +575,6 @@ cifs_query_file_info(const unsigned int xid, struct cifs_tcon *tcon, | |||
575 | return CIFSSMBQFileInfo(xid, tcon, fid->netfid, data); | 575 | return CIFSSMBQFileInfo(xid, tcon, fid->netfid, data); |
576 | } | 576 | } |
577 | 577 | ||
578 | static char * | ||
579 | cifs_build_path_to_root(struct smb_vol *vol, struct cifs_sb_info *cifs_sb, | ||
580 | struct cifs_tcon *tcon) | ||
581 | { | ||
582 | int pplen = vol->prepath ? strlen(vol->prepath) : 0; | ||
583 | int dfsplen; | ||
584 | char *full_path = NULL; | ||
585 | |||
586 | /* if no prefix path, simply set path to the root of share to "" */ | ||
587 | if (pplen == 0) { | ||
588 | full_path = kzalloc(1, GFP_KERNEL); | ||
589 | return full_path; | ||
590 | } | ||
591 | |||
592 | if (tcon->Flags & SMB_SHARE_IS_IN_DFS) | ||
593 | dfsplen = strnlen(tcon->treeName, MAX_TREE_SIZE + 1); | ||
594 | else | ||
595 | dfsplen = 0; | ||
596 | |||
597 | full_path = kmalloc(dfsplen + pplen + 1, GFP_KERNEL); | ||
598 | if (full_path == NULL) | ||
599 | return full_path; | ||
600 | |||
601 | if (dfsplen) | ||
602 | strncpy(full_path, tcon->treeName, dfsplen); | ||
603 | strncpy(full_path + dfsplen, vol->prepath, pplen); | ||
604 | convert_delimiter(full_path, CIFS_DIR_SEP(cifs_sb)); | ||
605 | full_path[dfsplen + pplen] = 0; /* add trailing null */ | ||
606 | return full_path; | ||
607 | } | ||
608 | |||
609 | static void | 578 | static void |
610 | cifs_clear_stats(struct cifs_tcon *tcon) | 579 | cifs_clear_stats(struct cifs_tcon *tcon) |
611 | { | 580 | { |
@@ -766,7 +735,6 @@ smb_set_file_info(struct inode *inode, const char *full_path, | |||
766 | struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); | 735 | struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); |
767 | struct tcon_link *tlink = NULL; | 736 | struct tcon_link *tlink = NULL; |
768 | struct cifs_tcon *tcon; | 737 | struct cifs_tcon *tcon; |
769 | FILE_BASIC_INFO info_buf; | ||
770 | 738 | ||
771 | /* if the file is already open for write, just use that fileid */ | 739 | /* if the file is already open for write, just use that fileid */ |
772 | open_file = find_writable_file(cinode, true); | 740 | open_file = find_writable_file(cinode, true); |
@@ -817,7 +785,7 @@ smb_set_file_info(struct inode *inode, const char *full_path, | |||
817 | netpid = current->tgid; | 785 | netpid = current->tgid; |
818 | 786 | ||
819 | set_via_filehandle: | 787 | set_via_filehandle: |
820 | rc = CIFSSMBSetFileInfo(xid, tcon, &info_buf, netfid, netpid); | 788 | rc = CIFSSMBSetFileInfo(xid, tcon, buf, netfid, netpid); |
821 | if (!rc) | 789 | if (!rc) |
822 | cinode->cifsAttrs = le32_to_cpu(buf->Attributes); | 790 | cinode->cifsAttrs = le32_to_cpu(buf->Attributes); |
823 | 791 | ||
@@ -944,7 +912,6 @@ struct smb_version_operations smb1_operations = { | |||
944 | .set_path_size = CIFSSMBSetEOF, | 912 | .set_path_size = CIFSSMBSetEOF, |
945 | .set_file_size = CIFSSMBSetFileSize, | 913 | .set_file_size = CIFSSMBSetFileSize, |
946 | .set_file_info = smb_set_file_info, | 914 | .set_file_info = smb_set_file_info, |
947 | .build_path_to_root = cifs_build_path_to_root, | ||
948 | .echo = CIFSSMBEcho, | 915 | .echo = CIFSSMBEcho, |
949 | .mkdir = CIFSSMBMkDir, | 916 | .mkdir = CIFSSMBMkDir, |
950 | .mkdir_setinfo = cifs_mkdir_setinfo, | 917 | .mkdir_setinfo = cifs_mkdir_setinfo, |
diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c index a93eec30a50d..71e6aed4b382 100644 --- a/fs/cifs/smb2file.c +++ b/fs/cifs/smb2file.c | |||
@@ -260,13 +260,6 @@ smb2_push_mandatory_locks(struct cifsFileInfo *cfile) | |||
260 | struct cifs_fid_locks *fdlocks; | 260 | struct cifs_fid_locks *fdlocks; |
261 | 261 | ||
262 | xid = get_xid(); | 262 | xid = get_xid(); |
263 | /* we are going to update can_cache_brlcks here - need a write access */ | ||
264 | down_write(&cinode->lock_sem); | ||
265 | if (!cinode->can_cache_brlcks) { | ||
266 | up_write(&cinode->lock_sem); | ||
267 | free_xid(xid); | ||
268 | return rc; | ||
269 | } | ||
270 | 263 | ||
271 | /* | 264 | /* |
272 | * Accessing maxBuf is racy with cifs_reconnect - need to store value | 265 | * Accessing maxBuf is racy with cifs_reconnect - need to store value |
@@ -274,7 +267,6 @@ smb2_push_mandatory_locks(struct cifsFileInfo *cfile) | |||
274 | */ | 267 | */ |
275 | max_buf = tlink_tcon(cfile->tlink)->ses->server->maxBuf; | 268 | max_buf = tlink_tcon(cfile->tlink)->ses->server->maxBuf; |
276 | if (!max_buf) { | 269 | if (!max_buf) { |
277 | up_write(&cinode->lock_sem); | ||
278 | free_xid(xid); | 270 | free_xid(xid); |
279 | return -EINVAL; | 271 | return -EINVAL; |
280 | } | 272 | } |
@@ -282,7 +274,6 @@ smb2_push_mandatory_locks(struct cifsFileInfo *cfile) | |||
282 | max_num = max_buf / sizeof(struct smb2_lock_element); | 274 | max_num = max_buf / sizeof(struct smb2_lock_element); |
283 | buf = kzalloc(max_num * sizeof(struct smb2_lock_element), GFP_KERNEL); | 275 | buf = kzalloc(max_num * sizeof(struct smb2_lock_element), GFP_KERNEL); |
284 | if (!buf) { | 276 | if (!buf) { |
285 | up_write(&cinode->lock_sem); | ||
286 | free_xid(xid); | 277 | free_xid(xid); |
287 | return -ENOMEM; | 278 | return -ENOMEM; |
288 | } | 279 | } |
@@ -293,10 +284,7 @@ smb2_push_mandatory_locks(struct cifsFileInfo *cfile) | |||
293 | rc = stored_rc; | 284 | rc = stored_rc; |
294 | } | 285 | } |
295 | 286 | ||
296 | cinode->can_cache_brlcks = false; | ||
297 | kfree(buf); | 287 | kfree(buf); |
298 | |||
299 | up_write(&cinode->lock_sem); | ||
300 | free_xid(xid); | 288 | free_xid(xid); |
301 | return rc; | 289 | return rc; |
302 | } | 290 | } |
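The three hunks above only drop the lock_sem handling; they rely on the caller taking the semaphore and flipping can_cache_brlcks once for all protocols. A sketch of the expected caller shape, assuming the common entry point is cifs_push_locks() (caller-side code is not part of this diff):

        /* sketch: the caller serializes and updates the cache flag once */
        down_write(&cinode->lock_sem);
        if (!cinode->can_cache_brlcks) {
                up_write(&cinode->lock_sem);
                return;
        }
        rc = server->ops->push_mand_locks(cfile);  /* e.g. smb2_push_mandatory_locks */
        cinode->can_cache_brlcks = false;
        up_write(&cinode->lock_sem);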
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 4d9dbe0b7385..d79de7bc4435 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c | |||
@@ -262,23 +262,6 @@ smb2_query_file_info(const unsigned int xid, struct cifs_tcon *tcon, | |||
262 | return rc; | 262 | return rc; |
263 | } | 263 | } |
264 | 264 | ||
265 | static char * | ||
266 | smb2_build_path_to_root(struct smb_vol *vol, struct cifs_sb_info *cifs_sb, | ||
267 | struct cifs_tcon *tcon) | ||
268 | { | ||
269 | int pplen = vol->prepath ? strlen(vol->prepath) : 0; | ||
270 | char *full_path = NULL; | ||
271 | |||
272 | /* if no prefix path, simply set path to the root of share to "" */ | ||
273 | if (pplen == 0) { | ||
274 | full_path = kzalloc(2, GFP_KERNEL); | ||
275 | return full_path; | ||
276 | } | ||
277 | |||
278 | cERROR(1, "prefixpath is not supported for SMB2 now"); | ||
279 | return NULL; | ||
280 | } | ||
281 | |||
282 | static bool | 265 | static bool |
283 | smb2_can_echo(struct TCP_Server_Info *server) | 266 | smb2_can_echo(struct TCP_Server_Info *server) |
284 | { | 267 | { |
@@ -613,7 +596,6 @@ struct smb_version_operations smb21_operations = { | |||
613 | .set_path_size = smb2_set_path_size, | 596 | .set_path_size = smb2_set_path_size, |
614 | .set_file_size = smb2_set_file_size, | 597 | .set_file_size = smb2_set_file_size, |
615 | .set_file_info = smb2_set_file_info, | 598 | .set_file_info = smb2_set_file_info, |
616 | .build_path_to_root = smb2_build_path_to_root, | ||
617 | .mkdir = smb2_mkdir, | 599 | .mkdir = smb2_mkdir, |
618 | .mkdir_setinfo = smb2_mkdir_setinfo, | 600 | .mkdir_setinfo = smb2_mkdir_setinfo, |
619 | .rmdir = smb2_rmdir, | 601 | .rmdir = smb2_rmdir, |
@@ -641,6 +623,91 @@ struct smb_version_operations smb21_operations = { | |||
641 | .get_lease_key = smb2_get_lease_key, | 623 | .get_lease_key = smb2_get_lease_key, |
642 | .set_lease_key = smb2_set_lease_key, | 624 | .set_lease_key = smb2_set_lease_key, |
643 | .new_lease_key = smb2_new_lease_key, | 625 | .new_lease_key = smb2_new_lease_key, |
626 | .calc_signature = smb2_calc_signature, | ||
627 | }; | ||
628 | |||
629 | |||
630 | struct smb_version_operations smb30_operations = { | ||
631 | .compare_fids = smb2_compare_fids, | ||
632 | .setup_request = smb2_setup_request, | ||
633 | .setup_async_request = smb2_setup_async_request, | ||
634 | .check_receive = smb2_check_receive, | ||
635 | .add_credits = smb2_add_credits, | ||
636 | .set_credits = smb2_set_credits, | ||
637 | .get_credits_field = smb2_get_credits_field, | ||
638 | .get_credits = smb2_get_credits, | ||
639 | .get_next_mid = smb2_get_next_mid, | ||
640 | .read_data_offset = smb2_read_data_offset, | ||
641 | .read_data_length = smb2_read_data_length, | ||
642 | .map_error = map_smb2_to_linux_error, | ||
643 | .find_mid = smb2_find_mid, | ||
644 | .check_message = smb2_check_message, | ||
645 | .dump_detail = smb2_dump_detail, | ||
646 | .clear_stats = smb2_clear_stats, | ||
647 | .print_stats = smb2_print_stats, | ||
648 | .is_oplock_break = smb2_is_valid_oplock_break, | ||
649 | .need_neg = smb2_need_neg, | ||
650 | .negotiate = smb2_negotiate, | ||
651 | .negotiate_wsize = smb2_negotiate_wsize, | ||
652 | .negotiate_rsize = smb2_negotiate_rsize, | ||
653 | .sess_setup = SMB2_sess_setup, | ||
654 | .logoff = SMB2_logoff, | ||
655 | .tree_connect = SMB2_tcon, | ||
656 | .tree_disconnect = SMB2_tdis, | ||
657 | .is_path_accessible = smb2_is_path_accessible, | ||
658 | .can_echo = smb2_can_echo, | ||
659 | .echo = SMB2_echo, | ||
660 | .query_path_info = smb2_query_path_info, | ||
661 | .get_srv_inum = smb2_get_srv_inum, | ||
662 | .query_file_info = smb2_query_file_info, | ||
663 | .set_path_size = smb2_set_path_size, | ||
664 | .set_file_size = smb2_set_file_size, | ||
665 | .set_file_info = smb2_set_file_info, | ||
666 | .mkdir = smb2_mkdir, | ||
667 | .mkdir_setinfo = smb2_mkdir_setinfo, | ||
668 | .rmdir = smb2_rmdir, | ||
669 | .unlink = smb2_unlink, | ||
670 | .rename = smb2_rename_path, | ||
671 | .create_hardlink = smb2_create_hardlink, | ||
672 | .open = smb2_open_file, | ||
673 | .set_fid = smb2_set_fid, | ||
674 | .close = smb2_close_file, | ||
675 | .flush = smb2_flush_file, | ||
676 | .async_readv = smb2_async_readv, | ||
677 | .async_writev = smb2_async_writev, | ||
678 | .sync_read = smb2_sync_read, | ||
679 | .sync_write = smb2_sync_write, | ||
680 | .query_dir_first = smb2_query_dir_first, | ||
681 | .query_dir_next = smb2_query_dir_next, | ||
682 | .close_dir = smb2_close_dir, | ||
683 | .calc_smb_size = smb2_calc_size, | ||
684 | .is_status_pending = smb2_is_status_pending, | ||
685 | .oplock_response = smb2_oplock_response, | ||
686 | .queryfs = smb2_queryfs, | ||
687 | .mand_lock = smb2_mand_lock, | ||
688 | .mand_unlock_range = smb2_unlock_range, | ||
689 | .push_mand_locks = smb2_push_mandatory_locks, | ||
690 | .get_lease_key = smb2_get_lease_key, | ||
691 | .set_lease_key = smb2_set_lease_key, | ||
692 | .new_lease_key = smb2_new_lease_key, | ||
693 | .calc_signature = smb3_calc_signature, | ||
694 | }; | ||
695 | |||
696 | struct smb_version_values smb20_values = { | ||
697 | .version_string = SMB20_VERSION_STRING, | ||
698 | .protocol_id = SMB20_PROT_ID, | ||
699 | .req_capabilities = 0, /* MBZ */ | ||
700 | .large_lock_type = 0, | ||
701 | .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, | ||
702 | .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, | ||
703 | .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, | ||
704 | .header_size = sizeof(struct smb2_hdr), | ||
705 | .max_header_size = MAX_SMB2_HDR_SIZE, | ||
706 | .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, | ||
707 | .lock_cmd = SMB2_LOCK, | ||
708 | .cap_unix = 0, | ||
709 | .cap_nt_find = SMB2_NT_FIND, | ||
710 | .cap_large_files = SMB2_LARGE_FILES, | ||
644 | }; | 711 | }; |
645 | 712 | ||
646 | struct smb_version_values smb21_values = { | 713 | struct smb_version_values smb21_values = { |
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index cf33622cdac8..41d9d0725f0f 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c | |||
@@ -425,7 +425,7 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) | |||
425 | } | 425 | } |
426 | 426 | ||
427 | cFYI(1, "sec_flags 0x%x", sec_flags); | 427 | cFYI(1, "sec_flags 0x%x", sec_flags); |
428 | if (sec_flags & CIFSSEC_MUST_SIGN) { | 428 | if ((sec_flags & CIFSSEC_MUST_SIGN) == CIFSSEC_MUST_SIGN) { |
429 | cFYI(1, "Signing required"); | 429 | cFYI(1, "Signing required"); |
430 | if (!(server->sec_mode & (SMB2_NEGOTIATE_SIGNING_REQUIRED | | 430 | if (!(server->sec_mode & (SMB2_NEGOTIATE_SIGNING_REQUIRED | |
431 | SMB2_NEGOTIATE_SIGNING_ENABLED))) { | 431 | SMB2_NEGOTIATE_SIGNING_ENABLED))) { |
@@ -612,7 +612,8 @@ ssetup_ntlmssp_authenticate: | |||
612 | 612 | ||
613 | /* BB add code to build os and lm fields */ | 613 | /* BB add code to build os and lm fields */ |
614 | 614 | ||
615 | rc = SendReceive2(xid, ses, iov, 2, &resp_buftype, CIFS_LOG_ERROR); | 615 | rc = SendReceive2(xid, ses, iov, 2, &resp_buftype, |
616 | CIFS_LOG_ERROR | CIFS_NEG_OP); | ||
616 | 617 | ||
617 | kfree(security_blob); | 618 | kfree(security_blob); |
618 | rsp = (struct smb2_sess_setup_rsp *)iov[0].iov_base; | 619 | rsp = (struct smb2_sess_setup_rsp *)iov[0].iov_base; |
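The "== CIFSSEC_MUST_SIGN" test matters because the MUST_* security flags are multi-bit values layered on their MAY_* counterparts, so a truthy "&" can fire on a partial overlap. With illustrative values (assumed layout, not quoted from this patch):

        #define MAY_SIGN        0x00001                 /* hypothetical */
        #define MUST_SIGN       (0x01000 | MAY_SIGN)    /* hypothetical */

        sec_flags = MAY_SIGN;                   /* signing allowed, not required */
        sec_flags & MUST_SIGN;                  /* nonzero: wrongly "required"   */
        (sec_flags & MUST_SIGN) == MUST_SIGN;   /* false: correctly optional     */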
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index 7d25f8b14f93..2aa3535e38ce 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h | |||
@@ -47,6 +47,10 @@ extern struct mid_q_entry *smb2_setup_request(struct cifs_ses *ses, | |||
47 | struct smb_rqst *rqst); | 47 | struct smb_rqst *rqst); |
48 | extern struct mid_q_entry *smb2_setup_async_request( | 48 | extern struct mid_q_entry *smb2_setup_async_request( |
49 | struct TCP_Server_Info *server, struct smb_rqst *rqst); | 49 | struct TCP_Server_Info *server, struct smb_rqst *rqst); |
50 | extern int smb2_calc_signature(struct smb_rqst *rqst, | ||
51 | struct TCP_Server_Info *server); | ||
52 | extern int smb3_calc_signature(struct smb_rqst *rqst, | ||
53 | struct TCP_Server_Info *server); | ||
50 | extern void smb2_echo_request(struct work_struct *work); | 54 | extern void smb2_echo_request(struct work_struct *work); |
51 | extern __le32 smb2_get_lease_state(struct cifsInodeInfo *cinode); | 55 | extern __le32 smb2_get_lease_state(struct cifsInodeInfo *cinode); |
52 | extern __u8 smb2_map_lease_to_oplock(__le32 lease_state); | 56 | extern __u8 smb2_map_lease_to_oplock(__le32 lease_state); |
diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c index 2a5fdf26f79f..8dd73e61d762 100644 --- a/fs/cifs/smb2transport.c +++ b/fs/cifs/smb2transport.c | |||
@@ -39,7 +39,7 @@ | |||
39 | #include "smb2status.h" | 39 | #include "smb2status.h" |
40 | #include "smb2glob.h" | 40 | #include "smb2glob.h" |
41 | 41 | ||
42 | static int | 42 | int |
43 | smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) | 43 | smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) |
44 | { | 44 | { |
45 | int i, rc; | 45 | int i, rc; |
@@ -116,6 +116,13 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) | |||
116 | return rc; | 116 | return rc; |
117 | } | 117 | } |
118 | 118 | ||
119 | int | ||
120 | smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) | ||
121 | { | ||
122 | cFYI(1, "smb3 signatures not supported yet"); | ||
123 | return -EOPNOTSUPP; | ||
124 | } | ||
125 | |||
119 | /* must be called with server->srv_mutex held */ | 126 | /* must be called with server->srv_mutex held */ |
120 | static int | 127 | static int |
121 | smb2_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server) | 128 | smb2_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server) |
@@ -132,7 +139,7 @@ smb2_sign_rqst(struct smb_rqst *rqst, struct TCP_Server_Info *server) | |||
132 | return rc; | 139 | return rc; |
133 | } | 140 | } |
134 | 141 | ||
135 | rc = smb2_calc_signature(rqst, server); | 142 | rc = server->ops->calc_signature(rqst, server); |
136 | 143 | ||
137 | return rc; | 144 | return rc; |
138 | } | 145 | } |
@@ -168,7 +175,7 @@ smb2_verify_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) | |||
168 | memset(smb2_pdu->Signature, 0, SMB2_SIGNATURE_SIZE); | 175 | memset(smb2_pdu->Signature, 0, SMB2_SIGNATURE_SIZE); |
169 | 176 | ||
170 | mutex_lock(&server->srv_mutex); | 177 | mutex_lock(&server->srv_mutex); |
171 | rc = smb2_calc_signature(rqst, server); | 178 | rc = server->ops->calc_signature(rqst, server); |
172 | mutex_unlock(&server->srv_mutex); | 179 | mutex_unlock(&server->srv_mutex); |
173 | 180 | ||
174 | if (rc) | 181 | if (rc) |
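With calc_signature promoted into the ops table, both signing and verification dispatch per dialect. The effect of the wiring above, sketched at the call site:

        rc = server->ops->calc_signature(rqst, server);
        /* == smb2_calc_signature on SMB2.0/2.1 mounts;
         * == smb3_calc_signature on SMB3.0 mounts, which returns -EOPNOTSUPP
         *    for now, so a signed SMB3 session fails cleanly instead of being
         *    stamped with an SMB2-style signature. */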
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 4c6285fff598..e2f57a007029 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c | |||
@@ -844,6 +844,9 @@ COMPATIBLE_IOCTL(TIOCGDEV) | |||
844 | COMPATIBLE_IOCTL(TIOCCBRK) | 844 | COMPATIBLE_IOCTL(TIOCCBRK) |
845 | COMPATIBLE_IOCTL(TIOCGSID) | 845 | COMPATIBLE_IOCTL(TIOCGSID) |
846 | COMPATIBLE_IOCTL(TIOCGICOUNT) | 846 | COMPATIBLE_IOCTL(TIOCGICOUNT) |
847 | COMPATIBLE_IOCTL(TIOCGPKT) | ||
848 | COMPATIBLE_IOCTL(TIOCGPTLCK) | ||
849 | COMPATIBLE_IOCTL(TIOCGEXCL) | ||
847 | /* Little t */ | 850 | /* Little t */ |
848 | COMPATIBLE_IOCTL(TIOCGETD) | 851 | COMPATIBLE_IOCTL(TIOCGETD) |
849 | COMPATIBLE_IOCTL(TIOCSETD) | 852 | COMPATIBLE_IOCTL(TIOCSETD) |
diff --git a/fs/coredump.c b/fs/coredump.c index ce47379bfa61..177493272a61 100644 --- a/fs/coredump.c +++ b/fs/coredump.c | |||
@@ -458,7 +458,7 @@ static int umh_pipe_setup(struct subprocess_info *info, struct cred *new) | |||
458 | return err; | 458 | return err; |
459 | } | 459 | } |
460 | 460 | ||
461 | void do_coredump(siginfo_t *siginfo, struct pt_regs *regs) | 461 | void do_coredump(siginfo_t *siginfo) |
462 | { | 462 | { |
463 | struct core_state core_state; | 463 | struct core_state core_state; |
464 | struct core_name cn; | 464 | struct core_name cn; |
@@ -474,7 +474,7 @@ void do_coredump(siginfo_t *siginfo, struct pt_regs *regs) | |||
474 | static atomic_t core_dump_count = ATOMIC_INIT(0); | 474 | static atomic_t core_dump_count = ATOMIC_INIT(0); |
475 | struct coredump_params cprm = { | 475 | struct coredump_params cprm = { |
476 | .siginfo = siginfo, | 476 | .siginfo = siginfo, |
477 | .regs = regs, | 477 | .regs = signal_pt_regs(), |
478 | .limit = rlimit(RLIMIT_CORE), | 478 | .limit = rlimit(RLIMIT_CORE), |
479 | /* | 479 | /* |
480 | * We must use the same mm->flags while dumping core to avoid | 480 | * We must use the same mm->flags while dumping core to avoid |
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index b607d92cdf24..153bb1e42e63 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c | |||
@@ -59,7 +59,6 @@ static struct inode *debugfs_get_inode(struct super_block *sb, umode_t mode, dev | |||
59 | case S_IFDIR: | 59 | case S_IFDIR: |
60 | inode->i_op = &simple_dir_inode_operations; | 60 | inode->i_op = &simple_dir_inode_operations; |
61 | inode->i_fop = &simple_dir_operations; | 61 | inode->i_fop = &simple_dir_operations; |
62 | inode->i_private = NULL; | ||
63 | 62 | ||
64 | /* directory inodes start off with i_nlink == 2 | 63 | /* directory inodes start off with i_nlink == 2 |
65 | * (for "." entry) */ | 64 | * (for "." entry) */ |
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 14afbabe6546..472e6befc54d 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c | |||
@@ -545,37 +545,38 @@ void devpts_kill_index(struct inode *ptmx_inode, int idx) | |||
545 | mutex_unlock(&allocated_ptys_lock); | 545 | mutex_unlock(&allocated_ptys_lock); |
546 | } | 546 | } |
547 | 547 | ||
548 | int devpts_pty_new(struct inode *ptmx_inode, struct tty_struct *tty) | 548 | /** |
549 | * devpts_pty_new -- create a new inode in /dev/pts/ | ||
550 | * @ptmx_inode: inode of the master | ||
551 | * @device: major+minor of the node to be created | ||
552 | * @index: used as a name of the node | ||
553 | * @priv: what's given back by devpts_get_priv | ||
554 | * | ||
555 | * The created inode is returned. Remove it from /dev/pts/ by devpts_pty_kill. | ||
556 | */ | ||
557 | struct inode *devpts_pty_new(struct inode *ptmx_inode, dev_t device, int index, | ||
558 | void *priv) | ||
549 | { | 559 | { |
550 | /* tty layer puts index from devpts_new_index() in here */ | ||
551 | int number = tty->index; | ||
552 | struct tty_driver *driver = tty->driver; | ||
553 | dev_t device = MKDEV(driver->major, driver->minor_start+number); | ||
554 | struct dentry *dentry; | 560 | struct dentry *dentry; |
555 | struct super_block *sb = pts_sb_from_inode(ptmx_inode); | 561 | struct super_block *sb = pts_sb_from_inode(ptmx_inode); |
556 | struct inode *inode = new_inode(sb); | 562 | struct inode *inode; |
557 | struct dentry *root = sb->s_root; | 563 | struct dentry *root = sb->s_root; |
558 | struct pts_fs_info *fsi = DEVPTS_SB(sb); | 564 | struct pts_fs_info *fsi = DEVPTS_SB(sb); |
559 | struct pts_mount_opts *opts = &fsi->mount_opts; | 565 | struct pts_mount_opts *opts = &fsi->mount_opts; |
560 | int ret = 0; | ||
561 | char s[12]; | 566 | char s[12]; |
562 | 567 | ||
563 | /* We're supposed to be given the slave end of a pty */ | 568 | inode = new_inode(sb); |
564 | BUG_ON(driver->type != TTY_DRIVER_TYPE_PTY); | ||
565 | BUG_ON(driver->subtype != PTY_TYPE_SLAVE); | ||
566 | |||
567 | if (!inode) | 569 | if (!inode) |
568 | return -ENOMEM; | 570 | return ERR_PTR(-ENOMEM); |
569 | 571 | ||
570 | inode->i_ino = number + 3; | 572 | inode->i_ino = index + 3; |
571 | inode->i_uid = opts->setuid ? opts->uid : current_fsuid(); | 573 | inode->i_uid = opts->setuid ? opts->uid : current_fsuid(); |
572 | inode->i_gid = opts->setgid ? opts->gid : current_fsgid(); | 574 | inode->i_gid = opts->setgid ? opts->gid : current_fsgid(); |
573 | inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; | 575 | inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; |
574 | init_special_inode(inode, S_IFCHR|opts->mode, device); | 576 | init_special_inode(inode, S_IFCHR|opts->mode, device); |
575 | inode->i_private = tty; | 577 | inode->i_private = priv; |
576 | tty->driver_data = inode; | ||
577 | 578 | ||
578 | sprintf(s, "%d", number); | 579 | sprintf(s, "%d", index); |
579 | 580 | ||
580 | mutex_lock(&root->d_inode->i_mutex); | 581 | mutex_lock(&root->d_inode->i_mutex); |
581 | 582 | ||
@@ -585,18 +586,24 @@ int devpts_pty_new(struct inode *ptmx_inode, struct tty_struct *tty) | |||
585 | fsnotify_create(root->d_inode, dentry); | 586 | fsnotify_create(root->d_inode, dentry); |
586 | } else { | 587 | } else { |
587 | iput(inode); | 588 | iput(inode); |
588 | ret = -ENOMEM; | 589 | inode = ERR_PTR(-ENOMEM); |
589 | } | 590 | } |
590 | 591 | ||
591 | mutex_unlock(&root->d_inode->i_mutex); | 592 | mutex_unlock(&root->d_inode->i_mutex); |
592 | 593 | ||
593 | return ret; | 594 | return inode; |
594 | } | 595 | } |
595 | 596 | ||
596 | struct tty_struct *devpts_get_tty(struct inode *pts_inode, int number) | 597 | /** |
598 | * devpts_get_priv -- get private data for a slave | ||
599 | * @pts_inode: inode of the slave | ||
600 | * | ||
601 | * Returns whatever was passed as priv in devpts_pty_new for a given inode. | ||
602 | */ | ||
603 | void *devpts_get_priv(struct inode *pts_inode) | ||
597 | { | 604 | { |
598 | struct dentry *dentry; | 605 | struct dentry *dentry; |
599 | struct tty_struct *tty; | 606 | void *priv = NULL; |
600 | 607 | ||
601 | BUG_ON(pts_inode->i_rdev == MKDEV(TTYAUX_MAJOR, PTMX_MINOR)); | 608 | BUG_ON(pts_inode->i_rdev == MKDEV(TTYAUX_MAJOR, PTMX_MINOR)); |
602 | 609 | ||
@@ -605,18 +612,22 @@ struct tty_struct *devpts_get_tty(struct inode *pts_inode, int number) | |||
605 | if (!dentry) | 612 | if (!dentry) |
606 | return NULL; | 613 | return NULL; |
607 | 614 | ||
608 | tty = NULL; | ||
609 | if (pts_inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC) | 615 | if (pts_inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC) |
610 | tty = (struct tty_struct *)pts_inode->i_private; | 616 | priv = pts_inode->i_private; |
611 | 617 | ||
612 | dput(dentry); | 618 | dput(dentry); |
613 | 619 | ||
614 | return tty; | 620 | return priv; |
615 | } | 621 | } |
616 | 622 | ||
617 | void devpts_pty_kill(struct tty_struct *tty) | 623 | /** |
624 | * devpts_pty_kill -- remove inode from /dev/pts/ | ||
625 | * @inode: inode of the slave to be removed | ||
626 | * | ||
627 | * This is the inverse operation of devpts_pty_new. | ||
628 | */ | ||
629 | void devpts_pty_kill(struct inode *inode) | ||
618 | { | 630 | { |
619 | struct inode *inode = tty->driver_data; | ||
620 | struct super_block *sb = pts_sb_from_inode(inode); | 631 | struct super_block *sb = pts_sb_from_inode(inode); |
621 | struct dentry *root = sb->s_root; | 632 | struct dentry *root = sb->s_root; |
622 | struct dentry *dentry; | 633 | struct dentry *dentry; |
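Taken together, the devpts hunks replace the tty-coupled interface with an opaque-pointer one keyed by the slave inode. A hedged sketch of how a pty driver is expected to use it after this change (caller-side variable names are illustrative, not from this diff):

        struct inode *slave;
        dev_t device = MKDEV(driver->major, driver->minor_start + index);

        slave = devpts_pty_new(ptmx_inode, device, index, tty);
        if (IS_ERR(slave))
                return PTR_ERR(slave);          /* e.g. -ENOMEM */

        /* later, the stashed pointer comes back from the inode */
        struct tty_struct *t = devpts_get_priv(slave);

        /* teardown is now keyed by the inode, not the tty */
        devpts_pty_kill(slave);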
diff --git a/fs/direct-io.c b/fs/direct-io.c index f86c720dba0e..cf5b44b10c67 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c | |||
@@ -540,6 +540,7 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio, | |||
540 | sector_t fs_endblk; /* Into file, in filesystem-sized blocks */ | 540 | sector_t fs_endblk; /* Into file, in filesystem-sized blocks */ |
541 | unsigned long fs_count; /* Number of filesystem-sized blocks */ | 541 | unsigned long fs_count; /* Number of filesystem-sized blocks */ |
542 | int create; | 542 | int create; |
543 | unsigned int i_blkbits = sdio->blkbits + sdio->blkfactor; | ||
543 | 544 | ||
544 | /* | 545 | /* |
545 | * If there was a memory error and we've overwritten all the | 546 | * If there was a memory error and we've overwritten all the |
@@ -554,7 +555,7 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio, | |||
554 | fs_count = fs_endblk - fs_startblk + 1; | 555 | fs_count = fs_endblk - fs_startblk + 1; |
555 | 556 | ||
556 | map_bh->b_state = 0; | 557 | map_bh->b_state = 0; |
557 | map_bh->b_size = fs_count << dio->inode->i_blkbits; | 558 | map_bh->b_size = fs_count << i_blkbits; |
558 | 559 | ||
559 | /* | 560 | /* |
560 | * For writes inside i_size on a DIO_SKIP_HOLES filesystem we | 561 | * For writes inside i_size on a DIO_SKIP_HOLES filesystem we |
@@ -1053,7 +1054,8 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, | |||
1053 | int seg; | 1054 | int seg; |
1054 | size_t size; | 1055 | size_t size; |
1055 | unsigned long addr; | 1056 | unsigned long addr; |
1056 | unsigned blkbits = inode->i_blkbits; | 1057 | unsigned i_blkbits = ACCESS_ONCE(inode->i_blkbits); |
1058 | unsigned blkbits = i_blkbits; | ||
1057 | unsigned blocksize_mask = (1 << blkbits) - 1; | 1059 | unsigned blocksize_mask = (1 << blkbits) - 1; |
1058 | ssize_t retval = -EINVAL; | 1060 | ssize_t retval = -EINVAL; |
1059 | loff_t end = offset; | 1061 | loff_t end = offset; |
@@ -1149,7 +1151,7 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, | |||
1149 | dio->inode = inode; | 1151 | dio->inode = inode; |
1150 | dio->rw = rw; | 1152 | dio->rw = rw; |
1151 | sdio.blkbits = blkbits; | 1153 | sdio.blkbits = blkbits; |
1152 | sdio.blkfactor = inode->i_blkbits - blkbits; | 1154 | sdio.blkfactor = i_blkbits - blkbits; |
1153 | sdio.block_in_file = offset >> blkbits; | 1155 | sdio.block_in_file = offset >> blkbits; |
1154 | 1156 | ||
1155 | sdio.get_block = get_block; | 1157 | sdio.get_block = get_block; |
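All three direct-io hunks are one fix: i_blkbits can change while a request is being set up, so it is sampled exactly once and every derived quantity uses the same snapshot. The pattern in isolation:

        unsigned i_blkbits = ACCESS_ONCE(inode->i_blkbits); /* single read  */
        unsigned blkbits = i_blkbits;   /* may be lowered for bdev I/O      */
        ...
        sdio.blkbits = blkbits;
        sdio.blkfactor = i_blkbits - blkbits;   /* same snapshot, no skew   */

get_more_blocks() then reconstructs the inode block size as sdio->blkbits + sdio->blkfactor instead of re-reading inode->i_blkbits.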
diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig index 1897eb1b4b6a..e4242c3f8486 100644 --- a/fs/dlm/Kconfig +++ b/fs/dlm/Kconfig | |||
@@ -1,6 +1,6 @@ | |||
1 | menuconfig DLM | 1 | menuconfig DLM |
2 | tristate "Distributed Lock Manager (DLM)" | 2 | tristate "Distributed Lock Manager (DLM)" |
3 | depends on EXPERIMENTAL && INET | 3 | depends on INET |
4 | depends on SYSFS && CONFIGFS_FS && (IPV6 || IPV6=n) | 4 | depends on SYSFS && CONFIGFS_FS && (IPV6 || IPV6=n) |
5 | select IP_SCTP | 5 | select IP_SCTP |
6 | help | 6 | help |
diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index 871c1abf6029..77c0f70f8fe8 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h | |||
@@ -337,6 +337,7 @@ enum rsb_flags { | |||
337 | RSB_NEW_MASTER2, | 337 | RSB_NEW_MASTER2, |
338 | RSB_RECOVER_CONVERT, | 338 | RSB_RECOVER_CONVERT, |
339 | RSB_RECOVER_GRANT, | 339 | RSB_RECOVER_GRANT, |
340 | RSB_RECOVER_LVB_INVAL, | ||
340 | }; | 341 | }; |
341 | 342 | ||
342 | static inline void rsb_set_flag(struct dlm_rsb *r, enum rsb_flags flag) | 343 | static inline void rsb_set_flag(struct dlm_rsb *r, enum rsb_flags flag) |
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index b56950758188..a579f30f237d 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c | |||
@@ -5393,6 +5393,13 @@ static void purge_dead_list(struct dlm_ls *ls, struct dlm_rsb *r, | |||
5393 | if ((lkb->lkb_nodeid == nodeid_gone) || | 5393 | if ((lkb->lkb_nodeid == nodeid_gone) || |
5394 | dlm_is_removed(ls, lkb->lkb_nodeid)) { | 5394 | dlm_is_removed(ls, lkb->lkb_nodeid)) { |
5395 | 5395 | ||
5396 | /* tell recover_lvb to invalidate the lvb | ||
5397 | because a node holding EX/PW failed */ | ||
5398 | if ((lkb->lkb_exflags & DLM_LKF_VALBLK) && | ||
5399 | (lkb->lkb_grmode >= DLM_LOCK_PW)) { | ||
5400 | rsb_set_flag(r, RSB_RECOVER_LVB_INVAL); | ||
5401 | } | ||
5402 | |||
5396 | del_lkb(r, lkb); | 5403 | del_lkb(r, lkb); |
5397 | 5404 | ||
5398 | /* this put should free the lkb */ | 5405 | /* this put should free the lkb */ |
@@ -6025,15 +6032,18 @@ static int orphan_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb) | |||
6025 | return error; | 6032 | return error; |
6026 | } | 6033 | } |
6027 | 6034 | ||
6028 | /* The force flag allows the unlock to go ahead even if the lkb isn't granted. | 6035 | /* The FORCEUNLOCK flag allows the unlock to go ahead even if the lkb isn't |
6029 | Regardless of what rsb queue the lock is on, it's removed and freed. */ | 6036 | granted. Regardless of what rsb queue the lock is on, it's removed and |
6037 | freed. The IVVALBLK flag causes the lvb on the resource to be invalidated | ||
6038 | if our lock is PW/EX (it's ignored if our granted mode is smaller.) */ | ||
6030 | 6039 | ||
6031 | static int unlock_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb) | 6040 | static int unlock_proc_lock(struct dlm_ls *ls, struct dlm_lkb *lkb) |
6032 | { | 6041 | { |
6033 | struct dlm_args args; | 6042 | struct dlm_args args; |
6034 | int error; | 6043 | int error; |
6035 | 6044 | ||
6036 | set_unlock_args(DLM_LKF_FORCEUNLOCK, lkb->lkb_ua, &args); | 6045 | set_unlock_args(DLM_LKF_FORCEUNLOCK | DLM_LKF_IVVALBLK, |
6046 | lkb->lkb_ua, &args); | ||
6037 | 6047 | ||
6038 | error = unlock_lock(ls, lkb, &args); | 6048 | error = unlock_lock(ls, lkb, &args); |
6039 | if (error == -DLM_EUNLOCK) | 6049 | if (error == -DLM_EUNLOCK) |
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 331ea4f94efd..dd87a31bcc21 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c | |||
@@ -1385,7 +1385,6 @@ void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc) | |||
1385 | struct connection *con; | 1385 | struct connection *con; |
1386 | struct writequeue_entry *e; | 1386 | struct writequeue_entry *e; |
1387 | int offset = 0; | 1387 | int offset = 0; |
1388 | int users = 0; | ||
1389 | 1388 | ||
1390 | con = nodeid2con(nodeid, allocation); | 1389 | con = nodeid2con(nodeid, allocation); |
1391 | if (!con) | 1390 | if (!con) |
@@ -1399,7 +1398,7 @@ void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc) | |||
1399 | } else { | 1398 | } else { |
1400 | offset = e->end; | 1399 | offset = e->end; |
1401 | e->end += len; | 1400 | e->end += len; |
1402 | users = e->users++; | 1401 | e->users++; |
1403 | } | 1402 | } |
1404 | spin_unlock(&con->writequeue_lock); | 1403 | spin_unlock(&con->writequeue_lock); |
1405 | 1404 | ||
@@ -1414,7 +1413,7 @@ void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc) | |||
1414 | spin_lock(&con->writequeue_lock); | 1413 | spin_lock(&con->writequeue_lock); |
1415 | offset = e->end; | 1414 | offset = e->end; |
1416 | e->end += len; | 1415 | e->end += len; |
1417 | users = e->users++; | 1416 | e->users++; |
1418 | list_add_tail(&e->list, &con->writequeue); | 1417 | list_add_tail(&e->list, &con->writequeue); |
1419 | spin_unlock(&con->writequeue_lock); | 1418 | spin_unlock(&con->writequeue_lock); |
1420 | goto got_one; | 1419 | goto got_one; |
diff --git a/fs/dlm/recover.c b/fs/dlm/recover.c index 4a7a76e42fc3..aedea28a86a1 100644 --- a/fs/dlm/recover.c +++ b/fs/dlm/recover.c | |||
@@ -717,8 +717,14 @@ void dlm_recovered_lock(struct dlm_rsb *r) | |||
717 | * the VALNOTVALID flag if necessary, and determining the correct lvb contents | 717 | * the VALNOTVALID flag if necessary, and determining the correct lvb contents |
718 | * based on the lvb's of the locks held on the rsb. | 718 | * based on the lvb's of the locks held on the rsb. |
719 | * | 719 | * |
720 | * RSB_VALNOTVALID is set if there are only NL/CR locks on the rsb. If it | 720 | * RSB_VALNOTVALID is set in two cases: |
721 | * was already set prior to recovery, it's not cleared, regardless of locks. | 721 | * |
722 | * 1. we are master, but not new, and we purged an EX/PW lock held by a | ||
723 | * failed node (in dlm_recover_purge which set RSB_RECOVER_LVB_INVAL) | ||
724 | * | ||
725 | * 2. we are a new master, and there are only NL/CR locks left. | ||
726 | * (We could probably improve this by only invalidating in this way when | ||
727 | * the previous master left uncleanly. VMS docs mention that.) | ||
722 | * | 728 | * |
723 | * The LVB contents are only considered for changing when this is a new master | 729 | * The LVB contents are only considered for changing when this is a new master |
724 | * of the rsb (NEW_MASTER2). Then, the rsb's lvb is taken from any lkb with | 730 | * of the rsb (NEW_MASTER2). Then, the rsb's lvb is taken from any lkb with |
@@ -734,6 +740,19 @@ static void recover_lvb(struct dlm_rsb *r) | |||
734 | int big_lock_exists = 0; | 740 | int big_lock_exists = 0; |
735 | int lvblen = r->res_ls->ls_lvblen; | 741 | int lvblen = r->res_ls->ls_lvblen; |
736 | 742 | ||
743 | if (!rsb_flag(r, RSB_NEW_MASTER2) && | ||
744 | rsb_flag(r, RSB_RECOVER_LVB_INVAL)) { | ||
745 | /* case 1 above */ | ||
746 | rsb_set_flag(r, RSB_VALNOTVALID); | ||
747 | return; | ||
748 | } | ||
749 | |||
750 | if (!rsb_flag(r, RSB_NEW_MASTER2)) | ||
751 | return; | ||
752 | |||
753 | /* we are the new master, so figure out if VALNOTVALID should | ||
754 | be set, and set the rsb lvb from the best lkb available. */ | ||
755 | |||
737 | list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) { | 756 | list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) { |
738 | if (!(lkb->lkb_exflags & DLM_LKF_VALBLK)) | 757 | if (!(lkb->lkb_exflags & DLM_LKF_VALBLK)) |
739 | continue; | 758 | continue; |
@@ -772,13 +791,10 @@ static void recover_lvb(struct dlm_rsb *r) | |||
772 | if (!lock_lvb_exists) | 791 | if (!lock_lvb_exists) |
773 | goto out; | 792 | goto out; |
774 | 793 | ||
794 | /* lvb is invalidated if only NL/CR locks remain */ | ||
775 | if (!big_lock_exists) | 795 | if (!big_lock_exists) |
776 | rsb_set_flag(r, RSB_VALNOTVALID); | 796 | rsb_set_flag(r, RSB_VALNOTVALID); |
777 | 797 | ||
778 | /* don't mess with the lvb unless we're the new master */ | ||
779 | if (!rsb_flag(r, RSB_NEW_MASTER2)) | ||
780 | goto out; | ||
781 | |||
782 | if (!r->res_lvbptr) { | 798 | if (!r->res_lvbptr) { |
783 | r->res_lvbptr = dlm_allocate_lvb(r->res_ls); | 799 | r->res_lvbptr = dlm_allocate_lvb(r->res_ls); |
784 | if (!r->res_lvbptr) | 800 | if (!r->res_lvbptr) |
@@ -852,12 +868,19 @@ void dlm_recover_rsbs(struct dlm_ls *ls) | |||
852 | if (is_master(r)) { | 868 | if (is_master(r)) { |
853 | if (rsb_flag(r, RSB_RECOVER_CONVERT)) | 869 | if (rsb_flag(r, RSB_RECOVER_CONVERT)) |
854 | recover_conversion(r); | 870 | recover_conversion(r); |
871 | |||
872 | /* recover lvb before granting locks so the updated | ||
873 | lvb/VALNOTVALID is presented in the completion */ | ||
874 | recover_lvb(r); | ||
875 | |||
855 | if (rsb_flag(r, RSB_NEW_MASTER2)) | 876 | if (rsb_flag(r, RSB_NEW_MASTER2)) |
856 | recover_grant(r); | 877 | recover_grant(r); |
857 | recover_lvb(r); | ||
858 | count++; | 878 | count++; |
879 | } else { | ||
880 | rsb_clear_flag(r, RSB_VALNOTVALID); | ||
859 | } | 881 | } |
860 | rsb_clear_flag(r, RSB_RECOVER_CONVERT); | 882 | rsb_clear_flag(r, RSB_RECOVER_CONVERT); |
883 | rsb_clear_flag(r, RSB_RECOVER_LVB_INVAL); | ||
861 | rsb_clear_flag(r, RSB_NEW_MASTER2); | 884 | rsb_clear_flag(r, RSB_NEW_MASTER2); |
862 | unlock_rsb(r); | 885 | unlock_rsb(r); |
863 | } | 886 | } |
diff --git a/fs/exec.c b/fs/exec.c --- a/fs/exec.c +++ b/fs/exec.c | |||
@@ -1349,7 +1349,7 @@ EXPORT_SYMBOL(remove_arg_zero); | |||
1349 | /* | 1349 | /* |
1350 | * cycle the list of binary formats handler, until one recognizes the image | 1350 | * cycle the list of binary formats handler, until one recognizes the image |
1351 | */ | 1351 | */ |
1352 | int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs) | 1352 | int search_binary_handler(struct linux_binprm *bprm) |
1353 | { | 1353 | { |
1354 | unsigned int depth = bprm->recursion_depth; | 1354 | unsigned int depth = bprm->recursion_depth; |
1355 | int try,retval; | 1355 | int try,retval; |
@@ -1374,13 +1374,13 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs) | |||
1374 | for (try=0; try<2; try++) { | 1374 | for (try=0; try<2; try++) { |
1375 | read_lock(&binfmt_lock); | 1375 | read_lock(&binfmt_lock); |
1376 | list_for_each_entry(fmt, &formats, lh) { | 1376 | list_for_each_entry(fmt, &formats, lh) { |
1377 | int (*fn)(struct linux_binprm *, struct pt_regs *) = fmt->load_binary; | 1377 | int (*fn)(struct linux_binprm *) = fmt->load_binary; |
1378 | if (!fn) | 1378 | if (!fn) |
1379 | continue; | 1379 | continue; |
1380 | if (!try_module_get(fmt->module)) | 1380 | if (!try_module_get(fmt->module)) |
1381 | continue; | 1381 | continue; |
1382 | read_unlock(&binfmt_lock); | 1382 | read_unlock(&binfmt_lock); |
1383 | retval = fn(bprm, regs); | 1383 | retval = fn(bprm); |
1384 | /* | 1384 | /* |
1385 | * Restore the depth counter to its starting value | 1385 | * Restore the depth counter to its starting value |
1386 | * in this call, so we don't have to rely on every | 1386 | * in this call, so we don't have to rely on every |
@@ -1439,8 +1439,7 @@ EXPORT_SYMBOL(search_binary_handler); | |||
1439 | */ | 1439 | */ |
1440 | static int do_execve_common(const char *filename, | 1440 | static int do_execve_common(const char *filename, |
1441 | struct user_arg_ptr argv, | 1441 | struct user_arg_ptr argv, |
1442 | struct user_arg_ptr envp, | 1442 | struct user_arg_ptr envp) |
1443 | struct pt_regs *regs) | ||
1444 | { | 1443 | { |
1445 | struct linux_binprm *bprm; | 1444 | struct linux_binprm *bprm; |
1446 | struct file *file; | 1445 | struct file *file; |
@@ -1524,7 +1523,7 @@ static int do_execve_common(const char *filename, | |||
1524 | if (retval < 0) | 1523 | if (retval < 0) |
1525 | goto out; | 1524 | goto out; |
1526 | 1525 | ||
1527 | retval = search_binary_handler(bprm,regs); | 1526 | retval = search_binary_handler(bprm); |
1528 | if (retval < 0) | 1527 | if (retval < 0) |
1529 | goto out; | 1528 | goto out; |
1530 | 1529 | ||
@@ -1566,19 +1565,17 @@ out_ret: | |||
1566 | 1565 | ||
1567 | int do_execve(const char *filename, | 1566 | int do_execve(const char *filename, |
1568 | const char __user *const __user *__argv, | 1567 | const char __user *const __user *__argv, |
1569 | const char __user *const __user *__envp, | 1568 | const char __user *const __user *__envp) |
1570 | struct pt_regs *regs) | ||
1571 | { | 1569 | { |
1572 | struct user_arg_ptr argv = { .ptr.native = __argv }; | 1570 | struct user_arg_ptr argv = { .ptr.native = __argv }; |
1573 | struct user_arg_ptr envp = { .ptr.native = __envp }; | 1571 | struct user_arg_ptr envp = { .ptr.native = __envp }; |
1574 | return do_execve_common(filename, argv, envp, regs); | 1572 | return do_execve_common(filename, argv, envp); |
1575 | } | 1573 | } |
1576 | 1574 | ||
1577 | #ifdef CONFIG_COMPAT | 1575 | #ifdef CONFIG_COMPAT |
1578 | int compat_do_execve(const char *filename, | 1576 | static int compat_do_execve(const char *filename, |
1579 | const compat_uptr_t __user *__argv, | 1577 | const compat_uptr_t __user *__argv, |
1580 | const compat_uptr_t __user *__envp, | 1578 | const compat_uptr_t __user *__envp) |
1581 | struct pt_regs *regs) | ||
1582 | { | 1579 | { |
1583 | struct user_arg_ptr argv = { | 1580 | struct user_arg_ptr argv = { |
1584 | .is_compat = true, | 1581 | .is_compat = true, |
@@ -1588,7 +1585,7 @@ int compat_do_execve(const char *filename, | |||
1588 | .is_compat = true, | 1585 | .is_compat = true, |
1589 | .ptr.compat = __envp, | 1586 | .ptr.compat = __envp, |
1590 | }; | 1587 | }; |
1591 | return do_execve_common(filename, argv, envp, regs); | 1588 | return do_execve_common(filename, argv, envp); |
1592 | } | 1589 | } |
1593 | #endif | 1590 | #endif |
1594 | 1591 | ||
@@ -1669,7 +1666,7 @@ SYSCALL_DEFINE3(execve, | |||
1669 | struct filename *path = getname(filename); | 1666 | struct filename *path = getname(filename); |
1670 | int error = PTR_ERR(path); | 1667 | int error = PTR_ERR(path); |
1671 | if (!IS_ERR(path)) { | 1668 | if (!IS_ERR(path)) { |
1672 | error = do_execve(path->name, argv, envp, current_pt_regs()); | 1669 | error = do_execve(path->name, argv, envp); |
1673 | putname(path); | 1670 | putname(path); |
1674 | } | 1671 | } |
1675 | return error; | 1672 | return error; |
@@ -1682,8 +1679,7 @@ asmlinkage long compat_sys_execve(const char __user * filename, | |||
1682 | struct filename *path = getname(filename); | 1679 | struct filename *path = getname(filename); |
1683 | int error = PTR_ERR(path); | 1680 | int error = PTR_ERR(path); |
1684 | if (!IS_ERR(path)) { | 1681 | if (!IS_ERR(path)) { |
1685 | error = compat_do_execve(path->name, argv, envp, | 1682 | error = compat_do_execve(path->name, argv, envp); |
1686 | current_pt_regs()); | ||
1687 | putname(path); | 1683 | putname(path); |
1688 | } | 1684 | } |
1689 | return error; | 1685 | return error; |
@@ -1696,12 +1692,9 @@ int kernel_execve(const char *filename, | |||
1696 | const char *const argv[], | 1692 | const char *const argv[], |
1697 | const char *const envp[]) | 1693 | const char *const envp[]) |
1698 | { | 1694 | { |
1699 | struct pt_regs *p = current_pt_regs(); | 1695 | int ret = do_execve(filename, |
1700 | int ret; | ||
1701 | |||
1702 | ret = do_execve(filename, | ||
1703 | (const char __user *const __user *)argv, | 1696 | (const char __user *const __user *)argv, |
1704 | (const char __user *const __user *)envp, p); | 1697 | (const char __user *const __user *)envp); |
1705 | if (ret < 0) | 1698 | if (ret < 0) |
1706 | return ret; | 1699 | return ret; |
1707 | 1700 | ||
@@ -1709,6 +1702,6 @@ int kernel_execve(const char *filename, | |||
1709 | * We were successful. We won't be returning to our caller, but | 1702 | * We were successful. We won't be returning to our caller, but |
1710 | * instead to user space by manipulating the kernel stack. | 1703 | * instead to user space by manipulating the kernel stack. |
1711 | */ | 1704 | */ |
1712 | ret_from_kernel_execve(p); | 1705 | ret_from_kernel_execve(current_pt_regs()); |
1713 | } | 1706 | } |
1714 | #endif | 1707 | #endif |
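With pt_regs threaded out of the execve path, anything that still needs the register frame derives it from the current task. A sketch of a binfmt handler after the signature change (the handler name is illustrative, not from this diff):

        static int load_foo_binary(struct linux_binprm *bprm)
        {
                struct pt_regs *regs = current_pt_regs();  /* was a parameter */

                /* ... set up the new image, then start_thread(regs, ...) ... */
                return 0;
        }

        static struct linux_binfmt foo_format = {
                .module      = THIS_MODULE,
                .load_binary = load_foo_binary,  /* matches the new prototype */
        };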
diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c index 7320a66e958f..22548f56197b 100644 --- a/fs/ext3/balloc.c +++ b/fs/ext3/balloc.c | |||
@@ -2101,8 +2101,9 @@ int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range) | |||
2101 | end = start + (range->len >> sb->s_blocksize_bits) - 1; | 2101 | end = start + (range->len >> sb->s_blocksize_bits) - 1; |
2102 | minlen = range->minlen >> sb->s_blocksize_bits; | 2102 | minlen = range->minlen >> sb->s_blocksize_bits; |
2103 | 2103 | ||
2104 | if (unlikely(minlen > EXT3_BLOCKS_PER_GROUP(sb)) || | 2104 | if (minlen > EXT3_BLOCKS_PER_GROUP(sb) || |
2105 | unlikely(start >= max_blks)) | 2105 | start >= max_blks || |
2106 | range->len < sb->s_blocksize) | ||
2106 | return -EINVAL; | 2107 | return -EINVAL; |
2107 | if (end >= max_blks) | 2108 | if (end >= max_blks) |
2108 | end = max_blks - 1; | 2109 | end = max_blks - 1; |
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 3c20de1d59d0..df163da388c9 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h | |||
@@ -2455,7 +2455,7 @@ TAS_BUFFER_FNS(Uninit, uninit) | |||
2455 | BUFFER_FNS(Da_Mapped, da_mapped) | 2455 | BUFFER_FNS(Da_Mapped, da_mapped) |
2456 | 2456 | ||
2457 | /* | 2457 | /* |
2458 | * Add new method to test wether block and inode bitmaps are properly | 2458 | * Add new method to test whether block and inode bitmaps are properly |
2459 | * initialized. With uninit_bg reading the block from disk is not enough | 2459 | * initialized. With uninit_bg reading the block from disk is not enough |
2460 | * to mark the bitmap uptodate. We need to also zero-out the bitmap | 2460 | * to mark the bitmap uptodate. We need to also zero-out the bitmap |
2461 | */ | 2461 | */ |
diff --git a/fs/fhandle.c b/fs/fhandle.c index f775bfdd6e4a..cccdc874bb55 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c | |||
@@ -22,7 +22,7 @@ static long do_sys_name_to_handle(struct path *path, | |||
22 | struct file_handle *handle = NULL; | 22 | struct file_handle *handle = NULL; |
23 | 23 | ||
24 | /* | 24 | /* |
25 | * We need t make sure wether the file system | 25 | * We need to make sure whether the file system |
26 | * support decoding of the file handle | 26 | * support decoding of the file handle |
27 | */ | 27 | */ |
28 | if (!path->dentry->d_sb->s_export_op || | 28 | if (!path->dentry->d_sb->s_export_op || |
@@ -40,7 +40,7 @@ static long do_sys_name_to_handle(struct path *path, | |||
40 | if (!handle) | 40 | if (!handle) |
41 | return -ENOMEM; | 41 | return -ENOMEM; |
42 | 42 | ||
43 | /* convert handle size to multiple of sizeof(u32) */ | 43 | /* convert handle size to multiple of sizeof(u32) */ |
44 | handle_dwords = f_handle.handle_bytes >> 2; | 44 | handle_dwords = f_handle.handle_bytes >> 2; |
45 | 45 | ||
46 | /* we ask for a non connected handle */ | 46 | /* we ask for a non connected handle */ |
diff --git a/fs/file.c b/fs/file.c --- a/fs/file.c +++ b/fs/file.c | |||
@@ -519,12 +519,6 @@ struct files_struct init_files = { | |||
519 | .file_lock = __SPIN_LOCK_UNLOCKED(init_task.file_lock), | 519 | .file_lock = __SPIN_LOCK_UNLOCKED(init_task.file_lock), |
520 | }; | 520 | }; |
521 | 521 | ||
522 | void daemonize_descriptors(void) | ||
523 | { | ||
524 | atomic_inc(&init_files.count); | ||
525 | reset_files_struct(&init_files); | ||
526 | } | ||
527 | |||
528 | /* | 522 | /* |
529 | * allocate a file descriptor, mark it busy. | 523 | * allocate a file descriptor, mark it busy. |
530 | */ | 524 | */ |
@@ -685,7 +679,6 @@ void do_close_on_exec(struct files_struct *files) | |||
685 | struct fdtable *fdt; | 679 | struct fdtable *fdt; |
686 | 680 | ||
687 | /* exec unshares first */ | 681 | /* exec unshares first */ |
688 | BUG_ON(atomic_read(&files->count) != 1); | ||
689 | spin_lock(&files->file_lock); | 682 | spin_lock(&files->file_lock); |
690 | for (i = 0; ; i++) { | 683 | for (i = 0; ; i++) { |
691 | unsigned long set; | 684 | unsigned long set; |
@@ -995,16 +988,18 @@ int iterate_fd(struct files_struct *files, unsigned n, | |||
995 | const void *p) | 988 | const void *p) |
996 | { | 989 | { |
997 | struct fdtable *fdt; | 990 | struct fdtable *fdt; |
998 | struct file *file; | ||
999 | int res = 0; | 991 | int res = 0; |
1000 | if (!files) | 992 | if (!files) |
1001 | return 0; | 993 | return 0; |
1002 | spin_lock(&files->file_lock); | 994 | spin_lock(&files->file_lock); |
1003 | fdt = files_fdtable(files); | 995 | for (fdt = files_fdtable(files); n < fdt->max_fds; n++) { |
1004 | while (!res && n < fdt->max_fds) { | 996 | struct file *file; |
1005 | file = rcu_dereference_check_fdtable(files, fdt->fd[n++]); | 997 | file = rcu_dereference_check_fdtable(files, fdt->fd[n]); |
1006 | if (file) | 998 | if (!file) |
1007 | res = f(p, file, n); | 999 | continue; |
1000 | res = f(p, file, n); | ||
1001 | if (res) | ||
1002 | break; | ||
1008 | } | 1003 | } |
1009 | spin_unlock(&files->file_lock); | 1004 | spin_unlock(&files->file_lock); |
1010 | return res; | 1005 | return res; |
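Besides stopping cleanly on the first nonzero return, the rewritten loop hands the callback the descriptor it is actually looking at (the old code passed f() the post-increment index). A hedged usage sketch; the callback and its caller are illustrative, not from this diff:

        /* nonzero return stops the walk and becomes iterate_fd()'s result */
        static int match_file(const void *p, struct file *file, unsigned fd)
        {
                return file == p ? (int)fd + 1 : 0;  /* +1 so fd 0 is visible */
        }

        /* yields fd + 1, or 0 when filp is not in the table */
        n = iterate_fd(files, 0, match_file, filp);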
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 51ea267d444c..310972b72a66 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c | |||
@@ -228,6 +228,8 @@ static void requeue_io(struct inode *inode, struct bdi_writeback *wb) | |||
228 | static void inode_sync_complete(struct inode *inode) | 228 | static void inode_sync_complete(struct inode *inode) |
229 | { | 229 | { |
230 | inode->i_state &= ~I_SYNC; | 230 | inode->i_state &= ~I_SYNC; |
231 | /* If inode is clean and unused, put it into LRU now... */ | ||
232 | inode_add_lru(inode); | ||
231 | /* Waiters must see I_SYNC cleared before being woken up */ | 233 | /* Waiters must see I_SYNC cleared before being woken up */ |
232 | smp_mb(); | 234 | smp_mb(); |
233 | wake_up_bit(&inode->i_state, __I_SYNC); | 235 | wake_up_bit(&inode->i_state, __I_SYNC); |
@@ -1032,7 +1034,7 @@ int bdi_writeback_thread(void *data) | |||
1032 | while (!kthread_freezable_should_stop(NULL)) { | 1034 | while (!kthread_freezable_should_stop(NULL)) { |
1033 | /* | 1035 | /* |
1034 | * Remove own delayed wake-up timer, since we are already awake | 1036 | * Remove own delayed wake-up timer, since we are already awake |
1035 | * and we'll take care of the preriodic write-back. | 1037 | * and we'll take care of the periodic write-back. |
1036 | */ | 1038 | */ |
1037 | del_timer(&wb->wakeup_timer); | 1039 | del_timer(&wb->wakeup_timer); |
1038 | 1040 | ||
diff --git a/fs/fs_struct.c b/fs/fs_struct.c index 5df4775fea03..fe6ca583bbc0 100644 --- a/fs/fs_struct.c +++ b/fs/fs_struct.c | |||
@@ -164,27 +164,3 @@ struct fs_struct init_fs = { | |||
164 | .seq = SEQCNT_ZERO, | 164 | .seq = SEQCNT_ZERO, |
165 | .umask = 0022, | 165 | .umask = 0022, |
166 | }; | 166 | }; |
167 | |||
168 | void daemonize_fs_struct(void) | ||
169 | { | ||
170 | struct fs_struct *fs = current->fs; | ||
171 | |||
172 | if (fs) { | ||
173 | int kill; | ||
174 | |||
175 | task_lock(current); | ||
176 | |||
177 | spin_lock(&init_fs.lock); | ||
178 | init_fs.users++; | ||
179 | spin_unlock(&init_fs.lock); | ||
180 | |||
181 | spin_lock(&fs->lock); | ||
182 | current->fs = &init_fs; | ||
183 | kill = !--fs->users; | ||
184 | spin_unlock(&fs->lock); | ||
185 | |||
186 | task_unlock(current); | ||
187 | if (kill) | ||
188 | free_fs_struct(fs); | ||
189 | } | ||
190 | } | ||
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 01c4975da4bc..30de4f2a2ea9 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c | |||
@@ -643,7 +643,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping, | |||
643 | goto out_unlock; | 643 | goto out_unlock; |
644 | 644 | ||
645 | requested = data_blocks + ind_blocks; | 645 | requested = data_blocks + ind_blocks; |
646 | error = gfs2_inplace_reserve(ip, requested); | 646 | error = gfs2_inplace_reserve(ip, requested, 0); |
647 | if (error) | 647 | if (error) |
648 | goto out_qunlock; | 648 | goto out_qunlock; |
649 | } | 649 | } |
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 1fd3ae237bdd..a68e91bcef3d 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c | |||
@@ -991,6 +991,41 @@ unlock: | |||
991 | return err; | 991 | return err; |
992 | } | 992 | } |
993 | 993 | ||
994 | /** | ||
995 | * gfs2_journaled_truncate - Wrapper for truncate_pagecache for jdata files | ||
996 | * @inode: The inode being truncated | ||
997 | * @oldsize: The original (larger) size | ||
998 | * @newsize: The new smaller size | ||
999 | * | ||
1000 | * With jdata files, we have to journal a revoke for each block which is | ||
1001 | * truncated. As a result, we need to split this into separate transactions | ||
1002 | * if the number of pages being truncated gets too large. | ||
1003 | */ | ||
1004 | |||
1005 | #define GFS2_JTRUNC_REVOKES 8192 | ||
1006 | |||
1007 | static int gfs2_journaled_truncate(struct inode *inode, u64 oldsize, u64 newsize) | ||
1008 | { | ||
1009 | struct gfs2_sbd *sdp = GFS2_SB(inode); | ||
1010 | u64 max_chunk = GFS2_JTRUNC_REVOKES * sdp->sd_vfs->s_blocksize; | ||
1011 | u64 chunk; | ||
1012 | int error; | ||
1013 | |||
1014 | while (oldsize != newsize) { | ||
1015 | chunk = oldsize - newsize; | ||
1016 | if (chunk > max_chunk) | ||
1017 | chunk = max_chunk; | ||
1018 | truncate_pagecache(inode, oldsize, oldsize - chunk); | ||
1019 | oldsize -= chunk; | ||
1020 | gfs2_trans_end(sdp); | ||
1021 | error = gfs2_trans_begin(sdp, RES_DINODE, GFS2_JTRUNC_REVOKES); | ||
1022 | if (error) | ||
1023 | return error; | ||
1024 | } | ||
1025 | |||
1026 | return 0; | ||
1027 | } | ||
1028 | |||
994 | static int trunc_start(struct inode *inode, u64 oldsize, u64 newsize) | 1029 | static int trunc_start(struct inode *inode, u64 oldsize, u64 newsize) |
995 | { | 1030 | { |
996 | struct gfs2_inode *ip = GFS2_I(inode); | 1031 | struct gfs2_inode *ip = GFS2_I(inode); |
@@ -1000,8 +1035,10 @@ static int trunc_start(struct inode *inode, u64 oldsize, u64 newsize) | |||
1000 | int journaled = gfs2_is_jdata(ip); | 1035 | int journaled = gfs2_is_jdata(ip); |
1001 | int error; | 1036 | int error; |
1002 | 1037 | ||
1003 | error = gfs2_trans_begin(sdp, | 1038 | if (journaled) |
1004 | RES_DINODE + (journaled ? RES_JDATA : 0), 0); | 1039 | error = gfs2_trans_begin(sdp, RES_DINODE + RES_JDATA, GFS2_JTRUNC_REVOKES); |
1040 | else | ||
1041 | error = gfs2_trans_begin(sdp, RES_DINODE, 0); | ||
1005 | if (error) | 1042 | if (error) |
1006 | return error; | 1043 | return error; |
1007 | 1044 | ||
@@ -1026,7 +1063,16 @@ static int trunc_start(struct inode *inode, u64 oldsize, u64 newsize) | |||
1026 | ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; | 1063 | ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; |
1027 | gfs2_dinode_out(ip, dibh->b_data); | 1064 | gfs2_dinode_out(ip, dibh->b_data); |
1028 | 1065 | ||
1029 | truncate_pagecache(inode, oldsize, newsize); | 1066 | if (journaled) |
1067 | error = gfs2_journaled_truncate(inode, oldsize, newsize); | ||
1068 | else | ||
1069 | truncate_pagecache(inode, oldsize, newsize); | ||
1070 | |||
1071 | if (error) { | ||
1072 | brelse(dibh); | ||
1073 | return error; | ||
1074 | } | ||
1075 | |||
1030 | out_brelse: | 1076 | out_brelse: |
1031 | brelse(dibh); | 1077 | brelse(dibh); |
1032 | out: | 1078 | out: |
@@ -1178,7 +1224,7 @@ static int do_grow(struct inode *inode, u64 size) | |||
1178 | if (error) | 1224 | if (error) |
1179 | return error; | 1225 | return error; |
1180 | 1226 | ||
1181 | error = gfs2_inplace_reserve(ip, 1); | 1227 | error = gfs2_inplace_reserve(ip, 1, 0); |
1182 | if (error) | 1228 | if (error) |
1183 | goto do_grow_qunlock; | 1229 | goto do_grow_qunlock; |
1184 | unstuff = 1; | 1230 | unstuff = 1; |
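To put the truncation chunking above in concrete terms (the 4 KiB block size is an assumed example, not taken from the patch):

	/* Illustrative arithmetic only -- 4096 is an assumed block size. */
	u64 max_chunk = 8192 /* GFS2_JTRUNC_REVOKES */ * 4096;  /* 32 MiB */
	/* A 1 GiB jdata truncate would thus span 32 transactions: each
	 * loop iteration calls gfs2_trans_end() and then re-opens with
	 * gfs2_trans_begin(sdp, RES_DINODE, GFS2_JTRUNC_REVOKES), so no
	 * single transaction journals more than 8192 revokes. */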
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 259b088cfc4c..9a35670fdc38 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c | |||
@@ -1676,16 +1676,11 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name, | |||
1676 | be16_add_cpu(&leaf->lf_entries, 1); | 1676 | be16_add_cpu(&leaf->lf_entries, 1); |
1677 | } | 1677 | } |
1678 | brelse(bh); | 1678 | brelse(bh); |
1679 | error = gfs2_meta_inode_buffer(ip, &bh); | ||
1680 | if (error) | ||
1681 | break; | ||
1682 | gfs2_trans_add_bh(ip->i_gl, bh, 1); | ||
1683 | ip->i_entries++; | 1679 | ip->i_entries++; |
1684 | ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; | 1680 | ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; |
1685 | if (S_ISDIR(nip->i_inode.i_mode)) | 1681 | if (S_ISDIR(nip->i_inode.i_mode)) |
1686 | inc_nlink(&ip->i_inode); | 1682 | inc_nlink(&ip->i_inode); |
1687 | gfs2_dinode_out(ip, bh->b_data); | 1683 | mark_inode_dirty(inode); |
1688 | brelse(bh); | ||
1689 | error = 0; | 1684 | error = 0; |
1690 | break; | 1685 | break; |
1691 | } | 1686 | } |
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index e056b4ce4877..dfe2d8cb9b2c 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c | |||
@@ -432,7 +432,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
432 | if (ret) | 432 | if (ret) |
433 | goto out_unlock; | 433 | goto out_unlock; |
434 | gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks); | 434 | gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks); |
435 | ret = gfs2_inplace_reserve(ip, data_blocks + ind_blocks); | 435 | ret = gfs2_inplace_reserve(ip, data_blocks + ind_blocks, 0); |
436 | if (ret) | 436 | if (ret) |
437 | goto out_quota_unlock; | 437 | goto out_quota_unlock; |
438 | 438 | ||
@@ -825,7 +825,7 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset, | |||
825 | retry: | 825 | retry: |
826 | gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks); | 826 | gfs2_write_calc_reserv(ip, bytes, &data_blocks, &ind_blocks); |
827 | 827 | ||
828 | error = gfs2_inplace_reserve(ip, data_blocks + ind_blocks); | 828 | error = gfs2_inplace_reserve(ip, data_blocks + ind_blocks, 0); |
829 | if (error) { | 829 | if (error) { |
830 | if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) { | 830 | if (error == -ENOSPC && bytes > sdp->sd_sb.sb_bsize) { |
831 | bytes >>= 1; | 831 | bytes >>= 1; |
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index e6c2fd53cab2..992c5c0cb504 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c | |||
@@ -55,8 +55,6 @@ struct gfs2_glock_iter { | |||
55 | 55 | ||
56 | typedef void (*glock_examiner) (struct gfs2_glock * gl); | 56 | typedef void (*glock_examiner) (struct gfs2_glock * gl); |
57 | 57 | ||
58 | static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl); | ||
59 | #define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { __dump_glock(NULL, gl); BUG(); } } while(0) | ||
60 | static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target); | 58 | static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target); |
61 | 59 | ||
62 | static struct dentry *gfs2_root; | 60 | static struct dentry *gfs2_root; |
@@ -107,10 +105,12 @@ static void gfs2_glock_dealloc(struct rcu_head *rcu) | |||
107 | { | 105 | { |
108 | struct gfs2_glock *gl = container_of(rcu, struct gfs2_glock, gl_rcu); | 106 | struct gfs2_glock *gl = container_of(rcu, struct gfs2_glock, gl_rcu); |
109 | 107 | ||
110 | if (gl->gl_ops->go_flags & GLOF_ASPACE) | 108 | if (gl->gl_ops->go_flags & GLOF_ASPACE) { |
111 | kmem_cache_free(gfs2_glock_aspace_cachep, gl); | 109 | kmem_cache_free(gfs2_glock_aspace_cachep, gl); |
112 | else | 110 | } else { |
111 | kfree(gl->gl_lksb.sb_lvbptr); | ||
113 | kmem_cache_free(gfs2_glock_cachep, gl); | 112 | kmem_cache_free(gfs2_glock_cachep, gl); |
113 | } | ||
114 | } | 114 | } |
115 | 115 | ||
116 | void gfs2_glock_free(struct gfs2_glock *gl) | 116 | void gfs2_glock_free(struct gfs2_glock *gl) |
@@ -537,8 +537,8 @@ __acquires(&gl->gl_spin) | |||
537 | (lck_flags & (LM_FLAG_TRY|LM_FLAG_TRY_1CB))) | 537 | (lck_flags & (LM_FLAG_TRY|LM_FLAG_TRY_1CB))) |
538 | clear_bit(GLF_BLOCKING, &gl->gl_flags); | 538 | clear_bit(GLF_BLOCKING, &gl->gl_flags); |
539 | spin_unlock(&gl->gl_spin); | 539 | spin_unlock(&gl->gl_spin); |
540 | if (glops->go_xmote_th) | 540 | if (glops->go_sync) |
541 | glops->go_xmote_th(gl); | 541 | glops->go_sync(gl); |
542 | if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags)) | 542 | if (test_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags)) |
543 | glops->go_inval(gl, target == LM_ST_DEFERRED ? 0 : DIO_METADATA); | 543 | glops->go_inval(gl, target == LM_ST_DEFERRED ? 0 : DIO_METADATA); |
544 | clear_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags); | 544 | clear_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags); |
@@ -547,7 +547,10 @@ __acquires(&gl->gl_spin) | |||
547 | if (sdp->sd_lockstruct.ls_ops->lm_lock) { | 547 | if (sdp->sd_lockstruct.ls_ops->lm_lock) { |
548 | /* lock_dlm */ | 548 | /* lock_dlm */ |
549 | ret = sdp->sd_lockstruct.ls_ops->lm_lock(gl, target, lck_flags); | 549 | ret = sdp->sd_lockstruct.ls_ops->lm_lock(gl, target, lck_flags); |
550 | GLOCK_BUG_ON(gl, ret); | 550 | if (ret) { |
551 | printk(KERN_ERR "GFS2: lm_lock ret %d\n", ret); | ||
552 | GLOCK_BUG_ON(gl, 1); | ||
553 | } | ||
551 | } else { /* lock_nolock */ | 554 | } else { /* lock_nolock */ |
552 | finish_xmote(gl, target); | 555 | finish_xmote(gl, target); |
553 | if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) | 556 | if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) |
@@ -736,6 +739,16 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, | |||
736 | if (!gl) | 739 | if (!gl) |
737 | return -ENOMEM; | 740 | return -ENOMEM; |
738 | 741 | ||
742 | memset(&gl->gl_lksb, 0, sizeof(struct dlm_lksb)); | ||
743 | |||
744 | if (glops->go_flags & GLOF_LVB) { | ||
745 | gl->gl_lksb.sb_lvbptr = kzalloc(GFS2_MIN_LVB_SIZE, GFP_KERNEL); | ||
746 | if (!gl->gl_lksb.sb_lvbptr) { | ||
747 | kmem_cache_free(cachep, gl); | ||
748 | return -ENOMEM; | ||
749 | } | ||
750 | } | ||
751 | |||
739 | atomic_inc(&sdp->sd_glock_disposal); | 752 | atomic_inc(&sdp->sd_glock_disposal); |
740 | gl->gl_sbd = sdp; | 753 | gl->gl_sbd = sdp; |
741 | gl->gl_flags = 0; | 754 | gl->gl_flags = 0; |
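The hunks in this file move the lock value block from a fixed 32-byte array inside the glock to an on-demand allocation. A condensed sketch of the resulting lifetime (error handling elided; this restates the diff rather than adding behaviour):

	/* gfs2_glock_get(): allocate only for GLOF_LVB glock types */
	if (glops->go_flags & GLOF_LVB) {
		gl->gl_lksb.sb_lvbptr = kzalloc(GFS2_MIN_LVB_SIZE, GFP_KERNEL);
		if (!gl->gl_lksb.sb_lvbptr)
			return -ENOMEM;        /* after freeing gl itself */
	}

	/* Teardown: both the lost-insertion-race path below and
	 * gfs2_glock_dealloc() now kfree(gl->gl_lksb.sb_lvbptr);
	 * kfree(NULL) is a no-op, so non-LVB glocks need no check. */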
@@ -753,9 +766,6 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, | |||
753 | preempt_enable(); | 766 | preempt_enable(); |
754 | gl->gl_stats.stats[GFS2_LKS_DCOUNT] = 0; | 767 | gl->gl_stats.stats[GFS2_LKS_DCOUNT] = 0; |
755 | gl->gl_stats.stats[GFS2_LKS_QCOUNT] = 0; | 768 | gl->gl_stats.stats[GFS2_LKS_QCOUNT] = 0; |
756 | memset(&gl->gl_lksb, 0, sizeof(struct dlm_lksb)); | ||
757 | memset(gl->gl_lvb, 0, 32 * sizeof(char)); | ||
758 | gl->gl_lksb.sb_lvbptr = gl->gl_lvb; | ||
759 | gl->gl_tchange = jiffies; | 769 | gl->gl_tchange = jiffies; |
760 | gl->gl_object = NULL; | 770 | gl->gl_object = NULL; |
761 | gl->gl_hold_time = GL_GLOCK_DFT_HOLD; | 771 | gl->gl_hold_time = GL_GLOCK_DFT_HOLD; |
@@ -768,7 +778,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, | |||
768 | mapping->host = s->s_bdev->bd_inode; | 778 | mapping->host = s->s_bdev->bd_inode; |
769 | mapping->flags = 0; | 779 | mapping->flags = 0; |
770 | mapping_set_gfp_mask(mapping, GFP_NOFS); | 780 | mapping_set_gfp_mask(mapping, GFP_NOFS); |
771 | mapping->assoc_mapping = NULL; | 781 | mapping->private_data = NULL; |
772 | mapping->backing_dev_info = s->s_bdi; | 782 | mapping->backing_dev_info = s->s_bdi; |
773 | mapping->writeback_index = 0; | 783 | mapping->writeback_index = 0; |
774 | } | 784 | } |
@@ -777,6 +787,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, | |||
777 | tmp = search_bucket(hash, sdp, &name); | 787 | tmp = search_bucket(hash, sdp, &name); |
778 | if (tmp) { | 788 | if (tmp) { |
779 | spin_unlock_bucket(hash); | 789 | spin_unlock_bucket(hash); |
790 | kfree(gl->gl_lksb.sb_lvbptr); | ||
780 | kmem_cache_free(cachep, gl); | 791 | kmem_cache_free(cachep, gl); |
781 | atomic_dec(&sdp->sd_glock_disposal); | 792 | atomic_dec(&sdp->sd_glock_disposal); |
782 | gl = tmp; | 793 | gl = tmp; |
@@ -1013,7 +1024,7 @@ trap_recursive: | |||
1013 | printk(KERN_ERR "pid: %d\n", pid_nr(gh->gh_owner_pid)); | 1024 | printk(KERN_ERR "pid: %d\n", pid_nr(gh->gh_owner_pid)); |
1014 | printk(KERN_ERR "lock type: %d req lock state : %d\n", | 1025 | printk(KERN_ERR "lock type: %d req lock state : %d\n", |
1015 | gh->gh_gl->gl_name.ln_type, gh->gh_state); | 1026 | gh->gh_gl->gl_name.ln_type, gh->gh_state); |
1016 | __dump_glock(NULL, gl); | 1027 | gfs2_dump_glock(NULL, gl); |
1017 | BUG(); | 1028 | BUG(); |
1018 | } | 1029 | } |
1019 | 1030 | ||
@@ -1508,7 +1519,7 @@ static int dump_glock(struct seq_file *seq, struct gfs2_glock *gl) | |||
1508 | { | 1519 | { |
1509 | int ret; | 1520 | int ret; |
1510 | spin_lock(&gl->gl_spin); | 1521 | spin_lock(&gl->gl_spin); |
1511 | ret = __dump_glock(seq, gl); | 1522 | ret = gfs2_dump_glock(seq, gl); |
1512 | spin_unlock(&gl->gl_spin); | 1523 | spin_unlock(&gl->gl_spin); |
1513 | return ret; | 1524 | return ret; |
1514 | } | 1525 | } |
@@ -1528,6 +1539,7 @@ static void dump_glock_func(struct gfs2_glock *gl) | |||
1528 | 1539 | ||
1529 | void gfs2_gl_hash_clear(struct gfs2_sbd *sdp) | 1540 | void gfs2_gl_hash_clear(struct gfs2_sbd *sdp) |
1530 | { | 1541 | { |
1542 | set_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags); | ||
1531 | glock_hash_walk(clear_glock, sdp); | 1543 | glock_hash_walk(clear_glock, sdp); |
1532 | flush_workqueue(glock_workqueue); | 1544 | flush_workqueue(glock_workqueue); |
1533 | wait_event(sdp->sd_glock_wait, atomic_read(&sdp->sd_glock_disposal) == 0); | 1545 | wait_event(sdp->sd_glock_wait, atomic_read(&sdp->sd_glock_disposal) == 0); |
@@ -1655,7 +1667,7 @@ static const char *gflags2str(char *buf, const struct gfs2_glock *gl) | |||
1655 | } | 1667 | } |
1656 | 1668 | ||
1657 | /** | 1669 | /** |
1658 | * __dump_glock - print information about a glock | 1670 | * gfs2_dump_glock - print information about a glock |
1659 | * @seq: The seq_file struct | 1671 | * @seq: The seq_file struct |
1660 | * @gl: the glock | 1672 | * @gl: the glock |
1661 | * | 1673 | * |
@@ -1672,7 +1684,7 @@ static const char *gflags2str(char *buf, const struct gfs2_glock *gl) | |||
1672 | * Returns: 0 on success, -ENOBUFS when we run out of space | 1684 | * Returns: 0 on success, -ENOBUFS when we run out of space |
1673 | */ | 1685 | */ |
1674 | 1686 | ||
1675 | static int __dump_glock(struct seq_file *seq, const struct gfs2_glock *gl) | 1687 | int gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl) |
1676 | { | 1688 | { |
1677 | const struct gfs2_glock_operations *glops = gl->gl_ops; | 1689 | const struct gfs2_glock_operations *glops = gl->gl_ops; |
1678 | unsigned long long dtime; | 1690 | unsigned long long dtime; |
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index 307ac31df781..fd580b7861d5 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h | |||
@@ -178,33 +178,33 @@ static inline struct address_space *gfs2_glock2aspace(struct gfs2_glock *gl) | |||
178 | return NULL; | 178 | return NULL; |
179 | } | 179 | } |
180 | 180 | ||
181 | int gfs2_glock_get(struct gfs2_sbd *sdp, | 181 | extern int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, |
182 | u64 number, const struct gfs2_glock_operations *glops, | 182 | const struct gfs2_glock_operations *glops, |
183 | int create, struct gfs2_glock **glp); | 183 | int create, struct gfs2_glock **glp); |
184 | void gfs2_glock_hold(struct gfs2_glock *gl); | 184 | extern void gfs2_glock_hold(struct gfs2_glock *gl); |
185 | void gfs2_glock_put_nolock(struct gfs2_glock *gl); | 185 | extern void gfs2_glock_put_nolock(struct gfs2_glock *gl); |
186 | void gfs2_glock_put(struct gfs2_glock *gl); | 186 | extern void gfs2_glock_put(struct gfs2_glock *gl); |
187 | void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags, | 187 | extern void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, |
188 | struct gfs2_holder *gh); | 188 | unsigned flags, struct gfs2_holder *gh); |
189 | void gfs2_holder_reinit(unsigned int state, unsigned flags, | 189 | extern void gfs2_holder_reinit(unsigned int state, unsigned flags, |
190 | struct gfs2_holder *gh); | 190 | struct gfs2_holder *gh); |
191 | void gfs2_holder_uninit(struct gfs2_holder *gh); | 191 | extern void gfs2_holder_uninit(struct gfs2_holder *gh); |
192 | int gfs2_glock_nq(struct gfs2_holder *gh); | 192 | extern int gfs2_glock_nq(struct gfs2_holder *gh); |
193 | int gfs2_glock_poll(struct gfs2_holder *gh); | 193 | extern int gfs2_glock_poll(struct gfs2_holder *gh); |
194 | int gfs2_glock_wait(struct gfs2_holder *gh); | 194 | extern int gfs2_glock_wait(struct gfs2_holder *gh); |
195 | void gfs2_glock_dq(struct gfs2_holder *gh); | 195 | extern void gfs2_glock_dq(struct gfs2_holder *gh); |
196 | void gfs2_glock_dq_wait(struct gfs2_holder *gh); | 196 | extern void gfs2_glock_dq_wait(struct gfs2_holder *gh); |
197 | 197 | extern void gfs2_glock_dq_uninit(struct gfs2_holder *gh); | |
198 | void gfs2_glock_dq_uninit(struct gfs2_holder *gh); | 198 | extern int gfs2_glock_nq_num(struct gfs2_sbd *sdp, u64 number, |
199 | int gfs2_glock_nq_num(struct gfs2_sbd *sdp, | 199 | const struct gfs2_glock_operations *glops, |
200 | u64 number, const struct gfs2_glock_operations *glops, | 200 | unsigned int state, int flags, |
201 | unsigned int state, int flags, struct gfs2_holder *gh); | 201 | struct gfs2_holder *gh); |
202 | 202 | extern int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs); | |
203 | int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs); | 203 | extern void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs); |
204 | void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs); | 204 | extern void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs); |
205 | void gfs2_glock_dq_uninit_m(unsigned int num_gh, struct gfs2_holder *ghs); | 205 | extern int gfs2_dump_glock(struct seq_file *seq, const struct gfs2_glock *gl); |
206 | 206 | #define GLOCK_BUG_ON(gl,x) do { if (unlikely(x)) { gfs2_dump_glock(NULL, gl); BUG(); } } while(0) | |
207 | __printf(2, 3) | 207 | extern __printf(2, 3) |
208 | void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...); | 208 | void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...); |
209 | 209 | ||
210 | /** | 210 | /** |
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 32cc4fde975c..78d4184ffc7d 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c | |||
@@ -74,7 +74,7 @@ static void __gfs2_ail_flush(struct gfs2_glock *gl, bool fsync) | |||
74 | 74 | ||
75 | gfs2_trans_add_revoke(sdp, bd); | 75 | gfs2_trans_add_revoke(sdp, bd); |
76 | } | 76 | } |
77 | BUG_ON(!fsync && atomic_read(&gl->gl_ail_count)); | 77 | GLOCK_BUG_ON(gl, !fsync && atomic_read(&gl->gl_ail_count)); |
78 | spin_unlock(&sdp->sd_ail_lock); | 78 | spin_unlock(&sdp->sd_ail_lock); |
79 | gfs2_log_unlock(sdp); | 79 | gfs2_log_unlock(sdp); |
80 | } | 80 | } |
@@ -96,7 +96,7 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl) | |||
96 | tr.tr_ip = (unsigned long)__builtin_return_address(0); | 96 | tr.tr_ip = (unsigned long)__builtin_return_address(0); |
97 | sb_start_intwrite(sdp->sd_vfs); | 97 | sb_start_intwrite(sdp->sd_vfs); |
98 | gfs2_log_reserve(sdp, tr.tr_reserved); | 98 | gfs2_log_reserve(sdp, tr.tr_reserved); |
99 | BUG_ON(current->journal_info); | 99 | WARN_ON_ONCE(current->journal_info); |
100 | current->journal_info = &tr; | 100 | current->journal_info = &tr; |
101 | 101 | ||
102 | __gfs2_ail_flush(gl, 0); | 102 | __gfs2_ail_flush(gl, 0); |
@@ -139,7 +139,7 @@ static void rgrp_go_sync(struct gfs2_glock *gl) | |||
139 | 139 | ||
140 | if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) | 140 | if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) |
141 | return; | 141 | return; |
142 | BUG_ON(gl->gl_state != LM_ST_EXCLUSIVE); | 142 | GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_EXCLUSIVE); |
143 | 143 | ||
144 | gfs2_log_flush(gl->gl_sbd, gl); | 144 | gfs2_log_flush(gl->gl_sbd, gl); |
145 | filemap_fdatawrite(metamapping); | 145 | filemap_fdatawrite(metamapping); |
@@ -168,7 +168,7 @@ static void rgrp_go_inval(struct gfs2_glock *gl, int flags) | |||
168 | { | 168 | { |
169 | struct address_space *mapping = gfs2_glock2aspace(gl); | 169 | struct address_space *mapping = gfs2_glock2aspace(gl); |
170 | 170 | ||
171 | BUG_ON(!(flags & DIO_METADATA)); | 171 | WARN_ON_ONCE(!(flags & DIO_METADATA)); |
172 | gfs2_assert_withdraw(gl->gl_sbd, !atomic_read(&gl->gl_ail_count)); | 172 | gfs2_assert_withdraw(gl->gl_sbd, !atomic_read(&gl->gl_ail_count)); |
173 | truncate_inode_pages(mapping, 0); | 173 | truncate_inode_pages(mapping, 0); |
174 | 174 | ||
@@ -197,7 +197,7 @@ static void inode_go_sync(struct gfs2_glock *gl) | |||
197 | if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) | 197 | if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) |
198 | return; | 198 | return; |
199 | 199 | ||
200 | BUG_ON(gl->gl_state != LM_ST_EXCLUSIVE); | 200 | GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_EXCLUSIVE); |
201 | 201 | ||
202 | gfs2_log_flush(gl->gl_sbd, gl); | 202 | gfs2_log_flush(gl->gl_sbd, gl); |
203 | filemap_fdatawrite(metamapping); | 203 | filemap_fdatawrite(metamapping); |
@@ -536,7 +536,7 @@ const struct gfs2_glock_operations gfs2_meta_glops = { | |||
536 | }; | 536 | }; |
537 | 537 | ||
538 | const struct gfs2_glock_operations gfs2_inode_glops = { | 538 | const struct gfs2_glock_operations gfs2_inode_glops = { |
539 | .go_xmote_th = inode_go_sync, | 539 | .go_sync = inode_go_sync, |
540 | .go_inval = inode_go_inval, | 540 | .go_inval = inode_go_inval, |
541 | .go_demote_ok = inode_go_demote_ok, | 541 | .go_demote_ok = inode_go_demote_ok, |
542 | .go_lock = inode_go_lock, | 542 | .go_lock = inode_go_lock, |
@@ -546,17 +546,17 @@ const struct gfs2_glock_operations gfs2_inode_glops = { | |||
546 | }; | 546 | }; |
547 | 547 | ||
548 | const struct gfs2_glock_operations gfs2_rgrp_glops = { | 548 | const struct gfs2_glock_operations gfs2_rgrp_glops = { |
549 | .go_xmote_th = rgrp_go_sync, | 549 | .go_sync = rgrp_go_sync, |
550 | .go_inval = rgrp_go_inval, | 550 | .go_inval = rgrp_go_inval, |
551 | .go_lock = gfs2_rgrp_go_lock, | 551 | .go_lock = gfs2_rgrp_go_lock, |
552 | .go_unlock = gfs2_rgrp_go_unlock, | 552 | .go_unlock = gfs2_rgrp_go_unlock, |
553 | .go_dump = gfs2_rgrp_dump, | 553 | .go_dump = gfs2_rgrp_dump, |
554 | .go_type = LM_TYPE_RGRP, | 554 | .go_type = LM_TYPE_RGRP, |
555 | .go_flags = GLOF_ASPACE, | 555 | .go_flags = GLOF_ASPACE | GLOF_LVB, |
556 | }; | 556 | }; |
557 | 557 | ||
558 | const struct gfs2_glock_operations gfs2_trans_glops = { | 558 | const struct gfs2_glock_operations gfs2_trans_glops = { |
559 | .go_xmote_th = trans_go_sync, | 559 | .go_sync = trans_go_sync, |
560 | .go_xmote_bh = trans_go_xmote_bh, | 560 | .go_xmote_bh = trans_go_xmote_bh, |
561 | .go_demote_ok = trans_go_demote_ok, | 561 | .go_demote_ok = trans_go_demote_ok, |
562 | .go_type = LM_TYPE_NONDISK, | 562 | .go_type = LM_TYPE_NONDISK, |
@@ -577,6 +577,7 @@ const struct gfs2_glock_operations gfs2_nondisk_glops = { | |||
577 | 577 | ||
578 | const struct gfs2_glock_operations gfs2_quota_glops = { | 578 | const struct gfs2_glock_operations gfs2_quota_glops = { |
579 | .go_type = LM_TYPE_QUOTA, | 579 | .go_type = LM_TYPE_QUOTA, |
580 | .go_flags = GLOF_LVB, | ||
580 | }; | 581 | }; |
581 | 582 | ||
582 | const struct gfs2_glock_operations gfs2_journal_glops = { | 583 | const struct gfs2_glock_operations gfs2_journal_glops = { |
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 3d469d37345e..c373a24fedd9 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h | |||
@@ -205,7 +205,7 @@ struct lm_lockname { | |||
205 | 205 | ||
206 | 206 | ||
207 | struct gfs2_glock_operations { | 207 | struct gfs2_glock_operations { |
208 | void (*go_xmote_th) (struct gfs2_glock *gl); | 208 | void (*go_sync) (struct gfs2_glock *gl); |
209 | int (*go_xmote_bh) (struct gfs2_glock *gl, struct gfs2_holder *gh); | 209 | int (*go_xmote_bh) (struct gfs2_glock *gl, struct gfs2_holder *gh); |
210 | void (*go_inval) (struct gfs2_glock *gl, int flags); | 210 | void (*go_inval) (struct gfs2_glock *gl, int flags); |
211 | int (*go_demote_ok) (const struct gfs2_glock *gl); | 211 | int (*go_demote_ok) (const struct gfs2_glock *gl); |
@@ -216,6 +216,7 @@ struct gfs2_glock_operations { | |||
216 | const int go_type; | 216 | const int go_type; |
217 | const unsigned long go_flags; | 217 | const unsigned long go_flags; |
218 | #define GLOF_ASPACE 1 | 218 | #define GLOF_ASPACE 1 |
219 | #define GLOF_LVB 2 | ||
219 | }; | 220 | }; |
220 | 221 | ||
221 | enum { | 222 | enum { |
@@ -321,7 +322,6 @@ struct gfs2_glock { | |||
321 | ktime_t gl_dstamp; | 322 | ktime_t gl_dstamp; |
322 | struct gfs2_lkstats gl_stats; | 323 | struct gfs2_lkstats gl_stats; |
323 | struct dlm_lksb gl_lksb; | 324 | struct dlm_lksb gl_lksb; |
324 | char gl_lvb[32]; | ||
325 | unsigned long gl_tchange; | 325 | unsigned long gl_tchange; |
326 | void *gl_object; | 326 | void *gl_object; |
327 | 327 | ||
@@ -539,6 +539,7 @@ enum { | |||
539 | SDF_DEMOTE = 5, | 539 | SDF_DEMOTE = 5, |
540 | SDF_NOJOURNALID = 6, | 540 | SDF_NOJOURNALID = 6, |
541 | SDF_RORECOVERY = 7, /* read only recovery */ | 541 | SDF_RORECOVERY = 7, /* read only recovery */ |
542 | SDF_SKIP_DLM_UNLOCK = 8, | ||
542 | }; | 543 | }; |
543 | 544 | ||
544 | #define GFS2_FSNAME_LEN 256 | 545 | #define GFS2_FSNAME_LEN 256 |
@@ -621,6 +622,7 @@ struct gfs2_sbd { | |||
621 | u32 sd_hash_bsize_shift; | 622 | u32 sd_hash_bsize_shift; |
622 | u32 sd_hash_ptrs; /* Number of pointers in a hash block */ | 623 | u32 sd_hash_ptrs; /* Number of pointers in a hash block */ |
623 | u32 sd_qc_per_block; | 624 | u32 sd_qc_per_block; |
625 | u32 sd_blocks_per_bitmap; | ||
624 | u32 sd_max_dirres; /* Max blocks needed to add a directory entry */ | 626 | u32 sd_max_dirres; /* Max blocks needed to add a directory entry */ |
625 | u32 sd_max_height; /* Max height of a file's metadata tree */ | 627 | u32 sd_max_height; /* Max height of a file's metadata tree */ |
626 | u64 sd_heightsize[GFS2_MAX_META_HEIGHT + 1]; | 628 | u64 sd_heightsize[GFS2_MAX_META_HEIGHT + 1]; |
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 381893ceefa4..2b6f5698ef18 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c | |||
@@ -364,34 +364,34 @@ static int create_ok(struct gfs2_inode *dip, const struct qstr *name, | |||
364 | return 0; | 364 | return 0; |
365 | } | 365 | } |
366 | 366 | ||
367 | static void munge_mode_uid_gid(struct gfs2_inode *dip, umode_t *mode, | 367 | static void munge_mode_uid_gid(const struct gfs2_inode *dip, |
368 | unsigned int *uid, unsigned int *gid) | 368 | struct inode *inode) |
369 | { | 369 | { |
370 | if (GFS2_SB(&dip->i_inode)->sd_args.ar_suiddir && | 370 | if (GFS2_SB(&dip->i_inode)->sd_args.ar_suiddir && |
371 | (dip->i_inode.i_mode & S_ISUID) && dip->i_inode.i_uid) { | 371 | (dip->i_inode.i_mode & S_ISUID) && dip->i_inode.i_uid) { |
372 | if (S_ISDIR(*mode)) | 372 | if (S_ISDIR(inode->i_mode)) |
373 | *mode |= S_ISUID; | 373 | inode->i_mode |= S_ISUID; |
374 | else if (dip->i_inode.i_uid != current_fsuid()) | 374 | else if (dip->i_inode.i_uid != current_fsuid()) |
375 | *mode &= ~07111; | 375 | inode->i_mode &= ~07111; |
376 | *uid = dip->i_inode.i_uid; | 376 | inode->i_uid = dip->i_inode.i_uid; |
377 | } else | 377 | } else |
378 | *uid = current_fsuid(); | 378 | inode->i_uid = current_fsuid(); |
379 | 379 | ||
380 | if (dip->i_inode.i_mode & S_ISGID) { | 380 | if (dip->i_inode.i_mode & S_ISGID) { |
381 | if (S_ISDIR(*mode)) | 381 | if (S_ISDIR(inode->i_mode)) |
382 | *mode |= S_ISGID; | 382 | inode->i_mode |= S_ISGID; |
383 | *gid = dip->i_inode.i_gid; | 383 | inode->i_gid = dip->i_inode.i_gid; |
384 | } else | 384 | } else |
385 | *gid = current_fsgid(); | 385 | inode->i_gid = current_fsgid(); |
386 | } | 386 | } |
387 | 387 | ||
388 | static int alloc_dinode(struct gfs2_inode *dip, u64 *no_addr, u64 *generation) | 388 | static int alloc_dinode(struct gfs2_inode *ip, u32 flags) |
389 | { | 389 | { |
390 | struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); | 390 | struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); |
391 | int error; | 391 | int error; |
392 | int dblocks = 1; | 392 | int dblocks = 1; |
393 | 393 | ||
394 | error = gfs2_inplace_reserve(dip, RES_DINODE); | 394 | error = gfs2_inplace_reserve(ip, RES_DINODE, flags); |
395 | if (error) | 395 | if (error) |
396 | goto out; | 396 | goto out; |
397 | 397 | ||
@@ -399,12 +399,15 @@ static int alloc_dinode(struct gfs2_inode *dip, u64 *no_addr, u64 *generation) | |||
399 | if (error) | 399 | if (error) |
400 | goto out_ipreserv; | 400 | goto out_ipreserv; |
401 | 401 | ||
402 | error = gfs2_alloc_blocks(dip, no_addr, &dblocks, 1, generation); | 402 | error = gfs2_alloc_blocks(ip, &ip->i_no_addr, &dblocks, 1, &ip->i_generation); |
403 | ip->i_no_formal_ino = ip->i_generation; | ||
404 | ip->i_inode.i_ino = ip->i_no_addr; | ||
405 | ip->i_goal = ip->i_no_addr; | ||
403 | 406 | ||
404 | gfs2_trans_end(sdp); | 407 | gfs2_trans_end(sdp); |
405 | 408 | ||
406 | out_ipreserv: | 409 | out_ipreserv: |
407 | gfs2_inplace_release(dip); | 410 | gfs2_inplace_release(ip); |
408 | out: | 411 | out: |
409 | return error; | 412 | return error; |
410 | } | 413 | } |
@@ -429,52 +432,42 @@ static void gfs2_init_dir(struct buffer_head *dibh, | |||
429 | /** | 432 | /** |
430 | * init_dinode - Fill in a new dinode structure | 433 | * init_dinode - Fill in a new dinode structure |
431 | * @dip: The directory this inode is being created in | 434 | * @dip: The directory this inode is being created in |
432 | * @gl: The glock covering the new inode | 435 | * @ip: The inode |
433 | * @inum: The inode number | ||
434 | * @mode: The file permissions | ||
435 | * @uid: The uid of the new inode | ||
436 | * @gid: The gid of the new inode | ||
437 | * @generation: The generation number of the new inode | ||
438 | * @dev: The device number (if a device node) | ||
439 | * @symname: The symlink destination (if a symlink) | 436 | * @symname: The symlink destination (if a symlink) |
440 | * @size: The inode size (ignored for directories) | ||
441 | * @bhp: The buffer head (returned to caller) | 437 | * @bhp: The buffer head (returned to caller) |
442 | * | 438 | * |
443 | */ | 439 | */ |
444 | 440 | ||
445 | static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl, | 441 | static void init_dinode(struct gfs2_inode *dip, struct gfs2_inode *ip, |
446 | const struct gfs2_inum_host *inum, umode_t mode, | 442 | const char *symname, struct buffer_head **bhp) |
447 | unsigned int uid, unsigned int gid, | ||
448 | const u64 *generation, dev_t dev, const char *symname, | ||
449 | unsigned size, struct buffer_head **bhp) | ||
450 | { | 443 | { |
451 | struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); | 444 | struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); |
452 | struct gfs2_dinode *di; | 445 | struct gfs2_dinode *di; |
453 | struct buffer_head *dibh; | 446 | struct buffer_head *dibh; |
454 | struct timespec tv = CURRENT_TIME; | 447 | struct timespec tv = CURRENT_TIME; |
455 | 448 | ||
456 | dibh = gfs2_meta_new(gl, inum->no_addr); | 449 | dibh = gfs2_meta_new(ip->i_gl, ip->i_no_addr); |
457 | gfs2_trans_add_bh(gl, dibh, 1); | 450 | gfs2_trans_add_bh(ip->i_gl, dibh, 1); |
458 | gfs2_metatype_set(dibh, GFS2_METATYPE_DI, GFS2_FORMAT_DI); | 451 | gfs2_metatype_set(dibh, GFS2_METATYPE_DI, GFS2_FORMAT_DI); |
459 | gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); | 452 | gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); |
460 | di = (struct gfs2_dinode *)dibh->b_data; | 453 | di = (struct gfs2_dinode *)dibh->b_data; |
461 | 454 | ||
462 | di->di_num.no_formal_ino = cpu_to_be64(inum->no_formal_ino); | 455 | di->di_num.no_formal_ino = cpu_to_be64(ip->i_no_formal_ino); |
463 | di->di_num.no_addr = cpu_to_be64(inum->no_addr); | 456 | di->di_num.no_addr = cpu_to_be64(ip->i_no_addr); |
464 | di->di_mode = cpu_to_be32(mode); | 457 | di->di_mode = cpu_to_be32(ip->i_inode.i_mode); |
465 | di->di_uid = cpu_to_be32(uid); | 458 | di->di_uid = cpu_to_be32(ip->i_inode.i_uid); |
466 | di->di_gid = cpu_to_be32(gid); | 459 | di->di_gid = cpu_to_be32(ip->i_inode.i_gid); |
467 | di->di_nlink = 0; | 460 | di->di_nlink = 0; |
468 | di->di_size = cpu_to_be64(size); | 461 | di->di_size = cpu_to_be64(ip->i_inode.i_size); |
469 | di->di_blocks = cpu_to_be64(1); | 462 | di->di_blocks = cpu_to_be64(1); |
470 | di->di_atime = di->di_mtime = di->di_ctime = cpu_to_be64(tv.tv_sec); | 463 | di->di_atime = di->di_mtime = di->di_ctime = cpu_to_be64(tv.tv_sec); |
471 | di->di_major = cpu_to_be32(MAJOR(dev)); | 464 | di->di_major = cpu_to_be32(MAJOR(ip->i_inode.i_rdev)); |
472 | di->di_minor = cpu_to_be32(MINOR(dev)); | 465 | di->di_minor = cpu_to_be32(MINOR(ip->i_inode.i_rdev)); |
473 | di->di_goal_meta = di->di_goal_data = cpu_to_be64(inum->no_addr); | 466 | di->di_goal_meta = di->di_goal_data = cpu_to_be64(ip->i_no_addr); |
474 | di->di_generation = cpu_to_be64(*generation); | 467 | di->di_generation = cpu_to_be64(ip->i_generation); |
475 | di->di_flags = 0; | 468 | di->di_flags = 0; |
476 | di->__pad1 = 0; | 469 | di->__pad1 = 0; |
477 | di->di_payload_format = cpu_to_be32(S_ISDIR(mode) ? GFS2_FORMAT_DE : 0); | 470 | di->di_payload_format = cpu_to_be32(S_ISDIR(ip->i_inode.i_mode) ? GFS2_FORMAT_DE : 0); |
478 | di->di_height = 0; | 471 | di->di_height = 0; |
479 | di->__pad2 = 0; | 472 | di->__pad2 = 0; |
480 | di->__pad3 = 0; | 473 | di->__pad3 = 0; |
@@ -487,7 +480,7 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl, | |||
487 | di->di_ctime_nsec = cpu_to_be32(tv.tv_nsec); | 480 | di->di_ctime_nsec = cpu_to_be32(tv.tv_nsec); |
488 | memset(&di->di_reserved, 0, sizeof(di->di_reserved)); | 481 | memset(&di->di_reserved, 0, sizeof(di->di_reserved)); |
489 | 482 | ||
490 | switch(mode & S_IFMT) { | 483 | switch(ip->i_inode.i_mode & S_IFMT) { |
491 | case S_IFREG: | 484 | case S_IFREG: |
492 | if ((dip->i_diskflags & GFS2_DIF_INHERIT_JDATA) || | 485 | if ((dip->i_diskflags & GFS2_DIF_INHERIT_JDATA) || |
493 | gfs2_tune_get(sdp, gt_new_files_jdata)) | 486 | gfs2_tune_get(sdp, gt_new_files_jdata)) |
@@ -502,7 +495,7 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl, | |||
502 | gfs2_init_dir(dibh, dip); | 495 | gfs2_init_dir(dibh, dip); |
503 | break; | 496 | break; |
504 | case S_IFLNK: | 497 | case S_IFLNK: |
505 | memcpy(dibh->b_data + sizeof(struct gfs2_dinode), symname, size); | 498 | memcpy(dibh->b_data + sizeof(struct gfs2_dinode), symname, ip->i_inode.i_size); |
506 | break; | 499 | break; |
507 | } | 500 | } |
508 | 501 | ||
@@ -511,25 +504,22 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl, | |||
511 | *bhp = dibh; | 504 | *bhp = dibh; |
512 | } | 505 | } |
513 | 506 | ||
514 | static int make_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl, | 507 | static int make_dinode(struct gfs2_inode *dip, struct gfs2_inode *ip, |
515 | umode_t mode, const struct gfs2_inum_host *inum, | 508 | const char *symname, struct buffer_head **bhp) |
516 | const u64 *generation, dev_t dev, const char *symname, | ||
517 | unsigned int size, struct buffer_head **bhp) | ||
518 | { | 509 | { |
510 | struct inode *inode = &ip->i_inode; | ||
519 | struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); | 511 | struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); |
520 | unsigned int uid, gid; | ||
521 | int error; | 512 | int error; |
522 | 513 | ||
523 | munge_mode_uid_gid(dip, &mode, &uid, &gid); | ||
524 | error = gfs2_rindex_update(sdp); | 514 | error = gfs2_rindex_update(sdp); |
525 | if (error) | 515 | if (error) |
526 | return error; | 516 | return error; |
527 | 517 | ||
528 | error = gfs2_quota_lock(dip, uid, gid); | 518 | error = gfs2_quota_lock(dip, inode->i_uid, inode->i_gid); |
529 | if (error) | 519 | if (error) |
530 | return error; | 520 | return error; |
531 | 521 | ||
532 | error = gfs2_quota_check(dip, uid, gid); | 522 | error = gfs2_quota_check(dip, inode->i_uid, inode->i_gid); |
533 | if (error) | 523 | if (error) |
534 | goto out_quota; | 524 | goto out_quota; |
535 | 525 | ||
@@ -537,8 +527,8 @@ static int make_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl, | |||
537 | if (error) | 527 | if (error) |
538 | goto out_quota; | 528 | goto out_quota; |
539 | 529 | ||
540 | init_dinode(dip, gl, inum, mode, uid, gid, generation, dev, symname, size, bhp); | 530 | init_dinode(dip, ip, symname, bhp); |
541 | gfs2_quota_change(dip, +1, uid, gid); | 531 | gfs2_quota_change(dip, +1, inode->i_uid, inode->i_gid); |
542 | gfs2_trans_end(sdp); | 532 | gfs2_trans_end(sdp); |
543 | 533 | ||
544 | out_quota: | 534 | out_quota: |
@@ -570,7 +560,7 @@ static int link_dinode(struct gfs2_inode *dip, const struct qstr *name, | |||
570 | if (error) | 560 | if (error) |
571 | goto fail_quota_locks; | 561 | goto fail_quota_locks; |
572 | 562 | ||
573 | error = gfs2_inplace_reserve(dip, sdp->sd_max_dirres); | 563 | error = gfs2_inplace_reserve(dip, sdp->sd_max_dirres, 0); |
574 | if (error) | 564 | if (error) |
575 | goto fail_quota_locks; | 565 | goto fail_quota_locks; |
576 | 566 | ||
@@ -657,19 +647,14 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, | |||
657 | struct inode *inode = NULL; | 647 | struct inode *inode = NULL; |
658 | struct gfs2_inode *dip = GFS2_I(dir), *ip; | 648 | struct gfs2_inode *dip = GFS2_I(dir), *ip; |
659 | struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); | 649 | struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); |
660 | struct gfs2_inum_host inum = { .no_addr = 0, .no_formal_ino = 0 }; | 650 | struct gfs2_glock *io_gl; |
661 | int error; | 651 | int error; |
662 | u64 generation; | ||
663 | struct buffer_head *bh = NULL; | 652 | struct buffer_head *bh = NULL; |
653 | u32 aflags = 0; | ||
664 | 654 | ||
665 | if (!name->len || name->len > GFS2_FNAMESIZE) | 655 | if (!name->len || name->len > GFS2_FNAMESIZE) |
666 | return -ENAMETOOLONG; | 656 | return -ENAMETOOLONG; |
667 | 657 | ||
668 | /* We need a reservation to allocate the new dinode block. The | ||
669 | directory ip temporarily points to the reservation, but this is | ||
670 | being done to get a set of contiguous blocks for the new dinode. | ||
671 | Since this is a create, we don't have a sizehint yet, so it will | ||
672 | have to use the minimum reservation size. */ | ||
673 | error = gfs2_rs_alloc(dip); | 658 | error = gfs2_rs_alloc(dip); |
674 | if (error) | 659 | if (error) |
675 | return error; | 660 | return error; |
@@ -688,45 +673,72 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, | |||
688 | if (error) | 673 | if (error) |
689 | goto fail_gunlock; | 674 | goto fail_gunlock; |
690 | 675 | ||
691 | error = alloc_dinode(dip, &inum.no_addr, &generation); | 676 | inode = new_inode(sdp->sd_vfs); |
677 | if (!inode) { | ||
678 | gfs2_glock_dq_uninit(ghs); | ||
679 | return -ENOMEM; | ||
680 | } | ||
681 | ip = GFS2_I(inode); | ||
682 | error = gfs2_rs_alloc(ip); | ||
692 | if (error) | 683 | if (error) |
693 | goto fail_gunlock; | 684 | goto fail_free_inode; |
694 | inum.no_formal_ino = generation; | 685 | |
686 | set_bit(GIF_INVALID, &ip->i_flags); | ||
687 | inode->i_mode = mode; | ||
688 | inode->i_rdev = dev; | ||
689 | inode->i_size = size; | ||
690 | munge_mode_uid_gid(dip, inode); | ||
691 | ip->i_goal = dip->i_goal; | ||
695 | 692 | ||
696 | error = gfs2_glock_nq_num(sdp, inum.no_addr, &gfs2_inode_glops, | 693 | if ((GFS2_I(sdp->sd_root_dir->d_inode) == dip) || |
697 | LM_ST_EXCLUSIVE, GL_SKIP, ghs + 1); | 694 | (dip->i_diskflags & GFS2_DIF_TOPDIR)) |
695 | aflags |= GFS2_AF_ORLOV; | ||
696 | |||
697 | error = alloc_dinode(ip, aflags); | ||
698 | if (error) | 698 | if (error) |
699 | goto fail_gunlock; | 699 | goto fail_free_inode; |
700 | 700 | ||
701 | error = make_dinode(dip, ghs[1].gh_gl, mode, &inum, &generation, dev, symname, size, &bh); | 701 | error = gfs2_glock_get(sdp, ip->i_no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl); |
702 | if (error) | 702 | if (error) |
703 | goto fail_gunlock2; | 703 | goto fail_free_inode; |
704 | 704 | ||
705 | inode = gfs2_inode_lookup(dir->i_sb, IF2DT(mode), inum.no_addr, | 705 | ip->i_gl->gl_object = ip; |
706 | inum.no_formal_ino, 0); | 706 | error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, ghs + 1); |
707 | if (IS_ERR(inode)) | 707 | if (error) |
708 | goto fail_free_inode; | ||
709 | |||
710 | error = make_dinode(dip, ip, symname, &bh); | ||
711 | if (error) | ||
708 | goto fail_gunlock2; | 712 | goto fail_gunlock2; |
709 | 713 | ||
710 | ip = GFS2_I(inode); | 714 | error = gfs2_glock_get(sdp, ip->i_no_addr, &gfs2_iopen_glops, CREATE, &io_gl); |
711 | error = gfs2_inode_refresh(ip); | ||
712 | if (error) | 715 | if (error) |
713 | goto fail_gunlock2; | 716 | goto fail_gunlock2; |
714 | 717 | ||
715 | error = gfs2_rs_alloc(ip); | 718 | error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, GL_EXACT, &ip->i_iopen_gh); |
716 | if (error) | 719 | if (error) |
717 | goto fail_gunlock2; | 720 | goto fail_gunlock2; |
718 | 721 | ||
722 | ip->i_iopen_gh.gh_gl->gl_object = ip; | ||
723 | gfs2_glock_put(io_gl); | ||
724 | gfs2_set_iop(inode); | ||
725 | insert_inode_hash(inode); | ||
726 | |||
727 | error = gfs2_inode_refresh(ip); | ||
728 | if (error) | ||
729 | goto fail_gunlock3; | ||
730 | |||
719 | error = gfs2_acl_create(dip, inode); | 731 | error = gfs2_acl_create(dip, inode); |
720 | if (error) | 732 | if (error) |
721 | goto fail_gunlock2; | 733 | goto fail_gunlock3; |
722 | 734 | ||
723 | error = gfs2_security_init(dip, ip, name); | 735 | error = gfs2_security_init(dip, ip, name); |
724 | if (error) | 736 | if (error) |
725 | goto fail_gunlock2; | 737 | goto fail_gunlock3; |
726 | 738 | ||
727 | error = link_dinode(dip, name, ip); | 739 | error = link_dinode(dip, name, ip); |
728 | if (error) | 740 | if (error) |
729 | goto fail_gunlock2; | 741 | goto fail_gunlock3; |
730 | 742 | ||
731 | if (bh) | 743 | if (bh) |
732 | brelse(bh); | 744 | brelse(bh); |
@@ -739,8 +751,20 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, | |||
739 | d_instantiate(dentry, inode); | 751 | d_instantiate(dentry, inode); |
740 | return 0; | 752 | return 0; |
741 | 753 | ||
754 | fail_gunlock3: | ||
755 | gfs2_glock_dq_uninit(ghs + 1); | ||
756 | if (ip->i_gl) | ||
757 | gfs2_glock_put(ip->i_gl); | ||
758 | goto fail_gunlock; | ||
759 | |||
742 | fail_gunlock2: | 760 | fail_gunlock2: |
743 | gfs2_glock_dq_uninit(ghs + 1); | 761 | gfs2_glock_dq_uninit(ghs + 1); |
762 | fail_free_inode: | ||
763 | if (ip->i_gl) | ||
764 | gfs2_glock_put(ip->i_gl); | ||
765 | gfs2_rs_delete(ip); | ||
766 | free_inode_nonrcu(inode); | ||
767 | inode = NULL; | ||
744 | fail_gunlock: | 768 | fail_gunlock: |
745 | gfs2_glock_dq_uninit(ghs); | 769 | gfs2_glock_dq_uninit(ghs); |
746 | if (inode && !IS_ERR(inode)) { | 770 | if (inode && !IS_ERR(inode)) { |
@@ -748,7 +772,6 @@ fail_gunlock: | |||
748 | iput(inode); | 772 | iput(inode); |
749 | } | 773 | } |
750 | fail: | 774 | fail: |
751 | gfs2_rs_delete(dip); | ||
752 | if (bh) | 775 | if (bh) |
753 | brelse(bh); | 776 | brelse(bh); |
754 | return error; | 777 | return error; |
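The reworked gfs2_create_inode() above now builds the VFS inode before touching the disk. A condensed outline of the new ordering (summarising the hunks, not additional code):

	inode = new_inode(sb);              /* VFS inode first              */
	gfs2_rs_alloc(ip);                  /* per-inode block reservation  */
	munge_mode_uid_gid(dip, inode);     /* mode/uid/gid from the parent */
	alloc_dinode(ip, aflags);           /* aflags carries GFS2_AF_ORLOV
	                                       for top-level directories    */
	gfs2_glock_get(..., &ip->i_gl);     /* inode glock, locked EX       */
	make_dinode(dip, ip, symname, &bh); /* write the new on-disk dinode */
	gfs2_glock_get(..., &io_gl);        /* iopen glock, locked SHARED   */
	gfs2_set_iop(inode);
	insert_inode_hash(inode);
	/* ...then gfs2_inode_refresh(), ACLs, security attrs and finally
	 * link_dinode() to enter the name in the parent directory. */

The split error labels mirror these stages: fail_gunlock3 handles failures after the inode is hashed, while fail_gunlock2 and fail_free_inode unwind the earlier, not-yet-hashed cases.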
@@ -884,7 +907,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, | |||
884 | if (error) | 907 | if (error) |
885 | goto out_gunlock; | 908 | goto out_gunlock; |
886 | 909 | ||
887 | error = gfs2_inplace_reserve(dip, sdp->sd_max_dirres); | 910 | error = gfs2_inplace_reserve(dip, sdp->sd_max_dirres, 0); |
888 | if (error) | 911 | if (error) |
889 | goto out_gunlock_q; | 912 | goto out_gunlock_q; |
890 | 913 | ||
@@ -977,7 +1000,6 @@ static int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, | |||
977 | * gfs2_unlink_inode - Removes an inode from its parent dir and unlinks it | 1000 | * gfs2_unlink_inode - Removes an inode from its parent dir and unlinks it |
978 | * @dip: The parent directory | 1001 | * @dip: The parent directory |
979 | * @name: The name of the entry in the parent directory | 1002 | * @name: The name of the entry in the parent directory |
980 | * @bh: The inode buffer for the inode to be removed | ||
981 | * @inode: The inode to be removed | 1003 | * @inode: The inode to be removed |
982 | * | 1004 | * |
983 | * Called with all the locks and in a transaction. This will only be | 1005 | * Called with all the locks and in a transaction. This will only be |
@@ -987,8 +1009,7 @@ static int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, | |||
987 | */ | 1009 | */ |
988 | 1010 | ||
989 | static int gfs2_unlink_inode(struct gfs2_inode *dip, | 1011 | static int gfs2_unlink_inode(struct gfs2_inode *dip, |
990 | const struct dentry *dentry, | 1012 | const struct dentry *dentry) |
991 | struct buffer_head *bh) | ||
992 | { | 1013 | { |
993 | struct inode *inode = dentry->d_inode; | 1014 | struct inode *inode = dentry->d_inode; |
994 | struct gfs2_inode *ip = GFS2_I(inode); | 1015 | struct gfs2_inode *ip = GFS2_I(inode); |
@@ -1028,7 +1049,6 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry) | |||
1028 | struct gfs2_sbd *sdp = GFS2_SB(dir); | 1049 | struct gfs2_sbd *sdp = GFS2_SB(dir); |
1029 | struct inode *inode = dentry->d_inode; | 1050 | struct inode *inode = dentry->d_inode; |
1030 | struct gfs2_inode *ip = GFS2_I(inode); | 1051 | struct gfs2_inode *ip = GFS2_I(inode); |
1031 | struct buffer_head *bh; | ||
1032 | struct gfs2_holder ghs[3]; | 1052 | struct gfs2_holder ghs[3]; |
1033 | struct gfs2_rgrpd *rgd; | 1053 | struct gfs2_rgrpd *rgd; |
1034 | int error; | 1054 | int error; |
@@ -1077,14 +1097,9 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry) | |||
1077 | 1097 | ||
1078 | error = gfs2_trans_begin(sdp, 2*RES_DINODE + 3*RES_LEAF + RES_RG_BIT, 0); | 1098 | error = gfs2_trans_begin(sdp, 2*RES_DINODE + 3*RES_LEAF + RES_RG_BIT, 0); |
1079 | if (error) | 1099 | if (error) |
1080 | goto out_gunlock; | ||
1081 | |||
1082 | error = gfs2_meta_inode_buffer(ip, &bh); | ||
1083 | if (error) | ||
1084 | goto out_end_trans; | 1100 | goto out_end_trans; |
1085 | 1101 | ||
1086 | error = gfs2_unlink_inode(dip, dentry, bh); | 1102 | error = gfs2_unlink_inode(dip, dentry); |
1087 | brelse(bh); | ||
1088 | 1103 | ||
1089 | out_end_trans: | 1104 | out_end_trans: |
1090 | gfs2_trans_end(sdp); | 1105 | gfs2_trans_end(sdp); |
@@ -1365,7 +1380,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, | |||
1365 | if (error) | 1380 | if (error) |
1366 | goto out_gunlock; | 1381 | goto out_gunlock; |
1367 | 1382 | ||
1368 | error = gfs2_inplace_reserve(ndip, sdp->sd_max_dirres); | 1383 | error = gfs2_inplace_reserve(ndip, sdp->sd_max_dirres, 0); |
1369 | if (error) | 1384 | if (error) |
1370 | goto out_gunlock_q; | 1385 | goto out_gunlock_q; |
1371 | 1386 | ||
@@ -1384,14 +1399,8 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, | |||
1384 | 1399 | ||
1385 | /* Remove the target file, if it exists */ | 1400 | /* Remove the target file, if it exists */ |
1386 | 1401 | ||
1387 | if (nip) { | 1402 | if (nip) |
1388 | struct buffer_head *bh; | 1403 | error = gfs2_unlink_inode(ndip, ndentry); |
1389 | error = gfs2_meta_inode_buffer(nip, &bh); | ||
1390 | if (error) | ||
1391 | goto out_end_trans; | ||
1392 | error = gfs2_unlink_inode(ndip, ndentry, bh); | ||
1393 | brelse(bh); | ||
1394 | } | ||
1395 | 1404 | ||
1396 | if (dir_rename) { | 1405 | if (dir_rename) { |
1397 | error = gfs2_dir_mvino(ip, &gfs2_qdotdot, ndip, DT_DIR); | 1406 | error = gfs2_dir_mvino(ip, &gfs2_qdotdot, ndip, DT_DIR); |
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index 0fb6539b0c8c..8dad6b093716 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c | |||
@@ -120,8 +120,8 @@ static void gdlm_ast(void *arg) | |||
120 | gfs2_update_reply_times(gl); | 120 | gfs2_update_reply_times(gl); |
121 | BUG_ON(gl->gl_lksb.sb_flags & DLM_SBF_DEMOTED); | 121 | BUG_ON(gl->gl_lksb.sb_flags & DLM_SBF_DEMOTED); |
122 | 122 | ||
123 | if (gl->gl_lksb.sb_flags & DLM_SBF_VALNOTVALID) | 123 | if ((gl->gl_lksb.sb_flags & DLM_SBF_VALNOTVALID) && gl->gl_lksb.sb_lvbptr) |
124 | memset(gl->gl_lvb, 0, GDLM_LVB_SIZE); | 124 | memset(gl->gl_lksb.sb_lvbptr, 0, GDLM_LVB_SIZE); |
125 | 125 | ||
126 | switch (gl->gl_lksb.sb_status) { | 126 | switch (gl->gl_lksb.sb_status) { |
127 | case -DLM_EUNLOCK: /* Unlocked, so glock can be freed */ | 127 | case -DLM_EUNLOCK: /* Unlocked, so glock can be freed */ |
@@ -203,8 +203,10 @@ static int make_mode(const unsigned int lmstate) | |||
203 | static u32 make_flags(struct gfs2_glock *gl, const unsigned int gfs_flags, | 203 | static u32 make_flags(struct gfs2_glock *gl, const unsigned int gfs_flags, |
204 | const int req) | 204 | const int req) |
205 | { | 205 | { |
206 | u32 lkf = DLM_LKF_VALBLK; | 206 | u32 lkf = 0; |
207 | u32 lkid = gl->gl_lksb.sb_lkid; | 207 | |
208 | if (gl->gl_lksb.sb_lvbptr) | ||
209 | lkf |= DLM_LKF_VALBLK; | ||
208 | 210 | ||
209 | if (gfs_flags & LM_FLAG_TRY) | 211 | if (gfs_flags & LM_FLAG_TRY) |
210 | lkf |= DLM_LKF_NOQUEUE; | 212 | lkf |= DLM_LKF_NOQUEUE; |
@@ -228,7 +230,7 @@ static u32 make_flags(struct gfs2_glock *gl, const unsigned int gfs_flags, | |||
228 | BUG(); | 230 | BUG(); |
229 | } | 231 | } |
230 | 232 | ||
231 | if (lkid != 0) { | 233 | if (gl->gl_lksb.sb_lkid != 0) { |
232 | lkf |= DLM_LKF_CONVERT; | 234 | lkf |= DLM_LKF_CONVERT; |
233 | if (test_bit(GLF_BLOCKING, &gl->gl_flags)) | 235 | if (test_bit(GLF_BLOCKING, &gl->gl_flags)) |
234 | lkf |= DLM_LKF_QUECVT; | 236 | lkf |= DLM_LKF_QUECVT; |
@@ -289,6 +291,14 @@ static void gdlm_put_lock(struct gfs2_glock *gl) | |||
289 | gfs2_glstats_inc(gl, GFS2_LKS_DCOUNT); | 291 | gfs2_glstats_inc(gl, GFS2_LKS_DCOUNT); |
290 | gfs2_sbstats_inc(gl, GFS2_LKS_DCOUNT); | 292 | gfs2_sbstats_inc(gl, GFS2_LKS_DCOUNT); |
291 | gfs2_update_request_times(gl); | 293 | gfs2_update_request_times(gl); |
294 | |||
295 | /* don't want to skip dlm_unlock writing the lvb when lock is ex */ | ||
296 | if (test_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags) && | ||
297 | gl->gl_lksb.sb_lvbptr && (gl->gl_state != LM_ST_EXCLUSIVE)) { | ||
298 | gfs2_glock_free(gl); | ||
299 | return; | ||
300 | } | ||
301 | |||
292 | error = dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_VALBLK, | 302 | error = dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_VALBLK, |
293 | NULL, gl); | 303 | NULL, gl); |
294 | if (error) { | 304 | if (error) { |
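Together with the SDF_SKIP_DLM_UNLOCK flag that gfs2_gl_hash_clear() now sets, this gives unmount a fast path: for LVB-carrying glocks not held in EX, the explicit dlm_unlock() round trip is skipped, since releasing the lockspace will drop the locks anyway. EX-held locks still unlock through DLM because only dlm_unlock(..., DLM_LKF_VALBLK, ...) writes the value block back. A sketch of the test, mirroring the hunk above:

	if (test_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags) &&
	    gl->gl_lksb.sb_lvbptr && gl->gl_state != LM_ST_EXCLUSIVE) {
		gfs2_glock_free(gl);    /* no network round trip */
		return;
	}
	/* fall through to dlm_unlock() so an EX-held LVB is written
	 * back before the lock is dropped */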
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index e443966c8106..0e3554edb8f2 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c | |||
@@ -278,6 +278,9 @@ static int gfs2_read_sb(struct gfs2_sbd *sdp, int silent) | |||
278 | sdp->sd_qc_per_block = (sdp->sd_sb.sb_bsize - | 278 | sdp->sd_qc_per_block = (sdp->sd_sb.sb_bsize - |
279 | sizeof(struct gfs2_meta_header)) / | 279 | sizeof(struct gfs2_meta_header)) / |
280 | sizeof(struct gfs2_quota_change); | 280 | sizeof(struct gfs2_quota_change); |
281 | sdp->sd_blocks_per_bitmap = (sdp->sd_sb.sb_bsize - | ||
282 | sizeof(struct gfs2_meta_header)) | ||
283 | * GFS2_NBBY; /* not the rgrp bitmap, subsequent bitmaps only */ | ||
281 | 284 | ||
282 | /* Compute maximum reservation required to add an entry to a directory */ | 285 |
283 | 286 | ||
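For a sense of scale (the 4 KiB block size is an assumed example): GFS2_NBBY is the number of blocks described per bitmap byte (4, at two bits each), so

	u32 per_bitmap = (4096 - sizeof(struct gfs2_meta_header)) * GFS2_NBBY;
	/* = (4096 - 24) * 4 = 16288 blocks per subsequent bitmap block */

As the added comment notes, the first bitmap block is excluded because it carries the larger gfs2_rgrp header; the new gfs2_rbm_from_block() in rgrp.c compensates for that difference.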
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index c5af8e18f27a..ae55e248c3b7 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c | |||
@@ -816,7 +816,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda) | |||
816 | blocks = num_qd * data_blocks + RES_DINODE + num_qd + 3; | 816 | blocks = num_qd * data_blocks + RES_DINODE + num_qd + 3; |
817 | 817 | ||
818 | reserved = 1 + (nalloc * (data_blocks + ind_blocks)); | 818 | reserved = 1 + (nalloc * (data_blocks + ind_blocks)); |
819 | error = gfs2_inplace_reserve(ip, reserved); | 819 | error = gfs2_inplace_reserve(ip, reserved, 0); |
820 | if (error) | 820 | if (error) |
821 | goto out_alloc; | 821 | goto out_alloc; |
822 | 822 | ||
@@ -869,7 +869,7 @@ static int update_qd(struct gfs2_sbd *sdp, struct gfs2_quota_data *qd) | |||
869 | if (error < 0) | 869 | if (error < 0) |
870 | return error; | 870 | return error; |
871 | 871 | ||
872 | qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb; | 872 | qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lksb.sb_lvbptr; |
873 | qlvb->qb_magic = cpu_to_be32(GFS2_MAGIC); | 873 | qlvb->qb_magic = cpu_to_be32(GFS2_MAGIC); |
874 | qlvb->__pad = 0; | 874 | qlvb->__pad = 0; |
875 | qlvb->qb_limit = q.qu_limit; | 875 | qlvb->qb_limit = q.qu_limit; |
@@ -893,7 +893,7 @@ restart: | |||
893 | if (error) | 893 | if (error) |
894 | return error; | 894 | return error; |
895 | 895 | ||
896 | qd->qd_qb = *(struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb; | 896 | qd->qd_qb = *(struct gfs2_quota_lvb *)qd->qd_gl->gl_lksb.sb_lvbptr; |
897 | 897 | ||
898 | if (force_refresh || qd->qd_qb.qb_magic != cpu_to_be32(GFS2_MAGIC)) { | 898 | if (force_refresh || qd->qd_qb.qb_magic != cpu_to_be32(GFS2_MAGIC)) { |
899 | gfs2_glock_dq_uninit(q_gh); | 899 | gfs2_glock_dq_uninit(q_gh); |
@@ -1506,7 +1506,7 @@ static int gfs2_get_dqblk(struct super_block *sb, struct kqid qid, | |||
1506 | if (error) | 1506 | if (error) |
1507 | goto out; | 1507 | goto out; |
1508 | 1508 | ||
1509 | qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb; | 1509 | qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lksb.sb_lvbptr; |
1510 | fdq->d_version = FS_DQUOT_VERSION; | 1510 | fdq->d_version = FS_DQUOT_VERSION; |
1511 | fdq->d_flags = (type == QUOTA_USER) ? FS_USER_QUOTA : FS_GROUP_QUOTA; | 1511 | fdq->d_flags = (type == QUOTA_USER) ? FS_USER_QUOTA : FS_GROUP_QUOTA; |
1512 | fdq->d_id = from_kqid(&init_user_ns, qid); | 1512 | fdq->d_id = from_kqid(&init_user_ns, qid); |
@@ -1605,7 +1605,7 @@ static int gfs2_set_dqblk(struct super_block *sb, struct kqid qid, | |||
1605 | gfs2_write_calc_reserv(ip, sizeof(struct gfs2_quota), | 1605 | gfs2_write_calc_reserv(ip, sizeof(struct gfs2_quota), |
1606 | &data_blocks, &ind_blocks); | 1606 | &data_blocks, &ind_blocks); |
1607 | blocks = 1 + data_blocks + ind_blocks; | 1607 | blocks = 1 + data_blocks + ind_blocks; |
1608 | error = gfs2_inplace_reserve(ip, blocks); | 1608 | error = gfs2_inplace_reserve(ip, blocks, 0); |
1609 | if (error) | 1609 | if (error) |
1610 | goto out_i; | 1610 | goto out_i; |
1611 | blocks += gfs2_rg_blocks(ip, blocks); | 1611 | blocks += gfs2_rg_blocks(ip, blocks); |
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 38fe18f2f055..37ee061d899e 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c | |||
@@ -16,6 +16,7 @@ | |||
16 | #include <linux/prefetch.h> | 16 | #include <linux/prefetch.h> |
17 | #include <linux/blkdev.h> | 17 | #include <linux/blkdev.h> |
18 | #include <linux/rbtree.h> | 18 | #include <linux/rbtree.h> |
19 | #include <linux/random.h> | ||
19 | 20 | ||
20 | #include "gfs2.h" | 21 | #include "gfs2.h" |
21 | #include "incore.h" | 22 | #include "incore.h" |
@@ -251,22 +252,25 @@ static u32 gfs2_bitfit(const u8 *buf, const unsigned int len, | |||
251 | static int gfs2_rbm_from_block(struct gfs2_rbm *rbm, u64 block) | 252 | static int gfs2_rbm_from_block(struct gfs2_rbm *rbm, u64 block) |
252 | { | 253 | { |
253 | u64 rblock = block - rbm->rgd->rd_data0; | 254 | u64 rblock = block - rbm->rgd->rd_data0; |
254 | u32 goal = (u32)rblock; | 255 | u32 x; |
255 | int x; | ||
256 | 256 | ||
257 | if (WARN_ON_ONCE(rblock > UINT_MAX)) | 257 | if (WARN_ON_ONCE(rblock > UINT_MAX)) |
258 | return -EINVAL; | 258 | return -EINVAL; |
259 | if (block >= rbm->rgd->rd_data0 + rbm->rgd->rd_data) | 259 | if (block >= rbm->rgd->rd_data0 + rbm->rgd->rd_data) |
260 | return -E2BIG; | 260 | return -E2BIG; |
261 | 261 | ||
262 | for (x = 0; x < rbm->rgd->rd_length; x++) { | 262 | rbm->bi = rbm->rgd->rd_bits; |
263 | rbm->bi = rbm->rgd->rd_bits + x; | 263 | rbm->offset = (u32)(rblock); |
264 | if (goal < (rbm->bi->bi_start + rbm->bi->bi_len) * GFS2_NBBY) { | 264 | /* Check if the block is within the first bitmap block */ |
265 | rbm->offset = goal - (rbm->bi->bi_start * GFS2_NBBY); | 265 | if (rbm->offset < (rbm->bi->bi_start + rbm->bi->bi_len) * GFS2_NBBY) |
266 | break; | 266 | return 0; |
267 | } | ||
268 | } | ||
269 | 267 | ||
268 | /* Adjust for the size diff between gfs2_meta_header and gfs2_rgrp */ | ||
269 | rbm->offset += (sizeof(struct gfs2_rgrp) - | ||
270 | sizeof(struct gfs2_meta_header)) * GFS2_NBBY; | ||
271 | x = rbm->offset / rbm->rgd->rd_sbd->sd_blocks_per_bitmap; | ||
272 | rbm->offset -= x * rbm->rgd->rd_sbd->sd_blocks_per_bitmap; | ||
273 | rbm->bi += x; | ||
270 | return 0; | 274 | return 0; |
271 | } | 275 | } |
272 | 276 | ||
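The rewrite above turns a per-bitmap linear scan into constant-time arithmetic. The key trick: the first bitmap block carries the larger gfs2_rgrp header and so holds fewer entries than the gfs2_meta_header-only blocks that follow; biasing the offset by that size difference makes every block appear to hold sd_blocks_per_bitmap entries, so one divide/modulo finds the right bitmap. A sketch of the computation (names abbreviated, restating the hunk):

	offset = rblock;                     /* block - rd_data0          */
	if (offset < (bi[0].bi_start + bi[0].bi_len) * GFS2_NBBY)
		return 0;                    /* lives in the first bitmap */
	offset += (sizeof(struct gfs2_rgrp) -
	           sizeof(struct gfs2_meta_header)) * GFS2_NBBY;
	x       = offset / sd_blocks_per_bitmap;  /* which bitmap block   */
	offset -= x * sd_blocks_per_bitmap;       /* offset within it     */
	bi     += x;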
@@ -875,7 +879,7 @@ static int read_rindex_entry(struct gfs2_inode *ip) | |||
875 | goto fail; | 879 | goto fail; |
876 | 880 | ||
877 | rgd->rd_gl->gl_object = rgd; | 881 | rgd->rd_gl->gl_object = rgd; |
878 | rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lvb; | 882 | rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lksb.sb_lvbptr; |
879 | rgd->rd_flags &= ~GFS2_RDF_UPTODATE; | 883 | rgd->rd_flags &= ~GFS2_RDF_UPTODATE; |
880 | if (rgd->rd_data > sdp->sd_max_rg_data) | 884 | if (rgd->rd_data > sdp->sd_max_rg_data) |
881 | sdp->sd_max_rg_data = rgd->rd_data; | 885 | sdp->sd_max_rg_data = rgd->rd_data; |
@@ -1678,13 +1682,105 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip | |||
1678 | return; | 1682 | return; |
1679 | } | 1683 | } |
1680 | 1684 | ||
1685 | /** | ||
1686 | * gfs2_rgrp_congested - Use stats to figure out whether an rgrp is congested | ||
1687 | * @rgd: The rgrp in question | ||
1688 | * @loops: An indication of how picky we can be (0=very, 1=less so) | ||
1689 | * | ||
1690 | * This function uses the recently added glock statistics in order to | ||
1691 | * figure out whether a particular resource group is suffering from | ||
1692 | * contention from multiple nodes. This is done purely on the basis | ||
1693 | * of timings, since this is the only data we have to work with and | ||
1694 | * our aim here is to reject a resource group which is highly contended | ||
1695 | * but (very important) not to do this too often in order to ensure that | ||
1696 | * we do not end up introducing fragmentation by changing resource | ||
1697 | * groups when not actually required. | ||
1698 | * | ||
1699 | * The calculation is fairly simple: we want to know whether the SRTTB | ||
1700 | * (i.e. smoothed round trip time for blocking operations) to acquire | ||
1701 | * the lock for this rgrp's glock is significantly greater than the | ||
1702 | * time taken for resource groups on average. We introduce a margin in | ||
1703 | * the form of the variable @var which is computed as the sum of the two | ||
1704 | * respective variances, and multiplied by a factor depending on @loops | ||
1705 | * and whether we have a lot of data to base the decision on. This is | ||
1706 | * then tested against the square difference of the means in order to | ||
1707 | * decide whether the result is statistically significant or not. | ||
1708 | * | ||
1709 | * Returns: A boolean verdict on the congestion status | ||
1710 | */ | ||
1711 | |||
1712 | static bool gfs2_rgrp_congested(const struct gfs2_rgrpd *rgd, int loops) | ||
1713 | { | ||
1714 | const struct gfs2_glock *gl = rgd->rd_gl; | ||
1715 | const struct gfs2_sbd *sdp = gl->gl_sbd; | ||
1716 | struct gfs2_lkstats *st; | ||
1717 | s64 r_dcount, l_dcount; | ||
1718 | s64 r_srttb, l_srttb; | ||
1719 | s64 srttb_diff; | ||
1720 | s64 sqr_diff; | ||
1721 | s64 var; | ||
1722 | |||
1723 | preempt_disable(); | ||
1724 | st = &this_cpu_ptr(sdp->sd_lkstats)->lkstats[LM_TYPE_RGRP]; | ||
1725 | r_srttb = st->stats[GFS2_LKS_SRTTB]; | ||
1726 | r_dcount = st->stats[GFS2_LKS_DCOUNT]; | ||
1727 | var = st->stats[GFS2_LKS_SRTTVARB] + | ||
1728 | gl->gl_stats.stats[GFS2_LKS_SRTTVARB]; | ||
1729 | preempt_enable(); | ||
1730 | |||
1731 | l_srttb = gl->gl_stats.stats[GFS2_LKS_SRTTB]; | ||
1732 | l_dcount = gl->gl_stats.stats[GFS2_LKS_DCOUNT]; | ||
1733 | |||
1734 | if ((l_dcount < 1) || (r_dcount < 1) || (r_srttb == 0)) | ||
1735 | return false; | ||
1736 | |||
1737 | srttb_diff = r_srttb - l_srttb; | ||
1738 | sqr_diff = srttb_diff * srttb_diff; | ||
1739 | |||
1740 | var *= 2; | ||
1741 | if (l_dcount < 8 || r_dcount < 8) | ||
1742 | var *= 2; | ||
1743 | if (loops == 1) | ||
1744 | var *= 2; | ||
1745 | |||
1746 | return ((srttb_diff < 0) && (sqr_diff > var)); | ||
1747 | } | ||
1748 | |||
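The decision above boils down to a one-sided significance test: treat the rgrp as congested only when its own smoothed round-trip time exceeds the per-CPU average for rgrp glocks and the squared gap clears a variance-based margin, a margin that widens when sample counts are low or when we are on a later pass. A hedged userspace restatement of the same arithmetic (simplified types, not the kernel's API):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    static bool rgrp_congested(int64_t l_srttb, int64_t r_srttb,
    			   int64_t l_var, int64_t r_var,
    			   int64_t l_dcount, int64_t r_dcount, int loops)
    {
    	if (l_dcount < 1 || r_dcount < 1 || r_srttb == 0)
    		return false;		/* not enough data: assume fine */

    	int64_t diff = r_srttb - l_srttb;
    	int64_t sqr_diff = diff * diff;
    	int64_t var = (l_var + r_var) * 2;	/* base margin */

    	if (l_dcount < 8 || r_dcount < 8)
    		var *= 2;		/* few samples: widen the margin */
    	if (loops == 1)
    		var *= 2;		/* later pass: be less picky */

    	/* Congested only if this rgrp is slower than average (diff < 0
    	 * means the local SRTT exceeds the mean) and the gap is
    	 * significant relative to the variance margin. */
    	return diff < 0 && sqr_diff > var;
    }

    int main(void)
    {
    	/* local SRTT 900 vs average 500, tight variances: congested */
    	printf("%d\n", rgrp_congested(900, 500, 100, 100, 10, 10, 0));
    	return 0;
    }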
1749 | /** | ||
1750 | * gfs2_rgrp_used_recently - test whether the rgrp glock was used recently | ||
1751 | * @rs: The block reservation with the rgrp to test | ||
1752 | * @msecs: The time limit in milliseconds | ||
1753 | * | ||
1754 | * Returns: True if the rgrp glock has been used within the time limit | ||
1755 | */ | ||
1756 | static bool gfs2_rgrp_used_recently(const struct gfs2_blkreserv *rs, | ||
1757 | u64 msecs) | ||
1758 | { | ||
1759 | u64 tdiff; | ||
1760 | |||
1761 | tdiff = ktime_to_ns(ktime_sub(ktime_get_real(), | ||
1762 | rs->rs_rbm.rgd->rd_gl->gl_dstamp)); | ||
1763 | |||
1764 | return tdiff > (msecs * 1000 * 1000); | ||
1765 | } | ||
1766 | |||
1767 | static u32 gfs2_orlov_skip(const struct gfs2_inode *ip) | ||
1768 | { | ||
1769 | const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); | ||
1770 | u32 skip; | ||
1771 | |||
1772 | get_random_bytes(&skip, sizeof(skip)); | ||
1773 | return skip % sdp->sd_rgrps; | ||
1774 | } | ||
1775 | |||
1681 | static bool gfs2_select_rgrp(struct gfs2_rgrpd **pos, const struct gfs2_rgrpd *begin) | 1776 | static bool gfs2_select_rgrp(struct gfs2_rgrpd **pos, const struct gfs2_rgrpd *begin) |
1682 | { | 1777 | { |
1683 | struct gfs2_rgrpd *rgd = *pos; | 1778 | struct gfs2_rgrpd *rgd = *pos; |
1779 | struct gfs2_sbd *sdp = rgd->rd_sbd; | ||
1684 | 1780 | ||
1685 | rgd = gfs2_rgrpd_get_next(rgd); | 1781 | rgd = gfs2_rgrpd_get_next(rgd); |
1686 | if (rgd == NULL) | 1782 | if (rgd == NULL) |
1687 | rgd = gfs2_rgrpd_get_next(NULL); | 1783 | rgd = gfs2_rgrpd_get_first(sdp); |
1688 | *pos = rgd; | 1784 | *pos = rgd; |
1689 | if (rgd != begin) /* If we didn't wrap */ | 1785 | if (rgd != begin) /* If we didn't wrap */ |
1690 | return true; | 1786 | return true; |
@@ -1699,14 +1795,15 @@ static bool gfs2_select_rgrp(struct gfs2_rgrpd **pos, const struct gfs2_rgrpd *b | |||
1699 | * Returns: errno | 1795 | * Returns: errno |
1700 | */ | 1796 | */ |
1701 | 1797 | ||
1702 | int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested) | 1798 | int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested, u32 aflags) |
1703 | { | 1799 | { |
1704 | struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); | 1800 | struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); |
1705 | struct gfs2_rgrpd *begin = NULL; | 1801 | struct gfs2_rgrpd *begin = NULL; |
1706 | struct gfs2_blkreserv *rs = ip->i_res; | 1802 | struct gfs2_blkreserv *rs = ip->i_res; |
1707 | int error = 0, rg_locked, flags = LM_FLAG_TRY; | 1803 | int error = 0, rg_locked, flags = 0; |
1708 | u64 last_unlinked = NO_BLOCK; | 1804 | u64 last_unlinked = NO_BLOCK; |
1709 | int loops = 0; | 1805 | int loops = 0; |
1806 | u32 skip = 0; | ||
1710 | 1807 | ||
1711 | if (sdp->sd_args.ar_rgrplvb) | 1808 | if (sdp->sd_args.ar_rgrplvb) |
1712 | flags |= GL_SKIP; | 1809 | flags |= GL_SKIP; |
@@ -1720,6 +1817,8 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested) | |||
1720 | } else { | 1817 | } else { |
1721 | rs->rs_rbm.rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1); | 1818 | rs->rs_rbm.rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1); |
1722 | } | 1819 | } |
1820 | if (S_ISDIR(ip->i_inode.i_mode) && (aflags & GFS2_AF_ORLOV)) | ||
1821 | skip = gfs2_orlov_skip(ip); | ||
1723 | if (rs->rs_rbm.rgd == NULL) | 1822 | if (rs->rs_rbm.rgd == NULL) |
1724 | return -EBADSLT; | 1823 | return -EBADSLT; |
1725 | 1824 | ||
@@ -1728,13 +1827,20 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested) | |||
1728 | 1827 | ||
1729 | if (!gfs2_glock_is_locked_by_me(rs->rs_rbm.rgd->rd_gl)) { | 1828 | if (!gfs2_glock_is_locked_by_me(rs->rs_rbm.rgd->rd_gl)) { |
1730 | rg_locked = 0; | 1829 | rg_locked = 0; |
1830 | if (skip && skip--) | ||
1831 | goto next_rgrp; | ||
1832 | if (!gfs2_rs_active(rs) && (loops < 2) && | ||
1833 | gfs2_rgrp_used_recently(rs, 1000) && | ||
1834 | gfs2_rgrp_congested(rs->rs_rbm.rgd, loops)) | ||
1835 | goto next_rgrp; | ||
1731 | error = gfs2_glock_nq_init(rs->rs_rbm.rgd->rd_gl, | 1836 | error = gfs2_glock_nq_init(rs->rs_rbm.rgd->rd_gl, |
1732 | LM_ST_EXCLUSIVE, flags, | 1837 | LM_ST_EXCLUSIVE, flags, |
1733 | &rs->rs_rgd_gh); | 1838 | &rs->rs_rgd_gh); |
1734 | if (error == GLR_TRYFAILED) | ||
1735 | goto next_rgrp; | ||
1736 | if (unlikely(error)) | 1839 | if (unlikely(error)) |
1737 | return error; | 1840 | return error; |
1841 | if (!gfs2_rs_active(rs) && (loops < 2) && | ||
1842 | gfs2_rgrp_congested(rs->rs_rbm.rgd, loops)) | ||
1843 | goto skip_rgrp; | ||
1738 | if (sdp->sd_args.ar_rgrplvb) { | 1844 | if (sdp->sd_args.ar_rgrplvb) { |
1739 | error = update_rgrp_lvb(rs->rs_rbm.rgd); | 1845 | error = update_rgrp_lvb(rs->rs_rbm.rgd); |
1740 | if (unlikely(error)) { | 1846 | if (unlikely(error)) { |
@@ -1781,12 +1887,13 @@ next_rgrp: | |||
1781 | /* Find the next rgrp, and continue looking */ | 1887 | /* Find the next rgrp, and continue looking */ |
1782 | if (gfs2_select_rgrp(&rs->rs_rbm.rgd, begin)) | 1888 | if (gfs2_select_rgrp(&rs->rs_rbm.rgd, begin)) |
1783 | continue; | 1889 | continue; |
1890 | if (skip) | ||
1891 | continue; | ||
1784 | 1892 | ||
1785 | /* If we've scanned all the rgrps, but found no free blocks | 1893 | /* If we've scanned all the rgrps, but found no free blocks |
1786 | * then this checks for some less likely conditions before | 1894 | * then this checks for some less likely conditions before |
1787 | * trying again. | 1895 | * trying again. |
1788 | */ | 1896 | */ |
1789 | flags &= ~LM_FLAG_TRY; | ||
1790 | loops++; | 1897 | loops++; |
1791 | /* Check that fs hasn't grown if writing to rindex */ | 1898 | /* Check that fs hasn't grown if writing to rindex */ |
1792 | if (ip == GFS2_I(sdp->sd_rindex) && !sdp->sd_rindex_uptodate) { | 1899 | if (ip == GFS2_I(sdp->sd_rindex) && !sdp->sd_rindex_uptodate) { |
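The Orlov handling above leans on a compact idiom: `if (skip && skip--)` passes over exactly `skip` candidate rgrps (the test short-circuits once the counter reaches zero), while the later `if (skip) continue;` keeps `loops` from incrementing when the scan wraps during the skipping phase. The counting idiom in isolation, assuming a plain loop over candidates:

    #include <stdio.h>

    int main(void)
    {
    	unsigned skip = 3;

    	for (int rg = 0; rg < 8; rg++) {
    		if (skip && skip--)
    			continue;	/* candidates 0..2 are passed over */
    		printf("trying rgrp %d\n", rg);	/* first hit: rg == 3 */
    	}
    	return 0;
    }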
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index 24077958dcf6..842185853f6b 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h | |||
@@ -39,7 +39,8 @@ extern void gfs2_rgrp_go_unlock(struct gfs2_holder *gh); | |||
39 | 39 | ||
40 | extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip); | 40 | extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip); |
41 | 41 | ||
42 | extern int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested); | 42 | #define GFS2_AF_ORLOV 1 |
43 | extern int gfs2_inplace_reserve(struct gfs2_inode *ip, u32 requested, u32 flags); | ||
43 | extern void gfs2_inplace_release(struct gfs2_inode *ip); | 44 | extern void gfs2_inplace_release(struct gfs2_inode *ip); |
44 | 45 | ||
45 | extern int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n, | 46 | extern int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n, |
diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h index bbdc78af60ca..2ee13e841e9f 100644 --- a/fs/gfs2/trace_gfs2.h +++ b/fs/gfs2/trace_gfs2.h | |||
@@ -486,7 +486,7 @@ TRACE_EVENT(gfs2_block_alloc, | |||
486 | ), | 486 | ), |
487 | 487 | ||
488 | TP_fast_assign( | 488 | TP_fast_assign( |
489 | __entry->dev = ip->i_gl->gl_sbd->sd_vfs->s_dev; | 489 | __entry->dev = rgd->rd_gl->gl_sbd->sd_vfs->s_dev; |
490 | __entry->start = block; | 490 | __entry->start = block; |
491 | __entry->inum = ip->i_no_addr; | 491 | __entry->inum = ip->i_no_addr; |
492 | __entry->len = len; | 492 | __entry->len = len; |
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index db330e5518cd..76c144b3c9bb 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c | |||
@@ -734,7 +734,7 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er, | |||
734 | if (error) | 734 | if (error) |
735 | return error; | 735 | return error; |
736 | 736 | ||
737 | error = gfs2_inplace_reserve(ip, blks); | 737 | error = gfs2_inplace_reserve(ip, blks, 0); |
738 | if (error) | 738 | if (error) |
739 | goto out_gunlock_q; | 739 | goto out_gunlock_q; |
740 | 740 | ||
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index c5bc355d8243..78bde32ea951 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c | |||
@@ -1,7 +1,7 @@ | |||
1 | /* | 1 | /* |
2 | * hugetlbpage-backed filesystem. Based on ramfs. | 2 | * hugetlbpage-backed filesystem. Based on ramfs. |
3 | * | 3 | * |
4 | * William Irwin, 2002 | 4 | * Nadia Yvette Chambers, 2002 |
5 | * | 5 | * |
6 | * Copyright (C) 2002 Linus Torvalds. | 6 | * Copyright (C) 2002 Linus Torvalds. |
7 | */ | 7 | */ |
@@ -151,8 +151,8 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, | |||
151 | { | 151 | { |
152 | struct mm_struct *mm = current->mm; | 152 | struct mm_struct *mm = current->mm; |
153 | struct vm_area_struct *vma; | 153 | struct vm_area_struct *vma; |
154 | unsigned long start_addr; | ||
155 | struct hstate *h = hstate_file(file); | 154 | struct hstate *h = hstate_file(file); |
155 | struct vm_unmapped_area_info info; | ||
156 | 156 | ||
157 | if (len & ~huge_page_mask(h)) | 157 | if (len & ~huge_page_mask(h)) |
158 | return -EINVAL; | 158 | return -EINVAL; |
@@ -173,39 +173,13 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, | |||
173 | return addr; | 173 | return addr; |
174 | } | 174 | } |
175 | 175 | ||
176 | if (len > mm->cached_hole_size) | 176 | info.flags = 0; |
177 | start_addr = mm->free_area_cache; | 177 | info.length = len; |
178 | else { | 178 | info.low_limit = TASK_UNMAPPED_BASE; |
179 | start_addr = TASK_UNMAPPED_BASE; | 179 | info.high_limit = TASK_SIZE; |
180 | mm->cached_hole_size = 0; | 180 | info.align_mask = PAGE_MASK & ~huge_page_mask(h); |
181 | } | 181 | info.align_offset = 0; |
182 | 182 | return vm_unmapped_area(&info); | |
183 | full_search: | ||
184 | addr = ALIGN(start_addr, huge_page_size(h)); | ||
185 | |||
186 | for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { | ||
187 | /* At this point: (!vma || addr < vma->vm_end). */ | ||
188 | if (TASK_SIZE - len < addr) { | ||
189 | /* | ||
190 | * Start a new search - just in case we missed | ||
191 | * some holes. | ||
192 | */ | ||
193 | if (start_addr != TASK_UNMAPPED_BASE) { | ||
194 | start_addr = TASK_UNMAPPED_BASE; | ||
195 | mm->cached_hole_size = 0; | ||
196 | goto full_search; | ||
197 | } | ||
198 | return -ENOMEM; | ||
199 | } | ||
200 | |||
201 | if (!vma || addr + len <= vma->vm_start) { | ||
202 | mm->free_area_cache = addr + len; | ||
203 | return addr; | ||
204 | } | ||
205 | if (addr + mm->cached_hole_size < vma->vm_start) | ||
206 | mm->cached_hole_size = vma->vm_start - addr; | ||
207 | addr = ALIGN(vma->vm_end, huge_page_size(h)); | ||
208 | } | ||
209 | } | 183 | } |
210 | #endif | 184 | #endif |
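The vm_unmapped_area() conversion above expresses the alignment requirement as a mask instead of a manual ALIGN() loop: align_mask selects the address bits that must be clear. Worked through for the common case of 4 KiB base pages and a 2 MiB huge page size (assumed here; the values differ per architecture):

    #include <stdio.h>

    int main(void)
    {
    	unsigned long page_mask = ~((1UL << 12) - 1);	/* PAGE_MASK       */
    	unsigned long huge_mask = ~((1UL << 21) - 1);	/* huge_page_mask() */
    	unsigned long align_mask = page_mask & ~huge_mask;

    	/* 0x1ff000: the bits that must be zero for an address to be
    	 * 2 MiB aligned while still being expressed in whole pages. */
    	printf("align_mask = %#lx\n", align_mask);
    	return 0;
    }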
211 | 185 | ||
@@ -608,11 +582,11 @@ static int hugetlbfs_migrate_page(struct address_space *mapping, | |||
608 | int rc; | 582 | int rc; |
609 | 583 | ||
610 | rc = migrate_huge_page_move_mapping(mapping, newpage, page); | 584 | rc = migrate_huge_page_move_mapping(mapping, newpage, page); |
611 | if (rc) | 585 | if (rc != MIGRATEPAGE_SUCCESS) |
612 | return rc; | 586 | return rc; |
613 | migrate_page_copy(newpage, page); | 587 | migrate_page_copy(newpage, page); |
614 | 588 | ||
615 | return 0; | 589 | return MIGRATEPAGE_SUCCESS; |
616 | } | 590 | } |
617 | 591 | ||
618 | static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf) | 592 | static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf) |
@@ -923,7 +897,7 @@ static struct file_system_type hugetlbfs_fs_type = { | |||
923 | .kill_sb = kill_litter_super, | 897 | .kill_sb = kill_litter_super, |
924 | }; | 898 | }; |
925 | 899 | ||
926 | static struct vfsmount *hugetlbfs_vfsmount; | 900 | static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE]; |
927 | 901 | ||
928 | static int can_do_hugetlb_shm(void) | 902 | static int can_do_hugetlb_shm(void) |
929 | { | 903 | { |
@@ -932,9 +906,22 @@ static int can_do_hugetlb_shm(void) | |||
932 | return capable(CAP_IPC_LOCK) || in_group_p(shm_group); | 906 | return capable(CAP_IPC_LOCK) || in_group_p(shm_group); |
933 | } | 907 | } |
934 | 908 | ||
909 | static int get_hstate_idx(int page_size_log) | ||
910 | { | ||
911 | struct hstate *h; | ||
912 | |||
913 | if (!page_size_log) | ||
914 | return default_hstate_idx; | ||
915 | h = size_to_hstate(1 << page_size_log); | ||
916 | if (!h) | ||
917 | return -1; | ||
918 | return h - hstates; | ||
919 | } | ||
920 | |||
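get_hstate_idx() above interprets page_size_log as the log2 of the requested huge page size, with 0 meaning the default hstate, matching the encoding callers pass down from flags like the SHM_HUGE_* values. A tiny sketch of the convention (the 2 MiB default is an assumption for illustration):

    /* Sketch only: maps page_size_log back to a byte count. */
    unsigned long page_size(int page_size_log)
    {
    	if (!page_size_log)
    		return 2UL << 20;	/* assume a 2 MiB default hstate */
    	return 1UL << page_size_log;	/* e.g. 21 -> 2 MiB, 30 -> 1 GiB */
    }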
935 | struct file *hugetlb_file_setup(const char *name, unsigned long addr, | 921 | struct file *hugetlb_file_setup(const char *name, unsigned long addr, |
936 | size_t size, vm_flags_t acctflag, | 922 | size_t size, vm_flags_t acctflag, |
937 | struct user_struct **user, int creat_flags) | 923 | struct user_struct **user, |
924 | int creat_flags, int page_size_log) | ||
938 | { | 925 | { |
939 | int error = -ENOMEM; | 926 | int error = -ENOMEM; |
940 | struct file *file; | 927 | struct file *file; |
@@ -944,9 +931,14 @@ struct file *hugetlb_file_setup(const char *name, unsigned long addr, | |||
944 | struct qstr quick_string; | 931 | struct qstr quick_string; |
945 | struct hstate *hstate; | 932 | struct hstate *hstate; |
946 | unsigned long num_pages; | 933 | unsigned long num_pages; |
934 | int hstate_idx; | ||
935 | |||
936 | hstate_idx = get_hstate_idx(page_size_log); | ||
937 | if (hstate_idx < 0) | ||
938 | return ERR_PTR(-ENODEV); | ||
947 | 939 | ||
948 | *user = NULL; | 940 | *user = NULL; |
949 | if (!hugetlbfs_vfsmount) | 941 | if (!hugetlbfs_vfsmount[hstate_idx]) |
950 | return ERR_PTR(-ENOENT); | 942 | return ERR_PTR(-ENOENT); |
951 | 943 | ||
952 | if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) { | 944 | if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) { |
@@ -963,7 +955,7 @@ struct file *hugetlb_file_setup(const char *name, unsigned long addr, | |||
963 | } | 955 | } |
964 | } | 956 | } |
965 | 957 | ||
966 | root = hugetlbfs_vfsmount->mnt_root; | 958 | root = hugetlbfs_vfsmount[hstate_idx]->mnt_root; |
967 | quick_string.name = name; | 959 | quick_string.name = name; |
968 | quick_string.len = strlen(quick_string.name); | 960 | quick_string.len = strlen(quick_string.name); |
969 | quick_string.hash = 0; | 961 | quick_string.hash = 0; |
@@ -971,7 +963,7 @@ struct file *hugetlb_file_setup(const char *name, unsigned long addr, | |||
971 | if (!path.dentry) | 963 | if (!path.dentry) |
972 | goto out_shm_unlock; | 964 | goto out_shm_unlock; |
973 | 965 | ||
974 | path.mnt = mntget(hugetlbfs_vfsmount); | 966 | path.mnt = mntget(hugetlbfs_vfsmount[hstate_idx]); |
975 | error = -ENOSPC; | 967 | error = -ENOSPC; |
976 | inode = hugetlbfs_get_inode(root->d_sb, NULL, S_IFREG | S_IRWXUGO, 0); | 968 | inode = hugetlbfs_get_inode(root->d_sb, NULL, S_IFREG | S_IRWXUGO, 0); |
977 | if (!inode) | 969 | if (!inode) |
@@ -1011,8 +1003,9 @@ out_shm_unlock: | |||
1011 | 1003 | ||
1012 | static int __init init_hugetlbfs_fs(void) | 1004 | static int __init init_hugetlbfs_fs(void) |
1013 | { | 1005 | { |
1006 | struct hstate *h; | ||
1014 | int error; | 1007 | int error; |
1015 | struct vfsmount *vfsmount; | 1008 | int i; |
1016 | 1009 | ||
1017 | error = bdi_init(&hugetlbfs_backing_dev_info); | 1010 | error = bdi_init(&hugetlbfs_backing_dev_info); |
1018 | if (error) | 1011 | if (error) |
@@ -1029,14 +1022,26 @@ static int __init init_hugetlbfs_fs(void) | |||
1029 | if (error) | 1022 | if (error) |
1030 | goto out; | 1023 | goto out; |
1031 | 1024 | ||
1032 | vfsmount = kern_mount(&hugetlbfs_fs_type); | 1025 | i = 0; |
1026 | for_each_hstate(h) { | ||
1027 | char buf[50]; | ||
1028 | unsigned ps_kb = 1U << (h->order + PAGE_SHIFT - 10); | ||
1033 | 1029 | ||
1034 | if (!IS_ERR(vfsmount)) { | 1030 | snprintf(buf, sizeof(buf), "pagesize=%uK", ps_kb); |
1035 | hugetlbfs_vfsmount = vfsmount; | 1031 | hugetlbfs_vfsmount[i] = kern_mount_data(&hugetlbfs_fs_type, |
1036 | return 0; | 1032 | buf); |
1037 | } | ||
1038 | 1033 | ||
1039 | error = PTR_ERR(vfsmount); | 1034 | if (IS_ERR(hugetlbfs_vfsmount[i])) { |
1035 | pr_err("hugetlb: Cannot mount internal hugetlbfs for " | ||
1036 | "page size %uK", ps_kb); | ||
1037 | error = PTR_ERR(hugetlbfs_vfsmount[i]); | ||
1038 | hugetlbfs_vfsmount[i] = NULL; | ||
1039 | } | ||
1040 | i++; | ||
1041 | } | ||
1042 | /* Non default hstates are optional */ | ||
1043 | if (!IS_ERR_OR_NULL(hugetlbfs_vfsmount[default_hstate_idx])) | ||
1044 | return 0; | ||
1040 | 1045 | ||
1041 | out: | 1046 | out: |
1042 | kmem_cache_destroy(hugetlbfs_inode_cachep); | 1047 | kmem_cache_destroy(hugetlbfs_inode_cachep); |
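Each internal mount in the loop above is tagged with a pagesize= option derived from the hstate's order: 1U << (order + PAGE_SHIFT - 10) converts the page size to KiB. Worked through for an assumed 2 MiB hstate on a 4 KiB base page (order 9, PAGE_SHIFT 12):

    #include <stdio.h>

    int main(void)
    {
    	unsigned order = 9, page_shift = 12;	/* assumed 2 MiB hstate */
    	char buf[50];

    	snprintf(buf, sizeof(buf), "pagesize=%uK",
    		 1U << (order + page_shift - 10));
    	printf("%s\n", buf);	/* -> pagesize=2048K */
    	return 0;
    }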
@@ -1047,13 +1052,19 @@ static int __init init_hugetlbfs_fs(void) | |||
1047 | 1052 | ||
1048 | static void __exit exit_hugetlbfs_fs(void) | 1053 | static void __exit exit_hugetlbfs_fs(void) |
1049 | { | 1054 | { |
1055 | struct hstate *h; | ||
1056 | int i; | ||
1057 | |||
1058 | |||
1050 | /* | 1059 | /* |
1051 | * Make sure all delayed rcu free inodes are flushed before we | 1060 | * Make sure all delayed rcu free inodes are flushed before we |
1052 | * destroy cache. | 1061 | * destroy cache. |
1053 | */ | 1062 | */ |
1054 | rcu_barrier(); | 1063 | rcu_barrier(); |
1055 | kmem_cache_destroy(hugetlbfs_inode_cachep); | 1064 | kmem_cache_destroy(hugetlbfs_inode_cachep); |
1056 | kern_unmount(hugetlbfs_vfsmount); | 1065 | i = 0; |
1066 | for_each_hstate(h) | ||
1067 | kern_unmount(hugetlbfs_vfsmount[i++]); | ||
1057 | unregister_filesystem(&hugetlbfs_fs_type); | 1068 | unregister_filesystem(&hugetlbfs_fs_type); |
1058 | bdi_destroy(&hugetlbfs_backing_dev_info); | 1069 | bdi_destroy(&hugetlbfs_backing_dev_info); |
1059 | } | 1070 | } |
diff --git a/fs/inode.c b/fs/inode.c index b03c71957246..14084b72b259 100644 --- a/fs/inode.c +++ b/fs/inode.c | |||
@@ -165,7 +165,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode) | |||
165 | mapping->host = inode; | 165 | mapping->host = inode; |
166 | mapping->flags = 0; | 166 | mapping->flags = 0; |
167 | mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE); | 167 | mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE); |
168 | mapping->assoc_mapping = NULL; | 168 | mapping->private_data = NULL; |
169 | mapping->backing_dev_info = &default_backing_dev_info; | 169 | mapping->backing_dev_info = &default_backing_dev_info; |
170 | mapping->writeback_index = 0; | 170 | mapping->writeback_index = 0; |
171 | 171 | ||
@@ -408,6 +408,19 @@ static void inode_lru_list_add(struct inode *inode) | |||
408 | spin_unlock(&inode->i_sb->s_inode_lru_lock); | 408 | spin_unlock(&inode->i_sb->s_inode_lru_lock); |
409 | } | 409 | } |
410 | 410 | ||
411 | /* | ||
412 | * Add inode to LRU if needed (inode is unused and clean). | ||
413 | * | ||
414 | * Needs inode->i_lock held. | ||
415 | */ | ||
416 | void inode_add_lru(struct inode *inode) | ||
417 | { | ||
418 | if (!(inode->i_state & (I_DIRTY | I_SYNC | I_FREEING | I_WILL_FREE)) && | ||
419 | !atomic_read(&inode->i_count) && inode->i_sb->s_flags & MS_ACTIVE) | ||
420 | inode_lru_list_add(inode); | ||
421 | } | ||
422 | |||
423 | |||
411 | static void inode_lru_list_del(struct inode *inode) | 424 | static void inode_lru_list_del(struct inode *inode) |
412 | { | 425 | { |
413 | spin_lock(&inode->i_sb->s_inode_lru_lock); | 426 | spin_lock(&inode->i_sb->s_inode_lru_lock); |
@@ -1390,8 +1403,7 @@ static void iput_final(struct inode *inode) | |||
1390 | 1403 | ||
1391 | if (!drop && (sb->s_flags & MS_ACTIVE)) { | 1404 | if (!drop && (sb->s_flags & MS_ACTIVE)) { |
1392 | inode->i_state |= I_REFERENCED; | 1405 | inode->i_state |= I_REFERENCED; |
1393 | if (!(inode->i_state & (I_DIRTY|I_SYNC))) | 1406 | inode_add_lru(inode); |
1394 | inode_lru_list_add(inode); | ||
1395 | spin_unlock(&inode->i_lock); | 1407 | spin_unlock(&inode->i_lock); |
1396 | return; | 1408 | return; |
1397 | } | 1409 | } |
diff --git a/fs/internal.h b/fs/internal.h index 916b7cbf3e3e..2f6af7f645eb 100644 --- a/fs/internal.h +++ b/fs/internal.h | |||
@@ -110,6 +110,7 @@ extern int open_check_o_direct(struct file *f); | |||
110 | * inode.c | 110 | * inode.c |
111 | */ | 111 | */ |
112 | extern spinlock_t inode_sb_list_lock; | 112 | extern spinlock_t inode_sb_list_lock; |
113 | extern void inode_add_lru(struct inode *inode); | ||
113 | 114 | ||
114 | /* | 115 | /* |
115 | * fs-writeback.c | 116 | * fs-writeback.c |
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index 78b7f84241d4..071d6905f0dd 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c | |||
@@ -1259,7 +1259,7 @@ int journal_forget (handle_t *handle, struct buffer_head *bh) | |||
1259 | goto not_jbd; | 1259 | goto not_jbd; |
1260 | } | 1260 | } |
1261 | 1261 | ||
1262 | /* keep track of wether or not this transaction modified us */ | 1262 | /* keep track of whether or not this transaction modified us */ |
1263 | was_modified = jh->b_modified; | 1263 | was_modified = jh->b_modified; |
1264 | 1264 | ||
1265 | /* | 1265 | /* |
@@ -1961,7 +1961,9 @@ retry: | |||
1961 | spin_unlock(&journal->j_list_lock); | 1961 | spin_unlock(&journal->j_list_lock); |
1962 | jbd_unlock_bh_state(bh); | 1962 | jbd_unlock_bh_state(bh); |
1963 | spin_unlock(&journal->j_state_lock); | 1963 | spin_unlock(&journal->j_state_lock); |
1964 | unlock_buffer(bh); | ||
1964 | log_wait_commit(journal, tid); | 1965 | log_wait_commit(journal, tid); |
1966 | lock_buffer(bh); | ||
1965 | goto retry; | 1967 | goto retry; |
1966 | } | 1968 | } |
1967 | /* | 1969 | /* |
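The jbd fix above is the classic drop-wait-retake pattern: log_wait_commit() can sleep for a long time, so the buffer lock is released around it and the state is re-validated afterwards through the existing retry label. Stated generically in userspace terms (pthreads standing in for the kernel primitives, purely as a sketch):

    #include <pthread.h>

    void wait_without_lock(pthread_mutex_t *lock, void (*blocking_wait)(void))
    {
    	pthread_mutex_unlock(lock);	/* never sleep with the lock held */
    	blocking_wait();		/* e.g. log_wait_commit() */
    	pthread_mutex_lock(lock);	/* caller must recheck state now */
    }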
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index a74ba4659549..d8da40e99d84 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c | |||
@@ -1261,7 +1261,7 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) | |||
1261 | goto not_jbd; | 1261 | goto not_jbd; |
1262 | } | 1262 | } |
1263 | 1263 | ||
1264 | /* keep track of wether or not this transaction modified us */ | 1264 | /* keep track of whether or not this transaction modified us */ |
1265 | was_modified = jh->b_modified; | 1265 | was_modified = jh->b_modified; |
1266 | 1266 | ||
1267 | /* | 1267 | /* |
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index 60ef3fb707ff..1506673c087e 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c | |||
@@ -138,33 +138,39 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, | |||
138 | struct page *pg; | 138 | struct page *pg; |
139 | struct inode *inode = mapping->host; | 139 | struct inode *inode = mapping->host; |
140 | struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode); | 140 | struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode); |
141 | struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); | ||
142 | struct jffs2_raw_inode ri; | ||
143 | uint32_t alloc_len = 0; | ||
141 | pgoff_t index = pos >> PAGE_CACHE_SHIFT; | 144 | pgoff_t index = pos >> PAGE_CACHE_SHIFT; |
142 | uint32_t pageofs = index << PAGE_CACHE_SHIFT; | 145 | uint32_t pageofs = index << PAGE_CACHE_SHIFT; |
143 | int ret = 0; | 146 | int ret = 0; |
144 | 147 | ||
148 | jffs2_dbg(1, "%s()\n", __func__); | ||
149 | |||
150 | if (pageofs > inode->i_size) { | ||
151 | ret = jffs2_reserve_space(c, sizeof(ri), &alloc_len, | ||
152 | ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE); | ||
153 | if (ret) | ||
154 | return ret; | ||
155 | } | ||
156 | |||
157 | mutex_lock(&f->sem); | ||
145 | pg = grab_cache_page_write_begin(mapping, index, flags); | 158 | pg = grab_cache_page_write_begin(mapping, index, flags); |
146 | if (!pg) | 159 | if (!pg) { |
160 | if (alloc_len) | ||
161 | jffs2_complete_reservation(c); | ||
162 | mutex_unlock(&f->sem); | ||
147 | return -ENOMEM; | 163 | return -ENOMEM; |
164 | } | ||
148 | *pagep = pg; | 165 | *pagep = pg; |
149 | 166 | ||
150 | jffs2_dbg(1, "%s()\n", __func__); | 167 | if (alloc_len) { |
151 | |||
152 | if (pageofs > inode->i_size) { | ||
153 | /* Make new hole frag from old EOF to new page */ | 168 | /* Make new hole frag from old EOF to new page */ |
154 | struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); | ||
155 | struct jffs2_raw_inode ri; | ||
156 | struct jffs2_full_dnode *fn; | 169 | struct jffs2_full_dnode *fn; |
157 | uint32_t alloc_len; | ||
158 | 170 | ||
159 | jffs2_dbg(1, "Writing new hole frag 0x%x-0x%x between current EOF and new page\n", | 171 | jffs2_dbg(1, "Writing new hole frag 0x%x-0x%x between current EOF and new page\n", |
160 | (unsigned int)inode->i_size, pageofs); | 172 | (unsigned int)inode->i_size, pageofs); |
161 | 173 | ||
162 | ret = jffs2_reserve_space(c, sizeof(ri), &alloc_len, | ||
163 | ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE); | ||
164 | if (ret) | ||
165 | goto out_page; | ||
166 | |||
167 | mutex_lock(&f->sem); | ||
168 | memset(&ri, 0, sizeof(ri)); | 174 | memset(&ri, 0, sizeof(ri)); |
169 | 175 | ||
170 | ri.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); | 176 | ri.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); |
@@ -191,7 +197,6 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, | |||
191 | if (IS_ERR(fn)) { | 197 | if (IS_ERR(fn)) { |
192 | ret = PTR_ERR(fn); | 198 | ret = PTR_ERR(fn); |
193 | jffs2_complete_reservation(c); | 199 | jffs2_complete_reservation(c); |
194 | mutex_unlock(&f->sem); | ||
195 | goto out_page; | 200 | goto out_page; |
196 | } | 201 | } |
197 | ret = jffs2_add_full_dnode_to_inode(c, f, fn); | 202 | ret = jffs2_add_full_dnode_to_inode(c, f, fn); |
@@ -206,12 +211,10 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, | |||
206 | jffs2_mark_node_obsolete(c, fn->raw); | 211 | jffs2_mark_node_obsolete(c, fn->raw); |
207 | jffs2_free_full_dnode(fn); | 212 | jffs2_free_full_dnode(fn); |
208 | jffs2_complete_reservation(c); | 213 | jffs2_complete_reservation(c); |
209 | mutex_unlock(&f->sem); | ||
210 | goto out_page; | 214 | goto out_page; |
211 | } | 215 | } |
212 | jffs2_complete_reservation(c); | 216 | jffs2_complete_reservation(c); |
213 | inode->i_size = pageofs; | 217 | inode->i_size = pageofs; |
214 | mutex_unlock(&f->sem); | ||
215 | } | 218 | } |
216 | 219 | ||
217 | /* | 220 | /* |
@@ -220,18 +223,18 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, | |||
220 | * case of a short-copy. | 223 | * case of a short-copy. |
221 | */ | 224 | */ |
222 | if (!PageUptodate(pg)) { | 225 | if (!PageUptodate(pg)) { |
223 | mutex_lock(&f->sem); | ||
224 | ret = jffs2_do_readpage_nolock(inode, pg); | 226 | ret = jffs2_do_readpage_nolock(inode, pg); |
225 | mutex_unlock(&f->sem); | ||
226 | if (ret) | 227 | if (ret) |
227 | goto out_page; | 228 | goto out_page; |
228 | } | 229 | } |
230 | mutex_unlock(&f->sem); | ||
229 | jffs2_dbg(1, "end write_begin(). pg->flags %lx\n", pg->flags); | 231 | jffs2_dbg(1, "end write_begin(). pg->flags %lx\n", pg->flags); |
230 | return ret; | 232 | return ret; |
231 | 233 | ||
232 | out_page: | 234 | out_page: |
233 | unlock_page(pg); | 235 | unlock_page(pg); |
234 | page_cache_release(pg); | 236 | page_cache_release(pg); |
237 | mutex_unlock(&f->sem); | ||
235 | return ret; | 238 | return ret; |
236 | } | 239 | } |
237 | 240 | ||
diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c index adb90116d36b..af49e2d6941a 100644 --- a/fs/logfs/inode.c +++ b/fs/logfs/inode.c | |||
@@ -33,7 +33,7 @@ | |||
33 | * are being written out - and waiting for GC to make progress, naturally. | 33 | * are being written out - and waiting for GC to make progress, naturally. |
34 | * | 34 | * |
35 | * So we cannot just call iget() or some variant of it, but first have to check | 35 | * So we cannot just call iget() or some variant of it, but first have to check |
36 | * wether the inode in question might be in I_FREEING state. Therefore we | 36 | * whether the inode in question might be in I_FREEING state. Therefore we |
37 | * maintain our own per-sb list of "almost deleted" inodes and check against | 37 | * maintain our own per-sb list of "almost deleted" inodes and check against |
38 | * that list first. Normally this should be at most 1-2 entries long. | 38 | * that list first. Normally this should be at most 1-2 entries long. |
39 | * | 39 | * |
diff --git a/fs/namei.c b/fs/namei.c index 937f9d50c84b..5f4cdf3ad913 100644 --- a/fs/namei.c +++ b/fs/namei.c | |||
@@ -2131,6 +2131,11 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) | |||
2131 | if (!len) | 2131 | if (!len) |
2132 | return ERR_PTR(-EACCES); | 2132 | return ERR_PTR(-EACCES); |
2133 | 2133 | ||
2134 | if (unlikely(name[0] == '.')) { | ||
2135 | if (len < 2 || (len == 2 && name[1] == '.')) | ||
2136 | return ERR_PTR(-EACCES); | ||
2137 | } | ||
2138 | |||
2134 | while (len--) { | 2139 | while (len--) { |
2135 | c = *(const unsigned char *)name++; | 2140 | c = *(const unsigned char *)name++; |
2136 | if (c == '/' || c == '\0') | 2141 | if (c == '/' || c == '\0') |
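The new guard in lookup_one_len() above rejects the names "." and ".." outright, since a single-component lookup helper has no business resolving them; len is already known to be non-zero at that point. The condition in isolation, under the same precondition:

    #include <stdbool.h>

    /* Assumes len >= 1, as in lookup_one_len() after its length check. */
    static bool is_dot_or_dotdot(const char *name, int len)
    {
    	return name[0] == '.' &&
    	       (len == 1 || (len == 2 && name[1] == '.'));
    }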
diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c index be20a7e171a0..63d14a99483d 100644 --- a/fs/ncpfs/mmap.c +++ b/fs/ncpfs/mmap.c | |||
@@ -89,7 +89,7 @@ static int ncp_file_mmap_fault(struct vm_area_struct *area, | |||
89 | /* | 89 | /* |
90 | * If I understand ncp_read_kernel() properly, the above always | 90 | * If I understand ncp_read_kernel() properly, the above always |
91 | * fetches from the network, here the analogue of disk. | 91 | * fetches from the network, here the analogue of disk. |
92 | * -- wli | 92 | * -- nyc |
93 | */ | 93 | */ |
94 | count_vm_event(PGMAJFAULT); | 94 | count_vm_event(PGMAJFAULT); |
95 | mem_cgroup_count_vm_event(area->vm_mm, PGMAJFAULT); | 95 | mem_cgroup_count_vm_event(area->vm_mm, PGMAJFAULT); |
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index ce8cb926526b..b9e66b7e0c14 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c | |||
@@ -450,7 +450,8 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry) | |||
450 | nfs_refresh_inode(dentry->d_inode, entry->fattr); | 450 | nfs_refresh_inode(dentry->d_inode, entry->fattr); |
451 | goto out; | 451 | goto out; |
452 | } else { | 452 | } else { |
453 | d_drop(dentry); | 453 | if (d_invalidate(dentry) != 0) |
454 | goto out; | ||
454 | dput(dentry); | 455 | dput(dentry); |
455 | } | 456 | } |
456 | } | 457 | } |
@@ -1100,6 +1101,8 @@ out_set_verifier: | |||
1100 | out_zap_parent: | 1101 | out_zap_parent: |
1101 | nfs_zap_caches(dir); | 1102 | nfs_zap_caches(dir); |
1102 | out_bad: | 1103 | out_bad: |
1104 | nfs_free_fattr(fattr); | ||
1105 | nfs_free_fhandle(fhandle); | ||
1103 | nfs_mark_for_revalidate(dir); | 1106 | nfs_mark_for_revalidate(dir); |
1104 | if (inode && S_ISDIR(inode->i_mode)) { | 1107 | if (inode && S_ISDIR(inode->i_mode)) { |
1105 | /* Purge readdir caches. */ | 1108 | /* Purge readdir caches. */ |
@@ -1112,8 +1115,6 @@ out_zap_parent: | |||
1112 | shrink_dcache_parent(dentry); | 1115 | shrink_dcache_parent(dentry); |
1113 | } | 1116 | } |
1114 | d_drop(dentry); | 1117 | d_drop(dentry); |
1115 | nfs_free_fattr(fattr); | ||
1116 | nfs_free_fhandle(fhandle); | ||
1117 | dput(parent); | 1118 | dput(parent); |
1118 | dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is invalid\n", | 1119 | dfprintk(LOOKUPCACHE, "NFS: %s(%s/%s) is invalid\n", |
1119 | __func__, dentry->d_parent->d_name.name, | 1120 | __func__, dentry->d_parent->d_name.name, |
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 3e7b2a0dc0c8..07f76db04ec7 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c | |||
@@ -431,7 +431,7 @@ void nilfs_mapping_init(struct address_space *mapping, struct inode *inode, | |||
431 | mapping->host = inode; | 431 | mapping->host = inode; |
432 | mapping->flags = 0; | 432 | mapping->flags = 0; |
433 | mapping_set_gfp_mask(mapping, GFP_NOFS); | 433 | mapping_set_gfp_mask(mapping, GFP_NOFS); |
434 | mapping->assoc_mapping = NULL; | 434 | mapping->private_data = NULL; |
435 | mapping->backing_dev_info = bdi; | 435 | mapping->backing_dev_info = bdi; |
436 | mapping->a_ops = &empty_aops; | 436 | mapping->a_ops = &empty_aops; |
437 | } | 437 | } |
diff --git a/fs/notify/fanotify/Kconfig b/fs/notify/fanotify/Kconfig index 7dceff005a67..e5f911bd80d2 100644 --- a/fs/notify/fanotify/Kconfig +++ b/fs/notify/fanotify/Kconfig | |||
@@ -4,7 +4,7 @@ config FANOTIFY | |||
4 | select ANON_INODES | 4 | select ANON_INODES |
5 | default n | 5 | default n |
6 | ---help--- | 6 | ---help--- |
7 | Say Y here to enable fanotify suport. fanotify is a file access | 7 | Say Y here to enable fanotify support. fanotify is a file access |
8 | notification system which differs from inotify in that it sends | 8 | notification system which differs from inotify in that it sends |
9 | an open file descriptor to the userspace listener along with | 9 | an open file descriptor to the userspace listener along with |
10 | the event. | 10 | the event. |
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 721d692fa8d4..6fcaeb8c902e 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c | |||
@@ -258,7 +258,8 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, | |||
258 | if (ret) | 258 | if (ret) |
259 | goto out_close_fd; | 259 | goto out_close_fd; |
260 | 260 | ||
261 | fd_install(fd, f); | 261 | if (fd != FAN_NOFD) |
262 | fd_install(fd, f); | ||
262 | return fanotify_event_metadata.event_len; | 263 | return fanotify_event_metadata.event_len; |
263 | 264 | ||
264 | out_close_fd: | 265 | out_close_fd: |
diff --git a/fs/notify/notification.c b/fs/notify/notification.c index c887b1378f7e..48cb994e4922 100644 --- a/fs/notify/notification.c +++ b/fs/notify/notification.c | |||
@@ -18,7 +18,7 @@ | |||
18 | 18 | ||
19 | /* | 19 | /* |
20 | * Basic idea behind the notification queue: An fsnotify group (like inotify) | 20 | * Basic idea behind the notification queue: An fsnotify group (like inotify) |
21 | * sends the userspace notification about events asyncronously some time after | 21 | * sends the userspace notification about events asynchronously some time after |
22 | * the event happened. When inotify gets an event it will need to add that | 22 | * the event happened. When inotify gets an event it will need to add that |
23 | * event to the group notify queue. Since a single event might need to be on | 23 | * event to the group notify queue. Since a single event might need to be on |
24 | * multiple group's notification queues we can't add the event directly to each | 24 | * multiple group's notification queues we can't add the event directly to each |
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 5a4ee77cec51..dda089804942 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c | |||
@@ -2513,18 +2513,15 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe, | |||
2513 | ret = sd.num_spliced; | 2513 | ret = sd.num_spliced; |
2514 | 2514 | ||
2515 | if (ret > 0) { | 2515 | if (ret > 0) { |
2516 | unsigned long nr_pages; | ||
2517 | int err; | 2516 | int err; |
2518 | 2517 | ||
2519 | nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | ||
2520 | |||
2521 | err = generic_write_sync(out, *ppos, ret); | 2518 | err = generic_write_sync(out, *ppos, ret); |
2522 | if (err) | 2519 | if (err) |
2523 | ret = err; | 2520 | ret = err; |
2524 | else | 2521 | else |
2525 | *ppos += ret; | 2522 | *ppos += ret; |
2526 | 2523 | ||
2527 | balance_dirty_pages_ratelimited_nr(mapping, nr_pages); | 2524 | balance_dirty_pages_ratelimited(mapping); |
2528 | } | 2525 | } |
2529 | 2526 | ||
2530 | return ret; | 2527 | return ret; |
diff --git a/fs/proc/array.c b/fs/proc/array.c index c1c207c36cae..d3696708fc1a 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c | |||
@@ -438,7 +438,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, | |||
438 | 438 | ||
439 | min_flt += sig->min_flt; | 439 | min_flt += sig->min_flt; |
440 | maj_flt += sig->maj_flt; | 440 | maj_flt += sig->maj_flt; |
441 | thread_group_times(task, &utime, &stime); | 441 | thread_group_cputime_adjusted(task, &utime, &stime); |
442 | gtime += sig->gtime; | 442 | gtime += sig->gtime; |
443 | } | 443 | } |
444 | 444 | ||
@@ -454,7 +454,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, | |||
454 | if (!whole) { | 454 | if (!whole) { |
455 | min_flt = task->min_flt; | 455 | min_flt = task->min_flt; |
456 | maj_flt = task->maj_flt; | 456 | maj_flt = task->maj_flt; |
457 | task_times(task, &utime, &stime); | 457 | task_cputime_adjusted(task, &utime, &stime); |
458 | gtime = task->gtime; | 458 | gtime = task->gtime; |
459 | } | 459 | } |
460 | 460 | ||
diff --git a/fs/proc/base.c b/fs/proc/base.c index 144a96732dd7..aa63d25157b8 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c | |||
@@ -873,12 +873,119 @@ static const struct file_operations proc_environ_operations = { | |||
873 | .release = mem_release, | 873 | .release = mem_release, |
874 | }; | 874 | }; |
875 | 875 | ||
876 | static ssize_t oom_adj_read(struct file *file, char __user *buf, size_t count, | ||
877 | loff_t *ppos) | ||
878 | { | ||
879 | struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); | ||
880 | char buffer[PROC_NUMBUF]; | ||
881 | int oom_adj = OOM_ADJUST_MIN; | ||
882 | size_t len; | ||
883 | unsigned long flags; | ||
884 | |||
885 | if (!task) | ||
886 | return -ESRCH; | ||
887 | if (lock_task_sighand(task, &flags)) { | ||
888 | if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MAX) | ||
889 | oom_adj = OOM_ADJUST_MAX; | ||
890 | else | ||
891 | oom_adj = (task->signal->oom_score_adj * -OOM_DISABLE) / | ||
892 | OOM_SCORE_ADJ_MAX; | ||
893 | unlock_task_sighand(task, &flags); | ||
894 | } | ||
895 | put_task_struct(task); | ||
896 | len = snprintf(buffer, sizeof(buffer), "%d\n", oom_adj); | ||
897 | return simple_read_from_buffer(buf, count, ppos, buffer, len); | ||
898 | } | ||
899 | |||
900 | static ssize_t oom_adj_write(struct file *file, const char __user *buf, | ||
901 | size_t count, loff_t *ppos) | ||
902 | { | ||
903 | struct task_struct *task; | ||
904 | char buffer[PROC_NUMBUF]; | ||
905 | int oom_adj; | ||
906 | unsigned long flags; | ||
907 | int err; | ||
908 | |||
909 | memset(buffer, 0, sizeof(buffer)); | ||
910 | if (count > sizeof(buffer) - 1) | ||
911 | count = sizeof(buffer) - 1; | ||
912 | if (copy_from_user(buffer, buf, count)) { | ||
913 | err = -EFAULT; | ||
914 | goto out; | ||
915 | } | ||
916 | |||
917 | err = kstrtoint(strstrip(buffer), 0, &oom_adj); | ||
918 | if (err) | ||
919 | goto out; | ||
920 | if ((oom_adj < OOM_ADJUST_MIN || oom_adj > OOM_ADJUST_MAX) && | ||
921 | oom_adj != OOM_DISABLE) { | ||
922 | err = -EINVAL; | ||
923 | goto out; | ||
924 | } | ||
925 | |||
926 | task = get_proc_task(file->f_path.dentry->d_inode); | ||
927 | if (!task) { | ||
928 | err = -ESRCH; | ||
929 | goto out; | ||
930 | } | ||
931 | |||
932 | task_lock(task); | ||
933 | if (!task->mm) { | ||
934 | err = -EINVAL; | ||
935 | goto err_task_lock; | ||
936 | } | ||
937 | |||
938 | if (!lock_task_sighand(task, &flags)) { | ||
939 | err = -ESRCH; | ||
940 | goto err_task_lock; | ||
941 | } | ||
942 | |||
943 | /* | ||
944 | * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum | ||
945 | * value is always attainable. | ||
946 | */ | ||
947 | if (oom_adj == OOM_ADJUST_MAX) | ||
948 | oom_adj = OOM_SCORE_ADJ_MAX; | ||
949 | else | ||
950 | oom_adj = (oom_adj * OOM_SCORE_ADJ_MAX) / -OOM_DISABLE; | ||
951 | |||
952 | if (oom_adj < task->signal->oom_score_adj && | ||
953 | !capable(CAP_SYS_RESOURCE)) { | ||
954 | err = -EACCES; | ||
955 | goto err_sighand; | ||
956 | } | ||
957 | |||
958 | /* | ||
959 | * /proc/pid/oom_adj is provided for legacy purposes, ask users to use | ||
960 | * /proc/pid/oom_score_adj instead. | ||
961 | */ | ||
962 | printk_once(KERN_WARNING "%s (%d): /proc/%d/oom_adj is deprecated, please use /proc/%d/oom_score_adj instead.\n", | ||
963 | current->comm, task_pid_nr(current), task_pid_nr(task), | ||
964 | task_pid_nr(task)); | ||
965 | |||
966 | task->signal->oom_score_adj = oom_adj; | ||
967 | trace_oom_score_adj_update(task); | ||
968 | err_sighand: | ||
969 | unlock_task_sighand(task, &flags); | ||
970 | err_task_lock: | ||
971 | task_unlock(task); | ||
972 | put_task_struct(task); | ||
973 | out: | ||
974 | return err < 0 ? err : count; | ||
975 | } | ||
976 | |||
977 | static const struct file_operations proc_oom_adj_operations = { | ||
978 | .read = oom_adj_read, | ||
979 | .write = oom_adj_write, | ||
980 | .llseek = generic_file_llseek, | ||
981 | }; | ||
982 | |||
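The legacy file above maps between the old oom_adj range (-16..15, with -17 as OOM_DISABLE) and oom_score_adj (-1000..1000) by linear scaling with integer division, so round trips are lossy. Worked through with the historical constants:

    #include <stdio.h>

    int main(void)
    {
    	int oom_adj = 8;
    	int score = oom_adj * 1000 / 17;	/* write path: 470 */
    	int back  = score * 17 / 1000;		/* read path: 7, not 8 */

    	/* Integer truncation makes the mapping lossy, one reason the
    	 * interface is deprecated in favour of oom_score_adj itself. */
    	printf("oom_adj %d -> oom_score_adj %d -> oom_adj %d\n",
    	       oom_adj, score, back);
    	return 0;
    }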
876 | static ssize_t oom_score_adj_read(struct file *file, char __user *buf, | 983 | static ssize_t oom_score_adj_read(struct file *file, char __user *buf, |
877 | size_t count, loff_t *ppos) | 984 | size_t count, loff_t *ppos) |
878 | { | 985 | { |
879 | struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); | 986 | struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); |
880 | char buffer[PROC_NUMBUF]; | 987 | char buffer[PROC_NUMBUF]; |
881 | int oom_score_adj = OOM_SCORE_ADJ_MIN; | 988 | short oom_score_adj = OOM_SCORE_ADJ_MIN; |
882 | unsigned long flags; | 989 | unsigned long flags; |
883 | size_t len; | 990 | size_t len; |
884 | 991 | ||
@@ -889,7 +996,7 @@ static ssize_t oom_score_adj_read(struct file *file, char __user *buf, | |||
889 | unlock_task_sighand(task, &flags); | 996 | unlock_task_sighand(task, &flags); |
890 | } | 997 | } |
891 | put_task_struct(task); | 998 | put_task_struct(task); |
892 | len = snprintf(buffer, sizeof(buffer), "%d\n", oom_score_adj); | 999 | len = snprintf(buffer, sizeof(buffer), "%hd\n", oom_score_adj); |
893 | return simple_read_from_buffer(buf, count, ppos, buffer, len); | 1000 | return simple_read_from_buffer(buf, count, ppos, buffer, len); |
894 | } | 1001 | } |
895 | 1002 | ||
@@ -936,15 +1043,15 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, | |||
936 | goto err_task_lock; | 1043 | goto err_task_lock; |
937 | } | 1044 | } |
938 | 1045 | ||
939 | if (oom_score_adj < task->signal->oom_score_adj_min && | 1046 | if ((short)oom_score_adj < task->signal->oom_score_adj_min && |
940 | !capable(CAP_SYS_RESOURCE)) { | 1047 | !capable(CAP_SYS_RESOURCE)) { |
941 | err = -EACCES; | 1048 | err = -EACCES; |
942 | goto err_sighand; | 1049 | goto err_sighand; |
943 | } | 1050 | } |
944 | 1051 | ||
945 | task->signal->oom_score_adj = oom_score_adj; | 1052 | task->signal->oom_score_adj = (short)oom_score_adj; |
946 | if (has_capability_noaudit(current, CAP_SYS_RESOURCE)) | 1053 | if (has_capability_noaudit(current, CAP_SYS_RESOURCE)) |
947 | task->signal->oom_score_adj_min = oom_score_adj; | 1054 | task->signal->oom_score_adj_min = (short)oom_score_adj; |
948 | trace_oom_score_adj_update(task); | 1055 | trace_oom_score_adj_update(task); |
949 | 1056 | ||
950 | err_sighand: | 1057 | err_sighand: |
@@ -1770,8 +1877,9 @@ static struct dentry *proc_map_files_lookup(struct inode *dir, | |||
1770 | if (!vma) | 1877 | if (!vma) |
1771 | goto out_no_vma; | 1878 | goto out_no_vma; |
1772 | 1879 | ||
1773 | result = proc_map_files_instantiate(dir, dentry, task, | 1880 | if (vma->vm_file) |
1774 | (void *)(unsigned long)vma->vm_file->f_mode); | 1881 | result = proc_map_files_instantiate(dir, dentry, task, |
1882 | (void *)(unsigned long)vma->vm_file->f_mode); | ||
1775 | 1883 | ||
1776 | out_no_vma: | 1884 | out_no_vma: |
1777 | up_read(&mm->mmap_sem); | 1885 | up_read(&mm->mmap_sem); |
@@ -2598,6 +2706,7 @@ static const struct pid_entry tgid_base_stuff[] = { | |||
2598 | REG("cgroup", S_IRUGO, proc_cgroup_operations), | 2706 | REG("cgroup", S_IRUGO, proc_cgroup_operations), |
2599 | #endif | 2707 | #endif |
2600 | INF("oom_score", S_IRUGO, proc_oom_score), | 2708 | INF("oom_score", S_IRUGO, proc_oom_score), |
2709 | REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations), | ||
2601 | REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), | 2710 | REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), |
2602 | #ifdef CONFIG_AUDITSYSCALL | 2711 | #ifdef CONFIG_AUDITSYSCALL |
2603 | REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), | 2712 | REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), |
@@ -2964,6 +3073,7 @@ static const struct pid_entry tid_base_stuff[] = { | |||
2964 | REG("cgroup", S_IRUGO, proc_cgroup_operations), | 3073 | REG("cgroup", S_IRUGO, proc_cgroup_operations), |
2965 | #endif | 3074 | #endif |
2966 | INF("oom_score", S_IRUGO, proc_oom_score), | 3075 | INF("oom_score", S_IRUGO, proc_oom_score), |
3076 | REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations), | ||
2967 | REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), | 3077 | REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), |
2968 | #ifdef CONFIG_AUDITSYSCALL | 3078 | #ifdef CONFIG_AUDITSYSCALL |
2969 | REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), | 3079 | REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), |
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 86c67eee439f..e96d4f18ca3a 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c | |||
@@ -249,7 +249,7 @@ static int kcore_update_ram(void) | |||
249 | /* Not initialized....update now */ | 249 | /* Not initialized....update now */ |
250 | /* find out "max pfn" */ | 250 | /* find out "max pfn" */ |
251 | end_pfn = 0; | 251 | end_pfn = 0; |
252 | for_each_node_state(nid, N_HIGH_MEMORY) { | 252 | for_each_node_state(nid, N_MEMORY) { |
253 | unsigned long node_end; | 253 | unsigned long node_end; |
254 | node_end = NODE_DATA(nid)->node_start_pfn + | 254 | node_end = NODE_DATA(nid)->node_start_pfn + |
255 | NODE_DATA(nid)->node_spanned_pages; | 255 | NODE_DATA(nid)->node_spanned_pages; |
diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index a781bdf06694..701580ddfcc3 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c | |||
@@ -378,12 +378,13 @@ static int test_perm(int mode, int op) | |||
378 | return -EACCES; | 378 | return -EACCES; |
379 | } | 379 | } |
380 | 380 | ||
381 | static int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op) | 381 | static int sysctl_perm(struct ctl_table_header *head, struct ctl_table *table, int op) |
382 | { | 382 | { |
383 | struct ctl_table_root *root = head->root; | ||
383 | int mode; | 384 | int mode; |
384 | 385 | ||
385 | if (root->permissions) | 386 | if (root->permissions) |
386 | mode = root->permissions(root, current->nsproxy, table); | 387 | mode = root->permissions(head, table); |
387 | else | 388 | else |
388 | mode = table->mode; | 389 | mode = table->mode; |
389 | 390 | ||
@@ -491,7 +492,7 @@ static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf, | |||
491 | * and won't be until we finish. | 492 | * and won't be until we finish. |
492 | */ | 493 | */ |
493 | error = -EPERM; | 494 | error = -EPERM; |
494 | if (sysctl_perm(head->root, table, write ? MAY_WRITE : MAY_READ)) | 495 | if (sysctl_perm(head, table, write ? MAY_WRITE : MAY_READ)) |
495 | goto out; | 496 | goto out; |
496 | 497 | ||
497 | /* if that can happen at all, it should be -EINVAL, not -EISDIR */ | 498 | /* if that can happen at all, it should be -EINVAL, not -EISDIR */ |
@@ -717,7 +718,7 @@ static int proc_sys_permission(struct inode *inode, int mask) | |||
717 | if (!table) /* global root - r-xr-xr-x */ | 718 | if (!table) /* global root - r-xr-xr-x */ |
718 | error = mask & MAY_WRITE ? -EACCES : 0; | 719 | error = mask & MAY_WRITE ? -EACCES : 0; |
719 | else /* Use the permissions on the sysctl table entry */ | 720 | else /* Use the permissions on the sysctl table entry */ |
720 | error = sysctl_perm(head->root, table, mask & ~MAY_NOT_BLOCK); | 721 | error = sysctl_perm(head, table, mask & ~MAY_NOT_BLOCK); |
721 | 722 | ||
722 | sysctl_head_finish(head); | 723 | sysctl_head_finish(head); |
723 | return error; | 724 | return error; |
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 90c63f9392a5..48775628abbf 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c | |||
@@ -643,7 +643,7 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, | |||
643 | spinlock_t *ptl; | 643 | spinlock_t *ptl; |
644 | struct page *page; | 644 | struct page *page; |
645 | 645 | ||
646 | split_huge_page_pmd(walk->mm, pmd); | 646 | split_huge_page_pmd(vma, addr, pmd); |
647 | if (pmd_trans_unstable(pmd)) | 647 | if (pmd_trans_unstable(pmd)) |
648 | return 0; | 648 | return 0; |
649 | 649 | ||
@@ -1126,7 +1126,7 @@ static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma, | |||
1126 | return NULL; | 1126 | return NULL; |
1127 | 1127 | ||
1128 | nid = page_to_nid(page); | 1128 | nid = page_to_nid(page); |
1129 | if (!node_isset(nid, node_states[N_HIGH_MEMORY])) | 1129 | if (!node_isset(nid, node_states[N_MEMORY])) |
1130 | return NULL; | 1130 | return NULL; |
1131 | 1131 | ||
1132 | return page; | 1132 | return page; |
@@ -1279,7 +1279,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid) | |||
1279 | if (md->writeback) | 1279 | if (md->writeback) |
1280 | seq_printf(m, " writeback=%lu", md->writeback); | 1280 | seq_printf(m, " writeback=%lu", md->writeback); |
1281 | 1281 | ||
1282 | for_each_node_state(n, N_HIGH_MEMORY) | 1282 | for_each_node_state(n, N_MEMORY) |
1283 | if (md->node[n]) | 1283 | if (md->node[n]) |
1284 | seq_printf(m, " N%d=%lu", n, md->node[n]); | 1284 | seq_printf(m, " N%d=%lu", n, md->node[n]); |
1285 | out: | 1285 | out: |
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 4ab572e6d277..ed1d8c7212da 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c | |||
@@ -49,6 +49,7 @@ struct pstore_private { | |||
49 | struct pstore_info *psi; | 49 | struct pstore_info *psi; |
50 | enum pstore_type_id type; | 50 | enum pstore_type_id type; |
51 | u64 id; | 51 | u64 id; |
52 | int count; | ||
52 | ssize_t size; | 53 | ssize_t size; |
53 | char data[]; | 54 | char data[]; |
54 | }; | 55 | }; |
@@ -175,7 +176,8 @@ static int pstore_unlink(struct inode *dir, struct dentry *dentry) | |||
175 | struct pstore_private *p = dentry->d_inode->i_private; | 176 | struct pstore_private *p = dentry->d_inode->i_private; |
176 | 177 | ||
177 | if (p->psi->erase) | 178 | if (p->psi->erase) |
178 | p->psi->erase(p->type, p->id, p->psi); | 179 | p->psi->erase(p->type, p->id, p->count, |
180 | dentry->d_inode->i_ctime, p->psi); | ||
179 | 181 | ||
180 | return simple_unlink(dir, dentry); | 182 | return simple_unlink(dir, dentry); |
181 | } | 183 | } |
@@ -270,7 +272,7 @@ int pstore_is_mounted(void) | |||
270 | * Load it up with "size" bytes of data from "buf". | 272 | * Load it up with "size" bytes of data from "buf". |
271 | * Set the mtime & ctime to the date that this record was originally stored. | 273 | * Set the mtime & ctime to the date that this record was originally stored. |
272 | */ | 274 | */ |
273 | int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, | 275 | int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count, |
274 | char *data, size_t size, struct timespec time, | 276 | char *data, size_t size, struct timespec time, |
275 | struct pstore_info *psi) | 277 | struct pstore_info *psi) |
276 | { | 278 | { |
@@ -306,6 +308,7 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, | |||
306 | goto fail_alloc; | 308 | goto fail_alloc; |
307 | private->type = type; | 309 | private->type = type; |
308 | private->id = id; | 310 | private->id = id; |
311 | private->count = count; | ||
309 | private->psi = psi; | 312 | private->psi = psi; |
310 | 313 | ||
311 | switch (type) { | 314 | switch (type) { |
diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h index 4847f588b7d5..937d820f273c 100644 --- a/fs/pstore/internal.h +++ b/fs/pstore/internal.h | |||
@@ -50,7 +50,7 @@ extern struct pstore_info *psinfo; | |||
50 | extern void pstore_set_kmsg_bytes(int); | 50 | extern void pstore_set_kmsg_bytes(int); |
51 | extern void pstore_get_records(int); | 51 | extern void pstore_get_records(int); |
52 | extern int pstore_mkfile(enum pstore_type_id, char *psname, u64 id, | 52 | extern int pstore_mkfile(enum pstore_type_id, char *psname, u64 id, |
53 | char *data, size_t size, | 53 | int count, char *data, size_t size, |
54 | struct timespec time, struct pstore_info *psi); | 54 | struct timespec time, struct pstore_info *psi); |
55 | extern int pstore_is_mounted(void); | 55 | extern int pstore_is_mounted(void); |
56 | 56 | ||
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 947fbe06c3b1..5ea2e77ff023 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c | |||
@@ -136,7 +136,7 @@ static void pstore_dump(struct kmsg_dumper *dumper, | |||
136 | break; | 136 | break; |
137 | 137 | ||
138 | ret = psinfo->write(PSTORE_TYPE_DMESG, reason, &id, part, | 138 | ret = psinfo->write(PSTORE_TYPE_DMESG, reason, &id, part, |
139 | hsize + len, psinfo); | 139 | oopscount, hsize + len, psinfo); |
140 | if (ret == 0 && reason == KMSG_DUMP_OOPS && pstore_is_mounted()) | 140 | if (ret == 0 && reason == KMSG_DUMP_OOPS && pstore_is_mounted()) |
141 | pstore_new_entry = 1; | 141 | pstore_new_entry = 1; |
142 | 142 | ||
@@ -173,7 +173,7 @@ static void pstore_console_write(struct console *con, const char *s, unsigned c) | |||
173 | spin_lock_irqsave(&psinfo->buf_lock, flags); | 173 | spin_lock_irqsave(&psinfo->buf_lock, flags); |
174 | } | 174 | } |
175 | memcpy(psinfo->buf, s, c); | 175 | memcpy(psinfo->buf, s, c); |
176 | psinfo->write(PSTORE_TYPE_CONSOLE, 0, &id, 0, c, psinfo); | 176 | psinfo->write(PSTORE_TYPE_CONSOLE, 0, &id, 0, 0, c, psinfo); |
177 | spin_unlock_irqrestore(&psinfo->buf_lock, flags); | 177 | spin_unlock_irqrestore(&psinfo->buf_lock, flags); |
178 | s += c; | 178 | s += c; |
179 | c = e - s; | 179 | c = e - s; |
@@ -197,7 +197,7 @@ static void pstore_register_console(void) {} | |||
197 | 197 | ||
198 | static int pstore_write_compat(enum pstore_type_id type, | 198 | static int pstore_write_compat(enum pstore_type_id type, |
199 | enum kmsg_dump_reason reason, | 199 | enum kmsg_dump_reason reason, |
200 | u64 *id, unsigned int part, | 200 | u64 *id, unsigned int part, int count, |
201 | size_t size, struct pstore_info *psi) | 201 | size_t size, struct pstore_info *psi) |
202 | { | 202 | { |
203 | return psi->write_buf(type, reason, id, part, psinfo->buf, size, psi); | 203 | return psi->write_buf(type, reason, id, part, psinfo->buf, size, psi); |
@@ -267,6 +267,7 @@ void pstore_get_records(int quiet) | |||
267 | char *buf = NULL; | 267 | char *buf = NULL; |
268 | ssize_t size; | 268 | ssize_t size; |
269 | u64 id; | 269 | u64 id; |
270 | int count; | ||
270 | enum pstore_type_id type; | 271 | enum pstore_type_id type; |
271 | struct timespec time; | 272 | struct timespec time; |
272 | int failed = 0, rc; | 273 | int failed = 0, rc; |
@@ -278,9 +279,9 @@ void pstore_get_records(int quiet) | |||
278 | if (psi->open && psi->open(psi)) | 279 | if (psi->open && psi->open(psi)) |
279 | goto out; | 280 | goto out; |
280 | 281 | ||
281 | while ((size = psi->read(&id, &type, &time, &buf, psi)) > 0) { | 282 | while ((size = psi->read(&id, &type, &count, &time, &buf, psi)) > 0) { |
282 | rc = pstore_mkfile(type, psi->name, id, buf, (size_t)size, | 283 | rc = pstore_mkfile(type, psi->name, id, count, buf, |
283 | time, psi); | 284 | (size_t)size, time, psi); |
284 | kfree(buf); | 285 | kfree(buf); |
285 | buf = NULL; | 286 | buf = NULL; |
286 | if (rc && (rc != -EEXIST || !quiet)) | 287 | if (rc && (rc != -EEXIST || !quiet)) |
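
The pstore hunks above widen the backend API so that every record carries the number of the dump (oops) it belongs to: read() gains an int *count out-parameter, write()/erase() take the count, and pstore_mkfile() stores it per file. Below is a minimal userspace sketch of that parameter threading; it is an illustration only, not kernel code, and all names in it are made up.

/*
 * Simplified model of the API change above: the backend read callback
 * gains an out-parameter so every record can report which dump "count"
 * it belongs to, and the generic layer threads the value through to
 * the file it creates.
 */
#include <stdio.h>
#include <string.h>
#include <sys/types.h>

struct record {
	unsigned long long id;
	int count;		/* new: which dump this record came from */
	const char *data;
};

static struct record store[] = {
	{ 1, 1, "Oops#1 part A" },
	{ 2, 1, "Oops#1 part B" },
	{ 3, 2, "Oops#2 part A" },
};

/* Analogue of psi->read(): returns the size, fills id/count/buf. */
static ssize_t backend_read(unsigned idx, unsigned long long *id,
			    int *count, const char **buf)
{
	if (idx >= sizeof(store) / sizeof(store[0]))
		return 0;		/* no more records */
	*id = store[idx].id;
	*count = store[idx].count;	/* threaded through, as in the patch */
	*buf = store[idx].data;
	return (ssize_t)strlen(store[idx].data);
}

int main(void)
{
	unsigned long long id;
	const char *buf;
	ssize_t size;
	int count;
	unsigned i = 0;

	/* Analogue of the pstore_get_records() loop. */
	while ((size = backend_read(i++, &id, &count, &buf)) > 0)
		printf("mkfile: id=%llu count=%d size=%zd data=%s\n",
		       id, count, size, buf);
	return 0;
}
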
diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index dba70e53b72c..f883e7e74305 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c | |||
@@ -132,9 +132,8 @@ ramoops_get_next_prz(struct persistent_ram_zone *przs[], uint *c, uint max, | |||
132 | } | 132 | } |
133 | 133 | ||
134 | static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type, | 134 | static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type, |
135 | struct timespec *time, | 135 | int *count, struct timespec *time, |
136 | char **buf, | 136 | char **buf, struct pstore_info *psi) |
137 | struct pstore_info *psi) | ||
138 | { | 137 | { |
139 | ssize_t size; | 138 | ssize_t size; |
140 | struct ramoops_context *cxt = psi->data; | 139 | struct ramoops_context *cxt = psi->data; |
@@ -241,8 +240,8 @@ static int notrace ramoops_pstore_write_buf(enum pstore_type_id type, | |||
241 | return 0; | 240 | return 0; |
242 | } | 241 | } |
243 | 242 | ||
244 | static int ramoops_pstore_erase(enum pstore_type_id type, u64 id, | 243 | static int ramoops_pstore_erase(enum pstore_type_id type, u64 id, int count, |
245 | struct pstore_info *psi) | 244 | struct timespec time, struct pstore_info *psi) |
246 | { | 245 | { |
247 | struct ramoops_context *cxt = psi->data; | 246 | struct ramoops_context *cxt = psi->data; |
248 | struct persistent_ram_zone *prz; | 247 | struct persistent_ram_zone *prz; |
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index f27f01a98aa2..d83736fbc26c 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c | |||
@@ -1782,8 +1782,9 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, | |||
1782 | 1782 | ||
1783 | BUG_ON(!th->t_trans_id); | 1783 | BUG_ON(!th->t_trans_id); |
1784 | 1784 | ||
1785 | dquot_initialize(inode); | 1785 | reiserfs_write_unlock(inode->i_sb); |
1786 | err = dquot_alloc_inode(inode); | 1786 | err = dquot_alloc_inode(inode); |
1787 | reiserfs_write_lock(inode->i_sb); | ||
1787 | if (err) | 1788 | if (err) |
1788 | goto out_end_trans; | 1789 | goto out_end_trans; |
1789 | if (!dir->i_nlink) { | 1790 | if (!dir->i_nlink) { |
@@ -1979,8 +1980,10 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, | |||
1979 | 1980 | ||
1980 | out_end_trans: | 1981 | out_end_trans: |
1981 | journal_end(th, th->t_super, th->t_blocks_allocated); | 1982 | journal_end(th, th->t_super, th->t_blocks_allocated); |
1983 | reiserfs_write_unlock(inode->i_sb); | ||
1982 | /* Drop can be outside and it needs more credits so it's better to have it outside */ | 1984 | /* Drop can be outside and it needs more credits so it's better to have it outside */ |
1983 | dquot_drop(inode); | 1985 | dquot_drop(inode); |
1986 | reiserfs_write_lock(inode->i_sb); | ||
1984 | inode->i_flags |= S_NOQUOTA; | 1987 | inode->i_flags |= S_NOQUOTA; |
1985 | make_bad_inode(inode); | 1988 | make_bad_inode(inode); |
1986 | 1989 | ||
@@ -3103,10 +3106,9 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) | |||
3103 | /* must be turned off for recursive notify_change calls */ | 3106 | /* must be turned off for recursive notify_change calls */ |
3104 | ia_valid = attr->ia_valid &= ~(ATTR_KILL_SUID|ATTR_KILL_SGID); | 3107 | ia_valid = attr->ia_valid &= ~(ATTR_KILL_SUID|ATTR_KILL_SGID); |
3105 | 3108 | ||
3106 | depth = reiserfs_write_lock_once(inode->i_sb); | ||
3107 | if (is_quota_modification(inode, attr)) | 3109 | if (is_quota_modification(inode, attr)) |
3108 | dquot_initialize(inode); | 3110 | dquot_initialize(inode); |
3109 | 3111 | depth = reiserfs_write_lock_once(inode->i_sb); | |
3110 | if (attr->ia_valid & ATTR_SIZE) { | 3112 | if (attr->ia_valid & ATTR_SIZE) { |
3111 | /* version 2 items will be caught by the s_maxbytes check | 3113 | /* version 2 items will be caught by the s_maxbytes check |
3112 | ** done for us in vmtruncate | 3114 | ** done for us in vmtruncate |
@@ -3170,7 +3172,9 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) | |||
3170 | error = journal_begin(&th, inode->i_sb, jbegin_count); | 3172 | error = journal_begin(&th, inode->i_sb, jbegin_count); |
3171 | if (error) | 3173 | if (error) |
3172 | goto out; | 3174 | goto out; |
3175 | reiserfs_write_unlock_once(inode->i_sb, depth); | ||
3173 | error = dquot_transfer(inode, attr); | 3176 | error = dquot_transfer(inode, attr); |
3177 | depth = reiserfs_write_lock_once(inode->i_sb); | ||
3174 | if (error) { | 3178 | if (error) { |
3175 | journal_end(&th, inode->i_sb, jbegin_count); | 3179 | journal_end(&th, inode->i_sb, jbegin_count); |
3176 | goto out; | 3180 | goto out; |
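
All of the reiserfs hunks in this series follow one pattern: the per-superblock write lock is dropped before calling into the quota code (dquot_alloc_inode(), dquot_transfer(), and friends) and retaken afterwards, because quota takes its own locks and may re-enter the filesystem, which would otherwise invert the lock order. A minimal pthread sketch of the pattern follows, with illustrative names; the caveat, visible in the real hunks too, is that any state observed before the unlock must be treated as stale after relocking.

/*
 * Plain-pthreads model, not kernel code: release the filesystem-wide
 * write lock across the quota call, then retake it.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t fs_write_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t quota_lock = PTHREAD_MUTEX_INITIALIZER;

/*
 * The quota helper acquires its own lock; it must never be entered
 * while fs_write_lock is held, or a thread taking quota_lock then
 * fs_write_lock could deadlock against us.
 */
static int dquot_alloc_space_model(long bytes)
{
	pthread_mutex_lock(&quota_lock);
	printf("quota: charging %ld bytes\n", bytes);
	pthread_mutex_unlock(&quota_lock);
	return 0;
}

static int paste_into_item_model(long bytes)
{
	int ret;

	pthread_mutex_lock(&fs_write_lock);
	/* ... tree manipulation under the write lock ... */

	pthread_mutex_unlock(&fs_write_lock);	/* drop before the quota call */
	ret = dquot_alloc_space_model(bytes);
	pthread_mutex_lock(&fs_write_lock);	/* retake afterwards */

	/* ... continue, revalidating anything that may have changed ... */
	pthread_mutex_unlock(&fs_write_lock);
	return ret;
}

int main(void)
{
	return paste_into_item_model(4096);
}
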
diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c index f8afa4b162b8..2f40a4c70a4d 100644 --- a/fs/reiserfs/stree.c +++ b/fs/reiserfs/stree.c | |||
@@ -1968,7 +1968,9 @@ int reiserfs_paste_into_item(struct reiserfs_transaction_handle *th, struct tree | |||
1968 | key2type(&(key->on_disk_key))); | 1968 | key2type(&(key->on_disk_key))); |
1969 | #endif | 1969 | #endif |
1970 | 1970 | ||
1971 | reiserfs_write_unlock(inode->i_sb); | ||
1971 | retval = dquot_alloc_space_nodirty(inode, pasted_size); | 1972 | retval = dquot_alloc_space_nodirty(inode, pasted_size); |
1973 | reiserfs_write_lock(inode->i_sb); | ||
1972 | if (retval) { | 1974 | if (retval) { |
1973 | pathrelse(search_path); | 1975 | pathrelse(search_path); |
1974 | return retval; | 1976 | return retval; |
@@ -2061,9 +2063,11 @@ int reiserfs_insert_item(struct reiserfs_transaction_handle *th, | |||
2061 | "reiserquota insert_item(): allocating %u id=%u type=%c", | 2063 | "reiserquota insert_item(): allocating %u id=%u type=%c", |
2062 | quota_bytes, inode->i_uid, head2type(ih)); | 2064 | quota_bytes, inode->i_uid, head2type(ih)); |
2063 | #endif | 2065 | #endif |
2066 | reiserfs_write_unlock(inode->i_sb); | ||
2064 | /* We can't dirty inode here. It would be immediately written but | 2067 | /* We can't dirty inode here. It would be immediately written but |
2065 | * appropriate stat item isn't inserted yet... */ | 2068 | * appropriate stat item isn't inserted yet... */ |
2066 | retval = dquot_alloc_space_nodirty(inode, quota_bytes); | 2069 | retval = dquot_alloc_space_nodirty(inode, quota_bytes); |
2070 | reiserfs_write_lock(inode->i_sb); | ||
2067 | if (retval) { | 2071 | if (retval) { |
2068 | pathrelse(path); | 2072 | pathrelse(path); |
2069 | return retval; | 2073 | return retval; |
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 1078ae179993..418bdc3a57da 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c | |||
@@ -298,7 +298,9 @@ static int finish_unfinished(struct super_block *s) | |||
298 | retval = remove_save_link_only(s, &save_link_key, 0); | 298 | retval = remove_save_link_only(s, &save_link_key, 0); |
299 | continue; | 299 | continue; |
300 | } | 300 | } |
301 | reiserfs_write_unlock(s); | ||
301 | dquot_initialize(inode); | 302 | dquot_initialize(inode); |
303 | reiserfs_write_lock(s); | ||
302 | 304 | ||
303 | if (truncate && S_ISDIR(inode->i_mode)) { | 305 | if (truncate && S_ISDIR(inode->i_mode)) { |
304 | /* We got a truncate request for a dir which is impossible. | 306 | /* We got a truncate request for a dir which is impossible. |
@@ -1335,7 +1337,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) | |||
1335 | kfree(qf_names[i]); | 1337 | kfree(qf_names[i]); |
1336 | #endif | 1338 | #endif |
1337 | err = -EINVAL; | 1339 | err = -EINVAL; |
1338 | goto out_err; | 1340 | goto out_unlock; |
1339 | } | 1341 | } |
1340 | #ifdef CONFIG_QUOTA | 1342 | #ifdef CONFIG_QUOTA |
1341 | handle_quota_files(s, qf_names, &qfmt); | 1343 | handle_quota_files(s, qf_names, &qfmt); |
@@ -1379,7 +1381,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) | |||
1379 | if (blocks) { | 1381 | if (blocks) { |
1380 | err = reiserfs_resize(s, blocks); | 1382 | err = reiserfs_resize(s, blocks); |
1381 | if (err != 0) | 1383 | if (err != 0) |
1382 | goto out_err; | 1384 | goto out_unlock; |
1383 | } | 1385 | } |
1384 | 1386 | ||
1385 | if (*mount_flags & MS_RDONLY) { | 1387 | if (*mount_flags & MS_RDONLY) { |
@@ -1389,9 +1391,15 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) | |||
1389 | /* it is read-only already */ | 1391 | /* it is read-only already */ |
1390 | goto out_ok; | 1392 | goto out_ok; |
1391 | 1393 | ||
1394 | /* | ||
1395 | * Drop write lock. Quota will retake it when needed and lock | ||
1396 | * ordering requires calling dquot_suspend() without it. | ||
1397 | */ | ||
1398 | reiserfs_write_unlock(s); | ||
1392 | err = dquot_suspend(s, -1); | 1399 | err = dquot_suspend(s, -1); |
1393 | if (err < 0) | 1400 | if (err < 0) |
1394 | goto out_err; | 1401 | goto out_err; |
1402 | reiserfs_write_lock(s); | ||
1395 | 1403 | ||
1396 | /* try to remount file system with read-only permissions */ | 1404 | /* try to remount file system with read-only permissions */ |
1397 | if (sb_umount_state(rs) == REISERFS_VALID_FS | 1405 | if (sb_umount_state(rs) == REISERFS_VALID_FS |
@@ -1401,7 +1409,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) | |||
1401 | 1409 | ||
1402 | err = journal_begin(&th, s, 10); | 1410 | err = journal_begin(&th, s, 10); |
1403 | if (err) | 1411 | if (err) |
1404 | goto out_err; | 1412 | goto out_unlock; |
1405 | 1413 | ||
1406 | /* Mounting a rw partition read-only. */ | 1414 | /* Mounting a rw partition read-only. */ |
1407 | reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1); | 1415 | reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1); |
@@ -1416,7 +1424,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) | |||
1416 | 1424 | ||
1417 | if (reiserfs_is_journal_aborted(journal)) { | 1425 | if (reiserfs_is_journal_aborted(journal)) { |
1418 | err = journal->j_errno; | 1426 | err = journal->j_errno; |
1419 | goto out_err; | 1427 | goto out_unlock; |
1420 | } | 1428 | } |
1421 | 1429 | ||
1422 | handle_data_mode(s, mount_options); | 1430 | handle_data_mode(s, mount_options); |
@@ -1425,7 +1433,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) | |||
1425 | s->s_flags &= ~MS_RDONLY; /* now it is safe to call journal_begin */ | 1433 | s->s_flags &= ~MS_RDONLY; /* now it is safe to call journal_begin */ |
1426 | err = journal_begin(&th, s, 10); | 1434 | err = journal_begin(&th, s, 10); |
1427 | if (err) | 1435 | if (err) |
1428 | goto out_err; | 1436 | goto out_unlock; |
1429 | 1437 | ||
1430 | /* Mount a partition which is read-only, read-write */ | 1438 | /* Mount a partition which is read-only, read-write */ |
1431 | reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1); | 1439 | reiserfs_prepare_for_journal(s, SB_BUFFER_WITH_SB(s), 1); |
@@ -1442,10 +1450,16 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) | |||
1442 | SB_JOURNAL(s)->j_must_wait = 1; | 1450 | SB_JOURNAL(s)->j_must_wait = 1; |
1443 | err = journal_end(&th, s, 10); | 1451 | err = journal_end(&th, s, 10); |
1444 | if (err) | 1452 | if (err) |
1445 | goto out_err; | 1453 | goto out_unlock; |
1446 | 1454 | ||
1447 | if (!(*mount_flags & MS_RDONLY)) { | 1455 | if (!(*mount_flags & MS_RDONLY)) { |
1456 | /* | ||
1457 | * Drop write lock. Quota will retake it when needed and lock | ||
1458 | * ordering requires calling dquot_resume() without it. | ||
1459 | */ | ||
1460 | reiserfs_write_unlock(s); | ||
1448 | dquot_resume(s, -1); | 1461 | dquot_resume(s, -1); |
1462 | reiserfs_write_lock(s); | ||
1449 | finish_unfinished(s); | 1463 | finish_unfinished(s); |
1450 | reiserfs_xattr_init(s, *mount_flags); | 1464 | reiserfs_xattr_init(s, *mount_flags); |
1451 | } | 1465 | } |
@@ -1455,9 +1469,10 @@ out_ok: | |||
1455 | reiserfs_write_unlock(s); | 1469 | reiserfs_write_unlock(s); |
1456 | return 0; | 1470 | return 0; |
1457 | 1471 | ||
1472 | out_unlock: | ||
1473 | reiserfs_write_unlock(s); | ||
1458 | out_err: | 1474 | out_err: |
1459 | kfree(new_opts); | 1475 | kfree(new_opts); |
1460 | reiserfs_write_unlock(s); | ||
1461 | return err; | 1476 | return err; |
1462 | } | 1477 | } |
1463 | 1478 | ||
@@ -2095,13 +2110,15 @@ static int reiserfs_write_dquot(struct dquot *dquot) | |||
2095 | REISERFS_QUOTA_TRANS_BLOCKS(dquot->dq_sb)); | 2110 | REISERFS_QUOTA_TRANS_BLOCKS(dquot->dq_sb)); |
2096 | if (ret) | 2111 | if (ret) |
2097 | goto out; | 2112 | goto out; |
2113 | reiserfs_write_unlock(dquot->dq_sb); | ||
2098 | ret = dquot_commit(dquot); | 2114 | ret = dquot_commit(dquot); |
2115 | reiserfs_write_lock(dquot->dq_sb); | ||
2099 | err = | 2116 | err = |
2100 | journal_end(&th, dquot->dq_sb, | 2117 | journal_end(&th, dquot->dq_sb, |
2101 | REISERFS_QUOTA_TRANS_BLOCKS(dquot->dq_sb)); | 2118 | REISERFS_QUOTA_TRANS_BLOCKS(dquot->dq_sb)); |
2102 | if (!ret && err) | 2119 | if (!ret && err) |
2103 | ret = err; | 2120 | ret = err; |
2104 | out: | 2121 | out: |
2105 | reiserfs_write_unlock(dquot->dq_sb); | 2122 | reiserfs_write_unlock(dquot->dq_sb); |
2106 | return ret; | 2123 | return ret; |
2107 | } | 2124 | } |
@@ -2117,13 +2134,15 @@ static int reiserfs_acquire_dquot(struct dquot *dquot) | |||
2117 | REISERFS_QUOTA_INIT_BLOCKS(dquot->dq_sb)); | 2134 | REISERFS_QUOTA_INIT_BLOCKS(dquot->dq_sb)); |
2118 | if (ret) | 2135 | if (ret) |
2119 | goto out; | 2136 | goto out; |
2137 | reiserfs_write_unlock(dquot->dq_sb); | ||
2120 | ret = dquot_acquire(dquot); | 2138 | ret = dquot_acquire(dquot); |
2139 | reiserfs_write_lock(dquot->dq_sb); | ||
2121 | err = | 2140 | err = |
2122 | journal_end(&th, dquot->dq_sb, | 2141 | journal_end(&th, dquot->dq_sb, |
2123 | REISERFS_QUOTA_INIT_BLOCKS(dquot->dq_sb)); | 2142 | REISERFS_QUOTA_INIT_BLOCKS(dquot->dq_sb)); |
2124 | if (!ret && err) | 2143 | if (!ret && err) |
2125 | ret = err; | 2144 | ret = err; |
2126 | out: | 2145 | out: |
2127 | reiserfs_write_unlock(dquot->dq_sb); | 2146 | reiserfs_write_unlock(dquot->dq_sb); |
2128 | return ret; | 2147 | return ret; |
2129 | } | 2148 | } |
@@ -2137,19 +2156,21 @@ static int reiserfs_release_dquot(struct dquot *dquot) | |||
2137 | ret = | 2156 | ret = |
2138 | journal_begin(&th, dquot->dq_sb, | 2157 | journal_begin(&th, dquot->dq_sb, |
2139 | REISERFS_QUOTA_DEL_BLOCKS(dquot->dq_sb)); | 2158 | REISERFS_QUOTA_DEL_BLOCKS(dquot->dq_sb)); |
2159 | reiserfs_write_unlock(dquot->dq_sb); | ||
2140 | if (ret) { | 2160 | if (ret) { |
2141 | /* Release dquot anyway to avoid endless cycle in dqput() */ | 2161 | /* Release dquot anyway to avoid endless cycle in dqput() */ |
2142 | dquot_release(dquot); | 2162 | dquot_release(dquot); |
2143 | goto out; | 2163 | goto out; |
2144 | } | 2164 | } |
2145 | ret = dquot_release(dquot); | 2165 | ret = dquot_release(dquot); |
2166 | reiserfs_write_lock(dquot->dq_sb); | ||
2146 | err = | 2167 | err = |
2147 | journal_end(&th, dquot->dq_sb, | 2168 | journal_end(&th, dquot->dq_sb, |
2148 | REISERFS_QUOTA_DEL_BLOCKS(dquot->dq_sb)); | 2169 | REISERFS_QUOTA_DEL_BLOCKS(dquot->dq_sb)); |
2149 | if (!ret && err) | 2170 | if (!ret && err) |
2150 | ret = err; | 2171 | ret = err; |
2151 | out: | ||
2152 | reiserfs_write_unlock(dquot->dq_sb); | 2172 | reiserfs_write_unlock(dquot->dq_sb); |
2173 | out: | ||
2153 | return ret; | 2174 | return ret; |
2154 | } | 2175 | } |
2155 | 2176 | ||
@@ -2174,11 +2195,13 @@ static int reiserfs_write_info(struct super_block *sb, int type) | |||
2174 | ret = journal_begin(&th, sb, 2); | 2195 | ret = journal_begin(&th, sb, 2); |
2175 | if (ret) | 2196 | if (ret) |
2176 | goto out; | 2197 | goto out; |
2198 | reiserfs_write_unlock(sb); | ||
2177 | ret = dquot_commit_info(sb, type); | 2199 | ret = dquot_commit_info(sb, type); |
2200 | reiserfs_write_lock(sb); | ||
2178 | err = journal_end(&th, sb, 2); | 2201 | err = journal_end(&th, sb, 2); |
2179 | if (!ret && err) | 2202 | if (!ret && err) |
2180 | ret = err; | 2203 | ret = err; |
2181 | out: | 2204 | out: |
2182 | reiserfs_write_unlock(sb); | 2205 | reiserfs_write_unlock(sb); |
2183 | return ret; | 2206 | return ret; |
2184 | } | 2207 | } |
@@ -2203,8 +2226,11 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id, | |||
2203 | struct reiserfs_transaction_handle th; | 2226 | struct reiserfs_transaction_handle th; |
2204 | int opt = type == USRQUOTA ? REISERFS_USRQUOTA : REISERFS_GRPQUOTA; | 2227 | int opt = type == USRQUOTA ? REISERFS_USRQUOTA : REISERFS_GRPQUOTA; |
2205 | 2228 | ||
2206 | if (!(REISERFS_SB(sb)->s_mount_opt & (1 << opt))) | 2229 | reiserfs_write_lock(sb); |
2207 | return -EINVAL; | 2230 | if (!(REISERFS_SB(sb)->s_mount_opt & (1 << opt))) { |
2231 | err = -EINVAL; | ||
2232 | goto out; | ||
2233 | } | ||
2208 | 2234 | ||
2209 | /* Quotafile not on the same filesystem? */ | 2235 | /* Quotafile not on the same filesystem? */ |
2210 | if (path->dentry->d_sb != sb) { | 2236 | if (path->dentry->d_sb != sb) { |
@@ -2246,8 +2272,10 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id, | |||
2246 | if (err) | 2272 | if (err) |
2247 | goto out; | 2273 | goto out; |
2248 | } | 2274 | } |
2249 | err = dquot_quota_on(sb, type, format_id, path); | 2275 | reiserfs_write_unlock(sb); |
2276 | return dquot_quota_on(sb, type, format_id, path); | ||
2250 | out: | 2277 | out: |
2278 | reiserfs_write_unlock(sb); | ||
2251 | return err; | 2279 | return err; |
2252 | } | 2280 | } |
2253 | 2281 | ||
@@ -2320,7 +2348,9 @@ static ssize_t reiserfs_quota_write(struct super_block *sb, int type, | |||
2320 | tocopy = sb->s_blocksize - offset < towrite ? | 2348 | tocopy = sb->s_blocksize - offset < towrite ? |
2321 | sb->s_blocksize - offset : towrite; | 2349 | sb->s_blocksize - offset : towrite; |
2322 | tmp_bh.b_state = 0; | 2350 | tmp_bh.b_state = 0; |
2351 | reiserfs_write_lock(sb); | ||
2323 | err = reiserfs_get_block(inode, blk, &tmp_bh, GET_BLOCK_CREATE); | 2352 | err = reiserfs_get_block(inode, blk, &tmp_bh, GET_BLOCK_CREATE); |
2353 | reiserfs_write_unlock(sb); | ||
2324 | if (err) | 2354 | if (err) |
2325 | goto out; | 2355 | goto out; |
2326 | if (offset || tocopy != sb->s_blocksize) | 2356 | if (offset || tocopy != sb->s_blocksize) |
@@ -2336,10 +2366,12 @@ static ssize_t reiserfs_quota_write(struct super_block *sb, int type, | |||
2336 | flush_dcache_page(bh->b_page); | 2366 | flush_dcache_page(bh->b_page); |
2337 | set_buffer_uptodate(bh); | 2367 | set_buffer_uptodate(bh); |
2338 | unlock_buffer(bh); | 2368 | unlock_buffer(bh); |
2369 | reiserfs_write_lock(sb); | ||
2339 | reiserfs_prepare_for_journal(sb, bh, 1); | 2370 | reiserfs_prepare_for_journal(sb, bh, 1); |
2340 | journal_mark_dirty(current->journal_info, sb, bh); | 2371 | journal_mark_dirty(current->journal_info, sb, bh); |
2341 | if (!journal_quota) | 2372 | if (!journal_quota) |
2342 | reiserfs_add_ordered_list(inode, bh); | 2373 | reiserfs_add_ordered_list(inode, bh); |
2374 | reiserfs_write_unlock(sb); | ||
2343 | brelse(bh); | 2375 | brelse(bh); |
2344 | offset = 0; | 2376 | offset = 0; |
2345 | towrite -= tocopy; | 2377 | towrite -= tocopy; |
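
The reiserfs_remount() changes above also split the error unwind into two labels: paths that still hold the write lock jump to out_unlock, while paths that have already dropped it (around dquot_suspend()) jump straight to out_err, so the lock is neither leaked nor released twice. A compact sketch of that two-label unwind, with invented names and error values:

#include <pthread.h>
#include <stdlib.h>

static pthread_mutex_t s_lock = PTHREAD_MUTEX_INITIALIZER;

static int remount_model(int fail_early, int fail_unlocked)
{
	char *new_opts = malloc(32);
	int err = 0;

	pthread_mutex_lock(&s_lock);

	if (fail_early) {		/* lock held: use out_unlock */
		err = -22;		/* stand-in for -EINVAL */
		goto out_unlock;
	}

	pthread_mutex_unlock(&s_lock);	/* quota call wants the lock free */
	if (fail_unlocked) {		/* lock NOT held: use out_err */
		err = -5;		/* stand-in for -EIO */
		goto out_err;
	}
	pthread_mutex_lock(&s_lock);

out_unlock:
	pthread_mutex_unlock(&s_lock);
out_err:
	free(new_opts);
	return err;
}

int main(void)
{
	return remount_model(0, 0);
}
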
diff --git a/fs/splice.c b/fs/splice.c index 13e5b4776e7a..8890604e3fcd 100644 --- a/fs/splice.c +++ b/fs/splice.c | |||
@@ -1024,17 +1024,14 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, | |||
1024 | ret = sd.num_spliced; | 1024 | ret = sd.num_spliced; |
1025 | 1025 | ||
1026 | if (ret > 0) { | 1026 | if (ret > 0) { |
1027 | unsigned long nr_pages; | ||
1028 | int err; | 1027 | int err; |
1029 | 1028 | ||
1030 | nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | ||
1031 | |||
1032 | err = generic_write_sync(out, *ppos, ret); | 1029 | err = generic_write_sync(out, *ppos, ret); |
1033 | if (err) | 1030 | if (err) |
1034 | ret = err; | 1031 | ret = err; |
1035 | else | 1032 | else |
1036 | *ppos += ret; | 1033 | *ppos += ret; |
1037 | balance_dirty_pages_ratelimited_nr(mapping, nr_pages); | 1034 | balance_dirty_pages_ratelimited(mapping); |
1038 | } | 1035 | } |
1039 | sb_end_write(inode->i_sb); | 1036 | sb_end_write(inode->i_sb); |
1040 | 1037 | ||
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 00012e31829d..602f56db0442 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c | |||
@@ -485,8 +485,8 @@ const struct file_operations sysfs_file_operations = { | |||
485 | .poll = sysfs_poll, | 485 | .poll = sysfs_poll, |
486 | }; | 486 | }; |
487 | 487 | ||
488 | int sysfs_attr_ns(struct kobject *kobj, const struct attribute *attr, | 488 | static int sysfs_attr_ns(struct kobject *kobj, const struct attribute *attr, |
489 | const void **pns) | 489 | const void **pns) |
490 | { | 490 | { |
491 | struct sysfs_dirent *dir_sd = kobj->sd; | 491 | struct sysfs_dirent *dir_sd = kobj->sd; |
492 | const struct sysfs_ops *ops; | 492 | const struct sysfs_ops *ops; |
diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c index 28ec13af28d9..2dcf3d473fec 100644 --- a/fs/ubifs/find.c +++ b/fs/ubifs/find.c | |||
@@ -681,8 +681,16 @@ int ubifs_find_free_leb_for_idx(struct ubifs_info *c) | |||
681 | if (!lprops) { | 681 | if (!lprops) { |
682 | lprops = ubifs_fast_find_freeable(c); | 682 | lprops = ubifs_fast_find_freeable(c); |
683 | if (!lprops) { | 683 | if (!lprops) { |
684 | ubifs_assert(c->freeable_cnt == 0); | 684 | /* |
685 | if (c->lst.empty_lebs - c->lst.taken_empty_lebs > 0) { | 685 | * The first condition means the following: go scan the |
686 | * LPT if there are uncategorized lprops, which means | ||
687 | * there may be freeable LEBs there (UBIFS does not | ||
688 | * store the information about freeable LEBs in the | ||
689 | * master node). | ||
690 | */ | ||
691 | if (c->in_a_category_cnt != c->main_lebs || | ||
692 | c->lst.empty_lebs - c->lst.taken_empty_lebs > 0) { | ||
693 | ubifs_assert(c->freeable_cnt == 0); | ||
686 | lprops = scan_for_leb_for_idx(c); | 694 | lprops = scan_for_leb_for_idx(c); |
687 | if (IS_ERR(lprops)) { | 695 | if (IS_ERR(lprops)) { |
688 | err = PTR_ERR(lprops); | 696 | err = PTR_ERR(lprops); |
diff --git a/fs/ubifs/lprops.c b/fs/ubifs/lprops.c index e5a2a35a46dc..46190a7c42a6 100644 --- a/fs/ubifs/lprops.c +++ b/fs/ubifs/lprops.c | |||
@@ -300,8 +300,11 @@ void ubifs_add_to_cat(struct ubifs_info *c, struct ubifs_lprops *lprops, | |||
300 | default: | 300 | default: |
301 | ubifs_assert(0); | 301 | ubifs_assert(0); |
302 | } | 302 | } |
303 | |||
303 | lprops->flags &= ~LPROPS_CAT_MASK; | 304 | lprops->flags &= ~LPROPS_CAT_MASK; |
304 | lprops->flags |= cat; | 305 | lprops->flags |= cat; |
306 | c->in_a_category_cnt += 1; | ||
307 | ubifs_assert(c->in_a_category_cnt <= c->main_lebs); | ||
305 | } | 308 | } |
306 | 309 | ||
307 | /** | 310 | /** |
@@ -334,6 +337,9 @@ static void ubifs_remove_from_cat(struct ubifs_info *c, | |||
334 | default: | 337 | default: |
335 | ubifs_assert(0); | 338 | ubifs_assert(0); |
336 | } | 339 | } |
340 | |||
341 | c->in_a_category_cnt -= 1; | ||
342 | ubifs_assert(c->in_a_category_cnt >= 0); | ||
337 | } | 343 | } |
338 | 344 | ||
339 | /** | 345 | /** |
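
The new @in_a_category_cnt field gives ubifs_find_free_leb_for_idx() a cheap way to decide whether scanning the LPT can still turn up freeable LEBs: if every lprops is already categorized and there are no spare empty LEBs, a scan cannot find anything new. A simplified model of the bookkeeping and the resulting decision (illustrative code, not UBIFS):

#include <assert.h>
#include <stdbool.h>
#include <stdio.h>

struct info {
	int main_lebs;		/* total number of LEBs */
	int in_a_category_cnt;	/* how many lprops are categorized */
};

static void add_to_cat(struct info *c)
{
	c->in_a_category_cnt += 1;
	assert(c->in_a_category_cnt <= c->main_lebs);
}

static void remove_from_cat(struct info *c)
{
	c->in_a_category_cnt -= 1;
	assert(c->in_a_category_cnt >= 0);
}

/* Mirrors the new condition in ubifs_find_free_leb_for_idx(). */
static bool worth_scanning_lpt(const struct info *c, int spare_empty_lebs)
{
	return c->in_a_category_cnt != c->main_lebs || spare_empty_lebs > 0;
}

int main(void)
{
	struct info c = { .main_lebs = 4, .in_a_category_cnt = 0 };

	add_to_cat(&c);
	add_to_cat(&c);
	printf("scan? %d\n", worth_scanning_lpt(&c, 0));	/* 1: only 2 of 4 known */
	add_to_cat(&c);
	add_to_cat(&c);
	printf("scan? %d\n", worth_scanning_lpt(&c, 0));	/* 0: all categorized */
	remove_from_cat(&c);
	return 0;
}
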
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 5486346d0a3f..d133c276fe05 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h | |||
@@ -1183,6 +1183,8 @@ struct ubifs_debug_info; | |||
1183 | * @freeable_list: list of freeable non-index LEBs (free + dirty == @leb_size) | 1183 | * @freeable_list: list of freeable non-index LEBs (free + dirty == @leb_size) |
1184 | * @frdi_idx_list: list of freeable index LEBs (free + dirty == @leb_size) | 1184 | * @frdi_idx_list: list of freeable index LEBs (free + dirty == @leb_size) |
1185 | * @freeable_cnt: number of freeable LEBs in @freeable_list | 1185 | * @freeable_cnt: number of freeable LEBs in @freeable_list |
1186 | * @in_a_category_cnt: count of lprops which are in a certain category, which | ||
1187 | * basically means that they were loaded from the flash | ||

1186 | * | 1188 | * |
1187 | * @ltab_lnum: LEB number of LPT's own lprops table | 1189 | * @ltab_lnum: LEB number of LPT's own lprops table |
1188 | * @ltab_offs: offset of LPT's own lprops table | 1190 | * @ltab_offs: offset of LPT's own lprops table |
@@ -1412,6 +1414,7 @@ struct ubifs_info { | |||
1412 | struct list_head freeable_list; | 1414 | struct list_head freeable_list; |
1413 | struct list_head frdi_idx_list; | 1415 | struct list_head frdi_idx_list; |
1414 | int freeable_cnt; | 1416 | int freeable_cnt; |
1417 | int in_a_category_cnt; | ||
1415 | 1418 | ||
1416 | int ltab_lnum; | 1419 | int ltab_lnum; |
1417 | int ltab_offs; | 1420 | int ltab_offs; |
diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig index 6100ec0fa1d4..5a7ffe54f5d5 100644 --- a/fs/xfs/Kconfig +++ b/fs/xfs/Kconfig | |||
@@ -2,6 +2,7 @@ config XFS_FS | |||
2 | tristate "XFS filesystem support" | 2 | tristate "XFS filesystem support" |
3 | depends on BLOCK | 3 | depends on BLOCK |
4 | select EXPORTFS | 4 | select EXPORTFS |
5 | select LIBCRC32C | ||
5 | help | 6 | help |
6 | XFS is a high performance journaling filesystem which originated | 7 | XFS is a high performance journaling filesystem which originated |
7 | on the SGI IRIX platform. It is completely multi-threaded, can | 8 | on the SGI IRIX platform. It is completely multi-threaded, can |
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index d2bf974b1a2f..d02201df855b 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile | |||
@@ -37,9 +37,8 @@ xfs-y += xfs_aops.o \ | |||
37 | xfs_file.o \ | 37 | xfs_file.o \ |
38 | xfs_filestream.o \ | 38 | xfs_filestream.o \ |
39 | xfs_fsops.o \ | 39 | xfs_fsops.o \ |
40 | xfs_fs_subr.o \ | ||
41 | xfs_globals.o \ | 40 | xfs_globals.o \ |
42 | xfs_iget.o \ | 41 | xfs_icache.o \ |
43 | xfs_ioctl.o \ | 42 | xfs_ioctl.o \ |
44 | xfs_iomap.o \ | 43 | xfs_iomap.o \ |
45 | xfs_iops.o \ | 44 | xfs_iops.o \ |
@@ -47,7 +46,6 @@ xfs-y += xfs_aops.o \ | |||
47 | xfs_message.o \ | 46 | xfs_message.o \ |
48 | xfs_mru_cache.o \ | 47 | xfs_mru_cache.o \ |
49 | xfs_super.o \ | 48 | xfs_super.o \ |
50 | xfs_sync.o \ | ||
51 | xfs_xattr.o \ | 49 | xfs_xattr.o \ |
52 | xfs_rename.o \ | 50 | xfs_rename.o \ |
53 | xfs_utils.o \ | 51 | xfs_utils.o \ |
diff --git a/fs/xfs/uuid.h b/fs/xfs/uuid.h index 4732d71262cc..104db0f3bed6 100644 --- a/fs/xfs/uuid.h +++ b/fs/xfs/uuid.h | |||
@@ -26,4 +26,10 @@ extern int uuid_is_nil(uuid_t *uuid); | |||
26 | extern int uuid_equal(uuid_t *uuid1, uuid_t *uuid2); | 26 | extern int uuid_equal(uuid_t *uuid1, uuid_t *uuid2); |
27 | extern void uuid_getnodeuniq(uuid_t *uuid, int fsid [2]); | 27 | extern void uuid_getnodeuniq(uuid_t *uuid, int fsid [2]); |
28 | 28 | ||
29 | static inline void | ||
30 | uuid_copy(uuid_t *dst, uuid_t *src) | ||
31 | { | ||
32 | memcpy(dst, src, sizeof(uuid_t)); | ||
33 | } | ||
34 | |||
29 | #endif /* __XFS_SUPPORT_UUID_H__ */ | 35 | #endif /* __XFS_SUPPORT_UUID_H__ */ |
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h index 44d65c1533c0..f2aeedb6a579 100644 --- a/fs/xfs/xfs_ag.h +++ b/fs/xfs/xfs_ag.h | |||
@@ -108,6 +108,8 @@ typedef struct xfs_agf { | |||
108 | extern int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp, | 108 | extern int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp, |
109 | xfs_agnumber_t agno, int flags, struct xfs_buf **bpp); | 109 | xfs_agnumber_t agno, int flags, struct xfs_buf **bpp); |
110 | 110 | ||
111 | extern const struct xfs_buf_ops xfs_agf_buf_ops; | ||
112 | |||
111 | /* | 113 | /* |
112 | * Size of the unlinked inode hash table in the agi. | 114 | * Size of the unlinked inode hash table in the agi. |
113 | */ | 115 | */ |
@@ -161,6 +163,8 @@ typedef struct xfs_agi { | |||
161 | extern int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp, | 163 | extern int xfs_read_agi(struct xfs_mount *mp, struct xfs_trans *tp, |
162 | xfs_agnumber_t agno, struct xfs_buf **bpp); | 164 | xfs_agnumber_t agno, struct xfs_buf **bpp); |
163 | 165 | ||
166 | extern const struct xfs_buf_ops xfs_agi_buf_ops; | ||
167 | |||
164 | /* | 168 | /* |
165 | * The third a.g. block contains the a.g. freelist, an array | 169 | * The third a.g. block contains the a.g. freelist, an array |
166 | * of block pointers to blocks owned by the allocation btree code. | 170 | * of block pointers to blocks owned by the allocation btree code. |
@@ -233,6 +237,7 @@ typedef struct xfs_perag { | |||
233 | #define XFS_ICI_NO_TAG (-1) /* special flag for an untagged lookup | 237 | #define XFS_ICI_NO_TAG (-1) /* special flag for an untagged lookup |
234 | in xfs_inode_ag_iterator */ | 238 | in xfs_inode_ag_iterator */ |
235 | #define XFS_ICI_RECLAIM_TAG 0 /* inode is to be reclaimed */ | 239 | #define XFS_ICI_RECLAIM_TAG 0 /* inode is to be reclaimed */ |
240 | #define XFS_ICI_EOFBLOCKS_TAG 1 /* inode has blocks beyond EOF */ | ||
236 | 241 | ||
237 | #define XFS_AG_MAXLEVELS(mp) ((mp)->m_ag_maxlevels) | 242 | #define XFS_AG_MAXLEVELS(mp) ((mp)->m_ag_maxlevels) |
238 | #define XFS_MIN_FREELIST_RAW(bl,cl,mp) \ | 243 | #define XFS_MIN_FREELIST_RAW(bl,cl,mp) \ |
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index 335206a9c698..393055fe3aef 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c | |||
@@ -430,6 +430,60 @@ xfs_alloc_fixup_trees( | |||
430 | return 0; | 430 | return 0; |
431 | } | 431 | } |
432 | 432 | ||
433 | static void | ||
434 | xfs_agfl_verify( | ||
435 | struct xfs_buf *bp) | ||
436 | { | ||
437 | #ifdef WHEN_CRCS_COME_ALONG | ||
438 | /* | ||
439 | * we cannot actually do any verification of the AGFL because mkfs does | ||
440 | * not initialise the AGFL to zero or NULL. Hence the only valid part of | ||
441 | * the AGFL is what the AGF says is active. We can't get to the AGF, so | ||
442 | * we can't verify just those entries are valid. | ||
443 | * | ||
444 | * This problem goes away when the CRC format change comes along as that | ||
445 | * requires the AGFL to be initialised by mkfs. At that point, we can | ||
446 | * verify the blocks in the agfl -active or not- lie within the bounds | ||
447 | * of the AG. Until then, just leave this check ifdef'd out. | ||
448 | */ | ||
449 | struct xfs_mount *mp = bp->b_target->bt_mount; | ||
450 | struct xfs_agfl *agfl = XFS_BUF_TO_AGFL(bp); | ||
451 | int agfl_ok = 1; | ||
452 | |||
453 | int i; | ||
454 | |||
455 | for (i = 0; i < XFS_AGFL_SIZE(mp); i++) { | ||
456 | if (be32_to_cpu(agfl->agfl_bno[i]) == NULLAGBLOCK || | ||
457 | be32_to_cpu(agfl->agfl_bno[i]) >= mp->m_sb.sb_agblocks) | ||
458 | agfl_ok = 0; | ||
459 | } | ||
460 | |||
461 | if (!agfl_ok) { | ||
462 | XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agfl); | ||
463 | xfs_buf_ioerror(bp, EFSCORRUPTED); | ||
464 | } | ||
465 | #endif | ||
466 | } | ||
467 | |||
468 | static void | ||
469 | xfs_agfl_write_verify( | ||
470 | struct xfs_buf *bp) | ||
471 | { | ||
472 | xfs_agfl_verify(bp); | ||
473 | } | ||
474 | |||
475 | static void | ||
476 | xfs_agfl_read_verify( | ||
477 | struct xfs_buf *bp) | ||
478 | { | ||
479 | xfs_agfl_verify(bp); | ||
480 | } | ||
481 | |||
482 | const struct xfs_buf_ops xfs_agfl_buf_ops = { | ||
483 | .verify_read = xfs_agfl_read_verify, | ||
484 | .verify_write = xfs_agfl_write_verify, | ||
485 | }; | ||
486 | |||
433 | /* | 487 | /* |
434 | * Read in the allocation group free block array. | 488 | * Read in the allocation group free block array. |
435 | */ | 489 | */ |
@@ -447,7 +501,7 @@ xfs_alloc_read_agfl( | |||
447 | error = xfs_trans_read_buf( | 501 | error = xfs_trans_read_buf( |
448 | mp, tp, mp->m_ddev_targp, | 502 | mp, tp, mp->m_ddev_targp, |
449 | XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)), | 503 | XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)), |
450 | XFS_FSS_TO_BB(mp, 1), 0, &bp); | 504 | XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops); |
451 | if (error) | 505 | if (error) |
452 | return error; | 506 | return error; |
453 | ASSERT(!xfs_buf_geterror(bp)); | 507 | ASSERT(!xfs_buf_geterror(bp)); |
@@ -2091,6 +2145,63 @@ xfs_alloc_put_freelist( | |||
2091 | return 0; | 2145 | return 0; |
2092 | } | 2146 | } |
2093 | 2147 | ||
2148 | static void | ||
2149 | xfs_agf_verify( | ||
2150 | struct xfs_buf *bp) | ||
2151 | { | ||
2152 | struct xfs_mount *mp = bp->b_target->bt_mount; | ||
2153 | struct xfs_agf *agf; | ||
2154 | int agf_ok; | ||
2155 | |||
2156 | agf = XFS_BUF_TO_AGF(bp); | ||
2157 | |||
2158 | agf_ok = agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) && | ||
2159 | XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) && | ||
2160 | be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) && | ||
2161 | be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) && | ||
2162 | be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) && | ||
2163 | be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp); | ||
2164 | |||
2165 | /* | ||
2166 | * during growfs operations, the perag is not fully initialised, | ||
2167 | * so we can't use it for any useful checking. growfs ensures we can't | ||
2168 | * use it by using uncached buffers that don't have the perag attached | ||
2169 | * so we can detect and avoid this problem. | ||
2170 | */ | ||
2171 | if (bp->b_pag) | ||
2172 | agf_ok = agf_ok && be32_to_cpu(agf->agf_seqno) == | ||
2173 | bp->b_pag->pag_agno; | ||
2174 | |||
2175 | if (xfs_sb_version_haslazysbcount(&mp->m_sb)) | ||
2176 | agf_ok = agf_ok && be32_to_cpu(agf->agf_btreeblks) <= | ||
2177 | be32_to_cpu(agf->agf_length); | ||
2178 | |||
2179 | if (unlikely(XFS_TEST_ERROR(!agf_ok, mp, XFS_ERRTAG_ALLOC_READ_AGF, | ||
2180 | XFS_RANDOM_ALLOC_READ_AGF))) { | ||
2181 | XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agf); | ||
2182 | xfs_buf_ioerror(bp, EFSCORRUPTED); | ||
2183 | } | ||
2184 | } | ||
2185 | |||
2186 | static void | ||
2187 | xfs_agf_read_verify( | ||
2188 | struct xfs_buf *bp) | ||
2189 | { | ||
2190 | xfs_agf_verify(bp); | ||
2191 | } | ||
2192 | |||
2193 | static void | ||
2194 | xfs_agf_write_verify( | ||
2195 | struct xfs_buf *bp) | ||
2196 | { | ||
2197 | xfs_agf_verify(bp); | ||
2198 | } | ||
2199 | |||
2200 | const struct xfs_buf_ops xfs_agf_buf_ops = { | ||
2201 | .verify_read = xfs_agf_read_verify, | ||
2202 | .verify_write = xfs_agf_write_verify, | ||
2203 | }; | ||
2204 | |||
2094 | /* | 2205 | /* |
2095 | * Read in the allocation group header (free/alloc section). | 2206 | * Read in the allocation group header (free/alloc section). |
2096 | */ | 2207 | */ |
@@ -2102,44 +2213,19 @@ xfs_read_agf( | |||
2102 | int flags, /* XFS_BUF_ */ | 2213 | int flags, /* XFS_BUF_ */ |
2103 | struct xfs_buf **bpp) /* buffer for the ag freelist header */ | 2214 | struct xfs_buf **bpp) /* buffer for the ag freelist header */ |
2104 | { | 2215 | { |
2105 | struct xfs_agf *agf; /* ag freelist header */ | ||
2106 | int agf_ok; /* set if agf is consistent */ | ||
2107 | int error; | 2216 | int error; |
2108 | 2217 | ||
2109 | ASSERT(agno != NULLAGNUMBER); | 2218 | ASSERT(agno != NULLAGNUMBER); |
2110 | error = xfs_trans_read_buf( | 2219 | error = xfs_trans_read_buf( |
2111 | mp, tp, mp->m_ddev_targp, | 2220 | mp, tp, mp->m_ddev_targp, |
2112 | XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), | 2221 | XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), |
2113 | XFS_FSS_TO_BB(mp, 1), flags, bpp); | 2222 | XFS_FSS_TO_BB(mp, 1), flags, bpp, &xfs_agf_buf_ops); |
2114 | if (error) | 2223 | if (error) |
2115 | return error; | 2224 | return error; |
2116 | if (!*bpp) | 2225 | if (!*bpp) |
2117 | return 0; | 2226 | return 0; |
2118 | 2227 | ||
2119 | ASSERT(!(*bpp)->b_error); | 2228 | ASSERT(!(*bpp)->b_error); |
2120 | agf = XFS_BUF_TO_AGF(*bpp); | ||
2121 | |||
2122 | /* | ||
2123 | * Validate the magic number of the agf block. | ||
2124 | */ | ||
2125 | agf_ok = | ||
2126 | agf->agf_magicnum == cpu_to_be32(XFS_AGF_MAGIC) && | ||
2127 | XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) && | ||
2128 | be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) && | ||
2129 | be32_to_cpu(agf->agf_flfirst) < XFS_AGFL_SIZE(mp) && | ||
2130 | be32_to_cpu(agf->agf_fllast) < XFS_AGFL_SIZE(mp) && | ||
2131 | be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp) && | ||
2132 | be32_to_cpu(agf->agf_seqno) == agno; | ||
2133 | if (xfs_sb_version_haslazysbcount(&mp->m_sb)) | ||
2134 | agf_ok = agf_ok && be32_to_cpu(agf->agf_btreeblks) <= | ||
2135 | be32_to_cpu(agf->agf_length); | ||
2136 | if (unlikely(XFS_TEST_ERROR(!agf_ok, mp, XFS_ERRTAG_ALLOC_READ_AGF, | ||
2137 | XFS_RANDOM_ALLOC_READ_AGF))) { | ||
2138 | XFS_CORRUPTION_ERROR("xfs_alloc_read_agf", | ||
2139 | XFS_ERRLEVEL_LOW, mp, agf); | ||
2140 | xfs_trans_brelse(tp, *bpp); | ||
2141 | return XFS_ERROR(EFSCORRUPTED); | ||
2142 | } | ||
2143 | xfs_buf_set_ref(*bpp, XFS_AGF_REF); | 2229 | xfs_buf_set_ref(*bpp, XFS_AGF_REF); |
2144 | return 0; | 2230 | return 0; |
2145 | } | 2231 | } |
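
These XFS hunks are part of a series that moves metadata validation out of the read paths (compare the deleted checks in xfs_read_agf()) and into per-buffer-type xfs_buf_ops tables whose verify_read/verify_write callbacks run on every I/O. A compact userspace sketch of the shape of that pattern, with simplified types and an invented error value:

#include <stdint.h>
#include <stdio.h>

#define AGF_MAGIC 0x58414746u	/* "XAGF" */

struct buf_ops;

struct buf {
	uint32_t magic;
	int error;
	const struct buf_ops *ops;
};

struct buf_ops {
	void (*verify_read)(struct buf *bp);
	void (*verify_write)(struct buf *bp);
};

static void agf_verify(struct buf *bp)
{
	if (bp->magic != AGF_MAGIC)
		bp->error = -117;	/* stand-in for -EFSCORRUPTED */
}

static const struct buf_ops agf_buf_ops = {
	.verify_read  = agf_verify,
	.verify_write = agf_verify,
};

/* Generic read path: verification happens here, once, for all callers. */
static int read_buf(struct buf *bp, uint32_t on_disk_magic,
		    const struct buf_ops *ops)
{
	bp->magic = on_disk_magic;	/* pretend we read it from disk */
	bp->error = 0;
	bp->ops = ops;			/* kept for later write verification */
	if (ops && ops->verify_read)
		ops->verify_read(bp);
	return bp->error;
}

int main(void)
{
	struct buf bp;

	printf("good: %d\n", read_buf(&bp, AGF_MAGIC, &agf_buf_ops));
	printf("bad:  %d\n", read_buf(&bp, 0xdeadbeef, &agf_buf_ops));
	return 0;
}
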
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h index feacb061bab7..99d0a6101558 100644 --- a/fs/xfs/xfs_alloc.h +++ b/fs/xfs/xfs_alloc.h | |||
@@ -231,4 +231,7 @@ xfs_alloc_get_rec( | |||
231 | xfs_extlen_t *len, /* output: length of extent */ | 231 | xfs_extlen_t *len, /* output: length of extent */ |
232 | int *stat); /* output: success/failure */ | 232 | int *stat); /* output: success/failure */ |
233 | 233 | ||
234 | extern const struct xfs_buf_ops xfs_agf_buf_ops; | ||
235 | extern const struct xfs_buf_ops xfs_agfl_buf_ops; | ||
236 | |||
234 | #endif /* __XFS_ALLOC_H__ */ | 237 | #endif /* __XFS_ALLOC_H__ */ |
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c index f7876c6d6165..b1ddef6b2689 100644 --- a/fs/xfs/xfs_alloc_btree.c +++ b/fs/xfs/xfs_alloc_btree.c | |||
@@ -272,6 +272,82 @@ xfs_allocbt_key_diff( | |||
272 | return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock; | 272 | return (__int64_t)be32_to_cpu(kp->ar_startblock) - rec->ar_startblock; |
273 | } | 273 | } |
274 | 274 | ||
275 | static void | ||
276 | xfs_allocbt_verify( | ||
277 | struct xfs_buf *bp) | ||
278 | { | ||
279 | struct xfs_mount *mp = bp->b_target->bt_mount; | ||
280 | struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); | ||
281 | struct xfs_perag *pag = bp->b_pag; | ||
282 | unsigned int level; | ||
283 | int sblock_ok; /* block passes checks */ | ||
284 | |||
285 | /* | ||
286 | * magic number and level verification | ||
287 | * | ||
288 | * During growfs operations, we can't verify the exact level as the | ||
289 | * perag is not fully initialised and hence not attached to the buffer. | ||
290 | * In this case, check against the maximum tree depth. | ||
291 | */ | ||
292 | level = be16_to_cpu(block->bb_level); | ||
293 | switch (block->bb_magic) { | ||
294 | case cpu_to_be32(XFS_ABTB_MAGIC): | ||
295 | if (pag) | ||
296 | sblock_ok = level < pag->pagf_levels[XFS_BTNUM_BNOi]; | ||
297 | else | ||
298 | sblock_ok = level < mp->m_ag_maxlevels; | ||
299 | break; | ||
300 | case cpu_to_be32(XFS_ABTC_MAGIC): | ||
301 | if (pag) | ||
302 | sblock_ok = level < pag->pagf_levels[XFS_BTNUM_CNTi]; | ||
303 | else | ||
304 | sblock_ok = level < mp->m_ag_maxlevels; | ||
305 | break; | ||
306 | default: | ||
307 | sblock_ok = 0; | ||
308 | break; | ||
309 | } | ||
310 | |||
311 | /* numrecs verification */ | ||
312 | sblock_ok = sblock_ok && | ||
313 | be16_to_cpu(block->bb_numrecs) <= mp->m_alloc_mxr[level != 0]; | ||
314 | |||
315 | /* sibling pointer verification */ | ||
316 | sblock_ok = sblock_ok && | ||
317 | (block->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) || | ||
318 | be32_to_cpu(block->bb_u.s.bb_leftsib) < mp->m_sb.sb_agblocks) && | ||
319 | block->bb_u.s.bb_leftsib && | ||
320 | (block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) || | ||
321 | be32_to_cpu(block->bb_u.s.bb_rightsib) < mp->m_sb.sb_agblocks) && | ||
322 | block->bb_u.s.bb_rightsib; | ||
323 | |||
324 | if (!sblock_ok) { | ||
325 | trace_xfs_btree_corrupt(bp, _RET_IP_); | ||
326 | XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, block); | ||
327 | xfs_buf_ioerror(bp, EFSCORRUPTED); | ||
328 | } | ||
329 | } | ||
330 | |||
331 | static void | ||
332 | xfs_allocbt_read_verify( | ||
333 | struct xfs_buf *bp) | ||
334 | { | ||
335 | xfs_allocbt_verify(bp); | ||
336 | } | ||
337 | |||
338 | static void | ||
339 | xfs_allocbt_write_verify( | ||
340 | struct xfs_buf *bp) | ||
341 | { | ||
342 | xfs_allocbt_verify(bp); | ||
343 | } | ||
344 | |||
345 | const struct xfs_buf_ops xfs_allocbt_buf_ops = { | ||
346 | .verify_read = xfs_allocbt_read_verify, | ||
347 | .verify_write = xfs_allocbt_write_verify, | ||
348 | }; | ||
349 | |||
350 | |||
275 | #ifdef DEBUG | 351 | #ifdef DEBUG |
276 | STATIC int | 352 | STATIC int |
277 | xfs_allocbt_keys_inorder( | 353 | xfs_allocbt_keys_inorder( |
@@ -327,6 +403,7 @@ static const struct xfs_btree_ops xfs_allocbt_ops = { | |||
327 | .init_rec_from_cur = xfs_allocbt_init_rec_from_cur, | 403 | .init_rec_from_cur = xfs_allocbt_init_rec_from_cur, |
328 | .init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur, | 404 | .init_ptr_from_cur = xfs_allocbt_init_ptr_from_cur, |
329 | .key_diff = xfs_allocbt_key_diff, | 405 | .key_diff = xfs_allocbt_key_diff, |
406 | .buf_ops = &xfs_allocbt_buf_ops, | ||
330 | #ifdef DEBUG | 407 | #ifdef DEBUG |
331 | .keys_inorder = xfs_allocbt_keys_inorder, | 408 | .keys_inorder = xfs_allocbt_keys_inorder, |
332 | .recs_inorder = xfs_allocbt_recs_inorder, | 409 | .recs_inorder = xfs_allocbt_recs_inorder, |
diff --git a/fs/xfs/xfs_alloc_btree.h b/fs/xfs/xfs_alloc_btree.h index 359fb86ed876..7e89a2b429dd 100644 --- a/fs/xfs/xfs_alloc_btree.h +++ b/fs/xfs/xfs_alloc_btree.h | |||
@@ -93,4 +93,6 @@ extern struct xfs_btree_cur *xfs_allocbt_init_cursor(struct xfs_mount *, | |||
93 | xfs_agnumber_t, xfs_btnum_t); | 93 | xfs_agnumber_t, xfs_btnum_t); |
94 | extern int xfs_allocbt_maxrecs(struct xfs_mount *, int, int); | 94 | extern int xfs_allocbt_maxrecs(struct xfs_mount *, int, int); |
95 | 95 | ||
96 | extern const struct xfs_buf_ops xfs_allocbt_buf_ops; | ||
97 | |||
96 | #endif /* __XFS_ALLOC_BTREE_H__ */ | 98 | #endif /* __XFS_ALLOC_BTREE_H__ */ |
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index e562dd43f41f..4111a40ebe1a 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c | |||
@@ -124,7 +124,7 @@ xfs_setfilesize_trans_alloc( | |||
124 | ioend->io_append_trans = tp; | 124 | ioend->io_append_trans = tp; |
125 | 125 | ||
126 | /* | 126 | /* |
127 | * We will pass freeze protection with a transaction. So tell lockdep | 127 | * We may pass freeze protection with a transaction. So tell lockdep |
128 | * we released it. | 128 | * we released it. |
129 | */ | 129 | */ |
130 | rwsem_release(&ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], | 130 | rwsem_release(&ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], |
@@ -149,11 +149,13 @@ xfs_setfilesize( | |||
149 | xfs_fsize_t isize; | 149 | xfs_fsize_t isize; |
150 | 150 | ||
151 | /* | 151 | /* |
152 | * The transaction was allocated in the I/O submission thread, | 152 | * The transaction may have been allocated in the I/O submission thread, |
153 | * thus we need to mark ourselves as beeing in a transaction | 153 | * thus we need to mark ourselves as beeing in a transaction manually. |
154 | * manually. | 154 | * Similarly for freeze protection. |
155 | */ | 155 | */ |
156 | current_set_flags_nested(&tp->t_pflags, PF_FSTRANS); | 156 | current_set_flags_nested(&tp->t_pflags, PF_FSTRANS); |
157 | rwsem_acquire_read(&VFS_I(ip)->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], | ||
158 | 0, 1, _THIS_IP_); | ||
157 | 159 | ||
158 | xfs_ilock(ip, XFS_ILOCK_EXCL); | 160 | xfs_ilock(ip, XFS_ILOCK_EXCL); |
159 | isize = xfs_new_eof(ip, ioend->io_offset + ioend->io_size); | 161 | isize = xfs_new_eof(ip, ioend->io_offset + ioend->io_size); |
@@ -187,7 +189,8 @@ xfs_finish_ioend( | |||
187 | 189 | ||
188 | if (ioend->io_type == XFS_IO_UNWRITTEN) | 190 | if (ioend->io_type == XFS_IO_UNWRITTEN) |
189 | queue_work(mp->m_unwritten_workqueue, &ioend->io_work); | 191 | queue_work(mp->m_unwritten_workqueue, &ioend->io_work); |
190 | else if (ioend->io_append_trans) | 192 | else if (ioend->io_append_trans || |
193 | (ioend->io_isdirect && xfs_ioend_is_append(ioend))) | ||
191 | queue_work(mp->m_data_workqueue, &ioend->io_work); | 194 | queue_work(mp->m_data_workqueue, &ioend->io_work); |
192 | else | 195 | else |
193 | xfs_destroy_ioend(ioend); | 196 | xfs_destroy_ioend(ioend); |
@@ -205,15 +208,6 @@ xfs_end_io( | |||
205 | struct xfs_inode *ip = XFS_I(ioend->io_inode); | 208 | struct xfs_inode *ip = XFS_I(ioend->io_inode); |
206 | int error = 0; | 209 | int error = 0; |
207 | 210 | ||
208 | if (ioend->io_append_trans) { | ||
209 | /* | ||
210 | * We've got freeze protection passed with the transaction. | ||
211 | * Tell lockdep about it. | ||
212 | */ | ||
213 | rwsem_acquire_read( | ||
214 | &ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], | ||
215 | 0, 1, _THIS_IP_); | ||
216 | } | ||
217 | if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { | 211 | if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { |
218 | ioend->io_error = -EIO; | 212 | ioend->io_error = -EIO; |
219 | goto done; | 213 | goto done; |
@@ -226,35 +220,31 @@ xfs_end_io( | |||
226 | * range to normal written extents after the data I/O has finished. | 220 | * range to normal written extents after the data I/O has finished. |
227 | */ | 221 | */ |
228 | if (ioend->io_type == XFS_IO_UNWRITTEN) { | 222 | if (ioend->io_type == XFS_IO_UNWRITTEN) { |
223 | error = xfs_iomap_write_unwritten(ip, ioend->io_offset, | ||
224 | ioend->io_size); | ||
225 | } else if (ioend->io_isdirect && xfs_ioend_is_append(ioend)) { | ||
229 | /* | 226 | /* |
230 | * For buffered I/O we never preallocate a transaction when | 227 | * For direct I/O we do not know if we need to allocate blocks |
231 | * doing the unwritten extent conversion, but for direct I/O | 228 | * or not so we can't preallocate an append transaction as that |
232 | * we do not know if we are converting an unwritten extent | 229 | * results in nested reservations and log space deadlocks. Hence |
233 | * or not at the point where we preallocate the transaction. | 230 | * allocate the transaction here. While this is sub-optimal and |
231 | * can block IO completion for some time, we're stuck with doing | ||
232 | * it this way until we can pass the ioend to the direct IO | ||
233 | * allocation callbacks and avoid nesting that way. | ||
234 | */ | 234 | */ |
235 | if (ioend->io_append_trans) { | 235 | error = xfs_setfilesize_trans_alloc(ioend); |
236 | ASSERT(ioend->io_isdirect); | 236 | if (error) |
237 | |||
238 | current_set_flags_nested( | ||
239 | &ioend->io_append_trans->t_pflags, PF_FSTRANS); | ||
240 | xfs_trans_cancel(ioend->io_append_trans, 0); | ||
241 | } | ||
242 | |||
243 | error = xfs_iomap_write_unwritten(ip, ioend->io_offset, | ||
244 | ioend->io_size); | ||
245 | if (error) { | ||
246 | ioend->io_error = -error; | ||
247 | goto done; | 237 | goto done; |
248 | } | 238 | error = xfs_setfilesize(ioend); |
249 | } else if (ioend->io_append_trans) { | 239 | } else if (ioend->io_append_trans) { |
250 | error = xfs_setfilesize(ioend); | 240 | error = xfs_setfilesize(ioend); |
251 | if (error) | ||
252 | ioend->io_error = -error; | ||
253 | } else { | 241 | } else { |
254 | ASSERT(!xfs_ioend_is_append(ioend)); | 242 | ASSERT(!xfs_ioend_is_append(ioend)); |
255 | } | 243 | } |
256 | 244 | ||
257 | done: | 245 | done: |
246 | if (error) | ||
247 | ioend->io_error = -error; | ||
258 | xfs_destroy_ioend(ioend); | 248 | xfs_destroy_ioend(ioend); |
259 | } | 249 | } |
260 | 250 | ||
@@ -481,11 +471,17 @@ static inline int bio_add_buffer(struct bio *bio, struct buffer_head *bh) | |||
481 | * | 471 | * |
482 | * The fix is two passes across the ioend list - one to start writeback on the | 472 | * The fix is two passes across the ioend list - one to start writeback on the |
483 | * buffer_heads, and then submit them for I/O on the second pass. | 473 | * buffer_heads, and then submit them for I/O on the second pass. |
474 | * | ||
475 | * If @fail is non-zero, it means that we have a situation where some part of | ||
476 | * the submission process has failed after we have marked paged for writeback | ||
477 | * and unlocked them. In this situation, we need to fail the ioend chain rather | ||
478 | * than submit it to IO. This typically only happens on a filesystem shutdown. | ||
484 | */ | 479 | */ |
485 | STATIC void | 480 | STATIC void |
486 | xfs_submit_ioend( | 481 | xfs_submit_ioend( |
487 | struct writeback_control *wbc, | 482 | struct writeback_control *wbc, |
488 | xfs_ioend_t *ioend) | 483 | xfs_ioend_t *ioend, |
484 | int fail) | ||
489 | { | 485 | { |
490 | xfs_ioend_t *head = ioend; | 486 | xfs_ioend_t *head = ioend; |
491 | xfs_ioend_t *next; | 487 | xfs_ioend_t *next; |
@@ -506,6 +502,18 @@ xfs_submit_ioend( | |||
506 | next = ioend->io_list; | 502 | next = ioend->io_list; |
507 | bio = NULL; | 503 | bio = NULL; |
508 | 504 | ||
505 | /* | ||
506 | * If we are failing the IO now, just mark the ioend with an | ||
507 | * error and finish it. This will run IO completion immediately | ||
508 | * as there is only one reference to the ioend at this point in | ||
509 | * time. | ||
510 | */ | ||
511 | if (fail) { | ||
512 | ioend->io_error = -fail; | ||
513 | xfs_finish_ioend(ioend); | ||
514 | continue; | ||
515 | } | ||
516 | |||
509 | for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) { | 517 | for (bh = ioend->io_buffer_head; bh; bh = bh->b_private) { |
510 | 518 | ||
511 | if (!bio) { | 519 | if (!bio) { |
@@ -1060,7 +1068,18 @@ xfs_vm_writepage( | |||
1060 | 1068 | ||
1061 | xfs_start_page_writeback(page, 1, count); | 1069 | xfs_start_page_writeback(page, 1, count); |
1062 | 1070 | ||
1063 | if (ioend && imap_valid) { | 1071 | /* if there is no IO to be submitted for this page, we are done */ |
1072 | if (!ioend) | ||
1073 | return 0; | ||
1074 | |||
1075 | ASSERT(iohead); | ||
1076 | |||
1077 | /* | ||
1078 | * Any errors from this point onwards need to be reported through the IO | ||
1079 | * completion path as we have marked the initial page as under writeback | ||
1080 | * and unlocked it. | ||
1081 | */ | ||
1082 | if (imap_valid) { | ||
1064 | xfs_off_t end_index; | 1083 | xfs_off_t end_index; |
1065 | 1084 | ||
1066 | end_index = imap.br_startoff + imap.br_blockcount; | 1085 | end_index = imap.br_startoff + imap.br_blockcount; |
@@ -1079,20 +1098,15 @@ xfs_vm_writepage( | |||
1079 | wbc, end_index); | 1098 | wbc, end_index); |
1080 | } | 1099 | } |
1081 | 1100 | ||
1082 | if (iohead) { | ||
1083 | /* | ||
1084 | * Reserve log space if we might write beyond the on-disk | ||
1085 | * inode size. | ||
1086 | */ | ||
1087 | if (ioend->io_type != XFS_IO_UNWRITTEN && | ||
1088 | xfs_ioend_is_append(ioend)) { | ||
1089 | err = xfs_setfilesize_trans_alloc(ioend); | ||
1090 | if (err) | ||
1091 | goto error; | ||
1092 | } | ||
1093 | 1101 | ||
1094 | xfs_submit_ioend(wbc, iohead); | 1102 | /* |
1095 | } | 1103 | * Reserve log space if we might write beyond the on-disk inode size. |
1104 | */ | ||
1105 | err = 0; | ||
1106 | if (ioend->io_type != XFS_IO_UNWRITTEN && xfs_ioend_is_append(ioend)) | ||
1107 | err = xfs_setfilesize_trans_alloc(ioend); | ||
1108 | |||
1109 | xfs_submit_ioend(wbc, iohead, err); | ||
1096 | 1110 | ||
1097 | return 0; | 1111 | return 0; |
1098 | 1112 | ||
@@ -1408,25 +1422,21 @@ xfs_vm_direct_IO( | |||
1408 | size_t size = iov_length(iov, nr_segs); | 1422 | size_t size = iov_length(iov, nr_segs); |
1409 | 1423 | ||
1410 | /* | 1424 | /* |
1411 | * We need to preallocate a transaction for a size update | 1425 | * We cannot preallocate a size update transaction here as we |
1412 | * here. In the case that this write both updates the size | 1426 | * don't know whether allocation is necessary or not. Hence we |
1413 | * and converts at least on unwritten extent we will cancel | 1427 | * can only tell IO completion that one is necessary if we are |
1414 | * the still clean transaction after the I/O has finished. | 1428 | * not doing unwritten extent conversion. |
1415 | */ | 1429 | */ |
1416 | iocb->private = ioend = xfs_alloc_ioend(inode, XFS_IO_DIRECT); | 1430 | iocb->private = ioend = xfs_alloc_ioend(inode, XFS_IO_DIRECT); |
1417 | if (offset + size > XFS_I(inode)->i_d.di_size) { | 1431 | if (offset + size > XFS_I(inode)->i_d.di_size) |
1418 | ret = xfs_setfilesize_trans_alloc(ioend); | ||
1419 | if (ret) | ||
1420 | goto out_destroy_ioend; | ||
1421 | ioend->io_isdirect = 1; | 1432 | ioend->io_isdirect = 1; |
1422 | } | ||
1423 | 1433 | ||
1424 | ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov, | 1434 | ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov, |
1425 | offset, nr_segs, | 1435 | offset, nr_segs, |
1426 | xfs_get_blocks_direct, | 1436 | xfs_get_blocks_direct, |
1427 | xfs_end_io_direct_write, NULL, 0); | 1437 | xfs_end_io_direct_write, NULL, 0); |
1428 | if (ret != -EIOCBQUEUED && iocb->private) | 1438 | if (ret != -EIOCBQUEUED && iocb->private) |
1429 | goto out_trans_cancel; | 1439 | goto out_destroy_ioend; |
1430 | } else { | 1440 | } else { |
1431 | ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov, | 1441 | ret = __blockdev_direct_IO(rw, iocb, inode, bdev, iov, |
1432 | offset, nr_segs, | 1442 | offset, nr_segs, |
@@ -1436,15 +1446,6 @@ xfs_vm_direct_IO( | |||
1436 | 1446 | ||
1437 | return ret; | 1447 | return ret; |
1438 | 1448 | ||
1439 | out_trans_cancel: | ||
1440 | if (ioend->io_append_trans) { | ||
1441 | current_set_flags_nested(&ioend->io_append_trans->t_pflags, | ||
1442 | PF_FSTRANS); | ||
1443 | rwsem_acquire_read( | ||
1444 | &inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], | ||
1445 | 0, 1, _THIS_IP_); | ||
1446 | xfs_trans_cancel(ioend->io_append_trans, 0); | ||
1447 | } | ||
1448 | out_destroy_ioend: | 1449 | out_destroy_ioend: |
1449 | xfs_destroy_ioend(ioend); | 1450 | xfs_destroy_ioend(ioend); |
1450 | return ret; | 1451 | return ret; |
@@ -1617,7 +1618,7 @@ xfs_vm_bmap( | |||
1617 | 1618 | ||
1618 | trace_xfs_vm_bmap(XFS_I(inode)); | 1619 | trace_xfs_vm_bmap(XFS_I(inode)); |
1619 | xfs_ilock(ip, XFS_IOLOCK_SHARED); | 1620 | xfs_ilock(ip, XFS_IOLOCK_SHARED); |
1620 | xfs_flush_pages(ip, (xfs_off_t)0, -1, 0, FI_REMAPF); | 1621 | filemap_write_and_wait(mapping); |
1621 | xfs_iunlock(ip, XFS_IOLOCK_SHARED); | 1622 | xfs_iunlock(ip, XFS_IOLOCK_SHARED); |
1622 | return generic_block_bmap(mapping, block, xfs_get_blocks); | 1623 | return generic_block_bmap(mapping, block, xfs_get_blocks); |
1623 | } | 1624 | } |
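
The xfs_submit_ioend() change above reflects a general constraint: once pages are marked for writeback and unlocked, a later setup failure (for example the log reservation for the size update) can no longer be returned to the caller, so it has to travel through IO completion instead. A sketch of that chain-failing logic, not the kernel function:

#include <stddef.h>
#include <stdio.h>

struct ioend {
	struct ioend *next;
	int error;
};

static void finish_ioend(struct ioend *io)
{
	printf("complete ioend, error=%d\n", io->error);
}

static void submit_io(struct ioend *io)
{
	printf("submit ioend %p for IO\n", (void *)io);
}

static void submit_ioend_chain(struct ioend *head, int fail)
{
	struct ioend *io, *next;

	for (io = head; io; io = next) {
		next = io->next;
		if (fail) {
			/* Report through IO completion, not a return code. */
			io->error = -fail;
			finish_ioend(io);
			continue;
		}
		submit_io(io);
	}
}

int main(void)
{
	struct ioend b = { NULL, 0 }, a = { &b, 0 };

	submit_ioend_chain(&a, 0);	/* normal submission */
	submit_ioend_chain(&a, 5);	/* e.g. reservation failed: EIO-style fail */
	return 0;
}
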
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index 0ca1f0be62d2..aaf472532b3c 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c | |||
@@ -903,11 +903,9 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) | |||
903 | */ | 903 | */ |
904 | dp = args->dp; | 904 | dp = args->dp; |
905 | args->blkno = 0; | 905 | args->blkno = 0; |
906 | error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, | 906 | error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); |
907 | XFS_ATTR_FORK); | ||
908 | if (error) | 907 | if (error) |
909 | return(error); | 908 | return error; |
910 | ASSERT(bp != NULL); | ||
911 | 909 | ||
912 | /* | 910 | /* |
913 | * Look up the given attribute in the leaf block. Figure out if | 911 | * Look up the given attribute in the leaf block. Figure out if |
@@ -1031,12 +1029,12 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) | |||
1031 | * Read in the block containing the "old" attr, then | 1029 | * Read in the block containing the "old" attr, then |
1032 | * remove the "old" attr from that block (neat, huh!) | 1030 | * remove the "old" attr from that block (neat, huh!) |
1033 | */ | 1031 | */ |
1034 | error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, | 1032 | error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, |
1035 | &bp, XFS_ATTR_FORK); | 1033 | -1, &bp); |
1036 | if (error) | 1034 | if (error) |
1037 | return(error); | 1035 | return error; |
1038 | ASSERT(bp != NULL); | 1036 | |
1039 | (void)xfs_attr_leaf_remove(bp, args); | 1037 | xfs_attr_leaf_remove(bp, args); |
1040 | 1038 | ||
1041 | /* | 1039 | /* |
1042 | * If the result is small enough, shrink it all into the inode. | 1040 | * If the result is small enough, shrink it all into the inode. |
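
From the call sites' point of view, these attr conversions replace the xfs_da_read_buf() plus magic-number ASSERT pairs with a single xfs_attr_leaf_read() that verifies as it reads, turning silent debug-only checks into real error returns. A tiny illustration of that wrapper shape (invented names, with EIO standing in for EFSCORRUPTED):

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

#define ATTR_LEAF_MAGIC 0xfbee

struct leaf_buf {
	uint16_t magic;
};

static int raw_read(struct leaf_buf *bp, uint16_t on_disk)
{
	bp->magic = on_disk;	/* stand-in for the disk read */
	return 0;
}

/* The helper: one place that knows what a valid attr leaf looks like. */
static int attr_leaf_read(struct leaf_buf *bp, uint16_t on_disk)
{
	int error = raw_read(bp, on_disk);

	if (error)
		return error;
	if (bp->magic != ATTR_LEAF_MAGIC)
		return -EIO;	/* caller gets an error, not an ASSERT */
	return 0;
}

int main(void)
{
	struct leaf_buf bp;

	printf("%d\n", attr_leaf_read(&bp, ATTR_LEAF_MAGIC));	/* 0 */
	printf("%d\n", attr_leaf_read(&bp, 0x1234));		/* -EIO */
	return 0;
}
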
@@ -1100,20 +1098,17 @@ xfs_attr_leaf_removename(xfs_da_args_t *args) | |||
1100 | */ | 1098 | */ |
1101 | dp = args->dp; | 1099 | dp = args->dp; |
1102 | args->blkno = 0; | 1100 | args->blkno = 0; |
1103 | error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, | 1101 | error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); |
1104 | XFS_ATTR_FORK); | 1102 | if (error) |
1105 | if (error) { | 1103 | return error; |
1106 | return(error); | ||
1107 | } | ||
1108 | 1104 | ||
1109 | ASSERT(bp != NULL); | ||
1110 | error = xfs_attr_leaf_lookup_int(bp, args); | 1105 | error = xfs_attr_leaf_lookup_int(bp, args); |
1111 | if (error == ENOATTR) { | 1106 | if (error == ENOATTR) { |
1112 | xfs_trans_brelse(args->trans, bp); | 1107 | xfs_trans_brelse(args->trans, bp); |
1113 | return(error); | 1108 | return(error); |
1114 | } | 1109 | } |
1115 | 1110 | ||
1116 | (void)xfs_attr_leaf_remove(bp, args); | 1111 | xfs_attr_leaf_remove(bp, args); |
1117 | 1112 | ||
1118 | /* | 1113 | /* |
1119 | * If the result is small enough, shrink it all into the inode. | 1114 | * If the result is small enough, shrink it all into the inode. |
@@ -1155,12 +1150,12 @@ xfs_attr_leaf_get(xfs_da_args_t *args) | |||
1155 | struct xfs_buf *bp; | 1150 | struct xfs_buf *bp; |
1156 | int error; | 1151 | int error; |
1157 | 1152 | ||
1153 | trace_xfs_attr_leaf_get(args); | ||
1154 | |||
1158 | args->blkno = 0; | 1155 | args->blkno = 0; |
1159 | error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, | 1156 | error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); |
1160 | XFS_ATTR_FORK); | ||
1161 | if (error) | 1157 | if (error) |
1162 | return(error); | 1158 | return error; |
1163 | ASSERT(bp != NULL); | ||
1164 | 1159 | ||
1165 | error = xfs_attr_leaf_lookup_int(bp, args); | 1160 | error = xfs_attr_leaf_lookup_int(bp, args); |
1166 | if (error != EEXIST) { | 1161 | if (error != EEXIST) { |
@@ -1181,22 +1176,15 @@ xfs_attr_leaf_get(xfs_da_args_t *args) | |||
1181 | STATIC int | 1176 | STATIC int |
1182 | xfs_attr_leaf_list(xfs_attr_list_context_t *context) | 1177 | xfs_attr_leaf_list(xfs_attr_list_context_t *context) |
1183 | { | 1178 | { |
1184 | xfs_attr_leafblock_t *leaf; | ||
1185 | int error; | 1179 | int error; |
1186 | struct xfs_buf *bp; | 1180 | struct xfs_buf *bp; |
1187 | 1181 | ||
1182 | trace_xfs_attr_leaf_list(context); | ||
1183 | |||
1188 | context->cursor->blkno = 0; | 1184 | context->cursor->blkno = 0; |
1189 | error = xfs_da_read_buf(NULL, context->dp, 0, -1, &bp, XFS_ATTR_FORK); | 1185 | error = xfs_attr_leaf_read(NULL, context->dp, 0, -1, &bp); |
1190 | if (error) | 1186 | if (error) |
1191 | return XFS_ERROR(error); | 1187 | return XFS_ERROR(error); |
1192 | ASSERT(bp != NULL); | ||
1193 | leaf = bp->b_addr; | ||
1194 | if (unlikely(leaf->hdr.info.magic != cpu_to_be16(XFS_ATTR_LEAF_MAGIC))) { | ||
1195 | XFS_CORRUPTION_ERROR("xfs_attr_leaf_list", XFS_ERRLEVEL_LOW, | ||
1196 | context->dp->i_mount, leaf); | ||
1197 | xfs_trans_brelse(NULL, bp); | ||
1198 | return XFS_ERROR(EFSCORRUPTED); | ||
1199 | } | ||
1200 | 1188 | ||
1201 | error = xfs_attr_leaf_list_int(bp, context); | 1189 | error = xfs_attr_leaf_list_int(bp, context); |
1202 | xfs_trans_brelse(NULL, bp); | 1190 | xfs_trans_brelse(NULL, bp); |
@@ -1600,12 +1588,9 @@ xfs_attr_node_removename(xfs_da_args_t *args) | |||
1600 | ASSERT(state->path.blk[0].bp); | 1588 | ASSERT(state->path.blk[0].bp); |
1601 | state->path.blk[0].bp = NULL; | 1589 | state->path.blk[0].bp = NULL; |
1602 | 1590 | ||
1603 | error = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp, | 1591 | error = xfs_attr_leaf_read(args->trans, args->dp, 0, -1, &bp); |
1604 | XFS_ATTR_FORK); | ||
1605 | if (error) | 1592 | if (error) |
1606 | goto out; | 1593 | goto out; |
1607 | ASSERT((((xfs_attr_leafblock_t *)bp->b_addr)->hdr.info.magic) == | ||
1608 | cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); | ||
1609 | 1594 | ||
1610 | if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) { | 1595 | if ((forkoff = xfs_attr_shortform_allfit(bp, dp))) { |
1611 | xfs_bmap_init(args->flist, args->firstblock); | 1596 | xfs_bmap_init(args->flist, args->firstblock); |
@@ -1653,6 +1638,8 @@ xfs_attr_fillstate(xfs_da_state_t *state) | |||
1653 | xfs_da_state_blk_t *blk; | 1638 | xfs_da_state_blk_t *blk; |
1654 | int level; | 1639 | int level; |
1655 | 1640 | ||
1641 | trace_xfs_attr_fillstate(state->args); | ||
1642 | |||
1656 | /* | 1643 | /* |
1657 | * Roll down the "path" in the state structure, storing the on-disk | 1644 | * Roll down the "path" in the state structure, storing the on-disk |
1658 | * block number for those buffers in the "path". | 1645 | * block number for those buffers in the "path". |
@@ -1699,6 +1686,8 @@ xfs_attr_refillstate(xfs_da_state_t *state) | |||
1699 | xfs_da_state_blk_t *blk; | 1686 | xfs_da_state_blk_t *blk; |
1700 | int level, error; | 1687 | int level, error; |
1701 | 1688 | ||
1689 | trace_xfs_attr_refillstate(state->args); | ||
1690 | |||
1702 | /* | 1691 | /* |
1703 | * Roll down the "path" in the state structure, storing the on-disk | 1692 | * Roll down the "path" in the state structure, storing the on-disk |
1704 | * block number for those buffers in the "path". | 1693 | * block number for those buffers in the "path". |
@@ -1707,7 +1696,7 @@ xfs_attr_refillstate(xfs_da_state_t *state) | |||
1707 | ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); | 1696 | ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); |
1708 | for (blk = path->blk, level = 0; level < path->active; blk++, level++) { | 1697 | for (blk = path->blk, level = 0; level < path->active; blk++, level++) { |
1709 | if (blk->disk_blkno) { | 1698 | if (blk->disk_blkno) { |
1710 | error = xfs_da_read_buf(state->args->trans, | 1699 | error = xfs_da_node_read(state->args->trans, |
1711 | state->args->dp, | 1700 | state->args->dp, |
1712 | blk->blkno, blk->disk_blkno, | 1701 | blk->blkno, blk->disk_blkno, |
1713 | &blk->bp, XFS_ATTR_FORK); | 1702 | &blk->bp, XFS_ATTR_FORK); |
@@ -1726,7 +1715,7 @@ xfs_attr_refillstate(xfs_da_state_t *state) | |||
1726 | ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); | 1715 | ASSERT((path->active >= 0) && (path->active < XFS_DA_NODE_MAXDEPTH)); |
1727 | for (blk = path->blk, level = 0; level < path->active; blk++, level++) { | 1716 | for (blk = path->blk, level = 0; level < path->active; blk++, level++) { |
1728 | if (blk->disk_blkno) { | 1717 | if (blk->disk_blkno) { |
1729 | error = xfs_da_read_buf(state->args->trans, | 1718 | error = xfs_da_node_read(state->args->trans, |
1730 | state->args->dp, | 1719 | state->args->dp, |
1731 | blk->blkno, blk->disk_blkno, | 1720 | blk->blkno, blk->disk_blkno, |
1732 | &blk->bp, XFS_ATTR_FORK); | 1721 | &blk->bp, XFS_ATTR_FORK); |
@@ -1755,6 +1744,8 @@ xfs_attr_node_get(xfs_da_args_t *args) | |||
1755 | int error, retval; | 1744 | int error, retval; |
1756 | int i; | 1745 | int i; |
1757 | 1746 | ||
1747 | trace_xfs_attr_node_get(args); | ||
1748 | |||
1758 | state = xfs_da_state_alloc(); | 1749 | state = xfs_da_state_alloc(); |
1759 | state->args = args; | 1750 | state->args = args; |
1760 | state->mp = args->dp->i_mount; | 1751 | state->mp = args->dp->i_mount; |
@@ -1804,6 +1795,8 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) | |||
1804 | int error, i; | 1795 | int error, i; |
1805 | struct xfs_buf *bp; | 1796 | struct xfs_buf *bp; |
1806 | 1797 | ||
1798 | trace_xfs_attr_node_list(context); | ||
1799 | |||
1807 | cursor = context->cursor; | 1800 | cursor = context->cursor; |
1808 | cursor->initted = 1; | 1801 | cursor->initted = 1; |
1809 | 1802 | ||
@@ -1814,7 +1807,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) | |||
1814 | */ | 1807 | */ |
1815 | bp = NULL; | 1808 | bp = NULL; |
1816 | if (cursor->blkno > 0) { | 1809 | if (cursor->blkno > 0) { |
1817 | error = xfs_da_read_buf(NULL, context->dp, cursor->blkno, -1, | 1810 | error = xfs_da_node_read(NULL, context->dp, cursor->blkno, -1, |
1818 | &bp, XFS_ATTR_FORK); | 1811 | &bp, XFS_ATTR_FORK); |
1819 | if ((error != 0) && (error != EFSCORRUPTED)) | 1812 | if ((error != 0) && (error != EFSCORRUPTED)) |
1820 | return(error); | 1813 | return(error); |
@@ -1856,17 +1849,11 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) | |||
1856 | if (bp == NULL) { | 1849 | if (bp == NULL) { |
1857 | cursor->blkno = 0; | 1850 | cursor->blkno = 0; |
1858 | for (;;) { | 1851 | for (;;) { |
1859 | error = xfs_da_read_buf(NULL, context->dp, | 1852 | error = xfs_da_node_read(NULL, context->dp, |
1860 | cursor->blkno, -1, &bp, | 1853 | cursor->blkno, -1, &bp, |
1861 | XFS_ATTR_FORK); | 1854 | XFS_ATTR_FORK); |
1862 | if (error) | 1855 | if (error) |
1863 | return(error); | 1856 | return(error); |
1864 | if (unlikely(bp == NULL)) { | ||
1865 | XFS_ERROR_REPORT("xfs_attr_node_list(2)", | ||
1866 | XFS_ERRLEVEL_LOW, | ||
1867 | context->dp->i_mount); | ||
1868 | return(XFS_ERROR(EFSCORRUPTED)); | ||
1869 | } | ||
1870 | node = bp->b_addr; | 1857 | node = bp->b_addr; |
1871 | if (node->hdr.info.magic == | 1858 | if (node->hdr.info.magic == |
1872 | cpu_to_be16(XFS_ATTR_LEAF_MAGIC)) | 1859 | cpu_to_be16(XFS_ATTR_LEAF_MAGIC)) |
@@ -1907,14 +1894,6 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) | |||
1907 | */ | 1894 | */ |
1908 | for (;;) { | 1895 | for (;;) { |
1909 | leaf = bp->b_addr; | 1896 | leaf = bp->b_addr; |
1910 | if (unlikely(leaf->hdr.info.magic != | ||
1911 | cpu_to_be16(XFS_ATTR_LEAF_MAGIC))) { | ||
1912 | XFS_CORRUPTION_ERROR("xfs_attr_node_list(4)", | ||
1913 | XFS_ERRLEVEL_LOW, | ||
1914 | context->dp->i_mount, leaf); | ||
1915 | xfs_trans_brelse(NULL, bp); | ||
1916 | return(XFS_ERROR(EFSCORRUPTED)); | ||
1917 | } | ||
1918 | error = xfs_attr_leaf_list_int(bp, context); | 1897 | error = xfs_attr_leaf_list_int(bp, context); |
1919 | if (error) { | 1898 | if (error) { |
1920 | xfs_trans_brelse(NULL, bp); | 1899 | xfs_trans_brelse(NULL, bp); |
@@ -1924,16 +1903,10 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) | |||
1924 | break; | 1903 | break; |
1925 | cursor->blkno = be32_to_cpu(leaf->hdr.info.forw); | 1904 | cursor->blkno = be32_to_cpu(leaf->hdr.info.forw); |
1926 | xfs_trans_brelse(NULL, bp); | 1905 | xfs_trans_brelse(NULL, bp); |
1927 | error = xfs_da_read_buf(NULL, context->dp, cursor->blkno, -1, | 1906 | error = xfs_attr_leaf_read(NULL, context->dp, cursor->blkno, -1, |
1928 | &bp, XFS_ATTR_FORK); | 1907 | &bp); |
1929 | if (error) | 1908 | if (error) |
1930 | return(error); | 1909 | return error; |
1931 | if (unlikely((bp == NULL))) { | ||
1932 | XFS_ERROR_REPORT("xfs_attr_node_list(5)", | ||
1933 | XFS_ERRLEVEL_LOW, | ||
1934 | context->dp->i_mount); | ||
1935 | return(XFS_ERROR(EFSCORRUPTED)); | ||
1936 | } | ||
1937 | } | 1910 | } |
1938 | xfs_trans_brelse(NULL, bp); | 1911 | xfs_trans_brelse(NULL, bp); |
1939 | return(0); | 1912 | return(0); |
@@ -1959,6 +1932,8 @@ xfs_attr_rmtval_get(xfs_da_args_t *args) | |||
1959 | int nmap, error, tmp, valuelen, blkcnt, i; | 1932 | int nmap, error, tmp, valuelen, blkcnt, i; |
1960 | xfs_dablk_t lblkno; | 1933 | xfs_dablk_t lblkno; |
1961 | 1934 | ||
1935 | trace_xfs_attr_rmtval_get(args); | ||
1936 | |||
1962 | ASSERT(!(args->flags & ATTR_KERNOVAL)); | 1937 | ASSERT(!(args->flags & ATTR_KERNOVAL)); |
1963 | 1938 | ||
1964 | mp = args->dp->i_mount; | 1939 | mp = args->dp->i_mount; |
@@ -1980,7 +1955,7 @@ xfs_attr_rmtval_get(xfs_da_args_t *args) | |||
1980 | dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock); | 1955 | dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock); |
1981 | blkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount); | 1956 | blkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount); |
1982 | error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, | 1957 | error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, |
1983 | dblkno, blkcnt, 0, &bp); | 1958 | dblkno, blkcnt, 0, &bp, NULL); |
1984 | if (error) | 1959 | if (error) |
1985 | return(error); | 1960 | return(error); |
1986 | 1961 | ||
@@ -2014,6 +1989,8 @@ xfs_attr_rmtval_set(xfs_da_args_t *args) | |||
2014 | xfs_dablk_t lblkno; | 1989 | xfs_dablk_t lblkno; |
2015 | int blkcnt, valuelen, nmap, error, tmp, committed; | 1990 | int blkcnt, valuelen, nmap, error, tmp, committed; |
2016 | 1991 | ||
1992 | trace_xfs_attr_rmtval_set(args); | ||
1993 | |||
2017 | dp = args->dp; | 1994 | dp = args->dp; |
2018 | mp = dp->i_mount; | 1995 | mp = dp->i_mount; |
2019 | src = args->value; | 1996 | src = args->value; |
@@ -2143,6 +2120,8 @@ xfs_attr_rmtval_remove(xfs_da_args_t *args) | |||
2143 | xfs_dablk_t lblkno; | 2120 | xfs_dablk_t lblkno; |
2144 | int valuelen, blkcnt, nmap, error, done, committed; | 2121 | int valuelen, blkcnt, nmap, error, done, committed; |
2145 | 2122 | ||
2123 | trace_xfs_attr_rmtval_remove(args); | ||
2124 | |||
2146 | mp = args->dp->i_mount; | 2125 | mp = args->dp->i_mount; |
2147 | 2126 | ||
2148 | /* | 2127 | /* |
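The xfs_attr.c changes above are all instances of one mechanical conversion; a before/after sketch, with names exactly as in the patch and error handling trimmed to the minimum:

	/* before: open-coded read, each caller checks the result by hand */
	error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1,
				&bp, XFS_ATTR_FORK);
	if (error)
		return error;
	ASSERT(bp != NULL);	/* plus per-site magic-number checks */

	/*
	 * after: the helper attaches the leaf verifier, so a block with a
	 * bad magic number comes back as EFSCORRUPTED from the read itself
	 */
	error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp);
	if (error)
		return error;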
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index d330111ca738..ee24993c7d12 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c | |||
@@ -57,7 +57,8 @@ STATIC int xfs_attr_leaf_create(xfs_da_args_t *args, xfs_dablk_t which_block, | |||
57 | struct xfs_buf **bpp); | 57 | struct xfs_buf **bpp); |
58 | STATIC int xfs_attr_leaf_add_work(struct xfs_buf *leaf_buffer, | 58 | STATIC int xfs_attr_leaf_add_work(struct xfs_buf *leaf_buffer, |
59 | xfs_da_args_t *args, int freemap_index); | 59 | xfs_da_args_t *args, int freemap_index); |
60 | STATIC void xfs_attr_leaf_compact(xfs_trans_t *tp, struct xfs_buf *leaf_buffer); | 60 | STATIC void xfs_attr_leaf_compact(struct xfs_da_args *args, |
61 | struct xfs_buf *leaf_buffer); | ||
61 | STATIC void xfs_attr_leaf_rebalance(xfs_da_state_t *state, | 62 | STATIC void xfs_attr_leaf_rebalance(xfs_da_state_t *state, |
62 | xfs_da_state_blk_t *blk1, | 63 | xfs_da_state_blk_t *blk1, |
63 | xfs_da_state_blk_t *blk2); | 64 | xfs_da_state_blk_t *blk2); |
@@ -87,6 +88,52 @@ STATIC void xfs_attr_leaf_moveents(xfs_attr_leafblock_t *src_leaf, | |||
87 | xfs_mount_t *mp); | 88 | xfs_mount_t *mp); |
88 | STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index); | 89 | STATIC int xfs_attr_leaf_entsize(xfs_attr_leafblock_t *leaf, int index); |
89 | 90 | ||
91 | static void | ||
92 | xfs_attr_leaf_verify( | ||
93 | struct xfs_buf *bp) | ||
94 | { | ||
95 | struct xfs_mount *mp = bp->b_target->bt_mount; | ||
96 | struct xfs_attr_leaf_hdr *hdr = bp->b_addr; | ||
97 | int block_ok = 0; | ||
98 | |||
99 | block_ok = hdr->info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC); | ||
100 | if (!block_ok) { | ||
101 | XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr); | ||
102 | xfs_buf_ioerror(bp, EFSCORRUPTED); | ||
103 | } | ||
104 | } | ||
105 | |||
106 | static void | ||
107 | xfs_attr_leaf_read_verify( | ||
108 | struct xfs_buf *bp) | ||
109 | { | ||
110 | xfs_attr_leaf_verify(bp); | ||
111 | } | ||
112 | |||
113 | static void | ||
114 | xfs_attr_leaf_write_verify( | ||
115 | struct xfs_buf *bp) | ||
116 | { | ||
117 | xfs_attr_leaf_verify(bp); | ||
118 | } | ||
119 | |||
120 | const struct xfs_buf_ops xfs_attr_leaf_buf_ops = { | ||
121 | .verify_read = xfs_attr_leaf_read_verify, | ||
122 | .verify_write = xfs_attr_leaf_write_verify, | ||
123 | }; | ||
124 | |||
125 | int | ||
126 | xfs_attr_leaf_read( | ||
127 | struct xfs_trans *tp, | ||
128 | struct xfs_inode *dp, | ||
129 | xfs_dablk_t bno, | ||
130 | xfs_daddr_t mappedbno, | ||
131 | struct xfs_buf **bpp) | ||
132 | { | ||
133 | return xfs_da_read_buf(tp, dp, bno, mappedbno, bpp, | ||
134 | XFS_ATTR_FORK, &xfs_attr_leaf_buf_ops); | ||
135 | } | ||
136 | |||
90 | /*======================================================================== | 137 | /*======================================================================== |
91 | * Namespace helper routines | 138 | * Namespace helper routines |
92 | *========================================================================*/ | 139 | *========================================================================*/ |
@@ -869,17 +916,16 @@ xfs_attr_leaf_to_node(xfs_da_args_t *args) | |||
869 | error = xfs_da_grow_inode(args, &blkno); | 916 | error = xfs_da_grow_inode(args, &blkno); |
870 | if (error) | 917 | if (error) |
871 | goto out; | 918 | goto out; |
872 | error = xfs_da_read_buf(args->trans, args->dp, 0, -1, &bp1, | 919 | error = xfs_attr_leaf_read(args->trans, args->dp, 0, -1, &bp1); |
873 | XFS_ATTR_FORK); | ||
874 | if (error) | 920 | if (error) |
875 | goto out; | 921 | goto out; |
876 | ASSERT(bp1 != NULL); | 922 | |
877 | bp2 = NULL; | 923 | bp2 = NULL; |
878 | error = xfs_da_get_buf(args->trans, args->dp, blkno, -1, &bp2, | 924 | error = xfs_da_get_buf(args->trans, args->dp, blkno, -1, &bp2, |
879 | XFS_ATTR_FORK); | 925 | XFS_ATTR_FORK); |
880 | if (error) | 926 | if (error) |
881 | goto out; | 927 | goto out; |
882 | ASSERT(bp2 != NULL); | 928 | bp2->b_ops = bp1->b_ops; |
883 | memcpy(bp2->b_addr, bp1->b_addr, XFS_LBSIZE(dp->i_mount)); | 929 | memcpy(bp2->b_addr, bp1->b_addr, XFS_LBSIZE(dp->i_mount)); |
884 | bp1 = NULL; | 930 | bp1 = NULL; |
885 | xfs_trans_log_buf(args->trans, bp2, 0, XFS_LBSIZE(dp->i_mount) - 1); | 931 | xfs_trans_log_buf(args->trans, bp2, 0, XFS_LBSIZE(dp->i_mount) - 1); |
@@ -933,7 +979,7 @@ xfs_attr_leaf_create( | |||
933 | XFS_ATTR_FORK); | 979 | XFS_ATTR_FORK); |
934 | if (error) | 980 | if (error) |
935 | return(error); | 981 | return(error); |
936 | ASSERT(bp != NULL); | 982 | bp->b_ops = &xfs_attr_leaf_buf_ops; |
937 | leaf = bp->b_addr; | 983 | leaf = bp->b_addr; |
938 | memset((char *)leaf, 0, XFS_LBSIZE(dp->i_mount)); | 984 | memset((char *)leaf, 0, XFS_LBSIZE(dp->i_mount)); |
939 | hdr = &leaf->hdr; | 985 | hdr = &leaf->hdr; |
@@ -1071,7 +1117,7 @@ xfs_attr_leaf_add( | |||
1071 | * Compact the entries to coalesce free space. | 1117 | * Compact the entries to coalesce free space. |
1072 | * This may change the hdr->count via dropping INCOMPLETE entries. | 1118 | * This may change the hdr->count via dropping INCOMPLETE entries. |
1073 | */ | 1119 | */ |
1074 | xfs_attr_leaf_compact(args->trans, bp); | 1120 | xfs_attr_leaf_compact(args, bp); |
1075 | 1121 | ||
1076 | /* | 1122 | /* |
1077 | * After compaction, the block is guaranteed to have only one | 1123 | * After compaction, the block is guaranteed to have only one |
@@ -1102,6 +1148,8 @@ xfs_attr_leaf_add_work( | |||
1102 | xfs_mount_t *mp; | 1148 | xfs_mount_t *mp; |
1103 | int tmp, i; | 1149 | int tmp, i; |
1104 | 1150 | ||
1151 | trace_xfs_attr_leaf_add_work(args); | ||
1152 | |||
1105 | leaf = bp->b_addr; | 1153 | leaf = bp->b_addr; |
1106 | ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); | 1154 | ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); |
1107 | hdr = &leaf->hdr; | 1155 | hdr = &leaf->hdr; |
@@ -1214,15 +1262,17 @@ xfs_attr_leaf_add_work( | |||
1214 | */ | 1262 | */ |
1215 | STATIC void | 1263 | STATIC void |
1216 | xfs_attr_leaf_compact( | 1264 | xfs_attr_leaf_compact( |
1217 | struct xfs_trans *trans, | 1265 | struct xfs_da_args *args, |
1218 | struct xfs_buf *bp) | 1266 | struct xfs_buf *bp) |
1219 | { | 1267 | { |
1220 | xfs_attr_leafblock_t *leaf_s, *leaf_d; | 1268 | xfs_attr_leafblock_t *leaf_s, *leaf_d; |
1221 | xfs_attr_leaf_hdr_t *hdr_s, *hdr_d; | 1269 | xfs_attr_leaf_hdr_t *hdr_s, *hdr_d; |
1222 | xfs_mount_t *mp; | 1270 | struct xfs_trans *trans = args->trans; |
1223 | char *tmpbuffer; | 1271 | struct xfs_mount *mp = trans->t_mountp; |
1272 | char *tmpbuffer; | ||
1273 | |||
1274 | trace_xfs_attr_leaf_compact(args); | ||
1224 | 1275 | ||
1225 | mp = trans->t_mountp; | ||
1226 | tmpbuffer = kmem_alloc(XFS_LBSIZE(mp), KM_SLEEP); | 1276 | tmpbuffer = kmem_alloc(XFS_LBSIZE(mp), KM_SLEEP); |
1227 | ASSERT(tmpbuffer != NULL); | 1277 | ASSERT(tmpbuffer != NULL); |
1228 | memcpy(tmpbuffer, bp->b_addr, XFS_LBSIZE(mp)); | 1278 | memcpy(tmpbuffer, bp->b_addr, XFS_LBSIZE(mp)); |
@@ -1291,6 +1341,7 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, | |||
1291 | leaf2 = blk2->bp->b_addr; | 1341 | leaf2 = blk2->bp->b_addr; |
1292 | ASSERT(leaf1->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); | 1342 | ASSERT(leaf1->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); |
1293 | ASSERT(leaf2->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); | 1343 | ASSERT(leaf2->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); |
1344 | ASSERT(leaf2->hdr.count == 0); | ||
1294 | args = state->args; | 1345 | args = state->args; |
1295 | 1346 | ||
1296 | trace_xfs_attr_leaf_rebalance(args); | 1347 | trace_xfs_attr_leaf_rebalance(args); |
@@ -1344,9 +1395,8 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, | |||
1344 | max = be16_to_cpu(hdr2->firstused) | 1395 | max = be16_to_cpu(hdr2->firstused) |
1345 | - sizeof(xfs_attr_leaf_hdr_t); | 1396 | - sizeof(xfs_attr_leaf_hdr_t); |
1346 | max -= be16_to_cpu(hdr2->count) * sizeof(xfs_attr_leaf_entry_t); | 1397 | max -= be16_to_cpu(hdr2->count) * sizeof(xfs_attr_leaf_entry_t); |
1347 | if (space > max) { | 1398 | if (space > max) |
1348 | xfs_attr_leaf_compact(args->trans, blk2->bp); | 1399 | xfs_attr_leaf_compact(args, blk2->bp); |
1349 | } | ||
1350 | 1400 | ||
1351 | /* | 1401 | /* |
1352 | * Move high entries from leaf1 to low end of leaf2. | 1402 | * Move high entries from leaf1 to low end of leaf2. |
@@ -1361,6 +1411,7 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, | |||
1361 | * I assert that since all callers pass in an empty | 1411 | * I assert that since all callers pass in an empty |
1362 | * second buffer, this code should never execute. | 1412 | * second buffer, this code should never execute. |
1363 | */ | 1413 | */ |
1414 | ASSERT(0); | ||
1364 | 1415 | ||
1365 | /* | 1416 | /* |
1366 | * Figure the total bytes to be added to the destination leaf. | 1417 | * Figure the total bytes to be added to the destination leaf. |
@@ -1376,9 +1427,8 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, | |||
1376 | max = be16_to_cpu(hdr1->firstused) | 1427 | max = be16_to_cpu(hdr1->firstused) |
1377 | - sizeof(xfs_attr_leaf_hdr_t); | 1428 | - sizeof(xfs_attr_leaf_hdr_t); |
1378 | max -= be16_to_cpu(hdr1->count) * sizeof(xfs_attr_leaf_entry_t); | 1429 | max -= be16_to_cpu(hdr1->count) * sizeof(xfs_attr_leaf_entry_t); |
1379 | if (space > max) { | 1430 | if (space > max) |
1380 | xfs_attr_leaf_compact(args->trans, blk1->bp); | 1431 | xfs_attr_leaf_compact(args, blk1->bp); |
1381 | } | ||
1382 | 1432 | ||
1383 | /* | 1433 | /* |
1384 | * Move low entries from leaf2 to high end of leaf1. | 1434 | * Move low entries from leaf2 to high end of leaf1. |
@@ -1422,10 +1472,24 @@ xfs_attr_leaf_rebalance(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, | |||
1422 | args->index2 = 0; | 1472 | args->index2 = 0; |
1423 | args->blkno2 = blk2->blkno; | 1473 | args->blkno2 = blk2->blkno; |
1424 | } else { | 1474 | } else { |
1475 | /* | ||
1476 | * On a double leaf split, the original attr location | ||
1477 | * is already stored in blkno2/index2, so don't | ||
1478 | * overwrite it, otherwise we corrupt the tree. | ||
1479 | */ | ||
1425 | blk2->index = blk1->index | 1480 | blk2->index = blk1->index |
1426 | - be16_to_cpu(leaf1->hdr.count); | 1481 | - be16_to_cpu(leaf1->hdr.count); |
1427 | args->index = args->index2 = blk2->index; | 1482 | args->index = blk2->index; |
1428 | args->blkno = args->blkno2 = blk2->blkno; | 1483 | args->blkno = blk2->blkno; |
1484 | if (!state->extravalid) { | ||
1485 | /* | ||
1486 | * set the new attr location to match the old | ||
1487 | * one and let the higher level split code | ||
1488 | * decide where in the leaf to place it. | ||
1489 | */ | ||
1490 | args->index2 = blk2->index; | ||
1491 | args->blkno2 = blk2->blkno; | ||
1492 | } | ||
1429 | } | 1493 | } |
1430 | } else { | 1494 | } else { |
1431 | ASSERT(state->inleaf == 1); | 1495 | ASSERT(state->inleaf == 1); |
@@ -1561,6 +1625,8 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action) | |||
1561 | xfs_dablk_t blkno; | 1625 | xfs_dablk_t blkno; |
1562 | struct xfs_buf *bp; | 1626 | struct xfs_buf *bp; |
1563 | 1627 | ||
1628 | trace_xfs_attr_leaf_toosmall(state->args); | ||
1629 | |||
1564 | /* | 1630 | /* |
1565 | * Check for the degenerate case of the block being over 50% full. | 1631 | * Check for the degenerate case of the block being over 50% full. |
1566 | * If so, it's not worth even looking to see if we might be able | 1632 | * If so, it's not worth even looking to see if we might be able |
@@ -1620,18 +1686,16 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action) | |||
1620 | blkno = be32_to_cpu(info->back); | 1686 | blkno = be32_to_cpu(info->back); |
1621 | if (blkno == 0) | 1687 | if (blkno == 0) |
1622 | continue; | 1688 | continue; |
1623 | error = xfs_da_read_buf(state->args->trans, state->args->dp, | 1689 | error = xfs_attr_leaf_read(state->args->trans, state->args->dp, |
1624 | blkno, -1, &bp, XFS_ATTR_FORK); | 1690 | blkno, -1, &bp); |
1625 | if (error) | 1691 | if (error) |
1626 | return(error); | 1692 | return(error); |
1627 | ASSERT(bp != NULL); | ||
1628 | 1693 | ||
1629 | leaf = (xfs_attr_leafblock_t *)info; | 1694 | leaf = (xfs_attr_leafblock_t *)info; |
1630 | count = be16_to_cpu(leaf->hdr.count); | 1695 | count = be16_to_cpu(leaf->hdr.count); |
1631 | bytes = state->blocksize - (state->blocksize>>2); | 1696 | bytes = state->blocksize - (state->blocksize>>2); |
1632 | bytes -= be16_to_cpu(leaf->hdr.usedbytes); | 1697 | bytes -= be16_to_cpu(leaf->hdr.usedbytes); |
1633 | leaf = bp->b_addr; | 1698 | leaf = bp->b_addr; |
1634 | ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); | ||
1635 | count += be16_to_cpu(leaf->hdr.count); | 1699 | count += be16_to_cpu(leaf->hdr.count); |
1636 | bytes -= be16_to_cpu(leaf->hdr.usedbytes); | 1700 | bytes -= be16_to_cpu(leaf->hdr.usedbytes); |
1637 | bytes -= count * sizeof(xfs_attr_leaf_entry_t); | 1701 | bytes -= count * sizeof(xfs_attr_leaf_entry_t); |
@@ -1686,6 +1750,8 @@ xfs_attr_leaf_remove( | |||
1686 | int tablesize, tmp, i; | 1750 | int tablesize, tmp, i; |
1687 | xfs_mount_t *mp; | 1751 | xfs_mount_t *mp; |
1688 | 1752 | ||
1753 | trace_xfs_attr_leaf_remove(args); | ||
1754 | |||
1689 | leaf = bp->b_addr; | 1755 | leaf = bp->b_addr; |
1690 | ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); | 1756 | ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); |
1691 | hdr = &leaf->hdr; | 1757 | hdr = &leaf->hdr; |
@@ -2495,15 +2561,11 @@ xfs_attr_leaf_clearflag(xfs_da_args_t *args) | |||
2495 | /* | 2561 | /* |
2496 | * Set up the operation. | 2562 | * Set up the operation. |
2497 | */ | 2563 | */ |
2498 | error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, | 2564 | error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); |
2499 | XFS_ATTR_FORK); | 2565 | if (error) |
2500 | if (error) { | ||
2501 | return(error); | 2566 | return(error); |
2502 | } | ||
2503 | ASSERT(bp != NULL); | ||
2504 | 2567 | ||
2505 | leaf = bp->b_addr; | 2568 | leaf = bp->b_addr; |
2506 | ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); | ||
2507 | ASSERT(args->index < be16_to_cpu(leaf->hdr.count)); | 2569 | ASSERT(args->index < be16_to_cpu(leaf->hdr.count)); |
2508 | ASSERT(args->index >= 0); | 2570 | ASSERT(args->index >= 0); |
2509 | entry = &leaf->entries[ args->index ]; | 2571 | entry = &leaf->entries[ args->index ]; |
@@ -2560,15 +2622,11 @@ xfs_attr_leaf_setflag(xfs_da_args_t *args) | |||
2560 | /* | 2622 | /* |
2561 | * Set up the operation. | 2623 | * Set up the operation. |
2562 | */ | 2624 | */ |
2563 | error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp, | 2625 | error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp); |
2564 | XFS_ATTR_FORK); | 2626 | if (error) |
2565 | if (error) { | ||
2566 | return(error); | 2627 | return(error); |
2567 | } | ||
2568 | ASSERT(bp != NULL); | ||
2569 | 2628 | ||
2570 | leaf = bp->b_addr; | 2629 | leaf = bp->b_addr; |
2571 | ASSERT(leaf->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); | ||
2572 | ASSERT(args->index < be16_to_cpu(leaf->hdr.count)); | 2630 | ASSERT(args->index < be16_to_cpu(leaf->hdr.count)); |
2573 | ASSERT(args->index >= 0); | 2631 | ASSERT(args->index >= 0); |
2574 | entry = &leaf->entries[ args->index ]; | 2632 | entry = &leaf->entries[ args->index ]; |
@@ -2617,35 +2675,28 @@ xfs_attr_leaf_flipflags(xfs_da_args_t *args) | |||
2617 | /* | 2675 | /* |
2618 | * Read the block containing the "old" attr | 2676 | * Read the block containing the "old" attr |
2619 | */ | 2677 | */ |
2620 | error = xfs_da_read_buf(args->trans, args->dp, args->blkno, -1, &bp1, | 2678 | error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno, -1, &bp1); |
2621 | XFS_ATTR_FORK); | 2679 | if (error) |
2622 | if (error) { | 2680 | return error; |
2623 | return(error); | ||
2624 | } | ||
2625 | ASSERT(bp1 != NULL); | ||
2626 | 2681 | ||
2627 | /* | 2682 | /* |
2628 | * Read the block containing the "new" attr, if it is different | 2683 | * Read the block containing the "new" attr, if it is different |
2629 | */ | 2684 | */ |
2630 | if (args->blkno2 != args->blkno) { | 2685 | if (args->blkno2 != args->blkno) { |
2631 | error = xfs_da_read_buf(args->trans, args->dp, args->blkno2, | 2686 | error = xfs_attr_leaf_read(args->trans, args->dp, args->blkno2, |
2632 | -1, &bp2, XFS_ATTR_FORK); | 2687 | -1, &bp2); |
2633 | if (error) { | 2688 | if (error) |
2634 | return(error); | 2689 | return error; |
2635 | } | ||
2636 | ASSERT(bp2 != NULL); | ||
2637 | } else { | 2690 | } else { |
2638 | bp2 = bp1; | 2691 | bp2 = bp1; |
2639 | } | 2692 | } |
2640 | 2693 | ||
2641 | leaf1 = bp1->b_addr; | 2694 | leaf1 = bp1->b_addr; |
2642 | ASSERT(leaf1->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); | ||
2643 | ASSERT(args->index < be16_to_cpu(leaf1->hdr.count)); | 2695 | ASSERT(args->index < be16_to_cpu(leaf1->hdr.count)); |
2644 | ASSERT(args->index >= 0); | 2696 | ASSERT(args->index >= 0); |
2645 | entry1 = &leaf1->entries[ args->index ]; | 2697 | entry1 = &leaf1->entries[ args->index ]; |
2646 | 2698 | ||
2647 | leaf2 = bp2->b_addr; | 2699 | leaf2 = bp2->b_addr; |
2648 | ASSERT(leaf2->hdr.info.magic == cpu_to_be16(XFS_ATTR_LEAF_MAGIC)); | ||
2649 | ASSERT(args->index2 < be16_to_cpu(leaf2->hdr.count)); | 2700 | ASSERT(args->index2 < be16_to_cpu(leaf2->hdr.count)); |
2650 | ASSERT(args->index2 >= 0); | 2701 | ASSERT(args->index2 >= 0); |
2651 | entry2 = &leaf2->entries[ args->index2 ]; | 2702 | entry2 = &leaf2->entries[ args->index2 ]; |
@@ -2730,7 +2781,7 @@ xfs_attr_root_inactive(xfs_trans_t **trans, xfs_inode_t *dp) | |||
2730 | * the extents in reverse order the extent containing | 2781 | * the extents in reverse order the extent containing |
2731 | * block 0 must still be there. | 2782 | * block 0 must still be there. |
2732 | */ | 2783 | */ |
2733 | error = xfs_da_read_buf(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK); | 2784 | error = xfs_da_node_read(*trans, dp, 0, -1, &bp, XFS_ATTR_FORK); |
2734 | if (error) | 2785 | if (error) |
2735 | return(error); | 2786 | return(error); |
2736 | blkno = XFS_BUF_ADDR(bp); | 2787 | blkno = XFS_BUF_ADDR(bp); |
@@ -2815,7 +2866,7 @@ xfs_attr_node_inactive( | |||
2815 | * traversal of the tree so we may deal with many blocks | 2866 | * traversal of the tree so we may deal with many blocks |
2816 | * before we come back to this one. | 2867 | * before we come back to this one. |
2817 | */ | 2868 | */ |
2818 | error = xfs_da_read_buf(*trans, dp, child_fsb, -2, &child_bp, | 2869 | error = xfs_da_node_read(*trans, dp, child_fsb, -2, &child_bp, |
2819 | XFS_ATTR_FORK); | 2870 | XFS_ATTR_FORK); |
2820 | if (error) | 2871 | if (error) |
2821 | return(error); | 2872 | return(error); |
@@ -2856,8 +2907,8 @@ xfs_attr_node_inactive( | |||
2856 | * child block number. | 2907 | * child block number. |
2857 | */ | 2908 | */ |
2858 | if ((i+1) < count) { | 2909 | if ((i+1) < count) { |
2859 | error = xfs_da_read_buf(*trans, dp, 0, parent_blkno, | 2910 | error = xfs_da_node_read(*trans, dp, 0, parent_blkno, |
2860 | &bp, XFS_ATTR_FORK); | 2911 | &bp, XFS_ATTR_FORK); |
2861 | if (error) | 2912 | if (error) |
2862 | return(error); | 2913 | return(error); |
2863 | child_fsb = be32_to_cpu(node->btree[i+1].before); | 2914 | child_fsb = be32_to_cpu(node->btree[i+1].before); |
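Two read helpers are in play in the attr code after this patch, one per on-disk block format, so each buffer is checked by the matching verifier; the signatures below are as they appear in the hunks above:

	/* leaf-format attr block: fork is implied, verifier is leaf-specific */
	error = xfs_attr_leaf_read(args->trans, args->dp, blkno, -1, &bp);

	/* node-format (da btree) block: keeps the xfs_da_read_buf signature */
	error = xfs_da_node_read(args->trans, args->dp, blkno, -1, &bp,
				 XFS_ATTR_FORK);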
diff --git a/fs/xfs/xfs_attr_leaf.h b/fs/xfs/xfs_attr_leaf.h index dea17722945e..77de139a58f0 100644 --- a/fs/xfs/xfs_attr_leaf.h +++ b/fs/xfs/xfs_attr_leaf.h | |||
@@ -261,4 +261,10 @@ int xfs_attr_leaf_order(struct xfs_buf *leaf1_bp, | |||
261 | struct xfs_buf *leaf2_bp); | 261 | struct xfs_buf *leaf2_bp); |
262 | int xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize, | 262 | int xfs_attr_leaf_newentsize(int namelen, int valuelen, int blocksize, |
263 | int *local); | 263 | int *local); |
264 | int xfs_attr_leaf_read(struct xfs_trans *tp, struct xfs_inode *dp, | ||
265 | xfs_dablk_t bno, xfs_daddr_t mappedbno, | ||
266 | struct xfs_buf **bpp); | ||
267 | |||
268 | extern const struct xfs_buf_ops xfs_attr_leaf_buf_ops; | ||
269 | |||
264 | #endif /* __XFS_ATTR_LEAF_H__ */ | 270 | #endif /* __XFS_ATTR_LEAF_H__ */ |
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 83d0cf3df930..0e92d12765d2 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c | |||
@@ -2662,8 +2662,9 @@ xfs_bmap_btree_to_extents( | |||
2662 | if ((error = xfs_btree_check_lptr(cur, cbno, 1))) | 2662 | if ((error = xfs_btree_check_lptr(cur, cbno, 1))) |
2663 | return error; | 2663 | return error; |
2664 | #endif | 2664 | #endif |
2665 | if ((error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp, | 2665 | error = xfs_btree_read_bufl(mp, tp, cbno, 0, &cbp, XFS_BMAP_BTREE_REF, |
2666 | XFS_BMAP_BTREE_REF))) | 2666 | &xfs_bmbt_buf_ops); |
2667 | if (error) | ||
2667 | return error; | 2668 | return error; |
2668 | cblock = XFS_BUF_TO_BLOCK(cbp); | 2669 | cblock = XFS_BUF_TO_BLOCK(cbp); |
2669 | if ((error = xfs_btree_check_block(cur, cblock, 0, cbp))) | 2670 | if ((error = xfs_btree_check_block(cur, cblock, 0, cbp))) |
@@ -3123,6 +3124,7 @@ xfs_bmap_extents_to_btree( | |||
3123 | /* | 3124 | /* |
3124 | * Fill in the child block. | 3125 | * Fill in the child block. |
3125 | */ | 3126 | */ |
3127 | abp->b_ops = &xfs_bmbt_buf_ops; | ||
3126 | ablock = XFS_BUF_TO_BLOCK(abp); | 3128 | ablock = XFS_BUF_TO_BLOCK(abp); |
3127 | ablock->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC); | 3129 | ablock->bb_magic = cpu_to_be32(XFS_BMAP_MAGIC); |
3128 | ablock->bb_level = 0; | 3130 | ablock->bb_level = 0; |
@@ -3269,6 +3271,7 @@ xfs_bmap_local_to_extents( | |||
3269 | ASSERT(args.len == 1); | 3271 | ASSERT(args.len == 1); |
3270 | *firstblock = args.fsbno; | 3272 | *firstblock = args.fsbno; |
3271 | bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0); | 3273 | bp = xfs_btree_get_bufl(args.mp, tp, args.fsbno, 0); |
3274 | bp->b_ops = &xfs_bmbt_buf_ops; | ||
3272 | memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes); | 3275 | memcpy(bp->b_addr, ifp->if_u1.if_data, ifp->if_bytes); |
3273 | xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1); | 3276 | xfs_trans_log_buf(tp, bp, 0, ifp->if_bytes - 1); |
3274 | xfs_bmap_forkoff_reset(args.mp, ip, whichfork); | 3277 | xfs_bmap_forkoff_reset(args.mp, ip, whichfork); |
@@ -4078,8 +4081,9 @@ xfs_bmap_read_extents( | |||
4078 | * pointer (leftmost) at each level. | 4081 | * pointer (leftmost) at each level. |
4079 | */ | 4082 | */ |
4080 | while (level-- > 0) { | 4083 | while (level-- > 0) { |
4081 | if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, | 4084 | error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, |
4082 | XFS_BMAP_BTREE_REF))) | 4085 | XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops); |
4086 | if (error) | ||
4083 | return error; | 4087 | return error; |
4084 | block = XFS_BUF_TO_BLOCK(bp); | 4088 | block = XFS_BUF_TO_BLOCK(bp); |
4085 | XFS_WANT_CORRUPTED_GOTO( | 4089 | XFS_WANT_CORRUPTED_GOTO( |
@@ -4124,7 +4128,8 @@ xfs_bmap_read_extents( | |||
4124 | */ | 4128 | */ |
4125 | nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); | 4129 | nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); |
4126 | if (nextbno != NULLFSBLOCK) | 4130 | if (nextbno != NULLFSBLOCK) |
4127 | xfs_btree_reada_bufl(mp, nextbno, 1); | 4131 | xfs_btree_reada_bufl(mp, nextbno, 1, |
4132 | &xfs_bmbt_buf_ops); | ||
4128 | /* | 4133 | /* |
4129 | * Copy records into the extent records. | 4134 | * Copy records into the extent records. |
4130 | */ | 4135 | */ |
@@ -4156,8 +4161,9 @@ xfs_bmap_read_extents( | |||
4156 | */ | 4161 | */ |
4157 | if (bno == NULLFSBLOCK) | 4162 | if (bno == NULLFSBLOCK) |
4158 | break; | 4163 | break; |
4159 | if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, | 4164 | error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, |
4160 | XFS_BMAP_BTREE_REF))) | 4165 | XFS_BMAP_BTREE_REF, &xfs_bmbt_buf_ops); |
4166 | if (error) | ||
4161 | return error; | 4167 | return error; |
4162 | block = XFS_BUF_TO_BLOCK(bp); | 4168 | block = XFS_BUF_TO_BLOCK(bp); |
4163 | } | 4169 | } |
@@ -5599,7 +5605,7 @@ xfs_getbmap( | |||
5599 | xfs_ilock(ip, XFS_IOLOCK_SHARED); | 5605 | xfs_ilock(ip, XFS_IOLOCK_SHARED); |
5600 | if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) { | 5606 | if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) { |
5601 | if (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size) { | 5607 | if (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size) { |
5602 | error = xfs_flush_pages(ip, 0, -1, 0, FI_REMAPF); | 5608 | error = -filemap_write_and_wait(VFS_I(ip)->i_mapping); |
5603 | if (error) | 5609 | if (error) |
5604 | goto out_unlock_iolock; | 5610 | goto out_unlock_iolock; |
5605 | } | 5611 | } |
@@ -5868,15 +5874,16 @@ xfs_bmap_check_leaf_extents( | |||
5868 | */ | 5874 | */ |
5869 | while (level-- > 0) { | 5875 | while (level-- > 0) { |
5870 | /* See if buf is in cur first */ | 5876 | /* See if buf is in cur first */ |
5877 | bp_release = 0; | ||
5871 | bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno)); | 5878 | bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno)); |
5872 | if (bp) { | 5879 | if (!bp) { |
5873 | bp_release = 0; | ||
5874 | } else { | ||
5875 | bp_release = 1; | 5880 | bp_release = 1; |
5881 | error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp, | ||
5882 | XFS_BMAP_BTREE_REF, | ||
5883 | &xfs_bmbt_buf_ops); | ||
5884 | if (error) | ||
5885 | goto error_norelse; | ||
5876 | } | 5886 | } |
5877 | if (!bp && (error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp, | ||
5878 | XFS_BMAP_BTREE_REF))) | ||
5879 | goto error_norelse; | ||
5880 | block = XFS_BUF_TO_BLOCK(bp); | 5887 | block = XFS_BUF_TO_BLOCK(bp); |
5881 | XFS_WANT_CORRUPTED_GOTO( | 5888 | XFS_WANT_CORRUPTED_GOTO( |
5882 | xfs_bmap_sanity_check(mp, bp, level), | 5889 | xfs_bmap_sanity_check(mp, bp, level), |
@@ -5953,15 +5960,16 @@ xfs_bmap_check_leaf_extents( | |||
5953 | if (bno == NULLFSBLOCK) | 5960 | if (bno == NULLFSBLOCK) |
5954 | break; | 5961 | break; |
5955 | 5962 | ||
5963 | bp_release = 0; | ||
5956 | bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno)); | 5964 | bp = xfs_bmap_get_bp(cur, XFS_FSB_TO_DADDR(mp, bno)); |
5957 | if (bp) { | 5965 | if (!bp) { |
5958 | bp_release = 0; | ||
5959 | } else { | ||
5960 | bp_release = 1; | 5966 | bp_release = 1; |
5967 | error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp, | ||
5968 | XFS_BMAP_BTREE_REF, | ||
5969 | &xfs_bmbt_buf_ops); | ||
5970 | if (error) | ||
5971 | goto error_norelse; | ||
5961 | } | 5972 | } |
5962 | if (!bp && (error = xfs_btree_read_bufl(mp, NULL, bno, 0, &bp, | ||
5963 | XFS_BMAP_BTREE_REF))) | ||
5964 | goto error_norelse; | ||
5965 | block = XFS_BUF_TO_BLOCK(bp); | 5973 | block = XFS_BUF_TO_BLOCK(bp); |
5966 | } | 5974 | } |
5967 | if (bp_release) { | 5975 | if (bp_release) { |
@@ -6052,7 +6060,9 @@ xfs_bmap_count_tree( | |||
6052 | struct xfs_btree_block *block, *nextblock; | 6060 | struct xfs_btree_block *block, *nextblock; |
6053 | int numrecs; | 6061 | int numrecs; |
6054 | 6062 | ||
6055 | if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF))) | 6063 | error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF, |
6064 | &xfs_bmbt_buf_ops); | ||
6065 | if (error) | ||
6056 | return error; | 6066 | return error; |
6057 | *count += 1; | 6067 | *count += 1; |
6058 | block = XFS_BUF_TO_BLOCK(bp); | 6068 | block = XFS_BUF_TO_BLOCK(bp); |
@@ -6061,8 +6071,10 @@ xfs_bmap_count_tree( | |||
6061 | /* Not at node above leaves, count this level of nodes */ | 6071 | /* Not at node above leaves, count this level of nodes */ |
6062 | nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); | 6072 | nextbno = be64_to_cpu(block->bb_u.l.bb_rightsib); |
6063 | while (nextbno != NULLFSBLOCK) { | 6073 | while (nextbno != NULLFSBLOCK) { |
6064 | if ((error = xfs_btree_read_bufl(mp, tp, nextbno, | 6074 | error = xfs_btree_read_bufl(mp, tp, nextbno, 0, &nbp, |
6065 | 0, &nbp, XFS_BMAP_BTREE_REF))) | 6075 | XFS_BMAP_BTREE_REF, |
6076 | &xfs_bmbt_buf_ops); | ||
6077 | if (error) | ||
6066 | return error; | 6078 | return error; |
6067 | *count += 1; | 6079 | *count += 1; |
6068 | nextblock = XFS_BUF_TO_BLOCK(nbp); | 6080 | nextblock = XFS_BUF_TO_BLOCK(nbp); |
@@ -6091,8 +6103,10 @@ xfs_bmap_count_tree( | |||
6091 | if (nextbno == NULLFSBLOCK) | 6103 | if (nextbno == NULLFSBLOCK) |
6092 | break; | 6104 | break; |
6093 | bno = nextbno; | 6105 | bno = nextbno; |
6094 | if ((error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, | 6106 | error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, |
6095 | XFS_BMAP_BTREE_REF))) | 6107 | XFS_BMAP_BTREE_REF, |
6108 | &xfs_bmbt_buf_ops); | ||
6109 | if (error) | ||
6096 | return error; | 6110 | return error; |
6097 | *count += 1; | 6111 | *count += 1; |
6098 | block = XFS_BUF_TO_BLOCK(bp); | 6112 | block = XFS_BUF_TO_BLOCK(bp); |
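All the xfs_bmap.c hunks thread the same verifier through the long-form btree helpers; a condensed sketch of the two cases (read path versus freshly built buffers), with names as in the patch:

	/* reads and readahead now carry the bmbt verifier */
	error = xfs_btree_read_bufl(mp, tp, bno, 0, &bp, XFS_BMAP_BTREE_REF,
				    &xfs_bmbt_buf_ops);
	if (error)
		return error;
	xfs_btree_reada_bufl(mp, nextbno, 1, &xfs_bmbt_buf_ops);

	/*
	 * blocks constructed in memory must have the ops attached by hand,
	 * since no read verifier ever ran on them
	 */
	abp->b_ops = &xfs_bmbt_buf_ops;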
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c index 862084a47a7e..061b45cbe614 100644 --- a/fs/xfs/xfs_bmap_btree.c +++ b/fs/xfs/xfs_bmap_btree.c | |||
@@ -36,6 +36,7 @@ | |||
36 | #include "xfs_bmap.h" | 36 | #include "xfs_bmap.h" |
37 | #include "xfs_error.h" | 37 | #include "xfs_error.h" |
38 | #include "xfs_quota.h" | 38 | #include "xfs_quota.h" |
39 | #include "xfs_trace.h" | ||
39 | 40 | ||
40 | /* | 41 | /* |
41 | * Determine the extent state. | 42 | * Determine the extent state. |
@@ -707,6 +708,67 @@ xfs_bmbt_key_diff( | |||
707 | cur->bc_rec.b.br_startoff; | 708 | cur->bc_rec.b.br_startoff; |
708 | } | 709 | } |
709 | 710 | ||
711 | static void | ||
712 | xfs_bmbt_verify( | ||
713 | struct xfs_buf *bp) | ||
714 | { | ||
715 | struct xfs_mount *mp = bp->b_target->bt_mount; | ||
716 | struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); | ||
717 | unsigned int level; | ||
718 | int lblock_ok; /* block passes checks */ | ||
719 | |||
720 | /* magic number and level verification. | ||
721 | * | ||
722 | * We don't know what fork we belong to, so just verify that the level | ||
723 | * is less than the maximum of the two. Later checks will be more | ||
724 | * precise. | ||
725 | */ | ||
726 | level = be16_to_cpu(block->bb_level); | ||
727 | lblock_ok = block->bb_magic == cpu_to_be32(XFS_BMAP_MAGIC) && | ||
728 | level < max(mp->m_bm_maxlevels[0], mp->m_bm_maxlevels[1]); | ||
729 | |||
730 | /* numrecs verification */ | ||
731 | lblock_ok = lblock_ok && | ||
732 | be16_to_cpu(block->bb_numrecs) <= mp->m_bmap_dmxr[level != 0]; | ||
733 | |||
734 | /* sibling pointer verification */ | ||
735 | lblock_ok = lblock_ok && | ||
736 | block->bb_u.l.bb_leftsib && | ||
737 | (block->bb_u.l.bb_leftsib == cpu_to_be64(NULLDFSBNO) || | ||
738 | XFS_FSB_SANITY_CHECK(mp, | ||
739 | be64_to_cpu(block->bb_u.l.bb_leftsib))) && | ||
740 | block->bb_u.l.bb_rightsib && | ||
741 | (block->bb_u.l.bb_rightsib == cpu_to_be64(NULLDFSBNO) || | ||
742 | XFS_FSB_SANITY_CHECK(mp, | ||
743 | be64_to_cpu(block->bb_u.l.bb_rightsib))); | ||
744 | |||
745 | if (!lblock_ok) { | ||
746 | trace_xfs_btree_corrupt(bp, _RET_IP_); | ||
747 | XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, block); | ||
748 | xfs_buf_ioerror(bp, EFSCORRUPTED); | ||
749 | } | ||
750 | } | ||
751 | |||
752 | static void | ||
753 | xfs_bmbt_read_verify( | ||
754 | struct xfs_buf *bp) | ||
755 | { | ||
756 | xfs_bmbt_verify(bp); | ||
757 | } | ||
758 | |||
759 | static void | ||
760 | xfs_bmbt_write_verify( | ||
761 | struct xfs_buf *bp) | ||
762 | { | ||
763 | xfs_bmbt_verify(bp); | ||
764 | } | ||
765 | |||
766 | const struct xfs_buf_ops xfs_bmbt_buf_ops = { | ||
767 | .verify_read = xfs_bmbt_read_verify, | ||
768 | .verify_write = xfs_bmbt_write_verify, | ||
769 | }; | ||
770 | |||
771 | |||
710 | #ifdef DEBUG | 772 | #ifdef DEBUG |
711 | STATIC int | 773 | STATIC int |
712 | xfs_bmbt_keys_inorder( | 774 | xfs_bmbt_keys_inorder( |
@@ -746,6 +808,7 @@ static const struct xfs_btree_ops xfs_bmbt_ops = { | |||
746 | .init_rec_from_cur = xfs_bmbt_init_rec_from_cur, | 808 | .init_rec_from_cur = xfs_bmbt_init_rec_from_cur, |
747 | .init_ptr_from_cur = xfs_bmbt_init_ptr_from_cur, | 809 | .init_ptr_from_cur = xfs_bmbt_init_ptr_from_cur, |
748 | .key_diff = xfs_bmbt_key_diff, | 810 | .key_diff = xfs_bmbt_key_diff, |
811 | .buf_ops = &xfs_bmbt_buf_ops, | ||
749 | #ifdef DEBUG | 812 | #ifdef DEBUG |
750 | .keys_inorder = xfs_bmbt_keys_inorder, | 813 | .keys_inorder = xfs_bmbt_keys_inorder, |
751 | .recs_inorder = xfs_bmbt_recs_inorder, | 814 | .recs_inorder = xfs_bmbt_recs_inorder, |
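The verifier plumbing pattern, reduced to its skeleton (this is the shape the patch introduces; only the bmbt instance is shown):

	const struct xfs_buf_ops xfs_bmbt_buf_ops = {
		.verify_read	= xfs_bmbt_read_verify,	 /* after I/O completes */
		.verify_write	= xfs_bmbt_write_verify, /* before I/O is issued */
	};

	static const struct xfs_btree_ops xfs_bmbt_ops = {
		/* ...existing callouts... */
		.buf_ops	= &xfs_bmbt_buf_ops,	 /* generic code finds it here */
	};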
diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h index 0e66c4ea0f85..88469ca08696 100644 --- a/fs/xfs/xfs_bmap_btree.h +++ b/fs/xfs/xfs_bmap_btree.h | |||
@@ -236,5 +236,6 @@ extern int xfs_bmbt_maxrecs(struct xfs_mount *, int blocklen, int leaf); | |||
236 | extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *, | 236 | extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *, |
237 | struct xfs_trans *, struct xfs_inode *, int); | 237 | struct xfs_trans *, struct xfs_inode *, int); |
238 | 238 | ||
239 | extern const struct xfs_buf_ops xfs_bmbt_buf_ops; | ||
239 | 240 | ||
240 | #endif /* __XFS_BMAP_BTREE_H__ */ | 241 | #endif /* __XFS_BMAP_BTREE_H__ */ |
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c index e53e317b1582..db010408d701 100644 --- a/fs/xfs/xfs_btree.c +++ b/fs/xfs/xfs_btree.c | |||
@@ -266,9 +266,13 @@ xfs_btree_dup_cursor( | |||
266 | for (i = 0; i < new->bc_nlevels; i++) { | 266 | for (i = 0; i < new->bc_nlevels; i++) { |
267 | new->bc_ptrs[i] = cur->bc_ptrs[i]; | 267 | new->bc_ptrs[i] = cur->bc_ptrs[i]; |
268 | new->bc_ra[i] = cur->bc_ra[i]; | 268 | new->bc_ra[i] = cur->bc_ra[i]; |
269 | if ((bp = cur->bc_bufs[i])) { | 269 | bp = cur->bc_bufs[i]; |
270 | if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, | 270 | if (bp) { |
271 | XFS_BUF_ADDR(bp), mp->m_bsize, 0, &bp))) { | 271 | error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, |
272 | XFS_BUF_ADDR(bp), mp->m_bsize, | ||
273 | 0, &bp, | ||
274 | cur->bc_ops->buf_ops); | ||
275 | if (error) { | ||
272 | xfs_btree_del_cursor(new, error); | 276 | xfs_btree_del_cursor(new, error); |
273 | *ncur = NULL; | 277 | *ncur = NULL; |
274 | return error; | 278 | return error; |
@@ -609,25 +613,26 @@ xfs_btree_offsets( | |||
609 | * Get a buffer for the block, return it read in. | 613 | * Get a buffer for the block, return it read in. |
610 | * Long-form addressing. | 614 | * Long-form addressing. |
611 | */ | 615 | */ |
612 | int /* error */ | 616 | int |
613 | xfs_btree_read_bufl( | 617 | xfs_btree_read_bufl( |
614 | xfs_mount_t *mp, /* file system mount point */ | 618 | struct xfs_mount *mp, /* file system mount point */ |
615 | xfs_trans_t *tp, /* transaction pointer */ | 619 | struct xfs_trans *tp, /* transaction pointer */ |
616 | xfs_fsblock_t fsbno, /* file system block number */ | 620 | xfs_fsblock_t fsbno, /* file system block number */ |
617 | uint lock, /* lock flags for read_buf */ | 621 | uint lock, /* lock flags for read_buf */ |
618 | xfs_buf_t **bpp, /* buffer for fsbno */ | 622 | struct xfs_buf **bpp, /* buffer for fsbno */ |
619 | int refval) /* ref count value for buffer */ | 623 | int refval, /* ref count value for buffer */ |
620 | { | 624 | const struct xfs_buf_ops *ops) |
621 | xfs_buf_t *bp; /* return value */ | 625 | { |
626 | struct xfs_buf *bp; /* return value */ | ||
622 | xfs_daddr_t d; /* real disk block address */ | 627 | xfs_daddr_t d; /* real disk block address */ |
623 | int error; | 628 | int error; |
624 | 629 | ||
625 | ASSERT(fsbno != NULLFSBLOCK); | 630 | ASSERT(fsbno != NULLFSBLOCK); |
626 | d = XFS_FSB_TO_DADDR(mp, fsbno); | 631 | d = XFS_FSB_TO_DADDR(mp, fsbno); |
627 | if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d, | 632 | error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d, |
628 | mp->m_bsize, lock, &bp))) { | 633 | mp->m_bsize, lock, &bp, ops); |
634 | if (error) | ||
629 | return error; | 635 | return error; |
630 | } | ||
631 | ASSERT(!xfs_buf_geterror(bp)); | 636 | ASSERT(!xfs_buf_geterror(bp)); |
632 | if (bp) | 637 | if (bp) |
633 | xfs_buf_set_ref(bp, refval); | 638 | xfs_buf_set_ref(bp, refval); |
@@ -642,15 +647,16 @@ xfs_btree_read_bufl( | |||
642 | /* ARGSUSED */ | 647 | /* ARGSUSED */ |
643 | void | 648 | void |
644 | xfs_btree_reada_bufl( | 649 | xfs_btree_reada_bufl( |
645 | xfs_mount_t *mp, /* file system mount point */ | 650 | struct xfs_mount *mp, /* file system mount point */ |
646 | xfs_fsblock_t fsbno, /* file system block number */ | 651 | xfs_fsblock_t fsbno, /* file system block number */ |
647 | xfs_extlen_t count) /* count of filesystem blocks */ | 652 | xfs_extlen_t count, /* count of filesystem blocks */ |
653 | const struct xfs_buf_ops *ops) | ||
648 | { | 654 | { |
649 | xfs_daddr_t d; | 655 | xfs_daddr_t d; |
650 | 656 | ||
651 | ASSERT(fsbno != NULLFSBLOCK); | 657 | ASSERT(fsbno != NULLFSBLOCK); |
652 | d = XFS_FSB_TO_DADDR(mp, fsbno); | 658 | d = XFS_FSB_TO_DADDR(mp, fsbno); |
653 | xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count); | 659 | xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops); |
654 | } | 660 | } |
655 | 661 | ||
656 | /* | 662 | /* |
@@ -660,17 +666,18 @@ xfs_btree_reada_bufl( | |||
660 | /* ARGSUSED */ | 666 | /* ARGSUSED */ |
661 | void | 667 | void |
662 | xfs_btree_reada_bufs( | 668 | xfs_btree_reada_bufs( |
663 | xfs_mount_t *mp, /* file system mount point */ | 669 | struct xfs_mount *mp, /* file system mount point */ |
664 | xfs_agnumber_t agno, /* allocation group number */ | 670 | xfs_agnumber_t agno, /* allocation group number */ |
665 | xfs_agblock_t agbno, /* allocation group block number */ | 671 | xfs_agblock_t agbno, /* allocation group block number */ |
666 | xfs_extlen_t count) /* count of filesystem blocks */ | 672 | xfs_extlen_t count, /* count of filesystem blocks */ |
673 | const struct xfs_buf_ops *ops) | ||
667 | { | 674 | { |
668 | xfs_daddr_t d; | 675 | xfs_daddr_t d; |
669 | 676 | ||
670 | ASSERT(agno != NULLAGNUMBER); | 677 | ASSERT(agno != NULLAGNUMBER); |
671 | ASSERT(agbno != NULLAGBLOCK); | 678 | ASSERT(agbno != NULLAGBLOCK); |
672 | d = XFS_AGB_TO_DADDR(mp, agno, agbno); | 679 | d = XFS_AGB_TO_DADDR(mp, agno, agbno); |
673 | xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count); | 680 | xfs_buf_readahead(mp->m_ddev_targp, d, mp->m_bsize * count, ops); |
674 | } | 681 | } |
675 | 682 | ||
676 | STATIC int | 683 | STATIC int |
@@ -684,12 +691,14 @@ xfs_btree_readahead_lblock( | |||
684 | xfs_dfsbno_t right = be64_to_cpu(block->bb_u.l.bb_rightsib); | 691 | xfs_dfsbno_t right = be64_to_cpu(block->bb_u.l.bb_rightsib); |
685 | 692 | ||
686 | if ((lr & XFS_BTCUR_LEFTRA) && left != NULLDFSBNO) { | 693 | if ((lr & XFS_BTCUR_LEFTRA) && left != NULLDFSBNO) { |
687 | xfs_btree_reada_bufl(cur->bc_mp, left, 1); | 694 | xfs_btree_reada_bufl(cur->bc_mp, left, 1, |
695 | cur->bc_ops->buf_ops); | ||
688 | rval++; | 696 | rval++; |
689 | } | 697 | } |
690 | 698 | ||
691 | if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLDFSBNO) { | 699 | if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLDFSBNO) { |
692 | xfs_btree_reada_bufl(cur->bc_mp, right, 1); | 700 | xfs_btree_reada_bufl(cur->bc_mp, right, 1, |
701 | cur->bc_ops->buf_ops); | ||
693 | rval++; | 702 | rval++; |
694 | } | 703 | } |
695 | 704 | ||
@@ -709,13 +718,13 @@ xfs_btree_readahead_sblock( | |||
709 | 718 | ||
710 | if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) { | 719 | if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) { |
711 | xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno, | 720 | xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno, |
712 | left, 1); | 721 | left, 1, cur->bc_ops->buf_ops); |
713 | rval++; | 722 | rval++; |
714 | } | 723 | } |
715 | 724 | ||
716 | if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) { | 725 | if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) { |
717 | xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno, | 726 | xfs_btree_reada_bufs(cur->bc_mp, cur->bc_private.a.agno, |
718 | right, 1); | 727 | right, 1, cur->bc_ops->buf_ops); |
719 | rval++; | 728 | rval++; |
720 | } | 729 | } |
721 | 730 | ||
@@ -853,18 +862,22 @@ xfs_btree_set_sibling( | |||
853 | } | 862 | } |
854 | } | 863 | } |
855 | 864 | ||
856 | STATIC void | 865 | void |
857 | xfs_btree_init_block( | 866 | xfs_btree_init_block( |
858 | struct xfs_btree_cur *cur, | 867 | struct xfs_mount *mp, |
859 | int level, | 868 | struct xfs_buf *bp, |
860 | int numrecs, | 869 | __u32 magic, |
861 | struct xfs_btree_block *new) /* new block */ | 870 | __u16 level, |
871 | __u16 numrecs, | ||
872 | unsigned int flags) | ||
862 | { | 873 | { |
863 | new->bb_magic = cpu_to_be32(xfs_magics[cur->bc_btnum]); | 874 | struct xfs_btree_block *new = XFS_BUF_TO_BLOCK(bp); |
875 | |||
876 | new->bb_magic = cpu_to_be32(magic); | ||
864 | new->bb_level = cpu_to_be16(level); | 877 | new->bb_level = cpu_to_be16(level); |
865 | new->bb_numrecs = cpu_to_be16(numrecs); | 878 | new->bb_numrecs = cpu_to_be16(numrecs); |
866 | 879 | ||
867 | if (cur->bc_flags & XFS_BTREE_LONG_PTRS) { | 880 | if (flags & XFS_BTREE_LONG_PTRS) { |
868 | new->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO); | 881 | new->bb_u.l.bb_leftsib = cpu_to_be64(NULLDFSBNO); |
869 | new->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO); | 882 | new->bb_u.l.bb_rightsib = cpu_to_be64(NULLDFSBNO); |
870 | } else { | 883 | } else { |
@@ -873,6 +886,17 @@ xfs_btree_init_block( | |||
873 | } | 886 | } |
874 | } | 887 | } |
875 | 888 | ||
889 | STATIC void | ||
890 | xfs_btree_init_block_cur( | ||
891 | struct xfs_btree_cur *cur, | ||
892 | int level, | ||
893 | int numrecs, | ||
894 | struct xfs_buf *bp) | ||
895 | { | ||
896 | xfs_btree_init_block(cur->bc_mp, bp, xfs_magics[cur->bc_btnum], | ||
897 | level, numrecs, cur->bc_flags); | ||
898 | } | ||
899 | |||
876 | /* | 900 | /* |
877 | * Return true if ptr is the last record in the btree and | 901 | * Return true if ptr is the last record in the btree and |
878 | * we need to track updates to this record. The decision | 902 | * we need to track updates to this record. The decision |
@@ -972,6 +996,7 @@ xfs_btree_get_buf_block( | |||
972 | if (!*bpp) | 996 | if (!*bpp) |
973 | return ENOMEM; | 997 | return ENOMEM; |
974 | 998 | ||
999 | (*bpp)->b_ops = cur->bc_ops->buf_ops; | ||
975 | *block = XFS_BUF_TO_BLOCK(*bpp); | 1000 | *block = XFS_BUF_TO_BLOCK(*bpp); |
976 | return 0; | 1001 | return 0; |
977 | } | 1002 | } |
@@ -998,19 +1023,15 @@ xfs_btree_read_buf_block( | |||
998 | 1023 | ||
999 | d = xfs_btree_ptr_to_daddr(cur, ptr); | 1024 | d = xfs_btree_ptr_to_daddr(cur, ptr); |
1000 | error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d, | 1025 | error = xfs_trans_read_buf(mp, cur->bc_tp, mp->m_ddev_targp, d, |
1001 | mp->m_bsize, flags, bpp); | 1026 | mp->m_bsize, flags, bpp, |
1027 | cur->bc_ops->buf_ops); | ||
1002 | if (error) | 1028 | if (error) |
1003 | return error; | 1029 | return error; |
1004 | 1030 | ||
1005 | ASSERT(!xfs_buf_geterror(*bpp)); | 1031 | ASSERT(!xfs_buf_geterror(*bpp)); |
1006 | |||
1007 | xfs_btree_set_refs(cur, *bpp); | 1032 | xfs_btree_set_refs(cur, *bpp); |
1008 | *block = XFS_BUF_TO_BLOCK(*bpp); | 1033 | *block = XFS_BUF_TO_BLOCK(*bpp); |
1009 | 1034 | return 0; | |
1010 | error = xfs_btree_check_block(cur, *block, level, *bpp); | ||
1011 | if (error) | ||
1012 | xfs_trans_brelse(cur->bc_tp, *bpp); | ||
1013 | return error; | ||
1014 | } | 1035 | } |
1015 | 1036 | ||
1016 | /* | 1037 | /* |
@@ -2183,7 +2204,7 @@ xfs_btree_split( | |||
2183 | goto error0; | 2204 | goto error0; |
2184 | 2205 | ||
2185 | /* Fill in the btree header for the new right block. */ | 2206 | /* Fill in the btree header for the new right block. */ |
2186 | xfs_btree_init_block(cur, xfs_btree_get_level(left), 0, right); | 2207 | xfs_btree_init_block_cur(cur, xfs_btree_get_level(left), 0, rbp); |
2187 | 2208 | ||
2188 | /* | 2209 | /* |
2189 | * Split the entries between the old and the new block evenly. | 2210 | * Split the entries between the old and the new block evenly. |
@@ -2492,7 +2513,7 @@ xfs_btree_new_root( | |||
2492 | nptr = 2; | 2513 | nptr = 2; |
2493 | } | 2514 | } |
2494 | /* Fill in the new block's btree header and log it. */ | 2515 | /* Fill in the new block's btree header and log it. */ |
2495 | xfs_btree_init_block(cur, cur->bc_nlevels, 2, new); | 2516 | xfs_btree_init_block_cur(cur, cur->bc_nlevels, 2, nbp); |
2496 | xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS); | 2517 | xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS); |
2497 | ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) && | 2518 | ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) && |
2498 | !xfs_btree_ptr_is_null(cur, &rptr)); | 2519 | !xfs_btree_ptr_is_null(cur, &rptr)); |
diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h index 5b240de104c0..f932897194eb 100644 --- a/fs/xfs/xfs_btree.h +++ b/fs/xfs/xfs_btree.h | |||
@@ -188,6 +188,8 @@ struct xfs_btree_ops { | |||
188 | __int64_t (*key_diff)(struct xfs_btree_cur *cur, | 188 | __int64_t (*key_diff)(struct xfs_btree_cur *cur, |
189 | union xfs_btree_key *key); | 189 | union xfs_btree_key *key); |
190 | 190 | ||
191 | const struct xfs_buf_ops *buf_ops; | ||
192 | |||
191 | #ifdef DEBUG | 193 | #ifdef DEBUG |
192 | /* check that k1 is lower than k2 */ | 194 | /* check that k1 is lower than k2 */ |
193 | int (*keys_inorder)(struct xfs_btree_cur *cur, | 195 | int (*keys_inorder)(struct xfs_btree_cur *cur, |
@@ -355,7 +357,8 @@ xfs_btree_read_bufl( | |||
355 | xfs_fsblock_t fsbno, /* file system block number */ | 357 | xfs_fsblock_t fsbno, /* file system block number */ |
356 | uint lock, /* lock flags for read_buf */ | 358 | uint lock, /* lock flags for read_buf */ |
357 | struct xfs_buf **bpp, /* buffer for fsbno */ | 359 | struct xfs_buf **bpp, /* buffer for fsbno */ |
358 | int refval);/* ref count value for buffer */ | 360 | int refval, /* ref count value for buffer */ |
361 | const struct xfs_buf_ops *ops); | ||
359 | 362 | ||
360 | /* | 363 | /* |
361 | * Read-ahead the block, don't wait for it, don't return a buffer. | 364 | * Read-ahead the block, don't wait for it, don't return a buffer. |
@@ -365,7 +368,8 @@ void /* error */ | |||
365 | xfs_btree_reada_bufl( | 368 | xfs_btree_reada_bufl( |
366 | struct xfs_mount *mp, /* file system mount point */ | 369 | struct xfs_mount *mp, /* file system mount point */ |
367 | xfs_fsblock_t fsbno, /* file system block number */ | 370 | xfs_fsblock_t fsbno, /* file system block number */ |
368 | xfs_extlen_t count); /* count of filesystem blocks */ | 371 | xfs_extlen_t count, /* count of filesystem blocks */ |
372 | const struct xfs_buf_ops *ops); | ||
369 | 373 | ||
370 | /* | 374 | /* |
371 | * Read-ahead the block, don't wait for it, don't return a buffer. | 375 | * Read-ahead the block, don't wait for it, don't return a buffer. |
@@ -376,8 +380,20 @@ xfs_btree_reada_bufs( | |||
376 | struct xfs_mount *mp, /* file system mount point */ | 380 | struct xfs_mount *mp, /* file system mount point */ |
377 | xfs_agnumber_t agno, /* allocation group number */ | 381 | xfs_agnumber_t agno, /* allocation group number */ |
378 | xfs_agblock_t agbno, /* allocation group block number */ | 382 | xfs_agblock_t agbno, /* allocation group block number */ |
379 | xfs_extlen_t count); /* count of filesystem blocks */ | 383 | xfs_extlen_t count, /* count of filesystem blocks */ |
384 | const struct xfs_buf_ops *ops); | ||
380 | 385 | ||
386 | /* | ||
387 | * Initialise a new btree block header | ||
388 | */ | ||
389 | void | ||
390 | xfs_btree_init_block( | ||
391 | struct xfs_mount *mp, | ||
392 | struct xfs_buf *bp, | ||
393 | __u32 magic, | ||
394 | __u16 level, | ||
395 | __u16 numrecs, | ||
396 | unsigned int flags); | ||
381 | 397 | ||
382 | /* | 398 | /* |
383 | * Common btree core entry points. | 399 | * Common btree core entry points. |
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 933b7930b863..26673a0b20e7 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c | |||
@@ -569,7 +569,9 @@ found: | |||
569 | */ | 569 | */ |
570 | if (bp->b_flags & XBF_STALE) { | 570 | if (bp->b_flags & XBF_STALE) { |
571 | ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0); | 571 | ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0); |
572 | ASSERT(bp->b_iodone == NULL); | ||
572 | bp->b_flags &= _XBF_KMEM | _XBF_PAGES; | 573 | bp->b_flags &= _XBF_KMEM | _XBF_PAGES; |
574 | bp->b_ops = NULL; | ||
573 | } | 575 | } |
574 | 576 | ||
575 | trace_xfs_buf_find(bp, flags, _RET_IP_); | 577 | trace_xfs_buf_find(bp, flags, _RET_IP_); |
@@ -654,7 +656,8 @@ xfs_buf_read_map( | |||
654 | struct xfs_buftarg *target, | 656 | struct xfs_buftarg *target, |
655 | struct xfs_buf_map *map, | 657 | struct xfs_buf_map *map, |
656 | int nmaps, | 658 | int nmaps, |
657 | xfs_buf_flags_t flags) | 659 | xfs_buf_flags_t flags, |
660 | const struct xfs_buf_ops *ops) | ||
658 | { | 661 | { |
659 | struct xfs_buf *bp; | 662 | struct xfs_buf *bp; |
660 | 663 | ||
@@ -666,6 +669,7 @@ xfs_buf_read_map( | |||
666 | 669 | ||
667 | if (!XFS_BUF_ISDONE(bp)) { | 670 | if (!XFS_BUF_ISDONE(bp)) { |
668 | XFS_STATS_INC(xb_get_read); | 671 | XFS_STATS_INC(xb_get_read); |
672 | bp->b_ops = ops; | ||
669 | _xfs_buf_read(bp, flags); | 673 | _xfs_buf_read(bp, flags); |
670 | } else if (flags & XBF_ASYNC) { | 674 | } else if (flags & XBF_ASYNC) { |
671 | /* | 675 | /* |
@@ -691,13 +695,14 @@ void | |||
691 | xfs_buf_readahead_map( | 695 | xfs_buf_readahead_map( |
692 | struct xfs_buftarg *target, | 696 | struct xfs_buftarg *target, |
693 | struct xfs_buf_map *map, | 697 | struct xfs_buf_map *map, |
694 | int nmaps) | 698 | int nmaps, |
699 | const struct xfs_buf_ops *ops) | ||
695 | { | 700 | { |
696 | if (bdi_read_congested(target->bt_bdi)) | 701 | if (bdi_read_congested(target->bt_bdi)) |
697 | return; | 702 | return; |
698 | 703 | ||
699 | xfs_buf_read_map(target, map, nmaps, | 704 | xfs_buf_read_map(target, map, nmaps, |
700 | XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD); | 705 | XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD, ops); |
701 | } | 706 | } |
702 | 707 | ||
703 | /* | 708 | /* |
@@ -709,10 +714,10 @@ xfs_buf_read_uncached( | |||
709 | struct xfs_buftarg *target, | 714 | struct xfs_buftarg *target, |
710 | xfs_daddr_t daddr, | 715 | xfs_daddr_t daddr, |
711 | size_t numblks, | 716 | size_t numblks, |
712 | int flags) | 717 | int flags, |
718 | const struct xfs_buf_ops *ops) | ||
713 | { | 719 | { |
714 | xfs_buf_t *bp; | 720 | struct xfs_buf *bp; |
715 | int error; | ||
716 | 721 | ||
717 | bp = xfs_buf_get_uncached(target, numblks, flags); | 722 | bp = xfs_buf_get_uncached(target, numblks, flags); |
718 | if (!bp) | 723 | if (!bp) |
@@ -723,13 +728,10 @@ xfs_buf_read_uncached( | |||
723 | bp->b_bn = daddr; | 728 | bp->b_bn = daddr; |
724 | bp->b_maps[0].bm_bn = daddr; | 729 | bp->b_maps[0].bm_bn = daddr; |
725 | bp->b_flags |= XBF_READ; | 730 | bp->b_flags |= XBF_READ; |
731 | bp->b_ops = ops; | ||
726 | 732 | ||
727 | xfsbdstrat(target->bt_mount, bp); | 733 | xfsbdstrat(target->bt_mount, bp); |
728 | error = xfs_buf_iowait(bp); | 734 | xfs_buf_iowait(bp); |
729 | if (error) { | ||
730 | xfs_buf_relse(bp); | ||
731 | return NULL; | ||
732 | } | ||
733 | return bp; | 735 | return bp; |
734 | } | 736 | } |
735 | 737 | ||
@@ -999,27 +1001,37 @@ STATIC void | |||
999 | xfs_buf_iodone_work( | 1001 | xfs_buf_iodone_work( |
1000 | struct work_struct *work) | 1002 | struct work_struct *work) |
1001 | { | 1003 | { |
1002 | xfs_buf_t *bp = | 1004 | struct xfs_buf *bp = |
1003 | container_of(work, xfs_buf_t, b_iodone_work); | 1005 | container_of(work, xfs_buf_t, b_iodone_work); |
1006 | bool read = !!(bp->b_flags & XBF_READ); | ||
1007 | |||
1008 | bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD); | ||
1009 | if (read && bp->b_ops) | ||
1010 | bp->b_ops->verify_read(bp); | ||
1004 | 1011 | ||
1005 | if (bp->b_iodone) | 1012 | if (bp->b_iodone) |
1006 | (*(bp->b_iodone))(bp); | 1013 | (*(bp->b_iodone))(bp); |
1007 | else if (bp->b_flags & XBF_ASYNC) | 1014 | else if (bp->b_flags & XBF_ASYNC) |
1008 | xfs_buf_relse(bp); | 1015 | xfs_buf_relse(bp); |
1016 | else { | ||
1017 | ASSERT(read && bp->b_ops); | ||
1018 | complete(&bp->b_iowait); | ||
1019 | } | ||
1009 | } | 1020 | } |
1010 | 1021 | ||
1011 | void | 1022 | void |
1012 | xfs_buf_ioend( | 1023 | xfs_buf_ioend( |
1013 | xfs_buf_t *bp, | 1024 | struct xfs_buf *bp, |
1014 | int schedule) | 1025 | int schedule) |
1015 | { | 1026 | { |
1027 | bool read = !!(bp->b_flags & XBF_READ); | ||
1028 | |||
1016 | trace_xfs_buf_iodone(bp, _RET_IP_); | 1029 | trace_xfs_buf_iodone(bp, _RET_IP_); |
1017 | 1030 | ||
1018 | bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD); | ||
1019 | if (bp->b_error == 0) | 1031 | if (bp->b_error == 0) |
1020 | bp->b_flags |= XBF_DONE; | 1032 | bp->b_flags |= XBF_DONE; |
1021 | 1033 | ||
1022 | if ((bp->b_iodone) || (bp->b_flags & XBF_ASYNC)) { | 1034 | if (bp->b_iodone || (read && bp->b_ops) || (bp->b_flags & XBF_ASYNC)) { |
1023 | if (schedule) { | 1035 | if (schedule) { |
1024 | INIT_WORK(&bp->b_iodone_work, xfs_buf_iodone_work); | 1036 | INIT_WORK(&bp->b_iodone_work, xfs_buf_iodone_work); |
1025 | queue_work(xfslogd_workqueue, &bp->b_iodone_work); | 1037 | queue_work(xfslogd_workqueue, &bp->b_iodone_work); |
@@ -1027,6 +1039,7 @@ xfs_buf_ioend( | |||
1027 | xfs_buf_iodone_work(&bp->b_iodone_work); | 1039 | xfs_buf_iodone_work(&bp->b_iodone_work); |
1028 | } | 1040 | } |
1029 | } else { | 1041 | } else { |
1042 | bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD); | ||
1030 | complete(&bp->b_iowait); | 1043 | complete(&bp->b_iowait); |
1031 | } | 1044 | } |
1032 | } | 1045 | } |
@@ -1197,9 +1210,14 @@ xfs_buf_bio_end_io( | |||
1197 | { | 1210 | { |
1198 | xfs_buf_t *bp = (xfs_buf_t *)bio->bi_private; | 1211 | xfs_buf_t *bp = (xfs_buf_t *)bio->bi_private; |
1199 | 1212 | ||
1200 | xfs_buf_ioerror(bp, -error); | 1213 | /* |
1214 | * don't overwrite existing errors - otherwise we can lose errors on | ||
1215 | * buffers that require multiple bios to complete. | ||
1216 | */ | ||
1217 | if (!bp->b_error) | ||
1218 | xfs_buf_ioerror(bp, -error); | ||
1201 | 1219 | ||
1202 | if (!error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ)) | 1220 | if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ)) |
1203 | invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp)); | 1221 | invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp)); |
1204 | 1222 | ||
1205 | _xfs_buf_ioend(bp, 1); | 1223 | _xfs_buf_ioend(bp, 1); |
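The completion handler above only records an error when none has been seen yet: a buffer spanning several bios gets one callback per bio, and unconditionally storing each status would let a later successful bio clobber an earlier failure. A small self-contained sketch of the first-error-wins pattern (the struct and values are invented for the demo):

#include <stdio.h>

struct buf { int error; };

/* Record a completion status without clobbering an earlier failure. */
static void bio_end_io(struct buf *bp, int error)
{
	if (!bp->error)
		bp->error = error;	/* first error wins */
}

int main(void)
{
	struct buf bp = { 0 };

	bio_end_io(&bp, -5);	/* first bio fails (EIO) */
	bio_end_io(&bp, 0);	/* second bio succeeds */
	printf("error = %d\n", bp.error);	/* -5: failure preserved */
	return 0;
}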
@@ -1279,6 +1297,11 @@ next_chunk: | |||
1279 | if (size) | 1297 | if (size) |
1280 | goto next_chunk; | 1298 | goto next_chunk; |
1281 | } else { | 1299 | } else { |
1300 | /* | ||
1301 | * This is guaranteed not to be the last io reference count | ||
1302 | * because the caller (xfs_buf_iorequest) holds a count itself. | ||
1303 | */ | ||
1304 | atomic_dec(&bp->b_io_remaining); | ||
1282 | xfs_buf_ioerror(bp, EIO); | 1305 | xfs_buf_ioerror(bp, EIO); |
1283 | bio_put(bio); | 1306 | bio_put(bio); |
1284 | } | 1307 | } |
@@ -1304,6 +1327,20 @@ _xfs_buf_ioapply( | |||
1304 | rw |= REQ_FUA; | 1327 | rw |= REQ_FUA; |
1305 | if (bp->b_flags & XBF_FLUSH) | 1328 | if (bp->b_flags & XBF_FLUSH) |
1306 | rw |= REQ_FLUSH; | 1329 | rw |= REQ_FLUSH; |
1330 | |||
1331 | /* | ||
1332 | * Run the write verifier callback function if it exists. If | ||
1333 | * this function fails it will mark the buffer with an error and | ||
1334 | * the IO should not be dispatched. | ||
1335 | */ | ||
1336 | if (bp->b_ops) { | ||
1337 | bp->b_ops->verify_write(bp); | ||
1338 | if (bp->b_error) { | ||
1339 | xfs_force_shutdown(bp->b_target->bt_mount, | ||
1340 | SHUTDOWN_CORRUPT_INCORE); | ||
1341 | return; | ||
1342 | } | ||
1343 | } | ||
1307 | } else if (bp->b_flags & XBF_READ_AHEAD) { | 1344 | } else if (bp->b_flags & XBF_READ_AHEAD) { |
1308 | rw = READA; | 1345 | rw = READA; |
1309 | } else { | 1346 | } else { |
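The write side now runs the verifier before the I/O is issued, so a corrupt in-memory block triggers a shutdown instead of reaching disk. A hedged, self-contained sketch of that gate (all names here are illustrative, not the XFS functions):

#include <stdio.h>

struct buf {
	int error;
	void (*verify_write)(struct buf *bp);
};

static void bad_verify(struct buf *bp)
{
	bp->error = 117;	/* EFSCORRUPTED-style: in-core block is bad */
}

static void submit_write(struct buf *bp)
{
	/* run the write verifier first; refuse to dispatch on failure */
	if (bp->verify_write) {
		bp->verify_write(bp);
		if (bp->error) {
			printf("shutdown: corrupt in-core block, I/O not issued\n");
			return;
		}
	}
	printf("write dispatched\n");
}

int main(void)
{
	struct buf bp = { 0, bad_verify };

	submit_write(&bp);	/* verifier fails, write never hits disk */
	return 0;
}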
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 7c0b6a0a1557..23f5642480bb 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h | |||
@@ -100,6 +100,7 @@ typedef struct xfs_buftarg { | |||
100 | struct xfs_buf; | 100 | struct xfs_buf; |
101 | typedef void (*xfs_buf_iodone_t)(struct xfs_buf *); | 101 | typedef void (*xfs_buf_iodone_t)(struct xfs_buf *); |
102 | 102 | ||
103 | |||
103 | #define XB_PAGES 2 | 104 | #define XB_PAGES 2 |
104 | 105 | ||
105 | struct xfs_buf_map { | 106 | struct xfs_buf_map { |
@@ -110,6 +111,11 @@ struct xfs_buf_map { | |||
110 | #define DEFINE_SINGLE_BUF_MAP(map, blkno, numblk) \ | 111 | #define DEFINE_SINGLE_BUF_MAP(map, blkno, numblk) \ |
111 | struct xfs_buf_map (map) = { .bm_bn = (blkno), .bm_len = (numblk) }; | 112 | struct xfs_buf_map (map) = { .bm_bn = (blkno), .bm_len = (numblk) }; |
112 | 113 | ||
114 | struct xfs_buf_ops { | ||
115 | void (*verify_read)(struct xfs_buf *); | ||
116 | void (*verify_write)(struct xfs_buf *); | ||
117 | }; | ||
118 | |||
113 | typedef struct xfs_buf { | 119 | typedef struct xfs_buf { |
114 | /* | 120 | /* |
115 | * first cacheline holds all the fields needed for an uncontended cache | 121 | * first cacheline holds all the fields needed for an uncontended cache |
@@ -153,13 +159,13 @@ typedef struct xfs_buf { | |||
153 | unsigned int b_page_count; /* size of page array */ | 159 | unsigned int b_page_count; /* size of page array */ |
154 | unsigned int b_offset; /* page offset in first page */ | 160 | unsigned int b_offset; /* page offset in first page */ |
155 | unsigned short b_error; /* error code on I/O */ | 161 | unsigned short b_error; /* error code on I/O */ |
162 | const struct xfs_buf_ops *b_ops; | ||
156 | 163 | ||
157 | #ifdef XFS_BUF_LOCK_TRACKING | 164 | #ifdef XFS_BUF_LOCK_TRACKING |
158 | int b_last_holder; | 165 | int b_last_holder; |
159 | #endif | 166 | #endif |
160 | } xfs_buf_t; | 167 | } xfs_buf_t; |
161 | 168 | ||
162 | |||
163 | /* Finding and Reading Buffers */ | 169 | /* Finding and Reading Buffers */ |
164 | struct xfs_buf *_xfs_buf_find(struct xfs_buftarg *target, | 170 | struct xfs_buf *_xfs_buf_find(struct xfs_buftarg *target, |
165 | struct xfs_buf_map *map, int nmaps, | 171 | struct xfs_buf_map *map, int nmaps, |
@@ -196,9 +202,11 @@ struct xfs_buf *xfs_buf_get_map(struct xfs_buftarg *target, | |||
196 | xfs_buf_flags_t flags); | 202 | xfs_buf_flags_t flags); |
197 | struct xfs_buf *xfs_buf_read_map(struct xfs_buftarg *target, | 203 | struct xfs_buf *xfs_buf_read_map(struct xfs_buftarg *target, |
198 | struct xfs_buf_map *map, int nmaps, | 204 | struct xfs_buf_map *map, int nmaps, |
199 | xfs_buf_flags_t flags); | 205 | xfs_buf_flags_t flags, |
206 | const struct xfs_buf_ops *ops); | ||
200 | void xfs_buf_readahead_map(struct xfs_buftarg *target, | 207 | void xfs_buf_readahead_map(struct xfs_buftarg *target, |
201 | struct xfs_buf_map *map, int nmaps); | 208 | struct xfs_buf_map *map, int nmaps, |
209 | const struct xfs_buf_ops *ops); | ||
202 | 210 | ||
203 | static inline struct xfs_buf * | 211 | static inline struct xfs_buf * |
204 | xfs_buf_get( | 212 | xfs_buf_get( |
@@ -216,20 +224,22 @@ xfs_buf_read( | |||
216 | struct xfs_buftarg *target, | 224 | struct xfs_buftarg *target, |
217 | xfs_daddr_t blkno, | 225 | xfs_daddr_t blkno, |
218 | size_t numblks, | 226 | size_t numblks, |
219 | xfs_buf_flags_t flags) | 227 | xfs_buf_flags_t flags, |
228 | const struct xfs_buf_ops *ops) | ||
220 | { | 229 | { |
221 | DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); | 230 | DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); |
222 | return xfs_buf_read_map(target, &map, 1, flags); | 231 | return xfs_buf_read_map(target, &map, 1, flags, ops); |
223 | } | 232 | } |
224 | 233 | ||
225 | static inline void | 234 | static inline void |
226 | xfs_buf_readahead( | 235 | xfs_buf_readahead( |
227 | struct xfs_buftarg *target, | 236 | struct xfs_buftarg *target, |
228 | xfs_daddr_t blkno, | 237 | xfs_daddr_t blkno, |
229 | size_t numblks) | 238 | size_t numblks, |
239 | const struct xfs_buf_ops *ops) | ||
230 | { | 240 | { |
231 | DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); | 241 | DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); |
232 | return xfs_buf_readahead_map(target, &map, 1); | 242 | return xfs_buf_readahead_map(target, &map, 1, ops); |
233 | } | 243 | } |
234 | 244 | ||
235 | struct xfs_buf *xfs_buf_get_empty(struct xfs_buftarg *target, size_t numblks); | 245 | struct xfs_buf *xfs_buf_get_empty(struct xfs_buftarg *target, size_t numblks); |
@@ -239,7 +249,8 @@ int xfs_buf_associate_memory(struct xfs_buf *bp, void *mem, size_t length); | |||
239 | struct xfs_buf *xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks, | 249 | struct xfs_buf *xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks, |
240 | int flags); | 250 | int flags); |
241 | struct xfs_buf *xfs_buf_read_uncached(struct xfs_buftarg *target, | 251 | struct xfs_buf *xfs_buf_read_uncached(struct xfs_buftarg *target, |
242 | xfs_daddr_t daddr, size_t numblks, int flags); | 252 | xfs_daddr_t daddr, size_t numblks, int flags, |
253 | const struct xfs_buf_ops *ops); | ||
243 | void xfs_buf_hold(struct xfs_buf *bp); | 254 | void xfs_buf_hold(struct xfs_buf *bp); |
244 | 255 | ||
245 | /* Releasing Buffers */ | 256 | /* Releasing Buffers */ |
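Putting the new pieces together, a subsystem defines a const struct xfs_buf_ops and hands it in at read time. The sketch below is against the interfaces added above; xfs_demo_* and DEMO_MAGIC are hypothetical, while the struct layout, xfs_buf_ioerror() and the xfs_buf_read() signature are the ones in this patch:

/* hypothetical verifier pair for a single-magic metadata block */
static void xfs_demo_read_verify(struct xfs_buf *bp)
{
	__be32 *magic = bp->b_addr;

	if (*magic != cpu_to_be32(DEMO_MAGIC))
		xfs_buf_ioerror(bp, EFSCORRUPTED);	/* fail the read */
}

static void xfs_demo_write_verify(struct xfs_buf *bp)
{
	/* same structural checks before the write is dispatched */
	xfs_demo_read_verify(bp);
}

static const struct xfs_buf_ops xfs_demo_buf_ops = {
	.verify_read	= xfs_demo_read_verify,
	.verify_write	= xfs_demo_write_verify,
};

/* A caller then attaches the ops at read time, e.g.:
 *
 *	bp = xfs_buf_read(target, blkno, numblks, 0, &xfs_demo_buf_ops);
 *
 * so verify_read runs when the physical read completes and
 * verify_write runs just before the buffer is dispatched to disk.
 */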
diff --git a/fs/xfs/xfs_cksum.h b/fs/xfs/xfs_cksum.h new file mode 100644 index 000000000000..fad1676ad8cd --- /dev/null +++ b/fs/xfs/xfs_cksum.h | |||
@@ -0,0 +1,63 @@ | |||
1 | #ifndef _XFS_CKSUM_H | ||
2 | #define _XFS_CKSUM_H 1 | ||
3 | |||
4 | #define XFS_CRC_SEED (~(__uint32_t)0) | ||
5 | |||
6 | /* | ||
7 | * Calculate the intermediate checksum for a buffer that has the CRC field | ||
8 | * inside it. The offset of the 32bit crc fields is passed as the | ||
9 | * cksum_offset parameter. | ||
10 | */ | ||
11 | static inline __uint32_t | ||
12 | xfs_start_cksum(char *buffer, size_t length, unsigned long cksum_offset) | ||
13 | { | ||
14 | __uint32_t zero = 0; | ||
15 | __uint32_t crc; | ||
16 | |||
17 | /* Calculate CRC up to the checksum. */ | ||
18 | crc = crc32c(XFS_CRC_SEED, buffer, cksum_offset); | ||
19 | |||
20 | /* Skip checksum field */ | ||
21 | crc = crc32c(crc, &zero, sizeof(__u32)); | ||
22 | |||
23 | /* Calculate the rest of the CRC. */ | ||
24 | return crc32c(crc, &buffer[cksum_offset + sizeof(__be32)], | ||
25 | length - (cksum_offset + sizeof(__be32))); | ||
26 | } | ||
27 | |||
28 | /* | ||
29 | * Convert the intermediate checksum to the final ondisk format. | ||
30 | * | ||
31 | * The CRC32c calculation uses LE format even on BE machines, but returns the | ||
32 | * result in host endian format. Hence we need to byte swap it back to LE format | ||
33 | * so that it is consistent on disk. | ||
34 | */ | ||
35 | static inline __le32 | ||
36 | xfs_end_cksum(__uint32_t crc) | ||
37 | { | ||
38 | return ~cpu_to_le32(crc); | ||
39 | } | ||
40 | |||
41 | /* | ||
42 | * Helper to generate the checksum for a buffer. | ||
43 | */ | ||
44 | static inline void | ||
45 | xfs_update_cksum(char *buffer, size_t length, unsigned long cksum_offset) | ||
46 | { | ||
47 | __uint32_t crc = xfs_start_cksum(buffer, length, cksum_offset); | ||
48 | |||
49 | *(__le32 *)(buffer + cksum_offset) = xfs_end_cksum(crc); | ||
50 | } | ||
51 | |||
52 | /* | ||
53 | * Helper to verify the checksum for a buffer. | ||
54 | */ | ||
55 | static inline int | ||
56 | xfs_verify_cksum(char *buffer, size_t length, unsigned long cksum_offset) | ||
57 | { | ||
58 | __uint32_t crc = xfs_start_cksum(buffer, length, cksum_offset); | ||
59 | |||
60 | return *(__le32 *)(buffer + cksum_offset) == xfs_end_cksum(crc); | ||
61 | } | ||
62 | |||
63 | #endif /* _XFS_CKSUM_H */ | ||
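For a concrete picture of how these helpers are meant to be called: the sketch below invents a demo_hdr layout; only the two helper calls and the zero-substitution behaviour of xfs_start_cksum() come from the header above. The stored CRC is computed as if the crc field itself were zero, which is exactly what xfs_start_cksum() arranges.

/* offsetof() comes from <linux/stddef.h>; layout is hypothetical */
struct demo_hdr {
	__be32	magic;
	__le32	crc;		/* covered as a zero word by the CRC */
	char	payload[504];
};

static void demo_hdr_write_cksum(struct demo_hdr *hdr)
{
	/* before the block is written */
	xfs_update_cksum((char *)hdr, sizeof(*hdr),
			 offsetof(struct demo_hdr, crc));
}

static int demo_hdr_check_cksum(struct demo_hdr *hdr)
{
	/* after the block is read back */
	if (!xfs_verify_cksum((char *)hdr, sizeof(*hdr),
			      offsetof(struct demo_hdr, crc)))
		return EFSCORRUPTED;	/* checksum mismatch */
	return 0;
}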
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index 7bfb7dd334fc..4d7696a02418 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c | |||
@@ -91,6 +91,84 @@ STATIC int xfs_da_blk_unlink(xfs_da_state_t *state, | |||
91 | xfs_da_state_blk_t *save_blk); | 91 | xfs_da_state_blk_t *save_blk); |
92 | STATIC void xfs_da_state_kill_altpath(xfs_da_state_t *state); | 92 | STATIC void xfs_da_state_kill_altpath(xfs_da_state_t *state); |
93 | 93 | ||
94 | static void | ||
95 | xfs_da_node_verify( | ||
96 | struct xfs_buf *bp) | ||
97 | { | ||
98 | struct xfs_mount *mp = bp->b_target->bt_mount; | ||
99 | struct xfs_da_node_hdr *hdr = bp->b_addr; | ||
100 | int block_ok = 0; | ||
101 | |||
102 | block_ok = hdr->info.magic == cpu_to_be16(XFS_DA_NODE_MAGIC); | ||
103 | block_ok = block_ok && | ||
104 | be16_to_cpu(hdr->level) > 0 && | ||
105 | be16_to_cpu(hdr->count) > 0; | ||
106 | if (!block_ok) { | ||
107 | XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr); | ||
108 | xfs_buf_ioerror(bp, EFSCORRUPTED); | ||
109 | } | ||
110 | |||
111 | } | ||
112 | |||
113 | static void | ||
114 | xfs_da_node_write_verify( | ||
115 | struct xfs_buf *bp) | ||
116 | { | ||
117 | xfs_da_node_verify(bp); | ||
118 | } | ||
119 | |||
120 | /* | ||
121 | * Leaf/node format detection on trees is sketchy, so a node read can land on | ||
122 | * a leaf-level block when detection incorrectly identifies the tree as node | ||
123 | * format. In this case, we need to swap the verifier to match the actual | ||
124 | * format of the block being read. | ||
125 | */ | ||
126 | static void | ||
127 | xfs_da_node_read_verify( | ||
128 | struct xfs_buf *bp) | ||
129 | { | ||
130 | struct xfs_mount *mp = bp->b_target->bt_mount; | ||
131 | struct xfs_da_blkinfo *info = bp->b_addr; | ||
132 | |||
133 | switch (be16_to_cpu(info->magic)) { | ||
134 | case XFS_DA_NODE_MAGIC: | ||
135 | xfs_da_node_verify(bp); | ||
136 | break; | ||
137 | case XFS_ATTR_LEAF_MAGIC: | ||
138 | bp->b_ops = &xfs_attr_leaf_buf_ops; | ||
139 | bp->b_ops->verify_read(bp); | ||
140 | return; | ||
141 | case XFS_DIR2_LEAFN_MAGIC: | ||
142 | bp->b_ops = &xfs_dir2_leafn_buf_ops; | ||
143 | bp->b_ops->verify_read(bp); | ||
144 | return; | ||
145 | default: | ||
146 | XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, | ||
147 | mp, info); | ||
148 | xfs_buf_ioerror(bp, EFSCORRUPTED); | ||
149 | break; | ||
150 | } | ||
151 | } | ||
152 | |||
153 | const struct xfs_buf_ops xfs_da_node_buf_ops = { | ||
154 | .verify_read = xfs_da_node_read_verify, | ||
155 | .verify_write = xfs_da_node_write_verify, | ||
156 | }; | ||
157 | |||
158 | |||
159 | int | ||
160 | xfs_da_node_read( | ||
161 | struct xfs_trans *tp, | ||
162 | struct xfs_inode *dp, | ||
163 | xfs_dablk_t bno, | ||
164 | xfs_daddr_t mappedbno, | ||
165 | struct xfs_buf **bpp, | ||
166 | int which_fork) | ||
167 | { | ||
168 | return xfs_da_read_buf(tp, dp, bno, mappedbno, bpp, | ||
169 | which_fork, &xfs_da_node_buf_ops); | ||
170 | } | ||
171 | |||
94 | /*======================================================================== | 172 | /*======================================================================== |
95 | * Routines used for growing the Btree. | 173 | * Routines used for growing the Btree. |
96 | *========================================================================*/ | 174 | *========================================================================*/ |
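The read verifier above dispatches on the on-disk magic precisely because tree-format detection can guess wrong. The same idiom in a self-contained form (magics, types and return values are invented for the demo):

#include <stdio.h>

enum { NODE_MAGIC = 0xfebe, LEAF_MAGIC = 0xd2f1 };

struct block { unsigned short magic; };

static int verify_node(struct block *b) { return 0; }
static int verify_leaf(struct block *b) { return 0; }

/* dispatch to the verifier that matches the block actually found */
static int read_verify(struct block *b)
{
	switch (b->magic) {
	case NODE_MAGIC:
		return verify_node(b);
	case LEAF_MAGIC:
		/* detection guessed wrong: verify as a leaf instead */
		return verify_leaf(b);
	default:
		return -1;	/* corrupt: unknown magic */
	}
}

int main(void)
{
	struct block leaf = { LEAF_MAGIC };

	printf("%d\n", read_verify(&leaf));	/* 0: verified as a leaf */
	return 0;
}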
@@ -125,6 +203,7 @@ xfs_da_node_create(xfs_da_args_t *args, xfs_dablk_t blkno, int level, | |||
125 | xfs_trans_log_buf(tp, bp, | 203 | xfs_trans_log_buf(tp, bp, |
126 | XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr))); | 204 | XFS_DA_LOGRANGE(node, &node->hdr, sizeof(node->hdr))); |
127 | 205 | ||
206 | bp->b_ops = &xfs_da_node_buf_ops; | ||
128 | *bpp = bp; | 207 | *bpp = bp; |
129 | return(0); | 208 | return(0); |
130 | } | 209 | } |
@@ -324,6 +403,8 @@ xfs_da_root_split(xfs_da_state_t *state, xfs_da_state_blk_t *blk1, | |||
324 | } | 403 | } |
325 | memcpy(node, oldroot, size); | 404 | memcpy(node, oldroot, size); |
326 | xfs_trans_log_buf(tp, bp, 0, size - 1); | 405 | xfs_trans_log_buf(tp, bp, 0, size - 1); |
406 | |||
407 | bp->b_ops = blk1->bp->b_ops; | ||
327 | blk1->bp = bp; | 408 | blk1->bp = bp; |
328 | blk1->blkno = blkno; | 409 | blk1->blkno = blkno; |
329 | 410 | ||
@@ -746,7 +827,7 @@ xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk) | |||
746 | */ | 827 | */ |
747 | child = be32_to_cpu(oldroot->btree[0].before); | 828 | child = be32_to_cpu(oldroot->btree[0].before); |
748 | ASSERT(child != 0); | 829 | ASSERT(child != 0); |
749 | error = xfs_da_read_buf(args->trans, args->dp, child, -1, &bp, | 830 | error = xfs_da_node_read(args->trans, args->dp, child, -1, &bp, |
750 | args->whichfork); | 831 | args->whichfork); |
751 | if (error) | 832 | if (error) |
752 | return(error); | 833 | return(error); |
@@ -754,7 +835,14 @@ xfs_da_root_join(xfs_da_state_t *state, xfs_da_state_blk_t *root_blk) | |||
754 | xfs_da_blkinfo_onlychild_validate(bp->b_addr, | 835 | xfs_da_blkinfo_onlychild_validate(bp->b_addr, |
755 | be16_to_cpu(oldroot->hdr.level)); | 836 | be16_to_cpu(oldroot->hdr.level)); |
756 | 837 | ||
838 | /* | ||
839 | * This could be copying a leaf back into the root block in the case of | ||
840 | * there only being a single leaf block left in the tree. Hence we have | ||
841 | * to update the b_ops pointer as well to match the buffer type change | ||
842 | * that could occur. | ||
843 | */ | ||
757 | memcpy(root_blk->bp->b_addr, bp->b_addr, state->blocksize); | 844 | memcpy(root_blk->bp->b_addr, bp->b_addr, state->blocksize); |
845 | root_blk->bp->b_ops = bp->b_ops; | ||
758 | xfs_trans_log_buf(args->trans, root_blk->bp, 0, state->blocksize - 1); | 846 | xfs_trans_log_buf(args->trans, root_blk->bp, 0, state->blocksize - 1); |
759 | error = xfs_da_shrink_inode(args, child, bp); | 847 | error = xfs_da_shrink_inode(args, child, bp); |
760 | return(error); | 848 | return(error); |
@@ -779,6 +867,8 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action) | |||
779 | xfs_dablk_t blkno; | 867 | xfs_dablk_t blkno; |
780 | struct xfs_buf *bp; | 868 | struct xfs_buf *bp; |
781 | 869 | ||
870 | trace_xfs_da_node_toosmall(state->args); | ||
871 | |||
782 | /* | 872 | /* |
783 | * Check for the degenerate case of the block being over 50% full. | 873 | * Check for the degenerate case of the block being over 50% full. |
784 | * If so, it's not worth even looking to see if we might be able | 874 | * If so, it's not worth even looking to see if we might be able |
@@ -835,7 +925,7 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action) | |||
835 | blkno = be32_to_cpu(info->back); | 925 | blkno = be32_to_cpu(info->back); |
836 | if (blkno == 0) | 926 | if (blkno == 0) |
837 | continue; | 927 | continue; |
838 | error = xfs_da_read_buf(state->args->trans, state->args->dp, | 928 | error = xfs_da_node_read(state->args->trans, state->args->dp, |
839 | blkno, -1, &bp, state->args->whichfork); | 929 | blkno, -1, &bp, state->args->whichfork); |
840 | if (error) | 930 | if (error) |
841 | return(error); | 931 | return(error); |
@@ -900,6 +990,8 @@ xfs_da_fixhashpath(xfs_da_state_t *state, xfs_da_state_path_t *path) | |||
900 | xfs_dahash_t lasthash=0; | 990 | xfs_dahash_t lasthash=0; |
901 | int level, count; | 991 | int level, count; |
902 | 992 | ||
993 | trace_xfs_da_fixhashpath(state->args); | ||
994 | |||
903 | level = path->active-1; | 995 | level = path->active-1; |
904 | blk = &path->blk[ level ]; | 996 | blk = &path->blk[ level ]; |
905 | switch (blk->magic) { | 997 | switch (blk->magic) { |
@@ -1079,7 +1171,7 @@ xfs_da_node_lookup_int(xfs_da_state_t *state, int *result) | |||
1079 | * Read the next node down in the tree. | 1171 | * Read the next node down in the tree. |
1080 | */ | 1172 | */ |
1081 | blk->blkno = blkno; | 1173 | blk->blkno = blkno; |
1082 | error = xfs_da_read_buf(args->trans, args->dp, blkno, | 1174 | error = xfs_da_node_read(args->trans, args->dp, blkno, |
1083 | -1, &blk->bp, args->whichfork); | 1175 | -1, &blk->bp, args->whichfork); |
1084 | if (error) { | 1176 | if (error) { |
1085 | blk->blkno = 0; | 1177 | blk->blkno = 0; |
@@ -1241,7 +1333,7 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, | |||
1241 | new_info->forw = cpu_to_be32(old_blk->blkno); | 1333 | new_info->forw = cpu_to_be32(old_blk->blkno); |
1242 | new_info->back = old_info->back; | 1334 | new_info->back = old_info->back; |
1243 | if (old_info->back) { | 1335 | if (old_info->back) { |
1244 | error = xfs_da_read_buf(args->trans, args->dp, | 1336 | error = xfs_da_node_read(args->trans, args->dp, |
1245 | be32_to_cpu(old_info->back), | 1337 | be32_to_cpu(old_info->back), |
1246 | -1, &bp, args->whichfork); | 1338 | -1, &bp, args->whichfork); |
1247 | if (error) | 1339 | if (error) |
@@ -1262,7 +1354,7 @@ xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, | |||
1262 | new_info->forw = old_info->forw; | 1354 | new_info->forw = old_info->forw; |
1263 | new_info->back = cpu_to_be32(old_blk->blkno); | 1355 | new_info->back = cpu_to_be32(old_blk->blkno); |
1264 | if (old_info->forw) { | 1356 | if (old_info->forw) { |
1265 | error = xfs_da_read_buf(args->trans, args->dp, | 1357 | error = xfs_da_node_read(args->trans, args->dp, |
1266 | be32_to_cpu(old_info->forw), | 1358 | be32_to_cpu(old_info->forw), |
1267 | -1, &bp, args->whichfork); | 1359 | -1, &bp, args->whichfork); |
1268 | if (error) | 1360 | if (error) |
@@ -1362,7 +1454,7 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, | |||
1362 | trace_xfs_da_unlink_back(args); | 1454 | trace_xfs_da_unlink_back(args); |
1363 | save_info->back = drop_info->back; | 1455 | save_info->back = drop_info->back; |
1364 | if (drop_info->back) { | 1456 | if (drop_info->back) { |
1365 | error = xfs_da_read_buf(args->trans, args->dp, | 1457 | error = xfs_da_node_read(args->trans, args->dp, |
1366 | be32_to_cpu(drop_info->back), | 1458 | be32_to_cpu(drop_info->back), |
1367 | -1, &bp, args->whichfork); | 1459 | -1, &bp, args->whichfork); |
1368 | if (error) | 1460 | if (error) |
@@ -1379,7 +1471,7 @@ xfs_da_blk_unlink(xfs_da_state_t *state, xfs_da_state_blk_t *drop_blk, | |||
1379 | trace_xfs_da_unlink_forward(args); | 1471 | trace_xfs_da_unlink_forward(args); |
1380 | save_info->forw = drop_info->forw; | 1472 | save_info->forw = drop_info->forw; |
1381 | if (drop_info->forw) { | 1473 | if (drop_info->forw) { |
1382 | error = xfs_da_read_buf(args->trans, args->dp, | 1474 | error = xfs_da_node_read(args->trans, args->dp, |
1383 | be32_to_cpu(drop_info->forw), | 1475 | be32_to_cpu(drop_info->forw), |
1384 | -1, &bp, args->whichfork); | 1476 | -1, &bp, args->whichfork); |
1385 | if (error) | 1477 | if (error) |
@@ -1417,6 +1509,8 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path, | |||
1417 | xfs_dablk_t blkno=0; | 1509 | xfs_dablk_t blkno=0; |
1418 | int level, error; | 1510 | int level, error; |
1419 | 1511 | ||
1512 | trace_xfs_da_path_shift(state->args); | ||
1513 | |||
1420 | /* | 1514 | /* |
1421 | * Roll up the Btree looking for the first block where our | 1515 | * Roll up the Btree looking for the first block where our |
1422 | * current index is not at the edge of the block. Note that | 1516 | * current index is not at the edge of the block. Note that |
@@ -1463,8 +1557,8 @@ xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path, | |||
1463 | * Read the next child block. | 1557 | * Read the next child block. |
1464 | */ | 1558 | */ |
1465 | blk->blkno = blkno; | 1559 | blk->blkno = blkno; |
1466 | error = xfs_da_read_buf(args->trans, args->dp, blkno, -1, | 1560 | error = xfs_da_node_read(args->trans, args->dp, blkno, -1, |
1467 | &blk->bp, args->whichfork); | 1561 | &blk->bp, args->whichfork); |
1468 | if (error) | 1562 | if (error) |
1469 | return(error); | 1563 | return(error); |
1470 | ASSERT(blk->bp != NULL); | 1564 | ASSERT(blk->bp != NULL); |
@@ -1727,7 +1821,8 @@ xfs_da_swap_lastblock( | |||
1727 | * Read the last block in the btree space. | 1821 | * Read the last block in the btree space. |
1728 | */ | 1822 | */ |
1729 | last_blkno = (xfs_dablk_t)lastoff - mp->m_dirblkfsbs; | 1823 | last_blkno = (xfs_dablk_t)lastoff - mp->m_dirblkfsbs; |
1730 | if ((error = xfs_da_read_buf(tp, ip, last_blkno, -1, &last_buf, w))) | 1824 | error = xfs_da_node_read(tp, ip, last_blkno, -1, &last_buf, w); |
1825 | if (error) | ||
1731 | return error; | 1826 | return error; |
1732 | /* | 1827 | /* |
1733 | * Copy the last block into the dead buffer and log it. | 1828 | * Copy the last block into the dead buffer and log it. |
@@ -1753,7 +1848,8 @@ xfs_da_swap_lastblock( | |||
1753 | * If the moved block has a left sibling, fix up the pointers. | 1848 | * If the moved block has a left sibling, fix up the pointers. |
1754 | */ | 1849 | */ |
1755 | if ((sib_blkno = be32_to_cpu(dead_info->back))) { | 1850 | if ((sib_blkno = be32_to_cpu(dead_info->back))) { |
1756 | if ((error = xfs_da_read_buf(tp, ip, sib_blkno, -1, &sib_buf, w))) | 1851 | error = xfs_da_node_read(tp, ip, sib_blkno, -1, &sib_buf, w); |
1852 | if (error) | ||
1757 | goto done; | 1853 | goto done; |
1758 | sib_info = sib_buf->b_addr; | 1854 | sib_info = sib_buf->b_addr; |
1759 | if (unlikely( | 1855 | if (unlikely( |
@@ -1774,7 +1870,8 @@ xfs_da_swap_lastblock( | |||
1774 | * If the moved block has a right sibling, fix up the pointers. | 1870 | * If the moved block has a right sibling, fix up the pointers. |
1775 | */ | 1871 | */ |
1776 | if ((sib_blkno = be32_to_cpu(dead_info->forw))) { | 1872 | if ((sib_blkno = be32_to_cpu(dead_info->forw))) { |
1777 | if ((error = xfs_da_read_buf(tp, ip, sib_blkno, -1, &sib_buf, w))) | 1873 | error = xfs_da_node_read(tp, ip, sib_blkno, -1, &sib_buf, w); |
1874 | if (error) | ||
1778 | goto done; | 1875 | goto done; |
1779 | sib_info = sib_buf->b_addr; | 1876 | sib_info = sib_buf->b_addr; |
1780 | if (unlikely( | 1877 | if (unlikely( |
@@ -1797,7 +1894,8 @@ xfs_da_swap_lastblock( | |||
1797 | * Walk down the tree looking for the parent of the moved block. | 1894 | * Walk down the tree looking for the parent of the moved block. |
1798 | */ | 1895 | */ |
1799 | for (;;) { | 1896 | for (;;) { |
1800 | if ((error = xfs_da_read_buf(tp, ip, par_blkno, -1, &par_buf, w))) | 1897 | error = xfs_da_node_read(tp, ip, par_blkno, -1, &par_buf, w); |
1898 | if (error) | ||
1801 | goto done; | 1899 | goto done; |
1802 | par_node = par_buf->b_addr; | 1900 | par_node = par_buf->b_addr; |
1803 | if (unlikely(par_node->hdr.info.magic != | 1901 | if (unlikely(par_node->hdr.info.magic != |
@@ -1847,7 +1945,8 @@ xfs_da_swap_lastblock( | |||
1847 | error = XFS_ERROR(EFSCORRUPTED); | 1945 | error = XFS_ERROR(EFSCORRUPTED); |
1848 | goto done; | 1946 | goto done; |
1849 | } | 1947 | } |
1850 | if ((error = xfs_da_read_buf(tp, ip, par_blkno, -1, &par_buf, w))) | 1948 | error = xfs_da_node_read(tp, ip, par_blkno, -1, &par_buf, w); |
1949 | if (error) | ||
1851 | goto done; | 1950 | goto done; |
1852 | par_node = par_buf->b_addr; | 1951 | par_node = par_buf->b_addr; |
1853 | if (unlikely( | 1952 | if (unlikely( |
@@ -2133,7 +2232,8 @@ xfs_da_read_buf( | |||
2133 | xfs_dablk_t bno, | 2232 | xfs_dablk_t bno, |
2134 | xfs_daddr_t mappedbno, | 2233 | xfs_daddr_t mappedbno, |
2135 | struct xfs_buf **bpp, | 2234 | struct xfs_buf **bpp, |
2136 | int whichfork) | 2235 | int whichfork, |
2236 | const struct xfs_buf_ops *ops) | ||
2137 | { | 2237 | { |
2138 | struct xfs_buf *bp; | 2238 | struct xfs_buf *bp; |
2139 | struct xfs_buf_map map; | 2239 | struct xfs_buf_map map; |
@@ -2155,7 +2255,7 @@ xfs_da_read_buf( | |||
2155 | 2255 | ||
2156 | error = xfs_trans_read_buf_map(dp->i_mount, trans, | 2256 | error = xfs_trans_read_buf_map(dp->i_mount, trans, |
2157 | dp->i_mount->m_ddev_targp, | 2257 | dp->i_mount->m_ddev_targp, |
2158 | mapp, nmap, 0, &bp); | 2258 | mapp, nmap, 0, &bp, ops); |
2159 | if (error) | 2259 | if (error) |
2160 | goto out_free; | 2260 | goto out_free; |
2161 | 2261 | ||
@@ -2211,9 +2311,10 @@ xfs_da_reada_buf( | |||
2211 | struct xfs_trans *trans, | 2311 | struct xfs_trans *trans, |
2212 | struct xfs_inode *dp, | 2312 | struct xfs_inode *dp, |
2213 | xfs_dablk_t bno, | 2313 | xfs_dablk_t bno, |
2214 | int whichfork) | 2314 | xfs_daddr_t mappedbno, |
2315 | int whichfork, | ||
2316 | const struct xfs_buf_ops *ops) | ||
2215 | { | 2317 | { |
2216 | xfs_daddr_t mappedbno = -1; | ||
2217 | struct xfs_buf_map map; | 2318 | struct xfs_buf_map map; |
2218 | struct xfs_buf_map *mapp; | 2319 | struct xfs_buf_map *mapp; |
2219 | int nmap; | 2320 | int nmap; |
@@ -2221,7 +2322,7 @@ xfs_da_reada_buf( | |||
2221 | 2322 | ||
2222 | mapp = &map; | 2323 | mapp = &map; |
2223 | nmap = 1; | 2324 | nmap = 1; |
2224 | error = xfs_dabuf_map(trans, dp, bno, -1, whichfork, | 2325 | error = xfs_dabuf_map(trans, dp, bno, mappedbno, whichfork, |
2225 | &mapp, &nmap); | 2326 | &mapp, &nmap); |
2226 | if (error) { | 2327 | if (error) { |
2227 | /* mapping a hole is not an error, but we don't continue */ | 2328 | /* mapping a hole is not an error, but we don't continue */ |
@@ -2231,7 +2332,7 @@ xfs_da_reada_buf( | |||
2231 | } | 2332 | } |
2232 | 2333 | ||
2233 | mappedbno = mapp[0].bm_bn; | 2334 | mappedbno = mapp[0].bm_bn; |
2234 | xfs_buf_readahead_map(dp->i_mount->m_ddev_targp, mapp, nmap); | 2335 | xfs_buf_readahead_map(dp->i_mount->m_ddev_targp, mapp, nmap, ops); |
2235 | 2336 | ||
2236 | out_free: | 2337 | out_free: |
2237 | if (mapp != &map) | 2338 | if (mapp != &map) |
diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h index 132adafb041e..ee5170c46ae1 100644 --- a/fs/xfs/xfs_da_btree.h +++ b/fs/xfs/xfs_da_btree.h | |||
@@ -18,7 +18,6 @@ | |||
18 | #ifndef __XFS_DA_BTREE_H__ | 18 | #ifndef __XFS_DA_BTREE_H__ |
19 | #define __XFS_DA_BTREE_H__ | 19 | #define __XFS_DA_BTREE_H__ |
20 | 20 | ||
21 | struct xfs_buf; | ||
22 | struct xfs_bmap_free; | 21 | struct xfs_bmap_free; |
23 | struct xfs_inode; | 22 | struct xfs_inode; |
24 | struct xfs_mount; | 23 | struct xfs_mount; |
@@ -214,6 +213,9 @@ int xfs_da_path_shift(xfs_da_state_t *state, xfs_da_state_path_t *path, | |||
214 | */ | 213 | */ |
215 | int xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, | 214 | int xfs_da_blk_link(xfs_da_state_t *state, xfs_da_state_blk_t *old_blk, |
216 | xfs_da_state_blk_t *new_blk); | 215 | xfs_da_state_blk_t *new_blk); |
216 | int xfs_da_node_read(struct xfs_trans *tp, struct xfs_inode *dp, | ||
217 | xfs_dablk_t bno, xfs_daddr_t mappedbno, | ||
218 | struct xfs_buf **bpp, int which_fork); | ||
217 | 219 | ||
218 | /* | 220 | /* |
219 | * Utility routines. | 221 | * Utility routines. |
@@ -226,9 +228,11 @@ int xfs_da_get_buf(struct xfs_trans *trans, struct xfs_inode *dp, | |||
226 | struct xfs_buf **bp, int whichfork); | 228 | struct xfs_buf **bp, int whichfork); |
227 | int xfs_da_read_buf(struct xfs_trans *trans, struct xfs_inode *dp, | 229 | int xfs_da_read_buf(struct xfs_trans *trans, struct xfs_inode *dp, |
228 | xfs_dablk_t bno, xfs_daddr_t mappedbno, | 230 | xfs_dablk_t bno, xfs_daddr_t mappedbno, |
229 | struct xfs_buf **bpp, int whichfork); | 231 | struct xfs_buf **bpp, int whichfork, |
232 | const struct xfs_buf_ops *ops); | ||
230 | xfs_daddr_t xfs_da_reada_buf(struct xfs_trans *trans, struct xfs_inode *dp, | 233 | xfs_daddr_t xfs_da_reada_buf(struct xfs_trans *trans, struct xfs_inode *dp, |
231 | xfs_dablk_t bno, int whichfork); | 234 | xfs_dablk_t bno, xfs_daddr_t mapped_bno, |
235 | int whichfork, const struct xfs_buf_ops *ops); | ||
232 | int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno, | 236 | int xfs_da_shrink_inode(xfs_da_args_t *args, xfs_dablk_t dead_blkno, |
233 | struct xfs_buf *dead_buf); | 237 | struct xfs_buf *dead_buf); |
234 | 238 | ||
diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index b9b8646e62db..d0e9c74d3d96 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c | |||
@@ -246,12 +246,10 @@ xfs_swap_extents( | |||
246 | goto out_unlock; | 246 | goto out_unlock; |
247 | } | 247 | } |
248 | 248 | ||
249 | if (VN_CACHED(VFS_I(tip)) != 0) { | 249 | error = -filemap_write_and_wait(VFS_I(ip)->i_mapping); |
250 | error = xfs_flushinval_pages(tip, 0, -1, | 250 | if (error) |
251 | FI_REMAPF_LOCKED); | 251 | goto out_unlock; |
252 | if (error) | 252 | truncate_pagecache_range(VFS_I(ip), 0, -1); |
253 | goto out_unlock; | ||
254 | } | ||
255 | 253 | ||
256 | /* Verify O_DIRECT for ftmp */ | 254 | /* Verify O_DIRECT for ftmp */ |
257 | if (VN_CACHED(VFS_I(tip)) != 0) { | 255 | if (VN_CACHED(VFS_I(tip)) != 0) { |
@@ -315,8 +313,7 @@ xfs_swap_extents( | |||
315 | * are safe. We don't really care if non-io related | 313 | * are safe. We don't really care if non-io related |
316 | * fields change. | 314 | * fields change. |
317 | */ | 315 | */ |
318 | 316 | truncate_pagecache_range(VFS_I(ip), 0, -1); | |
319 | xfs_tosspages(ip, 0, -1, FI_REMAPF); | ||
320 | 317 | ||
321 | tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT); | 318 | tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT); |
322 | if ((error = xfs_trans_reserve(tp, 0, | 319 | if ((error = xfs_trans_reserve(tp, 0, |
diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c index e93ca8f054f4..7536faaa61e7 100644 --- a/fs/xfs/xfs_dir2_block.c +++ b/fs/xfs/xfs_dir2_block.c | |||
@@ -56,6 +56,214 @@ xfs_dir_startup(void) | |||
56 | xfs_dir_hash_dotdot = xfs_da_hashname((unsigned char *)"..", 2); | 56 | xfs_dir_hash_dotdot = xfs_da_hashname((unsigned char *)"..", 2); |
57 | } | 57 | } |
58 | 58 | ||
59 | static void | ||
60 | xfs_dir2_block_verify( | ||
61 | struct xfs_buf *bp) | ||
62 | { | ||
63 | struct xfs_mount *mp = bp->b_target->bt_mount; | ||
64 | struct xfs_dir2_data_hdr *hdr = bp->b_addr; | ||
65 | int block_ok = 0; | ||
66 | |||
67 | block_ok = hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC); | ||
68 | block_ok = block_ok && __xfs_dir2_data_check(NULL, bp) == 0; | ||
69 | |||
70 | if (!block_ok) { | ||
71 | XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr); | ||
72 | xfs_buf_ioerror(bp, EFSCORRUPTED); | ||
73 | } | ||
74 | } | ||
75 | |||
76 | static void | ||
77 | xfs_dir2_block_read_verify( | ||
78 | struct xfs_buf *bp) | ||
79 | { | ||
80 | xfs_dir2_block_verify(bp); | ||
81 | } | ||
82 | |||
83 | static void | ||
84 | xfs_dir2_block_write_verify( | ||
85 | struct xfs_buf *bp) | ||
86 | { | ||
87 | xfs_dir2_block_verify(bp); | ||
88 | } | ||
89 | |||
90 | const struct xfs_buf_ops xfs_dir2_block_buf_ops = { | ||
91 | .verify_read = xfs_dir2_block_read_verify, | ||
92 | .verify_write = xfs_dir2_block_write_verify, | ||
93 | }; | ||
94 | |||
95 | static int | ||
96 | xfs_dir2_block_read( | ||
97 | struct xfs_trans *tp, | ||
98 | struct xfs_inode *dp, | ||
99 | struct xfs_buf **bpp) | ||
100 | { | ||
101 | struct xfs_mount *mp = dp->i_mount; | ||
102 | |||
103 | return xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, bpp, | ||
104 | XFS_DATA_FORK, &xfs_dir2_block_buf_ops); | ||
105 | } | ||
106 | |||
107 | static void | ||
108 | xfs_dir2_block_need_space( | ||
109 | struct xfs_dir2_data_hdr *hdr, | ||
110 | struct xfs_dir2_block_tail *btp, | ||
111 | struct xfs_dir2_leaf_entry *blp, | ||
112 | __be16 **tagpp, | ||
113 | struct xfs_dir2_data_unused **dupp, | ||
114 | struct xfs_dir2_data_unused **enddupp, | ||
115 | int *compact, | ||
116 | int len) | ||
117 | { | ||
118 | struct xfs_dir2_data_free *bf; | ||
119 | __be16 *tagp = NULL; | ||
120 | struct xfs_dir2_data_unused *dup = NULL; | ||
121 | struct xfs_dir2_data_unused *enddup = NULL; | ||
122 | |||
123 | *compact = 0; | ||
124 | bf = hdr->bestfree; | ||
125 | |||
126 | /* | ||
127 | * If there are stale entries we'll use one for the leaf. | ||
128 | */ | ||
129 | if (btp->stale) { | ||
130 | if (be16_to_cpu(bf[0].length) >= len) { | ||
131 | /* | ||
132 | * The biggest entry enough to avoid compaction. | ||
133 | */ | ||
134 | dup = (xfs_dir2_data_unused_t *) | ||
135 | ((char *)hdr + be16_to_cpu(bf[0].offset)); | ||
136 | goto out; | ||
137 | } | ||
138 | |||
139 | /* | ||
140 | * Will need to compact to make this work. | ||
141 | * Tag just before the first leaf entry. | ||
142 | */ | ||
143 | *compact = 1; | ||
144 | tagp = (__be16 *)blp - 1; | ||
145 | |||
146 | /* Data object just before the first leaf entry. */ | ||
147 | dup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp)); | ||
148 | |||
149 | /* | ||
150 | * If it's not free then the data will go where the | ||
151 | * leaf data starts now, if it works at all. | ||
152 | */ | ||
153 | if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { | ||
154 | if (be16_to_cpu(dup->length) + (be32_to_cpu(btp->stale) - 1) * | ||
155 | (uint)sizeof(*blp) < len) | ||
156 | dup = NULL; | ||
157 | } else if ((be32_to_cpu(btp->stale) - 1) * (uint)sizeof(*blp) < len) | ||
158 | dup = NULL; | ||
159 | else | ||
160 | dup = (xfs_dir2_data_unused_t *)blp; | ||
161 | goto out; | ||
162 | } | ||
163 | |||
164 | /* | ||
165 | * No stale entries, so just use free space. | ||
166 | * Tag just before the first leaf entry. | ||
167 | */ | ||
168 | tagp = (__be16 *)blp - 1; | ||
169 | |||
170 | /* Data object just before the first leaf entry. */ | ||
171 | enddup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp)); | ||
172 | |||
173 | /* | ||
174 | * If it's not free then can't do this add without cleaning up: | ||
175 | * the space before the first leaf entry needs to be free so it | ||
176 | * can be expanded to hold the pointer to the new entry. | ||
177 | */ | ||
178 | if (be16_to_cpu(enddup->freetag) == XFS_DIR2_DATA_FREE_TAG) { | ||
179 | /* | ||
180 | * Check out the biggest freespace and see if it's the same one. | ||
181 | */ | ||
182 | dup = (xfs_dir2_data_unused_t *) | ||
183 | ((char *)hdr + be16_to_cpu(bf[0].offset)); | ||
184 | if (dup != enddup) { | ||
185 | /* | ||
186 | * Not the same free entry, just check its length. | ||
187 | */ | ||
188 | if (be16_to_cpu(dup->length) < len) | ||
189 | dup = NULL; | ||
190 | goto out; | ||
191 | } | ||
192 | |||
193 | /* | ||
194 | * It is the biggest freespace; can it hold the leaf too? | ||
195 | */ | ||
196 | if (be16_to_cpu(dup->length) < len + (uint)sizeof(*blp)) { | ||
197 | /* | ||
198 | * It can't, so use the second-largest entry instead if that works. | ||
199 | */ | ||
200 | if (be16_to_cpu(bf[1].length) >= len) | ||
201 | dup = (xfs_dir2_data_unused_t *) | ||
202 | ((char *)hdr + be16_to_cpu(bf[1].offset)); | ||
203 | else | ||
204 | dup = NULL; | ||
205 | } | ||
206 | } | ||
207 | out: | ||
208 | *tagpp = tagp; | ||
209 | *dupp = dup; | ||
210 | *enddupp = enddup; | ||
211 | } | ||
212 | |||
213 | /* | ||
214 | * Compact the leaf entries. | ||
215 | * Leave the highest-numbered stale entry stale. | ||
216 | * XXX should be the one closest to mid but mid is not yet computed. | ||
217 | */ | ||
218 | static void | ||
219 | xfs_dir2_block_compact( | ||
220 | struct xfs_trans *tp, | ||
221 | struct xfs_buf *bp, | ||
222 | struct xfs_dir2_data_hdr *hdr, | ||
223 | struct xfs_dir2_block_tail *btp, | ||
224 | struct xfs_dir2_leaf_entry *blp, | ||
225 | int *needlog, | ||
226 | int *lfloghigh, | ||
227 | int *lfloglow) | ||
228 | { | ||
229 | int fromidx; /* source leaf index */ | ||
230 | int toidx; /* target leaf index */ | ||
231 | int needscan = 0; | ||
232 | int highstale; /* high stale index */ | ||
233 | |||
234 | fromidx = toidx = be32_to_cpu(btp->count) - 1; | ||
235 | highstale = *lfloghigh = -1; | ||
236 | for (; fromidx >= 0; fromidx--) { | ||
237 | if (blp[fromidx].address == cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) { | ||
238 | if (highstale == -1) | ||
239 | highstale = toidx; | ||
240 | else { | ||
241 | if (*lfloghigh == -1) | ||
242 | *lfloghigh = toidx; | ||
243 | continue; | ||
244 | } | ||
245 | } | ||
246 | if (fromidx < toidx) | ||
247 | blp[toidx] = blp[fromidx]; | ||
248 | toidx--; | ||
249 | } | ||
250 | *lfloglow = toidx + 1 - (be32_to_cpu(btp->stale) - 1); | ||
251 | *lfloghigh -= be32_to_cpu(btp->stale) - 1; | ||
252 | be32_add_cpu(&btp->count, -(be32_to_cpu(btp->stale) - 1)); | ||
253 | xfs_dir2_data_make_free(tp, bp, | ||
254 | (xfs_dir2_data_aoff_t)((char *)blp - (char *)hdr), | ||
255 | (xfs_dir2_data_aoff_t)((be32_to_cpu(btp->stale) - 1) * sizeof(*blp)), | ||
256 | needlog, &needscan); | ||
257 | blp += be32_to_cpu(btp->stale) - 1; | ||
258 | btp->stale = cpu_to_be32(1); | ||
259 | /* | ||
260 | * If we now need to rebuild the bestfree map, do so. | ||
261 | * This needs to happen before the next call to use_free. | ||
262 | */ | ||
263 | if (needscan) | ||
264 | xfs_dir2_data_freescan(tp->t_mountp, hdr, needlog); | ||
265 | } | ||
266 | |||
59 | /* | 267 | /* |
60 | * Add an entry to a block directory. | 268 | * Add an entry to a block directory. |
61 | */ | 269 | */ |
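The compaction loop in xfs_dir2_block_compact() above slides live leaf entries toward the top of the array while preserving only the highest-numbered stale entry. A runnable sketch of just that loop (entry values are made up; 0 stands in for XFS_DIR2_NULL_DATAPTR):

#include <stdio.h>

#define STALE 0	/* stands in for XFS_DIR2_NULL_DATAPTR */

int main(void)
{
	int blp[] = { 10, STALE, 20, STALE, 30, STALE };
	int count = 6;
	int fromidx, toidx, highstale = -1;

	/* walk from the top; keep the first (highest) stale entry seen */
	for (fromidx = toidx = count - 1; fromidx >= 0; fromidx--) {
		if (blp[fromidx] == STALE) {
			if (highstale == -1)
				highstale = toidx;	/* keep this one */
			else
				continue;		/* squeeze this one out */
		}
		if (fromidx < toidx)
			blp[toidx] = blp[fromidx];
		toidx--;
	}

	/* live entries now occupy blp[toidx + 1 .. count - 1] */
	printf("kept stale slot %d:", highstale);
	for (fromidx = toidx + 1; fromidx < count; fromidx++)
		printf(" %d", blp[fromidx]);	/* prints: 10 20 30 0 */
	printf("\n");
	return 0;
}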
@@ -63,7 +271,6 @@ int /* error */ | |||
63 | xfs_dir2_block_addname( | 271 | xfs_dir2_block_addname( |
64 | xfs_da_args_t *args) /* directory op arguments */ | 272 | xfs_da_args_t *args) /* directory op arguments */ |
65 | { | 273 | { |
66 | xfs_dir2_data_free_t *bf; /* bestfree table in block */ | ||
67 | xfs_dir2_data_hdr_t *hdr; /* block header */ | 274 | xfs_dir2_data_hdr_t *hdr; /* block header */ |
68 | xfs_dir2_leaf_entry_t *blp; /* block leaf entries */ | 275 | xfs_dir2_leaf_entry_t *blp; /* block leaf entries */ |
69 | struct xfs_buf *bp; /* buffer for block */ | 276 | struct xfs_buf *bp; /* buffer for block */ |
@@ -94,134 +301,44 @@ xfs_dir2_block_addname( | |||
94 | dp = args->dp; | 301 | dp = args->dp; |
95 | tp = args->trans; | 302 | tp = args->trans; |
96 | mp = dp->i_mount; | 303 | mp = dp->i_mount; |
97 | /* | 304 | |
98 | * Read the (one and only) directory block into dabuf bp. | 305 | /* Read the (one and only) directory block into bp. */ |
99 | */ | 306 | error = xfs_dir2_block_read(tp, dp, &bp); |
100 | if ((error = | 307 | if (error) |
101 | xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &bp, XFS_DATA_FORK))) { | ||
102 | return error; | 308 | return error; |
103 | } | 309 | |
104 | ASSERT(bp != NULL); | ||
105 | hdr = bp->b_addr; | ||
106 | /* | ||
107 | * Check the magic number, corrupted if wrong. | ||
108 | */ | ||
109 | if (unlikely(hdr->magic != cpu_to_be32(XFS_DIR2_BLOCK_MAGIC))) { | ||
110 | XFS_CORRUPTION_ERROR("xfs_dir2_block_addname", | ||
111 | XFS_ERRLEVEL_LOW, mp, hdr); | ||
112 | xfs_trans_brelse(tp, bp); | ||
113 | return XFS_ERROR(EFSCORRUPTED); | ||
114 | } | ||
115 | len = xfs_dir2_data_entsize(args->namelen); | 310 | len = xfs_dir2_data_entsize(args->namelen); |
311 | |||
116 | /* | 312 | /* |
117 | * Set up pointers to parts of the block. | 313 | * Set up pointers to parts of the block. |
118 | */ | 314 | */ |
119 | bf = hdr->bestfree; | 315 | hdr = bp->b_addr; |
120 | btp = xfs_dir2_block_tail_p(mp, hdr); | 316 | btp = xfs_dir2_block_tail_p(mp, hdr); |
121 | blp = xfs_dir2_block_leaf_p(btp); | 317 | blp = xfs_dir2_block_leaf_p(btp); |
318 | |||
122 | /* | 319 | /* |
123 | * No stale entries? Need space for entry and new leaf. | 320 | * Find out if we can reuse stale entries or whether we need extra |
124 | */ | 321 | * space for entry and new leaf. |
125 | if (!btp->stale) { | ||
126 | /* | ||
127 | * Tag just before the first leaf entry. | ||
128 | */ | ||
129 | tagp = (__be16 *)blp - 1; | ||
130 | /* | ||
131 | * Data object just before the first leaf entry. | ||
132 | */ | ||
133 | enddup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp)); | ||
134 | /* | ||
135 | * If it's not free then can't do this add without cleaning up: | ||
136 | * the space before the first leaf entry needs to be free so it | ||
137 | * can be expanded to hold the pointer to the new entry. | ||
138 | */ | ||
139 | if (be16_to_cpu(enddup->freetag) != XFS_DIR2_DATA_FREE_TAG) | ||
140 | dup = enddup = NULL; | ||
141 | /* | ||
142 | * Check out the biggest freespace and see if it's the same one. | ||
143 | */ | ||
144 | else { | ||
145 | dup = (xfs_dir2_data_unused_t *) | ||
146 | ((char *)hdr + be16_to_cpu(bf[0].offset)); | ||
147 | if (dup == enddup) { | ||
148 | /* | ||
149 | * It is the biggest freespace, is it too small | ||
150 | * to hold the new leaf too? | ||
151 | */ | ||
152 | if (be16_to_cpu(dup->length) < len + (uint)sizeof(*blp)) { | ||
153 | /* | ||
154 | * Yes, we use the second-largest | ||
155 | * entry instead if it works. | ||
156 | */ | ||
157 | if (be16_to_cpu(bf[1].length) >= len) | ||
158 | dup = (xfs_dir2_data_unused_t *) | ||
159 | ((char *)hdr + | ||
160 | be16_to_cpu(bf[1].offset)); | ||
161 | else | ||
162 | dup = NULL; | ||
163 | } | ||
164 | } else { | ||
165 | /* | ||
166 | * Not the same free entry, | ||
167 | * just check its length. | ||
168 | */ | ||
169 | if (be16_to_cpu(dup->length) < len) { | ||
170 | dup = NULL; | ||
171 | } | ||
172 | } | ||
173 | } | ||
174 | compact = 0; | ||
175 | } | ||
176 | /* | ||
177 | * If there are stale entries we'll use one for the leaf. | ||
178 | * Is the biggest entry enough to avoid compaction? | ||
179 | */ | ||
180 | else if (be16_to_cpu(bf[0].length) >= len) { | ||
181 | dup = (xfs_dir2_data_unused_t *) | ||
182 | ((char *)hdr + be16_to_cpu(bf[0].offset)); | ||
183 | compact = 0; | ||
184 | } | ||
185 | /* | ||
186 | * Will need to compact to make this work. | ||
187 | */ | 322 | */ |
188 | else { | 323 | xfs_dir2_block_need_space(hdr, btp, blp, &tagp, &dup, |
189 | /* | 324 | &enddup, &compact, len); |
190 | * Tag just before the first leaf entry. | 325 | |
191 | */ | ||
192 | tagp = (__be16 *)blp - 1; | ||
193 | /* | ||
194 | * Data object just before the first leaf entry. | ||
195 | */ | ||
196 | dup = (xfs_dir2_data_unused_t *)((char *)hdr + be16_to_cpu(*tagp)); | ||
197 | /* | ||
198 | * If it's not free then the data will go where the | ||
199 | * leaf data starts now, if it works at all. | ||
200 | */ | ||
201 | if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { | ||
202 | if (be16_to_cpu(dup->length) + (be32_to_cpu(btp->stale) - 1) * | ||
203 | (uint)sizeof(*blp) < len) | ||
204 | dup = NULL; | ||
205 | } else if ((be32_to_cpu(btp->stale) - 1) * (uint)sizeof(*blp) < len) | ||
206 | dup = NULL; | ||
207 | else | ||
208 | dup = (xfs_dir2_data_unused_t *)blp; | ||
209 | compact = 1; | ||
210 | } | ||
211 | /* | 326 | /* |
212 | * If this isn't a real add, we're done with the buffer. | 327 | * We've now done everything needed for the space check. |
213 | */ | 328 | */ |
214 | if (args->op_flags & XFS_DA_OP_JUSTCHECK) | 329 | if (args->op_flags & XFS_DA_OP_JUSTCHECK) { |
215 | xfs_trans_brelse(tp, bp); | 330 | xfs_trans_brelse(tp, bp); |
331 | if (!dup) | ||
332 | return XFS_ERROR(ENOSPC); | ||
333 | return 0; | ||
334 | } | ||
335 | |||
216 | /* | 336 | /* |
217 | * If we don't have space for the new entry & leaf ... | 337 | * If we don't have space for the new entry & leaf ... |
218 | */ | 338 | */ |
219 | if (!dup) { | 339 | if (!dup) { |
220 | /* | 340 | /* Don't have a space reservation: return no-space. */ |
221 | * Not trying to actually do anything, or don't have | 341 | if (args->total == 0) |
222 | * a space reservation: return no-space. | ||
223 | */ | ||
224 | if ((args->op_flags & XFS_DA_OP_JUSTCHECK) || args->total == 0) | ||
225 | return XFS_ERROR(ENOSPC); | 342 | return XFS_ERROR(ENOSPC); |
226 | /* | 343 | /* |
227 | * Convert to the next larger format. | 344 | * Convert to the next larger format. |
@@ -232,65 +349,24 @@ xfs_dir2_block_addname( | |||
232 | return error; | 349 | return error; |
233 | return xfs_dir2_leaf_addname(args); | 350 | return xfs_dir2_leaf_addname(args); |
234 | } | 351 | } |
235 | /* | 352 | |
236 | * Just checking, and it would work, so say so. | ||
237 | */ | ||
238 | if (args->op_flags & XFS_DA_OP_JUSTCHECK) | ||
239 | return 0; | ||
240 | needlog = needscan = 0; | 353 | needlog = needscan = 0; |
354 | |||
241 | /* | 355 | /* |
242 | * If need to compact the leaf entries, do it now. | 356 | * If need to compact the leaf entries, do it now. |
243 | * Leave the highest-numbered stale entry stale. | ||
244 | * XXX should be the one closest to mid but mid is not yet computed. | ||
245 | */ | ||
246 | if (compact) { | ||
247 | int fromidx; /* source leaf index */ | ||
248 | int toidx; /* target leaf index */ | ||
249 | |||
250 | for (fromidx = toidx = be32_to_cpu(btp->count) - 1, | ||
251 | highstale = lfloghigh = -1; | ||
252 | fromidx >= 0; | ||
253 | fromidx--) { | ||
254 | if (blp[fromidx].address == | ||
255 | cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) { | ||
256 | if (highstale == -1) | ||
257 | highstale = toidx; | ||
258 | else { | ||
259 | if (lfloghigh == -1) | ||
260 | lfloghigh = toidx; | ||
261 | continue; | ||
262 | } | ||
263 | } | ||
264 | if (fromidx < toidx) | ||
265 | blp[toidx] = blp[fromidx]; | ||
266 | toidx--; | ||
267 | } | ||
268 | lfloglow = toidx + 1 - (be32_to_cpu(btp->stale) - 1); | ||
269 | lfloghigh -= be32_to_cpu(btp->stale) - 1; | ||
270 | be32_add_cpu(&btp->count, -(be32_to_cpu(btp->stale) - 1)); | ||
271 | xfs_dir2_data_make_free(tp, bp, | ||
272 | (xfs_dir2_data_aoff_t)((char *)blp - (char *)hdr), | ||
273 | (xfs_dir2_data_aoff_t)((be32_to_cpu(btp->stale) - 1) * sizeof(*blp)), | ||
274 | &needlog, &needscan); | ||
275 | blp += be32_to_cpu(btp->stale) - 1; | ||
276 | btp->stale = cpu_to_be32(1); | ||
277 | /* | ||
278 | * If we now need to rebuild the bestfree map, do so. | ||
279 | * This needs to happen before the next call to use_free. | ||
280 | */ | ||
281 | if (needscan) { | ||
282 | xfs_dir2_data_freescan(mp, hdr, &needlog); | ||
283 | needscan = 0; | ||
284 | } | ||
285 | } | ||
286 | /* | ||
287 | * Set leaf logging boundaries to impossible state. | ||
288 | * For the no-stale case they're set explicitly. | ||
289 | */ | 357 | */ |
358 | if (compact) | ||
359 | xfs_dir2_block_compact(tp, bp, hdr, btp, blp, &needlog, | ||
360 | &lfloghigh, &lfloglow); | ||
290 | else if (btp->stale) { | 361 | else if (btp->stale) { |
362 | /* | ||
363 | * Set leaf logging boundaries to impossible state. | ||
364 | * For the no-stale case they're set explicitly. | ||
365 | */ | ||
291 | lfloglow = be32_to_cpu(btp->count); | 366 | lfloglow = be32_to_cpu(btp->count); |
292 | lfloghigh = -1; | 367 | lfloghigh = -1; |
293 | } | 368 | } |
369 | |||
294 | /* | 370 | /* |
295 | * Find the slot that's first lower than our hash value, -1 if none. | 371 | * Find the slot that's first lower than our hash value, -1 if none. |
296 | */ | 372 | */ |
@@ -450,18 +526,13 @@ xfs_dir2_block_getdents( | |||
450 | /* | 526 | /* |
451 | * If the block number in the offset is out of range, we're done. | 527 | * If the block number in the offset is out of range, we're done. |
452 | */ | 528 | */ |
453 | if (xfs_dir2_dataptr_to_db(mp, *offset) > mp->m_dirdatablk) { | 529 | if (xfs_dir2_dataptr_to_db(mp, *offset) > mp->m_dirdatablk) |
454 | return 0; | 530 | return 0; |
455 | } | 531 | |
456 | /* | 532 | error = xfs_dir2_block_read(NULL, dp, &bp); |
457 | * Can't read the block, give up, else get dabuf in bp. | ||
458 | */ | ||
459 | error = xfs_da_read_buf(NULL, dp, mp->m_dirdatablk, -1, | ||
460 | &bp, XFS_DATA_FORK); | ||
461 | if (error) | 533 | if (error) |
462 | return error; | 534 | return error; |
463 | 535 | ||
464 | ASSERT(bp != NULL); | ||
465 | /* | 536 | /* |
466 | * Extract the byte offset we start at from the seek pointer. | 537 | * Extract the byte offset we start at from the seek pointer. |
467 | * We'll skip entries before this. | 538 | * We'll skip entries before this. |
@@ -637,14 +708,11 @@ xfs_dir2_block_lookup_int( | |||
637 | dp = args->dp; | 708 | dp = args->dp; |
638 | tp = args->trans; | 709 | tp = args->trans; |
639 | mp = dp->i_mount; | 710 | mp = dp->i_mount; |
640 | /* | 711 | |
641 | * Read the buffer, return error if we can't get it. | 712 | error = xfs_dir2_block_read(tp, dp, &bp); |
642 | */ | 713 | if (error) |
643 | if ((error = | ||
644 | xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &bp, XFS_DATA_FORK))) { | ||
645 | return error; | 714 | return error; |
646 | } | 715 | |
647 | ASSERT(bp != NULL); | ||
648 | hdr = bp->b_addr; | 716 | hdr = bp->b_addr; |
649 | xfs_dir2_data_check(dp, bp); | 717 | xfs_dir2_data_check(dp, bp); |
650 | btp = xfs_dir2_block_tail_p(mp, hdr); | 718 | btp = xfs_dir2_block_tail_p(mp, hdr); |
@@ -917,10 +985,10 @@ xfs_dir2_leaf_to_block( | |||
917 | /* | 985 | /* |
918 | * Read the data block if we don't already have it, give up if it fails. | 986 | * Read the data block if we don't already have it, give up if it fails. |
919 | */ | 987 | */ |
920 | if (dbp == NULL && | 988 | if (!dbp) { |
921 | (error = xfs_da_read_buf(tp, dp, mp->m_dirdatablk, -1, &dbp, | 989 | error = xfs_dir2_data_read(tp, dp, mp->m_dirdatablk, -1, &dbp); |
922 | XFS_DATA_FORK))) { | 990 | if (error) |
923 | return error; | 991 | return error; |
924 | } | 992 | } |
925 | hdr = dbp->b_addr; | 993 | hdr = dbp->b_addr; |
926 | ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC)); | 994 | ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC)); |
@@ -944,6 +1012,7 @@ xfs_dir2_leaf_to_block( | |||
944 | /* | 1012 | /* |
945 | * Start converting it to block form. | 1013 | * Start converting it to block form. |
946 | */ | 1014 | */ |
1015 | dbp->b_ops = &xfs_dir2_block_buf_ops; | ||
947 | hdr->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC); | 1016 | hdr->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC); |
948 | needlog = 1; | 1017 | needlog = 1; |
949 | needscan = 0; | 1018 | needscan = 0; |
@@ -1073,6 +1142,7 @@ xfs_dir2_sf_to_block( | |||
1073 | kmem_free(sfp); | 1142 | kmem_free(sfp); |
1074 | return error; | 1143 | return error; |
1075 | } | 1144 | } |
1145 | bp->b_ops = &xfs_dir2_block_buf_ops; | ||
1076 | hdr = bp->b_addr; | 1146 | hdr = bp->b_addr; |
1077 | hdr->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC); | 1147 | hdr->magic = cpu_to_be32(XFS_DIR2_BLOCK_MAGIC); |
1078 | /* | 1148 | /* |
diff --git a/fs/xfs/xfs_dir2_data.c b/fs/xfs/xfs_dir2_data.c index 44ffd4d6bc91..ffcf1774152e 100644 --- a/fs/xfs/xfs_dir2_data.c +++ b/fs/xfs/xfs_dir2_data.c | |||
@@ -34,14 +34,13 @@ | |||
34 | STATIC xfs_dir2_data_free_t * | 34 | STATIC xfs_dir2_data_free_t * |
35 | xfs_dir2_data_freefind(xfs_dir2_data_hdr_t *hdr, xfs_dir2_data_unused_t *dup); | 35 | xfs_dir2_data_freefind(xfs_dir2_data_hdr_t *hdr, xfs_dir2_data_unused_t *dup); |
36 | 36 | ||
37 | #ifdef DEBUG | ||
38 | /* | 37 | /* |
39 | * Check the consistency of the data block. | 38 | * Check the consistency of the data block. |
40 | * The input can also be a block-format directory. | 39 | * The input can also be a block-format directory. |
41 | * Pop an assert if we find anything bad. | 40 | * Return 0 if the buffer is good, otherwise an error. |
42 | */ | 41 | */ |
43 | void | 42 | int |
44 | xfs_dir2_data_check( | 43 | __xfs_dir2_data_check( |
45 | struct xfs_inode *dp, /* incore inode pointer */ | 44 | struct xfs_inode *dp, /* incore inode pointer */ |
46 | struct xfs_buf *bp) /* data block's buffer */ | 45 | struct xfs_buf *bp) /* data block's buffer */ |
47 | { | 46 | { |
@@ -64,18 +63,23 @@ xfs_dir2_data_check( | |||
64 | int stale; /* count of stale leaves */ | 63 | int stale; /* count of stale leaves */ |
65 | struct xfs_name name; | 64 | struct xfs_name name; |
66 | 65 | ||
67 | mp = dp->i_mount; | 66 | mp = bp->b_target->bt_mount; |
68 | hdr = bp->b_addr; | 67 | hdr = bp->b_addr; |
69 | bf = hdr->bestfree; | 68 | bf = hdr->bestfree; |
70 | p = (char *)(hdr + 1); | 69 | p = (char *)(hdr + 1); |
71 | 70 | ||
72 | if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) { | 71 | switch (hdr->magic) { |
72 | case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC): | ||
73 | btp = xfs_dir2_block_tail_p(mp, hdr); | 73 | btp = xfs_dir2_block_tail_p(mp, hdr); |
74 | lep = xfs_dir2_block_leaf_p(btp); | 74 | lep = xfs_dir2_block_leaf_p(btp); |
75 | endp = (char *)lep; | 75 | endp = (char *)lep; |
76 | } else { | 76 | break; |
77 | ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC)); | 77 | case cpu_to_be32(XFS_DIR2_DATA_MAGIC): |
78 | endp = (char *)hdr + mp->m_dirblksize; | 78 | endp = (char *)hdr + mp->m_dirblksize; |
79 | break; | ||
80 | default: | ||
81 | XFS_ERROR_REPORT("Bad Magic", XFS_ERRLEVEL_LOW, mp); | ||
82 | return EFSCORRUPTED; | ||
79 | } | 83 | } |
80 | 84 | ||
81 | count = lastfree = freeseen = 0; | 85 | count = lastfree = freeseen = 0; |
@@ -83,19 +87,22 @@ xfs_dir2_data_check( | |||
83 | * Account for zero bestfree entries. | 87 | * Account for zero bestfree entries. |
84 | */ | 88 | */ |
85 | if (!bf[0].length) { | 89 | if (!bf[0].length) { |
86 | ASSERT(!bf[0].offset); | 90 | XFS_WANT_CORRUPTED_RETURN(!bf[0].offset); |
87 | freeseen |= 1 << 0; | 91 | freeseen |= 1 << 0; |
88 | } | 92 | } |
89 | if (!bf[1].length) { | 93 | if (!bf[1].length) { |
90 | ASSERT(!bf[1].offset); | 94 | XFS_WANT_CORRUPTED_RETURN(!bf[1].offset); |
91 | freeseen |= 1 << 1; | 95 | freeseen |= 1 << 1; |
92 | } | 96 | } |
93 | if (!bf[2].length) { | 97 | if (!bf[2].length) { |
94 | ASSERT(!bf[2].offset); | 98 | XFS_WANT_CORRUPTED_RETURN(!bf[2].offset); |
95 | freeseen |= 1 << 2; | 99 | freeseen |= 1 << 2; |
96 | } | 100 | } |
97 | ASSERT(be16_to_cpu(bf[0].length) >= be16_to_cpu(bf[1].length)); | 101 | |
98 | ASSERT(be16_to_cpu(bf[1].length) >= be16_to_cpu(bf[2].length)); | 102 | XFS_WANT_CORRUPTED_RETURN(be16_to_cpu(bf[0].length) >= |
103 | be16_to_cpu(bf[1].length)); | ||
104 | XFS_WANT_CORRUPTED_RETURN(be16_to_cpu(bf[1].length) >= | ||
105 | be16_to_cpu(bf[2].length)); | ||
99 | /* | 106 | /* |
100 | * Loop over the data/unused entries. | 107 | * Loop over the data/unused entries. |
101 | */ | 108 | */ |
@@ -107,17 +114,20 @@ xfs_dir2_data_check( | |||
107 | * doesn't need to be there. | 114 | * doesn't need to be there. |
108 | */ | 115 | */ |
109 | if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { | 116 | if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) { |
110 | ASSERT(lastfree == 0); | 117 | XFS_WANT_CORRUPTED_RETURN(lastfree == 0); |
111 | ASSERT(be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) == | 118 | XFS_WANT_CORRUPTED_RETURN( |
112 | (char *)dup - (char *)hdr); | 119 | be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup)) == |
120 | (char *)dup - (char *)hdr); | ||
113 | dfp = xfs_dir2_data_freefind(hdr, dup); | 121 | dfp = xfs_dir2_data_freefind(hdr, dup); |
114 | if (dfp) { | 122 | if (dfp) { |
115 | i = (int)(dfp - bf); | 123 | i = (int)(dfp - bf); |
116 | ASSERT((freeseen & (1 << i)) == 0); | 124 | XFS_WANT_CORRUPTED_RETURN( |
125 | (freeseen & (1 << i)) == 0); | ||
117 | freeseen |= 1 << i; | 126 | freeseen |= 1 << i; |
118 | } else { | 127 | } else { |
119 | ASSERT(be16_to_cpu(dup->length) <= | 128 | XFS_WANT_CORRUPTED_RETURN( |
120 | be16_to_cpu(bf[2].length)); | 129 | be16_to_cpu(dup->length) <= |
130 | be16_to_cpu(bf[2].length)); | ||
121 | } | 131 | } |
122 | p += be16_to_cpu(dup->length); | 132 | p += be16_to_cpu(dup->length); |
123 | lastfree = 1; | 133 | lastfree = 1; |
@@ -130,10 +140,12 @@ xfs_dir2_data_check( | |||
130 | * The linear search is crude but this is DEBUG code. | 140 | * The linear search is crude but this is DEBUG code. |
131 | */ | 141 | */ |
132 | dep = (xfs_dir2_data_entry_t *)p; | 142 | dep = (xfs_dir2_data_entry_t *)p; |
133 | ASSERT(dep->namelen != 0); | 143 | XFS_WANT_CORRUPTED_RETURN(dep->namelen != 0); |
134 | ASSERT(xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber)) == 0); | 144 | XFS_WANT_CORRUPTED_RETURN( |
135 | ASSERT(be16_to_cpu(*xfs_dir2_data_entry_tag_p(dep)) == | 145 | !xfs_dir_ino_validate(mp, be64_to_cpu(dep->inumber))); |
136 | (char *)dep - (char *)hdr); | 146 | XFS_WANT_CORRUPTED_RETURN( |
147 | be16_to_cpu(*xfs_dir2_data_entry_tag_p(dep)) == | ||
148 | (char *)dep - (char *)hdr); | ||
137 | count++; | 149 | count++; |
138 | lastfree = 0; | 150 | lastfree = 0; |
139 | if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) { | 151 | if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) { |
@@ -148,27 +160,122 @@ xfs_dir2_data_check( | |||
148 | be32_to_cpu(lep[i].hashval) == hash) | 160 | be32_to_cpu(lep[i].hashval) == hash) |
149 | break; | 161 | break; |
150 | } | 162 | } |
151 | ASSERT(i < be32_to_cpu(btp->count)); | 163 | XFS_WANT_CORRUPTED_RETURN(i < be32_to_cpu(btp->count)); |
152 | } | 164 | } |
153 | p += xfs_dir2_data_entsize(dep->namelen); | 165 | p += xfs_dir2_data_entsize(dep->namelen); |
154 | } | 166 | } |
155 | /* | 167 | /* |
156 | * Need to have seen all the entries and all the bestfree slots. | 168 | * Need to have seen all the entries and all the bestfree slots. |
157 | */ | 169 | */ |
158 | ASSERT(freeseen == 7); | 170 | XFS_WANT_CORRUPTED_RETURN(freeseen == 7); |
159 | if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) { | 171 | if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC)) { |
160 | for (i = stale = 0; i < be32_to_cpu(btp->count); i++) { | 172 | for (i = stale = 0; i < be32_to_cpu(btp->count); i++) { |
161 | if (lep[i].address == | 173 | if (lep[i].address == |
162 | cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) | 174 | cpu_to_be32(XFS_DIR2_NULL_DATAPTR)) |
163 | stale++; | 175 | stale++; |
164 | if (i > 0) | 176 | if (i > 0) |
165 | ASSERT(be32_to_cpu(lep[i].hashval) >= be32_to_cpu(lep[i - 1].hashval)); | 177 | XFS_WANT_CORRUPTED_RETURN( |
178 | be32_to_cpu(lep[i].hashval) >= | ||
179 | be32_to_cpu(lep[i - 1].hashval)); | ||
166 | } | 180 | } |
167 | ASSERT(count == be32_to_cpu(btp->count) - be32_to_cpu(btp->stale)); | 181 | XFS_WANT_CORRUPTED_RETURN(count == |
168 | ASSERT(stale == be32_to_cpu(btp->stale)); | 182 | be32_to_cpu(btp->count) - be32_to_cpu(btp->stale)); |
183 | XFS_WANT_CORRUPTED_RETURN(stale == be32_to_cpu(btp->stale)); | ||
169 | } | 184 | } |
185 | return 0; | ||
186 | } | ||
187 | |||
188 | static void | ||
189 | xfs_dir2_data_verify( | ||
190 | struct xfs_buf *bp) | ||
191 | { | ||
192 | struct xfs_mount *mp = bp->b_target->bt_mount; | ||
193 | struct xfs_dir2_data_hdr *hdr = bp->b_addr; | ||
194 | int block_ok = 0; | ||
195 | |||
196 | block_ok = hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC); | ||
197 | block_ok = block_ok && __xfs_dir2_data_check(NULL, bp) == 0; | ||
198 | |||
199 | if (!block_ok) { | ||
200 | XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr); | ||
201 | xfs_buf_ioerror(bp, EFSCORRUPTED); | ||
202 | } | ||
203 | } | ||
204 | |||
205 | /* | ||
206 | * Readahead of the first block of the directory when it is opened is completely | ||
207 | * oblivious to the format of the directory. Hence we can either get a block | ||
208 | * format buffer or a data format buffer on readahead. | ||
209 | */ | ||
210 | static void | ||
211 | xfs_dir2_data_reada_verify( | ||
212 | struct xfs_buf *bp) | ||
213 | { | ||
214 | struct xfs_mount *mp = bp->b_target->bt_mount; | ||
215 | struct xfs_dir2_data_hdr *hdr = bp->b_addr; | ||
216 | |||
217 | switch (hdr->magic) { | ||
218 | case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC): | ||
219 | bp->b_ops = &xfs_dir2_block_buf_ops; | ||
220 | bp->b_ops->verify_read(bp); | ||
221 | return; | ||
222 | case cpu_to_be32(XFS_DIR2_DATA_MAGIC): | ||
223 | xfs_dir2_data_verify(bp); | ||
224 | return; | ||
225 | default: | ||
226 | XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr); | ||
227 | xfs_buf_ioerror(bp, EFSCORRUPTED); | ||
228 | break; | ||
229 | } | ||
230 | } | ||
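A minimal caller sketch of the situation the comment above describes: directory open issues readahead before the directory's format is known, so it has to go through the format-agnostic readahead path rather than attach xfs_dir2_data_buf_ops directly. The helper name below is hypothetical; xfs_dir2_data_readahead() is the wrapper added further down in this file.

	static void
	example_dir_open_readahead(
		struct xfs_inode	*dp)
	{
		/*
		 * Block 0 of the directory data may turn out to be
		 * block-format or data-format; the reada ops dispatch
		 * on the magic number when the I/O completes.
		 */
		xfs_dir2_data_readahead(NULL, dp, 0, -1);
	}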
231 | |||
232 | static void | ||
233 | xfs_dir2_data_read_verify( | ||
234 | struct xfs_buf *bp) | ||
235 | { | ||
236 | xfs_dir2_data_verify(bp); | ||
237 | } | ||
238 | |||
239 | static void | ||
240 | xfs_dir2_data_write_verify( | ||
241 | struct xfs_buf *bp) | ||
242 | { | ||
243 | xfs_dir2_data_verify(bp); | ||
244 | } | ||
245 | |||
246 | const struct xfs_buf_ops xfs_dir2_data_buf_ops = { | ||
247 | .verify_read = xfs_dir2_data_read_verify, | ||
248 | .verify_write = xfs_dir2_data_write_verify, | ||
249 | }; | ||
250 | |||
251 | static const struct xfs_buf_ops xfs_dir2_data_reada_buf_ops = { | ||
252 | .verify_read = xfs_dir2_data_reada_verify, | ||
253 | .verify_write = xfs_dir2_data_write_verify, | ||
254 | }; | ||
255 | |||
256 | |||
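The ops tables above assume the callback structure introduced earlier in this series; a sketch of its shape follows (it is not part of this hunk). xfs_da_read_buf() stores the ops on the buffer and verify_read runs when the read I/O completes, so a corrupt block is marked EFSCORRUPTED before any caller dereferences its contents.

	struct xfs_buf_ops {
		void	(*verify_read)(struct xfs_buf *bp);
		void	(*verify_write)(struct xfs_buf *bp);
	};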
257 | int | ||
258 | xfs_dir2_data_read( | ||
259 | struct xfs_trans *tp, | ||
260 | struct xfs_inode *dp, | ||
261 | xfs_dablk_t bno, | ||
262 | xfs_daddr_t mapped_bno, | ||
263 | struct xfs_buf **bpp) | ||
264 | { | ||
265 | return xfs_da_read_buf(tp, dp, bno, mapped_bno, bpp, | ||
266 | XFS_DATA_FORK, &xfs_dir2_data_buf_ops); | ||
267 | } | ||
268 | |||
269 | int | ||
270 | xfs_dir2_data_readahead( | ||
271 | struct xfs_trans *tp, | ||
272 | struct xfs_inode *dp, | ||
273 | xfs_dablk_t bno, | ||
274 | xfs_daddr_t mapped_bno) | ||
275 | { | ||
276 | return xfs_da_reada_buf(tp, dp, bno, mapped_bno, | ||
277 | XFS_DATA_FORK, &xfs_dir2_data_reada_buf_ops); | ||
170 | } | 278 | } |
171 | #endif | ||
172 | 279 | ||
173 | /* | 280 | /* |
174 | * Given a data block and an unused entry from that block, | 281 | * Given a data block and an unused entry from that block, |
@@ -409,10 +516,9 @@ xfs_dir2_data_init( | |||
409 | */ | 516 | */ |
410 | error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, blkno), -1, &bp, | 517 | error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, blkno), -1, &bp, |
411 | XFS_DATA_FORK); | 518 | XFS_DATA_FORK); |
412 | if (error) { | 519 | if (error) |
413 | return error; | 520 | return error; |
414 | } | 521 | bp->b_ops = &xfs_dir2_data_buf_ops; |
415 | ASSERT(bp != NULL); | ||
416 | 522 | ||
417 | /* | 523 | /* |
418 | * Initialize the header. | 524 | * Initialize the header. |
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index 0b296253bd01..60cd2fa4e047 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c | |||
@@ -48,6 +48,83 @@ static void xfs_dir2_leaf_log_bests(struct xfs_trans *tp, struct xfs_buf *bp, | |||
48 | int first, int last); | 48 | int first, int last); |
49 | static void xfs_dir2_leaf_log_tail(struct xfs_trans *tp, struct xfs_buf *bp); | 49 | static void xfs_dir2_leaf_log_tail(struct xfs_trans *tp, struct xfs_buf *bp); |
50 | 50 | ||
51 | static void | ||
52 | xfs_dir2_leaf_verify( | ||
53 | struct xfs_buf *bp, | ||
54 | __be16 magic) | ||
55 | { | ||
56 | struct xfs_mount *mp = bp->b_target->bt_mount; | ||
57 | struct xfs_dir2_leaf_hdr *hdr = bp->b_addr; | ||
58 | int block_ok = 0; | ||
59 | |||
60 | block_ok = hdr->info.magic == magic; | ||
61 | if (!block_ok) { | ||
62 | XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, hdr); | ||
63 | xfs_buf_ioerror(bp, EFSCORRUPTED); | ||
64 | } | ||
65 | } | ||
66 | |||
67 | static void | ||
68 | xfs_dir2_leaf1_read_verify( | ||
69 | struct xfs_buf *bp) | ||
70 | { | ||
71 | xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAF1_MAGIC)); | ||
72 | } | ||
73 | |||
74 | static void | ||
75 | xfs_dir2_leaf1_write_verify( | ||
76 | struct xfs_buf *bp) | ||
77 | { | ||
78 | xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAF1_MAGIC)); | ||
79 | } | ||
80 | |||
81 | void | ||
82 | xfs_dir2_leafn_read_verify( | ||
83 | struct xfs_buf *bp) | ||
84 | { | ||
85 | xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); | ||
86 | } | ||
87 | |||
88 | void | ||
89 | xfs_dir2_leafn_write_verify( | ||
90 | struct xfs_buf *bp) | ||
91 | { | ||
92 | xfs_dir2_leaf_verify(bp, cpu_to_be16(XFS_DIR2_LEAFN_MAGIC)); | ||
93 | } | ||
94 | |||
95 | static const struct xfs_buf_ops xfs_dir2_leaf1_buf_ops = { | ||
96 | .verify_read = xfs_dir2_leaf1_read_verify, | ||
97 | .verify_write = xfs_dir2_leaf1_write_verify, | ||
98 | }; | ||
99 | |||
100 | const struct xfs_buf_ops xfs_dir2_leafn_buf_ops = { | ||
101 | .verify_read = xfs_dir2_leafn_read_verify, | ||
102 | .verify_write = xfs_dir2_leafn_write_verify, | ||
103 | }; | ||
104 | |||
105 | static int | ||
106 | xfs_dir2_leaf_read( | ||
107 | struct xfs_trans *tp, | ||
108 | struct xfs_inode *dp, | ||
109 | xfs_dablk_t fbno, | ||
110 | xfs_daddr_t mappedbno, | ||
111 | struct xfs_buf **bpp) | ||
112 | { | ||
113 | return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, | ||
114 | XFS_DATA_FORK, &xfs_dir2_leaf1_buf_ops); | ||
115 | } | ||
116 | |||
117 | int | ||
118 | xfs_dir2_leafn_read( | ||
119 | struct xfs_trans *tp, | ||
120 | struct xfs_inode *dp, | ||
121 | xfs_dablk_t fbno, | ||
122 | xfs_daddr_t mappedbno, | ||
123 | struct xfs_buf **bpp) | ||
124 | { | ||
125 | return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, | ||
126 | XFS_DATA_FORK, &xfs_dir2_leafn_buf_ops); | ||
127 | } | ||
51 | 128 | ||
52 | /* | 129 | /* |
53 | * Convert a block form directory to a leaf form directory. | 130 | * Convert a block form directory to a leaf form directory. |
@@ -125,6 +202,7 @@ xfs_dir2_block_to_leaf( | |||
125 | /* | 202 | /* |
126 | * Fix up the block header, make it a data block. | 203 | * Fix up the block header, make it a data block. |
127 | */ | 204 | */ |
205 | dbp->b_ops = &xfs_dir2_data_buf_ops; | ||
128 | hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC); | 206 | hdr->magic = cpu_to_be32(XFS_DIR2_DATA_MAGIC); |
129 | if (needscan) | 207 | if (needscan) |
130 | xfs_dir2_data_freescan(mp, hdr, &needlog); | 208 | xfs_dir2_data_freescan(mp, hdr, &needlog); |
@@ -311,15 +389,11 @@ xfs_dir2_leaf_addname( | |||
311 | dp = args->dp; | 389 | dp = args->dp; |
312 | tp = args->trans; | 390 | tp = args->trans; |
313 | mp = dp->i_mount; | 391 | mp = dp->i_mount; |
314 | /* | 392 | |
315 | * Read the leaf block. | 393 | error = xfs_dir2_leaf_read(tp, dp, mp->m_dirleafblk, -1, &lbp); |
316 | */ | 394 | if (error) |
317 | error = xfs_da_read_buf(tp, dp, mp->m_dirleafblk, -1, &lbp, | ||
318 | XFS_DATA_FORK); | ||
319 | if (error) { | ||
320 | return error; | 395 | return error; |
321 | } | 396 | |
322 | ASSERT(lbp != NULL); | ||
323 | /* | 397 | /* |
324 | * Look up the entry by hash value and name. | 398 | * Look up the entry by hash value and name. |
325 | * We know it's not there, our caller has already done a lookup. | 399 | * We know it's not there, our caller has already done a lookup. |
@@ -494,22 +568,21 @@ xfs_dir2_leaf_addname( | |||
494 | hdr = dbp->b_addr; | 568 | hdr = dbp->b_addr; |
495 | bestsp[use_block] = hdr->bestfree[0].length; | 569 | bestsp[use_block] = hdr->bestfree[0].length; |
496 | grown = 1; | 570 | grown = 1; |
497 | } | 571 | } else { |
498 | /* | 572 | /* |
499 | * Already had space in some data block. | 573 | * Already had space in some data block. |
500 | * Just read that one in. | 574 | * Just read that one in. |
501 | */ | 575 | */ |
502 | else { | 576 | error = xfs_dir2_data_read(tp, dp, |
503 | if ((error = | 577 | xfs_dir2_db_to_da(mp, use_block), |
504 | xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, use_block), | 578 | -1, &dbp); |
505 | -1, &dbp, XFS_DATA_FORK))) { | 579 | if (error) { |
506 | xfs_trans_brelse(tp, lbp); | 580 | xfs_trans_brelse(tp, lbp); |
507 | return error; | 581 | return error; |
508 | } | 582 | } |
509 | hdr = dbp->b_addr; | 583 | hdr = dbp->b_addr; |
510 | grown = 0; | 584 | grown = 0; |
511 | } | 585 | } |
512 | xfs_dir2_data_check(dp, dbp); | ||
513 | /* | 586 | /* |
514 | * Point to the biggest freespace in our data block. | 587 | * Point to the biggest freespace in our data block. |
515 | */ | 588 | */ |
@@ -892,10 +965,9 @@ xfs_dir2_leaf_readbuf( | |||
892 | * Read the directory block starting at the first mapping. | 965 | * Read the directory block starting at the first mapping. |
893 | */ | 966 | */ |
894 | mip->curdb = xfs_dir2_da_to_db(mp, map->br_startoff); | 967 | mip->curdb = xfs_dir2_da_to_db(mp, map->br_startoff); |
895 | error = xfs_da_read_buf(NULL, dp, map->br_startoff, | 968 | error = xfs_dir2_data_read(NULL, dp, map->br_startoff, |
896 | map->br_blockcount >= mp->m_dirblkfsbs ? | 969 | map->br_blockcount >= mp->m_dirblkfsbs ? |
897 | XFS_FSB_TO_DADDR(mp, map->br_startblock) : -1, | 970 | XFS_FSB_TO_DADDR(mp, map->br_startblock) : -1, &bp); |
898 | &bp, XFS_DATA_FORK); | ||
899 | 971 | ||
900 | /* | 972 | /* |
901 | * Should just skip over the data block instead of giving up. | 973 | * Should just skip over the data block instead of giving up. |
@@ -922,11 +994,11 @@ xfs_dir2_leaf_readbuf( | |||
922 | */ | 994 | */ |
923 | if (i > mip->ra_current && | 995 | if (i > mip->ra_current && |
924 | map[mip->ra_index].br_blockcount >= mp->m_dirblkfsbs) { | 996 | map[mip->ra_index].br_blockcount >= mp->m_dirblkfsbs) { |
925 | xfs_buf_readahead(mp->m_ddev_targp, | 997 | xfs_dir2_data_readahead(NULL, dp, |
998 | map[mip->ra_index].br_startoff + mip->ra_offset, | ||
926 | XFS_FSB_TO_DADDR(mp, | 999 | XFS_FSB_TO_DADDR(mp, |
927 | map[mip->ra_index].br_startblock + | 1000 | map[mip->ra_index].br_startblock + |
928 | mip->ra_offset), | 1001 | mip->ra_offset)); |
929 | (int)BTOBB(mp->m_dirblksize)); | ||
930 | mip->ra_current = i; | 1002 | mip->ra_current = i; |
931 | } | 1003 | } |
932 | 1004 | ||
@@ -935,10 +1007,9 @@ xfs_dir2_leaf_readbuf( | |||
935 | * use our mapping, but this is a very rare case. | 1007 | * use our mapping, but this is a very rare case. |
936 | */ | 1008 | */ |
937 | else if (i > mip->ra_current) { | 1009 | else if (i > mip->ra_current) { |
938 | xfs_da_reada_buf(NULL, dp, | 1010 | xfs_dir2_data_readahead(NULL, dp, |
939 | map[mip->ra_index].br_startoff + | 1011 | map[mip->ra_index].br_startoff + |
940 | mip->ra_offset, | 1012 | mip->ra_offset, -1); |
941 | XFS_DATA_FORK); | ||
942 | mip->ra_current = i; | 1013 | mip->ra_current = i; |
943 | } | 1014 | } |
944 | 1015 | ||
@@ -1177,15 +1248,14 @@ xfs_dir2_leaf_init( | |||
1177 | * Get the buffer for the block. | 1248 | * Get the buffer for the block. |
1178 | */ | 1249 | */ |
1179 | error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, bno), -1, &bp, | 1250 | error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, bno), -1, &bp, |
1180 | XFS_DATA_FORK); | 1251 | XFS_DATA_FORK); |
1181 | if (error) { | 1252 | if (error) |
1182 | return error; | 1253 | return error; |
1183 | } | 1254 | |
1184 | ASSERT(bp != NULL); | ||
1185 | leaf = bp->b_addr; | ||
1186 | /* | 1255 | /* |
1187 | * Initialize the header. | 1256 | * Initialize the header. |
1188 | */ | 1257 | */ |
1258 | leaf = bp->b_addr; | ||
1189 | leaf->hdr.info.magic = cpu_to_be16(magic); | 1259 | leaf->hdr.info.magic = cpu_to_be16(magic); |
1190 | leaf->hdr.info.forw = 0; | 1260 | leaf->hdr.info.forw = 0; |
1191 | leaf->hdr.info.back = 0; | 1261 | leaf->hdr.info.back = 0; |
@@ -1198,10 +1268,12 @@ xfs_dir2_leaf_init( | |||
1198 | * the block. | 1268 | * the block. |
1199 | */ | 1269 | */ |
1200 | if (magic == XFS_DIR2_LEAF1_MAGIC) { | 1270 | if (magic == XFS_DIR2_LEAF1_MAGIC) { |
1271 | bp->b_ops = &xfs_dir2_leaf1_buf_ops; | ||
1201 | ltp = xfs_dir2_leaf_tail_p(mp, leaf); | 1272 | ltp = xfs_dir2_leaf_tail_p(mp, leaf); |
1202 | ltp->bestcount = 0; | 1273 | ltp->bestcount = 0; |
1203 | xfs_dir2_leaf_log_tail(tp, bp); | 1274 | xfs_dir2_leaf_log_tail(tp, bp); |
1204 | } | 1275 | } else |
1276 | bp->b_ops = &xfs_dir2_leafn_buf_ops; | ||
1205 | *bpp = bp; | 1277 | *bpp = bp; |
1206 | return 0; | 1278 | return 0; |
1207 | } | 1279 | } |
@@ -1372,13 +1444,11 @@ xfs_dir2_leaf_lookup_int( | |||
1372 | dp = args->dp; | 1444 | dp = args->dp; |
1373 | tp = args->trans; | 1445 | tp = args->trans; |
1374 | mp = dp->i_mount; | 1446 | mp = dp->i_mount; |
1375 | /* | 1447 | |
1376 | * Read the leaf block into the buffer. | 1448 | error = xfs_dir2_leaf_read(tp, dp, mp->m_dirleafblk, -1, &lbp); |
1377 | */ | ||
1378 | error = xfs_da_read_buf(tp, dp, mp->m_dirleafblk, -1, &lbp, | ||
1379 | XFS_DATA_FORK); | ||
1380 | if (error) | 1449 | if (error) |
1381 | return error; | 1450 | return error; |
1451 | |||
1382 | *lbpp = lbp; | 1452 | *lbpp = lbp; |
1383 | leaf = lbp->b_addr; | 1453 | leaf = lbp->b_addr; |
1384 | xfs_dir2_leaf_check(dp, lbp); | 1454 | xfs_dir2_leaf_check(dp, lbp); |
@@ -1409,14 +1479,13 @@ xfs_dir2_leaf_lookup_int( | |||
1409 | if (newdb != curdb) { | 1479 | if (newdb != curdb) { |
1410 | if (dbp) | 1480 | if (dbp) |
1411 | xfs_trans_brelse(tp, dbp); | 1481 | xfs_trans_brelse(tp, dbp); |
1412 | error = xfs_da_read_buf(tp, dp, | 1482 | error = xfs_dir2_data_read(tp, dp, |
1413 | xfs_dir2_db_to_da(mp, newdb), | 1483 | xfs_dir2_db_to_da(mp, newdb), |
1414 | -1, &dbp, XFS_DATA_FORK); | 1484 | -1, &dbp); |
1415 | if (error) { | 1485 | if (error) { |
1416 | xfs_trans_brelse(tp, lbp); | 1486 | xfs_trans_brelse(tp, lbp); |
1417 | return error; | 1487 | return error; |
1418 | } | 1488 | } |
1419 | xfs_dir2_data_check(dp, dbp); | ||
1420 | curdb = newdb; | 1489 | curdb = newdb; |
1421 | } | 1490 | } |
1422 | /* | 1491 | /* |
@@ -1451,9 +1520,9 @@ xfs_dir2_leaf_lookup_int( | |||
1451 | ASSERT(cidb != -1); | 1520 | ASSERT(cidb != -1); |
1452 | if (cidb != curdb) { | 1521 | if (cidb != curdb) { |
1453 | xfs_trans_brelse(tp, dbp); | 1522 | xfs_trans_brelse(tp, dbp); |
1454 | error = xfs_da_read_buf(tp, dp, | 1523 | error = xfs_dir2_data_read(tp, dp, |
1455 | xfs_dir2_db_to_da(mp, cidb), | 1524 | xfs_dir2_db_to_da(mp, cidb), |
1456 | -1, &dbp, XFS_DATA_FORK); | 1525 | -1, &dbp); |
1457 | if (error) { | 1526 | if (error) { |
1458 | xfs_trans_brelse(tp, lbp); | 1527 | xfs_trans_brelse(tp, lbp); |
1459 | return error; | 1528 | return error; |
@@ -1738,10 +1807,9 @@ xfs_dir2_leaf_trim_data( | |||
1738 | /* | 1807 | /* |
1739 | * Read the offending data block. We need its buffer. | 1808 | * Read the offending data block. We need its buffer. |
1740 | */ | 1809 | */ |
1741 | if ((error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, db), -1, &dbp, | 1810 | error = xfs_dir2_data_read(tp, dp, xfs_dir2_db_to_da(mp, db), -1, &dbp); |
1742 | XFS_DATA_FORK))) { | 1811 | if (error) |
1743 | return error; | 1812 | return error; |
1744 | } | ||
1745 | 1813 | ||
1746 | leaf = lbp->b_addr; | 1814 | leaf = lbp->b_addr; |
1747 | ltp = xfs_dir2_leaf_tail_p(mp, leaf); | 1815 | ltp = xfs_dir2_leaf_tail_p(mp, leaf); |
@@ -1864,10 +1932,9 @@ xfs_dir2_node_to_leaf( | |||
1864 | /* | 1932 | /* |
1865 | * Read the freespace block. | 1933 | * Read the freespace block. |
1866 | */ | 1934 | */ |
1867 | if ((error = xfs_da_read_buf(tp, dp, mp->m_dirfreeblk, -1, &fbp, | 1935 | error = xfs_dir2_free_read(tp, dp, mp->m_dirfreeblk, &fbp); |
1868 | XFS_DATA_FORK))) { | 1936 | if (error) |
1869 | return error; | 1937 | return error; |
1870 | } | ||
1871 | free = fbp->b_addr; | 1938 | free = fbp->b_addr; |
1872 | ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); | 1939 | ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); |
1873 | ASSERT(!free->hdr.firstdb); | 1940 | ASSERT(!free->hdr.firstdb); |
@@ -1890,7 +1957,10 @@ xfs_dir2_node_to_leaf( | |||
1890 | xfs_dir2_leaf_compact(args, lbp); | 1957 | xfs_dir2_leaf_compact(args, lbp); |
1891 | else | 1958 | else |
1892 | xfs_dir2_leaf_log_header(tp, lbp); | 1959 | xfs_dir2_leaf_log_header(tp, lbp); |
1960 | |||
1961 | lbp->b_ops = &xfs_dir2_leaf1_buf_ops; | ||
1893 | leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAF1_MAGIC); | 1962 | leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAF1_MAGIC); |
1963 | |||
1894 | /* | 1964 | /* |
1895 | * Set up the leaf tail from the freespace block. | 1965 | * Set up the leaf tail from the freespace block. |
1896 | */ | 1966 | */ |
diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c index 6c7052406605..5980f9b7fa9b 100644 --- a/fs/xfs/xfs_dir2_node.c +++ b/fs/xfs/xfs_dir2_node.c | |||
@@ -55,6 +55,74 @@ static int xfs_dir2_leafn_remove(xfs_da_args_t *args, struct xfs_buf *bp, | |||
55 | static int xfs_dir2_node_addname_int(xfs_da_args_t *args, | 55 | static int xfs_dir2_node_addname_int(xfs_da_args_t *args, |
56 | xfs_da_state_blk_t *fblk); | 56 | xfs_da_state_blk_t *fblk); |
57 | 57 | ||
58 | static void | ||
59 | xfs_dir2_free_verify( | ||
60 | struct xfs_buf *bp) | ||
61 | { | ||
62 | struct xfs_mount *mp = bp->b_target->bt_mount; | ||
63 | struct xfs_dir2_free_hdr *hdr = bp->b_addr; | ||
64 | int block_ok = 0; | ||
65 | |||
66 | block_ok = hdr->magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC); | ||
67 | if (!block_ok) { | ||
68 | XFS_CORRUPTION_ERROR("xfs_dir2_free_verify magic", | ||
69 | XFS_ERRLEVEL_LOW, mp, hdr); | ||
70 | xfs_buf_ioerror(bp, EFSCORRUPTED); | ||
71 | } | ||
72 | } | ||
73 | |||
74 | static void | ||
75 | xfs_dir2_free_read_verify( | ||
76 | struct xfs_buf *bp) | ||
77 | { | ||
78 | xfs_dir2_free_verify(bp); | ||
79 | } | ||
80 | |||
81 | static void | ||
82 | xfs_dir2_free_write_verify( | ||
83 | struct xfs_buf *bp) | ||
84 | { | ||
85 | xfs_dir2_free_verify(bp); | ||
86 | } | ||
87 | |||
88 | static const struct xfs_buf_ops xfs_dir2_free_buf_ops = { | ||
89 | .verify_read = xfs_dir2_free_read_verify, | ||
90 | .verify_write = xfs_dir2_free_write_verify, | ||
91 | }; | ||
92 | |||
93 | |||
94 | static int | ||
95 | __xfs_dir2_free_read( | ||
96 | struct xfs_trans *tp, | ||
97 | struct xfs_inode *dp, | ||
98 | xfs_dablk_t fbno, | ||
99 | xfs_daddr_t mappedbno, | ||
100 | struct xfs_buf **bpp) | ||
101 | { | ||
102 | return xfs_da_read_buf(tp, dp, fbno, mappedbno, bpp, | ||
103 | XFS_DATA_FORK, &xfs_dir2_free_buf_ops); | ||
104 | } | ||
105 | |||
106 | int | ||
107 | xfs_dir2_free_read( | ||
108 | struct xfs_trans *tp, | ||
109 | struct xfs_inode *dp, | ||
110 | xfs_dablk_t fbno, | ||
111 | struct xfs_buf **bpp) | ||
112 | { | ||
113 | return __xfs_dir2_free_read(tp, dp, fbno, -1, bpp); | ||
114 | } | ||
115 | |||
116 | static int | ||
117 | xfs_dir2_free_try_read( | ||
118 | struct xfs_trans *tp, | ||
119 | struct xfs_inode *dp, | ||
120 | xfs_dablk_t fbno, | ||
121 | struct xfs_buf **bpp) | ||
122 | { | ||
123 | return __xfs_dir2_free_read(tp, dp, fbno, -2, bpp); | ||
124 | } | ||
125 | |||
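A sketch of the mappedbno convention the two wrappers above rely on, inferred from the callers in this patch: -1 asks xfs_da_read_buf() to map the block and treat a missing mapping as an error, while -2 tolerates a hole in the freespace file and returns success with a NULL buffer. The function name below is hypothetical.

	STATIC int
	example_scan_free_block(
		struct xfs_trans	*tp,
		struct xfs_inode	*dp,
		xfs_dablk_t		fbno)
	{
		struct xfs_buf		*bp;
		int			error;

		error = xfs_dir2_free_try_read(tp, dp, fbno, &bp);
		if (error)
			return error;	/* I/O or verifier failure */
		if (!bp)
			return 0;	/* hole: nothing to look at */
		/* inspect bp->b_addr here, then release the buffer */
		xfs_trans_brelse(tp, bp);
		return 0;
	}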
58 | /* | 126 | /* |
59 | * Log entries from a freespace block. | 127 | * Log entries from a freespace block. |
60 | */ | 128 | */ |
@@ -131,11 +199,12 @@ xfs_dir2_leaf_to_node( | |||
131 | /* | 199 | /* |
132 | * Get the buffer for the new freespace block. | 200 | * Get the buffer for the new freespace block. |
133 | */ | 201 | */ |
134 | if ((error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, fdb), -1, &fbp, | 202 | error = xfs_da_get_buf(tp, dp, xfs_dir2_db_to_da(mp, fdb), -1, &fbp, |
135 | XFS_DATA_FORK))) { | 203 | XFS_DATA_FORK); |
204 | if (error) | ||
136 | return error; | 205 | return error; |
137 | } | 206 | fbp->b_ops = &xfs_dir2_free_buf_ops; |
138 | ASSERT(fbp != NULL); | 207 | |
139 | free = fbp->b_addr; | 208 | free = fbp->b_addr; |
140 | leaf = lbp->b_addr; | 209 | leaf = lbp->b_addr; |
141 | ltp = xfs_dir2_leaf_tail_p(mp, leaf); | 210 | ltp = xfs_dir2_leaf_tail_p(mp, leaf); |
@@ -157,7 +226,10 @@ xfs_dir2_leaf_to_node( | |||
157 | *to = cpu_to_be16(off); | 226 | *to = cpu_to_be16(off); |
158 | } | 227 | } |
159 | free->hdr.nused = cpu_to_be32(n); | 228 | free->hdr.nused = cpu_to_be32(n); |
229 | |||
230 | lbp->b_ops = &xfs_dir2_leafn_buf_ops; | ||
160 | leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAFN_MAGIC); | 231 | leaf->hdr.info.magic = cpu_to_be16(XFS_DIR2_LEAFN_MAGIC); |
232 | |||
161 | /* | 233 | /* |
162 | * Log everything. | 234 | * Log everything. |
163 | */ | 235 | */ |
@@ -394,12 +466,10 @@ xfs_dir2_leafn_lookup_for_addname( | |||
394 | */ | 466 | */ |
395 | if (curbp) | 467 | if (curbp) |
396 | xfs_trans_brelse(tp, curbp); | 468 | xfs_trans_brelse(tp, curbp); |
397 | /* | 469 | |
398 | * Read the free block. | 470 | error = xfs_dir2_free_read(tp, dp, |
399 | */ | ||
400 | error = xfs_da_read_buf(tp, dp, | ||
401 | xfs_dir2_db_to_da(mp, newfdb), | 471 | xfs_dir2_db_to_da(mp, newfdb), |
402 | -1, &curbp, XFS_DATA_FORK); | 472 | &curbp); |
403 | if (error) | 473 | if (error) |
404 | return error; | 474 | return error; |
405 | free = curbp->b_addr; | 475 | free = curbp->b_addr; |
@@ -534,9 +604,9 @@ xfs_dir2_leafn_lookup_for_entry( | |||
534 | ASSERT(state->extravalid); | 604 | ASSERT(state->extravalid); |
535 | curbp = state->extrablk.bp; | 605 | curbp = state->extrablk.bp; |
536 | } else { | 606 | } else { |
537 | error = xfs_da_read_buf(tp, dp, | 607 | error = xfs_dir2_data_read(tp, dp, |
538 | xfs_dir2_db_to_da(mp, newdb), | 608 | xfs_dir2_db_to_da(mp, newdb), |
539 | -1, &curbp, XFS_DATA_FORK); | 609 | -1, &curbp); |
540 | if (error) | 610 | if (error) |
541 | return error; | 611 | return error; |
542 | } | 612 | } |
@@ -568,6 +638,7 @@ xfs_dir2_leafn_lookup_for_entry( | |||
568 | state->extrablk.index = (int)((char *)dep - | 638 | state->extrablk.index = (int)((char *)dep - |
569 | (char *)curbp->b_addr); | 639 | (char *)curbp->b_addr); |
570 | state->extrablk.magic = XFS_DIR2_DATA_MAGIC; | 640 | state->extrablk.magic = XFS_DIR2_DATA_MAGIC; |
641 | curbp->b_ops = &xfs_dir2_data_buf_ops; | ||
571 | if (cmp == XFS_CMP_EXACT) | 642 | if (cmp == XFS_CMP_EXACT) |
572 | return XFS_ERROR(EEXIST); | 643 | return XFS_ERROR(EEXIST); |
573 | } | 644 | } |
@@ -582,6 +653,7 @@ xfs_dir2_leafn_lookup_for_entry( | |||
582 | state->extrablk.index = -1; | 653 | state->extrablk.index = -1; |
583 | state->extrablk.blkno = curdb; | 654 | state->extrablk.blkno = curdb; |
584 | state->extrablk.magic = XFS_DIR2_DATA_MAGIC; | 655 | state->extrablk.magic = XFS_DIR2_DATA_MAGIC; |
656 | curbp->b_ops = &xfs_dir2_data_buf_ops; | ||
585 | } else { | 657 | } else { |
586 | /* If the curbp is not the CI match block, drop it */ | 658 | /* If the curbp is not the CI match block, drop it */ |
587 | if (state->extrablk.bp != curbp) | 659 | if (state->extrablk.bp != curbp) |
@@ -825,6 +897,77 @@ xfs_dir2_leafn_rebalance( | |||
825 | } | 897 | } |
826 | } | 898 | } |
827 | 899 | ||
900 | static int | ||
901 | xfs_dir2_data_block_free( | ||
902 | xfs_da_args_t *args, | ||
903 | struct xfs_dir2_data_hdr *hdr, | ||
904 | struct xfs_dir2_free *free, | ||
905 | xfs_dir2_db_t fdb, | ||
906 | int findex, | ||
907 | struct xfs_buf *fbp, | ||
908 | int longest) | ||
909 | { | ||
910 | struct xfs_trans *tp = args->trans; | ||
911 | int logfree = 0; | ||
912 | |||
913 | if (!hdr) { | ||
914 | /* One less used entry in the free table. */ | ||
915 | be32_add_cpu(&free->hdr.nused, -1); | ||
916 | xfs_dir2_free_log_header(tp, fbp); | ||
917 | |||
918 | /* | ||
919 | * If this was the last entry in the table, we can trim the | ||
920 | * table size back. There might be other entries at the end | ||
921 | * referring to non-existent data blocks, get those too. | ||
922 | */ | ||
923 | if (findex == be32_to_cpu(free->hdr.nvalid) - 1) { | ||
924 | int i; /* free entry index */ | ||
925 | |||
926 | for (i = findex - 1; i >= 0; i--) { | ||
927 | if (free->bests[i] != cpu_to_be16(NULLDATAOFF)) | ||
928 | break; | ||
929 | } | ||
930 | free->hdr.nvalid = cpu_to_be32(i + 1); | ||
931 | logfree = 0; | ||
932 | } else { | ||
933 | /* Not the last entry, just punch it out. */ | ||
934 | free->bests[findex] = cpu_to_be16(NULLDATAOFF); | ||
935 | logfree = 1; | ||
936 | } | ||
937 | /* | ||
938 | * If there are no useful entries left in the block, | ||
939 | * get rid of the block if we can. | ||
940 | */ | ||
941 | if (!free->hdr.nused) { | ||
942 | int error; | ||
943 | |||
944 | error = xfs_dir2_shrink_inode(args, fdb, fbp); | ||
945 | if (error == 0) { | ||
946 | fbp = NULL; | ||
947 | logfree = 0; | ||
948 | } else if (error != ENOSPC || args->total != 0) | ||
949 | return error; | ||
950 | /* | ||
951 | * It's possible to get ENOSPC if there is no | ||
952 | * space reservation. In this case someone | ||
953 | * else will eventually get rid of this block. | ||
954 | */ | ||
955 | } | ||
956 | } else { | ||
957 | /* | ||
958 | * Data block is not empty, just set the free entry to the new | ||
959 | * value. | ||
960 | */ | ||
961 | free->bests[findex] = cpu_to_be16(longest); | ||
962 | logfree = 1; | ||
963 | } | ||
964 | |||
965 | /* Log the free entry that changed, unless we got rid of it. */ | ||
966 | if (logfree) | ||
967 | xfs_dir2_free_log_bests(tp, fbp, findex, findex); | ||
968 | return 0; | ||
969 | } | ||
970 | |||
828 | /* | 971 | /* |
829 | * Remove an entry from a node directory. | 972 | * Remove an entry from a node directory. |
830 | * This removes the leaf entry and the data entry, | 973 | * This removes the leaf entry and the data entry, |
@@ -908,17 +1051,16 @@ xfs_dir2_leafn_remove( | |||
908 | xfs_dir2_db_t fdb; /* freeblock block number */ | 1051 | xfs_dir2_db_t fdb; /* freeblock block number */ |
909 | int findex; /* index in freeblock entries */ | 1052 | int findex; /* index in freeblock entries */ |
910 | xfs_dir2_free_t *free; /* freeblock structure */ | 1053 | xfs_dir2_free_t *free; /* freeblock structure */ |
911 | int logfree; /* need to log free entry */ | ||
912 | 1054 | ||
913 | /* | 1055 | /* |
914 | * Convert the data block number to a free block, | 1056 | * Convert the data block number to a free block, |
915 | * read in the free block. | 1057 | * read in the free block. |
916 | */ | 1058 | */ |
917 | fdb = xfs_dir2_db_to_fdb(mp, db); | 1059 | fdb = xfs_dir2_db_to_fdb(mp, db); |
918 | if ((error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, fdb), | 1060 | error = xfs_dir2_free_read(tp, dp, xfs_dir2_db_to_da(mp, fdb), |
919 | -1, &fbp, XFS_DATA_FORK))) { | 1061 | &fbp); |
1062 | if (error) | ||
920 | return error; | 1063 | return error; |
921 | } | ||
922 | free = fbp->b_addr; | 1064 | free = fbp->b_addr; |
923 | ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); | 1065 | ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); |
924 | ASSERT(be32_to_cpu(free->hdr.firstdb) == | 1066 | ASSERT(be32_to_cpu(free->hdr.firstdb) == |
@@ -954,68 +1096,12 @@ xfs_dir2_leafn_remove( | |||
954 | * If we got rid of the data block, we can eliminate that entry | 1096 | * If we got rid of the data block, we can eliminate that entry |
955 | * in the free block. | 1097 | * in the free block. |
956 | */ | 1098 | */ |
957 | if (hdr == NULL) { | 1099 | error = xfs_dir2_data_block_free(args, hdr, free, |
958 | /* | 1100 | fdb, findex, fbp, longest); |
959 | * One less used entry in the free table. | 1101 | if (error) |
960 | */ | 1102 | return error; |
961 | be32_add_cpu(&free->hdr.nused, -1); | ||
962 | xfs_dir2_free_log_header(tp, fbp); | ||
963 | /* | ||
964 | * If this was the last entry in the table, we can | ||
965 | * trim the table size back. There might be other | ||
966 | * entries at the end referring to non-existent | ||
967 | * data blocks, get those too. | ||
968 | */ | ||
969 | if (findex == be32_to_cpu(free->hdr.nvalid) - 1) { | ||
970 | int i; /* free entry index */ | ||
971 | |||
972 | for (i = findex - 1; | ||
973 | i >= 0 && | ||
974 | free->bests[i] == cpu_to_be16(NULLDATAOFF); | ||
975 | i--) | ||
976 | continue; | ||
977 | free->hdr.nvalid = cpu_to_be32(i + 1); | ||
978 | logfree = 0; | ||
979 | } | ||
980 | /* | ||
981 | * Not the last entry, just punch it out. | ||
982 | */ | ||
983 | else { | ||
984 | free->bests[findex] = cpu_to_be16(NULLDATAOFF); | ||
985 | logfree = 1; | ||
986 | } | ||
987 | /* | ||
988 | * If there are no useful entries left in the block, | ||
989 | * get rid of the block if we can. | ||
990 | */ | ||
991 | if (!free->hdr.nused) { | ||
992 | error = xfs_dir2_shrink_inode(args, fdb, fbp); | ||
993 | if (error == 0) { | ||
994 | fbp = NULL; | ||
995 | logfree = 0; | ||
996 | } else if (error != ENOSPC || args->total != 0) | ||
997 | return error; | ||
998 | /* | ||
999 | * It's possible to get ENOSPC if there is no | ||
1000 | * space reservation. In this case some one | ||
1001 | * else will eventually get rid of this block. | ||
1002 | */ | ||
1003 | } | ||
1004 | } | ||
1005 | /* | ||
1006 | * Data block is not empty, just set the free entry to | ||
1007 | * the new value. | ||
1008 | */ | ||
1009 | else { | ||
1010 | free->bests[findex] = cpu_to_be16(longest); | ||
1011 | logfree = 1; | ||
1012 | } | ||
1013 | /* | ||
1014 | * Log the free entry that changed, unless we got rid of it. | ||
1015 | */ | ||
1016 | if (logfree) | ||
1017 | xfs_dir2_free_log_bests(tp, fbp, findex, findex); | ||
1018 | } | 1103 | } |
1104 | |||
1019 | xfs_dir2_leafn_check(dp, bp); | 1105 | xfs_dir2_leafn_check(dp, bp); |
1020 | /* | 1106 | /* |
1021 | * Return indication of whether this leaf block is empty enough | 1107 | * Return indication of whether this leaf block is empty enough |
@@ -1169,12 +1255,11 @@ xfs_dir2_leafn_toosmall( | |||
1169 | /* | 1255 | /* |
1170 | * Read the sibling leaf block. | 1256 | * Read the sibling leaf block. |
1171 | */ | 1257 | */ |
1172 | if ((error = | 1258 | error = xfs_dir2_leafn_read(state->args->trans, state->args->dp, |
1173 | xfs_da_read_buf(state->args->trans, state->args->dp, blkno, | 1259 | blkno, -1, &bp); |
1174 | -1, &bp, XFS_DATA_FORK))) { | 1260 | if (error) |
1175 | return error; | 1261 | return error; |
1176 | } | 1262 | |
1177 | ASSERT(bp != NULL); | ||
1178 | /* | 1263 | /* |
1179 | * Count bytes in the two blocks combined. | 1264 | * Count bytes in the two blocks combined. |
1180 | */ | 1265 | */ |
@@ -1454,14 +1539,13 @@ xfs_dir2_node_addname_int( | |||
1454 | * This should be really rare, so there's no reason | 1539 | * This should be really rare, so there's no reason |
1455 | * to avoid it. | 1540 | * to avoid it. |
1456 | */ | 1541 | */ |
1457 | if ((error = xfs_da_read_buf(tp, dp, | 1542 | error = xfs_dir2_free_try_read(tp, dp, |
1458 | xfs_dir2_db_to_da(mp, fbno), -2, &fbp, | 1543 | xfs_dir2_db_to_da(mp, fbno), |
1459 | XFS_DATA_FORK))) { | 1544 | &fbp); |
1545 | if (error) | ||
1460 | return error; | 1546 | return error; |
1461 | } | 1547 | if (!fbp) |
1462 | if (unlikely(fbp == NULL)) { | ||
1463 | continue; | 1548 | continue; |
1464 | } | ||
1465 | free = fbp->b_addr; | 1549 | free = fbp->b_addr; |
1466 | ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); | 1550 | ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); |
1467 | findex = 0; | 1551 | findex = 0; |
@@ -1520,9 +1604,10 @@ xfs_dir2_node_addname_int( | |||
1520 | * that was just allocated. | 1604 | * that was just allocated. |
1521 | */ | 1605 | */ |
1522 | fbno = xfs_dir2_db_to_fdb(mp, dbno); | 1606 | fbno = xfs_dir2_db_to_fdb(mp, dbno); |
1523 | if (unlikely(error = xfs_da_read_buf(tp, dp, | 1607 | error = xfs_dir2_free_try_read(tp, dp, |
1524 | xfs_dir2_db_to_da(mp, fbno), -2, &fbp, | 1608 | xfs_dir2_db_to_da(mp, fbno), |
1525 | XFS_DATA_FORK))) | 1609 | &fbp); |
1610 | if (error) | ||
1526 | return error; | 1611 | return error; |
1527 | 1612 | ||
1528 | /* | 1613 | /* |
@@ -1561,12 +1646,12 @@ xfs_dir2_node_addname_int( | |||
1561 | /* | 1646 | /* |
1562 | * Get a buffer for the new block. | 1647 | * Get a buffer for the new block. |
1563 | */ | 1648 | */ |
1564 | if ((error = xfs_da_get_buf(tp, dp, | 1649 | error = xfs_da_get_buf(tp, dp, |
1565 | xfs_dir2_db_to_da(mp, fbno), | 1650 | xfs_dir2_db_to_da(mp, fbno), |
1566 | -1, &fbp, XFS_DATA_FORK))) { | 1651 | -1, &fbp, XFS_DATA_FORK); |
1652 | if (error) | ||
1567 | return error; | 1653 | return error; |
1568 | } | 1654 | fbp->b_ops = &xfs_dir2_free_buf_ops; |
1569 | ASSERT(fbp != NULL); | ||
1570 | 1655 | ||
1571 | /* | 1656 | /* |
1572 | * Initialize the new block to be empty, and remember | 1657 | * Initialize the new block to be empty, and remember |
@@ -1630,8 +1715,8 @@ xfs_dir2_node_addname_int( | |||
1630 | /* | 1715 | /* |
1631 | * Read the data block in. | 1716 | * Read the data block in. |
1632 | */ | 1717 | */ |
1633 | error = xfs_da_read_buf(tp, dp, xfs_dir2_db_to_da(mp, dbno), | 1718 | error = xfs_dir2_data_read(tp, dp, xfs_dir2_db_to_da(mp, dbno), |
1634 | -1, &dbp, XFS_DATA_FORK); | 1719 | -1, &dbp); |
1635 | if (error) | 1720 | if (error) |
1636 | return error; | 1721 | return error; |
1637 | hdr = dbp->b_addr; | 1722 | hdr = dbp->b_addr; |
@@ -1917,18 +2002,15 @@ xfs_dir2_node_trim_free( | |||
1917 | /* | 2002 | /* |
1918 | * Read the freespace block. | 2003 | * Read the freespace block. |
1919 | */ | 2004 | */ |
1920 | if (unlikely(error = xfs_da_read_buf(tp, dp, (xfs_dablk_t)fo, -2, &bp, | 2005 | error = xfs_dir2_free_try_read(tp, dp, fo, &bp); |
1921 | XFS_DATA_FORK))) { | 2006 | if (error) |
1922 | return error; | 2007 | return error; |
1923 | } | ||
1924 | |||
1925 | /* | 2008 | /* |
1926 | * There can be holes in freespace. If fo is a hole, there's | 2009 | * There can be holes in freespace. If fo is a hole, there's |
1927 | * nothing to do. | 2010 | * nothing to do. |
1928 | */ | 2011 | */ |
1929 | if (bp == NULL) { | 2012 | if (!bp) |
1930 | return 0; | 2013 | return 0; |
1931 | } | ||
1932 | free = bp->b_addr; | 2014 | free = bp->b_addr; |
1933 | ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); | 2015 | ASSERT(free->hdr.magic == cpu_to_be32(XFS_DIR2_FREE_MAGIC)); |
1934 | /* | 2016 | /* |
diff --git a/fs/xfs/xfs_dir2_priv.h b/fs/xfs/xfs_dir2_priv.h index 3523d3e15aa8..7da79f6515fd 100644 --- a/fs/xfs/xfs_dir2_priv.h +++ b/fs/xfs/xfs_dir2_priv.h | |||
@@ -30,6 +30,8 @@ extern int xfs_dir_cilookup_result(struct xfs_da_args *args, | |||
30 | const unsigned char *name, int len); | 30 | const unsigned char *name, int len); |
31 | 31 | ||
32 | /* xfs_dir2_block.c */ | 32 | /* xfs_dir2_block.c */ |
33 | extern const struct xfs_buf_ops xfs_dir2_block_buf_ops; | ||
34 | |||
33 | extern int xfs_dir2_block_addname(struct xfs_da_args *args); | 35 | extern int xfs_dir2_block_addname(struct xfs_da_args *args); |
34 | extern int xfs_dir2_block_getdents(struct xfs_inode *dp, void *dirent, | 36 | extern int xfs_dir2_block_getdents(struct xfs_inode *dp, void *dirent, |
35 | xfs_off_t *offset, filldir_t filldir); | 37 | xfs_off_t *offset, filldir_t filldir); |
@@ -41,10 +43,19 @@ extern int xfs_dir2_leaf_to_block(struct xfs_da_args *args, | |||
41 | 43 | ||
42 | /* xfs_dir2_data.c */ | 44 | /* xfs_dir2_data.c */ |
43 | #ifdef DEBUG | 45 | #ifdef DEBUG |
44 | extern void xfs_dir2_data_check(struct xfs_inode *dp, struct xfs_buf *bp); | 46 | #define xfs_dir2_data_check(dp,bp) __xfs_dir2_data_check(dp, bp) |
45 | #else | 47 | #else |
46 | #define xfs_dir2_data_check(dp,bp) | 48 | #define xfs_dir2_data_check(dp,bp) |
47 | #endif | 49 | #endif |
50 | |||
51 | extern const struct xfs_buf_ops xfs_dir2_data_buf_ops; | ||
52 | |||
53 | extern int __xfs_dir2_data_check(struct xfs_inode *dp, struct xfs_buf *bp); | ||
54 | extern int xfs_dir2_data_read(struct xfs_trans *tp, struct xfs_inode *dp, | ||
55 | xfs_dablk_t bno, xfs_daddr_t mapped_bno, struct xfs_buf **bpp); | ||
56 | extern int xfs_dir2_data_readahead(struct xfs_trans *tp, struct xfs_inode *dp, | ||
57 | xfs_dablk_t bno, xfs_daddr_t mapped_bno); | ||
58 | |||
48 | extern struct xfs_dir2_data_free * | 59 | extern struct xfs_dir2_data_free * |
49 | xfs_dir2_data_freeinsert(struct xfs_dir2_data_hdr *hdr, | 60 | xfs_dir2_data_freeinsert(struct xfs_dir2_data_hdr *hdr, |
50 | struct xfs_dir2_data_unused *dup, int *loghead); | 61 | struct xfs_dir2_data_unused *dup, int *loghead); |
@@ -66,6 +77,10 @@ extern void xfs_dir2_data_use_free(struct xfs_trans *tp, struct xfs_buf *bp, | |||
66 | xfs_dir2_data_aoff_t len, int *needlogp, int *needscanp); | 77 | xfs_dir2_data_aoff_t len, int *needlogp, int *needscanp); |
67 | 78 | ||
68 | /* xfs_dir2_leaf.c */ | 79 | /* xfs_dir2_leaf.c */ |
80 | extern const struct xfs_buf_ops xfs_dir2_leafn_buf_ops; | ||
81 | |||
82 | extern int xfs_dir2_leafn_read(struct xfs_trans *tp, struct xfs_inode *dp, | ||
83 | xfs_dablk_t fbno, xfs_daddr_t mappedbno, struct xfs_buf **bpp); | ||
69 | extern int xfs_dir2_block_to_leaf(struct xfs_da_args *args, | 84 | extern int xfs_dir2_block_to_leaf(struct xfs_da_args *args, |
70 | struct xfs_buf *dbp); | 85 | struct xfs_buf *dbp); |
71 | extern int xfs_dir2_leaf_addname(struct xfs_da_args *args); | 86 | extern int xfs_dir2_leaf_addname(struct xfs_da_args *args); |
@@ -115,6 +130,8 @@ extern int xfs_dir2_node_removename(struct xfs_da_args *args); | |||
115 | extern int xfs_dir2_node_replace(struct xfs_da_args *args); | 130 | extern int xfs_dir2_node_replace(struct xfs_da_args *args); |
116 | extern int xfs_dir2_node_trim_free(struct xfs_da_args *args, xfs_fileoff_t fo, | 131 | extern int xfs_dir2_node_trim_free(struct xfs_da_args *args, xfs_fileoff_t fo, |
117 | int *rvalp); | 132 | int *rvalp); |
133 | extern int xfs_dir2_free_read(struct xfs_trans *tp, struct xfs_inode *dp, | ||
134 | xfs_dablk_t fbno, struct xfs_buf **bpp); | ||
118 | 135 | ||
119 | /* xfs_dir2_sf.c */ | 136 | /* xfs_dir2_sf.c */ |
120 | extern xfs_ino_t xfs_dir2_sf_get_parent_ino(struct xfs_dir2_sf_hdr *sfp); | 137 | extern xfs_ino_t xfs_dir2_sf_get_parent_ino(struct xfs_dir2_sf_hdr *sfp); |
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index bf27fcca4843..9e1bf5294c91 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c | |||
@@ -248,7 +248,59 @@ xfs_qm_init_dquot_blk( | |||
248 | xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1); | 248 | xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1); |
249 | } | 249 | } |
250 | 250 | ||
251 | static void | ||
252 | xfs_dquot_buf_verify( | ||
253 | struct xfs_buf *bp) | ||
254 | { | ||
255 | struct xfs_mount *mp = bp->b_target->bt_mount; | ||
256 | struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr; | ||
257 | struct xfs_disk_dquot *ddq; | ||
258 | xfs_dqid_t id = 0; | ||
259 | int i; | ||
260 | |||
261 | /* | ||
262 | * On the first read of the buffer, verify that each dquot is valid. | ||
263 | * We don't know what the id of the dquot is supposed to be, just that | ||
264 | * they should be increasing monotonically within the buffer. If the | ||
265 | * first id is corrupt, then it will fail on the second dquot in the | ||
266 | * buffer so corruptions could point to the wrong dquot in this case. | ||
267 | */ | ||
268 | for (i = 0; i < mp->m_quotainfo->qi_dqperchunk; i++) { | ||
269 | int error; | ||
270 | |||
271 | ddq = &d[i].dd_diskdq; | ||
272 | |||
273 | if (i == 0) | ||
274 | id = be32_to_cpu(ddq->d_id); | ||
275 | |||
276 | error = xfs_qm_dqcheck(mp, ddq, id + i, 0, XFS_QMOPT_DOWARN, | ||
277 | "xfs_dquot_read_verify"); | ||
278 | if (error) { | ||
279 | XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, d); | ||
280 | xfs_buf_ioerror(bp, EFSCORRUPTED); | ||
281 | break; | ||
282 | } | ||
283 | } | ||
284 | } | ||
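A worked illustration of the id check above, with assumed numbers: if slot 0 of a chunk carries d_id 100 and qi_dqperchunk is 30, the loop expects ids 100 through 129 in order. A buffer holding ids 100, 101, 103 and so on fails xfs_qm_dqcheck() at slot 2 (expected 102), which is why the comment warns that the reported dquot may not be the one that is actually corrupt.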
285 | |||
286 | static void | ||
287 | xfs_dquot_buf_read_verify( | ||
288 | struct xfs_buf *bp) | ||
289 | { | ||
290 | xfs_dquot_buf_verify(bp); | ||
291 | } | ||
292 | |||
293 | void | ||
294 | xfs_dquot_buf_write_verify( | ||
295 | struct xfs_buf *bp) | ||
296 | { | ||
297 | xfs_dquot_buf_verify(bp); | ||
298 | } | ||
251 | 299 | ||
300 | const struct xfs_buf_ops xfs_dquot_buf_ops = { | ||
301 | .verify_read = xfs_dquot_buf_read_verify, | ||
302 | .verify_write = xfs_dquot_buf_write_verify, | ||
303 | }; | ||
252 | 304 | ||
253 | /* | 305 | /* |
254 | * Allocate a block and fill it with dquots. | 306 | * Allocate a block and fill it with dquots. |
@@ -315,6 +367,7 @@ xfs_qm_dqalloc( | |||
315 | error = xfs_buf_geterror(bp); | 367 | error = xfs_buf_geterror(bp); |
316 | if (error) | 368 | if (error) |
317 | goto error1; | 369 | goto error1; |
370 | bp->b_ops = &xfs_dquot_buf_ops; | ||
318 | 371 | ||
319 | /* | 372 | /* |
320 | * Make a chunk of dquots out of this buffer and log | 373 | * Make a chunk of dquots out of this buffer and log |
@@ -359,6 +412,51 @@ xfs_qm_dqalloc( | |||
359 | 412 | ||
360 | return (error); | 413 | return (error); |
361 | } | 414 | } |
415 | STATIC int | ||
416 | xfs_qm_dqrepair( | ||
417 | struct xfs_mount *mp, | ||
418 | struct xfs_trans *tp, | ||
419 | struct xfs_dquot *dqp, | ||
420 | xfs_dqid_t firstid, | ||
421 | struct xfs_buf **bpp) | ||
422 | { | ||
423 | int error; | ||
424 | struct xfs_disk_dquot *ddq; | ||
425 | struct xfs_dqblk *d; | ||
426 | int i; | ||
427 | |||
428 | /* | ||
429 | * Read the buffer without verification so we get the corrupted | ||
430 | * buffer returned to us. Make sure we verify it on write, though. | ||
431 | */ | ||
432 | error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, dqp->q_blkno, | ||
433 | mp->m_quotainfo->qi_dqchunklen, | ||
434 | 0, bpp, NULL); | ||
435 | |||
436 | if (error) { | ||
437 | ASSERT(*bpp == NULL); | ||
438 | return XFS_ERROR(error); | ||
439 | } | ||
440 | (*bpp)->b_ops = &xfs_dquot_buf_ops; | ||
441 | |||
442 | ASSERT(xfs_buf_islocked(*bpp)); | ||
443 | d = (struct xfs_dqblk *)(*bpp)->b_addr; | ||
444 | |||
445 | /* Do the actual repair of dquots in this buffer */ | ||
446 | for (i = 0; i < mp->m_quotainfo->qi_dqperchunk; i++) { | ||
447 | ddq = &d[i].dd_diskdq; | ||
448 | error = xfs_qm_dqcheck(mp, ddq, firstid + i, | ||
449 | dqp->dq_flags & XFS_DQ_ALLTYPES, | ||
450 | XFS_QMOPT_DQREPAIR, "xfs_qm_dqrepair"); | ||
451 | if (error) { | ||
452 | /* repair failed, we're screwed */ | ||
453 | xfs_trans_brelse(tp, *bpp); | ||
454 | return XFS_ERROR(EIO); | ||
455 | } | ||
456 | } | ||
457 | |||
458 | return 0; | ||
459 | } | ||
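The repair path above inverts the usual arrangement; a compact sketch of the pattern, assuming (as the comment in the function states) that a NULL ops pointer suppresses read-time verification. blkno and len stand in for dqp->q_blkno and mp->m_quotainfo->qi_dqchunklen.

	/* read unverified so the corrupt chunk is actually returned */
	error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, blkno,
				   len, 0, &bp, NULL);
	if (error)
		return XFS_ERROR(error);
	/* attach the ops afterwards so verify_write still runs */
	bp->b_ops = &xfs_dquot_buf_ops;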
362 | 460 | ||
363 | /* | 461 | /* |
364 | * Maps a dquot to the buffer containing its on-disk version. | 462 | * Maps a dquot to the buffer containing its on-disk version. |
@@ -378,7 +476,6 @@ xfs_qm_dqtobp( | |||
378 | xfs_buf_t *bp; | 476 | xfs_buf_t *bp; |
379 | xfs_inode_t *quotip = XFS_DQ_TO_QIP(dqp); | 477 | xfs_inode_t *quotip = XFS_DQ_TO_QIP(dqp); |
380 | xfs_mount_t *mp = dqp->q_mount; | 478 | xfs_mount_t *mp = dqp->q_mount; |
381 | xfs_disk_dquot_t *ddq; | ||
382 | xfs_dqid_t id = be32_to_cpu(dqp->q_core.d_id); | 479 | xfs_dqid_t id = be32_to_cpu(dqp->q_core.d_id); |
383 | xfs_trans_t *tp = (tpp ? *tpp : NULL); | 480 | xfs_trans_t *tp = (tpp ? *tpp : NULL); |
384 | 481 | ||
@@ -439,33 +536,24 @@ xfs_qm_dqtobp( | |||
439 | error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, | 536 | error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, |
440 | dqp->q_blkno, | 537 | dqp->q_blkno, |
441 | mp->m_quotainfo->qi_dqchunklen, | 538 | mp->m_quotainfo->qi_dqchunklen, |
442 | 0, &bp); | 539 | 0, &bp, &xfs_dquot_buf_ops); |
443 | if (error || !bp) | ||
444 | return XFS_ERROR(error); | ||
445 | } | ||
446 | |||
447 | ASSERT(xfs_buf_islocked(bp)); | ||
448 | 540 | ||
449 | /* | 541 | if (error == EFSCORRUPTED && (flags & XFS_QMOPT_DQREPAIR)) { |
450 | * calculate the location of the dquot inside the buffer. | 542 | xfs_dqid_t firstid = (xfs_dqid_t)map.br_startoff * |
451 | */ | 543 | mp->m_quotainfo->qi_dqperchunk; |
452 | ddq = bp->b_addr + dqp->q_bufoffset; | 544 | ASSERT(bp == NULL); |
545 | error = xfs_qm_dqrepair(mp, tp, dqp, firstid, &bp); | ||
546 | } | ||
453 | 547 | ||
454 | /* | 548 | if (error) { |
455 | * A simple sanity check in case we got a corrupted dquot... | 549 | ASSERT(bp == NULL); |
456 | */ | 550 | return XFS_ERROR(error); |
457 | error = xfs_qm_dqcheck(mp, ddq, id, dqp->dq_flags & XFS_DQ_ALLTYPES, | ||
458 | flags & (XFS_QMOPT_DQREPAIR|XFS_QMOPT_DOWARN), | ||
459 | "dqtobp"); | ||
460 | if (error) { | ||
461 | if (!(flags & XFS_QMOPT_DQREPAIR)) { | ||
462 | xfs_trans_brelse(tp, bp); | ||
463 | return XFS_ERROR(EIO); | ||
464 | } | 551 | } |
465 | } | 552 | } |
466 | 553 | ||
554 | ASSERT(xfs_buf_islocked(bp)); | ||
467 | *O_bpp = bp; | 555 | *O_bpp = bp; |
468 | *O_ddpp = ddq; | 556 | *O_ddpp = bp->b_addr + dqp->q_bufoffset; |
469 | 557 | ||
470 | return (0); | 558 | return (0); |
471 | } | 559 | } |
@@ -920,7 +1008,7 @@ xfs_qm_dqflush( | |||
920 | * Get the buffer containing the on-disk dquot | 1008 | * Get the buffer containing the on-disk dquot |
921 | */ | 1009 | */ |
922 | error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno, | 1010 | error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dqp->q_blkno, |
923 | mp->m_quotainfo->qi_dqchunklen, 0, &bp); | 1011 | mp->m_quotainfo->qi_dqchunklen, 0, &bp, NULL); |
924 | if (error) | 1012 | if (error) |
925 | goto out_unlock; | 1013 | goto out_unlock; |
926 | 1014 | ||
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h index 7d20af27346d..c694a8469c4a 100644 --- a/fs/xfs/xfs_dquot.h +++ b/fs/xfs/xfs_dquot.h | |||
@@ -161,4 +161,6 @@ static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp) | |||
161 | return dqp; | 161 | return dqp; |
162 | } | 162 | } |
163 | 163 | ||
164 | extern const struct xfs_buf_ops xfs_dquot_buf_ops; | ||
165 | |||
164 | #endif /* __XFS_DQUOT_H__ */ | 166 | #endif /* __XFS_DQUOT_H__ */ |
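The ops vector declared here is the pattern this series repeats for every metadata type (xfs_agi_buf_ops and xfs_inobt_buf_ops appear further down): one structural check shared by a read hook and a write hook, reporting failure by marking the buffer with xfs_buf_ioerror() rather than returning an error. A condensed sketch, assuming only the verify_read/verify_write signatures visible later in this diff; the foo names are placeholders:

static void xfs_foo_verify(struct xfs_buf *bp)
{
	/* structural checks only: magic, version, bounds... */
	if (!foo_structure_ok(bp->b_addr))		/* hypothetical check */
		xfs_buf_ioerror(bp, EFSCORRUPTED);	/* flag, don't return */
}

static void xfs_foo_read_verify(struct xfs_buf *bp)  { xfs_foo_verify(bp); }
static void xfs_foo_write_verify(struct xfs_buf *bp) { xfs_foo_verify(bp); }

const struct xfs_buf_ops xfs_foo_buf_ops = {
	.verify_read	= xfs_foo_read_verify,
	.verify_write	= xfs_foo_write_verify,
};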
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c index 8c6d1d70278c..a83611849cee 100644 --- a/fs/xfs/xfs_export.c +++ b/fs/xfs/xfs_export.c | |||
@@ -29,6 +29,7 @@ | |||
29 | #include "xfs_inode.h" | 29 | #include "xfs_inode.h" |
30 | #include "xfs_inode_item.h" | 30 | #include "xfs_inode_item.h" |
31 | #include "xfs_trace.h" | 31 | #include "xfs_trace.h" |
32 | #include "xfs_icache.h" | ||
32 | 33 | ||
33 | /* | 34 | /* |
34 | * Note that we only accept fileids which are long enough rather than allow | 35 | * Note that we only accept fileids which are long enough rather than allow |
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index aa473fa640a2..67284edb84d7 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c | |||
@@ -31,6 +31,8 @@ | |||
31 | #include "xfs_error.h" | 31 | #include "xfs_error.h" |
32 | #include "xfs_vnodeops.h" | 32 | #include "xfs_vnodeops.h" |
33 | #include "xfs_da_btree.h" | 33 | #include "xfs_da_btree.h" |
34 | #include "xfs_dir2_format.h" | ||
35 | #include "xfs_dir2_priv.h" | ||
34 | #include "xfs_ioctl.h" | 36 | #include "xfs_ioctl.h" |
35 | #include "xfs_trace.h" | 37 | #include "xfs_trace.h" |
36 | 38 | ||
@@ -84,7 +86,7 @@ xfs_rw_ilock_demote( | |||
84 | * valid before the operation, it will be read from disk before | 86 | * valid before the operation, it will be read from disk before |
85 | * being partially zeroed. | 87 | * being partially zeroed. |
86 | */ | 88 | */ |
87 | STATIC int | 89 | int |
88 | xfs_iozero( | 90 | xfs_iozero( |
89 | struct xfs_inode *ip, /* inode */ | 91 | struct xfs_inode *ip, /* inode */ |
90 | loff_t pos, /* offset in file */ | 92 | loff_t pos, /* offset in file */ |
@@ -255,15 +257,14 @@ xfs_file_aio_read( | |||
255 | xfs_buftarg_t *target = | 257 | xfs_buftarg_t *target = |
256 | XFS_IS_REALTIME_INODE(ip) ? | 258 | XFS_IS_REALTIME_INODE(ip) ? |
257 | mp->m_rtdev_targp : mp->m_ddev_targp; | 259 | mp->m_rtdev_targp : mp->m_ddev_targp; |
258 | if ((iocb->ki_pos & target->bt_smask) || | 260 | if ((pos & target->bt_smask) || (size & target->bt_smask)) { |
259 | (size & target->bt_smask)) { | 261 | if (pos == i_size_read(inode)) |
260 | if (iocb->ki_pos == i_size_read(inode)) | ||
261 | return 0; | 262 | return 0; |
262 | return -XFS_ERROR(EINVAL); | 263 | return -XFS_ERROR(EINVAL); |
263 | } | 264 | } |
264 | } | 265 | } |
265 | 266 | ||
266 | n = mp->m_super->s_maxbytes - iocb->ki_pos; | 267 | n = mp->m_super->s_maxbytes - pos; |
267 | if (n <= 0 || size == 0) | 268 | if (n <= 0 || size == 0) |
268 | return 0; | 269 | return 0; |
269 | 270 | ||
@@ -289,20 +290,21 @@ xfs_file_aio_read( | |||
289 | xfs_rw_ilock(ip, XFS_IOLOCK_EXCL); | 290 | xfs_rw_ilock(ip, XFS_IOLOCK_EXCL); |
290 | 291 | ||
291 | if (inode->i_mapping->nrpages) { | 292 | if (inode->i_mapping->nrpages) { |
292 | ret = -xfs_flushinval_pages(ip, | 293 | ret = -filemap_write_and_wait_range( |
293 | (iocb->ki_pos & PAGE_CACHE_MASK), | 294 | VFS_I(ip)->i_mapping, |
294 | -1, FI_REMAPF_LOCKED); | 295 | pos, -1); |
295 | if (ret) { | 296 | if (ret) { |
296 | xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL); | 297 | xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL); |
297 | return ret; | 298 | return ret; |
298 | } | 299 | } |
300 | truncate_pagecache_range(VFS_I(ip), pos, -1); | ||
299 | } | 301 | } |
300 | xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); | 302 | xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); |
301 | } | 303 | } |
302 | 304 | ||
303 | trace_xfs_file_read(ip, size, iocb->ki_pos, ioflags); | 305 | trace_xfs_file_read(ip, size, pos, ioflags); |
304 | 306 | ||
305 | ret = generic_file_aio_read(iocb, iovp, nr_segs, iocb->ki_pos); | 307 | ret = generic_file_aio_read(iocb, iovp, nr_segs, pos); |
306 | if (ret > 0) | 308 | if (ret > 0) |
307 | XFS_STATS_ADD(xs_read_bytes, ret); | 309 | XFS_STATS_ADD(xs_read_bytes, ret); |
308 | 310 | ||
@@ -670,10 +672,11 @@ xfs_file_dio_aio_write( | |||
670 | goto out; | 672 | goto out; |
671 | 673 | ||
672 | if (mapping->nrpages) { | 674 | if (mapping->nrpages) { |
673 | ret = -xfs_flushinval_pages(ip, (pos & PAGE_CACHE_MASK), -1, | 675 | ret = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping, |
674 | FI_REMAPF_LOCKED); | 676 | pos, -1); |
675 | if (ret) | 677 | if (ret) |
676 | goto out; | 678 | goto out; |
679 | truncate_pagecache_range(VFS_I(ip), pos, -1); | ||
677 | } | 680 | } |
678 | 681 | ||
679 | /* | 682 | /* |
@@ -728,16 +731,17 @@ xfs_file_buffered_aio_write( | |||
728 | write_retry: | 731 | write_retry: |
729 | trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0); | 732 | trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0); |
730 | ret = generic_file_buffered_write(iocb, iovp, nr_segs, | 733 | ret = generic_file_buffered_write(iocb, iovp, nr_segs, |
731 | pos, &iocb->ki_pos, count, ret); | 734 | pos, &iocb->ki_pos, count, 0); |
735 | |||
732 | /* | 736 | /* |
733 | * if we just got an ENOSPC, flush the inode now we aren't holding any | 737 | * If we just got an ENOSPC, try to write back all dirty inodes to |
734 | * page locks and retry *once* | 738 | * convert delalloc space to free up some of the excess reserved |
739 | * metadata space. | ||
735 | */ | 740 | */ |
736 | if (ret == -ENOSPC && !enospc) { | 741 | if (ret == -ENOSPC && !enospc) { |
737 | enospc = 1; | 742 | enospc = 1; |
738 | ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE); | 743 | xfs_flush_inodes(ip->i_mount); |
739 | if (!ret) | 744 | goto write_retry; |
740 | goto write_retry; | ||
741 | } | 745 | } |
742 | 746 | ||
743 | current->backing_dev_info = NULL; | 747 | current->backing_dev_info = NULL; |
@@ -889,7 +893,7 @@ xfs_dir_open( | |||
889 | */ | 893 | */ |
890 | mode = xfs_ilock_map_shared(ip); | 894 | mode = xfs_ilock_map_shared(ip); |
891 | if (ip->i_d.di_nextents > 0) | 895 | if (ip->i_d.di_nextents > 0) |
892 | xfs_da_reada_buf(NULL, ip, 0, XFS_DATA_FORK); | 896 | xfs_dir2_data_readahead(NULL, ip, 0, -1); |
893 | xfs_iunlock(ip, mode); | 897 | xfs_iunlock(ip, mode); |
894 | return 0; | 898 | return 0; |
895 | } | 899 | } |
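Both I/O paths above replace the removed xfs_flushinval_pages() with the same two generic calls: write back any dirty pagecache over the range, then drop the now-clean pages so a direct transfer cannot race against stale cached data. The sequence in isolation (mapping, inode, and pos are assumed to be in scope as in the functions above):

	/* flush dirty pages, then invalidate, before direct I/O at pos */
	if (mapping->nrpages) {
		ret = filemap_write_and_wait_range(mapping, pos, -1);
		if (ret)
			return ret;	/* writeback failed; don't proceed */
		/* pages are clean now and can simply be dropped */
		truncate_pagecache_range(inode, pos, -1);
	}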
diff --git a/fs/xfs/xfs_fs.h b/fs/xfs/xfs_fs.h index c13fed8c394a..6dda3f949b04 100644 --- a/fs/xfs/xfs_fs.h +++ b/fs/xfs/xfs_fs.h | |||
@@ -233,7 +233,8 @@ typedef struct xfs_fsop_resblks { | |||
233 | #define XFS_FSOP_GEOM_FLAGS_LOGV2 0x0100 /* log format version 2 */ | 233 | #define XFS_FSOP_GEOM_FLAGS_LOGV2 0x0100 /* log format version 2 */ |
234 | #define XFS_FSOP_GEOM_FLAGS_SECTOR 0x0200 /* sector sizes >1BB */ | 234 | #define XFS_FSOP_GEOM_FLAGS_SECTOR 0x0200 /* sector sizes >1BB */ |
235 | #define XFS_FSOP_GEOM_FLAGS_ATTR2 0x0400 /* inline attributes rework */ | 235 | #define XFS_FSOP_GEOM_FLAGS_ATTR2 0x0400 /* inline attributes rework */ |
236 | #define XFS_FSOP_GEOM_FLAGS_DIRV2CI 0x1000 /* ASCII only CI names */ | 236 | #define XFS_FSOP_GEOM_FLAGS_PROJID32 0x0800 /* 32-bit project IDs */ |
237 | #define XFS_FSOP_GEOM_FLAGS_DIRV2CI 0x1000 /* ASCII only CI names */ | ||
237 | #define XFS_FSOP_GEOM_FLAGS_LAZYSB 0x4000 /* lazy superblock counters */ | 238 | #define XFS_FSOP_GEOM_FLAGS_LAZYSB 0x4000 /* lazy superblock counters */ |
238 | 239 | ||
239 | 240 | ||
@@ -339,6 +340,35 @@ typedef struct xfs_error_injection { | |||
339 | 340 | ||
340 | 341 | ||
341 | /* | 342 | /* |
343 | * Speculative preallocation trimming. | ||
344 | */ | ||
345 | #define XFS_EOFBLOCKS_VERSION 1 | ||
346 | struct xfs_eofblocks { | ||
347 | __u32 eof_version; | ||
348 | __u32 eof_flags; | ||
349 | uid_t eof_uid; | ||
350 | gid_t eof_gid; | ||
351 | prid_t eof_prid; | ||
352 | __u32 pad32; | ||
353 | __u64 eof_min_file_size; | ||
354 | __u64 pad64[12]; | ||
355 | }; | ||
356 | |||
357 | /* eof_flags values */ | ||
358 | #define XFS_EOF_FLAGS_SYNC (1 << 0) /* sync/wait mode scan */ | ||
359 | #define XFS_EOF_FLAGS_UID (1 << 1) /* filter by uid */ | ||
360 | #define XFS_EOF_FLAGS_GID (1 << 2) /* filter by gid */ | ||
361 | #define XFS_EOF_FLAGS_PRID (1 << 3) /* filter by project id */ | ||
362 | #define XFS_EOF_FLAGS_MINFILESIZE (1 << 4) /* filter by min file size */ | ||
363 | #define XFS_EOF_FLAGS_VALID \ | ||
364 | (XFS_EOF_FLAGS_SYNC | \ | ||
365 | XFS_EOF_FLAGS_UID | \ | ||
366 | XFS_EOF_FLAGS_GID | \ | ||
367 | XFS_EOF_FLAGS_PRID | \ | ||
368 | XFS_EOF_FLAGS_MINFILESIZE) | ||
369 | |||
370 | |||
371 | /* | ||
342 | * The user-level Handle Request interface structure. | 372 | * The user-level Handle Request interface structure. |
343 | */ | 373 | */ |
344 | typedef struct xfs_fsop_handlereq { | 374 | typedef struct xfs_fsop_handlereq { |
@@ -456,6 +486,7 @@ typedef struct xfs_handle { | |||
456 | /* XFS_IOC_GETBIOSIZE ---- deprecated 47 */ | 486 | /* XFS_IOC_GETBIOSIZE ---- deprecated 47 */ |
457 | #define XFS_IOC_GETBMAPX _IOWR('X', 56, struct getbmap) | 487 | #define XFS_IOC_GETBMAPX _IOWR('X', 56, struct getbmap) |
458 | #define XFS_IOC_ZERO_RANGE _IOW ('X', 57, struct xfs_flock64) | 488 | #define XFS_IOC_ZERO_RANGE _IOW ('X', 57, struct xfs_flock64) |
489 | #define XFS_IOC_FREE_EOFBLOCKS _IOR ('X', 58, struct xfs_eofblocks) | ||
459 | 490 | ||
460 | /* | 491 | /* |
461 | * ioctl commands that replace IRIX syssgi()'s | 492 | * ioctl commands that replace IRIX syssgi()'s |
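XFS_IOC_FREE_EOFBLOCKS is userspace-visible, so the trim can be driven directly from a program. A hedged example: it re-declares the structure and command exactly as defined above rather than relying on a particular header install, and restricts the scan to the calling user's files; whether a given kernel accepts the call depends on it carrying this patch set.

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/types.h>
#include <unistd.h>

typedef uint32_t prid_t;	/* project ID, as in the XFS headers */

#define XFS_EOFBLOCKS_VERSION	1
struct xfs_eofblocks {		/* mirrors the definition above */
	uint32_t	eof_version;
	uint32_t	eof_flags;
	uid_t		eof_uid;
	gid_t		eof_gid;
	prid_t		eof_prid;
	uint32_t	pad32;
	uint64_t	eof_min_file_size;
	uint64_t	pad64[12];
};

#define XFS_EOF_FLAGS_SYNC	(1 << 0)
#define XFS_EOF_FLAGS_UID	(1 << 1)
#define XFS_IOC_FREE_EOFBLOCKS	_IOR('X', 58, struct xfs_eofblocks)

int main(int argc, char **argv)
{
	struct xfs_eofblocks eofb;
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <xfs-mountpoint>\n", argv[0]);
		return 1;
	}
	fd = open(argv[1], O_RDONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	memset(&eofb, 0, sizeof(eofb));		/* pad fields must be zero */
	eofb.eof_version = XFS_EOFBLOCKS_VERSION;
	eofb.eof_flags = XFS_EOF_FLAGS_SYNC | XFS_EOF_FLAGS_UID;
	eofb.eof_uid = getuid();		/* only this user's inodes */

	if (ioctl(fd, XFS_IOC_FREE_EOFBLOCKS, &eofb) < 0)
		perror("XFS_IOC_FREE_EOFBLOCKS");
	close(fd);
	return 0;
}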
diff --git a/fs/xfs/xfs_fs_subr.c b/fs/xfs/xfs_fs_subr.c deleted file mode 100644 index 652b875a9d4c..000000000000 --- a/fs/xfs/xfs_fs_subr.c +++ /dev/null | |||
@@ -1,96 +0,0 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2000-2002,2005-2006 Silicon Graphics, Inc. | ||
3 | * All Rights Reserved. | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or | ||
6 | * modify it under the terms of the GNU General Public License as | ||
7 | * published by the Free Software Foundation. | ||
8 | * | ||
9 | * This program is distributed in the hope that it would be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | * GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write the Free Software Foundation, | ||
16 | * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA | ||
17 | */ | ||
18 | #include "xfs.h" | ||
19 | #include "xfs_vnodeops.h" | ||
20 | #include "xfs_bmap_btree.h" | ||
21 | #include "xfs_inode.h" | ||
22 | #include "xfs_trace.h" | ||
23 | |||
24 | /* | ||
25 | * note: all filemap functions return negative error codes. These | ||
26 | * need to be inverted before returning to the xfs core functions. | ||
27 | */ | ||
28 | void | ||
29 | xfs_tosspages( | ||
30 | xfs_inode_t *ip, | ||
31 | xfs_off_t first, | ||
32 | xfs_off_t last, | ||
33 | int fiopt) | ||
34 | { | ||
35 | /* can't toss partial tail pages, so mask them out */ | ||
36 | last &= ~(PAGE_SIZE - 1); | ||
37 | truncate_inode_pages_range(VFS_I(ip)->i_mapping, first, last - 1); | ||
38 | } | ||
39 | |||
40 | int | ||
41 | xfs_flushinval_pages( | ||
42 | xfs_inode_t *ip, | ||
43 | xfs_off_t first, | ||
44 | xfs_off_t last, | ||
45 | int fiopt) | ||
46 | { | ||
47 | struct address_space *mapping = VFS_I(ip)->i_mapping; | ||
48 | int ret = 0; | ||
49 | |||
50 | trace_xfs_pagecache_inval(ip, first, last); | ||
51 | |||
52 | xfs_iflags_clear(ip, XFS_ITRUNCATED); | ||
53 | ret = filemap_write_and_wait_range(mapping, first, | ||
54 | last == -1 ? LLONG_MAX : last); | ||
55 | if (!ret) | ||
56 | truncate_inode_pages_range(mapping, first, last); | ||
57 | return -ret; | ||
58 | } | ||
59 | |||
60 | int | ||
61 | xfs_flush_pages( | ||
62 | xfs_inode_t *ip, | ||
63 | xfs_off_t first, | ||
64 | xfs_off_t last, | ||
65 | uint64_t flags, | ||
66 | int fiopt) | ||
67 | { | ||
68 | struct address_space *mapping = VFS_I(ip)->i_mapping; | ||
69 | int ret = 0; | ||
70 | int ret2; | ||
71 | |||
72 | xfs_iflags_clear(ip, XFS_ITRUNCATED); | ||
73 | ret = -filemap_fdatawrite_range(mapping, first, | ||
74 | last == -1 ? LLONG_MAX : last); | ||
75 | if (flags & XBF_ASYNC) | ||
76 | return ret; | ||
77 | ret2 = xfs_wait_on_pages(ip, first, last); | ||
78 | if (!ret) | ||
79 | ret = ret2; | ||
80 | return ret; | ||
81 | } | ||
82 | |||
83 | int | ||
84 | xfs_wait_on_pages( | ||
85 | xfs_inode_t *ip, | ||
86 | xfs_off_t first, | ||
87 | xfs_off_t last) | ||
88 | { | ||
89 | struct address_space *mapping = VFS_I(ip)->i_mapping; | ||
90 | |||
91 | if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) { | ||
92 | return -filemap_fdatawait_range(mapping, first, | ||
93 | last == -1 ? XFS_ISIZE(ip) - 1 : last); | ||
94 | } | ||
95 | return 0; | ||
96 | } | ||
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 4beaede43277..94eaeedc5498 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c | |||
@@ -97,7 +97,9 @@ xfs_fs_geometry( | |||
97 | (xfs_sb_version_haslazysbcount(&mp->m_sb) ? | 97 | (xfs_sb_version_haslazysbcount(&mp->m_sb) ? |
98 | XFS_FSOP_GEOM_FLAGS_LAZYSB : 0) | | 98 | XFS_FSOP_GEOM_FLAGS_LAZYSB : 0) | |
99 | (xfs_sb_version_hasattr2(&mp->m_sb) ? | 99 | (xfs_sb_version_hasattr2(&mp->m_sb) ? |
100 | XFS_FSOP_GEOM_FLAGS_ATTR2 : 0); | 100 | XFS_FSOP_GEOM_FLAGS_ATTR2 : 0) | |
101 | (xfs_sb_version_hasprojid32bit(&mp->m_sb) ? | ||
102 | XFS_FSOP_GEOM_FLAGS_PROJID32 : 0); | ||
101 | geo->logsectsize = xfs_sb_version_hassector(&mp->m_sb) ? | 103 | geo->logsectsize = xfs_sb_version_hassector(&mp->m_sb) ? |
102 | mp->m_sb.sb_logsectsize : BBSIZE; | 104 | mp->m_sb.sb_logsectsize : BBSIZE; |
103 | geo->rtsectsize = mp->m_sb.sb_blocksize; | 105 | geo->rtsectsize = mp->m_sb.sb_blocksize; |
@@ -112,18 +114,40 @@ xfs_fs_geometry( | |||
112 | return 0; | 114 | return 0; |
113 | } | 115 | } |
114 | 116 | ||
117 | static struct xfs_buf * | ||
118 | xfs_growfs_get_hdr_buf( | ||
119 | struct xfs_mount *mp, | ||
120 | xfs_daddr_t blkno, | ||
121 | size_t numblks, | ||
122 | int flags, | ||
123 | const struct xfs_buf_ops *ops) | ||
124 | { | ||
125 | struct xfs_buf *bp; | ||
126 | |||
127 | bp = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, flags); | ||
128 | if (!bp) | ||
129 | return NULL; | ||
130 | |||
131 | xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); | ||
132 | bp->b_bn = blkno; | ||
133 | bp->b_maps[0].bm_bn = blkno; | ||
134 | bp->b_ops = ops; | ||
135 | |||
136 | return bp; | ||
137 | } | ||
138 | |||
115 | static int | 139 | static int |
116 | xfs_growfs_data_private( | 140 | xfs_growfs_data_private( |
117 | xfs_mount_t *mp, /* mount point for filesystem */ | 141 | xfs_mount_t *mp, /* mount point for filesystem */ |
118 | xfs_growfs_data_t *in) /* growfs data input struct */ | 142 | xfs_growfs_data_t *in) /* growfs data input struct */ |
119 | { | 143 | { |
120 | xfs_agf_t *agf; | 144 | xfs_agf_t *agf; |
145 | struct xfs_agfl *agfl; | ||
121 | xfs_agi_t *agi; | 146 | xfs_agi_t *agi; |
122 | xfs_agnumber_t agno; | 147 | xfs_agnumber_t agno; |
123 | xfs_extlen_t agsize; | 148 | xfs_extlen_t agsize; |
124 | xfs_extlen_t tmpsize; | 149 | xfs_extlen_t tmpsize; |
125 | xfs_alloc_rec_t *arec; | 150 | xfs_alloc_rec_t *arec; |
126 | struct xfs_btree_block *block; | ||
127 | xfs_buf_t *bp; | 151 | xfs_buf_t *bp; |
128 | int bucket; | 152 | int bucket; |
129 | int dpct; | 153 | int dpct; |
@@ -146,9 +170,14 @@ xfs_growfs_data_private( | |||
146 | dpct = pct - mp->m_sb.sb_imax_pct; | 170 | dpct = pct - mp->m_sb.sb_imax_pct; |
147 | bp = xfs_buf_read_uncached(mp->m_ddev_targp, | 171 | bp = xfs_buf_read_uncached(mp->m_ddev_targp, |
148 | XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1), | 172 | XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1), |
149 | XFS_FSS_TO_BB(mp, 1), 0); | 173 | XFS_FSS_TO_BB(mp, 1), 0, NULL); |
150 | if (!bp) | 174 | if (!bp) |
151 | return EIO; | 175 | return EIO; |
176 | if (bp->b_error) { | ||
177 | int error = bp->b_error; | ||
178 | xfs_buf_relse(bp); | ||
179 | return error; | ||
180 | } | ||
152 | xfs_buf_relse(bp); | 181 | xfs_buf_relse(bp); |
153 | 182 | ||
154 | new = nb; /* use new as a temporary here */ | 183 | new = nb; /* use new as a temporary here */ |
@@ -186,17 +215,18 @@ xfs_growfs_data_private( | |||
186 | nfree = 0; | 215 | nfree = 0; |
187 | for (agno = nagcount - 1; agno >= oagcount; agno--, new -= agsize) { | 216 | for (agno = nagcount - 1; agno >= oagcount; agno--, new -= agsize) { |
188 | /* | 217 | /* |
189 | * AG freelist header block | 218 | * AG freespace header block |
190 | */ | 219 | */ |
191 | bp = xfs_buf_get(mp->m_ddev_targp, | 220 | bp = xfs_growfs_get_hdr_buf(mp, |
192 | XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), | 221 | XFS_AG_DADDR(mp, agno, XFS_AGF_DADDR(mp)), |
193 | XFS_FSS_TO_BB(mp, 1), 0); | 222 | XFS_FSS_TO_BB(mp, 1), 0, |
223 | &xfs_agf_buf_ops); | ||
194 | if (!bp) { | 224 | if (!bp) { |
195 | error = ENOMEM; | 225 | error = ENOMEM; |
196 | goto error0; | 226 | goto error0; |
197 | } | 227 | } |
228 | |||
198 | agf = XFS_BUF_TO_AGF(bp); | 229 | agf = XFS_BUF_TO_AGF(bp); |
199 | memset(agf, 0, mp->m_sb.sb_sectsize); | ||
200 | agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC); | 230 | agf->agf_magicnum = cpu_to_be32(XFS_AGF_MAGIC); |
201 | agf->agf_versionnum = cpu_to_be32(XFS_AGF_VERSION); | 231 | agf->agf_versionnum = cpu_to_be32(XFS_AGF_VERSION); |
202 | agf->agf_seqno = cpu_to_be32(agno); | 232 | agf->agf_seqno = cpu_to_be32(agno); |
@@ -223,17 +253,39 @@ xfs_growfs_data_private( | |||
223 | goto error0; | 253 | goto error0; |
224 | 254 | ||
225 | /* | 255 | /* |
256 | * AG freelist header block | ||
257 | */ | ||
258 | bp = xfs_growfs_get_hdr_buf(mp, | ||
259 | XFS_AG_DADDR(mp, agno, XFS_AGFL_DADDR(mp)), | ||
260 | XFS_FSS_TO_BB(mp, 1), 0, | ||
261 | &xfs_agfl_buf_ops); | ||
262 | if (!bp) { | ||
263 | error = ENOMEM; | ||
264 | goto error0; | ||
265 | } | ||
266 | |||
267 | agfl = XFS_BUF_TO_AGFL(bp); | ||
268 | for (bucket = 0; bucket < XFS_AGFL_SIZE(mp); bucket++) | ||
269 | agfl->agfl_bno[bucket] = cpu_to_be32(NULLAGBLOCK); | ||
270 | |||
271 | error = xfs_bwrite(bp); | ||
272 | xfs_buf_relse(bp); | ||
273 | if (error) | ||
274 | goto error0; | ||
275 | |||
276 | /* | ||
226 | * AG inode header block | 277 | * AG inode header block |
227 | */ | 278 | */ |
228 | bp = xfs_buf_get(mp->m_ddev_targp, | 279 | bp = xfs_growfs_get_hdr_buf(mp, |
229 | XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), | 280 | XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), |
230 | XFS_FSS_TO_BB(mp, 1), 0); | 281 | XFS_FSS_TO_BB(mp, 1), 0, |
282 | &xfs_agi_buf_ops); | ||
231 | if (!bp) { | 283 | if (!bp) { |
232 | error = ENOMEM; | 284 | error = ENOMEM; |
233 | goto error0; | 285 | goto error0; |
234 | } | 286 | } |
287 | |||
235 | agi = XFS_BUF_TO_AGI(bp); | 288 | agi = XFS_BUF_TO_AGI(bp); |
236 | memset(agi, 0, mp->m_sb.sb_sectsize); | ||
237 | agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC); | 289 | agi->agi_magicnum = cpu_to_be32(XFS_AGI_MAGIC); |
238 | agi->agi_versionnum = cpu_to_be32(XFS_AGI_VERSION); | 290 | agi->agi_versionnum = cpu_to_be32(XFS_AGI_VERSION); |
239 | agi->agi_seqno = cpu_to_be32(agno); | 291 | agi->agi_seqno = cpu_to_be32(agno); |
@@ -254,24 +306,22 @@ xfs_growfs_data_private( | |||
254 | /* | 306 | /* |
255 | * BNO btree root block | 307 | * BNO btree root block |
256 | */ | 308 | */ |
257 | bp = xfs_buf_get(mp->m_ddev_targp, | 309 | bp = xfs_growfs_get_hdr_buf(mp, |
258 | XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)), | 310 | XFS_AGB_TO_DADDR(mp, agno, XFS_BNO_BLOCK(mp)), |
259 | BTOBB(mp->m_sb.sb_blocksize), 0); | 311 | BTOBB(mp->m_sb.sb_blocksize), 0, |
312 | &xfs_allocbt_buf_ops); | ||
313 | |||
260 | if (!bp) { | 314 | if (!bp) { |
261 | error = ENOMEM; | 315 | error = ENOMEM; |
262 | goto error0; | 316 | goto error0; |
263 | } | 317 | } |
264 | block = XFS_BUF_TO_BLOCK(bp); | 318 | |
265 | memset(block, 0, mp->m_sb.sb_blocksize); | 319 | xfs_btree_init_block(mp, bp, XFS_ABTB_MAGIC, 0, 1, 0); |
266 | block->bb_magic = cpu_to_be32(XFS_ABTB_MAGIC); | 320 | arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1); |
267 | block->bb_level = 0; | ||
268 | block->bb_numrecs = cpu_to_be16(1); | ||
269 | block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK); | ||
270 | block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK); | ||
271 | arec = XFS_ALLOC_REC_ADDR(mp, block, 1); | ||
272 | arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp)); | 321 | arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp)); |
273 | arec->ar_blockcount = cpu_to_be32( | 322 | arec->ar_blockcount = cpu_to_be32( |
274 | agsize - be32_to_cpu(arec->ar_startblock)); | 323 | agsize - be32_to_cpu(arec->ar_startblock)); |
324 | |||
275 | error = xfs_bwrite(bp); | 325 | error = xfs_bwrite(bp); |
276 | xfs_buf_relse(bp); | 326 | xfs_buf_relse(bp); |
277 | if (error) | 327 | if (error) |
@@ -280,25 +330,22 @@ xfs_growfs_data_private( | |||
280 | /* | 330 | /* |
281 | * CNT btree root block | 331 | * CNT btree root block |
282 | */ | 332 | */ |
283 | bp = xfs_buf_get(mp->m_ddev_targp, | 333 | bp = xfs_growfs_get_hdr_buf(mp, |
284 | XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)), | 334 | XFS_AGB_TO_DADDR(mp, agno, XFS_CNT_BLOCK(mp)), |
285 | BTOBB(mp->m_sb.sb_blocksize), 0); | 335 | BTOBB(mp->m_sb.sb_blocksize), 0, |
336 | &xfs_allocbt_buf_ops); | ||
286 | if (!bp) { | 337 | if (!bp) { |
287 | error = ENOMEM; | 338 | error = ENOMEM; |
288 | goto error0; | 339 | goto error0; |
289 | } | 340 | } |
290 | block = XFS_BUF_TO_BLOCK(bp); | 341 | |
291 | memset(block, 0, mp->m_sb.sb_blocksize); | 342 | xfs_btree_init_block(mp, bp, XFS_ABTC_MAGIC, 0, 1, 0); |
292 | block->bb_magic = cpu_to_be32(XFS_ABTC_MAGIC); | 343 | arec = XFS_ALLOC_REC_ADDR(mp, XFS_BUF_TO_BLOCK(bp), 1); |
293 | block->bb_level = 0; | ||
294 | block->bb_numrecs = cpu_to_be16(1); | ||
295 | block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK); | ||
296 | block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK); | ||
297 | arec = XFS_ALLOC_REC_ADDR(mp, block, 1); | ||
298 | arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp)); | 344 | arec->ar_startblock = cpu_to_be32(XFS_PREALLOC_BLOCKS(mp)); |
299 | arec->ar_blockcount = cpu_to_be32( | 345 | arec->ar_blockcount = cpu_to_be32( |
300 | agsize - be32_to_cpu(arec->ar_startblock)); | 346 | agsize - be32_to_cpu(arec->ar_startblock)); |
301 | nfree += be32_to_cpu(arec->ar_blockcount); | 347 | nfree += be32_to_cpu(arec->ar_blockcount); |
348 | |||
302 | error = xfs_bwrite(bp); | 349 | error = xfs_bwrite(bp); |
303 | xfs_buf_relse(bp); | 350 | xfs_buf_relse(bp); |
304 | if (error) | 351 | if (error) |
@@ -307,20 +354,17 @@ xfs_growfs_data_private( | |||
307 | /* | 354 | /* |
308 | * INO btree root block | 355 | * INO btree root block |
309 | */ | 356 | */ |
310 | bp = xfs_buf_get(mp->m_ddev_targp, | 357 | bp = xfs_growfs_get_hdr_buf(mp, |
311 | XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)), | 358 | XFS_AGB_TO_DADDR(mp, agno, XFS_IBT_BLOCK(mp)), |
312 | BTOBB(mp->m_sb.sb_blocksize), 0); | 359 | BTOBB(mp->m_sb.sb_blocksize), 0, |
360 | &xfs_inobt_buf_ops); | ||
313 | if (!bp) { | 361 | if (!bp) { |
314 | error = ENOMEM; | 362 | error = ENOMEM; |
315 | goto error0; | 363 | goto error0; |
316 | } | 364 | } |
317 | block = XFS_BUF_TO_BLOCK(bp); | 365 | |
318 | memset(block, 0, mp->m_sb.sb_blocksize); | 366 | xfs_btree_init_block(mp, bp, XFS_IBT_MAGIC, 0, 0, 0); |
319 | block->bb_magic = cpu_to_be32(XFS_IBT_MAGIC); | 367 | |
320 | block->bb_level = 0; | ||
321 | block->bb_numrecs = 0; | ||
322 | block->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK); | ||
323 | block->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK); | ||
324 | error = xfs_bwrite(bp); | 368 | error = xfs_bwrite(bp); |
325 | xfs_buf_relse(bp); | 369 | xfs_buf_relse(bp); |
326 | if (error) | 370 | if (error) |
@@ -408,14 +452,16 @@ xfs_growfs_data_private( | |||
408 | if (agno < oagcount) { | 452 | if (agno < oagcount) { |
409 | error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, | 453 | error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, |
410 | XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), | 454 | XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), |
411 | XFS_FSS_TO_BB(mp, 1), 0, &bp); | 455 | XFS_FSS_TO_BB(mp, 1), 0, &bp, |
456 | &xfs_sb_buf_ops); | ||
412 | } else { | 457 | } else { |
413 | bp = xfs_trans_get_buf(NULL, mp->m_ddev_targp, | 458 | bp = xfs_trans_get_buf(NULL, mp->m_ddev_targp, |
414 | XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), | 459 | XFS_AGB_TO_DADDR(mp, agno, XFS_SB_BLOCK(mp)), |
415 | XFS_FSS_TO_BB(mp, 1), 0); | 460 | XFS_FSS_TO_BB(mp, 1), 0); |
416 | if (bp) | 461 | if (bp) { |
462 | bp->b_ops = &xfs_sb_buf_ops; | ||
417 | xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); | 463 | xfs_buf_zero(bp, 0, BBTOB(bp->b_length)); |
418 | else | 464 | } else |
419 | error = ENOMEM; | 465 | error = ENOMEM; |
420 | } | 466 | } |
421 | 467 | ||
@@ -426,6 +472,7 @@ xfs_growfs_data_private( | |||
426 | break; | 472 | break; |
427 | } | 473 | } |
428 | xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb, XFS_SB_ALL_BITS); | 474 | xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb, XFS_SB_ALL_BITS); |
475 | |||
429 | /* | 476 | /* |
430 | * If we get an error writing out the alternate superblocks, | 477 | * If we get an error writing out the alternate superblocks, |
431 | * just issue a warning and continue. The real work is | 478 | * just issue a warning and continue. The real work is |
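With xfs_growfs_get_hdr_buf() in place, every header write in the growfs loop above reduces to the same shape; only the disk address, the verifier, and the field initialisation differ per header. The shape in isolation (daddr and the foo names stand in for the per-header specifics):

	bp = xfs_growfs_get_hdr_buf(mp, daddr, XFS_FSS_TO_BB(mp, 1), 0,
				    &xfs_foo_buf_ops);	/* zeroed, ops set */
	if (!bp) {
		error = ENOMEM;
		goto error0;
	}
	/* ... fill in the header fields ... */
	error = xfs_bwrite(bp);
	xfs_buf_relse(bp);
	if (error)
		goto error0;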
diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c index 76e81cff70b9..5399ef222dd7 100644 --- a/fs/xfs/xfs_globals.c +++ b/fs/xfs/xfs_globals.c | |||
@@ -21,7 +21,8 @@ | |||
21 | /* | 21 | /* |
22 | * Tunable XFS parameters. xfs_params is required even when CONFIG_SYSCTL=n, | 22 | * Tunable XFS parameters. xfs_params is required even when CONFIG_SYSCTL=n, |
23 | * other XFS code uses these values. Times are measured in centisecs (i.e. | 23 | * other XFS code uses these values. Times are measured in centisecs (i.e. |
24 | * 100ths of a second). | 24 | * 100ths of a second) with the exception of eofb_timer, which is measured in |
25 | * seconds. | ||
25 | */ | 26 | */ |
26 | xfs_param_t xfs_params = { | 27 | xfs_param_t xfs_params = { |
27 | /* MIN DFLT MAX */ | 28 | /* MIN DFLT MAX */ |
@@ -40,4 +41,5 @@ xfs_param_t xfs_params = { | |||
40 | .rotorstep = { 1, 1, 255 }, | 41 | .rotorstep = { 1, 1, 255 }, |
41 | .inherit_nodfrg = { 0, 1, 1 }, | 42 | .inherit_nodfrg = { 0, 1, 1 }, |
42 | .fstrm_timer = { 1, 30*100, 3600*100}, | 43 | .fstrm_timer = { 1, 30*100, 3600*100}, |
44 | .eofb_timer = { 1, 300, 3600*24}, | ||
43 | }; | 45 | }; |
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index c5c4ef4f2bdb..a815412eab80 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c | |||
@@ -200,7 +200,8 @@ xfs_ialloc_inode_init( | |||
200 | */ | 200 | */ |
201 | d = XFS_AGB_TO_DADDR(mp, agno, agbno + (j * blks_per_cluster)); | 201 | d = XFS_AGB_TO_DADDR(mp, agno, agbno + (j * blks_per_cluster)); |
202 | fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, | 202 | fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, |
203 | mp->m_bsize * blks_per_cluster, 0); | 203 | mp->m_bsize * blks_per_cluster, |
204 | XBF_UNMAPPED); | ||
204 | if (!fbuf) | 205 | if (!fbuf) |
205 | return ENOMEM; | 206 | return ENOMEM; |
206 | /* | 207 | /* |
@@ -210,6 +211,7 @@ xfs_ialloc_inode_init( | |||
210 | * to log a whole cluster of inodes instead of all the | 211 | * to log a whole cluster of inodes instead of all the |
211 | * individual transactions causing a lot of log traffic. | 212 | * individual transactions causing a lot of log traffic. |
212 | */ | 213 | */ |
214 | fbuf->b_ops = &xfs_inode_buf_ops; | ||
213 | xfs_buf_zero(fbuf, 0, ninodes << mp->m_sb.sb_inodelog); | 215 | xfs_buf_zero(fbuf, 0, ninodes << mp->m_sb.sb_inodelog); |
214 | for (i = 0; i < ninodes; i++) { | 216 | for (i = 0; i < ninodes; i++) { |
215 | int ioffset = i << mp->m_sb.sb_inodelog; | 217 | int ioffset = i << mp->m_sb.sb_inodelog; |
@@ -877,9 +879,9 @@ error0: | |||
877 | * This function is designed to be called twice if it has to do an allocation | 879 | * This function is designed to be called twice if it has to do an allocation |
878 | * to make more free inodes. On the first call, *IO_agbp should be set to NULL. | 880 | * to make more free inodes. On the first call, *IO_agbp should be set to NULL. |
879 | * If an inode is available without having to perform an allocation, an inode | 881 | * If an inode is available without having to perform an allocation, an inode |
880 | * number is returned. In this case, *IO_agbp would be NULL. If an allocation | 882 | * number is returned. In this case, *IO_agbp is set to NULL. If an allocation |
881 | * needes to be done, xfs_dialloc would return the current AGI buffer in | 883 | * needs to be done, xfs_dialloc returns the current AGI buffer in *IO_agbp. |
882 | * *IO_agbp. The caller should then commit the current transaction, allocate a | 884 | * The caller should then commit the current transaction, allocate a |
883 | * new transaction, and call xfs_dialloc() again, passing in the previous value | 885 | * new transaction, and call xfs_dialloc() again, passing in the previous value |
884 | * of *IO_agbp. IO_agbp should be held across the transactions. Since the AGI | 886 | * of *IO_agbp. IO_agbp should be held across the transactions. Since the AGI |
885 | * buffer is locked across the two calls, the second call is guaranteed to have | 887 | * buffer is locked across the two calls, the second call is guaranteed to have |
@@ -1472,6 +1474,57 @@ xfs_check_agi_unlinked( | |||
1472 | #define xfs_check_agi_unlinked(agi) | 1474 | #define xfs_check_agi_unlinked(agi) |
1473 | #endif | 1475 | #endif |
1474 | 1476 | ||
1477 | static void | ||
1478 | xfs_agi_verify( | ||
1479 | struct xfs_buf *bp) | ||
1480 | { | ||
1481 | struct xfs_mount *mp = bp->b_target->bt_mount; | ||
1482 | struct xfs_agi *agi = XFS_BUF_TO_AGI(bp); | ||
1483 | int agi_ok; | ||
1484 | |||
1485 | /* | ||
1486 | * Validate the magic number of the agi block. | ||
1487 | */ | ||
1488 | agi_ok = agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC) && | ||
1489 | XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)); | ||
1490 | |||
1491 | /* | ||
1492 | * during growfs operations, the perag is not fully initialised, | ||
1493 | * so we can't use it for any useful checking. growfs ensures we can't | ||
1494 | * use it by using uncached buffers that don't have the perag attached | ||
1495 | * so we can detect and avoid this problem. | ||
1496 | */ | ||
1497 | if (bp->b_pag) | ||
1498 | agi_ok = agi_ok && be32_to_cpu(agi->agi_seqno) == | ||
1499 | bp->b_pag->pag_agno; | ||
1500 | |||
1501 | if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IALLOC_READ_AGI, | ||
1502 | XFS_RANDOM_IALLOC_READ_AGI))) { | ||
1503 | XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, agi); | ||
1504 | xfs_buf_ioerror(bp, EFSCORRUPTED); | ||
1505 | } | ||
1506 | xfs_check_agi_unlinked(agi); | ||
1507 | } | ||
1508 | |||
1509 | static void | ||
1510 | xfs_agi_read_verify( | ||
1511 | struct xfs_buf *bp) | ||
1512 | { | ||
1513 | xfs_agi_verify(bp); | ||
1514 | } | ||
1515 | |||
1516 | static void | ||
1517 | xfs_agi_write_verify( | ||
1518 | struct xfs_buf *bp) | ||
1519 | { | ||
1520 | xfs_agi_verify(bp); | ||
1521 | } | ||
1522 | |||
1523 | const struct xfs_buf_ops xfs_agi_buf_ops = { | ||
1524 | .verify_read = xfs_agi_read_verify, | ||
1525 | .verify_write = xfs_agi_write_verify, | ||
1526 | }; | ||
1527 | |||
1475 | /* | 1528 | /* |
1476 | * Read in the allocation group header (inode allocation section) | 1529 | * Read in the allocation group header (inode allocation section) |
1477 | */ | 1530 | */ |
@@ -1482,38 +1535,18 @@ xfs_read_agi( | |||
1482 | xfs_agnumber_t agno, /* allocation group number */ | 1535 | xfs_agnumber_t agno, /* allocation group number */ |
1483 | struct xfs_buf **bpp) /* allocation group hdr buf */ | 1536 | struct xfs_buf **bpp) /* allocation group hdr buf */ |
1484 | { | 1537 | { |
1485 | struct xfs_agi *agi; /* allocation group header */ | ||
1486 | int agi_ok; /* agi is consistent */ | ||
1487 | int error; | 1538 | int error; |
1488 | 1539 | ||
1489 | ASSERT(agno != NULLAGNUMBER); | 1540 | ASSERT(agno != NULLAGNUMBER); |
1490 | 1541 | ||
1491 | error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, | 1542 | error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, |
1492 | XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), | 1543 | XFS_AG_DADDR(mp, agno, XFS_AGI_DADDR(mp)), |
1493 | XFS_FSS_TO_BB(mp, 1), 0, bpp); | 1544 | XFS_FSS_TO_BB(mp, 1), 0, bpp, &xfs_agi_buf_ops); |
1494 | if (error) | 1545 | if (error) |
1495 | return error; | 1546 | return error; |
1496 | 1547 | ||
1497 | ASSERT(!xfs_buf_geterror(*bpp)); | 1548 | ASSERT(!xfs_buf_geterror(*bpp)); |
1498 | agi = XFS_BUF_TO_AGI(*bpp); | ||
1499 | |||
1500 | /* | ||
1501 | * Validate the magic number of the agi block. | ||
1502 | */ | ||
1503 | agi_ok = agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC) && | ||
1504 | XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum)) && | ||
1505 | be32_to_cpu(agi->agi_seqno) == agno; | ||
1506 | if (unlikely(XFS_TEST_ERROR(!agi_ok, mp, XFS_ERRTAG_IALLOC_READ_AGI, | ||
1507 | XFS_RANDOM_IALLOC_READ_AGI))) { | ||
1508 | XFS_CORRUPTION_ERROR("xfs_read_agi", XFS_ERRLEVEL_LOW, | ||
1509 | mp, agi); | ||
1510 | xfs_trans_brelse(tp, *bpp); | ||
1511 | return XFS_ERROR(EFSCORRUPTED); | ||
1512 | } | ||
1513 | |||
1514 | xfs_buf_set_ref(*bpp, XFS_AGI_REF); | 1549 | xfs_buf_set_ref(*bpp, XFS_AGI_REF); |
1515 | |||
1516 | xfs_check_agi_unlinked(agi); | ||
1517 | return 0; | 1550 | return 0; |
1518 | } | 1551 | } |
1519 | 1552 | ||
diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h index 1fd6ea4e9c91..c8da3df271e6 100644 --- a/fs/xfs/xfs_ialloc.h +++ b/fs/xfs/xfs_ialloc.h | |||
@@ -147,7 +147,9 @@ int xfs_inobt_lookup(struct xfs_btree_cur *cur, xfs_agino_t ino, | |||
147 | /* | 147 | /* |
148 | * Get the data from the pointed-to record. | 148 | * Get the data from the pointed-to record. |
149 | */ | 149 | */ |
150 | extern int xfs_inobt_get_rec(struct xfs_btree_cur *cur, | 150 | int xfs_inobt_get_rec(struct xfs_btree_cur *cur, |
151 | xfs_inobt_rec_incore_t *rec, int *stat); | 151 | xfs_inobt_rec_incore_t *rec, int *stat); |
152 | 152 | ||
153 | extern const struct xfs_buf_ops xfs_agi_buf_ops; | ||
154 | |||
153 | #endif /* __XFS_IALLOC_H__ */ | 155 | #endif /* __XFS_IALLOC_H__ */ |
diff --git a/fs/xfs/xfs_ialloc_btree.c b/fs/xfs/xfs_ialloc_btree.c index 2b8b7a37aa18..bec344b36507 100644 --- a/fs/xfs/xfs_ialloc_btree.c +++ b/fs/xfs/xfs_ialloc_btree.c | |||
@@ -33,6 +33,7 @@ | |||
33 | #include "xfs_ialloc.h" | 33 | #include "xfs_ialloc.h" |
34 | #include "xfs_alloc.h" | 34 | #include "xfs_alloc.h" |
35 | #include "xfs_error.h" | 35 | #include "xfs_error.h" |
36 | #include "xfs_trace.h" | ||
36 | 37 | ||
37 | 38 | ||
38 | STATIC int | 39 | STATIC int |
@@ -181,6 +182,59 @@ xfs_inobt_key_diff( | |||
181 | cur->bc_rec.i.ir_startino; | 182 | cur->bc_rec.i.ir_startino; |
182 | } | 183 | } |
183 | 184 | ||
185 | void | ||
186 | xfs_inobt_verify( | ||
187 | struct xfs_buf *bp) | ||
188 | { | ||
189 | struct xfs_mount *mp = bp->b_target->bt_mount; | ||
190 | struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); | ||
191 | unsigned int level; | ||
192 | int sblock_ok; /* block passes checks */ | ||
193 | |||
194 | /* magic number and level verification */ | ||
195 | level = be16_to_cpu(block->bb_level); | ||
196 | sblock_ok = block->bb_magic == cpu_to_be32(XFS_IBT_MAGIC) && | ||
197 | level < mp->m_in_maxlevels; | ||
198 | |||
199 | /* numrecs verification */ | ||
200 | sblock_ok = sblock_ok && | ||
201 | be16_to_cpu(block->bb_numrecs) <= mp->m_inobt_mxr[level != 0]; | ||
202 | |||
203 | /* sibling pointer verification */ | ||
204 | sblock_ok = sblock_ok && | ||
205 | (block->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK) || | ||
206 | be32_to_cpu(block->bb_u.s.bb_leftsib) < mp->m_sb.sb_agblocks) && | ||
207 | block->bb_u.s.bb_leftsib && | ||
208 | (block->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK) || | ||
209 | be32_to_cpu(block->bb_u.s.bb_rightsib) < mp->m_sb.sb_agblocks) && | ||
210 | block->bb_u.s.bb_rightsib; | ||
211 | |||
212 | if (!sblock_ok) { | ||
213 | trace_xfs_btree_corrupt(bp, _RET_IP_); | ||
214 | XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp, block); | ||
215 | xfs_buf_ioerror(bp, EFSCORRUPTED); | ||
216 | } | ||
217 | } | ||
218 | |||
219 | static void | ||
220 | xfs_inobt_read_verify( | ||
221 | struct xfs_buf *bp) | ||
222 | { | ||
223 | xfs_inobt_verify(bp); | ||
224 | } | ||
225 | |||
226 | static void | ||
227 | xfs_inobt_write_verify( | ||
228 | struct xfs_buf *bp) | ||
229 | { | ||
230 | xfs_inobt_verify(bp); | ||
231 | } | ||
232 | |||
233 | const struct xfs_buf_ops xfs_inobt_buf_ops = { | ||
234 | .verify_read = xfs_inobt_read_verify, | ||
235 | .verify_write = xfs_inobt_write_verify, | ||
236 | }; | ||
237 | |||
184 | #ifdef DEBUG | 238 | #ifdef DEBUG |
185 | STATIC int | 239 | STATIC int |
186 | xfs_inobt_keys_inorder( | 240 | xfs_inobt_keys_inorder( |
@@ -218,6 +272,7 @@ static const struct xfs_btree_ops xfs_inobt_ops = { | |||
218 | .init_rec_from_cur = xfs_inobt_init_rec_from_cur, | 272 | .init_rec_from_cur = xfs_inobt_init_rec_from_cur, |
219 | .init_ptr_from_cur = xfs_inobt_init_ptr_from_cur, | 273 | .init_ptr_from_cur = xfs_inobt_init_ptr_from_cur, |
220 | .key_diff = xfs_inobt_key_diff, | 274 | .key_diff = xfs_inobt_key_diff, |
275 | .buf_ops = &xfs_inobt_buf_ops, | ||
221 | #ifdef DEBUG | 276 | #ifdef DEBUG |
222 | .keys_inorder = xfs_inobt_keys_inorder, | 277 | .keys_inorder = xfs_inobt_keys_inorder, |
223 | .recs_inorder = xfs_inobt_recs_inorder, | 278 | .recs_inorder = xfs_inobt_recs_inorder, |
diff --git a/fs/xfs/xfs_ialloc_btree.h b/fs/xfs/xfs_ialloc_btree.h index f782ad0c4769..25c0239a8eab 100644 --- a/fs/xfs/xfs_ialloc_btree.h +++ b/fs/xfs/xfs_ialloc_btree.h | |||
@@ -109,4 +109,6 @@ extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_mount *, | |||
109 | struct xfs_trans *, struct xfs_buf *, xfs_agnumber_t); | 109 | struct xfs_trans *, struct xfs_buf *, xfs_agnumber_t); |
110 | extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int); | 110 | extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int); |
111 | 111 | ||
112 | extern const struct xfs_buf_ops xfs_inobt_buf_ops; | ||
113 | |||
112 | #endif /* __XFS_IALLOC_BTREE_H__ */ | 114 | #endif /* __XFS_IALLOC_BTREE_H__ */ |
diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_icache.c index 9500caf15acf..96e344e3e927 100644 --- a/fs/xfs/xfs_sync.c +++ b/fs/xfs/xfs_icache.c | |||
@@ -19,6 +19,7 @@ | |||
19 | #include "xfs_fs.h" | 19 | #include "xfs_fs.h" |
20 | #include "xfs_types.h" | 20 | #include "xfs_types.h" |
21 | #include "xfs_log.h" | 21 | #include "xfs_log.h" |
22 | #include "xfs_log_priv.h" | ||
22 | #include "xfs_inum.h" | 23 | #include "xfs_inum.h" |
23 | #include "xfs_trans.h" | 24 | #include "xfs_trans.h" |
24 | #include "xfs_trans_priv.h" | 25 | #include "xfs_trans_priv.h" |
@@ -35,11 +36,425 @@ | |||
35 | #include "xfs_quota.h" | 36 | #include "xfs_quota.h" |
36 | #include "xfs_trace.h" | 37 | #include "xfs_trace.h" |
37 | #include "xfs_fsops.h" | 38 | #include "xfs_fsops.h" |
39 | #include "xfs_icache.h" | ||
38 | 40 | ||
39 | #include <linux/kthread.h> | 41 | #include <linux/kthread.h> |
40 | #include <linux/freezer.h> | 42 | #include <linux/freezer.h> |
41 | 43 | ||
42 | struct workqueue_struct *xfs_syncd_wq; /* sync workqueue */ | 44 | STATIC void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, |
45 | struct xfs_perag *pag, struct xfs_inode *ip); | ||
46 | |||
47 | /* | ||
48 | * Allocate and initialise an xfs_inode. | ||
49 | */ | ||
50 | STATIC struct xfs_inode * | ||
51 | xfs_inode_alloc( | ||
52 | struct xfs_mount *mp, | ||
53 | xfs_ino_t ino) | ||
54 | { | ||
55 | struct xfs_inode *ip; | ||
56 | |||
57 | /* | ||
58 | * if this didn't occur in transactions, we could use | ||
59 | * KM_MAYFAIL and return NULL here on ENOMEM. Set the | ||
60 | * code up to do this anyway. | ||
61 | */ | ||
62 | ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP); | ||
63 | if (!ip) | ||
64 | return NULL; | ||
65 | if (inode_init_always(mp->m_super, VFS_I(ip))) { | ||
66 | kmem_zone_free(xfs_inode_zone, ip); | ||
67 | return NULL; | ||
68 | } | ||
69 | |||
70 | ASSERT(atomic_read(&ip->i_pincount) == 0); | ||
71 | ASSERT(!spin_is_locked(&ip->i_flags_lock)); | ||
72 | ASSERT(!xfs_isiflocked(ip)); | ||
73 | ASSERT(ip->i_ino == 0); | ||
74 | |||
75 | mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); | ||
76 | |||
77 | /* initialise the xfs inode */ | ||
78 | ip->i_ino = ino; | ||
79 | ip->i_mount = mp; | ||
80 | memset(&ip->i_imap, 0, sizeof(struct xfs_imap)); | ||
81 | ip->i_afp = NULL; | ||
82 | memset(&ip->i_df, 0, sizeof(xfs_ifork_t)); | ||
83 | ip->i_flags = 0; | ||
84 | ip->i_delayed_blks = 0; | ||
85 | memset(&ip->i_d, 0, sizeof(xfs_icdinode_t)); | ||
86 | |||
87 | return ip; | ||
88 | } | ||
89 | |||
90 | STATIC void | ||
91 | xfs_inode_free_callback( | ||
92 | struct rcu_head *head) | ||
93 | { | ||
94 | struct inode *inode = container_of(head, struct inode, i_rcu); | ||
95 | struct xfs_inode *ip = XFS_I(inode); | ||
96 | |||
97 | kmem_zone_free(xfs_inode_zone, ip); | ||
98 | } | ||
99 | |||
100 | STATIC void | ||
101 | xfs_inode_free( | ||
102 | struct xfs_inode *ip) | ||
103 | { | ||
104 | switch (ip->i_d.di_mode & S_IFMT) { | ||
105 | case S_IFREG: | ||
106 | case S_IFDIR: | ||
107 | case S_IFLNK: | ||
108 | xfs_idestroy_fork(ip, XFS_DATA_FORK); | ||
109 | break; | ||
110 | } | ||
111 | |||
112 | if (ip->i_afp) | ||
113 | xfs_idestroy_fork(ip, XFS_ATTR_FORK); | ||
114 | |||
115 | if (ip->i_itemp) { | ||
116 | ASSERT(!(ip->i_itemp->ili_item.li_flags & XFS_LI_IN_AIL)); | ||
117 | xfs_inode_item_destroy(ip); | ||
118 | ip->i_itemp = NULL; | ||
119 | } | ||
120 | |||
121 | /* asserts to verify all state is correct here */ | ||
122 | ASSERT(atomic_read(&ip->i_pincount) == 0); | ||
123 | ASSERT(!spin_is_locked(&ip->i_flags_lock)); | ||
124 | ASSERT(!xfs_isiflocked(ip)); | ||
125 | |||
126 | /* | ||
127 | * Because we use RCU freeing we need to ensure the inode always | ||
128 | * appears to be reclaimed with an invalid inode number when in the | ||
129 | * free state. The ip->i_flags_lock provides the barrier against lookup | ||
130 | * races. | ||
131 | */ | ||
132 | spin_lock(&ip->i_flags_lock); | ||
133 | ip->i_flags = XFS_IRECLAIM; | ||
134 | ip->i_ino = 0; | ||
135 | spin_unlock(&ip->i_flags_lock); | ||
136 | |||
137 | call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback); | ||
138 | } | ||
139 | |||
140 | /* | ||
141 | * Check the validity of the inode we just found in the cache | ||
142 | */ | ||
143 | static int | ||
144 | xfs_iget_cache_hit( | ||
145 | struct xfs_perag *pag, | ||
146 | struct xfs_inode *ip, | ||
147 | xfs_ino_t ino, | ||
148 | int flags, | ||
149 | int lock_flags) __releases(RCU) | ||
150 | { | ||
151 | struct inode *inode = VFS_I(ip); | ||
152 | struct xfs_mount *mp = ip->i_mount; | ||
153 | int error; | ||
154 | |||
155 | /* | ||
156 | * check for re-use of an inode within an RCU grace period due to the | ||
157 | * radix tree nodes not being updated yet. We monitor for this by | ||
158 | * setting the inode number to zero before freeing the inode structure. | ||
159 | * If the inode has been reallocated and set up, then the inode number | ||
160 | * will not match, so check for that, too. | ||
161 | */ | ||
162 | spin_lock(&ip->i_flags_lock); | ||
163 | if (ip->i_ino != ino) { | ||
164 | trace_xfs_iget_skip(ip); | ||
165 | XFS_STATS_INC(xs_ig_frecycle); | ||
166 | error = EAGAIN; | ||
167 | goto out_error; | ||
168 | } | ||
169 | |||
170 | |||
171 | /* | ||
172 | * If we are racing with another cache hit that is currently | ||
173 | * instantiating this inode or currently recycling it out of | ||
174 | * reclaimable state, wait for the initialisation to complete | ||
175 | * before continuing. | ||
176 | * | ||
177 | * XXX(hch): eventually we should do something equivalent to | ||
178 | * wait_on_inode to wait for these flags to be cleared | ||
179 | * instead of polling for it. | ||
180 | */ | ||
181 | if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) { | ||
182 | trace_xfs_iget_skip(ip); | ||
183 | XFS_STATS_INC(xs_ig_frecycle); | ||
184 | error = EAGAIN; | ||
185 | goto out_error; | ||
186 | } | ||
187 | |||
188 | /* | ||
189 | * If lookup is racing with unlink return an error immediately. | ||
190 | */ | ||
191 | if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) { | ||
192 | error = ENOENT; | ||
193 | goto out_error; | ||
194 | } | ||
195 | |||
196 | /* | ||
197 | * If IRECLAIMABLE is set, we've torn down the VFS inode already. | ||
198 | * Need to carefully get it back into usable state. | ||
199 | */ | ||
200 | if (ip->i_flags & XFS_IRECLAIMABLE) { | ||
201 | trace_xfs_iget_reclaim(ip); | ||
202 | |||
203 | /* | ||
204 | * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode | ||
205 | * from stomping over us while we recycle the inode. We can't | ||
206 | * clear the radix tree reclaimable tag yet as it requires | ||
207 | * pag_ici_lock to be held exclusive. | ||
208 | */ | ||
209 | ip->i_flags |= XFS_IRECLAIM; | ||
210 | |||
211 | spin_unlock(&ip->i_flags_lock); | ||
212 | rcu_read_unlock(); | ||
213 | |||
214 | error = -inode_init_always(mp->m_super, inode); | ||
215 | if (error) { | ||
216 | /* | ||
217 | * Re-initializing the inode failed, and we are in deep | ||
218 | * trouble. Try to re-add it to the reclaim list. | ||
219 | */ | ||
220 | rcu_read_lock(); | ||
221 | spin_lock(&ip->i_flags_lock); | ||
222 | |||
223 | ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM); | ||
224 | ASSERT(ip->i_flags & XFS_IRECLAIMABLE); | ||
225 | trace_xfs_iget_reclaim_fail(ip); | ||
226 | goto out_error; | ||
227 | } | ||
228 | |||
229 | spin_lock(&pag->pag_ici_lock); | ||
230 | spin_lock(&ip->i_flags_lock); | ||
231 | |||
232 | /* | ||
233 | * Clear the per-lifetime state in the inode as we are now | ||
234 | * effectively a new inode and need to return to the initial | ||
235 | * state before reuse occurs. | ||
236 | */ | ||
237 | ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS; | ||
238 | ip->i_flags |= XFS_INEW; | ||
239 | __xfs_inode_clear_reclaim_tag(mp, pag, ip); | ||
240 | inode->i_state = I_NEW; | ||
241 | |||
242 | ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); | ||
243 | mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); | ||
244 | |||
245 | spin_unlock(&ip->i_flags_lock); | ||
246 | spin_unlock(&pag->pag_ici_lock); | ||
247 | } else { | ||
248 | /* If the VFS inode is being torn down, pause and try again. */ | ||
249 | if (!igrab(inode)) { | ||
250 | trace_xfs_iget_skip(ip); | ||
251 | error = EAGAIN; | ||
252 | goto out_error; | ||
253 | } | ||
254 | |||
255 | /* We've got a live one. */ | ||
256 | spin_unlock(&ip->i_flags_lock); | ||
257 | rcu_read_unlock(); | ||
258 | trace_xfs_iget_hit(ip); | ||
259 | } | ||
260 | |||
261 | if (lock_flags != 0) | ||
262 | xfs_ilock(ip, lock_flags); | ||
263 | |||
264 | xfs_iflags_clear(ip, XFS_ISTALE | XFS_IDONTCACHE); | ||
265 | XFS_STATS_INC(xs_ig_found); | ||
266 | |||
267 | return 0; | ||
268 | |||
269 | out_error: | ||
270 | spin_unlock(&ip->i_flags_lock); | ||
271 | rcu_read_unlock(); | ||
272 | return error; | ||
273 | } | ||
274 | |||
275 | |||
276 | static int | ||
277 | xfs_iget_cache_miss( | ||
278 | struct xfs_mount *mp, | ||
279 | struct xfs_perag *pag, | ||
280 | xfs_trans_t *tp, | ||
281 | xfs_ino_t ino, | ||
282 | struct xfs_inode **ipp, | ||
283 | int flags, | ||
284 | int lock_flags) | ||
285 | { | ||
286 | struct xfs_inode *ip; | ||
287 | int error; | ||
288 | xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ino); | ||
289 | int iflags; | ||
290 | |||
291 | ip = xfs_inode_alloc(mp, ino); | ||
292 | if (!ip) | ||
293 | return ENOMEM; | ||
294 | |||
295 | error = xfs_iread(mp, tp, ip, flags); | ||
296 | if (error) | ||
297 | goto out_destroy; | ||
298 | |||
299 | trace_xfs_iget_miss(ip); | ||
300 | |||
301 | if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) { | ||
302 | error = ENOENT; | ||
303 | goto out_destroy; | ||
304 | } | ||
305 | |||
306 | /* | ||
307 | * Preload the radix tree so we can insert safely under the | ||
308 | * write spinlock. Note that we cannot sleep inside the preload | ||
309 | * region. Since we can be called from transaction context, don't | ||
310 | * recurse into the file system. | ||
311 | */ | ||
312 | if (radix_tree_preload(GFP_NOFS)) { | ||
313 | error = EAGAIN; | ||
314 | goto out_destroy; | ||
315 | } | ||
316 | |||
317 | /* | ||
318 | * Because the inode hasn't been added to the radix-tree yet it can't | ||
319 | * be found by another thread, so we can do the non-sleeping lock here. | ||
320 | */ | ||
321 | if (lock_flags) { | ||
322 | if (!xfs_ilock_nowait(ip, lock_flags)) | ||
323 | BUG(); | ||
324 | } | ||
325 | |||
326 | /* | ||
327 | * These values must be set before inserting the inode into the radix | ||
328 | * tree as the moment it is inserted a concurrent lookup (allowed by the | ||
329 | * RCU locking mechanism) can find it and that lookup must see that this | ||
330 | * is an inode currently under construction (i.e. that XFS_INEW is set). | ||
331 | * The ip->i_flags_lock that protects the XFS_INEW flag forms the | ||
332 | * memory barrier that ensures this detection works correctly at lookup | ||
333 | * time. | ||
334 | */ | ||
335 | iflags = XFS_INEW; | ||
336 | if (flags & XFS_IGET_DONTCACHE) | ||
337 | iflags |= XFS_IDONTCACHE; | ||
338 | ip->i_udquot = ip->i_gdquot = NULL; | ||
339 | xfs_iflags_set(ip, iflags); | ||
340 | |||
341 | /* insert the new inode */ | ||
342 | spin_lock(&pag->pag_ici_lock); | ||
343 | error = radix_tree_insert(&pag->pag_ici_root, agino, ip); | ||
344 | if (unlikely(error)) { | ||
345 | WARN_ON(error != -EEXIST); | ||
346 | XFS_STATS_INC(xs_ig_dup); | ||
347 | error = EAGAIN; | ||
348 | goto out_preload_end; | ||
349 | } | ||
350 | spin_unlock(&pag->pag_ici_lock); | ||
351 | radix_tree_preload_end(); | ||
352 | |||
353 | *ipp = ip; | ||
354 | return 0; | ||
355 | |||
356 | out_preload_end: | ||
357 | spin_unlock(&pag->pag_ici_lock); | ||
358 | radix_tree_preload_end(); | ||
359 | if (lock_flags) | ||
360 | xfs_iunlock(ip, lock_flags); | ||
361 | out_destroy: | ||
362 | __destroy_inode(VFS_I(ip)); | ||
363 | xfs_inode_free(ip); | ||
364 | return error; | ||
365 | } | ||
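xfs_iget_cache_miss() above is a textbook instance of the radix-tree preload protocol: node memory is reserved with radix_tree_preload() while sleeping is still allowed, the insert itself runs under the spinlock, and radix_tree_preload_end() closes the (preemption-disabled) preload section on every exit path. The skeleton, using the same pag/agino/ip names as above:

static int insert_inode(struct xfs_perag *pag, xfs_agino_t agino,
			struct xfs_inode *ip)
{
	int error;

	/* may sleep, so it must run before the spinlock is taken */
	if (radix_tree_preload(GFP_NOFS))
		return EAGAIN;

	spin_lock(&pag->pag_ici_lock);
	error = radix_tree_insert(&pag->pag_ici_root, agino, ip);
	spin_unlock(&pag->pag_ici_lock);
	radix_tree_preload_end();	/* ends the atomic preload section */

	/* -EEXIST here means a racing lookup inserted the inode first */
	return error ? EAGAIN : 0;
}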
366 | |||
367 | /* | ||
368 | * Look up an inode by number in the given file system. | ||
369 | * The inode is looked up in the cache held in each AG. | ||
370 | * If the inode is found in the cache, initialise the vfs inode | ||
371 | * if necessary. | ||
372 | * | ||
373 | * If it is not in core, read it in from the file system's device, | ||
374 | * add it to the cache and initialise the vfs inode. | ||
375 | * | ||
376 | * The inode is locked according to the value of the lock_flags parameter. | ||
377 | * This flag parameter indicates how and if the inode's IO lock and inode lock | ||
378 | * should be taken. | ||
379 | * | ||
380 | * mp -- the mount point structure for the current file system. It points | ||
381 | * to the inode hash table. | ||
382 | * tp -- a pointer to the current transaction if there is one. This is | ||
383 | * simply passed through to the xfs_iread() call. | ||
384 | * ino -- the number of the inode desired. This is the unique identifier | ||
385 | * within the file system for the inode being requested. | ||
386 | * lock_flags -- flags indicating how to lock the inode. See the comment | ||
387 | * for xfs_ilock() for a list of valid values. | ||
388 | */ | ||
389 | int | ||
390 | xfs_iget( | ||
391 | xfs_mount_t *mp, | ||
392 | xfs_trans_t *tp, | ||
393 | xfs_ino_t ino, | ||
394 | uint flags, | ||
395 | uint lock_flags, | ||
396 | xfs_inode_t **ipp) | ||
397 | { | ||
398 | xfs_inode_t *ip; | ||
399 | int error; | ||
400 | xfs_perag_t *pag; | ||
401 | xfs_agino_t agino; | ||
402 | |||
403 | /* | ||
404 | * xfs_reclaim_inode() uses the ILOCK to ensure an inode | ||
405 | * doesn't get freed while it's being referenced during a | ||
406 | * radix tree traversal here. It assumes this function | ||
407 | * acquires only the ILOCK (and therefore it has no need to | ||
408 | * involve the IOLOCK in this synchronization). | ||
409 | */ | ||
410 | ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0); | ||
411 | |||
412 | /* reject inode numbers outside existing AGs */ | ||
413 | if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount) | ||
414 | return EINVAL; | ||
415 | |||
416 | /* get the perag structure and ensure that it's inode capable */ | ||
417 | pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino)); | ||
418 | agino = XFS_INO_TO_AGINO(mp, ino); | ||
419 | |||
420 | again: | ||
421 | error = 0; | ||
422 | rcu_read_lock(); | ||
423 | ip = radix_tree_lookup(&pag->pag_ici_root, agino); | ||
424 | |||
425 | if (ip) { | ||
426 | error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags); | ||
427 | if (error) | ||
428 | goto out_error_or_again; | ||
429 | } else { | ||
430 | rcu_read_unlock(); | ||
431 | XFS_STATS_INC(xs_ig_missed); | ||
432 | |||
433 | error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip, | ||
434 | flags, lock_flags); | ||
435 | if (error) | ||
436 | goto out_error_or_again; | ||
437 | } | ||
438 | xfs_perag_put(pag); | ||
439 | |||
440 | *ipp = ip; | ||
441 | |||
442 | /* | ||
443 | * If we have a real type for an on-disk inode, we can set ops(&unlock) | ||
444 | * now. If it's a new inode being created, xfs_ialloc will handle it. | ||
445 | */ | ||
446 | if (xfs_iflags_test(ip, XFS_INEW) && ip->i_d.di_mode != 0) | ||
447 | xfs_setup_inode(ip); | ||
448 | return 0; | ||
449 | |||
450 | out_error_or_again: | ||
451 | if (error == EAGAIN) { | ||
452 | delay(1); | ||
453 | goto again; | ||
454 | } | ||
455 | xfs_perag_put(pag); | ||
456 | return error; | ||
457 | } | ||
43 | 458 | ||
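The cache-hit path above and xfs_inode_free() earlier are the two halves of one RCU protocol: the free side zeroes i_ino under i_flags_lock before handing the structure to call_rcu(), and the lookup side re-checks the inode number under the same lock, so an inode recycled within a grace period is retried rather than used. The pairing, stripped to its essentials (all names are from the code above):

	/* free side: mark the inode dead, then defer the actual free */
	spin_lock(&ip->i_flags_lock);
	ip->i_flags = XFS_IRECLAIM;
	ip->i_ino = 0;			/* lookup's "this is dead" marker */
	spin_unlock(&ip->i_flags_lock);
	call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback);

	/* lookup side: validate the hit under the same lock */
	rcu_read_lock();
	ip = radix_tree_lookup(&pag->pag_ici_root, agino);
	if (ip) {
		spin_lock(&ip->i_flags_lock);
		if (ip->i_ino != ino)	/* freed or recycled: try again */
			error = EAGAIN;
		spin_unlock(&ip->i_flags_lock);
	}
	rcu_read_unlock();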
44 | /* | 459 | /* |
45 | * The inode lookup is done in batches to keep the amount of lock traffic and | 460 | * The inode lookup is done in batches to keep the amount of lock traffic and |
@@ -101,8 +516,11 @@ xfs_inode_ag_walk( | |||
101 | struct xfs_mount *mp, | 516 | struct xfs_mount *mp, |
102 | struct xfs_perag *pag, | 517 | struct xfs_perag *pag, |
103 | int (*execute)(struct xfs_inode *ip, | 518 | int (*execute)(struct xfs_inode *ip, |
104 | struct xfs_perag *pag, int flags), | 519 | struct xfs_perag *pag, int flags, |
105 | int flags) | 520 | void *args), |
521 | int flags, | ||
522 | void *args, | ||
523 | int tag) | ||
106 | { | 524 | { |
107 | uint32_t first_index; | 525 | uint32_t first_index; |
108 | int last_error = 0; | 526 | int last_error = 0; |
@@ -121,9 +539,17 @@ restart: | |||
121 | int i; | 539 | int i; |
122 | 540 | ||
123 | rcu_read_lock(); | 541 | rcu_read_lock(); |
124 | nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, | 542 | |
543 | if (tag == -1) | ||
544 | nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, | ||
125 | (void **)batch, first_index, | 545 | (void **)batch, first_index, |
126 | XFS_LOOKUP_BATCH); | 546 | XFS_LOOKUP_BATCH); |
547 | else | ||
548 | nr_found = radix_tree_gang_lookup_tag( | ||
549 | &pag->pag_ici_root, | ||
550 | (void **) batch, first_index, | ||
551 | XFS_LOOKUP_BATCH, tag); | ||
552 | |||
127 | if (!nr_found) { | 553 | if (!nr_found) { |
128 | rcu_read_unlock(); | 554 | rcu_read_unlock(); |
129 | break; | 555 | break; |
@@ -164,7 +590,7 @@ restart: | |||
164 | for (i = 0; i < nr_found; i++) { | 590 | for (i = 0; i < nr_found; i++) { |
165 | if (!batch[i]) | 591 | if (!batch[i]) |
166 | continue; | 592 | continue; |
167 | error = execute(batch[i], pag, flags); | 593 | error = execute(batch[i], pag, flags, args); |
168 | IRELE(batch[i]); | 594 | IRELE(batch[i]); |
169 | if (error == EAGAIN) { | 595 | if (error == EAGAIN) { |
170 | skipped++; | 596 | skipped++; |
@@ -189,12 +615,40 @@ restart: | |||
189 | return last_error; | 615 | return last_error; |
190 | } | 616 | } |
191 | 617 | ||
618 | /* | ||
619 | * Background scanning to trim post-EOF preallocated space. This is queued | ||
620 | * based on the 'background_prealloc_discard_period' tunable (5m by default). | ||
621 | */ | ||
622 | STATIC void | ||
623 | xfs_queue_eofblocks( | ||
624 | struct xfs_mount *mp) | ||
625 | { | ||
626 | rcu_read_lock(); | ||
627 | if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_EOFBLOCKS_TAG)) | ||
628 | queue_delayed_work(mp->m_eofblocks_workqueue, | ||
629 | &mp->m_eofblocks_work, | ||
630 | msecs_to_jiffies(xfs_eofb_secs * 1000)); | ||
631 | rcu_read_unlock(); | ||
632 | } | ||
633 | |||
634 | void | ||
635 | xfs_eofblocks_worker( | ||
636 | struct work_struct *work) | ||
637 | { | ||
638 | struct xfs_mount *mp = container_of(to_delayed_work(work), | ||
639 | struct xfs_mount, m_eofblocks_work); | ||
640 | xfs_icache_free_eofblocks(mp, NULL); | ||
641 | xfs_queue_eofblocks(mp); | ||
642 | } | ||
643 | |||
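xfs_queue_eofblocks() plus xfs_eofblocks_worker() form the standard self-rearming delayed-work loop: each pass re-queues the next one. Below is a standalone sketch of that pattern; the demo_* names and the fixed 5-minute period (standing in for the tunable) are hypothetical.

#include <linux/workqueue.h>
#include <linux/jiffies.h>
#include <linux/errno.h>

static struct workqueue_struct *demo_wq;
static struct delayed_work demo_work;
static unsigned int demo_period_ms = 5 * 60 * 1000;	/* 5m default */

static void demo_worker(struct work_struct *work)
{
	/* ... one background scan pass ... */

	/* re-arm: the worker schedules its own next run */
	queue_delayed_work(demo_wq, &demo_work,
			   msecs_to_jiffies(demo_period_ms));
}

static int demo_start(void)
{
	demo_wq = alloc_workqueue("demo", WQ_MEM_RECLAIM, 0);
	if (!demo_wq)
		return -ENOMEM;
	INIT_DELAYED_WORK(&demo_work, demo_worker);
	queue_delayed_work(demo_wq, &demo_work,
			   msecs_to_jiffies(demo_period_ms));
	return 0;
}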
192 | int | 644 | int |
193 | xfs_inode_ag_iterator( | 645 | xfs_inode_ag_iterator( |
194 | struct xfs_mount *mp, | 646 | struct xfs_mount *mp, |
195 | int (*execute)(struct xfs_inode *ip, | 647 | int (*execute)(struct xfs_inode *ip, |
196 | struct xfs_perag *pag, int flags), | 648 | struct xfs_perag *pag, int flags, |
197 | int flags) | 649 | void *args), |
650 | int flags, | ||
651 | void *args) | ||
198 | { | 652 | { |
199 | struct xfs_perag *pag; | 653 | struct xfs_perag *pag; |
200 | int error = 0; | 654 | int error = 0; |
@@ -204,7 +658,7 @@ xfs_inode_ag_iterator( | |||
204 | ag = 0; | 658 | ag = 0; |
205 | while ((pag = xfs_perag_get(mp, ag))) { | 659 | while ((pag = xfs_perag_get(mp, ag))) { |
206 | ag = pag->pag_agno + 1; | 660 | ag = pag->pag_agno + 1; |
207 | error = xfs_inode_ag_walk(mp, pag, execute, flags); | 661 | error = xfs_inode_ag_walk(mp, pag, execute, flags, args, -1); |
208 | xfs_perag_put(pag); | 662 | xfs_perag_put(pag); |
209 | if (error) { | 663 | if (error) { |
210 | last_error = error; | 664 | last_error = error; |
@@ -215,224 +669,50 @@ xfs_inode_ag_iterator( | |||
215 | return XFS_ERROR(last_error); | 669 | return XFS_ERROR(last_error); |
216 | } | 670 | } |
217 | 671 | ||
218 | STATIC int | ||
219 | xfs_sync_inode_data( | ||
220 | struct xfs_inode *ip, | ||
221 | struct xfs_perag *pag, | ||
222 | int flags) | ||
223 | { | ||
224 | struct inode *inode = VFS_I(ip); | ||
225 | struct address_space *mapping = inode->i_mapping; | ||
226 | int error = 0; | ||
227 | |||
228 | if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) | ||
229 | return 0; | ||
230 | |||
231 | if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) { | ||
232 | if (flags & SYNC_TRYLOCK) | ||
233 | return 0; | ||
234 | xfs_ilock(ip, XFS_IOLOCK_SHARED); | ||
235 | } | ||
236 | |||
237 | error = xfs_flush_pages(ip, 0, -1, (flags & SYNC_WAIT) ? | ||
238 | 0 : XBF_ASYNC, FI_NONE); | ||
239 | xfs_iunlock(ip, XFS_IOLOCK_SHARED); | ||
240 | return error; | ||
241 | } | ||
242 | |||
243 | /* | ||
244 | * Write out pagecache data for the whole filesystem. | ||
245 | */ | ||
246 | STATIC int | ||
247 | xfs_sync_data( | ||
248 | struct xfs_mount *mp, | ||
249 | int flags) | ||
250 | { | ||
251 | int error; | ||
252 | |||
253 | ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0); | ||
254 | |||
255 | error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags); | ||
256 | if (error) | ||
257 | return XFS_ERROR(error); | ||
258 | |||
259 | xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0); | ||
260 | return 0; | ||
261 | } | ||
262 | |||
263 | STATIC int | ||
264 | xfs_sync_fsdata( | ||
265 | struct xfs_mount *mp) | ||
266 | { | ||
267 | struct xfs_buf *bp; | ||
268 | int error; | ||
269 | |||
270 | /* | ||
271 | * If the buffer is pinned then push on the log so we won't get stuck | ||
272 | * waiting in the write for someone, maybe ourselves, to flush the log. | ||
273 | * | ||
274 | * Even though we just pushed the log above, we did not have the | ||
275 | * superblock buffer locked at that point so it can become pinned in | ||
276 | * between there and here. | ||
277 | */ | ||
278 | bp = xfs_getsb(mp, 0); | ||
279 | if (xfs_buf_ispinned(bp)) | ||
280 | xfs_log_force(mp, 0); | ||
281 | error = xfs_bwrite(bp); | ||
282 | xfs_buf_relse(bp); | ||
283 | return error; | ||
284 | } | ||
285 | |||
286 | /* | ||
287 | * When remounting a filesystem read-only or freezing the filesystem, we have | ||
288 | * two phases to execute. This first phase is syncing the data before we | ||
289 | * quiesce the filesystem, and the second is flushing all the inodes out after | ||
290 | * we've waited for all the transactions created by the first phase to | ||
291 | * complete. The second phase ensures that the inodes are written to their | ||
292 | * location on disk rather than just existing in transactions in the log. This | ||
293 | * means after a quiesce there is no log replay required to write the inodes to | ||
294 | * disk (this is the main difference between a sync and a quiesce). | ||
295 | */ | ||
296 | /* | ||
297 | * First stage of freeze - no writers will make progress now we are here, | ||
298 | * so we flush delwri and delalloc buffers here, then wait for all I/O to | ||
299 | * complete. Data is frozen at that point. Metadata is not frozen, | ||
300 | * transactions can still occur here so don't bother emptying the AIL | ||
301 | * because it'll just get dirty again. | ||
302 | */ | ||
303 | int | 672 | int |
304 | xfs_quiesce_data( | 673 | xfs_inode_ag_iterator_tag( |
305 | struct xfs_mount *mp) | 674 | struct xfs_mount *mp, |
306 | { | 675 | int (*execute)(struct xfs_inode *ip, |
307 | int error, error2 = 0; | 676 | struct xfs_perag *pag, int flags, |
308 | 677 | void *args), | |
309 | /* force out the log */ | 678 | int flags, |
310 | xfs_log_force(mp, XFS_LOG_SYNC); | 679 | void *args, |
311 | 680 | int tag) | |
312 | /* write superblock and hoover up shutdown errors */ | ||
313 | error = xfs_sync_fsdata(mp); | ||
314 | |||
315 | /* mark the log as covered if needed */ | ||
316 | if (xfs_log_need_covered(mp)) | ||
317 | error2 = xfs_fs_log_dummy(mp); | ||
318 | |||
319 | return error ? error : error2; | ||
320 | } | ||
321 | |||
322 | /* | ||
323 | * Second stage of a quiesce. The data is already synced, now we have to take | ||
324 | * care of the metadata. New transactions are already blocked, so we need to | ||
325 | * wait for any remaining transactions to drain out before proceeding. | ||
326 | */ | ||
327 | void | ||
328 | xfs_quiesce_attr( | ||
329 | struct xfs_mount *mp) | ||
330 | { | ||
331 | int error = 0; | ||
332 | |||
333 | /* wait for all modifications to complete */ | ||
334 | while (atomic_read(&mp->m_active_trans) > 0) | ||
335 | delay(100); | ||
336 | |||
337 | /* reclaim inodes to do any IO before the freeze completes */ | ||
338 | xfs_reclaim_inodes(mp, 0); | ||
339 | xfs_reclaim_inodes(mp, SYNC_WAIT); | ||
340 | |||
341 | /* flush all pending changes from the AIL */ | ||
342 | xfs_ail_push_all_sync(mp->m_ail); | ||
343 | |||
344 | /* | ||
345 | * Just warn here till VFS can correctly support | ||
346 | * read-only remount without racing. | ||
347 | */ | ||
348 | WARN_ON(atomic_read(&mp->m_active_trans) != 0); | ||
349 | |||
350 | /* Push the superblock and write an unmount record */ | ||
351 | error = xfs_log_sbcount(mp); | ||
352 | if (error) | ||
353 | xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. " | ||
354 | "Frozen image may not be consistent."); | ||
355 | xfs_log_unmount_write(mp); | ||
356 | |||
357 | /* | ||
358 | * At this point we might have modified the superblock again and thus | ||
359 | * added an item to the AIL, thus flush it again. | ||
360 | */ | ||
361 | xfs_ail_push_all_sync(mp->m_ail); | ||
362 | |||
363 | /* | ||
364 | * The superblock buffer is uncached and xfsaild_push() will lock and | ||
365 | * set the XBF_ASYNC flag on the buffer. We cannot do xfs_buf_iowait() | ||
366 | * here but a lock on the superblock buffer will block until iodone() | ||
367 | * has completed. | ||
368 | */ | ||
369 | xfs_buf_lock(mp->m_sb_bp); | ||
370 | xfs_buf_unlock(mp->m_sb_bp); | ||
371 | } | ||
372 | |||
373 | static void | ||
374 | xfs_syncd_queue_sync( | ||
375 | struct xfs_mount *mp) | ||
376 | { | ||
377 | queue_delayed_work(xfs_syncd_wq, &mp->m_sync_work, | ||
378 | msecs_to_jiffies(xfs_syncd_centisecs * 10)); | ||
379 | } | ||
380 | |||
381 | /* | ||
382 | * Every sync period we need to unpin all items, reclaim inodes and sync | ||
383 | * disk quotas. We might need to cover the log to indicate that the | ||
384 | * filesystem is idle and not frozen. | ||
385 | */ | ||
386 | STATIC void | ||
387 | xfs_sync_worker( | ||
388 | struct work_struct *work) | ||
389 | { | 681 | { |
390 | struct xfs_mount *mp = container_of(to_delayed_work(work), | 682 | struct xfs_perag *pag; |
391 | struct xfs_mount, m_sync_work); | 683 | int error = 0; |
392 | int error; | 684 | int last_error = 0; |
393 | 685 | xfs_agnumber_t ag; | |
394 | /* | ||
395 | * We shouldn't write/force the log if we are in the mount/unmount | ||
396 | * process or on a read only filesystem. The workqueue still needs to be | ||
397 | * active in both cases, however, because it is used for inode reclaim | ||
398 | * during these times. Use the MS_ACTIVE flag to avoid doing anything | ||
399 | * during mount. Doing work during unmount is avoided by calling | ||
400 | * cancel_delayed_work_sync on this work queue before tearing down | ||
401 | * the ail and the log in xfs_log_unmount. | ||
402 | */ | ||
403 | if (!(mp->m_super->s_flags & MS_ACTIVE) && | ||
404 | !(mp->m_flags & XFS_MOUNT_RDONLY)) { | ||
405 | /* dgc: errors ignored here */ | ||
406 | if (mp->m_super->s_writers.frozen == SB_UNFROZEN && | ||
407 | xfs_log_need_covered(mp)) | ||
408 | error = xfs_fs_log_dummy(mp); | ||
409 | else | ||
410 | xfs_log_force(mp, 0); | ||
411 | 686 | ||
412 | /* start pushing all the metadata that is currently | 687 | ag = 0; |
413 | * dirty */ | 688 | while ((pag = xfs_perag_get_tag(mp, ag, tag))) { |
414 | xfs_ail_push_all(mp->m_ail); | 689 | ag = pag->pag_agno + 1; |
690 | error = xfs_inode_ag_walk(mp, pag, execute, flags, args, tag); | ||
691 | xfs_perag_put(pag); | ||
692 | if (error) { | ||
693 | last_error = error; | ||
694 | if (error == EFSCORRUPTED) | ||
695 | break; | ||
696 | } | ||
415 | } | 697 | } |
416 | 698 | return XFS_ERROR(last_error); | |
417 | /* queue us up again */ | ||
418 | xfs_syncd_queue_sync(mp); | ||
419 | } | 699 | } |
420 | 700 | ||
421 | /* | 701 | /* |
422 | * Queue a new inode reclaim pass if there are reclaimable inodes and there | 702 | * Queue a new inode reclaim pass if there are reclaimable inodes and there |
423 | * isn't a reclaim pass already in progress. By default it runs every 5s based | 703 | * isn't a reclaim pass already in progress. By default it runs every 5s based |
424 | * on the xfs syncd work default of 30s. Perhaps this should have its own | 704 | * on the xfs periodic sync default of 30s. Perhaps this should have its own |
425 | * tunable, but that can be done if this method proves to be ineffective or too | 705 | * tunable, but that can be done if this method proves to be ineffective or too |
426 | * aggressive. | 706 | * aggressive. |
427 | */ | 707 | */ |
428 | static void | 708 | static void |
429 | xfs_syncd_queue_reclaim( | 709 | xfs_reclaim_work_queue( |
430 | struct xfs_mount *mp) | 710 | struct xfs_mount *mp) |
431 | { | 711 | { |
432 | 712 | ||
433 | rcu_read_lock(); | 713 | rcu_read_lock(); |
434 | if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) { | 714 | if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) { |
435 | queue_delayed_work(xfs_syncd_wq, &mp->m_reclaim_work, | 715 | queue_delayed_work(mp->m_reclaim_workqueue, &mp->m_reclaim_work, |
436 | msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10)); | 716 | msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10)); |
437 | } | 717 | } |
438 | rcu_read_unlock(); | 718 | rcu_read_unlock(); |
@@ -445,7 +725,7 @@ xfs_syncd_queue_reclaim( | |||
445 | * goes low. It scans as quickly as possible avoiding locked inodes or those | 725 | * goes low. It scans as quickly as possible avoiding locked inodes or those |
446 | * already being flushed, and once done schedules a future pass. | 726 | * already being flushed, and once done schedules a future pass. |
447 | */ | 727 | */ |
448 | STATIC void | 728 | void |
449 | xfs_reclaim_worker( | 729 | xfs_reclaim_worker( |
450 | struct work_struct *work) | 730 | struct work_struct *work) |
451 | { | 731 | { |
@@ -453,65 +733,10 @@ xfs_reclaim_worker( | |||
453 | struct xfs_mount, m_reclaim_work); | 733 | struct xfs_mount, m_reclaim_work); |
454 | 734 | ||
455 | xfs_reclaim_inodes(mp, SYNC_TRYLOCK); | 735 | xfs_reclaim_inodes(mp, SYNC_TRYLOCK); |
456 | xfs_syncd_queue_reclaim(mp); | 736 | xfs_reclaim_work_queue(mp); |
457 | } | 737 | } |
458 | 738 | ||
459 | /* | 739 | static void |
460 | * Flush delayed allocate data, attempting to free up reserved space | ||
461 | * from existing allocations. At this point a new allocation attempt | ||
462 | * has failed with ENOSPC and we are in the process of scratching our | ||
463 | * heads, looking about for more room. | ||
464 | * | ||
465 | * Queue a new data flush if there isn't one already in progress and | ||
466 | * wait for completion of the flush. This means that we only ever have one | ||
467 | * inode flush in progress no matter how many ENOSPC events are occurring and | ||
468 | * so will prevent the system from bogging down due to every concurrent | ||
469 | * ENOSPC event scanning all the active inodes in the system for writeback. | ||
470 | */ | ||
471 | void | ||
472 | xfs_flush_inodes( | ||
473 | struct xfs_inode *ip) | ||
474 | { | ||
475 | struct xfs_mount *mp = ip->i_mount; | ||
476 | |||
477 | queue_work(xfs_syncd_wq, &mp->m_flush_work); | ||
478 | flush_work(&mp->m_flush_work); | ||
479 | } | ||
480 | |||
481 | STATIC void | ||
482 | xfs_flush_worker( | ||
483 | struct work_struct *work) | ||
484 | { | ||
485 | struct xfs_mount *mp = container_of(work, | ||
486 | struct xfs_mount, m_flush_work); | ||
487 | |||
488 | xfs_sync_data(mp, SYNC_TRYLOCK); | ||
489 | xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_WAIT); | ||
490 | } | ||
491 | |||
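The removed xfs_flush_inodes()/xfs_flush_worker() pair relies on two workqueue properties: queue_work() is a no-op while the work is already pending, so concurrent ENOSPC hitters coalesce onto a single pass, and flush_work() blocks each caller until that pass completes. A sketch of the idiom follows; the demo_* names are hypothetical and INIT_WORK() is assumed to have run during setup.

#include <linux/workqueue.h>

static struct work_struct demo_flush_work;

static void demo_do_flush(struct work_struct *work)
{
	/* ... write back dirty data once, on behalf of all waiters ... */
}

static void demo_flush_and_wait(void)
{
	queue_work(system_wq, &demo_flush_work);	/* coalesces if pending */
	flush_work(&demo_flush_work);			/* wait for the pass */
}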
492 | int | ||
493 | xfs_syncd_init( | ||
494 | struct xfs_mount *mp) | ||
495 | { | ||
496 | INIT_WORK(&mp->m_flush_work, xfs_flush_worker); | ||
497 | INIT_DELAYED_WORK(&mp->m_sync_work, xfs_sync_worker); | ||
498 | INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); | ||
499 | |||
500 | xfs_syncd_queue_sync(mp); | ||
501 | |||
502 | return 0; | ||
503 | } | ||
504 | |||
505 | void | ||
506 | xfs_syncd_stop( | ||
507 | struct xfs_mount *mp) | ||
508 | { | ||
509 | cancel_delayed_work_sync(&mp->m_sync_work); | ||
510 | cancel_delayed_work_sync(&mp->m_reclaim_work); | ||
511 | cancel_work_sync(&mp->m_flush_work); | ||
512 | } | ||
513 | |||
514 | void | ||
515 | __xfs_inode_set_reclaim_tag( | 740 | __xfs_inode_set_reclaim_tag( |
516 | struct xfs_perag *pag, | 741 | struct xfs_perag *pag, |
517 | struct xfs_inode *ip) | 742 | struct xfs_inode *ip) |
@@ -529,7 +754,7 @@ __xfs_inode_set_reclaim_tag( | |||
529 | spin_unlock(&ip->i_mount->m_perag_lock); | 754 | spin_unlock(&ip->i_mount->m_perag_lock); |
530 | 755 | ||
531 | /* schedule periodic background inode reclaim */ | 756 | /* schedule periodic background inode reclaim */ |
532 | xfs_syncd_queue_reclaim(ip->i_mount); | 757 | xfs_reclaim_work_queue(ip->i_mount); |
533 | 758 | ||
534 | trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno, | 759 | trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno, |
535 | -1, _RET_IP_); | 760 | -1, _RET_IP_); |
@@ -577,7 +802,7 @@ __xfs_inode_clear_reclaim( | |||
577 | } | 802 | } |
578 | } | 803 | } |
579 | 804 | ||
580 | void | 805 | STATIC void |
581 | __xfs_inode_clear_reclaim_tag( | 806 | __xfs_inode_clear_reclaim_tag( |
582 | xfs_mount_t *mp, | 807 | xfs_mount_t *mp, |
583 | xfs_perag_t *pag, | 808 | xfs_perag_t *pag, |
@@ -787,9 +1012,9 @@ out: | |||
787 | /* | 1012 | /* |
788 | * We could return EAGAIN here to make reclaim rescan the inode tree in | 1013 | * We could return EAGAIN here to make reclaim rescan the inode tree in |
789 | * a short while. However, this just burns CPU time scanning the tree | 1014 | * a short while. However, this just burns CPU time scanning the tree |
790 | * waiting for IO to complete and xfssyncd never goes back to the idle | 1015 | * waiting for IO to complete and the reclaim work never goes back to |
791 | * state. Instead, return 0 to let the next scheduled background reclaim | 1016 | * the idle state. Instead, return 0 to let the next scheduled |
792 | * attempt to reclaim the inode again. | 1017 | * background reclaim attempt to reclaim the inode again. |
793 | */ | 1018 | */ |
794 | return 0; | 1019 | return 0; |
795 | } | 1020 | } |
@@ -800,7 +1025,7 @@ out: | |||
800 | * then a shutdown during a filesystem unmount reclaim walk will leak all | 1025 | * then a shutdown during a filesystem unmount reclaim walk will leak all |
801 | * the unreclaimed inodes. | 1026 | * the unreclaimed inodes. |
802 | */ | 1027 | */ |
803 | int | 1028 | STATIC int |
804 | xfs_reclaim_inodes_ag( | 1029 | xfs_reclaim_inodes_ag( |
805 | struct xfs_mount *mp, | 1030 | struct xfs_mount *mp, |
806 | int flags, | 1031 | int flags, |
@@ -945,7 +1170,7 @@ xfs_reclaim_inodes_nr( | |||
945 | int nr_to_scan) | 1170 | int nr_to_scan) |
946 | { | 1171 | { |
947 | /* kick background reclaimer and push the AIL */ | 1172 | /* kick background reclaimer and push the AIL */ |
948 | xfs_syncd_queue_reclaim(mp); | 1173 | xfs_reclaim_work_queue(mp); |
949 | xfs_ail_push_all(mp->m_ail); | 1174 | xfs_ail_push_all(mp->m_ail); |
950 | 1175 | ||
951 | xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan); | 1176 | xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan); |
@@ -971,3 +1196,146 @@ xfs_reclaim_inodes_count( | |||
971 | return reclaimable; | 1196 | return reclaimable; |
972 | } | 1197 | } |
973 | 1198 | ||
1199 | STATIC int | ||
1200 | xfs_inode_match_id( | ||
1201 | struct xfs_inode *ip, | ||
1202 | struct xfs_eofblocks *eofb) | ||
1203 | { | ||
1204 | if (eofb->eof_flags & XFS_EOF_FLAGS_UID && | ||
1205 | ip->i_d.di_uid != eofb->eof_uid) | ||
1206 | return 0; | ||
1207 | |||
1208 | if (eofb->eof_flags & XFS_EOF_FLAGS_GID && | ||
1209 | ip->i_d.di_gid != eofb->eof_gid) | ||
1210 | return 0; | ||
1211 | |||
1212 | if (eofb->eof_flags & XFS_EOF_FLAGS_PRID && | ||
1213 | xfs_get_projid(ip) != eofb->eof_prid) | ||
1214 | return 0; | ||
1215 | |||
1216 | return 1; | ||
1217 | } | ||
1218 | |||
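xfs_inode_match_id() is a flag-gated compare: each criterion applies only when its flag is set, so an empty mask matches everything. The same pattern, condensed into a self-contained sketch with hypothetical demo_* types:

struct demo_filter {
	unsigned int	flags;
#define DEMO_F_UID	(1 << 0)
#define DEMO_F_GID	(1 << 1)
	unsigned int	uid;
	unsigned int	gid;
};

struct demo_inode {
	unsigned int	uid;
	unsigned int	gid;
};

static int demo_match(const struct demo_inode *it,
		      const struct demo_filter *f)
{
	if ((f->flags & DEMO_F_UID) && it->uid != f->uid)
		return 0;	/* uid requested but does not match */
	if ((f->flags & DEMO_F_GID) && it->gid != f->gid)
		return 0;
	return 1;		/* every requested criterion matched */
}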
1219 | STATIC int | ||
1220 | xfs_inode_free_eofblocks( | ||
1221 | struct xfs_inode *ip, | ||
1222 | struct xfs_perag *pag, | ||
1223 | int flags, | ||
1224 | void *args) | ||
1225 | { | ||
1226 | int ret; | ||
1227 | struct xfs_eofblocks *eofb = args; | ||
1228 | |||
1229 | if (!xfs_can_free_eofblocks(ip, false)) { | ||
1230 | /* inode could be preallocated or append-only */ | ||
1231 | trace_xfs_inode_free_eofblocks_invalid(ip); | ||
1232 | xfs_inode_clear_eofblocks_tag(ip); | ||
1233 | return 0; | ||
1234 | } | ||
1235 | |||
1236 | /* | ||
1237 | * If the mapping is dirty the operation can block and wait for some | ||
1238 | * time. Unless we are waiting, skip it. | ||
1239 | */ | ||
1240 | if (!(flags & SYNC_WAIT) && | ||
1241 | mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY)) | ||
1242 | return 0; | ||
1243 | |||
1244 | if (eofb) { | ||
1245 | if (!xfs_inode_match_id(ip, eofb)) | ||
1246 | return 0; | ||
1247 | |||
1248 | /* skip the inode if the file size is too small */ | ||
1249 | if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE && | ||
1250 | XFS_ISIZE(ip) < eofb->eof_min_file_size) | ||
1251 | return 0; | ||
1252 | } | ||
1253 | |||
1254 | ret = xfs_free_eofblocks(ip->i_mount, ip, true); | ||
1255 | |||
1256 | /* don't revisit the inode if we're not waiting */ | ||
1257 | if (ret == EAGAIN && !(flags & SYNC_WAIT)) | ||
1258 | ret = 0; | ||
1259 | |||
1260 | return ret; | ||
1261 | } | ||
1262 | |||
1263 | int | ||
1264 | xfs_icache_free_eofblocks( | ||
1265 | struct xfs_mount *mp, | ||
1266 | struct xfs_eofblocks *eofb) | ||
1267 | { | ||
1268 | int flags = SYNC_TRYLOCK; | ||
1269 | |||
1270 | if (eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC)) | ||
1271 | flags = SYNC_WAIT; | ||
1272 | |||
1273 | return xfs_inode_ag_iterator_tag(mp, xfs_inode_free_eofblocks, flags, | ||
1274 | eofb, XFS_ICI_EOFBLOCKS_TAG); | ||
1275 | } | ||
1276 | |||
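Putting the pieces together: xfs_icache_free_eofblocks() is simply the tagged iterator applied to the xfs_inode_free_eofblocks() callback. A sketch of how a new per-inode scan would plug into the widened execute signature; demo_execute is hypothetical, while the iterator, flags, and tag are from this patch.

STATIC int
demo_execute(
	struct xfs_inode	*ip,
	struct xfs_perag	*pag,
	int			flags,
	void			*args)
{
	struct xfs_eofblocks	*eofb = args;	/* caller-supplied filter */

	/* ... filter against eofb, then do the per-inode work ... */
	/* return 0, EAGAIN to be revisited, or an error */
	return 0;
}

/* invocation, mirroring xfs_icache_free_eofblocks():
 *	error = xfs_inode_ag_iterator_tag(mp, demo_execute, SYNC_TRYLOCK,
 *					  eofb, XFS_ICI_EOFBLOCKS_TAG);
 */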
1277 | void | ||
1278 | xfs_inode_set_eofblocks_tag( | ||
1279 | xfs_inode_t *ip) | ||
1280 | { | ||
1281 | struct xfs_mount *mp = ip->i_mount; | ||
1282 | struct xfs_perag *pag; | ||
1283 | int tagged; | ||
1284 | |||
1285 | pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); | ||
1286 | spin_lock(&pag->pag_ici_lock); | ||
1287 | trace_xfs_inode_set_eofblocks_tag(ip); | ||
1288 | |||
1289 | tagged = radix_tree_tagged(&pag->pag_ici_root, | ||
1290 | XFS_ICI_EOFBLOCKS_TAG); | ||
1291 | radix_tree_tag_set(&pag->pag_ici_root, | ||
1292 | XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), | ||
1293 | XFS_ICI_EOFBLOCKS_TAG); | ||
1294 | if (!tagged) { | ||
1295 | /* propagate the eofblocks tag up into the perag radix tree */ | ||
1296 | spin_lock(&ip->i_mount->m_perag_lock); | ||
1297 | radix_tree_tag_set(&ip->i_mount->m_perag_tree, | ||
1298 | XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), | ||
1299 | XFS_ICI_EOFBLOCKS_TAG); | ||
1300 | spin_unlock(&ip->i_mount->m_perag_lock); | ||
1301 | |||
1302 | /* kick off background trimming */ | ||
1303 | xfs_queue_eofblocks(ip->i_mount); | ||
1304 | |||
1305 | trace_xfs_perag_set_eofblocks(ip->i_mount, pag->pag_agno, | ||
1306 | -1, _RET_IP_); | ||
1307 | } | ||
1308 | |||
1309 | spin_unlock(&pag->pag_ici_lock); | ||
1310 | xfs_perag_put(pag); | ||
1311 | } | ||
1312 | |||
1313 | void | ||
1314 | xfs_inode_clear_eofblocks_tag( | ||
1315 | xfs_inode_t *ip) | ||
1316 | { | ||
1317 | struct xfs_mount *mp = ip->i_mount; | ||
1318 | struct xfs_perag *pag; | ||
1319 | |||
1320 | pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); | ||
1321 | spin_lock(&pag->pag_ici_lock); | ||
1322 | trace_xfs_inode_clear_eofblocks_tag(ip); | ||
1323 | |||
1324 | radix_tree_tag_clear(&pag->pag_ici_root, | ||
1325 | XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), | ||
1326 | XFS_ICI_EOFBLOCKS_TAG); | ||
1327 | if (!radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_EOFBLOCKS_TAG)) { | ||
1328 | /* clear the eofblocks tag from the perag radix tree */ | ||
1329 | spin_lock(&ip->i_mount->m_perag_lock); | ||
1330 | radix_tree_tag_clear(&ip->i_mount->m_perag_tree, | ||
1331 | XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), | ||
1332 | XFS_ICI_EOFBLOCKS_TAG); | ||
1333 | spin_unlock(&ip->i_mount->m_perag_lock); | ||
1334 | trace_xfs_perag_clear_eofblocks(ip->i_mount, pag->pag_agno, | ||
1335 | -1, _RET_IP_); | ||
1336 | } | ||
1337 | |||
1338 | spin_unlock(&pag->pag_ici_lock); | ||
1339 | xfs_perag_put(pag); | ||
1340 | } | ||
1341 | |||
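The reason the first set and last clear are mirrored into m_perag_tree is cheap scan gating: a walker can skip whole AGs that hold no tagged inodes. Condensed consumer-side sketch, the same loop xfs_inode_ag_iterator_tag() uses earlier in this file:

xfs_agnumber_t		ag = 0;
struct xfs_perag	*pag;

/* visit only AGs containing at least one EOFBLOCKS-tagged inode */
while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_EOFBLOCKS_TAG))) {
	ag = pag->pag_agno + 1;
	/* ... gang-lookup the same tag in pag->pag_ici_root ... */
	xfs_perag_put(pag);
}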
diff --git a/fs/xfs/xfs_sync.h b/fs/xfs/xfs_icache.h index 941202e7ac6e..e0f138c70a2f 100644 --- a/fs/xfs/xfs_sync.h +++ b/fs/xfs/xfs_icache.h | |||
@@ -24,28 +24,30 @@ struct xfs_perag; | |||
24 | #define SYNC_WAIT 0x0001 /* wait for i/o to complete */ | 24 | #define SYNC_WAIT 0x0001 /* wait for i/o to complete */ |
25 | #define SYNC_TRYLOCK 0x0002 /* only try to lock inodes */ | 25 | #define SYNC_TRYLOCK 0x0002 /* only try to lock inodes */ |
26 | 26 | ||
27 | extern struct workqueue_struct *xfs_syncd_wq; /* sync workqueue */ | 27 | int xfs_iget(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino, |
28 | uint flags, uint lock_flags, xfs_inode_t **ipp); | ||
28 | 29 | ||
29 | int xfs_syncd_init(struct xfs_mount *mp); | 30 | void xfs_reclaim_worker(struct work_struct *work); |
30 | void xfs_syncd_stop(struct xfs_mount *mp); | ||
31 | |||
32 | int xfs_quiesce_data(struct xfs_mount *mp); | ||
33 | void xfs_quiesce_attr(struct xfs_mount *mp); | ||
34 | |||
35 | void xfs_flush_inodes(struct xfs_inode *ip); | ||
36 | 31 | ||
37 | int xfs_reclaim_inodes(struct xfs_mount *mp, int mode); | 32 | int xfs_reclaim_inodes(struct xfs_mount *mp, int mode); |
38 | int xfs_reclaim_inodes_count(struct xfs_mount *mp); | 33 | int xfs_reclaim_inodes_count(struct xfs_mount *mp); |
39 | void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan); | 34 | void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan); |
40 | 35 | ||
41 | void xfs_inode_set_reclaim_tag(struct xfs_inode *ip); | 36 | void xfs_inode_set_reclaim_tag(struct xfs_inode *ip); |
42 | void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip); | 37 | |
43 | void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag, | 38 | void xfs_inode_set_eofblocks_tag(struct xfs_inode *ip); |
44 | struct xfs_inode *ip); | 39 | void xfs_inode_clear_eofblocks_tag(struct xfs_inode *ip); |
40 | int xfs_icache_free_eofblocks(struct xfs_mount *, struct xfs_eofblocks *); | ||
41 | void xfs_eofblocks_worker(struct work_struct *); | ||
45 | 42 | ||
46 | int xfs_sync_inode_grab(struct xfs_inode *ip); | 43 | int xfs_sync_inode_grab(struct xfs_inode *ip); |
47 | int xfs_inode_ag_iterator(struct xfs_mount *mp, | 44 | int xfs_inode_ag_iterator(struct xfs_mount *mp, |
48 | int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags), | 45 | int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, |
49 | int flags); | 46 | int flags, void *args), |
47 | int flags, void *args); | ||
48 | int xfs_inode_ag_iterator_tag(struct xfs_mount *mp, | ||
49 | int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, | ||
50 | int flags, void *args), | ||
51 | int flags, void *args, int tag); | ||
50 | 52 | ||
51 | #endif | 53 | #endif |
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c deleted file mode 100644 index 784a803383ec..000000000000 --- a/fs/xfs/xfs_iget.c +++ /dev/null | |||
@@ -1,705 +0,0 @@ | |||
1 | /* | ||
2 | * Copyright (c) 2000-2005 Silicon Graphics, Inc. | ||
3 | * All Rights Reserved. | ||
4 | * | ||
5 | * This program is free software; you can redistribute it and/or | ||
6 | * modify it under the terms of the GNU General Public License as | ||
7 | * published by the Free Software Foundation. | ||
8 | * | ||
9 | * This program is distributed in the hope that it would be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | * GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write the Free Software Foundation, | ||
16 | * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA | ||
17 | */ | ||
18 | #include "xfs.h" | ||
19 | #include "xfs_fs.h" | ||
20 | #include "xfs_types.h" | ||
21 | #include "xfs_acl.h" | ||
22 | #include "xfs_log.h" | ||
23 | #include "xfs_inum.h" | ||
24 | #include "xfs_trans.h" | ||
25 | #include "xfs_sb.h" | ||
26 | #include "xfs_ag.h" | ||
27 | #include "xfs_mount.h" | ||
28 | #include "xfs_bmap_btree.h" | ||
29 | #include "xfs_alloc_btree.h" | ||
30 | #include "xfs_ialloc_btree.h" | ||
31 | #include "xfs_dinode.h" | ||
32 | #include "xfs_inode.h" | ||
33 | #include "xfs_btree.h" | ||
34 | #include "xfs_ialloc.h" | ||
35 | #include "xfs_quota.h" | ||
36 | #include "xfs_utils.h" | ||
37 | #include "xfs_trans_priv.h" | ||
38 | #include "xfs_inode_item.h" | ||
39 | #include "xfs_bmap.h" | ||
40 | #include "xfs_trace.h" | ||
41 | |||
42 | |||
43 | /* | ||
44 | * Allocate and initialise an xfs_inode. | ||
45 | */ | ||
46 | STATIC struct xfs_inode * | ||
47 | xfs_inode_alloc( | ||
48 | struct xfs_mount *mp, | ||
49 | xfs_ino_t ino) | ||
50 | { | ||
51 | struct xfs_inode *ip; | ||
52 | |||
53 | /* | ||
54 | * If this didn't occur in transactions, we could use | ||
55 | * KM_MAYFAIL and return NULL here on ENOMEM. The code is | ||
56 | * set up to do this anyway. | ||
57 | */ | ||
58 | ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP); | ||
59 | if (!ip) | ||
60 | return NULL; | ||
61 | if (inode_init_always(mp->m_super, VFS_I(ip))) { | ||
62 | kmem_zone_free(xfs_inode_zone, ip); | ||
63 | return NULL; | ||
64 | } | ||
65 | |||
66 | ASSERT(atomic_read(&ip->i_pincount) == 0); | ||
67 | ASSERT(!spin_is_locked(&ip->i_flags_lock)); | ||
68 | ASSERT(!xfs_isiflocked(ip)); | ||
69 | ASSERT(ip->i_ino == 0); | ||
70 | |||
71 | mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); | ||
72 | |||
73 | /* initialise the xfs inode */ | ||
74 | ip->i_ino = ino; | ||
75 | ip->i_mount = mp; | ||
76 | memset(&ip->i_imap, 0, sizeof(struct xfs_imap)); | ||
77 | ip->i_afp = NULL; | ||
78 | memset(&ip->i_df, 0, sizeof(xfs_ifork_t)); | ||
79 | ip->i_flags = 0; | ||
80 | ip->i_delayed_blks = 0; | ||
81 | memset(&ip->i_d, 0, sizeof(xfs_icdinode_t)); | ||
82 | |||
83 | return ip; | ||
84 | } | ||
85 | |||
86 | STATIC void | ||
87 | xfs_inode_free_callback( | ||
88 | struct rcu_head *head) | ||
89 | { | ||
90 | struct inode *inode = container_of(head, struct inode, i_rcu); | ||
91 | struct xfs_inode *ip = XFS_I(inode); | ||
92 | |||
93 | kmem_zone_free(xfs_inode_zone, ip); | ||
94 | } | ||
95 | |||
96 | void | ||
97 | xfs_inode_free( | ||
98 | struct xfs_inode *ip) | ||
99 | { | ||
100 | switch (ip->i_d.di_mode & S_IFMT) { | ||
101 | case S_IFREG: | ||
102 | case S_IFDIR: | ||
103 | case S_IFLNK: | ||
104 | xfs_idestroy_fork(ip, XFS_DATA_FORK); | ||
105 | break; | ||
106 | } | ||
107 | |||
108 | if (ip->i_afp) | ||
109 | xfs_idestroy_fork(ip, XFS_ATTR_FORK); | ||
110 | |||
111 | if (ip->i_itemp) { | ||
112 | ASSERT(!(ip->i_itemp->ili_item.li_flags & XFS_LI_IN_AIL)); | ||
113 | xfs_inode_item_destroy(ip); | ||
114 | ip->i_itemp = NULL; | ||
115 | } | ||
116 | |||
117 | /* asserts to verify all state is correct here */ | ||
118 | ASSERT(atomic_read(&ip->i_pincount) == 0); | ||
119 | ASSERT(!spin_is_locked(&ip->i_flags_lock)); | ||
120 | ASSERT(!xfs_isiflocked(ip)); | ||
121 | |||
122 | /* | ||
123 | * Because we use RCU freeing we need to ensure the inode always | ||
124 | * appears to be reclaimed with an invalid inode number when in the | ||
125 | * free state. The ip->i_flags_lock provides the barrier against lookup | ||
126 | * races. | ||
127 | */ | ||
128 | spin_lock(&ip->i_flags_lock); | ||
129 | ip->i_flags = XFS_IRECLAIM; | ||
130 | ip->i_ino = 0; | ||
131 | spin_unlock(&ip->i_flags_lock); | ||
132 | |||
133 | call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback); | ||
134 | } | ||
135 | |||
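xfs_inode_free() shows the standard RCU-freeing discipline: invalidate the lookup key under its spinlock so concurrent RCU readers reject the object, then defer the actual free past the grace period. A generic sketch of that discipline, with a hypothetical demo_obj:

#include <linux/rcupdate.h>
#include <linux/spinlock.h>
#include <linux/slab.h>

struct demo_obj {
	spinlock_t	lock;
	unsigned long	key;		/* 0 means "being freed" */
	struct rcu_head	rcu;
};

static void demo_free_callback(struct rcu_head *head)
{
	kfree(container_of(head, struct demo_obj, rcu));
}

static void demo_free(struct demo_obj *obj)
{
	spin_lock(&obj->lock);
	obj->key = 0;	/* lookups under rcu_read_lock() now skip it */
	spin_unlock(&obj->lock);
	call_rcu(&obj->rcu, demo_free_callback);
}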
136 | /* | ||
137 | * Check the validity of the inode we just found in the cache | ||
138 | */ | ||
139 | static int | ||
140 | xfs_iget_cache_hit( | ||
141 | struct xfs_perag *pag, | ||
142 | struct xfs_inode *ip, | ||
143 | xfs_ino_t ino, | ||
144 | int flags, | ||
145 | int lock_flags) __releases(RCU) | ||
146 | { | ||
147 | struct inode *inode = VFS_I(ip); | ||
148 | struct xfs_mount *mp = ip->i_mount; | ||
149 | int error; | ||
150 | |||
151 | /* | ||
152 | * check for re-use of an inode within an RCU grace period due to the | ||
153 | * radix tree nodes not being updated yet. We monitor for this by | ||
154 | * setting the inode number to zero before freeing the inode structure. | ||
155 | * If the inode has been reallocated and set up, then the inode number | ||
156 | * will not match, so check for that, too. | ||
157 | */ | ||
158 | spin_lock(&ip->i_flags_lock); | ||
159 | if (ip->i_ino != ino) { | ||
160 | trace_xfs_iget_skip(ip); | ||
161 | XFS_STATS_INC(xs_ig_frecycle); | ||
162 | error = EAGAIN; | ||
163 | goto out_error; | ||
164 | } | ||
165 | |||
166 | |||
167 | /* | ||
168 | * If we are racing with another cache hit that is currently | ||
169 | * instantiating this inode or currently recycling it out of | ||
170 | * reclaimable state, wait for the initialisation to complete | ||
171 | * before continuing. | ||
172 | * | ||
173 | * XXX(hch): eventually we should do something equivalent to | ||
174 | * wait_on_inode to wait for these flags to be cleared | ||
175 | * instead of polling for it. | ||
176 | */ | ||
177 | if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) { | ||
178 | trace_xfs_iget_skip(ip); | ||
179 | XFS_STATS_INC(xs_ig_frecycle); | ||
180 | error = EAGAIN; | ||
181 | goto out_error; | ||
182 | } | ||
183 | |||
184 | /* | ||
185 | * If lookup is racing with unlink return an error immediately. | ||
186 | */ | ||
187 | if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) { | ||
188 | error = ENOENT; | ||
189 | goto out_error; | ||
190 | } | ||
191 | |||
192 | /* | ||
193 | * If IRECLAIMABLE is set, we've torn down the VFS inode already. | ||
194 | * Need to carefully get it back into useable state. | ||
195 | */ | ||
196 | if (ip->i_flags & XFS_IRECLAIMABLE) { | ||
197 | trace_xfs_iget_reclaim(ip); | ||
198 | |||
199 | /* | ||
200 | * We need to set XFS_IRECLAIM to prevent xfs_reclaim_inode | ||
201 | * from stomping over us while we recycle the inode. We can't | ||
202 | * clear the radix tree reclaimable tag yet as it requires | ||
203 | * pag_ici_lock to be held exclusive. | ||
204 | */ | ||
205 | ip->i_flags |= XFS_IRECLAIM; | ||
206 | |||
207 | spin_unlock(&ip->i_flags_lock); | ||
208 | rcu_read_unlock(); | ||
209 | |||
210 | error = -inode_init_always(mp->m_super, inode); | ||
211 | if (error) { | ||
212 | /* | ||
213 | * Re-initializing the inode failed, and we are in deep | ||
214 | * trouble. Try to re-add it to the reclaim list. | ||
215 | */ | ||
216 | rcu_read_lock(); | ||
217 | spin_lock(&ip->i_flags_lock); | ||
218 | |||
219 | ip->i_flags &= ~(XFS_INEW | XFS_IRECLAIM); | ||
220 | ASSERT(ip->i_flags & XFS_IRECLAIMABLE); | ||
221 | trace_xfs_iget_reclaim_fail(ip); | ||
222 | goto out_error; | ||
223 | } | ||
224 | |||
225 | spin_lock(&pag->pag_ici_lock); | ||
226 | spin_lock(&ip->i_flags_lock); | ||
227 | |||
228 | /* | ||
229 | * Clear the per-lifetime state in the inode as we are now | ||
230 | * effectively a new inode and need to return to the initial | ||
231 | * state before reuse occurs. | ||
232 | */ | ||
233 | ip->i_flags &= ~XFS_IRECLAIM_RESET_FLAGS; | ||
234 | ip->i_flags |= XFS_INEW; | ||
235 | __xfs_inode_clear_reclaim_tag(mp, pag, ip); | ||
236 | inode->i_state = I_NEW; | ||
237 | |||
238 | ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); | ||
239 | mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); | ||
240 | |||
241 | spin_unlock(&ip->i_flags_lock); | ||
242 | spin_unlock(&pag->pag_ici_lock); | ||
243 | } else { | ||
244 | /* If the VFS inode is being torn down, pause and try again. */ | ||
245 | if (!igrab(inode)) { | ||
246 | trace_xfs_iget_skip(ip); | ||
247 | error = EAGAIN; | ||
248 | goto out_error; | ||
249 | } | ||
250 | |||
251 | /* We've got a live one. */ | ||
252 | spin_unlock(&ip->i_flags_lock); | ||
253 | rcu_read_unlock(); | ||
254 | trace_xfs_iget_hit(ip); | ||
255 | } | ||
256 | |||
257 | if (lock_flags != 0) | ||
258 | xfs_ilock(ip, lock_flags); | ||
259 | |||
260 | xfs_iflags_clear(ip, XFS_ISTALE | XFS_IDONTCACHE); | ||
261 | XFS_STATS_INC(xs_ig_found); | ||
262 | |||
263 | return 0; | ||
264 | |||
265 | out_error: | ||
266 | spin_unlock(&ip->i_flags_lock); | ||
267 | rcu_read_unlock(); | ||
268 | return error; | ||
269 | } | ||
270 | |||
271 | |||
272 | static int | ||
273 | xfs_iget_cache_miss( | ||
274 | struct xfs_mount *mp, | ||
275 | struct xfs_perag *pag, | ||
276 | xfs_trans_t *tp, | ||
277 | xfs_ino_t ino, | ||
278 | struct xfs_inode **ipp, | ||
279 | int flags, | ||
280 | int lock_flags) | ||
281 | { | ||
282 | struct xfs_inode *ip; | ||
283 | int error; | ||
284 | xfs_agino_t agino = XFS_INO_TO_AGINO(mp, ino); | ||
285 | int iflags; | ||
286 | |||
287 | ip = xfs_inode_alloc(mp, ino); | ||
288 | if (!ip) | ||
289 | return ENOMEM; | ||
290 | |||
291 | error = xfs_iread(mp, tp, ip, flags); | ||
292 | if (error) | ||
293 | goto out_destroy; | ||
294 | |||
295 | trace_xfs_iget_miss(ip); | ||
296 | |||
297 | if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) { | ||
298 | error = ENOENT; | ||
299 | goto out_destroy; | ||
300 | } | ||
301 | |||
302 | /* | ||
303 | * Preload the radix tree so we can insert safely under the | ||
304 | * write spinlock. Note that we cannot sleep inside the preload | ||
305 | * region. Since we can be called from transaction context, don't | ||
306 | * recurse into the file system. | ||
307 | */ | ||
308 | if (radix_tree_preload(GFP_NOFS)) { | ||
309 | error = EAGAIN; | ||
310 | goto out_destroy; | ||
311 | } | ||
312 | |||
313 | /* | ||
314 | * Because the inode hasn't been added to the radix-tree yet it can't | ||
315 | * be found by another thread, so we can do the non-sleeping lock here. | ||
316 | */ | ||
317 | if (lock_flags) { | ||
318 | if (!xfs_ilock_nowait(ip, lock_flags)) | ||
319 | BUG(); | ||
320 | } | ||
321 | |||
322 | /* | ||
323 | * These values must be set before inserting the inode into the radix | ||
324 | * tree as the moment it is inserted a concurrent lookup (allowed by the | ||
325 | * RCU locking mechanism) can find it and that lookup must see that this | ||
326 | * is an inode currently under construction (i.e. that XFS_INEW is set). | ||
327 | * The ip->i_flags_lock that protects the XFS_INEW flag forms the | ||
328 | * memory barrier that ensures this detection works correctly at lookup | ||
329 | * time. | ||
330 | */ | ||
331 | iflags = XFS_INEW; | ||
332 | if (flags & XFS_IGET_DONTCACHE) | ||
333 | iflags |= XFS_IDONTCACHE; | ||
334 | ip->i_udquot = ip->i_gdquot = NULL; | ||
335 | xfs_iflags_set(ip, iflags); | ||
336 | |||
337 | /* insert the new inode */ | ||
338 | spin_lock(&pag->pag_ici_lock); | ||
339 | error = radix_tree_insert(&pag->pag_ici_root, agino, ip); | ||
340 | if (unlikely(error)) { | ||
341 | WARN_ON(error != -EEXIST); | ||
342 | XFS_STATS_INC(xs_ig_dup); | ||
343 | error = EAGAIN; | ||
344 | goto out_preload_end; | ||
345 | } | ||
346 | spin_unlock(&pag->pag_ici_lock); | ||
347 | radix_tree_preload_end(); | ||
348 | |||
349 | *ipp = ip; | ||
350 | return 0; | ||
351 | |||
352 | out_preload_end: | ||
353 | spin_unlock(&pag->pag_ici_lock); | ||
354 | radix_tree_preload_end(); | ||
355 | if (lock_flags) | ||
356 | xfs_iunlock(ip, lock_flags); | ||
357 | out_destroy: | ||
358 | __destroy_inode(VFS_I(ip)); | ||
359 | xfs_inode_free(ip); | ||
360 | return error; | ||
361 | } | ||
362 | |||
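The miss path uses the preload-then-insert idiom the comment describes: reserve radix-tree nodes outside the spinlock (GFP_NOFS, since we may be called in transaction context), then insert atomically under it. A generic sketch in negative-errno style; the parameters are hypothetical.

#include <linux/radix-tree.h>
#include <linux/spinlock.h>
#include <linux/gfp.h>
#include <linux/errno.h>

static int demo_insert(struct radix_tree_root *root, spinlock_t *lock,
		       unsigned long index, void *item)
{
	int error;

	if (radix_tree_preload(GFP_NOFS))
		return -ENOMEM;		/* could not reserve nodes */

	spin_lock(lock);
	error = radix_tree_insert(root, index, item);
	spin_unlock(lock);
	radix_tree_preload_end();	/* always pairs with preload */
	return error;
}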
363 | /* | ||
364 | * Look up an inode by number in the given file system. | ||
365 | * The inode is looked up in the cache held in each AG. | ||
366 | * If the inode is found in the cache, initialise the vfs inode | ||
367 | * if necessary. | ||
368 | * | ||
369 | * If it is not in core, read it in from the file system's device, | ||
370 | * add it to the cache and initialise the vfs inode. | ||
371 | * | ||
372 | * The inode is locked according to the value of the lock_flags parameter. | ||
373 | * This flag parameter indicates how and if the inode's IO lock and inode lock | ||
374 | * should be taken. | ||
375 | * | ||
376 | * mp -- the mount point structure for the current file system. It points | ||
377 | * to the inode hash table. | ||
378 | * tp -- a pointer to the current transaction if there is one. This is | ||
379 | * simply passed through to the xfs_iread() call. | ||
380 | * ino -- the number of the inode desired. This is the unique identifier | ||
381 | * within the file system for the inode being requested. | ||
382 | * lock_flags -- flags indicating how to lock the inode. See the comment | ||
383 | * for xfs_ilock() for a list of valid values. | ||
384 | */ | ||
385 | int | ||
386 | xfs_iget( | ||
387 | xfs_mount_t *mp, | ||
388 | xfs_trans_t *tp, | ||
389 | xfs_ino_t ino, | ||
390 | uint flags, | ||
391 | uint lock_flags, | ||
392 | xfs_inode_t **ipp) | ||
393 | { | ||
394 | xfs_inode_t *ip; | ||
395 | int error; | ||
396 | xfs_perag_t *pag; | ||
397 | xfs_agino_t agino; | ||
398 | |||
399 | /* | ||
400 | * xfs_reclaim_inode() uses the ILOCK to ensure an inode | ||
401 | * doesn't get freed while it's being referenced during a | ||
402 | * radix tree traversal here. It assumes this function | ||
403 | * acquires only the ILOCK (and therefore it has no need to | ||
404 | * involve the IOLOCK in this synchronization). | ||
405 | */ | ||
406 | ASSERT((lock_flags & (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED)) == 0); | ||
407 | |||
408 | /* reject inode numbers outside existing AGs */ | ||
409 | if (!ino || XFS_INO_TO_AGNO(mp, ino) >= mp->m_sb.sb_agcount) | ||
410 | return EINVAL; | ||
411 | |||
412 | /* get the perag structure and ensure that it's inode capable */ | ||
413 | pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino)); | ||
414 | agino = XFS_INO_TO_AGINO(mp, ino); | ||
415 | |||
416 | again: | ||
417 | error = 0; | ||
418 | rcu_read_lock(); | ||
419 | ip = radix_tree_lookup(&pag->pag_ici_root, agino); | ||
420 | |||
421 | if (ip) { | ||
422 | error = xfs_iget_cache_hit(pag, ip, ino, flags, lock_flags); | ||
423 | if (error) | ||
424 | goto out_error_or_again; | ||
425 | } else { | ||
426 | rcu_read_unlock(); | ||
427 | XFS_STATS_INC(xs_ig_missed); | ||
428 | |||
429 | error = xfs_iget_cache_miss(mp, pag, tp, ino, &ip, | ||
430 | flags, lock_flags); | ||
431 | if (error) | ||
432 | goto out_error_or_again; | ||
433 | } | ||
434 | xfs_perag_put(pag); | ||
435 | |||
436 | *ipp = ip; | ||
437 | |||
438 | /* | ||
439 | * If we have a real type for an on-disk inode, we can set ops(&unlock) | ||
440 | * now. If it's a new inode being created, xfs_ialloc will handle it. | ||
441 | */ | ||
442 | if (xfs_iflags_test(ip, XFS_INEW) && ip->i_d.di_mode != 0) | ||
443 | xfs_setup_inode(ip); | ||
444 | return 0; | ||
445 | |||
446 | out_error_or_again: | ||
447 | if (error == EAGAIN) { | ||
448 | delay(1); | ||
449 | goto again; | ||
450 | } | ||
451 | xfs_perag_put(pag); | ||
452 | return error; | ||
453 | } | ||
454 | |||
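For reference, a hypothetical caller of xfs_iget() following the rules in the comment block above: no IOLOCK flags, an optional transaction, and a reference that must be dropped when done.

struct xfs_inode	*ip;
int			error;

/* look the inode up outside a transaction, taking the ilock shared */
error = xfs_iget(mp, NULL, ino, 0, XFS_ILOCK_SHARED, &ip);
if (error)
	return error;

/* ... read-only access to the inode ... */

xfs_iunlock(ip, XFS_ILOCK_SHARED);
IRELE(ip);			/* drop the reference xfs_iget took */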
455 | /* | ||
456 | * This is a wrapper routine around the xfs_ilock() routine | ||
457 | * used to centralize some grungy code. It is used in places | ||
458 | * that wish to lock the inode solely for reading the extents. | ||
459 | * The reason these places can't just call xfs_ilock(SHARED) | ||
460 | * is that the inode lock also guards the bringing in of the | ||
461 | * extents from disk for a file in b-tree format. If the inode | ||
462 | * is in b-tree format, then we need to lock the inode exclusively | ||
463 | * until the extents are read in. Locking it exclusively all | ||
464 | * the time would limit our parallelism unnecessarily, though. | ||
465 | * What we do instead is check to see if the extents have been | ||
466 | * read in yet, and only lock the inode exclusively if they | ||
467 | * have not. | ||
468 | * | ||
469 | * The function returns a value which should be given to the | ||
470 | * corresponding xfs_iunlock_map_shared(). This value is | ||
471 | * the mode in which the lock was actually taken. | ||
472 | */ | ||
473 | uint | ||
474 | xfs_ilock_map_shared( | ||
475 | xfs_inode_t *ip) | ||
476 | { | ||
477 | uint lock_mode; | ||
478 | |||
479 | if ((ip->i_d.di_format == XFS_DINODE_FMT_BTREE) && | ||
480 | ((ip->i_df.if_flags & XFS_IFEXTENTS) == 0)) { | ||
481 | lock_mode = XFS_ILOCK_EXCL; | ||
482 | } else { | ||
483 | lock_mode = XFS_ILOCK_SHARED; | ||
484 | } | ||
485 | |||
486 | xfs_ilock(ip, lock_mode); | ||
487 | |||
488 | return lock_mode; | ||
489 | } | ||
490 | |||
491 | /* | ||
492 | * This is simply the unlock routine to go with xfs_ilock_map_shared(). | ||
493 | * All it does is call xfs_iunlock() with the given lock_mode. | ||
494 | */ | ||
495 | void | ||
496 | xfs_iunlock_map_shared( | ||
497 | xfs_inode_t *ip, | ||
498 | unsigned int lock_mode) | ||
499 | { | ||
500 | xfs_iunlock(ip, lock_mode); | ||
501 | } | ||
502 | |||
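Usage of the pair is symmetric: the caller keeps whatever mode was returned and hands it back on unlock. A short sketch:

uint	lock_mode;

lock_mode = xfs_ilock_map_shared(ip);	/* EXCL only if extents unread */
/* ... read the extent list ... */
xfs_iunlock_map_shared(ip, lock_mode);	/* drop the mode actually taken */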
503 | /* | ||
504 | * The xfs inode contains 2 locks: a multi-reader lock called the | ||
505 | * i_iolock and a multi-reader lock called the i_lock. This routine | ||
506 | * allows either or both of the locks to be obtained. | ||
507 | * | ||
508 | * The 2 locks should always be ordered so that the IO lock is | ||
509 | * obtained first in order to prevent deadlock. | ||
510 | * | ||
511 | * ip -- the inode being locked | ||
512 | * lock_flags -- this parameter indicates the inode's locks | ||
513 | * to be locked. It can be: | ||
514 | * XFS_IOLOCK_SHARED, | ||
515 | * XFS_IOLOCK_EXCL, | ||
516 | * XFS_ILOCK_SHARED, | ||
517 | * XFS_ILOCK_EXCL, | ||
518 | * XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED, | ||
519 | * XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL, | ||
520 | * XFS_IOLOCK_EXCL | XFS_ILOCK_SHARED, | ||
521 | * XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL | ||
522 | */ | ||
523 | void | ||
524 | xfs_ilock( | ||
525 | xfs_inode_t *ip, | ||
526 | uint lock_flags) | ||
527 | { | ||
528 | /* | ||
529 | * You can't set both SHARED and EXCL for the same lock, | ||
530 | * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED, | ||
531 | * and XFS_ILOCK_EXCL are valid values to set in lock_flags. | ||
532 | */ | ||
533 | ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != | ||
534 | (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); | ||
535 | ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != | ||
536 | (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); | ||
537 | ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); | ||
538 | |||
539 | if (lock_flags & XFS_IOLOCK_EXCL) | ||
540 | mrupdate_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags)); | ||
541 | else if (lock_flags & XFS_IOLOCK_SHARED) | ||
542 | mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags)); | ||
543 | |||
544 | if (lock_flags & XFS_ILOCK_EXCL) | ||
545 | mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags)); | ||
546 | else if (lock_flags & XFS_ILOCK_SHARED) | ||
547 | mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags)); | ||
548 | |||
549 | trace_xfs_ilock(ip, lock_flags, _RET_IP_); | ||
550 | } | ||
551 | |||
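A sketch of the documented ordering for callers that need both locks: the IO lock is acquired first (xfs_ilock() itself takes them in that order when both flags are passed), and unlock takes the same flags.

xfs_ilock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
/* ... modify file data and inode metadata ... */
xfs_iunlock(ip, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);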
552 | /* | ||
553 | * This is just like xfs_ilock(), except that the caller | ||
554 | * is guaranteed not to sleep. It returns 1 if it gets | ||
555 | * the requested locks and 0 otherwise. If the IO lock is | ||
556 | * obtained but the inode lock cannot be, then the IO lock | ||
557 | * is dropped before returning. | ||
558 | * | ||
559 | * ip -- the inode being locked | ||
560 | * lock_flags -- this parameter indicates the inode's locks | ||
561 | * to be locked. See the comment for xfs_ilock() for a list | ||
562 | * of valid values. | ||
563 | */ | ||
564 | int | ||
565 | xfs_ilock_nowait( | ||
566 | xfs_inode_t *ip, | ||
567 | uint lock_flags) | ||
568 | { | ||
569 | /* | ||
570 | * You can't set both SHARED and EXCL for the same lock, | ||
571 | * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED, | ||
572 | * and XFS_ILOCK_EXCL are valid values to set in lock_flags. | ||
573 | */ | ||
574 | ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != | ||
575 | (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); | ||
576 | ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != | ||
577 | (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); | ||
578 | ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); | ||
579 | |||
580 | if (lock_flags & XFS_IOLOCK_EXCL) { | ||
581 | if (!mrtryupdate(&ip->i_iolock)) | ||
582 | goto out; | ||
583 | } else if (lock_flags & XFS_IOLOCK_SHARED) { | ||
584 | if (!mrtryaccess(&ip->i_iolock)) | ||
585 | goto out; | ||
586 | } | ||
587 | if (lock_flags & XFS_ILOCK_EXCL) { | ||
588 | if (!mrtryupdate(&ip->i_lock)) | ||
589 | goto out_undo_iolock; | ||
590 | } else if (lock_flags & XFS_ILOCK_SHARED) { | ||
591 | if (!mrtryaccess(&ip->i_lock)) | ||
592 | goto out_undo_iolock; | ||
593 | } | ||
594 | trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_); | ||
595 | return 1; | ||
596 | |||
597 | out_undo_iolock: | ||
598 | if (lock_flags & XFS_IOLOCK_EXCL) | ||
599 | mrunlock_excl(&ip->i_iolock); | ||
600 | else if (lock_flags & XFS_IOLOCK_SHARED) | ||
601 | mrunlock_shared(&ip->i_iolock); | ||
602 | out: | ||
603 | return 0; | ||
604 | } | ||
605 | |||
606 | /* | ||
607 | * xfs_iunlock() is used to drop the inode locks acquired with | ||
608 | * xfs_ilock() and xfs_ilock_nowait(). The caller must pass | ||
609 | * in the flags given to xfs_ilock() or xfs_ilock_nowait() so | ||
610 | * that we know which locks to drop. | ||
611 | * | ||
612 | * ip -- the inode being unlocked | ||
613 | * lock_flags -- this parameter indicates the inode's locks | ||
614 | * to be unlocked. See the comment for xfs_ilock() for a list | ||
615 | * of valid values for this parameter. | ||
616 | * | ||
617 | */ | ||
618 | void | ||
619 | xfs_iunlock( | ||
620 | xfs_inode_t *ip, | ||
621 | uint lock_flags) | ||
622 | { | ||
623 | /* | ||
624 | * You can't set both SHARED and EXCL for the same lock, | ||
625 | * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED, | ||
626 | * and XFS_ILOCK_EXCL are valid values to set in lock_flags. | ||
627 | */ | ||
628 | ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != | ||
629 | (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); | ||
630 | ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != | ||
631 | (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); | ||
632 | ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); | ||
633 | ASSERT(lock_flags != 0); | ||
634 | |||
635 | if (lock_flags & XFS_IOLOCK_EXCL) | ||
636 | mrunlock_excl(&ip->i_iolock); | ||
637 | else if (lock_flags & XFS_IOLOCK_SHARED) | ||
638 | mrunlock_shared(&ip->i_iolock); | ||
639 | |||
640 | if (lock_flags & XFS_ILOCK_EXCL) | ||
641 | mrunlock_excl(&ip->i_lock); | ||
642 | else if (lock_flags & XFS_ILOCK_SHARED) | ||
643 | mrunlock_shared(&ip->i_lock); | ||
644 | |||
645 | trace_xfs_iunlock(ip, lock_flags, _RET_IP_); | ||
646 | } | ||
647 | |||
648 | /* | ||
649 | * Give up write locks. The i/o lock cannot be held nested | ||
650 | * if it is being demoted. | ||
651 | */ | ||
652 | void | ||
653 | xfs_ilock_demote( | ||
654 | xfs_inode_t *ip, | ||
655 | uint lock_flags) | ||
656 | { | ||
657 | ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)); | ||
658 | ASSERT((lock_flags & ~(XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)) == 0); | ||
659 | |||
660 | if (lock_flags & XFS_ILOCK_EXCL) | ||
661 | mrdemote(&ip->i_lock); | ||
662 | if (lock_flags & XFS_IOLOCK_EXCL) | ||
663 | mrdemote(&ip->i_iolock); | ||
664 | |||
665 | trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_); | ||
666 | } | ||
667 | |||
668 | #ifdef DEBUG | ||
669 | int | ||
670 | xfs_isilocked( | ||
671 | xfs_inode_t *ip, | ||
672 | uint lock_flags) | ||
673 | { | ||
674 | if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) { | ||
675 | if (!(lock_flags & XFS_ILOCK_SHARED)) | ||
676 | return !!ip->i_lock.mr_writer; | ||
677 | return rwsem_is_locked(&ip->i_lock.mr_lock); | ||
678 | } | ||
679 | |||
680 | if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) { | ||
681 | if (!(lock_flags & XFS_IOLOCK_SHARED)) | ||
682 | return !!ip->i_iolock.mr_writer; | ||
683 | return rwsem_is_locked(&ip->i_iolock.mr_lock); | ||
684 | } | ||
685 | |||
686 | ASSERT(0); | ||
687 | return 0; | ||
688 | } | ||
689 | #endif | ||
690 | |||
691 | void | ||
692 | __xfs_iflock( | ||
693 | struct xfs_inode *ip) | ||
694 | { | ||
695 | wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT); | ||
696 | DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT); | ||
697 | |||
698 | do { | ||
699 | prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE); | ||
700 | if (xfs_isiflocked(ip)) | ||
701 | io_schedule(); | ||
702 | } while (!xfs_iflock_nowait(ip)); | ||
703 | |||
704 | finish_wait(wq, &wait.wait); | ||
705 | } | ||
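__xfs_iflock() is the open-coded wait-on-bit loop: sleep on the bit's waitqueue until woken, then retry the non-blocking acquire. The same shape in a generic sketch over a hypothetical flags word:

#include <linux/wait.h>
#include <linux/sched.h>
#include <linux/bitops.h>

static void demo_lock_bit(unsigned long *flags, int bit)
{
	wait_queue_head_t *wq = bit_waitqueue(flags, bit);
	DEFINE_WAIT_BIT(wait, flags, bit);

	do {
		prepare_to_wait_exclusive(wq, &wait.wait,
					  TASK_UNINTERRUPTIBLE);
		if (test_bit(bit, flags))
			io_schedule();	/* holder still has it; sleep */
	} while (test_and_set_bit(bit, flags));	/* 0 => we acquired it */

	finish_wait(wq, &wait.wait);
}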
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 1938b41ee9f5..66282dcb821b 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c | |||
@@ -45,6 +45,7 @@ | |||
45 | #include "xfs_filestream.h" | 45 | #include "xfs_filestream.h" |
46 | #include "xfs_vnodeops.h" | 46 | #include "xfs_vnodeops.h" |
47 | #include "xfs_trace.h" | 47 | #include "xfs_trace.h" |
48 | #include "xfs_icache.h" | ||
48 | 49 | ||
49 | kmem_zone_t *xfs_ifork_zone; | 50 | kmem_zone_t *xfs_ifork_zone; |
50 | kmem_zone_t *xfs_inode_zone; | 51 | kmem_zone_t *xfs_inode_zone; |
@@ -74,6 +75,256 @@ xfs_get_extsz_hint( | |||
74 | return 0; | 75 | return 0; |
75 | } | 76 | } |
76 | 77 | ||
78 | /* | ||
79 | * This is a wrapper routine around the xfs_ilock() routine used to centralize | ||
80 | * some grungy code. It is used in places that wish to lock the inode solely | ||
81 | * for reading the extents. The reason these places can't just call | ||
82 | * xfs_ilock(SHARED) is that the inode lock also guards the bringing in of the | ||
83 | * extents from disk for a file in b-tree format. If the inode is in b-tree | ||
84 | * format, then we need to lock the inode exclusively until the extents are read | ||
85 | * in. Locking it exclusively all the time would limit our parallelism | ||
86 | * unnecessarily, though. What we do instead is check to see if the extents | ||
87 | * have been read in yet, and only lock the inode exclusively if they have not. | ||
88 | * | ||
89 | * The function returns a value which should be given to the corresponding | ||
90 | * xfs_iunlock_map_shared(). This value is the mode in which the lock was | ||
91 | * actually taken. | ||
92 | */ | ||
93 | uint | ||
94 | xfs_ilock_map_shared( | ||
95 | xfs_inode_t *ip) | ||
96 | { | ||
97 | uint lock_mode; | ||
98 | |||
99 | if ((ip->i_d.di_format == XFS_DINODE_FMT_BTREE) && | ||
100 | ((ip->i_df.if_flags & XFS_IFEXTENTS) == 0)) { | ||
101 | lock_mode = XFS_ILOCK_EXCL; | ||
102 | } else { | ||
103 | lock_mode = XFS_ILOCK_SHARED; | ||
104 | } | ||
105 | |||
106 | xfs_ilock(ip, lock_mode); | ||
107 | |||
108 | return lock_mode; | ||
109 | } | ||
110 | |||
111 | /* | ||
112 | * This is simply the unlock routine to go with xfs_ilock_map_shared(). | ||
113 | * All it does is call xfs_iunlock() with the given lock_mode. | ||
114 | */ | ||
115 | void | ||
116 | xfs_iunlock_map_shared( | ||
117 | xfs_inode_t *ip, | ||
118 | unsigned int lock_mode) | ||
119 | { | ||
120 | xfs_iunlock(ip, lock_mode); | ||
121 | } | ||
122 | |||
123 | /* | ||
124 | * The xfs inode contains 2 locks: a multi-reader lock called the | ||
125 | * i_iolock and a multi-reader lock called the i_lock. This routine | ||
126 | * allows either or both of the locks to be obtained. | ||
127 | * | ||
128 | * The 2 locks should always be ordered so that the IO lock is | ||
129 | * obtained first in order to prevent deadlock. | ||
130 | * | ||
131 | * ip -- the inode being locked | ||
132 | * lock_flags -- this parameter indicates the inode's locks | ||
133 | * to be locked. It can be: | ||
134 | * XFS_IOLOCK_SHARED, | ||
135 | * XFS_IOLOCK_EXCL, | ||
136 | * XFS_ILOCK_SHARED, | ||
137 | * XFS_ILOCK_EXCL, | ||
138 | * XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED, | ||
139 | * XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL, | ||
140 | * XFS_IOLOCK_EXCL | XFS_ILOCK_SHARED, | ||
141 | * XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL | ||
142 | */ | ||
143 | void | ||
144 | xfs_ilock( | ||
145 | xfs_inode_t *ip, | ||
146 | uint lock_flags) | ||
147 | { | ||
148 | trace_xfs_ilock(ip, lock_flags, _RET_IP_); | ||
149 | |||
150 | /* | ||
151 | * You can't set both SHARED and EXCL for the same lock, | ||
152 | * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED, | ||
153 | * and XFS_ILOCK_EXCL are valid values to set in lock_flags. | ||
154 | */ | ||
155 | ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != | ||
156 | (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); | ||
157 | ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != | ||
158 | (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); | ||
159 | ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); | ||
160 | |||
161 | if (lock_flags & XFS_IOLOCK_EXCL) | ||
162 | mrupdate_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags)); | ||
163 | else if (lock_flags & XFS_IOLOCK_SHARED) | ||
164 | mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags)); | ||
165 | |||
166 | if (lock_flags & XFS_ILOCK_EXCL) | ||
167 | mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags)); | ||
168 | else if (lock_flags & XFS_ILOCK_SHARED) | ||
169 | mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags)); | ||
170 | } | ||
171 | |||
172 | /* | ||
173 | * This is just like xfs_ilock(), except that the caller | ||
174 | * is guaranteed not to sleep. It returns 1 if it gets | ||
175 | * the requested locks and 0 otherwise. If the IO lock is | ||
176 | * obtained but the inode lock cannot be, then the IO lock | ||
177 | * is dropped before returning. | ||
178 | * | ||
179 | * ip -- the inode being locked | ||
180 | * lock_flags -- this parameter indicates the inode's locks | ||
181 | * to be locked. See the comment for xfs_ilock() for a list | ||
182 | * of valid values. | ||
183 | */ | ||
184 | int | ||
185 | xfs_ilock_nowait( | ||
186 | xfs_inode_t *ip, | ||
187 | uint lock_flags) | ||
188 | { | ||
189 | trace_xfs_ilock_nowait(ip, lock_flags, _RET_IP_); | ||
190 | |||
191 | /* | ||
192 | * You can't set both SHARED and EXCL for the same lock, | ||
193 | * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED, | ||
194 | * and XFS_ILOCK_EXCL are valid values to set in lock_flags. | ||
195 | */ | ||
196 | ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != | ||
197 | (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); | ||
198 | ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != | ||
199 | (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); | ||
200 | ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); | ||
201 | |||
202 | if (lock_flags & XFS_IOLOCK_EXCL) { | ||
203 | if (!mrtryupdate(&ip->i_iolock)) | ||
204 | goto out; | ||
205 | } else if (lock_flags & XFS_IOLOCK_SHARED) { | ||
206 | if (!mrtryaccess(&ip->i_iolock)) | ||
207 | goto out; | ||
208 | } | ||
209 | if (lock_flags & XFS_ILOCK_EXCL) { | ||
210 | if (!mrtryupdate(&ip->i_lock)) | ||
211 | goto out_undo_iolock; | ||
212 | } else if (lock_flags & XFS_ILOCK_SHARED) { | ||
213 | if (!mrtryaccess(&ip->i_lock)) | ||
214 | goto out_undo_iolock; | ||
215 | } | ||
216 | return 1; | ||
217 | |||
218 | out_undo_iolock: | ||
219 | if (lock_flags & XFS_IOLOCK_EXCL) | ||
220 | mrunlock_excl(&ip->i_iolock); | ||
221 | else if (lock_flags & XFS_IOLOCK_SHARED) | ||
222 | mrunlock_shared(&ip->i_iolock); | ||
223 | out: | ||
224 | return 0; | ||
225 | } | ||
226 | |||
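
Because xfs_ilock_nowait() drops the IO lock itself when the inode lock cannot be taken, a caller never has partial-lock cleanup to do. A sketch of the non-blocking pattern (hypothetical caller; XFS's internal positive-errno convention assumed):

    static int example_try_read(struct xfs_inode *ip)
    {
            if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED))
                    return EAGAIN;  /* nothing is held on failure */
            /* ... sample inode state under both shared locks ... */
            xfs_iunlock(ip, XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED);
            return 0;
    }
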
227 | /* | ||
228 | * xfs_iunlock() is used to drop the inode locks acquired with | ||
229 | * xfs_ilock() and xfs_ilock_nowait(). The caller must pass | ||
230 | * in the flags given to xfs_ilock() or xfs_ilock_nowait() so | ||
231 | * that we know which locks to drop. | ||
232 | * | ||
233 | * ip -- the inode being unlocked | ||
234 | * lock_flags -- this parameter indicates the inode's locks | ||
235 | * to be unlocked. See the comment for xfs_ilock() for a list | ||
236 | * of valid values for this parameter. | ||
237 | * | ||
238 | */ | ||
239 | void | ||
240 | xfs_iunlock( | ||
241 | xfs_inode_t *ip, | ||
242 | uint lock_flags) | ||
243 | { | ||
244 | /* | ||
245 | * You can't set both SHARED and EXCL for the same lock, | ||
246 | * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED, | ||
247 | * and XFS_ILOCK_EXCL are valid values to set in lock_flags. | ||
248 | */ | ||
249 | ASSERT((lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) != | ||
250 | (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)); | ||
251 | ASSERT((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) != | ||
252 | (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); | ||
253 | ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); | ||
254 | ASSERT(lock_flags != 0); | ||
255 | |||
256 | if (lock_flags & XFS_IOLOCK_EXCL) | ||
257 | mrunlock_excl(&ip->i_iolock); | ||
258 | else if (lock_flags & XFS_IOLOCK_SHARED) | ||
259 | mrunlock_shared(&ip->i_iolock); | ||
260 | |||
261 | if (lock_flags & XFS_ILOCK_EXCL) | ||
262 | mrunlock_excl(&ip->i_lock); | ||
263 | else if (lock_flags & XFS_ILOCK_SHARED) | ||
264 | mrunlock_shared(&ip->i_lock); | ||
265 | |||
266 | trace_xfs_iunlock(ip, lock_flags, _RET_IP_); | ||
267 | } | ||
268 | |||
269 | /* | ||
270 | * Give up write locks. The i/o lock cannot be held nested | ||
271 | * if it is being demoted. | ||
272 | */ | ||
273 | void | ||
274 | xfs_ilock_demote( | ||
275 | xfs_inode_t *ip, | ||
276 | uint lock_flags) | ||
277 | { | ||
278 | ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)); | ||
279 | ASSERT((lock_flags & ~(XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)) == 0); | ||
280 | |||
281 | if (lock_flags & XFS_ILOCK_EXCL) | ||
282 | mrdemote(&ip->i_lock); | ||
283 | if (lock_flags & XFS_IOLOCK_EXCL) | ||
284 | mrdemote(&ip->i_iolock); | ||
285 | |||
286 | trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_); | ||
287 | } | ||
288 | |||
289 | #ifdef DEBUG | ||
290 | int | ||
291 | xfs_isilocked( | ||
292 | xfs_inode_t *ip, | ||
293 | uint lock_flags) | ||
294 | { | ||
295 | if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) { | ||
296 | if (!(lock_flags & XFS_ILOCK_SHARED)) | ||
297 | return !!ip->i_lock.mr_writer; | ||
298 | return rwsem_is_locked(&ip->i_lock.mr_lock); | ||
299 | } | ||
300 | |||
301 | if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) { | ||
302 | if (!(lock_flags & XFS_IOLOCK_SHARED)) | ||
303 | return !!ip->i_iolock.mr_writer; | ||
304 | return rwsem_is_locked(&ip->i_iolock.mr_lock); | ||
305 | } | ||
306 | |||
307 | ASSERT(0); | ||
308 | return 0; | ||
309 | } | ||
310 | #endif | ||
311 | |||
312 | void | ||
313 | __xfs_iflock( | ||
314 | struct xfs_inode *ip) | ||
315 | { | ||
316 | wait_queue_head_t *wq = bit_waitqueue(&ip->i_flags, __XFS_IFLOCK_BIT); | ||
317 | DEFINE_WAIT_BIT(wait, &ip->i_flags, __XFS_IFLOCK_BIT); | ||
318 | |||
319 | do { | ||
320 | prepare_to_wait_exclusive(wq, &wait.wait, TASK_UNINTERRUPTIBLE); | ||
321 | if (xfs_isiflocked(ip)) | ||
322 | io_schedule(); | ||
323 | } while (!xfs_iflock_nowait(ip)); | ||
324 | |||
325 | finish_wait(wq, &wait.wait); | ||
326 | } | ||
327 | |||
77 | #ifdef DEBUG | 328 | #ifdef DEBUG |
78 | /* | 329 | /* |
79 | * Make sure that the extents in the given memory buffer | 330 | * Make sure that the extents in the given memory buffer |
@@ -131,6 +382,65 @@ xfs_inobp_check( | |||
131 | } | 382 | } |
132 | #endif | 383 | #endif |
133 | 384 | ||
385 | static void | ||
386 | xfs_inode_buf_verify( | ||
387 | struct xfs_buf *bp) | ||
388 | { | ||
389 | struct xfs_mount *mp = bp->b_target->bt_mount; | ||
390 | int i; | ||
391 | int ni; | ||
392 | |||
393 | /* | ||
394 | * Validate the magic number and version of every inode in the buffer | ||
395 | */ | ||
396 | ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock; | ||
397 | for (i = 0; i < ni; i++) { | ||
398 | int di_ok; | ||
399 | xfs_dinode_t *dip; | ||
400 | |||
401 | dip = (struct xfs_dinode *)xfs_buf_offset(bp, | ||
402 | (i << mp->m_sb.sb_inodelog)); | ||
403 | di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) && | ||
404 | XFS_DINODE_GOOD_VERSION(dip->di_version); | ||
405 | if (unlikely(XFS_TEST_ERROR(!di_ok, mp, | ||
406 | XFS_ERRTAG_ITOBP_INOTOBP, | ||
407 | XFS_RANDOM_ITOBP_INOTOBP))) { | ||
408 | xfs_buf_ioerror(bp, EFSCORRUPTED); | ||
409 | XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_HIGH, | ||
410 | mp, dip); | ||
411 | #ifdef DEBUG | ||
412 | xfs_emerg(mp, | ||
413 | "bad inode magic/vsn daddr %lld #%d (magic=%x)", | ||
414 | (unsigned long long)bp->b_bn, i, | ||
415 | be16_to_cpu(dip->di_magic)); | ||
416 | ASSERT(0); | ||
417 | #endif | ||
418 | } | ||
419 | } | ||
420 | xfs_inobp_check(mp, bp); | ||
421 | } | ||
422 | |||
423 | |||
424 | static void | ||
425 | xfs_inode_buf_read_verify( | ||
426 | struct xfs_buf *bp) | ||
427 | { | ||
428 | xfs_inode_buf_verify(bp); | ||
429 | } | ||
430 | |||
431 | static void | ||
432 | xfs_inode_buf_write_verify( | ||
433 | struct xfs_buf *bp) | ||
434 | { | ||
435 | xfs_inode_buf_verify(bp); | ||
436 | } | ||
437 | |||
438 | const struct xfs_buf_ops xfs_inode_buf_ops = { | ||
439 | .verify_read = xfs_inode_buf_read_verify, | ||
440 | .verify_write = xfs_inode_buf_write_verify, | ||
441 | }; | ||
442 | |||
443 | |||
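
The ops table above is the general shape of the buffer verifier mechanism this series introduces: the read verifier runs after IO completion, the write verifier before submission, and both report problems via xfs_buf_ioerror(). A hedged sketch of the same shape for some other metadata type (all names hypothetical):

    static void example_buf_verify(struct xfs_buf *bp)
    {
            /*
             * Validate magic, version and internal consistency of the
             * on-disk structure; on failure, flag the buffer with
             * xfs_buf_ioerror(bp, EFSCORRUPTED) as done above.
             */
    }

    static void example_buf_read_verify(struct xfs_buf *bp)
    {
            example_buf_verify(bp);
    }

    static void example_buf_write_verify(struct xfs_buf *bp)
    {
            example_buf_verify(bp);
    }

    const struct xfs_buf_ops example_buf_ops = {
            .verify_read = example_buf_read_verify,
            .verify_write = example_buf_write_verify,
    };
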
134 | /* | 444 | /* |
135 | * This routine is called to map an inode to the buffer containing the on-disk | 445 | * This routine is called to map an inode to the buffer containing the on-disk |
136 | * version of the inode. It returns a pointer to the buffer containing the | 446 | * version of the inode. It returns a pointer to the buffer containing the |
@@ -145,71 +455,33 @@ xfs_imap_to_bp( | |||
145 | struct xfs_mount *mp, | 455 | struct xfs_mount *mp, |
146 | struct xfs_trans *tp, | 456 | struct xfs_trans *tp, |
147 | struct xfs_imap *imap, | 457 | struct xfs_imap *imap, |
148 | struct xfs_dinode **dipp, | 458 | struct xfs_dinode **dipp, |
149 | struct xfs_buf **bpp, | 459 | struct xfs_buf **bpp, |
150 | uint buf_flags, | 460 | uint buf_flags, |
151 | uint iget_flags) | 461 | uint iget_flags) |
152 | { | 462 | { |
153 | struct xfs_buf *bp; | 463 | struct xfs_buf *bp; |
154 | int error; | 464 | int error; |
155 | int i; | ||
156 | int ni; | ||
157 | 465 | ||
158 | buf_flags |= XBF_UNMAPPED; | 466 | buf_flags |= XBF_UNMAPPED; |
159 | error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno, | 467 | error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno, |
160 | (int)imap->im_len, buf_flags, &bp); | 468 | (int)imap->im_len, buf_flags, &bp, |
469 | &xfs_inode_buf_ops); | ||
161 | if (error) { | 470 | if (error) { |
162 | if (error != EAGAIN) { | 471 | if (error == EAGAIN) { |
163 | xfs_warn(mp, | ||
164 | "%s: xfs_trans_read_buf() returned error %d.", | ||
165 | __func__, error); | ||
166 | } else { | ||
167 | ASSERT(buf_flags & XBF_TRYLOCK); | 472 | ASSERT(buf_flags & XBF_TRYLOCK); |
473 | return error; | ||
168 | } | 474 | } |
169 | return error; | ||
170 | } | ||
171 | |||
172 | /* | ||
173 | * Validate the magic number and version of every inode in the buffer | ||
174 | * (if DEBUG kernel) or the first inode in the buffer, otherwise. | ||
175 | */ | ||
176 | #ifdef DEBUG | ||
177 | ni = BBTOB(imap->im_len) >> mp->m_sb.sb_inodelog; | ||
178 | #else /* usual case */ | ||
179 | ni = 1; | ||
180 | #endif | ||
181 | 475 | ||
182 | for (i = 0; i < ni; i++) { | 476 | if (error == EFSCORRUPTED && |
183 | int di_ok; | 477 | (iget_flags & XFS_IGET_UNTRUSTED)) |
184 | xfs_dinode_t *dip; | 478 | return XFS_ERROR(EINVAL); |
185 | 479 | ||
186 | dip = (xfs_dinode_t *)xfs_buf_offset(bp, | 480 | xfs_warn(mp, "%s: xfs_trans_read_buf() returned error %d.", |
187 | (i << mp->m_sb.sb_inodelog)); | 481 | __func__, error); |
188 | di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) && | 482 | return error; |
189 | XFS_DINODE_GOOD_VERSION(dip->di_version); | ||
190 | if (unlikely(XFS_TEST_ERROR(!di_ok, mp, | ||
191 | XFS_ERRTAG_ITOBP_INOTOBP, | ||
192 | XFS_RANDOM_ITOBP_INOTOBP))) { | ||
193 | if (iget_flags & XFS_IGET_UNTRUSTED) { | ||
194 | xfs_trans_brelse(tp, bp); | ||
195 | return XFS_ERROR(EINVAL); | ||
196 | } | ||
197 | XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_HIGH, | ||
198 | mp, dip); | ||
199 | #ifdef DEBUG | ||
200 | xfs_emerg(mp, | ||
201 | "bad inode magic/vsn daddr %lld #%d (magic=%x)", | ||
202 | (unsigned long long)imap->im_blkno, i, | ||
203 | be16_to_cpu(dip->di_magic)); | ||
204 | ASSERT(0); | ||
205 | #endif | ||
206 | xfs_trans_brelse(tp, bp); | ||
207 | return XFS_ERROR(EFSCORRUPTED); | ||
208 | } | ||
209 | } | 483 | } |
210 | 484 | ||
211 | xfs_inobp_check(mp, bp); | ||
212 | |||
213 | *bpp = bp; | 485 | *bpp = bp; |
214 | *dipp = (struct xfs_dinode *)xfs_buf_offset(bp, imap->im_boffset); | 486 | *dipp = (struct xfs_dinode *)xfs_buf_offset(bp, imap->im_boffset); |
215 | return 0; | 487 | return 0; |
@@ -853,16 +1125,16 @@ xfs_iread_extents( | |||
853 | * set according to the contents of the given cred structure. | 1125 | * set according to the contents of the given cred structure. |
854 | * | 1126 | * |
855 | * Use xfs_dialloc() to allocate the on-disk inode. If xfs_dialloc() | 1127 | * Use xfs_dialloc() to allocate the on-disk inode. If xfs_dialloc() |
856 | * has a free inode available, call xfs_iget() | 1128 | * has a free inode available, call xfs_iget() to obtain the in-core |
857 | * to obtain the in-core version of the allocated inode. Finally, | 1129 | * version of the allocated inode. Finally, fill in the inode and |
858 | * fill in the inode and log its initial contents. In this case, | 1130 | * log its initial contents. In this case, ialloc_context would be |
859 | * ialloc_context would be set to NULL and call_again set to false. | 1131 | * set to NULL. |
860 | * | 1132 | * |
861 | * If xfs_dialloc() does not have an available inode, | 1133 | * If xfs_dialloc() does not have an available inode, it will replenish |
862 | * it will replenish its supply by doing an allocation. Since we can | 1134 | * its supply by doing an allocation. Since we can only do one |
863 | * only do one allocation within a transaction without deadlocks, we | 1135 | * allocation within a transaction without deadlocks, we must commit |
864 | * must commit the current transaction before returning the inode itself. | 1136 | * the current transaction before returning the inode itself. |
865 | * In this case, therefore, we will set call_again to true and return. | 1137 | * In this case, therefore, we will set ialloc_context and return. |
866 | * The caller should then commit the current transaction, start a new | 1138 | * The caller should then commit the current transaction, start a new |
867 | * transaction, and call xfs_ialloc() again to actually get the inode. | 1139 | * transaction, and call xfs_ialloc() again to actually get the inode. |
868 | * | 1140 | * |
@@ -1514,6 +1786,18 @@ xfs_ifree_cluster( | |||
1514 | 1786 | ||
1515 | if (!bp) | 1787 | if (!bp) |
1516 | return ENOMEM; | 1788 | return ENOMEM; |
1789 | |||
1790 | /* | ||
1791 | * This buffer may not have been correctly initialised as we | ||
1792 | * didn't read it from disk. That's not important because we are | ||
1793 | * only using it to mark the buffer as stale in the log, and to | ||
1794 | * attach stale cached inodes on it. That means it will never be | ||
1795 | * dispatched for IO. If it is, we want to know about it, and we | ||
1796 | * want it to fail. We can achieve this by adding a write | ||
1797 | * verifier to the buffer. | ||
1798 | */ | ||
1799 | bp->b_ops = &xfs_inode_buf_ops; | ||
1800 | |||
1517 | /* | 1801 | /* |
1518 | * Walk the inodes already attached to the buffer and mark them | 1802 | * Walk the inodes already attached to the buffer and mark them |
1519 | * stale. These will all have the flush locks held, so an | 1803 | * stale. These will all have the flush locks held, so an |
@@ -3661,3 +3945,40 @@ xfs_iext_irec_update_extoffs( | |||
3661 | ifp->if_u1.if_ext_irec[i].er_extoff += ext_diff; | 3945 | ifp->if_u1.if_ext_irec[i].er_extoff += ext_diff; |
3662 | } | 3946 | } |
3663 | } | 3947 | } |
3948 | |||
3949 | /* | ||
3950 | * Test whether it is appropriate to check an inode for and free post EOF | ||
3951 | * blocks. The 'force' parameter determines whether we should also consider | ||
3952 | * regular files that are marked preallocated or append-only. | ||
3953 | */ | ||
3954 | bool | ||
3955 | xfs_can_free_eofblocks(struct xfs_inode *ip, bool force) | ||
3956 | { | ||
3957 | /* prealloc/delalloc exists only on regular files */ | ||
3958 | if (!S_ISREG(ip->i_d.di_mode)) | ||
3959 | return false; | ||
3960 | |||
3961 | /* | ||
3962 | * Zero sized files with no cached pages and delalloc blocks will not | ||
3963 | * have speculative prealloc/delalloc blocks to remove. | ||
3964 | */ | ||
3965 | if (VFS_I(ip)->i_size == 0 && | ||
3966 | VN_CACHED(VFS_I(ip)) == 0 && | ||
3967 | ip->i_delayed_blks == 0) | ||
3968 | return false; | ||
3969 | |||
3970 | /* If we haven't read in the extent list, then don't do it now. */ | ||
3971 | if (!(ip->i_df.if_flags & XFS_IFEXTENTS)) | ||
3972 | return false; | ||
3973 | |||
3974 | /* | ||
3975 | * Do not free real preallocated or append-only files unless the file | ||
3976 | * has delalloc blocks and we are forced to remove them. | ||
3977 | */ | ||
3978 | if (ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) | ||
3979 | if (!force || ip->i_delayed_blks == 0) | ||
3980 | return false; | ||
3981 | |||
3982 | return true; | ||
3983 | } | ||
3984 | |||
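
A hedged sketch of how a caller would use this predicate (the caller shown is hypothetical; the real consumers are the EOF-blocks scanner and inactivation paths added elsewhere in this series):

    static void example_trim_inode(struct xfs_inode *ip, bool force)
    {
            /*
             * force=true also considers PREALLOC/APPEND files, but only
             * when they still carry delalloc blocks, per the test above.
             */
            if (!xfs_can_free_eofblocks(ip, force))
                    return;
            /* ... free the speculative post-EOF allocation ... */
    }
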
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 94b32f906e79..22baf6ea4fac 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h | |||
@@ -496,11 +496,10 @@ static inline int xfs_isiflocked(struct xfs_inode *ip) | |||
496 | (((pip)->i_mount->m_flags & XFS_MOUNT_GRPID) || \ | 496 | (((pip)->i_mount->m_flags & XFS_MOUNT_GRPID) || \ |
497 | ((pip)->i_d.di_mode & S_ISGID)) | 497 | ((pip)->i_d.di_mode & S_ISGID)) |
498 | 498 | ||
499 | |||
499 | /* | 500 | /* |
500 | * xfs_iget.c prototypes. | 501 | * xfs_inode.c prototypes. |
501 | */ | 502 | */ |
502 | int xfs_iget(struct xfs_mount *, struct xfs_trans *, xfs_ino_t, | ||
503 | uint, uint, xfs_inode_t **); | ||
504 | void xfs_ilock(xfs_inode_t *, uint); | 503 | void xfs_ilock(xfs_inode_t *, uint); |
505 | int xfs_ilock_nowait(xfs_inode_t *, uint); | 504 | int xfs_ilock_nowait(xfs_inode_t *, uint); |
506 | void xfs_iunlock(xfs_inode_t *, uint); | 505 | void xfs_iunlock(xfs_inode_t *, uint); |
@@ -508,11 +507,6 @@ void xfs_ilock_demote(xfs_inode_t *, uint); | |||
508 | int xfs_isilocked(xfs_inode_t *, uint); | 507 | int xfs_isilocked(xfs_inode_t *, uint); |
509 | uint xfs_ilock_map_shared(xfs_inode_t *); | 508 | uint xfs_ilock_map_shared(xfs_inode_t *); |
510 | void xfs_iunlock_map_shared(xfs_inode_t *, uint); | 509 | void xfs_iunlock_map_shared(xfs_inode_t *, uint); |
511 | void xfs_inode_free(struct xfs_inode *ip); | ||
512 | |||
513 | /* | ||
514 | * xfs_inode.c prototypes. | ||
515 | */ | ||
516 | int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, umode_t, | 510 | int xfs_ialloc(struct xfs_trans *, xfs_inode_t *, umode_t, |
517 | xfs_nlink_t, xfs_dev_t, prid_t, int, | 511 | xfs_nlink_t, xfs_dev_t, prid_t, int, |
518 | struct xfs_buf **, xfs_inode_t **); | 512 | struct xfs_buf **, xfs_inode_t **); |
@@ -591,6 +585,7 @@ void xfs_iext_irec_compact(xfs_ifork_t *); | |||
591 | void xfs_iext_irec_compact_pages(xfs_ifork_t *); | 585 | void xfs_iext_irec_compact_pages(xfs_ifork_t *); |
592 | void xfs_iext_irec_compact_full(xfs_ifork_t *); | 586 | void xfs_iext_irec_compact_full(xfs_ifork_t *); |
593 | void xfs_iext_irec_update_extoffs(xfs_ifork_t *, int, int); | 587 | void xfs_iext_irec_update_extoffs(xfs_ifork_t *, int, int); |
588 | bool xfs_can_free_eofblocks(struct xfs_inode *, bool); | ||
594 | 589 | ||
595 | #define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount)) | 590 | #define xfs_ipincount(ip) ((unsigned int) atomic_read(&ip->i_pincount)) |
596 | 591 | ||
@@ -603,5 +598,6 @@ void xfs_inobp_check(struct xfs_mount *, struct xfs_buf *); | |||
603 | extern struct kmem_zone *xfs_ifork_zone; | 598 | extern struct kmem_zone *xfs_ifork_zone; |
604 | extern struct kmem_zone *xfs_inode_zone; | 599 | extern struct kmem_zone *xfs_inode_zone; |
605 | extern struct kmem_zone *xfs_ili_zone; | 600 | extern struct kmem_zone *xfs_ili_zone; |
601 | extern const struct xfs_buf_ops xfs_inode_buf_ops; | ||
606 | 602 | ||
607 | #endif /* __XFS_INODE_H__ */ | 603 | #endif /* __XFS_INODE_H__ */ |
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index c1df3c623de2..c1c3ef88a260 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c | |||
@@ -42,6 +42,7 @@ | |||
42 | #include "xfs_inode_item.h" | 42 | #include "xfs_inode_item.h" |
43 | #include "xfs_export.h" | 43 | #include "xfs_export.h" |
44 | #include "xfs_trace.h" | 44 | #include "xfs_trace.h" |
45 | #include "xfs_icache.h" | ||
45 | 46 | ||
46 | #include <linux/capability.h> | 47 | #include <linux/capability.h> |
47 | #include <linux/dcache.h> | 48 | #include <linux/dcache.h> |
@@ -1602,6 +1603,26 @@ xfs_file_ioctl( | |||
1602 | error = xfs_errortag_clearall(mp, 1); | 1603 | error = xfs_errortag_clearall(mp, 1); |
1603 | return -error; | 1604 | return -error; |
1604 | 1605 | ||
1606 | case XFS_IOC_FREE_EOFBLOCKS: { | ||
1607 | struct xfs_eofblocks eofb; | ||
1608 | |||
1609 | if (copy_from_user(&eofb, arg, sizeof(eofb))) | ||
1610 | return -XFS_ERROR(EFAULT); | ||
1611 | |||
1612 | if (eofb.eof_version != XFS_EOFBLOCKS_VERSION) | ||
1613 | return -XFS_ERROR(EINVAL); | ||
1614 | |||
1615 | if (eofb.eof_flags & ~XFS_EOF_FLAGS_VALID) | ||
1616 | return -XFS_ERROR(EINVAL); | ||
1617 | |||
1618 | if (memchr_inv(&eofb.pad32, 0, sizeof(eofb.pad32)) || | ||
1619 | memchr_inv(eofb.pad64, 0, sizeof(eofb.pad64))) | ||
1620 | return -XFS_ERROR(EINVAL); | ||
1621 | |||
1622 | error = xfs_icache_free_eofblocks(mp, &eofb); | ||
1623 | return -error; | ||
1624 | } | ||
1625 | |||
1605 | default: | 1626 | default: |
1606 | return -ENOTTY; | 1627 | return -ENOTTY; |
1607 | } | 1628 | } |
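
From userspace the new ioctl is driven with a zeroed, versioned structure. The field and constant names below come straight from the validation code above, but the header exporting them to userspace is an assumption, so treat this as a sketch:

    #include <fcntl.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <xfs/xfs_fs.h>         /* assumed to export the new ABI */

    static int free_eofblocks(const char *path)
    {
            struct xfs_eofblocks eofb;
            int fd, ret;

            fd = open(path, O_RDONLY);
            if (fd < 0)
                    return -1;
            memset(&eofb, 0, sizeof(eofb)); /* pad fields must be zero */
            eofb.eof_version = XFS_EOFBLOCKS_VERSION;
            eofb.eof_flags = 0;             /* no filtering: scan everything */
            ret = ioctl(fd, XFS_IOC_FREE_EOFBLOCKS, &eofb);
            close(fd);
            return ret;
    }
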
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 7f537663365b..add06b4e9a63 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c | |||
@@ -41,6 +41,7 @@ | |||
41 | #include "xfs_utils.h" | 41 | #include "xfs_utils.h" |
42 | #include "xfs_iomap.h" | 42 | #include "xfs_iomap.h" |
43 | #include "xfs_trace.h" | 43 | #include "xfs_trace.h" |
44 | #include "xfs_icache.h" | ||
44 | 45 | ||
45 | 46 | ||
46 | #define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \ | 47 | #define XFS_WRITEIO_ALIGN(mp,off) (((off) >> mp->m_writeio_log) \ |
@@ -373,7 +374,7 @@ xfs_iomap_write_delay( | |||
373 | xfs_extlen_t extsz; | 374 | xfs_extlen_t extsz; |
374 | int nimaps; | 375 | int nimaps; |
375 | xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS]; | 376 | xfs_bmbt_irec_t imap[XFS_WRITE_IMAPS]; |
376 | int prealloc, flushed = 0; | 377 | int prealloc; |
377 | int error; | 378 | int error; |
378 | 379 | ||
379 | ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); | 380 | ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); |
@@ -434,31 +435,29 @@ retry: | |||
434 | } | 435 | } |
435 | 436 | ||
436 | /* | 437 | /* |
437 | * If bmapi returned us nothing, we got either ENOSPC or EDQUOT. For | 438 | * If bmapi returned us nothing, we got either ENOSPC or EDQUOT. Retry |
438 | * ENOSPC, * flush all other inodes with delalloc blocks to free up | ||
439 | * some of the excess reserved metadata space. For both cases, retry | ||
440 | * without EOF preallocation. | 439 | * without EOF preallocation. |
441 | */ | 440 | */ |
442 | if (nimaps == 0) { | 441 | if (nimaps == 0) { |
443 | trace_xfs_delalloc_enospc(ip, offset, count); | 442 | trace_xfs_delalloc_enospc(ip, offset, count); |
444 | if (flushed) | 443 | if (prealloc) { |
445 | return XFS_ERROR(error ? error : ENOSPC); | 444 | prealloc = 0; |
446 | 445 | error = 0; | |
447 | if (error == ENOSPC) { | 446 | goto retry; |
448 | xfs_iunlock(ip, XFS_ILOCK_EXCL); | ||
449 | xfs_flush_inodes(ip); | ||
450 | xfs_ilock(ip, XFS_ILOCK_EXCL); | ||
451 | } | 447 | } |
452 | 448 | return XFS_ERROR(error ? error : ENOSPC); | |
453 | flushed = 1; | ||
454 | error = 0; | ||
455 | prealloc = 0; | ||
456 | goto retry; | ||
457 | } | 449 | } |
458 | 450 | ||
459 | if (!(imap[0].br_startblock || XFS_IS_REALTIME_INODE(ip))) | 451 | if (!(imap[0].br_startblock || XFS_IS_REALTIME_INODE(ip))) |
460 | return xfs_alert_fsblock_zero(ip, &imap[0]); | 452 | return xfs_alert_fsblock_zero(ip, &imap[0]); |
461 | 453 | ||
454 | /* | ||
455 | * Tag the inode as speculatively preallocated so we can reclaim this | ||
456 | * space on demand, if necessary. | ||
457 | */ | ||
458 | if (prealloc) | ||
459 | xfs_inode_set_eofblocks_tag(ip); | ||
460 | |||
462 | *ret_imap = imap[0]; | 461 | *ret_imap = imap[0]; |
463 | return 0; | 462 | return 0; |
464 | } | 463 | } |
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 4e00cf091d2c..d82efaa2ac73 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c | |||
@@ -38,6 +38,7 @@ | |||
38 | #include "xfs_vnodeops.h" | 38 | #include "xfs_vnodeops.h" |
39 | #include "xfs_inode_item.h" | 39 | #include "xfs_inode_item.h" |
40 | #include "xfs_trace.h" | 40 | #include "xfs_trace.h" |
41 | #include "xfs_icache.h" | ||
41 | 42 | ||
42 | #include <linux/capability.h> | 43 | #include <linux/capability.h> |
43 | #include <linux/xattr.h> | 44 | #include <linux/xattr.h> |
@@ -779,8 +780,8 @@ xfs_setattr_size( | |||
779 | * care about here. | 780 | * care about here. |
780 | */ | 781 | */ |
781 | if (oldsize != ip->i_d.di_size && newsize > ip->i_d.di_size) { | 782 | if (oldsize != ip->i_d.di_size && newsize > ip->i_d.di_size) { |
782 | error = xfs_flush_pages(ip, ip->i_d.di_size, newsize, 0, | 783 | error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping, |
783 | FI_NONE); | 784 | ip->i_d.di_size, newsize); |
784 | if (error) | 785 | if (error) |
785 | goto out_unlock; | 786 | goto out_unlock; |
786 | } | 787 | } |
@@ -854,6 +855,9 @@ xfs_setattr_size( | |||
854 | * and do not wait the usual (long) time for writeout. | 855 | * and do not wait the usual (long) time for writeout. |
855 | */ | 856 | */ |
856 | xfs_iflags_set(ip, XFS_ITRUNCATED); | 857 | xfs_iflags_set(ip, XFS_ITRUNCATED); |
858 | |||
859 | /* A truncate down always removes post-EOF blocks. */ | ||
860 | xfs_inode_clear_eofblocks_tag(ip); | ||
857 | } | 861 | } |
858 | 862 | ||
859 | if (mask & ATTR_CTIME) { | 863 | if (mask & ATTR_CTIME) { |
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 01d10a66e302..2ea7d402188d 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c | |||
@@ -34,6 +34,7 @@ | |||
34 | #include "xfs_error.h" | 34 | #include "xfs_error.h" |
35 | #include "xfs_btree.h" | 35 | #include "xfs_btree.h" |
36 | #include "xfs_trace.h" | 36 | #include "xfs_trace.h" |
37 | #include "xfs_icache.h" | ||
37 | 38 | ||
38 | STATIC int | 39 | STATIC int |
39 | xfs_internal_inum( | 40 | xfs_internal_inum( |
@@ -395,7 +396,8 @@ xfs_bulkstat( | |||
395 | if (xfs_inobt_maskn(chunkidx, nicluster) | 396 | if (xfs_inobt_maskn(chunkidx, nicluster) |
396 | & ~r.ir_free) | 397 | & ~r.ir_free) |
397 | xfs_btree_reada_bufs(mp, agno, | 398 | xfs_btree_reada_bufs(mp, agno, |
398 | agbno, nbcluster); | 399 | agbno, nbcluster, |
400 | &xfs_inode_buf_ops); | ||
399 | } | 401 | } |
400 | irbp->ir_startino = r.ir_startino; | 402 | irbp->ir_startino = r.ir_startino; |
401 | irbp->ir_freecount = r.ir_freecount; | 403 | irbp->ir_freecount = r.ir_freecount; |
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index 828662f70d64..fe7e4df85a7b 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h | |||
@@ -44,6 +44,7 @@ | |||
44 | #include <linux/kernel.h> | 44 | #include <linux/kernel.h> |
45 | #include <linux/blkdev.h> | 45 | #include <linux/blkdev.h> |
46 | #include <linux/slab.h> | 46 | #include <linux/slab.h> |
47 | #include <linux/crc32c.h> | ||
47 | #include <linux/module.h> | 48 | #include <linux/module.h> |
48 | #include <linux/mutex.h> | 49 | #include <linux/mutex.h> |
49 | #include <linux/file.h> | 50 | #include <linux/file.h> |
@@ -118,6 +119,7 @@ | |||
118 | #define xfs_rotorstep xfs_params.rotorstep.val | 119 | #define xfs_rotorstep xfs_params.rotorstep.val |
119 | #define xfs_inherit_nodefrag xfs_params.inherit_nodfrg.val | 120 | #define xfs_inherit_nodefrag xfs_params.inherit_nodfrg.val |
120 | #define xfs_fstrm_centisecs xfs_params.fstrm_timer.val | 121 | #define xfs_fstrm_centisecs xfs_params.fstrm_timer.val |
122 | #define xfs_eofb_secs xfs_params.eofb_timer.val | ||
121 | 123 | ||
122 | #define current_cpu() (raw_smp_processor_id()) | 124 | #define current_cpu() (raw_smp_processor_id()) |
123 | #define current_pid() (current->pid) | 125 | #define current_pid() (current->pid) |
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 4dad756962d0..46bd9d52ab51 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c | |||
@@ -34,6 +34,8 @@ | |||
34 | #include "xfs_dinode.h" | 34 | #include "xfs_dinode.h" |
35 | #include "xfs_inode.h" | 35 | #include "xfs_inode.h" |
36 | #include "xfs_trace.h" | 36 | #include "xfs_trace.h" |
37 | #include "xfs_fsops.h" | ||
38 | #include "xfs_cksum.h" | ||
37 | 39 | ||
38 | kmem_zone_t *xfs_log_ticket_zone; | 40 | kmem_zone_t *xfs_log_ticket_zone; |
39 | 41 | ||
@@ -458,7 +460,8 @@ xfs_log_reserve( | |||
458 | tic->t_trans_type = t_type; | 460 | tic->t_trans_type = t_type; |
459 | *ticp = tic; | 461 | *ticp = tic; |
460 | 462 | ||
461 | xlog_grant_push_ail(log, tic->t_unit_res * tic->t_cnt); | 463 | xlog_grant_push_ail(log, tic->t_cnt ? tic->t_unit_res * tic->t_cnt |
464 | : tic->t_unit_res); | ||
462 | 465 | ||
463 | trace_xfs_log_reserve(log, tic); | 466 | trace_xfs_log_reserve(log, tic); |
464 | 467 | ||
@@ -679,25 +682,29 @@ out: | |||
679 | } | 682 | } |
680 | 683 | ||
681 | /* | 684 | /* |
682 | * Finish the recovery of the file system. This is separate from | 685 | * Finish the recovery of the file system. This is separate from the |
683 | * the xfs_log_mount() call, because it depends on the code in | 686 | * xfs_log_mount() call, because it depends on the code in xfs_mountfs() to read |
684 | * xfs_mountfs() to read in the root and real-time bitmap inodes | 687 | * in the root and real-time bitmap inodes between calling xfs_log_mount() and |
685 | * between calling xfs_log_mount() and here. | 688 | * here. |
686 | * | 689 | * |
687 | * mp - ubiquitous xfs mount point structure | 690 | * If we finish recovery successfully, start the background log work. If we are |
691 | * not doing recovery, then we have a RO filesystem and we don't need to start | ||
692 | * it. | ||
688 | */ | 693 | */ |
689 | int | 694 | int |
690 | xfs_log_mount_finish(xfs_mount_t *mp) | 695 | xfs_log_mount_finish(xfs_mount_t *mp) |
691 | { | 696 | { |
692 | int error; | 697 | int error = 0; |
693 | 698 | ||
694 | if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) | 699 | if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) { |
695 | error = xlog_recover_finish(mp->m_log); | 700 | error = xlog_recover_finish(mp->m_log); |
696 | else { | 701 | if (!error) |
697 | error = 0; | 702 | xfs_log_work_queue(mp); |
703 | } else { | ||
698 | ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); | 704 | ASSERT(mp->m_flags & XFS_MOUNT_RDONLY); |
699 | } | 705 | } |
700 | 706 | ||
707 | |||
701 | return error; | 708 | return error; |
702 | } | 709 | } |
703 | 710 | ||
@@ -850,15 +857,49 @@ xfs_log_unmount_write(xfs_mount_t *mp) | |||
850 | } /* xfs_log_unmount_write */ | 857 | } /* xfs_log_unmount_write */ |
851 | 858 | ||
852 | /* | 859 | /* |
853 | * Deallocate log structures for unmount/relocation. | 860 | * Empty the log for unmount/freeze. |
861 | * | ||
862 | * To do this, we first need to shut down the background log work so it is not | ||
863 | * trying to cover the log as we clean up. We then need to unpin all objects in | ||
864 | * the log so we can then flush them out. Once they have completed their IO and | ||
865 | * run the callbacks removing themselves from the AIL, we can write the unmount | ||
866 | * record. | ||
867 | */ | ||
868 | void | ||
869 | xfs_log_quiesce( | ||
870 | struct xfs_mount *mp) | ||
871 | { | ||
872 | cancel_delayed_work_sync(&mp->m_log->l_work); | ||
873 | xfs_log_force(mp, XFS_LOG_SYNC); | ||
874 | |||
875 | /* | ||
876 | * The superblock buffer is uncached and while xfs_ail_push_all_sync() | ||
877 | * will push it, xfs_wait_buftarg() will not wait for it. Further, | ||
878 | * xfs_buf_iowait() cannot be used because it was pushed with the | ||
879 | * XBF_ASYNC flag set, so we need to use a lock/unlock pair to wait for | ||
880 | * the IO to complete. | ||
881 | */ | ||
882 | xfs_ail_push_all_sync(mp->m_ail); | ||
883 | xfs_wait_buftarg(mp->m_ddev_targp); | ||
884 | xfs_buf_lock(mp->m_sb_bp); | ||
885 | xfs_buf_unlock(mp->m_sb_bp); | ||
886 | |||
887 | xfs_log_unmount_write(mp); | ||
888 | } | ||
889 | |||
890 | /* | ||
891 | * Shut down and release the AIL and Log. | ||
854 | * | 892 | * |
855 | * We need to stop the aild from running before we destroy | 893 | * During unmount, we need to ensure we flush all the dirty metadata objects |
856 | * and deallocate the log as the aild references the log. | 894 | * from the AIL so that the log is empty before we write the unmount record to |
895 | * the log. Once this is done, we can tear down the AIL and the log. | ||
857 | */ | 896 | */ |
858 | void | 897 | void |
859 | xfs_log_unmount(xfs_mount_t *mp) | 898 | xfs_log_unmount( |
899 | struct xfs_mount *mp) | ||
860 | { | 900 | { |
861 | cancel_delayed_work_sync(&mp->m_sync_work); | 901 | xfs_log_quiesce(mp); |
902 | |||
862 | xfs_trans_ail_destroy(mp); | 903 | xfs_trans_ail_destroy(mp); |
863 | xlog_dealloc_log(mp->m_log); | 904 | xlog_dealloc_log(mp->m_log); |
864 | } | 905 | } |
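
The lock/unlock pair in xfs_log_quiesce() deserves a note: as its comment says, the async superblock write cannot be waited on with xfs_buf_iowait(), but buffer IO holds the buffer lock until completion, so cycling the lock acts as a completion barrier. A standalone sketch of the idiom:

    /*
     * Sketch: taking and releasing the buffer lock cannot succeed until
     * any in-flight IO holding it has completed, so the pair waits for
     * that IO without needing xfs_buf_iowait().
     */
    static void example_wait_buf_io(struct xfs_buf *bp)
    {
            xfs_buf_lock(bp);
            xfs_buf_unlock(bp);
    }
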
@@ -1090,8 +1131,7 @@ xlog_iodone(xfs_buf_t *bp) | |||
1090 | * with it being freed after writing the unmount record to the | 1131 | * with it being freed after writing the unmount record to the |
1091 | * log. | 1132 | * log. |
1092 | */ | 1133 | */ |
1093 | 1134 | } | |
1094 | } /* xlog_iodone */ | ||
1095 | 1135 | ||
1096 | /* | 1136 | /* |
1097 | * Return size of each in-core log record buffer. | 1137 | * Return size of each in-core log record buffer. |
@@ -1161,6 +1201,40 @@ done: | |||
1161 | } /* xlog_get_iclog_buffer_size */ | 1201 | } /* xlog_get_iclog_buffer_size */ |
1162 | 1202 | ||
1163 | 1203 | ||
1204 | void | ||
1205 | xfs_log_work_queue( | ||
1206 | struct xfs_mount *mp) | ||
1207 | { | ||
1208 | queue_delayed_work(mp->m_log_workqueue, &mp->m_log->l_work, | ||
1209 | msecs_to_jiffies(xfs_syncd_centisecs * 10)); | ||
1210 | } | ||
1211 | |||
1212 | /* | ||
1213 | * Every sync period we need to unpin all items in the AIL and push them to | ||
1214 | * disk. If there is nothing dirty, then we might need to cover the log to | ||
1215 | * indicate that the filesystem is idle. | ||
1216 | */ | ||
1217 | void | ||
1218 | xfs_log_worker( | ||
1219 | struct work_struct *work) | ||
1220 | { | ||
1221 | struct xlog *log = container_of(to_delayed_work(work), | ||
1222 | struct xlog, l_work); | ||
1223 | struct xfs_mount *mp = log->l_mp; | ||
1224 | |||
1225 | /* dgc: errors ignored - not fatal and nowhere to report them */ | ||
1226 | if (xfs_log_need_covered(mp)) | ||
1227 | xfs_fs_log_dummy(mp); | ||
1228 | else | ||
1229 | xfs_log_force(mp, 0); | ||
1230 | |||
1231 | /* start pushing all the metadata that is currently dirty */ | ||
1232 | xfs_ail_push_all(mp->m_ail); | ||
1233 | |||
1234 | /* queue us up again */ | ||
1235 | xfs_log_work_queue(mp); | ||
1236 | } | ||
1237 | |||
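
xfs_log_worker() closes the loop by calling xfs_log_work_queue() on itself, giving a periodic, self-rearming job. The generic delayed-work pattern it follows, as a sketch with hypothetical names:

    struct example_ctx {
            struct workqueue_struct *wq;
            struct delayed_work     work;
            unsigned int            period_ms;
    };

    static void example_worker(struct work_struct *work)
    {
            struct example_ctx *ctx = container_of(to_delayed_work(work),
                                                   struct example_ctx, work);

            /* ... periodic maintenance on ctx ... */

            /* queue ourselves up again, exactly as xfs_log_worker() does */
            queue_delayed_work(ctx->wq, &ctx->work,
                               msecs_to_jiffies(ctx->period_ms));
    }
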
1164 | /* | 1238 | /* |
1165 | * This routine initializes some of the log structure for a given mount point. | 1239 | * This routine initializes some of the log structure for a given mount point. |
1166 | * Its primary purpose is to fill in enough, so recovery can occur. However, | 1240 | * Its primary purpose is to fill in enough, so recovery can occur. However, |
@@ -1195,6 +1269,7 @@ xlog_alloc_log( | |||
1195 | log->l_logBBsize = num_bblks; | 1269 | log->l_logBBsize = num_bblks; |
1196 | log->l_covered_state = XLOG_STATE_COVER_IDLE; | 1270 | log->l_covered_state = XLOG_STATE_COVER_IDLE; |
1197 | log->l_flags |= XLOG_ACTIVE_RECOVERY; | 1271 | log->l_flags |= XLOG_ACTIVE_RECOVERY; |
1272 | INIT_DELAYED_WORK(&log->l_work, xfs_log_worker); | ||
1198 | 1273 | ||
1199 | log->l_prev_block = -1; | 1274 | log->l_prev_block = -1; |
1200 | /* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */ | 1275 | /* log->l_tail_lsn = 0x100000000LL; cycle = 1; current block = 0 */ |
@@ -1417,6 +1492,84 @@ xlog_grant_push_ail( | |||
1417 | } | 1492 | } |
1418 | 1493 | ||
1419 | /* | 1494 | /* |
1495 | * Stamp cycle number in every block | ||
1496 | */ | ||
1497 | STATIC void | ||
1498 | xlog_pack_data( | ||
1499 | struct xlog *log, | ||
1500 | struct xlog_in_core *iclog, | ||
1501 | int roundoff) | ||
1502 | { | ||
1503 | int i, j, k; | ||
1504 | int size = iclog->ic_offset + roundoff; | ||
1505 | __be32 cycle_lsn; | ||
1506 | xfs_caddr_t dp; | ||
1507 | |||
1508 | cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn); | ||
1509 | |||
1510 | dp = iclog->ic_datap; | ||
1511 | for (i = 0; i < BTOBB(size); i++) { | ||
1512 | if (i >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) | ||
1513 | break; | ||
1514 | iclog->ic_header.h_cycle_data[i] = *(__be32 *)dp; | ||
1515 | *(__be32 *)dp = cycle_lsn; | ||
1516 | dp += BBSIZE; | ||
1517 | } | ||
1518 | |||
1519 | if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { | ||
1520 | xlog_in_core_2_t *xhdr = iclog->ic_data; | ||
1521 | |||
1522 | for ( ; i < BTOBB(size); i++) { | ||
1523 | j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); | ||
1524 | k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); | ||
1525 | xhdr[j].hic_xheader.xh_cycle_data[k] = *(__be32 *)dp; | ||
1526 | *(__be32 *)dp = cycle_lsn; | ||
1527 | dp += BBSIZE; | ||
1528 | } | ||
1529 | |||
1530 | for (i = 1; i < log->l_iclog_heads; i++) | ||
1531 | xhdr[i].hic_xheader.xh_cycle = cycle_lsn; | ||
1532 | } | ||
1533 | } | ||
1534 | |||
1535 | /* | ||
1536 | * Calculate the checksum for a log buffer. | ||
1537 | * | ||
1538 | * This is a little more complicated than it should be because the various | ||
1539 | * headers and the actual data are non-contiguous. | ||
1540 | */ | ||
1541 | __le32 | ||
1542 | xlog_cksum( | ||
1543 | struct xlog *log, | ||
1544 | struct xlog_rec_header *rhead, | ||
1545 | char *dp, | ||
1546 | int size) | ||
1547 | { | ||
1548 | __uint32_t crc; | ||
1549 | |||
1550 | /* first generate the crc for the record header ... */ | ||
1551 | crc = xfs_start_cksum((char *)rhead, | ||
1552 | sizeof(struct xlog_rec_header), | ||
1553 | offsetof(struct xlog_rec_header, h_crc)); | ||
1554 | |||
1555 | /* ... then for additional cycle data for v2 logs ... */ | ||
1556 | if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { | ||
1557 | union xlog_in_core2 *xhdr = (union xlog_in_core2 *)rhead; | ||
1558 | int i; | ||
1559 | |||
1560 | for (i = 1; i < log->l_iclog_heads; i++) { | ||
1561 | crc = crc32c(crc, &xhdr[i].hic_xheader, | ||
1562 | sizeof(struct xlog_rec_ext_header)); | ||
1563 | } | ||
1564 | } | ||
1565 | |||
1566 | /* ... and finally for the payload */ | ||
1567 | crc = crc32c(crc, dp, size); | ||
1568 | |||
1569 | return xfs_end_cksum(crc); | ||
1570 | } | ||
1571 | |||
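
The structure of xlog_cksum() is worth calling out: the CRC is seeded over the record header (with the CRC field itself excluded via the offset passed to xfs_start_cksum()), extended over each extra v2 cycle header, then over the payload, and finalised once at the end. A reduced sketch of chaining crc32c() over non-contiguous regions (seeding and finalisation shown as plain inversion for illustration; the real xfs_start_cksum()/xfs_end_cksum() helpers wrap this):

    static __uint32_t example_cksum(const void *hdr, size_t hdr_len,
                                    const void *payload, size_t payload_len)
    {
            __uint32_t crc = crc32c(~0U, hdr, hdr_len);     /* seed */

            crc = crc32c(crc, payload, payload_len);        /* extend */
            return ~crc;                                    /* finalise */
    }
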
1572 | /* | ||
1420 | * The bdstrat callback function for log bufs. This gives us a central | 1573 | * The bdstrat callback function for log bufs. This gives us a central |
1421 | * place to trap bufs in case we get hit by a log I/O error and need to | 1574 | * place to trap bufs in case we get hit by a log I/O error and need to |
1422 | * shutdown. Actually, in practice, even when we didn't get a log error, | 1575 | * shutdown. Actually, in practice, even when we didn't get a log error, |
@@ -1476,7 +1629,6 @@ xlog_sync( | |||
1476 | struct xlog *log, | 1629 | struct xlog *log, |
1477 | struct xlog_in_core *iclog) | 1630 | struct xlog_in_core *iclog) |
1478 | { | 1631 | { |
1479 | xfs_caddr_t dptr; /* pointer to byte sized element */ | ||
1480 | xfs_buf_t *bp; | 1632 | xfs_buf_t *bp; |
1481 | int i; | 1633 | int i; |
1482 | uint count; /* byte count of bwrite */ | 1634 | uint count; /* byte count of bwrite */ |
@@ -1485,6 +1637,7 @@ xlog_sync( | |||
1485 | int split = 0; /* split write into two regions */ | 1637 | int split = 0; /* split write into two regions */ |
1486 | int error; | 1638 | int error; |
1487 | int v2 = xfs_sb_version_haslogv2(&log->l_mp->m_sb); | 1639 | int v2 = xfs_sb_version_haslogv2(&log->l_mp->m_sb); |
1640 | int size; | ||
1488 | 1641 | ||
1489 | XFS_STATS_INC(xs_log_writes); | 1642 | XFS_STATS_INC(xs_log_writes); |
1490 | ASSERT(atomic_read(&iclog->ic_refcnt) == 0); | 1643 | ASSERT(atomic_read(&iclog->ic_refcnt) == 0); |
@@ -1515,13 +1668,10 @@ xlog_sync( | |||
1515 | xlog_pack_data(log, iclog, roundoff); | 1668 | xlog_pack_data(log, iclog, roundoff); |
1516 | 1669 | ||
1517 | /* real byte length */ | 1670 | /* real byte length */ |
1518 | if (v2) { | 1671 | size = iclog->ic_offset; |
1519 | iclog->ic_header.h_len = | 1672 | if (v2) |
1520 | cpu_to_be32(iclog->ic_offset + roundoff); | 1673 | size += roundoff; |
1521 | } else { | 1674 | iclog->ic_header.h_len = cpu_to_be32(size); |
1522 | iclog->ic_header.h_len = | ||
1523 | cpu_to_be32(iclog->ic_offset); | ||
1524 | } | ||
1525 | 1675 | ||
1526 | bp = iclog->ic_bp; | 1676 | bp = iclog->ic_bp; |
1527 | XFS_BUF_SET_ADDR(bp, BLOCK_LSN(be64_to_cpu(iclog->ic_header.h_lsn))); | 1677 | XFS_BUF_SET_ADDR(bp, BLOCK_LSN(be64_to_cpu(iclog->ic_header.h_lsn))); |
@@ -1530,12 +1680,36 @@ xlog_sync( | |||
1530 | 1680 | ||
1531 | /* Do we need to split this write into 2 parts? */ | 1681 | /* Do we need to split this write into 2 parts? */ |
1532 | if (XFS_BUF_ADDR(bp) + BTOBB(count) > log->l_logBBsize) { | 1682 | if (XFS_BUF_ADDR(bp) + BTOBB(count) > log->l_logBBsize) { |
1683 | char *dptr; | ||
1684 | |||
1533 | split = count - (BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp))); | 1685 | split = count - (BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp))); |
1534 | count = BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp)); | 1686 | count = BBTOB(log->l_logBBsize - XFS_BUF_ADDR(bp)); |
1535 | iclog->ic_bwritecnt = 2; /* split into 2 writes */ | 1687 | iclog->ic_bwritecnt = 2; |
1688 | |||
1689 | /* | ||
1690 | * Bump the cycle numbers at the start of each block in the | ||
1691 | * part of the iclog that ends up in the buffer that gets | ||
1692 | * written to the start of the log. | ||
1693 | * | ||
1694 | * Watch out for the header magic number case, though. | ||
1695 | */ | ||
1696 | dptr = (char *)&iclog->ic_header + count; | ||
1697 | for (i = 0; i < split; i += BBSIZE) { | ||
1698 | __uint32_t cycle = be32_to_cpu(*(__be32 *)dptr); | ||
1699 | if (++cycle == XLOG_HEADER_MAGIC_NUM) | ||
1700 | cycle++; | ||
1701 | *(__be32 *)dptr = cpu_to_be32(cycle); | ||
1702 | |||
1703 | dptr += BBSIZE; | ||
1704 | } | ||
1536 | } else { | 1705 | } else { |
1537 | iclog->ic_bwritecnt = 1; | 1706 | iclog->ic_bwritecnt = 1; |
1538 | } | 1707 | } |
1708 | |||
1709 | /* calculate the checksum */ | ||
1710 | iclog->ic_header.h_crc = xlog_cksum(log, &iclog->ic_header, | ||
1711 | iclog->ic_datap, size); | ||
1712 | |||
1539 | bp->b_io_length = BTOBB(count); | 1713 | bp->b_io_length = BTOBB(count); |
1540 | bp->b_fspriv = iclog; | 1714 | bp->b_fspriv = iclog; |
1541 | XFS_BUF_ZEROFLAGS(bp); | 1715 | XFS_BUF_ZEROFLAGS(bp); |
@@ -1589,19 +1763,6 @@ xlog_sync( | |||
1589 | bp->b_flags |= XBF_SYNCIO; | 1763 | bp->b_flags |= XBF_SYNCIO; |
1590 | if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) | 1764 | if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) |
1591 | bp->b_flags |= XBF_FUA; | 1765 | bp->b_flags |= XBF_FUA; |
1592 | dptr = bp->b_addr; | ||
1593 | /* | ||
1594 | * Bump the cycle numbers at the start of each block | ||
1595 | * since this part of the buffer is at the start of | ||
1596 | * a new cycle. Watch out for the header magic number | ||
1597 | * case, though. | ||
1598 | */ | ||
1599 | for (i = 0; i < split; i += BBSIZE) { | ||
1600 | be32_add_cpu((__be32 *)dptr, 1); | ||
1601 | if (be32_to_cpu(*(__be32 *)dptr) == XLOG_HEADER_MAGIC_NUM) | ||
1602 | be32_add_cpu((__be32 *)dptr, 1); | ||
1603 | dptr += BBSIZE; | ||
1604 | } | ||
1605 | 1766 | ||
1606 | ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1); | 1767 | ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1); |
1607 | ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize); | 1768 | ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize); |
@@ -1618,7 +1779,6 @@ xlog_sync( | |||
1618 | return 0; | 1779 | return 0; |
1619 | } /* xlog_sync */ | 1780 | } /* xlog_sync */ |
1620 | 1781 | ||
1621 | |||
1622 | /* | 1782 | /* |
1623 | * Deallocate a log structure | 1783 | * Deallocate a log structure |
1624 | */ | 1784 | */ |
@@ -3713,3 +3873,4 @@ xlog_iclogs_empty( | |||
3713 | } while (iclog != log->l_iclog); | 3873 | } while (iclog != log->l_iclog); |
3714 | return 1; | 3874 | return 1; |
3715 | } | 3875 | } |
3876 | |||
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 748d312850e2..5caee96059df 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h | |||
@@ -181,5 +181,9 @@ int xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp, | |||
181 | xfs_lsn_t *commit_lsn, int flags); | 181 | xfs_lsn_t *commit_lsn, int flags); |
182 | bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip); | 182 | bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip); |
183 | 183 | ||
184 | void xfs_log_work_queue(struct xfs_mount *mp); | ||
185 | void xfs_log_worker(struct work_struct *work); | ||
186 | void xfs_log_quiesce(struct xfs_mount *mp); | ||
187 | |||
184 | #endif | 188 | #endif |
185 | #endif /* __XFS_LOG_H__ */ | 189 | #endif /* __XFS_LOG_H__ */ |
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 18a801d76a42..16d8d12ea3b4 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h | |||
@@ -139,7 +139,6 @@ static inline uint xlog_get_client_id(__be32 i) | |||
139 | /* | 139 | /* |
140 | * Flags for log structure | 140 | * Flags for log structure |
141 | */ | 141 | */ |
142 | #define XLOG_CHKSUM_MISMATCH 0x1 /* used only during recovery */ | ||
143 | #define XLOG_ACTIVE_RECOVERY 0x2 /* in the middle of recovery */ | 142 | #define XLOG_ACTIVE_RECOVERY 0x2 /* in the middle of recovery */ |
144 | #define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */ | 143 | #define XLOG_RECOVERY_NEEDED 0x4 /* log was recovered */ |
145 | #define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being | 144 | #define XLOG_IO_ERROR 0x8 /* log hit an I/O error, and being |
@@ -291,7 +290,7 @@ typedef struct xlog_rec_header { | |||
291 | __be32 h_len; /* len in bytes; should be 64-bit aligned: 4 */ | 290 | __be32 h_len; /* len in bytes; should be 64-bit aligned: 4 */ |
292 | __be64 h_lsn; /* lsn of this LR : 8 */ | 291 | __be64 h_lsn; /* lsn of this LR : 8 */ |
293 | __be64 h_tail_lsn; /* lsn of 1st LR w/ buffers not committed: 8 */ | 292 | __be64 h_tail_lsn; /* lsn of 1st LR w/ buffers not committed: 8 */ |
294 | __be32 h_chksum; /* may not be used; non-zero if used : 4 */ | 293 | __le32 h_crc; /* crc of log record : 4 */ |
295 | __be32 h_prev_block; /* block number to previous LR : 4 */ | 294 | __be32 h_prev_block; /* block number to previous LR : 4 */ |
296 | __be32 h_num_logops; /* number of log operations in this LR : 4 */ | 295 | __be32 h_num_logops; /* number of log operations in this LR : 4 */ |
297 | __be32 h_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE]; | 296 | __be32 h_cycle_data[XLOG_HEADER_CYCLE_SIZE / BBSIZE]; |
@@ -495,6 +494,7 @@ struct xlog { | |||
495 | struct xfs_buf *l_xbuf; /* extra buffer for log | 494 | struct xfs_buf *l_xbuf; /* extra buffer for log |
496 | * wrapping */ | 495 | * wrapping */ |
497 | struct xfs_buftarg *l_targ; /* buftarg of log */ | 496 | struct xfs_buftarg *l_targ; /* buftarg of log */ |
497 | struct delayed_work l_work; /* background flush work */ | ||
498 | uint l_flags; | 498 | uint l_flags; |
499 | uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */ | 499 | uint l_quotaoffs_flag; /* XFS_DQ_*, for QUOTAOFFs */ |
500 | struct list_head *l_buf_cancel_table; | 500 | struct list_head *l_buf_cancel_table; |
@@ -554,11 +554,9 @@ xlog_recover( | |||
554 | extern int | 554 | extern int |
555 | xlog_recover_finish( | 555 | xlog_recover_finish( |
556 | struct xlog *log); | 556 | struct xlog *log); |
557 | extern void | 557 | |
558 | xlog_pack_data( | 558 | extern __le32 xlog_cksum(struct xlog *log, struct xlog_rec_header *rhead, |
559 | struct xlog *log, | 559 | char *dp, int size); |
560 | struct xlog_in_core *iclog, | ||
561 | int); | ||
562 | 560 | ||
563 | extern kmem_zone_t *xfs_log_ticket_zone; | 561 | extern kmem_zone_t *xfs_log_ticket_zone; |
564 | struct xlog_ticket * | 562 | struct xlog_ticket * |
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index d308749fabf1..96fcbb85ff83 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c | |||
@@ -41,7 +41,9 @@ | |||
41 | #include "xfs_trans_priv.h" | 41 | #include "xfs_trans_priv.h" |
42 | #include "xfs_quota.h" | 42 | #include "xfs_quota.h" |
43 | #include "xfs_utils.h" | 43 | #include "xfs_utils.h" |
44 | #include "xfs_cksum.h" | ||
44 | #include "xfs_trace.h" | 45 | #include "xfs_trace.h" |
46 | #include "xfs_icache.h" | ||
45 | 47 | ||
46 | STATIC int | 48 | STATIC int |
47 | xlog_find_zeroed( | 49 | xlog_find_zeroed( |
@@ -2143,7 +2145,7 @@ xlog_recover_buffer_pass2( | |||
2143 | buf_flags |= XBF_UNMAPPED; | 2145 | buf_flags |= XBF_UNMAPPED; |
2144 | 2146 | ||
2145 | bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len, | 2147 | bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len, |
2146 | buf_flags); | 2148 | buf_flags, NULL); |
2147 | if (!bp) | 2149 | if (!bp) |
2148 | return XFS_ERROR(ENOMEM); | 2150 | return XFS_ERROR(ENOMEM); |
2149 | error = bp->b_error; | 2151 | error = bp->b_error; |
@@ -2236,7 +2238,8 @@ xlog_recover_inode_pass2( | |||
2236 | } | 2238 | } |
2237 | trace_xfs_log_recover_inode_recover(log, in_f); | 2239 | trace_xfs_log_recover_inode_recover(log, in_f); |
2238 | 2240 | ||
2239 | bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0); | 2241 | bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len, 0, |
2242 | NULL); | ||
2240 | if (!bp) { | 2243 | if (!bp) { |
2241 | error = ENOMEM; | 2244 | error = ENOMEM; |
2242 | goto error; | 2245 | goto error; |
@@ -2547,7 +2550,8 @@ xlog_recover_dquot_pass2( | |||
2547 | ASSERT(dq_f->qlf_len == 1); | 2550 | ASSERT(dq_f->qlf_len == 1); |
2548 | 2551 | ||
2549 | error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dq_f->qlf_blkno, | 2552 | error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, dq_f->qlf_blkno, |
2550 | XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp); | 2553 | XFS_FSB_TO_BB(mp, dq_f->qlf_len), 0, &bp, |
2554 | NULL); | ||
2551 | if (error) | 2555 | if (error) |
2552 | return error; | 2556 | return error; |
2553 | 2557 | ||
@@ -3213,80 +3217,58 @@ xlog_recover_process_iunlinks( | |||
3213 | mp->m_dmevmask = mp_dmevmask; | 3217 | mp->m_dmevmask = mp_dmevmask; |
3214 | } | 3218 | } |
3215 | 3219 | ||
3216 | |||
3217 | #ifdef DEBUG | ||
3218 | STATIC void | ||
3219 | xlog_pack_data_checksum( | ||
3220 | struct xlog *log, | ||
3221 | struct xlog_in_core *iclog, | ||
3222 | int size) | ||
3223 | { | ||
3224 | int i; | ||
3225 | __be32 *up; | ||
3226 | uint chksum = 0; | ||
3227 | |||
3228 | up = (__be32 *)iclog->ic_datap; | ||
3229 | /* divide length by 4 to get # words */ | ||
3230 | for (i = 0; i < (size >> 2); i++) { | ||
3231 | chksum ^= be32_to_cpu(*up); | ||
3232 | up++; | ||
3233 | } | ||
3234 | iclog->ic_header.h_chksum = cpu_to_be32(chksum); | ||
3235 | } | ||
3236 | #else | ||
3237 | #define xlog_pack_data_checksum(log, iclog, size) | ||
3238 | #endif | ||
3239 | |||
3240 | /* | 3220 | /* |
3241 | * Stamp cycle number in every block | 3221 | * Unpack the log buffer data and crc check it. If the check fails, issue a
3222 | * warning if and only if the CRC in the header is non-zero. This makes the | ||
3223 | * check an advisory warning, and the zero CRC check will prevent failure | ||
3224 | * warnings from being emitted when upgrading the kernel from one that does not | ||
3225 | * add CRCs by default. | ||
3226 | * | ||
3227 | * When filesystems are CRC enabled, this CRC mismatch becomes a fatal log | ||
3228 | * corruption failure. | ||
3242 | */ | 3229 | */ |
3243 | void | 3230 | STATIC int |
3244 | xlog_pack_data( | 3231 | xlog_unpack_data_crc( |
3245 | struct xlog *log, | 3232 | struct xlog_rec_header *rhead, |
3246 | struct xlog_in_core *iclog, | 3233 | xfs_caddr_t dp, |
3247 | int roundoff) | 3234 | struct xlog *log) |
3248 | { | 3235 | { |
3249 | int i, j, k; | 3236 | __le32 crc; |
3250 | int size = iclog->ic_offset + roundoff; | 3237 | |
3251 | __be32 cycle_lsn; | 3238 | crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len)); |
3252 | xfs_caddr_t dp; | 3239 | if (crc != rhead->h_crc) { |
3253 | 3240 | if (rhead->h_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) { | |
3254 | xlog_pack_data_checksum(log, iclog, size); | 3241 | xfs_alert(log->l_mp, |
3255 | 3242 | "log record CRC mismatch: found 0x%x, expected 0x%x.\n", | |
3256 | cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn); | 3243 | le32_to_cpu(rhead->h_crc), |
3257 | 3244 | le32_to_cpu(crc)); | |
3258 | dp = iclog->ic_datap; | 3245 | xfs_hex_dump(dp, 32); |
3259 | for (i = 0; i < BTOBB(size) && | ||
3260 | i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) { | ||
3261 | iclog->ic_header.h_cycle_data[i] = *(__be32 *)dp; | ||
3262 | *(__be32 *)dp = cycle_lsn; | ||
3263 | dp += BBSIZE; | ||
3264 | } | ||
3265 | |||
3266 | if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { | ||
3267 | xlog_in_core_2_t *xhdr = iclog->ic_data; | ||
3268 | |||
3269 | for ( ; i < BTOBB(size); i++) { | ||
3270 | j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); | ||
3271 | k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); | ||
3272 | xhdr[j].hic_xheader.xh_cycle_data[k] = *(__be32 *)dp; | ||
3273 | *(__be32 *)dp = cycle_lsn; | ||
3274 | dp += BBSIZE; | ||
3275 | } | 3246 | } |
3276 | 3247 | ||
3277 | for (i = 1; i < log->l_iclog_heads; i++) { | 3248 | /* |
3278 | xhdr[i].hic_xheader.xh_cycle = cycle_lsn; | 3249 | * If we've detected a log record corruption, then we can't |
3279 | } | 3250 | * recover past this point. Abort recovery if we are enforcing |
3251 | * CRC protection by punting an error back up the stack. | ||
3252 | */ | ||
3253 | if (xfs_sb_version_hascrc(&log->l_mp->m_sb)) | ||
3254 | return EFSCORRUPTED; | ||
3280 | } | 3255 | } |
3256 | |||
3257 | return 0; | ||
3281 | } | 3258 | } |
3282 | 3259 | ||
3283 | STATIC void | 3260 | STATIC int |
3284 | xlog_unpack_data( | 3261 | xlog_unpack_data( |
3285 | struct xlog_rec_header *rhead, | 3262 | struct xlog_rec_header *rhead, |
3286 | xfs_caddr_t dp, | 3263 | xfs_caddr_t dp, |
3287 | struct xlog *log) | 3264 | struct xlog *log) |
3288 | { | 3265 | { |
3289 | int i, j, k; | 3266 | int i, j, k; |
3267 | int error; | ||
3268 | |||
3269 | error = xlog_unpack_data_crc(rhead, dp, log); | ||
3270 | if (error) | ||
3271 | return error; | ||
3290 | 3272 | ||
3291 | for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) && | 3273 | for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) && |
3292 | i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) { | 3274 | i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) { |
@@ -3303,6 +3285,8 @@ xlog_unpack_data( | |||
3303 | dp += BBSIZE; | 3285 | dp += BBSIZE; |
3304 | } | 3286 | } |
3305 | } | 3287 | } |
3288 | |||
3289 | return 0; | ||
3306 | } | 3290 | } |
3307 | 3291 | ||
3308 | STATIC int | 3292 | STATIC int |
@@ -3434,9 +3418,13 @@ xlog_do_recovery_pass( | |||
3434 | if (error) | 3418 | if (error) |
3435 | goto bread_err2; | 3419 | goto bread_err2; |
3436 | 3420 | ||
3437 | xlog_unpack_data(rhead, offset, log); | 3421 | error = xlog_unpack_data(rhead, offset, log); |
3438 | if ((error = xlog_recover_process_data(log, | 3422 | if (error) |
3439 | rhash, rhead, offset, pass))) | 3423 | goto bread_err2; |
3424 | |||
3425 | error = xlog_recover_process_data(log, | ||
3426 | rhash, rhead, offset, pass); | ||
3427 | if (error) | ||
3440 | goto bread_err2; | 3428 | goto bread_err2; |
3441 | blk_no += bblks + hblks; | 3429 | blk_no += bblks + hblks; |
3442 | } | 3430 | } |
@@ -3546,9 +3534,14 @@ xlog_do_recovery_pass( | |||
3546 | if (error) | 3534 | if (error) |
3547 | goto bread_err2; | 3535 | goto bread_err2; |
3548 | } | 3536 | } |
3549 | xlog_unpack_data(rhead, offset, log); | 3537 | |
3550 | if ((error = xlog_recover_process_data(log, rhash, | 3538 | error = xlog_unpack_data(rhead, offset, log); |
3551 | rhead, offset, pass))) | 3539 | if (error) |
3540 | goto bread_err2; | ||
3541 | |||
3542 | error = xlog_recover_process_data(log, rhash, | ||
3543 | rhead, offset, pass); | ||
3544 | if (error) | ||
3552 | goto bread_err2; | 3545 | goto bread_err2; |
3553 | blk_no += bblks; | 3546 | blk_no += bblks; |
3554 | } | 3547 | } |
@@ -3573,9 +3566,13 @@ xlog_do_recovery_pass( | |||
3573 | if (error) | 3566 | if (error) |
3574 | goto bread_err2; | 3567 | goto bread_err2; |
3575 | 3568 | ||
3576 | xlog_unpack_data(rhead, offset, log); | 3569 | error = xlog_unpack_data(rhead, offset, log); |
3577 | if ((error = xlog_recover_process_data(log, rhash, | 3570 | if (error) |
3578 | rhead, offset, pass))) | 3571 | goto bread_err2; |
3572 | |||
3573 | error = xlog_recover_process_data(log, rhash, | ||
3574 | rhead, offset, pass); | ||
3575 | if (error) | ||
3579 | goto bread_err2; | 3576 | goto bread_err2; |
3580 | blk_no += bblks + hblks; | 3577 | blk_no += bblks + hblks; |
3581 | } | 3578 | } |
@@ -3689,13 +3686,14 @@ xlog_do_recover( | |||
3689 | 3686 | ||
3690 | /* | 3687 | /* |
3691 | * Now that we've finished replaying all buffer and inode | 3688 | * Now that we've finished replaying all buffer and inode |
3692 | * updates, re-read in the superblock. | 3689 | * updates, re-read in the superblock and reverify it. |
3693 | */ | 3690 | */ |
3694 | bp = xfs_getsb(log->l_mp, 0); | 3691 | bp = xfs_getsb(log->l_mp, 0); |
3695 | XFS_BUF_UNDONE(bp); | 3692 | XFS_BUF_UNDONE(bp); |
3696 | ASSERT(!(XFS_BUF_ISWRITE(bp))); | 3693 | ASSERT(!(XFS_BUF_ISWRITE(bp))); |
3697 | XFS_BUF_READ(bp); | 3694 | XFS_BUF_READ(bp); |
3698 | XFS_BUF_UNASYNC(bp); | 3695 | XFS_BUF_UNASYNC(bp); |
3696 | bp->b_ops = &xfs_sb_buf_ops; | ||
3699 | xfsbdstrat(log->l_mp, bp); | 3697 | xfsbdstrat(log->l_mp, bp); |
3700 | error = xfs_buf_iowait(bp); | 3698 | error = xfs_buf_iowait(bp); |
3701 | if (error) { | 3699 | if (error) { |
@@ -3707,7 +3705,7 @@ xlog_do_recover( | |||
3707 | 3705 | ||
3708 | /* Convert superblock from on-disk format */ | 3706 | /* Convert superblock from on-disk format */ |
3709 | sbp = &log->l_mp->m_sb; | 3707 | sbp = &log->l_mp->m_sb; |
3710 | xfs_sb_from_disk(log->l_mp, XFS_BUF_TO_SBP(bp)); | 3708 | xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp)); |
3711 | ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC); | 3709 | ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC); |
3712 | ASSERT(xfs_sb_good_version(sbp)); | 3710 | ASSERT(xfs_sb_good_version(sbp)); |
3713 | xfs_buf_relse(bp); | 3711 | xfs_buf_relse(bp); |
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index b2bd3a0e6376..da508463ff10 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c | |||
@@ -42,6 +42,7 @@ | |||
42 | #include "xfs_fsops.h" | 42 | #include "xfs_fsops.h" |
43 | #include "xfs_utils.h" | 43 | #include "xfs_utils.h" |
44 | #include "xfs_trace.h" | 44 | #include "xfs_trace.h" |
45 | #include "xfs_icache.h" | ||
45 | 46 | ||
46 | 47 | ||
47 | #ifdef HAVE_PERCPU_SB | 48 | #ifdef HAVE_PERCPU_SB |
@@ -303,9 +304,8 @@ STATIC int | |||
303 | xfs_mount_validate_sb( | 304 | xfs_mount_validate_sb( |
304 | xfs_mount_t *mp, | 305 | xfs_mount_t *mp, |
305 | xfs_sb_t *sbp, | 306 | xfs_sb_t *sbp, |
306 | int flags) | 307 | bool check_inprogress) |
307 | { | 308 | { |
308 | int loud = !(flags & XFS_MFSI_QUIET); | ||
309 | 309 | ||
310 | /* | 310 | /* |
311 | * If the log device and data device have the | 311 | * If the log device and data device have the |
@@ -315,21 +315,18 @@ xfs_mount_validate_sb( | |||
315 | * a volume filesystem in a non-volume manner. | 315 | * a volume filesystem in a non-volume manner. |
316 | */ | 316 | */ |
317 | if (sbp->sb_magicnum != XFS_SB_MAGIC) { | 317 | if (sbp->sb_magicnum != XFS_SB_MAGIC) { |
318 | if (loud) | 318 | xfs_warn(mp, "bad magic number"); |
319 | xfs_warn(mp, "bad magic number"); | ||
320 | return XFS_ERROR(EWRONGFS); | 319 | return XFS_ERROR(EWRONGFS); |
321 | } | 320 | } |
322 | 321 | ||
323 | if (!xfs_sb_good_version(sbp)) { | 322 | if (!xfs_sb_good_version(sbp)) { |
324 | if (loud) | 323 | xfs_warn(mp, "bad version"); |
325 | xfs_warn(mp, "bad version"); | ||
326 | return XFS_ERROR(EWRONGFS); | 324 | return XFS_ERROR(EWRONGFS); |
327 | } | 325 | } |
328 | 326 | ||
329 | if (unlikely( | 327 | if (unlikely( |
330 | sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) { | 328 | sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) { |
331 | if (loud) | 329 | xfs_warn(mp, |
332 | xfs_warn(mp, | ||
333 | "filesystem is marked as having an external log; " | 330 | "filesystem is marked as having an external log; " |
334 | "specify logdev on the mount command line."); | 331 | "specify logdev on the mount command line."); |
335 | return XFS_ERROR(EINVAL); | 332 | return XFS_ERROR(EINVAL); |
@@ -337,8 +334,7 @@ xfs_mount_validate_sb( | |||
337 | 334 | ||
338 | if (unlikely( | 335 | if (unlikely( |
339 | sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp)) { | 336 | sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp)) { |
340 | if (loud) | 337 | xfs_warn(mp, |
341 | xfs_warn(mp, | ||
342 | "filesystem is marked as having an internal log; " | 338 | "filesystem is marked as having an internal log; " |
343 | "do not specify logdev on the mount command line."); | 339 | "do not specify logdev on the mount command line."); |
344 | return XFS_ERROR(EINVAL); | 340 | return XFS_ERROR(EINVAL); |
@@ -372,8 +368,7 @@ xfs_mount_validate_sb( | |||
372 | sbp->sb_dblocks == 0 || | 368 | sbp->sb_dblocks == 0 || |
373 | sbp->sb_dblocks > XFS_MAX_DBLOCKS(sbp) || | 369 | sbp->sb_dblocks > XFS_MAX_DBLOCKS(sbp) || |
374 | sbp->sb_dblocks < XFS_MIN_DBLOCKS(sbp))) { | 370 | sbp->sb_dblocks < XFS_MIN_DBLOCKS(sbp))) { |
375 | if (loud) | 371 | XFS_CORRUPTION_ERROR("SB sanity check failed", |
376 | XFS_CORRUPTION_ERROR("SB sanity check failed", | ||
377 | XFS_ERRLEVEL_LOW, mp, sbp); | 372 | XFS_ERRLEVEL_LOW, mp, sbp); |
378 | return XFS_ERROR(EFSCORRUPTED); | 373 | return XFS_ERROR(EFSCORRUPTED); |
379 | } | 374 | } |
@@ -382,12 +377,10 @@ xfs_mount_validate_sb( | |||
382 | * Until this is fixed only page-sized or smaller data blocks work. | 377 | * Until this is fixed only page-sized or smaller data blocks work. |
383 | */ | 378 | */ |
384 | if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) { | 379 | if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) { |
385 | if (loud) { | 380 | xfs_warn(mp, |
386 | xfs_warn(mp, | ||
387 | "File system with blocksize %d bytes. " | 381 | "File system with blocksize %d bytes. " |
388 | "Only pagesize (%ld) or less will currently work.", | 382 | "Only pagesize (%ld) or less will currently work.", |
389 | sbp->sb_blocksize, PAGE_SIZE); | 383 | sbp->sb_blocksize, PAGE_SIZE); |
390 | } | ||
391 | return XFS_ERROR(ENOSYS); | 384 | return XFS_ERROR(ENOSYS); |
392 | } | 385 | } |
393 | 386 | ||
@@ -401,23 +394,20 @@ xfs_mount_validate_sb( | |||
401 | case 2048: | 394 | case 2048: |
402 | break; | 395 | break; |
403 | default: | 396 | default: |
404 | if (loud) | 397 | xfs_warn(mp, "inode size of %d bytes not supported", |
405 | xfs_warn(mp, "inode size of %d bytes not supported", | ||
406 | sbp->sb_inodesize); | 398 | sbp->sb_inodesize); |
407 | return XFS_ERROR(ENOSYS); | 399 | return XFS_ERROR(ENOSYS); |
408 | } | 400 | } |
409 | 401 | ||
410 | if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) || | 402 | if (xfs_sb_validate_fsb_count(sbp, sbp->sb_dblocks) || |
411 | xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) { | 403 | xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) { |
412 | if (loud) | 404 | xfs_warn(mp, |
413 | xfs_warn(mp, | ||
414 | "file system too large to be mounted on this system."); | 405 | "file system too large to be mounted on this system."); |
415 | return XFS_ERROR(EFBIG); | 406 | return XFS_ERROR(EFBIG); |
416 | } | 407 | } |
417 | 408 | ||
418 | if (unlikely(sbp->sb_inprogress)) { | 409 | if (check_inprogress && sbp->sb_inprogress) { |
419 | if (loud) | 410 | xfs_warn(mp, "Offline file system operation in progress!"); |
420 | xfs_warn(mp, "file system busy"); | ||
421 | return XFS_ERROR(EFSCORRUPTED); | 411 | return XFS_ERROR(EFSCORRUPTED); |
422 | } | 412 | } |
423 | 413 | ||
@@ -425,9 +415,7 @@ xfs_mount_validate_sb( | |||
425 | * Version 1 directory format has never worked on Linux. | 415 | * Version 1 directory format has never worked on Linux. |
426 | */ | 416 | */ |
427 | if (unlikely(!xfs_sb_version_hasdirv2(sbp))) { | 417 | if (unlikely(!xfs_sb_version_hasdirv2(sbp))) { |
428 | if (loud) | 418 | xfs_warn(mp, "file system using version 1 directory format"); |
429 | xfs_warn(mp, | ||
430 | "file system using version 1 directory format"); | ||
431 | return XFS_ERROR(ENOSYS); | 419 | return XFS_ERROR(ENOSYS); |
432 | } | 420 | } |
433 | 421 | ||
@@ -520,11 +508,9 @@ out_unwind: | |||
520 | 508 | ||
521 | void | 509 | void |
522 | xfs_sb_from_disk( | 510 | xfs_sb_from_disk( |
523 | struct xfs_mount *mp, | 511 | struct xfs_sb *to, |
524 | xfs_dsb_t *from) | 512 | xfs_dsb_t *from) |
525 | { | 513 | { |
526 | struct xfs_sb *to = &mp->m_sb; | ||
527 | |||
528 | to->sb_magicnum = be32_to_cpu(from->sb_magicnum); | 514 | to->sb_magicnum = be32_to_cpu(from->sb_magicnum); |
529 | to->sb_blocksize = be32_to_cpu(from->sb_blocksize); | 515 | to->sb_blocksize = be32_to_cpu(from->sb_blocksize); |
530 | to->sb_dblocks = be64_to_cpu(from->sb_dblocks); | 516 | to->sb_dblocks = be64_to_cpu(from->sb_dblocks); |
@@ -626,6 +612,72 @@ xfs_sb_to_disk( | |||
626 | } | 612 | } |
627 | } | 613 | } |
628 | 614 | ||
615 | static void | ||
616 | xfs_sb_verify( | ||
617 | struct xfs_buf *bp) | ||
618 | { | ||
619 | struct xfs_mount *mp = bp->b_target->bt_mount; | ||
620 | struct xfs_sb sb; | ||
621 | int error; | ||
622 | |||
623 | xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp)); | ||
624 | |||
625 | /* | ||
626 | * Only check the in progress field for the primary superblock as | ||
627 | * mkfs.xfs doesn't clear it from secondary superblocks. | ||
628 | */ | ||
629 | error = xfs_mount_validate_sb(mp, &sb, bp->b_bn == XFS_SB_DADDR); | ||
630 | if (error) | ||
631 | xfs_buf_ioerror(bp, error); | ||
632 | } | ||
633 | |||
634 | static void | ||
635 | xfs_sb_read_verify( | ||
636 | struct xfs_buf *bp) | ||
637 | { | ||
638 | xfs_sb_verify(bp); | ||
639 | } | ||
640 | |||
641 | /* | ||
642 | * We may be probed for a filesystem match, so we may not want to emit | ||
643 | * messages when the superblock buffer is not actually an XFS superblock. | ||
644 | * If we find an XFS superblock, then run a normal, noisy mount because we are | ||
645 | * really going to mount it and want to know about errors. | ||
646 | */ | ||
647 | static void | ||
648 | xfs_sb_quiet_read_verify( | ||
649 | struct xfs_buf *bp) | ||
650 | { | ||
651 | struct xfs_sb sb; | ||
652 | |||
653 | xfs_sb_from_disk(&sb, XFS_BUF_TO_SBP(bp)); | ||
654 | |||
655 | if (sb.sb_magicnum == XFS_SB_MAGIC) { | ||
656 | /* XFS filesystem, verify noisily! */ | ||
657 | xfs_sb_read_verify(bp); | ||
658 | return; | ||
659 | } | ||
660 | /* quietly fail */ | ||
661 | xfs_buf_ioerror(bp, EFSCORRUPTED); | ||
662 | } | ||
663 | |||
664 | static void | ||
665 | xfs_sb_write_verify( | ||
666 | struct xfs_buf *bp) | ||
667 | { | ||
668 | xfs_sb_verify(bp); | ||
669 | } | ||
670 | |||
671 | const struct xfs_buf_ops xfs_sb_buf_ops = { | ||
672 | .verify_read = xfs_sb_read_verify, | ||
673 | .verify_write = xfs_sb_write_verify, | ||
674 | }; | ||
675 | |||
676 | static const struct xfs_buf_ops xfs_sb_quiet_buf_ops = { | ||
677 | .verify_read = xfs_sb_quiet_read_verify, | ||
678 | .verify_write = xfs_sb_write_verify, | ||
679 | }; | ||
680 | |||
629 | /* | 681 | /* |
630 | * xfs_readsb | 682 | * xfs_readsb |
631 | * | 683 | * |
@@ -651,26 +703,27 @@ xfs_readsb(xfs_mount_t *mp, int flags) | |||
651 | 703 | ||
652 | reread: | 704 | reread: |
653 | bp = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR, | 705 | bp = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR, |
654 | BTOBB(sector_size), 0); | 706 | BTOBB(sector_size), 0, |
707 | loud ? &xfs_sb_buf_ops | ||
708 | : &xfs_sb_quiet_buf_ops); | ||
655 | if (!bp) { | 709 | if (!bp) { |
656 | if (loud) | 710 | if (loud) |
657 | xfs_warn(mp, "SB buffer read failed"); | 711 | xfs_warn(mp, "SB buffer read failed"); |
658 | return EIO; | 712 | return EIO; |
659 | } | 713 | } |
660 | 714 | if (bp->b_error) { | |
661 | /* | 715 | error = bp->b_error; |
662 | * Initialize the mount structure from the superblock. | ||
663 | * But first do some basic consistency checking. | ||
664 | */ | ||
665 | xfs_sb_from_disk(mp, XFS_BUF_TO_SBP(bp)); | ||
666 | error = xfs_mount_validate_sb(mp, &(mp->m_sb), flags); | ||
667 | if (error) { | ||
668 | if (loud) | 716 | if (loud) |
669 | xfs_warn(mp, "SB validate failed"); | 717 | xfs_warn(mp, "SB validate failed"); |
670 | goto release_buf; | 718 | goto release_buf; |
671 | } | 719 | } |
672 | 720 | ||
673 | /* | 721 | /* |
722 | * Initialize the mount structure from the superblock. | ||
723 | */ | ||
724 | xfs_sb_from_disk(&mp->m_sb, XFS_BUF_TO_SBP(bp)); | ||
725 | |||
726 | /* | ||
674 | * We must be able to do sector-sized and sector-aligned IO. | 727 | * We must be able to do sector-sized and sector-aligned IO. |
675 | */ | 728 | */ |
676 | if (sector_size > mp->m_sb.sb_sectsize) { | 729 | if (sector_size > mp->m_sb.sb_sectsize) { |
@@ -1001,7 +1054,7 @@ xfs_check_sizes(xfs_mount_t *mp) | |||
1001 | } | 1054 | } |
1002 | bp = xfs_buf_read_uncached(mp->m_ddev_targp, | 1055 | bp = xfs_buf_read_uncached(mp->m_ddev_targp, |
1003 | d - XFS_FSS_TO_BB(mp, 1), | 1056 | d - XFS_FSS_TO_BB(mp, 1), |
1004 | XFS_FSS_TO_BB(mp, 1), 0); | 1057 | XFS_FSS_TO_BB(mp, 1), 0, NULL); |
1005 | if (!bp) { | 1058 | if (!bp) { |
1006 | xfs_warn(mp, "last sector read failed"); | 1059 | xfs_warn(mp, "last sector read failed"); |
1007 | return EIO; | 1060 | return EIO; |
@@ -1016,7 +1069,7 @@ xfs_check_sizes(xfs_mount_t *mp) | |||
1016 | } | 1069 | } |
1017 | bp = xfs_buf_read_uncached(mp->m_logdev_targp, | 1070 | bp = xfs_buf_read_uncached(mp->m_logdev_targp, |
1018 | d - XFS_FSB_TO_BB(mp, 1), | 1071 | d - XFS_FSB_TO_BB(mp, 1), |
1019 | XFS_FSB_TO_BB(mp, 1), 0); | 1072 | XFS_FSB_TO_BB(mp, 1), 0, NULL); |
1020 | if (!bp) { | 1073 | if (!bp) { |
1021 | xfs_warn(mp, "log device read failed"); | 1074 | xfs_warn(mp, "log device read failed"); |
1022 | return EIO; | 1075 | return EIO; |
@@ -1427,6 +1480,8 @@ xfs_unmountfs( | |||
1427 | __uint64_t resblks; | 1480 | __uint64_t resblks; |
1428 | int error; | 1481 | int error; |
1429 | 1482 | ||
1483 | cancel_delayed_work_sync(&mp->m_eofblocks_work); | ||
1484 | |||
1430 | xfs_qm_unmount_quotas(mp); | 1485 | xfs_qm_unmount_quotas(mp); |
1431 | xfs_rtunmount_inodes(mp); | 1486 | xfs_rtunmount_inodes(mp); |
1432 | IRELE(mp->m_rootip); | 1487 | IRELE(mp->m_rootip); |
@@ -1450,21 +1505,16 @@ xfs_unmountfs( | |||
1450 | 1505 | ||
1451 | /* | 1506 | /* |
1452 | * And reclaim all inodes. At this point there should be no dirty | 1507 | * And reclaim all inodes. At this point there should be no dirty |
1453 | * inode, and none should be pinned or locked, but use synchronous | 1508 | * inodes and none should be pinned or locked, but use synchronous |
1454 | * reclaim just to be sure. | 1509 | * reclaim just to be sure. We can stop background inode reclaim |
1510 | * here as well if it is still running. | ||
1455 | */ | 1511 | */ |
1512 | cancel_delayed_work_sync(&mp->m_reclaim_work); | ||
1456 | xfs_reclaim_inodes(mp, SYNC_WAIT); | 1513 | xfs_reclaim_inodes(mp, SYNC_WAIT); |
1457 | 1514 | ||
1458 | xfs_qm_unmount(mp); | 1515 | xfs_qm_unmount(mp); |
1459 | 1516 | ||
1460 | /* | 1517 | /* |
1461 | * Flush out the log synchronously so that we know for sure | ||
1462 | * that nothing is pinned. This is important because bflush() | ||
1463 | * will skip pinned buffers. | ||
1464 | */ | ||
1465 | xfs_log_force(mp, XFS_LOG_SYNC); | ||
1466 | |||
1467 | /* | ||
1468 | * Unreserve any blocks we have so that when we unmount we don't account | 1518 | * Unreserve any blocks we have so that when we unmount we don't account |
1469 | * the reserved free space as used. This is really only necessary for | 1519 | * the reserved free space as used. This is really only necessary for |
1470 | * lazy superblock counting because it trusts the incore superblock | 1520 | * lazy superblock counting because it trusts the incore superblock |
@@ -1489,23 +1539,6 @@ xfs_unmountfs( | |||
1489 | xfs_warn(mp, "Unable to update superblock counters. " | 1539 | xfs_warn(mp, "Unable to update superblock counters. " |
1490 | "Freespace may not be correct on next mount."); | 1540 | "Freespace may not be correct on next mount."); |
1491 | 1541 | ||
1492 | /* | ||
1493 | * At this point we might have modified the superblock again and thus | ||
1494 | * added an item to the AIL, thus flush it again. | ||
1495 | */ | ||
1496 | xfs_ail_push_all_sync(mp->m_ail); | ||
1497 | xfs_wait_buftarg(mp->m_ddev_targp); | ||
1498 | |||
1499 | /* | ||
1500 | * The superblock buffer is uncached and xfsaild_push() will lock and | ||
1501 | * set the XBF_ASYNC flag on the buffer. We cannot do xfs_buf_iowait() | ||
1502 | * here but a lock on the superblock buffer will block until iodone() | ||
1503 | * has completed. | ||
1504 | */ | ||
1505 | xfs_buf_lock(mp->m_sb_bp); | ||
1506 | xfs_buf_unlock(mp->m_sb_bp); | ||
1507 | |||
1508 | xfs_log_unmount_write(mp); | ||
1509 | xfs_log_unmount(mp); | 1542 | xfs_log_unmount(mp); |
1510 | xfs_uuid_unmount(mp); | 1543 | xfs_uuid_unmount(mp); |
1511 | 1544 | ||
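
The xfs_buf_ops tables added above attach read/write verifiers to a buffer, so validation runs once at I/O completion instead of in every caller, and the quiet probe variant simply swaps in a table whose read verifier checks the magic before deciding whether to warn. A self-contained sketch of the same pattern; demo_buf, demo_buf_ops and DEMO_MAGIC are invented for illustration and are not the kernel structures:

#include <stdio.h>

struct demo_buf;

struct demo_buf_ops {
	void (*verify_read)(struct demo_buf *bp);
	void (*verify_write)(struct demo_buf *bp);
};

struct demo_buf {
	unsigned int magic;
	int error;
	const struct demo_buf_ops *ops;
};

#define DEMO_MAGIC 0x58465342	/* "XFSB" */

static void demo_verify(struct demo_buf *bp)
{
	if (bp->magic != DEMO_MAGIC)
		bp->error = -1;	/* a real verifier would warn loudly here */
}

static const struct demo_buf_ops demo_ops = {
	.verify_read  = demo_verify,
	.verify_write = demo_verify,
};

static int demo_read(struct demo_buf *bp)
{
	/* ... fill bp from disk ... */
	if (bp->ops && bp->ops->verify_read)
		bp->ops->verify_read(bp);	/* runs once, at I/O completion */
	return bp->error;
}

int main(void)
{
	struct demo_buf good = { .magic = DEMO_MAGIC, .ops = &demo_ops };
	struct demo_buf bad  = { .magic = 0,          .ops = &demo_ops };

	printf("good=%d bad=%d\n", demo_read(&good), demo_read(&bad));
	return 0;
}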
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index deee09e534dc..bab8314507e4 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h | |||
@@ -51,8 +51,6 @@ typedef struct xfs_trans_reservations { | |||
51 | 51 | ||
52 | #else /* __KERNEL__ */ | 52 | #else /* __KERNEL__ */ |
53 | 53 | ||
54 | #include "xfs_sync.h" | ||
55 | |||
56 | struct xlog; | 54 | struct xlog; |
57 | struct xfs_inode; | 55 | struct xfs_inode; |
58 | struct xfs_mru_cache; | 56 | struct xfs_mru_cache; |
@@ -197,9 +195,9 @@ typedef struct xfs_mount { | |||
197 | struct mutex m_icsb_mutex; /* balancer sync lock */ | 195 | struct mutex m_icsb_mutex; /* balancer sync lock */ |
198 | #endif | 196 | #endif |
199 | struct xfs_mru_cache *m_filestream; /* per-mount filestream data */ | 197 | struct xfs_mru_cache *m_filestream; /* per-mount filestream data */ |
200 | struct delayed_work m_sync_work; /* background sync work */ | ||
201 | struct delayed_work m_reclaim_work; /* background inode reclaim */ | 198 | struct delayed_work m_reclaim_work; /* background inode reclaim */ |
202 | struct work_struct m_flush_work; /* background inode flush */ | 199 | struct delayed_work m_eofblocks_work; /* background eof blocks |
200 | trimming */ | ||
203 | __int64_t m_update_flags; /* sb flags we need to update | 201 | __int64_t m_update_flags; /* sb flags we need to update |
204 | on the next remount,rw */ | 202 | on the next remount,rw */ |
205 | struct shrinker m_inode_shrink; /* inode reclaim shrinker */ | 203 | struct shrinker m_inode_shrink; /* inode reclaim shrinker */ |
@@ -209,6 +207,9 @@ typedef struct xfs_mount { | |||
209 | struct workqueue_struct *m_data_workqueue; | 207 | struct workqueue_struct *m_data_workqueue; |
210 | struct workqueue_struct *m_unwritten_workqueue; | 208 | struct workqueue_struct *m_unwritten_workqueue; |
211 | struct workqueue_struct *m_cil_workqueue; | 209 | struct workqueue_struct *m_cil_workqueue; |
210 | struct workqueue_struct *m_reclaim_workqueue; | ||
211 | struct workqueue_struct *m_log_workqueue; | ||
212 | struct workqueue_struct *m_eofblocks_workqueue; | ||
212 | } xfs_mount_t; | 213 | } xfs_mount_t; |
213 | 214 | ||
214 | /* | 215 | /* |
@@ -387,7 +388,9 @@ extern void xfs_set_low_space_thresholds(struct xfs_mount *); | |||
387 | extern void xfs_mod_sb(struct xfs_trans *, __int64_t); | 388 | extern void xfs_mod_sb(struct xfs_trans *, __int64_t); |
388 | extern int xfs_initialize_perag(struct xfs_mount *, xfs_agnumber_t, | 389 | extern int xfs_initialize_perag(struct xfs_mount *, xfs_agnumber_t, |
389 | xfs_agnumber_t *); | 390 | xfs_agnumber_t *); |
390 | extern void xfs_sb_from_disk(struct xfs_mount *, struct xfs_dsb *); | 391 | extern void xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *); |
391 | extern void xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t); | 392 | extern void xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t); |
392 | 393 | ||
394 | extern const struct xfs_buf_ops xfs_sb_buf_ops; | ||
395 | |||
393 | #endif /* __XFS_MOUNT_H__ */ | 396 | #endif /* __XFS_MOUNT_H__ */ |
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 2e86fa0cfc0d..60eff4763156 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c | |||
@@ -40,6 +40,7 @@ | |||
40 | #include "xfs_utils.h" | 40 | #include "xfs_utils.h" |
41 | #include "xfs_qm.h" | 41 | #include "xfs_qm.h" |
42 | #include "xfs_trace.h" | 42 | #include "xfs_trace.h" |
43 | #include "xfs_icache.h" | ||
43 | 44 | ||
44 | /* | 45 | /* |
45 | * The global quota manager. There is only one of these for the entire | 46 | * The global quota manager. There is only one of these for the entire |
@@ -891,7 +892,8 @@ xfs_qm_dqiter_bufs( | |||
891 | while (blkcnt--) { | 892 | while (blkcnt--) { |
892 | error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, | 893 | error = xfs_trans_read_buf(mp, NULL, mp->m_ddev_targp, |
893 | XFS_FSB_TO_DADDR(mp, bno), | 894 | XFS_FSB_TO_DADDR(mp, bno), |
894 | mp->m_quotainfo->qi_dqchunklen, 0, &bp); | 895 | mp->m_quotainfo->qi_dqchunklen, 0, &bp, |
896 | &xfs_dquot_buf_ops); | ||
895 | if (error) | 897 | if (error) |
896 | break; | 898 | break; |
897 | 899 | ||
@@ -978,7 +980,8 @@ xfs_qm_dqiterate( | |||
978 | while (rablkcnt--) { | 980 | while (rablkcnt--) { |
979 | xfs_buf_readahead(mp->m_ddev_targp, | 981 | xfs_buf_readahead(mp->m_ddev_targp, |
980 | XFS_FSB_TO_DADDR(mp, rablkno), | 982 | XFS_FSB_TO_DADDR(mp, rablkno), |
981 | mp->m_quotainfo->qi_dqchunklen); | 983 | mp->m_quotainfo->qi_dqchunklen, |
984 | NULL); | ||
982 | rablkno++; | 985 | rablkno++; |
983 | } | 986 | } |
984 | } | 987 | } |
@@ -1453,7 +1456,7 @@ xfs_qm_dqreclaim_one( | |||
1453 | int error; | 1456 | int error; |
1454 | 1457 | ||
1455 | if (!xfs_dqlock_nowait(dqp)) | 1458 | if (!xfs_dqlock_nowait(dqp)) |
1456 | goto out_busy; | 1459 | goto out_move_tail; |
1457 | 1460 | ||
1458 | /* | 1461 | /* |
1459 | * This dquot has acquired a reference in the meantime, so remove it from | 1462 | * This dquot has acquired a reference in the meantime, so remove it from |
@@ -1476,7 +1479,7 @@ xfs_qm_dqreclaim_one( | |||
1476 | * getting flushed to disk, we don't want to reclaim it. | 1479 | * getting flushed to disk, we don't want to reclaim it. |
1477 | */ | 1480 | */ |
1478 | if (!xfs_dqflock_nowait(dqp)) | 1481 | if (!xfs_dqflock_nowait(dqp)) |
1479 | goto out_busy; | 1482 | goto out_unlock_move_tail; |
1480 | 1483 | ||
1481 | if (XFS_DQ_IS_DIRTY(dqp)) { | 1484 | if (XFS_DQ_IS_DIRTY(dqp)) { |
1482 | struct xfs_buf *bp = NULL; | 1485 | struct xfs_buf *bp = NULL; |
@@ -1487,7 +1490,7 @@ xfs_qm_dqreclaim_one( | |||
1487 | if (error) { | 1490 | if (error) { |
1488 | xfs_warn(mp, "%s: dquot %p flush failed", | 1491 | xfs_warn(mp, "%s: dquot %p flush failed", |
1489 | __func__, dqp); | 1492 | __func__, dqp); |
1490 | goto out_busy; | 1493 | goto out_unlock_move_tail; |
1491 | } | 1494 | } |
1492 | 1495 | ||
1493 | xfs_buf_delwri_queue(bp, buffer_list); | 1496 | xfs_buf_delwri_queue(bp, buffer_list); |
@@ -1496,7 +1499,7 @@ xfs_qm_dqreclaim_one( | |||
1496 | * Give the dquot another try on the freelist, as the | 1499 | * Give the dquot another try on the freelist, as the |
1497 | * flushing will take some time. | 1500 | * flushing will take some time. |
1498 | */ | 1501 | */ |
1499 | goto out_busy; | 1502 | goto out_unlock_move_tail; |
1500 | } | 1503 | } |
1501 | xfs_dqfunlock(dqp); | 1504 | xfs_dqfunlock(dqp); |
1502 | 1505 | ||
@@ -1515,14 +1518,13 @@ xfs_qm_dqreclaim_one( | |||
1515 | XFS_STATS_INC(xs_qm_dqreclaims); | 1518 | XFS_STATS_INC(xs_qm_dqreclaims); |
1516 | return; | 1519 | return; |
1517 | 1520 | ||
1518 | out_busy: | ||
1519 | xfs_dqunlock(dqp); | ||
1520 | |||
1521 | /* | 1521 | /* |
1522 | * Move the dquot to the tail of the list so that we don't spin on it. | 1522 | * Move the dquot to the tail of the list so that we don't spin on it. |
1523 | */ | 1523 | */ |
1524 | out_unlock_move_tail: | ||
1525 | xfs_dqunlock(dqp); | ||
1526 | out_move_tail: | ||
1524 | list_move_tail(&dqp->q_lru, &qi->qi_lru_list); | 1527 | list_move_tail(&dqp->q_lru, &qi->qi_lru_list); |
1525 | |||
1526 | trace_xfs_dqreclaim_busy(dqp); | 1528 | trace_xfs_dqreclaim_busy(dqp); |
1527 | XFS_STATS_INC(xs_qm_dqreclaim_misses); | 1529 | XFS_STATS_INC(xs_qm_dqreclaim_misses); |
1528 | } | 1530 | } |
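
The relabelled exits in xfs_qm_dqreclaim_one() above are the staged-unwind idiom: each label undoes exactly what has been taken so far, so the trylock-failure path skips the unlock that the later paths need. A compilable sketch of the same shape with a plain pthread mutex standing in for the dquot lock:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int busy_count;

static void reclaim_one(int can_lock, int can_flush)
{
	if (!can_lock)
		goto out_move_tail;		/* nothing held yet */

	pthread_mutex_lock(&lock);
	if (!can_flush)
		goto out_unlock_move_tail;	/* must drop the lock first */

	/* ... actual reclaim work ... */
	pthread_mutex_unlock(&lock);
	return;

out_unlock_move_tail:
	pthread_mutex_unlock(&lock);
out_move_tail:
	busy_count++;		/* move to tail of the LRU in the real code */
}

int main(void)
{
	reclaim_one(0, 0);
	reclaim_one(1, 0);
	printf("busy=%d\n", busy_count);
	return 0;
}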
diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 858a3b186110..5f53e75409b8 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c | |||
@@ -40,6 +40,7 @@ | |||
40 | #include "xfs_utils.h" | 40 | #include "xfs_utils.h" |
41 | #include "xfs_qm.h" | 41 | #include "xfs_qm.h" |
42 | #include "xfs_trace.h" | 42 | #include "xfs_trace.h" |
43 | #include "xfs_icache.h" | ||
43 | 44 | ||
44 | STATIC int xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint); | 45 | STATIC int xfs_qm_log_quotaoff(xfs_mount_t *, xfs_qoff_logitem_t **, uint); |
45 | STATIC int xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *, | 46 | STATIC int xfs_qm_log_quotaoff_end(xfs_mount_t *, xfs_qoff_logitem_t *, |
@@ -845,7 +846,8 @@ STATIC int | |||
845 | xfs_dqrele_inode( | 846 | xfs_dqrele_inode( |
846 | struct xfs_inode *ip, | 847 | struct xfs_inode *ip, |
847 | struct xfs_perag *pag, | 848 | struct xfs_perag *pag, |
848 | int flags) | 849 | int flags, |
850 | void *args) | ||
849 | { | 851 | { |
850 | /* skip quota inodes */ | 852 | /* skip quota inodes */ |
851 | if (ip == ip->i_mount->m_quotainfo->qi_uquotaip || | 853 | if (ip == ip->i_mount->m_quotainfo->qi_uquotaip || |
@@ -881,5 +883,5 @@ xfs_qm_dqrele_all_inodes( | |||
881 | uint flags) | 883 | uint flags) |
882 | { | 884 | { |
883 | ASSERT(mp->m_quotainfo); | 885 | ASSERT(mp->m_quotainfo); |
884 | xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags); | 886 | xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, NULL); |
885 | } | 887 | } |
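
Adding a void *args parameter to the per-inode callback, as the xfs_dqrele_inode() hunk does, lets one iterator serve callers that need private state; callers that need none, like xfs_qm_dqrele_all_inodes() above, just pass NULL. A small sketch of the pattern with invented names (item, walk_all, count_cb):

#include <stdio.h>

struct item { int id; };

typedef int (*walk_fn)(struct item *it, int flags, void *args);

static int walk_all(struct item *items, int n, walk_fn fn, int flags, void *args)
{
	int i, error;

	for (i = 0; i < n; i++) {
		error = fn(&items[i], flags, args);
		if (error)
			return error;
	}
	return 0;
}

static int count_cb(struct item *it, int flags, void *args)
{
	(void)it; (void)flags;
	(*(int *)args)++;		/* private state travels via args */
	return 0;
}

int main(void)
{
	struct item items[3] = { {1}, {2}, {3} };
	int count = 0;

	walk_all(items, 3, count_cb, 0, &count);
	printf("visited %d items\n", count);
	return 0;
}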
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index ca28a4ba4b54..98dc670d3ee0 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c | |||
@@ -38,6 +38,7 @@ | |||
38 | #include "xfs_utils.h" | 38 | #include "xfs_utils.h" |
39 | #include "xfs_trace.h" | 39 | #include "xfs_trace.h" |
40 | #include "xfs_buf.h" | 40 | #include "xfs_buf.h" |
41 | #include "xfs_icache.h" | ||
41 | 42 | ||
42 | 43 | ||
43 | /* | 44 | /* |
@@ -869,7 +870,7 @@ xfs_rtbuf_get( | |||
869 | ASSERT(map.br_startblock != NULLFSBLOCK); | 870 | ASSERT(map.br_startblock != NULLFSBLOCK); |
870 | error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, | 871 | error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, |
871 | XFS_FSB_TO_DADDR(mp, map.br_startblock), | 872 | XFS_FSB_TO_DADDR(mp, map.br_startblock), |
872 | mp->m_bsize, 0, &bp); | 873 | mp->m_bsize, 0, &bp, NULL); |
873 | if (error) | 874 | if (error) |
874 | return error; | 875 | return error; |
875 | ASSERT(!xfs_buf_geterror(bp)); | 876 | ASSERT(!xfs_buf_geterror(bp)); |
@@ -1872,9 +1873,14 @@ xfs_growfs_rt( | |||
1872 | */ | 1873 | */ |
1873 | bp = xfs_buf_read_uncached(mp->m_rtdev_targp, | 1874 | bp = xfs_buf_read_uncached(mp->m_rtdev_targp, |
1874 | XFS_FSB_TO_BB(mp, nrblocks - 1), | 1875 | XFS_FSB_TO_BB(mp, nrblocks - 1), |
1875 | XFS_FSB_TO_BB(mp, 1), 0); | 1876 | XFS_FSB_TO_BB(mp, 1), 0, NULL); |
1876 | if (!bp) | 1877 | if (!bp) |
1877 | return EIO; | 1878 | return EIO; |
1879 | if (bp->b_error) { | ||
1880 | error = bp->b_error; | ||
1881 | xfs_buf_relse(bp); | ||
1882 | return error; | ||
1883 | } | ||
1878 | xfs_buf_relse(bp); | 1884 | xfs_buf_relse(bp); |
1879 | 1885 | ||
1880 | /* | 1886 | /* |
@@ -2219,9 +2225,11 @@ xfs_rtmount_init( | |||
2219 | } | 2225 | } |
2220 | bp = xfs_buf_read_uncached(mp->m_rtdev_targp, | 2226 | bp = xfs_buf_read_uncached(mp->m_rtdev_targp, |
2221 | d - XFS_FSB_TO_BB(mp, 1), | 2227 | d - XFS_FSB_TO_BB(mp, 1), |
2222 | XFS_FSB_TO_BB(mp, 1), 0); | 2228 | XFS_FSB_TO_BB(mp, 1), 0, NULL); |
2223 | if (!bp) { | 2229 | if (!bp || bp->b_error) { |
2224 | xfs_warn(mp, "realtime device size check failed"); | 2230 | xfs_warn(mp, "realtime device size check failed"); |
2231 | if (bp) | ||
2232 | xfs_buf_relse(bp); | ||
2225 | return EIO; | 2233 | return EIO; |
2226 | } | 2234 | } |
2227 | xfs_buf_relse(bp); | 2235 | xfs_buf_relse(bp); |
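
Both rtalloc hunks add the same check: with verifiers attached, an uncached read can now hand back a buffer whose b_error is set, and that buffer must still be released on the failure path. A toy sketch of the check-then-release shape, with malloc/free standing in for the buffer cache:

#include <stdio.h>
#include <stdlib.h>

struct buf { int b_error; };

static struct buf *read_uncached(int fail)
{
	struct buf *bp = calloc(1, sizeof(*bp));

	if (bp && fail)
		bp->b_error = 5;	/* EIO-style error set by a verifier */
	return bp;
}

static int size_check(int fail)
{
	struct buf *bp = read_uncached(fail);
	int error;

	if (!bp)
		return 5;		/* read failed outright */
	error = bp->b_error;		/* capture before releasing */
	free(bp);			/* release on both paths */
	return error;
}

int main(void)
{
	printf("%d %d\n", size_check(0), size_check(1));
	return 0;
}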
diff --git a/fs/xfs/xfs_sb.h b/fs/xfs/xfs_sb.h index f429d9d5d325..a05b45175fb0 100644 --- a/fs/xfs/xfs_sb.h +++ b/fs/xfs/xfs_sb.h | |||
@@ -81,6 +81,7 @@ struct xfs_mount; | |||
81 | #define XFS_SB_VERSION2_ATTR2BIT 0x00000008 /* Inline attr rework */ | 81 | #define XFS_SB_VERSION2_ATTR2BIT 0x00000008 /* Inline attr rework */ |
82 | #define XFS_SB_VERSION2_PARENTBIT 0x00000010 /* parent pointers */ | 82 | #define XFS_SB_VERSION2_PARENTBIT 0x00000010 /* parent pointers */ |
83 | #define XFS_SB_VERSION2_PROJID32BIT 0x00000080 /* 32 bit project id */ | 83 | #define XFS_SB_VERSION2_PROJID32BIT 0x00000080 /* 32 bit project id */ |
84 | #define XFS_SB_VERSION2_CRCBIT 0x00000100 /* metadata CRCs */ | ||
84 | 85 | ||
85 | #define XFS_SB_VERSION2_OKREALFBITS \ | 86 | #define XFS_SB_VERSION2_OKREALFBITS \ |
86 | (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \ | 87 | (XFS_SB_VERSION2_LAZYSBCOUNTBIT | \ |
@@ -503,6 +504,12 @@ static inline int xfs_sb_version_hasprojid32bit(xfs_sb_t *sbp) | |||
503 | (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT); | 504 | (sbp->sb_features2 & XFS_SB_VERSION2_PROJID32BIT); |
504 | } | 505 | } |
505 | 506 | ||
507 | static inline int xfs_sb_version_hascrc(xfs_sb_t *sbp) | ||
508 | { | ||
509 | return (xfs_sb_version_hasmorebits(sbp) && | ||
510 | (sbp->sb_features2 & XFS_SB_VERSION2_CRCBIT)); | ||
511 | } | ||
512 | |||
506 | /* | 513 | /* |
507 | * end of superblock version macros | 514 | * end of superblock version macros |
508 | */ | 515 | */ |
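
The new xfs_sb_version_hascrc() follows the existing V2 feature-bit pattern: a features2 bit is only meaningful when the MOREBITS version flag says the field is populated at all. A standalone sketch of the two-level test; the bit names and layout here are made up for illustration:

#include <stdio.h>

#define V2_MOREBITS	0x001	/* the features2 field is valid at all */
#define V2_CRCBIT	0x100	/* hypothetical metadata-CRC feature bit */

struct sb { unsigned int versionbits; unsigned int features2; };

static int has_morebits(const struct sb *sbp)
{
	return sbp->versionbits & V2_MOREBITS;
}

static int has_crc(const struct sb *sbp)
{
	/* never trust features2 unless MOREBITS says it is populated */
	return has_morebits(sbp) && (sbp->features2 & V2_CRCBIT);
}

int main(void)
{
	struct sb with    = { V2_MOREBITS, V2_CRCBIT };
	struct sb without = { 0,           V2_CRCBIT };	/* stale bits ignored */

	printf("%d %d\n", has_crc(&with), has_crc(&without));
	return 0;
}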
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 26a09bd7f975..ab8839b26272 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c | |||
@@ -49,7 +49,7 @@ | |||
49 | #include "xfs_extfree_item.h" | 49 | #include "xfs_extfree_item.h" |
50 | #include "xfs_mru_cache.h" | 50 | #include "xfs_mru_cache.h" |
51 | #include "xfs_inode_item.h" | 51 | #include "xfs_inode_item.h" |
52 | #include "xfs_sync.h" | 52 | #include "xfs_icache.h" |
53 | #include "xfs_trace.h" | 53 | #include "xfs_trace.h" |
54 | 54 | ||
55 | #include <linux/namei.h> | 55 | #include <linux/namei.h> |
@@ -863,8 +863,30 @@ xfs_init_mount_workqueues( | |||
863 | WQ_MEM_RECLAIM, 0, mp->m_fsname); | 863 | WQ_MEM_RECLAIM, 0, mp->m_fsname); |
864 | if (!mp->m_cil_workqueue) | 864 | if (!mp->m_cil_workqueue) |
865 | goto out_destroy_unwritten; | 865 | goto out_destroy_unwritten; |
866 | |||
867 | mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s", | ||
868 | WQ_NON_REENTRANT, 0, mp->m_fsname); | ||
869 | if (!mp->m_reclaim_workqueue) | ||
870 | goto out_destroy_cil; | ||
871 | |||
872 | mp->m_log_workqueue = alloc_workqueue("xfs-log/%s", | ||
873 | WQ_NON_REENTRANT, 0, mp->m_fsname); | ||
874 | if (!mp->m_log_workqueue) | ||
875 | goto out_destroy_reclaim; | ||
876 | |||
877 | mp->m_eofblocks_workqueue = alloc_workqueue("xfs-eofblocks/%s", | ||
878 | WQ_NON_REENTRANT, 0, mp->m_fsname); | ||
879 | if (!mp->m_eofblocks_workqueue) | ||
880 | goto out_destroy_log; | ||
881 | |||
866 | return 0; | 882 | return 0; |
867 | 883 | ||
884 | out_destroy_log: | ||
885 | destroy_workqueue(mp->m_log_workqueue); | ||
886 | out_destroy_reclaim: | ||
887 | destroy_workqueue(mp->m_reclaim_workqueue); | ||
888 | out_destroy_cil: | ||
889 | destroy_workqueue(mp->m_cil_workqueue); | ||
868 | out_destroy_unwritten: | 890 | out_destroy_unwritten: |
869 | destroy_workqueue(mp->m_unwritten_workqueue); | 891 | destroy_workqueue(mp->m_unwritten_workqueue); |
870 | out_destroy_data_iodone_queue: | 892 | out_destroy_data_iodone_queue: |
@@ -877,11 +899,32 @@ STATIC void | |||
877 | xfs_destroy_mount_workqueues( | 899 | xfs_destroy_mount_workqueues( |
878 | struct xfs_mount *mp) | 900 | struct xfs_mount *mp) |
879 | { | 901 | { |
902 | destroy_workqueue(mp->m_eofblocks_workqueue); | ||
903 | destroy_workqueue(mp->m_log_workqueue); | ||
904 | destroy_workqueue(mp->m_reclaim_workqueue); | ||
880 | destroy_workqueue(mp->m_cil_workqueue); | 905 | destroy_workqueue(mp->m_cil_workqueue); |
881 | destroy_workqueue(mp->m_data_workqueue); | 906 | destroy_workqueue(mp->m_data_workqueue); |
882 | destroy_workqueue(mp->m_unwritten_workqueue); | 907 | destroy_workqueue(mp->m_unwritten_workqueue); |
883 | } | 908 | } |
884 | 909 | ||
910 | /* | ||
911 | * Flush all dirty data to disk. Must not be called while holding an XFS_ILOCK | ||
912 | * or a page lock. We use sync_inodes_sb() here to ensure we block while waiting | ||
913 | * for IO to complete so that we effectively throttle multiple callers to the | ||
914 | * rate at which IO is completing. | ||
915 | */ | ||
916 | void | ||
917 | xfs_flush_inodes( | ||
918 | struct xfs_mount *mp) | ||
919 | { | ||
920 | struct super_block *sb = mp->m_super; | ||
921 | |||
922 | if (down_read_trylock(&sb->s_umount)) { | ||
923 | sync_inodes_sb(sb); | ||
924 | up_read(&sb->s_umount); | ||
925 | } | ||
926 | } | ||
927 | |||
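
xfs_flush_inodes() above deliberately trylocks s_umount: if an unmount or remount already holds it, skipping the flush is safer than blocking behind it. The same guard in a compilable pthread sketch, with a process-local rwlock standing in for the VFS lock:

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t umount_lock = PTHREAD_RWLOCK_INITIALIZER;

static void flush_inodes(void)
{
	/* back off rather than wait if a writer (unmount) holds the lock */
	if (pthread_rwlock_tryrdlock(&umount_lock) == 0) {
		puts("syncing inodes");	/* sync_inodes_sb() in the real code */
		pthread_rwlock_unlock(&umount_lock);
	} else {
		puts("umount in progress, skipping flush");
	}
}

int main(void)
{
	flush_inodes();
	return 0;
}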
885 | /* Catch misguided souls that try to use this interface on XFS */ | 928 | /* Catch misguided souls that try to use this interface on XFS */ |
886 | STATIC struct inode * | 929 | STATIC struct inode * |
887 | xfs_fs_alloc_inode( | 930 | xfs_fs_alloc_inode( |
@@ -1006,9 +1049,8 @@ xfs_fs_put_super( | |||
1006 | struct xfs_mount *mp = XFS_M(sb); | 1049 | struct xfs_mount *mp = XFS_M(sb); |
1007 | 1050 | ||
1008 | xfs_filestream_unmount(mp); | 1051 | xfs_filestream_unmount(mp); |
1009 | cancel_delayed_work_sync(&mp->m_sync_work); | ||
1010 | xfs_unmountfs(mp); | 1052 | xfs_unmountfs(mp); |
1011 | xfs_syncd_stop(mp); | 1053 | |
1012 | xfs_freesb(mp); | 1054 | xfs_freesb(mp); |
1013 | xfs_icsb_destroy_counters(mp); | 1055 | xfs_icsb_destroy_counters(mp); |
1014 | xfs_destroy_mount_workqueues(mp); | 1056 | xfs_destroy_mount_workqueues(mp); |
@@ -1023,7 +1065,6 @@ xfs_fs_sync_fs( | |||
1023 | int wait) | 1065 | int wait) |
1024 | { | 1066 | { |
1025 | struct xfs_mount *mp = XFS_M(sb); | 1067 | struct xfs_mount *mp = XFS_M(sb); |
1026 | int error; | ||
1027 | 1068 | ||
1028 | /* | 1069 | /* |
1029 | * Doing anything during the async pass would be counterproductive. | 1070 | * Doing anything during the async pass would be counterproductive. |
@@ -1031,17 +1072,14 @@ xfs_fs_sync_fs( | |||
1031 | if (!wait) | 1072 | if (!wait) |
1032 | return 0; | 1073 | return 0; |
1033 | 1074 | ||
1034 | error = xfs_quiesce_data(mp); | 1075 | xfs_log_force(mp, XFS_LOG_SYNC); |
1035 | if (error) | ||
1036 | return -error; | ||
1037 | |||
1038 | if (laptop_mode) { | 1076 | if (laptop_mode) { |
1039 | /* | 1077 | /* |
1040 | * The disk must be active because we're syncing. | 1078 | * The disk must be active because we're syncing. |
1041 | * We schedule xfssyncd now (now that the disk is | 1079 | * We schedule log work now (now that the disk is |
1042 | * active) instead of later (when it might not be). | 1080 | * active) instead of later (when it might not be). |
1043 | */ | 1081 | */ |
1044 | flush_delayed_work(&mp->m_sync_work); | 1082 | flush_delayed_work(&mp->m_log->l_work); |
1045 | } | 1083 | } |
1046 | 1084 | ||
1047 | return 0; | 1085 | return 0; |
@@ -1118,6 +1156,48 @@ xfs_restore_resvblks(struct xfs_mount *mp) | |||
1118 | xfs_reserve_blocks(mp, &resblks, NULL); | 1156 | xfs_reserve_blocks(mp, &resblks, NULL); |
1119 | } | 1157 | } |
1120 | 1158 | ||
1159 | /* | ||
1160 | * Trigger writeback of all the dirty metadata in the file system. | ||
1161 | * | ||
1162 | * This ensures that the metadata is written to its location on disk rather | ||
1163 | * than just existing in transactions in the log. This means after a quiesce | ||
1164 | * there is no log replay required to write the inodes to disk - this is the | ||
1165 | * primary difference between a sync and a quiesce. | ||
1166 | * | ||
1167 | * Note: xfs_log_quiesce() stops background log work - the callers must ensure | ||
1168 | * it is started again when appropriate. | ||
1169 | */ | ||
1170 | void | ||
1171 | xfs_quiesce_attr( | ||
1172 | struct xfs_mount *mp) | ||
1173 | { | ||
1174 | int error = 0; | ||
1175 | |||
1176 | /* wait for all modifications to complete */ | ||
1177 | while (atomic_read(&mp->m_active_trans) > 0) | ||
1178 | delay(100); | ||
1179 | |||
1180 | /* force the log to unpin objects from the now complete transactions */ | ||
1181 | xfs_log_force(mp, XFS_LOG_SYNC); | ||
1182 | |||
1183 | /* reclaim inodes to do any IO before the freeze completes */ | ||
1184 | xfs_reclaim_inodes(mp, 0); | ||
1185 | xfs_reclaim_inodes(mp, SYNC_WAIT); | ||
1186 | |||
1187 | /* Push the superblock and write an unmount record */ | ||
1188 | error = xfs_log_sbcount(mp); | ||
1189 | if (error) | ||
1190 | xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. " | ||
1191 | "Frozen image may not be consistent."); | ||
1192 | /* | ||
1193 | * Just warn here till VFS can correctly support | ||
1194 | * read-only remount without racing. | ||
1195 | */ | ||
1196 | WARN_ON(atomic_read(&mp->m_active_trans) != 0); | ||
1197 | |||
1198 | xfs_log_quiesce(mp); | ||
1199 | } | ||
1200 | |||
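
The new xfs_quiesce_attr() drains activity in a fixed order: wait for in-flight transactions, force the log so they are unpinned, reclaim twice (an async pass to start I/O, a SYNC_WAIT pass to wait for it), then log the superblock counters and quiesce the log. A toy single-threaded sketch of that ordering, with an atomic counter standing in for m_active_trans; the step bodies are placeholders:

#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

static atomic_int active_trans;

static void quiesce(void)
{
	/* 1. wait for all modifications to complete */
	while (atomic_load(&active_trans) > 0)
		usleep(100 * 1000);	/* delay(100) in the kernel version */

	/* 2. force the log to unpin the now-complete transactions */
	puts("log force (sync)");

	/* 3. reclaim: async pass starts I/O, waiting pass completes it */
	puts("reclaim inodes (async)");
	puts("reclaim inodes (wait)");

	/* 4. finally write sb counters and the unmount record */
	puts("log sb counters + quiesce log");
}

int main(void)
{
	quiesce();
	return 0;
}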
1121 | STATIC int | 1201 | STATIC int |
1122 | xfs_fs_remount( | 1202 | xfs_fs_remount( |
1123 | struct super_block *sb, | 1203 | struct super_block *sb, |
@@ -1198,20 +1278,18 @@ xfs_fs_remount( | |||
1198 | * value if it is non-zero, otherwise go with the default. | 1278 | * value if it is non-zero, otherwise go with the default. |
1199 | */ | 1279 | */ |
1200 | xfs_restore_resvblks(mp); | 1280 | xfs_restore_resvblks(mp); |
1281 | xfs_log_work_queue(mp); | ||
1201 | } | 1282 | } |
1202 | 1283 | ||
1203 | /* rw -> ro */ | 1284 | /* rw -> ro */ |
1204 | if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) { | 1285 | if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) { |
1205 | /* | 1286 | /* |
1206 | * After we have synced the data but before we sync the | 1287 | * Before we sync the metadata, we need to free up the reserve |
1207 | * metadata, we need to free up the reserve block pool so that | 1288 | * block pool so that the used block count in the superblock on |
1208 | * the used block count in the superblock on disk is correct at | 1289 | * disk is correct at the end of the remount. Stash the current |
1209 | * the end of the remount. Stash the current reserve pool size | 1290 | * reserve pool size so that if we get remounted rw, we can |
1210 | * so that if we get remounted rw, we can return it to the same | 1291 | * return it to the same size. |
1211 | * size. | ||
1212 | */ | 1292 | */ |
1213 | |||
1214 | xfs_quiesce_data(mp); | ||
1215 | xfs_save_resvblks(mp); | 1293 | xfs_save_resvblks(mp); |
1216 | xfs_quiesce_attr(mp); | 1294 | xfs_quiesce_attr(mp); |
1217 | mp->m_flags |= XFS_MOUNT_RDONLY; | 1295 | mp->m_flags |= XFS_MOUNT_RDONLY; |
@@ -1243,6 +1321,7 @@ xfs_fs_unfreeze( | |||
1243 | struct xfs_mount *mp = XFS_M(sb); | 1321 | struct xfs_mount *mp = XFS_M(sb); |
1244 | 1322 | ||
1245 | xfs_restore_resvblks(mp); | 1323 | xfs_restore_resvblks(mp); |
1324 | xfs_log_work_queue(mp); | ||
1246 | return 0; | 1325 | return 0; |
1247 | } | 1326 | } |
1248 | 1327 | ||
@@ -1321,6 +1400,8 @@ xfs_fs_fill_super( | |||
1321 | spin_lock_init(&mp->m_sb_lock); | 1400 | spin_lock_init(&mp->m_sb_lock); |
1322 | mutex_init(&mp->m_growlock); | 1401 | mutex_init(&mp->m_growlock); |
1323 | atomic_set(&mp->m_active_trans, 0); | 1402 | atomic_set(&mp->m_active_trans, 0); |
1403 | INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); | ||
1404 | INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker); | ||
1324 | 1405 | ||
1325 | mp->m_super = sb; | 1406 | mp->m_super = sb; |
1326 | sb->s_fs_info = mp; | 1407 | sb->s_fs_info = mp; |
@@ -1371,10 +1452,6 @@ xfs_fs_fill_super( | |||
1371 | /* | 1452 | /* |
1372 | * we must configure the block size in the superblock before we run the | 1453 | * we must configure the block size in the superblock before we run the |
1373 | * full mount process as the mount process can lookup and cache inodes. | 1454 | * full mount process as the mount process can lookup and cache inodes. |
1374 | * For the same reason we must also initialise the syncd and register | ||
1375 | * the inode cache shrinker so that inodes can be reclaimed during | ||
1376 | * operations like a quotacheck that iterate all inodes in the | ||
1377 | * filesystem. | ||
1378 | */ | 1455 | */ |
1379 | sb->s_magic = XFS_SB_MAGIC; | 1456 | sb->s_magic = XFS_SB_MAGIC; |
1380 | sb->s_blocksize = mp->m_sb.sb_blocksize; | 1457 | sb->s_blocksize = mp->m_sb.sb_blocksize; |
@@ -1384,13 +1461,9 @@ xfs_fs_fill_super( | |||
1384 | sb->s_time_gran = 1; | 1461 | sb->s_time_gran = 1; |
1385 | set_posix_acl_flag(sb); | 1462 | set_posix_acl_flag(sb); |
1386 | 1463 | ||
1387 | error = xfs_syncd_init(mp); | ||
1388 | if (error) | ||
1389 | goto out_filestream_unmount; | ||
1390 | |||
1391 | error = xfs_mountfs(mp); | 1464 | error = xfs_mountfs(mp); |
1392 | if (error) | 1465 | if (error) |
1393 | goto out_syncd_stop; | 1466 | goto out_filestream_unmount; |
1394 | 1467 | ||
1395 | root = igrab(VFS_I(mp->m_rootip)); | 1468 | root = igrab(VFS_I(mp->m_rootip)); |
1396 | if (!root) { | 1469 | if (!root) { |
@@ -1408,8 +1481,7 @@ xfs_fs_fill_super( | |||
1408 | } | 1481 | } |
1409 | 1482 | ||
1410 | return 0; | 1483 | return 0; |
1411 | out_syncd_stop: | 1484 | |
1412 | xfs_syncd_stop(mp); | ||
1413 | out_filestream_unmount: | 1485 | out_filestream_unmount: |
1414 | xfs_filestream_unmount(mp); | 1486 | xfs_filestream_unmount(mp); |
1415 | out_free_sb: | 1487 | out_free_sb: |
@@ -1429,7 +1501,6 @@ out_destroy_workqueues: | |||
1429 | out_unmount: | 1501 | out_unmount: |
1430 | xfs_filestream_unmount(mp); | 1502 | xfs_filestream_unmount(mp); |
1431 | xfs_unmountfs(mp); | 1503 | xfs_unmountfs(mp); |
1432 | xfs_syncd_stop(mp); | ||
1433 | goto out_free_sb; | 1504 | goto out_free_sb; |
1434 | } | 1505 | } |
1435 | 1506 | ||
@@ -1625,16 +1696,6 @@ STATIC int __init | |||
1625 | xfs_init_workqueues(void) | 1696 | xfs_init_workqueues(void) |
1626 | { | 1697 | { |
1627 | /* | 1698 | /* |
1628 | * We never want the same work item to run twice; reclaiming inodes | ||
1629 | * or idling the log is not going to get any faster by multiple CPUs | ||
1630 | * competing for resources. Use the default large max_active value | ||
1631 | * so that even lots of filesystems can perform these tasks in parallel. | ||
1632 | */ | ||
1633 | xfs_syncd_wq = alloc_workqueue("xfssyncd", WQ_NON_REENTRANT, 0); | ||
1634 | if (!xfs_syncd_wq) | ||
1635 | return -ENOMEM; | ||
1636 | |||
1637 | /* | ||
1638 | * The allocation workqueue can be used in memory reclaim situations | 1699 | * The allocation workqueue can be used in memory reclaim situations |
1639 | * (writepage path), and parallelism is only limited by the number of | 1700 | * (writepage path), and parallelism is only limited by the number of |
1640 | * AGs in all the filesystems mounted. Hence use the default large | 1701 | * AGs in all the filesystems mounted. Hence use the default large |
@@ -1642,20 +1703,15 @@ xfs_init_workqueues(void) | |||
1642 | */ | 1703 | */ |
1643 | xfs_alloc_wq = alloc_workqueue("xfsalloc", WQ_MEM_RECLAIM, 0); | 1704 | xfs_alloc_wq = alloc_workqueue("xfsalloc", WQ_MEM_RECLAIM, 0); |
1644 | if (!xfs_alloc_wq) | 1705 | if (!xfs_alloc_wq) |
1645 | goto out_destroy_syncd; | 1706 | return -ENOMEM; |
1646 | 1707 | ||
1647 | return 0; | 1708 | return 0; |
1648 | |||
1649 | out_destroy_syncd: | ||
1650 | destroy_workqueue(xfs_syncd_wq); | ||
1651 | return -ENOMEM; | ||
1652 | } | 1709 | } |
1653 | 1710 | ||
1654 | STATIC void | 1711 | STATIC void |
1655 | xfs_destroy_workqueues(void) | 1712 | xfs_destroy_workqueues(void) |
1656 | { | 1713 | { |
1657 | destroy_workqueue(xfs_alloc_wq); | 1714 | destroy_workqueue(xfs_alloc_wq); |
1658 | destroy_workqueue(xfs_syncd_wq); | ||
1659 | } | 1715 | } |
1660 | 1716 | ||
1661 | STATIC int __init | 1717 | STATIC int __init |
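
The workqueue additions in xfs_init_mount_workqueues() above grow the classic allocation ladder: each failure label tears down only what the earlier steps created, strictly in reverse order, and teardown at unmount mirrors it. A self-contained sketch with malloc standing in for alloc_workqueue() and invented field names:

#include <stdlib.h>

struct mount_demo { void *a, *b, *c; };

static int init_resources(struct mount_demo *mp)
{
	mp->a = malloc(16);
	if (!mp->a)
		goto out;
	mp->b = malloc(16);
	if (!mp->b)
		goto out_free_a;
	mp->c = malloc(16);
	if (!mp->c)
		goto out_free_b;
	return 0;

out_free_b:			/* unwind in reverse order of setup */
	free(mp->b);
out_free_a:
	free(mp->a);
out:
	return -1;
}

int main(void)
{
	struct mount_demo mp = { 0 };

	if (init_resources(&mp))
		return 1;
	free(mp.c); free(mp.b); free(mp.a);	/* destroy in reverse, like unmount */
	return 0;
}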
diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h index 9de4a920ba05..bbe3d15a7904 100644 --- a/fs/xfs/xfs_super.h +++ b/fs/xfs/xfs_super.h | |||
@@ -74,6 +74,7 @@ struct block_device; | |||
74 | 74 | ||
75 | extern __uint64_t xfs_max_file_offset(unsigned int); | 75 | extern __uint64_t xfs_max_file_offset(unsigned int); |
76 | 76 | ||
77 | extern void xfs_flush_inodes(struct xfs_mount *mp); | ||
77 | extern void xfs_blkdev_issue_flush(struct xfs_buftarg *); | 78 | extern void xfs_blkdev_issue_flush(struct xfs_buftarg *); |
78 | extern xfs_agnumber_t xfs_set_inode32(struct xfs_mount *); | 79 | extern xfs_agnumber_t xfs_set_inode32(struct xfs_mount *); |
79 | extern xfs_agnumber_t xfs_set_inode64(struct xfs_mount *); | 80 | extern xfs_agnumber_t xfs_set_inode64(struct xfs_mount *); |
diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c index ee2d2adaa438..2801b5ce6cdb 100644 --- a/fs/xfs/xfs_sysctl.c +++ b/fs/xfs/xfs_sysctl.c | |||
@@ -202,6 +202,15 @@ static ctl_table xfs_table[] = { | |||
202 | .extra1 = &xfs_params.fstrm_timer.min, | 202 | .extra1 = &xfs_params.fstrm_timer.min, |
203 | .extra2 = &xfs_params.fstrm_timer.max, | 203 | .extra2 = &xfs_params.fstrm_timer.max, |
204 | }, | 204 | }, |
205 | { | ||
206 | .procname = "speculative_prealloc_lifetime", | ||
207 | .data = &xfs_params.eofb_timer.val, | ||
208 | .maxlen = sizeof(int), | ||
209 | .mode = 0644, | ||
210 | .proc_handler = proc_dointvec_minmax, | ||
211 | .extra1 = &xfs_params.eofb_timer.min, | ||
212 | .extra2 = &xfs_params.eofb_timer.max, | ||
213 | }, | ||
205 | /* please keep this the last entry */ | 214 | /* please keep this the last entry */ |
206 | #ifdef CONFIG_PROC_FS | 215 | #ifdef CONFIG_PROC_FS |
207 | { | 216 | { |
diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h index b9937d450f8e..bd8e157c20ef 100644 --- a/fs/xfs/xfs_sysctl.h +++ b/fs/xfs/xfs_sysctl.h | |||
@@ -47,6 +47,7 @@ typedef struct xfs_param { | |||
47 | xfs_sysctl_val_t rotorstep; /* inode32 AG rotoring control knob */ | 47 | xfs_sysctl_val_t rotorstep; /* inode32 AG rotoring control knob */ |
48 | xfs_sysctl_val_t inherit_nodfrg;/* Inherit the "nodefrag" inode flag. */ | 48 | xfs_sysctl_val_t inherit_nodfrg;/* Inherit the "nodefrag" inode flag. */ |
49 | xfs_sysctl_val_t fstrm_timer; /* Filestream dir-AG assoc'n timeout. */ | 49 | xfs_sysctl_val_t fstrm_timer; /* Filestream dir-AG assoc'n timeout. */ |
50 | xfs_sysctl_val_t eofb_timer; /* Interval between eofb scan wakeups */ | ||
50 | } xfs_param_t; | 51 | } xfs_param_t; |
51 | 52 | ||
52 | /* | 53 | /* |
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 7d36ccf57f93..2e137d4a85ae 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h | |||
@@ -96,6 +96,8 @@ DEFINE_ATTR_LIST_EVENT(xfs_attr_list_full); | |||
96 | DEFINE_ATTR_LIST_EVENT(xfs_attr_list_add); | 96 | DEFINE_ATTR_LIST_EVENT(xfs_attr_list_add); |
97 | DEFINE_ATTR_LIST_EVENT(xfs_attr_list_wrong_blk); | 97 | DEFINE_ATTR_LIST_EVENT(xfs_attr_list_wrong_blk); |
98 | DEFINE_ATTR_LIST_EVENT(xfs_attr_list_notfound); | 98 | DEFINE_ATTR_LIST_EVENT(xfs_attr_list_notfound); |
99 | DEFINE_ATTR_LIST_EVENT(xfs_attr_leaf_list); | ||
100 | DEFINE_ATTR_LIST_EVENT(xfs_attr_node_list); | ||
99 | 101 | ||
100 | DECLARE_EVENT_CLASS(xfs_perag_class, | 102 | DECLARE_EVENT_CLASS(xfs_perag_class, |
101 | TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount, | 103 | TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount, |
@@ -130,6 +132,8 @@ DEFINE_PERAG_REF_EVENT(xfs_perag_get_tag); | |||
130 | DEFINE_PERAG_REF_EVENT(xfs_perag_put); | 132 | DEFINE_PERAG_REF_EVENT(xfs_perag_put); |
131 | DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim); | 133 | DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim); |
132 | DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim); | 134 | DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim); |
135 | DEFINE_PERAG_REF_EVENT(xfs_perag_set_eofblocks); | ||
136 | DEFINE_PERAG_REF_EVENT(xfs_perag_clear_eofblocks); | ||
133 | 137 | ||
134 | TRACE_EVENT(xfs_attr_list_node_descend, | 138 | TRACE_EVENT(xfs_attr_list_node_descend, |
135 | TP_PROTO(struct xfs_attr_list_context *ctx, | 139 | TP_PROTO(struct xfs_attr_list_context *ctx, |
@@ -585,6 +589,10 @@ DEFINE_INODE_EVENT(xfs_update_time); | |||
585 | DEFINE_INODE_EVENT(xfs_dquot_dqalloc); | 589 | DEFINE_INODE_EVENT(xfs_dquot_dqalloc); |
586 | DEFINE_INODE_EVENT(xfs_dquot_dqdetach); | 590 | DEFINE_INODE_EVENT(xfs_dquot_dqdetach); |
587 | 591 | ||
592 | DEFINE_INODE_EVENT(xfs_inode_set_eofblocks_tag); | ||
593 | DEFINE_INODE_EVENT(xfs_inode_clear_eofblocks_tag); | ||
594 | DEFINE_INODE_EVENT(xfs_inode_free_eofblocks_invalid); | ||
595 | |||
588 | DECLARE_EVENT_CLASS(xfs_iref_class, | 596 | DECLARE_EVENT_CLASS(xfs_iref_class, |
589 | TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), | 597 | TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), |
590 | TP_ARGS(ip, caller_ip), | 598 | TP_ARGS(ip, caller_ip), |
@@ -1496,8 +1504,42 @@ DEFINE_DIR2_EVENT(xfs_dir2_node_replace); | |||
1496 | DEFINE_DIR2_EVENT(xfs_dir2_node_removename); | 1504 | DEFINE_DIR2_EVENT(xfs_dir2_node_removename); |
1497 | DEFINE_DIR2_EVENT(xfs_dir2_node_to_leaf); | 1505 | DEFINE_DIR2_EVENT(xfs_dir2_node_to_leaf); |
1498 | 1506 | ||
1507 | DECLARE_EVENT_CLASS(xfs_attr_class, | ||
1508 | TP_PROTO(struct xfs_da_args *args), | ||
1509 | TP_ARGS(args), | ||
1510 | TP_STRUCT__entry( | ||
1511 | __field(dev_t, dev) | ||
1512 | __field(xfs_ino_t, ino) | ||
1513 | __dynamic_array(char, name, args->namelen) | ||
1514 | __field(int, namelen) | ||
1515 | __field(int, valuelen) | ||
1516 | __field(xfs_dahash_t, hashval) | ||
1517 | __field(int, op_flags) | ||
1518 | ), | ||
1519 | TP_fast_assign( | ||
1520 | __entry->dev = VFS_I(args->dp)->i_sb->s_dev; | ||
1521 | __entry->ino = args->dp->i_ino; | ||
1522 | if (args->namelen) | ||
1523 | memcpy(__get_str(name), args->name, args->namelen); | ||
1524 | __entry->namelen = args->namelen; | ||
1525 | __entry->valuelen = args->valuelen; | ||
1526 | __entry->hashval = args->hashval; | ||
1527 | __entry->op_flags = args->op_flags; | ||
1528 | ), | ||
1529 | TP_printk("dev %d:%d ino 0x%llx name %.*s namelen %d valuelen %d " | ||
1530 | "hashval 0x%x op_flags %s", | ||
1531 | MAJOR(__entry->dev), MINOR(__entry->dev), | ||
1532 | __entry->ino, | ||
1533 | __entry->namelen, | ||
1534 | __entry->namelen ? __get_str(name) : NULL, | ||
1535 | __entry->namelen, | ||
1536 | __entry->valuelen, | ||
1537 | __entry->hashval, | ||
1538 | __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS)) | ||
1539 | ) | ||
1540 | |||
1499 | #define DEFINE_ATTR_EVENT(name) \ | 1541 | #define DEFINE_ATTR_EVENT(name) \ |
1500 | DEFINE_EVENT(xfs_da_class, name, \ | 1542 | DEFINE_EVENT(xfs_attr_class, name, \ |
1501 | TP_PROTO(struct xfs_da_args *args), \ | 1543 | TP_PROTO(struct xfs_da_args *args), \ |
1502 | TP_ARGS(args)) | 1544 | TP_ARGS(args)) |
1503 | DEFINE_ATTR_EVENT(xfs_attr_sf_add); | 1545 | DEFINE_ATTR_EVENT(xfs_attr_sf_add); |
@@ -1511,10 +1553,14 @@ DEFINE_ATTR_EVENT(xfs_attr_sf_to_leaf); | |||
1511 | DEFINE_ATTR_EVENT(xfs_attr_leaf_add); | 1553 | DEFINE_ATTR_EVENT(xfs_attr_leaf_add); |
1512 | DEFINE_ATTR_EVENT(xfs_attr_leaf_add_old); | 1554 | DEFINE_ATTR_EVENT(xfs_attr_leaf_add_old); |
1513 | DEFINE_ATTR_EVENT(xfs_attr_leaf_add_new); | 1555 | DEFINE_ATTR_EVENT(xfs_attr_leaf_add_new); |
1556 | DEFINE_ATTR_EVENT(xfs_attr_leaf_add_work); | ||
1514 | DEFINE_ATTR_EVENT(xfs_attr_leaf_addname); | 1557 | DEFINE_ATTR_EVENT(xfs_attr_leaf_addname); |
1515 | DEFINE_ATTR_EVENT(xfs_attr_leaf_create); | 1558 | DEFINE_ATTR_EVENT(xfs_attr_leaf_create); |
1559 | DEFINE_ATTR_EVENT(xfs_attr_leaf_compact); | ||
1560 | DEFINE_ATTR_EVENT(xfs_attr_leaf_get); | ||
1516 | DEFINE_ATTR_EVENT(xfs_attr_leaf_lookup); | 1561 | DEFINE_ATTR_EVENT(xfs_attr_leaf_lookup); |
1517 | DEFINE_ATTR_EVENT(xfs_attr_leaf_replace); | 1562 | DEFINE_ATTR_EVENT(xfs_attr_leaf_replace); |
1563 | DEFINE_ATTR_EVENT(xfs_attr_leaf_remove); | ||
1518 | DEFINE_ATTR_EVENT(xfs_attr_leaf_removename); | 1564 | DEFINE_ATTR_EVENT(xfs_attr_leaf_removename); |
1519 | DEFINE_ATTR_EVENT(xfs_attr_leaf_split); | 1565 | DEFINE_ATTR_EVENT(xfs_attr_leaf_split); |
1520 | DEFINE_ATTR_EVENT(xfs_attr_leaf_split_before); | 1566 | DEFINE_ATTR_EVENT(xfs_attr_leaf_split_before); |
@@ -1526,12 +1572,21 @@ DEFINE_ATTR_EVENT(xfs_attr_leaf_to_sf); | |||
1526 | DEFINE_ATTR_EVENT(xfs_attr_leaf_to_node); | 1572 | DEFINE_ATTR_EVENT(xfs_attr_leaf_to_node); |
1527 | DEFINE_ATTR_EVENT(xfs_attr_leaf_rebalance); | 1573 | DEFINE_ATTR_EVENT(xfs_attr_leaf_rebalance); |
1528 | DEFINE_ATTR_EVENT(xfs_attr_leaf_unbalance); | 1574 | DEFINE_ATTR_EVENT(xfs_attr_leaf_unbalance); |
1575 | DEFINE_ATTR_EVENT(xfs_attr_leaf_toosmall); | ||
1529 | 1576 | ||
1530 | DEFINE_ATTR_EVENT(xfs_attr_node_addname); | 1577 | DEFINE_ATTR_EVENT(xfs_attr_node_addname); |
1578 | DEFINE_ATTR_EVENT(xfs_attr_node_get); | ||
1531 | DEFINE_ATTR_EVENT(xfs_attr_node_lookup); | 1579 | DEFINE_ATTR_EVENT(xfs_attr_node_lookup); |
1532 | DEFINE_ATTR_EVENT(xfs_attr_node_replace); | 1580 | DEFINE_ATTR_EVENT(xfs_attr_node_replace); |
1533 | DEFINE_ATTR_EVENT(xfs_attr_node_removename); | 1581 | DEFINE_ATTR_EVENT(xfs_attr_node_removename); |
1534 | 1582 | ||
1583 | DEFINE_ATTR_EVENT(xfs_attr_fillstate); | ||
1584 | DEFINE_ATTR_EVENT(xfs_attr_refillstate); | ||
1585 | |||
1586 | DEFINE_ATTR_EVENT(xfs_attr_rmtval_get); | ||
1587 | DEFINE_ATTR_EVENT(xfs_attr_rmtval_set); | ||
1588 | DEFINE_ATTR_EVENT(xfs_attr_rmtval_remove); | ||
1589 | |||
1535 | #define DEFINE_DA_EVENT(name) \ | 1590 | #define DEFINE_DA_EVENT(name) \ |
1536 | DEFINE_EVENT(xfs_da_class, name, \ | 1591 | DEFINE_EVENT(xfs_da_class, name, \ |
1537 | TP_PROTO(struct xfs_da_args *args), \ | 1592 | TP_PROTO(struct xfs_da_args *args), \ |
@@ -1550,9 +1605,12 @@ DEFINE_DA_EVENT(xfs_da_node_split); | |||
1550 | DEFINE_DA_EVENT(xfs_da_node_remove); | 1605 | DEFINE_DA_EVENT(xfs_da_node_remove); |
1551 | DEFINE_DA_EVENT(xfs_da_node_rebalance); | 1606 | DEFINE_DA_EVENT(xfs_da_node_rebalance); |
1552 | DEFINE_DA_EVENT(xfs_da_node_unbalance); | 1607 | DEFINE_DA_EVENT(xfs_da_node_unbalance); |
1608 | DEFINE_DA_EVENT(xfs_da_node_toosmall); | ||
1553 | DEFINE_DA_EVENT(xfs_da_swap_lastblock); | 1609 | DEFINE_DA_EVENT(xfs_da_swap_lastblock); |
1554 | DEFINE_DA_EVENT(xfs_da_grow_inode); | 1610 | DEFINE_DA_EVENT(xfs_da_grow_inode); |
1555 | DEFINE_DA_EVENT(xfs_da_shrink_inode); | 1611 | DEFINE_DA_EVENT(xfs_da_shrink_inode); |
1612 | DEFINE_DA_EVENT(xfs_da_fixhashpath); | ||
1613 | DEFINE_DA_EVENT(xfs_da_path_shift); | ||
1556 | 1614 | ||
1557 | DECLARE_EVENT_CLASS(xfs_dir2_space_class, | 1615 | DECLARE_EVENT_CLASS(xfs_dir2_space_class, |
1558 | TP_PROTO(struct xfs_da_args *args, int idx), | 1616 | TP_PROTO(struct xfs_da_args *args, int idx), |
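
The new xfs_attr_class above carries the attribute name as a dynamic array plus a namelen field because xattr names are not NUL-terminated strings; the paired arguments behind "%.*s" in its TP_printk print exactly namelen bytes. A plain-C illustration of that formatting:

#include <stdio.h>

int main(void)
{
	/* attribute name carried as (pointer, length), no terminator */
	const char raw[] = { 'u', 's', 'e', 'r', '.', 'x' };
	int namelen = (int)sizeof(raw);

	/* %.*s consumes the length argument first, then the pointer */
	printf("name %.*s namelen %d\n", namelen, raw, namelen);
	return 0;
}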
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index db056544cbb5..c6c0601abd7a 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h | |||
@@ -464,10 +464,7 @@ xfs_trans_get_buf( | |||
464 | int numblks, | 464 | int numblks, |
465 | uint flags) | 465 | uint flags) |
466 | { | 466 | { |
467 | struct xfs_buf_map map = { | 467 | DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); |
468 | .bm_bn = blkno, | ||
469 | .bm_len = numblks, | ||
470 | }; | ||
471 | return xfs_trans_get_buf_map(tp, target, &map, 1, flags); | 468 | return xfs_trans_get_buf_map(tp, target, &map, 1, flags); |
472 | } | 469 | } |
473 | 470 | ||
@@ -476,7 +473,8 @@ int xfs_trans_read_buf_map(struct xfs_mount *mp, | |||
476 | struct xfs_buftarg *target, | 473 | struct xfs_buftarg *target, |
477 | struct xfs_buf_map *map, int nmaps, | 474 | struct xfs_buf_map *map, int nmaps, |
478 | xfs_buf_flags_t flags, | 475 | xfs_buf_flags_t flags, |
479 | struct xfs_buf **bpp); | 476 | struct xfs_buf **bpp, |
477 | const struct xfs_buf_ops *ops); | ||
480 | 478 | ||
481 | static inline int | 479 | static inline int |
482 | xfs_trans_read_buf( | 480 | xfs_trans_read_buf( |
@@ -486,13 +484,12 @@ xfs_trans_read_buf( | |||
486 | xfs_daddr_t blkno, | 484 | xfs_daddr_t blkno, |
487 | int numblks, | 485 | int numblks, |
488 | xfs_buf_flags_t flags, | 486 | xfs_buf_flags_t flags, |
489 | struct xfs_buf **bpp) | 487 | struct xfs_buf **bpp, |
488 | const struct xfs_buf_ops *ops) | ||
490 | { | 489 | { |
491 | struct xfs_buf_map map = { | 490 | DEFINE_SINGLE_BUF_MAP(map, blkno, numblks); |
492 | .bm_bn = blkno, | 491 | return xfs_trans_read_buf_map(mp, tp, target, &map, 1, |
493 | .bm_len = numblks, | 492 | flags, bpp, ops); |
494 | }; | ||
495 | return xfs_trans_read_buf_map(mp, tp, target, &map, 1, flags, bpp); | ||
496 | } | 493 | } |
497 | 494 | ||
498 | struct xfs_buf *xfs_trans_getsb(xfs_trans_t *, struct xfs_mount *, int); | 495 | struct xfs_buf *xfs_trans_getsb(xfs_trans_t *, struct xfs_mount *, int); |
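
DEFINE_SINGLE_BUF_MAP collapses the repeated two-field initializer shown on the left into a single declaration. A sketch of what such a macro looks like, assuming a (block, length) map like the xfs_buf_map in the old code; the names here mirror the kernel's but the definition is illustrative:

#include <stdio.h>

struct buf_map {
	long bm_bn;	/* starting block number */
	int  bm_len;	/* length in blocks */
};

/* one declaration-plus-initialization, mirroring the kernel macro's shape */
#define DEFINE_SINGLE_BUF_MAP(map, blkno, numblks) \
	struct buf_map map = { .bm_bn = (blkno), .bm_len = (numblks) }

int main(void)
{
	DEFINE_SINGLE_BUF_MAP(map, 128, 8);

	printf("bn=%ld len=%d\n", map.bm_bn, map.bm_len);
	return 0;
}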
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 6311b99c267f..4fc17d479d42 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c | |||
@@ -257,7 +257,8 @@ xfs_trans_read_buf_map( | |||
257 | struct xfs_buf_map *map, | 257 | struct xfs_buf_map *map, |
258 | int nmaps, | 258 | int nmaps, |
259 | xfs_buf_flags_t flags, | 259 | xfs_buf_flags_t flags, |
260 | struct xfs_buf **bpp) | 260 | struct xfs_buf **bpp, |
261 | const struct xfs_buf_ops *ops) | ||
261 | { | 262 | { |
262 | xfs_buf_t *bp; | 263 | xfs_buf_t *bp; |
263 | xfs_buf_log_item_t *bip; | 264 | xfs_buf_log_item_t *bip; |
@@ -265,7 +266,7 @@ xfs_trans_read_buf_map( | |||
265 | 266 | ||
266 | *bpp = NULL; | 267 | *bpp = NULL; |
267 | if (!tp) { | 268 | if (!tp) { |
268 | bp = xfs_buf_read_map(target, map, nmaps, flags); | 269 | bp = xfs_buf_read_map(target, map, nmaps, flags, ops); |
269 | if (!bp) | 270 | if (!bp) |
270 | return (flags & XBF_TRYLOCK) ? | 271 | return (flags & XBF_TRYLOCK) ? |
271 | EAGAIN : XFS_ERROR(ENOMEM); | 272 | EAGAIN : XFS_ERROR(ENOMEM); |
@@ -312,7 +313,9 @@ xfs_trans_read_buf_map( | |||
312 | if (!(XFS_BUF_ISDONE(bp))) { | 313 | if (!(XFS_BUF_ISDONE(bp))) { |
313 | trace_xfs_trans_read_buf_io(bp, _RET_IP_); | 314 | trace_xfs_trans_read_buf_io(bp, _RET_IP_); |
314 | ASSERT(!XFS_BUF_ISASYNC(bp)); | 315 | ASSERT(!XFS_BUF_ISASYNC(bp)); |
316 | ASSERT(bp->b_iodone == NULL); | ||
315 | XFS_BUF_READ(bp); | 317 | XFS_BUF_READ(bp); |
318 | bp->b_ops = ops; | ||
316 | xfsbdstrat(tp->t_mountp, bp); | 319 | xfsbdstrat(tp->t_mountp, bp); |
317 | error = xfs_buf_iowait(bp); | 320 | error = xfs_buf_iowait(bp); |
318 | if (error) { | 321 | if (error) { |
@@ -349,7 +352,7 @@ xfs_trans_read_buf_map( | |||
349 | return 0; | 352 | return 0; |
350 | } | 353 | } |
351 | 354 | ||
352 | bp = xfs_buf_read_map(target, map, nmaps, flags); | 355 | bp = xfs_buf_read_map(target, map, nmaps, flags, ops); |
353 | if (bp == NULL) { | 356 | if (bp == NULL) { |
354 | *bpp = NULL; | 357 | *bpp = NULL; |
355 | return (flags & XBF_TRYLOCK) ? | 358 | return (flags & XBF_TRYLOCK) ? |
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 2a5c637344b4..d95f565a390e 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c | |||
@@ -47,6 +47,7 @@ | |||
47 | #include "xfs_filestream.h" | 47 | #include "xfs_filestream.h" |
48 | #include "xfs_vnodeops.h" | 48 | #include "xfs_vnodeops.h" |
49 | #include "xfs_trace.h" | 49 | #include "xfs_trace.h" |
50 | #include "xfs_icache.h" | ||
50 | 51 | ||
51 | /* | 52 | /* |
52 | * The maximum pathlen is 1024 bytes. Since the minimum file system | 53 | * The maximum pathlen is 1024 bytes. Since the minimum file system |
@@ -79,7 +80,7 @@ xfs_readlink_bmap( | |||
79 | d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); | 80 | d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); |
80 | byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); | 81 | byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); |
81 | 82 | ||
82 | bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0); | 83 | bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0, NULL); |
83 | if (!bp) | 84 | if (!bp) |
84 | return XFS_ERROR(ENOMEM); | 85 | return XFS_ERROR(ENOMEM); |
85 | error = bp->b_error; | 86 | error = bp->b_error; |
@@ -150,7 +151,7 @@ xfs_readlink( | |||
150 | * when the link count isn't zero and by xfs_dm_punch_hole() when | 151 | * when the link count isn't zero and by xfs_dm_punch_hole() when |
151 | * punching a hole to EOF. | 152 | * punching a hole to EOF. |
152 | */ | 153 | */ |
153 | STATIC int | 154 | int |
154 | xfs_free_eofblocks( | 155 | xfs_free_eofblocks( |
155 | xfs_mount_t *mp, | 156 | xfs_mount_t *mp, |
156 | xfs_inode_t *ip, | 157 | xfs_inode_t *ip, |
@@ -199,7 +200,7 @@ xfs_free_eofblocks( | |||
199 | if (need_iolock) { | 200 | if (need_iolock) { |
200 | if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { | 201 | if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { |
201 | xfs_trans_cancel(tp, 0); | 202 | xfs_trans_cancel(tp, 0); |
202 | return 0; | 203 | return EAGAIN; |
203 | } | 204 | } |
204 | } | 205 | } |
205 | 206 | ||
@@ -237,6 +238,8 @@ xfs_free_eofblocks( | |||
237 | } else { | 238 | } else { |
238 | error = xfs_trans_commit(tp, | 239 | error = xfs_trans_commit(tp, |
239 | XFS_TRANS_RELEASE_LOG_RES); | 240 | XFS_TRANS_RELEASE_LOG_RES); |
241 | if (!error) | ||
242 | xfs_inode_clear_eofblocks_tag(ip); | ||
240 | } | 243 | } |
241 | 244 | ||
242 | xfs_iunlock(ip, XFS_ILOCK_EXCL); | 245 | xfs_iunlock(ip, XFS_ILOCK_EXCL); |
@@ -425,19 +428,18 @@ xfs_release( | |||
425 | truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED); | 428 | truncated = xfs_iflags_test_and_clear(ip, XFS_ITRUNCATED); |
426 | if (truncated) { | 429 | if (truncated) { |
427 | xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE); | 430 | xfs_iflags_clear(ip, XFS_IDIRTY_RELEASE); |
428 | if (VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0) | 431 | if (VN_DIRTY(VFS_I(ip)) && ip->i_delayed_blks > 0) { |
429 | xfs_flush_pages(ip, 0, -1, XBF_ASYNC, FI_NONE); | 432 | error = -filemap_flush(VFS_I(ip)->i_mapping); |
433 | if (error) | ||
434 | return error; | ||
435 | } | ||
430 | } | 436 | } |
431 | } | 437 | } |
432 | 438 | ||
433 | if (ip->i_d.di_nlink == 0) | 439 | if (ip->i_d.di_nlink == 0) |
434 | return 0; | 440 | return 0; |
435 | 441 | ||
436 | if ((S_ISREG(ip->i_d.di_mode) && | 442 | if (xfs_can_free_eofblocks(ip, false)) { |
437 | (VFS_I(ip)->i_size > 0 || | ||
438 | (VN_CACHED(VFS_I(ip)) > 0 || ip->i_delayed_blks > 0)) && | ||
439 | (ip->i_df.if_flags & XFS_IFEXTENTS)) && | ||
440 | (!(ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)))) { | ||
441 | 443 | ||
442 | /* | 444 | /* |
443 | * If we can't get the iolock just skip truncating the blocks | 445 | * If we can't get the iolock just skip truncating the blocks |
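xfs_release() now calls the generic filemap_flush() instead of the XFS-private xfs_flush_pages() wrapper. For reference, the generic helper amounts to an asynchronous writeback of the whole mapping (paraphrased from mm/filemap.c of this era); the leading minus in the hunk converts the VFS's negative errno into the positive convention XFS uses internally:

	/* paraphrase: filemap_flush() starts WB_SYNC_NONE writeback on a mapping */
	int filemap_flush(struct address_space *mapping)
	{
		return __filemap_fdatawrite(mapping, WB_SYNC_NONE);
	}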
@@ -464,7 +466,7 @@ xfs_release( | |||
464 | return 0; | 466 | return 0; |
465 | 467 | ||
466 | error = xfs_free_eofblocks(mp, ip, true); | 468 | error = xfs_free_eofblocks(mp, ip, true); |
467 | if (error) | 469 | if (error && error != EAGAIN) |
468 | return error; | 470 | return error; |
469 | 471 | ||
470 | /* delalloc blocks after truncation means it really is dirty */ | 472 | /* delalloc blocks after truncation means it really is dirty */ |
@@ -513,13 +515,12 @@ xfs_inactive( | |||
513 | goto out; | 515 | goto out; |
514 | 516 | ||
515 | if (ip->i_d.di_nlink != 0) { | 517 | if (ip->i_d.di_nlink != 0) { |
516 | if ((S_ISREG(ip->i_d.di_mode) && | 518 | /* |
517 | (VFS_I(ip)->i_size > 0 || | 519 | * force is true because we are evicting an inode from the |
518 | (VN_CACHED(VFS_I(ip)) > 0 || ip->i_delayed_blks > 0)) && | 520 | * cache. Post-eof blocks must be freed, lest we end up with |
519 | (ip->i_df.if_flags & XFS_IFEXTENTS) && | 521 | * broken free space accounting. |
520 | (!(ip->i_d.di_flags & | 522 | */ |
521 | (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) || | 523 | if (xfs_can_free_eofblocks(ip, true)) { |
522 | ip->i_delayed_blks != 0))) { | ||
523 | error = xfs_free_eofblocks(mp, ip, false); | 524 | error = xfs_free_eofblocks(mp, ip, false); |
524 | if (error) | 525 | if (error) |
525 | return VN_INACTIVE_CACHE; | 526 | return VN_INACTIVE_CACHE; |
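Both this hunk and the xfs_release() hunk above collapse a long open-coded predicate into xfs_can_free_eofblocks(). A hedged reconstruction of that helper from the two removed conditions (the real function is added elsewhere in this series and may differ in wording):

	bool
	xfs_can_free_eofblocks(
		struct xfs_inode	*ip,
		bool			force)
	{
		/* only regular files carry speculative preallocation */
		if (!S_ISREG(ip->i_d.di_mode))
			return false;

		/* nothing to trim on an empty file with no cached or delalloc data */
		if (VFS_I(ip)->i_size == 0 &&
		    VN_CACHED(VFS_I(ip)) == 0 &&
		    ip->i_delayed_blks == 0)
			return false;

		/* don't read the extent list in just for this */
		if (!(ip->i_df.if_flags & XFS_IFEXTENTS))
			return false;

		/*
		 * prealloc/append files are only trimmed under force (inode
		 * eviction), and then only if delalloc blocks remain
		 */
		if (ip->i_d.di_flags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND))
			return force && ip->i_delayed_blks != 0;

		return true;
	}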
@@ -777,7 +778,7 @@ xfs_create( | |||
777 | XFS_TRANS_PERM_LOG_RES, log_count); | 778 | XFS_TRANS_PERM_LOG_RES, log_count); |
778 | if (error == ENOSPC) { | 779 | if (error == ENOSPC) { |
779 | /* flush outstanding delalloc blocks and retry */ | 780 | /* flush outstanding delalloc blocks and retry */ |
780 | xfs_flush_inodes(dp); | 781 | xfs_flush_inodes(mp); |
781 | error = xfs_trans_reserve(tp, resblks, log_res, 0, | 782 | error = xfs_trans_reserve(tp, resblks, log_res, 0, |
782 | XFS_TRANS_PERM_LOG_RES, log_count); | 783 | XFS_TRANS_PERM_LOG_RES, log_count); |
783 | } | 784 | } |
@@ -1957,12 +1958,11 @@ xfs_free_file_space( | |||
1957 | 1958 | ||
1958 | rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE); | 1959 | rounding = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE); |
1959 | ioffset = offset & ~(rounding - 1); | 1960 | ioffset = offset & ~(rounding - 1); |
1960 | 1961 | error = -filemap_write_and_wait_range(VFS_I(ip)->i_mapping, | |
1961 | if (VN_CACHED(VFS_I(ip)) != 0) { | 1962 | ioffset, -1); |
1962 | error = xfs_flushinval_pages(ip, ioffset, -1, FI_REMAPF_LOCKED); | 1963 | if (error) |
1963 | if (error) | 1964 | goto out_unlock_iolock; |
1964 | goto out_unlock_iolock; | 1965 | truncate_pagecache_range(VFS_I(ip), ioffset, -1); |
1965 | } | ||
1966 | 1966 | ||
1967 | /* | 1967 | /* |
1968 | * Need to zero the stuff we're not freeing, on disk. | 1968 | * Need to zero the stuff we're not freeing, on disk. |
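The xfs_free_file_space() hunk above swaps the XFS-private xfs_flushinval_pages() for two generic calls, and the ordering matters: dirty pages over the range are written back before the cached pages are dropped, so no data is lost ahead of the blocks being freed. A sketch of the same sequence as a standalone helper (the helper name is hypothetical; -1 as the end offset means "through end of file"):

	static int
	punch_prepare_pagecache(struct inode *inode, loff_t start)
	{
		int error;

		/* write dirty pages from start to EOF back to disk first ... */
		error = filemap_write_and_wait_range(inode->i_mapping, start, -1);
		if (error)
			return error;
		/* ... then discard the now-clean pages over the punched range */
		truncate_pagecache_range(inode, start, -1);
		return 0;
	}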
@@ -2095,6 +2095,73 @@ xfs_free_file_space( | |||
2095 | return error; | 2095 | return error; |
2096 | } | 2096 | } |
2097 | 2097 | ||
2098 | |||
2099 | STATIC int | ||
2100 | xfs_zero_file_space( | ||
2101 | struct xfs_inode *ip, | ||
2102 | xfs_off_t offset, | ||
2103 | xfs_off_t len, | ||
2104 | int attr_flags) | ||
2105 | { | ||
2106 | struct xfs_mount *mp = ip->i_mount; | ||
2107 | uint granularity; | ||
2108 | xfs_off_t start_boundary; | ||
2109 | xfs_off_t end_boundary; | ||
2110 | int error; | ||
2111 | |||
2112 | granularity = max_t(uint, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE); | ||
2113 | |||
2114 | /* | ||
2115 | * Round the range of extents we are going to convert inwards. If the | ||
2116 | * offset is aligned, then it doesn't get changed, so we zero from the | ||
2117 | * start of the block that offset points to. | ||
2118 | */ | ||
2119 | start_boundary = round_up(offset, granularity); | ||
2120 | end_boundary = round_down(offset + len, granularity); | ||
2121 | |||
2122 | ASSERT(start_boundary >= offset); | ||
2123 | ASSERT(end_boundary <= offset + len); | ||
2124 | |||
2125 | if (!(attr_flags & XFS_ATTR_NOLOCK)) | ||
2126 | xfs_ilock(ip, XFS_IOLOCK_EXCL); | ||
2127 | |||
2128 | if (start_boundary < end_boundary - 1) { | ||
2129 | /* punch out the page cache over the conversion range */ | ||
2130 | truncate_pagecache_range(VFS_I(ip), start_boundary, | ||
2131 | end_boundary - 1); | ||
2132 | /* convert the blocks */ | ||
2133 | error = xfs_alloc_file_space(ip, start_boundary, | ||
2134 | end_boundary - start_boundary - 1, | ||
2135 | XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT, | ||
2136 | attr_flags); | ||
2137 | if (error) | ||
2138 | goto out_unlock; | ||
2139 | |||
2140 | /* We've handled the interior of the range, now for the edges */ | ||
2141 | if (start_boundary != offset) | ||
2142 | error = xfs_iozero(ip, offset, start_boundary - offset); | ||
2143 | if (error) | ||
2144 | goto out_unlock; | ||
2145 | |||
2146 | if (end_boundary != offset + len) | ||
2147 | error = xfs_iozero(ip, end_boundary, | ||
2148 | offset + len - end_boundary); | ||
2149 | |||
2150 | } else { | ||
2151 | /* | ||
2152 | * It's either a sub-granularity range, or the range spans | ||
2153 | * parts of two adjacent blocks. | ||
2154 | */ | ||
2155 | error = xfs_iozero(ip, offset, len); | ||
2156 | } | ||
2157 | |||
2158 | out_unlock: | ||
2159 | if (!(attr_flags & XFS_ATTR_NOLOCK)) | ||
2160 | xfs_iunlock(ip, XFS_IOLOCK_EXCL); | ||
2161 | return error; | ||
2162 | |||
2163 | } | ||
2164 | |||
2098 | /* | 2165 | /* |
2099 | * xfs_change_file_space() | 2166 | * xfs_change_file_space() |
2100 | * This routine allocates or frees disk space for the given file. | 2167 | * This routine allocates or frees disk space for the given file. |
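A worked example of the boundary rounding in xfs_zero_file_space(), using hypothetical values (4096-byte granularity, zeroing bytes 3000..12999). The kernel's round_up()/round_down() operate on power-of-two alignments; the arithmetic versions below are equivalent for such values and keep the example self-contained:

	#include <stdio.h>

	#define round_up(x, y)   ((((x) + (y) - 1) / (y)) * (y))
	#define round_down(x, y) (((x) / (y)) * (y))

	int main(void)
	{
		long long offset = 3000, len = 10000, granularity = 4096;
		long long start_boundary = round_up(offset, granularity);	 /* 4096 */
		long long end_boundary = round_down(offset + len, granularity); /* 12288 */

		/* interior [4096, 12288) is converted to unwritten extents */
		printf("convert: [%lld, %lld)\n", start_boundary, end_boundary);
		/* edges [3000, 4096) and [12288, 13000) are zeroed through the page cache */
		printf("zero:    [%lld, %lld) and [%lld, %lld)\n",
		       offset, start_boundary, end_boundary, offset + len);
		return 0;
	}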
@@ -2120,10 +2187,8 @@ xfs_change_file_space( | |||
2120 | xfs_fsize_t fsize; | 2187 | xfs_fsize_t fsize; |
2121 | int setprealloc; | 2188 | int setprealloc; |
2122 | xfs_off_t startoffset; | 2189 | xfs_off_t startoffset; |
2123 | xfs_off_t llen; | ||
2124 | xfs_trans_t *tp; | 2190 | xfs_trans_t *tp; |
2125 | struct iattr iattr; | 2191 | struct iattr iattr; |
2126 | int prealloc_type; | ||
2127 | 2192 | ||
2128 | if (!S_ISREG(ip->i_d.di_mode)) | 2193 | if (!S_ISREG(ip->i_d.di_mode)) |
2129 | return XFS_ERROR(EINVAL); | 2194 | return XFS_ERROR(EINVAL); |
@@ -2141,12 +2206,30 @@ xfs_change_file_space( | |||
2141 | return XFS_ERROR(EINVAL); | 2206 | return XFS_ERROR(EINVAL); |
2142 | } | 2207 | } |
2143 | 2208 | ||
2144 | llen = bf->l_len > 0 ? bf->l_len - 1 : bf->l_len; | 2209 | /* |
2210 | * A length of <= 0 for resv/unresv/zero is invalid. The length for | ||
2211 | * alloc/free is ignored completely, and we have no idea what userspace | ||
2212 | * might have set it to, so set it to zero to allow the range | ||
2213 | * checks to pass. | ||
2214 | */ | ||
2215 | switch (cmd) { | ||
2216 | case XFS_IOC_ZERO_RANGE: | ||
2217 | case XFS_IOC_RESVSP: | ||
2218 | case XFS_IOC_RESVSP64: | ||
2219 | case XFS_IOC_UNRESVSP: | ||
2220 | case XFS_IOC_UNRESVSP64: | ||
2221 | if (bf->l_len <= 0) | ||
2222 | return XFS_ERROR(EINVAL); | ||
2223 | break; | ||
2224 | default: | ||
2225 | bf->l_len = 0; | ||
2226 | break; | ||
2227 | } | ||
2145 | 2228 | ||
2146 | if (bf->l_start < 0 || | 2229 | if (bf->l_start < 0 || |
2147 | bf->l_start > mp->m_super->s_maxbytes || | 2230 | bf->l_start > mp->m_super->s_maxbytes || |
2148 | bf->l_start + llen < 0 || | 2231 | bf->l_start + bf->l_len < 0 || |
2149 | bf->l_start + llen > mp->m_super->s_maxbytes) | 2232 | bf->l_start + bf->l_len >= mp->m_super->s_maxbytes) |
2150 | return XFS_ERROR(EINVAL); | 2233 | return XFS_ERROR(EINVAL); |
2151 | 2234 | ||
2152 | bf->l_whence = 0; | 2235 | bf->l_whence = 0; |
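For context on the validation added above: the resv/unresv/zero-range ioctls carry the byte count in l_len of their flock64-style argument, and a zero or negative length now fails with EINVAL instead of being passed through. A hedged userspace sketch (path and size are made up; assumes the xfsprogs headers, whose xfsctl(3) page documents these commands):

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <xfs/xfs.h>

	int main(void)
	{
		/* reserve 1 MiB of unwritten space at offset 0 */
		xfs_flock64_t fl = { .l_whence = 0, .l_start = 0, .l_len = 1 << 20 };
		int fd = open("/mnt/xfs/testfile", O_RDWR | O_CREAT, 0644);

		if (fd < 0 || ioctl(fd, XFS_IOC_RESVSP64, &fl) < 0)
			perror("resvsp");	/* l_len <= 0 would now yield EINVAL */
		return 0;
	}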
@@ -2154,29 +2237,20 @@ xfs_change_file_space( | |||
2154 | startoffset = bf->l_start; | 2237 | startoffset = bf->l_start; |
2155 | fsize = XFS_ISIZE(ip); | 2238 | fsize = XFS_ISIZE(ip); |
2156 | 2239 | ||
2157 | /* | ||
2158 | * XFS_IOC_RESVSP and XFS_IOC_UNRESVSP will reserve or unreserve | ||
2159 | * file space. | ||
2160 | * These calls do NOT zero the data space allocated to the file, | ||
2161 | * nor do they change the file size. | ||
2162 | * | ||
2163 | * XFS_IOC_ALLOCSP and XFS_IOC_FREESP will allocate and free file | ||
2164 | * space. | ||
2165 | * These calls cause the new file data to be zeroed and the file | ||
2166 | * size to be changed. | ||
2167 | */ | ||
2168 | setprealloc = clrprealloc = 0; | 2240 | setprealloc = clrprealloc = 0; |
2169 | prealloc_type = XFS_BMAPI_PREALLOC; | ||
2170 | |||
2171 | switch (cmd) { | 2241 | switch (cmd) { |
2172 | case XFS_IOC_ZERO_RANGE: | 2242 | case XFS_IOC_ZERO_RANGE: |
2173 | prealloc_type |= XFS_BMAPI_CONVERT; | 2243 | error = xfs_zero_file_space(ip, startoffset, bf->l_len, |
2174 | xfs_tosspages(ip, startoffset, startoffset + bf->l_len, 0); | 2244 | attr_flags); |
2175 | /* FALLTHRU */ | 2245 | if (error) |
2246 | return error; | ||
2247 | setprealloc = 1; | ||
2248 | break; | ||
2249 | |||
2176 | case XFS_IOC_RESVSP: | 2250 | case XFS_IOC_RESVSP: |
2177 | case XFS_IOC_RESVSP64: | 2251 | case XFS_IOC_RESVSP64: |
2178 | error = xfs_alloc_file_space(ip, startoffset, bf->l_len, | 2252 | error = xfs_alloc_file_space(ip, startoffset, bf->l_len, |
2179 | prealloc_type, attr_flags); | 2253 | XFS_BMAPI_PREALLOC, attr_flags); |
2180 | if (error) | 2254 | if (error) |
2181 | return error; | 2255 | return error; |
2182 | setprealloc = 1; | 2256 | setprealloc = 1; |
diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h index 447e146b2ba6..5163022d9808 100644 --- a/fs/xfs/xfs_vnodeops.h +++ b/fs/xfs/xfs_vnodeops.h | |||
@@ -48,14 +48,9 @@ int xfs_attr_set(struct xfs_inode *dp, const unsigned char *name, | |||
48 | int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags); | 48 | int xfs_attr_remove(struct xfs_inode *dp, const unsigned char *name, int flags); |
49 | int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize, | 49 | int xfs_attr_list(struct xfs_inode *dp, char *buffer, int bufsize, |
50 | int flags, struct attrlist_cursor_kern *cursor); | 50 | int flags, struct attrlist_cursor_kern *cursor); |
51 | void xfs_tosspages(struct xfs_inode *inode, xfs_off_t first, | ||
52 | xfs_off_t last, int fiopt); | ||
53 | int xfs_flushinval_pages(struct xfs_inode *ip, xfs_off_t first, | ||
54 | xfs_off_t last, int fiopt); | ||
55 | int xfs_flush_pages(struct xfs_inode *ip, xfs_off_t first, | ||
56 | xfs_off_t last, uint64_t flags, int fiopt); | ||
57 | int xfs_wait_on_pages(struct xfs_inode *ip, xfs_off_t first, xfs_off_t last); | ||
58 | 51 | ||
52 | int xfs_iozero(struct xfs_inode *, loff_t, size_t); | ||
59 | int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t); | 53 | int xfs_zero_eof(struct xfs_inode *, xfs_off_t, xfs_fsize_t); |
54 | int xfs_free_eofblocks(struct xfs_mount *, struct xfs_inode *, bool); | ||
60 | 55 | ||
61 | #endif /* _XFS_VNODEOPS_H */ | 56 | #endif /* _XFS_VNODEOPS_H */ |