author		Artem Bityutskiy <Artem.Bityutskiy@nokia.com>	2009-09-21 05:09:22 -0400
committer	Artem Bityutskiy <Artem.Bityutskiy@nokia.com>	2009-09-21 05:09:22 -0400
commit		7cce2f4cb7f5f641f78c8e3eea4e7b1b96cb71c0 (patch)
tree		b064d077928cf224660ab1e1841cdab2c9fd8b08 /fs
parent		e055f7e873d900925c222cf2d1ec955af4a9ca90 (diff)
parent		ebc79c4f8da0f92efa968e0328f32334a2ce80cf (diff)
Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6 into linux-next

Conflicts:
	fs/ubifs/super.c

Merge the upstream tree in order to resolve a conflict with the
per-bdi writeback changes from the linux-2.6-block tree.
Diffstat (limited to 'fs')
292 files changed, 9459 insertions, 6600 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 332b5ff02fec..f7003cfac63d 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -76,7 +76,7 @@ static const match_table_t tokens = {
  * Return 0 upon success, -ERRNO upon failure.
  */
 
-static int v9fs_parse_options(struct v9fs_session_info *v9ses)
+static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts)
 {
 	char *options;
 	substring_t args[MAX_OPT_ARGS];
@@ -90,10 +90,10 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses)
 	v9ses->debug = 0;
 	v9ses->cache = 0;
 
-	if (!v9ses->options)
+	if (!opts)
 		return 0;
 
-	options = kstrdup(v9ses->options, GFP_KERNEL);
+	options = kstrdup(opts, GFP_KERNEL);
 	if (!options) {
 		P9_DPRINTK(P9_DEBUG_ERROR,
 			   "failed to allocate copy of option string\n");
@@ -206,24 +206,14 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses,
 	v9ses->uid = ~0;
 	v9ses->dfltuid = V9FS_DEFUID;
 	v9ses->dfltgid = V9FS_DEFGID;
-	if (data) {
-		v9ses->options = kstrdup(data, GFP_KERNEL);
-		if (!v9ses->options) {
-			P9_DPRINTK(P9_DEBUG_ERROR,
-				   "failed to allocate copy of option string\n");
-			retval = -ENOMEM;
-			goto error;
-		}
-	}
 
-	rc = v9fs_parse_options(v9ses);
+	rc = v9fs_parse_options(v9ses, data);
 	if (rc < 0) {
 		retval = rc;
 		goto error;
 	}
 
-	v9ses->clnt = p9_client_create(dev_name, v9ses->options);
-
+	v9ses->clnt = p9_client_create(dev_name, data);
 	if (IS_ERR(v9ses->clnt)) {
 		retval = PTR_ERR(v9ses->clnt);
 		v9ses->clnt = NULL;
@@ -280,7 +270,6 @@ void v9fs_session_close(struct v9fs_session_info *v9ses)
 
 	__putname(v9ses->uname);
 	__putname(v9ses->aname);
-	kfree(v9ses->options);
 }
 
 /**
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index a7d567192998..38762bf102a9 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -85,7 +85,6 @@ struct v9fs_session_info {
 	unsigned int afid;
 	unsigned int cache;
 
-	char *options;	/* copy of mount options */
 	char *uname;	/* user name to mount as */
 	char *aname;	/* name of remote hierarchy being mounted */
 	unsigned int maxdata;	/* max data for client interface */
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 81f8bbf12f9f..06a223d50a81 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -171,7 +171,6 @@ int v9fs_uflags2omode(int uflags, int extended)
 
 /**
  * v9fs_blank_wstat - helper function to setup a 9P stat structure
- * @v9ses: 9P session info (for determining extended mode)
  * @wstat: structure to initialize
  *
  */
@@ -207,65 +206,72 @@ v9fs_blank_wstat(struct p9_wstat *wstat)
 
 struct inode *v9fs_get_inode(struct super_block *sb, int mode)
 {
+	int err;
 	struct inode *inode;
 	struct v9fs_session_info *v9ses = sb->s_fs_info;
 
 	P9_DPRINTK(P9_DEBUG_VFS, "super block: %p mode: %o\n", sb, mode);
 
 	inode = new_inode(sb);
-	if (inode) {
-		inode->i_mode = mode;
-		inode->i_uid = current_fsuid();
-		inode->i_gid = current_fsgid();
-		inode->i_blocks = 0;
-		inode->i_rdev = 0;
-		inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
-		inode->i_mapping->a_ops = &v9fs_addr_operations;
-
-		switch (mode & S_IFMT) {
-		case S_IFIFO:
-		case S_IFBLK:
-		case S_IFCHR:
-		case S_IFSOCK:
-			if (!v9fs_extended(v9ses)) {
-				P9_DPRINTK(P9_DEBUG_ERROR,
-					   "special files without extended mode\n");
-				return ERR_PTR(-EINVAL);
-			}
-			init_special_inode(inode, inode->i_mode,
-					   inode->i_rdev);
-			break;
-		case S_IFREG:
-			inode->i_op = &v9fs_file_inode_operations;
-			inode->i_fop = &v9fs_file_operations;
-			break;
-		case S_IFLNK:
-			if (!v9fs_extended(v9ses)) {
-				P9_DPRINTK(P9_DEBUG_ERROR,
-					   "extended modes used w/o 9P2000.u\n");
-				return ERR_PTR(-EINVAL);
-			}
-			inode->i_op = &v9fs_symlink_inode_operations;
-			break;
-		case S_IFDIR:
-			inc_nlink(inode);
-			if (v9fs_extended(v9ses))
-				inode->i_op = &v9fs_dir_inode_operations_ext;
-			else
-				inode->i_op = &v9fs_dir_inode_operations;
-			inode->i_fop = &v9fs_dir_operations;
-			break;
-		default:
-			P9_DPRINTK(P9_DEBUG_ERROR,
-				   "BAD mode 0x%x S_IFMT 0x%x\n",
-				   mode, mode & S_IFMT);
-			return ERR_PTR(-EINVAL);
-		}
-	} else {
+	if (!inode) {
 		P9_EPRINTK(KERN_WARNING, "Problem allocating inode\n");
 		return ERR_PTR(-ENOMEM);
 	}
+
+	inode->i_mode = mode;
+	inode->i_uid = current_fsuid();
+	inode->i_gid = current_fsgid();
+	inode->i_blocks = 0;
+	inode->i_rdev = 0;
+	inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+	inode->i_mapping->a_ops = &v9fs_addr_operations;
+
+	switch (mode & S_IFMT) {
+	case S_IFIFO:
+	case S_IFBLK:
+	case S_IFCHR:
+	case S_IFSOCK:
+		if (!v9fs_extended(v9ses)) {
+			P9_DPRINTK(P9_DEBUG_ERROR,
+				   "special files without extended mode\n");
+			err = -EINVAL;
+			goto error;
+		}
+		init_special_inode(inode, inode->i_mode, inode->i_rdev);
+		break;
+	case S_IFREG:
+		inode->i_op = &v9fs_file_inode_operations;
+		inode->i_fop = &v9fs_file_operations;
+		break;
+	case S_IFLNK:
+		if (!v9fs_extended(v9ses)) {
+			P9_DPRINTK(P9_DEBUG_ERROR,
+				   "extended modes used w/o 9P2000.u\n");
+			err = -EINVAL;
+			goto error;
+		}
+		inode->i_op = &v9fs_symlink_inode_operations;
+		break;
+	case S_IFDIR:
+		inc_nlink(inode);
+		if (v9fs_extended(v9ses))
+			inode->i_op = &v9fs_dir_inode_operations_ext;
+		else
+			inode->i_op = &v9fs_dir_inode_operations;
+		inode->i_fop = &v9fs_dir_operations;
+		break;
+	default:
+		P9_DPRINTK(P9_DEBUG_ERROR, "BAD mode 0x%x S_IFMT 0x%x\n",
+			   mode, mode & S_IFMT);
+		err = -EINVAL;
+		goto error;
+	}
+
 	return inode;
+
+error:
+	iput(inode);
+	return ERR_PTR(err);
 }
 
 /*
@@ -338,30 +344,25 @@ v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid,
 
 	ret = NULL;
 	st = p9_client_stat(fid);
-	if (IS_ERR(st)) {
-		err = PTR_ERR(st);
-		st = NULL;
-		goto error;
-	}
+	if (IS_ERR(st))
+		return ERR_CAST(st);
 
 	umode = p9mode2unixmode(v9ses, st->mode);
 	ret = v9fs_get_inode(sb, umode);
 	if (IS_ERR(ret)) {
 		err = PTR_ERR(ret);
-		ret = NULL;
 		goto error;
 	}
 
 	v9fs_stat2inode(st, ret, sb);
 	ret->i_ino = v9fs_qid2ino(&st->qid);
+	p9stat_free(st);
 	kfree(st);
 	return ret;
 
 error:
+	p9stat_free(st);
 	kfree(st);
-	if (ret)
-		iput(ret);
-
 	return ERR_PTR(err);
 }
 
@@ -403,9 +404,9 @@ v9fs_open_created(struct inode *inode, struct file *file)
  * @v9ses: session information
  * @dir: directory that dentry is being created in
  * @dentry: dentry that is being created
+ * @extension: 9p2000.u extension string to support devices, etc.
  * @perm: create permissions
  * @mode: open mode
- * @extension: 9p2000.u extension string to support devices, etc.
  *
  */
 static struct p9_fid *
@@ -470,7 +471,10 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
 	dentry->d_op = &v9fs_dentry_operations;
 
 	d_instantiate(dentry, inode);
-	v9fs_fid_add(dentry, fid);
+	err = v9fs_fid_add(dentry, fid);
+	if (err < 0)
+		goto error;
+
 	return ofid;
 
 error:
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 38d695d66a0b..8961f1a8f668 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -81,7 +81,7 @@ static int v9fs_set_super(struct super_block *s, void *data)
 
 static void
 v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
-		int flags)
+		int flags, void *data)
 {
 	sb->s_maxbytes = MAX_LFS_FILESIZE;
 	sb->s_blocksize_bits = fls(v9ses->maxdata - 1);
@@ -91,6 +91,8 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
 
 	sb->s_flags = flags | MS_ACTIVE | MS_SYNCHRONOUS | MS_DIRSYNC |
 		      MS_NOATIME;
+
+	save_mount_options(sb, data);
 }
 
 /**
@@ -113,14 +115,11 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
 	struct v9fs_session_info *v9ses = NULL;
 	struct p9_wstat *st = NULL;
 	int mode = S_IRWXUGO | S_ISVTX;
-	uid_t uid = current_fsuid();
-	gid_t gid = current_fsgid();
 	struct p9_fid *fid;
 	int retval = 0;
 
 	P9_DPRINTK(P9_DEBUG_VFS, " \n");
 
-	st = NULL;
 	v9ses = kzalloc(sizeof(struct v9fs_session_info), GFP_KERNEL);
 	if (!v9ses)
 		return -ENOMEM;
@@ -142,7 +141,7 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
 		retval = PTR_ERR(sb);
 		goto free_stat;
 	}
-	v9fs_fill_super(sb, v9ses, flags);
+	v9fs_fill_super(sb, v9ses, flags, data);
 
 	inode = v9fs_get_inode(sb, S_IFDIR | mode);
 	if (IS_ERR(inode)) {
@@ -150,9 +149,6 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
 		goto release_sb;
 	}
 
-	inode->i_uid = uid;
-	inode->i_gid = gid;
-
 	root = d_alloc_root(inode);
 	if (!root) {
 		iput(inode);
@@ -173,10 +169,8 @@ P9_DPRINTK(P9_DEBUG_VFS, " simple set mount, return 0\n");
 	simple_set_mnt(mnt, sb);
 	return 0;
 
-release_sb:
-	deactivate_locked_super(sb);
-
 free_stat:
+	p9stat_free(st);
 	kfree(st);
 
 clunk_fid:
@@ -185,7 +179,12 @@ clunk_fid:
 close_session:
 	v9fs_session_close(v9ses);
 	kfree(v9ses);
+	return retval;
 
+release_sb:
+	p9stat_free(st);
+	kfree(st);
+	deactivate_locked_super(sb);
 	return retval;
 }
 
@@ -207,24 +206,10 @@ static void v9fs_kill_super(struct super_block *s)
 
 	v9fs_session_close(v9ses);
 	kfree(v9ses);
+	s->s_fs_info = NULL;
 	P9_DPRINTK(P9_DEBUG_VFS, "exiting kill_super\n");
 }
 
-/**
- * v9fs_show_options - Show mount options in /proc/mounts
- * @m: seq_file to write to
- * @mnt: mount descriptor
- *
- */
-
-static int v9fs_show_options(struct seq_file *m, struct vfsmount *mnt)
-{
-	struct v9fs_session_info *v9ses = mnt->mnt_sb->s_fs_info;
-
-	seq_printf(m, "%s", v9ses->options);
-	return 0;
-}
-
 static void
 v9fs_umount_begin(struct super_block *sb)
 {
@@ -237,7 +222,7 @@ v9fs_umount_begin(struct super_block *sb)
 static const struct super_operations v9fs_super_ops = {
 	.statfs = simple_statfs,
 	.clear_inode = v9fs_clear_inode,
-	.show_options = v9fs_show_options,
+	.show_options = generic_show_options,
 	.umount_begin = v9fs_umount_begin,
 };
 
diff --git a/fs/Kconfig b/fs/Kconfig
index 0e7da7bb5d93..455aa207e67e 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -43,6 +43,7 @@ source "fs/xfs/Kconfig"
 source "fs/gfs2/Kconfig"
 source "fs/ocfs2/Kconfig"
 source "fs/btrfs/Kconfig"
+source "fs/nilfs2/Kconfig"
 
 endif # BLOCK
 
@@ -186,7 +187,6 @@ source "fs/romfs/Kconfig"
 source "fs/sysv/Kconfig"
 source "fs/ufs/Kconfig"
 source "fs/exofs/Kconfig"
-source "fs/nilfs2/Kconfig"
 
 endif # MISC_FILESYSTEMS
 
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 0149dab365e7..681c2a7b013f 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -134,9 +134,16 @@ static int afs_readpage(struct file *file, struct page *page)
 
 	inode = page->mapping->host;
 
-	ASSERT(file != NULL);
-	key = file->private_data;
-	ASSERT(key != NULL);
+	if (file) {
+		key = file->private_data;
+		ASSERT(key != NULL);
+	} else {
+		key = afs_request_key(AFS_FS_S(inode->i_sb)->volume->cell);
+		if (IS_ERR(key)) {
+			ret = PTR_ERR(key);
+			goto error_nokey;
+		}
+	}
 
 	_enter("{%x},{%lu},{%lu}", key_serial(key), inode->i_ino, page->index);
 
@@ -207,12 +214,17 @@ static int afs_readpage(struct file *file, struct page *page)
 		unlock_page(page);
 	}
 
+	if (!file)
+		key_put(key);
 	_leave(" = 0");
 	return 0;
 
 error:
 	SetPageError(page);
 	unlock_page(page);
+	if (!file)
+		key_put(key);
+error_nokey:
 	_leave(" = %d", ret);
 	return ret;
 }
diff --git a/fs/afs/write.c b/fs/afs/write.c
index c2e7a7ff0080..c63a3c8beb73 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -712,7 +712,6 @@ int afs_writeback_all(struct afs_vnode *vnode)
 		.bdi = mapping->backing_dev_info,
 		.sync_mode = WB_SYNC_ALL,
 		.nr_to_write = LONG_MAX,
-		.for_writepages = 1,
 		.range_cyclic = 1,
 	};
 	int ret;
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index aa39ae83f019..3da18d453488 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -77,7 +77,7 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry)
 	}
 
 	/* Update the expiry counter if fs is busy */
-	if (!may_umount_tree(mnt)) {
+	if (!may_umount_tree(path.mnt)) {
 		struct autofs_info *ino = autofs4_dentry_ino(top);
 		ino->last_used = jiffies;
 		goto done;
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index b7c1603cd4bd..7c1e65d54872 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -501,22 +501,22 @@ static unsigned long load_elf_interp(struct elfhdr *interp_elf_ex,
 		}
 	}
 
-	/*
-	 * Now fill out the bss section. First pad the last page up
-	 * to the page boundary, and then perform a mmap to make sure
-	 * that there are zero-mapped pages up to and including the
-	 * last bss page.
-	 */
-	if (padzero(elf_bss)) {
-		error = -EFAULT;
-		goto out_close;
-	}
+	if (last_bss > elf_bss) {
+		/*
+		 * Now fill out the bss section. First pad the last page up
+		 * to the page boundary, and then perform a mmap to make sure
+		 * that there are zero-mapped pages up to and including the
+		 * last bss page.
+		 */
+		if (padzero(elf_bss)) {
+			error = -EFAULT;
+			goto out_close;
+		}
 
-	/* What we have mapped so far */
-	elf_bss = ELF_PAGESTART(elf_bss + ELF_MIN_ALIGN - 1);
+		/* What we have mapped so far */
+		elf_bss = ELF_PAGESTART(elf_bss + ELF_MIN_ALIGN - 1);
 
-	/* Map the last of the bss segment */
-	if (last_bss > elf_bss) {
+		/* Map the last of the bss segment */
 		down_write(&current->mm->mmap_sem);
 		error = do_brk(elf_bss, last_bss - elf_bss);
 		up_write(&current->mm->mmap_sem);
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 697f6b5f1313..e92f229e3c6e 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -828,15 +828,22 @@ static int load_flat_shared_library(int id, struct lib_info *libs)
 	if (IS_ERR(bprm.file))
 		return res;
 
+	bprm.cred = prepare_exec_creds();
+	res = -ENOMEM;
+	if (!bprm.cred)
+		goto out;
+
 	res = prepare_binprm(&bprm);
 
 	if (res <= (unsigned long)-4096)
 		res = load_flat_file(&bprm, libs, id, NULL);
-	if (bprm.file) {
-		allow_write_access(bprm.file);
-		fput(bprm.file);
-		bprm.file = NULL;
-	}
+
+	abort_creds(bprm.cred);
+
+out:
+	allow_write_access(bprm.file);
+	fput(bprm.file);
+
 	return(res);
 }
 
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 3a6d4fb2a329..71e7e03ac343 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -420,7 +420,6 @@ static void bdev_destroy_inode(struct inode *inode)
 {
 	struct bdev_inode *bdi = BDEV_I(inode);
 
-	bdi->bdev.bd_inode_backing_dev_info = NULL;
 	kmem_cache_free(bdev_cachep, bdi);
 }
 
@@ -564,6 +563,16 @@ struct block_device *bdget(dev_t dev)
 
 EXPORT_SYMBOL(bdget);
 
+/**
+ * bdgrab -- Grab a reference to an already referenced block device
+ * @bdev: Block device to grab a reference to.
+ */
+struct block_device *bdgrab(struct block_device *bdev)
+{
+	atomic_inc(&bdev->bd_inode->i_count);
+	return bdev;
+}
+
 long nr_blockdev_pages(void)
 {
 	struct block_device *bdev;
@@ -1395,6 +1404,33 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 }
 
 /*
+ * Write data to the block device. Only intended for the block device itself
+ * and the raw driver which basically is a fake block device.
+ *
+ * Does not take i_mutex for the write and thus is not for general purpose
+ * use.
+ */
+ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov,
+			 unsigned long nr_segs, loff_t pos)
+{
+	struct file *file = iocb->ki_filp;
+	ssize_t ret;
+
+	BUG_ON(iocb->ki_pos != pos);
+
+	ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos);
+	if (ret > 0 || ret == -EIOCBQUEUED) {
+		ssize_t err;
+
+		err = generic_write_sync(file, pos, ret);
+		if (err < 0 && ret > 0)
+			ret = err;
+	}
+	return ret;
+}
+EXPORT_SYMBOL_GPL(blkdev_aio_write);
+
+/*
  * Try to release a page associated with block device when the system
  * is under memory pressure.
  */
@@ -1426,7 +1462,7 @@ const struct file_operations def_blk_fops = {
 	.read = do_sync_read,
 	.write = do_sync_write,
 	.aio_read = generic_file_aio_read,
-	.aio_write = generic_file_aio_write_nolock,
+	.aio_write = blkdev_aio_write,
 	.mmap = generic_file_mmap,
 	.fsync = block_fsync,
 	.unlocked_ioctl = block_ioctl,
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 6e4f6c50a120..019e8af449ab 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -424,11 +424,11 @@ int btrfs_requeue_work(struct btrfs_work *work)
 	 * list
 	 */
 	if (worker->idle) {
-		spin_lock_irqsave(&worker->workers->lock, flags);
+		spin_lock(&worker->workers->lock);
 		worker->idle = 0;
 		list_move_tail(&worker->worker_list,
 			       &worker->workers->worker_list);
-		spin_unlock_irqrestore(&worker->workers->lock, flags);
+		spin_unlock(&worker->workers->lock);
 	}
 	if (!worker->working) {
 		wake = 1;
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 60a45f3a4e91..3fdcc0512d3a 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -557,19 +557,7 @@ static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2)
 
 	btrfs_disk_key_to_cpu(&k1, disk);
 
-	if (k1.objectid > k2->objectid)
-		return 1;
-	if (k1.objectid < k2->objectid)
-		return -1;
-	if (k1.type > k2->type)
-		return 1;
-	if (k1.type < k2->type)
-		return -1;
-	if (k1.offset > k2->offset)
-		return 1;
-	if (k1.offset < k2->offset)
-		return -1;
-	return 0;
+	return btrfs_comp_cpu_keys(&k1, k2);
 }
 
 /*
@@ -1052,9 +1040,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 	    BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
 		return 0;
 
-	if (btrfs_header_nritems(mid) > 2)
-		return 0;
-
 	if (btrfs_header_nritems(mid) < 2)
 		err_on_enospc = 1;
 
@@ -1701,6 +1686,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
 	struct extent_buffer *b;
 	int slot;
 	int ret;
+	int err;
 	int level;
 	int lowest_unlock = 1;
 	u8 lowest_level = 0;
@@ -1737,8 +1723,6 @@ again:
 		p->locks[level] = 1;
 
 		if (cow) {
-			int wret;
-
 			/*
 			 * if we don't really need to cow this block
 			 * then we don't want to set the path blocking,
@@ -1749,12 +1733,12 @@ again:
 
 			btrfs_set_path_blocking(p);
 
-			wret = btrfs_cow_block(trans, root, b,
+			err = btrfs_cow_block(trans, root, b,
 					      p->nodes[level + 1],
 					      p->slots[level + 1], &b);
-			if (wret) {
+			if (err) {
 				free_extent_buffer(b);
-				ret = wret;
+				ret = err;
 				goto done;
 			}
 		}
@@ -1793,41 +1777,45 @@ cow_done:
 		ret = bin_search(b, key, level, &slot);
 
 		if (level != 0) {
-			if (ret && slot > 0)
+			int dec = 0;
+			if (ret && slot > 0) {
+				dec = 1;
 				slot -= 1;
+			}
 			p->slots[level] = slot;
-			ret = setup_nodes_for_search(trans, root, p, b, level,
+			err = setup_nodes_for_search(trans, root, p, b, level,
 						     ins_len);
-			if (ret == -EAGAIN)
+			if (err == -EAGAIN)
 				goto again;
-			else if (ret)
+			if (err) {
+				ret = err;
 				goto done;
+			}
 			b = p->nodes[level];
 			slot = p->slots[level];
 
 			unlock_up(p, level, lowest_unlock);
 
-			/* this is only true while dropping a snapshot */
 			if (level == lowest_level) {
-				ret = 0;
+				if (dec)
+					p->slots[level]++;
 				goto done;
 			}
 
-			ret = read_block_for_search(trans, root, p,
+			err = read_block_for_search(trans, root, p,
 						&b, level, slot, key);
-			if (ret == -EAGAIN)
+			if (err == -EAGAIN)
 				goto again;
-
-			if (ret == -EIO)
+			if (err) {
+				ret = err;
 				goto done;
+			}
 
 			if (!p->skip_locking) {
-				int lret;
-
 				btrfs_clear_path_blocking(p, NULL);
-				lret = btrfs_try_spin_lock(b);
+				err = btrfs_try_spin_lock(b);
 
-				if (!lret) {
+				if (!err) {
 					btrfs_set_path_blocking(p);
 					btrfs_tree_lock(b);
 					btrfs_clear_path_blocking(p, b);
@@ -1837,16 +1825,14 @@ cow_done:
 			p->slots[level] = slot;
 			if (ins_len > 0 &&
 			    btrfs_leaf_free_space(root, b) < ins_len) {
-				int sret;
-
 				btrfs_set_path_blocking(p);
-				sret = split_leaf(trans, root, key,
+				err = split_leaf(trans, root, key,
 				      p, ins_len, ret == 0);
 				btrfs_clear_path_blocking(p, NULL);
 
-				BUG_ON(sret > 0);
-				if (sret) {
-					ret = sret;
+				BUG_ON(err > 0);
+				if (err) {
+					ret = err;
 					goto done;
 				}
 			}
@@ -3807,7 +3793,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	}
 
 	/* delete the leaf if it is mostly empty */
-	if (used < BTRFS_LEAF_DATA_SIZE(root) / 2) {
+	if (used < BTRFS_LEAF_DATA_SIZE(root) / 3) {
 		/* push_leaf_left fixes the path.
 		 * make sure the path still points to our leaf
 		 * for possible call to del_ptr below
@@ -4042,10 +4028,9 @@ out:
 * calling this function.
 */
 int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
-			struct btrfs_key *key, int lowest_level,
+			struct btrfs_key *key, int level,
 			int cache_only, u64 min_trans)
 {
-	int level = lowest_level;
 	int slot;
 	struct extent_buffer *c;
 
@@ -4058,11 +4043,40 @@ int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
 		c = path->nodes[level];
 next:
 		if (slot >= btrfs_header_nritems(c)) {
-			level++;
-			if (level == BTRFS_MAX_LEVEL)
+			int ret;
+			int orig_lowest;
+			struct btrfs_key cur_key;
+			if (level + 1 >= BTRFS_MAX_LEVEL ||
+			    !path->nodes[level + 1])
 				return 1;
-			continue;
+
+			if (path->locks[level + 1]) {
+				level++;
+				continue;
+			}
+
+			slot = btrfs_header_nritems(c) - 1;
+			if (level == 0)
+				btrfs_item_key_to_cpu(c, &cur_key, slot);
+			else
+				btrfs_node_key_to_cpu(c, &cur_key, slot);
+
+			orig_lowest = path->lowest_level;
+			btrfs_release_path(root, path);
+			path->lowest_level = level;
+			ret = btrfs_search_slot(NULL, root, &cur_key, path,
+						0, 0);
+			path->lowest_level = orig_lowest;
+			if (ret < 0)
+				return ret;
+
+			c = path->nodes[level];
+			slot = path->slots[level];
+			if (ret == 0)
+				slot++;
+			goto next;
 		}
+
 		if (level == 0)
 			btrfs_item_key_to_cpu(c, key, slot);
 		else {
@@ -4146,7 +4160,8 @@ again:
 	 * advance the path if there are now more items available.
 	 */
 	if (nritems > 0 && path->slots[0] < nritems - 1) {
-		path->slots[0]++;
+		if (ret == 0)
+			path->slots[0]++;
 		ret = 0;
 		goto done;
 	}
@@ -4278,10 +4293,10 @@ int btrfs_previous_item(struct btrfs_root *root,
 		path->slots[0]--;
 
 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-		if (found_key.type == type)
-			return 0;
 		if (found_key.objectid < min_objectid)
 			break;
+		if (found_key.type == type)
+			return 0;
 		if (found_key.objectid == min_objectid &&
 		    found_key.type < type)
 			break;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 98a873838717..837435ce84ca 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -481,7 +481,7 @@ struct btrfs_shared_data_ref {
 
 struct btrfs_extent_inline_ref {
 	u8 type;
-	u64 offset;
+	__le64 offset;
 } __attribute__ ((__packed__));
 
 /* old style backrefs item */
@@ -689,6 +689,7 @@ struct btrfs_space_info {
 	struct list_head block_groups;
 	spinlock_t lock;
 	struct rw_semaphore groups_sem;
+	atomic_t caching_threads;
 };
 
 /*
@@ -707,6 +708,9 @@ struct btrfs_free_cluster {
 	/* first extent starting offset */
 	u64 window_start;
 
+	/* if this cluster simply points at a bitmap in the block group */
+	bool points_to_bitmap;
+
 	struct btrfs_block_group_cache *block_group;
 	/*
 	 * when a cluster is allocated from a block group, we put the
@@ -716,24 +720,37 @@ struct btrfs_free_cluster {
 	struct list_head block_group_list;
 };
 
+enum btrfs_caching_type {
+	BTRFS_CACHE_NO = 0,
+	BTRFS_CACHE_STARTED = 1,
+	BTRFS_CACHE_FINISHED = 2,
+};
+
 struct btrfs_block_group_cache {
 	struct btrfs_key key;
 	struct btrfs_block_group_item item;
+	struct btrfs_fs_info *fs_info;
 	spinlock_t lock;
-	struct mutex cache_mutex;
 	u64 pinned;
 	u64 reserved;
 	u64 flags;
-	int cached;
+	u64 sectorsize;
+	int extents_thresh;
+	int free_extents;
+	int total_bitmaps;
 	int ro;
 	int dirty;
 
+	/* cache tracking stuff */
+	wait_queue_head_t caching_q;
+	int cached;
+
 	struct btrfs_space_info *space_info;
 
 	/* free space cache stuff */
 	spinlock_t tree_lock;
-	struct rb_root free_space_bytes;
 	struct rb_root free_space_offset;
+	u64 free_space;
 
 	/* block group cache stuff */
 	struct rb_node cache_node;
@@ -808,6 +825,7 @@ struct btrfs_fs_info {
 	struct mutex drop_mutex;
 	struct mutex volume_mutex;
 	struct mutex tree_reloc_mutex;
+	struct rw_semaphore extent_commit_sem;
 
 	/*
 	 * this protects the ordered operations list only while we are
@@ -1988,6 +2006,7 @@ void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
 				  u64 bytes);
 void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
 			       u64 bytes);
+void btrfs_free_pinned_extents(struct btrfs_fs_info *info);
 /* ctree.c */
 int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
 		     int level, int *slot);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index d28d29c95f7c..8b8192790011 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1352,6 +1352,7 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
 {
 	int err;
 
+	bdi->name = "btrfs";
 	bdi->capabilities = BDI_CAP_MAP_COPY;
 	err = bdi_init(bdi);
 	if (err)
@@ -1599,6 +1600,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 
 	sb->s_blocksize = 4096;
 	sb->s_blocksize_bits = blksize_bits(4096);
+	sb->s_bdi = &fs_info->bdi;
 
 	/*
 	 * we set the i_size on the btree inode to the max possible int.
@@ -1639,6 +1641,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	mutex_init(&fs_info->cleaner_mutex);
 	mutex_init(&fs_info->volume_mutex);
 	mutex_init(&fs_info->tree_reloc_mutex);
+	init_rwsem(&fs_info->extent_commit_sem);
 
 	btrfs_init_free_cluster(&fs_info->meta_alloc_cluster);
 	btrfs_init_free_cluster(&fs_info->data_alloc_cluster);
@@ -1799,6 +1802,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 					   btrfs_super_chunk_root(disk_super),
 					   blocksize, generation);
 	BUG_ON(!chunk_root->node);
+	if (!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) {
+		printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n",
+		       sb->s_id);
+		goto fail_chunk_root;
+	}
 	btrfs_set_root_node(&chunk_root->root_item, chunk_root->node);
 	chunk_root->commit_root = btrfs_root_node(chunk_root);
 
@@ -1826,6 +1834,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 					  blocksize, generation);
 	if (!tree_root->node)
 		goto fail_chunk_root;
+	if (!test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) {
+		printk(KERN_WARNING "btrfs: failed to read tree root on %s\n",
+		       sb->s_id);
+		goto fail_tree_root;
+	}
 	btrfs_set_root_node(&tree_root->root_item, tree_root->node);
 	tree_root->commit_root = btrfs_root_node(tree_root);
 
@@ -2322,6 +2335,9 @@ int close_ctree(struct btrfs_root *root)
 		printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
 	}
 
+	fs_info->closing = 2;
+	smp_mb();
+
 	if (fs_info->delalloc_bytes) {
 		printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n",
 		       (unsigned long long)fs_info->delalloc_bytes);
@@ -2343,6 +2359,7 @@ int close_ctree(struct btrfs_root *root)
 	free_extent_buffer(root->fs_info->csum_root->commit_root);
 
 	btrfs_free_block_groups(root->fs_info);
+	btrfs_free_pinned_extents(root->fs_info);
 
 	del_fs_roots(fs_info);
 
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index a5aca3997d42..535f85ba104f 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c | |||
@@ -21,6 +21,7 @@ | |||
21 | #include <linux/blkdev.h> | 21 | #include <linux/blkdev.h> |
22 | #include <linux/sort.h> | 22 | #include <linux/sort.h> |
23 | #include <linux/rcupdate.h> | 23 | #include <linux/rcupdate.h> |
24 | #include <linux/kthread.h> | ||
24 | #include "compat.h" | 25 | #include "compat.h" |
25 | #include "hash.h" | 26 | #include "hash.h" |
26 | #include "ctree.h" | 27 | #include "ctree.h" |
@@ -61,6 +62,13 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, | |||
61 | struct btrfs_root *extent_root, u64 alloc_bytes, | 62 | struct btrfs_root *extent_root, u64 alloc_bytes, |
62 | u64 flags, int force); | 63 | u64 flags, int force); |
63 | 64 | ||
65 | static noinline int | ||
66 | block_group_cache_done(struct btrfs_block_group_cache *cache) | ||
67 | { | ||
68 | smp_mb(); | ||
69 | return cache->cached == BTRFS_CACHE_FINISHED; | ||
70 | } | ||
71 | |||
64 | static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits) | 72 | static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits) |
65 | { | 73 | { |
66 | return (cache->flags & bits) == bits; | 74 | return (cache->flags & bits) == bits; |
@@ -146,20 +154,70 @@ block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr, | |||
146 | } | 154 | } |
147 | 155 | ||
148 | /* | 156 | /* |
157 | * We always set EXTENT_LOCKED for the super mirror extents so we don't | ||
158 | * overwrite them, so those bits need to be unset. Also, if we are unmounting | ||
159 | * with pinned extents still sitting there because we had a block group caching, | ||
160 | * we need to clear those now, since we are done. | ||
161 | */ | ||
162 | void btrfs_free_pinned_extents(struct btrfs_fs_info *info) | ||
163 | { | ||
164 | u64 start, end, last = 0; | ||
165 | int ret; | ||
166 | |||
167 | while (1) { | ||
168 | ret = find_first_extent_bit(&info->pinned_extents, last, | ||
169 | &start, &end, | ||
170 | EXTENT_LOCKED|EXTENT_DIRTY); | ||
171 | if (ret) | ||
172 | break; | ||
173 | |||
174 | clear_extent_bits(&info->pinned_extents, start, end, | ||
175 | EXTENT_LOCKED|EXTENT_DIRTY, GFP_NOFS); | ||
176 | last = end+1; | ||
177 | } | ||
178 | } | ||
179 | |||
180 | static int remove_sb_from_cache(struct btrfs_root *root, | ||
181 | struct btrfs_block_group_cache *cache) | ||
182 | { | ||
183 | struct btrfs_fs_info *fs_info = root->fs_info; | ||
184 | u64 bytenr; | ||
185 | u64 *logical; | ||
186 | int stripe_len; | ||
187 | int i, nr, ret; | ||
188 | |||
189 | for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { | ||
190 | bytenr = btrfs_sb_offset(i); | ||
191 | ret = btrfs_rmap_block(&root->fs_info->mapping_tree, | ||
192 | cache->key.objectid, bytenr, | ||
193 | 0, &logical, &nr, &stripe_len); | ||
194 | BUG_ON(ret); | ||
195 | while (nr--) { | ||
196 | try_lock_extent(&fs_info->pinned_extents, | ||
197 | logical[nr], | ||
198 | logical[nr] + stripe_len - 1, GFP_NOFS); | ||
199 | } | ||
200 | kfree(logical); | ||
201 | } | ||
202 | |||
203 | return 0; | ||
204 | } | ||
205 | |||
206 | /* | ||
149 | * this is only called by cache_block_group, since we could have freed extents | 207 | * this is only called by cache_block_group, since we could have freed extents |
150 | * we need to check the pinned_extents for any extents that can't be used yet | 208 | * we need to check the pinned_extents for any extents that can't be used yet |
151 | * since their free space will be released as soon as the transaction commits. | 209 | * since their free space will be released as soon as the transaction commits. |
152 | */ | 210 | */ |
153 | static int add_new_free_space(struct btrfs_block_group_cache *block_group, | 211 | static u64 add_new_free_space(struct btrfs_block_group_cache *block_group, |
154 | struct btrfs_fs_info *info, u64 start, u64 end) | 212 | struct btrfs_fs_info *info, u64 start, u64 end) |
155 | { | 213 | { |
156 | u64 extent_start, extent_end, size; | 214 | u64 extent_start, extent_end, size, total_added = 0; |
157 | int ret; | 215 | int ret; |
158 | 216 | ||
159 | while (start < end) { | 217 | while (start < end) { |
160 | ret = find_first_extent_bit(&info->pinned_extents, start, | 218 | ret = find_first_extent_bit(&info->pinned_extents, start, |
161 | &extent_start, &extent_end, | 219 | &extent_start, &extent_end, |
162 | EXTENT_DIRTY); | 220 | EXTENT_DIRTY|EXTENT_LOCKED); |
163 | if (ret) | 221 | if (ret) |
164 | break; | 222 | break; |
165 | 223 | ||
@@ -167,6 +225,7 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group, | |||
167 | start = extent_end + 1; | 225 | start = extent_end + 1; |
168 | } else if (extent_start > start && extent_start < end) { | 226 | } else if (extent_start > start && extent_start < end) { |
169 | size = extent_start - start; | 227 | size = extent_start - start; |
228 | total_added += size; | ||
170 | ret = btrfs_add_free_space(block_group, start, | 229 | ret = btrfs_add_free_space(block_group, start, |
171 | size); | 230 | size); |
172 | BUG_ON(ret); | 231 | BUG_ON(ret); |
@@ -178,84 +237,93 @@ static int add_new_free_space(struct btrfs_block_group_cache *block_group, | |||
178 | 237 | ||
179 | if (start < end) { | 238 | if (start < end) { |
180 | size = end - start; | 239 | size = end - start; |
240 | total_added += size; | ||
181 | ret = btrfs_add_free_space(block_group, start, size); | 241 | ret = btrfs_add_free_space(block_group, start, size); |
182 | BUG_ON(ret); | 242 | BUG_ON(ret); |
183 | } | 243 | } |
184 | 244 | ||
185 | return 0; | 245 | return total_added; |
186 | } | ||
187 | |||
188 | static int remove_sb_from_cache(struct btrfs_root *root, | ||
189 | struct btrfs_block_group_cache *cache) | ||
190 | { | ||
191 | u64 bytenr; | ||
192 | u64 *logical; | ||
193 | int stripe_len; | ||
194 | int i, nr, ret; | ||
195 | |||
196 | for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { | ||
197 | bytenr = btrfs_sb_offset(i); | ||
198 | ret = btrfs_rmap_block(&root->fs_info->mapping_tree, | ||
199 | cache->key.objectid, bytenr, 0, | ||
200 | &logical, &nr, &stripe_len); | ||
201 | BUG_ON(ret); | ||
202 | while (nr--) { | ||
203 | btrfs_remove_free_space(cache, logical[nr], | ||
204 | stripe_len); | ||
205 | } | ||
206 | kfree(logical); | ||
207 | } | ||
208 | return 0; | ||
209 | } | 246 | } |
210 | 247 | ||
211 | static int cache_block_group(struct btrfs_root *root, | 248 | static int caching_kthread(void *data) |
212 | struct btrfs_block_group_cache *block_group) | ||
213 | { | 249 | { |
250 | struct btrfs_block_group_cache *block_group = data; | ||
251 | struct btrfs_fs_info *fs_info = block_group->fs_info; | ||
252 | u64 last = 0; | ||
214 | struct btrfs_path *path; | 253 | struct btrfs_path *path; |
215 | int ret = 0; | 254 | int ret = 0; |
216 | struct btrfs_key key; | 255 | struct btrfs_key key; |
217 | struct extent_buffer *leaf; | 256 | struct extent_buffer *leaf; |
218 | int slot; | 257 | int slot; |
219 | u64 last; | 258 | u64 total_found = 0; |
220 | |||
221 | if (!block_group) | ||
222 | return 0; | ||
223 | |||
224 | root = root->fs_info->extent_root; | ||
225 | 259 | ||
226 | if (block_group->cached) | 260 | BUG_ON(!fs_info); |
227 | return 0; | ||
228 | 261 | ||
229 | path = btrfs_alloc_path(); | 262 | path = btrfs_alloc_path(); |
230 | if (!path) | 263 | if (!path) |
231 | return -ENOMEM; | 264 | return -ENOMEM; |
232 | 265 | ||
233 | path->reada = 2; | 266 | atomic_inc(&block_group->space_info->caching_threads); |
267 | last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); | ||
234 | /* | 268 | /* |
235 | * we get into deadlocks with paths held by callers of this function. | 269 | * We don't want to deadlock with somebody trying to allocate a new |
236 | * since the alloc_mutex is protecting things right now, just | 270 | * extent for the extent root while also trying to search the extent |
237 | * skip the locking here | 271 | * root to add free space. So we skip locking and search the commit |
272 | * root, since its read-only | ||
238 | */ | 273 | */ |
239 | path->skip_locking = 1; | 274 | path->skip_locking = 1; |
240 | last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); | 275 | path->search_commit_root = 1; |
276 | path->reada = 2; | ||
277 | |||
241 | key.objectid = last; | 278 | key.objectid = last; |
242 | key.offset = 0; | 279 | key.offset = 0; |
243 | btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); | 280 | btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); |
244 | ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); | 281 | again: |
282 | /* need to make sure the commit_root doesn't disappear */ | ||
283 | down_read(&fs_info->extent_commit_sem); | ||
284 | |||
285 | ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0); | ||
245 | if (ret < 0) | 286 | if (ret < 0) |
246 | goto err; | 287 | goto err; |
247 | 288 | ||
248 | while (1) { | 289 | while (1) { |
290 | smp_mb(); | ||
291 | if (block_group->fs_info->closing > 1) { | ||
292 | last = (u64)-1; | ||
293 | break; | ||
294 | } | ||
295 | |||
249 | leaf = path->nodes[0]; | 296 | leaf = path->nodes[0]; |
250 | slot = path->slots[0]; | 297 | slot = path->slots[0]; |
251 | if (slot >= btrfs_header_nritems(leaf)) { | 298 | if (slot >= btrfs_header_nritems(leaf)) { |
252 | ret = btrfs_next_leaf(root, path); | 299 | ret = btrfs_next_leaf(fs_info->extent_root, path); |
253 | if (ret < 0) | 300 | if (ret < 0) |
254 | goto err; | 301 | goto err; |
255 | if (ret == 0) | 302 | else if (ret) |
256 | continue; | ||
257 | else | ||
258 | break; | 303 | break; |
304 | |||
305 | if (need_resched() || | ||
306 | btrfs_transaction_in_commit(fs_info)) { | ||
307 | leaf = path->nodes[0]; | ||
308 | |||
309 | /* this shouldn't happen, but if the | ||
310 | * leaf is empty just move on. | ||
311 | */ | ||
312 | if (btrfs_header_nritems(leaf) == 0) | ||
313 | break; | ||
314 | /* | ||
315 | * we need to copy the key out so that | ||
316 | * we are sure the next search advances | ||
317 | * us forward in the btree. | ||
318 | */ | ||
319 | btrfs_item_key_to_cpu(leaf, &key, 0); | ||
320 | btrfs_release_path(fs_info->extent_root, path); | ||
321 | up_read(&fs_info->extent_commit_sem); | ||
322 | schedule_timeout(1); | ||
323 | goto again; | ||
324 | } | ||
325 | |||
326 | continue; | ||
259 | } | 327 | } |
260 | btrfs_item_key_to_cpu(leaf, &key, slot); | 328 | btrfs_item_key_to_cpu(leaf, &key, slot); |
261 | if (key.objectid < block_group->key.objectid) | 329 | if (key.objectid < block_group->key.objectid) |
@@ -266,24 +334,59 @@ static int cache_block_group(struct btrfs_root *root, | |||
266 | break; | 334 | break; |
267 | 335 | ||
268 | if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) { | 336 | if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) { |
269 | add_new_free_space(block_group, root->fs_info, last, | 337 | total_found += add_new_free_space(block_group, |
270 | key.objectid); | 338 | fs_info, last, |
271 | 339 | key.objectid); | |
272 | last = key.objectid + key.offset; | 340 | last = key.objectid + key.offset; |
273 | } | 341 | } |
342 | |||
343 | if (total_found > (1024 * 1024 * 2)) { | ||
344 | total_found = 0; | ||
345 | wake_up(&block_group->caching_q); | ||
346 | } | ||
274 | next: | 347 | next: |
275 | path->slots[0]++; | 348 | path->slots[0]++; |
276 | } | 349 | } |
350 | ret = 0; | ||
277 | 351 | ||
278 | add_new_free_space(block_group, root->fs_info, last, | 352 | total_found += add_new_free_space(block_group, fs_info, last, |
279 | block_group->key.objectid + | 353 | block_group->key.objectid + |
280 | block_group->key.offset); | 354 | block_group->key.offset); |
355 | |||
356 | spin_lock(&block_group->lock); | ||
357 | block_group->cached = BTRFS_CACHE_FINISHED; | ||
358 | spin_unlock(&block_group->lock); | ||
281 | 359 | ||
282 | block_group->cached = 1; | ||
283 | remove_sb_from_cache(root, block_group); | ||
284 | ret = 0; | ||
285 | err: | 360 | err: |
286 | btrfs_free_path(path); | 361 | btrfs_free_path(path); |
362 | up_read(&fs_info->extent_commit_sem); | ||
363 | atomic_dec(&block_group->space_info->caching_threads); | ||
364 | wake_up(&block_group->caching_q); | ||
365 | |||
366 | return 0; | ||
367 | } | ||
368 | |||
369 | static int cache_block_group(struct btrfs_block_group_cache *cache) | ||
370 | { | ||
371 | struct task_struct *tsk; | ||
372 | int ret = 0; | ||
373 | |||
374 | spin_lock(&cache->lock); | ||
375 | if (cache->cached != BTRFS_CACHE_NO) { | ||
376 | spin_unlock(&cache->lock); | ||
377 | return ret; | ||
378 | } | ||
379 | cache->cached = BTRFS_CACHE_STARTED; | ||
380 | spin_unlock(&cache->lock); | ||
381 | |||
382 | tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu", | ||
383 | cache->key.objectid); | ||
384 | if (IS_ERR(tsk)) { | ||
385 | ret = PTR_ERR(tsk); | ||
386 | printk(KERN_ERR "error running thread %d\n", ret); | ||
387 | BUG(); | ||
388 | } | ||
389 | |||
287 | return ret; | 390 | return ret; |
288 | } | 391 | } |
289 | 392 | ||
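The hunk above is the heart of the change: cache_block_group() no longer scans inline, it just flips the state to BTRFS_CACHE_STARTED and hands the scan to caching_kthread(), which publishes free space as it goes and wakes anyone sleeping on caching_q. A minimal sketch of that kthread-plus-waitqueue pattern, with hypothetical names (cache_job, cache_fn) standing in for the btrfs structures:

#include <linux/kthread.h>
#include <linux/wait.h>
#include <linux/err.h>

struct cache_job {
        wait_queue_head_t waitq;        /* allocators sleep here          */
        int done;                       /* set once the scan has finished */
};

static int cache_fn(void *data)
{
        struct cache_job *job = data;

        /* ... scan metadata, publishing free space as it is found ... */

        job->done = 1;
        wake_up(&job->waitq);           /* release anyone stuck waiting */
        return 0;
}

static int start_cache_job(struct cache_job *job)
{
        struct task_struct *tsk;

        job->done = 0;
        init_waitqueue_head(&job->waitq);
        tsk = kthread_run(cache_fn, job, "cache-job");
        return IS_ERR(tsk) ? PTR_ERR(tsk) : 0;
}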
@@ -1408,7 +1511,8 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans, | |||
1408 | static void btrfs_issue_discard(struct block_device *bdev, | 1511 | static void btrfs_issue_discard(struct block_device *bdev, |
1409 | u64 start, u64 len) | 1512 | u64 start, u64 len) |
1410 | { | 1513 | { |
1411 | blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL); | 1514 | blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL, |
1515 | DISCARD_FL_BARRIER); | ||
1412 | } | 1516 | } |
1413 | #endif | 1517 | #endif |
1414 | 1518 | ||
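The shift by 9 in btrfs_issue_discard() converts byte offsets and lengths into the 512-byte sectors that blkdev_issue_discard() works in; the only functional change in this hunk is the new flags argument. The unit conversion in isolation:

/* 1 << 9 == 512, the block layer's sector size */
static inline sector_t bytes_to_sectors(u64 bytes)
{
        return (sector_t)(bytes >> 9);
}
/* e.g. bytes_to_sectors(1024 * 1024) == 2048 sectors */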
@@ -2387,13 +2491,29 @@ fail: | |||
2387 | 2491 | ||
2388 | } | 2492 | } |
2389 | 2493 | ||
2494 | static struct btrfs_block_group_cache * | ||
2495 | next_block_group(struct btrfs_root *root, | ||
2496 | struct btrfs_block_group_cache *cache) | ||
2497 | { | ||
2498 | struct rb_node *node; | ||
2499 | spin_lock(&root->fs_info->block_group_cache_lock); | ||
2500 | node = rb_next(&cache->cache_node); | ||
2501 | btrfs_put_block_group(cache); | ||
2502 | if (node) { | ||
2503 | cache = rb_entry(node, struct btrfs_block_group_cache, | ||
2504 | cache_node); | ||
2505 | atomic_inc(&cache->count); | ||
2506 | } else | ||
2507 | cache = NULL; | ||
2508 | spin_unlock(&root->fs_info->block_group_cache_lock); | ||
2509 | return cache; | ||
2510 | } | ||
2511 | |||
2390 | int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, | 2512 | int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, |
2391 | struct btrfs_root *root) | 2513 | struct btrfs_root *root) |
2392 | { | 2514 | { |
2393 | struct btrfs_block_group_cache *cache, *entry; | 2515 | struct btrfs_block_group_cache *cache; |
2394 | struct rb_node *n; | ||
2395 | int err = 0; | 2516 | int err = 0; |
2396 | int werr = 0; | ||
2397 | struct btrfs_path *path; | 2517 | struct btrfs_path *path; |
2398 | u64 last = 0; | 2518 | u64 last = 0; |
2399 | 2519 | ||
@@ -2402,39 +2522,35 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, | |||
2402 | return -ENOMEM; | 2522 | return -ENOMEM; |
2403 | 2523 | ||
2404 | while (1) { | 2524 | while (1) { |
2405 | cache = NULL; | 2525 | if (last == 0) { |
2406 | spin_lock(&root->fs_info->block_group_cache_lock); | 2526 | err = btrfs_run_delayed_refs(trans, root, |
2407 | for (n = rb_first(&root->fs_info->block_group_cache_tree); | 2527 | (unsigned long)-1); |
2408 | n; n = rb_next(n)) { | 2528 | BUG_ON(err); |
2409 | entry = rb_entry(n, struct btrfs_block_group_cache, | ||
2410 | cache_node); | ||
2411 | if (entry->dirty) { | ||
2412 | cache = entry; | ||
2413 | break; | ||
2414 | } | ||
2415 | } | 2529 | } |
2416 | spin_unlock(&root->fs_info->block_group_cache_lock); | ||
2417 | 2530 | ||
2418 | if (!cache) | 2531 | cache = btrfs_lookup_first_block_group(root->fs_info, last); |
2419 | break; | 2532 | while (cache) { |
2533 | if (cache->dirty) | ||
2534 | break; | ||
2535 | cache = next_block_group(root, cache); | ||
2536 | } | ||
2537 | if (!cache) { | ||
2538 | if (last == 0) | ||
2539 | break; | ||
2540 | last = 0; | ||
2541 | continue; | ||
2542 | } | ||
2420 | 2543 | ||
2421 | cache->dirty = 0; | 2544 | cache->dirty = 0; |
2422 | last += cache->key.offset; | 2545 | last = cache->key.objectid + cache->key.offset; |
2423 | 2546 | ||
2424 | err = write_one_cache_group(trans, root, | 2547 | err = write_one_cache_group(trans, root, path, cache); |
2425 | path, cache); | 2548 | BUG_ON(err); |
2426 | /* | 2549 | btrfs_put_block_group(cache); |
2427 | * if we fail to write the cache group, we want | ||
2428 | * to keep it marked dirty in hopes that a later | ||
2429 | * write will work | ||
2430 | */ | ||
2431 | if (err) { | ||
2432 | werr = err; | ||
2433 | continue; | ||
2434 | } | ||
2435 | } | 2550 | } |
2551 | |||
2436 | btrfs_free_path(path); | 2552 | btrfs_free_path(path); |
2437 | return werr; | 2553 | return 0; |
2438 | } | 2554 | } |
2439 | 2555 | ||
2440 | int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr) | 2556 | int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr) |
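next_block_group() keeps the reference counts balanced while walking the rbtree: under block_group_cache_lock it drops the reference on the group it was handed and pins the successor. The iteration discipline the rewritten loop above relies on, reduced to a fragment:

cache = btrfs_lookup_first_block_group(root->fs_info, 0);
while (cache) {
        /* cache holds a reference here, so it cannot be freed */
        cache = next_block_group(root, cache);  /* puts old, pins next */
}
/* the walk ends with cache == NULL, so there is nothing left to put */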
@@ -2484,6 +2600,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, | |||
2484 | found->force_alloc = 0; | 2600 | found->force_alloc = 0; |
2485 | *space_info = found; | 2601 | *space_info = found; |
2486 | list_add_rcu(&found->list, &info->space_info); | 2602 | list_add_rcu(&found->list, &info->space_info); |
2603 | atomic_set(&found->caching_threads, 0); | ||
2487 | return 0; | 2604 | return 0; |
2488 | } | 2605 | } |
2489 | 2606 | ||
@@ -2947,13 +3064,9 @@ int btrfs_update_pinned_extents(struct btrfs_root *root, | |||
2947 | struct btrfs_block_group_cache *cache; | 3064 | struct btrfs_block_group_cache *cache; |
2948 | struct btrfs_fs_info *fs_info = root->fs_info; | 3065 | struct btrfs_fs_info *fs_info = root->fs_info; |
2949 | 3066 | ||
2950 | if (pin) { | 3067 | if (pin) |
2951 | set_extent_dirty(&fs_info->pinned_extents, | 3068 | set_extent_dirty(&fs_info->pinned_extents, |
2952 | bytenr, bytenr + num - 1, GFP_NOFS); | 3069 | bytenr, bytenr + num - 1, GFP_NOFS); |
2953 | } else { | ||
2954 | clear_extent_dirty(&fs_info->pinned_extents, | ||
2955 | bytenr, bytenr + num - 1, GFP_NOFS); | ||
2956 | } | ||
2957 | 3070 | ||
2958 | while (num > 0) { | 3071 | while (num > 0) { |
2959 | cache = btrfs_lookup_block_group(fs_info, bytenr); | 3072 | cache = btrfs_lookup_block_group(fs_info, bytenr); |
@@ -2969,14 +3082,34 @@ int btrfs_update_pinned_extents(struct btrfs_root *root, | |||
2969 | spin_unlock(&cache->space_info->lock); | 3082 | spin_unlock(&cache->space_info->lock); |
2970 | fs_info->total_pinned += len; | 3083 | fs_info->total_pinned += len; |
2971 | } else { | 3084 | } else { |
3085 | int unpin = 0; | ||
3086 | |||
3087 | /* | ||
3088 | * in order to not race with the block group caching, we | ||
3089 | * only want to unpin the extent if we are cached. If | ||
3090 | * we aren't cached, we want to start async caching this | ||
3091 | * block group so we can free the extent the next time | ||
3092 | * around. | ||
3093 | */ | ||
2972 | spin_lock(&cache->space_info->lock); | 3094 | spin_lock(&cache->space_info->lock); |
2973 | spin_lock(&cache->lock); | 3095 | spin_lock(&cache->lock); |
2974 | cache->pinned -= len; | 3096 | unpin = (cache->cached == BTRFS_CACHE_FINISHED); |
2975 | cache->space_info->bytes_pinned -= len; | 3097 | if (likely(unpin)) { |
3098 | cache->pinned -= len; | ||
3099 | cache->space_info->bytes_pinned -= len; | ||
3100 | fs_info->total_pinned -= len; | ||
3101 | } | ||
2976 | spin_unlock(&cache->lock); | 3102 | spin_unlock(&cache->lock); |
2977 | spin_unlock(&cache->space_info->lock); | 3103 | spin_unlock(&cache->space_info->lock); |
2978 | fs_info->total_pinned -= len; | 3104 | |
2979 | if (cache->cached) | 3105 | if (likely(unpin)) |
3106 | clear_extent_dirty(&fs_info->pinned_extents, | ||
3107 | bytenr, bytenr + len - 1, | ||
3108 | GFP_NOFS); | ||
3109 | else | ||
3110 | cache_block_group(cache); | ||
3111 | |||
3112 | if (unpin) | ||
2980 | btrfs_add_free_space(cache, bytenr, len); | 3113 | btrfs_add_free_space(cache, bytenr, len); |
2981 | } | 3114 | } |
2982 | btrfs_put_block_group(cache); | 3115 | btrfs_put_block_group(cache); |
@@ -3030,6 +3163,7 @@ int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy) | |||
3030 | &start, &end, EXTENT_DIRTY); | 3163 | &start, &end, EXTENT_DIRTY); |
3031 | if (ret) | 3164 | if (ret) |
3032 | break; | 3165 | break; |
3166 | |||
3033 | set_extent_dirty(copy, start, end, GFP_NOFS); | 3167 | set_extent_dirty(copy, start, end, GFP_NOFS); |
3034 | last = end + 1; | 3168 | last = end + 1; |
3035 | } | 3169 | } |
@@ -3058,6 +3192,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, | |||
3058 | 3192 | ||
3059 | cond_resched(); | 3193 | cond_resched(); |
3060 | } | 3194 | } |
3195 | |||
3061 | return ret; | 3196 | return ret; |
3062 | } | 3197 | } |
3063 | 3198 | ||
@@ -3436,6 +3571,45 @@ static u64 stripe_align(struct btrfs_root *root, u64 val) | |||
3436 | } | 3571 | } |
3437 | 3572 | ||
3438 | /* | 3573 | /* |
3574 | * when we wait for progress in the block group caching, it's because | ||
3575 | * our allocation attempt failed at least once. So, we must sleep | ||
3576 | * and let some progress happen before we try again. | ||
3577 | * | ||
3578 | * This function will sleep at least once waiting for new free space to | ||
3579 | * show up, and then it will check the block group free space numbers | ||
3580 | * for our min num_bytes. Another option is to have it go ahead | ||
3581 | * and look in the rbtree for a free extent of a given size, but this | ||
3582 | * is a good start. | ||
3583 | */ | ||
3584 | static noinline int | ||
3585 | wait_block_group_cache_progress(struct btrfs_block_group_cache *cache, | ||
3586 | u64 num_bytes) | ||
3587 | { | ||
3588 | DEFINE_WAIT(wait); | ||
3589 | |||
3590 | prepare_to_wait(&cache->caching_q, &wait, TASK_UNINTERRUPTIBLE); | ||
3591 | |||
3592 | if (block_group_cache_done(cache)) { | ||
3593 | finish_wait(&cache->caching_q, &wait); | ||
3594 | return 0; | ||
3595 | } | ||
3596 | schedule(); | ||
3597 | finish_wait(&cache->caching_q, &wait); | ||
3598 | |||
3599 | wait_event(cache->caching_q, block_group_cache_done(cache) || | ||
3600 | (cache->free_space >= num_bytes)); | ||
3601 | return 0; | ||
3602 | } | ||
3603 | |||
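This is the consumer half of a handshake with caching_kthread() earlier in the diff, which calls wake_up(&block_group->caching_q) both each time roughly 2MB of new free space has been published and once more when it finishes. Reduced to its core, the pairing looks like:

/* producer, in the caching thread */
total_found += add_new_free_space(block_group, fs_info, last, key.objectid);
if (total_found > (1024 * 1024 * 2)) {
        total_found = 0;
        wake_up(&block_group->caching_q);
}

/* consumer, in the allocator */
wait_event(cache->caching_q, block_group_cache_done(cache) ||
           (cache->free_space >= num_bytes));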
3604 | enum btrfs_loop_type { | ||
3605 | LOOP_CACHED_ONLY = 0, | ||
3606 | LOOP_CACHING_NOWAIT = 1, | ||
3607 | LOOP_CACHING_WAIT = 2, | ||
3608 | LOOP_ALLOC_CHUNK = 3, | ||
3609 | LOOP_NO_EMPTY_SIZE = 4, | ||
3610 | }; | ||
3611 | |||
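Each enum value names one fallback stage: a failed pass through find_free_extent() bumps loop and retries with one constraint relaxed, until LOOP_NO_EMPTY_SIZE also fails and the allocator reports no space. Schematically (try_alloc() is a made-up stand-in for a single search pass, not a btrfs function):

static int alloc_with_fallbacks(void)
{
        int loop = LOOP_CACHED_ONLY;

        for (;;) {
                if (try_alloc(loop) == 0)
                        return 0;               /* found space          */
                if (loop == LOOP_NO_EMPTY_SIZE)
                        return -ENOSPC;         /* no fallbacks left    */
                loop++;                         /* relax one constraint */
        }
}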
3612 | /* | ||
3439 | * walks the btree of allocated extents and finds a hole of a given size. | 3613 |
3440 | * The key ins is changed to record the hole: | 3614 | * The key ins is changed to record the hole: |
3441 | * ins->objectid == block start | 3615 | * ins->objectid == block start |
@@ -3460,6 +3634,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, | |||
3460 | struct btrfs_space_info *space_info; | 3634 | struct btrfs_space_info *space_info; |
3461 | int last_ptr_loop = 0; | 3635 | int last_ptr_loop = 0; |
3462 | int loop = 0; | 3636 | int loop = 0; |
3637 | bool found_uncached_bg = false; | ||
3463 | 3638 | ||
3464 | WARN_ON(num_bytes < root->sectorsize); | 3639 | WARN_ON(num_bytes < root->sectorsize); |
3465 | btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); | 3640 | btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); |
@@ -3491,15 +3666,18 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, | |||
3491 | search_start = max(search_start, first_logical_byte(root, 0)); | 3666 | search_start = max(search_start, first_logical_byte(root, 0)); |
3492 | search_start = max(search_start, hint_byte); | 3667 | search_start = max(search_start, hint_byte); |
3493 | 3668 | ||
3494 | if (!last_ptr) { | 3669 | if (!last_ptr) |
3495 | empty_cluster = 0; | 3670 | empty_cluster = 0; |
3496 | loop = 1; | ||
3497 | } | ||
3498 | 3671 | ||
3499 | if (search_start == hint_byte) { | 3672 | if (search_start == hint_byte) { |
3500 | block_group = btrfs_lookup_block_group(root->fs_info, | 3673 | block_group = btrfs_lookup_block_group(root->fs_info, |
3501 | search_start); | 3674 | search_start); |
3502 | if (block_group && block_group_bits(block_group, data)) { | 3675 | /* |
3676 | * we don't want to use the block group if it doesn't match our | ||
3677 | * allocation bits, or if it's not cached. | ||
3678 | */ | ||
3679 | if (block_group && block_group_bits(block_group, data) && | ||
3680 | block_group_cache_done(block_group)) { | ||
3503 | down_read(&space_info->groups_sem); | 3681 | down_read(&space_info->groups_sem); |
3504 | if (list_empty(&block_group->list) || | 3682 | if (list_empty(&block_group->list) || |
3505 | block_group->ro) { | 3683 | block_group->ro) { |
@@ -3522,21 +3700,35 @@ search: | |||
3522 | down_read(&space_info->groups_sem); | 3700 | down_read(&space_info->groups_sem); |
3523 | list_for_each_entry(block_group, &space_info->block_groups, list) { | 3701 | list_for_each_entry(block_group, &space_info->block_groups, list) { |
3524 | u64 offset; | 3702 | u64 offset; |
3703 | int cached; | ||
3525 | 3704 | ||
3526 | atomic_inc(&block_group->count); | 3705 | atomic_inc(&block_group->count); |
3527 | search_start = block_group->key.objectid; | 3706 | search_start = block_group->key.objectid; |
3528 | 3707 | ||
3529 | have_block_group: | 3708 | have_block_group: |
3530 | if (unlikely(!block_group->cached)) { | 3709 | if (unlikely(block_group->cached == BTRFS_CACHE_NO)) { |
3531 | mutex_lock(&block_group->cache_mutex); | 3710 | /* |
3532 | ret = cache_block_group(root, block_group); | 3711 | * we want to start caching kthreads, but not too many |
3533 | mutex_unlock(&block_group->cache_mutex); | 3712 | * right off the bat so we don't overwhelm the system, |
3534 | if (ret) { | 3713 | * so only start them if there are fewer than 2 and we're |
3535 | btrfs_put_block_group(block_group); | 3714 | * in the initial allocation phase. |
3536 | break; | 3715 | */ |
3716 | if (loop > LOOP_CACHING_NOWAIT || | ||
3717 | atomic_read(&space_info->caching_threads) < 2) { | ||
3718 | ret = cache_block_group(block_group); | ||
3719 | BUG_ON(ret); | ||
3537 | } | 3720 | } |
3538 | } | 3721 | } |
3539 | 3722 | ||
3723 | cached = block_group_cache_done(block_group); | ||
3724 | if (unlikely(!cached)) { | ||
3725 | found_uncached_bg = true; | ||
3726 | |||
3727 | /* if we only want cached bgs, loop */ | ||
3728 | if (loop == LOOP_CACHED_ONLY) | ||
3729 | goto loop; | ||
3730 | } | ||
3731 | |||
3540 | if (unlikely(block_group->ro)) | 3732 | if (unlikely(block_group->ro)) |
3541 | goto loop; | 3733 | goto loop; |
3542 | 3734 | ||
@@ -3615,14 +3807,21 @@ refill_cluster: | |||
3615 | spin_unlock(&last_ptr->refill_lock); | 3807 | spin_unlock(&last_ptr->refill_lock); |
3616 | goto checks; | 3808 | goto checks; |
3617 | } | 3809 | } |
3810 | } else if (!cached && loop > LOOP_CACHING_NOWAIT) { | ||
3811 | spin_unlock(&last_ptr->refill_lock); | ||
3812 | |||
3813 | wait_block_group_cache_progress(block_group, | ||
3814 | num_bytes + empty_cluster + empty_size); | ||
3815 | goto have_block_group; | ||
3618 | } | 3816 | } |
3817 | |||
3619 | /* | 3818 | /* |
3620 | * at this point we either didn't find a cluster | 3819 | * at this point we either didn't find a cluster |
3621 | * or we weren't able to allocate a block from our | 3820 | * or we weren't able to allocate a block from our |
3622 | * cluster. Free the cluster we've been trying | 3821 | * cluster. Free the cluster we've been trying |
3623 | * to use, and go to the next block group | 3822 | * to use, and go to the next block group |
3624 | */ | 3823 | */ |
3625 | if (loop < 2) { | 3824 | if (loop < LOOP_NO_EMPTY_SIZE) { |
3626 | btrfs_return_cluster_to_free_space(NULL, | 3825 | btrfs_return_cluster_to_free_space(NULL, |
3627 | last_ptr); | 3826 | last_ptr); |
3628 | spin_unlock(&last_ptr->refill_lock); | 3827 | spin_unlock(&last_ptr->refill_lock); |
@@ -3633,11 +3832,17 @@ refill_cluster: | |||
3633 | 3832 | ||
3634 | offset = btrfs_find_space_for_alloc(block_group, search_start, | 3833 | offset = btrfs_find_space_for_alloc(block_group, search_start, |
3635 | num_bytes, empty_size); | 3834 | num_bytes, empty_size); |
3636 | if (!offset) | 3835 | if (!offset && (cached || (!cached && |
3836 | loop == LOOP_CACHING_NOWAIT))) { | ||
3637 | goto loop; | 3837 | goto loop; |
3838 | } else if (!offset && (!cached && | ||
3839 | loop > LOOP_CACHING_NOWAIT)) { | ||
3840 | wait_block_group_cache_progress(block_group, | ||
3841 | num_bytes + empty_size); | ||
3842 | goto have_block_group; | ||
3843 | } | ||
3638 | checks: | 3844 | checks: |
3639 | search_start = stripe_align(root, offset); | 3845 | search_start = stripe_align(root, offset); |
3640 | |||
3641 | /* move on to the next group */ | 3846 | /* move on to the next group */ |
3642 | if (search_start + num_bytes >= search_end) { | 3847 | if (search_start + num_bytes >= search_end) { |
3643 | btrfs_add_free_space(block_group, offset, num_bytes); | 3848 | btrfs_add_free_space(block_group, offset, num_bytes); |
@@ -3683,13 +3888,26 @@ loop: | |||
3683 | } | 3888 | } |
3684 | up_read(&space_info->groups_sem); | 3889 | up_read(&space_info->groups_sem); |
3685 | 3890 | ||
3686 | /* loop == 0, try to find a clustered alloc in every block group | 3891 | /* LOOP_CACHED_ONLY, only search fully cached block groups |
3687 | * loop == 1, try again after forcing a chunk allocation | 3892 | * LOOP_CACHING_NOWAIT, search partially cached block groups, but |
3688 | * loop == 2, set empty_size and empty_cluster to 0 and try again | 3893 | * don't wait for them to finish caching |
3894 | * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching | ||
3895 | * LOOP_ALLOC_CHUNK, force a chunk allocation and try again | ||
3896 | * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try | ||
3897 | * again | ||
3689 | */ | 3898 | */ |
3690 | if (!ins->objectid && loop < 3 && | 3899 | if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE && |
3691 | (empty_size || empty_cluster || allowed_chunk_alloc)) { | 3900 | (found_uncached_bg || empty_size || empty_cluster || |
3692 | if (loop >= 2) { | 3901 | allowed_chunk_alloc)) { |
3902 | if (found_uncached_bg) { | ||
3903 | found_uncached_bg = false; | ||
3904 | if (loop < LOOP_CACHING_WAIT) { | ||
3905 | loop++; | ||
3906 | goto search; | ||
3907 | } | ||
3908 | } | ||
3909 | |||
3910 | if (loop == LOOP_ALLOC_CHUNK) { | ||
3693 | empty_size = 0; | 3911 | empty_size = 0; |
3694 | empty_cluster = 0; | 3912 | empty_cluster = 0; |
3695 | } | 3913 | } |
@@ -3702,7 +3920,7 @@ loop: | |||
3702 | space_info->force_alloc = 1; | 3920 | space_info->force_alloc = 1; |
3703 | } | 3921 | } |
3704 | 3922 | ||
3705 | if (loop < 3) { | 3923 | if (loop < LOOP_NO_EMPTY_SIZE) { |
3706 | loop++; | 3924 | loop++; |
3707 | goto search; | 3925 | goto search; |
3708 | } | 3926 | } |
@@ -3798,7 +4016,7 @@ again: | |||
3798 | num_bytes, data, 1); | 4016 | num_bytes, data, 1); |
3799 | goto again; | 4017 | goto again; |
3800 | } | 4018 | } |
3801 | if (ret) { | 4019 | if (ret == -ENOSPC) { |
3802 | struct btrfs_space_info *sinfo; | 4020 | struct btrfs_space_info *sinfo; |
3803 | 4021 | ||
3804 | sinfo = __find_space_info(root->fs_info, data); | 4022 | sinfo = __find_space_info(root->fs_info, data); |
@@ -3806,7 +4024,6 @@ again: | |||
3806 | "wanted %llu\n", (unsigned long long)data, | 4024 | "wanted %llu\n", (unsigned long long)data, |
3807 | (unsigned long long)num_bytes); | 4025 | (unsigned long long)num_bytes); |
3808 | dump_space_info(sinfo, num_bytes); | 4026 | dump_space_info(sinfo, num_bytes); |
3809 | BUG(); | ||
3810 | } | 4027 | } |
3811 | 4028 | ||
3812 | return ret; | 4029 | return ret; |
@@ -3844,7 +4061,9 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans, | |||
3844 | ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size, | 4061 | ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size, |
3845 | empty_size, hint_byte, search_end, ins, | 4062 | empty_size, hint_byte, search_end, ins, |
3846 | data); | 4063 | data); |
3847 | update_reserved_extents(root, ins->objectid, ins->offset, 1); | 4064 | if (!ret) |
4065 | update_reserved_extents(root, ins->objectid, ins->offset, 1); | ||
4066 | |||
3848 | return ret; | 4067 | return ret; |
3849 | } | 4068 | } |
3850 | 4069 | ||
@@ -4006,9 +4225,9 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, | |||
4006 | struct btrfs_block_group_cache *block_group; | 4225 | struct btrfs_block_group_cache *block_group; |
4007 | 4226 | ||
4008 | block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); | 4227 | block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); |
4009 | mutex_lock(&block_group->cache_mutex); | 4228 | cache_block_group(block_group); |
4010 | cache_block_group(root, block_group); | 4229 | wait_event(block_group->caching_q, |
4011 | mutex_unlock(&block_group->cache_mutex); | 4230 | block_group_cache_done(block_group)); |
4012 | 4231 | ||
4013 | ret = btrfs_remove_free_space(block_group, ins->objectid, | 4232 | ret = btrfs_remove_free_space(block_group, ins->objectid, |
4014 | ins->offset); | 4233 | ins->offset); |
@@ -4039,7 +4258,8 @@ static int alloc_tree_block(struct btrfs_trans_handle *trans, | |||
4039 | ret = __btrfs_reserve_extent(trans, root, num_bytes, num_bytes, | 4258 | ret = __btrfs_reserve_extent(trans, root, num_bytes, num_bytes, |
4040 | empty_size, hint_byte, search_end, | 4259 | empty_size, hint_byte, search_end, |
4041 | ins, 0); | 4260 | ins, 0); |
4042 | BUG_ON(ret); | 4261 | if (ret) |
4262 | return ret; | ||
4043 | 4263 | ||
4044 | if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { | 4264 | if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) { |
4045 | if (parent == 0) | 4265 | if (parent == 0) |
@@ -6955,11 +7175,16 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) | |||
6955 | &info->block_group_cache_tree); | 7175 | &info->block_group_cache_tree); |
6956 | spin_unlock(&info->block_group_cache_lock); | 7176 | spin_unlock(&info->block_group_cache_lock); |
6957 | 7177 | ||
6958 | btrfs_remove_free_space_cache(block_group); | ||
6959 | down_write(&block_group->space_info->groups_sem); | 7178 | down_write(&block_group->space_info->groups_sem); |
6960 | list_del(&block_group->list); | 7179 | list_del(&block_group->list); |
6961 | up_write(&block_group->space_info->groups_sem); | 7180 | up_write(&block_group->space_info->groups_sem); |
6962 | 7181 | ||
7182 | if (block_group->cached == BTRFS_CACHE_STARTED) | ||
7183 | wait_event(block_group->caching_q, | ||
7184 | block_group_cache_done(block_group)); | ||
7185 | |||
7186 | btrfs_remove_free_space_cache(block_group); | ||
7187 | |||
6963 | WARN_ON(atomic_read(&block_group->count) != 1); | 7188 | WARN_ON(atomic_read(&block_group->count) != 1); |
6964 | kfree(block_group); | 7189 | kfree(block_group); |
6965 | 7190 | ||
@@ -7025,9 +7250,19 @@ int btrfs_read_block_groups(struct btrfs_root *root) | |||
7025 | atomic_set(&cache->count, 1); | 7250 | atomic_set(&cache->count, 1); |
7026 | spin_lock_init(&cache->lock); | 7251 | spin_lock_init(&cache->lock); |
7027 | spin_lock_init(&cache->tree_lock); | 7252 | spin_lock_init(&cache->tree_lock); |
7028 | mutex_init(&cache->cache_mutex); | 7253 | cache->fs_info = info; |
7254 | init_waitqueue_head(&cache->caching_q); | ||
7029 | INIT_LIST_HEAD(&cache->list); | 7255 | INIT_LIST_HEAD(&cache->list); |
7030 | INIT_LIST_HEAD(&cache->cluster_list); | 7256 | INIT_LIST_HEAD(&cache->cluster_list); |
7257 | |||
7258 | /* | ||
7259 | * we only want to have 32k of RAM per block group for keeping | ||
7260 | * track of free space, and if we pass 1/2 of that we want to | ||
7261 | * start converting things over to using bitmaps | ||
7262 | */ | ||
7263 | cache->extents_thresh = ((1024 * 32) / 2) / | ||
7264 | sizeof(struct btrfs_free_space); | ||
7265 | |||
7031 | read_extent_buffer(leaf, &cache->item, | 7266 | read_extent_buffer(leaf, &cache->item, |
7032 | btrfs_item_ptr_offset(leaf, path->slots[0]), | 7267 | btrfs_item_ptr_offset(leaf, path->slots[0]), |
7033 | sizeof(cache->item)); | 7268 | sizeof(cache->item)); |
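For a sense of scale on the extents_thresh computation above: half of the 32k budget is 16k. Assuming sizeof(struct btrfs_free_space) works out to 64 bytes on a 64-bit build (an illustrative figure; the real value depends on arch and config), the threshold comes to about 256 extent entries before the code starts converting to bitmaps:

/* illustrative arithmetic only; ENTRY_SIZE is an assumption */
#define CACHE_BYTES     (32 * 1024)     /* per-group tracking budget  */
#define ENTRY_SIZE      64              /* assumed sizeof() on 64-bit */

int extents_thresh = (CACHE_BYTES / 2) / ENTRY_SIZE;    /* == 256 */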
@@ -7036,6 +7271,26 @@ int btrfs_read_block_groups(struct btrfs_root *root) | |||
7036 | key.objectid = found_key.objectid + found_key.offset; | 7271 | key.objectid = found_key.objectid + found_key.offset; |
7037 | btrfs_release_path(root, path); | 7272 | btrfs_release_path(root, path); |
7038 | cache->flags = btrfs_block_group_flags(&cache->item); | 7273 | cache->flags = btrfs_block_group_flags(&cache->item); |
7274 | cache->sectorsize = root->sectorsize; | ||
7275 | |||
7276 | remove_sb_from_cache(root, cache); | ||
7277 | |||
7278 | /* | ||
7279 | * check for two cases, either we are full, and therefore | ||
7280 | * don't need to bother with the caching work since we won't | ||
7281 | * find any space, or we are empty, and we can just add all | ||
7282 | * the space in and be done with it. This saves us _a lot_ of | ||
7283 | * time, particularly in the full case. | ||
7284 | */ | ||
7285 | if (found_key.offset == btrfs_block_group_used(&cache->item)) { | ||
7286 | cache->cached = BTRFS_CACHE_FINISHED; | ||
7287 | } else if (btrfs_block_group_used(&cache->item) == 0) { | ||
7288 | cache->cached = BTRFS_CACHE_FINISHED; | ||
7289 | add_new_free_space(cache, root->fs_info, | ||
7290 | found_key.objectid, | ||
7291 | found_key.objectid + | ||
7292 | found_key.offset); | ||
7293 | } | ||
7039 | 7294 | ||
7040 | ret = update_space_info(info, cache->flags, found_key.offset, | 7295 | ret = update_space_info(info, cache->flags, found_key.offset, |
7041 | btrfs_block_group_used(&cache->item), | 7296 | btrfs_block_group_used(&cache->item), |
@@ -7079,10 +7334,19 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, | |||
7079 | cache->key.objectid = chunk_offset; | 7334 | cache->key.objectid = chunk_offset; |
7080 | cache->key.offset = size; | 7335 | cache->key.offset = size; |
7081 | cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; | 7336 | cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; |
7337 | cache->sectorsize = root->sectorsize; | ||
7338 | |||
7339 | /* | ||
7340 | * we only want to have 32k of RAM per block group for keeping track | ||
7341 | * of free space, and if we pass 1/2 of that we want to start | ||
7342 | * converting things over to using bitmaps | ||
7343 | */ | ||
7344 | cache->extents_thresh = ((1024 * 32) / 2) / | ||
7345 | sizeof(struct btrfs_free_space); | ||
7082 | atomic_set(&cache->count, 1); | 7346 | atomic_set(&cache->count, 1); |
7083 | spin_lock_init(&cache->lock); | 7347 | spin_lock_init(&cache->lock); |
7084 | spin_lock_init(&cache->tree_lock); | 7348 | spin_lock_init(&cache->tree_lock); |
7085 | mutex_init(&cache->cache_mutex); | 7349 | init_waitqueue_head(&cache->caching_q); |
7086 | INIT_LIST_HEAD(&cache->list); | 7350 | INIT_LIST_HEAD(&cache->list); |
7087 | INIT_LIST_HEAD(&cache->cluster_list); | 7351 | INIT_LIST_HEAD(&cache->cluster_list); |
7088 | 7352 | ||
@@ -7091,6 +7355,12 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, | |||
7091 | cache->flags = type; | 7355 | cache->flags = type; |
7092 | btrfs_set_block_group_flags(&cache->item, type); | 7356 | btrfs_set_block_group_flags(&cache->item, type); |
7093 | 7357 | ||
7358 | cache->cached = BTRFS_CACHE_FINISHED; | ||
7359 | remove_sb_from_cache(root, cache); | ||
7360 | |||
7361 | add_new_free_space(cache, root->fs_info, chunk_offset, | ||
7362 | chunk_offset + size); | ||
7363 | |||
7094 | ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, | 7364 | ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, |
7095 | &cache->space_info); | 7365 | &cache->space_info); |
7096 | BUG_ON(ret); | 7366 | BUG_ON(ret); |
@@ -7149,7 +7419,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, | |||
7149 | rb_erase(&block_group->cache_node, | 7419 | rb_erase(&block_group->cache_node, |
7150 | &root->fs_info->block_group_cache_tree); | 7420 | &root->fs_info->block_group_cache_tree); |
7151 | spin_unlock(&root->fs_info->block_group_cache_lock); | 7421 | spin_unlock(&root->fs_info->block_group_cache_lock); |
7152 | btrfs_remove_free_space_cache(block_group); | 7422 | |
7153 | down_write(&block_group->space_info->groups_sem); | 7423 | down_write(&block_group->space_info->groups_sem); |
7154 | /* | 7424 | /* |
7155 | * we must use list_del_init so people can check to see if they | 7425 | * we must use list_del_init so people can check to see if they |
@@ -7158,11 +7428,18 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, | |||
7158 | list_del_init(&block_group->list); | 7428 | list_del_init(&block_group->list); |
7159 | up_write(&block_group->space_info->groups_sem); | 7429 | up_write(&block_group->space_info->groups_sem); |
7160 | 7430 | ||
7431 | if (block_group->cached == BTRFS_CACHE_STARTED) | ||
7432 | wait_event(block_group->caching_q, | ||
7433 | block_group_cache_done(block_group)); | ||
7434 | |||
7435 | btrfs_remove_free_space_cache(block_group); | ||
7436 | |||
7161 | spin_lock(&block_group->space_info->lock); | 7437 | spin_lock(&block_group->space_info->lock); |
7162 | block_group->space_info->total_bytes -= block_group->key.offset; | 7438 | block_group->space_info->total_bytes -= block_group->key.offset; |
7163 | block_group->space_info->bytes_readonly -= block_group->key.offset; | 7439 | block_group->space_info->bytes_readonly -= block_group->key.offset; |
7164 | spin_unlock(&block_group->space_info->lock); | 7440 | spin_unlock(&block_group->space_info->lock); |
7165 | block_group->space_info->full = 0; | 7441 | |
7442 | btrfs_clear_space_info_full(root->fs_info); | ||
7166 | 7443 | ||
7167 | btrfs_put_block_group(block_group); | 7444 | btrfs_put_block_group(block_group); |
7168 | btrfs_put_block_group(block_group); | 7445 | btrfs_put_block_group(block_group); |
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 4538e48581a5..5edcee3a617f 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c | |||
@@ -16,45 +16,46 @@ | |||
16 | * Boston, MA 021110-1307, USA. | 16 | * Boston, MA 021110-1307, USA. |
17 | */ | 17 | */ |
18 | 18 | ||
19 | #include <linux/pagemap.h> | ||
19 | #include <linux/sched.h> | 20 | #include <linux/sched.h> |
21 | #include <linux/math64.h> | ||
20 | #include "ctree.h" | 22 | #include "ctree.h" |
21 | #include "free-space-cache.h" | 23 | #include "free-space-cache.h" |
22 | #include "transaction.h" | 24 | #include "transaction.h" |
23 | 25 | ||
24 | struct btrfs_free_space { | 26 | #define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8) |
25 | struct rb_node bytes_index; | 27 | #define MAX_CACHE_BYTES_PER_GIG (32 * 1024) |
26 | struct rb_node offset_index; | ||
27 | u64 offset; | ||
28 | u64 bytes; | ||
29 | }; | ||
30 | 28 | ||
31 | static int tree_insert_offset(struct rb_root *root, u64 offset, | 29 | static inline unsigned long offset_to_bit(u64 bitmap_start, u64 sectorsize, |
32 | struct rb_node *node) | 30 | u64 offset) |
33 | { | 31 | { |
34 | struct rb_node **p = &root->rb_node; | 32 | BUG_ON(offset < bitmap_start); |
35 | struct rb_node *parent = NULL; | 33 | offset -= bitmap_start; |
36 | struct btrfs_free_space *info; | 34 | return (unsigned long)(div64_u64(offset, sectorsize)); |
35 | } | ||
37 | 36 | ||
38 | while (*p) { | 37 | static inline unsigned long bytes_to_bits(u64 bytes, u64 sectorsize) |
39 | parent = *p; | 38 | { |
40 | info = rb_entry(parent, struct btrfs_free_space, offset_index); | 39 | return (unsigned long)(div64_u64(bytes, sectorsize)); |
40 | } | ||
41 | 41 | ||
42 | if (offset < info->offset) | 42 | static inline u64 offset_to_bitmap(struct btrfs_block_group_cache *block_group, |
43 | p = &(*p)->rb_left; | 43 | u64 offset) |
44 | else if (offset > info->offset) | 44 | { |
45 | p = &(*p)->rb_right; | 45 | u64 bitmap_start; |
46 | else | 46 | u64 bytes_per_bitmap; |
47 | return -EEXIST; | ||
48 | } | ||
49 | 47 | ||
50 | rb_link_node(node, parent, p); | 48 | bytes_per_bitmap = BITS_PER_BITMAP * block_group->sectorsize; |
51 | rb_insert_color(node, root); | 49 | bitmap_start = offset - block_group->key.objectid; |
50 | bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap); | ||
51 | bitmap_start *= bytes_per_bitmap; | ||
52 | bitmap_start += block_group->key.objectid; | ||
52 | 53 | ||
53 | return 0; | 54 | return bitmap_start; |
54 | } | 55 | } |
55 | 56 | ||
56 | static int tree_insert_bytes(struct rb_root *root, u64 bytes, | 57 | static int tree_insert_offset(struct rb_root *root, u64 offset, |
57 | struct rb_node *node) | 58 | struct rb_node *node, int bitmap) |
58 | { | 59 | { |
59 | struct rb_node **p = &root->rb_node; | 60 | struct rb_node **p = &root->rb_node; |
60 | struct rb_node *parent = NULL; | 61 | struct rb_node *parent = NULL; |
@@ -62,12 +63,34 @@ static int tree_insert_bytes(struct rb_root *root, u64 bytes, | |||
62 | 63 | ||
63 | while (*p) { | 64 | while (*p) { |
64 | parent = *p; | 65 | parent = *p; |
65 | info = rb_entry(parent, struct btrfs_free_space, bytes_index); | 66 | info = rb_entry(parent, struct btrfs_free_space, offset_index); |
66 | 67 | ||
67 | if (bytes < info->bytes) | 68 | if (offset < info->offset) { |
68 | p = &(*p)->rb_left; | 69 | p = &(*p)->rb_left; |
69 | else | 70 | } else if (offset > info->offset) { |
70 | p = &(*p)->rb_right; | 71 | p = &(*p)->rb_right; |
72 | } else { | ||
73 | /* | ||
74 | * we could have a bitmap entry and an extent entry | ||
75 | * share the same offset. If this is the case, we want | ||
76 | * the extent entry to always be found first if we do a | ||
77 | * linear search through the tree, since we want to have | ||
78 | * the quickest allocation time, and allocating from an | ||
79 | * extent is faster than allocating from a bitmap. So | ||
80 | * if we're inserting a bitmap and we find an entry at | ||
81 | * this offset, we want to go right, or after this entry | ||
82 | * logically. If we are inserting an extent and we've | ||
83 | * found a bitmap, we want to go left, or before | ||
84 | * logically. | ||
85 | */ | ||
86 | if (bitmap) { | ||
87 | WARN_ON(info->bitmap); | ||
88 | p = &(*p)->rb_right; | ||
89 | } else { | ||
90 | WARN_ON(!info->bitmap); | ||
91 | p = &(*p)->rb_left; | ||
92 | } | ||
93 | } | ||
71 | } | 94 | } |
72 | 95 | ||
73 | rb_link_node(node, parent, p); | 96 | rb_link_node(node, parent, p); |
@@ -79,110 +102,143 @@ static int tree_insert_bytes(struct rb_root *root, u64 bytes, | |||
79 | /* | 102 | /* |
80 | * searches the tree for the given offset. | 103 | * searches the tree for the given offset. |
81 | * | 104 | * |
82 | * fuzzy == 1: this is used for allocations where we are given a hint of where | 105 | * fuzzy - If this is set, then we are trying to make an allocation, and we just |
83 | * to look for free space. Because the hint may not be completely on an offset | 106 | * want a section that is at least 'bytes' in size and comes at or after the given |
84 | * mark, or the hint may no longer point to free space we need to fudge our | 107 | * offset. |
85 | * results a bit. So we look for free space starting at or after offset with at | ||
86 | * least bytes size. We prefer to find as close to the given offset as we can. | ||
87 | * Also if the offset is within a free space range, then we will return the free | ||
88 | * space that contains the given offset, which means we can return a free space | ||
89 | * chunk with an offset before the provided offset. | ||
90 | * | ||
91 | * fuzzy == 0: this is just a normal tree search. Give us the free space that | ||
92 | * starts at the given offset which is at least bytes size, and if its not there | ||
93 | * return NULL. | ||
94 | */ | 108 | */ |
95 | static struct btrfs_free_space *tree_search_offset(struct rb_root *root, | 109 | static struct btrfs_free_space * |
96 | u64 offset, u64 bytes, | 110 | tree_search_offset(struct btrfs_block_group_cache *block_group, |
97 | int fuzzy) | 111 | u64 offset, int bitmap_only, int fuzzy) |
98 | { | 112 | { |
99 | struct rb_node *n = root->rb_node; | 113 | struct rb_node *n = block_group->free_space_offset.rb_node; |
100 | struct btrfs_free_space *entry, *ret = NULL; | 114 | struct btrfs_free_space *entry, *prev = NULL; |
115 | |||
116 | /* find entry that is closest to the 'offset' */ | ||
117 | while (1) { | ||
118 | if (!n) { | ||
119 | entry = NULL; | ||
120 | break; | ||
121 | } | ||
101 | 122 | ||
102 | while (n) { | ||
103 | entry = rb_entry(n, struct btrfs_free_space, offset_index); | 123 | entry = rb_entry(n, struct btrfs_free_space, offset_index); |
124 | prev = entry; | ||
104 | 125 | ||
105 | if (offset < entry->offset) { | 126 | if (offset < entry->offset) |
106 | if (fuzzy && | ||
107 | (!ret || entry->offset < ret->offset) && | ||
108 | (bytes <= entry->bytes)) | ||
109 | ret = entry; | ||
110 | n = n->rb_left; | 127 | n = n->rb_left; |
111 | } else if (offset > entry->offset) { | 128 | else if (offset > entry->offset) |
112 | if (fuzzy && | ||
113 | (entry->offset + entry->bytes - 1) >= offset && | ||
114 | bytes <= entry->bytes) { | ||
115 | ret = entry; | ||
116 | break; | ||
117 | } | ||
118 | n = n->rb_right; | 129 | n = n->rb_right; |
119 | } else { | 130 | else |
120 | if (bytes > entry->bytes) { | ||
121 | n = n->rb_right; | ||
122 | continue; | ||
123 | } | ||
124 | ret = entry; | ||
125 | break; | 131 | break; |
126 | } | ||
127 | } | 132 | } |
128 | 133 | ||
129 | return ret; | 134 | if (bitmap_only) { |
130 | } | 135 | if (!entry) |
136 | return NULL; | ||
137 | if (entry->bitmap) | ||
138 | return entry; | ||
131 | 139 | ||
132 | /* | 140 | /* |
133 | * return a chunk at least bytes size, as close to offset that we can get. | 141 | * bitmap entry and extent entry may share same offset, |
134 | */ | 142 | * in that case, bitmap entry comes after extent entry. |
135 | static struct btrfs_free_space *tree_search_bytes(struct rb_root *root, | 143 | */ |
136 | u64 offset, u64 bytes) | 144 | n = rb_next(n); |
137 | { | 145 | if (!n) |
138 | struct rb_node *n = root->rb_node; | 146 | return NULL; |
139 | struct btrfs_free_space *entry, *ret = NULL; | 147 | entry = rb_entry(n, struct btrfs_free_space, offset_index); |
140 | 148 | if (entry->offset != offset) | |
141 | while (n) { | 149 | return NULL; |
142 | entry = rb_entry(n, struct btrfs_free_space, bytes_index); | ||
143 | 150 | ||
144 | if (bytes < entry->bytes) { | 151 | WARN_ON(!entry->bitmap); |
152 | return entry; | ||
153 | } else if (entry) { | ||
154 | if (entry->bitmap) { | ||
145 | /* | 155 | /* |
146 | * We prefer to get a hole size as close to the size we | 156 | * if previous extent entry covers the offset, |
147 | * are asking for so we don't take small slivers out of | 157 | * we should return it instead of the bitmap entry |
148 | * huge holes, but we also want to get as close to the | ||
149 | * offset as possible so we don't have a whole lot of | ||
150 | * fragmentation. | ||
151 | */ | 158 | */ |
152 | if (offset <= entry->offset) { | 159 | n = &entry->offset_index; |
153 | if (!ret) | 160 | while (1) { |
154 | ret = entry; | 161 | n = rb_prev(n); |
155 | else if (entry->bytes < ret->bytes) | 162 | if (!n) |
156 | ret = entry; | 163 | break; |
157 | else if (entry->offset < ret->offset) | 164 | prev = rb_entry(n, struct btrfs_free_space, |
158 | ret = entry; | 165 | offset_index); |
166 | if (!prev->bitmap) { | ||
167 | if (prev->offset + prev->bytes > offset) | ||
168 | entry = prev; | ||
169 | break; | ||
170 | } | ||
159 | } | 171 | } |
160 | n = n->rb_left; | 172 | } |
161 | } else if (bytes > entry->bytes) { | 173 | return entry; |
162 | n = n->rb_right; | 174 | } |
175 | |||
176 | if (!prev) | ||
177 | return NULL; | ||
178 | |||
179 | /* find last entry before the 'offset' */ | ||
180 | entry = prev; | ||
181 | if (entry->offset > offset) { | ||
182 | n = rb_prev(&entry->offset_index); | ||
183 | if (n) { | ||
184 | entry = rb_entry(n, struct btrfs_free_space, | ||
185 | offset_index); | ||
186 | BUG_ON(entry->offset > offset); | ||
163 | } else { | 187 | } else { |
164 | /* | 188 | if (fuzzy) |
165 | * Ok we may have multiple chunks of the wanted size, | 189 | return entry; |
166 | * so we don't want to take the first one we find, we | 190 | else |
167 | * want to take the one closest to our given offset, so | 191 | return NULL; |
168 | * keep searching just in case theres a better match. | ||
169 | */ | ||
170 | n = n->rb_right; | ||
171 | if (offset > entry->offset) | ||
172 | continue; | ||
173 | else if (!ret || entry->offset < ret->offset) | ||
174 | ret = entry; | ||
175 | } | 192 | } |
176 | } | 193 | } |
177 | 194 | ||
178 | return ret; | 195 | if (entry->bitmap) { |
196 | n = &entry->offset_index; | ||
197 | while (1) { | ||
198 | n = rb_prev(n); | ||
199 | if (!n) | ||
200 | break; | ||
201 | prev = rb_entry(n, struct btrfs_free_space, | ||
202 | offset_index); | ||
203 | if (!prev->bitmap) { | ||
204 | if (prev->offset + prev->bytes > offset) | ||
205 | return prev; | ||
206 | break; | ||
207 | } | ||
208 | } | ||
209 | if (entry->offset + BITS_PER_BITMAP * | ||
210 | block_group->sectorsize > offset) | ||
211 | return entry; | ||
212 | } else if (entry->offset + entry->bytes > offset) | ||
213 | return entry; | ||
214 | |||
215 | if (!fuzzy) | ||
216 | return NULL; | ||
217 | |||
218 | while (1) { | ||
219 | if (entry->bitmap) { | ||
220 | if (entry->offset + BITS_PER_BITMAP * | ||
221 | block_group->sectorsize > offset) | ||
222 | break; | ||
223 | } else { | ||
224 | if (entry->offset + entry->bytes > offset) | ||
225 | break; | ||
226 | } | ||
227 | |||
228 | n = rb_next(&entry->offset_index); | ||
229 | if (!n) | ||
230 | return NULL; | ||
231 | entry = rb_entry(n, struct btrfs_free_space, offset_index); | ||
232 | } | ||
233 | return entry; | ||
179 | } | 234 | } |
180 | 235 | ||
181 | static void unlink_free_space(struct btrfs_block_group_cache *block_group, | 236 | static void unlink_free_space(struct btrfs_block_group_cache *block_group, |
182 | struct btrfs_free_space *info) | 237 | struct btrfs_free_space *info) |
183 | { | 238 | { |
184 | rb_erase(&info->offset_index, &block_group->free_space_offset); | 239 | rb_erase(&info->offset_index, &block_group->free_space_offset); |
185 | rb_erase(&info->bytes_index, &block_group->free_space_bytes); | 240 | block_group->free_extents--; |
241 | block_group->free_space -= info->bytes; | ||
186 | } | 242 | } |
187 | 243 | ||
188 | static int link_free_space(struct btrfs_block_group_cache *block_group, | 244 | static int link_free_space(struct btrfs_block_group_cache *block_group, |
@@ -190,17 +246,353 @@ static int link_free_space(struct btrfs_block_group_cache *block_group, | |||
190 | { | 246 | { |
191 | int ret = 0; | 247 | int ret = 0; |
192 | 248 | ||
193 | 249 | BUG_ON(!info->bitmap && !info->bytes); | |
194 | BUG_ON(!info->bytes); | ||
195 | ret = tree_insert_offset(&block_group->free_space_offset, info->offset, | 250 | ret = tree_insert_offset(&block_group->free_space_offset, info->offset, |
196 | &info->offset_index); | 251 | &info->offset_index, (info->bitmap != NULL)); |
197 | if (ret) | 252 | if (ret) |
198 | return ret; | 253 | return ret; |
199 | 254 | ||
200 | ret = tree_insert_bytes(&block_group->free_space_bytes, info->bytes, | 255 | block_group->free_space += info->bytes; |
201 | &info->bytes_index); | 256 | block_group->free_extents++; |
202 | if (ret) | 257 | return ret; |
203 | return ret; | 258 | } |
259 | |||
260 | static void recalculate_thresholds(struct btrfs_block_group_cache *block_group) | ||
261 | { | ||
262 | u64 max_bytes, possible_bytes; | ||
263 | |||
264 | /* | ||
265 | * The goal is to keep the total amount of memory used per 1GB of space | ||
266 | * at or below 32k, so we need to adjust how much memory we allow to be | ||
267 | * used by extent-based free space tracking | ||
268 | */ | ||
269 | max_bytes = MAX_CACHE_BYTES_PER_GIG * | ||
270 | (div64_u64(block_group->key.offset, 1024 * 1024 * 1024)); | ||
271 | |||
272 | possible_bytes = (block_group->total_bitmaps * PAGE_CACHE_SIZE) + | ||
273 | (sizeof(struct btrfs_free_space) * | ||
274 | block_group->extents_thresh); | ||
275 | |||
276 | if (possible_bytes > max_bytes) { | ||
277 | int extent_bytes = max_bytes - | ||
278 | (block_group->total_bitmaps * PAGE_CACHE_SIZE); | ||
279 | |||
280 | if (extent_bytes <= 0) { | ||
281 | block_group->extents_thresh = 0; | ||
282 | return; | ||
283 | } | ||
284 | |||
285 | block_group->extents_thresh = extent_bytes / | ||
286 | (sizeof(struct btrfs_free_space)); | ||
287 | } | ||
288 | } | ||
289 | |||
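Plugging numbers into recalculate_thresholds(): a 1GiB block group gets max_bytes = 32k, and because div64_u64() truncates, the budget scales in whole-GiB steps. If four bitmap pages are in use at 4KiB each, bitmaps already consume 16k, leaving 16k for extent entries, i.e. roughly 256 of them at the 64-byte entry size assumed earlier:

/* assumed: 1GiB group, 4KiB pages, 4 bitmaps, 64-byte entries */
u64 max_bytes      = 32768;                     /* 32k * 1GiB     */
u64 bitmap_bytes   = 4 * 4096;                  /* 16k in bitmaps */
int extent_bytes   = max_bytes - bitmap_bytes;  /* 16k left over  */
int extents_thresh = extent_bytes / 64;         /* == 256 entries */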
290 | static void bitmap_clear_bits(struct btrfs_block_group_cache *block_group, | ||
291 | struct btrfs_free_space *info, u64 offset, | ||
292 | u64 bytes) | ||
293 | { | ||
294 | unsigned long start, end; | ||
295 | unsigned long i; | ||
296 | |||
297 | start = offset_to_bit(info->offset, block_group->sectorsize, offset); | ||
298 | end = start + bytes_to_bits(bytes, block_group->sectorsize); | ||
299 | BUG_ON(end > BITS_PER_BITMAP); | ||
300 | |||
301 | for (i = start; i < end; i++) | ||
302 | clear_bit(i, info->bitmap); | ||
303 | |||
304 | info->bytes -= bytes; | ||
305 | block_group->free_space -= bytes; | ||
306 | } | ||
307 | |||
308 | static void bitmap_set_bits(struct btrfs_block_group_cache *block_group, | ||
309 | struct btrfs_free_space *info, u64 offset, | ||
310 | u64 bytes) | ||
311 | { | ||
312 | unsigned long start, end; | ||
313 | unsigned long i; | ||
314 | |||
315 | start = offset_to_bit(info->offset, block_group->sectorsize, offset); | ||
316 | end = start + bytes_to_bits(bytes, block_group->sectorsize); | ||
317 | BUG_ON(end > BITS_PER_BITMAP); | ||
318 | |||
319 | for (i = start; i < end; i++) | ||
320 | set_bit(i, info->bitmap); | ||
321 | |||
322 | info->bytes += bytes; | ||
323 | block_group->free_space += bytes; | ||
324 | } | ||
325 | |||
326 | static int search_bitmap(struct btrfs_block_group_cache *block_group, | ||
327 | struct btrfs_free_space *bitmap_info, u64 *offset, | ||
328 | u64 *bytes) | ||
329 | { | ||
330 | unsigned long found_bits = 0; | ||
331 | unsigned long bits, i; | ||
332 | unsigned long next_zero; | ||
333 | |||
334 | i = offset_to_bit(bitmap_info->offset, block_group->sectorsize, | ||
335 | max_t(u64, *offset, bitmap_info->offset)); | ||
336 | bits = bytes_to_bits(*bytes, block_group->sectorsize); | ||
337 | |||
338 | for (i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i); | ||
339 | i < BITS_PER_BITMAP; | ||
340 | i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i + 1)) { | ||
341 | next_zero = find_next_zero_bit(bitmap_info->bitmap, | ||
342 | BITS_PER_BITMAP, i); | ||
343 | if ((next_zero - i) >= bits) { | ||
344 | found_bits = next_zero - i; | ||
345 | break; | ||
346 | } | ||
347 | i = next_zero; | ||
348 | } | ||
349 | |||
350 | if (found_bits) { | ||
351 | *offset = (u64)(i * block_group->sectorsize) + | ||
352 | bitmap_info->offset; | ||
353 | *bytes = (u64)(found_bits) * block_group->sectorsize; | ||
354 | return 0; | ||
355 | } | ||
356 | |||
357 | return -1; | ||
358 | } | ||
359 | |||
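search_bitmap() uses the standard bitops idiom for run-finding: find_next_bit() gives the start of a run of set bits, find_next_zero_bit() gives its end, and the difference is the run length. The same idiom in isolation, as a sketch rather than the btrfs function:

#include <linux/bitops.h>

/* return the start of the first run of at least 'bits' set bits,
 * or 'size' if no such run exists */
static unsigned long find_set_run(const unsigned long *map,
                                  unsigned long size, unsigned long bits)
{
        unsigned long i, next_zero;

        for (i = find_next_bit(map, size, 0);
             i < size;
             i = find_next_bit(map, size, next_zero)) {
                next_zero = find_next_zero_bit(map, size, i);
                if (next_zero - i >= bits)
                        return i;
        }
        return size;
}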
360 | static struct btrfs_free_space *find_free_space(struct btrfs_block_group_cache | ||
361 | *block_group, u64 *offset, | ||
362 | u64 *bytes, int debug) | ||
363 | { | ||
364 | struct btrfs_free_space *entry; | ||
365 | struct rb_node *node; | ||
366 | int ret; | ||
367 | |||
368 | if (!block_group->free_space_offset.rb_node) | ||
369 | return NULL; | ||
370 | |||
371 | entry = tree_search_offset(block_group, | ||
372 | offset_to_bitmap(block_group, *offset), | ||
373 | 0, 1); | ||
374 | if (!entry) | ||
375 | return NULL; | ||
376 | |||
377 | for (node = &entry->offset_index; node; node = rb_next(node)) { | ||
378 | entry = rb_entry(node, struct btrfs_free_space, offset_index); | ||
379 | if (entry->bytes < *bytes) | ||
380 | continue; | ||
381 | |||
382 | if (entry->bitmap) { | ||
383 | ret = search_bitmap(block_group, entry, offset, bytes); | ||
384 | if (!ret) | ||
385 | return entry; | ||
386 | continue; | ||
387 | } | ||
388 | |||
389 | *offset = entry->offset; | ||
390 | *bytes = entry->bytes; | ||
391 | return entry; | ||
392 | } | ||
393 | |||
394 | return NULL; | ||
395 | } | ||
396 | |||
397 | static void add_new_bitmap(struct btrfs_block_group_cache *block_group, | ||
398 | struct btrfs_free_space *info, u64 offset) | ||
399 | { | ||
400 | u64 bytes_per_bg = BITS_PER_BITMAP * block_group->sectorsize; | ||
401 | int max_bitmaps = (int)div64_u64(block_group->key.offset + | ||
402 | bytes_per_bg - 1, bytes_per_bg); | ||
403 | BUG_ON(block_group->total_bitmaps >= max_bitmaps); | ||
404 | |||
405 | info->offset = offset_to_bitmap(block_group, offset); | ||
406 | link_free_space(block_group, info); | ||
407 | block_group->total_bitmaps++; | ||
408 | |||
409 | recalculate_thresholds(block_group); | ||
410 | } | ||
411 | |||
412 | static noinline int remove_from_bitmap(struct btrfs_block_group_cache *block_group, | ||
413 | struct btrfs_free_space *bitmap_info, | ||
414 | u64 *offset, u64 *bytes) | ||
415 | { | ||
416 | u64 end; | ||
417 | u64 search_start, search_bytes; | ||
418 | int ret; | ||
419 | |||
420 | again: | ||
421 | end = bitmap_info->offset + | ||
422 | (u64)(BITS_PER_BITMAP * block_group->sectorsize) - 1; | ||
423 | |||
424 | /* | ||
425 | * XXX - this can go away after a few releases. | ||
426 | * | ||
427 | * since the only user of btrfs_remove_free_space is the tree logging | ||
428 | * stuff, and the only way to test that is under crash conditions, we | ||
429 | * want to have this debug stuff here just in case something's not | ||
430 | * working. Search the bitmap for the space we are trying to use to | ||
431 | * make sure it's actually there. If it's not there then we need to stop | ||
432 | * because something has gone wrong. | ||
433 | */ | ||
434 | search_start = *offset; | ||
435 | search_bytes = *bytes; | ||
436 | ret = search_bitmap(block_group, bitmap_info, &search_start, | ||
437 | &search_bytes); | ||
438 | BUG_ON(ret < 0 || search_start != *offset); | ||
439 | |||
440 | if (*offset > bitmap_info->offset && *offset + *bytes > end) { | ||
441 | bitmap_clear_bits(block_group, bitmap_info, *offset, | ||
442 | end - *offset + 1); | ||
443 | *bytes -= end - *offset + 1; | ||
444 | *offset = end + 1; | ||
445 | } else if (*offset >= bitmap_info->offset && *offset + *bytes <= end) { | ||
446 | bitmap_clear_bits(block_group, bitmap_info, *offset, *bytes); | ||
447 | *bytes = 0; | ||
448 | } | ||
449 | |||
450 | if (*bytes) { | ||
451 | struct rb_node *next = rb_next(&bitmap_info->offset_index); | ||
452 | if (!bitmap_info->bytes) { | ||
453 | unlink_free_space(block_group, bitmap_info); | ||
454 | kfree(bitmap_info->bitmap); | ||
455 | kfree(bitmap_info); | ||
456 | block_group->total_bitmaps--; | ||
457 | recalculate_thresholds(block_group); | ||
458 | } | ||
459 | |||
460 | /* | ||
461 | * no entry after this bitmap, but we still have bytes to | ||
462 | * remove, so something has gone wrong. | ||
463 | */ | ||
464 | if (!next) | ||
465 | return -EINVAL; | ||
466 | |||
467 | bitmap_info = rb_entry(next, struct btrfs_free_space, | ||
468 | offset_index); | ||
469 | |||
470 | /* | ||
471 | * if the next entry isn't a bitmap we need to return to let the | ||
472 | * extent stuff do its work. | ||
473 | */ | ||
474 | if (!bitmap_info->bitmap) | ||
475 | return -EAGAIN; | ||
476 | |||
477 | /* | ||
478 | * Ok the next item is a bitmap, but it may not actually hold | ||
479 | * the information for the rest of this free space stuff, so | ||
480 | * look for it, and if we don't find it return so we can try | ||
481 | * everything over again. | ||
482 | */ | ||
483 | search_start = *offset; | ||
484 | search_bytes = *bytes; | ||
485 | ret = search_bitmap(block_group, bitmap_info, &search_start, | ||
486 | &search_bytes); | ||
487 | if (ret < 0 || search_start != *offset) | ||
488 | return -EAGAIN; | ||
489 | |||
490 | goto again; | ||
491 | } else if (!bitmap_info->bytes) { | ||
492 | unlink_free_space(block_group, bitmap_info); | ||
493 | kfree(bitmap_info->bitmap); | ||
494 | kfree(bitmap_info); | ||
495 | block_group->total_bitmaps--; | ||
496 | recalculate_thresholds(block_group); | ||
497 | } | ||
498 | |||
499 | return 0; | ||
500 | } | ||
501 | |||
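The again: loop above peels a removal one bitmap at a time: whatever extends past the current bitmap's end is carried into the next iteration against the following rbtree entry, and a non-bitmap successor bounces the remainder back to the caller with -EAGAIN. Worked numbers, reusing the assumed 128MiB-per-bitmap coverage from earlier:

/* removing 16MiB starting at bg + 120MiB:
 *   pass 1: bitmap covers [bg, bg + 128MiB)
 *           clear [bg + 120MiB, bg + 128MiB), 8MiB still to remove
 *   pass 2: next entry must cover bg + 128MiB
 *           clear [bg + 128MiB, bg + 136MiB), bytes == 0, done
 * if the next entry had been an extent instead of a bitmap, the
 * function would return -EAGAIN and let the extent path finish up
 */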
502 | static int insert_into_bitmap(struct btrfs_block_group_cache *block_group, | ||
503 | struct btrfs_free_space *info) | ||
504 | { | ||
505 | struct btrfs_free_space *bitmap_info; | ||
506 | int added = 0; | ||
507 | u64 bytes, offset, end; | ||
508 | int ret; | ||
509 | |||
510 | /* | ||
511 | * If we are below the extents threshold then we can add this as an | ||
512 | * extent, and don't have to deal with the bitmap | ||
513 | */ | ||
514 | if (block_group->free_extents < block_group->extents_thresh && | ||
515 | info->bytes > block_group->sectorsize * 4) | ||
516 | return 0; | ||
517 | |||
518 | /* | ||
519 | * some block groups are so tiny they can't be enveloped by a bitmap, so | ||
520 | * don't even bother to create a bitmap for this | ||
521 | */ | ||
522 | if (BITS_PER_BITMAP * block_group->sectorsize > | ||
523 | block_group->key.offset) | ||
524 | return 0; | ||
525 | |||
526 | bytes = info->bytes; | ||
527 | offset = info->offset; | ||
528 | |||
529 | again: | ||
530 | bitmap_info = tree_search_offset(block_group, | ||
531 | offset_to_bitmap(block_group, offset), | ||
532 | 1, 0); | ||
533 | if (!bitmap_info) { | ||
534 | BUG_ON(added); | ||
535 | goto new_bitmap; | ||
536 | } | ||
537 | |||
538 | end = bitmap_info->offset + | ||
539 | (u64)(BITS_PER_BITMAP * block_group->sectorsize); | ||
540 | |||
541 | if (offset >= bitmap_info->offset && offset + bytes > end) { | ||
542 | bitmap_set_bits(block_group, bitmap_info, offset, | ||
543 | end - offset); | ||
544 | bytes -= end - offset; | ||
545 | offset = end; | ||
546 | added = 0; | ||
547 | } else if (offset >= bitmap_info->offset && offset + bytes <= end) { | ||
548 | bitmap_set_bits(block_group, bitmap_info, offset, bytes); | ||
549 | bytes = 0; | ||
550 | } else { | ||
551 | BUG(); | ||
552 | } | ||
553 | |||
554 | if (!bytes) { | ||
555 | ret = 1; | ||
556 | goto out; | ||
557 | } else | ||
558 | goto again; | ||
559 | |||
560 | new_bitmap: | ||
561 | if (info && info->bitmap) { | ||
562 | add_new_bitmap(block_group, info, offset); | ||
563 | added = 1; | ||
564 | info = NULL; | ||
565 | goto again; | ||
566 | } else { | ||
567 | spin_unlock(&block_group->tree_lock); | ||
568 | |||
569 | /* no pre-allocated info, allocate a new one */ | ||
570 | if (!info) { | ||
571 | info = kzalloc(sizeof(struct btrfs_free_space), | ||
572 | GFP_NOFS); | ||
573 | if (!info) { | ||
574 | spin_lock(&block_group->tree_lock); | ||
575 | ret = -ENOMEM; | ||
576 | goto out; | ||
577 | } | ||
578 | } | ||
579 | |||
580 | /* allocate the bitmap */ | ||
581 | info->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS); | ||
582 | spin_lock(&block_group->tree_lock); | ||
583 | if (!info->bitmap) { | ||
584 | ret = -ENOMEM; | ||
585 | goto out; | ||
586 | } | ||
587 | goto again; | ||
588 | } | ||
589 | |||
590 | out: | ||
591 | if (info) { | ||
592 | if (info->bitmap) | ||
593 | kfree(info->bitmap); | ||
594 | kfree(info); | ||
595 | } | ||
204 | 596 | ||
205 | return ret; | 597 | return ret; |
206 | } | 598 | } |
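insert_into_bitmap() keys every lookup by the start offset of the bitmap that would cover a given byte, via offset_to_bitmap(). A sketch of that rounding helper, under the assumption that bitmaps are laid out back to back from the block group's start (key.objectid):

	static u64 offset_to_bitmap(struct btrfs_block_group_cache *block_group,
				    u64 offset)
	{
		u64 bytes_per_bitmap = (u64)BITS_PER_BITMAP * block_group->sectorsize;
		u64 n;

		/* round down to a bitmap boundary relative to the group start */
		n = div64_u64(offset - block_group->key.objectid, bytes_per_bitmap);
		return block_group->key.objectid + n * bytes_per_bitmap;
	}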
@@ -208,8 +600,8 @@ static int link_free_space(struct btrfs_block_group_cache *block_group, | |||
208 | int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, | 600 | int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, |
209 | u64 offset, u64 bytes) | 601 | u64 offset, u64 bytes) |
210 | { | 602 | { |
211 | struct btrfs_free_space *right_info; | 603 | struct btrfs_free_space *right_info = NULL; |
212 | struct btrfs_free_space *left_info; | 604 | struct btrfs_free_space *left_info = NULL; |
213 | struct btrfs_free_space *info = NULL; | 605 | struct btrfs_free_space *info = NULL; |
214 | int ret = 0; | 606 | int ret = 0; |
215 | 607 | ||
@@ -227,18 +619,38 @@ int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, | |||
227 | * are adding, if there is remove that struct and add a new one to | 619 | * are adding, if there is remove that struct and add a new one to |
228 | * cover the entire range | 620 | * cover the entire range |
229 | */ | 621 | */ |
230 | right_info = tree_search_offset(&block_group->free_space_offset, | 622 | right_info = tree_search_offset(block_group, offset + bytes, 0, 0); |
231 | offset+bytes, 0, 0); | 623 | if (right_info && rb_prev(&right_info->offset_index)) |
232 | left_info = tree_search_offset(&block_group->free_space_offset, | 624 | left_info = rb_entry(rb_prev(&right_info->offset_index), |
233 | offset-1, 0, 1); | 625 | struct btrfs_free_space, offset_index); |
626 | else | ||
627 | left_info = tree_search_offset(block_group, offset - 1, 0, 0); | ||
628 | |||
629 | /* | ||
630 | * If there was no extent directly to the left or right of this new | ||
631 | * extent then we know we're going to have to allocate a new extent, so | ||
632 | * before we do that, see if we need to drop this into a bitmap | ||

633 | */ | ||
634 | if ((!left_info || left_info->bitmap) && | ||
635 | (!right_info || right_info->bitmap)) { | ||
636 | ret = insert_into_bitmap(block_group, info); | ||
637 | |||
638 | if (ret < 0) { | ||
639 | goto out; | ||
640 | } else if (ret) { | ||
641 | ret = 0; | ||
642 | goto out; | ||
643 | } | ||
644 | } | ||
234 | 645 | ||
235 | if (right_info) { | 646 | if (right_info && !right_info->bitmap) { |
236 | unlink_free_space(block_group, right_info); | 647 | unlink_free_space(block_group, right_info); |
237 | info->bytes += right_info->bytes; | 648 | info->bytes += right_info->bytes; |
238 | kfree(right_info); | 649 | kfree(right_info); |
239 | } | 650 | } |
240 | 651 | ||
241 | if (left_info && left_info->offset + left_info->bytes == offset) { | 652 | if (left_info && !left_info->bitmap && |
653 | left_info->offset + left_info->bytes == offset) { | ||
242 | unlink_free_space(block_group, left_info); | 654 | unlink_free_space(block_group, left_info); |
243 | info->offset = left_info->offset; | 655 | info->offset = left_info->offset; |
244 | info->bytes += left_info->bytes; | 656 | info->bytes += left_info->bytes; |
@@ -248,11 +660,11 @@ int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, | |||
248 | ret = link_free_space(block_group, info); | 660 | ret = link_free_space(block_group, info); |
249 | if (ret) | 661 | if (ret) |
250 | kfree(info); | 662 | kfree(info); |
251 | 663 | out: | |
252 | spin_unlock(&block_group->tree_lock); | 664 | spin_unlock(&block_group->tree_lock); |
253 | 665 | ||
254 | if (ret) { | 666 | if (ret) { |
255 | printk(KERN_ERR "btrfs: unable to add free space :%d\n", ret); | 667 | printk(KERN_CRIT "btrfs: unable to add free space :%d\n", ret); |
256 | BUG_ON(ret == -EEXIST); | 668 | BUG_ON(ret == -EEXIST); |
257 | } | 669 | } |
258 | 670 | ||
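The add path above stamps ranges into a bitmap with bitmap_set_bits(). A minimal sketch of what such a helper has to do; any accounting it updates beyond info->bytes is an assumption on my part:

	static void bitmap_set_bits(struct btrfs_block_group_cache *block_group,
				    struct btrfs_free_space *info,
				    u64 offset, u64 bytes)
	{
		unsigned long start, end, i;

		start = offset_to_bit(info->offset, block_group->sectorsize, offset);
		end = start + bytes_to_bits(bytes, block_group->sectorsize);
		BUG_ON(end > BITS_PER_BITMAP);

		for (i = start; i < end; i++)
			set_bit(i, info->bitmap);

		info->bytes += bytes;	/* keep the entry's byte count in sync */
	}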
@@ -263,40 +675,74 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, | |||
263 | u64 offset, u64 bytes) | 675 | u64 offset, u64 bytes) |
264 | { | 676 | { |
265 | struct btrfs_free_space *info; | 677 | struct btrfs_free_space *info; |
678 | struct btrfs_free_space *next_info = NULL; | ||
266 | int ret = 0; | 679 | int ret = 0; |
267 | 680 | ||
268 | spin_lock(&block_group->tree_lock); | 681 | spin_lock(&block_group->tree_lock); |
269 | 682 | ||
270 | info = tree_search_offset(&block_group->free_space_offset, offset, 0, | 683 | again: |
271 | 1); | 684 | info = tree_search_offset(block_group, offset, 0, 0); |
272 | if (info && info->offset == offset) { | 685 | if (!info) { |
273 | if (info->bytes < bytes) { | 686 | /* |
274 | printk(KERN_ERR "Found free space at %llu, size %llu," | 687 | * oops, didn't find an extent that matched the space we wanted |
275 | "trying to use %llu\n", | 688 | * to remove, look for a bitmap instead |
276 | (unsigned long long)info->offset, | 689 | */ |
277 | (unsigned long long)info->bytes, | 690 | info = tree_search_offset(block_group, |
278 | (unsigned long long)bytes); | 691 | offset_to_bitmap(block_group, offset), |
692 | 1, 0); | ||
693 | if (!info) { | ||
694 | WARN_ON(1); | ||
695 | goto out_lock; | ||
696 | } | ||
697 | } | ||
698 | |||
699 | if (info->bytes < bytes && rb_next(&info->offset_index)) { | ||
700 | u64 end; | ||
701 | next_info = rb_entry(rb_next(&info->offset_index), | ||
702 | struct btrfs_free_space, | ||
703 | offset_index); | ||
704 | |||
705 | if (next_info->bitmap) | ||
706 | end = next_info->offset + BITS_PER_BITMAP * | ||
707 | block_group->sectorsize - 1; | ||
708 | else | ||
709 | end = next_info->offset + next_info->bytes; | ||
710 | |||
711 | if (next_info->bytes < bytes || | ||
712 | next_info->offset > offset || offset > end) { | ||
713 | printk(KERN_CRIT "Found free space at %llu, size %llu," | ||
714 | " trying to use %llu\n", | ||
715 | (unsigned long long)info->offset, | ||
716 | (unsigned long long)info->bytes, | ||
717 | (unsigned long long)bytes); | ||
279 | WARN_ON(1); | 718 | WARN_ON(1); |
280 | ret = -EINVAL; | 719 | ret = -EINVAL; |
281 | spin_unlock(&block_group->tree_lock); | 720 | goto out_lock; |
282 | goto out; | ||
283 | } | 721 | } |
284 | unlink_free_space(block_group, info); | ||
285 | 722 | ||
286 | if (info->bytes == bytes) { | 723 | info = next_info; |
287 | kfree(info); | 724 | } |
288 | spin_unlock(&block_group->tree_lock); | 725 | |
289 | goto out; | 726 | if (info->bytes == bytes) { |
727 | unlink_free_space(block_group, info); | ||
728 | if (info->bitmap) { | ||
729 | kfree(info->bitmap); | ||
730 | block_group->total_bitmaps--; | ||
290 | } | 731 | } |
732 | kfree(info); | ||
733 | goto out_lock; | ||
734 | } | ||
291 | 735 | ||
736 | if (!info->bitmap && info->offset == offset) { | ||
737 | unlink_free_space(block_group, info); | ||
292 | info->offset += bytes; | 738 | info->offset += bytes; |
293 | info->bytes -= bytes; | 739 | info->bytes -= bytes; |
740 | link_free_space(block_group, info); | ||
741 | goto out_lock; | ||
742 | } | ||
294 | 743 | ||
295 | ret = link_free_space(block_group, info); | 744 | if (!info->bitmap && info->offset <= offset && |
296 | spin_unlock(&block_group->tree_lock); | 745 | info->offset + info->bytes >= offset + bytes) { |
297 | BUG_ON(ret); | ||
298 | } else if (info && info->offset < offset && | ||
299 | info->offset + info->bytes >= offset + bytes) { | ||
300 | u64 old_start = info->offset; | 746 | u64 old_start = info->offset; |
301 | /* | 747 | /* |
302 | * we're freeing space in the middle of the info, | 748 | * we're freeing space in the middle of the info, |
@@ -312,7 +758,9 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, | |||
312 | info->offset = offset + bytes; | 758 | info->offset = offset + bytes; |
313 | info->bytes = old_end - info->offset; | 759 | info->bytes = old_end - info->offset; |
314 | ret = link_free_space(block_group, info); | 760 | ret = link_free_space(block_group, info); |
315 | BUG_ON(ret); | 761 | WARN_ON(ret); |
762 | if (ret) | ||
763 | goto out_lock; | ||
316 | } else { | 764 | } else { |
317 | /* the hole we're creating ends at the end | 765 | /* the hole we're creating ends at the end |
318 | * of the info struct, just free the info | 766 | * of the info struct, just free the info |
@@ -320,32 +768,22 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, | |||
320 | kfree(info); | 768 | kfree(info); |
321 | } | 769 | } |
322 | spin_unlock(&block_group->tree_lock); | 770 | spin_unlock(&block_group->tree_lock); |
323 | /* step two, insert a new info struct to cover anything | 771 | |
324 | * before the hole | 772 | /* step two, insert a new info struct to cover |
773 | * anything before the hole | ||
325 | */ | 774 | */ |
326 | ret = btrfs_add_free_space(block_group, old_start, | 775 | ret = btrfs_add_free_space(block_group, old_start, |
327 | offset - old_start); | 776 | offset - old_start); |
328 | BUG_ON(ret); | 777 | WARN_ON(ret); |
329 | } else { | 778 | goto out; |
330 | spin_unlock(&block_group->tree_lock); | ||
331 | if (!info) { | ||
332 | printk(KERN_ERR "couldn't find space %llu to free\n", | ||
333 | (unsigned long long)offset); | ||
334 | printk(KERN_ERR "cached is %d, offset %llu bytes %llu\n", | ||
335 | block_group->cached, | ||
336 | (unsigned long long)block_group->key.objectid, | ||
337 | (unsigned long long)block_group->key.offset); | ||
338 | btrfs_dump_free_space(block_group, bytes); | ||
339 | } else if (info) { | ||
340 | printk(KERN_ERR "hmm, found offset=%llu bytes=%llu, " | ||
341 | "but wanted offset=%llu bytes=%llu\n", | ||
342 | (unsigned long long)info->offset, | ||
343 | (unsigned long long)info->bytes, | ||
344 | (unsigned long long)offset, | ||
345 | (unsigned long long)bytes); | ||
346 | } | ||
347 | WARN_ON(1); | ||
348 | } | 779 | } |
780 | |||
781 | ret = remove_from_bitmap(block_group, info, &offset, &bytes); | ||
782 | if (ret == -EAGAIN) | ||
783 | goto again; | ||
784 | BUG_ON(ret); | ||
785 | out_lock: | ||
786 | spin_unlock(&block_group->tree_lock); | ||
349 | out: | 787 | out: |
350 | return ret; | 788 | return ret; |
351 | } | 789 | } |
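The mid-extent branch above is easiest to follow with concrete numbers: freeing [offset, offset + bytes) out of [old_start, old_end) leaves a tail that is relinked in place and a head that is re-added once the lock is dropped. Restating the hunk with old_start = 0, info->bytes = 1M, offset = 256K, bytes = 128K:

	u64 old_start = info->offset;            /* 0  */
	u64 old_end   = old_start + info->bytes; /* 1M */

	info->offset = offset + bytes;           /* 384K: tail begins after the hole */
	info->bytes  = old_end - info->offset;   /* 640K: tail length                */
	ret = link_free_space(block_group, info);

	/* ... unlock, then re-add the 256K head [old_start, offset) */
	ret = btrfs_add_free_space(block_group, old_start, offset - old_start);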
@@ -361,10 +799,13 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, | |||
361 | info = rb_entry(n, struct btrfs_free_space, offset_index); | 799 | info = rb_entry(n, struct btrfs_free_space, offset_index); |
362 | if (info->bytes >= bytes) | 800 | if (info->bytes >= bytes) |
363 | count++; | 801 | count++; |
364 | printk(KERN_ERR "entry offset %llu, bytes %llu\n", | 802 | printk(KERN_CRIT "entry offset %llu, bytes %llu, bitmap %s\n", |
365 | (unsigned long long)info->offset, | 803 | (unsigned long long)info->offset, |
366 | (unsigned long long)info->bytes); | 804 | (unsigned long long)info->bytes, |
805 | (info->bitmap) ? "yes" : "no"); | ||
367 | } | 806 | } |
807 | printk(KERN_INFO "block group has cluster?: %s\n", | ||
808 | list_empty(&block_group->cluster_list) ? "no" : "yes"); | ||
368 | printk(KERN_INFO "%d blocks of free space at or bigger than 'bytes'" | 809 | printk(KERN_INFO "%d blocks of free space at or bigger than 'bytes'" |
369 | "\n", count); | 810 | "\n", count); |
370 | } | 811 | } |
@@ -397,26 +838,35 @@ __btrfs_return_cluster_to_free_space( | |||
397 | { | 838 | { |
398 | struct btrfs_free_space *entry; | 839 | struct btrfs_free_space *entry; |
399 | struct rb_node *node; | 840 | struct rb_node *node; |
841 | bool bitmap; | ||
400 | 842 | ||
401 | spin_lock(&cluster->lock); | 843 | spin_lock(&cluster->lock); |
402 | if (cluster->block_group != block_group) | 844 | if (cluster->block_group != block_group) |
403 | goto out; | 845 | goto out; |
404 | 846 | ||
847 | bitmap = cluster->points_to_bitmap; | ||
848 | cluster->block_group = NULL; | ||
405 | cluster->window_start = 0; | 849 | cluster->window_start = 0; |
850 | list_del_init(&cluster->block_group_list); | ||
851 | cluster->points_to_bitmap = false; | ||
852 | |||
853 | if (bitmap) | ||
854 | goto out; | ||
855 | |||
406 | node = rb_first(&cluster->root); | 856 | node = rb_first(&cluster->root); |
407 | while(node) { | 857 | while (node) { |
408 | entry = rb_entry(node, struct btrfs_free_space, offset_index); | 858 | entry = rb_entry(node, struct btrfs_free_space, offset_index); |
409 | node = rb_next(&entry->offset_index); | 859 | node = rb_next(&entry->offset_index); |
410 | rb_erase(&entry->offset_index, &cluster->root); | 860 | rb_erase(&entry->offset_index, &cluster->root); |
411 | link_free_space(block_group, entry); | 861 | BUG_ON(entry->bitmap); |
862 | tree_insert_offset(&block_group->free_space_offset, | ||
863 | entry->offset, &entry->offset_index, 0); | ||
412 | } | 864 | } |
413 | list_del_init(&cluster->block_group_list); | ||
414 | |||
415 | btrfs_put_block_group(cluster->block_group); | ||
416 | cluster->block_group = NULL; | ||
417 | cluster->root.rb_node = NULL; | 865 | cluster->root.rb_node = NULL; |
866 | |||
418 | out: | 867 | out: |
419 | spin_unlock(&cluster->lock); | 868 | spin_unlock(&cluster->lock); |
869 | btrfs_put_block_group(block_group); | ||
420 | return 0; | 870 | return 0; |
421 | } | 871 | } |
422 | 872 | ||
@@ -425,20 +875,28 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group) | |||
425 | struct btrfs_free_space *info; | 875 | struct btrfs_free_space *info; |
426 | struct rb_node *node; | 876 | struct rb_node *node; |
427 | struct btrfs_free_cluster *cluster; | 877 | struct btrfs_free_cluster *cluster; |
428 | struct btrfs_free_cluster *safe; | 878 | struct list_head *head; |
429 | 879 | ||
430 | spin_lock(&block_group->tree_lock); | 880 | spin_lock(&block_group->tree_lock); |
431 | 881 | while ((head = block_group->cluster_list.next) != | |
432 | list_for_each_entry_safe(cluster, safe, &block_group->cluster_list, | 882 | &block_group->cluster_list) { |
433 | block_group_list) { | 883 | cluster = list_entry(head, struct btrfs_free_cluster, |
884 | block_group_list); | ||
434 | 885 | ||
435 | WARN_ON(cluster->block_group != block_group); | 886 | WARN_ON(cluster->block_group != block_group); |
436 | __btrfs_return_cluster_to_free_space(block_group, cluster); | 887 | __btrfs_return_cluster_to_free_space(block_group, cluster); |
888 | if (need_resched()) { | ||
889 | spin_unlock(&block_group->tree_lock); | ||
890 | cond_resched(); | ||
891 | spin_lock(&block_group->tree_lock); | ||
892 | } | ||
437 | } | 893 | } |
438 | 894 | ||
439 | while ((node = rb_last(&block_group->free_space_bytes)) != NULL) { | 895 | while ((node = rb_last(&block_group->free_space_offset)) != NULL) { |
440 | info = rb_entry(node, struct btrfs_free_space, bytes_index); | 896 | info = rb_entry(node, struct btrfs_free_space, offset_index); |
441 | unlink_free_space(block_group, info); | 897 | unlink_free_space(block_group, info); |
898 | if (info->bitmap) | ||
899 | kfree(info->bitmap); | ||
442 | kfree(info); | 900 | kfree(info); |
443 | if (need_resched()) { | 901 | if (need_resched()) { |
444 | spin_unlock(&block_group->tree_lock); | 902 | spin_unlock(&block_group->tree_lock); |
@@ -446,6 +904,7 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group) | |||
446 | spin_lock(&block_group->tree_lock); | 904 | spin_lock(&block_group->tree_lock); |
447 | } | 905 | } |
448 | } | 906 | } |
907 | |||
449 | spin_unlock(&block_group->tree_lock); | 908 | spin_unlock(&block_group->tree_lock); |
450 | } | 909 | } |
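Note the cluster drain above re-reads the list head on every pass instead of using list_for_each_entry_safe(): tree_lock may be dropped inside the loop, so a cached next pointer could go stale. Progress is still guaranteed because __btrfs_return_cluster_to_free_space() unlinks the cluster via list_del_init(). The idiom in isolation, with the reasoning as comments:

	spin_lock(&block_group->tree_lock);
	while ((head = block_group->cluster_list.next) !=
	       &block_group->cluster_list) {
		cluster = list_entry(head, struct btrfs_free_cluster,
				     block_group_list);
		/* unlinks 'cluster', so the next iteration sees a new head */
		__btrfs_return_cluster_to_free_space(block_group, cluster);
		if (need_resched()) {
			/* safe only because nothing is cached across this gap */
			spin_unlock(&block_group->tree_lock);
			cond_resched();
			spin_lock(&block_group->tree_lock);
		}
	}
	spin_unlock(&block_group->tree_lock);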
451 | 910 | ||
@@ -453,25 +912,35 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group, | |||
453 | u64 offset, u64 bytes, u64 empty_size) | 912 | u64 offset, u64 bytes, u64 empty_size) |
454 | { | 913 | { |
455 | struct btrfs_free_space *entry = NULL; | 914 | struct btrfs_free_space *entry = NULL; |
915 | u64 bytes_search = bytes + empty_size; | ||
456 | u64 ret = 0; | 916 | u64 ret = 0; |
457 | 917 | ||
458 | spin_lock(&block_group->tree_lock); | 918 | spin_lock(&block_group->tree_lock); |
459 | entry = tree_search_offset(&block_group->free_space_offset, offset, | 919 | entry = find_free_space(block_group, &offset, &bytes_search, 0); |
460 | bytes + empty_size, 1); | ||
461 | if (!entry) | 920 | if (!entry) |
462 | entry = tree_search_bytes(&block_group->free_space_bytes, | 921 | goto out; |
463 | offset, bytes + empty_size); | 922 | |
464 | if (entry) { | 923 | ret = offset; |
924 | if (entry->bitmap) { | ||
925 | bitmap_clear_bits(block_group, entry, offset, bytes); | ||
926 | if (!entry->bytes) { | ||
927 | unlink_free_space(block_group, entry); | ||
928 | kfree(entry->bitmap); | ||
929 | kfree(entry); | ||
930 | block_group->total_bitmaps--; | ||
931 | recalculate_thresholds(block_group); | ||
932 | } | ||
933 | } else { | ||
465 | unlink_free_space(block_group, entry); | 934 | unlink_free_space(block_group, entry); |
466 | ret = entry->offset; | ||
467 | entry->offset += bytes; | 935 | entry->offset += bytes; |
468 | entry->bytes -= bytes; | 936 | entry->bytes -= bytes; |
469 | |||
470 | if (!entry->bytes) | 937 | if (!entry->bytes) |
471 | kfree(entry); | 938 | kfree(entry); |
472 | else | 939 | else |
473 | link_free_space(block_group, entry); | 940 | link_free_space(block_group, entry); |
474 | } | 941 | } |
942 | |||
943 | out: | ||
475 | spin_unlock(&block_group->tree_lock); | 944 | spin_unlock(&block_group->tree_lock); |
476 | 945 | ||
477 | return ret; | 946 | return ret; |
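btrfs_find_space_for_alloc() now funnels through find_free_space(), which understands both entry types. A hedged reconstruction of its contract from the call site above — the fourth parameter's name and the starting-point heuristic are assumptions: walk entries from the first plausible candidate onward; extents report their own offset and size, bitmaps defer to search_bitmap() (sketched below):

	static struct btrfs_free_space *
	find_free_space(struct btrfs_block_group_cache *block_group,
			u64 *offset, u64 *bytes, int debug /* name assumed */)
	{
		struct btrfs_free_space *entry;
		struct rb_node *node;

		/* fuzzy search from the bitmap that could cover *offset */
		entry = tree_search_offset(block_group,
					   offset_to_bitmap(block_group, *offset),
					   0, 1);
		if (!entry)
			return NULL;

		for (node = &entry->offset_index; node; node = rb_next(node)) {
			entry = rb_entry(node, struct btrfs_free_space,
					 offset_index);
			if (entry->bytes < *bytes)
				continue;

			if (entry->bitmap) {
				if (!search_bitmap(block_group, entry,
						   offset, bytes))
					return entry;
				continue;
			}

			*offset = entry->offset;
			*bytes = entry->bytes;
			return entry;
		}

		return NULL;
	}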
@@ -517,6 +986,54 @@ int btrfs_return_cluster_to_free_space( | |||
517 | return ret; | 986 | return ret; |
518 | } | 987 | } |
519 | 988 | ||
989 | static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group, | ||
990 | struct btrfs_free_cluster *cluster, | ||
991 | u64 bytes, u64 min_start) | ||
992 | { | ||
993 | struct btrfs_free_space *entry; | ||
994 | int err; | ||
995 | u64 search_start = cluster->window_start; | ||
996 | u64 search_bytes = bytes; | ||
997 | u64 ret = 0; | ||
998 | |||
999 | spin_lock(&block_group->tree_lock); | ||
1000 | spin_lock(&cluster->lock); | ||
1001 | |||
1002 | if (!cluster->points_to_bitmap) | ||
1003 | goto out; | ||
1004 | |||
1005 | if (cluster->block_group != block_group) | ||
1006 | goto out; | ||
1007 | |||
1008 | /* | ||
1009 | * search_start is the beginning of the bitmap, but at some point it may | ||
1010 | * be a good idea to point to the actual start of the free area in the | ||
1011 | * bitmap, so do the offset_to_bitmap trick anyway, and set bitmap_only | ||
1012 | * to 1 to make sure we get the bitmap entry | ||
1013 | */ | ||
1014 | entry = tree_search_offset(block_group, | ||
1015 | offset_to_bitmap(block_group, search_start), | ||
1016 | 1, 0); | ||
1017 | if (!entry || !entry->bitmap) | ||
1018 | goto out; | ||
1019 | |||
1020 | search_start = min_start; | ||
1021 | search_bytes = bytes; | ||
1022 | |||
1023 | err = search_bitmap(block_group, entry, &search_start, | ||
1024 | &search_bytes); | ||
1025 | if (err) | ||
1026 | goto out; | ||
1027 | |||
1028 | ret = search_start; | ||
1029 | bitmap_clear_bits(block_group, entry, ret, bytes); | ||
1030 | out: | ||
1031 | spin_unlock(&cluster->lock); | ||
1032 | spin_unlock(&block_group->tree_lock); | ||
1033 | |||
1034 | return ret; | ||
1035 | } | ||
1036 | |||
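search_bitmap(), used by remove_from_bitmap() and btrfs_alloc_from_bitmap() above, finds the first run of set bits long enough to hold *bytes at or after *offset and reports the hit through its pointer arguments; a nonzero return means no fit, which the remove path turns into -EAGAIN. A sketch built on the conversion helpers from earlier:

	static int search_bitmap(struct btrfs_block_group_cache *block_group,
				 struct btrfs_free_space *bitmap_info,
				 u64 *offset, u64 *bytes)
	{
		unsigned long found_bits = 0;
		unsigned long bits, i, next_zero;

		i = offset_to_bit(bitmap_info->offset, block_group->sectorsize,
				  max_t(u64, *offset, bitmap_info->offset));
		bits = bytes_to_bits(*bytes, block_group->sectorsize);

		for (i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i);
		     i < BITS_PER_BITMAP;
		     i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i + 1)) {
			next_zero = find_next_zero_bit(bitmap_info->bitmap,
						       BITS_PER_BITMAP, i);
			if (next_zero - i >= bits) {
				found_bits = next_zero - i;
				break;
			}
			/* run too short: skip past it before searching again */
			i = next_zero;
		}

		if (found_bits) {
			*offset = bitmap_info->offset +
				(u64)i * block_group->sectorsize;
			*bytes = (u64)found_bits * block_group->sectorsize;
			return 0;
		}

		return -1;
	}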
520 | /* | 1037 | /* |
521 | * given a cluster, try to allocate 'bytes' from it, returns 0 | 1038 | * given a cluster, try to allocate 'bytes' from it, returns 0 |
522 | * if it couldn't find anything suitably large, or a logical disk offset | 1039 | * if it couldn't find anything suitably large, or a logical disk offset |
@@ -530,6 +1047,10 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group, | |||
530 | struct rb_node *node; | 1047 | struct rb_node *node; |
531 | u64 ret = 0; | 1048 | u64 ret = 0; |
532 | 1049 | ||
1050 | if (cluster->points_to_bitmap) | ||
1051 | return btrfs_alloc_from_bitmap(block_group, cluster, bytes, | ||
1052 | min_start); | ||
1053 | |||
533 | spin_lock(&cluster->lock); | 1054 | spin_lock(&cluster->lock); |
534 | if (bytes > cluster->max_size) | 1055 | if (bytes > cluster->max_size) |
535 | goto out; | 1056 | goto out; |
@@ -567,9 +1088,73 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group, | |||
567 | } | 1088 | } |
568 | out: | 1089 | out: |
569 | spin_unlock(&cluster->lock); | 1090 | spin_unlock(&cluster->lock); |
1091 | |||
570 | return ret; | 1092 | return ret; |
571 | } | 1093 | } |
572 | 1094 | ||
1095 | static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group, | ||
1096 | struct btrfs_free_space *entry, | ||
1097 | struct btrfs_free_cluster *cluster, | ||
1098 | u64 offset, u64 bytes, u64 min_bytes) | ||
1099 | { | ||
1100 | unsigned long next_zero; | ||
1101 | unsigned long i; | ||
1102 | unsigned long search_bits; | ||
1103 | unsigned long total_bits; | ||
1104 | unsigned long found_bits; | ||
1105 | unsigned long start = 0; | ||
1106 | unsigned long total_found = 0; | ||
1107 | bool found = false; | ||
1108 | |||
1109 | i = offset_to_bit(entry->offset, block_group->sectorsize, | ||
1110 | max_t(u64, offset, entry->offset)); | ||
1111 | search_bits = bytes_to_bits(min_bytes, block_group->sectorsize); | ||
1112 | total_bits = bytes_to_bits(bytes, block_group->sectorsize); | ||
1113 | |||
1114 | again: | ||
1115 | found_bits = 0; | ||
1116 | for (i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i); | ||
1117 | i < BITS_PER_BITMAP; | ||
1118 | i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, i + 1)) { | ||
1119 | next_zero = find_next_zero_bit(entry->bitmap, | ||
1120 | BITS_PER_BITMAP, i); | ||
1121 | if (next_zero - i >= search_bits) { | ||
1122 | found_bits = next_zero - i; | ||
1123 | break; | ||
1124 | } | ||
1125 | i = next_zero; | ||
1126 | } | ||
1127 | |||
1128 | if (!found_bits) | ||
1129 | return -1; | ||
1130 | |||
1131 | if (!found) { | ||
1132 | start = i; | ||
1133 | found = true; | ||
1134 | } | ||
1135 | |||
1136 | total_found += found_bits; | ||
1137 | |||
1138 | if (cluster->max_size < found_bits * block_group->sectorsize) | ||
1139 | cluster->max_size = found_bits * block_group->sectorsize; | ||
1140 | |||
1141 | if (total_found < total_bits) { | ||
1142 | i = find_next_bit(entry->bitmap, BITS_PER_BITMAP, next_zero); | ||
1143 | if (i - start > total_bits * 2) { | ||
1144 | total_found = 0; | ||
1145 | cluster->max_size = 0; | ||
1146 | found = false; | ||
1147 | } | ||
1148 | goto again; | ||
1149 | } | ||
1150 | |||
1151 | cluster->window_start = start * block_group->sectorsize + | ||
1152 | entry->offset; | ||
1153 | cluster->points_to_bitmap = true; | ||
1154 | |||
1155 | return 0; | ||
1156 | } | ||
1157 | |||
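The allocation paths (btrfs_find_space_for_alloc(), btrfs_alloc_from_bitmap()) carve space back out with bitmap_clear_bits(), the mirror image of the set helper sketched earlier; as the hunks show, callers free the entry once info->bytes reaches zero:

	static void bitmap_clear_bits(struct btrfs_block_group_cache *block_group,
				      struct btrfs_free_space *info,
				      u64 offset, u64 bytes)
	{
		unsigned long start, end, i;

		start = offset_to_bit(info->offset, block_group->sectorsize, offset);
		end = start + bytes_to_bits(bytes, block_group->sectorsize);
		BUG_ON(end > BITS_PER_BITMAP);

		for (i = start; i < end; i++)
			clear_bit(i, info->bitmap);

		info->bytes -= bytes;
	}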
573 | /* | 1158 | /* |
574 | * here we try to find a cluster of blocks in a block group. The goal | 1159 | * here we try to find a cluster of blocks in a block group. The goal |
575 | * is to find at least bytes free and up to empty_size + bytes free. | 1160 | * is to find at least bytes free and up to empty_size + bytes free. |
@@ -587,12 +1172,12 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, | |||
587 | struct btrfs_free_space *entry = NULL; | 1172 | struct btrfs_free_space *entry = NULL; |
588 | struct rb_node *node; | 1173 | struct rb_node *node; |
589 | struct btrfs_free_space *next; | 1174 | struct btrfs_free_space *next; |
590 | struct btrfs_free_space *last; | 1175 | struct btrfs_free_space *last = NULL; |
591 | u64 min_bytes; | 1176 | u64 min_bytes; |
592 | u64 window_start; | 1177 | u64 window_start; |
593 | u64 window_free; | 1178 | u64 window_free; |
594 | u64 max_extent = 0; | 1179 | u64 max_extent = 0; |
595 | int total_retries = 0; | 1180 | bool found_bitmap = false; |
596 | int ret; | 1181 | int ret; |
597 | 1182 | ||
598 | /* for metadata, allow allocates with more holes */ | 1183 | /* for metadata, allow allocates with more holes */ |
@@ -620,31 +1205,80 @@ int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, | |||
620 | goto out; | 1205 | goto out; |
621 | } | 1206 | } |
622 | again: | 1207 | again: |
623 | min_bytes = min(min_bytes, bytes + empty_size); | 1208 | entry = tree_search_offset(block_group, offset, found_bitmap, 1); |
624 | entry = tree_search_bytes(&block_group->free_space_bytes, | ||
625 | offset, min_bytes); | ||
626 | if (!entry) { | 1209 | if (!entry) { |
627 | ret = -ENOSPC; | 1210 | ret = -ENOSPC; |
628 | goto out; | 1211 | goto out; |
629 | } | 1212 | } |
1213 | |||
1214 | /* | ||
1215 | * If found_bitmap is true, we exhausted our search for extent entries, | ||
1216 | * and we just want to search all of the bitmaps that we can find, and | ||
1217 | * ignore any extent entries we find. | ||
1218 | */ | ||
1219 | while (entry->bitmap || found_bitmap || | ||
1220 | (!entry->bitmap && entry->bytes < min_bytes)) { | ||
1221 | struct rb_node *node = rb_next(&entry->offset_index); | ||
1222 | |||
1223 | if (entry->bitmap && entry->bytes > bytes + empty_size) { | ||
1224 | ret = btrfs_bitmap_cluster(block_group, entry, cluster, | ||
1225 | offset, bytes + empty_size, | ||
1226 | min_bytes); | ||
1227 | if (!ret) | ||
1228 | goto got_it; | ||
1229 | } | ||
1230 | |||
1231 | if (!node) { | ||
1232 | ret = -ENOSPC; | ||
1233 | goto out; | ||
1234 | } | ||
1235 | entry = rb_entry(node, struct btrfs_free_space, offset_index); | ||
1236 | } | ||
1237 | |||
1238 | /* | ||
1239 | * We already searched all the extent entries from the passed in offset | ||
1240 | * to the end and didn't find enough space for the cluster, and we also | ||
1241 | * didn't find any bitmaps that met our criteria, just go ahead and exit | ||
1242 | */ | ||
1243 | if (found_bitmap) { | ||
1244 | ret = -ENOSPC; | ||
1245 | goto out; | ||
1246 | } | ||
1247 | |||
1248 | cluster->points_to_bitmap = false; | ||
630 | window_start = entry->offset; | 1249 | window_start = entry->offset; |
631 | window_free = entry->bytes; | 1250 | window_free = entry->bytes; |
632 | last = entry; | 1251 | last = entry; |
633 | max_extent = entry->bytes; | 1252 | max_extent = entry->bytes; |
634 | 1253 | ||
635 | while(1) { | 1254 | while (1) { |
637 | /* our window is just right, let's fill it */ | 1256 | /* our window is just right, let's fill it */
637 | if (window_free >= bytes + empty_size) | 1256 | if (window_free >= bytes + empty_size) |
638 | break; | 1257 | break; |
639 | 1258 | ||
640 | node = rb_next(&last->offset_index); | 1259 | node = rb_next(&last->offset_index); |
641 | if (!node) { | 1260 | if (!node) { |
1261 | if (found_bitmap) | ||
1262 | goto again; | ||
642 | ret = -ENOSPC; | 1263 | ret = -ENOSPC; |
643 | goto out; | 1264 | goto out; |
644 | } | 1265 | } |
645 | next = rb_entry(node, struct btrfs_free_space, offset_index); | 1266 | next = rb_entry(node, struct btrfs_free_space, offset_index); |
646 | 1267 | ||
647 | /* | 1268 | /* |
1269 | * we found a bitmap, so if this search doesn't result in a | ||
1270 | * cluster, we know to go and search again for the bitmaps and | ||
1271 | * start looking for space there | ||
1272 | */ | ||
1273 | if (next->bitmap) { | ||
1274 | if (!found_bitmap) | ||
1275 | offset = next->offset; | ||
1276 | found_bitmap = true; | ||
1277 | last = next; | ||
1278 | continue; | ||
1279 | } | ||
1280 | |||
1281 | /* | ||
648 | * we haven't filled the empty size and the window is | 1282 | * we haven't filled the empty size and the window is |
649 | * very large. reset and try again | 1283 | * very large. reset and try again |
650 | */ | 1284 | */ |
@@ -655,19 +1289,6 @@ again: | |||
655 | window_free = entry->bytes; | 1289 | window_free = entry->bytes; |
656 | last = entry; | 1290 | last = entry; |
657 | max_extent = 0; | 1291 | max_extent = 0; |
658 | total_retries++; | ||
659 | if (total_retries % 64 == 0) { | ||
660 | if (min_bytes >= (bytes + empty_size)) { | ||
661 | ret = -ENOSPC; | ||
662 | goto out; | ||
663 | } | ||
664 | /* | ||
665 | * grow our allocation a bit, we're not having | ||
666 | * much luck | ||
667 | */ | ||
668 | min_bytes *= 2; | ||
669 | goto again; | ||
670 | } | ||
671 | } else { | 1292 | } else { |
672 | last = next; | 1293 | last = next; |
673 | window_free += next->bytes; | 1294 | window_free += next->bytes; |
@@ -685,11 +1306,19 @@ again: | |||
685 | * The cluster includes an rbtree, but only uses the offset index | 1306 | * The cluster includes an rbtree, but only uses the offset index |
686 | * of each free space cache entry. | 1307 | * of each free space cache entry. |
687 | */ | 1308 | */ |
688 | while(1) { | 1309 | while (1) { |
689 | node = rb_next(&entry->offset_index); | 1310 | node = rb_next(&entry->offset_index); |
690 | unlink_free_space(block_group, entry); | 1311 | if (entry->bitmap && node) { |
1312 | entry = rb_entry(node, struct btrfs_free_space, | ||
1313 | offset_index); | ||
1314 | continue; | ||
1315 | } else if (entry->bitmap && !node) { | ||
1316 | break; | ||
1317 | } | ||
1318 | |||
1319 | rb_erase(&entry->offset_index, &block_group->free_space_offset); | ||
691 | ret = tree_insert_offset(&cluster->root, entry->offset, | 1320 | ret = tree_insert_offset(&cluster->root, entry->offset, |
692 | &entry->offset_index); | 1321 | &entry->offset_index, 0); |
693 | BUG_ON(ret); | 1322 | BUG_ON(ret); |
694 | 1323 | ||
695 | if (!node || entry == last) | 1324 | if (!node || entry == last) |
@@ -697,8 +1326,10 @@ again: | |||
697 | 1326 | ||
698 | entry = rb_entry(node, struct btrfs_free_space, offset_index); | 1327 | entry = rb_entry(node, struct btrfs_free_space, offset_index); |
699 | } | 1328 | } |
700 | ret = 0; | 1329 | |
701 | cluster->max_size = max_extent; | 1330 | cluster->max_size = max_extent; |
1331 | got_it: | ||
1332 | ret = 0; | ||
702 | atomic_inc(&block_group->count); | 1333 | atomic_inc(&block_group->count); |
703 | list_add_tail(&cluster->block_group_list, &block_group->cluster_list); | 1334 | list_add_tail(&cluster->block_group_list, &block_group->cluster_list); |
704 | cluster->block_group = block_group; | 1335 | cluster->block_group = block_group; |
@@ -718,6 +1349,7 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster) | |||
718 | spin_lock_init(&cluster->refill_lock); | 1349 | spin_lock_init(&cluster->refill_lock); |
719 | cluster->root.rb_node = NULL; | 1350 | cluster->root.rb_node = NULL; |
720 | cluster->max_size = 0; | 1351 | cluster->max_size = 0; |
1352 | cluster->points_to_bitmap = false; | ||
721 | INIT_LIST_HEAD(&cluster->block_group_list); | 1353 | INIT_LIST_HEAD(&cluster->block_group_list); |
722 | cluster->block_group = NULL; | 1354 | cluster->block_group = NULL; |
723 | } | 1355 | } |
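Putting the cluster API together, a caller's round-trip looks roughly like the following; trans, root, block_group and the size parameters are assumed to be in scope, and error handling is elided:

	struct btrfs_free_cluster cluster;
	u64 start;

	btrfs_init_free_cluster(&cluster);

	/* gather at least 'bytes' (ideally bytes + empty_size) of space */
	if (!btrfs_find_space_cluster(trans, root, block_group, &cluster,
				      offset, bytes, empty_size)) {
		/* works for both extent-backed clusters and the new
		 * bitmap-backed (points_to_bitmap) ones */
		start = btrfs_alloc_from_cluster(block_group, &cluster,
						 bytes, min_start);
	}

	/* hand anything unused back to the block group's free space tree */
	btrfs_return_cluster_to_free_space(block_group, &cluster);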
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index 266fb8764054..890a8e79011b 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h | |||
@@ -19,6 +19,14 @@ | |||
19 | #ifndef __BTRFS_FREE_SPACE_CACHE | 19 | #ifndef __BTRFS_FREE_SPACE_CACHE |
20 | #define __BTRFS_FREE_SPACE_CACHE | 20 | #define __BTRFS_FREE_SPACE_CACHE |
21 | 21 | ||
22 | struct btrfs_free_space { | ||
23 | struct rb_node offset_index; | ||
24 | u64 offset; | ||
25 | u64 bytes; | ||
26 | unsigned long *bitmap; | ||
27 | struct list_head list; | ||
28 | }; | ||
29 | |||
22 | int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, | 30 | int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, |
23 | u64 bytenr, u64 size); | 31 | u64 bytenr, u64 size); |
24 | int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, | 32 | int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, |
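One convention worth spelling out about the struct made public here: the bitmap pointer doubles as the type tag, NULL meaning a plain extent entry, which is what every entry->bitmap test in the .c file relies on. A hypothetical helper (not in the patch) that names the convention:

	static inline bool entry_is_bitmap(const struct btrfs_free_space *entry)
	{
		return entry->bitmap != NULL;	/* NULL => extent entry */
	}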
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 791eab19e330..59cba180fe83 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c | |||
@@ -2603,8 +2603,8 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, | |||
2603 | if (root->ref_cows) | 2603 | if (root->ref_cows) |
2604 | btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0); | 2604 | btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0); |
2605 | path = btrfs_alloc_path(); | 2605 | path = btrfs_alloc_path(); |
2606 | path->reada = -1; | ||
2607 | BUG_ON(!path); | 2606 | BUG_ON(!path); |
2607 | path->reada = -1; | ||
2608 | 2608 | ||
2609 | /* FIXME, add redo link to tree so we don't leak on crash */ | 2609 | /* FIXME, add redo link to tree so we don't leak on crash */ |
2610 | key.objectid = inode->i_ino; | 2610 | key.objectid = inode->i_ino; |
@@ -3099,8 +3099,12 @@ static void inode_tree_add(struct inode *inode) | |||
3099 | { | 3099 | { |
3100 | struct btrfs_root *root = BTRFS_I(inode)->root; | 3100 | struct btrfs_root *root = BTRFS_I(inode)->root; |
3101 | struct btrfs_inode *entry; | 3101 | struct btrfs_inode *entry; |
3102 | struct rb_node **p = &root->inode_tree.rb_node; | 3102 | struct rb_node **p; |
3103 | struct rb_node *parent = NULL; | 3103 | struct rb_node *parent; |
3104 | |||
3105 | again: | ||
3106 | p = &root->inode_tree.rb_node; | ||
3107 | parent = NULL; | ||
3104 | 3108 | ||
3105 | spin_lock(&root->inode_lock); | 3109 | spin_lock(&root->inode_lock); |
3106 | while (*p) { | 3110 | while (*p) { |
@@ -3108,13 +3112,16 @@ static void inode_tree_add(struct inode *inode) | |||
3108 | entry = rb_entry(parent, struct btrfs_inode, rb_node); | 3112 | entry = rb_entry(parent, struct btrfs_inode, rb_node); |
3109 | 3113 | ||
3110 | if (inode->i_ino < entry->vfs_inode.i_ino) | 3114 | if (inode->i_ino < entry->vfs_inode.i_ino) |
3111 | p = &(*p)->rb_left; | 3115 | p = &parent->rb_left; |
3112 | else if (inode->i_ino > entry->vfs_inode.i_ino) | 3116 | else if (inode->i_ino > entry->vfs_inode.i_ino) |
3113 | p = &(*p)->rb_right; | 3117 | p = &parent->rb_right; |
3114 | else { | 3118 | else { |
3115 | WARN_ON(!(entry->vfs_inode.i_state & | 3119 | WARN_ON(!(entry->vfs_inode.i_state & |
3116 | (I_WILL_FREE | I_FREEING | I_CLEAR))); | 3120 | (I_WILL_FREE | I_FREEING | I_CLEAR))); |
3117 | break; | 3121 | rb_erase(parent, &root->inode_tree); |
3122 | RB_CLEAR_NODE(parent); | ||
3123 | spin_unlock(&root->inode_lock); | ||
3124 | goto again; | ||
3118 | } | 3125 | } |
3119 | } | 3126 | } |
3120 | rb_link_node(&BTRFS_I(inode)->rb_node, parent, p); | 3127 | rb_link_node(&BTRFS_I(inode)->rb_node, parent, p); |
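The inode_tree_add() change above replaces a silent break on collision with evict-and-retry: an inode headed for destruction can still occupy the same i_ino slot, so the stale node is erased and the descent restarts from the root. Restarting is required rather than a convenience, because rb_erase() rebalances the tree and invalidates the saved parent and link pointer. The shape of the idiom, condensed from the hunk:

	again:
		p = &root->inode_tree.rb_node;
		parent = NULL;
		spin_lock(&root->inode_lock);
		while (*p) {
			parent = *p;
			entry = rb_entry(parent, struct btrfs_inode, rb_node);
			if (inode->i_ino < entry->vfs_inode.i_ino)
				p = &parent->rb_left;
			else if (inode->i_ino > entry->vfs_inode.i_ino)
				p = &parent->rb_right;
			else {
				/* dying inode still in the tree: evict it and
				 * restart, since rb_erase() rebalancing has
				 * invalidated 'parent' and 'p' */
				rb_erase(parent, &root->inode_tree);
				RB_CLEAR_NODE(parent);
				spin_unlock(&root->inode_lock);
				goto again;
			}
		}
		rb_link_node(&BTRFS_I(inode)->rb_node, parent, p);
		rb_insert_color(&BTRFS_I(inode)->rb_node, &root->inode_tree);
		spin_unlock(&root->inode_lock);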
@@ -3126,12 +3133,12 @@ static void inode_tree_del(struct inode *inode) | |||
3126 | { | 3133 | { |
3127 | struct btrfs_root *root = BTRFS_I(inode)->root; | 3134 | struct btrfs_root *root = BTRFS_I(inode)->root; |
3128 | 3135 | ||
3136 | spin_lock(&root->inode_lock); | ||
3129 | if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) { | 3137 | if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) { |
3130 | spin_lock(&root->inode_lock); | ||
3131 | rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree); | 3138 | rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree); |
3132 | spin_unlock(&root->inode_lock); | ||
3133 | RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); | 3139 | RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); |
3134 | } | 3140 | } |
3141 | spin_unlock(&root->inode_lock); | ||
3135 | } | 3142 | } |
3136 | 3143 | ||
3137 | static noinline void init_btrfs_i(struct inode *inode) | 3144 | static noinline void init_btrfs_i(struct inode *inode) |
@@ -4785,8 +4792,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, | |||
4785 | * and the replacement file is large. Start IO on it now so | 4792 | * and the replacement file is large. Start IO on it now so |
4786 | * we don't add too much work to the end of the transaction | 4793 | * we don't add too much work to the end of the transaction |
4787 | */ | 4794 | */ |
4788 | if (new_inode && old_inode && S_ISREG(old_inode->i_mode) && | 4795 | if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size && |
4789 | new_inode->i_size && | ||
4790 | old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT) | 4796 | old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT) |
4791 | filemap_flush(old_inode->i_mapping); | 4797 | filemap_flush(old_inode->i_mapping); |
4792 | 4798 | ||
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index d6f0806c682f..7b2f401e604e 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c | |||
@@ -740,7 +740,6 @@ int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start, | |||
740 | .nr_to_write = mapping->nrpages * 2, | 740 | .nr_to_write = mapping->nrpages * 2, |
741 | .range_start = start, | 741 | .range_start = start, |
742 | .range_end = end, | 742 | .range_end = end, |
743 | .for_writepages = 1, | ||
744 | }; | 743 | }; |
745 | return btrfs_writepages(mapping, &wbc); | 744 | return btrfs_writepages(mapping, &wbc); |
746 | } | 745 | } |
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 6d6523da0a30..0d126be22b63 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c | |||
@@ -309,7 +309,7 @@ void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c) | |||
309 | } | 309 | } |
310 | printk(KERN_INFO "node %llu level %d total ptrs %d free spc %u\n", | 310 | printk(KERN_INFO "node %llu level %d total ptrs %d free spc %u\n", |
311 | (unsigned long long)btrfs_header_bytenr(c), | 311 | (unsigned long long)btrfs_header_bytenr(c), |
312 | btrfs_header_level(c), nr, | 312 | level, nr, |
313 | (u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr); | 313 | (u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr); |
314 | for (i = 0; i < nr; i++) { | 314 | for (i = 0; i < nr; i++) { |
315 | btrfs_node_key_to_cpu(c, &key, i); | 315 | btrfs_node_key_to_cpu(c, &key, i); |
@@ -326,10 +326,10 @@ void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c) | |||
326 | btrfs_level_size(root, level - 1), | 326 | btrfs_level_size(root, level - 1), |
327 | btrfs_node_ptr_generation(c, i)); | 327 | btrfs_node_ptr_generation(c, i)); |
328 | if (btrfs_is_leaf(next) && | 328 | if (btrfs_is_leaf(next) && |
329 | btrfs_header_level(c) != 1) | 329 | level != 1) |
330 | BUG(); | 330 | BUG(); |
331 | if (btrfs_header_level(next) != | 331 | if (btrfs_header_level(next) != |
332 | btrfs_header_level(c) - 1) | 332 | level - 1) |
333 | BUG(); | 333 | BUG(); |
334 | btrfs_print_tree(root, next); | 334 | btrfs_print_tree(root, next); |
335 | free_extent_buffer(next); | 335 | free_extent_buffer(next); |
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 008397934778..c04f7f212602 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c | |||
@@ -670,6 +670,8 @@ again: | |||
670 | err = ret; | 670 | err = ret; |
671 | goto out; | 671 | goto out; |
672 | } | 672 | } |
673 | if (ret > 0 && path2->slots[level] > 0) | ||
674 | path2->slots[level]--; | ||
673 | 675 | ||
674 | eb = path2->nodes[level]; | 676 | eb = path2->nodes[level]; |
675 | WARN_ON(btrfs_node_blockptr(eb, path2->slots[level]) != | 677 | WARN_ON(btrfs_node_blockptr(eb, path2->slots[level]) != |
@@ -1609,6 +1611,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, | |||
1609 | BUG_ON(level == 0); | 1611 | BUG_ON(level == 0); |
1610 | path->lowest_level = level; | 1612 | path->lowest_level = level; |
1611 | ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0); | 1613 | ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0); |
1614 | path->lowest_level = 0; | ||
1612 | if (ret < 0) { | 1615 | if (ret < 0) { |
1613 | btrfs_free_path(path); | 1616 | btrfs_free_path(path); |
1614 | return ret; | 1617 | return ret; |
@@ -2550,8 +2553,13 @@ int relocate_inode_pages(struct inode *inode, u64 start, u64 len) | |||
2550 | last_index = (start + len - 1) >> PAGE_CACHE_SHIFT; | 2553 | last_index = (start + len - 1) >> PAGE_CACHE_SHIFT; |
2551 | 2554 | ||
2552 | /* make sure the dirty trick played by the caller work */ | 2555 | /* make sure the dirty trick played by the caller work */ |
2553 | ret = invalidate_inode_pages2_range(inode->i_mapping, | 2556 | while (1) { |
2554 | first_index, last_index); | 2557 | ret = invalidate_inode_pages2_range(inode->i_mapping, |
2558 | first_index, last_index); | ||
2559 | if (ret != -EBUSY) | ||
2560 | break; | ||
2561 | schedule_timeout(HZ/10); | ||
2562 | } | ||
2555 | if (ret) | 2563 | if (ret) |
2556 | goto out_unlock; | 2564 | goto out_unlock; |
2557 | 2565 | ||
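One caveat on the retry loop above: schedule_timeout() only sleeps if the task state was changed away from TASK_RUNNING beforehand, so as written the HZ/10 amounts to little more than a yield between -EBUSY retries. If an actual delay is intended, the conventional spellings are:

	while (1) {
		ret = invalidate_inode_pages2_range(inode->i_mapping,
						    first_index, last_index);
		if (ret != -EBUSY)
			break;
		/* either set the state explicitly ... */
		set_current_state(TASK_UNINTERRUPTIBLE);
		schedule_timeout(HZ / 10);
		/* ... or simply: msleep(100); */
	}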
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 2dbf1c1f56ee..cdbb5022da52 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c | |||
@@ -40,6 +40,12 @@ static noinline void put_transaction(struct btrfs_transaction *transaction) | |||
40 | } | 40 | } |
41 | } | 41 | } |
42 | 42 | ||
43 | static noinline void switch_commit_root(struct btrfs_root *root) | ||
44 | { | ||
45 | free_extent_buffer(root->commit_root); | ||
46 | root->commit_root = btrfs_root_node(root); | ||
47 | } | ||
48 | |||
43 | /* | 49 | /* |
44 | * either allocate a new transaction or hop into the existing one | 50 | * either allocate a new transaction or hop into the existing one |
45 | */ | 51 | */ |
@@ -444,9 +450,6 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans, | |||
444 | 450 | ||
445 | btrfs_write_dirty_block_groups(trans, root); | 451 | btrfs_write_dirty_block_groups(trans, root); |
446 | 452 | ||
447 | ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); | ||
448 | BUG_ON(ret); | ||
449 | |||
450 | while (1) { | 453 | while (1) { |
451 | old_root_bytenr = btrfs_root_bytenr(&root->root_item); | 454 | old_root_bytenr = btrfs_root_bytenr(&root->root_item); |
452 | if (old_root_bytenr == root->node->start) | 455 | if (old_root_bytenr == root->node->start) |
@@ -457,13 +460,14 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans, | |||
457 | &root->root_key, | 460 | &root->root_key, |
458 | &root->root_item); | 461 | &root->root_item); |
459 | BUG_ON(ret); | 462 | BUG_ON(ret); |
460 | btrfs_write_dirty_block_groups(trans, root); | ||
461 | 463 | ||
462 | ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); | 464 | ret = btrfs_write_dirty_block_groups(trans, root); |
463 | BUG_ON(ret); | 465 | BUG_ON(ret); |
464 | } | 466 | } |
465 | free_extent_buffer(root->commit_root); | 467 | |
466 | root->commit_root = btrfs_root_node(root); | 468 | if (root != root->fs_info->extent_root) |
469 | switch_commit_root(root); | ||
470 | |||
467 | return 0; | 471 | return 0; |
468 | } | 472 | } |
469 | 473 | ||
@@ -495,10 +499,12 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans, | |||
495 | root = list_entry(next, struct btrfs_root, dirty_list); | 499 | root = list_entry(next, struct btrfs_root, dirty_list); |
496 | 500 | ||
497 | update_cowonly_root(trans, root); | 501 | update_cowonly_root(trans, root); |
498 | |||
499 | ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); | ||
500 | BUG_ON(ret); | ||
501 | } | 502 | } |
503 | |||
504 | down_write(&fs_info->extent_commit_sem); | ||
505 | switch_commit_root(fs_info->extent_root); | ||
506 | up_write(&fs_info->extent_commit_sem); | ||
507 | |||
502 | return 0; | 508 | return 0; |
503 | } | 509 | } |
504 | 510 | ||
@@ -544,8 +550,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans, | |||
544 | btrfs_update_reloc_root(trans, root); | 550 | btrfs_update_reloc_root(trans, root); |
545 | 551 | ||
546 | if (root->commit_root != root->node) { | 552 | if (root->commit_root != root->node) { |
547 | free_extent_buffer(root->commit_root); | 553 | switch_commit_root(root); |
548 | root->commit_root = btrfs_root_node(root); | ||
549 | btrfs_set_root_node(&root->root_item, | 554 | btrfs_set_root_node(&root->root_item, |
550 | root->node); | 555 | root->node); |
551 | } | 556 | } |
@@ -852,6 +857,16 @@ static void update_super_roots(struct btrfs_root *root) | |||
852 | super->root_level = root_item->level; | 857 | super->root_level = root_item->level; |
853 | } | 858 | } |
854 | 859 | ||
860 | int btrfs_transaction_in_commit(struct btrfs_fs_info *info) | ||
861 | { | ||
862 | int ret = 0; | ||
863 | spin_lock(&info->new_trans_lock); | ||
864 | if (info->running_transaction) | ||
865 | ret = info->running_transaction->in_commit; | ||
866 | spin_unlock(&info->new_trans_lock); | ||
867 | return ret; | ||
868 | } | ||
869 | |||
855 | int btrfs_commit_transaction(struct btrfs_trans_handle *trans, | 870 | int btrfs_commit_transaction(struct btrfs_trans_handle *trans, |
856 | struct btrfs_root *root) | 871 | struct btrfs_root *root) |
857 | { | 872 | { |
@@ -943,9 +958,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, | |||
943 | 958 | ||
944 | mutex_unlock(&root->fs_info->trans_mutex); | 959 | mutex_unlock(&root->fs_info->trans_mutex); |
945 | 960 | ||
946 | if (flush_on_commit || snap_pending) { | 961 | if (flush_on_commit) { |
947 | if (flush_on_commit) | 962 | btrfs_start_delalloc_inodes(root); |
948 | btrfs_start_delalloc_inodes(root); | 963 | ret = btrfs_wait_ordered_extents(root, 0); |
964 | BUG_ON(ret); | ||
965 | } else if (snap_pending) { | ||
949 | ret = btrfs_wait_ordered_extents(root, 1); | 966 | ret = btrfs_wait_ordered_extents(root, 1); |
950 | BUG_ON(ret); | 967 | BUG_ON(ret); |
951 | } | 968 | } |
@@ -1009,15 +1026,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, | |||
1009 | 1026 | ||
1010 | btrfs_set_root_node(&root->fs_info->tree_root->root_item, | 1027 | btrfs_set_root_node(&root->fs_info->tree_root->root_item, |
1011 | root->fs_info->tree_root->node); | 1028 | root->fs_info->tree_root->node); |
1012 | free_extent_buffer(root->fs_info->tree_root->commit_root); | 1029 | switch_commit_root(root->fs_info->tree_root); |
1013 | root->fs_info->tree_root->commit_root = | ||
1014 | btrfs_root_node(root->fs_info->tree_root); | ||
1015 | 1030 | ||
1016 | btrfs_set_root_node(&root->fs_info->chunk_root->root_item, | 1031 | btrfs_set_root_node(&root->fs_info->chunk_root->root_item, |
1017 | root->fs_info->chunk_root->node); | 1032 | root->fs_info->chunk_root->node); |
1018 | free_extent_buffer(root->fs_info->chunk_root->commit_root); | 1033 | switch_commit_root(root->fs_info->chunk_root); |
1019 | root->fs_info->chunk_root->commit_root = | ||
1020 | btrfs_root_node(root->fs_info->chunk_root); | ||
1021 | 1034 | ||
1022 | update_super_roots(root); | 1035 | update_super_roots(root); |
1023 | 1036 | ||
@@ -1057,6 +1070,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, | |||
1057 | cur_trans->commit_done = 1; | 1070 | cur_trans->commit_done = 1; |
1058 | 1071 | ||
1059 | root->fs_info->last_trans_committed = cur_trans->transid; | 1072 | root->fs_info->last_trans_committed = cur_trans->transid; |
1073 | |||
1060 | wake_up(&cur_trans->commit_wait); | 1074 | wake_up(&cur_trans->commit_wait); |
1061 | 1075 | ||
1062 | put_transaction(cur_trans); | 1076 | put_transaction(cur_trans); |
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 961c3ee5a2e1..663c67404918 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h | |||
@@ -107,4 +107,5 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, | |||
107 | struct btrfs_root *root); | 107 | struct btrfs_root *root); |
108 | int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, | 108 | int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, |
109 | struct extent_io_tree *dirty_pages); | 109 | struct extent_io_tree *dirty_pages); |
110 | int btrfs_transaction_in_commit(struct btrfs_fs_info *info); | ||
110 | #endif | 111 | #endif |
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index c13922206d1b..d91b0de7c502 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c | |||
@@ -797,7 +797,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, | |||
797 | return -ENOENT; | 797 | return -ENOENT; |
798 | 798 | ||
799 | inode = read_one_inode(root, key->objectid); | 799 | inode = read_one_inode(root, key->objectid); |
800 | BUG_ON(!dir); | 800 | BUG_ON(!inode); |
801 | 801 | ||
802 | ref_ptr = btrfs_item_ptr_offset(eb, slot); | 802 | ref_ptr = btrfs_item_ptr_offset(eb, slot); |
803 | ref_end = ref_ptr + btrfs_item_size_nr(eb, slot); | 803 | ref_end = ref_ptr + btrfs_item_size_nr(eb, slot); |
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 3ab80e9cd767..5cf405b0828d 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c | |||
@@ -260,7 +260,7 @@ loop_lock: | |||
260 | num_run++; | 260 | num_run++; |
261 | batch_run++; | 261 | batch_run++; |
262 | 262 | ||
263 | if (bio_sync(cur)) | 263 | if (bio_rw_flagged(cur, BIO_RW_SYNCIO)) |
264 | num_sync_run++; | 264 | num_sync_run++; |
265 | 265 | ||
266 | if (need_resched()) { | 266 | if (need_resched()) { |
@@ -721,7 +721,8 @@ error: | |||
721 | */ | 721 | */ |
722 | static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans, | 722 | static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans, |
723 | struct btrfs_device *device, | 723 | struct btrfs_device *device, |
724 | u64 num_bytes, u64 *start) | 724 | u64 num_bytes, u64 *start, |
725 | u64 *max_avail) | ||
725 | { | 726 | { |
726 | struct btrfs_key key; | 727 | struct btrfs_key key; |
727 | struct btrfs_root *root = device->dev_root; | 728 | struct btrfs_root *root = device->dev_root; |
@@ -758,9 +759,13 @@ static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans, | |||
758 | ret = btrfs_search_slot(trans, root, &key, path, 0, 0); | 759 | ret = btrfs_search_slot(trans, root, &key, path, 0, 0); |
759 | if (ret < 0) | 760 | if (ret < 0) |
760 | goto error; | 761 | goto error; |
761 | ret = btrfs_previous_item(root, path, 0, key.type); | 762 | if (ret > 0) { |
762 | if (ret < 0) | 763 | ret = btrfs_previous_item(root, path, key.objectid, key.type); |
763 | goto error; | 764 | if (ret < 0) |
765 | goto error; | ||
766 | if (ret > 0) | ||
767 | start_found = 1; | ||
768 | } | ||
764 | l = path->nodes[0]; | 769 | l = path->nodes[0]; |
765 | btrfs_item_key_to_cpu(l, &key, path->slots[0]); | 770 | btrfs_item_key_to_cpu(l, &key, path->slots[0]); |
766 | while (1) { | 771 | while (1) { |
@@ -803,6 +808,10 @@ no_more_items: | |||
803 | if (last_byte < search_start) | 808 | if (last_byte < search_start) |
804 | last_byte = search_start; | 809 | last_byte = search_start; |
805 | hole_size = key.offset - last_byte; | 810 | hole_size = key.offset - last_byte; |
811 | |||
812 | if (hole_size > *max_avail) | ||
813 | *max_avail = hole_size; | ||
814 | |||
806 | if (key.offset > last_byte && | 815 | if (key.offset > last_byte && |
807 | hole_size >= num_bytes) { | 816 | hole_size >= num_bytes) { |
808 | *start = last_byte; | 817 | *start = last_byte; |
@@ -1621,6 +1630,7 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans, | |||
1621 | device->fs_devices->total_rw_bytes += diff; | 1630 | device->fs_devices->total_rw_bytes += diff; |
1622 | 1631 | ||
1623 | device->total_bytes = new_size; | 1632 | device->total_bytes = new_size; |
1633 | device->disk_total_bytes = new_size; | ||
1624 | btrfs_clear_space_info_full(device->dev_root->fs_info); | 1634 | btrfs_clear_space_info_full(device->dev_root->fs_info); |
1625 | 1635 | ||
1626 | return btrfs_update_device(trans, device); | 1636 | return btrfs_update_device(trans, device); |
@@ -2007,7 +2017,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) | |||
2007 | goto done; | 2017 | goto done; |
2008 | if (ret) { | 2018 | if (ret) { |
2009 | ret = 0; | 2019 | ret = 0; |
2010 | goto done; | 2020 | break; |
2011 | } | 2021 | } |
2012 | 2022 | ||
2013 | l = path->nodes[0]; | 2023 | l = path->nodes[0]; |
@@ -2015,7 +2025,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) | |||
2015 | btrfs_item_key_to_cpu(l, &key, path->slots[0]); | 2025 | btrfs_item_key_to_cpu(l, &key, path->slots[0]); |
2016 | 2026 | ||
2017 | if (key.objectid != device->devid) | 2027 | if (key.objectid != device->devid) |
2018 | goto done; | 2028 | break; |
2019 | 2029 | ||
2020 | dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); | 2030 | dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); |
2021 | length = btrfs_dev_extent_length(l, dev_extent); | 2031 | length = btrfs_dev_extent_length(l, dev_extent); |
@@ -2171,6 +2181,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, | |||
2171 | max_chunk_size); | 2181 | max_chunk_size); |
2172 | 2182 | ||
2173 | again: | 2183 | again: |
2184 | max_avail = 0; | ||
2174 | if (!map || map->num_stripes != num_stripes) { | 2185 | if (!map || map->num_stripes != num_stripes) { |
2175 | kfree(map); | 2186 | kfree(map); |
2176 | map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); | 2187 | map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); |
@@ -2219,7 +2230,8 @@ again: | |||
2219 | 2230 | ||
2220 | if (device->in_fs_metadata && avail >= min_free) { | 2231 | if (device->in_fs_metadata && avail >= min_free) { |
2221 | ret = find_free_dev_extent(trans, device, | 2232 | ret = find_free_dev_extent(trans, device, |
2222 | min_free, &dev_offset); | 2233 | min_free, &dev_offset, |
2234 | &max_avail); | ||
2223 | if (ret == 0) { | 2235 | if (ret == 0) { |
2224 | list_move_tail(&device->dev_alloc_list, | 2236 | list_move_tail(&device->dev_alloc_list, |
2225 | &private_devs); | 2237 | &private_devs); |
@@ -2795,26 +2807,6 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, | |||
2795 | } | 2807 | } |
2796 | } | 2808 | } |
2797 | 2809 | ||
2798 | for (i = 0; i > nr; i++) { | ||
2799 | struct btrfs_multi_bio *multi; | ||
2800 | struct btrfs_bio_stripe *stripe; | ||
2801 | int ret; | ||
2802 | |||
2803 | length = 1; | ||
2804 | ret = btrfs_map_block(map_tree, WRITE, buf[i], | ||
2805 | &length, &multi, 0); | ||
2806 | BUG_ON(ret); | ||
2807 | |||
2808 | stripe = multi->stripes; | ||
2809 | for (j = 0; j < multi->num_stripes; j++) { | ||
2810 | if (stripe->physical >= physical && | ||
2811 | physical < stripe->physical + length) | ||
2812 | break; | ||
2813 | } | ||
2814 | BUG_ON(j >= multi->num_stripes); | ||
2815 | kfree(multi); | ||
2816 | } | ||
2817 | |||
2818 | *logical = buf; | 2810 | *logical = buf; |
2819 | *naddrs = nr; | 2811 | *naddrs = nr; |
2820 | *stripe_len = map->stripe_len; | 2812 | *stripe_len = map->stripe_len; |
@@ -2911,7 +2903,7 @@ static noinline int schedule_bio(struct btrfs_root *root, | |||
2911 | bio->bi_rw |= rw; | 2903 | bio->bi_rw |= rw; |
2912 | 2904 | ||
2913 | spin_lock(&device->io_lock); | 2905 | spin_lock(&device->io_lock); |
2914 | if (bio_sync(bio)) | 2906 | if (bio_rw_flagged(bio, BIO_RW_SYNCIO)) |
2915 | pending_bios = &device->pending_sync_bios; | 2907 | pending_bios = &device->pending_sync_bios; |
2916 | else | 2908 | else |
2917 | pending_bios = &device->pending_bios; | 2909 | pending_bios = &device->pending_bios; |
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index ecfbce836d32..3e2b90eaa239 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c | |||
@@ -208,7 +208,7 @@ int btrfs_zlib_compress_pages(struct address_space *mapping, | |||
208 | *total_in = 0; | 208 | *total_in = 0; |
209 | 209 | ||
210 | workspace = find_zlib_workspace(); | 210 | workspace = find_zlib_workspace(); |
211 | if (!workspace) | 211 | if (IS_ERR(workspace)) |
212 | return -1; | 212 | return -1; |
213 | 213 | ||
214 | if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) { | 214 | if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) { |
@@ -366,7 +366,7 @@ int btrfs_zlib_decompress_biovec(struct page **pages_in, | |||
366 | char *kaddr; | 366 | char *kaddr; |
367 | 367 | ||
368 | workspace = find_zlib_workspace(); | 368 | workspace = find_zlib_workspace(); |
369 | if (!workspace) | 369 | if (IS_ERR(workspace)) |
370 | return -ENOMEM; | 370 | return -ENOMEM; |
371 | 371 | ||
372 | data_in = kmap(pages_in[page_in_index]); | 372 | data_in = kmap(pages_in[page_in_index]); |
@@ -547,7 +547,7 @@ int btrfs_zlib_decompress(unsigned char *data_in, | |||
547 | return -ENOMEM; | 547 | return -ENOMEM; |
548 | 548 | ||
549 | workspace = find_zlib_workspace(); | 549 | workspace = find_zlib_workspace(); |
550 | if (!workspace) | 550 | if (IS_ERR(workspace)) |
551 | return -ENOMEM; | 551 | return -ENOMEM; |
552 | 552 | ||
553 | workspace->inf_strm.next_in = data_in; | 553 | workspace->inf_strm.next_in = data_in; |
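The three zlib hunks above work because find_zlib_workspace() reports failure with the ERR_PTR convention rather than NULL, so the old NULL checks could never fire. The general shape of the idiom (the hunks keep their original literal return values; PTR_ERR() is the usual way to propagate the encoded errno):

	#include <linux/err.h>

	workspace = find_zlib_workspace();
	if (IS_ERR(workspace))
		return PTR_ERR(workspace);	/* e.g. -ENOMEM, carried in the pointer */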
diff --git a/fs/buffer.c b/fs/buffer.c index a3ef091a45bd..90a98865b0cc 100644 --- a/fs/buffer.c +++ b/fs/buffer.c | |||
@@ -281,7 +281,7 @@ static void free_more_memory(void) | |||
281 | struct zone *zone; | 281 | struct zone *zone; |
282 | int nid; | 282 | int nid; |
283 | 283 | ||
284 | wakeup_pdflush(1024); | 284 | wakeup_flusher_threads(1024); |
285 | yield(); | 285 | yield(); |
286 | 286 | ||
287 | for_each_online_node(nid) { | 287 | for_each_online_node(nid) { |
@@ -1165,8 +1165,11 @@ void mark_buffer_dirty(struct buffer_head *bh) | |||
1165 | 1165 | ||
1166 | if (!test_set_buffer_dirty(bh)) { | 1166 | if (!test_set_buffer_dirty(bh)) { |
1167 | struct page *page = bh->b_page; | 1167 | struct page *page = bh->b_page; |
1168 | if (!TestSetPageDirty(page)) | 1168 | if (!TestSetPageDirty(page)) { |
1169 | __set_page_dirty(page, page_mapping(page), 0); | 1169 | struct address_space *mapping = page_mapping(page); |
1170 | if (mapping) | ||
1171 | __set_page_dirty(page, mapping, 0); | ||
1172 | } | ||
1170 | } | 1173 | } |
1171 | } | 1174 | } |
1172 | 1175 | ||
diff --git a/fs/char_dev.c b/fs/char_dev.c index a173551e19d7..3cbc57f932d2 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c | |||
@@ -31,6 +31,7 @@ | |||
31 | * - no readahead or I/O queue unplugging required | 31 | * - no readahead or I/O queue unplugging required |
32 | */ | 32 | */ |
33 | struct backing_dev_info directly_mappable_cdev_bdi = { | 33 | struct backing_dev_info directly_mappable_cdev_bdi = { |
34 | .name = "char", | ||
34 | .capabilities = ( | 35 | .capabilities = ( |
35 | #ifdef CONFIG_MMU | 36 | #ifdef CONFIG_MMU |
36 | /* permit private copies of the data to be taken */ | 37 | /* permit private copies of the data to be taken */ |
@@ -237,8 +238,10 @@ int alloc_chrdev_region(dev_t *dev, unsigned baseminor, unsigned count, | |||
237 | } | 238 | } |
238 | 239 | ||
239 | /** | 240 | /** |
240 | * register_chrdev() - Register a major number for character devices. | 241 | * __register_chrdev() - create and register a cdev occupying a range of minors |
241 | * @major: major device number or 0 for dynamic allocation | 242 | * @major: major device number or 0 for dynamic allocation |
243 | * @baseminor: first of the requested range of minor numbers | ||
244 | * @count: the number of minor numbers required | ||
242 | * @name: name of this range of devices | 245 | * @name: name of this range of devices |
243 | * @fops: file operations associated with this devices | 246 | * @fops: file operations associated with this devices |
244 | * | 247 | * |
@@ -254,19 +257,17 @@ int alloc_chrdev_region(dev_t *dev, unsigned baseminor, unsigned count, | |||
254 | * /dev. It only helps to keep track of the different owners of devices. If | 257 | * /dev. It only helps to keep track of the different owners of devices. If |
255 | * your module name has only one type of devices it's ok to use e.g. the name | 258 | * your module name has only one type of devices it's ok to use e.g. the name |
256 | * of the module here. | 259 | * of the module here. |
257 | * | ||
258 | * This function registers a range of 256 minor numbers. The first minor number | ||
259 | * is 0. | ||
260 | */ | 260 | */ |
261 | int register_chrdev(unsigned int major, const char *name, | 261 | int __register_chrdev(unsigned int major, unsigned int baseminor, |
262 | const struct file_operations *fops) | 262 | unsigned int count, const char *name, |
263 | const struct file_operations *fops) | ||
263 | { | 264 | { |
264 | struct char_device_struct *cd; | 265 | struct char_device_struct *cd; |
265 | struct cdev *cdev; | 266 | struct cdev *cdev; |
266 | char *s; | 267 | char *s; |
267 | int err = -ENOMEM; | 268 | int err = -ENOMEM; |
268 | 269 | ||
269 | cd = __register_chrdev_region(major, 0, 256, name); | 270 | cd = __register_chrdev_region(major, baseminor, count, name); |
270 | if (IS_ERR(cd)) | 271 | if (IS_ERR(cd)) |
271 | return PTR_ERR(cd); | 272 | return PTR_ERR(cd); |
272 | 273 | ||
@@ -280,7 +281,7 @@ int register_chrdev(unsigned int major, const char *name, | |||
280 | for (s = strchr(kobject_name(&cdev->kobj),'/'); s; s = strchr(s, '/')) | 281 | for (s = strchr(kobject_name(&cdev->kobj),'/'); s; s = strchr(s, '/')) |
281 | *s = '!'; | 282 | *s = '!'; |
282 | 283 | ||
283 | err = cdev_add(cdev, MKDEV(cd->major, 0), 256); | 284 | err = cdev_add(cdev, MKDEV(cd->major, baseminor), count); |
284 | if (err) | 285 | if (err) |
285 | goto out; | 286 | goto out; |
286 | 287 | ||
@@ -290,7 +291,7 @@ int register_chrdev(unsigned int major, const char *name, | |||
290 | out: | 291 | out: |
291 | kobject_put(&cdev->kobj); | 292 | kobject_put(&cdev->kobj); |
292 | out2: | 293 | out2: |
293 | kfree(__unregister_chrdev_region(cd->major, 0, 256)); | 294 | kfree(__unregister_chrdev_region(cd->major, baseminor, count)); |
294 | return err; | 295 | return err; |
295 | } | 296 | } |
296 | 297 | ||
@@ -316,10 +317,23 @@ void unregister_chrdev_region(dev_t from, unsigned count) | |||
316 | } | 317 | } |
317 | } | 318 | } |
318 | 319 | ||
319 | void unregister_chrdev(unsigned int major, const char *name) | 320 | /** |
321 | * __unregister_chrdev - unregister and destroy a cdev | ||
322 | * @major: major device number | ||
323 | * @baseminor: first of the range of minor numbers | ||
324 | * @count: the number of minor numbers this cdev is occupying | ||
325 | * @name: name of this range of devices | ||
326 | * | ||
327 | * Unregister and destroy the cdev occupying the region described by | ||
328 | * @major, @baseminor and @count. This function undoes what | ||
329 | * __register_chrdev() did. | ||
330 | */ | ||
331 | void __unregister_chrdev(unsigned int major, unsigned int baseminor, | ||
332 | unsigned int count, const char *name) | ||
320 | { | 333 | { |
321 | struct char_device_struct *cd; | 334 | struct char_device_struct *cd; |
322 | cd = __unregister_chrdev_region(major, 0, 256); | 335 | |
336 | cd = __unregister_chrdev_region(major, baseminor, count); | ||
323 | if (cd && cd->cdev) | 337 | if (cd && cd->cdev) |
324 | cdev_del(cd->cdev); | 338 | cdev_del(cd->cdev); |
325 | kfree(cd); | 339 | kfree(cd); |
@@ -568,6 +582,6 @@ EXPORT_SYMBOL(cdev_alloc); | |||
568 | EXPORT_SYMBOL(cdev_del); | 582 | EXPORT_SYMBOL(cdev_del); |
569 | EXPORT_SYMBOL(cdev_add); | 583 | EXPORT_SYMBOL(cdev_add); |
570 | EXPORT_SYMBOL(cdev_index); | 584 | EXPORT_SYMBOL(cdev_index); |
571 | EXPORT_SYMBOL(register_chrdev); | 585 | EXPORT_SYMBOL(__register_chrdev); |
572 | EXPORT_SYMBOL(unregister_chrdev); | 586 | EXPORT_SYMBOL(__unregister_chrdev); |
573 | EXPORT_SYMBOL(directly_mappable_cdev_bdi); | 587 | EXPORT_SYMBOL(directly_mappable_cdev_bdi); |
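With the range-aware __register_chrdev()/__unregister_chrdev() exported, the historical fixed-range calls can survive as thin wrappers. A sketch of how such wrappers would look; the real tree keeps them as static inlines in a header, and their exact placement is an assumption here:

    /* old API expressed via the new one: always minors 0..255 */
    static inline int register_chrdev(unsigned int major, const char *name,
                                      const struct file_operations *fops)
    {
            return __register_chrdev(major, 0, 256, name, fops);
    }

    static inline void unregister_chrdev(unsigned int major, const char *name)
    {
            __unregister_chrdev(major, 0, 256, name);
    }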
diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES index 92888aa90749..145540a316ab 100644 --- a/fs/cifs/CHANGES +++ b/fs/cifs/CHANGES | |||
@@ -1,3 +1,13 @@ | |||
1 | Version 1.60 | ||
2 | ------------- | ||
3 | Fix memory leak in reconnect. Fix oops in DFS mount error path. | ||
4 | Set s_maxbytes to a smaller value (the maximum the VFS can handle) | ||
5 | so that sendfile works over cifs mounts again. Add noforcegid and | ||
6 | noforceuid mount parameters. Fix a small memory leak when using | ||
7 | ntlmv2. Allow a second mount to the same server with a different | ||
8 | port (rather than reusing the first port), but only when the user | ||
9 | explicitly overrides the port on the second mount. | ||
10 | |||
1 | Version 1.59 | 11 | Version 1.59 |
2 | ------------ | 12 | ------------ |
3 | Client uses server inode numbers (which are persistent) rather than | 13 | Client uses server inode numbers (which are persistent) rather than |
diff --git a/fs/cifs/README b/fs/cifs/README index ad92921dbde4..79c1a93400be 100644 --- a/fs/cifs/README +++ b/fs/cifs/README | |||
@@ -262,11 +262,11 @@ A partial list of the supported mount options follows: | |||
262 | mount. | 262 | mount. |
263 | domain Set the SMB/CIFS workgroup name prepended to the | 263 | domain Set the SMB/CIFS workgroup name prepended to the |
264 | username during CIFS session establishment | 264 | username during CIFS session establishment |
265 | forceuid Set the default uid for inodes based on the uid | 265 | forceuid Set the default uid for inodes to the uid |
266 | passed in. For mounts to servers | 266 | passed in on mount. For mounts to servers |
267 | which do support the CIFS Unix extensions, such as a | 267 | which do support the CIFS Unix extensions, such as a |
268 | properly configured Samba server, the server provides | 268 | properly configured Samba server, the server provides |
269 | the uid, gid and mode so this parameter should not be | 269 | the uid, gid and mode so this parameter should not be |
270 | specified unless the server's and clients' uid and gid | 270 | specified unless the server's and clients' uid and gid |
271 | numbering differ. If the server and client are in the | 271 | numbering differ. If the server and client are in the |
272 | same domain (e.g. running winbind or nss_ldap) and | 272 | same domain (e.g. running winbind or nss_ldap) and |
@@ -278,11 +278,7 @@ A partial list of the supported mount options follows: | |||
278 | of existing files will be the uid (gid) of the person | 278 | of existing files will be the uid (gid) of the person |
279 | who executed the mount (root, except when mount.cifs | 279 | who executed the mount (root, except when mount.cifs |
280 | is configured setuid for user mounts) unless the "uid=" | 280 | is configured setuid for user mounts) unless the "uid=" |
281 | (gid) mount option is specified. For the uid (gid) of newly | 281 | (gid) mount option is specified. Also note that permission |
282 | created files and directories, ie files created since | ||
283 | the last mount of the server share, the expected uid | ||
284 | (gid) is cached as long as the inode remains in | ||
285 | memory on the client. Also note that permission | ||
286 | checks (authorization checks) on accesses to a file occur | 282 | checks (authorization checks) on accesses to a file occur |
287 | at the server, but there are cases in which an administrator | 283 | at the server, but there are cases in which an administrator |
288 | may want to restrict at the client as well. For those | 284 | may want to restrict at the client as well. For those |
@@ -290,12 +286,15 @@ A partial list of the supported mount options follows: | |||
290 | (such as Windows), permissions can also be checked at the | 286 | (such as Windows), permissions can also be checked at the |
291 | client, and a crude form of client side permission checking | 287 | client, and a crude form of client side permission checking |
292 | can be enabled by specifying file_mode and dir_mode on | 288 | can be enabled by specifying file_mode and dir_mode on |
293 | the client. Note that the mount.cifs helper must be | 289 | the client. (default) |
294 | at version 1.10 or higher to support specifying the uid | 290 | forcegid (similar to above but for the groupid instead of uid) (default) |
295 | (or gid) in non-numeric form. | 291 | noforceuid Fill in file owner information (uid) by requesting it from |
296 | forcegid (similar to above but for the groupid instead of uid) | 292 | the server if possible. With this option, the value given in |
293 | the uid= option (on mount) will only be used if the server | ||
294 | cannot return uids on inodes. | ||
295 | noforcegid (similar to above but for the group owner, gid, instead of uid) | ||
297 | uid Set the default uid for inodes, and indicate to the | 296 | uid Set the default uid for inodes, and indicate to the |
298 | cifs kernel driver which local user mounted . If the server | 297 | cifs kernel driver which local user mounted. If the server |
299 | supports the unix extensions the default uid is | 298 | supports the unix extensions the default uid is |
300 | not used to fill in the owner fields of inodes (files) | 299 | not used to fill in the owner fields of inodes (files) |
301 | unless the "forceuid" parameter is specified. | 300 | unless the "forceuid" parameter is specified. |
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index 3bb11be8b6a8..606912d8f2a8 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c | |||
@@ -55,7 +55,7 @@ void cifs_dfs_release_automount_timer(void) | |||
55 | * i.e. strips from the UNC the trailing path that is not part of | 55 | * i.e. strips from the UNC the trailing path that is not part of |
56 | * the share name, and fixes up a missing '\' at the beginning of | 56 | * the share name, and fixes up a missing '\' at the beginning of |
57 | * the DFS node referral if necessary. | 57 | * the DFS node referral if necessary. |
58 | * Returns pointer to share name on success or NULL on error. | 58 | * Returns pointer to share name on success or ERR_PTR on error. |
59 | * Caller is responsible for freeing returned string. | 59 | * Caller is responsible for freeing returned string. |
60 | */ | 60 | */ |
61 | static char *cifs_get_share_name(const char *node_name) | 61 | static char *cifs_get_share_name(const char *node_name) |
@@ -68,7 +68,7 @@ static char *cifs_get_share_name(const char *node_name) | |||
68 | UNC = kmalloc(len+2 /*for term null and additional \ if it's missed */, | 68 | UNC = kmalloc(len+2 /*for term null and additional \ if it's missed */, |
69 | GFP_KERNEL); | 69 | GFP_KERNEL); |
70 | if (!UNC) | 70 | if (!UNC) |
71 | return NULL; | 71 | return ERR_PTR(-ENOMEM); |
72 | 72 | ||
73 | /* get share name and server name */ | 73 | /* get share name and server name */ |
74 | if (node_name[1] != '\\') { | 74 | if (node_name[1] != '\\') { |
@@ -87,7 +87,7 @@ static char *cifs_get_share_name(const char *node_name) | |||
87 | cERROR(1, ("%s: no server name end in node name: %s", | 87 | cERROR(1, ("%s: no server name end in node name: %s", |
88 | __func__, node_name)); | 88 | __func__, node_name)); |
89 | kfree(UNC); | 89 | kfree(UNC); |
90 | return NULL; | 90 | return ERR_PTR(-EINVAL); |
91 | } | 91 | } |
92 | 92 | ||
93 | /* find sharename end */ | 93 | /* find sharename end */ |
@@ -133,6 +133,12 @@ char *cifs_compose_mount_options(const char *sb_mountdata, | |||
133 | return ERR_PTR(-EINVAL); | 133 | return ERR_PTR(-EINVAL); |
134 | 134 | ||
135 | *devname = cifs_get_share_name(ref->node_name); | 135 | *devname = cifs_get_share_name(ref->node_name); |
136 | if (IS_ERR(*devname)) { | ||
137 | rc = PTR_ERR(*devname); | ||
138 | *devname = NULL; | ||
139 | goto compose_mount_options_err; | ||
140 | } | ||
141 | |||
136 | rc = dns_resolve_server_name_to_ip(*devname, &srvIP); | 142 | rc = dns_resolve_server_name_to_ip(*devname, &srvIP); |
137 | if (rc != 0) { | 143 | if (rc != 0) { |
138 | cERROR(1, ("%s: Failed to resolve server part of %s to IP: %d", | 144 | cERROR(1, ("%s: Failed to resolve server part of %s to IP: %d", |
diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c index 051caecf7d67..8ec7736ce954 100644 --- a/fs/cifs/cifs_spnego.c +++ b/fs/cifs/cifs_spnego.c | |||
@@ -125,7 +125,7 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo) | |||
125 | if (server->addr.sockAddr.sin_family == AF_INET) | 125 | if (server->addr.sockAddr.sin_family == AF_INET) |
126 | sprintf(dp, "ip4=%pI4", &server->addr.sockAddr.sin_addr); | 126 | sprintf(dp, "ip4=%pI4", &server->addr.sockAddr.sin_addr); |
127 | else if (server->addr.sockAddr.sin_family == AF_INET6) | 127 | else if (server->addr.sockAddr.sin_family == AF_INET6) |
128 | sprintf(dp, "ip6=%pi6", &server->addr.sockAddr6.sin6_addr); | 128 | sprintf(dp, "ip6=%pI6", &server->addr.sockAddr6.sin6_addr); |
129 | else | 129 | else |
130 | goto out; | 130 | goto out; |
131 | 131 | ||
diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c index 60e3c4253de0..714a542cbafc 100644 --- a/fs/cifs/cifs_unicode.c +++ b/fs/cifs/cifs_unicode.c | |||
@@ -44,7 +44,7 @@ cifs_ucs2_bytes(const __le16 *from, int maxbytes, | |||
44 | int maxwords = maxbytes / 2; | 44 | int maxwords = maxbytes / 2; |
45 | char tmp[NLS_MAX_CHARSET_SIZE]; | 45 | char tmp[NLS_MAX_CHARSET_SIZE]; |
46 | 46 | ||
47 | for (i = 0; from[i] && i < maxwords; i++) { | 47 | for (i = 0; i < maxwords && from[i]; i++) { |
48 | charlen = codepage->uni2char(le16_to_cpu(from[i]), tmp, | 48 | charlen = codepage->uni2char(le16_to_cpu(from[i]), tmp, |
49 | NLS_MAX_CHARSET_SIZE); | 49 | NLS_MAX_CHARSET_SIZE); |
50 | if (charlen > 0) | 50 | if (charlen > 0) |
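The reordering is a real fix, not style: with `from[i] && i < maxwords`, from[maxwords] is dereferenced before the bound is checked, reading one word past the permitted range. Putting the bound first lets && short-circuit the access. The idiom in isolation (consume() is a hypothetical placeholder):

    /* walk at most maxwords entries, stopping early at a NUL word;
     * from[i] is only evaluated once i < maxwords has held, so the
     * loop never reads past the caller-supplied limit */
    for (i = 0; i < maxwords && from[i]; i++)
            consume(le16_to_cpu(from[i]));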
diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 6941c22398a6..7dfe0842a6f6 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c | |||
@@ -607,7 +607,7 @@ static struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb, | |||
607 | return get_cifs_acl_by_path(cifs_sb, path, pacllen); | 607 | return get_cifs_acl_by_path(cifs_sb, path, pacllen); |
608 | 608 | ||
609 | pntsd = get_cifs_acl_by_fid(cifs_sb, open_file->netfid, pacllen); | 609 | pntsd = get_cifs_acl_by_fid(cifs_sb, open_file->netfid, pacllen); |
610 | atomic_dec(&open_file->wrtPending); | 610 | cifsFileInfo_put(open_file); |
611 | return pntsd; | 611 | return pntsd; |
612 | } | 612 | } |
613 | 613 | ||
@@ -665,7 +665,7 @@ static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen, | |||
665 | return set_cifs_acl_by_path(cifs_sb, path, pnntsd, acllen); | 665 | return set_cifs_acl_by_path(cifs_sb, path, pnntsd, acllen); |
666 | 666 | ||
667 | rc = set_cifs_acl_by_fid(cifs_sb, open_file->netfid, pnntsd, acllen); | 667 | rc = set_cifs_acl_by_fid(cifs_sb, open_file->netfid, pnntsd, acllen); |
668 | atomic_dec(&open_file->wrtPending); | 668 | cifsFileInfo_put(open_file); |
669 | return rc; | 669 | return rc; |
670 | } | 670 | } |
671 | 671 | ||
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 7c9809523f42..7efe1745494d 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c | |||
@@ -373,6 +373,7 @@ calc_exit_2: | |||
373 | compare with the NTLM example */ | 373 | compare with the NTLM example */ |
374 | hmac_md5_final(ses->server->ntlmv2_hash, pctxt); | 374 | hmac_md5_final(ses->server->ntlmv2_hash, pctxt); |
375 | 375 | ||
376 | kfree(pctxt); | ||
376 | return rc; | 377 | return rc; |
377 | } | 378 | } |
378 | 379 | ||
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 44f30504b82d..3610e9958b4c 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c | |||
@@ -361,13 +361,10 @@ cifs_show_address(struct seq_file *s, struct TCP_Server_Info *server) | |||
361 | static int | 361 | static int |
362 | cifs_show_options(struct seq_file *s, struct vfsmount *m) | 362 | cifs_show_options(struct seq_file *s, struct vfsmount *m) |
363 | { | 363 | { |
364 | struct cifs_sb_info *cifs_sb; | 364 | struct cifs_sb_info *cifs_sb = CIFS_SB(m->mnt_sb); |
365 | struct cifsTconInfo *tcon; | 365 | struct cifsTconInfo *tcon = cifs_sb->tcon; |
366 | |||
367 | cifs_sb = CIFS_SB(m->mnt_sb); | ||
368 | tcon = cifs_sb->tcon; | ||
369 | 366 | ||
370 | seq_printf(s, ",unc=%s", cifs_sb->tcon->treeName); | 367 | seq_printf(s, ",unc=%s", tcon->treeName); |
371 | if (tcon->ses->userName) | 368 | if (tcon->ses->userName) |
372 | seq_printf(s, ",username=%s", tcon->ses->userName); | 369 | seq_printf(s, ",username=%s", tcon->ses->userName); |
373 | if (tcon->ses->domainName) | 370 | if (tcon->ses->domainName) |
@@ -376,10 +373,14 @@ cifs_show_options(struct seq_file *s, struct vfsmount *m) | |||
376 | seq_printf(s, ",uid=%d", cifs_sb->mnt_uid); | 373 | seq_printf(s, ",uid=%d", cifs_sb->mnt_uid); |
377 | if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID) | 374 | if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID) |
378 | seq_printf(s, ",forceuid"); | 375 | seq_printf(s, ",forceuid"); |
376 | else | ||
377 | seq_printf(s, ",noforceuid"); | ||
379 | 378 | ||
380 | seq_printf(s, ",gid=%d", cifs_sb->mnt_gid); | 379 | seq_printf(s, ",gid=%d", cifs_sb->mnt_gid); |
381 | if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID) | 380 | if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID) |
382 | seq_printf(s, ",forcegid"); | 381 | seq_printf(s, ",forcegid"); |
382 | else | ||
383 | seq_printf(s, ",noforcegid"); | ||
383 | 384 | ||
384 | cifs_show_address(s, tcon->ses->server); | 385 | cifs_show_address(s, tcon->ses->server); |
385 | 386 | ||
@@ -985,19 +986,19 @@ static int cifs_oplock_thread(void *dummyarg) | |||
985 | if (try_to_freeze()) | 986 | if (try_to_freeze()) |
986 | continue; | 987 | continue; |
987 | 988 | ||
988 | spin_lock(&GlobalMid_Lock); | 989 | spin_lock(&cifs_oplock_lock); |
989 | if (list_empty(&GlobalOplock_Q)) { | 990 | if (list_empty(&cifs_oplock_list)) { |
990 | spin_unlock(&GlobalMid_Lock); | 991 | spin_unlock(&cifs_oplock_lock); |
991 | set_current_state(TASK_INTERRUPTIBLE); | 992 | set_current_state(TASK_INTERRUPTIBLE); |
992 | schedule_timeout(39*HZ); | 993 | schedule_timeout(39*HZ); |
993 | } else { | 994 | } else { |
994 | oplock_item = list_entry(GlobalOplock_Q.next, | 995 | oplock_item = list_entry(cifs_oplock_list.next, |
995 | struct oplock_q_entry, qhead); | 996 | struct oplock_q_entry, qhead); |
996 | cFYI(1, ("found oplock item to write out")); | 997 | cFYI(1, ("found oplock item to write out")); |
997 | pTcon = oplock_item->tcon; | 998 | pTcon = oplock_item->tcon; |
998 | inode = oplock_item->pinode; | 999 | inode = oplock_item->pinode; |
999 | netfid = oplock_item->netfid; | 1000 | netfid = oplock_item->netfid; |
1000 | spin_unlock(&GlobalMid_Lock); | 1001 | spin_unlock(&cifs_oplock_lock); |
1001 | DeleteOplockQEntry(oplock_item); | 1002 | DeleteOplockQEntry(oplock_item); |
1002 | /* can not grab inode sem here since it would | 1003 | /* can not grab inode sem here since it would |
1003 | deadlock when oplock received on delete | 1004 | deadlock when oplock received on delete |
@@ -1054,7 +1055,7 @@ init_cifs(void) | |||
1054 | int rc = 0; | 1055 | int rc = 0; |
1055 | cifs_proc_init(); | 1056 | cifs_proc_init(); |
1056 | INIT_LIST_HEAD(&cifs_tcp_ses_list); | 1057 | INIT_LIST_HEAD(&cifs_tcp_ses_list); |
1057 | INIT_LIST_HEAD(&GlobalOplock_Q); | 1058 | INIT_LIST_HEAD(&cifs_oplock_list); |
1058 | #ifdef CONFIG_CIFS_EXPERIMENTAL | 1059 | #ifdef CONFIG_CIFS_EXPERIMENTAL |
1059 | INIT_LIST_HEAD(&GlobalDnotifyReqList); | 1060 | INIT_LIST_HEAD(&GlobalDnotifyReqList); |
1060 | INIT_LIST_HEAD(&GlobalDnotifyRsp_Q); | 1061 | INIT_LIST_HEAD(&GlobalDnotifyRsp_Q); |
@@ -1083,6 +1084,7 @@ init_cifs(void) | |||
1083 | rwlock_init(&GlobalSMBSeslock); | 1084 | rwlock_init(&GlobalSMBSeslock); |
1084 | rwlock_init(&cifs_tcp_ses_lock); | 1085 | rwlock_init(&cifs_tcp_ses_lock); |
1085 | spin_lock_init(&GlobalMid_Lock); | 1086 | spin_lock_init(&GlobalMid_Lock); |
1087 | spin_lock_init(&cifs_oplock_lock); | ||
1086 | 1088 | ||
1087 | if (cifs_max_pending < 2) { | 1089 | if (cifs_max_pending < 2) { |
1088 | cifs_max_pending = 2; | 1090 | cifs_max_pending = 2; |
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 6c170948300d..094325e3f714 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h | |||
@@ -113,5 +113,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); | |||
113 | extern const struct export_operations cifs_export_ops; | 113 | extern const struct export_operations cifs_export_ops; |
114 | #endif /* EXPERIMENTAL */ | 114 | #endif /* EXPERIMENTAL */ |
115 | 115 | ||
116 | #define CIFS_VERSION "1.60" | 116 | #define CIFS_VERSION "1.61" |
117 | #endif /* _CIFSFS_H */ | 117 | #endif /* _CIFSFS_H */ |
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 6084d6379c03..6cfc81a32703 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h | |||
@@ -351,11 +351,24 @@ struct cifsFileInfo { | |||
351 | bool closePend:1; /* file is marked to close */ | 351 | bool closePend:1; /* file is marked to close */ |
352 | bool invalidHandle:1; /* file closed via session abend */ | 352 | bool invalidHandle:1; /* file closed via session abend */ |
353 | bool messageMode:1; /* for pipes: message vs byte mode */ | 353 | bool messageMode:1; /* for pipes: message vs byte mode */ |
354 | atomic_t wrtPending; /* handle in use - defer close */ | 354 | atomic_t count; /* reference count */ |
355 | struct mutex fh_mutex; /* prevents reopen race after dead ses*/ | 355 | struct mutex fh_mutex; /* prevents reopen race after dead ses*/ |
356 | struct cifs_search_info srch_inf; | 356 | struct cifs_search_info srch_inf; |
357 | }; | 357 | }; |
358 | 358 | ||
359 | /* Take a reference on the file private data */ | ||
360 | static inline void cifsFileInfo_get(struct cifsFileInfo *cifs_file) | ||
361 | { | ||
362 | atomic_inc(&cifs_file->count); | ||
363 | } | ||
364 | |||
365 | /* Release a reference on the file private data */ | ||
366 | static inline void cifsFileInfo_put(struct cifsFileInfo *cifs_file) | ||
367 | { | ||
368 | if (atomic_dec_and_test(&cifs_file->count)) | ||
369 | kfree(cifs_file); | ||
370 | } | ||
371 | |||
359 | /* | 372 | /* |
360 | * One of these for each file inode | 373 | * One of these for each file inode |
361 | */ | 374 | */ |
@@ -656,7 +669,11 @@ GLOBAL_EXTERN rwlock_t cifs_tcp_ses_lock; | |||
656 | */ | 669 | */ |
657 | GLOBAL_EXTERN rwlock_t GlobalSMBSeslock; | 670 | GLOBAL_EXTERN rwlock_t GlobalSMBSeslock; |
658 | 671 | ||
659 | GLOBAL_EXTERN struct list_head GlobalOplock_Q; | 672 | /* Global list of oplocks */ |
673 | GLOBAL_EXTERN struct list_head cifs_oplock_list; | ||
674 | |||
675 | /* Protects the cifs_oplock_list */ | ||
676 | GLOBAL_EXTERN spinlock_t cifs_oplock_lock; | ||
660 | 677 | ||
661 | /* Outstanding dir notify requests */ | 678 | /* Outstanding dir notify requests */ |
662 | GLOBAL_EXTERN struct list_head GlobalDnotifyReqList; | 679 | GLOBAL_EXTERN struct list_head GlobalDnotifyReqList; |
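The file.c and inode.c hunks below all follow the same pattern built on these two helpers: pin the handle while a lock is dropped, release it when done. A sketch of a typical caller, under the assumption that the handle was found on a list protected by GlobalSMBSeslock:

    read_lock(&GlobalSMBSeslock);
    open_file = ...;                /* locate handle on the inode's list */
    cifsFileInfo_get(open_file);    /* cannot be freed while we hold a ref */
    read_unlock(&GlobalSMBSeslock);

    /* ... issue I/O against open_file->netfid ... */

    cifsFileInfo_put(open_file);    /* kfree()d here on the last reference */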
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 1866bc2927d4..301e307e1279 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c | |||
@@ -100,110 +100,138 @@ static void mark_open_files_invalid(struct cifsTconInfo *pTcon) | |||
100 | to this tcon */ | 100 | to this tcon */ |
101 | } | 101 | } |
102 | 102 | ||
103 | /* Allocate and return pointer to an SMB request buffer, and set basic | 103 | /* reconnect the socket, tcon, and smb session if needed */ |
104 | SMB information in the SMB header. If the return code is zero, this | ||
105 | function must have filled in request_buf pointer */ | ||
106 | static int | 104 | static int |
107 | small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon, | 105 | cifs_reconnect_tcon(struct cifsTconInfo *tcon, int smb_command) |
108 | void **request_buf) | ||
109 | { | 106 | { |
110 | int rc = 0; | 107 | int rc = 0; |
108 | struct cifsSesInfo *ses; | ||
109 | struct TCP_Server_Info *server; | ||
110 | struct nls_table *nls_codepage; | ||
111 | 111 | ||
112 | /* SMBs NegProt, SessSetup, uLogoff do not have tcon yet so | 112 | /* |
113 | check for tcp and smb session status done differently | 113 | * SMBs NegProt, SessSetup, uLogoff do not have tcon yet so check for |
114 | for those three - in the calling routine */ | 114 | * tcp and smb session status done differently for those three - in the |
115 | if (tcon) { | 115 | * calling routine |
116 | if (tcon->tidStatus == CifsExiting) { | 116 | */ |
117 | /* only tree disconnect, open, and write, | 117 | if (!tcon) |
118 | (and ulogoff which does not have tcon) | 118 | return 0; |
119 | are allowed as we start force umount */ | 119 | |
120 | if ((smb_command != SMB_COM_WRITE_ANDX) && | 120 | ses = tcon->ses; |
121 | (smb_command != SMB_COM_OPEN_ANDX) && | 121 | server = ses->server; |
122 | (smb_command != SMB_COM_TREE_DISCONNECT)) { | 122 | |
123 | cFYI(1, ("can not send cmd %d while umounting", | 123 | /* |
124 | smb_command)); | 124 | * only tree disconnect, open, and write, (and ulogoff which does not |
125 | return -ENODEV; | 125 | * have tcon) are allowed as we start force umount |
126 | } | 126 | */ |
127 | if (tcon->tidStatus == CifsExiting) { | ||
128 | if (smb_command != SMB_COM_WRITE_ANDX && | ||
129 | smb_command != SMB_COM_OPEN_ANDX && | ||
130 | smb_command != SMB_COM_TREE_DISCONNECT) { | ||
131 | cFYI(1, ("can not send cmd %d while umounting", | ||
132 | smb_command)); | ||
133 | return -ENODEV; | ||
127 | } | 134 | } |
128 | if ((tcon->ses) && (tcon->ses->status != CifsExiting) && | 135 | } |
129 | (tcon->ses->server)) { | ||
130 | struct nls_table *nls_codepage; | ||
131 | /* Give Demultiplex thread up to 10 seconds to | ||
132 | reconnect, should be greater than cifs socket | ||
133 | timeout which is 7 seconds */ | ||
134 | while (tcon->ses->server->tcpStatus == | ||
135 | CifsNeedReconnect) { | ||
136 | wait_event_interruptible_timeout(tcon->ses->server->response_q, | ||
137 | (tcon->ses->server->tcpStatus == | ||
138 | CifsGood), 10 * HZ); | ||
139 | if (tcon->ses->server->tcpStatus == | ||
140 | CifsNeedReconnect) { | ||
141 | /* on "soft" mounts we wait once */ | ||
142 | if (!tcon->retry || | ||
143 | (tcon->ses->status == CifsExiting)) { | ||
144 | cFYI(1, ("gave up waiting on " | ||
145 | "reconnect in smb_init")); | ||
146 | return -EHOSTDOWN; | ||
147 | } /* else "hard" mount - keep retrying | ||
148 | until process is killed or server | ||
149 | comes back on-line */ | ||
150 | } else /* TCP session is reestablished now */ | ||
151 | break; | ||
152 | } | ||
153 | 136 | ||
154 | nls_codepage = load_nls_default(); | 137 | if (ses->status == CifsExiting) |
155 | /* need to prevent multiple threads trying to | 138 | return -EIO; |
156 | simultaneously reconnect the same SMB session */ | ||
157 | down(&tcon->ses->sesSem); | ||
158 | if (tcon->ses->need_reconnect) | ||
159 | rc = cifs_setup_session(0, tcon->ses, | ||
160 | nls_codepage); | ||
161 | if (!rc && (tcon->need_reconnect)) { | ||
162 | mark_open_files_invalid(tcon); | ||
163 | rc = CIFSTCon(0, tcon->ses, tcon->treeName, | ||
164 | tcon, nls_codepage); | ||
165 | up(&tcon->ses->sesSem); | ||
166 | /* BB FIXME add code to check if wsize needs | ||
167 | update due to negotiated smb buffer size | ||
168 | shrinking */ | ||
169 | if (rc == 0) { | ||
170 | atomic_inc(&tconInfoReconnectCount); | ||
171 | /* tell server Unix caps we support */ | ||
172 | if (tcon->ses->capabilities & CAP_UNIX) | ||
173 | reset_cifs_unix_caps( | ||
174 | 0 /* no xid */, | ||
175 | tcon, | ||
176 | NULL /* we do not know sb */, | ||
177 | NULL /* no vol info */); | ||
178 | } | ||
179 | 139 | ||
180 | cFYI(1, ("reconnect tcon rc = %d", rc)); | 140 | /* |
181 | /* Removed call to reopen open files here. | 141 | * Give demultiplex thread up to 10 seconds to reconnect, should be |
182 | It is safer (and faster) to reopen files | 142 | * greater than cifs socket timeout which is 7 seconds |
183 | one at a time as needed in read and write */ | 143 | */ |
184 | 144 | while (server->tcpStatus == CifsNeedReconnect) { | |
185 | /* Check if handle based operation so we | 145 | wait_event_interruptible_timeout(server->response_q, |
186 | know whether we can continue or not without | 146 | (server->tcpStatus == CifsGood), 10 * HZ); |
187 | returning to caller to reset file handle */ | ||
188 | switch (smb_command) { | ||
189 | case SMB_COM_READ_ANDX: | ||
190 | case SMB_COM_WRITE_ANDX: | ||
191 | case SMB_COM_CLOSE: | ||
192 | case SMB_COM_FIND_CLOSE2: | ||
193 | case SMB_COM_LOCKING_ANDX: { | ||
194 | unload_nls(nls_codepage); | ||
195 | return -EAGAIN; | ||
196 | } | ||
197 | } | ||
198 | } else { | ||
199 | up(&tcon->ses->sesSem); | ||
200 | } | ||
201 | unload_nls(nls_codepage); | ||
202 | 147 | ||
203 | } else { | 148 | /* is the TCP session reestablished now? */ |
204 | return -EIO; | 149 | if (server->tcpStatus != CifsNeedReconnect) |
150 | break; | ||
151 | |||
152 | /* | ||
153 | * on "soft" mounts we wait once. Hard mounts keep | ||
154 | * retrying until the process is killed or the server comes | ||
155 | * back on-line | ||
156 | */ | ||
157 | if (!tcon->retry || ses->status == CifsExiting) { | ||
158 | cFYI(1, ("gave up waiting on reconnect in smb_init")); | ||
159 | return -EHOSTDOWN; | ||
205 | } | 160 | } |
206 | } | 161 | } |
162 | |||
163 | if (!ses->need_reconnect && !tcon->need_reconnect) | ||
164 | return 0; | ||
165 | |||
166 | nls_codepage = load_nls_default(); | ||
167 | |||
168 | /* | ||
169 | * need to prevent multiple threads trying to simultaneously | ||
170 | * reconnect the same SMB session | ||
171 | */ | ||
172 | down(&ses->sesSem); | ||
173 | if (ses->need_reconnect) | ||
174 | rc = cifs_setup_session(0, ses, nls_codepage); | ||
175 | |||
176 | /* do we need to reconnect tcon? */ | ||
177 | if (rc || !tcon->need_reconnect) { | ||
178 | up(&ses->sesSem); | ||
179 | goto out; | ||
180 | } | ||
181 | |||
182 | mark_open_files_invalid(tcon); | ||
183 | rc = CIFSTCon(0, ses, tcon->treeName, tcon, nls_codepage); | ||
184 | up(&ses->sesSem); | ||
185 | cFYI(1, ("reconnect tcon rc = %d", rc)); | ||
186 | |||
187 | if (rc) | ||
188 | goto out; | ||
189 | |||
190 | /* | ||
191 | * FIXME: check if wsize needs updated due to negotiated smb buffer | ||
192 | * size shrinking | ||
193 | */ | ||
194 | atomic_inc(&tconInfoReconnectCount); | ||
195 | |||
196 | /* tell server Unix caps we support */ | ||
197 | if (ses->capabilities & CAP_UNIX) | ||
198 | reset_cifs_unix_caps(0, tcon, NULL, NULL); | ||
199 | |||
200 | /* | ||
201 | * Removed call to reopen open files here. It is safer (and faster) to | ||
202 | * reopen files one at a time as needed in read and write. | ||
203 | * | ||
204 | * FIXME: what about file locks? don't we need to reclaim them ASAP? | ||
205 | */ | ||
206 | |||
207 | out: | ||
208 | /* | ||
209 | * Check if handle based operation so we know whether we can continue | ||
210 | * or not without returning to caller to reset file handle | ||
211 | */ | ||
212 | switch (smb_command) { | ||
213 | case SMB_COM_READ_ANDX: | ||
214 | case SMB_COM_WRITE_ANDX: | ||
215 | case SMB_COM_CLOSE: | ||
216 | case SMB_COM_FIND_CLOSE2: | ||
217 | case SMB_COM_LOCKING_ANDX: | ||
218 | rc = -EAGAIN; | ||
219 | } | ||
220 | |||
221 | unload_nls(nls_codepage); | ||
222 | return rc; | ||
223 | } | ||
224 | |||
225 | /* Allocate and return pointer to an SMB request buffer, and set basic | ||
226 | SMB information in the SMB header. If the return code is zero, this | ||
227 | function must have filled in request_buf pointer */ | ||
228 | static int | ||
229 | small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon, | ||
230 | void **request_buf) | ||
231 | { | ||
232 | int rc = 0; | ||
233 | |||
234 | rc = cifs_reconnect_tcon(tcon, smb_command); | ||
207 | if (rc) | 235 | if (rc) |
208 | return rc; | 236 | return rc; |
209 | 237 | ||
@@ -256,101 +284,7 @@ smb_init(int smb_command, int wct, struct cifsTconInfo *tcon, | |||
256 | { | 284 | { |
257 | int rc = 0; | 285 | int rc = 0; |
258 | 286 | ||
259 | /* SMBs NegProt, SessSetup, uLogoff do not have tcon yet so | 287 | rc = cifs_reconnect_tcon(tcon, smb_command); |
260 | check for tcp and smb session status done differently | ||
261 | for those three - in the calling routine */ | ||
262 | if (tcon) { | ||
263 | if (tcon->tidStatus == CifsExiting) { | ||
264 | /* only tree disconnect, open, and write, | ||
265 | (and ulogoff which does not have tcon) | ||
266 | are allowed as we start force umount */ | ||
267 | if ((smb_command != SMB_COM_WRITE_ANDX) && | ||
268 | (smb_command != SMB_COM_OPEN_ANDX) && | ||
269 | (smb_command != SMB_COM_TREE_DISCONNECT)) { | ||
270 | cFYI(1, ("can not send cmd %d while umounting", | ||
271 | smb_command)); | ||
272 | return -ENODEV; | ||
273 | } | ||
274 | } | ||
275 | |||
276 | if ((tcon->ses) && (tcon->ses->status != CifsExiting) && | ||
277 | (tcon->ses->server)) { | ||
278 | struct nls_table *nls_codepage; | ||
279 | /* Give Demultiplex thread up to 10 seconds to | ||
280 | reconnect, should be greater than cifs socket | ||
281 | timeout which is 7 seconds */ | ||
282 | while (tcon->ses->server->tcpStatus == | ||
283 | CifsNeedReconnect) { | ||
284 | wait_event_interruptible_timeout(tcon->ses->server->response_q, | ||
285 | (tcon->ses->server->tcpStatus == | ||
286 | CifsGood), 10 * HZ); | ||
287 | if (tcon->ses->server->tcpStatus == | ||
288 | CifsNeedReconnect) { | ||
289 | /* on "soft" mounts we wait once */ | ||
290 | if (!tcon->retry || | ||
291 | (tcon->ses->status == CifsExiting)) { | ||
292 | cFYI(1, ("gave up waiting on " | ||
293 | "reconnect in smb_init")); | ||
294 | return -EHOSTDOWN; | ||
295 | } /* else "hard" mount - keep retrying | ||
296 | until process is killed or server | ||
297 | comes on-line */ | ||
298 | } else /* TCP session is reestablished now */ | ||
299 | break; | ||
300 | } | ||
301 | nls_codepage = load_nls_default(); | ||
302 | /* need to prevent multiple threads trying to | ||
303 | simultaneously reconnect the same SMB session */ | ||
304 | down(&tcon->ses->sesSem); | ||
305 | if (tcon->ses->need_reconnect) | ||
306 | rc = cifs_setup_session(0, tcon->ses, | ||
307 | nls_codepage); | ||
308 | if (!rc && (tcon->need_reconnect)) { | ||
309 | mark_open_files_invalid(tcon); | ||
310 | rc = CIFSTCon(0, tcon->ses, tcon->treeName, | ||
311 | tcon, nls_codepage); | ||
312 | up(&tcon->ses->sesSem); | ||
313 | /* BB FIXME add code to check if wsize needs | ||
314 | update due to negotiated smb buffer size | ||
315 | shrinking */ | ||
316 | if (rc == 0) { | ||
317 | atomic_inc(&tconInfoReconnectCount); | ||
318 | /* tell server Unix caps we support */ | ||
319 | if (tcon->ses->capabilities & CAP_UNIX) | ||
320 | reset_cifs_unix_caps( | ||
321 | 0 /* no xid */, | ||
322 | tcon, | ||
323 | NULL /* do not know sb */, | ||
324 | NULL /* no vol info */); | ||
325 | } | ||
326 | |||
327 | cFYI(1, ("reconnect tcon rc = %d", rc)); | ||
328 | /* Removed call to reopen open files here. | ||
329 | It is safer (and faster) to reopen files | ||
330 | one at a time as needed in read and write */ | ||
331 | |||
332 | /* Check if handle based operation so we | ||
333 | know whether we can continue or not without | ||
334 | returning to caller to reset file handle */ | ||
335 | switch (smb_command) { | ||
336 | case SMB_COM_READ_ANDX: | ||
337 | case SMB_COM_WRITE_ANDX: | ||
338 | case SMB_COM_CLOSE: | ||
339 | case SMB_COM_FIND_CLOSE2: | ||
340 | case SMB_COM_LOCKING_ANDX: { | ||
341 | unload_nls(nls_codepage); | ||
342 | return -EAGAIN; | ||
343 | } | ||
344 | } | ||
345 | } else { | ||
346 | up(&tcon->ses->sesSem); | ||
347 | } | ||
348 | unload_nls(nls_codepage); | ||
349 | |||
350 | } else { | ||
351 | return -EIO; | ||
352 | } | ||
353 | } | ||
354 | if (rc) | 288 | if (rc) |
355 | return rc; | 289 | return rc; |
356 | 290 | ||
@@ -3961,6 +3895,10 @@ parse_DFS_referrals(TRANSACTION2_GET_DFS_REFER_RSP *pSMBr, | |||
3961 | if (is_unicode) { | 3895 | if (is_unicode) { |
3962 | __le16 *tmp = kmalloc(strlen(searchName)*2 + 2, | 3896 | __le16 *tmp = kmalloc(strlen(searchName)*2 + 2, |
3963 | GFP_KERNEL); | 3897 | GFP_KERNEL); |
3898 | if (tmp == NULL) { | ||
3899 | rc = -ENOMEM; | ||
3900 | goto parse_DFS_referrals_exit; | ||
3901 | } | ||
3964 | cifsConvertToUCS((__le16 *) tmp, searchName, | 3902 | cifsConvertToUCS((__le16 *) tmp, searchName, |
3965 | PATH_MAX, nls_codepage, remap); | 3903 | PATH_MAX, nls_codepage, remap); |
3966 | node->path_consumed = cifs_ucs2_bytes(tmp, | 3904 | node->path_consumed = cifs_ucs2_bytes(tmp, |
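cifs_reconnect_tcon() deliberately converts success into -EAGAIN for the handle-based commands listed in its switch, because a rebuilt session invalidates every old netfid. A hedged sketch of how a caller is expected to react; this is illustrative, not a verbatim caller, and the argument lists are elided:

    rc = CIFSSMBWrite(xid, tcon, netfid, ...);
    if (rc == -EAGAIN) {
            /* session was torn down and rebuilt: the old file handle
             * is stale, so reopen it and retry instead of failing */
            rc = cifs_reopen_file(file, false);
            if (rc == 0)
                    rc = CIFSSMBWrite(xid, tcon, netfid, ...);
    }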
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 9bb5c8750736..d49682433c20 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c | |||
@@ -803,6 +803,10 @@ cifs_parse_mount_options(char *options, const char *devname, | |||
803 | char *data; | 803 | char *data; |
804 | unsigned int temp_len, i, j; | 804 | unsigned int temp_len, i, j; |
805 | char separator[2]; | 805 | char separator[2]; |
806 | short int override_uid = -1; | ||
807 | short int override_gid = -1; | ||
808 | bool uid_specified = false; | ||
809 | bool gid_specified = false; | ||
806 | 810 | ||
807 | separator[0] = ','; | 811 | separator[0] = ','; |
808 | separator[1] = 0; | 812 | separator[1] = 0; |
@@ -1093,18 +1097,20 @@ cifs_parse_mount_options(char *options, const char *devname, | |||
1093 | "too long.\n"); | 1097 | "too long.\n"); |
1094 | return 1; | 1098 | return 1; |
1095 | } | 1099 | } |
1096 | } else if (strnicmp(data, "uid", 3) == 0) { | 1100 | } else if (!strnicmp(data, "uid", 3) && value && *value) { |
1097 | if (value && *value) | 1101 | vol->linux_uid = simple_strtoul(value, &value, 0); |
1098 | vol->linux_uid = | 1102 | uid_specified = true; |
1099 | simple_strtoul(value, &value, 0); | 1103 | } else if (!strnicmp(data, "forceuid", 8)) { |
1100 | } else if (strnicmp(data, "forceuid", 8) == 0) { | 1104 | override_uid = 1; |
1101 | vol->override_uid = 1; | 1105 | } else if (!strnicmp(data, "noforceuid", 10)) { |
1102 | } else if (strnicmp(data, "gid", 3) == 0) { | 1106 | override_uid = 0; |
1103 | if (value && *value) | 1107 | } else if (!strnicmp(data, "gid", 3) && value && *value) { |
1104 | vol->linux_gid = | 1108 | vol->linux_gid = simple_strtoul(value, &value, 0); |
1105 | simple_strtoul(value, &value, 0); | 1109 | gid_specified = true; |
1106 | } else if (strnicmp(data, "forcegid", 8) == 0) { | 1110 | } else if (!strnicmp(data, "forcegid", 8)) { |
1107 | vol->override_gid = 1; | 1111 | override_gid = 1; |
1112 | } else if (!strnicmp(data, "noforcegid", 10)) { | ||
1113 | override_gid = 0; | ||
1108 | } else if (strnicmp(data, "file_mode", 4) == 0) { | 1114 | } else if (strnicmp(data, "file_mode", 4) == 0) { |
1109 | if (value && *value) { | 1115 | if (value && *value) { |
1110 | vol->file_mode = | 1116 | vol->file_mode = |
@@ -1355,11 +1361,23 @@ cifs_parse_mount_options(char *options, const char *devname, | |||
1355 | if (vol->UNCip == NULL) | 1361 | if (vol->UNCip == NULL) |
1356 | vol->UNCip = &vol->UNC[2]; | 1362 | vol->UNCip = &vol->UNC[2]; |
1357 | 1363 | ||
1364 | if (uid_specified) | ||
1365 | vol->override_uid = override_uid; | ||
1366 | else if (override_uid == 1) | ||
1367 | printk(KERN_NOTICE "CIFS: ignoring forceuid mount option " | ||
1368 | "specified with no uid= option.\n"); | ||
1369 | |||
1370 | if (gid_specified) | ||
1371 | vol->override_gid = override_gid; | ||
1372 | else if (override_gid == 1) | ||
1373 | printk(KERN_NOTICE "CIFS: ignoring forcegid mount option " | ||
1374 | "specified with no gid= option.\n"); | ||
1375 | |||
1358 | return 0; | 1376 | return 0; |
1359 | } | 1377 | } |
1360 | 1378 | ||
1361 | static struct TCP_Server_Info * | 1379 | static struct TCP_Server_Info * |
1362 | cifs_find_tcp_session(struct sockaddr_storage *addr) | 1380 | cifs_find_tcp_session(struct sockaddr_storage *addr, unsigned short int port) |
1363 | { | 1381 | { |
1364 | struct list_head *tmp; | 1382 | struct list_head *tmp; |
1365 | struct TCP_Server_Info *server; | 1383 | struct TCP_Server_Info *server; |
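forceuid/noforceuid (and the gid twins) are only staged in local variables during the token loop and committed after parsing finishes. That lets the last of forceuid/noforceuid win regardless of where it appears relative to uid=, and makes it possible to warn when forceuid is given without any uid= to enforce: in shell terms, "-o uid=500,forceuid" and "-o forceuid,uid=500" now behave identically. The two-phase shape, reduced to its core:

    bool uid_specified = false;
    short int override_uid = -1;    /* -1: neither force nor noforce seen */

    /* ...token loop: uid= sets uid_specified, forceuid/noforceuid set
     * override_uid to 1/0, in whatever order they appear... */

    if (uid_specified)
            vol->override_uid = override_uid;
    else if (override_uid == 1)
            printk(KERN_NOTICE "CIFS: ignoring forceuid mount option "
                   "specified with no uid= option.\n");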
@@ -1379,16 +1397,37 @@ cifs_find_tcp_session(struct sockaddr_storage *addr) | |||
1379 | if (server->tcpStatus == CifsNew) | 1397 | if (server->tcpStatus == CifsNew) |
1380 | continue; | 1398 | continue; |
1381 | 1399 | ||
1382 | if (addr->ss_family == AF_INET && | 1400 | switch (addr->ss_family) { |
1383 | (addr4->sin_addr.s_addr != | 1401 | case AF_INET: |
1384 | server->addr.sockAddr.sin_addr.s_addr)) | 1402 | if (addr4->sin_addr.s_addr == |
1385 | continue; | 1403 | server->addr.sockAddr.sin_addr.s_addr) { |
1386 | else if (addr->ss_family == AF_INET6 && | 1404 | addr4->sin_port = htons(port); |
1387 | (!ipv6_addr_equal(&server->addr.sockAddr6.sin6_addr, | 1405 | /* user overrode default port? */ |
1388 | &addr6->sin6_addr) || | 1406 | if (addr4->sin_port) { |
1389 | server->addr.sockAddr6.sin6_scope_id != | 1407 | if (addr4->sin_port != |
1390 | addr6->sin6_scope_id)) | 1408 | server->addr.sockAddr.sin_port) |
1391 | continue; | 1409 | continue; |
1410 | } | ||
1411 | break; | ||
1412 | } else | ||
1413 | continue; | ||
1414 | |||
1415 | case AF_INET6: | ||
1416 | if (ipv6_addr_equal(&addr6->sin6_addr, | ||
1417 | &server->addr.sockAddr6.sin6_addr) && | ||
1418 | (addr6->sin6_scope_id == | ||
1419 | server->addr.sockAddr6.sin6_scope_id)) { | ||
1420 | addr6->sin6_port = htons(port); | ||
1421 | /* user overrode default port? */ | ||
1422 | if (addr6->sin6_port) { | ||
1423 | if (addr6->sin6_port != | ||
1424 | server->addr.sockAddr6.sin6_port) | ||
1425 | continue; | ||
1426 | } | ||
1427 | break; | ||
1428 | } else | ||
1429 | continue; | ||
1430 | } | ||
1392 | 1431 | ||
1393 | ++server->srv_count; | 1432 | ++server->srv_count; |
1394 | write_unlock(&cifs_tcp_ses_lock); | 1433 | write_unlock(&cifs_tcp_ses_lock); |
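The port handling above is what lets a mount that explicitly overrides the port get a fresh TCP session instead of silently reusing an existing one on the default port (the fix noted in the 1.60 CHANGES entry). The matching rule reduced to a predicate; port_matches() is a hypothetical helper, not part of the patch:

    /* port 0 means the user gave no override, so any existing session
     * port is acceptable; a nonzero request must match exactly */
    static bool port_matches(unsigned short requested, __be16 existing)
    {
            return requested == 0 || htons(requested) == existing;
    }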
@@ -1457,7 +1496,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info) | |||
1457 | } | 1496 | } |
1458 | 1497 | ||
1459 | /* see if we already have a matching tcp_ses */ | 1498 | /* see if we already have a matching tcp_ses */ |
1460 | tcp_ses = cifs_find_tcp_session(&addr); | 1499 | tcp_ses = cifs_find_tcp_session(&addr, volume_info->port); |
1461 | if (tcp_ses) | 1500 | if (tcp_ses) |
1462 | return tcp_ses; | 1501 | return tcp_ses; |
1463 | 1502 | ||
@@ -2452,10 +2491,10 @@ try_mount_again: | |||
2452 | tcon->local_lease = volume_info->local_lease; | 2491 | tcon->local_lease = volume_info->local_lease; |
2453 | } | 2492 | } |
2454 | if (pSesInfo) { | 2493 | if (pSesInfo) { |
2455 | if (pSesInfo->capabilities & CAP_LARGE_FILES) { | 2494 | if (pSesInfo->capabilities & CAP_LARGE_FILES) |
2456 | sb->s_maxbytes = (u64) 1 << 63; | 2495 | sb->s_maxbytes = MAX_LFS_FILESIZE; |
2457 | } else | 2496 | else |
2458 | sb->s_maxbytes = (u64) 1 << 31; /* 2 GB */ | 2497 | sb->s_maxbytes = MAX_NON_LFS; |
2459 | } | 2498 | } |
2460 | 2499 | ||
2461 | /* BB FIXME fix time_gran to be larger for LANMAN sessions */ | 2500 | /* BB FIXME fix time_gran to be larger for LANMAN sessions */ |
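MAX_NON_LFS and MAX_LFS_FILESIZE are the VFS's own ceilings for file offsets without and with large-file support; clamping s_maxbytes to them, rather than the open-coded 1<<63 that overflows the VFS's signed offset checks, is what makes sendfile work again per the 1.60 CHANGES entry. Roughly what include/linux/fs.h of this era defines, quoted from memory and so approximate:

    #define MAX_NON_LFS     ((1UL << 31) - 1)
    #if BITS_PER_LONG == 32
    #define MAX_LFS_FILESIZE (((u64)PAGE_CACHE_SIZE << (BITS_PER_LONG - 1)) - 1)
    #elif BITS_PER_LONG == 64
    #define MAX_LFS_FILESIZE 0x7fffffffffffffffUL
    #endif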
@@ -2544,11 +2583,20 @@ remote_path_check: | |||
2544 | 2583 | ||
2545 | if (mount_data != mount_data_global) | 2584 | if (mount_data != mount_data_global) |
2546 | kfree(mount_data); | 2585 | kfree(mount_data); |
2586 | |||
2547 | mount_data = cifs_compose_mount_options( | 2587 | mount_data = cifs_compose_mount_options( |
2548 | cifs_sb->mountdata, full_path + 1, | 2588 | cifs_sb->mountdata, full_path + 1, |
2549 | referrals, &fake_devname); | 2589 | referrals, &fake_devname); |
2550 | kfree(fake_devname); | 2590 | |
2551 | free_dfs_info_array(referrals, num_referrals); | 2591 | free_dfs_info_array(referrals, num_referrals); |
2592 | kfree(fake_devname); | ||
2593 | kfree(full_path); | ||
2594 | |||
2595 | if (IS_ERR(mount_data)) { | ||
2596 | rc = PTR_ERR(mount_data); | ||
2597 | mount_data = NULL; | ||
2598 | goto mount_fail_check; | ||
2599 | } | ||
2552 | 2600 | ||
2553 | if (tcon) | 2601 | if (tcon) |
2554 | cifs_put_tcon(tcon); | 2602 | cifs_put_tcon(tcon); |
@@ -2556,8 +2604,6 @@ remote_path_check: | |||
2556 | cifs_put_smb_ses(pSesInfo); | 2604 | cifs_put_smb_ses(pSesInfo); |
2557 | 2605 | ||
2558 | cleanup_volume_info(&volume_info); | 2606 | cleanup_volume_info(&volume_info); |
2559 | FreeXid(xid); | ||
2560 | kfree(full_path); | ||
2561 | referral_walks_count++; | 2607 | referral_walks_count++; |
2562 | goto try_mount_again; | 2608 | goto try_mount_again; |
2563 | } | 2609 | } |
@@ -2611,9 +2657,9 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses, | |||
2611 | return -EIO; | 2657 | return -EIO; |
2612 | 2658 | ||
2613 | smb_buffer = cifs_buf_get(); | 2659 | smb_buffer = cifs_buf_get(); |
2614 | if (smb_buffer == NULL) { | 2660 | if (smb_buffer == NULL) |
2615 | return -ENOMEM; | 2661 | return -ENOMEM; |
2616 | } | 2662 | |
2617 | smb_buffer_response = smb_buffer; | 2663 | smb_buffer_response = smb_buffer; |
2618 | 2664 | ||
2619 | header_assemble(smb_buffer, SMB_COM_TREE_CONNECT_ANDX, | 2665 | header_assemble(smb_buffer, SMB_COM_TREE_CONNECT_ANDX, |
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 4326ffd90fa9..a6424cfc0121 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c | |||
@@ -153,7 +153,7 @@ cifs_fill_fileinfo(struct inode *newinode, __u16 fileHandle, | |||
153 | mutex_init(&pCifsFile->fh_mutex); | 153 | mutex_init(&pCifsFile->fh_mutex); |
154 | mutex_init(&pCifsFile->lock_mutex); | 154 | mutex_init(&pCifsFile->lock_mutex); |
155 | INIT_LIST_HEAD(&pCifsFile->llist); | 155 | INIT_LIST_HEAD(&pCifsFile->llist); |
156 | atomic_set(&pCifsFile->wrtPending, 0); | 156 | atomic_set(&pCifsFile->count, 1); |
157 | 157 | ||
158 | /* set the following in open now | 158 | /* set the following in open now |
159 | pCifsFile->pfile = file; */ | 159 | pCifsFile->pfile = file; */ |
diff --git a/fs/cifs/file.c b/fs/cifs/file.c index c34b7f8a217b..fa7beac8b80e 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c | |||
@@ -53,11 +53,9 @@ static inline struct cifsFileInfo *cifs_init_private( | |||
53 | private_data->pInode = inode; | 53 | private_data->pInode = inode; |
54 | private_data->invalidHandle = false; | 54 | private_data->invalidHandle = false; |
55 | private_data->closePend = false; | 55 | private_data->closePend = false; |
56 | /* we have to track num writers to the inode, since writepages | 56 | /* Initialize reference count to one. The private data is |
57 | does not tell us which handle the write is for so there can | 57 | freed on the release of the last reference */ |
58 | be a close (overlapping with write) of the filehandle that | 58 | atomic_set(&private_data->count, 1); |
59 | cifs_writepages chose to use */ | ||
60 | atomic_set(&private_data->wrtPending, 0); | ||
61 | 59 | ||
62 | return private_data; | 60 | return private_data; |
63 | } | 61 | } |
@@ -643,7 +641,7 @@ int cifs_close(struct inode *inode, struct file *file) | |||
643 | if (!pTcon->need_reconnect) { | 641 | if (!pTcon->need_reconnect) { |
644 | write_unlock(&GlobalSMBSeslock); | 642 | write_unlock(&GlobalSMBSeslock); |
645 | timeout = 2; | 643 | timeout = 2; |
646 | while ((atomic_read(&pSMBFile->wrtPending) != 0) | 644 | while ((atomic_read(&pSMBFile->count) != 1) |
647 | && (timeout <= 2048)) { | 645 | && (timeout <= 2048)) { |
648 | /* Give write a better chance to get to | 646 | /* Give write a better chance to get to |
649 | server ahead of the close. We do not | 647 | server ahead of the close. We do not |
@@ -657,8 +655,6 @@ int cifs_close(struct inode *inode, struct file *file) | |||
657 | msleep(timeout); | 655 | msleep(timeout); |
658 | timeout *= 4; | 656 | timeout *= 4; |
659 | } | 657 | } |
660 | if (atomic_read(&pSMBFile->wrtPending)) | ||
661 | cERROR(1, ("close with pending write")); | ||
662 | if (!pTcon->need_reconnect && | 658 | if (!pTcon->need_reconnect && |
663 | !pSMBFile->invalidHandle) | 659 | !pSMBFile->invalidHandle) |
664 | rc = CIFSSMBClose(xid, pTcon, | 660 | rc = CIFSSMBClose(xid, pTcon, |
@@ -681,24 +677,7 @@ int cifs_close(struct inode *inode, struct file *file) | |||
681 | list_del(&pSMBFile->flist); | 677 | list_del(&pSMBFile->flist); |
682 | list_del(&pSMBFile->tlist); | 678 | list_del(&pSMBFile->tlist); |
683 | write_unlock(&GlobalSMBSeslock); | 679 | write_unlock(&GlobalSMBSeslock); |
684 | timeout = 10; | 680 | cifsFileInfo_put(file->private_data); |
685 | /* We waited above to give the SMBWrite a chance to issue | ||
686 | on the wire (so we do not get SMBWrite returning EBADF | ||
687 | if writepages is racing with close. Note that writepages | ||
688 | does not specify a file handle, so it is possible for a file | ||
689 | to be opened twice, and the application close the "wrong" | ||
690 | file handle - in these cases we delay long enough to allow | ||
691 | the SMBWrite to get on the wire before the SMB Close. | ||
692 | We allow total wait here over 45 seconds, more than | ||
693 | oplock break time, and more than enough to allow any write | ||
694 | to complete on the server, or to time out on the client */ | ||
695 | while ((atomic_read(&pSMBFile->wrtPending) != 0) | ||
696 | && (timeout <= 50000)) { | ||
697 | cERROR(1, ("writes pending, delay free of handle")); | ||
698 | msleep(timeout); | ||
699 | timeout *= 8; | ||
700 | } | ||
701 | kfree(file->private_data); | ||
702 | file->private_data = NULL; | 681 | file->private_data = NULL; |
703 | } else | 682 | } else |
704 | rc = -EBADF; | 683 | rc = -EBADF; |
@@ -1236,7 +1215,7 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode) | |||
1236 | if (!open_file->invalidHandle) { | 1215 | if (!open_file->invalidHandle) { |
1237 | /* found a good file */ | 1216 | /* found a good file */ |
1238 | /* lock it so it will not be closed on us */ | 1217 | /* lock it so it will not be closed on us */ |
1239 | atomic_inc(&open_file->wrtPending); | 1218 | cifsFileInfo_get(open_file); |
1240 | read_unlock(&GlobalSMBSeslock); | 1219 | read_unlock(&GlobalSMBSeslock); |
1241 | return open_file; | 1220 | return open_file; |
1242 | } /* else might as well continue, and look for | 1221 | } /* else might as well continue, and look for |
@@ -1276,7 +1255,7 @@ refind_writable: | |||
1276 | if (open_file->pfile && | 1255 | if (open_file->pfile && |
1277 | ((open_file->pfile->f_flags & O_RDWR) || | 1256 | ((open_file->pfile->f_flags & O_RDWR) || |
1278 | (open_file->pfile->f_flags & O_WRONLY))) { | 1257 | (open_file->pfile->f_flags & O_WRONLY))) { |
1279 | atomic_inc(&open_file->wrtPending); | 1258 | cifsFileInfo_get(open_file); |
1280 | 1259 | ||
1281 | if (!open_file->invalidHandle) { | 1260 | if (!open_file->invalidHandle) { |
1282 | /* found a good writable file */ | 1261 | /* found a good writable file */ |
@@ -1293,7 +1272,7 @@ refind_writable: | |||
1293 | else { /* start over in case this was deleted */ | 1272 | else { /* start over in case this was deleted */ |
1294 | /* since the list could be modified */ | 1273 | /* since the list could be modified */ |
1295 | read_lock(&GlobalSMBSeslock); | 1274 | read_lock(&GlobalSMBSeslock); |
1296 | atomic_dec(&open_file->wrtPending); | 1275 | cifsFileInfo_put(open_file); |
1297 | goto refind_writable; | 1276 | goto refind_writable; |
1298 | } | 1277 | } |
1299 | } | 1278 | } |
@@ -1309,7 +1288,7 @@ refind_writable: | |||
1309 | read_lock(&GlobalSMBSeslock); | 1288 | read_lock(&GlobalSMBSeslock); |
1310 | /* can not use this handle, no write | 1289 | /* can not use this handle, no write |
1311 | pending on this one after all */ | 1290 | pending on this one after all */ |
1312 | atomic_dec(&open_file->wrtPending); | 1291 | cifsFileInfo_put(open_file); |
1313 | 1292 | ||
1314 | if (open_file->closePend) /* list could have changed */ | 1293 | if (open_file->closePend) /* list could have changed */ |
1315 | goto refind_writable; | 1294 | goto refind_writable; |
@@ -1373,7 +1352,7 @@ static int cifs_partialpagewrite(struct page *page, unsigned from, unsigned to) | |||
1373 | if (open_file) { | 1352 | if (open_file) { |
1374 | bytes_written = cifs_write(open_file->pfile, write_data, | 1353 | bytes_written = cifs_write(open_file->pfile, write_data, |
1375 | to-from, &offset); | 1354 | to-from, &offset); |
1376 | atomic_dec(&open_file->wrtPending); | 1355 | cifsFileInfo_put(open_file); |
1377 | /* Does mm or vfs already set times? */ | 1356 | /* Does mm or vfs already set times? */ |
1378 | inode->i_atime = inode->i_mtime = current_fs_time(inode->i_sb); | 1357 | inode->i_atime = inode->i_mtime = current_fs_time(inode->i_sb); |
1379 | if ((bytes_written > 0) && (offset)) | 1358 | if ((bytes_written > 0) && (offset)) |
@@ -1562,7 +1541,7 @@ retry: | |||
1562 | bytes_to_write, offset, | 1541 | bytes_to_write, offset, |
1563 | &bytes_written, iov, n_iov, | 1542 | &bytes_written, iov, n_iov, |
1564 | long_op); | 1543 | long_op); |
1565 | atomic_dec(&open_file->wrtPending); | 1544 | cifsFileInfo_put(open_file); |
1566 | cifs_update_eof(cifsi, offset, bytes_written); | 1545 | cifs_update_eof(cifsi, offset, bytes_written); |
1567 | 1546 | ||
1568 | if (rc || bytes_written < bytes_to_write) { | 1547 | if (rc || bytes_written < bytes_to_write) { |
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 18afe57b2461..1f09c7619319 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c | |||
@@ -212,7 +212,7 @@ cifs_unix_basic_to_fattr(struct cifs_fattr *fattr, FILE_UNIX_BASIC_INFO *info, | |||
212 | * junction to the new submount (ie to setup the fake directory | 212 | * junction to the new submount (ie to setup the fake directory |
213 | * which represents a DFS referral). | 213 | * which represents a DFS referral). |
214 | */ | 214 | */ |
215 | void | 215 | static void |
216 | cifs_create_dfs_fattr(struct cifs_fattr *fattr, struct super_block *sb) | 216 | cifs_create_dfs_fattr(struct cifs_fattr *fattr, struct super_block *sb) |
217 | { | 217 | { |
218 | struct cifs_sb_info *cifs_sb = CIFS_SB(sb); | 218 | struct cifs_sb_info *cifs_sb = CIFS_SB(sb); |
@@ -388,7 +388,7 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path, | |||
388 | } | 388 | } |
389 | 389 | ||
390 | /* Fill a cifs_fattr struct with info from FILE_ALL_INFO */ | 390 | /* Fill a cifs_fattr struct with info from FILE_ALL_INFO */ |
391 | void | 391 | static void |
392 | cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info, | 392 | cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info, |
393 | struct cifs_sb_info *cifs_sb, bool adjust_tz) | 393 | struct cifs_sb_info *cifs_sb, bool adjust_tz) |
394 | { | 394 | { |
@@ -513,9 +513,12 @@ int cifs_get_inode_info(struct inode **pinode, | |||
513 | cifs_sb->mnt_cifs_flags & | 513 | cifs_sb->mnt_cifs_flags & |
514 | CIFS_MOUNT_MAP_SPECIAL_CHR); | 514 | CIFS_MOUNT_MAP_SPECIAL_CHR); |
515 | if (rc1) { | 515 | if (rc1) { |
516 | /* BB EOPNOSUPP disable SERVER_INUM? */ | ||
517 | cFYI(1, ("GetSrvInodeNum rc %d", rc1)); | 516 | cFYI(1, ("GetSrvInodeNum rc %d", rc1)); |
518 | fattr.cf_uniqueid = iunique(sb, ROOT_I); | 517 | fattr.cf_uniqueid = iunique(sb, ROOT_I); |
518 | /* disable serverino if call not supported */ | ||
519 | if (rc1 == -EINVAL) | ||
520 | cifs_sb->mnt_cifs_flags &= | ||
521 | ~CIFS_MOUNT_SERVER_INUM; | ||
519 | } | 522 | } |
520 | } else { | 523 | } else { |
521 | fattr.cf_uniqueid = iunique(sb, ROOT_I); | 524 | fattr.cf_uniqueid = iunique(sb, ROOT_I); |
@@ -797,7 +800,7 @@ set_via_filehandle: | |||
797 | if (open_file == NULL) | 800 | if (open_file == NULL) |
798 | CIFSSMBClose(xid, pTcon, netfid); | 801 | CIFSSMBClose(xid, pTcon, netfid); |
799 | else | 802 | else |
800 | atomic_dec(&open_file->wrtPending); | 803 | cifsFileInfo_put(open_file); |
801 | out: | 804 | out: |
802 | return rc; | 805 | return rc; |
803 | } | 806 | } |
@@ -1632,7 +1635,7 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs, | |||
1632 | __u32 npid = open_file->pid; | 1635 | __u32 npid = open_file->pid; |
1633 | rc = CIFSSMBSetFileSize(xid, pTcon, attrs->ia_size, nfid, | 1636 | rc = CIFSSMBSetFileSize(xid, pTcon, attrs->ia_size, nfid, |
1634 | npid, false); | 1637 | npid, false); |
1635 | atomic_dec(&open_file->wrtPending); | 1638 | cifsFileInfo_put(open_file); |
1636 | cFYI(1, ("SetFSize for attrs rc = %d", rc)); | 1639 | cFYI(1, ("SetFSize for attrs rc = %d", rc)); |
1637 | if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) { | 1640 | if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) { |
1638 | unsigned int bytes_written; | 1641 | unsigned int bytes_written; |
@@ -1787,7 +1790,7 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs) | |||
1787 | u16 nfid = open_file->netfid; | 1790 | u16 nfid = open_file->netfid; |
1788 | u32 npid = open_file->pid; | 1791 | u32 npid = open_file->pid; |
1789 | rc = CIFSSMBUnixSetFileInfo(xid, pTcon, args, nfid, npid); | 1792 | rc = CIFSSMBUnixSetFileInfo(xid, pTcon, args, nfid, npid); |
1790 | atomic_dec(&open_file->wrtPending); | 1793 | cifsFileInfo_put(open_file); |
1791 | } else { | 1794 | } else { |
1792 | rc = CIFSSMBUnixSetPathInfo(xid, pTcon, full_path, args, | 1795 | rc = CIFSSMBUnixSetPathInfo(xid, pTcon, full_path, args, |
1793 | cifs_sb->local_nls, | 1796 | cifs_sb->local_nls, |
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 0ad3e2d116a6..1da4ab250eae 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c | |||
@@ -119,20 +119,19 @@ AllocOplockQEntry(struct inode *pinode, __u16 fid, struct cifsTconInfo *tcon) | |||
119 | temp->pinode = pinode; | 119 | temp->pinode = pinode; |
120 | temp->tcon = tcon; | 120 | temp->tcon = tcon; |
121 | temp->netfid = fid; | 121 | temp->netfid = fid; |
122 | spin_lock(&GlobalMid_Lock); | 122 | spin_lock(&cifs_oplock_lock); |
123 | list_add_tail(&temp->qhead, &GlobalOplock_Q); | 123 | list_add_tail(&temp->qhead, &cifs_oplock_list); |
124 | spin_unlock(&GlobalMid_Lock); | 124 | spin_unlock(&cifs_oplock_lock); |
125 | } | 125 | } |
126 | return temp; | 126 | return temp; |
127 | |||
128 | } | 127 | } |
129 | 128 | ||
130 | void DeleteOplockQEntry(struct oplock_q_entry *oplockEntry) | 129 | void DeleteOplockQEntry(struct oplock_q_entry *oplockEntry) |
131 | { | 130 | { |
132 | spin_lock(&GlobalMid_Lock); | 131 | spin_lock(&cifs_oplock_lock); |
133 | /* should we check if list empty first? */ | 132 | /* should we check if list empty first? */ |
134 | list_del(&oplockEntry->qhead); | 133 | list_del(&oplockEntry->qhead); |
135 | spin_unlock(&GlobalMid_Lock); | 134 | spin_unlock(&cifs_oplock_lock); |
136 | kmem_cache_free(cifs_oplock_cachep, oplockEntry); | 135 | kmem_cache_free(cifs_oplock_cachep, oplockEntry); |
137 | } | 136 | } |
138 | 137 | ||
@@ -144,14 +143,14 @@ void DeleteTconOplockQEntries(struct cifsTconInfo *tcon) | |||
144 | if (tcon == NULL) | 143 | if (tcon == NULL) |
145 | return; | 144 | return; |
146 | 145 | ||
147 | spin_lock(&GlobalMid_Lock); | 146 | spin_lock(&cifs_oplock_lock); |
148 | list_for_each_entry(temp, &GlobalOplock_Q, qhead) { | 147 | list_for_each_entry(temp, &cifs_oplock_list, qhead) { |
149 | if ((temp->tcon) && (temp->tcon == tcon)) { | 148 | if ((temp->tcon) && (temp->tcon == tcon)) { |
150 | list_del(&temp->qhead); | 149 | list_del(&temp->qhead); |
151 | kmem_cache_free(cifs_oplock_cachep, temp); | 150 | kmem_cache_free(cifs_oplock_cachep, temp); |
152 | } | 151 | } |
153 | } | 152 | } |
154 | spin_unlock(&GlobalMid_Lock); | 153 | spin_unlock(&cifs_oplock_lock); |
155 | } | 154 | } |
156 | 155 | ||
157 | static int | 156 | static int |
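These transport.c hunks move the oplock queue off GlobalMid_Lock and onto a lock of its own. The list handling is unchanged; the win is purely reduced contention, since MID processing and oplock queue manipulation no longer serialize against each other. The shape of the new scheme, assuming the declarations land in the usual cifsfs glue code:

    #include <linux/list.h>
    #include <linux/spinlock.h>

    /* One lock per data structure, instead of one global lock for all. */
    static LIST_HEAD(cifs_oplock_list);
    static DEFINE_SPINLOCK(cifs_oplock_lock);

    static void oplock_enqueue(struct list_head *entry)
    {
            spin_lock(&cifs_oplock_lock);
            list_add_tail(entry, &cifs_oplock_list);
            spin_unlock(&cifs_oplock_lock);
    }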
diff --git a/fs/compat.c b/fs/compat.c index 94502dab972a..6d6f98fe64a0 100644 --- a/fs/compat.c +++ b/fs/compat.c | |||
@@ -1485,20 +1485,15 @@ int compat_do_execve(char * filename, | |||
1485 | if (!bprm) | 1485 | if (!bprm) |
1486 | goto out_files; | 1486 | goto out_files; |
1487 | 1487 | ||
1488 | retval = -ERESTARTNOINTR; | 1488 | retval = prepare_bprm_creds(bprm); |
1489 | if (mutex_lock_interruptible(&current->cred_guard_mutex)) | 1489 | if (retval) |
1490 | goto out_free; | 1490 | goto out_free; |
1491 | current->in_execve = 1; | ||
1492 | |||
1493 | retval = -ENOMEM; | ||
1494 | bprm->cred = prepare_exec_creds(); | ||
1495 | if (!bprm->cred) | ||
1496 | goto out_unlock; | ||
1497 | 1491 | ||
1498 | retval = check_unsafe_exec(bprm); | 1492 | retval = check_unsafe_exec(bprm); |
1499 | if (retval < 0) | 1493 | if (retval < 0) |
1500 | goto out_unlock; | 1494 | goto out_free; |
1501 | clear_in_exec = retval; | 1495 | clear_in_exec = retval; |
1496 | current->in_execve = 1; | ||
1502 | 1497 | ||
1503 | file = open_exec(filename); | 1498 | file = open_exec(filename); |
1504 | retval = PTR_ERR(file); | 1499 | retval = PTR_ERR(file); |
@@ -1547,7 +1542,6 @@ int compat_do_execve(char * filename, | |||
1547 | /* execve succeeded */ | 1542 | /* execve succeeded */ |
1548 | current->fs->in_exec = 0; | 1543 | current->fs->in_exec = 0; |
1549 | current->in_execve = 0; | 1544 | current->in_execve = 0; |
1550 | mutex_unlock(&current->cred_guard_mutex); | ||
1551 | acct_update_integrals(current); | 1545 | acct_update_integrals(current); |
1552 | free_bprm(bprm); | 1546 | free_bprm(bprm); |
1553 | if (displaced) | 1547 | if (displaced) |
@@ -1567,10 +1561,7 @@ out_file: | |||
1567 | out_unmark: | 1561 | out_unmark: |
1568 | if (clear_in_exec) | 1562 | if (clear_in_exec) |
1569 | current->fs->in_exec = 0; | 1563 | current->fs->in_exec = 0; |
1570 | |||
1571 | out_unlock: | ||
1572 | current->in_execve = 0; | 1564 | current->in_execve = 0; |
1573 | mutex_unlock(&current->cred_guard_mutex); | ||
1574 | 1565 | ||
1575 | out_free: | 1566 | out_free: |
1576 | free_bprm(bprm); | 1567 | free_bprm(bprm); |
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index f28f070a60fc..f91fd51b32e3 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c | |||
@@ -1905,6 +1905,7 @@ COMPATIBLE_IOCTL(FIONCLEX) | |||
1905 | COMPATIBLE_IOCTL(FIOASYNC) | 1905 | COMPATIBLE_IOCTL(FIOASYNC) |
1906 | COMPATIBLE_IOCTL(FIONBIO) | 1906 | COMPATIBLE_IOCTL(FIONBIO) |
1907 | COMPATIBLE_IOCTL(FIONREAD) /* This is also TIOCINQ */ | 1907 | COMPATIBLE_IOCTL(FIONREAD) /* This is also TIOCINQ */ |
1908 | COMPATIBLE_IOCTL(FS_IOC_FIEMAP) | ||
1908 | /* 0x00 */ | 1909 | /* 0x00 */ |
1909 | COMPATIBLE_IOCTL(FIBMAP) | 1910 | COMPATIBLE_IOCTL(FIBMAP) |
1910 | COMPATIBLE_IOCTL(FIGETBSZ) | 1911 | COMPATIBLE_IOCTL(FIGETBSZ) |
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index 4921e7426d95..a2f746066c5d 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c | |||
@@ -51,6 +51,7 @@ static const struct address_space_operations configfs_aops = { | |||
51 | }; | 51 | }; |
52 | 52 | ||
53 | static struct backing_dev_info configfs_backing_dev_info = { | 53 | static struct backing_dev_info configfs_backing_dev_info = { |
54 | .name = "configfs", | ||
54 | .ra_pages = 0, /* No readahead */ | 55 | .ra_pages = 0, /* No readahead */ |
55 | .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, | 56 | .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, |
56 | }; | 57 | }; |
diff --git a/fs/dcache.c b/fs/dcache.c index 9e5cd3c3a6ba..a100fa35a48f 100644 --- a/fs/dcache.c +++ b/fs/dcache.c | |||
@@ -32,6 +32,7 @@ | |||
32 | #include <linux/swap.h> | 32 | #include <linux/swap.h> |
33 | #include <linux/bootmem.h> | 33 | #include <linux/bootmem.h> |
34 | #include <linux/fs_struct.h> | 34 | #include <linux/fs_struct.h> |
35 | #include <linux/hardirq.h> | ||
35 | #include "internal.h" | 36 | #include "internal.h" |
36 | 37 | ||
37 | int sysctl_vfs_cache_pressure __read_mostly = 100; | 38 | int sysctl_vfs_cache_pressure __read_mostly = 100; |
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 618a60f03886..240cef14fe58 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c | |||
@@ -106,6 +106,7 @@ struct connection { | |||
106 | #define CF_CONNECT_PENDING 3 | 106 | #define CF_CONNECT_PENDING 3 |
107 | #define CF_INIT_PENDING 4 | 107 | #define CF_INIT_PENDING 4 |
108 | #define CF_IS_OTHERCON 5 | 108 | #define CF_IS_OTHERCON 5 |
109 | #define CF_CLOSE 6 | ||
109 | struct list_head writequeue; /* List of outgoing writequeue_entries */ | 110 | struct list_head writequeue; /* List of outgoing writequeue_entries */ |
110 | spinlock_t writequeue_lock; | 111 | spinlock_t writequeue_lock; |
111 | int (*rx_action) (struct connection *); /* What to do when active */ | 112 | int (*rx_action) (struct connection *); /* What to do when active */ |
@@ -299,6 +300,8 @@ static void lowcomms_write_space(struct sock *sk) | |||
299 | 300 | ||
300 | static inline void lowcomms_connect_sock(struct connection *con) | 301 | static inline void lowcomms_connect_sock(struct connection *con) |
301 | { | 302 | { |
303 | if (test_bit(CF_CLOSE, &con->flags)) | ||
304 | return; | ||
302 | if (!test_and_set_bit(CF_CONNECT_PENDING, &con->flags)) | 305 | if (!test_and_set_bit(CF_CONNECT_PENDING, &con->flags)) |
303 | queue_work(send_workqueue, &con->swork); | 306 | queue_work(send_workqueue, &con->swork); |
304 | } | 307 | } |
@@ -926,10 +929,8 @@ static void tcp_connect_to_sock(struct connection *con) | |||
926 | goto out_err; | 929 | goto out_err; |
927 | 930 | ||
928 | memset(&saddr, 0, sizeof(saddr)); | 931 | memset(&saddr, 0, sizeof(saddr)); |
929 | if (dlm_nodeid_to_addr(con->nodeid, &saddr)) { | 932 | if (dlm_nodeid_to_addr(con->nodeid, &saddr)) |
930 | sock_release(sock); | ||
931 | goto out_err; | 933 | goto out_err; |
932 | } | ||
933 | 934 | ||
934 | sock->sk->sk_user_data = con; | 935 | sock->sk->sk_user_data = con; |
935 | con->rx_action = receive_from_sock; | 936 | con->rx_action = receive_from_sock; |
@@ -1284,7 +1285,6 @@ out: | |||
1284 | static void send_to_sock(struct connection *con) | 1285 | static void send_to_sock(struct connection *con) |
1285 | { | 1286 | { |
1286 | int ret = 0; | 1287 | int ret = 0; |
1287 | ssize_t(*sendpage) (struct socket *, struct page *, int, size_t, int); | ||
1288 | const int msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL; | 1288 | const int msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL; |
1289 | struct writequeue_entry *e; | 1289 | struct writequeue_entry *e; |
1290 | int len, offset; | 1290 | int len, offset; |
@@ -1293,8 +1293,6 @@ static void send_to_sock(struct connection *con) | |||
1293 | if (con->sock == NULL) | 1293 | if (con->sock == NULL) |
1294 | goto out_connect; | 1294 | goto out_connect; |
1295 | 1295 | ||
1296 | sendpage = con->sock->ops->sendpage; | ||
1297 | |||
1298 | spin_lock(&con->writequeue_lock); | 1296 | spin_lock(&con->writequeue_lock); |
1299 | for (;;) { | 1297 | for (;;) { |
1300 | e = list_entry(con->writequeue.next, struct writequeue_entry, | 1298 | e = list_entry(con->writequeue.next, struct writequeue_entry, |
@@ -1309,8 +1307,8 @@ static void send_to_sock(struct connection *con) | |||
1309 | 1307 | ||
1310 | ret = 0; | 1308 | ret = 0; |
1311 | if (len) { | 1309 | if (len) { |
1312 | ret = sendpage(con->sock, e->page, offset, len, | 1310 | ret = kernel_sendpage(con->sock, e->page, offset, len, |
1313 | msg_flags); | 1311 | msg_flags); |
1314 | if (ret == -EAGAIN || ret == 0) { | 1312 | if (ret == -EAGAIN || ret == 0) { |
1315 | cond_resched(); | 1313 | cond_resched(); |
1316 | goto out; | 1314 | goto out; |
@@ -1370,6 +1368,13 @@ int dlm_lowcomms_close(int nodeid) | |||
1370 | log_print("closing connection to node %d", nodeid); | 1368 | log_print("closing connection to node %d", nodeid); |
1371 | con = nodeid2con(nodeid, 0); | 1369 | con = nodeid2con(nodeid, 0); |
1372 | if (con) { | 1370 | if (con) { |
1371 | clear_bit(CF_CONNECT_PENDING, &con->flags); | ||
1372 | clear_bit(CF_WRITE_PENDING, &con->flags); | ||
1373 | set_bit(CF_CLOSE, &con->flags); | ||
1374 | if (cancel_work_sync(&con->swork)) | ||
1375 | log_print("canceled swork for node %d", nodeid); | ||
1376 | if (cancel_work_sync(&con->rwork)) | ||
1377 | log_print("canceled rwork for node %d", nodeid); | ||
1373 | clean_one_writequeue(con); | 1378 | clean_one_writequeue(con); |
1374 | close_connection(con, true); | 1379 | close_connection(con, true); |
1375 | } | 1380 | } |
@@ -1395,9 +1400,10 @@ static void process_send_sockets(struct work_struct *work) | |||
1395 | 1400 | ||
1396 | if (test_and_clear_bit(CF_CONNECT_PENDING, &con->flags)) { | 1401 | if (test_and_clear_bit(CF_CONNECT_PENDING, &con->flags)) { |
1397 | con->connect_action(con); | 1402 | con->connect_action(con); |
1403 | set_bit(CF_WRITE_PENDING, &con->flags); | ||
1398 | } | 1404 | } |
1399 | clear_bit(CF_WRITE_PENDING, &con->flags); | 1405 | if (test_and_clear_bit(CF_WRITE_PENDING, &con->flags)) |
1400 | send_to_sock(con); | 1406 | send_to_sock(con); |
1401 | } | 1407 | } |
1402 | 1408 | ||
1403 | 1409 | ||
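The lowcomms changes give connection shutdown a defined ordering: dlm_lowcomms_close() clears the pending bits, sets CF_CLOSE so lowcomms_connect_sock() refuses to queue new connect attempts, then cancels both work items synchronously before tearing the socket down. A sketch of that ordering, reusing the struct connection fields from the hunks above:

    static void connection_shutdown(struct connection *con)
    {
            clear_bit(CF_CONNECT_PENDING, &con->flags);
            clear_bit(CF_WRITE_PENDING, &con->flags);
            set_bit(CF_CLOSE, &con->flags); /* future connects bail out early */

            /* cancel_work_sync() also waits for work already executing, so
             * no swork/rwork can touch the socket after this returns. */
            cancel_work_sync(&con->swork);
            cancel_work_sync(&con->rwork);
    }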
diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c index ccc9d62c462d..55ea369f43a9 100644 --- a/fs/dlm/netlink.c +++ b/fs/dlm/netlink.c | |||
@@ -63,7 +63,7 @@ static int send_data(struct sk_buff *skb) | |||
63 | return rv; | 63 | return rv; |
64 | } | 64 | } |
65 | 65 | ||
66 | return genlmsg_unicast(skb, listener_nlpid); | 66 | return genlmsg_unicast(&init_net, skb, listener_nlpid); |
67 | } | 67 | } |
68 | 68 | ||
69 | static int user_cmd(struct sk_buff *skb, struct genl_info *info) | 69 | static int user_cmd(struct sk_buff *skb, struct genl_info *info) |
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index af737bb56cb7..259525c9abb8 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c | |||
@@ -1303,6 +1303,13 @@ parse_tag_3_packet(struct ecryptfs_crypt_stat *crypt_stat, | |||
1303 | } | 1303 | } |
1304 | (*new_auth_tok)->session_key.encrypted_key_size = | 1304 | (*new_auth_tok)->session_key.encrypted_key_size = |
1305 | (body_size - (ECRYPTFS_SALT_SIZE + 5)); | 1305 | (body_size - (ECRYPTFS_SALT_SIZE + 5)); |
1306 | if ((*new_auth_tok)->session_key.encrypted_key_size | ||
1307 | > ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES) { | ||
1308 | printk(KERN_WARNING "Tag 3 packet contains key larger " | ||
1309 | "than ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES\n"); | ||
1310 | rc = -EINVAL; | ||
1311 | goto out_free; | ||
1312 | } | ||
1306 | if (unlikely(data[(*packet_size)++] != 0x04)) { | 1313 | if (unlikely(data[(*packet_size)++] != 0x04)) { |
1307 | printk(KERN_WARNING "Unknown version number [%d]\n", | 1314 | printk(KERN_WARNING "Unknown version number [%d]\n", |
1308 | data[(*packet_size) - 1]); | 1315 | data[(*packet_size) - 1]); |
@@ -1449,6 +1456,12 @@ parse_tag_11_packet(unsigned char *data, unsigned char *contents, | |||
1449 | rc = -EINVAL; | 1456 | rc = -EINVAL; |
1450 | goto out; | 1457 | goto out; |
1451 | } | 1458 | } |
1459 | if (unlikely((*tag_11_contents_size) > max_contents_bytes)) { | ||
1460 | printk(KERN_ERR "Literal data section in tag 11 packet exceeds " | ||
1461 | "expected size\n"); | ||
1462 | rc = -EINVAL; | ||
1463 | goto out; | ||
1464 | } | ||
1452 | if (data[(*packet_size)++] != 0x62) { | 1465 | if (data[(*packet_size)++] != 0x62) { |
1453 | printk(KERN_WARNING "Unrecognizable packet\n"); | 1466 | printk(KERN_WARNING "Unrecognizable packet\n"); |
1454 | rc = -EINVAL; | 1467 | rc = -EINVAL; |
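Both keystore hunks apply the same hardening: a length field parsed out of untrusted packet data is validated against the destination's capacity before anything is copied. A minimal sketch of the pattern; the names are illustrative, not the eCryptfs API:

    #include <linux/errno.h>
    #include <linux/string.h>
    #include <linux/types.h>

    static int copy_packet_field(u8 *dst, size_t dst_size,
                                 const u8 *src, size_t claimed_len)
    {
            /* Reject oversized claims outright; silent truncation would
             * just move the corruption somewhere harder to debug. */
            if (claimed_len > dst_size)
                    return -EINVAL;
            memcpy(dst, src, claimed_len);
            return 0;
    }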
diff --git a/fs/exec.c b/fs/exec.c --- a/fs/exec.c +++ b/fs/exec.c | |||
@@ -678,8 +678,8 @@ exit: | |||
678 | } | 678 | } |
679 | EXPORT_SYMBOL(open_exec); | 679 | EXPORT_SYMBOL(open_exec); |
680 | 680 | ||
681 | int kernel_read(struct file *file, unsigned long offset, | 681 | int kernel_read(struct file *file, loff_t offset, |
682 | char *addr, unsigned long count) | 682 | char *addr, unsigned long count) |
683 | { | 683 | { |
684 | mm_segment_t old_fs; | 684 | mm_segment_t old_fs; |
685 | loff_t pos = offset; | 685 | loff_t pos = offset; |
@@ -1016,6 +1016,35 @@ out: | |||
1016 | EXPORT_SYMBOL(flush_old_exec); | 1016 | EXPORT_SYMBOL(flush_old_exec); |
1017 | 1017 | ||
1018 | /* | 1018 | /* |
1019 | * Prepare credentials and lock ->cred_guard_mutex. | ||
1020 | * install_exec_creds() commits the new creds and drops the lock. | ||
1021 | * Or, if exec fails before, free_bprm() should release ->cred and | ||
1022 | * unlock. | ||
1023 | */ | ||
1024 | int prepare_bprm_creds(struct linux_binprm *bprm) | ||
1025 | { | ||
1026 | if (mutex_lock_interruptible(&current->cred_guard_mutex)) | ||
1027 | return -ERESTARTNOINTR; | ||
1028 | |||
1029 | bprm->cred = prepare_exec_creds(); | ||
1030 | if (likely(bprm->cred)) | ||
1031 | return 0; | ||
1032 | |||
1033 | mutex_unlock(&current->cred_guard_mutex); | ||
1034 | return -ENOMEM; | ||
1035 | } | ||
1036 | |||
1037 | void free_bprm(struct linux_binprm *bprm) | ||
1038 | { | ||
1039 | free_arg_pages(bprm); | ||
1040 | if (bprm->cred) { | ||
1041 | mutex_unlock(&current->cred_guard_mutex); | ||
1042 | abort_creds(bprm->cred); | ||
1043 | } | ||
1044 | kfree(bprm); | ||
1045 | } | ||
1046 | |||
1047 | /* | ||
1019 | * install the new credentials for this executable | 1048 | * install the new credentials for this executable |
1020 | */ | 1049 | */ |
1021 | void install_exec_creds(struct linux_binprm *bprm) | 1050 | void install_exec_creds(struct linux_binprm *bprm) |
@@ -1024,12 +1053,13 @@ void install_exec_creds(struct linux_binprm *bprm) | |||
1024 | 1053 | ||
1025 | commit_creds(bprm->cred); | 1054 | commit_creds(bprm->cred); |
1026 | bprm->cred = NULL; | 1055 | bprm->cred = NULL; |
1027 | 1056 | /* | |
1028 | /* cred_guard_mutex must be held at least to this point to prevent | 1057 | * cred_guard_mutex must be held at least to this point to prevent |
1029 | * ptrace_attach() from altering our determination of the task's | 1058 | * ptrace_attach() from altering our determination of the task's |
1030 | * credentials; any time after this it may be unlocked */ | 1059 | * credentials; any time after this it may be unlocked. |
1031 | 1060 | */ | |
1032 | security_bprm_committed_creds(bprm); | 1061 | security_bprm_committed_creds(bprm); |
1062 | mutex_unlock(&current->cred_guard_mutex); | ||
1033 | } | 1063 | } |
1034 | EXPORT_SYMBOL(install_exec_creds); | 1064 | EXPORT_SYMBOL(install_exec_creds); |
1035 | 1065 | ||
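With prepare_bprm_creds() and the relocated free_bprm() above, cred_guard_mutex has exactly two owners of its release: install_exec_creds() on success and free_bprm() on any failure, keyed off whether bprm->cred is still set. That is what lets both do_execve() and compat_do_execve() drop their out_unlock labels below. A condensed caller-side view of the invariant; do_the_exec() is a stand-in for the real binfmt loading:

    int do_the_exec(struct linux_binprm *bprm);  /* stand-in, not a real symbol */

    static int execve_sketch(struct linux_binprm *bprm)
    {
            int retval;

            retval = prepare_bprm_creds(bprm);      /* takes cred_guard_mutex */
            if (retval)
                    return retval;                  /* mutex was never taken */

            /* On success this path ends in install_exec_creds(), which
             * commits bprm->cred, clears it, and drops the mutex. */
            retval = do_the_exec(bprm);

            /* Unconditional: free_bprm() unlocks and aborts the creds only
             * if bprm->cred survived, i.e. only on failure before commit. */
            free_bprm(bprm);
            return retval;
    }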
@@ -1246,14 +1276,6 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs) | |||
1246 | 1276 | ||
1247 | EXPORT_SYMBOL(search_binary_handler); | 1277 | EXPORT_SYMBOL(search_binary_handler); |
1248 | 1278 | ||
1249 | void free_bprm(struct linux_binprm *bprm) | ||
1250 | { | ||
1251 | free_arg_pages(bprm); | ||
1252 | if (bprm->cred) | ||
1253 | abort_creds(bprm->cred); | ||
1254 | kfree(bprm); | ||
1255 | } | ||
1256 | |||
1257 | /* | 1279 | /* |
1258 | * sys_execve() executes a new program. | 1280 | * sys_execve() executes a new program. |
1259 | */ | 1281 | */ |
@@ -1277,20 +1299,15 @@ int do_execve(char * filename, | |||
1277 | if (!bprm) | 1299 | if (!bprm) |
1278 | goto out_files; | 1300 | goto out_files; |
1279 | 1301 | ||
1280 | retval = -ERESTARTNOINTR; | 1302 | retval = prepare_bprm_creds(bprm); |
1281 | if (mutex_lock_interruptible(&current->cred_guard_mutex)) | 1303 | if (retval) |
1282 | goto out_free; | 1304 | goto out_free; |
1283 | current->in_execve = 1; | ||
1284 | |||
1285 | retval = -ENOMEM; | ||
1286 | bprm->cred = prepare_exec_creds(); | ||
1287 | if (!bprm->cred) | ||
1288 | goto out_unlock; | ||
1289 | 1305 | ||
1290 | retval = check_unsafe_exec(bprm); | 1306 | retval = check_unsafe_exec(bprm); |
1291 | if (retval < 0) | 1307 | if (retval < 0) |
1292 | goto out_unlock; | 1308 | goto out_free; |
1293 | clear_in_exec = retval; | 1309 | clear_in_exec = retval; |
1310 | current->in_execve = 1; | ||
1294 | 1311 | ||
1295 | file = open_exec(filename); | 1312 | file = open_exec(filename); |
1296 | retval = PTR_ERR(file); | 1313 | retval = PTR_ERR(file); |
@@ -1340,7 +1357,6 @@ int do_execve(char * filename, | |||
1340 | /* execve succeeded */ | 1357 | /* execve succeeded */ |
1341 | current->fs->in_exec = 0; | 1358 | current->fs->in_exec = 0; |
1342 | current->in_execve = 0; | 1359 | current->in_execve = 0; |
1343 | mutex_unlock(&current->cred_guard_mutex); | ||
1344 | acct_update_integrals(current); | 1360 | acct_update_integrals(current); |
1345 | free_bprm(bprm); | 1361 | free_bprm(bprm); |
1346 | if (displaced) | 1362 | if (displaced) |
@@ -1360,10 +1376,7 @@ out_file: | |||
1360 | out_unmark: | 1376 | out_unmark: |
1361 | if (clear_in_exec) | 1377 | if (clear_in_exec) |
1362 | current->fs->in_exec = 0; | 1378 | current->fs->in_exec = 0; |
1363 | |||
1364 | out_unlock: | ||
1365 | current->in_execve = 0; | 1379 | current->in_execve = 0; |
1366 | mutex_unlock(&current->cred_guard_mutex); | ||
1367 | 1380 | ||
1368 | out_free: | 1381 | out_free: |
1369 | free_bprm(bprm); | 1382 | free_bprm(bprm); |
diff --git a/fs/ext2/acl.c b/fs/ext2/acl.c index d636e1297cad..a63d44256a70 100644 --- a/fs/ext2/acl.c +++ b/fs/ext2/acl.c | |||
@@ -230,7 +230,7 @@ ext2_set_acl(struct inode *inode, int type, struct posix_acl *acl) | |||
230 | return error; | 230 | return error; |
231 | } | 231 | } |
232 | 232 | ||
233 | static int | 233 | int |
234 | ext2_check_acl(struct inode *inode, int mask) | 234 | ext2_check_acl(struct inode *inode, int mask) |
235 | { | 235 | { |
236 | struct posix_acl *acl = ext2_get_acl(inode, ACL_TYPE_ACCESS); | 236 | struct posix_acl *acl = ext2_get_acl(inode, ACL_TYPE_ACCESS); |
@@ -246,12 +246,6 @@ ext2_check_acl(struct inode *inode, int mask) | |||
246 | return -EAGAIN; | 246 | return -EAGAIN; |
247 | } | 247 | } |
248 | 248 | ||
249 | int | ||
250 | ext2_permission(struct inode *inode, int mask) | ||
251 | { | ||
252 | return generic_permission(inode, mask, ext2_check_acl); | ||
253 | } | ||
254 | |||
255 | /* | 249 | /* |
256 | * Initialize the ACLs of a new inode. Called from ext2_new_inode. | 250 | * Initialize the ACLs of a new inode. Called from ext2_new_inode. |
257 | * | 251 | * |
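This is the first of several identical conversions in this merge (ext3 and ext4 follow below): the per-filesystem ->permission wrapper disappears, and the VFS instead runs generic_permission() itself, reaching back into the filesystem through a new ->check_acl hook. The old and new wiring side by side, sketched for ext2:

    /* Before: every filesystem carried this boilerplate wrapper. */
    int ext2_permission(struct inode *inode, int mask)
    {
            return generic_permission(inode, mask, ext2_check_acl);
    }

    /* After: no wrapper; with .permission unset, the VFS falls back to
     * generic_permission() and consults .check_acl for the ACL part. */
    const struct inode_operations ext2_ops_sketch = {
            .check_acl = ext2_check_acl,
    };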
diff --git a/fs/ext2/acl.h b/fs/ext2/acl.h index ecefe478898f..3ff6cbb9ac44 100644 --- a/fs/ext2/acl.h +++ b/fs/ext2/acl.h | |||
@@ -54,13 +54,13 @@ static inline int ext2_acl_count(size_t size) | |||
54 | #ifdef CONFIG_EXT2_FS_POSIX_ACL | 54 | #ifdef CONFIG_EXT2_FS_POSIX_ACL |
55 | 55 | ||
56 | /* acl.c */ | 56 | /* acl.c */ |
57 | extern int ext2_permission (struct inode *, int); | 57 | extern int ext2_check_acl (struct inode *, int); |
58 | extern int ext2_acl_chmod (struct inode *); | 58 | extern int ext2_acl_chmod (struct inode *); |
59 | extern int ext2_init_acl (struct inode *, struct inode *); | 59 | extern int ext2_init_acl (struct inode *, struct inode *); |
60 | 60 | ||
61 | #else | 61 | #else |
62 | #include <linux/sched.h> | 62 | #include <linux/sched.h> |
63 | #define ext2_permission NULL | 63 | #define ext2_check_acl NULL |
64 | #define ext2_get_acl NULL | 64 | #define ext2_get_acl NULL |
65 | #define ext2_set_acl NULL | 65 | #define ext2_set_acl NULL |
66 | 66 | ||
diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 2b9e47dc9222..a2f3afd1a1c1 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c | |||
@@ -85,6 +85,6 @@ const struct inode_operations ext2_file_inode_operations = { | |||
85 | .removexattr = generic_removexattr, | 85 | .removexattr = generic_removexattr, |
86 | #endif | 86 | #endif |
87 | .setattr = ext2_setattr, | 87 | .setattr = ext2_setattr, |
88 | .permission = ext2_permission, | 88 | .check_acl = ext2_check_acl, |
89 | .fiemap = ext2_fiemap, | 89 | .fiemap = ext2_fiemap, |
90 | }; | 90 | }; |
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index e27130341d4f..1c1638f873a4 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c | |||
@@ -482,7 +482,7 @@ static int ext2_alloc_branch(struct inode *inode, | |||
482 | unlock_buffer(bh); | 482 | unlock_buffer(bh); |
483 | mark_buffer_dirty_inode(bh, inode); | 483 | mark_buffer_dirty_inode(bh, inode); |
484 | /* We used to sync bh here if IS_SYNC(inode). | 484 | /* We used to sync bh here if IS_SYNC(inode). |
485 | * But we now rely upon generic_osync_inode() | 485 | * But we now rely upon generic_write_sync() |
486 | * and b_inode_buffers. But not for directories. | 486 | * and b_inode_buffers. But not for directories. |
487 | */ | 487 | */ |
488 | if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode)) | 488 | if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode)) |
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index e1dedb0f7873..23701f289e98 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c | |||
@@ -362,6 +362,10 @@ static int ext2_rename (struct inode * old_dir, struct dentry * old_dentry, | |||
362 | if (dir_de) { | 362 | if (dir_de) { |
363 | if (old_dir != new_dir) | 363 | if (old_dir != new_dir) |
364 | ext2_set_link(old_inode, dir_de, dir_page, new_dir, 0); | 364 | ext2_set_link(old_inode, dir_de, dir_page, new_dir, 0); |
365 | else { | ||
366 | kunmap(dir_page); | ||
367 | page_cache_release(dir_page); | ||
368 | } | ||
365 | inode_dec_link_count(old_dir); | 369 | inode_dec_link_count(old_dir); |
366 | } | 370 | } |
367 | return 0; | 371 | return 0; |
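The ext2_rename() hunk fixes a leak on the same-directory case: dir_page was mapped to find the ".." entry, and ext2_set_link() releases it on the cross-directory path, but nothing released it when old_dir == new_dir. The balancing rule the fix restores, in sketch form:

    if (dir_de) {
            if (old_dir != new_dir)
                    /* ext2_set_link() kunmaps and releases dir_page itself */
                    ext2_set_link(old_inode, dir_de, dir_page, new_dir, 0);
            else {
                    kunmap(dir_page);               /* balance the kmap() ... */
                    page_cache_release(dir_page);   /* ... and the page ref */
            }
            inode_dec_link_count(old_dir);
    }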
@@ -396,7 +400,7 @@ const struct inode_operations ext2_dir_inode_operations = { | |||
396 | .removexattr = generic_removexattr, | 400 | .removexattr = generic_removexattr, |
397 | #endif | 401 | #endif |
398 | .setattr = ext2_setattr, | 402 | .setattr = ext2_setattr, |
399 | .permission = ext2_permission, | 403 | .check_acl = ext2_check_acl, |
400 | }; | 404 | }; |
401 | 405 | ||
402 | const struct inode_operations ext2_special_inode_operations = { | 406 | const struct inode_operations ext2_special_inode_operations = { |
@@ -407,5 +411,5 @@ const struct inode_operations ext2_special_inode_operations = { | |||
407 | .removexattr = generic_removexattr, | 411 | .removexattr = generic_removexattr, |
408 | #endif | 412 | #endif |
409 | .setattr = ext2_setattr, | 413 | .setattr = ext2_setattr, |
410 | .permission = ext2_permission, | 414 | .check_acl = ext2_check_acl, |
411 | }; | 415 | }; |
diff --git a/fs/ext3/Kconfig b/fs/ext3/Kconfig index fb3c1a21b135..522b15498f45 100644 --- a/fs/ext3/Kconfig +++ b/fs/ext3/Kconfig | |||
@@ -29,23 +29,25 @@ config EXT3_FS | |||
29 | module will be called ext3. | 29 | module will be called ext3. |
30 | 30 | ||
31 | config EXT3_DEFAULTS_TO_ORDERED | 31 | config EXT3_DEFAULTS_TO_ORDERED |
32 | bool "Default to 'data=ordered' in ext3 (legacy option)" | 32 | bool "Default to 'data=ordered' in ext3" |
33 | depends on EXT3_FS | 33 | depends on EXT3_FS |
34 | help | 34 | help |
35 | If a filesystem does not explicitly specify a data ordering | 35 | The journal mode options for ext3 have different tradeoffs |
36 | mode, and the journal capability allowed it, ext3 used to | 36 | between when data is guaranteed to be on disk and |
37 | historically default to 'data=ordered'. | 37 | performance. The use of "data=writeback" can cause |
38 | 38 | unwritten data to appear in files after a system crash or |
39 | That was a rather unfortunate choice, because it leads to all | 39 | power failure, which can be a security issue. However, |
40 | kinds of latency problems, and the 'data=writeback' mode is more | 40 | "data=ordered" mode can also result in major performance |
41 | appropriate these days. | 41 | problems, including seconds-long delays before an fsync() |
42 | 42 | call returns. For details, see: | |
43 | You should probably always answer 'n' here, and if you really | 43 | |
44 | want to use 'data=ordered' mode, set it in the filesystem itself | 44 | http://ext4.wiki.kernel.org/index.php/Ext3_data_mode_tradeoffs |
45 | with 'tune2fs -o journal_data_ordered'. | 45 | |
46 | 46 | If you have been historically happy with ext3's performance, | |
47 | But if you really want to enable the legacy default, you can do | 47 | data=ordered mode will be a safe choice and you should |
48 | so by answering 'y' to this question. | 48 | answer 'y' here. If you understand the reliability and data |
49 | privacy issues of data=writeback and are willing to make | ||
50 | that trade off, answer 'n'. | ||
49 | 51 | ||
50 | config EXT3_FS_XATTR | 52 | config EXT3_FS_XATTR |
51 | bool "Ext3 extended attributes" | 53 | bool "Ext3 extended attributes" |
diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c index e167bae37ef0..c9b0df376b5f 100644 --- a/fs/ext3/acl.c +++ b/fs/ext3/acl.c | |||
@@ -238,7 +238,7 @@ ext3_set_acl(handle_t *handle, struct inode *inode, int type, | |||
238 | return error; | 238 | return error; |
239 | } | 239 | } |
240 | 240 | ||
241 | static int | 241 | int |
242 | ext3_check_acl(struct inode *inode, int mask) | 242 | ext3_check_acl(struct inode *inode, int mask) |
243 | { | 243 | { |
244 | struct posix_acl *acl = ext3_get_acl(inode, ACL_TYPE_ACCESS); | 244 | struct posix_acl *acl = ext3_get_acl(inode, ACL_TYPE_ACCESS); |
@@ -254,12 +254,6 @@ ext3_check_acl(struct inode *inode, int mask) | |||
254 | return -EAGAIN; | 254 | return -EAGAIN; |
255 | } | 255 | } |
256 | 256 | ||
257 | int | ||
258 | ext3_permission(struct inode *inode, int mask) | ||
259 | { | ||
260 | return generic_permission(inode, mask, ext3_check_acl); | ||
261 | } | ||
262 | |||
263 | /* | 257 | /* |
264 | * Initialize the ACLs of a new inode. Called from ext3_new_inode. | 258 | * Initialize the ACLs of a new inode. Called from ext3_new_inode. |
265 | * | 259 | * |
diff --git a/fs/ext3/acl.h b/fs/ext3/acl.h index 07d15a3a5969..597334626de9 100644 --- a/fs/ext3/acl.h +++ b/fs/ext3/acl.h | |||
@@ -54,13 +54,13 @@ static inline int ext3_acl_count(size_t size) | |||
54 | #ifdef CONFIG_EXT3_FS_POSIX_ACL | 54 | #ifdef CONFIG_EXT3_FS_POSIX_ACL |
55 | 55 | ||
56 | /* acl.c */ | 56 | /* acl.c */ |
57 | extern int ext3_permission (struct inode *, int); | 57 | extern int ext3_check_acl (struct inode *, int); |
58 | extern int ext3_acl_chmod (struct inode *); | 58 | extern int ext3_acl_chmod (struct inode *); |
59 | extern int ext3_init_acl (handle_t *, struct inode *, struct inode *); | 59 | extern int ext3_init_acl (handle_t *, struct inode *, struct inode *); |
60 | 60 | ||
61 | #else /* CONFIG_EXT3_FS_POSIX_ACL */ | 61 | #else /* CONFIG_EXT3_FS_POSIX_ACL */ |
62 | #include <linux/sched.h> | 62 | #include <linux/sched.h> |
63 | #define ext3_permission NULL | 63 | #define ext3_check_acl NULL |
64 | 64 | ||
65 | static inline int | 65 | static inline int |
66 | ext3_acl_chmod(struct inode *inode) | 66 | ext3_acl_chmod(struct inode *inode) |
diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c index 3d724a95882f..373fa90c796a 100644 --- a/fs/ext3/dir.c +++ b/fs/ext3/dir.c | |||
@@ -130,8 +130,7 @@ static int ext3_readdir(struct file * filp, | |||
130 | struct buffer_head *bh = NULL; | 130 | struct buffer_head *bh = NULL; |
131 | 131 | ||
132 | map_bh.b_state = 0; | 132 | map_bh.b_state = 0; |
133 | err = ext3_get_blocks_handle(NULL, inode, blk, 1, | 133 | err = ext3_get_blocks_handle(NULL, inode, blk, 1, &map_bh, 0); |
134 | &map_bh, 0, 0); | ||
135 | if (err > 0) { | 134 | if (err > 0) { |
136 | pgoff_t index = map_bh.b_blocknr >> | 135 | pgoff_t index = map_bh.b_blocknr >> |
137 | (PAGE_CACHE_SHIFT - inode->i_blkbits); | 136 | (PAGE_CACHE_SHIFT - inode->i_blkbits); |
diff --git a/fs/ext3/file.c b/fs/ext3/file.c index 5b49704b231b..388bbdfa0b4e 100644 --- a/fs/ext3/file.c +++ b/fs/ext3/file.c | |||
@@ -51,71 +51,12 @@ static int ext3_release_file (struct inode * inode, struct file * filp) | |||
51 | return 0; | 51 | return 0; |
52 | } | 52 | } |
53 | 53 | ||
54 | static ssize_t | ||
55 | ext3_file_write(struct kiocb *iocb, const struct iovec *iov, | ||
56 | unsigned long nr_segs, loff_t pos) | ||
57 | { | ||
58 | struct file *file = iocb->ki_filp; | ||
59 | struct inode *inode = file->f_path.dentry->d_inode; | ||
60 | ssize_t ret; | ||
61 | int err; | ||
62 | |||
63 | ret = generic_file_aio_write(iocb, iov, nr_segs, pos); | ||
64 | |||
65 | /* | ||
66 | * Skip flushing if there was an error, or if nothing was written. | ||
67 | */ | ||
68 | if (ret <= 0) | ||
69 | return ret; | ||
70 | |||
71 | /* | ||
72 | * If the inode is IS_SYNC, or is O_SYNC and we are doing data | ||
73 | * journalling then we need to make sure that we force the transaction | ||
74 | * to disk to keep all metadata uptodate synchronously. | ||
75 | */ | ||
76 | if (file->f_flags & O_SYNC) { | ||
77 | /* | ||
78 | * If we are non-data-journaled, then the dirty data has | ||
79 | * already been flushed to backing store by generic_osync_inode, | ||
80 | * and the inode has been flushed too if there have been any | ||
81 | * modifications other than mere timestamp updates. | ||
82 | * | ||
83 | * Open question --- do we care about flushing timestamps too | ||
84 | * if the inode is IS_SYNC? | ||
85 | */ | ||
86 | if (!ext3_should_journal_data(inode)) | ||
87 | return ret; | ||
88 | |||
89 | goto force_commit; | ||
90 | } | ||
91 | |||
92 | /* | ||
93 | * So we know that there has been no forced data flush. If the inode | ||
94 | * is marked IS_SYNC, we need to force one ourselves. | ||
95 | */ | ||
96 | if (!IS_SYNC(inode)) | ||
97 | return ret; | ||
98 | |||
99 | /* | ||
100 | * Open question #2 --- should we force data to disk here too? If we | ||
101 | * don't, the only impact is that data=writeback filesystems won't | ||
102 | * flush data to disk automatically on IS_SYNC, only metadata (but | ||
103 | * historically, that is what ext2 has done.) | ||
104 | */ | ||
105 | |||
106 | force_commit: | ||
107 | err = ext3_force_commit(inode->i_sb); | ||
108 | if (err) | ||
109 | return err; | ||
110 | return ret; | ||
111 | } | ||
112 | |||
113 | const struct file_operations ext3_file_operations = { | 54 | const struct file_operations ext3_file_operations = { |
114 | .llseek = generic_file_llseek, | 55 | .llseek = generic_file_llseek, |
115 | .read = do_sync_read, | 56 | .read = do_sync_read, |
116 | .write = do_sync_write, | 57 | .write = do_sync_write, |
117 | .aio_read = generic_file_aio_read, | 58 | .aio_read = generic_file_aio_read, |
118 | .aio_write = ext3_file_write, | 59 | .aio_write = generic_file_aio_write, |
119 | .unlocked_ioctl = ext3_ioctl, | 60 | .unlocked_ioctl = ext3_ioctl, |
120 | #ifdef CONFIG_COMPAT | 61 | #ifdef CONFIG_COMPAT |
121 | .compat_ioctl = ext3_compat_ioctl, | 62 | .compat_ioctl = ext3_compat_ioctl, |
@@ -137,7 +78,7 @@ const struct inode_operations ext3_file_inode_operations = { | |||
137 | .listxattr = ext3_listxattr, | 78 | .listxattr = ext3_listxattr, |
138 | .removexattr = generic_removexattr, | 79 | .removexattr = generic_removexattr, |
139 | #endif | 80 | #endif |
140 | .permission = ext3_permission, | 81 | .check_acl = ext3_check_acl, |
141 | .fiemap = ext3_fiemap, | 82 | .fiemap = ext3_fiemap, |
142 | }; | 83 | }; |
143 | 84 | ||
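ext3_file_write() existed only to force a journal commit for O_SYNC and IS_SYNC writes; now that the generic write path ends in generic_write_sync(), the wrapper is dead weight and ->aio_write can point straight at generic_file_aio_write(). A rough sketch of the ordering the generic path provides; do_buffered_write() and flush_range() are stand-ins, not real mm/filemap.c names:

    ssize_t do_buffered_write(struct file *f, const char __user *b,
                              size_t l, loff_t p);          /* stand-in */
    ssize_t flush_range(struct file *f, loff_t p, size_t l); /* stand-in */

    static ssize_t sync_aware_write(struct file *file, const char __user *buf,
                                    size_t len, loff_t pos)
    {
            ssize_t written = do_buffered_write(file, buf, len, pos);

            if (written > 0 &&
                ((file->f_flags & O_SYNC) || IS_SYNC(file->f_mapping->host))) {
                    /* generic_write_sync() performs this step; for ext3 the
                     * resulting fsync forces the journal commit that the
                     * removed wrapper used to issue by hand. */
                    ssize_t err = flush_range(file, pos, written);
                    if (err < 0)
                            written = err;
            }
            return written;
    }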
diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c index d33634119e17..451d166bbe93 100644 --- a/fs/ext3/fsync.c +++ b/fs/ext3/fsync.c | |||
@@ -23,6 +23,7 @@ | |||
23 | */ | 23 | */ |
24 | 24 | ||
25 | #include <linux/time.h> | 25 | #include <linux/time.h> |
26 | #include <linux/blkdev.h> | ||
26 | #include <linux/fs.h> | 27 | #include <linux/fs.h> |
27 | #include <linux/sched.h> | 28 | #include <linux/sched.h> |
28 | #include <linux/writeback.h> | 29 | #include <linux/writeback.h> |
@@ -73,7 +74,7 @@ int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync) | |||
73 | } | 74 | } |
74 | 75 | ||
75 | if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) | 76 | if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) |
76 | goto out; | 77 | goto flush; |
77 | 78 | ||
78 | /* | 79 | /* |
79 | * The VFS has written the file data. If the inode is unaltered | 80 | * The VFS has written the file data. If the inode is unaltered |
@@ -85,7 +86,16 @@ int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync) | |||
85 | .nr_to_write = 0, /* sys_fsync did this */ | 86 | .nr_to_write = 0, /* sys_fsync did this */ |
86 | }; | 87 | }; |
87 | ret = sync_inode(inode, &wbc); | 88 | ret = sync_inode(inode, &wbc); |
89 | goto out; | ||
88 | } | 90 | } |
91 | flush: | ||
92 | /* | ||
93 | * In case we didn't commit a transaction, we have to flush | ||
94 | * disk caches manually so that data really is on persistent | ||
95 | * storage | ||
96 | */ | ||
97 | if (test_opt(inode->i_sb, BARRIER)) | ||
98 | blkdev_issue_flush(inode->i_sb->s_bdev, NULL); | ||
89 | out: | 99 | out: |
90 | return ret; | 100 | return ret; |
91 | } | 101 | } |
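The fsync change closes a cache hole: when no transaction commit happens (a datasync with clean metadata, or nothing to commit at all), the pages the VFS wrote may still sit in the drive's volatile write cache. The fix issues an explicit flush in that case, gated on the barrier mount option. The fallback in isolation:

    #include <linux/blkdev.h>

    /* Sketch: 'committed' says whether this fsync triggered a journal
     * commit, which with barriers already implies a device flush. */
    static void fsync_flush_fallback(struct super_block *sb, int committed,
                                     int barriers_enabled)
    {
            if (!committed && barriers_enabled)
                    blkdev_issue_flush(sb->s_bdev, NULL); /* drain disk cache */
    }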
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 5f51fed5c750..cd098a7b77fc 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c | |||
@@ -172,10 +172,21 @@ static int try_to_extend_transaction(handle_t *handle, struct inode *inode) | |||
172 | * so before we call here everything must be consistently dirtied against | 172 | * so before we call here everything must be consistently dirtied against |
173 | * this transaction. | 173 | * this transaction. |
174 | */ | 174 | */ |
175 | static int ext3_journal_test_restart(handle_t *handle, struct inode *inode) | 175 | static int truncate_restart_transaction(handle_t *handle, struct inode *inode) |
176 | { | 176 | { |
177 | int ret; | ||
178 | |||
177 | jbd_debug(2, "restarting handle %p\n", handle); | 179 | jbd_debug(2, "restarting handle %p\n", handle); |
178 | return ext3_journal_restart(handle, blocks_for_truncate(inode)); | 180 | /* |
181 | * Drop truncate_mutex to avoid deadlock with ext3_get_blocks_handle | ||
182 | * At this moment, get_block can be called only for blocks inside | ||
183 | * i_size since page cache has been already dropped and writes are | ||
184 | * blocked by i_mutex. So we can safely drop the truncate_mutex. | ||
185 | */ | ||
186 | mutex_unlock(&EXT3_I(inode)->truncate_mutex); | ||
187 | ret = ext3_journal_restart(handle, blocks_for_truncate(inode)); | ||
188 | mutex_lock(&EXT3_I(inode)->truncate_mutex); | ||
189 | return ret; | ||
179 | } | 190 | } |
180 | 191 | ||
181 | /* | 192 | /* |
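Renaming ext3_journal_test_restart() to truncate_restart_transaction() carries a real fix: the journal restart can block on a commit, and the commit path may need truncate_mutex through ext3_get_blocks_handle(), so holding the mutex across the restart could deadlock. Dropping it is safe here because the page cache is already gone and i_mutex keeps writers out. The unlock-restart-relock shape:

    static int restart_handle_sketch(handle_t *handle, struct inode *inode)
    {
            int ret;

            /* Safe: pages are dropped and i_mutex blocks new writes, so no
             * get_block inside i_size can race with us while we sleep. */
            mutex_unlock(&EXT3_I(inode)->truncate_mutex);
            ret = ext3_journal_restart(handle, blocks_for_truncate(inode));
            mutex_lock(&EXT3_I(inode)->truncate_mutex);
            return ret;
    }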
@@ -788,7 +799,7 @@ err_out: | |||
788 | int ext3_get_blocks_handle(handle_t *handle, struct inode *inode, | 799 | int ext3_get_blocks_handle(handle_t *handle, struct inode *inode, |
789 | sector_t iblock, unsigned long maxblocks, | 800 | sector_t iblock, unsigned long maxblocks, |
790 | struct buffer_head *bh_result, | 801 | struct buffer_head *bh_result, |
791 | int create, int extend_disksize) | 802 | int create) |
792 | { | 803 | { |
793 | int err = -EIO; | 804 | int err = -EIO; |
794 | int offsets[4]; | 805 | int offsets[4]; |
@@ -911,13 +922,6 @@ int ext3_get_blocks_handle(handle_t *handle, struct inode *inode, | |||
911 | if (!err) | 922 | if (!err) |
912 | err = ext3_splice_branch(handle, inode, iblock, | 923 | err = ext3_splice_branch(handle, inode, iblock, |
913 | partial, indirect_blks, count); | 924 | partial, indirect_blks, count); |
914 | /* | ||
915 | * i_disksize growing is protected by truncate_mutex. Don't forget to | ||
916 | * protect it if you're about to implement concurrent | ||
917 | * ext3_get_block() -bzzz | ||
918 | */ | ||
919 | if (!err && extend_disksize && inode->i_size > ei->i_disksize) | ||
920 | ei->i_disksize = inode->i_size; | ||
921 | mutex_unlock(&ei->truncate_mutex); | 925 | mutex_unlock(&ei->truncate_mutex); |
922 | if (err) | 926 | if (err) |
923 | goto cleanup; | 927 | goto cleanup; |
@@ -972,7 +976,7 @@ static int ext3_get_block(struct inode *inode, sector_t iblock, | |||
972 | } | 976 | } |
973 | 977 | ||
974 | ret = ext3_get_blocks_handle(handle, inode, iblock, | 978 | ret = ext3_get_blocks_handle(handle, inode, iblock, |
975 | max_blocks, bh_result, create, 0); | 979 | max_blocks, bh_result, create); |
976 | if (ret > 0) { | 980 | if (ret > 0) { |
977 | bh_result->b_size = (ret << inode->i_blkbits); | 981 | bh_result->b_size = (ret << inode->i_blkbits); |
978 | ret = 0; | 982 | ret = 0; |
@@ -1005,7 +1009,7 @@ struct buffer_head *ext3_getblk(handle_t *handle, struct inode *inode, | |||
1005 | dummy.b_blocknr = -1000; | 1009 | dummy.b_blocknr = -1000; |
1006 | buffer_trace_init(&dummy.b_history); | 1010 | buffer_trace_init(&dummy.b_history); |
1007 | err = ext3_get_blocks_handle(handle, inode, block, 1, | 1011 | err = ext3_get_blocks_handle(handle, inode, block, 1, |
1008 | &dummy, create, 1); | 1012 | &dummy, create); |
1009 | /* | 1013 | /* |
1010 | * ext3_get_blocks_handle() returns number of blocks | 1014 | * ext3_get_blocks_handle() returns number of blocks |
1011 | * mapped. 0 in case of a HOLE. | 1015 | * mapped. 0 in case of a HOLE. |
@@ -1193,15 +1197,16 @@ write_begin_failed: | |||
1193 | * i_size_read because we hold i_mutex. | 1197 | * i_size_read because we hold i_mutex. |
1194 | * | 1198 | * |
1195 | * Add inode to orphan list in case we crash before truncate | 1199 | * Add inode to orphan list in case we crash before truncate |
1196 | * finishes. | 1200 | * finishes. Do this only if ext3_can_truncate() agrees so |
1201 | * that orphan processing code is happy. | ||
1197 | */ | 1202 | */ |
1198 | if (pos + len > inode->i_size) | 1203 | if (pos + len > inode->i_size && ext3_can_truncate(inode)) |
1199 | ext3_orphan_add(handle, inode); | 1204 | ext3_orphan_add(handle, inode); |
1200 | ext3_journal_stop(handle); | 1205 | ext3_journal_stop(handle); |
1201 | unlock_page(page); | 1206 | unlock_page(page); |
1202 | page_cache_release(page); | 1207 | page_cache_release(page); |
1203 | if (pos + len > inode->i_size) | 1208 | if (pos + len > inode->i_size) |
1204 | vmtruncate(inode, inode->i_size); | 1209 | ext3_truncate(inode); |
1205 | } | 1210 | } |
1206 | if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries)) | 1211 | if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries)) |
1207 | goto retry; | 1212 | goto retry; |
@@ -1287,7 +1292,7 @@ static int ext3_ordered_write_end(struct file *file, | |||
1287 | * There may be allocated blocks outside of i_size because | 1292 | * There may be allocated blocks outside of i_size because |
1288 | * we failed to copy some data. Prepare for truncate. | 1293 | * we failed to copy some data. Prepare for truncate. |
1289 | */ | 1294 | */ |
1290 | if (pos + len > inode->i_size) | 1295 | if (pos + len > inode->i_size && ext3_can_truncate(inode)) |
1291 | ext3_orphan_add(handle, inode); | 1296 | ext3_orphan_add(handle, inode); |
1292 | ret2 = ext3_journal_stop(handle); | 1297 | ret2 = ext3_journal_stop(handle); |
1293 | if (!ret) | 1298 | if (!ret) |
@@ -1296,7 +1301,7 @@ static int ext3_ordered_write_end(struct file *file, | |||
1296 | page_cache_release(page); | 1301 | page_cache_release(page); |
1297 | 1302 | ||
1298 | if (pos + len > inode->i_size) | 1303 | if (pos + len > inode->i_size) |
1299 | vmtruncate(inode, inode->i_size); | 1304 | ext3_truncate(inode); |
1300 | return ret ? ret : copied; | 1305 | return ret ? ret : copied; |
1301 | } | 1306 | } |
1302 | 1307 | ||
@@ -1315,14 +1320,14 @@ static int ext3_writeback_write_end(struct file *file, | |||
1315 | * There may be allocated blocks outside of i_size because | 1320 | * There may be allocated blocks outside of i_size because |
1316 | * we failed to copy some data. Prepare for truncate. | 1321 | * we failed to copy some data. Prepare for truncate. |
1317 | */ | 1322 | */ |
1318 | if (pos + len > inode->i_size) | 1323 | if (pos + len > inode->i_size && ext3_can_truncate(inode)) |
1319 | ext3_orphan_add(handle, inode); | 1324 | ext3_orphan_add(handle, inode); |
1320 | ret = ext3_journal_stop(handle); | 1325 | ret = ext3_journal_stop(handle); |
1321 | unlock_page(page); | 1326 | unlock_page(page); |
1322 | page_cache_release(page); | 1327 | page_cache_release(page); |
1323 | 1328 | ||
1324 | if (pos + len > inode->i_size) | 1329 | if (pos + len > inode->i_size) |
1325 | vmtruncate(inode, inode->i_size); | 1330 | ext3_truncate(inode); |
1326 | return ret ? ret : copied; | 1331 | return ret ? ret : copied; |
1327 | } | 1332 | } |
1328 | 1333 | ||
@@ -1358,7 +1363,7 @@ static int ext3_journalled_write_end(struct file *file, | |||
1358 | * There may be allocated blocks outside of i_size because | 1363 | * There may be allocated blocks outside of i_size because |
1359 | * we failed to copy some data. Prepare for truncate. | 1364 | * we failed to copy some data. Prepare for truncate. |
1360 | */ | 1365 | */ |
1361 | if (pos + len > inode->i_size) | 1366 | if (pos + len > inode->i_size && ext3_can_truncate(inode)) |
1362 | ext3_orphan_add(handle, inode); | 1367 | ext3_orphan_add(handle, inode); |
1363 | EXT3_I(inode)->i_state |= EXT3_STATE_JDATA; | 1368 | EXT3_I(inode)->i_state |= EXT3_STATE_JDATA; |
1364 | if (inode->i_size > EXT3_I(inode)->i_disksize) { | 1369 | if (inode->i_size > EXT3_I(inode)->i_disksize) { |
@@ -1375,7 +1380,7 @@ static int ext3_journalled_write_end(struct file *file, | |||
1375 | page_cache_release(page); | 1380 | page_cache_release(page); |
1376 | 1381 | ||
1377 | if (pos + len > inode->i_size) | 1382 | if (pos + len > inode->i_size) |
1378 | vmtruncate(inode, inode->i_size); | 1383 | ext3_truncate(inode); |
1379 | return ret ? ret : copied; | 1384 | return ret ? ret : copied; |
1380 | } | 1385 | } |
1381 | 1386 | ||
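All three write_end variants and write_begin's failure path now follow one rule for short copies: add the inode to the orphan list only when ext3_can_truncate() agrees, stop the handle, and then call ext3_truncate() directly rather than going through vmtruncate(), since only blocks instantiated past i_size need trimming. The shared error-path skeleton:

    if (pos + len > inode->i_size && ext3_can_truncate(inode))
            ext3_orphan_add(handle, inode); /* crash-safe: orphan scan trims */
    ext3_journal_stop(handle);
    if (pos + len > inode->i_size)
            ext3_truncate(inode);           /* drop blocks beyond i_size now */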
@@ -2078,7 +2083,7 @@ static void ext3_clear_blocks(handle_t *handle, struct inode *inode, | |||
2078 | ext3_journal_dirty_metadata(handle, bh); | 2083 | ext3_journal_dirty_metadata(handle, bh); |
2079 | } | 2084 | } |
2080 | ext3_mark_inode_dirty(handle, inode); | 2085 | ext3_mark_inode_dirty(handle, inode); |
2081 | ext3_journal_test_restart(handle, inode); | 2086 | truncate_restart_transaction(handle, inode); |
2082 | if (bh) { | 2087 | if (bh) { |
2083 | BUFFER_TRACE(bh, "retaking write access"); | 2088 | BUFFER_TRACE(bh, "retaking write access"); |
2084 | ext3_journal_get_write_access(handle, bh); | 2089 | ext3_journal_get_write_access(handle, bh); |
@@ -2288,7 +2293,7 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode, | |||
2288 | return; | 2293 | return; |
2289 | if (try_to_extend_transaction(handle, inode)) { | 2294 | if (try_to_extend_transaction(handle, inode)) { |
2290 | ext3_mark_inode_dirty(handle, inode); | 2295 | ext3_mark_inode_dirty(handle, inode); |
2291 | ext3_journal_test_restart(handle, inode); | 2296 | truncate_restart_transaction(handle, inode); |
2292 | } | 2297 | } |
2293 | 2298 | ||
2294 | ext3_free_blocks(handle, inode, nr, 1); | 2299 | ext3_free_blocks(handle, inode, nr, 1); |
@@ -2898,6 +2903,10 @@ static int ext3_do_update_inode(handle_t *handle, | |||
2898 | struct buffer_head *bh = iloc->bh; | 2903 | struct buffer_head *bh = iloc->bh; |
2899 | int err = 0, rc, block; | 2904 | int err = 0, rc, block; |
2900 | 2905 | ||
2906 | again: | ||
2907 | /* we can't allow multiple procs in here at once, it's a bit racy */ | ||
2908 | lock_buffer(bh); | ||
2909 | |||
2901 | /* For fields not tracked in the in-memory inode, | 2910 | /* For fields not tracked in the in-memory inode, |
2902 | * initialise them to zero for new inodes. */ | 2911 | * initialise them to zero for new inodes. */ |
2903 | if (ei->i_state & EXT3_STATE_NEW) | 2912 | if (ei->i_state & EXT3_STATE_NEW) |
@@ -2957,16 +2966,20 @@ static int ext3_do_update_inode(handle_t *handle, | |||
2957 | /* If this is the first large file | 2966 | /* If this is the first large file |
2958 | * created, add a flag to the superblock. | 2967 | * created, add a flag to the superblock. |
2959 | */ | 2968 | */ |
2969 | unlock_buffer(bh); | ||
2960 | err = ext3_journal_get_write_access(handle, | 2970 | err = ext3_journal_get_write_access(handle, |
2961 | EXT3_SB(sb)->s_sbh); | 2971 | EXT3_SB(sb)->s_sbh); |
2962 | if (err) | 2972 | if (err) |
2963 | goto out_brelse; | 2973 | goto out_brelse; |
2974 | |||
2964 | ext3_update_dynamic_rev(sb); | 2975 | ext3_update_dynamic_rev(sb); |
2965 | EXT3_SET_RO_COMPAT_FEATURE(sb, | 2976 | EXT3_SET_RO_COMPAT_FEATURE(sb, |
2966 | EXT3_FEATURE_RO_COMPAT_LARGE_FILE); | 2977 | EXT3_FEATURE_RO_COMPAT_LARGE_FILE); |
2967 | handle->h_sync = 1; | 2978 | handle->h_sync = 1; |
2968 | err = ext3_journal_dirty_metadata(handle, | 2979 | err = ext3_journal_dirty_metadata(handle, |
2969 | EXT3_SB(sb)->s_sbh); | 2980 | EXT3_SB(sb)->s_sbh); |
2981 | /* get our lock and start over */ | ||
2982 | goto again; | ||
2970 | } | 2983 | } |
2971 | } | 2984 | } |
2972 | } | 2985 | } |
@@ -2989,6 +3002,7 @@ static int ext3_do_update_inode(handle_t *handle, | |||
2989 | raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); | 3002 | raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); |
2990 | 3003 | ||
2991 | BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); | 3004 | BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata"); |
3005 | unlock_buffer(bh); | ||
2992 | rc = ext3_journal_dirty_metadata(handle, bh); | 3006 | rc = ext3_journal_dirty_metadata(handle, bh); |
2993 | if (!err) | 3007 | if (!err) |
2994 | err = rc; | 3008 | err = rc; |
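ext3_do_update_inode() gains lock_buffer() so two tasks flushing the same inode can no longer interleave their copies into the shared raw-inode buffer. Because journal calls may sleep, the large-file superblock update first drops the buffer lock and then retries from the top. The retry skeleton, with stand-in helpers for the parts elided here:

    void fill_raw_inode(struct buffer_head *bh);       /* stand-in */
    int need_superblock_update(void);                  /* stand-in */
    void update_superblock(handle_t *handle);          /* stand-in */

    static int update_inode_sketch(handle_t *handle, struct buffer_head *bh)
    {
    again:
            lock_buffer(bh);        /* serialize writers of this buffer */
            fill_raw_inode(bh);     /* the field copy-out */

            if (need_superblock_update()) { /* the LARGE_FILE feature case */
                    unlock_buffer(bh);      /* journal ops below may sleep */
                    update_superblock(handle);
                    goto again;             /* re-take the lock, redo copy */
            }

            unlock_buffer(bh);
            return ext3_journal_dirty_metadata(handle, bh);
    }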
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index 6ff7b9730234..aad6400c9b77 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c | |||
@@ -2445,7 +2445,7 @@ const struct inode_operations ext3_dir_inode_operations = { | |||
2445 | .listxattr = ext3_listxattr, | 2445 | .listxattr = ext3_listxattr, |
2446 | .removexattr = generic_removexattr, | 2446 | .removexattr = generic_removexattr, |
2447 | #endif | 2447 | #endif |
2448 | .permission = ext3_permission, | 2448 | .check_acl = ext3_check_acl, |
2449 | }; | 2449 | }; |
2450 | 2450 | ||
2451 | const struct inode_operations ext3_special_inode_operations = { | 2451 | const struct inode_operations ext3_special_inode_operations = { |
@@ -2456,5 +2456,5 @@ const struct inode_operations ext3_special_inode_operations = { | |||
2456 | .listxattr = ext3_listxattr, | 2456 | .listxattr = ext3_listxattr, |
2457 | .removexattr = generic_removexattr, | 2457 | .removexattr = generic_removexattr, |
2458 | #endif | 2458 | #endif |
2459 | .permission = ext3_permission, | 2459 | .check_acl = ext3_check_acl, |
2460 | }; | 2460 | }; |
diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 524b349c6299..a8d80a7f1105 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c | |||
@@ -543,6 +543,19 @@ static inline void ext3_show_quota_options(struct seq_file *seq, struct super_bl | |||
543 | #endif | 543 | #endif |
544 | } | 544 | } |
545 | 545 | ||
546 | static char *data_mode_string(unsigned long mode) | ||
547 | { | ||
548 | switch (mode) { | ||
549 | case EXT3_MOUNT_JOURNAL_DATA: | ||
550 | return "journal"; | ||
551 | case EXT3_MOUNT_ORDERED_DATA: | ||
552 | return "ordered"; | ||
553 | case EXT3_MOUNT_WRITEBACK_DATA: | ||
554 | return "writeback"; | ||
555 | } | ||
556 | return "unknown"; | ||
557 | } | ||
558 | |||
546 | /* | 559 | /* |
547 | * Show an option if | 560 | * Show an option if |
548 | * - it's set to a non-default value OR | 561 | * - it's set to a non-default value OR |
@@ -616,13 +629,8 @@ static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs) | |||
616 | if (test_opt(sb, NOBH)) | 629 | if (test_opt(sb, NOBH)) |
617 | seq_puts(seq, ",nobh"); | 630 | seq_puts(seq, ",nobh"); |
618 | 631 | ||
619 | if (test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA) | 632 | seq_printf(seq, ",data=%s", data_mode_string(sbi->s_mount_opt & |
620 | seq_puts(seq, ",data=journal"); | 633 | EXT3_MOUNT_DATA_FLAGS)); |
621 | else if (test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA) | ||
622 | seq_puts(seq, ",data=ordered"); | ||
623 | else if (test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_WRITEBACK_DATA) | ||
624 | seq_puts(seq, ",data=writeback"); | ||
625 | |||
626 | if (test_opt(sb, DATA_ERR_ABORT)) | 634 | if (test_opt(sb, DATA_ERR_ABORT)) |
627 | seq_puts(seq, ",data_err=abort"); | 635 | seq_puts(seq, ",data_err=abort"); |
628 | 636 | ||
@@ -1024,12 +1032,18 @@ static int parse_options (char *options, struct super_block *sb, | |||
1024 | datacheck: | 1032 | datacheck: |
1025 | if (is_remount) { | 1033 | if (is_remount) { |
1026 | if ((sbi->s_mount_opt & EXT3_MOUNT_DATA_FLAGS) | 1034 | if ((sbi->s_mount_opt & EXT3_MOUNT_DATA_FLAGS) |
1027 | != data_opt) { | 1035 | == data_opt) |
1028 | printk(KERN_ERR | 1036 | break; |
1029 | "EXT3-fs: cannot change data " | 1037 | printk(KERN_ERR |
1030 | "mode on remount\n"); | 1038 | "EXT3-fs (device %s): Cannot change " |
1031 | return 0; | 1039 | "data mode on remount. The filesystem " |
1032 | } | 1040 | "is mounted in data=%s mode and you " |
1041 | "try to remount it in data=%s mode.\n", | ||
1042 | sb->s_id, | ||
1043 | data_mode_string(sbi->s_mount_opt & | ||
1044 | EXT3_MOUNT_DATA_FLAGS), | ||
1045 | data_mode_string(data_opt)); | ||
1046 | return 0; | ||
1033 | } else { | 1047 | } else { |
1034 | sbi->s_mount_opt &= ~EXT3_MOUNT_DATA_FLAGS; | 1048 | sbi->s_mount_opt &= ~EXT3_MOUNT_DATA_FLAGS; |
1035 | sbi->s_mount_opt |= data_opt; | 1049 | sbi->s_mount_opt |= data_opt; |
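Folding the mode names into data_mode_string() lets two call sites share them: the /proc/mounts output and the new, more talkative remount error. Usage, condensed from the hunks above:

    /* show_options path */
    seq_printf(seq, ",data=%s",
               data_mode_string(sbi->s_mount_opt & EXT3_MOUNT_DATA_FLAGS));

    /* remount path: name both the active and the requested mode */
    printk(KERN_ERR "EXT3-fs (device %s): Cannot change data mode on "
           "remount (mounted data=%s, requested data=%s)\n", sb->s_id,
           data_mode_string(sbi->s_mount_opt & EXT3_MOUNT_DATA_FLAGS),
           data_mode_string(data_opt));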
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index 418b6f3b0ae8..d5c0ea2e8f2d 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig | |||
@@ -37,7 +37,7 @@ config EXT4DEV_COMPAT | |||
37 | 37 | ||
38 | To enable backwards compatibility so that systems that are | 38 | To enable backwards compatibility so that systems that are |
39 | still expecting to mount ext4 filesystems using ext4dev, | 39 | still expecting to mount ext4 filesystems using ext4dev, |
40 | chose Y here. This feature will go away by 2.6.31, so | 40 | choose Y here. This feature will go away by 2.6.31, so |
41 | please arrange to get your userspace programs fixed! | 41 | please arrange to get your userspace programs fixed! |
42 | 42 | ||
43 | config EXT4_FS_XATTR | 43 | config EXT4_FS_XATTR |
@@ -77,3 +77,12 @@ config EXT4_FS_SECURITY | |||
77 | 77 | ||
78 | If you are not using a security module that requires using | 78 | If you are not using a security module that requires using |
79 | extended attributes for file security labels, say N. | 79 | extended attributes for file security labels, say N. |
80 | |||
81 | config EXT4_DEBUG | ||
82 | bool "EXT4 debugging support" | ||
83 | depends on EXT4_FS | ||
84 | help | ||
85 | Enables run-time debugging support for the ext4 filesystem. | ||
86 | |||
87 | If you select Y here, then you will be able to turn on debugging | ||
88 | with a command such as "echo 1 > /sys/kernel/debug/ext4/mballoc-debug" | ||
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index f6d8967149ca..0df88b2a69b0 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c | |||
@@ -236,7 +236,7 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type, | |||
236 | return error; | 236 | return error; |
237 | } | 237 | } |
238 | 238 | ||
239 | static int | 239 | int |
240 | ext4_check_acl(struct inode *inode, int mask) | 240 | ext4_check_acl(struct inode *inode, int mask) |
241 | { | 241 | { |
242 | struct posix_acl *acl = ext4_get_acl(inode, ACL_TYPE_ACCESS); | 242 | struct posix_acl *acl = ext4_get_acl(inode, ACL_TYPE_ACCESS); |
@@ -252,12 +252,6 @@ ext4_check_acl(struct inode *inode, int mask) | |||
252 | return -EAGAIN; | 252 | return -EAGAIN; |
253 | } | 253 | } |
254 | 254 | ||
255 | int | ||
256 | ext4_permission(struct inode *inode, int mask) | ||
257 | { | ||
258 | return generic_permission(inode, mask, ext4_check_acl); | ||
259 | } | ||
260 | |||
261 | /* | 255 | /* |
262 | * Initialize the ACLs of a new inode. Called from ext4_new_inode. | 256 | * Initialize the ACLs of a new inode. Called from ext4_new_inode. |
263 | * | 257 | * |
diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h index 949789d2bba6..9d843d5deac4 100644 --- a/fs/ext4/acl.h +++ b/fs/ext4/acl.h | |||
@@ -54,13 +54,13 @@ static inline int ext4_acl_count(size_t size) | |||
54 | #ifdef CONFIG_EXT4_FS_POSIX_ACL | 54 | #ifdef CONFIG_EXT4_FS_POSIX_ACL |
55 | 55 | ||
56 | /* acl.c */ | 56 | /* acl.c */ |
57 | extern int ext4_permission(struct inode *, int); | 57 | extern int ext4_check_acl(struct inode *, int); |
58 | extern int ext4_acl_chmod(struct inode *); | 58 | extern int ext4_acl_chmod(struct inode *); |
59 | extern int ext4_init_acl(handle_t *, struct inode *, struct inode *); | 59 | extern int ext4_init_acl(handle_t *, struct inode *, struct inode *); |
60 | 60 | ||
61 | #else /* CONFIG_EXT4_FS_POSIX_ACL */ | 61 | #else /* CONFIG_EXT4_FS_POSIX_ACL */ |
62 | #include <linux/sched.h> | 62 | #include <linux/sched.h> |
63 | #define ext4_permission NULL | 63 | #define ext4_check_acl NULL |
64 | 64 | ||
65 | static inline int | 65 | static inline int |
66 | ext4_acl_chmod(struct inode *inode) | 66 | ext4_acl_chmod(struct inode *inode) |
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index e2126d70dff5..1d0418980f8d 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c | |||
@@ -478,7 +478,7 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, | |||
478 | * new bitmap information | 478 | * new bitmap information |
479 | */ | 479 | */ |
480 | set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); | 480 | set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); |
481 | ext4_mb_update_group_info(grp, blocks_freed); | 481 | grp->bb_free += blocks_freed; |
482 | up_write(&grp->alloc_sem); | 482 | up_write(&grp->alloc_sem); |
483 | 483 | ||
484 | /* We dirtied the bitmap block */ | 484 | /* We dirtied the bitmap block */ |
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 9714db393efe..e227eea23f05 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h | |||
@@ -67,27 +67,29 @@ typedef unsigned int ext4_group_t; | |||
67 | 67 | ||
68 | 68 | ||
69 | /* prefer goal again. length */ | 69 | /* prefer goal again. length */ |
70 | #define EXT4_MB_HINT_MERGE 1 | 70 | #define EXT4_MB_HINT_MERGE 0x0001 |
71 | /* blocks already reserved */ | 71 | /* blocks already reserved */ |
72 | #define EXT4_MB_HINT_RESERVED 2 | 72 | #define EXT4_MB_HINT_RESERVED 0x0002 |
73 | /* metadata is being allocated */ | 73 | /* metadata is being allocated */ |
74 | #define EXT4_MB_HINT_METADATA 4 | 74 | #define EXT4_MB_HINT_METADATA 0x0004 |
75 | /* first blocks in the file */ | 75 | /* first blocks in the file */ |
76 | #define EXT4_MB_HINT_FIRST 8 | 76 | #define EXT4_MB_HINT_FIRST 0x0008 |
77 | /* search for the best chunk */ | 77 | /* search for the best chunk */ |
78 | #define EXT4_MB_HINT_BEST 16 | 78 | #define EXT4_MB_HINT_BEST 0x0010 |
79 | /* data is being allocated */ | 79 | /* data is being allocated */ |
80 | #define EXT4_MB_HINT_DATA 32 | 80 | #define EXT4_MB_HINT_DATA 0x0020 |
81 | /* don't preallocate (for tails) */ | 81 | /* don't preallocate (for tails) */ |
82 | #define EXT4_MB_HINT_NOPREALLOC 64 | 82 | #define EXT4_MB_HINT_NOPREALLOC 0x0040 |
83 | /* allocate for locality group */ | 83 | /* allocate for locality group */ |
84 | #define EXT4_MB_HINT_GROUP_ALLOC 128 | 84 | #define EXT4_MB_HINT_GROUP_ALLOC 0x0080 |
85 | /* allocate goal blocks or none */ | 85 | /* allocate goal blocks or none */ |
86 | #define EXT4_MB_HINT_GOAL_ONLY 256 | 86 | #define EXT4_MB_HINT_GOAL_ONLY 0x0100 |
87 | /* goal is meaningful */ | 87 | /* goal is meaningful */ |
88 | #define EXT4_MB_HINT_TRY_GOAL 512 | 88 | #define EXT4_MB_HINT_TRY_GOAL 0x0200 |
89 | /* blocks already pre-reserved by delayed allocation */ | 89 | /* blocks already pre-reserved by delayed allocation */ |
90 | #define EXT4_MB_DELALLOC_RESERVED 1024 | 90 | #define EXT4_MB_DELALLOC_RESERVED 0x0400 |
91 | /* We are doing stream allocation */ | ||
92 | #define EXT4_MB_STREAM_ALLOC 0x0800 | ||
91 | 93 | ||
92 | 94 | ||
93 | struct ext4_allocation_request { | 95 | struct ext4_allocation_request { |
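Renumbering the allocation hints from decimal to hexadecimal makes the one-bit-per-hint invariant visible at a glance and lets a new flag such as EXT4_MB_STREAM_ALLOC slot in without decimal arithmetic. A trivial stand-alone illustration of how such flags compose and test (values copied from the defines above):

#include <stdio.h>

#define EXT4_MB_HINT_DATA        0x0020
#define EXT4_MB_HINT_NOPREALLOC  0x0040
#define EXT4_MB_STREAM_ALLOC     0x0800

int main(void)
{
    unsigned int flags = EXT4_MB_HINT_DATA | EXT4_MB_HINT_NOPREALLOC;

    if (flags & EXT4_MB_HINT_DATA)
        printf("data allocation requested\n");
    if (!(flags & EXT4_MB_STREAM_ALLOC))
        printf("not a stream allocation\n");
    return 0;
}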
@@ -112,6 +114,21 @@ struct ext4_allocation_request { | |||
112 | }; | 114 | }; |
113 | 115 | ||
114 | /* | 116 | /* |
117 | * For delayed allocation tracking | ||
118 | */ | ||
119 | struct mpage_da_data { | ||
120 | struct inode *inode; | ||
121 | sector_t b_blocknr; /* start block number of extent */ | ||
122 | size_t b_size; /* size of extent */ | ||
123 | unsigned long b_state; /* state of the extent */ | ||
124 | unsigned long first_page, next_page; /* extent of pages */ | ||
125 | struct writeback_control *wbc; | ||
126 | int io_done; | ||
127 | int pages_written; | ||
128 | int retval; | ||
129 | }; | ||
130 | |||
131 | /* | ||
115 | * Special inodes numbers | 132 | * Special inodes numbers |
116 | */ | 133 | */ |
117 | #define EXT4_BAD_INO 1 /* Bad blocks inode */ | 134 | #define EXT4_BAD_INO 1 /* Bad blocks inode */ |
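struct mpage_da_data moves from inode.c into ext4.h, presumably so code outside inode.c (such as the ext4_da_write_pages tracepoint added later in this diff) can see its layout. A stand-alone sketch of how the writeback walker uses the page-range fields (simplified types, not kernel code):

#include <stdio.h>

struct mpage_da_model {
    unsigned long first_page, next_page;  /* extent of pages gathered */
    unsigned long b_size;                 /* bytes accumulated so far */
    int io_done;
};

int main(void)
{
    struct mpage_da_model mpd = { .first_page = 10, .next_page = 10 };
    unsigned long page;

    for (page = 10; page < 14; page++) {  /* four contiguous dirty pages */
        mpd.next_page = page + 1;
        mpd.b_size += 4096;
    }
    mpd.io_done = 1;
    printf("pages [%lu, %lu), %lu bytes\n",
           mpd.first_page, mpd.next_page, mpd.b_size);
    return 0;
}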
@@ -251,7 +268,6 @@ struct flex_groups { | |||
251 | #define EXT4_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ | 268 | #define EXT4_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ |
252 | #define EXT4_HUGE_FILE_FL 0x00040000 /* Set to each huge file */ | 269 | #define EXT4_HUGE_FILE_FL 0x00040000 /* Set to each huge file */ |
253 | #define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */ | 270 | #define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */ |
254 | #define EXT4_EXT_MIGRATE 0x00100000 /* Inode is migrating */ | ||
255 | #define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ | 271 | #define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ |
256 | 272 | ||
257 | #define EXT4_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */ | 273 | #define EXT4_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */ |
@@ -289,6 +305,7 @@ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags) | |||
289 | #define EXT4_STATE_XATTR 0x00000004 /* has in-inode xattrs */ | 305 | #define EXT4_STATE_XATTR 0x00000004 /* has in-inode xattrs */ |
290 | #define EXT4_STATE_NO_EXPAND 0x00000008 /* No space for expansion */ | 306 | #define EXT4_STATE_NO_EXPAND 0x00000008 /* No space for expansion */ |
291 | #define EXT4_STATE_DA_ALLOC_CLOSE 0x00000010 /* Alloc DA blks on close */ | 307 | #define EXT4_STATE_DA_ALLOC_CLOSE 0x00000010 /* Alloc DA blks on close */ |
308 | #define EXT4_STATE_EXT_MIGRATE 0x00000020 /* Inode is migrating */ | ||
292 | 309 | ||
293 | /* Used to pass group descriptor data when online resize is done */ | 310 | /* Used to pass group descriptor data when online resize is done */ |
294 | struct ext4_new_group_input { | 311 | struct ext4_new_group_input { |
@@ -386,6 +403,9 @@ struct ext4_mount_options { | |||
386 | #endif | 403 | #endif |
387 | }; | 404 | }; |
388 | 405 | ||
406 | /* Max physical block we can address w/o extents */ | ||

407 | #define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFF | ||
408 | |||
389 | /* | 409 | /* |
390 | * Structure of an inode on the disk | 410 | * Structure of an inode on the disk |
391 | */ | 411 | */ |
@@ -456,7 +476,6 @@ struct move_extent { | |||
456 | __u64 len; /* block length to be moved */ | 476 | __u64 len; /* block length to be moved */ |
457 | __u64 moved_len; /* moved block length */ | 477 | __u64 moved_len; /* moved block length */ |
458 | }; | 478 | }; |
459 | #define MAX_DEFRAG_SIZE ((1UL<<31) - 1) | ||
460 | 479 | ||
461 | #define EXT4_EPOCH_BITS 2 | 480 | #define EXT4_EPOCH_BITS 2 |
462 | #define EXT4_EPOCH_MASK ((1 << EXT4_EPOCH_BITS) - 1) | 481 | #define EXT4_EPOCH_MASK ((1 << EXT4_EPOCH_BITS) - 1) |
@@ -694,7 +713,6 @@ struct ext4_inode_info { | |||
694 | #define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */ | 713 | #define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */ |
695 | #define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */ | 714 | #define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */ |
696 | #define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ | 715 | #define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ |
697 | #define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ | ||
698 | #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ | 716 | #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ |
699 | #define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ | 717 | #define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ |
700 | #define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ | 718 | #define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ |
@@ -841,6 +859,7 @@ struct ext4_sb_info { | |||
841 | unsigned long s_gdb_count; /* Number of group descriptor blocks */ | 859 | unsigned long s_gdb_count; /* Number of group descriptor blocks */ |
842 | unsigned long s_desc_per_block; /* Number of group descriptors per block */ | 860 | unsigned long s_desc_per_block; /* Number of group descriptors per block */ |
843 | ext4_group_t s_groups_count; /* Number of groups in the fs */ | 861 | ext4_group_t s_groups_count; /* Number of groups in the fs */ |
862 | ext4_group_t s_blockfile_groups;/* Groups acceptable for non-extent files */ | ||
844 | unsigned long s_overhead_last; /* Last calculated overhead */ | 863 | unsigned long s_overhead_last; /* Last calculated overhead */ |
845 | unsigned long s_blocks_last; /* Last seen block count */ | 864 | unsigned long s_blocks_last; /* Last seen block count */ |
846 | loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */ | 865 | loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */ |
@@ -950,6 +969,7 @@ struct ext4_sb_info { | |||
950 | atomic_t s_mb_lost_chunks; | 969 | atomic_t s_mb_lost_chunks; |
951 | atomic_t s_mb_preallocated; | 970 | atomic_t s_mb_preallocated; |
952 | atomic_t s_mb_discarded; | 971 | atomic_t s_mb_discarded; |
972 | atomic_t s_lock_busy; | ||
953 | 973 | ||
954 | /* locality groups */ | 974 | /* locality groups */ |
955 | struct ext4_locality_group *s_locality_groups; | 975 | struct ext4_locality_group *s_locality_groups; |
@@ -1340,8 +1360,6 @@ extern void ext4_mb_free_blocks(handle_t *, struct inode *, | |||
1340 | ext4_fsblk_t, unsigned long, int, unsigned long *); | 1360 | ext4_fsblk_t, unsigned long, int, unsigned long *); |
1341 | extern int ext4_mb_add_groupinfo(struct super_block *sb, | 1361 | extern int ext4_mb_add_groupinfo(struct super_block *sb, |
1342 | ext4_group_t i, struct ext4_group_desc *desc); | 1362 | ext4_group_t i, struct ext4_group_desc *desc); |
1343 | extern void ext4_mb_update_group_info(struct ext4_group_info *grp, | ||
1344 | ext4_grpblk_t add); | ||
1345 | extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t); | 1363 | extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t); |
1346 | extern void ext4_mb_put_buddy_cache_lock(struct super_block *, | 1364 | extern void ext4_mb_put_buddy_cache_lock(struct super_block *, |
1347 | ext4_group_t, int); | 1365 | ext4_group_t, int); |
@@ -1367,6 +1385,7 @@ extern int ext4_change_inode_journal_flag(struct inode *, int); | |||
1367 | extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); | 1385 | extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); |
1368 | extern int ext4_can_truncate(struct inode *inode); | 1386 | extern int ext4_can_truncate(struct inode *inode); |
1369 | extern void ext4_truncate(struct inode *); | 1387 | extern void ext4_truncate(struct inode *); |
1388 | extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks); | ||
1370 | extern void ext4_set_inode_flags(struct inode *); | 1389 | extern void ext4_set_inode_flags(struct inode *); |
1371 | extern void ext4_get_inode_flags(struct ext4_inode_info *); | 1390 | extern void ext4_get_inode_flags(struct ext4_inode_info *); |
1372 | extern int ext4_alloc_da_blocks(struct inode *inode); | 1391 | extern int ext4_alloc_da_blocks(struct inode *inode); |
@@ -1575,15 +1594,18 @@ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize) | |||
1575 | struct ext4_group_info { | 1594 | struct ext4_group_info { |
1576 | unsigned long bb_state; | 1595 | unsigned long bb_state; |
1577 | struct rb_root bb_free_root; | 1596 | struct rb_root bb_free_root; |
1578 | unsigned short bb_first_free; | 1597 | ext4_grpblk_t bb_first_free; /* first free block */ |
1579 | unsigned short bb_free; | 1598 | ext4_grpblk_t bb_free; /* total free blocks */ |
1580 | unsigned short bb_fragments; | 1599 | ext4_grpblk_t bb_fragments; /* nr of freespace fragments */ |
1581 | struct list_head bb_prealloc_list; | 1600 | struct list_head bb_prealloc_list; |
1582 | #ifdef DOUBLE_CHECK | 1601 | #ifdef DOUBLE_CHECK |
1583 | void *bb_bitmap; | 1602 | void *bb_bitmap; |
1584 | #endif | 1603 | #endif |
1585 | struct rw_semaphore alloc_sem; | 1604 | struct rw_semaphore alloc_sem; |
1586 | unsigned short bb_counters[]; | 1605 | ext4_grpblk_t bb_counters[]; /* Nr of free power-of-two-block |
1606 | * regions, index is order. | ||
1607 | * bb_counters[3] = 5 means | ||
1608 | * 5 free 8-block regions. */ | ||
1587 | }; | 1609 | }; |
1588 | 1610 | ||
1589 | #define EXT4_GROUP_INFO_NEED_INIT_BIT 0 | 1611 | #define EXT4_GROUP_INFO_NEED_INIT_BIT 0 |
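The widened bb_counters[] comment is worth unpacking: entry `order` counts free buddy regions of 2^order blocks, so a group's free-block total is the weighted sum over orders. A stand-alone check of that arithmetic (plain C, not kernel code):

#include <stdio.h>

int main(void)
{
    /* bb_counters[3] = 5 means five free 8-block regions */
    int bb_counters[] = { 2, 0, 1, 5 };          /* orders 0..3 */
    int order, free_blocks = 0;

    for (order = 0; order < 4; order++)
        free_blocks += bb_counters[order] << order;

    printf("free blocks: %d\n", free_blocks);    /* 2 + 4 + 40 = 46 */
    return 0;
}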
@@ -1591,15 +1613,42 @@ struct ext4_group_info { | |||
1591 | #define EXT4_MB_GRP_NEED_INIT(grp) \ | 1613 | #define EXT4_MB_GRP_NEED_INIT(grp) \ |
1592 | (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) | 1614 | (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) |
1593 | 1615 | ||
1616 | #define EXT4_MAX_CONTENTION 8 | ||
1617 | #define EXT4_CONTENTION_THRESHOLD 2 | ||
1618 | |||
1594 | static inline spinlock_t *ext4_group_lock_ptr(struct super_block *sb, | 1619 | static inline spinlock_t *ext4_group_lock_ptr(struct super_block *sb, |
1595 | ext4_group_t group) | 1620 | ext4_group_t group) |
1596 | { | 1621 | { |
1597 | return bgl_lock_ptr(EXT4_SB(sb)->s_blockgroup_lock, group); | 1622 | return bgl_lock_ptr(EXT4_SB(sb)->s_blockgroup_lock, group); |
1598 | } | 1623 | } |
1599 | 1624 | ||
1625 | /* | ||
1626 | * Returns true if the filesystem is busy enough that attempts to | ||
1627 | * access the block group locks have run into contention. | ||
1628 | */ | ||
1629 | static inline int ext4_fs_is_busy(struct ext4_sb_info *sbi) | ||
1630 | { | ||
1631 | return (atomic_read(&sbi->s_lock_busy) > EXT4_CONTENTION_THRESHOLD); | ||
1632 | } | ||
1633 | |||
1600 | static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group) | 1634 | static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group) |
1601 | { | 1635 | { |
1602 | spin_lock(ext4_group_lock_ptr(sb, group)); | 1636 | spinlock_t *lock = ext4_group_lock_ptr(sb, group); |
1637 | if (spin_trylock(lock)) | ||
1638 | /* | ||
1639 | * We're able to grab the lock right away, so drop the | ||
1640 | * lock contention counter. | ||
1641 | */ | ||
1642 | atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, -1, 0); | ||
1643 | else { | ||
1644 | /* | ||
1645 | * The lock is busy, so bump the contention counter, | ||
1646 | * and then wait on the spin lock. | ||
1647 | */ | ||
1648 | atomic_add_unless(&EXT4_SB(sb)->s_lock_busy, 1, | ||
1649 | EXT4_MAX_CONTENTION); | ||
1650 | spin_lock(lock); | ||
1651 | } | ||
1603 | } | 1652 | } |
1604 | 1653 | ||
1605 | static inline void ext4_unlock_group(struct super_block *sb, | 1654 | static inline void ext4_unlock_group(struct super_block *sb, |
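The new ext4_lock_group() is an adaptive contention estimator: an uncontended trylock decays s_lock_busy toward zero, a failed trylock bumps it toward EXT4_MAX_CONTENTION before spinning, and ext4_fs_is_busy() reads the counter to decide when the filesystem is hot. A user-space model of the same pattern, with pthread spinlocks and C11 atomics standing in for the kernel primitives:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define MAX_CONTENTION       8
#define CONTENTION_THRESHOLD 2

static atomic_int lock_busy;
static pthread_spinlock_t group_lock;

static void lock_group(void)
{
    if (pthread_spin_trylock(&group_lock) == 0) {
        /* uncontended: decay the counter, but never below zero
         * (mirrors atomic_add_unless(..., -1, 0)) */
        int v = atomic_load(&lock_busy);
        while (v > 0 &&
               !atomic_compare_exchange_weak(&lock_busy, &v, v - 1))
            ;
    } else {
        /* contended: bump the counter up to the cap, then wait */
        int v = atomic_load(&lock_busy);
        while (v < MAX_CONTENTION &&
               !atomic_compare_exchange_weak(&lock_busy, &v, v + 1))
            ;
        pthread_spin_lock(&group_lock);
    }
}

static int fs_is_busy(void)
{
    return atomic_load(&lock_busy) > CONTENTION_THRESHOLD;
}

int main(void)
{
    pthread_spin_init(&group_lock, PTHREAD_PROCESS_PRIVATE);
    lock_group();
    printf("busy: %d\n", fs_is_busy());
    pthread_spin_unlock(&group_lock);
    pthread_spin_destroy(&group_lock);
    return 0;
}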
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index 20a84105a10b..61652f1d15e6 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h | |||
@@ -43,8 +43,7 @@ | |||
43 | #define CHECK_BINSEARCH__ | 43 | #define CHECK_BINSEARCH__ |
44 | 44 | ||
45 | /* | 45 | /* |
46 | * If EXT_DEBUG is defined you can use the 'extdebug' mount option | 46 | * Turn on EXT_DEBUG to get lots of info about extent operations.
47 | * to get lots of info about what's going on. | ||
48 | */ | 47 | */ |
49 | #define EXT_DEBUG__ | 48 | #define EXT_DEBUG__ |
50 | #ifdef EXT_DEBUG | 49 | #ifdef EXT_DEBUG |
@@ -138,6 +137,7 @@ typedef int (*ext_prepare_callback)(struct inode *, struct ext4_ext_path *, | |||
138 | #define EXT_BREAK 1 | 137 | #define EXT_BREAK 1 |
139 | #define EXT_REPEAT 2 | 138 | #define EXT_REPEAT 2 |
140 | 139 | ||
140 | /* Maximum logical block in a file; ext4_extent's ee_block is __le32 */ | ||
141 | #define EXT_MAX_BLOCK 0xffffffff | 141 | #define EXT_MAX_BLOCK 0xffffffff |
142 | 142 | ||
143 | /* | 143 | /* |
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index eb27fd0f2ee8..6a9409920dee 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c | |||
@@ -44,7 +44,7 @@ int __ext4_journal_forget(const char *where, handle_t *handle, | |||
44 | handle, err); | 44 | handle, err); |
45 | } | 45 | } |
46 | else | 46 | else |
47 | brelse(bh); | 47 | bforget(bh); |
48 | return err; | 48 | return err; |
49 | } | 49 | } |
50 | 50 | ||
@@ -60,7 +60,7 @@ int __ext4_journal_revoke(const char *where, handle_t *handle, | |||
60 | handle, err); | 60 | handle, err); |
61 | } | 61 | } |
62 | else | 62 | else |
63 | brelse(bh); | 63 | bforget(bh); |
64 | return err; | 64 | return err; |
65 | } | 65 | } |
66 | 66 | ||
@@ -89,7 +89,10 @@ int __ext4_handle_dirty_metadata(const char *where, handle_t *handle, | |||
89 | ext4_journal_abort_handle(where, __func__, bh, | 89 | ext4_journal_abort_handle(where, __func__, bh, |
90 | handle, err); | 90 | handle, err); |
91 | } else { | 91 | } else { |
92 | mark_buffer_dirty(bh); | 92 | if (inode && bh) |
93 | mark_buffer_dirty_inode(bh, inode); | ||
94 | else | ||
95 | mark_buffer_dirty(bh); | ||
93 | if (inode && inode_needs_sync(inode)) { | 96 | if (inode && inode_needs_sync(inode)) { |
94 | sync_dirty_buffer(bh); | 97 | sync_dirty_buffer(bh); |
95 | if (buffer_req(bh) && !buffer_uptodate(bh)) { | 98 | if (buffer_req(bh) && !buffer_uptodate(bh)) { |
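Two related fixes in ext4_jbd2.c: on the error-free no-journal path, __ext4_journal_forget() and __ext4_journal_revoke() now drop the buffer with bforget() rather than brelse(), discarding dirty state for a block that is being forgotten; and dirty metadata is filed with mark_buffer_dirty_inode() whenever an owning inode is known, putting the buffer on that inode's private list where a later fsync can find it. A stub-based model of the routing decision (not kernel code):

#include <stdio.h>

static void mark_buffer_dirty_inode(void) { puts("dirtied on inode's private list"); }
static void mark_buffer_dirty(void)       { puts("dirtied globally"); }

static void handle_dirty_metadata(int have_inode, int have_bh)
{
    if (have_inode && have_bh)
        mark_buffer_dirty_inode();   /* fsync can walk the inode's list */
    else
        mark_buffer_dirty();
}

int main(void)
{
    handle_dirty_metadata(1, 1);
    handle_dirty_metadata(0, 1);
    return 0;
}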
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 73ebfb44ad75..7a3832577923 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c | |||
@@ -93,7 +93,9 @@ static void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb) | |||
93 | ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff); | 93 | ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff); |
94 | } | 94 | } |
95 | 95 | ||
96 | static int ext4_ext_journal_restart(handle_t *handle, int needed) | 96 | static int ext4_ext_truncate_extend_restart(handle_t *handle, |
97 | struct inode *inode, | ||
98 | int needed) | ||
97 | { | 99 | { |
98 | int err; | 100 | int err; |
99 | 101 | ||
@@ -104,7 +106,14 @@ static int ext4_ext_journal_restart(handle_t *handle, int needed) | |||
104 | err = ext4_journal_extend(handle, needed); | 106 | err = ext4_journal_extend(handle, needed); |
105 | if (err <= 0) | 107 | if (err <= 0) |
106 | return err; | 108 | return err; |
107 | return ext4_journal_restart(handle, needed); | 109 | err = ext4_truncate_restart_trans(handle, inode, needed); |
110 | /* | ||
111 | * We have dropped i_data_sem so someone might have cached again | ||
112 | * an extent we are going to truncate. | ||
113 | */ | ||
114 | ext4_ext_invalidate_cache(inode); | ||
115 | |||
116 | return err; | ||
108 | } | 117 | } |
109 | 118 | ||
110 | /* | 119 | /* |
@@ -220,57 +229,65 @@ ext4_ext_new_meta_block(handle_t *handle, struct inode *inode, | |||
220 | return newblock; | 229 | return newblock; |
221 | } | 230 | } |
222 | 231 | ||
223 | static int ext4_ext_space_block(struct inode *inode) | 232 | static inline int ext4_ext_space_block(struct inode *inode, int check) |
224 | { | 233 | { |
225 | int size; | 234 | int size; |
226 | 235 | ||
227 | size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) | 236 | size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) |
228 | / sizeof(struct ext4_extent); | 237 | / sizeof(struct ext4_extent); |
238 | if (!check) { | ||
229 | #ifdef AGGRESSIVE_TEST | 239 | #ifdef AGGRESSIVE_TEST |
230 | if (size > 6) | 240 | if (size > 6) |
231 | size = 6; | 241 | size = 6; |
232 | #endif | 242 | #endif |
243 | } | ||
233 | return size; | 244 | return size; |
234 | } | 245 | } |
235 | 246 | ||
236 | static int ext4_ext_space_block_idx(struct inode *inode) | 247 | static inline int ext4_ext_space_block_idx(struct inode *inode, int check) |
237 | { | 248 | { |
238 | int size; | 249 | int size; |
239 | 250 | ||
240 | size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) | 251 | size = (inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) |
241 | / sizeof(struct ext4_extent_idx); | 252 | / sizeof(struct ext4_extent_idx); |
253 | if (!check) { | ||
242 | #ifdef AGGRESSIVE_TEST | 254 | #ifdef AGGRESSIVE_TEST |
243 | if (size > 5) | 255 | if (size > 5) |
244 | size = 5; | 256 | size = 5; |
245 | #endif | 257 | #endif |
258 | } | ||
246 | return size; | 259 | return size; |
247 | } | 260 | } |
248 | 261 | ||
249 | static int ext4_ext_space_root(struct inode *inode) | 262 | static inline int ext4_ext_space_root(struct inode *inode, int check) |
250 | { | 263 | { |
251 | int size; | 264 | int size; |
252 | 265 | ||
253 | size = sizeof(EXT4_I(inode)->i_data); | 266 | size = sizeof(EXT4_I(inode)->i_data); |
254 | size -= sizeof(struct ext4_extent_header); | 267 | size -= sizeof(struct ext4_extent_header); |
255 | size /= sizeof(struct ext4_extent); | 268 | size /= sizeof(struct ext4_extent); |
269 | if (!check) { | ||
256 | #ifdef AGGRESSIVE_TEST | 270 | #ifdef AGGRESSIVE_TEST |
257 | if (size > 3) | 271 | if (size > 3) |
258 | size = 3; | 272 | size = 3; |
259 | #endif | 273 | #endif |
274 | } | ||
260 | return size; | 275 | return size; |
261 | } | 276 | } |
262 | 277 | ||
263 | static int ext4_ext_space_root_idx(struct inode *inode) | 278 | static inline int ext4_ext_space_root_idx(struct inode *inode, int check) |
264 | { | 279 | { |
265 | int size; | 280 | int size; |
266 | 281 | ||
267 | size = sizeof(EXT4_I(inode)->i_data); | 282 | size = sizeof(EXT4_I(inode)->i_data); |
268 | size -= sizeof(struct ext4_extent_header); | 283 | size -= sizeof(struct ext4_extent_header); |
269 | size /= sizeof(struct ext4_extent_idx); | 284 | size /= sizeof(struct ext4_extent_idx); |
285 | if (!check) { | ||
270 | #ifdef AGGRESSIVE_TEST | 286 | #ifdef AGGRESSIVE_TEST |
271 | if (size > 4) | 287 | if (size > 4) |
272 | size = 4; | 288 | size = 4; |
273 | #endif | 289 | #endif |
290 | } | ||
274 | return size; | 291 | return size; |
275 | } | 292 | } |
276 | 293 | ||
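AGGRESSIVE_TEST deliberately clamps the in-memory eh_max so tree-split paths get exercised even on small filesystems, but the on-disk validity check in ext4_ext_max_entries() must compare against the true capacity; hence the new `check` parameter selecting between the two. A stand-alone sketch of the idea (the header/extent sizes are placeholders):

#include <stdio.h>

#define AGGRESSIVE_TEST   /* pretend the debug clamp is configured in */

static int space_block(int blocksize, int check)
{
    int size = (blocksize - 12) / 12;   /* (block - header) / extent */

    if (!check) {
#ifdef AGGRESSIVE_TEST
        if (size > 6)
            size = 6;                   /* clamp only for allocation */
#endif
    }
    return size;
}

int main(void)
{
    printf("allocation capacity: %d\n", space_block(4096, 0)); /* clamped: 6 */
    printf("validation capacity: %d\n", space_block(4096, 1)); /* real: 340 */
    return 0;
}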
@@ -284,9 +301,9 @@ int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks) | |||
284 | int lcap, icap, rcap, leafs, idxs, num; | 301 | int lcap, icap, rcap, leafs, idxs, num; |
285 | int newextents = blocks; | 302 | int newextents = blocks; |
286 | 303 | ||
287 | rcap = ext4_ext_space_root_idx(inode); | 304 | rcap = ext4_ext_space_root_idx(inode, 0); |
288 | lcap = ext4_ext_space_block(inode); | 305 | lcap = ext4_ext_space_block(inode, 0); |
289 | icap = ext4_ext_space_block_idx(inode); | 306 | icap = ext4_ext_space_block_idx(inode, 0); |
290 | 307 | ||
291 | /* number of new leaf blocks needed */ | 308 | /* number of new leaf blocks needed */ |
292 | num = leafs = (newextents + lcap - 1) / lcap; | 309 | num = leafs = (newextents + lcap - 1) / lcap; |
@@ -311,14 +328,14 @@ ext4_ext_max_entries(struct inode *inode, int depth) | |||
311 | 328 | ||
312 | if (depth == ext_depth(inode)) { | 329 | if (depth == ext_depth(inode)) { |
313 | if (depth == 0) | 330 | if (depth == 0) |
314 | max = ext4_ext_space_root(inode); | 331 | max = ext4_ext_space_root(inode, 1); |
315 | else | 332 | else |
316 | max = ext4_ext_space_root_idx(inode); | 333 | max = ext4_ext_space_root_idx(inode, 1); |
317 | } else { | 334 | } else { |
318 | if (depth == 0) | 335 | if (depth == 0) |
319 | max = ext4_ext_space_block(inode); | 336 | max = ext4_ext_space_block(inode, 1); |
320 | else | 337 | else |
321 | max = ext4_ext_space_block_idx(inode); | 338 | max = ext4_ext_space_block_idx(inode, 1); |
322 | } | 339 | } |
323 | 340 | ||
324 | return max; | 341 | return max; |
@@ -437,8 +454,9 @@ static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path) | |||
437 | ext_debug(" %d->%llu", le32_to_cpu(path->p_idx->ei_block), | 454 | ext_debug(" %d->%llu", le32_to_cpu(path->p_idx->ei_block), |
438 | idx_pblock(path->p_idx)); | 455 | idx_pblock(path->p_idx)); |
439 | } else if (path->p_ext) { | 456 | } else if (path->p_ext) { |
440 | ext_debug(" %d:%d:%llu ", | 457 | ext_debug(" %d:[%d]%d:%llu ", |
441 | le32_to_cpu(path->p_ext->ee_block), | 458 | le32_to_cpu(path->p_ext->ee_block), |
459 | ext4_ext_is_uninitialized(path->p_ext), | ||
442 | ext4_ext_get_actual_len(path->p_ext), | 460 | ext4_ext_get_actual_len(path->p_ext), |
443 | ext_pblock(path->p_ext)); | 461 | ext_pblock(path->p_ext)); |
444 | } else | 462 | } else |
@@ -460,8 +478,11 @@ static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path) | |||
460 | eh = path[depth].p_hdr; | 478 | eh = path[depth].p_hdr; |
461 | ex = EXT_FIRST_EXTENT(eh); | 479 | ex = EXT_FIRST_EXTENT(eh); |
462 | 480 | ||
481 | ext_debug("Displaying leaf extents for inode %lu\n", inode->i_ino); | ||
482 | |||
463 | for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) { | 483 | for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) { |
464 | ext_debug("%d:%d:%llu ", le32_to_cpu(ex->ee_block), | 484 | ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block), |
485 | ext4_ext_is_uninitialized(ex), | ||
465 | ext4_ext_get_actual_len(ex), ext_pblock(ex)); | 486 | ext4_ext_get_actual_len(ex), ext_pblock(ex)); |
466 | } | 487 | } |
467 | ext_debug("\n"); | 488 | ext_debug("\n"); |
@@ -580,9 +601,10 @@ ext4_ext_binsearch(struct inode *inode, | |||
580 | } | 601 | } |
581 | 602 | ||
582 | path->p_ext = l - 1; | 603 | path->p_ext = l - 1; |
583 | ext_debug(" -> %d:%llu:%d ", | 604 | ext_debug(" -> %d:%llu:[%d]%d ", |
584 | le32_to_cpu(path->p_ext->ee_block), | 605 | le32_to_cpu(path->p_ext->ee_block), |
585 | ext_pblock(path->p_ext), | 606 | ext_pblock(path->p_ext), |
607 | ext4_ext_is_uninitialized(path->p_ext), | ||
586 | ext4_ext_get_actual_len(path->p_ext)); | 608 | ext4_ext_get_actual_len(path->p_ext)); |
587 | 609 | ||
588 | #ifdef CHECK_BINSEARCH | 610 | #ifdef CHECK_BINSEARCH |
@@ -612,7 +634,7 @@ int ext4_ext_tree_init(handle_t *handle, struct inode *inode) | |||
612 | eh->eh_depth = 0; | 634 | eh->eh_depth = 0; |
613 | eh->eh_entries = 0; | 635 | eh->eh_entries = 0; |
614 | eh->eh_magic = EXT4_EXT_MAGIC; | 636 | eh->eh_magic = EXT4_EXT_MAGIC; |
615 | eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode)); | 637 | eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0)); |
616 | ext4_mark_inode_dirty(handle, inode); | 638 | ext4_mark_inode_dirty(handle, inode); |
617 | ext4_ext_invalidate_cache(inode); | 639 | ext4_ext_invalidate_cache(inode); |
618 | return 0; | 640 | return 0; |
@@ -837,7 +859,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, | |||
837 | 859 | ||
838 | neh = ext_block_hdr(bh); | 860 | neh = ext_block_hdr(bh); |
839 | neh->eh_entries = 0; | 861 | neh->eh_entries = 0; |
840 | neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode)); | 862 | neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0)); |
841 | neh->eh_magic = EXT4_EXT_MAGIC; | 863 | neh->eh_magic = EXT4_EXT_MAGIC; |
842 | neh->eh_depth = 0; | 864 | neh->eh_depth = 0; |
843 | ex = EXT_FIRST_EXTENT(neh); | 865 | ex = EXT_FIRST_EXTENT(neh); |
@@ -850,9 +872,10 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, | |||
850 | path[depth].p_ext++; | 872 | path[depth].p_ext++; |
851 | while (path[depth].p_ext <= | 873 | while (path[depth].p_ext <= |
852 | EXT_MAX_EXTENT(path[depth].p_hdr)) { | 874 | EXT_MAX_EXTENT(path[depth].p_hdr)) { |
853 | ext_debug("move %d:%llu:%d in new leaf %llu\n", | 875 | ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n", |
854 | le32_to_cpu(path[depth].p_ext->ee_block), | 876 | le32_to_cpu(path[depth].p_ext->ee_block), |
855 | ext_pblock(path[depth].p_ext), | 877 | ext_pblock(path[depth].p_ext), |
878 | ext4_ext_is_uninitialized(path[depth].p_ext), | ||
856 | ext4_ext_get_actual_len(path[depth].p_ext), | 879 | ext4_ext_get_actual_len(path[depth].p_ext), |
857 | newblock); | 880 | newblock); |
858 | /*memmove(ex++, path[depth].p_ext++, | 881 | /*memmove(ex++, path[depth].p_ext++, |
@@ -912,7 +935,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, | |||
912 | neh = ext_block_hdr(bh); | 935 | neh = ext_block_hdr(bh); |
913 | neh->eh_entries = cpu_to_le16(1); | 936 | neh->eh_entries = cpu_to_le16(1); |
914 | neh->eh_magic = EXT4_EXT_MAGIC; | 937 | neh->eh_magic = EXT4_EXT_MAGIC; |
915 | neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode)); | 938 | neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0)); |
916 | neh->eh_depth = cpu_to_le16(depth - i); | 939 | neh->eh_depth = cpu_to_le16(depth - i); |
917 | fidx = EXT_FIRST_INDEX(neh); | 940 | fidx = EXT_FIRST_INDEX(neh); |
918 | fidx->ei_block = border; | 941 | fidx->ei_block = border; |
@@ -1037,9 +1060,9 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, | |||
1037 | /* old root could have indexes or leaves | 1060 | /* old root could have indexes or leaves |
1038 | * so calculate e_max the right way */ | 1061 | * so calculate e_max the right way */
1039 | if (ext_depth(inode)) | 1062 | if (ext_depth(inode)) |
1040 | neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode)); | 1063 | neh->eh_max = cpu_to_le16(ext4_ext_space_block_idx(inode, 0)); |
1041 | else | 1064 | else |
1042 | neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode)); | 1065 | neh->eh_max = cpu_to_le16(ext4_ext_space_block(inode, 0)); |
1043 | neh->eh_magic = EXT4_EXT_MAGIC; | 1066 | neh->eh_magic = EXT4_EXT_MAGIC; |
1044 | set_buffer_uptodate(bh); | 1067 | set_buffer_uptodate(bh); |
1045 | unlock_buffer(bh); | 1068 | unlock_buffer(bh); |
@@ -1054,7 +1077,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, | |||
1054 | goto out; | 1077 | goto out; |
1055 | 1078 | ||
1056 | curp->p_hdr->eh_magic = EXT4_EXT_MAGIC; | 1079 | curp->p_hdr->eh_magic = EXT4_EXT_MAGIC; |
1057 | curp->p_hdr->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode)); | 1080 | curp->p_hdr->eh_max = cpu_to_le16(ext4_ext_space_root_idx(inode, 0)); |
1058 | curp->p_hdr->eh_entries = cpu_to_le16(1); | 1081 | curp->p_hdr->eh_entries = cpu_to_le16(1); |
1059 | curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr); | 1082 | curp->p_idx = EXT_FIRST_INDEX(curp->p_hdr); |
1060 | 1083 | ||
@@ -1580,9 +1603,11 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, | |||
1580 | 1603 | ||
1581 | /* try to insert block into found extent and return */ | 1604 | /* try to insert block into found extent and return */ |
1582 | if (ex && ext4_can_extents_be_merged(inode, ex, newext)) { | 1605 | if (ex && ext4_can_extents_be_merged(inode, ex, newext)) { |
1583 | ext_debug("append %d block to %d:%d (from %llu)\n", | 1606 | ext_debug("append [%d]%d block to %d:[%d]%d (from %llu)\n", |
1607 | ext4_ext_is_uninitialized(newext), | ||
1584 | ext4_ext_get_actual_len(newext), | 1608 | ext4_ext_get_actual_len(newext), |
1585 | le32_to_cpu(ex->ee_block), | 1609 | le32_to_cpu(ex->ee_block), |
1610 | ext4_ext_is_uninitialized(ex), | ||
1586 | ext4_ext_get_actual_len(ex), ext_pblock(ex)); | 1611 | ext4_ext_get_actual_len(ex), ext_pblock(ex)); |
1587 | err = ext4_ext_get_access(handle, inode, path + depth); | 1612 | err = ext4_ext_get_access(handle, inode, path + depth); |
1588 | if (err) | 1613 | if (err) |
@@ -1651,9 +1676,10 @@ has_space: | |||
1651 | 1676 | ||
1652 | if (!nearex) { | 1677 | if (!nearex) { |
1653 | /* there is no extent in this leaf, create first one */ | 1678 | /* there is no extent in this leaf, create first one */ |
1654 | ext_debug("first extent in the leaf: %d:%llu:%d\n", | 1679 | ext_debug("first extent in the leaf: %d:%llu:[%d]%d\n", |
1655 | le32_to_cpu(newext->ee_block), | 1680 | le32_to_cpu(newext->ee_block), |
1656 | ext_pblock(newext), | 1681 | ext_pblock(newext), |
1682 | ext4_ext_is_uninitialized(newext), | ||
1657 | ext4_ext_get_actual_len(newext)); | 1683 | ext4_ext_get_actual_len(newext)); |
1658 | path[depth].p_ext = EXT_FIRST_EXTENT(eh); | 1684 | path[depth].p_ext = EXT_FIRST_EXTENT(eh); |
1659 | } else if (le32_to_cpu(newext->ee_block) | 1685 | } else if (le32_to_cpu(newext->ee_block) |
@@ -1663,10 +1689,11 @@ has_space: | |||
1663 | len = EXT_MAX_EXTENT(eh) - nearex; | 1689 | len = EXT_MAX_EXTENT(eh) - nearex; |
1664 | len = (len - 1) * sizeof(struct ext4_extent); | 1690 | len = (len - 1) * sizeof(struct ext4_extent); |
1665 | len = len < 0 ? 0 : len; | 1691 | len = len < 0 ? 0 : len; |
1666 | ext_debug("insert %d:%llu:%d after: nearest 0x%p, " | 1692 | ext_debug("insert %d:%llu:[%d]%d after: nearest 0x%p, " |
1667 | "move %d from 0x%p to 0x%p\n", | 1693 | "move %d from 0x%p to 0x%p\n", |
1668 | le32_to_cpu(newext->ee_block), | 1694 | le32_to_cpu(newext->ee_block), |
1669 | ext_pblock(newext), | 1695 | ext_pblock(newext), |
1696 | ext4_ext_is_uninitialized(newext), | ||
1670 | ext4_ext_get_actual_len(newext), | 1697 | ext4_ext_get_actual_len(newext), |
1671 | nearex, len, nearex + 1, nearex + 2); | 1698 | nearex, len, nearex + 1, nearex + 2); |
1672 | memmove(nearex + 2, nearex + 1, len); | 1699 | memmove(nearex + 2, nearex + 1, len); |
@@ -1676,10 +1703,11 @@ has_space: | |||
1676 | BUG_ON(newext->ee_block == nearex->ee_block); | 1703 | BUG_ON(newext->ee_block == nearex->ee_block); |
1677 | len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext4_extent); | 1704 | len = (EXT_MAX_EXTENT(eh) - nearex) * sizeof(struct ext4_extent); |
1678 | len = len < 0 ? 0 : len; | 1705 | len = len < 0 ? 0 : len; |
1679 | ext_debug("insert %d:%llu:%d before: nearest 0x%p, " | 1706 | ext_debug("insert %d:%llu:[%d]%d before: nearest 0x%p, " |
1680 | "move %d from 0x%p to 0x%p\n", | 1707 | "move %d from 0x%p to 0x%p\n", |
1681 | le32_to_cpu(newext->ee_block), | 1708 | le32_to_cpu(newext->ee_block), |
1682 | ext_pblock(newext), | 1709 | ext_pblock(newext), |
1710 | ext4_ext_is_uninitialized(newext), | ||
1683 | ext4_ext_get_actual_len(newext), | 1711 | ext4_ext_get_actual_len(newext), |
1684 | nearex, len, nearex + 1, nearex + 2); | 1712 | nearex, len, nearex + 1, nearex + 2); |
1685 | memmove(nearex + 1, nearex, len); | 1713 | memmove(nearex + 1, nearex, len); |
@@ -2094,7 +2122,8 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, | |||
2094 | else | 2122 | else |
2095 | uninitialized = 0; | 2123 | uninitialized = 0; |
2096 | 2124 | ||
2097 | ext_debug("remove ext %lu:%u\n", ex_ee_block, ex_ee_len); | 2125 | ext_debug("remove ext %u:[%d]%d\n", ex_ee_block, |
2126 | uninitialized, ex_ee_len); | ||
2098 | path[depth].p_ext = ex; | 2127 | path[depth].p_ext = ex; |
2099 | 2128 | ||
2100 | a = ex_ee_block > start ? ex_ee_block : start; | 2129 | a = ex_ee_block > start ? ex_ee_block : start; |
@@ -2138,7 +2167,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, | |||
2138 | } | 2167 | } |
2139 | credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); | 2168 | credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); |
2140 | 2169 | ||
2141 | err = ext4_ext_journal_restart(handle, credits); | 2170 | err = ext4_ext_truncate_extend_restart(handle, inode, credits); |
2142 | if (err) | 2171 | if (err) |
2143 | goto out; | 2172 | goto out; |
2144 | 2173 | ||
@@ -2327,7 +2356,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) | |||
2327 | if (err == 0) { | 2356 | if (err == 0) { |
2328 | ext_inode_hdr(inode)->eh_depth = 0; | 2357 | ext_inode_hdr(inode)->eh_depth = 0; |
2329 | ext_inode_hdr(inode)->eh_max = | 2358 | ext_inode_hdr(inode)->eh_max = |
2330 | cpu_to_le16(ext4_ext_space_root(inode)); | 2359 | cpu_to_le16(ext4_ext_space_root(inode, 0)); |
2331 | err = ext4_ext_dirty(handle, inode, path); | 2360 | err = ext4_ext_dirty(handle, inode, path); |
2332 | } | 2361 | } |
2333 | } | 2362 | } |
@@ -2743,6 +2772,7 @@ insert: | |||
2743 | } else if (err) | 2772 | } else if (err) |
2744 | goto fix_extent_len; | 2773 | goto fix_extent_len; |
2745 | out: | 2774 | out: |
2775 | ext4_ext_show_leaf(inode, path); | ||
2746 | return err ? err : allocated; | 2776 | return err ? err : allocated; |
2747 | 2777 | ||
2748 | fix_extent_len: | 2778 | fix_extent_len: |
@@ -2786,7 +2816,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, | |||
2786 | struct ext4_allocation_request ar; | 2816 | struct ext4_allocation_request ar; |
2787 | 2817 | ||
2788 | __clear_bit(BH_New, &bh_result->b_state); | 2818 | __clear_bit(BH_New, &bh_result->b_state); |
2789 | ext_debug("blocks %u/%u requested for inode %u\n", | 2819 | ext_debug("blocks %u/%u requested for inode %lu\n", |
2790 | iblock, max_blocks, inode->i_ino); | 2820 | iblock, max_blocks, inode->i_ino); |
2791 | 2821 | ||
2792 | /* check in cache */ | 2822 | /* check in cache */ |
@@ -2849,7 +2879,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, | |||
2849 | newblock = iblock - ee_block + ee_start; | 2879 | newblock = iblock - ee_block + ee_start; |
2850 | /* number of remaining blocks in the extent */ | 2880 | /* number of remaining blocks in the extent */ |
2851 | allocated = ee_len - (iblock - ee_block); | 2881 | allocated = ee_len - (iblock - ee_block); |
2852 | ext_debug("%u fit into %lu:%d -> %llu\n", iblock, | 2882 | ext_debug("%u fit into %u:%d -> %llu\n", iblock, |
2853 | ee_block, ee_len, newblock); | 2883 | ee_block, ee_len, newblock); |
2854 | 2884 | ||
2855 | /* Do not put uninitialized extent in the cache */ | 2885 | /* Do not put uninitialized extent in the cache */ |
@@ -2950,7 +2980,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, | |||
2950 | newblock = ext4_mb_new_blocks(handle, &ar, &err); | 2980 | newblock = ext4_mb_new_blocks(handle, &ar, &err); |
2951 | if (!newblock) | 2981 | if (!newblock) |
2952 | goto out2; | 2982 | goto out2; |
2953 | ext_debug("allocate new block: goal %llu, found %llu/%lu\n", | 2983 | ext_debug("allocate new block: goal %llu, found %llu/%u\n", |
2954 | ar.goal, newblock, allocated); | 2984 | ar.goal, newblock, allocated); |
2955 | 2985 | ||
2956 | /* try to insert new extent into found leaf and return */ | 2986 | /* try to insert new extent into found leaf and return */ |
diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 3f1873fef1c6..5ca3eca70a1e 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c | |||
@@ -58,10 +58,7 @@ static ssize_t | |||
58 | ext4_file_write(struct kiocb *iocb, const struct iovec *iov, | 58 | ext4_file_write(struct kiocb *iocb, const struct iovec *iov, |
59 | unsigned long nr_segs, loff_t pos) | 59 | unsigned long nr_segs, loff_t pos) |
60 | { | 60 | { |
61 | struct file *file = iocb->ki_filp; | 61 | struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; |
62 | struct inode *inode = file->f_path.dentry->d_inode; | ||
63 | ssize_t ret; | ||
64 | int err; | ||
65 | 62 | ||
66 | /* | 63 | /* |
67 | * If we have encountered a bitmap-format file, the size limit | 64 | * If we have encountered a bitmap-format file, the size limit |
@@ -81,53 +78,7 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov, | |||
81 | } | 78 | } |
82 | } | 79 | } |
83 | 80 | ||
84 | ret = generic_file_aio_write(iocb, iov, nr_segs, pos); | 81 | return generic_file_aio_write(iocb, iov, nr_segs, pos); |
85 | /* | ||
86 | * Skip flushing if there was an error, or if nothing was written. | ||
87 | */ | ||
88 | if (ret <= 0) | ||
89 | return ret; | ||
90 | |||
91 | /* | ||
92 | * If the inode is IS_SYNC, or is O_SYNC and we are doing data | ||
93 | * journalling then we need to make sure that we force the transaction | ||
94 | * to disk to keep all metadata uptodate synchronously. | ||
95 | */ | ||
96 | if (file->f_flags & O_SYNC) { | ||
97 | /* | ||
98 | * If we are non-data-journaled, then the dirty data has | ||
99 | * already been flushed to backing store by generic_osync_inode, | ||
100 | * and the inode has been flushed too if there have been any | ||
101 | * modifications other than mere timestamp updates. | ||
102 | * | ||
103 | * Open question --- do we care about flushing timestamps too | ||
104 | * if the inode is IS_SYNC? | ||
105 | */ | ||
106 | if (!ext4_should_journal_data(inode)) | ||
107 | return ret; | ||
108 | |||
109 | goto force_commit; | ||
110 | } | ||
111 | |||
112 | /* | ||
113 | * So we know that there has been no forced data flush. If the inode | ||
114 | * is marked IS_SYNC, we need to force one ourselves. | ||
115 | */ | ||
116 | if (!IS_SYNC(inode)) | ||
117 | return ret; | ||
118 | |||
119 | /* | ||
120 | * Open question #2 --- should we force data to disk here too? If we | ||
121 | * don't, the only impact is that data=writeback filesystems won't | ||
122 | * flush data to disk automatically on IS_SYNC, only metadata (but | ||
123 | * historically, that is what ext2 has done.) | ||
124 | */ | ||
125 | |||
126 | force_commit: | ||
127 | err = ext4_force_commit(inode->i_sb); | ||
128 | if (err) | ||
129 | return err; | ||
130 | return ret; | ||
131 | } | 82 | } |
132 | 83 | ||
133 | static struct vm_operations_struct ext4_file_vm_ops = { | 84 | static struct vm_operations_struct ext4_file_vm_ops = { |
@@ -207,7 +158,7 @@ const struct inode_operations ext4_file_inode_operations = { | |||
207 | .listxattr = ext4_listxattr, | 158 | .listxattr = ext4_listxattr, |
208 | .removexattr = generic_removexattr, | 159 | .removexattr = generic_removexattr, |
209 | #endif | 160 | #endif |
210 | .permission = ext4_permission, | 161 | .check_acl = ext4_check_acl, |
211 | .fallocate = ext4_fallocate, | 162 | .fallocate = ext4_fallocate, |
212 | .fiemap = ext4_fiemap, | 163 | .fiemap = ext4_fiemap, |
213 | }; | 164 | }; |
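The big deletion earlier in this file hand-rolled the O_SYNC/IS_SYNC flush after a successful write; by this point in the release cycle the generic write path issues that sync itself, which is presumably why ext4_file_write() can shrink to the bitmap-file size check plus generic_file_aio_write(). A compressed stub model, not kernel code (-27 stands in for -EFBIG):

#include <stdio.h>

static long long generic_write_stub(long long len)
{
    /* the generic path also syncs on O_SYNC/IS_SYNC after success */
    return len;
}

static long long file_write_model(long long pos, long long len,
                                  long long bitmap_maxbytes, int extent_based)
{
    if (!extent_based && pos + len > bitmap_maxbytes)
        return -27;                      /* bitmap-format size limit */
    return generic_write_stub(len);
}

int main(void)
{
    printf("%lld\n", file_write_model(0, 4096, 1LL << 32, 0));
    return 0;
}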
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 83cf6415f599..07475740b512 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c | |||
@@ -50,7 +50,7 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) | |||
50 | { | 50 | { |
51 | struct inode *inode = dentry->d_inode; | 51 | struct inode *inode = dentry->d_inode; |
52 | journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; | 52 | journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; |
53 | int ret = 0; | 53 | int err, ret = 0; |
54 | 54 | ||
55 | J_ASSERT(ext4_journal_current_handle() == NULL); | 55 | J_ASSERT(ext4_journal_current_handle() == NULL); |
56 | 56 | ||
@@ -79,6 +79,9 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) | |||
79 | goto out; | 79 | goto out; |
80 | } | 80 | } |
81 | 81 | ||
82 | if (!journal) | ||
83 | ret = sync_mapping_buffers(inode->i_mapping); | ||
84 | |||
82 | if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) | 85 | if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) |
83 | goto out; | 86 | goto out; |
84 | 87 | ||
@@ -91,10 +94,12 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) | |||
91 | .sync_mode = WB_SYNC_ALL, | 94 | .sync_mode = WB_SYNC_ALL, |
92 | .nr_to_write = 0, /* sys_fsync did this */ | 95 | .nr_to_write = 0, /* sys_fsync did this */ |
93 | }; | 96 | }; |
94 | ret = sync_inode(inode, &wbc); | 97 | err = sync_inode(inode, &wbc); |
95 | if (journal && (journal->j_flags & JBD2_BARRIER)) | 98 | if (ret == 0) |
96 | blkdev_issue_flush(inode->i_sb->s_bdev, NULL); | 99 | ret = err; |
97 | } | 100 | } |
98 | out: | 101 | out: |
102 | if (journal && (journal->j_flags & JBD2_BARRIER)) | ||
103 | blkdev_issue_flush(inode->i_sb->s_bdev, NULL); | ||
99 | return ret; | 104 | return ret; |
100 | } | 105 | } |
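Two behavioural points in this fsync hunk are easy to miss: sync_mapping_buffers() now flushes the inode's private buffer list when there is no journal (pairing with the mark_buffer_dirty_inode() change in ext4_jbd2.c above), and the barrier flush moved below the out: label so it is issued on every exit path. A stand-alone model with stub helpers, not kernel code:

#include <stdio.h>

static int sync_mapping_buffers(void) { return 0; }   /* stub */
static int sync_inode(void)           { return 0; }   /* stub */
static void blkdev_issue_flush(void)  { puts("barrier flush"); }

/* Simplified control flow of the patched ext4_sync_file() */
static int fsync_model(int have_journal, int datasync, int dirty_datasync)
{
    int err, ret = 0;

    if (!have_journal)
        ret = sync_mapping_buffers();   /* flush private buffer list */

    if (datasync && !dirty_datasync)
        goto out;

    err = sync_inode();
    if (ret == 0)
        ret = err;                      /* keep the earlier error */
out:
    blkdev_issue_flush();               /* now on every exit path */
    return ret;
}

int main(void)
{
    return fsync_model(0, 1, 0);
}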
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 29e6dc7299b8..f3624ead4f6c 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c | |||
@@ -1189,7 +1189,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb) | |||
1189 | 1189 | ||
1190 | x = ext4_count_free(bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8); | 1190 | x = ext4_count_free(bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8); |
1191 | printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n", | 1191 | printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n", |
1192 | i, ext4_free_inodes_count(sb, gdp), x); | 1192 | (unsigned long) i, ext4_free_inodes_count(sb, gdp), x); |
1193 | bitmap_count += x; | 1193 | bitmap_count += x; |
1194 | } | 1194 | } |
1195 | brelse(bitmap_bh); | 1195 | brelse(bitmap_bh); |
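ext4_group_t is typedef'd to unsigned int (see the ext4.h hunk above), so handing it straight to a %lu conversion is a format/argument mismatch on 64-bit builds; the explicit cast fixes it. The same pattern in miniature:

#include <stdio.h>

typedef unsigned int ext4_group_t;

int main(void)
{
    ext4_group_t group = 42;

    /* printf("group %lu\n", group); would warn: %lu wants unsigned long */
    printf("group %lu\n", (unsigned long) group);
    return 0;
}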
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index f9c642b22efa..4abd683b963d 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c | |||
@@ -192,11 +192,24 @@ static int try_to_extend_transaction(handle_t *handle, struct inode *inode) | |||
192 | * so before we call here everything must be consistently dirtied against | 192 | * so before we call here everything must be consistently dirtied against |
193 | * this transaction. | 193 | * this transaction. |
194 | */ | 194 | */ |
195 | static int ext4_journal_test_restart(handle_t *handle, struct inode *inode) | 195 | int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode, |
196 | int nblocks) | ||
196 | { | 197 | { |
198 | int ret; | ||
199 | |||
200 | /* | ||
201 | * Drop i_data_sem to avoid deadlock with ext4_get_blocks. At this | ||
202 | * moment, get_block can be called only for blocks inside i_size since | ||
203 | * the page cache has already been dropped and writes are blocked by | ||
204 | * i_mutex. So we can safely drop the i_data_sem here. | ||
205 | */ | ||
197 | BUG_ON(EXT4_JOURNAL(inode) == NULL); | 206 | BUG_ON(EXT4_JOURNAL(inode) == NULL); |
198 | jbd_debug(2, "restarting handle %p\n", handle); | 207 | jbd_debug(2, "restarting handle %p\n", handle); |
199 | return ext4_journal_restart(handle, blocks_for_truncate(inode)); | 208 | up_write(&EXT4_I(inode)->i_data_sem); |
209 | ret = ext4_journal_restart(handle, blocks_for_truncate(inode)); | ||
210 | down_write(&EXT4_I(inode)->i_data_sem); | ||
211 | |||
212 | return ret; | ||
200 | } | 213 | } |
201 | 214 | ||
202 | /* | 215 | /* |
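The renamed helper also changes behaviour: i_data_sem is dropped around the journal restart (which may block) and retaken afterwards, relying on the comment's observation that the dropped page cache plus i_mutex keep get_block out of the truncated range. The extents.c caller above compensates by invalidating the extent cache once the lock is back. A user-space model of the combined drop-and-retake pattern:

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t i_data_sem = PTHREAD_RWLOCK_INITIALIZER;

static int journal_restart(void)    { return 0; }   /* may block */
static void invalidate_cache(void)  { puts("extent cache invalidated"); }

static int restart_trans(void)
{
    int ret;

    pthread_rwlock_unlock(&i_data_sem);  /* drop before blocking */
    ret = journal_restart();
    pthread_rwlock_wrlock(&i_data_sem);  /* retake */
    invalidate_cache();                  /* others may have cached state */
    return ret;
}

int main(void)
{
    pthread_rwlock_wrlock(&i_data_sem);
    printf("restart: %d\n", restart_trans());
    pthread_rwlock_unlock(&i_data_sem);
    return 0;
}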
@@ -341,9 +354,7 @@ static int ext4_block_to_path(struct inode *inode, | |||
341 | int n = 0; | 354 | int n = 0; |
342 | int final = 0; | 355 | int final = 0; |
343 | 356 | ||
344 | if (i_block < 0) { | 357 | if (i_block < direct_blocks) { |
345 | ext4_warning(inode->i_sb, "ext4_block_to_path", "block < 0"); | ||
346 | } else if (i_block < direct_blocks) { | ||
347 | offsets[n++] = i_block; | 358 | offsets[n++] = i_block; |
348 | final = direct_blocks; | 359 | final = direct_blocks; |
349 | } else if ((i_block -= direct_blocks) < indirect_blocks) { | 360 | } else if ((i_block -= direct_blocks) < indirect_blocks) { |
@@ -551,15 +562,21 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) | |||
551 | * | 562 | * |
552 | * Normally this function finds the preferred place for block allocation | 563 | * Normally this function finds the preferred place for block allocation
553 | * and returns it. | 564 | * and returns it.
565 | * Because this is only used for non-extent files, we limit the block nr | ||
566 | * to 32 bits. | ||
554 | */ | 567 | */ |
555 | static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, | 568 | static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, |
556 | Indirect *partial) | 569 | Indirect *partial) |
557 | { | 570 | { |
571 | ext4_fsblk_t goal; | ||
572 | |||
558 | /* | 573 | /* |
559 | * XXX need to get goal block from mballoc's data structures | 574 | * XXX need to get goal block from mballoc's data structures |
560 | */ | 575 | */ |
561 | 576 | ||
562 | return ext4_find_near(inode, partial); | 577 | goal = ext4_find_near(inode, partial); |
578 | goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; | ||
579 | return goal; | ||
563 | } | 580 | } |
564 | 581 | ||
565 | /** | 582 | /** |
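Because ext4_find_goal() is only reached for indirect-mapped files, whose physical block numbers fit in 32 bits, the 64-bit heuristic goal from ext4_find_near() is masked with the new EXT4_MAX_BLOCK_FILE_PHYS before use. The clamp in isolation:

#include <stdio.h>
#include <stdint.h>

#define EXT4_MAX_BLOCK_FILE_PHYS 0xFFFFFFFFULL

int main(void)
{
    uint64_t goal = 0x1234567890ULL;      /* above the 32-bit range */

    goal &= EXT4_MAX_BLOCK_FILE_PHYS;     /* clamp for non-extent files */
    printf("goal: 0x%llx\n", (unsigned long long) goal);
    return 0;
}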
@@ -640,6 +657,8 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, | |||
640 | if (*err) | 657 | if (*err) |
641 | goto failed_out; | 658 | goto failed_out; |
642 | 659 | ||
660 | BUG_ON(current_block + count > EXT4_MAX_BLOCK_FILE_PHYS); | ||
661 | |||
643 | target -= count; | 662 | target -= count; |
644 | /* allocate blocks for indirect blocks */ | 663 | /* allocate blocks for indirect blocks */ |
645 | while (index < indirect_blks && count) { | 664 | while (index < indirect_blks && count) { |
@@ -674,6 +693,7 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, | |||
674 | ar.flags = EXT4_MB_HINT_DATA; | 693 | ar.flags = EXT4_MB_HINT_DATA; |
675 | 694 | ||
676 | current_block = ext4_mb_new_blocks(handle, &ar, err); | 695 | current_block = ext4_mb_new_blocks(handle, &ar, err); |
696 | BUG_ON(current_block + ar.len > EXT4_MAX_BLOCK_FILE_PHYS); | ||
677 | 697 | ||
678 | if (*err && (target == blks)) { | 698 | if (*err && (target == blks)) { |
679 | /* | 699 | /* |
@@ -762,8 +782,9 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode, | |||
762 | BUFFER_TRACE(bh, "call get_create_access"); | 782 | BUFFER_TRACE(bh, "call get_create_access"); |
763 | err = ext4_journal_get_create_access(handle, bh); | 783 | err = ext4_journal_get_create_access(handle, bh); |
764 | if (err) { | 784 | if (err) { |
785 | /* Don't brelse(bh) here; it's done in | ||
786 | * ext4_journal_forget() below */ | ||
765 | unlock_buffer(bh); | 787 | unlock_buffer(bh); |
766 | brelse(bh); | ||
767 | goto failed; | 788 | goto failed; |
768 | } | 789 | } |
769 | 790 | ||
@@ -1109,16 +1130,15 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used) | |||
1109 | ext4_discard_preallocations(inode); | 1130 | ext4_discard_preallocations(inode); |
1110 | } | 1131 | } |
1111 | 1132 | ||
1112 | static int check_block_validity(struct inode *inode, sector_t logical, | 1133 | static int check_block_validity(struct inode *inode, const char *msg, |
1113 | sector_t phys, int len) | 1134 | sector_t logical, sector_t phys, int len) |
1114 | { | 1135 | { |
1115 | if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) { | 1136 | if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) { |
1116 | ext4_error(inode->i_sb, "check_block_validity", | 1137 | ext4_error(inode->i_sb, msg, |
1117 | "inode #%lu logical block %llu mapped to %llu " | 1138 | "inode #%lu logical block %llu mapped to %llu " |
1118 | "(size %d)", inode->i_ino, | 1139 | "(size %d)", inode->i_ino, |
1119 | (unsigned long long) logical, | 1140 | (unsigned long long) logical, |
1120 | (unsigned long long) phys, len); | 1141 | (unsigned long long) phys, len); |
1121 | WARN_ON(1); | ||
1122 | return -EIO; | 1142 | return -EIO; |
1123 | } | 1143 | } |
1124 | return 0; | 1144 | return 0; |
@@ -1170,8 +1190,8 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, | |||
1170 | up_read((&EXT4_I(inode)->i_data_sem)); | 1190 | up_read((&EXT4_I(inode)->i_data_sem)); |
1171 | 1191 | ||
1172 | if (retval > 0 && buffer_mapped(bh)) { | 1192 | if (retval > 0 && buffer_mapped(bh)) { |
1173 | int ret = check_block_validity(inode, block, | 1193 | int ret = check_block_validity(inode, "file system corruption", |
1174 | bh->b_blocknr, retval); | 1194 | block, bh->b_blocknr, retval); |
1175 | if (ret != 0) | 1195 | if (ret != 0) |
1176 | return ret; | 1196 | return ret; |
1177 | } | 1197 | } |
@@ -1235,8 +1255,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, | |||
1235 | * i_data's format changing. Force the migrate | 1255 | * i_data's format changing. Force the migrate |
1236 | * to fail by clearing migrate flags | 1256 | * to fail by clearing migrate flags |
1237 | */ | 1257 | */ |
1238 | EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags & | 1258 | EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE; |
1239 | ~EXT4_EXT_MIGRATE; | ||
1240 | } | 1259 | } |
1241 | } | 1260 | } |
1242 | 1261 | ||
@@ -1252,8 +1271,9 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, | |||
1252 | 1271 | ||
1253 | up_write((&EXT4_I(inode)->i_data_sem)); | 1272 | up_write((&EXT4_I(inode)->i_data_sem)); |
1254 | if (retval > 0 && buffer_mapped(bh)) { | 1273 | if (retval > 0 && buffer_mapped(bh)) { |
1255 | int ret = check_block_validity(inode, block, | 1274 | int ret = check_block_validity(inode, "file system " |
1256 | bh->b_blocknr, retval); | 1275 | "corruption after allocation", |
1276 | block, bh->b_blocknr, retval); | ||
1257 | if (ret != 0) | 1277 | if (ret != 0) |
1258 | return ret; | 1278 | return ret; |
1259 | } | 1279 | } |
@@ -1863,18 +1883,6 @@ static void ext4_da_page_release_reservation(struct page *page, | |||
1863 | * Delayed allocation stuff | 1883 | * Delayed allocation stuff |
1864 | */ | 1884 | */ |
1865 | 1885 | ||
1866 | struct mpage_da_data { | ||
1867 | struct inode *inode; | ||
1868 | sector_t b_blocknr; /* start block number of extent */ | ||
1869 | size_t b_size; /* size of extent */ | ||
1870 | unsigned long b_state; /* state of the extent */ | ||
1871 | unsigned long first_page, next_page; /* extent of pages */ | ||
1872 | struct writeback_control *wbc; | ||
1873 | int io_done; | ||
1874 | int pages_written; | ||
1875 | int retval; | ||
1876 | }; | ||
1877 | |||
1878 | /* | 1886 | /* |
1879 | * mpage_da_submit_io - walks through the extent of pages and tries to write | 1887 | * mpage_da_submit_io - walks through the extent of pages and tries to write
1880 | * them with the writepage() callback | 1888 | * them with the writepage() callback
@@ -2737,6 +2745,7 @@ static int ext4_da_writepages(struct address_space *mapping, | |||
2737 | long pages_skipped; | 2745 | long pages_skipped; |
2738 | int range_cyclic, cycled = 1, io_done = 0; | 2746 | int range_cyclic, cycled = 1, io_done = 0; |
2739 | int needed_blocks, ret = 0, nr_to_writebump = 0; | 2747 | int needed_blocks, ret = 0, nr_to_writebump = 0; |
2748 | loff_t range_start = wbc->range_start; | ||
2740 | struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); | 2749 | struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); |
2741 | 2750 | ||
2742 | trace_ext4_da_writepages(inode, wbc); | 2751 | trace_ext4_da_writepages(inode, wbc); |
@@ -2850,6 +2859,7 @@ retry: | |||
2850 | mpd.io_done = 1; | 2859 | mpd.io_done = 1; |
2851 | ret = MPAGE_DA_EXTENT_TAIL; | 2860 | ret = MPAGE_DA_EXTENT_TAIL; |
2852 | } | 2861 | } |
2862 | trace_ext4_da_write_pages(inode, &mpd); | ||
2853 | wbc->nr_to_write -= mpd.pages_written; | 2863 | wbc->nr_to_write -= mpd.pages_written; |
2854 | 2864 | ||
2855 | ext4_journal_stop(handle); | 2865 | ext4_journal_stop(handle); |
@@ -2905,6 +2915,7 @@ out_writepages: | |||
2905 | if (!no_nrwrite_index_update) | 2915 | if (!no_nrwrite_index_update) |
2906 | wbc->no_nrwrite_index_update = 0; | 2916 | wbc->no_nrwrite_index_update = 0; |
2907 | wbc->nr_to_write -= nr_to_writebump; | 2917 | wbc->nr_to_write -= nr_to_writebump; |
2918 | wbc->range_start = range_start; | ||
2908 | trace_ext4_da_writepages_result(inode, wbc, ret, pages_written); | 2919 | trace_ext4_da_writepages_result(inode, wbc, ret, pages_written); |
2909 | return ret; | 2920 | return ret; |
2910 | } | 2921 | } |
@@ -3117,6 +3128,8 @@ out: | |||
3117 | */ | 3128 | */ |
3118 | int ext4_alloc_da_blocks(struct inode *inode) | 3129 | int ext4_alloc_da_blocks(struct inode *inode) |
3119 | { | 3130 | { |
3131 | trace_ext4_alloc_da_blocks(inode); | ||
3132 | |||
3120 | if (!EXT4_I(inode)->i_reserved_data_blocks && | 3133 | if (!EXT4_I(inode)->i_reserved_data_blocks && |
3121 | !EXT4_I(inode)->i_reserved_meta_blocks) | 3134 | !EXT4_I(inode)->i_reserved_meta_blocks) |
3122 | return 0; | 3135 | return 0; |
@@ -3659,7 +3672,8 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode, | |||
3659 | ext4_handle_dirty_metadata(handle, inode, bh); | 3672 | ext4_handle_dirty_metadata(handle, inode, bh); |
3660 | } | 3673 | } |
3661 | ext4_mark_inode_dirty(handle, inode); | 3674 | ext4_mark_inode_dirty(handle, inode); |
3662 | ext4_journal_test_restart(handle, inode); | 3675 | ext4_truncate_restart_trans(handle, inode, |
3676 | blocks_for_truncate(inode)); | ||
3663 | if (bh) { | 3677 | if (bh) { |
3664 | BUFFER_TRACE(bh, "retaking write access"); | 3678 | BUFFER_TRACE(bh, "retaking write access"); |
3665 | ext4_journal_get_write_access(handle, bh); | 3679 | ext4_journal_get_write_access(handle, bh); |
@@ -3870,7 +3884,8 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, | |||
3870 | return; | 3884 | return; |
3871 | if (try_to_extend_transaction(handle, inode)) { | 3885 | if (try_to_extend_transaction(handle, inode)) { |
3872 | ext4_mark_inode_dirty(handle, inode); | 3886 | ext4_mark_inode_dirty(handle, inode); |
3873 | ext4_journal_test_restart(handle, inode); | 3887 | ext4_truncate_restart_trans(handle, inode, |
3888 | blocks_for_truncate(inode)); | ||
3874 | } | 3889 | } |
3875 | 3890 | ||
3876 | ext4_free_blocks(handle, inode, nr, 1, 1); | 3891 | ext4_free_blocks(handle, inode, nr, 1, 1); |
@@ -3958,8 +3973,7 @@ void ext4_truncate(struct inode *inode) | |||
3958 | if (!ext4_can_truncate(inode)) | 3973 | if (!ext4_can_truncate(inode)) |
3959 | return; | 3974 | return; |
3960 | 3975 | ||
3961 | if (ei->i_disksize && inode->i_size == 0 && | 3976 | if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) |
3962 | !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) | ||
3963 | ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE; | 3977 | ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE; |
3964 | 3978 | ||
3965 | if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { | 3979 | if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { |
@@ -4533,7 +4547,8 @@ static int ext4_inode_blocks_set(handle_t *handle, | |||
4533 | */ | 4547 | */ |
4534 | static int ext4_do_update_inode(handle_t *handle, | 4548 | static int ext4_do_update_inode(handle_t *handle, |
4535 | struct inode *inode, | 4549 | struct inode *inode, |
4536 | struct ext4_iloc *iloc) | 4550 | struct ext4_iloc *iloc, |
4551 | int do_sync) | ||
4537 | { | 4552 | { |
4538 | struct ext4_inode *raw_inode = ext4_raw_inode(iloc); | 4553 | struct ext4_inode *raw_inode = ext4_raw_inode(iloc); |
4539 | struct ext4_inode_info *ei = EXT4_I(inode); | 4554 | struct ext4_inode_info *ei = EXT4_I(inode); |
@@ -4581,8 +4596,7 @@ static int ext4_do_update_inode(handle_t *handle, | |||
4581 | if (ext4_inode_blocks_set(handle, raw_inode, ei)) | 4596 | if (ext4_inode_blocks_set(handle, raw_inode, ei)) |
4582 | goto out_brelse; | 4597 | goto out_brelse; |
4583 | raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); | 4598 | raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); |
4584 | /* clear the migrate flag in the raw_inode */ | 4599 | raw_inode->i_flags = cpu_to_le32(ei->i_flags); |
4585 | raw_inode->i_flags = cpu_to_le32(ei->i_flags & ~EXT4_EXT_MIGRATE); | ||
4586 | if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != | 4600 | if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != |
4587 | cpu_to_le32(EXT4_OS_HURD)) | 4601 | cpu_to_le32(EXT4_OS_HURD)) |
4588 | raw_inode->i_file_acl_high = | 4602 | raw_inode->i_file_acl_high = |
@@ -4635,10 +4649,22 @@ static int ext4_do_update_inode(handle_t *handle, | |||
4635 | raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); | 4649 | raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); |
4636 | } | 4650 | } |
4637 | 4651 | ||
4638 | BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); | 4652 | /* |
4639 | rc = ext4_handle_dirty_metadata(handle, inode, bh); | 4653 | * If we're not using a journal and we were called from |
4640 | if (!err) | 4654 | * ext4_write_inode() to sync the inode (making do_sync true), |
4641 | err = rc; | 4655 | * we can just use sync_dirty_buffer() directly to do our dirty |
4656 | * work. Testing s_journal here is a bit redundant but it's | ||
4657 | * worth it to avoid potential future trouble. | ||
4658 | */ | ||
4659 | if (EXT4_SB(inode->i_sb)->s_journal == NULL && do_sync) { | ||
4660 | BUFFER_TRACE(bh, "call sync_dirty_buffer"); | ||
4661 | sync_dirty_buffer(bh); | ||
4662 | } else { | ||
4663 | BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); | ||
4664 | rc = ext4_handle_dirty_metadata(handle, inode, bh); | ||
4665 | if (!err) | ||
4666 | err = rc; | ||
4667 | } | ||
4642 | ei->i_state &= ~EXT4_STATE_NEW; | 4668 | ei->i_state &= ~EXT4_STATE_NEW; |
4643 | 4669 | ||
4644 | out_brelse: | 4670 | out_brelse: |
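The do_sync parameter threaded through ext4_do_update_inode() exists for the no-journal path of ext4_write_inode(), shown in the next hunk. A condensed sketch of how synchronous inode writeout divides after this patch (identifiers from the diff; the wrapper function and compressed control flow are illustrative only):

    /* sketch: synchronous inode writeout with and without a journal */
    static int write_inode_sketch(struct inode *inode, int wait)
    {
            struct ext4_iloc iloc;
            int err;

            if (EXT4_SB(inode->i_sb)->s_journal)
                    /* journalled: committing the journal makes the
                     * inode durable */
                    return wait ? ext4_force_commit(inode->i_sb) : 0;

            /* no journal: write the raw inode buffer directly; with
             * do_sync == wait, ext4_do_update_inode() finishes in
             * sync_dirty_buffer(bh) instead of the journal machinery */
            err = ext4_get_inode_loc(inode, &iloc);
            if (err)
                    return err;
            return ext4_do_update_inode(EXT4_NOJOURNAL_HANDLE, inode,
                                        &iloc, wait);
    }

ext4_mark_iloc_dirty() keeps passing do_sync == 0 (see the later hunk), so ordinary inode updates still go through ext4_handle_dirty_metadata().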
@@ -4684,19 +4710,32 @@ out_brelse: | |||
4684 | */ | 4710 | */ |
4685 | int ext4_write_inode(struct inode *inode, int wait) | 4711 | int ext4_write_inode(struct inode *inode, int wait) |
4686 | { | 4712 | { |
4713 | int err; | ||
4714 | |||
4687 | if (current->flags & PF_MEMALLOC) | 4715 | if (current->flags & PF_MEMALLOC) |
4688 | return 0; | 4716 | return 0; |
4689 | 4717 | ||
4690 | if (ext4_journal_current_handle()) { | 4718 | if (EXT4_SB(inode->i_sb)->s_journal) { |
4691 | jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n"); | 4719 | if (ext4_journal_current_handle()) { |
4692 | dump_stack(); | 4720 | jbd_debug(1, "called recursively, non-PF_MEMALLOC!\n"); |
4693 | return -EIO; | 4721 | dump_stack(); |
4694 | } | 4722 | return -EIO; |
4723 | } | ||
4695 | 4724 | ||
4696 | if (!wait) | 4725 | if (!wait) |
4697 | return 0; | 4726 | return 0; |
4727 | |||
4728 | err = ext4_force_commit(inode->i_sb); | ||
4729 | } else { | ||
4730 | struct ext4_iloc iloc; | ||
4698 | 4731 | ||
4699 | return ext4_force_commit(inode->i_sb); | 4732 | err = ext4_get_inode_loc(inode, &iloc); |
4733 | if (err) | ||
4734 | return err; | ||
4735 | err = ext4_do_update_inode(EXT4_NOJOURNAL_HANDLE, | ||
4736 | inode, &iloc, wait); | ||
4737 | } | ||
4738 | return err; | ||
4700 | } | 4739 | } |
4701 | 4740 | ||
4702 | /* | 4741 | /* |
@@ -4990,7 +5029,7 @@ int ext4_mark_iloc_dirty(handle_t *handle, | |||
4990 | get_bh(iloc->bh); | 5029 | get_bh(iloc->bh); |
4991 | 5030 | ||
4992 | /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */ | 5031 | /* ext4_do_update_inode() does jbd2_journal_dirty_metadata */ |
4993 | err = ext4_do_update_inode(handle, inode, iloc); | 5032 | err = ext4_do_update_inode(handle, inode, iloc, 0); |
4994 | put_bh(iloc->bh); | 5033 | put_bh(iloc->bh); |
4995 | return err; | 5034 | return err; |
4996 | } | 5035 | } |
@@ -5281,12 +5320,21 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) | |||
5281 | else | 5320 | else |
5282 | len = PAGE_CACHE_SIZE; | 5321 | len = PAGE_CACHE_SIZE; |
5283 | 5322 | ||
5323 | lock_page(page); | ||
5324 | /* | ||
5325 | * return if we have all the buffers mapped. This avoids | ||
5326 | * the need to call write_begin/write_end, which do a | ||
5327 | * journal_start/journal_stop that can block and take a | ||
5328 | * long time | ||
5329 | */ | ||
5284 | if (page_has_buffers(page)) { | 5330 | if (page_has_buffers(page)) { |
5285 | /* return if we have all the buffers mapped */ | ||
5286 | if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, | 5331 | if (!walk_page_buffers(NULL, page_buffers(page), 0, len, NULL, |
5287 | ext4_bh_unmapped)) | 5332 | ext4_bh_unmapped)) { |
5333 | unlock_page(page); | ||
5288 | goto out_unlock; | 5334 | goto out_unlock; |
5335 | } | ||
5289 | } | 5336 | } |
5337 | unlock_page(page); | ||
5290 | /* | 5338 | /* |
5291 | * OK, we need to fill the hole... Do write_begin write_end | 5339 | * OK, we need to fill the hole... Do write_begin write_end |
5292 | * to do block allocation/reservation. We are not holding | 5340 | * to do block allocation/reservation. We are not holding |
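The comment added in this hunk states the motivation; condensed, the fast path now reads roughly:

    lock_page(page);
    if (page_has_buffers(page) &&
        !walk_page_buffers(NULL, page_buffers(page), 0, len, NULL,
                           ext4_bh_unmapped)) {
            /* every buffer is already mapped: no block allocation is
             * needed, so skip the write_begin/write_end round trip and
             * its journal_start/journal_stop */
            unlock_page(page);
            goto out_unlock;
    }
    unlock_page(page);
    /* fall through: fill the hole via write_begin/write_end */

(The && collapses the two nested tests from the hunk; behaviour is unchanged, since a page with no buffer list also takes the slow path.)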
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 7050a9cd04a4..c1cdf613e725 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c | |||
@@ -243,10 +243,9 @@ setversion_out: | |||
243 | me.donor_start, me.len, &me.moved_len); | 243 | me.donor_start, me.len, &me.moved_len); |
244 | fput(donor_filp); | 244 | fput(donor_filp); |
245 | 245 | ||
246 | if (!err) | 246 | if (copy_to_user((struct move_extent *)arg, &me, sizeof(me))) |
247 | if (copy_to_user((struct move_extent *)arg, | 247 | return -EFAULT; |
248 | &me, sizeof(me))) | 248 | |
249 | return -EFAULT; | ||
250 | return err; | 249 | return err; |
251 | } | 250 | } |
252 | 251 | ||
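This ioctl hunk is a behaviour fix as much as a cleanup: me.moved_len is now copied back to user space even when ext4_move_extents() failed, so the caller learns how much had been moved before the error. The resulting pattern (condensed; the leading arguments of ext4_move_extents() are not visible in this diff and are reconstructed here as an assumption):

    err = ext4_move_extents(filp, donor_filp, me.orig_start,
                            me.donor_start, me.len, &me.moved_len);
    fput(donor_filp);

    /* report partial progress unconditionally; a fault while copying
     * out takes precedence over the original status */
    if (copy_to_user((struct move_extent *)arg, &me, sizeof(me)))
            return -EFAULT;

    return err;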
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index cd258463e2a9..e9c61896d605 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c | |||
@@ -22,6 +22,7 @@ | |||
22 | */ | 22 | */ |
23 | 23 | ||
24 | #include "mballoc.h" | 24 | #include "mballoc.h" |
25 | #include <linux/debugfs.h> | ||
25 | #include <trace/events/ext4.h> | 26 | #include <trace/events/ext4.h> |
26 | 27 | ||
27 | /* | 28 | /* |
@@ -622,13 +623,13 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, | |||
622 | 623 | ||
623 | /* FIXME!! need more doc */ | 624 | /* FIXME!! need more doc */ |
624 | static void ext4_mb_mark_free_simple(struct super_block *sb, | 625 | static void ext4_mb_mark_free_simple(struct super_block *sb, |
625 | void *buddy, unsigned first, int len, | 626 | void *buddy, ext4_grpblk_t first, ext4_grpblk_t len, |
626 | struct ext4_group_info *grp) | 627 | struct ext4_group_info *grp) |
627 | { | 628 | { |
628 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 629 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
629 | unsigned short min; | 630 | ext4_grpblk_t min; |
630 | unsigned short max; | 631 | ext4_grpblk_t max; |
631 | unsigned short chunk; | 632 | ext4_grpblk_t chunk; |
632 | unsigned short border; | 633 | unsigned short border; |
633 | 634 | ||
634 | BUG_ON(len > EXT4_BLOCKS_PER_GROUP(sb)); | 635 | BUG_ON(len > EXT4_BLOCKS_PER_GROUP(sb)); |
@@ -662,10 +663,10 @@ void ext4_mb_generate_buddy(struct super_block *sb, | |||
662 | void *buddy, void *bitmap, ext4_group_t group) | 663 | void *buddy, void *bitmap, ext4_group_t group) |
663 | { | 664 | { |
664 | struct ext4_group_info *grp = ext4_get_group_info(sb, group); | 665 | struct ext4_group_info *grp = ext4_get_group_info(sb, group); |
665 | unsigned short max = EXT4_BLOCKS_PER_GROUP(sb); | 666 | ext4_grpblk_t max = EXT4_BLOCKS_PER_GROUP(sb); |
666 | unsigned short i = 0; | 667 | ext4_grpblk_t i = 0; |
667 | unsigned short first; | 668 | ext4_grpblk_t first; |
668 | unsigned short len; | 669 | ext4_grpblk_t len; |
669 | unsigned free = 0; | 670 | unsigned free = 0; |
670 | unsigned fragments = 0; | 671 | unsigned fragments = 0; |
671 | unsigned long long period = get_cycles(); | 672 | unsigned long long period = get_cycles(); |
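These two hunks widen mballoc's per-group block counters from unsigned short to ext4_grpblk_t. The rationale is not spelled out in the diff, but presumably it is overflow: blocks-per-group is bounded by the bits in a single bitmap block, i.e. 8 * blocksize, which no longer fits in 16 bits once the block size reaches 8 KiB. Illustrative arithmetic only:

    blocks_per_group(blocksize) = 8 * blocksize
    blocks_per_group(4096)      =  32768   (fits in unsigned short)
    blocks_per_group(8192)      =  65536   > USHRT_MAX (65535)
    blocks_per_group(65536)     = 524288   > USHRT_MAX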
@@ -743,7 +744,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore) | |||
743 | char *data; | 744 | char *data; |
744 | char *bitmap; | 745 | char *bitmap; |
745 | 746 | ||
746 | mb_debug("init page %lu\n", page->index); | 747 | mb_debug(1, "init page %lu\n", page->index); |
747 | 748 | ||
748 | inode = page->mapping->host; | 749 | inode = page->mapping->host; |
749 | sb = inode->i_sb; | 750 | sb = inode->i_sb; |
@@ -822,7 +823,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore) | |||
822 | set_bitmap_uptodate(bh[i]); | 823 | set_bitmap_uptodate(bh[i]); |
823 | bh[i]->b_end_io = end_buffer_read_sync; | 824 | bh[i]->b_end_io = end_buffer_read_sync; |
824 | submit_bh(READ, bh[i]); | 825 | submit_bh(READ, bh[i]); |
825 | mb_debug("read bitmap for group %u\n", first_group + i); | 826 | mb_debug(1, "read bitmap for group %u\n", first_group + i); |
826 | } | 827 | } |
827 | 828 | ||
828 | /* wait for I/O completion */ | 829 | /* wait for I/O completion */ |
@@ -862,12 +863,13 @@ static int ext4_mb_init_cache(struct page *page, char *incore) | |||
862 | if ((first_block + i) & 1) { | 863 | if ((first_block + i) & 1) { |
863 | /* this is block of buddy */ | 864 | /* this is block of buddy */ |
864 | BUG_ON(incore == NULL); | 865 | BUG_ON(incore == NULL); |
865 | mb_debug("put buddy for group %u in page %lu/%x\n", | 866 | mb_debug(1, "put buddy for group %u in page %lu/%x\n", |
866 | group, page->index, i * blocksize); | 867 | group, page->index, i * blocksize); |
867 | grinfo = ext4_get_group_info(sb, group); | 868 | grinfo = ext4_get_group_info(sb, group); |
868 | grinfo->bb_fragments = 0; | 869 | grinfo->bb_fragments = 0; |
869 | memset(grinfo->bb_counters, 0, | 870 | memset(grinfo->bb_counters, 0, |
870 | sizeof(unsigned short)*(sb->s_blocksize_bits+2)); | 871 | sizeof(*grinfo->bb_counters) * |
872 | (sb->s_blocksize_bits+2)); | ||
871 | /* | 873 | /* |
872 | * incore got set to the group block bitmap below | 874 | * incore got set to the group block bitmap below |
873 | */ | 875 | */ |
@@ -878,7 +880,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore) | |||
878 | } else { | 880 | } else { |
879 | /* this is block of bitmap */ | 881 | /* this is block of bitmap */ |
880 | BUG_ON(incore != NULL); | 882 | BUG_ON(incore != NULL); |
881 | mb_debug("put bitmap for group %u in page %lu/%x\n", | 883 | mb_debug(1, "put bitmap for group %u in page %lu/%x\n", |
882 | group, page->index, i * blocksize); | 884 | group, page->index, i * blocksize); |
883 | 885 | ||
884 | /* see comments in ext4_mb_put_pa() */ | 886 | /* see comments in ext4_mb_put_pa() */ |
@@ -908,6 +910,100 @@ out: | |||
908 | return err; | 910 | return err; |
909 | } | 911 | } |
910 | 912 | ||
913 | static noinline_for_stack | ||
914 | int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) | ||
915 | { | ||
916 | |||
917 | int ret = 0; | ||
918 | void *bitmap; | ||
919 | int blocks_per_page; | ||
920 | int block, pnum, poff; | ||
921 | int num_grp_locked = 0; | ||
922 | struct ext4_group_info *this_grp; | ||
923 | struct ext4_sb_info *sbi = EXT4_SB(sb); | ||
924 | struct inode *inode = sbi->s_buddy_cache; | ||
925 | struct page *page = NULL, *bitmap_page = NULL; | ||
926 | |||
927 | mb_debug(1, "init group %u\n", group); | ||
928 | blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; | ||
929 | this_grp = ext4_get_group_info(sb, group); | ||
930 | /* | ||
931 | * This ensures that we don't reinit the buddy cache | ||
932 | * page which maps to the group from which we are already | ||
933 | * allocating. If we are looking at the buddy cache we would | ||
934 | * have taken a reference using ext4_mb_load_buddy and that | ||
935 | * would have taken the alloc_sem lock. | ||
936 | */ | ||
937 | num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group); | ||
938 | if (!EXT4_MB_GRP_NEED_INIT(this_grp)) { | ||
939 | /* | ||
940 | * somebody initialized the group | ||
941 | * return without doing anything | ||
942 | */ | ||
943 | ret = 0; | ||
944 | goto err; | ||
945 | } | ||
946 | /* | ||
947 | * the buddy cache inode stores the block bitmap | ||
948 | * and buddy information in consecutive blocks. | ||
949 | * So for each group we need two blocks. | ||
950 | */ | ||
951 | block = group * 2; | ||
952 | pnum = block / blocks_per_page; | ||
953 | poff = block % blocks_per_page; | ||
954 | page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); | ||
955 | if (page) { | ||
956 | BUG_ON(page->mapping != inode->i_mapping); | ||
957 | ret = ext4_mb_init_cache(page, NULL); | ||
958 | if (ret) { | ||
959 | unlock_page(page); | ||
960 | goto err; | ||
961 | } | ||
962 | unlock_page(page); | ||
963 | } | ||
964 | if (page == NULL || !PageUptodate(page)) { | ||
965 | ret = -EIO; | ||
966 | goto err; | ||
967 | } | ||
968 | mark_page_accessed(page); | ||
969 | bitmap_page = page; | ||
970 | bitmap = page_address(page) + (poff * sb->s_blocksize); | ||
971 | |||
972 | /* init buddy cache */ | ||
973 | block++; | ||
974 | pnum = block / blocks_per_page; | ||
975 | poff = block % blocks_per_page; | ||
976 | page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); | ||
977 | if (page == bitmap_page) { | ||
978 | /* | ||
979 | * If both the bitmap and buddy are in | ||
980 | * the same page we don't need to force | ||
981 | * init the buddy | ||
982 | */ | ||
983 | unlock_page(page); | ||
984 | } else if (page) { | ||
985 | BUG_ON(page->mapping != inode->i_mapping); | ||
986 | ret = ext4_mb_init_cache(page, bitmap); | ||
987 | if (ret) { | ||
988 | unlock_page(page); | ||
989 | goto err; | ||
990 | } | ||
991 | unlock_page(page); | ||
992 | } | ||
993 | if (page == NULL || !PageUptodate(page)) { | ||
994 | ret = -EIO; | ||
995 | goto err; | ||
996 | } | ||
997 | mark_page_accessed(page); | ||
998 | err: | ||
999 | ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked); | ||
1000 | if (bitmap_page) | ||
1001 | page_cache_release(bitmap_page); | ||
1002 | if (page) | ||
1003 | page_cache_release(page); | ||
1004 | return ret; | ||
1005 | } | ||
1006 | |||
911 | static noinline_for_stack int | 1007 | static noinline_for_stack int |
912 | ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, | 1008 | ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, |
913 | struct ext4_buddy *e4b) | 1009 | struct ext4_buddy *e4b) |
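The "two blocks per group" layout described in ext4_mb_init_group()'s comments makes the page lookups plain division and modulo. A worked example with assumed geometry (4 KiB pages, 1 KiB blocks, so blocks_per_page == 4):

    int group = 5;
    int blocks_per_page = 4;  /* PAGE_CACHE_SIZE / sb->s_blocksize */
    int block, pnum, poff;

    block = group * 2;               /* 10: the group's bitmap block  */
    pnum  = block / blocks_per_page; /*  2: page index in buddy cache */
    poff  = block % blocks_per_page; /*  2: block offset in that page */
    /* bitmap = page_address(page) + poff * sb->s_blocksize; the buddy
     * is block 11, which here also lands in page 2 -- the
     * "page == bitmap_page" branch above then skips the second init */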
@@ -922,7 +1018,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, | |||
922 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 1018 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
923 | struct inode *inode = sbi->s_buddy_cache; | 1019 | struct inode *inode = sbi->s_buddy_cache; |
924 | 1020 | ||
925 | mb_debug("load group %u\n", group); | 1021 | mb_debug(1, "load group %u\n", group); |
926 | 1022 | ||
927 | blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; | 1023 | blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; |
928 | grp = ext4_get_group_info(sb, group); | 1024 | grp = ext4_get_group_info(sb, group); |
@@ -941,8 +1037,26 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, | |||
941 | * groups mapped by the page is blocked | 1037 | * groups mapped by the page is blocked |
942 | * till we are done with allocation | 1038 | * till we are done with allocation |
943 | */ | 1039 | */ |
1040 | repeat_load_buddy: | ||
944 | down_read(e4b->alloc_semp); | 1041 | down_read(e4b->alloc_semp); |
945 | 1042 | ||
1043 | if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { | ||
1044 | /* we need to check for group need init flag | ||
1045 | * with alloc_semp held so that we can be sure | ||
1046 | * that new blocks didn't get added to the group | ||
1047 | * when we are loading the buddy cache | ||
1048 | */ | ||
1049 | up_read(e4b->alloc_semp); | ||
1050 | /* | ||
1051 | * we need full data about the group | ||
1052 | * to make a good selection | ||
1053 | */ | ||
1054 | ret = ext4_mb_init_group(sb, group); | ||
1055 | if (ret) | ||
1056 | return ret; | ||
1057 | goto repeat_load_buddy; | ||
1058 | } | ||
1059 | |||
946 | /* | 1060 | /* |
947 | * the buddy cache inode stores the block bitmap | 1061 | * the buddy cache inode stores the block bitmap |
948 | * and buddy information in consecutive blocks. | 1062 | * and buddy information in consecutive blocks. |
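Condensed sketch of the retry loop added to ext4_mb_load_buddy() above: the need-init flag is only meaningful while alloc_semp is held, but ext4_mb_init_group() cannot run under that semaphore, hence drop, initialize, and re-take (structure from the diff, error handling trimmed):

    repeat_load_buddy:
    down_read(e4b->alloc_semp);
    if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
            up_read(e4b->alloc_semp);   /* can't init while holding it */
            ret = ext4_mb_init_group(sb, group);
            if (ret)
                    return ret;
            goto repeat_load_buddy;     /* re-check under the lock */
    }
    /* ... proceed with the buddy pages, NEED_INIT now clear ... */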
@@ -1360,7 +1474,7 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac, | |||
1360 | ac->alloc_semp = e4b->alloc_semp; | 1474 | ac->alloc_semp = e4b->alloc_semp; |
1361 | e4b->alloc_semp = NULL; | 1475 | e4b->alloc_semp = NULL; |
1362 | /* store last allocated for subsequent stream allocation */ | 1476 | /* store last allocated for subsequent stream allocation */ |
1363 | if ((ac->ac_flags & EXT4_MB_HINT_DATA)) { | 1477 | if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { |
1364 | spin_lock(&sbi->s_md_lock); | 1478 | spin_lock(&sbi->s_md_lock); |
1365 | sbi->s_mb_last_group = ac->ac_f_ex.fe_group; | 1479 | sbi->s_mb_last_group = ac->ac_f_ex.fe_group; |
1366 | sbi->s_mb_last_start = ac->ac_f_ex.fe_start; | 1480 | sbi->s_mb_last_start = ac->ac_f_ex.fe_start; |
@@ -1837,97 +1951,6 @@ void ext4_mb_put_buddy_cache_lock(struct super_block *sb, | |||
1837 | 1951 | ||
1838 | } | 1952 | } |
1839 | 1953 | ||
1840 | static noinline_for_stack | ||
1841 | int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) | ||
1842 | { | ||
1843 | |||
1844 | int ret; | ||
1845 | void *bitmap; | ||
1846 | int blocks_per_page; | ||
1847 | int block, pnum, poff; | ||
1848 | int num_grp_locked = 0; | ||
1849 | struct ext4_group_info *this_grp; | ||
1850 | struct ext4_sb_info *sbi = EXT4_SB(sb); | ||
1851 | struct inode *inode = sbi->s_buddy_cache; | ||
1852 | struct page *page = NULL, *bitmap_page = NULL; | ||
1853 | |||
1854 | mb_debug("init group %lu\n", group); | ||
1855 | blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; | ||
1856 | this_grp = ext4_get_group_info(sb, group); | ||
1857 | /* | ||
1858 | * This ensures we don't add group | ||
1859 | * to this buddy cache via resize | ||
1860 | */ | ||
1861 | num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group); | ||
1862 | if (!EXT4_MB_GRP_NEED_INIT(this_grp)) { | ||
1863 | /* | ||
1864 | * somebody initialized the group | ||
1865 | * return without doing anything | ||
1866 | */ | ||
1867 | ret = 0; | ||
1868 | goto err; | ||
1869 | } | ||
1870 | /* | ||
1871 | * the buddy cache inode stores the block bitmap | ||
1872 | * and buddy information in consecutive blocks. | ||
1873 | * So for each group we need two blocks. | ||
1874 | */ | ||
1875 | block = group * 2; | ||
1876 | pnum = block / blocks_per_page; | ||
1877 | poff = block % blocks_per_page; | ||
1878 | page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); | ||
1879 | if (page) { | ||
1880 | BUG_ON(page->mapping != inode->i_mapping); | ||
1881 | ret = ext4_mb_init_cache(page, NULL); | ||
1882 | if (ret) { | ||
1883 | unlock_page(page); | ||
1884 | goto err; | ||
1885 | } | ||
1886 | unlock_page(page); | ||
1887 | } | ||
1888 | if (page == NULL || !PageUptodate(page)) { | ||
1889 | ret = -EIO; | ||
1890 | goto err; | ||
1891 | } | ||
1892 | mark_page_accessed(page); | ||
1893 | bitmap_page = page; | ||
1894 | bitmap = page_address(page) + (poff * sb->s_blocksize); | ||
1895 | |||
1896 | /* init buddy cache */ | ||
1897 | block++; | ||
1898 | pnum = block / blocks_per_page; | ||
1899 | poff = block % blocks_per_page; | ||
1900 | page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); | ||
1901 | if (page == bitmap_page) { | ||
1902 | /* | ||
1903 | * If both the bitmap and buddy are in | ||
1904 | * the same page we don't need to force | ||
1905 | * init the buddy | ||
1906 | */ | ||
1907 | unlock_page(page); | ||
1908 | } else if (page) { | ||
1909 | BUG_ON(page->mapping != inode->i_mapping); | ||
1910 | ret = ext4_mb_init_cache(page, bitmap); | ||
1911 | if (ret) { | ||
1912 | unlock_page(page); | ||
1913 | goto err; | ||
1914 | } | ||
1915 | unlock_page(page); | ||
1916 | } | ||
1917 | if (page == NULL || !PageUptodate(page)) { | ||
1918 | ret = -EIO; | ||
1919 | goto err; | ||
1920 | } | ||
1921 | mark_page_accessed(page); | ||
1922 | err: | ||
1923 | ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked); | ||
1924 | if (bitmap_page) | ||
1925 | page_cache_release(bitmap_page); | ||
1926 | if (page) | ||
1927 | page_cache_release(page); | ||
1928 | return ret; | ||
1929 | } | ||
1930 | |||
1931 | static noinline_for_stack int | 1954 | static noinline_for_stack int |
1932 | ext4_mb_regular_allocator(struct ext4_allocation_context *ac) | 1955 | ext4_mb_regular_allocator(struct ext4_allocation_context *ac) |
1933 | { | 1956 | { |
@@ -1938,11 +1961,14 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) | |||
1938 | struct ext4_sb_info *sbi; | 1961 | struct ext4_sb_info *sbi; |
1939 | struct super_block *sb; | 1962 | struct super_block *sb; |
1940 | struct ext4_buddy e4b; | 1963 | struct ext4_buddy e4b; |
1941 | loff_t size, isize; | ||
1942 | 1964 | ||
1943 | sb = ac->ac_sb; | 1965 | sb = ac->ac_sb; |
1944 | sbi = EXT4_SB(sb); | 1966 | sbi = EXT4_SB(sb); |
1945 | ngroups = ext4_get_groups_count(sb); | 1967 | ngroups = ext4_get_groups_count(sb); |
1968 | /* non-extent files are limited to low blocks/groups */ | ||
1969 | if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL)) | ||
1970 | ngroups = sbi->s_blockfile_groups; | ||
1971 | |||
1946 | BUG_ON(ac->ac_status == AC_STATUS_FOUND); | 1972 | BUG_ON(ac->ac_status == AC_STATUS_FOUND); |
1947 | 1973 | ||
1948 | /* first, try the goal */ | 1974 | /* first, try the goal */ |
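The ngroups clamp added above keeps allocations for non-extent (indirect-mapped) files inside the block groups whose physical block numbers fit in 32 bits, since the indirect format stores block addresses as __le32; s_blockfile_groups presumably caches that group count. Effect, with assumed geometry:

    /* with 4 KiB blocks, 2^32 blocks = 16 TiB of addressable space;
     * groups past that boundary are never scanned for such inodes */
    if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL))
            ngroups = sbi->s_blockfile_groups;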
@@ -1974,20 +2000,16 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) | |||
1974 | } | 2000 | } |
1975 | 2001 | ||
1976 | bsbits = ac->ac_sb->s_blocksize_bits; | 2002 | bsbits = ac->ac_sb->s_blocksize_bits; |
1977 | /* if stream allocation is enabled, use global goal */ | ||
1978 | size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; | ||
1979 | isize = i_size_read(ac->ac_inode) >> bsbits; | ||
1980 | if (size < isize) | ||
1981 | size = isize; | ||
1982 | 2003 | ||
1983 | if (size < sbi->s_mb_stream_request && | 2004 | /* if stream allocation is enabled, use global goal */ |
1984 | (ac->ac_flags & EXT4_MB_HINT_DATA)) { | 2005 | if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { |
1985 | /* TBD: may be hot point */ | 2006 | /* TBD: may be hot point */ |
1986 | spin_lock(&sbi->s_md_lock); | 2007 | spin_lock(&sbi->s_md_lock); |
1987 | ac->ac_g_ex.fe_group = sbi->s_mb_last_group; | 2008 | ac->ac_g_ex.fe_group = sbi->s_mb_last_group; |
1988 | ac->ac_g_ex.fe_start = sbi->s_mb_last_start; | 2009 | ac->ac_g_ex.fe_start = sbi->s_mb_last_start; |
1989 | spin_unlock(&sbi->s_md_lock); | 2010 | spin_unlock(&sbi->s_md_lock); |
1990 | } | 2011 | } |
2012 | |||
1991 | /* Let's just scan groups to find more or less suitable blocks */ | 2013 | /* Let's just scan groups to find more or less suitable blocks */ |
1992 | cr = ac->ac_2order ? 0 : 1; | 2014 | cr = ac->ac_2order ? 0 : 1; |
1993 | /* | 2015 | /* |
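This hunk is the consumer half of the stream-allocation handoff whose producer half appeared in the ext4_mb_use_best_found() hunk earlier: the last stream allocation's position becomes the next one's goal, keeping consecutive streaming writes physically adjacent. Both halves, condensed (identifiers from the diff; s_md_lock protects the shared goal):

    /* producer, after a successful stream allocation: */
    spin_lock(&sbi->s_md_lock);
    sbi->s_mb_last_group = ac->ac_f_ex.fe_group;
    sbi->s_mb_last_start = ac->ac_f_ex.fe_start;
    spin_unlock(&sbi->s_md_lock);

    /* consumer, at the start of the next EXT4_MB_STREAM_ALLOC request: */
    spin_lock(&sbi->s_md_lock);
    ac->ac_g_ex.fe_group = sbi->s_mb_last_group;
    ac->ac_g_ex.fe_start = sbi->s_mb_last_start;
    spin_unlock(&sbi->s_md_lock);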
@@ -2015,27 +2037,6 @@ repeat: | |||
2015 | if (grp->bb_free == 0) | 2037 | if (grp->bb_free == 0) |
2016 | continue; | 2038 | continue; |
2017 | 2039 | ||
2018 | /* | ||
2019 | * if the group is already init we check whether it is | ||
2020 | * a good group and if not we don't load the buddy | ||
2021 | */ | ||
2022 | if (EXT4_MB_GRP_NEED_INIT(grp)) { | ||
2023 | /* | ||
2024 | * we need full data about the group | ||
2025 | * to make a good selection | ||
2026 | */ | ||
2027 | err = ext4_mb_init_group(sb, group); | ||
2028 | if (err) | ||
2029 | goto out; | ||
2030 | } | ||
2031 | |||
2032 | /* | ||
2033 | * If the particular group doesn't satisfy our | ||
2034 | * criteria we continue with the next group | ||
2035 | */ | ||
2036 | if (!ext4_mb_good_group(ac, group, cr)) | ||
2037 | continue; | ||
2038 | |||
2039 | err = ext4_mb_load_buddy(sb, group, &e4b); | 2040 | err = ext4_mb_load_buddy(sb, group, &e4b); |
2040 | if (err) | 2041 | if (err) |
2041 | goto out; | 2042 | goto out; |
@@ -2156,7 +2157,7 @@ static int ext4_mb_seq_history_show(struct seq_file *seq, void *v) | |||
2156 | 2157 | ||
2157 | if (v == SEQ_START_TOKEN) { | 2158 | if (v == SEQ_START_TOKEN) { |
2158 | seq_printf(seq, "%-5s %-8s %-23s %-23s %-23s %-5s " | 2159 | seq_printf(seq, "%-5s %-8s %-23s %-23s %-23s %-5s " |
2159 | "%-5s %-2s %-5s %-5s %-5s %-6s\n", | 2160 | "%-5s %-2s %-6s %-5s %-5s %-6s\n", |
2160 | "pid", "inode", "original", "goal", "result", "found", | 2161 | "pid", "inode", "original", "goal", "result", "found", |
2161 | "grps", "cr", "flags", "merge", "tail", "broken"); | 2162 | "grps", "cr", "flags", "merge", "tail", "broken"); |
2162 | return 0; | 2163 | return 0; |
@@ -2164,7 +2165,7 @@ static int ext4_mb_seq_history_show(struct seq_file *seq, void *v) | |||
2164 | 2165 | ||
2165 | if (hs->op == EXT4_MB_HISTORY_ALLOC) { | 2166 | if (hs->op == EXT4_MB_HISTORY_ALLOC) { |
2166 | fmt = "%-5u %-8u %-23s %-23s %-23s %-5u %-5u %-2u " | 2167 | fmt = "%-5u %-8u %-23s %-23s %-23s %-5u %-5u %-2u " |
2167 | "%-5u %-5s %-5u %-6u\n"; | 2168 | "0x%04x %-5s %-5u %-6u\n"; |
2168 | sprintf(buf2, "%u/%d/%u@%u", hs->result.fe_group, | 2169 | sprintf(buf2, "%u/%d/%u@%u", hs->result.fe_group, |
2169 | hs->result.fe_start, hs->result.fe_len, | 2170 | hs->result.fe_start, hs->result.fe_len, |
2170 | hs->result.fe_logical); | 2171 | hs->result.fe_logical); |
@@ -2205,7 +2206,7 @@ static void ext4_mb_seq_history_stop(struct seq_file *seq, void *v) | |||
2205 | { | 2206 | { |
2206 | } | 2207 | } |
2207 | 2208 | ||
2208 | static struct seq_operations ext4_mb_seq_history_ops = { | 2209 | static const struct seq_operations ext4_mb_seq_history_ops = { |
2209 | .start = ext4_mb_seq_history_start, | 2210 | .start = ext4_mb_seq_history_start, |
2210 | .next = ext4_mb_seq_history_next, | 2211 | .next = ext4_mb_seq_history_next, |
2211 | .stop = ext4_mb_seq_history_stop, | 2212 | .stop = ext4_mb_seq_history_stop, |
@@ -2287,7 +2288,7 @@ static ssize_t ext4_mb_seq_history_write(struct file *file, | |||
2287 | return count; | 2288 | return count; |
2288 | } | 2289 | } |
2289 | 2290 | ||
2290 | static struct file_operations ext4_mb_seq_history_fops = { | 2291 | static const struct file_operations ext4_mb_seq_history_fops = { |
2291 | .owner = THIS_MODULE, | 2292 | .owner = THIS_MODULE, |
2292 | .open = ext4_mb_seq_history_open, | 2293 | .open = ext4_mb_seq_history_open, |
2293 | .read = seq_read, | 2294 | .read = seq_read, |
@@ -2328,7 +2329,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) | |||
2328 | struct ext4_buddy e4b; | 2329 | struct ext4_buddy e4b; |
2329 | struct sg { | 2330 | struct sg { |
2330 | struct ext4_group_info info; | 2331 | struct ext4_group_info info; |
2331 | unsigned short counters[16]; | 2332 | ext4_grpblk_t counters[16]; |
2332 | } sg; | 2333 | } sg; |
2333 | 2334 | ||
2334 | group--; | 2335 | group--; |
@@ -2366,7 +2367,7 @@ static void ext4_mb_seq_groups_stop(struct seq_file *seq, void *v) | |||
2366 | { | 2367 | { |
2367 | } | 2368 | } |
2368 | 2369 | ||
2369 | static struct seq_operations ext4_mb_seq_groups_ops = { | 2370 | static const struct seq_operations ext4_mb_seq_groups_ops = { |
2370 | .start = ext4_mb_seq_groups_start, | 2371 | .start = ext4_mb_seq_groups_start, |
2371 | .next = ext4_mb_seq_groups_next, | 2372 | .next = ext4_mb_seq_groups_next, |
2372 | .stop = ext4_mb_seq_groups_stop, | 2373 | .stop = ext4_mb_seq_groups_stop, |
@@ -2387,7 +2388,7 @@ static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file) | |||
2387 | 2388 | ||
2388 | } | 2389 | } |
2389 | 2390 | ||
2390 | static struct file_operations ext4_mb_seq_groups_fops = { | 2391 | static const struct file_operations ext4_mb_seq_groups_fops = { |
2391 | .owner = THIS_MODULE, | 2392 | .owner = THIS_MODULE, |
2392 | .open = ext4_mb_seq_groups_open, | 2393 | .open = ext4_mb_seq_groups_open, |
2393 | .read = seq_read, | 2394 | .read = seq_read, |
@@ -2532,7 +2533,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, | |||
2532 | 2533 | ||
2533 | INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); | 2534 | INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); |
2534 | init_rwsem(&meta_group_info[i]->alloc_sem); | 2535 | init_rwsem(&meta_group_info[i]->alloc_sem); |
2535 | meta_group_info[i]->bb_free_root.rb_node = NULL;; | 2536 | meta_group_info[i]->bb_free_root.rb_node = NULL; |
2536 | 2537 | ||
2537 | #ifdef DOUBLE_CHECK | 2538 | #ifdef DOUBLE_CHECK |
2538 | { | 2539 | { |
@@ -2558,26 +2559,15 @@ exit_meta_group_info: | |||
2558 | return -ENOMEM; | 2559 | return -ENOMEM; |
2559 | } /* ext4_mb_add_groupinfo */ | 2560 | } /* ext4_mb_add_groupinfo */ |
2560 | 2561 | ||
2561 | /* | ||
2562 | * Update an existing group. | ||
2563 | * This function is used for online resize | ||
2564 | */ | ||
2565 | void ext4_mb_update_group_info(struct ext4_group_info *grp, ext4_grpblk_t add) | ||
2566 | { | ||
2567 | grp->bb_free += add; | ||
2568 | } | ||
2569 | |||
2570 | static int ext4_mb_init_backend(struct super_block *sb) | 2562 | static int ext4_mb_init_backend(struct super_block *sb) |
2571 | { | 2563 | { |
2572 | ext4_group_t ngroups = ext4_get_groups_count(sb); | 2564 | ext4_group_t ngroups = ext4_get_groups_count(sb); |
2573 | ext4_group_t i; | 2565 | ext4_group_t i; |
2574 | int metalen; | ||
2575 | struct ext4_sb_info *sbi = EXT4_SB(sb); | 2566 | struct ext4_sb_info *sbi = EXT4_SB(sb); |
2576 | struct ext4_super_block *es = sbi->s_es; | 2567 | struct ext4_super_block *es = sbi->s_es; |
2577 | int num_meta_group_infos; | 2568 | int num_meta_group_infos; |
2578 | int num_meta_group_infos_max; | 2569 | int num_meta_group_infos_max; |
2579 | int array_size; | 2570 | int array_size; |
2580 | struct ext4_group_info **meta_group_info; | ||
2581 | struct ext4_group_desc *desc; | 2571 | struct ext4_group_desc *desc; |
2582 | 2572 | ||
2583 | /* This is the number of blocks used by GDT */ | 2573 | /* This is the number of blocks used by GDT */ |
@@ -2622,22 +2612,6 @@ static int ext4_mb_init_backend(struct super_block *sb) | |||
2622 | goto err_freesgi; | 2612 | goto err_freesgi; |
2623 | } | 2613 | } |
2624 | EXT4_I(sbi->s_buddy_cache)->i_disksize = 0; | 2614 | EXT4_I(sbi->s_buddy_cache)->i_disksize = 0; |
2625 | |||
2626 | metalen = sizeof(*meta_group_info) << EXT4_DESC_PER_BLOCK_BITS(sb); | ||
2627 | for (i = 0; i < num_meta_group_infos; i++) { | ||
2628 | if ((i + 1) == num_meta_group_infos) | ||
2629 | metalen = sizeof(*meta_group_info) * | ||
2630 | (ngroups - | ||
2631 | (i << EXT4_DESC_PER_BLOCK_BITS(sb))); | ||
2632 | meta_group_info = kmalloc(metalen, GFP_KERNEL); | ||
2633 | if (meta_group_info == NULL) { | ||
2634 | printk(KERN_ERR "EXT4-fs: can't allocate mem for a " | ||
2635 | "buddy group\n"); | ||
2636 | goto err_freemeta; | ||
2637 | } | ||
2638 | sbi->s_group_info[i] = meta_group_info; | ||
2639 | } | ||
2640 | |||
2641 | for (i = 0; i < ngroups; i++) { | 2615 | for (i = 0; i < ngroups; i++) { |
2642 | desc = ext4_get_group_desc(sb, i, NULL); | 2616 | desc = ext4_get_group_desc(sb, i, NULL); |
2643 | if (desc == NULL) { | 2617 | if (desc == NULL) { |
@@ -2655,7 +2629,6 @@ err_freebuddy: | |||
2655 | while (i-- > 0) | 2629 | while (i-- > 0) |
2656 | kfree(ext4_get_group_info(sb, i)); | 2630 | kfree(ext4_get_group_info(sb, i)); |
2657 | i = num_meta_group_infos; | 2631 | i = num_meta_group_infos; |
2658 | err_freemeta: | ||
2659 | while (i-- > 0) | 2632 | while (i-- > 0) |
2660 | kfree(sbi->s_group_info[i]); | 2633 | kfree(sbi->s_group_info[i]); |
2661 | iput(sbi->s_buddy_cache); | 2634 | iput(sbi->s_buddy_cache); |
@@ -2672,14 +2645,14 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) | |||
2672 | unsigned max; | 2645 | unsigned max; |
2673 | int ret; | 2646 | int ret; |
2674 | 2647 | ||
2675 | i = (sb->s_blocksize_bits + 2) * sizeof(unsigned short); | 2648 | i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets); |
2676 | 2649 | ||
2677 | sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL); | 2650 | sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL); |
2678 | if (sbi->s_mb_offsets == NULL) { | 2651 | if (sbi->s_mb_offsets == NULL) { |
2679 | return -ENOMEM; | 2652 | return -ENOMEM; |
2680 | } | 2653 | } |
2681 | 2654 | ||
2682 | i = (sb->s_blocksize_bits + 2) * sizeof(unsigned int); | 2655 | i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_maxs); |
2683 | sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); | 2656 | sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); |
2684 | if (sbi->s_mb_maxs == NULL) { | 2657 | if (sbi->s_mb_maxs == NULL) { |
2685 | kfree(sbi->s_mb_offsets); | 2658 | kfree(sbi->s_mb_offsets); |
@@ -2758,7 +2731,7 @@ static void ext4_mb_cleanup_pa(struct ext4_group_info *grp) | |||
2758 | kmem_cache_free(ext4_pspace_cachep, pa); | 2731 | kmem_cache_free(ext4_pspace_cachep, pa); |
2759 | } | 2732 | } |
2760 | if (count) | 2733 | if (count) |
2761 | mb_debug("mballoc: %u PAs left\n", count); | 2734 | mb_debug(1, "mballoc: %u PAs left\n", count); |
2762 | 2735 | ||
2763 | } | 2736 | } |
2764 | 2737 | ||
@@ -2839,7 +2812,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) | |||
2839 | list_for_each_safe(l, ltmp, &txn->t_private_list) { | 2812 | list_for_each_safe(l, ltmp, &txn->t_private_list) { |
2840 | entry = list_entry(l, struct ext4_free_data, list); | 2813 | entry = list_entry(l, struct ext4_free_data, list); |
2841 | 2814 | ||
2842 | mb_debug("gonna free %u blocks in group %u (0x%p):", | 2815 | mb_debug(1, "gonna free %u blocks in group %u (0x%p):", |
2843 | entry->count, entry->group, entry); | 2816 | entry->count, entry->group, entry); |
2844 | 2817 | ||
2845 | err = ext4_mb_load_buddy(sb, entry->group, &e4b); | 2818 | err = ext4_mb_load_buddy(sb, entry->group, &e4b); |
@@ -2874,9 +2847,43 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) | |||
2874 | ext4_mb_release_desc(&e4b); | 2847 | ext4_mb_release_desc(&e4b); |
2875 | } | 2848 | } |
2876 | 2849 | ||
2877 | mb_debug("freed %u blocks in %u structures\n", count, count2); | 2850 | mb_debug(1, "freed %u blocks in %u structures\n", count, count2); |
2851 | } | ||
2852 | |||
2853 | #ifdef CONFIG_EXT4_DEBUG | ||
2854 | u8 mb_enable_debug __read_mostly; | ||
2855 | |||
2856 | static struct dentry *debugfs_dir; | ||
2857 | static struct dentry *debugfs_debug; | ||
2858 | |||
2859 | static void __init ext4_create_debugfs_entry(void) | ||
2860 | { | ||
2861 | debugfs_dir = debugfs_create_dir("ext4", NULL); | ||
2862 | if (debugfs_dir) | ||
2863 | debugfs_debug = debugfs_create_u8("mballoc-debug", | ||
2864 | S_IRUGO | S_IWUSR, | ||
2865 | debugfs_dir, | ||
2866 | &mb_enable_debug); | ||
2867 | } | ||
2868 | |||
2869 | static void ext4_remove_debugfs_entry(void) | ||
2870 | { | ||
2871 | debugfs_remove(debugfs_debug); | ||
2872 | debugfs_remove(debugfs_dir); | ||
2878 | } | 2873 | } |
2879 | 2874 | ||
2875 | #else | ||
2876 | |||
2877 | static void __init ext4_create_debugfs_entry(void) | ||
2878 | { | ||
2879 | } | ||
2880 | |||
2881 | static void ext4_remove_debugfs_entry(void) | ||
2882 | { | ||
2883 | } | ||
2884 | |||
2885 | #endif | ||
2886 | |||
2880 | int __init init_ext4_mballoc(void) | 2887 | int __init init_ext4_mballoc(void) |
2881 | { | 2888 | { |
2882 | ext4_pspace_cachep = | 2889 | ext4_pspace_cachep = |
@@ -2904,6 +2911,7 @@ int __init init_ext4_mballoc(void) | |||
2904 | kmem_cache_destroy(ext4_ac_cachep); | 2911 | kmem_cache_destroy(ext4_ac_cachep); |
2905 | return -ENOMEM; | 2912 | return -ENOMEM; |
2906 | } | 2913 | } |
2914 | ext4_create_debugfs_entry(); | ||
2907 | return 0; | 2915 | return 0; |
2908 | } | 2916 | } |
2909 | 2917 | ||
@@ -2917,6 +2925,7 @@ void exit_ext4_mballoc(void) | |||
2917 | kmem_cache_destroy(ext4_pspace_cachep); | 2925 | kmem_cache_destroy(ext4_pspace_cachep); |
2918 | kmem_cache_destroy(ext4_ac_cachep); | 2926 | kmem_cache_destroy(ext4_ac_cachep); |
2919 | kmem_cache_destroy(ext4_free_ext_cachep); | 2927 | kmem_cache_destroy(ext4_free_ext_cachep); |
2928 | ext4_remove_debugfs_entry(); | ||
2920 | } | 2929 | } |
2921 | 2930 | ||
2922 | 2931 | ||
@@ -3061,7 +3070,7 @@ static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac) | |||
3061 | ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_stripe; | 3070 | ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_stripe; |
3062 | else | 3071 | else |
3063 | ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc; | 3072 | ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc; |
3064 | mb_debug("#%u: goal %u blocks for locality group\n", | 3073 | mb_debug(1, "#%u: goal %u blocks for locality group\n", |
3065 | current->pid, ac->ac_g_ex.fe_len); | 3074 | current->pid, ac->ac_g_ex.fe_len); |
3066 | } | 3075 | } |
3067 | 3076 | ||
@@ -3180,23 +3189,18 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, | |||
3180 | BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end || | 3189 | BUG_ON(!(ac->ac_o_ex.fe_logical >= pa_end || |
3181 | ac->ac_o_ex.fe_logical < pa->pa_lstart)); | 3190 | ac->ac_o_ex.fe_logical < pa->pa_lstart)); |
3182 | 3191 | ||
3183 | /* skip PA normalized request doesn't overlap with */ | 3192 | /* skip PAs this normalized request doesn't overlap with */ |
3184 | if (pa->pa_lstart >= end) { | 3193 | if (pa->pa_lstart >= end || pa_end <= start) { |
3185 | spin_unlock(&pa->pa_lock); | ||
3186 | continue; | ||
3187 | } | ||
3188 | if (pa_end <= start) { | ||
3189 | spin_unlock(&pa->pa_lock); | 3194 | spin_unlock(&pa->pa_lock); |
3190 | continue; | 3195 | continue; |
3191 | } | 3196 | } |
3192 | BUG_ON(pa->pa_lstart <= start && pa_end >= end); | 3197 | BUG_ON(pa->pa_lstart <= start && pa_end >= end); |
3193 | 3198 | ||
3199 | /* adjust start or end to be adjacent to this pa */ | ||
3194 | if (pa_end <= ac->ac_o_ex.fe_logical) { | 3200 | if (pa_end <= ac->ac_o_ex.fe_logical) { |
3195 | BUG_ON(pa_end < start); | 3201 | BUG_ON(pa_end < start); |
3196 | start = pa_end; | 3202 | start = pa_end; |
3197 | } | 3203 | } else if (pa->pa_lstart > ac->ac_o_ex.fe_logical) { |
3198 | |||
3199 | if (pa->pa_lstart > ac->ac_o_ex.fe_logical) { | ||
3200 | BUG_ON(pa->pa_lstart > end); | 3204 | BUG_ON(pa->pa_lstart > end); |
3201 | end = pa->pa_lstart; | 3205 | end = pa->pa_lstart; |
3202 | } | 3206 | } |
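A worked example of the window trimming above, with made-up numbers: let the normalized request span logical blocks [0, 64) with the original allocation at logical block 40, and let two live preallocations cover [8, 16) and [48, 56). The first PA hits the pa_end (16) <= fe_logical (40) case, so start becomes 16; the second hits pa_lstart (48) > fe_logical (40), so end becomes 48. The window shrinks to [16, 48): it still contains the original block but overlaps neither PA, which is the invariant the surrounding BUG_ONs enforce.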
@@ -3251,7 +3255,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, | |||
3251 | ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL; | 3255 | ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL; |
3252 | } | 3256 | } |
3253 | 3257 | ||
3254 | mb_debug("goal: %u(was %u) blocks at %u\n", (unsigned) size, | 3258 | mb_debug(1, "goal: %u(was %u) blocks at %u\n", (unsigned) size, |
3255 | (unsigned) orig_size, (unsigned) start); | 3259 | (unsigned) orig_size, (unsigned) start); |
3256 | } | 3260 | } |
3257 | 3261 | ||
@@ -3300,7 +3304,7 @@ static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac, | |||
3300 | BUG_ON(pa->pa_free < len); | 3304 | BUG_ON(pa->pa_free < len); |
3301 | pa->pa_free -= len; | 3305 | pa->pa_free -= len; |
3302 | 3306 | ||
3303 | mb_debug("use %llu/%u from inode pa %p\n", start, len, pa); | 3307 | mb_debug(1, "use %llu/%u from inode pa %p\n", start, len, pa); |
3304 | } | 3308 | } |
3305 | 3309 | ||
3306 | /* | 3310 | /* |
@@ -3324,7 +3328,7 @@ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac, | |||
3324 | * in on-disk bitmap -- see ext4_mb_release_context() | 3328 | * in on-disk bitmap -- see ext4_mb_release_context() |
3325 | * Other CPUs are prevented from allocating from this pa by lg_mutex | 3329 | * Other CPUs are prevented from allocating from this pa by lg_mutex |
3326 | */ | 3330 | */ |
3327 | mb_debug("use %u/%u from group pa %p\n", pa->pa_lstart-len, len, pa); | 3331 | mb_debug(1, "use %u/%u from group pa %p\n", pa->pa_lstart-len, len, pa); |
3328 | } | 3332 | } |
3329 | 3333 | ||
3330 | /* | 3334 | /* |
@@ -3382,6 +3386,11 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) | |||
3382 | ac->ac_o_ex.fe_logical >= pa->pa_lstart + pa->pa_len) | 3386 | ac->ac_o_ex.fe_logical >= pa->pa_lstart + pa->pa_len) |
3383 | continue; | 3387 | continue; |
3384 | 3388 | ||
3389 | /* non-extent files can't have physical blocks past 2^32 */ | ||
3390 | if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL) && | ||
3391 | pa->pa_pstart + pa->pa_len > EXT4_MAX_BLOCK_FILE_PHYS) | ||
3392 | continue; | ||
3393 | |||
3385 | /* found preallocated blocks, use them */ | 3394 | /* found preallocated blocks, use them */ |
3386 | spin_lock(&pa->pa_lock); | 3395 | spin_lock(&pa->pa_lock); |
3387 | if (pa->pa_deleted == 0 && pa->pa_free) { | 3396 | if (pa->pa_deleted == 0 && pa->pa_free) { |
@@ -3503,7 +3512,7 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, | |||
3503 | preallocated += len; | 3512 | preallocated += len; |
3504 | count++; | 3513 | count++; |
3505 | } | 3514 | } |
3506 | mb_debug("preallocated %u for group %u\n", preallocated, group); | 3515 | mb_debug(1, "preallocated %u for group %u\n", preallocated, group); |
3507 | } | 3516 | } |
3508 | 3517 | ||
3509 | static void ext4_mb_pa_callback(struct rcu_head *head) | 3518 | static void ext4_mb_pa_callback(struct rcu_head *head) |
@@ -3638,7 +3647,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) | |||
3638 | pa->pa_deleted = 0; | 3647 | pa->pa_deleted = 0; |
3639 | pa->pa_type = MB_INODE_PA; | 3648 | pa->pa_type = MB_INODE_PA; |
3640 | 3649 | ||
3641 | mb_debug("new inode pa %p: %llu/%u for %u\n", pa, | 3650 | mb_debug(1, "new inode pa %p: %llu/%u for %u\n", pa, |
3642 | pa->pa_pstart, pa->pa_len, pa->pa_lstart); | 3651 | pa->pa_pstart, pa->pa_len, pa->pa_lstart); |
3643 | trace_ext4_mb_new_inode_pa(ac, pa); | 3652 | trace_ext4_mb_new_inode_pa(ac, pa); |
3644 | 3653 | ||
@@ -3698,7 +3707,7 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac) | |||
3698 | pa->pa_deleted = 0; | 3707 | pa->pa_deleted = 0; |
3699 | pa->pa_type = MB_GROUP_PA; | 3708 | pa->pa_type = MB_GROUP_PA; |
3700 | 3709 | ||
3701 | mb_debug("new group pa %p: %llu/%u for %u\n", pa, | 3710 | mb_debug(1, "new group pa %p: %llu/%u for %u\n", pa, |
3702 | pa->pa_pstart, pa->pa_len, pa->pa_lstart); | 3711 | pa->pa_pstart, pa->pa_len, pa->pa_lstart); |
3703 | trace_ext4_mb_new_group_pa(ac, pa); | 3712 | trace_ext4_mb_new_group_pa(ac, pa); |
3704 | 3713 | ||
@@ -3777,7 +3786,7 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, | |||
3777 | next = mb_find_next_bit(bitmap_bh->b_data, end, bit); | 3786 | next = mb_find_next_bit(bitmap_bh->b_data, end, bit); |
3778 | start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit + | 3787 | start = group * EXT4_BLOCKS_PER_GROUP(sb) + bit + |
3779 | le32_to_cpu(sbi->s_es->s_first_data_block); | 3788 | le32_to_cpu(sbi->s_es->s_first_data_block); |
3780 | mb_debug(" free preallocated %u/%u in group %u\n", | 3789 | mb_debug(1, " free preallocated %u/%u in group %u\n", |
3781 | (unsigned) start, (unsigned) next - bit, | 3790 | (unsigned) start, (unsigned) next - bit, |
3782 | (unsigned) group); | 3791 | (unsigned) group); |
3783 | free += next - bit; | 3792 | free += next - bit; |
@@ -3868,7 +3877,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, | |||
3868 | int busy = 0; | 3877 | int busy = 0; |
3869 | int free = 0; | 3878 | int free = 0; |
3870 | 3879 | ||
3871 | mb_debug("discard preallocation for group %u\n", group); | 3880 | mb_debug(1, "discard preallocation for group %u\n", group); |
3872 | 3881 | ||
3873 | if (list_empty(&grp->bb_prealloc_list)) | 3882 | if (list_empty(&grp->bb_prealloc_list)) |
3874 | return 0; | 3883 | return 0; |
@@ -3992,7 +4001,7 @@ void ext4_discard_preallocations(struct inode *inode) | |||
3992 | return; | 4001 | return; |
3993 | } | 4002 | } |
3994 | 4003 | ||
3995 | mb_debug("discard preallocation for inode %lu\n", inode->i_ino); | 4004 | mb_debug(1, "discard preallocation for inode %lu\n", inode->i_ino); |
3996 | trace_ext4_discard_preallocations(inode); | 4005 | trace_ext4_discard_preallocations(inode); |
3997 | 4006 | ||
3998 | INIT_LIST_HEAD(&list); | 4007 | INIT_LIST_HEAD(&list); |
@@ -4097,7 +4106,7 @@ static void ext4_mb_return_to_preallocation(struct inode *inode, | |||
4097 | { | 4106 | { |
4098 | BUG_ON(!list_empty(&EXT4_I(inode)->i_prealloc_list)); | 4107 | BUG_ON(!list_empty(&EXT4_I(inode)->i_prealloc_list)); |
4099 | } | 4108 | } |
4100 | #ifdef MB_DEBUG | 4109 | #ifdef CONFIG_EXT4_DEBUG |
4101 | static void ext4_mb_show_ac(struct ext4_allocation_context *ac) | 4110 | static void ext4_mb_show_ac(struct ext4_allocation_context *ac) |
4102 | { | 4111 | { |
4103 | struct super_block *sb = ac->ac_sb; | 4112 | struct super_block *sb = ac->ac_sb; |
@@ -4139,14 +4148,14 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) | |||
4139 | ext4_get_group_no_and_offset(sb, pa->pa_pstart, | 4148 | ext4_get_group_no_and_offset(sb, pa->pa_pstart, |
4140 | NULL, &start); | 4149 | NULL, &start); |
4141 | spin_unlock(&pa->pa_lock); | 4150 | spin_unlock(&pa->pa_lock); |
4142 | printk(KERN_ERR "PA:%lu:%d:%u \n", i, | 4151 | printk(KERN_ERR "PA:%u:%d:%u \n", i, |
4143 | start, pa->pa_len); | 4152 | start, pa->pa_len); |
4144 | } | 4153 | } |
4145 | ext4_unlock_group(sb, i); | 4154 | ext4_unlock_group(sb, i); |
4146 | 4155 | ||
4147 | if (grp->bb_free == 0) | 4156 | if (grp->bb_free == 0) |
4148 | continue; | 4157 | continue; |
4149 | printk(KERN_ERR "%lu: %d/%d \n", | 4158 | printk(KERN_ERR "%u: %d/%d \n", |
4150 | i, grp->bb_free, grp->bb_fragments); | 4159 | i, grp->bb_free, grp->bb_fragments); |
4151 | } | 4160 | } |
4152 | printk(KERN_ERR "\n"); | 4161 | printk(KERN_ERR "\n"); |
@@ -4174,16 +4183,26 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) | |||
4174 | if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) | 4183 | if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) |
4175 | return; | 4184 | return; |
4176 | 4185 | ||
4186 | if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) | ||
4187 | return; | ||
4188 | |||
4177 | size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; | 4189 | size = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len; |
4178 | isize = i_size_read(ac->ac_inode) >> bsbits; | 4190 | isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1) |
4191 | >> bsbits; | ||
4179 | size = max(size, isize); | 4192 | size = max(size, isize); |
4180 | 4193 | ||
4181 | /* don't use group allocation for large files */ | 4194 | if ((size == isize) && |
4182 | if (size >= sbi->s_mb_stream_request) | 4195 | !ext4_fs_is_busy(sbi) && |
4196 | (atomic_read(&ac->ac_inode->i_writecount) == 0)) { | ||
4197 | ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC; | ||
4183 | return; | 4198 | return; |
4199 | } | ||
4184 | 4200 | ||
4185 | if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)) | 4201 | /* don't use group allocation for large files */ |
4202 | if (size >= sbi->s_mb_stream_request) { | ||
4203 | ac->ac_flags |= EXT4_MB_STREAM_ALLOC; | ||
4186 | return; | 4204 | return; |
4205 | } | ||
4187 | 4206 | ||
4188 | BUG_ON(ac->ac_lg != NULL); | 4207 | BUG_ON(ac->ac_lg != NULL); |
4189 | /* | 4208 | /* |
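Reading the reworked ext4_mb_group_or_file() above, the apparent intent of the new first case: a write that ends exactly at the (block-rounded) end of file, on a filesystem that is not busy, with no writer still holding the file open, is most likely the final flush of a closed file, so preallocating for it would only leave unused PAs behind. Decision sketch (identifiers from the diff, bodies trimmed):

    size  = ac->ac_o_ex.fe_logical + ac->ac_o_ex.fe_len;
    isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
                    >> bsbits;              /* i_size in blocks, rounded up */
    size  = max(size, isize);

    if (size == isize && !ext4_fs_is_busy(sbi) &&
        atomic_read(&ac->ac_inode->i_writecount) == 0)
            ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC;  /* closing write */
    else if (size >= sbi->s_mb_stream_request)
            ac->ac_flags |= EXT4_MB_STREAM_ALLOC;     /* large file    */
    /* else: small file -- fall through to the per-CPU locality group */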
@@ -4246,7 +4265,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac, | |||
4246 | * locality group. this is a policy, actually */ | 4265 | * locality group. this is a policy, actually */ |
4247 | ext4_mb_group_or_file(ac); | 4266 | ext4_mb_group_or_file(ac); |
4248 | 4267 | ||
4249 | mb_debug("init ac: %u blocks @ %u, goal %u, flags %x, 2^%d, " | 4268 | mb_debug(1, "init ac: %u blocks @ %u, goal %u, flags %x, 2^%d, " |
4250 | "left: %u/%u, right %u/%u to %swritable\n", | 4269 | "left: %u/%u, right %u/%u to %swritable\n", |
4251 | (unsigned) ar->len, (unsigned) ar->logical, | 4270 | (unsigned) ar->len, (unsigned) ar->logical, |
4252 | (unsigned) ar->goal, ac->ac_flags, ac->ac_2order, | 4271 | (unsigned) ar->goal, ac->ac_flags, ac->ac_2order, |
@@ -4268,7 +4287,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb, | |||
4268 | struct ext4_prealloc_space *pa, *tmp; | 4287 | struct ext4_prealloc_space *pa, *tmp; |
4269 | struct ext4_allocation_context *ac; | 4288 | struct ext4_allocation_context *ac; |
4270 | 4289 | ||
4271 | mb_debug("discard locality group preallocation\n"); | 4290 | mb_debug(1, "discard locality group preallocation\n"); |
4272 | 4291 | ||
4273 | INIT_LIST_HEAD(&discard_list); | 4292 | INIT_LIST_HEAD(&discard_list); |
4274 | ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); | 4293 | ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); |
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index c96bb19f58f9..188d3d709b24 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h | |||
@@ -37,11 +37,19 @@ | |||
37 | 37 | ||
38 | /* | 38 | /* |
39 | */ | 39 | */ |
40 | #define MB_DEBUG__ | 40 | #ifdef CONFIG_EXT4_DEBUG |
41 | #ifdef MB_DEBUG | 41 | extern u8 mb_enable_debug; |
42 | #define mb_debug(fmt, a...) printk(fmt, ##a) | 42 | |
43 | #define mb_debug(n, fmt, a...) \ | ||
44 | do { \ | ||
45 | if ((n) <= mb_enable_debug) { \ | ||
46 | printk(KERN_DEBUG "(%s, %d): %s: ", \ | ||
47 | __FILE__, __LINE__, __func__); \ | ||
48 | printk(fmt, ## a); \ | ||
49 | } \ | ||
50 | } while (0) | ||
43 | #else | 51 | #else |
44 | #define mb_debug(fmt, a...) | 52 | #define mb_debug(n, fmt, a...) |
45 | #endif | 53 | #endif |
46 | 54 | ||
47 | /* | 55 | /* |
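A usage sketch for the reworked macro: the new first argument is a verbosity level compared against mb_enable_debug, the runtime u8 knob that mballoc.c registers at /sys/kernel/debug/ext4/mballoc-debug when CONFIG_EXT4_DEBUG is set (see the debugfs hunk earlier). A message prints only while its level is at or below the knob's value:

    mb_debug(1, "load group %u\n", group);  /* level-1 call from the diff */
    mb_debug(2, "extra detail\n");          /* hypothetical level-2 call:
                                             * silent until the knob >= 2 */

    /* at runtime (shell, not C):
     *   echo 1 > /sys/kernel/debug/ext4/mballoc-debug
     * enables every level-1 message */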
@@ -128,8 +136,8 @@ struct ext4_prealloc_space { | |||
128 | unsigned pa_deleted; | 136 | unsigned pa_deleted; |
129 | ext4_fsblk_t pa_pstart; /* phys. block */ | 137 | ext4_fsblk_t pa_pstart; /* phys. block */ |
130 | ext4_lblk_t pa_lstart; /* log. block */ | 138 | ext4_lblk_t pa_lstart; /* log. block */ |
131 | unsigned short pa_len; /* len of preallocated chunk */ | 139 | ext4_grpblk_t pa_len; /* len of preallocated chunk */ |
132 | unsigned short pa_free; /* how many blocks are free */ | 140 | ext4_grpblk_t pa_free; /* how many blocks are free */ |
133 | unsigned short pa_type; /* pa type. inode or group */ | 141 | unsigned short pa_type; /* pa type. inode or group */ |
134 | spinlock_t *pa_obj_lock; | 142 | spinlock_t *pa_obj_lock; |
135 | struct inode *pa_inode; /* hack, for history only */ | 143 | struct inode *pa_inode; /* hack, for history only */ |
@@ -144,7 +152,7 @@ struct ext4_free_extent { | |||
144 | ext4_lblk_t fe_logical; | 152 | ext4_lblk_t fe_logical; |
145 | ext4_grpblk_t fe_start; | 153 | ext4_grpblk_t fe_start; |
146 | ext4_group_t fe_group; | 154 | ext4_group_t fe_group; |
147 | int fe_len; | 155 | ext4_grpblk_t fe_len; |
148 | }; | 156 | }; |
149 | 157 | ||
150 | /* | 158 | /* |
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index 313a50b39741..bf519f239ae6 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c | |||
@@ -353,17 +353,16 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode, | |||
353 | 353 | ||
354 | down_write(&EXT4_I(inode)->i_data_sem); | 354 | down_write(&EXT4_I(inode)->i_data_sem); |
355 | /* | 355 | /* |
356 | * if EXT4_EXT_MIGRATE is cleared a block allocation | 356 | * if EXT4_STATE_EXT_MIGRATE is cleared a block allocation |
357 | * happened after we started the migrate. We need to | 357 | * happened after we started the migrate. We need to |
358 | * fail the migrate | 358 | * fail the migrate |
359 | */ | 359 | */ |
360 | if (!(EXT4_I(inode)->i_flags & EXT4_EXT_MIGRATE)) { | 360 | if (!(EXT4_I(inode)->i_state & EXT4_STATE_EXT_MIGRATE)) { |
361 | retval = -EAGAIN; | 361 | retval = -EAGAIN; |
362 | up_write(&EXT4_I(inode)->i_data_sem); | 362 | up_write(&EXT4_I(inode)->i_data_sem); |
363 | goto err_out; | 363 | goto err_out; |
364 | } else | 364 | } else |
365 | EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags & | 365 | EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE; |
366 | ~EXT4_EXT_MIGRATE; | ||
367 | /* | 366 | /* |
368 | * We have the extent map built with the tmp inode. | 367 | * We have the extent map built with the tmp inode. |
369 | * Now copy the i_data across | 368 | * Now copy the i_data across |
@@ -517,14 +516,15 @@ int ext4_ext_migrate(struct inode *inode) | |||
517 | * when we add extents we extend the journal | 516 | * when we add extents we extend the journal |
518 | */ | 517 | */ |
519 | /* | 518 | /* |
520 | * Even though we take i_mutex we can still cause block allocation | 519 | * Even though we take i_mutex we can still cause block |
521 | * via mmap write to holes. If we have allocated new blocks we fail | 520 | * allocation via mmap write to holes. If we have allocated |
522 | * migrate. New block allocation will clear EXT4_EXT_MIGRATE flag. | 521 | * new blocks we fail migrate. New block allocation will |
523 | * The flag is updated with i_data_sem held to prevent racing with | 522 | * clear EXT4_STATE_EXT_MIGRATE flag. The flag is updated |
524 | * block allocation. | 523 | * with i_data_sem held to prevent racing with block |
524 | * allocation. | ||
525 | */ | 525 | */ |
526 | down_read((&EXT4_I(inode)->i_data_sem)); | 526 | down_read((&EXT4_I(inode)->i_data_sem)); |
527 | EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags | EXT4_EXT_MIGRATE; | 527 | EXT4_I(inode)->i_state |= EXT4_STATE_EXT_MIGRATE; |
528 | up_read((&EXT4_I(inode)->i_data_sem)); | 528 | up_read((&EXT4_I(inode)->i_data_sem)); |
529 | 529 | ||
530 | handle = ext4_journal_start(inode, 1); | 530 | handle = ext4_journal_start(inode, 1); |
@@ -618,7 +618,7 @@ err_out: | |||
618 | tmp_inode->i_nlink = 0; | 618 | tmp_inode->i_nlink = 0; |
619 | 619 | ||
620 | ext4_journal_stop(handle); | 620 | ext4_journal_stop(handle); |
621 | 621 | unlock_new_inode(tmp_inode); | |
622 | iput(tmp_inode); | 622 | iput(tmp_inode); |
623 | 623 | ||
624 | return retval; | 624 | return retval; |
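Together with the earlier inode.c hunk that stopped masking EXT4_EXT_MIGRATE out of raw_inode->i_flags, these migrate.c changes move the migrate marker from the persisted i_flags word into the purely in-memory i_state word, so it can never reach disk. Condensed protocol (identifiers from the diff, structure compressed):

    down_read(&EXT4_I(inode)->i_data_sem);
    EXT4_I(inode)->i_state |= EXT4_STATE_EXT_MIGRATE;  /* migrate begins */
    up_read(&EXT4_I(inode)->i_data_sem);

    /* ... any block allocation in the meantime clears the bit ... */

    down_write(&EXT4_I(inode)->i_data_sem);
    if (!(EXT4_I(inode)->i_state & EXT4_STATE_EXT_MIGRATE))
            retval = -EAGAIN;   /* raced with an allocation: fail */
    else
            EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE;
    up_write(&EXT4_I(inode)->i_data_sem);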
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index bbf2dd9404dc..c07a2915e40b 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c | |||
@@ -19,14 +19,31 @@ | |||
19 | #include "ext4_extents.h" | 19 | #include "ext4_extents.h" |
20 | #include "ext4.h" | 20 | #include "ext4.h" |
21 | 21 | ||
22 | #define get_ext_path(path, inode, block, ret) \ | 22 | /** |
23 | do { \ | 23 | * get_ext_path - Find an extent path for designated logical block number. |
24 | path = ext4_ext_find_extent(inode, block, path); \ | 24 | * |
25 | if (IS_ERR(path)) { \ | 25 | * @inode: an inode which is searched |
26 | ret = PTR_ERR(path); \ | 26 | * @lblock: logical block number to find an extent path |
27 | path = NULL; \ | 27 | * @path: pointer to an extent path pointer (for output) |
28 | } \ | 28 | * |
29 | } while (0) | 29 | * ext4_ext_find_extent wrapper. Return 0 on success, or a negative error value |
30 | * on failure. | ||
31 | */ | ||
32 | static inline int | ||
33 | get_ext_path(struct inode *inode, ext4_lblk_t lblock, | ||
34 | struct ext4_ext_path **path) | ||
35 | { | ||
36 | int ret = 0; | ||
37 | |||
38 | *path = ext4_ext_find_extent(inode, lblock, *path); | ||
39 | if (IS_ERR(*path)) { | ||
40 | ret = PTR_ERR(*path); | ||
41 | *path = NULL; | ||
42 | } else if ((*path)[ext_depth(inode)].p_ext == NULL) | ||
43 | ret = -ENODATA; | ||
44 | |||
45 | return ret; | ||
46 | } | ||
30 | 47 | ||
31 | /** | 48 | /** |
32 | * copy_extent_status - Copy the extent's initialization status | 49 | * copy_extent_status - Copy the extent's initialization status |
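The macro-to-function conversion above also tightens the caller contract: status comes back as a return value rather than through a captured variable, and a leaf with no extent now fails early with -ENODATA inside the helper. Callers later in this diff change from

    get_ext_path(orig_path, orig_inode, eblock, err);
    if (orig_path == NULL)
            goto out;

to

    err = get_ext_path(orig_inode, eblock, &orig_path);
    if (err)
            goto out;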
@@ -113,6 +130,31 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path, | |||
113 | } | 130 | } |
114 | 131 | ||
115 | /** | 132 | /** |
133 | * mext_check_null_inode - NULL check for two inodes | ||
134 | * | ||
135 | * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0. | ||
136 | */ | ||
137 | static int | ||
138 | mext_check_null_inode(struct inode *inode1, struct inode *inode2, | ||
139 | const char *function) | ||
140 | { | ||
141 | int ret = 0; | ||
142 | |||
143 | if (inode1 == NULL) { | ||
144 | ext4_error(inode2->i_sb, function, | ||
145 | "Both inodes should not be NULL: " | ||
146 | "inode1 NULL inode2 %lu", inode2->i_ino); | ||
147 | ret = -EIO; | ||
148 | } else if (inode2 == NULL) { | ||
149 | ext4_error(inode1->i_sb, function, | ||
150 | "Both inodes should not be NULL: " | ||
151 | "inode1 %lu inode2 NULL", inode1->i_ino); | ||
152 | ret = -EIO; | ||
153 | } | ||
154 | return ret; | ||
155 | } | ||
156 | |||
157 | /** | ||
116 | * mext_double_down_read - Acquire two inodes' read semaphore | 158 | * mext_double_down_read - Acquire two inodes' read semaphore |
117 | * | 159 | * |
118 | * @orig_inode: original inode structure | 160 | * @orig_inode: original inode structure |
@@ -124,8 +166,6 @@ mext_double_down_read(struct inode *orig_inode, struct inode *donor_inode) | |||
124 | { | 166 | { |
125 | struct inode *first = orig_inode, *second = donor_inode; | 167 | struct inode *first = orig_inode, *second = donor_inode; |
126 | 168 | ||
127 | BUG_ON(orig_inode == NULL || donor_inode == NULL); | ||
128 | |||
129 | /* | 169 | /* |
130 | * Use the inode number to provide the stable locking order instead | 170 | * Use the inode number to provide the stable locking order instead |
131 | * of its address, because the C language doesn't guarantee you can | 171 | * of its address, because the C language doesn't guarantee you can |
@@ -152,8 +192,6 @@ mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode) | |||
152 | { | 192 | { |
153 | struct inode *first = orig_inode, *second = donor_inode; | 193 | struct inode *first = orig_inode, *second = donor_inode; |
154 | 194 | ||
155 | BUG_ON(orig_inode == NULL || donor_inode == NULL); | ||
156 | |||
157 | /* | 195 | /* |
158 | * Use the inode number to provide the stable locking order instead | 196 | * Use the inode number to provide the stable locking order instead |
159 | * of its address, because the C language doesn't guarantee you can | 197 | * of its address, because the C language doesn't guarantee you can |
@@ -178,8 +216,6 @@ mext_double_down_write(struct inode *orig_inode, struct inode *donor_inode) | |||
178 | static void | 216 | static void |
179 | mext_double_up_read(struct inode *orig_inode, struct inode *donor_inode) | 217 | mext_double_up_read(struct inode *orig_inode, struct inode *donor_inode) |
180 | { | 218 | { |
181 | BUG_ON(orig_inode == NULL || donor_inode == NULL); | ||
182 | |||
183 | up_read(&EXT4_I(orig_inode)->i_data_sem); | 219 | up_read(&EXT4_I(orig_inode)->i_data_sem); |
184 | up_read(&EXT4_I(donor_inode)->i_data_sem); | 220 | up_read(&EXT4_I(donor_inode)->i_data_sem); |
185 | } | 221 | } |
@@ -194,8 +230,6 @@ mext_double_up_read(struct inode *orig_inode, struct inode *donor_inode) | |||
194 | static void | 230 | static void |
195 | mext_double_up_write(struct inode *orig_inode, struct inode *donor_inode) | 231 | mext_double_up_write(struct inode *orig_inode, struct inode *donor_inode) |
196 | { | 232 | { |
197 | BUG_ON(orig_inode == NULL || donor_inode == NULL); | ||
198 | |||
199 | up_write(&EXT4_I(orig_inode)->i_data_sem); | 233 | up_write(&EXT4_I(orig_inode)->i_data_sem); |
200 | up_write(&EXT4_I(donor_inode)->i_data_sem); | 234 | up_write(&EXT4_I(donor_inode)->i_data_sem); |
201 | } | 235 | } |
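
The BUG_ON() lines removed above sat in helpers whose real job is taking two semaphores in a stable order keyed by inode number: comparing i_ino rather than pointer addresses yields a total order the C language actually guarantees, so two tasks locking the same pair in opposite argument order cannot deadlock. A compilable userspace sketch of that discipline, with pthread rwlocks standing in for i_data_sem (hypothetical names, not the ext4 code):

    #include <pthread.h>

    struct res {
            unsigned long id;             /* plays the role of i_ino */
            pthread_rwlock_t sem;         /* plays the role of i_data_sem */
    };

    static void double_down_write(struct res *a, struct res *b)
    {
            struct res *first = a, *second = b;

            /* Always lock the smaller id first, whatever order the
             * caller passed the two resources in. */
            if (first->id > second->id) {
                    first = b;
                    second = a;
            }
            pthread_rwlock_wrlock(&first->sem);
            pthread_rwlock_wrlock(&second->sem);
    }

    static void double_up_write(struct res *a, struct res *b)
    {
            /* Unlock order does not matter for deadlock avoidance. */
            pthread_rwlock_unlock(&a->sem);
            pthread_rwlock_unlock(&b->sem);
    }
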
@@ -283,8 +317,8 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode, | |||
283 | } | 317 | } |
284 | 318 | ||
285 | if (new_flag) { | 319 | if (new_flag) { |
286 | get_ext_path(orig_path, orig_inode, eblock, err); | 320 | err = get_ext_path(orig_inode, eblock, &orig_path); |
287 | if (orig_path == NULL) | 321 | if (err) |
288 | goto out; | 322 | goto out; |
289 | 323 | ||
290 | if (ext4_ext_insert_extent(handle, orig_inode, | 324 | if (ext4_ext_insert_extent(handle, orig_inode, |
@@ -293,9 +327,9 @@ mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode, | |||
293 | } | 327 | } |
294 | 328 | ||
295 | if (end_flag) { | 329 | if (end_flag) { |
296 | get_ext_path(orig_path, orig_inode, | 330 | err = get_ext_path(orig_inode, |
297 | le32_to_cpu(end_ext->ee_block) - 1, err); | 331 | le32_to_cpu(end_ext->ee_block) - 1, &orig_path); |
298 | if (orig_path == NULL) | 332 | if (err) |
299 | goto out; | 333 | goto out; |
300 | 334 | ||
301 | if (ext4_ext_insert_extent(handle, orig_inode, | 335 | if (ext4_ext_insert_extent(handle, orig_inode, |
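
These hunks convert call sites from the old get_ext_path() macro, which reported every failure the same way by leaving the path NULL, to a function that returns 0 or a negative errno and hands the path back through an out pointer (its tail is visible at the very top of this section). A userspace sketch of that convention with illustrative names, not the kernel API; callers then follow the "err = ...; if (err) goto out;" shape used throughout the diff:

    #include <errno.h>
    #include <stdlib.h>

    struct path_stub { int depth; };

    /* Return 0 or a negative errno; *path is handed back separately, so
     * "the lookup machinery failed" (-ENOMEM) and "nothing lives at that
     * offset" (-ENODATA) become distinguishable outcomes. */
    static int get_path(unsigned long block, struct path_stub **path)
    {
            *path = malloc(sizeof(**path));
            if (*path == NULL)
                    return -ENOMEM;      /* hard failure, nothing to free */
            (*path)->depth = 0;
            if (block > 1000)            /* stand-in for "no extent here" */
                    return -ENODATA;     /* caller still owns and frees *path */
            return 0;
    }
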
@@ -519,7 +553,15 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode, | |||
519 | * oext |-----------| | 553 | * oext |-----------| |
520 | * new_ext |-------| | 554 | * new_ext |-------| |
521 | */ | 555 | */ |
522 | BUG_ON(le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end); | 556 | if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) { |
557 | ext4_error(orig_inode->i_sb, __func__, | ||
558 | "new_ext_end(%u) should be less than or equal to " | ||
559 | "oext->ee_block(%u) + oext_alen(%d) - 1", | ||
560 | new_ext_end, le32_to_cpu(oext->ee_block), | ||
561 | oext_alen); | ||
562 | ret = -EIO; | ||
563 | goto out; | ||
564 | } | ||
523 | 565 | ||
524 | /* | 566 | /* |
525 | * Case: new_ext is smaller than original extent | 567 | * Case: new_ext is smaller than original extent |
@@ -543,6 +585,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode, | |||
543 | 585 | ||
544 | ret = mext_insert_extents(handle, orig_inode, orig_path, o_start, | 586 | ret = mext_insert_extents(handle, orig_inode, orig_path, o_start, |
545 | o_end, &start_ext, &new_ext, &end_ext); | 587 | o_end, &start_ext, &new_ext, &end_ext); |
588 | out: | ||
546 | return ret; | 589 | return ret; |
547 | } | 590 | } |
548 | 591 | ||
@@ -554,8 +597,10 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode, | |||
554 | * @orig_off: block offset of original inode | 597 | * @orig_off: block offset of original inode |
555 | * @donor_off: block offset of donor inode | 598 | * @donor_off: block offset of donor inode |
556 | * @max_count: the maximum length of extents | 599 | * @max_count: the maximum length of extents |
600 | * | ||
601 | * Return 0 on success, or a negative error value on failure. | ||
557 | */ | 602 | */ |
558 | static void | 603 | static int |
559 | mext_calc_swap_extents(struct ext4_extent *tmp_dext, | 604 | mext_calc_swap_extents(struct ext4_extent *tmp_dext, |
560 | struct ext4_extent *tmp_oext, | 605 | struct ext4_extent *tmp_oext, |
561 | ext4_lblk_t orig_off, ext4_lblk_t donor_off, | 606 | ext4_lblk_t orig_off, ext4_lblk_t donor_off, |
@@ -564,6 +609,19 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext, | |||
564 | ext4_lblk_t diff, orig_diff; | 609 | ext4_lblk_t diff, orig_diff; |
565 | struct ext4_extent dext_old, oext_old; | 610 | struct ext4_extent dext_old, oext_old; |
566 | 611 | ||
612 | BUG_ON(orig_off != donor_off); | ||
613 | |||
614 | /* original and donor extents have to cover the same block offset */ | ||
615 | if (orig_off < le32_to_cpu(tmp_oext->ee_block) || | ||
616 | le32_to_cpu(tmp_oext->ee_block) + | ||
617 | ext4_ext_get_actual_len(tmp_oext) - 1 < orig_off) | ||
618 | return -ENODATA; | ||
619 | |||
620 | if (orig_off < le32_to_cpu(tmp_dext->ee_block) || | ||
621 | le32_to_cpu(tmp_dext->ee_block) + | ||
622 | ext4_ext_get_actual_len(tmp_dext) - 1 < orig_off) | ||
623 | return -ENODATA; | ||
624 | |||
567 | dext_old = *tmp_dext; | 625 | dext_old = *tmp_dext; |
568 | oext_old = *tmp_oext; | 626 | oext_old = *tmp_oext; |
569 | 627 | ||
@@ -591,6 +649,8 @@ mext_calc_swap_extents(struct ext4_extent *tmp_dext, | |||
591 | 649 | ||
592 | copy_extent_status(&oext_old, tmp_dext); | 650 | copy_extent_status(&oext_old, tmp_dext); |
593 | copy_extent_status(&dext_old, tmp_oext); | 651 | copy_extent_status(&dext_old, tmp_oext); |
652 | |||
653 | return 0; | ||
594 | } | 654 | } |
595 | 655 | ||
596 | /** | 656 | /** |
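
mext_calc_swap_extents() now refuses, with -ENODATA, offsets that fall outside either extent instead of hitting a BUG_ON later: an offset is covered by an extent exactly when start <= off <= start + len - 1. A small compilable sketch of that containment test, with plain integers standing in for the little-endian on-disk fields:

    #include <errno.h>

    struct ext { unsigned int start, len; };   /* len >= 1 assumed */

    static int covers(const struct ext *e, unsigned int off)
    {
            return off >= e->start && off <= e->start + e->len - 1;
    }

    /* Mirror of the two added range checks: both the original and the
     * donor extent must cover the offset being swapped. */
    static int calc_swap_check(const struct ext *oext, const struct ext *dext,
                               unsigned int orig_off)
    {
            if (!covers(oext, orig_off) || !covers(dext, orig_off))
                    return -ENODATA;      /* the offset fell into a hole */
            return 0;
    }
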
@@ -631,13 +691,13 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode, | |||
631 | mext_double_down_write(orig_inode, donor_inode); | 691 | mext_double_down_write(orig_inode, donor_inode); |
632 | 692 | ||
633 | /* Get the original extent for the block "orig_off" */ | 693 | /* Get the original extent for the block "orig_off" */ |
634 | get_ext_path(orig_path, orig_inode, orig_off, err); | 694 | err = get_ext_path(orig_inode, orig_off, &orig_path); |
635 | if (orig_path == NULL) | 695 | if (err) |
636 | goto out; | 696 | goto out; |
637 | 697 | ||
638 | /* Get the donor extent for the head */ | 698 | /* Get the donor extent for the head */ |
639 | get_ext_path(donor_path, donor_inode, donor_off, err); | 699 | err = get_ext_path(donor_inode, donor_off, &donor_path); |
640 | if (donor_path == NULL) | 700 | if (err) |
641 | goto out; | 701 | goto out; |
642 | depth = ext_depth(orig_inode); | 702 | depth = ext_depth(orig_inode); |
643 | oext = orig_path[depth].p_ext; | 703 | oext = orig_path[depth].p_ext; |
@@ -647,13 +707,28 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode, | |||
647 | dext = donor_path[depth].p_ext; | 707 | dext = donor_path[depth].p_ext; |
648 | tmp_dext = *dext; | 708 | tmp_dext = *dext; |
649 | 709 | ||
650 | mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, | 710 | err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, |
651 | donor_off, count); | 711 | donor_off, count); |
712 | if (err) | ||
713 | goto out; | ||
652 | 714 | ||
653 | /* Loop for the donor extents */ | 715 | /* Loop for the donor extents */ |
654 | while (1) { | 716 | while (1) { |
655 | /* The extent for donor must be found. */ | 717 | /* The extent for donor must be found. */ |
656 | BUG_ON(!dext || donor_off != le32_to_cpu(tmp_dext.ee_block)); | 718 | if (!dext) { |
719 | ext4_error(donor_inode->i_sb, __func__, | ||
720 | "The extent for donor must be found"); | ||
721 | err = -EIO; | ||
722 | goto out; | ||
723 | } else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) { | ||
724 | ext4_error(donor_inode->i_sb, __func__, | ||
725 | "Donor offset(%u) and the first block of donor " | ||
726 | "extent(%u) should be equal", | ||
727 | donor_off, | ||
728 | le32_to_cpu(tmp_dext.ee_block)); | ||
729 | err = -EIO; | ||
730 | goto out; | ||
731 | } | ||
657 | 732 | ||
658 | /* Set donor extent to orig extent */ | 733 | /* Set donor extent to orig extent */ |
659 | err = mext_leaf_block(handle, orig_inode, | 734 | err = mext_leaf_block(handle, orig_inode, |
@@ -678,8 +753,8 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode, | |||
678 | 753 | ||
679 | if (orig_path) | 754 | if (orig_path) |
680 | ext4_ext_drop_refs(orig_path); | 755 | ext4_ext_drop_refs(orig_path); |
681 | get_ext_path(orig_path, orig_inode, orig_off, err); | 756 | err = get_ext_path(orig_inode, orig_off, &orig_path); |
682 | if (orig_path == NULL) | 757 | if (err) |
683 | goto out; | 758 | goto out; |
684 | depth = ext_depth(orig_inode); | 759 | depth = ext_depth(orig_inode); |
685 | oext = orig_path[depth].p_ext; | 760 | oext = orig_path[depth].p_ext; |
@@ -692,9 +767,8 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode, | |||
692 | 767 | ||
693 | if (donor_path) | 768 | if (donor_path) |
694 | ext4_ext_drop_refs(donor_path); | 769 | ext4_ext_drop_refs(donor_path); |
695 | get_ext_path(donor_path, donor_inode, | 770 | err = get_ext_path(donor_inode, donor_off, &donor_path); |
696 | donor_off, err); | 771 | if (err) |
697 | if (donor_path == NULL) | ||
698 | goto out; | 772 | goto out; |
699 | depth = ext_depth(donor_inode); | 773 | depth = ext_depth(donor_inode); |
700 | dext = donor_path[depth].p_ext; | 774 | dext = donor_path[depth].p_ext; |
@@ -705,9 +779,10 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode, | |||
705 | } | 779 | } |
706 | tmp_dext = *dext; | 780 | tmp_dext = *dext; |
707 | 781 | ||
708 | mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, | 782 | err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, |
709 | donor_off, | 783 | donor_off, count - replaced_count); |
710 | count - replaced_count); | 784 | if (err) |
785 | goto out; | ||
711 | } | 786 | } |
712 | 787 | ||
713 | out: | 788 | out: |
@@ -740,7 +815,7 @@ out: | |||
740 | * on success, or a negative error value on failure. | 815 | * on success, or a negative error value on failure. |
741 | */ | 816 | */ |
742 | static int | 817 | static int |
743 | move_extent_par_page(struct file *o_filp, struct inode *donor_inode, | 818 | move_extent_per_page(struct file *o_filp, struct inode *donor_inode, |
744 | pgoff_t orig_page_offset, int data_offset_in_page, | 819 | pgoff_t orig_page_offset, int data_offset_in_page, |
745 | int block_len_in_page, int uninit) | 820 | int block_len_in_page, int uninit) |
746 | { | 821 | { |
@@ -871,6 +946,7 @@ out: | |||
871 | if (PageLocked(page)) | 946 | if (PageLocked(page)) |
872 | unlock_page(page); | 947 | unlock_page(page); |
873 | page_cache_release(page); | 948 | page_cache_release(page); |
949 | ext4_journal_stop(handle); | ||
874 | } | 950 | } |
875 | out2: | 951 | out2: |
876 | ext4_journal_stop(handle); | 952 | ext4_journal_stop(handle); |
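
The single added line above plugs a journal-handle leak in move_extent_per_page(): the error path that falls through this "out:" block released the page but previously returned without stopping the handle, which only the separate "out2:" exit did. A compilable sketch of the shape of the fix, with malloc()/free() standing in for starting and stopping a handle (stand-in helpers, not the ext4 API):

    #include <errno.h>
    #include <stdlib.h>

    static void *start_handle(void) { return malloc(1); }
    static void stop_handle(void *h) { free(h); }

    static int move_one_page(int simulate_error)
    {
            int err = 0;
            void *handle = start_handle();

            if (!handle)
                    return -ENOMEM;

            if (simulate_error) {
                    err = -EIO;
                    goto out;     /* used to leak the handle */
            }

            /* ... block replacement work would happen here ... */
    out:
            stop_handle(handle);  /* the call this hunk adds to this path */
            return err;
    }
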
@@ -897,6 +973,10 @@ mext_check_arguments(struct inode *orig_inode, | |||
897 | struct inode *donor_inode, __u64 orig_start, | 973 | struct inode *donor_inode, __u64 orig_start, |
898 | __u64 donor_start, __u64 *len, __u64 moved_len) | 974 | __u64 donor_start, __u64 *len, __u64 moved_len) |
899 | { | 975 | { |
976 | ext4_lblk_t orig_blocks, donor_blocks; | ||
977 | unsigned int blkbits = orig_inode->i_blkbits; | ||
978 | unsigned int blocksize = 1 << blkbits; | ||
979 | |||
900 | /* Regular file check */ | 980 | /* Regular file check */ |
901 | if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) { | 981 | if (!S_ISREG(orig_inode->i_mode) || !S_ISREG(donor_inode->i_mode)) { |
902 | ext4_debug("ext4 move extent: The argument files should be " | 982 | ext4_debug("ext4 move extent: The argument files should be " |
@@ -960,54 +1040,58 @@ mext_check_arguments(struct inode *orig_inode, | |||
960 | return -EINVAL; | 1040 | return -EINVAL; |
961 | } | 1041 | } |
962 | 1042 | ||
963 | if ((orig_start > MAX_DEFRAG_SIZE) || | 1043 | if ((orig_start > EXT_MAX_BLOCK) || |
964 | (donor_start > MAX_DEFRAG_SIZE) || | 1044 | (donor_start > EXT_MAX_BLOCK) || |
965 | (*len > MAX_DEFRAG_SIZE) || | 1045 | (*len > EXT_MAX_BLOCK) || |
966 | (orig_start + *len > MAX_DEFRAG_SIZE)) { | 1046 | (orig_start + *len > EXT_MAX_BLOCK)) { |
967 | ext4_debug("ext4 move extent: Can't handle over [%lu] blocks " | 1047 | ext4_debug("ext4 move extent: Can't handle over [%u] blocks " |
968 | "[ino:orig %lu, donor %lu]\n", MAX_DEFRAG_SIZE, | 1048 | "[ino:orig %lu, donor %lu]\n", EXT_MAX_BLOCK, |
969 | orig_inode->i_ino, donor_inode->i_ino); | 1049 | orig_inode->i_ino, donor_inode->i_ino); |
970 | return -EINVAL; | 1050 | return -EINVAL; |
971 | } | 1051 | } |
972 | 1052 | ||
973 | if (orig_inode->i_size > donor_inode->i_size) { | 1053 | if (orig_inode->i_size > donor_inode->i_size) { |
974 | if (orig_start >= donor_inode->i_size) { | 1054 | donor_blocks = (donor_inode->i_size + blocksize - 1) >> blkbits; |
1055 | /* TODO: eliminate this artificial restriction */ | ||
1056 | if (orig_start >= donor_blocks) { | ||
975 | ext4_debug("ext4 move extent: orig start offset " | 1057 | ext4_debug("ext4 move extent: orig start offset " |
976 | "[%llu] should be less than donor file size " | 1058 | "[%llu] should be less than donor file blocks " |
977 | "[%lld] [ino:orig %lu, donor_inode %lu]\n", | 1059 | "[%u] [ino:orig %lu, donor %lu]\n", |
978 | orig_start, donor_inode->i_size, | 1060 | orig_start, donor_blocks, |
979 | orig_inode->i_ino, donor_inode->i_ino); | 1061 | orig_inode->i_ino, donor_inode->i_ino); |
980 | return -EINVAL; | 1062 | return -EINVAL; |
981 | } | 1063 | } |
982 | 1064 | ||
983 | if (orig_start + *len > donor_inode->i_size) { | 1065 | /* TODO: eliminate this artificial restriction */ |
1066 | if (orig_start + *len > donor_blocks) { | ||
984 | ext4_debug("ext4 move extent: End offset [%llu] should " | 1067 | ext4_debug("ext4 move extent: End offset [%llu] should " |
985 | "be less than donor file size [%lld]." | 1068 | "be less than donor file blocks [%u]." |
986 | "So adjust length from %llu to %lld " | 1069 | "So adjust length from %llu to %llu " |
987 | "[ino:orig %lu, donor %lu]\n", | 1070 | "[ino:orig %lu, donor %lu]\n", |
988 | orig_start + *len, donor_inode->i_size, | 1071 | orig_start + *len, donor_blocks, |
989 | *len, donor_inode->i_size - orig_start, | 1072 | *len, donor_blocks - orig_start, |
990 | orig_inode->i_ino, donor_inode->i_ino); | 1073 | orig_inode->i_ino, donor_inode->i_ino); |
991 | *len = donor_inode->i_size - orig_start; | 1074 | *len = donor_blocks - orig_start; |
992 | } | 1075 | } |
993 | } else { | 1076 | } else { |
994 | if (orig_start >= orig_inode->i_size) { | 1077 | orig_blocks = (orig_inode->i_size + blocksize - 1) >> blkbits; |
1078 | if (orig_start >= orig_blocks) { | ||
995 | ext4_debug("ext4 move extent: start offset [%llu] " | 1079 | ext4_debug("ext4 move extent: start offset [%llu] " |
996 | "should be less than original file size " | 1080 | "should be less than original file blocks " |
997 | "[%lld] [inode:orig %lu, donor %lu]\n", | 1081 | "[%u] [ino:orig %lu, donor %lu]\n", |
998 | orig_start, orig_inode->i_size, | 1082 | orig_start, orig_blocks, |
999 | orig_inode->i_ino, donor_inode->i_ino); | 1083 | orig_inode->i_ino, donor_inode->i_ino); |
1000 | return -EINVAL; | 1084 | return -EINVAL; |
1001 | } | 1085 | } |
1002 | 1086 | ||
1003 | if (orig_start + *len > orig_inode->i_size) { | 1087 | if (orig_start + *len > orig_blocks) { |
1004 | ext4_debug("ext4 move extent: Adjust length " | 1088 | ext4_debug("ext4 move extent: Adjust length " |
1005 | "from %llu to %lld. Because it should be " | 1089 | "from %llu to %llu. Because it should be " |
1006 | "less than original file size " | 1090 | "less than original file blocks " |
1007 | "[ino:orig %lu, donor %lu]\n", | 1091 | "[ino:orig %lu, donor %lu]\n", |
1008 | *len, orig_inode->i_size - orig_start, | 1092 | *len, orig_blocks - orig_start, |
1009 | orig_inode->i_ino, donor_inode->i_ino); | 1093 | orig_inode->i_ino, donor_inode->i_ino); |
1010 | *len = orig_inode->i_size - orig_start; | 1094 | *len = orig_blocks - orig_start; |
1011 | } | 1095 | } |
1012 | } | 1096 | } |
1013 | 1097 | ||
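
The bounds checks above now compare block offsets against a file's block count rather than its raw byte size, because a partial trailing block still occupies a whole block. The conversion is a ceiling division done with shifts, where blocksize == 1 << blkbits; a runnable sketch:

    #include <assert.h>

    static unsigned long long size_to_blocks(unsigned long long size,
                                             unsigned int blkbits)
    {
            unsigned long long blocksize = 1ULL << blkbits;

            return (size + blocksize - 1) >> blkbits;
    }

    int main(void)
    {
            /* With 4 KiB blocks (blkbits == 12): 1 byte occupies 1 block,
             * 4096 bytes exactly 1 block, 4097 bytes spill into a 2nd. */
            assert(size_to_blocks(1, 12) == 1);
            assert(size_to_blocks(4096, 12) == 1);
            assert(size_to_blocks(4097, 12) == 2);
            return 0;
    }
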
@@ -1027,18 +1111,23 @@ mext_check_arguments(struct inode *orig_inode, | |||
1027 | * @inode1: the inode structure | 1111 | * @inode1: the inode structure |
1028 | * @inode2: the inode structure | 1112 | * @inode2: the inode structure |
1029 | * | 1113 | * |
1030 | * Lock two inodes' i_mutex by i_ino order. This function is moved from | 1114 | * Lock two inodes' i_mutex by i_ino order. |
1031 | * fs/inode.c. | 1115 | * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0. |
1032 | */ | 1116 | */ |
1033 | static void | 1117 | static int |
1034 | mext_inode_double_lock(struct inode *inode1, struct inode *inode2) | 1118 | mext_inode_double_lock(struct inode *inode1, struct inode *inode2) |
1035 | { | 1119 | { |
1036 | if (inode1 == NULL || inode2 == NULL || inode1 == inode2) { | 1120 | int ret = 0; |
1037 | if (inode1) | 1121 | |
1038 | mutex_lock(&inode1->i_mutex); | 1122 | BUG_ON(inode1 == NULL && inode2 == NULL); |
1039 | else if (inode2) | 1123 | |
1040 | mutex_lock(&inode2->i_mutex); | 1124 | ret = mext_check_null_inode(inode1, inode2, __func__); |
1041 | return; | 1125 | if (ret < 0) |
1126 | goto out; | ||
1127 | |||
1128 | if (inode1 == inode2) { | ||
1129 | mutex_lock(&inode1->i_mutex); | ||
1130 | goto out; | ||
1042 | } | 1131 | } |
1043 | 1132 | ||
1044 | if (inode1->i_ino < inode2->i_ino) { | 1133 | if (inode1->i_ino < inode2->i_ino) { |
@@ -1048,6 +1137,9 @@ mext_inode_double_lock(struct inode *inode1, struct inode *inode2) | |||
1048 | mutex_lock_nested(&inode2->i_mutex, I_MUTEX_PARENT); | 1137 | mutex_lock_nested(&inode2->i_mutex, I_MUTEX_PARENT); |
1049 | mutex_lock_nested(&inode1->i_mutex, I_MUTEX_CHILD); | 1138 | mutex_lock_nested(&inode1->i_mutex, I_MUTEX_CHILD); |
1050 | } | 1139 | } |
1140 | |||
1141 | out: | ||
1142 | return ret; | ||
1051 | } | 1143 | } |
1052 | 1144 | ||
1053 | /** | 1145 | /** |
@@ -1056,17 +1148,28 @@ mext_inode_double_lock(struct inode *inode1, struct inode *inode2) | |||
1056 | * @inode1: the inode that is released first | 1148 | * @inode1: the inode that is released first |
1057 | * @inode2: the inode that is released second | 1149 | * @inode2: the inode that is released second |
1058 | * | 1150 | * |
1059 | * This function is moved from fs/inode.c. | 1151 | * If inode1 or inode2 is NULL, return -EIO. Otherwise, return 0. |
1060 | */ | 1152 | */ |
1061 | 1153 | ||
1062 | static void | 1154 | static int |
1063 | mext_inode_double_unlock(struct inode *inode1, struct inode *inode2) | 1155 | mext_inode_double_unlock(struct inode *inode1, struct inode *inode2) |
1064 | { | 1156 | { |
1157 | int ret = 0; | ||
1158 | |||
1159 | BUG_ON(inode1 == NULL && inode2 == NULL); | ||
1160 | |||
1161 | ret = mext_check_null_inode(inode1, inode2, __func__); | ||
1162 | if (ret < 0) | ||
1163 | goto out; | ||
1164 | |||
1065 | if (inode1) | 1165 | if (inode1) |
1066 | mutex_unlock(&inode1->i_mutex); | 1166 | mutex_unlock(&inode1->i_mutex); |
1067 | 1167 | ||
1068 | if (inode2 && inode2 != inode1) | 1168 | if (inode2 && inode2 != inode1) |
1069 | mutex_unlock(&inode2->i_mutex); | 1169 | mutex_unlock(&inode2->i_mutex); |
1170 | |||
1171 | out: | ||
1172 | return ret; | ||
1070 | } | 1173 | } |
1071 | 1174 | ||
1072 | /** | 1175 | /** |
@@ -1123,70 +1226,76 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, | |||
1123 | ext4_lblk_t block_end, seq_start, add_blocks, file_end, seq_blocks = 0; | 1226 | ext4_lblk_t block_end, seq_start, add_blocks, file_end, seq_blocks = 0; |
1124 | ext4_lblk_t rest_blocks; | 1227 | ext4_lblk_t rest_blocks; |
1125 | pgoff_t orig_page_offset = 0, seq_end_page; | 1228 | pgoff_t orig_page_offset = 0, seq_end_page; |
1126 | int ret, depth, last_extent = 0; | 1229 | int ret1, ret2, depth, last_extent = 0; |
1127 | int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits; | 1230 | int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits; |
1128 | int data_offset_in_page; | 1231 | int data_offset_in_page; |
1129 | int block_len_in_page; | 1232 | int block_len_in_page; |
1130 | int uninit; | 1233 | int uninit; |
1131 | 1234 | ||
1132 | /* protect orig and donor against a truncate */ | 1235 | /* protect orig and donor against a truncate */ |
1133 | mext_inode_double_lock(orig_inode, donor_inode); | 1236 | ret1 = mext_inode_double_lock(orig_inode, donor_inode); |
1237 | if (ret1 < 0) | ||
1238 | return ret1; | ||
1134 | 1239 | ||
1135 | mext_double_down_read(orig_inode, donor_inode); | 1240 | mext_double_down_read(orig_inode, donor_inode); |
1136 | /* Check the filesystem environment whether move_extent can be done */ | 1241 | /* Check the filesystem environment whether move_extent can be done */ |
1137 | ret = mext_check_arguments(orig_inode, donor_inode, orig_start, | 1242 | ret1 = mext_check_arguments(orig_inode, donor_inode, orig_start, |
1138 | donor_start, &len, *moved_len); | 1243 | donor_start, &len, *moved_len); |
1139 | mext_double_up_read(orig_inode, donor_inode); | 1244 | mext_double_up_read(orig_inode, donor_inode); |
1140 | if (ret) | 1245 | if (ret1) |
1141 | goto out2; | 1246 | goto out; |
1142 | 1247 | ||
1143 | file_end = (i_size_read(orig_inode) - 1) >> orig_inode->i_blkbits; | 1248 | file_end = (i_size_read(orig_inode) - 1) >> orig_inode->i_blkbits; |
1144 | block_end = block_start + len - 1; | 1249 | block_end = block_start + len - 1; |
1145 | if (file_end < block_end) | 1250 | if (file_end < block_end) |
1146 | len -= block_end - file_end; | 1251 | len -= block_end - file_end; |
1147 | 1252 | ||
1148 | get_ext_path(orig_path, orig_inode, block_start, ret); | 1253 | ret1 = get_ext_path(orig_inode, block_start, &orig_path); |
1149 | if (orig_path == NULL) | 1254 | if (ret1) |
1150 | goto out2; | 1255 | goto out; |
1151 | 1256 | ||
1152 | /* Get path structure to check the hole */ | 1257 | /* Get path structure to check the hole */ |
1153 | get_ext_path(holecheck_path, orig_inode, block_start, ret); | 1258 | ret1 = get_ext_path(orig_inode, block_start, &holecheck_path); |
1154 | if (holecheck_path == NULL) | 1259 | if (ret1) |
1155 | goto out; | 1260 | goto out; |
1156 | 1261 | ||
1157 | depth = ext_depth(orig_inode); | 1262 | depth = ext_depth(orig_inode); |
1158 | ext_cur = holecheck_path[depth].p_ext; | 1263 | ext_cur = holecheck_path[depth].p_ext; |
1159 | if (ext_cur == NULL) { | ||
1160 | ret = -EINVAL; | ||
1161 | goto out; | ||
1162 | } | ||
1163 | 1264 | ||
1164 | /* | 1265 | /* |
1165 | * Get proper extent whose ee_block is beyond block_start | 1266 | * Get proper starting location of block replacement if block_start was |
1166 | * if block_start was within the hole. | 1267 | * within the hole. |
1167 | */ | 1268 | */ |
1168 | if (le32_to_cpu(ext_cur->ee_block) + | 1269 | if (le32_to_cpu(ext_cur->ee_block) + |
1169 | ext4_ext_get_actual_len(ext_cur) - 1 < block_start) { | 1270 | ext4_ext_get_actual_len(ext_cur) - 1 < block_start) { |
1271 | /* | ||
1272 | * The hole exists between extents or the tail of | ||
1273 | * original file. | ||
1274 | */ | ||
1170 | last_extent = mext_next_extent(orig_inode, | 1275 | last_extent = mext_next_extent(orig_inode, |
1171 | holecheck_path, &ext_cur); | 1276 | holecheck_path, &ext_cur); |
1172 | if (last_extent < 0) { | 1277 | if (last_extent < 0) { |
1173 | ret = last_extent; | 1278 | ret1 = last_extent; |
1174 | goto out; | 1279 | goto out; |
1175 | } | 1280 | } |
1176 | last_extent = mext_next_extent(orig_inode, orig_path, | 1281 | last_extent = mext_next_extent(orig_inode, orig_path, |
1177 | &ext_dummy); | 1282 | &ext_dummy); |
1178 | if (last_extent < 0) { | 1283 | if (last_extent < 0) { |
1179 | ret = last_extent; | 1284 | ret1 = last_extent; |
1180 | goto out; | 1285 | goto out; |
1181 | } | 1286 | } |
1182 | } | 1287 | seq_start = le32_to_cpu(ext_cur->ee_block); |
1183 | seq_start = block_start; | 1288 | } else if (le32_to_cpu(ext_cur->ee_block) > block_start) |
1289 | /* The hole exists at the beginning of original file. */ | ||
1290 | seq_start = le32_to_cpu(ext_cur->ee_block); | ||
1291 | else | ||
1292 | seq_start = block_start; | ||
1184 | 1293 | ||
1185 | /* No blocks within the specified range. */ | 1294 | /* No blocks within the specified range. */ |
1186 | if (le32_to_cpu(ext_cur->ee_block) > block_end) { | 1295 | if (le32_to_cpu(ext_cur->ee_block) > block_end) { |
1187 | ext4_debug("ext4 move extent: The specified range of file " | 1296 | ext4_debug("ext4 move extent: The specified range of file " |
1188 | "may be the hole\n"); | 1297 | "may be the hole\n"); |
1189 | ret = -EINVAL; | 1298 | ret1 = -EINVAL; |
1190 | goto out; | 1299 | goto out; |
1191 | } | 1300 | } |
1192 | 1301 | ||
@@ -1206,7 +1315,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, | |||
1206 | last_extent = mext_next_extent(orig_inode, holecheck_path, | 1315 | last_extent = mext_next_extent(orig_inode, holecheck_path, |
1207 | &ext_cur); | 1316 | &ext_cur); |
1208 | if (last_extent < 0) { | 1317 | if (last_extent < 0) { |
1209 | ret = last_extent; | 1318 | ret1 = last_extent; |
1210 | break; | 1319 | break; |
1211 | } | 1320 | } |
1212 | add_blocks = ext4_ext_get_actual_len(ext_cur); | 1321 | add_blocks = ext4_ext_get_actual_len(ext_cur); |
@@ -1258,16 +1367,23 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, | |||
1258 | while (orig_page_offset <= seq_end_page) { | 1367 | while (orig_page_offset <= seq_end_page) { |
1259 | 1368 | ||
1260 | /* Swap original branches with new branches */ | 1369 | /* Swap original branches with new branches */ |
1261 | ret = move_extent_par_page(o_filp, donor_inode, | 1370 | ret1 = move_extent_per_page(o_filp, donor_inode, |
1262 | orig_page_offset, | 1371 | orig_page_offset, |
1263 | data_offset_in_page, | 1372 | data_offset_in_page, |
1264 | block_len_in_page, uninit); | 1373 | block_len_in_page, uninit); |
1265 | if (ret < 0) | 1374 | if (ret1 < 0) |
1266 | goto out; | 1375 | goto out; |
1267 | orig_page_offset++; | 1376 | orig_page_offset++; |
1268 | /* Count how many blocks we have exchanged */ | 1377 | /* Count how many blocks we have exchanged */ |
1269 | *moved_len += block_len_in_page; | 1378 | *moved_len += block_len_in_page; |
1270 | BUG_ON(*moved_len > len); | 1379 | if (*moved_len > len) { |
1380 | ext4_error(orig_inode->i_sb, __func__, | ||
1381 | "We replaced blocks too much! " | ||
1382 | "sum of replaced: %llu requested: %llu", | ||
1383 | *moved_len, len); | ||
1384 | ret1 = -EIO; | ||
1385 | goto out; | ||
1386 | } | ||
1271 | 1387 | ||
1272 | data_offset_in_page = 0; | 1388 | data_offset_in_page = 0; |
1273 | rest_blocks -= block_len_in_page; | 1389 | rest_blocks -= block_len_in_page; |
@@ -1280,17 +1396,16 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, | |||
1280 | /* Decrease buffer counter */ | 1396 | /* Decrease buffer counter */ |
1281 | if (holecheck_path) | 1397 | if (holecheck_path) |
1282 | ext4_ext_drop_refs(holecheck_path); | 1398 | ext4_ext_drop_refs(holecheck_path); |
1283 | get_ext_path(holecheck_path, orig_inode, | 1399 | ret1 = get_ext_path(orig_inode, seq_start, &holecheck_path); |
1284 | seq_start, ret); | 1400 | if (ret1) |
1285 | if (holecheck_path == NULL) | ||
1286 | break; | 1401 | break; |
1287 | depth = holecheck_path->p_depth; | 1402 | depth = holecheck_path->p_depth; |
1288 | 1403 | ||
1289 | /* Decrease buffer counter */ | 1404 | /* Decrease buffer counter */ |
1290 | if (orig_path) | 1405 | if (orig_path) |
1291 | ext4_ext_drop_refs(orig_path); | 1406 | ext4_ext_drop_refs(orig_path); |
1292 | get_ext_path(orig_path, orig_inode, seq_start, ret); | 1407 | ret1 = get_ext_path(orig_inode, seq_start, &orig_path); |
1293 | if (orig_path == NULL) | 1408 | if (ret1) |
1294 | break; | 1409 | break; |
1295 | 1410 | ||
1296 | ext_cur = holecheck_path[depth].p_ext; | 1411 | ext_cur = holecheck_path[depth].p_ext; |
@@ -1307,14 +1422,13 @@ out: | |||
1307 | ext4_ext_drop_refs(holecheck_path); | 1422 | ext4_ext_drop_refs(holecheck_path); |
1308 | kfree(holecheck_path); | 1423 | kfree(holecheck_path); |
1309 | } | 1424 | } |
1310 | out2: | ||
1311 | mext_inode_double_unlock(orig_inode, donor_inode); | ||
1312 | 1425 | ||
1313 | if (ret) | 1426 | ret2 = mext_inode_double_unlock(orig_inode, donor_inode); |
1314 | return ret; | ||
1315 | 1427 | ||
1316 | /* All of the specified blocks must be exchanged in succeed */ | 1428 | if (ret1) |
1317 | BUG_ON(*moved_len != len); | 1429 | return ret1; |
1430 | else if (ret2) | ||
1431 | return ret2; | ||
1318 | 1432 | ||
1319 | return 0; | 1433 | return 0; |
1320 | } | 1434 | } |
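
ext4_move_extents() now tracks two statuses: ret1 for the main work and ret2 for the final unlock. The unlock is attempted unconditionally, and its status is surfaced only when the main path succeeded, so the primary error is never masked. A compilable sketch of that propagation shape, with stand-in helpers:

    #include <errno.h>

    static int do_work(void)   { return 0; }   /* stands in for the move */
    static int undo_lock(void) { return 0; }   /* stands in for the unlock */

    static int move(void)
    {
            int ret1, ret2;

            ret1 = do_work();
            ret2 = undo_lock();    /* always runs, even after failure */

            if (ret1)
                    return ret1;   /* the primary failure wins */
            else if (ret2)
                    return ret2;   /* otherwise surface the unlock failure */

            return 0;
    }
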
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index de04013d16ff..42f81d285cd5 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c | |||
@@ -1518,8 +1518,12 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, | |||
1518 | return retval; | 1518 | return retval; |
1519 | 1519 | ||
1520 | if (blocks == 1 && !dx_fallback && | 1520 | if (blocks == 1 && !dx_fallback && |
1521 | EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) | 1521 | EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) { |
1522 | return make_indexed_dir(handle, dentry, inode, bh); | 1522 | retval = make_indexed_dir(handle, dentry, inode, bh); |
1523 | if (retval == -ENOSPC) | ||
1524 | brelse(bh); | ||
1525 | return retval; | ||
1526 | } | ||
1523 | brelse(bh); | 1527 | brelse(bh); |
1524 | } | 1528 | } |
1525 | bh = ext4_append(handle, dir, &block, &retval); | 1529 | bh = ext4_append(handle, dir, &block, &retval); |
@@ -1528,7 +1532,10 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, | |||
1528 | de = (struct ext4_dir_entry_2 *) bh->b_data; | 1532 | de = (struct ext4_dir_entry_2 *) bh->b_data; |
1529 | de->inode = 0; | 1533 | de->inode = 0; |
1530 | de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize); | 1534 | de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize); |
1531 | return add_dirent_to_buf(handle, dentry, inode, de, bh); | 1535 | retval = add_dirent_to_buf(handle, dentry, inode, de, bh); |
1536 | if (retval == -ENOSPC) | ||
1537 | brelse(bh); | ||
1538 | return retval; | ||
1532 | } | 1539 | } |
1533 | 1540 | ||
1534 | /* | 1541 | /* |
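
The brelse() calls added above plug a buffer_head leak: as these hunks treat it, add_dirent_to_buf() consumes the buffer reference on every outcome except -ENOSPC, so on that one error the caller must drop its own reference — which is also why the ext4_dx_add_entry hunk below clears bh only when the return is not -ENOSPC. A userspace sketch of that ownership rule, with a malloc'ed object standing in for the buffer_head (stand-in names throughout):

    #include <errno.h>
    #include <stdlib.h>

    struct buf { char data[32]; };

    static void brelse_stub(struct buf *bh) { free(bh); }

    /* Consumes bh on success; on -ENOSPC the caller keeps ownership. */
    static int add_entry_to_buf(struct buf *bh, int full)
    {
            if (full)
                    return -ENOSPC;
            brelse_stub(bh);
            return 0;
    }

    static int add_entry(struct buf *bh, int full)
    {
            int retval = add_entry_to_buf(bh, full);

            if (retval == -ENOSPC)
                    brelse_stub(bh);   /* the release the fix adds */
            return retval;
    }
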
@@ -1590,9 +1597,9 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, | |||
1590 | goto cleanup; | 1597 | goto cleanup; |
1591 | node2 = (struct dx_node *)(bh2->b_data); | 1598 | node2 = (struct dx_node *)(bh2->b_data); |
1592 | entries2 = node2->entries; | 1599 | entries2 = node2->entries; |
1600 | memset(&node2->fake, 0, sizeof(struct fake_dirent)); | ||
1593 | node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize, | 1601 | node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize, |
1594 | sb->s_blocksize); | 1602 | sb->s_blocksize); |
1595 | node2->fake.inode = 0; | ||
1596 | BUFFER_TRACE(frame->bh, "get_write_access"); | 1603 | BUFFER_TRACE(frame->bh, "get_write_access"); |
1597 | err = ext4_journal_get_write_access(handle, frame->bh); | 1604 | err = ext4_journal_get_write_access(handle, frame->bh); |
1598 | if (err) | 1605 | if (err) |
@@ -1657,7 +1664,8 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, | |||
1657 | if (!de) | 1664 | if (!de) |
1658 | goto cleanup; | 1665 | goto cleanup; |
1659 | err = add_dirent_to_buf(handle, dentry, inode, de, bh); | 1666 | err = add_dirent_to_buf(handle, dentry, inode, de, bh); |
1660 | bh = NULL; | 1667 | if (err != -ENOSPC) |
1668 | bh = NULL; | ||
1661 | goto cleanup; | 1669 | goto cleanup; |
1662 | 1670 | ||
1663 | journal_error: | 1671 | journal_error: |
@@ -2310,7 +2318,7 @@ static int ext4_link(struct dentry *old_dentry, | |||
2310 | struct inode *inode = old_dentry->d_inode; | 2318 | struct inode *inode = old_dentry->d_inode; |
2311 | int err, retries = 0; | 2319 | int err, retries = 0; |
2312 | 2320 | ||
2313 | if (EXT4_DIR_LINK_MAX(inode)) | 2321 | if (inode->i_nlink >= EXT4_LINK_MAX) |
2314 | return -EMLINK; | 2322 | return -EMLINK; |
2315 | 2323 | ||
2316 | /* | 2324 | /* |
@@ -2413,7 +2421,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, | |||
2413 | goto end_rename; | 2421 | goto end_rename; |
2414 | retval = -EMLINK; | 2422 | retval = -EMLINK; |
2415 | if (!new_inode && new_dir != old_dir && | 2423 | if (!new_inode && new_dir != old_dir && |
2416 | new_dir->i_nlink >= EXT4_LINK_MAX) | 2424 | EXT4_DIR_LINK_MAX(new_dir)) |
2417 | goto end_rename; | 2425 | goto end_rename; |
2418 | } | 2426 | } |
2419 | if (!new_bh) { | 2427 | if (!new_bh) { |
@@ -2536,7 +2544,7 @@ const struct inode_operations ext4_dir_inode_operations = { | |||
2536 | .listxattr = ext4_listxattr, | 2544 | .listxattr = ext4_listxattr, |
2537 | .removexattr = generic_removexattr, | 2545 | .removexattr = generic_removexattr, |
2538 | #endif | 2546 | #endif |
2539 | .permission = ext4_permission, | 2547 | .check_acl = ext4_check_acl, |
2540 | .fiemap = ext4_fiemap, | 2548 | .fiemap = ext4_fiemap, |
2541 | }; | 2549 | }; |
2542 | 2550 | ||
@@ -2548,5 +2556,5 @@ const struct inode_operations ext4_special_inode_operations = { | |||
2548 | .listxattr = ext4_listxattr, | 2556 | .listxattr = ext4_listxattr, |
2549 | .removexattr = generic_removexattr, | 2557 | .removexattr = generic_removexattr, |
2550 | #endif | 2558 | #endif |
2551 | .permission = ext4_permission, | 2559 | .check_acl = ext4_check_acl, |
2552 | }; | 2560 | }; |
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 68b0351fc647..3cfc343c41b5 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c | |||
@@ -746,7 +746,6 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) | |||
746 | struct inode *inode = NULL; | 746 | struct inode *inode = NULL; |
747 | handle_t *handle; | 747 | handle_t *handle; |
748 | int gdb_off, gdb_num; | 748 | int gdb_off, gdb_num; |
749 | int num_grp_locked = 0; | ||
750 | int err, err2; | 749 | int err, err2; |
751 | 750 | ||
752 | gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb); | 751 | gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb); |
@@ -856,7 +855,6 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) | |||
856 | * using the new disk blocks. | 855 | * using the new disk blocks. |
857 | */ | 856 | */ |
858 | 857 | ||
859 | num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, input->group); | ||
860 | /* Update group descriptor block for new group */ | 858 | /* Update group descriptor block for new group */ |
861 | gdp = (struct ext4_group_desc *)((char *)primary->b_data + | 859 | gdp = (struct ext4_group_desc *)((char *)primary->b_data + |
862 | gdb_off * EXT4_DESC_SIZE(sb)); | 860 | gdb_off * EXT4_DESC_SIZE(sb)); |
@@ -875,10 +873,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) | |||
875 | * descriptor | 873 | * descriptor |
876 | */ | 874 | */ |
877 | err = ext4_mb_add_groupinfo(sb, input->group, gdp); | 875 | err = ext4_mb_add_groupinfo(sb, input->group, gdp); |
878 | if (err) { | 876 | if (err) |
879 | ext4_mb_put_buddy_cache_lock(sb, input->group, num_grp_locked); | ||
880 | goto exit_journal; | 877 | goto exit_journal; |
881 | } | ||
882 | 878 | ||
883 | /* | 879 | /* |
884 | * Make the new blocks and inodes valid next. We do this before | 880 | * Make the new blocks and inodes valid next. We do this before |
@@ -920,7 +916,6 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) | |||
920 | 916 | ||
921 | /* Update the global fs size fields */ | 917 | /* Update the global fs size fields */ |
922 | sbi->s_groups_count++; | 918 | sbi->s_groups_count++; |
923 | ext4_mb_put_buddy_cache_lock(sb, input->group, num_grp_locked); | ||
924 | 919 | ||
925 | ext4_handle_dirty_metadata(handle, NULL, primary); | 920 | ext4_handle_dirty_metadata(handle, NULL, primary); |
926 | 921 | ||
diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 8f4f079e6b9a..a6b1ab734728 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c | |||
@@ -45,6 +45,7 @@ | |||
45 | #include "ext4_jbd2.h" | 45 | #include "ext4_jbd2.h" |
46 | #include "xattr.h" | 46 | #include "xattr.h" |
47 | #include "acl.h" | 47 | #include "acl.h" |
48 | #include "mballoc.h" | ||
48 | 49 | ||
49 | #define CREATE_TRACE_POINTS | 50 | #define CREATE_TRACE_POINTS |
50 | #include <trace/events/ext4.h> | 51 | #include <trace/events/ext4.h> |
@@ -344,7 +345,8 @@ static const char *ext4_decode_error(struct super_block *sb, int errno, | |||
344 | errstr = "Out of memory"; | 345 | errstr = "Out of memory"; |
345 | break; | 346 | break; |
346 | case -EROFS: | 347 | case -EROFS: |
347 | if (!sb || EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT) | 348 | if (!sb || (EXT4_SB(sb)->s_journal && |
349 | EXT4_SB(sb)->s_journal->j_flags & JBD2_ABORT)) | ||
348 | errstr = "Journal has aborted"; | 350 | errstr = "Journal has aborted"; |
349 | else | 351 | else |
350 | errstr = "Readonly filesystem"; | 352 | errstr = "Readonly filesystem"; |
@@ -1279,11 +1281,9 @@ static int parse_options(char *options, struct super_block *sb, | |||
1279 | *journal_devnum = option; | 1281 | *journal_devnum = option; |
1280 | break; | 1282 | break; |
1281 | case Opt_journal_checksum: | 1283 | case Opt_journal_checksum: |
1282 | set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM); | 1284 | break; /* Kept for backwards compatibility */ |
1283 | break; | ||
1284 | case Opt_journal_async_commit: | 1285 | case Opt_journal_async_commit: |
1285 | set_opt(sbi->s_mount_opt, JOURNAL_ASYNC_COMMIT); | 1286 | set_opt(sbi->s_mount_opt, JOURNAL_ASYNC_COMMIT); |
1286 | set_opt(sbi->s_mount_opt, JOURNAL_CHECKSUM); | ||
1287 | break; | 1287 | break; |
1288 | case Opt_noload: | 1288 | case Opt_noload: |
1289 | set_opt(sbi->s_mount_opt, NOLOAD); | 1289 | set_opt(sbi->s_mount_opt, NOLOAD); |
@@ -1695,12 +1695,12 @@ static int ext4_fill_flex_info(struct super_block *sb) | |||
1695 | gdp = ext4_get_group_desc(sb, i, NULL); | 1695 | gdp = ext4_get_group_desc(sb, i, NULL); |
1696 | 1696 | ||
1697 | flex_group = ext4_flex_group(sbi, i); | 1697 | flex_group = ext4_flex_group(sbi, i); |
1698 | atomic_set(&sbi->s_flex_groups[flex_group].free_inodes, | 1698 | atomic_add(ext4_free_inodes_count(sb, gdp), |
1699 | ext4_free_inodes_count(sb, gdp)); | 1699 | &sbi->s_flex_groups[flex_group].free_inodes); |
1700 | atomic_set(&sbi->s_flex_groups[flex_group].free_blocks, | 1700 | atomic_add(ext4_free_blks_count(sb, gdp), |
1701 | ext4_free_blks_count(sb, gdp)); | 1701 | &sbi->s_flex_groups[flex_group].free_blocks); |
1702 | atomic_set(&sbi->s_flex_groups[flex_group].used_dirs, | 1702 | atomic_add(ext4_used_dirs_count(sb, gdp), |
1703 | ext4_used_dirs_count(sb, gdp)); | 1703 | &sbi->s_flex_groups[flex_group].used_dirs); |
1704 | } | 1704 | } |
1705 | 1705 | ||
1706 | return 1; | 1706 | return 1; |
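
The switch from atomic_set() to atomic_add() above matters because several block groups fold into one flex group: each loop iteration must accumulate into the shared flex-group counter, whereas a plain set would overwrite the contributions of all earlier groups. A runnable toy version (plain ints stand in for atomic_t since nothing races here, and the group counts are made up):

    #include <assert.h>

    int main(void)
    {
            int free_inodes_per_group[4] = { 10, 20, 30, 40 };
            int flex_free_inodes[2] = { 0, 0 };
            int groups_per_flex = 2;
            int i;

            for (i = 0; i < 4; i++) {
                    int flex = i / groups_per_flex;  /* ext4_flex_group() */

                    /* atomic_add semantics: accumulate, don't overwrite */
                    flex_free_inodes[flex] += free_inodes_per_group[i];
            }

            assert(flex_free_inodes[0] == 30);   /* 10 + 20, not just 20 */
            assert(flex_free_inodes[1] == 70);   /* 30 + 40 */
            return 0;
    }
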
@@ -2253,6 +2253,49 @@ static struct kobj_type ext4_ktype = { | |||
2253 | .release = ext4_sb_release, | 2253 | .release = ext4_sb_release, |
2254 | }; | 2254 | }; |
2255 | 2255 | ||
2256 | /* | ||
2257 | * Check whether this filesystem can be mounted based on | ||
2258 | * the features present and the RDONLY/RDWR mount requested. | ||
2259 | * Returns 1 if this filesystem can be mounted as requested, | ||
2260 | * 0 if it cannot be. | ||
2261 | */ | ||
2262 | static int ext4_feature_set_ok(struct super_block *sb, int readonly) | ||
2263 | { | ||
2264 | if (EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP)) { | ||
2265 | ext4_msg(sb, KERN_ERR, | ||
2266 | "Couldn't mount because of " | ||
2267 | "unsupported optional features (%x)", | ||
2268 | (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) & | ||
2269 | ~EXT4_FEATURE_INCOMPAT_SUPP)); | ||
2270 | return 0; | ||
2271 | } | ||
2272 | |||
2273 | if (readonly) | ||
2274 | return 1; | ||
2275 | |||
2276 | /* Check that feature set is OK for a read-write mount */ | ||
2277 | if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP)) { | ||
2278 | ext4_msg(sb, KERN_ERR, "couldn't mount RDWR because of " | ||
2279 | "unsupported optional features (%x)", | ||
2280 | (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) & | ||
2281 | ~EXT4_FEATURE_RO_COMPAT_SUPP)); | ||
2282 | return 0; | ||
2283 | } | ||
2284 | /* | ||
2285 | * Large file size enabled file system can only be mounted | ||
2286 | * read-write on 32-bit systems if kernel is built with CONFIG_LBDAF | ||
2287 | */ | ||
2288 | if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) { | ||
2289 | if (sizeof(blkcnt_t) < sizeof(u64)) { | ||
2290 | ext4_msg(sb, KERN_ERR, "Filesystem with huge files " | ||
2291 | "cannot be mounted RDWR without " | ||
2292 | "CONFIG_LBDAF"); | ||
2293 | return 0; | ||
2294 | } | ||
2295 | } | ||
2296 | return 1; | ||
2297 | } | ||
2298 | |||
2256 | static int ext4_fill_super(struct super_block *sb, void *data, int silent) | 2299 | static int ext4_fill_super(struct super_block *sb, void *data, int silent) |
2257 | __releases(kernel_lock) | 2300 | __releases(kernel_lock) |
2258 | __acquires(kernel_lock) | 2301 | __acquires(kernel_lock) |
@@ -2274,7 +2317,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | |||
2274 | unsigned int db_count; | 2317 | unsigned int db_count; |
2275 | unsigned int i; | 2318 | unsigned int i; |
2276 | int needs_recovery, has_huge_files; | 2319 | int needs_recovery, has_huge_files; |
2277 | int features; | ||
2278 | __u64 blocks_count; | 2320 | __u64 blocks_count; |
2279 | int err; | 2321 | int err; |
2280 | unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; | 2322 | unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; |
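
ext4_feature_set_ok() centralizes a mask test used at both mount and remount time: a filesystem is mountable only if it sets no incompat bits outside the kernel's supported set, while ro-compat bits are enforced only for read-write mounts. A compilable sketch of the test; the bit values are made up, not the real ext4 feature masks:

    #include <stdio.h>

    #define SUPP_INCOMPAT   0x0007u   /* illustrative masks */
    #define SUPP_RO_COMPAT  0x0003u

    static int feature_set_ok(unsigned int incompat, unsigned int ro_compat,
                              int readonly)
    {
            if (incompat & ~SUPP_INCOMPAT) {
                    fprintf(stderr, "unsupported incompat features (%x)\n",
                            incompat & ~SUPP_INCOMPAT);
                    return 0;
            }
            if (readonly)
                    return 1;     /* unknown ro-compat bits are fine ro */
            if (ro_compat & ~SUPP_RO_COMPAT) {
                    fprintf(stderr, "unsupported ro-compat features (%x)\n",
                            ro_compat & ~SUPP_RO_COMPAT);
                    return 0;
            }
            return 1;
    }
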
@@ -2401,39 +2443,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | |||
2401 | * previously didn't change the revision level when setting the flags, | 2443 | * previously didn't change the revision level when setting the flags, |
2402 | * so there is a chance incompat flags are set on a rev 0 filesystem. | 2444 | * so there is a chance incompat flags are set on a rev 0 filesystem. |
2403 | */ | 2445 | */ |
2404 | features = EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP); | 2446 | if (!ext4_feature_set_ok(sb, (sb->s_flags & MS_RDONLY))) |
2405 | if (features) { | ||
2406 | ext4_msg(sb, KERN_ERR, | ||
2407 | "Couldn't mount because of " | ||
2408 | "unsupported optional features (%x)", | ||
2409 | (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) & | ||
2410 | ~EXT4_FEATURE_INCOMPAT_SUPP)); | ||
2411 | goto failed_mount; | ||
2412 | } | ||
2413 | features = EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP); | ||
2414 | if (!(sb->s_flags & MS_RDONLY) && features) { | ||
2415 | ext4_msg(sb, KERN_ERR, | ||
2416 | "Couldn't mount RDWR because of " | ||
2417 | "unsupported optional features (%x)", | ||
2418 | (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) & | ||
2419 | ~EXT4_FEATURE_RO_COMPAT_SUPP)); | ||
2420 | goto failed_mount; | 2447 | goto failed_mount; |
2421 | } | 2448 | |
2422 | has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb, | ||
2423 | EXT4_FEATURE_RO_COMPAT_HUGE_FILE); | ||
2424 | if (has_huge_files) { | ||
2425 | /* | ||
2426 | * Large file size enabled file system can only be | ||
2427 | * mount if kernel is build with CONFIG_LBDAF | ||
2428 | */ | ||
2429 | if (sizeof(root->i_blocks) < sizeof(u64) && | ||
2430 | !(sb->s_flags & MS_RDONLY)) { | ||
2431 | ext4_msg(sb, KERN_ERR, "Filesystem with huge " | ||
2432 | "files cannot be mounted read-write " | ||
2433 | "without CONFIG_LBDAF"); | ||
2434 | goto failed_mount; | ||
2435 | } | ||
2436 | } | ||
2437 | blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); | 2449 | blocksize = BLOCK_SIZE << le32_to_cpu(es->s_log_block_size); |
2438 | 2450 | ||
2439 | if (blocksize < EXT4_MIN_BLOCK_SIZE || | 2451 | if (blocksize < EXT4_MIN_BLOCK_SIZE || |
@@ -2469,6 +2481,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | |||
2469 | } | 2481 | } |
2470 | } | 2482 | } |
2471 | 2483 | ||
2484 | has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb, | ||
2485 | EXT4_FEATURE_RO_COMPAT_HUGE_FILE); | ||
2472 | sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits, | 2486 | sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits, |
2473 | has_huge_files); | 2487 | has_huge_files); |
2474 | sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files); | 2488 | sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files); |
@@ -2549,12 +2563,19 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | |||
2549 | goto failed_mount; | 2563 | goto failed_mount; |
2550 | } | 2564 | } |
2551 | 2565 | ||
2552 | if (ext4_blocks_count(es) > | 2566 | /* |
2553 | (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) { | 2567 | * Test whether we have more sectors than will fit in sector_t, |
2568 | * and whether the max offset is addressable by the page cache. | ||
2569 | */ | ||
2570 | if ((ext4_blocks_count(es) > | ||
2571 | (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) || | ||
2572 | (ext4_blocks_count(es) > | ||
2573 | (pgoff_t)(~0ULL) >> (PAGE_CACHE_SHIFT - sb->s_blocksize_bits))) { | ||
2554 | ext4_msg(sb, KERN_ERR, "filesystem" | 2574 | ext4_msg(sb, KERN_ERR, "filesystem" |
2555 | " too large to mount safely"); | 2575 | " too large to mount safely on this system"); |
2556 | if (sizeof(sector_t) < 8) | 2576 | if (sizeof(sector_t) < 8) |
2557 | ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled"); | 2577 | ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled"); |
2578 | ret = -EFBIG; | ||
2558 | goto failed_mount; | 2579 | goto failed_mount; |
2559 | } | 2580 | } |
2560 | 2581 | ||
@@ -2595,6 +2616,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | |||
2595 | goto failed_mount; | 2616 | goto failed_mount; |
2596 | } | 2617 | } |
2597 | sbi->s_groups_count = blocks_count; | 2618 | sbi->s_groups_count = blocks_count; |
2619 | sbi->s_blockfile_groups = min_t(ext4_group_t, sbi->s_groups_count, | ||
2620 | (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb))); | ||
2598 | db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / | 2621 | db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / |
2599 | EXT4_DESC_PER_BLOCK(sb); | 2622 | EXT4_DESC_PER_BLOCK(sb); |
2600 | sbi->s_group_desc = kmalloc(db_count * sizeof(struct buffer_head *), | 2623 | sbi->s_group_desc = kmalloc(db_count * sizeof(struct buffer_head *), |
@@ -2729,20 +2752,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) | |||
2729 | goto failed_mount4; | 2752 | goto failed_mount4; |
2730 | } | 2753 | } |
2731 | 2754 | ||
2732 | if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { | 2755 | jbd2_journal_set_features(sbi->s_journal, |
2733 | jbd2_journal_set_features(sbi->s_journal, | 2756 | JBD2_FEATURE_COMPAT_CHECKSUM, 0, 0); |
2734 | JBD2_FEATURE_COMPAT_CHECKSUM, 0, | 2757 | if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) |
2758 | jbd2_journal_set_features(sbi->s_journal, 0, 0, | ||
2735 | JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); | 2759 | JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); |
2736 | } else if (test_opt(sb, JOURNAL_CHECKSUM)) { | 2760 | else |
2737 | jbd2_journal_set_features(sbi->s_journal, | ||
2738 | JBD2_FEATURE_COMPAT_CHECKSUM, 0, 0); | ||
2739 | jbd2_journal_clear_features(sbi->s_journal, 0, 0, | 2761 | jbd2_journal_clear_features(sbi->s_journal, 0, 0, |
2740 | JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); | 2762 | JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); |
2741 | } else { | ||
2742 | jbd2_journal_clear_features(sbi->s_journal, | ||
2743 | JBD2_FEATURE_COMPAT_CHECKSUM, 0, | ||
2744 | JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); | ||
2745 | } | ||
2746 | 2763 | ||
2747 | /* We have now updated the journal if required, so we can | 2764 | /* We have now updated the journal if required, so we can |
2748 | * validate the data journaling mode. */ | 2765 | * validate the data journaling mode. */ |
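
The journal setup above collapses three branches into one: the checksum compat feature is now set unconditionally, and only the async-commit incompat bit still follows the mount option. A runnable sketch with plain bit flags standing in for the jbd2 feature words:

    #include <assert.h>

    #define FEAT_CHECKSUM      0x1u
    #define FEAT_ASYNC_COMMIT  0x2u

    static unsigned int set_journal_features(unsigned int feat, int async_opt)
    {
            feat |= FEAT_CHECKSUM;               /* now set unconditionally */
            if (async_opt)
                    feat |= FEAT_ASYNC_COMMIT;   /* ..._set_features() */
            else
                    feat &= ~FEAT_ASYNC_COMMIT;  /* ..._clear_features() */
            return feat;
    }

    int main(void)
    {
            assert(set_journal_features(0, 0) == FEAT_CHECKSUM);
            assert(set_journal_features(FEAT_ASYNC_COMMIT, 0) == FEAT_CHECKSUM);
            assert(set_journal_features(0, 1) ==
                   (FEAT_CHECKSUM | FEAT_ASYNC_COMMIT));
            return 0;
    }
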
@@ -3208,7 +3225,18 @@ static int ext4_commit_super(struct super_block *sb, int sync) | |||
3208 | clear_buffer_write_io_error(sbh); | 3225 | clear_buffer_write_io_error(sbh); |
3209 | set_buffer_uptodate(sbh); | 3226 | set_buffer_uptodate(sbh); |
3210 | } | 3227 | } |
3211 | es->s_wtime = cpu_to_le32(get_seconds()); | 3228 | /* |
3229 | * If the file system is mounted read-only, don't update the | ||
3230 | * superblock write time. This avoids updating the superblock | ||
3231 | * write time when we are mounting the root file system | ||
3232 | * read/only but we need to replay the journal; at that point, | ||
3233 | * for people who are east of GMT and who make their clock | ||
3234 | * tick in localtime for Windows bug-for-bug compatibility, | ||
3235 | * the clock is set in the future, and this will cause e2fsck | ||
3236 | * to complain and force a full file system check. | ||
3237 | */ | ||
3238 | if (!(sb->s_flags & MS_RDONLY)) | ||
3239 | es->s_wtime = cpu_to_le32(get_seconds()); | ||
3212 | es->s_kbytes_written = | 3240 | es->s_kbytes_written = |
3213 | cpu_to_le64(EXT4_SB(sb)->s_kbytes_written + | 3241 | cpu_to_le64(EXT4_SB(sb)->s_kbytes_written + |
3214 | ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) - | 3242 | ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) - |
@@ -3477,18 +3505,11 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) | |||
3477 | if (sbi->s_journal) | 3505 | if (sbi->s_journal) |
3478 | ext4_mark_recovery_complete(sb, es); | 3506 | ext4_mark_recovery_complete(sb, es); |
3479 | } else { | 3507 | } else { |
3480 | int ret; | 3508 | /* Make sure we can mount this feature set readwrite */ |
3481 | if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb, | 3509 | if (!ext4_feature_set_ok(sb, 0)) { |
3482 | ~EXT4_FEATURE_RO_COMPAT_SUPP))) { | ||
3483 | ext4_msg(sb, KERN_WARNING, "couldn't " | ||
3484 | "remount RDWR because of unsupported " | ||
3485 | "optional features (%x)", | ||
3486 | (le32_to_cpu(sbi->s_es->s_feature_ro_compat) & | ||
3487 | ~EXT4_FEATURE_RO_COMPAT_SUPP)); | ||
3488 | err = -EROFS; | 3510 | err = -EROFS; |
3489 | goto restore_opts; | 3511 | goto restore_opts; |
3490 | } | 3512 | } |
3491 | |||
3492 | /* | 3513 | /* |
3493 | * Make sure the group descriptor checksums | 3514 | * Make sure the group descriptor checksums |
3494 | * are sane. If they aren't, refuse to remount r/w. | 3515 | * are sane. If they aren't, refuse to remount r/w. |
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 62b31c246994..fed5b01d7a8d 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c | |||
@@ -810,12 +810,23 @@ inserted: | |||
810 | get_bh(new_bh); | 810 | get_bh(new_bh); |
811 | } else { | 811 | } else { |
812 | /* We need to allocate a new block */ | 812 | /* We need to allocate a new block */ |
813 | ext4_fsblk_t goal = ext4_group_first_block_no(sb, | 813 | ext4_fsblk_t goal, block; |
814 | |||
815 | goal = ext4_group_first_block_no(sb, | ||
814 | EXT4_I(inode)->i_block_group); | 816 | EXT4_I(inode)->i_block_group); |
815 | ext4_fsblk_t block = ext4_new_meta_blocks(handle, inode, | 817 | |
818 | /* non-extent files can't have physical blocks past 2^32 */ | ||
819 | if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) | ||
820 | goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; | ||
821 | |||
822 | block = ext4_new_meta_blocks(handle, inode, | ||
816 | goal, NULL, &error); | 823 | goal, NULL, &error); |
817 | if (error) | 824 | if (error) |
818 | goto cleanup; | 825 | goto cleanup; |
826 | |||
827 | if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) | ||
828 | BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS); | ||
829 | |||
819 | ea_idebug(inode, "creating block %d", block); | 830 | ea_idebug(inode, "creating block %d", block); |
820 | 831 | ||
821 | new_bh = sb_getblk(sb, block); | 832 | new_bh = sb_getblk(sb, block); |
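
The xattr hunk above clamps the allocation goal for non-extent files, whose indirect-block format stores physical block numbers in 32 bits: masking the hint keeps the allocator from being pointed at an unaddressable block, and the BUG_ON afterwards asserts the block actually returned fits. A runnable sketch of the clamp; MAX_FILE_PHYS mirrors the idea of EXT4_MAX_BLOCK_FILE_PHYS and its value here is illustrative:

    #include <assert.h>
    #include <stdint.h>

    #define MAX_FILE_PHYS 0xffffffffULL   /* 2^32 - 1, illustrative */

    static uint64_t pick_goal(uint64_t goal, int extent_based)
    {
            if (!extent_based)
                    goal &= MAX_FILE_PHYS;   /* keep the hint addressable */
            return goal;
    }

    int main(void)
    {
            assert(pick_goal(0x1ffffffffULL, 0) <= MAX_FILE_PHYS);
            assert(pick_goal(0x1ffffffffULL, 1) == 0x1ffffffffULL);
            return 0;
    }
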
diff --git a/fs/fat/file.c b/fs/fat/file.c index f042b965c95c..e8c159de236b 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c | |||
@@ -176,8 +176,26 @@ static int fat_cont_expand(struct inode *inode, loff_t size) | |||
176 | 176 | ||
177 | inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC; | 177 | inode->i_ctime = inode->i_mtime = CURRENT_TIME_SEC; |
178 | mark_inode_dirty(inode); | 178 | mark_inode_dirty(inode); |
179 | if (IS_SYNC(inode)) | 179 | if (IS_SYNC(inode)) { |
180 | err = sync_page_range_nolock(inode, mapping, start, count); | 180 | int err2; |
181 | |||
182 | /* | ||
183 | * Opencode syncing since we don't have a file open to use | ||
184 | * standard fsync path. | ||
185 | */ | ||
186 | err = filemap_fdatawrite_range(mapping, start, | ||
187 | start + count - 1); | ||
188 | err2 = sync_mapping_buffers(mapping); | ||
189 | if (!err) | ||
190 | err = err2; | ||
191 | err2 = write_inode_now(inode, 1); | ||
192 | if (!err) | ||
193 | err = err2; | ||
194 | if (!err) { | ||
195 | err = filemap_fdatawait_range(mapping, start, | ||
196 | start + count - 1); | ||
197 | } | ||
198 | } | ||
181 | out: | 199 | out: |
182 | return err; | 200 | return err; |
183 | } | 201 | } |
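
The fat_cont_expand() change open-codes what a file-based fsync would do, running each step and remembering only the first failure: write out the data range, flush the mapping's metadata buffers, push the inode itself, and wait on the range only if nothing has failed yet. A compilable sketch of that sequence; the step functions are stand-ins for filemap_fdatawrite_range(), sync_mapping_buffers(), write_inode_now() and filemap_fdatawait_range():

    static int write_data(void)   { return 0; }
    static int sync_buffers(void) { return 0; }
    static int write_inode(void)  { return 0; }
    static int wait_data(void)    { return 0; }

    static int sync_expanded_range(void)
    {
            int err, err2;

            err = write_data();          /* kick off data writeback */
            err2 = sync_buffers();       /* flush associated metadata */
            if (!err)
                    err = err2;          /* keep the first error only */
            err2 = write_inode();        /* push the inode itself */
            if (!err)
                    err = err2;
            if (!err)
                    err = wait_data();   /* finally wait on the range */
            return err;
    }
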
diff --git a/fs/fat/misc.c b/fs/fat/misc.c index a6c20473dfd7..4e35be873e09 100644 --- a/fs/fat/misc.c +++ b/fs/fat/misc.c | |||
@@ -119,8 +119,8 @@ int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster) | |||
119 | MSDOS_I(inode)->i_start = new_dclus; | 119 | MSDOS_I(inode)->i_start = new_dclus; |
120 | MSDOS_I(inode)->i_logstart = new_dclus; | 120 | MSDOS_I(inode)->i_logstart = new_dclus; |
121 | /* | 121 | /* |
122 | * Since generic_osync_inode() synchronize later if | 122 | * Since generic_write_sync() synchronizes regular files later, |
123 | * this is not directory, we don't here. | 123 | * we sync here only directories. |
124 | */ | 124 | */ |
125 | if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode)) { | 125 | if (S_ISDIR(inode->i_mode) && IS_DIRSYNC(inode)) { |
126 | ret = fat_sync_inode(inode); | 126 | ret = fat_sync_inode(inode); |
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index c54226be5294..8e1e5e19d21e 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c | |||
@@ -19,171 +19,245 @@ | |||
19 | #include <linux/sched.h> | 19 | #include <linux/sched.h> |
20 | #include <linux/fs.h> | 20 | #include <linux/fs.h> |
21 | #include <linux/mm.h> | 21 | #include <linux/mm.h> |
22 | #include <linux/kthread.h> | ||
23 | #include <linux/freezer.h> | ||
22 | #include <linux/writeback.h> | 24 | #include <linux/writeback.h> |
23 | #include <linux/blkdev.h> | 25 | #include <linux/blkdev.h> |
24 | #include <linux/backing-dev.h> | 26 | #include <linux/backing-dev.h> |
25 | #include <linux/buffer_head.h> | 27 | #include <linux/buffer_head.h> |
26 | #include "internal.h" | 28 | #include "internal.h" |
27 | 29 | ||
30 | #define inode_to_bdi(inode) ((inode)->i_mapping->backing_dev_info) | ||
28 | 31 | ||
29 | /** | 32 | /* |
30 | * writeback_acquire - attempt to get exclusive writeback access to a device | 33 | * We don't actually have pdflush, but this one is exported though /proc... |
31 | * @bdi: the device's backing_dev_info structure | 34 | */ |
32 | * | 35 | int nr_pdflush_threads; |
33 | * It is a waste of resources to have more than one pdflush thread blocked on | 36 | |
34 | * a single request queue. Exclusion at the request_queue level is obtained | 37 | /* |
35 | * via a flag in the request_queue's backing_dev_info.state. | 38 | * Passed into wb_writeback(), essentially a subset of writeback_control |
36 | * | 39 | */ |
37 | * Non-request_queue-backed address_spaces will share default_backing_dev_info, | 40 | struct wb_writeback_args { |
38 | * unless they implement their own. Which is somewhat inefficient, as this | 41 | long nr_pages; |
39 | * may prevent concurrent writeback against multiple devices. | 42 | struct super_block *sb; |
43 | enum writeback_sync_modes sync_mode; | ||
44 | int for_kupdate; | ||
45 | int range_cyclic; | ||
46 | }; | ||
47 | |||
48 | /* | ||
49 | * Work items for the bdi_writeback threads | ||
40 | */ | 50 | */ |
41 | static int writeback_acquire(struct backing_dev_info *bdi) | 51 | struct bdi_work { |
52 | struct list_head list; /* pending work list */ | ||
53 | struct rcu_head rcu_head; /* for RCU free/clear of work */ | ||
54 | |||
55 | unsigned long seen; /* threads that have seen this work */ | ||
56 | atomic_t pending; /* number of threads still to do work */ | ||
57 | |||
58 | struct wb_writeback_args args; /* writeback arguments */ | ||
59 | |||
60 | unsigned long state; /* flag bits, see WS_* */ | ||
61 | }; | ||
62 | |||
63 | enum { | ||
64 | WS_USED_B = 0, | ||
65 | WS_ONSTACK_B, | ||
66 | }; | ||
67 | |||
68 | #define WS_USED (1 << WS_USED_B) | ||
69 | #define WS_ONSTACK (1 << WS_ONSTACK_B) | ||
70 | |||
71 | static inline bool bdi_work_on_stack(struct bdi_work *work) | ||
72 | { | ||
73 | return test_bit(WS_ONSTACK_B, &work->state); | ||
74 | } | ||
75 | |||
76 | static inline void bdi_work_init(struct bdi_work *work, | ||
77 | struct wb_writeback_args *args) | ||
42 | { | 78 | { |
43 | return !test_and_set_bit(BDI_pdflush, &bdi->state); | 79 | INIT_RCU_HEAD(&work->rcu_head); |
80 | work->args = *args; | ||
81 | work->state = WS_USED; | ||
44 | } | 82 | } |
45 | 83 | ||
46 | /** | 84 | /** |
47 | * writeback_in_progress - determine whether there is writeback in progress | 85 | * writeback_in_progress - determine whether there is writeback in progress |
48 | * @bdi: the device's backing_dev_info structure. | 86 | * @bdi: the device's backing_dev_info structure. |
49 | * | 87 | * |
50 | * Determine whether there is writeback in progress against a backing device. | 88 | * Determine whether there is writeback waiting to be handled against a |
89 | * backing device. | ||
51 | */ | 90 | */ |
52 | int writeback_in_progress(struct backing_dev_info *bdi) | 91 | int writeback_in_progress(struct backing_dev_info *bdi) |
53 | { | 92 | { |
54 | return test_bit(BDI_pdflush, &bdi->state); | 93 | return !list_empty(&bdi->work_list); |
55 | } | 94 | } |
56 | 95 | ||
57 | /** | 96 | static void bdi_work_clear(struct bdi_work *work) |
58 | * writeback_release - relinquish exclusive writeback access against a device. | ||
59 | * @bdi: the device's backing_dev_info structure | ||
60 | */ | ||
61 | static void writeback_release(struct backing_dev_info *bdi) | ||
62 | { | 97 | { |
63 | BUG_ON(!writeback_in_progress(bdi)); | 98 | clear_bit(WS_USED_B, &work->state); |
64 | clear_bit(BDI_pdflush, &bdi->state); | 99 | smp_mb__after_clear_bit(); |
100 | /* | ||
101 | * work can have disappeared at this point. bit waitq functions | ||
102 | * should be able to tolerate this, provided bdi_sched_wait does | ||
103 | * not dereference its pointer argument. | ||
104 | */ | ||
105 | wake_up_bit(&work->state, WS_USED_B); | ||
65 | } | 106 | } |
66 | 107 | ||
67 | static noinline void block_dump___mark_inode_dirty(struct inode *inode) | 108 | static void bdi_work_free(struct rcu_head *head) |
68 | { | 109 | { |
69 | if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) { | 110 | struct bdi_work *work = container_of(head, struct bdi_work, rcu_head); |
70 | struct dentry *dentry; | ||
71 | const char *name = "?"; | ||
72 | 111 | ||
73 | dentry = d_find_alias(inode); | 112 | if (!bdi_work_on_stack(work)) |
74 | if (dentry) { | 113 | kfree(work); |
75 | spin_lock(&dentry->d_lock); | 114 | else |
76 | name = (const char *) dentry->d_name.name; | 115 | bdi_work_clear(work); |
77 | } | ||
78 | printk(KERN_DEBUG | ||
79 | "%s(%d): dirtied inode %lu (%s) on %s\n", | ||
80 | current->comm, task_pid_nr(current), inode->i_ino, | ||
81 | name, inode->i_sb->s_id); | ||
82 | if (dentry) { | ||
83 | spin_unlock(&dentry->d_lock); | ||
84 | dput(dentry); | ||
85 | } | ||
86 | } | ||
87 | } | 116 | } |
88 | 117 | ||
89 | /** | 118 | static void wb_work_complete(struct bdi_work *work) |
90 | * __mark_inode_dirty - internal function | ||
91 | * @inode: inode to mark | ||
92 | * @flags: what kind of dirty (i.e. I_DIRTY_SYNC) | ||
93 | * Mark an inode as dirty. Callers should use mark_inode_dirty or | ||
94 | * mark_inode_dirty_sync. | ||
95 | * | ||
96 | * Put the inode on the super block's dirty list. | ||
97 | * | ||
98 | * CAREFUL! We mark it dirty unconditionally, but move it onto the | ||
99 | * dirty list only if it is hashed or if it refers to a blockdev. | ||
100 | * If it was not hashed, it will never be added to the dirty list | ||
101 | * even if it is later hashed, as it will have been marked dirty already. | ||
102 | * | ||
103 | * In short, make sure you hash any inodes _before_ you start marking | ||
104 | * them dirty. | ||
105 | * | ||
106 | * This function *must* be atomic for the I_DIRTY_PAGES case - | ||
107 | * set_page_dirty() is called under spinlock in several places. | ||
108 | * | ||
109 | * Note that for blockdevs, inode->dirtied_when represents the dirtying time of | ||
110 | * the block-special inode (/dev/hda1) itself. And the ->dirtied_when field of | ||
111 | * the kernel-internal blockdev inode represents the dirtying time of the | ||
112 | * blockdev's pages. This is why for I_DIRTY_PAGES we always use | ||
113 | * page->mapping->host, so the page-dirtying time is recorded in the internal | ||
114 | * blockdev inode. | ||
115 | */ | ||
116 | void __mark_inode_dirty(struct inode *inode, int flags) | ||
117 | { | 119 | { |
118 | struct super_block *sb = inode->i_sb; | 120 | const enum writeback_sync_modes sync_mode = work->args.sync_mode; |
121 | int onstack = bdi_work_on_stack(work); | ||
119 | 122 | ||
120 | /* | 123 | /* |
121 | * Don't do this for I_DIRTY_PAGES - that doesn't actually | 124 | * For allocated work, we can clear the done/seen bit right here. |
122 | * dirty the inode itself | 125 | * For on-stack work, we need to postpone both the clear and free |
126 | * to after the RCU grace period, since the stack could be invalidated | ||
127 | * as soon as bdi_work_clear() has done the wakeup. | ||
123 | */ | 128 | */ |
124 | if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { | 129 | if (!onstack) |
125 | if (sb->s_op->dirty_inode) | 130 | bdi_work_clear(work); |
126 | sb->s_op->dirty_inode(inode); | 131 | if (sync_mode == WB_SYNC_NONE || onstack) |
127 | } | 132 | call_rcu(&work->rcu_head, bdi_work_free); |
133 | } | ||
128 | 134 | ||
135 | static void wb_clear_pending(struct bdi_writeback *wb, struct bdi_work *work) | ||
136 | { | ||
129 | /* | 137 | /* |
130 | * make sure that changes are seen by all cpus before we test i_state | 138 | * so drop our reference. If this is the last ref, delete and free it |
131 | * -- mikulas | 139 | * drop our reference. If this is the last ref, delete and free it |
132 | */ | 140 | */ |
133 | smp_mb(); | 141 | if (atomic_dec_and_test(&work->pending)) { |
142 | struct backing_dev_info *bdi = wb->bdi; | ||
134 | 143 | ||
135 | /* avoid the locking if we can */ | 144 | spin_lock(&bdi->wb_lock); |
136 | if ((inode->i_state & flags) == flags) | 145 | list_del_rcu(&work->list); |
137 | return; | 146 | spin_unlock(&bdi->wb_lock); |
138 | 147 | ||
139 | if (unlikely(block_dump)) | 148 | wb_work_complete(work); |
140 | block_dump___mark_inode_dirty(inode); | 149 | } |
150 | } | ||
141 | 151 | ||
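wb_clear_pending() above relies on the classic "last reference cleans up" idiom: every writeback thread decrements ->pending, and only the thread that drops it to zero unlinks and completes the work. A userspace sketch of that idiom with C11 atomics (struct shared and put_ref are hypothetical names; the kernel uses atomic_dec_and_test plus its own locking):

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct shared { atomic_int pending; };

static void put_ref(struct shared *s)
{
        /* atomic_fetch_sub() returns the old value: old == 1 means this
         * caller dropped the final reference and may free the object. */
        if (atomic_fetch_sub(&s->pending, 1) == 1) {
                printf("last reference dropped, freeing\n");
                free(s);
        }
}

int main(void)
{
        struct shared *s = malloc(sizeof(*s));

        if (!s)
                return 1;
        atomic_init(&s->pending, 3);    /* three threads will ack this work */
        put_ref(s);
        put_ref(s);
        put_ref(s);                     /* this call frees */
        return 0;
}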
142 | spin_lock(&inode_lock); | 152 | static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work) |
143 | if ((inode->i_state & flags) != flags) { | 153 | { |
144 | const int was_dirty = inode->i_state & I_DIRTY; | 154 | work->seen = bdi->wb_mask; |
155 | BUG_ON(!work->seen); | ||
156 | atomic_set(&work->pending, bdi->wb_cnt); | ||
157 | BUG_ON(!bdi->wb_cnt); | ||
145 | 158 | ||
146 | inode->i_state |= flags; | 159 | /* |
160 | * list_add_tail_rcu() contains the necessary barriers to | ||
161 | * make sure the above stores are seen before the item is | ||
162 | * noticed on the list | ||
163 | */ | ||
164 | spin_lock(&bdi->wb_lock); | ||
165 | list_add_tail_rcu(&work->list, &bdi->work_list); | ||
166 | spin_unlock(&bdi->wb_lock); | ||
147 | 167 | ||
148 | /* | 168 | /* |
149 | * If the inode is being synced, just update its dirty state. | 169 | * If the default thread isn't there, make sure we add it. When |
150 | * The unlocker will place the inode on the appropriate | 170 | * it gets created and wakes up, we'll run this work. |
151 | * superblock list, based upon its state. | 171 | */ |
152 | */ | 172 | if (unlikely(list_empty_careful(&bdi->wb_list))) |
153 | if (inode->i_state & I_SYNC) | 173 | wake_up_process(default_backing_dev_info.wb.task); |
154 | goto out; | 174 | else { |
175 | struct bdi_writeback *wb = &bdi->wb; | ||
155 | 176 | ||
156 | /* | 177 | if (wb->task) |
157 | * Only add valid (hashed) inodes to the superblock's | 178 | wake_up_process(wb->task); |
158 | * dirty list. Add blockdev inodes as well. | 179 | } |
159 | */ | 180 | } |
160 | if (!S_ISBLK(inode->i_mode)) { | ||
161 | if (hlist_unhashed(&inode->i_hash)) | ||
162 | goto out; | ||
163 | } | ||
164 | if (inode->i_state & (I_FREEING|I_CLEAR)) | ||
165 | goto out; | ||
166 | 181 | ||
167 | /* | 182 | /* |
168 | * If the inode was already on s_dirty/s_io/s_more_io, don't | 183 | * Used for on-stack allocated work items. The caller needs to wait until |
169 | * reposition it (that would break s_dirty time-ordering). | 184 | * the wb threads have acked the work before it's safe to continue. |
170 | */ | 185 | */ |
171 | if (!was_dirty) { | 186 | static void bdi_wait_on_work_clear(struct bdi_work *work) |
172 | inode->dirtied_when = jiffies; | 187 | { |
173 | list_move(&inode->i_list, &sb->s_dirty); | 188 | wait_on_bit(&work->state, WS_USED_B, bdi_sched_wait, |
174 | } | 189 | TASK_UNINTERRUPTIBLE); |
190 | } | ||
191 | |||
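bdi_work_clear() and bdi_wait_on_work_clear() form a clear-and-wake handshake: the worker clears WS_USED and wakes any sleeper, and only then may the submitter's on-stack work be reused. A rough userspace model using a mutex and condition variable (hypothetical names; the kernel's bit-waitqueues avoid a per-object condvar, but the observable ordering is the same):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;
static int used = 1;                    /* WS_USED analogue */

static void *worker(void *arg)
{
        (void)arg;
        pthread_mutex_lock(&lock);
        used = 0;                       /* bdi_work_clear() analogue */
        pthread_cond_broadcast(&cond);  /* wake_up_bit() analogue */
        pthread_mutex_unlock(&lock);
        return NULL;
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, worker, NULL);
        pthread_mutex_lock(&lock);
        while (used)                    /* bdi_wait_on_work_clear() analogue */
                pthread_cond_wait(&cond, &lock);
        pthread_mutex_unlock(&lock);
        pthread_join(t, NULL);
        printf("work acked; on-stack memory may now be reused\n");
        return 0;
}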
192 | static void bdi_alloc_queue_work(struct backing_dev_info *bdi, | ||
193 | struct wb_writeback_args *args) | ||
194 | { | ||
195 | struct bdi_work *work; | ||
196 | |||
197 | /* | ||
198 | * This is WB_SYNC_NONE writeback, so if allocation fails just | ||
199 | * wake up the thread for old dirty data writeback | ||
200 | */ | ||
201 | work = kmalloc(sizeof(*work), GFP_ATOMIC); | ||
202 | if (work) { | ||
203 | bdi_work_init(work, args); | ||
204 | bdi_queue_work(bdi, work); | ||
205 | } else { | ||
206 | struct bdi_writeback *wb = &bdi->wb; | ||
207 | |||
208 | if (wb->task) | ||
209 | wake_up_process(wb->task); | ||
175 | } | 210 | } |
176 | out: | ||
177 | spin_unlock(&inode_lock); | ||
178 | } | 211 | } |
179 | 212 | ||
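bdi_alloc_queue_work() degrades gracefully: the work item is allocated with GFP_ATOMIC because this path may not sleep, and if even that fails the code falls back to simply waking the flusher thread, which will still write out old data. A userspace sketch of the same best-effort pattern (submit, queue_job and poke_worker are invented names for illustration):

#include <stdio.h>
#include <stdlib.h>

struct job { long nr_pages; };

static void queue_job(struct job *j)
{
        printf("queued work for %ld pages\n", j->nr_pages);
        free(j);                /* a real queue would free after processing */
}

static void poke_worker(void)
{
        printf("allocation failed: waking the worker anyway\n");
}

static void submit(long nr_pages)
{
        struct job *j = malloc(sizeof(*j));

        if (j) {
                j->nr_pages = nr_pages;
                queue_job(j);
        } else {
                poke_worker();  /* old-data writeback still happens */
        }
}

int main(void)
{
        submit(1024);
        return 0;
}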
180 | EXPORT_SYMBOL(__mark_inode_dirty); | 213 | /** |
214 | * bdi_sync_writeback - start and wait for writeback | ||
215 | * @bdi: the backing device to write from | ||
216 | * @sb: write inodes from this super_block | ||
217 | * | ||
218 | * Description: | ||
219 | * This does WB_SYNC_ALL data integrity writeback and waits for the | ||
220 | * IO to complete. Callers must hold the sb s_umount semaphore for | ||
221 | * reading, to avoid having the super disappear before we are done. | ||
222 | */ | ||
223 | static void bdi_sync_writeback(struct backing_dev_info *bdi, | ||
224 | struct super_block *sb) | ||
225 | { | ||
226 | struct wb_writeback_args args = { | ||
227 | .sb = sb, | ||
228 | .sync_mode = WB_SYNC_ALL, | ||
229 | .nr_pages = LONG_MAX, | ||
230 | .range_cyclic = 0, | ||
231 | }; | ||
232 | struct bdi_work work; | ||
181 | 233 | ||
182 | static int write_inode(struct inode *inode, int sync) | 234 | bdi_work_init(&work, &args); |
235 | work.state |= WS_ONSTACK; | ||
236 | |||
237 | bdi_queue_work(bdi, &work); | ||
238 | bdi_wait_on_work_clear(&work); | ||
239 | } | ||
240 | |||
241 | /** | ||
242 | * bdi_start_writeback - start writeback | ||
243 | * @bdi: the backing device to write from | ||
244 | * @nr_pages: the number of pages to write | ||
245 | * | ||
246 | * Description: | ||
247 | * This does WB_SYNC_NONE opportunistic writeback. The IO is only | ||
248 | * started when this function returns; we make no guarantees on | ||
249 | * completion. Caller need not hold sb s_umount semaphore. | ||
250 | * | ||
251 | */ | ||
252 | void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages) | ||
183 | { | 253 | { |
184 | if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) | 254 | struct wb_writeback_args args = { |
185 | return inode->i_sb->s_op->write_inode(inode, sync); | 255 | .sync_mode = WB_SYNC_NONE, |
186 | return 0; | 256 | .nr_pages = nr_pages, |
257 | .range_cyclic = 1, | ||
258 | }; | ||
259 | |||
260 | bdi_alloc_queue_work(bdi, &args); | ||
187 | } | 261 | } |
188 | 262 | ||
189 | /* | 263 | /* |
@@ -191,31 +265,32 @@ static int write_inode(struct inode *inode, int sync) | |||
191 | * furthest end of its superblock's dirty-inode list. | 265 | * furthest end of its superblock's dirty-inode list. |
192 | * | 266 | * |
193 | * Before stamping the inode's ->dirtied_when, we check to see whether it is | 267 | * Before stamping the inode's ->dirtied_when, we check to see whether it is |
194 | * already the most-recently-dirtied inode on the s_dirty list. If that is | 268 | * already the most-recently-dirtied inode on the b_dirty list. If that is |
195 | * the case then the inode must have been redirtied while it was being written | 269 | * the case then the inode must have been redirtied while it was being written |
196 | * out and we don't reset its dirtied_when. | 270 | * out and we don't reset its dirtied_when. |
197 | */ | 271 | */ |
198 | static void redirty_tail(struct inode *inode) | 272 | static void redirty_tail(struct inode *inode) |
199 | { | 273 | { |
200 | struct super_block *sb = inode->i_sb; | 274 | struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; |
201 | 275 | ||
202 | if (!list_empty(&sb->s_dirty)) { | 276 | if (!list_empty(&wb->b_dirty)) { |
203 | struct inode *tail_inode; | 277 | struct inode *tail; |
204 | 278 | ||
205 | tail_inode = list_entry(sb->s_dirty.next, struct inode, i_list); | 279 | tail = list_entry(wb->b_dirty.next, struct inode, i_list); |
206 | if (time_before(inode->dirtied_when, | 280 | if (time_before(inode->dirtied_when, tail->dirtied_when)) |
207 | tail_inode->dirtied_when)) | ||
208 | inode->dirtied_when = jiffies; | 281 | inode->dirtied_when = jiffies; |
209 | } | 282 | } |
210 | list_move(&inode->i_list, &sb->s_dirty); | 283 | list_move(&inode->i_list, &wb->b_dirty); |
211 | } | 284 | } |
212 | 285 | ||
213 | /* | 286 | /* |
214 | * requeue inode for re-scanning after sb->s_io list is exhausted. | 287 | * requeue inode for re-scanning after bdi->b_io list is exhausted. |
215 | */ | 288 | */ |
216 | static void requeue_io(struct inode *inode) | 289 | static void requeue_io(struct inode *inode) |
217 | { | 290 | { |
218 | list_move(&inode->i_list, &inode->i_sb->s_more_io); | 291 | struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; |
292 | |||
293 | list_move(&inode->i_list, &wb->b_more_io); | ||
219 | } | 294 | } |
220 | 295 | ||
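redirty_tail() above compares timestamps with time_before(), which stays correct when jiffies wraps around. A self-contained illustration of the signed-subtraction trick behind it (mock_time_before is a hypothetical stand-in for the kernel macro):

#include <stdio.h>

static int mock_time_before(unsigned long a, unsigned long b)
{
        /* Signed subtraction keeps the comparison correct across a wrap
         * of the unsigned counter, as long as the two stamps are less
         * than half the counter range apart. */
        return (long)(a - b) < 0;
}

int main(void)
{
        unsigned long near_wrap = (unsigned long)-10;   /* just before wrap */
        unsigned long wrapped   = 5;                    /* just after wrap */

        printf("naive compare:  %d\n", near_wrap < wrapped);           /* 0: wrong */
        printf("signed compare: %d\n", mock_time_before(near_wrap, wrapped)); /* 1 */
        return 0;
}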
221 | static void inode_sync_complete(struct inode *inode) | 296 | static void inode_sync_complete(struct inode *inode) |
@@ -262,20 +337,18 @@ static void move_expired_inodes(struct list_head *delaying_queue, | |||
262 | /* | 337 | /* |
263 | * Queue all expired dirty inodes for io, eldest first. | 338 | * Queue all expired dirty inodes for io, eldest first. |
264 | */ | 339 | */ |
265 | static void queue_io(struct super_block *sb, | 340 | static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this) |
266 | unsigned long *older_than_this) | ||
267 | { | 341 | { |
268 | list_splice_init(&sb->s_more_io, sb->s_io.prev); | 342 | list_splice_init(&wb->b_more_io, wb->b_io.prev); |
269 | move_expired_inodes(&sb->s_dirty, &sb->s_io, older_than_this); | 343 | move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this); |
270 | } | 344 | } |
271 | 345 | ||
272 | int sb_has_dirty_inodes(struct super_block *sb) | 346 | static int write_inode(struct inode *inode, int sync) |
273 | { | 347 | { |
274 | return !list_empty(&sb->s_dirty) || | 348 | if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode)) |
275 | !list_empty(&sb->s_io) || | 349 | return inode->i_sb->s_op->write_inode(inode, sync); |
276 | !list_empty(&sb->s_more_io); | 350 | return 0; |
277 | } | 351 | } |
278 | EXPORT_SYMBOL(sb_has_dirty_inodes); | ||
279 | 352 | ||
280 | /* | 353 | /* |
281 | * Wait for writeback on an inode to complete. | 354 | * Wait for writeback on an inode to complete. |
@@ -322,11 +395,11 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) | |||
322 | if (inode->i_state & I_SYNC) { | 395 | if (inode->i_state & I_SYNC) { |
323 | /* | 396 | /* |
324 | * If this inode is locked for writeback and we are not doing | 397 | * If this inode is locked for writeback and we are not doing |
325 | * writeback-for-data-integrity, move it to s_more_io so that | 398 | * writeback-for-data-integrity, move it to b_more_io so that |
326 | * writeback can proceed with the other inodes on s_io. | 399 | * writeback can proceed with the other inodes on b_io. |
327 | * | 400 | * |
328 | * We'll have another go at writing back this inode when we | 401 | * We'll have another go at writing back this inode when we |
329 | * completed a full scan of s_io. | 402 | * completed a full scan of b_io. |
330 | */ | 403 | */ |
331 | if (!wait) { | 404 | if (!wait) { |
332 | requeue_io(inode); | 405 | requeue_io(inode); |
@@ -371,11 +444,11 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) | |||
371 | /* | 444 | /* |
372 | * We didn't write back all the pages. nfs_writepages() | 445 | * We didn't write back all the pages. nfs_writepages() |
373 | * sometimes bales out without doing anything. Redirty | 446 | * sometimes bales out without doing anything. Redirty |
374 | * the inode; Move it from s_io onto s_more_io/s_dirty. | 447 | * the inode; Move it from b_io onto b_more_io/b_dirty. |
375 | */ | 448 | */ |
376 | /* | 449 | /* |
377 | * akpm: if the caller was the kupdate function we put | 450 | * akpm: if the caller was the kupdate function we put |
378 | * this inode at the head of s_dirty so it gets first | 451 | * this inode at the head of b_dirty so it gets first |
379 | * consideration. Otherwise, move it to the tail, for | 452 | * consideration. Otherwise, move it to the tail, for |
380 | * the reasons described there. I'm not really sure | 453 | * the reasons described there. I'm not really sure |
381 | * how much sense this makes. Presumably I had a good | 454 | * how much sense this makes. Presumably I had a good |
@@ -385,7 +458,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) | |||
385 | if (wbc->for_kupdate) { | 458 | if (wbc->for_kupdate) { |
386 | /* | 459 | /* |
387 | * For the kupdate function we move the inode | 460 | * For the kupdate function we move the inode |
388 | * to s_more_io so it will get more writeout as | 461 | * to b_more_io so it will get more writeout as |
389 | * soon as the queue becomes uncongested. | 462 | * soon as the queue becomes uncongested. |
390 | */ | 463 | */ |
391 | inode->i_state |= I_DIRTY_PAGES; | 464 | inode->i_state |= I_DIRTY_PAGES; |
@@ -434,50 +507,84 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc) | |||
434 | } | 507 | } |
435 | 508 | ||
436 | /* | 509 | /* |
437 | * Write out a superblock's list of dirty inodes. A wait will be performed | 510 | * For WB_SYNC_NONE writeback, the caller does not have the sb pinned |
438 | * upon no inodes, all inodes or the final one, depending upon sync_mode. | 511 | * before calling writeback. So make sure that we do pin it, so it doesn't |
439 | * | 512 | * go away while we are writing inodes from it. |
440 | * If older_than_this is non-NULL, then only write out inodes which | ||
441 | * had their first dirtying at a time earlier than *older_than_this. | ||
442 | * | ||
443 | * If we're a pdflush thread, then implement pdflush collision avoidance | ||
444 | * against the entire list. | ||
445 | * | 513 | * |
446 | * If `bdi' is non-zero then we're being asked to writeback a specific queue. | 514 | * Returns 0 if the super was successfully pinned (or pinning wasn't needed), |
447 | * This function assumes that the blockdev superblock's inodes are backed by | 515 | * 1 if we failed. |
448 | * a variety of queues, so all inodes are searched. For other superblocks, | ||
449 | * assume that all inodes are backed by the same queue. | ||
450 | * | ||
451 | * FIXME: this linear search could get expensive with many filesystems. But | ||
452 | * how to fix? We need to go from an address_space to all inodes which share | ||
453 | * a queue with that address_space. (Easy: have a global "dirty superblocks" | ||
454 | * list). | ||
455 | * | ||
456 | * The inodes to be written are parked on sb->s_io. They are moved back onto | ||
457 | * sb->s_dirty as they are selected for writing. This way, none can be missed | ||
458 | * on the writer throttling path, and we get decent balancing between many | ||
459 | * throttled threads: we don't want them all piling up on inode_sync_wait. | ||
460 | */ | 516 | */ |
461 | void generic_sync_sb_inodes(struct super_block *sb, | 517 | static int pin_sb_for_writeback(struct writeback_control *wbc, |
518 | struct inode *inode) | ||
519 | { | ||
520 | struct super_block *sb = inode->i_sb; | ||
521 | |||
522 | /* | ||
523 | * Caller must already hold the ref for this | ||
524 | */ | ||
525 | if (wbc->sync_mode == WB_SYNC_ALL) { | ||
526 | WARN_ON(!rwsem_is_locked(&sb->s_umount)); | ||
527 | return 0; | ||
528 | } | ||
529 | |||
530 | spin_lock(&sb_lock); | ||
531 | sb->s_count++; | ||
532 | if (down_read_trylock(&sb->s_umount)) { | ||
533 | if (sb->s_root) { | ||
534 | spin_unlock(&sb_lock); | ||
535 | return 0; | ||
536 | } | ||
537 | /* | ||
538 | * umounted, drop rwsem again and fall through to failure | ||
539 | */ | ||
540 | up_read(&sb->s_umount); | ||
541 | } | ||
542 | |||
543 | sb->s_count--; | ||
544 | spin_unlock(&sb_lock); | ||
545 | return 1; | ||
546 | } | ||
547 | |||
548 | static void unpin_sb_for_writeback(struct writeback_control *wbc, | ||
549 | struct inode *inode) | ||
550 | { | ||
551 | struct super_block *sb = inode->i_sb; | ||
552 | |||
553 | if (wbc->sync_mode == WB_SYNC_ALL) | ||
554 | return; | ||
555 | |||
556 | up_read(&sb->s_umount); | ||
557 | put_super(sb); | ||
558 | } | ||
559 | |||
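pin_sb_for_writeback() combines a reference count with a trylock: bump s_count, try to take s_umount shared, and re-check s_root to catch a concurrent umount; any failure backs out cleanly and the caller just requeues the inode. A loose userspace model of that shape (mock_sb, pin and unpin are hypothetical, and the refcount is assumed to be guarded elsewhere, as s_count is by sb_lock):

#include <pthread.h>
#include <stdio.h>

struct mock_sb {
        pthread_rwlock_t umount_lock;   /* s_umount analogue */
        int refcount;                   /* s_count analogue */
        int alive;                      /* s_root != NULL analogue */
};

static int pin(struct mock_sb *sb)
{
        sb->refcount++;
        if (pthread_rwlock_tryrdlock(&sb->umount_lock) == 0) {
                if (sb->alive)
                        return 0;       /* pinned */
                pthread_rwlock_unlock(&sb->umount_lock);
        }
        sb->refcount--;                 /* back out */
        return 1;                       /* failed, caller requeues */
}

static void unpin(struct mock_sb *sb)
{
        pthread_rwlock_unlock(&sb->umount_lock);
        sb->refcount--;
}

int main(void)
{
        struct mock_sb sb = { PTHREAD_RWLOCK_INITIALIZER, 0, 1 };

        if (pin(&sb) == 0) {
                printf("pinned, safe to write inodes\n");
                unpin(&sb);
        }
        return 0;
}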
560 | static void writeback_inodes_wb(struct bdi_writeback *wb, | ||
462 | struct writeback_control *wbc) | 561 | struct writeback_control *wbc) |
463 | { | 562 | { |
563 | struct super_block *sb = wbc->sb; | ||
564 | const int is_blkdev_sb = sb_is_blkdev_sb(sb); | ||
464 | const unsigned long start = jiffies; /* livelock avoidance */ | 565 | const unsigned long start = jiffies; /* livelock avoidance */ |
465 | int sync = wbc->sync_mode == WB_SYNC_ALL; | ||
466 | 566 | ||
467 | spin_lock(&inode_lock); | 567 | spin_lock(&inode_lock); |
468 | if (!wbc->for_kupdate || list_empty(&sb->s_io)) | ||
469 | queue_io(sb, wbc->older_than_this); | ||
470 | 568 | ||
471 | while (!list_empty(&sb->s_io)) { | 569 | if (!wbc->for_kupdate || list_empty(&wb->b_io)) |
472 | struct inode *inode = list_entry(sb->s_io.prev, | 570 | queue_io(wb, wbc->older_than_this); |
571 | |||
572 | while (!list_empty(&wb->b_io)) { | ||
573 | struct inode *inode = list_entry(wb->b_io.prev, | ||
473 | struct inode, i_list); | 574 | struct inode, i_list); |
474 | struct address_space *mapping = inode->i_mapping; | ||
475 | struct backing_dev_info *bdi = mapping->backing_dev_info; | ||
476 | long pages_skipped; | 575 | long pages_skipped; |
477 | 576 | ||
478 | if (!bdi_cap_writeback_dirty(bdi)) { | 577 | /* |
578 | * super block given and doesn't match, skip this inode | ||
579 | */ | ||
580 | if (sb && sb != inode->i_sb) { | ||
581 | redirty_tail(inode); | ||
582 | continue; | ||
583 | } | ||
584 | |||
585 | if (!bdi_cap_writeback_dirty(wb->bdi)) { | ||
479 | redirty_tail(inode); | 586 | redirty_tail(inode); |
480 | if (sb_is_blkdev_sb(sb)) { | 587 | if (is_blkdev_sb) { |
481 | /* | 588 | /* |
482 | * Dirty memory-backed blockdev: the ramdisk | 589 | * Dirty memory-backed blockdev: the ramdisk |
483 | * driver does this. Skip just this inode | 590 | * driver does this. Skip just this inode |
@@ -497,21 +604,14 @@ void generic_sync_sb_inodes(struct super_block *sb, | |||
497 | continue; | 604 | continue; |
498 | } | 605 | } |
499 | 606 | ||
500 | if (wbc->nonblocking && bdi_write_congested(bdi)) { | 607 | if (wbc->nonblocking && bdi_write_congested(wb->bdi)) { |
501 | wbc->encountered_congestion = 1; | 608 | wbc->encountered_congestion = 1; |
502 | if (!sb_is_blkdev_sb(sb)) | 609 | if (!is_blkdev_sb) |
503 | break; /* Skip a congested fs */ | 610 | break; /* Skip a congested fs */ |
504 | requeue_io(inode); | 611 | requeue_io(inode); |
505 | continue; /* Skip a congested blockdev */ | 612 | continue; /* Skip a congested blockdev */ |
506 | } | 613 | } |
507 | 614 | ||
508 | if (wbc->bdi && bdi != wbc->bdi) { | ||
509 | if (!sb_is_blkdev_sb(sb)) | ||
510 | break; /* fs has the wrong queue */ | ||
511 | requeue_io(inode); | ||
512 | continue; /* blockdev has wrong queue */ | ||
513 | } | ||
514 | |||
515 | /* | 615 | /* |
516 | * Was this inode dirtied after sync_sb_inodes was called? | 616 | * Was this inode dirtied after sync_sb_inodes was called? |
517 | * This keeps sync from extra jobs and livelock. | 617 | * This keeps sync from extra jobs and livelock. |
@@ -519,16 +619,16 @@ void generic_sync_sb_inodes(struct super_block *sb, | |||
519 | if (inode_dirtied_after(inode, start)) | 619 | if (inode_dirtied_after(inode, start)) |
520 | break; | 620 | break; |
521 | 621 | ||
522 | /* Is another pdflush already flushing this queue? */ | 622 | if (pin_sb_for_writeback(wbc, inode)) { |
523 | if (current_is_pdflush() && !writeback_acquire(bdi)) | 623 | requeue_io(inode); |
524 | break; | 624 | continue; |
625 | } | ||
525 | 626 | ||
526 | BUG_ON(inode->i_state & (I_FREEING | I_CLEAR)); | 627 | BUG_ON(inode->i_state & (I_FREEING | I_CLEAR)); |
527 | __iget(inode); | 628 | __iget(inode); |
528 | pages_skipped = wbc->pages_skipped; | 629 | pages_skipped = wbc->pages_skipped; |
529 | writeback_single_inode(inode, wbc); | 630 | writeback_single_inode(inode, wbc); |
530 | if (current_is_pdflush()) | 631 | unpin_sb_for_writeback(wbc, inode); |
531 | writeback_release(bdi); | ||
532 | if (wbc->pages_skipped != pages_skipped) { | 632 | if (wbc->pages_skipped != pages_skipped) { |
533 | /* | 633 | /* |
534 | * writeback is not making progress due to locked | 634 | * writeback is not making progress due to locked |
@@ -544,144 +644,520 @@ void generic_sync_sb_inodes(struct super_block *sb, | |||
544 | wbc->more_io = 1; | 644 | wbc->more_io = 1; |
545 | break; | 645 | break; |
546 | } | 646 | } |
547 | if (!list_empty(&sb->s_more_io)) | 647 | if (!list_empty(&wb->b_more_io)) |
548 | wbc->more_io = 1; | 648 | wbc->more_io = 1; |
549 | } | 649 | } |
550 | 650 | ||
551 | if (sync) { | 651 | spin_unlock(&inode_lock); |
552 | struct inode *inode, *old_inode = NULL; | 652 | /* Leave any unwritten inodes on b_io */ |
653 | } | ||
654 | |||
655 | void writeback_inodes_wbc(struct writeback_control *wbc) | ||
656 | { | ||
657 | struct backing_dev_info *bdi = wbc->bdi; | ||
553 | 658 | ||
659 | writeback_inodes_wb(&bdi->wb, wbc); | ||
660 | } | ||
661 | |||
662 | /* | ||
663 | * The maximum number of pages to writeout in a single bdi flush/kupdate | ||
664 | * operation. We do this so we don't hold I_SYNC against an inode for | ||
665 | * enormous amounts of time, which would block a userspace task which has | ||
666 | * been forced to throttle against that inode. Also, the code reevaluates | ||
667 | * the dirty state each time it has written this many pages. | ||
668 | */ | ||
669 | #define MAX_WRITEBACK_PAGES 1024 | ||
670 | |||
671 | static inline bool over_bground_thresh(void) | ||
672 | { | ||
673 | unsigned long background_thresh, dirty_thresh; | ||
674 | |||
675 | get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL); | ||
676 | |||
677 | return (global_page_state(NR_FILE_DIRTY) + | ||
678 | global_page_state(NR_UNSTABLE_NFS) >= background_thresh); | ||
679 | } | ||
680 | |||
681 | /* | ||
682 | * Explicit flushing or periodic writeback of "old" data. | ||
683 | * | ||
684 | * Define "old": the first time one of an inode's pages is dirtied, we mark the | ||
685 | * dirtying-time in the inode's address_space. So this periodic writeback code | ||
686 | * just walks the superblock inode list, writing back any inodes which are | ||
687 | * older than a specific point in time. | ||
688 | * | ||
689 | * Try to run once per dirty_writeback_interval. But if a writeback event | ||
690 | * takes longer than one dirty_writeback_interval, then leave a | ||
691 | * one-second gap. | ||
692 | * | ||
693 | * older_than_this takes precedence over nr_to_write. So we'll only write back | ||
694 | * all dirty pages if they are all attached to "old" mappings. | ||
695 | */ | ||
696 | static long wb_writeback(struct bdi_writeback *wb, | ||
697 | struct wb_writeback_args *args) | ||
698 | { | ||
699 | struct writeback_control wbc = { | ||
700 | .bdi = wb->bdi, | ||
701 | .sb = args->sb, | ||
702 | .sync_mode = args->sync_mode, | ||
703 | .older_than_this = NULL, | ||
704 | .for_kupdate = args->for_kupdate, | ||
705 | .range_cyclic = args->range_cyclic, | ||
706 | }; | ||
707 | unsigned long oldest_jif; | ||
708 | long wrote = 0; | ||
709 | |||
710 | if (wbc.for_kupdate) { | ||
711 | wbc.older_than_this = &oldest_jif; | ||
712 | oldest_jif = jiffies - | ||
713 | msecs_to_jiffies(dirty_expire_interval * 10); | ||
714 | } | ||
715 | if (!wbc.range_cyclic) { | ||
716 | wbc.range_start = 0; | ||
717 | wbc.range_end = LLONG_MAX; | ||
718 | } | ||
719 | |||
720 | for (;;) { | ||
554 | /* | 721 | /* |
555 | * Data integrity sync. Must wait for all pages under writeback, | 722 | * Don't flush anything for non-integrity writeback where |
556 | * because there may have been pages dirtied before our sync | 723 | * no nr_pages was given |
557 | * call, but which had writeout started before we write it out. | ||
558 | * In which case, the inode may not be on the dirty list, but | ||
559 | * we still have to wait for that writeout. | ||
560 | */ | 724 | */ |
561 | list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { | 725 | if (!args->for_kupdate && args->nr_pages <= 0 && |
562 | struct address_space *mapping; | 726 | args->sync_mode == WB_SYNC_NONE) |
727 | break; | ||
563 | 728 | ||
564 | if (inode->i_state & | 729 | /* |
565 | (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW)) | 730 | * If no specific pages were given and this is just a |
566 | continue; | 731 | * periodic background writeout and we are below the |
567 | mapping = inode->i_mapping; | 732 | * background dirty threshold, don't do anything |
568 | if (mapping->nrpages == 0) | 733 | */ |
734 | if (args->for_kupdate && args->nr_pages <= 0 && | ||
735 | !over_bground_thresh()) | ||
736 | break; | ||
737 | |||
738 | wbc.more_io = 0; | ||
739 | wbc.encountered_congestion = 0; | ||
740 | wbc.nr_to_write = MAX_WRITEBACK_PAGES; | ||
741 | wbc.pages_skipped = 0; | ||
742 | writeback_inodes_wb(wb, &wbc); | ||
743 | args->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; | ||
744 | wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write; | ||
745 | |||
746 | /* | ||
747 | * If we ran out of stuff to write, bail unless more_io got set | ||
748 | */ | ||
749 | if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) { | ||
750 | if (wbc.more_io && !wbc.for_kupdate) | ||
569 | continue; | 751 | continue; |
570 | __iget(inode); | 752 | break; |
571 | spin_unlock(&inode_lock); | 753 | } |
572 | /* | 754 | } |
573 | * We hold a reference to 'inode' so it couldn't have | 755 | |
574 | * been removed from s_inodes list while we dropped the | 756 | return wrote; |
575 | * inode_lock. We cannot iput the inode now as we can | 757 | } |
576 | * be holding the last reference and we cannot iput it | 758 | |
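The loop in wb_writeback() above works in MAX_WRITEBACK_PAGES-sized slices so that no single pass holds I_SYNC for too long, re-checking between slices whether any progress was made. A toy userspace rendering of that chunking structure (flush_chunk and the counters are invented for illustration):

#include <stdio.h>

#define MAX_WRITEBACK_PAGES 1024

/* pretend to write back up to 'budget' of the remaining dirty pages */
static long flush_chunk(long budget, long *dirty)
{
        long done = budget < *dirty ? budget : *dirty;

        *dirty -= done;
        return done;
}

int main(void)
{
        long dirty = 2500, wrote = 0, nr_pages = 10000;

        while (nr_pages > 0) {
                long done = flush_chunk(MAX_WRITEBACK_PAGES, &dirty);

                if (done == 0)          /* ran out of work: bail */
                        break;
                wrote += done;
                nr_pages -= MAX_WRITEBACK_PAGES;
        }
        printf("wrote %ld pages in chunks of %d\n", wrote, MAX_WRITEBACK_PAGES);
        return 0;
}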
577 | * under inode_lock. So we keep the reference and iput | 759 | /* |
578 | * it later. | 760 | * Return the next bdi_work struct that hasn't been processed by this |
579 | */ | 761 | * wb thread yet. ->seen is initially set for each thread that exists |
580 | iput(old_inode); | 762 | * for this device; when a thread first notices a piece of work it |
581 | old_inode = inode; | 763 | * clears its bit. Depending on writeback type, the thread will notify |
764 | * completion on either receiving the work (WB_SYNC_NONE) or after | ||
765 | * it is done (WB_SYNC_ALL). | ||
766 | */ | ||
767 | static struct bdi_work *get_next_work_item(struct backing_dev_info *bdi, | ||
768 | struct bdi_writeback *wb) | ||
769 | { | ||
770 | struct bdi_work *work, *ret = NULL; | ||
771 | |||
772 | rcu_read_lock(); | ||
773 | |||
774 | list_for_each_entry_rcu(work, &bdi->work_list, list) { | ||
775 | if (!test_bit(wb->nr, &work->seen)) | ||
776 | continue; | ||
777 | clear_bit(wb->nr, &work->seen); | ||
778 | |||
779 | ret = work; | ||
780 | break; | ||
781 | } | ||
782 | |||
783 | rcu_read_unlock(); | ||
784 | return ret; | ||
785 | } | ||
786 | |||
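get_next_work_item() distributes one work item to many threads through the ->seen bitmask: each thread owns one bit and claims its share of the work by clearing that bit. A compact C11 model of the claim step (claim is a hypothetical name; the kernel pairs this with RCU list traversal):

#include <stdatomic.h>
#include <stdio.h>

static int claim(atomic_ulong *seen, int worker_nr)
{
        unsigned long bit = 1UL << worker_nr;

        /* fetch_and returns the old mask; if our bit was set there, this
         * call is the one that cleared it, so we own this work once. */
        return (atomic_fetch_and(seen, ~bit) & bit) != 0;
}

int main(void)
{
        atomic_ulong seen = 0x3;        /* workers 0 and 1 must see it */

        printf("worker 0 first  claim: %d\n", claim(&seen, 0));    /* 1 */
        printf("worker 0 second claim: %d\n", claim(&seen, 0));    /* 0 */
        printf("worker 1 first  claim: %d\n", claim(&seen, 1));    /* 1 */
        return 0;
}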
787 | static long wb_check_old_data_flush(struct bdi_writeback *wb) | ||
788 | { | ||
789 | unsigned long expired; | ||
790 | long nr_pages; | ||
791 | |||
792 | expired = wb->last_old_flush + | ||
793 | msecs_to_jiffies(dirty_writeback_interval * 10); | ||
794 | if (time_before(jiffies, expired)) | ||
795 | return 0; | ||
796 | |||
797 | wb->last_old_flush = jiffies; | ||
798 | nr_pages = global_page_state(NR_FILE_DIRTY) + | ||
799 | global_page_state(NR_UNSTABLE_NFS) + | ||
800 | (inodes_stat.nr_inodes - inodes_stat.nr_unused); | ||
801 | |||
802 | if (nr_pages) { | ||
803 | struct wb_writeback_args args = { | ||
804 | .nr_pages = nr_pages, | ||
805 | .sync_mode = WB_SYNC_NONE, | ||
806 | .for_kupdate = 1, | ||
807 | .range_cyclic = 1, | ||
808 | }; | ||
809 | |||
810 | return wb_writeback(wb, &args); | ||
811 | } | ||
812 | |||
813 | return 0; | ||
814 | } | ||
815 | |||
816 | /* | ||
817 | * Retrieve work items and do the writeback they describe | ||
818 | */ | ||
819 | long wb_do_writeback(struct bdi_writeback *wb, int force_wait) | ||
820 | { | ||
821 | struct backing_dev_info *bdi = wb->bdi; | ||
822 | struct bdi_work *work; | ||
823 | long wrote = 0; | ||
582 | 824 | ||
583 | filemap_fdatawait(mapping); | 825 | while ((work = get_next_work_item(bdi, wb)) != NULL) { |
826 | struct wb_writeback_args args = work->args; | ||
584 | 827 | ||
585 | cond_resched(); | 828 | /* |
829 | * Override sync mode, in case we must wait for completion | ||
830 | */ | ||
831 | if (force_wait) | ||
832 | work->args.sync_mode = args.sync_mode = WB_SYNC_ALL; | ||
586 | 833 | ||
587 | spin_lock(&inode_lock); | 834 | /* |
835 | * If this isn't a data integrity operation, just notify | ||
836 | * that we have seen this work and we are now starting it. | ||
837 | */ | ||
838 | if (args.sync_mode == WB_SYNC_NONE) | ||
839 | wb_clear_pending(wb, work); | ||
840 | |||
841 | wrote += wb_writeback(wb, &args); | ||
842 | |||
843 | /* | ||
844 | * This is a data integrity writeback, so only do the | ||
845 | * notification when we have completed the work. | ||
846 | */ | ||
847 | if (args.sync_mode == WB_SYNC_ALL) | ||
848 | wb_clear_pending(wb, work); | ||
849 | } | ||
850 | |||
851 | /* | ||
852 | * Check for periodic writeback, kupdated() style | ||
853 | */ | ||
854 | wrote += wb_check_old_data_flush(wb); | ||
855 | |||
856 | return wrote; | ||
857 | } | ||
858 | |||
859 | /* | ||
860 | * Handle writeback of dirty data for the device backed by this bdi. Also | ||
861 | * wakes up periodically and does kupdated style flushing. | ||
862 | */ | ||
863 | int bdi_writeback_task(struct bdi_writeback *wb) | ||
864 | { | ||
865 | unsigned long last_active = jiffies; | ||
866 | unsigned long wait_jiffies = -1UL; | ||
867 | long pages_written; | ||
868 | |||
869 | while (!kthread_should_stop()) { | ||
870 | pages_written = wb_do_writeback(wb, 0); | ||
871 | |||
872 | if (pages_written) | ||
873 | last_active = jiffies; | ||
874 | else if (wait_jiffies != -1UL) { | ||
875 | unsigned long max_idle; | ||
876 | |||
877 | /* | ||
878 | * Longest period of inactivity that we tolerate. If we | ||
879 | * see dirty data again later, the task will get | ||
880 | * recreated automatically. | ||
881 | */ | ||
882 | max_idle = max(5UL * 60 * HZ, wait_jiffies); | ||
883 | if (time_after(jiffies, max_idle + last_active)) | ||
884 | break; | ||
588 | } | 885 | } |
589 | spin_unlock(&inode_lock); | ||
590 | iput(old_inode); | ||
591 | } else | ||
592 | spin_unlock(&inode_lock); | ||
593 | 886 | ||
594 | return; /* Leave any unwritten inodes on s_io */ | 887 | wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10); |
888 | schedule_timeout_interruptible(wait_jiffies); | ||
889 | try_to_freeze(); | ||
890 | } | ||
891 | |||
892 | return 0; | ||
595 | } | 893 | } |
596 | EXPORT_SYMBOL_GPL(generic_sync_sb_inodes); | ||
597 | 894 | ||
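bdi_writeback_task() implements a lazy exit policy: remember when the thread last wrote anything and quit after max(5 minutes, one writeback interval) of idleness, trusting that the thread gets recreated when dirty data reappears. A skeletal userspace version of that loop, with seconds standing in for jiffies and the demo cut short after a few passes:

#include <stdio.h>
#include <time.h>

#define MAX_IDLE_SECS (5 * 60)

int main(void)
{
        time_t last_active = time(NULL);
        int iterations = 0;

        for (;;) {
                int pages_written = 0;          /* pretend: no work found */

                if (pages_written)
                        last_active = time(NULL);
                else if (time(NULL) - last_active > MAX_IDLE_SECS)
                        break;                  /* idle too long: exit */

                if (++iterations == 3)          /* keep the demo finite */
                        break;
                /* a real loop would sleep one writeback interval here */
        }
        printf("worker exiting after %d idle passes\n", iterations);
        return 0;
}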
598 | static void sync_sb_inodes(struct super_block *sb, | 895 | /* |
599 | struct writeback_control *wbc) | 896 | * Schedule writeback for all backing devices. This does WB_SYNC_NONE |
897 | * writeback; for integrity writeback see bdi_sync_writeback(). | ||
898 | */ | ||
899 | static void bdi_writeback_all(struct super_block *sb, long nr_pages) | ||
600 | { | 900 | { |
601 | generic_sync_sb_inodes(sb, wbc); | 901 | struct wb_writeback_args args = { |
902 | .sb = sb, | ||
903 | .nr_pages = nr_pages, | ||
904 | .sync_mode = WB_SYNC_NONE, | ||
905 | }; | ||
906 | struct backing_dev_info *bdi; | ||
907 | |||
908 | rcu_read_lock(); | ||
909 | |||
910 | list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { | ||
911 | if (!bdi_has_dirty_io(bdi)) | ||
912 | continue; | ||
913 | |||
914 | bdi_alloc_queue_work(bdi, &args); | ||
915 | } | ||
916 | |||
917 | rcu_read_unlock(); | ||
602 | } | 918 | } |
603 | 919 | ||
604 | /* | 920 | /* |
605 | * Start writeback of dirty pagecache data against all unlocked inodes. | 921 | * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back |
922 | * the whole world. | ||
923 | */ | ||
924 | void wakeup_flusher_threads(long nr_pages) | ||
925 | { | ||
926 | if (nr_pages == 0) | ||
927 | nr_pages = global_page_state(NR_FILE_DIRTY) + | ||
928 | global_page_state(NR_UNSTABLE_NFS); | ||
929 | bdi_writeback_all(NULL, nr_pages); | ||
930 | } | ||
931 | |||
932 | static noinline void block_dump___mark_inode_dirty(struct inode *inode) | ||
933 | { | ||
934 | if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) { | ||
935 | struct dentry *dentry; | ||
936 | const char *name = "?"; | ||
937 | |||
938 | dentry = d_find_alias(inode); | ||
939 | if (dentry) { | ||
940 | spin_lock(&dentry->d_lock); | ||
941 | name = (const char *) dentry->d_name.name; | ||
942 | } | ||
943 | printk(KERN_DEBUG | ||
944 | "%s(%d): dirtied inode %lu (%s) on %s\n", | ||
945 | current->comm, task_pid_nr(current), inode->i_ino, | ||
946 | name, inode->i_sb->s_id); | ||
947 | if (dentry) { | ||
948 | spin_unlock(&dentry->d_lock); | ||
949 | dput(dentry); | ||
950 | } | ||
951 | } | ||
952 | } | ||
953 | |||
954 | /** | ||
955 | * __mark_inode_dirty - internal function | ||
956 | * @inode: inode to mark | ||
957 | * @flags: what kind of dirty (i.e. I_DIRTY_SYNC) | ||
958 | * Mark an inode as dirty. Callers should use mark_inode_dirty or | ||
959 | * mark_inode_dirty_sync. | ||
960 | * | ||
961 | * Put the inode on the super block's dirty list. | ||
606 | * | 962 | * |
607 | * Note: | 963 | * CAREFUL! We mark it dirty unconditionally, but move it onto the |
608 | * We don't need to grab a reference to superblock here. If it has non-empty | 964 | * dirty list only if it is hashed or if it refers to a blockdev. |
609 | * ->s_dirty it hadn't been killed yet and kill_super() won't proceed | 965 | * If it was not hashed, it will never be added to the dirty list |
610 | * past sync_inodes_sb() until the ->s_dirty/s_io/s_more_io lists are all | 966 | * even if it is later hashed, as it will have been marked dirty already. |
611 | * empty. Since __sync_single_inode() regains inode_lock before it finally moves | ||
612 | * inode from superblock lists we are OK. | ||
613 | * | 967 | * |
614 | * If `older_than_this' is non-zero then only flush inodes which have a | 968 | * In short, make sure you hash any inodes _before_ you start marking |
615 | * flushtime older than *older_than_this. | 969 | * them dirty. |
616 | * | 970 | * |
617 | * If `bdi' is non-zero then we will scan the first inode against each | 971 | * This function *must* be atomic for the I_DIRTY_PAGES case - |
618 | * superblock until we find the matching ones. One group will be the dirty | 972 | * set_page_dirty() is called under spinlock in several places. |
619 | * inodes against a filesystem. Then when we hit the dummy blockdev superblock, | 973 | * |
620 | * sync_sb_inodes will seek out the blockdev which matches `bdi'. Maybe not | 974 | * Note that for blockdevs, inode->dirtied_when represents the dirtying time of |
621 | * super-efficient but we're about to do a ton of I/O... | 975 | * the block-special inode (/dev/hda1) itself. And the ->dirtied_when field of |
976 | * the kernel-internal blockdev inode represents the dirtying time of the | ||
977 | * blockdev's pages. This is why for I_DIRTY_PAGES we always use | ||
978 | * page->mapping->host, so the page-dirtying time is recorded in the internal | ||
979 | * blockdev inode. | ||
622 | */ | 980 | */ |
623 | void | 981 | void __mark_inode_dirty(struct inode *inode, int flags) |
624 | writeback_inodes(struct writeback_control *wbc) | ||
625 | { | 982 | { |
626 | struct super_block *sb; | 983 | struct super_block *sb = inode->i_sb; |
627 | 984 | ||
628 | might_sleep(); | 985 | /* |
629 | spin_lock(&sb_lock); | 986 | * Don't do this for I_DIRTY_PAGES - that doesn't actually |
630 | restart: | 987 | * dirty the inode itself |
631 | list_for_each_entry_reverse(sb, &super_blocks, s_list) { | 988 | */ |
632 | if (sb_has_dirty_inodes(sb)) { | 989 | if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) { |
633 | /* we're making our own get_super here */ | 990 | if (sb->s_op->dirty_inode) |
634 | sb->s_count++; | 991 | sb->s_op->dirty_inode(inode); |
635 | spin_unlock(&sb_lock); | 992 | } |
636 | /* | 993 | |
637 | * If we can't get the readlock, there's no sense in | 994 | /* |
638 | * waiting around, most of the time the FS is going to | 995 | * make sure that changes are seen by all cpus before we test i_state |
639 | * be unmounted by the time it is released. | 996 | * -- mikulas |
640 | */ | 997 | */ |
641 | if (down_read_trylock(&sb->s_umount)) { | 998 | smp_mb(); |
642 | if (sb->s_root) | 999 | |
643 | sync_sb_inodes(sb, wbc); | 1000 | /* avoid the locking if we can */ |
644 | up_read(&sb->s_umount); | 1001 | if ((inode->i_state & flags) == flags) |
1002 | return; | ||
1003 | |||
1004 | if (unlikely(block_dump)) | ||
1005 | block_dump___mark_inode_dirty(inode); | ||
1006 | |||
1007 | spin_lock(&inode_lock); | ||
1008 | if ((inode->i_state & flags) != flags) { | ||
1009 | const int was_dirty = inode->i_state & I_DIRTY; | ||
1010 | |||
1011 | inode->i_state |= flags; | ||
1012 | |||
1013 | /* | ||
1014 | * If the inode is being synced, just update its dirty state. | ||
1015 | * The unlocker will place the inode on the appropriate | ||
1016 | * superblock list, based upon its state. | ||
1017 | */ | ||
1018 | if (inode->i_state & I_SYNC) | ||
1019 | goto out; | ||
1020 | |||
1021 | /* | ||
1022 | * Only add valid (hashed) inodes to the superblock's | ||
1023 | * dirty list. Add blockdev inodes as well. | ||
1024 | */ | ||
1025 | if (!S_ISBLK(inode->i_mode)) { | ||
1026 | if (hlist_unhashed(&inode->i_hash)) | ||
1027 | goto out; | ||
1028 | } | ||
1029 | if (inode->i_state & (I_FREEING|I_CLEAR)) | ||
1030 | goto out; | ||
1031 | |||
1032 | /* | ||
1033 | * If the inode was already on b_dirty/b_io/b_more_io, don't | ||
1034 | * reposition it (that would break b_dirty time-ordering). | ||
1035 | */ | ||
1036 | if (!was_dirty) { | ||
1037 | struct bdi_writeback *wb = &inode_to_bdi(inode)->wb; | ||
1038 | struct backing_dev_info *bdi = wb->bdi; | ||
1039 | |||
1040 | if (bdi_cap_writeback_dirty(bdi) && | ||
1041 | !test_bit(BDI_registered, &bdi->state)) { | ||
1042 | WARN_ON(1); | ||
1043 | printk(KERN_ERR "bdi-%s not registered\n", | ||
1044 | bdi->name); | ||
645 | } | 1045 | } |
646 | spin_lock(&sb_lock); | 1046 | |
647 | if (__put_super_and_need_restart(sb)) | 1047 | inode->dirtied_when = jiffies; |
648 | goto restart; | 1048 | list_move(&inode->i_list, &wb->b_dirty); |
649 | } | 1049 | } |
650 | if (wbc->nr_to_write <= 0) | ||
651 | break; | ||
652 | } | 1050 | } |
653 | spin_unlock(&sb_lock); | 1051 | out: |
1052 | spin_unlock(&inode_lock); | ||
654 | } | 1053 | } |
1054 | EXPORT_SYMBOL(__mark_inode_dirty); | ||
655 | 1055 | ||
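__mark_inode_dirty() is built around a check-lock-recheck fast path: test i_state without inode_lock first (after smp_mb() orders the page-dirty stores), and only take the lock when the cheap test says the flags may need updating. A userspace sketch of that pattern using C11 atomics and a mutex in place of smp_mb() and inode_lock (mark_dirty is hypothetical):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static atomic_int state;

static void mark_dirty(int flags)
{
        /* fast path: already fully marked, avoid the lock entirely */
        if ((atomic_load(&state) & flags) == flags)
                return;

        pthread_mutex_lock(&lock);
        if ((atomic_load(&state) & flags) != flags) {   /* re-test */
                atomic_fetch_or(&state, flags);
                printf("marked 0x%x under lock\n", flags);
        }
        pthread_mutex_unlock(&lock);
}

int main(void)
{
        mark_dirty(0x1);        /* slow path, takes the lock */
        mark_dirty(0x1);        /* fast path, returns immediately */
        return 0;
}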
656 | /* | 1056 | /* |
657 | * writeback and wait upon the filesystem's dirty inodes. The caller will | 1057 | * Write out a superblock's list of dirty inodes. A wait will be performed |
658 | * do this in two passes - one to write, and one to wait. | 1058 | * upon no inodes, all inodes or the final one, depending upon sync_mode. |
1059 | * | ||
1060 | * If older_than_this is non-NULL, then only write out inodes which | ||
1061 | * had their first dirtying at a time earlier than *older_than_this. | ||
1062 | * | ||
1063 | * If we're a pdflush thread, then implement pdflush collision avoidance | ||
1064 | * against the entire list. | ||
659 | * | 1065 | * |
660 | * A finite limit is set on the number of pages which will be written. | 1066 | * If `bdi' is non-zero then we're being asked to writeback a specific queue. |
661 | * To prevent infinite livelock of sys_sync(). | 1067 | * This function assumes that the blockdev superblock's inodes are backed by |
1068 | * a variety of queues, so all inodes are searched. For other superblocks, | ||
1069 | * assume that all inodes are backed by the same queue. | ||
662 | * | 1070 | * |
663 | * We add in the number of potentially dirty inodes, because each inode write | 1071 | * The inodes to be written are parked on bdi->b_io. They are moved back onto |
664 | * can dirty pagecache in the underlying blockdev. | 1072 | * bdi->b_dirty as they are selected for writing. This way, none can be missed |
1073 | * on the writer throttling path, and we get decent balancing between many | ||
1074 | * throttled threads: we don't want them all piling up on inode_sync_wait. | ||
665 | */ | 1075 | */ |
666 | void sync_inodes_sb(struct super_block *sb, int wait) | 1076 | static void wait_sb_inodes(struct super_block *sb) |
667 | { | 1077 | { |
668 | struct writeback_control wbc = { | 1078 | struct inode *inode, *old_inode = NULL; |
669 | .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE, | 1079 | |
670 | .range_start = 0, | 1080 | /* |
671 | .range_end = LLONG_MAX, | 1081 | * We need to be protected against the filesystem going from |
672 | }; | 1082 | * r/o to r/w or vice versa. |
1083 | */ | ||
1084 | WARN_ON(!rwsem_is_locked(&sb->s_umount)); | ||
1085 | |||
1086 | spin_lock(&inode_lock); | ||
1087 | |||
1088 | /* | ||
1089 | * Data integrity sync. Must wait for all pages under writeback, | ||
1090 | * because there may have been pages dirtied before our sync | ||
1091 | * call whose writeout had already started before we got to them. | ||
1092 | * In that case the inode may not be on the dirty list, but | ||
1093 | * we still have to wait for that writeout. | ||
1094 | */ | ||
1095 | list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { | ||
1096 | struct address_space *mapping; | ||
1097 | |||
1098 | if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW)) | ||
1099 | continue; | ||
1100 | mapping = inode->i_mapping; | ||
1101 | if (mapping->nrpages == 0) | ||
1102 | continue; | ||
1103 | __iget(inode); | ||
1104 | spin_unlock(&inode_lock); | ||
1105 | /* | ||
1106 | * We hold a reference to 'inode' so it couldn't have | ||
1107 | * been removed from s_inodes list while we dropped the | ||
1108 | * inode_lock. We cannot iput the inode now as we can | ||
1109 | * be holding the last reference and we cannot iput it | ||
1110 | * under inode_lock. So we keep the reference and iput | ||
1111 | * it later. | ||
1112 | */ | ||
1113 | iput(old_inode); | ||
1114 | old_inode = inode; | ||
1115 | |||
1116 | filemap_fdatawait(mapping); | ||
1117 | |||
1118 | cond_resched(); | ||
673 | 1119 | ||
674 | if (!wait) { | 1120 | spin_lock(&inode_lock); |
675 | unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY); | 1121 | } |
676 | unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS); | 1122 | spin_unlock(&inode_lock); |
1123 | iput(old_inode); | ||
1124 | } | ||
677 | 1125 | ||
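wait_sb_inodes() uses a deferred-release trick: iput() may drop the last reference and must not run under inode_lock, so each inode is held one iteration longer and released only after the lock has been dropped again. A simplified userspace model of holding the previous object across iterations (all names hypothetical; refs counts only the walker's own reference):

#include <stdio.h>

struct obj { int id; int refs; };

static void get_obj(struct obj *o) { o->refs++; }

static void put_obj(struct obj *o)
{
        if (--o->refs == 0)
                printf("obj %d released outside the lock\n", o->id);
}

int main(void)
{
        struct obj objs[3] = { {0, 0}, {1, 0}, {2, 0} };
        struct obj *old = NULL;
        int i;

        for (i = 0; i < 3; i++) {
                get_obj(&objs[i]);      /* __iget() analogue, under "lock" */
                /* ...drop the lock, wait for this object's IO, relock... */
                if (old)
                        put_obj(old);   /* safe: the lock is not held here */
                old = &objs[i];
        }
        if (old)
                put_obj(old);           /* release the final object */
        return 0;
}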
678 | wbc.nr_to_write = nr_dirty + nr_unstable + | 1126 | /** |
1127 | * writeback_inodes_sb - writeback dirty inodes from given super_block | ||
1128 | * @sb: the superblock | ||
1129 | * | ||
1130 | * Start writeback on some inodes on this super_block. No guarantees are made | ||
1131 | * on how many (if any) will be written, and this function does not wait | ||
1132 | * for IO completion of the submitted IO; it kicks the flusher threads | ||
1133 | * and returns immediately. | ||
1134 | */ | ||
1135 | void writeback_inodes_sb(struct super_block *sb) | ||
1136 | { | ||
1137 | unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY); | ||
1138 | unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS); | ||
1139 | long nr_to_write; | ||
1140 | |||
1141 | nr_to_write = nr_dirty + nr_unstable + | ||
679 | (inodes_stat.nr_inodes - inodes_stat.nr_unused); | 1142 | (inodes_stat.nr_inodes - inodes_stat.nr_unused); |
680 | } else | ||
681 | wbc.nr_to_write = LONG_MAX; /* doesn't actually matter */ | ||
682 | 1143 | ||
683 | sync_sb_inodes(sb, &wbc); | 1144 | bdi_writeback_all(sb, nr_to_write); |
1145 | } | ||
1146 | EXPORT_SYMBOL(writeback_inodes_sb); | ||
1147 | |||
1148 | /** | ||
1149 | * sync_inodes_sb - sync sb inode pages | ||
1150 | * @sb: the superblock | ||
1151 | * | ||
1152 | * This function writes and waits on any dirty inode belonging to this | ||
1153 | * super_block, and it does not return until that IO has completed. | ||
1154 | */ | ||
1155 | void sync_inodes_sb(struct super_block *sb) | ||
1156 | { | ||
1157 | bdi_sync_writeback(sb->s_bdi, sb); | ||
1158 | wait_sb_inodes(sb); | ||
684 | } | 1159 | } |
1160 | EXPORT_SYMBOL(sync_inodes_sb); | ||
685 | 1161 | ||
686 | /** | 1162 | /** |
687 | * write_inode_now - write an inode to disk | 1163 | * write_inode_now - write an inode to disk |
@@ -737,57 +1213,3 @@ int sync_inode(struct inode *inode, struct writeback_control *wbc) | |||
737 | return ret; | 1213 | return ret; |
738 | } | 1214 | } |
739 | EXPORT_SYMBOL(sync_inode); | 1215 | EXPORT_SYMBOL(sync_inode); |
740 | |||
741 | /** | ||
742 | * generic_osync_inode - flush all dirty data for a given inode to disk | ||
743 | * @inode: inode to write | ||
744 | * @mapping: the address_space that should be flushed | ||
745 | * @what: what to write and wait upon | ||
746 | * | ||
747 | * This can be called by file_write functions for files which have the | ||
748 | * O_SYNC flag set, to flush dirty writes to disk. | ||
749 | * | ||
750 | * @what is a bitmask, specifying which part of the inode's data should be | ||
751 | * written and waited upon. | ||
752 | * | ||
753 | * OSYNC_DATA: i_mapping's dirty data | ||
754 | * OSYNC_METADATA: the buffers at i_mapping->private_list | ||
755 | * OSYNC_INODE: the inode itself | ||
756 | */ | ||
757 | |||
758 | int generic_osync_inode(struct inode *inode, struct address_space *mapping, int what) | ||
759 | { | ||
760 | int err = 0; | ||
761 | int need_write_inode_now = 0; | ||
762 | int err2; | ||
763 | |||
764 | if (what & OSYNC_DATA) | ||
765 | err = filemap_fdatawrite(mapping); | ||
766 | if (what & (OSYNC_METADATA|OSYNC_DATA)) { | ||
767 | err2 = sync_mapping_buffers(mapping); | ||
768 | if (!err) | ||
769 | err = err2; | ||
770 | } | ||
771 | if (what & OSYNC_DATA) { | ||
772 | err2 = filemap_fdatawait(mapping); | ||
773 | if (!err) | ||
774 | err = err2; | ||
775 | } | ||
776 | |||
777 | spin_lock(&inode_lock); | ||
778 | if ((inode->i_state & I_DIRTY) && | ||
779 | ((what & OSYNC_INODE) || (inode->i_state & I_DIRTY_DATASYNC))) | ||
780 | need_write_inode_now = 1; | ||
781 | spin_unlock(&inode_lock); | ||
782 | |||
783 | if (need_write_inode_now) { | ||
784 | err2 = write_inode_now(inode, 1); | ||
785 | if (!err) | ||
786 | err = err2; | ||
787 | } | ||
788 | else | ||
789 | inode_sync_wait(inode); | ||
790 | |||
791 | return err; | ||
792 | } | ||
793 | EXPORT_SYMBOL(generic_osync_inode); | ||
diff --git a/fs/fuse/control.c b/fs/fuse/control.c index 99c99dfb0373..3773fd63d2f9 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c | |||
@@ -61,6 +61,121 @@ static ssize_t fuse_conn_waiting_read(struct file *file, char __user *buf, | |||
61 | return simple_read_from_buffer(buf, len, ppos, tmp, size); | 61 | return simple_read_from_buffer(buf, len, ppos, tmp, size); |
62 | } | 62 | } |
63 | 63 | ||
64 | static ssize_t fuse_conn_limit_read(struct file *file, char __user *buf, | ||
65 | size_t len, loff_t *ppos, unsigned val) | ||
66 | { | ||
67 | char tmp[32]; | ||
68 | size_t size = sprintf(tmp, "%u\n", val); | ||
69 | |||
70 | return simple_read_from_buffer(buf, len, ppos, tmp, size); | ||
71 | } | ||
72 | |||
73 | static ssize_t fuse_conn_limit_write(struct file *file, const char __user *buf, | ||
74 | size_t count, loff_t *ppos, unsigned *val, | ||
75 | unsigned global_limit) | ||
76 | { | ||
77 | unsigned long t; | ||
78 | char tmp[32]; | ||
79 | unsigned limit = (1 << 16) - 1; | ||
80 | int err; | ||
81 | |||
82 | if (*ppos || count >= sizeof(tmp) - 1) | ||
83 | return -EINVAL; | ||
84 | |||
85 | if (copy_from_user(tmp, buf, count)) | ||
86 | return -EINVAL; | ||
87 | |||
88 | tmp[count] = '\0'; | ||
89 | |||
90 | err = strict_strtoul(tmp, 0, &t); | ||
91 | if (err) | ||
92 | return err; | ||
93 | |||
94 | if (!capable(CAP_SYS_ADMIN)) | ||
95 | limit = min(limit, global_limit); | ||
96 | |||
97 | if (t > limit) | ||
98 | return -EINVAL; | ||
99 | |||
100 | *val = t; | ||
101 | |||
102 | return count; | ||
103 | } | ||
104 | |||
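fuse_conn_limit_write() above is a careful sysfs-style parser: copy at most a small buffer, NUL-terminate it, parse strictly, then clamp the accepted range harder for unprivileged writers. A userspace approximation of the same validation flow (parse_limit is hypothetical; strtoul stands in for strict_strtoul, and the privileged flag for capable(CAP_SYS_ADMIN)):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

static int parse_limit(const char *buf, unsigned *val,
                       int privileged, unsigned long global_limit)
{
        unsigned long limit = (1UL << 16) - 1;
        unsigned long t;
        char *end;

        errno = 0;
        t = strtoul(buf, &end, 0);
        if (errno || end == buf || (*end && *end != '\n'))
                return -EINVAL;         /* reject trailing garbage */

        if (!privileged && global_limit < limit)
                limit = global_limit;   /* unprivileged writers get capped */
        if (t > limit)
                return -EINVAL;

        *val = (unsigned)t;
        return 0;
}

int main(void)
{
        unsigned val;

        printf("root, 100: %d\n", parse_limit("100", &val, 1, 12));   /*   0 */
        printf("user, 100: %d\n", parse_limit("100", &val, 0, 12));   /* -22 */
        printf("user, 10:  %d\n", parse_limit("10", &val, 0, 12));    /*   0 */
        return 0;
}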
105 | static ssize_t fuse_conn_max_background_read(struct file *file, | ||
106 | char __user *buf, size_t len, | ||
107 | loff_t *ppos) | ||
108 | { | ||
109 | struct fuse_conn *fc; | ||
110 | unsigned val; | ||
111 | |||
112 | fc = fuse_ctl_file_conn_get(file); | ||
113 | if (!fc) | ||
114 | return 0; | ||
115 | |||
116 | val = fc->max_background; | ||
117 | fuse_conn_put(fc); | ||
118 | |||
119 | return fuse_conn_limit_read(file, buf, len, ppos, val); | ||
120 | } | ||
121 | |||
122 | static ssize_t fuse_conn_max_background_write(struct file *file, | ||
123 | const char __user *buf, | ||
124 | size_t count, loff_t *ppos) | ||
125 | { | ||
126 | unsigned val; | ||
127 | ssize_t ret; | ||
128 | |||
129 | ret = fuse_conn_limit_write(file, buf, count, ppos, &val, | ||
130 | max_user_bgreq); | ||
131 | if (ret > 0) { | ||
132 | struct fuse_conn *fc = fuse_ctl_file_conn_get(file); | ||
133 | if (fc) { | ||
134 | fc->max_background = val; | ||
135 | fuse_conn_put(fc); | ||
136 | } | ||
137 | } | ||
138 | |||
139 | return ret; | ||
140 | } | ||
141 | |||
142 | static ssize_t fuse_conn_congestion_threshold_read(struct file *file, | ||
143 | char __user *buf, size_t len, | ||
144 | loff_t *ppos) | ||
145 | { | ||
146 | struct fuse_conn *fc; | ||
147 | unsigned val; | ||
148 | |||
149 | fc = fuse_ctl_file_conn_get(file); | ||
150 | if (!fc) | ||
151 | return 0; | ||
152 | |||
153 | val = fc->congestion_threshold; | ||
154 | fuse_conn_put(fc); | ||
155 | |||
156 | return fuse_conn_limit_read(file, buf, len, ppos, val); | ||
157 | } | ||
158 | |||
159 | static ssize_t fuse_conn_congestion_threshold_write(struct file *file, | ||
160 | const char __user *buf, | ||
161 | size_t count, loff_t *ppos) | ||
162 | { | ||
163 | unsigned val; | ||
164 | ssize_t ret; | ||
165 | |||
166 | ret = fuse_conn_limit_write(file, buf, count, ppos, &val, | ||
167 | max_user_congthresh); | ||
168 | if (ret > 0) { | ||
169 | struct fuse_conn *fc = fuse_ctl_file_conn_get(file); | ||
170 | if (fc) { | ||
171 | fc->congestion_threshold = val; | ||
172 | fuse_conn_put(fc); | ||
173 | } | ||
174 | } | ||
175 | |||
176 | return ret; | ||
177 | } | ||
178 | |||
64 | static const struct file_operations fuse_ctl_abort_ops = { | 179 | static const struct file_operations fuse_ctl_abort_ops = { |
65 | .open = nonseekable_open, | 180 | .open = nonseekable_open, |
66 | .write = fuse_conn_abort_write, | 181 | .write = fuse_conn_abort_write, |
@@ -71,6 +186,18 @@ static const struct file_operations fuse_ctl_waiting_ops = { | |||
71 | .read = fuse_conn_waiting_read, | 186 | .read = fuse_conn_waiting_read, |
72 | }; | 187 | }; |
73 | 188 | ||
189 | static const struct file_operations fuse_conn_max_background_ops = { | ||
190 | .open = nonseekable_open, | ||
191 | .read = fuse_conn_max_background_read, | ||
192 | .write = fuse_conn_max_background_write, | ||
193 | }; | ||
194 | |||
195 | static const struct file_operations fuse_conn_congestion_threshold_ops = { | ||
196 | .open = nonseekable_open, | ||
197 | .read = fuse_conn_congestion_threshold_read, | ||
198 | .write = fuse_conn_congestion_threshold_write, | ||
199 | }; | ||
200 | |||
74 | static struct dentry *fuse_ctl_add_dentry(struct dentry *parent, | 201 | static struct dentry *fuse_ctl_add_dentry(struct dentry *parent, |
75 | struct fuse_conn *fc, | 202 | struct fuse_conn *fc, |
76 | const char *name, | 203 | const char *name, |
@@ -127,9 +254,14 @@ int fuse_ctl_add_conn(struct fuse_conn *fc) | |||
127 | goto err; | 254 | goto err; |
128 | 255 | ||
129 | if (!fuse_ctl_add_dentry(parent, fc, "waiting", S_IFREG | 0400, 1, | 256 | if (!fuse_ctl_add_dentry(parent, fc, "waiting", S_IFREG | 0400, 1, |
130 | NULL, &fuse_ctl_waiting_ops) || | 257 | NULL, &fuse_ctl_waiting_ops) || |
131 | !fuse_ctl_add_dentry(parent, fc, "abort", S_IFREG | 0200, 1, | 258 | !fuse_ctl_add_dentry(parent, fc, "abort", S_IFREG | 0200, 1, |
132 | NULL, &fuse_ctl_abort_ops)) | 259 | NULL, &fuse_ctl_abort_ops) || |
260 | !fuse_ctl_add_dentry(parent, fc, "max_background", S_IFREG | 0600, | ||
261 | 1, NULL, &fuse_conn_max_background_ops) || | ||
262 | !fuse_ctl_add_dentry(parent, fc, "congestion_threshold", | ||
263 | S_IFREG | 0600, 1, NULL, | ||
264 | &fuse_conn_congestion_threshold_ops)) | ||
133 | goto err; | 265 | goto err; |
134 | 266 | ||
135 | return 0; | 267 | return 0; |
@@ -156,7 +288,7 @@ void fuse_ctl_remove_conn(struct fuse_conn *fc) | |||
156 | d_drop(dentry); | 288 | d_drop(dentry); |
157 | dput(dentry); | 289 | dput(dentry); |
158 | } | 290 | } |
159 | fuse_control_sb->s_root->d_inode->i_nlink--; | 291 | drop_nlink(fuse_control_sb->s_root->d_inode); |
160 | } | 292 | } |
161 | 293 | ||
162 | static int fuse_ctl_fill_super(struct super_block *sb, void *data, int silent) | 294 | static int fuse_ctl_fill_super(struct super_block *sb, void *data, int silent) |
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 6484eb75acd6..51d9e33d634f 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c | |||
@@ -250,7 +250,7 @@ static void queue_request(struct fuse_conn *fc, struct fuse_req *req) | |||
250 | 250 | ||
251 | static void flush_bg_queue(struct fuse_conn *fc) | 251 | static void flush_bg_queue(struct fuse_conn *fc) |
252 | { | 252 | { |
253 | while (fc->active_background < FUSE_MAX_BACKGROUND && | 253 | while (fc->active_background < fc->max_background && |
254 | !list_empty(&fc->bg_queue)) { | 254 | !list_empty(&fc->bg_queue)) { |
255 | struct fuse_req *req; | 255 | struct fuse_req *req; |
256 | 256 | ||
@@ -280,11 +280,11 @@ __releases(&fc->lock) | |||
280 | list_del(&req->intr_entry); | 280 | list_del(&req->intr_entry); |
281 | req->state = FUSE_REQ_FINISHED; | 281 | req->state = FUSE_REQ_FINISHED; |
282 | if (req->background) { | 282 | if (req->background) { |
283 | if (fc->num_background == FUSE_MAX_BACKGROUND) { | 283 | if (fc->num_background == fc->max_background) { |
284 | fc->blocked = 0; | 284 | fc->blocked = 0; |
285 | wake_up_all(&fc->blocked_waitq); | 285 | wake_up_all(&fc->blocked_waitq); |
286 | } | 286 | } |
287 | if (fc->num_background == FUSE_CONGESTION_THRESHOLD && | 287 | if (fc->num_background == fc->congestion_threshold && |
288 | fc->connected && fc->bdi_initialized) { | 288 | fc->connected && fc->bdi_initialized) { |
289 | clear_bdi_congested(&fc->bdi, BLK_RW_SYNC); | 289 | clear_bdi_congested(&fc->bdi, BLK_RW_SYNC); |
290 | clear_bdi_congested(&fc->bdi, BLK_RW_ASYNC); | 290 | clear_bdi_congested(&fc->bdi, BLK_RW_ASYNC); |
@@ -410,9 +410,9 @@ static void fuse_request_send_nowait_locked(struct fuse_conn *fc, | |||
410 | { | 410 | { |
411 | req->background = 1; | 411 | req->background = 1; |
412 | fc->num_background++; | 412 | fc->num_background++; |
413 | if (fc->num_background == FUSE_MAX_BACKGROUND) | 413 | if (fc->num_background == fc->max_background) |
414 | fc->blocked = 1; | 414 | fc->blocked = 1; |
415 | if (fc->num_background == FUSE_CONGESTION_THRESHOLD && | 415 | if (fc->num_background == fc->congestion_threshold && |
416 | fc->bdi_initialized) { | 416 | fc->bdi_initialized) { |
417 | set_bdi_congested(&fc->bdi, BLK_RW_SYNC); | 417 | set_bdi_congested(&fc->bdi, BLK_RW_SYNC); |
418 | set_bdi_congested(&fc->bdi, BLK_RW_ASYNC); | 418 | set_bdi_congested(&fc->bdi, BLK_RW_ASYNC); |
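The dev.c hunks above swap the old FUSE_MAX_BACKGROUND / FUSE_CONGESTION_THRESHOLD constants for the new per-connection fields. A stand-alone model of the accounting, with stub types in place of struct fuse_conn (a sketch, not kernel code):

    #include <stdio.h>

    struct conn {
        unsigned max_background;        /* was FUSE_MAX_BACKGROUND (12) */
        unsigned congestion_threshold;  /* was 75% of the maximum */
        unsigned num_background;
        int blocked, congested;
    };

    /* fuse_request_send_nowait_locked(): count first, then compare */
    static void bg_start(struct conn *c)
    {
        c->num_background++;
        if (c->num_background == c->max_background)
            c->blocked = 1;             /* new bg requests must wait */
        if (c->num_background == c->congestion_threshold)
            c->congested = 1;           /* set_bdi_congested() in the kernel */
    }

    /* request_end(): compare first, then count down */
    static void bg_end(struct conn *c)
    {
        if (c->num_background == c->max_background)
            c->blocked = 0;             /* wake_up_all(&fc->blocked_waitq) */
        if (c->num_background == c->congestion_threshold)
            c->congested = 0;           /* clear_bdi_congested() */
        c->num_background--;
    }

    int main(void)
    {
        struct conn c = { .max_background = 12, .congestion_threshold = 9 };
        int i;

        for (i = 0; i < 12; i++)
            bg_start(&c);
        printf("blocked=%d congested=%d\n", c.blocked, c.congested); /* 1 1 */
        bg_end(&c);
        printf("blocked=%d congested=%d\n", c.blocked, c.congested); /* 0 1 */
        return 0;
    }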
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 52b641fc0faf..fc9c79feb5f7 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h | |||
@@ -25,12 +25,6 @@ | |||
25 | /** Max number of pages that can be used in a single read request */ | 25 | /** Max number of pages that can be used in a single read request */ |
26 | #define FUSE_MAX_PAGES_PER_REQ 32 | 26 | #define FUSE_MAX_PAGES_PER_REQ 32 |
27 | 27 | ||
28 | /** Maximum number of outstanding background requests */ | ||
29 | #define FUSE_MAX_BACKGROUND 12 | ||
30 | |||
31 | /** Congestion starts at 75% of maximum */ | ||
32 | #define FUSE_CONGESTION_THRESHOLD (FUSE_MAX_BACKGROUND * 75 / 100) | ||
33 | |||
34 | /** Bias for fi->writectr, meaning new writepages must not be sent */ | 28 | /** Bias for fi->writectr, meaning new writepages must not be sent */ |
35 | #define FUSE_NOWRITE INT_MIN | 29 | #define FUSE_NOWRITE INT_MIN |
36 | 30 | ||
@@ -38,7 +32,7 @@ | |||
38 | #define FUSE_NAME_MAX 1024 | 32 | #define FUSE_NAME_MAX 1024 |
39 | 33 | ||
40 | /** Number of dentries for each connection in the control filesystem */ | 34 | /** Number of dentries for each connection in the control filesystem */ |
41 | #define FUSE_CTL_NUM_DENTRIES 3 | 35 | #define FUSE_CTL_NUM_DENTRIES 5 |
42 | 36 | ||
43 | /** If the FUSE_DEFAULT_PERMISSIONS flag is given, the filesystem | 37 | /** If the FUSE_DEFAULT_PERMISSIONS flag is given, the filesystem |
44 | module will check permissions based on the file mode. Otherwise no | 38 | module will check permissions based on the file mode. Otherwise no |
@@ -55,6 +49,10 @@ extern struct list_head fuse_conn_list; | |||
55 | /** Global mutex protecting fuse_conn_list and the control filesystem */ | 49 | /** Global mutex protecting fuse_conn_list and the control filesystem */ |
56 | extern struct mutex fuse_mutex; | 50 | extern struct mutex fuse_mutex; |
57 | 51 | ||
52 | /** Module parameters */ | ||
53 | extern unsigned max_user_bgreq; | ||
54 | extern unsigned max_user_congthresh; | ||
55 | |||
58 | /** FUSE inode */ | 56 | /** FUSE inode */ |
59 | struct fuse_inode { | 57 | struct fuse_inode { |
60 | /** Inode data */ | 58 | /** Inode data */ |
@@ -349,6 +347,12 @@ struct fuse_conn { | |||
349 | /** rbtree of fuse_files waiting for poll events indexed by ph */ | 347 | /** rbtree of fuse_files waiting for poll events indexed by ph */ |
350 | struct rb_root polled_files; | 348 | struct rb_root polled_files; |
351 | 349 | ||
350 | /** Maximum number of outstanding background requests */ | ||
351 | unsigned max_background; | ||
352 | |||
353 | /** Number of background requests at which congestion starts */ | ||
354 | unsigned congestion_threshold; | ||
355 | |||
352 | /** Number of requests currently in the background */ | 356 | /** Number of requests currently in the background */ |
353 | unsigned num_background; | 357 | unsigned num_background; |
354 | 358 | ||
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index f91ccc4a189d..6da947daabda 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c | |||
@@ -14,6 +14,7 @@ | |||
14 | #include <linux/seq_file.h> | 14 | #include <linux/seq_file.h> |
15 | #include <linux/init.h> | 15 | #include <linux/init.h> |
16 | #include <linux/module.h> | 16 | #include <linux/module.h> |
17 | #include <linux/moduleparam.h> | ||
17 | #include <linux/parser.h> | 18 | #include <linux/parser.h> |
18 | #include <linux/statfs.h> | 19 | #include <linux/statfs.h> |
19 | #include <linux/random.h> | 20 | #include <linux/random.h> |
@@ -28,10 +29,34 @@ static struct kmem_cache *fuse_inode_cachep; | |||
28 | struct list_head fuse_conn_list; | 29 | struct list_head fuse_conn_list; |
29 | DEFINE_MUTEX(fuse_mutex); | 30 | DEFINE_MUTEX(fuse_mutex); |
30 | 31 | ||
32 | static int set_global_limit(const char *val, struct kernel_param *kp); | ||
33 | |||
34 | unsigned max_user_bgreq; | ||
35 | module_param_call(max_user_bgreq, set_global_limit, param_get_uint, | ||
36 | &max_user_bgreq, 0644); | ||
37 | __MODULE_PARM_TYPE(max_user_bgreq, "uint"); | ||
38 | MODULE_PARM_DESC(max_user_bgreq, | ||
39 | "Global limit for the maximum number of backgrounded requests an " | ||
40 | "unprivileged user can set"); | ||
41 | |||
42 | unsigned max_user_congthresh; | ||
43 | module_param_call(max_user_congthresh, set_global_limit, param_get_uint, | ||
44 | &max_user_congthresh, 0644); | ||
45 | __MODULE_PARM_TYPE(max_user_congthresh, "uint"); | ||
46 | MODULE_PARM_DESC(max_user_congthresh, | ||
47 | "Global limit for the maximum congestion threshold an " | ||
48 | "unprivileged user can set"); | ||
49 | |||
31 | #define FUSE_SUPER_MAGIC 0x65735546 | 50 | #define FUSE_SUPER_MAGIC 0x65735546 |
32 | 51 | ||
33 | #define FUSE_DEFAULT_BLKSIZE 512 | 52 | #define FUSE_DEFAULT_BLKSIZE 512 |
34 | 53 | ||
54 | /** Maximum number of outstanding background requests */ | ||
55 | #define FUSE_DEFAULT_MAX_BACKGROUND 12 | ||
56 | |||
57 | /** Congestion starts at 75% of maximum */ | ||
58 | #define FUSE_DEFAULT_CONGESTION_THRESHOLD (FUSE_DEFAULT_MAX_BACKGROUND * 3 / 4) | ||
59 | |||
35 | struct fuse_mount_data { | 60 | struct fuse_mount_data { |
36 | int fd; | 61 | int fd; |
37 | unsigned rootmode; | 62 | unsigned rootmode; |
@@ -517,6 +542,8 @@ void fuse_conn_init(struct fuse_conn *fc) | |||
517 | INIT_LIST_HEAD(&fc->bg_queue); | 542 | INIT_LIST_HEAD(&fc->bg_queue); |
518 | INIT_LIST_HEAD(&fc->entry); | 543 | INIT_LIST_HEAD(&fc->entry); |
519 | atomic_set(&fc->num_waiting, 0); | 544 | atomic_set(&fc->num_waiting, 0); |
545 | fc->max_background = FUSE_DEFAULT_MAX_BACKGROUND; | ||
546 | fc->congestion_threshold = FUSE_DEFAULT_CONGESTION_THRESHOLD; | ||
520 | fc->khctr = 0; | 547 | fc->khctr = 0; |
521 | fc->polled_files = RB_ROOT; | 548 | fc->polled_files = RB_ROOT; |
522 | fc->reqctr = 0; | 549 | fc->reqctr = 0; |
@@ -727,6 +754,54 @@ static const struct super_operations fuse_super_operations = { | |||
727 | .show_options = fuse_show_options, | 754 | .show_options = fuse_show_options, |
728 | }; | 755 | }; |
729 | 756 | ||
757 | static void sanitize_global_limit(unsigned *limit) | ||
758 | { | ||
759 | if (*limit == 0) | ||
760 | *limit = ((num_physpages << PAGE_SHIFT) >> 13) / | ||
761 | sizeof(struct fuse_req); | ||
762 | |||
763 | if (*limit >= 1 << 16) | ||
764 | *limit = (1 << 16) - 1; | ||
765 | } | ||
766 | |||
767 | static int set_global_limit(const char *val, struct kernel_param *kp) | ||
768 | { | ||
769 | int rv; | ||
770 | |||
771 | rv = param_set_uint(val, kp); | ||
772 | if (rv) | ||
773 | return rv; | ||
774 | |||
775 | sanitize_global_limit((unsigned *)kp->arg); | ||
776 | |||
777 | return 0; | ||
778 | } | ||
779 | |||
780 | static void process_init_limits(struct fuse_conn *fc, struct fuse_init_out *arg) | ||
781 | { | ||
782 | int cap_sys_admin = capable(CAP_SYS_ADMIN); | ||
783 | |||
784 | if (arg->minor < 13) | ||
785 | return; | ||
786 | |||
787 | sanitize_global_limit(&max_user_bgreq); | ||
788 | sanitize_global_limit(&max_user_congthresh); | ||
789 | |||
790 | if (arg->max_background) { | ||
791 | fc->max_background = arg->max_background; | ||
792 | |||
793 | if (!cap_sys_admin && fc->max_background > max_user_bgreq) | ||
794 | fc->max_background = max_user_bgreq; | ||
795 | } | ||
796 | if (arg->congestion_threshold) { | ||
797 | fc->congestion_threshold = arg->congestion_threshold; | ||
798 | |||
799 | if (!cap_sys_admin && | ||
800 | fc->congestion_threshold > max_user_congthresh) | ||
801 | fc->congestion_threshold = max_user_congthresh; | ||
802 | } | ||
803 | } | ||
804 | |||
730 | static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) | 805 | static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) |
731 | { | 806 | { |
732 | struct fuse_init_out *arg = &req->misc.init_out; | 807 | struct fuse_init_out *arg = &req->misc.init_out; |
@@ -736,6 +811,8 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) | |||
736 | else { | 811 | else { |
737 | unsigned long ra_pages; | 812 | unsigned long ra_pages; |
738 | 813 | ||
814 | process_init_limits(fc, arg); | ||
815 | |||
739 | if (arg->minor >= 6) { | 816 | if (arg->minor >= 6) { |
740 | ra_pages = arg->max_readahead / PAGE_CACHE_SIZE; | 817 | ra_pages = arg->max_readahead / PAGE_CACHE_SIZE; |
741 | if (arg->flags & FUSE_ASYNC_READ) | 818 | if (arg->flags & FUSE_ASYNC_READ) |
@@ -801,6 +878,7 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb) | |||
801 | { | 878 | { |
802 | int err; | 879 | int err; |
803 | 880 | ||
881 | fc->bdi.name = "fuse"; | ||
804 | fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; | 882 | fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; |
805 | fc->bdi.unplug_io_fn = default_unplug_io_fn; | 883 | fc->bdi.unplug_io_fn = default_unplug_io_fn; |
806 | /* fuse does its own writeback accounting */ | 884 | /* fuse does its own writeback accounting */ |
@@ -893,6 +971,8 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) | |||
893 | if (err) | 971 | if (err) |
894 | goto err_put_conn; | 972 | goto err_put_conn; |
895 | 973 | ||
974 | sb->s_bdi = &fc->bdi; | ||
975 | |||
896 | /* Handle umasking inside the fuse code */ | 976 | /* Handle umasking inside the fuse code */ |
897 | if (sb->s_flags & MS_POSIXACL) | 977 | if (sb->s_flags & MS_POSIXACL) |
898 | fc->dont_mask = 1; | 978 | fc->dont_mask = 1; |
@@ -1147,6 +1227,9 @@ static int __init fuse_init(void) | |||
1147 | if (res) | 1227 | if (res) |
1148 | goto err_sysfs_cleanup; | 1228 | goto err_sysfs_cleanup; |
1149 | 1229 | ||
1230 | sanitize_global_limit(&max_user_bgreq); | ||
1231 | sanitize_global_limit(&max_user_congthresh); | ||
1232 | |||
1150 | return 0; | 1233 | return 0; |
1151 | 1234 | ||
1152 | err_sysfs_cleanup: | 1235 | err_sysfs_cleanup: |
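Taken together, sanitize_global_limit() and process_init_limits() above set the policy: a zero module parameter defaults to letting background requests consume roughly 1/2^13 of physical memory, any value is capped just below 2^16, and a daemon without CAP_SYS_ADMIN has its FUSE_INIT values clamped to the global ceilings. A user-space model of the arithmetic (the 392-byte stand-in for sizeof(struct fuse_req) is an assumption for illustration):

    #include <stdio.h>

    #define FUSE_REQ_SIZE 392ULL    /* stand-in for sizeof(struct fuse_req) */

    static unsigned sanitize_limit(unsigned limit, unsigned long long mem_bytes)
    {
        if (limit == 0)
            limit = (unsigned)((mem_bytes >> 13) / FUSE_REQ_SIZE);
        if (limit >= 1u << 16)
            limit = (1u << 16) - 1;
        return limit;
    }

    /* process_init_limits(): honor the INIT reply, but clamp an
     * unprivileged daemon's request to the module-parameter ceiling */
    static unsigned clamp_limit(unsigned requested, unsigned user_max,
                                int cap_sys_admin)
    {
        if (!cap_sys_admin && requested > user_max)
            return user_max;
        return requested;
    }

    int main(void)
    {
        unsigned long long mem = 2ULL << 30;    /* a 2 GiB machine */
        unsigned ceiling = sanitize_limit(0, mem);

        printf("default max_user_bgreq: %u\n", ceiling);        /* 668 */
        printf("granted: %u\n", clamp_limit(1000, ceiling, 0)); /* 668 */
        return 0;
    }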
diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile index 3da2f1f4f738..21f7e46da4c0 100644 --- a/fs/gfs2/Makefile +++ b/fs/gfs2/Makefile | |||
@@ -1,6 +1,6 @@ | |||
1 | EXTRA_CFLAGS := -I$(src) | 1 | EXTRA_CFLAGS := -I$(src) |
2 | obj-$(CONFIG_GFS2_FS) += gfs2.o | 2 | obj-$(CONFIG_GFS2_FS) += gfs2.o |
3 | gfs2-y := acl.o bmap.o dir.o eaops.o eattr.o glock.o \ | 3 | gfs2-y := acl.o bmap.o dir.o xattr.o glock.o \ |
4 | glops.o inode.o log.o lops.o main.o meta_io.o \ | 4 | glops.o inode.o log.o lops.o main.o meta_io.o \ |
5 | aops.o dentry.o export.o file.o \ | 5 | aops.o dentry.o export.o file.o \ |
6 | ops_fstype.o ops_inode.o quota.o \ | 6 | ops_fstype.o ops_inode.o quota.o \ |
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index fa881bdc3d85..3fc4e3ac7d84 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c | |||
@@ -19,8 +19,7 @@ | |||
19 | #include "gfs2.h" | 19 | #include "gfs2.h" |
20 | #include "incore.h" | 20 | #include "incore.h" |
21 | #include "acl.h" | 21 | #include "acl.h" |
22 | #include "eaops.h" | 22 | #include "xattr.h" |
23 | #include "eattr.h" | ||
24 | #include "glock.h" | 23 | #include "glock.h" |
25 | #include "inode.h" | 24 | #include "inode.h" |
26 | #include "meta_io.h" | 25 | #include "meta_io.h" |
@@ -31,8 +30,7 @@ | |||
31 | #define ACL_DEFAULT 0 | 30 | #define ACL_DEFAULT 0 |
32 | 31 | ||
33 | int gfs2_acl_validate_set(struct gfs2_inode *ip, int access, | 32 | int gfs2_acl_validate_set(struct gfs2_inode *ip, int access, |
34 | struct gfs2_ea_request *er, | 33 | struct gfs2_ea_request *er, int *remove, mode_t *mode) |
35 | int *remove, mode_t *mode) | ||
36 | { | 34 | { |
37 | struct posix_acl *acl; | 35 | struct posix_acl *acl; |
38 | int error; | 36 | int error; |
@@ -83,30 +81,20 @@ int gfs2_acl_validate_remove(struct gfs2_inode *ip, int access) | |||
83 | return 0; | 81 | return 0; |
84 | } | 82 | } |
85 | 83 | ||
86 | static int acl_get(struct gfs2_inode *ip, int access, struct posix_acl **acl, | 84 | static int acl_get(struct gfs2_inode *ip, const char *name, |
87 | struct gfs2_ea_location *el, char **data, unsigned int *len) | 85 | struct posix_acl **acl, struct gfs2_ea_location *el, |
86 | char **datap, unsigned int *lenp) | ||
88 | { | 87 | { |
89 | struct gfs2_ea_request er; | 88 | char *data; |
90 | struct gfs2_ea_location el_this; | 89 | unsigned int len; |
91 | int error; | 90 | int error; |
92 | 91 | ||
92 | el->el_bh = NULL; | ||
93 | |||
93 | if (!ip->i_eattr) | 94 | if (!ip->i_eattr) |
94 | return 0; | 95 | return 0; |
95 | 96 | ||
96 | memset(&er, 0, sizeof(struct gfs2_ea_request)); | 97 | error = gfs2_ea_find(ip, GFS2_EATYPE_SYS, name, el); |
97 | if (access) { | ||
98 | er.er_name = GFS2_POSIX_ACL_ACCESS; | ||
99 | er.er_name_len = GFS2_POSIX_ACL_ACCESS_LEN; | ||
100 | } else { | ||
101 | er.er_name = GFS2_POSIX_ACL_DEFAULT; | ||
102 | er.er_name_len = GFS2_POSIX_ACL_DEFAULT_LEN; | ||
103 | } | ||
104 | er.er_type = GFS2_EATYPE_SYS; | ||
105 | |||
106 | if (!el) | ||
107 | el = &el_this; | ||
108 | |||
109 | error = gfs2_ea_find(ip, &er, el); | ||
110 | if (error) | 98 | if (error) |
111 | return error; | 99 | return error; |
112 | if (!el->el_ea) | 100 | if (!el->el_ea) |
@@ -114,32 +102,31 @@ static int acl_get(struct gfs2_inode *ip, int access, struct posix_acl **acl, | |||
114 | if (!GFS2_EA_DATA_LEN(el->el_ea)) | 102 | if (!GFS2_EA_DATA_LEN(el->el_ea)) |
115 | goto out; | 103 | goto out; |
116 | 104 | ||
117 | er.er_data_len = GFS2_EA_DATA_LEN(el->el_ea); | 105 | len = GFS2_EA_DATA_LEN(el->el_ea); |
118 | er.er_data = kmalloc(er.er_data_len, GFP_NOFS); | 106 | data = kmalloc(len, GFP_NOFS); |
119 | error = -ENOMEM; | 107 | error = -ENOMEM; |
120 | if (!er.er_data) | 108 | if (!data) |
121 | goto out; | 109 | goto out; |
122 | 110 | ||
123 | error = gfs2_ea_get_copy(ip, el, er.er_data); | 111 | error = gfs2_ea_get_copy(ip, el, data, len); |
124 | if (error) | 112 | if (error < 0) |
125 | goto out_kfree; | 113 | goto out_kfree; |
114 | error = 0; | ||
126 | 115 | ||
127 | if (acl) { | 116 | if (acl) { |
128 | *acl = posix_acl_from_xattr(er.er_data, er.er_data_len); | 117 | *acl = posix_acl_from_xattr(data, len); |
129 | if (IS_ERR(*acl)) | 118 | if (IS_ERR(*acl)) |
130 | error = PTR_ERR(*acl); | 119 | error = PTR_ERR(*acl); |
131 | } | 120 | } |
132 | 121 | ||
133 | out_kfree: | 122 | out_kfree: |
134 | if (error || !data) | 123 | if (error || !datap) { |
135 | kfree(er.er_data); | 124 | kfree(data); |
136 | else { | 125 | } else { |
137 | *data = er.er_data; | 126 | *datap = data; |
138 | *len = er.er_data_len; | 127 | *lenp = len; |
139 | } | 128 | } |
140 | out: | 129 | out: |
141 | if (error || el == &el_this) | ||
142 | brelse(el->el_bh); | ||
143 | return error; | 130 | return error; |
144 | } | 131 | } |
145 | 132 | ||
@@ -153,10 +140,12 @@ out: | |||
153 | 140 | ||
154 | int gfs2_check_acl(struct inode *inode, int mask) | 141 | int gfs2_check_acl(struct inode *inode, int mask) |
155 | { | 142 | { |
143 | struct gfs2_ea_location el; | ||
156 | struct posix_acl *acl = NULL; | 144 | struct posix_acl *acl = NULL; |
157 | int error; | 145 | int error; |
158 | 146 | ||
159 | error = acl_get(GFS2_I(inode), ACL_ACCESS, &acl, NULL, NULL, NULL); | 147 | error = acl_get(GFS2_I(inode), GFS2_POSIX_ACL_ACCESS, &acl, &el, NULL, NULL); |
148 | brelse(el.el_bh); | ||
160 | if (error) | 149 | if (error) |
161 | return error; | 150 | return error; |
162 | 151 | ||
@@ -196,10 +185,12 @@ static int munge_mode(struct gfs2_inode *ip, mode_t mode) | |||
196 | 185 | ||
197 | int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip) | 186 | int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip) |
198 | { | 187 | { |
188 | struct gfs2_ea_location el; | ||
199 | struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); | 189 | struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); |
200 | struct posix_acl *acl = NULL, *clone; | 190 | struct posix_acl *acl = NULL, *clone; |
201 | struct gfs2_ea_request er; | ||
202 | mode_t mode = ip->i_inode.i_mode; | 191 | mode_t mode = ip->i_inode.i_mode; |
192 | char *data = NULL; | ||
193 | unsigned int len; | ||
203 | int error; | 194 | int error; |
204 | 195 | ||
205 | if (!sdp->sd_args.ar_posix_acl) | 196 | if (!sdp->sd_args.ar_posix_acl) |
@@ -207,11 +198,8 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip) | |||
207 | if (S_ISLNK(ip->i_inode.i_mode)) | 198 | if (S_ISLNK(ip->i_inode.i_mode)) |
208 | return 0; | 199 | return 0; |
209 | 200 | ||
210 | memset(&er, 0, sizeof(struct gfs2_ea_request)); | 201 | error = acl_get(dip, GFS2_POSIX_ACL_DEFAULT, &acl, &el, &data, &len); |
211 | er.er_type = GFS2_EATYPE_SYS; | 202 | brelse(el.el_bh); |
212 | |||
213 | error = acl_get(dip, ACL_DEFAULT, &acl, NULL, | ||
214 | &er.er_data, &er.er_data_len); | ||
215 | if (error) | 203 | if (error) |
216 | return error; | 204 | return error; |
217 | if (!acl) { | 205 | if (!acl) { |
@@ -229,9 +217,8 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip) | |||
229 | acl = clone; | 217 | acl = clone; |
230 | 218 | ||
231 | if (S_ISDIR(ip->i_inode.i_mode)) { | 219 | if (S_ISDIR(ip->i_inode.i_mode)) { |
232 | er.er_name = GFS2_POSIX_ACL_DEFAULT; | 220 | error = gfs2_xattr_set(&ip->i_inode, GFS2_EATYPE_SYS, |
233 | er.er_name_len = GFS2_POSIX_ACL_DEFAULT_LEN; | 221 | GFS2_POSIX_ACL_DEFAULT, data, len, 0); |
234 | error = gfs2_system_eaops.eo_set(ip, &er); | ||
235 | if (error) | 222 | if (error) |
236 | goto out; | 223 | goto out; |
237 | } | 224 | } |
@@ -239,21 +226,19 @@ int gfs2_acl_create(struct gfs2_inode *dip, struct gfs2_inode *ip) | |||
239 | error = posix_acl_create_masq(acl, &mode); | 226 | error = posix_acl_create_masq(acl, &mode); |
240 | if (error < 0) | 227 | if (error < 0) |
241 | goto out; | 228 | goto out; |
242 | if (error > 0) { | 229 | if (error == 0) |
243 | er.er_name = GFS2_POSIX_ACL_ACCESS; | 230 | goto munge; |
244 | er.er_name_len = GFS2_POSIX_ACL_ACCESS_LEN; | ||
245 | posix_acl_to_xattr(acl, er.er_data, er.er_data_len); | ||
246 | er.er_mode = mode; | ||
247 | er.er_flags = GFS2_ERF_MODE; | ||
248 | error = gfs2_system_eaops.eo_set(ip, &er); | ||
249 | if (error) | ||
250 | goto out; | ||
251 | } else | ||
252 | munge_mode(ip, mode); | ||
253 | 231 | ||
232 | posix_acl_to_xattr(acl, data, len); | ||
233 | error = gfs2_xattr_set(&ip->i_inode, GFS2_EATYPE_SYS, | ||
234 | GFS2_POSIX_ACL_ACCESS, data, len, 0); | ||
235 | if (error) | ||
236 | goto out; | ||
237 | munge: | ||
238 | error = munge_mode(ip, mode); | ||
254 | out: | 239 | out: |
255 | posix_acl_release(acl); | 240 | posix_acl_release(acl); |
256 | kfree(er.er_data); | 241 | kfree(data); |
257 | return error; | 242 | return error; |
258 | } | 243 | } |
259 | 244 | ||
@@ -265,9 +250,9 @@ int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr) | |||
265 | unsigned int len; | 250 | unsigned int len; |
266 | int error; | 251 | int error; |
267 | 252 | ||
268 | error = acl_get(ip, ACL_ACCESS, &acl, &el, &data, &len); | 253 | error = acl_get(ip, GFS2_POSIX_ACL_ACCESS, &acl, &el, &data, &len); |
269 | if (error) | 254 | if (error) |
270 | return error; | 255 | goto out_brelse; |
271 | if (!acl) | 256 | if (!acl) |
272 | return gfs2_setattr_simple(ip, attr); | 257 | return gfs2_setattr_simple(ip, attr); |
273 | 258 | ||
@@ -286,8 +271,9 @@ int gfs2_acl_chmod(struct gfs2_inode *ip, struct iattr *attr) | |||
286 | 271 | ||
287 | out: | 272 | out: |
288 | posix_acl_release(acl); | 273 | posix_acl_release(acl); |
289 | brelse(el.el_bh); | ||
290 | kfree(data); | 274 | kfree(data); |
275 | out_brelse: | ||
276 | brelse(el.el_bh); | ||
291 | return error; | 277 | return error; |
292 | } | 278 | } |
293 | 279 | ||
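The acl.c rework above retires struct gfs2_ea_request in favor of calling the extended-attribute layer directly (gfs2_ea_find() by name, gfs2_xattr_set() with an explicit type). From user space that path is reached through the ordinary xattr syscalls; a minimal sketch, assuming a file ./f on a gfs2 mount with POSIX ACLs enabled:

    #include <stdio.h>
    #include <sys/types.h>
    #include <sys/xattr.h>

    int main(void)
    {
        char buf[256];
        /* "system.posix_acl_access" is split into GFS2_EATYPE_SYS plus
         * the bare name before reaching the code rewritten above */
        ssize_t n = getxattr("./f", "system.posix_acl_access",
                             buf, sizeof(buf));

        if (n < 0)
            perror("getxattr");
        else
            printf("access ACL xattr is %zd bytes\n", n);
        return 0;
    }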
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 03ebb439ace0..7ebae9a4ecc0 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c | |||
@@ -624,6 +624,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping, | |||
624 | { | 624 | { |
625 | struct gfs2_inode *ip = GFS2_I(mapping->host); | 625 | struct gfs2_inode *ip = GFS2_I(mapping->host); |
626 | struct gfs2_sbd *sdp = GFS2_SB(mapping->host); | 626 | struct gfs2_sbd *sdp = GFS2_SB(mapping->host); |
627 | struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); | ||
627 | unsigned int data_blocks = 0, ind_blocks = 0, rblocks; | 628 | unsigned int data_blocks = 0, ind_blocks = 0, rblocks; |
628 | int alloc_required; | 629 | int alloc_required; |
629 | int error = 0; | 630 | int error = 0; |
@@ -637,6 +638,14 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping, | |||
637 | error = gfs2_glock_nq(&ip->i_gh); | 638 | error = gfs2_glock_nq(&ip->i_gh); |
638 | if (unlikely(error)) | 639 | if (unlikely(error)) |
639 | goto out_uninit; | 640 | goto out_uninit; |
641 | if (&ip->i_inode == sdp->sd_rindex) { | ||
642 | error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, | ||
643 | GL_NOCACHE, &m_ip->i_gh); | ||
644 | if (unlikely(error)) { | ||
645 | gfs2_glock_dq(&ip->i_gh); | ||
646 | goto out_uninit; | ||
647 | } | ||
648 | } | ||
640 | 649 | ||
641 | error = gfs2_write_alloc_required(ip, pos, len, &alloc_required); | 650 | error = gfs2_write_alloc_required(ip, pos, len, &alloc_required); |
642 | if (error) | 651 | if (error) |
@@ -667,6 +676,8 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping, | |||
667 | rblocks += data_blocks ? data_blocks : 1; | 676 | rblocks += data_blocks ? data_blocks : 1; |
668 | if (ind_blocks || data_blocks) | 677 | if (ind_blocks || data_blocks) |
669 | rblocks += RES_STATFS + RES_QUOTA; | 678 | rblocks += RES_STATFS + RES_QUOTA; |
679 | if (&ip->i_inode == sdp->sd_rindex) | ||
680 | rblocks += 2 * RES_STATFS; | ||
670 | 681 | ||
671 | error = gfs2_trans_begin(sdp, rblocks, | 682 | error = gfs2_trans_begin(sdp, rblocks, |
672 | PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize); | 683 | PAGE_CACHE_SIZE/sdp->sd_sb.sb_bsize); |
@@ -712,6 +723,10 @@ out_alloc_put: | |||
712 | gfs2_alloc_put(ip); | 723 | gfs2_alloc_put(ip); |
713 | } | 724 | } |
714 | out_unlock: | 725 | out_unlock: |
726 | if (&ip->i_inode == sdp->sd_rindex) { | ||
727 | gfs2_glock_dq(&m_ip->i_gh); | ||
728 | gfs2_holder_uninit(&m_ip->i_gh); | ||
729 | } | ||
715 | gfs2_glock_dq(&ip->i_gh); | 730 | gfs2_glock_dq(&ip->i_gh); |
716 | out_uninit: | 731 | out_uninit: |
717 | gfs2_holder_uninit(&ip->i_gh); | 732 | gfs2_holder_uninit(&ip->i_gh); |
@@ -725,14 +740,21 @@ out_uninit: | |||
725 | static void adjust_fs_space(struct inode *inode) | 740 | static void adjust_fs_space(struct inode *inode) |
726 | { | 741 | { |
727 | struct gfs2_sbd *sdp = inode->i_sb->s_fs_info; | 742 | struct gfs2_sbd *sdp = inode->i_sb->s_fs_info; |
743 | struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); | ||
744 | struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode); | ||
728 | struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; | 745 | struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; |
729 | struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; | 746 | struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; |
747 | struct buffer_head *m_bh, *l_bh; | ||
730 | u64 fs_total, new_free; | 748 | u64 fs_total, new_free; |
731 | 749 | ||
732 | /* Total up the file system space, according to the latest rindex. */ | 750 | /* Total up the file system space, according to the latest rindex. */ |
733 | fs_total = gfs2_ri_total(sdp); | 751 | fs_total = gfs2_ri_total(sdp); |
752 | if (gfs2_meta_inode_buffer(m_ip, &m_bh) != 0) | ||
753 | return; | ||
734 | 754 | ||
735 | spin_lock(&sdp->sd_statfs_spin); | 755 | spin_lock(&sdp->sd_statfs_spin); |
756 | gfs2_statfs_change_in(m_sc, m_bh->b_data + | ||
757 | sizeof(struct gfs2_dinode)); | ||
736 | if (fs_total > (m_sc->sc_total + l_sc->sc_total)) | 758 | if (fs_total > (m_sc->sc_total + l_sc->sc_total)) |
737 | new_free = fs_total - (m_sc->sc_total + l_sc->sc_total); | 759 | new_free = fs_total - (m_sc->sc_total + l_sc->sc_total); |
738 | else | 760 | else |
@@ -741,6 +763,13 @@ static void adjust_fs_space(struct inode *inode) | |||
741 | fs_warn(sdp, "File system extended by %llu blocks.\n", | 763 | fs_warn(sdp, "File system extended by %llu blocks.\n", |
742 | (unsigned long long)new_free); | 764 | (unsigned long long)new_free); |
743 | gfs2_statfs_change(sdp, new_free, new_free, 0); | 765 | gfs2_statfs_change(sdp, new_free, new_free, 0); |
766 | |||
767 | if (gfs2_meta_inode_buffer(l_ip, &l_bh) != 0) | ||
768 | goto out; | ||
769 | update_statfs(sdp, m_bh, l_bh); | ||
770 | brelse(l_bh); | ||
771 | out: | ||
772 | brelse(m_bh); | ||
744 | } | 773 | } |
745 | 774 | ||
746 | /** | 775 | /** |
@@ -763,6 +792,7 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh, | |||
763 | { | 792 | { |
764 | struct gfs2_inode *ip = GFS2_I(inode); | 793 | struct gfs2_inode *ip = GFS2_I(inode); |
765 | struct gfs2_sbd *sdp = GFS2_SB(inode); | 794 | struct gfs2_sbd *sdp = GFS2_SB(inode); |
795 | struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); | ||
766 | u64 to = pos + copied; | 796 | u64 to = pos + copied; |
767 | void *kaddr; | 797 | void *kaddr; |
768 | unsigned char *buf = dibh->b_data + sizeof(struct gfs2_dinode); | 798 | unsigned char *buf = dibh->b_data + sizeof(struct gfs2_dinode); |
@@ -794,6 +824,10 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh, | |||
794 | 824 | ||
795 | brelse(dibh); | 825 | brelse(dibh); |
796 | gfs2_trans_end(sdp); | 826 | gfs2_trans_end(sdp); |
827 | if (inode == sdp->sd_rindex) { | ||
828 | gfs2_glock_dq(&m_ip->i_gh); | ||
829 | gfs2_holder_uninit(&m_ip->i_gh); | ||
830 | } | ||
797 | gfs2_glock_dq(&ip->i_gh); | 831 | gfs2_glock_dq(&ip->i_gh); |
798 | gfs2_holder_uninit(&ip->i_gh); | 832 | gfs2_holder_uninit(&ip->i_gh); |
799 | return copied; | 833 | return copied; |
@@ -823,6 +857,7 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping, | |||
823 | struct inode *inode = page->mapping->host; | 857 | struct inode *inode = page->mapping->host; |
824 | struct gfs2_inode *ip = GFS2_I(inode); | 858 | struct gfs2_inode *ip = GFS2_I(inode); |
825 | struct gfs2_sbd *sdp = GFS2_SB(inode); | 859 | struct gfs2_sbd *sdp = GFS2_SB(inode); |
860 | struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); | ||
826 | struct buffer_head *dibh; | 861 | struct buffer_head *dibh; |
827 | struct gfs2_alloc *al = ip->i_alloc; | 862 | struct gfs2_alloc *al = ip->i_alloc; |
828 | unsigned int from = pos & (PAGE_CACHE_SIZE - 1); | 863 | unsigned int from = pos & (PAGE_CACHE_SIZE - 1); |
@@ -865,6 +900,10 @@ failed: | |||
865 | gfs2_quota_unlock(ip); | 900 | gfs2_quota_unlock(ip); |
866 | gfs2_alloc_put(ip); | 901 | gfs2_alloc_put(ip); |
867 | } | 902 | } |
903 | if (inode == sdp->sd_rindex) { | ||
904 | gfs2_glock_dq(&m_ip->i_gh); | ||
905 | gfs2_holder_uninit(&m_ip->i_gh); | ||
906 | } | ||
868 | gfs2_glock_dq(&ip->i_gh); | 907 | gfs2_glock_dq(&ip->i_gh); |
869 | gfs2_holder_uninit(&ip->i_gh); | 908 | gfs2_holder_uninit(&ip->i_gh); |
870 | return ret; | 909 | return ret; |
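adjust_fs_space() above now re-reads the master statfs record under sd_statfs_spin before totalling, then writes the result back via update_statfs(). The growth computation itself is plain arithmetic; a stand-alone model:

    #include <stdio.h>
    #include <inttypes.h>

    /* new_free = fs_total - (master + local), never negative */
    static uint64_t new_free_blocks(uint64_t fs_total, uint64_t master_total,
                                    uint64_t local_total)
    {
        uint64_t have = master_total + local_total;

        return fs_total > have ? fs_total - have : 0;
    }

    int main(void)
    {
        /* rindex totals 1000 blocks; statfs records account for only 900 */
        printf("extended by %" PRIu64 " blocks\n",
               new_free_blocks(1000, 800, 100));   /* 100 */
        return 0;
    }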
diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c index 022c66cd5606..91beddadd388 100644 --- a/fs/gfs2/dentry.c +++ b/fs/gfs2/dentry.c | |||
@@ -107,8 +107,26 @@ static int gfs2_dhash(struct dentry *dentry, struct qstr *str) | |||
107 | return 0; | 107 | return 0; |
108 | } | 108 | } |
109 | 109 | ||
110 | static int gfs2_dentry_delete(struct dentry *dentry) | ||
111 | { | ||
112 | struct gfs2_inode *ginode; | ||
113 | |||
114 | if (!dentry->d_inode) | ||
115 | return 0; | ||
116 | |||
117 | ginode = GFS2_I(dentry->d_inode); | ||
118 | if (!ginode->i_iopen_gh.gh_gl) | ||
119 | return 0; | ||
120 | |||
121 | if (test_bit(GLF_DEMOTE, &ginode->i_iopen_gh.gh_gl->gl_flags)) | ||
122 | return 1; | ||
123 | |||
124 | return 0; | ||
125 | } | ||
126 | |||
110 | const struct dentry_operations gfs2_dops = { | 127 | const struct dentry_operations gfs2_dops = { |
111 | .d_revalidate = gfs2_drevalidate, | 128 | .d_revalidate = gfs2_drevalidate, |
112 | .d_hash = gfs2_dhash, | 129 | .d_hash = gfs2_dhash, |
130 | .d_delete = gfs2_dentry_delete, | ||
113 | }; | 131 | }; |
114 | 132 | ||
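gfs2_dentry_delete() above unhashes a dentry as soon as the inode's iopen glock carries a pending demote, so a delete requested by another node is not stalled by the local dcache. A stub model of the decision (the GLF_DEMOTE bit position is illustrative):

    #include <stdio.h>

    #define GLF_DEMOTE 1    /* bit index; illustrative, see glock.h */

    struct iopen_glock {
        unsigned long flags;
    };

    /* return 1 to delete the dentry immediately: keeping it cached
     * would pin the glock another node is waiting for */
    static int should_delete_dentry(const struct iopen_glock *gl)
    {
        return (gl->flags >> GLF_DEMOTE) & 1;
    }

    int main(void)
    {
        struct iopen_glock gl = { .flags = 1UL << GLF_DEMOTE };

        printf("%d\n", should_delete_dentry(&gl));  /* 1 */
        return 0;
    }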
diff --git a/fs/gfs2/eaops.c b/fs/gfs2/eaops.c deleted file mode 100644 index dee9b03e5b37..000000000000 --- a/fs/gfs2/eaops.c +++ /dev/null | |||
@@ -1,157 +0,0 @@ | |||
1 | /* | ||
2 | * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. | ||
3 | * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. | ||
4 | * | ||
5 | * This copyrighted material is made available to anyone wishing to use, | ||
6 | * modify, copy, or redistribute it subject to the terms and conditions | ||
7 | * of the GNU General Public License version 2. | ||
8 | */ | ||
9 | |||
10 | #include <linux/slab.h> | ||
11 | #include <linux/spinlock.h> | ||
12 | #include <linux/completion.h> | ||
13 | #include <linux/buffer_head.h> | ||
14 | #include <linux/capability.h> | ||
15 | #include <linux/xattr.h> | ||
16 | #include <linux/gfs2_ondisk.h> | ||
17 | #include <asm/uaccess.h> | ||
18 | |||
19 | #include "gfs2.h" | ||
20 | #include "incore.h" | ||
21 | #include "acl.h" | ||
22 | #include "eaops.h" | ||
23 | #include "eattr.h" | ||
24 | #include "util.h" | ||
25 | |||
26 | /** | ||
27 | * gfs2_ea_name2type - get the type of the ea, and truncate type from the name | ||
28 | * @namep: ea name, possibly with type appended | ||
29 | * | ||
30 | * Returns: GFS2_EATYPE_XXX | ||
31 | */ | ||
32 | |||
33 | unsigned int gfs2_ea_name2type(const char *name, const char **truncated_name) | ||
34 | { | ||
35 | unsigned int type; | ||
36 | |||
37 | if (strncmp(name, "system.", 7) == 0) { | ||
38 | type = GFS2_EATYPE_SYS; | ||
39 | if (truncated_name) | ||
40 | *truncated_name = name + sizeof("system.") - 1; | ||
41 | } else if (strncmp(name, "user.", 5) == 0) { | ||
42 | type = GFS2_EATYPE_USR; | ||
43 | if (truncated_name) | ||
44 | *truncated_name = name + sizeof("user.") - 1; | ||
45 | } else if (strncmp(name, "security.", 9) == 0) { | ||
46 | type = GFS2_EATYPE_SECURITY; | ||
47 | if (truncated_name) | ||
48 | *truncated_name = name + sizeof("security.") - 1; | ||
49 | } else { | ||
50 | type = GFS2_EATYPE_UNUSED; | ||
51 | if (truncated_name) | ||
52 | *truncated_name = NULL; | ||
53 | } | ||
54 | |||
55 | return type; | ||
56 | } | ||
57 | |||
58 | static int system_eo_get(struct gfs2_inode *ip, struct gfs2_ea_request *er) | ||
59 | { | ||
60 | if (!GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len) && | ||
61 | !GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len) && | ||
62 | !capable(CAP_SYS_ADMIN)) | ||
63 | return -EPERM; | ||
64 | |||
65 | if (GFS2_SB(&ip->i_inode)->sd_args.ar_posix_acl == 0 && | ||
66 | (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len) || | ||
67 | GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len))) | ||
68 | return -EOPNOTSUPP; | ||
69 | |||
70 | return gfs2_ea_get_i(ip, er); | ||
71 | } | ||
72 | |||
73 | static int system_eo_set(struct gfs2_inode *ip, struct gfs2_ea_request *er) | ||
74 | { | ||
75 | int remove = 0; | ||
76 | int error; | ||
77 | |||
78 | if (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len)) { | ||
79 | if (!(er->er_flags & GFS2_ERF_MODE)) { | ||
80 | er->er_mode = ip->i_inode.i_mode; | ||
81 | er->er_flags |= GFS2_ERF_MODE; | ||
82 | } | ||
83 | error = gfs2_acl_validate_set(ip, 1, er, | ||
84 | &remove, &er->er_mode); | ||
85 | if (error) | ||
86 | return error; | ||
87 | error = gfs2_ea_set_i(ip, er); | ||
88 | if (error) | ||
89 | return error; | ||
90 | if (remove) | ||
91 | gfs2_ea_remove_i(ip, er); | ||
92 | return 0; | ||
93 | |||
94 | } else if (GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len)) { | ||
95 | error = gfs2_acl_validate_set(ip, 0, er, | ||
96 | &remove, NULL); | ||
97 | if (error) | ||
98 | return error; | ||
99 | if (!remove) | ||
100 | error = gfs2_ea_set_i(ip, er); | ||
101 | else { | ||
102 | error = gfs2_ea_remove_i(ip, er); | ||
103 | if (error == -ENODATA) | ||
104 | error = 0; | ||
105 | } | ||
106 | return error; | ||
107 | } | ||
108 | |||
109 | return -EPERM; | ||
110 | } | ||
111 | |||
112 | static int system_eo_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er) | ||
113 | { | ||
114 | if (GFS2_ACL_IS_ACCESS(er->er_name, er->er_name_len)) { | ||
115 | int error = gfs2_acl_validate_remove(ip, 1); | ||
116 | if (error) | ||
117 | return error; | ||
118 | |||
119 | } else if (GFS2_ACL_IS_DEFAULT(er->er_name, er->er_name_len)) { | ||
120 | int error = gfs2_acl_validate_remove(ip, 0); | ||
121 | if (error) | ||
122 | return error; | ||
123 | |||
124 | } else | ||
125 | return -EPERM; | ||
126 | |||
127 | return gfs2_ea_remove_i(ip, er); | ||
128 | } | ||
129 | |||
130 | static const struct gfs2_eattr_operations gfs2_user_eaops = { | ||
131 | .eo_get = gfs2_ea_get_i, | ||
132 | .eo_set = gfs2_ea_set_i, | ||
133 | .eo_remove = gfs2_ea_remove_i, | ||
134 | .eo_name = "user", | ||
135 | }; | ||
136 | |||
137 | const struct gfs2_eattr_operations gfs2_system_eaops = { | ||
138 | .eo_get = system_eo_get, | ||
139 | .eo_set = system_eo_set, | ||
140 | .eo_remove = system_eo_remove, | ||
141 | .eo_name = "system", | ||
142 | }; | ||
143 | |||
144 | static const struct gfs2_eattr_operations gfs2_security_eaops = { | ||
145 | .eo_get = gfs2_ea_get_i, | ||
146 | .eo_set = gfs2_ea_set_i, | ||
147 | .eo_remove = gfs2_ea_remove_i, | ||
148 | .eo_name = "security", | ||
149 | }; | ||
150 | |||
151 | const struct gfs2_eattr_operations *gfs2_ea_ops[] = { | ||
152 | NULL, | ||
153 | &gfs2_user_eaops, | ||
154 | &gfs2_system_eaops, | ||
155 | &gfs2_security_eaops, | ||
156 | }; | ||
157 | |||
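gfs2_ea_name2type() from the deleted file above survives the consolidation: the Makefile hunk earlier replaces eaops.o and eattr.o with a single xattr.o, where the name-to-type split now lives. A user-space model of the prefix mapping (the type values are illustrative):

    #include <stdio.h>
    #include <string.h>

    enum { EATYPE_UNUSED, EATYPE_USR, EATYPE_SYS, EATYPE_SECURITY };

    static unsigned ea_name2type(const char *name, const char **truncated)
    {
        unsigned type;

        if (strncmp(name, "system.", 7) == 0) {
            type = EATYPE_SYS;
            if (truncated)
                *truncated = name + 7;
        } else if (strncmp(name, "user.", 5) == 0) {
            type = EATYPE_USR;
            if (truncated)
                *truncated = name + 5;
        } else if (strncmp(name, "security.", 9) == 0) {
            type = EATYPE_SECURITY;
            if (truncated)
                *truncated = name + 9;
        } else {
            type = EATYPE_UNUSED;
            if (truncated)
                *truncated = NULL;
        }
        return type;
    }

    int main(void)
    {
        const char *rest;

        printf("%u %s\n", ea_name2type("system.posix_acl_access", &rest),
               rest);   /* 2 posix_acl_access */
        return 0;
    }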
diff --git a/fs/gfs2/eaops.h b/fs/gfs2/eaops.h deleted file mode 100644 index da2f7fbbb40d..000000000000 --- a/fs/gfs2/eaops.h +++ /dev/null | |||
@@ -1,30 +0,0 @@ | |||
1 | /* | ||
2 | * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. | ||
3 | * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. | ||
4 | * | ||
5 | * This copyrighted material is made available to anyone wishing to use, | ||
6 | * modify, copy, or redistribute it subject to the terms and conditions | ||
7 | * of the GNU General Public License version 2. | ||
8 | */ | ||
9 | |||
10 | #ifndef __EAOPS_DOT_H__ | ||
11 | #define __EAOPS_DOT_H__ | ||
12 | |||
13 | struct gfs2_ea_request; | ||
14 | struct gfs2_inode; | ||
15 | |||
16 | struct gfs2_eattr_operations { | ||
17 | int (*eo_get) (struct gfs2_inode *ip, struct gfs2_ea_request *er); | ||
18 | int (*eo_set) (struct gfs2_inode *ip, struct gfs2_ea_request *er); | ||
19 | int (*eo_remove) (struct gfs2_inode *ip, struct gfs2_ea_request *er); | ||
20 | char *eo_name; | ||
21 | }; | ||
22 | |||
23 | unsigned int gfs2_ea_name2type(const char *name, const char **truncated_name); | ||
24 | |||
25 | extern const struct gfs2_eattr_operations gfs2_system_eaops; | ||
26 | |||
27 | extern const struct gfs2_eattr_operations *gfs2_ea_ops[]; | ||
28 | |||
29 | #endif /* __EAOPS_DOT_H__ */ | ||
30 | |||
diff --git a/fs/gfs2/export.c b/fs/gfs2/export.c index 9200ef221716..d15876e9aa26 100644 --- a/fs/gfs2/export.c +++ b/fs/gfs2/export.c | |||
@@ -143,17 +143,14 @@ static struct dentry *gfs2_get_parent(struct dentry *child) | |||
143 | } | 143 | } |
144 | 144 | ||
145 | static struct dentry *gfs2_get_dentry(struct super_block *sb, | 145 | static struct dentry *gfs2_get_dentry(struct super_block *sb, |
146 | struct gfs2_inum_host *inum) | 146 | struct gfs2_inum_host *inum) |
147 | { | 147 | { |
148 | struct gfs2_sbd *sdp = sb->s_fs_info; | 148 | struct gfs2_sbd *sdp = sb->s_fs_info; |
149 | struct gfs2_holder i_gh, ri_gh, rgd_gh; | 149 | struct gfs2_holder i_gh; |
150 | struct gfs2_rgrpd *rgd; | ||
151 | struct inode *inode; | 150 | struct inode *inode; |
152 | struct dentry *dentry; | 151 | struct dentry *dentry; |
153 | int error; | 152 | int error; |
154 | 153 | ||
155 | /* System files? */ | ||
156 | |||
157 | inode = gfs2_ilookup(sb, inum->no_addr); | 154 | inode = gfs2_ilookup(sb, inum->no_addr); |
158 | if (inode) { | 155 | if (inode) { |
159 | if (GFS2_I(inode)->i_no_formal_ino != inum->no_formal_ino) { | 156 | if (GFS2_I(inode)->i_no_formal_ino != inum->no_formal_ino) { |
@@ -168,29 +165,11 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb, | |||
168 | if (error) | 165 | if (error) |
169 | return ERR_PTR(error); | 166 | return ERR_PTR(error); |
170 | 167 | ||
171 | error = gfs2_rindex_hold(sdp, &ri_gh); | 168 | error = gfs2_check_blk_type(sdp, inum->no_addr, GFS2_BLKST_DINODE); |
172 | if (error) | 169 | if (error) |
173 | goto fail; | 170 | goto fail; |
174 | 171 | ||
175 | error = -EINVAL; | 172 | inode = gfs2_inode_lookup(sb, DT_UNKNOWN, inum->no_addr, 0, 0); |
176 | rgd = gfs2_blk2rgrpd(sdp, inum->no_addr); | ||
177 | if (!rgd) | ||
178 | goto fail_rindex; | ||
179 | |||
180 | error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh); | ||
181 | if (error) | ||
182 | goto fail_rindex; | ||
183 | |||
184 | error = -ESTALE; | ||
185 | if (gfs2_get_block_type(rgd, inum->no_addr) != GFS2_BLKST_DINODE) | ||
186 | goto fail_rgd; | ||
187 | |||
188 | gfs2_glock_dq_uninit(&rgd_gh); | ||
189 | gfs2_glock_dq_uninit(&ri_gh); | ||
190 | |||
191 | inode = gfs2_inode_lookup(sb, DT_UNKNOWN, | ||
192 | inum->no_addr, | ||
193 | 0, 0); | ||
194 | if (IS_ERR(inode)) { | 173 | if (IS_ERR(inode)) { |
195 | error = PTR_ERR(inode); | 174 | error = PTR_ERR(inode); |
196 | goto fail; | 175 | goto fail; |
@@ -224,13 +203,6 @@ out_inode: | |||
224 | if (!IS_ERR(dentry)) | 203 | if (!IS_ERR(dentry)) |
225 | dentry->d_op = &gfs2_dops; | 204 | dentry->d_op = &gfs2_dops; |
226 | return dentry; | 205 | return dentry; |
227 | |||
228 | fail_rgd: | ||
229 | gfs2_glock_dq_uninit(&rgd_gh); | ||
230 | |||
231 | fail_rindex: | ||
232 | gfs2_glock_dq_uninit(&ri_gh); | ||
233 | |||
234 | fail: | 206 | fail: |
235 | gfs2_glock_dq_uninit(&i_gh); | 207 | gfs2_glock_dq_uninit(&i_gh); |
236 | return ERR_PTR(error); | 208 | return ERR_PTR(error); |
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 73318a3ce6f1..166f38fbd246 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c | |||
@@ -38,7 +38,6 @@ | |||
38 | #include "rgrp.h" | 38 | #include "rgrp.h" |
39 | #include "trans.h" | 39 | #include "trans.h" |
40 | #include "util.h" | 40 | #include "util.h" |
41 | #include "eaops.h" | ||
42 | 41 | ||
43 | /** | 42 | /** |
44 | * gfs2_llseek - seek to a location in a file | 43 | * gfs2_llseek - seek to a location in a file |
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 297421c0427a..8b674b1f3a55 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c | |||
@@ -63,6 +63,7 @@ static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int | |||
63 | static DECLARE_RWSEM(gfs2_umount_flush_sem); | 63 | static DECLARE_RWSEM(gfs2_umount_flush_sem); |
64 | static struct dentry *gfs2_root; | 64 | static struct dentry *gfs2_root; |
65 | static struct workqueue_struct *glock_workqueue; | 65 | static struct workqueue_struct *glock_workqueue; |
66 | struct workqueue_struct *gfs2_delete_workqueue; | ||
66 | static LIST_HEAD(lru_list); | 67 | static LIST_HEAD(lru_list); |
67 | static atomic_t lru_count = ATOMIC_INIT(0); | 68 | static atomic_t lru_count = ATOMIC_INIT(0); |
68 | static DEFINE_SPINLOCK(lru_lock); | 69 | static DEFINE_SPINLOCK(lru_lock); |
@@ -167,13 +168,33 @@ static void glock_free(struct gfs2_glock *gl) | |||
167 | * | 168 | * |
168 | */ | 169 | */ |
169 | 170 | ||
170 | static void gfs2_glock_hold(struct gfs2_glock *gl) | 171 | void gfs2_glock_hold(struct gfs2_glock *gl) |
171 | { | 172 | { |
172 | GLOCK_BUG_ON(gl, atomic_read(&gl->gl_ref) == 0); | 173 | GLOCK_BUG_ON(gl, atomic_read(&gl->gl_ref) == 0); |
173 | atomic_inc(&gl->gl_ref); | 174 | atomic_inc(&gl->gl_ref); |
174 | } | 175 | } |
175 | 176 | ||
176 | /** | 177 | /** |
178 | * demote_ok - Check to see if it's ok to unlock a glock | ||
179 | * @gl: the glock | ||
180 | * | ||
181 | * Returns: 1 if it's ok | ||
182 | */ | ||
183 | |||
184 | static int demote_ok(const struct gfs2_glock *gl) | ||
185 | { | ||
186 | const struct gfs2_glock_operations *glops = gl->gl_ops; | ||
187 | |||
188 | if (gl->gl_state == LM_ST_UNLOCKED) | ||
189 | return 0; | ||
190 | if (!list_empty(&gl->gl_holders)) | ||
191 | return 0; | ||
192 | if (glops->go_demote_ok) | ||
193 | return glops->go_demote_ok(gl); | ||
194 | return 1; | ||
195 | } | ||
196 | |||
197 | /** | ||
177 | * gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list | 198 | * gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list |
178 | * @gl: the glock | 199 | * @gl: the glock |
179 | * | 200 | * |
@@ -181,8 +202,13 @@ static void gfs2_glock_hold(struct gfs2_glock *gl) | |||
181 | 202 | ||
182 | static void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl) | 203 | static void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl) |
183 | { | 204 | { |
205 | int may_reclaim; | ||
206 | may_reclaim = (demote_ok(gl) && | ||
207 | (atomic_read(&gl->gl_ref) == 1 || | ||
208 | (gl->gl_name.ln_type == LM_TYPE_INODE && | ||
209 | atomic_read(&gl->gl_ref) <= 2))); | ||
184 | spin_lock(&lru_lock); | 210 | spin_lock(&lru_lock); |
185 | if (list_empty(&gl->gl_lru) && gl->gl_state != LM_ST_UNLOCKED) { | 211 | if (list_empty(&gl->gl_lru) && may_reclaim) { |
186 | list_add_tail(&gl->gl_lru, &lru_list); | 212 | list_add_tail(&gl->gl_lru, &lru_list); |
187 | atomic_inc(&lru_count); | 213 | atomic_inc(&lru_count); |
188 | } | 214 | } |
@@ -190,6 +216,21 @@ static void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl) | |||
190 | } | 216 | } |
191 | 217 | ||
192 | /** | 218 | /** |
219 | * gfs2_glock_put_nolock() - Decrement reference count on glock | ||
220 | * @gl: The glock to put | ||
221 | * | ||
222 | * This function should only be used if the caller has its own reference | ||
223 | * to the glock, in addition to the one it is dropping. | ||
224 | */ | ||
225 | |||
226 | void gfs2_glock_put_nolock(struct gfs2_glock *gl) | ||
227 | { | ||
228 | if (atomic_dec_and_test(&gl->gl_ref)) | ||
229 | GLOCK_BUG_ON(gl, 1); | ||
230 | gfs2_glock_schedule_for_reclaim(gl); | ||
231 | } | ||
232 | |||
233 | /** | ||
193 | * gfs2_glock_put() - Decrement reference count on glock | 234 | * gfs2_glock_put() - Decrement reference count on glock |
194 | * @gl: The glock to put | 235 | * @gl: The glock to put |
195 | * | 236 | * |
@@ -214,9 +255,9 @@ int gfs2_glock_put(struct gfs2_glock *gl) | |||
214 | rv = 1; | 255 | rv = 1; |
215 | goto out; | 256 | goto out; |
216 | } | 257 | } |
217 | /* 1 for being hashed, 1 for having state != LM_ST_UNLOCKED */ | 258 | spin_lock(&gl->gl_spin); |
218 | if (atomic_read(&gl->gl_ref) == 2) | 259 | gfs2_glock_schedule_for_reclaim(gl); |
219 | gfs2_glock_schedule_for_reclaim(gl); | 260 | spin_unlock(&gl->gl_spin); |
220 | write_unlock(gl_lock_addr(gl->gl_hash)); | 261 | write_unlock(gl_lock_addr(gl->gl_hash)); |
221 | out: | 262 | out: |
222 | return rv; | 263 | return rv; |
@@ -398,7 +439,7 @@ static void state_change(struct gfs2_glock *gl, unsigned int new_state) | |||
398 | if (held2) | 439 | if (held2) |
399 | gfs2_glock_hold(gl); | 440 | gfs2_glock_hold(gl); |
400 | else | 441 | else |
401 | gfs2_glock_put(gl); | 442 | gfs2_glock_put_nolock(gl); |
402 | } | 443 | } |
403 | 444 | ||
404 | gl->gl_state = new_state; | 445 | gl->gl_state = new_state; |
@@ -633,12 +674,35 @@ out: | |||
633 | out_sched: | 674 | out_sched: |
634 | gfs2_glock_hold(gl); | 675 | gfs2_glock_hold(gl); |
635 | if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) | 676 | if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) |
636 | gfs2_glock_put(gl); | 677 | gfs2_glock_put_nolock(gl); |
637 | out_unlock: | 678 | out_unlock: |
638 | clear_bit(GLF_LOCK, &gl->gl_flags); | 679 | clear_bit(GLF_LOCK, &gl->gl_flags); |
639 | goto out; | 680 | goto out; |
640 | } | 681 | } |
641 | 682 | ||
683 | static void delete_work_func(struct work_struct *work) | ||
684 | { | ||
685 | struct gfs2_glock *gl = container_of(work, struct gfs2_glock, gl_delete); | ||
686 | struct gfs2_sbd *sdp = gl->gl_sbd; | ||
687 | struct gfs2_inode *ip = NULL; | ||
688 | struct inode *inode; | ||
689 | u64 no_addr = 0; | ||
690 | |||
691 | spin_lock(&gl->gl_spin); | ||
692 | ip = (struct gfs2_inode *)gl->gl_object; | ||
693 | if (ip) | ||
694 | no_addr = ip->i_no_addr; | ||
695 | spin_unlock(&gl->gl_spin); | ||
696 | if (ip) { | ||
697 | inode = gfs2_ilookup(sdp->sd_vfs, no_addr); | ||
698 | if (inode) { | ||
699 | d_prune_aliases(inode); | ||
700 | iput(inode); | ||
701 | } | ||
702 | } | ||
703 | gfs2_glock_put(gl); | ||
704 | } | ||
705 | |||
642 | static void glock_work_func(struct work_struct *work) | 706 | static void glock_work_func(struct work_struct *work) |
643 | { | 707 | { |
644 | unsigned long delay = 0; | 708 | unsigned long delay = 0; |
@@ -717,6 +781,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, | |||
717 | gl->gl_sbd = sdp; | 781 | gl->gl_sbd = sdp; |
718 | gl->gl_aspace = NULL; | 782 | gl->gl_aspace = NULL; |
719 | INIT_DELAYED_WORK(&gl->gl_work, glock_work_func); | 783 | INIT_DELAYED_WORK(&gl->gl_work, glock_work_func); |
784 | INIT_WORK(&gl->gl_delete, delete_work_func); | ||
720 | 785 | ||
721 | /* If this glock protects actual on-disk data or metadata blocks, | 786 | /* If this glock protects actual on-disk data or metadata blocks, |
722 | create a VFS inode to manage the pages/buffers holding them. */ | 787 | create a VFS inode to manage the pages/buffers holding them. */ |
@@ -858,6 +923,8 @@ static void handle_callback(struct gfs2_glock *gl, unsigned int state, | |||
858 | gl->gl_demote_state != state) { | 923 | gl->gl_demote_state != state) { |
859 | gl->gl_demote_state = LM_ST_UNLOCKED; | 924 | gl->gl_demote_state = LM_ST_UNLOCKED; |
860 | } | 925 | } |
926 | if (gl->gl_ops->go_callback) | ||
927 | gl->gl_ops->go_callback(gl); | ||
861 | trace_gfs2_demote_rq(gl); | 928 | trace_gfs2_demote_rq(gl); |
862 | } | 929 | } |
863 | 930 | ||
@@ -1274,33 +1341,12 @@ void gfs2_glock_complete(struct gfs2_glock *gl, int ret) | |||
1274 | gfs2_glock_put(gl); | 1341 | gfs2_glock_put(gl); |
1275 | } | 1342 | } |
1276 | 1343 | ||
1277 | /** | ||
1278 | * demote_ok - Check to see if it's ok to unlock a glock | ||
1279 | * @gl: the glock | ||
1280 | * | ||
1281 | * Returns: 1 if it's ok | ||
1282 | */ | ||
1283 | |||
1284 | static int demote_ok(const struct gfs2_glock *gl) | ||
1285 | { | ||
1286 | const struct gfs2_glock_operations *glops = gl->gl_ops; | ||
1287 | |||
1288 | if (gl->gl_state == LM_ST_UNLOCKED) | ||
1289 | return 0; | ||
1290 | if (!list_empty(&gl->gl_holders)) | ||
1291 | return 0; | ||
1292 | if (glops->go_demote_ok) | ||
1293 | return glops->go_demote_ok(gl); | ||
1294 | return 1; | ||
1295 | } | ||
1296 | |||
1297 | 1344 | ||
1298 | static int gfs2_shrink_glock_memory(int nr, gfp_t gfp_mask) | 1345 | static int gfs2_shrink_glock_memory(int nr, gfp_t gfp_mask) |
1299 | { | 1346 | { |
1300 | struct gfs2_glock *gl; | 1347 | struct gfs2_glock *gl; |
1301 | int may_demote; | 1348 | int may_demote; |
1302 | int nr_skipped = 0; | 1349 | int nr_skipped = 0; |
1303 | int got_ref = 0; | ||
1304 | LIST_HEAD(skipped); | 1350 | LIST_HEAD(skipped); |
1305 | 1351 | ||
1306 | if (nr == 0) | 1352 | if (nr == 0) |
@@ -1315,37 +1361,29 @@ static int gfs2_shrink_glock_memory(int nr, gfp_t gfp_mask) | |||
1315 | list_del_init(&gl->gl_lru); | 1361 | list_del_init(&gl->gl_lru); |
1316 | atomic_dec(&lru_count); | 1362 | atomic_dec(&lru_count); |
1317 | 1363 | ||
1364 | /* Check if glock is about to be freed */ | ||
1365 | if (atomic_read(&gl->gl_ref) == 0) | ||
1366 | continue; | ||
1367 | |||
1318 | /* Test for being demotable */ | 1368 | /* Test for being demotable */ |
1319 | if (!test_and_set_bit(GLF_LOCK, &gl->gl_flags)) { | 1369 | if (!test_and_set_bit(GLF_LOCK, &gl->gl_flags)) { |
1320 | gfs2_glock_hold(gl); | 1370 | gfs2_glock_hold(gl); |
1321 | got_ref = 1; | ||
1322 | spin_unlock(&lru_lock); | 1371 | spin_unlock(&lru_lock); |
1323 | spin_lock(&gl->gl_spin); | 1372 | spin_lock(&gl->gl_spin); |
1324 | may_demote = demote_ok(gl); | 1373 | may_demote = demote_ok(gl); |
1325 | spin_unlock(&gl->gl_spin); | ||
1326 | clear_bit(GLF_LOCK, &gl->gl_flags); | ||
1327 | if (may_demote) { | 1374 | if (may_demote) { |
1328 | handle_callback(gl, LM_ST_UNLOCKED, 0); | 1375 | handle_callback(gl, LM_ST_UNLOCKED, 0); |
1329 | nr--; | 1376 | nr--; |
1330 | if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) | ||
1331 | gfs2_glock_put(gl); | ||
1332 | got_ref = 0; | ||
1333 | } | 1377 | } |
1378 | if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) | ||
1379 | gfs2_glock_put_nolock(gl); | ||
1380 | spin_unlock(&gl->gl_spin); | ||
1381 | clear_bit(GLF_LOCK, &gl->gl_flags); | ||
1334 | spin_lock(&lru_lock); | 1382 | spin_lock(&lru_lock); |
1335 | if (may_demote) | 1383 | continue; |
1336 | continue; | ||
1337 | } | ||
1338 | if (list_empty(&gl->gl_lru) && | ||
1339 | (atomic_read(&gl->gl_ref) <= (2 + got_ref))) { | ||
1340 | nr_skipped++; | ||
1341 | list_add(&gl->gl_lru, &skipped); | ||
1342 | } | ||
1343 | if (got_ref) { | ||
1344 | spin_unlock(&lru_lock); | ||
1345 | gfs2_glock_put(gl); | ||
1346 | spin_lock(&lru_lock); | ||
1347 | got_ref = 0; | ||
1348 | } | 1384 | } |
1385 | nr_skipped++; | ||
1386 | list_add(&gl->gl_lru, &skipped); | ||
1349 | } | 1387 | } |
1350 | list_splice(&skipped, &lru_list); | 1388 | list_splice(&skipped, &lru_list); |
1351 | atomic_add(nr_skipped, &lru_count); | 1389 | atomic_add(nr_skipped, &lru_count); |
@@ -1727,6 +1765,11 @@ int __init gfs2_glock_init(void) | |||
1727 | glock_workqueue = create_workqueue("glock_workqueue"); | 1765 | glock_workqueue = create_workqueue("glock_workqueue"); |
1728 | if (IS_ERR(glock_workqueue)) | 1766 | if (IS_ERR(glock_workqueue)) |
1729 | return PTR_ERR(glock_workqueue); | 1767 | return PTR_ERR(glock_workqueue); |
1768 | gfs2_delete_workqueue = create_workqueue("delete_workqueue"); | ||
1769 | if (IS_ERR(gfs2_delete_workqueue)) { | ||
1770 | destroy_workqueue(glock_workqueue); | ||
1771 | return PTR_ERR(gfs2_delete_workqueue); | ||
1772 | } | ||
1730 | 1773 | ||
1731 | register_shrinker(&glock_shrinker); | 1774 | register_shrinker(&glock_shrinker); |
1732 | 1775 | ||
@@ -1737,6 +1780,7 @@ void gfs2_glock_exit(void) | |||
1737 | { | 1780 | { |
1738 | unregister_shrinker(&glock_shrinker); | 1781 | unregister_shrinker(&glock_shrinker); |
1739 | destroy_workqueue(glock_workqueue); | 1782 | destroy_workqueue(glock_workqueue); |
1783 | destroy_workqueue(gfs2_delete_workqueue); | ||
1740 | } | 1784 | } |
1741 | 1785 | ||
1742 | static int gfs2_glock_iter_next(struct gfs2_glock_iter *gi) | 1786 | static int gfs2_glock_iter_next(struct gfs2_glock_iter *gi) |
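The glock.c changes above hoist demote_ok() ahead of gfs2_glock_schedule_for_reclaim() so that a glock only joins the LRU when it is both demotable and down to its last references. A stub model of the new may_reclaim test, reduced to the fields the test actually reads:

    #include <stdio.h>

    enum { LM_TYPE_INODE, LM_TYPE_OTHER };

    struct glock {
        int type;
        int refs;          /* atomic_read(&gl->gl_ref) */
        int unlocked;      /* gl_state == LM_ST_UNLOCKED */
        int has_holders;   /* !list_empty(&gl->gl_holders) */
    };

    /* demote_ok(); the per-type go_demote_ok() hook is elided */
    static int demote_ok(const struct glock *gl)
    {
        return !gl->unlocked && !gl->has_holders;
    }

    /* reclaim only when demotable and (nearly) unreferenced; inode
     * glocks tolerate one extra reference held by the inode itself */
    static int may_reclaim(const struct glock *gl)
    {
        return demote_ok(gl) &&
               (gl->refs == 1 ||
                (gl->type == LM_TYPE_INODE && gl->refs <= 2));
    }

    int main(void)
    {
        struct glock gl = { .type = LM_TYPE_INODE, .refs = 2 };

        printf("%d\n", may_reclaim(&gl));   /* 1: goes on the LRU */
        return 0;
    }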
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index a602a28f6f08..c609894ec0d0 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h | |||
@@ -143,6 +143,7 @@ struct lm_lockops { | |||
143 | 143 | ||
144 | #define GLR_TRYFAILED 13 | 144 | #define GLR_TRYFAILED 13 |
145 | 145 | ||
146 | extern struct workqueue_struct *gfs2_delete_workqueue; | ||
146 | static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *gl) | 147 | static inline struct gfs2_holder *gfs2_glock_is_locked_by_me(struct gfs2_glock *gl) |
147 | { | 148 | { |
148 | struct gfs2_holder *gh; | 149 | struct gfs2_holder *gh; |
@@ -191,6 +192,8 @@ static inline int gfs2_glock_is_blocking(struct gfs2_glock *gl) | |||
191 | int gfs2_glock_get(struct gfs2_sbd *sdp, | 192 | int gfs2_glock_get(struct gfs2_sbd *sdp, |
192 | u64 number, const struct gfs2_glock_operations *glops, | 193 | u64 number, const struct gfs2_glock_operations *glops, |
193 | int create, struct gfs2_glock **glp); | 194 | int create, struct gfs2_glock **glp); |
195 | void gfs2_glock_hold(struct gfs2_glock *gl); | ||
196 | void gfs2_glock_put_nolock(struct gfs2_glock *gl); | ||
194 | int gfs2_glock_put(struct gfs2_glock *gl); | 197 | int gfs2_glock_put(struct gfs2_glock *gl); |
195 | void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags, | 198 | void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags, |
196 | struct gfs2_holder *gh); | 199 | struct gfs2_holder *gh); |
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index d5e4ab155ca0..6985eef06c39 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c | |||
@@ -323,6 +323,7 @@ static void trans_go_sync(struct gfs2_glock *gl) | |||
323 | 323 | ||
324 | if (gl->gl_state != LM_ST_UNLOCKED && | 324 | if (gl->gl_state != LM_ST_UNLOCKED && |
325 | test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) { | 325 | test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) { |
326 | flush_workqueue(gfs2_delete_workqueue); | ||
326 | gfs2_meta_syncfs(sdp); | 327 | gfs2_meta_syncfs(sdp); |
327 | gfs2_log_shutdown(sdp); | 328 | gfs2_log_shutdown(sdp); |
328 | } | 329 | } |
@@ -372,6 +373,25 @@ static int trans_go_demote_ok(const struct gfs2_glock *gl) | |||
372 | return 0; | 373 | return 0; |
373 | } | 374 | } |
374 | 375 | ||
376 | /** | ||
377 | * iopen_go_callback - schedule the dcache entry for the inode to be deleted | ||
378 | * @gl: the glock | ||
379 | * | ||
380 | * gl_spin lock is held while calling this | ||
381 | */ | ||
382 | static void iopen_go_callback(struct gfs2_glock *gl) | ||
383 | { | ||
384 | struct gfs2_inode *ip = (struct gfs2_inode *)gl->gl_object; | ||
385 | |||
386 | if (gl->gl_demote_state == LM_ST_UNLOCKED && | ||
387 | gl->gl_state == LM_ST_SHARED && | ||
388 | ip && test_bit(GIF_USER, &ip->i_flags)) { | ||
389 | gfs2_glock_hold(gl); | ||
390 | if (queue_work(gfs2_delete_workqueue, &gl->gl_delete) == 0) | ||
391 | gfs2_glock_put_nolock(gl); | ||
392 | } | ||
393 | } | ||
394 | |||
375 | const struct gfs2_glock_operations gfs2_meta_glops = { | 395 | const struct gfs2_glock_operations gfs2_meta_glops = { |
376 | .go_type = LM_TYPE_META, | 396 | .go_type = LM_TYPE_META, |
377 | }; | 397 | }; |
@@ -406,6 +426,7 @@ const struct gfs2_glock_operations gfs2_trans_glops = { | |||
406 | 426 | ||
407 | const struct gfs2_glock_operations gfs2_iopen_glops = { | 427 | const struct gfs2_glock_operations gfs2_iopen_glops = { |
408 | .go_type = LM_TYPE_IOPEN, | 428 | .go_type = LM_TYPE_IOPEN, |
429 | .go_callback = iopen_go_callback, | ||
409 | }; | 430 | }; |
410 | 431 | ||
411 | const struct gfs2_glock_operations gfs2_flock_glops = { | 432 | const struct gfs2_glock_operations gfs2_flock_glops = { |
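iopen_go_callback() above schedules delete_work_func() only when a remote node asks a SHARED iopen glock on a user inode to drop to UNLOCKED, i.e. when the inode was deleted elsewhere in the cluster. A stub model of the trigger condition:

    #include <stdio.h>

    enum { LM_ST_UNLOCKED, LM_ST_SHARED, LM_ST_EXCLUSIVE };

    struct iopen_glock {
        int demote_state;    /* state a remote node asked for */
        int state;           /* state currently held */
        int has_user_inode;  /* gl_object set with GIF_USER */
    };

    /* queue delete work (prune dcache aliases, iput the inode) only
     * for a remote "please unlock" on a shared iopen glock */
    static int should_queue_delete(const struct iopen_glock *gl)
    {
        return gl->demote_state == LM_ST_UNLOCKED &&
               gl->state == LM_ST_SHARED &&
               gl->has_user_inode;
    }

    int main(void)
    {
        struct iopen_glock gl = { LM_ST_UNLOCKED, LM_ST_SHARED, 1 };

        printf("%d\n", should_queue_delete(&gl));   /* 1 */
        return 0;
    }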
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 225347fbff3c..6edb423f90b3 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h | |||
@@ -159,6 +159,7 @@ struct gfs2_glock_operations { | |||
159 | int (*go_lock) (struct gfs2_holder *gh); | 159 | int (*go_lock) (struct gfs2_holder *gh); |
160 | void (*go_unlock) (struct gfs2_holder *gh); | 160 | void (*go_unlock) (struct gfs2_holder *gh); |
161 | int (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl); | 161 | int (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl); |
162 | void (*go_callback) (struct gfs2_glock *gl); | ||
162 | const int go_type; | 163 | const int go_type; |
163 | const unsigned long go_min_hold_time; | 164 | const unsigned long go_min_hold_time; |
164 | }; | 165 | }; |
@@ -228,6 +229,7 @@ struct gfs2_glock { | |||
228 | struct list_head gl_ail_list; | 229 | struct list_head gl_ail_list; |
229 | atomic_t gl_ail_count; | 230 | atomic_t gl_ail_count; |
230 | struct delayed_work gl_work; | 231 | struct delayed_work gl_work; |
232 | struct work_struct gl_delete; | ||
231 | }; | 233 | }; |
232 | 234 | ||
233 | #define GFS2_MIN_LVB_SIZE 32 /* Min size of LVB that gfs2 supports */ | 235 | #define GFS2_MIN_LVB_SIZE 32 /* Min size of LVB that gfs2 supports */ |
@@ -404,6 +406,12 @@ struct gfs2_statfs_change_host { | |||
404 | #define GFS2_DATA_WRITEBACK 1 | 406 | #define GFS2_DATA_WRITEBACK 1 |
405 | #define GFS2_DATA_ORDERED 2 | 407 | #define GFS2_DATA_ORDERED 2 |
406 | 408 | ||
409 | #define GFS2_ERRORS_DEFAULT GFS2_ERRORS_WITHDRAW | ||
410 | #define GFS2_ERRORS_WITHDRAW 0 | ||
411 | #define GFS2_ERRORS_CONTINUE 1 /* place holder for future feature */ | ||
412 | #define GFS2_ERRORS_RO 2 /* place holder for future feature */ | ||
413 | #define GFS2_ERRORS_PANIC 3 | ||
414 | |||
407 | struct gfs2_args { | 415 | struct gfs2_args { |
408 | char ar_lockproto[GFS2_LOCKNAME_LEN]; /* Name of the Lock Protocol */ | 416 | char ar_lockproto[GFS2_LOCKNAME_LEN]; /* Name of the Lock Protocol */ |
409 | char ar_locktable[GFS2_LOCKNAME_LEN]; /* Name of the Lock Table */ | 417 | char ar_locktable[GFS2_LOCKNAME_LEN]; /* Name of the Lock Table */ |
@@ -420,6 +428,7 @@ struct gfs2_args { | |||
420 | unsigned int ar_data:2; /* ordered/writeback */ | 428 | unsigned int ar_data:2; /* ordered/writeback */ |
421 | unsigned int ar_meta:1; /* mount metafs */ | 429 | unsigned int ar_meta:1; /* mount metafs */ |
422 | unsigned int ar_discard:1; /* discard requests */ | 430 | unsigned int ar_discard:1; /* discard requests */ |
431 | unsigned int ar_errors:2; /* errors=withdraw | panic */ | ||
423 | int ar_commit; /* Commit interval */ | 432 | int ar_commit; /* Commit interval */ |
424 | }; | 433 | }; |
425 | 434 | ||
@@ -487,7 +496,6 @@ struct gfs2_sb_host { | |||
487 | */ | 496 | */ |
488 | 497 | ||
489 | struct lm_lockstruct { | 498 | struct lm_lockstruct { |
490 | u32 ls_id; | ||
491 | unsigned int ls_jid; | 499 | unsigned int ls_jid; |
492 | unsigned int ls_first; | 500 | unsigned int ls_first; |
493 | unsigned int ls_first_done; | 501 | unsigned int ls_first_done; |
@@ -539,18 +547,12 @@ struct gfs2_sbd { | |||
539 | struct dentry *sd_root_dir; | 547 | struct dentry *sd_root_dir; |
540 | 548 | ||
541 | struct inode *sd_jindex; | 549 | struct inode *sd_jindex; |
542 | struct inode *sd_inum_inode; | ||
543 | struct inode *sd_statfs_inode; | 550 | struct inode *sd_statfs_inode; |
544 | struct inode *sd_ir_inode; | ||
545 | struct inode *sd_sc_inode; | 551 | struct inode *sd_sc_inode; |
546 | struct inode *sd_qc_inode; | 552 | struct inode *sd_qc_inode; |
547 | struct inode *sd_rindex; | 553 | struct inode *sd_rindex; |
548 | struct inode *sd_quota_inode; | 554 | struct inode *sd_quota_inode; |
549 | 555 | ||
550 | /* Inum stuff */ | ||
551 | |||
552 | struct mutex sd_inum_mutex; | ||
553 | |||
554 | /* StatFS stuff */ | 556 | /* StatFS stuff */ |
555 | 557 | ||
556 | spinlock_t sd_statfs_spin; | 558 | spinlock_t sd_statfs_spin; |
@@ -578,7 +580,6 @@ struct gfs2_sbd { | |||
578 | struct gfs2_holder sd_journal_gh; | 580 | struct gfs2_holder sd_journal_gh; |
579 | struct gfs2_holder sd_jinode_gh; | 581 | struct gfs2_holder sd_jinode_gh; |
580 | 582 | ||
581 | struct gfs2_holder sd_ir_gh; | ||
582 | struct gfs2_holder sd_sc_gh; | 583 | struct gfs2_holder sd_sc_gh; |
583 | struct gfs2_holder sd_qc_gh; | 584 | struct gfs2_holder sd_qc_gh; |
584 | 585 | ||
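
The four GFS2_ERRORS_* values above fit exactly into the new two-bit ar_errors field (0..3); CONTINUE and RO are declared but not yet accepted. The real parsing is match_table based (see the super.c hunks later in this diff); a hypothetical string-to-value helper just to make the mapping concrete:

#include <linux/string.h>
#include <linux/errno.h>

static int gfs2_errors_from_string(const char *s)
{
        if (strcmp(s, "withdraw") == 0)
                return GFS2_ERRORS_WITHDRAW;    /* 0, and the default */
        if (strcmp(s, "panic") == 0)
                return GFS2_ERRORS_PANIC;       /* 3 */
        return -EINVAL;         /* "continue" and "ro" are placeholders */
}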
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 2f94bd723698..fb15d3b1f409 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c | |||
@@ -24,7 +24,7 @@ | |||
24 | #include "acl.h" | 24 | #include "acl.h" |
25 | #include "bmap.h" | 25 | #include "bmap.h" |
26 | #include "dir.h" | 26 | #include "dir.h" |
27 | #include "eattr.h" | 27 | #include "xattr.h" |
28 | #include "glock.h" | 28 | #include "glock.h" |
29 | #include "glops.h" | 29 | #include "glops.h" |
30 | #include "inode.h" | 30 | #include "inode.h" |
@@ -519,139 +519,6 @@ out: | |||
519 | return inode ? inode : ERR_PTR(error); | 519 | return inode ? inode : ERR_PTR(error); |
520 | } | 520 | } |
521 | 521 | ||
522 | static void gfs2_inum_range_in(struct gfs2_inum_range_host *ir, const void *buf) | ||
523 | { | ||
524 | const struct gfs2_inum_range *str = buf; | ||
525 | |||
526 | ir->ir_start = be64_to_cpu(str->ir_start); | ||
527 | ir->ir_length = be64_to_cpu(str->ir_length); | ||
528 | } | ||
529 | |||
530 | static void gfs2_inum_range_out(const struct gfs2_inum_range_host *ir, void *buf) | ||
531 | { | ||
532 | struct gfs2_inum_range *str = buf; | ||
533 | |||
534 | str->ir_start = cpu_to_be64(ir->ir_start); | ||
535 | str->ir_length = cpu_to_be64(ir->ir_length); | ||
536 | } | ||
537 | |||
538 | static int pick_formal_ino_1(struct gfs2_sbd *sdp, u64 *formal_ino) | ||
539 | { | ||
540 | struct gfs2_inode *ip = GFS2_I(sdp->sd_ir_inode); | ||
541 | struct buffer_head *bh; | ||
542 | struct gfs2_inum_range_host ir; | ||
543 | int error; | ||
544 | |||
545 | error = gfs2_trans_begin(sdp, RES_DINODE, 0); | ||
546 | if (error) | ||
547 | return error; | ||
548 | mutex_lock(&sdp->sd_inum_mutex); | ||
549 | |||
550 | error = gfs2_meta_inode_buffer(ip, &bh); | ||
551 | if (error) { | ||
552 | mutex_unlock(&sdp->sd_inum_mutex); | ||
553 | gfs2_trans_end(sdp); | ||
554 | return error; | ||
555 | } | ||
556 | |||
557 | gfs2_inum_range_in(&ir, bh->b_data + sizeof(struct gfs2_dinode)); | ||
558 | |||
559 | if (ir.ir_length) { | ||
560 | *formal_ino = ir.ir_start++; | ||
561 | ir.ir_length--; | ||
562 | gfs2_trans_add_bh(ip->i_gl, bh, 1); | ||
563 | gfs2_inum_range_out(&ir, | ||
564 | bh->b_data + sizeof(struct gfs2_dinode)); | ||
565 | brelse(bh); | ||
566 | mutex_unlock(&sdp->sd_inum_mutex); | ||
567 | gfs2_trans_end(sdp); | ||
568 | return 0; | ||
569 | } | ||
570 | |||
571 | brelse(bh); | ||
572 | |||
573 | mutex_unlock(&sdp->sd_inum_mutex); | ||
574 | gfs2_trans_end(sdp); | ||
575 | |||
576 | return 1; | ||
577 | } | ||
578 | |||
579 | static int pick_formal_ino_2(struct gfs2_sbd *sdp, u64 *formal_ino) | ||
580 | { | ||
581 | struct gfs2_inode *ip = GFS2_I(sdp->sd_ir_inode); | ||
582 | struct gfs2_inode *m_ip = GFS2_I(sdp->sd_inum_inode); | ||
583 | struct gfs2_holder gh; | ||
584 | struct buffer_head *bh; | ||
585 | struct gfs2_inum_range_host ir; | ||
586 | int error; | ||
587 | |||
588 | error = gfs2_glock_nq_init(m_ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); | ||
589 | if (error) | ||
590 | return error; | ||
591 | |||
592 | error = gfs2_trans_begin(sdp, 2 * RES_DINODE, 0); | ||
593 | if (error) | ||
594 | goto out; | ||
595 | mutex_lock(&sdp->sd_inum_mutex); | ||
596 | |||
597 | error = gfs2_meta_inode_buffer(ip, &bh); | ||
598 | if (error) | ||
599 | goto out_end_trans; | ||
600 | |||
601 | gfs2_inum_range_in(&ir, bh->b_data + sizeof(struct gfs2_dinode)); | ||
602 | |||
603 | if (!ir.ir_length) { | ||
604 | struct buffer_head *m_bh; | ||
605 | u64 x, y; | ||
606 | __be64 z; | ||
607 | |||
608 | error = gfs2_meta_inode_buffer(m_ip, &m_bh); | ||
609 | if (error) | ||
610 | goto out_brelse; | ||
611 | |||
612 | z = *(__be64 *)(m_bh->b_data + sizeof(struct gfs2_dinode)); | ||
613 | x = y = be64_to_cpu(z); | ||
614 | ir.ir_start = x; | ||
615 | ir.ir_length = GFS2_INUM_QUANTUM; | ||
616 | x += GFS2_INUM_QUANTUM; | ||
617 | if (x < y) | ||
618 | gfs2_consist_inode(m_ip); | ||
619 | z = cpu_to_be64(x); | ||
620 | gfs2_trans_add_bh(m_ip->i_gl, m_bh, 1); | ||
621 | *(__be64 *)(m_bh->b_data + sizeof(struct gfs2_dinode)) = z; | ||
622 | |||
623 | brelse(m_bh); | ||
624 | } | ||
625 | |||
626 | *formal_ino = ir.ir_start++; | ||
627 | ir.ir_length--; | ||
628 | |||
629 | gfs2_trans_add_bh(ip->i_gl, bh, 1); | ||
630 | gfs2_inum_range_out(&ir, bh->b_data + sizeof(struct gfs2_dinode)); | ||
631 | |||
632 | out_brelse: | ||
633 | brelse(bh); | ||
634 | out_end_trans: | ||
635 | mutex_unlock(&sdp->sd_inum_mutex); | ||
636 | gfs2_trans_end(sdp); | ||
637 | out: | ||
638 | gfs2_glock_dq_uninit(&gh); | ||
639 | return error; | ||
640 | } | ||
641 | |||
642 | static int pick_formal_ino(struct gfs2_sbd *sdp, u64 *inum) | ||
643 | { | ||
644 | int error; | ||
645 | |||
646 | error = pick_formal_ino_1(sdp, inum); | ||
647 | if (error <= 0) | ||
648 | return error; | ||
649 | |||
650 | error = pick_formal_ino_2(sdp, inum); | ||
651 | |||
652 | return error; | ||
653 | } | ||
654 | |||
655 | /** | 522 | /** |
656 | * create_ok - OK to create a new on-disk inode here? | 523 | * create_ok - OK to create a new on-disk inode here? |
657 | * @dip: Directory in which dinode is to be created | 524 | * @dip: Directory in which dinode is to be created |
@@ -731,7 +598,7 @@ static int alloc_dinode(struct gfs2_inode *dip, u64 *no_addr, u64 *generation) | |||
731 | if (error) | 598 | if (error) |
732 | goto out_ipreserv; | 599 | goto out_ipreserv; |
733 | 600 | ||
734 | *no_addr = gfs2_alloc_di(dip, generation); | 601 | error = gfs2_alloc_di(dip, no_addr, generation); |
735 | 602 | ||
736 | gfs2_trans_end(sdp); | 603 | gfs2_trans_end(sdp); |
737 | 604 | ||
@@ -924,7 +791,6 @@ static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip) | |||
924 | size_t len; | 791 | size_t len; |
925 | void *value; | 792 | void *value; |
926 | char *name; | 793 | char *name; |
927 | struct gfs2_ea_request er; | ||
928 | 794 | ||
929 | err = security_inode_init_security(&ip->i_inode, &dip->i_inode, | 795 | err = security_inode_init_security(&ip->i_inode, &dip->i_inode, |
930 | &name, &value, &len); | 796 | &name, &value, &len); |
@@ -935,16 +801,7 @@ static int gfs2_security_init(struct gfs2_inode *dip, struct gfs2_inode *ip) | |||
935 | return err; | 801 | return err; |
936 | } | 802 | } |
937 | 803 | ||
938 | memset(&er, 0, sizeof(struct gfs2_ea_request)); | 804 | err = gfs2_xattr_set(&ip->i_inode, GFS2_EATYPE_SECURITY, name, value, len, 0); |
939 | |||
940 | er.er_type = GFS2_EATYPE_SECURITY; | ||
941 | er.er_name = name; | ||
942 | er.er_data = value; | ||
943 | er.er_name_len = strlen(name); | ||
944 | er.er_data_len = len; | ||
945 | |||
946 | err = gfs2_ea_set_i(ip, &er); | ||
947 | |||
948 | kfree(value); | 805 | kfree(value); |
949 | kfree(name); | 806 | kfree(name); |
950 | 807 | ||
@@ -991,13 +848,10 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name, | |||
991 | if (error) | 848 | if (error) |
992 | goto fail_gunlock; | 849 | goto fail_gunlock; |
993 | 850 | ||
994 | error = pick_formal_ino(sdp, &inum.no_formal_ino); | ||
995 | if (error) | ||
996 | goto fail_gunlock; | ||
997 | |||
998 | error = alloc_dinode(dip, &inum.no_addr, &generation); | 851 | error = alloc_dinode(dip, &inum.no_addr, &generation); |
999 | if (error) | 852 | if (error) |
1000 | goto fail_gunlock; | 853 | goto fail_gunlock; |
854 | inum.no_formal_ino = generation; | ||
1001 | 855 | ||
1002 | error = gfs2_glock_nq_num(sdp, inum.no_addr, &gfs2_inode_glops, | 856 | error = gfs2_glock_nq_num(sdp, inum.no_addr, &gfs2_inode_glops, |
1003 | LM_ST_EXCLUSIVE, GL_SKIP, ghs + 1); | 857 | LM_ST_EXCLUSIVE, GL_SKIP, ghs + 1); |
@@ -1008,9 +862,8 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name, | |||
1008 | if (error) | 862 | if (error) |
1009 | goto fail_gunlock2; | 863 | goto fail_gunlock2; |
1010 | 864 | ||
1011 | inode = gfs2_inode_lookup(dir->i_sb, IF2DT(mode), | 865 | inode = gfs2_inode_lookup(dir->i_sb, IF2DT(mode), inum.no_addr, |
1012 | inum.no_addr, | 866 | inum.no_formal_ino, 0); |
1013 | inum.no_formal_ino, 0); | ||
1014 | if (IS_ERR(inode)) | 867 | if (IS_ERR(inode)) |
1015 | goto fail_gunlock2; | 868 | goto fail_gunlock2; |
1016 | 869 | ||
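
With pick_formal_ino() and the per-node inum_range files gone, the formal inode number is simply the generation number returned by alloc_dinode(). The generation comes from the per-rgrp rd_igeneration counter, as the rgrp.c hunk later in this diff shows; its zero-skip behaviour, lifted into a hypothetical helper for clarity:

/* hypothetical helper; gfs2_alloc_di() open-codes this */
static u64 next_formal_ino(struct gfs2_rgrpd *rgd)
{
        u64 gen = rgd->rd_igeneration++;

        if (gen == 0)           /* zero is reserved as "no formal ino" */
                gen = rgd->rd_igeneration++;
        return gen;
}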
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 7bc3c45cd676..52fb6c048981 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c | |||
@@ -84,7 +84,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) | |||
84 | 84 | ||
85 | gfs2_tune_init(&sdp->sd_tune); | 85 | gfs2_tune_init(&sdp->sd_tune); |
86 | 86 | ||
87 | mutex_init(&sdp->sd_inum_mutex); | ||
88 | spin_lock_init(&sdp->sd_statfs_spin); | 87 | spin_lock_init(&sdp->sd_statfs_spin); |
89 | 88 | ||
90 | spin_lock_init(&sdp->sd_rindex_spin); | 89 | spin_lock_init(&sdp->sd_rindex_spin); |
@@ -833,21 +832,12 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo) | |||
833 | if (error) | 832 | if (error) |
834 | goto fail; | 833 | goto fail; |
835 | 834 | ||
836 | /* Read in the master inode number inode */ | ||
837 | sdp->sd_inum_inode = gfs2_lookup_simple(master, "inum"); | ||
838 | if (IS_ERR(sdp->sd_inum_inode)) { | ||
839 | error = PTR_ERR(sdp->sd_inum_inode); | ||
840 | fs_err(sdp, "can't read in inum inode: %d\n", error); | ||
841 | goto fail_journal; | ||
842 | } | ||
843 | |||
844 | |||
845 | /* Read in the master statfs inode */ | 835 | /* Read in the master statfs inode */ |
846 | sdp->sd_statfs_inode = gfs2_lookup_simple(master, "statfs"); | 836 | sdp->sd_statfs_inode = gfs2_lookup_simple(master, "statfs"); |
847 | if (IS_ERR(sdp->sd_statfs_inode)) { | 837 | if (IS_ERR(sdp->sd_statfs_inode)) { |
848 | error = PTR_ERR(sdp->sd_statfs_inode); | 838 | error = PTR_ERR(sdp->sd_statfs_inode); |
849 | fs_err(sdp, "can't read in statfs inode: %d\n", error); | 839 | fs_err(sdp, "can't read in statfs inode: %d\n", error); |
850 | goto fail_inum; | 840 | goto fail_journal; |
851 | } | 841 | } |
852 | 842 | ||
853 | /* Read in the resource index inode */ | 843 | /* Read in the resource index inode */ |
@@ -876,8 +866,6 @@ fail_rindex: | |||
876 | iput(sdp->sd_rindex); | 866 | iput(sdp->sd_rindex); |
877 | fail_statfs: | 867 | fail_statfs: |
878 | iput(sdp->sd_statfs_inode); | 868 | iput(sdp->sd_statfs_inode); |
879 | fail_inum: | ||
880 | iput(sdp->sd_inum_inode); | ||
881 | fail_journal: | 869 | fail_journal: |
882 | init_journal(sdp, UNDO); | 870 | init_journal(sdp, UNDO); |
883 | fail: | 871 | fail: |
@@ -905,20 +893,12 @@ static int init_per_node(struct gfs2_sbd *sdp, int undo) | |||
905 | return error; | 893 | return error; |
906 | } | 894 | } |
907 | 895 | ||
908 | sprintf(buf, "inum_range%u", sdp->sd_jdesc->jd_jid); | ||
909 | sdp->sd_ir_inode = gfs2_lookup_simple(pn, buf); | ||
910 | if (IS_ERR(sdp->sd_ir_inode)) { | ||
911 | error = PTR_ERR(sdp->sd_ir_inode); | ||
912 | fs_err(sdp, "can't find local \"ir\" file: %d\n", error); | ||
913 | goto fail; | ||
914 | } | ||
915 | |||
916 | sprintf(buf, "statfs_change%u", sdp->sd_jdesc->jd_jid); | 896 | sprintf(buf, "statfs_change%u", sdp->sd_jdesc->jd_jid); |
917 | sdp->sd_sc_inode = gfs2_lookup_simple(pn, buf); | 897 | sdp->sd_sc_inode = gfs2_lookup_simple(pn, buf); |
918 | if (IS_ERR(sdp->sd_sc_inode)) { | 898 | if (IS_ERR(sdp->sd_sc_inode)) { |
919 | error = PTR_ERR(sdp->sd_sc_inode); | 899 | error = PTR_ERR(sdp->sd_sc_inode); |
920 | fs_err(sdp, "can't find local \"sc\" file: %d\n", error); | 900 | fs_err(sdp, "can't find local \"sc\" file: %d\n", error); |
921 | goto fail_ir_i; | 901 | goto fail; |
922 | } | 902 | } |
923 | 903 | ||
924 | sprintf(buf, "quota_change%u", sdp->sd_jdesc->jd_jid); | 904 | sprintf(buf, "quota_change%u", sdp->sd_jdesc->jd_jid); |
@@ -932,27 +912,16 @@ static int init_per_node(struct gfs2_sbd *sdp, int undo) | |||
932 | iput(pn); | 912 | iput(pn); |
933 | pn = NULL; | 913 | pn = NULL; |
934 | 914 | ||
935 | ip = GFS2_I(sdp->sd_ir_inode); | ||
936 | error = gfs2_glock_nq_init(ip->i_gl, | ||
937 | LM_ST_EXCLUSIVE, 0, | ||
938 | &sdp->sd_ir_gh); | ||
939 | if (error) { | ||
940 | fs_err(sdp, "can't lock local \"ir\" file: %d\n", error); | ||
941 | goto fail_qc_i; | ||
942 | } | ||
943 | |||
944 | ip = GFS2_I(sdp->sd_sc_inode); | 915 | ip = GFS2_I(sdp->sd_sc_inode); |
945 | error = gfs2_glock_nq_init(ip->i_gl, | 916 | error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, |
946 | LM_ST_EXCLUSIVE, 0, | ||
947 | &sdp->sd_sc_gh); | 917 | &sdp->sd_sc_gh); |
948 | if (error) { | 918 | if (error) { |
949 | fs_err(sdp, "can't lock local \"sc\" file: %d\n", error); | 919 | fs_err(sdp, "can't lock local \"sc\" file: %d\n", error); |
950 | goto fail_ir_gh; | 920 | goto fail_qc_i; |
951 | } | 921 | } |
952 | 922 | ||
953 | ip = GFS2_I(sdp->sd_qc_inode); | 923 | ip = GFS2_I(sdp->sd_qc_inode); |
954 | error = gfs2_glock_nq_init(ip->i_gl, | 924 | error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, |
955 | LM_ST_EXCLUSIVE, 0, | ||
956 | &sdp->sd_qc_gh); | 925 | &sdp->sd_qc_gh); |
957 | if (error) { | 926 | if (error) { |
958 | fs_err(sdp, "can't lock local \"qc\" file: %d\n", error); | 927 | fs_err(sdp, "can't lock local \"qc\" file: %d\n", error); |
@@ -965,14 +934,10 @@ fail_qc_gh: | |||
965 | gfs2_glock_dq_uninit(&sdp->sd_qc_gh); | 934 | gfs2_glock_dq_uninit(&sdp->sd_qc_gh); |
966 | fail_ut_gh: | 935 | fail_ut_gh: |
967 | gfs2_glock_dq_uninit(&sdp->sd_sc_gh); | 936 | gfs2_glock_dq_uninit(&sdp->sd_sc_gh); |
968 | fail_ir_gh: | ||
969 | gfs2_glock_dq_uninit(&sdp->sd_ir_gh); | ||
970 | fail_qc_i: | 937 | fail_qc_i: |
971 | iput(sdp->sd_qc_inode); | 938 | iput(sdp->sd_qc_inode); |
972 | fail_ut_i: | 939 | fail_ut_i: |
973 | iput(sdp->sd_sc_inode); | 940 | iput(sdp->sd_sc_inode); |
974 | fail_ir_i: | ||
975 | iput(sdp->sd_ir_inode); | ||
976 | fail: | 941 | fail: |
977 | if (pn) | 942 | if (pn) |
978 | iput(pn); | 943 | iput(pn); |
@@ -1063,7 +1028,6 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent) | |||
1063 | 1028 | ||
1064 | ls->ls_ops = lm; | 1029 | ls->ls_ops = lm; |
1065 | ls->ls_first = 1; | 1030 | ls->ls_first = 1; |
1066 | ls->ls_id = 0; | ||
1067 | 1031 | ||
1068 | for (options = args->ar_hostdata; (o = strsep(&options, ":")); ) { | 1032 | for (options = args->ar_hostdata; (o = strsep(&options, ":")); ) { |
1069 | substring_t tmp[MAX_OPT_ARGS]; | 1033 | substring_t tmp[MAX_OPT_ARGS]; |
@@ -1081,10 +1045,7 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent) | |||
1081 | ls->ls_jid = option; | 1045 | ls->ls_jid = option; |
1082 | break; | 1046 | break; |
1083 | case Opt_id: | 1047 | case Opt_id: |
1084 | ret = match_int(&tmp[0], &option); | 1048 | /* Obsolete, but left for backward compat purposes */ |
1085 | if (ret) | ||
1086 | goto hostdata_error; | ||
1087 | ls->ls_id = option; | ||
1088 | break; | 1049 | break; |
1089 | case Opt_first: | 1050 | case Opt_first: |
1090 | ret = match_int(&tmp[0], &option); | 1051 | ret = match_int(&tmp[0], &option); |
@@ -1133,6 +1094,17 @@ void gfs2_lm_unmount(struct gfs2_sbd *sdp) | |||
1133 | lm->lm_unmount(sdp); | 1094 | lm->lm_unmount(sdp); |
1134 | } | 1095 | } |
1135 | 1096 | ||
1097 | void gfs2_online_uevent(struct gfs2_sbd *sdp) | ||
1098 | { | ||
1099 | struct super_block *sb = sdp->sd_vfs; | ||
1100 | char ro[20]; | ||
1101 | char spectator[20]; | ||
1102 | char *envp[] = { ro, spectator, NULL }; | ||
1103 | sprintf(ro, "RDONLY=%d", (sb->s_flags & MS_RDONLY) ? 1 : 0); | ||
1104 | sprintf(spectator, "SPECTATOR=%d", sdp->sd_args.ar_spectator ? 1 : 0); | ||
1105 | kobject_uevent_env(&sdp->sd_kobj, KOBJ_ONLINE, envp); | ||
1106 | } | ||
1107 | |||
1136 | /** | 1108 | /** |
1137 | * fill_super - Read in superblock | 1109 | * fill_super - Read in superblock |
1138 | * @sb: The VFS superblock | 1110 | * @sb: The VFS superblock |
@@ -1157,6 +1129,7 @@ static int fill_super(struct super_block *sb, void *data, int silent) | |||
1157 | sdp->sd_args.ar_quota = GFS2_QUOTA_DEFAULT; | 1129 | sdp->sd_args.ar_quota = GFS2_QUOTA_DEFAULT; |
1158 | sdp->sd_args.ar_data = GFS2_DATA_DEFAULT; | 1130 | sdp->sd_args.ar_data = GFS2_DATA_DEFAULT; |
1159 | sdp->sd_args.ar_commit = 60; | 1131 | sdp->sd_args.ar_commit = 60; |
1132 | sdp->sd_args.ar_errors = GFS2_ERRORS_DEFAULT; | ||
1160 | 1133 | ||
1161 | error = gfs2_mount_args(sdp, &sdp->sd_args, data); | 1134 | error = gfs2_mount_args(sdp, &sdp->sd_args, data); |
1162 | if (error) { | 1135 | if (error) { |
@@ -1174,6 +1147,7 @@ static int fill_super(struct super_block *sb, void *data, int silent) | |||
1174 | sb->s_magic = GFS2_MAGIC; | 1147 | sb->s_magic = GFS2_MAGIC; |
1175 | sb->s_op = &gfs2_super_ops; | 1148 | sb->s_op = &gfs2_super_ops; |
1176 | sb->s_export_op = &gfs2_export_ops; | 1149 | sb->s_export_op = &gfs2_export_ops; |
1150 | sb->s_xattr = gfs2_xattr_handlers; | ||
1177 | sb->s_time_gran = 1; | 1151 | sb->s_time_gran = 1; |
1178 | sb->s_maxbytes = MAX_LFS_FILESIZE; | 1152 | sb->s_maxbytes = MAX_LFS_FILESIZE; |
1179 | 1153 | ||
@@ -1236,7 +1210,7 @@ static int fill_super(struct super_block *sb, void *data, int silent) | |||
1236 | } | 1210 | } |
1237 | 1211 | ||
1238 | gfs2_glock_dq_uninit(&mount_gh); | 1212 | gfs2_glock_dq_uninit(&mount_gh); |
1239 | 1213 | gfs2_online_uevent(sdp); | |
1240 | return 0; | 1214 | return 0; |
1241 | 1215 | ||
1242 | fail_threads: | 1216 | fail_threads: |
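
gfs2_online_uevent() above fills two fixed 20-byte buffers with sprintf(); "RDONLY=0" and "SPECTATOR=0" fit with room to spare, so this is safe, but the bounds-checked spelling of the same uevent-with-environment pattern is worth seeing (a sketch, not a proposed change):

static void fs_online_uevent(struct kobject *kobj, bool rdonly, bool spectator)
{
        char ro[20], spec[20];
        char *envp[] = { ro, spec, NULL };

        snprintf(ro, sizeof(ro), "RDONLY=%d", rdonly ? 1 : 0);
        snprintf(spec, sizeof(spec), "SPECTATOR=%d", spectator ? 1 : 0);
        kobject_uevent_env(kobj, KOBJ_ONLINE, envp);
}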
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c index f8bd20baf99c..c3ac18054057 100644 --- a/fs/gfs2/ops_inode.c +++ b/fs/gfs2/ops_inode.c | |||
@@ -26,8 +26,7 @@ | |||
26 | #include "acl.h" | 26 | #include "acl.h" |
27 | #include "bmap.h" | 27 | #include "bmap.h" |
28 | #include "dir.h" | 28 | #include "dir.h" |
29 | #include "eaops.h" | 29 | #include "xattr.h" |
30 | #include "eattr.h" | ||
31 | #include "glock.h" | 30 | #include "glock.h" |
32 | #include "inode.h" | 31 | #include "inode.h" |
33 | #include "meta_io.h" | 32 | #include "meta_io.h" |
@@ -349,7 +348,7 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry) | |||
349 | 348 | ||
350 | error = gfs2_trans_begin(sdp, 2*RES_DINODE + RES_LEAF + RES_RG_BIT, 0); | 349 | error = gfs2_trans_begin(sdp, 2*RES_DINODE + RES_LEAF + RES_RG_BIT, 0); |
351 | if (error) | 350 | if (error) |
352 | goto out_rgrp; | 351 | goto out_gunlock; |
353 | 352 | ||
354 | error = gfs2_dir_del(dip, &dentry->d_name); | 353 | error = gfs2_dir_del(dip, &dentry->d_name); |
355 | if (error) | 354 | if (error) |
@@ -1302,60 +1301,53 @@ static int gfs2_setxattr(struct dentry *dentry, const char *name, | |||
1302 | const void *data, size_t size, int flags) | 1301 | const void *data, size_t size, int flags) |
1303 | { | 1302 | { |
1304 | struct inode *inode = dentry->d_inode; | 1303 | struct inode *inode = dentry->d_inode; |
1305 | struct gfs2_ea_request er; | 1304 | struct gfs2_inode *ip = GFS2_I(inode); |
1306 | 1305 | struct gfs2_holder gh; | |
1307 | memset(&er, 0, sizeof(struct gfs2_ea_request)); | 1306 | int ret; |
1308 | er.er_type = gfs2_ea_name2type(name, &er.er_name); | ||
1309 | if (er.er_type == GFS2_EATYPE_UNUSED) | ||
1310 | return -EOPNOTSUPP; | ||
1311 | er.er_data = (char *)data; | ||
1312 | er.er_name_len = strlen(er.er_name); | ||
1313 | er.er_data_len = size; | ||
1314 | er.er_flags = flags; | ||
1315 | |||
1316 | gfs2_assert_warn(GFS2_SB(inode), !(er.er_flags & GFS2_ERF_MODE)); | ||
1317 | 1307 | ||
1318 | return gfs2_ea_set(GFS2_I(inode), &er); | 1308 | gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); |
1309 | ret = gfs2_glock_nq(&gh); | ||
1310 | if (ret == 0) { | ||
1311 | ret = generic_setxattr(dentry, name, data, size, flags); | ||
1312 | gfs2_glock_dq(&gh); | ||
1313 | } | ||
1314 | gfs2_holder_uninit(&gh); | ||
1315 | return ret; | ||
1319 | } | 1316 | } |
1320 | 1317 | ||
1321 | static ssize_t gfs2_getxattr(struct dentry *dentry, const char *name, | 1318 | static ssize_t gfs2_getxattr(struct dentry *dentry, const char *name, |
1322 | void *data, size_t size) | 1319 | void *data, size_t size) |
1323 | { | 1320 | { |
1324 | struct gfs2_ea_request er; | 1321 | struct inode *inode = dentry->d_inode; |
1325 | 1322 | struct gfs2_inode *ip = GFS2_I(inode); | |
1326 | memset(&er, 0, sizeof(struct gfs2_ea_request)); | 1323 | struct gfs2_holder gh; |
1327 | er.er_type = gfs2_ea_name2type(name, &er.er_name); | 1324 | int ret; |
1328 | if (er.er_type == GFS2_EATYPE_UNUSED) | ||
1329 | return -EOPNOTSUPP; | ||
1330 | er.er_data = data; | ||
1331 | er.er_name_len = strlen(er.er_name); | ||
1332 | er.er_data_len = size; | ||
1333 | |||
1334 | return gfs2_ea_get(GFS2_I(dentry->d_inode), &er); | ||
1335 | } | ||
1336 | |||
1337 | static ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size) | ||
1338 | { | ||
1339 | struct gfs2_ea_request er; | ||
1340 | |||
1341 | memset(&er, 0, sizeof(struct gfs2_ea_request)); | ||
1342 | er.er_data = (size) ? buffer : NULL; | ||
1343 | er.er_data_len = size; | ||
1344 | 1325 | ||
1345 | return gfs2_ea_list(GFS2_I(dentry->d_inode), &er); | 1326 | gfs2_holder_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh); |
1327 | ret = gfs2_glock_nq(&gh); | ||
1328 | if (ret == 0) { | ||
1329 | ret = generic_getxattr(dentry, name, data, size); | ||
1330 | gfs2_glock_dq(&gh); | ||
1331 | } | ||
1332 | gfs2_holder_uninit(&gh); | ||
1333 | return ret; | ||
1346 | } | 1334 | } |
1347 | 1335 | ||
1348 | static int gfs2_removexattr(struct dentry *dentry, const char *name) | 1336 | static int gfs2_removexattr(struct dentry *dentry, const char *name) |
1349 | { | 1337 | { |
1350 | struct gfs2_ea_request er; | 1338 | struct inode *inode = dentry->d_inode; |
1351 | 1339 | struct gfs2_inode *ip = GFS2_I(inode); | |
1352 | memset(&er, 0, sizeof(struct gfs2_ea_request)); | 1340 | struct gfs2_holder gh; |
1353 | er.er_type = gfs2_ea_name2type(name, &er.er_name); | 1341 | int ret; |
1354 | if (er.er_type == GFS2_EATYPE_UNUSED) | ||
1355 | return -EOPNOTSUPP; | ||
1356 | er.er_name_len = strlen(er.er_name); | ||
1357 | 1342 | ||
1358 | return gfs2_ea_remove(GFS2_I(dentry->d_inode), &er); | 1343 | gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); |
1344 | ret = gfs2_glock_nq(&gh); | ||
1345 | if (ret == 0) { | ||
1346 | ret = generic_removexattr(dentry, name); | ||
1347 | gfs2_glock_dq(&gh); | ||
1348 | } | ||
1349 | gfs2_holder_uninit(&gh); | ||
1350 | return ret; | ||
1359 | } | 1351 | } |
1360 | 1352 | ||
1361 | static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, | 1353 | static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, |
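
All three xattr entry points rewritten above share one shape: initialise a holder (EXCLUSIVE for set/remove, SHARED plus LM_FLAG_ANY for get), enqueue the glock, delegate to the generic_*xattr helper -- which now dispatches through the sb->s_xattr handler table installed in fill_super() -- then drop the lock. That shape, factored into a wrapper purely for illustration (the three generics take different arguments, so the patch rightly does not share code this way):

typedef int (*xattr_op_t)(struct dentry *dentry);       /* hypothetical */

static int gfs2_xattr_locked(struct dentry *dentry, unsigned int state,
                             int flags, xattr_op_t op)
{
        struct gfs2_inode *ip = GFS2_I(dentry->d_inode);
        struct gfs2_holder gh;
        int ret;

        gfs2_holder_init(ip->i_gl, state, flags, &gh);
        ret = gfs2_glock_nq(&gh);
        if (ret == 0) {
                ret = op(dentry);
                gfs2_glock_dq(&gh);
        }
        gfs2_holder_uninit(&gh);
        return ret;
}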
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index daa4ae341a29..28c590b7c9da 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c | |||
@@ -285,27 +285,19 @@ void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd) | |||
285 | } | 285 | } |
286 | 286 | ||
287 | tmp = rgd->rd_data - rgd->rd_free - rgd->rd_dinodes; | 287 | tmp = rgd->rd_data - rgd->rd_free - rgd->rd_dinodes; |
288 | if (count[1] + count[2] != tmp) { | 288 | if (count[1] != tmp) { |
289 | if (gfs2_consist_rgrpd(rgd)) | 289 | if (gfs2_consist_rgrpd(rgd)) |
290 | fs_err(sdp, "used data mismatch: %u != %u\n", | 290 | fs_err(sdp, "used data mismatch: %u != %u\n", |
291 | count[1], tmp); | 291 | count[1], tmp); |
292 | return; | 292 | return; |
293 | } | 293 | } |
294 | 294 | ||
295 | if (count[3] != rgd->rd_dinodes) { | 295 | if (count[2] + count[3] != rgd->rd_dinodes) { |
296 | if (gfs2_consist_rgrpd(rgd)) | 296 | if (gfs2_consist_rgrpd(rgd)) |
297 | fs_err(sdp, "used metadata mismatch: %u != %u\n", | 297 | fs_err(sdp, "used metadata mismatch: %u != %u\n", |
298 | count[3], rgd->rd_dinodes); | 298 | count[2] + count[3], rgd->rd_dinodes); |
299 | return; | 299 | return; |
300 | } | 300 | } |
301 | |||
302 | if (count[2] > count[3]) { | ||
303 | if (gfs2_consist_rgrpd(rgd)) | ||
304 | fs_err(sdp, "unlinked inodes > inodes: %u\n", | ||
305 | count[2]); | ||
306 | return; | ||
307 | } | ||
308 | |||
309 | } | 301 | } |
310 | 302 | ||
311 | static inline int rgrp_contains_block(struct gfs2_rgrpd *rgd, u64 block) | 303 | static inline int rgrp_contains_block(struct gfs2_rgrpd *rgd, u64 block) |
@@ -865,7 +857,8 @@ static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, | |||
865 | goto start_new_extent; | 857 | goto start_new_extent; |
866 | if ((start + nr_sects) != blk) { | 858 | if ((start + nr_sects) != blk) { |
867 | rv = blkdev_issue_discard(bdev, start, | 859 | rv = blkdev_issue_discard(bdev, start, |
868 | nr_sects, GFP_NOFS); | 860 | nr_sects, GFP_NOFS, |
861 | DISCARD_FL_BARRIER); | ||
869 | if (rv) | 862 | if (rv) |
870 | goto fail; | 863 | goto fail; |
871 | nr_sects = 0; | 864 | nr_sects = 0; |
@@ -879,7 +872,8 @@ start_new_extent: | |||
879 | } | 872 | } |
880 | } | 873 | } |
881 | if (nr_sects) { | 874 | if (nr_sects) { |
882 | rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS); | 875 | rv = blkdev_issue_discard(bdev, start, nr_sects, GFP_NOFS, |
876 | DISCARD_FL_BARRIER); | ||
883 | if (rv) | 877 | if (rv) |
884 | goto fail; | 878 | goto fail; |
885 | } | 879 | } |
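
The two call sites above only gain the DISCARD_FL_BARRIER flag, but the surrounding logic is the classic extent-batching loop: contiguous free sectors accumulate into a (start, nr_sects) pair, one discard is issued whenever a discontinuity appears, and a final discard flushes the trailing extent. Stripped down to a sketch with hypothetical inputs:

static int discard_free_extents(struct block_device *bdev,
                                const sector_t *free, unsigned int n)
{
        sector_t start = 0, nr_sects = 0;
        unsigned int i;
        int rv;

        for (i = 0; i < n; i++) {
                if (nr_sects && free[i] != start + nr_sects) {
                        rv = blkdev_issue_discard(bdev, start, nr_sects,
                                                  GFP_NOFS, DISCARD_FL_BARRIER);
                        if (rv)
                                return rv;
                        nr_sects = 0;
                }
                if (nr_sects == 0)
                        start = free[i];
                nr_sects++;
        }
        if (nr_sects)
                return blkdev_issue_discard(bdev, start, nr_sects,
                                            GFP_NOFS, DISCARD_FL_BARRIER);
        return 0;
}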
@@ -961,7 +955,8 @@ static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al) | |||
961 | * Returns: The inode, if one has been found | 955 | * Returns: The inode, if one has been found |
962 | */ | 956 | */ |
963 | 957 | ||
964 | static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked) | 958 | static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, |
959 | u64 skip) | ||
965 | { | 960 | { |
966 | struct inode *inode; | 961 | struct inode *inode; |
967 | u32 goal = 0, block; | 962 | u32 goal = 0, block; |
@@ -985,6 +980,8 @@ static struct inode *try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked) | |||
985 | goal++; | 980 | goal++; |
986 | if (*last_unlinked != NO_BLOCK && no_addr <= *last_unlinked) | 981 | if (*last_unlinked != NO_BLOCK && no_addr <= *last_unlinked) |
987 | continue; | 982 | continue; |
983 | if (no_addr == skip) | ||
984 | continue; | ||
988 | *last_unlinked = no_addr; | 985 | *last_unlinked = no_addr; |
989 | inode = gfs2_inode_lookup(rgd->rd_sbd->sd_vfs, DT_UNKNOWN, | 986 | inode = gfs2_inode_lookup(rgd->rd_sbd->sd_vfs, DT_UNKNOWN, |
990 | no_addr, -1, 1); | 987 | no_addr, -1, 1); |
@@ -1104,7 +1101,7 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) | |||
1104 | if (try_rgrp_fit(rgd, al)) | 1101 | if (try_rgrp_fit(rgd, al)) |
1105 | goto out; | 1102 | goto out; |
1106 | if (rgd->rd_flags & GFS2_RDF_CHECK) | 1103 | if (rgd->rd_flags & GFS2_RDF_CHECK) |
1107 | inode = try_rgrp_unlink(rgd, last_unlinked); | 1104 | inode = try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr); |
1108 | if (!rg_locked) | 1105 | if (!rg_locked) |
1109 | gfs2_glock_dq_uninit(&al->al_rgd_gh); | 1106 | gfs2_glock_dq_uninit(&al->al_rgd_gh); |
1110 | if (inode) | 1107 | if (inode) |
@@ -1138,7 +1135,7 @@ static struct inode *get_local_rgrp(struct gfs2_inode *ip, u64 *last_unlinked) | |||
1138 | if (try_rgrp_fit(rgd, al)) | 1135 | if (try_rgrp_fit(rgd, al)) |
1139 | goto out; | 1136 | goto out; |
1140 | if (rgd->rd_flags & GFS2_RDF_CHECK) | 1137 | if (rgd->rd_flags & GFS2_RDF_CHECK) |
1141 | inode = try_rgrp_unlink(rgd, last_unlinked); | 1138 | inode = try_rgrp_unlink(rgd, last_unlinked, ip->i_no_addr); |
1142 | if (!rg_locked) | 1139 | if (!rg_locked) |
1143 | gfs2_glock_dq_uninit(&al->al_rgd_gh); | 1140 | gfs2_glock_dq_uninit(&al->al_rgd_gh); |
1144 | if (inode) | 1141 | if (inode) |
@@ -1261,7 +1258,7 @@ void gfs2_inplace_release(struct gfs2_inode *ip) | |||
1261 | * Returns: The block type (GFS2_BLKST_*) | 1258 | * Returns: The block type (GFS2_BLKST_*) |
1262 | */ | 1259 | */ |
1263 | 1260 | ||
1264 | unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block) | 1261 | static unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block) |
1265 | { | 1262 | { |
1266 | struct gfs2_bitmap *bi = NULL; | 1263 | struct gfs2_bitmap *bi = NULL; |
1267 | u32 length, rgrp_block, buf_block; | 1264 | u32 length, rgrp_block, buf_block; |
@@ -1464,6 +1461,16 @@ int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl) | |||
1464 | return 0; | 1461 | return 0; |
1465 | } | 1462 | } |
1466 | 1463 | ||
1464 | static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd) | ||
1465 | { | ||
1466 | struct gfs2_sbd *sdp = rgd->rd_sbd; | ||
1467 | fs_warn(sdp, "rgrp %llu has an error, marking it readonly until umount\n", | ||
1468 | (unsigned long long)rgd->rd_addr); | ||
1469 | fs_warn(sdp, "umount on all nodes and run fsck.gfs2 to fix the error\n"); | ||
1470 | gfs2_rgrp_dump(NULL, rgd->rd_gl); | ||
1471 | rgd->rd_flags |= GFS2_RDF_ERROR; | ||
1472 | } | ||
1473 | |||
1467 | /** | 1474 | /** |
1468 | * gfs2_alloc_block - Allocate one or more blocks | 1475 | * gfs2_alloc_block - Allocate one or more blocks |
1469 | * @ip: the inode to allocate the block for | 1476 | * @ip: the inode to allocate the block for |
@@ -1525,22 +1532,20 @@ int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n) | |||
1525 | return 0; | 1532 | return 0; |
1526 | 1533 | ||
1527 | rgrp_error: | 1534 | rgrp_error: |
1528 | fs_warn(sdp, "rgrp %llu has an error, marking it readonly until umount\n", | 1535 | gfs2_rgrp_error(rgd); |
1529 | (unsigned long long)rgd->rd_addr); | ||
1530 | fs_warn(sdp, "umount on all nodes and run fsck.gfs2 to fix the error\n"); | ||
1531 | gfs2_rgrp_dump(NULL, rgd->rd_gl); | ||
1532 | rgd->rd_flags |= GFS2_RDF_ERROR; | ||
1533 | return -EIO; | 1536 | return -EIO; |
1534 | } | 1537 | } |
1535 | 1538 | ||
1536 | /** | 1539 | /** |
1537 | * gfs2_alloc_di - Allocate a dinode | 1540 | * gfs2_alloc_di - Allocate a dinode |
1538 | * @dip: the directory that the inode is going in | 1541 | * @dip: the directory that the inode is going in |
1542 | * @bn: the block number which is allocated | ||
1543 | * @generation: the generation number of the inode | ||
1539 | * | 1544 | * |
1540 | * Returns: the block allocated | 1545 | * Returns: 0 on success or error |
1541 | */ | 1546 | */ |
1542 | 1547 | ||
1543 | u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation) | 1548 | int gfs2_alloc_di(struct gfs2_inode *dip, u64 *bn, u64 *generation) |
1544 | { | 1549 | { |
1545 | struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); | 1550 | struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); |
1546 | struct gfs2_alloc *al = dip->i_alloc; | 1551 | struct gfs2_alloc *al = dip->i_alloc; |
@@ -1551,16 +1556,21 @@ u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation) | |||
1551 | 1556 | ||
1552 | blk = rgblk_search(rgd, rgd->rd_last_alloc, | 1557 | blk = rgblk_search(rgd, rgd->rd_last_alloc, |
1553 | GFS2_BLKST_FREE, GFS2_BLKST_DINODE, &n); | 1558 | GFS2_BLKST_FREE, GFS2_BLKST_DINODE, &n); |
1554 | BUG_ON(blk == BFITNOENT); | ||
1555 | 1559 | ||
1556 | rgd->rd_last_alloc = blk; | 1560 | /* Since all blocks are reserved in advance, this shouldn't happen */ |
1561 | if (blk == BFITNOENT) | ||
1562 | goto rgrp_error; | ||
1557 | 1563 | ||
1564 | rgd->rd_last_alloc = blk; | ||
1558 | block = rgd->rd_data0 + blk; | 1565 | block = rgd->rd_data0 + blk; |
1566 | if (rgd->rd_free == 0) | ||
1567 | goto rgrp_error; | ||
1559 | 1568 | ||
1560 | gfs2_assert_withdraw(sdp, rgd->rd_free); | ||
1561 | rgd->rd_free--; | 1569 | rgd->rd_free--; |
1562 | rgd->rd_dinodes++; | 1570 | rgd->rd_dinodes++; |
1563 | *generation = rgd->rd_igeneration++; | 1571 | *generation = rgd->rd_igeneration++; |
1572 | if (*generation == 0) | ||
1573 | *generation = rgd->rd_igeneration++; | ||
1564 | gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); | 1574 | gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); |
1565 | gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); | 1575 | gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); |
1566 | 1576 | ||
@@ -1573,7 +1583,12 @@ u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation) | |||
1573 | rgd->rd_free_clone--; | 1583 | rgd->rd_free_clone--; |
1574 | spin_unlock(&sdp->sd_rindex_spin); | 1584 | spin_unlock(&sdp->sd_rindex_spin); |
1575 | trace_gfs2_block_alloc(dip, block, 1, GFS2_BLKST_DINODE); | 1585 | trace_gfs2_block_alloc(dip, block, 1, GFS2_BLKST_DINODE); |
1576 | return block; | 1586 | *bn = block; |
1587 | return 0; | ||
1588 | |||
1589 | rgrp_error: | ||
1590 | gfs2_rgrp_error(rgd); | ||
1591 | return -EIO; | ||
1577 | } | 1592 | } |
1578 | 1593 | ||
1579 | /** | 1594 | /** |
@@ -1681,6 +1696,46 @@ void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip) | |||
1681 | } | 1696 | } |
1682 | 1697 | ||
1683 | /** | 1698 | /** |
1699 | * gfs2_check_blk_type - Check the type of a block | ||
1700 | * @sdp: The superblock | ||
1701 | * @no_addr: The block number to check | ||
1702 | * @type: The block type we are looking for | ||
1703 | * | ||
1704 | * Returns: 0 if the block type matches the expected type | ||
1705 | * -ESTALE if it doesn't match | ||
1706 | * or -ve errno if something went wrong while checking | ||
1707 | */ | ||
1708 | |||
1709 | int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, unsigned int type) | ||
1710 | { | ||
1711 | struct gfs2_rgrpd *rgd; | ||
1712 | struct gfs2_holder ri_gh, rgd_gh; | ||
1713 | int error; | ||
1714 | |||
1715 | error = gfs2_rindex_hold(sdp, &ri_gh); | ||
1716 | if (error) | ||
1717 | goto fail; | ||
1718 | |||
1719 | error = -EINVAL; | ||
1720 | rgd = gfs2_blk2rgrpd(sdp, no_addr); | ||
1721 | if (!rgd) | ||
1722 | goto fail_rindex; | ||
1723 | |||
1724 | error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_SHARED, 0, &rgd_gh); | ||
1725 | if (error) | ||
1726 | goto fail_rindex; | ||
1727 | |||
1728 | if (gfs2_get_block_type(rgd, no_addr) != type) | ||
1729 | error = -ESTALE; | ||
1730 | |||
1731 | gfs2_glock_dq_uninit(&rgd_gh); | ||
1732 | fail_rindex: | ||
1733 | gfs2_glock_dq_uninit(&ri_gh); | ||
1734 | fail: | ||
1735 | return error; | ||
1736 | } | ||
1737 | |||
1738 | /** | ||
1684 | * gfs2_rlist_add - add a RG to a list of RGs | 1739 | * gfs2_rlist_add - add a RG to a list of RGs |
1685 | * @sdp: the filesystem | 1740 | * @sdp: the filesystem |
1686 | * @rlist: the list of resource groups | 1741 | * @rlist: the list of resource groups |
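
gfs2_check_blk_type() above exists for the delete path in super.c (later in this diff): before deallocating a supposedly unlinked inode, the node re-reads the bitmap under a shared rgrp glock, and -ESTALE signals that another node already reclaimed the block. A sketch of the caller's decision, wrapped in a hypothetical predicate:

/* hypothetical wrapper around the check gfs2_delete_inode() performs */
static bool gfs2_still_unlinked(struct gfs2_sbd *sdp, struct gfs2_inode *ip)
{
        int error = gfs2_check_blk_type(sdp, ip->i_no_addr,
                                        GFS2_BLKST_UNLINKED);
        /* -ESTALE: reclaimed elsewhere; other errors: err on the safe side */
        return error == 0;
}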
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index 1e76ff0f3e00..b4106ddaaa98 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h | |||
@@ -44,15 +44,15 @@ gfs2_inplace_reserve_i((ip), __FILE__, __LINE__) | |||
44 | 44 | ||
45 | extern void gfs2_inplace_release(struct gfs2_inode *ip); | 45 | extern void gfs2_inplace_release(struct gfs2_inode *ip); |
46 | 46 | ||
47 | extern unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block); | ||
48 | |||
49 | extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n); | 47 | extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n); |
50 | extern u64 gfs2_alloc_di(struct gfs2_inode *ip, u64 *generation); | 48 | extern int gfs2_alloc_di(struct gfs2_inode *ip, u64 *bn, u64 *generation); |
51 | 49 | ||
52 | extern void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen); | 50 | extern void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen); |
53 | extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen); | 51 | extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen); |
54 | extern void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip); | 52 | extern void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip); |
55 | extern void gfs2_unlink_di(struct inode *inode); | 53 | extern void gfs2_unlink_di(struct inode *inode); |
54 | extern int gfs2_check_blk_type(struct gfs2_sbd *sdp, u64 no_addr, | ||
55 | unsigned int type); | ||
56 | 56 | ||
57 | struct gfs2_rgrp_list { | 57 | struct gfs2_rgrp_list { |
58 | unsigned int rl_rgrps; | 58 | unsigned int rl_rgrps; |
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 0a6801336470..0ec3ec672de1 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c | |||
@@ -38,7 +38,7 @@ | |||
38 | #include "trans.h" | 38 | #include "trans.h" |
39 | #include "util.h" | 39 | #include "util.h" |
40 | #include "sys.h" | 40 | #include "sys.h" |
41 | #include "eattr.h" | 41 | #include "xattr.h" |
42 | 42 | ||
43 | #define args_neq(a1, a2, x) ((a1)->ar_##x != (a2)->ar_##x) | 43 | #define args_neq(a1, a2, x) ((a1)->ar_##x != (a2)->ar_##x) |
44 | 44 | ||
@@ -68,6 +68,8 @@ enum { | |||
68 | Opt_discard, | 68 | Opt_discard, |
69 | Opt_nodiscard, | 69 | Opt_nodiscard, |
70 | Opt_commit, | 70 | Opt_commit, |
71 | Opt_err_withdraw, | ||
72 | Opt_err_panic, | ||
71 | Opt_error, | 73 | Opt_error, |
72 | }; | 74 | }; |
73 | 75 | ||
@@ -97,6 +99,8 @@ static const match_table_t tokens = { | |||
97 | {Opt_discard, "discard"}, | 99 | {Opt_discard, "discard"}, |
98 | {Opt_nodiscard, "nodiscard"}, | 100 | {Opt_nodiscard, "nodiscard"}, |
99 | {Opt_commit, "commit=%d"}, | 101 | {Opt_commit, "commit=%d"}, |
102 | {Opt_err_withdraw, "errors=withdraw"}, | ||
103 | {Opt_err_panic, "errors=panic"}, | ||
100 | {Opt_error, NULL} | 104 | {Opt_error, NULL} |
101 | }; | 105 | }; |
102 | 106 | ||
@@ -152,6 +156,11 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options) | |||
152 | args->ar_localcaching = 1; | 156 | args->ar_localcaching = 1; |
153 | break; | 157 | break; |
154 | case Opt_debug: | 158 | case Opt_debug: |
159 | if (args->ar_errors == GFS2_ERRORS_PANIC) { | ||
160 | fs_info(sdp, "-o debug and -o errors=panic " | ||
161 | "are mutually exclusive.\n"); | ||
162 | return -EINVAL; | ||
163 | } | ||
155 | args->ar_debug = 1; | 164 | args->ar_debug = 1; |
156 | break; | 165 | break; |
157 | case Opt_nodebug: | 166 | case Opt_nodebug: |
@@ -205,6 +214,17 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options) | |||
205 | return rv ? rv : -EINVAL; | 214 | return rv ? rv : -EINVAL; |
206 | } | 215 | } |
207 | break; | 216 | break; |
217 | case Opt_err_withdraw: | ||
218 | args->ar_errors = GFS2_ERRORS_WITHDRAW; | ||
219 | break; | ||
220 | case Opt_err_panic: | ||
221 | if (args->ar_debug) { | ||
222 | fs_info(sdp, "-o debug and -o errors=panic " | ||
223 | "are mutually exclusive.\n"); | ||
224 | return -EINVAL; | ||
225 | } | ||
226 | args->ar_errors = GFS2_ERRORS_PANIC; | ||
227 | break; | ||
208 | case Opt_error: | 228 | case Opt_error: |
209 | default: | 229 | default: |
210 | fs_info(sdp, "invalid mount option: %s\n", o); | 230 | fs_info(sdp, "invalid mount option: %s\n", o); |
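
Because mount options parse left to right, the debug/errors=panic conflict has to be rejected from whichever option arrives second -- hence the mirrored checks in Opt_debug and Opt_err_panic above. A single post-parse validator would state the rule once; a sketch of that alternative (not what the patch does):

static int gfs2_validate_args(struct gfs2_sbd *sdp,
                              const struct gfs2_args *args)
{
        if (args->ar_debug && args->ar_errors == GFS2_ERRORS_PANIC) {
                fs_info(sdp, "-o debug and -o errors=panic "
                        "are mutually exclusive.\n");
                return -EINVAL;
        }
        return 0;
}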
@@ -353,7 +373,7 @@ fail: | |||
353 | return error; | 373 | return error; |
354 | } | 374 | } |
355 | 375 | ||
356 | static void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc, const void *buf) | 376 | void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc, const void *buf) |
357 | { | 377 | { |
358 | const struct gfs2_statfs_change *str = buf; | 378 | const struct gfs2_statfs_change *str = buf; |
359 | 379 | ||
@@ -441,6 +461,29 @@ void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free, | |||
441 | brelse(l_bh); | 461 | brelse(l_bh); |
442 | } | 462 | } |
443 | 463 | ||
464 | void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh, | ||
465 | struct buffer_head *l_bh) | ||
466 | { | ||
467 | struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); | ||
468 | struct gfs2_inode *l_ip = GFS2_I(sdp->sd_sc_inode); | ||
469 | struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; | ||
470 | struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; | ||
471 | |||
472 | gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1); | ||
473 | |||
474 | spin_lock(&sdp->sd_statfs_spin); | ||
475 | m_sc->sc_total += l_sc->sc_total; | ||
476 | m_sc->sc_free += l_sc->sc_free; | ||
477 | m_sc->sc_dinodes += l_sc->sc_dinodes; | ||
478 | memset(l_sc, 0, sizeof(struct gfs2_statfs_change)); | ||
479 | memset(l_bh->b_data + sizeof(struct gfs2_dinode), | ||
480 | 0, sizeof(struct gfs2_statfs_change)); | ||
481 | spin_unlock(&sdp->sd_statfs_spin); | ||
482 | |||
483 | gfs2_trans_add_bh(m_ip->i_gl, m_bh, 1); | ||
484 | gfs2_statfs_change_out(m_sc, m_bh->b_data + sizeof(struct gfs2_dinode)); | ||
485 | } | ||
486 | |||
444 | int gfs2_statfs_sync(struct gfs2_sbd *sdp) | 487 | int gfs2_statfs_sync(struct gfs2_sbd *sdp) |
445 | { | 488 | { |
446 | struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); | 489 | struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); |
@@ -477,19 +520,7 @@ int gfs2_statfs_sync(struct gfs2_sbd *sdp) | |||
477 | if (error) | 520 | if (error) |
478 | goto out_bh2; | 521 | goto out_bh2; |
479 | 522 | ||
480 | gfs2_trans_add_bh(l_ip->i_gl, l_bh, 1); | 523 | update_statfs(sdp, m_bh, l_bh); |
481 | |||
482 | spin_lock(&sdp->sd_statfs_spin); | ||
483 | m_sc->sc_total += l_sc->sc_total; | ||
484 | m_sc->sc_free += l_sc->sc_free; | ||
485 | m_sc->sc_dinodes += l_sc->sc_dinodes; | ||
486 | memset(l_sc, 0, sizeof(struct gfs2_statfs_change)); | ||
487 | memset(l_bh->b_data + sizeof(struct gfs2_dinode), | ||
488 | 0, sizeof(struct gfs2_statfs_change)); | ||
489 | spin_unlock(&sdp->sd_statfs_spin); | ||
490 | |||
491 | gfs2_trans_add_bh(m_ip->i_gl, m_bh, 1); | ||
492 | gfs2_statfs_change_out(m_sc, m_bh->b_data + sizeof(struct gfs2_dinode)); | ||
493 | 524 | ||
494 | gfs2_trans_end(sdp); | 525 | gfs2_trans_end(sdp); |
495 | 526 | ||
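
update_statfs(), split out above so that journal recovery can reuse it, performs the whole local-to-master fold under sd_statfs_spin: add the per-node deltas into the master counters, zero both the in-core local copy and its on-disk image, then log the master inode buffer. The core of the fold as a sketch (sizeof(*l_sc) is used for the in-core clear here, where the patch zeroes the on-disk-sized region):

static void fold_local_statfs(struct gfs2_sbd *sdp,
                              struct gfs2_statfs_change_host *m_sc,
                              struct gfs2_statfs_change_host *l_sc,
                              struct buffer_head *l_bh)
{
        spin_lock(&sdp->sd_statfs_spin);
        m_sc->sc_total   += l_sc->sc_total;
        m_sc->sc_free    += l_sc->sc_free;
        m_sc->sc_dinodes += l_sc->sc_dinodes;
        memset(l_sc, 0, sizeof(*l_sc));                 /* in-core copy */
        memset(l_bh->b_data + sizeof(struct gfs2_dinode), 0,
               sizeof(struct gfs2_statfs_change));      /* on-disk copy */
        spin_unlock(&sdp->sd_statfs_spin);
}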
@@ -680,6 +711,7 @@ static int gfs2_make_fs_ro(struct gfs2_sbd *sdp) | |||
680 | struct gfs2_holder t_gh; | 711 | struct gfs2_holder t_gh; |
681 | int error; | 712 | int error; |
682 | 713 | ||
714 | flush_workqueue(gfs2_delete_workqueue); | ||
683 | gfs2_quota_sync(sdp); | 715 | gfs2_quota_sync(sdp); |
684 | gfs2_statfs_sync(sdp); | 716 | gfs2_statfs_sync(sdp); |
685 | 717 | ||
@@ -756,7 +788,6 @@ restart: | |||
756 | /* Release stuff */ | 788 | /* Release stuff */ |
757 | 789 | ||
758 | iput(sdp->sd_jindex); | 790 | iput(sdp->sd_jindex); |
759 | iput(sdp->sd_inum_inode); | ||
760 | iput(sdp->sd_statfs_inode); | 791 | iput(sdp->sd_statfs_inode); |
761 | iput(sdp->sd_rindex); | 792 | iput(sdp->sd_rindex); |
762 | iput(sdp->sd_quota_inode); | 793 | iput(sdp->sd_quota_inode); |
@@ -767,10 +798,8 @@ restart: | |||
767 | if (!sdp->sd_args.ar_spectator) { | 798 | if (!sdp->sd_args.ar_spectator) { |
768 | gfs2_glock_dq_uninit(&sdp->sd_journal_gh); | 799 | gfs2_glock_dq_uninit(&sdp->sd_journal_gh); |
769 | gfs2_glock_dq_uninit(&sdp->sd_jinode_gh); | 800 | gfs2_glock_dq_uninit(&sdp->sd_jinode_gh); |
770 | gfs2_glock_dq_uninit(&sdp->sd_ir_gh); | ||
771 | gfs2_glock_dq_uninit(&sdp->sd_sc_gh); | 801 | gfs2_glock_dq_uninit(&sdp->sd_sc_gh); |
772 | gfs2_glock_dq_uninit(&sdp->sd_qc_gh); | 802 | gfs2_glock_dq_uninit(&sdp->sd_qc_gh); |
773 | iput(sdp->sd_ir_inode); | ||
774 | iput(sdp->sd_sc_inode); | 803 | iput(sdp->sd_sc_inode); |
775 | iput(sdp->sd_qc_inode); | 804 | iput(sdp->sd_qc_inode); |
776 | } | 805 | } |
@@ -1072,6 +1101,7 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data) | |||
1072 | gt->gt_log_flush_secs = args.ar_commit; | 1101 | gt->gt_log_flush_secs = args.ar_commit; |
1073 | spin_unlock(&gt->gt_spin); | 1102 | 
1074 | 1103 | ||
1104 | gfs2_online_uevent(sdp); | ||
1075 | return 0; | 1105 | return 0; |
1076 | } | 1106 | } |
1077 | 1107 | ||
@@ -1213,6 +1243,22 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt) | |||
1213 | lfsecs = sdp->sd_tune.gt_log_flush_secs; | 1243 | lfsecs = sdp->sd_tune.gt_log_flush_secs; |
1214 | if (lfsecs != 60) | 1244 | if (lfsecs != 60) |
1215 | seq_printf(s, ",commit=%d", lfsecs); | 1245 | seq_printf(s, ",commit=%d", lfsecs); |
1246 | if (args->ar_errors != GFS2_ERRORS_DEFAULT) { | ||
1247 | const char *state; | ||
1248 | |||
1249 | switch (args->ar_errors) { | ||
1250 | case GFS2_ERRORS_WITHDRAW: | ||
1251 | state = "withdraw"; | ||
1252 | break; | ||
1253 | case GFS2_ERRORS_PANIC: | ||
1254 | state = "panic"; | ||
1255 | break; | ||
1256 | default: | ||
1257 | state = "unknown"; | ||
1258 | break; | ||
1259 | } | ||
1260 | seq_printf(s, ",errors=%s", state); | ||
1261 | } | ||
1216 | return 0; | 1262 | return 0; |
1217 | } | 1263 | } |
1218 | 1264 | ||
@@ -1240,6 +1286,10 @@ static void gfs2_delete_inode(struct inode *inode) | |||
1240 | goto out; | 1286 | goto out; |
1241 | } | 1287 | } |
1242 | 1288 | ||
1289 | error = gfs2_check_blk_type(sdp, ip->i_no_addr, GFS2_BLKST_UNLINKED); | ||
1290 | if (error) | ||
1291 | goto out_truncate; | ||
1292 | |||
1243 | gfs2_glock_dq_wait(&ip->i_iopen_gh); | 1293 | gfs2_glock_dq_wait(&ip->i_iopen_gh); |
1244 | gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh); | 1294 | gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh); |
1245 | error = gfs2_glock_nq(&ip->i_iopen_gh); | 1295 | error = gfs2_glock_nq(&ip->i_iopen_gh); |
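
The gfs2_show_options() hunk above follows the file's existing convention: print an option only when it differs from the default, so the /proc/mounts line stays minimal and round-trips as a remount string. Reduced to a sketch:

static void show_errors_opt(struct seq_file *s, unsigned int errors)
{
        if (errors == GFS2_ERRORS_DEFAULT)
                return;                 /* default is implied */
        seq_printf(s, ",errors=%s",
                   errors == GFS2_ERRORS_PANIC ? "panic" : "withdraw");
}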
diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h index b56413e3e40d..235db3682885 100644 --- a/fs/gfs2/super.h +++ b/fs/gfs2/super.h | |||
@@ -25,7 +25,7 @@ static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp) | |||
25 | return x; | 25 | return x; |
26 | } | 26 | } |
27 | 27 | ||
28 | void gfs2_jindex_free(struct gfs2_sbd *sdp); | 28 | extern void gfs2_jindex_free(struct gfs2_sbd *sdp); |
29 | 29 | ||
30 | extern int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *data); | 30 | extern int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *data); |
31 | 31 | ||
@@ -36,10 +36,14 @@ extern int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename, | |||
36 | struct gfs2_inode **ipp); | 36 | struct gfs2_inode **ipp); |
37 | 37 | ||
38 | extern int gfs2_make_fs_rw(struct gfs2_sbd *sdp); | 38 | extern int gfs2_make_fs_rw(struct gfs2_sbd *sdp); |
39 | 39 | extern void gfs2_online_uevent(struct gfs2_sbd *sdp); | |
40 | extern int gfs2_statfs_init(struct gfs2_sbd *sdp); | 40 | extern int gfs2_statfs_init(struct gfs2_sbd *sdp); |
41 | extern void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free, | 41 | extern void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free, |
42 | s64 dinodes); | 42 | s64 dinodes); |
43 | extern void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc, | ||
44 | const void *buf); | ||
45 | extern void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh, | ||
46 | struct buffer_head *l_bh); | ||
43 | extern int gfs2_statfs_sync(struct gfs2_sbd *sdp); | 47 | extern int gfs2_statfs_sync(struct gfs2_sbd *sdp); |
44 | 48 | ||
45 | extern int gfs2_freeze_fs(struct gfs2_sbd *sdp); | 49 | extern int gfs2_freeze_fs(struct gfs2_sbd *sdp); |
@@ -50,6 +54,7 @@ extern struct file_system_type gfs2meta_fs_type; | |||
50 | extern const struct export_operations gfs2_export_ops; | 54 | extern const struct export_operations gfs2_export_ops; |
51 | extern const struct super_operations gfs2_super_ops; | 55 | extern const struct super_operations gfs2_super_ops; |
52 | extern const struct dentry_operations gfs2_dops; | 56 | extern const struct dentry_operations gfs2_dops; |
57 | extern struct xattr_handler *gfs2_xattr_handlers[]; | ||
53 | 58 | ||
54 | #endif /* __SUPER_DOT_H__ */ | 59 | #endif /* __SUPER_DOT_H__ */ |
55 | 60 | ||
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index 23419dc3027b..446329728d52 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c | |||
@@ -16,6 +16,7 @@ | |||
16 | #include <linux/kobject.h> | 16 | #include <linux/kobject.h> |
17 | #include <asm/uaccess.h> | 17 | #include <asm/uaccess.h> |
18 | #include <linux/gfs2_ondisk.h> | 18 | #include <linux/gfs2_ondisk.h> |
19 | #include <linux/genhd.h> | ||
19 | 20 | ||
20 | #include "gfs2.h" | 21 | #include "gfs2.h" |
21 | #include "incore.h" | 22 | #include "incore.h" |
@@ -319,12 +320,6 @@ static ssize_t block_store(struct gfs2_sbd *sdp, const char *buf, size_t len) | |||
319 | return ret; | 320 | return ret; |
320 | } | 321 | } |
321 | 322 | ||
322 | static ssize_t lkid_show(struct gfs2_sbd *sdp, char *buf) | ||
323 | { | ||
324 | struct lm_lockstruct *ls = &sdp->sd_lockstruct; | ||
325 | return sprintf(buf, "%u\n", ls->ls_id); | ||
326 | } | ||
327 | |||
328 | static ssize_t lkfirst_show(struct gfs2_sbd *sdp, char *buf) | 323 | static ssize_t lkfirst_show(struct gfs2_sbd *sdp, char *buf) |
329 | { | 324 | { |
330 | struct lm_lockstruct *ls = &sdp->sd_lockstruct; | 325 | struct lm_lockstruct *ls = &sdp->sd_lockstruct; |
@@ -386,22 +381,20 @@ static ssize_t jid_show(struct gfs2_sbd *sdp, char *buf) | |||
386 | #define GDLM_ATTR(_name,_mode,_show,_store) \ | 381 | #define GDLM_ATTR(_name,_mode,_show,_store) \ |
387 | static struct gfs2_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store) | 382 | static struct gfs2_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store) |
388 | 383 | ||
389 | GDLM_ATTR(proto_name, 0444, proto_name_show, NULL); | 384 | GDLM_ATTR(proto_name, 0444, proto_name_show, NULL); |
390 | GDLM_ATTR(block, 0644, block_show, block_store); | 385 | GDLM_ATTR(block, 0644, block_show, block_store); |
391 | GDLM_ATTR(withdraw, 0644, withdraw_show, withdraw_store); | 386 | GDLM_ATTR(withdraw, 0644, withdraw_show, withdraw_store); |
392 | GDLM_ATTR(id, 0444, lkid_show, NULL); | 387 | GDLM_ATTR(jid, 0444, jid_show, NULL); |
393 | GDLM_ATTR(jid, 0444, jid_show, NULL); | 388 | GDLM_ATTR(first, 0444, lkfirst_show, NULL); |
394 | GDLM_ATTR(first, 0444, lkfirst_show, NULL); | 389 | GDLM_ATTR(first_done, 0444, first_done_show, NULL); |
395 | GDLM_ATTR(first_done, 0444, first_done_show, NULL); | 390 | GDLM_ATTR(recover, 0600, NULL, recover_store); |
396 | GDLM_ATTR(recover, 0200, NULL, recover_store); | 391 | GDLM_ATTR(recover_done, 0444, recover_done_show, NULL); |
397 | GDLM_ATTR(recover_done, 0444, recover_done_show, NULL); | 392 | GDLM_ATTR(recover_status, 0444, recover_status_show, NULL); |
398 | GDLM_ATTR(recover_status, 0444, recover_status_show, NULL); | ||
399 | 393 | ||
400 | static struct attribute *lock_module_attrs[] = { | 394 | static struct attribute *lock_module_attrs[] = { |
401 | &gdlm_attr_proto_name.attr, | 395 | &gdlm_attr_proto_name.attr, |
402 | &gdlm_attr_block.attr, | 396 | &gdlm_attr_block.attr, |
403 | &gdlm_attr_withdraw.attr, | 397 | &gdlm_attr_withdraw.attr, |
404 | &gdlm_attr_id.attr, | ||
405 | &gdlm_attr_jid.attr, | 398 | &gdlm_attr_jid.attr, |
406 | &gdlm_attr_first.attr, | 399 | &gdlm_attr_first.attr, |
407 | &gdlm_attr_first_done.attr, | 400 | &gdlm_attr_first_done.attr, |
@@ -519,7 +512,14 @@ static struct attribute_group lock_module_group = { | |||
519 | 512 | ||
520 | int gfs2_sys_fs_add(struct gfs2_sbd *sdp) | 513 | int gfs2_sys_fs_add(struct gfs2_sbd *sdp) |
521 | { | 514 | { |
515 | struct super_block *sb = sdp->sd_vfs; | ||
522 | int error; | 516 | int error; |
517 | char ro[20]; | ||
518 | char spectator[20]; | ||
519 | char *envp[] = { ro, spectator, NULL }; | ||
520 | |||
521 | sprintf(ro, "RDONLY=%d", (sb->s_flags & MS_RDONLY) ? 1 : 0); | ||
522 | sprintf(spectator, "SPECTATOR=%d", sdp->sd_args.ar_spectator ? 1 : 0); | ||
523 | 523 | ||
524 | sdp->sd_kobj.kset = gfs2_kset; | 524 | sdp->sd_kobj.kset = gfs2_kset; |
525 | error = kobject_init_and_add(&sdp->sd_kobj, &gfs2_ktype, NULL, | 525 | error = kobject_init_and_add(&sdp->sd_kobj, &gfs2_ktype, NULL, |
@@ -535,9 +535,17 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp) | |||
535 | if (error) | 535 | if (error) |
536 | goto fail_tune; | 536 | goto fail_tune; |
537 | 537 | ||
538 | kobject_uevent(&sdp->sd_kobj, KOBJ_ADD); | 538 | error = sysfs_create_link(&sdp->sd_kobj, |
539 | &disk_to_dev(sb->s_bdev->bd_disk)->kobj, | ||
540 | "device"); | ||
541 | if (error) | ||
542 | goto fail_lock_module; | ||
543 | |||
544 | kobject_uevent_env(&sdp->sd_kobj, KOBJ_ADD, envp); | ||
539 | return 0; | 545 | return 0; |
540 | 546 | ||
547 | fail_lock_module: | ||
548 | sysfs_remove_group(&sdp->sd_kobj, &lock_module_group); | ||
541 | fail_tune: | 549 | fail_tune: |
542 | sysfs_remove_group(&sdp->sd_kobj, &tune_group); | 550 | sysfs_remove_group(&sdp->sd_kobj, &tune_group); |
543 | fail_reg: | 551 | fail_reg: |
@@ -549,12 +557,12 @@ fail: | |||
549 | 557 | ||
550 | void gfs2_sys_fs_del(struct gfs2_sbd *sdp) | 558 | void gfs2_sys_fs_del(struct gfs2_sbd *sdp) |
551 | { | 559 | { |
560 | sysfs_remove_link(&sdp->sd_kobj, "device"); | ||
552 | sysfs_remove_group(&sdp->sd_kobj, &tune_group); | 561 | sysfs_remove_group(&sdp->sd_kobj, &tune_group); |
553 | sysfs_remove_group(&sdp->sd_kobj, &lock_module_group); | 562 | sysfs_remove_group(&sdp->sd_kobj, &lock_module_group); |
554 | kobject_put(&sdp->sd_kobj); | 563 | kobject_put(&sdp->sd_kobj); |
555 | } | 564 | } |
556 | 565 | ||
557 | |||
558 | static int gfs2_uevent(struct kset *kset, struct kobject *kobj, | 566 | static int gfs2_uevent(struct kset *kset, struct kobject *kobj, |
559 | struct kobj_uevent_env *env) | 567 | struct kobj_uevent_env *env) |
560 | { | 568 | { |
@@ -563,6 +571,8 @@ static int gfs2_uevent(struct kset *kset, struct kobject *kobj, | |||
563 | 571 | ||
564 | add_uevent_var(env, "LOCKTABLE=%s", sdp->sd_table_name); | 572 | add_uevent_var(env, "LOCKTABLE=%s", sdp->sd_table_name); |
565 | add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name); | 573 | add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name); |
574 | if (!sdp->sd_args.ar_spectator) | ||
575 | add_uevent_var(env, "JOURNALID=%u", sdp->sd_lockstruct.ls_jid); | ||
566 | if (gfs2_uuid_valid(uuid)) { | 576 | if (gfs2_uuid_valid(uuid)) { |
567 | add_uevent_var(env, "UUID=%02X%02X%02X%02X-%02X%02X-%02X%02X-" | 577 | add_uevent_var(env, "UUID=%02X%02X%02X%02X-%02X%02X-%02X%02X-" |
568 | "%02X%02X-%02X%02X%02X%02X%02X%02X", | 578 | "%02X%02X-%02X%02X%02X%02X%02X%02X", |
@@ -578,7 +588,6 @@ static struct kset_uevent_ops gfs2_uevent_ops = { | |||
578 | .uevent = gfs2_uevent, | 588 | .uevent = gfs2_uevent, |
579 | }; | 589 | }; |
580 | 590 | ||
581 | |||
582 | int gfs2_sys_init(void) | 591 | int gfs2_sys_init(void) |
583 | { | 592 | { |
584 | gfs2_kset = kset_create_and_add("gfs2", &gfs2_uevent_ops, fs_kobj); | 593 | gfs2_kset = kset_create_and_add("gfs2", &gfs2_uevent_ops, fs_kobj); |
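The gfs2_sys_fs_add() hunk above replaces the plain kobject_uevent() call with kobject_uevent_env(), so the ADD event now carries RDONLY= and SPECTATOR= variables that udev rules can match on. A minimal userspace sketch of the same env-string construction follows; the buffer sizes and variable names come from the patch, everything else is illustrative:

#include <stdio.h>

int main(void)
{
	int rdonly = 1, spectator = 0;
	char ro[20];		/* "RDONLY=" + digit + NUL fits with room to spare */
	char spect[20];		/* likewise for "SPECTATOR=" */
	char *envp[] = { ro, spect, NULL };

	snprintf(ro, sizeof(ro), "RDONLY=%d", rdonly ? 1 : 0);
	snprintf(spect, sizeof(spect), "SPECTATOR=%d", spectator ? 1 : 0);

	/* kobject_uevent_env() would append these to the event payload;
	 * here we just print what a udev rule would get to match on. */
	for (char **p = envp; *p; p++)
		puts(*p);
	return 0;
}

Passing the strings through a NULL-terminated envp array mirrors the kernel helper's calling convention.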
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c index 9d12b1118ba0..f6a7efa34eb9 100644 --- a/fs/gfs2/util.c +++ b/fs/gfs2/util.c | |||
@@ -38,24 +38,30 @@ int gfs2_lm_withdraw(struct gfs2_sbd *sdp, char *fmt, ...) | |||
38 | const struct lm_lockops *lm = ls->ls_ops; | 38 | const struct lm_lockops *lm = ls->ls_ops; |
39 | va_list args; | 39 | va_list args; |
40 | 40 | ||
41 | if (test_and_set_bit(SDF_SHUTDOWN, &sdp->sd_flags)) | 41 | if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW && |
42 | test_and_set_bit(SDF_SHUTDOWN, &sdp->sd_flags)) | ||
42 | return 0; | 43 | return 0; |
43 | 44 | ||
44 | va_start(args, fmt); | 45 | va_start(args, fmt); |
45 | vprintk(fmt, args); | 46 | vprintk(fmt, args); |
46 | va_end(args); | 47 | va_end(args); |
47 | 48 | ||
48 | fs_err(sdp, "about to withdraw this file system\n"); | 49 | if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW) { |
49 | BUG_ON(sdp->sd_args.ar_debug); | 50 | fs_err(sdp, "about to withdraw this file system\n"); |
51 | BUG_ON(sdp->sd_args.ar_debug); | ||
50 | 52 | ||
51 | kobject_uevent(&sdp->sd_kobj, KOBJ_OFFLINE); | 53 | kobject_uevent(&sdp->sd_kobj, KOBJ_OFFLINE); |
52 | 54 | ||
53 | if (lm->lm_unmount) { | 55 | if (lm->lm_unmount) { |
54 | fs_err(sdp, "telling LM to unmount\n"); | 56 | fs_err(sdp, "telling LM to unmount\n"); |
55 | lm->lm_unmount(sdp); | 57 | lm->lm_unmount(sdp); |
58 | } | ||
59 | fs_err(sdp, "withdrawn\n"); | ||
60 | dump_stack(); | ||
56 | } | 61 | } |
57 | fs_err(sdp, "withdrawn\n"); | 62 | |
58 | dump_stack(); | 63 | if (sdp->sd_args.ar_errors == GFS2_ERRORS_PANIC) |
64 | panic("GFS2: fsid=%s: panic requested.\n", sdp->sd_fsname); | ||
59 | 65 | ||
60 | return -1; | 66 | return -1; |
61 | } | 67 | } |
@@ -93,17 +99,24 @@ int gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion, | |||
93 | gfs2_tune_get(sdp, gt_complain_secs) * HZ)) | 99 | gfs2_tune_get(sdp, gt_complain_secs) * HZ)) |
94 | return -2; | 100 | return -2; |
95 | 101 | ||
96 | printk(KERN_WARNING | 102 | if (sdp->sd_args.ar_errors == GFS2_ERRORS_WITHDRAW) |
97 | "GFS2: fsid=%s: warning: assertion \"%s\" failed\n" | 103 | printk(KERN_WARNING |
98 | "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", | 104 | "GFS2: fsid=%s: warning: assertion \"%s\" failed\n" |
99 | sdp->sd_fsname, assertion, | 105 | "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", |
100 | sdp->sd_fsname, function, file, line); | 106 | sdp->sd_fsname, assertion, |
107 | sdp->sd_fsname, function, file, line); | ||
101 | 108 | ||
102 | if (sdp->sd_args.ar_debug) | 109 | if (sdp->sd_args.ar_debug) |
103 | BUG(); | 110 | BUG(); |
104 | else | 111 | else |
105 | dump_stack(); | 112 | dump_stack(); |
106 | 113 | ||
114 | if (sdp->sd_args.ar_errors == GFS2_ERRORS_PANIC) | ||
115 | panic("GFS2: fsid=%s: warning: assertion \"%s\" failed\n" | ||
116 | "GFS2: fsid=%s: function = %s, file = %s, line = %u\n", | ||
117 | sdp->sd_fsname, assertion, | ||
118 | sdp->sd_fsname, function, file, line); | ||
119 | |||
107 | sdp->sd_last_warning = jiffies; | 120 | sdp->sd_last_warning = jiffies; |
108 | 121 | ||
109 | return -1; | 122 | return -1; |
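Both util.c hunks above make the error response conditional on the errors= mount option: GFS2_ERRORS_WITHDRAW keeps the existing withdraw-and-unmount behaviour, while GFS2_ERRORS_PANIC escalates to panic(). A hedged userspace model of that policy dispatch; the enum names echo the patch, but the values and helpers are stand-ins:

#include <stdio.h>
#include <stdlib.h>

/* Illustrative stand-ins for the GFS2_ERRORS_* mount-option values. */
enum errors_policy { ERRORS_WITHDRAW, ERRORS_PANIC };

static void on_fs_error(enum errors_policy policy, const char *fsid)
{
	if (policy == ERRORS_WITHDRAW) {
		/* old behaviour: mark the fs withdrawn, tell the lock
		 * manager to unmount, keep the machine running */
		fprintf(stderr, "GFS2: fsid=%s: withdrawn\n", fsid);
		return;
	}
	if (policy == ERRORS_PANIC) {
		/* the kernel calls panic(); abort() is the closest
		 * userspace analogue */
		fprintf(stderr, "GFS2: fsid=%s: panic requested.\n", fsid);
		abort();
	}
}

int main(void)
{
	on_fs_error(ERRORS_WITHDRAW, "demo:fs0");
	return 0;
}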
diff --git a/fs/gfs2/eattr.c b/fs/gfs2/xattr.c index 07ea9529adda..8a0f8ef6ee27 100644 --- a/fs/gfs2/eattr.c +++ b/fs/gfs2/xattr.c | |||
@@ -18,8 +18,7 @@ | |||
18 | #include "gfs2.h" | 18 | #include "gfs2.h" |
19 | #include "incore.h" | 19 | #include "incore.h" |
20 | #include "acl.h" | 20 | #include "acl.h" |
21 | #include "eaops.h" | 21 | #include "xattr.h" |
22 | #include "eattr.h" | ||
23 | #include "glock.h" | 22 | #include "glock.h" |
24 | #include "inode.h" | 23 | #include "inode.h" |
25 | #include "meta_io.h" | 24 | #include "meta_io.h" |
@@ -38,26 +37,32 @@ | |||
38 | * Returns: 1 if the EA should be stuffed | 37 | * Returns: 1 if the EA should be stuffed |
39 | */ | 38 | */ |
40 | 39 | ||
41 | static int ea_calc_size(struct gfs2_sbd *sdp, struct gfs2_ea_request *er, | 40 | static int ea_calc_size(struct gfs2_sbd *sdp, unsigned int nsize, size_t dsize, |
42 | unsigned int *size) | 41 | unsigned int *size) |
43 | { | 42 | { |
44 | *size = GFS2_EAREQ_SIZE_STUFFED(er); | 43 | unsigned int jbsize = sdp->sd_jbsize; |
45 | if (*size <= sdp->sd_jbsize) | 44 | |
45 | /* Stuffed */ | ||
46 | *size = ALIGN(sizeof(struct gfs2_ea_header) + nsize + dsize, 8); | ||
47 | |||
48 | if (*size <= jbsize) | ||
46 | return 1; | 49 | return 1; |
47 | 50 | ||
48 | *size = GFS2_EAREQ_SIZE_UNSTUFFED(sdp, er); | 51 | /* Unstuffed */ |
52 | *size = ALIGN(sizeof(struct gfs2_ea_header) + nsize + | ||
53 | (sizeof(__be64) * DIV_ROUND_UP(dsize, jbsize)), 8); | ||
49 | 54 | ||
50 | return 0; | 55 | return 0; |
51 | } | 56 | } |
52 | 57 | ||
53 | static int ea_check_size(struct gfs2_sbd *sdp, struct gfs2_ea_request *er) | 58 | static int ea_check_size(struct gfs2_sbd *sdp, unsigned int nsize, size_t dsize) |
54 | { | 59 | { |
55 | unsigned int size; | 60 | unsigned int size; |
56 | 61 | ||
57 | if (er->er_data_len > GFS2_EA_MAX_DATA_LEN) | 62 | if (dsize > GFS2_EA_MAX_DATA_LEN) |
58 | return -ERANGE; | 63 | return -ERANGE; |
59 | 64 | ||
60 | ea_calc_size(sdp, er, &size); | 65 | ea_calc_size(sdp, nsize, dsize, &size); |
61 | 66 | ||
62 | /* This can only happen with 512 byte blocks */ | 67 | /* This can only happen with 512 byte blocks */ |
63 | if (size > sdp->sd_jbsize) | 68 | if (size > sdp->sd_jbsize) |
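The rewritten ea_calc_size() above computes the footprint from the raw name and data sizes instead of a request struct: stuffed means header, name and data fit in one journaled block; unstuffed stores one big-endian 64-bit block pointer per data block. A runnable reconstruction of that arithmetic, with ALIGN() and DIV_ROUND_UP() expanded as the kernel defines them; the 8-byte header size and the sample jbsize are assumptions made only for this demo:

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

/* Kernel-style helpers; ALIGN() assumes a power-of-two alignment. */
#define ALIGN(x, a)        (((x) + (a) - 1) & ~((size_t)(a) - 1))
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

/* Assumed header size, for the demo only; the real struct gfs2_ea_header
 * is defined in the GFS2 on-disk format headers. */
#define EA_HEADER_SIZE 8

/* Returns 1 if the EA is stuffed, 0 if unstuffed; *size gets the footprint. */
static int ea_calc_size(size_t jbsize, size_t nsize, size_t dsize,
			size_t *size)
{
	/* Stuffed: header + name + data, packed into one journaled block. */
	*size = ALIGN(EA_HEADER_SIZE + nsize + dsize, 8);
	if (*size <= jbsize)
		return 1;

	/* Unstuffed: header + name + one 64-bit pointer per data block. */
	*size = ALIGN(EA_HEADER_SIZE + nsize +
		      sizeof(uint64_t) * DIV_ROUND_UP(dsize, jbsize), 8);
	return 0;
}

int main(void)
{
	size_t sz;
	int stuffed = ea_calc_size(3968, 10, 8000, &sz);

	printf("%s, %zu bytes\n", stuffed ? "stuffed" : "unstuffed", sz);
	return 0;
}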
@@ -151,7 +156,9 @@ out: | |||
151 | } | 156 | } |
152 | 157 | ||
153 | struct ea_find { | 158 | struct ea_find { |
154 | struct gfs2_ea_request *ef_er; | 159 | int type; |
160 | const char *name; | ||
161 | size_t namel; | ||
155 | struct gfs2_ea_location *ef_el; | 162 | struct gfs2_ea_location *ef_el; |
156 | }; | 163 | }; |
157 | 164 | ||
@@ -160,14 +167,13 @@ static int ea_find_i(struct gfs2_inode *ip, struct buffer_head *bh, | |||
160 | void *private) | 167 | void *private) |
161 | { | 168 | { |
162 | struct ea_find *ef = private; | 169 | struct ea_find *ef = private; |
163 | struct gfs2_ea_request *er = ef->ef_er; | ||
164 | 170 | ||
165 | if (ea->ea_type == GFS2_EATYPE_UNUSED) | 171 | if (ea->ea_type == GFS2_EATYPE_UNUSED) |
166 | return 0; | 172 | return 0; |
167 | 173 | ||
168 | if (ea->ea_type == er->er_type) { | 174 | if (ea->ea_type == ef->type) { |
169 | if (ea->ea_name_len == er->er_name_len && | 175 | if (ea->ea_name_len == ef->namel && |
170 | !memcmp(GFS2_EA2NAME(ea), er->er_name, ea->ea_name_len)) { | 176 | !memcmp(GFS2_EA2NAME(ea), ef->name, ea->ea_name_len)) { |
171 | struct gfs2_ea_location *el = ef->ef_el; | 177 | struct gfs2_ea_location *el = ef->ef_el; |
172 | get_bh(bh); | 178 | get_bh(bh); |
173 | el->el_bh = bh; | 179 | el->el_bh = bh; |
@@ -180,13 +186,15 @@ static int ea_find_i(struct gfs2_inode *ip, struct buffer_head *bh, | |||
180 | return 0; | 186 | return 0; |
181 | } | 187 | } |
182 | 188 | ||
183 | int gfs2_ea_find(struct gfs2_inode *ip, struct gfs2_ea_request *er, | 189 | int gfs2_ea_find(struct gfs2_inode *ip, int type, const char *name, |
184 | struct gfs2_ea_location *el) | 190 | struct gfs2_ea_location *el) |
185 | { | 191 | { |
186 | struct ea_find ef; | 192 | struct ea_find ef; |
187 | int error; | 193 | int error; |
188 | 194 | ||
189 | ef.ef_er = er; | 195 | ef.type = type; |
196 | ef.name = name; | ||
197 | ef.namel = strlen(name); | ||
190 | ef.ef_el = el; | 198 | ef.ef_el = el; |
191 | 199 | ||
192 | memset(el, 0, sizeof(struct gfs2_ea_location)); | 200 | memset(el, 0, sizeof(struct gfs2_ea_location)); |
@@ -344,6 +352,20 @@ struct ea_list { | |||
344 | unsigned int ei_size; | 352 | unsigned int ei_size; |
345 | }; | 353 | }; |
346 | 354 | ||
355 | static inline unsigned int gfs2_ea_strlen(struct gfs2_ea_header *ea) | ||
356 | { | ||
357 | switch (ea->ea_type) { | ||
358 | case GFS2_EATYPE_USR: | ||
359 | return 5 + ea->ea_name_len + 1; | ||
360 | case GFS2_EATYPE_SYS: | ||
361 | return 7 + ea->ea_name_len + 1; | ||
362 | case GFS2_EATYPE_SECURITY: | ||
363 | return 9 + ea->ea_name_len + 1; | ||
364 | default: | ||
365 | return 0; | ||
366 | } | ||
367 | } | ||
368 | |||
347 | static int ea_list_i(struct gfs2_inode *ip, struct buffer_head *bh, | 369 | static int ea_list_i(struct gfs2_inode *ip, struct buffer_head *bh, |
348 | struct gfs2_ea_header *ea, struct gfs2_ea_header *prev, | 370 | struct gfs2_ea_header *ea, struct gfs2_ea_header *prev, |
349 | void *private) | 371 | void *private) |
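gfs2_ea_strlen(), moved from the old eattr.h into xattr.c above, sizes one listxattr entry: the namespace prefix, the stored name, and a trailing NUL. The constants 5, 7 and 9 are simply strlen("user."), strlen("system.") and strlen("security."), which the short check below confirms:

#include <stdio.h>
#include <string.h>

/* prefix + stored name + terminating NUL, matching the 5/7/9 constants */
static size_t ea_entry_len(const char *prefix, const char *name)
{
	return strlen(prefix) + strlen(name) + 1;
}

int main(void)
{
	printf("user.     -> %zu\n", strlen("user."));		/* 5 */
	printf("system.   -> %zu\n", strlen("system."));	/* 7 */
	printf("security. -> %zu\n", strlen("security."));	/* 9 */
	printf("\"user.foo\" needs %zu bytes in the listing\n",
	       ea_entry_len("user.", "foo"));
	return 0;
}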
@@ -392,21 +414,25 @@ static int ea_list_i(struct gfs2_inode *ip, struct buffer_head *bh, | |||
392 | } | 414 | } |
393 | 415 | ||
394 | /** | 416 | /** |
395 | * gfs2_ea_list - | 417 | * gfs2_listxattr - List gfs2 extended attributes |
396 | * @ip: | 418 | * @dentry: The dentry whose inode we are interested in |
397 | * @er: | 419 | * @buffer: The buffer to write the results |
420 | * @size: The size of the buffer | ||
398 | * | 421 | * |
399 | * Returns: actual size of data on success, -errno on error | 422 | * Returns: actual size of data on success, -errno on error |
400 | */ | 423 | */ |
401 | 424 | ||
402 | int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er) | 425 | ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size) |
403 | { | 426 | { |
427 | struct gfs2_inode *ip = GFS2_I(dentry->d_inode); | ||
428 | struct gfs2_ea_request er; | ||
404 | struct gfs2_holder i_gh; | 429 | struct gfs2_holder i_gh; |
405 | int error; | 430 | int error; |
406 | 431 | ||
407 | if (!er->er_data || !er->er_data_len) { | 432 | memset(&er, 0, sizeof(struct gfs2_ea_request)); |
408 | er->er_data = NULL; | 433 | if (size) { |
409 | er->er_data_len = 0; | 434 | er.er_data = buffer; |
435 | er.er_data_len = size; | ||
410 | } | 436 | } |
411 | 437 | ||
412 | error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); | 438 | error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); |
@@ -414,7 +440,7 @@ int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er) | |||
414 | return error; | 440 | return error; |
415 | 441 | ||
416 | if (ip->i_eattr) { | 442 | if (ip->i_eattr) { |
417 | struct ea_list ei = { .ei_er = er, .ei_size = 0 }; | 443 | struct ea_list ei = { .ei_er = &er, .ei_size = 0 }; |
418 | 444 | ||
419 | error = ea_foreach(ip, ea_list_i, &ei); | 445 | error = ea_foreach(ip, ea_list_i, &ei); |
420 | if (!error) | 446 | if (!error) |
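gfs2_listxattr() above follows the usual xattr contract: called with size == 0 it only reports how much space the listing needs, and with a real buffer it either fills it or the lower layers fail with -ERANGE. From userspace that shows up as the familiar two-call pattern, sketched here with minimal error handling:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/xattr.h>

int main(int argc, char **argv)
{
	const char *path = argc > 1 ? argv[1] : ".";

	/* First call: NULL buffer, size 0, asks only for the space needed. */
	ssize_t len = listxattr(path, NULL, 0);
	if (len <= 0)
		return 1;	/* error, or no xattrs at all */

	char *buf = malloc(len);
	if (!buf)
		return 1;

	/* Second call: fill the buffer; entries are NUL-separated names. */
	len = listxattr(path, buf, len);
	for (ssize_t off = 0; off < len; off += strlen(buf + off) + 1)
		puts(buf + off);

	free(buf);
	return 0;
}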
@@ -491,84 +517,61 @@ out: | |||
491 | } | 517 | } |
492 | 518 | ||
493 | int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el, | 519 | int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el, |
494 | char *data) | 520 | char *data, size_t size) |
495 | { | 521 | { |
522 | int ret; | ||
523 | size_t len = GFS2_EA_DATA_LEN(el->el_ea); | ||
524 | if (len > size) | ||
525 | return -ERANGE; | ||
526 | |||
496 | if (GFS2_EA_IS_STUFFED(el->el_ea)) { | 527 | if (GFS2_EA_IS_STUFFED(el->el_ea)) { |
497 | memcpy(data, GFS2_EA2DATA(el->el_ea), GFS2_EA_DATA_LEN(el->el_ea)); | 528 | memcpy(data, GFS2_EA2DATA(el->el_ea), len); |
498 | return 0; | 529 | return len; |
499 | } else | 530 | } |
500 | return ea_get_unstuffed(ip, el->el_ea, data); | 531 | ret = ea_get_unstuffed(ip, el->el_ea, data); |
532 | if (ret < 0) | ||
533 | return ret; | ||
534 | return len; | ||
501 | } | 535 | } |
502 | 536 | ||
503 | /** | 537 | /** |
504 | * gfs2_ea_get_i - | 538 | * gfs2_xattr_get - Get a GFS2 extended attribute |
505 | * @ip: The GFS2 inode | 539 | * @inode: The inode |
506 | * @er: The request structure | 540 | * @type: The type of extended attribute |
541 | * @name: The name of the extended attribute | ||
542 | * @buffer: The buffer to write the result into | ||
543 | * @size: The size of the buffer | ||
507 | * | 544 | * |
508 | * Returns: actual size of data on success, -errno on error | 545 | * Returns: actual size of data on success, -errno on error |
509 | */ | 546 | */ |
510 | 547 | ||
511 | int gfs2_ea_get_i(struct gfs2_inode *ip, struct gfs2_ea_request *er) | 548 | int gfs2_xattr_get(struct inode *inode, int type, const char *name, |
549 | void *buffer, size_t size) | ||
512 | { | 550 | { |
551 | struct gfs2_inode *ip = GFS2_I(inode); | ||
513 | struct gfs2_ea_location el; | 552 | struct gfs2_ea_location el; |
514 | int error; | 553 | int error; |
515 | 554 | ||
516 | if (!ip->i_eattr) | 555 | if (!ip->i_eattr) |
517 | return -ENODATA; | 556 | return -ENODATA; |
557 | if (strlen(name) > GFS2_EA_MAX_NAME_LEN) | ||
558 | return -EINVAL; | ||
518 | 559 | ||
519 | error = gfs2_ea_find(ip, er, &el); | 560 | error = gfs2_ea_find(ip, type, name, &el); |
520 | if (error) | 561 | if (error) |
521 | return error; | 562 | return error; |
522 | if (!el.el_ea) | 563 | if (!el.el_ea) |
523 | return -ENODATA; | 564 | return -ENODATA; |
524 | 565 | if (size) | |
525 | if (er->er_data_len) { | 566 | error = gfs2_ea_get_copy(ip, &el, buffer, size); |
526 | if (GFS2_EA_DATA_LEN(el.el_ea) > er->er_data_len) | 567 | else |
527 | error = -ERANGE; | ||
528 | else | ||
529 | error = gfs2_ea_get_copy(ip, &el, er->er_data); | ||
530 | } | ||
531 | if (!error) | ||
532 | error = GFS2_EA_DATA_LEN(el.el_ea); | 568 | error = GFS2_EA_DATA_LEN(el.el_ea); |
533 | |||
534 | brelse(el.el_bh); | 569 | brelse(el.el_bh); |
535 | 570 | ||
536 | return error; | 571 | return error; |
537 | } | 572 | } |
538 | 573 | ||
539 | /** | 574 | /** |
540 | * gfs2_ea_get - | ||
541 | * @ip: The GFS2 inode | ||
542 | * @er: The request structure | ||
543 | * | ||
544 | * Returns: actual size of data on success, -errno on error | ||
545 | */ | ||
546 | |||
547 | int gfs2_ea_get(struct gfs2_inode *ip, struct gfs2_ea_request *er) | ||
548 | { | ||
549 | struct gfs2_holder i_gh; | ||
550 | int error; | ||
551 | |||
552 | if (!er->er_name_len || | ||
553 | er->er_name_len > GFS2_EA_MAX_NAME_LEN) | ||
554 | return -EINVAL; | ||
555 | if (!er->er_data || !er->er_data_len) { | ||
556 | er->er_data = NULL; | ||
557 | er->er_data_len = 0; | ||
558 | } | ||
559 | |||
560 | error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh); | ||
561 | if (error) | ||
562 | return error; | ||
563 | |||
564 | error = gfs2_ea_ops[er->er_type]->eo_get(ip, er); | ||
565 | |||
566 | gfs2_glock_dq_uninit(&i_gh); | ||
567 | |||
568 | return error; | ||
569 | } | ||
570 | |||
571 | /** | ||
572 | * ea_alloc_blk - allocates a new block for extended attributes. | 575 | * ea_alloc_blk - allocates a new block for extended attributes. |
573 | * @ip: A pointer to the inode that's getting extended attributes | 576 | * @ip: A pointer to the inode that's getting extended attributes |
574 | * @bhp: Pointer to pointer to a struct buffer_head | 577 | * @bhp: Pointer to pointer to a struct buffer_head |
@@ -713,12 +716,6 @@ static int ea_alloc_skeleton(struct gfs2_inode *ip, struct gfs2_ea_request *er, | |||
713 | 716 | ||
714 | error = gfs2_meta_inode_buffer(ip, &dibh); | 717 | error = gfs2_meta_inode_buffer(ip, &dibh); |
715 | if (!error) { | 718 | if (!error) { |
716 | if (er->er_flags & GFS2_ERF_MODE) { | ||
717 | gfs2_assert_withdraw(GFS2_SB(&ip->i_inode), | ||
718 | (ip->i_inode.i_mode & S_IFMT) == | ||
719 | (er->er_mode & S_IFMT)); | ||
720 | ip->i_inode.i_mode = er->er_mode; | ||
721 | } | ||
722 | ip->i_inode.i_ctime = CURRENT_TIME; | 719 | ip->i_inode.i_ctime = CURRENT_TIME; |
723 | gfs2_trans_add_bh(ip->i_gl, dibh, 1); | 720 | gfs2_trans_add_bh(ip->i_gl, dibh, 1); |
724 | gfs2_dinode_out(ip, dibh->b_data); | 721 | gfs2_dinode_out(ip, dibh->b_data); |
@@ -762,15 +759,23 @@ static int ea_init_i(struct gfs2_inode *ip, struct gfs2_ea_request *er, | |||
762 | * Returns: errno | 759 | * Returns: errno |
763 | */ | 760 | */ |
764 | 761 | ||
765 | static int ea_init(struct gfs2_inode *ip, struct gfs2_ea_request *er) | 762 | static int ea_init(struct gfs2_inode *ip, int type, const char *name, |
763 | const void *data, size_t size) | ||
766 | { | 764 | { |
765 | struct gfs2_ea_request er; | ||
767 | unsigned int jbsize = GFS2_SB(&ip->i_inode)->sd_jbsize; | 766 | unsigned int jbsize = GFS2_SB(&ip->i_inode)->sd_jbsize; |
768 | unsigned int blks = 1; | 767 | unsigned int blks = 1; |
769 | 768 | ||
770 | if (GFS2_EAREQ_SIZE_STUFFED(er) > jbsize) | 769 | er.er_type = type; |
771 | blks += DIV_ROUND_UP(er->er_data_len, jbsize); | 770 | er.er_name = name; |
771 | er.er_name_len = strlen(name); | ||
772 | er.er_data = (void *)data; | ||
773 | er.er_data_len = size; | ||
774 | |||
775 | if (GFS2_EAREQ_SIZE_STUFFED(&er) > jbsize) | ||
776 | blks += DIV_ROUND_UP(er.er_data_len, jbsize); | ||
772 | 777 | ||
773 | return ea_alloc_skeleton(ip, er, blks, ea_init_i, NULL); | 778 | return ea_alloc_skeleton(ip, &er, blks, ea_init_i, NULL); |
774 | } | 779 | } |
775 | 780 | ||
776 | static struct gfs2_ea_header *ea_split_ea(struct gfs2_ea_header *ea) | 781 | static struct gfs2_ea_header *ea_split_ea(struct gfs2_ea_header *ea) |
@@ -848,12 +853,6 @@ static int ea_set_simple_noalloc(struct gfs2_inode *ip, struct buffer_head *bh, | |||
848 | error = gfs2_meta_inode_buffer(ip, &dibh); | 853 | error = gfs2_meta_inode_buffer(ip, &dibh); |
849 | if (error) | 854 | if (error) |
850 | goto out; | 855 | goto out; |
851 | |||
852 | if (er->er_flags & GFS2_ERF_MODE) { | ||
853 | gfs2_assert_withdraw(GFS2_SB(&ip->i_inode), | ||
854 | (ip->i_inode.i_mode & S_IFMT) == (er->er_mode & S_IFMT)); | ||
855 | ip->i_inode.i_mode = er->er_mode; | ||
856 | } | ||
857 | ip->i_inode.i_ctime = CURRENT_TIME; | 856 | ip->i_inode.i_ctime = CURRENT_TIME; |
858 | gfs2_trans_add_bh(ip->i_gl, dibh, 1); | 857 | gfs2_trans_add_bh(ip->i_gl, dibh, 1); |
859 | gfs2_dinode_out(ip, dibh->b_data); | 858 | gfs2_dinode_out(ip, dibh->b_data); |
@@ -894,7 +893,8 @@ static int ea_set_simple(struct gfs2_inode *ip, struct buffer_head *bh, | |||
894 | int stuffed; | 893 | int stuffed; |
895 | int error; | 894 | int error; |
896 | 895 | ||
897 | stuffed = ea_calc_size(GFS2_SB(&ip->i_inode), es->es_er, &size); | 896 | stuffed = ea_calc_size(GFS2_SB(&ip->i_inode), es->es_er->er_name_len, |
897 | es->es_er->er_data_len, &size); | ||
898 | 898 | ||
899 | if (ea->ea_type == GFS2_EATYPE_UNUSED) { | 899 | if (ea->ea_type == GFS2_EATYPE_UNUSED) { |
900 | if (GFS2_EA_REC_LEN(ea) < size) | 900 | if (GFS2_EA_REC_LEN(ea) < size) |
@@ -1005,15 +1005,22 @@ out: | |||
1005 | return error; | 1005 | return error; |
1006 | } | 1006 | } |
1007 | 1007 | ||
1008 | static int ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er, | 1008 | static int ea_set_i(struct gfs2_inode *ip, int type, const char *name, |
1009 | struct gfs2_ea_location *el) | 1009 | const void *value, size_t size, struct gfs2_ea_location *el) |
1010 | { | 1010 | { |
1011 | struct gfs2_ea_request er; | ||
1011 | struct ea_set es; | 1012 | struct ea_set es; |
1012 | unsigned int blks = 2; | 1013 | unsigned int blks = 2; |
1013 | int error; | 1014 | int error; |
1014 | 1015 | ||
1016 | er.er_type = type; | ||
1017 | er.er_name = name; | ||
1018 | er.er_data = (void *)value; | ||
1019 | er.er_name_len = strlen(name); | ||
1020 | er.er_data_len = size; | ||
1021 | |||
1015 | memset(&es, 0, sizeof(struct ea_set)); | 1022 | memset(&es, 0, sizeof(struct ea_set)); |
1016 | es.es_er = er; | 1023 | es.es_er = &er; |
1017 | es.es_el = el; | 1024 | es.es_el = el; |
1018 | 1025 | ||
1019 | error = ea_foreach(ip, ea_set_simple, &es); | 1026 | error = ea_foreach(ip, ea_set_simple, &es); |
@@ -1024,10 +1031,10 @@ static int ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er, | |||
1024 | 1031 | ||
1025 | if (!(ip->i_diskflags & GFS2_DIF_EA_INDIRECT)) | 1032 | if (!(ip->i_diskflags & GFS2_DIF_EA_INDIRECT)) |
1026 | blks++; | 1033 | blks++; |
1027 | if (GFS2_EAREQ_SIZE_STUFFED(er) > GFS2_SB(&ip->i_inode)->sd_jbsize) | 1034 | if (GFS2_EAREQ_SIZE_STUFFED(&er) > GFS2_SB(&ip->i_inode)->sd_jbsize) |
1028 | blks += DIV_ROUND_UP(er->er_data_len, GFS2_SB(&ip->i_inode)->sd_jbsize); | 1035 | blks += DIV_ROUND_UP(er.er_data_len, GFS2_SB(&ip->i_inode)->sd_jbsize); |
1029 | 1036 | ||
1030 | return ea_alloc_skeleton(ip, er, blks, ea_set_block, el); | 1037 | return ea_alloc_skeleton(ip, &er, blks, ea_set_block, el); |
1031 | } | 1038 | } |
1032 | 1039 | ||
1033 | static int ea_set_remove_unstuffed(struct gfs2_inode *ip, | 1040 | static int ea_set_remove_unstuffed(struct gfs2_inode *ip, |
@@ -1039,75 +1046,7 @@ static int ea_set_remove_unstuffed(struct gfs2_inode *ip, | |||
1039 | GFS2_EA2NEXT(el->el_prev) == el->el_ea); | 1046 | GFS2_EA2NEXT(el->el_prev) == el->el_ea); |
1040 | } | 1047 | } |
1041 | 1048 | ||
1042 | return ea_remove_unstuffed(ip, el->el_bh, el->el_ea, el->el_prev,0); | 1049 | return ea_remove_unstuffed(ip, el->el_bh, el->el_ea, el->el_prev, 0); |
1043 | } | ||
1044 | |||
1045 | int gfs2_ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er) | ||
1046 | { | ||
1047 | struct gfs2_ea_location el; | ||
1048 | int error; | ||
1049 | |||
1050 | if (!ip->i_eattr) { | ||
1051 | if (er->er_flags & XATTR_REPLACE) | ||
1052 | return -ENODATA; | ||
1053 | return ea_init(ip, er); | ||
1054 | } | ||
1055 | |||
1056 | error = gfs2_ea_find(ip, er, &el); | ||
1057 | if (error) | ||
1058 | return error; | ||
1059 | |||
1060 | if (el.el_ea) { | ||
1061 | if (ip->i_diskflags & GFS2_DIF_APPENDONLY) { | ||
1062 | brelse(el.el_bh); | ||
1063 | return -EPERM; | ||
1064 | } | ||
1065 | |||
1066 | error = -EEXIST; | ||
1067 | if (!(er->er_flags & XATTR_CREATE)) { | ||
1068 | int unstuffed = !GFS2_EA_IS_STUFFED(el.el_ea); | ||
1069 | error = ea_set_i(ip, er, &el); | ||
1070 | if (!error && unstuffed) | ||
1071 | ea_set_remove_unstuffed(ip, &el); | ||
1072 | } | ||
1073 | |||
1074 | brelse(el.el_bh); | ||
1075 | } else { | ||
1076 | error = -ENODATA; | ||
1077 | if (!(er->er_flags & XATTR_REPLACE)) | ||
1078 | error = ea_set_i(ip, er, NULL); | ||
1079 | } | ||
1080 | |||
1081 | return error; | ||
1082 | } | ||
1083 | |||
1084 | int gfs2_ea_set(struct gfs2_inode *ip, struct gfs2_ea_request *er) | ||
1085 | { | ||
1086 | struct gfs2_holder i_gh; | ||
1087 | int error; | ||
1088 | |||
1089 | if (!er->er_name_len || er->er_name_len > GFS2_EA_MAX_NAME_LEN) | ||
1090 | return -EINVAL; | ||
1091 | if (!er->er_data || !er->er_data_len) { | ||
1092 | er->er_data = NULL; | ||
1093 | er->er_data_len = 0; | ||
1094 | } | ||
1095 | error = ea_check_size(GFS2_SB(&ip->i_inode), er); | ||
1096 | if (error) | ||
1097 | return error; | ||
1098 | |||
1099 | error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh); | ||
1100 | if (error) | ||
1101 | return error; | ||
1102 | |||
1103 | if (IS_IMMUTABLE(&ip->i_inode)) | ||
1104 | error = -EPERM; | ||
1105 | else | ||
1106 | error = gfs2_ea_ops[er->er_type]->eo_set(ip, er); | ||
1107 | |||
1108 | gfs2_glock_dq_uninit(&i_gh); | ||
1109 | |||
1110 | return error; | ||
1111 | } | 1050 | } |
1112 | 1051 | ||
1113 | static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el) | 1052 | static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el) |
@@ -1131,8 +1070,9 @@ static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el) | |||
1131 | 1070 | ||
1132 | if (GFS2_EA_IS_LAST(ea)) | 1071 | if (GFS2_EA_IS_LAST(ea)) |
1133 | prev->ea_flags |= GFS2_EAFLAG_LAST; | 1072 | prev->ea_flags |= GFS2_EAFLAG_LAST; |
1134 | } else | 1073 | } else { |
1135 | ea->ea_type = GFS2_EATYPE_UNUSED; | 1074 | ea->ea_type = GFS2_EATYPE_UNUSED; |
1075 | } | ||
1136 | 1076 | ||
1137 | error = gfs2_meta_inode_buffer(ip, &dibh); | 1077 | error = gfs2_meta_inode_buffer(ip, &dibh); |
1138 | if (!error) { | 1078 | if (!error) { |
@@ -1147,15 +1087,29 @@ static int ea_remove_stuffed(struct gfs2_inode *ip, struct gfs2_ea_location *el) | |||
1147 | return error; | 1087 | return error; |
1148 | } | 1088 | } |
1149 | 1089 | ||
1150 | int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er) | 1090 | /** |
1091 | * gfs2_xattr_remove - Remove a GFS2 extended attribute | ||
1092 | * @inode: The inode | ||
1093 | * @type: The type of the extended attribute | ||
1094 | * @name: The name of the extended attribute | ||
1095 | * | ||
1096 | * This is not called directly by the VFS since we use the (common) | ||
1097 | * scheme of making a "set with NULL data" mean a remove request. Note | ||
1098 | * that this is different from a set with zero length data. | ||
1099 | * | ||
1100 | * Returns: 0, or errno on failure | ||
1101 | */ | ||
1102 | |||
1103 | static int gfs2_xattr_remove(struct inode *inode, int type, const char *name) | ||
1151 | { | 1104 | { |
1105 | struct gfs2_inode *ip = GFS2_I(inode); | ||
1152 | struct gfs2_ea_location el; | 1106 | struct gfs2_ea_location el; |
1153 | int error; | 1107 | int error; |
1154 | 1108 | ||
1155 | if (!ip->i_eattr) | 1109 | if (!ip->i_eattr) |
1156 | return -ENODATA; | 1110 | return -ENODATA; |
1157 | 1111 | ||
1158 | error = gfs2_ea_find(ip, er, &el); | 1112 | error = gfs2_ea_find(ip, type, name, &el); |
1159 | if (error) | 1113 | if (error) |
1160 | return error; | 1114 | return error; |
1161 | if (!el.el_ea) | 1115 | if (!el.el_ea) |
@@ -1164,8 +1118,7 @@ int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er) | |||
1164 | if (GFS2_EA_IS_STUFFED(el.el_ea)) | 1118 | if (GFS2_EA_IS_STUFFED(el.el_ea)) |
1165 | error = ea_remove_stuffed(ip, &el); | 1119 | error = ea_remove_stuffed(ip, &el); |
1166 | else | 1120 | else |
1167 | error = ea_remove_unstuffed(ip, el.el_bh, el.el_ea, el.el_prev, | 1121 | error = ea_remove_unstuffed(ip, el.el_bh, el.el_ea, el.el_prev, 0); |
1168 | 0); | ||
1169 | 1122 | ||
1170 | brelse(el.el_bh); | 1123 | brelse(el.el_bh); |
1171 | 1124 | ||
@@ -1173,31 +1126,70 @@ int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er) | |||
1173 | } | 1126 | } |
1174 | 1127 | ||
1175 | /** | 1128 | /** |
1176 | * gfs2_ea_remove - sets (or creates or replaces) an extended attribute | 1129 | * gfs2_xattr_set - Set (or remove) a GFS2 extended attribute |
1177 | * @ip: pointer to the inode of the target file | 1130 | * @inode: The inode |
1178 | * @er: request information | 1131 | * @type: The type of the extended attribute |
1132 | * @name: The name of the extended attribute | ||
1133 | * @value: The value of the extended attribute (NULL for remove) | ||
1134 | * @size: The size of the @value argument | ||
1135 | * @flags: Create or Replace | ||
1179 | * | 1136 | * |
1180 | * Returns: errno | 1137 | * See gfs2_xattr_remove() for details of the removal of xattrs. |
1138 | * | ||
1139 | * Returns: 0 or errno on failure | ||
1181 | */ | 1140 | */ |
1182 | 1141 | ||
1183 | int gfs2_ea_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er) | 1142 | int gfs2_xattr_set(struct inode *inode, int type, const char *name, |
1143 | const void *value, size_t size, int flags) | ||
1184 | { | 1144 | { |
1185 | struct gfs2_holder i_gh; | 1145 | struct gfs2_sbd *sdp = GFS2_SB(inode); |
1146 | struct gfs2_inode *ip = GFS2_I(inode); | ||
1147 | struct gfs2_ea_location el; | ||
1148 | unsigned int namel = strlen(name); | ||
1186 | int error; | 1149 | int error; |
1187 | 1150 | ||
1188 | if (!er->er_name_len || er->er_name_len > GFS2_EA_MAX_NAME_LEN) | 1151 | if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) |
1189 | return -EINVAL; | 1152 | return -EPERM; |
1153 | if (namel > GFS2_EA_MAX_NAME_LEN) | ||
1154 | return -ERANGE; | ||
1190 | 1155 | ||
1191 | error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh); | 1156 | if (value == NULL) |
1157 | return gfs2_xattr_remove(inode, type, name); | ||
1158 | |||
1159 | if (ea_check_size(sdp, namel, size)) | ||
1160 | return -ERANGE; | ||
1161 | |||
1162 | if (!ip->i_eattr) { | ||
1163 | if (flags & XATTR_REPLACE) | ||
1164 | return -ENODATA; | ||
1165 | return ea_init(ip, type, name, value, size); | ||
1166 | } | ||
1167 | |||
1168 | error = gfs2_ea_find(ip, type, name, &el); | ||
1192 | if (error) | 1169 | if (error) |
1193 | return error; | 1170 | return error; |
1194 | 1171 | ||
1195 | if (IS_IMMUTABLE(&ip->i_inode) || IS_APPEND(&ip->i_inode)) | 1172 | if (el.el_ea) { |
1196 | error = -EPERM; | 1173 | if (ip->i_diskflags & GFS2_DIF_APPENDONLY) { |
1197 | else | 1174 | brelse(el.el_bh); |
1198 | error = gfs2_ea_ops[er->er_type]->eo_remove(ip, er); | 1175 | return -EPERM; |
1176 | } | ||
1199 | 1177 | ||
1200 | gfs2_glock_dq_uninit(&i_gh); | 1178 | error = -EEXIST; |
1179 | if (!(flags & XATTR_CREATE)) { | ||
1180 | int unstuffed = !GFS2_EA_IS_STUFFED(el.el_ea); | ||
1181 | error = ea_set_i(ip, type, name, value, size, &el); | ||
1182 | if (!error && unstuffed) | ||
1183 | ea_set_remove_unstuffed(ip, &el); | ||
1184 | } | ||
1185 | |||
1186 | brelse(el.el_bh); | ||
1187 | return error; | ||
1188 | } | ||
1189 | |||
1190 | error = -ENODATA; | ||
1191 | if (!(flags & XATTR_REPLACE)) | ||
1192 | error = ea_set_i(ip, type, name, value, size, NULL); | ||
1201 | 1193 | ||
1202 | return error; | 1194 | return error; |
1203 | } | 1195 | } |
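The consolidated gfs2_xattr_set() above implements the standard flag semantics: XATTR_CREATE fails with EEXIST when the attribute already exists, XATTR_REPLACE fails with ENODATA when it does not, and a NULL value is routed to gfs2_xattr_remove(). The same contract, exercised through the syscall interface; the path and attribute name here are illustrative:

#include <stdio.h>
#include <string.h>
#include <sys/xattr.h>

int main(int argc, char **argv)
{
	const char *path = argc > 1 ? argv[1] : "demo.txt";
	const char *val = "hello";

	/* XATTR_CREATE: succeeds only if the attribute is absent. */
	if (setxattr(path, "user.demo", val, strlen(val), XATTR_CREATE) < 0)
		perror("create");	/* EEXIST on a second run */

	/* XATTR_REPLACE: succeeds only if the attribute is present. */
	if (setxattr(path, "user.demo", val, strlen(val), XATTR_REPLACE) < 0)
		perror("replace");	/* ENODATA if it never existed */

	/* removexattr() is the userspace face of the NULL-value remove path. */
	if (removexattr(path, "user.demo") < 0)
		perror("remove");
	return 0;
}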
@@ -1503,3 +1495,64 @@ out_alloc: | |||
1503 | return error; | 1495 | return error; |
1504 | } | 1496 | } |
1505 | 1497 | ||
1498 | static int gfs2_xattr_user_get(struct inode *inode, const char *name, | ||
1499 | void *buffer, size_t size) | ||
1500 | { | ||
1501 | return gfs2_xattr_get(inode, GFS2_EATYPE_USR, name, buffer, size); | ||
1502 | } | ||
1503 | |||
1504 | static int gfs2_xattr_user_set(struct inode *inode, const char *name, | ||
1505 | const void *value, size_t size, int flags) | ||
1506 | { | ||
1507 | return gfs2_xattr_set(inode, GFS2_EATYPE_USR, name, value, size, flags); | ||
1508 | } | ||
1509 | |||
1510 | static int gfs2_xattr_system_get(struct inode *inode, const char *name, | ||
1511 | void *buffer, size_t size) | ||
1512 | { | ||
1513 | return gfs2_xattr_get(inode, GFS2_EATYPE_SYS, name, buffer, size); | ||
1514 | } | ||
1515 | |||
1516 | static int gfs2_xattr_system_set(struct inode *inode, const char *name, | ||
1517 | const void *value, size_t size, int flags) | ||
1518 | { | ||
1519 | return gfs2_xattr_set(inode, GFS2_EATYPE_SYS, name, value, size, flags); | ||
1520 | } | ||
1521 | |||
1522 | static int gfs2_xattr_security_get(struct inode *inode, const char *name, | ||
1523 | void *buffer, size_t size) | ||
1524 | { | ||
1525 | return gfs2_xattr_get(inode, GFS2_EATYPE_SECURITY, name, buffer, size); | ||
1526 | } | ||
1527 | |||
1528 | static int gfs2_xattr_security_set(struct inode *inode, const char *name, | ||
1529 | const void *value, size_t size, int flags) | ||
1530 | { | ||
1531 | return gfs2_xattr_set(inode, GFS2_EATYPE_SECURITY, name, value, size, flags); | ||
1532 | } | ||
1533 | |||
1534 | static struct xattr_handler gfs2_xattr_user_handler = { | ||
1535 | .prefix = XATTR_USER_PREFIX, | ||
1536 | .get = gfs2_xattr_user_get, | ||
1537 | .set = gfs2_xattr_user_set, | ||
1538 | }; | ||
1539 | |||
1540 | static struct xattr_handler gfs2_xattr_security_handler = { | ||
1541 | .prefix = XATTR_SECURITY_PREFIX, | ||
1542 | .get = gfs2_xattr_security_get, | ||
1543 | .set = gfs2_xattr_security_set, | ||
1544 | }; | ||
1545 | |||
1546 | static struct xattr_handler gfs2_xattr_system_handler = { | ||
1547 | .prefix = XATTR_SYSTEM_PREFIX, | ||
1548 | .get = gfs2_xattr_system_get, | ||
1549 | .set = gfs2_xattr_system_set, | ||
1550 | }; | ||
1551 | |||
1552 | struct xattr_handler *gfs2_xattr_handlers[] = { | ||
1553 | &gfs2_xattr_user_handler, | ||
1554 | &gfs2_xattr_security_handler, | ||
1555 | &gfs2_xattr_system_handler, | ||
1556 | NULL, | ||
1557 | }; | ||
1558 | |||
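The gfs2_xattr_handlers[] table registered above hands prefix matching to the generic VFS xattr code: each handler claims one namespace prefix and forwards to gfs2_xattr_get()/gfs2_xattr_set() with the matching GFS2_EATYPE_* value, which is what lets the old gfs2_ea_ops indirection go away. A simplified model of that dispatch; the struct layout and type numbers below are stand-ins, not the kernel's definitions:

#include <stdio.h>
#include <string.h>

/* Simplified stand-in for struct xattr_handler; not the kernel layout. */
struct handler {
	const char *prefix;
	int type;	/* stand-in for a GFS2_EATYPE_* value */
};

static const struct handler handlers[] = {
	{ "user.",     1 },
	{ "security.", 3 },
	{ "system.",   2 },
	{ NULL,        0 },
};

/* Mimics the generic VFS step: match a prefix, strip it, pick a handler. */
static const struct handler *dispatch(const char *name, const char **rest)
{
	for (const struct handler *h = handlers; h->prefix; h++) {
		size_t n = strlen(h->prefix);
		if (!strncmp(name, h->prefix, n)) {
			*rest = name + n;
			return h;
		}
	}
	return NULL;	/* no namespace claimed this name */
}

int main(void)
{
	const char *rest;
	const struct handler *h = dispatch("user.demo", &rest);

	if (h)
		printf("handler type %d gets name \"%s\"\n", h->type, rest);
	return 0;
}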
diff --git a/fs/gfs2/eattr.h b/fs/gfs2/xattr.h index c82dbe01d713..cbdfd7743733 100644 --- a/fs/gfs2/eattr.h +++ b/fs/gfs2/xattr.h | |||
@@ -19,7 +19,7 @@ struct iattr; | |||
19 | #define GFS2_EA_SIZE(ea) \ | 19 | #define GFS2_EA_SIZE(ea) \ |
20 | ALIGN(sizeof(struct gfs2_ea_header) + (ea)->ea_name_len + \ | 20 | ALIGN(sizeof(struct gfs2_ea_header) + (ea)->ea_name_len + \ |
21 | ((GFS2_EA_IS_STUFFED(ea)) ? GFS2_EA_DATA_LEN(ea) : \ | 21 | ((GFS2_EA_IS_STUFFED(ea)) ? GFS2_EA_DATA_LEN(ea) : \ |
22 | (sizeof(__be64) * (ea)->ea_num_ptrs)), 8) | 22 | (sizeof(__be64) * (ea)->ea_num_ptrs)), 8) |
23 | 23 | ||
24 | #define GFS2_EA_IS_STUFFED(ea) (!(ea)->ea_num_ptrs) | 24 | #define GFS2_EA_IS_STUFFED(ea) (!(ea)->ea_num_ptrs) |
25 | #define GFS2_EA_IS_LAST(ea) ((ea)->ea_flags & GFS2_EAFLAG_LAST) | 25 | #define GFS2_EA_IS_LAST(ea) ((ea)->ea_flags & GFS2_EAFLAG_LAST) |
@@ -27,10 +27,6 @@ ALIGN(sizeof(struct gfs2_ea_header) + (ea)->ea_name_len + \ | |||
27 | #define GFS2_EAREQ_SIZE_STUFFED(er) \ | 27 | #define GFS2_EAREQ_SIZE_STUFFED(er) \ |
28 | ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + (er)->er_data_len, 8) | 28 | ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + (er)->er_data_len, 8) |
29 | 29 | ||
30 | #define GFS2_EAREQ_SIZE_UNSTUFFED(sdp, er) \ | ||
31 | ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + \ | ||
32 | sizeof(__be64) * DIV_ROUND_UP((er)->er_data_len, (sdp)->sd_jbsize), 8) | ||
33 | |||
34 | #define GFS2_EA2NAME(ea) ((char *)((struct gfs2_ea_header *)(ea) + 1)) | 30 | #define GFS2_EA2NAME(ea) ((char *)((struct gfs2_ea_header *)(ea) + 1)) |
35 | #define GFS2_EA2DATA(ea) (GFS2_EA2NAME(ea) + (ea)->ea_name_len) | 31 | #define GFS2_EA2DATA(ea) (GFS2_EA2NAME(ea) + (ea)->ea_name_len) |
36 | 32 | ||
@@ -43,16 +39,12 @@ ALIGN(sizeof(struct gfs2_ea_header) + (er)->er_name_len + \ | |||
43 | #define GFS2_EA_BH2FIRST(bh) \ | 39 | #define GFS2_EA_BH2FIRST(bh) \ |
44 | ((struct gfs2_ea_header *)((bh)->b_data + sizeof(struct gfs2_meta_header))) | 40 | ((struct gfs2_ea_header *)((bh)->b_data + sizeof(struct gfs2_meta_header))) |
45 | 41 | ||
46 | #define GFS2_ERF_MODE 0x80000000 | ||
47 | |||
48 | struct gfs2_ea_request { | 42 | struct gfs2_ea_request { |
49 | const char *er_name; | 43 | const char *er_name; |
50 | char *er_data; | 44 | char *er_data; |
51 | unsigned int er_name_len; | 45 | unsigned int er_name_len; |
52 | unsigned int er_data_len; | 46 | unsigned int er_data_len; |
53 | unsigned int er_type; /* GFS2_EATYPE_... */ | 47 | unsigned int er_type; /* GFS2_EATYPE_... */ |
54 | int er_flags; | ||
55 | mode_t er_mode; | ||
56 | }; | 48 | }; |
57 | 49 | ||
58 | struct gfs2_ea_location { | 50 | struct gfs2_ea_location { |
@@ -61,40 +53,20 @@ struct gfs2_ea_location { | |||
61 | struct gfs2_ea_header *el_prev; | 53 | struct gfs2_ea_header *el_prev; |
62 | }; | 54 | }; |
63 | 55 | ||
64 | int gfs2_ea_get_i(struct gfs2_inode *ip, struct gfs2_ea_request *er); | 56 | extern int gfs2_xattr_get(struct inode *inode, int type, const char *name, |
65 | int gfs2_ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er); | 57 | void *buffer, size_t size); |
66 | int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er); | 58 | extern int gfs2_xattr_set(struct inode *inode, int type, const char *name, |
67 | 59 | const void *value, size_t size, int flags); | |
68 | int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er); | 60 | extern ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size); |
69 | int gfs2_ea_get(struct gfs2_inode *ip, struct gfs2_ea_request *er); | 61 | extern int gfs2_ea_dealloc(struct gfs2_inode *ip); |
70 | int gfs2_ea_set(struct gfs2_inode *ip, struct gfs2_ea_request *er); | ||
71 | int gfs2_ea_remove(struct gfs2_inode *ip, struct gfs2_ea_request *er); | ||
72 | |||
73 | int gfs2_ea_dealloc(struct gfs2_inode *ip); | ||
74 | 62 | ||
75 | /* Exported to acl.c */ | 63 | /* Exported to acl.c */ |
76 | 64 | ||
77 | int gfs2_ea_find(struct gfs2_inode *ip, | 65 | extern int gfs2_ea_find(struct gfs2_inode *ip, int type, const char *name, |
78 | struct gfs2_ea_request *er, | 66 | struct gfs2_ea_location *el); |
79 | struct gfs2_ea_location *el); | 67 | extern int gfs2_ea_get_copy(struct gfs2_inode *ip, struct gfs2_ea_location *el, |
80 | int gfs2_ea_get_copy(struct gfs2_inode *ip, | 68 | char *data, size_t size); |
81 | struct gfs2_ea_location *el, | 69 | extern int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el, |
82 | char *data); | 70 | struct iattr *attr, char *data); |
83 | int gfs2_ea_acl_chmod(struct gfs2_inode *ip, struct gfs2_ea_location *el, | ||
84 | struct iattr *attr, char *data); | ||
85 | |||
86 | static inline unsigned int gfs2_ea_strlen(struct gfs2_ea_header *ea) | ||
87 | { | ||
88 | switch (ea->ea_type) { | ||
89 | case GFS2_EATYPE_USR: | ||
90 | return 5 + ea->ea_name_len + 1; | ||
91 | case GFS2_EATYPE_SYS: | ||
92 | return 7 + ea->ea_name_len + 1; | ||
93 | case GFS2_EATYPE_SECURITY: | ||
94 | return 9 + ea->ea_name_len + 1; | ||
95 | default: | ||
96 | return 0; | ||
97 | } | ||
98 | } | ||
99 | 71 | ||
100 | #endif /* __EATTR_DOT_H__ */ | 72 | #endif /* __EATTR_DOT_H__ */ |
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 941c8425c10b..a93b885311d8 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c | |||
@@ -44,6 +44,7 @@ static const struct inode_operations hugetlbfs_dir_inode_operations; | |||
44 | static const struct inode_operations hugetlbfs_inode_operations; | 44 | static const struct inode_operations hugetlbfs_inode_operations; |
45 | 45 | ||
46 | static struct backing_dev_info hugetlbfs_backing_dev_info = { | 46 | static struct backing_dev_info hugetlbfs_backing_dev_info = { |
47 | .name = "hugetlbfs", | ||
47 | .ra_pages = 0, /* No readahead */ | 48 | .ra_pages = 0, /* No readahead */ |
48 | .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, | 49 | .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, |
49 | }; | 50 | }; |
@@ -935,26 +936,28 @@ static int can_do_hugetlb_shm(void) | |||
935 | return capable(CAP_IPC_LOCK) || in_group_p(sysctl_hugetlb_shm_group); | 936 | return capable(CAP_IPC_LOCK) || in_group_p(sysctl_hugetlb_shm_group); |
936 | } | 937 | } |
937 | 938 | ||
938 | struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag) | 939 | struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag, |
940 | struct user_struct **user) | ||
939 | { | 941 | { |
940 | int error = -ENOMEM; | 942 | int error = -ENOMEM; |
941 | int unlock_shm = 0; | ||
942 | struct file *file; | 943 | struct file *file; |
943 | struct inode *inode; | 944 | struct inode *inode; |
944 | struct dentry *dentry, *root; | 945 | struct dentry *dentry, *root; |
945 | struct qstr quick_string; | 946 | struct qstr quick_string; |
946 | struct user_struct *user = current_user(); | ||
947 | 947 | ||
948 | *user = NULL; | ||
948 | if (!hugetlbfs_vfsmount) | 949 | if (!hugetlbfs_vfsmount) |
949 | return ERR_PTR(-ENOENT); | 950 | return ERR_PTR(-ENOENT); |
950 | 951 | ||
951 | if (!can_do_hugetlb_shm()) { | 952 | if (!can_do_hugetlb_shm()) { |
952 | if (user_shm_lock(size, user)) { | 953 | *user = current_user(); |
953 | unlock_shm = 1; | 954 | if (user_shm_lock(size, *user)) { |
954 | WARN_ONCE(1, | 955 | WARN_ONCE(1, |
955 | "Using mlock ulimits for SHM_HUGETLB deprecated\n"); | 956 | "Using mlock ulimits for SHM_HUGETLB deprecated\n"); |
956 | } else | 957 | } else { |
958 | *user = NULL; | ||
957 | return ERR_PTR(-EPERM); | 959 | return ERR_PTR(-EPERM); |
960 | } | ||
958 | } | 961 | } |
959 | 962 | ||
960 | root = hugetlbfs_vfsmount->mnt_root; | 963 | root = hugetlbfs_vfsmount->mnt_root; |
@@ -996,8 +999,10 @@ out_inode: | |||
996 | out_dentry: | 999 | out_dentry: |
997 | dput(dentry); | 1000 | dput(dentry); |
998 | out_shm_unlock: | 1001 | out_shm_unlock: |
999 | if (unlock_shm) | 1002 | if (*user) { |
1000 | user_shm_unlock(size, user); | 1003 | user_shm_unlock(size, *user); |
1004 | *user = NULL; | ||
1005 | } | ||
1001 | return ERR_PTR(error); | 1006 | return ERR_PTR(error); |
1002 | } | 1007 | } |
1003 | 1008 | ||
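The hugetlb_file_setup() change above threads the accounting owner back to the caller through a struct user_struct ** out parameter: *user is non-NULL exactly while a user_shm_lock() charge is outstanding, so both the internal error path and the eventual caller know whether an unlock is still owed. A hedged model of the idiom; the types and the failure trigger are invented for the demo:

#include <stdio.h>
#include <stddef.h>

struct user_acct { long locked_kb; };	/* stand-in for struct user_struct */

/*
 * Mirrors the new hugetlb_file_setup() contract: *user is set exactly
 * while a charge is outstanding, so every exit path (and the caller)
 * knows whether an unlock is owed.
 */
static int file_setup(size_t kb, int will_fail, struct user_acct **user)
{
	static struct user_acct current_user;

	*user = &current_user;
	(*user)->locked_kb += kb;		/* user_shm_lock() analogue */

	if (will_fail) {			/* e.g. inode allocation failed */
		(*user)->locked_kb -= kb;	/* user_shm_unlock() analogue */
		*user = NULL;			/* caller must not unlock again */
		return -1;
	}
	return 0;
}

int main(void)
{
	struct user_acct *user;

	if (file_setup(64, 1, &user) < 0 && user == NULL)
		puts("failure path already dropped the charge");
	return 0;
}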
diff --git a/fs/inode.c b/fs/inode.c index 901bad1e5f12..b2ba83d2c4e1 100644 --- a/fs/inode.c +++ b/fs/inode.c | |||
@@ -120,12 +120,11 @@ static void wake_up_inode(struct inode *inode) | |||
120 | * These are initializations that need to be done on every inode | 120 | * These are initializations that need to be done on every inode |
121 | * allocation as the fields are not initialised by slab allocation. | 121 | * allocation as the fields are not initialised by slab allocation. |
122 | */ | 122 | */ |
123 | struct inode *inode_init_always(struct super_block *sb, struct inode *inode) | 123 | int inode_init_always(struct super_block *sb, struct inode *inode) |
124 | { | 124 | { |
125 | static const struct address_space_operations empty_aops; | 125 | static const struct address_space_operations empty_aops; |
126 | static struct inode_operations empty_iops; | 126 | static struct inode_operations empty_iops; |
127 | static const struct file_operations empty_fops; | 127 | static const struct file_operations empty_fops; |
128 | |||
129 | struct address_space *const mapping = &inode->i_data; | 128 | struct address_space *const mapping = &inode->i_data; |
130 | 129 | ||
131 | inode->i_sb = sb; | 130 | inode->i_sb = sb; |
@@ -152,7 +151,7 @@ struct inode *inode_init_always(struct super_block *sb, struct inode *inode) | |||
152 | inode->dirtied_when = 0; | 151 | inode->dirtied_when = 0; |
153 | 152 | ||
154 | if (security_inode_alloc(inode)) | 153 | if (security_inode_alloc(inode)) |
155 | goto out_free_inode; | 154 | goto out; |
156 | 155 | ||
157 | /* allocate and initialize an i_integrity */ | 156 | /* allocate and initialize an i_integrity */ |
158 | if (ima_inode_alloc(inode)) | 157 | if (ima_inode_alloc(inode)) |
@@ -183,9 +182,7 @@ struct inode *inode_init_always(struct super_block *sb, struct inode *inode) | |||
183 | if (sb->s_bdev) { | 182 | if (sb->s_bdev) { |
184 | struct backing_dev_info *bdi; | 183 | struct backing_dev_info *bdi; |
185 | 184 | ||
186 | bdi = sb->s_bdev->bd_inode_backing_dev_info; | 185 | bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info; |
187 | if (!bdi) | ||
188 | bdi = sb->s_bdev->bd_inode->i_mapping->backing_dev_info; | ||
189 | mapping->backing_dev_info = bdi; | 186 | mapping->backing_dev_info = bdi; |
190 | } | 187 | } |
191 | inode->i_private = NULL; | 188 | inode->i_private = NULL; |
@@ -198,16 +195,12 @@ struct inode *inode_init_always(struct super_block *sb, struct inode *inode) | |||
198 | inode->i_fsnotify_mask = 0; | 195 | inode->i_fsnotify_mask = 0; |
199 | #endif | 196 | #endif |
200 | 197 | ||
201 | return inode; | 198 | return 0; |
202 | 199 | ||
203 | out_free_security: | 200 | out_free_security: |
204 | security_inode_free(inode); | 201 | security_inode_free(inode); |
205 | out_free_inode: | 202 | out: |
206 | if (inode->i_sb->s_op->destroy_inode) | 203 | return -ENOMEM; |
207 | inode->i_sb->s_op->destroy_inode(inode); | ||
208 | else | ||
209 | kmem_cache_free(inode_cachep, (inode)); | ||
210 | return NULL; | ||
211 | } | 204 | } |
212 | EXPORT_SYMBOL(inode_init_always); | 205 | EXPORT_SYMBOL(inode_init_always); |
213 | 206 | ||
@@ -220,12 +213,21 @@ static struct inode *alloc_inode(struct super_block *sb) | |||
220 | else | 213 | else |
221 | inode = kmem_cache_alloc(inode_cachep, GFP_KERNEL); | 214 | inode = kmem_cache_alloc(inode_cachep, GFP_KERNEL); |
222 | 215 | ||
223 | if (inode) | 216 | if (!inode) |
224 | return inode_init_always(sb, inode); | 217 | return NULL; |
225 | return NULL; | 218 | |
219 | if (unlikely(inode_init_always(sb, inode))) { | ||
220 | if (inode->i_sb->s_op->destroy_inode) | ||
221 | inode->i_sb->s_op->destroy_inode(inode); | ||
222 | else | ||
223 | kmem_cache_free(inode_cachep, inode); | ||
224 | return NULL; | ||
225 | } | ||
226 | |||
227 | return inode; | ||
226 | } | 228 | } |
227 | 229 | ||
228 | void destroy_inode(struct inode *inode) | 230 | void __destroy_inode(struct inode *inode) |
229 | { | 231 | { |
230 | BUG_ON(inode_has_buffers(inode)); | 232 | BUG_ON(inode_has_buffers(inode)); |
231 | ima_inode_free(inode); | 233 | ima_inode_free(inode); |
@@ -237,13 +239,17 @@ void destroy_inode(struct inode *inode) | |||
237 | if (inode->i_default_acl && inode->i_default_acl != ACL_NOT_CACHED) | 239 | if (inode->i_default_acl && inode->i_default_acl != ACL_NOT_CACHED) |
238 | posix_acl_release(inode->i_default_acl); | 240 | posix_acl_release(inode->i_default_acl); |
239 | #endif | 241 | #endif |
242 | } | ||
243 | EXPORT_SYMBOL(__destroy_inode); | ||
244 | |||
245 | void destroy_inode(struct inode *inode) | ||
246 | { | ||
247 | __destroy_inode(inode); | ||
240 | if (inode->i_sb->s_op->destroy_inode) | 248 | if (inode->i_sb->s_op->destroy_inode) |
241 | inode->i_sb->s_op->destroy_inode(inode); | 249 | inode->i_sb->s_op->destroy_inode(inode); |
242 | else | 250 | else |
243 | kmem_cache_free(inode_cachep, (inode)); | 251 | kmem_cache_free(inode_cachep, (inode)); |
244 | } | 252 | } |
245 | EXPORT_SYMBOL(destroy_inode); | ||
246 | |||
247 | 253 | ||
248 | /* | 254 | /* |
249 | * These are initializations that only need to be done | 255 | * These are initializations that only need to be done |
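The fs/inode.c hunks above separate allocation from initialization: inode_init_always() now only returns 0 or -ENOMEM and never frees the inode it was handed, alloc_inode() owns the error-path cleanup through ->destroy_inode or kmem_cache_free(), and __destroy_inode() exposes the teardown half for filesystems that must undo a partially set up inode. The shape of that refactor, reduced to plain C; the names are analogues, not the kernel API:

#include <stdio.h>
#include <stdlib.h>

struct obj { char *buf; };

/* Init never frees the object it was handed; it only reports failure. */
static int obj_init_always(struct obj *o)
{
	o->buf = malloc(64);
	return o->buf ? 0 : -1;		/* the -ENOMEM analogue */
}

/* The __destroy_inode() role: tear down contents, not the allocation. */
static void obj_destroy_contents(struct obj *o)
{
	free(o->buf);
}

/* The allocator owns both the memory and the error-path cleanup. */
static struct obj *obj_alloc(void)
{
	struct obj *o = malloc(sizeof(*o));

	if (!o)
		return NULL;
	if (obj_init_always(o)) {
		free(o);	/* the destroy_inode()/kmem_cache_free() role */
		return NULL;
	}
	return o;
}

int main(void)
{
	struct obj *o = obj_alloc();

	if (o) {
		obj_destroy_contents(o);
		free(o);
	}
	return 0;
}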
diff --git a/fs/jbd/checkpoint.c b/fs/jbd/checkpoint.c index 61f32f3868cd..b0435dd0654d 100644 --- a/fs/jbd/checkpoint.c +++ b/fs/jbd/checkpoint.c | |||
@@ -456,7 +456,7 @@ int cleanup_journal_tail(journal_t *journal) | |||
456 | { | 456 | { |
457 | transaction_t * transaction; | 457 | transaction_t * transaction; |
458 | tid_t first_tid; | 458 | tid_t first_tid; |
459 | unsigned long blocknr, freed; | 459 | unsigned int blocknr, freed; |
460 | 460 | ||
461 | if (is_journal_aborted(journal)) | 461 | if (is_journal_aborted(journal)) |
462 | return 1; | 462 | return 1; |
@@ -502,8 +502,8 @@ int cleanup_journal_tail(journal_t *journal) | |||
502 | freed = freed + journal->j_last - journal->j_first; | 502 | freed = freed + journal->j_last - journal->j_first; |
503 | 503 | ||
504 | jbd_debug(1, | 504 | jbd_debug(1, |
505 | "Cleaning journal tail from %d to %d (offset %lu), " | 505 | "Cleaning journal tail from %d to %d (offset %u), " |
506 | "freeing %lu\n", | 506 | "freeing %u\n", |
507 | journal->j_tail_sequence, first_tid, blocknr, freed); | 507 | journal->j_tail_sequence, first_tid, blocknr, freed); |
508 | 508 | ||
509 | journal->j_free += freed; | 509 | journal->j_free += freed; |
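This checkpoint.c hunk, and the journal.c, commit.c and recovery.c hunks that follow, narrow JBD's log block numbers from unsigned long to unsigned int: the on-disk journal stores block numbers as 32-bit big-endian fields, so be32_to_cpu() can never produce more than a u32, and the wider type only bought mismatched %lu format strings on 64-bit builds. A small compile-and-run illustration of the width argument, assuming Linux's 32-bit int:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* A journal block number as stored on disk: a 32-bit big-endian field. */
	uint32_t on_disk = UINT32_MAX;

	unsigned int  narrow = on_disk;	/* exact on Linux: int is 32-bit */
	unsigned long wide   = on_disk;	/* 64-bit on LP64, range never used */

	printf("sizeof(unsigned int)=%zu, sizeof(unsigned long)=%zu\n",
	       sizeof narrow, sizeof wide);
	printf("largest possible journal block: %u\n", narrow);	/* %u matches */
	return 0;
}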
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index 618e21c0b7a3..4bd882548c45 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c | |||
@@ -308,7 +308,7 @@ void journal_commit_transaction(journal_t *journal) | |||
308 | int bufs; | 308 | int bufs; |
309 | int flags; | 309 | int flags; |
310 | int err; | 310 | int err; |
311 | unsigned long blocknr; | 311 | unsigned int blocknr; |
312 | ktime_t start_time; | 312 | ktime_t start_time; |
313 | u64 commit_time; | 313 | u64 commit_time; |
314 | char *tagp = NULL; | 314 | char *tagp = NULL; |
diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index 737f7246a4b5..bd3c073b485d 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c | |||
@@ -276,7 +276,7 @@ static void journal_kill_thread(journal_t *journal) | |||
276 | int journal_write_metadata_buffer(transaction_t *transaction, | 276 | int journal_write_metadata_buffer(transaction_t *transaction, |
277 | struct journal_head *jh_in, | 277 | struct journal_head *jh_in, |
278 | struct journal_head **jh_out, | 278 | struct journal_head **jh_out, |
279 | unsigned long blocknr) | 279 | unsigned int blocknr) |
280 | { | 280 | { |
281 | int need_copy_out = 0; | 281 | int need_copy_out = 0; |
282 | int done_copy_out = 0; | 282 | int done_copy_out = 0; |
@@ -287,6 +287,7 @@ int journal_write_metadata_buffer(transaction_t *transaction, | |||
287 | struct page *new_page; | 287 | struct page *new_page; |
288 | unsigned int new_offset; | 288 | unsigned int new_offset; |
289 | struct buffer_head *bh_in = jh2bh(jh_in); | 289 | struct buffer_head *bh_in = jh2bh(jh_in); |
290 | journal_t *journal = transaction->t_journal; | ||
290 | 291 | ||
291 | /* | 292 | /* |
292 | * The buffer really shouldn't be locked: only the current committing | 293 | * The buffer really shouldn't be locked: only the current committing |
@@ -300,6 +301,11 @@ int journal_write_metadata_buffer(transaction_t *transaction, | |||
300 | J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in)); | 301 | J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in)); |
301 | 302 | ||
302 | new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL); | 303 | new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL); |
304 | /* keep subsequent assertions sane */ | ||
305 | new_bh->b_state = 0; | ||
306 | init_buffer(new_bh, NULL, NULL); | ||
307 | atomic_set(&new_bh->b_count, 1); | ||
308 | new_jh = journal_add_journal_head(new_bh); /* This sleeps */ | ||
303 | 309 | ||
304 | /* | 310 | /* |
305 | * If a new transaction has already done a buffer copy-out, then | 311 | * If a new transaction has already done a buffer copy-out, then |
@@ -361,14 +367,6 @@ repeat: | |||
361 | kunmap_atomic(mapped_data, KM_USER0); | 367 | kunmap_atomic(mapped_data, KM_USER0); |
362 | } | 368 | } |
363 | 369 | ||
364 | /* keep subsequent assertions sane */ | ||
365 | new_bh->b_state = 0; | ||
366 | init_buffer(new_bh, NULL, NULL); | ||
367 | atomic_set(&new_bh->b_count, 1); | ||
368 | jbd_unlock_bh_state(bh_in); | ||
369 | |||
370 | new_jh = journal_add_journal_head(new_bh); /* This sleeps */ | ||
371 | |||
372 | set_bh_page(new_bh, new_page, new_offset); | 370 | set_bh_page(new_bh, new_page, new_offset); |
373 | new_jh->b_transaction = NULL; | 371 | new_jh->b_transaction = NULL; |
374 | new_bh->b_size = jh2bh(jh_in)->b_size; | 372 | new_bh->b_size = jh2bh(jh_in)->b_size; |
@@ -385,7 +383,11 @@ repeat: | |||
385 | * copying is moved to the transaction's shadow queue. | 383 | * copying is moved to the transaction's shadow queue. |
386 | */ | 384 | */ |
387 | JBUFFER_TRACE(jh_in, "file as BJ_Shadow"); | 385 | JBUFFER_TRACE(jh_in, "file as BJ_Shadow"); |
388 | journal_file_buffer(jh_in, transaction, BJ_Shadow); | 386 | spin_lock(&journal->j_list_lock); |
387 | __journal_file_buffer(jh_in, transaction, BJ_Shadow); | ||
388 | spin_unlock(&journal->j_list_lock); | ||
389 | jbd_unlock_bh_state(bh_in); | ||
390 | |||
389 | JBUFFER_TRACE(new_jh, "file as BJ_IO"); | 391 | JBUFFER_TRACE(new_jh, "file as BJ_IO"); |
390 | journal_file_buffer(new_jh, transaction, BJ_IO); | 392 | journal_file_buffer(new_jh, transaction, BJ_IO); |
391 | 393 | ||
@@ -565,9 +567,9 @@ int log_wait_commit(journal_t *journal, tid_t tid) | |||
565 | * Log buffer allocation routines: | 567 | * Log buffer allocation routines: |
566 | */ | 568 | */ |
567 | 569 | ||
568 | int journal_next_log_block(journal_t *journal, unsigned long *retp) | 570 | int journal_next_log_block(journal_t *journal, unsigned int *retp) |
569 | { | 571 | { |
570 | unsigned long blocknr; | 572 | unsigned int blocknr; |
571 | 573 | ||
572 | spin_lock(&journal->j_state_lock); | 574 | spin_lock(&journal->j_state_lock); |
573 | J_ASSERT(journal->j_free > 1); | 575 | J_ASSERT(journal->j_free > 1); |
@@ -588,11 +590,11 @@ int journal_next_log_block(journal_t *journal, unsigned long *retp) | |||
588 | * this is a no-op. If needed, we can use j_blk_offset - everything is | 590 | * this is a no-op. If needed, we can use j_blk_offset - everything is |
589 | * ready. | 591 | * ready. |
590 | */ | 592 | */ |
591 | int journal_bmap(journal_t *journal, unsigned long blocknr, | 593 | int journal_bmap(journal_t *journal, unsigned int blocknr, |
592 | unsigned long *retp) | 594 | unsigned int *retp) |
593 | { | 595 | { |
594 | int err = 0; | 596 | int err = 0; |
595 | unsigned long ret; | 597 | unsigned int ret; |
596 | 598 | ||
597 | if (journal->j_inode) { | 599 | if (journal->j_inode) { |
598 | ret = bmap(journal->j_inode, blocknr); | 600 | ret = bmap(journal->j_inode, blocknr); |
@@ -602,7 +604,7 @@ int journal_bmap(journal_t *journal, unsigned long blocknr, | |||
602 | char b[BDEVNAME_SIZE]; | 604 | char b[BDEVNAME_SIZE]; |
603 | 605 | ||
604 | printk(KERN_ALERT "%s: journal block not found " | 606 | printk(KERN_ALERT "%s: journal block not found " |
605 | "at offset %lu on %s\n", | 607 | "at offset %u on %s\n", |
606 | __func__, | 608 | __func__, |
607 | blocknr, | 609 | blocknr, |
608 | bdevname(journal->j_dev, b)); | 610 | bdevname(journal->j_dev, b)); |
@@ -628,7 +630,7 @@ int journal_bmap(journal_t *journal, unsigned long blocknr, | |||
628 | struct journal_head *journal_get_descriptor_buffer(journal_t *journal) | 630 | struct journal_head *journal_get_descriptor_buffer(journal_t *journal) |
629 | { | 631 | { |
630 | struct buffer_head *bh; | 632 | struct buffer_head *bh; |
631 | unsigned long blocknr; | 633 | unsigned int blocknr; |
632 | int err; | 634 | int err; |
633 | 635 | ||
634 | err = journal_next_log_block(journal, &blocknr); | 636 | err = journal_next_log_block(journal, &blocknr); |
@@ -772,7 +774,7 @@ journal_t * journal_init_inode (struct inode *inode) | |||
772 | journal_t *journal = journal_init_common(); | 774 | journal_t *journal = journal_init_common(); |
773 | int err; | 775 | int err; |
774 | int n; | 776 | int n; |
775 | unsigned long blocknr; | 777 | unsigned int blocknr; |
776 | 778 | ||
777 | if (!journal) | 779 | if (!journal) |
778 | return NULL; | 780 | return NULL; |
@@ -844,10 +846,16 @@ static void journal_fail_superblock (journal_t *journal) | |||
844 | static int journal_reset(journal_t *journal) | 846 | static int journal_reset(journal_t *journal) |
845 | { | 847 | { |
846 | journal_superblock_t *sb = journal->j_superblock; | 848 | journal_superblock_t *sb = journal->j_superblock; |
847 | unsigned long first, last; | 849 | unsigned int first, last; |
848 | 850 | ||
849 | first = be32_to_cpu(sb->s_first); | 851 | first = be32_to_cpu(sb->s_first); |
850 | last = be32_to_cpu(sb->s_maxlen); | 852 | last = be32_to_cpu(sb->s_maxlen); |
853 | if (first + JFS_MIN_JOURNAL_BLOCKS > last + 1) { | ||
854 | printk(KERN_ERR "JBD: Journal too short (blocks %u-%u).\n", | ||
855 | first, last); | ||
856 | journal_fail_superblock(journal); | ||
857 | return -EINVAL; | ||
858 | } | ||
851 | 859 | ||
852 | journal->j_first = first; | 860 | journal->j_first = first; |
853 | journal->j_last = last; | 861 | journal->j_last = last; |
@@ -877,7 +885,7 @@ static int journal_reset(journal_t *journal) | |||
877 | **/ | 885 | **/ |
878 | int journal_create(journal_t *journal) | 886 | int journal_create(journal_t *journal) |
879 | { | 887 | { |
880 | unsigned long blocknr; | 888 | unsigned int blocknr; |
881 | struct buffer_head *bh; | 889 | struct buffer_head *bh; |
882 | journal_superblock_t *sb; | 890 | journal_superblock_t *sb; |
883 | int i, err; | 891 | int i, err; |
@@ -961,14 +969,14 @@ void journal_update_superblock(journal_t *journal, int wait) | |||
961 | if (sb->s_start == 0 && journal->j_tail_sequence == | 969 | if (sb->s_start == 0 && journal->j_tail_sequence == |
962 | journal->j_transaction_sequence) { | 970 | journal->j_transaction_sequence) { |
963 | jbd_debug(1,"JBD: Skipping superblock update on recovered sb " | 971 | jbd_debug(1,"JBD: Skipping superblock update on recovered sb " |
964 | "(start %ld, seq %d, errno %d)\n", | 972 | "(start %u, seq %d, errno %d)\n", |
965 | journal->j_tail, journal->j_tail_sequence, | 973 | journal->j_tail, journal->j_tail_sequence, |
966 | journal->j_errno); | 974 | journal->j_errno); |
967 | goto out; | 975 | goto out; |
968 | } | 976 | } |
969 | 977 | ||
970 | spin_lock(&journal->j_state_lock); | 978 | spin_lock(&journal->j_state_lock); |
971 | jbd_debug(1,"JBD: updating superblock (start %ld, seq %d, errno %d)\n", | 979 | jbd_debug(1,"JBD: updating superblock (start %u, seq %d, errno %d)\n", |
972 | journal->j_tail, journal->j_tail_sequence, journal->j_errno); | 980 | journal->j_tail, journal->j_tail_sequence, journal->j_errno); |
973 | 981 | ||
974 | sb->s_sequence = cpu_to_be32(journal->j_tail_sequence); | 982 | sb->s_sequence = cpu_to_be32(journal->j_tail_sequence); |
@@ -1363,7 +1371,7 @@ int journal_flush(journal_t *journal) | |||
1363 | { | 1371 | { |
1364 | int err = 0; | 1372 | int err = 0; |
1365 | transaction_t *transaction = NULL; | 1373 | transaction_t *transaction = NULL; |
1366 | unsigned long old_tail; | 1374 | unsigned int old_tail; |
1367 | 1375 | ||
1368 | spin_lock(&journal->j_state_lock); | 1376 | spin_lock(&journal->j_state_lock); |
1369 | 1377 | ||
diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c index db5e982c5ddf..cb1a49ae605e 100644 --- a/fs/jbd/recovery.c +++ b/fs/jbd/recovery.c | |||
@@ -70,7 +70,7 @@ static int do_readahead(journal_t *journal, unsigned int start) | |||
70 | { | 70 | { |
71 | int err; | 71 | int err; |
72 | unsigned int max, nbufs, next; | 72 | unsigned int max, nbufs, next; |
73 | unsigned long blocknr; | 73 | unsigned int blocknr; |
74 | struct buffer_head *bh; | 74 | struct buffer_head *bh; |
75 | 75 | ||
76 | struct buffer_head * bufs[MAXBUF]; | 76 | struct buffer_head * bufs[MAXBUF]; |
@@ -132,7 +132,7 @@ static int jread(struct buffer_head **bhp, journal_t *journal, | |||
132 | unsigned int offset) | 132 | unsigned int offset) |
133 | { | 133 | { |
134 | int err; | 134 | int err; |
135 | unsigned long blocknr; | 135 | unsigned int blocknr; |
136 | struct buffer_head *bh; | 136 | struct buffer_head *bh; |
137 | 137 | ||
138 | *bhp = NULL; | 138 | *bhp = NULL; |
@@ -314,7 +314,7 @@ static int do_one_pass(journal_t *journal, | |||
314 | struct recovery_info *info, enum passtype pass) | 314 | struct recovery_info *info, enum passtype pass) |
315 | { | 315 | { |
316 | unsigned int first_commit_ID, next_commit_ID; | 316 | unsigned int first_commit_ID, next_commit_ID; |
317 | unsigned long next_log_block; | 317 | unsigned int next_log_block; |
318 | int err, success = 0; | 318 | int err, success = 0; |
319 | journal_superblock_t * sb; | 319 | journal_superblock_t * sb; |
320 | journal_header_t * tmp; | 320 | journal_header_t * tmp; |
@@ -367,14 +367,14 @@ static int do_one_pass(journal_t *journal, | |||
367 | if (tid_geq(next_commit_ID, info->end_transaction)) | 367 | if (tid_geq(next_commit_ID, info->end_transaction)) |
368 | break; | 368 | break; |
369 | 369 | ||
370 | jbd_debug(2, "Scanning for sequence ID %u at %lu/%lu\n", | 370 | jbd_debug(2, "Scanning for sequence ID %u at %u/%u\n", |
371 | next_commit_ID, next_log_block, journal->j_last); | 371 | next_commit_ID, next_log_block, journal->j_last); |
372 | 372 | ||
373 | /* Skip over each chunk of the transaction looking for | 373 | /* Skip over each chunk of the transaction looking for |
374 | * either the next descriptor block or the final commit | 374 | * either the next descriptor block or the final commit |
375 | * record. */ | 375 | * record. */ |
376 | 376 | ||
377 | jbd_debug(3, "JBD: checking block %ld\n", next_log_block); | 377 | jbd_debug(3, "JBD: checking block %u\n", next_log_block); |
378 | err = jread(&bh, journal, next_log_block); | 378 | err = jread(&bh, journal, next_log_block); |
379 | if (err) | 379 | if (err) |
380 | goto failed; | 380 | goto failed; |
@@ -429,7 +429,7 @@ static int do_one_pass(journal_t *journal, | |||
429 | tagp = &bh->b_data[sizeof(journal_header_t)]; | 429 | tagp = &bh->b_data[sizeof(journal_header_t)]; |
430 | while ((tagp - bh->b_data +sizeof(journal_block_tag_t)) | 430 | while ((tagp - bh->b_data +sizeof(journal_block_tag_t)) |
431 | <= journal->j_blocksize) { | 431 | <= journal->j_blocksize) { |
432 | unsigned long io_block; | 432 | unsigned int io_block; |
433 | 433 | ||
434 | tag = (journal_block_tag_t *) tagp; | 434 | tag = (journal_block_tag_t *) tagp; |
435 | flags = be32_to_cpu(tag->t_flags); | 435 | flags = be32_to_cpu(tag->t_flags); |
@@ -443,10 +443,10 @@ static int do_one_pass(journal_t *journal, | |||
443 | success = err; | 443 | success = err; |
444 | printk (KERN_ERR | 444 | printk (KERN_ERR |
445 | "JBD: IO error %d recovering " | 445 | "JBD: IO error %d recovering " |
446 | "block %ld in log\n", | 446 | "block %u in log\n", |
447 | err, io_block); | 447 | err, io_block); |
448 | } else { | 448 | } else { |
449 | unsigned long blocknr; | 449 | unsigned int blocknr; |
450 | 450 | ||
451 | J_ASSERT(obh != NULL); | 451 | J_ASSERT(obh != NULL); |
452 | blocknr = be32_to_cpu(tag->t_blocknr); | 452 | blocknr = be32_to_cpu(tag->t_blocknr); |
@@ -581,7 +581,7 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh, | |||
581 | max = be32_to_cpu(header->r_count); | 581 | max = be32_to_cpu(header->r_count); |
582 | 582 | ||
583 | while (offset < max) { | 583 | while (offset < max) { |
584 | unsigned long blocknr; | 584 | unsigned int blocknr; |
585 | int err; | 585 | int err; |
586 | 586 | ||
587 | blocknr = be32_to_cpu(* ((__be32 *) (bh->b_data+offset))); | 587 | blocknr = be32_to_cpu(* ((__be32 *) (bh->b_data+offset))); |
diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c index da6cd9bdaabc..ad717328343a 100644 --- a/fs/jbd/revoke.c +++ b/fs/jbd/revoke.c | |||
@@ -101,7 +101,7 @@ struct jbd_revoke_record_s | |||
101 | { | 101 | { |
102 | struct list_head hash; | 102 | struct list_head hash; |
103 | tid_t sequence; /* Used for recovery only */ | 103 | tid_t sequence; /* Used for recovery only */ |
104 | unsigned long blocknr; | 104 | unsigned int blocknr; |
105 | }; | 105 | }; |
106 | 106 | ||
107 | 107 | ||
@@ -126,7 +126,7 @@ static void flush_descriptor(journal_t *, struct journal_head *, int, int); | |||
126 | /* Utility functions to maintain the revoke table */ | 126 | /* Utility functions to maintain the revoke table */ |
127 | 127 | ||
128 | /* Borrowed from buffer.c: this is a tried and tested block hash function */ | 128 | /* Borrowed from buffer.c: this is a tried and tested block hash function */ |
129 | static inline int hash(journal_t *journal, unsigned long block) | 129 | static inline int hash(journal_t *journal, unsigned int block) |
130 | { | 130 | { |
131 | struct jbd_revoke_table_s *table = journal->j_revoke; | 131 | struct jbd_revoke_table_s *table = journal->j_revoke; |
132 | int hash_shift = table->hash_shift; | 132 | int hash_shift = table->hash_shift; |
@@ -136,7 +136,7 @@ static inline int hash(journal_t *journal, unsigned long block) | |||
136 | (block << (hash_shift - 12))) & (table->hash_size - 1); | 136 | (block << (hash_shift - 12))) & (table->hash_size - 1); |
137 | } | 137 | } |
138 | 138 | ||
139 | static int insert_revoke_hash(journal_t *journal, unsigned long blocknr, | 139 | static int insert_revoke_hash(journal_t *journal, unsigned int blocknr, |
140 | tid_t seq) | 140 | tid_t seq) |
141 | { | 141 | { |
142 | struct list_head *hash_list; | 142 | struct list_head *hash_list; |
@@ -166,7 +166,7 @@ oom: | |||
166 | /* Find a revoke record in the journal's hash table. */ | 166 | /* Find a revoke record in the journal's hash table. */ |
167 | 167 | ||
168 | static struct jbd_revoke_record_s *find_revoke_record(journal_t *journal, | 168 | static struct jbd_revoke_record_s *find_revoke_record(journal_t *journal, |
169 | unsigned long blocknr) | 169 | unsigned int blocknr) |
170 | { | 170 | { |
171 | struct list_head *hash_list; | 171 | struct list_head *hash_list; |
172 | struct jbd_revoke_record_s *record; | 172 | struct jbd_revoke_record_s *record; |
@@ -332,7 +332,7 @@ void journal_destroy_revoke(journal_t *journal) | |||
332 | * by one. | 332 | * by one. |
333 | */ | 333 | */ |
334 | 334 | ||
335 | int journal_revoke(handle_t *handle, unsigned long blocknr, | 335 | int journal_revoke(handle_t *handle, unsigned int blocknr, |
336 | struct buffer_head *bh_in) | 336 | struct buffer_head *bh_in) |
337 | { | 337 | { |
338 | struct buffer_head *bh = NULL; | 338 | struct buffer_head *bh = NULL; |
@@ -401,7 +401,7 @@ int journal_revoke(handle_t *handle, unsigned long blocknr, | |||
401 | } | 401 | } |
402 | } | 402 | } |
403 | 403 | ||
404 | jbd_debug(2, "insert revoke for block %lu, bh_in=%p\n", blocknr, bh_in); | 404 | jbd_debug(2, "insert revoke for block %u, bh_in=%p\n", blocknr, bh_in); |
405 | err = insert_revoke_hash(journal, blocknr, | 405 | err = insert_revoke_hash(journal, blocknr, |
406 | handle->h_transaction->t_tid); | 406 | handle->h_transaction->t_tid); |
407 | BUFFER_TRACE(bh_in, "exit"); | 407 | BUFFER_TRACE(bh_in, "exit"); |
@@ -644,7 +644,7 @@ static void flush_descriptor(journal_t *journal, | |||
644 | */ | 644 | */ |
645 | 645 | ||
646 | int journal_set_revoke(journal_t *journal, | 646 | int journal_set_revoke(journal_t *journal, |
647 | unsigned long blocknr, | 647 | unsigned int blocknr, |
648 | tid_t sequence) | 648 | tid_t sequence) |
649 | { | 649 | { |
650 | struct jbd_revoke_record_s *record; | 650 | struct jbd_revoke_record_s *record; |
@@ -668,7 +668,7 @@ int journal_set_revoke(journal_t *journal, | |||
668 | */ | 668 | */ |
669 | 669 | ||
670 | int journal_test_revoke(journal_t *journal, | 670 | int journal_test_revoke(journal_t *journal, |
671 | unsigned long blocknr, | 671 | unsigned int blocknr, |
672 | tid_t sequence) | 672 | tid_t sequence) |
673 | { | 673 | { |
674 | struct jbd_revoke_record_s *record; | 674 | struct jbd_revoke_record_s *record; |
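For context on the signatures changing above: the revoke table maps a block number to the last transaction that revoked it, and recovery consults it so stale logged copies of a revoked block are not replayed. A condensed usage sketch against the new unsigned int interface (the wrapper name is hypothetical):

	/* Sketch: during replay, a logged block is skipped if some
	 * transaction at or after this commit ID revoked it. */
	static int may_replay_block(journal_t *journal, unsigned int blocknr,
				    tid_t this_commit)
	{
		return !journal_test_revoke(journal, blocknr, this_commit);
	}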
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index 73242ba7c7b1..006f9ad838a2 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c | |||
@@ -56,7 +56,8 @@ get_transaction(journal_t *journal, transaction_t *transaction) | |||
56 | spin_lock_init(&transaction->t_handle_lock); | 56 | spin_lock_init(&transaction->t_handle_lock); |
57 | 57 | ||
58 | /* Set up the commit timer for the new transaction. */ | 58 | /* Set up the commit timer for the new transaction. */ |
59 | journal->j_commit_timer.expires = round_jiffies(transaction->t_expires); | 59 | journal->j_commit_timer.expires = |
60 | round_jiffies_up(transaction->t_expires); | ||
60 | add_timer(&journal->j_commit_timer); | 61 | add_timer(&journal->j_commit_timer); |
61 | 62 | ||
62 | J_ASSERT(journal->j_running_transaction == NULL); | 63 | J_ASSERT(journal->j_running_transaction == NULL); |
@@ -228,6 +229,8 @@ repeat_locked: | |||
228 | __log_space_left(journal)); | 229 | __log_space_left(journal)); |
229 | spin_unlock(&transaction->t_handle_lock); | 230 | spin_unlock(&transaction->t_handle_lock); |
230 | spin_unlock(&journal->j_state_lock); | 231 | spin_unlock(&journal->j_state_lock); |
232 | |||
233 | lock_map_acquire(&handle->h_lockdep_map); | ||
231 | out: | 234 | out: |
232 | if (unlikely(new_transaction)) /* It's usually NULL */ | 235 | if (unlikely(new_transaction)) /* It's usually NULL */ |
233 | kfree(new_transaction); | 236 | kfree(new_transaction); |
@@ -292,9 +295,6 @@ handle_t *journal_start(journal_t *journal, int nblocks) | |||
292 | handle = ERR_PTR(err); | 295 | handle = ERR_PTR(err); |
293 | goto out; | 296 | goto out; |
294 | } | 297 | } |
295 | |||
296 | lock_map_acquire(&handle->h_lockdep_map); | ||
297 | |||
298 | out: | 298 | out: |
299 | return handle; | 299 | return handle; |
300 | } | 300 | } |
@@ -416,6 +416,7 @@ int journal_restart(handle_t *handle, int nblocks) | |||
416 | __log_start_commit(journal, transaction->t_tid); | 416 | __log_start_commit(journal, transaction->t_tid); |
417 | spin_unlock(&journal->j_state_lock); | 417 | spin_unlock(&journal->j_state_lock); |
418 | 418 | ||
419 | lock_map_release(&handle->h_lockdep_map); | ||
419 | handle->h_buffer_credits = nblocks; | 420 | handle->h_buffer_credits = nblocks; |
420 | ret = start_this_handle(journal, handle); | 421 | ret = start_this_handle(journal, handle); |
421 | return ret; | 422 | return ret; |
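Moving lock_map_acquire() out of journal_start() and into start_this_handle(), paired with the lock_map_release() added above, keeps the lockdep annotation balanced across a restart: the handle's map is dropped before start_this_handle() re-acquires it. The resulting pairing, sketched as comments:

	/*
	 * journal_start()
	 *   start_this_handle()  -> lock_map_acquire(&handle->h_lockdep_map)
	 * journal_restart()
	 *   lock_map_release(&handle->h_lockdep_map)
	 *   start_this_handle()  -> lock_map_acquire() again
	 * journal_stop()         -> lock_map_release() (unchanged, not shown)
	 */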
@@ -489,34 +490,15 @@ void journal_unlock_updates (journal_t *journal) | |||
489 | wake_up(&journal->j_wait_transaction_locked); | 490 | wake_up(&journal->j_wait_transaction_locked); |
490 | } | 491 | } |
491 | 492 | ||
492 | /* | 493 | static void warn_dirty_buffer(struct buffer_head *bh) |
493 | * Report any unexpected dirty buffers which turn up. Normally those | ||
494 | * indicate an error, but they can occur if the user is running (say) | ||
495 | * tune2fs to modify the live filesystem, so we need the option of | ||
496 | * continuing as gracefully as possible. # | ||
497 | * | ||
498 | * The caller should already hold the journal lock and | ||
499 | * j_list_lock spinlock: most callers will need those anyway | ||
500 | * in order to probe the buffer's journaling state safely. | ||
501 | */ | ||
502 | static void jbd_unexpected_dirty_buffer(struct journal_head *jh) | ||
503 | { | 494 | { |
504 | int jlist; | 495 | char b[BDEVNAME_SIZE]; |
505 | |||
506 | /* If this buffer is one which might reasonably be dirty | ||
507 | * --- ie. data, or not part of this journal --- then | ||
508 | * we're OK to leave it alone, but otherwise we need to | ||
509 | * move the dirty bit to the journal's own internal | ||
510 | * JBDDirty bit. */ | ||
511 | jlist = jh->b_jlist; | ||
512 | |||
513 | if (jlist == BJ_Metadata || jlist == BJ_Reserved || | ||
514 | jlist == BJ_Shadow || jlist == BJ_Forget) { | ||
515 | struct buffer_head *bh = jh2bh(jh); | ||
516 | 496 | ||
517 | if (test_clear_buffer_dirty(bh)) | 497 | printk(KERN_WARNING |
518 | set_buffer_jbddirty(bh); | 498 | "JBD: Spotted dirty metadata buffer (dev = %s, blocknr = %llu). " |
519 | } | 499 | "There's a risk of filesystem corruption in case of system " |
500 | "crash.\n", | ||
501 | bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr); | ||
520 | } | 502 | } |
521 | 503 | ||
522 | /* | 504 | /* |
@@ -583,14 +565,16 @@ repeat: | |||
583 | if (jh->b_next_transaction) | 565 | if (jh->b_next_transaction) |
584 | J_ASSERT_JH(jh, jh->b_next_transaction == | 566 | J_ASSERT_JH(jh, jh->b_next_transaction == |
585 | transaction); | 567 | transaction); |
568 | warn_dirty_buffer(bh); | ||
586 | } | 569 | } |
587 | /* | 570 | /* |
588 | * In any case we need to clean the dirty flag and we must | 571 | * In any case we need to clean the dirty flag and we must |
589 | * do it under the buffer lock to be sure we don't race | 572 | * do it under the buffer lock to be sure we don't race |
590 | * with running write-out. | 573 | * with running write-out. |
591 | */ | 574 | */ |
592 | JBUFFER_TRACE(jh, "Unexpected dirty buffer"); | 575 | JBUFFER_TRACE(jh, "Journalling dirty buffer"); |
593 | jbd_unexpected_dirty_buffer(jh); | 576 | clear_buffer_dirty(bh); |
577 | set_buffer_jbddirty(bh); | ||
594 | } | 578 | } |
595 | 579 | ||
596 | unlock_buffer(bh); | 580 | unlock_buffer(bh); |
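The net effect of this hunk plus the warn_dirty_buffer() helper above: a buffer entering the journal has its dirty bit handed over to JBD's private jbddirty bit under the buffer lock, with a warning when the dirtiness is unexpected (for instance tune2fs writing to a mounted filesystem). A simplified sketch of the handoff, not the exact control flow:

	/* Must run under lock_buffer() so it cannot race with
	 * running write-out. */
	if (buffer_dirty(bh))
		warn_dirty_buffer(bh);	/* unexpected: complain first */
	clear_buffer_dirty(bh);
	set_buffer_jbddirty(bh);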
@@ -826,6 +810,15 @@ int journal_get_create_access(handle_t *handle, struct buffer_head *bh) | |||
826 | J_ASSERT_JH(jh, buffer_locked(jh2bh(jh))); | 810 | J_ASSERT_JH(jh, buffer_locked(jh2bh(jh))); |
827 | 811 | ||
828 | if (jh->b_transaction == NULL) { | 812 | if (jh->b_transaction == NULL) { |
813 | /* | ||
814 | * A previous journal_forget() could have left the buffer ||
815 | * with the jbddirty bit set because it was being committed. When ||
816 | * the commit finished, we filed the buffer for ||
817 | * checkpointing and marked it dirty. Now we are reallocating | ||
818 | * the buffer so the transaction freeing it must have | ||
819 | * committed and so it's safe to clear the dirty bit. | ||
820 | */ | ||
821 | clear_buffer_dirty(jh2bh(jh)); | ||
829 | jh->b_transaction = transaction; | 822 | jh->b_transaction = transaction; |
830 | 823 | ||
831 | /* first access by this transaction */ | 824 | /* first access by this transaction */ |
@@ -1782,8 +1775,13 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction) | |||
1782 | 1775 | ||
1783 | if (jh->b_cp_transaction) { | 1776 | if (jh->b_cp_transaction) { |
1784 | JBUFFER_TRACE(jh, "on running+cp transaction"); | 1777 | JBUFFER_TRACE(jh, "on running+cp transaction"); |
1778 | /* | ||
1779 | * We don't want to write the buffer anymore, so clear the ||
1780 | * dirty bit to avoid confusing the checks in ||
1781 | * __journal_file_buffer(). ||
1782 | */ | ||
1783 | clear_buffer_dirty(bh); | ||
1785 | __journal_file_buffer(jh, transaction, BJ_Forget); | 1784 | __journal_file_buffer(jh, transaction, BJ_Forget); |
1786 | clear_buffer_jbddirty(bh); | ||
1787 | may_free = 0; | 1785 | may_free = 0; |
1788 | } else { | 1786 | } else { |
1789 | JBUFFER_TRACE(jh, "on running transaction"); | 1787 | JBUFFER_TRACE(jh, "on running transaction"); |
@@ -2041,12 +2039,17 @@ void __journal_file_buffer(struct journal_head *jh, | |||
2041 | if (jh->b_transaction && jh->b_jlist == jlist) | 2039 | if (jh->b_transaction && jh->b_jlist == jlist) |
2042 | return; | 2040 | return; |
2043 | 2041 | ||
2044 | /* The following list of buffer states needs to be consistent | ||
2045 | * with __jbd_unexpected_dirty_buffer()'s handling of dirty | ||
2046 | * state. */ | ||
2047 | |||
2048 | if (jlist == BJ_Metadata || jlist == BJ_Reserved || | 2042 | if (jlist == BJ_Metadata || jlist == BJ_Reserved || |
2049 | jlist == BJ_Shadow || jlist == BJ_Forget) { | 2043 | jlist == BJ_Shadow || jlist == BJ_Forget) { |
2044 | /* | ||
2045 | * For metadata buffers, we track dirty bit in buffer_jbddirty | ||
2046 | * instead of buffer_dirty. We should not see a dirty bit set | ||
2047 | * here because we clear it in do_get_write_access but e.g. | ||
2048 | * tune2fs can modify the sb and set the dirty bit at any time | ||
2049 | * so we try to gracefully handle that. | ||
2050 | */ | ||
2051 | if (buffer_dirty(bh)) | ||
2052 | warn_dirty_buffer(bh); | ||
2050 | if (test_clear_buffer_dirty(bh) || | 2053 | if (test_clear_buffer_dirty(bh) || |
2051 | test_clear_buffer_jbddirty(bh)) | 2054 | test_clear_buffer_jbddirty(bh)) |
2052 | was_dirty = 1; | 2055 | was_dirty = 1; |
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 7b4088b2364d..26d991ddc1e6 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c | |||
@@ -25,6 +25,7 @@ | |||
25 | #include <linux/writeback.h> | 25 | #include <linux/writeback.h> |
26 | #include <linux/backing-dev.h> | 26 | #include <linux/backing-dev.h> |
27 | #include <linux/bio.h> | 27 | #include <linux/bio.h> |
28 | #include <linux/blkdev.h> | ||
28 | #include <trace/events/jbd2.h> | 29 | #include <trace/events/jbd2.h> |
29 | 30 | ||
30 | /* | 31 | /* |
@@ -133,8 +134,8 @@ static int journal_submit_commit_record(journal_t *journal, | |||
133 | bh->b_end_io = journal_end_buffer_io_sync; | 134 | bh->b_end_io = journal_end_buffer_io_sync; |
134 | 135 | ||
135 | if (journal->j_flags & JBD2_BARRIER && | 136 | if (journal->j_flags & JBD2_BARRIER && |
136 | !JBD2_HAS_INCOMPAT_FEATURE(journal, | 137 | !JBD2_HAS_INCOMPAT_FEATURE(journal, |
137 | JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { | 138 | JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { |
138 | set_buffer_ordered(bh); | 139 | set_buffer_ordered(bh); |
139 | barrier_done = 1; | 140 | barrier_done = 1; |
140 | } | 141 | } |
@@ -220,7 +221,6 @@ static int journal_submit_inode_data_buffers(struct address_space *mapping) | |||
220 | .nr_to_write = mapping->nrpages * 2, | 221 | .nr_to_write = mapping->nrpages * 2, |
221 | .range_start = 0, | 222 | .range_start = 0, |
222 | .range_end = i_size_read(mapping->host), | 223 | .range_end = i_size_read(mapping->host), |
223 | .for_writepages = 1, | ||
224 | }; | 224 | }; |
225 | 225 | ||
226 | ret = generic_writepages(mapping, &wbc); | 226 | ret = generic_writepages(mapping, &wbc); |
@@ -707,11 +707,13 @@ start_journal_io: | |||
707 | /* Done it all: now write the commit record asynchronously. */ | 707 | /* Done it all: now write the commit record asynchronously. */ |
708 | 708 | ||
709 | if (JBD2_HAS_INCOMPAT_FEATURE(journal, | 709 | if (JBD2_HAS_INCOMPAT_FEATURE(journal, |
710 | JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { | 710 | JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { |
711 | err = journal_submit_commit_record(journal, commit_transaction, | 711 | err = journal_submit_commit_record(journal, commit_transaction, |
712 | &cbh, crc32_sum); | 712 | &cbh, crc32_sum); |
713 | if (err) | 713 | if (err) |
714 | __jbd2_journal_abort_hard(journal); | 714 | __jbd2_journal_abort_hard(journal); |
715 | if (journal->j_flags & JBD2_BARRIER) | ||
716 | blkdev_issue_flush(journal->j_dev, NULL); | ||
715 | } | 717 | } |
716 | 718 | ||
717 | /* | 719 | /* |
@@ -834,7 +836,7 @@ wait_for_iobuf: | |||
834 | jbd_debug(3, "JBD: commit phase 5\n"); | 836 | jbd_debug(3, "JBD: commit phase 5\n"); |
835 | 837 | ||
836 | if (!JBD2_HAS_INCOMPAT_FEATURE(journal, | 838 | if (!JBD2_HAS_INCOMPAT_FEATURE(journal, |
837 | JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { | 839 | JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { |
838 | err = journal_submit_commit_record(journal, commit_transaction, | 840 | err = journal_submit_commit_record(journal, commit_transaction, |
839 | &cbh, crc32_sum); | 841 | &cbh, crc32_sum); |
840 | if (err) | 842 | if (err) |
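The added blkdev_issue_flush() covers a gap in the async-commit path: with JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT the commit record is not written as a barrier, so on barrier-enabled journals an explicit cache flush is needed to stop the record reaching media ahead of the blocks it commits. For reference, the helper's contract in this era (signature inferred from the call site and the new blkdev.h include):

	/* blkdev_issue_flush() submits an empty barrier request and
	 * waits for it, draining the device's volatile write cache:
	 *
	 *	int blkdev_issue_flush(struct block_device *bdev,
	 *			       sector_t *error_sector);
	 *
	 * Passing NULL, as this hunk does, discards the failed-sector
	 * report. */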
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index e378cb383979..a8a358bc0f21 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c | |||
@@ -1187,6 +1187,12 @@ static int journal_reset(journal_t *journal) | |||
1187 | 1187 | ||
1188 | first = be32_to_cpu(sb->s_first); | 1188 | first = be32_to_cpu(sb->s_first); |
1189 | last = be32_to_cpu(sb->s_maxlen); | 1189 | last = be32_to_cpu(sb->s_maxlen); |
1190 | if (first + JBD2_MIN_JOURNAL_BLOCKS > last + 1) { | ||
1191 | printk(KERN_ERR "JBD: Journal too short (blocks %llu-%llu).\n", | ||
1192 | first, last); | ||
1193 | journal_fail_superblock(journal); | ||
1194 | return -EINVAL; | ||
1195 | } | ||
1190 | 1196 | ||
1191 | journal->j_first = first; | 1197 | journal->j_first = first; |
1192 | journal->j_last = last; | 1198 | journal->j_last = last; |
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 6213ac728f30..a0512700542f 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c | |||
@@ -57,7 +57,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction) | |||
57 | INIT_LIST_HEAD(&transaction->t_private_list); | 57 | INIT_LIST_HEAD(&transaction->t_private_list); |
58 | 58 | ||
59 | /* Set up the commit timer for the new transaction. */ | 59 | /* Set up the commit timer for the new transaction. */ |
60 | journal->j_commit_timer.expires = round_jiffies(transaction->t_expires); | 60 | journal->j_commit_timer.expires = round_jiffies_up(transaction->t_expires); |
61 | add_timer(&journal->j_commit_timer); | 61 | add_timer(&journal->j_commit_timer); |
62 | 62 | ||
63 | J_ASSERT(journal->j_running_transaction == NULL); | 63 | J_ASSERT(journal->j_running_transaction == NULL); |
@@ -238,6 +238,8 @@ repeat_locked: | |||
238 | __jbd2_log_space_left(journal)); | 238 | __jbd2_log_space_left(journal)); |
239 | spin_unlock(&transaction->t_handle_lock); | 239 | spin_unlock(&transaction->t_handle_lock); |
240 | spin_unlock(&journal->j_state_lock); | 240 | spin_unlock(&journal->j_state_lock); |
241 | |||
242 | lock_map_acquire(&handle->h_lockdep_map); | ||
241 | out: | 243 | out: |
242 | if (unlikely(new_transaction)) /* It's usually NULL */ | 244 | if (unlikely(new_transaction)) /* It's usually NULL */ |
243 | kfree(new_transaction); | 245 | kfree(new_transaction); |
@@ -303,8 +305,6 @@ handle_t *jbd2_journal_start(journal_t *journal, int nblocks) | |||
303 | handle = ERR_PTR(err); | 305 | handle = ERR_PTR(err); |
304 | goto out; | 306 | goto out; |
305 | } | 307 | } |
306 | |||
307 | lock_map_acquire(&handle->h_lockdep_map); | ||
308 | out: | 308 | out: |
309 | return handle; | 309 | return handle; |
310 | } | 310 | } |
@@ -426,6 +426,7 @@ int jbd2_journal_restart(handle_t *handle, int nblocks) | |||
426 | __jbd2_log_start_commit(journal, transaction->t_tid); | 426 | __jbd2_log_start_commit(journal, transaction->t_tid); |
427 | spin_unlock(&journal->j_state_lock); | 427 | spin_unlock(&journal->j_state_lock); |
428 | 428 | ||
429 | lock_map_release(&handle->h_lockdep_map); | ||
429 | handle->h_buffer_credits = nblocks; | 430 | handle->h_buffer_credits = nblocks; |
430 | ret = start_this_handle(journal, handle); | 431 | ret = start_this_handle(journal, handle); |
431 | return ret; | 432 | return ret; |
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c index 8fcb6239218e..7edb62e97419 100644 --- a/fs/jffs2/acl.c +++ b/fs/jffs2/acl.c | |||
@@ -258,7 +258,7 @@ static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl) | |||
258 | return rc; | 258 | return rc; |
259 | } | 259 | } |
260 | 260 | ||
261 | static int jffs2_check_acl(struct inode *inode, int mask) | 261 | int jffs2_check_acl(struct inode *inode, int mask) |
262 | { | 262 | { |
263 | struct posix_acl *acl; | 263 | struct posix_acl *acl; |
264 | int rc; | 264 | int rc; |
@@ -274,11 +274,6 @@ static int jffs2_check_acl(struct inode *inode, int mask) | |||
274 | return -EAGAIN; | 274 | return -EAGAIN; |
275 | } | 275 | } |
276 | 276 | ||
277 | int jffs2_permission(struct inode *inode, int mask) | ||
278 | { | ||
279 | return generic_permission(inode, mask, jffs2_check_acl); | ||
280 | } | ||
281 | |||
282 | int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, int *i_mode) | 277 | int jffs2_init_acl_pre(struct inode *dir_i, struct inode *inode, int *i_mode) |
283 | { | 278 | { |
284 | struct posix_acl *acl, *clone; | 279 | struct posix_acl *acl, *clone; |
diff --git a/fs/jffs2/acl.h b/fs/jffs2/acl.h index fc929f2a14f6..f0ba63e3c36b 100644 --- a/fs/jffs2/acl.h +++ b/fs/jffs2/acl.h | |||
@@ -26,7 +26,7 @@ struct jffs2_acl_header { | |||
26 | 26 | ||
27 | #ifdef CONFIG_JFFS2_FS_POSIX_ACL | 27 | #ifdef CONFIG_JFFS2_FS_POSIX_ACL |
28 | 28 | ||
29 | extern int jffs2_permission(struct inode *, int); | 29 | extern int jffs2_check_acl(struct inode *, int); |
30 | extern int jffs2_acl_chmod(struct inode *); | 30 | extern int jffs2_acl_chmod(struct inode *); |
31 | extern int jffs2_init_acl_pre(struct inode *, struct inode *, int *); | 31 | extern int jffs2_init_acl_pre(struct inode *, struct inode *, int *); |
32 | extern int jffs2_init_acl_post(struct inode *); | 32 | extern int jffs2_init_acl_post(struct inode *); |
@@ -36,7 +36,7 @@ extern struct xattr_handler jffs2_acl_default_xattr_handler; | |||
36 | 36 | ||
37 | #else | 37 | #else |
38 | 38 | ||
39 | #define jffs2_permission (NULL) | 39 | #define jffs2_check_acl (NULL) |
40 | #define jffs2_acl_chmod(inode) (0) | 40 | #define jffs2_acl_chmod(inode) (0) |
41 | #define jffs2_init_acl_pre(dir_i,inode,mode) (0) | 41 | #define jffs2_init_acl_pre(dir_i,inode,mode) (0) |
42 | #define jffs2_init_acl_post(inode) (0) | 42 | #define jffs2_init_acl_post(inode) (0) |
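The jffs2 hunks here, the jfs hunks below, and the namei.c refactor at the end of this section are one coordinated conversion: filesystems stop wrapping generic_permission() in a private .permission method and instead publish only their ACL callback through the new i_op->check_acl hook, which the VFS hands to generic_permission() itself. The pattern, using the jffs2 code from these hunks:

	/* Before: one boilerplate wrapper per filesystem. */
	int jffs2_permission(struct inode *inode, int mask)
	{
		return generic_permission(inode, mask, jffs2_check_acl);
	}

	/* After: the fs only sets
	 *	.check_acl = jffs2_check_acl,
	 * and the VFS core does (see the namei.c hunk below):
	 *	generic_permission(inode, mask, inode->i_op->check_acl);
	 */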
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index 6f60cc910f4c..7aa4417e085f 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c | |||
@@ -55,7 +55,7 @@ const struct inode_operations jffs2_dir_inode_operations = | |||
55 | .rmdir = jffs2_rmdir, | 55 | .rmdir = jffs2_rmdir, |
56 | .mknod = jffs2_mknod, | 56 | .mknod = jffs2_mknod, |
57 | .rename = jffs2_rename, | 57 | .rename = jffs2_rename, |
58 | .permission = jffs2_permission, | 58 | .check_acl = jffs2_check_acl, |
59 | .setattr = jffs2_setattr, | 59 | .setattr = jffs2_setattr, |
60 | .setxattr = jffs2_setxattr, | 60 | .setxattr = jffs2_setxattr, |
61 | .getxattr = jffs2_getxattr, | 61 | .getxattr = jffs2_getxattr, |
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index 5edc2bf20581..b7b74e299142 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c | |||
@@ -56,7 +56,7 @@ const struct file_operations jffs2_file_operations = | |||
56 | 56 | ||
57 | const struct inode_operations jffs2_file_inode_operations = | 57 | const struct inode_operations jffs2_file_inode_operations = |
58 | { | 58 | { |
59 | .permission = jffs2_permission, | 59 | .check_acl = jffs2_check_acl, |
60 | .setattr = jffs2_setattr, | 60 | .setattr = jffs2_setattr, |
61 | .setxattr = jffs2_setxattr, | 61 | .setxattr = jffs2_setxattr, |
62 | .getxattr = jffs2_getxattr, | 62 | .getxattr = jffs2_getxattr, |
@@ -99,7 +99,7 @@ static int jffs2_do_readpage_nolock (struct inode *inode, struct page *pg) | |||
99 | kunmap(pg); | 99 | kunmap(pg); |
100 | 100 | ||
101 | D2(printk(KERN_DEBUG "readpage finished\n")); | 101 | D2(printk(KERN_DEBUG "readpage finished\n")); |
102 | return 0; | 102 | return ret; |
103 | } | 103 | } |
104 | 104 | ||
105 | int jffs2_do_readpage_unlock(struct inode *inode, struct page *pg) | 105 | int jffs2_do_readpage_unlock(struct inode *inode, struct page *pg) |
diff --git a/fs/jffs2/symlink.c b/fs/jffs2/symlink.c index b7339c3b6ad9..4ec11e8bda8c 100644 --- a/fs/jffs2/symlink.c +++ b/fs/jffs2/symlink.c | |||
@@ -21,7 +21,7 @@ const struct inode_operations jffs2_symlink_inode_operations = | |||
21 | { | 21 | { |
22 | .readlink = generic_readlink, | 22 | .readlink = generic_readlink, |
23 | .follow_link = jffs2_follow_link, | 23 | .follow_link = jffs2_follow_link, |
24 | .permission = jffs2_permission, | 24 | .check_acl = jffs2_check_acl, |
25 | .setattr = jffs2_setattr, | 25 | .setattr = jffs2_setattr, |
26 | .setxattr = jffs2_setxattr, | 26 | .setxattr = jffs2_setxattr, |
27 | .getxattr = jffs2_getxattr, | 27 | .getxattr = jffs2_getxattr, |
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c index d9a721e6db70..5ef7bac265e5 100644 --- a/fs/jffs2/wbuf.c +++ b/fs/jffs2/wbuf.c | |||
@@ -1268,10 +1268,20 @@ int jffs2_nor_wbuf_flash_setup(struct jffs2_sb_info *c) { | |||
1268 | if (!c->wbuf) | 1268 | if (!c->wbuf) |
1269 | return -ENOMEM; | 1269 | return -ENOMEM; |
1270 | 1270 | ||
1271 | #ifdef CONFIG_JFFS2_FS_WBUF_VERIFY | ||
1272 | c->wbuf_verify = kmalloc(c->wbuf_pagesize, GFP_KERNEL); | ||
1273 | if (!c->wbuf_verify) { | ||
1274 | kfree(c->wbuf); | ||
1275 | return -ENOMEM; | ||
1276 | } | ||
1277 | #endif | ||
1271 | return 0; | 1278 | return 0; |
1272 | } | 1279 | } |
1273 | 1280 | ||
1274 | void jffs2_nor_wbuf_flash_cleanup(struct jffs2_sb_info *c) { | 1281 | void jffs2_nor_wbuf_flash_cleanup(struct jffs2_sb_info *c) { |
1282 | #ifdef CONFIG_JFFS2_FS_WBUF_VERIFY | ||
1283 | kfree(c->wbuf_verify); | ||
1284 | #endif | ||
1275 | kfree(c->wbuf); | 1285 | kfree(c->wbuf); |
1276 | } | 1286 | } |
1277 | 1287 | ||
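This hunk completes an allocate/free pairing on the NOR write-buffer path: with CONFIG_JFFS2_FS_WBUF_VERIFY the verify code reads each written page back into c->wbuf_verify, so leaving it unallocated here (the NAND setup path is presumably already covered; it is not shown in this section) would dereference NULL on the first verified write. Note the error path, which frees the main wbuf rather than leaking it:

	#ifdef CONFIG_JFFS2_FS_WBUF_VERIFY
		c->wbuf_verify = kmalloc(c->wbuf_pagesize, GFP_KERNEL);
		if (!c->wbuf_verify) {
			kfree(c->wbuf);	/* don't leak the main buffer */
			return -ENOMEM;
		}
	#endif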
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c index 91fa3ad6e8c2..d66477c34306 100644 --- a/fs/jfs/acl.c +++ b/fs/jfs/acl.c | |||
@@ -67,10 +67,8 @@ static struct posix_acl *jfs_get_acl(struct inode *inode, int type) | |||
67 | acl = posix_acl_from_xattr(value, size); | 67 | acl = posix_acl_from_xattr(value, size); |
68 | } | 68 | } |
69 | kfree(value); | 69 | kfree(value); |
70 | if (!IS_ERR(acl)) { | 70 | if (!IS_ERR(acl)) |
71 | set_cached_acl(inode, type, acl); | 71 | set_cached_acl(inode, type, acl); |
72 | posix_acl_release(acl); | ||
73 | } | ||
74 | return acl; | 72 | return acl; |
75 | } | 73 | } |
76 | 74 | ||
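The deletion above is a reference-count fix: the old code released the ACL immediately after caching it yet still returned the pointer, so the caller's own posix_acl_release() later dropped a reference that was no longer held. The ownership rule after the fix, sketched:

	/* Reference flow in jfs_get_acl() after this change:
	 *
	 *	acl = posix_acl_from_xattr(value, size);  // one ref, ours
	 *	set_cached_acl(inode, type, acl);         // cache takes its own
	 *	return acl;                               // caller releases ours
	 *
	 * The deleted posix_acl_release() gave the caller's reference
	 * away early, opening a use-after-free window. */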
@@ -116,7 +114,7 @@ out: | |||
116 | return rc; | 114 | return rc; |
117 | } | 115 | } |
118 | 116 | ||
119 | static int jfs_check_acl(struct inode *inode, int mask) | 117 | int jfs_check_acl(struct inode *inode, int mask) |
120 | { | 118 | { |
121 | struct posix_acl *acl = jfs_get_acl(inode, ACL_TYPE_ACCESS); | 119 | struct posix_acl *acl = jfs_get_acl(inode, ACL_TYPE_ACCESS); |
122 | 120 | ||
@@ -131,11 +129,6 @@ static int jfs_check_acl(struct inode *inode, int mask) | |||
131 | return -EAGAIN; | 129 | return -EAGAIN; |
132 | } | 130 | } |
133 | 131 | ||
134 | int jfs_permission(struct inode *inode, int mask) | ||
135 | { | ||
136 | return generic_permission(inode, mask, jfs_check_acl); | ||
137 | } | ||
138 | |||
139 | int jfs_init_acl(tid_t tid, struct inode *inode, struct inode *dir) | 132 | int jfs_init_acl(tid_t tid, struct inode *inode, struct inode *dir) |
140 | { | 133 | { |
141 | struct posix_acl *acl = NULL; | 134 | struct posix_acl *acl = NULL; |
diff --git a/fs/jfs/file.c b/fs/jfs/file.c index 7f6063acaa3b..2b70fa78e4a7 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c | |||
@@ -96,7 +96,7 @@ const struct inode_operations jfs_file_inode_operations = { | |||
96 | .removexattr = jfs_removexattr, | 96 | .removexattr = jfs_removexattr, |
97 | #ifdef CONFIG_JFS_POSIX_ACL | 97 | #ifdef CONFIG_JFS_POSIX_ACL |
98 | .setattr = jfs_setattr, | 98 | .setattr = jfs_setattr, |
99 | .permission = jfs_permission, | 99 | .check_acl = jfs_check_acl, |
100 | #endif | 100 | #endif |
101 | }; | 101 | }; |
102 | 102 | ||
diff --git a/fs/jfs/jfs_acl.h b/fs/jfs/jfs_acl.h index 88475f10a389..b07bd417ef85 100644 --- a/fs/jfs/jfs_acl.h +++ b/fs/jfs/jfs_acl.h | |||
@@ -20,7 +20,7 @@ | |||
20 | 20 | ||
21 | #ifdef CONFIG_JFS_POSIX_ACL | 21 | #ifdef CONFIG_JFS_POSIX_ACL |
22 | 22 | ||
23 | int jfs_permission(struct inode *, int); | 23 | int jfs_check_acl(struct inode *, int); |
24 | int jfs_init_acl(tid_t, struct inode *, struct inode *); | 24 | int jfs_init_acl(tid_t, struct inode *, struct inode *); |
25 | int jfs_setattr(struct dentry *, struct iattr *); | 25 | int jfs_setattr(struct dentry *, struct iattr *); |
26 | 26 | ||
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index 514ee2edb92a..c79a4270f083 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c | |||
@@ -1543,7 +1543,7 @@ const struct inode_operations jfs_dir_inode_operations = { | |||
1543 | .removexattr = jfs_removexattr, | 1543 | .removexattr = jfs_removexattr, |
1544 | #ifdef CONFIG_JFS_POSIX_ACL | 1544 | #ifdef CONFIG_JFS_POSIX_ACL |
1545 | .setattr = jfs_setattr, | 1545 | .setattr = jfs_setattr, |
1546 | .permission = jfs_permission, | 1546 | .check_acl = jfs_check_acl, |
1547 | #endif | 1547 | #endif |
1548 | }; | 1548 | }; |
1549 | 1549 | ||
diff --git a/fs/libfs.c b/fs/libfs.c index ddfa89948c3f..dcec3d3ea64f 100644 --- a/fs/libfs.c +++ b/fs/libfs.c | |||
@@ -217,7 +217,7 @@ int get_sb_pseudo(struct file_system_type *fs_type, char *name, | |||
217 | return PTR_ERR(s); | 217 | return PTR_ERR(s); |
218 | 218 | ||
219 | s->s_flags = MS_NOUSER; | 219 | s->s_flags = MS_NOUSER; |
220 | s->s_maxbytes = ~0ULL; | 220 | s->s_maxbytes = MAX_LFS_FILESIZE; |
221 | s->s_blocksize = PAGE_SIZE; | 221 | s->s_blocksize = PAGE_SIZE; |
222 | s->s_blocksize_bits = PAGE_SHIFT; | 222 | s->s_blocksize_bits = PAGE_SHIFT; |
223 | s->s_magic = magic; | 223 | s->s_magic = magic; |
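The get_sb_pseudo() one-liner guards a signedness trap: s_maxbytes is an loff_t, a signed 64-bit type, so storing ~0ULL there becomes -1 and silently defeats size-limit checks. A sketch of the failure mode avoided:

	/* With s_maxbytes = ~0ULL the stored value reads back as -1,
	 * so a standard limit test can never fire:
	 *
	 *	if (pos > inode->i_sb->s_maxbytes)	// never true vs. -1
	 *		return -EFBIG;
	 *
	 * MAX_LFS_FILESIZE is the largest positive offset the page
	 * cache can address, keeping the comparison meaningful. */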
diff --git a/fs/lockd/host.c b/fs/lockd/host.c index 99d737bd4325..7cb076ac6b45 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c | |||
@@ -87,18 +87,6 @@ static unsigned int nlm_hash_address(const struct sockaddr *sap) | |||
87 | return hash & (NLM_HOST_NRHASH - 1); | 87 | return hash & (NLM_HOST_NRHASH - 1); |
88 | } | 88 | } |
89 | 89 | ||
90 | static void nlm_clear_port(struct sockaddr *sap) | ||
91 | { | ||
92 | switch (sap->sa_family) { | ||
93 | case AF_INET: | ||
94 | ((struct sockaddr_in *)sap)->sin_port = 0; | ||
95 | break; | ||
96 | case AF_INET6: | ||
97 | ((struct sockaddr_in6 *)sap)->sin6_port = 0; | ||
98 | break; | ||
99 | } | ||
100 | } | ||
101 | |||
102 | /* | 90 | /* |
103 | * Common host lookup routine for server & client | 91 | * Common host lookup routine for server & client |
104 | */ | 92 | */ |
@@ -177,7 +165,7 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni) | |||
177 | host->h_addrbuf = nsm->sm_addrbuf; | 165 | host->h_addrbuf = nsm->sm_addrbuf; |
178 | memcpy(nlm_addr(host), ni->sap, ni->salen); | 166 | memcpy(nlm_addr(host), ni->sap, ni->salen); |
179 | host->h_addrlen = ni->salen; | 167 | host->h_addrlen = ni->salen; |
180 | nlm_clear_port(nlm_addr(host)); | 168 | rpc_set_port(nlm_addr(host), 0); |
181 | memcpy(nlm_srcaddr(host), ni->src_sap, ni->src_len); | 169 | memcpy(nlm_srcaddr(host), ni->src_sap, ni->src_len); |
182 | host->h_version = ni->version; | 170 | host->h_version = ni->version; |
183 | host->h_proto = ni->protocol; | 171 | host->h_proto = ni->protocol; |
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c index 7fce1b525849..30c933188dd7 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c | |||
@@ -61,43 +61,6 @@ static inline struct sockaddr *nsm_addr(const struct nsm_handle *nsm) | |||
61 | return (struct sockaddr *)&nsm->sm_addr; | 61 | return (struct sockaddr *)&nsm->sm_addr; |
62 | } | 62 | } |
63 | 63 | ||
64 | static void nsm_display_ipv4_address(const struct sockaddr *sap, char *buf, | ||
65 | const size_t len) | ||
66 | { | ||
67 | const struct sockaddr_in *sin = (struct sockaddr_in *)sap; | ||
68 | snprintf(buf, len, "%pI4", &sin->sin_addr.s_addr); | ||
69 | } | ||
70 | |||
71 | static void nsm_display_ipv6_address(const struct sockaddr *sap, char *buf, | ||
72 | const size_t len) | ||
73 | { | ||
74 | const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap; | ||
75 | |||
76 | if (ipv6_addr_v4mapped(&sin6->sin6_addr)) | ||
77 | snprintf(buf, len, "%pI4", &sin6->sin6_addr.s6_addr32[3]); | ||
78 | else if (sin6->sin6_scope_id != 0) | ||
79 | snprintf(buf, len, "%pI6%%%u", &sin6->sin6_addr, | ||
80 | sin6->sin6_scope_id); | ||
81 | else | ||
82 | snprintf(buf, len, "%pI6", &sin6->sin6_addr); | ||
83 | } | ||
84 | |||
85 | static void nsm_display_address(const struct sockaddr *sap, | ||
86 | char *buf, const size_t len) | ||
87 | { | ||
88 | switch (sap->sa_family) { | ||
89 | case AF_INET: | ||
90 | nsm_display_ipv4_address(sap, buf, len); | ||
91 | break; | ||
92 | case AF_INET6: | ||
93 | nsm_display_ipv6_address(sap, buf, len); | ||
94 | break; | ||
95 | default: | ||
96 | snprintf(buf, len, "unsupported address family"); | ||
97 | break; | ||
98 | } | ||
99 | } | ||
100 | |||
101 | static struct rpc_clnt *nsm_create(void) | 64 | static struct rpc_clnt *nsm_create(void) |
102 | { | 65 | { |
103 | struct sockaddr_in sin = { | 66 | struct sockaddr_in sin = { |
@@ -307,8 +270,11 @@ static struct nsm_handle *nsm_create_handle(const struct sockaddr *sap, | |||
307 | memcpy(nsm_addr(new), sap, salen); | 270 | memcpy(nsm_addr(new), sap, salen); |
308 | new->sm_addrlen = salen; | 271 | new->sm_addrlen = salen; |
309 | nsm_init_private(new); | 272 | nsm_init_private(new); |
310 | nsm_display_address((const struct sockaddr *)&new->sm_addr, | 273 | |
311 | new->sm_addrbuf, sizeof(new->sm_addrbuf)); | 274 | if (rpc_ntop(nsm_addr(new), new->sm_addrbuf, |
275 | sizeof(new->sm_addrbuf)) == 0) | ||
276 | (void)snprintf(new->sm_addrbuf, sizeof(new->sm_addrbuf), | ||
277 | "unsupported address family"); | ||
312 | memcpy(new->sm_name, hostname, hostname_len); | 278 | memcpy(new->sm_name, hostname, hostname_len); |
313 | new->sm_name[hostname_len] = '\0'; | 279 | new->sm_name[hostname_len] = '\0'; |
314 | 280 | ||
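Both lockd hunks swap private address plumbing for sunrpc helpers: rpc_set_port() replaces the open-coded per-family port switch deleted from host.c, and rpc_ntop() replaces the hand-rolled nsm_display_*() printers, including their v4-mapped and scope-id handling. The resulting call pattern, as it appears in nsm_create_handle() above:

	/* rpc_ntop() renders an AF_INET/AF_INET6 sockaddr into the
	 * buffer and returns the length, or 0 for an unsupported
	 * family: */
	if (rpc_ntop(nsm_addr(new), new->sm_addrbuf,
		     sizeof(new->sm_addrbuf)) == 0)
		(void)snprintf(new->sm_addrbuf, sizeof(new->sm_addrbuf),
			       "unsupported address family");

	/* rpc_set_port() pokes a port into either family: */
	rpc_set_port(nlm_addr(host), 0);	/* clear the port */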
diff --git a/fs/locks.c b/fs/locks.c index b6440f52178f..19ee18a6829b 100644 --- a/fs/locks.c +++ b/fs/locks.c | |||
@@ -768,7 +768,7 @@ static int flock_lock_file(struct file *filp, struct file_lock *request) | |||
768 | * give it the opportunity to lock the file. | 768 | * give it the opportunity to lock the file. |
769 | */ | 769 | */ |
770 | if (found) | 770 | if (found) |
771 | cond_resched_bkl(); | 771 | cond_resched(); |
772 | 772 | ||
773 | find_conflict: | 773 | find_conflict: |
774 | for_each_lock(inode, before) { | 774 | for_each_lock(inode, before) { |
@@ -1591,7 +1591,7 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd) | |||
1591 | if (can_sleep) | 1591 | if (can_sleep) |
1592 | lock->fl_flags |= FL_SLEEP; | 1592 | lock->fl_flags |= FL_SLEEP; |
1593 | 1593 | ||
1594 | error = security_file_lock(filp, cmd); | 1594 | error = security_file_lock(filp, lock->fl_type); |
1595 | if (error) | 1595 | if (error) |
1596 | goto out_free; | 1596 | goto out_free; |
1597 | 1597 | ||
diff --git a/fs/namei.c b/fs/namei.c index f3c5b278895a..d11f404667e9 100644 --- a/fs/namei.c +++ b/fs/namei.c | |||
@@ -169,19 +169,10 @@ void putname(const char *name) | |||
169 | EXPORT_SYMBOL(putname); | 169 | EXPORT_SYMBOL(putname); |
170 | #endif | 170 | #endif |
171 | 171 | ||
172 | 172 | /* | |
173 | /** | 173 | * This does basic POSIX ACL permission checking |
174 | * generic_permission - check for access rights on a Posix-like filesystem | ||
175 | * @inode: inode to check access rights for | ||
176 | * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) | ||
177 | * @check_acl: optional callback to check for Posix ACLs | ||
178 | * | ||
179 | * Used to check for read/write/execute permissions on a file. | ||
180 | * We use "fsuid" for this, letting us set arbitrary permissions | ||
181 | * for filesystem access without changing the "normal" uids which | ||
182 | * are used for other things.. | ||
183 | */ | 174 | */ |
184 | int generic_permission(struct inode *inode, int mask, | 175 | static int acl_permission_check(struct inode *inode, int mask, |
185 | int (*check_acl)(struct inode *inode, int mask)) | 176 | int (*check_acl)(struct inode *inode, int mask)) |
186 | { | 177 | { |
187 | umode_t mode = inode->i_mode; | 178 | umode_t mode = inode->i_mode; |
@@ -193,9 +184,7 @@ int generic_permission(struct inode *inode, int mask, | |||
193 | else { | 184 | else { |
194 | if (IS_POSIXACL(inode) && (mode & S_IRWXG) && check_acl) { | 185 | if (IS_POSIXACL(inode) && (mode & S_IRWXG) && check_acl) { |
195 | int error = check_acl(inode, mask); | 186 | int error = check_acl(inode, mask); |
196 | if (error == -EACCES) | 187 | if (error != -EAGAIN) |
197 | goto check_capabilities; | ||
198 | else if (error != -EAGAIN) | ||
199 | return error; | 188 | return error; |
200 | } | 189 | } |
201 | 190 | ||
@@ -208,8 +197,32 @@ int generic_permission(struct inode *inode, int mask, | |||
208 | */ | 197 | */ |
209 | if ((mask & ~mode) == 0) | 198 | if ((mask & ~mode) == 0) |
210 | return 0; | 199 | return 0; |
200 | return -EACCES; | ||
201 | } | ||
202 | |||
203 | /** | ||
204 | * generic_permission - check for access rights on a Posix-like filesystem | ||
205 | * @inode: inode to check access rights for | ||
206 | * @mask: right to check for (%MAY_READ, %MAY_WRITE, %MAY_EXEC) | ||
207 | * @check_acl: optional callback to check for Posix ACLs | ||
208 | * | ||
209 | * Used to check for read/write/execute permissions on a file. | ||
210 | * We use "fsuid" for this, letting us set arbitrary permissions | ||
211 | * for filesystem access without changing the "normal" uids which | ||
212 | * are used for other things.. | ||
213 | */ | ||
214 | int generic_permission(struct inode *inode, int mask, | ||
215 | int (*check_acl)(struct inode *inode, int mask)) | ||
216 | { | ||
217 | int ret; | ||
218 | |||
219 | /* | ||
220 | * Do the basic POSIX ACL permission checks. | ||
221 | */ | ||
222 | ret = acl_permission_check(inode, mask, check_acl); | ||
223 | if (ret != -EACCES) | ||
224 | return ret; | ||
211 | 225 | ||
212 | check_capabilities: | ||
213 | /* | 226 | /* |
214 | * Read/write DACs are always overridable. | 227 | * Read/write DACs are always overridable. |
215 | * Executable DACs are overridable if at least one exec bit is set. | 228 | * Executable DACs are overridable if at least one exec bit is set. |
@@ -262,7 +275,7 @@ int inode_permission(struct inode *inode, int mask) | |||
262 | if (inode->i_op->permission) | 275 | if (inode->i_op->permission) |
263 | retval = inode->i_op->permission(inode, mask); | 276 | retval = inode->i_op->permission(inode, mask); |
264 | else | 277 | else |
265 | retval = generic_permission(inode, mask, NULL); | 278 | retval = generic_permission(inode, mask, inode->i_op->check_acl); |
266 | 279 | ||
267 | if (retval) | 280 | if (retval) |
268 | return retval; | 281 | return retval; |
@@ -432,29 +445,22 @@ static struct dentry * cached_lookup(struct dentry * parent, struct qstr * name, | |||
432 | */ | 445 | */ |
433 | static int exec_permission_lite(struct inode *inode) | 446 | static int exec_permission_lite(struct inode *inode) |
434 | { | 447 | { |
435 | umode_t mode = inode->i_mode; | 448 | int ret; |
436 | 449 | ||
437 | if (inode->i_op->permission) | 450 | if (inode->i_op->permission) { |
438 | return -EAGAIN; | 451 | ret = inode->i_op->permission(inode, MAY_EXEC); |
439 | 452 | if (!ret) | |
440 | if (current_fsuid() == inode->i_uid) | 453 | goto ok; |
441 | mode >>= 6; | 454 | return ret; |
442 | else if (in_group_p(inode->i_gid)) | 455 | } |
443 | mode >>= 3; | 456 | ret = acl_permission_check(inode, MAY_EXEC, inode->i_op->check_acl); |
444 | 457 | if (!ret) | |
445 | if (mode & MAY_EXEC) | ||
446 | goto ok; | ||
447 | |||
448 | if ((inode->i_mode & S_IXUGO) && capable(CAP_DAC_OVERRIDE)) | ||
449 | goto ok; | ||
450 | |||
451 | if (S_ISDIR(inode->i_mode) && capable(CAP_DAC_OVERRIDE)) | ||
452 | goto ok; | 458 | goto ok; |
453 | 459 | ||
454 | if (S_ISDIR(inode->i_mode) && capable(CAP_DAC_READ_SEARCH)) | 460 | if (capable(CAP_DAC_OVERRIDE) || capable(CAP_DAC_READ_SEARCH)) |
455 | goto ok; | 461 | goto ok; |
456 | 462 | ||
457 | return -EACCES; | 463 | return ret; |
458 | ok: | 464 | ok: |
459 | return security_inode_permission(inode, MAY_EXEC); | 465 | return security_inode_permission(inode, MAY_EXEC); |
460 | } | 466 | } |
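Taken together, the namei.c hunks above reshape permission checking around the new acl_permission_check() helper; exec_permission_lite() now calls the fs ->permission method directly instead of returning -EAGAIN, and shares the mode-bit logic rather than duplicating it. The resulting flow, sketched as comments:

	/*
	 * inode_permission(inode, mask)
	 *   -> inode->i_op->permission(inode, mask)        if the fs has one
	 *   -> generic_permission(inode, mask,
	 *                         inode->i_op->check_acl)  otherwise
	 *
	 * generic_permission()
	 *   ret = acl_permission_check(inode, mask, check_acl);
	 *   if (ret != -EACCES)
	 *           return ret;        // granted, or a hard ACL error
	 *   // otherwise fall through to the CAP_DAC_OVERRIDE /
	 *   // CAP_DAC_READ_SEARCH capability overrides
	 */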
@@ -853,12 +859,6 @@ static int __link_path_walk(const char *name, struct nameidata *nd) | |||
853 | 859 | ||
854 | nd->flags |= LOOKUP_CONTINUE; | 860 | nd->flags |= LOOKUP_CONTINUE; |
855 | err = exec_permission_lite(inode); | 861 | err = exec_permission_lite(inode); |
856 | if (err == -EAGAIN) | ||
857 | err = inode_permission(nd->path.dentry->d_inode, | ||
858 | MAY_EXEC); | ||
859 | if (!err) | ||
860 | err = ima_path_check(&nd->path, MAY_EXEC, | ||
861 | IMA_COUNT_UPDATE); | ||
862 | if (err) | 862 | if (err) |
863 | break; | 863 | break; |
864 | 864 | ||
@@ -1533,37 +1533,42 @@ int may_open(struct path *path, int acc_mode, int flag) | |||
1533 | if (error) | 1533 | if (error) |
1534 | return error; | 1534 | return error; |
1535 | 1535 | ||
1536 | error = ima_path_check(path, | 1536 | error = ima_path_check(path, acc_mode ? |
1537 | acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC), | 1537 | acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC) : |
1538 | ACC_MODE(flag) & (MAY_READ | MAY_WRITE), | ||
1538 | IMA_COUNT_UPDATE); | 1539 | IMA_COUNT_UPDATE); |
1540 | |||
1539 | if (error) | 1541 | if (error) |
1540 | return error; | 1542 | return error; |
1541 | /* | 1543 | /* |
1542 | * An append-only file must be opened in append mode for writing. | 1544 | * An append-only file must be opened in append mode for writing. |
1543 | */ | 1545 | */ |
1544 | if (IS_APPEND(inode)) { | 1546 | if (IS_APPEND(inode)) { |
1547 | error = -EPERM; | ||
1545 | if ((flag & FMODE_WRITE) && !(flag & O_APPEND)) | 1548 | if ((flag & FMODE_WRITE) && !(flag & O_APPEND)) |
1546 | return -EPERM; | 1549 | goto err_out; |
1547 | if (flag & O_TRUNC) | 1550 | if (flag & O_TRUNC) |
1548 | return -EPERM; | 1551 | goto err_out; |
1549 | } | 1552 | } |
1550 | 1553 | ||
1551 | /* O_NOATIME can only be set by the owner or superuser */ | 1554 | /* O_NOATIME can only be set by the owner or superuser */ |
1552 | if (flag & O_NOATIME) | 1555 | if (flag & O_NOATIME) |
1553 | if (!is_owner_or_cap(inode)) | 1556 | if (!is_owner_or_cap(inode)) { |
1554 | return -EPERM; | 1557 | error = -EPERM; |
1558 | goto err_out; | ||
1559 | } | ||
1555 | 1560 | ||
1556 | /* | 1561 | /* |
1557 | * Ensure there are no outstanding leases on the file. | 1562 | * Ensure there are no outstanding leases on the file. |
1558 | */ | 1563 | */ |
1559 | error = break_lease(inode, flag); | 1564 | error = break_lease(inode, flag); |
1560 | if (error) | 1565 | if (error) |
1561 | return error; | 1566 | goto err_out; |
1562 | 1567 | ||
1563 | if (flag & O_TRUNC) { | 1568 | if (flag & O_TRUNC) { |
1564 | error = get_write_access(inode); | 1569 | error = get_write_access(inode); |
1565 | if (error) | 1570 | if (error) |
1566 | return error; | 1571 | goto err_out; |
1567 | 1572 | ||
1568 | /* | 1573 | /* |
1569 | * Refuse to truncate files with mandatory locks held on them. | 1574 | * Refuse to truncate files with mandatory locks held on them. |
@@ -1581,12 +1586,17 @@ int may_open(struct path *path, int acc_mode, int flag) | |||
1581 | } | 1586 | } |
1582 | put_write_access(inode); | 1587 | put_write_access(inode); |
1583 | if (error) | 1588 | if (error) |
1584 | return error; | 1589 | goto err_out; |
1585 | } else | 1590 | } else |
1586 | if (flag & FMODE_WRITE) | 1591 | if (flag & FMODE_WRITE) |
1587 | vfs_dq_init(inode); | 1592 | vfs_dq_init(inode); |
1588 | 1593 | ||
1589 | return 0; | 1594 | return 0; |
1595 | err_out: | ||
1596 | ima_counts_put(path, acc_mode ? | ||
1597 | acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC) : | ||
1598 | ACC_MODE(flag) & (MAY_READ | MAY_WRITE)); | ||
1599 | return error; | ||
1590 | } | 1600 | } |
1591 | 1601 | ||
1592 | /* | 1602 | /* |
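The may_open() rework above exists because ima_path_check(..., IMA_COUNT_UPDATE) bumps IMA's open counters; once that call has succeeded, every later failure exit must drop them again, which the new err_out label centralizes. The balancing rule, condensed as comments ("mask" abbreviates the acc_mode ? ... : ACC_MODE(flag) ... expression from the hunk):

	/*
	 *	error = ima_path_check(path, mask, IMA_COUNT_UPDATE); // counts++
	 *	if (error)
	 *		return error;        // nothing to undo yet
	 *	...
	 * err_out:
	 *	ima_counts_put(path, mask);  // counts-- on any later failure
	 *	return error;
	 */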
diff --git a/fs/namespace.c b/fs/namespace.c index 277c28a63ead..7230787d18b0 100644 --- a/fs/namespace.c +++ b/fs/namespace.c | |||
@@ -316,7 +316,8 @@ EXPORT_SYMBOL_GPL(mnt_clone_write); | |||
316 | */ | 316 | */ |
317 | int mnt_want_write_file(struct file *file) | 317 | int mnt_want_write_file(struct file *file) |
318 | { | 318 | { |
319 | if (!(file->f_mode & FMODE_WRITE)) | 319 | struct inode *inode = file->f_dentry->d_inode; |
320 | if (!(file->f_mode & FMODE_WRITE) || special_file(inode->i_mode)) | ||
320 | return mnt_want_write(file->f_path.mnt); | 321 | return mnt_want_write(file->f_path.mnt); |
321 | else | 322 | else |
322 | return mnt_clone_write(file->f_path.mnt); | 323 | return mnt_clone_write(file->f_path.mnt); |
diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile index 845159814de2..da7fda639eac 100644 --- a/fs/nfs/Makefile +++ b/fs/nfs/Makefile | |||
@@ -6,7 +6,8 @@ obj-$(CONFIG_NFS_FS) += nfs.o | |||
6 | 6 | ||
7 | nfs-y := client.o dir.o file.o getroot.o inode.o super.o nfs2xdr.o \ | 7 | nfs-y := client.o dir.o file.o getroot.o inode.o super.o nfs2xdr.o \ |
8 | direct.o pagelist.o proc.o read.o symlink.o unlink.o \ | 8 | direct.o pagelist.o proc.o read.o symlink.o unlink.o \ |
9 | write.o namespace.o mount_clnt.o | 9 | write.o namespace.o mount_clnt.o \ |
10 | dns_resolve.o cache_lib.o | ||
10 | nfs-$(CONFIG_ROOT_NFS) += nfsroot.o | 11 | nfs-$(CONFIG_ROOT_NFS) += nfsroot.o |
11 | nfs-$(CONFIG_NFS_V3) += nfs3proc.o nfs3xdr.o | 12 | nfs-$(CONFIG_NFS_V3) += nfs3proc.o nfs3xdr.o |
12 | nfs-$(CONFIG_NFS_V3_ACL) += nfs3acl.o | 13 | nfs-$(CONFIG_NFS_V3_ACL) += nfs3acl.o |
diff --git a/fs/nfs/cache_lib.c b/fs/nfs/cache_lib.c new file mode 100644 index 000000000000..b4ffd0146ea6 --- /dev/null +++ b/fs/nfs/cache_lib.c | |||
@@ -0,0 +1,140 @@ | |||
1 | /* | ||
2 | * linux/fs/nfs/cache_lib.c | ||
3 | * | ||
4 | * Helper routines for the NFS client caches | ||
5 | * | ||
6 | * Copyright (c) 2009 Trond Myklebust <Trond.Myklebust@netapp.com> | ||
7 | */ | ||
8 | #include <linux/kmod.h> | ||
9 | #include <linux/module.h> | ||
10 | #include <linux/moduleparam.h> | ||
11 | #include <linux/mount.h> | ||
12 | #include <linux/namei.h> | ||
13 | #include <linux/sunrpc/cache.h> | ||
14 | #include <linux/sunrpc/rpc_pipe_fs.h> | ||
15 | |||
16 | #include "cache_lib.h" | ||
17 | |||
18 | #define NFS_CACHE_UPCALL_PATHLEN 256 | ||
19 | #define NFS_CACHE_UPCALL_TIMEOUT 15 | ||
20 | |||
21 | static char nfs_cache_getent_prog[NFS_CACHE_UPCALL_PATHLEN] = | ||
22 | "/sbin/nfs_cache_getent"; | ||
23 | static unsigned long nfs_cache_getent_timeout = NFS_CACHE_UPCALL_TIMEOUT; | ||
24 | |||
25 | module_param_string(cache_getent, nfs_cache_getent_prog, | ||
26 | sizeof(nfs_cache_getent_prog), 0600); | ||
27 | MODULE_PARM_DESC(cache_getent, "Path to the client cache upcall program"); | ||
28 | module_param_named(cache_getent_timeout, nfs_cache_getent_timeout, ulong, 0600); | ||
29 | MODULE_PARM_DESC(cache_getent_timeout, "Timeout (in seconds) after which " | ||
30 | "the cache upcall is assumed to have failed"); | ||
31 | |||
32 | int nfs_cache_upcall(struct cache_detail *cd, char *entry_name) | ||
33 | { | ||
34 | static char *envp[] = { "HOME=/", | ||
35 | "TERM=linux", | ||
36 | "PATH=/sbin:/usr/sbin:/bin:/usr/bin", | ||
37 | NULL | ||
38 | }; | ||
39 | char *argv[] = { | ||
40 | nfs_cache_getent_prog, | ||
41 | cd->name, | ||
42 | entry_name, | ||
43 | NULL | ||
44 | }; | ||
45 | int ret = -EACCES; | ||
46 | |||
47 | if (nfs_cache_getent_prog[0] == '\0') | ||
48 | goto out; | ||
49 | ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); | ||
50 | /* | ||
51 | * Disable the upcall mechanism if we're getting an ENOENT or | ||
52 | * EACCES error. The admin can re-enable it on the fly by using | ||
53 | * sysfs to set the 'cache_getent' parameter once the problem | ||
54 | * has been fixed. | ||
55 | */ | ||
56 | if (ret == -ENOENT || ret == -EACCES) | ||
57 | nfs_cache_getent_prog[0] = '\0'; | ||
58 | out: | ||
59 | return ret > 0 ? 0 : ret; | ||
60 | } | ||
61 | |||
62 | /* | ||
63 | * Deferred request handling | ||
64 | */ | ||
65 | void nfs_cache_defer_req_put(struct nfs_cache_defer_req *dreq) | ||
66 | { | ||
67 | if (atomic_dec_and_test(&dreq->count)) | ||
68 | kfree(dreq); | ||
69 | } | ||
70 | |||
71 | static void nfs_dns_cache_revisit(struct cache_deferred_req *d, int toomany) | ||
72 | { | ||
73 | struct nfs_cache_defer_req *dreq; | ||
74 | |||
75 | dreq = container_of(d, struct nfs_cache_defer_req, deferred_req); | ||
76 | |||
77 | complete_all(&dreq->completion); | ||
78 | nfs_cache_defer_req_put(dreq); | ||
79 | } | ||
80 | |||
81 | static struct cache_deferred_req *nfs_dns_cache_defer(struct cache_req *req) | ||
82 | { | ||
83 | struct nfs_cache_defer_req *dreq; | ||
84 | |||
85 | dreq = container_of(req, struct nfs_cache_defer_req, req); | ||
86 | dreq->deferred_req.revisit = nfs_dns_cache_revisit; | ||
87 | atomic_inc(&dreq->count); | ||
88 | |||
89 | return &dreq->deferred_req; | ||
90 | } | ||
91 | |||
92 | struct nfs_cache_defer_req *nfs_cache_defer_req_alloc(void) | ||
93 | { | ||
94 | struct nfs_cache_defer_req *dreq; | ||
95 | |||
96 | dreq = kzalloc(sizeof(*dreq), GFP_KERNEL); | ||
97 | if (dreq) { | ||
98 | init_completion(&dreq->completion); | ||
99 | atomic_set(&dreq->count, 1); | ||
100 | dreq->req.defer = nfs_dns_cache_defer; | ||
101 | } | ||
102 | return dreq; | ||
103 | } | ||
104 | |||
105 | int nfs_cache_wait_for_upcall(struct nfs_cache_defer_req *dreq) | ||
106 | { | ||
107 | if (wait_for_completion_timeout(&dreq->completion, | ||
108 | nfs_cache_getent_timeout * HZ) == 0) | ||
109 | return -ETIMEDOUT; | ||
110 | return 0; | ||
111 | } | ||
112 | |||
113 | int nfs_cache_register(struct cache_detail *cd) | ||
114 | { | ||
115 | struct nameidata nd; | ||
116 | struct vfsmount *mnt; | ||
117 | int ret; | ||
118 | |||
119 | mnt = rpc_get_mount(); | ||
120 | if (IS_ERR(mnt)) | ||
121 | return PTR_ERR(mnt); | ||
122 | ret = vfs_path_lookup(mnt->mnt_root, mnt, "/cache", 0, &nd); | ||
123 | if (ret) | ||
124 | goto err; | ||
125 | ret = sunrpc_cache_register_pipefs(nd.path.dentry, | ||
126 | cd->name, 0600, cd); | ||
127 | path_put(&nd.path); | ||
128 | if (!ret) | ||
129 | return ret; | ||
130 | err: | ||
131 | rpc_put_mount(); | ||
132 | return ret; | ||
133 | } | ||
134 | |||
135 | void nfs_cache_unregister(struct cache_detail *cd) | ||
136 | { | ||
137 | sunrpc_cache_unregister_pipefs(cd); | ||
138 | rpc_put_mount(); | ||
139 | } | ||
140 | |||
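The refcounting in the new cache_lib.c deserves a trace: nfs_cache_defer_req_alloc() starts the count at 1 for the caller, nfs_dns_cache_defer() takes a second reference when the cache core defers the request, and nfs_dns_cache_revisit() completes the waiter and drops that reference. A hedged caller-side sketch; the real consumer is the new dns_resolve.c, which is outside this section:

	/* Assumed usage, based only on the API above: */
	struct nfs_cache_defer_req *dreq;
	int ret;

	dreq = nfs_cache_defer_req_alloc();	/* count = 1, ours */
	if (dreq == NULL)
		return -ENOMEM;
	/* Hand &dreq->req to the sunrpc cache lookup; if the entry is
	 * not ready, the core defers via dreq->req.defer() (count = 2)
	 * and we block until the upcall completes it or we time out: */
	ret = nfs_cache_wait_for_upcall(dreq);
	nfs_cache_defer_req_put(dreq);		/* drop our reference */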
diff --git a/fs/nfs/cache_lib.h b/fs/nfs/cache_lib.h new file mode 100644 index 000000000000..76f856e284e4 --- /dev/null +++ b/fs/nfs/cache_lib.h | |||
@@ -0,0 +1,27 @@ | |||
1 | /* | ||
2 | * Helper routines for the NFS client caches | ||
3 | * | ||
4 | * Copyright (c) 2009 Trond Myklebust <Trond.Myklebust@netapp.com> | ||
5 | */ | ||
6 | |||
7 | #include <linux/completion.h> | ||
8 | #include <linux/sunrpc/cache.h> | ||
9 | #include <asm/atomic.h> | ||
10 | |||
11 | /* | ||
12 | * Deferred request handling | ||
13 | */ | ||
14 | struct nfs_cache_defer_req { | ||
15 | struct cache_req req; | ||
16 | struct cache_deferred_req deferred_req; | ||
17 | struct completion completion; | ||
18 | atomic_t count; | ||
19 | }; | ||
20 | |||
21 | extern int nfs_cache_upcall(struct cache_detail *cd, char *entry_name); | ||
22 | extern struct nfs_cache_defer_req *nfs_cache_defer_req_alloc(void); | ||
23 | extern void nfs_cache_defer_req_put(struct nfs_cache_defer_req *dreq); | ||
24 | extern int nfs_cache_wait_for_upcall(struct nfs_cache_defer_req *dreq); | ||
25 | |||
26 | extern int nfs_cache_register(struct cache_detail *cd); | ||
27 | extern void nfs_cache_unregister(struct cache_detail *cd); | ||
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 7f604c7941fb..293fa0528a6e 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c | |||
@@ -43,21 +43,29 @@ static struct svc_program nfs4_callback_program; | |||
43 | unsigned int nfs_callback_set_tcpport; | 43 | unsigned int nfs_callback_set_tcpport; |
44 | unsigned short nfs_callback_tcpport; | 44 | unsigned short nfs_callback_tcpport; |
45 | unsigned short nfs_callback_tcpport6; | 45 | unsigned short nfs_callback_tcpport6; |
46 | static const int nfs_set_port_min = 0; | 46 | #define NFS_CALLBACK_MAXPORTNR (65535U) |
47 | static const int nfs_set_port_max = 65535; | ||
48 | 47 | ||
49 | static int param_set_port(const char *val, struct kernel_param *kp) | 48 | static int param_set_portnr(const char *val, struct kernel_param *kp) |
50 | { | 49 | { |
51 | char *endp; | 50 | unsigned long num; |
52 | int num = simple_strtol(val, &endp, 0); | 51 | int ret; |
53 | if (endp == val || *endp || num < nfs_set_port_min || num > nfs_set_port_max) | 52 | |
53 | if (!val) | ||
54 | return -EINVAL; | ||
55 | ret = strict_strtoul(val, 0, &num); | ||
56 | if (ret == -EINVAL || num > NFS_CALLBACK_MAXPORTNR) | ||
54 | return -EINVAL; | 57 | return -EINVAL; |
55 | *((int *)kp->arg) = num; | 58 | *((unsigned int *)kp->arg) = num; |
56 | return 0; | 59 | return 0; |
57 | } | 60 | } |
58 | 61 | ||
59 | module_param_call(callback_tcpport, param_set_port, param_get_int, | 62 | static int param_get_portnr(char *buffer, struct kernel_param *kp) |
60 | &nfs_callback_set_tcpport, 0644); | 63 | { |
64 | return param_get_uint(buffer, kp); | ||
65 | } | ||
66 | #define param_check_portnr(name, p) __param_check(name, p, unsigned int); | ||
67 | |||
68 | module_param_named(callback_tcpport, nfs_callback_set_tcpport, portnr, 0644); | ||
61 | 69 | ||
62 | /* | 70 | /* |
63 | * This is the NFSv4 callback kernel thread. | 71 | * This is the NFSv4 callback kernel thread. |
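The rewritten parameter parser is stricter on two counts: strict_strtoul() rejects strings with trailing junk that simple_strtol() would half-parse, and an unsigned parse with a single upper bound replaces the signed min/max pair. The difference, illustrated (strict_strtoul was the name in this era; it was later renamed kstrtoul):

	/* simple_strtol("2049abc", &endp, 0) -> 2049, endp left at "abc",
	 *	so the old code had to inspect *endp itself;
	 * strict_strtoul("2049abc", 0, &num) -> -EINVAL: the whole
	 *	string must parse.
	 * The new code then only range-checks num against
	 * NFS_CALLBACK_MAXPORTNR (65535). */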
diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 8d25ccb2d51d..e350bd6a2334 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c | |||
@@ -809,6 +809,9 @@ static int nfs_init_server(struct nfs_server *server, | |||
809 | /* Initialise the client representation from the mount data */ | 809 | /* Initialise the client representation from the mount data */ |
810 | server->flags = data->flags; | 810 | server->flags = data->flags; |
811 | server->options = data->options; | 811 | server->options = data->options; |
812 | server->caps |= NFS_CAP_HARDLINKS|NFS_CAP_SYMLINKS|NFS_CAP_FILEID| | ||
813 | NFS_CAP_MODE|NFS_CAP_NLINK|NFS_CAP_OWNER|NFS_CAP_OWNER_GROUP| | ||
814 | NFS_CAP_ATIME|NFS_CAP_CTIME|NFS_CAP_MTIME; | ||
812 | 815 | ||
813 | if (data->rsize) | 816 | if (data->rsize) |
814 | server->rsize = nfs_block_size(data->rsize, NULL); | 817 | server->rsize = nfs_block_size(data->rsize, NULL); |
@@ -879,6 +882,7 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, struct nfs_fsinfo * | |||
879 | server->rsize = NFS_MAX_FILE_IO_SIZE; | 882 | server->rsize = NFS_MAX_FILE_IO_SIZE; |
880 | server->rpages = (server->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | 883 | server->rpages = (server->rsize + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; |
881 | 884 | ||
885 | server->backing_dev_info.name = "nfs"; | ||
882 | server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD; | 886 | server->backing_dev_info.ra_pages = server->rpages * NFS_MAX_READAHEAD; |
883 | 887 | ||
884 | if (server->wsize > max_rpc_payload) | 888 | if (server->wsize > max_rpc_payload) |
@@ -1074,10 +1078,6 @@ struct nfs_server *nfs_create_server(const struct nfs_parsed_mount_data *data, | |||
1074 | (unsigned long long) server->fsid.major, | 1078 | (unsigned long long) server->fsid.major, |
1075 | (unsigned long long) server->fsid.minor); | 1079 | (unsigned long long) server->fsid.minor); |
1076 | 1080 | ||
1077 | BUG_ON(!server->nfs_client); | ||
1078 | BUG_ON(!server->nfs_client->rpc_ops); | ||
1079 | BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops); | ||
1080 | |||
1081 | spin_lock(&nfs_client_lock); | 1081 | spin_lock(&nfs_client_lock); |
1082 | list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks); | 1082 | list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks); |
1083 | list_add_tail(&server->master_link, &nfs_volume_list); | 1083 | list_add_tail(&server->master_link, &nfs_volume_list); |
@@ -1274,7 +1274,7 @@ static int nfs4_init_server(struct nfs_server *server, | |||
1274 | 1274 | ||
1275 | /* Initialise the client representation from the mount data */ | 1275 | /* Initialise the client representation from the mount data */ |
1276 | server->flags = data->flags; | 1276 | server->flags = data->flags; |
1277 | server->caps |= NFS_CAP_ATOMIC_OPEN; | 1277 | server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR; |
1278 | server->options = data->options; | 1278 | server->options = data->options; |
1279 | 1279 | ||
1280 | /* Get a client record */ | 1280 | /* Get a client record */ |
@@ -1359,10 +1359,6 @@ struct nfs_server *nfs4_create_server(const struct nfs_parsed_mount_data *data, | |||
1359 | if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN) | 1359 | if (server->namelen == 0 || server->namelen > NFS4_MAXNAMLEN) |
1360 | server->namelen = NFS4_MAXNAMLEN; | 1360 | server->namelen = NFS4_MAXNAMLEN; |
1361 | 1361 | ||
1362 | BUG_ON(!server->nfs_client); | ||
1363 | BUG_ON(!server->nfs_client->rpc_ops); | ||
1364 | BUG_ON(!server->nfs_client->rpc_ops->file_inode_ops); | ||
1365 | |||
1366 | spin_lock(&nfs_client_lock); | 1362 | spin_lock(&nfs_client_lock); |
1367 | list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks); | 1363 | list_add_tail(&server->client_link, &server->nfs_client->cl_superblocks); |
1368 | list_add_tail(&server->master_link, &nfs_volume_list); | 1364 | list_add_tail(&server->master_link, &nfs_volume_list); |
@@ -1400,7 +1396,7 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data, | |||
1400 | 1396 | ||
1401 | /* Initialise the client representation from the parent server */ | 1397 | /* Initialise the client representation from the parent server */ |
1402 | nfs_server_copy_userdata(server, parent_server); | 1398 | nfs_server_copy_userdata(server, parent_server); |
1403 | server->caps |= NFS_CAP_ATOMIC_OPEN; | 1399 | server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR; |
1404 | 1400 | ||
1405 | /* Get a client representation. | 1401 | /* Get a client representation. |
1406 | * Note: NFSv4 always uses TCP, */ | 1402 | * Note: NFSv4 always uses TCP, */ |
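nfs_init_server() now seeds every v2/v3 server with the full attribute-capability mask up front, while the v4 paths add NFS_CAP_CHANGE_ATTR and later trim the mask against the server's advertised attr_bitmask (see _nfs4_server_capabilities() further down). These bits exist to feed the nfs_server_capable() test used throughout fs/nfs/inode.c; a sketch of how a caller reads them, assuming the usual helper from include/linux/nfs_fs.h:

static inline int nfs_server_capable(struct inode *inode, int cap)
{
	return NFS_SERVER(inode)->caps & cap;
}

/* ... so an attribute is distrusted only when it is missing from a
 * reply *and* this server is supposed to be able to report it: */
if (!(fattr->valid & NFS_ATTR_FATTR_MTIME) &&
    nfs_server_capable(inode, NFS_CAP_MTIME))
	nfsi->cache_validity |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_DATA;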
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 489fc01a3204..6c3210099d51 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c | |||
@@ -255,7 +255,7 @@ static void nfs_direct_read_release(void *calldata) | |||
255 | 255 | ||
256 | if (put_dreq(dreq)) | 256 | if (put_dreq(dreq)) |
257 | nfs_direct_complete(dreq); | 257 | nfs_direct_complete(dreq); |
258 | nfs_readdata_release(calldata); | 258 | nfs_readdata_free(data); |
259 | } | 259 | } |
260 | 260 | ||
261 | static const struct rpc_call_ops nfs_read_direct_ops = { | 261 | static const struct rpc_call_ops nfs_read_direct_ops = { |
@@ -314,14 +314,14 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq, | |||
314 | data->npages, 1, 0, data->pagevec, NULL); | 314 | data->npages, 1, 0, data->pagevec, NULL); |
315 | up_read(¤t->mm->mmap_sem); | 315 | up_read(¤t->mm->mmap_sem); |
316 | if (result < 0) { | 316 | if (result < 0) { |
317 | nfs_readdata_release(data); | 317 | nfs_readdata_free(data); |
318 | break; | 318 | break; |
319 | } | 319 | } |
320 | if ((unsigned)result < data->npages) { | 320 | if ((unsigned)result < data->npages) { |
321 | bytes = result * PAGE_SIZE; | 321 | bytes = result * PAGE_SIZE; |
322 | if (bytes <= pgbase) { | 322 | if (bytes <= pgbase) { |
323 | nfs_direct_release_pages(data->pagevec, result); | 323 | nfs_direct_release_pages(data->pagevec, result); |
324 | nfs_readdata_release(data); | 324 | nfs_readdata_free(data); |
325 | break; | 325 | break; |
326 | } | 326 | } |
327 | bytes -= pgbase; | 327 | bytes -= pgbase; |
@@ -334,7 +334,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq, | |||
334 | data->inode = inode; | 334 | data->inode = inode; |
335 | data->cred = msg.rpc_cred; | 335 | data->cred = msg.rpc_cred; |
336 | data->args.fh = NFS_FH(inode); | 336 | data->args.fh = NFS_FH(inode); |
337 | data->args.context = get_nfs_open_context(ctx); | 337 | data->args.context = ctx; |
338 | data->args.offset = pos; | 338 | data->args.offset = pos; |
339 | data->args.pgbase = pgbase; | 339 | data->args.pgbase = pgbase; |
340 | data->args.pages = data->pagevec; | 340 | data->args.pages = data->pagevec; |
@@ -441,7 +441,7 @@ static void nfs_direct_free_writedata(struct nfs_direct_req *dreq) | |||
441 | struct nfs_write_data *data = list_entry(dreq->rewrite_list.next, struct nfs_write_data, pages); | 441 | struct nfs_write_data *data = list_entry(dreq->rewrite_list.next, struct nfs_write_data, pages); |
442 | list_del(&data->pages); | 442 | list_del(&data->pages); |
443 | nfs_direct_release_pages(data->pagevec, data->npages); | 443 | nfs_direct_release_pages(data->pagevec, data->npages); |
444 | nfs_writedata_release(data); | 444 | nfs_writedata_free(data); |
445 | } | 445 | } |
446 | } | 446 | } |
447 | 447 | ||
@@ -534,7 +534,7 @@ static void nfs_direct_commit_release(void *calldata) | |||
534 | 534 | ||
535 | dprintk("NFS: %5u commit returned %d\n", data->task.tk_pid, status); | 535 | dprintk("NFS: %5u commit returned %d\n", data->task.tk_pid, status); |
536 | nfs_direct_write_complete(dreq, data->inode); | 536 | nfs_direct_write_complete(dreq, data->inode); |
537 | nfs_commitdata_release(calldata); | 537 | nfs_commit_free(data); |
538 | } | 538 | } |
539 | 539 | ||
540 | static const struct rpc_call_ops nfs_commit_direct_ops = { | 540 | static const struct rpc_call_ops nfs_commit_direct_ops = { |
@@ -570,7 +570,7 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq) | |||
570 | data->args.fh = NFS_FH(data->inode); | 570 | data->args.fh = NFS_FH(data->inode); |
571 | data->args.offset = 0; | 571 | data->args.offset = 0; |
572 | data->args.count = 0; | 572 | data->args.count = 0; |
573 | data->args.context = get_nfs_open_context(dreq->ctx); | 573 | data->args.context = dreq->ctx; |
574 | data->res.count = 0; | 574 | data->res.count = 0; |
575 | data->res.fattr = &data->fattr; | 575 | data->res.fattr = &data->fattr; |
576 | data->res.verf = &data->verf; | 576 | data->res.verf = &data->verf; |
@@ -734,14 +734,14 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq, | |||
734 | data->npages, 0, 0, data->pagevec, NULL); | 734 | data->npages, 0, 0, data->pagevec, NULL); |
735 | up_read(¤t->mm->mmap_sem); | 735 | up_read(¤t->mm->mmap_sem); |
736 | if (result < 0) { | 736 | if (result < 0) { |
737 | nfs_writedata_release(data); | 737 | nfs_writedata_free(data); |
738 | break; | 738 | break; |
739 | } | 739 | } |
740 | if ((unsigned)result < data->npages) { | 740 | if ((unsigned)result < data->npages) { |
741 | bytes = result * PAGE_SIZE; | 741 | bytes = result * PAGE_SIZE; |
742 | if (bytes <= pgbase) { | 742 | if (bytes <= pgbase) { |
743 | nfs_direct_release_pages(data->pagevec, result); | 743 | nfs_direct_release_pages(data->pagevec, result); |
744 | nfs_writedata_release(data); | 744 | nfs_writedata_free(data); |
745 | break; | 745 | break; |
746 | } | 746 | } |
747 | bytes -= pgbase; | 747 | bytes -= pgbase; |
@@ -756,7 +756,7 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq, | |||
756 | data->inode = inode; | 756 | data->inode = inode; |
757 | data->cred = msg.rpc_cred; | 757 | data->cred = msg.rpc_cred; |
758 | data->args.fh = NFS_FH(inode); | 758 | data->args.fh = NFS_FH(inode); |
759 | data->args.context = get_nfs_open_context(ctx); | 759 | data->args.context = ctx; |
760 | data->args.offset = pos; | 760 | data->args.offset = pos; |
761 | data->args.pgbase = pgbase; | 761 | data->args.pgbase = pgbase; |
762 | data->args.pages = data->pagevec; | 762 | data->args.pages = data->pagevec; |
@@ -934,9 +934,6 @@ out: | |||
934 | * back into its cache. We let the server do generic write | 934 | * back into its cache. We let the server do generic write |
935 | * parameter checking and report problems. | 935 | * parameter checking and report problems. |
936 | * | 936 | * |
937 | * We also avoid an unnecessary invocation of generic_osync_inode(), | ||
938 | * as it is fairly meaningless to sync the metadata of an NFS file. | ||
939 | * | ||
940 | * We eliminate local atime updates, see direct read above. | 937 | * We eliminate local atime updates, see direct read above. |
941 | * | 938 | * |
942 | * We avoid unnecessary page cache invalidations for normal cached | 939 | * We avoid unnecessary page cache invalidations for normal cached |
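All of the *_release() to *_free() swaps in this file follow from the two context assignments above: the direct-I/O request (dreq) already holds a reference to the open context for its entire lifetime, so taking an extra reference per RPC and dropping it in the *_release() helpers was redundant. Condensed, as a reading of these hunks rather than new behaviour:

/*
 * Before: each RPC pinned the context itself,
 *	data->args.context = get_nfs_open_context(ctx);
 * and nfs_readdata_release() paired the put_nfs_open_context()
 * with the free.
 *
 * After: the dreq's long-lived reference is simply borrowed,
 *	data->args.context = ctx;
 * and nfs_readdata_free() (likewise nfs_writedata_free() and
 * nfs_commit_free()) only frees, because there is no per-RPC
 * reference left to drop.
 */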
diff --git a/fs/nfs/dns_resolve.c b/fs/nfs/dns_resolve.c new file mode 100644 index 000000000000..f4d54ba97cc6 --- /dev/null +++ b/fs/nfs/dns_resolve.c | |||
@@ -0,0 +1,335 @@ | |||
1 | /* | ||
2 | * linux/fs/nfs/dns_resolve.c | ||
3 | * | ||
4 | * Copyright (c) 2009 Trond Myklebust <Trond.Myklebust@netapp.com> | ||
5 | * | ||
6 | * Resolves DNS hostnames into valid IP addresses | ||
7 | */ | ||
8 | |||
9 | #include <linux/hash.h> | ||
10 | #include <linux/string.h> | ||
11 | #include <linux/kmod.h> | ||
12 | #include <linux/module.h> | ||
13 | #include <linux/socket.h> | ||
14 | #include <linux/seq_file.h> | ||
15 | #include <linux/inet.h> | ||
16 | #include <linux/sunrpc/clnt.h> | ||
17 | #include <linux/sunrpc/cache.h> | ||
18 | #include <linux/sunrpc/svcauth.h> | ||
19 | |||
20 | #include "dns_resolve.h" | ||
21 | #include "cache_lib.h" | ||
22 | |||
23 | #define NFS_DNS_HASHBITS 4 | ||
24 | #define NFS_DNS_HASHTBL_SIZE (1 << NFS_DNS_HASHBITS) | ||
25 | |||
26 | static struct cache_head *nfs_dns_table[NFS_DNS_HASHTBL_SIZE]; | ||
27 | |||
28 | struct nfs_dns_ent { | ||
29 | struct cache_head h; | ||
30 | |||
31 | char *hostname; | ||
32 | size_t namelen; | ||
33 | |||
34 | struct sockaddr_storage addr; | ||
35 | size_t addrlen; | ||
36 | }; | ||
37 | |||
38 | |||
39 | static void nfs_dns_ent_init(struct cache_head *cnew, | ||
40 | struct cache_head *ckey) | ||
41 | { | ||
42 | struct nfs_dns_ent *new; | ||
43 | struct nfs_dns_ent *key; | ||
44 | |||
45 | new = container_of(cnew, struct nfs_dns_ent, h); | ||
46 | key = container_of(ckey, struct nfs_dns_ent, h); | ||
47 | |||
48 | kfree(new->hostname); | ||
49 | new->hostname = kstrndup(key->hostname, key->namelen, GFP_KERNEL); | ||
50 | if (new->hostname) { | ||
51 | new->namelen = key->namelen; | ||
52 | memcpy(&new->addr, &key->addr, key->addrlen); | ||
53 | new->addrlen = key->addrlen; | ||
54 | } else { | ||
55 | new->namelen = 0; | ||
56 | new->addrlen = 0; | ||
57 | } | ||
58 | } | ||
59 | |||
60 | static void nfs_dns_ent_put(struct kref *ref) | ||
61 | { | ||
62 | struct nfs_dns_ent *item; | ||
63 | |||
64 | item = container_of(ref, struct nfs_dns_ent, h.ref); | ||
65 | kfree(item->hostname); | ||
66 | kfree(item); | ||
67 | } | ||
68 | |||
69 | static struct cache_head *nfs_dns_ent_alloc(void) | ||
70 | { | ||
71 | struct nfs_dns_ent *item = kmalloc(sizeof(*item), GFP_KERNEL); | ||
72 | |||
73 | if (item != NULL) { | ||
74 | item->hostname = NULL; | ||
75 | item->namelen = 0; | ||
76 | item->addrlen = 0; | ||
77 | return &item->h; | ||
78 | } | ||
79 | return NULL; | ||
80 | }; | ||
81 | |||
82 | static unsigned int nfs_dns_hash(const struct nfs_dns_ent *key) | ||
83 | { | ||
84 | return hash_str(key->hostname, NFS_DNS_HASHBITS); | ||
85 | } | ||
86 | |||
87 | static void nfs_dns_request(struct cache_detail *cd, | ||
88 | struct cache_head *ch, | ||
89 | char **bpp, int *blen) | ||
90 | { | ||
91 | struct nfs_dns_ent *key = container_of(ch, struct nfs_dns_ent, h); | ||
92 | |||
93 | qword_add(bpp, blen, key->hostname); | ||
94 | (*bpp)[-1] = '\n'; | ||
95 | } | ||
96 | |||
97 | static int nfs_dns_upcall(struct cache_detail *cd, | ||
98 | struct cache_head *ch) | ||
99 | { | ||
100 | struct nfs_dns_ent *key = container_of(ch, struct nfs_dns_ent, h); | ||
101 | int ret; | ||
102 | |||
103 | ret = nfs_cache_upcall(cd, key->hostname); | ||
104 | if (ret) | ||
105 | ret = sunrpc_cache_pipe_upcall(cd, ch, nfs_dns_request); | ||
106 | return ret; | ||
107 | } | ||
108 | |||
109 | static int nfs_dns_match(struct cache_head *ca, | ||
110 | struct cache_head *cb) | ||
111 | { | ||
112 | struct nfs_dns_ent *a; | ||
113 | struct nfs_dns_ent *b; | ||
114 | |||
115 | a = container_of(ca, struct nfs_dns_ent, h); | ||
116 | b = container_of(cb, struct nfs_dns_ent, h); | ||
117 | |||
118 | if (a->namelen == 0 || a->namelen != b->namelen) | ||
119 | return 0; | ||
120 | return memcmp(a->hostname, b->hostname, a->namelen) == 0; | ||
121 | } | ||
122 | |||
123 | static int nfs_dns_show(struct seq_file *m, struct cache_detail *cd, | ||
124 | struct cache_head *h) | ||
125 | { | ||
126 | struct nfs_dns_ent *item; | ||
127 | long ttl; | ||
128 | |||
129 | if (h == NULL) { | ||
130 | seq_puts(m, "# ip address hostname ttl\n"); | ||
131 | return 0; | ||
132 | } | ||
133 | item = container_of(h, struct nfs_dns_ent, h); | ||
134 | ttl = (long)item->h.expiry_time - (long)get_seconds(); | ||
135 | if (ttl < 0) | ||
136 | ttl = 0; | ||
137 | |||
138 | if (!test_bit(CACHE_NEGATIVE, &h->flags)) { | ||
139 | char buf[INET6_ADDRSTRLEN+IPV6_SCOPE_ID_LEN+1]; | ||
140 | |||
141 | rpc_ntop((struct sockaddr *)&item->addr, buf, sizeof(buf)); | ||
142 | seq_printf(m, "%15s ", buf); | ||
143 | } else | ||
144 | seq_puts(m, "<none> "); | ||
145 | seq_printf(m, "%15s %ld\n", item->hostname, ttl); | ||
146 | return 0; | ||
147 | } | ||
148 | |||
149 | struct nfs_dns_ent *nfs_dns_lookup(struct cache_detail *cd, | ||
150 | struct nfs_dns_ent *key) | ||
151 | { | ||
152 | struct cache_head *ch; | ||
153 | |||
154 | ch = sunrpc_cache_lookup(cd, | ||
155 | &key->h, | ||
156 | nfs_dns_hash(key)); | ||
157 | if (!ch) | ||
158 | return NULL; | ||
159 | return container_of(ch, struct nfs_dns_ent, h); | ||
160 | } | ||
161 | |||
162 | struct nfs_dns_ent *nfs_dns_update(struct cache_detail *cd, | ||
163 | struct nfs_dns_ent *new, | ||
164 | struct nfs_dns_ent *key) | ||
165 | { | ||
166 | struct cache_head *ch; | ||
167 | |||
168 | ch = sunrpc_cache_update(cd, | ||
169 | &new->h, &key->h, | ||
170 | nfs_dns_hash(key)); | ||
171 | if (!ch) | ||
172 | return NULL; | ||
173 | return container_of(ch, struct nfs_dns_ent, h); | ||
174 | } | ||
175 | |||
176 | static int nfs_dns_parse(struct cache_detail *cd, char *buf, int buflen) | ||
177 | { | ||
178 | char buf1[NFS_DNS_HOSTNAME_MAXLEN+1]; | ||
179 | struct nfs_dns_ent key, *item; | ||
180 | unsigned long ttl; | ||
181 | ssize_t len; | ||
182 | int ret = -EINVAL; | ||
183 | |||
184 | if (buf[buflen-1] != '\n') | ||
185 | goto out; | ||
186 | buf[buflen-1] = '\0'; | ||
187 | |||
188 | len = qword_get(&buf, buf1, sizeof(buf1)); | ||
189 | if (len <= 0) | ||
190 | goto out; | ||
191 | key.addrlen = rpc_pton(buf1, len, | ||
192 | (struct sockaddr *)&key.addr, | ||
193 | sizeof(key.addr)); | ||
194 | |||
195 | len = qword_get(&buf, buf1, sizeof(buf1)); | ||
196 | if (len <= 0) | ||
197 | goto out; | ||
198 | |||
199 | key.hostname = buf1; | ||
200 | key.namelen = len; | ||
201 | memset(&key.h, 0, sizeof(key.h)); | ||
202 | |||
203 | ttl = get_expiry(&buf); | ||
204 | if (ttl == 0) | ||
205 | goto out; | ||
206 | key.h.expiry_time = ttl + get_seconds(); | ||
207 | |||
208 | ret = -ENOMEM; | ||
209 | item = nfs_dns_lookup(cd, &key); | ||
210 | if (item == NULL) | ||
211 | goto out; | ||
212 | |||
213 | if (key.addrlen == 0) | ||
214 | set_bit(CACHE_NEGATIVE, &key.h.flags); | ||
215 | |||
216 | item = nfs_dns_update(cd, &key, item); | ||
217 | if (item == NULL) | ||
218 | goto out; | ||
219 | |||
220 | ret = 0; | ||
221 | cache_put(&item->h, cd); | ||
222 | out: | ||
223 | return ret; | ||
224 | } | ||
225 | |||
226 | static struct cache_detail nfs_dns_resolve = { | ||
227 | .owner = THIS_MODULE, | ||
228 | .hash_size = NFS_DNS_HASHTBL_SIZE, | ||
229 | .hash_table = nfs_dns_table, | ||
230 | .name = "dns_resolve", | ||
231 | .cache_put = nfs_dns_ent_put, | ||
232 | .cache_upcall = nfs_dns_upcall, | ||
233 | .cache_parse = nfs_dns_parse, | ||
234 | .cache_show = nfs_dns_show, | ||
235 | .match = nfs_dns_match, | ||
236 | .init = nfs_dns_ent_init, | ||
237 | .update = nfs_dns_ent_init, | ||
238 | .alloc = nfs_dns_ent_alloc, | ||
239 | }; | ||
240 | |||
241 | static int do_cache_lookup(struct cache_detail *cd, | ||
242 | struct nfs_dns_ent *key, | ||
243 | struct nfs_dns_ent **item, | ||
244 | struct nfs_cache_defer_req *dreq) | ||
245 | { | ||
246 | int ret = -ENOMEM; | ||
247 | |||
248 | *item = nfs_dns_lookup(cd, key); | ||
249 | if (*item) { | ||
250 | ret = cache_check(cd, &(*item)->h, &dreq->req); | ||
251 | if (ret) | ||
252 | *item = NULL; | ||
253 | } | ||
254 | return ret; | ||
255 | } | ||
256 | |||
257 | static int do_cache_lookup_nowait(struct cache_detail *cd, | ||
258 | struct nfs_dns_ent *key, | ||
259 | struct nfs_dns_ent **item) | ||
260 | { | ||
261 | int ret = -ENOMEM; | ||
262 | |||
263 | *item = nfs_dns_lookup(cd, key); | ||
264 | if (!*item) | ||
265 | goto out_err; | ||
266 | ret = -ETIMEDOUT; | ||
267 | if (!test_bit(CACHE_VALID, &(*item)->h.flags) | ||
268 | || (*item)->h.expiry_time < get_seconds() | ||
269 | || cd->flush_time > (*item)->h.last_refresh) | ||
270 | goto out_put; | ||
271 | ret = -ENOENT; | ||
272 | if (test_bit(CACHE_NEGATIVE, &(*item)->h.flags)) | ||
273 | goto out_put; | ||
274 | return 0; | ||
275 | out_put: | ||
276 | cache_put(&(*item)->h, cd); | ||
277 | out_err: | ||
278 | *item = NULL; | ||
279 | return ret; | ||
280 | } | ||
281 | |||
282 | static int do_cache_lookup_wait(struct cache_detail *cd, | ||
283 | struct nfs_dns_ent *key, | ||
284 | struct nfs_dns_ent **item) | ||
285 | { | ||
286 | struct nfs_cache_defer_req *dreq; | ||
287 | int ret = -ENOMEM; | ||
288 | |||
289 | dreq = nfs_cache_defer_req_alloc(); | ||
290 | if (!dreq) | ||
291 | goto out; | ||
292 | ret = do_cache_lookup(cd, key, item, dreq); | ||
293 | if (ret == -EAGAIN) { | ||
294 | ret = nfs_cache_wait_for_upcall(dreq); | ||
295 | if (!ret) | ||
296 | ret = do_cache_lookup_nowait(cd, key, item); | ||
297 | } | ||
298 | nfs_cache_defer_req_put(dreq); | ||
299 | out: | ||
300 | return ret; | ||
301 | } | ||
302 | |||
303 | ssize_t nfs_dns_resolve_name(char *name, size_t namelen, | ||
304 | struct sockaddr *sa, size_t salen) | ||
305 | { | ||
306 | struct nfs_dns_ent key = { | ||
307 | .hostname = name, | ||
308 | .namelen = namelen, | ||
309 | }; | ||
310 | struct nfs_dns_ent *item = NULL; | ||
311 | ssize_t ret; | ||
312 | |||
313 | ret = do_cache_lookup_wait(&nfs_dns_resolve, &key, &item); | ||
314 | if (ret == 0) { | ||
315 | if (salen >= item->addrlen) { | ||
316 | memcpy(sa, &item->addr, item->addrlen); | ||
317 | ret = item->addrlen; | ||
318 | } else | ||
319 | ret = -EOVERFLOW; | ||
320 | cache_put(&item->h, &nfs_dns_resolve); | ||
321 | } else if (ret == -ENOENT) | ||
322 | ret = -ESRCH; | ||
323 | return ret; | ||
324 | } | ||
325 | |||
326 | int nfs_dns_resolver_init(void) | ||
327 | { | ||
328 | return nfs_cache_register(&nfs_dns_resolve); | ||
329 | } | ||
330 | |||
331 | void nfs_dns_resolver_destroy(void) | ||
332 | { | ||
333 | nfs_cache_unregister(&nfs_dns_resolve); | ||
334 | } | ||
335 | |||
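The exported entry point is nfs_dns_resolve_name(). A hedged usage sketch, with illustrative variable names (the only in-tree caller added by this merge is nfs_parse_server_name() in fs/nfs/nfs4namespace.c below):

struct sockaddr_storage ss;
ssize_t salen;

salen = nfs_dns_resolve_name(hostname, strlen(hostname),
			     (struct sockaddr *)&ss, sizeof(ss));
if (salen > 0)
	rpc_set_port((struct sockaddr *)&ss, NFS_PORT);	/* usable address */
else if (salen == -ESRCH)
	;	/* userspace answered negatively: no such host */
else
	;	/* -ETIMEDOUT, -ENOMEM, ...: the upcall machinery failed */

Note the downcall format implied by nfs_dns_parse(): one newline-terminated line of "<ip-address> <hostname> <ttl>", where an address that rpc_pton() cannot parse marks the entry negative.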
diff --git a/fs/nfs/dns_resolve.h b/fs/nfs/dns_resolve.h new file mode 100644 index 000000000000..a3f0938babf7 --- /dev/null +++ b/fs/nfs/dns_resolve.h | |||
@@ -0,0 +1,14 @@ | |||
1 | /* | ||
2 | * Resolve DNS hostnames into valid IP addresses | ||
3 | */ | ||
4 | #ifndef __LINUX_FS_NFS_DNS_RESOLVE_H | ||
5 | #define __LINUX_FS_NFS_DNS_RESOLVE_H | ||
6 | |||
7 | #define NFS_DNS_HOSTNAME_MAXLEN (128) | ||
8 | |||
9 | extern int nfs_dns_resolver_init(void); | ||
10 | extern void nfs_dns_resolver_destroy(void); | ||
11 | extern ssize_t nfs_dns_resolve_name(char *name, size_t namelen, | ||
12 | struct sockaddr *sa, size_t salen); | ||
13 | |||
14 | #endif | ||
diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 05062329b678..5021b75d2d1e 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c | |||
@@ -328,6 +328,42 @@ nfs_file_fsync(struct file *file, struct dentry *dentry, int datasync) | |||
328 | } | 328 | } |
329 | 329 | ||
330 | /* | 330 | /* |
331 | * Decide whether a read/modify/write cycle may be more efficient | ||
332 | * than a modify/write/read cycle when writing to a page in the | ||
333 | * page cache. | ||
334 | * | ||
335 | * The modify/write/read cycle may occur if a page is read before | ||
336 | * being completely filled by the writer. In this situation, the | ||
337 | * page must be completely written to stable storage on the server | ||
338 | * before it can be refilled by reading in the page from the server. | ||
339 | * This can lead to expensive, small, FILE_SYNC mode writes being | ||
340 | * done. | ||
341 | * | ||
342 | * It may be more efficient to read the page first if the file is | ||
343 | * open for reading in addition to writing, the page is not marked | ||
344 | * as Uptodate, it is not dirty or waiting to be committed (either | ||
345 | * of which would indicate that the page was previously allocated | ||
346 | * and then modified), there are valid bytes of data in that range | ||
347 | * of the file, and the new data won't completely replace the old | ||
348 | * data in that range of the file. | ||
349 | */ | ||
350 | static int nfs_want_read_modify_write(struct file *file, struct page *page, | ||
351 | loff_t pos, unsigned len) | ||
352 | { | ||
353 | unsigned int pglen = nfs_page_length(page); | ||
354 | unsigned int offset = pos & (PAGE_CACHE_SIZE - 1); | ||
355 | unsigned int end = offset + len; | ||
356 | |||
357 | if ((file->f_mode & FMODE_READ) &&	/* open for read? */ | ||
358 | !PageUptodate(page) &&		/* not already Uptodate? */ | ||
359 | !PagePrivate(page) &&		/* no i/o request attached? */ | ||
360 | pglen &&			/* valid bytes of file? */ | ||
361 | (end < pglen || offset))	/* won't replace all valid bytes? */ | ||
362 | return 1; | ||
363 | return 0; | ||
364 | } | ||
365 | |||
366 | /* | ||
331 | * This does the "real" work of the write. We must allocate and lock the | 367 | * This does the "real" work of the write. We must allocate and lock the |
332 | * page to be sent back to the generic routine, which then copies the | 368 | * page to be sent back to the generic routine, which then copies the |
333 | * data from user space. | 369 | * data from user space. |
@@ -340,15 +376,16 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping, | |||
340 | struct page **pagep, void **fsdata) | 376 | struct page **pagep, void **fsdata) |
341 | { | 377 | { |
342 | int ret; | 378 | int ret; |
343 | pgoff_t index; | 379 | pgoff_t index = pos >> PAGE_CACHE_SHIFT; |
344 | struct page *page; | 380 | struct page *page; |
345 | index = pos >> PAGE_CACHE_SHIFT; | 381 | int once_thru = 0; |
346 | 382 | ||
347 | dfprintk(PAGECACHE, "NFS: write_begin(%s/%s(%ld), %u@%lld)\n", | 383 | dfprintk(PAGECACHE, "NFS: write_begin(%s/%s(%ld), %u@%lld)\n", |
348 | file->f_path.dentry->d_parent->d_name.name, | 384 | file->f_path.dentry->d_parent->d_name.name, |
349 | file->f_path.dentry->d_name.name, | 385 | file->f_path.dentry->d_name.name, |
350 | mapping->host->i_ino, len, (long long) pos); | 386 | mapping->host->i_ino, len, (long long) pos); |
351 | 387 | ||
388 | start: | ||
352 | /* | 389 | /* |
353 | * Prevent starvation issues if someone is doing a consistency | 390 | * Prevent starvation issues if someone is doing a consistency |
354 | * sync-to-disk | 391 | * sync-to-disk |
@@ -367,6 +404,13 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping, | |||
367 | if (ret) { | 404 | if (ret) { |
368 | unlock_page(page); | 405 | unlock_page(page); |
369 | page_cache_release(page); | 406 | page_cache_release(page); |
407 | } else if (!once_thru && | ||
408 | nfs_want_read_modify_write(file, page, pos, len)) { | ||
409 | once_thru = 1; | ||
410 | ret = nfs_readpage(file, page); | ||
411 | page_cache_release(page); | ||
412 | if (!ret) | ||
413 | goto start; | ||
370 | } | 414 | } |
371 | return ret; | 415 | return ret; |
372 | } | 416 | } |
@@ -479,6 +523,7 @@ const struct address_space_operations nfs_file_aops = { | |||
479 | .invalidatepage = nfs_invalidate_page, | 523 | .invalidatepage = nfs_invalidate_page, |
480 | .releasepage = nfs_release_page, | 524 | .releasepage = nfs_release_page, |
481 | .direct_IO = nfs_direct_IO, | 525 | .direct_IO = nfs_direct_IO, |
526 | .migratepage = nfs_migrate_page, | ||
482 | .launder_page = nfs_launder_page, | 527 | .launder_page = nfs_launder_page, |
483 | }; | 528 | }; |
484 | 529 | ||
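A quick worked case of the heuristic above, with assumed numbers (PAGE_CACHE_SIZE = 4096):

/*
 * Write of len = 200 at pos = 10 * 4096 + 100, file open O_RDWR,
 * interior page of a large file:
 *	offset = pos & 4095 = 100, end = 300, pglen = 4096
 * Page not Uptodate, no request attached (!PagePrivate), so
 *	(end < pglen || offset) -> (300 < 4096 || 100) -> true
 * and nfs_write_begin() reads the page once; the eventual flush can
 * then be a single whole-page write instead of a small FILE_SYNC
 * write followed by a read back from the server.
 */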
diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c index 86147b0ab2cf..21a84d45916f 100644 --- a/fs/nfs/idmap.c +++ b/fs/nfs/idmap.c | |||
@@ -101,7 +101,7 @@ static void idmap_pipe_destroy_msg(struct rpc_pipe_msg *); | |||
101 | 101 | ||
102 | static unsigned int fnvhash32(const void *, size_t); | 102 | static unsigned int fnvhash32(const void *, size_t); |
103 | 103 | ||
104 | static struct rpc_pipe_ops idmap_upcall_ops = { | 104 | static const struct rpc_pipe_ops idmap_upcall_ops = { |
105 | .upcall = idmap_pipe_upcall, | 105 | .upcall = idmap_pipe_upcall, |
106 | .downcall = idmap_pipe_downcall, | 106 | .downcall = idmap_pipe_downcall, |
107 | .destroy_msg = idmap_pipe_destroy_msg, | 107 | .destroy_msg = idmap_pipe_destroy_msg, |
@@ -119,8 +119,8 @@ nfs_idmap_new(struct nfs_client *clp) | |||
119 | if (idmap == NULL) | 119 | if (idmap == NULL) |
120 | return -ENOMEM; | 120 | return -ENOMEM; |
121 | 121 | ||
122 | idmap->idmap_dentry = rpc_mkpipe(clp->cl_rpcclient->cl_dentry, "idmap", | 122 | idmap->idmap_dentry = rpc_mkpipe(clp->cl_rpcclient->cl_path.dentry, |
123 | idmap, &idmap_upcall_ops, 0); | 123 | "idmap", idmap, &idmap_upcall_ops, 0); |
124 | if (IS_ERR(idmap->idmap_dentry)) { | 124 | if (IS_ERR(idmap->idmap_dentry)) { |
125 | error = PTR_ERR(idmap->idmap_dentry); | 125 | error = PTR_ERR(idmap->idmap_dentry); |
126 | kfree(idmap); | 126 | kfree(idmap); |
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index bd7938eda6a8..060022b4651c 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c | |||
@@ -46,6 +46,7 @@ | |||
46 | #include "iostat.h" | 46 | #include "iostat.h" |
47 | #include "internal.h" | 47 | #include "internal.h" |
48 | #include "fscache.h" | 48 | #include "fscache.h" |
49 | #include "dns_resolve.h" | ||
49 | 50 | ||
50 | #define NFSDBG_FACILITY NFSDBG_VFS | 51 | #define NFSDBG_FACILITY NFSDBG_VFS |
51 | 52 | ||
@@ -286,6 +287,11 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) | |||
286 | /* We can't support update_atime(), since the server will reset it */ | 287 | /* We can't support update_atime(), since the server will reset it */ |
287 | inode->i_flags |= S_NOATIME|S_NOCMTIME; | 288 | inode->i_flags |= S_NOATIME|S_NOCMTIME; |
288 | inode->i_mode = fattr->mode; | 289 | inode->i_mode = fattr->mode; |
290 | if ((fattr->valid & NFS_ATTR_FATTR_MODE) == 0 | ||
291 | && nfs_server_capable(inode, NFS_CAP_MODE)) | ||
292 | nfsi->cache_validity |= NFS_INO_INVALID_ATTR | ||
293 | | NFS_INO_INVALID_ACCESS | ||
294 | | NFS_INO_INVALID_ACL; | ||
289 | /* Why so? Because we want revalidate for devices/FIFOs, and | 295 | /* Why so? Because we want revalidate for devices/FIFOs, and |
290 | * that's precisely what we have in nfs_file_inode_operations. | 296 | * that's precisely what we have in nfs_file_inode_operations. |
291 | */ | 297 | */ |
@@ -330,20 +336,46 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) | |||
330 | nfsi->attr_gencount = fattr->gencount; | 336 | nfsi->attr_gencount = fattr->gencount; |
331 | if (fattr->valid & NFS_ATTR_FATTR_ATIME) | 337 | if (fattr->valid & NFS_ATTR_FATTR_ATIME) |
332 | inode->i_atime = fattr->atime; | 338 | inode->i_atime = fattr->atime; |
339 | else if (nfs_server_capable(inode, NFS_CAP_ATIME)) | ||
340 | nfsi->cache_validity |= NFS_INO_INVALID_ATTR; | ||
333 | if (fattr->valid & NFS_ATTR_FATTR_MTIME) | 341 | if (fattr->valid & NFS_ATTR_FATTR_MTIME) |
334 | inode->i_mtime = fattr->mtime; | 342 | inode->i_mtime = fattr->mtime; |
343 | else if (nfs_server_capable(inode, NFS_CAP_MTIME)) | ||
344 | nfsi->cache_validity |= NFS_INO_INVALID_ATTR | ||
345 | | NFS_INO_INVALID_DATA; | ||
335 | if (fattr->valid & NFS_ATTR_FATTR_CTIME) | 346 | if (fattr->valid & NFS_ATTR_FATTR_CTIME) |
336 | inode->i_ctime = fattr->ctime; | 347 | inode->i_ctime = fattr->ctime; |
348 | else if (nfs_server_capable(inode, NFS_CAP_CTIME)) | ||
349 | nfsi->cache_validity |= NFS_INO_INVALID_ATTR | ||
350 | | NFS_INO_INVALID_ACCESS | ||
351 | | NFS_INO_INVALID_ACL; | ||
337 | if (fattr->valid & NFS_ATTR_FATTR_CHANGE) | 352 | if (fattr->valid & NFS_ATTR_FATTR_CHANGE) |
338 | nfsi->change_attr = fattr->change_attr; | 353 | nfsi->change_attr = fattr->change_attr; |
354 | else if (nfs_server_capable(inode, NFS_CAP_CHANGE_ATTR)) | ||
355 | nfsi->cache_validity |= NFS_INO_INVALID_ATTR | ||
356 | | NFS_INO_INVALID_DATA; | ||
339 | if (fattr->valid & NFS_ATTR_FATTR_SIZE) | 357 | if (fattr->valid & NFS_ATTR_FATTR_SIZE) |
340 | inode->i_size = nfs_size_to_loff_t(fattr->size); | 358 | inode->i_size = nfs_size_to_loff_t(fattr->size); |
359 | else | ||
360 | nfsi->cache_validity |= NFS_INO_INVALID_ATTR | ||
361 | | NFS_INO_INVALID_DATA | ||
362 | | NFS_INO_REVAL_PAGECACHE; | ||
341 | if (fattr->valid & NFS_ATTR_FATTR_NLINK) | 363 | if (fattr->valid & NFS_ATTR_FATTR_NLINK) |
342 | inode->i_nlink = fattr->nlink; | 364 | inode->i_nlink = fattr->nlink; |
365 | else if (nfs_server_capable(inode, NFS_CAP_NLINK)) | ||
366 | nfsi->cache_validity |= NFS_INO_INVALID_ATTR; | ||
343 | if (fattr->valid & NFS_ATTR_FATTR_OWNER) | 367 | if (fattr->valid & NFS_ATTR_FATTR_OWNER) |
344 | inode->i_uid = fattr->uid; | 368 | inode->i_uid = fattr->uid; |
369 | else if (nfs_server_capable(inode, NFS_CAP_OWNER)) | ||
370 | nfsi->cache_validity |= NFS_INO_INVALID_ATTR | ||
371 | | NFS_INO_INVALID_ACCESS | ||
372 | | NFS_INO_INVALID_ACL; | ||
345 | if (fattr->valid & NFS_ATTR_FATTR_GROUP) | 373 | if (fattr->valid & NFS_ATTR_FATTR_GROUP) |
346 | inode->i_gid = fattr->gid; | 374 | inode->i_gid = fattr->gid; |
375 | else if (nfs_server_capable(inode, NFS_CAP_OWNER_GROUP)) | ||
376 | nfsi->cache_validity |= NFS_INO_INVALID_ATTR | ||
377 | | NFS_INO_INVALID_ACCESS | ||
378 | | NFS_INO_INVALID_ACL; | ||
347 | if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED) | 379 | if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED) |
348 | inode->i_blocks = fattr->du.nfs2.blocks; | 380 | inode->i_blocks = fattr->du.nfs2.blocks; |
349 | if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) { | 381 | if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) { |
@@ -1145,6 +1177,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) | |||
1145 | loff_t cur_isize, new_isize; | 1177 | loff_t cur_isize, new_isize; |
1146 | unsigned long invalid = 0; | 1178 | unsigned long invalid = 0; |
1147 | unsigned long now = jiffies; | 1179 | unsigned long now = jiffies; |
1180 | unsigned long save_cache_validity; | ||
1148 | 1181 | ||
1149 | dfprintk(VFS, "NFS: %s(%s/%ld ct=%d info=0x%x)\n", | 1182 | dfprintk(VFS, "NFS: %s(%s/%ld ct=%d info=0x%x)\n", |
1150 | __func__, inode->i_sb->s_id, inode->i_ino, | 1183 | __func__, inode->i_sb->s_id, inode->i_ino, |
@@ -1171,10 +1204,11 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) | |||
1171 | */ | 1204 | */ |
1172 | nfsi->read_cache_jiffies = fattr->time_start; | 1205 | nfsi->read_cache_jiffies = fattr->time_start; |
1173 | 1206 | ||
1174 | if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) || (fattr->valid & (NFS_ATTR_FATTR_MTIME|NFS_ATTR_FATTR_CTIME))) | 1207 | save_cache_validity = nfsi->cache_validity; |
1175 | nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR | 1208 | nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR |
1176 | | NFS_INO_INVALID_ATIME | 1209 | | NFS_INO_INVALID_ATIME |
1177 | | NFS_INO_REVAL_PAGECACHE); | 1210 | | NFS_INO_REVAL_FORCED |
1211 | | NFS_INO_REVAL_PAGECACHE); | ||
1178 | 1212 | ||
1179 | /* Do atomic weak cache consistency updates */ | 1213 | /* Do atomic weak cache consistency updates */ |
1180 | nfs_wcc_update_inode(inode, fattr); | 1214 | nfs_wcc_update_inode(inode, fattr); |
@@ -1189,7 +1223,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) | |||
1189 | nfs_force_lookup_revalidate(inode); | 1223 | nfs_force_lookup_revalidate(inode); |
1190 | nfsi->change_attr = fattr->change_attr; | 1224 | nfsi->change_attr = fattr->change_attr; |
1191 | } | 1225 | } |
1192 | } | 1226 | } else if (server->caps & NFS_CAP_CHANGE_ATTR) |
1227 | invalid |= save_cache_validity; | ||
1193 | 1228 | ||
1194 | if (fattr->valid & NFS_ATTR_FATTR_MTIME) { | 1229 | if (fattr->valid & NFS_ATTR_FATTR_MTIME) { |
1195 | /* NFSv2/v3: Check if the mtime agrees */ | 1230 | /* NFSv2/v3: Check if the mtime agrees */ |
@@ -1201,7 +1236,12 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) | |||
1201 | nfs_force_lookup_revalidate(inode); | 1236 | nfs_force_lookup_revalidate(inode); |
1202 | memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); | 1237 | memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime)); |
1203 | } | 1238 | } |
1204 | } | 1239 | } else if (server->caps & NFS_CAP_MTIME) |
1240 | invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR | ||
1241 | | NFS_INO_INVALID_DATA | ||
1242 | | NFS_INO_REVAL_PAGECACHE | ||
1243 | | NFS_INO_REVAL_FORCED); | ||
1244 | |||
1205 | if (fattr->valid & NFS_ATTR_FATTR_CTIME) { | 1245 | if (fattr->valid & NFS_ATTR_FATTR_CTIME) { |
1206 | /* If ctime has changed we should definitely clear access+acl caches */ | 1246 | /* If ctime has changed we should definitely clear access+acl caches */ |
1207 | if (!timespec_equal(&inode->i_ctime, &fattr->ctime)) { | 1247 | if (!timespec_equal(&inode->i_ctime, &fattr->ctime)) { |
@@ -1215,7 +1255,11 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) | |||
1215 | } | 1255 | } |
1216 | memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); | 1256 | memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime)); |
1217 | } | 1257 | } |
1218 | } | 1258 | } else if (server->caps & NFS_CAP_CTIME) |
1259 | invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR | ||
1260 | | NFS_INO_INVALID_ACCESS | ||
1261 | | NFS_INO_INVALID_ACL | ||
1262 | | NFS_INO_REVAL_FORCED); | ||
1219 | 1263 | ||
1220 | /* Check if our cached file size is stale */ | 1264 | /* Check if our cached file size is stale */ |
1221 | if (fattr->valid & NFS_ATTR_FATTR_SIZE) { | 1265 | if (fattr->valid & NFS_ATTR_FATTR_SIZE) { |
@@ -1231,30 +1275,50 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) | |||
1231 | dprintk("NFS: isize change on server for file %s/%ld\n", | 1275 | dprintk("NFS: isize change on server for file %s/%ld\n", |
1232 | inode->i_sb->s_id, inode->i_ino); | 1276 | inode->i_sb->s_id, inode->i_ino); |
1233 | } | 1277 | } |
1234 | } | 1278 | } else |
1279 | invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR | ||
1280 | | NFS_INO_REVAL_PAGECACHE | ||
1281 | | NFS_INO_REVAL_FORCED); | ||
1235 | 1282 | ||
1236 | 1283 | ||
1237 | if (fattr->valid & NFS_ATTR_FATTR_ATIME) | 1284 | if (fattr->valid & NFS_ATTR_FATTR_ATIME) |
1238 | memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime)); | 1285 | memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime)); |
1286 | else if (server->caps & NFS_CAP_ATIME) | ||
1287 | invalid |= save_cache_validity & (NFS_INO_INVALID_ATIME | ||
1288 | | NFS_INO_REVAL_FORCED); | ||
1239 | 1289 | ||
1240 | if (fattr->valid & NFS_ATTR_FATTR_MODE) { | 1290 | if (fattr->valid & NFS_ATTR_FATTR_MODE) { |
1241 | if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) { | 1291 | if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) { |
1242 | invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; | 1292 | invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; |
1243 | inode->i_mode = fattr->mode; | 1293 | inode->i_mode = fattr->mode; |
1244 | } | 1294 | } |
1245 | } | 1295 | } else if (server->caps & NFS_CAP_MODE) |
1296 | invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR | ||
1297 | | NFS_INO_INVALID_ACCESS | ||
1298 | | NFS_INO_INVALID_ACL | ||
1299 | | NFS_INO_REVAL_FORCED); | ||
1300 | |||
1246 | if (fattr->valid & NFS_ATTR_FATTR_OWNER) { | 1301 | if (fattr->valid & NFS_ATTR_FATTR_OWNER) { |
1247 | if (inode->i_uid != fattr->uid) { | 1302 | if (inode->i_uid != fattr->uid) { |
1248 | invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; | 1303 | invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; |
1249 | inode->i_uid = fattr->uid; | 1304 | inode->i_uid = fattr->uid; |
1250 | } | 1305 | } |
1251 | } | 1306 | } else if (server->caps & NFS_CAP_OWNER) |
1307 | invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR | ||
1308 | | NFS_INO_INVALID_ACCESS | ||
1309 | | NFS_INO_INVALID_ACL | ||
1310 | | NFS_INO_REVAL_FORCED); | ||
1311 | |||
1252 | if (fattr->valid & NFS_ATTR_FATTR_GROUP) { | 1312 | if (fattr->valid & NFS_ATTR_FATTR_GROUP) { |
1253 | if (inode->i_gid != fattr->gid) { | 1313 | if (inode->i_gid != fattr->gid) { |
1254 | invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; | 1314 | invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; |
1255 | inode->i_gid = fattr->gid; | 1315 | inode->i_gid = fattr->gid; |
1256 | } | 1316 | } |
1257 | } | 1317 | } else if (server->caps & NFS_CAP_OWNER_GROUP) |
1318 | invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR | ||
1319 | | NFS_INO_INVALID_ACCESS | ||
1320 | | NFS_INO_INVALID_ACL | ||
1321 | | NFS_INO_REVAL_FORCED); | ||
1258 | 1322 | ||
1259 | if (fattr->valid & NFS_ATTR_FATTR_NLINK) { | 1323 | if (fattr->valid & NFS_ATTR_FATTR_NLINK) { |
1260 | if (inode->i_nlink != fattr->nlink) { | 1324 | if (inode->i_nlink != fattr->nlink) { |
@@ -1263,7 +1327,9 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) | |||
1263 | invalid |= NFS_INO_INVALID_DATA; | 1327 | invalid |= NFS_INO_INVALID_DATA; |
1264 | inode->i_nlink = fattr->nlink; | 1328 | inode->i_nlink = fattr->nlink; |
1265 | } | 1329 | } |
1266 | } | 1330 | } else if (server->caps & NFS_CAP_NLINK) |
1331 | invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR | ||
1332 | | NFS_INO_REVAL_FORCED); | ||
1267 | 1333 | ||
1268 | if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) { | 1334 | if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) { |
1269 | /* | 1335 | /* |
@@ -1293,9 +1359,8 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) | |||
1293 | || S_ISLNK(inode->i_mode))) | 1359 | || S_ISLNK(inode->i_mode))) |
1294 | invalid &= ~NFS_INO_INVALID_DATA; | 1360 | invalid &= ~NFS_INO_INVALID_DATA; |
1295 | if (!nfs_have_delegation(inode, FMODE_READ) || | 1361 | if (!nfs_have_delegation(inode, FMODE_READ) || |
1296 | (nfsi->cache_validity & NFS_INO_REVAL_FORCED)) | 1362 | (save_cache_validity & NFS_INO_REVAL_FORCED)) |
1297 | nfsi->cache_validity |= invalid; | 1363 | nfsi->cache_validity |= invalid; |
1298 | nfsi->cache_validity &= ~NFS_INO_REVAL_FORCED; | ||
1299 | 1364 | ||
1300 | return 0; | 1365 | return 0; |
1301 | out_changed: | 1366 | out_changed: |
@@ -1442,6 +1507,10 @@ static int __init init_nfs_fs(void) | |||
1442 | { | 1507 | { |
1443 | int err; | 1508 | int err; |
1444 | 1509 | ||
1510 | err = nfs_dns_resolver_init(); | ||
1511 | if (err < 0) | ||
1512 | goto out8; | ||
1513 | |||
1445 | err = nfs_fscache_register(); | 1514 | err = nfs_fscache_register(); |
1446 | if (err < 0) | 1515 | if (err < 0) |
1447 | goto out7; | 1516 | goto out7; |
@@ -1500,6 +1569,8 @@ out5: | |||
1500 | out6: | 1569 | out6: |
1501 | nfs_fscache_unregister(); | 1570 | nfs_fscache_unregister(); |
1502 | out7: | 1571 | out7: |
1572 | nfs_dns_resolver_destroy(); | ||
1573 | out8: | ||
1503 | return err; | 1574 | return err; |
1504 | } | 1575 | } |
1505 | 1576 | ||
@@ -1511,6 +1582,7 @@ static void __exit exit_nfs_fs(void) | |||
1511 | nfs_destroy_inodecache(); | 1582 | nfs_destroy_inodecache(); |
1512 | nfs_destroy_nfspagecache(); | 1583 | nfs_destroy_nfspagecache(); |
1513 | nfs_fscache_unregister(); | 1584 | nfs_fscache_unregister(); |
1585 | nfs_dns_resolver_destroy(); | ||
1514 | #ifdef CONFIG_PROC_FS | 1586 | #ifdef CONFIG_PROC_FS |
1515 | rpc_proc_unregister("nfs"); | 1587 | rpc_proc_unregister("nfs"); |
1516 | #endif | 1588 | #endif |
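Every attribute in nfs_update_inode() now follows the same three-way shape; condensed to its skeleton with mtime as the example (a summary of the hunks above, not new code):

save_cache_validity = nfsi->cache_validity;
nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ATIME
		| NFS_INO_REVAL_FORCED | NFS_INO_REVAL_PAGECACHE);

if (fattr->valid & NFS_ATTR_FATTR_MTIME) {
	/* the server reported the attribute: apply it */
	memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
} else if (server->caps & NFS_CAP_MTIME) {
	/* the server can report it but did not this time: re-arm the
	 * validity bits cleared above, instead of pretending the
	 * cached value was just revalidated */
	invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR
			| NFS_INO_INVALID_DATA
			| NFS_INO_REVAL_PAGECACHE
			| NFS_INO_REVAL_FORCED);
}
/* else: the server never reports mtime, so there is nothing to distrust */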
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 7dd90a6769d0..e21b1bb9972f 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h | |||
@@ -49,6 +49,11 @@ struct nfs_clone_mount { | |||
49 | #define NFS_MAX_SECFLAVORS (12) | 49 | #define NFS_MAX_SECFLAVORS (12) |
50 | 50 | ||
51 | /* | 51 | /* |
52 | * Value used if the user did not specify a port value. | ||
53 | */ | ||
54 | #define NFS_UNSPEC_PORT (-1) | ||
55 | |||
56 | /* | ||
52 | * In-kernel mount arguments | 57 | * In-kernel mount arguments |
53 | */ | 58 | */ |
54 | struct nfs_parsed_mount_data { | 59 | struct nfs_parsed_mount_data { |
@@ -63,6 +68,7 @@ struct nfs_parsed_mount_data { | |||
63 | unsigned int auth_flavor_len; | 68 | unsigned int auth_flavor_len; |
64 | rpc_authflavor_t auth_flavors[1]; | 69 | rpc_authflavor_t auth_flavors[1]; |
65 | char *client_address; | 70 | char *client_address; |
71 | unsigned int version; | ||
66 | unsigned int minorversion; | 72 | unsigned int minorversion; |
67 | char *fscache_uniq; | 73 | char *fscache_uniq; |
68 | 74 | ||
@@ -71,7 +77,7 @@ struct nfs_parsed_mount_data { | |||
71 | size_t addrlen; | 77 | size_t addrlen; |
72 | char *hostname; | 78 | char *hostname; |
73 | u32 version; | 79 | u32 version; |
74 | unsigned short port; | 80 | int port; |
75 | unsigned short protocol; | 81 | unsigned short protocol; |
76 | } mount_server; | 82 | } mount_server; |
77 | 83 | ||
@@ -80,7 +86,7 @@ struct nfs_parsed_mount_data { | |||
80 | size_t addrlen; | 86 | size_t addrlen; |
81 | char *hostname; | 87 | char *hostname; |
82 | char *export_path; | 88 | char *export_path; |
83 | unsigned short port; | 89 | int port; |
84 | unsigned short protocol; | 90 | unsigned short protocol; |
85 | } nfs_server; | 91 | } nfs_server; |
86 | 92 | ||
@@ -102,6 +108,7 @@ struct nfs_mount_request { | |||
102 | }; | 108 | }; |
103 | 109 | ||
104 | extern int nfs_mount(struct nfs_mount_request *info); | 110 | extern int nfs_mount(struct nfs_mount_request *info); |
111 | extern void nfs_umount(const struct nfs_mount_request *info); | ||
105 | 112 | ||
106 | /* client.c */ | 113 | /* client.c */ |
107 | extern struct rpc_program nfs_program; | 114 | extern struct rpc_program nfs_program; |
@@ -213,7 +220,6 @@ void nfs_zap_acl_cache(struct inode *inode); | |||
213 | extern int nfs_wait_bit_killable(void *word); | 220 | extern int nfs_wait_bit_killable(void *word); |
214 | 221 | ||
215 | /* super.c */ | 222 | /* super.c */ |
216 | void nfs_parse_ip_address(char *, size_t, struct sockaddr *, size_t *); | ||
217 | extern struct file_system_type nfs_xdev_fs_type; | 223 | extern struct file_system_type nfs_xdev_fs_type; |
218 | #ifdef CONFIG_NFS_V4 | 224 | #ifdef CONFIG_NFS_V4 |
219 | extern struct file_system_type nfs4_xdev_fs_type; | 225 | extern struct file_system_type nfs4_xdev_fs_type; |
@@ -248,6 +254,12 @@ extern void nfs_read_prepare(struct rpc_task *task, void *calldata); | |||
248 | 254 | ||
249 | /* write.c */ | 255 | /* write.c */ |
250 | extern void nfs_write_prepare(struct rpc_task *task, void *calldata); | 256 | extern void nfs_write_prepare(struct rpc_task *task, void *calldata); |
257 | #ifdef CONFIG_MIGRATION | ||
258 | extern int nfs_migrate_page(struct address_space *, | ||
259 | struct page *, struct page *); | ||
260 | #else | ||
261 | #define nfs_migrate_page NULL | ||
262 | #endif | ||
251 | 263 | ||
252 | /* nfs4proc.c */ | 264 | /* nfs4proc.c */ |
253 | extern int _nfs4_call_sync(struct nfs_server *server, | 265 | extern int _nfs4_call_sync(struct nfs_server *server, |
@@ -368,24 +380,3 @@ unsigned int nfs_page_array_len(unsigned int base, size_t len) | |||
368 | return ((unsigned long)len + (unsigned long)base + | 380 | return ((unsigned long)len + (unsigned long)base + |
369 | PAGE_SIZE - 1) >> PAGE_SHIFT; | 381 | PAGE_SIZE - 1) >> PAGE_SHIFT; |
370 | } | 382 | } |
371 | |||
372 | #define IPV6_SCOPE_DELIMITER '%' | ||
373 | |||
374 | /* | ||
375 | * Set the port number in an address. Be agnostic about the address | ||
376 | * family. | ||
377 | */ | ||
378 | static inline void nfs_set_port(struct sockaddr *sap, unsigned short port) | ||
379 | { | ||
380 | struct sockaddr_in *ap = (struct sockaddr_in *)sap; | ||
381 | struct sockaddr_in6 *ap6 = (struct sockaddr_in6 *)sap; | ||
382 | |||
383 | switch (sap->sa_family) { | ||
384 | case AF_INET: | ||
385 | ap->sin_port = htons(port); | ||
386 | break; | ||
387 | case AF_INET6: | ||
388 | ap6->sin6_port = htons(port); | ||
389 | break; | ||
390 | } | ||
391 | } | ||
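nfs_set_port() is deleted here rather than fixed; the callers touched by this merge (see fs/nfs/nfs4namespace.c above) switch to rpc_set_port() from the sunrpc headers, so the helper now lives at the layer that owns the sockaddr handling. Presumably the shared version keeps the same address-family switch as the removed code; a hedged reconstruction:

static inline void rpc_set_port(struct sockaddr *sap, unsigned short port)
{
	switch (sap->sa_family) {
	case AF_INET:
		((struct sockaddr_in *)sap)->sin_port = htons(port);
		break;
	case AF_INET6:
		((struct sockaddr_in6 *)sap)->sin6_port = htons(port);
		break;
	}
}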
diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c index 38ef9eaec407..0adefc40cc89 100644 --- a/fs/nfs/mount_clnt.c +++ b/fs/nfs/mount_clnt.c | |||
@@ -209,6 +209,71 @@ out_mnt_err: | |||
209 | goto out; | 209 | goto out; |
210 | } | 210 | } |
211 | 211 | ||
212 | /** | ||
213 | * nfs_umount - Notify a server that we have unmounted this export | ||
214 | * @info: pointer to umount request arguments | ||
215 | * | ||
216 | * MOUNTPROC_UMNT is advisory, so we set a short timeout, and always | ||
217 | * use UDP. | ||
218 | */ | ||
219 | void nfs_umount(const struct nfs_mount_request *info) | ||
220 | { | ||
221 | static const struct rpc_timeout nfs_umnt_timeout = { | ||
222 | .to_initval = 1 * HZ, | ||
223 | .to_maxval = 3 * HZ, | ||
224 | .to_retries = 2, | ||
225 | }; | ||
226 | struct rpc_create_args args = { | ||
227 | .protocol = IPPROTO_UDP, | ||
228 | .address = info->sap, | ||
229 | .addrsize = info->salen, | ||
230 | .timeout = &nfs_umnt_timeout, | ||
231 | .servername = info->hostname, | ||
232 | .program = &mnt_program, | ||
233 | .version = info->version, | ||
234 | .authflavor = RPC_AUTH_UNIX, | ||
235 | .flags = RPC_CLNT_CREATE_NOPING, | ||
236 | }; | ||
237 | struct mountres result; | ||
238 | struct rpc_message msg = { | ||
239 | .rpc_argp = info->dirpath, | ||
240 | .rpc_resp = &result, | ||
241 | }; | ||
242 | struct rpc_clnt *clnt; | ||
243 | int status; | ||
244 | |||
245 | if (info->noresvport) | ||
246 | args.flags |= RPC_CLNT_CREATE_NONPRIVPORT; | ||
247 | |||
248 | clnt = rpc_create(&args); | ||
249 | if (unlikely(IS_ERR(clnt))) | ||
250 | goto out_clnt_err; | ||
251 | |||
252 | dprintk("NFS: sending UMNT request for %s:%s\n", | ||
253 | (info->hostname ? info->hostname : "server"), info->dirpath); | ||
254 | |||
255 | if (info->version == NFS_MNT3_VERSION) | ||
256 | msg.rpc_proc = &clnt->cl_procinfo[MOUNTPROC3_UMNT]; | ||
257 | else | ||
258 | msg.rpc_proc = &clnt->cl_procinfo[MOUNTPROC_UMNT]; | ||
259 | |||
260 | status = rpc_call_sync(clnt, &msg, 0); | ||
261 | rpc_shutdown_client(clnt); | ||
262 | |||
263 | if (unlikely(status < 0)) | ||
264 | goto out_call_err; | ||
265 | |||
266 | return; | ||
267 | |||
268 | out_clnt_err: | ||
269 | dprintk("NFS: failed to create UMNT RPC client, status=%ld\n", | ||
270 | PTR_ERR(clnt)); | ||
271 | return; | ||
272 | |||
273 | out_call_err: | ||
274 | dprintk("NFS: UMNT request failed, status=%d\n", status); | ||
275 | } | ||
276 | |||
212 | /* | 277 | /* |
213 | * XDR encode/decode functions for MOUNT | 278 | * XDR encode/decode functions for MOUNT |
214 | */ | 279 | */ |
@@ -258,7 +323,7 @@ static int decode_status(struct xdr_stream *xdr, struct mountres *res) | |||
258 | return -EIO; | 323 | return -EIO; |
259 | status = ntohl(*p); | 324 | status = ntohl(*p); |
260 | 325 | ||
261 | for (i = 0; i <= ARRAY_SIZE(mnt_errtbl); i++) { | 326 | for (i = 0; i < ARRAY_SIZE(mnt_errtbl); i++) { |
262 | if (mnt_errtbl[i].status == status) { | 327 | if (mnt_errtbl[i].status == status) { |
263 | res->errno = mnt_errtbl[i].errno; | 328 | res->errno = mnt_errtbl[i].errno; |
264 | return 0; | 329 | return 0; |
@@ -309,7 +374,7 @@ static int decode_fhs_status(struct xdr_stream *xdr, struct mountres *res) | |||
309 | return -EIO; | 374 | return -EIO; |
310 | status = ntohl(*p); | 375 | status = ntohl(*p); |
311 | 376 | ||
312 | for (i = 0; i <= ARRAY_SIZE(mnt3_errtbl); i++) { | 377 | for (i = 0; i < ARRAY_SIZE(mnt3_errtbl); i++) { |
313 | if (mnt3_errtbl[i].status == status) { | 378 | if (mnt3_errtbl[i].status == status) { |
314 | res->errno = mnt3_errtbl[i].errno; | 379 | res->errno = mnt3_errtbl[i].errno; |
315 | return 0; | 380 | return 0; |
@@ -407,6 +472,13 @@ static struct rpc_procinfo mnt_procedures[] = { | |||
407 | .p_statidx = MOUNTPROC_MNT, | 472 | .p_statidx = MOUNTPROC_MNT, |
408 | .p_name = "MOUNT", | 473 | .p_name = "MOUNT", |
409 | }, | 474 | }, |
475 | [MOUNTPROC_UMNT] = { | ||
476 | .p_proc = MOUNTPROC_UMNT, | ||
477 | .p_encode = (kxdrproc_t)mnt_enc_dirpath, | ||
478 | .p_arglen = MNT_enc_dirpath_sz, | ||
479 | .p_statidx = MOUNTPROC_UMNT, | ||
480 | .p_name = "UMOUNT", | ||
481 | }, | ||
410 | }; | 482 | }; |
411 | 483 | ||
412 | static struct rpc_procinfo mnt3_procedures[] = { | 484 | static struct rpc_procinfo mnt3_procedures[] = { |
@@ -419,6 +491,13 @@ static struct rpc_procinfo mnt3_procedures[] = { | |||
419 | .p_statidx = MOUNTPROC3_MNT, | 491 | .p_statidx = MOUNTPROC3_MNT, |
420 | .p_name = "MOUNT", | 492 | .p_name = "MOUNT", |
421 | }, | 493 | }, |
494 | [MOUNTPROC3_UMNT] = { | ||
495 | .p_proc = MOUNTPROC3_UMNT, | ||
496 | .p_encode = (kxdrproc_t)mnt_enc_dirpath, | ||
497 | .p_arglen = MNT_enc_dirpath_sz, | ||
498 | .p_statidx = MOUNTPROC3_UMNT, | ||
499 | .p_name = "UMOUNT", | ||
500 | }, | ||
422 | }; | 501 | }; |
423 | 502 | ||
424 | 503 | ||
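Since UMNT is advisory, nfs_umount() swallows its own errors, and a caller just fills in an nfs_mount_request and fires. A hedged sketch of the caller side, using only the fields the function body above actually reads; the values are illustrative:

struct nfs_mount_request request = {
	.sap	    = (struct sockaddr *)&mountd_addr,
	.salen	    = mountd_addrlen,
	.dirpath    = export_path,	/* the same path sent at MNT time */
	.version    = NFS_MNT3_VERSION,	/* selects MOUNTPROC3_UMNT */
	.hostname   = hostname,
	.noresvport = 0,		/* keep the privileged source port */
};

nfs_umount(&request);	/* void: failures surface only via dprintk() */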
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index d0cc5ce0edfe..ee6a13f05443 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c | |||
@@ -299,7 +299,6 @@ static void nfs3_free_createdata(struct nfs3_createdata *data) | |||
299 | 299 | ||
300 | /* | 300 | /* |
301 | * Create a regular file. | 301 | * Create a regular file. |
302 | * For now, we don't implement O_EXCL. | ||
303 | */ | 302 | */ |
304 | static int | 303 | static int |
305 | nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, | 304 | nfs3_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, |
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index 2a2a0a7143ad..2636c26d56fa 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c | |||
@@ -17,6 +17,7 @@ | |||
17 | #include <linux/inet.h> | 17 | #include <linux/inet.h> |
18 | #include "internal.h" | 18 | #include "internal.h" |
19 | #include "nfs4_fs.h" | 19 | #include "nfs4_fs.h" |
20 | #include "dns_resolve.h" | ||
20 | 21 | ||
21 | #define NFSDBG_FACILITY NFSDBG_VFS | 22 | #define NFSDBG_FACILITY NFSDBG_VFS |
22 | 23 | ||
@@ -95,6 +96,20 @@ static int nfs4_validate_fspath(const struct vfsmount *mnt_parent, | |||
95 | return 0; | 96 | return 0; |
96 | } | 97 | } |
97 | 98 | ||
99 | static size_t nfs_parse_server_name(char *string, size_t len, | ||
100 | struct sockaddr *sa, size_t salen) | ||
101 | { | ||
102 | ssize_t ret; | ||
103 | |||
104 | ret = rpc_pton(string, len, sa, salen); | ||
105 | if (ret == 0) { | ||
106 | ret = nfs_dns_resolve_name(string, len, sa, salen); | ||
107 | if (ret < 0) | ||
108 | ret = 0; | ||
109 | } | ||
110 | return ret; | ||
111 | } | ||
112 | |||
98 | static struct vfsmount *try_location(struct nfs_clone_mount *mountdata, | 113 | static struct vfsmount *try_location(struct nfs_clone_mount *mountdata, |
99 | char *page, char *page2, | 114 | char *page, char *page2, |
100 | const struct nfs4_fs_location *location) | 115 | const struct nfs4_fs_location *location) |
@@ -121,11 +136,12 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata, | |||
121 | 136 | ||
122 | if (memchr(buf->data, IPV6_SCOPE_DELIMITER, buf->len)) | 137 | if (memchr(buf->data, IPV6_SCOPE_DELIMITER, buf->len)) |
123 | continue; | 138 | continue; |
124 | nfs_parse_ip_address(buf->data, buf->len, | 139 | mountdata->addrlen = nfs_parse_server_name(buf->data, |
125 | mountdata->addr, &mountdata->addrlen); | 140 | buf->len, |
126 | if (mountdata->addr->sa_family == AF_UNSPEC) | 141 | mountdata->addr, mountdata->addrlen); |
142 | if (mountdata->addrlen == 0) | ||
127 | continue; | 143 | continue; |
128 | nfs_set_port(mountdata->addr, NFS_PORT); | 144 | rpc_set_port(mountdata->addr, NFS_PORT); |
129 | 145 | ||
130 | memcpy(page2, buf->data, buf->len); | 146 | memcpy(page2, buf->data, buf->len); |
131 | page2[buf->len] = '\0'; | 147 | page2[buf->len] = '\0'; |
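nfs_parse_server_name() makes v4 referral handling tolerant of non-numeric fs_locations entries: an address literal is resolved locally by rpc_pton(), anything else goes through the new DNS cache, and every failure mode collapses to a length of 0 so try_location() can simply skip the entry. Illustrative inputs (the host names are hypothetical):

/*
 *   "192.0.2.1"      -> rpc_pton() succeeds; no upcall needed
 *   "fs.example.net" -> rpc_pton() returns 0, so nfs_dns_resolve_name()
 *                       asks userspace; a negative answer (-ESRCH) or
 *                       any other error is flattened to 0
 *   anything with an IPv6 scope delimiter is rejected earlier by the
 *   memchr(IPV6_SCOPE_DELIMITER) check in try_location()
 */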
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 6917311f201c..be6544aef41f 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c | |||
@@ -61,6 +61,8 @@ | |||
61 | #define NFS4_POLL_RETRY_MIN (HZ/10) | 61 | #define NFS4_POLL_RETRY_MIN (HZ/10) |
62 | #define NFS4_POLL_RETRY_MAX (15*HZ) | 62 | #define NFS4_POLL_RETRY_MAX (15*HZ) |
63 | 63 | ||
64 | #define NFS4_MAX_LOOP_ON_RECOVER (10) | ||
65 | |||
64 | struct nfs4_opendata; | 66 | struct nfs4_opendata; |
65 | static int _nfs4_proc_open(struct nfs4_opendata *data); | 67 | static int _nfs4_proc_open(struct nfs4_opendata *data); |
66 | static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); | 68 | static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); |
@@ -426,17 +428,19 @@ out: | |||
426 | static int nfs4_recover_session(struct nfs4_session *session) | 428 | static int nfs4_recover_session(struct nfs4_session *session) |
427 | { | 429 | { |
428 | struct nfs_client *clp = session->clp; | 430 | struct nfs_client *clp = session->clp; |
431 | unsigned int loop; | ||
429 | int ret; | 432 | int ret; |
430 | 433 | ||
431 | for (;;) { | 434 | for (loop = NFS4_MAX_LOOP_ON_RECOVER; loop != 0; loop--) { |
432 | ret = nfs4_wait_clnt_recover(clp); | 435 | ret = nfs4_wait_clnt_recover(clp); |
433 | if (ret != 0) | 436 | if (ret != 0) |
434 | return ret; | 437 | break; |
435 | if (!test_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state)) | 438 | if (!test_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state)) |
436 | break; | 439 | break; |
437 | nfs4_schedule_state_manager(clp); | 440 | nfs4_schedule_state_manager(clp); |
441 | ret = -EIO; | ||
438 | } | 442 | } |
439 | return 0; | 443 | return ret; |
440 | } | 444 | } |
441 | 445 | ||
442 | static int nfs41_setup_sequence(struct nfs4_session *session, | 446 | static int nfs41_setup_sequence(struct nfs4_session *session, |
@@ -1444,18 +1448,20 @@ static int _nfs4_proc_open(struct nfs4_opendata *data) | |||
1444 | static int nfs4_recover_expired_lease(struct nfs_server *server) | 1448 | static int nfs4_recover_expired_lease(struct nfs_server *server) |
1445 | { | 1449 | { |
1446 | struct nfs_client *clp = server->nfs_client; | 1450 | struct nfs_client *clp = server->nfs_client; |
1451 | unsigned int loop; | ||
1447 | int ret; | 1452 | int ret; |
1448 | 1453 | ||
1449 | for (;;) { | 1454 | for (loop = NFS4_MAX_LOOP_ON_RECOVER; loop != 0; loop--) { |
1450 | ret = nfs4_wait_clnt_recover(clp); | 1455 | ret = nfs4_wait_clnt_recover(clp); |
1451 | if (ret != 0) | 1456 | if (ret != 0) |
1452 | return ret; | 1457 | break; |
1453 | if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) && | 1458 | if (!test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) && |
1454 | !test_bit(NFS4CLNT_CHECK_LEASE,&clp->cl_state)) | 1459 | !test_bit(NFS4CLNT_CHECK_LEASE,&clp->cl_state)) |
1455 | break; | 1460 | break; |
1456 | nfs4_schedule_state_recovery(clp); | 1461 | nfs4_schedule_state_recovery(clp); |
1462 | ret = -EIO; | ||
1457 | } | 1463 | } |
1458 | return 0; | 1464 | return ret; |
1459 | } | 1465 | } |
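
Both recovery loops trade an unbounded for (;;) for a countdown of NFS4_MAX_LOOP_ON_RECOVER iterations and now report -EIO instead of returning 0 when the state bit refuses to clear, so a wedged state manager can no longer spin the caller forever. A self-contained sketch of the same control flow, with stub predicates standing in for nfs4_wait_clnt_recover() and the cl_state test:

    #include <errno.h>
    #include <stdio.h>

    #define MAX_LOOP_ON_RECOVER 10

    /* Stubs: pretend recovery succeeds after three passes. */
    static int wait_for_recovery(void) { return 0; }
    static int still_needs_recovery(void) { static int n = 3; return n-- > 0; }
    static void kick_state_manager(void) { }

    static int recover(void)
    {
        unsigned int loop;
        int ret = 0;

        for (loop = MAX_LOOP_ON_RECOVER; loop != 0; loop--) {
            ret = wait_for_recovery();
            if (ret != 0)
                break;            /* hard error from the wait */
            if (!still_needs_recovery())
                break;            /* recovered: ret stays 0 */
            kick_state_manager();
            ret = -EIO;           /* value left if the loop runs dry */
        }
        return ret;
    }

    int main(void)
    {
        printf("recover() = %d\n", recover());
        return 0;
    }
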
1460 | 1466 | ||
1461 | /* | 1467 | /* |
@@ -1997,12 +2003,34 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f | |||
1997 | status = nfs4_call_sync(server, &msg, &args, &res, 0); | 2003 | status = nfs4_call_sync(server, &msg, &args, &res, 0); |
1998 | if (status == 0) { | 2004 | if (status == 0) { |
1999 | memcpy(server->attr_bitmask, res.attr_bitmask, sizeof(server->attr_bitmask)); | 2005 | memcpy(server->attr_bitmask, res.attr_bitmask, sizeof(server->attr_bitmask)); |
2006 | server->caps &= ~(NFS_CAP_ACLS|NFS_CAP_HARDLINKS| | ||
2007 | NFS_CAP_SYMLINKS|NFS_CAP_FILEID| | ||
2008 | NFS_CAP_MODE|NFS_CAP_NLINK|NFS_CAP_OWNER| | ||
2009 | NFS_CAP_OWNER_GROUP|NFS_CAP_ATIME| | ||
2010 | NFS_CAP_CTIME|NFS_CAP_MTIME); | ||
2000 | if (res.attr_bitmask[0] & FATTR4_WORD0_ACL) | 2011 | if (res.attr_bitmask[0] & FATTR4_WORD0_ACL) |
2001 | server->caps |= NFS_CAP_ACLS; | 2012 | server->caps |= NFS_CAP_ACLS; |
2002 | if (res.has_links != 0) | 2013 | if (res.has_links != 0) |
2003 | server->caps |= NFS_CAP_HARDLINKS; | 2014 | server->caps |= NFS_CAP_HARDLINKS; |
2004 | if (res.has_symlinks != 0) | 2015 | if (res.has_symlinks != 0) |
2005 | server->caps |= NFS_CAP_SYMLINKS; | 2016 | server->caps |= NFS_CAP_SYMLINKS; |
2017 | if (res.attr_bitmask[0] & FATTR4_WORD0_FILEID) | ||
2018 | server->caps |= NFS_CAP_FILEID; | ||
2019 | if (res.attr_bitmask[1] & FATTR4_WORD1_MODE) | ||
2020 | server->caps |= NFS_CAP_MODE; | ||
2021 | if (res.attr_bitmask[1] & FATTR4_WORD1_NUMLINKS) | ||
2022 | server->caps |= NFS_CAP_NLINK; | ||
2023 | if (res.attr_bitmask[1] & FATTR4_WORD1_OWNER) | ||
2024 | server->caps |= NFS_CAP_OWNER; | ||
2025 | if (res.attr_bitmask[1] & FATTR4_WORD1_OWNER_GROUP) | ||
2026 | server->caps |= NFS_CAP_OWNER_GROUP; | ||
2027 | if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_ACCESS) | ||
2028 | server->caps |= NFS_CAP_ATIME; | ||
2029 | if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_METADATA) | ||
2030 | server->caps |= NFS_CAP_CTIME; | ||
2031 | if (res.attr_bitmask[1] & FATTR4_WORD1_TIME_MODIFY) | ||
2032 | server->caps |= NFS_CAP_MTIME; | ||
2033 | |||
2006 | memcpy(server->cache_consistency_bitmask, res.attr_bitmask, sizeof(server->cache_consistency_bitmask)); | 2034 | memcpy(server->cache_consistency_bitmask, res.attr_bitmask, sizeof(server->cache_consistency_bitmask)); |
2007 | server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE; | 2035 | server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE; |
2008 | server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY; | 2036 | server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY; |
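
_nfs4_server_capabilities() now clears every derived NFS_CAP_* bit before re-deriving each one from the attribute bitmask the server returned, so a capability cached from an earlier probe cannot outlive the attribute that justified it. A sketch of the clear-then-set idiom — the flag and bitmask names here are invented for illustration:

    #include <stdint.h>
    #include <stdio.h>

    /* Hypothetical capability flags and bitmask bits, for illustration only. */
    #define CAP_MODE          (1u << 0)
    #define CAP_OWNER         (1u << 1)
    #define CAP_MTIME         (1u << 2)

    #define WORD1_MODE        (1u << 4)
    #define WORD1_OWNER       (1u << 5)
    #define WORD1_TIME_MODIFY (1u << 6)

    static uint32_t update_caps(uint32_t caps, const uint32_t bitmask[2])
    {
        /* Drop every derived bit first; stale capabilities must not survive. */
        caps &= ~(CAP_MODE | CAP_OWNER | CAP_MTIME);

        if (bitmask[1] & WORD1_MODE)
            caps |= CAP_MODE;
        if (bitmask[1] & WORD1_OWNER)
            caps |= CAP_OWNER;
        if (bitmask[1] & WORD1_TIME_MODIFY)
            caps |= CAP_MTIME;
        return caps;
    }

    int main(void)
    {
        uint32_t bm[2] = { 0, WORD1_MODE | WORD1_TIME_MODIFY };
        printf("caps = %#x\n", (unsigned)update_caps(0xffffffffu, bm));
        return 0;
    }
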
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 65ca8c18476f..1434080aefeb 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c | |||
@@ -1250,8 +1250,8 @@ static void nfs4_state_manager(struct nfs_client *clp) | |||
1250 | continue; | 1250 | continue; |
1251 | } | 1251 | } |
1252 | /* Initialize or reset the session */ | 1252 | /* Initialize or reset the session */ |
1253 | if (nfs4_has_session(clp) && | 1253 | if (test_and_clear_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state) |
1254 | test_and_clear_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state)) { | 1254 | && nfs4_has_session(clp)) { |
1255 | if (clp->cl_cons_state == NFS_CS_SESSION_INITING) | 1255 | if (clp->cl_cons_state == NFS_CS_SESSION_INITING) |
1256 | status = nfs4_initialize_session(clp); | 1256 | status = nfs4_initialize_session(clp); |
1257 | else | 1257 | else |
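
The operand swap above is load-bearing: && short-circuits, so with nfs4_has_session() first a v4.0 client would never execute test_and_clear_bit(), leaving NFS4CLNT_SESSION_SETUP set indefinitely. With the side-effecting test first, the flag is consumed whether or not the mount has a session. A two-branch illustration of the pitfall:

    #include <stdbool.h>
    #include <stdio.h>

    static bool cleared;
    static bool test_and_clear(void) { cleared = true; return true; }
    static bool has_session(void) { return false; }    /* e.g. a v4.0 mount */

    int main(void)
    {
        /* Wrong order: has_session() is false, so the bit is never cleared. */
        if (has_session() && test_and_clear())
            ;
        printf("bit cleared (wrong order): %d\n", cleared);

        /* Patched order: the flag is consumed regardless of session support. */
        if (test_and_clear() && has_session())
            ;
        printf("bit cleared (patched order): %d\n", cleared);
        return 0;
    }
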
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 617273e7d47f..cfc30d362f94 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c | |||
@@ -702,29 +702,12 @@ struct compound_hdr { | |||
702 | u32 minorversion; | 702 | u32 minorversion; |
703 | }; | 703 | }; |
704 | 704 | ||
705 | /* | 705 | static __be32 *reserve_space(struct xdr_stream *xdr, size_t nbytes) |
706 | * START OF "GENERIC" ENCODE ROUTINES. | 706 | { |
707 | * These may look a little ugly since they are imported from a "generic" | 707 | __be32 *p = xdr_reserve_space(xdr, nbytes); |
708 | * set of XDR encode/decode routines which are intended to be shared by | 708 | BUG_ON(!p); |
709 | * all of our NFSv4 implementations (OpenBSD, MacOS X...). | 709 | return p; |
710 | * | 710 | } |
711 | * If the pain of reading these is too great, it should be a straightforward | ||
712 | * task to translate them into Linux-specific versions which are more | ||
713 | * consistent with the style used in NFSv2/v3... | ||
714 | */ | ||
715 | #define WRITE32(n) *p++ = htonl(n) | ||
716 | #define WRITE64(n) do { \ | ||
717 | *p++ = htonl((uint32_t)((n) >> 32)); \ | ||
718 | *p++ = htonl((uint32_t)(n)); \ | ||
719 | } while (0) | ||
720 | #define WRITEMEM(ptr,nbytes) do { \ | ||
721 | p = xdr_encode_opaque_fixed(p, ptr, nbytes); \ | ||
722 | } while (0) | ||
723 | |||
724 | #define RESERVE_SPACE(nbytes) do { \ | ||
725 | p = xdr_reserve_space(xdr, nbytes); \ | ||
726 | BUG_ON(!p); \ | ||
727 | } while (0) | ||
728 | 711 | ||
729 | static void encode_string(struct xdr_stream *xdr, unsigned int len, const char *str) | 712 | static void encode_string(struct xdr_stream *xdr, unsigned int len, const char *str) |
730 | { | 713 | { |
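
This is the heart of the nfs4xdr.c rework: the RESERVE_SPACE/WRITE32/WRITE64/WRITEMEM macros, which grew and mutated a hidden local p behind the caller's back, become an explicit p = reserve_space(xdr, n) followed by visible cpu_to_be32() stores, xdr_encode_hyper() for 64-bit values, and xdr_encode_opaque*() for byte strings. A userspace sketch of the pattern, with htonl() and a flat buffer standing in for cpu_to_be32() and the xdr_stream:

    #include <arpa/inet.h>
    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    struct buf { uint32_t words[64]; size_t used; };

    /* Analogue of reserve_space(): hand back a pointer to nbytes of buffer. */
    static uint32_t *reserve_space(struct buf *b, size_t nbytes)
    {
        uint32_t *p = b->words + b->used;
        assert((nbytes & 3) == 0 && b->used + nbytes / 4 <= 64);
        b->used += nbytes / 4;
        return p;
    }

    /* Analogue of xdr_encode_hyper(): big-endian 64-bit, returns advanced p. */
    static uint32_t *encode_hyper(uint32_t *p, uint64_t v)
    {
        *p++ = htonl((uint32_t)(v >> 32));
        *p++ = htonl((uint32_t)v);
        return p;
    }

    int main(void)
    {
        struct buf b = { .used = 0 };
        uint32_t *p;

        /* Same shape as the patched encode_commit(): opcode, offset, count. */
        p = reserve_space(&b, 16);
        *p++ = htonl(5);           /* stand-in opcode */
        p = encode_hyper(p, 4096); /* 64-bit offset */
        *p = htonl(512);           /* 32-bit count */
        printf("encoded %zu words\n", b.used);
        return 0;
    }
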
@@ -749,12 +732,11 @@ static void encode_compound_hdr(struct xdr_stream *xdr, | |||
749 | 732 | ||
750 | dprintk("encode_compound: tag=%.*s\n", (int)hdr->taglen, hdr->tag); | 733 | dprintk("encode_compound: tag=%.*s\n", (int)hdr->taglen, hdr->tag); |
751 | BUG_ON(hdr->taglen > NFS4_MAXTAGLEN); | 734 | BUG_ON(hdr->taglen > NFS4_MAXTAGLEN); |
752 | RESERVE_SPACE(12+(XDR_QUADLEN(hdr->taglen)<<2)); | 735 | p = reserve_space(xdr, 4 + hdr->taglen + 8); |
753 | WRITE32(hdr->taglen); | 736 | p = xdr_encode_opaque(p, hdr->tag, hdr->taglen); |
754 | WRITEMEM(hdr->tag, hdr->taglen); | 737 | *p++ = cpu_to_be32(hdr->minorversion); |
755 | WRITE32(hdr->minorversion); | ||
756 | hdr->nops_p = p; | 738 | hdr->nops_p = p; |
757 | WRITE32(hdr->nops); | 739 | *p = cpu_to_be32(hdr->nops); |
758 | } | 740 | } |
759 | 741 | ||
760 | static void encode_nops(struct compound_hdr *hdr) | 742 | static void encode_nops(struct compound_hdr *hdr) |
@@ -829,55 +811,53 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const | |||
829 | len += 16; | 811 | len += 16; |
830 | else if (iap->ia_valid & ATTR_MTIME) | 812 | else if (iap->ia_valid & ATTR_MTIME) |
831 | len += 4; | 813 | len += 4; |
832 | RESERVE_SPACE(len); | 814 | p = reserve_space(xdr, len); |
833 | 815 | ||
834 | /* | 816 | /* |
835 | * We write the bitmap length now, but leave the bitmap and the attribute | 817 | * We write the bitmap length now, but leave the bitmap and the attribute |
836 | * buffer length to be backfilled at the end of this routine. | 818 | * buffer length to be backfilled at the end of this routine. |
837 | */ | 819 | */ |
838 | WRITE32(2); | 820 | *p++ = cpu_to_be32(2); |
839 | q = p; | 821 | q = p; |
840 | p += 3; | 822 | p += 3; |
841 | 823 | ||
842 | if (iap->ia_valid & ATTR_SIZE) { | 824 | if (iap->ia_valid & ATTR_SIZE) { |
843 | bmval0 |= FATTR4_WORD0_SIZE; | 825 | bmval0 |= FATTR4_WORD0_SIZE; |
844 | WRITE64(iap->ia_size); | 826 | p = xdr_encode_hyper(p, iap->ia_size); |
845 | } | 827 | } |
846 | if (iap->ia_valid & ATTR_MODE) { | 828 | if (iap->ia_valid & ATTR_MODE) { |
847 | bmval1 |= FATTR4_WORD1_MODE; | 829 | bmval1 |= FATTR4_WORD1_MODE; |
848 | WRITE32(iap->ia_mode & S_IALLUGO); | 830 | *p++ = cpu_to_be32(iap->ia_mode & S_IALLUGO); |
849 | } | 831 | } |
850 | if (iap->ia_valid & ATTR_UID) { | 832 | if (iap->ia_valid & ATTR_UID) { |
851 | bmval1 |= FATTR4_WORD1_OWNER; | 833 | bmval1 |= FATTR4_WORD1_OWNER; |
852 | WRITE32(owner_namelen); | 834 | p = xdr_encode_opaque(p, owner_name, owner_namelen); |
853 | WRITEMEM(owner_name, owner_namelen); | ||
854 | } | 835 | } |
855 | if (iap->ia_valid & ATTR_GID) { | 836 | if (iap->ia_valid & ATTR_GID) { |
856 | bmval1 |= FATTR4_WORD1_OWNER_GROUP; | 837 | bmval1 |= FATTR4_WORD1_OWNER_GROUP; |
857 | WRITE32(owner_grouplen); | 838 | p = xdr_encode_opaque(p, owner_group, owner_grouplen); |
858 | WRITEMEM(owner_group, owner_grouplen); | ||
859 | } | 839 | } |
860 | if (iap->ia_valid & ATTR_ATIME_SET) { | 840 | if (iap->ia_valid & ATTR_ATIME_SET) { |
861 | bmval1 |= FATTR4_WORD1_TIME_ACCESS_SET; | 841 | bmval1 |= FATTR4_WORD1_TIME_ACCESS_SET; |
862 | WRITE32(NFS4_SET_TO_CLIENT_TIME); | 842 | *p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME); |
863 | WRITE32(0); | 843 | *p++ = cpu_to_be32(0); |
864 | WRITE32(iap->ia_mtime.tv_sec); | 844 | *p++ = cpu_to_be32(iap->ia_mtime.tv_sec); |
865 | WRITE32(iap->ia_mtime.tv_nsec); | 845 | *p++ = cpu_to_be32(iap->ia_mtime.tv_nsec); |
866 | } | 846 | } |
867 | else if (iap->ia_valid & ATTR_ATIME) { | 847 | else if (iap->ia_valid & ATTR_ATIME) { |
868 | bmval1 |= FATTR4_WORD1_TIME_ACCESS_SET; | 848 | bmval1 |= FATTR4_WORD1_TIME_ACCESS_SET; |
869 | WRITE32(NFS4_SET_TO_SERVER_TIME); | 849 | *p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME); |
870 | } | 850 | } |
871 | if (iap->ia_valid & ATTR_MTIME_SET) { | 851 | if (iap->ia_valid & ATTR_MTIME_SET) { |
872 | bmval1 |= FATTR4_WORD1_TIME_MODIFY_SET; | 852 | bmval1 |= FATTR4_WORD1_TIME_MODIFY_SET; |
873 | WRITE32(NFS4_SET_TO_CLIENT_TIME); | 853 | *p++ = cpu_to_be32(NFS4_SET_TO_CLIENT_TIME); |
874 | WRITE32(0); | 854 | *p++ = cpu_to_be32(0); |
875 | WRITE32(iap->ia_mtime.tv_sec); | 855 | *p++ = cpu_to_be32(iap->ia_mtime.tv_sec); |
876 | WRITE32(iap->ia_mtime.tv_nsec); | 856 | *p++ = cpu_to_be32(iap->ia_mtime.tv_nsec); |
877 | } | 857 | } |
878 | else if (iap->ia_valid & ATTR_MTIME) { | 858 | else if (iap->ia_valid & ATTR_MTIME) { |
879 | bmval1 |= FATTR4_WORD1_TIME_MODIFY_SET; | 859 | bmval1 |= FATTR4_WORD1_TIME_MODIFY_SET; |
880 | WRITE32(NFS4_SET_TO_SERVER_TIME); | 860 | *p++ = cpu_to_be32(NFS4_SET_TO_SERVER_TIME); |
881 | } | 861 | } |
882 | 862 | ||
883 | /* | 863 | /* |
@@ -891,7 +871,7 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const | |||
891 | len = (char *)p - (char *)q - 12; | 871 | len = (char *)p - (char *)q - 12; |
892 | *q++ = htonl(bmval0); | 872 | *q++ = htonl(bmval0); |
893 | *q++ = htonl(bmval1); | 873 | *q++ = htonl(bmval1); |
894 | *q++ = htonl(len); | 874 | *q = htonl(len); |
895 | 875 | ||
896 | /* out: */ | 876 | /* out: */ |
897 | } | 877 | } |
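
encode_attrs() keeps its backfill trick through the conversion: reserve worst-case space, skip three words (q = p; p += 3;), emit only the attributes present in ia_valid while accumulating bmval0/bmval1, then write the bitmap words and the byte length once the distance p - q is known; the final hunk also drops a stray ++ on the last store, since q is dead afterwards. A sketch of the backfill idiom:

    #include <arpa/inet.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint32_t buf[16], *p = buf, *q;
        uint32_t bm0 = 0, bm1 = 0;
        int have_size = 1, have_mode = 1;    /* pretend ia_valid bits */

        *p++ = htonl(2);    /* bitmap word count, known up front */
        q = p;              /* remember where the backfill goes */
        p += 3;             /* bmval0, bmval1, attr byte length */

        if (have_size) {    /* each present attribute sets its bit */
            bm0 |= 1u << 4;
            *p++ = htonl(0); *p++ = htonl(4096);    /* 64-bit size */
        }
        if (have_mode) {
            bm1 |= 1u << 1;
            *p++ = htonl(0644);
        }

        *q++ = htonl(bm0);    /* backfill now that the length is known */
        *q++ = htonl(bm1);
        *q = htonl((uint32_t)((char *)p - (char *)q) - 4);

        printf("attr payload = %u bytes\n", ntohl(q[0]));
        return 0;
    }
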
@@ -900,9 +880,9 @@ static void encode_access(struct xdr_stream *xdr, u32 access, struct compound_hd | |||
900 | { | 880 | { |
901 | __be32 *p; | 881 | __be32 *p; |
902 | 882 | ||
903 | RESERVE_SPACE(8); | 883 | p = reserve_space(xdr, 8); |
904 | WRITE32(OP_ACCESS); | 884 | *p++ = cpu_to_be32(OP_ACCESS); |
905 | WRITE32(access); | 885 | *p = cpu_to_be32(access); |
906 | hdr->nops++; | 886 | hdr->nops++; |
907 | hdr->replen += decode_access_maxsz; | 887 | hdr->replen += decode_access_maxsz; |
908 | } | 888 | } |
@@ -911,10 +891,10 @@ static void encode_close(struct xdr_stream *xdr, const struct nfs_closeargs *arg | |||
911 | { | 891 | { |
912 | __be32 *p; | 892 | __be32 *p; |
913 | 893 | ||
914 | RESERVE_SPACE(8+NFS4_STATEID_SIZE); | 894 | p = reserve_space(xdr, 8+NFS4_STATEID_SIZE); |
915 | WRITE32(OP_CLOSE); | 895 | *p++ = cpu_to_be32(OP_CLOSE); |
916 | WRITE32(arg->seqid->sequence->counter); | 896 | *p++ = cpu_to_be32(arg->seqid->sequence->counter); |
917 | WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE); | 897 | xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE); |
918 | hdr->nops++; | 898 | hdr->nops++; |
919 | hdr->replen += decode_close_maxsz; | 899 | hdr->replen += decode_close_maxsz; |
920 | } | 900 | } |
@@ -923,10 +903,10 @@ static void encode_commit(struct xdr_stream *xdr, const struct nfs_writeargs *ar | |||
923 | { | 903 | { |
924 | __be32 *p; | 904 | __be32 *p; |
925 | 905 | ||
926 | RESERVE_SPACE(16); | 906 | p = reserve_space(xdr, 16); |
927 | WRITE32(OP_COMMIT); | 907 | *p++ = cpu_to_be32(OP_COMMIT); |
928 | WRITE64(args->offset); | 908 | p = xdr_encode_hyper(p, args->offset); |
929 | WRITE32(args->count); | 909 | *p = cpu_to_be32(args->count); |
930 | hdr->nops++; | 910 | hdr->nops++; |
931 | hdr->replen += decode_commit_maxsz; | 911 | hdr->replen += decode_commit_maxsz; |
932 | } | 912 | } |
@@ -935,30 +915,28 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg * | |||
935 | { | 915 | { |
936 | __be32 *p; | 916 | __be32 *p; |
937 | 917 | ||
938 | RESERVE_SPACE(8); | 918 | p = reserve_space(xdr, 8); |
939 | WRITE32(OP_CREATE); | 919 | *p++ = cpu_to_be32(OP_CREATE); |
940 | WRITE32(create->ftype); | 920 | *p = cpu_to_be32(create->ftype); |
941 | 921 | ||
942 | switch (create->ftype) { | 922 | switch (create->ftype) { |
943 | case NF4LNK: | 923 | case NF4LNK: |
944 | RESERVE_SPACE(4); | 924 | p = reserve_space(xdr, 4); |
945 | WRITE32(create->u.symlink.len); | 925 | *p = cpu_to_be32(create->u.symlink.len); |
946 | xdr_write_pages(xdr, create->u.symlink.pages, 0, create->u.symlink.len); | 926 | xdr_write_pages(xdr, create->u.symlink.pages, 0, create->u.symlink.len); |
947 | break; | 927 | break; |
948 | 928 | ||
949 | case NF4BLK: case NF4CHR: | 929 | case NF4BLK: case NF4CHR: |
950 | RESERVE_SPACE(8); | 930 | p = reserve_space(xdr, 8); |
951 | WRITE32(create->u.device.specdata1); | 931 | *p++ = cpu_to_be32(create->u.device.specdata1); |
952 | WRITE32(create->u.device.specdata2); | 932 | *p = cpu_to_be32(create->u.device.specdata2); |
953 | break; | 933 | break; |
954 | 934 | ||
955 | default: | 935 | default: |
956 | break; | 936 | break; |
957 | } | 937 | } |
958 | 938 | ||
959 | RESERVE_SPACE(4 + create->name->len); | 939 | encode_string(xdr, create->name->len, create->name->name); |
960 | WRITE32(create->name->len); | ||
961 | WRITEMEM(create->name->name, create->name->len); | ||
962 | hdr->nops++; | 940 | hdr->nops++; |
963 | hdr->replen += decode_create_maxsz; | 941 | hdr->replen += decode_create_maxsz; |
964 | 942 | ||
@@ -969,10 +947,10 @@ static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct c | |||
969 | { | 947 | { |
970 | __be32 *p; | 948 | __be32 *p; |
971 | 949 | ||
972 | RESERVE_SPACE(12); | 950 | p = reserve_space(xdr, 12); |
973 | WRITE32(OP_GETATTR); | 951 | *p++ = cpu_to_be32(OP_GETATTR); |
974 | WRITE32(1); | 952 | *p++ = cpu_to_be32(1); |
975 | WRITE32(bitmap); | 953 | *p = cpu_to_be32(bitmap); |
976 | hdr->nops++; | 954 | hdr->nops++; |
977 | hdr->replen += decode_getattr_maxsz; | 955 | hdr->replen += decode_getattr_maxsz; |
978 | } | 956 | } |
@@ -981,11 +959,11 @@ static void encode_getattr_two(struct xdr_stream *xdr, uint32_t bm0, uint32_t bm | |||
981 | { | 959 | { |
982 | __be32 *p; | 960 | __be32 *p; |
983 | 961 | ||
984 | RESERVE_SPACE(16); | 962 | p = reserve_space(xdr, 16); |
985 | WRITE32(OP_GETATTR); | 963 | *p++ = cpu_to_be32(OP_GETATTR); |
986 | WRITE32(2); | 964 | *p++ = cpu_to_be32(2); |
987 | WRITE32(bm0); | 965 | *p++ = cpu_to_be32(bm0); |
988 | WRITE32(bm1); | 966 | *p = cpu_to_be32(bm1); |
989 | hdr->nops++; | 967 | hdr->nops++; |
990 | hdr->replen += decode_getattr_maxsz; | 968 | hdr->replen += decode_getattr_maxsz; |
991 | } | 969 | } |
@@ -1012,8 +990,8 @@ static void encode_getfh(struct xdr_stream *xdr, struct compound_hdr *hdr) | |||
1012 | { | 990 | { |
1013 | __be32 *p; | 991 | __be32 *p; |
1014 | 992 | ||
1015 | RESERVE_SPACE(4); | 993 | p = reserve_space(xdr, 4); |
1016 | WRITE32(OP_GETFH); | 994 | *p = cpu_to_be32(OP_GETFH); |
1017 | hdr->nops++; | 995 | hdr->nops++; |
1018 | hdr->replen += decode_getfh_maxsz; | 996 | hdr->replen += decode_getfh_maxsz; |
1019 | } | 997 | } |
@@ -1022,10 +1000,9 @@ static void encode_link(struct xdr_stream *xdr, const struct qstr *name, struct | |||
1022 | { | 1000 | { |
1023 | __be32 *p; | 1001 | __be32 *p; |
1024 | 1002 | ||
1025 | RESERVE_SPACE(8 + name->len); | 1003 | p = reserve_space(xdr, 8 + name->len); |
1026 | WRITE32(OP_LINK); | 1004 | *p++ = cpu_to_be32(OP_LINK); |
1027 | WRITE32(name->len); | 1005 | xdr_encode_opaque(p, name->name, name->len); |
1028 | WRITEMEM(name->name, name->len); | ||
1029 | hdr->nops++; | 1006 | hdr->nops++; |
1030 | hdr->replen += decode_link_maxsz; | 1007 | hdr->replen += decode_link_maxsz; |
1031 | } | 1008 | } |
@@ -1052,27 +1029,27 @@ static void encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args | |||
1052 | { | 1029 | { |
1053 | __be32 *p; | 1030 | __be32 *p; |
1054 | 1031 | ||
1055 | RESERVE_SPACE(32); | 1032 | p = reserve_space(xdr, 32); |
1056 | WRITE32(OP_LOCK); | 1033 | *p++ = cpu_to_be32(OP_LOCK); |
1057 | WRITE32(nfs4_lock_type(args->fl, args->block)); | 1034 | *p++ = cpu_to_be32(nfs4_lock_type(args->fl, args->block)); |
1058 | WRITE32(args->reclaim); | 1035 | *p++ = cpu_to_be32(args->reclaim); |
1059 | WRITE64(args->fl->fl_start); | 1036 | p = xdr_encode_hyper(p, args->fl->fl_start); |
1060 | WRITE64(nfs4_lock_length(args->fl)); | 1037 | p = xdr_encode_hyper(p, nfs4_lock_length(args->fl)); |
1061 | WRITE32(args->new_lock_owner); | 1038 | *p = cpu_to_be32(args->new_lock_owner); |
1062 | if (args->new_lock_owner){ | 1039 | if (args->new_lock_owner){ |
1063 | RESERVE_SPACE(4+NFS4_STATEID_SIZE+32); | 1040 | p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+32); |
1064 | WRITE32(args->open_seqid->sequence->counter); | 1041 | *p++ = cpu_to_be32(args->open_seqid->sequence->counter); |
1065 | WRITEMEM(args->open_stateid->data, NFS4_STATEID_SIZE); | 1042 | p = xdr_encode_opaque_fixed(p, args->open_stateid->data, NFS4_STATEID_SIZE); |
1066 | WRITE32(args->lock_seqid->sequence->counter); | 1043 | *p++ = cpu_to_be32(args->lock_seqid->sequence->counter); |
1067 | WRITE64(args->lock_owner.clientid); | 1044 | p = xdr_encode_hyper(p, args->lock_owner.clientid); |
1068 | WRITE32(16); | 1045 | *p++ = cpu_to_be32(16); |
1069 | WRITEMEM("lock id:", 8); | 1046 | p = xdr_encode_opaque_fixed(p, "lock id:", 8); |
1070 | WRITE64(args->lock_owner.id); | 1047 | xdr_encode_hyper(p, args->lock_owner.id); |
1071 | } | 1048 | } |
1072 | else { | 1049 | else { |
1073 | RESERVE_SPACE(NFS4_STATEID_SIZE+4); | 1050 | p = reserve_space(xdr, NFS4_STATEID_SIZE+4); |
1074 | WRITEMEM(args->lock_stateid->data, NFS4_STATEID_SIZE); | 1051 | p = xdr_encode_opaque_fixed(p, args->lock_stateid->data, NFS4_STATEID_SIZE); |
1075 | WRITE32(args->lock_seqid->sequence->counter); | 1052 | *p = cpu_to_be32(args->lock_seqid->sequence->counter); |
1076 | } | 1053 | } |
1077 | hdr->nops++; | 1054 | hdr->nops++; |
1078 | hdr->replen += decode_lock_maxsz; | 1055 | hdr->replen += decode_lock_maxsz; |
@@ -1082,15 +1059,15 @@ static void encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *ar | |||
1082 | { | 1059 | { |
1083 | __be32 *p; | 1060 | __be32 *p; |
1084 | 1061 | ||
1085 | RESERVE_SPACE(52); | 1062 | p = reserve_space(xdr, 52); |
1086 | WRITE32(OP_LOCKT); | 1063 | *p++ = cpu_to_be32(OP_LOCKT); |
1087 | WRITE32(nfs4_lock_type(args->fl, 0)); | 1064 | *p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0)); |
1088 | WRITE64(args->fl->fl_start); | 1065 | p = xdr_encode_hyper(p, args->fl->fl_start); |
1089 | WRITE64(nfs4_lock_length(args->fl)); | 1066 | p = xdr_encode_hyper(p, nfs4_lock_length(args->fl)); |
1090 | WRITE64(args->lock_owner.clientid); | 1067 | p = xdr_encode_hyper(p, args->lock_owner.clientid); |
1091 | WRITE32(16); | 1068 | *p++ = cpu_to_be32(16); |
1092 | WRITEMEM("lock id:", 8); | 1069 | p = xdr_encode_opaque_fixed(p, "lock id:", 8); |
1093 | WRITE64(args->lock_owner.id); | 1070 | xdr_encode_hyper(p, args->lock_owner.id); |
1094 | hdr->nops++; | 1071 | hdr->nops++; |
1095 | hdr->replen += decode_lockt_maxsz; | 1072 | hdr->replen += decode_lockt_maxsz; |
1096 | } | 1073 | } |
@@ -1099,13 +1076,13 @@ static void encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *ar | |||
1099 | { | 1076 | { |
1100 | __be32 *p; | 1077 | __be32 *p; |
1101 | 1078 | ||
1102 | RESERVE_SPACE(12+NFS4_STATEID_SIZE+16); | 1079 | p = reserve_space(xdr, 12+NFS4_STATEID_SIZE+16); |
1103 | WRITE32(OP_LOCKU); | 1080 | *p++ = cpu_to_be32(OP_LOCKU); |
1104 | WRITE32(nfs4_lock_type(args->fl, 0)); | 1081 | *p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0)); |
1105 | WRITE32(args->seqid->sequence->counter); | 1082 | *p++ = cpu_to_be32(args->seqid->sequence->counter); |
1106 | WRITEMEM(args->stateid->data, NFS4_STATEID_SIZE); | 1083 | p = xdr_encode_opaque_fixed(p, args->stateid->data, NFS4_STATEID_SIZE); |
1107 | WRITE64(args->fl->fl_start); | 1084 | p = xdr_encode_hyper(p, args->fl->fl_start); |
1108 | WRITE64(nfs4_lock_length(args->fl)); | 1085 | xdr_encode_hyper(p, nfs4_lock_length(args->fl)); |
1109 | hdr->nops++; | 1086 | hdr->nops++; |
1110 | hdr->replen += decode_locku_maxsz; | 1087 | hdr->replen += decode_locku_maxsz; |
1111 | } | 1088 | } |
@@ -1115,10 +1092,9 @@ static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struc | |||
1115 | int len = name->len; | 1092 | int len = name->len; |
1116 | __be32 *p; | 1093 | __be32 *p; |
1117 | 1094 | ||
1118 | RESERVE_SPACE(8 + len); | 1095 | p = reserve_space(xdr, 8 + len); |
1119 | WRITE32(OP_LOOKUP); | 1096 | *p++ = cpu_to_be32(OP_LOOKUP); |
1120 | WRITE32(len); | 1097 | xdr_encode_opaque(p, name->name, len); |
1121 | WRITEMEM(name->name, len); | ||
1122 | hdr->nops++; | 1098 | hdr->nops++; |
1123 | hdr->replen += decode_lookup_maxsz; | 1099 | hdr->replen += decode_lookup_maxsz; |
1124 | } | 1100 | } |
@@ -1127,21 +1103,21 @@ static void encode_share_access(struct xdr_stream *xdr, fmode_t fmode) | |||
1127 | { | 1103 | { |
1128 | __be32 *p; | 1104 | __be32 *p; |
1129 | 1105 | ||
1130 | RESERVE_SPACE(8); | 1106 | p = reserve_space(xdr, 8); |
1131 | switch (fmode & (FMODE_READ|FMODE_WRITE)) { | 1107 | switch (fmode & (FMODE_READ|FMODE_WRITE)) { |
1132 | case FMODE_READ: | 1108 | case FMODE_READ: |
1133 | WRITE32(NFS4_SHARE_ACCESS_READ); | 1109 | *p++ = cpu_to_be32(NFS4_SHARE_ACCESS_READ); |
1134 | break; | 1110 | break; |
1135 | case FMODE_WRITE: | 1111 | case FMODE_WRITE: |
1136 | WRITE32(NFS4_SHARE_ACCESS_WRITE); | 1112 | *p++ = cpu_to_be32(NFS4_SHARE_ACCESS_WRITE); |
1137 | break; | 1113 | break; |
1138 | case FMODE_READ|FMODE_WRITE: | 1114 | case FMODE_READ|FMODE_WRITE: |
1139 | WRITE32(NFS4_SHARE_ACCESS_BOTH); | 1115 | *p++ = cpu_to_be32(NFS4_SHARE_ACCESS_BOTH); |
1140 | break; | 1116 | break; |
1141 | default: | 1117 | default: |
1142 | WRITE32(0); | 1118 | *p++ = cpu_to_be32(0); |
1143 | } | 1119 | } |
1144 | WRITE32(0); /* for linux, share_deny = 0 always */ | 1120 | *p = cpu_to_be32(0); /* for linux, share_deny = 0 always */ |
1145 | } | 1121 | } |
1146 | 1122 | ||
1147 | static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_openargs *arg) | 1123 | static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_openargs *arg) |
@@ -1151,29 +1127,29 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_opena | |||
1151 | * opcode 4, seqid 4, share_access 4, share_deny 4, clientid 8, ownerlen 4, | 1127 | * opcode 4, seqid 4, share_access 4, share_deny 4, clientid 8, ownerlen 4, |
1152 | * owner 4 = 32 | 1128 | * owner 4 = 32 |
1153 | */ | 1129 | */ |
1154 | RESERVE_SPACE(8); | 1130 | p = reserve_space(xdr, 8); |
1155 | WRITE32(OP_OPEN); | 1131 | *p++ = cpu_to_be32(OP_OPEN); |
1156 | WRITE32(arg->seqid->sequence->counter); | 1132 | *p = cpu_to_be32(arg->seqid->sequence->counter); |
1157 | encode_share_access(xdr, arg->fmode); | 1133 | encode_share_access(xdr, arg->fmode); |
1158 | RESERVE_SPACE(28); | 1134 | p = reserve_space(xdr, 28); |
1159 | WRITE64(arg->clientid); | 1135 | p = xdr_encode_hyper(p, arg->clientid); |
1160 | WRITE32(16); | 1136 | *p++ = cpu_to_be32(16); |
1161 | WRITEMEM("open id:", 8); | 1137 | p = xdr_encode_opaque_fixed(p, "open id:", 8); |
1162 | WRITE64(arg->id); | 1138 | xdr_encode_hyper(p, arg->id); |
1163 | } | 1139 | } |
1164 | 1140 | ||
1165 | static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_openargs *arg) | 1141 | static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_openargs *arg) |
1166 | { | 1142 | { |
1167 | __be32 *p; | 1143 | __be32 *p; |
1168 | 1144 | ||
1169 | RESERVE_SPACE(4); | 1145 | p = reserve_space(xdr, 4); |
1170 | switch(arg->open_flags & O_EXCL) { | 1146 | switch(arg->open_flags & O_EXCL) { |
1171 | case 0: | 1147 | case 0: |
1172 | WRITE32(NFS4_CREATE_UNCHECKED); | 1148 | *p = cpu_to_be32(NFS4_CREATE_UNCHECKED); |
1173 | encode_attrs(xdr, arg->u.attrs, arg->server); | 1149 | encode_attrs(xdr, arg->u.attrs, arg->server); |
1174 | break; | 1150 | break; |
1175 | default: | 1151 | default: |
1176 | WRITE32(NFS4_CREATE_EXCLUSIVE); | 1152 | *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE); |
1177 | encode_nfs4_verifier(xdr, &arg->u.verifier); | 1153 | encode_nfs4_verifier(xdr, &arg->u.verifier); |
1178 | } | 1154 | } |
1179 | } | 1155 | } |
@@ -1182,14 +1158,14 @@ static void encode_opentype(struct xdr_stream *xdr, const struct nfs_openargs *a | |||
1182 | { | 1158 | { |
1183 | __be32 *p; | 1159 | __be32 *p; |
1184 | 1160 | ||
1185 | RESERVE_SPACE(4); | 1161 | p = reserve_space(xdr, 4); |
1186 | switch (arg->open_flags & O_CREAT) { | 1162 | switch (arg->open_flags & O_CREAT) { |
1187 | case 0: | 1163 | case 0: |
1188 | WRITE32(NFS4_OPEN_NOCREATE); | 1164 | *p = cpu_to_be32(NFS4_OPEN_NOCREATE); |
1189 | break; | 1165 | break; |
1190 | default: | 1166 | default: |
1191 | BUG_ON(arg->claim != NFS4_OPEN_CLAIM_NULL); | 1167 | BUG_ON(arg->claim != NFS4_OPEN_CLAIM_NULL); |
1192 | WRITE32(NFS4_OPEN_CREATE); | 1168 | *p = cpu_to_be32(NFS4_OPEN_CREATE); |
1193 | encode_createmode(xdr, arg); | 1169 | encode_createmode(xdr, arg); |
1194 | } | 1170 | } |
1195 | } | 1171 | } |
@@ -1198,16 +1174,16 @@ static inline void encode_delegation_type(struct xdr_stream *xdr, fmode_t delega | |||
1198 | { | 1174 | { |
1199 | __be32 *p; | 1175 | __be32 *p; |
1200 | 1176 | ||
1201 | RESERVE_SPACE(4); | 1177 | p = reserve_space(xdr, 4); |
1202 | switch (delegation_type) { | 1178 | switch (delegation_type) { |
1203 | case 0: | 1179 | case 0: |
1204 | WRITE32(NFS4_OPEN_DELEGATE_NONE); | 1180 | *p = cpu_to_be32(NFS4_OPEN_DELEGATE_NONE); |
1205 | break; | 1181 | break; |
1206 | case FMODE_READ: | 1182 | case FMODE_READ: |
1207 | WRITE32(NFS4_OPEN_DELEGATE_READ); | 1183 | *p = cpu_to_be32(NFS4_OPEN_DELEGATE_READ); |
1208 | break; | 1184 | break; |
1209 | case FMODE_WRITE|FMODE_READ: | 1185 | case FMODE_WRITE|FMODE_READ: |
1210 | WRITE32(NFS4_OPEN_DELEGATE_WRITE); | 1186 | *p = cpu_to_be32(NFS4_OPEN_DELEGATE_WRITE); |
1211 | break; | 1187 | break; |
1212 | default: | 1188 | default: |
1213 | BUG(); | 1189 | BUG(); |
@@ -1218,8 +1194,8 @@ static inline void encode_claim_null(struct xdr_stream *xdr, const struct qstr * | |||
1218 | { | 1194 | { |
1219 | __be32 *p; | 1195 | __be32 *p; |
1220 | 1196 | ||
1221 | RESERVE_SPACE(4); | 1197 | p = reserve_space(xdr, 4); |
1222 | WRITE32(NFS4_OPEN_CLAIM_NULL); | 1198 | *p = cpu_to_be32(NFS4_OPEN_CLAIM_NULL); |
1223 | encode_string(xdr, name->len, name->name); | 1199 | encode_string(xdr, name->len, name->name); |
1224 | } | 1200 | } |
1225 | 1201 | ||
@@ -1227,8 +1203,8 @@ static inline void encode_claim_previous(struct xdr_stream *xdr, fmode_t type) | |||
1227 | { | 1203 | { |
1228 | __be32 *p; | 1204 | __be32 *p; |
1229 | 1205 | ||
1230 | RESERVE_SPACE(4); | 1206 | p = reserve_space(xdr, 4); |
1231 | WRITE32(NFS4_OPEN_CLAIM_PREVIOUS); | 1207 | *p = cpu_to_be32(NFS4_OPEN_CLAIM_PREVIOUS); |
1232 | encode_delegation_type(xdr, type); | 1208 | encode_delegation_type(xdr, type); |
1233 | } | 1209 | } |
1234 | 1210 | ||
@@ -1236,9 +1212,9 @@ static inline void encode_claim_delegate_cur(struct xdr_stream *xdr, const struc | |||
1236 | { | 1212 | { |
1237 | __be32 *p; | 1213 | __be32 *p; |
1238 | 1214 | ||
1239 | RESERVE_SPACE(4+NFS4_STATEID_SIZE); | 1215 | p = reserve_space(xdr, 4+NFS4_STATEID_SIZE); |
1240 | WRITE32(NFS4_OPEN_CLAIM_DELEGATE_CUR); | 1216 | *p++ = cpu_to_be32(NFS4_OPEN_CLAIM_DELEGATE_CUR); |
1241 | WRITEMEM(stateid->data, NFS4_STATEID_SIZE); | 1217 | xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE); |
1242 | encode_string(xdr, name->len, name->name); | 1218 | encode_string(xdr, name->len, name->name); |
1243 | } | 1219 | } |
1244 | 1220 | ||
@@ -1267,10 +1243,10 @@ static void encode_open_confirm(struct xdr_stream *xdr, const struct nfs_open_co | |||
1267 | { | 1243 | { |
1268 | __be32 *p; | 1244 | __be32 *p; |
1269 | 1245 | ||
1270 | RESERVE_SPACE(4+NFS4_STATEID_SIZE+4); | 1246 | p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4); |
1271 | WRITE32(OP_OPEN_CONFIRM); | 1247 | *p++ = cpu_to_be32(OP_OPEN_CONFIRM); |
1272 | WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE); | 1248 | p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE); |
1273 | WRITE32(arg->seqid->sequence->counter); | 1249 | *p = cpu_to_be32(arg->seqid->sequence->counter); |
1274 | hdr->nops++; | 1250 | hdr->nops++; |
1275 | hdr->replen += decode_open_confirm_maxsz; | 1251 | hdr->replen += decode_open_confirm_maxsz; |
1276 | } | 1252 | } |
@@ -1279,10 +1255,10 @@ static void encode_open_downgrade(struct xdr_stream *xdr, const struct nfs_close | |||
1279 | { | 1255 | { |
1280 | __be32 *p; | 1256 | __be32 *p; |
1281 | 1257 | ||
1282 | RESERVE_SPACE(4+NFS4_STATEID_SIZE+4); | 1258 | p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4); |
1283 | WRITE32(OP_OPEN_DOWNGRADE); | 1259 | *p++ = cpu_to_be32(OP_OPEN_DOWNGRADE); |
1284 | WRITEMEM(arg->stateid->data, NFS4_STATEID_SIZE); | 1260 | p = xdr_encode_opaque_fixed(p, arg->stateid->data, NFS4_STATEID_SIZE); |
1285 | WRITE32(arg->seqid->sequence->counter); | 1261 | *p = cpu_to_be32(arg->seqid->sequence->counter); |
1286 | encode_share_access(xdr, arg->fmode); | 1262 | encode_share_access(xdr, arg->fmode); |
1287 | hdr->nops++; | 1263 | hdr->nops++; |
1288 | hdr->replen += decode_open_downgrade_maxsz; | 1264 | hdr->replen += decode_open_downgrade_maxsz; |
@@ -1294,10 +1270,9 @@ encode_putfh(struct xdr_stream *xdr, const struct nfs_fh *fh, struct compound_hd | |||
1294 | int len = fh->size; | 1270 | int len = fh->size; |
1295 | __be32 *p; | 1271 | __be32 *p; |
1296 | 1272 | ||
1297 | RESERVE_SPACE(8 + len); | 1273 | p = reserve_space(xdr, 8 + len); |
1298 | WRITE32(OP_PUTFH); | 1274 | *p++ = cpu_to_be32(OP_PUTFH); |
1299 | WRITE32(len); | 1275 | xdr_encode_opaque(p, fh->data, len); |
1300 | WRITEMEM(fh->data, len); | ||
1301 | hdr->nops++; | 1276 | hdr->nops++; |
1302 | hdr->replen += decode_putfh_maxsz; | 1277 | hdr->replen += decode_putfh_maxsz; |
1303 | } | 1278 | } |
@@ -1306,8 +1281,8 @@ static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr) | |||
1306 | { | 1281 | { |
1307 | __be32 *p; | 1282 | __be32 *p; |
1308 | 1283 | ||
1309 | RESERVE_SPACE(4); | 1284 | p = reserve_space(xdr, 4); |
1310 | WRITE32(OP_PUTROOTFH); | 1285 | *p = cpu_to_be32(OP_PUTROOTFH); |
1311 | hdr->nops++; | 1286 | hdr->nops++; |
1312 | hdr->replen += decode_putrootfh_maxsz; | 1287 | hdr->replen += decode_putrootfh_maxsz; |
1313 | } | 1288 | } |
@@ -1317,26 +1292,26 @@ static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context | |||
1317 | nfs4_stateid stateid; | 1292 | nfs4_stateid stateid; |
1318 | __be32 *p; | 1293 | __be32 *p; |
1319 | 1294 | ||
1320 | RESERVE_SPACE(NFS4_STATEID_SIZE); | 1295 | p = reserve_space(xdr, NFS4_STATEID_SIZE); |
1321 | if (ctx->state != NULL) { | 1296 | if (ctx->state != NULL) { |
1322 | nfs4_copy_stateid(&stateid, ctx->state, ctx->lockowner); | 1297 | nfs4_copy_stateid(&stateid, ctx->state, ctx->lockowner); |
1323 | WRITEMEM(stateid.data, NFS4_STATEID_SIZE); | 1298 | xdr_encode_opaque_fixed(p, stateid.data, NFS4_STATEID_SIZE); |
1324 | } else | 1299 | } else |
1325 | WRITEMEM(zero_stateid.data, NFS4_STATEID_SIZE); | 1300 | xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE); |
1326 | } | 1301 | } |
1327 | 1302 | ||
1328 | static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args, struct compound_hdr *hdr) | 1303 | static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args, struct compound_hdr *hdr) |
1329 | { | 1304 | { |
1330 | __be32 *p; | 1305 | __be32 *p; |
1331 | 1306 | ||
1332 | RESERVE_SPACE(4); | 1307 | p = reserve_space(xdr, 4); |
1333 | WRITE32(OP_READ); | 1308 | *p = cpu_to_be32(OP_READ); |
1334 | 1309 | ||
1335 | encode_stateid(xdr, args->context); | 1310 | encode_stateid(xdr, args->context); |
1336 | 1311 | ||
1337 | RESERVE_SPACE(12); | 1312 | p = reserve_space(xdr, 12); |
1338 | WRITE64(args->offset); | 1313 | p = xdr_encode_hyper(p, args->offset); |
1339 | WRITE32(args->count); | 1314 | *p = cpu_to_be32(args->count); |
1340 | hdr->nops++; | 1315 | hdr->nops++; |
1341 | hdr->replen += decode_read_maxsz; | 1316 | hdr->replen += decode_read_maxsz; |
1342 | } | 1317 | } |
@@ -1349,20 +1324,20 @@ static void encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg | |||
1349 | }; | 1324 | }; |
1350 | __be32 *p; | 1325 | __be32 *p; |
1351 | 1326 | ||
1352 | RESERVE_SPACE(12+NFS4_VERIFIER_SIZE+20); | 1327 | p = reserve_space(xdr, 12+NFS4_VERIFIER_SIZE+20); |
1353 | WRITE32(OP_READDIR); | 1328 | *p++ = cpu_to_be32(OP_READDIR); |
1354 | WRITE64(readdir->cookie); | 1329 | p = xdr_encode_hyper(p, readdir->cookie); |
1355 | WRITEMEM(readdir->verifier.data, NFS4_VERIFIER_SIZE); | 1330 | p = xdr_encode_opaque_fixed(p, readdir->verifier.data, NFS4_VERIFIER_SIZE); |
1356 | WRITE32(readdir->count >> 1); /* We're not doing readdirplus */ | 1331 | *p++ = cpu_to_be32(readdir->count >> 1); /* We're not doing readdirplus */ |
1357 | WRITE32(readdir->count); | 1332 | *p++ = cpu_to_be32(readdir->count); |
1358 | WRITE32(2); | 1333 | *p++ = cpu_to_be32(2); |
1359 | /* Switch to mounted_on_fileid if the server supports it */ | 1334 | /* Switch to mounted_on_fileid if the server supports it */ |
1360 | if (readdir->bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID) | 1335 | if (readdir->bitmask[1] & FATTR4_WORD1_MOUNTED_ON_FILEID) |
1361 | attrs[0] &= ~FATTR4_WORD0_FILEID; | 1336 | attrs[0] &= ~FATTR4_WORD0_FILEID; |
1362 | else | 1337 | else |
1363 | attrs[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID; | 1338 | attrs[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID; |
1364 | WRITE32(attrs[0] & readdir->bitmask[0]); | 1339 | *p++ = cpu_to_be32(attrs[0] & readdir->bitmask[0]); |
1365 | WRITE32(attrs[1] & readdir->bitmask[1]); | 1340 | *p = cpu_to_be32(attrs[1] & readdir->bitmask[1]); |
1366 | hdr->nops++; | 1341 | hdr->nops++; |
1367 | hdr->replen += decode_readdir_maxsz; | 1342 | hdr->replen += decode_readdir_maxsz; |
1368 | dprintk("%s: cookie = %Lu, verifier = %08x:%08x, bitmap = %08x:%08x\n", | 1343 | dprintk("%s: cookie = %Lu, verifier = %08x:%08x, bitmap = %08x:%08x\n", |
@@ -1378,8 +1353,8 @@ static void encode_readlink(struct xdr_stream *xdr, const struct nfs4_readlink * | |||
1378 | { | 1353 | { |
1379 | __be32 *p; | 1354 | __be32 *p; |
1380 | 1355 | ||
1381 | RESERVE_SPACE(4); | 1356 | p = reserve_space(xdr, 4); |
1382 | WRITE32(OP_READLINK); | 1357 | *p = cpu_to_be32(OP_READLINK); |
1383 | hdr->nops++; | 1358 | hdr->nops++; |
1384 | hdr->replen += decode_readlink_maxsz; | 1359 | hdr->replen += decode_readlink_maxsz; |
1385 | } | 1360 | } |
@@ -1388,10 +1363,9 @@ static void encode_remove(struct xdr_stream *xdr, const struct qstr *name, struc | |||
1388 | { | 1363 | { |
1389 | __be32 *p; | 1364 | __be32 *p; |
1390 | 1365 | ||
1391 | RESERVE_SPACE(8 + name->len); | 1366 | p = reserve_space(xdr, 8 + name->len); |
1392 | WRITE32(OP_REMOVE); | 1367 | *p++ = cpu_to_be32(OP_REMOVE); |
1393 | WRITE32(name->len); | 1368 | xdr_encode_opaque(p, name->name, name->len); |
1394 | WRITEMEM(name->name, name->len); | ||
1395 | hdr->nops++; | 1369 | hdr->nops++; |
1396 | hdr->replen += decode_remove_maxsz; | 1370 | hdr->replen += decode_remove_maxsz; |
1397 | } | 1371 | } |
@@ -1400,14 +1374,10 @@ static void encode_rename(struct xdr_stream *xdr, const struct qstr *oldname, co | |||
1400 | { | 1374 | { |
1401 | __be32 *p; | 1375 | __be32 *p; |
1402 | 1376 | ||
1403 | RESERVE_SPACE(8 + oldname->len); | 1377 | p = reserve_space(xdr, 4); |
1404 | WRITE32(OP_RENAME); | 1378 | *p = cpu_to_be32(OP_RENAME); |
1405 | WRITE32(oldname->len); | 1379 | encode_string(xdr, oldname->len, oldname->name); |
1406 | WRITEMEM(oldname->name, oldname->len); | 1380 | encode_string(xdr, newname->len, newname->name); |
1407 | |||
1408 | RESERVE_SPACE(4 + newname->len); | ||
1409 | WRITE32(newname->len); | ||
1410 | WRITEMEM(newname->name, newname->len); | ||
1411 | hdr->nops++; | 1381 | hdr->nops++; |
1412 | hdr->replen += decode_rename_maxsz; | 1382 | hdr->replen += decode_rename_maxsz; |
1413 | } | 1383 | } |
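
encode_rename() shrinks to two encode_string() calls because an XDR string is simply a 32-bit length followed by the bytes, zero-padded to a 4-byte boundary — the same layout xdr_encode_opaque() produces in the hunks above. A sketch of that wire format:

    #include <arpa/inet.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Sketch of xdr_encode_opaque(): length word, data, zero pad to 4 bytes. */
    static uint32_t *encode_opaque(uint32_t *p, const void *data, uint32_t len)
    {
        *p++ = htonl(len);
        memcpy(p, data, len);
        if (len & 3)
            memset((char *)p + len, 0, 4 - (len & 3));
        return p + ((len + 3) >> 2);
    }

    int main(void)
    {
        uint32_t buf[8], *p = buf;

        p = encode_opaque(p, "old", 3);        /* oldname: 1 + 1 words */
        p = encode_opaque(p, "newname", 7);    /* newname: 1 + 2 words */
        printf("encoded %td words\n", p - buf);
        return 0;
    }
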
@@ -1416,9 +1386,9 @@ static void encode_renew(struct xdr_stream *xdr, const struct nfs_client *client | |||
1416 | { | 1386 | { |
1417 | __be32 *p; | 1387 | __be32 *p; |
1418 | 1388 | ||
1419 | RESERVE_SPACE(12); | 1389 | p = reserve_space(xdr, 12); |
1420 | WRITE32(OP_RENEW); | 1390 | *p++ = cpu_to_be32(OP_RENEW); |
1421 | WRITE64(client_stateid->cl_clientid); | 1391 | xdr_encode_hyper(p, client_stateid->cl_clientid); |
1422 | hdr->nops++; | 1392 | hdr->nops++; |
1423 | hdr->replen += decode_renew_maxsz; | 1393 | hdr->replen += decode_renew_maxsz; |
1424 | } | 1394 | } |
@@ -1428,8 +1398,8 @@ encode_restorefh(struct xdr_stream *xdr, struct compound_hdr *hdr) | |||
1428 | { | 1398 | { |
1429 | __be32 *p; | 1399 | __be32 *p; |
1430 | 1400 | ||
1431 | RESERVE_SPACE(4); | 1401 | p = reserve_space(xdr, 4); |
1432 | WRITE32(OP_RESTOREFH); | 1402 | *p = cpu_to_be32(OP_RESTOREFH); |
1433 | hdr->nops++; | 1403 | hdr->nops++; |
1434 | hdr->replen += decode_restorefh_maxsz; | 1404 | hdr->replen += decode_restorefh_maxsz; |
1435 | } | 1405 | } |
@@ -1439,16 +1409,16 @@ encode_setacl(struct xdr_stream *xdr, struct nfs_setaclargs *arg, struct compoun | |||
1439 | { | 1409 | { |
1440 | __be32 *p; | 1410 | __be32 *p; |
1441 | 1411 | ||
1442 | RESERVE_SPACE(4+NFS4_STATEID_SIZE); | 1412 | p = reserve_space(xdr, 4+NFS4_STATEID_SIZE); |
1443 | WRITE32(OP_SETATTR); | 1413 | *p++ = cpu_to_be32(OP_SETATTR); |
1444 | WRITEMEM(zero_stateid.data, NFS4_STATEID_SIZE); | 1414 | xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE); |
1445 | RESERVE_SPACE(2*4); | 1415 | p = reserve_space(xdr, 2*4); |
1446 | WRITE32(1); | 1416 | *p++ = cpu_to_be32(1); |
1447 | WRITE32(FATTR4_WORD0_ACL); | 1417 | *p = cpu_to_be32(FATTR4_WORD0_ACL); |
1448 | if (arg->acl_len % 4) | 1418 | if (arg->acl_len % 4) |
1449 | return -EINVAL; | 1419 | return -EINVAL; |
1450 | RESERVE_SPACE(4); | 1420 | p = reserve_space(xdr, 4); |
1451 | WRITE32(arg->acl_len); | 1421 | *p = cpu_to_be32(arg->acl_len); |
1452 | xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len); | 1422 | xdr_write_pages(xdr, arg->acl_pages, arg->acl_pgbase, arg->acl_len); |
1453 | hdr->nops++; | 1423 | hdr->nops++; |
1454 | hdr->replen += decode_setacl_maxsz; | 1424 | hdr->replen += decode_setacl_maxsz; |
@@ -1460,8 +1430,8 @@ encode_savefh(struct xdr_stream *xdr, struct compound_hdr *hdr) | |||
1460 | { | 1430 | { |
1461 | __be32 *p; | 1431 | __be32 *p; |
1462 | 1432 | ||
1463 | RESERVE_SPACE(4); | 1433 | p = reserve_space(xdr, 4); |
1464 | WRITE32(OP_SAVEFH); | 1434 | *p = cpu_to_be32(OP_SAVEFH); |
1465 | hdr->nops++; | 1435 | hdr->nops++; |
1466 | hdr->replen += decode_savefh_maxsz; | 1436 | hdr->replen += decode_savefh_maxsz; |
1467 | } | 1437 | } |
@@ -1470,9 +1440,9 @@ static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs | |||
1470 | { | 1440 | { |
1471 | __be32 *p; | 1441 | __be32 *p; |
1472 | 1442 | ||
1473 | RESERVE_SPACE(4+NFS4_STATEID_SIZE); | 1443 | p = reserve_space(xdr, 4+NFS4_STATEID_SIZE); |
1474 | WRITE32(OP_SETATTR); | 1444 | *p++ = cpu_to_be32(OP_SETATTR); |
1475 | WRITEMEM(arg->stateid.data, NFS4_STATEID_SIZE); | 1445 | xdr_encode_opaque_fixed(p, arg->stateid.data, NFS4_STATEID_SIZE); |
1476 | hdr->nops++; | 1446 | hdr->nops++; |
1477 | hdr->replen += decode_setattr_maxsz; | 1447 | hdr->replen += decode_setattr_maxsz; |
1478 | encode_attrs(xdr, arg->iap, server); | 1448 | encode_attrs(xdr, arg->iap, server); |
@@ -1482,17 +1452,17 @@ static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclie | |||
1482 | { | 1452 | { |
1483 | __be32 *p; | 1453 | __be32 *p; |
1484 | 1454 | ||
1485 | RESERVE_SPACE(4 + NFS4_VERIFIER_SIZE); | 1455 | p = reserve_space(xdr, 4 + NFS4_VERIFIER_SIZE); |
1486 | WRITE32(OP_SETCLIENTID); | 1456 | *p++ = cpu_to_be32(OP_SETCLIENTID); |
1487 | WRITEMEM(setclientid->sc_verifier->data, NFS4_VERIFIER_SIZE); | 1457 | xdr_encode_opaque_fixed(p, setclientid->sc_verifier->data, NFS4_VERIFIER_SIZE); |
1488 | 1458 | ||
1489 | encode_string(xdr, setclientid->sc_name_len, setclientid->sc_name); | 1459 | encode_string(xdr, setclientid->sc_name_len, setclientid->sc_name); |
1490 | RESERVE_SPACE(4); | 1460 | p = reserve_space(xdr, 4); |
1491 | WRITE32(setclientid->sc_prog); | 1461 | *p = cpu_to_be32(setclientid->sc_prog); |
1492 | encode_string(xdr, setclientid->sc_netid_len, setclientid->sc_netid); | 1462 | encode_string(xdr, setclientid->sc_netid_len, setclientid->sc_netid); |
1493 | encode_string(xdr, setclientid->sc_uaddr_len, setclientid->sc_uaddr); | 1463 | encode_string(xdr, setclientid->sc_uaddr_len, setclientid->sc_uaddr); |
1494 | RESERVE_SPACE(4); | 1464 | p = reserve_space(xdr, 4); |
1495 | WRITE32(setclientid->sc_cb_ident); | 1465 | *p = cpu_to_be32(setclientid->sc_cb_ident); |
1496 | hdr->nops++; | 1466 | hdr->nops++; |
1497 | hdr->replen += decode_setclientid_maxsz; | 1467 | hdr->replen += decode_setclientid_maxsz; |
1498 | } | 1468 | } |
@@ -1501,10 +1471,10 @@ static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs_ | |||
1501 | { | 1471 | { |
1502 | __be32 *p; | 1472 | __be32 *p; |
1503 | 1473 | ||
1504 | RESERVE_SPACE(12 + NFS4_VERIFIER_SIZE); | 1474 | p = reserve_space(xdr, 12 + NFS4_VERIFIER_SIZE); |
1505 | WRITE32(OP_SETCLIENTID_CONFIRM); | 1475 | *p++ = cpu_to_be32(OP_SETCLIENTID_CONFIRM); |
1506 | WRITE64(client_state->cl_clientid); | 1476 | p = xdr_encode_hyper(p, client_state->cl_clientid); |
1507 | WRITEMEM(client_state->cl_confirm.data, NFS4_VERIFIER_SIZE); | 1477 | xdr_encode_opaque_fixed(p, client_state->cl_confirm.data, NFS4_VERIFIER_SIZE); |
1508 | hdr->nops++; | 1478 | hdr->nops++; |
1509 | hdr->replen += decode_setclientid_confirm_maxsz; | 1479 | hdr->replen += decode_setclientid_confirm_maxsz; |
1510 | } | 1480 | } |
@@ -1513,15 +1483,15 @@ static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *arg | |||
1513 | { | 1483 | { |
1514 | __be32 *p; | 1484 | __be32 *p; |
1515 | 1485 | ||
1516 | RESERVE_SPACE(4); | 1486 | p = reserve_space(xdr, 4); |
1517 | WRITE32(OP_WRITE); | 1487 | *p = cpu_to_be32(OP_WRITE); |
1518 | 1488 | ||
1519 | encode_stateid(xdr, args->context); | 1489 | encode_stateid(xdr, args->context); |
1520 | 1490 | ||
1521 | RESERVE_SPACE(16); | 1491 | p = reserve_space(xdr, 16); |
1522 | WRITE64(args->offset); | 1492 | p = xdr_encode_hyper(p, args->offset); |
1523 | WRITE32(args->stable); | 1493 | *p++ = cpu_to_be32(args->stable); |
1524 | WRITE32(args->count); | 1494 | *p = cpu_to_be32(args->count); |
1525 | 1495 | ||
1526 | xdr_write_pages(xdr, args->pages, args->pgbase, args->count); | 1496 | xdr_write_pages(xdr, args->pages, args->pgbase, args->count); |
1527 | hdr->nops++; | 1497 | hdr->nops++; |
@@ -1532,10 +1502,10 @@ static void encode_delegreturn(struct xdr_stream *xdr, const nfs4_stateid *state | |||
1532 | { | 1502 | { |
1533 | __be32 *p; | 1503 | __be32 *p; |
1534 | 1504 | ||
1535 | RESERVE_SPACE(4+NFS4_STATEID_SIZE); | 1505 | p = reserve_space(xdr, 4+NFS4_STATEID_SIZE); |
1536 | 1506 | ||
1537 | WRITE32(OP_DELEGRETURN); | 1507 | *p++ = cpu_to_be32(OP_DELEGRETURN); |
1538 | WRITEMEM(stateid->data, NFS4_STATEID_SIZE); | 1508 | xdr_encode_opaque_fixed(p, stateid->data, NFS4_STATEID_SIZE); |
1539 | hdr->nops++; | 1509 | hdr->nops++; |
1540 | hdr->replen += decode_delegreturn_maxsz; | 1510 | hdr->replen += decode_delegreturn_maxsz; |
1541 | } | 1511 | } |
@@ -1548,16 +1518,16 @@ static void encode_exchange_id(struct xdr_stream *xdr, | |||
1548 | { | 1518 | { |
1549 | __be32 *p; | 1519 | __be32 *p; |
1550 | 1520 | ||
1551 | RESERVE_SPACE(4 + sizeof(args->verifier->data)); | 1521 | p = reserve_space(xdr, 4 + sizeof(args->verifier->data)); |
1552 | WRITE32(OP_EXCHANGE_ID); | 1522 | *p++ = cpu_to_be32(OP_EXCHANGE_ID); |
1553 | WRITEMEM(args->verifier->data, sizeof(args->verifier->data)); | 1523 | xdr_encode_opaque_fixed(p, args->verifier->data, sizeof(args->verifier->data)); |
1554 | 1524 | ||
1555 | encode_string(xdr, args->id_len, args->id); | 1525 | encode_string(xdr, args->id_len, args->id); |
1556 | 1526 | ||
1557 | RESERVE_SPACE(12); | 1527 | p = reserve_space(xdr, 12); |
1558 | WRITE32(args->flags); | 1528 | *p++ = cpu_to_be32(args->flags); |
1559 | WRITE32(0); /* zero length state_protect4_a */ | 1529 | *p++ = cpu_to_be32(0); /* zero length state_protect4_a */ |
1560 | WRITE32(0); /* zero length implementation id array */ | 1530 | *p = cpu_to_be32(0); /* zero length implementation id array */ |
1561 | hdr->nops++; | 1531 | hdr->nops++; |
1562 | hdr->replen += decode_exchange_id_maxsz; | 1532 | hdr->replen += decode_exchange_id_maxsz; |
1563 | } | 1533 | } |
@@ -1571,55 +1541,43 @@ static void encode_create_session(struct xdr_stream *xdr, | |||
1571 | uint32_t len; | 1541 | uint32_t len; |
1572 | struct nfs_client *clp = args->client; | 1542 | struct nfs_client *clp = args->client; |
1573 | 1543 | ||
1574 | RESERVE_SPACE(4); | 1544 | len = scnprintf(machine_name, sizeof(machine_name), "%s", |
1575 | WRITE32(OP_CREATE_SESSION); | 1545 | clp->cl_ipaddr); |
1576 | |||
1577 | RESERVE_SPACE(8); | ||
1578 | WRITE64(clp->cl_ex_clid); | ||
1579 | 1546 | ||
1580 | RESERVE_SPACE(8); | 1547 | p = reserve_space(xdr, 20 + 2*28 + 20 + len + 12); |
1581 | WRITE32(clp->cl_seqid); /*Sequence id */ | 1548 | *p++ = cpu_to_be32(OP_CREATE_SESSION); |
1582 | WRITE32(args->flags); /*flags */ | 1549 | p = xdr_encode_hyper(p, clp->cl_ex_clid); |
1550 | *p++ = cpu_to_be32(clp->cl_seqid); /*Sequence id */ | ||
1551 | *p++ = cpu_to_be32(args->flags); /*flags */ | ||
1583 | 1552 | ||
1584 | RESERVE_SPACE(2*28); /* 2 channel_attrs */ | ||
1585 | /* Fore Channel */ | 1553 | /* Fore Channel */ |
1586 | WRITE32(args->fc_attrs.headerpadsz); /* header padding size */ | 1554 | *p++ = cpu_to_be32(args->fc_attrs.headerpadsz); /* header padding size */ |
1587 | WRITE32(args->fc_attrs.max_rqst_sz); /* max req size */ | 1555 | *p++ = cpu_to_be32(args->fc_attrs.max_rqst_sz); /* max req size */ |
1588 | WRITE32(args->fc_attrs.max_resp_sz); /* max resp size */ | 1556 | *p++ = cpu_to_be32(args->fc_attrs.max_resp_sz); /* max resp size */ |
1589 | WRITE32(args->fc_attrs.max_resp_sz_cached); /* Max resp sz cached */ | 1557 | *p++ = cpu_to_be32(args->fc_attrs.max_resp_sz_cached); /* Max resp sz cached */ |
1590 | WRITE32(args->fc_attrs.max_ops); /* max operations */ | 1558 | *p++ = cpu_to_be32(args->fc_attrs.max_ops); /* max operations */ |
1591 | WRITE32(args->fc_attrs.max_reqs); /* max requests */ | 1559 | *p++ = cpu_to_be32(args->fc_attrs.max_reqs); /* max requests */ |
1592 | WRITE32(0); /* rdmachannel_attrs */ | 1560 | *p++ = cpu_to_be32(0); /* rdmachannel_attrs */ |
1593 | 1561 | ||
1594 | /* Back Channel */ | 1562 | /* Back Channel */ |
1595 | WRITE32(args->fc_attrs.headerpadsz); /* header padding size */ | 1563 | *p++ = cpu_to_be32(args->fc_attrs.headerpadsz); /* header padding size */ |
1596 | WRITE32(args->bc_attrs.max_rqst_sz); /* max req size */ | 1564 | *p++ = cpu_to_be32(args->bc_attrs.max_rqst_sz); /* max req size */ |
1597 | WRITE32(args->bc_attrs.max_resp_sz); /* max resp size */ | 1565 | *p++ = cpu_to_be32(args->bc_attrs.max_resp_sz); /* max resp size */ |
1598 | WRITE32(args->bc_attrs.max_resp_sz_cached); /* Max resp sz cached */ | 1566 | *p++ = cpu_to_be32(args->bc_attrs.max_resp_sz_cached); /* Max resp sz cached */ |
1599 | WRITE32(args->bc_attrs.max_ops); /* max operations */ | 1567 | *p++ = cpu_to_be32(args->bc_attrs.max_ops); /* max operations */ |
1600 | WRITE32(args->bc_attrs.max_reqs); /* max requests */ | 1568 | *p++ = cpu_to_be32(args->bc_attrs.max_reqs); /* max requests */ |
1601 | WRITE32(0); /* rdmachannel_attrs */ | 1569 | *p++ = cpu_to_be32(0); /* rdmachannel_attrs */ |
1602 | 1570 | ||
1603 | RESERVE_SPACE(4); | 1571 | *p++ = cpu_to_be32(args->cb_program); /* cb_program */ |
1604 | WRITE32(args->cb_program); /* cb_program */ | 1572 | *p++ = cpu_to_be32(1); |
1605 | 1573 | *p++ = cpu_to_be32(RPC_AUTH_UNIX); /* auth_sys */ | |
1606 | RESERVE_SPACE(4); /* # of security flavors */ | ||
1607 | WRITE32(1); | ||
1608 | |||
1609 | RESERVE_SPACE(4); | ||
1610 | WRITE32(RPC_AUTH_UNIX); /* auth_sys */ | ||
1611 | 1574 | ||
1612 | /* authsys_parms rfc1831 */ | 1575 | /* authsys_parms rfc1831 */ |
1613 | RESERVE_SPACE(4); | 1576 | *p++ = cpu_to_be32((u32)clp->cl_boot_time.tv_nsec); /* stamp */ |
1614 | WRITE32((u32)clp->cl_boot_time.tv_nsec); /* stamp */ | 1577 | p = xdr_encode_opaque(p, machine_name, len); |
1615 | len = scnprintf(machine_name, sizeof(machine_name), "%s", | 1578 | *p++ = cpu_to_be32(0); /* UID */ |
1616 | clp->cl_ipaddr); | 1579 | *p++ = cpu_to_be32(0); /* GID */ |
1617 | RESERVE_SPACE(16 + len); | 1580 | *p = cpu_to_be32(0); /* No more gids */ |
1618 | WRITE32(len); | ||
1619 | WRITEMEM(machine_name, len); | ||
1620 | WRITE32(0); /* UID */ | ||
1621 | WRITE32(0); /* GID */ | ||
1622 | WRITE32(0); /* No more gids */ | ||
1623 | hdr->nops++; | 1581 | hdr->nops++; |
1624 | hdr->replen += decode_create_session_maxsz; | 1582 | hdr->replen += decode_create_session_maxsz; |
1625 | } | 1583 | } |
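
encode_create_session() used to interleave eight RESERVE_SPACE calls with its stores; the rewrite measures the one variable-length field (the machine name) with scnprintf() first, so a single reservation of 20 + 2*28 + 20 + len + 12 bytes covers the whole operation. A compressed sketch of the measure-then-reserve order (opcode and field values are placeholders):

    #include <arpa/inet.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
        char name[64];
        uint32_t buf[64], *p = buf;

        /* Measure the only variable-length field first... */
        uint32_t len = (uint32_t)snprintf(name, sizeof(name), "%s", "10.0.0.2");
        /* ...so one reservation can cover the whole operation:
         * fixed words (8 bytes here) + length word + padded name. */
        uint32_t total = 8 + 4 + ((len + 3) & ~3u);

        *p++ = htonl(43);    /* stand-in opcode */
        *p++ = htonl(1);     /* stand-in sequence id */
        *p++ = htonl(len);   /* machine name, encoded inline */
        memcpy(p, name, len);
        p += (len + 3) >> 2;
        printf("reserved %u bytes, wrote %td words\n", total, p - buf);
        return 0;
    }
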
@@ -1629,9 +1587,9 @@ static void encode_destroy_session(struct xdr_stream *xdr, | |||
1629 | struct compound_hdr *hdr) | 1587 | struct compound_hdr *hdr) |
1630 | { | 1588 | { |
1631 | __be32 *p; | 1589 | __be32 *p; |
1632 | RESERVE_SPACE(4 + NFS4_MAX_SESSIONID_LEN); | 1590 | p = reserve_space(xdr, 4 + NFS4_MAX_SESSIONID_LEN); |
1633 | WRITE32(OP_DESTROY_SESSION); | 1591 | *p++ = cpu_to_be32(OP_DESTROY_SESSION); |
1634 | WRITEMEM(session->sess_id.data, NFS4_MAX_SESSIONID_LEN); | 1592 | xdr_encode_opaque_fixed(p, session->sess_id.data, NFS4_MAX_SESSIONID_LEN); |
1635 | hdr->nops++; | 1593 | hdr->nops++; |
1636 | hdr->replen += decode_destroy_session_maxsz; | 1594 | hdr->replen += decode_destroy_session_maxsz; |
1637 | } | 1595 | } |
@@ -1655,8 +1613,8 @@ static void encode_sequence(struct xdr_stream *xdr, | |||
1655 | WARN_ON(args->sa_slotid == NFS4_MAX_SLOT_TABLE); | 1613 | WARN_ON(args->sa_slotid == NFS4_MAX_SLOT_TABLE); |
1656 | slot = tp->slots + args->sa_slotid; | 1614 | slot = tp->slots + args->sa_slotid; |
1657 | 1615 | ||
1658 | RESERVE_SPACE(4); | 1616 | p = reserve_space(xdr, 4 + NFS4_MAX_SESSIONID_LEN + 16); |
1659 | WRITE32(OP_SEQUENCE); | 1617 | *p++ = cpu_to_be32(OP_SEQUENCE); |
1660 | 1618 | ||
1661 | /* | 1619 | /* |
1662 | * Sessionid + seqid + slotid + max slotid + cache_this | 1620 | * Sessionid + seqid + slotid + max slotid + cache_this |
@@ -1670,12 +1628,11 @@ static void encode_sequence(struct xdr_stream *xdr, | |||
1670 | ((u32 *)session->sess_id.data)[3], | 1628 | ((u32 *)session->sess_id.data)[3], |
1671 | slot->seq_nr, args->sa_slotid, | 1629 | slot->seq_nr, args->sa_slotid, |
1672 | tp->highest_used_slotid, args->sa_cache_this); | 1630 | tp->highest_used_slotid, args->sa_cache_this); |
1673 | RESERVE_SPACE(NFS4_MAX_SESSIONID_LEN + 16); | 1631 | p = xdr_encode_opaque_fixed(p, session->sess_id.data, NFS4_MAX_SESSIONID_LEN); |
1674 | WRITEMEM(session->sess_id.data, NFS4_MAX_SESSIONID_LEN); | 1632 | *p++ = cpu_to_be32(slot->seq_nr); |
1675 | WRITE32(slot->seq_nr); | 1633 | *p++ = cpu_to_be32(args->sa_slotid); |
1676 | WRITE32(args->sa_slotid); | 1634 | *p++ = cpu_to_be32(tp->highest_used_slotid); |
1677 | WRITE32(tp->highest_used_slotid); | 1635 | *p = cpu_to_be32(args->sa_cache_this); |
1678 | WRITE32(args->sa_cache_this); | ||
1679 | hdr->nops++; | 1636 | hdr->nops++; |
1680 | hdr->replen += decode_sequence_maxsz; | 1637 | hdr->replen += decode_sequence_maxsz; |
1681 | #endif /* CONFIG_NFS_V4_1 */ | 1638 | #endif /* CONFIG_NFS_V4_1 */ |
@@ -2466,68 +2423,53 @@ static int nfs4_xdr_enc_get_lease_time(struct rpc_rqst *req, uint32_t *p, | |||
2466 | } | 2423 | } |
2467 | #endif /* CONFIG_NFS_V4_1 */ | 2424 | #endif /* CONFIG_NFS_V4_1 */ |
2468 | 2425 | ||
2469 | /* | 2426 | static void print_overflow_msg(const char *func, const struct xdr_stream *xdr) |
2470 | * START OF "GENERIC" DECODE ROUTINES. | 2427 | { |
2471 | * These may look a little ugly since they are imported from a "generic" | 2428 | dprintk("nfs: %s: prematurely hit end of receive buffer. " |
2472 | * set of XDR encode/decode routines which are intended to be shared by | 2429 | "Remaining buffer length is %tu words.\n", |
2473 | * all of our NFSv4 implementations (OpenBSD, MacOS X...). | 2430 | func, xdr->end - xdr->p); |
2474 | * | 2431 | } |
2475 | * If the pain of reading these is too great, it should be a straightforward | ||
2476 | * task to translate them into Linux-specific versions which are more | ||
2477 | * consistent with the style used in NFSv2/v3... | ||
2478 | */ | ||
2479 | #define READ32(x) (x) = ntohl(*p++) | ||
2480 | #define READ64(x) do { \ | ||
2481 | (x) = (u64)ntohl(*p++) << 32; \ | ||
2482 | (x) |= ntohl(*p++); \ | ||
2483 | } while (0) | ||
2484 | #define READTIME(x) do { \ | ||
2485 | p++; \ | ||
2486 | (x.tv_sec) = ntohl(*p++); \ | ||
2487 | (x.tv_nsec) = ntohl(*p++); \ | ||
2488 | } while (0) | ||
2489 | #define COPYMEM(x,nbytes) do { \ | ||
2490 | memcpy((x), p, nbytes); \ | ||
2491 | p += XDR_QUADLEN(nbytes); \ | ||
2492 | } while (0) | ||
2493 | |||
2494 | #define READ_BUF(nbytes) do { \ | ||
2495 | p = xdr_inline_decode(xdr, nbytes); \ | ||
2496 | if (unlikely(!p)) { \ | ||
2497 | dprintk("nfs: %s: prematurely hit end of receive" \ | ||
2498 | " buffer\n", __func__); \ | ||
2499 | dprintk("nfs: %s: xdr->p=%p, bytes=%u, xdr->end=%p\n", \ | ||
2500 | __func__, xdr->p, nbytes, xdr->end); \ | ||
2501 | return -EIO; \ | ||
2502 | } \ | ||
2503 | } while (0) | ||
2504 | 2432 | ||
2505 | static int decode_opaque_inline(struct xdr_stream *xdr, unsigned int *len, char **string) | 2433 | static int decode_opaque_inline(struct xdr_stream *xdr, unsigned int *len, char **string) |
2506 | { | 2434 | { |
2507 | __be32 *p; | 2435 | __be32 *p; |
2508 | 2436 | ||
2509 | READ_BUF(4); | 2437 | p = xdr_inline_decode(xdr, 4); |
2510 | READ32(*len); | 2438 | if (unlikely(!p)) |
2511 | READ_BUF(*len); | 2439 | goto out_overflow; |
2440 | *len = be32_to_cpup(p); | ||
2441 | p = xdr_inline_decode(xdr, *len); | ||
2442 | if (unlikely(!p)) | ||
2443 | goto out_overflow; | ||
2512 | *string = (char *)p; | 2444 | *string = (char *)p; |
2513 | return 0; | 2445 | return 0; |
2446 | out_overflow: | ||
2447 | print_overflow_msg(__func__, xdr); | ||
2448 | return -EIO; | ||
2514 | } | 2449 | } |
2515 | 2450 | ||
2516 | static int decode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr) | 2451 | static int decode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr) |
2517 | { | 2452 | { |
2518 | __be32 *p; | 2453 | __be32 *p; |
2519 | 2454 | ||
2520 | READ_BUF(8); | 2455 | p = xdr_inline_decode(xdr, 8); |
2521 | READ32(hdr->status); | 2456 | if (unlikely(!p)) |
2522 | READ32(hdr->taglen); | 2457 | goto out_overflow; |
2458 | hdr->status = be32_to_cpup(p++); | ||
2459 | hdr->taglen = be32_to_cpup(p); | ||
2523 | 2460 | ||
2524 | READ_BUF(hdr->taglen + 4); | 2461 | p = xdr_inline_decode(xdr, hdr->taglen + 4); |
2462 | if (unlikely(!p)) | ||
2463 | goto out_overflow; | ||
2525 | hdr->tag = (char *)p; | 2464 | hdr->tag = (char *)p; |
2526 | p += XDR_QUADLEN(hdr->taglen); | 2465 | p += XDR_QUADLEN(hdr->taglen); |
2527 | READ32(hdr->nops); | 2466 | hdr->nops = be32_to_cpup(p); |
2528 | if (unlikely(hdr->nops < 1)) | 2467 | if (unlikely(hdr->nops < 1)) |
2529 | return nfs4_stat_to_errno(hdr->status); | 2468 | return nfs4_stat_to_errno(hdr->status); |
2530 | return 0; | 2469 | return 0; |
2470 | out_overflow: | ||
2471 | print_overflow_msg(__func__, xdr); | ||
2472 | return -EIO; | ||
2531 | } | 2473 | } |
2532 | 2474 | ||
2533 | static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected) | 2475 | static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected) |
@@ -2536,18 +2478,23 @@ static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected) | |||
2536 | uint32_t opnum; | 2478 | uint32_t opnum; |
2537 | int32_t nfserr; | 2479 | int32_t nfserr; |
2538 | 2480 | ||
2539 | READ_BUF(8); | 2481 | p = xdr_inline_decode(xdr, 8); |
2540 | READ32(opnum); | 2482 | if (unlikely(!p)) |
2483 | goto out_overflow; | ||
2484 | opnum = be32_to_cpup(p++); | ||
2541 | if (opnum != expected) { | 2485 | if (opnum != expected) { |
2542 | dprintk("nfs: Server returned operation" | 2486 | dprintk("nfs: Server returned operation" |
2543 | " %d but we issued a request for %d\n", | 2487 | " %d but we issued a request for %d\n", |
2544 | opnum, expected); | 2488 | opnum, expected); |
2545 | return -EIO; | 2489 | return -EIO; |
2546 | } | 2490 | } |
2547 | READ32(nfserr); | 2491 | nfserr = be32_to_cpup(p); |
2548 | if (nfserr != NFS_OK) | 2492 | if (nfserr != NFS_OK) |
2549 | return nfs4_stat_to_errno(nfserr); | 2493 | return nfs4_stat_to_errno(nfserr); |
2550 | return 0; | 2494 | return 0; |
2495 | out_overflow: | ||
2496 | print_overflow_msg(__func__, xdr); | ||
2497 | return -EIO; | ||
2551 | } | 2498 | } |
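decode_op_hdr() keeps two failure classes apart: a wrong opcode or a truncated reply is a transport-level -EIO, while a well-formed NFS error status is translated to a local errno via nfs4_stat_to_errno(). A toy illustration of that split (the values and the mapping table are made up for the example):

#include <stdio.h>

enum { TOY_NFS_OK = 0, TOY_NFS4ERR_NOENT = 2 };

/* Stand-in for nfs4_stat_to_errno(): NFS status -> local errno. */
static int toy_stat_to_errno(int nfserr)
{
	switch (nfserr) {
	case TOY_NFS4ERR_NOENT: return -2;	/* -ENOENT */
	default:		 return -5;	/* -EIO    */
	}
}

static int toy_decode_op_hdr(int opnum, int expected, int nfserr)
{
	if (opnum != expected)
		return -5;		/* -EIO: malformed reply */
	if (nfserr != TOY_NFS_OK)
		return toy_stat_to_errno(nfserr);
	return 0;
}

int main(void)
{
	printf("%d\n", toy_decode_op_hdr(3, 3, TOY_NFS4ERR_NOENT)); /* -2 */
	printf("%d\n", toy_decode_op_hdr(4, 3, TOY_NFS_OK));        /* -5 */
	return 0;
}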
2552 | 2499 | ||
2553 | /* Dummy routine */ | 2500 | /* Dummy routine */ |
@@ -2557,8 +2504,11 @@ static int decode_ace(struct xdr_stream *xdr, void *ace, struct nfs_client *clp) | |||
2557 | unsigned int strlen; | 2504 | unsigned int strlen; |
2558 | char *str; | 2505 | char *str; |
2559 | 2506 | ||
2560 | READ_BUF(12); | 2507 | p = xdr_inline_decode(xdr, 12); |
2561 | return decode_opaque_inline(xdr, &strlen, &str); | 2508 | if (likely(p)) |
2509 | return decode_opaque_inline(xdr, &strlen, &str); | ||
2510 | print_overflow_msg(__func__, xdr); | ||
2511 | return -EIO; | ||
2562 | } | 2512 | } |
2563 | 2513 | ||
2564 | static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap) | 2514 | static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap) |
@@ -2566,27 +2516,39 @@ static int decode_attr_bitmap(struct xdr_stream *xdr, uint32_t *bitmap) | |||
2566 | uint32_t bmlen; | 2516 | uint32_t bmlen; |
2567 | __be32 *p; | 2517 | __be32 *p; |
2568 | 2518 | ||
2569 | READ_BUF(4); | 2519 | p = xdr_inline_decode(xdr, 4); |
2570 | READ32(bmlen); | 2520 | if (unlikely(!p)) |
2521 | goto out_overflow; | ||
2522 | bmlen = be32_to_cpup(p); | ||
2571 | 2523 | ||
2572 | bitmap[0] = bitmap[1] = 0; | 2524 | bitmap[0] = bitmap[1] = 0; |
2573 | READ_BUF((bmlen << 2)); | 2525 | p = xdr_inline_decode(xdr, (bmlen << 2)); |
2526 | if (unlikely(!p)) | ||
2527 | goto out_overflow; | ||
2574 | if (bmlen > 0) { | 2528 | if (bmlen > 0) { |
2575 | READ32(bitmap[0]); | 2529 | bitmap[0] = be32_to_cpup(p++); |
2576 | if (bmlen > 1) | 2530 | if (bmlen > 1) |
2577 | READ32(bitmap[1]); | 2531 | bitmap[1] = be32_to_cpup(p); |
2578 | } | 2532 | } |
2579 | return 0; | 2533 | return 0; |
2534 | out_overflow: | ||
2535 | print_overflow_msg(__func__, xdr); | ||
2536 | return -EIO; | ||
2580 | } | 2537 | } |
2581 | 2538 | ||
2582 | static inline int decode_attr_length(struct xdr_stream *xdr, uint32_t *attrlen, __be32 **savep) | 2539 | static inline int decode_attr_length(struct xdr_stream *xdr, uint32_t *attrlen, __be32 **savep) |
2583 | { | 2540 | { |
2584 | __be32 *p; | 2541 | __be32 *p; |
2585 | 2542 | ||
2586 | READ_BUF(4); | 2543 | p = xdr_inline_decode(xdr, 4); |
2587 | READ32(*attrlen); | 2544 | if (unlikely(!p)) |
2545 | goto out_overflow; | ||
2546 | *attrlen = be32_to_cpup(p); | ||
2588 | *savep = xdr->p; | 2547 | *savep = xdr->p; |
2589 | return 0; | 2548 | return 0; |
2549 | out_overflow: | ||
2550 | print_overflow_msg(__func__, xdr); | ||
2551 | return -EIO; | ||
2590 | } | 2552 | } |
2591 | 2553 | ||
2592 | static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *bitmask) | 2554 | static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *bitmask) |
@@ -2609,8 +2571,10 @@ static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t * | |||
2609 | if (unlikely(bitmap[0] & (FATTR4_WORD0_TYPE - 1U))) | 2571 | if (unlikely(bitmap[0] & (FATTR4_WORD0_TYPE - 1U))) |
2610 | return -EIO; | 2572 | return -EIO; |
2611 | if (likely(bitmap[0] & FATTR4_WORD0_TYPE)) { | 2573 | if (likely(bitmap[0] & FATTR4_WORD0_TYPE)) { |
2612 | READ_BUF(4); | 2574 | p = xdr_inline_decode(xdr, 4); |
2613 | READ32(*type); | 2575 | if (unlikely(!p)) |
2576 | goto out_overflow; | ||
2577 | *type = be32_to_cpup(p); | ||
2614 | if (*type < NF4REG || *type > NF4NAMEDATTR) { | 2578 | if (*type < NF4REG || *type > NF4NAMEDATTR) { |
2615 | dprintk("%s: bad type %d\n", __func__, *type); | 2579 | dprintk("%s: bad type %d\n", __func__, *type); |
2616 | return -EIO; | 2580 | return -EIO; |
@@ -2620,6 +2584,9 @@ static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t * | |||
2620 | } | 2584 | } |
2621 | dprintk("%s: type=0%o\n", __func__, nfs_type2fmt[*type]); | 2585 | dprintk("%s: type=0%o\n", __func__, nfs_type2fmt[*type]); |
2622 | return ret; | 2586 | return ret; |
2587 | out_overflow: | ||
2588 | print_overflow_msg(__func__, xdr); | ||
2589 | return -EIO; | ||
2623 | } | 2590 | } |
2624 | 2591 | ||
2625 | static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *change) | 2592 | static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *change) |
@@ -2631,14 +2598,19 @@ static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t | |||
2631 | if (unlikely(bitmap[0] & (FATTR4_WORD0_CHANGE - 1U))) | 2598 | if (unlikely(bitmap[0] & (FATTR4_WORD0_CHANGE - 1U))) |
2632 | return -EIO; | 2599 | return -EIO; |
2633 | if (likely(bitmap[0] & FATTR4_WORD0_CHANGE)) { | 2600 | if (likely(bitmap[0] & FATTR4_WORD0_CHANGE)) { |
2634 | READ_BUF(8); | 2601 | p = xdr_inline_decode(xdr, 8); |
2635 | READ64(*change); | 2602 | if (unlikely(!p)) |
2603 | goto out_overflow; | ||
2604 | xdr_decode_hyper(p, change); | ||
2636 | bitmap[0] &= ~FATTR4_WORD0_CHANGE; | 2605 | bitmap[0] &= ~FATTR4_WORD0_CHANGE; |
2637 | ret = NFS_ATTR_FATTR_CHANGE; | 2606 | ret = NFS_ATTR_FATTR_CHANGE; |
2638 | } | 2607 | } |
2639 | dprintk("%s: change attribute=%Lu\n", __func__, | 2608 | dprintk("%s: change attribute=%Lu\n", __func__, |
2640 | (unsigned long long)*change); | 2609 | (unsigned long long)*change); |
2641 | return ret; | 2610 | return ret; |
2611 | out_overflow: | ||
2612 | print_overflow_msg(__func__, xdr); | ||
2613 | return -EIO; | ||
2642 | } | 2614 | } |
2643 | 2615 | ||
2644 | static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *size) | 2616 | static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *size) |
@@ -2650,13 +2622,18 @@ static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t * | |||
2650 | if (unlikely(bitmap[0] & (FATTR4_WORD0_SIZE - 1U))) | 2622 | if (unlikely(bitmap[0] & (FATTR4_WORD0_SIZE - 1U))) |
2651 | return -EIO; | 2623 | return -EIO; |
2652 | if (likely(bitmap[0] & FATTR4_WORD0_SIZE)) { | 2624 | if (likely(bitmap[0] & FATTR4_WORD0_SIZE)) { |
2653 | READ_BUF(8); | 2625 | p = xdr_inline_decode(xdr, 8); |
2654 | READ64(*size); | 2626 | if (unlikely(!p)) |
2627 | goto out_overflow; | ||
2628 | xdr_decode_hyper(p, size); | ||
2655 | bitmap[0] &= ~FATTR4_WORD0_SIZE; | 2629 | bitmap[0] &= ~FATTR4_WORD0_SIZE; |
2656 | ret = NFS_ATTR_FATTR_SIZE; | 2630 | ret = NFS_ATTR_FATTR_SIZE; |
2657 | } | 2631 | } |
2658 | dprintk("%s: file size=%Lu\n", __func__, (unsigned long long)*size); | 2632 | dprintk("%s: file size=%Lu\n", __func__, (unsigned long long)*size); |
2659 | return ret; | 2633 | return ret; |
2634 | out_overflow: | ||
2635 | print_overflow_msg(__func__, xdr); | ||
2636 | return -EIO; | ||
2660 | } | 2637 | } |
2661 | 2638 | ||
2662 | static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) | 2639 | static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) |
@@ -2667,12 +2644,17 @@ static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, ui | |||
2667 | if (unlikely(bitmap[0] & (FATTR4_WORD0_LINK_SUPPORT - 1U))) | 2644 | if (unlikely(bitmap[0] & (FATTR4_WORD0_LINK_SUPPORT - 1U))) |
2668 | return -EIO; | 2645 | return -EIO; |
2669 | if (likely(bitmap[0] & FATTR4_WORD0_LINK_SUPPORT)) { | 2646 | if (likely(bitmap[0] & FATTR4_WORD0_LINK_SUPPORT)) { |
2670 | READ_BUF(4); | 2647 | p = xdr_inline_decode(xdr, 4); |
2671 | READ32(*res); | 2648 | if (unlikely(!p)) |
2649 | goto out_overflow; | ||
2650 | *res = be32_to_cpup(p); | ||
2672 | bitmap[0] &= ~FATTR4_WORD0_LINK_SUPPORT; | 2651 | bitmap[0] &= ~FATTR4_WORD0_LINK_SUPPORT; |
2673 | } | 2652 | } |
2674 | dprintk("%s: link support=%s\n", __func__, *res == 0 ? "false" : "true"); | 2653 | dprintk("%s: link support=%s\n", __func__, *res == 0 ? "false" : "true"); |
2675 | return 0; | 2654 | return 0; |
2655 | out_overflow: | ||
2656 | print_overflow_msg(__func__, xdr); | ||
2657 | return -EIO; | ||
2676 | } | 2658 | } |
2677 | 2659 | ||
2678 | static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) | 2660 | static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) |
@@ -2683,12 +2665,17 @@ static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap, | |||
2683 | if (unlikely(bitmap[0] & (FATTR4_WORD0_SYMLINK_SUPPORT - 1U))) | 2665 | if (unlikely(bitmap[0] & (FATTR4_WORD0_SYMLINK_SUPPORT - 1U))) |
2684 | return -EIO; | 2666 | return -EIO; |
2685 | if (likely(bitmap[0] & FATTR4_WORD0_SYMLINK_SUPPORT)) { | 2667 | if (likely(bitmap[0] & FATTR4_WORD0_SYMLINK_SUPPORT)) { |
2686 | READ_BUF(4); | 2668 | p = xdr_inline_decode(xdr, 4); |
2687 | READ32(*res); | 2669 | if (unlikely(!p)) |
2670 | goto out_overflow; | ||
2671 | *res = be32_to_cpup(p); | ||
2688 | bitmap[0] &= ~FATTR4_WORD0_SYMLINK_SUPPORT; | 2672 | bitmap[0] &= ~FATTR4_WORD0_SYMLINK_SUPPORT; |
2689 | } | 2673 | } |
2690 | dprintk("%s: symlink support=%s\n", __func__, *res == 0 ? "false" : "true"); | 2674 | dprintk("%s: symlink support=%s\n", __func__, *res == 0 ? "false" : "true"); |
2691 | return 0; | 2675 | return 0; |
2676 | out_overflow: | ||
2677 | print_overflow_msg(__func__, xdr); | ||
2678 | return -EIO; | ||
2692 | } | 2679 | } |
2693 | 2680 | ||
2694 | static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fsid *fsid) | 2681 | static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fsid *fsid) |
@@ -2701,9 +2688,11 @@ static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs | |||
2701 | if (unlikely(bitmap[0] & (FATTR4_WORD0_FSID - 1U))) | 2688 | if (unlikely(bitmap[0] & (FATTR4_WORD0_FSID - 1U))) |
2702 | return -EIO; | 2689 | return -EIO; |
2703 | if (likely(bitmap[0] & FATTR4_WORD0_FSID)) { | 2690 | if (likely(bitmap[0] & FATTR4_WORD0_FSID)) { |
2704 | READ_BUF(16); | 2691 | p = xdr_inline_decode(xdr, 16); |
2705 | READ64(fsid->major); | 2692 | if (unlikely(!p)) |
2706 | READ64(fsid->minor); | 2693 | goto out_overflow; |
2694 | p = xdr_decode_hyper(p, &fsid->major); | ||
2695 | xdr_decode_hyper(p, &fsid->minor); | ||
2707 | bitmap[0] &= ~FATTR4_WORD0_FSID; | 2696 | bitmap[0] &= ~FATTR4_WORD0_FSID; |
2708 | ret = NFS_ATTR_FATTR_FSID; | 2697 | ret = NFS_ATTR_FATTR_FSID; |
2709 | } | 2698 | } |
@@ -2711,6 +2700,9 @@ static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs | |||
2711 | (unsigned long long)fsid->major, | 2700 | (unsigned long long)fsid->major, |
2712 | (unsigned long long)fsid->minor); | 2701 | (unsigned long long)fsid->minor); |
2713 | return ret; | 2702 | return ret; |
2703 | out_overflow: | ||
2704 | print_overflow_msg(__func__, xdr); | ||
2705 | return -EIO; | ||
2714 | } | 2706 | } |
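The 64-bit attributes switch from the READ64 macro to xdr_decode_hyper(), which assembles a big-endian XDR hyper from two 32-bit words and returns the advanced pointer, so consecutive fields chain naturally, as in decode_attr_fsid() just above. A sketch of the idea (toy_decode_hyper models the sunrpc helper; it is not the kernel function):

#include <stdint.h>
#include <stdio.h>
#include <arpa/inet.h>

/* An XDR "hyper" is two big-endian 32-bit words, high word first.
 * Returning the advanced pointer lets callers chain decodes. */
static uint32_t *toy_decode_hyper(uint32_t *p, uint64_t *v)
{
	*v = (uint64_t)ntohl(*p++) << 32;
	*v |= ntohl(*p++);
	return p;
}

int main(void)
{
	uint32_t buf[4] = { htonl(0x1), htonl(0x2), htonl(0x3), htonl(0x4) };
	uint64_t major, minor;
	uint32_t *p = buf;

	p = toy_decode_hyper(p, &major);	/* 0x100000002 */
	toy_decode_hyper(p, &minor);		/* 0x300000004 */
	printf("%llx %llx\n", (unsigned long long)major,
	       (unsigned long long)minor);
	return 0;
}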
2715 | 2707 | ||
2716 | static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) | 2708 | static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) |
@@ -2721,12 +2713,17 @@ static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint | |||
2721 | if (unlikely(bitmap[0] & (FATTR4_WORD0_LEASE_TIME - 1U))) | 2713 | if (unlikely(bitmap[0] & (FATTR4_WORD0_LEASE_TIME - 1U))) |
2722 | return -EIO; | 2714 | return -EIO; |
2723 | if (likely(bitmap[0] & FATTR4_WORD0_LEASE_TIME)) { | 2715 | if (likely(bitmap[0] & FATTR4_WORD0_LEASE_TIME)) { |
2724 | READ_BUF(4); | 2716 | p = xdr_inline_decode(xdr, 4); |
2725 | READ32(*res); | 2717 | if (unlikely(!p)) |
2718 | goto out_overflow; | ||
2719 | *res = be32_to_cpup(p); | ||
2726 | bitmap[0] &= ~FATTR4_WORD0_LEASE_TIME; | 2720 | bitmap[0] &= ~FATTR4_WORD0_LEASE_TIME; |
2727 | } | 2721 | } |
2728 | dprintk("%s: lease time=%u\n", __func__, (unsigned int)*res); | 2722 | dprintk("%s: lease time=%u\n", __func__, (unsigned int)*res); |
2729 | return 0; | 2723 | return 0; |
2724 | out_overflow: | ||
2725 | print_overflow_msg(__func__, xdr); | ||
2726 | return -EIO; | ||
2730 | } | 2727 | } |
2731 | 2728 | ||
2732 | static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) | 2729 | static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) |
@@ -2737,12 +2734,17 @@ static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint | |||
2737 | if (unlikely(bitmap[0] & (FATTR4_WORD0_ACLSUPPORT - 1U))) | 2734 | if (unlikely(bitmap[0] & (FATTR4_WORD0_ACLSUPPORT - 1U))) |
2738 | return -EIO; | 2735 | return -EIO; |
2739 | if (likely(bitmap[0] & FATTR4_WORD0_ACLSUPPORT)) { | 2736 | if (likely(bitmap[0] & FATTR4_WORD0_ACLSUPPORT)) { |
2740 | READ_BUF(4); | 2737 | p = xdr_inline_decode(xdr, 4); |
2741 | READ32(*res); | 2738 | if (unlikely(!p)) |
2739 | goto out_overflow; | ||
2740 | *res = be32_to_cpup(p); | ||
2742 | bitmap[0] &= ~FATTR4_WORD0_ACLSUPPORT; | 2741 | bitmap[0] &= ~FATTR4_WORD0_ACLSUPPORT; |
2743 | } | 2742 | } |
2744 | dprintk("%s: ACLs supported=%u\n", __func__, (unsigned int)*res); | 2743 | dprintk("%s: ACLs supported=%u\n", __func__, (unsigned int)*res); |
2745 | return 0; | 2744 | return 0; |
2745 | out_overflow: | ||
2746 | print_overflow_msg(__func__, xdr); | ||
2747 | return -EIO; | ||
2746 | } | 2748 | } |
2747 | 2749 | ||
2748 | static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid) | 2750 | static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid) |
@@ -2754,13 +2756,18 @@ static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t | |||
2754 | if (unlikely(bitmap[0] & (FATTR4_WORD0_FILEID - 1U))) | 2756 | if (unlikely(bitmap[0] & (FATTR4_WORD0_FILEID - 1U))) |
2755 | return -EIO; | 2757 | return -EIO; |
2756 | if (likely(bitmap[0] & FATTR4_WORD0_FILEID)) { | 2758 | if (likely(bitmap[0] & FATTR4_WORD0_FILEID)) { |
2757 | READ_BUF(8); | 2759 | p = xdr_inline_decode(xdr, 8); |
2758 | READ64(*fileid); | 2760 | if (unlikely(!p)) |
2761 | goto out_overflow; | ||
2762 | xdr_decode_hyper(p, fileid); | ||
2759 | bitmap[0] &= ~FATTR4_WORD0_FILEID; | 2763 | bitmap[0] &= ~FATTR4_WORD0_FILEID; |
2760 | ret = NFS_ATTR_FATTR_FILEID; | 2764 | ret = NFS_ATTR_FATTR_FILEID; |
2761 | } | 2765 | } |
2762 | dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid); | 2766 | dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid); |
2763 | return ret; | 2767 | return ret; |
2768 | out_overflow: | ||
2769 | print_overflow_msg(__func__, xdr); | ||
2770 | return -EIO; | ||
2764 | } | 2771 | } |
2765 | 2772 | ||
2766 | static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid) | 2773 | static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid) |
@@ -2772,13 +2779,18 @@ static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitma | |||
2772 | if (unlikely(bitmap[1] & (FATTR4_WORD1_MOUNTED_ON_FILEID - 1U))) | 2779 | if (unlikely(bitmap[1] & (FATTR4_WORD1_MOUNTED_ON_FILEID - 1U))) |
2773 | return -EIO; | 2780 | return -EIO; |
2774 | if (likely(bitmap[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)) { | 2781 | if (likely(bitmap[1] & FATTR4_WORD1_MOUNTED_ON_FILEID)) { |
2775 | READ_BUF(8); | 2782 | p = xdr_inline_decode(xdr, 8); |
2776 | READ64(*fileid); | 2783 | if (unlikely(!p)) |
2784 | goto out_overflow; | ||
2785 | xdr_decode_hyper(p, fileid); | ||
2777 | bitmap[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID; | 2786 | bitmap[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID; |
2778 | ret = NFS_ATTR_FATTR_FILEID; | 2787 | ret = NFS_ATTR_FATTR_FILEID; |
2779 | } | 2788 | } |
2780 | dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid); | 2789 | dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid); |
2781 | return ret; | 2790 | return ret; |
2791 | out_overflow: | ||
2792 | print_overflow_msg(__func__, xdr); | ||
2793 | return -EIO; | ||
2782 | } | 2794 | } |
2783 | 2795 | ||
2784 | static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) | 2796 | static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) |
@@ -2790,12 +2802,17 @@ static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uin | |||
2790 | if (unlikely(bitmap[0] & (FATTR4_WORD0_FILES_AVAIL - 1U))) | 2802 | if (unlikely(bitmap[0] & (FATTR4_WORD0_FILES_AVAIL - 1U))) |
2791 | return -EIO; | 2803 | return -EIO; |
2792 | if (likely(bitmap[0] & FATTR4_WORD0_FILES_AVAIL)) { | 2804 | if (likely(bitmap[0] & FATTR4_WORD0_FILES_AVAIL)) { |
2793 | READ_BUF(8); | 2805 | p = xdr_inline_decode(xdr, 8); |
2794 | READ64(*res); | 2806 | if (unlikely(!p)) |
2807 | goto out_overflow; | ||
2808 | xdr_decode_hyper(p, res); | ||
2795 | bitmap[0] &= ~FATTR4_WORD0_FILES_AVAIL; | 2809 | bitmap[0] &= ~FATTR4_WORD0_FILES_AVAIL; |
2796 | } | 2810 | } |
2797 | dprintk("%s: files avail=%Lu\n", __func__, (unsigned long long)*res); | 2811 | dprintk("%s: files avail=%Lu\n", __func__, (unsigned long long)*res); |
2798 | return status; | 2812 | return status; |
2813 | out_overflow: | ||
2814 | print_overflow_msg(__func__, xdr); | ||
2815 | return -EIO; | ||
2799 | } | 2816 | } |
2800 | 2817 | ||
2801 | static int decode_attr_files_free(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) | 2818 | static int decode_attr_files_free(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) |
@@ -2807,12 +2824,17 @@ static int decode_attr_files_free(struct xdr_stream *xdr, uint32_t *bitmap, uint | |||
2807 | if (unlikely(bitmap[0] & (FATTR4_WORD0_FILES_FREE - 1U))) | 2824 | if (unlikely(bitmap[0] & (FATTR4_WORD0_FILES_FREE - 1U))) |
2808 | return -EIO; | 2825 | return -EIO; |
2809 | if (likely(bitmap[0] & FATTR4_WORD0_FILES_FREE)) { | 2826 | if (likely(bitmap[0] & FATTR4_WORD0_FILES_FREE)) { |
2810 | READ_BUF(8); | 2827 | p = xdr_inline_decode(xdr, 8); |
2811 | READ64(*res); | 2828 | if (unlikely(!p)) |
2829 | goto out_overflow; | ||
2830 | xdr_decode_hyper(p, res); | ||
2812 | bitmap[0] &= ~FATTR4_WORD0_FILES_FREE; | 2831 | bitmap[0] &= ~FATTR4_WORD0_FILES_FREE; |
2813 | } | 2832 | } |
2814 | dprintk("%s: files free=%Lu\n", __func__, (unsigned long long)*res); | 2833 | dprintk("%s: files free=%Lu\n", __func__, (unsigned long long)*res); |
2815 | return status; | 2834 | return status; |
2835 | out_overflow: | ||
2836 | print_overflow_msg(__func__, xdr); | ||
2837 | return -EIO; | ||
2816 | } | 2838 | } |
2817 | 2839 | ||
2818 | static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) | 2840 | static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) |
@@ -2824,12 +2846,17 @@ static int decode_attr_files_total(struct xdr_stream *xdr, uint32_t *bitmap, uin | |||
2824 | if (unlikely(bitmap[0] & (FATTR4_WORD0_FILES_TOTAL - 1U))) | 2846 | if (unlikely(bitmap[0] & (FATTR4_WORD0_FILES_TOTAL - 1U))) |
2825 | return -EIO; | 2847 | return -EIO; |
2826 | if (likely(bitmap[0] & FATTR4_WORD0_FILES_TOTAL)) { | 2848 | if (likely(bitmap[0] & FATTR4_WORD0_FILES_TOTAL)) { |
2827 | READ_BUF(8); | 2849 | p = xdr_inline_decode(xdr, 8); |
2828 | READ64(*res); | 2850 | if (unlikely(!p)) |
2851 | goto out_overflow; | ||
2852 | xdr_decode_hyper(p, res); | ||
2829 | bitmap[0] &= ~FATTR4_WORD0_FILES_TOTAL; | 2853 | bitmap[0] &= ~FATTR4_WORD0_FILES_TOTAL; |
2830 | } | 2854 | } |
2831 | dprintk("%s: files total=%Lu\n", __func__, (unsigned long long)*res); | 2855 | dprintk("%s: files total=%Lu\n", __func__, (unsigned long long)*res); |
2832 | return status; | 2856 | return status; |
2857 | out_overflow: | ||
2858 | print_overflow_msg(__func__, xdr); | ||
2859 | return -EIO; | ||
2833 | } | 2860 | } |
2834 | 2861 | ||
2835 | static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path) | 2862 | static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path) |
@@ -2838,8 +2865,10 @@ static int decode_pathname(struct xdr_stream *xdr, struct nfs4_pathname *path) | |||
2838 | __be32 *p; | 2865 | __be32 *p; |
2839 | int status = 0; | 2866 | int status = 0; |
2840 | 2867 | ||
2841 | READ_BUF(4); | 2868 | p = xdr_inline_decode(xdr, 4); |
2842 | READ32(n); | 2869 | if (unlikely(!p)) |
2870 | goto out_overflow; | ||
2871 | n = be32_to_cpup(p); | ||
2843 | if (n == 0) | 2872 | if (n == 0) |
2844 | goto root_path; | 2873 | goto root_path; |
2845 | dprintk("path "); | 2874 | dprintk("path "); |
@@ -2873,6 +2902,9 @@ out_eio: | |||
2873 | dprintk(" status %d", status); | 2902 | dprintk(" status %d", status); |
2874 | status = -EIO; | 2903 | status = -EIO; |
2875 | goto out; | 2904 | goto out; |
2905 | out_overflow: | ||
2906 | print_overflow_msg(__func__, xdr); | ||
2907 | return -EIO; | ||
2876 | } | 2908 | } |
2877 | 2909 | ||
2878 | static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs4_fs_locations *res) | 2910 | static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs4_fs_locations *res) |
@@ -2890,8 +2922,10 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st | |||
2890 | status = decode_pathname(xdr, &res->fs_path); | 2922 | status = decode_pathname(xdr, &res->fs_path); |
2891 | if (unlikely(status != 0)) | 2923 | if (unlikely(status != 0)) |
2892 | goto out; | 2924 | goto out; |
2893 | READ_BUF(4); | 2925 | p = xdr_inline_decode(xdr, 4); |
2894 | READ32(n); | 2926 | if (unlikely(!p)) |
2927 | goto out_overflow; | ||
2928 | n = be32_to_cpup(p); | ||
2895 | if (n <= 0) | 2929 | if (n <= 0) |
2896 | goto out_eio; | 2930 | goto out_eio; |
2897 | res->nlocations = 0; | 2931 | res->nlocations = 0; |
@@ -2899,8 +2933,10 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st | |||
2899 | u32 m; | 2933 | u32 m; |
2900 | struct nfs4_fs_location *loc = &res->locations[res->nlocations]; | 2934 | struct nfs4_fs_location *loc = &res->locations[res->nlocations]; |
2901 | 2935 | ||
2902 | READ_BUF(4); | 2936 | p = xdr_inline_decode(xdr, 4); |
2903 | READ32(m); | 2937 | if (unlikely(!p)) |
2938 | goto out_overflow; | ||
2939 | m = be32_to_cpup(p); | ||
2904 | 2940 | ||
2905 | loc->nservers = 0; | 2941 | loc->nservers = 0; |
2906 | dprintk("%s: servers ", __func__); | 2942 | dprintk("%s: servers ", __func__); |
@@ -2939,6 +2975,8 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st | |||
2939 | out: | 2975 | out: |
2940 | dprintk("%s: fs_locations done, error = %d\n", __func__, status); | 2976 | dprintk("%s: fs_locations done, error = %d\n", __func__, status); |
2941 | return status; | 2977 | return status; |
2978 | out_overflow: | ||
2979 | print_overflow_msg(__func__, xdr); | ||
2942 | out_eio: | 2980 | out_eio: |
2943 | status = -EIO; | 2981 | status = -EIO; |
2944 | goto out; | 2982 | goto out; |
@@ -2953,12 +2991,17 @@ static int decode_attr_maxfilesize(struct xdr_stream *xdr, uint32_t *bitmap, uin | |||
2953 | if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXFILESIZE - 1U))) | 2991 | if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXFILESIZE - 1U))) |
2954 | return -EIO; | 2992 | return -EIO; |
2955 | if (likely(bitmap[0] & FATTR4_WORD0_MAXFILESIZE)) { | 2993 | if (likely(bitmap[0] & FATTR4_WORD0_MAXFILESIZE)) { |
2956 | READ_BUF(8); | 2994 | p = xdr_inline_decode(xdr, 8); |
2957 | READ64(*res); | 2995 | if (unlikely(!p)) |
2996 | goto out_overflow; | ||
2997 | xdr_decode_hyper(p, res); | ||
2958 | bitmap[0] &= ~FATTR4_WORD0_MAXFILESIZE; | 2998 | bitmap[0] &= ~FATTR4_WORD0_MAXFILESIZE; |
2959 | } | 2999 | } |
2960 | dprintk("%s: maxfilesize=%Lu\n", __func__, (unsigned long long)*res); | 3000 | dprintk("%s: maxfilesize=%Lu\n", __func__, (unsigned long long)*res); |
2961 | return status; | 3001 | return status; |
3002 | out_overflow: | ||
3003 | print_overflow_msg(__func__, xdr); | ||
3004 | return -EIO; | ||
2962 | } | 3005 | } |
2963 | 3006 | ||
2964 | static int decode_attr_maxlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *maxlink) | 3007 | static int decode_attr_maxlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *maxlink) |
@@ -2970,12 +3013,17 @@ static int decode_attr_maxlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_ | |||
2970 | if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXLINK - 1U))) | 3013 | if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXLINK - 1U))) |
2971 | return -EIO; | 3014 | return -EIO; |
2972 | if (likely(bitmap[0] & FATTR4_WORD0_MAXLINK)) { | 3015 | if (likely(bitmap[0] & FATTR4_WORD0_MAXLINK)) { |
2973 | READ_BUF(4); | 3016 | p = xdr_inline_decode(xdr, 4); |
2974 | READ32(*maxlink); | 3017 | if (unlikely(!p)) |
3018 | goto out_overflow; | ||
3019 | *maxlink = be32_to_cpup(p); | ||
2975 | bitmap[0] &= ~FATTR4_WORD0_MAXLINK; | 3020 | bitmap[0] &= ~FATTR4_WORD0_MAXLINK; |
2976 | } | 3021 | } |
2977 | dprintk("%s: maxlink=%u\n", __func__, *maxlink); | 3022 | dprintk("%s: maxlink=%u\n", __func__, *maxlink); |
2978 | return status; | 3023 | return status; |
3024 | out_overflow: | ||
3025 | print_overflow_msg(__func__, xdr); | ||
3026 | return -EIO; | ||
2979 | } | 3027 | } |
2980 | 3028 | ||
2981 | static int decode_attr_maxname(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *maxname) | 3029 | static int decode_attr_maxname(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *maxname) |
@@ -2987,12 +3035,17 @@ static int decode_attr_maxname(struct xdr_stream *xdr, uint32_t *bitmap, uint32_ | |||
2987 | if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXNAME - 1U))) | 3035 | if (unlikely(bitmap[0] & (FATTR4_WORD0_MAXNAME - 1U))) |
2988 | return -EIO; | 3036 | return -EIO; |
2989 | if (likely(bitmap[0] & FATTR4_WORD0_MAXNAME)) { | 3037 | if (likely(bitmap[0] & FATTR4_WORD0_MAXNAME)) { |
2990 | READ_BUF(4); | 3038 | p = xdr_inline_decode(xdr, 4); |
2991 | READ32(*maxname); | 3039 | if (unlikely(!p)) |
3040 | goto out_overflow; | ||
3041 | *maxname = be32_to_cpup(p); | ||
2992 | bitmap[0] &= ~FATTR4_WORD0_MAXNAME; | 3042 | bitmap[0] &= ~FATTR4_WORD0_MAXNAME; |
2993 | } | 3043 | } |
2994 | dprintk("%s: maxname=%u\n", __func__, *maxname); | 3044 | dprintk("%s: maxname=%u\n", __func__, *maxname); |
2995 | return status; | 3045 | return status; |
3046 | out_overflow: | ||
3047 | print_overflow_msg(__func__, xdr); | ||
3048 | return -EIO; | ||
2996 | } | 3049 | } |
2997 | 3050 | ||
2998 | static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) | 3051 | static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) |
@@ -3005,8 +3058,10 @@ static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_ | |||
3005 | return -EIO; | 3058 | return -EIO; |
3006 | if (likely(bitmap[0] & FATTR4_WORD0_MAXREAD)) { | 3059 | if (likely(bitmap[0] & FATTR4_WORD0_MAXREAD)) { |
3007 | uint64_t maxread; | 3060 | uint64_t maxread; |
3008 | READ_BUF(8); | 3061 | p = xdr_inline_decode(xdr, 8); |
3009 | READ64(maxread); | 3062 | if (unlikely(!p)) |
3063 | goto out_overflow; | ||
3064 | xdr_decode_hyper(p, &maxread); | ||
3010 | if (maxread > 0x7FFFFFFF) | 3065 | if (maxread > 0x7FFFFFFF) |
3011 | maxread = 0x7FFFFFFF; | 3066 | maxread = 0x7FFFFFFF; |
3012 | *res = (uint32_t)maxread; | 3067 | *res = (uint32_t)maxread; |
@@ -3014,6 +3069,9 @@ static int decode_attr_maxread(struct xdr_stream *xdr, uint32_t *bitmap, uint32_ | |||
3014 | } | 3069 | } |
3015 | dprintk("%s: maxread=%lu\n", __func__, (unsigned long)*res); | 3070 | dprintk("%s: maxread=%lu\n", __func__, (unsigned long)*res); |
3016 | return status; | 3071 | return status; |
3072 | out_overflow: | ||
3073 | print_overflow_msg(__func__, xdr); | ||
3074 | return -EIO; | ||
3017 | } | 3075 | } |
3018 | 3076 | ||
3019 | static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) | 3077 | static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) |
@@ -3026,8 +3084,10 @@ static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32 | |||
3026 | return -EIO; | 3084 | return -EIO; |
3027 | if (likely(bitmap[0] & FATTR4_WORD0_MAXWRITE)) { | 3085 | if (likely(bitmap[0] & FATTR4_WORD0_MAXWRITE)) { |
3028 | uint64_t maxwrite; | 3086 | uint64_t maxwrite; |
3029 | READ_BUF(8); | 3087 | p = xdr_inline_decode(xdr, 8); |
3030 | READ64(maxwrite); | 3088 | if (unlikely(!p)) |
3089 | goto out_overflow; | ||
3090 | xdr_decode_hyper(p, &maxwrite); | ||
3031 | if (maxwrite > 0x7FFFFFFF) | 3091 | if (maxwrite > 0x7FFFFFFF) |
3032 | maxwrite = 0x7FFFFFFF; | 3092 | maxwrite = 0x7FFFFFFF; |
3033 | *res = (uint32_t)maxwrite; | 3093 | *res = (uint32_t)maxwrite; |
@@ -3035,6 +3095,9 @@ static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32 | |||
3035 | } | 3095 | } |
3036 | dprintk("%s: maxwrite=%lu\n", __func__, (unsigned long)*res); | 3096 | dprintk("%s: maxwrite=%lu\n", __func__, (unsigned long)*res); |
3037 | return status; | 3097 | return status; |
3098 | out_overflow: | ||
3099 | print_overflow_msg(__func__, xdr); | ||
3100 | return -EIO; | ||
3038 | } | 3101 | } |
3039 | 3102 | ||
3040 | static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *mode) | 3103 | static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *mode) |
@@ -3047,14 +3110,19 @@ static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *m | |||
3047 | if (unlikely(bitmap[1] & (FATTR4_WORD1_MODE - 1U))) | 3110 | if (unlikely(bitmap[1] & (FATTR4_WORD1_MODE - 1U))) |
3048 | return -EIO; | 3111 | return -EIO; |
3049 | if (likely(bitmap[1] & FATTR4_WORD1_MODE)) { | 3112 | if (likely(bitmap[1] & FATTR4_WORD1_MODE)) { |
3050 | READ_BUF(4); | 3113 | p = xdr_inline_decode(xdr, 4); |
3051 | READ32(tmp); | 3114 | if (unlikely(!p)) |
3115 | goto out_overflow; | ||
3116 | tmp = be32_to_cpup(p); | ||
3052 | *mode = tmp & ~S_IFMT; | 3117 | *mode = tmp & ~S_IFMT; |
3053 | bitmap[1] &= ~FATTR4_WORD1_MODE; | 3118 | bitmap[1] &= ~FATTR4_WORD1_MODE; |
3054 | ret = NFS_ATTR_FATTR_MODE; | 3119 | ret = NFS_ATTR_FATTR_MODE; |
3055 | } | 3120 | } |
3056 | dprintk("%s: file mode=0%o\n", __func__, (unsigned int)*mode); | 3121 | dprintk("%s: file mode=0%o\n", __func__, (unsigned int)*mode); |
3057 | return ret; | 3122 | return ret; |
3123 | out_overflow: | ||
3124 | print_overflow_msg(__func__, xdr); | ||
3125 | return -EIO; | ||
3058 | } | 3126 | } |
3059 | 3127 | ||
3060 | static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *nlink) | 3128 | static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *nlink) |
@@ -3066,16 +3134,22 @@ static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t | |||
3066 | if (unlikely(bitmap[1] & (FATTR4_WORD1_NUMLINKS - 1U))) | 3134 | if (unlikely(bitmap[1] & (FATTR4_WORD1_NUMLINKS - 1U))) |
3067 | return -EIO; | 3135 | return -EIO; |
3068 | if (likely(bitmap[1] & FATTR4_WORD1_NUMLINKS)) { | 3136 | if (likely(bitmap[1] & FATTR4_WORD1_NUMLINKS)) { |
3069 | READ_BUF(4); | 3137 | p = xdr_inline_decode(xdr, 4); |
3070 | READ32(*nlink); | 3138 | if (unlikely(!p)) |
3139 | goto out_overflow; | ||
3140 | *nlink = be32_to_cpup(p); | ||
3071 | bitmap[1] &= ~FATTR4_WORD1_NUMLINKS; | 3141 | bitmap[1] &= ~FATTR4_WORD1_NUMLINKS; |
3072 | ret = NFS_ATTR_FATTR_NLINK; | 3142 | ret = NFS_ATTR_FATTR_NLINK; |
3073 | } | 3143 | } |
3074 | dprintk("%s: nlink=%u\n", __func__, (unsigned int)*nlink); | 3144 | dprintk("%s: nlink=%u\n", __func__, (unsigned int)*nlink); |
3075 | return ret; | 3145 | return ret; |
3146 | out_overflow: | ||
3147 | print_overflow_msg(__func__, xdr); | ||
3148 | return -EIO; | ||
3076 | } | 3149 | } |
3077 | 3150 | ||
3078 | static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, uint32_t *uid) | 3151 | static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, |
3152 | struct nfs_client *clp, uint32_t *uid, int may_sleep) | ||
3079 | { | 3153 | { |
3080 | uint32_t len; | 3154 | uint32_t len; |
3081 | __be32 *p; | 3155 | __be32 *p; |
@@ -3085,10 +3159,16 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nf | |||
3085 | if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER - 1U))) | 3159 | if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER - 1U))) |
3086 | return -EIO; | 3160 | return -EIO; |
3087 | if (likely(bitmap[1] & FATTR4_WORD1_OWNER)) { | 3161 | if (likely(bitmap[1] & FATTR4_WORD1_OWNER)) { |
3088 | READ_BUF(4); | 3162 | p = xdr_inline_decode(xdr, 4); |
3089 | READ32(len); | 3163 | if (unlikely(!p)) |
3090 | READ_BUF(len); | 3164 | goto out_overflow; |
3091 | if (len < XDR_MAX_NETOBJ) { | 3165 | len = be32_to_cpup(p); |
3166 | p = xdr_inline_decode(xdr, len); | ||
3167 | if (unlikely(!p)) | ||
3168 | goto out_overflow; | ||
3169 | if (!may_sleep) { | ||
3170 | /* do nothing */ | ||
3171 | } else if (len < XDR_MAX_NETOBJ) { | ||
3092 | if (nfs_map_name_to_uid(clp, (char *)p, len, uid) == 0) | 3172 | if (nfs_map_name_to_uid(clp, (char *)p, len, uid) == 0) |
3093 | ret = NFS_ATTR_FATTR_OWNER; | 3173 | ret = NFS_ATTR_FATTR_OWNER; |
3094 | else | 3174 | else |
@@ -3101,9 +3181,13 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nf | |||
3101 | } | 3181 | } |
3102 | dprintk("%s: uid=%d\n", __func__, (int)*uid); | 3182 | dprintk("%s: uid=%d\n", __func__, (int)*uid); |
3103 | return ret; | 3183 | return ret; |
3184 | out_overflow: | ||
3185 | print_overflow_msg(__func__, xdr); | ||
3186 | return -EIO; | ||
3104 | } | 3187 | } |
3105 | 3188 | ||
3106 | static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, uint32_t *gid) | 3189 | static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, |
3190 | struct nfs_client *clp, uint32_t *gid, int may_sleep) | ||
3107 | { | 3191 | { |
3108 | uint32_t len; | 3192 | uint32_t len; |
3109 | __be32 *p; | 3193 | __be32 *p; |
@@ -3113,10 +3197,16 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nf | |||
3113 | if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER_GROUP - 1U))) | 3197 | if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER_GROUP - 1U))) |
3114 | return -EIO; | 3198 | return -EIO; |
3115 | if (likely(bitmap[1] & FATTR4_WORD1_OWNER_GROUP)) { | 3199 | if (likely(bitmap[1] & FATTR4_WORD1_OWNER_GROUP)) { |
3116 | READ_BUF(4); | 3200 | p = xdr_inline_decode(xdr, 4); |
3117 | READ32(len); | 3201 | if (unlikely(!p)) |
3118 | READ_BUF(len); | 3202 | goto out_overflow; |
3119 | if (len < XDR_MAX_NETOBJ) { | 3203 | len = be32_to_cpup(p); |
3204 | p = xdr_inline_decode(xdr, len); | ||
3205 | if (unlikely(!p)) | ||
3206 | goto out_overflow; | ||
3207 | if (!may_sleep) { | ||
3208 | /* do nothing */ | ||
3209 | } else if (len < XDR_MAX_NETOBJ) { | ||
3120 | if (nfs_map_group_to_gid(clp, (char *)p, len, gid) == 0) | 3210 | if (nfs_map_group_to_gid(clp, (char *)p, len, gid) == 0) |
3121 | ret = NFS_ATTR_FATTR_GROUP; | 3211 | ret = NFS_ATTR_FATTR_GROUP; |
3122 | else | 3212 | else |
@@ -3129,6 +3219,9 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nf | |||
3129 | } | 3219 | } |
3130 | dprintk("%s: gid=%d\n", __func__, (int)*gid); | 3220 | dprintk("%s: gid=%d\n", __func__, (int)*gid); |
3131 | return ret; | 3221 | return ret; |
3222 | out_overflow: | ||
3223 | print_overflow_msg(__func__, xdr); | ||
3224 | return -EIO; | ||
3132 | } | 3225 | } |
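decode_attr_owner() and decode_attr_group() grow a may_sleep argument because mapping a name to a uid/gid can mean an upcall to the userspace idmapper, which may block. The owner string is still consumed from the stream either way, so the attributes that follow stay aligned; only the lookup is skipped. A toy model of that contract (the lookup function is a stub standing in for nfs_map_name_to_uid):

#include <stdio.h>
#include <string.h>

/* Stands in for the idmapper upcall, which can sleep. */
static int toy_name_to_uid(const char *name, unsigned int *uid)
{
	if (strcmp(name, "alice") == 0) {
		*uid = 1000;
		return 0;
	}
	return -1;
}

static int toy_decode_owner(const char *name, unsigned int *uid,
			    int may_sleep)
{
	int valid = 0;

	/* The name was already pulled off the wire by this point;
	 * only the potentially blocking lookup is conditional. */
	if (may_sleep && toy_name_to_uid(name, uid) == 0)
		valid = 1;	/* NFS_ATTR_FATTR_OWNER in the real code */
	return valid;
}

int main(void)
{
	unsigned int uid = 0;

	printf("%d uid=%u\n", toy_decode_owner("alice", &uid, 1), uid);
	printf("%d uid=%u\n", toy_decode_owner("alice", &uid, 0), uid);
	return 0;
}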
3133 | 3226 | ||
3134 | static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rdev) | 3227 | static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rdev) |
@@ -3143,9 +3236,11 @@ static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rde | |||
3143 | if (likely(bitmap[1] & FATTR4_WORD1_RAWDEV)) { | 3236 | if (likely(bitmap[1] & FATTR4_WORD1_RAWDEV)) { |
3144 | dev_t tmp; | 3237 | dev_t tmp; |
3145 | 3238 | ||
3146 | READ_BUF(8); | 3239 | p = xdr_inline_decode(xdr, 8); |
3147 | READ32(major); | 3240 | if (unlikely(!p)) |
3148 | READ32(minor); | 3241 | goto out_overflow; |
3242 | major = be32_to_cpup(p++); | ||
3243 | minor = be32_to_cpup(p); | ||
3149 | tmp = MKDEV(major, minor); | 3244 | tmp = MKDEV(major, minor); |
3150 | if (MAJOR(tmp) == major && MINOR(tmp) == minor) | 3245 | if (MAJOR(tmp) == major && MINOR(tmp) == minor) |
3151 | *rdev = tmp; | 3246 | *rdev = tmp; |
@@ -3154,6 +3249,9 @@ static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rde | |||
3154 | } | 3249 | } |
3155 | dprintk("%s: rdev=(0x%x:0x%x)\n", __func__, major, minor); | 3250 | dprintk("%s: rdev=(0x%x:0x%x)\n", __func__, major, minor); |
3156 | return ret; | 3251 | return ret; |
3252 | out_overflow: | ||
3253 | print_overflow_msg(__func__, xdr); | ||
3254 | return -EIO; | ||
3157 | } | 3255 | } |
3158 | 3256 | ||
3159 | static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) | 3257 | static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) |
@@ -3165,12 +3263,17 @@ static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uin | |||
3165 | if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_AVAIL - 1U))) | 3263 | if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_AVAIL - 1U))) |
3166 | return -EIO; | 3264 | return -EIO; |
3167 | if (likely(bitmap[1] & FATTR4_WORD1_SPACE_AVAIL)) { | 3265 | if (likely(bitmap[1] & FATTR4_WORD1_SPACE_AVAIL)) { |
3168 | READ_BUF(8); | 3266 | p = xdr_inline_decode(xdr, 8); |
3169 | READ64(*res); | 3267 | if (unlikely(!p)) |
3268 | goto out_overflow; | ||
3269 | xdr_decode_hyper(p, res); | ||
3170 | bitmap[1] &= ~FATTR4_WORD1_SPACE_AVAIL; | 3270 | bitmap[1] &= ~FATTR4_WORD1_SPACE_AVAIL; |
3171 | } | 3271 | } |
3172 | dprintk("%s: space avail=%Lu\n", __func__, (unsigned long long)*res); | 3272 | dprintk("%s: space avail=%Lu\n", __func__, (unsigned long long)*res); |
3173 | return status; | 3273 | return status; |
3274 | out_overflow: | ||
3275 | print_overflow_msg(__func__, xdr); | ||
3276 | return -EIO; | ||
3174 | } | 3277 | } |
3175 | 3278 | ||
3176 | static int decode_attr_space_free(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) | 3279 | static int decode_attr_space_free(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) |
@@ -3182,12 +3285,17 @@ static int decode_attr_space_free(struct xdr_stream *xdr, uint32_t *bitmap, uint | |||
3182 | if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_FREE - 1U))) | 3285 | if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_FREE - 1U))) |
3183 | return -EIO; | 3286 | return -EIO; |
3184 | if (likely(bitmap[1] & FATTR4_WORD1_SPACE_FREE)) { | 3287 | if (likely(bitmap[1] & FATTR4_WORD1_SPACE_FREE)) { |
3185 | READ_BUF(8); | 3288 | p = xdr_inline_decode(xdr, 8); |
3186 | READ64(*res); | 3289 | if (unlikely(!p)) |
3290 | goto out_overflow; | ||
3291 | xdr_decode_hyper(p, res); | ||
3187 | bitmap[1] &= ~FATTR4_WORD1_SPACE_FREE; | 3292 | bitmap[1] &= ~FATTR4_WORD1_SPACE_FREE; |
3188 | } | 3293 | } |
3189 | dprintk("%s: space free=%Lu\n", __func__, (unsigned long long)*res); | 3294 | dprintk("%s: space free=%Lu\n", __func__, (unsigned long long)*res); |
3190 | return status; | 3295 | return status; |
3296 | out_overflow: | ||
3297 | print_overflow_msg(__func__, xdr); | ||
3298 | return -EIO; | ||
3191 | } | 3299 | } |
3192 | 3300 | ||
3193 | static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) | 3301 | static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res) |
@@ -3199,12 +3307,17 @@ static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uin | |||
3199 | if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_TOTAL - 1U))) | 3307 | if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_TOTAL - 1U))) |
3200 | return -EIO; | 3308 | return -EIO; |
3201 | if (likely(bitmap[1] & FATTR4_WORD1_SPACE_TOTAL)) { | 3309 | if (likely(bitmap[1] & FATTR4_WORD1_SPACE_TOTAL)) { |
3202 | READ_BUF(8); | 3310 | p = xdr_inline_decode(xdr, 8); |
3203 | READ64(*res); | 3311 | if (unlikely(!p)) |
3312 | goto out_overflow; | ||
3313 | xdr_decode_hyper(p, res); | ||
3204 | bitmap[1] &= ~FATTR4_WORD1_SPACE_TOTAL; | 3314 | bitmap[1] &= ~FATTR4_WORD1_SPACE_TOTAL; |
3205 | } | 3315 | } |
3206 | dprintk("%s: space total=%Lu\n", __func__, (unsigned long long)*res); | 3316 | dprintk("%s: space total=%Lu\n", __func__, (unsigned long long)*res); |
3207 | return status; | 3317 | return status; |
3318 | out_overflow: | ||
3319 | print_overflow_msg(__func__, xdr); | ||
3320 | return -EIO; | ||
3208 | } | 3321 | } |
3209 | 3322 | ||
3210 | static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *used) | 3323 | static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *used) |
@@ -3216,14 +3329,19 @@ static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint | |||
3216 | if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_USED - 1U))) | 3329 | if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_USED - 1U))) |
3217 | return -EIO; | 3330 | return -EIO; |
3218 | if (likely(bitmap[1] & FATTR4_WORD1_SPACE_USED)) { | 3331 | if (likely(bitmap[1] & FATTR4_WORD1_SPACE_USED)) { |
3219 | READ_BUF(8); | 3332 | p = xdr_inline_decode(xdr, 8); |
3220 | READ64(*used); | 3333 | if (unlikely(!p)) |
3334 | goto out_overflow; | ||
3335 | xdr_decode_hyper(p, used); | ||
3221 | bitmap[1] &= ~FATTR4_WORD1_SPACE_USED; | 3336 | bitmap[1] &= ~FATTR4_WORD1_SPACE_USED; |
3222 | ret = NFS_ATTR_FATTR_SPACE_USED; | 3337 | ret = NFS_ATTR_FATTR_SPACE_USED; |
3223 | } | 3338 | } |
3224 | dprintk("%s: space used=%Lu\n", __func__, | 3339 | dprintk("%s: space used=%Lu\n", __func__, |
3225 | (unsigned long long)*used); | 3340 | (unsigned long long)*used); |
3226 | return ret; | 3341 | return ret; |
3342 | out_overflow: | ||
3343 | print_overflow_msg(__func__, xdr); | ||
3344 | return -EIO; | ||
3227 | } | 3345 | } |
3228 | 3346 | ||
3229 | static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time) | 3347 | static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time) |
@@ -3232,12 +3350,17 @@ static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time) | |||
3232 | uint64_t sec; | 3350 | uint64_t sec; |
3233 | uint32_t nsec; | 3351 | uint32_t nsec; |
3234 | 3352 | ||
3235 | READ_BUF(12); | 3353 | p = xdr_inline_decode(xdr, 12); |
3236 | READ64(sec); | 3354 | if (unlikely(!p)) |
3237 | READ32(nsec); | 3355 | goto out_overflow; |
3356 | p = xdr_decode_hyper(p, &sec); | ||
3357 | nsec = be32_to_cpup(p); | ||
3238 | time->tv_sec = (time_t)sec; | 3358 | time->tv_sec = (time_t)sec; |
3239 | time->tv_nsec = (long)nsec; | 3359 | time->tv_nsec = (long)nsec; |
3240 | return 0; | 3360 | return 0; |
3361 | out_overflow: | ||
3362 | print_overflow_msg(__func__, xdr); | ||
3363 | return -EIO; | ||
3241 | } | 3364 | } |
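decode_attr_time() reads 12 bytes because an nfstime4 on the wire is a 64-bit seconds field followed by a 32-bit nanoseconds field, both big-endian. A small standalone decode of that layout (the struct and function names are illustrative):

#include <stdint.h>
#include <stdio.h>
#include <arpa/inet.h>

struct toy_timespec { long long tv_sec; long tv_nsec; };

/* 12 wire bytes: 8-byte seconds (hyper), then 4-byte nseconds. */
static void toy_decode_time(const uint32_t *p, struct toy_timespec *ts)
{
	uint64_t sec = (uint64_t)ntohl(p[0]) << 32 | ntohl(p[1]);

	ts->tv_sec = (long long)sec;
	ts->tv_nsec = (long)ntohl(p[2]);
}

int main(void)
{
	uint32_t wire[3] = { htonl(0), htonl(1257000000), htonl(500) };
	struct toy_timespec ts;

	toy_decode_time(wire, &ts);
	printf("%lld.%09ld\n", ts.tv_sec, ts.tv_nsec);
	return 0;
}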
3242 | 3365 | ||
3243 | static int decode_attr_time_access(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec *time) | 3366 | static int decode_attr_time_access(struct xdr_stream *xdr, uint32_t *bitmap, struct timespec *time) |
@@ -3315,11 +3438,16 @@ static int decode_change_info(struct xdr_stream *xdr, struct nfs4_change_info *c | |||
3315 | { | 3438 | { |
3316 | __be32 *p; | 3439 | __be32 *p; |
3317 | 3440 | ||
3318 | READ_BUF(20); | 3441 | p = xdr_inline_decode(xdr, 20); |
3319 | READ32(cinfo->atomic); | 3442 | if (unlikely(!p)) |
3320 | READ64(cinfo->before); | 3443 | goto out_overflow; |
3321 | READ64(cinfo->after); | 3444 | cinfo->atomic = be32_to_cpup(p++); |
3445 | p = xdr_decode_hyper(p, &cinfo->before); | ||
3446 | xdr_decode_hyper(p, &cinfo->after); | ||
3322 | return 0; | 3447 | return 0; |
3448 | out_overflow: | ||
3449 | print_overflow_msg(__func__, xdr); | ||
3450 | return -EIO; | ||
3323 | } | 3451 | } |
3324 | 3452 | ||
3325 | static int decode_access(struct xdr_stream *xdr, struct nfs4_accessres *access) | 3453 | static int decode_access(struct xdr_stream *xdr, struct nfs4_accessres *access) |
@@ -3331,40 +3459,62 @@ static int decode_access(struct xdr_stream *xdr, struct nfs4_accessres *access) | |||
3331 | status = decode_op_hdr(xdr, OP_ACCESS); | 3459 | status = decode_op_hdr(xdr, OP_ACCESS); |
3332 | if (status) | 3460 | if (status) |
3333 | return status; | 3461 | return status; |
3334 | READ_BUF(8); | 3462 | p = xdr_inline_decode(xdr, 8); |
3335 | READ32(supp); | 3463 | if (unlikely(!p)) |
3336 | READ32(acc); | 3464 | goto out_overflow; |
3465 | supp = be32_to_cpup(p++); | ||
3466 | acc = be32_to_cpup(p); | ||
3337 | access->supported = supp; | 3467 | access->supported = supp; |
3338 | access->access = acc; | 3468 | access->access = acc; |
3339 | return 0; | 3469 | return 0; |
3470 | out_overflow: | ||
3471 | print_overflow_msg(__func__, xdr); | ||
3472 | return -EIO; | ||
3340 | } | 3473 | } |
3341 | 3474 | ||
3342 | static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res) | 3475 | static int decode_opaque_fixed(struct xdr_stream *xdr, void *buf, size_t len) |
3343 | { | 3476 | { |
3344 | __be32 *p; | 3477 | __be32 *p; |
3478 | |||
3479 | p = xdr_inline_decode(xdr, len); | ||
3480 | if (likely(p)) { | ||
3481 | memcpy(buf, p, len); | ||
3482 | return 0; | ||
3483 | } | ||
3484 | print_overflow_msg(__func__, xdr); | ||
3485 | return -EIO; | ||
3486 | } | ||
3487 | |||
3488 | static int decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) | ||
3489 | { | ||
3490 | return decode_opaque_fixed(xdr, stateid->data, NFS4_STATEID_SIZE); | ||
3491 | } | ||
3492 | |||
3493 | static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res) | ||
3494 | { | ||
3345 | int status; | 3495 | int status; |
3346 | 3496 | ||
3347 | status = decode_op_hdr(xdr, OP_CLOSE); | 3497 | status = decode_op_hdr(xdr, OP_CLOSE); |
3348 | if (status != -EIO) | 3498 | if (status != -EIO) |
3349 | nfs_increment_open_seqid(status, res->seqid); | 3499 | nfs_increment_open_seqid(status, res->seqid); |
3350 | if (status) | 3500 | if (!status) |
3351 | return status; | 3501 | status = decode_stateid(xdr, &res->stateid); |
3352 | READ_BUF(NFS4_STATEID_SIZE); | 3502 | return status; |
3353 | COPYMEM(res->stateid.data, NFS4_STATEID_SIZE); | 3503 | } |
3354 | return 0; | 3504 | |
3505 | static int decode_verifier(struct xdr_stream *xdr, void *verifier) | ||
3506 | { | ||
3507 | return decode_opaque_fixed(xdr, verifier, 8); | ||
3355 | } | 3508 | } |
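The new decode_opaque_fixed() helper replaces the repeated READ_BUF/COPYMEM pairs with a single bounds-checked copy, and decode_stateid() and decode_verifier() become one-liners on top of it. A userspace sketch of the same layering, assuming byte-oriented toy types rather than the kernel's struct xdr_stream and nfs4_stateid:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct toy_xdr {
	uint8_t *p;
	uint8_t *end;
};

/* One bounds-checked copy shared by every fixed-size blob. Sizes
 * below mirror NFS4_STATEID_SIZE (16) and the 8-byte verifier. */
static int toy_decode_opaque_fixed(struct toy_xdr *xdr, void *buf, size_t len)
{
	if ((size_t)(xdr->end - xdr->p) < len)
		return -5;			/* stands in for -EIO */
	memcpy(buf, xdr->p, len);
	xdr->p += (len + 3) & ~(size_t)3;	/* keep 4-byte alignment */
	return 0;
}

static int toy_decode_stateid(struct toy_xdr *xdr, uint8_t stateid[16])
{
	return toy_decode_opaque_fixed(xdr, stateid, 16);
}

static int toy_decode_verifier(struct toy_xdr *xdr, uint8_t verf[8])
{
	return toy_decode_opaque_fixed(xdr, verf, 8);
}

int main(void)
{
	uint8_t wire[24] = { 1, 2, 3 }, stateid[16], verf[8];
	struct toy_xdr xdr = { wire, wire + sizeof(wire) };

	printf("%d %d %u\n", toy_decode_stateid(&xdr, stateid),
	       toy_decode_verifier(&xdr, verf), stateid[0]);
	return 0;
}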
3356 | 3509 | ||
3357 | static int decode_commit(struct xdr_stream *xdr, struct nfs_writeres *res) | 3510 | static int decode_commit(struct xdr_stream *xdr, struct nfs_writeres *res) |
3358 | { | 3511 | { |
3359 | __be32 *p; | ||
3360 | int status; | 3512 | int status; |
3361 | 3513 | ||
3362 | status = decode_op_hdr(xdr, OP_COMMIT); | 3514 | status = decode_op_hdr(xdr, OP_COMMIT); |
3363 | if (status) | 3515 | if (!status) |
3364 | return status; | 3516 | status = decode_verifier(xdr, res->verf->verifier); |
3365 | READ_BUF(8); | 3517 | return status; |
3366 | COPYMEM(res->verf->verifier, 8); | ||
3367 | return 0; | ||
3368 | } | 3518 | } |
3369 | 3519 | ||
3370 | static int decode_create(struct xdr_stream *xdr, struct nfs4_change_info *cinfo) | 3520 | static int decode_create(struct xdr_stream *xdr, struct nfs4_change_info *cinfo) |
@@ -3378,10 +3528,16 @@ static int decode_create(struct xdr_stream *xdr, struct nfs4_change_info *cinfo) | |||
3378 | return status; | 3528 | return status; |
3379 | if ((status = decode_change_info(xdr, cinfo))) | 3529 | if ((status = decode_change_info(xdr, cinfo))) |
3380 | return status; | 3530 | return status; |
3381 | READ_BUF(4); | 3531 | p = xdr_inline_decode(xdr, 4); |
3382 | READ32(bmlen); | 3532 | if (unlikely(!p)) |
3383 | READ_BUF(bmlen << 2); | 3533 | goto out_overflow; |
3384 | return 0; | 3534 | bmlen = be32_to_cpup(p); |
3535 | p = xdr_inline_decode(xdr, bmlen << 2); | ||
3536 | if (likely(p)) | ||
3537 | return 0; | ||
3538 | out_overflow: | ||
3539 | print_overflow_msg(__func__, xdr); | ||
3540 | return -EIO; | ||
3385 | } | 3541 | } |
3386 | 3542 | ||
3387 | static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_res *res) | 3543 | static int decode_server_caps(struct xdr_stream *xdr, struct nfs4_server_caps_res *res) |
@@ -3466,7 +3622,8 @@ xdr_error: | |||
3466 | return status; | 3622 | return status; |
3467 | } | 3623 | } |
3468 | 3624 | ||
3469 | static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, const struct nfs_server *server) | 3625 | static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, |
3626 | const struct nfs_server *server, int may_sleep) | ||
3470 | { | 3627 | { |
3471 | __be32 *savep; | 3628 | __be32 *savep; |
3472 | uint32_t attrlen, | 3629 | uint32_t attrlen, |
@@ -3538,12 +3695,14 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons | |||
3538 | goto xdr_error; | 3695 | goto xdr_error; |
3539 | fattr->valid |= status; | 3696 | fattr->valid |= status; |
3540 | 3697 | ||
3541 | status = decode_attr_owner(xdr, bitmap, server->nfs_client, &fattr->uid); | 3698 | status = decode_attr_owner(xdr, bitmap, server->nfs_client, |
3699 | &fattr->uid, may_sleep); | ||
3542 | if (status < 0) | 3700 | if (status < 0) |
3543 | goto xdr_error; | 3701 | goto xdr_error; |
3544 | fattr->valid |= status; | 3702 | fattr->valid |= status; |
3545 | 3703 | ||
3546 | status = decode_attr_group(xdr, bitmap, server->nfs_client, &fattr->gid); | 3704 | status = decode_attr_group(xdr, bitmap, server->nfs_client, |
3705 | &fattr->gid, may_sleep); | ||
3547 | if (status < 0) | 3706 | if (status < 0) |
3548 | goto xdr_error; | 3707 | goto xdr_error; |
3549 | fattr->valid |= status; | 3708 | fattr->valid |= status; |
@@ -3633,14 +3792,21 @@ static int decode_getfh(struct xdr_stream *xdr, struct nfs_fh *fh) | |||
3633 | if (status) | 3792 | if (status) |
3634 | return status; | 3793 | return status; |
3635 | 3794 | ||
3636 | READ_BUF(4); | 3795 | p = xdr_inline_decode(xdr, 4); |
3637 | READ32(len); | 3796 | if (unlikely(!p)) |
3797 | goto out_overflow; | ||
3798 | len = be32_to_cpup(p); | ||
3638 | if (len > NFS4_FHSIZE) | 3799 | if (len > NFS4_FHSIZE) |
3639 | return -EIO; | 3800 | return -EIO; |
3640 | fh->size = len; | 3801 | fh->size = len; |
3641 | READ_BUF(len); | 3802 | p = xdr_inline_decode(xdr, len); |
3642 | COPYMEM(fh->data, len); | 3803 | if (unlikely(!p)) |
3804 | goto out_overflow; | ||
3805 | memcpy(fh->data, p, len); | ||
3643 | return 0; | 3806 | return 0; |
3807 | out_overflow: | ||
3808 | print_overflow_msg(__func__, xdr); | ||
3809 | return -EIO; | ||
3644 | } | 3810 | } |
3645 | 3811 | ||
3646 | static int decode_link(struct xdr_stream *xdr, struct nfs4_change_info *cinfo) | 3812 | static int decode_link(struct xdr_stream *xdr, struct nfs4_change_info *cinfo) |
@@ -3662,10 +3828,12 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl) | |||
3662 | __be32 *p; | 3828 | __be32 *p; |
3663 | uint32_t namelen, type; | 3829 | uint32_t namelen, type; |
3664 | 3830 | ||
3665 | READ_BUF(32); | 3831 | p = xdr_inline_decode(xdr, 32); |
3666 | READ64(offset); | 3832 | if (unlikely(!p)) |
3667 | READ64(length); | 3833 | goto out_overflow; |
3668 | READ32(type); | 3834 | p = xdr_decode_hyper(p, &offset); |
3835 | p = xdr_decode_hyper(p, &length); | ||
3836 | type = be32_to_cpup(p++); | ||
3669 | if (fl != NULL) { | 3837 | if (fl != NULL) { |
3670 | fl->fl_start = (loff_t)offset; | 3838 | fl->fl_start = (loff_t)offset; |
3671 | fl->fl_end = fl->fl_start + (loff_t)length - 1; | 3839 | fl->fl_end = fl->fl_start + (loff_t)length - 1; |
@@ -3676,23 +3844,27 @@ static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl) | |||
3676 | fl->fl_type = F_RDLCK; | 3844 | fl->fl_type = F_RDLCK; |
3677 | fl->fl_pid = 0; | 3845 | fl->fl_pid = 0; |
3678 | } | 3846 | } |
3679 | READ64(clientid); | 3847 | p = xdr_decode_hyper(p, &clientid); |
3680 | READ32(namelen); | 3848 | namelen = be32_to_cpup(p); |
3681 | READ_BUF(namelen); | 3849 | p = xdr_inline_decode(xdr, namelen); |
3682 | return -NFS4ERR_DENIED; | 3850 | if (likely(p)) |
3851 | return -NFS4ERR_DENIED; | ||
3852 | out_overflow: | ||
3853 | print_overflow_msg(__func__, xdr); | ||
3854 | return -EIO; | ||
3683 | } | 3855 | } |
3684 | 3856 | ||
3685 | static int decode_lock(struct xdr_stream *xdr, struct nfs_lock_res *res) | 3857 | static int decode_lock(struct xdr_stream *xdr, struct nfs_lock_res *res) |
3686 | { | 3858 | { |
3687 | __be32 *p; | ||
3688 | int status; | 3859 | int status; |
3689 | 3860 | ||
3690 | status = decode_op_hdr(xdr, OP_LOCK); | 3861 | status = decode_op_hdr(xdr, OP_LOCK); |
3691 | if (status == -EIO) | 3862 | if (status == -EIO) |
3692 | goto out; | 3863 | goto out; |
3693 | if (status == 0) { | 3864 | if (status == 0) { |
3694 | READ_BUF(NFS4_STATEID_SIZE); | 3865 | status = decode_stateid(xdr, &res->stateid); |
3695 | COPYMEM(res->stateid.data, NFS4_STATEID_SIZE); | 3866 | if (unlikely(status)) |
3867 | goto out; | ||
3696 | } else if (status == -NFS4ERR_DENIED) | 3868 | } else if (status == -NFS4ERR_DENIED) |
3697 | status = decode_lock_denied(xdr, NULL); | 3869 | status = decode_lock_denied(xdr, NULL); |
3698 | if (res->open_seqid != NULL) | 3870 | if (res->open_seqid != NULL) |
@@ -3713,16 +3885,13 @@ static int decode_lockt(struct xdr_stream *xdr, struct nfs_lockt_res *res) | |||
3713 | 3885 | ||
3714 | static int decode_locku(struct xdr_stream *xdr, struct nfs_locku_res *res) | 3886 | static int decode_locku(struct xdr_stream *xdr, struct nfs_locku_res *res) |
3715 | { | 3887 | { |
3716 | __be32 *p; | ||
3717 | int status; | 3888 | int status; |
3718 | 3889 | ||
3719 | status = decode_op_hdr(xdr, OP_LOCKU); | 3890 | status = decode_op_hdr(xdr, OP_LOCKU); |
3720 | if (status != -EIO) | 3891 | if (status != -EIO) |
3721 | nfs_increment_lock_seqid(status, res->seqid); | 3892 | nfs_increment_lock_seqid(status, res->seqid); |
3722 | if (status == 0) { | 3893 | if (status == 0) |
3723 | READ_BUF(NFS4_STATEID_SIZE); | 3894 | status = decode_stateid(xdr, &res->stateid); |
3724 | COPYMEM(res->stateid.data, NFS4_STATEID_SIZE); | ||
3725 | } | ||
3726 | return status; | 3895 | return status; |
3727 | } | 3896 | } |
3728 | 3897 | ||
@@ -3737,34 +3906,46 @@ static int decode_space_limit(struct xdr_stream *xdr, u64 *maxsize) | |||
3737 | __be32 *p; | 3906 | __be32 *p; |
3738 | uint32_t limit_type, nblocks, blocksize; | 3907 | uint32_t limit_type, nblocks, blocksize; |
3739 | 3908 | ||
3740 | READ_BUF(12); | 3909 | p = xdr_inline_decode(xdr, 12); |
3741 | READ32(limit_type); | 3910 | if (unlikely(!p)) |
3911 | goto out_overflow; | ||
3912 | limit_type = be32_to_cpup(p++); | ||
3742 | switch (limit_type) { | 3913 | switch (limit_type) { |
3743 | case 1: | 3914 | case 1: |
3744 | READ64(*maxsize); | 3915 | xdr_decode_hyper(p, maxsize); |
3745 | break; | 3916 | break; |
3746 | case 2: | 3917 | case 2: |
3747 | READ32(nblocks); | 3918 | nblocks = be32_to_cpup(p++); |
3748 | READ32(blocksize); | 3919 | blocksize = be32_to_cpup(p); |
3749 | *maxsize = (uint64_t)nblocks * (uint64_t)blocksize; | 3920 | *maxsize = (uint64_t)nblocks * (uint64_t)blocksize; |
3750 | } | 3921 | } |
3751 | return 0; | 3922 | return 0; |
3923 | out_overflow: | ||
3924 | print_overflow_msg(__func__, xdr); | ||
3925 | return -EIO; | ||
3752 | } | 3926 | } |
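In decode_space_limit(), the casts in `(uint64_t)nblocks * (uint64_t)blocksize` matter: both operands are 32-bit, so without widening first the product would be computed modulo 2^32 before the assignment to the 64-bit maxsize. A short demonstration with values chosen to overflow 32 bits:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint32_t nblocks = 0x100000, blocksize = 0x10000; /* 2^20 * 2^16 */
	uint64_t wrong = nblocks * blocksize;	/* 32-bit multiply wraps to 0 */
	uint64_t right = (uint64_t)nblocks * (uint64_t)blocksize;

	printf("%llu %llu\n", (unsigned long long)wrong,
	       (unsigned long long)right);	/* 0 68719476736 */
	return 0;
}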
3753 | 3927 | ||
3754 | static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res) | 3928 | static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res) |
3755 | { | 3929 | { |
3756 | __be32 *p; | 3930 | __be32 *p; |
3757 | uint32_t delegation_type; | 3931 | uint32_t delegation_type; |
3932 | int status; | ||
3758 | 3933 | ||
3759 | READ_BUF(4); | 3934 | p = xdr_inline_decode(xdr, 4); |
3760 | READ32(delegation_type); | 3935 | if (unlikely(!p)) |
3936 | goto out_overflow; | ||
3937 | delegation_type = be32_to_cpup(p); | ||
3761 | if (delegation_type == NFS4_OPEN_DELEGATE_NONE) { | 3938 | if (delegation_type == NFS4_OPEN_DELEGATE_NONE) { |
3762 | res->delegation_type = 0; | 3939 | res->delegation_type = 0; |
3763 | return 0; | 3940 | return 0; |
3764 | } | 3941 | } |
3765 | READ_BUF(NFS4_STATEID_SIZE+4); | 3942 | status = decode_stateid(xdr, &res->delegation); |
3766 | COPYMEM(res->delegation.data, NFS4_STATEID_SIZE); | 3943 | if (unlikely(status)) |
3767 | READ32(res->do_recall); | 3944 | return status; |
3945 | p = xdr_inline_decode(xdr, 4); | ||
3946 | if (unlikely(!p)) | ||
3947 | goto out_overflow; | ||
3948 | res->do_recall = be32_to_cpup(p); | ||
3768 | 3949 | ||
3769 | switch (delegation_type) { | 3950 | switch (delegation_type) { |
3770 | case NFS4_OPEN_DELEGATE_READ: | 3951 | case NFS4_OPEN_DELEGATE_READ: |
@@ -3776,6 +3957,9 @@ static int decode_delegation(struct xdr_stream *xdr, struct nfs_openres *res) | |||
3776 | return -EIO; | 3957 | return -EIO; |
3777 | } | 3958 | } |
3778 | return decode_ace(xdr, NULL, res->server->nfs_client); | 3959 | return decode_ace(xdr, NULL, res->server->nfs_client); |
3960 | out_overflow: | ||
3961 | print_overflow_msg(__func__, xdr); | ||
3962 | return -EIO; | ||
3779 | } | 3963 | } |
3780 | 3964 | ||
3781 | static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res) | 3965 | static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res) |
@@ -3787,23 +3971,27 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res) | |||
3787 | status = decode_op_hdr(xdr, OP_OPEN); | 3971 | status = decode_op_hdr(xdr, OP_OPEN); |
3788 | if (status != -EIO) | 3972 | if (status != -EIO) |
3789 | nfs_increment_open_seqid(status, res->seqid); | 3973 | nfs_increment_open_seqid(status, res->seqid); |
3790 | if (status) | 3974 | if (!status) |
3975 | status = decode_stateid(xdr, &res->stateid); | ||
3976 | if (unlikely(status)) | ||
3791 | return status; | 3977 | return status; |
3792 | READ_BUF(NFS4_STATEID_SIZE); | ||
3793 | COPYMEM(res->stateid.data, NFS4_STATEID_SIZE); | ||
3794 | 3978 | ||
3795 | decode_change_info(xdr, &res->cinfo); | 3979 | decode_change_info(xdr, &res->cinfo); |
3796 | 3980 | ||
3797 | READ_BUF(8); | 3981 | p = xdr_inline_decode(xdr, 8); |
3798 | READ32(res->rflags); | 3982 | if (unlikely(!p)) |
3799 | READ32(bmlen); | 3983 | goto out_overflow; |
3984 | res->rflags = be32_to_cpup(p++); | ||
3985 | bmlen = be32_to_cpup(p); | ||
3800 | if (bmlen > 10) | 3986 | if (bmlen > 10) |
3801 | goto xdr_error; | 3987 | goto xdr_error; |
3802 | 3988 | ||
3803 | READ_BUF(bmlen << 2); | 3989 | p = xdr_inline_decode(xdr, bmlen << 2); |
3990 | if (unlikely(!p)) | ||
3991 | goto out_overflow; | ||
3804 | savewords = min_t(uint32_t, bmlen, NFS4_BITMAP_SIZE); | 3992 | savewords = min_t(uint32_t, bmlen, NFS4_BITMAP_SIZE); |
3805 | for (i = 0; i < savewords; ++i) | 3993 | for (i = 0; i < savewords; ++i) |
3806 | READ32(res->attrset[i]); | 3994 | res->attrset[i] = be32_to_cpup(p++); |
3807 | for (; i < NFS4_BITMAP_SIZE; i++) | 3995 | for (; i < NFS4_BITMAP_SIZE; i++) |
3808 | res->attrset[i] = 0; | 3996 | res->attrset[i] = 0; |
3809 | 3997 | ||
@@ -3811,36 +3999,33 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res) | |||
3811 | xdr_error: | 3999 | xdr_error: |
3812 | dprintk("%s: Bitmap too large! Length = %u\n", __func__, bmlen); | 4000 | dprintk("%s: Bitmap too large! Length = %u\n", __func__, bmlen); |
3813 | return -EIO; | 4001 | return -EIO; |
4002 | out_overflow: | ||
4003 | print_overflow_msg(__func__, xdr); | ||
4004 | return -EIO; | ||
3814 | } | 4005 | } |
3815 | 4006 | ||
3816 | static int decode_open_confirm(struct xdr_stream *xdr, struct nfs_open_confirmres *res) | 4007 | static int decode_open_confirm(struct xdr_stream *xdr, struct nfs_open_confirmres *res) |
3817 | { | 4008 | { |
3818 | __be32 *p; | ||
3819 | int status; | 4009 | int status; |
3820 | 4010 | ||
3821 | status = decode_op_hdr(xdr, OP_OPEN_CONFIRM); | 4011 | status = decode_op_hdr(xdr, OP_OPEN_CONFIRM); |
3822 | if (status != -EIO) | 4012 | if (status != -EIO) |
3823 | nfs_increment_open_seqid(status, res->seqid); | 4013 | nfs_increment_open_seqid(status, res->seqid); |
3824 | if (status) | 4014 | if (!status) |
3825 | return status; | 4015 | status = decode_stateid(xdr, &res->stateid); |
3826 | READ_BUF(NFS4_STATEID_SIZE); | 4016 | return status; |
3827 | COPYMEM(res->stateid.data, NFS4_STATEID_SIZE); | ||
3828 | return 0; | ||
3829 | } | 4017 | } |
3830 | 4018 | ||
3831 | static int decode_open_downgrade(struct xdr_stream *xdr, struct nfs_closeres *res) | 4019 | static int decode_open_downgrade(struct xdr_stream *xdr, struct nfs_closeres *res) |
3832 | { | 4020 | { |
3833 | __be32 *p; | ||
3834 | int status; | 4021 | int status; |
3835 | 4022 | ||
3836 | status = decode_op_hdr(xdr, OP_OPEN_DOWNGRADE); | 4023 | status = decode_op_hdr(xdr, OP_OPEN_DOWNGRADE); |
3837 | if (status != -EIO) | 4024 | if (status != -EIO) |
3838 | nfs_increment_open_seqid(status, res->seqid); | 4025 | nfs_increment_open_seqid(status, res->seqid); |
3839 | if (status) | 4026 | if (!status) |
3840 | return status; | 4027 | status = decode_stateid(xdr, &res->stateid); |
3841 | READ_BUF(NFS4_STATEID_SIZE); | 4028 | return status; |
3842 | COPYMEM(res->stateid.data, NFS4_STATEID_SIZE); | ||
3843 | return 0; | ||
3844 | } | 4029 | } |
3845 | 4030 | ||
3846 | static int decode_putfh(struct xdr_stream *xdr) | 4031 | static int decode_putfh(struct xdr_stream *xdr) |
@@ -3863,9 +4048,11 @@ static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs_ | |||
3863 | status = decode_op_hdr(xdr, OP_READ); | 4048 | status = decode_op_hdr(xdr, OP_READ); |
3864 | if (status) | 4049 | if (status) |
3865 | return status; | 4050 | return status; |
3866 | READ_BUF(8); | 4051 | p = xdr_inline_decode(xdr, 8); |
3867 | READ32(eof); | 4052 | if (unlikely(!p)) |
3868 | READ32(count); | 4053 | goto out_overflow; |
4054 | eof = be32_to_cpup(p++); | ||
4055 | count = be32_to_cpup(p); | ||
3869 | hdrlen = (u8 *) p - (u8 *) iov->iov_base; | 4056 | hdrlen = (u8 *) p - (u8 *) iov->iov_base; |
3870 | recvd = req->rq_rcv_buf.len - hdrlen; | 4057 | recvd = req->rq_rcv_buf.len - hdrlen; |
3871 | if (count > recvd) { | 4058 | if (count > recvd) { |
@@ -3878,6 +4065,9 @@ static int decode_read(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs_ | |||
3878 | res->eof = eof; | 4065 | res->eof = eof; |
3879 | res->count = count; | 4066 | res->count = count; |
3880 | return 0; | 4067 | return 0; |
4068 | out_overflow: | ||
4069 | print_overflow_msg(__func__, xdr); | ||
4070 | return -EIO; | ||
3881 | } | 4071 | } |
3882 | 4072 | ||
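
The length check following the two decoded words deserves a gloss: the count the server reports is trusted only up to what was actually received. In effect (the full branch in this function also logs and clears the EOF flag, elided here):

    hdrlen = (u8 *)p - (u8 *)iov->iov_base;	/* reply header bytes consumed */
    recvd  = req->rq_rcv_buf.len - hdrlen;	/* payload bytes on the wire */
    if (count > recvd)
    	count = recvd;				/* clamp a short or lying reply */
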
3883 | static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs4_readdir_res *readdir) | 4073 | static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct nfs4_readdir_res *readdir) |
@@ -3892,17 +4082,17 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n | |||
3892 | int status; | 4082 | int status; |
3893 | 4083 | ||
3894 | status = decode_op_hdr(xdr, OP_READDIR); | 4084 | status = decode_op_hdr(xdr, OP_READDIR); |
3895 | if (status) | 4085 | if (!status) |
4086 | status = decode_verifier(xdr, readdir->verifier.data); | ||
4087 | if (unlikely(status)) | ||
3896 | return status; | 4088 | return status; |
3897 | READ_BUF(8); | ||
3898 | COPYMEM(readdir->verifier.data, 8); | ||
3899 | dprintk("%s: verifier = %08x:%08x\n", | 4089 | dprintk("%s: verifier = %08x:%08x\n", |
3900 | __func__, | 4090 | __func__, |
3901 | ((u32 *)readdir->verifier.data)[0], | 4091 | ((u32 *)readdir->verifier.data)[0], |
3902 | ((u32 *)readdir->verifier.data)[1]); | 4092 | ((u32 *)readdir->verifier.data)[1]); |
3903 | 4093 | ||
3904 | 4094 | ||
3905 | hdrlen = (char *) p - (char *) iov->iov_base; | 4095 | hdrlen = (char *) xdr->p - (char *) iov->iov_base; |
3906 | recvd = rcvbuf->len - hdrlen; | 4096 | recvd = rcvbuf->len - hdrlen; |
3907 | if (pglen > recvd) | 4097 | if (pglen > recvd) |
3908 | pglen = recvd; | 4098 | pglen = recvd; |
@@ -3990,8 +4180,10 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req) | |||
3990 | return status; | 4180 | return status; |
3991 | 4181 | ||
3992 | /* Convert length of symlink */ | 4182 | /* Convert length of symlink */ |
3993 | READ_BUF(4); | 4183 | p = xdr_inline_decode(xdr, 4); |
3994 | READ32(len); | 4184 | if (unlikely(!p)) |
4185 | goto out_overflow; | ||
4186 | len = be32_to_cpup(p); | ||
3995 | if (len >= rcvbuf->page_len || len <= 0) { | 4187 | if (len >= rcvbuf->page_len || len <= 0) { |
3996 | dprintk("nfs: server returned giant symlink!\n"); | 4188 | dprintk("nfs: server returned giant symlink!\n"); |
3997 | return -ENAMETOOLONG; | 4189 | return -ENAMETOOLONG; |
@@ -4015,6 +4207,9 @@ static int decode_readlink(struct xdr_stream *xdr, struct rpc_rqst *req) | |||
4015 | kaddr[len+rcvbuf->page_base] = '\0'; | 4207 | kaddr[len+rcvbuf->page_base] = '\0'; |
4016 | kunmap_atomic(kaddr, KM_USER0); | 4208 | kunmap_atomic(kaddr, KM_USER0); |
4017 | return 0; | 4209 | return 0; |
4210 | out_overflow: | ||
4211 | print_overflow_msg(__func__, xdr); | ||
4212 | return -EIO; | ||
4018 | } | 4213 | } |
4019 | 4214 | ||
4020 | static int decode_remove(struct xdr_stream *xdr, struct nfs4_change_info *cinfo) | 4215 | static int decode_remove(struct xdr_stream *xdr, struct nfs4_change_info *cinfo) |
@@ -4112,10 +4307,16 @@ static int decode_setattr(struct xdr_stream *xdr) | |||
4112 | status = decode_op_hdr(xdr, OP_SETATTR); | 4307 | status = decode_op_hdr(xdr, OP_SETATTR); |
4113 | if (status) | 4308 | if (status) |
4114 | return status; | 4309 | return status; |
4115 | READ_BUF(4); | 4310 | p = xdr_inline_decode(xdr, 4); |
4116 | READ32(bmlen); | 4311 | if (unlikely(!p)) |
4117 | READ_BUF(bmlen << 2); | 4312 | goto out_overflow; |
4118 | return 0; | 4313 | bmlen = be32_to_cpup(p); |
4314 | p = xdr_inline_decode(xdr, bmlen << 2); | ||
4315 | if (likely(p)) | ||
4316 | return 0; | ||
4317 | out_overflow: | ||
4318 | print_overflow_msg(__func__, xdr); | ||
4319 | return -EIO; | ||
4119 | } | 4320 | } |
4120 | 4321 | ||
4121 | static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp) | 4322 | static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp) |
@@ -4124,35 +4325,50 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp) | |||
4124 | uint32_t opnum; | 4325 | uint32_t opnum; |
4125 | int32_t nfserr; | 4326 | int32_t nfserr; |
4126 | 4327 | ||
4127 | READ_BUF(8); | 4328 | p = xdr_inline_decode(xdr, 8); |
4128 | READ32(opnum); | 4329 | if (unlikely(!p)) |
4330 | goto out_overflow; | ||
4331 | opnum = be32_to_cpup(p++); | ||
4129 | if (opnum != OP_SETCLIENTID) { | 4332 | if (opnum != OP_SETCLIENTID) { |
4130 | dprintk("nfs: decode_setclientid: Server returned operation" | 4333 | dprintk("nfs: decode_setclientid: Server returned operation" |
4131 | " %d\n", opnum); | 4334 | " %d\n", opnum); |
4132 | return -EIO; | 4335 | return -EIO; |
4133 | } | 4336 | } |
4134 | READ32(nfserr); | 4337 | nfserr = be32_to_cpup(p); |
4135 | if (nfserr == NFS_OK) { | 4338 | if (nfserr == NFS_OK) { |
4136 | READ_BUF(8 + NFS4_VERIFIER_SIZE); | 4339 | p = xdr_inline_decode(xdr, 8 + NFS4_VERIFIER_SIZE); |
4137 | READ64(clp->cl_clientid); | 4340 | if (unlikely(!p)) |
4138 | COPYMEM(clp->cl_confirm.data, NFS4_VERIFIER_SIZE); | 4341 | goto out_overflow; |
4342 | p = xdr_decode_hyper(p, &clp->cl_clientid); | ||
4343 | memcpy(clp->cl_confirm.data, p, NFS4_VERIFIER_SIZE); | ||
4139 | } else if (nfserr == NFSERR_CLID_INUSE) { | 4344 | } else if (nfserr == NFSERR_CLID_INUSE) { |
4140 | uint32_t len; | 4345 | uint32_t len; |
4141 | 4346 | ||
4142 | /* skip netid string */ | 4347 | /* skip netid string */ |
4143 | READ_BUF(4); | 4348 | p = xdr_inline_decode(xdr, 4); |
4144 | READ32(len); | 4349 | if (unlikely(!p)) |
4145 | READ_BUF(len); | 4350 | goto out_overflow; |
4351 | len = be32_to_cpup(p); | ||
4352 | p = xdr_inline_decode(xdr, len); | ||
4353 | if (unlikely(!p)) | ||
4354 | goto out_overflow; | ||
4146 | 4355 | ||
4147 | /* skip uaddr string */ | 4356 | /* skip uaddr string */ |
4148 | READ_BUF(4); | 4357 | p = xdr_inline_decode(xdr, 4); |
4149 | READ32(len); | 4358 | if (unlikely(!p)) |
4150 | READ_BUF(len); | 4359 | goto out_overflow; |
4360 | len = be32_to_cpup(p); | ||
4361 | p = xdr_inline_decode(xdr, len); | ||
4362 | if (unlikely(!p)) | ||
4363 | goto out_overflow; | ||
4151 | return -NFSERR_CLID_INUSE; | 4364 | return -NFSERR_CLID_INUSE; |
4152 | } else | 4365 | } else |
4153 | return nfs4_stat_to_errno(nfserr); | 4366 | return nfs4_stat_to_errno(nfserr); |
4154 | 4367 | ||
4155 | return 0; | 4368 | return 0; |
4369 | out_overflow: | ||
4370 | print_overflow_msg(__func__, xdr); | ||
4371 | return -EIO; | ||
4156 | } | 4372 | } |
4157 | 4373 | ||
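
The NFSERR_CLID_INUSE branch above skips the netid and uaddr strings by hand, twice. The pattern generalizes; a hypothetical helper (xdr_skip_opaque() does not exist in this file) would read:

    static int xdr_skip_opaque(struct xdr_stream *xdr)
    {
    	__be32 *p;
    	u32 len;

    	p = xdr_inline_decode(xdr, 4);		/* opaque length word */
    	if (unlikely(!p))
    		return -EIO;
    	len = be32_to_cpup(p);
    	/* consume the payload without copying (padding elided, as above) */
    	return xdr_inline_decode(xdr, len) ? 0 : -EIO;
    }
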
4158 | static int decode_setclientid_confirm(struct xdr_stream *xdr) | 4374 | static int decode_setclientid_confirm(struct xdr_stream *xdr) |
@@ -4169,11 +4385,16 @@ static int decode_write(struct xdr_stream *xdr, struct nfs_writeres *res) | |||
4169 | if (status) | 4385 | if (status) |
4170 | return status; | 4386 | return status; |
4171 | 4387 | ||
4172 | READ_BUF(16); | 4388 | p = xdr_inline_decode(xdr, 16); |
4173 | READ32(res->count); | 4389 | if (unlikely(!p)) |
4174 | READ32(res->verf->committed); | 4390 | goto out_overflow; |
4175 | COPYMEM(res->verf->verifier, 8); | 4391 | res->count = be32_to_cpup(p++); |
4392 | res->verf->committed = be32_to_cpup(p++); | ||
4393 | memcpy(res->verf->verifier, p, 8); | ||
4176 | return 0; | 4394 | return 0; |
4395 | out_overflow: | ||
4396 | print_overflow_msg(__func__, xdr); | ||
4397 | return -EIO; | ||
4177 | } | 4398 | } |
4178 | 4399 | ||
4179 | static int decode_delegreturn(struct xdr_stream *xdr) | 4400 | static int decode_delegreturn(struct xdr_stream *xdr) |
@@ -4187,6 +4408,7 @@ static int decode_exchange_id(struct xdr_stream *xdr, | |||
4187 | { | 4408 | { |
4188 | __be32 *p; | 4409 | __be32 *p; |
4189 | uint32_t dummy; | 4410 | uint32_t dummy; |
4411 | char *dummy_str; | ||
4190 | int status; | 4412 | int status; |
4191 | struct nfs_client *clp = res->client; | 4413 | struct nfs_client *clp = res->client; |
4192 | 4414 | ||
@@ -4194,36 +4416,45 @@ static int decode_exchange_id(struct xdr_stream *xdr, | |||
4194 | if (status) | 4416 | if (status) |
4195 | return status; | 4417 | return status; |
4196 | 4418 | ||
4197 | READ_BUF(8); | 4419 | p = xdr_inline_decode(xdr, 8); |
4198 | READ64(clp->cl_ex_clid); | 4420 | if (unlikely(!p)) |
4199 | READ_BUF(12); | 4421 | goto out_overflow; |
4200 | READ32(clp->cl_seqid); | 4422 | xdr_decode_hyper(p, &clp->cl_ex_clid); |
4201 | READ32(clp->cl_exchange_flags); | 4423 | p = xdr_inline_decode(xdr, 12); |
4424 | if (unlikely(!p)) | ||
4425 | goto out_overflow; | ||
4426 | clp->cl_seqid = be32_to_cpup(p++); | ||
4427 | clp->cl_exchange_flags = be32_to_cpup(p++); | ||
4202 | 4428 | ||
4203 | /* We ask for SP4_NONE */ | 4429 | /* We ask for SP4_NONE */ |
4204 | READ32(dummy); | 4430 | dummy = be32_to_cpup(p); |
4205 | if (dummy != SP4_NONE) | 4431 | if (dummy != SP4_NONE) |
4206 | return -EIO; | 4432 | return -EIO; |
4207 | 4433 | ||
4208 | /* Throw away minor_id */ | 4434 | /* Throw away minor_id */ |
4209 | READ_BUF(8); | 4435 | p = xdr_inline_decode(xdr, 8); |
4436 | if (unlikely(!p)) | ||
4437 | goto out_overflow; | ||
4210 | 4438 | ||
4211 | /* Throw away Major id */ | 4439 | /* Throw away Major id */ |
4212 | READ_BUF(4); | 4440 | status = decode_opaque_inline(xdr, &dummy, &dummy_str); |
4213 | READ32(dummy); | 4441 | if (unlikely(status)) |
4214 | READ_BUF(dummy); | 4442 | return status; |
4215 | 4443 | ||
4216 | /* Throw away server_scope */ | 4444 | /* Throw away server_scope */ |
4217 | READ_BUF(4); | 4445 | status = decode_opaque_inline(xdr, &dummy, &dummy_str); |
4218 | READ32(dummy); | 4446 | if (unlikely(status)) |
4219 | READ_BUF(dummy); | 4447 | return status; |
4220 | 4448 | ||
4221 | /* Throw away Implementation id array */ | 4449 | /* Throw away Implementation id array */ |
4222 | READ_BUF(4); | 4450 | status = decode_opaque_inline(xdr, &dummy, &dummy_str); |
4223 | READ32(dummy); | 4451 | if (unlikely(status)) |
4224 | READ_BUF(dummy); | 4452 | return status; |
4225 | 4453 | ||
4226 | return 0; | 4454 | return 0; |
4455 | out_overflow: | ||
4456 | print_overflow_msg(__func__, xdr); | ||
4457 | return -EIO; | ||
4227 | } | 4458 | } |
4228 | 4459 | ||
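
decode_exchange_id() now discards the major id, server scope and implementation-id array via decode_opaque_inline(), defined near the top of this file. Roughly — a sketch consistent with these call sites, not the verbatim body:

    static int decode_opaque_inline(struct xdr_stream *xdr, unsigned int *len,
    				char **string)
    {
    	__be32 *p;

    	p = xdr_inline_decode(xdr, 4);
    	if (unlikely(!p))
    		goto out_overflow;
    	*len = be32_to_cpup(p);
    	p = xdr_inline_decode(xdr, *len);
    	if (unlikely(!p))
    		goto out_overflow;
    	/* no copy: the caller gets a pointer into the receive buffer */
    	*string = (char *)p;
    	return 0;
    out_overflow:
    	print_overflow_msg(__func__, xdr);
    	return -EIO;
    }
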
4229 | static int decode_chan_attrs(struct xdr_stream *xdr, | 4460 | static int decode_chan_attrs(struct xdr_stream *xdr, |
@@ -4232,22 +4463,35 @@ static int decode_chan_attrs(struct xdr_stream *xdr, | |||
4232 | __be32 *p; | 4463 | __be32 *p; |
4233 | u32 nr_attrs; | 4464 | u32 nr_attrs; |
4234 | 4465 | ||
4235 | READ_BUF(28); | 4466 | p = xdr_inline_decode(xdr, 28); |
4236 | READ32(attrs->headerpadsz); | 4467 | if (unlikely(!p)) |
4237 | READ32(attrs->max_rqst_sz); | 4468 | goto out_overflow; |
4238 | READ32(attrs->max_resp_sz); | 4469 | attrs->headerpadsz = be32_to_cpup(p++); |
4239 | READ32(attrs->max_resp_sz_cached); | 4470 | attrs->max_rqst_sz = be32_to_cpup(p++); |
4240 | READ32(attrs->max_ops); | 4471 | attrs->max_resp_sz = be32_to_cpup(p++); |
4241 | READ32(attrs->max_reqs); | 4472 | attrs->max_resp_sz_cached = be32_to_cpup(p++); |
4242 | READ32(nr_attrs); | 4473 | attrs->max_ops = be32_to_cpup(p++); |
4474 | attrs->max_reqs = be32_to_cpup(p++); | ||
4475 | nr_attrs = be32_to_cpup(p); | ||
4243 | if (unlikely(nr_attrs > 1)) { | 4476 | if (unlikely(nr_attrs > 1)) { |
4244 | printk(KERN_WARNING "%s: Invalid rdma channel attrs count %u\n", | 4477 | printk(KERN_WARNING "%s: Invalid rdma channel attrs count %u\n", |
4245 | __func__, nr_attrs); | 4478 | __func__, nr_attrs); |
4246 | return -EINVAL; | 4479 | return -EINVAL; |
4247 | } | 4480 | } |
4248 | if (nr_attrs == 1) | 4481 | if (nr_attrs == 1) { |
4249 | READ_BUF(4); /* skip rdma_attrs */ | 4482 | p = xdr_inline_decode(xdr, 4); /* skip rdma_attrs */ |
4483 | if (unlikely(!p)) | ||
4484 | goto out_overflow; | ||
4485 | } | ||
4250 | return 0; | 4486 | return 0; |
4487 | out_overflow: | ||
4488 | print_overflow_msg(__func__, xdr); | ||
4489 | return -EIO; | ||
4490 | } | ||
4491 | |||
4492 | static int decode_sessionid(struct xdr_stream *xdr, struct nfs4_sessionid *sid) | ||
4493 | { | ||
4494 | return decode_opaque_fixed(xdr, sid->data, NFS4_MAX_SESSIONID_LEN); | ||
4251 | } | 4495 | } |
4252 | 4496 | ||
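
decode_sessionid() is deliberately thin. The fixed-size copy underneath it — shared with decode_stateid() and the verifier decoding — amounts to the following sketch:

    static int decode_opaque_fixed(struct xdr_stream *xdr, void *buf, size_t len)
    {
    	__be32 *p;

    	p = xdr_inline_decode(xdr, len);
    	if (likely(p)) {
    		memcpy(buf, p, len);	/* len is caller-fixed, never wire-supplied */
    		return 0;
    	}
    	print_overflow_msg(__func__, xdr);
    	return -EIO;
    }
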
4253 | static int decode_create_session(struct xdr_stream *xdr, | 4497 | static int decode_create_session(struct xdr_stream *xdr, |
@@ -4259,24 +4503,26 @@ static int decode_create_session(struct xdr_stream *xdr, | |||
4259 | struct nfs4_session *session = clp->cl_session; | 4503 | struct nfs4_session *session = clp->cl_session; |
4260 | 4504 | ||
4261 | status = decode_op_hdr(xdr, OP_CREATE_SESSION); | 4505 | status = decode_op_hdr(xdr, OP_CREATE_SESSION); |
4262 | 4506 | if (!status) | |
4263 | if (status) | 4507 | status = decode_sessionid(xdr, &session->sess_id); |
4508 | if (unlikely(status)) | ||
4264 | return status; | 4509 | return status; |
4265 | 4510 | ||
4266 | /* sessionid */ | ||
4267 | READ_BUF(NFS4_MAX_SESSIONID_LEN); | ||
4268 | COPYMEM(&session->sess_id, NFS4_MAX_SESSIONID_LEN); | ||
4269 | |||
4270 | /* seqid, flags */ | 4511 | /* seqid, flags */ |
4271 | READ_BUF(8); | 4512 | p = xdr_inline_decode(xdr, 8); |
4272 | READ32(clp->cl_seqid); | 4513 | if (unlikely(!p)) |
4273 | READ32(session->flags); | 4514 | goto out_overflow; |
4515 | clp->cl_seqid = be32_to_cpup(p++); | ||
4516 | session->flags = be32_to_cpup(p); | ||
4274 | 4517 | ||
4275 | /* Channel attributes */ | 4518 | /* Channel attributes */ |
4276 | status = decode_chan_attrs(xdr, &session->fc_attrs); | 4519 | status = decode_chan_attrs(xdr, &session->fc_attrs); |
4277 | if (!status) | 4520 | if (!status) |
4278 | status = decode_chan_attrs(xdr, &session->bc_attrs); | 4521 | status = decode_chan_attrs(xdr, &session->bc_attrs); |
4279 | return status; | 4522 | return status; |
4523 | out_overflow: | ||
4524 | print_overflow_msg(__func__, xdr); | ||
4525 | return -EIO; | ||
4280 | } | 4526 | } |
4281 | 4527 | ||
4282 | static int decode_destroy_session(struct xdr_stream *xdr, void *dummy) | 4528 | static int decode_destroy_session(struct xdr_stream *xdr, void *dummy) |
@@ -4300,7 +4546,9 @@ static int decode_sequence(struct xdr_stream *xdr, | |||
4300 | return 0; | 4546 | return 0; |
4301 | 4547 | ||
4302 | status = decode_op_hdr(xdr, OP_SEQUENCE); | 4548 | status = decode_op_hdr(xdr, OP_SEQUENCE); |
4303 | if (status) | 4549 | if (!status) |
4550 | status = decode_sessionid(xdr, &id); | ||
4551 | if (unlikely(status)) | ||
4304 | goto out_err; | 4552 | goto out_err; |
4305 | 4553 | ||
4306 | /* | 4554 | /* |
@@ -4309,36 +4557,43 @@ static int decode_sequence(struct xdr_stream *xdr, | |||
4309 | */ | 4557 | */ |
4310 | status = -ESERVERFAULT; | 4558 | status = -ESERVERFAULT; |
4311 | 4559 | ||
4312 | slot = &res->sr_session->fc_slot_table.slots[res->sr_slotid]; | ||
4313 | READ_BUF(NFS4_MAX_SESSIONID_LEN + 20); | ||
4314 | COPYMEM(id.data, NFS4_MAX_SESSIONID_LEN); | ||
4315 | if (memcmp(id.data, res->sr_session->sess_id.data, | 4560 | if (memcmp(id.data, res->sr_session->sess_id.data, |
4316 | NFS4_MAX_SESSIONID_LEN)) { | 4561 | NFS4_MAX_SESSIONID_LEN)) { |
4317 | dprintk("%s Invalid session id\n", __func__); | 4562 | dprintk("%s Invalid session id\n", __func__); |
4318 | goto out_err; | 4563 | goto out_err; |
4319 | } | 4564 | } |
4565 | |||
4566 | p = xdr_inline_decode(xdr, 20); | ||
4567 | if (unlikely(!p)) | ||
4568 | goto out_overflow; | ||
4569 | |||
4320 | /* seqid */ | 4570 | /* seqid */ |
4321 | READ32(dummy); | 4571 | slot = &res->sr_session->fc_slot_table.slots[res->sr_slotid]; |
4572 | dummy = be32_to_cpup(p++); | ||
4322 | if (dummy != slot->seq_nr) { | 4573 | if (dummy != slot->seq_nr) { |
4323 | dprintk("%s Invalid sequence number\n", __func__); | 4574 | dprintk("%s Invalid sequence number\n", __func__); |
4324 | goto out_err; | 4575 | goto out_err; |
4325 | } | 4576 | } |
4326 | /* slot id */ | 4577 | /* slot id */ |
4327 | READ32(dummy); | 4578 | dummy = be32_to_cpup(p++); |
4328 | if (dummy != res->sr_slotid) { | 4579 | if (dummy != res->sr_slotid) { |
4329 | dprintk("%s Invalid slot id\n", __func__); | 4580 | dprintk("%s Invalid slot id\n", __func__); |
4330 | goto out_err; | 4581 | goto out_err; |
4331 | } | 4582 | } |
4332 | /* highest slot id - currently not processed */ | 4583 | /* highest slot id - currently not processed */ |
4333 | READ32(dummy); | 4584 | dummy = be32_to_cpup(p++); |
4334 | /* target highest slot id - currently not processed */ | 4585 | /* target highest slot id - currently not processed */ |
4335 | READ32(dummy); | 4586 | dummy = be32_to_cpup(p++); |
4336 | /* result flags - currently not processed */ | 4587 | /* result flags - currently not processed */ |
4337 | READ32(dummy); | 4588 | dummy = be32_to_cpup(p); |
4338 | status = 0; | 4589 | status = 0; |
4339 | out_err: | 4590 | out_err: |
4340 | res->sr_status = status; | 4591 | res->sr_status = status; |
4341 | return status; | 4592 | return status; |
4593 | out_overflow: | ||
4594 | print_overflow_msg(__func__, xdr); | ||
4595 | status = -EIO; | ||
4596 | goto out_err; | ||
4342 | #else /* CONFIG_NFS_V4_1 */ | 4597 | #else /* CONFIG_NFS_V4_1 */ |
4343 | return 0; | 4598 | return 0; |
4344 | #endif /* CONFIG_NFS_V4_1 */ | 4599 | #endif /* CONFIG_NFS_V4_1 */ |
@@ -4370,7 +4625,8 @@ static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp, __be32 *p, struct | |||
4370 | status = decode_open_downgrade(&xdr, res); | 4625 | status = decode_open_downgrade(&xdr, res); |
4371 | if (status != 0) | 4626 | if (status != 0) |
4372 | goto out; | 4627 | goto out; |
4373 | decode_getfattr(&xdr, res->fattr, res->server); | 4628 | decode_getfattr(&xdr, res->fattr, res->server, |
4629 | !RPC_IS_ASYNC(rqstp->rq_task)); | ||
4374 | out: | 4630 | out: |
4375 | return status; | 4631 | return status; |
4376 | } | 4632 | } |
@@ -4397,7 +4653,8 @@ static int nfs4_xdr_dec_access(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_ac | |||
4397 | status = decode_access(&xdr, res); | 4653 | status = decode_access(&xdr, res); |
4398 | if (status != 0) | 4654 | if (status != 0) |
4399 | goto out; | 4655 | goto out; |
4400 | decode_getfattr(&xdr, res->fattr, res->server); | 4656 | decode_getfattr(&xdr, res->fattr, res->server, |
4657 | !RPC_IS_ASYNC(rqstp->rq_task)); | ||
4401 | out: | 4658 | out: |
4402 | return status; | 4659 | return status; |
4403 | } | 4660 | } |
@@ -4424,7 +4681,8 @@ static int nfs4_xdr_dec_lookup(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_lo | |||
4424 | goto out; | 4681 | goto out; |
4425 | if ((status = decode_getfh(&xdr, res->fh)) != 0) | 4682 | if ((status = decode_getfh(&xdr, res->fh)) != 0) |
4426 | goto out; | 4683 | goto out; |
4427 | status = decode_getfattr(&xdr, res->fattr, res->server | 4684 | status = decode_getfattr(&xdr, res->fattr, res->server, |
4685 | !RPC_IS_ASYNC(rqstp->rq_task)); | ||
4428 | out: | 4686 | out: |
4429 | return status; | 4687 | return status; |
4430 | } | 4688 | } |
@@ -4448,7 +4706,8 @@ static int nfs4_xdr_dec_lookup_root(struct rpc_rqst *rqstp, __be32 *p, struct nf | |||
4448 | if ((status = decode_putrootfh(&xdr)) != 0) | 4706 | if ((status = decode_putrootfh(&xdr)) != 0) |
4449 | goto out; | 4707 | goto out; |
4450 | if ((status = decode_getfh(&xdr, res->fh)) == 0) | 4708 | if ((status = decode_getfh(&xdr, res->fh)) == 0) |
4451 | status = decode_getfattr(&xdr, res->fattr, res->server); | 4709 | status = decode_getfattr(&xdr, res->fattr, res->server, |
4710 | !RPC_IS_ASYNC(rqstp->rq_task)); | ||
4452 | out: | 4711 | out: |
4453 | return status; | 4712 | return status; |
4454 | } | 4713 | } |
@@ -4473,7 +4732,8 @@ static int nfs4_xdr_dec_remove(struct rpc_rqst *rqstp, __be32 *p, struct nfs_rem | |||
4473 | goto out; | 4732 | goto out; |
4474 | if ((status = decode_remove(&xdr, &res->cinfo)) != 0) | 4733 | if ((status = decode_remove(&xdr, &res->cinfo)) != 0) |
4475 | goto out; | 4734 | goto out; |
4476 | decode_getfattr(&xdr, &res->dir_attr, res->server); | 4735 | decode_getfattr(&xdr, &res->dir_attr, res->server, |
4736 | !RPC_IS_ASYNC(rqstp->rq_task)); | ||
4477 | out: | 4737 | out: |
4478 | return status; | 4738 | return status; |
4479 | } | 4739 | } |
@@ -4503,11 +4763,13 @@ static int nfs4_xdr_dec_rename(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_re | |||
4503 | if ((status = decode_rename(&xdr, &res->old_cinfo, &res->new_cinfo)) != 0) | 4763 | if ((status = decode_rename(&xdr, &res->old_cinfo, &res->new_cinfo)) != 0) |
4504 | goto out; | 4764 | goto out; |
4505 | /* Current FH is target directory */ | 4765 | /* Current FH is target directory */ |
4506 | if (decode_getfattr(&xdr, res->new_fattr, res->server) != 0) | 4766 | if (decode_getfattr(&xdr, res->new_fattr, res->server, |
4767 | !RPC_IS_ASYNC(rqstp->rq_task)) != 0) | ||
4507 | goto out; | 4768 | goto out; |
4508 | if ((status = decode_restorefh(&xdr)) != 0) | 4769 | if ((status = decode_restorefh(&xdr)) != 0) |
4509 | goto out; | 4770 | goto out; |
4510 | decode_getfattr(&xdr, res->old_fattr, res->server); | 4771 | decode_getfattr(&xdr, res->old_fattr, res->server, |
4772 | !RPC_IS_ASYNC(rqstp->rq_task)); | ||
4511 | out: | 4773 | out: |
4512 | return status; | 4774 | return status; |
4513 | } | 4775 | } |
@@ -4540,11 +4802,13 @@ static int nfs4_xdr_dec_link(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_link | |||
4540 | * Note order: OP_LINK leaves the directory as the current | 4802 | * Note order: OP_LINK leaves the directory as the current |
4541 | * filehandle. | 4803 | * filehandle. |
4542 | */ | 4804 | */ |
4543 | if (decode_getfattr(&xdr, res->dir_attr, res->server) != 0) | 4805 | if (decode_getfattr(&xdr, res->dir_attr, res->server, |
4806 | !RPC_IS_ASYNC(rqstp->rq_task)) != 0) | ||
4544 | goto out; | 4807 | goto out; |
4545 | if ((status = decode_restorefh(&xdr)) != 0) | 4808 | if ((status = decode_restorefh(&xdr)) != 0) |
4546 | goto out; | 4809 | goto out; |
4547 | decode_getfattr(&xdr, res->fattr, res->server); | 4810 | decode_getfattr(&xdr, res->fattr, res->server, |
4811 | !RPC_IS_ASYNC(rqstp->rq_task)); | ||
4548 | out: | 4812 | out: |
4549 | return status; | 4813 | return status; |
4550 | } | 4814 | } |
@@ -4573,11 +4837,13 @@ static int nfs4_xdr_dec_create(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_cr | |||
4573 | goto out; | 4837 | goto out; |
4574 | if ((status = decode_getfh(&xdr, res->fh)) != 0) | 4838 | if ((status = decode_getfh(&xdr, res->fh)) != 0) |
4575 | goto out; | 4839 | goto out; |
4576 | if (decode_getfattr(&xdr, res->fattr, res->server) != 0) | 4840 | if (decode_getfattr(&xdr, res->fattr, res->server, |
4841 | !RPC_IS_ASYNC(rqstp->rq_task)) != 0) | ||
4577 | goto out; | 4842 | goto out; |
4578 | if ((status = decode_restorefh(&xdr)) != 0) | 4843 | if ((status = decode_restorefh(&xdr)) != 0) |
4579 | goto out; | 4844 | goto out; |
4580 | decode_getfattr(&xdr, res->dir_fattr, res->server); | 4845 | decode_getfattr(&xdr, res->dir_fattr, res->server, |
4846 | !RPC_IS_ASYNC(rqstp->rq_task)); | ||
4581 | out: | 4847 | out: |
4582 | return status; | 4848 | return status; |
4583 | } | 4849 | } |
@@ -4609,7 +4875,8 @@ static int nfs4_xdr_dec_getattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs4_g | |||
4609 | status = decode_putfh(&xdr); | 4875 | status = decode_putfh(&xdr); |
4610 | if (status) | 4876 | if (status) |
4611 | goto out; | 4877 | goto out; |
4612 | status = decode_getfattr(&xdr, res->fattr, res->server); | 4878 | status = decode_getfattr(&xdr, res->fattr, res->server, |
4879 | !RPC_IS_ASYNC(rqstp->rq_task)); | ||
4613 | out: | 4880 | out: |
4614 | return status; | 4881 | return status; |
4615 | } | 4882 | } |
@@ -4716,7 +4983,8 @@ static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, __be32 *p, struct nfs_clos | |||
4716 | * an ESTALE error. Shouldn't be a problem, | 4983 | * an ESTALE error. Shouldn't be a problem, |
4717 | * though, since fattr->valid will remain unset. | 4984 | * though, since fattr->valid will remain unset. |
4718 | */ | 4985 | */ |
4719 | decode_getfattr(&xdr, res->fattr, res->server); | 4986 | decode_getfattr(&xdr, res->fattr, res->server, |
4987 | !RPC_IS_ASYNC(rqstp->rq_task)); | ||
4720 | out: | 4988 | out: |
4721 | return status; | 4989 | return status; |
4722 | } | 4990 | } |
@@ -4748,11 +5016,13 @@ static int nfs4_xdr_dec_open(struct rpc_rqst *rqstp, __be32 *p, struct nfs_openr | |||
4748 | goto out; | 5016 | goto out; |
4749 | if (decode_getfh(&xdr, &res->fh) != 0) | 5017 | if (decode_getfh(&xdr, &res->fh) != 0) |
4750 | goto out; | 5018 | goto out; |
4751 | if (decode_getfattr(&xdr, res->f_attr, res->server) != 0) | 5019 | if (decode_getfattr(&xdr, res->f_attr, res->server, |
5020 | !RPC_IS_ASYNC(rqstp->rq_task)) != 0) | ||
4752 | goto out; | 5021 | goto out; |
4753 | if (decode_restorefh(&xdr) != 0) | 5022 | if (decode_restorefh(&xdr) != 0) |
4754 | goto out; | 5023 | goto out; |
4755 | decode_getfattr(&xdr, res->dir_attr, res->server); | 5024 | decode_getfattr(&xdr, res->dir_attr, res->server, |
5025 | !RPC_IS_ASYNC(rqstp->rq_task)); | ||
4756 | out: | 5026 | out: |
4757 | return status; | 5027 | return status; |
4758 | } | 5028 | } |
@@ -4800,7 +5070,8 @@ static int nfs4_xdr_dec_open_noattr(struct rpc_rqst *rqstp, __be32 *p, struct nf | |||
4800 | status = decode_open(&xdr, res); | 5070 | status = decode_open(&xdr, res); |
4801 | if (status) | 5071 | if (status) |
4802 | goto out; | 5072 | goto out; |
4803 | decode_getfattr(&xdr, res->f_attr, res->server); | 5073 | decode_getfattr(&xdr, res->f_attr, res->server, |
5074 | !RPC_IS_ASYNC(rqstp->rq_task)); | ||
4804 | out: | 5075 | out: |
4805 | return status; | 5076 | return status; |
4806 | } | 5077 | } |
@@ -4827,7 +5098,8 @@ static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_se | |||
4827 | status = decode_setattr(&xdr); | 5098 | status = decode_setattr(&xdr); |
4828 | if (status) | 5099 | if (status) |
4829 | goto out; | 5100 | goto out; |
4830 | decode_getfattr(&xdr, res->fattr, res->server); | 5101 | decode_getfattr(&xdr, res->fattr, res->server, |
5102 | !RPC_IS_ASYNC(rqstp->rq_task)); | ||
4831 | out: | 5103 | out: |
4832 | return status; | 5104 | return status; |
4833 | } | 5105 | } |
@@ -5001,7 +5273,8 @@ static int nfs4_xdr_dec_write(struct rpc_rqst *rqstp, __be32 *p, struct nfs_writ | |||
5001 | status = decode_write(&xdr, res); | 5273 | status = decode_write(&xdr, res); |
5002 | if (status) | 5274 | if (status) |
5003 | goto out; | 5275 | goto out; |
5004 | decode_getfattr(&xdr, res->fattr, res->server); | 5276 | decode_getfattr(&xdr, res->fattr, res->server, |
5277 | !RPC_IS_ASYNC(rqstp->rq_task)); | ||
5005 | if (!status) | 5278 | if (!status) |
5006 | status = res->count; | 5279 | status = res->count; |
5007 | out: | 5280 | out: |
@@ -5030,7 +5303,8 @@ static int nfs4_xdr_dec_commit(struct rpc_rqst *rqstp, __be32 *p, struct nfs_wri | |||
5030 | status = decode_commit(&xdr, res); | 5303 | status = decode_commit(&xdr, res); |
5031 | if (status) | 5304 | if (status) |
5032 | goto out; | 5305 | goto out; |
5033 | decode_getfattr(&xdr, res->fattr, res->server); | 5306 | decode_getfattr(&xdr, res->fattr, res->server, |
5307 | !RPC_IS_ASYNC(rqstp->rq_task)); | ||
5034 | out: | 5308 | out: |
5035 | return status; | 5309 | return status; |
5036 | } | 5310 | } |
@@ -5194,7 +5468,8 @@ static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, __be32 *p, struct nf | |||
5194 | if (status != 0) | 5468 | if (status != 0) |
5195 | goto out; | 5469 | goto out; |
5196 | status = decode_delegreturn(&xdr); | 5470 | status = decode_delegreturn(&xdr); |
5197 | decode_getfattr(&xdr, res->fattr, res->server); | 5471 | decode_getfattr(&xdr, res->fattr, res->server, |
5472 | !RPC_IS_ASYNC(rqstp->rq_task)); | ||
5198 | out: | 5473 | out: |
5199 | return status; | 5474 | return status; |
5200 | } | 5475 | } |
@@ -5222,7 +5497,8 @@ static int nfs4_xdr_dec_fs_locations(struct rpc_rqst *req, __be32 *p, | |||
5222 | goto out; | 5497 | goto out; |
5223 | xdr_enter_page(&xdr, PAGE_SIZE); | 5498 | xdr_enter_page(&xdr, PAGE_SIZE); |
5224 | status = decode_getfattr(&xdr, &res->fs_locations->fattr, | 5499 | status = decode_getfattr(&xdr, &res->fs_locations->fattr, |
5225 | res->fs_locations->server); | 5500 | res->fs_locations->server, |
5501 | !RPC_IS_ASYNC(req->rq_task)); | ||
5226 | out: | 5502 | out: |
5227 | return status; | 5503 | return status; |
5228 | } | 5504 | } |
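
Every decode_getfattr() call site in the hunks above gains the same fourth argument. The reason for !RPC_IS_ASYNC(rqstp->rq_task): attribute decoding can reach the uid/gid idmapper, whose upcall may sleep, and an asynchronous rpc_task runs in rpciod context where sleeping is forbidden. The new shape, sketched from these call sites (the function body itself is in an earlier part of this patch):

    static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr,
    			const struct nfs_server *server, int may_sleep);

    /* typical caller: only a synchronous task may sleep in the decoder */
    status = decode_getfattr(&xdr, res->fattr, res->server,
    			!RPC_IS_ASYNC(rqstp->rq_task));
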
diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 73ea5e8d66ce..12c9e66d3f1d 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c | |||
@@ -60,17 +60,15 @@ struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount) | |||
60 | return p; | 60 | return p; |
61 | } | 61 | } |
62 | 62 | ||
63 | static void nfs_readdata_free(struct nfs_read_data *p) | 63 | void nfs_readdata_free(struct nfs_read_data *p) |
64 | { | 64 | { |
65 | if (p && (p->pagevec != &p->page_array[0])) | 65 | if (p && (p->pagevec != &p->page_array[0])) |
66 | kfree(p->pagevec); | 66 | kfree(p->pagevec); |
67 | mempool_free(p, nfs_rdata_mempool); | 67 | mempool_free(p, nfs_rdata_mempool); |
68 | } | 68 | } |
69 | 69 | ||
70 | void nfs_readdata_release(void *data) | 70 | static void nfs_readdata_release(struct nfs_read_data *rdata) |
71 | { | 71 | { |
72 | struct nfs_read_data *rdata = data; | ||
73 | |||
74 | put_nfs_open_context(rdata->args.context); | 72 | put_nfs_open_context(rdata->args.context); |
75 | nfs_readdata_free(rdata); | 73 | nfs_readdata_free(rdata); |
76 | } | 74 | } |
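
The visibility swap matters to callers outside read.c: nfs_readdata_free() becomes the public counterpart of nfs_readdata_alloc(), while nfs_readdata_release(), which additionally drops the open context, stays file-local. A hypothetical external caller would pair them like this (npages is a caller-chosen count):

    struct nfs_read_data *data;

    data = nfs_readdata_alloc(npages);
    if (data == NULL)
    	return -ENOMEM;
    /* ... on any setup failure before an open context is attached ... */
    nfs_readdata_free(data);	/* safe: no context to put yet */
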
diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 0b4cbdc60abd..de935692d40d 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c | |||
@@ -73,7 +73,7 @@ enum { | |||
73 | Opt_cto, Opt_nocto, | 73 | Opt_cto, Opt_nocto, |
74 | Opt_ac, Opt_noac, | 74 | Opt_ac, Opt_noac, |
75 | Opt_lock, Opt_nolock, | 75 | Opt_lock, Opt_nolock, |
76 | Opt_v2, Opt_v3, | 76 | Opt_v2, Opt_v3, Opt_v4, |
77 | Opt_udp, Opt_tcp, Opt_rdma, | 77 | Opt_udp, Opt_tcp, Opt_rdma, |
78 | Opt_acl, Opt_noacl, | 78 | Opt_acl, Opt_noacl, |
79 | Opt_rdirplus, Opt_nordirplus, | 79 | Opt_rdirplus, Opt_nordirplus, |
@@ -127,6 +127,7 @@ static const match_table_t nfs_mount_option_tokens = { | |||
127 | { Opt_nolock, "nolock" }, | 127 | { Opt_nolock, "nolock" }, |
128 | { Opt_v2, "v2" }, | 128 | { Opt_v2, "v2" }, |
129 | { Opt_v3, "v3" }, | 129 | { Opt_v3, "v3" }, |
130 | { Opt_v4, "v4" }, | ||
130 | { Opt_udp, "udp" }, | 131 | { Opt_udp, "udp" }, |
131 | { Opt_tcp, "tcp" }, | 132 | { Opt_tcp, "tcp" }, |
132 | { Opt_rdma, "rdma" }, | 133 | { Opt_rdma, "rdma" }, |
@@ -158,7 +159,7 @@ static const match_table_t nfs_mount_option_tokens = { | |||
158 | { Opt_mountvers, "mountvers=%s" }, | 159 | { Opt_mountvers, "mountvers=%s" }, |
159 | { Opt_nfsvers, "nfsvers=%s" }, | 160 | { Opt_nfsvers, "nfsvers=%s" }, |
160 | { Opt_nfsvers, "vers=%s" }, | 161 | { Opt_nfsvers, "vers=%s" }, |
161 | { Opt_minorversion, "minorversion=%u" }, | 162 | { Opt_minorversion, "minorversion=%s" }, |
162 | 163 | ||
163 | { Opt_sec, "sec=%s" }, | 164 | { Opt_sec, "sec=%s" }, |
164 | { Opt_proto, "proto=%s" }, | 165 | { Opt_proto, "proto=%s" }, |
@@ -272,6 +273,10 @@ static const struct super_operations nfs_sops = { | |||
272 | }; | 273 | }; |
273 | 274 | ||
274 | #ifdef CONFIG_NFS_V4 | 275 | #ifdef CONFIG_NFS_V4 |
276 | static int nfs4_validate_text_mount_data(void *options, | ||
277 | struct nfs_parsed_mount_data *args, const char *dev_name); | ||
278 | static int nfs4_try_mount(int flags, const char *dev_name, | ||
279 | struct nfs_parsed_mount_data *data, struct vfsmount *mnt); | ||
275 | static int nfs4_get_sb(struct file_system_type *fs_type, | 280 | static int nfs4_get_sb(struct file_system_type *fs_type, |
276 | int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt); | 281 | int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt); |
277 | static int nfs4_remote_get_sb(struct file_system_type *fs_type, | 282 | static int nfs4_remote_get_sb(struct file_system_type *fs_type, |
@@ -742,127 +747,23 @@ static int nfs_verify_server_address(struct sockaddr *addr) | |||
742 | } | 747 | } |
743 | } | 748 | } |
744 | 749 | ||
750 | dfprintk(MOUNT, "NFS: Invalid IP address specified\n"); | ||
745 | return 0; | 751 | return 0; |
746 | } | 752 | } |
747 | 753 | ||
748 | static void nfs_parse_ipv4_address(char *string, size_t str_len, | ||
749 | struct sockaddr *sap, size_t *addr_len) | ||
750 | { | ||
751 | struct sockaddr_in *sin = (struct sockaddr_in *)sap; | ||
752 | u8 *addr = (u8 *)&sin->sin_addr.s_addr; | ||
753 | |||
754 | if (str_len <= INET_ADDRSTRLEN) { | ||
755 | dfprintk(MOUNT, "NFS: parsing IPv4 address %*s\n", | ||
756 | (int)str_len, string); | ||
757 | |||
758 | sin->sin_family = AF_INET; | ||
759 | *addr_len = sizeof(*sin); | ||
760 | if (in4_pton(string, str_len, addr, '\0', NULL)) | ||
761 | return; | ||
762 | } | ||
763 | |||
764 | sap->sa_family = AF_UNSPEC; | ||
765 | *addr_len = 0; | ||
766 | } | ||
767 | |||
768 | #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) | ||
769 | static int nfs_parse_ipv6_scope_id(const char *string, const size_t str_len, | ||
770 | const char *delim, | ||
771 | struct sockaddr_in6 *sin6) | ||
772 | { | ||
773 | char *p; | ||
774 | size_t len; | ||
775 | |||
776 | if ((string + str_len) == delim) | ||
777 | return 1; | ||
778 | |||
779 | if (*delim != IPV6_SCOPE_DELIMITER) | ||
780 | return 0; | ||
781 | |||
782 | if (!(ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL)) | ||
783 | return 0; | ||
784 | |||
785 | len = (string + str_len) - delim - 1; | ||
786 | p = kstrndup(delim + 1, len, GFP_KERNEL); | ||
787 | if (p) { | ||
788 | unsigned long scope_id = 0; | ||
789 | struct net_device *dev; | ||
790 | |||
791 | dev = dev_get_by_name(&init_net, p); | ||
792 | if (dev != NULL) { | ||
793 | scope_id = dev->ifindex; | ||
794 | dev_put(dev); | ||
795 | } else { | ||
796 | if (strict_strtoul(p, 10, &scope_id) == 0) { | ||
797 | kfree(p); | ||
798 | return 0; | ||
799 | } | ||
800 | } | ||
801 | |||
802 | kfree(p); | ||
803 | |||
804 | sin6->sin6_scope_id = scope_id; | ||
805 | dfprintk(MOUNT, "NFS: IPv6 scope ID = %lu\n", scope_id); | ||
806 | return 1; | ||
807 | } | ||
808 | |||
809 | return 0; | ||
810 | } | ||
811 | |||
812 | static void nfs_parse_ipv6_address(char *string, size_t str_len, | ||
813 | struct sockaddr *sap, size_t *addr_len) | ||
814 | { | ||
815 | struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap; | ||
816 | u8 *addr = (u8 *)&sin6->sin6_addr.in6_u; | ||
817 | const char *delim; | ||
818 | |||
819 | if (str_len <= INET6_ADDRSTRLEN) { | ||
820 | dfprintk(MOUNT, "NFS: parsing IPv6 address %*s\n", | ||
821 | (int)str_len, string); | ||
822 | |||
823 | sin6->sin6_family = AF_INET6; | ||
824 | *addr_len = sizeof(*sin6); | ||
825 | if (in6_pton(string, str_len, addr, | ||
826 | IPV6_SCOPE_DELIMITER, &delim) != 0) { | ||
827 | if (nfs_parse_ipv6_scope_id(string, str_len, | ||
828 | delim, sin6) != 0) | ||
829 | return; | ||
830 | } | ||
831 | } | ||
832 | |||
833 | sap->sa_family = AF_UNSPEC; | ||
834 | *addr_len = 0; | ||
835 | } | ||
836 | #else | ||
837 | static void nfs_parse_ipv6_address(char *string, size_t str_len, | ||
838 | struct sockaddr *sap, size_t *addr_len) | ||
839 | { | ||
840 | sap->sa_family = AF_UNSPEC; | ||
841 | *addr_len = 0; | ||
842 | } | ||
843 | #endif | ||
844 | |||
845 | /* | 754 | /* |
846 | * Construct a sockaddr based on the contents of a string that contains | 755 | * Select between a default port value and a user-specified port value. |
847 | * an IP address in presentation format. | 756 | * If a zero value is set, then autobind will be used. |
848 | * | ||
849 | * If there is a problem constructing the new sockaddr, set the address | ||
850 | * family to AF_UNSPEC. | ||
851 | */ | 757 | */ |
852 | void nfs_parse_ip_address(char *string, size_t str_len, | 758 | static void nfs_set_default_port(struct sockaddr *sap, const int parsed_port, |
853 | struct sockaddr *sap, size_t *addr_len) | 759 | const unsigned short default_port) |
854 | { | 760 | { |
855 | unsigned int i, colons; | 761 | unsigned short port = default_port; |
856 | 762 | ||
857 | colons = 0; | 763 | if (parsed_port != NFS_UNSPEC_PORT) |
858 | for (i = 0; i < str_len; i++) | 764 | port = parsed_port; |
859 | if (string[i] == ':') | ||
860 | colons++; | ||
861 | 765 | ||
862 | if (colons >= 2) | 766 | rpc_set_port(sap, port); |
863 | nfs_parse_ipv6_address(string, str_len, sap, addr_len); | ||
864 | else | ||
865 | nfs_parse_ipv4_address(string, str_len, sap, addr_len); | ||
866 | } | 767 | } |
867 | 768 | ||
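
Two call sites show the intent of the replacement helper: an explicitly parsed port survives as-is — including 0, which rpc_set_port() passes through and the transport treats as autobind — and only the NFS_UNSPEC_PORT sentinel falls back to the default. A sketch, assuming the usual NFS_PORT constant (2049):

    /* mount daemon: autobind unless the user gave mountport= */
    nfs_set_default_port(request.sap, args->mount_server.port, 0);

    /* the NFS service itself: default to 2049 unless port= was given */
    nfs_set_default_port(sap, args->nfs_server.port, NFS_PORT);
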
868 | /* | 769 | /* |
@@ -904,8 +805,6 @@ static void nfs_set_mount_transport_protocol(struct nfs_parsed_mount_data *mnt) | |||
904 | 805 | ||
905 | /* | 806 | /* |
906 | * Parse the value of the 'sec=' option. | 807 | * Parse the value of the 'sec=' option. |
907 | * | ||
908 | * The flavor_len setting is for v4 mounts. | ||
909 | */ | 808 | */ |
910 | static int nfs_parse_security_flavors(char *value, | 809 | static int nfs_parse_security_flavors(char *value, |
911 | struct nfs_parsed_mount_data *mnt) | 810 | struct nfs_parsed_mount_data *mnt) |
@@ -916,53 +815,43 @@ static int nfs_parse_security_flavors(char *value, | |||
916 | 815 | ||
917 | switch (match_token(value, nfs_secflavor_tokens, args)) { | 816 | switch (match_token(value, nfs_secflavor_tokens, args)) { |
918 | case Opt_sec_none: | 817 | case Opt_sec_none: |
919 | mnt->auth_flavor_len = 0; | ||
920 | mnt->auth_flavors[0] = RPC_AUTH_NULL; | 818 | mnt->auth_flavors[0] = RPC_AUTH_NULL; |
921 | break; | 819 | break; |
922 | case Opt_sec_sys: | 820 | case Opt_sec_sys: |
923 | mnt->auth_flavor_len = 0; | ||
924 | mnt->auth_flavors[0] = RPC_AUTH_UNIX; | 821 | mnt->auth_flavors[0] = RPC_AUTH_UNIX; |
925 | break; | 822 | break; |
926 | case Opt_sec_krb5: | 823 | case Opt_sec_krb5: |
927 | mnt->auth_flavor_len = 1; | ||
928 | mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5; | 824 | mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5; |
929 | break; | 825 | break; |
930 | case Opt_sec_krb5i: | 826 | case Opt_sec_krb5i: |
931 | mnt->auth_flavor_len = 1; | ||
932 | mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5I; | 827 | mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5I; |
933 | break; | 828 | break; |
934 | case Opt_sec_krb5p: | 829 | case Opt_sec_krb5p: |
935 | mnt->auth_flavor_len = 1; | ||
936 | mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5P; | 830 | mnt->auth_flavors[0] = RPC_AUTH_GSS_KRB5P; |
937 | break; | 831 | break; |
938 | case Opt_sec_lkey: | 832 | case Opt_sec_lkey: |
939 | mnt->auth_flavor_len = 1; | ||
940 | mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEY; | 833 | mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEY; |
941 | break; | 834 | break; |
942 | case Opt_sec_lkeyi: | 835 | case Opt_sec_lkeyi: |
943 | mnt->auth_flavor_len = 1; | ||
944 | mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYI; | 836 | mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYI; |
945 | break; | 837 | break; |
946 | case Opt_sec_lkeyp: | 838 | case Opt_sec_lkeyp: |
947 | mnt->auth_flavor_len = 1; | ||
948 | mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYP; | 839 | mnt->auth_flavors[0] = RPC_AUTH_GSS_LKEYP; |
949 | break; | 840 | break; |
950 | case Opt_sec_spkm: | 841 | case Opt_sec_spkm: |
951 | mnt->auth_flavor_len = 1; | ||
952 | mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKM; | 842 | mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKM; |
953 | break; | 843 | break; |
954 | case Opt_sec_spkmi: | 844 | case Opt_sec_spkmi: |
955 | mnt->auth_flavor_len = 1; | ||
956 | mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMI; | 845 | mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMI; |
957 | break; | 846 | break; |
958 | case Opt_sec_spkmp: | 847 | case Opt_sec_spkmp: |
959 | mnt->auth_flavor_len = 1; | ||
960 | mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMP; | 848 | mnt->auth_flavors[0] = RPC_AUTH_GSS_SPKMP; |
961 | break; | 849 | break; |
962 | default: | 850 | default: |
963 | return 0; | 851 | return 0; |
964 | } | 852 | } |
965 | 853 | ||
854 | mnt->auth_flavor_len = 1; | ||
966 | return 1; | 855 | return 1; |
967 | } | 856 | } |
968 | 857 | ||
@@ -1001,7 +890,6 @@ static int nfs_parse_mount_options(char *raw, | |||
1001 | while ((p = strsep(&raw, ",")) != NULL) { | 890 | while ((p = strsep(&raw, ",")) != NULL) { |
1002 | substring_t args[MAX_OPT_ARGS]; | 891 | substring_t args[MAX_OPT_ARGS]; |
1003 | unsigned long option; | 892 | unsigned long option; |
1004 | int int_option; | ||
1005 | int token; | 893 | int token; |
1006 | 894 | ||
1007 | if (!*p) | 895 | if (!*p) |
@@ -1047,10 +935,18 @@ static int nfs_parse_mount_options(char *raw, | |||
1047 | break; | 935 | break; |
1048 | case Opt_v2: | 936 | case Opt_v2: |
1049 | mnt->flags &= ~NFS_MOUNT_VER3; | 937 | mnt->flags &= ~NFS_MOUNT_VER3; |
938 | mnt->version = 2; | ||
1050 | break; | 939 | break; |
1051 | case Opt_v3: | 940 | case Opt_v3: |
1052 | mnt->flags |= NFS_MOUNT_VER3; | 941 | mnt->flags |= NFS_MOUNT_VER3; |
942 | mnt->version = 3; | ||
1053 | break; | 943 | break; |
944 | #ifdef CONFIG_NFS_V4 | ||
945 | case Opt_v4: | ||
946 | mnt->flags &= ~NFS_MOUNT_VER3; | ||
947 | mnt->version = 4; | ||
948 | break; | ||
949 | #endif | ||
1054 | case Opt_udp: | 950 | case Opt_udp: |
1055 | mnt->flags &= ~NFS_MOUNT_TCP; | 951 | mnt->flags &= ~NFS_MOUNT_TCP; |
1056 | mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP; | 952 | mnt->nfs_server.protocol = XPRT_TRANSPORT_UDP; |
@@ -1264,20 +1160,33 @@ static int nfs_parse_mount_options(char *raw, | |||
1264 | switch (option) { | 1160 | switch (option) { |
1265 | case NFS2_VERSION: | 1161 | case NFS2_VERSION: |
1266 | mnt->flags &= ~NFS_MOUNT_VER3; | 1162 | mnt->flags &= ~NFS_MOUNT_VER3; |
1163 | mnt->version = 2; | ||
1267 | break; | 1164 | break; |
1268 | case NFS3_VERSION: | 1165 | case NFS3_VERSION: |
1269 | mnt->flags |= NFS_MOUNT_VER3; | 1166 | mnt->flags |= NFS_MOUNT_VER3; |
1167 | mnt->version = 3; | ||
1270 | break; | 1168 | break; |
1169 | #ifdef CONFIG_NFS_V4 | ||
1170 | case NFS4_VERSION: | ||
1171 | mnt->flags &= ~NFS_MOUNT_VER3; | ||
1172 | mnt->version = 4; | ||
1173 | break; | ||
1174 | #endif | ||
1271 | default: | 1175 | default: |
1272 | goto out_invalid_value; | 1176 | goto out_invalid_value; |
1273 | } | 1177 | } |
1274 | break; | 1178 | break; |
1275 | case Opt_minorversion: | 1179 | case Opt_minorversion: |
1276 | if (match_int(args, &int_option)) | 1180 | string = match_strdup(args); |
1277 | return 0; | 1181 | if (string == NULL) |
1278 | if (int_option < 0 || int_option > NFS4_MAX_MINOR_VERSION) | 1182 | goto out_nomem; |
1279 | return 0; | 1183 | rc = strict_strtoul(string, 10, &option); |
1280 | mnt->minorversion = int_option; | 1184 | kfree(string); |
1185 | if (rc != 0) | ||
1186 | goto out_invalid_value; | ||
1187 | if (option > NFS4_MAX_MINOR_VERSION) | ||
1188 | goto out_invalid_value; | ||
1189 | mnt->minorversion = option; | ||
1281 | break; | 1190 | break; |
1282 | 1191 | ||
1283 | /* | 1192 | /* |
@@ -1352,11 +1261,14 @@ static int nfs_parse_mount_options(char *raw, | |||
1352 | string = match_strdup(args); | 1261 | string = match_strdup(args); |
1353 | if (string == NULL) | 1262 | if (string == NULL) |
1354 | goto out_nomem; | 1263 | goto out_nomem; |
1355 | nfs_parse_ip_address(string, strlen(string), | 1264 | mnt->nfs_server.addrlen = |
1356 | (struct sockaddr *) | 1265 | rpc_pton(string, strlen(string), |
1357 | &mnt->nfs_server.address, | 1266 | (struct sockaddr *) |
1358 | &mnt->nfs_server.addrlen); | 1267 | &mnt->nfs_server.address, |
1268 | sizeof(mnt->nfs_server.address)); | ||
1359 | kfree(string); | 1269 | kfree(string); |
1270 | if (mnt->nfs_server.addrlen == 0) | ||
1271 | goto out_invalid_address; | ||
1360 | break; | 1272 | break; |
1361 | case Opt_clientaddr: | 1273 | case Opt_clientaddr: |
1362 | string = match_strdup(args); | 1274 | string = match_strdup(args); |
@@ -1376,11 +1288,14 @@ static int nfs_parse_mount_options(char *raw, | |||
1376 | string = match_strdup(args); | 1288 | string = match_strdup(args); |
1377 | if (string == NULL) | 1289 | if (string == NULL) |
1378 | goto out_nomem; | 1290 | goto out_nomem; |
1379 | nfs_parse_ip_address(string, strlen(string), | 1291 | mnt->mount_server.addrlen = |
1380 | (struct sockaddr *) | 1292 | rpc_pton(string, strlen(string), |
1381 | &mnt->mount_server.address, | 1293 | (struct sockaddr *) |
1382 | &mnt->mount_server.addrlen); | 1294 | &mnt->mount_server.address, |
1295 | sizeof(mnt->mount_server.address)); | ||
1383 | kfree(string); | 1296 | kfree(string); |
1297 | if (mnt->mount_server.addrlen == 0) | ||
1298 | goto out_invalid_address; | ||
1384 | break; | 1299 | break; |
1385 | case Opt_lookupcache: | 1300 | case Opt_lookupcache: |
1386 | string = match_strdup(args); | 1301 | string = match_strdup(args); |
@@ -1432,8 +1347,11 @@ static int nfs_parse_mount_options(char *raw, | |||
1432 | 1347 | ||
1433 | return 1; | 1348 | return 1; |
1434 | 1349 | ||
1350 | out_invalid_address: | ||
1351 | printk(KERN_INFO "NFS: bad IP address specified: %s\n", p); | ||
1352 | return 0; | ||
1435 | out_invalid_value: | 1353 | out_invalid_value: |
1436 | printk(KERN_INFO "NFS: bad mount option value specified: %s \n", p); | 1354 | printk(KERN_INFO "NFS: bad mount option value specified: %s\n", p); |
1437 | return 0; | 1355 | return 0; |
1438 | out_nomem: | 1356 | out_nomem: |
1439 | printk(KERN_INFO "NFS: not enough memory to parse option\n"); | 1357 | printk(KERN_INFO "NFS: not enough memory to parse option\n"); |
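
All of the hand-rolled IPv4/IPv6 parsing removed above collapses into the shared sunrpc helper rpc_pton(), which fills in a sockaddr (including IPv6 link-local scope ids) and returns the address length, or 0 on failure. Standalone usage with a made-up address:

    struct sockaddr_storage ss;
    size_t salen;

    salen = rpc_pton("192.168.1.10", 12,	/* string and its length */
    		(struct sockaddr *)&ss, sizeof(ss));
    if (salen == 0)
    	return -EINVAL;		/* not a parseable IPv4/IPv6 address */
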
@@ -1445,13 +1363,60 @@ out_security_failure: | |||
1445 | } | 1363 | } |
1446 | 1364 | ||
1447 | /* | 1365 | /* |
1366 | * Match the requested auth flavors with the list returned by | ||
1367 | * the server. Returns zero and sets the mount's authentication | ||
1368 | * flavor on success; returns -EACCES if server does not support | ||
1369 | * the requested flavor. | ||
1370 | */ | ||
1371 | static int nfs_walk_authlist(struct nfs_parsed_mount_data *args, | ||
1372 | struct nfs_mount_request *request) | ||
1373 | { | ||
1374 | unsigned int i, j, server_authlist_len = *(request->auth_flav_len); | ||
1375 | |||
1376 | /* | ||
1377 | * Certain releases of Linux's mountd return an empty | ||
1378 | * flavor list. To prevent behavioral regression with | ||
1379 | * these servers (i.e. rejecting mounts that used to | ||
1380 | * succeed), revert to pre-2.6.32 behavior (no checking) | ||
1381 | * if the returned flavor list is empty. | ||
1382 | */ | ||
1383 | if (server_authlist_len == 0) | ||
1384 | return 0; | ||
1385 | |||
1386 | /* | ||
1387 | * We avoid sophisticated negotiating here, as there are | ||
1388 | * plenty of cases where we can get it wrong, providing | ||
1389 | * either too little or too much security. | ||
1390 | * | ||
1391 | * RFC 2623, section 2.7 suggests we SHOULD prefer the | ||
1392 | * flavor listed first. However, some servers list | ||
1393 | * AUTH_NULL first. Our caller plants AUTH_SYS, the | ||
1394 | * preferred default, in args->auth_flavors[0] if the | ||
1395 | * user didn't specify a sec= mount option. | ||
1396 | */ | ||
1397 | for (i = 0; i < args->auth_flavor_len; i++) | ||
1398 | for (j = 0; j < server_authlist_len; j++) | ||
1399 | if (args->auth_flavors[i] == request->auth_flavs[j]) { | ||
1400 | dfprintk(MOUNT, "NFS: using auth flavor %d\n", | ||
1401 | request->auth_flavs[j]); | ||
1402 | args->auth_flavors[0] = request->auth_flavs[j]; | ||
1403 | return 0; | ||
1404 | } | ||
1405 | |||
1406 | dfprintk(MOUNT, "NFS: server does not support requested auth flavor\n"); | ||
1407 | nfs_umount(request); | ||
1408 | return -EACCES; | ||
1409 | } | ||
1410 | |||
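
The caller's half of this negotiation appears in the nfs_try_mount() hunk below: args->auth_flavors[0] already holds either the user's sec= choice or the AUTH_SYS default, so index 0 is the preference the loop honours, and the walk only runs for MNTv3 because MNTv1 replies carry no flavor list. In outline:

    status = nfs_mount(&request);
    if (status != 0)
    	return status;
    if (args->mount_server.version != NFS_MNT3_VERSION)
    	return 0;	/* MNTv1 (NFSv2): nothing to negotiate */
    return nfs_walk_authlist(args, &request);
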
1411 | /* | ||
1448 | * Use the remote server's MOUNT service to request the NFS file handle | 1412 | * Use the remote server's MOUNT service to request the NFS file handle |
1449 | * corresponding to the provided path. | 1413 | * corresponding to the provided path. |
1450 | */ | 1414 | */ |
1451 | static int nfs_try_mount(struct nfs_parsed_mount_data *args, | 1415 | static int nfs_try_mount(struct nfs_parsed_mount_data *args, |
1452 | struct nfs_fh *root_fh) | 1416 | struct nfs_fh *root_fh) |
1453 | { | 1417 | { |
1454 | unsigned int auth_flavor_len = 0; | 1418 | rpc_authflavor_t server_authlist[NFS_MAX_SECFLAVORS]; |
1419 | unsigned int server_authlist_len = ARRAY_SIZE(server_authlist); | ||
1455 | struct nfs_mount_request request = { | 1420 | struct nfs_mount_request request = { |
1456 | .sap = (struct sockaddr *) | 1421 | .sap = (struct sockaddr *) |
1457 | &args->mount_server.address, | 1422 | &args->mount_server.address, |
@@ -1459,7 +1424,8 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args, | |||
1459 | .protocol = args->mount_server.protocol, | 1424 | .protocol = args->mount_server.protocol, |
1460 | .fh = root_fh, | 1425 | .fh = root_fh, |
1461 | .noresvport = args->flags & NFS_MOUNT_NORESVPORT, | 1426 | .noresvport = args->flags & NFS_MOUNT_NORESVPORT, |
1462 | .auth_flav_len = &auth_flavor_len, | 1427 | .auth_flav_len = &server_authlist_len, |
1428 | .auth_flavs = server_authlist, | ||
1463 | }; | 1429 | }; |
1464 | int status; | 1430 | int status; |
1465 | 1431 | ||
@@ -1485,23 +1451,25 @@ static int nfs_try_mount(struct nfs_parsed_mount_data *args, | |||
1485 | args->mount_server.addrlen = args->nfs_server.addrlen; | 1451 | args->mount_server.addrlen = args->nfs_server.addrlen; |
1486 | } | 1452 | } |
1487 | request.salen = args->mount_server.addrlen; | 1453 | request.salen = args->mount_server.addrlen; |
1488 | 1454 | nfs_set_default_port(request.sap, args->mount_server.port, 0); | |
1489 | /* | ||
1490 | * autobind will be used if mount_server.port == 0 | ||
1491 | */ | ||
1492 | nfs_set_port(request.sap, args->mount_server.port); | ||
1493 | 1455 | ||
1494 | /* | 1456 | /* |
1495 | * Now ask the mount server to map our export path | 1457 | * Now ask the mount server to map our export path |
1496 | * to a file handle. | 1458 | * to a file handle. |
1497 | */ | 1459 | */ |
1498 | status = nfs_mount(&request); | 1460 | status = nfs_mount(&request); |
1499 | if (status == 0) | 1461 | if (status != 0) { |
1500 | return 0; | 1462 | dfprintk(MOUNT, "NFS: unable to mount server %s, error %d\n", |
1463 | request.hostname, status); | ||
1464 | return status; | ||
1465 | } | ||
1501 | 1466 | ||
1502 | dfprintk(MOUNT, "NFS: unable to mount server %s, error %d\n", | 1467 | /* |
1503 | request.hostname, status); | 1468 | * MNTv1 (NFSv2) does not support auth flavor negotiation. |
1504 | return status; | 1469 | */ |
1470 | if (args->mount_server.version != NFS_MNT3_VERSION) | ||
1471 | return 0; | ||
1472 | return nfs_walk_authlist(args, &request); | ||
1505 | } | 1473 | } |
1506 | 1474 | ||
1507 | static int nfs_parse_simple_hostname(const char *dev_name, | 1475 | static int nfs_parse_simple_hostname(const char *dev_name, |
@@ -1661,6 +1629,7 @@ static int nfs_validate_mount_data(void *options, | |||
1661 | const char *dev_name) | 1629 | const char *dev_name) |
1662 | { | 1630 | { |
1663 | struct nfs_mount_data *data = (struct nfs_mount_data *)options; | 1631 | struct nfs_mount_data *data = (struct nfs_mount_data *)options; |
1632 | struct sockaddr *sap = (struct sockaddr *)&args->nfs_server.address; | ||
1664 | 1633 | ||
1665 | if (data == NULL) | 1634 | if (data == NULL) |
1666 | goto out_no_data; | 1635 | goto out_no_data; |
@@ -1672,10 +1641,12 @@ static int nfs_validate_mount_data(void *options, | |||
1672 | args->acregmax = NFS_DEF_ACREGMAX; | 1641 | args->acregmax = NFS_DEF_ACREGMAX; |
1673 | args->acdirmin = NFS_DEF_ACDIRMIN; | 1642 | args->acdirmin = NFS_DEF_ACDIRMIN; |
1674 | args->acdirmax = NFS_DEF_ACDIRMAX; | 1643 | args->acdirmax = NFS_DEF_ACDIRMAX; |
1675 | args->mount_server.port = 0; /* autobind unless user sets port */ | 1644 | args->mount_server.port = NFS_UNSPEC_PORT; |
1676 | args->nfs_server.port = 0; /* autobind unless user sets port */ | 1645 | args->nfs_server.port = NFS_UNSPEC_PORT; |
1677 | args->nfs_server.protocol = XPRT_TRANSPORT_TCP; | 1646 | args->nfs_server.protocol = XPRT_TRANSPORT_TCP; |
1678 | args->auth_flavors[0] = RPC_AUTH_UNIX; | 1647 | args->auth_flavors[0] = RPC_AUTH_UNIX; |
1648 | args->auth_flavor_len = 1; | ||
1649 | args->minorversion = 0; | ||
1679 | 1650 | ||
1680 | switch (data->version) { | 1651 | switch (data->version) { |
1681 | case 1: | 1652 | case 1: |
@@ -1697,8 +1668,11 @@ static int nfs_validate_mount_data(void *options, | |||
1697 | if (data->root.size > NFS3_FHSIZE || data->root.size == 0) | 1668 | if (data->root.size > NFS3_FHSIZE || data->root.size == 0) |
1698 | goto out_invalid_fh; | 1669 | goto out_invalid_fh; |
1699 | mntfh->size = data->root.size; | 1670 | mntfh->size = data->root.size; |
1700 | } else | 1671 | args->version = 3; |
1672 | } else { | ||
1701 | mntfh->size = NFS2_FHSIZE; | 1673 | mntfh->size = NFS2_FHSIZE; |
1674 | args->version = 2; | ||
1675 | } | ||
1702 | 1676 | ||
1703 | 1677 | ||
1704 | memcpy(mntfh->data, data->root.data, mntfh->size); | 1678 | memcpy(mntfh->data, data->root.data, mntfh->size); |
@@ -1720,11 +1694,9 @@ static int nfs_validate_mount_data(void *options, | |||
1720 | args->acdirmin = data->acdirmin; | 1694 | args->acdirmin = data->acdirmin; |
1721 | args->acdirmax = data->acdirmax; | 1695 | args->acdirmax = data->acdirmax; |
1722 | 1696 | ||
1723 | memcpy(&args->nfs_server.address, &data->addr, | 1697 | memcpy(sap, &data->addr, sizeof(data->addr)); |
1724 | sizeof(data->addr)); | ||
1725 | args->nfs_server.addrlen = sizeof(data->addr); | 1698 | args->nfs_server.addrlen = sizeof(data->addr); |
1726 | if (!nfs_verify_server_address((struct sockaddr *) | 1699 | if (!nfs_verify_server_address(sap)) |
1727 | &args->nfs_server.address)) | ||
1728 | goto out_no_address; | 1700 | goto out_no_address; |
1729 | 1701 | ||
1730 | if (!(data->flags & NFS_MOUNT_TCP)) | 1702 | if (!(data->flags & NFS_MOUNT_TCP)) |
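[Editor's note] Several hunks above replace repeated casts of &args->nfs_server.address with a single "sap" alias declared once at the top of the function. sockaddr_storage is sized and aligned for any address family, so one sockaddr-typed view keeps every call site short. A self-contained sketch of the shape (server_args and verify_address are stand-ins):

    #include <netinet/in.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/socket.h>

    struct server_args {
        struct sockaddr_storage address;   /* room for v4 or v6 */
        socklen_t addrlen;
    };

    static int verify_address(const struct sockaddr *sap)
    {
        return sap->sa_family == AF_INET || sap->sa_family == AF_INET6;
    }

    int main(void)
    {
        struct server_args args;
        struct sockaddr *sap = (struct sockaddr *)&args.address;
        struct sockaddr_in v4 = { .sin_family = AF_INET };

        /* binary mount data carries a fixed sockaddr_in: copy, then verify */
        memset(&args, 0, sizeof(args));
        memcpy(sap, &v4, sizeof(v4));
        args.addrlen = sizeof(v4);
        printf("address %s\n", verify_address(sap) ? "ok" : "rejected");
        return 0;
    }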
@@ -1772,12 +1744,18 @@ static int nfs_validate_mount_data(void *options, | |||
1772 | if (nfs_parse_mount_options((char *)options, args) == 0) | 1744 | if (nfs_parse_mount_options((char *)options, args) == 0) |
1773 | return -EINVAL; | 1745 | return -EINVAL; |
1774 | 1746 | ||
1775 | if (!nfs_verify_server_address((struct sockaddr *) | 1747 | if (!nfs_verify_server_address(sap)) |
1776 | &args->nfs_server.address)) | ||
1777 | goto out_no_address; | 1748 | goto out_no_address; |
1778 | 1749 | ||
1779 | nfs_set_port((struct sockaddr *)&args->nfs_server.address, | 1750 | if (args->version == 4) |
1780 | args->nfs_server.port); | 1751 | #ifdef CONFIG_NFS_V4 |
1752 | return nfs4_validate_text_mount_data(options, | ||
1753 | args, dev_name); | ||
1754 | #else | ||
1755 | goto out_v4_not_compiled; | ||
1756 | #endif | ||
1757 | |||
1758 | nfs_set_default_port(sap, args->nfs_server.port, 0); | ||
1781 | 1759 | ||
1782 | nfs_set_mount_transport_protocol(args); | 1760 | nfs_set_mount_transport_protocol(args); |
1783 | 1761 | ||
@@ -1825,6 +1803,12 @@ out_v3_not_compiled: | |||
1825 | return -EPROTONOSUPPORT; | 1803 | return -EPROTONOSUPPORT; |
1826 | #endif /* !CONFIG_NFS_V3 */ | 1804 | #endif /* !CONFIG_NFS_V3 */ |
1827 | 1805 | ||
1806 | #ifndef CONFIG_NFS_V4 | ||
1807 | out_v4_not_compiled: | ||
1808 | dfprintk(MOUNT, "NFS: NFSv4 is not compiled into kernel\n"); | ||
1809 | return -EPROTONOSUPPORT; | ||
1810 | #endif /* !CONFIG_NFS_V4 */ | ||
1811 | |||
1828 | out_nomem: | 1812 | out_nomem: |
1829 | dfprintk(MOUNT, "NFS: not enough memory to handle mount options\n"); | 1813 | dfprintk(MOUNT, "NFS: not enough memory to handle mount options\n"); |
1830 | return -ENOMEM; | 1814 | return -ENOMEM; |
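[Editor's note] The out_v4_not_compiled label added above only has a goto reaching it in the !CONFIG_NFS_V4 preprocessor branch, so the label itself is wrapped in the complementary #ifndef; otherwise the compiler would flag an unused label. A compilable demonstration of the pattern (HAVE_V4 is an invented stand-in for the config symbol):

    #include <stdio.h>

    /* #define HAVE_V4 1 */

    static int validate(int version)
    {
    #ifndef HAVE_V4
        if (version == 4)
            goto out_v4_not_compiled;
    #endif
        return 0;

    #ifndef HAVE_V4
    out_v4_not_compiled:
        fprintf(stderr, "v4 support is not compiled in\n");
        return -1;
    #endif
    }

    int main(void)
    {
        return validate(4) ? 1 : 0;
    }

Toggling HAVE_V4 shows both shapes compile cleanly: with it defined, both the goto and the label vanish together.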
@@ -1934,6 +1918,8 @@ static inline void nfs_initialise_sb(struct super_block *sb) | |||
1934 | if (server->flags & NFS_MOUNT_NOAC) | 1918 | if (server->flags & NFS_MOUNT_NOAC) |
1935 | sb->s_flags |= MS_SYNCHRONOUS; | 1919 | sb->s_flags |= MS_SYNCHRONOUS; |
1936 | 1920 | ||
1921 | sb->s_bdi = &server->backing_dev_info; | ||
1922 | |||
1937 | nfs_super_set_maxbytes(sb, server->maxfilesize); | 1923 | nfs_super_set_maxbytes(sb, server->maxfilesize); |
1938 | } | 1924 | } |
1939 | 1925 | ||
@@ -2120,6 +2106,14 @@ static int nfs_get_sb(struct file_system_type *fs_type, | |||
2120 | if (error < 0) | 2106 | if (error < 0) |
2121 | goto out; | 2107 | goto out; |
2122 | 2108 | ||
2109 | #ifdef CONFIG_NFS_V4 | ||
2110 | if (data->version == 4) { | ||
2111 | error = nfs4_try_mount(flags, dev_name, data, mnt); | ||
2112 | kfree(data->client_address); | ||
2113 | goto out; | ||
2114 | } | ||
2115 | #endif /* CONFIG_NFS_V4 */ | ||
2116 | |||
2123 | /* Get a volume representation */ | 2117 | /* Get a volume representation */ |
2124 | server = nfs_create_server(data, mntfh); | 2118 | server = nfs_create_server(data, mntfh); |
2125 | if (IS_ERR(server)) { | 2119 | if (IS_ERR(server)) { |
@@ -2317,6 +2311,43 @@ static void nfs4_validate_mount_flags(struct nfs_parsed_mount_data *args) | |||
2317 | args->flags &= ~(NFS_MOUNT_NONLM|NFS_MOUNT_NOACL|NFS_MOUNT_VER3); | 2311 | args->flags &= ~(NFS_MOUNT_NONLM|NFS_MOUNT_NOACL|NFS_MOUNT_VER3); |
2318 | } | 2312 | } |
2319 | 2313 | ||
2314 | static int nfs4_validate_text_mount_data(void *options, | ||
2315 | struct nfs_parsed_mount_data *args, | ||
2316 | const char *dev_name) | ||
2317 | { | ||
2318 | struct sockaddr *sap = (struct sockaddr *)&args->nfs_server.address; | ||
2319 | |||
2320 | nfs_set_default_port(sap, args->nfs_server.port, NFS_PORT); | ||
2321 | |||
2322 | nfs_validate_transport_protocol(args); | ||
2323 | |||
2324 | nfs4_validate_mount_flags(args); | ||
2325 | |||
2326 | if (args->version != 4) { | ||
2327 | dfprintk(MOUNT, | ||
2328 | "NFS4: Illegal mount version\n"); | ||
2329 | return -EINVAL; | ||
2330 | } | ||
2331 | |||
2332 | if (args->auth_flavor_len > 1) { | ||
2333 | dfprintk(MOUNT, | ||
2334 | "NFS4: Too many RPC auth flavours specified\n"); | ||
2335 | return -EINVAL; | ||
2336 | } | ||
2337 | |||
2338 | if (args->client_address == NULL) { | ||
2339 | dfprintk(MOUNT, | ||
2340 | "NFS4: mount program didn't pass callback address\n"); | ||
2341 | return -EINVAL; | ||
2342 | } | ||
2343 | |||
2344 | return nfs_parse_devname(dev_name, | ||
2345 | &args->nfs_server.hostname, | ||
2346 | NFS4_MAXNAMLEN, | ||
2347 | &args->nfs_server.export_path, | ||
2348 | NFS4_MAXPATHLEN); | ||
2349 | } | ||
2350 | |||
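[Editor's note] The last step of nfs4_validate_text_mount_data() above is nfs_parse_devname(), which splits a "server:/export/path" device string under NFS4_MAXNAMLEN/NFS4_MAXPATHLEN limits. A rough user-space equivalent is sketched below; it deliberately ignores bracketed IPv6 literals, which the real parser must also handle, and the limits passed in main() are assumptions.

    #define _POSIX_C_SOURCE 200809L
    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    static int parse_devname(const char *dev, char **host, size_t maxhost,
                             char **path, size_t maxpath)
    {
        const char *colon = strchr(dev, ':');

        if (colon == NULL || colon == dev)
            return -EINVAL;                 /* no "host:" prefix at all */
        if ((size_t)(colon - dev) > maxhost || strlen(colon + 1) > maxpath)
            return -ENAMETOOLONG;

        *host = strndup(dev, colon - dev);
        *path = strdup(colon + 1);
        return (*host && *path) ? 0 : -ENOMEM;
    }

    int main(void)
    {
        char *host, *path;

        if (parse_devname("server.example.net:/export/home",
                          &host, 255, &path, 1024) == 0) {
            printf("host=%s path=%s\n", host, path);
            free(host);
            free(path);
        }
        return 0;
    }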
2320 | /* | 2351 | /* |
2321 | * Validate NFSv4 mount options | 2352 | * Validate NFSv4 mount options |
2322 | */ | 2353 | */ |
@@ -2324,7 +2355,7 @@ static int nfs4_validate_mount_data(void *options, | |||
2324 | struct nfs_parsed_mount_data *args, | 2355 | struct nfs_parsed_mount_data *args, |
2325 | const char *dev_name) | 2356 | const char *dev_name) |
2326 | { | 2357 | { |
2327 | struct sockaddr_in *ap; | 2358 | struct sockaddr *sap = (struct sockaddr *)&args->nfs_server.address; |
2328 | struct nfs4_mount_data *data = (struct nfs4_mount_data *)options; | 2359 | struct nfs4_mount_data *data = (struct nfs4_mount_data *)options; |
2329 | char *c; | 2360 | char *c; |
2330 | 2361 | ||
@@ -2337,23 +2368,22 @@ static int nfs4_validate_mount_data(void *options, | |||
2337 | args->acregmax = NFS_DEF_ACREGMAX; | 2368 | args->acregmax = NFS_DEF_ACREGMAX; |
2338 | args->acdirmin = NFS_DEF_ACDIRMIN; | 2369 | args->acdirmin = NFS_DEF_ACDIRMIN; |
2339 | args->acdirmax = NFS_DEF_ACDIRMAX; | 2370 | args->acdirmax = NFS_DEF_ACDIRMAX; |
2340 | args->nfs_server.port = NFS_PORT; /* 2049 unless user set port= */ | 2371 | args->nfs_server.port = NFS_UNSPEC_PORT; |
2341 | args->auth_flavors[0] = RPC_AUTH_UNIX; | 2372 | args->auth_flavors[0] = RPC_AUTH_UNIX; |
2342 | args->auth_flavor_len = 0; | 2373 | args->auth_flavor_len = 1; |
2374 | args->version = 4; | ||
2343 | args->minorversion = 0; | 2375 | args->minorversion = 0; |
2344 | 2376 | ||
2345 | switch (data->version) { | 2377 | switch (data->version) { |
2346 | case 1: | 2378 | case 1: |
2347 | ap = (struct sockaddr_in *)&args->nfs_server.address; | ||
2348 | if (data->host_addrlen > sizeof(args->nfs_server.address)) | 2379 | if (data->host_addrlen > sizeof(args->nfs_server.address)) |
2349 | goto out_no_address; | 2380 | goto out_no_address; |
2350 | if (data->host_addrlen == 0) | 2381 | if (data->host_addrlen == 0) |
2351 | goto out_no_address; | 2382 | goto out_no_address; |
2352 | args->nfs_server.addrlen = data->host_addrlen; | 2383 | args->nfs_server.addrlen = data->host_addrlen; |
2353 | if (copy_from_user(ap, data->host_addr, data->host_addrlen)) | 2384 | if (copy_from_user(sap, data->host_addr, data->host_addrlen)) |
2354 | return -EFAULT; | 2385 | return -EFAULT; |
2355 | if (!nfs_verify_server_address((struct sockaddr *) | 2386 | if (!nfs_verify_server_address(sap)) |
2356 | &args->nfs_server.address)) | ||
2357 | goto out_no_address; | 2387 | goto out_no_address; |
2358 | 2388 | ||
2359 | if (data->auth_flavourlen) { | 2389 | if (data->auth_flavourlen) { |
@@ -2399,39 +2429,14 @@ static int nfs4_validate_mount_data(void *options, | |||
2399 | nfs_validate_transport_protocol(args); | 2429 | nfs_validate_transport_protocol(args); |
2400 | 2430 | ||
2401 | break; | 2431 | break; |
2402 | default: { | 2432 | default: |
2403 | int status; | ||
2404 | |||
2405 | if (nfs_parse_mount_options((char *)options, args) == 0) | 2433 | if (nfs_parse_mount_options((char *)options, args) == 0) |
2406 | return -EINVAL; | 2434 | return -EINVAL; |
2407 | 2435 | ||
2408 | if (!nfs_verify_server_address((struct sockaddr *) | 2436 | if (!nfs_verify_server_address(sap)) |
2409 | &args->nfs_server.address)) | ||
2410 | return -EINVAL; | 2437 | return -EINVAL; |
2411 | 2438 | ||
2412 | nfs_set_port((struct sockaddr *)&args->nfs_server.address, | 2439 | return nfs4_validate_text_mount_data(options, args, dev_name); |
2413 | args->nfs_server.port); | ||
2414 | |||
2415 | nfs_validate_transport_protocol(args); | ||
2416 | |||
2417 | nfs4_validate_mount_flags(args); | ||
2418 | |||
2419 | if (args->auth_flavor_len > 1) | ||
2420 | goto out_inval_auth; | ||
2421 | |||
2422 | if (args->client_address == NULL) | ||
2423 | goto out_no_client_address; | ||
2424 | |||
2425 | status = nfs_parse_devname(dev_name, | ||
2426 | &args->nfs_server.hostname, | ||
2427 | NFS4_MAXNAMLEN, | ||
2428 | &args->nfs_server.export_path, | ||
2429 | NFS4_MAXPATHLEN); | ||
2430 | if (status < 0) | ||
2431 | return status; | ||
2432 | |||
2433 | break; | ||
2434 | } | ||
2435 | } | 2440 | } |
2436 | 2441 | ||
2437 | return 0; | 2442 | return 0; |
@@ -2448,10 +2453,6 @@ out_inval_auth: | |||
2448 | out_no_address: | 2453 | out_no_address: |
2449 | dfprintk(MOUNT, "NFS4: mount program didn't pass remote address\n"); | 2454 | dfprintk(MOUNT, "NFS4: mount program didn't pass remote address\n"); |
2450 | return -EINVAL; | 2455 | return -EINVAL; |
2451 | |||
2452 | out_no_client_address: | ||
2453 | dfprintk(MOUNT, "NFS4: mount program didn't pass callback address\n"); | ||
2454 | return -EINVAL; | ||
2455 | } | 2456 | } |
2456 | 2457 | ||
2457 | /* | 2458 | /* |
@@ -2618,6 +2619,34 @@ out_err: | |||
2618 | return ret; | 2619 | return ret; |
2619 | } | 2620 | } |
2620 | 2621 | ||
2622 | static int nfs4_try_mount(int flags, const char *dev_name, | ||
2623 | struct nfs_parsed_mount_data *data, | ||
2624 | struct vfsmount *mnt) | ||
2625 | { | ||
2626 | char *export_path; | ||
2627 | struct vfsmount *root_mnt; | ||
2628 | int error; | ||
2629 | |||
2630 | dfprintk(MOUNT, "--> nfs4_try_mount()\n"); | ||
2631 | |||
2632 | export_path = data->nfs_server.export_path; | ||
2633 | data->nfs_server.export_path = "/"; | ||
2634 | root_mnt = nfs_do_root_mount(&nfs4_remote_fs_type, flags, data, | ||
2635 | data->nfs_server.hostname); | ||
2636 | data->nfs_server.export_path = export_path; | ||
2637 | |||
2638 | error = PTR_ERR(root_mnt); | ||
2639 | if (IS_ERR(root_mnt)) | ||
2640 | goto out; | ||
2641 | |||
2642 | error = nfs_follow_remote_path(root_mnt, export_path, mnt); | ||
2643 | |||
2644 | out: | ||
2645 | dfprintk(MOUNT, "<-- nfs4_try_mount() = %d%s\n", error, | ||
2646 | error != 0 ? " [error]" : ""); | ||
2647 | return error; | ||
2648 | } | ||
2649 | |||
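[Editor's note] The factored-out nfs4_try_mount() above mounts the server root first and only then follows the real export path, parking export_path and restoring it around the root mount. A generic sketch of that save/override/restore shape (all names here are invented):

    #include <stdio.h>

    struct mount_args { const char *export_path; };

    static int do_root_mount(struct mount_args *a)
    {
        printf("mounting %s\n", a->export_path);
        return 0;
    }

    static int try_mount(struct mount_args *a)
    {
        const char *saved = a->export_path;   /* park the caller's value */
        int err;

        a->export_path = "/";                 /* mount the server root instead */
        err = do_root_mount(a);
        a->export_path = saved;               /* restore before anyone reads it */
        if (err)
            return err;

        printf("walking down to %s\n", a->export_path);
        return 0;
    }

    int main(void)
    {
        struct mount_args a = { .export_path = "/export/home" };

        return try_mount(&a);
    }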
2621 | /* | 2650 | /* |
2622 | * Get the superblock for an NFS4 mountpoint | 2651 | * Get the superblock for an NFS4 mountpoint |
2623 | */ | 2652 | */ |
@@ -2625,8 +2654,6 @@ static int nfs4_get_sb(struct file_system_type *fs_type, | |||
2625 | int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt) | 2654 | int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt) |
2626 | { | 2655 | { |
2627 | struct nfs_parsed_mount_data *data; | 2656 | struct nfs_parsed_mount_data *data; |
2628 | char *export_path; | ||
2629 | struct vfsmount *root_mnt; | ||
2630 | int error = -ENOMEM; | 2657 | int error = -ENOMEM; |
2631 | 2658 | ||
2632 | data = kzalloc(sizeof(*data), GFP_KERNEL); | 2659 | data = kzalloc(sizeof(*data), GFP_KERNEL); |
@@ -2638,17 +2665,7 @@ static int nfs4_get_sb(struct file_system_type *fs_type, | |||
2638 | if (error < 0) | 2665 | if (error < 0) |
2639 | goto out; | 2666 | goto out; |
2640 | 2667 | ||
2641 | export_path = data->nfs_server.export_path; | 2668 | error = nfs4_try_mount(flags, dev_name, data, mnt); |
2642 | data->nfs_server.export_path = "/"; | ||
2643 | root_mnt = nfs_do_root_mount(&nfs4_remote_fs_type, flags, data, | ||
2644 | data->nfs_server.hostname); | ||
2645 | data->nfs_server.export_path = export_path; | ||
2646 | |||
2647 | error = PTR_ERR(root_mnt); | ||
2648 | if (IS_ERR(root_mnt)) | ||
2649 | goto out; | ||
2650 | |||
2651 | error = nfs_follow_remote_path(root_mnt, export_path, mnt); | ||
2652 | 2669 | ||
2653 | out: | 2670 | out: |
2654 | kfree(data->client_address); | 2671 | kfree(data->client_address); |
diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 0a0a2ff767c3..53eb26c16b50 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c | |||
@@ -13,6 +13,7 @@ | |||
13 | #include <linux/file.h> | 13 | #include <linux/file.h> |
14 | #include <linux/writeback.h> | 14 | #include <linux/writeback.h> |
15 | #include <linux/swap.h> | 15 | #include <linux/swap.h> |
16 | #include <linux/migrate.h> | ||
16 | 17 | ||
17 | #include <linux/sunrpc/clnt.h> | 18 | #include <linux/sunrpc/clnt.h> |
18 | #include <linux/nfs_fs.h> | 19 | #include <linux/nfs_fs.h> |
@@ -26,6 +27,7 @@ | |||
26 | #include "internal.h" | 27 | #include "internal.h" |
27 | #include "iostat.h" | 28 | #include "iostat.h" |
28 | #include "nfs4_fs.h" | 29 | #include "nfs4_fs.h" |
30 | #include "fscache.h" | ||
29 | 31 | ||
30 | #define NFSDBG_FACILITY NFSDBG_PAGECACHE | 32 | #define NFSDBG_FACILITY NFSDBG_PAGECACHE |
31 | 33 | ||
@@ -87,17 +89,15 @@ struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount) | |||
87 | return p; | 89 | return p; |
88 | } | 90 | } |
89 | 91 | ||
90 | static void nfs_writedata_free(struct nfs_write_data *p) | 92 | void nfs_writedata_free(struct nfs_write_data *p) |
91 | { | 93 | { |
92 | if (p && (p->pagevec != &p->page_array[0])) | 94 | if (p && (p->pagevec != &p->page_array[0])) |
93 | kfree(p->pagevec); | 95 | kfree(p->pagevec); |
94 | mempool_free(p, nfs_wdata_mempool); | 96 | mempool_free(p, nfs_wdata_mempool); |
95 | } | 97 | } |
96 | 98 | ||
97 | void nfs_writedata_release(void *data) | 99 | static void nfs_writedata_release(struct nfs_write_data *wdata) |
98 | { | 100 | { |
99 | struct nfs_write_data *wdata = data; | ||
100 | |||
101 | put_nfs_open_context(wdata->args.context); | 101 | put_nfs_open_context(wdata->args.context); |
102 | nfs_writedata_free(wdata); | 102 | nfs_writedata_free(wdata); |
103 | } | 103 | } |
@@ -220,24 +220,17 @@ static void nfs_end_page_writeback(struct page *page) | |||
220 | clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC); | 220 | clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC); |
221 | } | 221 | } |
222 | 222 | ||
223 | /* | 223 | static struct nfs_page *nfs_find_and_lock_request(struct page *page) |
224 | * Find an associated nfs write request, and prepare to flush it out | ||
225 | * May return an error if the user signalled nfs_wait_on_request(). | ||
226 | */ | ||
227 | static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, | ||
228 | struct page *page) | ||
229 | { | 224 | { |
230 | struct inode *inode = page->mapping->host; | 225 | struct inode *inode = page->mapping->host; |
231 | struct nfs_page *req; | 226 | struct nfs_page *req; |
232 | int ret; | 227 | int ret; |
233 | 228 | ||
234 | spin_lock(&inode->i_lock); | 229 | spin_lock(&inode->i_lock); |
235 | for(;;) { | 230 | for (;;) { |
236 | req = nfs_page_find_request_locked(page); | 231 | req = nfs_page_find_request_locked(page); |
237 | if (req == NULL) { | 232 | if (req == NULL) |
238 | spin_unlock(&inode->i_lock); | 233 | break; |
239 | return 0; | ||
240 | } | ||
241 | if (nfs_set_page_tag_locked(req)) | 234 | if (nfs_set_page_tag_locked(req)) |
242 | break; | 235 | break; |
243 | /* Note: If we hold the page lock, as is the case in nfs_writepage, | 236 | /* Note: If we hold the page lock, as is the case in nfs_writepage, |
@@ -249,23 +242,40 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, | |||
249 | ret = nfs_wait_on_request(req); | 242 | ret = nfs_wait_on_request(req); |
250 | nfs_release_request(req); | 243 | nfs_release_request(req); |
251 | if (ret != 0) | 244 | if (ret != 0) |
252 | return ret; | 245 | return ERR_PTR(ret); |
253 | spin_lock(&inode->i_lock); | 246 | spin_lock(&inode->i_lock); |
254 | } | 247 | } |
255 | if (test_bit(PG_CLEAN, &req->wb_flags)) { | ||
256 | spin_unlock(&inode->i_lock); | ||
257 | BUG(); | ||
258 | } | ||
259 | if (nfs_set_page_writeback(page) != 0) { | ||
260 | spin_unlock(&inode->i_lock); | ||
261 | BUG(); | ||
262 | } | ||
263 | spin_unlock(&inode->i_lock); | 248 | spin_unlock(&inode->i_lock); |
249 | return req; | ||
250 | } | ||
251 | |||
252 | /* | ||
253 | * Find an associated nfs write request, and prepare to flush it out | ||
254 | * May return an error if the user signalled nfs_wait_on_request(). | ||
255 | */ | ||
256 | static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, | ||
257 | struct page *page) | ||
258 | { | ||
259 | struct nfs_page *req; | ||
260 | int ret = 0; | ||
261 | |||
262 | req = nfs_find_and_lock_request(page); | ||
263 | if (!req) | ||
264 | goto out; | ||
265 | ret = PTR_ERR(req); | ||
266 | if (IS_ERR(req)) | ||
267 | goto out; | ||
268 | |||
269 | ret = nfs_set_page_writeback(page); | ||
270 | BUG_ON(ret != 0); | ||
271 | BUG_ON(test_bit(PG_CLEAN, &req->wb_flags)); | ||
272 | |||
264 | if (!nfs_pageio_add_request(pgio, req)) { | 273 | if (!nfs_pageio_add_request(pgio, req)) { |
265 | nfs_redirty_request(req); | 274 | nfs_redirty_request(req); |
266 | return pgio->pg_error; | 275 | ret = pgio->pg_error; |
267 | } | 276 | } |
268 | return 0; | 277 | out: |
278 | return ret; | ||
269 | } | 279 | } |
270 | 280 | ||
271 | static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio) | 281 | static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio) |
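[Editor's note] The refactor above gives nfs_find_and_lock_request() a tri-state return: NULL (no request on the page, nothing to flush), an ERR_PTR-encoded errno, or a valid locked request. This works because IS_ERR(NULL) is false in the kernel's encoding. A freestanding sketch with the ERR_PTR machinery re-implemented the way the kernel defines it:

    #include <stdio.h>

    #define MAX_ERRNO 4095

    static inline void *ERR_PTR(long error) { return (void *)error; }
    static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
    static inline int IS_ERR(const void *ptr)
    {
        /* errnos live in the top 4095 addresses; NULL is not an error */
        return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
    }

    struct request { int id; };

    static struct request *find_request(int key)
    {
        static struct request req = { .id = 42 };

        if (key < 0)
            return ERR_PTR(-5);   /* -EIO: hard failure */
        if (key == 0)
            return NULL;          /* nothing to do: not an error */
        return &req;
    }

    int main(void)
    {
        for (int key = -1; key <= 1; key++) {
            struct request *req = find_request(key);

            if (!req)
                printf("key %d: nothing to do\n", key);
            else if (IS_ERR(req))
                printf("key %d: error %ld\n", key, PTR_ERR(req));
            else
                printf("key %d: got request %d\n", key, req->id);
        }
        return 0;
    }

That is exactly the distinction nfs_page_async_flush() draws above: the NULL branch falls out with ret = 0, the IS_ERR branch propagates the errno, and only a real pointer proceeds to writeback.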
@@ -1480,7 +1490,6 @@ static int nfs_write_mapping(struct address_space *mapping, int how) | |||
1480 | .nr_to_write = LONG_MAX, | 1490 | .nr_to_write = LONG_MAX, |
1481 | .range_start = 0, | 1491 | .range_start = 0, |
1482 | .range_end = LLONG_MAX, | 1492 | .range_end = LLONG_MAX, |
1483 | .for_writepages = 1, | ||
1484 | }; | 1493 | }; |
1485 | 1494 | ||
1486 | return __nfs_write_mapping(mapping, &wbc, how); | 1495 | return __nfs_write_mapping(mapping, &wbc, how); |
@@ -1582,6 +1591,41 @@ int nfs_wb_page(struct inode *inode, struct page* page) | |||
1582 | return nfs_wb_page_priority(inode, page, FLUSH_STABLE); | 1591 | return nfs_wb_page_priority(inode, page, FLUSH_STABLE); |
1583 | } | 1592 | } |
1584 | 1593 | ||
1594 | #ifdef CONFIG_MIGRATION | ||
1595 | int nfs_migrate_page(struct address_space *mapping, struct page *newpage, | ||
1596 | struct page *page) | ||
1597 | { | ||
1598 | struct nfs_page *req; | ||
1599 | int ret; | ||
1600 | |||
1601 | if (PageFsCache(page)) | ||
1602 | nfs_fscache_release_page(page, GFP_KERNEL); | ||
1603 | |||
1604 | req = nfs_find_and_lock_request(page); | ||
1605 | ret = PTR_ERR(req); | ||
1606 | if (IS_ERR(req)) | ||
1607 | goto out; | ||
1608 | |||
1609 | ret = migrate_page(mapping, newpage, page); | ||
1610 | if (!req) | ||
1611 | goto out; | ||
1612 | if (ret) | ||
1613 | goto out_unlock; | ||
1614 | page_cache_get(newpage); | ||
1615 | req->wb_page = newpage; | ||
1616 | SetPagePrivate(newpage); | ||
1617 | set_page_private(newpage, page_private(page)); | ||
1618 | ClearPagePrivate(page); | ||
1619 | set_page_private(page, 0); | ||
1620 | page_cache_release(page); | ||
1621 | out_unlock: | ||
1622 | nfs_clear_page_tag_locked(req); | ||
1623 | nfs_release_request(req); | ||
1624 | out: | ||
1625 | return ret; | ||
1626 | } | ||
1627 | #endif | ||
1628 | |||
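[Editor's note] The new nfs_migrate_page() above re-points a dirty write request at the migrated page: pin the new page, copy page_private across, set PagePrivate on the new page, clear it on the old one, then drop the old page's reference. The ordering rationale sketched below is an inference from that sequence (the payload's new home is pinned before the old pin is released), with a toy refcounted object standing in for struct page:

    #include <assert.h>
    #include <stdio.h>

    struct page { int refcount; unsigned long private; int has_private; };

    static void get_page(struct page *p) { p->refcount++; }
    static void put_page(struct page *p) { p->refcount--; }

    static void transfer_private(struct page *oldp, struct page *newp)
    {
        get_page(newp);            /* the payload pins its new home first */
        newp->private = oldp->private;
        newp->has_private = 1;     /* ~ SetPagePrivate(newpage) */
        oldp->has_private = 0;     /* ~ ClearPagePrivate(page) */
        oldp->private = 0;
        put_page(oldp);            /* only now may the old page go away */
    }

    int main(void)
    {
        struct page a = { .refcount = 2, .private = 0xdead, .has_private = 1 };
        struct page b = { .refcount = 1 };

        transfer_private(&a, &b);
        assert(b.has_private && b.private == 0xdead && !a.has_private);
        printf("old ref=%d new ref=%d\n", a.refcount, b.refcount);
        return 0;
    }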
1585 | int __init nfs_init_writepagecache(void) | 1629 | int __init nfs_init_writepagecache(void) |
1586 | { | 1630 | { |
1587 | nfs_wdata_cachep = kmem_cache_create("nfs_write_data", | 1631 | nfs_wdata_cachep = kmem_cache_create("nfs_write_data", |
diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c index 5573508f707f..36fcabbf5186 100644 --- a/fs/nfsd/auth.c +++ b/fs/nfsd/auth.c | |||
@@ -34,6 +34,8 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) | |||
34 | int flags = nfsexp_flags(rqstp, exp); | 34 | int flags = nfsexp_flags(rqstp, exp); |
35 | int ret; | 35 | int ret; |
36 | 36 | ||
37 | validate_process_creds(); | ||
38 | |||
37 | /* discard any old override before preparing the new set */ | 39 | /* discard any old override before preparing the new set */ |
38 | revert_creds(get_cred(current->real_cred)); | 40 | revert_creds(get_cred(current->real_cred)); |
39 | new = prepare_creds(); | 41 | new = prepare_creds(); |
@@ -86,8 +88,10 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) | |||
86 | else | 88 | else |
87 | new->cap_effective = cap_raise_nfsd_set(new->cap_effective, | 89 | new->cap_effective = cap_raise_nfsd_set(new->cap_effective, |
88 | new->cap_permitted); | 90 | new->cap_permitted); |
91 | validate_process_creds(); | ||
89 | put_cred(override_creds(new)); | 92 | put_cred(override_creds(new)); |
90 | put_cred(new); | 93 | put_cred(new); |
94 | validate_process_creds(); | ||
91 | return 0; | 95 | return 0; |
92 | 96 | ||
93 | oom: | 97 | oom: |
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index b92a27629fb7..d9462643155c 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c | |||
@@ -85,6 +85,11 @@ static void expkey_request(struct cache_detail *cd, | |||
85 | (*bpp)[-1] = '\n'; | 85 | (*bpp)[-1] = '\n'; |
86 | } | 86 | } |
87 | 87 | ||
88 | static int expkey_upcall(struct cache_detail *cd, struct cache_head *h) | ||
89 | { | ||
90 | return sunrpc_cache_pipe_upcall(cd, h, expkey_request); | ||
91 | } | ||
92 | |||
88 | static struct svc_expkey *svc_expkey_update(struct svc_expkey *new, struct svc_expkey *old); | 93 | static struct svc_expkey *svc_expkey_update(struct svc_expkey *new, struct svc_expkey *old); |
89 | static struct svc_expkey *svc_expkey_lookup(struct svc_expkey *); | 94 | static struct svc_expkey *svc_expkey_lookup(struct svc_expkey *); |
90 | static struct cache_detail svc_expkey_cache; | 95 | static struct cache_detail svc_expkey_cache; |
@@ -259,7 +264,7 @@ static struct cache_detail svc_expkey_cache = { | |||
259 | .hash_table = expkey_table, | 264 | .hash_table = expkey_table, |
260 | .name = "nfsd.fh", | 265 | .name = "nfsd.fh", |
261 | .cache_put = expkey_put, | 266 | .cache_put = expkey_put, |
262 | .cache_request = expkey_request, | 267 | .cache_upcall = expkey_upcall, |
263 | .cache_parse = expkey_parse, | 268 | .cache_parse = expkey_parse, |
264 | .cache_show = expkey_show, | 269 | .cache_show = expkey_show, |
265 | .match = expkey_match, | 270 | .match = expkey_match, |
@@ -355,6 +360,11 @@ static void svc_export_request(struct cache_detail *cd, | |||
355 | (*bpp)[-1] = '\n'; | 360 | (*bpp)[-1] = '\n'; |
356 | } | 361 | } |
357 | 362 | ||
363 | static int svc_export_upcall(struct cache_detail *cd, struct cache_head *h) | ||
364 | { | ||
365 | return sunrpc_cache_pipe_upcall(cd, h, svc_export_request); | ||
366 | } | ||
367 | |||
358 | static struct svc_export *svc_export_update(struct svc_export *new, | 368 | static struct svc_export *svc_export_update(struct svc_export *new, |
359 | struct svc_export *old); | 369 | struct svc_export *old); |
360 | static struct svc_export *svc_export_lookup(struct svc_export *); | 370 | static struct svc_export *svc_export_lookup(struct svc_export *); |
@@ -724,7 +734,7 @@ struct cache_detail svc_export_cache = { | |||
724 | .hash_table = export_table, | 734 | .hash_table = export_table, |
725 | .name = "nfsd.export", | 735 | .name = "nfsd.export", |
726 | .cache_put = svc_export_put, | 736 | .cache_put = svc_export_put, |
727 | .cache_request = svc_export_request, | 737 | .cache_upcall = svc_export_upcall, |
728 | .cache_parse = svc_export_parse, | 738 | .cache_parse = svc_export_parse, |
729 | .cache_show = svc_export_show, | 739 | .cache_show = svc_export_show, |
730 | .match = svc_export_match, | 740 | .match = svc_export_match, |
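[Editor's note] Both nfsd caches above switch from a ".cache_request" formatter slot to a ".cache_upcall" slot whose wrapper hands the formatter to the shared sunrpc_cache_pipe_upcall() helper. A compact sketch of that extra level of indirection; every name below is a stand-in, not the sunrpc API:

    #include <stdio.h>

    struct cache_head { const char *key; };
    struct cache_detail;

    typedef void (*format_fn)(struct cache_detail *, struct cache_head *);
    typedef int (*upcall_fn)(struct cache_detail *, struct cache_head *);

    struct cache_detail {
        const char *name;
        upcall_fn cache_upcall;   /* replaces the old .cache_request slot */
    };

    /* shared transport: sends whatever the formatter produced */
    static int generic_pipe_upcall(struct cache_detail *cd,
                                   struct cache_head *h, format_fn fmt)
    {
        fmt(cd, h);   /* let the cache render its request line */
        return 0;     /* a real helper would now write it to the pipe */
    }

    static void export_request(struct cache_detail *cd, struct cache_head *h)
    {
        printf("%s: %s\n", cd->name, h->key);
    }

    static int export_upcall(struct cache_detail *cd, struct cache_head *h)
    {
        return generic_pipe_upcall(cd, h, export_request);
    }

    static struct cache_detail export_cache = {
        .name = "nfsd.export",
        .cache_upcall = export_upcall,
    };

    int main(void)
    {
        struct cache_head h = { .key = "/export" };

        return export_cache.cache_upcall(&export_cache, &h);
    }

The payoff is that a cache may now supply a completely different upcall transport while keeping its request formatter private.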
diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c index 5b398421b051..cdfa86fa1471 100644 --- a/fs/nfsd/nfs4idmap.c +++ b/fs/nfsd/nfs4idmap.c | |||
@@ -146,6 +146,12 @@ idtoname_request(struct cache_detail *cd, struct cache_head *ch, char **bpp, | |||
146 | } | 146 | } |
147 | 147 | ||
148 | static int | 148 | static int |
149 | idtoname_upcall(struct cache_detail *cd, struct cache_head *ch) | ||
150 | { | ||
151 | return sunrpc_cache_pipe_upcall(cd, ch, idtoname_request); | ||
152 | } | ||
153 | |||
154 | static int | ||
149 | idtoname_match(struct cache_head *ca, struct cache_head *cb) | 155 | idtoname_match(struct cache_head *ca, struct cache_head *cb) |
150 | { | 156 | { |
151 | struct ent *a = container_of(ca, struct ent, h); | 157 | struct ent *a = container_of(ca, struct ent, h); |
@@ -175,10 +181,10 @@ idtoname_show(struct seq_file *m, struct cache_detail *cd, struct cache_head *h) | |||
175 | } | 181 | } |
176 | 182 | ||
177 | static void | 183 | static void |
178 | warn_no_idmapd(struct cache_detail *detail) | 184 | warn_no_idmapd(struct cache_detail *detail, int has_died) |
179 | { | 185 | { |
180 | printk("nfsd: nfsv4 idmapping failing: has idmapd %s?\n", | 186 | printk("nfsd: nfsv4 idmapping failing: has idmapd %s?\n", |
181 | detail->last_close? "died" : "not been started"); | 187 | has_died ? "died" : "not been started"); |
182 | } | 188 | } |
183 | 189 | ||
184 | 190 | ||
@@ -192,7 +198,7 @@ static struct cache_detail idtoname_cache = { | |||
192 | .hash_table = idtoname_table, | 198 | .hash_table = idtoname_table, |
193 | .name = "nfs4.idtoname", | 199 | .name = "nfs4.idtoname", |
194 | .cache_put = ent_put, | 200 | .cache_put = ent_put, |
195 | .cache_request = idtoname_request, | 201 | .cache_upcall = idtoname_upcall, |
196 | .cache_parse = idtoname_parse, | 202 | .cache_parse = idtoname_parse, |
197 | .cache_show = idtoname_show, | 203 | .cache_show = idtoname_show, |
198 | .warn_no_listener = warn_no_idmapd, | 204 | .warn_no_listener = warn_no_idmapd, |
@@ -325,6 +331,12 @@ nametoid_request(struct cache_detail *cd, struct cache_head *ch, char **bpp, | |||
325 | } | 331 | } |
326 | 332 | ||
327 | static int | 333 | static int |
334 | nametoid_upcall(struct cache_detail *cd, struct cache_head *ch) | ||
335 | { | ||
336 | return sunrpc_cache_pipe_upcall(cd, ch, nametoid_request); | ||
337 | } | ||
338 | |||
339 | static int | ||
328 | nametoid_match(struct cache_head *ca, struct cache_head *cb) | 340 | nametoid_match(struct cache_head *ca, struct cache_head *cb) |
329 | { | 341 | { |
330 | struct ent *a = container_of(ca, struct ent, h); | 342 | struct ent *a = container_of(ca, struct ent, h); |
@@ -363,7 +375,7 @@ static struct cache_detail nametoid_cache = { | |||
363 | .hash_table = nametoid_table, | 375 | .hash_table = nametoid_table, |
364 | .name = "nfs4.nametoid", | 376 | .name = "nfs4.nametoid", |
365 | .cache_put = ent_put, | 377 | .cache_put = ent_put, |
366 | .cache_request = nametoid_request, | 378 | .cache_upcall = nametoid_upcall, |
367 | .cache_parse = nametoid_parse, | 379 | .cache_parse = nametoid_parse, |
368 | .cache_show = nametoid_show, | 380 | .cache_show = nametoid_show, |
369 | .warn_no_listener = warn_no_idmapd, | 381 | .warn_no_listener = warn_no_idmapd, |
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 6d0847562d87..7e906c5b7671 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c | |||
@@ -37,6 +37,7 @@ | |||
37 | #include <linux/nfsd/xdr.h> | 37 | #include <linux/nfsd/xdr.h> |
38 | #include <linux/nfsd/syscall.h> | 38 | #include <linux/nfsd/syscall.h> |
39 | #include <linux/lockd/lockd.h> | 39 | #include <linux/lockd/lockd.h> |
40 | #include <linux/sunrpc/clnt.h> | ||
40 | 41 | ||
41 | #include <asm/uaccess.h> | 42 | #include <asm/uaccess.h> |
42 | #include <net/ipv6.h> | 43 | #include <net/ipv6.h> |
@@ -490,22 +491,18 @@ static ssize_t write_getfd(struct file *file, char *buf, size_t size) | |||
490 | * | 491 | * |
491 | * Input: | 492 | * Input: |
492 | * buf: '\n'-terminated C string containing a | 493 | * buf: '\n'-terminated C string containing a |
493 | * presentation format IPv4 address | 494 | * presentation format IP address |
494 | * size: length of C string in @buf | 495 | * size: length of C string in @buf |
495 | * Output: | 496 | * Output: |
496 | * On success: returns zero if all specified locks were released; | 497 | * On success: returns zero if all specified locks were released; |
497 | * returns one if one or more locks were not released | 498 | * returns one if one or more locks were not released |
498 | * On error: return code is negative errno value | 499 | * On error: return code is negative errno value |
499 | * | ||
500 | * Note: Only AF_INET client addresses are passed in | ||
501 | */ | 500 | */ |
502 | static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size) | 501 | static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size) |
503 | { | 502 | { |
504 | struct sockaddr_in sin = { | 503 | struct sockaddr_storage address; |
505 | .sin_family = AF_INET, | 504 | struct sockaddr *sap = (struct sockaddr *)&address; |
506 | }; | 505 | size_t salen = sizeof(address); |
507 | int b1, b2, b3, b4; | ||
508 | char c; | ||
509 | char *fo_path; | 506 | char *fo_path; |
510 | 507 | ||
511 | /* sanity check */ | 508 | /* sanity check */ |
@@ -519,14 +516,10 @@ static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size) | |||
519 | if (qword_get(&buf, fo_path, size) < 0) | 516 | if (qword_get(&buf, fo_path, size) < 0) |
520 | return -EINVAL; | 517 | return -EINVAL; |
521 | 518 | ||
522 | /* get ipv4 address */ | 519 | if (rpc_pton(fo_path, size, sap, salen) == 0) |
523 | if (sscanf(fo_path, "%u.%u.%u.%u%c", &b1, &b2, &b3, &b4, &c) != 4) | ||
524 | return -EINVAL; | ||
525 | if (b1 > 255 || b2 > 255 || b3 > 255 || b4 > 255) | ||
526 | return -EINVAL; | 520 | return -EINVAL; |
527 | sin.sin_addr.s_addr = htonl((b1 << 24) | (b2 << 16) | (b3 << 8) | b4); | ||
528 | 521 | ||
529 | return nlmsvc_unlock_all_by_ip((struct sockaddr *)&sin); | 522 | return nlmsvc_unlock_all_by_ip(sap); |
530 | } | 523 | } |
531 | 524 | ||
532 | /** | 525 | /** |
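[Editor's note] write_unlock_ip() above replaces a hand-rolled "%u.%u.%u.%u" sscanf with rpc_pton() into a sockaddr_storage, so the interface now accepts IPv6 as well as IPv4, matching the relaxed comment. rpc_pton is sunrpc-internal; the user-space approximation below uses inet_pton and only mimics the family probing:

    #include <arpa/inet.h>
    #include <stdio.h>
    #include <string.h>
    #include <sys/socket.h>

    static int pton_any(const char *s, struct sockaddr_storage *ss)
    {
        memset(ss, 0, sizeof(*ss));

        if (inet_pton(AF_INET, s, &((struct sockaddr_in *)ss)->sin_addr) == 1) {
            ss->ss_family = AF_INET;
            return 1;
        }
        if (inet_pton(AF_INET6, s, &((struct sockaddr_in6 *)ss)->sin6_addr) == 1) {
            ss->ss_family = AF_INET6;
            return 1;
        }
        return 0;   /* 0 = parse failure, as with rpc_pton above */
    }

    int main(void)
    {
        struct sockaddr_storage ss;
        const char *tests[] = { "192.0.2.7", "2001:db8::1", "not-an-ip" };

        for (int i = 0; i < 3; i++)
            printf("%-12s -> %s\n", tests[i],
                   pton_any(tests[i], &ss) ? "ok" : "EINVAL");
        return 0;
    }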
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 492c79b7800b..24d58adfe5fd 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c | |||
@@ -496,7 +496,9 @@ nfsd(void *vrqstp) | |||
496 | /* Lock the export hash tables for reading. */ | 496 | /* Lock the export hash tables for reading. */ |
497 | exp_readlock(); | 497 | exp_readlock(); |
498 | 498 | ||
499 | validate_process_creds(); | ||
499 | svc_process(rqstp); | 500 | svc_process(rqstp); |
501 | validate_process_creds(); | ||
500 | 502 | ||
501 | /* Unlock export hash tables */ | 503 | /* Unlock export hash tables */ |
502 | exp_readunlock(); | 504 | exp_readunlock(); |
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 23341c1063bc..8fa09bfbcba7 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c | |||
@@ -684,6 +684,8 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, | |||
684 | __be32 err; | 684 | __be32 err; |
685 | int host_err; | 685 | int host_err; |
686 | 686 | ||
687 | validate_process_creds(); | ||
688 | |||
687 | /* | 689 | /* |
688 | * If we get here, then the client has already done an "open", | 690 | * If we get here, then the client has already done an "open", |
689 | * and (hopefully) checked permission - so allow OWNER_OVERRIDE | 691 | * and (hopefully) checked permission - so allow OWNER_OVERRIDE |
@@ -740,6 +742,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, | |||
740 | out_nfserr: | 742 | out_nfserr: |
741 | err = nfserrno(host_err); | 743 | err = nfserrno(host_err); |
742 | out: | 744 | out: |
745 | validate_process_creds(); | ||
743 | return err; | 746 | return err; |
744 | } | 747 | } |
745 | 748 | ||
diff --git a/fs/nilfs2/Kconfig b/fs/nilfs2/Kconfig index 72da095d4009..251da07b2a1d 100644 --- a/fs/nilfs2/Kconfig +++ b/fs/nilfs2/Kconfig | |||
@@ -1,6 +1,6 @@ | |||
1 | config NILFS2_FS | 1 | config NILFS2_FS |
2 | tristate "NILFS2 file system support (EXPERIMENTAL)" | 2 | tristate "NILFS2 file system support (EXPERIMENTAL)" |
3 | depends on BLOCK && EXPERIMENTAL | 3 | depends on EXPERIMENTAL |
4 | select CRC32 | 4 | select CRC32 |
5 | help | 5 | help |
6 | NILFS2 is a log-structured file system (LFS) supporting continuous | 6 | NILFS2 is a log-structured file system (LFS) supporting continuous |
diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c index 99d58a028b94..08834df6ec68 100644 --- a/fs/nilfs2/bmap.c +++ b/fs/nilfs2/bmap.c | |||
@@ -36,6 +36,26 @@ struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *bmap) | |||
36 | return nilfs_dat_inode(NILFS_I_NILFS(bmap->b_inode)); | 36 | return nilfs_dat_inode(NILFS_I_NILFS(bmap->b_inode)); |
37 | } | 37 | } |
38 | 38 | ||
39 | /** | ||
40 | * nilfs_bmap_lookup_at_level - find a data block or node block | ||
41 | * @bmap: bmap | ||
42 | * @key: key | ||
43 | * @level: level | ||
44 | * @ptrp: place to store the value associated to @key | ||
45 | * | ||
46 | * Description: nilfs_bmap_lookup_at_level() finds a record whose key | ||
47 | * matches @key in the block at @level of the bmap. | ||
48 | * | ||
49 | * Return Value: On success, 0 is returned and the record associated with @key | ||
50 | * is stored in the place pointed by @ptrp. On error, one of the following | ||
51 | * negative error codes is returned. | ||
52 | * | ||
53 | * %-EIO - I/O error. | ||
54 | * | ||
55 | * %-ENOMEM - Insufficient amount of memory available. | ||
56 | * | ||
57 | * %-ENOENT - A record associated with @key does not exist. | ||
58 | */ | ||
39 | int nilfs_bmap_lookup_at_level(struct nilfs_bmap *bmap, __u64 key, int level, | 59 | int nilfs_bmap_lookup_at_level(struct nilfs_bmap *bmap, __u64 key, int level, |
40 | __u64 *ptrp) | 60 | __u64 *ptrp) |
41 | { | 61 | { |
@@ -69,39 +89,6 @@ int nilfs_bmap_lookup_contig(struct nilfs_bmap *bmap, __u64 key, __u64 *ptrp, | |||
69 | return ret; | 89 | return ret; |
70 | } | 90 | } |
71 | 91 | ||
72 | /** | ||
73 | * nilfs_bmap_lookup - find a record | ||
74 | * @bmap: bmap | ||
75 | * @key: key | ||
76 | * @recp: pointer to record | ||
77 | * | ||
78 | * Description: nilfs_bmap_lookup() finds a record whose key matches @key in | ||
79 | * @bmap. | ||
80 | * | ||
81 | * Return Value: On success, 0 is returned and the record associated with @key | ||
82 | * is stored in the place pointed by @recp. On error, one of the following | ||
83 | * negative error codes is returned. | ||
84 | * | ||
85 | * %-EIO - I/O error. | ||
86 | * | ||
87 | * %-ENOMEM - Insufficient amount of memory available. | ||
88 | * | ||
89 | * %-ENOENT - A record associated with @key does not exist. | ||
90 | */ | ||
91 | int nilfs_bmap_lookup(struct nilfs_bmap *bmap, | ||
92 | unsigned long key, | ||
93 | unsigned long *recp) | ||
94 | { | ||
95 | __u64 ptr; | ||
96 | int ret; | ||
97 | |||
98 | /* XXX: use macro for level 1 */ | ||
99 | ret = nilfs_bmap_lookup_at_level(bmap, key, 1, &ptr); | ||
100 | if (recp != NULL) | ||
101 | *recp = ptr; | ||
102 | return ret; | ||
103 | } | ||
104 | |||
105 | static int nilfs_bmap_do_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) | 92 | static int nilfs_bmap_do_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) |
106 | { | 93 | { |
107 | __u64 keys[NILFS_BMAP_SMALL_HIGH + 1]; | 94 | __u64 keys[NILFS_BMAP_SMALL_HIGH + 1]; |
@@ -469,104 +456,6 @@ __u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *bmap) | |||
469 | (entries_per_group / NILFS_BMAP_GROUP_DIV); | 456 | (entries_per_group / NILFS_BMAP_GROUP_DIV); |
470 | } | 457 | } |
471 | 458 | ||
472 | int nilfs_bmap_prepare_alloc_v(struct nilfs_bmap *bmap, | ||
473 | union nilfs_bmap_ptr_req *req) | ||
474 | { | ||
475 | return nilfs_dat_prepare_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req); | ||
476 | } | ||
477 | |||
478 | void nilfs_bmap_commit_alloc_v(struct nilfs_bmap *bmap, | ||
479 | union nilfs_bmap_ptr_req *req) | ||
480 | { | ||
481 | nilfs_dat_commit_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req); | ||
482 | } | ||
483 | |||
484 | void nilfs_bmap_abort_alloc_v(struct nilfs_bmap *bmap, | ||
485 | union nilfs_bmap_ptr_req *req) | ||
486 | { | ||
487 | nilfs_dat_abort_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req); | ||
488 | } | ||
489 | |||
490 | int nilfs_bmap_start_v(struct nilfs_bmap *bmap, union nilfs_bmap_ptr_req *req, | ||
491 | sector_t blocknr) | ||
492 | { | ||
493 | struct inode *dat = nilfs_bmap_get_dat(bmap); | ||
494 | int ret; | ||
495 | |||
496 | ret = nilfs_dat_prepare_start(dat, &req->bpr_req); | ||
497 | if (likely(!ret)) | ||
498 | nilfs_dat_commit_start(dat, &req->bpr_req, blocknr); | ||
499 | return ret; | ||
500 | } | ||
501 | |||
502 | int nilfs_bmap_prepare_end_v(struct nilfs_bmap *bmap, | ||
503 | union nilfs_bmap_ptr_req *req) | ||
504 | { | ||
505 | return nilfs_dat_prepare_end(nilfs_bmap_get_dat(bmap), &req->bpr_req); | ||
506 | } | ||
507 | |||
508 | void nilfs_bmap_commit_end_v(struct nilfs_bmap *bmap, | ||
509 | union nilfs_bmap_ptr_req *req) | ||
510 | { | ||
511 | nilfs_dat_commit_end(nilfs_bmap_get_dat(bmap), &req->bpr_req, | ||
512 | bmap->b_ptr_type == NILFS_BMAP_PTR_VS); | ||
513 | } | ||
514 | |||
515 | void nilfs_bmap_abort_end_v(struct nilfs_bmap *bmap, | ||
516 | union nilfs_bmap_ptr_req *req) | ||
517 | { | ||
518 | nilfs_dat_abort_end(nilfs_bmap_get_dat(bmap), &req->bpr_req); | ||
519 | } | ||
520 | |||
521 | int nilfs_bmap_move_v(const struct nilfs_bmap *bmap, __u64 vblocknr, | ||
522 | sector_t blocknr) | ||
523 | { | ||
524 | return nilfs_dat_move(nilfs_bmap_get_dat(bmap), vblocknr, blocknr); | ||
525 | } | ||
526 | |||
527 | int nilfs_bmap_mark_dirty(const struct nilfs_bmap *bmap, __u64 vblocknr) | ||
528 | { | ||
529 | return nilfs_dat_mark_dirty(nilfs_bmap_get_dat(bmap), vblocknr); | ||
530 | } | ||
531 | |||
532 | int nilfs_bmap_prepare_update_v(struct nilfs_bmap *bmap, | ||
533 | union nilfs_bmap_ptr_req *oldreq, | ||
534 | union nilfs_bmap_ptr_req *newreq) | ||
535 | { | ||
536 | struct inode *dat = nilfs_bmap_get_dat(bmap); | ||
537 | int ret; | ||
538 | |||
539 | ret = nilfs_dat_prepare_end(dat, &oldreq->bpr_req); | ||
540 | if (ret < 0) | ||
541 | return ret; | ||
542 | ret = nilfs_dat_prepare_alloc(dat, &newreq->bpr_req); | ||
543 | if (ret < 0) | ||
544 | nilfs_dat_abort_end(dat, &oldreq->bpr_req); | ||
545 | |||
546 | return ret; | ||
547 | } | ||
548 | |||
549 | void nilfs_bmap_commit_update_v(struct nilfs_bmap *bmap, | ||
550 | union nilfs_bmap_ptr_req *oldreq, | ||
551 | union nilfs_bmap_ptr_req *newreq) | ||
552 | { | ||
553 | struct inode *dat = nilfs_bmap_get_dat(bmap); | ||
554 | |||
555 | nilfs_dat_commit_end(dat, &oldreq->bpr_req, | ||
556 | bmap->b_ptr_type == NILFS_BMAP_PTR_VS); | ||
557 | nilfs_dat_commit_alloc(dat, &newreq->bpr_req); | ||
558 | } | ||
559 | |||
560 | void nilfs_bmap_abort_update_v(struct nilfs_bmap *bmap, | ||
561 | union nilfs_bmap_ptr_req *oldreq, | ||
562 | union nilfs_bmap_ptr_req *newreq) | ||
563 | { | ||
564 | struct inode *dat = nilfs_bmap_get_dat(bmap); | ||
565 | |||
566 | nilfs_dat_abort_end(dat, &oldreq->bpr_req); | ||
567 | nilfs_dat_abort_alloc(dat, &newreq->bpr_req); | ||
568 | } | ||
569 | |||
570 | static struct lock_class_key nilfs_bmap_dat_lock_key; | 459 | static struct lock_class_key nilfs_bmap_dat_lock_key; |
571 | static struct lock_class_key nilfs_bmap_mdt_lock_key; | 460 | static struct lock_class_key nilfs_bmap_mdt_lock_key; |
572 | 461 | ||
diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h index b2890cdcef12..9980d7dbab91 100644 --- a/fs/nilfs2/bmap.h +++ b/fs/nilfs2/bmap.h | |||
@@ -28,6 +28,7 @@ | |||
28 | #include <linux/buffer_head.h> | 28 | #include <linux/buffer_head.h> |
29 | #include <linux/nilfs2_fs.h> | 29 | #include <linux/nilfs2_fs.h> |
30 | #include "alloc.h" | 30 | #include "alloc.h" |
31 | #include "dat.h" | ||
31 | 32 | ||
32 | #define NILFS_BMAP_INVALID_PTR 0 | 33 | #define NILFS_BMAP_INVALID_PTR 0 |
33 | 34 | ||
@@ -141,7 +142,6 @@ struct nilfs_bmap { | |||
141 | int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *); | 142 | int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *); |
142 | int nilfs_bmap_read(struct nilfs_bmap *, struct nilfs_inode *); | 143 | int nilfs_bmap_read(struct nilfs_bmap *, struct nilfs_inode *); |
143 | void nilfs_bmap_write(struct nilfs_bmap *, struct nilfs_inode *); | 144 | void nilfs_bmap_write(struct nilfs_bmap *, struct nilfs_inode *); |
144 | int nilfs_bmap_lookup(struct nilfs_bmap *, unsigned long, unsigned long *); | ||
145 | int nilfs_bmap_lookup_contig(struct nilfs_bmap *, __u64, __u64 *, unsigned); | 145 | int nilfs_bmap_lookup_contig(struct nilfs_bmap *, __u64, __u64 *, unsigned); |
146 | int nilfs_bmap_insert(struct nilfs_bmap *, unsigned long, unsigned long); | 146 | int nilfs_bmap_insert(struct nilfs_bmap *, unsigned long, unsigned long); |
147 | int nilfs_bmap_delete(struct nilfs_bmap *, unsigned long); | 147 | int nilfs_bmap_delete(struct nilfs_bmap *, unsigned long); |
@@ -160,90 +160,76 @@ void nilfs_bmap_init_gcdat(struct nilfs_bmap *, struct nilfs_bmap *); | |||
160 | void nilfs_bmap_commit_gcdat(struct nilfs_bmap *, struct nilfs_bmap *); | 160 | void nilfs_bmap_commit_gcdat(struct nilfs_bmap *, struct nilfs_bmap *); |
161 | 161 | ||
162 | 162 | ||
163 | static inline int nilfs_bmap_lookup(struct nilfs_bmap *bmap, __u64 key, | ||
164 | __u64 *ptr) | ||
165 | { | ||
166 | return nilfs_bmap_lookup_at_level(bmap, key, 1, ptr); | ||
167 | } | ||
168 | |||
163 | /* | 169 | /* |
164 | * Internal use only | 170 | * Internal use only |
165 | */ | 171 | */ |
166 | struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *); | 172 | struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *); |
167 | int nilfs_bmap_prepare_alloc_v(struct nilfs_bmap *, | ||
168 | union nilfs_bmap_ptr_req *); | ||
169 | void nilfs_bmap_commit_alloc_v(struct nilfs_bmap *, | ||
170 | union nilfs_bmap_ptr_req *); | ||
171 | void nilfs_bmap_abort_alloc_v(struct nilfs_bmap *, | ||
172 | union nilfs_bmap_ptr_req *); | ||
173 | 173 | ||
174 | static inline int nilfs_bmap_prepare_alloc_ptr(struct nilfs_bmap *bmap, | 174 | static inline int nilfs_bmap_prepare_alloc_ptr(struct nilfs_bmap *bmap, |
175 | union nilfs_bmap_ptr_req *req) | 175 | union nilfs_bmap_ptr_req *req, |
176 | struct inode *dat) | ||
176 | { | 177 | { |
177 | if (NILFS_BMAP_USE_VBN(bmap)) | 178 | if (dat) |
178 | return nilfs_bmap_prepare_alloc_v(bmap, req); | 179 | return nilfs_dat_prepare_alloc(dat, &req->bpr_req); |
179 | /* ignore target ptr */ | 180 | /* ignore target ptr */ |
180 | req->bpr_ptr = bmap->b_last_allocated_ptr++; | 181 | req->bpr_ptr = bmap->b_last_allocated_ptr++; |
181 | return 0; | 182 | return 0; |
182 | } | 183 | } |
183 | 184 | ||
184 | static inline void nilfs_bmap_commit_alloc_ptr(struct nilfs_bmap *bmap, | 185 | static inline void nilfs_bmap_commit_alloc_ptr(struct nilfs_bmap *bmap, |
185 | union nilfs_bmap_ptr_req *req) | 186 | union nilfs_bmap_ptr_req *req, |
187 | struct inode *dat) | ||
186 | { | 188 | { |
187 | if (NILFS_BMAP_USE_VBN(bmap)) | 189 | if (dat) |
188 | nilfs_bmap_commit_alloc_v(bmap, req); | 190 | nilfs_dat_commit_alloc(dat, &req->bpr_req); |
189 | } | 191 | } |
190 | 192 | ||
191 | static inline void nilfs_bmap_abort_alloc_ptr(struct nilfs_bmap *bmap, | 193 | static inline void nilfs_bmap_abort_alloc_ptr(struct nilfs_bmap *bmap, |
192 | union nilfs_bmap_ptr_req *req) | 194 | union nilfs_bmap_ptr_req *req, |
195 | struct inode *dat) | ||
193 | { | 196 | { |
194 | if (NILFS_BMAP_USE_VBN(bmap)) | 197 | if (dat) |
195 | nilfs_bmap_abort_alloc_v(bmap, req); | 198 | nilfs_dat_abort_alloc(dat, &req->bpr_req); |
196 | else | 199 | else |
197 | bmap->b_last_allocated_ptr--; | 200 | bmap->b_last_allocated_ptr--; |
198 | } | 201 | } |
199 | 202 | ||
200 | int nilfs_bmap_prepare_end_v(struct nilfs_bmap *, union nilfs_bmap_ptr_req *); | ||
201 | void nilfs_bmap_commit_end_v(struct nilfs_bmap *, union nilfs_bmap_ptr_req *); | ||
202 | void nilfs_bmap_abort_end_v(struct nilfs_bmap *, union nilfs_bmap_ptr_req *); | ||
203 | |||
204 | static inline int nilfs_bmap_prepare_end_ptr(struct nilfs_bmap *bmap, | 203 | static inline int nilfs_bmap_prepare_end_ptr(struct nilfs_bmap *bmap, |
205 | union nilfs_bmap_ptr_req *req) | 204 | union nilfs_bmap_ptr_req *req, |
205 | struct inode *dat) | ||
206 | { | 206 | { |
207 | return NILFS_BMAP_USE_VBN(bmap) ? | 207 | return dat ? nilfs_dat_prepare_end(dat, &req->bpr_req) : 0; |
208 | nilfs_bmap_prepare_end_v(bmap, req) : 0; | ||
209 | } | 208 | } |
210 | 209 | ||
211 | static inline void nilfs_bmap_commit_end_ptr(struct nilfs_bmap *bmap, | 210 | static inline void nilfs_bmap_commit_end_ptr(struct nilfs_bmap *bmap, |
212 | union nilfs_bmap_ptr_req *req) | 211 | union nilfs_bmap_ptr_req *req, |
212 | struct inode *dat) | ||
213 | { | 213 | { |
214 | if (NILFS_BMAP_USE_VBN(bmap)) | 214 | if (dat) |
215 | nilfs_bmap_commit_end_v(bmap, req); | 215 | nilfs_dat_commit_end(dat, &req->bpr_req, |
216 | bmap->b_ptr_type == NILFS_BMAP_PTR_VS); | ||
216 | } | 217 | } |
217 | 218 | ||
218 | static inline void nilfs_bmap_abort_end_ptr(struct nilfs_bmap *bmap, | 219 | static inline void nilfs_bmap_abort_end_ptr(struct nilfs_bmap *bmap, |
219 | union nilfs_bmap_ptr_req *req) | 220 | union nilfs_bmap_ptr_req *req, |
221 | struct inode *dat) | ||
220 | { | 222 | { |
221 | if (NILFS_BMAP_USE_VBN(bmap)) | 223 | if (dat) |
222 | nilfs_bmap_abort_end_v(bmap, req); | 224 | nilfs_dat_abort_end(dat, &req->bpr_req); |
223 | } | 225 | } |
224 | 226 | ||
225 | int nilfs_bmap_start_v(struct nilfs_bmap *, union nilfs_bmap_ptr_req *, | ||
226 | sector_t); | ||
227 | int nilfs_bmap_move_v(const struct nilfs_bmap *, __u64, sector_t); | ||
228 | int nilfs_bmap_mark_dirty(const struct nilfs_bmap *, __u64); | ||
229 | |||
230 | |||
231 | __u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *, | 227 | __u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *, |
232 | const struct buffer_head *); | 228 | const struct buffer_head *); |
233 | 229 | ||
234 | __u64 nilfs_bmap_find_target_seq(const struct nilfs_bmap *, __u64); | 230 | __u64 nilfs_bmap_find_target_seq(const struct nilfs_bmap *, __u64); |
235 | __u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *); | 231 | __u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *); |
236 | 232 | ||
237 | int nilfs_bmap_prepare_update_v(struct nilfs_bmap *, | ||
238 | union nilfs_bmap_ptr_req *, | ||
239 | union nilfs_bmap_ptr_req *); | ||
240 | void nilfs_bmap_commit_update_v(struct nilfs_bmap *, | ||
241 | union nilfs_bmap_ptr_req *, | ||
242 | union nilfs_bmap_ptr_req *); | ||
243 | void nilfs_bmap_abort_update_v(struct nilfs_bmap *, | ||
244 | union nilfs_bmap_ptr_req *, | ||
245 | union nilfs_bmap_ptr_req *); | ||
246 | |||
247 | void nilfs_bmap_add_blocks(const struct nilfs_bmap *, int); | 233 | void nilfs_bmap_add_blocks(const struct nilfs_bmap *, int); |
248 | void nilfs_bmap_sub_blocks(const struct nilfs_bmap *, int); | 234 | void nilfs_bmap_sub_blocks(const struct nilfs_bmap *, int); |
249 | 235 | ||
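[Editor's note] The bmap.h rework above removes the NILFS_BMAP_USE_VBN() test from every prepare/commit/abort helper: the caller resolves the DAT inode once (or passes NULL for bmaps that store direct block numbers) and the helpers just branch on the handle. That deletes eight exported *_v wrapper functions from bmap.c in one stroke. A toy version of the "nullable backend handle" shape:

    #include <stdio.h>

    struct dat { unsigned long next_vblock; };

    static int prepare_alloc(struct dat *dat, unsigned long *ptr,
                             unsigned long *last_direct)
    {
        if (dat) {                    /* virtual block numbers go via the DAT */
            *ptr = dat->next_vblock++;
            return 0;
        }
        *ptr = (*last_direct)++;      /* direct bmaps skip the translation */
        return 0;
    }

    int main(void)
    {
        struct dat dat = { .next_vblock = 100 };
        unsigned long last = 7, p;

        prepare_alloc(&dat, &p, &last);   /* DAT-backed bmap */
        printf("virtual ptr = %lu\n", p);
        prepare_alloc(NULL, &p, &last);   /* direct bmap passes NULL */
        printf("direct ptr  = %lu\n", p);
        return 0;
    }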
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index 7e0b61be212e..c668bca579c1 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c | |||
@@ -209,6 +209,7 @@ int nilfs_btnode_prepare_change_key(struct address_space *btnc, | |||
209 | * We cannot call radix_tree_preload for the kernels older | 209 | * We cannot call radix_tree_preload for the kernels older |
210 | * than 2.6.23, because it is not exported for modules. | 210 | * than 2.6.23, because it is not exported for modules. |
211 | */ | 211 | */ |
212 | retry: | ||
212 | err = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); | 213 | err = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM); |
213 | if (err) | 214 | if (err) |
214 | goto failed_unlock; | 215 | goto failed_unlock; |
@@ -219,7 +220,6 @@ int nilfs_btnode_prepare_change_key(struct address_space *btnc, | |||
219 | (unsigned long long)oldkey, | 220 | (unsigned long long)oldkey, |
220 | (unsigned long long)newkey); | 221 | (unsigned long long)newkey); |
221 | 222 | ||
222 | retry: | ||
223 | spin_lock_irq(&btnc->tree_lock); | 223 | spin_lock_irq(&btnc->tree_lock); |
224 | err = radix_tree_insert(&btnc->page_tree, newkey, obh->b_page); | 224 | err = radix_tree_insert(&btnc->page_tree, newkey, obh->b_page); |
225 | spin_unlock_irq(&btnc->tree_lock); | 225 | spin_unlock_irq(&btnc->tree_lock); |
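[Editor's note] The btnode.c change above moves the "retry:" label from below radix_tree_preload() to above it, so the preload runs again on every retry, presumably because each insertion attempt consumes or releases the preload reservation and jumping back past it would retry with no preallocated nodes. The generic shape, with toy stand-ins for the preload and insert steps:

    #include <stdio.h>

    static int preload(void)            { puts("preload");  return 0; }
    static void preload_end(void)       { puts("preload_end"); }
    static int try_insert(int *left)    { return (*left)-- > 0 ? -17 : 0; }
    static void resolve_collision(void) { puts("flush colliding entry"); }

    static int insert_with_retry(void)
    {
        int collisions = 2;
        int err;

    retry:                  /* the reservation lives inside the loop */
        err = preload();
        if (err)
            return err;
        err = try_insert(&collisions);
        preload_end();
        if (err == -17) {   /* -EEXIST: old entry still in the tree */
            resolve_collision();
            goto retry;     /* must re-preload before the next try */
        }
        return err;
    }

    int main(void)
    {
        return insert_with_retry();
    }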
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index aa412724b64e..e25b507a474f 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c | |||
@@ -71,21 +71,17 @@ void nilfs_btree_path_cache_destroy(void) | |||
71 | kmem_cache_destroy(nilfs_btree_path_cache); | 71 | kmem_cache_destroy(nilfs_btree_path_cache); |
72 | } | 72 | } |
73 | 73 | ||
74 | static inline struct nilfs_btree_path * | 74 | static inline struct nilfs_btree_path *nilfs_btree_alloc_path(void) |
75 | nilfs_btree_alloc_path(const struct nilfs_btree *btree) | ||
76 | { | 75 | { |
77 | return (struct nilfs_btree_path *) | 76 | return kmem_cache_alloc(nilfs_btree_path_cache, GFP_NOFS); |
78 | kmem_cache_alloc(nilfs_btree_path_cache, GFP_NOFS); | ||
79 | } | 77 | } |
80 | 78 | ||
81 | static inline void nilfs_btree_free_path(const struct nilfs_btree *btree, | 79 | static inline void nilfs_btree_free_path(struct nilfs_btree_path *path) |
82 | struct nilfs_btree_path *path) | ||
83 | { | 80 | { |
84 | kmem_cache_free(nilfs_btree_path_cache, path); | 81 | kmem_cache_free(nilfs_btree_path_cache, path); |
85 | } | 82 | } |
86 | 83 | ||
87 | static void nilfs_btree_init_path(const struct nilfs_btree *btree, | 84 | static void nilfs_btree_init_path(struct nilfs_btree_path *path) |
88 | struct nilfs_btree_path *path) | ||
89 | { | 85 | { |
90 | int level; | 86 | int level; |
91 | 87 | ||
@@ -101,26 +97,13 @@ static void nilfs_btree_init_path(const struct nilfs_btree *btree, | |||
101 | } | 97 | } |
102 | } | 98 | } |
103 | 99 | ||
104 | static void nilfs_btree_clear_path(const struct nilfs_btree *btree, | 100 | static void nilfs_btree_release_path(struct nilfs_btree_path *path) |
105 | struct nilfs_btree_path *path) | ||
106 | { | 101 | { |
107 | int level; | 102 | int level; |
108 | 103 | ||
109 | for (level = NILFS_BTREE_LEVEL_DATA; | 104 | for (level = NILFS_BTREE_LEVEL_DATA; level < NILFS_BTREE_LEVEL_MAX; |
110 | level < NILFS_BTREE_LEVEL_MAX; | 105 | level++) |
111 | level++) { | 106 | brelse(path[level].bp_bh); |
112 | if (path[level].bp_bh != NULL) { | ||
113 | brelse(path[level].bp_bh); | ||
114 | path[level].bp_bh = NULL; | ||
115 | } | ||
116 | /* sib_bh is released or deleted by prepare or commit | ||
117 | * operations. */ | ||
118 | path[level].bp_sib_bh = NULL; | ||
119 | path[level].bp_index = 0; | ||
120 | path[level].bp_oldreq.bpr_ptr = NILFS_BMAP_INVALID_PTR; | ||
121 | path[level].bp_newreq.bpr_ptr = NILFS_BMAP_INVALID_PTR; | ||
122 | path[level].bp_op = NULL; | ||
123 | } | ||
124 | } | 107 | } |
125 | 108 | ||
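[Editor's note] The nilfs_btree_release_path() rewrite above works because brelse() is a no-op on a NULL buffer_head, so each level can be released unconditionally; the per-field re-initialisation goes away too, presumably because a path is now freed right after release rather than reused. free() gives the same NULL-tolerant idiom in user space:

    #include <stdlib.h>

    #define LEVEL_MAX 14   /* toy constant, not NILFS_BTREE_LEVEL_MAX */

    struct path_level { char *bh; };

    static void release_path(struct path_level *path)
    {
        /* free(NULL) is defined to do nothing, so no per-level check */
        for (int level = 0; level < LEVEL_MAX; level++)
            free(path[level].bh);
    }

    int main(void)
    {
        struct path_level path[LEVEL_MAX] = { 0 };

        path[3].bh = malloc(16);   /* only some levels hold a buffer */
        release_path(path);
        return 0;
    }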
126 | /* | 109 | /* |
@@ -148,129 +131,110 @@ static int nilfs_btree_get_new_block(const struct nilfs_btree *btree, | |||
148 | } | 131 | } |
149 | 132 | ||
150 | static inline int | 133 | static inline int |
151 | nilfs_btree_node_get_flags(const struct nilfs_btree *btree, | 134 | nilfs_btree_node_get_flags(const struct nilfs_btree_node *node) |
152 | const struct nilfs_btree_node *node) | ||
153 | { | 135 | { |
154 | return node->bn_flags; | 136 | return node->bn_flags; |
155 | } | 137 | } |
156 | 138 | ||
157 | static inline void | 139 | static inline void |
158 | nilfs_btree_node_set_flags(struct nilfs_btree *btree, | 140 | nilfs_btree_node_set_flags(struct nilfs_btree_node *node, int flags) |
159 | struct nilfs_btree_node *node, | ||
160 | int flags) | ||
161 | { | 141 | { |
162 | node->bn_flags = flags; | 142 | node->bn_flags = flags; |
163 | } | 143 | } |
164 | 144 | ||
165 | static inline int nilfs_btree_node_root(const struct nilfs_btree *btree, | 145 | static inline int nilfs_btree_node_root(const struct nilfs_btree_node *node) |
166 | const struct nilfs_btree_node *node) | ||
167 | { | 146 | { |
168 | return nilfs_btree_node_get_flags(btree, node) & NILFS_BTREE_NODE_ROOT; | 147 | return nilfs_btree_node_get_flags(node) & NILFS_BTREE_NODE_ROOT; |
169 | } | 148 | } |
170 | 149 | ||
171 | static inline int | 150 | static inline int |
172 | nilfs_btree_node_get_level(const struct nilfs_btree *btree, | 151 | nilfs_btree_node_get_level(const struct nilfs_btree_node *node) |
173 | const struct nilfs_btree_node *node) | ||
174 | { | 152 | { |
175 | return node->bn_level; | 153 | return node->bn_level; |
176 | } | 154 | } |
177 | 155 | ||
178 | static inline void | 156 | static inline void |
179 | nilfs_btree_node_set_level(struct nilfs_btree *btree, | 157 | nilfs_btree_node_set_level(struct nilfs_btree_node *node, int level) |
180 | struct nilfs_btree_node *node, | ||
181 | int level) | ||
182 | { | 158 | { |
183 | node->bn_level = level; | 159 | node->bn_level = level; |
184 | } | 160 | } |
185 | 161 | ||
186 | static inline int | 162 | static inline int |
187 | nilfs_btree_node_get_nchildren(const struct nilfs_btree *btree, | 163 | nilfs_btree_node_get_nchildren(const struct nilfs_btree_node *node) |
188 | const struct nilfs_btree_node *node) | ||
189 | { | 164 | { |
190 | return le16_to_cpu(node->bn_nchildren); | 165 | return le16_to_cpu(node->bn_nchildren); |
191 | } | 166 | } |
192 | 167 | ||
193 | static inline void | 168 | static inline void |
194 | nilfs_btree_node_set_nchildren(struct nilfs_btree *btree, | 169 | nilfs_btree_node_set_nchildren(struct nilfs_btree_node *node, int nchildren) |
195 | struct nilfs_btree_node *node, | ||
196 | int nchildren) | ||
197 | { | 170 | { |
198 | node->bn_nchildren = cpu_to_le16(nchildren); | 171 | node->bn_nchildren = cpu_to_le16(nchildren); |
199 | } | 172 | } |
200 | 173 | ||
201 | static inline int | 174 | static inline int nilfs_btree_node_size(const struct nilfs_btree *btree) |
202 | nilfs_btree_node_size(const struct nilfs_btree *btree) | ||
203 | { | 175 | { |
204 | return 1 << btree->bt_bmap.b_inode->i_blkbits; | 176 | return 1 << btree->bt_bmap.b_inode->i_blkbits; |
205 | } | 177 | } |
206 | 178 | ||
207 | static inline int | 179 | static inline int |
208 | nilfs_btree_node_nchildren_min(const struct nilfs_btree *btree, | 180 | nilfs_btree_node_nchildren_min(const struct nilfs_btree_node *node, |
209 | const struct nilfs_btree_node *node) | 181 | const struct nilfs_btree *btree) |
210 | { | 182 | { |
211 | return nilfs_btree_node_root(btree, node) ? | 183 | return nilfs_btree_node_root(node) ? |
212 | NILFS_BTREE_ROOT_NCHILDREN_MIN : | 184 | NILFS_BTREE_ROOT_NCHILDREN_MIN : |
213 | NILFS_BTREE_NODE_NCHILDREN_MIN(nilfs_btree_node_size(btree)); | 185 | NILFS_BTREE_NODE_NCHILDREN_MIN(nilfs_btree_node_size(btree)); |
214 | } | 186 | } |
215 | 187 | ||
216 | static inline int | 188 | static inline int |
217 | nilfs_btree_node_nchildren_max(const struct nilfs_btree *btree, | 189 | nilfs_btree_node_nchildren_max(const struct nilfs_btree_node *node, |
218 | const struct nilfs_btree_node *node) | 190 | const struct nilfs_btree *btree) |
219 | { | 191 | { |
220 | return nilfs_btree_node_root(btree, node) ? | 192 | return nilfs_btree_node_root(node) ? |
221 | NILFS_BTREE_ROOT_NCHILDREN_MAX : | 193 | NILFS_BTREE_ROOT_NCHILDREN_MAX : |
222 | NILFS_BTREE_NODE_NCHILDREN_MAX(nilfs_btree_node_size(btree)); | 194 | NILFS_BTREE_NODE_NCHILDREN_MAX(nilfs_btree_node_size(btree)); |
223 | } | 195 | } |
224 | 196 | ||
225 | static inline __le64 * | 197 | static inline __le64 * |
226 | nilfs_btree_node_dkeys(const struct nilfs_btree *btree, | 198 | nilfs_btree_node_dkeys(const struct nilfs_btree_node *node) |
227 | const struct nilfs_btree_node *node) | ||
228 | { | 199 | { |
229 | return (__le64 *)((char *)(node + 1) + | 200 | return (__le64 *)((char *)(node + 1) + |
230 | (nilfs_btree_node_root(btree, node) ? | 201 | (nilfs_btree_node_root(node) ? |
231 | 0 : NILFS_BTREE_NODE_EXTRA_PAD_SIZE)); | 202 | 0 : NILFS_BTREE_NODE_EXTRA_PAD_SIZE)); |
232 | } | 203 | } |
233 | 204 | ||
234 | static inline __le64 * | 205 | static inline __le64 * |
235 | nilfs_btree_node_dptrs(const struct nilfs_btree *btree, | 206 | nilfs_btree_node_dptrs(const struct nilfs_btree_node *node, |
236 | const struct nilfs_btree_node *node) | 207 | const struct nilfs_btree *btree) |
237 | { | 208 | { |
238 | return (__le64 *)(nilfs_btree_node_dkeys(btree, node) + | 209 | return (__le64 *)(nilfs_btree_node_dkeys(node) + |
239 | nilfs_btree_node_nchildren_max(btree, node)); | 210 | nilfs_btree_node_nchildren_max(node, btree)); |
240 | } | 211 | } |
241 | 212 | ||
242 | static inline __u64 | 213 | static inline __u64 |
243 | nilfs_btree_node_get_key(const struct nilfs_btree *btree, | 214 | nilfs_btree_node_get_key(const struct nilfs_btree_node *node, int index) |
244 | const struct nilfs_btree_node *node, int index) | ||
245 | { | 215 | { |
246 | return nilfs_bmap_dkey_to_key(*(nilfs_btree_node_dkeys(btree, node) + | 216 | return nilfs_bmap_dkey_to_key(*(nilfs_btree_node_dkeys(node) + index)); |
247 | index)); | ||
248 | } | 217 | } |
249 | 218 | ||
250 | static inline void | 219 | static inline void |
251 | nilfs_btree_node_set_key(struct nilfs_btree *btree, | 220 | nilfs_btree_node_set_key(struct nilfs_btree_node *node, int index, __u64 key) |
252 | struct nilfs_btree_node *node, int index, __u64 key) | ||
253 | { | 221 | { |
254 | *(nilfs_btree_node_dkeys(btree, node) + index) = | 222 | *(nilfs_btree_node_dkeys(node) + index) = nilfs_bmap_key_to_dkey(key); |
255 | nilfs_bmap_key_to_dkey(key); | ||
256 | } | 223 | } |
257 | 224 | ||
258 | static inline __u64 | 225 | static inline __u64 |
259 | nilfs_btree_node_get_ptr(const struct nilfs_btree *btree, | 226 | nilfs_btree_node_get_ptr(const struct nilfs_btree *btree, |
260 | const struct nilfs_btree_node *node, | 227 | const struct nilfs_btree_node *node, int index) |
261 | int index) | ||
262 | { | 228 | { |
263 | return nilfs_bmap_dptr_to_ptr(*(nilfs_btree_node_dptrs(btree, node) + | 229 | return nilfs_bmap_dptr_to_ptr(*(nilfs_btree_node_dptrs(node, btree) + |
264 | index)); | 230 | index)); |
265 | } | 231 | } |
266 | 232 | ||
267 | static inline void | 233 | static inline void |
268 | nilfs_btree_node_set_ptr(struct nilfs_btree *btree, | 234 | nilfs_btree_node_set_ptr(struct nilfs_btree *btree, |
269 | struct nilfs_btree_node *node, | 235 | struct nilfs_btree_node *node, int index, __u64 ptr) |
270 | int index, | ||
271 | __u64 ptr) | ||
272 | { | 236 | { |
273 | *(nilfs_btree_node_dptrs(btree, node) + index) = | 237 | *(nilfs_btree_node_dptrs(node, btree) + index) = |
274 | nilfs_bmap_ptr_to_dptr(ptr); | 238 | nilfs_bmap_ptr_to_dptr(ptr); |
275 | } | 239 | } |
276 | 240 | ||
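The hunks above strip the now-redundant btree argument from the node accessors: helpers that touch only the on-disk node header or its key array take the node alone, while nilfs_btree_node_dptrs() and the get/set-ptr helpers keep the btree because the pointer array's offset depends on the block size. A minimal sketch of the resulting calling convention, using only signatures visible in this diff (the demo helper itself is hypothetical):

	/* Hypothetical helper illustrating the new accessor split: most
	 * calls need only the node; the btree is passed solely where the
	 * block-size-dependent pointer offset is required. */
	static int demo_first_entry(const struct nilfs_btree *btree,
				    const struct nilfs_btree_node *node,
				    __u64 *keyp, __u64 *ptrp)
	{
		if (nilfs_btree_node_get_nchildren(node) == 0)
			return -ENOENT;
		*keyp = nilfs_btree_node_get_key(node, 0);	  /* node only */
		*ptrp = nilfs_btree_node_get_ptr(btree, node, 0); /* needs btree */
		return 0;
	}
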
@@ -283,12 +247,12 @@ static void nilfs_btree_node_init(struct nilfs_btree *btree, | |||
283 | __le64 *dptrs; | 247 | __le64 *dptrs; |
284 | int i; | 248 | int i; |
285 | 249 | ||
286 | nilfs_btree_node_set_flags(btree, node, flags); | 250 | nilfs_btree_node_set_flags(node, flags); |
287 | nilfs_btree_node_set_level(btree, node, level); | 251 | nilfs_btree_node_set_level(node, level); |
288 | nilfs_btree_node_set_nchildren(btree, node, nchildren); | 252 | nilfs_btree_node_set_nchildren(node, nchildren); |
289 | 253 | ||
290 | dkeys = nilfs_btree_node_dkeys(btree, node); | 254 | dkeys = nilfs_btree_node_dkeys(node); |
291 | dptrs = nilfs_btree_node_dptrs(btree, node); | 255 | dptrs = nilfs_btree_node_dptrs(node, btree); |
292 | for (i = 0; i < nchildren; i++) { | 256 | for (i = 0; i < nchildren; i++) { |
293 | dkeys[i] = nilfs_bmap_key_to_dkey(keys[i]); | 257 | dkeys[i] = nilfs_bmap_key_to_dkey(keys[i]); |
294 | dptrs[i] = nilfs_bmap_ptr_to_dptr(ptrs[i]); | 258 | dptrs[i] = nilfs_bmap_ptr_to_dptr(ptrs[i]); |
@@ -305,13 +269,13 @@ static void nilfs_btree_node_move_left(struct nilfs_btree *btree, | |||
305 | __le64 *ldptrs, *rdptrs; | 269 | __le64 *ldptrs, *rdptrs; |
306 | int lnchildren, rnchildren; | 270 | int lnchildren, rnchildren; |
307 | 271 | ||
308 | ldkeys = nilfs_btree_node_dkeys(btree, left); | 272 | ldkeys = nilfs_btree_node_dkeys(left); |
309 | ldptrs = nilfs_btree_node_dptrs(btree, left); | 273 | ldptrs = nilfs_btree_node_dptrs(left, btree); |
310 | lnchildren = nilfs_btree_node_get_nchildren(btree, left); | 274 | lnchildren = nilfs_btree_node_get_nchildren(left); |
311 | 275 | ||
312 | rdkeys = nilfs_btree_node_dkeys(btree, right); | 276 | rdkeys = nilfs_btree_node_dkeys(right); |
313 | rdptrs = nilfs_btree_node_dptrs(btree, right); | 277 | rdptrs = nilfs_btree_node_dptrs(right, btree); |
314 | rnchildren = nilfs_btree_node_get_nchildren(btree, right); | 278 | rnchildren = nilfs_btree_node_get_nchildren(right); |
315 | 279 | ||
316 | memcpy(ldkeys + lnchildren, rdkeys, n * sizeof(*rdkeys)); | 280 | memcpy(ldkeys + lnchildren, rdkeys, n * sizeof(*rdkeys)); |
317 | memcpy(ldptrs + lnchildren, rdptrs, n * sizeof(*rdptrs)); | 281 | memcpy(ldptrs + lnchildren, rdptrs, n * sizeof(*rdptrs)); |
@@ -320,8 +284,8 @@ static void nilfs_btree_node_move_left(struct nilfs_btree *btree, | |||
320 | 284 | ||
321 | lnchildren += n; | 285 | lnchildren += n; |
322 | rnchildren -= n; | 286 | rnchildren -= n; |
323 | nilfs_btree_node_set_nchildren(btree, left, lnchildren); | 287 | nilfs_btree_node_set_nchildren(left, lnchildren); |
324 | nilfs_btree_node_set_nchildren(btree, right, rnchildren); | 288 | nilfs_btree_node_set_nchildren(right, rnchildren); |
325 | } | 289 | } |
326 | 290 | ||
327 | /* Assume that the buffer heads corresponding to left and right are locked. */ | 291 | /* Assume that the buffer heads corresponding to left and right are locked. */ |
@@ -334,13 +298,13 @@ static void nilfs_btree_node_move_right(struct nilfs_btree *btree, | |||
334 | __le64 *ldptrs, *rdptrs; | 298 | __le64 *ldptrs, *rdptrs; |
335 | int lnchildren, rnchildren; | 299 | int lnchildren, rnchildren; |
336 | 300 | ||
337 | ldkeys = nilfs_btree_node_dkeys(btree, left); | 301 | ldkeys = nilfs_btree_node_dkeys(left); |
338 | ldptrs = nilfs_btree_node_dptrs(btree, left); | 302 | ldptrs = nilfs_btree_node_dptrs(left, btree); |
339 | lnchildren = nilfs_btree_node_get_nchildren(btree, left); | 303 | lnchildren = nilfs_btree_node_get_nchildren(left); |
340 | 304 | ||
341 | rdkeys = nilfs_btree_node_dkeys(btree, right); | 305 | rdkeys = nilfs_btree_node_dkeys(right); |
342 | rdptrs = nilfs_btree_node_dptrs(btree, right); | 306 | rdptrs = nilfs_btree_node_dptrs(right, btree); |
343 | rnchildren = nilfs_btree_node_get_nchildren(btree, right); | 307 | rnchildren = nilfs_btree_node_get_nchildren(right); |
344 | 308 | ||
345 | memmove(rdkeys + n, rdkeys, rnchildren * sizeof(*rdkeys)); | 309 | memmove(rdkeys + n, rdkeys, rnchildren * sizeof(*rdkeys)); |
346 | memmove(rdptrs + n, rdptrs, rnchildren * sizeof(*rdptrs)); | 310 | memmove(rdptrs + n, rdptrs, rnchildren * sizeof(*rdptrs)); |
@@ -349,8 +313,8 @@ static void nilfs_btree_node_move_right(struct nilfs_btree *btree, | |||
349 | 313 | ||
350 | lnchildren -= n; | 314 | lnchildren -= n; |
351 | rnchildren += n; | 315 | rnchildren += n; |
352 | nilfs_btree_node_set_nchildren(btree, left, lnchildren); | 316 | nilfs_btree_node_set_nchildren(left, lnchildren); |
353 | nilfs_btree_node_set_nchildren(btree, right, rnchildren); | 317 | nilfs_btree_node_set_nchildren(right, rnchildren); |
354 | } | 318 | } |
355 | 319 | ||
356 | /* Assume that the buffer head corresponding to node is locked. */ | 320 | /* Assume that the buffer head corresponding to node is locked. */ |
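Both move helpers treat a node as two parallel arrays (keys and pointers) and rebalance by block-copying n entries between siblings, then fixing the two child counts. Below is a standalone sketch of the left shift on bare arrays; the gap-closing copies fall outside the hunk shown above, so their exact form here is an assumption:

	#include <string.h>

	/* Sketch: move the first n entries of the right node onto the end
	 * of the left node, as nilfs_btree_node_move_left() does. */
	static void demo_move_left(__u64 *lkeys, __u64 *lptrs, int *lnp,
				   __u64 *rkeys, __u64 *rptrs, int *rnp, int n)
	{
		memcpy(lkeys + *lnp, rkeys, n * sizeof(*rkeys));
		memcpy(lptrs + *lnp, rptrs, n * sizeof(*rptrs));
		/* close the gap left in the right node (assumed memmove) */
		memmove(rkeys, rkeys + n, (*rnp - n) * sizeof(*rkeys));
		memmove(rptrs, rptrs + n, (*rnp - n) * sizeof(*rptrs));
		*lnp += n;
		*rnp -= n;
	}
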
@@ -362,9 +326,9 @@ static void nilfs_btree_node_insert(struct nilfs_btree *btree, | |||
362 | __le64 *dptrs; | 326 | __le64 *dptrs; |
363 | int nchildren; | 327 | int nchildren; |
364 | 328 | ||
365 | dkeys = nilfs_btree_node_dkeys(btree, node); | 329 | dkeys = nilfs_btree_node_dkeys(node); |
366 | dptrs = nilfs_btree_node_dptrs(btree, node); | 330 | dptrs = nilfs_btree_node_dptrs(node, btree); |
367 | nchildren = nilfs_btree_node_get_nchildren(btree, node); | 331 | nchildren = nilfs_btree_node_get_nchildren(node); |
368 | if (index < nchildren) { | 332 | if (index < nchildren) { |
369 | memmove(dkeys + index + 1, dkeys + index, | 333 | memmove(dkeys + index + 1, dkeys + index, |
370 | (nchildren - index) * sizeof(*dkeys)); | 334 | (nchildren - index) * sizeof(*dkeys)); |
@@ -374,7 +338,7 @@ static void nilfs_btree_node_insert(struct nilfs_btree *btree, | |||
374 | dkeys[index] = nilfs_bmap_key_to_dkey(key); | 338 | dkeys[index] = nilfs_bmap_key_to_dkey(key); |
375 | dptrs[index] = nilfs_bmap_ptr_to_dptr(ptr); | 339 | dptrs[index] = nilfs_bmap_ptr_to_dptr(ptr); |
376 | nchildren++; | 340 | nchildren++; |
377 | nilfs_btree_node_set_nchildren(btree, node, nchildren); | 341 | nilfs_btree_node_set_nchildren(node, nchildren); |
378 | } | 342 | } |
379 | 343 | ||
380 | /* Assume that the buffer head corresponding to node is locked. */ | 344 | /* Assume that the buffer head corresponding to node is locked. */ |
@@ -388,11 +352,11 @@ static void nilfs_btree_node_delete(struct nilfs_btree *btree, | |||
388 | __le64 *dptrs; | 352 | __le64 *dptrs; |
389 | int nchildren; | 353 | int nchildren; |
390 | 354 | ||
391 | dkeys = nilfs_btree_node_dkeys(btree, node); | 355 | dkeys = nilfs_btree_node_dkeys(node); |
392 | dptrs = nilfs_btree_node_dptrs(btree, node); | 356 | dptrs = nilfs_btree_node_dptrs(node, btree); |
393 | key = nilfs_bmap_dkey_to_key(dkeys[index]); | 357 | key = nilfs_bmap_dkey_to_key(dkeys[index]); |
394 | ptr = nilfs_bmap_dptr_to_ptr(dptrs[index]); | 358 | ptr = nilfs_bmap_dptr_to_ptr(dptrs[index]); |
395 | nchildren = nilfs_btree_node_get_nchildren(btree, node); | 359 | nchildren = nilfs_btree_node_get_nchildren(node); |
396 | if (keyp != NULL) | 360 | if (keyp != NULL) |
397 | *keyp = key; | 361 | *keyp = key; |
398 | if (ptrp != NULL) | 362 | if (ptrp != NULL) |
@@ -405,11 +369,10 @@ static void nilfs_btree_node_delete(struct nilfs_btree *btree, | |||
405 | (nchildren - index - 1) * sizeof(*dptrs)); | 369 | (nchildren - index - 1) * sizeof(*dptrs)); |
406 | } | 370 | } |
407 | nchildren--; | 371 | nchildren--; |
408 | nilfs_btree_node_set_nchildren(btree, node, nchildren); | 372 | nilfs_btree_node_set_nchildren(node, nchildren); |
409 | } | 373 | } |
410 | 374 | ||
411 | static int nilfs_btree_node_lookup(const struct nilfs_btree *btree, | 375 | static int nilfs_btree_node_lookup(const struct nilfs_btree_node *node, |
412 | const struct nilfs_btree_node *node, | ||
413 | __u64 key, int *indexp) | 376 | __u64 key, int *indexp) |
414 | { | 377 | { |
415 | __u64 nkey; | 378 | __u64 nkey; |
@@ -417,12 +380,12 @@ static int nilfs_btree_node_lookup(const struct nilfs_btree *btree, | |||
417 | 380 | ||
418 | /* binary search */ | 381 | /* binary search */ |
419 | low = 0; | 382 | low = 0; |
420 | high = nilfs_btree_node_get_nchildren(btree, node) - 1; | 383 | high = nilfs_btree_node_get_nchildren(node) - 1; |
421 | index = 0; | 384 | index = 0; |
422 | s = 0; | 385 | s = 0; |
423 | while (low <= high) { | 386 | while (low <= high) { |
424 | index = (low + high) / 2; | 387 | index = (low + high) / 2; |
425 | nkey = nilfs_btree_node_get_key(btree, node, index); | 388 | nkey = nilfs_btree_node_get_key(node, index); |
426 | if (nkey == key) { | 389 | if (nkey == key) { |
427 | s = 0; | 390 | s = 0; |
428 | goto out; | 391 | goto out; |
@@ -436,9 +399,8 @@ static int nilfs_btree_node_lookup(const struct nilfs_btree *btree, | |||
436 | } | 399 | } |
437 | 400 | ||
438 | /* adjust index */ | 401 | /* adjust index */ |
439 | if (nilfs_btree_node_get_level(btree, node) > | 402 | if (nilfs_btree_node_get_level(node) > NILFS_BTREE_LEVEL_NODE_MIN) { |
440 | NILFS_BTREE_LEVEL_NODE_MIN) { | 403 | if (s > 0 && index > 0) |
441 | if ((s > 0) && (index > 0)) | ||
442 | index--; | 404 | index--; |
443 | } else if (s < 0) | 405 | } else if (s < 0) |
444 | index++; | 406 | index++; |
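nilfs_btree_node_lookup() is a plain binary search over the node's sorted key array, followed by an index fix-up: interior nodes step back to the child whose range covers the key, while the leaf level steps forward to the insertion point. A self-contained sketch of the same logic on a bare array, where the interior flag stands in for the level test against NILFS_BTREE_LEVEL_NODE_MIN (the elided else branches of the search are reconstructed here and are an assumption):

	/* Sketch of the search-and-adjust logic above; returns nonzero on
	 * an exact match and leaves the chosen index in *indexp. */
	static int demo_node_lookup(const __u64 *keys, int nchildren,
				    int interior, __u64 key, int *indexp)
	{
		int low = 0, high = nchildren - 1, index = 0, s = 0;

		while (low <= high) {
			index = (low + high) / 2;
			if (keys[index] == key) {
				*indexp = index;
				return 1;
			} else if (keys[index] < key) {
				s = -1;			/* probe below key */
				low = index + 1;
			} else {
				s = 1;			/* probe above key */
				high = index - 1;
			}
		}
		if (interior) {
			if (s > 0 && index > 0)
				index--;	/* descend covering child */
		} else if (s < 0) {
			index++;		/* leaf insertion point */
		}
		*indexp = index;
		return 0;
	}
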
@@ -456,25 +418,20 @@ nilfs_btree_get_root(const struct nilfs_btree *btree) | |||
456 | } | 418 | } |
457 | 419 | ||
458 | static inline struct nilfs_btree_node * | 420 | static inline struct nilfs_btree_node * |
459 | nilfs_btree_get_nonroot_node(const struct nilfs_btree *btree, | 421 | nilfs_btree_get_nonroot_node(const struct nilfs_btree_path *path, int level) |
460 | const struct nilfs_btree_path *path, | ||
461 | int level) | ||
462 | { | 422 | { |
463 | return (struct nilfs_btree_node *)path[level].bp_bh->b_data; | 423 | return (struct nilfs_btree_node *)path[level].bp_bh->b_data; |
464 | } | 424 | } |
465 | 425 | ||
466 | static inline struct nilfs_btree_node * | 426 | static inline struct nilfs_btree_node * |
467 | nilfs_btree_get_sib_node(const struct nilfs_btree *btree, | 427 | nilfs_btree_get_sib_node(const struct nilfs_btree_path *path, int level) |
468 | const struct nilfs_btree_path *path, | ||
469 | int level) | ||
470 | { | 428 | { |
471 | return (struct nilfs_btree_node *)path[level].bp_sib_bh->b_data; | 429 | return (struct nilfs_btree_node *)path[level].bp_sib_bh->b_data; |
472 | } | 430 | } |
473 | 431 | ||
474 | static inline int nilfs_btree_height(const struct nilfs_btree *btree) | 432 | static inline int nilfs_btree_height(const struct nilfs_btree *btree) |
475 | { | 433 | { |
476 | return nilfs_btree_node_get_level(btree, nilfs_btree_get_root(btree)) | 434 | return nilfs_btree_node_get_level(nilfs_btree_get_root(btree)) + 1; |
477 | + 1; | ||
478 | } | 435 | } |
479 | 436 | ||
480 | static inline struct nilfs_btree_node * | 437 | static inline struct nilfs_btree_node * |
@@ -484,7 +441,7 @@ nilfs_btree_get_node(const struct nilfs_btree *btree, | |||
484 | { | 441 | { |
485 | return (level == nilfs_btree_height(btree) - 1) ? | 442 | return (level == nilfs_btree_height(btree) - 1) ? |
486 | nilfs_btree_get_root(btree) : | 443 | nilfs_btree_get_root(btree) : |
487 | nilfs_btree_get_nonroot_node(btree, path, level); | 444 | nilfs_btree_get_nonroot_node(path, level); |
488 | } | 445 | } |
489 | 446 | ||
490 | static int nilfs_btree_do_lookup(const struct nilfs_btree *btree, | 447 | static int nilfs_btree_do_lookup(const struct nilfs_btree *btree, |
@@ -496,12 +453,11 @@ static int nilfs_btree_do_lookup(const struct nilfs_btree *btree, | |||
496 | int level, index, found, ret; | 453 | int level, index, found, ret; |
497 | 454 | ||
498 | node = nilfs_btree_get_root(btree); | 455 | node = nilfs_btree_get_root(btree); |
499 | level = nilfs_btree_node_get_level(btree, node); | 456 | level = nilfs_btree_node_get_level(node); |
500 | if ((level < minlevel) || | 457 | if (level < minlevel || nilfs_btree_node_get_nchildren(node) <= 0) |
501 | (nilfs_btree_node_get_nchildren(btree, node) <= 0)) | ||
502 | return -ENOENT; | 458 | return -ENOENT; |
503 | 459 | ||
504 | found = nilfs_btree_node_lookup(btree, node, key, &index); | 460 | found = nilfs_btree_node_lookup(node, key, &index); |
505 | ptr = nilfs_btree_node_get_ptr(btree, node, index); | 461 | ptr = nilfs_btree_node_get_ptr(btree, node, index); |
506 | path[level].bp_bh = NULL; | 462 | path[level].bp_bh = NULL; |
507 | path[level].bp_index = index; | 463 | path[level].bp_index = index; |
@@ -510,14 +466,13 @@ static int nilfs_btree_do_lookup(const struct nilfs_btree *btree, | |||
510 | ret = nilfs_btree_get_block(btree, ptr, &path[level].bp_bh); | 466 | ret = nilfs_btree_get_block(btree, ptr, &path[level].bp_bh); |
511 | if (ret < 0) | 467 | if (ret < 0) |
512 | return ret; | 468 | return ret; |
513 | node = nilfs_btree_get_nonroot_node(btree, path, level); | 469 | node = nilfs_btree_get_nonroot_node(path, level); |
514 | BUG_ON(level != nilfs_btree_node_get_level(btree, node)); | 470 | BUG_ON(level != nilfs_btree_node_get_level(node)); |
515 | if (!found) | 471 | if (!found) |
516 | found = nilfs_btree_node_lookup(btree, node, key, | 472 | found = nilfs_btree_node_lookup(node, key, &index); |
517 | &index); | ||
518 | else | 473 | else |
519 | index = 0; | 474 | index = 0; |
520 | if (index < nilfs_btree_node_nchildren_max(btree, node)) | 475 | if (index < nilfs_btree_node_nchildren_max(node, btree)) |
521 | ptr = nilfs_btree_node_get_ptr(btree, node, index); | 476 | ptr = nilfs_btree_node_get_ptr(btree, node, index); |
522 | else { | 477 | else { |
523 | WARN_ON(found || level != NILFS_BTREE_LEVEL_NODE_MIN); | 478 | WARN_ON(found || level != NILFS_BTREE_LEVEL_NODE_MIN); |
@@ -544,10 +499,10 @@ static int nilfs_btree_do_lookup_last(const struct nilfs_btree *btree, | |||
544 | int index, level, ret; | 499 | int index, level, ret; |
545 | 500 | ||
546 | node = nilfs_btree_get_root(btree); | 501 | node = nilfs_btree_get_root(btree); |
547 | index = nilfs_btree_node_get_nchildren(btree, node) - 1; | 502 | index = nilfs_btree_node_get_nchildren(node) - 1; |
548 | if (index < 0) | 503 | if (index < 0) |
549 | return -ENOENT; | 504 | return -ENOENT; |
550 | level = nilfs_btree_node_get_level(btree, node); | 505 | level = nilfs_btree_node_get_level(node); |
551 | ptr = nilfs_btree_node_get_ptr(btree, node, index); | 506 | ptr = nilfs_btree_node_get_ptr(btree, node, index); |
552 | path[level].bp_bh = NULL; | 507 | path[level].bp_bh = NULL; |
553 | path[level].bp_index = index; | 508 | path[level].bp_index = index; |
@@ -556,15 +511,15 @@ static int nilfs_btree_do_lookup_last(const struct nilfs_btree *btree, | |||
556 | ret = nilfs_btree_get_block(btree, ptr, &path[level].bp_bh); | 511 | ret = nilfs_btree_get_block(btree, ptr, &path[level].bp_bh); |
557 | if (ret < 0) | 512 | if (ret < 0) |
558 | return ret; | 513 | return ret; |
559 | node = nilfs_btree_get_nonroot_node(btree, path, level); | 514 | node = nilfs_btree_get_nonroot_node(path, level); |
560 | BUG_ON(level != nilfs_btree_node_get_level(btree, node)); | 515 | BUG_ON(level != nilfs_btree_node_get_level(node)); |
561 | index = nilfs_btree_node_get_nchildren(btree, node) - 1; | 516 | index = nilfs_btree_node_get_nchildren(node) - 1; |
562 | ptr = nilfs_btree_node_get_ptr(btree, node, index); | 517 | ptr = nilfs_btree_node_get_ptr(btree, node, index); |
563 | path[level].bp_index = index; | 518 | path[level].bp_index = index; |
564 | } | 519 | } |
565 | 520 | ||
566 | if (keyp != NULL) | 521 | if (keyp != NULL) |
567 | *keyp = nilfs_btree_node_get_key(btree, node, index); | 522 | *keyp = nilfs_btree_node_get_key(node, index); |
568 | if (ptrp != NULL) | 523 | if (ptrp != NULL) |
569 | *ptrp = ptr; | 524 | *ptrp = ptr; |
570 | 525 | ||
@@ -580,18 +535,18 @@ static int nilfs_btree_lookup(const struct nilfs_bmap *bmap, | |||
580 | int ret; | 535 | int ret; |
581 | 536 | ||
582 | btree = (struct nilfs_btree *)bmap; | 537 | btree = (struct nilfs_btree *)bmap; |
583 | path = nilfs_btree_alloc_path(btree); | 538 | path = nilfs_btree_alloc_path(); |
584 | if (path == NULL) | 539 | if (path == NULL) |
585 | return -ENOMEM; | 540 | return -ENOMEM; |
586 | nilfs_btree_init_path(btree, path); | 541 | nilfs_btree_init_path(path); |
587 | 542 | ||
588 | ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level); | 543 | ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level); |
589 | 544 | ||
590 | if (ptrp != NULL) | 545 | if (ptrp != NULL) |
591 | *ptrp = ptr; | 546 | *ptrp = ptr; |
592 | 547 | ||
593 | nilfs_btree_clear_path(btree, path); | 548 | nilfs_btree_release_path(path); |
594 | nilfs_btree_free_path(btree, path); | 549 | nilfs_btree_free_path(path); |
595 | 550 | ||
596 | return ret; | 551 | return ret; |
597 | } | 552 | } |
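The path helpers lose their btree argument as well: nilfs_btree_alloc_path() and nilfs_btree_init_path() now take no tree, and the old nilfs_btree_clear_path()/nilfs_btree_free_path() pair becomes nilfs_btree_release_path() plus nilfs_btree_free_path(). Every entry point in this file follows the same lifecycle, sketched below with a hypothetical caller:

	/* Hypothetical caller showing the path lifecycle used by the
	 * lookup/insert/delete entry points in this diff. */
	static int demo_with_path(struct nilfs_btree *btree, __u64 key)
	{
		struct nilfs_btree_path *path;
		int ret;

		path = nilfs_btree_alloc_path();
		if (path == NULL)
			return -ENOMEM;
		nilfs_btree_init_path(path);

		ret = nilfs_btree_do_lookup(btree, path, key, NULL,
					    NILFS_BTREE_LEVEL_NODE_MIN);

		nilfs_btree_release_path(path);
		nilfs_btree_free_path(path);
		return ret;
	}
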
@@ -608,10 +563,10 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap, | |||
608 | int level = NILFS_BTREE_LEVEL_NODE_MIN; | 563 | int level = NILFS_BTREE_LEVEL_NODE_MIN; |
609 | int ret, cnt, index, maxlevel; | 564 | int ret, cnt, index, maxlevel; |
610 | 565 | ||
611 | path = nilfs_btree_alloc_path(btree); | 566 | path = nilfs_btree_alloc_path(); |
612 | if (path == NULL) | 567 | if (path == NULL) |
613 | return -ENOMEM; | 568 | return -ENOMEM; |
614 | nilfs_btree_init_path(btree, path); | 569 | nilfs_btree_init_path(path); |
615 | ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level); | 570 | ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level); |
616 | if (ret < 0) | 571 | if (ret < 0) |
617 | goto out; | 572 | goto out; |
@@ -631,8 +586,8 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap, | |||
631 | node = nilfs_btree_get_node(btree, path, level); | 586 | node = nilfs_btree_get_node(btree, path, level); |
632 | index = path[level].bp_index + 1; | 587 | index = path[level].bp_index + 1; |
633 | for (;;) { | 588 | for (;;) { |
634 | while (index < nilfs_btree_node_get_nchildren(btree, node)) { | 589 | while (index < nilfs_btree_node_get_nchildren(node)) { |
635 | if (nilfs_btree_node_get_key(btree, node, index) != | 590 | if (nilfs_btree_node_get_key(node, index) != |
636 | key + cnt) | 591 | key + cnt) |
637 | goto end; | 592 | goto end; |
638 | ptr2 = nilfs_btree_node_get_ptr(btree, node, index); | 593 | ptr2 = nilfs_btree_node_get_ptr(btree, node, index); |
@@ -653,8 +608,8 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap, | |||
653 | /* look-up right sibling node */ | 608 | /* look-up right sibling node */ |
654 | node = nilfs_btree_get_node(btree, path, level + 1); | 609 | node = nilfs_btree_get_node(btree, path, level + 1); |
655 | index = path[level + 1].bp_index + 1; | 610 | index = path[level + 1].bp_index + 1; |
656 | if (index >= nilfs_btree_node_get_nchildren(btree, node) || | 611 | if (index >= nilfs_btree_node_get_nchildren(node) || |
657 | nilfs_btree_node_get_key(btree, node, index) != key + cnt) | 612 | nilfs_btree_node_get_key(node, index) != key + cnt) |
658 | break; | 613 | break; |
659 | ptr2 = nilfs_btree_node_get_ptr(btree, node, index); | 614 | ptr2 = nilfs_btree_node_get_ptr(btree, node, index); |
660 | path[level + 1].bp_index = index; | 615 | path[level + 1].bp_index = index; |
@@ -664,7 +619,7 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap, | |||
664 | ret = nilfs_btree_get_block(btree, ptr2, &path[level].bp_bh); | 619 | ret = nilfs_btree_get_block(btree, ptr2, &path[level].bp_bh); |
665 | if (ret < 0) | 620 | if (ret < 0) |
666 | goto out; | 621 | goto out; |
667 | node = nilfs_btree_get_nonroot_node(btree, path, level); | 622 | node = nilfs_btree_get_nonroot_node(path, level); |
668 | index = 0; | 623 | index = 0; |
669 | path[level].bp_index = index; | 624 | path[level].bp_index = index; |
670 | } | 625 | } |
@@ -672,8 +627,8 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap, | |||
672 | *ptrp = ptr; | 627 | *ptrp = ptr; |
673 | ret = cnt; | 628 | ret = cnt; |
674 | out: | 629 | out: |
675 | nilfs_btree_clear_path(btree, path); | 630 | nilfs_btree_release_path(path); |
676 | nilfs_btree_free_path(btree, path); | 631 | nilfs_btree_free_path(path); |
677 | return ret; | 632 | return ret; |
678 | } | 633 | } |
679 | 634 | ||
@@ -685,9 +640,7 @@ static void nilfs_btree_promote_key(struct nilfs_btree *btree, | |||
685 | do { | 640 | do { |
686 | lock_buffer(path[level].bp_bh); | 641 | lock_buffer(path[level].bp_bh); |
687 | nilfs_btree_node_set_key( | 642 | nilfs_btree_node_set_key( |
688 | btree, | 643 | nilfs_btree_get_nonroot_node(path, level), |
689 | nilfs_btree_get_nonroot_node( | ||
690 | btree, path, level), | ||
691 | path[level].bp_index, key); | 644 | path[level].bp_index, key); |
692 | if (!buffer_dirty(path[level].bp_bh)) | 645 | if (!buffer_dirty(path[level].bp_bh)) |
693 | nilfs_btnode_mark_dirty(path[level].bp_bh); | 646 | nilfs_btnode_mark_dirty(path[level].bp_bh); |
@@ -698,8 +651,7 @@ static void nilfs_btree_promote_key(struct nilfs_btree *btree, | |||
698 | 651 | ||
699 | /* root */ | 652 | /* root */ |
700 | if (level == nilfs_btree_height(btree) - 1) { | 653 | if (level == nilfs_btree_height(btree) - 1) { |
701 | nilfs_btree_node_set_key(btree, | 654 | nilfs_btree_node_set_key(nilfs_btree_get_root(btree), |
702 | nilfs_btree_get_root(btree), | ||
703 | path[level].bp_index, key); | 655 | path[level].bp_index, key); |
704 | } | 656 | } |
705 | } | 657 | } |
@@ -712,7 +664,7 @@ static void nilfs_btree_do_insert(struct nilfs_btree *btree, | |||
712 | 664 | ||
713 | if (level < nilfs_btree_height(btree) - 1) { | 665 | if (level < nilfs_btree_height(btree) - 1) { |
714 | lock_buffer(path[level].bp_bh); | 666 | lock_buffer(path[level].bp_bh); |
715 | node = nilfs_btree_get_nonroot_node(btree, path, level); | 667 | node = nilfs_btree_get_nonroot_node(path, level); |
716 | nilfs_btree_node_insert(btree, node, *keyp, *ptrp, | 668 | nilfs_btree_node_insert(btree, node, *keyp, *ptrp, |
717 | path[level].bp_index); | 669 | path[level].bp_index); |
718 | if (!buffer_dirty(path[level].bp_bh)) | 670 | if (!buffer_dirty(path[level].bp_bh)) |
@@ -721,8 +673,8 @@ static void nilfs_btree_do_insert(struct nilfs_btree *btree, | |||
721 | 673 | ||
722 | if (path[level].bp_index == 0) | 674 | if (path[level].bp_index == 0) |
723 | nilfs_btree_promote_key(btree, path, level + 1, | 675 | nilfs_btree_promote_key(btree, path, level + 1, |
724 | nilfs_btree_node_get_key( | 676 | nilfs_btree_node_get_key(node, |
725 | btree, node, 0)); | 677 | 0)); |
726 | } else { | 678 | } else { |
727 | node = nilfs_btree_get_root(btree); | 679 | node = nilfs_btree_get_root(btree); |
728 | nilfs_btree_node_insert(btree, node, *keyp, *ptrp, | 680 | nilfs_btree_node_insert(btree, node, *keyp, *ptrp, |
@@ -740,10 +692,10 @@ static void nilfs_btree_carry_left(struct nilfs_btree *btree, | |||
740 | lock_buffer(path[level].bp_bh); | 692 | lock_buffer(path[level].bp_bh); |
741 | lock_buffer(path[level].bp_sib_bh); | 693 | lock_buffer(path[level].bp_sib_bh); |
742 | 694 | ||
743 | node = nilfs_btree_get_nonroot_node(btree, path, level); | 695 | node = nilfs_btree_get_nonroot_node(path, level); |
744 | left = nilfs_btree_get_sib_node(btree, path, level); | 696 | left = nilfs_btree_get_sib_node(path, level); |
745 | nchildren = nilfs_btree_node_get_nchildren(btree, node); | 697 | nchildren = nilfs_btree_node_get_nchildren(node); |
746 | lnchildren = nilfs_btree_node_get_nchildren(btree, left); | 698 | lnchildren = nilfs_btree_node_get_nchildren(left); |
747 | move = 0; | 699 | move = 0; |
748 | 700 | ||
749 | n = (nchildren + lnchildren + 1) / 2 - lnchildren; | 701 | n = (nchildren + lnchildren + 1) / 2 - lnchildren; |
@@ -764,7 +716,7 @@ static void nilfs_btree_carry_left(struct nilfs_btree *btree, | |||
764 | unlock_buffer(path[level].bp_sib_bh); | 716 | unlock_buffer(path[level].bp_sib_bh); |
765 | 717 | ||
766 | nilfs_btree_promote_key(btree, path, level + 1, | 718 | nilfs_btree_promote_key(btree, path, level + 1, |
767 | nilfs_btree_node_get_key(btree, node, 0)); | 719 | nilfs_btree_node_get_key(node, 0)); |
768 | 720 | ||
769 | if (move) { | 721 | if (move) { |
770 | brelse(path[level].bp_bh); | 722 | brelse(path[level].bp_bh); |
@@ -791,10 +743,10 @@ static void nilfs_btree_carry_right(struct nilfs_btree *btree, | |||
791 | lock_buffer(path[level].bp_bh); | 743 | lock_buffer(path[level].bp_bh); |
792 | lock_buffer(path[level].bp_sib_bh); | 744 | lock_buffer(path[level].bp_sib_bh); |
793 | 745 | ||
794 | node = nilfs_btree_get_nonroot_node(btree, path, level); | 746 | node = nilfs_btree_get_nonroot_node(path, level); |
795 | right = nilfs_btree_get_sib_node(btree, path, level); | 747 | right = nilfs_btree_get_sib_node(path, level); |
796 | nchildren = nilfs_btree_node_get_nchildren(btree, node); | 748 | nchildren = nilfs_btree_node_get_nchildren(node); |
797 | rnchildren = nilfs_btree_node_get_nchildren(btree, right); | 749 | rnchildren = nilfs_btree_node_get_nchildren(right); |
798 | move = 0; | 750 | move = 0; |
799 | 751 | ||
800 | n = (nchildren + rnchildren + 1) / 2 - rnchildren; | 752 | n = (nchildren + rnchildren + 1) / 2 - rnchildren; |
@@ -816,15 +768,14 @@ static void nilfs_btree_carry_right(struct nilfs_btree *btree, | |||
816 | 768 | ||
817 | path[level + 1].bp_index++; | 769 | path[level + 1].bp_index++; |
818 | nilfs_btree_promote_key(btree, path, level + 1, | 770 | nilfs_btree_promote_key(btree, path, level + 1, |
819 | nilfs_btree_node_get_key(btree, right, 0)); | 771 | nilfs_btree_node_get_key(right, 0)); |
820 | path[level + 1].bp_index--; | 772 | path[level + 1].bp_index--; |
821 | 773 | ||
822 | if (move) { | 774 | if (move) { |
823 | brelse(path[level].bp_bh); | 775 | brelse(path[level].bp_bh); |
824 | path[level].bp_bh = path[level].bp_sib_bh; | 776 | path[level].bp_bh = path[level].bp_sib_bh; |
825 | path[level].bp_sib_bh = NULL; | 777 | path[level].bp_sib_bh = NULL; |
826 | path[level].bp_index -= | 778 | path[level].bp_index -= nilfs_btree_node_get_nchildren(node); |
827 | nilfs_btree_node_get_nchildren(btree, node); | ||
828 | path[level + 1].bp_index++; | 779 | path[level + 1].bp_index++; |
829 | } else { | 780 | } else { |
830 | brelse(path[level].bp_sib_bh); | 781 | brelse(path[level].bp_sib_bh); |
@@ -846,9 +797,9 @@ static void nilfs_btree_split(struct nilfs_btree *btree, | |||
846 | lock_buffer(path[level].bp_bh); | 797 | lock_buffer(path[level].bp_bh); |
847 | lock_buffer(path[level].bp_sib_bh); | 798 | lock_buffer(path[level].bp_sib_bh); |
848 | 799 | ||
849 | node = nilfs_btree_get_nonroot_node(btree, path, level); | 800 | node = nilfs_btree_get_nonroot_node(path, level); |
850 | right = nilfs_btree_get_sib_node(btree, path, level); | 801 | right = nilfs_btree_get_sib_node(path, level); |
851 | nchildren = nilfs_btree_node_get_nchildren(btree, node); | 802 | nchildren = nilfs_btree_node_get_nchildren(node); |
852 | move = 0; | 803 | move = 0; |
853 | 804 | ||
854 | n = (nchildren + 1) / 2; | 805 | n = (nchildren + 1) / 2; |
@@ -867,16 +818,15 @@ static void nilfs_btree_split(struct nilfs_btree *btree, | |||
867 | unlock_buffer(path[level].bp_bh); | 818 | unlock_buffer(path[level].bp_bh); |
868 | unlock_buffer(path[level].bp_sib_bh); | 819 | unlock_buffer(path[level].bp_sib_bh); |
869 | 820 | ||
870 | newkey = nilfs_btree_node_get_key(btree, right, 0); | 821 | newkey = nilfs_btree_node_get_key(right, 0); |
871 | newptr = path[level].bp_newreq.bpr_ptr; | 822 | newptr = path[level].bp_newreq.bpr_ptr; |
872 | 823 | ||
873 | if (move) { | 824 | if (move) { |
874 | path[level].bp_index -= | 825 | path[level].bp_index -= nilfs_btree_node_get_nchildren(node); |
875 | nilfs_btree_node_get_nchildren(btree, node); | ||
876 | nilfs_btree_node_insert(btree, right, *keyp, *ptrp, | 826 | nilfs_btree_node_insert(btree, right, *keyp, *ptrp, |
877 | path[level].bp_index); | 827 | path[level].bp_index); |
878 | 828 | ||
879 | *keyp = nilfs_btree_node_get_key(btree, right, 0); | 829 | *keyp = nilfs_btree_node_get_key(right, 0); |
880 | *ptrp = path[level].bp_newreq.bpr_ptr; | 830 | *ptrp = path[level].bp_newreq.bpr_ptr; |
881 | 831 | ||
882 | brelse(path[level].bp_bh); | 832 | brelse(path[level].bp_bh); |
@@ -885,7 +835,7 @@ static void nilfs_btree_split(struct nilfs_btree *btree, | |||
885 | } else { | 835 | } else { |
886 | nilfs_btree_do_insert(btree, path, level, keyp, ptrp); | 836 | nilfs_btree_do_insert(btree, path, level, keyp, ptrp); |
887 | 837 | ||
888 | *keyp = nilfs_btree_node_get_key(btree, right, 0); | 838 | *keyp = nilfs_btree_node_get_key(right, 0); |
889 | *ptrp = path[level].bp_newreq.bpr_ptr; | 839 | *ptrp = path[level].bp_newreq.bpr_ptr; |
890 | 840 | ||
891 | brelse(path[level].bp_sib_bh); | 841 | brelse(path[level].bp_sib_bh); |
@@ -905,12 +855,12 @@ static void nilfs_btree_grow(struct nilfs_btree *btree, | |||
905 | lock_buffer(path[level].bp_sib_bh); | 855 | lock_buffer(path[level].bp_sib_bh); |
906 | 856 | ||
907 | root = nilfs_btree_get_root(btree); | 857 | root = nilfs_btree_get_root(btree); |
908 | child = nilfs_btree_get_sib_node(btree, path, level); | 858 | child = nilfs_btree_get_sib_node(path, level); |
909 | 859 | ||
910 | n = nilfs_btree_node_get_nchildren(btree, root); | 860 | n = nilfs_btree_node_get_nchildren(root); |
911 | 861 | ||
912 | nilfs_btree_node_move_right(btree, root, child, n); | 862 | nilfs_btree_node_move_right(btree, root, child, n); |
913 | nilfs_btree_node_set_level(btree, root, level + 1); | 863 | nilfs_btree_node_set_level(root, level + 1); |
914 | 864 | ||
915 | if (!buffer_dirty(path[level].bp_sib_bh)) | 865 | if (!buffer_dirty(path[level].bp_sib_bh)) |
916 | nilfs_btnode_mark_dirty(path[level].bp_sib_bh); | 866 | nilfs_btnode_mark_dirty(path[level].bp_sib_bh); |
@@ -922,7 +872,7 @@ static void nilfs_btree_grow(struct nilfs_btree *btree, | |||
922 | 872 | ||
923 | nilfs_btree_do_insert(btree, path, level, keyp, ptrp); | 873 | nilfs_btree_do_insert(btree, path, level, keyp, ptrp); |
924 | 874 | ||
925 | *keyp = nilfs_btree_node_get_key(btree, child, 0); | 875 | *keyp = nilfs_btree_node_get_key(child, 0); |
926 | *ptrp = path[level].bp_newreq.bpr_ptr; | 876 | *ptrp = path[level].bp_newreq.bpr_ptr; |
927 | } | 877 | } |
928 | 878 | ||
@@ -990,26 +940,29 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree, | |||
990 | struct nilfs_btree_node *node, *parent, *sib; | 940 | struct nilfs_btree_node *node, *parent, *sib; |
991 | __u64 sibptr; | 941 | __u64 sibptr; |
992 | int pindex, level, ret; | 942 | int pindex, level, ret; |
943 | struct inode *dat = NULL; | ||
993 | 944 | ||
994 | stats->bs_nblocks = 0; | 945 | stats->bs_nblocks = 0; |
995 | level = NILFS_BTREE_LEVEL_DATA; | 946 | level = NILFS_BTREE_LEVEL_DATA; |
996 | 947 | ||
997 | /* allocate a new ptr for data block */ | 948 | /* allocate a new ptr for data block */ |
998 | if (NILFS_BMAP_USE_VBN(&btree->bt_bmap)) | 949 | if (NILFS_BMAP_USE_VBN(&btree->bt_bmap)) { |
999 | path[level].bp_newreq.bpr_ptr = | 950 | path[level].bp_newreq.bpr_ptr = |
1000 | nilfs_btree_find_target_v(btree, path, key); | 951 | nilfs_btree_find_target_v(btree, path, key); |
952 | dat = nilfs_bmap_get_dat(&btree->bt_bmap); | ||
953 | } | ||
1001 | 954 | ||
1002 | ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap, | 955 | ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap, |
1003 | &path[level].bp_newreq); | 956 | &path[level].bp_newreq, dat); |
1004 | if (ret < 0) | 957 | if (ret < 0) |
1005 | goto err_out_data; | 958 | goto err_out_data; |
1006 | 959 | ||
1007 | for (level = NILFS_BTREE_LEVEL_NODE_MIN; | 960 | for (level = NILFS_BTREE_LEVEL_NODE_MIN; |
1008 | level < nilfs_btree_height(btree) - 1; | 961 | level < nilfs_btree_height(btree) - 1; |
1009 | level++) { | 962 | level++) { |
1010 | node = nilfs_btree_get_nonroot_node(btree, path, level); | 963 | node = nilfs_btree_get_nonroot_node(path, level); |
1011 | if (nilfs_btree_node_get_nchildren(btree, node) < | 964 | if (nilfs_btree_node_get_nchildren(node) < |
1012 | nilfs_btree_node_nchildren_max(btree, node)) { | 965 | nilfs_btree_node_nchildren_max(node, btree)) { |
1013 | path[level].bp_op = nilfs_btree_do_insert; | 966 | path[level].bp_op = nilfs_btree_do_insert; |
1014 | stats->bs_nblocks++; | 967 | stats->bs_nblocks++; |
1015 | goto out; | 968 | goto out; |
@@ -1026,8 +979,8 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree, | |||
1026 | if (ret < 0) | 979 | if (ret < 0) |
1027 | goto err_out_child_node; | 980 | goto err_out_child_node; |
1028 | sib = (struct nilfs_btree_node *)bh->b_data; | 981 | sib = (struct nilfs_btree_node *)bh->b_data; |
1029 | if (nilfs_btree_node_get_nchildren(btree, sib) < | 982 | if (nilfs_btree_node_get_nchildren(sib) < |
1030 | nilfs_btree_node_nchildren_max(btree, sib)) { | 983 | nilfs_btree_node_nchildren_max(sib, btree)) { |
1031 | path[level].bp_sib_bh = bh; | 984 | path[level].bp_sib_bh = bh; |
1032 | path[level].bp_op = nilfs_btree_carry_left; | 985 | path[level].bp_op = nilfs_btree_carry_left; |
1033 | stats->bs_nblocks++; | 986 | stats->bs_nblocks++; |
@@ -1038,15 +991,15 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree, | |||
1038 | 991 | ||
1039 | /* right sibling */ | 992 | /* right sibling */ |
1040 | if (pindex < | 993 | if (pindex < |
1041 | nilfs_btree_node_get_nchildren(btree, parent) - 1) { | 994 | nilfs_btree_node_get_nchildren(parent) - 1) { |
1042 | sibptr = nilfs_btree_node_get_ptr(btree, parent, | 995 | sibptr = nilfs_btree_node_get_ptr(btree, parent, |
1043 | pindex + 1); | 996 | pindex + 1); |
1044 | ret = nilfs_btree_get_block(btree, sibptr, &bh); | 997 | ret = nilfs_btree_get_block(btree, sibptr, &bh); |
1045 | if (ret < 0) | 998 | if (ret < 0) |
1046 | goto err_out_child_node; | 999 | goto err_out_child_node; |
1047 | sib = (struct nilfs_btree_node *)bh->b_data; | 1000 | sib = (struct nilfs_btree_node *)bh->b_data; |
1048 | if (nilfs_btree_node_get_nchildren(btree, sib) < | 1001 | if (nilfs_btree_node_get_nchildren(sib) < |
1049 | nilfs_btree_node_nchildren_max(btree, sib)) { | 1002 | nilfs_btree_node_nchildren_max(sib, btree)) { |
1050 | path[level].bp_sib_bh = bh; | 1003 | path[level].bp_sib_bh = bh; |
1051 | path[level].bp_op = nilfs_btree_carry_right; | 1004 | path[level].bp_op = nilfs_btree_carry_right; |
1052 | stats->bs_nblocks++; | 1005 | stats->bs_nblocks++; |
@@ -1059,7 +1012,7 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree, | |||
1059 | path[level].bp_newreq.bpr_ptr = | 1012 | path[level].bp_newreq.bpr_ptr = |
1060 | path[level - 1].bp_newreq.bpr_ptr + 1; | 1013 | path[level - 1].bp_newreq.bpr_ptr + 1; |
1061 | ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap, | 1014 | ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap, |
1062 | &path[level].bp_newreq); | 1015 | &path[level].bp_newreq, dat); |
1063 | if (ret < 0) | 1016 | if (ret < 0) |
1064 | goto err_out_child_node; | 1017 | goto err_out_child_node; |
1065 | ret = nilfs_btree_get_new_block(btree, | 1018 | ret = nilfs_btree_get_new_block(btree, |
@@ -1081,8 +1034,8 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree, | |||
1081 | 1034 | ||
1082 | /* root */ | 1035 | /* root */ |
1083 | node = nilfs_btree_get_root(btree); | 1036 | node = nilfs_btree_get_root(btree); |
1084 | if (nilfs_btree_node_get_nchildren(btree, node) < | 1037 | if (nilfs_btree_node_get_nchildren(node) < |
1085 | nilfs_btree_node_nchildren_max(btree, node)) { | 1038 | nilfs_btree_node_nchildren_max(node, btree)) { |
1086 | path[level].bp_op = nilfs_btree_do_insert; | 1039 | path[level].bp_op = nilfs_btree_do_insert; |
1087 | stats->bs_nblocks++; | 1040 | stats->bs_nblocks++; |
1088 | goto out; | 1041 | goto out; |
@@ -1091,7 +1044,7 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree, | |||
1091 | /* grow */ | 1044 | /* grow */ |
1092 | path[level].bp_newreq.bpr_ptr = path[level - 1].bp_newreq.bpr_ptr + 1; | 1045 | path[level].bp_newreq.bpr_ptr = path[level - 1].bp_newreq.bpr_ptr + 1; |
1093 | ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap, | 1046 | ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap, |
1094 | &path[level].bp_newreq); | 1047 | &path[level].bp_newreq, dat); |
1095 | if (ret < 0) | 1048 | if (ret < 0) |
1096 | goto err_out_child_node; | 1049 | goto err_out_child_node; |
1097 | ret = nilfs_btree_get_new_block(btree, path[level].bp_newreq.bpr_ptr, | 1050 | ret = nilfs_btree_get_new_block(btree, path[level].bp_newreq.bpr_ptr, |
@@ -1119,16 +1072,18 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree, | |||
1119 | 1072 | ||
1120 | /* error */ | 1073 | /* error */ |
1121 | err_out_curr_node: | 1074 | err_out_curr_node: |
1122 | nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, &path[level].bp_newreq); | 1075 | nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, &path[level].bp_newreq, |
1076 | dat); | ||
1123 | err_out_child_node: | 1077 | err_out_child_node: |
1124 | for (level--; level > NILFS_BTREE_LEVEL_DATA; level--) { | 1078 | for (level--; level > NILFS_BTREE_LEVEL_DATA; level--) { |
1125 | nilfs_btnode_delete(path[level].bp_sib_bh); | 1079 | nilfs_btnode_delete(path[level].bp_sib_bh); |
1126 | nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, | 1080 | nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, |
1127 | &path[level].bp_newreq); | 1081 | &path[level].bp_newreq, dat); |
1128 | 1082 | ||
1129 | } | 1083 | } |
1130 | 1084 | ||
1131 | nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, &path[level].bp_newreq); | 1085 | nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, &path[level].bp_newreq, |
1086 | dat); | ||
1132 | err_out_data: | 1087 | err_out_data: |
1133 | *levelp = level; | 1088 | *levelp = level; |
1134 | stats->bs_nblocks = 0; | 1089 | stats->bs_nblocks = 0; |
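The insert path now resolves the DAT inode once, up front, and only when the bmap uses virtual block numbers, then threads the same pointer through every prepare/commit/abort call instead of letting each helper re-derive it. A condensed sketch of the pattern with a hypothetical single-level helper (the bmap calls and their dat parameter are as shown in this diff):

	/* Hypothetical helper condensing the dat-threading pattern above:
	 * resolve dat once, hand the same pointer to prepare and to the
	 * matching commit or abort. */
	static int demo_alloc_ptr(struct nilfs_btree *btree,
				  struct nilfs_btree_path *path, int level,
				  int commit)
	{
		struct inode *dat = NULL;
		int ret;

		if (NILFS_BMAP_USE_VBN(&btree->bt_bmap))
			dat = nilfs_bmap_get_dat(&btree->bt_bmap);

		ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap,
						   &path[level].bp_newreq, dat);
		if (ret < 0)
			return ret;

		if (commit)
			nilfs_bmap_commit_alloc_ptr(&btree->bt_bmap,
						    &path[level].bp_newreq, dat);
		else
			nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap,
						   &path[level].bp_newreq, dat);
		return 0;
	}
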
@@ -1139,16 +1094,19 @@ static void nilfs_btree_commit_insert(struct nilfs_btree *btree, | |||
1139 | struct nilfs_btree_path *path, | 1094 | struct nilfs_btree_path *path, |
1140 | int maxlevel, __u64 key, __u64 ptr) | 1095 | int maxlevel, __u64 key, __u64 ptr) |
1141 | { | 1096 | { |
1097 | struct inode *dat = NULL; | ||
1142 | int level; | 1098 | int level; |
1143 | 1099 | ||
1144 | set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr)); | 1100 | set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr)); |
1145 | ptr = path[NILFS_BTREE_LEVEL_DATA].bp_newreq.bpr_ptr; | 1101 | ptr = path[NILFS_BTREE_LEVEL_DATA].bp_newreq.bpr_ptr; |
1146 | if (NILFS_BMAP_USE_VBN(&btree->bt_bmap)) | 1102 | if (NILFS_BMAP_USE_VBN(&btree->bt_bmap)) { |
1147 | nilfs_btree_set_target_v(btree, key, ptr); | 1103 | nilfs_btree_set_target_v(btree, key, ptr); |
1104 | dat = nilfs_bmap_get_dat(&btree->bt_bmap); | ||
1105 | } | ||
1148 | 1106 | ||
1149 | for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) { | 1107 | for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) { |
1150 | nilfs_bmap_commit_alloc_ptr(&btree->bt_bmap, | 1108 | nilfs_bmap_commit_alloc_ptr(&btree->bt_bmap, |
1151 | &path[level - 1].bp_newreq); | 1109 | &path[level - 1].bp_newreq, dat); |
1152 | path[level].bp_op(btree, path, level, &key, &ptr); | 1110 | path[level].bp_op(btree, path, level, &key, &ptr); |
1153 | } | 1111 | } |
1154 | 1112 | ||
@@ -1164,10 +1122,10 @@ static int nilfs_btree_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) | |||
1164 | int level, ret; | 1122 | int level, ret; |
1165 | 1123 | ||
1166 | btree = (struct nilfs_btree *)bmap; | 1124 | btree = (struct nilfs_btree *)bmap; |
1167 | path = nilfs_btree_alloc_path(btree); | 1125 | path = nilfs_btree_alloc_path(); |
1168 | if (path == NULL) | 1126 | if (path == NULL) |
1169 | return -ENOMEM; | 1127 | return -ENOMEM; |
1170 | nilfs_btree_init_path(btree, path); | 1128 | nilfs_btree_init_path(path); |
1171 | 1129 | ||
1172 | ret = nilfs_btree_do_lookup(btree, path, key, NULL, | 1130 | ret = nilfs_btree_do_lookup(btree, path, key, NULL, |
1173 | NILFS_BTREE_LEVEL_NODE_MIN); | 1131 | NILFS_BTREE_LEVEL_NODE_MIN); |
@@ -1184,8 +1142,8 @@ static int nilfs_btree_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) | |||
1184 | nilfs_bmap_add_blocks(bmap, stats.bs_nblocks); | 1142 | nilfs_bmap_add_blocks(bmap, stats.bs_nblocks); |
1185 | 1143 | ||
1186 | out: | 1144 | out: |
1187 | nilfs_btree_clear_path(btree, path); | 1145 | nilfs_btree_release_path(path); |
1188 | nilfs_btree_free_path(btree, path); | 1146 | nilfs_btree_free_path(path); |
1189 | return ret; | 1147 | return ret; |
1190 | } | 1148 | } |
1191 | 1149 | ||
@@ -1197,7 +1155,7 @@ static void nilfs_btree_do_delete(struct nilfs_btree *btree, | |||
1197 | 1155 | ||
1198 | if (level < nilfs_btree_height(btree) - 1) { | 1156 | if (level < nilfs_btree_height(btree) - 1) { |
1199 | lock_buffer(path[level].bp_bh); | 1157 | lock_buffer(path[level].bp_bh); |
1200 | node = nilfs_btree_get_nonroot_node(btree, path, level); | 1158 | node = nilfs_btree_get_nonroot_node(path, level); |
1201 | nilfs_btree_node_delete(btree, node, keyp, ptrp, | 1159 | nilfs_btree_node_delete(btree, node, keyp, ptrp, |
1202 | path[level].bp_index); | 1160 | path[level].bp_index); |
1203 | if (!buffer_dirty(path[level].bp_bh)) | 1161 | if (!buffer_dirty(path[level].bp_bh)) |
@@ -1205,7 +1163,7 @@ static void nilfs_btree_do_delete(struct nilfs_btree *btree, | |||
1205 | unlock_buffer(path[level].bp_bh); | 1163 | unlock_buffer(path[level].bp_bh); |
1206 | if (path[level].bp_index == 0) | 1164 | if (path[level].bp_index == 0) |
1207 | nilfs_btree_promote_key(btree, path, level + 1, | 1165 | nilfs_btree_promote_key(btree, path, level + 1, |
1208 | nilfs_btree_node_get_key(btree, node, 0)); | 1166 | nilfs_btree_node_get_key(node, 0)); |
1209 | } else { | 1167 | } else { |
1210 | node = nilfs_btree_get_root(btree); | 1168 | node = nilfs_btree_get_root(btree); |
1211 | nilfs_btree_node_delete(btree, node, keyp, ptrp, | 1169 | nilfs_btree_node_delete(btree, node, keyp, ptrp, |
@@ -1225,10 +1183,10 @@ static void nilfs_btree_borrow_left(struct nilfs_btree *btree, | |||
1225 | lock_buffer(path[level].bp_bh); | 1183 | lock_buffer(path[level].bp_bh); |
1226 | lock_buffer(path[level].bp_sib_bh); | 1184 | lock_buffer(path[level].bp_sib_bh); |
1227 | 1185 | ||
1228 | node = nilfs_btree_get_nonroot_node(btree, path, level); | 1186 | node = nilfs_btree_get_nonroot_node(path, level); |
1229 | left = nilfs_btree_get_sib_node(btree, path, level); | 1187 | left = nilfs_btree_get_sib_node(path, level); |
1230 | nchildren = nilfs_btree_node_get_nchildren(btree, node); | 1188 | nchildren = nilfs_btree_node_get_nchildren(node); |
1231 | lnchildren = nilfs_btree_node_get_nchildren(btree, left); | 1189 | lnchildren = nilfs_btree_node_get_nchildren(left); |
1232 | 1190 | ||
1233 | n = (nchildren + lnchildren) / 2 - nchildren; | 1191 | n = (nchildren + lnchildren) / 2 - nchildren; |
1234 | 1192 | ||
@@ -1243,7 +1201,7 @@ static void nilfs_btree_borrow_left(struct nilfs_btree *btree, | |||
1243 | unlock_buffer(path[level].bp_sib_bh); | 1201 | unlock_buffer(path[level].bp_sib_bh); |
1244 | 1202 | ||
1245 | nilfs_btree_promote_key(btree, path, level + 1, | 1203 | nilfs_btree_promote_key(btree, path, level + 1, |
1246 | nilfs_btree_node_get_key(btree, node, 0)); | 1204 | nilfs_btree_node_get_key(node, 0)); |
1247 | 1205 | ||
1248 | brelse(path[level].bp_sib_bh); | 1206 | brelse(path[level].bp_sib_bh); |
1249 | path[level].bp_sib_bh = NULL; | 1207 | path[level].bp_sib_bh = NULL; |
@@ -1262,10 +1220,10 @@ static void nilfs_btree_borrow_right(struct nilfs_btree *btree, | |||
1262 | lock_buffer(path[level].bp_bh); | 1220 | lock_buffer(path[level].bp_bh); |
1263 | lock_buffer(path[level].bp_sib_bh); | 1221 | lock_buffer(path[level].bp_sib_bh); |
1264 | 1222 | ||
1265 | node = nilfs_btree_get_nonroot_node(btree, path, level); | 1223 | node = nilfs_btree_get_nonroot_node(path, level); |
1266 | right = nilfs_btree_get_sib_node(btree, path, level); | 1224 | right = nilfs_btree_get_sib_node(path, level); |
1267 | nchildren = nilfs_btree_node_get_nchildren(btree, node); | 1225 | nchildren = nilfs_btree_node_get_nchildren(node); |
1268 | rnchildren = nilfs_btree_node_get_nchildren(btree, right); | 1226 | rnchildren = nilfs_btree_node_get_nchildren(right); |
1269 | 1227 | ||
1270 | n = (nchildren + rnchildren) / 2 - nchildren; | 1228 | n = (nchildren + rnchildren) / 2 - nchildren; |
1271 | 1229 | ||
@@ -1281,7 +1239,7 @@ static void nilfs_btree_borrow_right(struct nilfs_btree *btree, | |||
1281 | 1239 | ||
1282 | path[level + 1].bp_index++; | 1240 | path[level + 1].bp_index++; |
1283 | nilfs_btree_promote_key(btree, path, level + 1, | 1241 | nilfs_btree_promote_key(btree, path, level + 1, |
1284 | nilfs_btree_node_get_key(btree, right, 0)); | 1242 | nilfs_btree_node_get_key(right, 0)); |
1285 | path[level + 1].bp_index--; | 1243 | path[level + 1].bp_index--; |
1286 | 1244 | ||
1287 | brelse(path[level].bp_sib_bh); | 1245 | brelse(path[level].bp_sib_bh); |
@@ -1300,10 +1258,10 @@ static void nilfs_btree_concat_left(struct nilfs_btree *btree, | |||
1300 | lock_buffer(path[level].bp_bh); | 1258 | lock_buffer(path[level].bp_bh); |
1301 | lock_buffer(path[level].bp_sib_bh); | 1259 | lock_buffer(path[level].bp_sib_bh); |
1302 | 1260 | ||
1303 | node = nilfs_btree_get_nonroot_node(btree, path, level); | 1261 | node = nilfs_btree_get_nonroot_node(path, level); |
1304 | left = nilfs_btree_get_sib_node(btree, path, level); | 1262 | left = nilfs_btree_get_sib_node(path, level); |
1305 | 1263 | ||
1306 | n = nilfs_btree_node_get_nchildren(btree, node); | 1264 | n = nilfs_btree_node_get_nchildren(node); |
1307 | 1265 | ||
1308 | nilfs_btree_node_move_left(btree, left, node, n); | 1266 | nilfs_btree_node_move_left(btree, left, node, n); |
1309 | 1267 | ||
@@ -1316,7 +1274,7 @@ static void nilfs_btree_concat_left(struct nilfs_btree *btree, | |||
1316 | nilfs_btnode_delete(path[level].bp_bh); | 1274 | nilfs_btnode_delete(path[level].bp_bh); |
1317 | path[level].bp_bh = path[level].bp_sib_bh; | 1275 | path[level].bp_bh = path[level].bp_sib_bh; |
1318 | path[level].bp_sib_bh = NULL; | 1276 | path[level].bp_sib_bh = NULL; |
1319 | path[level].bp_index += nilfs_btree_node_get_nchildren(btree, left); | 1277 | path[level].bp_index += nilfs_btree_node_get_nchildren(left); |
1320 | } | 1278 | } |
1321 | 1279 | ||
1322 | static void nilfs_btree_concat_right(struct nilfs_btree *btree, | 1280 | static void nilfs_btree_concat_right(struct nilfs_btree *btree, |
@@ -1331,10 +1289,10 @@ static void nilfs_btree_concat_right(struct nilfs_btree *btree, | |||
1331 | lock_buffer(path[level].bp_bh); | 1289 | lock_buffer(path[level].bp_bh); |
1332 | lock_buffer(path[level].bp_sib_bh); | 1290 | lock_buffer(path[level].bp_sib_bh); |
1333 | 1291 | ||
1334 | node = nilfs_btree_get_nonroot_node(btree, path, level); | 1292 | node = nilfs_btree_get_nonroot_node(path, level); |
1335 | right = nilfs_btree_get_sib_node(btree, path, level); | 1293 | right = nilfs_btree_get_sib_node(path, level); |
1336 | 1294 | ||
1337 | n = nilfs_btree_node_get_nchildren(btree, right); | 1295 | n = nilfs_btree_node_get_nchildren(right); |
1338 | 1296 | ||
1339 | nilfs_btree_node_move_left(btree, node, right, n); | 1297 | nilfs_btree_node_move_left(btree, node, right, n); |
1340 | 1298 | ||
@@ -1360,11 +1318,11 @@ static void nilfs_btree_shrink(struct nilfs_btree *btree, | |||
1360 | 1318 | ||
1361 | lock_buffer(path[level].bp_bh); | 1319 | lock_buffer(path[level].bp_bh); |
1362 | root = nilfs_btree_get_root(btree); | 1320 | root = nilfs_btree_get_root(btree); |
1363 | child = nilfs_btree_get_nonroot_node(btree, path, level); | 1321 | child = nilfs_btree_get_nonroot_node(path, level); |
1364 | 1322 | ||
1365 | nilfs_btree_node_delete(btree, root, NULL, NULL, 0); | 1323 | nilfs_btree_node_delete(btree, root, NULL, NULL, 0); |
1366 | nilfs_btree_node_set_level(btree, root, level); | 1324 | nilfs_btree_node_set_level(root, level); |
1367 | n = nilfs_btree_node_get_nchildren(btree, child); | 1325 | n = nilfs_btree_node_get_nchildren(child); |
1368 | nilfs_btree_node_move_left(btree, root, child, n); | 1326 | nilfs_btree_node_move_left(btree, root, child, n); |
1369 | unlock_buffer(path[level].bp_bh); | 1327 | unlock_buffer(path[level].bp_bh); |
1370 | 1328 | ||
@@ -1376,7 +1334,8 @@ static void nilfs_btree_shrink(struct nilfs_btree *btree, | |||
1376 | static int nilfs_btree_prepare_delete(struct nilfs_btree *btree, | 1334 | static int nilfs_btree_prepare_delete(struct nilfs_btree *btree, |
1377 | struct nilfs_btree_path *path, | 1335 | struct nilfs_btree_path *path, |
1378 | int *levelp, | 1336 | int *levelp, |
1379 | struct nilfs_bmap_stats *stats) | 1337 | struct nilfs_bmap_stats *stats, |
1338 | struct inode *dat) | ||
1380 | { | 1339 | { |
1381 | struct buffer_head *bh; | 1340 | struct buffer_head *bh; |
1382 | struct nilfs_btree_node *node, *parent, *sib; | 1341 | struct nilfs_btree_node *node, *parent, *sib; |
@@ -1388,17 +1347,17 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree, | |||
1388 | for (level = NILFS_BTREE_LEVEL_NODE_MIN; | 1347 | for (level = NILFS_BTREE_LEVEL_NODE_MIN; |
1389 | level < nilfs_btree_height(btree) - 1; | 1348 | level < nilfs_btree_height(btree) - 1; |
1390 | level++) { | 1349 | level++) { |
1391 | node = nilfs_btree_get_nonroot_node(btree, path, level); | 1350 | node = nilfs_btree_get_nonroot_node(path, level); |
1392 | path[level].bp_oldreq.bpr_ptr = | 1351 | path[level].bp_oldreq.bpr_ptr = |
1393 | nilfs_btree_node_get_ptr(btree, node, | 1352 | nilfs_btree_node_get_ptr(btree, node, |
1394 | path[level].bp_index); | 1353 | path[level].bp_index); |
1395 | ret = nilfs_bmap_prepare_end_ptr(&btree->bt_bmap, | 1354 | ret = nilfs_bmap_prepare_end_ptr(&btree->bt_bmap, |
1396 | &path[level].bp_oldreq); | 1355 | &path[level].bp_oldreq, dat); |
1397 | if (ret < 0) | 1356 | if (ret < 0) |
1398 | goto err_out_child_node; | 1357 | goto err_out_child_node; |
1399 | 1358 | ||
1400 | if (nilfs_btree_node_get_nchildren(btree, node) > | 1359 | if (nilfs_btree_node_get_nchildren(node) > |
1401 | nilfs_btree_node_nchildren_min(btree, node)) { | 1360 | nilfs_btree_node_nchildren_min(node, btree)) { |
1402 | path[level].bp_op = nilfs_btree_do_delete; | 1361 | path[level].bp_op = nilfs_btree_do_delete; |
1403 | stats->bs_nblocks++; | 1362 | stats->bs_nblocks++; |
1404 | goto out; | 1363 | goto out; |
@@ -1415,8 +1374,8 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree, | |||
1415 | if (ret < 0) | 1374 | if (ret < 0) |
1416 | goto err_out_curr_node; | 1375 | goto err_out_curr_node; |
1417 | sib = (struct nilfs_btree_node *)bh->b_data; | 1376 | sib = (struct nilfs_btree_node *)bh->b_data; |
1418 | if (nilfs_btree_node_get_nchildren(btree, sib) > | 1377 | if (nilfs_btree_node_get_nchildren(sib) > |
1419 | nilfs_btree_node_nchildren_min(btree, sib)) { | 1378 | nilfs_btree_node_nchildren_min(sib, btree)) { |
1420 | path[level].bp_sib_bh = bh; | 1379 | path[level].bp_sib_bh = bh; |
1421 | path[level].bp_op = nilfs_btree_borrow_left; | 1380 | path[level].bp_op = nilfs_btree_borrow_left; |
1422 | stats->bs_nblocks++; | 1381 | stats->bs_nblocks++; |
@@ -1428,7 +1387,7 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree, | |||
1428 | /* continue; */ | 1387 | /* continue; */ |
1429 | } | 1388 | } |
1430 | } else if (pindex < | 1389 | } else if (pindex < |
1431 | nilfs_btree_node_get_nchildren(btree, parent) - 1) { | 1390 | nilfs_btree_node_get_nchildren(parent) - 1) { |
1432 | /* right sibling */ | 1391 | /* right sibling */ |
1433 | sibptr = nilfs_btree_node_get_ptr(btree, parent, | 1392 | sibptr = nilfs_btree_node_get_ptr(btree, parent, |
1434 | pindex + 1); | 1393 | pindex + 1); |
@@ -1436,8 +1395,8 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree, | |||
1436 | if (ret < 0) | 1395 | if (ret < 0) |
1437 | goto err_out_curr_node; | 1396 | goto err_out_curr_node; |
1438 | sib = (struct nilfs_btree_node *)bh->b_data; | 1397 | sib = (struct nilfs_btree_node *)bh->b_data; |
1439 | if (nilfs_btree_node_get_nchildren(btree, sib) > | 1398 | if (nilfs_btree_node_get_nchildren(sib) > |
1440 | nilfs_btree_node_nchildren_min(btree, sib)) { | 1399 | nilfs_btree_node_nchildren_min(sib, btree)) { |
1441 | path[level].bp_sib_bh = bh; | 1400 | path[level].bp_sib_bh = bh; |
1442 | path[level].bp_op = nilfs_btree_borrow_right; | 1401 | path[level].bp_op = nilfs_btree_borrow_right; |
1443 | stats->bs_nblocks++; | 1402 | stats->bs_nblocks++; |
@@ -1452,7 +1411,7 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree, | |||
1452 | /* no siblings */ | 1411 | /* no siblings */ |
1453 | /* the only child of the root node */ | 1412 | /* the only child of the root node */ |
1454 | WARN_ON(level != nilfs_btree_height(btree) - 2); | 1413 | WARN_ON(level != nilfs_btree_height(btree) - 2); |
1455 | if (nilfs_btree_node_get_nchildren(btree, node) - 1 <= | 1414 | if (nilfs_btree_node_get_nchildren(node) - 1 <= |
1456 | NILFS_BTREE_ROOT_NCHILDREN_MAX) { | 1415 | NILFS_BTREE_ROOT_NCHILDREN_MAX) { |
1457 | path[level].bp_op = nilfs_btree_shrink; | 1416 | path[level].bp_op = nilfs_btree_shrink; |
1458 | stats->bs_nblocks += 2; | 1417 | stats->bs_nblocks += 2; |
@@ -1471,7 +1430,7 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree, | |||
1471 | nilfs_btree_node_get_ptr(btree, node, path[level].bp_index); | 1430 | nilfs_btree_node_get_ptr(btree, node, path[level].bp_index); |
1472 | 1431 | ||
1473 | ret = nilfs_bmap_prepare_end_ptr(&btree->bt_bmap, | 1432 | ret = nilfs_bmap_prepare_end_ptr(&btree->bt_bmap, |
1474 | &path[level].bp_oldreq); | 1433 | &path[level].bp_oldreq, dat); |
1475 | if (ret < 0) | 1434 | if (ret < 0) |
1476 | goto err_out_child_node; | 1435 | goto err_out_child_node; |
1477 | 1436 | ||
@@ -1486,12 +1445,12 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree, | |||
1486 | 1445 | ||
1487 | /* error */ | 1446 | /* error */ |
1488 | err_out_curr_node: | 1447 | err_out_curr_node: |
1489 | nilfs_bmap_abort_end_ptr(&btree->bt_bmap, &path[level].bp_oldreq); | 1448 | nilfs_bmap_abort_end_ptr(&btree->bt_bmap, &path[level].bp_oldreq, dat); |
1490 | err_out_child_node: | 1449 | err_out_child_node: |
1491 | for (level--; level >= NILFS_BTREE_LEVEL_NODE_MIN; level--) { | 1450 | for (level--; level >= NILFS_BTREE_LEVEL_NODE_MIN; level--) { |
1492 | brelse(path[level].bp_sib_bh); | 1451 | brelse(path[level].bp_sib_bh); |
1493 | nilfs_bmap_abort_end_ptr(&btree->bt_bmap, | 1452 | nilfs_bmap_abort_end_ptr(&btree->bt_bmap, |
1494 | &path[level].bp_oldreq); | 1453 | &path[level].bp_oldreq, dat); |
1495 | } | 1454 | } |
1496 | *levelp = level; | 1455 | *levelp = level; |
1497 | stats->bs_nblocks = 0; | 1456 | stats->bs_nblocks = 0; |
@@ -1500,13 +1459,13 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree, | |||
1500 | 1459 | ||
1501 | static void nilfs_btree_commit_delete(struct nilfs_btree *btree, | 1460 | static void nilfs_btree_commit_delete(struct nilfs_btree *btree, |
1502 | struct nilfs_btree_path *path, | 1461 | struct nilfs_btree_path *path, |
1503 | int maxlevel) | 1462 | int maxlevel, struct inode *dat) |
1504 | { | 1463 | { |
1505 | int level; | 1464 | int level; |
1506 | 1465 | ||
1507 | for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) { | 1466 | for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) { |
1508 | nilfs_bmap_commit_end_ptr(&btree->bt_bmap, | 1467 | nilfs_bmap_commit_end_ptr(&btree->bt_bmap, |
1509 | &path[level].bp_oldreq); | 1468 | &path[level].bp_oldreq, dat); |
1510 | path[level].bp_op(btree, path, level, NULL, NULL); | 1469 | path[level].bp_op(btree, path, level, NULL, NULL); |
1511 | } | 1470 | } |
1512 | 1471 | ||
@@ -1520,27 +1479,32 @@ static int nilfs_btree_delete(struct nilfs_bmap *bmap, __u64 key) | |||
1520 | struct nilfs_btree *btree; | 1479 | struct nilfs_btree *btree; |
1521 | struct nilfs_btree_path *path; | 1480 | struct nilfs_btree_path *path; |
1522 | struct nilfs_bmap_stats stats; | 1481 | struct nilfs_bmap_stats stats; |
1482 | struct inode *dat; | ||
1523 | int level, ret; | 1483 | int level, ret; |
1524 | 1484 | ||
1525 | btree = (struct nilfs_btree *)bmap; | 1485 | btree = (struct nilfs_btree *)bmap; |
1526 | path = nilfs_btree_alloc_path(btree); | 1486 | path = nilfs_btree_alloc_path(); |
1527 | if (path == NULL) | 1487 | if (path == NULL) |
1528 | return -ENOMEM; | 1488 | return -ENOMEM; |
1529 | nilfs_btree_init_path(btree, path); | 1489 | nilfs_btree_init_path(path); |
1530 | ret = nilfs_btree_do_lookup(btree, path, key, NULL, | 1490 | ret = nilfs_btree_do_lookup(btree, path, key, NULL, |
1531 | NILFS_BTREE_LEVEL_NODE_MIN); | 1491 | NILFS_BTREE_LEVEL_NODE_MIN); |
1532 | if (ret < 0) | 1492 | if (ret < 0) |
1533 | goto out; | 1493 | goto out; |
1534 | 1494 | ||
1535 | ret = nilfs_btree_prepare_delete(btree, path, &level, &stats); | 1495 | |
1496 | dat = NILFS_BMAP_USE_VBN(&btree->bt_bmap) ? | ||
1497 | nilfs_bmap_get_dat(&btree->bt_bmap) : NULL; | ||
1498 | |||
1499 | ret = nilfs_btree_prepare_delete(btree, path, &level, &stats, dat); | ||
1536 | if (ret < 0) | 1500 | if (ret < 0) |
1537 | goto out; | 1501 | goto out; |
1538 | nilfs_btree_commit_delete(btree, path, level); | 1502 | nilfs_btree_commit_delete(btree, path, level, dat); |
1539 | nilfs_bmap_sub_blocks(bmap, stats.bs_nblocks); | 1503 | nilfs_bmap_sub_blocks(bmap, stats.bs_nblocks); |
1540 | 1504 | ||
1541 | out: | 1505 | out: |
1542 | nilfs_btree_clear_path(btree, path); | 1506 | nilfs_btree_release_path(path); |
1543 | nilfs_btree_free_path(btree, path); | 1507 | nilfs_btree_free_path(path); |
1544 | return ret; | 1508 | return ret; |
1545 | } | 1509 | } |
1546 | 1510 | ||
@@ -1551,15 +1515,15 @@ static int nilfs_btree_last_key(const struct nilfs_bmap *bmap, __u64 *keyp) | |||
1551 | int ret; | 1515 | int ret; |
1552 | 1516 | ||
1553 | btree = (struct nilfs_btree *)bmap; | 1517 | btree = (struct nilfs_btree *)bmap; |
1554 | path = nilfs_btree_alloc_path(btree); | 1518 | path = nilfs_btree_alloc_path(); |
1555 | if (path == NULL) | 1519 | if (path == NULL) |
1556 | return -ENOMEM; | 1520 | return -ENOMEM; |
1557 | nilfs_btree_init_path(btree, path); | 1521 | nilfs_btree_init_path(path); |
1558 | 1522 | ||
1559 | ret = nilfs_btree_do_lookup_last(btree, path, keyp, NULL); | 1523 | ret = nilfs_btree_do_lookup_last(btree, path, keyp, NULL); |
1560 | 1524 | ||
1561 | nilfs_btree_clear_path(btree, path); | 1525 | nilfs_btree_release_path(path); |
1562 | nilfs_btree_free_path(btree, path); | 1526 | nilfs_btree_free_path(path); |
1563 | 1527 | ||
1564 | return ret; | 1528 | return ret; |
1565 | } | 1529 | } |
@@ -1581,7 +1545,7 @@ static int nilfs_btree_check_delete(struct nilfs_bmap *bmap, __u64 key) | |||
1581 | node = root; | 1545 | node = root; |
1582 | break; | 1546 | break; |
1583 | case 3: | 1547 | case 3: |
1584 | nchildren = nilfs_btree_node_get_nchildren(btree, root); | 1548 | nchildren = nilfs_btree_node_get_nchildren(root); |
1585 | if (nchildren > 1) | 1549 | if (nchildren > 1) |
1586 | return 0; | 1550 | return 0; |
1587 | ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1); | 1551 | ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1); |
@@ -1594,10 +1558,10 @@ static int nilfs_btree_check_delete(struct nilfs_bmap *bmap, __u64 key) | |||
1594 | return 0; | 1558 | return 0; |
1595 | } | 1559 | } |
1596 | 1560 | ||
1597 | nchildren = nilfs_btree_node_get_nchildren(btree, node); | 1561 | nchildren = nilfs_btree_node_get_nchildren(node); |
1598 | maxkey = nilfs_btree_node_get_key(btree, node, nchildren - 1); | 1562 | maxkey = nilfs_btree_node_get_key(node, nchildren - 1); |
1599 | nextmaxkey = (nchildren > 1) ? | 1563 | nextmaxkey = (nchildren > 1) ? |
1600 | nilfs_btree_node_get_key(btree, node, nchildren - 2) : 0; | 1564 | nilfs_btree_node_get_key(node, nchildren - 2) : 0; |
1601 | if (bh != NULL) | 1565 | if (bh != NULL) |
1602 | brelse(bh); | 1566 | brelse(bh); |
1603 | 1567 | ||
@@ -1623,7 +1587,7 @@ static int nilfs_btree_gather_data(struct nilfs_bmap *bmap, | |||
1623 | node = root; | 1587 | node = root; |
1624 | break; | 1588 | break; |
1625 | case 3: | 1589 | case 3: |
1626 | nchildren = nilfs_btree_node_get_nchildren(btree, root); | 1590 | nchildren = nilfs_btree_node_get_nchildren(root); |
1627 | WARN_ON(nchildren > 1); | 1591 | WARN_ON(nchildren > 1); |
1628 | ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1); | 1592 | ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1); |
1629 | ret = nilfs_btree_get_block(btree, ptr, &bh); | 1593 | ret = nilfs_btree_get_block(btree, ptr, &bh); |
@@ -1636,11 +1600,11 @@ static int nilfs_btree_gather_data(struct nilfs_bmap *bmap, | |||
1636 | return -EINVAL; | 1600 | return -EINVAL; |
1637 | } | 1601 | } |
1638 | 1602 | ||
1639 | nchildren = nilfs_btree_node_get_nchildren(btree, node); | 1603 | nchildren = nilfs_btree_node_get_nchildren(node); |
1640 | if (nchildren < nitems) | 1604 | if (nchildren < nitems) |
1641 | nitems = nchildren; | 1605 | nitems = nchildren; |
1642 | dkeys = nilfs_btree_node_dkeys(btree, node); | 1606 | dkeys = nilfs_btree_node_dkeys(node); |
1643 | dptrs = nilfs_btree_node_dptrs(btree, node); | 1607 | dptrs = nilfs_btree_node_dptrs(node, btree); |
1644 | for (i = 0; i < nitems; i++) { | 1608 | for (i = 0; i < nitems; i++) { |
1645 | keys[i] = nilfs_bmap_dkey_to_key(dkeys[i]); | 1609 | keys[i] = nilfs_bmap_dkey_to_key(dkeys[i]); |
1646 | ptrs[i] = nilfs_bmap_dptr_to_ptr(dptrs[i]); | 1610 | ptrs[i] = nilfs_bmap_dptr_to_ptr(dptrs[i]); |
@@ -1660,18 +1624,20 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key, | |||
1660 | struct nilfs_bmap_stats *stats) | 1624 | struct nilfs_bmap_stats *stats) |
1661 | { | 1625 | { |
1662 | struct buffer_head *bh; | 1626 | struct buffer_head *bh; |
1663 | struct nilfs_btree *btree; | 1627 | struct nilfs_btree *btree = (struct nilfs_btree *)bmap; |
1628 | struct inode *dat = NULL; | ||
1664 | int ret; | 1629 | int ret; |
1665 | 1630 | ||
1666 | btree = (struct nilfs_btree *)bmap; | ||
1667 | stats->bs_nblocks = 0; | 1631 | stats->bs_nblocks = 0; |
1668 | 1632 | ||
1669 | /* for data */ | 1633 | /* for data */ |
1670 | /* cannot find near ptr */ | 1634 | /* cannot find near ptr */ |
1671 | if (NILFS_BMAP_USE_VBN(bmap)) | 1635 | if (NILFS_BMAP_USE_VBN(bmap)) { |
1672 | dreq->bpr_ptr = nilfs_btree_find_target_v(btree, NULL, key); | 1636 | dreq->bpr_ptr = nilfs_btree_find_target_v(btree, NULL, key); |
1637 | dat = nilfs_bmap_get_dat(bmap); | ||
1638 | } | ||
1673 | 1639 | ||
1674 | ret = nilfs_bmap_prepare_alloc_ptr(bmap, dreq); | 1640 | ret = nilfs_bmap_prepare_alloc_ptr(bmap, dreq, dat); |
1675 | if (ret < 0) | 1641 | if (ret < 0) |
1676 | return ret; | 1642 | return ret; |
1677 | 1643 | ||
@@ -1679,7 +1645,7 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key, | |||
1679 | stats->bs_nblocks++; | 1645 | stats->bs_nblocks++; |
1680 | if (nreq != NULL) { | 1646 | if (nreq != NULL) { |
1681 | nreq->bpr_ptr = dreq->bpr_ptr + 1; | 1647 | nreq->bpr_ptr = dreq->bpr_ptr + 1; |
1682 | ret = nilfs_bmap_prepare_alloc_ptr(bmap, nreq); | 1648 | ret = nilfs_bmap_prepare_alloc_ptr(bmap, nreq, dat); |
1683 | if (ret < 0) | 1649 | if (ret < 0) |
1684 | goto err_out_dreq; | 1650 | goto err_out_dreq; |
1685 | 1651 | ||
@@ -1696,9 +1662,9 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key, | |||
1696 | 1662 | ||
1697 | /* error */ | 1663 | /* error */ |
1698 | err_out_nreq: | 1664 | err_out_nreq: |
1699 | nilfs_bmap_abort_alloc_ptr(bmap, nreq); | 1665 | nilfs_bmap_abort_alloc_ptr(bmap, nreq, dat); |
1700 | err_out_dreq: | 1666 | err_out_dreq: |
1701 | nilfs_bmap_abort_alloc_ptr(bmap, dreq); | 1667 | nilfs_bmap_abort_alloc_ptr(bmap, dreq, dat); |
1702 | stats->bs_nblocks = 0; | 1668 | stats->bs_nblocks = 0; |
1703 | return ret; | 1669 | return ret; |
1704 | 1670 | ||
@@ -1713,8 +1679,9 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap, | |||
1713 | union nilfs_bmap_ptr_req *nreq, | 1679 | union nilfs_bmap_ptr_req *nreq, |
1714 | struct buffer_head *bh) | 1680 | struct buffer_head *bh) |
1715 | { | 1681 | { |
1716 | struct nilfs_btree *btree; | 1682 | struct nilfs_btree *btree = (struct nilfs_btree *)bmap; |
1717 | struct nilfs_btree_node *node; | 1683 | struct nilfs_btree_node *node; |
1684 | struct inode *dat; | ||
1718 | __u64 tmpptr; | 1685 | __u64 tmpptr; |
1719 | 1686 | ||
1720 | /* free resources */ | 1687 | /* free resources */ |
@@ -1725,11 +1692,11 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap, | |||
1725 | set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr)); | 1692 | set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr)); |
1726 | 1693 | ||
1727 | /* convert and insert */ | 1694 | /* convert and insert */ |
1728 | btree = (struct nilfs_btree *)bmap; | 1695 | dat = NILFS_BMAP_USE_VBN(bmap) ? nilfs_bmap_get_dat(bmap) : NULL; |
1729 | nilfs_btree_init(bmap); | 1696 | nilfs_btree_init(bmap); |
1730 | if (nreq != NULL) { | 1697 | if (nreq != NULL) { |
1731 | nilfs_bmap_commit_alloc_ptr(bmap, dreq); | 1698 | nilfs_bmap_commit_alloc_ptr(bmap, dreq, dat); |
1732 | nilfs_bmap_commit_alloc_ptr(bmap, nreq); | 1699 | nilfs_bmap_commit_alloc_ptr(bmap, nreq, dat); |
1733 | 1700 | ||
1734 | /* create child node at level 1 */ | 1701 | /* create child node at level 1 */ |
1735 | lock_buffer(bh); | 1702 | lock_buffer(bh); |
@@ -1751,7 +1718,7 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap, | |||
1751 | nilfs_btree_node_init(btree, node, NILFS_BTREE_NODE_ROOT, | 1718 | nilfs_btree_node_init(btree, node, NILFS_BTREE_NODE_ROOT, |
1752 | 2, 1, &keys[0], &tmpptr); | 1719 | 2, 1, &keys[0], &tmpptr); |
1753 | } else { | 1720 | } else { |
1754 | nilfs_bmap_commit_alloc_ptr(bmap, dreq); | 1721 | nilfs_bmap_commit_alloc_ptr(bmap, dreq, dat); |
1755 | 1722 | ||
1756 | /* create root node at level 1 */ | 1723 | /* create root node at level 1 */ |
1757 | node = nilfs_btree_get_root(btree); | 1724 | node = nilfs_btree_get_root(btree); |
@@ -1822,7 +1789,7 @@ static int nilfs_btree_propagate_p(struct nilfs_btree *btree, | |||
1822 | 1789 | ||
1823 | static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree, | 1790 | static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree, |
1824 | struct nilfs_btree_path *path, | 1791 | struct nilfs_btree_path *path, |
1825 | int level) | 1792 | int level, struct inode *dat) |
1826 | { | 1793 | { |
1827 | struct nilfs_btree_node *parent; | 1794 | struct nilfs_btree_node *parent; |
1828 | int ret; | 1795 | int ret; |
@@ -1832,9 +1799,8 @@ static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree, | |||
1832 | nilfs_btree_node_get_ptr(btree, parent, | 1799 | nilfs_btree_node_get_ptr(btree, parent, |
1833 | path[level + 1].bp_index); | 1800 | path[level + 1].bp_index); |
1834 | path[level].bp_newreq.bpr_ptr = path[level].bp_oldreq.bpr_ptr + 1; | 1801 | path[level].bp_newreq.bpr_ptr = path[level].bp_oldreq.bpr_ptr + 1; |
1835 | ret = nilfs_bmap_prepare_update_v(&btree->bt_bmap, | 1802 | ret = nilfs_dat_prepare_update(dat, &path[level].bp_oldreq.bpr_req, |
1836 | &path[level].bp_oldreq, | 1803 | &path[level].bp_newreq.bpr_req); |
1837 | &path[level].bp_newreq); | ||
1838 | if (ret < 0) | 1804 | if (ret < 0) |
1839 | return ret; | 1805 | return ret; |
1840 | 1806 | ||
@@ -1846,9 +1812,9 @@ static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree, | |||
1846 | &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache, | 1812 | &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache, |
1847 | &path[level].bp_ctxt); | 1813 | &path[level].bp_ctxt); |
1848 | if (ret < 0) { | 1814 | if (ret < 0) { |
1849 | nilfs_bmap_abort_update_v(&btree->bt_bmap, | 1815 | nilfs_dat_abort_update(dat, |
1850 | &path[level].bp_oldreq, | 1816 | &path[level].bp_oldreq.bpr_req, |
1851 | &path[level].bp_newreq); | 1817 | &path[level].bp_newreq.bpr_req); |
1852 | return ret; | 1818 | return ret; |
1853 | } | 1819 | } |
1854 | } | 1820 | } |
@@ -1858,13 +1824,13 @@ static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree, | |||
1858 | 1824 | ||
1859 | static void nilfs_btree_commit_update_v(struct nilfs_btree *btree, | 1825 | static void nilfs_btree_commit_update_v(struct nilfs_btree *btree, |
1860 | struct nilfs_btree_path *path, | 1826 | struct nilfs_btree_path *path, |
1861 | int level) | 1827 | int level, struct inode *dat) |
1862 | { | 1828 | { |
1863 | struct nilfs_btree_node *parent; | 1829 | struct nilfs_btree_node *parent; |
1864 | 1830 | ||
1865 | nilfs_bmap_commit_update_v(&btree->bt_bmap, | 1831 | nilfs_dat_commit_update(dat, &path[level].bp_oldreq.bpr_req, |
1866 | &path[level].bp_oldreq, | 1832 | &path[level].bp_newreq.bpr_req, |
1867 | &path[level].bp_newreq); | 1833 | btree->bt_bmap.b_ptr_type == NILFS_BMAP_PTR_VS); |
1868 | 1834 | ||
1869 | if (buffer_nilfs_node(path[level].bp_bh)) { | 1835 | if (buffer_nilfs_node(path[level].bp_bh)) { |
1870 | nilfs_btnode_commit_change_key( | 1836 | nilfs_btnode_commit_change_key( |
@@ -1881,11 +1847,10 @@ static void nilfs_btree_commit_update_v(struct nilfs_btree *btree, | |||
1881 | 1847 | ||
1882 | static void nilfs_btree_abort_update_v(struct nilfs_btree *btree, | 1848 | static void nilfs_btree_abort_update_v(struct nilfs_btree *btree, |
1883 | struct nilfs_btree_path *path, | 1849 | struct nilfs_btree_path *path, |
1884 | int level) | 1850 | int level, struct inode *dat) |
1885 | { | 1851 | { |
1886 | nilfs_bmap_abort_update_v(&btree->bt_bmap, | 1852 | nilfs_dat_abort_update(dat, &path[level].bp_oldreq.bpr_req, |
1887 | &path[level].bp_oldreq, | 1853 | &path[level].bp_newreq.bpr_req); |
1888 | &path[level].bp_newreq); | ||
1889 | if (buffer_nilfs_node(path[level].bp_bh)) | 1854 | if (buffer_nilfs_node(path[level].bp_bh)) |
1890 | nilfs_btnode_abort_change_key( | 1855 | nilfs_btnode_abort_change_key( |
1891 | &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache, | 1856 | &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache, |
@@ -1894,14 +1859,14 @@ static void nilfs_btree_abort_update_v(struct nilfs_btree *btree, | |||
1894 | 1859 | ||
1895 | static int nilfs_btree_prepare_propagate_v(struct nilfs_btree *btree, | 1860 | static int nilfs_btree_prepare_propagate_v(struct nilfs_btree *btree, |
1896 | struct nilfs_btree_path *path, | 1861 | struct nilfs_btree_path *path, |
1897 | int minlevel, | 1862 | int minlevel, int *maxlevelp, |
1898 | int *maxlevelp) | 1863 | struct inode *dat) |
1899 | { | 1864 | { |
1900 | int level, ret; | 1865 | int level, ret; |
1901 | 1866 | ||
1902 | level = minlevel; | 1867 | level = minlevel; |
1903 | if (!buffer_nilfs_volatile(path[level].bp_bh)) { | 1868 | if (!buffer_nilfs_volatile(path[level].bp_bh)) { |
1904 | ret = nilfs_btree_prepare_update_v(btree, path, level); | 1869 | ret = nilfs_btree_prepare_update_v(btree, path, level, dat); |
1905 | if (ret < 0) | 1870 | if (ret < 0) |
1906 | return ret; | 1871 | return ret; |
1907 | } | 1872 | } |
@@ -1909,7 +1874,7 @@ static int nilfs_btree_prepare_propagate_v(struct nilfs_btree *btree, | |||
1909 | !buffer_dirty(path[level].bp_bh)) { | 1874 | !buffer_dirty(path[level].bp_bh)) { |
1910 | 1875 | ||
1911 | WARN_ON(buffer_nilfs_volatile(path[level].bp_bh)); | 1876 | WARN_ON(buffer_nilfs_volatile(path[level].bp_bh)); |
1912 | ret = nilfs_btree_prepare_update_v(btree, path, level); | 1877 | ret = nilfs_btree_prepare_update_v(btree, path, level, dat); |
1913 | if (ret < 0) | 1878 | if (ret < 0) |
1914 | goto out; | 1879 | goto out; |
1915 | } | 1880 | } |
@@ -1921,39 +1886,40 @@ static int nilfs_btree_prepare_propagate_v(struct nilfs_btree *btree, | |||
1921 | /* error */ | 1886 | /* error */ |
1922 | out: | 1887 | out: |
1923 | while (--level > minlevel) | 1888 | while (--level > minlevel) |
1924 | nilfs_btree_abort_update_v(btree, path, level); | 1889 | nilfs_btree_abort_update_v(btree, path, level, dat); |
1925 | if (!buffer_nilfs_volatile(path[level].bp_bh)) | 1890 | if (!buffer_nilfs_volatile(path[level].bp_bh)) |
1926 | nilfs_btree_abort_update_v(btree, path, level); | 1891 | nilfs_btree_abort_update_v(btree, path, level, dat); |
1927 | return ret; | 1892 | return ret; |
1928 | } | 1893 | } |
1929 | 1894 | ||
1930 | static void nilfs_btree_commit_propagate_v(struct nilfs_btree *btree, | 1895 | static void nilfs_btree_commit_propagate_v(struct nilfs_btree *btree, |
1931 | struct nilfs_btree_path *path, | 1896 | struct nilfs_btree_path *path, |
1932 | int minlevel, | 1897 | int minlevel, int maxlevel, |
1933 | int maxlevel, | 1898 | struct buffer_head *bh, |
1934 | struct buffer_head *bh) | 1899 | struct inode *dat) |
1935 | { | 1900 | { |
1936 | int level; | 1901 | int level; |
1937 | 1902 | ||
1938 | if (!buffer_nilfs_volatile(path[minlevel].bp_bh)) | 1903 | if (!buffer_nilfs_volatile(path[minlevel].bp_bh)) |
1939 | nilfs_btree_commit_update_v(btree, path, minlevel); | 1904 | nilfs_btree_commit_update_v(btree, path, minlevel, dat); |
1940 | 1905 | ||
1941 | for (level = minlevel + 1; level <= maxlevel; level++) | 1906 | for (level = minlevel + 1; level <= maxlevel; level++) |
1942 | nilfs_btree_commit_update_v(btree, path, level); | 1907 | nilfs_btree_commit_update_v(btree, path, level, dat); |
1943 | } | 1908 | } |
1944 | 1909 | ||
1945 | static int nilfs_btree_propagate_v(struct nilfs_btree *btree, | 1910 | static int nilfs_btree_propagate_v(struct nilfs_btree *btree, |
1946 | struct nilfs_btree_path *path, | 1911 | struct nilfs_btree_path *path, |
1947 | int level, | 1912 | int level, struct buffer_head *bh) |
1948 | struct buffer_head *bh) | ||
1949 | { | 1913 | { |
1950 | int maxlevel, ret; | 1914 | int maxlevel, ret; |
1951 | struct nilfs_btree_node *parent; | 1915 | struct nilfs_btree_node *parent; |
1916 | struct inode *dat = nilfs_bmap_get_dat(&btree->bt_bmap); | ||
1952 | __u64 ptr; | 1917 | __u64 ptr; |
1953 | 1918 | ||
1954 | get_bh(bh); | 1919 | get_bh(bh); |
1955 | path[level].bp_bh = bh; | 1920 | path[level].bp_bh = bh; |
1956 | ret = nilfs_btree_prepare_propagate_v(btree, path, level, &maxlevel); | 1921 | ret = nilfs_btree_prepare_propagate_v(btree, path, level, &maxlevel, |
1922 | dat); | ||
1957 | if (ret < 0) | 1923 | if (ret < 0) |
1958 | goto out; | 1924 | goto out; |
1959 | 1925 | ||
@@ -1961,12 +1927,12 @@ static int nilfs_btree_propagate_v(struct nilfs_btree *btree, | |||
1961 | parent = nilfs_btree_get_node(btree, path, level + 1); | 1927 | parent = nilfs_btree_get_node(btree, path, level + 1); |
1962 | ptr = nilfs_btree_node_get_ptr(btree, parent, | 1928 | ptr = nilfs_btree_node_get_ptr(btree, parent, |
1963 | path[level + 1].bp_index); | 1929 | path[level + 1].bp_index); |
1964 | ret = nilfs_bmap_mark_dirty(&btree->bt_bmap, ptr); | 1930 | ret = nilfs_dat_mark_dirty(dat, ptr); |
1965 | if (ret < 0) | 1931 | if (ret < 0) |
1966 | goto out; | 1932 | goto out; |
1967 | } | 1933 | } |
1968 | 1934 | ||
1969 | nilfs_btree_commit_propagate_v(btree, path, level, maxlevel, bh); | 1935 | nilfs_btree_commit_propagate_v(btree, path, level, maxlevel, bh, dat); |
1970 | 1936 | ||
1971 | out: | 1937 | out: |
1972 | brelse(path[level].bp_bh); | 1938 | brelse(path[level].bp_bh); |
@@ -1986,15 +1952,15 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap, | |||
1986 | WARN_ON(!buffer_dirty(bh)); | 1952 | WARN_ON(!buffer_dirty(bh)); |
1987 | 1953 | ||
1988 | btree = (struct nilfs_btree *)bmap; | 1954 | btree = (struct nilfs_btree *)bmap; |
1989 | path = nilfs_btree_alloc_path(btree); | 1955 | path = nilfs_btree_alloc_path(); |
1990 | if (path == NULL) | 1956 | if (path == NULL) |
1991 | return -ENOMEM; | 1957 | return -ENOMEM; |
1992 | nilfs_btree_init_path(btree, path); | 1958 | nilfs_btree_init_path(path); |
1993 | 1959 | ||
1994 | if (buffer_nilfs_node(bh)) { | 1960 | if (buffer_nilfs_node(bh)) { |
1995 | node = (struct nilfs_btree_node *)bh->b_data; | 1961 | node = (struct nilfs_btree_node *)bh->b_data; |
1996 | key = nilfs_btree_node_get_key(btree, node, 0); | 1962 | key = nilfs_btree_node_get_key(node, 0); |
1997 | level = nilfs_btree_node_get_level(btree, node); | 1963 | level = nilfs_btree_node_get_level(node); |
1998 | } else { | 1964 | } else { |
1999 | key = nilfs_bmap_data_get_key(bmap, bh); | 1965 | key = nilfs_bmap_data_get_key(bmap, bh); |
2000 | level = NILFS_BTREE_LEVEL_DATA; | 1966 | level = NILFS_BTREE_LEVEL_DATA; |
@@ -2013,8 +1979,8 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap, | |||
2013 | nilfs_btree_propagate_p(btree, path, level, bh); | 1979 | nilfs_btree_propagate_p(btree, path, level, bh); |
2014 | 1980 | ||
2015 | out: | 1981 | out: |
2016 | nilfs_btree_clear_path(btree, path); | 1982 | nilfs_btree_release_path(path); |
2017 | nilfs_btree_free_path(btree, path); | 1983 | nilfs_btree_free_path(path); |
2018 | 1984 | ||
2019 | return ret; | 1985 | return ret; |
2020 | } | 1986 | } |
@@ -2022,7 +1988,7 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap, | |||
2022 | static int nilfs_btree_propagate_gc(const struct nilfs_bmap *bmap, | 1988 | static int nilfs_btree_propagate_gc(const struct nilfs_bmap *bmap, |
2023 | struct buffer_head *bh) | 1989 | struct buffer_head *bh) |
2024 | { | 1990 | { |
2025 | return nilfs_bmap_mark_dirty(bmap, bh->b_blocknr); | 1991 | return nilfs_dat_mark_dirty(nilfs_bmap_get_dat(bmap), bh->b_blocknr); |
2026 | } | 1992 | } |
2027 | 1993 | ||
2028 | static void nilfs_btree_add_dirty_buffer(struct nilfs_btree *btree, | 1994 | static void nilfs_btree_add_dirty_buffer(struct nilfs_btree *btree, |
@@ -2037,12 +2003,12 @@ static void nilfs_btree_add_dirty_buffer(struct nilfs_btree *btree, | |||
2037 | 2003 | ||
2038 | get_bh(bh); | 2004 | get_bh(bh); |
2039 | node = (struct nilfs_btree_node *)bh->b_data; | 2005 | node = (struct nilfs_btree_node *)bh->b_data; |
2040 | key = nilfs_btree_node_get_key(btree, node, 0); | 2006 | key = nilfs_btree_node_get_key(node, 0); |
2041 | level = nilfs_btree_node_get_level(btree, node); | 2007 | level = nilfs_btree_node_get_level(node); |
2042 | list_for_each(head, &lists[level]) { | 2008 | list_for_each(head, &lists[level]) { |
2043 | cbh = list_entry(head, struct buffer_head, b_assoc_buffers); | 2009 | cbh = list_entry(head, struct buffer_head, b_assoc_buffers); |
2044 | cnode = (struct nilfs_btree_node *)cbh->b_data; | 2010 | cnode = (struct nilfs_btree_node *)cbh->b_data; |
2045 | ckey = nilfs_btree_node_get_key(btree, cnode, 0); | 2011 | ckey = nilfs_btree_node_get_key(cnode, 0); |
2046 | if (key < ckey) | 2012 | if (key < ckey) |
2047 | break; | 2013 | break; |
2048 | } | 2014 | } |
@@ -2120,8 +2086,7 @@ static int nilfs_btree_assign_p(struct nilfs_btree *btree, | |||
2120 | nilfs_btree_node_set_ptr(btree, parent, | 2086 | nilfs_btree_node_set_ptr(btree, parent, |
2121 | path[level + 1].bp_index, blocknr); | 2087 | path[level + 1].bp_index, blocknr); |
2122 | 2088 | ||
2123 | key = nilfs_btree_node_get_key(btree, parent, | 2089 | key = nilfs_btree_node_get_key(parent, path[level + 1].bp_index); |
2124 | path[level + 1].bp_index); | ||
2125 | /* on-disk format */ | 2090 | /* on-disk format */ |
2126 | binfo->bi_dat.bi_blkoff = nilfs_bmap_key_to_dkey(key); | 2091 | binfo->bi_dat.bi_blkoff = nilfs_bmap_key_to_dkey(key); |
2127 | binfo->bi_dat.bi_level = level; | 2092 | binfo->bi_dat.bi_level = level; |
@@ -2137,6 +2102,7 @@ static int nilfs_btree_assign_v(struct nilfs_btree *btree, | |||
2137 | union nilfs_binfo *binfo) | 2102 | union nilfs_binfo *binfo) |
2138 | { | 2103 | { |
2139 | struct nilfs_btree_node *parent; | 2104 | struct nilfs_btree_node *parent; |
2105 | struct inode *dat = nilfs_bmap_get_dat(&btree->bt_bmap); | ||
2140 | __u64 key; | 2106 | __u64 key; |
2141 | __u64 ptr; | 2107 | __u64 ptr; |
2142 | union nilfs_bmap_ptr_req req; | 2108 | union nilfs_bmap_ptr_req req; |
@@ -2146,12 +2112,12 @@ static int nilfs_btree_assign_v(struct nilfs_btree *btree, | |||
2146 | ptr = nilfs_btree_node_get_ptr(btree, parent, | 2112 | ptr = nilfs_btree_node_get_ptr(btree, parent, |
2147 | path[level + 1].bp_index); | 2113 | path[level + 1].bp_index); |
2148 | req.bpr_ptr = ptr; | 2114 | req.bpr_ptr = ptr; |
2149 | ret = nilfs_bmap_start_v(&btree->bt_bmap, &req, blocknr); | 2115 | ret = nilfs_dat_prepare_start(dat, &req.bpr_req); |
2150 | if (unlikely(ret < 0)) | 2116 | if (ret < 0) |
2151 | return ret; | 2117 | return ret; |
2118 | nilfs_dat_commit_start(dat, &req.bpr_req, blocknr); | ||
2152 | 2119 | ||
2153 | key = nilfs_btree_node_get_key(btree, parent, | 2120 | key = nilfs_btree_node_get_key(parent, path[level + 1].bp_index); |
2154 | path[level + 1].bp_index); | ||
2155 | /* on-disk format */ | 2121 | /* on-disk format */ |
2156 | binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr); | 2122 | binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr); |
2157 | binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key); | 2123 | binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key); |
@@ -2171,15 +2137,15 @@ static int nilfs_btree_assign(struct nilfs_bmap *bmap, | |||
2171 | int level, ret; | 2137 | int level, ret; |
2172 | 2138 | ||
2173 | btree = (struct nilfs_btree *)bmap; | 2139 | btree = (struct nilfs_btree *)bmap; |
2174 | path = nilfs_btree_alloc_path(btree); | 2140 | path = nilfs_btree_alloc_path(); |
2175 | if (path == NULL) | 2141 | if (path == NULL) |
2176 | return -ENOMEM; | 2142 | return -ENOMEM; |
2177 | nilfs_btree_init_path(btree, path); | 2143 | nilfs_btree_init_path(path); |
2178 | 2144 | ||
2179 | if (buffer_nilfs_node(*bh)) { | 2145 | if (buffer_nilfs_node(*bh)) { |
2180 | node = (struct nilfs_btree_node *)(*bh)->b_data; | 2146 | node = (struct nilfs_btree_node *)(*bh)->b_data; |
2181 | key = nilfs_btree_node_get_key(btree, node, 0); | 2147 | key = nilfs_btree_node_get_key(node, 0); |
2182 | level = nilfs_btree_node_get_level(btree, node); | 2148 | level = nilfs_btree_node_get_level(node); |
2183 | } else { | 2149 | } else { |
2184 | key = nilfs_bmap_data_get_key(bmap, *bh); | 2150 | key = nilfs_bmap_data_get_key(bmap, *bh); |
2185 | level = NILFS_BTREE_LEVEL_DATA; | 2151 | level = NILFS_BTREE_LEVEL_DATA; |
@@ -2196,8 +2162,8 @@ static int nilfs_btree_assign(struct nilfs_bmap *bmap, | |||
2196 | nilfs_btree_assign_p(btree, path, level, bh, blocknr, binfo); | 2162 | nilfs_btree_assign_p(btree, path, level, bh, blocknr, binfo); |
2197 | 2163 | ||
2198 | out: | 2164 | out: |
2199 | nilfs_btree_clear_path(btree, path); | 2165 | nilfs_btree_release_path(path); |
2200 | nilfs_btree_free_path(btree, path); | 2166 | nilfs_btree_free_path(path); |
2201 | 2167 | ||
2202 | return ret; | 2168 | return ret; |
2203 | } | 2169 | } |
@@ -2207,19 +2173,18 @@ static int nilfs_btree_assign_gc(struct nilfs_bmap *bmap, | |||
2207 | sector_t blocknr, | 2173 | sector_t blocknr, |
2208 | union nilfs_binfo *binfo) | 2174 | union nilfs_binfo *binfo) |
2209 | { | 2175 | { |
2210 | struct nilfs_btree *btree; | ||
2211 | struct nilfs_btree_node *node; | 2176 | struct nilfs_btree_node *node; |
2212 | __u64 key; | 2177 | __u64 key; |
2213 | int ret; | 2178 | int ret; |
2214 | 2179 | ||
2215 | btree = (struct nilfs_btree *)bmap; | 2180 | ret = nilfs_dat_move(nilfs_bmap_get_dat(bmap), (*bh)->b_blocknr, |
2216 | ret = nilfs_bmap_move_v(bmap, (*bh)->b_blocknr, blocknr); | 2181 | blocknr); |
2217 | if (ret < 0) | 2182 | if (ret < 0) |
2218 | return ret; | 2183 | return ret; |
2219 | 2184 | ||
2220 | if (buffer_nilfs_node(*bh)) { | 2185 | if (buffer_nilfs_node(*bh)) { |
2221 | node = (struct nilfs_btree_node *)(*bh)->b_data; | 2186 | node = (struct nilfs_btree_node *)(*bh)->b_data; |
2222 | key = nilfs_btree_node_get_key(btree, node, 0); | 2187 | key = nilfs_btree_node_get_key(node, 0); |
2223 | } else | 2188 | } else |
2224 | key = nilfs_bmap_data_get_key(bmap, *bh); | 2189 | key = nilfs_bmap_data_get_key(bmap, *bh); |
2225 | 2190 | ||
@@ -2239,10 +2204,10 @@ static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level) | |||
2239 | int ret; | 2204 | int ret; |
2240 | 2205 | ||
2241 | btree = (struct nilfs_btree *)bmap; | 2206 | btree = (struct nilfs_btree *)bmap; |
2242 | path = nilfs_btree_alloc_path(btree); | 2207 | path = nilfs_btree_alloc_path(); |
2243 | if (path == NULL) | 2208 | if (path == NULL) |
2244 | return -ENOMEM; | 2209 | return -ENOMEM; |
2245 | nilfs_btree_init_path(btree, path); | 2210 | nilfs_btree_init_path(path); |
2246 | 2211 | ||
2247 | ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level + 1); | 2212 | ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level + 1); |
2248 | if (ret < 0) { | 2213 | if (ret < 0) { |
@@ -2262,8 +2227,8 @@ static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level) | |||
2262 | nilfs_bmap_set_dirty(&btree->bt_bmap); | 2227 | nilfs_bmap_set_dirty(&btree->bt_bmap); |
2263 | 2228 | ||
2264 | out: | 2229 | out: |
2265 | nilfs_btree_clear_path(btree, path); | 2230 | nilfs_btree_release_path(path); |
2266 | nilfs_btree_free_path(btree, path); | 2231 | nilfs_btree_free_path(path); |
2267 | return ret; | 2232 | return ret; |
2268 | } | 2233 | } |
2269 | 2234 | ||
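The btree.c changes above follow one pattern: node accessors drop the now-unused btree argument, and the prepare/commit/abort helpers take the DAT inode explicitly instead of resolving it internally on every call. A minimal sketch of the resulting calling convention for the delete path (function names from the patch; the lookup step is omitted and the function itself is illustrative, not part of the diff):

    static int delete_sketch(struct nilfs_bmap *bmap, __u64 key)
    {
            struct nilfs_btree *btree = (struct nilfs_btree *)bmap;
            struct nilfs_btree_path *path;
            struct nilfs_bmap_stats stats;
            struct inode *dat;
            int level, ret;

            path = nilfs_btree_alloc_path();
            if (path == NULL)
                    return -ENOMEM;
            nilfs_btree_init_path(path);

            /* Resolve the DAT once per operation; NULL for bmaps that
               store raw block numbers, such as the DAT file itself. */
            dat = NILFS_BMAP_USE_VBN(bmap) ?
                    nilfs_bmap_get_dat(bmap) : NULL;

            ret = nilfs_btree_prepare_delete(btree, path, &level, &stats, dat);
            if (!ret)
                    nilfs_btree_commit_delete(btree, path, level, dat);

            nilfs_btree_release_path(path);
            nilfs_btree_free_path(path);
            return ret;
    }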
diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c index aec942cf79e3..1c6cfb59128d 100644 --- a/fs/nilfs2/cpfile.c +++ b/fs/nilfs2/cpfile.c | |||
@@ -815,8 +815,10 @@ int nilfs_cpfile_is_snapshot(struct inode *cpfile, __u64 cno) | |||
815 | void *kaddr; | 815 | void *kaddr; |
816 | int ret; | 816 | int ret; |
817 | 817 | ||
818 | if (cno == 0) | 818 | /* CP number is invalid if it's zero or larger than the |
819 | return -ENOENT; /* checkpoint number 0 is invalid */ | 819 | largest existing one. */ |
820 | if (cno == 0 || cno >= nilfs_mdt_cno(cpfile)) | ||
821 | return -ENOENT; | ||
820 | down_read(&NILFS_MDT(cpfile)->mi_sem); | 822 | down_read(&NILFS_MDT(cpfile)->mi_sem); |
821 | 823 | ||
822 | ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &bh); | 824 | ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &bh); |
@@ -824,7 +826,10 @@ int nilfs_cpfile_is_snapshot(struct inode *cpfile, __u64 cno) | |||
824 | goto out; | 826 | goto out; |
825 | kaddr = kmap_atomic(bh->b_page, KM_USER0); | 827 | kaddr = kmap_atomic(bh->b_page, KM_USER0); |
826 | cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr); | 828 | cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr); |
827 | ret = nilfs_checkpoint_snapshot(cp); | 829 | if (nilfs_checkpoint_invalid(cp)) |
830 | ret = -ENOENT; | ||
831 | else | ||
832 | ret = nilfs_checkpoint_snapshot(cp); | ||
828 | kunmap_atomic(kaddr, KM_USER0); | 833 | kunmap_atomic(kaddr, KM_USER0); |
829 | brelse(bh); | 834 | brelse(bh); |
830 | 835 | ||
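Condensed, the two extra rejections nilfs_cpfile_is_snapshot() now performs look like this (locking and the checkpoint-block lookup elided; names from the patch):

    if (cno == 0 || cno >= nilfs_mdt_cno(cpfile))
            return -ENOENT;         /* outside the range of existing CPs */
    ...
    if (nilfs_checkpoint_invalid(cp))
            ret = -ENOENT;          /* slot exists but holds no live CP */
    else
            ret = nilfs_checkpoint_snapshot(cp);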
diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h index 788a45950197..debea896e701 100644 --- a/fs/nilfs2/cpfile.h +++ b/fs/nilfs2/cpfile.h | |||
@@ -27,8 +27,6 @@ | |||
27 | #include <linux/buffer_head.h> | 27 | #include <linux/buffer_head.h> |
28 | #include <linux/nilfs2_fs.h> | 28 | #include <linux/nilfs2_fs.h> |
29 | 29 | ||
30 | #define NILFS_CPFILE_GFP NILFS_MDT_GFP | ||
31 | |||
32 | 30 | ||
33 | int nilfs_cpfile_get_checkpoint(struct inode *, __u64, int, | 31 | int nilfs_cpfile_get_checkpoint(struct inode *, __u64, int, |
34 | struct nilfs_checkpoint **, | 32 | struct nilfs_checkpoint **, |
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c index 8927ca27e6f7..1ff8e15bd36b 100644 --- a/fs/nilfs2/dat.c +++ b/fs/nilfs2/dat.c | |||
@@ -109,12 +109,6 @@ void nilfs_dat_commit_free(struct inode *dat, struct nilfs_palloc_req *req) | |||
109 | nilfs_palloc_commit_free_entry(dat, req); | 109 | nilfs_palloc_commit_free_entry(dat, req); |
110 | } | 110 | } |
111 | 111 | ||
112 | void nilfs_dat_abort_free(struct inode *dat, struct nilfs_palloc_req *req) | ||
113 | { | ||
114 | nilfs_dat_abort_entry(dat, req); | ||
115 | nilfs_palloc_abort_free_entry(dat, req); | ||
116 | } | ||
117 | |||
118 | int nilfs_dat_prepare_start(struct inode *dat, struct nilfs_palloc_req *req) | 112 | int nilfs_dat_prepare_start(struct inode *dat, struct nilfs_palloc_req *req) |
119 | { | 113 | { |
120 | int ret; | 114 | int ret; |
@@ -140,11 +134,6 @@ void nilfs_dat_commit_start(struct inode *dat, struct nilfs_palloc_req *req, | |||
140 | nilfs_dat_commit_entry(dat, req); | 134 | nilfs_dat_commit_entry(dat, req); |
141 | } | 135 | } |
142 | 136 | ||
143 | void nilfs_dat_abort_start(struct inode *dat, struct nilfs_palloc_req *req) | ||
144 | { | ||
145 | nilfs_dat_abort_entry(dat, req); | ||
146 | } | ||
147 | |||
148 | int nilfs_dat_prepare_end(struct inode *dat, struct nilfs_palloc_req *req) | 137 | int nilfs_dat_prepare_end(struct inode *dat, struct nilfs_palloc_req *req) |
149 | { | 138 | { |
150 | struct nilfs_dat_entry *entry; | 139 | struct nilfs_dat_entry *entry; |
@@ -222,6 +211,37 @@ void nilfs_dat_abort_end(struct inode *dat, struct nilfs_palloc_req *req) | |||
222 | nilfs_dat_abort_entry(dat, req); | 211 | nilfs_dat_abort_entry(dat, req); |
223 | } | 212 | } |
224 | 213 | ||
214 | int nilfs_dat_prepare_update(struct inode *dat, | ||
215 | struct nilfs_palloc_req *oldreq, | ||
216 | struct nilfs_palloc_req *newreq) | ||
217 | { | ||
218 | int ret; | ||
219 | |||
220 | ret = nilfs_dat_prepare_end(dat, oldreq); | ||
221 | if (!ret) { | ||
222 | ret = nilfs_dat_prepare_alloc(dat, newreq); | ||
223 | if (ret < 0) | ||
224 | nilfs_dat_abort_end(dat, oldreq); | ||
225 | } | ||
226 | return ret; | ||
227 | } | ||
228 | |||
229 | void nilfs_dat_commit_update(struct inode *dat, | ||
230 | struct nilfs_palloc_req *oldreq, | ||
231 | struct nilfs_palloc_req *newreq, int dead) | ||
232 | { | ||
233 | nilfs_dat_commit_end(dat, oldreq, dead); | ||
234 | nilfs_dat_commit_alloc(dat, newreq); | ||
235 | } | ||
236 | |||
237 | void nilfs_dat_abort_update(struct inode *dat, | ||
238 | struct nilfs_palloc_req *oldreq, | ||
239 | struct nilfs_palloc_req *newreq) | ||
240 | { | ||
241 | nilfs_dat_abort_end(dat, oldreq); | ||
242 | nilfs_dat_abort_alloc(dat, newreq); | ||
243 | } | ||
244 | |||
225 | /** | 245 | /** |
226 | * nilfs_dat_mark_dirty - | 246 | * nilfs_dat_mark_dirty - |
227 | * @dat: DAT file inode | 247 | * @dat: DAT file inode |
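The new prepare/commit/abort triple packages virtual-block relocation as a single two-phase operation. A hypothetical caller showing how the three compose — the real users are the btree and direct bmap update paths in this patch, and stage_block_move() is a placeholder for whatever work happens between the phases:

    static int relocate_sketch(struct inode *dat,
                               struct nilfs_palloc_req *oldreq,
                               struct nilfs_palloc_req *newreq, int dead)
    {
            int ret;

            ret = nilfs_dat_prepare_update(dat, oldreq, newreq);
            if (ret < 0)
                    return ret;     /* prepare already undid its own work */

            if (stage_block_move() < 0) {   /* placeholder step */
                    nilfs_dat_abort_update(dat, oldreq, newreq);
                    return -EIO;
            }

            nilfs_dat_commit_update(dat, oldreq, newreq, dead);
            return 0;
    }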
diff --git a/fs/nilfs2/dat.h b/fs/nilfs2/dat.h index d328b81eead4..406070d3ff49 100644 --- a/fs/nilfs2/dat.h +++ b/fs/nilfs2/dat.h | |||
@@ -27,7 +27,6 @@ | |||
27 | #include <linux/buffer_head.h> | 27 | #include <linux/buffer_head.h> |
28 | #include <linux/fs.h> | 28 | #include <linux/fs.h> |
29 | 29 | ||
30 | #define NILFS_DAT_GFP NILFS_MDT_GFP | ||
31 | 30 | ||
32 | struct nilfs_palloc_req; | 31 | struct nilfs_palloc_req; |
33 | 32 | ||
@@ -39,10 +38,15 @@ void nilfs_dat_abort_alloc(struct inode *, struct nilfs_palloc_req *); | |||
39 | int nilfs_dat_prepare_start(struct inode *, struct nilfs_palloc_req *); | 38 | int nilfs_dat_prepare_start(struct inode *, struct nilfs_palloc_req *); |
40 | void nilfs_dat_commit_start(struct inode *, struct nilfs_palloc_req *, | 39 | void nilfs_dat_commit_start(struct inode *, struct nilfs_palloc_req *, |
41 | sector_t); | 40 | sector_t); |
42 | void nilfs_dat_abort_start(struct inode *, struct nilfs_palloc_req *); | ||
43 | int nilfs_dat_prepare_end(struct inode *, struct nilfs_palloc_req *); | 41 | int nilfs_dat_prepare_end(struct inode *, struct nilfs_palloc_req *); |
44 | void nilfs_dat_commit_end(struct inode *, struct nilfs_palloc_req *, int); | 42 | void nilfs_dat_commit_end(struct inode *, struct nilfs_palloc_req *, int); |
45 | void nilfs_dat_abort_end(struct inode *, struct nilfs_palloc_req *); | 43 | void nilfs_dat_abort_end(struct inode *, struct nilfs_palloc_req *); |
44 | int nilfs_dat_prepare_update(struct inode *, struct nilfs_palloc_req *, | ||
45 | struct nilfs_palloc_req *); | ||
46 | void nilfs_dat_commit_update(struct inode *, struct nilfs_palloc_req *, | ||
47 | struct nilfs_palloc_req *, int); | ||
48 | void nilfs_dat_abort_update(struct inode *, struct nilfs_palloc_req *, | ||
49 | struct nilfs_palloc_req *); | ||
46 | 50 | ||
47 | int nilfs_dat_mark_dirty(struct inode *, __u64); | 51 | int nilfs_dat_mark_dirty(struct inode *, __u64); |
48 | int nilfs_dat_freev(struct inode *, __u64 *, size_t); | 52 | int nilfs_dat_freev(struct inode *, __u64 *, size_t); |
diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c index 342d9765df8d..d369ac718277 100644 --- a/fs/nilfs2/direct.c +++ b/fs/nilfs2/direct.c | |||
@@ -125,106 +125,64 @@ static void nilfs_direct_set_target_v(struct nilfs_direct *direct, | |||
125 | direct->d_bmap.b_last_allocated_ptr = ptr; | 125 | direct->d_bmap.b_last_allocated_ptr = ptr; |
126 | } | 126 | } |
127 | 127 | ||
128 | static int nilfs_direct_prepare_insert(struct nilfs_direct *direct, | ||
129 | __u64 key, | ||
130 | union nilfs_bmap_ptr_req *req, | ||
131 | struct nilfs_bmap_stats *stats) | ||
132 | { | ||
133 | int ret; | ||
134 | |||
135 | if (NILFS_BMAP_USE_VBN(&direct->d_bmap)) | ||
136 | req->bpr_ptr = nilfs_direct_find_target_v(direct, key); | ||
137 | ret = nilfs_bmap_prepare_alloc_ptr(&direct->d_bmap, req); | ||
138 | if (ret < 0) | ||
139 | return ret; | ||
140 | |||
141 | stats->bs_nblocks = 1; | ||
142 | return 0; | ||
143 | } | ||
144 | |||
145 | static void nilfs_direct_commit_insert(struct nilfs_direct *direct, | ||
146 | union nilfs_bmap_ptr_req *req, | ||
147 | __u64 key, __u64 ptr) | ||
148 | { | ||
149 | struct buffer_head *bh; | ||
150 | |||
151 | /* ptr must be a pointer to a buffer head. */ | ||
152 | bh = (struct buffer_head *)((unsigned long)ptr); | ||
153 | set_buffer_nilfs_volatile(bh); | ||
154 | |||
155 | nilfs_bmap_commit_alloc_ptr(&direct->d_bmap, req); | ||
156 | nilfs_direct_set_ptr(direct, key, req->bpr_ptr); | ||
157 | |||
158 | if (!nilfs_bmap_dirty(&direct->d_bmap)) | ||
159 | nilfs_bmap_set_dirty(&direct->d_bmap); | ||
160 | |||
161 | if (NILFS_BMAP_USE_VBN(&direct->d_bmap)) | ||
162 | nilfs_direct_set_target_v(direct, key, req->bpr_ptr); | ||
163 | } | ||
164 | |||
165 | static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) | 128 | static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) |
166 | { | 129 | { |
167 | struct nilfs_direct *direct; | 130 | struct nilfs_direct *direct = (struct nilfs_direct *)bmap; |
168 | union nilfs_bmap_ptr_req req; | 131 | union nilfs_bmap_ptr_req req; |
169 | struct nilfs_bmap_stats stats; | 132 | struct inode *dat = NULL; |
133 | struct buffer_head *bh; | ||
170 | int ret; | 134 | int ret; |
171 | 135 | ||
172 | direct = (struct nilfs_direct *)bmap; | ||
173 | if (key > NILFS_DIRECT_KEY_MAX) | 136 | if (key > NILFS_DIRECT_KEY_MAX) |
174 | return -ENOENT; | 137 | return -ENOENT; |
175 | if (nilfs_direct_get_ptr(direct, key) != NILFS_BMAP_INVALID_PTR) | 138 | if (nilfs_direct_get_ptr(direct, key) != NILFS_BMAP_INVALID_PTR) |
176 | return -EEXIST; | 139 | return -EEXIST; |
177 | 140 | ||
178 | ret = nilfs_direct_prepare_insert(direct, key, &req, &stats); | 141 | if (NILFS_BMAP_USE_VBN(bmap)) { |
179 | if (ret < 0) | 142 | req.bpr_ptr = nilfs_direct_find_target_v(direct, key); |
180 | return ret; | 143 | dat = nilfs_bmap_get_dat(bmap); |
181 | nilfs_direct_commit_insert(direct, &req, key, ptr); | 144 | } |
182 | nilfs_bmap_add_blocks(bmap, stats.bs_nblocks); | 145 | ret = nilfs_bmap_prepare_alloc_ptr(bmap, &req, dat); |
146 | if (!ret) { | ||
147 | /* ptr must be a pointer to a buffer head. */ | ||
148 | bh = (struct buffer_head *)((unsigned long)ptr); | ||
149 | set_buffer_nilfs_volatile(bh); | ||
183 | 150 | ||
184 | return 0; | 151 | nilfs_bmap_commit_alloc_ptr(bmap, &req, dat); |
185 | } | 152 | nilfs_direct_set_ptr(direct, key, req.bpr_ptr); |
186 | 153 | ||
187 | static int nilfs_direct_prepare_delete(struct nilfs_direct *direct, | 154 | if (!nilfs_bmap_dirty(bmap)) |
188 | union nilfs_bmap_ptr_req *req, | 155 | nilfs_bmap_set_dirty(bmap); |
189 | __u64 key, | ||
190 | struct nilfs_bmap_stats *stats) | ||
191 | { | ||
192 | int ret; | ||
193 | 156 | ||
194 | req->bpr_ptr = nilfs_direct_get_ptr(direct, key); | 157 | if (NILFS_BMAP_USE_VBN(bmap)) |
195 | ret = nilfs_bmap_prepare_end_ptr(&direct->d_bmap, req); | 158 | nilfs_direct_set_target_v(direct, key, req.bpr_ptr); |
196 | if (!ret) | ||
197 | stats->bs_nblocks = 1; | ||
198 | return ret; | ||
199 | } | ||
200 | 159 | ||
201 | static void nilfs_direct_commit_delete(struct nilfs_direct *direct, | 160 | nilfs_bmap_add_blocks(bmap, 1); |
202 | union nilfs_bmap_ptr_req *req, | 161 | } |
203 | __u64 key) | 162 | return ret; |
204 | { | ||
205 | nilfs_bmap_commit_end_ptr(&direct->d_bmap, req); | ||
206 | nilfs_direct_set_ptr(direct, key, NILFS_BMAP_INVALID_PTR); | ||
207 | } | 163 | } |
208 | 164 | ||
209 | static int nilfs_direct_delete(struct nilfs_bmap *bmap, __u64 key) | 165 | static int nilfs_direct_delete(struct nilfs_bmap *bmap, __u64 key) |
210 | { | 166 | { |
211 | struct nilfs_direct *direct; | 167 | struct nilfs_direct *direct = (struct nilfs_direct *)bmap; |
212 | union nilfs_bmap_ptr_req req; | 168 | union nilfs_bmap_ptr_req req; |
213 | struct nilfs_bmap_stats stats; | 169 | struct inode *dat; |
214 | int ret; | 170 | int ret; |
215 | 171 | ||
216 | direct = (struct nilfs_direct *)bmap; | 172 | if (key > NILFS_DIRECT_KEY_MAX || |
217 | if ((key > NILFS_DIRECT_KEY_MAX) || | ||
218 | nilfs_direct_get_ptr(direct, key) == NILFS_BMAP_INVALID_PTR) | 173 | nilfs_direct_get_ptr(direct, key) == NILFS_BMAP_INVALID_PTR) |
219 | return -ENOENT; | 174 | return -ENOENT; |
220 | 175 | ||
221 | ret = nilfs_direct_prepare_delete(direct, &req, key, &stats); | 176 | dat = NILFS_BMAP_USE_VBN(bmap) ? nilfs_bmap_get_dat(bmap) : NULL; |
222 | if (ret < 0) | 177 | req.bpr_ptr = nilfs_direct_get_ptr(direct, key); |
223 | return ret; | ||
224 | nilfs_direct_commit_delete(direct, &req, key); | ||
225 | nilfs_bmap_sub_blocks(bmap, stats.bs_nblocks); | ||
226 | 178 | ||
227 | return 0; | 179 | ret = nilfs_bmap_prepare_end_ptr(bmap, &req, dat); |
180 | if (!ret) { | ||
181 | nilfs_bmap_commit_end_ptr(bmap, &req, dat); | ||
182 | nilfs_direct_set_ptr(direct, key, NILFS_BMAP_INVALID_PTR); | ||
183 | nilfs_bmap_sub_blocks(bmap, 1); | ||
184 | } | ||
185 | return ret; | ||
228 | } | 186 | } |
229 | 187 | ||
230 | static int nilfs_direct_last_key(const struct nilfs_bmap *bmap, __u64 *keyp) | 188 | static int nilfs_direct_last_key(const struct nilfs_bmap *bmap, __u64 *keyp) |
@@ -310,59 +268,56 @@ int nilfs_direct_delete_and_convert(struct nilfs_bmap *bmap, | |||
310 | return 0; | 268 | return 0; |
311 | } | 269 | } |
312 | 270 | ||
313 | static int nilfs_direct_propagate_v(struct nilfs_direct *direct, | 271 | static int nilfs_direct_propagate(const struct nilfs_bmap *bmap, |
314 | struct buffer_head *bh) | 272 | struct buffer_head *bh) |
315 | { | 273 | { |
316 | union nilfs_bmap_ptr_req oldreq, newreq; | 274 | struct nilfs_direct *direct = (struct nilfs_direct *)bmap; |
275 | struct nilfs_palloc_req oldreq, newreq; | ||
276 | struct inode *dat; | ||
317 | __u64 key; | 277 | __u64 key; |
318 | __u64 ptr; | 278 | __u64 ptr; |
319 | int ret; | 279 | int ret; |
320 | 280 | ||
321 | key = nilfs_bmap_data_get_key(&direct->d_bmap, bh); | 281 | if (!NILFS_BMAP_USE_VBN(bmap)) |
282 | return 0; | ||
283 | |||
284 | dat = nilfs_bmap_get_dat(bmap); | ||
285 | key = nilfs_bmap_data_get_key(bmap, bh); | ||
322 | ptr = nilfs_direct_get_ptr(direct, key); | 286 | ptr = nilfs_direct_get_ptr(direct, key); |
323 | if (!buffer_nilfs_volatile(bh)) { | 287 | if (!buffer_nilfs_volatile(bh)) { |
324 | oldreq.bpr_ptr = ptr; | 288 | oldreq.pr_entry_nr = ptr; |
325 | newreq.bpr_ptr = ptr; | 289 | newreq.pr_entry_nr = ptr; |
326 | ret = nilfs_bmap_prepare_update_v(&direct->d_bmap, &oldreq, | 290 | ret = nilfs_dat_prepare_update(dat, &oldreq, &newreq); |
327 | &newreq); | ||
328 | if (ret < 0) | 291 | if (ret < 0) |
329 | return ret; | 292 | return ret; |
330 | nilfs_bmap_commit_update_v(&direct->d_bmap, &oldreq, &newreq); | 293 | nilfs_dat_commit_update(dat, &oldreq, &newreq, |
294 | bmap->b_ptr_type == NILFS_BMAP_PTR_VS); | ||
331 | set_buffer_nilfs_volatile(bh); | 295 | set_buffer_nilfs_volatile(bh); |
332 | nilfs_direct_set_ptr(direct, key, newreq.bpr_ptr); | 296 | nilfs_direct_set_ptr(direct, key, newreq.pr_entry_nr); |
333 | } else | 297 | } else |
334 | ret = nilfs_bmap_mark_dirty(&direct->d_bmap, ptr); | 298 | ret = nilfs_dat_mark_dirty(dat, ptr); |
335 | 299 | ||
336 | return ret; | 300 | return ret; |
337 | } | 301 | } |
338 | 302 | ||
339 | static int nilfs_direct_propagate(const struct nilfs_bmap *bmap, | ||
340 | struct buffer_head *bh) | ||
341 | { | ||
342 | struct nilfs_direct *direct = (struct nilfs_direct *)bmap; | ||
343 | |||
344 | return NILFS_BMAP_USE_VBN(bmap) ? | ||
345 | nilfs_direct_propagate_v(direct, bh) : 0; | ||
346 | } | ||
347 | |||
348 | static int nilfs_direct_assign_v(struct nilfs_direct *direct, | 303 | static int nilfs_direct_assign_v(struct nilfs_direct *direct, |
349 | __u64 key, __u64 ptr, | 304 | __u64 key, __u64 ptr, |
350 | struct buffer_head **bh, | 305 | struct buffer_head **bh, |
351 | sector_t blocknr, | 306 | sector_t blocknr, |
352 | union nilfs_binfo *binfo) | 307 | union nilfs_binfo *binfo) |
353 | { | 308 | { |
309 | struct inode *dat = nilfs_bmap_get_dat(&direct->d_bmap); | ||
354 | union nilfs_bmap_ptr_req req; | 310 | union nilfs_bmap_ptr_req req; |
355 | int ret; | 311 | int ret; |
356 | 312 | ||
357 | req.bpr_ptr = ptr; | 313 | req.bpr_ptr = ptr; |
358 | ret = nilfs_bmap_start_v(&direct->d_bmap, &req, blocknr); | 314 | ret = nilfs_dat_prepare_start(dat, &req.bpr_req); |
359 | if (unlikely(ret < 0)) | 315 | if (!ret) { |
360 | return ret; | 316 | nilfs_dat_commit_start(dat, &req.bpr_req, blocknr); |
361 | 317 | binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr); | |
362 | binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr); | 318 | binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key); |
363 | binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key); | 319 | } |
364 | 320 | return ret; | |
365 | return 0; | ||
366 | } | 321 | } |
367 | 322 | ||
368 | static int nilfs_direct_assign_p(struct nilfs_direct *direct, | 323 | static int nilfs_direct_assign_p(struct nilfs_direct *direct, |
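The switch in nilfs_direct_propagate() from union nilfs_bmap_ptr_req to a bare struct nilfs_palloc_req works because the union is only a thin overlay; it has roughly the following shape (paraphrased from bmap.h — check the header for the authoritative definition), so code that now talks to the DAT directly can use the embedded request and its pr_entry_nr field without the wrapper:

    union nilfs_bmap_ptr_req {
            __u64 bpr_ptr;                  /* aliases pr_entry_nr below */
            struct nilfs_palloc_req bpr_req;
    };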
diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h index 5d30a35679b5..ecc3ba76db47 100644 --- a/fs/nilfs2/ifile.h +++ b/fs/nilfs2/ifile.h | |||
@@ -31,7 +31,6 @@ | |||
31 | #include "mdt.h" | 31 | #include "mdt.h" |
32 | #include "alloc.h" | 32 | #include "alloc.h" |
33 | 33 | ||
34 | #define NILFS_IFILE_GFP NILFS_MDT_GFP | ||
35 | 34 | ||
36 | static inline struct nilfs_inode * | 35 | static inline struct nilfs_inode * |
37 | nilfs_ifile_map_inode(struct inode *ifile, ino_t ino, struct buffer_head *ibh) | 36 | nilfs_ifile_map_inode(struct inode *ifile, ino_t ino, struct buffer_head *ibh) |
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index fe9d8f2a13f8..807e584b163d 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c | |||
@@ -430,7 +430,8 @@ static int __nilfs_read_inode(struct super_block *sb, unsigned long ino, | |||
430 | 430 | ||
431 | raw_inode = nilfs_ifile_map_inode(sbi->s_ifile, ino, bh); | 431 | raw_inode = nilfs_ifile_map_inode(sbi->s_ifile, ino, bh); |
432 | 432 | ||
433 | if (nilfs_read_inode_common(inode, raw_inode)) | 433 | err = nilfs_read_inode_common(inode, raw_inode); |
434 | if (err) | ||
434 | goto failed_unmap; | 435 | goto failed_unmap; |
435 | 436 | ||
436 | if (S_ISREG(inode->i_mode)) { | 437 | if (S_ISREG(inode->i_mode)) { |
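The __nilfs_read_inode() hunk fixes a common bug shape: testing a helper's return value in the if-condition, then jumping to an error label that returns a separate err variable, leaves err stale (often 0, i.e. success). Capturing the return value first makes the error path honest. In isolation, with a hypothetical helper:

    int err;

    err = setup_step();     /* any int-returning helper */
    if (err)
            goto failed;    /* "failed:" now returns the real error code,
                               not whatever err happened to hold before */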
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index 6ea5f872e2de..6572ea4bc4df 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c | |||
@@ -442,12 +442,6 @@ int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *nilfs, | |||
442 | const char *msg; | 442 | const char *msg; |
443 | int ret; | 443 | int ret; |
444 | 444 | ||
445 | ret = nilfs_ioctl_move_blocks(nilfs, &argv[0], kbufs[0]); | ||
446 | if (ret < 0) { | ||
447 | msg = "cannot read source blocks"; | ||
448 | goto failed; | ||
449 | } | ||
450 | |||
451 | ret = nilfs_ioctl_delete_checkpoints(nilfs, &argv[1], kbufs[1]); | 445 | ret = nilfs_ioctl_delete_checkpoints(nilfs, &argv[1], kbufs[1]); |
452 | if (ret < 0) { | 446 | if (ret < 0) { |
453 | /* | 447 | /* |
@@ -548,7 +542,25 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp, | |||
548 | } | 542 | } |
549 | } | 543 | } |
550 | 544 | ||
551 | ret = nilfs_clean_segments(inode->i_sb, argv, kbufs); | 545 | /* |
546 | * nilfs_ioctl_move_blocks() will call nilfs_gc_iget(), | ||
547 | * which operates on an inode list without blocking. | ||
548 | * To protect the list from concurrent operations, | ||
549 | * nilfs_ioctl_move_blocks should be an atomic operation. | ||
550 | */ | ||
551 | if (test_and_set_bit(THE_NILFS_GC_RUNNING, &nilfs->ns_flags)) { | ||
552 | ret = -EBUSY; | ||
553 | goto out_free; | ||
554 | } | ||
555 | |||
556 | ret = nilfs_ioctl_move_blocks(nilfs, &argv[0], kbufs[0]); | ||
557 | if (ret < 0) | ||
558 | printk(KERN_ERR "NILFS: GC failed during preparation: " | ||
559 | "cannot read source blocks: err=%d\n", ret); | ||
560 | else | ||
561 | ret = nilfs_clean_segments(inode->i_sb, argv, kbufs); | ||
562 | |||
563 | clear_nilfs_gc_running(nilfs); | ||
552 | 564 | ||
553 | out_free: | 565 | out_free: |
554 | while (--n >= 0) | 566 | while (--n >= 0) |
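The bit acts as a single-runner lock around GC preparation, since nilfs_gc_iget() walks the GC inode list without taking a blocking lock. A trimmed sketch of the guard (names from the patch; the EBUSY path in the real code jumps to out_free rather than returning directly):

    if (test_and_set_bit(THE_NILFS_GC_RUNNING, &nilfs->ns_flags))
            return -EBUSY;          /* another cleaner owns the GC inode list */

    ret = nilfs_ioctl_move_blocks(nilfs, &argv[0], kbufs[0]);
    if (ret >= 0)
            ret = nilfs_clean_segments(inode->i_sb, argv, kbufs);

    clear_nilfs_gc_running(nilfs);  /* dropped on every path, even on error */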
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index 3d3ddb3f5177..156bf6091a96 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c | |||
@@ -103,15 +103,12 @@ static int nilfs_mdt_create_block(struct inode *inode, unsigned long block, | |||
103 | goto failed_unlock; | 103 | goto failed_unlock; |
104 | 104 | ||
105 | err = -EEXIST; | 105 | err = -EEXIST; |
106 | if (buffer_uptodate(bh) || buffer_mapped(bh)) | 106 | if (buffer_uptodate(bh)) |
107 | goto failed_bh; | 107 | goto failed_bh; |
108 | #if 0 | 108 | |
109 | /* The uptodate flag is not protected by the page lock, but | ||
110 | the mapped flag is. Thus, we don't have to wait the buffer. */ | ||
111 | wait_on_buffer(bh); | 109 | wait_on_buffer(bh); |
112 | if (buffer_uptodate(bh)) | 110 | if (buffer_uptodate(bh)) |
113 | goto failed_bh; | 111 | goto failed_bh; |
114 | #endif | ||
115 | 112 | ||
116 | bh->b_bdev = nilfs->ns_bdev; | 113 | bh->b_bdev = nilfs->ns_bdev; |
117 | err = nilfs_mdt_insert_new_block(inode, block, bh, init_block); | 114 | err = nilfs_mdt_insert_new_block(inode, block, bh, init_block); |
@@ -139,7 +136,7 @@ nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff, | |||
139 | int mode, struct buffer_head **out_bh) | 136 | int mode, struct buffer_head **out_bh) |
140 | { | 137 | { |
141 | struct buffer_head *bh; | 138 | struct buffer_head *bh; |
142 | unsigned long blknum = 0; | 139 | __u64 blknum = 0; |
143 | int ret = -ENOMEM; | 140 | int ret = -ENOMEM; |
144 | 141 | ||
145 | bh = nilfs_grab_buffer(inode, inode->i_mapping, blkoff, 0); | 142 | bh = nilfs_grab_buffer(inode, inode->i_mapping, blkoff, 0); |
@@ -162,17 +159,15 @@ nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff, | |||
162 | unlock_buffer(bh); | 159 | unlock_buffer(bh); |
163 | goto out; | 160 | goto out; |
164 | } | 161 | } |
165 | if (!buffer_mapped(bh)) { /* unused buffer */ | 162 | |
166 | ret = nilfs_bmap_lookup(NILFS_I(inode)->i_bmap, blkoff, | 163 | ret = nilfs_bmap_lookup(NILFS_I(inode)->i_bmap, blkoff, &blknum); |
167 | &blknum); | 164 | if (unlikely(ret)) { |
168 | if (unlikely(ret)) { | 165 | unlock_buffer(bh); |
169 | unlock_buffer(bh); | 166 | goto failed_bh; |
170 | goto failed_bh; | ||
171 | } | ||
172 | bh->b_bdev = NILFS_MDT(inode)->mi_nilfs->ns_bdev; | ||
173 | bh->b_blocknr = blknum; | ||
174 | set_buffer_mapped(bh); | ||
175 | } | 167 | } |
168 | bh->b_bdev = NILFS_MDT(inode)->mi_nilfs->ns_bdev; | ||
169 | bh->b_blocknr = (sector_t)blknum; | ||
170 | set_buffer_mapped(bh); | ||
176 | 171 | ||
177 | bh->b_end_io = end_buffer_read_sync; | 172 | bh->b_end_io = end_buffer_read_sync; |
178 | get_bh(bh); | 173 | get_bh(bh); |
@@ -402,6 +397,7 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc) | |||
402 | struct inode *inode = container_of(page->mapping, | 397 | struct inode *inode = container_of(page->mapping, |
403 | struct inode, i_data); | 398 | struct inode, i_data); |
404 | struct super_block *sb = inode->i_sb; | 399 | struct super_block *sb = inode->i_sb; |
400 | struct the_nilfs *nilfs = NILFS_MDT(inode)->mi_nilfs; | ||
405 | struct nilfs_sb_info *writer = NULL; | 401 | struct nilfs_sb_info *writer = NULL; |
406 | int err = 0; | 402 | int err = 0; |
407 | 403 | ||
@@ -411,9 +407,12 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc) | |||
411 | if (page->mapping->assoc_mapping) | 407 | if (page->mapping->assoc_mapping) |
412 | return 0; /* Do not request flush for shadow page cache */ | 408 | return 0; /* Do not request flush for shadow page cache */ |
413 | if (!sb) { | 409 | if (!sb) { |
414 | writer = nilfs_get_writer(NILFS_MDT(inode)->mi_nilfs); | 410 | down_read(&nilfs->ns_writer_sem); |
415 | if (!writer) | 411 | writer = nilfs->ns_writer; |
412 | if (!writer) { | ||
413 | up_read(&nilfs->ns_writer_sem); | ||
416 | return -EROFS; | 414 | return -EROFS; |
415 | } | ||
417 | sb = writer->s_super; | 416 | sb = writer->s_super; |
418 | } | 417 | } |
419 | 418 | ||
@@ -423,7 +422,7 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc) | |||
423 | nilfs_flush_segment(sb, inode->i_ino); | 422 | nilfs_flush_segment(sb, inode->i_ino); |
424 | 423 | ||
425 | if (writer) | 424 | if (writer) |
426 | nilfs_put_writer(NILFS_MDT(inode)->mi_nilfs); | 425 | up_read(&nilfs->ns_writer_sem); |
427 | return err; | 426 | return err; |
428 | } | 427 | } |
429 | 428 | ||
@@ -514,9 +513,10 @@ nilfs_mdt_new_common(struct the_nilfs *nilfs, struct super_block *sb, | |||
514 | } | 513 | } |
515 | 514 | ||
516 | struct inode *nilfs_mdt_new(struct the_nilfs *nilfs, struct super_block *sb, | 515 | struct inode *nilfs_mdt_new(struct the_nilfs *nilfs, struct super_block *sb, |
517 | ino_t ino, gfp_t gfp_mask) | 516 | ino_t ino) |
518 | { | 517 | { |
519 | struct inode *inode = nilfs_mdt_new_common(nilfs, sb, ino, gfp_mask); | 518 | struct inode *inode = nilfs_mdt_new_common(nilfs, sb, ino, |
519 | NILFS_MDT_GFP); | ||
520 | 520 | ||
521 | if (!inode) | 521 | if (!inode) |
522 | return NULL; | 522 | return NULL; |
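With the gfp_mask parameter gone from nilfs_mdt_new(), every metadata file is created with the common NILFS_MDT_GFP mask, which is why the per-file NILFS_CPFILE_GFP, NILFS_DAT_GFP and NILFS_IFILE_GFP defines are deleted elsewhere in this patch. Creation sites shrink accordingly (sketch; call-site code paraphrased, not quoted from the patch):

    /* before */  dat = nilfs_mdt_new(nilfs, sb, NILFS_DAT_INO, NILFS_DAT_GFP);
    /* after  */  dat = nilfs_mdt_new(nilfs, sb, NILFS_DAT_INO);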
diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h index df683e0bca6a..431599733c9b 100644 --- a/fs/nilfs2/mdt.h +++ b/fs/nilfs2/mdt.h | |||
@@ -74,8 +74,7 @@ int nilfs_mdt_forget_block(struct inode *, unsigned long); | |||
74 | int nilfs_mdt_mark_block_dirty(struct inode *, unsigned long); | 74 | int nilfs_mdt_mark_block_dirty(struct inode *, unsigned long); |
75 | int nilfs_mdt_fetch_dirty(struct inode *); | 75 | int nilfs_mdt_fetch_dirty(struct inode *); |
76 | 76 | ||
77 | struct inode *nilfs_mdt_new(struct the_nilfs *, struct super_block *, ino_t, | 77 | struct inode *nilfs_mdt_new(struct the_nilfs *, struct super_block *, ino_t); |
78 | gfp_t); | ||
79 | struct inode *nilfs_mdt_new_common(struct the_nilfs *, struct super_block *, | 78 | struct inode *nilfs_mdt_new_common(struct the_nilfs *, struct super_block *, |
80 | ino_t, gfp_t); | 79 | ino_t, gfp_t); |
81 | void nilfs_mdt_destroy(struct inode *); | 80 | void nilfs_mdt_destroy(struct inode *); |
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c index d80cc71be749..6dc83591d118 100644 --- a/fs/nilfs2/recovery.c +++ b/fs/nilfs2/recovery.c | |||
@@ -552,7 +552,8 @@ static int recover_dsync_blocks(struct nilfs_sb_info *sbi, | |||
552 | printk(KERN_WARNING | 552 | printk(KERN_WARNING |
553 | "NILFS warning: error recovering data block " | 553 | "NILFS warning: error recovering data block " |
554 | "(err=%d, ino=%lu, block-offset=%llu)\n", | 554 | "(err=%d, ino=%lu, block-offset=%llu)\n", |
555 | err, rb->ino, (unsigned long long)rb->blkoff); | 555 | err, (unsigned long)rb->ino, |
556 | (unsigned long long)rb->blkoff); | ||
556 | if (!err2) | 557 | if (!err2) |
557 | err2 = err; | 558 | err2 = err; |
558 | next: | 559 | next: |
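The recovery.c change is the standard printk portability idiom: ino_t does not have the same width on every architecture, so it is cast to a type that matches the format specifier exactly. The same idiom in a tiny standalone form (function name and message are illustrative):

#include <stdio.h>
#include <sys/types.h>

/* ino_t may be 32 or 64 bits depending on the platform; casting to a
 * fixed type keeps the format string correct everywhere. */
static void report_block_error(int err, ino_t ino, unsigned long long blkoff)
{
	printf("error recovering data block (err=%d, ino=%lu, "
	       "block-offset=%llu)\n",
	       err, (unsigned long)ino, blkoff);
}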
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index 9e3fe17bb96b..e6d9e37fa241 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c | |||
@@ -316,10 +316,10 @@ static struct bio *nilfs_alloc_seg_bio(struct super_block *sb, sector_t start, | |||
316 | { | 316 | { |
317 | struct bio *bio; | 317 | struct bio *bio; |
318 | 318 | ||
319 | bio = bio_alloc(GFP_NOWAIT, nr_vecs); | 319 | bio = bio_alloc(GFP_NOIO, nr_vecs); |
320 | if (bio == NULL) { | 320 | if (bio == NULL) { |
321 | while (!bio && (nr_vecs >>= 1)) | 321 | while (!bio && (nr_vecs >>= 1)) |
322 | bio = bio_alloc(GFP_NOWAIT, nr_vecs); | 322 | bio = bio_alloc(GFP_NOIO, nr_vecs); |
323 | } | 323 | } |
324 | if (likely(bio)) { | 324 | if (likely(bio)) { |
325 | bio->bi_bdev = sb->s_bdev; | 325 | bio->bi_bdev = sb->s_bdev; |
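GFP_NOWAIT becomes GFP_NOIO in the segment-buffer path: this code runs during writeback, where sleeping is acceptable but recursing into the block layer for reclaim is not, and GFP_NOIO permits the former while forbidding the latter. The surrounding fallback, which halves the vector count until some bio can be allocated, is worth reading as one unit (same calls as the patch):

	struct bio *bio = bio_alloc(GFP_NOIO, nr_vecs);

	/* Even a sleeping allocation can fail; retry with progressively
	 * smaller bios instead of failing the segment write outright. */
	while (!bio && (nr_vecs >>= 1))
		bio = bio_alloc(GFP_NOIO, nr_vecs);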
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 8b5e4778cf28..683df89dbae5 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c | |||
@@ -1859,12 +1859,26 @@ static void nilfs_end_page_io(struct page *page, int err) | |||
1859 | if (!page) | 1859 | if (!page) |
1860 | return; | 1860 | return; |
1861 | 1861 | ||
1862 | if (buffer_nilfs_node(page_buffers(page)) && !PageWriteback(page)) | 1862 | if (buffer_nilfs_node(page_buffers(page)) && !PageWriteback(page)) { |
1863 | /* | 1863 | /* |
1864 | * For b-tree node pages, this function may be called twice | 1864 | * For b-tree node pages, this function may be called twice |
1865 | * or more because they might be split in a segment. | 1865 | * or more because they might be split in a segment. |
1866 | */ | 1866 | */ |
1867 | if (PageDirty(page)) { | ||
1868 | /* | ||
1869 | * For pages holding split b-tree node buffers, dirty | ||
1870 | * flag on the buffers may be cleared discretely. | ||
1871 | * In that case, the page is once redirtied for | ||
1872 | * remaining buffers, and it must be cancelled if | ||
1873 | * all the buffers get cleaned later. | ||
1874 | */ | ||
1875 | lock_page(page); | ||
1876 | if (nilfs_page_buffers_clean(page)) | ||
1877 | __nilfs_clear_page_dirty(page); | ||
1878 | unlock_page(page); | ||
1879 | } | ||
1867 | return; | 1880 | return; |
1881 | } | ||
1868 | 1882 | ||
1869 | __nilfs_end_page_io(page, err); | 1883 | __nilfs_end_page_io(page, err); |
1870 | } | 1884 | } |
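The new branch covers b-tree node pages whose buffers are cleaned piecemeal across segments: the page gets redirtied while some buffers are still dirty, and that page-level dirty flag must be cancelled once the last buffer is cleaned, or the page would be written back again needlessly. Condensed, the cancellation is:

	if (PageDirty(page)) {
		lock_page(page);
		if (nilfs_page_buffers_clean(page))	/* every buffer clean? */
			__nilfs_clear_page_dirty(page);
		unlock_page(page);
	}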
@@ -2487,7 +2501,8 @@ static int nilfs_segctor_construct(struct nilfs_sc_info *sci, | |||
2487 | if (test_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags) && | 2501 | if (test_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags) && |
2488 | nilfs_discontinued(nilfs)) { | 2502 | nilfs_discontinued(nilfs)) { |
2489 | down_write(&nilfs->ns_sem); | 2503 | down_write(&nilfs->ns_sem); |
2490 | req->sb_err = nilfs_commit_super(sbi, 0); | 2504 | req->sb_err = nilfs_commit_super(sbi, |
2505 | nilfs_altsb_need_update(nilfs)); | ||
2491 | up_write(&nilfs->ns_sem); | 2506 | up_write(&nilfs->ns_sem); |
2492 | } | 2507 | } |
2493 | } | 2508 | } |
@@ -2675,6 +2690,7 @@ static int nilfs_segctor_thread(void *arg) | |||
2675 | } else { | 2690 | } else { |
2676 | DEFINE_WAIT(wait); | 2691 | DEFINE_WAIT(wait); |
2677 | int should_sleep = 1; | 2692 | int should_sleep = 1; |
2693 | struct the_nilfs *nilfs; | ||
2678 | 2694 | ||
2679 | prepare_to_wait(&sci->sc_wait_daemon, &wait, | 2695 | prepare_to_wait(&sci->sc_wait_daemon, &wait, |
2680 | TASK_INTERRUPTIBLE); | 2696 | TASK_INTERRUPTIBLE); |
@@ -2695,6 +2711,9 @@ static int nilfs_segctor_thread(void *arg) | |||
2695 | finish_wait(&sci->sc_wait_daemon, &wait); | 2711 | finish_wait(&sci->sc_wait_daemon, &wait); |
2696 | timeout = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) && | 2712 | timeout = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) && |
2697 | time_after_eq(jiffies, sci->sc_timer->expires)); | 2713 | time_after_eq(jiffies, sci->sc_timer->expires)); |
2714 | nilfs = sci->sc_sbi->s_nilfs; | ||
2715 | if (sci->sc_super->s_dirt && nilfs_sb_need_update(nilfs)) | ||
2716 | set_nilfs_discontinued(nilfs); | ||
2698 | } | 2717 | } |
2699 | goto loop; | 2718 | goto loop; |
2700 | 2719 | ||
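With write_super gone (see the super.c hunks below), the segment-constructor thread takes over the periodic staleness check: if the superblock is dirty and its on-disk copy has outlived NILFS_SB_FREQ, the "discontinued" flag is raised so the next construction writes a super root and recommits the superblock. The added check, annotated:

	nilfs = sci->sc_sbi->s_nilfs;
	if (sci->sc_super->s_dirt && nilfs_sb_need_update(nilfs))
		set_nilfs_discontinued(nilfs);	/* force a superblock commit */

nilfs_sb_need_update() is the helper introduced in the the_nilfs.h hunk further down.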
diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h index a2c4d76c3366..0e99e5c0bd0f 100644 --- a/fs/nilfs2/sufile.h +++ b/fs/nilfs2/sufile.h | |||
@@ -28,7 +28,6 @@ | |||
28 | #include <linux/nilfs2_fs.h> | 28 | #include <linux/nilfs2_fs.h> |
29 | #include "mdt.h" | 29 | #include "mdt.h" |
30 | 30 | ||
31 | #define NILFS_SUFILE_GFP NILFS_MDT_GFP | ||
32 | 31 | ||
33 | static inline unsigned long nilfs_sufile_get_nsegments(struct inode *sufile) | 32 | static inline unsigned long nilfs_sufile_get_nsegments(struct inode *sufile) |
34 | { | 33 | { |
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 8e2ec43b18f4..55f3d6b60732 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c | |||
@@ -50,6 +50,8 @@ | |||
50 | #include <linux/writeback.h> | 50 | #include <linux/writeback.h> |
51 | #include <linux/kobject.h> | 51 | #include <linux/kobject.h> |
52 | #include <linux/exportfs.h> | 52 | #include <linux/exportfs.h> |
53 | #include <linux/seq_file.h> | ||
54 | #include <linux/mount.h> | ||
53 | #include "nilfs.h" | 55 | #include "nilfs.h" |
54 | #include "mdt.h" | 56 | #include "mdt.h" |
55 | #include "alloc.h" | 57 | #include "alloc.h" |
@@ -65,7 +67,6 @@ MODULE_DESCRIPTION("A New Implementation of the Log-structured Filesystem " | |||
65 | "(NILFS)"); | 67 | "(NILFS)"); |
66 | MODULE_LICENSE("GPL"); | 68 | MODULE_LICENSE("GPL"); |
67 | 69 | ||
68 | static void nilfs_write_super(struct super_block *sb); | ||
69 | static int nilfs_remount(struct super_block *sb, int *flags, char *data); | 70 | static int nilfs_remount(struct super_block *sb, int *flags, char *data); |
70 | 71 | ||
71 | /** | 72 | /** |
@@ -311,9 +312,6 @@ static void nilfs_put_super(struct super_block *sb) | |||
311 | 312 | ||
312 | lock_kernel(); | 313 | lock_kernel(); |
313 | 314 | ||
314 | if (sb->s_dirt) | ||
315 | nilfs_write_super(sb); | ||
316 | |||
317 | nilfs_detach_segment_constructor(sbi); | 315 | nilfs_detach_segment_constructor(sbi); |
318 | 316 | ||
319 | if (!(sb->s_flags & MS_RDONLY)) { | 317 | if (!(sb->s_flags & MS_RDONLY)) { |
@@ -336,63 +334,21 @@ static void nilfs_put_super(struct super_block *sb) | |||
336 | unlock_kernel(); | 334 | unlock_kernel(); |
337 | } | 335 | } |
338 | 336 | ||
339 | /** | 337 | static int nilfs_sync_fs(struct super_block *sb, int wait) |
340 | * nilfs_write_super - write super block(s) of NILFS | ||
341 | * @sb: super_block | ||
342 | * | ||
343 | * nilfs_write_super() gets a fs-dependent lock, writes super block(s), and | ||
344 | * clears s_dirt. This function is called in the section protected by | ||
345 | * lock_super(). | ||
346 | * | ||
347 | * The s_dirt flag is managed by each filesystem and we protect it by ns_sem | ||
348 | * of the struct the_nilfs. Lock order must be as follows: | ||
349 | * | ||
350 | * 1. lock_super() | ||
351 | * 2. down_write(&nilfs->ns_sem) | ||
352 | * | ||
353 | * Inside NILFS, locking ns_sem is enough to protect s_dirt and the buffer | ||
354 | * of the super block (nilfs->ns_sbp[]). | ||
355 | * | ||
356 | * In most cases, VFS functions call lock_super() before calling these | ||
357 | * methods. So we must be careful not to bring on deadlocks when using | ||
358 | * lock_super(); see generic_shutdown_super(), write_super(), and so on. | ||
359 | * | ||
360 | * Note that order of lock_kernel() and lock_super() depends on contexts | ||
361 | * of VFS. We should also note that lock_kernel() can be used in its | ||
362 | * protective section and only the outermost one has an effect. | ||
363 | */ | ||
364 | static void nilfs_write_super(struct super_block *sb) | ||
365 | { | 338 | { |
366 | struct nilfs_sb_info *sbi = NILFS_SB(sb); | 339 | struct nilfs_sb_info *sbi = NILFS_SB(sb); |
367 | struct the_nilfs *nilfs = sbi->s_nilfs; | 340 | struct the_nilfs *nilfs = sbi->s_nilfs; |
368 | |||
369 | down_write(&nilfs->ns_sem); | ||
370 | if (!(sb->s_flags & MS_RDONLY)) { | ||
371 | struct nilfs_super_block **sbp = nilfs->ns_sbp; | ||
372 | u64 t = get_seconds(); | ||
373 | int dupsb; | ||
374 | |||
375 | if (!nilfs_discontinued(nilfs) && t >= nilfs->ns_sbwtime[0] && | ||
376 | t < nilfs->ns_sbwtime[0] + NILFS_SB_FREQ) { | ||
377 | up_write(&nilfs->ns_sem); | ||
378 | return; | ||
379 | } | ||
380 | dupsb = sbp[1] && t > nilfs->ns_sbwtime[1] + NILFS_ALTSB_FREQ; | ||
381 | nilfs_commit_super(sbi, dupsb); | ||
382 | } | ||
383 | sb->s_dirt = 0; | ||
384 | up_write(&nilfs->ns_sem); | ||
385 | } | ||
386 | |||
387 | static int nilfs_sync_fs(struct super_block *sb, int wait) | ||
388 | { | ||
389 | int err = 0; | 341 | int err = 0; |
390 | 342 | ||
391 | nilfs_write_super(sb); | ||
392 | |||
393 | /* This function is called when super block should be written back */ | 343 | /* This function is called when super block should be written back */ |
394 | if (wait) | 344 | if (wait) |
395 | err = nilfs_construct_segment(sb); | 345 | err = nilfs_construct_segment(sb); |
346 | |||
347 | down_write(&nilfs->ns_sem); | ||
348 | if (sb->s_dirt) | ||
349 | nilfs_commit_super(sbi, 1); | ||
350 | up_write(&nilfs->ns_sem); | ||
351 | |||
396 | return err; | 352 | return err; |
397 | } | 353 | } |
398 | 354 | ||
@@ -407,8 +363,7 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno) | |||
407 | list_add(&sbi->s_list, &nilfs->ns_supers); | 363 | list_add(&sbi->s_list, &nilfs->ns_supers); |
408 | up_write(&nilfs->ns_super_sem); | 364 | up_write(&nilfs->ns_super_sem); |
409 | 365 | ||
410 | sbi->s_ifile = nilfs_mdt_new( | 366 | sbi->s_ifile = nilfs_mdt_new(nilfs, sbi->s_super, NILFS_IFILE_INO); |
411 | nilfs, sbi->s_super, NILFS_IFILE_INO, NILFS_IFILE_GFP); | ||
412 | if (!sbi->s_ifile) | 367 | if (!sbi->s_ifile) |
413 | return -ENOMEM; | 368 | return -ENOMEM; |
414 | 369 | ||
@@ -416,8 +371,10 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno) | |||
416 | if (unlikely(err)) | 371 | if (unlikely(err)) |
417 | goto failed; | 372 | goto failed; |
418 | 373 | ||
374 | down_read(&nilfs->ns_segctor_sem); | ||
419 | err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, cno, 0, &raw_cp, | 375 | err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, cno, 0, &raw_cp, |
420 | &bh_cp); | 376 | &bh_cp); |
377 | up_read(&nilfs->ns_segctor_sem); | ||
421 | if (unlikely(err)) { | 378 | if (unlikely(err)) { |
422 | if (err == -ENOENT || err == -EINVAL) { | 379 | if (err == -ENOENT || err == -EINVAL) { |
423 | printk(KERN_ERR | 380 | printk(KERN_ERR |
@@ -527,6 +484,26 @@ static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf) | |||
527 | return 0; | 484 | return 0; |
528 | } | 485 | } |
529 | 486 | ||
487 | static int nilfs_show_options(struct seq_file *seq, struct vfsmount *vfs) | ||
488 | { | ||
489 | struct super_block *sb = vfs->mnt_sb; | ||
490 | struct nilfs_sb_info *sbi = NILFS_SB(sb); | ||
491 | |||
492 | if (!nilfs_test_opt(sbi, BARRIER)) | ||
493 | seq_printf(seq, ",barrier=off"); | ||
494 | if (nilfs_test_opt(sbi, SNAPSHOT)) | ||
495 | seq_printf(seq, ",cp=%llu", | ||
496 | (unsigned long long int)sbi->s_snapshot_cno); | ||
497 | if (nilfs_test_opt(sbi, ERRORS_RO)) | ||
498 | seq_printf(seq, ",errors=remount-ro"); | ||
499 | if (nilfs_test_opt(sbi, ERRORS_PANIC)) | ||
500 | seq_printf(seq, ",errors=panic"); | ||
501 | if (nilfs_test_opt(sbi, STRICT_ORDER)) | ||
502 | seq_printf(seq, ",order=strict"); | ||
503 | |||
504 | return 0; | ||
505 | } | ||
506 | |||
530 | static struct super_operations nilfs_sops = { | 507 | static struct super_operations nilfs_sops = { |
531 | .alloc_inode = nilfs_alloc_inode, | 508 | .alloc_inode = nilfs_alloc_inode, |
532 | .destroy_inode = nilfs_destroy_inode, | 509 | .destroy_inode = nilfs_destroy_inode, |
@@ -536,7 +513,7 @@ static struct super_operations nilfs_sops = { | |||
536 | /* .drop_inode = nilfs_drop_inode, */ | 513 | /* .drop_inode = nilfs_drop_inode, */ |
537 | .delete_inode = nilfs_delete_inode, | 514 | .delete_inode = nilfs_delete_inode, |
538 | .put_super = nilfs_put_super, | 515 | .put_super = nilfs_put_super, |
539 | .write_super = nilfs_write_super, | 516 | /* .write_super = nilfs_write_super, */ |
540 | .sync_fs = nilfs_sync_fs, | 517 | .sync_fs = nilfs_sync_fs, |
541 | /* .write_super_lockfs */ | 518 | /* .write_super_lockfs */ |
542 | /* .unlockfs */ | 519 | /* .unlockfs */ |
@@ -544,7 +521,7 @@ static struct super_operations nilfs_sops = { | |||
544 | .remount_fs = nilfs_remount, | 521 | .remount_fs = nilfs_remount, |
545 | .clear_inode = nilfs_clear_inode, | 522 | .clear_inode = nilfs_clear_inode, |
546 | /* .umount_begin */ | 523 | /* .umount_begin */ |
547 | /* .show_options */ | 524 | .show_options = nilfs_show_options |
548 | }; | 525 | }; |
549 | 526 | ||
550 | static struct inode * | 527 | static struct inode * |
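With .show_options wired into nilfs_sops, the mount options reported by nilfs_show_options() survive into /proc/mounts instead of being silently dropped. For a hypothetical read-only snapshot mount with barriers disabled (device, mountpoint and checkpoint number are made up), the entry would read roughly:

	/dev/sdb1 /mnt/snap nilfs2 ro,barrier=off,cp=12 0 0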
@@ -814,10 +791,15 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent, | |||
814 | 791 | ||
815 | if (sb->s_flags & MS_RDONLY) { | 792 | if (sb->s_flags & MS_RDONLY) { |
816 | if (nilfs_test_opt(sbi, SNAPSHOT)) { | 793 | if (nilfs_test_opt(sbi, SNAPSHOT)) { |
794 | down_read(&nilfs->ns_segctor_sem); | ||
817 | err = nilfs_cpfile_is_snapshot(nilfs->ns_cpfile, | 795 | err = nilfs_cpfile_is_snapshot(nilfs->ns_cpfile, |
818 | sbi->s_snapshot_cno); | 796 | sbi->s_snapshot_cno); |
819 | if (err < 0) | 797 | up_read(&nilfs->ns_segctor_sem); |
798 | if (err < 0) { | ||
799 | if (err == -ENOENT) | ||
800 | err = -EINVAL; | ||
820 | goto failed_sbi; | 801 | goto failed_sbi; |
802 | } | ||
821 | if (!err) { | 803 | if (!err) { |
822 | printk(KERN_ERR | 804 | printk(KERN_ERR |
823 | "NILFS: The specified checkpoint is " | 805 | "NILFS: The specified checkpoint is " |
@@ -1125,10 +1107,6 @@ nilfs_get_sb(struct file_system_type *fs_type, int flags, | |||
1125 | */ | 1107 | */ |
1126 | sd.sbi = nilfs_find_sbinfo(nilfs, !(flags & MS_RDONLY), sd.cno); | 1108 | sd.sbi = nilfs_find_sbinfo(nilfs, !(flags & MS_RDONLY), sd.cno); |
1127 | 1109 | ||
1128 | if (!sd.cno) | ||
1129 | /* trying to get the latest checkpoint. */ | ||
1130 | sd.cno = nilfs_last_cno(nilfs); | ||
1131 | |||
1132 | /* | 1110 | /* |
1133 | * Get super block instance holding the nilfs_sb_info struct. | 1111 | * Get super block instance holding the nilfs_sb_info struct. |
1134 | * A new instance is allocated if no existing mount is present or | 1112 | * A new instance is allocated if no existing mount is present or |
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 8b8889825716..ad391a8c3e7e 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c | |||
@@ -68,12 +68,11 @@ static struct the_nilfs *alloc_nilfs(struct block_device *bdev) | |||
68 | 68 | ||
69 | nilfs->ns_bdev = bdev; | 69 | nilfs->ns_bdev = bdev; |
70 | atomic_set(&nilfs->ns_count, 1); | 70 | atomic_set(&nilfs->ns_count, 1); |
71 | atomic_set(&nilfs->ns_writer_refcount, -1); | ||
72 | atomic_set(&nilfs->ns_ndirtyblks, 0); | 71 | atomic_set(&nilfs->ns_ndirtyblks, 0); |
73 | init_rwsem(&nilfs->ns_sem); | 72 | init_rwsem(&nilfs->ns_sem); |
74 | init_rwsem(&nilfs->ns_super_sem); | 73 | init_rwsem(&nilfs->ns_super_sem); |
75 | mutex_init(&nilfs->ns_mount_mutex); | 74 | mutex_init(&nilfs->ns_mount_mutex); |
76 | mutex_init(&nilfs->ns_writer_mutex); | 75 | init_rwsem(&nilfs->ns_writer_sem); |
77 | INIT_LIST_HEAD(&nilfs->ns_list); | 76 | INIT_LIST_HEAD(&nilfs->ns_list); |
78 | INIT_LIST_HEAD(&nilfs->ns_supers); | 77 | INIT_LIST_HEAD(&nilfs->ns_supers); |
79 | spin_lock_init(&nilfs->ns_last_segment_lock); | 78 | spin_lock_init(&nilfs->ns_last_segment_lock); |
@@ -188,23 +187,19 @@ static int nilfs_load_super_root(struct the_nilfs *nilfs, | |||
188 | inode_size = nilfs->ns_inode_size; | 187 | inode_size = nilfs->ns_inode_size; |
189 | 188 | ||
190 | err = -ENOMEM; | 189 | err = -ENOMEM; |
191 | nilfs->ns_dat = nilfs_mdt_new( | 190 | nilfs->ns_dat = nilfs_mdt_new(nilfs, NULL, NILFS_DAT_INO); |
192 | nilfs, NULL, NILFS_DAT_INO, NILFS_DAT_GFP); | ||
193 | if (unlikely(!nilfs->ns_dat)) | 191 | if (unlikely(!nilfs->ns_dat)) |
194 | goto failed; | 192 | goto failed; |
195 | 193 | ||
196 | nilfs->ns_gc_dat = nilfs_mdt_new( | 194 | nilfs->ns_gc_dat = nilfs_mdt_new(nilfs, NULL, NILFS_DAT_INO); |
197 | nilfs, NULL, NILFS_DAT_INO, NILFS_DAT_GFP); | ||
198 | if (unlikely(!nilfs->ns_gc_dat)) | 195 | if (unlikely(!nilfs->ns_gc_dat)) |
199 | goto failed_dat; | 196 | goto failed_dat; |
200 | 197 | ||
201 | nilfs->ns_cpfile = nilfs_mdt_new( | 198 | nilfs->ns_cpfile = nilfs_mdt_new(nilfs, NULL, NILFS_CPFILE_INO); |
202 | nilfs, NULL, NILFS_CPFILE_INO, NILFS_CPFILE_GFP); | ||
203 | if (unlikely(!nilfs->ns_cpfile)) | 199 | if (unlikely(!nilfs->ns_cpfile)) |
204 | goto failed_gc_dat; | 200 | goto failed_gc_dat; |
205 | 201 | ||
206 | nilfs->ns_sufile = nilfs_mdt_new( | 202 | nilfs->ns_sufile = nilfs_mdt_new(nilfs, NULL, NILFS_SUFILE_INO); |
207 | nilfs, NULL, NILFS_SUFILE_INO, NILFS_SUFILE_GFP); | ||
208 | if (unlikely(!nilfs->ns_sufile)) | 203 | if (unlikely(!nilfs->ns_sufile)) |
209 | goto failed_cpfile; | 204 | goto failed_cpfile; |
210 | 205 | ||
@@ -596,9 +591,7 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data) | |||
596 | 591 | ||
597 | nilfs->ns_mount_state = le16_to_cpu(sbp->s_state); | 592 | nilfs->ns_mount_state = le16_to_cpu(sbp->s_state); |
598 | 593 | ||
599 | bdi = nilfs->ns_bdev->bd_inode_backing_dev_info; | 594 | bdi = nilfs->ns_bdev->bd_inode->i_mapping->backing_dev_info; |
600 | if (!bdi) | ||
601 | bdi = nilfs->ns_bdev->bd_inode->i_mapping->backing_dev_info; | ||
602 | nilfs->ns_bdi = bdi ? : &default_backing_dev_info; | 595 | nilfs->ns_bdi = bdi ? : &default_backing_dev_info; |
603 | 596 | ||
604 | /* Finding last segment */ | 597 | /* Finding last segment */ |
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h index e8adbffc626f..20abd55881e0 100644 --- a/fs/nilfs2/the_nilfs.h +++ b/fs/nilfs2/the_nilfs.h | |||
@@ -37,6 +37,7 @@ enum { | |||
37 | THE_NILFS_LOADED, /* Roll-back/roll-forward has done and | 37 | THE_NILFS_LOADED, /* Roll-back/roll-forward has done and |
38 | the latest checkpoint was loaded */ | 38 | the latest checkpoint was loaded */ |
39 | THE_NILFS_DISCONTINUED, /* 'next' pointer chain has broken */ | 39 | THE_NILFS_DISCONTINUED, /* 'next' pointer chain has broken */ |
40 | THE_NILFS_GC_RUNNING, /* gc process is running */ | ||
40 | }; | 41 | }; |
41 | 42 | ||
42 | /** | 43 | /** |
@@ -50,8 +51,7 @@ enum { | |||
50 | * @ns_sem: semaphore for shared states | 51 | * @ns_sem: semaphore for shared states |
51 | * @ns_super_sem: semaphore for global operations across super block instances | 52 | * @ns_super_sem: semaphore for global operations across super block instances |
52 | * @ns_mount_mutex: mutex protecting mount process of nilfs | 53 | * @ns_mount_mutex: mutex protecting mount process of nilfs |
53 | * @ns_writer_mutex: mutex protecting ns_writer attach/detach | 54 | * @ns_writer_sem: semaphore protecting ns_writer attach/detach |
54 | * @ns_writer_refcount: number of referrers on ns_writer | ||
55 | * @ns_current: back pointer to current mount | 55 | * @ns_current: back pointer to current mount |
56 | * @ns_sbh: buffer heads of on-disk super blocks | 56 | * @ns_sbh: buffer heads of on-disk super blocks |
57 | * @ns_sbp: pointers to super block data | 57 | * @ns_sbp: pointers to super block data |
@@ -100,8 +100,7 @@ struct the_nilfs { | |||
100 | struct rw_semaphore ns_sem; | 100 | struct rw_semaphore ns_sem; |
101 | struct rw_semaphore ns_super_sem; | 101 | struct rw_semaphore ns_super_sem; |
102 | struct mutex ns_mount_mutex; | 102 | struct mutex ns_mount_mutex; |
103 | struct mutex ns_writer_mutex; | 103 | struct rw_semaphore ns_writer_sem; |
104 | atomic_t ns_writer_refcount; | ||
105 | 104 | ||
106 | /* | 105 | /* |
107 | * components protected by ns_super_sem | 106 | * components protected by ns_super_sem |
@@ -197,11 +196,26 @@ static inline int nilfs_##name(struct the_nilfs *nilfs) \ | |||
197 | THE_NILFS_FNS(INIT, init) | 196 | THE_NILFS_FNS(INIT, init) |
198 | THE_NILFS_FNS(LOADED, loaded) | 197 | THE_NILFS_FNS(LOADED, loaded) |
199 | THE_NILFS_FNS(DISCONTINUED, discontinued) | 198 | THE_NILFS_FNS(DISCONTINUED, discontinued) |
199 | THE_NILFS_FNS(GC_RUNNING, gc_running) | ||
200 | 200 | ||
201 | /* Minimum interval of periodical update of superblocks (in seconds) */ | 201 | /* Minimum interval of periodical update of superblocks (in seconds) */ |
202 | #define NILFS_SB_FREQ 10 | 202 | #define NILFS_SB_FREQ 10 |
203 | #define NILFS_ALTSB_FREQ 60 /* spare superblock */ | 203 | #define NILFS_ALTSB_FREQ 60 /* spare superblock */ |
204 | 204 | ||
205 | static inline int nilfs_sb_need_update(struct the_nilfs *nilfs) | ||
206 | { | ||
207 | u64 t = get_seconds(); | ||
208 | return t < nilfs->ns_sbwtime[0] || | ||
209 | t > nilfs->ns_sbwtime[0] + NILFS_SB_FREQ; | ||
210 | } | ||
211 | |||
212 | static inline int nilfs_altsb_need_update(struct the_nilfs *nilfs) | ||
213 | { | ||
214 | u64 t = get_seconds(); | ||
215 | struct nilfs_super_block **sbp = nilfs->ns_sbp; | ||
216 | return sbp[1] && t > nilfs->ns_sbwtime[1] + NILFS_ALTSB_FREQ; | ||
217 | } | ||
218 | |||
205 | void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64); | 219 | void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64); |
206 | struct the_nilfs *find_or_create_nilfs(struct block_device *); | 220 | struct the_nilfs *find_or_create_nilfs(struct block_device *); |
207 | void put_nilfs(struct the_nilfs *); | 221 | void put_nilfs(struct the_nilfs *); |
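nilfs_sb_need_update() and nilfs_altsb_need_update() lift the time-window arithmetic out of the deleted nilfs_write_super(), so sync_fs and the segctor thread can share one definition of "stale". The flag accessors used alongside them (nilfs_discontinued(), set_nilfs_gc_running(), ...) are generated by the THE_NILFS_FNS() macro; expanded for the new GC_RUNNING bit, and assuming the usual ns_flags word the macro operates on, it yields approximately:

static inline void set_nilfs_gc_running(struct the_nilfs *nilfs)
{
	set_bit(THE_NILFS_GC_RUNNING, &nilfs->ns_flags);
}

static inline void clear_nilfs_gc_running(struct the_nilfs *nilfs)
{
	clear_bit(THE_NILFS_GC_RUNNING, &nilfs->ns_flags);
}

static inline int nilfs_gc_running(struct the_nilfs *nilfs)
{
	return test_bit(THE_NILFS_GC_RUNNING, &nilfs->ns_flags);
}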
@@ -221,39 +235,26 @@ static inline void get_nilfs(struct the_nilfs *nilfs) | |||
221 | atomic_inc(&nilfs->ns_count); | 235 | atomic_inc(&nilfs->ns_count); |
222 | } | 236 | } |
223 | 237 | ||
224 | static inline struct nilfs_sb_info *nilfs_get_writer(struct the_nilfs *nilfs) | ||
225 | { | ||
226 | if (atomic_inc_and_test(&nilfs->ns_writer_refcount)) | ||
227 | mutex_lock(&nilfs->ns_writer_mutex); | ||
228 | return nilfs->ns_writer; | ||
229 | } | ||
230 | |||
231 | static inline void nilfs_put_writer(struct the_nilfs *nilfs) | ||
232 | { | ||
233 | if (atomic_add_negative(-1, &nilfs->ns_writer_refcount)) | ||
234 | mutex_unlock(&nilfs->ns_writer_mutex); | ||
235 | } | ||
236 | |||
237 | static inline void | 238 | static inline void |
238 | nilfs_attach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi) | 239 | nilfs_attach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi) |
239 | { | 240 | { |
240 | mutex_lock(&nilfs->ns_writer_mutex); | 241 | down_write(&nilfs->ns_writer_sem); |
241 | nilfs->ns_writer = sbi; | 242 | nilfs->ns_writer = sbi; |
242 | mutex_unlock(&nilfs->ns_writer_mutex); | 243 | up_write(&nilfs->ns_writer_sem); |
243 | } | 244 | } |
244 | 245 | ||
245 | static inline void | 246 | static inline void |
246 | nilfs_detach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi) | 247 | nilfs_detach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi) |
247 | { | 248 | { |
248 | mutex_lock(&nilfs->ns_writer_mutex); | 249 | down_write(&nilfs->ns_writer_sem); |
249 | if (sbi == nilfs->ns_writer) | 250 | if (sbi == nilfs->ns_writer) |
250 | nilfs->ns_writer = NULL; | 251 | nilfs->ns_writer = NULL; |
251 | mutex_unlock(&nilfs->ns_writer_mutex); | 252 | up_write(&nilfs->ns_writer_sem); |
252 | } | 253 | } |
253 | 254 | ||
254 | static inline void nilfs_put_sbinfo(struct nilfs_sb_info *sbi) | 255 | static inline void nilfs_put_sbinfo(struct nilfs_sb_info *sbi) |
255 | { | 256 | { |
256 | if (!atomic_dec_and_test(&sbi->s_count)) | 257 | if (atomic_dec_and_test(&sbi->s_count)) |
257 | kfree(sbi); | 258 | kfree(sbi); |
258 | } | 259 | } |
259 | 260 | ||
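Two separate fixes land in this last the_nilfs.h hunk. First, attach/detach take the new ns_writer_sem exclusively, so they serialize against readers such as nilfs_mdt_write_page() above. Second, nilfs_put_sbinfo() drops an inverted test: atomic_dec_and_test() returns true only when the counter reaches zero, so the old "!" freed sbi while references were still outstanding. The corrected drop-reference idiom:

static inline void nilfs_put_sbinfo(struct nilfs_sb_info *sbi)
{
	/* true exactly when we were the last reference holder */
	if (atomic_dec_and_test(&sbi->s_count))
		kfree(sbi);
}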
diff --git a/fs/notify/Kconfig b/fs/notify/Kconfig index 31dac7e3b0f1..dffbb0911d02 100644 --- a/fs/notify/Kconfig +++ b/fs/notify/Kconfig | |||
@@ -1,15 +1,5 @@ | |||
1 | config FSNOTIFY | 1 | config FSNOTIFY |
2 | bool "Filesystem notification backend" | 2 | def_bool n |
3 | default y | ||
4 | ---help--- | ||
5 | fsnotify is a backend for filesystem notification. fsnotify does | ||
6 | not provide any userspace interface but does provide the basis | ||
7 | needed for other notification schemes such as dnotify, inotify, | ||
8 | and fanotify. | ||
9 | |||
10 | Say Y here to enable fsnotify support. | ||
11 | |||
12 | If unsure, say Y. | ||
13 | 3 | ||
14 | source "fs/notify/dnotify/Kconfig" | 4 | source "fs/notify/dnotify/Kconfig" |
15 | source "fs/notify/inotify/Kconfig" | 5 | source "fs/notify/inotify/Kconfig" |
diff --git a/fs/notify/dnotify/Kconfig b/fs/notify/dnotify/Kconfig index 904ff8d5405a..f9c1ca139d8f 100644 --- a/fs/notify/dnotify/Kconfig +++ b/fs/notify/dnotify/Kconfig | |||
@@ -1,6 +1,6 @@ | |||
1 | config DNOTIFY | 1 | config DNOTIFY |
2 | bool "Dnotify support" | 2 | bool "Dnotify support" |
3 | depends on FSNOTIFY | 3 | select FSNOTIFY |
4 | default y | 4 | default y |
5 | help | 5 | help |
6 | Dnotify is a directory-based per-fd file change notification system | 6 | Dnotify is a directory-based per-fd file change notification system |
diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index ec2f7bd76818..037e878e03fc 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c | |||
@@ -159,7 +159,9 @@ void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, const | |||
159 | if (!group->ops->should_send_event(group, to_tell, mask)) | 159 | if (!group->ops->should_send_event(group, to_tell, mask)) |
160 | continue; | 160 | continue; |
161 | if (!event) { | 161 | if (!event) { |
162 | event = fsnotify_create_event(to_tell, mask, data, data_is, file_name, cookie); | 162 | event = fsnotify_create_event(to_tell, mask, data, |
163 | data_is, file_name, cookie, | ||
164 | GFP_KERNEL); | ||
163 | /* shit, we OOM'd and now we can't tell, maybe | 165 | /* shit, we OOM'd and now we can't tell, maybe |
164 | * someday someone else will want to do something | 166 | * someday someone else will want to do something |
165 | * here */ | 167 | * here */ |
diff --git a/fs/notify/inotify/Kconfig b/fs/notify/inotify/Kconfig index 5356884289a1..3e56dbffe729 100644 --- a/fs/notify/inotify/Kconfig +++ b/fs/notify/inotify/Kconfig | |||
@@ -15,7 +15,7 @@ config INOTIFY | |||
15 | 15 | ||
16 | config INOTIFY_USER | 16 | config INOTIFY_USER |
17 | bool "Inotify support for userspace" | 17 | bool "Inotify support for userspace" |
18 | depends on FSNOTIFY | 18 | select FSNOTIFY |
19 | default y | 19 | default y |
20 | ---help--- | 20 | ---help--- |
21 | Say Y here to enable inotify support for userspace, including the | 21 | Say Y here to enable inotify support for userspace, including the |
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index 47cd258fd24d..c9ee67b442e1 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c | |||
@@ -62,13 +62,14 @@ static int inotify_handle_event(struct fsnotify_group *group, struct fsnotify_ev | |||
62 | event_priv->wd = wd; | 62 | event_priv->wd = wd; |
63 | 63 | ||
64 | ret = fsnotify_add_notify_event(group, event, fsn_event_priv); | 64 | ret = fsnotify_add_notify_event(group, event, fsn_event_priv); |
65 | /* EEXIST is not an error */ | 65 | if (ret) { |
66 | if (ret == -EEXIST) | ||
67 | ret = 0; | ||
68 | |||
69 | /* did event_priv get attached? */ | ||
70 | if (list_empty(&fsn_event_priv->event_list)) | ||
71 | inotify_free_event_priv(fsn_event_priv); | 66 | inotify_free_event_priv(fsn_event_priv); |
67 | /* EEXIST says we tail matched, EOVERFLOW isn't something | ||
68 | * to report up the stack. */ | ||
69 | if ((ret == -EEXIST) || | ||
70 | (ret == -EOVERFLOW)) | ||
71 | ret = 0; | ||
72 | } | ||
72 | 73 | ||
73 | /* | 74 | /* |
74 | * If we hold the entry until after the event is on the queue | 75 | * If we hold the entry until after the event is on the queue |
@@ -104,16 +105,45 @@ static bool inotify_should_send_event(struct fsnotify_group *group, struct inode | |||
104 | return send; | 105 | return send; |
105 | } | 106 | } |
106 | 107 | ||
108 | /* | ||
109 | * This is NEVER supposed to be called. Inotify marks should either have been | ||
110 | * removed from the idr when the watch was removed or in the | ||
111 | * fsnotify_destroy_mark_by_group() call when the inotify instance was being | ||
112 | * torn down. This is only called if the idr is about to be freed but there | ||
113 | * are still marks in it. | ||
114 | */ | ||
107 | static int idr_callback(int id, void *p, void *data) | 115 | static int idr_callback(int id, void *p, void *data) |
108 | { | 116 | { |
109 | BUG(); | 117 | struct fsnotify_mark_entry *entry; |
118 | struct inotify_inode_mark_entry *ientry; | ||
119 | static bool warned = false; | ||
120 | |||
121 | if (warned) | ||
122 | return 0; | ||
123 | |||
124 | warned = true; | ||
125 | entry = p; | ||
126 | ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry); | ||
127 | |||
128 | WARN(1, "inotify closing but id=%d for entry=%p in group=%p still in " | ||
129 | "idr. Probably leaking memory\n", id, p, data); | ||
130 | |||
131 | /* | ||
132 | * I'm taking the liberty of assuming that the mark in question is a | ||
133 | * valid address and I'm dereferencing it. This might help to figure | ||
134 | * out why we got here and the panic is no worse than the original | ||
135 | * BUG() that was here. | ||
136 | */ | ||
137 | if (entry) | ||
138 | printk(KERN_WARNING "entry->group=%p inode=%p wd=%d\n", | ||
139 | entry->group, entry->inode, ientry->wd); | ||
110 | return 0; | 140 | return 0; |
111 | } | 141 | } |
112 | 142 | ||
113 | static void inotify_free_group_priv(struct fsnotify_group *group) | 143 | static void inotify_free_group_priv(struct fsnotify_group *group) |
114 | { | 144 | { |
115 | /* ideally the idr is empty and we won't hit the BUG in the callback */ | 145 | /* ideally the idr is empty and we won't hit the BUG in the callback */
116 | idr_for_each(&group->inotify_data.idr, idr_callback, NULL); | 146 | idr_for_each(&group->inotify_data.idr, idr_callback, group); |
117 | idr_remove_all(&group->inotify_data.idr); | 147 | idr_remove_all(&group->inotify_data.idr); |
118 | idr_destroy(&group->inotify_data.idr); | 148 | idr_destroy(&group->inotify_data.idr); |
119 | } | 149 | } |
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index ff27a2965844..dcd2040d330c 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c | |||
@@ -47,9 +47,6 @@ | |||
47 | 47 | ||
48 | static struct vfsmount *inotify_mnt __read_mostly; | 48 | static struct vfsmount *inotify_mnt __read_mostly; |
49 | 49 | ||
50 | /* this just sits here and wastes global memory. used to just pad userspace messages with zeros */ | ||
51 | static struct inotify_event nul_inotify_event; | ||
52 | |||
53 | /* these are configurable via /proc/sys/fs/inotify/ */ | 50 | /* these are configurable via /proc/sys/fs/inotify/ */ |
54 | static int inotify_max_user_instances __read_mostly; | 51 | static int inotify_max_user_instances __read_mostly; |
55 | static int inotify_max_queued_events __read_mostly; | 52 | static int inotify_max_queued_events __read_mostly; |
@@ -57,7 +54,6 @@ int inotify_max_user_watches __read_mostly; | |||
57 | 54 | ||
58 | static struct kmem_cache *inotify_inode_mark_cachep __read_mostly; | 55 | static struct kmem_cache *inotify_inode_mark_cachep __read_mostly; |
59 | struct kmem_cache *event_priv_cachep __read_mostly; | 56 | struct kmem_cache *event_priv_cachep __read_mostly; |
60 | static struct fsnotify_event *inotify_ignored_event; | ||
61 | 57 | ||
62 | /* | 58 | /* |
63 | * When inotify registers a new group it increments this and uses that | 59 | * When inotify registers a new group it increments this and uses that |
@@ -158,7 +154,8 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group, | |||
158 | 154 | ||
159 | event = fsnotify_peek_notify_event(group); | 155 | event = fsnotify_peek_notify_event(group); |
160 | 156 | ||
161 | event_size += roundup(event->name_len, event_size); | 157 | if (event->name_len) |
158 | event_size += roundup(event->name_len + 1, event_size); | ||
162 | 159 | ||
163 | if (event_size > count) | 160 | if (event_size > count) |
164 | return ERR_PTR(-EINVAL); | 161 | return ERR_PTR(-EINVAL); |
@@ -184,7 +181,7 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, | |||
184 | struct fsnotify_event_private_data *fsn_priv; | 181 | struct fsnotify_event_private_data *fsn_priv; |
185 | struct inotify_event_private_data *priv; | 182 | struct inotify_event_private_data *priv; |
186 | size_t event_size = sizeof(struct inotify_event); | 183 | size_t event_size = sizeof(struct inotify_event); |
187 | size_t name_len; | 184 | size_t name_len = 0; |
188 | 185 | ||
189 | /* we get the inotify watch descriptor from the event private data */ | 186 | /* we get the inotify watch descriptor from the event private data */ |
190 | spin_lock(&event->lock); | 187 | spin_lock(&event->lock); |
@@ -200,8 +197,12 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, | |||
200 | inotify_free_event_priv(fsn_priv); | 197 | inotify_free_event_priv(fsn_priv); |
201 | } | 198 | } |
202 | 199 | ||
203 | /* round up event->name_len so it is a multiple of event_size */ | 200 | /* |
204 | name_len = roundup(event->name_len, event_size); | 201 | * round up event->name_len so it is a multiple of event_size |
202 | * plus an extra byte for the terminating '\0'. | ||
203 | */ | ||
204 | if (event->name_len) | ||
205 | name_len = roundup(event->name_len + 1, event_size); | ||
205 | inotify_event.len = name_len; | 206 | inotify_event.len = name_len; |
206 | 207 | ||
207 | inotify_event.mask = inotify_mask_to_arg(event->mask); | 208 | inotify_event.mask = inotify_mask_to_arg(event->mask); |
@@ -225,8 +226,8 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, | |||
225 | return -EFAULT; | 226 | return -EFAULT; |
226 | buf += event->name_len; | 227 | buf += event->name_len; |
227 | 228 | ||
228 | /* fill userspace with 0's from nul_inotify_event */ | 229 | /* fill userspace with 0's */ |
229 | if (copy_to_user(buf, &nul_inotify_event, len_to_zero)) | 230 | if (clear_user(buf, len_to_zero)) |
230 | return -EFAULT; | 231 | return -EFAULT; |
231 | buf += len_to_zero; | 232 | buf += len_to_zero; |
232 | event_size += name_len; | 233 | event_size += name_len; |
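These length changes are visible from userspace: for a non-empty name, event->len is now rounded up from name_len + 1, so the name is always NUL-terminated and zero-padded (via clear_user()) out to a multiple of sizeof(struct inotify_event). A minimal reader that leans on exactly that contract (the watched path is made up):

#include <stdio.h>
#include <unistd.h>
#include <sys/inotify.h>

int main(void)
{
	char buf[4096];
	char *p;
	ssize_t n;
	int fd = inotify_init();

	if (fd < 0)
		return 1;
	inotify_add_watch(fd, "/tmp", IN_CREATE | IN_DELETE);

	n = read(fd, buf, sizeof(buf));		/* blocks for one batch */
	for (p = buf; n > 0 && p < buf + n; ) {
		struct inotify_event *ev = (struct inotify_event *)p;

		/* ev->len covers the padded, NUL-terminated name (or is 0) */
		if (ev->len)
			printf("wd=%d mask=%#x name=%s\n",
			       ev->wd, (unsigned)ev->mask, ev->name);
		p += sizeof(*ev) + ev->len;
	}
	return 0;
}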
@@ -327,8 +328,9 @@ static long inotify_ioctl(struct file *file, unsigned int cmd, | |||
327 | list_for_each_entry(holder, &group->notification_list, event_list) { | 328 | list_for_each_entry(holder, &group->notification_list, event_list) { |
328 | event = holder->event; | 329 | event = holder->event; |
329 | send_len += sizeof(struct inotify_event); | 330 | send_len += sizeof(struct inotify_event); |
330 | send_len += roundup(event->name_len, | 331 | if (event->name_len) |
331 | sizeof(struct inotify_event)); | 332 | send_len += roundup(event->name_len + 1, |
333 | sizeof(struct inotify_event)); | ||
332 | } | 334 | } |
333 | mutex_unlock(&group->notification_mutex); | 335 | mutex_unlock(&group->notification_mutex); |
334 | ret = put_user(send_len, (int __user *) p); | 336 | ret = put_user(send_len, (int __user *) p); |
@@ -366,20 +368,71 @@ static int inotify_find_inode(const char __user *dirname, struct path *path, uns | |||
366 | } | 368 | } |
367 | 369 | ||
368 | /* | 370 | /* |
369 | * Send IN_IGNORED for this wd, remove this wd from the idr, and drop the | 371 | * Remove the mark from the idr (if present) and drop the reference |
370 | * internal reference held on the mark because it is in the idr. | 372 | * on the mark because it was in the idr.
373 | */ | ||
374 | static void inotify_remove_from_idr(struct fsnotify_group *group, | ||
375 | struct inotify_inode_mark_entry *ientry) | ||
376 | { | ||
377 | struct idr *idr; | ||
378 | struct fsnotify_mark_entry *entry; | ||
379 | struct inotify_inode_mark_entry *found_ientry; | ||
380 | int wd; | ||
381 | |||
382 | spin_lock(&group->inotify_data.idr_lock); | ||
383 | idr = &group->inotify_data.idr; | ||
384 | wd = ientry->wd; | ||
385 | |||
386 | if (wd == -1) | ||
387 | goto out; | ||
388 | |||
389 | entry = idr_find(&group->inotify_data.idr, wd); | ||
390 | if (unlikely(!entry)) | ||
391 | goto out; | ||
392 | |||
393 | found_ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry); | ||
394 | if (unlikely(found_ientry != ientry)) { | ||
395 | /* We found an entry in the idr with the right wd, but it's | ||
396 | * not the entry we were told to remove. eparis seriously | ||
397 | * fucked up somewhere. */ | ||
398 | WARN_ON(1); | ||
399 | ientry->wd = -1; | ||
400 | goto out; | ||
401 | } | ||
402 | |||
403 | /* One ref for being in the idr, one ref held by the caller */ | ||
404 | BUG_ON(atomic_read(&entry->refcnt) < 2); | ||
405 | |||
406 | idr_remove(idr, wd); | ||
407 | ientry->wd = -1; | ||
408 | |||
409 | /* removed from the idr, drop that ref */ | ||
410 | fsnotify_put_mark(entry); | ||
411 | out: | ||
412 | spin_unlock(&group->inotify_data.idr_lock); | ||
413 | } | ||
414 | |||
415 | /* | ||
416 | * Send IN_IGNORED for this wd, remove this wd from the idr. | ||
371 | */ | 417 | */ |
372 | void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry, | 418 | void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry, |
373 | struct fsnotify_group *group) | 419 | struct fsnotify_group *group) |
374 | { | 420 | { |
375 | struct inotify_inode_mark_entry *ientry; | 421 | struct inotify_inode_mark_entry *ientry; |
422 | struct fsnotify_event *ignored_event; | ||
376 | struct inotify_event_private_data *event_priv; | 423 | struct inotify_event_private_data *event_priv; |
377 | struct fsnotify_event_private_data *fsn_event_priv; | 424 | struct fsnotify_event_private_data *fsn_event_priv; |
378 | struct idr *idr; | 425 | int ret; |
426 | |||
427 | ignored_event = fsnotify_create_event(NULL, FS_IN_IGNORED, NULL, | ||
428 | FSNOTIFY_EVENT_NONE, NULL, 0, | ||
429 | GFP_NOFS); | ||
430 | if (!ignored_event) | ||
431 | return; | ||
379 | 432 | ||
380 | ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry); | 433 | ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry); |
381 | 434 | ||
382 | event_priv = kmem_cache_alloc(event_priv_cachep, GFP_KERNEL); | 435 | event_priv = kmem_cache_alloc(event_priv_cachep, GFP_NOFS); |
383 | if (unlikely(!event_priv)) | 436 | if (unlikely(!event_priv)) |
384 | goto skip_send_ignore; | 437 | goto skip_send_ignore; |
385 | 438 | ||
@@ -388,22 +441,19 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry, | |||
388 | fsn_event_priv->group = group; | 441 | fsn_event_priv->group = group; |
389 | event_priv->wd = ientry->wd; | 442 | event_priv->wd = ientry->wd; |
390 | 443 | ||
391 | fsnotify_add_notify_event(group, inotify_ignored_event, fsn_event_priv); | 444 | ret = fsnotify_add_notify_event(group, ignored_event, fsn_event_priv); |
392 | 445 | if (ret) | |
393 | /* did the private data get added? */ | ||
394 | if (list_empty(&fsn_event_priv->event_list)) | ||
395 | inotify_free_event_priv(fsn_event_priv); | 446 | inotify_free_event_priv(fsn_event_priv); |
396 | 447 | ||
397 | skip_send_ignore: | 448 | skip_send_ignore: |
398 | 449 | ||
450 | /* matches the reference taken when the event was created */ | ||
451 | fsnotify_put_event(ignored_event); | ||
452 | |||
399 | /* remove this entry from the idr */ | 453 | /* remove this entry from the idr */ |
400 | spin_lock(&group->inotify_data.idr_lock); | 454 | inotify_remove_from_idr(group, ientry); |
401 | idr = &group->inotify_data.idr; | ||
402 | idr_remove(idr, ientry->wd); | ||
403 | spin_unlock(&group->inotify_data.idr_lock); | ||
404 | 455 | ||
405 | /* removed from idr, drop that reference */ | 456 | atomic_dec(&group->inotify_data.user->inotify_watches); |
406 | fsnotify_put_mark(entry); | ||
407 | } | 457 | } |
408 | 458 | ||
409 | /* ding dong the mark is dead */ | 459 | /* ding dong the mark is dead */ |
@@ -414,67 +464,29 @@ static void inotify_free_mark(struct fsnotify_mark_entry *entry) | |||
414 | kmem_cache_free(inotify_inode_mark_cachep, ientry); | 464 | kmem_cache_free(inotify_inode_mark_cachep, ientry); |
415 | } | 465 | } |
416 | 466 | ||
417 | static int inotify_update_watch(struct fsnotify_group *group, struct inode *inode, u32 arg) | 467 | static int inotify_update_existing_watch(struct fsnotify_group *group, |
468 | struct inode *inode, | ||
469 | u32 arg) | ||
418 | { | 470 | { |
419 | struct fsnotify_mark_entry *entry = NULL; | 471 | struct fsnotify_mark_entry *entry; |
420 | struct inotify_inode_mark_entry *ientry; | 472 | struct inotify_inode_mark_entry *ientry; |
421 | int ret = 0; | ||
422 | int add = (arg & IN_MASK_ADD); | ||
423 | __u32 mask; | ||
424 | __u32 old_mask, new_mask; | 473 | __u32 old_mask, new_mask; |
474 | __u32 mask; | ||
475 | int add = (arg & IN_MASK_ADD); | ||
476 | int ret; | ||
425 | 477 | ||
426 | /* don't allow invalid bits: we don't want flags set */ | 478 | /* don't allow invalid bits: we don't want flags set */ |
427 | mask = inotify_arg_to_mask(arg); | 479 | mask = inotify_arg_to_mask(arg); |
428 | if (unlikely(!mask)) | 480 | if (unlikely(!mask)) |
429 | return -EINVAL; | 481 | return -EINVAL; |
430 | 482 | ||
431 | ientry = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL); | ||
432 | if (unlikely(!ientry)) | ||
433 | return -ENOMEM; | ||
434 | /* we set the mask at the end after attaching it */ | ||
435 | fsnotify_init_mark(&ientry->fsn_entry, inotify_free_mark); | ||
436 | ientry->wd = 0; | ||
437 | |||
438 | find_entry: | ||
439 | spin_lock(&inode->i_lock); | 483 | spin_lock(&inode->i_lock); |
440 | entry = fsnotify_find_mark_entry(group, inode); | 484 | entry = fsnotify_find_mark_entry(group, inode); |
441 | spin_unlock(&inode->i_lock); | 485 | spin_unlock(&inode->i_lock); |
442 | if (entry) { | 486 | if (!entry) |
443 | kmem_cache_free(inotify_inode_mark_cachep, ientry); | 487 | return -ENOENT; |
444 | ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry); | ||
445 | } else { | ||
446 | if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches) { | ||
447 | ret = -ENOSPC; | ||
448 | goto out_err; | ||
449 | } | ||
450 | |||
451 | ret = fsnotify_add_mark(&ientry->fsn_entry, group, inode); | ||
452 | if (ret == -EEXIST) | ||
453 | goto find_entry; | ||
454 | else if (ret) | ||
455 | goto out_err; | ||
456 | 488 | ||
457 | entry = &ientry->fsn_entry; | 489 | ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry); |
458 | retry: | ||
459 | ret = -ENOMEM; | ||
460 | if (unlikely(!idr_pre_get(&group->inotify_data.idr, GFP_KERNEL))) | ||
461 | goto out_err; | ||
462 | |||
463 | spin_lock(&group->inotify_data.idr_lock); | ||
464 | /* if entry is added to the idr we keep the reference obtained | ||
465 | * through fsnotify_mark_add. remember to drop this reference | ||
466 | * when entry is removed from idr */ | ||
467 | ret = idr_get_new_above(&group->inotify_data.idr, entry, | ||
468 | ++group->inotify_data.last_wd, | ||
469 | &ientry->wd); | ||
470 | spin_unlock(&group->inotify_data.idr_lock); | ||
471 | if (ret) { | ||
472 | if (ret == -EAGAIN) | ||
473 | goto retry; | ||
474 | goto out_err; | ||
475 | } | ||
476 | atomic_inc(&group->inotify_data.user->inotify_watches); | ||
477 | } | ||
478 | 490 | ||
479 | spin_lock(&entry->lock); | 491 | spin_lock(&entry->lock); |
480 | 492 | ||
@@ -506,14 +518,108 @@ retry: | |||
506 | fsnotify_recalc_group_mask(group); | 518 | fsnotify_recalc_group_mask(group); |
507 | } | 519 | } |
508 | 520 | ||
509 | return ientry->wd; | 521 | /* return the wd */ |
522 | ret = ientry->wd; | ||
510 | 523 | ||
511 | out_err: | 524 | /* match the get from fsnotify_find_mark_entry() */ |
512 | /* see this isn't supposed to happen, just kill the watch */ | 525 | fsnotify_put_mark(entry); |
513 | if (entry) { | 526 | |
514 | fsnotify_destroy_mark_by_entry(entry); | 527 | return ret; |
515 | fsnotify_put_mark(entry); | 528 | } |
529 | |||
530 | static int inotify_new_watch(struct fsnotify_group *group, | ||
531 | struct inode *inode, | ||
532 | u32 arg) | ||
533 | { | ||
534 | struct inotify_inode_mark_entry *tmp_ientry; | ||
535 | __u32 mask; | ||
536 | int ret; | ||
537 | |||
538 | /* don't allow invalid bits: we don't want flags set */ | ||
539 | mask = inotify_arg_to_mask(arg); | ||
540 | if (unlikely(!mask)) | ||
541 | return -EINVAL; | ||
542 | |||
543 | tmp_ientry = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL); | ||
544 | if (unlikely(!tmp_ientry)) | ||
545 | return -ENOMEM; | ||
546 | |||
547 | fsnotify_init_mark(&tmp_ientry->fsn_entry, inotify_free_mark); | ||
548 | tmp_ientry->fsn_entry.mask = mask; | ||
549 | tmp_ientry->wd = -1; | ||
550 | |||
551 | ret = -ENOSPC; | ||
552 | if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches) | ||
553 | goto out_err; | ||
554 | retry: | ||
555 | ret = -ENOMEM; | ||
556 | if (unlikely(!idr_pre_get(&group->inotify_data.idr, GFP_KERNEL))) | ||
557 | goto out_err; | ||
558 | |||
559 | spin_lock(&group->inotify_data.idr_lock); | ||
560 | ret = idr_get_new_above(&group->inotify_data.idr, &tmp_ientry->fsn_entry, | ||
561 | group->inotify_data.last_wd, | ||
562 | &tmp_ientry->wd); | ||
563 | spin_unlock(&group->inotify_data.idr_lock); | ||
564 | if (ret) { | ||
565 | /* idr was out of memory allocate and try again */ | ||
566 | if (ret == -EAGAIN) | ||
567 | goto retry; | ||
568 | goto out_err; | ||
569 | } | ||
570 | |||
571 | /* we put the mark on the idr, take a reference */ | ||
572 | fsnotify_get_mark(&tmp_ientry->fsn_entry); | ||
573 | |||
574 | /* we are on the idr, now get on the inode */ | ||
575 | ret = fsnotify_add_mark(&tmp_ientry->fsn_entry, group, inode); | ||
576 | if (ret) { | ||
577 | /* we failed to get on the inode, get off the idr */ | ||
578 | inotify_remove_from_idr(group, tmp_ientry); | ||
579 | goto out_err; | ||
516 | } | 580 | } |
581 | |||
582 | /* update the idr hint, who cares about races, it's just a hint */ | ||
583 | group->inotify_data.last_wd = tmp_ientry->wd; | ||
584 | |||
585 | /* increment the number of watches the user has */ | ||
586 | atomic_inc(&group->inotify_data.user->inotify_watches); | ||
587 | |||
588 | /* return the watch descriptor for this new entry */ | ||
589 | ret = tmp_ientry->wd; | ||
590 | |||
591 | /* match the ref from fsnotify_init_mark() */ | ||
592 | fsnotify_put_mark(&tmp_ientry->fsn_entry); | ||
593 | |||
594 | /* if this mark added a new event update the group mask */ | ||
595 | if (mask & ~group->mask) | ||
596 | fsnotify_recalc_group_mask(group); | ||
597 | |||
598 | out_err: | ||
599 | if (ret < 0) | ||
600 | kmem_cache_free(inotify_inode_mark_cachep, tmp_ientry); | ||
601 | |||
602 | return ret; | ||
603 | } | ||
604 | |||
605 | static int inotify_update_watch(struct fsnotify_group *group, struct inode *inode, u32 arg) | ||
606 | { | ||
607 | int ret = 0; | ||
608 | |||
609 | retry: | ||
610 | /* try to update an existing watch with the new arg */ | ||
611 | ret = inotify_update_existing_watch(group, inode, arg); | ||
612 | /* no mark present, try to add a new one */ | ||
613 | if (ret == -ENOENT) | ||
614 | ret = inotify_new_watch(group, inode, arg); | ||
615 | /* | ||
616 | * inotify_new_watch could race with another thread which did an | ||
617 | * inotify_new_watch between the update_existing and the add watch | ||
618 | * here; go back and try to update the existing mark again. | ||
619 | */ | ||
620 | if (ret == -EEXIST) | ||
621 | goto retry; | ||
622 | |||
517 | return ret; | 623 | return ret; |
518 | } | 624 | } |
519 | 625 | ||
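The split into inotify_update_existing_watch()/inotify_new_watch() resolves a classic check-then-create race, and the retry loop is the glue: try the update; on -ENOENT create a fresh mark; if another thread won the creation race in between, fsnotify_add_mark() reports -EEXIST and the loop falls back to updating. The same optimistic pattern, reduced to a skeleton with illustrative function names:

	int ret;

	do {
		ret = update_existing(group, inode, arg);
		if (ret == -ENOENT)		/* nothing there yet: create */
			ret = create_new(group, inode, arg);
	} while (ret == -EEXIST);		/* lost the race: update instead */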
@@ -532,7 +638,7 @@ static struct fsnotify_group *inotify_new_group(struct user_struct *user, unsign | |||
532 | 638 | ||
533 | spin_lock_init(&group->inotify_data.idr_lock); | 639 | spin_lock_init(&group->inotify_data.idr_lock); |
534 | idr_init(&group->inotify_data.idr); | 640 | idr_init(&group->inotify_data.idr); |
535 | group->inotify_data.last_wd = 0; | 641 | group->inotify_data.last_wd = 1; |
536 | group->inotify_data.user = user; | 642 | group->inotify_data.user = user; |
537 | group->inotify_data.fa = NULL; | 643 | group->inotify_data.fa = NULL; |
538 | 644 | ||
@@ -721,9 +827,6 @@ static int __init inotify_user_setup(void) | |||
721 | 827 | ||
722 | inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark_entry, SLAB_PANIC); | 828 | inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark_entry, SLAB_PANIC); |
723 | event_priv_cachep = KMEM_CACHE(inotify_event_private_data, SLAB_PANIC); | 829 | event_priv_cachep = KMEM_CACHE(inotify_event_private_data, SLAB_PANIC); |
724 | inotify_ignored_event = fsnotify_create_event(NULL, FS_IN_IGNORED, NULL, FSNOTIFY_EVENT_NONE, NULL, 0); | ||
725 | if (!inotify_ignored_event) | ||
726 | panic("unable to allocate the inotify ignored event\n"); | ||
727 | 830 | ||
728 | inotify_max_queued_events = 16384; | 831 | inotify_max_queued_events = 16384; |
729 | inotify_max_user_instances = 128; | 832 | inotify_max_user_instances = 128; |
diff --git a/fs/notify/notification.c b/fs/notify/notification.c index 959b73e756fd..3816d5750dd5 100644 --- a/fs/notify/notification.c +++ b/fs/notify/notification.c | |||
@@ -136,18 +136,28 @@ static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new | |||
136 | { | 136 | { |
137 | if ((old->mask == new->mask) && | 137 | if ((old->mask == new->mask) && |
138 | (old->to_tell == new->to_tell) && | 138 | (old->to_tell == new->to_tell) && |
139 | (old->data_type == new->data_type)) { | 139 | (old->data_type == new->data_type) && |
140 | (old->name_len == new->name_len)) { | ||
140 | switch (old->data_type) { | 141 | switch (old->data_type) { |
141 | case (FSNOTIFY_EVENT_INODE): | 142 | case (FSNOTIFY_EVENT_INODE): |
142 | if (old->inode == new->inode) | 143 | /* remember, after old was put on the wait_q we aren't |
144 | * allowed to look at the inode any more, only thing | ||
145 | * left to check was if the file_name is the same */ | ||
146 | if (old->name_len && | ||
147 | !strcmp(old->file_name, new->file_name)) | ||
143 | return true; | 148 | return true; |
144 | break; | 149 | break; |
145 | case (FSNOTIFY_EVENT_PATH): | 150 | case (FSNOTIFY_EVENT_PATH): |
146 | if ((old->path.mnt == new->path.mnt) && | 151 | if ((old->path.mnt == new->path.mnt) && |
147 | (old->path.dentry == new->path.dentry)) | 152 | (old->path.dentry == new->path.dentry)) |
148 | return true; | 153 | return true; |
154 | break; | ||
149 | case (FSNOTIFY_EVENT_NONE): | 155 | case (FSNOTIFY_EVENT_NONE): |
150 | return true; | 156 | if (old->mask & FS_Q_OVERFLOW) |
157 | return true; | ||
158 | else if (old->mask & FS_IN_IGNORED) | ||
159 | return false; | ||
160 | return false; | ||
151 | }; | 161 | }; |
152 | } | 162 | } |
153 | return false; | 163 | return false; |
@@ -165,9 +175,7 @@ int fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_even | |||
165 | struct list_head *list = &group->notification_list; | 175 | struct list_head *list = &group->notification_list; |
166 | struct fsnotify_event_holder *last_holder; | 176 | struct fsnotify_event_holder *last_holder; |
167 | struct fsnotify_event *last_event; | 177 | struct fsnotify_event *last_event; |
168 | 178 | int ret = 0; | |
169 | /* easy to tell if priv was attached to the event */ | ||
170 | INIT_LIST_HEAD(&priv->event_list); | ||
171 | 179 | ||
172 | /* | 180 | /* |
173 | * There is one fsnotify_event_holder embedded inside each fsnotify_event. | 181 | * There is one fsnotify_event_holder embedded inside each fsnotify_event. |
@@ -188,6 +196,7 @@ alloc_holder: | |||
188 | 196 | ||
189 | if (group->q_len >= group->max_events) { | 197 | if (group->q_len >= group->max_events) { |
190 | event = &q_overflow_event; | 198 | event = &q_overflow_event; |
199 | ret = -EOVERFLOW; | ||
191 | /* sorry, no private data on the overflow event */ | 200 | /* sorry, no private data on the overflow event */ |
192 | priv = NULL; | 201 | priv = NULL; |
193 | } | 202 | } |
@@ -229,7 +238,7 @@ alloc_holder: | |||
229 | mutex_unlock(&group->notification_mutex); | 238 | mutex_unlock(&group->notification_mutex); |
230 | 239 | ||
231 | wake_up(&group->notification_waitq); | 240 | wake_up(&group->notification_waitq); |
232 | return 0; | 241 | return ret; |
233 | } | 242 | } |
234 | 243 | ||
235 | /* | 244 | /* |
@@ -339,18 +348,19 @@ static void initialize_event(struct fsnotify_event *event) | |||
339 | * @name the filename, if available | 348 | * @name the filename, if available |
340 | */ | 349 | */ |
341 | struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, void *data, | 350 | struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, void *data, |
342 | int data_type, const char *name, u32 cookie) | 351 | int data_type, const char *name, u32 cookie, |
352 | gfp_t gfp) | ||
343 | { | 353 | { |
344 | struct fsnotify_event *event; | 354 | struct fsnotify_event *event; |
345 | 355 | ||
346 | event = kmem_cache_alloc(fsnotify_event_cachep, GFP_KERNEL); | 356 | event = kmem_cache_alloc(fsnotify_event_cachep, gfp); |
347 | if (!event) | 357 | if (!event) |
348 | return NULL; | 358 | return NULL; |
349 | 359 | ||
350 | initialize_event(event); | 360 | initialize_event(event); |
351 | 361 | ||
352 | if (name) { | 362 | if (name) { |
353 | event->file_name = kstrdup(name, GFP_KERNEL); | 363 | event->file_name = kstrdup(name, gfp); |
354 | if (!event->file_name) { | 364 | if (!event->file_name) { |
355 | kmem_cache_free(fsnotify_event_cachep, event); | 365 | kmem_cache_free(fsnotify_event_cachep, event); |
356 | return NULL; | 366 | return NULL; |
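fsnotify_create_event() now takes its allocation mask from the caller instead of hard-coding GFP_KERNEL. The two call sites in this series show the intended split: the ordinary delivery path can block freely, while inotify's mark-teardown path may run in filesystem-reclaim context and must pass GFP_NOFS. Side by side (condensed from the hunks above):

	/* normal event delivery: a blocking allocation is fine */
	event = fsnotify_create_event(to_tell, mask, data, data_is,
				      file_name, cookie, GFP_KERNEL);

	/* teardown can run under fs reclaim: avoid recursing into the fs */
	ignored_event = fsnotify_create_event(NULL, FS_IN_IGNORED, NULL,
					      FSNOTIFY_EVENT_NONE, NULL, 0,
					      GFP_NOFS);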
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 3140a4429af1..4350d4993b18 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c | |||
@@ -2076,14 +2076,6 @@ err_out: | |||
2076 | *ppos = pos; | 2076 | *ppos = pos; |
2077 | if (cached_page) | 2077 | if (cached_page) |
2078 | page_cache_release(cached_page); | 2078 | page_cache_release(cached_page); |
2079 | /* For now, when the user asks for O_SYNC, we actually give O_DSYNC. */ | ||
2080 | if (likely(!status)) { | ||
2081 | if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(vi))) { | ||
2082 | if (!mapping->a_ops->writepage || !is_sync_kiocb(iocb)) | ||
2083 | status = generic_osync_inode(vi, mapping, | ||
2084 | OSYNC_METADATA|OSYNC_DATA); | ||
2085 | } | ||
2086 | } | ||
2087 | pagevec_lru_add_file(&lru_pvec); | 2079 | pagevec_lru_add_file(&lru_pvec); |
2088 | ntfs_debug("Done. Returning %s (written 0x%lx, status %li).", | 2080 | ntfs_debug("Done. Returning %s (written 0x%lx, status %li).", |
2089 | written ? "written" : "status", (unsigned long)written, | 2081 | written ? "written" : "status", (unsigned long)written, |
@@ -2145,8 +2137,8 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov, | |||
2145 | mutex_lock(&inode->i_mutex); | 2137 | mutex_lock(&inode->i_mutex); |
2146 | ret = ntfs_file_aio_write_nolock(iocb, iov, nr_segs, &iocb->ki_pos); | 2138 | ret = ntfs_file_aio_write_nolock(iocb, iov, nr_segs, &iocb->ki_pos); |
2147 | mutex_unlock(&inode->i_mutex); | 2139 | mutex_unlock(&inode->i_mutex); |
2148 | if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { | 2140 | if (ret > 0) { |
2149 | int err = sync_page_range(inode, mapping, pos, ret); | 2141 | int err = generic_write_sync(file, pos, ret); |
2150 | if (err < 0) | 2142 | if (err < 0) |
2151 | ret = err; | 2143 | ret = err; |
2152 | } | 2144 | } |
@@ -2173,8 +2165,8 @@ static ssize_t ntfs_file_writev(struct file *file, const struct iovec *iov, | |||
2173 | if (ret == -EIOCBQUEUED) | 2165 | if (ret == -EIOCBQUEUED) |
2174 | ret = wait_on_sync_kiocb(&kiocb); | 2166 | ret = wait_on_sync_kiocb(&kiocb); |
2175 | mutex_unlock(&inode->i_mutex); | 2167 | mutex_unlock(&inode->i_mutex); |
2176 | if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { | 2168 | if (ret > 0) { |
2177 | int err = sync_page_range(inode, mapping, *ppos - ret, ret); | 2169 | int err = generic_write_sync(file, *ppos - ret, ret); |
2178 | if (err < 0) | 2170 | if (err < 0) |
2179 | ret = err; | 2171 | ret = err; |
2180 | } | 2172 | } |
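Both call sites above can drop their open-coded O_SYNC/IS_SYNC tests because generic_write_sync() performs the check itself and syncs only the byte range that was just written. A hedged paraphrase of the helper these hunks rely on, close to but not guaranteed byte-for-byte identical to the merged implementation:

    int generic_write_sync(struct file *file, loff_t pos, loff_t count)
    {
            /* nothing to do unless the write must be synchronous */
            if (!(file->f_flags & O_SYNC) && !IS_SYNC(file->f_mapping->host))
                    return 0;
            /* flush and wait on just [pos, pos + count - 1]; the final
             * argument requests fdatasync semantics */
            return vfs_fsync_range(file, file->f_path.dentry, pos,
                                   pos + count - 1, 1);
    }
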
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c index 23bf68453d7d..1caa0ef0b2bb 100644 --- a/fs/ntfs/mft.c +++ b/fs/ntfs/mft.c | |||
@@ -384,13 +384,12 @@ unm_err_out: | |||
384 | * it is dirty in the inode meta data rather than the data page cache of the | 384 | * it is dirty in the inode meta data rather than the data page cache of the |
385 | * inode, and thus there are no data pages that need writing out. Therefore, a | 385 | * inode, and thus there are no data pages that need writing out. Therefore, a |
386 | * full mark_inode_dirty() is overkill. A mark_inode_dirty_sync(), on the | 386 | * full mark_inode_dirty() is overkill. A mark_inode_dirty_sync(), on the |
387 | * other hand, is not sufficient, because I_DIRTY_DATASYNC needs to be set to | 387 | * other hand, is not sufficient, because ->write_inode needs to be called even |
388 | * ensure ->write_inode is called from generic_osync_inode() and this needs to | 388 | * in case of fdatasync. This needs to happen or the file data would not |
389 | * happen or the file data would not necessarily hit the device synchronously, | 389 | * necessarily hit the device synchronously, even though the vfs inode has the |
390 | * even though the vfs inode has the O_SYNC flag set. Also, I_DIRTY_DATASYNC | 390 | * O_SYNC flag set. Also, I_DIRTY_DATASYNC simply "feels" better than just |
391 | * simply "feels" better than just I_DIRTY_SYNC, since the file data has not | 391 | * I_DIRTY_SYNC, since the file data has not actually hit the block device yet, |
392 | * actually hit the block device yet, which is not what I_DIRTY_SYNC on its own | 392 | * which is not what I_DIRTY_SYNC on its own would suggest. |
393 | * would suggest. | ||
394 | */ | 393 | */ |
395 | void __mark_mft_record_dirty(ntfs_inode *ni) | 394 | void __mark_mft_record_dirty(ntfs_inode *ni) |
396 | { | 395 | { |
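Concretely, the behavior the rewritten comment defends is the dirty-flag choice in the function body that follows; a sketch of that existing call (the exact inode argument is assumed from context):

    /* I_DIRTY_DATASYNC guarantees ->write_inode is invoked even for
     * fdatasync-style writeback; I_DIRTY_SYNC alone would not */
    __mark_inode_dirty(VFS_I(base_ni), I_DIRTY_SYNC | I_DIRTY_DATASYNC);
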
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 9edcde4974aa..ab513ddaeff2 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c | |||
@@ -1914,7 +1914,8 @@ static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec, | |||
1914 | * immediately to their right. | 1914 | * immediately to their right. |
1915 | */ | 1915 | */ |
1916 | left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos); | 1916 | left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos); |
1917 | if (ocfs2_is_empty_extent(&right_child_el->l_recs[0])) { | 1917 | if (!ocfs2_rec_clusters(right_child_el, &right_child_el->l_recs[0])) { |
1918 | BUG_ON(right_child_el->l_tree_depth); | ||
1918 | BUG_ON(le16_to_cpu(right_child_el->l_next_free_rec) <= 1); | 1919 | BUG_ON(le16_to_cpu(right_child_el->l_next_free_rec) <= 1); |
1919 | left_clusters = le32_to_cpu(right_child_el->l_recs[1].e_cpos); | 1920 | left_clusters = le32_to_cpu(right_child_el->l_recs[1].e_cpos); |
1920 | } | 1921 | } |
@@ -2476,15 +2477,37 @@ out_ret_path: | |||
2476 | return ret; | 2477 | return ret; |
2477 | } | 2478 | } |
2478 | 2479 | ||
2479 | static void ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle, | 2480 | static int ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle, |
2480 | struct ocfs2_path *path) | 2481 | int subtree_index, struct ocfs2_path *path) |
2481 | { | 2482 | { |
2482 | int i, idx; | 2483 | int i, idx, ret; |
2483 | struct ocfs2_extent_rec *rec; | 2484 | struct ocfs2_extent_rec *rec; |
2484 | struct ocfs2_extent_list *el; | 2485 | struct ocfs2_extent_list *el; |
2485 | struct ocfs2_extent_block *eb; | 2486 | struct ocfs2_extent_block *eb; |
2486 | u32 range; | 2487 | u32 range; |
2487 | 2488 | ||
2489 | /* | ||
2490 | * In normal tree rotation process, we will never touch the | ||
2491 | * tree branch above subtree_index and ocfs2_extend_rotate_transaction | ||
2492 | * doesn't reserve the credits for them either. | ||
2493 | * | ||
2494 | * But we do have a special case here which will update the rightmost | ||
2495 | * records for all the bh in the path. | ||
2496 | * So we have to allocate extra credits and access them. | ||
2497 | */ | ||
2498 | ret = ocfs2_extend_trans(handle, | ||
2499 | handle->h_buffer_credits + subtree_index); | ||
2500 | if (ret) { | ||
2501 | mlog_errno(ret); | ||
2502 | goto out; | ||
2503 | } | ||
2504 | |||
2505 | ret = ocfs2_journal_access_path(inode, handle, path); | ||
2506 | if (ret) { | ||
2507 | mlog_errno(ret); | ||
2508 | goto out; | ||
2509 | } | ||
2510 | |||
2488 | /* Path should always be rightmost. */ | 2511 | /* Path should always be rightmost. */ |
2489 | eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data; | 2512 | eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data; |
2490 | BUG_ON(eb->h_next_leaf_blk != 0ULL); | 2513 | BUG_ON(eb->h_next_leaf_blk != 0ULL); |
@@ -2505,6 +2528,8 @@ static void ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle, | |||
2505 | 2528 | ||
2506 | ocfs2_journal_dirty(handle, path->p_node[i].bh); | 2529 | ocfs2_journal_dirty(handle, path->p_node[i].bh); |
2507 | } | 2530 | } |
2531 | out: | ||
2532 | return ret; | ||
2508 | } | 2533 | } |
2509 | 2534 | ||
2510 | static void ocfs2_unlink_path(struct inode *inode, handle_t *handle, | 2535 | static void ocfs2_unlink_path(struct inode *inode, handle_t *handle, |
@@ -2717,7 +2742,12 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle, | |||
2717 | if (del_right_subtree) { | 2742 | if (del_right_subtree) { |
2718 | ocfs2_unlink_subtree(inode, handle, left_path, right_path, | 2743 | ocfs2_unlink_subtree(inode, handle, left_path, right_path, |
2719 | subtree_index, dealloc); | 2744 | subtree_index, dealloc); |
2720 | ocfs2_update_edge_lengths(inode, handle, left_path); | 2745 | ret = ocfs2_update_edge_lengths(inode, handle, subtree_index, |
2746 | left_path); | ||
2747 | if (ret) { | ||
2748 | mlog_errno(ret); | ||
2749 | goto out; | ||
2750 | } | ||
2721 | 2751 | ||
2722 | eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data; | 2752 | eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data; |
2723 | ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno)); | 2753 | ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno)); |
@@ -3034,7 +3064,12 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle, | |||
3034 | 3064 | ||
3035 | ocfs2_unlink_subtree(inode, handle, left_path, path, | 3065 | ocfs2_unlink_subtree(inode, handle, left_path, path, |
3036 | subtree_index, dealloc); | 3066 | subtree_index, dealloc); |
3037 | ocfs2_update_edge_lengths(inode, handle, left_path); | 3067 | ret = ocfs2_update_edge_lengths(inode, handle, subtree_index, |
3068 | left_path); | ||
3069 | if (ret) { | ||
3070 | mlog_errno(ret); | ||
3071 | goto out; | ||
3072 | } | ||
3038 | 3073 | ||
3039 | eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data; | 3074 | eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data; |
3040 | ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno)); | 3075 | ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno)); |
@@ -6816,7 +6851,7 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb, | |||
6816 | } | 6851 | } |
6817 | status = 0; | 6852 | status = 0; |
6818 | bail: | 6853 | bail: |
6819 | 6854 | brelse(last_eb_bh); | |
6820 | mlog_exit(status); | 6855 | mlog_exit(status); |
6821 | return status; | 6856 | return status; |
6822 | } | 6857 | } |
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index b2c52b3a1484..8a1e61545f41 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c | |||
@@ -193,6 +193,7 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock, | |||
193 | (unsigned long long)OCFS2_I(inode)->ip_blkno); | 193 | (unsigned long long)OCFS2_I(inode)->ip_blkno); |
194 | mlog(ML_ERROR, "Size %llu, clusters %u\n", (unsigned long long)i_size_read(inode), OCFS2_I(inode)->ip_clusters); | 194 | mlog(ML_ERROR, "Size %llu, clusters %u\n", (unsigned long long)i_size_read(inode), OCFS2_I(inode)->ip_clusters); |
195 | dump_stack(); | 195 | dump_stack(); |
196 | goto bail; | ||
196 | } | 197 | } |
197 | 198 | ||
198 | past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); | 199 | past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); |
@@ -894,18 +895,17 @@ struct ocfs2_write_cluster_desc { | |||
894 | */ | 895 | */ |
895 | unsigned c_new; | 896 | unsigned c_new; |
896 | unsigned c_unwritten; | 897 | unsigned c_unwritten; |
898 | unsigned c_needs_zero; | ||
897 | }; | 899 | }; |
898 | 900 | ||
899 | static inline int ocfs2_should_zero_cluster(struct ocfs2_write_cluster_desc *d) | ||
900 | { | ||
901 | return d->c_new || d->c_unwritten; | ||
902 | } | ||
903 | |||
904 | struct ocfs2_write_ctxt { | 901 | struct ocfs2_write_ctxt { |
905 | /* Logical cluster position / len of write */ | 902 | /* Logical cluster position / len of write */ |
906 | u32 w_cpos; | 903 | u32 w_cpos; |
907 | u32 w_clen; | 904 | u32 w_clen; |
908 | 905 | ||
906 | /* First cluster allocated in a nonsparse extend */ | ||
907 | u32 w_first_new_cpos; | ||
908 | |||
909 | struct ocfs2_write_cluster_desc w_desc[OCFS2_MAX_CLUSTERS_PER_PAGE]; | 909 | struct ocfs2_write_cluster_desc w_desc[OCFS2_MAX_CLUSTERS_PER_PAGE]; |
910 | 910 | ||
911 | /* | 911 | /* |
@@ -983,6 +983,7 @@ static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp, | |||
983 | return -ENOMEM; | 983 | return -ENOMEM; |
984 | 984 | ||
985 | wc->w_cpos = pos >> osb->s_clustersize_bits; | 985 | wc->w_cpos = pos >> osb->s_clustersize_bits; |
986 | wc->w_first_new_cpos = UINT_MAX; | ||
986 | cend = (pos + len - 1) >> osb->s_clustersize_bits; | 987 | cend = (pos + len - 1) >> osb->s_clustersize_bits; |
987 | wc->w_clen = cend - wc->w_cpos + 1; | 988 | wc->w_clen = cend - wc->w_cpos + 1; |
988 | get_bh(di_bh); | 989 | get_bh(di_bh); |
@@ -1217,20 +1218,18 @@ out: | |||
1217 | */ | 1218 | */ |
1218 | static int ocfs2_write_cluster(struct address_space *mapping, | 1219 | static int ocfs2_write_cluster(struct address_space *mapping, |
1219 | u32 phys, unsigned int unwritten, | 1220 | u32 phys, unsigned int unwritten, |
1221 | unsigned int should_zero, | ||
1220 | struct ocfs2_alloc_context *data_ac, | 1222 | struct ocfs2_alloc_context *data_ac, |
1221 | struct ocfs2_alloc_context *meta_ac, | 1223 | struct ocfs2_alloc_context *meta_ac, |
1222 | struct ocfs2_write_ctxt *wc, u32 cpos, | 1224 | struct ocfs2_write_ctxt *wc, u32 cpos, |
1223 | loff_t user_pos, unsigned user_len) | 1225 | loff_t user_pos, unsigned user_len) |
1224 | { | 1226 | { |
1225 | int ret, i, new, should_zero = 0; | 1227 | int ret, i, new; |
1226 | u64 v_blkno, p_blkno; | 1228 | u64 v_blkno, p_blkno; |
1227 | struct inode *inode = mapping->host; | 1229 | struct inode *inode = mapping->host; |
1228 | struct ocfs2_extent_tree et; | 1230 | struct ocfs2_extent_tree et; |
1229 | 1231 | ||
1230 | new = phys == 0 ? 1 : 0; | 1232 | new = phys == 0 ? 1 : 0; |
1231 | if (new || unwritten) | ||
1232 | should_zero = 1; | ||
1233 | |||
1234 | if (new) { | 1233 | if (new) { |
1235 | u32 tmp_pos; | 1234 | u32 tmp_pos; |
1236 | 1235 | ||
@@ -1301,7 +1300,7 @@ static int ocfs2_write_cluster(struct address_space *mapping, | |||
1301 | if (tmpret) { | 1300 | if (tmpret) { |
1302 | mlog_errno(tmpret); | 1301 | mlog_errno(tmpret); |
1303 | if (ret == 0) | 1302 | if (ret == 0) |
1304 | tmpret = ret; | 1303 | ret = tmpret; |
1305 | } | 1304 | } |
1306 | } | 1305 | } |
1307 | 1306 | ||
@@ -1341,7 +1340,9 @@ static int ocfs2_write_cluster_by_desc(struct address_space *mapping, | |||
1341 | local_len = osb->s_clustersize - cluster_off; | 1340 | local_len = osb->s_clustersize - cluster_off; |
1342 | 1341 | ||
1343 | ret = ocfs2_write_cluster(mapping, desc->c_phys, | 1342 | ret = ocfs2_write_cluster(mapping, desc->c_phys, |
1344 | desc->c_unwritten, data_ac, meta_ac, | 1343 | desc->c_unwritten, |
1344 | desc->c_needs_zero, | ||
1345 | data_ac, meta_ac, | ||
1345 | wc, desc->c_cpos, pos, local_len); | 1346 | wc, desc->c_cpos, pos, local_len); |
1346 | if (ret) { | 1347 | if (ret) { |
1347 | mlog_errno(ret); | 1348 | mlog_errno(ret); |
@@ -1391,14 +1392,14 @@ static void ocfs2_set_target_boundaries(struct ocfs2_super *osb, | |||
1391 | * newly allocated cluster. | 1392 | * newly allocated cluster. |
1392 | */ | 1393 | */ |
1393 | desc = &wc->w_desc[0]; | 1394 | desc = &wc->w_desc[0]; |
1394 | if (ocfs2_should_zero_cluster(desc)) | 1395 | if (desc->c_needs_zero) |
1395 | ocfs2_figure_cluster_boundaries(osb, | 1396 | ocfs2_figure_cluster_boundaries(osb, |
1396 | desc->c_cpos, | 1397 | desc->c_cpos, |
1397 | &wc->w_target_from, | 1398 | &wc->w_target_from, |
1398 | NULL); | 1399 | NULL); |
1399 | 1400 | ||
1400 | desc = &wc->w_desc[wc->w_clen - 1]; | 1401 | desc = &wc->w_desc[wc->w_clen - 1]; |
1401 | if (ocfs2_should_zero_cluster(desc)) | 1402 | if (desc->c_needs_zero) |
1402 | ocfs2_figure_cluster_boundaries(osb, | 1403 | ocfs2_figure_cluster_boundaries(osb, |
1403 | desc->c_cpos, | 1404 | desc->c_cpos, |
1404 | NULL, | 1405 | NULL, |
@@ -1466,13 +1467,28 @@ static int ocfs2_populate_write_desc(struct inode *inode, | |||
1466 | phys++; | 1467 | phys++; |
1467 | } | 1468 | } |
1468 | 1469 | ||
1470 | /* | ||
1471 | * If w_first_new_cpos is < UINT_MAX, we have a non-sparse | ||
1472 | * file that got extended. w_first_new_cpos tells us | ||
1473 | * where the newly allocated clusters are so we can | ||
1474 | * zero them. | ||
1475 | */ | ||
1476 | if (desc->c_cpos >= wc->w_first_new_cpos) { | ||
1477 | BUG_ON(phys == 0); | ||
1478 | desc->c_needs_zero = 1; | ||
1479 | } | ||
1480 | |||
1469 | desc->c_phys = phys; | 1481 | desc->c_phys = phys; |
1470 | if (phys == 0) { | 1482 | if (phys == 0) { |
1471 | desc->c_new = 1; | 1483 | desc->c_new = 1; |
1484 | desc->c_needs_zero = 1; | ||
1472 | *clusters_to_alloc = *clusters_to_alloc + 1; | 1485 | *clusters_to_alloc = *clusters_to_alloc + 1; |
1473 | } | 1486 | } |
1474 | if (ext_flags & OCFS2_EXT_UNWRITTEN) | 1487 | |
1488 | if (ext_flags & OCFS2_EXT_UNWRITTEN) { | ||
1475 | desc->c_unwritten = 1; | 1489 | desc->c_unwritten = 1; |
1490 | desc->c_needs_zero = 1; | ||
1491 | } | ||
1476 | 1492 | ||
1477 | num_clusters--; | 1493 | num_clusters--; |
1478 | } | 1494 | } |
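With ocfs2_should_zero_cluster() gone, the zeroing decision is computed once while the descriptors are built rather than re-derived later. Collapsed into a single hedged expression (a restatement of the three branches above, not verbatim kernel code), a cluster needs zeroing when:

    desc->c_needs_zero = (phys == 0)                          /* fresh allocation */
            || (ext_flags & OCFS2_EXT_UNWRITTEN)              /* unwritten extent */
            || (desc->c_cpos >= wc->w_first_new_cpos);        /* nonsparse extend */
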
@@ -1632,10 +1648,13 @@ static int ocfs2_expand_nonsparse_inode(struct inode *inode, loff_t pos, | |||
1632 | if (newsize <= i_size_read(inode)) | 1648 | if (newsize <= i_size_read(inode)) |
1633 | return 0; | 1649 | return 0; |
1634 | 1650 | ||
1635 | ret = ocfs2_extend_no_holes(inode, newsize, newsize - len); | 1651 | ret = ocfs2_extend_no_holes(inode, newsize, pos); |
1636 | if (ret) | 1652 | if (ret) |
1637 | mlog_errno(ret); | 1653 | mlog_errno(ret); |
1638 | 1654 | ||
1655 | wc->w_first_new_cpos = | ||
1656 | ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode)); | ||
1657 | |||
1639 | return ret; | 1658 | return ret; |
1640 | } | 1659 | } |
1641 | 1660 | ||
@@ -1644,7 +1663,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping, | |||
1644 | struct page **pagep, void **fsdata, | 1663 | struct page **pagep, void **fsdata, |
1645 | struct buffer_head *di_bh, struct page *mmap_page) | 1664 | struct buffer_head *di_bh, struct page *mmap_page) |
1646 | { | 1665 | { |
1647 | int ret, credits = OCFS2_INODE_UPDATE_CREDITS; | 1666 | int ret, cluster_of_pages, credits = OCFS2_INODE_UPDATE_CREDITS; |
1648 | unsigned int clusters_to_alloc, extents_to_split; | 1667 | unsigned int clusters_to_alloc, extents_to_split; |
1649 | struct ocfs2_write_ctxt *wc; | 1668 | struct ocfs2_write_ctxt *wc; |
1650 | struct inode *inode = mapping->host; | 1669 | struct inode *inode = mapping->host; |
@@ -1722,8 +1741,19 @@ int ocfs2_write_begin_nolock(struct address_space *mapping, | |||
1722 | 1741 | ||
1723 | } | 1742 | } |
1724 | 1743 | ||
1725 | ocfs2_set_target_boundaries(osb, wc, pos, len, | 1744 | /* |
1726 | clusters_to_alloc + extents_to_split); | 1745 | * We have to zero sparse allocated clusters, unwritten extent clusters, |
1746 | * and non-sparse clusters we just extended. For non-sparse writes, | ||
1747 | * we know zeros will only be needed in the first and/or last cluster. | ||
1748 | */ | ||
1749 | if (clusters_to_alloc || extents_to_split || | ||
1750 | (wc->w_clen && (wc->w_desc[0].c_needs_zero || | ||
1751 | wc->w_desc[wc->w_clen - 1].c_needs_zero))) | ||
1752 | cluster_of_pages = 1; | ||
1753 | else | ||
1754 | cluster_of_pages = 0; | ||
1755 | |||
1756 | ocfs2_set_target_boundaries(osb, wc, pos, len, cluster_of_pages); | ||
1727 | 1757 | ||
1728 | handle = ocfs2_start_trans(osb, credits); | 1758 | handle = ocfs2_start_trans(osb, credits); |
1729 | if (IS_ERR(handle)) { | 1759 | if (IS_ERR(handle)) { |
@@ -1756,8 +1786,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping, | |||
1756 | * extent. | 1786 | * extent. |
1757 | */ | 1787 | */ |
1758 | ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, | 1788 | ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, |
1759 | clusters_to_alloc + extents_to_split, | 1789 | cluster_of_pages, mmap_page); |
1760 | mmap_page); | ||
1761 | if (ret) { | 1790 | if (ret) { |
1762 | mlog_errno(ret); | 1791 | mlog_errno(ret); |
1763 | goto out_quota; | 1792 | goto out_quota; |
diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c index b574431a031d..b4957c7d9fe2 100644 --- a/fs/ocfs2/dcache.c +++ b/fs/ocfs2/dcache.c | |||
@@ -85,6 +85,17 @@ static int ocfs2_dentry_revalidate(struct dentry *dentry, | |||
85 | goto bail; | 85 | goto bail; |
86 | } | 86 | } |
87 | 87 | ||
88 | /* | ||
89 | * If the last lookup failed to create dentry lock, let us | ||
90 | * redo it. | ||
91 | */ | ||
92 | if (!dentry->d_fsdata) { | ||
93 | mlog(0, "Inode %llu doesn't have dentry lock, " | ||
94 | "returning false\n", | ||
95 | (unsigned long long)OCFS2_I(inode)->ip_blkno); | ||
96 | goto bail; | ||
97 | } | ||
98 | |||
88 | ret = 1; | 99 | ret = 1; |
89 | 100 | ||
90 | bail: | 101 | bail: |
@@ -310,22 +321,19 @@ out_attach: | |||
310 | return ret; | 321 | return ret; |
311 | } | 322 | } |
312 | 323 | ||
313 | static DEFINE_SPINLOCK(dentry_list_lock); | 324 | DEFINE_SPINLOCK(dentry_list_lock); |
314 | 325 | ||
315 | /* We limit the number of dentry locks to drop in one go. We have | 326 | /* We limit the number of dentry locks to drop in one go. We have |
316 | * this limit so that we don't starve other users of ocfs2_wq. */ | 327 | * this limit so that we don't starve other users of ocfs2_wq. */ |
317 | #define DL_INODE_DROP_COUNT 64 | 328 | #define DL_INODE_DROP_COUNT 64 |
318 | 329 | ||
319 | /* Drop inode references from dentry locks */ | 330 | /* Drop inode references from dentry locks */ |
320 | void ocfs2_drop_dl_inodes(struct work_struct *work) | 331 | static void __ocfs2_drop_dl_inodes(struct ocfs2_super *osb, int drop_count) |
321 | { | 332 | { |
322 | struct ocfs2_super *osb = container_of(work, struct ocfs2_super, | ||
323 | dentry_lock_work); | ||
324 | struct ocfs2_dentry_lock *dl; | 333 | struct ocfs2_dentry_lock *dl; |
325 | int drop_count = DL_INODE_DROP_COUNT; | ||
326 | 334 | ||
327 | spin_lock(&dentry_list_lock); | 335 | spin_lock(&dentry_list_lock); |
328 | while (osb->dentry_lock_list && drop_count--) { | 336 | while (osb->dentry_lock_list && (drop_count < 0 || drop_count--)) { |
329 | dl = osb->dentry_lock_list; | 337 | dl = osb->dentry_lock_list; |
330 | osb->dentry_lock_list = dl->dl_next; | 338 | osb->dentry_lock_list = dl->dl_next; |
331 | spin_unlock(&dentry_list_lock); | 339 | spin_unlock(&dentry_list_lock); |
@@ -333,11 +341,32 @@ void ocfs2_drop_dl_inodes(struct work_struct *work) | |||
333 | kfree(dl); | 341 | kfree(dl); |
334 | spin_lock(&dentry_list_lock); | 342 | spin_lock(&dentry_list_lock); |
335 | } | 343 | } |
336 | if (osb->dentry_lock_list) | 344 | spin_unlock(&dentry_list_lock); |
345 | } | ||
346 | |||
347 | void ocfs2_drop_dl_inodes(struct work_struct *work) | ||
348 | { | ||
349 | struct ocfs2_super *osb = container_of(work, struct ocfs2_super, | ||
350 | dentry_lock_work); | ||
351 | |||
352 | __ocfs2_drop_dl_inodes(osb, DL_INODE_DROP_COUNT); | ||
353 | /* | ||
354 | * Don't queue dropping if umount is in progress. We flush the | ||
355 | * list in ocfs2_dismount_volume | ||
356 | */ | ||
357 | spin_lock(&dentry_list_lock); | ||
358 | if (osb->dentry_lock_list && | ||
359 | !ocfs2_test_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED)) | ||
337 | queue_work(ocfs2_wq, &osb->dentry_lock_work); | 360 | queue_work(ocfs2_wq, &osb->dentry_lock_work); |
338 | spin_unlock(&dentry_list_lock); | 361 | spin_unlock(&dentry_list_lock); |
339 | } | 362 | } |
340 | 363 | ||
364 | /* Flush the whole work queue */ | ||
365 | void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb) | ||
366 | { | ||
367 | __ocfs2_drop_dl_inodes(osb, -1); | ||
368 | } | ||
369 | |||
341 | /* | 370 | /* |
342 | * ocfs2_dentry_iput() and friends. | 371 | * ocfs2_dentry_iput() and friends. |
343 | * | 372 | * |
@@ -368,7 +397,8 @@ static void ocfs2_drop_dentry_lock(struct ocfs2_super *osb, | |||
368 | /* We leave dropping of inode reference to ocfs2_wq as that can | 397 | /* We leave dropping of inode reference to ocfs2_wq as that can |
369 | * possibly lead to inode deletion which gets tricky */ | 398 | * possibly lead to inode deletion which gets tricky */ |
370 | spin_lock(&dentry_list_lock); | 399 | spin_lock(&dentry_list_lock); |
371 | if (!osb->dentry_lock_list) | 400 | if (!osb->dentry_lock_list && |
401 | !ocfs2_test_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED)) | ||
372 | queue_work(ocfs2_wq, &osb->dentry_lock_work); | 402 | queue_work(ocfs2_wq, &osb->dentry_lock_work); |
373 | dl->dl_next = osb->dentry_lock_list; | 403 | dl->dl_next = osb->dentry_lock_list; |
374 | osb->dentry_lock_list = dl; | 404 | osb->dentry_lock_list = dl; |
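The two OCFS2_OSB_DROP_DENTRY_LOCK_IMMED checks only close the race together with an unmount-side counterpart that this excerpt does not show. A hedged sketch of how ocfs2_dismount_volume() is presumably expected to pair with them; the exact calls and their order here are assumptions:

    /* stop the work item from re-queueing itself, then drain the
     * remaining dentry locks synchronously */
    ocfs2_set_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED);
    cancel_work_sync(&osb->dentry_lock_work);
    ocfs2_drop_all_dl_inodes(osb);      /* drop_count == -1: flush everything */
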
diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h index faa12e75f98d..f5dd1789acf1 100644 --- a/fs/ocfs2/dcache.h +++ b/fs/ocfs2/dcache.h | |||
@@ -49,10 +49,13 @@ struct ocfs2_dentry_lock { | |||
49 | int ocfs2_dentry_attach_lock(struct dentry *dentry, struct inode *inode, | 49 | int ocfs2_dentry_attach_lock(struct dentry *dentry, struct inode *inode, |
50 | u64 parent_blkno); | 50 | u64 parent_blkno); |
51 | 51 | ||
52 | extern spinlock_t dentry_list_lock; | ||
53 | |||
52 | void ocfs2_dentry_lock_put(struct ocfs2_super *osb, | 54 | void ocfs2_dentry_lock_put(struct ocfs2_super *osb, |
53 | struct ocfs2_dentry_lock *dl); | 55 | struct ocfs2_dentry_lock *dl); |
54 | 56 | ||
55 | void ocfs2_drop_dl_inodes(struct work_struct *work); | 57 | void ocfs2_drop_dl_inodes(struct work_struct *work); |
58 | void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb); | ||
56 | 59 | ||
57 | struct dentry *ocfs2_find_local_alias(struct inode *inode, u64 parent_blkno, | 60 | struct dentry *ocfs2_find_local_alias(struct inode *inode, u64 parent_blkno, |
58 | int skip_unhashed); | 61 | int skip_unhashed); |
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c index d07ddbe4b283..81eff8e58322 100644 --- a/fs/ocfs2/dlm/dlmast.c +++ b/fs/ocfs2/dlm/dlmast.c | |||
@@ -103,7 +103,6 @@ static void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock) | |||
103 | lock->ast_pending, lock->ml.type); | 103 | lock->ast_pending, lock->ml.type); |
104 | BUG(); | 104 | BUG(); |
105 | } | 105 | } |
106 | BUG_ON(!list_empty(&lock->ast_list)); | ||
107 | if (lock->ast_pending) | 106 | if (lock->ast_pending) |
108 | mlog(0, "lock has an ast getting flushed right now\n"); | 107 | mlog(0, "lock has an ast getting flushed right now\n"); |
109 | 108 | ||
diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c index 1c9efb406a96..02bf17808bdc 100644 --- a/fs/ocfs2/dlm/dlmfs.c +++ b/fs/ocfs2/dlm/dlmfs.c | |||
@@ -325,6 +325,7 @@ clear_fields: | |||
325 | } | 325 | } |
326 | 326 | ||
327 | static struct backing_dev_info dlmfs_backing_dev_info = { | 327 | static struct backing_dev_info dlmfs_backing_dev_info = { |
328 | .name = "ocfs2-dlmfs", | ||
328 | .ra_pages = 0, /* No readahead */ | 329 | .ra_pages = 0, /* No readahead */ |
329 | .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, | 330 | .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, |
330 | }; | 331 | }; |
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index bcb9260c3735..43e6e3280569 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c | |||
@@ -1118,7 +1118,7 @@ static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm, | |||
1118 | 1118 | ||
1119 | mlog(0, "%s:%.*s: sending mig lockres (%s) to %u\n", | 1119 | mlog(0, "%s:%.*s: sending mig lockres (%s) to %u\n", |
1120 | dlm->name, res->lockname.len, res->lockname.name, | 1120 | dlm->name, res->lockname.len, res->lockname.name, |
1121 | orig_flags & DLM_MRES_MIGRATION ? "migrate" : "recovery", | 1121 | orig_flags & DLM_MRES_MIGRATION ? "migration" : "recovery", |
1122 | send_to); | 1122 | send_to); |
1123 | 1123 | ||
1124 | /* send it */ | 1124 | /* send it */ |
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c index fcf879ed6930..756f5b0998e0 100644 --- a/fs/ocfs2/dlm/dlmunlock.c +++ b/fs/ocfs2/dlm/dlmunlock.c | |||
@@ -122,7 +122,7 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm, | |||
122 | * that still has AST's pending... */ | 122 | * that still has AST's pending... */ |
123 | in_use = !list_empty(&lock->ast_list); | 123 | in_use = !list_empty(&lock->ast_list); |
124 | spin_unlock(&dlm->ast_lock); | 124 | spin_unlock(&dlm->ast_lock); |
125 | if (in_use) { | 125 | if (in_use && !(flags & LKM_CANCEL)) { |
126 | mlog(ML_ERROR, "lockres %.*s: Someone is calling dlmunlock " | 126 | mlog(ML_ERROR, "lockres %.*s: Someone is calling dlmunlock " |
127 | "while waiting for an ast!", res->lockname.len, | 127 | "while waiting for an ast!", res->lockname.len, |
128 | res->lockname.name); | 128 | res->lockname.name); |
@@ -131,7 +131,7 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm, | |||
131 | 131 | ||
132 | spin_lock(&res->spinlock); | 132 | spin_lock(&res->spinlock); |
133 | if (res->state & DLM_LOCK_RES_IN_PROGRESS) { | 133 | if (res->state & DLM_LOCK_RES_IN_PROGRESS) { |
134 | if (master_node) { | 134 | if (master_node && !(flags & LKM_CANCEL)) { |
135 | mlog(ML_ERROR, "lockres in progress!\n"); | 135 | mlog(ML_ERROR, "lockres in progress!\n"); |
136 | spin_unlock(&res->spinlock); | 136 | spin_unlock(&res->spinlock); |
137 | return DLM_FORWARD; | 137 | return DLM_FORWARD; |
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 62442e413a00..221c5e98957b 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c | |||
@@ -1851,6 +1851,7 @@ relock: | |||
1851 | if (ret) | 1851 | if (ret) |
1852 | goto out_dio; | 1852 | goto out_dio; |
1853 | 1853 | ||
1854 | count = ocount; | ||
1854 | ret = generic_write_checks(file, ppos, &count, | 1855 | ret = generic_write_checks(file, ppos, &count, |
1855 | S_ISBLK(inode->i_mode)); | 1856 | S_ISBLK(inode->i_mode)); |
1856 | if (ret) | 1857 | if (ret) |
@@ -1870,8 +1871,7 @@ relock: | |||
1870 | goto out_dio; | 1871 | goto out_dio; |
1871 | } | 1872 | } |
1872 | } else { | 1873 | } else { |
1873 | written = generic_file_aio_write_nolock(iocb, iov, nr_segs, | 1874 | written = __generic_file_aio_write(iocb, iov, nr_segs, ppos); |
1874 | *ppos); | ||
1875 | } | 1875 | } |
1876 | 1876 | ||
1877 | out_dio: | 1877 | out_dio: |
@@ -1879,18 +1879,21 @@ out_dio: | |||
1879 | BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT)); | 1879 | BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT)); |
1880 | 1880 | ||
1881 | if ((file->f_flags & O_SYNC && !direct_io) || IS_SYNC(inode)) { | 1881 | if ((file->f_flags & O_SYNC && !direct_io) || IS_SYNC(inode)) { |
1882 | /* | 1882 | ret = filemap_fdatawrite_range(file->f_mapping, pos, |
1883 | * The generic write paths have handled getting data | 1883 | pos + count - 1); |
1884 | * to disk, but since we don't make use of the dirty | 1884 | if (ret < 0) |
1885 | * inode list, a manual journal commit is necessary | 1885 | written = ret; |
1886 | * here. | 1886 | |
1887 | */ | 1887 | if (!ret && (old_size != i_size_read(inode) || |
1888 | if (old_size != i_size_read(inode) || | 1888 | old_clusters != OCFS2_I(inode)->ip_clusters)) { |
1889 | old_clusters != OCFS2_I(inode)->ip_clusters) { | ||
1890 | ret = jbd2_journal_force_commit(osb->journal->j_journal); | 1889 | ret = jbd2_journal_force_commit(osb->journal->j_journal); |
1891 | if (ret < 0) | 1890 | if (ret < 0) |
1892 | written = ret; | 1891 | written = ret; |
1893 | } | 1892 | } |
1893 | |||
1894 | if (!ret) | ||
1895 | ret = filemap_fdatawait_range(file->f_mapping, pos, | ||
1896 | pos + count - 1); | ||
1894 | } | 1897 | } |
1895 | 1898 | ||
1896 | /* | 1899 | /* |
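The rewritten O_SYNC tail separates starting the data write-out, forcing the journal commit, and waiting on the data, so the commit can overlap in-flight page I/O instead of serializing behind it. Stripped of the ocfs2 plumbing, the ordering is (hedged restatement; metadata_changed stands in for the i_size/ip_clusters comparison above):

    ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1);
    if (!ret && metadata_changed)       /* i_size or ip_clusters moved */
            ret = jbd2_journal_force_commit(journal);
    if (!ret)
            ret = filemap_fdatawait_range(mapping, pos, pos + count - 1);
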
@@ -1918,8 +1921,10 @@ out_sems: | |||
1918 | 1921 | ||
1919 | mutex_unlock(&inode->i_mutex); | 1922 | mutex_unlock(&inode->i_mutex); |
1920 | 1923 | ||
1924 | if (written) | ||
1925 | ret = written; | ||
1921 | mlog_exit(ret); | 1926 | mlog_exit(ret); |
1922 | return written ? written : ret; | 1927 | return ret; |
1923 | } | 1928 | } |
1924 | 1929 | ||
1925 | static int ocfs2_splice_to_file(struct pipe_inode_info *pipe, | 1930 | static int ocfs2_splice_to_file(struct pipe_inode_info *pipe, |
@@ -1988,31 +1993,16 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe, | |||
1988 | 1993 | ||
1989 | if (ret > 0) { | 1994 | if (ret > 0) { |
1990 | unsigned long nr_pages; | 1995 | unsigned long nr_pages; |
1996 | int err; | ||
1991 | 1997 | ||
1992 | *ppos += ret; | ||
1993 | nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | 1998 | nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; |
1994 | 1999 | ||
1995 | /* | 2000 | err = generic_write_sync(out, *ppos, ret); |
1996 | * If file or inode is SYNC and we actually wrote some data, | 2001 | if (err) |
1997 | * sync it. | 2002 | ret = err; |
1998 | */ | 2003 | else |
1999 | if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) { | 2004 | *ppos += ret; |
2000 | int err; | ||
2001 | |||
2002 | mutex_lock(&inode->i_mutex); | ||
2003 | err = ocfs2_rw_lock(inode, 1); | ||
2004 | if (err < 0) { | ||
2005 | mlog_errno(err); | ||
2006 | } else { | ||
2007 | err = generic_osync_inode(inode, mapping, | ||
2008 | OSYNC_METADATA|OSYNC_DATA); | ||
2009 | ocfs2_rw_unlock(inode, 1); | ||
2010 | } | ||
2011 | mutex_unlock(&inode->i_mutex); | ||
2012 | 2005 | ||
2013 | if (err) | ||
2014 | ret = err; | ||
2015 | } | ||
2016 | balance_dirty_pages_ratelimited_nr(mapping, nr_pages); | 2006 | balance_dirty_pages_ratelimited_nr(mapping, nr_pages); |
2017 | } | 2007 | } |
2018 | 2008 | ||
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index f033760ecbea..c48b93ac6b65 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c | |||
@@ -1954,10 +1954,16 @@ void ocfs2_orphan_scan_init(struct ocfs2_super *osb) | |||
1954 | os->os_osb = osb; | 1954 | os->os_osb = osb; |
1955 | os->os_count = 0; | 1955 | os->os_count = 0; |
1956 | os->os_seqno = 0; | 1956 | os->os_seqno = 0; |
1957 | os->os_scantime = CURRENT_TIME; | ||
1958 | mutex_init(&os->os_lock); | 1957 | mutex_init(&os->os_lock); |
1959 | INIT_DELAYED_WORK(&os->os_orphan_scan_work, ocfs2_orphan_scan_work); | 1958 | INIT_DELAYED_WORK(&os->os_orphan_scan_work, ocfs2_orphan_scan_work); |
1959 | } | ||
1960 | 1960 | ||
1961 | void ocfs2_orphan_scan_start(struct ocfs2_super *osb) | ||
1962 | { | ||
1963 | struct ocfs2_orphan_scan *os; | ||
1964 | |||
1965 | os = &osb->osb_orphan_scan; | ||
1966 | os->os_scantime = CURRENT_TIME; | ||
1961 | if (ocfs2_is_hard_readonly(osb) || ocfs2_mount_local(osb)) | 1967 | if (ocfs2_is_hard_readonly(osb) || ocfs2_mount_local(osb)) |
1962 | atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE); | 1968 | atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE); |
1963 | else { | 1969 | else { |
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 5432c7f79cc6..2c3222aec622 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h | |||
@@ -145,6 +145,7 @@ static inline void ocfs2_inode_set_new(struct ocfs2_super *osb, | |||
145 | 145 | ||
146 | /* Exported only for the journal struct init code in super.c. Do not call. */ | 146 | /* Exported only for the journal struct init code in super.c. Do not call. */ |
147 | void ocfs2_orphan_scan_init(struct ocfs2_super *osb); | 147 | void ocfs2_orphan_scan_init(struct ocfs2_super *osb); |
148 | void ocfs2_orphan_scan_start(struct ocfs2_super *osb); | ||
148 | void ocfs2_orphan_scan_stop(struct ocfs2_super *osb); | 149 | void ocfs2_orphan_scan_stop(struct ocfs2_super *osb); |
149 | void ocfs2_orphan_scan_exit(struct ocfs2_super *osb); | 150 | void ocfs2_orphan_scan_exit(struct ocfs2_super *osb); |
150 | 151 | ||
@@ -329,20 +330,27 @@ int ocfs2_journal_dirty(handle_t *handle, | |||
329 | /* extended attribute block update */ | 330 | /* extended attribute block update */ |
330 | #define OCFS2_XATTR_BLOCK_UPDATE_CREDITS 1 | 331 | #define OCFS2_XATTR_BLOCK_UPDATE_CREDITS 1 |
331 | 332 | ||
333 | /* Update of a single quota block */ | ||
334 | #define OCFS2_QUOTA_BLOCK_UPDATE_CREDITS 1 | ||
335 | |||
332 | /* global quotafile inode update, data block */ | 336 | /* global quotafile inode update, data block */ |
333 | #define OCFS2_QINFO_WRITE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1) | 337 | #define OCFS2_QINFO_WRITE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + \ |
338 | OCFS2_QUOTA_BLOCK_UPDATE_CREDITS) | ||
334 | 339 | ||
340 | #define OCFS2_LOCAL_QINFO_WRITE_CREDITS OCFS2_QUOTA_BLOCK_UPDATE_CREDITS | ||
335 | /* | 341 | /* |
336 | * The two writes below can accidentally see global info dirty due | 342 | * The two writes below can accidentally see global info dirty due |
337 | * to set_info() quotactl so make them prepared for the writes. | 343 | * to set_info() quotactl so make them prepared for the writes. |
338 | */ | 344 | */ |
339 | /* quota data block, global info */ | 345 | /* quota data block, global info */ |
340 | /* Write to local quota file */ | 346 | /* Write to local quota file */ |
341 | #define OCFS2_QWRITE_CREDITS (OCFS2_QINFO_WRITE_CREDITS + 1) | 347 | #define OCFS2_QWRITE_CREDITS (OCFS2_QINFO_WRITE_CREDITS + \ |
348 | OCFS2_QUOTA_BLOCK_UPDATE_CREDITS) | ||
342 | 349 | ||
343 | /* global quota data block, local quota data block, global quota inode, | 350 | /* global quota data block, local quota data block, global quota inode, |
344 | * global quota info */ | 351 | * global quota info */ |
345 | #define OCFS2_QSYNC_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 3) | 352 | #define OCFS2_QSYNC_CREDITS (OCFS2_QINFO_WRITE_CREDITS + \ |
353 | 2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS) | ||
346 | 354 | ||
347 | static inline int ocfs2_quota_trans_credits(struct super_block *sb) | 355 | static inline int ocfs2_quota_trans_credits(struct super_block *sb) |
348 | { | 356 | { |
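Expanding the rewritten macros with OCFS2_QUOTA_BLOCK_UPDATE_CREDITS == 1 shows the totals are unchanged; the rewrite only derives them explicitly from quota-block updates:

    QINFO_WRITE = INODE_UPDATE + 1                    (before: INODE_UPDATE + 1)
    QWRITE      = QINFO_WRITE + 1 = INODE_UPDATE + 2  (before: the same formula)
    QSYNC       = QINFO_WRITE + 2 = INODE_UPDATE + 3  (before: INODE_UPDATE + 3)
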
@@ -355,11 +363,6 @@ static inline int ocfs2_quota_trans_credits(struct super_block *sb) | |||
355 | return credits; | 363 | return credits; |
356 | } | 364 | } |
357 | 365 | ||
358 | /* Number of credits needed for removing quota structure from file */ | ||
359 | int ocfs2_calc_qdel_credits(struct super_block *sb, int type); | ||
360 | /* Number of credits needed for initialization of new quota structure */ | ||
361 | int ocfs2_calc_qinit_credits(struct super_block *sb, int type); | ||
362 | |||
363 | /* group extend. inode update and last group update. */ | 366 | /* group extend. inode update and last group update. */ |
364 | #define OCFS2_GROUP_EXTEND_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1) | 367 | #define OCFS2_GROUP_EXTEND_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1) |
365 | 368 | ||
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index c9345ebb8493..39e1d5a39505 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h | |||
@@ -224,10 +224,12 @@ enum ocfs2_mount_options | |||
224 | OCFS2_MOUNT_GRPQUOTA = 1 << 10, /* We support group quotas */ | 224 | OCFS2_MOUNT_GRPQUOTA = 1 << 10, /* We support group quotas */ |
225 | }; | 225 | }; |
226 | 226 | ||
227 | #define OCFS2_OSB_SOFT_RO 0x0001 | 227 | #define OCFS2_OSB_SOFT_RO 0x0001 |
228 | #define OCFS2_OSB_HARD_RO 0x0002 | 228 | #define OCFS2_OSB_HARD_RO 0x0002 |
229 | #define OCFS2_OSB_ERROR_FS 0x0004 | 229 | #define OCFS2_OSB_ERROR_FS 0x0004 |
230 | #define OCFS2_DEFAULT_ATIME_QUANTUM 60 | 230 | #define OCFS2_OSB_DROP_DENTRY_LOCK_IMMED 0x0008 |
231 | |||
232 | #define OCFS2_DEFAULT_ATIME_QUANTUM 60 | ||
231 | 233 | ||
232 | struct ocfs2_journal; | 234 | struct ocfs2_journal; |
233 | struct ocfs2_slot_info; | 235 | struct ocfs2_slot_info; |
@@ -490,6 +492,18 @@ static inline void ocfs2_set_osb_flag(struct ocfs2_super *osb, | |||
490 | spin_unlock(&osb->osb_lock); | 492 | spin_unlock(&osb->osb_lock); |
491 | } | 493 | } |
492 | 494 | ||
495 | |||
496 | static inline unsigned long ocfs2_test_osb_flag(struct ocfs2_super *osb, | ||
497 | unsigned long flag) | ||
498 | { | ||
499 | unsigned long ret; | ||
500 | |||
501 | spin_lock(&osb->osb_lock); | ||
502 | ret = osb->osb_flags & flag; | ||
503 | spin_unlock(&osb->osb_lock); | ||
504 | return ret; | ||
505 | } | ||
506 | |||
493 | static inline void ocfs2_set_ro_flag(struct ocfs2_super *osb, | 507 | static inline void ocfs2_set_ro_flag(struct ocfs2_super *osb, |
494 | int hard) | 508 | int hard) |
495 | { | 509 | { |
diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h index fcdba091af3d..c212cf5a2bdf 100644 --- a/fs/ocfs2/ocfs2_lockid.h +++ b/fs/ocfs2/ocfs2_lockid.h | |||
@@ -108,6 +108,7 @@ static char *ocfs2_lock_type_strings[] = { | |||
108 | [OCFS2_LOCK_TYPE_OPEN] = "Open", | 108 | [OCFS2_LOCK_TYPE_OPEN] = "Open", |
109 | [OCFS2_LOCK_TYPE_FLOCK] = "Flock", | 109 | [OCFS2_LOCK_TYPE_FLOCK] = "Flock", |
110 | [OCFS2_LOCK_TYPE_QINFO] = "Quota", | 110 | [OCFS2_LOCK_TYPE_QINFO] = "Quota", |
111 | [OCFS2_LOCK_TYPE_NFS_SYNC] = "NFSSync", | ||
111 | [OCFS2_LOCK_TYPE_ORPHAN_SCAN] = "OrphanScan", | 112 | [OCFS2_LOCK_TYPE_ORPHAN_SCAN] = "OrphanScan", |
112 | }; | 113 | }; |
113 | 114 | ||
diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h index 7365e2e08706..3fb96fcd4c81 100644 --- a/fs/ocfs2/quota.h +++ b/fs/ocfs2/quota.h | |||
@@ -50,7 +50,6 @@ struct ocfs2_mem_dqinfo { | |||
50 | unsigned int dqi_chunks; /* Number of chunks in local quota file */ | 50 | unsigned int dqi_chunks; /* Number of chunks in local quota file */ |
51 | unsigned int dqi_blocks; /* Number of blocks allocated for local quota file */ | 51 | unsigned int dqi_blocks; /* Number of blocks allocated for local quota file */ |
52 | unsigned int dqi_syncms; /* How often should we sync with other nodes */ | 52 | unsigned int dqi_syncms; /* How often should we sync with other nodes */ |
53 | unsigned int dqi_syncjiff; /* Precomputed dqi_syncms in jiffies */ | ||
54 | struct list_head dqi_chunk; /* List of chunks */ | 53 | struct list_head dqi_chunk; /* List of chunks */ |
55 | struct inode *dqi_gqinode; /* Global quota file inode */ | 54 | struct inode *dqi_gqinode; /* Global quota file inode */ |
56 | struct ocfs2_lock_res dqi_gqlock; /* Lock protecting quota information structure */ | 55 | struct ocfs2_lock_res dqi_gqlock; /* Lock protecting quota information structure */ |
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index edfa60cd155c..44f2a5e1d042 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c | |||
@@ -23,6 +23,7 @@ | |||
23 | #include "sysfile.h" | 23 | #include "sysfile.h" |
24 | #include "dlmglue.h" | 24 | #include "dlmglue.h" |
25 | #include "uptodate.h" | 25 | #include "uptodate.h" |
26 | #include "super.h" | ||
26 | #include "quota.h" | 27 | #include "quota.h" |
27 | 28 | ||
28 | static struct workqueue_struct *ocfs2_quota_wq = NULL; | 29 | static struct workqueue_struct *ocfs2_quota_wq = NULL; |
@@ -69,6 +70,7 @@ static void ocfs2_global_mem2diskdqb(void *dp, struct dquot *dquot) | |||
69 | d->dqb_curspace = cpu_to_le64(m->dqb_curspace); | 70 | d->dqb_curspace = cpu_to_le64(m->dqb_curspace); |
70 | d->dqb_btime = cpu_to_le64(m->dqb_btime); | 71 | d->dqb_btime = cpu_to_le64(m->dqb_btime); |
71 | d->dqb_itime = cpu_to_le64(m->dqb_itime); | 72 | d->dqb_itime = cpu_to_le64(m->dqb_itime); |
73 | d->dqb_pad1 = d->dqb_pad2 = 0; | ||
72 | } | 74 | } |
73 | 75 | ||
74 | static int ocfs2_global_is_id(void *dp, struct dquot *dquot) | 76 | static int ocfs2_global_is_id(void *dp, struct dquot *dquot) |
@@ -113,6 +115,15 @@ int ocfs2_read_quota_block(struct inode *inode, u64 v_block, | |||
113 | int rc = 0; | 115 | int rc = 0; |
114 | struct buffer_head *tmp = *bh; | 116 | struct buffer_head *tmp = *bh; |
115 | 117 | ||
118 | if (i_size_read(inode) >> inode->i_sb->s_blocksize_bits <= v_block) { | ||
119 | ocfs2_error(inode->i_sb, | ||
120 | "Quota file %llu is probably corrupted! Requested " | ||
121 | "to read block %Lu but file has size only %Lu\n", | ||
122 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | ||
123 | (unsigned long long)v_block, | ||
124 | (unsigned long long)i_size_read(inode)); | ||
125 | return -EIO; | ||
126 | } | ||
116 | rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, 0, | 127 | rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, 0, |
117 | ocfs2_validate_quota_block); | 128 | ocfs2_validate_quota_block); |
118 | if (rc) | 129 | if (rc) |
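The new guard rejects reads past the quota file's end before the block layer ever sees them: a virtual block index is valid only if it lies below i_size expressed in blocks. A worked instance of the check:

    /* e.g. 4 KiB blocks and i_size == 40 KiB: valid v_block is 0..9,
     * so a request for v_block 10 now fails with -EIO instead of
     * walking off the end of a corrupted quota tree */
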
@@ -211,14 +222,13 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type, | |||
211 | 222 | ||
212 | mutex_lock_nested(&gqinode->i_mutex, I_MUTEX_QUOTA); | 223 | mutex_lock_nested(&gqinode->i_mutex, I_MUTEX_QUOTA); |
213 | if (gqinode->i_size < off + len) { | 224 | if (gqinode->i_size < off + len) { |
214 | down_write(&OCFS2_I(gqinode)->ip_alloc_sem); | 225 | loff_t rounded_end = |
215 | err = ocfs2_extend_no_holes(gqinode, off + len, off); | 226 | ocfs2_align_bytes_to_blocks(sb, off + len); |
216 | up_write(&OCFS2_I(gqinode)->ip_alloc_sem); | 227 | |
217 | if (err < 0) | 228 | /* Space is already allocated in ocfs2_global_read_dquot() */ |
218 | goto out; | ||
219 | err = ocfs2_simple_size_update(gqinode, | 229 | err = ocfs2_simple_size_update(gqinode, |
220 | oinfo->dqi_gqi_bh, | 230 | oinfo->dqi_gqi_bh, |
221 | off + len); | 231 | rounded_end); |
222 | if (err < 0) | 232 | if (err < 0) |
223 | goto out; | 233 | goto out; |
224 | new = 1; | 234 | new = 1; |
@@ -234,7 +244,7 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type, | |||
234 | } | 244 | } |
235 | if (err) { | 245 | if (err) { |
236 | mlog_errno(err); | 246 | mlog_errno(err); |
237 | return err; | 247 | goto out; |
238 | } | 248 | } |
239 | lock_buffer(bh); | 249 | lock_buffer(bh); |
240 | if (new) | 250 | if (new) |
@@ -342,7 +352,6 @@ int ocfs2_global_read_info(struct super_block *sb, int type) | |||
342 | info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace); | 352 | info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace); |
343 | info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace); | 353 | info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace); |
344 | oinfo->dqi_syncms = le32_to_cpu(dinfo.dqi_syncms); | 354 | oinfo->dqi_syncms = le32_to_cpu(dinfo.dqi_syncms); |
345 | oinfo->dqi_syncjiff = msecs_to_jiffies(oinfo->dqi_syncms); | ||
346 | oinfo->dqi_gi.dqi_blocks = le32_to_cpu(dinfo.dqi_blocks); | 355 | oinfo->dqi_gi.dqi_blocks = le32_to_cpu(dinfo.dqi_blocks); |
347 | oinfo->dqi_gi.dqi_free_blk = le32_to_cpu(dinfo.dqi_free_blk); | 356 | oinfo->dqi_gi.dqi_free_blk = le32_to_cpu(dinfo.dqi_free_blk); |
348 | oinfo->dqi_gi.dqi_free_entry = le32_to_cpu(dinfo.dqi_free_entry); | 357 | oinfo->dqi_gi.dqi_free_entry = le32_to_cpu(dinfo.dqi_free_entry); |
@@ -352,7 +361,7 @@ int ocfs2_global_read_info(struct super_block *sb, int type) | |||
352 | oinfo->dqi_gi.dqi_qtree_depth = qtree_depth(&oinfo->dqi_gi); | 361 | oinfo->dqi_gi.dqi_qtree_depth = qtree_depth(&oinfo->dqi_gi); |
353 | INIT_DELAYED_WORK(&oinfo->dqi_sync_work, qsync_work_fn); | 362 | INIT_DELAYED_WORK(&oinfo->dqi_sync_work, qsync_work_fn); |
354 | queue_delayed_work(ocfs2_quota_wq, &oinfo->dqi_sync_work, | 363 | queue_delayed_work(ocfs2_quota_wq, &oinfo->dqi_sync_work, |
355 | oinfo->dqi_syncjiff); | 364 | msecs_to_jiffies(oinfo->dqi_syncms)); |
356 | 365 | ||
357 | out_err: | 366 | out_err: |
358 | mlog_exit(status); | 367 | mlog_exit(status); |
@@ -402,13 +411,36 @@ int ocfs2_global_write_info(struct super_block *sb, int type) | |||
402 | return err; | 411 | return err; |
403 | } | 412 | } |
404 | 413 | ||
414 | static int ocfs2_global_qinit_alloc(struct super_block *sb, int type) | ||
415 | { | ||
416 | struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv; | ||
417 | |||
418 | /* | ||
419 | * We may need to allocate tree blocks and a leaf block but not the | ||
420 | * root block | ||
421 | */ | ||
422 | return oinfo->dqi_gi.dqi_qtree_depth; | ||
423 | } | ||
424 | |||
425 | static int ocfs2_calc_global_qinit_credits(struct super_block *sb, int type) | ||
426 | { | ||
427 | /* We modify all the allocated blocks, tree root, and info block */ | ||
428 | return (ocfs2_global_qinit_alloc(sb, type) + 2) * | ||
429 | OCFS2_QUOTA_BLOCK_UPDATE_CREDITS; | ||
430 | } | ||
431 | |||
405 | /* Read in information from global quota file and acquire a reference to it. | 432 | /* Read in information from global quota file and acquire a reference to it. |
406 | * dquot_acquire() has already started the transaction and locked quota file */ | 433 | * dquot_acquire() has already started the transaction and locked quota file */ |
407 | int ocfs2_global_read_dquot(struct dquot *dquot) | 434 | int ocfs2_global_read_dquot(struct dquot *dquot) |
408 | { | 435 | { |
409 | int err, err2, ex = 0; | 436 | int err, err2, ex = 0; |
410 | struct ocfs2_mem_dqinfo *info = | 437 | struct super_block *sb = dquot->dq_sb; |
411 | sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; | 438 | int type = dquot->dq_type; |
439 | struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv; | ||
440 | struct ocfs2_super *osb = OCFS2_SB(sb); | ||
441 | struct inode *gqinode = info->dqi_gqinode; | ||
442 | int need_alloc = ocfs2_global_qinit_alloc(sb, type); | ||
443 | handle_t *handle = NULL; | ||
412 | 444 | ||
413 | err = ocfs2_qinfo_lock(info, 0); | 445 | err = ocfs2_qinfo_lock(info, 0); |
414 | if (err < 0) | 446 | if (err < 0) |
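The two helpers split "how many blocks may be appended" from "how many block updates the handle must cover". A worked instance, hedged, using OCFS2_QUOTA_BLOCK_UPDATE_CREDITS == 1: for a global quota tree of depth 1, ocfs2_global_qinit_alloc() returns 1 (a possible new leaf; the root already exists), and ocfs2_calc_global_qinit_credits() returns (1 + 2) * 1 = 3, covering the new leaf, the tree root it gets linked into, and the info block.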
@@ -419,14 +451,33 @@ int ocfs2_global_read_dquot(struct dquot *dquot) | |||
419 | OCFS2_DQUOT(dquot)->dq_use_count++; | 451 | OCFS2_DQUOT(dquot)->dq_use_count++; |
420 | OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace; | 452 | OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace; |
421 | OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes; | 453 | OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes; |
454 | ocfs2_qinfo_unlock(info, 0); | ||
455 | |||
422 | if (!dquot->dq_off) { /* No real quota entry? */ | 456 | if (!dquot->dq_off) { /* No real quota entry? */ |
423 | /* Upgrade to exclusive lock for allocation */ | ||
424 | ocfs2_qinfo_unlock(info, 0); | ||
425 | err = ocfs2_qinfo_lock(info, 1); | ||
426 | if (err < 0) | ||
427 | goto out_qlock; | ||
428 | ex = 1; | 457 | ex = 1; |
458 | /* | ||
459 | * Add blocks to quota file before we start a transaction since | ||
460 | * locking allocators ranks above a transaction start | ||
461 | */ | ||
462 | WARN_ON(journal_current_handle()); | ||
463 | down_write(&OCFS2_I(gqinode)->ip_alloc_sem); | ||
464 | err = ocfs2_extend_no_holes(gqinode, | ||
465 | gqinode->i_size + (need_alloc << sb->s_blocksize_bits), | ||
466 | gqinode->i_size); | ||
467 | up_write(&OCFS2_I(gqinode)->ip_alloc_sem); | ||
468 | if (err < 0) | ||
469 | goto out; | ||
429 | } | 470 | } |
471 | |||
472 | handle = ocfs2_start_trans(osb, | ||
473 | ocfs2_calc_global_qinit_credits(sb, type)); | ||
474 | if (IS_ERR(handle)) { | ||
475 | err = PTR_ERR(handle); | ||
476 | goto out; | ||
477 | } | ||
478 | err = ocfs2_qinfo_lock(info, ex); | ||
479 | if (err < 0) | ||
480 | goto out_trans; | ||
430 | err = qtree_write_dquot(&info->dqi_gi, dquot); | 481 | err = qtree_write_dquot(&info->dqi_gi, dquot); |
431 | if (ex && info_dirty(sb_dqinfo(dquot->dq_sb, dquot->dq_type))) { | 482 | if (ex && info_dirty(sb_dqinfo(dquot->dq_sb, dquot->dq_type))) { |
432 | err2 = __ocfs2_global_write_info(dquot->dq_sb, dquot->dq_type); | 483 | err2 = __ocfs2_global_write_info(dquot->dq_sb, dquot->dq_type); |
@@ -438,6 +489,9 @@ out_qlock: | |||
438 | ocfs2_qinfo_unlock(info, 1); | 489 | ocfs2_qinfo_unlock(info, 1); |
439 | else | 490 | else |
440 | ocfs2_qinfo_unlock(info, 0); | 491 | ocfs2_qinfo_unlock(info, 0); |
492 | out_trans: | ||
493 | if (handle) | ||
494 | ocfs2_commit_trans(osb, handle); | ||
441 | out: | 495 | out: |
442 | if (err < 0) | 496 | if (err < 0) |
443 | mlog_errno(err); | 497 | mlog_errno(err); |
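Taken together, the restructured ocfs2_global_read_dquot() now observes the ordering that the WARN_ON documents: allocator locking ranks above a running transaction. A hedged outline of the sequence in the hunks above:

    1. take the qinfo lock shared, read the dquot, drop the lock;
    2. if the dquot has no on-disk entry yet, extend the global quota
       file under ip_alloc_sem, which is where allocator locks are taken;
    3. only then start the transaction, sized by
       ocfs2_calc_global_qinit_credits();
    4. retake the qinfo lock (exclusive when allocating) and write the
       dquot under the open handle.
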
@@ -607,7 +661,7 @@ static void qsync_work_fn(struct work_struct *work) | |||
607 | 661 | ||
608 | dquot_scan_active(sb, ocfs2_sync_dquot_helper, oinfo->dqi_type); | 662 | dquot_scan_active(sb, ocfs2_sync_dquot_helper, oinfo->dqi_type); |
609 | queue_delayed_work(ocfs2_quota_wq, &oinfo->dqi_sync_work, | 663 | queue_delayed_work(ocfs2_quota_wq, &oinfo->dqi_sync_work, |
610 | oinfo->dqi_syncjiff); | 664 | msecs_to_jiffies(oinfo->dqi_syncms)); |
611 | } | 665 | } |
612 | 666 | ||
613 | /* | 667 | /* |
@@ -635,20 +689,18 @@ out: | |||
635 | return status; | 689 | return status; |
636 | } | 690 | } |
637 | 691 | ||
638 | int ocfs2_calc_qdel_credits(struct super_block *sb, int type) | 692 | static int ocfs2_calc_qdel_credits(struct super_block *sb, int type) |
639 | { | 693 | { |
640 | struct ocfs2_mem_dqinfo *oinfo; | 694 | struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv; |
641 | int features[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, | 695 | /* |
642 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA }; | 696 | * We modify tree, leaf block, global info, local chunk header, |
643 | 697 | * global and local inode; OCFS2_QINFO_WRITE_CREDITS already | |
644 | if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, features[type])) | 698 | * accounts for inode update |
645 | return 0; | 699 | */ |
646 | 700 | return (oinfo->dqi_gi.dqi_qtree_depth + 2) * | |
647 | oinfo = sb_dqinfo(sb, type)->dqi_priv; | 701 | OCFS2_QUOTA_BLOCK_UPDATE_CREDITS + |
648 | /* We modify tree, leaf block, global info, local chunk header, | 702 | OCFS2_QINFO_WRITE_CREDITS + |
649 | * global and local inode */ | 703 | OCFS2_INODE_UPDATE_CREDITS; |
650 | return oinfo->dqi_gi.dqi_qtree_depth + 2 + 1 + | ||
651 | 2 * OCFS2_INODE_UPDATE_CREDITS; | ||
652 | } | 704 | } |
653 | 705 | ||
654 | static int ocfs2_release_dquot(struct dquot *dquot) | 706 | static int ocfs2_release_dquot(struct dquot *dquot) |
@@ -680,33 +732,10 @@ out: | |||
680 | return status; | 732 | return status; |
681 | } | 733 | } |
682 | 734 | ||
683 | int ocfs2_calc_qinit_credits(struct super_block *sb, int type) | ||
684 | { | ||
685 | struct ocfs2_mem_dqinfo *oinfo; | ||
686 | int features[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, | ||
687 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA }; | ||
688 | struct ocfs2_dinode *lfe, *gfe; | ||
689 | |||
690 | if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, features[type])) | ||
691 | return 0; | ||
692 | |||
693 | oinfo = sb_dqinfo(sb, type)->dqi_priv; | ||
694 | gfe = (struct ocfs2_dinode *)oinfo->dqi_gqi_bh->b_data; | ||
695 | lfe = (struct ocfs2_dinode *)oinfo->dqi_lqi_bh->b_data; | ||
696 | /* We can extend local file + global file. In local file we | ||
697 | * can modify info, chunk header block and dquot block. In | ||
698 | * global file we can modify info, tree and leaf block */ | ||
699 | return ocfs2_calc_extend_credits(sb, &lfe->id2.i_list, 0) + | ||
700 | ocfs2_calc_extend_credits(sb, &gfe->id2.i_list, 0) + | ||
701 | 3 + oinfo->dqi_gi.dqi_qtree_depth + 2; | ||
702 | } | ||
703 | |||
704 | static int ocfs2_acquire_dquot(struct dquot *dquot) | 735 | static int ocfs2_acquire_dquot(struct dquot *dquot) |
705 | { | 736 | { |
706 | handle_t *handle; | ||
707 | struct ocfs2_mem_dqinfo *oinfo = | 737 | struct ocfs2_mem_dqinfo *oinfo = |
708 | sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; | 738 | sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; |
709 | struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb); | ||
710 | int status = 0; | 739 | int status = 0; |
711 | 740 | ||
712 | mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type); | 741 | mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type); |
@@ -715,16 +744,7 @@ static int ocfs2_acquire_dquot(struct dquot *dquot) | |||
715 | status = ocfs2_lock_global_qf(oinfo, 1); | 744 | status = ocfs2_lock_global_qf(oinfo, 1); |
716 | if (status < 0) | 745 | if (status < 0) |
717 | goto out; | 746 | goto out; |
718 | handle = ocfs2_start_trans(osb, | ||
719 | ocfs2_calc_qinit_credits(dquot->dq_sb, dquot->dq_type)); | ||
720 | if (IS_ERR(handle)) { | ||
721 | status = PTR_ERR(handle); | ||
722 | mlog_errno(status); | ||
723 | goto out_ilock; | ||
724 | } | ||
725 | status = dquot_acquire(dquot); | 747 | status = dquot_acquire(dquot); |
726 | ocfs2_commit_trans(osb, handle); | ||
727 | out_ilock: | ||
728 | ocfs2_unlock_global_qf(oinfo, 1); | 748 | ocfs2_unlock_global_qf(oinfo, 1); |
729 | out: | 749 | out: |
730 | mlog_exit(status); | 750 | mlog_exit(status); |
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index 5a460fa82553..bdb09cb6e1fe 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c | |||
@@ -20,6 +20,7 @@ | |||
20 | #include "sysfile.h" | 20 | #include "sysfile.h" |
21 | #include "dlmglue.h" | 21 | #include "dlmglue.h" |
22 | #include "quota.h" | 22 | #include "quota.h" |
23 | #include "uptodate.h" | ||
23 | 24 | ||
24 | /* Number of local quota structures per block */ | 25 | /* Number of local quota structures per block */ |
25 | static inline unsigned int ol_quota_entries_per_block(struct super_block *sb) | 26 | static inline unsigned int ol_quota_entries_per_block(struct super_block *sb) |
@@ -100,7 +101,8 @@ static int ocfs2_modify_bh(struct inode *inode, struct buffer_head *bh, | |||
100 | handle_t *handle; | 101 | handle_t *handle; |
101 | int status; | 102 | int status; |
102 | 103 | ||
103 | handle = ocfs2_start_trans(OCFS2_SB(sb), 1); | 104 | handle = ocfs2_start_trans(OCFS2_SB(sb), |
105 | OCFS2_QUOTA_BLOCK_UPDATE_CREDITS); | ||
104 | if (IS_ERR(handle)) { | 106 | if (IS_ERR(handle)) { |
105 | status = PTR_ERR(handle); | 107 | status = PTR_ERR(handle); |
106 | mlog_errno(status); | 108 | mlog_errno(status); |
@@ -610,7 +612,8 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb, | |||
610 | goto out_bh; | 612 | goto out_bh; |
611 | /* Mark quota file as clean if we are recovering quota file of | 613 | /* Mark quota file as clean if we are recovering quota file of |
612 | * some other node. */ | 614 | * some other node. */ |
613 | handle = ocfs2_start_trans(osb, 1); | 615 | handle = ocfs2_start_trans(osb, |
616 | OCFS2_LOCAL_QINFO_WRITE_CREDITS); | ||
614 | if (IS_ERR(handle)) { | 617 | if (IS_ERR(handle)) { |
615 | status = PTR_ERR(handle); | 618 | status = PTR_ERR(handle); |
616 | mlog_errno(status); | 619 | mlog_errno(status); |
@@ -940,7 +943,7 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk( | |||
940 | struct ocfs2_local_disk_chunk *dchunk; | 943 | struct ocfs2_local_disk_chunk *dchunk; |
941 | int status; | 944 | int status; |
942 | handle_t *handle; | 945 | handle_t *handle; |
943 | struct buffer_head *bh = NULL; | 946 | struct buffer_head *bh = NULL, *dbh = NULL; |
944 | u64 p_blkno; | 947 | u64 p_blkno; |
945 | 948 | ||
946 | /* We are protected by dqio_sem so no locking needed */ | 949 | /* We are protected by dqio_sem so no locking needed */ |
@@ -964,32 +967,35 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk( | |||
964 | mlog_errno(status); | 967 | mlog_errno(status); |
965 | goto out; | 968 | goto out; |
966 | } | 969 | } |
970 | /* Local quota info and two new blocks we initialize */ | ||
971 | handle = ocfs2_start_trans(OCFS2_SB(sb), | ||
972 | OCFS2_LOCAL_QINFO_WRITE_CREDITS + | ||
973 | 2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS); | ||
974 | if (IS_ERR(handle)) { | ||
975 | status = PTR_ERR(handle); | ||
976 | mlog_errno(status); | ||
977 | goto out; | ||
978 | } | ||
967 | 979 | ||
980 | /* Initialize chunk header */ | ||
968 | down_read(&OCFS2_I(lqinode)->ip_alloc_sem); | 981 | down_read(&OCFS2_I(lqinode)->ip_alloc_sem); |
969 | status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks, | 982 | status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks, |
970 | &p_blkno, NULL, NULL); | 983 | &p_blkno, NULL, NULL); |
971 | up_read(&OCFS2_I(lqinode)->ip_alloc_sem); | 984 | up_read(&OCFS2_I(lqinode)->ip_alloc_sem); |
972 | if (status < 0) { | 985 | if (status < 0) { |
973 | mlog_errno(status); | 986 | mlog_errno(status); |
974 | goto out; | 987 | goto out_trans; |
975 | } | 988 | } |
976 | bh = sb_getblk(sb, p_blkno); | 989 | bh = sb_getblk(sb, p_blkno); |
977 | if (!bh) { | 990 | if (!bh) { |
978 | status = -ENOMEM; | 991 | status = -ENOMEM; |
979 | mlog_errno(status); | 992 | mlog_errno(status); |
980 | goto out; | 993 | goto out_trans; |
981 | } | 994 | } |
982 | dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data; | 995 | dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data; |
983 | 996 | ocfs2_set_new_buffer_uptodate(lqinode, bh); | |
984 | handle = ocfs2_start_trans(OCFS2_SB(sb), 2); | ||
985 | if (IS_ERR(handle)) { | ||
986 | status = PTR_ERR(handle); | ||
987 | mlog_errno(status); | ||
988 | goto out; | ||
989 | } | ||
990 | |||
991 | status = ocfs2_journal_access_dq(handle, lqinode, bh, | 997 | status = ocfs2_journal_access_dq(handle, lqinode, bh, |
992 | OCFS2_JOURNAL_ACCESS_WRITE); | 998 | OCFS2_JOURNAL_ACCESS_CREATE); |
993 | if (status < 0) { | 999 | if (status < 0) { |
994 | mlog_errno(status); | 1000 | mlog_errno(status); |
995 | goto out_trans; | 1001 | goto out_trans; |
@@ -999,7 +1005,6 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk( | |||
999 | memset(dchunk->dqc_bitmap, 0, | 1005 | memset(dchunk->dqc_bitmap, 0, |
1000 | sb->s_blocksize - sizeof(struct ocfs2_local_disk_chunk) - | 1006 | sb->s_blocksize - sizeof(struct ocfs2_local_disk_chunk) - |
1001 | OCFS2_QBLK_RESERVED_SPACE); | 1007 | OCFS2_QBLK_RESERVED_SPACE); |
1002 | set_buffer_uptodate(bh); | ||
1003 | unlock_buffer(bh); | 1008 | unlock_buffer(bh); |
1004 | status = ocfs2_journal_dirty(handle, bh); | 1009 | status = ocfs2_journal_dirty(handle, bh); |
1005 | if (status < 0) { | 1010 | if (status < 0) { |
@@ -1007,6 +1012,38 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk( | |||
1007 | goto out_trans; | 1012 | goto out_trans; |
1008 | } | 1013 | } |
1009 | 1014 | ||
1015 | /* Initialize new block with structures */ | ||
1016 | down_read(&OCFS2_I(lqinode)->ip_alloc_sem); | ||
1017 | status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks + 1, | ||
1018 | &p_blkno, NULL, NULL); | ||
1019 | up_read(&OCFS2_I(lqinode)->ip_alloc_sem); | ||
1020 | if (status < 0) { | ||
1021 | mlog_errno(status); | ||
1022 | goto out_trans; | ||
1023 | } | ||
1024 | dbh = sb_getblk(sb, p_blkno); | ||
1025 | if (!dbh) { | ||
1026 | status = -ENOMEM; | ||
1027 | mlog_errno(status); | ||
1028 | goto out_trans; | ||
1029 | } | ||
1030 | ocfs2_set_new_buffer_uptodate(lqinode, dbh); | ||
1031 | status = ocfs2_journal_access_dq(handle, lqinode, dbh, | ||
1032 | OCFS2_JOURNAL_ACCESS_CREATE); | ||
1033 | if (status < 0) { | ||
1034 | mlog_errno(status); | ||
1035 | goto out_trans; | ||
1036 | } | ||
1037 | lock_buffer(dbh); | ||
1038 | memset(dbh->b_data, 0, sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE); | ||
1039 | unlock_buffer(dbh); | ||
1040 | status = ocfs2_journal_dirty(handle, dbh); | ||
1041 | if (status < 0) { | ||
1042 | mlog_errno(status); | ||
1043 | goto out_trans; | ||
1044 | } | ||
1045 | |||
1046 | /* Update local quotafile info */ | ||
1010 | oinfo->dqi_blocks += 2; | 1047 | oinfo->dqi_blocks += 2; |
1011 | oinfo->dqi_chunks++; | 1048 | oinfo->dqi_chunks++; |
1012 | status = ocfs2_local_write_info(sb, type); | 1049 | status = ocfs2_local_write_info(sb, type); |
@@ -1031,6 +1068,7 @@ out_trans: | |||
1031 | ocfs2_commit_trans(OCFS2_SB(sb), handle); | 1068 | ocfs2_commit_trans(OCFS2_SB(sb), handle); |
1032 | out: | 1069 | out: |
1033 | brelse(bh); | 1070 | brelse(bh); |
1071 | brelse(dbh); | ||
1034 | kmem_cache_free(ocfs2_qf_chunk_cachep, chunk); | 1072 | kmem_cache_free(ocfs2_qf_chunk_cachep, chunk); |
1035 | return ERR_PTR(status); | 1073 | return ERR_PTR(status); |
1036 | } | 1074 | } |
@@ -1048,6 +1086,8 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file( | |||
1048 | struct ocfs2_local_disk_chunk *dchunk; | 1086 | struct ocfs2_local_disk_chunk *dchunk; |
1049 | int epb = ol_quota_entries_per_block(sb); | 1087 | int epb = ol_quota_entries_per_block(sb); |
1050 | unsigned int chunk_blocks; | 1088 | unsigned int chunk_blocks; |
1089 | struct buffer_head *bh; | ||
1090 | u64 p_blkno; | ||
1051 | int status; | 1091 | int status; |
1052 | handle_t *handle; | 1092 | handle_t *handle; |
1053 | 1093 | ||
@@ -1075,12 +1115,49 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file( | |||
1075 | mlog_errno(status); | 1115 | mlog_errno(status); |
1076 | goto out; | 1116 | goto out; |
1077 | } | 1117 | } |
1078 | handle = ocfs2_start_trans(OCFS2_SB(sb), 2); | 1118 | |
1119 | /* Get buffer from the just added block */ | ||
1120 | down_read(&OCFS2_I(lqinode)->ip_alloc_sem); | ||
1121 | status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks, | ||
1122 | &p_blkno, NULL, NULL); | ||
1123 | up_read(&OCFS2_I(lqinode)->ip_alloc_sem); | ||
1124 | if (status < 0) { | ||
1125 | mlog_errno(status); | ||
1126 | goto out; | ||
1127 | } | ||
1128 | bh = sb_getblk(sb, p_blkno); | ||
1129 | if (!bh) { | ||
1130 | status = -ENOMEM; | ||
1131 | mlog_errno(status); | ||
1132 | goto out; | ||
1133 | } | ||
1134 | ocfs2_set_new_buffer_uptodate(lqinode, bh); | ||
1135 | |||
1136 | /* Local quota info, chunk header and the new block we initialize */ | ||
1137 | handle = ocfs2_start_trans(OCFS2_SB(sb), | ||
1138 | OCFS2_LOCAL_QINFO_WRITE_CREDITS + | ||
1139 | 2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS); | ||
1079 | if (IS_ERR(handle)) { | 1140 | if (IS_ERR(handle)) { |
1080 | status = PTR_ERR(handle); | 1141 | status = PTR_ERR(handle); |
1081 | mlog_errno(status); | 1142 | mlog_errno(status); |
1082 | goto out; | 1143 | goto out; |
1083 | } | 1144 | } |
1145 | /* Zero created block */ | ||
1146 | status = ocfs2_journal_access_dq(handle, lqinode, bh, | ||
1147 | OCFS2_JOURNAL_ACCESS_CREATE); | ||
1148 | if (status < 0) { | ||
1149 | mlog_errno(status); | ||
1150 | goto out_trans; | ||
1151 | } | ||
1152 | lock_buffer(bh); | ||
1153 | memset(bh->b_data, 0, sb->s_blocksize); | ||
1154 | unlock_buffer(bh); | ||
1155 | status = ocfs2_journal_dirty(handle, bh); | ||
1156 | if (status < 0) { | ||
1157 | mlog_errno(status); | ||
1158 | goto out_trans; | ||
1159 | } | ||
1160 | /* Update chunk header */ | ||
1084 | status = ocfs2_journal_access_dq(handle, lqinode, chunk->qc_headerbh, | 1161 | status = ocfs2_journal_access_dq(handle, lqinode, chunk->qc_headerbh, |
1085 | OCFS2_JOURNAL_ACCESS_WRITE); | 1162 | OCFS2_JOURNAL_ACCESS_WRITE); |
1086 | if (status < 0) { | 1163 | if (status < 0) { |
@@ -1097,6 +1174,7 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file( | |||
1097 | mlog_errno(status); | 1174 | mlog_errno(status); |
1098 | goto out_trans; | 1175 | goto out_trans; |
1099 | } | 1176 | } |
1177 | /* Update file header */ | ||
1100 | oinfo->dqi_blocks++; | 1178 | oinfo->dqi_blocks++; |
1101 | status = ocfs2_local_write_info(sb, type); | 1179 | status = ocfs2_local_write_info(sb, type); |
1102 | if (status < 0) { | 1180 | if (status < 0) { |
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c index 3f661376a2de..e49c41050264 100644 --- a/fs/ocfs2/stack_o2cb.c +++ b/fs/ocfs2/stack_o2cb.c | |||
@@ -17,6 +17,7 @@ | |||
17 | * General Public License for more details. | 17 | * General Public License for more details. |
18 | */ | 18 | */ |
19 | 19 | ||
20 | #include <linux/kernel.h> | ||
20 | #include <linux/crc32.h> | 21 | #include <linux/crc32.h> |
21 | #include <linux/module.h> | 22 | #include <linux/module.h> |
22 | 23 | ||
@@ -153,7 +154,7 @@ static int status_map[] = { | |||
153 | 154 | ||
154 | static int dlm_status_to_errno(enum dlm_status status) | 155 | static int dlm_status_to_errno(enum dlm_status status) |
155 | { | 156 | { |
156 | BUG_ON(status > (sizeof(status_map) / sizeof(status_map[0]))); | 157 | BUG_ON(status < 0 || status >= ARRAY_SIZE(status_map)); |
157 | 158 | ||
158 | return status_map[status]; | 159 | return status_map[status]; |
159 | } | 160 | } |
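The fixed check in dlm_status_to_errno() is the standard ARRAY_SIZE idiom (hence the new linux/kernel.h include above); the old expression compared with `>` against the element count, which is off by one, and never rejected negative values. A small runnable illustration of the corrected bounds check:

    #include <assert.h>
    #include <stdio.h>

    #define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

    static const int status_map[] = { 0, -1, -2, -3 };

    static int status_to_errno(int status)
    {
        /* both ends guarded: no negative index, no index == count */
        assert(status >= 0 && status < (int)ARRAY_SIZE(status_map));
        return status_map[status];
    }

    int main(void)
    {
        printf("%d\n", status_to_errno(3));   /* last valid index */
        return 0;
    }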
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 7efb349fb9bd..a3f8871d21fd 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c | |||
@@ -777,6 +777,7 @@ static int ocfs2_sb_probe(struct super_block *sb, | |||
777 | } | 777 | } |
778 | di = (struct ocfs2_dinode *) (*bh)->b_data; | 778 | di = (struct ocfs2_dinode *) (*bh)->b_data; |
779 | memset(stats, 0, sizeof(struct ocfs2_blockcheck_stats)); | 779 | memset(stats, 0, sizeof(struct ocfs2_blockcheck_stats)); |
780 | spin_lock_init(&stats->b_lock); | ||
780 | status = ocfs2_verify_volume(di, *bh, blksize, stats); | 781 | status = ocfs2_verify_volume(di, *bh, blksize, stats); |
781 | if (status >= 0) | 782 | if (status >= 0) |
782 | goto bail; | 783 | goto bail; |
@@ -1182,7 +1183,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) | |||
1182 | wake_up(&osb->osb_mount_event); | 1183 | wake_up(&osb->osb_mount_event); |
1183 | 1184 | ||
1184 | /* Start this when the mount is almost sure of being successful */ | 1185 | /* Start this when the mount is almost sure of being successful */ |
1185 | ocfs2_orphan_scan_init(osb); | 1186 | ocfs2_orphan_scan_start(osb); |
1186 | 1187 | ||
1187 | mlog_exit(status); | 1188 | mlog_exit(status); |
1188 | return status; | 1189 | return status; |
@@ -1213,14 +1214,31 @@ static int ocfs2_get_sb(struct file_system_type *fs_type, | |||
1213 | mnt); | 1214 | mnt); |
1214 | } | 1215 | } |
1215 | 1216 | ||
1217 | static void ocfs2_kill_sb(struct super_block *sb) | ||
1218 | { | ||
1219 | struct ocfs2_super *osb = OCFS2_SB(sb); | ||
1220 | |||
1221 | /* Failed mount? */ | ||
1222 | if (!osb || atomic_read(&osb->vol_state) == VOLUME_DISABLED) | ||
1223 | goto out; | ||
1224 | |||
1225 | /* Prevent further queueing of inode drop events */ | ||
1226 | spin_lock(&dentry_list_lock); | ||
1227 | ocfs2_set_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED); | ||
1228 | spin_unlock(&dentry_list_lock); | ||
1229 | /* Wait for work to finish and/or remove it */ | ||
1230 | cancel_work_sync(&osb->dentry_lock_work); | ||
1231 | out: | ||
1232 | kill_block_super(sb); | ||
1233 | } | ||
1234 | |||
1216 | static struct file_system_type ocfs2_fs_type = { | 1235 | static struct file_system_type ocfs2_fs_type = { |
1217 | .owner = THIS_MODULE, | 1236 | .owner = THIS_MODULE, |
1218 | .name = "ocfs2", | 1237 | .name = "ocfs2", |
1219 | .get_sb = ocfs2_get_sb, /* is this called when we mount | 1238 | .get_sb = ocfs2_get_sb, /* is this called when we mount |
1220 | * the fs? */ | 1239 | * the fs? */ |
1221 | .kill_sb = kill_block_super, /* set to the generic one | 1240 | .kill_sb = ocfs2_kill_sb, |
1222 | * right now, but do we | 1241 | |
1223 | * need to change that? */ | ||
1224 | .fs_flags = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE, | 1242 | .fs_flags = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE, |
1225 | .next = NULL | 1243 | .next = NULL |
1226 | }; | 1244 | }; |
@@ -1819,6 +1837,12 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) | |||
1819 | 1837 | ||
1820 | debugfs_remove(osb->osb_ctxt); | 1838 | debugfs_remove(osb->osb_ctxt); |
1821 | 1839 | ||
1840 | /* | ||
1841 | * Flush inode dropping work queue so that deletes are | ||
1842 | * performed while the filesystem is still working | ||
1843 | */ | ||
1844 | ocfs2_drop_all_dl_inodes(osb); | ||
1845 | |||
1822 | /* Orphan scan should be stopped as early as possible */ | 1846 | /* Orphan scan should be stopped as early as possible */ |
1823 | ocfs2_orphan_scan_stop(osb); | 1847 | ocfs2_orphan_scan_stop(osb); |
1824 | 1848 | ||
@@ -1981,6 +2005,8 @@ static int ocfs2_initialize_super(struct super_block *sb, | |||
1981 | snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u", | 2005 | snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u", |
1982 | MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); | 2006 | MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); |
1983 | 2007 | ||
2008 | ocfs2_orphan_scan_init(osb); | ||
2009 | |||
1984 | status = ocfs2_recovery_init(osb); | 2010 | status = ocfs2_recovery_init(osb); |
1985 | if (status) { | 2011 | if (status) { |
1986 | mlog(ML_ERROR, "Unable to initialize recovery state\n"); | 2012 | mlog(ML_ERROR, "Unable to initialize recovery state\n"); |
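ocfs2_kill_sb() above follows a common teardown order: flag the superblock so no further dentry-lock drop work can be queued, drain what is already queued with cancel_work_sync(), then fall through to the generic kill_block_super(). A toy, compilable sketch of just that ordering (stub types and functions, not the kernel API):

    #include <stdio.h>

    struct osb { int drop_immed; int work_queued; };

    static void set_drop_flag(struct osb *osb)    { osb->drop_immed = 1; }
    static void cancel_work_sync(struct osb *osb) { osb->work_queued = 0; }
    static void kill_block_super(void)            { puts("generic teardown"); }

    static void kill_sb(struct osb *osb)
    {
        if (!osb)                 /* failed mount: nothing to drain */
            goto out;
        set_drop_flag(osb);       /* 1. forbid further queueing */
        cancel_work_sync(osb);    /* 2. wait for / remove queued work */
    out:
        kill_block_super();       /* 3. generic unmount */
    }

    int main(void)
    {
        struct osb o = { 0, 1 };
        kill_sb(&o);
        return 0;
    }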
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index ba320e250747..d1a27cda984f 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c | |||
@@ -1052,7 +1052,8 @@ static int ocfs2_xattr_block_get(struct inode *inode, | |||
1052 | struct ocfs2_xattr_block *xb; | 1052 | struct ocfs2_xattr_block *xb; |
1053 | struct ocfs2_xattr_value_root *xv; | 1053 | struct ocfs2_xattr_value_root *xv; |
1054 | size_t size; | 1054 | size_t size; |
1055 | int ret = -ENODATA, name_offset, name_len, block_off, i; | 1055 | int ret = -ENODATA, name_offset, name_len, i; |
1056 | int uninitialized_var(block_off); | ||
1056 | 1057 | ||
1057 | xs->bucket = ocfs2_xattr_bucket_new(inode); | 1058 | xs->bucket = ocfs2_xattr_bucket_new(inode); |
1058 | if (!xs->bucket) { | 1059 | if (!xs->bucket) { |
diff --git a/fs/open.c b/fs/open.c --- a/fs/open.c +++ b/fs/open.c | |||
@@ -199,7 +199,7 @@ out: | |||
199 | int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, | 199 | int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, |
200 | struct file *filp) | 200 | struct file *filp) |
201 | { | 201 | { |
202 | int err; | 202 | int ret; |
203 | struct iattr newattrs; | 203 | struct iattr newattrs; |
204 | 204 | ||
205 | /* Not pretty: "inode->i_size" shouldn't really be signed. But it is. */ | 205 | /* Not pretty: "inode->i_size" shouldn't really be signed. But it is. */ |
@@ -214,12 +214,14 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, | |||
214 | } | 214 | } |
215 | 215 | ||
216 | /* Remove suid/sgid on truncate too */ | 216 | /* Remove suid/sgid on truncate too */ |
217 | newattrs.ia_valid |= should_remove_suid(dentry); | 217 | ret = should_remove_suid(dentry); |
218 | if (ret) | ||
219 | newattrs.ia_valid |= ret | ATTR_FORCE; | ||
218 | 220 | ||
219 | mutex_lock(&dentry->d_inode->i_mutex); | 221 | mutex_lock(&dentry->d_inode->i_mutex); |
220 | err = notify_change(dentry, &newattrs); | 222 | ret = notify_change(dentry, &newattrs); |
221 | mutex_unlock(&dentry->d_inode->i_mutex); | 223 | mutex_unlock(&dentry->d_inode->i_mutex); |
222 | return err; | 224 | return ret; |
223 | } | 225 | } |
224 | 226 | ||
225 | static long do_sys_truncate(const char __user *pathname, loff_t length) | 227 | static long do_sys_truncate(const char __user *pathname, loff_t length) |
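The do_truncate() hunk above reuses should_remove_suid()'s return value and only ORs in ATTR_FORCE when there is actually a suid/sgid bit to kill, so notify_change() applies the mode change even when the caller is not the owner. A userland probe of the visible behaviour — whether the bit is really cleared depends on the filesystem and your privileges, so treat this as a smoke test, not a proof:

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/stat.h>
    #include <unistd.h>

    int main(void)
    {
        const char *path = "suid_probe.tmp";
        int fd = open(path, O_CREAT | O_RDWR, 0755);
        if (fd < 0) { perror("open"); return 1; }
        fchmod(fd, 04755);          /* set suid on our own file */
        ftruncate(fd, 4096);        /* truncation should kill suid */
        struct stat st;
        fstat(fd, &st);
        printf("mode after truncate: %o\n", (unsigned)(st.st_mode & 07777));
        close(fd);
        unlink(path);
        return 0;
    }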
@@ -957,6 +959,8 @@ struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags, | |||
957 | int error; | 959 | int error; |
958 | struct file *f; | 960 | struct file *f; |
959 | 961 | ||
962 | validate_creds(cred); | ||
963 | |||
960 | /* | 964 | /* |
961 | * We must always pass in a valid mount pointer. Historically | 965 | * We must always pass in a valid mount pointer. Historically |
962 | * callers got away with not passing it, but we must enforce this at | 966 | * callers got away with not passing it, but we must enforce this at |
diff --git a/fs/partitions/check.c b/fs/partitions/check.c index ea4e6cb29e13..fbeaddf595d3 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c | |||
@@ -248,11 +248,19 @@ ssize_t part_stat_show(struct device *dev, | |||
248 | part_stat_read(p, merges[WRITE]), | 248 | part_stat_read(p, merges[WRITE]), |
249 | (unsigned long long)part_stat_read(p, sectors[WRITE]), | 249 | (unsigned long long)part_stat_read(p, sectors[WRITE]), |
250 | jiffies_to_msecs(part_stat_read(p, ticks[WRITE])), | 250 | jiffies_to_msecs(part_stat_read(p, ticks[WRITE])), |
251 | p->in_flight, | 251 | part_in_flight(p), |
252 | jiffies_to_msecs(part_stat_read(p, io_ticks)), | 252 | jiffies_to_msecs(part_stat_read(p, io_ticks)), |
253 | jiffies_to_msecs(part_stat_read(p, time_in_queue))); | 253 | jiffies_to_msecs(part_stat_read(p, time_in_queue))); |
254 | } | 254 | } |
255 | 255 | ||
256 | ssize_t part_inflight_show(struct device *dev, | ||
257 | struct device_attribute *attr, char *buf) | ||
258 | { | ||
259 | struct hd_struct *p = dev_to_part(dev); | ||
260 | |||
261 | return sprintf(buf, "%8u %8u\n", p->in_flight[0], p->in_flight[1]); | ||
262 | } | ||
263 | |||
256 | #ifdef CONFIG_FAIL_MAKE_REQUEST | 264 | #ifdef CONFIG_FAIL_MAKE_REQUEST |
257 | ssize_t part_fail_show(struct device *dev, | 265 | ssize_t part_fail_show(struct device *dev, |
258 | struct device_attribute *attr, char *buf) | 266 | struct device_attribute *attr, char *buf) |
@@ -281,6 +289,7 @@ static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL); | |||
281 | static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); | 289 | static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); |
282 | static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL); | 290 | static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL); |
283 | static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); | 291 | static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); |
292 | static DEVICE_ATTR(inflight, S_IRUGO, part_inflight_show, NULL); | ||
284 | #ifdef CONFIG_FAIL_MAKE_REQUEST | 293 | #ifdef CONFIG_FAIL_MAKE_REQUEST |
285 | static struct device_attribute dev_attr_fail = | 294 | static struct device_attribute dev_attr_fail = |
286 | __ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store); | 295 | __ATTR(make-it-fail, S_IRUGO|S_IWUSR, part_fail_show, part_fail_store); |
@@ -292,6 +301,7 @@ static struct attribute *part_attrs[] = { | |||
292 | &dev_attr_size.attr, | 301 | &dev_attr_size.attr, |
293 | &dev_attr_alignment_offset.attr, | 302 | &dev_attr_alignment_offset.attr, |
294 | &dev_attr_stat.attr, | 303 | &dev_attr_stat.attr, |
304 | &dev_attr_inflight.attr, | ||
295 | #ifdef CONFIG_FAIL_MAKE_REQUEST | 305 | #ifdef CONFIG_FAIL_MAKE_REQUEST |
296 | &dev_attr_fail.attr, | 306 | &dev_attr_fail.attr, |
297 | #endif | 307 | #endif |
@@ -302,7 +312,7 @@ static struct attribute_group part_attr_group = { | |||
302 | .attrs = part_attrs, | 312 | .attrs = part_attrs, |
303 | }; | 313 | }; |
304 | 314 | ||
305 | static struct attribute_group *part_attr_groups[] = { | 315 | static const struct attribute_group *part_attr_groups[] = { |
306 | &part_attr_group, | 316 | &part_attr_group, |
307 | #ifdef CONFIG_BLK_DEV_IO_TRACE | 317 | #ifdef CONFIG_BLK_DEV_IO_TRACE |
308 | &blk_trace_attr_group, | 318 | &blk_trace_attr_group, |
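The new per-partition `inflight` attribute prints two space-padded counters, reads then writes currently in flight. A minimal reader; the device name here is an assumption, substitute one that exists on your system:

    #include <stdio.h>

    int main(void)
    {
        /* assumed device path; adjust for your block device */
        FILE *f = fopen("/sys/block/sda/inflight", "r");
        if (!f) { perror("open inflight"); return 1; }
        unsigned int r, w;
        if (fscanf(f, "%u %u", &r, &w) != 2) { fclose(f); return 1; }
        printf("in-flight reads: %u, writes: %u\n", r, w);
        fclose(f);
        return 0;
    }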
diff --git a/fs/proc/base.c b/fs/proc/base.c index 3ce5ae9e3d2d..6f742f6658a9 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c | |||
@@ -234,23 +234,20 @@ static int check_mem_permission(struct task_struct *task) | |||
234 | 234 | ||
235 | struct mm_struct *mm_for_maps(struct task_struct *task) | 235 | struct mm_struct *mm_for_maps(struct task_struct *task) |
236 | { | 236 | { |
237 | struct mm_struct *mm = get_task_mm(task); | 237 | struct mm_struct *mm; |
238 | if (!mm) | 238 | |
239 | if (mutex_lock_killable(&task->cred_guard_mutex)) | ||
239 | return NULL; | 240 | return NULL; |
240 | down_read(&mm->mmap_sem); | 241 | |
241 | task_lock(task); | 242 | mm = get_task_mm(task); |
242 | if (task->mm != mm) | 243 | if (mm && mm != current->mm && |
243 | goto out; | 244 | !ptrace_may_access(task, PTRACE_MODE_READ)) { |
244 | if (task->mm != current->mm && | 245 | mmput(mm); |
245 | __ptrace_may_access(task, PTRACE_MODE_READ) < 0) | 246 | mm = NULL; |
246 | goto out; | 247 | } |
247 | task_unlock(task); | 248 | mutex_unlock(&task->cred_guard_mutex); |
249 | |||
248 | return mm; | 250 | return mm; |
249 | out: | ||
250 | task_unlock(task); | ||
251 | up_read(&mm->mmap_sem); | ||
252 | mmput(mm); | ||
253 | return NULL; | ||
254 | } | 251 | } |
255 | 252 | ||
256 | static int proc_pid_cmdline(struct task_struct *task, char * buffer) | 253 | static int proc_pid_cmdline(struct task_struct *task, char * buffer) |
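mm_for_maps() now serializes against exec via cred_guard_mutex and gates access with ptrace_may_access() instead of open-coded mm comparisons; taking mmap_sem moves out to the m_start() callers in task_mmu.c/task_nommu.c below. From userspace the contract is unchanged: /proc/<pid>/maps is readable only if you could ptrace the target. A quick probe (the failure may surface at open or at first read, depending on kernel version):

    #include <stdio.h>

    int main(void)
    {
        FILE *f = fopen("/proc/1/maps", "r");  /* init: usually not ours */
        if (!f) {
            perror("fopen /proc/1/maps");      /* expect EACCES unprivileged */
            return 0;
        }
        char line[256];
        if (fgets(line, sizeof(line), f))      /* may fail here instead */
            fputs(line, stdout);
        fclose(f);
        return 0;
    }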
@@ -1006,12 +1003,7 @@ static ssize_t oom_adjust_read(struct file *file, char __user *buf, | |||
1006 | 1003 | ||
1007 | if (!task) | 1004 | if (!task) |
1008 | return -ESRCH; | 1005 | return -ESRCH; |
1009 | task_lock(task); | 1006 | oom_adjust = task->oomkilladj; |
1010 | if (task->mm) | ||
1011 | oom_adjust = task->mm->oom_adj; | ||
1012 | else | ||
1013 | oom_adjust = OOM_DISABLE; | ||
1014 | task_unlock(task); | ||
1015 | put_task_struct(task); | 1007 | put_task_struct(task); |
1016 | 1008 | ||
1017 | len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust); | 1009 | len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust); |
@@ -1040,19 +1032,11 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, | |||
1040 | task = get_proc_task(file->f_path.dentry->d_inode); | 1032 | task = get_proc_task(file->f_path.dentry->d_inode); |
1041 | if (!task) | 1033 | if (!task) |
1042 | return -ESRCH; | 1034 | return -ESRCH; |
1043 | task_lock(task); | 1035 | if (oom_adjust < task->oomkilladj && !capable(CAP_SYS_RESOURCE)) { |
1044 | if (!task->mm) { | ||
1045 | task_unlock(task); | ||
1046 | put_task_struct(task); | ||
1047 | return -EINVAL; | ||
1048 | } | ||
1049 | if (oom_adjust < task->mm->oom_adj && !capable(CAP_SYS_RESOURCE)) { | ||
1050 | task_unlock(task); | ||
1051 | put_task_struct(task); | 1036 | put_task_struct(task); |
1052 | return -EACCES; | 1037 | return -EACCES; |
1053 | } | 1038 | } |
1054 | task->mm->oom_adj = oom_adjust; | 1039 | task->oomkilladj = oom_adjust; |
1055 | task_unlock(task); | ||
1056 | put_task_struct(task); | 1040 | put_task_struct(task); |
1057 | if (end - buffer == 0) | 1041 | if (end - buffer == 0) |
1058 | return -EIO; | 1042 | return -EIO; |
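With oom_adj moved back from mm_struct to the task itself (task->oomkilladj), /proc/<pid>/oom_adj works again for tasks without an mm, and lowering the value still requires CAP_SYS_RESOURCE. A short exercise of this legacy interface (later kernels replaced it with oom_score_adj):

    #include <stdio.h>

    int main(void)
    {
        FILE *f = fopen("/proc/self/oom_adj", "r+");
        if (!f) { perror("oom_adj"); return 1; }
        int v;
        if (fscanf(f, "%d", &v) == 1)
            printf("current oom_adj: %d\n", v);
        /* Raising the value is always allowed; lowering it below the
         * current setting needs CAP_SYS_RESOURCE, per the hunk above. */
        rewind(f);
        fprintf(f, "%d\n", 0);
        fclose(f);
        return 0;
    }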
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 6f61b7cc32e0..9bd8be1d235c 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c | |||
@@ -119,6 +119,7 @@ static void *m_start(struct seq_file *m, loff_t *pos) | |||
119 | mm = mm_for_maps(priv->task); | 119 | mm = mm_for_maps(priv->task); |
120 | if (!mm) | 120 | if (!mm) |
121 | return NULL; | 121 | return NULL; |
122 | down_read(&mm->mmap_sem); | ||
122 | 123 | ||
123 | tail_vma = get_gate_vma(priv->task); | 124 | tail_vma = get_gate_vma(priv->task); |
124 | priv->tail_vma = tail_vma; | 125 | priv->tail_vma = tail_vma; |
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 64a72e2e7650..8f5c05d3dbd3 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c | |||
@@ -189,6 +189,7 @@ static void *m_start(struct seq_file *m, loff_t *pos) | |||
189 | priv->task = NULL; | 189 | priv->task = NULL; |
190 | return NULL; | 190 | return NULL; |
191 | } | 191 | } |
192 | down_read(&mm->mmap_sem); | ||
192 | 193 | ||
193 | /* start from the Nth VMA */ | 194 | /* start from the Nth VMA */ |
194 | for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) | 195 | for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) |
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 70f36c043d62..38f7bd559f35 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c | |||
@@ -2043,7 +2043,6 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, | |||
2043 | invalidate_bdev(sb->s_bdev); | 2043 | invalidate_bdev(sb->s_bdev); |
2044 | } | 2044 | } |
2045 | mutex_lock(&dqopt->dqonoff_mutex); | 2045 | mutex_lock(&dqopt->dqonoff_mutex); |
2046 | mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA); | ||
2047 | if (sb_has_quota_loaded(sb, type)) { | 2046 | if (sb_has_quota_loaded(sb, type)) { |
2048 | error = -EBUSY; | 2047 | error = -EBUSY; |
2049 | goto out_lock; | 2048 | goto out_lock; |
@@ -2054,9 +2053,11 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, | |||
2054 | * possible) Also nobody should write to the file - we use | 2053 | * possible) Also nobody should write to the file - we use |
2055 | * special IO operations which ignore the immutable bit. */ | 2054 | * special IO operations which ignore the immutable bit. */ |
2056 | down_write(&dqopt->dqptr_sem); | 2055 | down_write(&dqopt->dqptr_sem); |
2056 | mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA); | ||
2057 | oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE | | 2057 | oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE | |
2058 | S_NOQUOTA); | 2058 | S_NOQUOTA); |
2059 | inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE; | 2059 | inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE; |
2060 | mutex_unlock(&inode->i_mutex); | ||
2060 | up_write(&dqopt->dqptr_sem); | 2061 | up_write(&dqopt->dqptr_sem); |
2061 | sb->dq_op->drop(inode); | 2062 | sb->dq_op->drop(inode); |
2062 | } | 2063 | } |
@@ -2080,7 +2081,6 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, | |||
2080 | goto out_file_init; | 2081 | goto out_file_init; |
2081 | } | 2082 | } |
2082 | mutex_unlock(&dqopt->dqio_mutex); | 2083 | mutex_unlock(&dqopt->dqio_mutex); |
2083 | mutex_unlock(&inode->i_mutex); | ||
2084 | spin_lock(&dq_state_lock); | 2084 | spin_lock(&dq_state_lock); |
2085 | dqopt->flags |= dquot_state_flag(flags, type); | 2085 | dqopt->flags |= dquot_state_flag(flags, type); |
2086 | spin_unlock(&dq_state_lock); | 2086 | spin_unlock(&dq_state_lock); |
@@ -2096,13 +2096,14 @@ out_file_init: | |||
2096 | out_lock: | 2096 | out_lock: |
2097 | if (oldflags != -1) { | 2097 | if (oldflags != -1) { |
2098 | down_write(&dqopt->dqptr_sem); | 2098 | down_write(&dqopt->dqptr_sem); |
2099 | mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA); | ||
2099 | /* Set the flags back (in the case of accidental quotaon() | 2100 | /* Set the flags back (in the case of accidental quotaon() |
2100 | * on a wrong file we don't want to mess up the flags) */ | 2101 | * on a wrong file we don't want to mess up the flags) */ |
2101 | inode->i_flags &= ~(S_NOATIME | S_NOQUOTA | S_IMMUTABLE); | 2102 | inode->i_flags &= ~(S_NOATIME | S_NOQUOTA | S_IMMUTABLE); |
2102 | inode->i_flags |= oldflags; | 2103 | inode->i_flags |= oldflags; |
2104 | mutex_unlock(&inode->i_mutex); | ||
2103 | up_write(&dqopt->dqptr_sem); | 2105 | up_write(&dqopt->dqptr_sem); |
2104 | } | 2106 | } |
2105 | mutex_unlock(&inode->i_mutex); | ||
2106 | mutex_unlock(&dqopt->dqonoff_mutex); | 2107 | mutex_unlock(&dqopt->dqonoff_mutex); |
2107 | out_fmt: | 2108 | out_fmt: |
2108 | put_quota_format(fmt); | 2109 | put_quota_format(fmt); |
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index ebb2c417912c..11f0c06316de 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c | |||
@@ -20,6 +20,7 @@ | |||
20 | #include <linux/ramfs.h> | 20 | #include <linux/ramfs.h> |
21 | #include <linux/pagevec.h> | 21 | #include <linux/pagevec.h> |
22 | #include <linux/mman.h> | 22 | #include <linux/mman.h> |
23 | #include <linux/sched.h> | ||
23 | 24 | ||
24 | #include <asm/uaccess.h> | 25 | #include <asm/uaccess.h> |
25 | #include "internal.h" | 26 | #include "internal.h" |
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index 0ff7566c767c..a7f0110fca4c 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c | |||
@@ -46,6 +46,7 @@ static const struct super_operations ramfs_ops; | |||
46 | static const struct inode_operations ramfs_dir_inode_operations; | 46 | static const struct inode_operations ramfs_dir_inode_operations; |
47 | 47 | ||
48 | static struct backing_dev_info ramfs_backing_dev_info = { | 48 | static struct backing_dev_info ramfs_backing_dev_info = { |
49 | .name = "ramfs", | ||
49 | .ra_pages = 0, /* No readahead */ | 50 | .ra_pages = 0, /* No readahead */ |
50 | .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | | 51 | .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | |
51 | BDI_CAP_MAP_DIRECT | BDI_CAP_MAP_COPY | | 52 | BDI_CAP_MAP_DIRECT | BDI_CAP_MAP_COPY | |
diff --git a/fs/select.c b/fs/select.c index d870237e42c7..8084834e123e 100644 --- a/fs/select.c +++ b/fs/select.c | |||
@@ -110,6 +110,7 @@ void poll_initwait(struct poll_wqueues *pwq) | |||
110 | { | 110 | { |
111 | init_poll_funcptr(&pwq->pt, __pollwait); | 111 | init_poll_funcptr(&pwq->pt, __pollwait); |
112 | pwq->polling_task = current; | 112 | pwq->polling_task = current; |
113 | pwq->triggered = 0; | ||
113 | pwq->error = 0; | 114 | pwq->error = 0; |
114 | pwq->table = NULL; | 115 | pwq->table = NULL; |
115 | pwq->inline_index = 0; | 116 | pwq->inline_index = 0; |
diff --git a/fs/splice.c b/fs/splice.c index 73766d24f97b..7394e9e17534 100644 --- a/fs/splice.c +++ b/fs/splice.c | |||
@@ -502,8 +502,10 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos, | |||
502 | len = left; | 502 | len = left; |
503 | 503 | ||
504 | ret = __generic_file_splice_read(in, ppos, pipe, len, flags); | 504 | ret = __generic_file_splice_read(in, ppos, pipe, len, flags); |
505 | if (ret > 0) | 505 | if (ret > 0) { |
506 | *ppos += ret; | 506 | *ppos += ret; |
507 | file_accessed(in); | ||
508 | } | ||
507 | 509 | ||
508 | return ret; | 510 | return ret; |
509 | } | 511 | } |
@@ -963,8 +965,10 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, | |||
963 | 965 | ||
964 | mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); | 966 | mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); |
965 | ret = file_remove_suid(out); | 967 | ret = file_remove_suid(out); |
966 | if (!ret) | 968 | if (!ret) { |
969 | file_update_time(out); | ||
967 | ret = splice_from_pipe_feed(pipe, &sd, pipe_to_file); | 970 | ret = splice_from_pipe_feed(pipe, &sd, pipe_to_file); |
971 | } | ||
968 | mutex_unlock(&inode->i_mutex); | 972 | mutex_unlock(&inode->i_mutex); |
969 | } while (ret > 0); | 973 | } while (ret > 0); |
970 | splice_from_pipe_end(pipe, &sd); | 974 | splice_from_pipe_end(pipe, &sd); |
@@ -976,25 +980,15 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, | |||
976 | 980 | ||
977 | if (ret > 0) { | 981 | if (ret > 0) { |
978 | unsigned long nr_pages; | 982 | unsigned long nr_pages; |
983 | int err; | ||
979 | 984 | ||
980 | *ppos += ret; | ||
981 | nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | 985 | nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; |
982 | 986 | ||
983 | /* | 987 | err = generic_write_sync(out, *ppos, ret); |
984 | * If file or inode is SYNC and we actually wrote some data, | 988 | if (err) |
985 | * sync it. | 989 | ret = err; |
986 | */ | 990 | else |
987 | if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) { | 991 | *ppos += ret; |
988 | int err; | ||
989 | |||
990 | mutex_lock(&inode->i_mutex); | ||
991 | err = generic_osync_inode(inode, mapping, | ||
992 | OSYNC_METADATA|OSYNC_DATA); | ||
993 | mutex_unlock(&inode->i_mutex); | ||
994 | |||
995 | if (err) | ||
996 | ret = err; | ||
997 | } | ||
998 | balance_dirty_pages_ratelimited_nr(mapping, nr_pages); | 992 | balance_dirty_pages_ratelimited_nr(mapping, nr_pages); |
999 | } | 993 | } |
1000 | 994 | ||
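The splice.c hunks make generic_file_splice_read() update the source file's atime via file_accessed(), and the write side now bumps file timestamps and defers O_SYNC handling to the new generic_write_sync() helper instead of open-coding generic_osync_inode(). So splice now matches read(2)/write(2) semantics. A runnable consumer exercising the read path that now touches atime (splice(2) requires a pipe on one side):

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(int argc, char **argv)
    {
        if (argc < 2) { fprintf(stderr, "usage: %s file\n", argv[0]); return 1; }
        int in = open(argv[1], O_RDONLY);
        if (in < 0) { perror("open"); return 1; }
        int p[2];
        if (pipe(p) < 0) { perror("pipe"); return 1; }
        loff_t off = 0;
        /* file -> pipe: this is the generic_file_splice_read() path */
        ssize_t n = splice(in, &off, p[1], NULL, 4096, SPLICE_F_MOVE);
        if (n > 0)
            splice(p[0], NULL, STDOUT_FILENO, NULL, n, SPLICE_F_MOVE);
        close(in);
        return 0;
    }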
diff --git a/fs/super.c b/fs/super.c index 2761d3e22ed9..b03fea8fbfb6 100644 --- a/fs/super.c +++ b/fs/super.c | |||
@@ -62,9 +62,6 @@ static struct super_block *alloc_super(struct file_system_type *type) | |||
62 | s = NULL; | 62 | s = NULL; |
63 | goto out; | 63 | goto out; |
64 | } | 64 | } |
65 | INIT_LIST_HEAD(&s->s_dirty); | ||
66 | INIT_LIST_HEAD(&s->s_io); | ||
67 | INIT_LIST_HEAD(&s->s_more_io); | ||
68 | INIT_LIST_HEAD(&s->s_files); | 65 | INIT_LIST_HEAD(&s->s_files); |
69 | INIT_LIST_HEAD(&s->s_instances); | 66 | INIT_LIST_HEAD(&s->s_instances); |
70 | INIT_HLIST_HEAD(&s->s_anon); | 67 | INIT_HLIST_HEAD(&s->s_anon); |
@@ -171,7 +168,7 @@ int __put_super_and_need_restart(struct super_block *sb) | |||
171 | * Drops a temporary reference, frees superblock if there's no | 168 | * Drops a temporary reference, frees superblock if there's no |
172 | * references left. | 169 | * references left. |
173 | */ | 170 | */ |
174 | static void put_super(struct super_block *sb) | 171 | void put_super(struct super_block *sb) |
175 | { | 172 | { |
176 | spin_lock(&sb_lock); | 173 | spin_lock(&sb_lock); |
177 | __put_super(sb); | 174 | __put_super(sb); |
@@ -710,6 +707,12 @@ static int set_bdev_super(struct super_block *s, void *data) | |||
710 | { | 707 | { |
711 | s->s_bdev = data; | 708 | s->s_bdev = data; |
712 | s->s_dev = s->s_bdev->bd_dev; | 709 | s->s_dev = s->s_bdev->bd_dev; |
710 | |||
711 | /* | ||
712 | * We set the bdi here to the queue backing, file systems can | ||
713 | * overwrite this in ->fill_super() | ||
714 | */ | ||
715 | s->s_bdi = &bdev_get_queue(s->s_bdev)->backing_dev_info; | ||
713 | return 0; | 716 | return 0; |
714 | } | 717 | } |
715 | 718 | ||
diff --git a/fs/sync.c b/fs/sync.c --- a/fs/sync.c +++ b/fs/sync.c | |||
@@ -19,20 +19,29 @@ | |||
19 | SYNC_FILE_RANGE_WAIT_AFTER) | 19 | SYNC_FILE_RANGE_WAIT_AFTER) |
20 | 20 | ||
21 | /* | 21 | /* |
22 | * Do the filesystem syncing work. For simple filesystems sync_inodes_sb(sb, 0) | 22 | * Do the filesystem syncing work. For simple filesystems |
23 | * just dirties buffers with inodes so we have to submit IO for these buffers | 23 | * writeback_inodes_sb(sb) just dirties buffers with inodes so we have to |
24 | * via __sync_blockdev(). This also speeds up the wait == 1 case since in that | 24 | * submit IO for these buffers via __sync_blockdev(). This also speeds up the |
25 | * case write_inode() functions do sync_dirty_buffer() and thus effectively | 25 | * wait == 1 case since in that case write_inode() functions do |
26 | * write one block at a time. | 26 | * sync_dirty_buffer() and thus effectively write one block at a time. |
27 | */ | 27 | */ |
28 | static int __sync_filesystem(struct super_block *sb, int wait) | 28 | static int __sync_filesystem(struct super_block *sb, int wait) |
29 | { | 29 | { |
30 | /* | ||
31 | * This should be safe, as we require bdi backing to actually | ||
32 | * write out data in the first place | ||
33 | */ | ||
34 | if (!sb->s_bdi) | ||
35 | return 0; | ||
36 | |||
30 | /* Avoid doing twice syncing and cache pruning for quota sync */ | 37 | /* Avoid doing twice syncing and cache pruning for quota sync */ |
31 | if (!wait) | 38 | if (!wait) { |
32 | writeout_quota_sb(sb, -1); | 39 | writeout_quota_sb(sb, -1); |
33 | else | 40 | writeback_inodes_sb(sb); |
41 | } else { | ||
34 | sync_quota_sb(sb, -1); | 42 | sync_quota_sb(sb, -1); |
35 | sync_inodes_sb(sb, wait); | 43 | sync_inodes_sb(sb); |
44 | } | ||
36 | if (sb->s_op->sync_fs) | 45 | if (sb->s_op->sync_fs) |
37 | sb->s_op->sync_fs(sb, wait); | 46 | sb->s_op->sync_fs(sb, wait); |
38 | return __sync_blockdev(sb->s_bdev, wait); | 47 | return __sync_blockdev(sb->s_bdev, wait); |
@@ -99,7 +108,7 @@ restart: | |||
99 | spin_unlock(&sb_lock); | 108 | spin_unlock(&sb_lock); |
100 | 109 | ||
101 | down_read(&sb->s_umount); | 110 | down_read(&sb->s_umount); |
102 | if (!(sb->s_flags & MS_RDONLY) && sb->s_root) | 111 | if (!(sb->s_flags & MS_RDONLY) && sb->s_root && sb->s_bdi) |
103 | __sync_filesystem(sb, wait); | 112 | __sync_filesystem(sb, wait); |
104 | up_read(&sb->s_umount); | 113 | up_read(&sb->s_umount); |
105 | 114 | ||
@@ -118,7 +127,7 @@ restart: | |||
118 | */ | 127 | */ |
119 | SYSCALL_DEFINE0(sync) | 128 | SYSCALL_DEFINE0(sync) |
120 | { | 129 | { |
121 | wakeup_pdflush(0); | 130 | wakeup_flusher_threads(0); |
122 | sync_filesystems(0); | 131 | sync_filesystems(0); |
123 | sync_filesystems(1); | 132 | sync_filesystems(1); |
124 | if (unlikely(laptop_mode)) | 133 | if (unlikely(laptop_mode)) |
@@ -176,19 +185,23 @@ int file_fsync(struct file *filp, struct dentry *dentry, int datasync) | |||
176 | } | 185 | } |
177 | 186 | ||
178 | /** | 187 | /** |
179 | * vfs_fsync - perform a fsync or fdatasync on a file | 188 | * vfs_fsync_range - helper to sync a range of data & metadata to disk |
180 | * @file: file to sync | 189 | * @file: file to sync |
181 | * @dentry: dentry of @file | 190 | * @dentry: dentry of @file |
182 | * @data: only perform a fdatasync operation | 191 | * @start: offset in bytes of the beginning of data range to sync |
192 | * @end: offset in bytes of the end of data range (inclusive) | ||
193 | * @datasync: perform only datasync | ||
183 | * | 194 | * |
184 | * Write back data and metadata for @file to disk. If @datasync is | 195 | * Write back data in range @start..@end and metadata for @file to disk. If |
185 | * set only metadata needed to access modified file data is written. | 196 | * @datasync is set only metadata needed to access modified file data is |
197 | * written. | ||
186 | * | 198 | * |
187 | * In case this function is called from nfsd @file may be %NULL and | 199 | * In case this function is called from nfsd @file may be %NULL and |
188 | * only @dentry is set. This can only happen when the filesystem | 200 | * only @dentry is set. This can only happen when the filesystem |
189 | * implements the export_operations API. | 201 | * implements the export_operations API. |
190 | */ | 202 | */ |
191 | int vfs_fsync(struct file *file, struct dentry *dentry, int datasync) | 203 | int vfs_fsync_range(struct file *file, struct dentry *dentry, loff_t start, |
204 | loff_t end, int datasync) | ||
192 | { | 205 | { |
193 | const struct file_operations *fop; | 206 | const struct file_operations *fop; |
194 | struct address_space *mapping; | 207 | struct address_space *mapping; |
@@ -212,7 +225,7 @@ int vfs_fsync(struct file *file, struct dentry *dentry, int datasync) | |||
212 | goto out; | 225 | goto out; |
213 | } | 226 | } |
214 | 227 | ||
215 | ret = filemap_fdatawrite(mapping); | 228 | ret = filemap_write_and_wait_range(mapping, start, end); |
216 | 229 | ||
217 | /* | 230 | /* |
218 | * We need to protect against concurrent writers, which could cause | 231 | * We need to protect against concurrent writers, which could cause |
@@ -223,12 +236,29 @@ int vfs_fsync(struct file *file, struct dentry *dentry, int datasync) | |||
223 | if (!ret) | 236 | if (!ret) |
224 | ret = err; | 237 | ret = err; |
225 | mutex_unlock(&mapping->host->i_mutex); | 238 | mutex_unlock(&mapping->host->i_mutex); |
226 | err = filemap_fdatawait(mapping); | 239 | |
227 | if (!ret) | ||
228 | ret = err; | ||
229 | out: | 240 | out: |
230 | return ret; | 241 | return ret; |
231 | } | 242 | } |
243 | EXPORT_SYMBOL(vfs_fsync_range); | ||
244 | |||
245 | /** | ||
246 | * vfs_fsync - perform a fsync or fdatasync on a file | ||
247 | * @file: file to sync | ||
248 | * @dentry: dentry of @file | ||
249 | * @datasync: only perform a fdatasync operation | ||
250 | * | ||
251 | * Write back data and metadata for @file to disk. If @datasync is | ||
252 | * set only metadata needed to access modified file data is written. | ||
253 | * | ||
254 | * In case this function is called from nfsd @file may be %NULL and | ||
255 | * only @dentry is set. This can only happen when the filesystem | ||
256 | * implements the export_operations API. | ||
257 | */ | ||
258 | int vfs_fsync(struct file *file, struct dentry *dentry, int datasync) | ||
259 | { | ||
260 | return vfs_fsync_range(file, dentry, 0, LLONG_MAX, datasync); | ||
261 | } | ||
232 | EXPORT_SYMBOL(vfs_fsync); | 262 | EXPORT_SYMBOL(vfs_fsync); |
233 | 263 | ||
234 | static int do_fsync(unsigned int fd, int datasync) | 264 | static int do_fsync(unsigned int fd, int datasync) |
@@ -254,6 +284,23 @@ SYSCALL_DEFINE1(fdatasync, unsigned int, fd) | |||
254 | return do_fsync(fd, 1); | 284 | return do_fsync(fd, 1); |
255 | } | 285 | } |
256 | 286 | ||
287 | /** | ||
288 | * generic_write_sync - perform syncing after a write if file / inode is sync | ||
289 | * @file: file to which the write happened | ||
290 | * @pos: offset where the write started | ||
291 | * @count: length of the write | ||
292 | * | ||
293 | * This is just a simple wrapper about our general syncing function. | ||
294 | */ | ||
295 | int generic_write_sync(struct file *file, loff_t pos, loff_t count) | ||
296 | { | ||
297 | if (!(file->f_flags & O_SYNC) && !IS_SYNC(file->f_mapping->host)) | ||
298 | return 0; | ||
299 | return vfs_fsync_range(file, file->f_path.dentry, pos, | ||
300 | pos + count - 1, 1); | ||
301 | } | ||
302 | EXPORT_SYMBOL(generic_write_sync); | ||
303 | |||
257 | /* | 304 | /* |
258 | * sys_sync_file_range() permits finely controlled syncing over a segment of | 305 | * sys_sync_file_range() permits finely controlled syncing over a segment of |
259 | * a file in the range offset .. (offset+nbytes-1) inclusive. If nbytes is | 306 | * a file in the range offset .. (offset+nbytes-1) inclusive. If nbytes is |
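generic_write_sync() above is a no-op unless O_SYNC or IS_SYNC applies, and then flushes exactly the byte range [pos, pos+count-1] through vfs_fsync_range(). The closest userland analogue is sync_file_range(2), already referenced in this file's comments; note that it flushes data only, whereas the kernel helper also writes the metadata a datasync needs, so this is an approximation:

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        int fd = open("sync_demo.tmp", O_CREAT | O_WRONLY, 0644);
        if (fd < 0) { perror("open"); return 1; }
        const char buf[] = "hello";
        off_t pos = 0;
        ssize_t n = pwrite(fd, buf, sizeof(buf) - 1, pos);
        if (n > 0) {
            /* mirror of generic_write_sync(): flush only [pos, pos+n-1] */
            if (sync_file_range(fd, pos, n,
                                SYNC_FILE_RANGE_WRITE |
                                SYNC_FILE_RANGE_WAIT_AFTER) < 0)
                perror("sync_file_range");
        }
        close(fd);
        unlink("sync_demo.tmp");
        return 0;
    }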
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index d88d0fac9fa5..0050fc40e8c9 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c | |||
@@ -760,6 +760,7 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry, | |||
760 | const struct inode_operations sysfs_dir_inode_operations = { | 760 | const struct inode_operations sysfs_dir_inode_operations = { |
761 | .lookup = sysfs_lookup, | 761 | .lookup = sysfs_lookup, |
762 | .setattr = sysfs_setattr, | 762 | .setattr = sysfs_setattr, |
763 | .setxattr = sysfs_setxattr, | ||
763 | }; | 764 | }; |
764 | 765 | ||
765 | static void remove_dir(struct sysfs_dirent *sd) | 766 | static void remove_dir(struct sysfs_dirent *sd) |
@@ -939,8 +940,10 @@ again: | |||
939 | /* Remove from old parent's list and insert into new parent's list. */ | 940 | /* Remove from old parent's list and insert into new parent's list. */ |
940 | sysfs_unlink_sibling(sd); | 941 | sysfs_unlink_sibling(sd); |
941 | sysfs_get(new_parent_sd); | 942 | sysfs_get(new_parent_sd); |
943 | drop_nlink(old_parent->d_inode); | ||
942 | sysfs_put(sd->s_parent); | 944 | sysfs_put(sd->s_parent); |
943 | sd->s_parent = new_parent_sd; | 945 | sd->s_parent = new_parent_sd; |
946 | inc_nlink(new_parent->d_inode); | ||
944 | sysfs_link_sibling(sd); | 947 | sysfs_link_sibling(sd); |
945 | 948 | ||
946 | out_unlock: | 949 | out_unlock: |
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index 555f0ff988df..e28cecf179f5 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c | |||
@@ -18,6 +18,8 @@ | |||
18 | #include <linux/capability.h> | 18 | #include <linux/capability.h> |
19 | #include <linux/errno.h> | 19 | #include <linux/errno.h> |
20 | #include <linux/sched.h> | 20 | #include <linux/sched.h> |
21 | #include <linux/xattr.h> | ||
22 | #include <linux/security.h> | ||
21 | #include "sysfs.h" | 23 | #include "sysfs.h" |
22 | 24 | ||
23 | extern struct super_block * sysfs_sb; | 25 | extern struct super_block * sysfs_sb; |
@@ -29,12 +31,14 @@ static const struct address_space_operations sysfs_aops = { | |||
29 | }; | 31 | }; |
30 | 32 | ||
31 | static struct backing_dev_info sysfs_backing_dev_info = { | 33 | static struct backing_dev_info sysfs_backing_dev_info = { |
34 | .name = "sysfs", | ||
32 | .ra_pages = 0, /* No readahead */ | 35 | .ra_pages = 0, /* No readahead */ |
33 | .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, | 36 | .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, |
34 | }; | 37 | }; |
35 | 38 | ||
36 | static const struct inode_operations sysfs_inode_operations ={ | 39 | static const struct inode_operations sysfs_inode_operations ={ |
37 | .setattr = sysfs_setattr, | 40 | .setattr = sysfs_setattr, |
41 | .setxattr = sysfs_setxattr, | ||
38 | }; | 42 | }; |
39 | 43 | ||
40 | int __init sysfs_inode_init(void) | 44 | int __init sysfs_inode_init(void) |
@@ -42,18 +46,37 @@ int __init sysfs_inode_init(void) | |||
42 | return bdi_init(&sysfs_backing_dev_info); | 46 | return bdi_init(&sysfs_backing_dev_info); |
43 | } | 47 | } |
44 | 48 | ||
49 | struct sysfs_inode_attrs *sysfs_init_inode_attrs(struct sysfs_dirent *sd) | ||
50 | { | ||
51 | struct sysfs_inode_attrs *attrs; | ||
52 | struct iattr *iattrs; | ||
53 | |||
54 | attrs = kzalloc(sizeof(struct sysfs_inode_attrs), GFP_KERNEL); | ||
55 | if (!attrs) | ||
56 | return NULL; | ||
57 | iattrs = &attrs->ia_iattr; | ||
58 | |||
59 | /* assign default attributes */ | ||
60 | iattrs->ia_mode = sd->s_mode; | ||
61 | iattrs->ia_uid = 0; | ||
62 | iattrs->ia_gid = 0; | ||
63 | iattrs->ia_atime = iattrs->ia_mtime = iattrs->ia_ctime = CURRENT_TIME; | ||
64 | |||
65 | return attrs; | ||
66 | } | ||
45 | int sysfs_setattr(struct dentry * dentry, struct iattr * iattr) | 67 | int sysfs_setattr(struct dentry * dentry, struct iattr * iattr) |
46 | { | 68 | { |
47 | struct inode * inode = dentry->d_inode; | 69 | struct inode * inode = dentry->d_inode; |
48 | struct sysfs_dirent * sd = dentry->d_fsdata; | 70 | struct sysfs_dirent * sd = dentry->d_fsdata; |
49 | struct iattr * sd_iattr; | 71 | struct sysfs_inode_attrs *sd_attrs; |
72 | struct iattr *iattrs; | ||
50 | unsigned int ia_valid = iattr->ia_valid; | 73 | unsigned int ia_valid = iattr->ia_valid; |
51 | int error; | 74 | int error; |
52 | 75 | ||
53 | if (!sd) | 76 | if (!sd) |
54 | return -EINVAL; | 77 | return -EINVAL; |
55 | 78 | ||
56 | sd_iattr = sd->s_iattr; | 79 | sd_attrs = sd->s_iattr; |
57 | 80 | ||
58 | error = inode_change_ok(inode, iattr); | 81 | error = inode_change_ok(inode, iattr); |
59 | if (error) | 82 | if (error) |
@@ -65,42 +88,77 @@ int sysfs_setattr(struct dentry * dentry, struct iattr * iattr) | |||
65 | if (error) | 88 | if (error) |
66 | return error; | 89 | return error; |
67 | 90 | ||
68 | if (!sd_iattr) { | 91 | if (!sd_attrs) { |
69 | /* setting attributes for the first time, allocate now */ | 92 | /* setting attributes for the first time, allocate now */ |
70 | sd_iattr = kzalloc(sizeof(struct iattr), GFP_KERNEL); | 93 | sd_attrs = sysfs_init_inode_attrs(sd); |
71 | if (!sd_iattr) | 94 | if (!sd_attrs) |
72 | return -ENOMEM; | 95 | return -ENOMEM; |
73 | /* assign default attributes */ | 96 | sd->s_iattr = sd_attrs; |
74 | sd_iattr->ia_mode = sd->s_mode; | 97 | } else { |
75 | sd_iattr->ia_uid = 0; | 98 | /* attributes were changed at least once in past */ |
76 | sd_iattr->ia_gid = 0; | 99 | iattrs = &sd_attrs->ia_iattr; |
77 | sd_iattr->ia_atime = sd_iattr->ia_mtime = sd_iattr->ia_ctime = CURRENT_TIME; | 100 | |
78 | sd->s_iattr = sd_iattr; | 101 | if (ia_valid & ATTR_UID) |
102 | iattrs->ia_uid = iattr->ia_uid; | ||
103 | if (ia_valid & ATTR_GID) | ||
104 | iattrs->ia_gid = iattr->ia_gid; | ||
105 | if (ia_valid & ATTR_ATIME) | ||
106 | iattrs->ia_atime = timespec_trunc(iattr->ia_atime, | ||
107 | inode->i_sb->s_time_gran); | ||
108 | if (ia_valid & ATTR_MTIME) | ||
109 | iattrs->ia_mtime = timespec_trunc(iattr->ia_mtime, | ||
110 | inode->i_sb->s_time_gran); | ||
111 | if (ia_valid & ATTR_CTIME) | ||
112 | iattrs->ia_ctime = timespec_trunc(iattr->ia_ctime, | ||
113 | inode->i_sb->s_time_gran); | ||
114 | if (ia_valid & ATTR_MODE) { | ||
115 | umode_t mode = iattr->ia_mode; | ||
116 | |||
117 | if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) | ||
118 | mode &= ~S_ISGID; | ||
119 | iattrs->ia_mode = sd->s_mode = mode; | ||
120 | } | ||
79 | } | 121 | } |
122 | return error; | ||
123 | } | ||
80 | 124 | ||
81 | /* attributes were changed atleast once in past */ | 125 | int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value, |
82 | 126 | size_t size, int flags) | |
83 | if (ia_valid & ATTR_UID) | 127 | { |
84 | sd_iattr->ia_uid = iattr->ia_uid; | 128 | struct sysfs_dirent *sd = dentry->d_fsdata; |
85 | if (ia_valid & ATTR_GID) | 129 | struct sysfs_inode_attrs *iattrs; |
86 | sd_iattr->ia_gid = iattr->ia_gid; | 130 | void *secdata; |
87 | if (ia_valid & ATTR_ATIME) | 131 | int error; |
88 | sd_iattr->ia_atime = timespec_trunc(iattr->ia_atime, | 132 | u32 secdata_len = 0; |
89 | inode->i_sb->s_time_gran); | 133 | |
90 | if (ia_valid & ATTR_MTIME) | 134 | if (!sd) |
91 | sd_iattr->ia_mtime = timespec_trunc(iattr->ia_mtime, | 135 | return -EINVAL; |
92 | inode->i_sb->s_time_gran); | 136 | if (!sd->s_iattr) |
93 | if (ia_valid & ATTR_CTIME) | 137 | sd->s_iattr = sysfs_init_inode_attrs(sd); |
94 | sd_iattr->ia_ctime = timespec_trunc(iattr->ia_ctime, | 138 | if (!sd->s_iattr) |
95 | inode->i_sb->s_time_gran); | 139 | return -ENOMEM; |
96 | if (ia_valid & ATTR_MODE) { | 140 | |
97 | umode_t mode = iattr->ia_mode; | 141 | iattrs = sd->s_iattr; |
98 | 142 | ||
99 | if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) | 143 | if (!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) { |
100 | mode &= ~S_ISGID; | 144 | const char *suffix = name + XATTR_SECURITY_PREFIX_LEN; |
101 | sd_iattr->ia_mode = sd->s_mode = mode; | 145 | error = security_inode_setsecurity(dentry->d_inode, suffix, |
102 | } | 146 | value, size, flags); |
147 | if (error) | ||
148 | goto out; | ||
149 | error = security_inode_getsecctx(dentry->d_inode, | ||
150 | &secdata, &secdata_len); | ||
151 | if (error) | ||
152 | goto out; | ||
153 | if (iattrs->ia_secdata) | ||
154 | security_release_secctx(iattrs->ia_secdata, | ||
155 | iattrs->ia_secdata_len); | ||
156 | iattrs->ia_secdata = secdata; | ||
157 | iattrs->ia_secdata_len = secdata_len; | ||
103 | 158 | ||
159 | } else | ||
160 | return -EINVAL; | ||
161 | out: | ||
104 | return error; | 162 | return error; |
105 | } | 163 | } |
106 | 164 | ||
@@ -146,6 +204,7 @@ static int sysfs_count_nlink(struct sysfs_dirent *sd) | |||
146 | static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode) | 204 | static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode) |
147 | { | 205 | { |
148 | struct bin_attribute *bin_attr; | 206 | struct bin_attribute *bin_attr; |
207 | struct sysfs_inode_attrs *iattrs; | ||
149 | 208 | ||
150 | inode->i_private = sysfs_get(sd); | 209 | inode->i_private = sysfs_get(sd); |
151 | inode->i_mapping->a_ops = &sysfs_aops; | 210 | inode->i_mapping->a_ops = &sysfs_aops; |
@@ -154,16 +213,20 @@ static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode) | |||
154 | inode->i_ino = sd->s_ino; | 213 | inode->i_ino = sd->s_ino; |
155 | lockdep_set_class(&inode->i_mutex, &sysfs_inode_imutex_key); | 214 | lockdep_set_class(&inode->i_mutex, &sysfs_inode_imutex_key); |
156 | 215 | ||
157 | if (sd->s_iattr) { | 216 | iattrs = sd->s_iattr; |
217 | if (iattrs) { | ||
158 | /* sysfs_dirent has non-default attributes | 218 | /* sysfs_dirent has non-default attributes |
159 | * get them for the new inode from persistent copy | 219 | * get them for the new inode from persistent copy |
160 | * in sysfs_dirent | 220 | * in sysfs_dirent |
161 | */ | 221 | */ |
162 | set_inode_attr(inode, sd->s_iattr); | 222 | set_inode_attr(inode, &iattrs->ia_iattr); |
223 | if (iattrs->ia_secdata) | ||
224 | security_inode_notifysecctx(inode, | ||
225 | iattrs->ia_secdata, | ||
226 | iattrs->ia_secdata_len); | ||
163 | } else | 227 | } else |
164 | set_default_inode_attr(inode, sd->s_mode); | 228 | set_default_inode_attr(inode, sd->s_mode); |
165 | 229 | ||
166 | |||
167 | /* initialize inode according to type */ | 230 | /* initialize inode according to type */ |
168 | switch (sysfs_type(sd)) { | 231 | switch (sysfs_type(sd)) { |
169 | case SYSFS_DIR: | 232 | case SYSFS_DIR: |
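With sysfs_setxattr() wired into the directory, plain inode, and symlink operations, security.* attributes can now be stored on sysfs nodes (in the new sysfs_inode_attrs secdata fields) and replayed onto fresh inodes via security_inode_notifysecctx(). An illustrative call from userspace — the SELinux label is an assumed example, and the call only succeeds with an LSM loaded and sufficient privilege:

    #include <stdio.h>
    #include <string.h>
    #include <sys/xattr.h>

    int main(void)
    {
        const char *ctx = "system_u:object_r:sysfs_t:s0"; /* assumed label */
        if (setxattr("/sys/kernel", "security.selinux", ctx,
                     strlen(ctx) + 1, 0) < 0)
            perror("setxattr");        /* needs SELinux + privilege */
        return 0;
    }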
diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c index 1d897ad808e0..c5081ad77026 100644 --- a/fs/sysfs/symlink.c +++ b/fs/sysfs/symlink.c | |||
@@ -16,6 +16,7 @@ | |||
16 | #include <linux/kobject.h> | 16 | #include <linux/kobject.h> |
17 | #include <linux/namei.h> | 17 | #include <linux/namei.h> |
18 | #include <linux/mutex.h> | 18 | #include <linux/mutex.h> |
19 | #include <linux/security.h> | ||
19 | 20 | ||
20 | #include "sysfs.h" | 21 | #include "sysfs.h" |
21 | 22 | ||
@@ -209,6 +210,7 @@ static void sysfs_put_link(struct dentry *dentry, struct nameidata *nd, void *co | |||
209 | } | 210 | } |
210 | 211 | ||
211 | const struct inode_operations sysfs_symlink_inode_operations = { | 212 | const struct inode_operations sysfs_symlink_inode_operations = { |
213 | .setxattr = sysfs_setxattr, | ||
212 | .readlink = generic_readlink, | 214 | .readlink = generic_readlink, |
213 | .follow_link = sysfs_follow_link, | 215 | .follow_link = sysfs_follow_link, |
214 | .put_link = sysfs_put_link, | 216 | .put_link = sysfs_put_link, |
diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index 3fa0d98481e2..af4c4e7482ac 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h | |||
@@ -8,6 +8,8 @@ | |||
8 | * This file is released under the GPLv2. | 8 | * This file is released under the GPLv2. |
9 | */ | 9 | */ |
10 | 10 | ||
11 | #include <linux/fs.h> | ||
12 | |||
11 | struct sysfs_open_dirent; | 13 | struct sysfs_open_dirent; |
12 | 14 | ||
13 | /* type-specific structures for sysfs_dirent->s_* union members */ | 15 | /* type-specific structures for sysfs_dirent->s_* union members */ |
@@ -31,6 +33,12 @@ struct sysfs_elem_bin_attr { | |||
31 | struct hlist_head buffers; | 33 | struct hlist_head buffers; |
32 | }; | 34 | }; |
33 | 35 | ||
36 | struct sysfs_inode_attrs { | ||
37 | struct iattr ia_iattr; | ||
38 | void *ia_secdata; | ||
39 | u32 ia_secdata_len; | ||
40 | }; | ||
41 | |||
34 | /* | 42 | /* |
35 | * sysfs_dirent - the building block of sysfs hierarchy. Each and | 43 | * sysfs_dirent - the building block of sysfs hierarchy. Each and |
36 | * every sysfs node is represented by single sysfs_dirent. | 44 | * every sysfs node is represented by single sysfs_dirent. |
@@ -56,7 +64,7 @@ struct sysfs_dirent { | |||
56 | unsigned int s_flags; | 64 | unsigned int s_flags; |
57 | ino_t s_ino; | 65 | ino_t s_ino; |
58 | umode_t s_mode; | 66 | umode_t s_mode; |
59 | struct iattr *s_iattr; | 67 | struct sysfs_inode_attrs *s_iattr; |
60 | }; | 68 | }; |
61 | 69 | ||
62 | #define SD_DEACTIVATED_BIAS INT_MIN | 70 | #define SD_DEACTIVATED_BIAS INT_MIN |
@@ -148,6 +156,8 @@ static inline void __sysfs_put(struct sysfs_dirent *sd) | |||
148 | struct inode *sysfs_get_inode(struct sysfs_dirent *sd); | 156 | struct inode *sysfs_get_inode(struct sysfs_dirent *sd); |
149 | void sysfs_delete_inode(struct inode *inode); | 157 | void sysfs_delete_inode(struct inode *inode); |
150 | int sysfs_setattr(struct dentry *dentry, struct iattr *iattr); | 158 | int sysfs_setattr(struct dentry *dentry, struct iattr *iattr); |
159 | int sysfs_setxattr(struct dentry *dentry, const char *name, const void *value, | ||
160 | size_t size, int flags); | ||
151 | int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name); | 161 | int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const char *name); |
152 | int sysfs_inode_init(void); | 162 | int sysfs_inode_init(void); |
153 | 163 | ||
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c index c1f3f99b2939..076ca50e9933 100644 --- a/fs/ubifs/budget.c +++ b/fs/ubifs/budget.c | |||
@@ -54,41 +54,15 @@ | |||
54 | * @nr_to_write: how many dirty pages to write-back | 54 | * @nr_to_write: how many dirty pages to write-back |
55 | * | 55 | * |
56 | * This function shrinks UBIFS liability by means of writing back some amount | 56 | * This function shrinks UBIFS liability by means of writing back some amount |
57 | * of dirty inodes and their pages. Returns the amount of pages which were | 57 | * of dirty inodes and their pages. |
58 | * written back. The returned value does not include dirty inodes which were | ||
59 | * synchronized. | ||
60 | * | 58 | * |
61 | * Note, this function synchronizes even VFS inodes which are locked | 59 | * Note, this function synchronizes even VFS inodes which are locked |
62 | * (@i_mutex) by the caller of the budgeting function, because write-back does | 60 | * (@i_mutex) by the caller of the budgeting function, because write-back does |
63 | * not touch @i_mutex. | 61 | * not touch @i_mutex. |
64 | */ | 62 | */ |
65 | static int shrink_liability(struct ubifs_info *c, int nr_to_write) | 63 | static void shrink_liability(struct ubifs_info *c, int nr_to_write) |
66 | { | 64 | { |
67 | int nr_written; | 65 | writeback_inodes_sb(c->vfs_sb); |
68 | struct writeback_control wbc = { | ||
69 | .sync_mode = WB_SYNC_NONE, | ||
70 | .range_end = LLONG_MAX, | ||
71 | .nr_to_write = nr_to_write, | ||
72 | }; | ||
73 | |||
74 | generic_sync_sb_inodes(c->vfs_sb, &wbc); | ||
75 | nr_written = nr_to_write - wbc.nr_to_write; | ||
76 | |||
77 | if (!nr_written) { | ||
78 | /* | ||
79 | * Re-try again but wait on pages/inodes which are being | ||
80 | * written-back concurrently (e.g., by pdflush). | ||
81 | */ | ||
82 | memset(&wbc, 0, sizeof(struct writeback_control)); | ||
83 | wbc.sync_mode = WB_SYNC_ALL; | ||
84 | wbc.range_end = LLONG_MAX; | ||
85 | wbc.nr_to_write = nr_to_write; | ||
86 | generic_sync_sb_inodes(c->vfs_sb, &wbc); | ||
87 | nr_written = nr_to_write - wbc.nr_to_write; | ||
88 | } | ||
89 | |||
90 | dbg_budg("%d pages were written back", nr_written); | ||
91 | return nr_written; | ||
92 | } | 66 | } |
93 | 67 | ||
94 | /** | 68 | /** |
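shrink_liability() above now just kicks the per-bdi flusher with writeback_inodes_sb() and no longer counts written pages, since its caller only needs writeback started, not a tally. The nearest userland analogue is syncfs(2), which flushes (and waits on) the filesystem containing a descriptor — coarser than the kernel call, but the same per-filesystem scope:

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        int fd = open(".", O_RDONLY);  /* any file on the target fs */
        if (fd < 0) { perror("open"); return 1; }
        if (syncfs(fd) < 0)
            perror("syncfs");
        close(fd);
        return 0;
    }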
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 7e2b3d4d487a..333e181ee987 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c | |||
@@ -1952,6 +1952,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent) | |||
1952 | * | 1952 | * |
1953 | * Read-ahead will be disabled because @c->bdi.ra_pages is 0. | 1953 | * Read-ahead will be disabled because @c->bdi.ra_pages is 0. |
1954 | */ | 1954 | */ |
1955 | c->bdi.name = "ubifs", | ||
1955 | c->bdi.capabilities = BDI_CAP_MAP_COPY; | 1956 | c->bdi.capabilities = BDI_CAP_MAP_COPY; |
1956 | c->bdi.unplug_io_fn = default_unplug_io_fn; | 1957 | c->bdi.unplug_io_fn = default_unplug_io_fn; |
1957 | err = bdi_init(&c->bdi); | 1958 | err = bdi_init(&c->bdi); |
@@ -1966,6 +1967,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent) | |||
1966 | if (err) | 1967 | if (err) |
1967 | goto out_bdi; | 1968 | goto out_bdi; |
1968 | 1969 | ||
1970 | sb->s_bdi = &c->bdi; | ||
1969 | sb->s_fs_info = c; | 1971 | sb->s_fs_info = c; |
1970 | sb->s_magic = UBIFS_SUPER_MAGIC; | 1972 | sb->s_magic = UBIFS_SUPER_MAGIC; |
1971 | sb->s_blocksize = UBIFS_BLOCK_SIZE; | 1973 | sb->s_blocksize = UBIFS_BLOCK_SIZE; |
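These two hunks wire the filesystem's private backing_dev_info into the per-bdi writeback machinery: the bdi gets a name and the superblock is pointed at it. A condensed sketch of the wiring, with the name string purely illustrative:

#include <linux/backing-dev.h>
#include <linux/fs.h>

static int example_wire_bdi(struct super_block *sb,
			    struct backing_dev_info *bdi)
{
	int err;

	bdi->name = "examplefs";	/* flusher threads carry this name */
	bdi->capabilities = BDI_CAP_MAP_COPY;
	err = bdi_init(bdi);
	if (err)
		return err;

	/* Without this assignment the per-bdi flusher cannot find the
	 * device, and dirty data on the filesystem is never written. */
	sb->s_bdi = bdi;
	return 0;
}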
diff --git a/fs/udf/directory.c b/fs/udf/directory.c index 1d2c570704c8..2ffdb6733af1 100644 --- a/fs/udf/directory.c +++ b/fs/udf/directory.c | |||
@@ -18,59 +18,6 @@ | |||
18 | #include <linux/string.h> | 18 | #include <linux/string.h> |
19 | #include <linux/buffer_head.h> | 19 | #include <linux/buffer_head.h> |
20 | 20 | ||
21 | #if 0 | ||
22 | static uint8_t *udf_filead_read(struct inode *dir, uint8_t *tmpad, | ||
23 | uint8_t ad_size, struct kernel_lb_addr fe_loc, | ||
24 | int *pos, int *offset, struct buffer_head **bh, | ||
25 | int *error) | ||
26 | { | ||
27 | int loffset = *offset; | ||
28 | int block; | ||
29 | uint8_t *ad; | ||
30 | int remainder; | ||
31 | |||
32 | *error = 0; | ||
33 | |||
34 | ad = (uint8_t *)(*bh)->b_data + *offset; | ||
35 | *offset += ad_size; | ||
36 | |||
37 | if (!ad) { | ||
38 | brelse(*bh); | ||
39 | *error = 1; | ||
40 | return NULL; | ||
41 | } | ||
42 | |||
43 | if (*offset == dir->i_sb->s_blocksize) { | ||
44 | brelse(*bh); | ||
45 | block = udf_get_lb_pblock(dir->i_sb, fe_loc, ++*pos); | ||
46 | if (!block) | ||
47 | return NULL; | ||
48 | *bh = udf_tread(dir->i_sb, block); | ||
49 | if (!*bh) | ||
50 | return NULL; | ||
51 | } else if (*offset > dir->i_sb->s_blocksize) { | ||
52 | ad = tmpad; | ||
53 | |||
54 | remainder = dir->i_sb->s_blocksize - loffset; | ||
55 | memcpy((uint8_t *)ad, (*bh)->b_data + loffset, remainder); | ||
56 | |||
57 | brelse(*bh); | ||
58 | block = udf_get_lb_pblock(dir->i_sb, fe_loc, ++*pos); | ||
59 | if (!block) | ||
60 | return NULL; | ||
61 | (*bh) = udf_tread(dir->i_sb, block); | ||
62 | if (!*bh) | ||
63 | return NULL; | ||
64 | |||
65 | memcpy((uint8_t *)ad + remainder, (*bh)->b_data, | ||
66 | ad_size - remainder); | ||
67 | *offset = ad_size - remainder; | ||
68 | } | ||
69 | |||
70 | return ad; | ||
71 | } | ||
72 | #endif | ||
73 | |||
74 | struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos, | 21 | struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos, |
75 | struct udf_fileident_bh *fibh, | 22 | struct udf_fileident_bh *fibh, |
76 | struct fileIdentDesc *cfi, | 23 | struct fileIdentDesc *cfi, |
@@ -248,39 +195,6 @@ struct fileIdentDesc *udf_get_fileident(void *buffer, int bufsize, int *offset) | |||
248 | return fi; | 195 | return fi; |
249 | } | 196 | } |
250 | 197 | ||
251 | #if 0 | ||
252 | static struct extent_ad *udf_get_fileextent(void *buffer, int bufsize, int *offset) | ||
253 | { | ||
254 | struct extent_ad *ext; | ||
255 | struct fileEntry *fe; | ||
256 | uint8_t *ptr; | ||
257 | |||
258 | if ((!buffer) || (!offset)) { | ||
259 | printk(KERN_ERR "udf: udf_get_fileextent() invalidparms\n"); | ||
260 | return NULL; | ||
261 | } | ||
262 | |||
263 | fe = (struct fileEntry *)buffer; | ||
264 | |||
265 | if (fe->descTag.tagIdent != cpu_to_le16(TAG_IDENT_FE)) { | ||
266 | udf_debug("0x%x != TAG_IDENT_FE\n", | ||
267 | le16_to_cpu(fe->descTag.tagIdent)); | ||
268 | return NULL; | ||
269 | } | ||
270 | |||
271 | ptr = (uint8_t *)(fe->extendedAttr) + | ||
272 | le32_to_cpu(fe->lengthExtendedAttr); | ||
273 | |||
274 | if ((*offset > 0) && (*offset < le32_to_cpu(fe->lengthAllocDescs))) | ||
275 | ptr += *offset; | ||
276 | |||
277 | ext = (struct extent_ad *)ptr; | ||
278 | |||
279 | *offset = *offset + sizeof(struct extent_ad); | ||
280 | return ext; | ||
281 | } | ||
282 | #endif | ||
283 | |||
284 | struct short_ad *udf_get_fileshortad(uint8_t *ptr, int maxoffset, uint32_t *offset, | 198 | struct short_ad *udf_get_fileshortad(uint8_t *ptr, int maxoffset, uint32_t *offset, |
285 | int inc) | 199 | int inc) |
286 | { | 200 | { |
diff --git a/fs/udf/file.c b/fs/udf/file.c index 7464305382b5..b80cbd78833c 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c | |||
@@ -193,9 +193,11 @@ int udf_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, | |||
193 | static int udf_release_file(struct inode *inode, struct file *filp) | 193 | static int udf_release_file(struct inode *inode, struct file *filp) |
194 | { | 194 | { |
195 | if (filp->f_mode & FMODE_WRITE) { | 195 | if (filp->f_mode & FMODE_WRITE) { |
196 | mutex_lock(&inode->i_mutex); | ||
196 | lock_kernel(); | 197 | lock_kernel(); |
197 | udf_discard_prealloc(inode); | 198 | udf_discard_prealloc(inode); |
198 | unlock_kernel(); | 199 | unlock_kernel(); |
200 | mutex_unlock(&inode->i_mutex); | ||
199 | } | 201 | } |
200 | return 0; | 202 | return 0; |
201 | } | 203 | } |
diff --git a/fs/udf/inode.c b/fs/udf/inode.c index e7533f785636..6d24c2c63f93 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c | |||
@@ -90,19 +90,16 @@ no_delete: | |||
90 | } | 90 | } |
91 | 91 | ||
92 | /* | 92 | /* |
93 | * If we are going to release inode from memory, we discard preallocation and | 93 | * If we are going to release inode from memory, we truncate last inode extent |
94 | * truncate last inode extent to proper length. We could use drop_inode() but | 94 | * to proper length. We could use drop_inode() but it's called under inode_lock |
95 | * it's called under inode_lock and thus we cannot mark inode dirty there. We | 95 | * and thus we cannot mark inode dirty there. We use clear_inode() but we have |
96 | * use clear_inode() but we have to make sure to write inode as it's not written | 96 | * to make sure to write inode as it's not written automatically. |
97 | * automatically. | ||
98 | */ | 97 | */ |
99 | void udf_clear_inode(struct inode *inode) | 98 | void udf_clear_inode(struct inode *inode) |
100 | { | 99 | { |
101 | struct udf_inode_info *iinfo; | 100 | struct udf_inode_info *iinfo; |
102 | if (!(inode->i_sb->s_flags & MS_RDONLY)) { | 101 | if (!(inode->i_sb->s_flags & MS_RDONLY)) { |
103 | lock_kernel(); | 102 | lock_kernel(); |
104 | /* Discard preallocation for directories, symlinks, etc. */ | ||
105 | udf_discard_prealloc(inode); | ||
106 | udf_truncate_tail_extent(inode); | 103 | udf_truncate_tail_extent(inode); |
107 | unlock_kernel(); | 104 | unlock_kernel(); |
108 | write_inode_now(inode, 0); | 105 | write_inode_now(inode, 0); |
@@ -664,8 +661,12 @@ static struct buffer_head *inode_getblk(struct inode *inode, sector_t block, | |||
664 | udf_split_extents(inode, &c, offset, newblocknum, laarr, &endnum); | 661 | udf_split_extents(inode, &c, offset, newblocknum, laarr, &endnum); |
665 | 662 | ||
666 | #ifdef UDF_PREALLOCATE | 663 | #ifdef UDF_PREALLOCATE |
667 | /* preallocate blocks */ | 664 | /* We preallocate blocks only for regular files. It also makes sense |
668 | udf_prealloc_extents(inode, c, lastblock, laarr, &endnum); | 665 | * for directories but there's a problem when to drop the |
666 | * preallocation. We might use some delayed work for that but I feel | ||
667 | * it's overengineering for a filesystem like UDF. */ | ||
668 | if (S_ISREG(inode->i_mode)) | ||
669 | udf_prealloc_extents(inode, c, lastblock, laarr, &endnum); | ||
669 | #endif | 670 | #endif |
670 | 671 | ||
671 | /* merge any continuous blocks in laarr */ | 672 | /* merge any continuous blocks in laarr */ |
diff --git a/fs/udf/lowlevel.c b/fs/udf/lowlevel.c index 1b88fd5df05d..43e24a3b8e10 100644 --- a/fs/udf/lowlevel.c +++ b/fs/udf/lowlevel.c | |||
@@ -36,14 +36,10 @@ unsigned int udf_get_last_session(struct super_block *sb) | |||
36 | ms_info.addr_format = CDROM_LBA; | 36 | ms_info.addr_format = CDROM_LBA; |
37 | i = ioctl_by_bdev(bdev, CDROMMULTISESSION, (unsigned long)&ms_info); | 37 | i = ioctl_by_bdev(bdev, CDROMMULTISESSION, (unsigned long)&ms_info); |
38 | 38 | ||
39 | #define WE_OBEY_THE_WRITTEN_STANDARDS 1 | ||
40 | |||
41 | if (i == 0) { | 39 | if (i == 0) { |
42 | udf_debug("XA disk: %s, vol_desc_start=%d\n", | 40 | udf_debug("XA disk: %s, vol_desc_start=%d\n", |
43 | (ms_info.xa_flag ? "yes" : "no"), ms_info.addr.lba); | 41 | (ms_info.xa_flag ? "yes" : "no"), ms_info.addr.lba); |
44 | #if WE_OBEY_THE_WRITTEN_STANDARDS | ||
45 | if (ms_info.xa_flag) /* necessary for a valid ms_info.addr */ | 42 | if (ms_info.xa_flag) /* necessary for a valid ms_info.addr */ |
46 | #endif | ||
47 | vol_desc_start = ms_info.addr.lba; | 43 | vol_desc_start = ms_info.addr.lba; |
48 | } else { | 44 | } else { |
49 | udf_debug("CDROMMULTISESSION not supported: rc=%d\n", i); | 45 | udf_debug("CDROMMULTISESSION not supported: rc=%d\n", i); |
diff --git a/fs/udf/namei.c b/fs/udf/namei.c index 6a29fa34c478..21dad8c608f9 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c | |||
@@ -943,7 +943,6 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry, | |||
943 | pc->componentType = 1; | 943 | pc->componentType = 1; |
944 | pc->lengthComponentIdent = 0; | 944 | pc->lengthComponentIdent = 0; |
945 | pc->componentFileVersionNum = 0; | 945 | pc->componentFileVersionNum = 0; |
946 | pc += sizeof(struct pathComponent); | ||
947 | elen += sizeof(struct pathComponent); | 946 | elen += sizeof(struct pathComponent); |
948 | } | 947 | } |
949 | 948 | ||
diff --git a/fs/udf/super.c b/fs/udf/super.c index 6832135159b6..9d1b8c2e6c45 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c | |||
@@ -1087,11 +1087,23 @@ static int udf_load_vat(struct super_block *sb, int p_index, int type1_index) | |||
1087 | struct udf_inode_info *vati; | 1087 | struct udf_inode_info *vati; |
1088 | uint32_t pos; | 1088 | uint32_t pos; |
1089 | struct virtualAllocationTable20 *vat20; | 1089 | struct virtualAllocationTable20 *vat20; |
1090 | sector_t blocks = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits; | ||
1090 | 1091 | ||
1091 | /* VAT file entry is in the last recorded block */ | 1092 | /* VAT file entry is in the last recorded block */ |
1092 | ino.partitionReferenceNum = type1_index; | 1093 | ino.partitionReferenceNum = type1_index; |
1093 | ino.logicalBlockNum = sbi->s_last_block - map->s_partition_root; | 1094 | ino.logicalBlockNum = sbi->s_last_block - map->s_partition_root; |
1094 | sbi->s_vat_inode = udf_iget(sb, &ino); | 1095 | sbi->s_vat_inode = udf_iget(sb, &ino); |
1096 | if (!sbi->s_vat_inode && | ||
1097 | sbi->s_last_block != blocks - 1) { | ||
1098 | printk(KERN_NOTICE "UDF-fs: Failed to read VAT inode from the" | ||
1099 | " last recorded block (%lu), retrying with the last " | ||
1100 | "block of the device (%lu).\n", | ||
1101 | (unsigned long)sbi->s_last_block, | ||
1102 | (unsigned long)blocks - 1); | ||
1103 | ino.partitionReferenceNum = type1_index; | ||
1104 | ino.logicalBlockNum = blocks - 1 - map->s_partition_root; | ||
1105 | sbi->s_vat_inode = udf_iget(sb, &ino); | ||
1106 | } | ||
1095 | if (!sbi->s_vat_inode) | 1107 | if (!sbi->s_vat_inode) |
1096 | return 1; | 1108 | return 1; |
1097 | 1109 | ||
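A condensed sketch of the fallback added above, assuming udf_iget() and the kernel_lb_addr layout behave as in the surrounding code: the block count of the medium is derived from the block device inode's size, and the VAT read is retried at the physically last block when the drive-reported last recorded block yields nothing.

static struct inode *example_load_vat(struct super_block *sb,
				      struct kernel_lb_addr *ino,
				      sector_t last_recorded,
				      uint32_t partition_root)
{
	/* total blocks on the medium, from the bdev inode's size */
	sector_t blocks = sb->s_bdev->bd_inode->i_size >>
			  sb->s_blocksize_bits;
	struct inode *vat;

	/* first try: the last recorded block, as reported by the drive */
	ino->logicalBlockNum = last_recorded - partition_root;
	vat = udf_iget(sb, ino);
	if (!vat && last_recorded != blocks - 1) {
		/* some drives misreport the session end; retry at the
		 * last physical block of the device */
		ino->logicalBlockNum = blocks - 1 - partition_root;
		vat = udf_iget(sb, ino);
	}
	return vat;
}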
diff --git a/fs/xattr.c b/fs/xattr.c index 1c3d0af59ddf..6d4f6d3449fb 100644 --- a/fs/xattr.c +++ b/fs/xattr.c | |||
@@ -66,22 +66,28 @@ xattr_permission(struct inode *inode, const char *name, int mask) | |||
66 | return inode_permission(inode, mask); | 66 | return inode_permission(inode, mask); |
67 | } | 67 | } |
68 | 68 | ||
69 | int | 69 | /** |
70 | vfs_setxattr(struct dentry *dentry, const char *name, const void *value, | 70 | * __vfs_setxattr_noperm - perform setxattr operation without performing |
71 | size_t size, int flags) | 71 | * permission checks. |
72 | * | ||
73 | * @dentry - object to perform setxattr on | ||
74 | * @name - xattr name to set | ||
75 | * @value - value to set @name to | ||
76 | * @size - size of @value | ||
77 | * @flags - flags to pass into filesystem operations | ||
78 | * | ||
79 | * returns the result of the internal setxattr or setsecurity operations. | ||
80 | * | ||
81 | * This function requires the caller to lock the inode's i_mutex before it | ||
82 | * is executed. It also assumes that the caller will make the appropriate | ||
83 | * permission checks. | ||
84 | */ | ||
85 | int __vfs_setxattr_noperm(struct dentry *dentry, const char *name, | ||
86 | const void *value, size_t size, int flags) | ||
72 | { | 87 | { |
73 | struct inode *inode = dentry->d_inode; | 88 | struct inode *inode = dentry->d_inode; |
74 | int error; | 89 | int error = -EOPNOTSUPP; |
75 | |||
76 | error = xattr_permission(inode, name, MAY_WRITE); | ||
77 | if (error) | ||
78 | return error; | ||
79 | 90 | ||
80 | mutex_lock(&inode->i_mutex); | ||
81 | error = security_inode_setxattr(dentry, name, value, size, flags); | ||
82 | if (error) | ||
83 | goto out; | ||
84 | error = -EOPNOTSUPP; | ||
85 | if (inode->i_op->setxattr) { | 91 | if (inode->i_op->setxattr) { |
86 | error = inode->i_op->setxattr(dentry, name, value, size, flags); | 92 | error = inode->i_op->setxattr(dentry, name, value, size, flags); |
87 | if (!error) { | 93 | if (!error) { |
@@ -97,6 +103,29 @@ vfs_setxattr(struct dentry *dentry, const char *name, const void *value, | |||
97 | if (!error) | 103 | if (!error) |
98 | fsnotify_xattr(dentry); | 104 | fsnotify_xattr(dentry); |
99 | } | 105 | } |
106 | |||
107 | return error; | ||
108 | } | ||
109 | |||
110 | |||
111 | int | ||
112 | vfs_setxattr(struct dentry *dentry, const char *name, const void *value, | ||
113 | size_t size, int flags) | ||
114 | { | ||
115 | struct inode *inode = dentry->d_inode; | ||
116 | int error; | ||
117 | |||
118 | error = xattr_permission(inode, name, MAY_WRITE); | ||
119 | if (error) | ||
120 | return error; | ||
121 | |||
122 | mutex_lock(&inode->i_mutex); | ||
123 | error = security_inode_setxattr(dentry, name, value, size, flags); | ||
124 | if (error) | ||
125 | goto out; | ||
126 | |||
127 | error = __vfs_setxattr_noperm(dentry, name, value, size, flags); | ||
128 | |||
100 | out: | 129 | out: |
101 | mutex_unlock(&inode->i_mutex); | 130 | mutex_unlock(&inode->i_mutex); |
102 | return error; | 131 | return error; |
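The kernel-doc above spells out the contract for the new helper: the caller must hold i_mutex and must have performed its own permission checks. A minimal sketch of such a caller, with the "security.example" attribute name purely illustrative (the intended users are security modules setting their own xattrs):

static int example_set_security_xattr(struct dentry *dentry,
				      const void *ctx, size_t ctxlen)
{
	struct inode *inode = dentry->d_inode;
	int error;

	/* contract: take i_mutex ourselves ... */
	mutex_lock(&inode->i_mutex);
	/* ... and skip xattr_permission()/security_inode_setxattr(),
	 * which this caller is assumed to have authorised already */
	error = __vfs_setxattr_noperm(dentry, "security.example",
				      ctx, ctxlen, 0);
	mutex_unlock(&inode->i_mutex);
	return error;
}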
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index 7ec89fc05b2b..d5e5559e31db 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c | |||
@@ -216,7 +216,6 @@ xfs_setfilesize( | |||
216 | if (ip->i_d.di_size < isize) { | 216 | if (ip->i_d.di_size < isize) { |
217 | ip->i_d.di_size = isize; | 217 | ip->i_d.di_size = isize; |
218 | ip->i_update_core = 1; | 218 | ip->i_update_core = 1; |
219 | ip->i_update_size = 1; | ||
220 | xfs_mark_inode_dirty_sync(ip); | 219 | xfs_mark_inode_dirty_sync(ip); |
221 | } | 220 | } |
222 | 221 | ||
@@ -1268,6 +1267,14 @@ xfs_vm_writepage( | |||
1268 | if (!page_has_buffers(page)) | 1267 | if (!page_has_buffers(page)) |
1269 | create_empty_buffers(page, 1 << inode->i_blkbits, 0); | 1268 | create_empty_buffers(page, 1 << inode->i_blkbits, 0); |
1270 | 1269 | ||
1270 | |||
1271 | /* | ||
1272 | * VM calculation for nr_to_write seems off. Bump it way | ||
1273 | * up, this gets simple streaming writes zippy again. | ||
1274 | * To be reviewed again after Jens' writeback changes. | ||
1275 | */ | ||
1276 | wbc->nr_to_write *= 4; | ||
1277 | |||
1271 | /* | 1278 | /* |
1272 | * Convert delayed allocate, unwritten or unmapped space | 1279 | * Convert delayed allocate, unwritten or unmapped space |
1273 | * to real space and flush out to disk. | 1280 | * to real space and flush out to disk. |
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 0c93c7ef3d18..965df1227d64 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c | |||
@@ -770,7 +770,7 @@ xfs_buf_associate_memory( | |||
770 | bp->b_pages = NULL; | 770 | bp->b_pages = NULL; |
771 | bp->b_addr = mem; | 771 | bp->b_addr = mem; |
772 | 772 | ||
773 | rval = _xfs_buf_get_pages(bp, page_count, 0); | 773 | rval = _xfs_buf_get_pages(bp, page_count, XBF_DONT_BLOCK); |
774 | if (rval) | 774 | if (rval) |
775 | return rval; | 775 | return rval; |
776 | 776 | ||
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index 0542fd507649..988d8f87bc0f 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c | |||
@@ -172,12 +172,21 @@ xfs_file_release( | |||
172 | */ | 172 | */ |
173 | STATIC int | 173 | STATIC int |
174 | xfs_file_fsync( | 174 | xfs_file_fsync( |
175 | struct file *filp, | 175 | struct file *file, |
176 | struct dentry *dentry, | 176 | struct dentry *dentry, |
177 | int datasync) | 177 | int datasync) |
178 | { | 178 | { |
179 | xfs_iflags_clear(XFS_I(dentry->d_inode), XFS_ITRUNCATED); | 179 | struct inode *inode = dentry->d_inode; |
180 | return -xfs_fsync(XFS_I(dentry->d_inode)); | 180 | struct xfs_inode *ip = XFS_I(inode); |
181 | int error; | ||
182 | |||
183 | /* capture size updates in I/O completion before writing the inode. */ | ||
184 | error = filemap_fdatawait(inode->i_mapping); | ||
185 | if (error) | ||
186 | return error; | ||
187 | |||
188 | xfs_iflags_clear(ip, XFS_ITRUNCATED); | ||
189 | return -xfs_fsync(ip); | ||
181 | } | 190 | } |
182 | 191 | ||
183 | STATIC int | 192 | STATIC int |
diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c index 0882d166239a..eafcc7c18706 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl32.c +++ b/fs/xfs/linux-2.6/xfs_ioctl32.c | |||
@@ -619,7 +619,7 @@ xfs_file_compat_ioctl( | |||
619 | case XFS_IOC_GETVERSION_32: | 619 | case XFS_IOC_GETVERSION_32: |
620 | cmd = _NATIVE_IOC(cmd, long); | 620 | cmd = _NATIVE_IOC(cmd, long); |
621 | return xfs_file_ioctl(filp, cmd, p); | 621 | return xfs_file_ioctl(filp, cmd, p); |
622 | case XFS_IOC_SWAPEXT: { | 622 | case XFS_IOC_SWAPEXT_32: { |
623 | struct xfs_swapext sxp; | 623 | struct xfs_swapext sxp; |
624 | struct compat_xfs_swapext __user *sxu = arg; | 624 | struct compat_xfs_swapext __user *sxu = arg; |
625 | 625 | ||
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index 58973bb46038..da0159d99f82 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c | |||
@@ -43,7 +43,6 @@ | |||
43 | #include "xfs_error.h" | 43 | #include "xfs_error.h" |
44 | #include "xfs_itable.h" | 44 | #include "xfs_itable.h" |
45 | #include "xfs_rw.h" | 45 | #include "xfs_rw.h" |
46 | #include "xfs_acl.h" | ||
47 | #include "xfs_attr.h" | 46 | #include "xfs_attr.h" |
48 | #include "xfs_buf_item.h" | 47 | #include "xfs_buf_item.h" |
49 | #include "xfs_utils.h" | 48 | #include "xfs_utils.h" |
@@ -485,14 +484,6 @@ xfs_vn_put_link( | |||
485 | } | 484 | } |
486 | 485 | ||
487 | STATIC int | 486 | STATIC int |
488 | xfs_vn_permission( | ||
489 | struct inode *inode, | ||
490 | int mask) | ||
491 | { | ||
492 | return generic_permission(inode, mask, xfs_check_acl); | ||
493 | } | ||
494 | |||
495 | STATIC int | ||
496 | xfs_vn_getattr( | 487 | xfs_vn_getattr( |
497 | struct vfsmount *mnt, | 488 | struct vfsmount *mnt, |
498 | struct dentry *dentry, | 489 | struct dentry *dentry, |
@@ -680,8 +671,8 @@ xfs_vn_fiemap( | |||
680 | else | 671 | else |
681 | bm.bmv_length = BTOBB(length); | 672 | bm.bmv_length = BTOBB(length); |
682 | 673 | ||
683 | /* our formatter will tell xfs_getbmap when to stop. */ | 674 | /* We add one because in getbmap world count includes the header */ |
684 | bm.bmv_count = MAXEXTNUM; | 675 | bm.bmv_count = fieinfo->fi_extents_max + 1; |
685 | bm.bmv_iflags = BMV_IF_PREALLOC; | 676 | bm.bmv_iflags = BMV_IF_PREALLOC; |
686 | if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) | 677 | if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) |
687 | bm.bmv_iflags |= BMV_IF_ATTRFORK; | 678 | bm.bmv_iflags |= BMV_IF_ATTRFORK; |
@@ -696,7 +687,7 @@ xfs_vn_fiemap( | |||
696 | } | 687 | } |
697 | 688 | ||
698 | static const struct inode_operations xfs_inode_operations = { | 689 | static const struct inode_operations xfs_inode_operations = { |
699 | .permission = xfs_vn_permission, | 690 | .check_acl = xfs_check_acl, |
700 | .truncate = xfs_vn_truncate, | 691 | .truncate = xfs_vn_truncate, |
701 | .getattr = xfs_vn_getattr, | 692 | .getattr = xfs_vn_getattr, |
702 | .setattr = xfs_vn_setattr, | 693 | .setattr = xfs_vn_setattr, |
@@ -724,7 +715,7 @@ static const struct inode_operations xfs_dir_inode_operations = { | |||
724 | .rmdir = xfs_vn_unlink, | 715 | .rmdir = xfs_vn_unlink, |
725 | .mknod = xfs_vn_mknod, | 716 | .mknod = xfs_vn_mknod, |
726 | .rename = xfs_vn_rename, | 717 | .rename = xfs_vn_rename, |
727 | .permission = xfs_vn_permission, | 718 | .check_acl = xfs_check_acl, |
728 | .getattr = xfs_vn_getattr, | 719 | .getattr = xfs_vn_getattr, |
729 | .setattr = xfs_vn_setattr, | 720 | .setattr = xfs_vn_setattr, |
730 | .setxattr = generic_setxattr, | 721 | .setxattr = generic_setxattr, |
@@ -749,7 +740,7 @@ static const struct inode_operations xfs_dir_ci_inode_operations = { | |||
749 | .rmdir = xfs_vn_unlink, | 740 | .rmdir = xfs_vn_unlink, |
750 | .mknod = xfs_vn_mknod, | 741 | .mknod = xfs_vn_mknod, |
751 | .rename = xfs_vn_rename, | 742 | .rename = xfs_vn_rename, |
752 | .permission = xfs_vn_permission, | 743 | .check_acl = xfs_check_acl, |
753 | .getattr = xfs_vn_getattr, | 744 | .getattr = xfs_vn_getattr, |
754 | .setattr = xfs_vn_setattr, | 745 | .setattr = xfs_vn_setattr, |
755 | .setxattr = generic_setxattr, | 746 | .setxattr = generic_setxattr, |
@@ -762,7 +753,7 @@ static const struct inode_operations xfs_symlink_inode_operations = { | |||
762 | .readlink = generic_readlink, | 753 | .readlink = generic_readlink, |
763 | .follow_link = xfs_vn_follow_link, | 754 | .follow_link = xfs_vn_follow_link, |
764 | .put_link = xfs_vn_put_link, | 755 | .put_link = xfs_vn_put_link, |
765 | .permission = xfs_vn_permission, | 756 | .check_acl = xfs_check_acl, |
766 | .getattr = xfs_vn_getattr, | 757 | .getattr = xfs_vn_getattr, |
767 | .setattr = xfs_vn_setattr, | 758 | .setattr = xfs_vn_setattr, |
768 | .setxattr = generic_setxattr, | 759 | .setxattr = generic_setxattr, |
diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c index 7078974a6eee..49e4a6aea73c 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.c +++ b/fs/xfs/linux-2.6/xfs_lrw.c | |||
@@ -812,18 +812,21 @@ write_retry: | |||
812 | 812 | ||
813 | /* Handle various SYNC-type writes */ | 813 | /* Handle various SYNC-type writes */ |
814 | if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) { | 814 | if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) { |
815 | loff_t end = pos + ret - 1; | ||
815 | int error2; | 816 | int error2; |
816 | 817 | ||
817 | xfs_iunlock(xip, iolock); | 818 | xfs_iunlock(xip, iolock); |
818 | if (need_i_mutex) | 819 | if (need_i_mutex) |
819 | mutex_unlock(&inode->i_mutex); | 820 | mutex_unlock(&inode->i_mutex); |
820 | error2 = sync_page_range(inode, mapping, pos, ret); | 821 | |
822 | error2 = filemap_write_and_wait_range(mapping, pos, end); | ||
821 | if (!error) | 823 | if (!error) |
822 | error = error2; | 824 | error = error2; |
823 | if (need_i_mutex) | 825 | if (need_i_mutex) |
824 | mutex_lock(&inode->i_mutex); | 826 | mutex_lock(&inode->i_mutex); |
825 | xfs_ilock(xip, iolock); | 827 | xfs_ilock(xip, iolock); |
826 | error2 = xfs_write_sync_logforce(mp, xip); | 828 | |
829 | error2 = xfs_fsync(xip); | ||
827 | if (!error) | 830 | if (!error) |
828 | error = error2; | 831 | error = error2; |
829 | } | 832 | } |
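The O_SYNC path above swaps sync_page_range() for a range-limited flush plus an explicit xfs_fsync(). The generic half of that pattern, sketched under the assumption of a write of ret bytes at pos:

static ssize_t example_sync_written_range(struct address_space *mapping,
					  loff_t pos, ssize_t ret)
{
	int error;

	if (ret <= 0)
		return ret;

	/* write back and wait on exactly the bytes just written,
	 * rather than the whole file */
	error = filemap_write_and_wait_range(mapping, pos, pos + ret - 1);
	return error ? error : ret;
}

The filesystem-specific half (forcing the log so size updates captured at I/O completion reach disk) is what the xfs_fsync() call supplies.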
diff --git a/fs/xfs/linux-2.6/xfs_stats.c b/fs/xfs/linux-2.6/xfs_stats.c index c3526d445f6a..76fdc5861932 100644 --- a/fs/xfs/linux-2.6/xfs_stats.c +++ b/fs/xfs/linux-2.6/xfs_stats.c | |||
@@ -20,16 +20,9 @@ | |||
20 | 20 | ||
21 | DEFINE_PER_CPU(struct xfsstats, xfsstats); | 21 | DEFINE_PER_CPU(struct xfsstats, xfsstats); |
22 | 22 | ||
23 | STATIC int | 23 | static int xfs_stat_proc_show(struct seq_file *m, void *v) |
24 | xfs_read_xfsstats( | ||
25 | char *buffer, | ||
26 | char **start, | ||
27 | off_t offset, | ||
28 | int count, | ||
29 | int *eof, | ||
30 | void *data) | ||
31 | { | 24 | { |
32 | int c, i, j, len, val; | 25 | int c, i, j, val; |
33 | __uint64_t xs_xstrat_bytes = 0; | 26 | __uint64_t xs_xstrat_bytes = 0; |
34 | __uint64_t xs_write_bytes = 0; | 27 | __uint64_t xs_write_bytes = 0; |
35 | __uint64_t xs_read_bytes = 0; | 28 | __uint64_t xs_read_bytes = 0; |
@@ -60,18 +53,18 @@ xfs_read_xfsstats( | |||
60 | }; | 53 | }; |
61 | 54 | ||
62 | /* Loop over all stats groups */ | 55 | /* Loop over all stats groups */ |
63 | for (i=j=len = 0; i < ARRAY_SIZE(xstats); i++) { | 56 | for (i=j = 0; i < ARRAY_SIZE(xstats); i++) { |
64 | len += sprintf(buffer + len, "%s", xstats[i].desc); | 57 | seq_printf(m, "%s", xstats[i].desc); |
65 | /* inner loop does each group */ | 58 | /* inner loop does each group */ |
66 | while (j < xstats[i].endpoint) { | 59 | while (j < xstats[i].endpoint) { |
67 | val = 0; | 60 | val = 0; |
68 | /* sum over all cpus */ | 61 | /* sum over all cpus */ |
69 | for_each_possible_cpu(c) | 62 | for_each_possible_cpu(c) |
70 | val += *(((__u32*)&per_cpu(xfsstats, c) + j)); | 63 | val += *(((__u32*)&per_cpu(xfsstats, c) + j)); |
71 | len += sprintf(buffer + len, " %u", val); | 64 | seq_printf(m, " %u", val); |
72 | j++; | 65 | j++; |
73 | } | 66 | } |
74 | buffer[len++] = '\n'; | 67 | seq_putc(m, '\n'); |
75 | } | 68 | } |
76 | /* extra precision counters */ | 69 | /* extra precision counters */ |
77 | for_each_possible_cpu(i) { | 70 | for_each_possible_cpu(i) { |
@@ -80,36 +73,38 @@ xfs_read_xfsstats( | |||
80 | xs_read_bytes += per_cpu(xfsstats, i).xs_read_bytes; | 73 | xs_read_bytes += per_cpu(xfsstats, i).xs_read_bytes; |
81 | } | 74 | } |
82 | 75 | ||
83 | len += sprintf(buffer + len, "xpc %Lu %Lu %Lu\n", | 76 | seq_printf(m, "xpc %Lu %Lu %Lu\n", |
84 | xs_xstrat_bytes, xs_write_bytes, xs_read_bytes); | 77 | xs_xstrat_bytes, xs_write_bytes, xs_read_bytes); |
85 | len += sprintf(buffer + len, "debug %u\n", | 78 | seq_printf(m, "debug %u\n", |
86 | #if defined(DEBUG) | 79 | #if defined(DEBUG) |
87 | 1); | 80 | 1); |
88 | #else | 81 | #else |
89 | 0); | 82 | 0); |
90 | #endif | 83 | #endif |
84 | return 0; | ||
85 | } | ||
91 | 86 | ||
92 | if (offset >= len) { | 87 | static int xfs_stat_proc_open(struct inode *inode, struct file *file) |
93 | *start = buffer; | 88 | { |
94 | *eof = 1; | 89 | return single_open(file, xfs_stat_proc_show, NULL); |
95 | return 0; | ||
96 | } | ||
97 | *start = buffer + offset; | ||
98 | if ((len -= offset) > count) | ||
99 | return count; | ||
100 | *eof = 1; | ||
101 | |||
102 | return len; | ||
103 | } | 90 | } |
104 | 91 | ||
92 | static const struct file_operations xfs_stat_proc_fops = { | ||
93 | .owner = THIS_MODULE, | ||
94 | .open = xfs_stat_proc_open, | ||
95 | .read = seq_read, | ||
96 | .llseek = seq_lseek, | ||
97 | .release = single_release, | ||
98 | }; | ||
99 | |||
105 | int | 100 | int |
106 | xfs_init_procfs(void) | 101 | xfs_init_procfs(void) |
107 | { | 102 | { |
108 | if (!proc_mkdir("fs/xfs", NULL)) | 103 | if (!proc_mkdir("fs/xfs", NULL)) |
109 | goto out; | 104 | goto out; |
110 | 105 | ||
111 | if (!create_proc_read_entry("fs/xfs/stat", 0, NULL, | 106 | if (!proc_create("fs/xfs/stat", 0, NULL, |
112 | xfs_read_xfsstats, NULL)) | 107 | &xfs_stat_proc_fops)) |
113 | goto out_remove_entry; | 108 | goto out_remove_entry; |
114 | return 0; | 109 | return 0; |
115 | 110 | ||
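The conversion above is the stock read_proc-to-seq_file recipe: a show() callback prints into the seq_file, single_open() binds it, and proc_create() replaces create_proc_read_entry(), eliminating the error-prone offset/count/eof bookkeeping. The recipe in isolation, with all example_* names illustrative:

#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

static int example_proc_show(struct seq_file *m, void *v)
{
	seq_printf(m, "stat %u\n", 42);	/* emit the whole file at once */
	return 0;
}

static int example_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, example_proc_show, NULL);
}

static const struct file_operations example_proc_fops = {
	.owner	 = THIS_MODULE,
	.open	 = example_proc_open,
	.read	 = seq_read,	/* seq_file handles offsets and partial reads */
	.llseek	 = seq_lseek,
	.release = single_release,
};

/* registration, e.g. from an init function:
 *	proc_create("fs/example/stat", 0, NULL, &example_proc_fops);
 */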
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index a220d36f789b..5d7c60ac77b4 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c | |||
@@ -579,15 +579,19 @@ xfs_showargs( | |||
579 | else if (mp->m_qflags & XFS_UQUOTA_ACCT) | 579 | else if (mp->m_qflags & XFS_UQUOTA_ACCT) |
580 | seq_puts(m, "," MNTOPT_UQUOTANOENF); | 580 | seq_puts(m, "," MNTOPT_UQUOTANOENF); |
581 | 581 | ||
582 | if (mp->m_qflags & (XFS_PQUOTA_ACCT|XFS_OQUOTA_ENFD)) | 582 | /* Either project or group quotas can be active, not both */ |
583 | seq_puts(m, "," MNTOPT_PRJQUOTA); | 583 | |
584 | else if (mp->m_qflags & XFS_PQUOTA_ACCT) | 584 | if (mp->m_qflags & XFS_PQUOTA_ACCT) { |
585 | seq_puts(m, "," MNTOPT_PQUOTANOENF); | 585 | if (mp->m_qflags & XFS_OQUOTA_ENFD) |
586 | 586 | seq_puts(m, "," MNTOPT_PRJQUOTA); | |
587 | if (mp->m_qflags & (XFS_GQUOTA_ACCT|XFS_OQUOTA_ENFD)) | 587 | else |
588 | seq_puts(m, "," MNTOPT_GRPQUOTA); | 588 | seq_puts(m, "," MNTOPT_PQUOTANOENF); |
589 | else if (mp->m_qflags & XFS_GQUOTA_ACCT) | 589 | } else if (mp->m_qflags & XFS_GQUOTA_ACCT) { |
590 | seq_puts(m, "," MNTOPT_GQUOTANOENF); | 590 | if (mp->m_qflags & XFS_OQUOTA_ENFD) |
591 | seq_puts(m, "," MNTOPT_GRPQUOTA); | ||
592 | else | ||
593 | seq_puts(m, "," MNTOPT_GQUOTANOENF); | ||
594 | } | ||
591 | 595 | ||
592 | if (!(mp->m_qflags & XFS_ALL_QUOTA_ACCT)) | 596 | if (!(mp->m_qflags & XFS_ALL_QUOTA_ACCT)) |
593 | seq_puts(m, "," MNTOPT_NOQUOTA); | 597 | seq_puts(m, "," MNTOPT_NOQUOTA); |
@@ -687,7 +691,7 @@ xfs_barrier_test( | |||
687 | return error; | 691 | return error; |
688 | } | 692 | } |
689 | 693 | ||
690 | void | 694 | STATIC void |
691 | xfs_mountfs_check_barriers(xfs_mount_t *mp) | 695 | xfs_mountfs_check_barriers(xfs_mount_t *mp) |
692 | { | 696 | { |
693 | int error; | 697 | int error; |
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index b619d6b8ca43..320be6aea492 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c | |||
@@ -708,6 +708,16 @@ xfs_reclaim_inode( | |||
708 | return 0; | 708 | return 0; |
709 | } | 709 | } |
710 | 710 | ||
711 | void | ||
712 | __xfs_inode_set_reclaim_tag( | ||
713 | struct xfs_perag *pag, | ||
714 | struct xfs_inode *ip) | ||
715 | { | ||
716 | radix_tree_tag_set(&pag->pag_ici_root, | ||
717 | XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), | ||
718 | XFS_ICI_RECLAIM_TAG); | ||
719 | } | ||
720 | |||
711 | /* | 721 | /* |
712 | * We set the inode flag atomically with the radix tree tag. | 722 | * We set the inode flag atomically with the radix tree tag. |
713 | * Once we get tag lookups on the radix tree, this inode flag | 723 | * Once we get tag lookups on the radix tree, this inode flag |
@@ -722,8 +732,7 @@ xfs_inode_set_reclaim_tag( | |||
722 | 732 | ||
723 | read_lock(&pag->pag_ici_lock); | 733 | read_lock(&pag->pag_ici_lock); |
724 | spin_lock(&ip->i_flags_lock); | 734 | spin_lock(&ip->i_flags_lock); |
725 | radix_tree_tag_set(&pag->pag_ici_root, | 735 | __xfs_inode_set_reclaim_tag(pag, ip); |
726 | XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG); | ||
727 | __xfs_iflags_set(ip, XFS_IRECLAIMABLE); | 736 | __xfs_iflags_set(ip, XFS_IRECLAIMABLE); |
728 | spin_unlock(&ip->i_flags_lock); | 737 | spin_unlock(&ip->i_flags_lock); |
729 | read_unlock(&pag->pag_ici_lock); | 738 | read_unlock(&pag->pag_ici_lock); |
@@ -740,21 +749,6 @@ __xfs_inode_clear_reclaim_tag( | |||
740 | XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG); | 749 | XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG); |
741 | } | 750 | } |
742 | 751 | ||
743 | void | ||
744 | xfs_inode_clear_reclaim_tag( | ||
745 | xfs_inode_t *ip) | ||
746 | { | ||
747 | xfs_mount_t *mp = ip->i_mount; | ||
748 | xfs_perag_t *pag = xfs_get_perag(mp, ip->i_ino); | ||
749 | |||
750 | read_lock(&pag->pag_ici_lock); | ||
751 | spin_lock(&ip->i_flags_lock); | ||
752 | __xfs_inode_clear_reclaim_tag(mp, pag, ip); | ||
753 | spin_unlock(&ip->i_flags_lock); | ||
754 | read_unlock(&pag->pag_ici_lock); | ||
755 | xfs_put_perag(mp, pag); | ||
756 | } | ||
757 | |||
758 | STATIC int | 752 | STATIC int |
759 | xfs_reclaim_inode_now( | 753 | xfs_reclaim_inode_now( |
760 | struct xfs_inode *ip, | 754 | struct xfs_inode *ip, |
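The factoring above leaves the locking in xfs_inode_set_reclaim_tag() and reduces the new helper to the bare radix-tree tag set, so callers that already hold pag_ici_lock (per the header change below) can reuse it. Condensed into a sketch, the locked caller does:

static void example_tag_reclaimable(struct xfs_perag *pag,
				    struct xfs_inode *ip)
{
	/* locking order: pag_ici_lock (read) outside i_flags_lock */
	read_lock(&pag->pag_ici_lock);
	spin_lock(&ip->i_flags_lock);
	__xfs_inode_set_reclaim_tag(pag, ip);	/* radix-tree tag only */
	__xfs_iflags_set(ip, XFS_IRECLAIMABLE);	/* matching inode flag */
	spin_unlock(&ip->i_flags_lock);
	read_unlock(&pag->pag_ici_lock);
}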
diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h index 2a10301c99c7..27920eb7a820 100644 --- a/fs/xfs/linux-2.6/xfs_sync.h +++ b/fs/xfs/linux-2.6/xfs_sync.h | |||
@@ -48,7 +48,7 @@ int xfs_reclaim_inode(struct xfs_inode *ip, int locked, int sync_mode); | |||
48 | int xfs_reclaim_inodes(struct xfs_mount *mp, int mode); | 48 | int xfs_reclaim_inodes(struct xfs_mount *mp, int mode); |
49 | 49 | ||
50 | void xfs_inode_set_reclaim_tag(struct xfs_inode *ip); | 50 | void xfs_inode_set_reclaim_tag(struct xfs_inode *ip); |
51 | void xfs_inode_clear_reclaim_tag(struct xfs_inode *ip); | 51 | void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip); |
52 | void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag, | 52 | void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag, |
53 | struct xfs_inode *ip); | 53 | struct xfs_inode *ip); |
54 | 54 | ||
diff --git a/fs/xfs/quota/xfs_qm_stats.c b/fs/xfs/quota/xfs_qm_stats.c index 21b08c0396a1..83e7ea3e25fa 100644 --- a/fs/xfs/quota/xfs_qm_stats.c +++ b/fs/xfs/quota/xfs_qm_stats.c | |||
@@ -48,50 +48,34 @@ | |||
48 | 48 | ||
49 | struct xqmstats xqmstats; | 49 | struct xqmstats xqmstats; |
50 | 50 | ||
51 | STATIC int | 51 | static int xqm_proc_show(struct seq_file *m, void *v) |
52 | xfs_qm_read_xfsquota( | ||
53 | char *buffer, | ||
54 | char **start, | ||
55 | off_t offset, | ||
56 | int count, | ||
57 | int *eof, | ||
58 | void *data) | ||
59 | { | 52 | { |
60 | int len; | ||
61 | |||
62 | /* maximum; incore; ratio free to inuse; freelist */ | 53 | /* maximum; incore; ratio free to inuse; freelist */ |
63 | len = sprintf(buffer, "%d\t%d\t%d\t%u\n", | 54 | seq_printf(m, "%d\t%d\t%d\t%u\n", |
64 | ndquot, | 55 | ndquot, |
65 | xfs_Gqm? atomic_read(&xfs_Gqm->qm_totaldquots) : 0, | 56 | xfs_Gqm? atomic_read(&xfs_Gqm->qm_totaldquots) : 0, |
66 | xfs_Gqm? xfs_Gqm->qm_dqfree_ratio : 0, | 57 | xfs_Gqm? xfs_Gqm->qm_dqfree_ratio : 0, |
67 | xfs_Gqm? xfs_Gqm->qm_dqfreelist.qh_nelems : 0); | 58 | xfs_Gqm? xfs_Gqm->qm_dqfreelist.qh_nelems : 0); |
68 | 59 | return 0; | |
69 | if (offset >= len) { | ||
70 | *start = buffer; | ||
71 | *eof = 1; | ||
72 | return 0; | ||
73 | } | ||
74 | *start = buffer + offset; | ||
75 | if ((len -= offset) > count) | ||
76 | return count; | ||
77 | *eof = 1; | ||
78 | |||
79 | return len; | ||
80 | } | 60 | } |
81 | 61 | ||
82 | STATIC int | 62 | static int xqm_proc_open(struct inode *inode, struct file *file) |
83 | xfs_qm_read_stats( | ||
84 | char *buffer, | ||
85 | char **start, | ||
86 | off_t offset, | ||
87 | int count, | ||
88 | int *eof, | ||
89 | void *data) | ||
90 | { | 63 | { |
91 | int len; | 64 | return single_open(file, xqm_proc_show, NULL); |
65 | } | ||
66 | |||
67 | static const struct file_operations xqm_proc_fops = { | ||
68 | .owner = THIS_MODULE, | ||
69 | .open = xqm_proc_open, | ||
70 | .read = seq_read, | ||
71 | .llseek = seq_lseek, | ||
72 | .release = single_release, | ||
73 | }; | ||
92 | 74 | ||
75 | static int xqmstat_proc_show(struct seq_file *m, void *v) | ||
76 | { | ||
93 | /* quota performance statistics */ | 77 | /* quota performance statistics */ |
94 | len = sprintf(buffer, "qm %u %u %u %u %u %u %u %u\n", | 78 | seq_printf(m, "qm %u %u %u %u %u %u %u %u\n", |
95 | xqmstats.xs_qm_dqreclaims, | 79 | xqmstats.xs_qm_dqreclaims, |
96 | xqmstats.xs_qm_dqreclaim_misses, | 80 | xqmstats.xs_qm_dqreclaim_misses, |
97 | xqmstats.xs_qm_dquot_dups, | 81 | xqmstats.xs_qm_dquot_dups, |
@@ -100,25 +84,27 @@ xfs_qm_read_stats( | |||
100 | xqmstats.xs_qm_dqwants, | 84 | xqmstats.xs_qm_dqwants, |
101 | xqmstats.xs_qm_dqshake_reclaims, | 85 | xqmstats.xs_qm_dqshake_reclaims, |
102 | xqmstats.xs_qm_dqinact_reclaims); | 86 | xqmstats.xs_qm_dqinact_reclaims); |
87 | return 0; | ||
88 | } | ||
103 | 89 | ||
104 | if (offset >= len) { | 90 | static int xqmstat_proc_open(struct inode *inode, struct file *file) |
105 | *start = buffer; | 91 | { |
106 | *eof = 1; | 92 | return single_open(file, xqmstat_proc_show, NULL); |
107 | return 0; | ||
108 | } | ||
109 | *start = buffer + offset; | ||
110 | if ((len -= offset) > count) | ||
111 | return count; | ||
112 | *eof = 1; | ||
113 | |||
114 | return len; | ||
115 | } | 93 | } |
116 | 94 | ||
95 | static const struct file_operations xqmstat_proc_fops = { | ||
96 | .owner = THIS_MODULE, | ||
97 | .open = xqmstat_proc_open, | ||
98 | .read = seq_read, | ||
99 | .llseek = seq_lseek, | ||
100 | .release = single_release, | ||
101 | }; | ||
102 | |||
117 | void | 103 | void |
118 | xfs_qm_init_procfs(void) | 104 | xfs_qm_init_procfs(void) |
119 | { | 105 | { |
120 | create_proc_read_entry("fs/xfs/xqmstat", 0, NULL, xfs_qm_read_stats, NULL); | 106 | proc_create("fs/xfs/xqmstat", 0, NULL, &xqmstat_proc_fops); |
121 | create_proc_read_entry("fs/xfs/xqm", 0, NULL, xfs_qm_read_xfsquota, NULL); | 107 | proc_create("fs/xfs/xqm", 0, NULL, &xqm_proc_fops); |
122 | } | 108 | } |
123 | 109 | ||
124 | void | 110 | void |
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h index f24b50b68d03..a5d54bf4931b 100644 --- a/fs/xfs/xfs_ag.h +++ b/fs/xfs/xfs_ag.h | |||
@@ -198,6 +198,15 @@ typedef struct xfs_perag | |||
198 | xfs_agino_t pagi_count; /* number of allocated inodes */ | 198 | xfs_agino_t pagi_count; /* number of allocated inodes */ |
199 | int pagb_count; /* pagb slots in use */ | 199 | int pagb_count; /* pagb slots in use */ |
200 | xfs_perag_busy_t *pagb_list; /* unstable blocks */ | 200 | xfs_perag_busy_t *pagb_list; /* unstable blocks */ |
201 | |||
202 | /* | ||
203 | * Inode allocation search lookup optimisation. | ||
204 | * If the pagino matches, the search for new inodes | ||
205 | * doesn't need to search the near ones again straight away | ||
206 | */ | ||
207 | xfs_agino_t pagl_pagino; | ||
208 | xfs_agino_t pagl_leftrec; | ||
209 | xfs_agino_t pagl_rightrec; | ||
201 | #ifdef __KERNEL__ | 210 | #ifdef __KERNEL__ |
202 | spinlock_t pagb_lock; /* lock for pagb_list */ | 211 | spinlock_t pagb_lock; /* lock for pagb_list */ |
203 | 212 | ||
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index db15feb906ff..4ece1906bd41 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c | |||
@@ -2010,7 +2010,9 @@ xfs_attr_rmtval_get(xfs_da_args_t *args) | |||
2010 | dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock); | 2010 | dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock); |
2011 | blkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount); | 2011 | blkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount); |
2012 | error = xfs_read_buf(mp, mp->m_ddev_targp, dblkno, | 2012 | error = xfs_read_buf(mp, mp->m_ddev_targp, dblkno, |
2013 | blkcnt, XFS_BUF_LOCK, &bp); | 2013 | blkcnt, |
2014 | XFS_BUF_LOCK | XBF_DONT_BLOCK, | ||
2015 | &bp); | ||
2014 | if (error) | 2016 | if (error) |
2015 | return(error); | 2017 | return(error); |
2016 | 2018 | ||
@@ -2141,8 +2143,8 @@ xfs_attr_rmtval_set(xfs_da_args_t *args) | |||
2141 | dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock), | 2143 | dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock), |
2142 | blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); | 2144 | blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); |
2143 | 2145 | ||
2144 | bp = xfs_buf_get_flags(mp->m_ddev_targp, dblkno, | 2146 | bp = xfs_buf_get_flags(mp->m_ddev_targp, dblkno, blkcnt, |
2145 | blkcnt, XFS_BUF_LOCK); | 2147 | XFS_BUF_LOCK | XBF_DONT_BLOCK); |
2146 | ASSERT(bp); | 2148 | ASSERT(bp); |
2147 | ASSERT(!XFS_BUF_GETERROR(bp)); | 2149 | ASSERT(!XFS_BUF_GETERROR(bp)); |
2148 | 2150 | ||
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 7928b9983c1d..8971fb09d387 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c | |||
@@ -3713,7 +3713,7 @@ done: | |||
3713 | * entry (null if none). Else, *lastxp will be set to the index | 3713 | * entry (null if none). Else, *lastxp will be set to the index |
3714 | * of the found entry; *gotp will contain the entry. | 3714 | * of the found entry; *gotp will contain the entry. |
3715 | */ | 3715 | */ |
3716 | xfs_bmbt_rec_host_t * /* pointer to found extent entry */ | 3716 | STATIC xfs_bmbt_rec_host_t * /* pointer to found extent entry */ |
3717 | xfs_bmap_search_multi_extents( | 3717 | xfs_bmap_search_multi_extents( |
3718 | xfs_ifork_t *ifp, /* inode fork pointer */ | 3718 | xfs_ifork_t *ifp, /* inode fork pointer */ |
3719 | xfs_fileoff_t bno, /* block number searched for */ | 3719 | xfs_fileoff_t bno, /* block number searched for */ |
@@ -6009,7 +6009,7 @@ xfs_getbmap( | |||
6009 | */ | 6009 | */ |
6010 | error = ENOMEM; | 6010 | error = ENOMEM; |
6011 | subnex = 16; | 6011 | subnex = 16; |
6012 | map = kmem_alloc(subnex * sizeof(*map), KM_MAYFAIL); | 6012 | map = kmem_alloc(subnex * sizeof(*map), KM_MAYFAIL | KM_NOFS); |
6013 | if (!map) | 6013 | if (!map) |
6014 | goto out_unlock_ilock; | 6014 | goto out_unlock_ilock; |
6015 | 6015 | ||
diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index 1b8ff9256bd0..56f62d2edc35 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h | |||
@@ -392,17 +392,6 @@ xfs_bmap_count_blocks( | |||
392 | int whichfork, | 392 | int whichfork, |
393 | int *count); | 393 | int *count); |
394 | 394 | ||
395 | /* | ||
396 | * Search the extent records for the entry containing block bno. | ||
397 | * If bno lies in a hole, point to the next entry. If bno lies | ||
398 | * past eof, *eofp will be set, and *prevp will contain the last | ||
399 | * entry (null if none). Else, *lastxp will be set to the index | ||
400 | * of the found entry; *gotp will contain the entry. | ||
401 | */ | ||
402 | xfs_bmbt_rec_host_t * | ||
403 | xfs_bmap_search_multi_extents(struct xfs_ifork *, xfs_fileoff_t, int *, | ||
404 | xfs_extnum_t *, xfs_bmbt_irec_t *, xfs_bmbt_irec_t *); | ||
405 | |||
406 | #endif /* __KERNEL__ */ | 395 | #endif /* __KERNEL__ */ |
407 | 396 | ||
408 | #endif /* __XFS_BMAP_H__ */ | 397 | #endif /* __XFS_BMAP_H__ */ |
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c index 5c1ade06578e..eb7b702d0690 100644 --- a/fs/xfs/xfs_bmap_btree.c +++ b/fs/xfs/xfs_bmap_btree.c | |||
@@ -202,16 +202,6 @@ xfs_bmbt_get_state( | |||
202 | ext_flag); | 202 | ext_flag); |
203 | } | 203 | } |
204 | 204 | ||
205 | /* Endian flipping versions of the bmbt extraction functions */ | ||
206 | void | ||
207 | xfs_bmbt_disk_get_all( | ||
208 | xfs_bmbt_rec_t *r, | ||
209 | xfs_bmbt_irec_t *s) | ||
210 | { | ||
211 | __xfs_bmbt_get_all(get_unaligned_be64(&r->l0), | ||
212 | get_unaligned_be64(&r->l1), s); | ||
213 | } | ||
214 | |||
215 | /* | 205 | /* |
216 | * Extract the blockcount field from an on disk bmap extent record. | 206 | * Extract the blockcount field from an on disk bmap extent record. |
217 | */ | 207 | */ |
@@ -816,6 +806,16 @@ xfs_bmbt_trace_key( | |||
816 | *l1 = 0; | 806 | *l1 = 0; |
817 | } | 807 | } |
818 | 808 | ||
809 | /* Endian flipping versions of the bmbt extraction functions */ | ||
810 | STATIC void | ||
811 | xfs_bmbt_disk_get_all( | ||
812 | xfs_bmbt_rec_t *r, | ||
813 | xfs_bmbt_irec_t *s) | ||
814 | { | ||
815 | __xfs_bmbt_get_all(get_unaligned_be64(&r->l0), | ||
816 | get_unaligned_be64(&r->l1), s); | ||
817 | } | ||
818 | |||
819 | STATIC void | 819 | STATIC void |
820 | xfs_bmbt_trace_record( | 820 | xfs_bmbt_trace_record( |
821 | struct xfs_btree_cur *cur, | 821 | struct xfs_btree_cur *cur, |
diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h index 0e8df007615e..5549d495947f 100644 --- a/fs/xfs/xfs_bmap_btree.h +++ b/fs/xfs/xfs_bmap_btree.h | |||
@@ -220,7 +220,6 @@ extern xfs_fsblock_t xfs_bmbt_get_startblock(xfs_bmbt_rec_host_t *r); | |||
220 | extern xfs_fileoff_t xfs_bmbt_get_startoff(xfs_bmbt_rec_host_t *r); | 220 | extern xfs_fileoff_t xfs_bmbt_get_startoff(xfs_bmbt_rec_host_t *r); |
221 | extern xfs_exntst_t xfs_bmbt_get_state(xfs_bmbt_rec_host_t *r); | 221 | extern xfs_exntst_t xfs_bmbt_get_state(xfs_bmbt_rec_host_t *r); |
222 | 222 | ||
223 | extern void xfs_bmbt_disk_get_all(xfs_bmbt_rec_t *r, xfs_bmbt_irec_t *s); | ||
224 | extern xfs_filblks_t xfs_bmbt_disk_get_blockcount(xfs_bmbt_rec_t *r); | 223 | extern xfs_filblks_t xfs_bmbt_disk_get_blockcount(xfs_bmbt_rec_t *r); |
225 | extern xfs_fileoff_t xfs_bmbt_disk_get_startoff(xfs_bmbt_rec_t *r); | 224 | extern xfs_fileoff_t xfs_bmbt_disk_get_startoff(xfs_bmbt_rec_t *r); |
226 | 225 | ||
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c index e9df99574829..52b5f14d0c32 100644 --- a/fs/xfs/xfs_btree.c +++ b/fs/xfs/xfs_btree.c | |||
@@ -120,8 +120,8 @@ xfs_btree_check_sblock( | |||
120 | XFS_RANDOM_BTREE_CHECK_SBLOCK))) { | 120 | XFS_RANDOM_BTREE_CHECK_SBLOCK))) { |
121 | if (bp) | 121 | if (bp) |
122 | xfs_buftrace("SBTREE ERROR", bp); | 122 | xfs_buftrace("SBTREE ERROR", bp); |
123 | XFS_ERROR_REPORT("xfs_btree_check_sblock", XFS_ERRLEVEL_LOW, | 123 | XFS_CORRUPTION_ERROR("xfs_btree_check_sblock", |
124 | cur->bc_mp); | 124 | XFS_ERRLEVEL_LOW, cur->bc_mp, block); |
125 | return XFS_ERROR(EFSCORRUPTED); | 125 | return XFS_ERROR(EFSCORRUPTED); |
126 | } | 126 | } |
127 | return 0; | 127 | return 0; |
@@ -646,46 +646,6 @@ xfs_btree_read_bufl( | |||
646 | } | 646 | } |
647 | 647 | ||
648 | /* | 648 | /* |
649 | * Get a buffer for the block, return it read in. | ||
650 | * Short-form addressing. | ||
651 | */ | ||
652 | int /* error */ | ||
653 | xfs_btree_read_bufs( | ||
654 | xfs_mount_t *mp, /* file system mount point */ | ||
655 | xfs_trans_t *tp, /* transaction pointer */ | ||
656 | xfs_agnumber_t agno, /* allocation group number */ | ||
657 | xfs_agblock_t agbno, /* allocation group block number */ | ||
658 | uint lock, /* lock flags for read_buf */ | ||
659 | xfs_buf_t **bpp, /* buffer for agno/agbno */ | ||
660 | int refval) /* ref count value for buffer */ | ||
661 | { | ||
662 | xfs_buf_t *bp; /* return value */ | ||
663 | xfs_daddr_t d; /* real disk block address */ | ||
664 | int error; | ||
665 | |||
666 | ASSERT(agno != NULLAGNUMBER); | ||
667 | ASSERT(agbno != NULLAGBLOCK); | ||
668 | d = XFS_AGB_TO_DADDR(mp, agno, agbno); | ||
669 | if ((error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, d, | ||
670 | mp->m_bsize, lock, &bp))) { | ||
671 | return error; | ||
672 | } | ||
673 | ASSERT(!bp || !XFS_BUF_GETERROR(bp)); | ||
674 | if (bp != NULL) { | ||
675 | switch (refval) { | ||
676 | case XFS_ALLOC_BTREE_REF: | ||
677 | XFS_BUF_SET_VTYPE_REF(bp, B_FS_MAP, refval); | ||
678 | break; | ||
679 | case XFS_INO_BTREE_REF: | ||
680 | XFS_BUF_SET_VTYPE_REF(bp, B_FS_INOMAP, refval); | ||
681 | break; | ||
682 | } | ||
683 | } | ||
684 | *bpp = bp; | ||
685 | return 0; | ||
686 | } | ||
687 | |||
688 | /* | ||
689 | * Read-ahead the block, don't wait for it, don't return a buffer. | 649 | * Read-ahead the block, don't wait for it, don't return a buffer. |
690 | * Long-form addressing. | 650 | * Long-form addressing. |
691 | */ | 651 | */ |
@@ -2951,7 +2911,7 @@ error0: | |||
2951 | * inode we have to copy the single block it was pointing to into the | 2911 | * inode we have to copy the single block it was pointing to into the |
2952 | * inode. | 2912 | * inode. |
2953 | */ | 2913 | */ |
2954 | int | 2914 | STATIC int |
2955 | xfs_btree_kill_iroot( | 2915 | xfs_btree_kill_iroot( |
2956 | struct xfs_btree_cur *cur) | 2916 | struct xfs_btree_cur *cur) |
2957 | { | 2917 | { |
diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h index 4f852b735b96..7fa07062bdda 100644 --- a/fs/xfs/xfs_btree.h +++ b/fs/xfs/xfs_btree.h | |||
@@ -379,20 +379,6 @@ xfs_btree_read_bufl( | |||
379 | int refval);/* ref count value for buffer */ | 379 | int refval);/* ref count value for buffer */ |
380 | 380 | ||
381 | /* | 381 | /* |
382 | * Get a buffer for the block, return it read in. | ||
383 | * Short-form addressing. | ||
384 | */ | ||
385 | int /* error */ | ||
386 | xfs_btree_read_bufs( | ||
387 | struct xfs_mount *mp, /* file system mount point */ | ||
388 | struct xfs_trans *tp, /* transaction pointer */ | ||
389 | xfs_agnumber_t agno, /* allocation group number */ | ||
390 | xfs_agblock_t agbno, /* allocation group block number */ | ||
391 | uint lock, /* lock flags for read_buf */ | ||
392 | struct xfs_buf **bpp, /* buffer for agno/agbno */ | ||
393 | int refval);/* ref count value for buffer */ | ||
394 | |||
395 | /* | ||
396 | * Read-ahead the block, don't wait for it, don't return a buffer. | 382 | * Read-ahead the block, don't wait for it, don't return a buffer. |
397 | * Long-form addressing. | 383 | * Long-form addressing. |
398 | */ | 384 | */ |
@@ -432,7 +418,6 @@ int xfs_btree_decrement(struct xfs_btree_cur *, int, int *); | |||
432 | int xfs_btree_lookup(struct xfs_btree_cur *, xfs_lookup_t, int *); | 418 | int xfs_btree_lookup(struct xfs_btree_cur *, xfs_lookup_t, int *); |
433 | int xfs_btree_update(struct xfs_btree_cur *, union xfs_btree_rec *); | 419 | int xfs_btree_update(struct xfs_btree_cur *, union xfs_btree_rec *); |
434 | int xfs_btree_new_iroot(struct xfs_btree_cur *, int *, int *); | 420 | int xfs_btree_new_iroot(struct xfs_btree_cur *, int *, int *); |
435 | int xfs_btree_kill_iroot(struct xfs_btree_cur *); | ||
436 | int xfs_btree_insert(struct xfs_btree_cur *, int *); | 421 | int xfs_btree_insert(struct xfs_btree_cur *, int *); |
437 | int xfs_btree_delete(struct xfs_btree_cur *, int *); | 422 | int xfs_btree_delete(struct xfs_btree_cur *, int *); |
438 | int xfs_btree_get_rec(struct xfs_btree_cur *, union xfs_btree_rec **, int *); | 423 | int xfs_btree_get_rec(struct xfs_btree_cur *, union xfs_btree_rec **, int *); |
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index 9ff6e57a5075..2847bbc1c534 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c | |||
@@ -2201,7 +2201,7 @@ kmem_zone_t *xfs_dabuf_zone; /* dabuf zone */ | |||
2201 | xfs_da_state_t * | 2201 | xfs_da_state_t * |
2202 | xfs_da_state_alloc(void) | 2202 | xfs_da_state_alloc(void) |
2203 | { | 2203 | { |
2204 | return kmem_zone_zalloc(xfs_da_state_zone, KM_SLEEP); | 2204 | return kmem_zone_zalloc(xfs_da_state_zone, KM_NOFS); |
2205 | } | 2205 | } |
2206 | 2206 | ||
2207 | /* | 2207 | /* |
@@ -2261,9 +2261,9 @@ xfs_da_buf_make(int nbuf, xfs_buf_t **bps, inst_t *ra) | |||
2261 | int off; | 2261 | int off; |
2262 | 2262 | ||
2263 | if (nbuf == 1) | 2263 | if (nbuf == 1) |
2264 | dabuf = kmem_zone_alloc(xfs_dabuf_zone, KM_SLEEP); | 2264 | dabuf = kmem_zone_alloc(xfs_dabuf_zone, KM_NOFS); |
2265 | else | 2265 | else |
2266 | dabuf = kmem_alloc(XFS_DA_BUF_SIZE(nbuf), KM_SLEEP); | 2266 | dabuf = kmem_alloc(XFS_DA_BUF_SIZE(nbuf), KM_NOFS); |
2267 | dabuf->dirty = 0; | 2267 | dabuf->dirty = 0; |
2268 | #ifdef XFS_DABUF_DEBUG | 2268 | #ifdef XFS_DABUF_DEBUG |
2269 | dabuf->ra = ra; | 2269 | dabuf->ra = ra; |
diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c index c657bec6d951..bb1d58eb3982 100644 --- a/fs/xfs/xfs_dir2.c +++ b/fs/xfs/xfs_dir2.c | |||
@@ -256,7 +256,7 @@ xfs_dir_cilookup_result( | |||
256 | !(args->op_flags & XFS_DA_OP_CILOOKUP)) | 256 | !(args->op_flags & XFS_DA_OP_CILOOKUP)) |
257 | return EEXIST; | 257 | return EEXIST; |
258 | 258 | ||
259 | args->value = kmem_alloc(len, KM_MAYFAIL); | 259 | args->value = kmem_alloc(len, KM_NOFS | KM_MAYFAIL); |
260 | if (!args->value) | 260 | if (!args->value) |
261 | return ENOMEM; | 261 | return ENOMEM; |
262 | 262 | ||
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index cbd451bb4848..2d0b3e1da9e6 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c | |||
@@ -167,17 +167,25 @@ xfs_growfs_data_private( | |||
167 | new = nb - mp->m_sb.sb_dblocks; | 167 | new = nb - mp->m_sb.sb_dblocks; |
168 | oagcount = mp->m_sb.sb_agcount; | 168 | oagcount = mp->m_sb.sb_agcount; |
169 | if (nagcount > oagcount) { | 169 | if (nagcount > oagcount) { |
170 | void *new_perag, *old_perag; | ||
171 | |||
170 | xfs_filestream_flush(mp); | 172 | xfs_filestream_flush(mp); |
173 | |||
174 | new_perag = kmem_zalloc(sizeof(xfs_perag_t) * nagcount, | ||
175 | KM_MAYFAIL); | ||
176 | if (!new_perag) | ||
177 | return XFS_ERROR(ENOMEM); | ||
178 | |||
171 | down_write(&mp->m_peraglock); | 179 | down_write(&mp->m_peraglock); |
172 | mp->m_perag = kmem_realloc(mp->m_perag, | 180 | memcpy(new_perag, mp->m_perag, sizeof(xfs_perag_t) * oagcount); |
173 | sizeof(xfs_perag_t) * nagcount, | 181 | old_perag = mp->m_perag; |
174 | sizeof(xfs_perag_t) * oagcount, | 182 | mp->m_perag = new_perag; |
175 | KM_SLEEP); | 183 | |
176 | memset(&mp->m_perag[oagcount], 0, | ||
177 | (nagcount - oagcount) * sizeof(xfs_perag_t)); | ||
178 | mp->m_flags |= XFS_MOUNT_32BITINODES; | 184 | mp->m_flags |= XFS_MOUNT_32BITINODES; |
179 | nagimax = xfs_initialize_perag(mp, nagcount); | 185 | nagimax = xfs_initialize_perag(mp, nagcount); |
180 | up_write(&mp->m_peraglock); | 186 | up_write(&mp->m_peraglock); |
187 | |||
188 | kmem_free(old_perag); | ||
181 | } | 189 | } |
182 | tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFS); | 190 | tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFS); |
183 | tp->t_flags |= XFS_TRANS_RESERVE; | 191 | tp->t_flags |= XFS_TRANS_RESERVE; |
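The growfs hunk above replaces an in-place kmem_realloc() with allocate-copy-swap-free. A sketch of the pattern using the names from the hunk: the allocation happens before taking m_peraglock, so a failed allocation leaves the old per-AG table intact and no sleeping allocation runs under the write lock.

static int example_grow_perag(struct xfs_mount *mp,
			      xfs_agnumber_t oagcount,
			      xfs_agnumber_t nagcount)
{
	xfs_perag_t *new_perag, *old_perag;

	/* allocate outside the lock; failure here is harmless */
	new_perag = kmem_zalloc(sizeof(xfs_perag_t) * nagcount, KM_MAYFAIL);
	if (!new_perag)
		return XFS_ERROR(ENOMEM);

	/* copy the live entries and swap pointers under the write lock */
	down_write(&mp->m_peraglock);
	memcpy(new_perag, mp->m_perag, sizeof(xfs_perag_t) * oagcount);
	old_perag = mp->m_perag;
	mp->m_perag = new_perag;
	up_write(&mp->m_peraglock);

	/* free the superseded table after dropping the lock */
	kmem_free(old_perag);
	return 0;
}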
diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 3120a3a5e20f..ab64f3efb43b 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c | |||
@@ -57,75 +57,35 @@ xfs_ialloc_cluster_alignment( | |||
57 | } | 57 | } |
58 | 58 | ||
59 | /* | 59 | /* |
60 | * Lookup the record equal to ino in the btree given by cur. | 60 | * Lookup a record by ino in the btree given by cur. |
61 | */ | ||
62 | STATIC int /* error */ | ||
63 | xfs_inobt_lookup_eq( | ||
64 | struct xfs_btree_cur *cur, /* btree cursor */ | ||
65 | xfs_agino_t ino, /* starting inode of chunk */ | ||
66 | __int32_t fcnt, /* free inode count */ | ||
67 | xfs_inofree_t free, /* free inode mask */ | ||
68 | int *stat) /* success/failure */ | ||
69 | { | ||
70 | cur->bc_rec.i.ir_startino = ino; | ||
71 | cur->bc_rec.i.ir_freecount = fcnt; | ||
72 | cur->bc_rec.i.ir_free = free; | ||
73 | return xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat); | ||
74 | } | ||
75 | |||
76 | /* | ||
77 | * Lookup the first record greater than or equal to ino | ||
78 | * in the btree given by cur. | ||
79 | */ | 61 | */ |
80 | int /* error */ | 62 | int /* error */ |
81 | xfs_inobt_lookup_ge( | 63 | xfs_inobt_lookup( |
82 | struct xfs_btree_cur *cur, /* btree cursor */ | 64 | struct xfs_btree_cur *cur, /* btree cursor */ |
83 | xfs_agino_t ino, /* starting inode of chunk */ | 65 | xfs_agino_t ino, /* starting inode of chunk */ |
84 | __int32_t fcnt, /* free inode count */ | 66 | xfs_lookup_t dir, /* <=, >=, == */ |
85 | xfs_inofree_t free, /* free inode mask */ | ||
86 | int *stat) /* success/failure */ | 67 | int *stat) /* success/failure */ |
87 | { | 68 | { |
88 | cur->bc_rec.i.ir_startino = ino; | 69 | cur->bc_rec.i.ir_startino = ino; |
89 | cur->bc_rec.i.ir_freecount = fcnt; | 70 | cur->bc_rec.i.ir_freecount = 0; |
90 | cur->bc_rec.i.ir_free = free; | 71 | cur->bc_rec.i.ir_free = 0; |
91 | return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat); | 72 | return xfs_btree_lookup(cur, dir, stat); |
92 | } | 73 | } |
93 | 74 | ||
94 | /* | 75 | /* |
95 | * Lookup the first record less than or equal to ino | 76 | * Update the record referred to by cur to the value given. |
96 | * in the btree given by cur. | ||
97 | */ | ||
98 | int /* error */ | ||
99 | xfs_inobt_lookup_le( | ||
100 | struct xfs_btree_cur *cur, /* btree cursor */ | ||
101 | xfs_agino_t ino, /* starting inode of chunk */ | ||
102 | __int32_t fcnt, /* free inode count */ | ||
103 | xfs_inofree_t free, /* free inode mask */ | ||
104 | int *stat) /* success/failure */ | ||
105 | { | ||
106 | cur->bc_rec.i.ir_startino = ino; | ||
107 | cur->bc_rec.i.ir_freecount = fcnt; | ||
108 | cur->bc_rec.i.ir_free = free; | ||
109 | return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat); | ||
110 | } | ||
111 | |||
112 | /* | ||
113 | * Update the record referred to by cur to the value given | ||
114 | * by [ino, fcnt, free]. | ||
115 | * This either works (return 0) or gets an EFSCORRUPTED error. | 77 | * This either works (return 0) or gets an EFSCORRUPTED error. |
116 | */ | 78 | */ |
117 | STATIC int /* error */ | 79 | STATIC int /* error */ |
118 | xfs_inobt_update( | 80 | xfs_inobt_update( |
119 | struct xfs_btree_cur *cur, /* btree cursor */ | 81 | struct xfs_btree_cur *cur, /* btree cursor */ |
120 | xfs_agino_t ino, /* starting inode of chunk */ | 82 | xfs_inobt_rec_incore_t *irec) /* btree record */ |
121 | __int32_t fcnt, /* free inode count */ | ||
122 | xfs_inofree_t free) /* free inode mask */ | ||
123 | { | 83 | { |
124 | union xfs_btree_rec rec; | 84 | union xfs_btree_rec rec; |
125 | 85 | ||
126 | rec.inobt.ir_startino = cpu_to_be32(ino); | 86 | rec.inobt.ir_startino = cpu_to_be32(irec->ir_startino); |
127 | rec.inobt.ir_freecount = cpu_to_be32(fcnt); | 87 | rec.inobt.ir_freecount = cpu_to_be32(irec->ir_freecount); |
128 | rec.inobt.ir_free = cpu_to_be64(free); | 88 | rec.inobt.ir_free = cpu_to_be64(irec->ir_free); |
129 | return xfs_btree_update(cur, &rec); | 89 | return xfs_btree_update(cur, &rec); |
130 | } | 90 | } |
131 | 91 | ||
@@ -135,9 +95,7 @@ xfs_inobt_update( | |||
135 | int /* error */ | 95 | int /* error */ |
136 | xfs_inobt_get_rec( | 96 | xfs_inobt_get_rec( |
137 | struct xfs_btree_cur *cur, /* btree cursor */ | 97 | struct xfs_btree_cur *cur, /* btree cursor */ |
138 | xfs_agino_t *ino, /* output: starting inode of chunk */ | 98 | xfs_inobt_rec_incore_t *irec, /* btree record */ |
139 | __int32_t *fcnt, /* output: number of free inodes */ | ||
140 | xfs_inofree_t *free, /* output: free inode mask */ | ||
141 | int *stat) /* output: success/failure */ | 99 | int *stat) /* output: success/failure */ |
142 | { | 100 | { |
143 | union xfs_btree_rec *rec; | 101 | union xfs_btree_rec *rec; |
@@ -145,14 +103,136 @@ xfs_inobt_get_rec( | |||
145 | 103 | ||
146 | error = xfs_btree_get_rec(cur, &rec, stat); | 104 | error = xfs_btree_get_rec(cur, &rec, stat); |
147 | if (!error && *stat == 1) { | 105 | if (!error && *stat == 1) { |
148 | *ino = be32_to_cpu(rec->inobt.ir_startino); | 106 | irec->ir_startino = be32_to_cpu(rec->inobt.ir_startino); |
149 | *fcnt = be32_to_cpu(rec->inobt.ir_freecount); | 107 | irec->ir_freecount = be32_to_cpu(rec->inobt.ir_freecount); |
150 | *free = be64_to_cpu(rec->inobt.ir_free); | 108 | irec->ir_free = be64_to_cpu(rec->inobt.ir_free); |
151 | } | 109 | } |
152 | return error; | 110 | return error; |
153 | } | 111 | } |
154 | 112 | ||
155 | /* | 113 | /* |
114 | * Verify that the number of free inodes in the AGI is correct. | ||
115 | */ | ||
116 | #ifdef DEBUG | ||
117 | STATIC int | ||
118 | xfs_check_agi_freecount( | ||
119 | struct xfs_btree_cur *cur, | ||
120 | struct xfs_agi *agi) | ||
121 | { | ||
122 | if (cur->bc_nlevels == 1) { | ||
123 | xfs_inobt_rec_incore_t rec; | ||
124 | int freecount = 0; | ||
125 | int error; | ||
126 | int i; | ||
127 | |||
128 | error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i); | ||
129 | if (error) | ||
130 | return error; | ||
131 | |||
132 | do { | ||
133 | error = xfs_inobt_get_rec(cur, &rec, &i); | ||
134 | if (error) | ||
135 | return error; | ||
136 | |||
137 | if (i) { | ||
138 | freecount += rec.ir_freecount; | ||
139 | error = xfs_btree_increment(cur, 0, &i); | ||
140 | if (error) | ||
141 | return error; | ||
142 | } | ||
143 | } while (i == 1); | ||
144 | |||
145 | if (!XFS_FORCED_SHUTDOWN(cur->bc_mp)) | ||
146 | ASSERT(freecount == be32_to_cpu(agi->agi_freecount)); | ||
147 | } | ||
148 | return 0; | ||
149 | } | ||
150 | #else | ||
151 | #define xfs_check_agi_freecount(cur, agi) 0 | ||
152 | #endif | ||
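In non-DEBUG builds the stub above expands to the constant 0, so every caller can assign and test the result unconditionally and the compiler discards the dead branch. The pattern as it appears at each call site this patch converts:

	error = xfs_check_agi_freecount(cur, agi);	/* constant 0 unless DEBUG */
	if (error)
		goto error0;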
153 | |||
154 | /* | ||
155 | * Initialise a new set of inodes. | ||
156 | */ | ||
157 | STATIC void | ||
158 | xfs_ialloc_inode_init( | ||
159 | struct xfs_mount *mp, | ||
160 | struct xfs_trans *tp, | ||
161 | xfs_agnumber_t agno, | ||
162 | xfs_agblock_t agbno, | ||
163 | xfs_agblock_t length, | ||
164 | unsigned int gen) | ||
165 | { | ||
166 | struct xfs_buf *fbuf; | ||
167 | struct xfs_dinode *free; | ||
168 | int blks_per_cluster, nbufs, ninodes; | ||
169 | int version; | ||
170 | int i, j; | ||
171 | xfs_daddr_t d; | ||
172 | |||
173 | /* | ||
174 | * Loop over the new block(s), filling in the inodes. | ||
175 | * For small block sizes, manipulate the inodes in buffers | ||
176 | * which are multiples of the block size. | ||
177 | */ | ||
178 | if (mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(mp)) { | ||
179 | blks_per_cluster = 1; | ||
180 | nbufs = length; | ||
181 | ninodes = mp->m_sb.sb_inopblock; | ||
182 | } else { | ||
183 | blks_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) / | ||
184 | mp->m_sb.sb_blocksize; | ||
185 | nbufs = length / blks_per_cluster; | ||
186 | ninodes = blks_per_cluster * mp->m_sb.sb_inopblock; | ||
187 | } | ||
188 | |||
189 | /* | ||
190 | * Figure out what version number to use in the inodes we create. | ||
191 | * If the superblock version has caught up to the one that supports | ||
192 | * the new inode format, then use the new inode version. Otherwise | ||
193 | * use the old version so that old kernels will continue to be | ||
194 | * able to use the file system. | ||
195 | */ | ||
196 | if (xfs_sb_version_hasnlink(&mp->m_sb)) | ||
197 | version = 2; | ||
198 | else | ||
199 | version = 1; | ||
200 | |||
201 | for (j = 0; j < nbufs; j++) { | ||
202 | /* | ||
203 | * Get the block. | ||
204 | */ | ||
205 | d = XFS_AGB_TO_DADDR(mp, agno, agbno + (j * blks_per_cluster)); | ||
206 | fbuf = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, | ||
207 | mp->m_bsize * blks_per_cluster, | ||
208 | XFS_BUF_LOCK); | ||
209 | ASSERT(fbuf); | ||
210 | ASSERT(!XFS_BUF_GETERROR(fbuf)); | ||
211 | |||
212 | /* | ||
213 | * Initialize all inodes in this buffer and then log them. | ||
214 | * | ||
215 | * XXX: It would be much better if we had just one transaction | ||
216 | * to log a whole cluster of inodes instead of all the | ||
217 | * individual transactions causing a lot of log traffic. | ||
218 | */ | ||
219 | xfs_biozero(fbuf, 0, ninodes << mp->m_sb.sb_inodelog); | ||
220 | for (i = 0; i < ninodes; i++) { | ||
221 | int ioffset = i << mp->m_sb.sb_inodelog; | ||
222 | uint isize = sizeof(struct xfs_dinode); | ||
223 | |||
224 | free = xfs_make_iptr(mp, fbuf, i); | ||
225 | free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC); | ||
226 | free->di_version = version; | ||
227 | free->di_gen = cpu_to_be32(gen); | ||
228 | free->di_next_unlinked = cpu_to_be32(NULLAGINO); | ||
229 | xfs_trans_log_buf(tp, fbuf, ioffset, ioffset + isize - 1); | ||
230 | } | ||
231 | xfs_trans_inode_alloc_buf(tp, fbuf); | ||
232 | } | ||
233 | } | ||
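To make the sizing logic above concrete, take 4 KiB filesystem blocks, an 8 KiB inode cluster and 256-byte inodes (typical values, assumed here purely for illustration). The small-block branch is taken and each buffer covers one cluster:

	blks_per_cluster = 8192 / 4096;		/* = 2 fs blocks per buffer */
	nbufs = length / blks_per_cluster;	/* e.g. 64 new blocks -> 32 buffers */
	ninodes = blks_per_cluster * 16;	/* sb_inopblock = 4096 / 256 = 16, so 32 inodes per buffer */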
234 | |||
235 | /* | ||
156 | * Allocate new inodes in the allocation group specified by agbp. | 236 | * Allocate new inodes in the allocation group specified by agbp. |
157 | * Return 0 for success, else error code. | 237 | * Return 0 for success, else error code. |
158 | */ | 238 | */ |
@@ -164,24 +244,15 @@ xfs_ialloc_ag_alloc( | |||
164 | { | 244 | { |
165 | xfs_agi_t *agi; /* allocation group header */ | 245 | xfs_agi_t *agi; /* allocation group header */ |
166 | xfs_alloc_arg_t args; /* allocation argument structure */ | 246 | xfs_alloc_arg_t args; /* allocation argument structure */ |
167 | int blks_per_cluster; /* fs blocks per inode cluster */ | ||
168 | xfs_btree_cur_t *cur; /* inode btree cursor */ | 247 | xfs_btree_cur_t *cur; /* inode btree cursor */ |
169 | xfs_daddr_t d; /* disk addr of buffer */ | ||
170 | xfs_agnumber_t agno; | 248 | xfs_agnumber_t agno; |
171 | int error; | 249 | int error; |
172 | xfs_buf_t *fbuf; /* new free inodes' buffer */ | 250 | int i; |
173 | xfs_dinode_t *free; /* new free inode structure */ | ||
174 | int i; /* inode counter */ | ||
175 | int j; /* block counter */ | ||
176 | int nbufs; /* num bufs of new inodes */ | ||
177 | xfs_agino_t newino; /* new first inode's number */ | 251 | xfs_agino_t newino; /* new first inode's number */ |
178 | xfs_agino_t newlen; /* new number of inodes */ | 252 | xfs_agino_t newlen; /* new number of inodes */ |
179 | int ninodes; /* num inodes per buf */ | ||
180 | xfs_agino_t thisino; /* current inode number, for loop */ | 253 | xfs_agino_t thisino; /* current inode number, for loop */ |
181 | int version; /* inode version number to use */ | ||
182 | int isaligned = 0; /* inode allocation at stripe unit */ | 254 | int isaligned = 0; /* inode allocation at stripe unit */ |
183 | /* boundary */ | 255 | /* boundary */ |
184 | unsigned int gen; | ||
185 | 256 | ||
186 | args.tp = tp; | 257 | args.tp = tp; |
187 | args.mp = tp->t_mountp; | 258 | args.mp = tp->t_mountp; |
@@ -202,12 +273,12 @@ xfs_ialloc_ag_alloc( | |||
202 | */ | 273 | */ |
203 | agi = XFS_BUF_TO_AGI(agbp); | 274 | agi = XFS_BUF_TO_AGI(agbp); |
204 | newino = be32_to_cpu(agi->agi_newino); | 275 | newino = be32_to_cpu(agi->agi_newino); |
276 | agno = be32_to_cpu(agi->agi_seqno); | ||
205 | args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) + | 277 | args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) + |
206 | XFS_IALLOC_BLOCKS(args.mp); | 278 | XFS_IALLOC_BLOCKS(args.mp); |
207 | if (likely(newino != NULLAGINO && | 279 | if (likely(newino != NULLAGINO && |
208 | (args.agbno < be32_to_cpu(agi->agi_length)))) { | 280 | (args.agbno < be32_to_cpu(agi->agi_length)))) { |
209 | args.fsbno = XFS_AGB_TO_FSB(args.mp, | 281 | args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno); |
210 | be32_to_cpu(agi->agi_seqno), args.agbno); | ||
211 | args.type = XFS_ALLOCTYPE_THIS_BNO; | 282 | args.type = XFS_ALLOCTYPE_THIS_BNO; |
212 | args.mod = args.total = args.wasdel = args.isfl = | 283 | args.mod = args.total = args.wasdel = args.isfl = |
213 | args.userdata = args.minalignslop = 0; | 284 | args.userdata = args.minalignslop = 0; |
@@ -258,8 +329,7 @@ xfs_ialloc_ag_alloc( | |||
258 | * For now, just allocate blocks up front. | 329 | * For now, just allocate blocks up front. |
259 | */ | 330 | */ |
260 | args.agbno = be32_to_cpu(agi->agi_root); | 331 | args.agbno = be32_to_cpu(agi->agi_root); |
261 | args.fsbno = XFS_AGB_TO_FSB(args.mp, | 332 | args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno); |
262 | be32_to_cpu(agi->agi_seqno), args.agbno); | ||
263 | /* | 333 | /* |
264 | * Allocate a fixed-size extent of inodes. | 334 | * Allocate a fixed-size extent of inodes. |
265 | */ | 335 | */ |
@@ -282,8 +352,7 @@ xfs_ialloc_ag_alloc( | |||
282 | if (isaligned && args.fsbno == NULLFSBLOCK) { | 352 | if (isaligned && args.fsbno == NULLFSBLOCK) { |
283 | args.type = XFS_ALLOCTYPE_NEAR_BNO; | 353 | args.type = XFS_ALLOCTYPE_NEAR_BNO; |
284 | args.agbno = be32_to_cpu(agi->agi_root); | 354 | args.agbno = be32_to_cpu(agi->agi_root); |
285 | args.fsbno = XFS_AGB_TO_FSB(args.mp, | 355 | args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno); |
286 | be32_to_cpu(agi->agi_seqno), args.agbno); | ||
287 | args.alignment = xfs_ialloc_cluster_alignment(&args); | 356 | args.alignment = xfs_ialloc_cluster_alignment(&args); |
288 | if ((error = xfs_alloc_vextent(&args))) | 357 | if ((error = xfs_alloc_vextent(&args))) |
289 | return error; | 358 | return error; |
@@ -294,85 +363,30 @@ xfs_ialloc_ag_alloc( | |||
294 | return 0; | 363 | return 0; |
295 | } | 364 | } |
296 | ASSERT(args.len == args.minlen); | 365 | ASSERT(args.len == args.minlen); |
297 | /* | ||
298 | * Convert the results. | ||
299 | */ | ||
300 | newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0); | ||
301 | /* | ||
302 | * Loop over the new block(s), filling in the inodes. | ||
303 | * For small block sizes, manipulate the inodes in buffers | ||
304 | * which are multiples of the blocks size. | ||
305 | */ | ||
306 | if (args.mp->m_sb.sb_blocksize >= XFS_INODE_CLUSTER_SIZE(args.mp)) { | ||
307 | blks_per_cluster = 1; | ||
308 | nbufs = (int)args.len; | ||
309 | ninodes = args.mp->m_sb.sb_inopblock; | ||
310 | } else { | ||
311 | blks_per_cluster = XFS_INODE_CLUSTER_SIZE(args.mp) / | ||
312 | args.mp->m_sb.sb_blocksize; | ||
313 | nbufs = (int)args.len / blks_per_cluster; | ||
314 | ninodes = blks_per_cluster * args.mp->m_sb.sb_inopblock; | ||
315 | } | ||
316 | /* | ||
317 | * Figure out what version number to use in the inodes we create. | ||
318 | * If the superblock version has caught up to the one that supports | ||
319 | * the new inode format, then use the new inode version. Otherwise | ||
320 | * use the old version so that old kernels will continue to be | ||
321 | * able to use the file system. | ||
322 | */ | ||
323 | if (xfs_sb_version_hasnlink(&args.mp->m_sb)) | ||
324 | version = 2; | ||
325 | else | ||
326 | version = 1; | ||
327 | 366 | ||
328 | /* | 367 | /* |
368 | * Stamp and write the inode buffers. | ||
369 | * | ||
329 | * Seed the new inode cluster with a random generation number. This | 370 | * Seed the new inode cluster with a random generation number. This |
330 | * prevents short-term reuse of generation numbers if a chunk is | 371 | * prevents short-term reuse of generation numbers if a chunk is |
331 | * freed and then immediately reallocated. We use random numbers | 372 | * freed and then immediately reallocated. We use random numbers |
332 | * rather than a linear progression to prevent the next generation | 373 | * rather than a linear progression to prevent the next generation |
333 | * number from being easily guessable. | 374 | * number from being easily guessable. |
334 | */ | 375 | */ |
335 | gen = random32(); | 376 | xfs_ialloc_inode_init(args.mp, tp, agno, args.agbno, args.len, |
336 | for (j = 0; j < nbufs; j++) { | 377 | random32()); |
337 | /* | ||
338 | * Get the block. | ||
339 | */ | ||
340 | d = XFS_AGB_TO_DADDR(args.mp, be32_to_cpu(agi->agi_seqno), | ||
341 | args.agbno + (j * blks_per_cluster)); | ||
342 | fbuf = xfs_trans_get_buf(tp, args.mp->m_ddev_targp, d, | ||
343 | args.mp->m_bsize * blks_per_cluster, | ||
344 | XFS_BUF_LOCK); | ||
345 | ASSERT(fbuf); | ||
346 | ASSERT(!XFS_BUF_GETERROR(fbuf)); | ||
347 | 378 | ||
348 | /* | 379 | /* |
349 | * Initialize all inodes in this buffer and then log them. | 380 | * Convert the results. |
350 | * | 381 | */ |
351 | * XXX: It would be much better if we had just one transaction to | 382 | newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0); |
352 | * log a whole cluster of inodes instead of all the individual | ||
353 | * transactions causing a lot of log traffic. | ||
354 | */ | ||
355 | xfs_biozero(fbuf, 0, ninodes << args.mp->m_sb.sb_inodelog); | ||
356 | for (i = 0; i < ninodes; i++) { | ||
357 | int ioffset = i << args.mp->m_sb.sb_inodelog; | ||
358 | uint isize = sizeof(struct xfs_dinode); | ||
359 | |||
360 | free = xfs_make_iptr(args.mp, fbuf, i); | ||
361 | free->di_magic = cpu_to_be16(XFS_DINODE_MAGIC); | ||
362 | free->di_version = version; | ||
363 | free->di_gen = cpu_to_be32(gen); | ||
364 | free->di_next_unlinked = cpu_to_be32(NULLAGINO); | ||
365 | xfs_trans_log_buf(tp, fbuf, ioffset, ioffset + isize - 1); | ||
366 | } | ||
367 | xfs_trans_inode_alloc_buf(tp, fbuf); | ||
368 | } | ||
369 | be32_add_cpu(&agi->agi_count, newlen); | 383 | be32_add_cpu(&agi->agi_count, newlen); |
370 | be32_add_cpu(&agi->agi_freecount, newlen); | 384 | be32_add_cpu(&agi->agi_freecount, newlen); |
371 | agno = be32_to_cpu(agi->agi_seqno); | ||
372 | down_read(&args.mp->m_peraglock); | 385 | down_read(&args.mp->m_peraglock); |
373 | args.mp->m_perag[agno].pagi_freecount += newlen; | 386 | args.mp->m_perag[agno].pagi_freecount += newlen; |
374 | up_read(&args.mp->m_peraglock); | 387 | up_read(&args.mp->m_peraglock); |
375 | agi->agi_newino = cpu_to_be32(newino); | 388 | agi->agi_newino = cpu_to_be32(newino); |
389 | |||
376 | /* | 390 | /* |
377 | * Insert records describing the new inode chunk into the btree. | 391 | * Insert records describing the new inode chunk into the btree. |
378 | */ | 392 | */ |
@@ -380,13 +394,17 @@ xfs_ialloc_ag_alloc( | |||
380 | for (thisino = newino; | 394 | for (thisino = newino; |
381 | thisino < newino + newlen; | 395 | thisino < newino + newlen; |
382 | thisino += XFS_INODES_PER_CHUNK) { | 396 | thisino += XFS_INODES_PER_CHUNK) { |
383 | if ((error = xfs_inobt_lookup_eq(cur, thisino, | 397 | cur->bc_rec.i.ir_startino = thisino; |
384 | XFS_INODES_PER_CHUNK, XFS_INOBT_ALL_FREE, &i))) { | 398 | cur->bc_rec.i.ir_freecount = XFS_INODES_PER_CHUNK; |
399 | cur->bc_rec.i.ir_free = XFS_INOBT_ALL_FREE; | ||
400 | error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, &i); | ||
401 | if (error) { | ||
385 | xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); | 402 | xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); |
386 | return error; | 403 | return error; |
387 | } | 404 | } |
388 | ASSERT(i == 0); | 405 | ASSERT(i == 0); |
389 | if ((error = xfs_btree_insert(cur, &i))) { | 406 | error = xfs_btree_insert(cur, &i); |
407 | if (error) { | ||
390 | xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); | 408 | xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); |
391 | return error; | 409 | return error; |
392 | } | 410 | } |
@@ -539,6 +557,62 @@ nextag: | |||
539 | } | 557 | } |
540 | 558 | ||
541 | /* | 559 | /* |
560 | * Try to retrieve the next record to the left/right from the current one. | ||
561 | */ | ||
562 | STATIC int | ||
563 | xfs_ialloc_next_rec( | ||
564 | struct xfs_btree_cur *cur, | ||
565 | xfs_inobt_rec_incore_t *rec, | ||
566 | int *done, | ||
567 | int left) | ||
568 | { | ||
569 | int error; | ||
570 | int i; | ||
571 | |||
572 | if (left) | ||
573 | error = xfs_btree_decrement(cur, 0, &i); | ||
574 | else | ||
575 | error = xfs_btree_increment(cur, 0, &i); | ||
576 | |||
577 | if (error) | ||
578 | return error; | ||
579 | *done = !i; | ||
580 | if (i) { | ||
581 | error = xfs_inobt_get_rec(cur, rec, &i); | ||
582 | if (error) | ||
583 | return error; | ||
584 | XFS_WANT_CORRUPTED_RETURN(i == 1); | ||
585 | } | ||
586 | |||
587 | return 0; | ||
588 | } | ||
589 | |||
590 | STATIC int | ||
591 | xfs_ialloc_get_rec( | ||
592 | struct xfs_btree_cur *cur, | ||
593 | xfs_agino_t agino, | ||
594 | xfs_inobt_rec_incore_t *rec, | ||
595 | int *done, | ||
596 | int left) | ||
597 | { | ||
598 | int error; | ||
599 | int i; | ||
600 | |||
601 | error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_EQ, &i); | ||
602 | if (error) | ||
603 | return error; | ||
604 | *done = !i; | ||
605 | if (i) { | ||
606 | error = xfs_inobt_get_rec(cur, rec, &i); | ||
607 | if (error) | ||
608 | return error; | ||
609 | XFS_WANT_CORRUPTED_RETURN(i == 1); | ||
610 | } | ||
611 | |||
612 | return 0; | ||
613 | } | ||
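These two helpers carry the rewritten near-parent search in xfs_dialloc() below: xfs_ialloc_next_rec() steps a cursor one record to the left or right, while xfs_ialloc_get_rec() re-seats a cursor on a remembered chunk so a search can resume where the previous one stopped. Their calling pattern, lifted from the hunk further down for orientation:

	/* resume from the per-AG cache of the last search frontier */
	error = xfs_ialloc_get_rec(tcur, pag->pagl_leftrec, &trec, &doneleft, 1);

	/* or step outward one chunk at a time */
	error = xfs_ialloc_next_rec(tcur, &trec, &doneleft, 1);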
614 | |||
615 | /* | ||
542 | * Visible inode allocation functions. | 616 | * Visible inode allocation functions. |
543 | */ | 617 | */ |
544 | 618 | ||
@@ -592,8 +666,8 @@ xfs_dialloc( | |||
592 | int j; /* result code */ | 666 | int j; /* result code */ |
593 | xfs_mount_t *mp; /* file system mount structure */ | 667 | xfs_mount_t *mp; /* file system mount structure */ |
594 | int offset; /* index of inode in chunk */ | 668 | int offset; /* index of inode in chunk */ |
595 | xfs_agino_t pagino; /* parent's a.g. relative inode # */ | 669 | xfs_agino_t pagino; /* parent's AG relative inode # */ |
596 | xfs_agnumber_t pagno; /* parent's allocation group number */ | 670 | xfs_agnumber_t pagno; /* parent's AG number */ |
597 | xfs_inobt_rec_incore_t rec; /* inode allocation record */ | 671 | xfs_inobt_rec_incore_t rec; /* inode allocation record */ |
598 | xfs_agnumber_t tagno; /* testing allocation group number */ | 672 | xfs_agnumber_t tagno; /* testing allocation group number */ |
599 | xfs_btree_cur_t *tcur; /* temp cursor */ | 673 | xfs_btree_cur_t *tcur; /* temp cursor */ |
@@ -716,6 +790,8 @@ nextag: | |||
716 | */ | 790 | */ |
717 | agno = tagno; | 791 | agno = tagno; |
718 | *IO_agbp = NULL; | 792 | *IO_agbp = NULL; |
793 | |||
794 | restart_pagno: | ||
719 | cur = xfs_inobt_init_cursor(mp, tp, agbp, be32_to_cpu(agi->agi_seqno)); | 795 | cur = xfs_inobt_init_cursor(mp, tp, agbp, be32_to_cpu(agi->agi_seqno)); |
720 | /* | 796 | /* |
721 | * If pagino is 0 (this is the root inode allocation) use newino. | 797 | * If pagino is 0 (this is the root inode allocation) use newino. |
@@ -723,220 +799,199 @@ nextag: | |||
723 | */ | 799 | */ |
724 | if (!pagino) | 800 | if (!pagino) |
725 | pagino = be32_to_cpu(agi->agi_newino); | 801 | pagino = be32_to_cpu(agi->agi_newino); |
726 | #ifdef DEBUG | ||
727 | if (cur->bc_nlevels == 1) { | ||
728 | int freecount = 0; | ||
729 | 802 | ||
730 | if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i))) | 803 | error = xfs_check_agi_freecount(cur, agi); |
731 | goto error0; | 804 | if (error) |
732 | XFS_WANT_CORRUPTED_GOTO(i == 1, error0); | 805 | goto error0; |
733 | do { | ||
734 | if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino, | ||
735 | &rec.ir_freecount, &rec.ir_free, &i))) | ||
736 | goto error0; | ||
737 | XFS_WANT_CORRUPTED_GOTO(i == 1, error0); | ||
738 | freecount += rec.ir_freecount; | ||
739 | if ((error = xfs_btree_increment(cur, 0, &i))) | ||
740 | goto error0; | ||
741 | } while (i == 1); | ||
742 | 806 | ||
743 | ASSERT(freecount == be32_to_cpu(agi->agi_freecount) || | ||
744 | XFS_FORCED_SHUTDOWN(mp)); | ||
745 | } | ||
746 | #endif | ||
747 | /* | 807 | /* |
748 | * If in the same a.g. as the parent, try to get near the parent. | 808 | * If in the same AG as the parent, try to get near the parent. |
749 | */ | 809 | */ |
750 | if (pagno == agno) { | 810 | if (pagno == agno) { |
751 | if ((error = xfs_inobt_lookup_le(cur, pagino, 0, 0, &i))) | 811 | xfs_perag_t *pag = &mp->m_perag[agno]; |
812 | int doneleft; /* done, to the left */ | ||
813 | int doneright; /* done, to the right */ | ||
814 | int searchdistance = 10; | ||
815 | |||
816 | error = xfs_inobt_lookup(cur, pagino, XFS_LOOKUP_LE, &i); | ||
817 | if (error) | ||
818 | goto error0; | ||
819 | XFS_WANT_CORRUPTED_GOTO(i == 1, error0); | ||
820 | |||
821 | error = xfs_inobt_get_rec(cur, &rec, &j); | ||
822 | if (error) | ||
752 | goto error0; | 823 | goto error0; |
753 | if (i != 0 && | 824 | XFS_WANT_CORRUPTED_GOTO(i == 1, error0); |
754 | (error = xfs_inobt_get_rec(cur, &rec.ir_startino, | 825 | |
755 | &rec.ir_freecount, &rec.ir_free, &j)) == 0 && | 826 | if (rec.ir_freecount > 0) { |
756 | j == 1 && | ||
757 | rec.ir_freecount > 0) { | ||
758 | /* | 827 | /* |
759 | * Found a free inode in the same chunk | 828 | * Found a free inode in the same chunk |
760 | * as parent, done. | 829 | * as the parent, done. |
761 | */ | 830 | */ |
831 | goto alloc_inode; | ||
762 | } | 832 | } |
833 | |||
834 | |||
835 | /* | ||
836 | * In the same AG as parent, but parent's chunk is full. | ||
837 | */ | ||
838 | |||
839 | /* duplicate the cursor, search left & right simultaneously */ | ||
840 | error = xfs_btree_dup_cursor(cur, &tcur); | ||
841 | if (error) | ||
842 | goto error0; | ||
843 | |||
763 | /* | 844 | /* |
764 | * In the same a.g. as parent, but parent's chunk is full. | 845 | * Skip to the blocks last looked up if this is the same parent inode. |
765 | */ | 846 | */ |
766 | else { | 847 | if (pagino != NULLAGINO && |
767 | int doneleft; /* done, to the left */ | 848 | pag->pagl_pagino == pagino && |
768 | int doneright; /* done, to the right */ | 849 | pag->pagl_leftrec != NULLAGINO && |
850 | pag->pagl_rightrec != NULLAGINO) { | ||
851 | error = xfs_ialloc_get_rec(tcur, pag->pagl_leftrec, | ||
852 | &trec, &doneleft, 1); | ||
853 | if (error) | ||
854 | goto error1; | ||
769 | 855 | ||
856 | error = xfs_ialloc_get_rec(cur, pag->pagl_rightrec, | ||
857 | &rec, &doneright, 0); | ||
770 | if (error) | 858 | if (error) |
771 | goto error0; | ||
772 | ASSERT(i == 1); | ||
773 | ASSERT(j == 1); | ||
774 | /* | ||
775 | * Duplicate the cursor, search left & right | ||
776 | * simultaneously. | ||
777 | */ | ||
778 | if ((error = xfs_btree_dup_cursor(cur, &tcur))) | ||
779 | goto error0; | ||
780 | /* | ||
781 | * Search left with tcur, back up 1 record. | ||
782 | */ | ||
783 | if ((error = xfs_btree_decrement(tcur, 0, &i))) | ||
784 | goto error1; | 859 | goto error1; |
785 | doneleft = !i; | 860 | } else { |
786 | if (!doneleft) { | 861 | /* search left with tcur, back up 1 record */ |
787 | if ((error = xfs_inobt_get_rec(tcur, | 862 | error = xfs_ialloc_next_rec(tcur, &trec, &doneleft, 1); |
788 | &trec.ir_startino, | 863 | if (error) |
789 | &trec.ir_freecount, | ||
790 | &trec.ir_free, &i))) | ||
791 | goto error1; | ||
792 | XFS_WANT_CORRUPTED_GOTO(i == 1, error1); | ||
793 | } | ||
794 | /* | ||
795 | * Search right with cur, go forward 1 record. | ||
796 | */ | ||
797 | if ((error = xfs_btree_increment(cur, 0, &i))) | ||
798 | goto error1; | 864 | goto error1; |
799 | doneright = !i; | ||
800 | if (!doneright) { | ||
801 | if ((error = xfs_inobt_get_rec(cur, | ||
802 | &rec.ir_startino, | ||
803 | &rec.ir_freecount, | ||
804 | &rec.ir_free, &i))) | ||
805 | goto error1; | ||
806 | XFS_WANT_CORRUPTED_GOTO(i == 1, error1); | ||
807 | } | ||
808 | /* | ||
809 | * Loop until we find the closest inode chunk | ||
810 | * with a free one. | ||
811 | */ | ||
812 | while (!doneleft || !doneright) { | ||
813 | int useleft; /* using left inode | ||
814 | chunk this time */ | ||
815 | 865 | ||
866 | /* search right with cur, go forward 1 record. */ | ||
867 | error = xfs_ialloc_next_rec(cur, &rec, &doneright, 0); | ||
868 | if (error) | ||
869 | goto error1; | ||
870 | } | ||
871 | |||
872 | /* | ||
873 | * Loop until we find an inode chunk with a free inode. | ||
874 | */ | ||
875 | while (!doneleft || !doneright) { | ||
876 | int useleft; /* using left inode chunk this time */ | ||
877 | |||
878 | if (!--searchdistance) { | ||
816 | /* | 879 | /* |
817 | * Figure out which block is closer, | 880 | * Not in range - save last search |
818 | * if both are valid. | 881 | * location and allocate a new inode |
819 | */ | ||
820 | if (!doneleft && !doneright) | ||
821 | useleft = | ||
822 | pagino - | ||
823 | (trec.ir_startino + | ||
824 | XFS_INODES_PER_CHUNK - 1) < | ||
825 | rec.ir_startino - pagino; | ||
826 | else | ||
827 | useleft = !doneleft; | ||
828 | /* | ||
829 | * If checking the left, does it have | ||
830 | * free inodes? | ||
831 | */ | ||
832 | if (useleft && trec.ir_freecount) { | ||
833 | /* | ||
834 | * Yes, set it up as the chunk to use. | ||
835 | */ | ||
836 | rec = trec; | ||
837 | xfs_btree_del_cursor(cur, | ||
838 | XFS_BTREE_NOERROR); | ||
839 | cur = tcur; | ||
840 | break; | ||
841 | } | ||
842 | /* | ||
843 | * If checking the right, does it have | ||
844 | * free inodes? | ||
845 | */ | ||
846 | if (!useleft && rec.ir_freecount) { | ||
847 | /* | ||
848 | * Yes, it's already set up. | ||
849 | */ | ||
850 | xfs_btree_del_cursor(tcur, | ||
851 | XFS_BTREE_NOERROR); | ||
852 | break; | ||
853 | } | ||
854 | /* | ||
855 | * If used the left, get another one | ||
856 | * further left. | ||
857 | */ | ||
858 | if (useleft) { | ||
859 | if ((error = xfs_btree_decrement(tcur, 0, | ||
860 | &i))) | ||
861 | goto error1; | ||
862 | doneleft = !i; | ||
863 | if (!doneleft) { | ||
864 | if ((error = xfs_inobt_get_rec( | ||
865 | tcur, | ||
866 | &trec.ir_startino, | ||
867 | &trec.ir_freecount, | ||
868 | &trec.ir_free, &i))) | ||
869 | goto error1; | ||
870 | XFS_WANT_CORRUPTED_GOTO(i == 1, | ||
871 | error1); | ||
872 | } | ||
873 | } | ||
874 | /* | ||
875 | * If used the right, get another one | ||
876 | * further right. | ||
877 | */ | 882 | */ |
878 | else { | 883 | pag->pagl_leftrec = trec.ir_startino; |
879 | if ((error = xfs_btree_increment(cur, 0, | 884 | pag->pagl_rightrec = rec.ir_startino; |
880 | &i))) | 885 | pag->pagl_pagino = pagino; |
881 | goto error1; | 886 | goto newino; |
882 | doneright = !i; | 887 | } |
883 | if (!doneright) { | 888 | |
884 | if ((error = xfs_inobt_get_rec( | 889 | /* figure out the closer block if both are valid. */ |
885 | cur, | 890 | if (!doneleft && !doneright) { |
886 | &rec.ir_startino, | 891 | useleft = pagino - |
887 | &rec.ir_freecount, | 892 | (trec.ir_startino + XFS_INODES_PER_CHUNK - 1) < |
888 | &rec.ir_free, &i))) | 893 | rec.ir_startino - pagino; |
889 | goto error1; | 894 | } else { |
890 | XFS_WANT_CORRUPTED_GOTO(i == 1, | 895 | useleft = !doneleft; |
891 | error1); | ||
892 | } | ||
893 | } | ||
894 | } | 896 | } |
895 | ASSERT(!doneleft || !doneright); | 897 | |
898 | /* free inodes to the left? */ | ||
899 | if (useleft && trec.ir_freecount) { | ||
900 | rec = trec; | ||
901 | xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); | ||
902 | cur = tcur; | ||
903 | |||
904 | pag->pagl_leftrec = trec.ir_startino; | ||
905 | pag->pagl_rightrec = rec.ir_startino; | ||
906 | pag->pagl_pagino = pagino; | ||
907 | goto alloc_inode; | ||
908 | } | ||
909 | |||
910 | /* free inodes to the right? */ | ||
911 | if (!useleft && rec.ir_freecount) { | ||
912 | xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); | ||
913 | |||
914 | pag->pagl_leftrec = trec.ir_startino; | ||
915 | pag->pagl_rightrec = rec.ir_startino; | ||
916 | pag->pagl_pagino = pagino; | ||
917 | goto alloc_inode; | ||
918 | } | ||
919 | |||
920 | /* get next record to check */ | ||
921 | if (useleft) { | ||
922 | error = xfs_ialloc_next_rec(tcur, &trec, | ||
923 | &doneleft, 1); | ||
924 | } else { | ||
925 | error = xfs_ialloc_next_rec(cur, &rec, | ||
926 | &doneright, 0); | ||
927 | } | ||
928 | if (error) | ||
929 | goto error1; | ||
896 | } | 930 | } |
931 | |||
932 | /* | ||
933 | * We've reached the end of the btree. Because we | ||
934 | * only search a small chunk of the btree on each | ||
935 | * pass, there must be free inodes closer to the | ||
936 | * parent inode than where we are now. Restart the | ||
937 | * search. | ||
938 | */ | ||
939 | pag->pagl_pagino = NULLAGINO; | ||
940 | pag->pagl_leftrec = NULLAGINO; | ||
941 | pag->pagl_rightrec = NULLAGINO; | ||
942 | xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR); | ||
943 | xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); | ||
944 | goto restart_pagno; | ||
897 | } | 945 | } |
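The rewritten block above caps the outward scan at ten records per attempt and, when the budget runs out, caches the search frontier in pag->pagl_leftrec, pag->pagl_rightrec and pag->pagl_pagino so the next allocation for the same parent resumes there rather than rescanning. The left/right tie-break itself is just an unsigned distance comparison; restated as self-contained userspace C (XFS_INODES_PER_CHUNK is 64; the function name is illustrative):

	#include <stdint.h>

	#define INODES_PER_CHUNK 64	/* stands in for XFS_INODES_PER_CHUNK */

	/* Nonzero when the left chunk's last inode lies nearer to parent than
	 * the right chunk's first inode; mirrors the useleft computation. */
	static int use_left_chunk(uint32_t left_start, uint32_t right_start,
				  uint32_t parent)
	{
		return parent - (left_start + INODES_PER_CHUNK - 1) <
		       right_start - parent;
	}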
946 | |||
898 | /* | 947 | /* |
899 | * In a different a.g. from the parent. | 948 | * In a different AG from the parent. |
900 | * See if the most recently allocated block has any free. | 949 | * See if the most recently allocated block has any free. |
901 | */ | 950 | */ |
902 | else if (be32_to_cpu(agi->agi_newino) != NULLAGINO) { | 951 | newino: |
903 | if ((error = xfs_inobt_lookup_eq(cur, | 952 | if (be32_to_cpu(agi->agi_newino) != NULLAGINO) { |
904 | be32_to_cpu(agi->agi_newino), 0, 0, &i))) | 953 | error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino), |
954 | XFS_LOOKUP_EQ, &i); | ||
955 | if (error) | ||
905 | goto error0; | 956 | goto error0; |
906 | if (i == 1 && | 957 | |
907 | (error = xfs_inobt_get_rec(cur, &rec.ir_startino, | 958 | if (i == 1) { |
908 | &rec.ir_freecount, &rec.ir_free, &j)) == 0 && | 959 | error = xfs_inobt_get_rec(cur, &rec, &j); |
909 | j == 1 && | ||
910 | rec.ir_freecount > 0) { | ||
911 | /* | ||
912 | * The last chunk allocated in the group still has | ||
913 | * a free inode. | ||
914 | */ | ||
915 | } | ||
916 | /* | ||
917 | * None left in the last group, search the whole a.g. | ||
918 | */ | ||
919 | else { | ||
920 | if (error) | 960 | if (error) |
921 | goto error0; | 961 | goto error0; |
922 | if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i))) | 962 | |
923 | goto error0; | 963 | if (j == 1 && rec.ir_freecount > 0) { |
924 | ASSERT(i == 1); | 964 | /* |
925 | for (;;) { | 965 | * The last chunk allocated in the group |
926 | if ((error = xfs_inobt_get_rec(cur, | 966 | * still has a free inode. |
927 | &rec.ir_startino, | 967 | */ |
928 | &rec.ir_freecount, &rec.ir_free, | 968 | goto alloc_inode; |
929 | &i))) | ||
930 | goto error0; | ||
931 | XFS_WANT_CORRUPTED_GOTO(i == 1, error0); | ||
932 | if (rec.ir_freecount > 0) | ||
933 | break; | ||
934 | if ((error = xfs_btree_increment(cur, 0, &i))) | ||
935 | goto error0; | ||
936 | XFS_WANT_CORRUPTED_GOTO(i == 1, error0); | ||
937 | } | 969 | } |
938 | } | 970 | } |
939 | } | 971 | } |
972 | |||
973 | /* | ||
974 | * None left in the last group, search the whole AG | ||
975 | */ | ||
976 | error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &i); | ||
977 | if (error) | ||
978 | goto error0; | ||
979 | XFS_WANT_CORRUPTED_GOTO(i == 1, error0); | ||
980 | |||
981 | for (;;) { | ||
982 | error = xfs_inobt_get_rec(cur, &rec, &i); | ||
983 | if (error) | ||
984 | goto error0; | ||
985 | XFS_WANT_CORRUPTED_GOTO(i == 1, error0); | ||
986 | if (rec.ir_freecount > 0) | ||
987 | break; | ||
988 | error = xfs_btree_increment(cur, 0, &i); | ||
989 | if (error) | ||
990 | goto error0; | ||
991 | XFS_WANT_CORRUPTED_GOTO(i == 1, error0); | ||
992 | } | ||
993 | |||
994 | alloc_inode: | ||
940 | offset = xfs_ialloc_find_free(&rec.ir_free); | 995 | offset = xfs_ialloc_find_free(&rec.ir_free); |
941 | ASSERT(offset >= 0); | 996 | ASSERT(offset >= 0); |
942 | ASSERT(offset < XFS_INODES_PER_CHUNK); | 997 | ASSERT(offset < XFS_INODES_PER_CHUNK); |
@@ -945,33 +1000,19 @@ nextag: | |||
945 | ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset); | 1000 | ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino + offset); |
946 | rec.ir_free &= ~XFS_INOBT_MASK(offset); | 1001 | rec.ir_free &= ~XFS_INOBT_MASK(offset); |
947 | rec.ir_freecount--; | 1002 | rec.ir_freecount--; |
948 | if ((error = xfs_inobt_update(cur, rec.ir_startino, rec.ir_freecount, | 1003 | error = xfs_inobt_update(cur, &rec); |
949 | rec.ir_free))) | 1004 | if (error) |
950 | goto error0; | 1005 | goto error0; |
951 | be32_add_cpu(&agi->agi_freecount, -1); | 1006 | be32_add_cpu(&agi->agi_freecount, -1); |
952 | xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT); | 1007 | xfs_ialloc_log_agi(tp, agbp, XFS_AGI_FREECOUNT); |
953 | down_read(&mp->m_peraglock); | 1008 | down_read(&mp->m_peraglock); |
954 | mp->m_perag[tagno].pagi_freecount--; | 1009 | mp->m_perag[tagno].pagi_freecount--; |
955 | up_read(&mp->m_peraglock); | 1010 | up_read(&mp->m_peraglock); |
956 | #ifdef DEBUG | ||
957 | if (cur->bc_nlevels == 1) { | ||
958 | int freecount = 0; | ||
959 | 1011 | ||
960 | if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i))) | 1012 | error = xfs_check_agi_freecount(cur, agi); |
961 | goto error0; | 1013 | if (error) |
962 | do { | 1014 | goto error0; |
963 | if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino, | 1015 | |
964 | &rec.ir_freecount, &rec.ir_free, &i))) | ||
965 | goto error0; | ||
966 | XFS_WANT_CORRUPTED_GOTO(i == 1, error0); | ||
967 | freecount += rec.ir_freecount; | ||
968 | if ((error = xfs_btree_increment(cur, 0, &i))) | ||
969 | goto error0; | ||
970 | } while (i == 1); | ||
971 | ASSERT(freecount == be32_to_cpu(agi->agi_freecount) || | ||
972 | XFS_FORCED_SHUTDOWN(mp)); | ||
973 | } | ||
974 | #endif | ||
975 | xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); | 1016 | xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); |
976 | xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1); | 1017 | xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, -1); |
977 | *inop = ino; | 1018 | *inop = ino; |
@@ -1062,38 +1103,23 @@ xfs_difree( | |||
1062 | * Initialize the cursor. | 1103 | * Initialize the cursor. |
1063 | */ | 1104 | */ |
1064 | cur = xfs_inobt_init_cursor(mp, tp, agbp, agno); | 1105 | cur = xfs_inobt_init_cursor(mp, tp, agbp, agno); |
1065 | #ifdef DEBUG | ||
1066 | if (cur->bc_nlevels == 1) { | ||
1067 | int freecount = 0; | ||
1068 | 1106 | ||
1069 | if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i))) | 1107 | error = xfs_check_agi_freecount(cur, agi); |
1070 | goto error0; | 1108 | if (error) |
1071 | do { | 1109 | goto error0; |
1072 | if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino, | 1110 | |
1073 | &rec.ir_freecount, &rec.ir_free, &i))) | ||
1074 | goto error0; | ||
1075 | if (i) { | ||
1076 | freecount += rec.ir_freecount; | ||
1077 | if ((error = xfs_btree_increment(cur, 0, &i))) | ||
1078 | goto error0; | ||
1079 | } | ||
1080 | } while (i == 1); | ||
1081 | ASSERT(freecount == be32_to_cpu(agi->agi_freecount) || | ||
1082 | XFS_FORCED_SHUTDOWN(mp)); | ||
1083 | } | ||
1084 | #endif | ||
1085 | /* | 1111 | /* |
1086 | * Look for the entry describing this inode. | 1112 | * Look for the entry describing this inode. |
1087 | */ | 1113 | */ |
1088 | if ((error = xfs_inobt_lookup_le(cur, agino, 0, 0, &i))) { | 1114 | if ((error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i))) { |
1089 | cmn_err(CE_WARN, | 1115 | cmn_err(CE_WARN, |
1090 | "xfs_difree: xfs_inobt_lookup_le returned() an error %d on %s. Returning error.", | 1116 | "xfs_difree: xfs_inobt_lookup returned() an error %d on %s. Returning error.", |
1091 | error, mp->m_fsname); | 1117 | error, mp->m_fsname); |
1092 | goto error0; | 1118 | goto error0; |
1093 | } | 1119 | } |
1094 | XFS_WANT_CORRUPTED_GOTO(i == 1, error0); | 1120 | XFS_WANT_CORRUPTED_GOTO(i == 1, error0); |
1095 | if ((error = xfs_inobt_get_rec(cur, &rec.ir_startino, &rec.ir_freecount, | 1121 | error = xfs_inobt_get_rec(cur, &rec, &i); |
1096 | &rec.ir_free, &i))) { | 1122 | if (error) { |
1097 | cmn_err(CE_WARN, | 1123 | cmn_err(CE_WARN, |
1098 | "xfs_difree: xfs_inobt_get_rec() returned an error %d on %s. Returning error.", | 1124 | "xfs_difree: xfs_inobt_get_rec() returned an error %d on %s. Returning error.", |
1099 | error, mp->m_fsname); | 1125 | error, mp->m_fsname); |
@@ -1148,12 +1174,14 @@ xfs_difree( | |||
1148 | } else { | 1174 | } else { |
1149 | *delete = 0; | 1175 | *delete = 0; |
1150 | 1176 | ||
1151 | if ((error = xfs_inobt_update(cur, rec.ir_startino, rec.ir_freecount, rec.ir_free))) { | 1177 | error = xfs_inobt_update(cur, &rec); |
1178 | if (error) { | ||
1152 | cmn_err(CE_WARN, | 1179 | cmn_err(CE_WARN, |
1153 | "xfs_difree: xfs_inobt_update() returned an error %d on %s. Returning error.", | 1180 | "xfs_difree: xfs_inobt_update returned an error %d on %s.", |
1154 | error, mp->m_fsname); | 1181 | error, mp->m_fsname); |
1155 | goto error0; | 1182 | goto error0; |
1156 | } | 1183 | } |
1184 | |||
1157 | /* | 1185 | /* |
1158 | * Change the inode free counts and log the ag/sb changes. | 1186 | * Change the inode free counts and log the ag/sb changes. |
1159 | */ | 1187 | */ |
@@ -1165,28 +1193,10 @@ xfs_difree( | |||
1165 | xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1); | 1193 | xfs_trans_mod_sb(tp, XFS_TRANS_SB_IFREE, 1); |
1166 | } | 1194 | } |
1167 | 1195 | ||
1168 | #ifdef DEBUG | 1196 | error = xfs_check_agi_freecount(cur, agi); |
1169 | if (cur->bc_nlevels == 1) { | 1197 | if (error) |
1170 | int freecount = 0; | 1198 | goto error0; |
1171 | 1199 | ||
1172 | if ((error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &i))) | ||
1173 | goto error0; | ||
1174 | do { | ||
1175 | if ((error = xfs_inobt_get_rec(cur, | ||
1176 | &rec.ir_startino, | ||
1177 | &rec.ir_freecount, | ||
1178 | &rec.ir_free, &i))) | ||
1179 | goto error0; | ||
1180 | if (i) { | ||
1181 | freecount += rec.ir_freecount; | ||
1182 | if ((error = xfs_btree_increment(cur, 0, &i))) | ||
1183 | goto error0; | ||
1184 | } | ||
1185 | } while (i == 1); | ||
1186 | ASSERT(freecount == be32_to_cpu(agi->agi_freecount) || | ||
1187 | XFS_FORCED_SHUTDOWN(mp)); | ||
1188 | } | ||
1189 | #endif | ||
1190 | xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); | 1200 | xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); |
1191 | return 0; | 1201 | return 0; |
1192 | 1202 | ||
@@ -1297,9 +1307,7 @@ xfs_imap( | |||
1297 | chunk_agbno = agbno - offset_agbno; | 1307 | chunk_agbno = agbno - offset_agbno; |
1298 | } else { | 1308 | } else { |
1299 | xfs_btree_cur_t *cur; /* inode btree cursor */ | 1309 | xfs_btree_cur_t *cur; /* inode btree cursor */ |
1300 | xfs_agino_t chunk_agino; /* first agino in inode chunk */ | 1310 | xfs_inobt_rec_incore_t chunk_rec; |
1301 | __int32_t chunk_cnt; /* count of free inodes in chunk */ | ||
1302 | xfs_inofree_t chunk_free; /* mask of free inodes in chunk */ | ||
1303 | xfs_buf_t *agbp; /* agi buffer */ | 1311 | xfs_buf_t *agbp; /* agi buffer */ |
1304 | int i; /* temp state */ | 1312 | int i; /* temp state */ |
1305 | 1313 | ||
@@ -1315,15 +1323,14 @@ xfs_imap( | |||
1315 | } | 1323 | } |
1316 | 1324 | ||
1317 | cur = xfs_inobt_init_cursor(mp, tp, agbp, agno); | 1325 | cur = xfs_inobt_init_cursor(mp, tp, agbp, agno); |
1318 | error = xfs_inobt_lookup_le(cur, agino, 0, 0, &i); | 1326 | error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, &i); |
1319 | if (error) { | 1327 | if (error) { |
1320 | xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: " | 1328 | xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: " |
1321 | "xfs_inobt_lookup_le() failed"); | 1329 | "xfs_inobt_lookup() failed"); |
1322 | goto error0; | 1330 | goto error0; |
1323 | } | 1331 | } |
1324 | 1332 | ||
1325 | error = xfs_inobt_get_rec(cur, &chunk_agino, &chunk_cnt, | 1333 | error = xfs_inobt_get_rec(cur, &chunk_rec, &i); |
1326 | &chunk_free, &i); | ||
1327 | if (error) { | 1334 | if (error) { |
1328 | xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: " | 1335 | xfs_fs_cmn_err(CE_ALERT, mp, "xfs_imap: " |
1329 | "xfs_inobt_get_rec() failed"); | 1336 | "xfs_inobt_get_rec() failed"); |
@@ -1341,7 +1348,7 @@ xfs_imap( | |||
1341 | xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); | 1348 | xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); |
1342 | if (error) | 1349 | if (error) |
1343 | return error; | 1350 | return error; |
1344 | chunk_agbno = XFS_AGINO_TO_AGBNO(mp, chunk_agino); | 1351 | chunk_agbno = XFS_AGINO_TO_AGBNO(mp, chunk_rec.ir_startino); |
1345 | offset_agbno = agbno - chunk_agbno; | 1352 | offset_agbno = agbno - chunk_agbno; |
1346 | } | 1353 | } |
1347 | 1354 | ||
diff --git a/fs/xfs/xfs_ialloc.h b/fs/xfs/xfs_ialloc.h index aeee8278f92c..bb5385475e1f 100644 --- a/fs/xfs/xfs_ialloc.h +++ b/fs/xfs/xfs_ialloc.h | |||
@@ -150,23 +150,15 @@ xfs_ialloc_pagi_init( | |||
150 | xfs_agnumber_t agno); /* allocation group number */ | 150 | xfs_agnumber_t agno); /* allocation group number */ |
151 | 151 | ||
152 | /* | 152 | /* |
153 | * Lookup the first record greater than or equal to ino | 153 | * Lookup a record by ino in the btree given by cur. |
154 | * in the btree given by cur. | ||
155 | */ | 154 | */ |
156 | int xfs_inobt_lookup_ge(struct xfs_btree_cur *cur, xfs_agino_t ino, | 155 | int xfs_inobt_lookup(struct xfs_btree_cur *cur, xfs_agino_t ino, |
157 | __int32_t fcnt, xfs_inofree_t free, int *stat); | 156 | xfs_lookup_t dir, int *stat); |
158 | |||
159 | /* | ||
160 | * Lookup the first record less than or equal to ino | ||
161 | * in the btree given by cur. | ||
162 | */ | ||
163 | int xfs_inobt_lookup_le(struct xfs_btree_cur *cur, xfs_agino_t ino, | ||
164 | __int32_t fcnt, xfs_inofree_t free, int *stat); | ||
165 | 157 | ||
166 | /* | 158 | /* |
167 | * Get the data from the pointed-to record. | 159 | * Get the data from the pointed-to record. |
168 | */ | 160 | */ |
169 | extern int xfs_inobt_get_rec(struct xfs_btree_cur *cur, xfs_agino_t *ino, | 161 | extern int xfs_inobt_get_rec(struct xfs_btree_cur *cur, |
170 | __int32_t *fcnt, xfs_inofree_t *free, int *stat); | 162 | xfs_inobt_rec_incore_t *rec, int *stat); |
171 | 163 | ||
172 | #endif /* __XFS_IALLOC_H__ */ | 164 | #endif /* __XFS_IALLOC_H__ */ |
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index 5fcec6f020a7..80e526489be5 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c | |||
@@ -64,6 +64,10 @@ xfs_inode_alloc( | |||
64 | ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP); | 64 | ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP); |
65 | if (!ip) | 65 | if (!ip) |
66 | return NULL; | 66 | return NULL; |
67 | if (inode_init_always(mp->m_super, VFS_I(ip))) { | ||
68 | kmem_zone_free(xfs_inode_zone, ip); | ||
69 | return NULL; | ||
70 | } | ||
67 | 71 | ||
68 | ASSERT(atomic_read(&ip->i_iocount) == 0); | 72 | ASSERT(atomic_read(&ip->i_iocount) == 0); |
69 | ASSERT(atomic_read(&ip->i_pincount) == 0); | 73 | ASSERT(atomic_read(&ip->i_pincount) == 0); |
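Calling inode_init_always() at the very top of xfs_inode_alloc() means a failure occurs before any XFS-side state exists, so the unwind is a single kmem_zone_free(); this reordering is also what allows the patch to retire the old xfs_destroy_inode()/make_bad_inode() helper further down. The resulting allocation skeleton:

	ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP);
	if (!ip)
		return NULL;
	if (inode_init_always(mp->m_super, VFS_I(ip))) {	/* VFS part first */
		kmem_zone_free(xfs_inode_zone, ip);		/* trivial unwind */
		return NULL;
	}
	/* ...XFS-side initialisation follows, as in the hunk above... */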
@@ -78,7 +82,6 @@ xfs_inode_alloc( | |||
78 | memset(&ip->i_df, 0, sizeof(xfs_ifork_t)); | 82 | memset(&ip->i_df, 0, sizeof(xfs_ifork_t)); |
79 | ip->i_flags = 0; | 83 | ip->i_flags = 0; |
80 | ip->i_update_core = 0; | 84 | ip->i_update_core = 0; |
81 | ip->i_update_size = 0; | ||
82 | ip->i_delayed_blks = 0; | 85 | ip->i_delayed_blks = 0; |
83 | memset(&ip->i_d, 0, sizeof(xfs_icdinode_t)); | 86 | memset(&ip->i_d, 0, sizeof(xfs_icdinode_t)); |
84 | ip->i_size = 0; | 87 | ip->i_size = 0; |
@@ -105,17 +108,6 @@ xfs_inode_alloc( | |||
105 | #ifdef XFS_DIR2_TRACE | 108 | #ifdef XFS_DIR2_TRACE |
106 | ip->i_dir_trace = ktrace_alloc(XFS_DIR2_KTRACE_SIZE, KM_NOFS); | 109 | ip->i_dir_trace = ktrace_alloc(XFS_DIR2_KTRACE_SIZE, KM_NOFS); |
107 | #endif | 110 | #endif |
108 | /* | ||
109 | * Now initialise the VFS inode. We do this after the xfs_inode | ||
110 | * initialisation as internal failures will result in ->destroy_inode | ||
111 | * being called and that will pass down through the reclaim path and | ||
112 | * free the XFS inode. This path requires the XFS inode to already be | ||
113 | * initialised. Hence if this call fails, the xfs_inode has already | ||
114 | * been freed and we should not reference it at all in the error | ||
115 | * handling. | ||
116 | */ | ||
117 | if (!inode_init_always(mp->m_super, VFS_I(ip))) | ||
118 | return NULL; | ||
119 | 111 | ||
120 | /* prevent anyone from using this yet */ | 112 | /* prevent anyone from using this yet */ |
121 | VFS_I(ip)->i_state = I_NEW|I_LOCK; | 113 | VFS_I(ip)->i_state = I_NEW|I_LOCK; |
@@ -123,6 +115,71 @@ xfs_inode_alloc( | |||
123 | return ip; | 115 | return ip; |
124 | } | 116 | } |
125 | 117 | ||
118 | STATIC void | ||
119 | xfs_inode_free( | ||
120 | struct xfs_inode *ip) | ||
121 | { | ||
122 | switch (ip->i_d.di_mode & S_IFMT) { | ||
123 | case S_IFREG: | ||
124 | case S_IFDIR: | ||
125 | case S_IFLNK: | ||
126 | xfs_idestroy_fork(ip, XFS_DATA_FORK); | ||
127 | break; | ||
128 | } | ||
129 | |||
130 | if (ip->i_afp) | ||
131 | xfs_idestroy_fork(ip, XFS_ATTR_FORK); | ||
132 | |||
133 | #ifdef XFS_INODE_TRACE | ||
134 | ktrace_free(ip->i_trace); | ||
135 | #endif | ||
136 | #ifdef XFS_BMAP_TRACE | ||
137 | ktrace_free(ip->i_xtrace); | ||
138 | #endif | ||
139 | #ifdef XFS_BTREE_TRACE | ||
140 | ktrace_free(ip->i_btrace); | ||
141 | #endif | ||
142 | #ifdef XFS_RW_TRACE | ||
143 | ktrace_free(ip->i_rwtrace); | ||
144 | #endif | ||
145 | #ifdef XFS_ILOCK_TRACE | ||
146 | ktrace_free(ip->i_lock_trace); | ||
147 | #endif | ||
148 | #ifdef XFS_DIR2_TRACE | ||
149 | ktrace_free(ip->i_dir_trace); | ||
150 | #endif | ||
151 | |||
152 | if (ip->i_itemp) { | ||
153 | /* | ||
154 | * Only if we are shutting down the fs will we see an | ||
155 | * inode still in the AIL. If it is there, we should remove | ||
156 | * it to prevent a use-after-free from occurring. | ||
157 | */ | ||
158 | xfs_log_item_t *lip = &ip->i_itemp->ili_item; | ||
159 | struct xfs_ail *ailp = lip->li_ailp; | ||
160 | |||
161 | ASSERT(((lip->li_flags & XFS_LI_IN_AIL) == 0) || | ||
162 | XFS_FORCED_SHUTDOWN(ip->i_mount)); | ||
163 | if (lip->li_flags & XFS_LI_IN_AIL) { | ||
164 | spin_lock(&ailp->xa_lock); | ||
165 | if (lip->li_flags & XFS_LI_IN_AIL) | ||
166 | xfs_trans_ail_delete(ailp, lip); | ||
167 | else | ||
168 | spin_unlock(&ailp->xa_lock); | ||
169 | } | ||
170 | xfs_inode_item_destroy(ip); | ||
171 | ip->i_itemp = NULL; | ||
172 | } | ||
173 | |||
174 | /* asserts to verify all state is correct here */ | ||
175 | ASSERT(atomic_read(&ip->i_iocount) == 0); | ||
176 | ASSERT(atomic_read(&ip->i_pincount) == 0); | ||
177 | ASSERT(!spin_is_locked(&ip->i_flags_lock)); | ||
178 | ASSERT(completion_done(&ip->i_flush)); | ||
179 | |||
180 | kmem_zone_free(xfs_inode_zone, ip); | ||
181 | } | ||
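The AIL removal in the new xfs_inode_free() is a check/lock/re-check: li_flags is tested optimistically, then re-tested under xa_lock because another thread may pull the item off the AIL between the two tests; the asymmetric else-arm unlock reflects that xfs_trans_ail_delete() is entered with xa_lock held and releases it itself. The shape of the pattern in isolation:

	if (lip->li_flags & XFS_LI_IN_AIL) {		/* unlocked hint */
		spin_lock(&ailp->xa_lock);
		if (lip->li_flags & XFS_LI_IN_AIL)	/* re-check under the lock */
			xfs_trans_ail_delete(ailp, lip);	/* drops xa_lock */
		else
			spin_unlock(&ailp->xa_lock);
	}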
182 | |||
126 | /* | 183 | /* |
127 | * Check the validity of the inode we just found in the cache | 184 | * Check the validity of the inode we just found in the cache |
128 | */ | 185 | */ |
@@ -133,80 +190,82 @@ xfs_iget_cache_hit( | |||
133 | int flags, | 190 | int flags, |
134 | int lock_flags) __releases(pag->pag_ici_lock) | 191 | int lock_flags) __releases(pag->pag_ici_lock) |
135 | { | 192 | { |
193 | struct inode *inode = VFS_I(ip); | ||
136 | struct xfs_mount *mp = ip->i_mount; | 194 | struct xfs_mount *mp = ip->i_mount; |
137 | int error = EAGAIN; | 195 | int error; |
196 | |||
197 | spin_lock(&ip->i_flags_lock); | ||
138 | 198 | ||
139 | /* | 199 | /* |
140 | * If INEW is set this inode is being set up | 200 | * If we are racing with another cache hit that is currently |
141 | * If IRECLAIM is set this inode is being torn down | 201 | * instantiating this inode or currently recycling it out of |
142 | * Pause and try again. | 202 | * reclaimable state, wait for the initialisation to complete |
203 | * before continuing. | ||
204 | * | ||
205 | * XXX(hch): eventually we should do something equivalent to | ||
206 | * wait_on_inode to wait for these flags to be cleared | ||
207 | * instead of polling for them. | ||
143 | */ | 208 | */ |
144 | if (xfs_iflags_test(ip, (XFS_INEW|XFS_IRECLAIM))) { | 209 | if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) { |
145 | XFS_STATS_INC(xs_ig_frecycle); | 210 | XFS_STATS_INC(xs_ig_frecycle); |
211 | error = EAGAIN; | ||
146 | goto out_error; | 212 | goto out_error; |
147 | } | 213 | } |
148 | 214 | ||
149 | /* If IRECLAIMABLE is set, we've torn down the vfs inode part */ | 215 | /* |
150 | if (xfs_iflags_test(ip, XFS_IRECLAIMABLE)) { | 216 | * If lookup is racing with unlink return an error immediately. |
151 | 217 | */ | |
152 | /* | 218 | if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) { |
153 | * If lookup is racing with unlink, then we should return an | 219 | error = ENOENT; |
154 | * error immediately so we don't remove it from the reclaim | 220 | goto out_error; |
155 | * list and potentially leak the inode. | 221 | } |
156 | */ | ||
157 | if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) { | ||
158 | error = ENOENT; | ||
159 | goto out_error; | ||
160 | } | ||
161 | 222 | ||
223 | /* | ||
224 | * If IRECLAIMABLE is set, we've torn down the VFS inode already. | ||
225 | * Need to carefully get it back into usable state. | ||
226 | */ | ||
227 | if (ip->i_flags & XFS_IRECLAIMABLE) { | ||
162 | xfs_itrace_exit_tag(ip, "xfs_iget.alloc"); | 228 | xfs_itrace_exit_tag(ip, "xfs_iget.alloc"); |
163 | 229 | ||
164 | /* | 230 | /* |
165 | * We need to re-initialise the VFS inode as it has been | 231 | * We need to set XFS_INEW atomically with clearing the |
166 | * 'freed' by the VFS. Do this here so we can deal with | 232 | * reclaimable tag so that we do have an indicator of the |
167 | * errors cleanly, then tag it so it can be set up correctly | 233 | * inode still being initialized. |
168 | * later. | ||
169 | */ | 234 | */ |
170 | if (!inode_init_always(mp->m_super, VFS_I(ip))) { | 235 | ip->i_flags |= XFS_INEW; |
171 | error = ENOMEM; | 236 | ip->i_flags &= ~XFS_IRECLAIMABLE; |
172 | goto out_error; | 237 | __xfs_inode_clear_reclaim_tag(mp, pag, ip); |
173 | } | ||
174 | 238 | ||
175 | /* | 239 | spin_unlock(&ip->i_flags_lock); |
176 | * We must set the XFS_INEW flag before clearing the | 240 | read_unlock(&pag->pag_ici_lock); |
177 | * XFS_IRECLAIMABLE flag so that if a racing lookup does | ||
178 | * not find the XFS_IRECLAIMABLE above but has the igrab() | ||
179 | * below succeed we can safely check XFS_INEW to detect | ||
180 | * that this inode is still being initialised. | ||
181 | */ | ||
182 | xfs_iflags_set(ip, XFS_INEW); | ||
183 | xfs_iflags_clear(ip, XFS_IRECLAIMABLE); | ||
184 | 241 | ||
185 | /* clear the radix tree reclaim flag as well. */ | 242 | error = -inode_init_always(mp->m_super, inode); |
186 | __xfs_inode_clear_reclaim_tag(mp, pag, ip); | 243 | if (error) { |
187 | } else if (!igrab(VFS_I(ip))) { | 244 | /* |
245 | * Re-initializing the inode failed, and we are in deep | ||
246 | * trouble. Try to re-add it to the reclaim list. | ||
247 | */ | ||
248 | read_lock(&pag->pag_ici_lock); | ||
249 | spin_lock(&ip->i_flags_lock); | ||
250 | |||
251 | ip->i_flags &= ~XFS_INEW; | ||
252 | ip->i_flags |= XFS_IRECLAIMABLE; | ||
253 | __xfs_inode_set_reclaim_tag(pag, ip); | ||
254 | goto out_error; | ||
255 | } | ||
256 | inode->i_state = I_LOCK|I_NEW; | ||
257 | } else { | ||
188 | /* If the VFS inode is being torn down, pause and try again. */ | 258 | /* If the VFS inode is being torn down, pause and try again. */ |
189 | XFS_STATS_INC(xs_ig_frecycle); | 259 | if (!igrab(inode)) { |
190 | goto out_error; | 260 | error = EAGAIN; |
191 | } else if (xfs_iflags_test(ip, XFS_INEW)) { | 261 | goto out_error; |
192 | /* | 262 | } |
193 | * We are racing with another cache hit that is | ||
194 | * currently recycling this inode out of the XFS_IRECLAIMABLE | ||
195 | * state. Wait for the initialisation to complete before | ||
196 | * continuing. | ||
197 | */ | ||
198 | wait_on_inode(VFS_I(ip)); | ||
199 | } | ||
200 | 263 | ||
201 | if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) { | 264 | /* We've got a live one. */ |
202 | error = ENOENT; | 265 | spin_unlock(&ip->i_flags_lock); |
203 | iput(VFS_I(ip)); | 266 | read_unlock(&pag->pag_ici_lock); |
204 | goto out_error; | ||
205 | } | 267 | } |
206 | 268 | ||
207 | /* We've got a live one. */ | ||
208 | read_unlock(&pag->pag_ici_lock); | ||
209 | |||
210 | if (lock_flags != 0) | 269 | if (lock_flags != 0) |
211 | xfs_ilock(ip, lock_flags); | 270 | xfs_ilock(ip, lock_flags); |
212 | 271 | ||
@@ -216,6 +275,7 @@ xfs_iget_cache_hit( | |||
216 | return 0; | 275 | return 0; |
217 | 276 | ||
218 | out_error: | 277 | out_error: |
278 | spin_unlock(&ip->i_flags_lock); | ||
219 | read_unlock(&pag->pag_ici_lock); | 279 | read_unlock(&pag->pag_ici_lock); |
220 | return error; | 280 | return error; |
221 | } | 281 | } |
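The recycle path above depends on flag ordering: XFS_INEW is set in the same i_flags_lock critical section that clears XFS_IRECLAIMABLE, so a concurrent lookup always observes at least one of the two flags and backs off with EAGAIN instead of touching a half-initialised inode. Reduced to its essentials (a sketch of the locking discipline, not the full function):

	spin_lock(&ip->i_flags_lock);
	ip->i_flags |= XFS_INEW;		/* racers now back off on INEW... */
	ip->i_flags &= ~XFS_IRECLAIMABLE;	/* ...before the reclaim marker goes away */
	__xfs_inode_clear_reclaim_tag(mp, pag, ip);
	spin_unlock(&ip->i_flags_lock);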
@@ -299,7 +359,8 @@ out_preload_end: | |||
299 | if (lock_flags) | 359 | if (lock_flags) |
300 | xfs_iunlock(ip, lock_flags); | 360 | xfs_iunlock(ip, lock_flags); |
301 | out_destroy: | 361 | out_destroy: |
302 | xfs_destroy_inode(ip); | 362 | __destroy_inode(VFS_I(ip)); |
363 | xfs_inode_free(ip); | ||
303 | return error; | 364 | return error; |
304 | } | 365 | } |
305 | 366 | ||
@@ -394,32 +455,6 @@ out_error_or_again: | |||
394 | return error; | 455 | return error; |
395 | } | 456 | } |
396 | 457 | ||
397 | |||
398 | /* | ||
399 | * Look for the inode corresponding to the given ino in the hash table. | ||
400 | * If it is there and its i_transp pointer matches tp, return it. | ||
401 | * Otherwise, return NULL. | ||
402 | */ | ||
403 | xfs_inode_t * | ||
404 | xfs_inode_incore(xfs_mount_t *mp, | ||
405 | xfs_ino_t ino, | ||
406 | xfs_trans_t *tp) | ||
407 | { | ||
408 | xfs_inode_t *ip; | ||
409 | xfs_perag_t *pag; | ||
410 | |||
411 | pag = xfs_get_perag(mp, ino); | ||
412 | read_lock(&pag->pag_ici_lock); | ||
413 | ip = radix_tree_lookup(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ino)); | ||
414 | read_unlock(&pag->pag_ici_lock); | ||
415 | xfs_put_perag(mp, pag); | ||
416 | |||
417 | /* the returned inode must match the transaction */ | ||
418 | if (ip && (ip->i_transp != tp)) | ||
419 | return NULL; | ||
420 | return ip; | ||
421 | } | ||
422 | |||
423 | /* | 458 | /* |
424 | * Decrement reference count of an inode structure and unlock it. | 459 | * Decrement reference count of an inode structure and unlock it. |
425 | * | 460 | * |
@@ -504,62 +539,7 @@ xfs_ireclaim( | |||
504 | xfs_qm_dqdetach(ip); | 539 | xfs_qm_dqdetach(ip); |
505 | xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); | 540 | xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); |
506 | 541 | ||
507 | switch (ip->i_d.di_mode & S_IFMT) { | 542 | xfs_inode_free(ip); |
508 | case S_IFREG: | ||
509 | case S_IFDIR: | ||
510 | case S_IFLNK: | ||
511 | xfs_idestroy_fork(ip, XFS_DATA_FORK); | ||
512 | break; | ||
513 | } | ||
514 | |||
515 | if (ip->i_afp) | ||
516 | xfs_idestroy_fork(ip, XFS_ATTR_FORK); | ||
517 | |||
518 | #ifdef XFS_INODE_TRACE | ||
519 | ktrace_free(ip->i_trace); | ||
520 | #endif | ||
521 | #ifdef XFS_BMAP_TRACE | ||
522 | ktrace_free(ip->i_xtrace); | ||
523 | #endif | ||
524 | #ifdef XFS_BTREE_TRACE | ||
525 | ktrace_free(ip->i_btrace); | ||
526 | #endif | ||
527 | #ifdef XFS_RW_TRACE | ||
528 | ktrace_free(ip->i_rwtrace); | ||
529 | #endif | ||
530 | #ifdef XFS_ILOCK_TRACE | ||
531 | ktrace_free(ip->i_lock_trace); | ||
532 | #endif | ||
533 | #ifdef XFS_DIR2_TRACE | ||
534 | ktrace_free(ip->i_dir_trace); | ||
535 | #endif | ||
536 | if (ip->i_itemp) { | ||
537 | /* | ||
538 | * Only if we are shutting down the fs will we see an | ||
539 | * inode still in the AIL. If it is there, we should remove | ||
540 | * it to prevent a use-after-free from occurring. | ||
541 | */ | ||
542 | xfs_log_item_t *lip = &ip->i_itemp->ili_item; | ||
543 | struct xfs_ail *ailp = lip->li_ailp; | ||
544 | |||
545 | ASSERT(((lip->li_flags & XFS_LI_IN_AIL) == 0) || | ||
546 | XFS_FORCED_SHUTDOWN(ip->i_mount)); | ||
547 | if (lip->li_flags & XFS_LI_IN_AIL) { | ||
548 | spin_lock(&ailp->xa_lock); | ||
549 | if (lip->li_flags & XFS_LI_IN_AIL) | ||
550 | xfs_trans_ail_delete(ailp, lip); | ||
551 | else | ||
552 | spin_unlock(&ailp->xa_lock); | ||
553 | } | ||
554 | xfs_inode_item_destroy(ip); | ||
555 | ip->i_itemp = NULL; | ||
556 | } | ||
557 | /* asserts to verify all state is correct here */ | ||
558 | ASSERT(atomic_read(&ip->i_iocount) == 0); | ||
559 | ASSERT(atomic_read(&ip->i_pincount) == 0); | ||
560 | ASSERT(!spin_is_locked(&ip->i_flags_lock)); | ||
561 | ASSERT(completion_done(&ip->i_flush)); | ||
562 | kmem_zone_free(xfs_inode_zone, ip); | ||
563 | } | 543 | } |
564 | 544 | ||
565 | /* | 545 | /* |
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 1f22d65fed0a..c1dc7ef5a1d8 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c | |||
@@ -343,6 +343,16 @@ xfs_iformat( | |||
343 | return XFS_ERROR(EFSCORRUPTED); | 343 | return XFS_ERROR(EFSCORRUPTED); |
344 | } | 344 | } |
345 | 345 | ||
346 | if (unlikely((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) && | ||
347 | !ip->i_mount->m_rtdev_targp)) { | ||
348 | xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, | ||
349 | "corrupt dinode %Lu, has realtime flag set.", | ||
350 | ip->i_ino); | ||
351 | XFS_CORRUPTION_ERROR("xfs_iformat(realtime)", | ||
352 | XFS_ERRLEVEL_LOW, ip->i_mount, dip); | ||
353 | return XFS_ERROR(EFSCORRUPTED); | ||
354 | } | ||
355 | |||
346 | switch (ip->i_d.di_mode & S_IFMT) { | 356 | switch (ip->i_d.di_mode & S_IFMT) { |
347 | case S_IFIFO: | 357 | case S_IFIFO: |
348 | case S_IFCHR: | 358 | case S_IFCHR: |
@@ -641,7 +651,7 @@ xfs_iformat_btree( | |||
641 | return 0; | 651 | return 0; |
642 | } | 652 | } |
643 | 653 | ||
644 | void | 654 | STATIC void |
645 | xfs_dinode_from_disk( | 655 | xfs_dinode_from_disk( |
646 | xfs_icdinode_t *to, | 656 | xfs_icdinode_t *to, |
647 | xfs_dinode_t *from) | 657 | xfs_dinode_t *from) |
@@ -1237,7 +1247,7 @@ xfs_isize_check( | |||
1237 | * In that case the pages will still be in memory, but the inode size | 1247 | * In that case the pages will still be in memory, but the inode size |
1238 | * will never have been updated. | 1248 | * will never have been updated. |
1239 | */ | 1249 | */ |
1240 | xfs_fsize_t | 1250 | STATIC xfs_fsize_t |
1241 | xfs_file_last_byte( | 1251 | xfs_file_last_byte( |
1242 | xfs_inode_t *ip) | 1252 | xfs_inode_t *ip) |
1243 | { | 1253 | { |
@@ -3827,7 +3837,7 @@ xfs_iext_inline_to_direct( | |||
3827 | /* | 3837 | /* |
3828 | * Resize an extent indirection array to new_size bytes. | 3838 | * Resize an extent indirection array to new_size bytes. |
3829 | */ | 3839 | */ |
3830 | void | 3840 | STATIC void |
3831 | xfs_iext_realloc_indirect( | 3841 | xfs_iext_realloc_indirect( |
3832 | xfs_ifork_t *ifp, /* inode fork pointer */ | 3842 | xfs_ifork_t *ifp, /* inode fork pointer */ |
3833 | int new_size) /* new indirection array size */ | 3843 | int new_size) /* new indirection array size */ |
@@ -3852,7 +3862,7 @@ xfs_iext_realloc_indirect( | |||
3852 | /* | 3862 | /* |
3853 | * Switch from indirection array to linear (direct) extent allocations. | 3863 | * Switch from indirection array to linear (direct) extent allocations. |
3854 | */ | 3864 | */ |
3855 | void | 3865 | STATIC void |
3856 | xfs_iext_indirect_to_direct( | 3866 | xfs_iext_indirect_to_direct( |
3857 | xfs_ifork_t *ifp) /* inode fork pointer */ | 3867 | xfs_ifork_t *ifp) /* inode fork pointer */ |
3858 | { | 3868 | { |
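The remaining xfs_inode.c hunks only change linkage: with their last external callers gone, the functions become STATIC. STATIC is an XFS-local macro rather than plain static; assuming the xfs_linux.h definition of this vintage (worth verifying in your tree), it is roughly:

        #ifdef DEBUG
        # define STATIC                         /* keep symbols global for debugging */
        #else
        # define STATIC static noinline         /* private, but a distinct symbol in traces */
        #endif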
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 1804f866a71d..0b38b9a869ec 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h | |||
@@ -261,7 +261,6 @@ typedef struct xfs_inode { | |||
261 | /* Miscellaneous state. */ | 261 | /* Miscellaneous state. */ |
262 | unsigned short i_flags; /* see defined flags below */ | 262 | unsigned short i_flags; /* see defined flags below */ |
263 | unsigned char i_update_core; /* timestamps/size is dirty */ | 263 | unsigned char i_update_core; /* timestamps/size is dirty */ |
264 | unsigned char i_update_size; /* di_size field is dirty */ | ||
265 | unsigned int i_delayed_blks; /* count of delay alloc blks */ | 264 | unsigned int i_delayed_blks; /* count of delay alloc blks */ |
266 | 265 | ||
267 | xfs_icdinode_t i_d; /* most of ondisk inode */ | 266 | xfs_icdinode_t i_d; /* most of ondisk inode */ |
@@ -310,23 +309,6 @@ static inline struct inode *VFS_I(struct xfs_inode *ip) | |||
310 | } | 309 | } |
311 | 310 | ||
312 | /* | 311 | /* |
313 | * Get rid of a partially initialized inode. | ||
314 | * | ||
315 | * We have to go through destroy_inode to make sure allocations | ||
316 | * from init_inode_always like the security data are undone. | ||
317 | * | ||
318 | * We mark the inode bad so that it takes the short cut in | ||
319 | * the reclaim path instead of going through the flush path | ||
320 | * which doesn't make sense for an inode that has never seen the | ||
321 | * light of day. | ||
322 | */ | ||
323 | static inline void xfs_destroy_inode(struct xfs_inode *ip) | ||
324 | { | ||
325 | make_bad_inode(VFS_I(ip)); | ||
326 | return destroy_inode(VFS_I(ip)); | ||
327 | } | ||
328 | |||
329 | /* | ||
330 | * i_flags helper functions | 312 | * i_flags helper functions |
331 | */ | 313 | */ |
332 | static inline void | 314 | static inline void |
@@ -485,8 +467,6 @@ static inline void xfs_ifunlock(xfs_inode_t *ip) | |||
485 | /* | 467 | /* |
486 | * xfs_iget.c prototypes. | 468 | * xfs_iget.c prototypes. |
487 | */ | 469 | */ |
488 | xfs_inode_t *xfs_inode_incore(struct xfs_mount *, xfs_ino_t, | ||
489 | struct xfs_trans *); | ||
490 | int xfs_iget(struct xfs_mount *, struct xfs_trans *, xfs_ino_t, | 470 | int xfs_iget(struct xfs_mount *, struct xfs_trans *, xfs_ino_t, |
491 | uint, uint, xfs_inode_t **, xfs_daddr_t); | 471 | uint, uint, xfs_inode_t **, xfs_daddr_t); |
492 | void xfs_iput(xfs_inode_t *, uint); | 472 | void xfs_iput(xfs_inode_t *, uint); |
@@ -521,7 +501,6 @@ void xfs_ipin(xfs_inode_t *); | |||
521 | void xfs_iunpin(xfs_inode_t *); | 501 | void xfs_iunpin(xfs_inode_t *); |
522 | int xfs_iflush(xfs_inode_t *, uint); | 502 | int xfs_iflush(xfs_inode_t *, uint); |
523 | void xfs_ichgtime(xfs_inode_t *, int); | 503 | void xfs_ichgtime(xfs_inode_t *, int); |
524 | xfs_fsize_t xfs_file_last_byte(xfs_inode_t *); | ||
525 | void xfs_lock_inodes(xfs_inode_t **, int, uint); | 504 | void xfs_lock_inodes(xfs_inode_t **, int, uint); |
526 | void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint); | 505 | void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint); |
527 | 506 | ||
@@ -589,8 +568,6 @@ int xfs_itobp(struct xfs_mount *, struct xfs_trans *, | |||
589 | struct xfs_buf **, uint); | 568 | struct xfs_buf **, uint); |
590 | int xfs_iread(struct xfs_mount *, struct xfs_trans *, | 569 | int xfs_iread(struct xfs_mount *, struct xfs_trans *, |
591 | struct xfs_inode *, xfs_daddr_t, uint); | 570 | struct xfs_inode *, xfs_daddr_t, uint); |
592 | void xfs_dinode_from_disk(struct xfs_icdinode *, | ||
593 | struct xfs_dinode *); | ||
594 | void xfs_dinode_to_disk(struct xfs_dinode *, | 571 | void xfs_dinode_to_disk(struct xfs_dinode *, |
595 | struct xfs_icdinode *); | 572 | struct xfs_icdinode *); |
596 | void xfs_idestroy_fork(struct xfs_inode *, int); | 573 | void xfs_idestroy_fork(struct xfs_inode *, int); |
@@ -609,8 +586,6 @@ void xfs_iext_remove_inline(xfs_ifork_t *, xfs_extnum_t, int); | |||
609 | void xfs_iext_remove_direct(xfs_ifork_t *, xfs_extnum_t, int); | 586 | void xfs_iext_remove_direct(xfs_ifork_t *, xfs_extnum_t, int); |
610 | void xfs_iext_remove_indirect(xfs_ifork_t *, xfs_extnum_t, int); | 587 | void xfs_iext_remove_indirect(xfs_ifork_t *, xfs_extnum_t, int); |
611 | void xfs_iext_realloc_direct(xfs_ifork_t *, int); | 588 | void xfs_iext_realloc_direct(xfs_ifork_t *, int); |
612 | void xfs_iext_realloc_indirect(xfs_ifork_t *, int); | ||
613 | void xfs_iext_indirect_to_direct(xfs_ifork_t *); | ||
614 | void xfs_iext_direct_to_inline(xfs_ifork_t *, xfs_extnum_t); | 589 | void xfs_iext_direct_to_inline(xfs_ifork_t *, xfs_extnum_t); |
615 | void xfs_iext_inline_to_direct(xfs_ifork_t *, int); | 590 | void xfs_iext_inline_to_direct(xfs_ifork_t *, int); |
616 | void xfs_iext_destroy(xfs_ifork_t *); | 591 | void xfs_iext_destroy(xfs_ifork_t *); |
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 977c4aec587e..47d5b663c37e 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c | |||
@@ -263,14 +263,6 @@ xfs_inode_item_format( | |||
263 | } | 263 | } |
264 | 264 | ||
265 | /* | 265 | /* |
266 | * We don't have to worry about re-ordering here because | ||
267 | * the update_size field is protected by the inode lock | ||
268 | * and we have that held in exclusive mode. | ||
269 | */ | ||
270 | if (ip->i_update_size) | ||
271 | ip->i_update_size = 0; | ||
272 | |||
273 | /* | ||
274 | * Make sure to get the latest atime from the Linux inode. | 266 | * Make sure to get the latest atime from the Linux inode. |
275 | */ | 267 | */ |
276 | xfs_synchronize_atime(ip); | 268 | xfs_synchronize_atime(ip); |
@@ -712,8 +704,6 @@ xfs_inode_item_unlock( | |||
712 | * Clear out the fields of the inode log item particular | 704 | * Clear out the fields of the inode log item particular |
713 | * to the current transaction. | 705 | * to the current transaction. |
714 | */ | 706 | */ |
715 | iip->ili_ilock_recur = 0; | ||
716 | iip->ili_iolock_recur = 0; | ||
717 | iip->ili_flags = 0; | 707 | iip->ili_flags = 0; |
718 | 708 | ||
719 | /* | 709 | /* |
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h index a52ac125f055..65bae4c9b8bf 100644 --- a/fs/xfs/xfs_inode_item.h +++ b/fs/xfs/xfs_inode_item.h | |||
@@ -137,8 +137,6 @@ typedef struct xfs_inode_log_item { | |||
137 | struct xfs_inode *ili_inode; /* inode ptr */ | 137 | struct xfs_inode *ili_inode; /* inode ptr */ |
138 | xfs_lsn_t ili_flush_lsn; /* lsn at last flush */ | 138 | xfs_lsn_t ili_flush_lsn; /* lsn at last flush */ |
139 | xfs_lsn_t ili_last_lsn; /* lsn at last transaction */ | 139 | xfs_lsn_t ili_last_lsn; /* lsn at last transaction */ |
140 | unsigned short ili_ilock_recur; /* lock recursion count */ | ||
141 | unsigned short ili_iolock_recur; /* lock recursion count */ | ||
142 | unsigned short ili_flags; /* misc flags */ | 140 | unsigned short ili_flags; /* misc flags */ |
143 | unsigned short ili_logged; /* flushed logged data */ | 141 | unsigned short ili_logged; /* flushed logged data */ |
144 | unsigned int ili_last_fields; /* fields when flushed */ | 142 | unsigned int ili_last_fields; /* fields when flushed */ |
diff --git a/fs/xfs/xfs_inum.h b/fs/xfs/xfs_inum.h index 7a28191cb0de..b8e4ee4e89a4 100644 --- a/fs/xfs/xfs_inum.h +++ b/fs/xfs/xfs_inum.h | |||
@@ -72,7 +72,6 @@ struct xfs_mount; | |||
72 | 72 | ||
73 | #if XFS_BIG_INUMS | 73 | #if XFS_BIG_INUMS |
74 | #define XFS_MAXINUMBER ((xfs_ino_t)((1ULL << 56) - 1ULL)) | 74 | #define XFS_MAXINUMBER ((xfs_ino_t)((1ULL << 56) - 1ULL)) |
75 | #define XFS_INO64_OFFSET ((xfs_ino_t)(1ULL << 32)) | ||
76 | #else | 75 | #else |
77 | #define XFS_MAXINUMBER ((xfs_ino_t)((1ULL << 32) - 1ULL)) | 76 | #define XFS_MAXINUMBER ((xfs_ino_t)((1ULL << 32) - 1ULL)) |
78 | #endif | 77 | #endif |
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index aeb2d2221c7d..b68f9107e26c 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c | |||
@@ -39,7 +39,7 @@ | |||
39 | #include "xfs_error.h" | 39 | #include "xfs_error.h" |
40 | #include "xfs_btree.h" | 40 | #include "xfs_btree.h" |
41 | 41 | ||
42 | int | 42 | STATIC int |
43 | xfs_internal_inum( | 43 | xfs_internal_inum( |
44 | xfs_mount_t *mp, | 44 | xfs_mount_t *mp, |
45 | xfs_ino_t ino) | 45 | xfs_ino_t ino) |
@@ -353,9 +353,6 @@ xfs_bulkstat( | |||
353 | int end_of_ag; /* set if we've seen the ag end */ | 353 | int end_of_ag; /* set if we've seen the ag end */ |
354 | int error; /* error code */ | 354 | int error; /* error code */ |
355 | int fmterror;/* bulkstat formatter result */ | 355 | int fmterror;/* bulkstat formatter result */ |
356 | __int32_t gcnt; /* current btree rec's count */ | ||
357 | xfs_inofree_t gfree; /* current btree rec's free mask */ | ||
358 | xfs_agino_t gino; /* current btree rec's start inode */ | ||
359 | int i; /* loop index */ | 356 | int i; /* loop index */ |
360 | int icount; /* count of inodes good in irbuf */ | 357 | int icount; /* count of inodes good in irbuf */ |
361 | size_t irbsize; /* size of irec buffer in bytes */ | 358 | size_t irbsize; /* size of irec buffer in bytes */ |
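The three removed g-prefixed locals (gino, gcnt, gfree) are replaced below by a single xfs_inobt_rec_incore_t, the incore form of an inode-btree record. For reference, its layout as declared in xfs_ialloc_btree.h of this era (comments paraphrased):

        typedef struct xfs_inobt_rec_incore {
                xfs_agino_t     ir_startino;    /* first inode number of the chunk */
                __int32_t       ir_freecount;   /* number of free inodes */
                xfs_inofree_t   ir_free;        /* bitmask of free inodes */
        } xfs_inobt_rec_incore_t;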
@@ -442,40 +439,43 @@ xfs_bulkstat( | |||
442 | * we need to get the remainder of the chunk we're in. | 439 | * we need to get the remainder of the chunk we're in. |
443 | */ | 440 | */ |
444 | if (agino > 0) { | 441 | if (agino > 0) { |
442 | xfs_inobt_rec_incore_t r; | ||
443 | |||
445 | /* | 444 | /* |
446 | * Lookup the inode chunk that this inode lives in. | 445 | * Lookup the inode chunk that this inode lives in. |
447 | */ | 446 | */ |
448 | error = xfs_inobt_lookup_le(cur, agino, 0, 0, &tmp); | 447 | error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_LE, |
448 | &tmp); | ||
449 | if (!error && /* no I/O error */ | 449 | if (!error && /* no I/O error */ |
450 | tmp && /* lookup succeeded */ | 450 | tmp && /* lookup succeeded */ |
451 | /* got the record, should always work */ | 451 | /* got the record, should always work */ |
452 | !(error = xfs_inobt_get_rec(cur, &gino, &gcnt, | 452 | !(error = xfs_inobt_get_rec(cur, &r, &i)) && |
453 | &gfree, &i)) && | ||
454 | i == 1 && | 453 | i == 1 && |
455 | /* this is the right chunk */ | 454 | /* this is the right chunk */ |
456 | agino < gino + XFS_INODES_PER_CHUNK && | 455 | agino < r.ir_startino + XFS_INODES_PER_CHUNK && |
457 | /* lastino was not last in chunk */ | 456 | /* lastino was not last in chunk */ |
458 | (chunkidx = agino - gino + 1) < | 457 | (chunkidx = agino - r.ir_startino + 1) < |
459 | XFS_INODES_PER_CHUNK && | 458 | XFS_INODES_PER_CHUNK && |
460 | /* there are some left allocated */ | 459 | /* there are some left allocated */ |
461 | xfs_inobt_maskn(chunkidx, | 460 | xfs_inobt_maskn(chunkidx, |
462 | XFS_INODES_PER_CHUNK - chunkidx) & ~gfree) { | 461 | XFS_INODES_PER_CHUNK - chunkidx) & |
462 | ~r.ir_free) { | ||
463 | /* | 463 | /* |
464 | * Grab the chunk record. Mark all the | 464 | * Grab the chunk record. Mark all the |
465 | * uninteresting inodes (because they're | 465 | * uninteresting inodes (because they're |
466 | * before our start point) free. | 466 | * before our start point) free. |
467 | */ | 467 | */ |
468 | for (i = 0; i < chunkidx; i++) { | 468 | for (i = 0; i < chunkidx; i++) { |
469 | if (XFS_INOBT_MASK(i) & ~gfree) | 469 | if (XFS_INOBT_MASK(i) & ~r.ir_free) |
470 | gcnt++; | 470 | r.ir_freecount++; |
471 | } | 471 | } |
472 | gfree |= xfs_inobt_maskn(0, chunkidx); | 472 | r.ir_free |= xfs_inobt_maskn(0, chunkidx); |
473 | irbp->ir_startino = gino; | 473 | irbp->ir_startino = r.ir_startino; |
474 | irbp->ir_freecount = gcnt; | 474 | irbp->ir_freecount = r.ir_freecount; |
475 | irbp->ir_free = gfree; | 475 | irbp->ir_free = r.ir_free; |
476 | irbp++; | 476 | irbp++; |
477 | agino = gino + XFS_INODES_PER_CHUNK; | 477 | agino = r.ir_startino + XFS_INODES_PER_CHUNK; |
478 | icount = XFS_INODES_PER_CHUNK - gcnt; | 478 | icount = XFS_INODES_PER_CHUNK - r.ir_freecount; |
479 | } else { | 479 | } else { |
480 | /* | 480 | /* |
481 | * If any of those tests failed, bump the | 481 | * If any of those tests failed, bump the |
@@ -493,7 +493,7 @@ xfs_bulkstat( | |||
493 | /* | 493 | /* |
494 | * Start of ag. Lookup the first inode chunk. | 494 | * Start of ag. Lookup the first inode chunk. |
495 | */ | 495 | */ |
496 | error = xfs_inobt_lookup_ge(cur, 0, 0, 0, &tmp); | 496 | error = xfs_inobt_lookup(cur, 0, XFS_LOOKUP_GE, &tmp); |
497 | icount = 0; | 497 | icount = 0; |
498 | } | 498 | } |
499 | /* | 499 | /* |
@@ -501,6 +501,8 @@ xfs_bulkstat( | |||
501 | * until we run out of inodes or space in the buffer. | 501 | * until we run out of inodes or space in the buffer. |
502 | */ | 502 | */ |
503 | while (irbp < irbufend && icount < ubcount) { | 503 | while (irbp < irbufend && icount < ubcount) { |
504 | xfs_inobt_rec_incore_t r; | ||
505 | |||
504 | /* | 506 | /* |
505 | * Loop as long as we're unable to read the | 507 | * Loop as long as we're unable to read the |
506 | * inode btree. | 508 | * inode btree. |
@@ -510,51 +512,55 @@ xfs_bulkstat( | |||
510 | if (XFS_AGINO_TO_AGBNO(mp, agino) >= | 512 | if (XFS_AGINO_TO_AGBNO(mp, agino) >= |
511 | be32_to_cpu(agi->agi_length)) | 513 | be32_to_cpu(agi->agi_length)) |
512 | break; | 514 | break; |
513 | error = xfs_inobt_lookup_ge(cur, agino, 0, 0, | 515 | error = xfs_inobt_lookup(cur, agino, |
514 | &tmp); | 516 | XFS_LOOKUP_GE, &tmp); |
515 | cond_resched(); | 517 | cond_resched(); |
516 | } | 518 | } |
517 | /* | 519 | /* |
518 | * If we run off the end of the ag, either with an error | 520 | * If we run off the end of the ag, either with an error |
519 | * or the normal way, set end_of_ag and stop collecting. | 521 | * or the normal way, set end_of_ag and stop collecting. |
520 | */ | 522 | */ |
521 | if (error || | 523 | if (error) { |
522 | (error = xfs_inobt_get_rec(cur, &gino, &gcnt, | ||
523 | &gfree, &i)) || | ||
524 | i == 0) { | ||
525 | end_of_ag = 1; | 524 | end_of_ag = 1; |
526 | break; | 525 | break; |
527 | } | 526 | } |
527 | |||
528 | error = xfs_inobt_get_rec(cur, &r, &i); | ||
529 | if (error || i == 0) { | ||
530 | end_of_ag = 1; | ||
531 | break; | ||
532 | } | ||
533 | |||
528 | /* | 534 | /* |
529 | * If this chunk has any allocated inodes, save it. | 535 | * If this chunk has any allocated inodes, save it. |
530 | * Also start read-ahead now for this chunk. | 536 | * Also start read-ahead now for this chunk. |
531 | */ | 537 | */ |
532 | if (gcnt < XFS_INODES_PER_CHUNK) { | 538 | if (r.ir_freecount < XFS_INODES_PER_CHUNK) { |
533 | /* | 539 | /* |
534 | * Loop over all clusters in the next chunk. | 540 | * Loop over all clusters in the next chunk. |
535 | * Do a readahead if there are any allocated | 541 | * Do a readahead if there are any allocated |
536 | * inodes in that cluster. | 542 | * inodes in that cluster. |
537 | */ | 543 | */ |
538 | for (agbno = XFS_AGINO_TO_AGBNO(mp, gino), | 544 | agbno = XFS_AGINO_TO_AGBNO(mp, r.ir_startino); |
539 | chunkidx = 0; | 545 | for (chunkidx = 0; |
540 | chunkidx < XFS_INODES_PER_CHUNK; | 546 | chunkidx < XFS_INODES_PER_CHUNK; |
541 | chunkidx += nicluster, | 547 | chunkidx += nicluster, |
542 | agbno += nbcluster) { | 548 | agbno += nbcluster) { |
543 | if (xfs_inobt_maskn(chunkidx, | 549 | if (xfs_inobt_maskn(chunkidx, nicluster) |
544 | nicluster) & ~gfree) | 550 | & ~r.ir_free) |
545 | xfs_btree_reada_bufs(mp, agno, | 551 | xfs_btree_reada_bufs(mp, agno, |
546 | agbno, nbcluster); | 552 | agbno, nbcluster); |
547 | } | 553 | } |
548 | irbp->ir_startino = gino; | 554 | irbp->ir_startino = r.ir_startino; |
549 | irbp->ir_freecount = gcnt; | 555 | irbp->ir_freecount = r.ir_freecount; |
550 | irbp->ir_free = gfree; | 556 | irbp->ir_free = r.ir_free; |
551 | irbp++; | 557 | irbp++; |
552 | icount += XFS_INODES_PER_CHUNK - gcnt; | 558 | icount += XFS_INODES_PER_CHUNK - r.ir_freecount; |
553 | } | 559 | } |
554 | /* | 560 | /* |
555 | * Set agino to after this chunk and bump the cursor. | 561 | * Set agino to after this chunk and bump the cursor. |
556 | */ | 562 | */ |
557 | agino = gino + XFS_INODES_PER_CHUNK; | 563 | agino = r.ir_startino + XFS_INODES_PER_CHUNK; |
558 | error = xfs_btree_increment(cur, 0, &tmp); | 564 | error = xfs_btree_increment(cur, 0, &tmp); |
559 | cond_resched(); | 565 | cond_resched(); |
560 | } | 566 | } |
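Stripped of the bulkstat bookkeeping, the rewritten loop is a plain inode-btree walk: position the cursor with xfs_inobt_lookup(), read each record with xfs_inobt_get_rec(), advance with xfs_btree_increment(). A minimal sketch using only the calls visible in the diff (cursor setup and error paths omitted):

        xfs_inobt_rec_incore_t  r;
        int                     stat;   /* 1 = record returned, 0 = off the end */

        error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_GE, &stat);
        while (!error && stat) {
                error = xfs_inobt_get_rec(cur, &r, &stat);
                if (error || !stat)
                        break;
                /* ... consume r.ir_startino, r.ir_freecount, r.ir_free ... */
                error = xfs_btree_increment(cur, 0, &stat);
        }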
@@ -820,9 +826,7 @@ xfs_inumbers( | |||
820 | int bufidx; | 826 | int bufidx; |
821 | xfs_btree_cur_t *cur; | 827 | xfs_btree_cur_t *cur; |
822 | int error; | 828 | int error; |
823 | __int32_t gcnt; | 829 | xfs_inobt_rec_incore_t r; |
824 | xfs_inofree_t gfree; | ||
825 | xfs_agino_t gino; | ||
826 | int i; | 830 | int i; |
827 | xfs_ino_t ino; | 831 | xfs_ino_t ino; |
828 | int left; | 832 | int left; |
@@ -855,7 +859,8 @@ xfs_inumbers( | |||
855 | continue; | 859 | continue; |
856 | } | 860 | } |
857 | cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno); | 861 | cur = xfs_inobt_init_cursor(mp, NULL, agbp, agno); |
858 | error = xfs_inobt_lookup_ge(cur, agino, 0, 0, &tmp); | 862 | error = xfs_inobt_lookup(cur, agino, XFS_LOOKUP_GE, |
863 | &tmp); | ||
859 | if (error) { | 864 | if (error) { |
860 | xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); | 865 | xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); |
861 | cur = NULL; | 866 | cur = NULL; |
@@ -870,9 +875,8 @@ xfs_inumbers( | |||
870 | continue; | 875 | continue; |
871 | } | 876 | } |
872 | } | 877 | } |
873 | if ((error = xfs_inobt_get_rec(cur, &gino, &gcnt, &gfree, | 878 | error = xfs_inobt_get_rec(cur, &r, &i); |
874 | &i)) || | 879 | if (error || i == 0) { |
875 | i == 0) { | ||
876 | xfs_buf_relse(agbp); | 880 | xfs_buf_relse(agbp); |
877 | agbp = NULL; | 881 | agbp = NULL; |
878 | xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); | 882 | xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); |
@@ -881,10 +885,12 @@ xfs_inumbers( | |||
881 | agino = 0; | 885 | agino = 0; |
882 | continue; | 886 | continue; |
883 | } | 887 | } |
884 | agino = gino + XFS_INODES_PER_CHUNK - 1; | 888 | agino = r.ir_startino + XFS_INODES_PER_CHUNK - 1; |
885 | buffer[bufidx].xi_startino = XFS_AGINO_TO_INO(mp, agno, gino); | 889 | buffer[bufidx].xi_startino = |
886 | buffer[bufidx].xi_alloccount = XFS_INODES_PER_CHUNK - gcnt; | 890 | XFS_AGINO_TO_INO(mp, agno, r.ir_startino); |
887 | buffer[bufidx].xi_allocmask = ~gfree; | 891 | buffer[bufidx].xi_alloccount = |
892 | XFS_INODES_PER_CHUNK - r.ir_freecount; | ||
893 | buffer[bufidx].xi_allocmask = ~r.ir_free; | ||
888 | bufidx++; | 894 | bufidx++; |
889 | left--; | 895 | left--; |
890 | if (bufidx == bcount) { | 896 | if (bufidx == bcount) { |
diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h index 1fb04e7deb61..20792bf45946 100644 --- a/fs/xfs/xfs_itable.h +++ b/fs/xfs/xfs_itable.h | |||
@@ -99,11 +99,6 @@ xfs_bulkstat_one( | |||
99 | void *dibuff, | 99 | void *dibuff, |
100 | int *stat); | 100 | int *stat); |
101 | 101 | ||
102 | int | ||
103 | xfs_internal_inum( | ||
104 | xfs_mount_t *mp, | ||
105 | xfs_ino_t ino); | ||
106 | |||
107 | typedef int (*inumbers_fmt_pf)( | 102 | typedef int (*inumbers_fmt_pf)( |
108 | void __user *ubuffer, /* buffer to write to */ | 103 | void __user *ubuffer, /* buffer to write to */ |
109 | const xfs_inogrp_t *buffer, /* buffer to read from */ | 104 | const xfs_inogrp_t *buffer, /* buffer to read from */ |
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 3750f04ede0b..9dbdff3ea484 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c | |||
@@ -3180,7 +3180,7 @@ try_again: | |||
3180 | STATIC void | 3180 | STATIC void |
3181 | xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog) | 3181 | xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog) |
3182 | { | 3182 | { |
3183 | ASSERT(spin_is_locked(&log->l_icloglock)); | 3183 | assert_spin_locked(&log->l_icloglock); |
3184 | 3184 | ||
3185 | if (iclog->ic_state == XLOG_STATE_ACTIVE) { | 3185 | if (iclog->ic_state == XLOG_STATE_ACTIVE) { |
3186 | xlog_state_switch_iclogs(log, iclog, 0); | 3186 | xlog_state_switch_iclogs(log, iclog, 0); |
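The one-line xfs_log.c change is a correctness fix as much as a cleanup: on uniprocessor kernels spinlocks can compile away, so spin_is_locked() may report 0 even while the lock is held and the old ASSERT fires spuriously, whereas assert_spin_locked() is the core-kernel primitive that degrades to a no-op when the state cannot be checked. Side by side:

        ASSERT(spin_is_locked(&log->l_icloglock));      /* unreliable on !CONFIG_SMP */
        assert_spin_locked(&log->l_icloglock);          /* no-op when it cannot be verified */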
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index bcad5f4c1fd1..679c7c4926a2 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h | |||
@@ -451,8 +451,6 @@ extern int xlog_find_tail(xlog_t *log, | |||
451 | extern int xlog_recover(xlog_t *log); | 451 | extern int xlog_recover(xlog_t *log); |
452 | extern int xlog_recover_finish(xlog_t *log); | 452 | extern int xlog_recover_finish(xlog_t *log); |
453 | extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int); | 453 | extern void xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int); |
454 | extern void xlog_recover_process_iunlinks(xlog_t *log); | ||
455 | |||
456 | extern struct xfs_buf *xlog_get_bp(xlog_t *, int); | 454 | extern struct xfs_buf *xlog_get_bp(xlog_t *, int); |
457 | extern void xlog_put_bp(struct xfs_buf *); | 455 | extern void xlog_put_bp(struct xfs_buf *); |
458 | 456 | ||
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 47da2fb45377..1099395d7d6c 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c | |||
@@ -3263,7 +3263,7 @@ xlog_recover_process_one_iunlink( | |||
3263 | * freeing of the inode and its removal from the list must be | 3263 | * freeing of the inode and its removal from the list must be |
3264 | * atomic. | 3264 | * atomic. |
3265 | */ | 3265 | */ |
3266 | void | 3266 | STATIC void |
3267 | xlog_recover_process_iunlinks( | 3267 | xlog_recover_process_iunlinks( |
3268 | xlog_t *log) | 3268 | xlog_t *log) |
3269 | { | 3269 | { |
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 5c6f092659c1..8b6c9e807efb 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c | |||
@@ -1568,7 +1568,7 @@ xfs_mod_sb(xfs_trans_t *tp, __int64_t fields) | |||
1568 | * | 1568 | * |
1569 | * The m_sb_lock must be held when this routine is called. | 1569 | * The m_sb_lock must be held when this routine is called. |
1570 | */ | 1570 | */ |
1571 | int | 1571 | STATIC int |
1572 | xfs_mod_incore_sb_unlocked( | 1572 | xfs_mod_incore_sb_unlocked( |
1573 | xfs_mount_t *mp, | 1573 | xfs_mount_t *mp, |
1574 | xfs_sb_field_t field, | 1574 | xfs_sb_field_t field, |
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index a5122382afde..a6c023bc0fb2 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h | |||
@@ -414,13 +414,10 @@ typedef struct xfs_mod_sb { | |||
414 | 414 | ||
415 | extern int xfs_log_sbcount(xfs_mount_t *, uint); | 415 | extern int xfs_log_sbcount(xfs_mount_t *, uint); |
416 | extern int xfs_mountfs(xfs_mount_t *mp); | 416 | extern int xfs_mountfs(xfs_mount_t *mp); |
417 | extern void xfs_mountfs_check_barriers(xfs_mount_t *mp); | ||
418 | 417 | ||
419 | extern void xfs_unmountfs(xfs_mount_t *); | 418 | extern void xfs_unmountfs(xfs_mount_t *); |
420 | extern int xfs_unmountfs_writesb(xfs_mount_t *); | 419 | extern int xfs_unmountfs_writesb(xfs_mount_t *); |
421 | extern int xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int64_t, int); | 420 | extern int xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int64_t, int); |
422 | extern int xfs_mod_incore_sb_unlocked(xfs_mount_t *, xfs_sb_field_t, | ||
423 | int64_t, int); | ||
424 | extern int xfs_mod_incore_sb_batch(xfs_mount_t *, xfs_mod_sb_t *, | 421 | extern int xfs_mod_incore_sb_batch(xfs_mount_t *, xfs_mod_sb_t *, |
425 | uint, int); | 422 | uint, int); |
426 | extern int xfs_mount_log_sb(xfs_mount_t *, __int64_t); | 423 | extern int xfs_mount_log_sb(xfs_mount_t *, __int64_t); |
diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c index afee7eb24323..4b0613d99faa 100644 --- a/fs/xfs/xfs_mru_cache.c +++ b/fs/xfs/xfs_mru_cache.c | |||
@@ -564,35 +564,6 @@ xfs_mru_cache_lookup( | |||
564 | } | 564 | } |
565 | 565 | ||
566 | /* | 566 | /* |
567 | * To look up an element using its key, but leave its location in the internal | ||
568 | * lists alone, call xfs_mru_cache_peek(). If the element isn't found, this | ||
569 | * function returns NULL. | ||
570 | * | ||
571 | * See the comments above the declaration of the xfs_mru_cache_lookup() function | ||
572 | * for important locking information pertaining to this call. | ||
573 | */ | ||
574 | void * | ||
575 | xfs_mru_cache_peek( | ||
576 | xfs_mru_cache_t *mru, | ||
577 | unsigned long key) | ||
578 | { | ||
579 | xfs_mru_cache_elem_t *elem; | ||
580 | |||
581 | ASSERT(mru && mru->lists); | ||
582 | if (!mru || !mru->lists) | ||
583 | return NULL; | ||
584 | |||
585 | spin_lock(&mru->lock); | ||
586 | elem = radix_tree_lookup(&mru->store, key); | ||
587 | if (!elem) | ||
588 | spin_unlock(&mru->lock); | ||
589 | else | ||
590 | __release(mru_lock); /* help sparse not be stupid */ | ||
591 | |||
592 | return elem ? elem->value : NULL; | ||
593 | } | ||
594 | |||
595 | /* | ||
596 | * To release the internal data structure spinlock after having performed an | 567 | * To release the internal data structure spinlock after having performed an |
597 | * xfs_mru_cache_lookup() or an xfs_mru_cache_peek(), call xfs_mru_cache_done() | 568 | * xfs_mru_cache_lookup() or an xfs_mru_cache_peek(), call xfs_mru_cache_done() |
598 | * with the data store pointer. | 569 | * with the data store pointer. |
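Note that the surviving comment still names xfs_mru_cache_peek() even though this hunk removes it; only xfs_mru_cache_lookup() remains. Its contract, per the comments above, is that the cache's internal lock is held on a successful lookup and must be released with xfs_mru_cache_done(). A minimal sketch of the remaining pattern:

        /* Look up a value; on a hit the MRU's internal lock is held until
         * xfs_mru_cache_done() is called. */
        void *value = xfs_mru_cache_lookup(mru, key);
        if (value) {
                /* ... use value while the entry is pinned ... */
                xfs_mru_cache_done(mru);
        }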
diff --git a/fs/xfs/xfs_mru_cache.h b/fs/xfs/xfs_mru_cache.h index dd58ea1bbebe..5d439f34b0c9 100644 --- a/fs/xfs/xfs_mru_cache.h +++ b/fs/xfs/xfs_mru_cache.h | |||
@@ -49,7 +49,6 @@ int xfs_mru_cache_insert(struct xfs_mru_cache *mru, unsigned long key, | |||
49 | void * xfs_mru_cache_remove(struct xfs_mru_cache *mru, unsigned long key); | 49 | void * xfs_mru_cache_remove(struct xfs_mru_cache *mru, unsigned long key); |
50 | void xfs_mru_cache_delete(struct xfs_mru_cache *mru, unsigned long key); | 50 | void xfs_mru_cache_delete(struct xfs_mru_cache *mru, unsigned long key); |
51 | void *xfs_mru_cache_lookup(struct xfs_mru_cache *mru, unsigned long key); | 51 | void *xfs_mru_cache_lookup(struct xfs_mru_cache *mru, unsigned long key); |
52 | void *xfs_mru_cache_peek(struct xfs_mru_cache *mru, unsigned long key); | ||
53 | void xfs_mru_cache_done(struct xfs_mru_cache *mru); | 52 | void xfs_mru_cache_done(struct xfs_mru_cache *mru); |
54 | 53 | ||
55 | #endif /* __XFS_MRU_CACHE_H__ */ | 54 | #endif /* __XFS_MRU_CACHE_H__ */ |
diff --git a/fs/xfs/xfs_rw.c b/fs/xfs/xfs_rw.c index fea68615ed23..3f816ad7ff19 100644 --- a/fs/xfs/xfs_rw.c +++ b/fs/xfs/xfs_rw.c | |||
@@ -88,90 +88,6 @@ xfs_write_clear_setuid( | |||
88 | } | 88 | } |
89 | 89 | ||
90 | /* | 90 | /* |
91 | * Handle logging requirements of various synchronous types of write. | ||
92 | */ | ||
93 | int | ||
94 | xfs_write_sync_logforce( | ||
95 | xfs_mount_t *mp, | ||
96 | xfs_inode_t *ip) | ||
97 | { | ||
98 | int error = 0; | ||
99 | |||
100 | /* | ||
101 | * If we're treating this as O_DSYNC and we have not updated the | ||
102 | * size, force the log. | ||
103 | */ | ||
104 | if (!(mp->m_flags & XFS_MOUNT_OSYNCISOSYNC) && | ||
105 | !(ip->i_update_size)) { | ||
106 | xfs_inode_log_item_t *iip = ip->i_itemp; | ||
107 | |||
108 | /* | ||
109 | * If an allocation transaction occurred | ||
110 | * without extending the size, then we have to force | ||
111 | * the log up to the proper point to ensure that the | ||
112 | * allocation is permanent. We can't count on | ||
113 | * the fact that buffered writes lock out direct I/O | ||
114 | * writes - the direct I/O write could have extended | ||
115 | * the size nontransactionally, then finished before | ||
116 | * we started. xfs_write_file will think that the file | ||
117 | * didn't grow but the update isn't safe unless the | ||
118 | * size change is logged. | ||
119 | * | ||
120 | * Force the log if we've committed a transaction | ||
121 | * against the inode or if someone else has and | ||
122 | * the commit record hasn't gone to disk (e.g. | ||
123 | * the inode is pinned). This guarantees that | ||
124 | * all changes affecting the inode are permanent | ||
125 | * when we return. | ||
126 | */ | ||
127 | if (iip && iip->ili_last_lsn) { | ||
128 | error = _xfs_log_force(mp, iip->ili_last_lsn, | ||
129 | XFS_LOG_FORCE | XFS_LOG_SYNC, NULL); | ||
130 | } else if (xfs_ipincount(ip) > 0) { | ||
131 | error = _xfs_log_force(mp, (xfs_lsn_t)0, | ||
132 | XFS_LOG_FORCE | XFS_LOG_SYNC, NULL); | ||
133 | } | ||
134 | |||
135 | } else { | ||
136 | xfs_trans_t *tp; | ||
137 | |||
138 | /* | ||
139 | * O_SYNC or O_DSYNC _with_ a size update are handled | ||
140 | * the same way. | ||
141 | * | ||
142 | * If the write was synchronous then we need to make | ||
143 | * sure that the inode modification time is permanent. | ||
144 | * We'll have updated the timestamp above, so here | ||
145 | * we use a synchronous transaction to log the inode. | ||
146 | * It's not fast, but it's necessary. | ||
147 | * | ||
148 | * If this is a dsync write and the size got changed | ||
149 | * non-transactionally, then we need to ensure that | ||
150 | * the size change gets logged in a synchronous | ||
151 | * transaction. | ||
152 | */ | ||
153 | tp = xfs_trans_alloc(mp, XFS_TRANS_WRITE_SYNC); | ||
154 | if ((error = xfs_trans_reserve(tp, 0, | ||
155 | XFS_SWRITE_LOG_RES(mp), | ||
156 | 0, 0, 0))) { | ||
157 | /* Transaction reserve failed */ | ||
158 | xfs_trans_cancel(tp, 0); | ||
159 | } else { | ||
160 | /* Transaction reserve successful */ | ||
161 | xfs_ilock(ip, XFS_ILOCK_EXCL); | ||
162 | xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); | ||
163 | xfs_trans_ihold(tp, ip); | ||
164 | xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); | ||
165 | xfs_trans_set_sync(tp); | ||
166 | error = xfs_trans_commit(tp, 0); | ||
167 | xfs_iunlock(ip, XFS_ILOCK_EXCL); | ||
168 | } | ||
169 | } | ||
170 | |||
171 | return error; | ||
172 | } | ||
173 | |||
174 | /* | ||
175 | * Force a shutdown of the filesystem instantly while keeping | 91 | * Force a shutdown of the filesystem instantly while keeping |
176 | * the filesystem consistent. We don't do an unmount here; just shutdown | 92 | * the filesystem consistent. We don't do an unmount here; just shutdown |
177 | * the shop, make sure that absolutely nothing persistent happens to | 93 | * the shop, make sure that absolutely nothing persistent happens to |
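The first branch of the removed helper is the standard force-to-LSN idiom, which xfs_fsync() also relies on in this era: if the inode has a committed transaction, force the log up to that LSN; if it is merely pinned by someone else's commit, force the whole log. Condensed from the deleted code above:

        xfs_inode_log_item_t *iip = ip->i_itemp;

        if (iip && iip->ili_last_lsn)           /* committed: force to its LSN */
                error = _xfs_log_force(mp, iip->ili_last_lsn,
                                       XFS_LOG_FORCE | XFS_LOG_SYNC, NULL);
        else if (xfs_ipincount(ip) > 0)         /* pinned: force everything */
                error = _xfs_log_force(mp, (xfs_lsn_t)0,
                                       XFS_LOG_FORCE | XFS_LOG_SYNC, NULL);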
diff --git a/fs/xfs/xfs_rw.h b/fs/xfs/xfs_rw.h index f76c003ec55d..f5e4874c37d8 100644 --- a/fs/xfs/xfs_rw.h +++ b/fs/xfs/xfs_rw.h | |||
@@ -68,7 +68,6 @@ xfs_get_extsz_hint( | |||
68 | * Prototypes for functions in xfs_rw.c. | 68 | * Prototypes for functions in xfs_rw.c. |
69 | */ | 69 | */ |
70 | extern int xfs_write_clear_setuid(struct xfs_inode *ip); | 70 | extern int xfs_write_clear_setuid(struct xfs_inode *ip); |
71 | extern int xfs_write_sync_logforce(struct xfs_mount *mp, struct xfs_inode *ip); | ||
72 | extern int xfs_bwrite(struct xfs_mount *mp, struct xfs_buf *bp); | 71 | extern int xfs_bwrite(struct xfs_mount *mp, struct xfs_buf *bp); |
73 | extern int xfs_bioerror(struct xfs_buf *bp); | 72 | extern int xfs_bioerror(struct xfs_buf *bp); |
74 | extern int xfs_bioerror_relse(struct xfs_buf *bp); | 73 | extern int xfs_bioerror_relse(struct xfs_buf *bp); |
@@ -78,10 +77,4 @@ extern int xfs_read_buf(struct xfs_mount *mp, xfs_buftarg_t *btp, | |||
78 | extern void xfs_ioerror_alert(char *func, struct xfs_mount *mp, | 77 | extern void xfs_ioerror_alert(char *func, struct xfs_mount *mp, |
79 | xfs_buf_t *bp, xfs_daddr_t blkno); | 78 | xfs_buf_t *bp, xfs_daddr_t blkno); |
80 | 79 | ||
81 | /* | ||
82 | * Prototypes for functions in xfs_vnodeops.c. | ||
83 | */ | ||
84 | extern int xfs_free_eofblocks(struct xfs_mount *mp, struct xfs_inode *ip, | ||
85 | int flags); | ||
86 | |||
87 | #endif /* __XFS_RW_H__ */ | 80 | #endif /* __XFS_RW_H__ */ |
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 775249a54f6f..ed47fc77759c 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h | |||
@@ -68,7 +68,7 @@ typedef struct xfs_trans_header { | |||
68 | #define XFS_TRANS_GROWFS 14 | 68 | #define XFS_TRANS_GROWFS 14 |
69 | #define XFS_TRANS_STRAT_WRITE 15 | 69 | #define XFS_TRANS_STRAT_WRITE 15 |
70 | #define XFS_TRANS_DIOSTRAT 16 | 70 | #define XFS_TRANS_DIOSTRAT 16 |
71 | #define XFS_TRANS_WRITE_SYNC 17 | 71 | /* 17 was XFS_TRANS_WRITE_SYNC */ |
72 | #define XFS_TRANS_WRITEID 18 | 72 | #define XFS_TRANS_WRITEID 18 |
73 | #define XFS_TRANS_ADDAFORK 19 | 73 | #define XFS_TRANS_ADDAFORK 19 |
74 | #define XFS_TRANS_ATTRINVAL 20 | 74 | #define XFS_TRANS_ATTRINVAL 20 |
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 8ee2f8c8b0a6..218829e6a152 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c | |||
@@ -307,7 +307,7 @@ xfs_trans_read_buf( | |||
307 | return (flags & XFS_BUF_TRYLOCK) ? | 307 | return (flags & XFS_BUF_TRYLOCK) ? |
308 | EAGAIN : XFS_ERROR(ENOMEM); | 308 | EAGAIN : XFS_ERROR(ENOMEM); |
309 | 309 | ||
310 | if ((bp != NULL) && (XFS_BUF_GETERROR(bp) != 0)) { | 310 | if (XFS_BUF_GETERROR(bp) != 0) { |
311 | xfs_ioerror_alert("xfs_trans_read_buf", mp, | 311 | xfs_ioerror_alert("xfs_trans_read_buf", mp, |
312 | bp, blkno); | 312 | bp, blkno); |
313 | error = XFS_BUF_GETERROR(bp); | 313 | error = XFS_BUF_GETERROR(bp); |
@@ -315,7 +315,7 @@ xfs_trans_read_buf( | |||
315 | return error; | 315 | return error; |
316 | } | 316 | } |
317 | #ifdef DEBUG | 317 | #ifdef DEBUG |
318 | if (xfs_do_error && (bp != NULL)) { | 318 | if (xfs_do_error) { |
319 | if (xfs_error_target == target) { | 319 | if (xfs_error_target == target) { |
320 | if (((xfs_req_num++) % xfs_error_mod) == 0) { | 320 | if (((xfs_req_num++) % xfs_error_mod) == 0) { |
321 | xfs_buf_relse(bp); | 321 | xfs_buf_relse(bp); |
diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c index 23d276af2e0c..785ff101da0a 100644 --- a/fs/xfs/xfs_trans_inode.c +++ b/fs/xfs/xfs_trans_inode.c | |||
@@ -49,30 +49,7 @@ xfs_trans_inode_broot_debug( | |||
49 | 49 | ||
50 | 50 | ||
51 | /* | 51 | /* |
52 | * Get and lock the inode for the caller if it is not already | 52 | * Get an inode and join it to the transaction. |
53 | * locked within the given transaction. If it is already locked | ||
54 | * within the transaction, just increment its lock recursion count | ||
55 | * and return a pointer to it. | ||
56 | * | ||
57 | * For an inode to be locked in a transaction, the inode lock, as | ||
58 | * opposed to the io lock, must be taken exclusively. This ensures | ||
59 | * that the inode can be involved in only 1 transaction at a time. | ||
60 | * Lock recursion is handled on the io lock, but only for lock modes | ||
61 | * of equal or lesser strength. That is, you can recur on the io lock | ||
62 | * held EXCL with a SHARED request but not vice versa. Also, if | ||
63 | * the inode is already a part of the transaction then you cannot | ||
64 | * go from not holding the io lock to having it EXCL or SHARED. | ||
65 | * | ||
66 | * Use the inode cache routine xfs_inode_incore() to find the inode | ||
67 | * if it is already owned by this transaction. | ||
68 | * | ||
69 | * If we don't already own the inode, use xfs_iget() to get it. | ||
70 | * Since the inode log item structure is embedded in the incore | ||
71 | * inode structure and is initialized when the inode is brought | ||
72 | * into memory, there is nothing to do with it here. | ||
73 | * | ||
74 | * If the given transaction pointer is NULL, just call xfs_iget(). | ||
75 | * This simplifies code which must handle both cases. | ||
76 | */ | 53 | */ |
77 | int | 54 | int |
78 | xfs_trans_iget( | 55 | xfs_trans_iget( |
@@ -84,62 +61,11 @@ xfs_trans_iget( | |||
84 | xfs_inode_t **ipp) | 61 | xfs_inode_t **ipp) |
85 | { | 62 | { |
86 | int error; | 63 | int error; |
87 | xfs_inode_t *ip; | ||
88 | |||
89 | /* | ||
90 | * If the transaction pointer is NULL, just call the normal | ||
91 | * xfs_iget(). | ||
92 | */ | ||
93 | if (tp == NULL) | ||
94 | return xfs_iget(mp, NULL, ino, flags, lock_flags, ipp, 0); | ||
95 | |||
96 | /* | ||
97 | * If we find the inode in core with this transaction | ||
98 | * pointer in its i_transp field, then we know we already | ||
99 | * have it locked. In this case we just increment the lock | ||
100 | * recursion count and return the inode to the caller. | ||
101 | * Assert that the inode is already locked in the mode requested | ||
102 | * by the caller. We cannot do lock promotions yet, so | ||
103 | * die if someone gets this wrong. | ||
104 | */ | ||
105 | if ((ip = xfs_inode_incore(tp->t_mountp, ino, tp)) != NULL) { | ||
106 | /* | ||
107 | * Make sure that the inode lock is held EXCL and | ||
108 | * that the io lock is never upgraded when the inode | ||
109 | * is already a part of the transaction. | ||
110 | */ | ||
111 | ASSERT(ip->i_itemp != NULL); | ||
112 | ASSERT(lock_flags & XFS_ILOCK_EXCL); | ||
113 | ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); | ||
114 | ASSERT((!(lock_flags & XFS_IOLOCK_EXCL)) || | ||
115 | xfs_isilocked(ip, XFS_IOLOCK_EXCL)); | ||
116 | ASSERT((!(lock_flags & XFS_IOLOCK_EXCL)) || | ||
117 | (ip->i_itemp->ili_flags & XFS_ILI_IOLOCKED_EXCL)); | ||
118 | ASSERT((!(lock_flags & XFS_IOLOCK_SHARED)) || | ||
119 | xfs_isilocked(ip, XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)); | ||
120 | ASSERT((!(lock_flags & XFS_IOLOCK_SHARED)) || | ||
121 | (ip->i_itemp->ili_flags & XFS_ILI_IOLOCKED_ANY)); | ||
122 | |||
123 | if (lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) { | ||
124 | ip->i_itemp->ili_iolock_recur++; | ||
125 | } | ||
126 | if (lock_flags & XFS_ILOCK_EXCL) { | ||
127 | ip->i_itemp->ili_ilock_recur++; | ||
128 | } | ||
129 | *ipp = ip; | ||
130 | return 0; | ||
131 | } | ||
132 | |||
133 | ASSERT(lock_flags & XFS_ILOCK_EXCL); | ||
134 | error = xfs_iget(tp->t_mountp, tp, ino, flags, lock_flags, &ip, 0); | ||
135 | if (error) { | ||
136 | return error; | ||
137 | } | ||
138 | ASSERT(ip != NULL); | ||
139 | 64 | ||
140 | xfs_trans_ijoin(tp, ip, lock_flags); | 65 | error = xfs_iget(mp, tp, ino, flags, lock_flags, ipp, 0); |
141 | *ipp = ip; | 66 | if (!error && tp) |
142 | return 0; | 67 | xfs_trans_ijoin(tp, *ipp, lock_flags); |
68 | return error; | ||
143 | } | 69 | } |
144 | 70 | ||
145 | /* | 71 | /* |
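With the recursion bookkeeping gone, xfs_trans_iget() reduces to xfs_iget() plus an automatic xfs_trans_ijoin() when a transaction is supplied. A hypothetical caller, sketching the simplified contract:

        xfs_inode_t     *ip;
        int             error;

        error = xfs_trans_iget(mp, tp, ino, 0, XFS_ILOCK_EXCL, &ip);
        if (error)
                return error;
        /* ip is returned locked; when tp != NULL it is already joined to tp */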
@@ -163,8 +89,6 @@ xfs_trans_ijoin( | |||
163 | xfs_inode_item_init(ip, ip->i_mount); | 89 | xfs_inode_item_init(ip, ip->i_mount); |
164 | iip = ip->i_itemp; | 90 | iip = ip->i_itemp; |
165 | ASSERT(iip->ili_flags == 0); | 91 | ASSERT(iip->ili_flags == 0); |
166 | ASSERT(iip->ili_ilock_recur == 0); | ||
167 | ASSERT(iip->ili_iolock_recur == 0); | ||
168 | 92 | ||
169 | /* | 93 | /* |
170 | * Get a log_item_desc to point at the new item. | 94 | * Get a log_item_desc to point at the new item. |
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index c4eca5ed5dab..a434f287962d 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c | |||
@@ -538,7 +538,9 @@ xfs_readlink_bmap( | |||
538 | d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); | 538 | d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); |
539 | byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); | 539 | byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); |
540 | 540 | ||
541 | bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0); | 541 | bp = xfs_buf_read_flags(mp->m_ddev_targp, d, BTOBB(byte_cnt), |
542 | XBF_LOCK | XBF_MAPPED | | ||
543 | XBF_DONT_BLOCK); | ||
542 | error = XFS_BUF_GETERROR(bp); | 544 | error = XFS_BUF_GETERROR(bp); |
543 | if (error) { | 545 | if (error) { |
544 | xfs_ioerror_alert("xfs_readlink", | 546 | xfs_ioerror_alert("xfs_readlink", |
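The xfs_readlink_bmap() hunk switches to xfs_buf_read_flags() so the buffer flags are explicit. The annotations below are assumptions about the XBF_* meanings in xfs_buf.h of this era, not guarantees; verify against your tree:

        bp = xfs_buf_read_flags(mp->m_ddev_targp, d, BTOBB(byte_cnt),
                                XBF_LOCK |              /* return the buffer locked */
                                XBF_MAPPED |            /* mapped, so b_addr is valid */
                                XBF_DONT_BLOCK);        /* don't block in this thread */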
@@ -609,7 +611,7 @@ xfs_fsync( | |||
609 | xfs_inode_t *ip) | 611 | xfs_inode_t *ip) |
610 | { | 612 | { |
611 | xfs_trans_t *tp; | 613 | xfs_trans_t *tp; |
612 | int error; | 614 | int error = 0; |
613 | int log_flushed = 0, changed = 1; | 615 | int log_flushed = 0, changed = 1; |
614 | 616 | ||
615 | xfs_itrace_entry(ip); | 617 | xfs_itrace_entry(ip); |
@@ -617,14 +619,9 @@ xfs_fsync( | |||
617 | if (XFS_FORCED_SHUTDOWN(ip->i_mount)) | 619 | if (XFS_FORCED_SHUTDOWN(ip->i_mount)) |
618 | return XFS_ERROR(EIO); | 620 | return XFS_ERROR(EIO); |
619 | 621 | ||
620 | /* capture size updates in I/O completion before writing the inode. */ | ||
621 | error = xfs_wait_on_pages(ip, 0, -1); | ||
622 | if (error) | ||
623 | return XFS_ERROR(error); | ||
624 | |||
625 | /* | 622 | /* |
626 | * We always need to make sure that the required inode state is safe on | 623 | * We always need to make sure that the required inode state is safe on |
627 | * disk. The vnode might be clean but we still might need to force the | 624 | * disk. The inode might be clean but we still might need to force the |
628 | * log because of committed transactions that haven't hit the disk yet. | 625 | * log because of committed transactions that haven't hit the disk yet. |
629 | * Likewise, there could be unflushed non-transactional changes to the | 626 | * Likewise, there could be unflushed non-transactional changes to the |
630 | * inode core that have to go to disk and this requires us to issue | 627 | * inode core that have to go to disk and this requires us to issue |
@@ -636,7 +633,7 @@ xfs_fsync( | |||
636 | */ | 633 | */ |
637 | xfs_ilock(ip, XFS_ILOCK_SHARED); | 634 | xfs_ilock(ip, XFS_ILOCK_SHARED); |
638 | 635 | ||
639 | if (!(ip->i_update_size || ip->i_update_core)) { | 636 | if (!ip->i_update_core) { |
640 | /* | 637 | /* |
641 | * Timestamps/size haven't changed since last inode flush or | 638 | * Timestamps/size haven't changed since last inode flush or |
642 | * inode transaction commit. That means either nothing got | 639 | * inode transaction commit. That means either nothing got |
@@ -716,7 +713,7 @@ xfs_fsync( | |||
716 | * when the link count isn't zero and by xfs_dm_punch_hole() when | 713 | * when the link count isn't zero and by xfs_dm_punch_hole() when |
717 | * punching a hole to EOF. | 714 | * punching a hole to EOF. |
718 | */ | 715 | */ |
719 | int | 716 | STATIC int |
720 | xfs_free_eofblocks( | 717 | xfs_free_eofblocks( |
721 | xfs_mount_t *mp, | 718 | xfs_mount_t *mp, |
722 | xfs_inode_t *ip, | 719 | xfs_inode_t *ip, |
@@ -1474,8 +1471,8 @@ xfs_create( | |||
1474 | if (error == ENOSPC) { | 1471 | if (error == ENOSPC) { |
1475 | /* flush outstanding delalloc blocks and retry */ | 1472 | /* flush outstanding delalloc blocks and retry */ |
1476 | xfs_flush_inodes(dp); | 1473 | xfs_flush_inodes(dp); |
1477 | error = xfs_trans_reserve(tp, resblks, XFS_CREATE_LOG_RES(mp), 0, | 1474 | error = xfs_trans_reserve(tp, resblks, log_res, 0, |
1478 | XFS_TRANS_PERM_LOG_RES, XFS_CREATE_LOG_COUNT); | 1475 | XFS_TRANS_PERM_LOG_RES, log_count); |
1479 | } | 1476 | } |
1480 | if (error == ENOSPC) { | 1477 | if (error == ENOSPC) { |
1481 | /* No space at all so try a "no-allocation" reservation */ | 1478 | /* No space at all so try a "no-allocation" reservation */ |