Merge branch 'master' into for-linus

Conflicts:
	fs/pipe.c

Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
author: Jens Axboe <jaxboe@fusionio.com> 2010-06-01 06:42:12 -0400
committer: Jens Axboe <jaxboe@fusionio.com> 2010-06-01 06:42:12 -0400
commit: b4ca761577535b2b4d153689ee97342797dfff05 (patch)
tree: 29054d55508f1faa22ec32acf7c245751af03348 /fs
parent: 28f4197e5d4707311febeec8a0eb97cb5fd93c97 (diff)
parent: 67a3e12b05e055c0415c556a315a3d3eb637e29e (diff)
275 files changed, 11458 insertions, 6190 deletions
diff --git a/fs/9p/v9fs_vfs.h b/fs/9p/v9fs_vfs.h
index ed835836e0dc..32ef4009d030 100644
--- a/fs/9p/v9fs_vfs.h
+++ b/fs/9p/v9fs_vfs.h
@@ -40,7 +40,9 @@
 extern struct file_system_type v9fs_fs_type;
 extern const struct address_space_operations v9fs_addr_operations;
 extern const struct file_operations v9fs_file_operations;
+extern const struct file_operations v9fs_file_operations_dotl;
 extern const struct file_operations v9fs_dir_operations;
+extern const struct file_operations v9fs_dir_operations_dotl;
 extern const struct dentry_operations v9fs_dentry_operations;
 extern const struct dentry_operations v9fs_cached_dentry_operations;
diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c
index 0adfd64dfcee..d61e3b28ce37 100644
--- a/fs/9p/vfs_dir.c
+++ b/fs/9p/vfs_dir.c
@@ -203,3 +203,11 @@ const struct file_operations v9fs_dir_operations = {
        .open = v9fs_file_open,
        .release = v9fs_dir_release,
 };
+const struct file_operations v9fs_dir_operations_dotl = {
+        .read = generic_read_dir,
+        .llseek = generic_file_llseek,
+        .readdir = v9fs_dir_readdir,
+        .open = v9fs_file_open,
+        .release = v9fs_dir_release,
+};
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index df52d488d2a6..2bedc6c94fc2 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -257,15 +257,13 @@ v9fs_file_write(struct file *filp, const char __user * data,
        return total;
 }
-static int v9fs_file_fsync(struct file *filp, struct dentry *dentry,
+static int v9fs_file_fsync(struct file *filp, int datasync)
-                                        int datasync)
 {
        struct p9_fid *fid;
        struct p9_wstat wstat;
        int retval;
-        P9_DPRINTK(P9_DEBUG_VFS, "filp %p dentry %p datasync %x\n", filp,
+        P9_DPRINTK(P9_DEBUG_VFS, "filp %p datasync %x\n", filp, datasync);
-                                                dentry, datasync);
        fid = filp->private_data;
        v9fs_blank_wstat(&wstat);
@@ -296,3 +294,14 @@ const struct file_operations v9fs_file_operations = {
        .mmap = generic_file_readonly_mmap,
        .fsync = v9fs_file_fsync,
 };
+const struct file_operations v9fs_file_operations_dotl = {
+        .llseek = generic_file_llseek,
+        .read = v9fs_file_read,
+        .write = v9fs_file_write,
+        .open = v9fs_file_open,
+        .release = v9fs_dir_release,
+        .lock = v9fs_file_lock,
+        .mmap = generic_file_readonly_mmap,
+        .fsync = v9fs_file_fsync,
+};
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 6d4d86187c55..4331b3b5ee1c 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -44,9 +44,12 @@
 #include "cache.h"
 static const struct inode_operations v9fs_dir_inode_operations;
-static const struct inode_operations v9fs_dir_inode_operations_ext;
+static const struct inode_operations v9fs_dir_inode_operations_dotu;
+static const struct inode_operations v9fs_dir_inode_operations_dotl;
 static const struct inode_operations v9fs_file_inode_operations;
+static const struct inode_operations v9fs_file_inode_operations_dotl;
 static const struct inode_operations v9fs_symlink_inode_operations;
+static const struct inode_operations v9fs_symlink_inode_operations_dotl;
 /**
 * unixmode2p9mode - convert unix mode bits to plan 9
@@ -273,25 +276,44 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode)
                init_special_inode(inode, inode->i_mode, inode->i_rdev);
                break;
        case S_IFREG:
-                inode->i_op = &v9fs_file_inode_operations;
+                if (v9fs_proto_dotl(v9ses)) {
-                inode->i_fop = &v9fs_file_operations;
+                        inode->i_op = &v9fs_file_inode_operations_dotl;
+                        inode->i_fop = &v9fs_file_operations_dotl;
+                } else {
+                        inode->i_op = &v9fs_file_inode_operations;
+                        inode->i_fop = &v9fs_file_operations;
+                }
                break;
        case S_IFLNK:
-                if (!v9fs_proto_dotu(v9ses)) {
+                if (!v9fs_proto_dotu(v9ses) && !v9fs_proto_dotl(v9ses)) {
-                        P9_DPRINTK(P9_DEBUG_ERROR,
+                        P9_DPRINTK(P9_DEBUG_ERROR, "extended modes used with "
-                                   "extended modes used w/o 9P2000.u\n");
+                                                "legacy protocol.\n");
                        err = -EINVAL;
                        goto error;
                }
-                inode->i_op = &v9fs_symlink_inode_operations;
+                if (v9fs_proto_dotl(v9ses))
+                        inode->i_op = &v9fs_symlink_inode_operations_dotl;
+                else
+                        inode->i_op = &v9fs_symlink_inode_operations;
                break;
        case S_IFDIR:
                inc_nlink(inode);
-                if (v9fs_proto_dotu(v9ses))
+                if (v9fs_proto_dotl(v9ses))
-                        inode->i_op = &v9fs_dir_inode_operations_ext;
+                        inode->i_op = &v9fs_dir_inode_operations_dotl;
+                else if (v9fs_proto_dotu(v9ses))
+                        inode->i_op = &v9fs_dir_inode_operations_dotu;
                else
                        inode->i_op = &v9fs_dir_inode_operations;
-                inode->i_fop = &v9fs_dir_operations;
+                if (v9fs_proto_dotl(v9ses))
+                        inode->i_fop = &v9fs_dir_operations_dotl;
+                else
+                        inode->i_fop = &v9fs_dir_operations;
                break;
        default:
                P9_DPRINTK(P9_DEBUG_ERROR, "BAD mode 0x%x S_IFMT 0x%x\n",
@@ -432,14 +454,12 @@ static int v9fs_remove(struct inode *dir, struct dentry *file, int rmdir)
 {
        int retval;
        struct inode *file_inode;
-        struct v9fs_session_info *v9ses;
        struct p9_fid *v9fid;
        P9_DPRINTK(P9_DEBUG_VFS, "inode: %p dentry: %p rmdir: %d\n", dir, file,
                rmdir);
        file_inode = file->d_inode;
-        v9ses = v9fs_inode2v9ses(file_inode);
        v9fid = v9fs_fid_clone(file);
        if (IS_ERR(v9fid))
                return PTR_ERR(v9fid);
@@ -482,12 +502,11 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
        ofid = NULL;
        fid = NULL;
        name = (char *) dentry->d_name.name;
-        dfid = v9fs_fid_clone(dentry->d_parent);
+        dfid = v9fs_fid_lookup(dentry->d_parent);
        if (IS_ERR(dfid)) {
                err = PTR_ERR(dfid);
-                P9_DPRINTK(P9_DEBUG_VFS, "fid clone failed %d\n", err);
+                P9_DPRINTK(P9_DEBUG_VFS, "fid lookup failed %d\n", err);
-                dfid = NULL;
+                return ERR_PTR(err);
-                goto error;
        }
        /* clone a fid to use for creation */
@@ -495,8 +514,7 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
        if (IS_ERR(ofid)) {
                err = PTR_ERR(ofid);
                P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
-                ofid = NULL;
+                return ERR_PTR(err);
-                goto error;
        }
        err = p9_client_fcreate(ofid, name, perm, mode, extension);
@@ -506,14 +524,13 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
        }
        /* now walk from the parent so we can get unopened fid */
-        fid = p9_client_walk(dfid, 1, &name, 0);
+        fid = p9_client_walk(dfid, 1, &name, 1);
        if (IS_ERR(fid)) {
                err = PTR_ERR(fid);
                P9_DPRINTK(P9_DEBUG_VFS, "p9_client_walk failed %d\n", err);
                fid = NULL;
                goto error;
-        } else
+        }
-                dfid = NULL;
        /* instantiate inode and assign the unopened fid to the dentry */
        inode = v9fs_inode_from_fid(v9ses, fid, dir->i_sb);
@@ -536,9 +553,6 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir,
        return ofid;
 error:
-        if (dfid)
-                p9_client_clunk(dfid);
        if (ofid)
                p9_client_clunk(ofid);
@@ -673,8 +687,8 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
        if (IS_ERR(fid)) {
                result = PTR_ERR(fid);
                if (result == -ENOENT) {
-                        d_add(dentry, NULL);
+                        inode = NULL;
-                        return NULL;
+                        goto inst_out;
                }
                return ERR_PTR(result);
@@ -691,7 +705,8 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry,
        if (result < 0)
                goto error;
-        if ((fid->qid.version) && (v9ses->cache))
+inst_out:
+        if (v9ses->cache)
                dentry->d_op = &v9fs_cached_dentry_operations;
        else
                dentry->d_op = &v9fs_dentry_operations;
@@ -770,6 +785,13 @@ v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
                goto clunk_olddir;
        }
+        if (v9fs_proto_dotl(v9ses)) {
+                retval = p9_client_rename(oldfid, newdirfid,
+                                        (char *) new_dentry->d_name.name);
+                if (retval != -ENOSYS)
+                        goto clunk_newdir;
+        }
        /* 9P can only handle file rename in the same directory */
        if (memcmp(&olddirfid->qid, &newdirfid->qid, sizeof(newdirfid->qid))) {
                P9_DPRINTK(P9_DEBUG_ERROR,
@@ -1195,6 +1217,8 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
                sprintf(name, "c %u %u", MAJOR(rdev), MINOR(rdev));
        else if (S_ISFIFO(mode))
                *name = 0;
+        else if (S_ISSOCK(mode))
+                *name = 0;
        else {
                __putname(name);
                return -EINVAL;
@@ -1206,7 +1230,21 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
        return retval;
 }
-static const struct inode_operations v9fs_dir_inode_operations_ext = {
+static const struct inode_operations v9fs_dir_inode_operations_dotu = {
+        .create = v9fs_vfs_create,
+        .lookup = v9fs_vfs_lookup,
+        .symlink = v9fs_vfs_symlink,
+        .link = v9fs_vfs_link,
+        .unlink = v9fs_vfs_unlink,
+        .mkdir = v9fs_vfs_mkdir,
+        .rmdir = v9fs_vfs_rmdir,
+        .mknod = v9fs_vfs_mknod,
+        .rename = v9fs_vfs_rename,
+        .getattr = v9fs_vfs_getattr,
+        .setattr = v9fs_vfs_setattr,
+};
+static const struct inode_operations v9fs_dir_inode_operations_dotl = {
        .create = v9fs_vfs_create,
        .lookup = v9fs_vfs_lookup,
        .symlink = v9fs_vfs_symlink,
@@ -1237,6 +1275,11 @@ static const struct inode_operations v9fs_file_inode_operations = {
        .setattr = v9fs_vfs_setattr,
 };
+static const struct inode_operations v9fs_file_inode_operations_dotl = {
+        .getattr = v9fs_vfs_getattr,
+        .setattr = v9fs_vfs_setattr,
+};
 static const struct inode_operations v9fs_symlink_inode_operations = {
        .readlink = generic_readlink,
        .follow_link = v9fs_vfs_follow_link,
@@ -1244,3 +1287,11 @@ static const struct inode_operations v9fs_symlink_inode_operations = {
        .getattr = v9fs_vfs_getattr,
        .setattr = v9fs_vfs_setattr,
 };
+static const struct inode_operations v9fs_symlink_inode_operations_dotl = {
+        .readlink = generic_readlink,
+        .follow_link = v9fs_vfs_follow_link,
+        .put_link = v9fs_vfs_put_link,
+        .getattr = v9fs_vfs_getattr,
+        .setattr = v9fs_vfs_setattr,
+};
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 806da5d3b3a0..be74d020436e 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -38,6 +38,7 @@
 #include <linux/idr.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
+#include <linux/statfs.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
@@ -45,7 +46,7 @@
 #include "v9fs_vfs.h"
 #include "fid.h"
-static const struct super_operations v9fs_super_ops;
+static const struct super_operations v9fs_super_ops, v9fs_super_ops_dotl;
 /**
 * v9fs_set_super - set the superblock
@@ -76,7 +77,10 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses,
        sb->s_blocksize_bits = fls(v9ses->maxdata - 1);
        sb->s_blocksize = 1 << sb->s_blocksize_bits;
        sb->s_magic = V9FS_MAGIC;
-        sb->s_op = &v9fs_super_ops;
+        if (v9fs_proto_dotl(v9ses))
+                sb->s_op = &v9fs_super_ops_dotl;
+        else
+                sb->s_op = &v9fs_super_ops;
        sb->s_bdi = &v9ses->bdi;
        sb->s_flags = flags | MS_ACTIVE | MS_SYNCHRONOUS | MS_DIRSYNC |
@@ -211,6 +215,42 @@ v9fs_umount_begin(struct super_block *sb)
        v9fs_session_begin_cancel(v9ses);
 }
+static int v9fs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+        struct v9fs_session_info *v9ses;
+        struct p9_fid *fid;
+        struct p9_rstatfs rs;
+        int res;
+        fid = v9fs_fid_lookup(dentry);
+        if (IS_ERR(fid)) {
+                res = PTR_ERR(fid);
+                goto done;
+        }
+        v9ses = v9fs_inode2v9ses(dentry->d_inode);
+        if (v9fs_proto_dotl(v9ses)) {
+                res = p9_client_statfs(fid, &rs);
+                if (res == 0) {
+                        buf->f_type = rs.type;
+                        buf->f_bsize = rs.bsize;
+                        buf->f_blocks = rs.blocks;
+                        buf->f_bfree = rs.bfree;
+                        buf->f_bavail = rs.bavail;
+                        buf->f_files = rs.files;
+                        buf->f_ffree = rs.ffree;
+                        buf->f_fsid.val[0] = rs.fsid & 0xFFFFFFFFUL;
+                        buf->f_fsid.val[1] = (rs.fsid >> 32) & 0xFFFFFFFFUL;
+                        buf->f_namelen = rs.namelen;
+                }
+                if (res != -ENOSYS)
+                        goto done;
+        }
+        res = simple_statfs(dentry, buf);
+done:
+        return res;
+}
 static const struct super_operations v9fs_super_ops = {
 #ifdef CONFIG_9P_FSCACHE
        .alloc_inode = v9fs_alloc_inode,
@@ -222,6 +262,17 @@ static const struct super_operations v9fs_super_ops = {
        .umount_begin = v9fs_umount_begin,
 };
+static const struct super_operations v9fs_super_ops_dotl = {
+#ifdef CONFIG_9P_FSCACHE
+        .alloc_inode = v9fs_alloc_inode,
+        .destroy_inode = v9fs_destroy_inode,
+#endif
+        .statfs = v9fs_statfs,
+        .clear_inode = v9fs_clear_inode,
+        .show_options = generic_show_options,
+        .umount_begin = v9fs_umount_begin,
+};
 struct file_system_type v9fs_fs_type = {
        .name = "9p",
        .get_sb = v9fs_get_sb,
diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c
index 23aa52f548a0..f4287e4de744 100644
--- a/fs/adfs/dir.c
+++ b/fs/adfs/dir.c
@@ -197,7 +197,7 @@ const struct file_operations adfs_dir_operations = {
        .read           = generic_read_dir,
        .llseek         = generic_file_llseek,
        .readdir        = adfs_readdir,
-        .fsync          = simple_fsync,
+        .fsync          = generic_file_fsync,
 };
 static int
diff --git a/fs/adfs/file.c b/fs/adfs/file.c
index 005ea34d1758..a36da5382b40 100644
--- a/fs/adfs/file.c
+++ b/fs/adfs/file.c
@@ -26,7 +26,7 @@ const struct file_operations adfs_file_operations = {
        .read           = do_sync_read,
        .aio_read       = generic_file_aio_read,
        .mmap           = generic_file_mmap,
-        .fsync          = simple_fsync,
+        .fsync          = generic_file_fsync,
        .write          = do_sync_write,
        .aio_write      = generic_file_aio_write,
        .splice_read    = generic_file_splice_read,
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index 0f5e30978135..6f850b06ab62 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -322,8 +322,9 @@ adfs_notify_change(struct dentry *dentry, struct iattr *attr)
        if (error)
                goto out;
+        /* XXX: this is missing some actual on-disk truncation.. */
        if (ia_valid & ATTR_SIZE)
-                error = vmtruncate(inode, attr->ia_size);
+                error = simple_setsize(inode, attr->ia_size);
        if (error)
                goto out;
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index 861dae68ac12..f05b6155ccc8 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -183,7 +183,7 @@ extern int			 affs_add_entry(struct inode *dir, struct inode *inode, struct dent
 void            affs_free_prealloc(struct inode *inode);
 extern void     affs_truncate(struct inode *);
-int             affs_file_fsync(struct file *, struct dentry *, int);
+int             affs_file_fsync(struct file *, int);
 /* dir.c */
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 184e55c1c9ba..322710c3eedf 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -916,9 +916,9 @@ affs_truncate(struct inode *inode)
        affs_free_prealloc(inode);
 }
-int affs_file_fsync(struct file *filp, struct dentry *dentry, int datasync)
+int affs_file_fsync(struct file *filp, int datasync)
 {
-        struct inode * inode = dentry->d_inode;
+        struct inode *inode = filp->f_mapping->host;
        int ret, err;
        ret = write_inode_now(inode, 0);
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index d70bbbac6b7b..914d1c0bc07a 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -224,7 +224,7 @@ affs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
                affs_brelse(bh);
                inode = affs_iget(sb, ino);
                if (IS_ERR(inode))
-                        return ERR_PTR(PTR_ERR(inode));
+                        return ERR_CAST(inode);
        }
        dentry->d_op = AFFS_SB(sb)->s_flags & SF_INTL ? &affs_intl_dentry_operations : &affs_dentry_operations;
        d_add(dentry, inode);
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 807f284cc75e..5f679b77ce24 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -740,7 +740,7 @@ extern void afs_pages_written_back(struct afs_vnode *, struct afs_call *);
 extern ssize_t afs_file_write(struct kiocb *, const struct iovec *,
                              unsigned long, loff_t);
 extern int afs_writeback_all(struct afs_vnode *);
-extern int afs_fsync(struct file *, struct dentry *, int);
+extern int afs_fsync(struct file *, int);
 /*****************************************************************************/
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 3bed54a294d4..3dab9e9948d0 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -701,8 +701,9 @@ int afs_writeback_all(struct afs_vnode *vnode)
 * - the return status from this call provides a reliable indication of
 *   whether any write errors occurred for this process.
 */
-int afs_fsync(struct file *file, struct dentry *dentry, int datasync)
+int afs_fsync(struct file *file, int datasync)
 {
+        struct dentry *dentry = file->f_path.dentry;
        struct afs_writeback *wb, *xwb;
        struct afs_vnode *vnode = AFS_FS_I(dentry->d_inode);
        int ret;
diff --git a/fs/aio.c b/fs/aio.c
index 1cf12b3dd83a..1ccf25cef1f0 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -36,6 +36,7 @@
 #include <linux/blkdev.h>
 #include <linux/mempool.h>
 #include <linux/hash.h>
+#include <linux/compat.h>
 #include <asm/kmap_types.h>
 #include <asm/uaccess.h>
@@ -526,7 +527,7 @@ static void aio_fput_routine(struct work_struct *data)
                /* Complete the fput(s) */
                if (req->ki_filp != NULL)
-                        __fput(req->ki_filp);
+                        fput(req->ki_filp);
                /* Link the iocb into the context's free list */
                spin_lock_irq(&ctx->ctx_lock);
@@ -559,11 +560,11 @@ static int __aio_put_req(struct kioctx *ctx, struct kiocb *req)
        /*
         * Try to optimize the aio and eventfd file* puts, by avoiding to
-         * schedule work in case it is not __fput() time. In normal cases,
+         * schedule work in case it is not final fput() time. In normal cases,
         * we would not be holding the last reference to the file*, so
         * this function will be executed w/out any aio kthread wakeup.
         */
-        if (unlikely(atomic_long_dec_and_test(&req->ki_filp->f_count))) {
+        if (unlikely(!fput_atomic(req->ki_filp))) {
                get_ioctx(ctx);
                spin_lock(&fput_lock);
                list_add(&req->ki_list, &fput_head);
@@ -1384,13 +1385,22 @@ static ssize_t aio_fsync(struct kiocb *iocb)
        return ret;
 }
-static ssize_t aio_setup_vectored_rw(int type, struct kiocb *kiocb)
+static ssize_t aio_setup_vectored_rw(int type, struct kiocb *kiocb, bool compat)
 {
        ssize_t ret;
-        ret = rw_copy_check_uvector(type, (struct iovec __user *)kiocb->ki_buf,
+#ifdef CONFIG_COMPAT
-                                    kiocb->ki_nbytes, 1,
+        if (compat)
-                                    &kiocb->ki_inline_vec, &kiocb->ki_iovec);
+                ret = compat_rw_copy_check_uvector(type,
+                                (struct compat_iovec __user *)kiocb->ki_buf,
+                                kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec,
+                                &kiocb->ki_iovec);
+        else
+#endif
+                ret = rw_copy_check_uvector(type,
+                                (struct iovec __user *)kiocb->ki_buf,
+                                kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec,
+                                &kiocb->ki_iovec);
        if (ret < 0)
                goto out;
@@ -1420,7 +1430,7 @@ static ssize_t aio_setup_single_vector(struct kiocb *kiocb)
 *      Performs the initial checks and aio retry method
 *      setup for the kiocb at the time of io submission.
 */
-static ssize_t aio_setup_iocb(struct kiocb *kiocb)
+static ssize_t aio_setup_iocb(struct kiocb *kiocb, bool compat)
 {
        struct file *file = kiocb->ki_filp;
        ssize_t ret = 0;
@@ -1469,7 +1479,7 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb)
                ret = security_file_permission(file, MAY_READ);
                if (unlikely(ret))
                        break;
-                ret = aio_setup_vectored_rw(READ, kiocb);
+                ret = aio_setup_vectored_rw(READ, kiocb, compat);
                if (ret)
                        break;
                ret = -EINVAL;
@@ -1483,7 +1493,7 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb)
                ret = security_file_permission(file, MAY_WRITE);
                if (unlikely(ret))
                        break;
-                ret = aio_setup_vectored_rw(WRITE, kiocb);
+                ret = aio_setup_vectored_rw(WRITE, kiocb, compat);
                if (ret)
                        break;
                ret = -EINVAL;
@@ -1548,7 +1558,8 @@ static void aio_batch_free(struct hlist_head *batch_hash)
 }
 static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
-                         struct iocb *iocb, struct hlist_head *batch_hash)
+                         struct iocb *iocb, struct hlist_head *batch_hash,
+                         bool compat)
 {
        struct kiocb *req;
        struct file *file;
@@ -1609,7 +1620,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
        req->ki_left = req->ki_nbytes = iocb->aio_nbytes;
        req->ki_opcode = iocb->aio_lio_opcode;
-        ret = aio_setup_iocb(req);
+        ret = aio_setup_iocb(req, compat);
        if (ret)
                goto out_put_req;
@@ -1637,20 +1648,8 @@ out_put_req:
        return ret;
 }
-/* sys_io_submit:
+long do_io_submit(aio_context_t ctx_id, long nr,
- *      Queue the nr iocbs pointed to by iocbpp for processing.  Returns
+                  struct iocb __user *__user *iocbpp, bool compat)
- *      the number of iocbs queued.  May return -EINVAL if the aio_context
- *      specified by ctx_id is invalid, if nr is < 0, if the iocb at
- *      *iocbpp[0] is not properly initialized, if the operation specified
- *      is invalid for the file descriptor in the iocb.  May fail with
- *      -EFAULT if any of the data structures point to invalid data.  May
- *      fail with -EBADF if the file descriptor specified in the first
- *      iocb is invalid.  May fail with -EAGAIN if insufficient resources
- *      are available to queue any iocbs.  Will return 0 if nr is 0.  Will
- *      fail with -ENOSYS if not implemented.
- */
-SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
-                struct iocb __user * __user *, iocbpp)
 {
        struct kioctx *ctx;
        long ret = 0;
@@ -1687,7 +1686,7 @@ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
                        break;
                }
-                ret = io_submit_one(ctx, user_iocb, &tmp, batch_hash);
+                ret = io_submit_one(ctx, user_iocb, &tmp, batch_hash, compat);
                if (ret)
                        break;
        }
@@ -1697,6 +1696,24 @@ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
        return i ? i : ret;
 }
+/* sys_io_submit:
+ *      Queue the nr iocbs pointed to by iocbpp for processing.  Returns
+ *      the number of iocbs queued.  May return -EINVAL if the aio_context
+ *      specified by ctx_id is invalid, if nr is < 0, if the iocb at
+ *      *iocbpp[0] is not properly initialized, if the operation specified
+ *      is invalid for the file descriptor in the iocb.  May fail with
+ *      -EFAULT if any of the data structures point to invalid data.  May
+ *      fail with -EBADF if the file descriptor specified in the first
+ *      iocb is invalid.  May fail with -EAGAIN if insufficient resources
+ *      are available to queue any iocbs.  Will return 0 if nr is 0.  Will
+ *      fail with -ENOSYS if not implemented.
+ */
+SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
+                struct iocb __user * __user *, iocbpp)
+{
+        return do_io_submit(ctx_id, nr, iocbpp, 0);
+}
 /* lookup_kiocb
 *      Finds a given iocb for cancellation.
 */
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 9bd4b3876c99..e4b75d6eda83 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -205,7 +205,7 @@ static struct inode *anon_inode_mkinode(void)
         * that it already _is_ on the dirty list.
         */
        inode->i_state = I_DIRTY;
-        inode->i_mode = S_IFREG | S_IRUSR | S_IWUSR;
+        inode->i_mode = S_IRUSR | S_IWUSR;
        inode->i_uid = current_fsuid();
        inode->i_gid = current_fsgid();
        inode->i_flags |= S_PRIVATE;
diff --git a/fs/attr.c b/fs/attr.c
index 0815e93bb487..b4fa3b0aa596 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -67,14 +67,14 @@ EXPORT_SYMBOL(inode_change_ok);
 * @offset:     the new size to assign to the inode
 * @Returns:    0 on success, -ve errno on failure
 *
+ * inode_newsize_ok must be called with i_mutex held.
+ *
 * inode_newsize_ok will check filesystem limits and ulimits to check that the
 * new inode size is within limits. inode_newsize_ok will also send SIGXFSZ
 * when necessary. Caller must not proceed with inode size change if failure is
 * returned. @inode must be a file (not directory), with appropriate
 * permissions to allow truncate (inode_newsize_ok does NOT check these
 * conditions).
- *
- * inode_newsize_ok must be called with i_mutex held.
 */
 int inode_newsize_ok(const struct inode *inode, loff_t offset)
 {
@@ -104,17 +104,25 @@ out_big:
 }
 EXPORT_SYMBOL(inode_newsize_ok);
-int inode_setattr(struct inode * inode, struct iattr * attr)
+/**
+ * generic_setattr - copy simple metadata updates into the generic inode
+ * @inode:      the inode to be updated
+ * @attr:       the new attributes
+ *
+ * generic_setattr must be called with i_mutex held.
+ *
+ * generic_setattr updates the inode's metadata with that specified
+ * in attr. Noticeably missing is inode size update, which is more complex
+ * as it requires pagecache updates. See simple_setsize.
+ *
+ * The inode is not marked as dirty after this operation. The rationale is
+ * that for "simple" filesystems, the struct inode is the inode storage.
+ * The caller is free to mark the inode dirty afterwards if needed.
+ */
+void generic_setattr(struct inode *inode, const struct iattr *attr)
 {
        unsigned int ia_valid = attr->ia_valid;
-        if (ia_valid & ATTR_SIZE &&
-            attr->ia_size != i_size_read(inode)) {
-                int error = vmtruncate(inode, attr->ia_size);
-                if (error)
-                        return error;
-        }
        if (ia_valid & ATTR_UID)
                inode->i_uid = attr->ia_uid;
        if (ia_valid & ATTR_GID)
@@ -135,6 +143,28 @@ int inode_setattr(struct inode * inode, struct iattr * attr)
                        mode &= ~S_ISGID;
                inode->i_mode = mode;
        }
+}
+EXPORT_SYMBOL(generic_setattr);
+/*
+ * note this function is deprecated, the new truncate sequence should be
+ * used instead -- see eg. simple_setsize, generic_setattr.
+ */
+int inode_setattr(struct inode *inode, const struct iattr *attr)
+{
+        unsigned int ia_valid = attr->ia_valid;
+        if (ia_valid & ATTR_SIZE &&
+            attr->ia_size != i_size_read(inode)) {
+                int error;
+                error = vmtruncate(inode, attr->ia_size);
+                if (error)
+                        return error;
+        }
+        generic_setattr(inode, attr);
        mark_inode_dirty(inode);
        return 0;
diff --git a/fs/autofs/root.c b/fs/autofs/root.c
index 8713c7cfbc79..9a0520b50663 100644
--- a/fs/autofs/root.c
+++ b/fs/autofs/root.c
@@ -28,6 +28,7 @@ static int autofs_root_mkdir(struct inode *,struct dentry *,int);
 static int autofs_root_ioctl(struct inode *, struct file *,unsigned int,unsigned long);
 const struct file_operations autofs_root_operations = {
+        .llseek         = generic_file_llseek,
        .read           = generic_read_dir,
        .readdir        = autofs_root_readdir,
        .ioctl          = autofs_root_ioctl,
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index d29b7f6df862..ba4a38b9c22f 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -95,7 +95,7 @@ static int check_dev_ioctl_version(int cmd, struct autofs_dev_ioctl *param)
 */
 static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *in)
 {
-        struct autofs_dev_ioctl tmp, *ads;
+        struct autofs_dev_ioctl tmp;
        if (copy_from_user(&tmp, in, sizeof(tmp)))
                return ERR_PTR(-EFAULT);
@@ -103,16 +103,7 @@ static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *i
        if (tmp.size < sizeof(tmp))
                return ERR_PTR(-EINVAL);
-        ads = kmalloc(tmp.size, GFP_KERNEL);
+        return memdup_user(in, tmp.size);
-        if (!ads)
-                return ERR_PTR(-ENOMEM);
-        if (copy_from_user(ads, in, tmp.size)) {
-                kfree(ads);
-                return ERR_PTR(-EFAULT);
-        }
-        return ads;
 }
 static inline void free_dev_ioctl(struct autofs_dev_ioctl *param)
@@ -736,11 +727,14 @@ static const struct file_operations _dev_ioctl_fops = {
 };
 static struct miscdevice _autofs_dev_ioctl_misc = {
-        .minor          = MISC_DYNAMIC_MINOR,
+        .minor          = AUTOFS_MINOR,
        .name           = AUTOFS_DEVICE_NAME,
        .fops           = &_dev_ioctl_fops
 };
+MODULE_ALIAS_MISCDEV(AUTOFS_MINOR);
+MODULE_ALIAS("devname:autofs");
 /* Register/deregister misc character device */
 int autofs_dev_ioctl_init(void)
 {
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index e8e5e63ac950..db4117ed7803 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -18,13 +18,14 @@
 #include <linux/slab.h>
 #include <linux/param.h>
 #include <linux/time.h>
+#include <linux/smp_lock.h>
 #include "autofs_i.h"
 static int autofs4_dir_symlink(struct inode *,struct dentry *,const char *);
 static int autofs4_dir_unlink(struct inode *,struct dentry *);
 static int autofs4_dir_rmdir(struct inode *,struct dentry *);
 static int autofs4_dir_mkdir(struct inode *,struct dentry *,int);
-static int autofs4_root_ioctl(struct inode *, struct file *,unsigned int,unsigned long);
+static long autofs4_root_ioctl(struct file *,unsigned int,unsigned long);
 static int autofs4_dir_open(struct inode *inode, struct file *file);
 static struct dentry *autofs4_lookup(struct inode *,struct dentry *, struct nameidata *);
 static void *autofs4_follow_link(struct dentry *, struct nameidata *);
@@ -38,7 +39,7 @@ const struct file_operations autofs4_root_operations = {
        .read           = generic_read_dir,
        .readdir        = dcache_readdir,
        .llseek         = dcache_dir_lseek,
-        .ioctl          = autofs4_root_ioctl,
+        .unlocked_ioctl = autofs4_root_ioctl,
 };
 const struct file_operations autofs4_dir_operations = {
@@ -902,8 +903,8 @@ int is_autofs4_dentry(struct dentry *dentry)
 * ioctl()'s on the root directory is the chief method for the daemon to
 * generate kernel reactions
 */
-static int autofs4_root_ioctl(struct inode *inode, struct file *filp,
+static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp,
-                             unsigned int cmd, unsigned long arg)
+                                       unsigned int cmd, unsigned long arg)
 {
        struct autofs_sb_info *sbi = autofs4_sbi(inode->i_sb);
        void __user *p = (void __user *)arg;
@@ -947,3 +948,16 @@ static int autofs4_root_ioctl(struct inode *inode, struct file *filp,
                return -ENOSYS;
        }
 }
+static long autofs4_root_ioctl(struct file *filp,
+                               unsigned int cmd, unsigned long arg)
+{       /* .unlocked_ioctl handler: runs autofs4_root_ioctl_unlocked() under lock_kernel() */
+        long ret;
+        struct inode *inode = filp->f_dentry->d_inode; /* this signature has no inode argument; take it from filp */
+        lock_kernel();  /* held for the whole wrapped call; NOTE(review): presumably keeps the locking of the old .ioctl path - confirm */
+        ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, arg);
+        unlock_kernel();
+        return ret;     /* int result of the wrapped handler, returned as long */
+}
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index a05287a23f62..52e59bf4aa5f 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -93,8 +93,7 @@ static int bad_file_release(struct inode *inode, struct file *filp)
        return -EIO;
 }
-static int bad_file_fsync(struct file *file, struct dentry *dentry,
+static int bad_file_fsync(struct file *file, int datasync)
-                        int datasync)
 {
        return -EIO;
 }
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index 8f73841fc974..d967e052b779 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -78,7 +78,7 @@ static int bfs_readdir(struct file *f, void *dirent, filldir_t filldir)
 const struct file_operations bfs_dir_operations = {
        .read           = generic_read_dir,
        .readdir        = bfs_readdir,
-        .fsync          = simple_fsync,
+        .fsync          = generic_file_fsync,
        .llseek         = generic_file_llseek,
 };
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 26e5f5026620..7346c96308a5 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -172,8 +172,9 @@ blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
        struct file *file = iocb->ki_filp;
        struct inode *inode = file->f_mapping->host;
-        return blockdev_direct_IO_no_locking(rw, iocb, inode, I_BDEV(inode),
+        return blockdev_direct_IO_no_locking_newtrunc(rw, iocb, inode,
-                                iov, offset, nr_segs, blkdev_get_blocks, NULL);
+                                I_BDEV(inode), iov, offset, nr_segs,
+                                blkdev_get_blocks, NULL);
 }
 int __sync_blockdev(struct block_device *bdev, int wait)
@@ -309,8 +310,8 @@ static int blkdev_write_begin(struct file *file, struct address_space *mapping,
                        struct page **pagep, void **fsdata)
 {
        *pagep = NULL;
-        return block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
+        return block_write_begin_newtrunc(file, mapping, pos, len, flags,
-                                blkdev_get_block);
+                                pagep, fsdata, blkdev_get_block);
 }
 static int blkdev_write_end(struct file *file, struct address_space *mapping,
@@ -358,12 +359,7 @@ static loff_t block_llseek(struct file *file, loff_t offset, int origin)
        return retval;
 }
        
-/*
+int blkdev_fsync(struct file *filp, int datasync)
- *      Filp is never NULL; the only case when ->fsync() is called with
- *      NULL first argument is nfsd_sync_dir() and that's not a directory.
- */
- 
-int blkdev_fsync(struct file *filp, struct dentry *dentry, int datasync)
 {
        struct inode *bd_inode = filp->f_mapping->host;
        struct block_device *bdev = I_BDEV(bd_inode);
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 462859a30141..7ec14097fef1 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -377,6 +377,7 @@ again:
                                if (!list_empty(&worker->pending) ||
                                    !list_empty(&worker->prio_pending)) {
                                        spin_unlock_irq(&worker->lock);
+                                        set_current_state(TASK_RUNNING);
                                        goto again;
                                }
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 7a4dee199832..6ad63f17eca0 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -137,8 +137,8 @@ struct btrfs_inode {
         * of extent items we've reserved metadata for.
         */
        spinlock_t accounting_lock;
+        atomic_t outstanding_extents;
        int reserved_extents;
-        int outstanding_extents;
        /*
         * ordered_data_close is set by truncate when a file that used
@@ -151,6 +151,7 @@ struct btrfs_inode {
         * of these.
         */
        unsigned ordered_data_close:1;
+        unsigned orphan_meta_reserved:1;
        unsigned dummy_inode:1;
        /*
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 6795a713b205..0d1d966b0fe4 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -280,7 +280,8 @@ int btrfs_block_can_be_shared(struct btrfs_root *root,
 static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
                                       struct btrfs_root *root,
                                       struct extent_buffer *buf,
-                                       struct extent_buffer *cow)
+                                       struct extent_buffer *cow,
+                                       int *last_ref)
 {
        u64 refs;
        u64 owner;
@@ -366,6 +367,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
                        BUG_ON(ret);
                }
                clean_tree_block(trans, root, buf);
+                *last_ref = 1;
        }
        return 0;
 }
@@ -392,6 +394,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
        struct btrfs_disk_key disk_key;
        struct extent_buffer *cow;
        int level;
+        int last_ref = 0;
        int unlock_orig = 0;
        u64 parent_start;
@@ -442,7 +445,10 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
                            (unsigned long)btrfs_header_fsid(cow),
                            BTRFS_FSID_SIZE);
-        update_ref_for_cow(trans, root, buf, cow);
+        update_ref_for_cow(trans, root, buf, cow, &last_ref);
+        if (root->ref_cows)
+                btrfs_reloc_cow_block(trans, root, buf, cow);
        if (buf == root->node) {
                WARN_ON(parent && parent != buf);
@@ -457,8 +463,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
                extent_buffer_get(cow);
                spin_unlock(&root->node_lock);
-                btrfs_free_tree_block(trans, root, buf->start, buf->len,
+                btrfs_free_tree_block(trans, root, buf, parent_start,
-                                parent_start, root->root_key.objectid, level);
+                                      last_ref);
                free_extent_buffer(buf);
                add_root_to_dirty_list(root);
        } else {
@@ -473,8 +479,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
                btrfs_set_node_ptr_generation(parent, parent_slot,
                                              trans->transid);
                btrfs_mark_buffer_dirty(parent);
-                btrfs_free_tree_block(trans, root, buf->start, buf->len,
+                btrfs_free_tree_block(trans, root, buf, parent_start,
-                                parent_start, root->root_key.objectid, level);
+                                      last_ref);
        }
        if (unlock_orig)
                btrfs_tree_unlock(buf);
@@ -949,6 +955,22 @@ int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
        return bin_search(eb, key, level, slot);
 }
+static void root_add_used(struct btrfs_root *root, u32 size)
+{       /* add @size to the 'used' value kept in root->root_item (callers here pass block sizes, e.g. root->nodesize) */
+        spin_lock(&root->accounting_lock);     /* the read-then-set below must not interleave with another updater */
+        btrfs_set_root_used(&root->root_item,
+                            btrfs_root_used(&root->root_item) + size);
+        spin_unlock(&root->accounting_lock);
+}
+static void root_sub_used(struct btrfs_root *root, u32 size)
+{       /* subtract @size from the 'used' value kept in root->root_item (callers here pass a freed buffer's ->len) */
+        spin_lock(&root->accounting_lock);     /* the read-then-set below must not interleave with another updater */
+        btrfs_set_root_used(&root->root_item,
+                            btrfs_root_used(&root->root_item) - size);  /* no check that the stored value is >= size */
+        spin_unlock(&root->accounting_lock);
+}
 /* given a node and slot number, this reads the blocks it points to.  The
 * extent buffer is returned with a reference taken (but unlocked).
 * NULL is returned on error.
@@ -1019,7 +1041,11 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
                btrfs_tree_lock(child);
                btrfs_set_lock_blocking(child);
                ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
-                BUG_ON(ret);
+                if (ret) {
+                        btrfs_tree_unlock(child);
+                        free_extent_buffer(child);
+                        goto enospc;
+                }
                spin_lock(&root->node_lock);
                root->node = child;
@@ -1034,11 +1060,12 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
                btrfs_tree_unlock(mid);
                /* once for the path */
                free_extent_buffer(mid);
-                ret = btrfs_free_tree_block(trans, root, mid->start, mid->len,
-                                            0, root->root_key.objectid, level);
+                root_sub_used(root, mid->len);
+                btrfs_free_tree_block(trans, root, mid, 0, 1);
                /* once for the root ptr */
                free_extent_buffer(mid);
-                return ret;
+                return 0;
        }
        if (btrfs_header_nritems(mid) >
            BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
@@ -1088,23 +1115,16 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
                if (wret < 0 && wret != -ENOSPC)
                        ret = wret;
                if (btrfs_header_nritems(right) == 0) {
-                        u64 bytenr = right->start;
-                        u32 blocksize = right->len;
                        clean_tree_block(trans, root, right);
                        btrfs_tree_unlock(right);
-                        free_extent_buffer(right);
-                        right = NULL;
                        wret = del_ptr(trans, root, path, level + 1, pslot +
                                       1);
                        if (wret)
                                ret = wret;
-                        wret = btrfs_free_tree_block(trans, root,
+                        root_sub_used(root, right->len);
-                                                     bytenr, blocksize, 0,
+                        btrfs_free_tree_block(trans, root, right, 0, 1);
-                                                     root->root_key.objectid,
+                        free_extent_buffer(right);
-                                                     level);
+                        right = NULL;
-                        if (wret)
-                                ret = wret;
                } else {
                        struct btrfs_disk_key right_key;
                        btrfs_node_key(right, &right_key, 0);
@@ -1136,21 +1156,15 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
                BUG_ON(wret == 1);
        }
        if (btrfs_header_nritems(mid) == 0) {
-                /* we've managed to empty the middle node, drop it */
-                u64 bytenr = mid->start;
-                u32 blocksize = mid->len;
                clean_tree_block(trans, root, mid);
                btrfs_tree_unlock(mid);
-                free_extent_buffer(mid);
-                mid = NULL;
                wret = del_ptr(trans, root, path, level + 1, pslot);
                if (wret)
                        ret = wret;
-                wret = btrfs_free_tree_block(trans, root, bytenr, blocksize,
+                root_sub_used(root, mid->len);
-                                         0, root->root_key.objectid, level);
+                btrfs_free_tree_block(trans, root, mid, 0, 1);
-                if (wret)
+                free_extent_buffer(mid);
-                        ret = wret;
+                mid = NULL;
        } else {
                /* update the parent key to reflect our changes */
                struct btrfs_disk_key mid_key;
@@ -1590,7 +1604,7 @@ read_block_for_search(struct btrfs_trans_handle *trans,
        btrfs_release_path(NULL, p);
        ret = -EAGAIN;
-        tmp = read_tree_block(root, blocknr, blocksize, gen);
+        tmp = read_tree_block(root, blocknr, blocksize, 0);
        if (tmp) {
                /*
                 * If the read above didn't mark this buffer up to date,
@@ -1740,7 +1754,6 @@ again:
                                              p->nodes[level + 1],
                                              p->slots[level + 1], &b);
                        if (err) {
-                                free_extent_buffer(b);
                                ret = err;
                                goto done;
                        }
@@ -2076,6 +2089,8 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
        if (IS_ERR(c))
                return PTR_ERR(c);
+        root_add_used(root, root->nodesize);
        memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header));
        btrfs_set_header_nritems(c, 1);
        btrfs_set_header_level(c, level);
@@ -2134,6 +2149,7 @@ static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root
        int nritems;
        BUG_ON(!path->nodes[level]);
+        btrfs_assert_tree_locked(path->nodes[level]);
        lower = path->nodes[level];
        nritems = btrfs_header_nritems(lower);
        BUG_ON(slot > nritems);
@@ -2202,6 +2218,8 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
        if (IS_ERR(split))
                return PTR_ERR(split);
+        root_add_used(root, root->nodesize);
        memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header));
        btrfs_set_header_level(split, btrfs_header_level(c));
        btrfs_set_header_bytenr(split, split->start);
@@ -2415,6 +2433,9 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
        if (left_nritems)
                btrfs_mark_buffer_dirty(left);
+        else
+                clean_tree_block(trans, root, left);
        btrfs_mark_buffer_dirty(right);
        btrfs_item_key(right, &disk_key, 0);
@@ -2660,6 +2681,8 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
        btrfs_mark_buffer_dirty(left);
        if (right_nritems)
                btrfs_mark_buffer_dirty(right);
+        else
+                clean_tree_block(trans, root, right);
        btrfs_item_key(right, &disk_key, 0);
        wret = fixup_low_keys(trans, root, path, &disk_key, 1);
@@ -2669,8 +2692,6 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
        /* then fixup the leaf pointer in the path */
        if (path->slots[0] < push_items) {
                path->slots[0] += old_left_nritems;
-                if (btrfs_header_nritems(path->nodes[0]) == 0)
-                        clean_tree_block(trans, root, path->nodes[0]);
                btrfs_tree_unlock(path->nodes[0]);
                free_extent_buffer(path->nodes[0]);
                path->nodes[0] = left;
@@ -2932,10 +2953,10 @@ again:
        right = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
                                        root->root_key.objectid,
                                        &disk_key, 0, l->start, 0);
-        if (IS_ERR(right)) {
+        if (IS_ERR(right))
-                BUG_ON(1);
                return PTR_ERR(right);
-        }
+        root_add_used(root, root->leafsize);
        memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
        btrfs_set_header_bytenr(right, right->start);
@@ -3054,7 +3075,8 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
        btrfs_set_path_blocking(path);
        ret = split_leaf(trans, root, &key, path, ins_len, 1);
-        BUG_ON(ret);
+        if (ret)
+                goto err;
        path->keep_locks = 0;
        btrfs_unlock_up_safe(path, 1);
@@ -3796,9 +3818,10 @@ static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
         */
        btrfs_unlock_up_safe(path, 0);
-        ret = btrfs_free_tree_block(trans, root, leaf->start, leaf->len,
+        root_sub_used(root, leaf->len);
-                                    0, root->root_key.objectid, 0);
-        return ret;
+        btrfs_free_tree_block(trans, root, leaf, 0, 1);
+        return 0;
 }
 /*
 * delete the item at the leaf level in path.  If that empties
@@ -3865,6 +3888,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                if (leaf == root->node) {
                        btrfs_set_header_level(leaf, 0);
                } else {
+                        btrfs_set_path_blocking(path);
+                        clean_tree_block(trans, root, leaf);
                        ret = btrfs_del_leaf(trans, root, path, leaf);
                        BUG_ON(ret);
                }
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 746a7248678e..29c20092847e 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -34,6 +34,7 @@
 struct btrfs_trans_handle;
 struct btrfs_transaction;
+struct btrfs_pending_snapshot;
 extern struct kmem_cache *btrfs_trans_handle_cachep;
 extern struct kmem_cache *btrfs_transaction_cachep;
 extern struct kmem_cache *btrfs_bit_radix_cachep;
@@ -663,6 +664,7 @@ struct btrfs_csum_item {
 #define BTRFS_BLOCK_GROUP_RAID1    (1 << 4)
 #define BTRFS_BLOCK_GROUP_DUP      (1 << 5)
 #define BTRFS_BLOCK_GROUP_RAID10   (1 << 6)
+#define BTRFS_NR_RAID_TYPES        5
 struct btrfs_block_group_item {
        __le64 used;
@@ -674,42 +676,46 @@ struct btrfs_space_info {
        u64 flags;
        u64 total_bytes;        /* total bytes in the space */
-        u64 bytes_used;         /* total bytes used on disk */
+        u64 bytes_used;         /* total bytes used,
+                                   this doesn't take mirrors into account */
        u64 bytes_pinned;       /* total bytes pinned, will be freed when the
                                   transaction finishes */
        u64 bytes_reserved;     /* total bytes the allocator has reserved for
                                   current allocations */
        u64 bytes_readonly;     /* total bytes that are read only */
-        u64 bytes_super;        /* total bytes reserved for the super blocks */
-        u64 bytes_root;         /* the number of bytes needed to commit a
-                                   transaction */
        u64 bytes_may_use;      /* number of bytes that may be used for
                                   delalloc/allocations */
-        u64 bytes_delalloc;     /* number of bytes currently reserved for
+        u64 disk_used;          /* total bytes used on disk */
-                                   delayed allocation */
        int full;               /* indicates that we cannot allocate any more
                                   chunks for this space */
        int force_alloc;        /* set if we need to force a chunk alloc for
                                   this space */
-        int force_delalloc;     /* make people start doing filemap_flush until
-                                   we're under a threshold */
        struct list_head list;
-        /* for controlling how we free up space for allocations */
-        wait_queue_head_t allocate_wait;
-        wait_queue_head_t flush_wait;
-        int allocating_chunk;
-        int flushing;
        /* for block groups in our same type */
-        struct list_head block_groups;
+        struct list_head block_groups[BTRFS_NR_RAID_TYPES];
        spinlock_t lock;
        struct rw_semaphore groups_sem;
        atomic_t caching_threads;
 };
+struct btrfs_block_rsv {        /* block reservation; embedded in btrfs_fs_info and pointed to from btrfs_root */
+        u64 size;               /* NOTE(review): presumably the intended size of the reservation - confirm */
+        u64 reserved;           /* NOTE(review): presumably the amount currently reserved - confirm */
+        u64 freed[2];           /* NOTE(review): meaning of the two slots is not visible in this chunk - confirm */
+        struct btrfs_space_info *space_info;    /* the btrfs_space_info associated with this reservation */
+        struct list_head list;  /* NOTE(review): likely the link for fs_info->durable_block_rsv_list - confirm */
+        spinlock_t lock;
+        atomic_t usage;         /* NOTE(review): looks like a reference count - confirm */
+        unsigned int priority:8;
+        unsigned int durable:1; /* NOTE(review): cf. "block reservations that cross multiple transactions" - confirm */
+        unsigned int refill_used:1;
+        unsigned int full:1;
+};
 /*
 * free clusters are used to claim free space in relatively large chunks,
 * allowing us to do less seeky writes.  They are used for all metadata
@@ -760,6 +766,7 @@ struct btrfs_block_group_cache {
        spinlock_t lock;
        u64 pinned;
        u64 reserved;
+        u64 reserved_pinned;
        u64 bytes_super;
        u64 flags;
        u64 sectorsize;
@@ -825,6 +832,22 @@ struct btrfs_fs_info {
        /* logical->physical extent mapping */
        struct btrfs_mapping_tree mapping_tree;
+        /* block reservation for extent, checksum and root tree */
+        struct btrfs_block_rsv global_block_rsv;
+        /* block reservation for delayed allocation */
+        struct btrfs_block_rsv delalloc_block_rsv;
+        /* block reservation for metadata operations */
+        struct btrfs_block_rsv trans_block_rsv;
+        /* block reservation for chunk tree */
+        struct btrfs_block_rsv chunk_block_rsv;
+        struct btrfs_block_rsv empty_block_rsv;
+        /* list of block reservations that cross multiple transactions */
+        struct list_head durable_block_rsv_list;
+        struct mutex durable_block_rsv_mutex;
        u64 generation;
        u64 last_trans_committed;
@@ -927,7 +950,6 @@ struct btrfs_fs_info {
        struct btrfs_workers endio_meta_write_workers;
        struct btrfs_workers endio_write_workers;
        struct btrfs_workers submit_workers;
-        struct btrfs_workers enospc_workers;
        /*
         * fixup workers take dirty pages that didn't properly go through
         * the cow mechanism and make them safe to write.  It happens
@@ -943,6 +965,7 @@ struct btrfs_fs_info {
        int do_barriers;
        int closing;
        int log_root_recovering;
+        int enospc_unlink;
        u64 total_pinned;
@@ -1012,6 +1035,9 @@ struct btrfs_root {
        struct completion kobj_unregister;
        struct mutex objectid_mutex;
+        spinlock_t accounting_lock;
+        struct btrfs_block_rsv *block_rsv;
        struct mutex log_mutex;
        wait_queue_head_t log_writer_wait;
        wait_queue_head_t log_commit_wait[2];
@@ -1043,7 +1069,6 @@ struct btrfs_root {
        int ref_cows;
        int track_dirty;
        int in_radix;
-        int clean_orphans;
        u64 defrag_trans_start;
        struct btrfs_key defrag_progress;
@@ -1057,8 +1082,11 @@ struct btrfs_root {
        struct list_head root_list;
-        spinlock_t list_lock;
+        spinlock_t orphan_lock;
        struct list_head orphan_list;
+        struct btrfs_block_rsv *orphan_block_rsv;
+        int orphan_item_inserted;
+        int orphan_cleanup_state;
        spinlock_t inode_lock;
        /* red-black tree that keeps track of in-memory inodes */
@@ -1965,6 +1993,9 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
                           struct btrfs_root *root, unsigned long count);
 int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
+int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
+                             struct btrfs_root *root, u64 bytenr,
+                             u64 num_bytes, u64 *refs, u64 *flags);
 int btrfs_pin_extent(struct btrfs_root *root,
                     u64 bytenr, u64 num, int reserved);
 int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
@@ -1984,10 +2015,10 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
                                        u64 parent, u64 root_objectid,
                                        struct btrfs_disk_key *key, int level,
                                        u64 hint, u64 empty_size);
-int btrfs_free_tree_block(struct btrfs_trans_handle *trans,
+void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
-                          struct btrfs_root *root,
+                           struct btrfs_root *root,
-                          u64 bytenr, u32 blocksize,
+                           struct extent_buffer *buf,
-                          u64 parent, u64 root_objectid, int level);
+                           u64 parent, int last_ref);
 struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
                                            struct btrfs_root *root,
                                            u64 bytenr, u32 blocksize,
@@ -2041,27 +2072,49 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
                           u64 size);
 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root, u64 group_start);
-int btrfs_prepare_block_group_relocation(struct btrfs_root *root,
-                                struct btrfs_block_group_cache *group);
 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
 void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde);
 void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
+int btrfs_check_data_free_space(struct inode *inode, u64 bytes);
-int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items);
+void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
-int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items);
+int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
-int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
+                                struct btrfs_root *root,
-                                          struct inode *inode, int num_items);
+                                int num_items, int *retries);
-int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root,
+void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
-                                        struct inode *inode, int num_items);
+                                struct btrfs_root *root);
-int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
+int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
-                                u64 bytes);
+                                  struct inode *inode);
-void btrfs_free_reserved_data_space(struct btrfs_root *root,
+void btrfs_orphan_release_metadata(struct inode *inode);
-                                    struct inode *inode, u64 bytes);
+int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
-void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
+                                struct btrfs_pending_snapshot *pending);
-                                 u64 bytes);
+int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes);
-void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
+void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes);
-                              u64 bytes);
+int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes);
+void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes);
+void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv);
+struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root);
+void btrfs_free_block_rsv(struct btrfs_root *root,
+                          struct btrfs_block_rsv *rsv);
+void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
+                                 struct btrfs_block_rsv *rsv);
+int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
+                        struct btrfs_root *root,
+                        struct btrfs_block_rsv *block_rsv,
+                        u64 num_bytes, int *retries);
+int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
+                          struct btrfs_root *root,
+                          struct btrfs_block_rsv *block_rsv,
+                          u64 min_reserved, int min_factor);
+int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
+                            struct btrfs_block_rsv *dst_rsv,
+                            u64 num_bytes);
+void btrfs_block_rsv_release(struct btrfs_root *root,
+                             struct btrfs_block_rsv *block_rsv,
+                             u64 num_bytes);
+int btrfs_set_block_group_ro(struct btrfs_root *root,
+                             struct btrfs_block_group_cache *cache);
+int btrfs_set_block_group_rw(struct btrfs_root *root,
+                             struct btrfs_block_group_cache *cache);
 /* ctree.c */
 int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
                     int level, int *slot);
@@ -2152,7 +2205,8 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
 int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
 int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf);
-int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref);
+int btrfs_drop_snapshot(struct btrfs_root *root,
+                        struct btrfs_block_rsv *block_rsv, int update_ref);
 int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
                        struct btrfs_root *root,
                        struct extent_buffer *node,
@@ -2245,6 +2299,12 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
                           struct btrfs_root *root,
                           const char *name, int name_len,
                           u64 inode_objectid, u64 ref_objectid, u64 *index);
+struct btrfs_inode_ref *
+btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans,
+                        struct btrfs_root *root,
+                        struct btrfs_path *path,
+                        const char *name, int name_len,
+                        u64 inode_objectid, u64 ref_objectid, int mod);
 int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root,
                             struct btrfs_path *path, u64 objectid);
@@ -2257,6 +2317,8 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
                    struct btrfs_root *root, u64 bytenr, u64 len);
 int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
                          struct bio *bio, u32 *dst);
+int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
+                              struct bio *bio, u64 logical_offset, u32 *dst);
 int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root,
                             u64 objectid, u64 pos,
@@ -2311,6 +2373,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
                               u32 min_type);
 int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
+int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput);
 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
                              struct extent_state **cached_state);
 int btrfs_writepages(struct address_space *mapping,
@@ -2349,10 +2412,20 @@ int btrfs_update_inode(struct btrfs_trans_handle *trans,
 int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode);
 int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
 void btrfs_orphan_cleanup(struct btrfs_root *root);
+void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
+                                struct btrfs_pending_snapshot *pending,
+                                u64 *bytes_to_reserve);
+void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
+                                struct btrfs_pending_snapshot *pending);
+void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
+                              struct btrfs_root *root);
 int btrfs_cont_expand(struct inode *inode, loff_t size);
 int btrfs_invalidate_inodes(struct btrfs_root *root);
 void btrfs_add_delayed_iput(struct inode *inode);
 void btrfs_run_delayed_iputs(struct btrfs_root *root);
+int btrfs_prealloc_file_range(struct inode *inode, int mode,
+                              u64 start, u64 num_bytes, u64 min_size,
+                              loff_t actual_len, u64 *alloc_hint);
 extern const struct dentry_operations btrfs_dentry_operations;
 /* ioctl.c */
@@ -2361,7 +2434,7 @@ void btrfs_update_iflags(struct inode *inode);
 void btrfs_inherit_iflags(struct inode *inode, struct inode *dir);
 /* file.c */
-int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync);
+int btrfs_sync_file(struct file *file, int datasync);
 int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
                            int skip_pinned);
 int btrfs_check_file(struct btrfs_root *root, struct inode *inode);
@@ -2409,4 +2482,12 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
                            struct btrfs_root *root);
 int btrfs_recover_relocation(struct btrfs_root *root);
 int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len);
+void btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
+                           struct btrfs_root *root, struct extent_buffer *buf,
+                           struct extent_buffer *cow);
+void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans,
+                              struct btrfs_pending_snapshot *pending,
+                              u64 *bytes_to_reserve);
+void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
+                              struct btrfs_pending_snapshot *pending);
 #endif
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 902ce507c4e3..e807b143b857 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -319,107 +319,6 @@ out:
 }
 /*
- * helper function to lookup reference count and flags of extent.
- *
- * the head node for delayed ref is used to store the sum of all the
- * reference count modifications queued up in the rbtree. the head
- * node may also store the extent flags to set. This way you can check
- * to see what the reference count and extent flags would be if all of
- * the delayed refs are not processed.
- */
-int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
-                             struct btrfs_root *root, u64 bytenr,
-                             u64 num_bytes, u64 *refs, u64 *flags)
-{
-        struct btrfs_delayed_ref_node *ref;
-        struct btrfs_delayed_ref_head *head;
-        struct btrfs_delayed_ref_root *delayed_refs;
-        struct btrfs_path *path;
-        struct btrfs_extent_item *ei;
-        struct extent_buffer *leaf;
-        struct btrfs_key key;
-        u32 item_size;
-        u64 num_refs;
-        u64 extent_flags;
-        int ret;
-        path = btrfs_alloc_path();
-        if (!path)
-                return -ENOMEM;
-        key.objectid = bytenr;
-        key.type = BTRFS_EXTENT_ITEM_KEY;
-        key.offset = num_bytes;
-        delayed_refs = &trans->transaction->delayed_refs;
-again:
-        ret = btrfs_search_slot(trans, root->fs_info->extent_root,
-                                &key, path, 0, 0);
-        if (ret < 0)
-                goto out;
-        if (ret == 0) {
-                leaf = path->nodes[0];
-                item_size = btrfs_item_size_nr(leaf, path->slots[0]);
-                if (item_size >= sizeof(*ei)) {
-                        ei = btrfs_item_ptr(leaf, path->slots[0],
-                                            struct btrfs_extent_item);
-                        num_refs = btrfs_extent_refs(leaf, ei);
-                        extent_flags = btrfs_extent_flags(leaf, ei);
-                } else {
-#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
-                        struct btrfs_extent_item_v0 *ei0;
-                        BUG_ON(item_size != sizeof(*ei0));
-                        ei0 = btrfs_item_ptr(leaf, path->slots[0],
-                                             struct btrfs_extent_item_v0);
-                        num_refs = btrfs_extent_refs_v0(leaf, ei0);
-                        /* FIXME: this isn't correct for data */
-                        extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
-#else
-                        BUG();
-#endif
-                }
-                BUG_ON(num_refs == 0);
-        } else {
-                num_refs = 0;
-                extent_flags = 0;
-                ret = 0;
-        }
-        spin_lock(&delayed_refs->lock);
-        ref = find_ref_head(&delayed_refs->root, bytenr, NULL);
-        if (ref) {
-                head = btrfs_delayed_node_to_head(ref);
-                if (!mutex_trylock(&head->mutex)) {
-                        atomic_inc(&ref->refs);
-                        spin_unlock(&delayed_refs->lock);
-                        btrfs_release_path(root->fs_info->extent_root, path);
-                        mutex_lock(&head->mutex);
-                        mutex_unlock(&head->mutex);
-                        btrfs_put_delayed_ref(ref);
-                        goto again;
-                }
-                if (head->extent_op && head->extent_op->update_flags)
-                        extent_flags |= head->extent_op->flags_to_set;
-                else
-                        BUG_ON(num_refs == 0);
-                num_refs += ref->ref_mod;
-                mutex_unlock(&head->mutex);
-        }
-        WARN_ON(num_refs == 0);
-        if (refs)
-                *refs = num_refs;
-        if (flags)
-                *flags = extent_flags;
-out:
-        spin_unlock(&delayed_refs->lock);
-        btrfs_free_path(path);
-        return ret;
-}
-/*
 * helper function to update an extent delayed ref in the
 * rbtree.  existing and update must both have the same
 * bytenr and parent
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index f6fc67ddad36..50e3cf92fbda 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -167,9 +167,6 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
 struct btrfs_delayed_ref_head *
 btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr);
 int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr);
-int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
-                             struct btrfs_root *root, u64 bytenr,
-                             u64 num_bytes, u64 *refs, u64 *flags);
 int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
                          u64 bytenr, u64 num_bytes, u64 orig_parent,
                          u64 parent, u64 orig_ref_root, u64 ref_root,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index feca04197d02..f3b287c22caf 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -74,6 +74,11 @@ struct async_submit_bio {
        int rw;
        int mirror_num;
        unsigned long bio_flags;
+        /*
+         * bio_offset is optional, can be used if the pages in the bio
+         * can't tell us where in the file the bio should go
+         */
+        u64 bio_offset;
        struct btrfs_work work;
 };
@@ -534,7 +539,8 @@ static void run_one_async_start(struct btrfs_work *work)
        async = container_of(work, struct  async_submit_bio, work);
        fs_info = BTRFS_I(async->inode)->root->fs_info;
        async->submit_bio_start(async->inode, async->rw, async->bio,
-                               async->mirror_num, async->bio_flags);
+                               async->mirror_num, async->bio_flags,
+                               async->bio_offset);
 }
 static void run_one_async_done(struct btrfs_work *work)
@@ -556,7 +562,8 @@ static void run_one_async_done(struct btrfs_work *work)
                wake_up(&fs_info->async_submit_wait);
        async->submit_bio_done(async->inode, async->rw, async->bio,
-                               async->mirror_num, async->bio_flags);
+                               async->mirror_num, async->bio_flags,
+                               async->bio_offset);
 }
 static void run_one_async_free(struct btrfs_work *work)
@@ -570,6 +577,7 @@ static void run_one_async_free(struct btrfs_work *work)
 int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
                        int rw, struct bio *bio, int mirror_num,
                        unsigned long bio_flags,
+                        u64 bio_offset,
                        extent_submit_bio_hook_t *submit_bio_start,
                        extent_submit_bio_hook_t *submit_bio_done)
 {
@@ -592,6 +600,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
        async->work.flags = 0;
        async->bio_flags = bio_flags;
+        async->bio_offset = bio_offset;
        atomic_inc(&fs_info->nr_async_submits);
@@ -627,7 +636,8 @@ static int btree_csum_one_bio(struct bio *bio)
 static int __btree_submit_bio_start(struct inode *inode, int rw,
                                    struct bio *bio, int mirror_num,
-                                    unsigned long bio_flags)
+                                    unsigned long bio_flags,
+                                    u64 bio_offset)
 {
        /*
         * when we're called for a write, we're already in the async
@@ -638,7 +648,8 @@ static int __btree_submit_bio_start(struct inode *inode, int rw,
 }
 static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
-                                 int mirror_num, unsigned long bio_flags)
+                                 int mirror_num, unsigned long bio_flags,
+                                 u64 bio_offset)
 {
        /*
         * when we're called for a write, we're already in the async
@@ -648,7 +659,8 @@ static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
 }
 static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
-                                 int mirror_num, unsigned long bio_flags)
+                                 int mirror_num, unsigned long bio_flags,
+                                 u64 bio_offset)
 {
        int ret;
@@ -671,6 +683,7 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
         */
        return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
                                   inode, rw, bio, mirror_num, 0,
+                                   bio_offset,
                                   __btree_submit_bio_start,
                                   __btree_submit_bio_done);
 }
@@ -894,7 +907,8 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
        root->ref_cows = 0;
        root->track_dirty = 0;
        root->in_radix = 0;
-        root->clean_orphans = 0;
+        root->orphan_item_inserted = 0;
+        root->orphan_cleanup_state = 0;
        root->fs_info = fs_info;
        root->objectid = objectid;
@@ -903,13 +917,16 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
        root->name = NULL;
        root->in_sysfs = 0;
        root->inode_tree = RB_ROOT;
+        root->block_rsv = NULL;
+        root->orphan_block_rsv = NULL;
        INIT_LIST_HEAD(&root->dirty_list);
        INIT_LIST_HEAD(&root->orphan_list);
        INIT_LIST_HEAD(&root->root_list);
        spin_lock_init(&root->node_lock);
-        spin_lock_init(&root->list_lock);
+        spin_lock_init(&root->orphan_lock);
        spin_lock_init(&root->inode_lock);
+        spin_lock_init(&root->accounting_lock);
        mutex_init(&root->objectid_mutex);
        mutex_init(&root->log_mutex);
        init_waitqueue_head(&root->log_writer_wait);
@@ -968,42 +985,6 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
        return 0;
 }
-int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
-                             struct btrfs_fs_info *fs_info)
-{
-        struct extent_buffer *eb;
-        struct btrfs_root *log_root_tree = fs_info->log_root_tree;
-        u64 start = 0;
-        u64 end = 0;
-        int ret;
-        if (!log_root_tree)
-                return 0;
-        while (1) {
-                ret = find_first_extent_bit(&log_root_tree->dirty_log_pages,
-                                0, &start, &end, EXTENT_DIRTY | EXTENT_NEW);
-                if (ret)
-                        break;
-                clear_extent_bits(&log_root_tree->dirty_log_pages, start, end,
-                                  EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS);
-        }
-        eb = fs_info->log_root_tree->node;
-        WARN_ON(btrfs_header_level(eb) != 0);
-        WARN_ON(btrfs_header_nritems(eb) != 0);
-        ret = btrfs_free_reserved_extent(fs_info->tree_root,
-                                eb->start, eb->len);
-        BUG_ON(ret);
-        free_extent_buffer(eb);
-        kfree(fs_info->log_root_tree);
-        fs_info->log_root_tree = NULL;
-        return 0;
-}
 static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
                                         struct btrfs_fs_info *fs_info)
 {
@@ -1191,19 +1172,23 @@ again:
        if (root)
                return root;
-        ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid);
-        if (ret == 0)
-                ret = -ENOENT;
-        if (ret < 0)
-                return ERR_PTR(ret);
        root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location);
        if (IS_ERR(root))
                return root;
-        WARN_ON(btrfs_root_refs(&root->root_item) == 0);
        set_anon_super(&root->anon_super, NULL);
+        if (btrfs_root_refs(&root->root_item) == 0) {
+                ret = -ENOENT;
+                goto fail;
+        }
+        ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid);
+        if (ret < 0)
+                goto fail;
+        if (ret == 0)
+                root->orphan_item_inserted = 1;
        ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
        if (ret)
                goto fail;
@@ -1212,10 +1197,9 @@ again:
        ret = radix_tree_insert(&fs_info->fs_roots_radix,
                                (unsigned long)root->root_key.objectid,
                                root);
-        if (ret == 0) {
+        if (ret == 0)
                root->in_radix = 1;
-                root->clean_orphans = 1;
-        }
        spin_unlock(&fs_info->fs_roots_radix_lock);
        radix_tree_preload_end();
        if (ret) {
@@ -1461,10 +1445,6 @@ static int cleaner_kthread(void *arg)
        struct btrfs_root *root = arg;
        do {
-                smp_mb();
-                if (root->fs_info->closing)
-                        break;
                vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
                if (!(root->fs_info->sb->s_flags & MS_RDONLY) &&
@@ -1477,11 +1457,9 @@ static int cleaner_kthread(void *arg)
                if (freezing(current)) {
                        refrigerator();
                } else {
-                        smp_mb();
-                        if (root->fs_info->closing)
-                                break;
                        set_current_state(TASK_INTERRUPTIBLE);
-                        schedule();
+                        if (!kthread_should_stop())
+                                schedule();
                        __set_current_state(TASK_RUNNING);
                }
        } while (!kthread_should_stop());
@@ -1493,36 +1471,40 @@ static int transaction_kthread(void *arg)
        struct btrfs_root *root = arg;
        struct btrfs_trans_handle *trans;
        struct btrfs_transaction *cur;
+        u64 transid;
        unsigned long now;
        unsigned long delay;
        int ret;
        do {
-                smp_mb();
-                if (root->fs_info->closing)
-                        break;
                delay = HZ * 30;
                vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
                mutex_lock(&root->fs_info->transaction_kthread_mutex);
-                mutex_lock(&root->fs_info->trans_mutex);
+                spin_lock(&root->fs_info->new_trans_lock);
                cur = root->fs_info->running_transaction;
                if (!cur) {
-                        mutex_unlock(&root->fs_info->trans_mutex);
+                        spin_unlock(&root->fs_info->new_trans_lock);
                        goto sleep;
                }
                now = get_seconds();
-                if (now < cur->start_time || now - cur->start_time < 30) {
+                if (!cur->blocked &&
-                        mutex_unlock(&root->fs_info->trans_mutex);
+                    (now < cur->start_time || now - cur->start_time < 30)) {
+                        spin_unlock(&root->fs_info->new_trans_lock);
                        delay = HZ * 5;
                        goto sleep;
                }
-                mutex_unlock(&root->fs_info->trans_mutex);
+                transid = cur->transid;
-                trans = btrfs_start_transaction(root, 1);
+                spin_unlock(&root->fs_info->new_trans_lock);
-                ret = btrfs_commit_transaction(trans, root);
+                trans = btrfs_join_transaction(root, 1);
+                if (transid == trans->transid) {
+                        ret = btrfs_commit_transaction(trans, root);
+                        BUG_ON(ret);
+                } else {
+                        btrfs_end_transaction(trans, root);
+                }
 sleep:
                wake_up_process(root->fs_info->cleaner_kthread);
                mutex_unlock(&root->fs_info->transaction_kthread_mutex);
@@ -1530,10 +1512,10 @@ sleep:
                if (freezing(current)) {
                        refrigerator();
                } else {
-                        if (root->fs_info->closing)
-                                break;
                        set_current_state(TASK_INTERRUPTIBLE);
-                        schedule_timeout(delay);
+                        if (!kthread_should_stop() &&
+                            !btrfs_transaction_blocked(root->fs_info))
+                                schedule_timeout(delay);
                        __set_current_state(TASK_RUNNING);
                }
        } while (!kthread_should_stop());
@@ -1620,6 +1602,13 @@ struct btrfs_root *open_ctree(struct super_block *sb,
        INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
        INIT_LIST_HEAD(&fs_info->space_info);
        btrfs_mapping_init(&fs_info->mapping_tree);
+        btrfs_init_block_rsv(&fs_info->global_block_rsv);
+        btrfs_init_block_rsv(&fs_info->delalloc_block_rsv);
+        btrfs_init_block_rsv(&fs_info->trans_block_rsv);
+        btrfs_init_block_rsv(&fs_info->chunk_block_rsv);
+        btrfs_init_block_rsv(&fs_info->empty_block_rsv);
+        INIT_LIST_HEAD(&fs_info->durable_block_rsv_list);
+        mutex_init(&fs_info->durable_block_rsv_mutex);
        atomic_set(&fs_info->nr_async_submits, 0);
        atomic_set(&fs_info->async_delalloc_pages, 0);
        atomic_set(&fs_info->async_submit_draining, 0);
@@ -1759,9 +1748,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
                           min_t(u64, fs_devices->num_devices,
                           fs_info->thread_pool_size),
                           &fs_info->generic_worker);
-        btrfs_init_workers(&fs_info->enospc_workers, "enospc",
-                           fs_info->thread_pool_size,
-                           &fs_info->generic_worker);
        /* a higher idle thresh on the submit workers makes it much more
         * likely that bios will be send down in a sane order to the
@@ -1809,7 +1795,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
        btrfs_start_workers(&fs_info->endio_meta_workers, 1);
        btrfs_start_workers(&fs_info->endio_meta_write_workers, 1);
        btrfs_start_workers(&fs_info->endio_write_workers, 1);
-        btrfs_start_workers(&fs_info->enospc_workers, 1);
        fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
        fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
@@ -1912,17 +1897,18 @@ struct btrfs_root *open_ctree(struct super_block *sb,
        csum_root->track_dirty = 1;
+        fs_info->generation = generation;
+        fs_info->last_trans_committed = generation;
+        fs_info->data_alloc_profile = (u64)-1;
+        fs_info->metadata_alloc_profile = (u64)-1;
+        fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
        ret = btrfs_read_block_groups(extent_root);
        if (ret) {
                printk(KERN_ERR "Failed to read block groups: %d\n", ret);
                goto fail_block_groups;
        }
-        fs_info->generation = generation;
-        fs_info->last_trans_committed = generation;
-        fs_info->data_alloc_profile = (u64)-1;
-        fs_info->metadata_alloc_profile = (u64)-1;
-        fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
        fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
                                               "btrfs-cleaner");
        if (IS_ERR(fs_info->cleaner_kthread))
@@ -1977,6 +1963,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
        BUG_ON(ret);
        if (!(sb->s_flags & MS_RDONLY)) {
+                ret = btrfs_cleanup_fs_roots(fs_info);
+                BUG_ON(ret);
                ret = btrfs_recover_relocation(tree_root);
                if (ret < 0) {
                        printk(KERN_WARNING
@@ -2040,7 +2029,6 @@ fail_sb_buffer:
        btrfs_stop_workers(&fs_info->endio_meta_write_workers);
        btrfs_stop_workers(&fs_info->endio_write_workers);
        btrfs_stop_workers(&fs_info->submit_workers);
-        btrfs_stop_workers(&fs_info->enospc_workers);
 fail_iput:
        invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
        iput(fs_info->btree_inode);
@@ -2405,11 +2393,11 @@ int btrfs_commit_super(struct btrfs_root *root)
        down_write(&root->fs_info->cleanup_work_sem);
        up_write(&root->fs_info->cleanup_work_sem);
-        trans = btrfs_start_transaction(root, 1);
+        trans = btrfs_join_transaction(root, 1);
        ret = btrfs_commit_transaction(trans, root);
        BUG_ON(ret);
        /* run commit again to drop the original snapshot */
-        trans = btrfs_start_transaction(root, 1);
+        trans = btrfs_join_transaction(root, 1);
        btrfs_commit_transaction(trans, root);
        ret = btrfs_write_and_wait_transaction(NULL, root);
        BUG_ON(ret);
@@ -2426,15 +2414,15 @@ int close_ctree(struct btrfs_root *root)
        fs_info->closing = 1;
        smp_mb();
-        kthread_stop(root->fs_info->transaction_kthread);
-        kthread_stop(root->fs_info->cleaner_kthread);
        if (!(fs_info->sb->s_flags & MS_RDONLY)) {
                ret =  btrfs_commit_super(root);
                if (ret)
                        printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
        }
+        kthread_stop(root->fs_info->transaction_kthread);
+        kthread_stop(root->fs_info->cleaner_kthread);
        fs_info->closing = 2;
        smp_mb();
@@ -2473,7 +2461,6 @@ int close_ctree(struct btrfs_root *root)
        btrfs_stop_workers(&fs_info->endio_meta_write_workers);
        btrfs_stop_workers(&fs_info->endio_write_workers);
        btrfs_stop_workers(&fs_info->submit_workers);
-        btrfs_stop_workers(&fs_info->enospc_workers);
        btrfs_close_devices(fs_info->fs_devices);
        btrfs_mapping_tree_free(&fs_info->mapping_tree);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index c958ecbc1916..88e825a0bf21 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -87,7 +87,7 @@ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
                        int metadata);
 int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
                        int rw, struct bio *bio, int mirror_num,
-                        unsigned long bio_flags,
+                        unsigned long bio_flags, u64 bio_offset,
                        extent_submit_bio_hook_t *submit_bio_start,
                        extent_submit_bio_hook_t *submit_bio_done);
@@ -95,8 +95,6 @@ int btrfs_congested_async(struct btrfs_fs_info *info, int iodone);
 unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info);
 int btrfs_write_tree_block(struct extent_buffer *buf);
 int btrfs_wait_tree_block_writeback(struct extent_buffer *buf);
-int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
-                             struct btrfs_fs_info *fs_info);
 int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
                             struct btrfs_fs_info *fs_info);
 int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index c6a4f459ad76..b9080d71991a 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -35,10 +35,9 @@
 static int update_block_group(struct btrfs_trans_handle *trans,
                              struct btrfs_root *root,
-                              u64 bytenr, u64 num_bytes, int alloc,
+                              u64 bytenr, u64 num_bytes, int alloc);
-                              int mark_free);
+static int update_reserved_bytes(struct btrfs_block_group_cache *cache,
-static int update_reserved_extents(struct btrfs_block_group_cache *cache,
+                                 u64 num_bytes, int reserve, int sinfo);
-                                   u64 num_bytes, int reserve);
 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root,
                                u64 bytenr, u64 num_bytes, u64 parent,
@@ -61,12 +60,6 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
                          struct btrfs_root *extent_root, u64 alloc_bytes,
                          u64 flags, int force);
-static int pin_down_bytes(struct btrfs_trans_handle *trans,
-                          struct btrfs_root *root,
-                          struct btrfs_path *path,
-                          u64 bytenr, u64 num_bytes,
-                          int is_data, int reserved,
-                          struct extent_buffer **must_clean);
 static int find_next_key(struct btrfs_path *path, int level,
                         struct btrfs_key *key);
 static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
@@ -91,8 +84,12 @@ void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
 void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
 {
-        if (atomic_dec_and_test(&cache->count))
+        if (atomic_dec_and_test(&cache->count)) {
+                WARN_ON(cache->pinned > 0);
+                WARN_ON(cache->reserved > 0);
+                WARN_ON(cache->reserved_pinned > 0);
                kfree(cache);
+        }
 }
 /*
@@ -319,7 +316,7 @@ static int caching_kthread(void *data)
        exclude_super_stripes(extent_root, block_group);
        spin_lock(&block_group->space_info->lock);
-        block_group->space_info->bytes_super += block_group->bytes_super;
+        block_group->space_info->bytes_readonly += block_group->bytes_super;
        spin_unlock(&block_group->space_info->lock);
        last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
@@ -507,6 +504,9 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
        struct list_head *head = &info->space_info;
        struct btrfs_space_info *found;
+        flags &= BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_SYSTEM |
+                 BTRFS_BLOCK_GROUP_METADATA;
        rcu_read_lock();
        list_for_each_entry_rcu(found, head, list) {
                if (found->flags == flags) {
@@ -610,6 +610,113 @@ int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
 }
 /*
+ * helper function to lookup reference count and flags of extent.
+ *
+ * the head node for delayed ref is used to store the sum of all the
+ * reference count modifications queued up in the rbtree. the head
+ * node may also store the extent flags to set. This way you can check
+ * to see what the reference count and extent flags will be once the
+ * queued delayed refs are applied, without waiting for them to be
+ * processed: the value read from the extent item is adjusted by the
+ * head's ref_mod and, if a flags update is pending, by its flags_to_set.
+ *
+ * trans:     transaction handle, may be NULL. With a NULL handle the
+ *            extent tree is searched through the commit root with tree
+ *            locking skipped, and delayed refs are not consulted.
+ * root:      only used to reach root->fs_info->extent_root.
+ * bytenr:    objectid of the BTRFS_EXTENT_ITEM_KEY being looked up.
+ * num_bytes: offset of that key.
+ * refs:      optional output for the resulting reference count.
+ * flags:     optional output for the resulting extent flags.
+ *
+ * If the delayed ref head's mutex is contended, the path is released,
+ * the function sleeps on that mutex until the holder is done, and the
+ * whole lookup is restarted from the tree search.
+ *
+ * Returns 0 on success (also when no extent item is found, in which
+ * case the on-disk count is taken as 0), -ENOMEM if no path could be
+ * allocated, or the negative error from btrfs_search_slot(). On error
+ * neither *refs nor *flags is written. A resulting count of zero
+ * triggers a WARN_ON.
+ */
+int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
+                             struct btrfs_root *root, u64 bytenr,
+                             u64 num_bytes, u64 *refs, u64 *flags)
+{
+        struct btrfs_delayed_ref_head *head;
+        struct btrfs_delayed_ref_root *delayed_refs;
+        struct btrfs_path *path;
+        struct btrfs_extent_item *ei;
+        struct extent_buffer *leaf;
+        struct btrfs_key key;
+        u32 item_size;
+        u64 num_refs;
+        u64 extent_flags;
+        int ret;
+        path = btrfs_alloc_path();
+        if (!path)
+                return -ENOMEM;
+        key.objectid = bytenr;
+        key.type = BTRFS_EXTENT_ITEM_KEY;
+        key.offset = num_bytes;
+        if (!trans) {   /* no transaction: search the commit root without tree locks */
+                path->skip_locking = 1;
+                path->search_commit_root = 1;
+        }
+again:
+        ret = btrfs_search_slot(trans, root->fs_info->extent_root,
+                                &key, path, 0, 0);
+        if (ret < 0)
+                goto out_free;
+        if (ret == 0) { /* extent item found at path->slots[0] */
+                leaf = path->nodes[0];
+                item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+                if (item_size >= sizeof(*ei)) {
+                        ei = btrfs_item_ptr(leaf, path->slots[0],
+                                            struct btrfs_extent_item);
+                        num_refs = btrfs_extent_refs(leaf, ei);
+                        extent_flags = btrfs_extent_flags(leaf, ei);
+                } else {        /* item smaller than the current extent item layout */
+#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
+                        struct btrfs_extent_item_v0 *ei0;
+                        BUG_ON(item_size != sizeof(*ei0));
+                        ei0 = btrfs_item_ptr(leaf, path->slots[0],
+                                             struct btrfs_extent_item_v0);
+                        num_refs = btrfs_extent_refs_v0(leaf, ei0);
+                        /* FIXME: this isn't correct for data */
+                        extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
+#else
+                        BUG();
+#endif
+                }
+                BUG_ON(num_refs == 0);
+        } else {        /* no extent item found: start from zero */
+                num_refs = 0;
+                extent_flags = 0;
+                ret = 0;
+        }
+        if (!trans)     /* delayed refs are reached through the transaction */
+                goto out;
+        delayed_refs = &trans->transaction->delayed_refs;
+        spin_lock(&delayed_refs->lock);
+        head = btrfs_find_delayed_ref_head(trans, bytenr);
+        if (head) {
+                if (!mutex_trylock(&head->mutex)) {     /* head is busy: wait, then redo the lookup */
+                        atomic_inc(&head->node.refs);   /* keep head alive while the spinlock is dropped */
+                        spin_unlock(&delayed_refs->lock);
+                        btrfs_release_path(root->fs_info->extent_root, path);
+                        mutex_lock(&head->mutex);       /* sleep until the current holder is done */
+                        mutex_unlock(&head->mutex);
+                        btrfs_put_delayed_ref(&head->node);
+                        goto again;
+                }
+                if (head->extent_op && head->extent_op->update_flags)
+                        extent_flags |= head->extent_op->flags_to_set;
+                else    /* no pending flags update: the extent item must have been found */
+                        BUG_ON(num_refs == 0);
+                num_refs += head->node.ref_mod; /* fold in the queued ref count changes */
+                mutex_unlock(&head->mutex);
+        }
+        spin_unlock(&delayed_refs->lock);
+out:
+        WARN_ON(num_refs == 0);
+        if (refs)
+                *refs = num_refs;
+        if (flags)
+                *flags = extent_flags;
+out_free:
+        btrfs_free_path(path);
+        return ret;
+}
+/*
 * Back reference rules.  Back refs have three main goals:
 *
 * 1) differentiate between all holders of references to an extent so that
@@ -1871,7 +1978,6 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
        return ret;
 }
 /* helper function to actually process a single delayed ref entry */
 static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
                               struct btrfs_root *root,
@@ -1891,32 +1997,14 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
                BUG_ON(extent_op);
                head = btrfs_delayed_node_to_head(node);
                if (insert_reserved) {
-                        int mark_free = 0;
+                        btrfs_pin_extent(root, node->bytenr,
-                        struct extent_buffer *must_clean = NULL;
+                                         node->num_bytes, 1);
-                        ret = pin_down_bytes(trans, root, NULL,
-                                             node->bytenr, node->num_bytes,
-                                             head->is_data, 1, &must_clean);
-                        if (ret > 0)
-                                mark_free = 1;
-                        if (must_clean) {
-                                clean_tree_block(NULL, root, must_clean);
-                                btrfs_tree_unlock(must_clean);
-                                free_extent_buffer(must_clean);
-                        }
                        if (head->is_data) {
                                ret = btrfs_del_csums(trans, root,
                                                      node->bytenr,
                                                      node->num_bytes);
                                BUG_ON(ret);
                        }
-                        if (mark_free) {
-                                ret = btrfs_free_reserved_extent(root,
-                                                        node->bytenr,
-                                                        node->num_bytes);
-                                BUG_ON(ret);
-                        }
                }
                mutex_unlock(&head->mutex);
                return 0;
@@ -2347,6 +2435,8 @@ int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
                ret = 0;
 out:
        btrfs_free_path(path);
+        if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
+                WARN_ON(ret > 0);
        return ret;
 }
@@ -2660,12 +2750,21 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
                             struct btrfs_space_info **space_info)
 {
        struct btrfs_space_info *found;
+        int i;
+        int factor;
+        if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
+                     BTRFS_BLOCK_GROUP_RAID10))
+                factor = 2;
+        else
+                factor = 1;
        found = __find_space_info(info, flags);
        if (found) {
                spin_lock(&found->lock);
                found->total_bytes += total_bytes;
                found->bytes_used += bytes_used;
+                found->disk_used += bytes_used * factor;
                found->full = 0;
                spin_unlock(&found->lock);
                *space_info = found;
@@ -2675,18 +2774,20 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
        if (!found)
                return -ENOMEM;
-        INIT_LIST_HEAD(&found->block_groups);
+        for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
+                INIT_LIST_HEAD(&found->block_groups[i]);
        init_rwsem(&found->groups_sem);
-        init_waitqueue_head(&found->flush_wait);
-        init_waitqueue_head(&found->allocate_wait);
        spin_lock_init(&found->lock);
-        found->flags = flags;
+        found->flags = flags & (BTRFS_BLOCK_GROUP_DATA |
+                                BTRFS_BLOCK_GROUP_SYSTEM |
+                                BTRFS_BLOCK_GROUP_METADATA);
        found->total_bytes = total_bytes;
        found->bytes_used = bytes_used;
+        found->disk_used = bytes_used * factor;
        found->bytes_pinned = 0;
        found->bytes_reserved = 0;
        found->bytes_readonly = 0;
-        found->bytes_delalloc = 0;
+        found->bytes_may_use = 0;
        found->full = 0;
        found->force_alloc = 0;
        *space_info = found;
@@ -2711,19 +2812,6 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
        }
 }
-static void set_block_group_readonly(struct btrfs_block_group_cache *cache)
-{
-        spin_lock(&cache->space_info->lock);
-        spin_lock(&cache->lock);
-        if (!cache->ro) {
-                cache->space_info->bytes_readonly += cache->key.offset -
-                                        btrfs_block_group_used(&cache->item);
-                cache->ro = 1;
-        }
-        spin_unlock(&cache->lock);
-        spin_unlock(&cache->space_info->lock);
-}
 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
 {
        u64 num_devices = root->fs_info->fs_devices->rw_devices;
@@ -2752,491 +2840,50 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
        return flags;
 }
-static u64 btrfs_get_alloc_profile(struct btrfs_root *root, u64 data)
+static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
-{
-        struct btrfs_fs_info *info = root->fs_info;
-        u64 alloc_profile;
-        if (data) {
-                alloc_profile = info->avail_data_alloc_bits &
-                        info->data_alloc_profile;
-                data = BTRFS_BLOCK_GROUP_DATA | alloc_profile;
-        } else if (root == root->fs_info->chunk_root) {
-                alloc_profile = info->avail_system_alloc_bits &
-                        info->system_alloc_profile;
-                data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile;
-        } else {
-                alloc_profile = info->avail_metadata_alloc_bits &
-                        info->metadata_alloc_profile;
-                data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
-        }
-        return btrfs_reduce_alloc_profile(root, data);
-}
-void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
-{
-        u64 alloc_target;
-        alloc_target = btrfs_get_alloc_profile(root, 1);
-        BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
-                                                       alloc_target);
-}
-static u64 calculate_bytes_needed(struct btrfs_root *root, int num_items)
-{
-        u64 num_bytes;
-        int level;
-        level = BTRFS_MAX_LEVEL - 2;
-        /*
-         * NOTE: these calculations are absolutely the worst possible case.
-         * This assumes that _every_ item we insert will require a new leaf, and
-         * that the tree has grown to its maximum level size.
-         */
-        /*
-         * for every item we insert we could insert both an extent item and a
-         * extent ref item.  Then for ever item we insert, we will need to cow
-         * both the original leaf, plus the leaf to the left and right of it.
-         *
-         * Unless we are talking about the extent root, then we just want the
-         * number of items * 2, since we just need the extent item plus its ref.
-         */
-        if (root == root->fs_info->extent_root)
-                num_bytes = num_items * 2;
-        else
-                num_bytes = (num_items + (2 * num_items)) * 3;
-        /*
-         * num_bytes is total number of leaves we could need times the leaf
-         * size, and then for every leaf we could end up cow'ing 2 nodes per
-         * level, down to the leaf level.
-         */
-        num_bytes = (num_bytes * root->leafsize) +
-                (num_bytes * (level * 2)) * root->nodesize;
-        return num_bytes;
-}
-/*
- * Unreserve metadata space for delalloc.  If we have less reserved credits than
- * we have extents, this function does nothing.
- */
-int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
-                                          struct inode *inode, int num_items)
-{
-        struct btrfs_fs_info *info = root->fs_info;
-        struct btrfs_space_info *meta_sinfo;
-        u64 num_bytes;
-        u64 alloc_target;
-        bool bug = false;
-        /* get the space info for where the metadata will live */
-        alloc_target = btrfs_get_alloc_profile(root, 0);
-        meta_sinfo = __find_space_info(info, alloc_target);
-        num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
-                                           num_items);
-        spin_lock(&meta_sinfo->lock);
-        spin_lock(&BTRFS_I(inode)->accounting_lock);
-        if (BTRFS_I(inode)->reserved_extents <=
-            BTRFS_I(inode)->outstanding_extents) {
-                spin_unlock(&BTRFS_I(inode)->accounting_lock);
-                spin_unlock(&meta_sinfo->lock);
-                return 0;
-        }
-        spin_unlock(&BTRFS_I(inode)->accounting_lock);
-        BTRFS_I(inode)->reserved_extents -= num_items;
-        BUG_ON(BTRFS_I(inode)->reserved_extents < 0);
-        if (meta_sinfo->bytes_delalloc < num_bytes) {
-                bug = true;
-                meta_sinfo->bytes_delalloc = 0;
-        } else {
-                meta_sinfo->bytes_delalloc -= num_bytes;
-        }
-        spin_unlock(&meta_sinfo->lock);
-        BUG_ON(bug);
-        return 0;
-}
-static void check_force_delalloc(struct btrfs_space_info *meta_sinfo)
 {
-        u64 thresh;
+        if (flags & BTRFS_BLOCK_GROUP_DATA)
+                flags |= root->fs_info->avail_data_alloc_bits &
-        thresh = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
+                         root->fs_info->data_alloc_profile;
-                meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
+        else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
-                meta_sinfo->bytes_super + meta_sinfo->bytes_root +
+                flags |= root->fs_info->avail_system_alloc_bits &
-                meta_sinfo->bytes_may_use;
+                         root->fs_info->system_alloc_profile;
+        else if (flags & BTRFS_BLOCK_GROUP_METADATA)
-        thresh = meta_sinfo->total_bytes - thresh;
+                flags |= root->fs_info->avail_metadata_alloc_bits &
-        thresh *= 80;
+                         root->fs_info->metadata_alloc_profile;
-        do_div(thresh, 100);
+        return btrfs_reduce_alloc_profile(root, flags);
-        if (thresh <= meta_sinfo->bytes_delalloc)
-                meta_sinfo->force_delalloc = 1;
-        else
-                meta_sinfo->force_delalloc = 0;
 }
-struct async_flush {
+static u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
-        struct btrfs_root *root;
-        struct btrfs_space_info *info;
-        struct btrfs_work work;
-};
-static noinline void flush_delalloc_async(struct btrfs_work *work)
 {
-        struct async_flush *async;
+        u64 flags;
-        struct btrfs_root *root;
-        struct btrfs_space_info *info;
-        async = container_of(work, struct async_flush, work);
-        root = async->root;
-        info = async->info;
-        btrfs_start_delalloc_inodes(root, 0);
-        wake_up(&info->flush_wait);
-        btrfs_wait_ordered_extents(root, 0, 0);
-        spin_lock(&info->lock);
-        info->flushing = 0;
-        spin_unlock(&info->lock);
-        wake_up(&info->flush_wait);
-        kfree(async);
-}
-static void wait_on_flush(struct btrfs_space_info *info)
-{
-        DEFINE_WAIT(wait);
-        u64 used;
-        while (1) {
-                prepare_to_wait(&info->flush_wait, &wait,
-                                TASK_UNINTERRUPTIBLE);
-                spin_lock(&info->lock);
-                if (!info->flushing) {
-                        spin_unlock(&info->lock);
-                        break;
-                }
-                used = info->bytes_used + info->bytes_reserved +
-                        info->bytes_pinned + info->bytes_readonly +
-                        info->bytes_super + info->bytes_root +
-                        info->bytes_may_use + info->bytes_delalloc;
-                if (used < info->total_bytes) {
-                        spin_unlock(&info->lock);
-                        break;
-                }
-                spin_unlock(&info->lock);
-                schedule();
-        }
-        finish_wait(&info->flush_wait, &wait);
-}
-static void flush_delalloc(struct btrfs_root *root,
-                                 struct btrfs_space_info *info)
-{
-        struct async_flush *async;
-        bool wait = false;
-        spin_lock(&info->lock);
-        if (!info->flushing)
+        if (data)
-                info->flushing = 1;
+                flags = BTRFS_BLOCK_GROUP_DATA;
+        else if (root == root->fs_info->chunk_root)
+                flags = BTRFS_BLOCK_GROUP_SYSTEM;
        else
-                wait = true;
+                flags = BTRFS_BLOCK_GROUP_METADATA;
-        spin_unlock(&info->lock);
-        if (wait) {
-                wait_on_flush(info);
-                return;
-        }
-        async = kzalloc(sizeof(*async), GFP_NOFS);
-        if (!async)
-                goto flush;
-        async->root = root;
-        async->info = info;
-        async->work.func = flush_delalloc_async;
-        btrfs_queue_worker(&root->fs_info->enospc_workers,
+        return get_alloc_profile(root, flags);
-                           &async->work);
-        wait_on_flush(info);
-        return;
-flush:
-        btrfs_start_delalloc_inodes(root, 0);
-        btrfs_wait_ordered_extents(root, 0, 0);
-        spin_lock(&info->lock);
-        info->flushing = 0;
-        spin_unlock(&info->lock);
-        wake_up(&info->flush_wait);
 }
-static int maybe_allocate_chunk(struct btrfs_root *root,
+void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
-                                 struct btrfs_space_info *info)
-{
-        struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
-        struct btrfs_trans_handle *trans;
-        bool wait = false;
-        int ret = 0;
-        u64 min_metadata;
-        u64 free_space;
-        free_space = btrfs_super_total_bytes(disk_super);
-        /*
-         * we allow the metadata to grow to a max of either 10gb or 5% of the
-         * space in the volume.
-         */
-        min_metadata = min((u64)10 * 1024 * 1024 * 1024,
-                             div64_u64(free_space * 5, 100));
-        if (info->total_bytes >= min_metadata) {
-                spin_unlock(&info->lock);
-                return 0;
-        }
-        if (info->full) {
-                spin_unlock(&info->lock);
-                return 0;
-        }
-        if (!info->allocating_chunk) {
-                info->force_alloc = 1;
-                info->allocating_chunk = 1;
-        } else {
-                wait = true;
-        }
-        spin_unlock(&info->lock);
-        if (wait) {
-                wait_event(info->allocate_wait,
-                           !info->allocating_chunk);
-                return 1;
-        }
-        trans = btrfs_start_transaction(root, 1);
-        if (!trans) {
-                ret = -ENOMEM;
-                goto out;
-        }
-        ret = do_chunk_alloc(trans, root->fs_info->extent_root,
-                             4096 + 2 * 1024 * 1024,
-                             info->flags, 0);
-        btrfs_end_transaction(trans, root);
-        if (ret)
-                goto out;
-out:
-        spin_lock(&info->lock);
-        info->allocating_chunk = 0;
-        spin_unlock(&info->lock);
-        wake_up(&info->allocate_wait);
-        if (ret)
-                return 0;
-        return 1;
-}
-/*
- * Reserve metadata space for delalloc.
- */
-int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root,
-                                        struct inode *inode, int num_items)
-{
-        struct btrfs_fs_info *info = root->fs_info;
-        struct btrfs_space_info *meta_sinfo;
-        u64 num_bytes;
-        u64 used;
-        u64 alloc_target;
-        int flushed = 0;
-        int force_delalloc;
-        /* get the space info for where the metadata will live */
-        alloc_target = btrfs_get_alloc_profile(root, 0);
-        meta_sinfo = __find_space_info(info, alloc_target);
-        num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
-                                           num_items);
-again:
-        spin_lock(&meta_sinfo->lock);
-        force_delalloc = meta_sinfo->force_delalloc;
-        if (unlikely(!meta_sinfo->bytes_root))
-                meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
-        if (!flushed)
-                meta_sinfo->bytes_delalloc += num_bytes;
-        used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
-                meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
-                meta_sinfo->bytes_super + meta_sinfo->bytes_root +
-                meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
-        if (used > meta_sinfo->total_bytes) {
-                flushed++;
-                if (flushed == 1) {
-                        if (maybe_allocate_chunk(root, meta_sinfo))
-                                goto again;
-                        flushed++;
-                } else {
-                        spin_unlock(&meta_sinfo->lock);
-                }
-                if (flushed == 2) {
-                        filemap_flush(inode->i_mapping);
-                        goto again;
-                } else if (flushed == 3) {
-                        flush_delalloc(root, meta_sinfo);
-                        goto again;
-                }
-                spin_lock(&meta_sinfo->lock);
-                meta_sinfo->bytes_delalloc -= num_bytes;
-                spin_unlock(&meta_sinfo->lock);
-                printk(KERN_ERR "enospc, has %d, reserved %d\n",
-                       BTRFS_I(inode)->outstanding_extents,
-                       BTRFS_I(inode)->reserved_extents);
-                dump_space_info(meta_sinfo, 0, 0);
-                return -ENOSPC;
-        }
-        BTRFS_I(inode)->reserved_extents += num_items;
-        check_force_delalloc(meta_sinfo);
-        spin_unlock(&meta_sinfo->lock);
-        if (!flushed && force_delalloc)
-                filemap_flush(inode->i_mapping);
-        return 0;
-}
-/*
- * unreserve num_items number of items worth of metadata space.  This needs to
- * be paired with btrfs_reserve_metadata_space.
- *
- * NOTE: if you have the option, run this _AFTER_ you do a
- * btrfs_end_transaction, since btrfs_end_transaction will run delayed ref
- * oprations which will result in more used metadata, so we want to make sure we
- * can do that without issue.
- */
-int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items)
-{
-        struct btrfs_fs_info *info = root->fs_info;
-        struct btrfs_space_info *meta_sinfo;
-        u64 num_bytes;
-        u64 alloc_target;
-        bool bug = false;
-        /* get the space info for where the metadata will live */
-        alloc_target = btrfs_get_alloc_profile(root, 0);
-        meta_sinfo = __find_space_info(info, alloc_target);
-        num_bytes = calculate_bytes_needed(root, num_items);
-        spin_lock(&meta_sinfo->lock);
-        if (meta_sinfo->bytes_may_use < num_bytes) {
-                bug = true;
-                meta_sinfo->bytes_may_use = 0;
-        } else {
-                meta_sinfo->bytes_may_use -= num_bytes;
-        }
-        spin_unlock(&meta_sinfo->lock);
-        BUG_ON(bug);
-        return 0;
-}
-/*
- * Reserve some metadata space for use.  We'll calculate the worste case number
- * of bytes that would be needed to modify num_items number of items.  If we
- * have space, fantastic, if not, you get -ENOSPC.  Please call
- * btrfs_unreserve_metadata_space when you are done for the _SAME_ number of
- * items you reserved, since whatever metadata you needed should have already
- * been allocated.
- *
- * This will commit the transaction to make more space if we don't have enough
- * metadata space.  THe only time we don't do this is if we're reserving space
- * inside of a transaction, then we will just return -ENOSPC and it is the
- * callers responsibility to handle it properly.
- */
-int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items)
 {
-        struct btrfs_fs_info *info = root->fs_info;
+        BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
-        struct btrfs_space_info *meta_sinfo;
+                                                       BTRFS_BLOCK_GROUP_DATA);
-        u64 num_bytes;
-        u64 used;
-        u64 alloc_target;
-        int retries = 0;
-        /* get the space info for where the metadata will live */
-        alloc_target = btrfs_get_alloc_profile(root, 0);
-        meta_sinfo = __find_space_info(info, alloc_target);
-        num_bytes = calculate_bytes_needed(root, num_items);
-again:
-        spin_lock(&meta_sinfo->lock);
-        if (unlikely(!meta_sinfo->bytes_root))
-                meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
-        if (!retries)
-                meta_sinfo->bytes_may_use += num_bytes;
-        used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
-                meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
-                meta_sinfo->bytes_super + meta_sinfo->bytes_root +
-                meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
-        if (used > meta_sinfo->total_bytes) {
-                retries++;
-                if (retries == 1) {
-                        if (maybe_allocate_chunk(root, meta_sinfo))
-                                goto again;
-                        retries++;
-                } else {
-                        spin_unlock(&meta_sinfo->lock);
-                }
-                if (retries == 2) {
-                        flush_delalloc(root, meta_sinfo);
-                        goto again;
-                }
-                spin_lock(&meta_sinfo->lock);
-                meta_sinfo->bytes_may_use -= num_bytes;
-                spin_unlock(&meta_sinfo->lock);
-                dump_space_info(meta_sinfo, 0, 0);
-                return -ENOSPC;
-        }
-        check_force_delalloc(meta_sinfo);
-        spin_unlock(&meta_sinfo->lock);
-        return 0;
 }
 /*
 * This will check the space that the inode allocates from to make sure we have
 * enough space for bytes.
 */
-int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
+int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
-                                u64 bytes)
 {
        struct btrfs_space_info *data_sinfo;
+        struct btrfs_root *root = BTRFS_I(inode)->root;
        u64 used;
-        int ret = 0, committed = 0, flushed = 0;
+        int ret = 0, committed = 0;
        /* make sure bytes are sectorsize aligned */
        bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
@@ -3248,21 +2895,13 @@ int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
 again:
        /* make sure we have enough space to handle the data first */
        spin_lock(&data_sinfo->lock);
-        used = data_sinfo->bytes_used + data_sinfo->bytes_delalloc +
+        used = data_sinfo->bytes_used + data_sinfo->bytes_reserved +
-                data_sinfo->bytes_reserved + data_sinfo->bytes_pinned +
+                data_sinfo->bytes_pinned + data_sinfo->bytes_readonly +
-                data_sinfo->bytes_readonly + data_sinfo->bytes_may_use +
+                data_sinfo->bytes_may_use;
-                data_sinfo->bytes_super;
        if (used + bytes > data_sinfo->total_bytes) {
                struct btrfs_trans_handle *trans;
-                if (!flushed) {
-                        spin_unlock(&data_sinfo->lock);
-                        flush_delalloc(root, data_sinfo);
-                        flushed = 1;
-                        goto again;
-                }
                /*
                 * if we don't have enough free bytes in this space then we need
                 * to alloc a new chunk.
@@ -3274,15 +2913,15 @@ again:
                        spin_unlock(&data_sinfo->lock);
 alloc:
                        alloc_target = btrfs_get_alloc_profile(root, 1);
-                        trans = btrfs_start_transaction(root, 1);
+                        trans = btrfs_join_transaction(root, 1);
-                        if (!trans)
+                        if (IS_ERR(trans))
-                                return -ENOMEM;
+                                return PTR_ERR(trans);
                        ret = do_chunk_alloc(trans, root->fs_info->extent_root,
                                             bytes + 2 * 1024 * 1024,
                                             alloc_target, 0);
                        btrfs_end_transaction(trans, root);
-                        if (ret)
+                        if (ret < 0)
                                return ret;
                        if (!data_sinfo) {
@@ -3297,25 +2936,26 @@ alloc:
                if (!committed && !root->fs_info->open_ioctl_trans) {
                        committed = 1;
                        trans = btrfs_join_transaction(root, 1);
-                        if (!trans)
+                        if (IS_ERR(trans))
-                                return -ENOMEM;
+                                return PTR_ERR(trans);
                        ret = btrfs_commit_transaction(trans, root);
                        if (ret)
                                return ret;
                        goto again;
                }
-                printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes"
+#if 0 /* I hope we never need this code again, just in case */
-                       ", %llu bytes_used, %llu bytes_reserved, "
+                printk(KERN_ERR "no space left, need %llu, %llu bytes_used, "
-                       "%llu bytes_pinned, %llu bytes_readonly, %llu may use "
+                       "%llu bytes_reserved, " "%llu bytes_pinned, "
-                       "%llu total\n", (unsigned long long)bytes,
+                       "%llu bytes_readonly, %llu may use %llu total\n",
-                       (unsigned long long)data_sinfo->bytes_delalloc,
+                       (unsigned long long)bytes,
                       (unsigned long long)data_sinfo->bytes_used,
                       (unsigned long long)data_sinfo->bytes_reserved,
                       (unsigned long long)data_sinfo->bytes_pinned,
                       (unsigned long long)data_sinfo->bytes_readonly,
                       (unsigned long long)data_sinfo->bytes_may_use,
                       (unsigned long long)data_sinfo->total_bytes);
+#endif
                return -ENOSPC;
        }
        data_sinfo->bytes_may_use += bytes;
@@ -3326,12 +2966,13 @@ alloc:
 }
 /*
- * if there was an error for whatever reason after calling
+ * called when we are clearing a delalloc extent from the
- * btrfs_check_data_free_space, call this so we can cleanup the counters.
+ * inode's io_tree or there was an error for whatever reason
+ * after calling btrfs_check_data_free_space
 */
-void btrfs_free_reserved_data_space(struct btrfs_root *root,
+void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
-                                    struct inode *inode, u64 bytes)
 {
+        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_space_info *data_sinfo;
        /* make sure bytes are sectorsize aligned */
@@ -3344,48 +2985,6 @@ void btrfs_free_reserved_data_space(struct btrfs_root *root,
        spin_unlock(&data_sinfo->lock);
 }
-/* called when we are adding a delalloc extent to the inode's io_tree */
-void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
-                                  u64 bytes)
-{
-        struct btrfs_space_info *data_sinfo;
-        /* get the space info for where this inode will be storing its data */
-        data_sinfo = BTRFS_I(inode)->space_info;
-        /* make sure we have enough space to handle the data first */
-        spin_lock(&data_sinfo->lock);
-        data_sinfo->bytes_delalloc += bytes;
-        /*
-         * we are adding a delalloc extent without calling
-         * btrfs_check_data_free_space first.  This happens on a weird
-         * writepage condition, but shouldn't hurt our accounting
-         */
-        if (unlikely(bytes > BTRFS_I(inode)->reserved_bytes)) {
-                data_sinfo->bytes_may_use -= BTRFS_I(inode)->reserved_bytes;
-                BTRFS_I(inode)->reserved_bytes = 0;
-        } else {
-                data_sinfo->bytes_may_use -= bytes;
-                BTRFS_I(inode)->reserved_bytes -= bytes;
-        }
-        spin_unlock(&data_sinfo->lock);
-}
-/* called when we are clearing an delalloc extent from the inode's io_tree */
-void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
-                              u64 bytes)
-{
-        struct btrfs_space_info *info;
-        info = BTRFS_I(inode)->space_info;
-        spin_lock(&info->lock);
-        info->bytes_delalloc -= bytes;
-        spin_unlock(&info->lock);
-}
 static void force_metadata_allocation(struct btrfs_fs_info *info)
 {
        struct list_head *head = &info->space_info;
@@ -3399,13 +2998,28 @@ static void force_metadata_allocation(struct btrfs_fs_info *info)
        rcu_read_unlock();
 }
+/*
+ * Decide whether a new chunk should be allocated for @sinfo before
+ * handing out another @alloc_bytes from it.
+ *
+ * Only bytes_used and bytes_reserved are counted as consumed space, and
+ * read-only bytes are excluded from the total they are compared against.
+ *
+ * The callers visible in this file (do_chunk_alloc() and
+ * maybe_allocate_chunk()) invoke this with sinfo->lock held.
+ *
+ * Returns 1 if a chunk allocation is warranted, 0 otherwise.
+ */
+static int should_alloc_chunk(struct btrfs_space_info *sinfo,
+                              u64 alloc_bytes)
+{
+        u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
+        /* more than 256MB would still be left after this allocation */
+        if (sinfo->bytes_used + sinfo->bytes_reserved +
+            alloc_bytes + 256 * 1024 * 1024 < num_bytes)
+                return 0;
+        /*
+         * usage stays below div_factor(num_bytes, 8) -- presumably 80% of
+         * the writable space; confirm against div_factor()
+         */
+        if (sinfo->bytes_used + sinfo->bytes_reserved +
+            alloc_bytes < div_factor(num_bytes, 8))
+                return 0;
+        return 1;
+}
 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
                          struct btrfs_root *extent_root, u64 alloc_bytes,
                          u64 flags, int force)
 {
        struct btrfs_space_info *space_info;
        struct btrfs_fs_info *fs_info = extent_root->fs_info;
-        u64 thresh;
        int ret = 0;
        mutex_lock(&fs_info->chunk_mutex);
@@ -3428,11 +3042,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
                goto out;
        }
-        thresh = space_info->total_bytes - space_info->bytes_readonly;
+        if (!force && !should_alloc_chunk(space_info, alloc_bytes)) {
-        thresh = div_factor(thresh, 8);
-        if (!force &&
-           (space_info->bytes_used + space_info->bytes_pinned +
-            space_info->bytes_reserved + alloc_bytes) < thresh) {
                spin_unlock(&space_info->lock);
                goto out;
        }
@@ -3454,6 +3064,8 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
        spin_lock(&space_info->lock);
        if (ret)
                space_info->full = 1;
+        else
+                ret = 1;
        space_info->force_alloc = 0;
        spin_unlock(&space_info->lock);
 out:
@@ -3461,13 +3073,713 @@ out:
        return ret;
 }
+/*
+ * Try to allocate a new chunk for @sinfo when should_alloc_chunk() says
+ * one is needed for @num_bytes plus 2MB of slack.
+ *
+ * If @trans is NULL a transaction is joined for the duration of the
+ * allocation and ended again before returning.
+ *
+ * Returns 1 only when do_chunk_alloc() reported that a chunk was
+ * allocated, 0 in every other case.
+ */
+static int maybe_allocate_chunk(struct btrfs_trans_handle *trans,
+                                struct btrfs_root *root,
+                                struct btrfs_space_info *sinfo, u64 num_bytes)
+{
+        u64 wanted = num_bytes + 2 * 1024 * 1024;
+        int joined = 0;
+        int ret;
+        /* unlocked peek: a full space info cannot grow any further */
+        if (sinfo->full)
+                return 0;
+        spin_lock(&sinfo->lock);
+        ret = should_alloc_chunk(sinfo, wanted);
+        spin_unlock(&sinfo->lock);
+        if (!ret)
+                return 0;
+        if (!trans) {
+                trans = btrfs_join_transaction(root, 1);
+                BUG_ON(IS_ERR(trans));
+                joined = 1;
+        }
+        ret = do_chunk_alloc(trans, root->fs_info->extent_root, wanted,
+                             get_alloc_profile(root, sinfo->flags), 0);
+        if (joined)
+                btrfs_end_transaction(trans, root);
+        return ret == 1;
+}
+/*
+ * shrink metadata reservation for delalloc
+ *
+ * Repeatedly kicks btrfs_start_one_delalloc_inode() and watches the
+ * delalloc block reservation shrink until min(reserved, @to_reclaim)
+ * bytes have been given back or nothing is reserved any more.
+ *
+ * Returns 0 if nothing was reserved to begin with, -EAGAIN if @trans is
+ * set and its transaction became blocked, otherwise 1 when at least
+ * @to_reclaim bytes were reclaimed and 0 when fewer were.
+ */
+static int shrink_delalloc(struct btrfs_trans_handle *trans,
+                           struct btrfs_root *root, u64 to_reclaim)
+{
+        struct btrfs_block_rsv *block_rsv;
+        u64 reserved;
+        u64 max_reclaim;
+        u64 reclaimed = 0;
+        int pause = 1;
+        int ret;
+        block_rsv = &root->fs_info->delalloc_block_rsv;
+        spin_lock(&block_rsv->lock);
+        reserved = block_rsv->reserved;
+        spin_unlock(&block_rsv->lock);
+        if (reserved == 0)
+                return 0;
+        max_reclaim = min(reserved, to_reclaim);
+        while (1) {
+                ret = btrfs_start_one_delalloc_inode(root, trans ? 1 : 0);
+                if (!ret) {
+                        /* nothing was started: back off, capped at HZ / 10 */
+                        __set_current_state(TASK_INTERRUPTIBLE);
+                        schedule_timeout(pause);
+                        pause <<= 1;
+                        if (pause > HZ / 10)
+                                pause = HZ / 10;
+                } else {
+                        pause = 1;
+                }
+                spin_lock(&block_rsv->lock);
+                /*
+                 * 'reserved' is re-sampled on every pass, so the difference
+                 * is only this pass's progress; accumulate it so that the
+                 * total is what gets compared against the targets below.
+                 */
+                if (reserved > block_rsv->reserved)
+                        reclaimed += reserved - block_rsv->reserved;
+                reserved = block_rsv->reserved;
+                spin_unlock(&block_rsv->lock);
+                if (reserved == 0 || reclaimed >= max_reclaim)
+                        break;
+                if (trans && trans->transaction->blocked)
+                        return -EAGAIN;
+        }
+        return reclaimed >= to_reclaim;
+}
+/*
+ * Called after a metadata reservation of @num_bytes failed; tries to make
+ * room and tells the caller whether retrying is worthwhile.
+ *
+ * Steps, in order: allocate a chunk, shrink the delalloc reservation, and
+ * finally commit the transaction if enough pinned bytes could be freed by
+ * doing so.  *@retries is bumped before the commit step and caps the
+ * number of such attempts.
+ *
+ * Returns > 0 if the caller should retry, -EAGAIN if a commit is needed
+ * but the caller holds @trans, -ENOSPC if nothing more can be done, or
+ * whatever non-zero value shrink_delalloc() returned.
+ */
+static int should_retry_reserve(struct btrfs_trans_handle *trans,
+                                struct btrfs_root *root,
+                                struct btrfs_block_rsv *block_rsv,
+                                u64 num_bytes, int *retries)
+{
+        struct btrfs_space_info *sinfo = block_rsv->space_info;
+        int pinned_short;
+        int ret;
+        if (*retries > 2)
+                return -ENOSPC;
+        if (maybe_allocate_chunk(trans, root, sinfo, num_bytes))
+                return 1;
+        if (trans && trans->transaction->in_commit)
+                return -ENOSPC;
+        ret = shrink_delalloc(trans, root, num_bytes);
+        if (ret)
+                return ret;
+        /* a commit only helps if it would unpin at least num_bytes */
+        spin_lock(&sinfo->lock);
+        pinned_short = sinfo->bytes_pinned < num_bytes;
+        spin_unlock(&sinfo->lock);
+        if (pinned_short)
+                return -ENOSPC;
+        ++*retries;
+        if (trans)
+                return -EAGAIN;
+        trans = btrfs_join_transaction(root, 1);
+        BUG_ON(IS_ERR(trans));
+        ret = btrfs_commit_transaction(trans, root);
+        BUG_ON(ret);
+        return 1;
+}
+/*
+ * Account @num_bytes as reserved in the space info behind @block_rsv if
+ * there is room for it.
+ *
+ * Reservations with priority >= 10 succeed whenever the bytes are
+ * available; lower priorities must additionally pass a check that scales
+ * the available space by the priority.
+ *
+ * Returns 0 on success (space_info->bytes_reserved was increased) or
+ * -ENOSPC.  @block_rsv itself is not modified.
+ */
+static int reserve_metadata_bytes(struct btrfs_block_rsv *block_rsv,
+                                  u64 num_bytes)
+{
+        struct btrfs_space_info *sinfo = block_rsv->space_info;
+        u64 in_use;
+        u64 avail = 0;
+        int granted = 0;
+        spin_lock(&sinfo->lock);
+        in_use = sinfo->bytes_used + sinfo->bytes_reserved +
+                 sinfo->bytes_pinned + sinfo->bytes_readonly;
+        if (in_use < sinfo->total_bytes)
+                avail = sinfo->total_bytes - in_use;
+        if (avail >= num_bytes) {
+                if (block_rsv->priority >= 10)
+                        granted = 1;
+                else if ((avail + block_rsv->reserved) *
+                         block_rsv->priority >=
+                         (num_bytes + block_rsv->reserved) * 10)
+                        granted = 1;
+        }
+        if (granted)
+                sinfo->bytes_reserved += num_bytes;
+        spin_unlock(&sinfo->lock);
+        return granted ? 0 : -ENOSPC;
+}
+/*
+ * Pick the block reservation to charge for @root: the transaction's one
+ * when root->ref_cows is set, the root's own otherwise.  Falls back to
+ * fs_info->empty_block_rsv when the chosen pointer is NULL, so the result
+ * is never NULL.
+ */
+static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans,
+                                             struct btrfs_root *root)
+{
+        struct btrfs_block_rsv *rsv;
+        rsv = root->ref_cows ? trans->block_rsv : root->block_rsv;
+        return rsv ? rsv : &root->fs_info->empty_block_rsv;
+}
+/*
+ * Consume @num_bytes from @block_rsv->reserved.
+ *
+ * Returns 0 on success, -ENOSPC (leaving the reservation untouched) when
+ * fewer than @num_bytes are reserved.  Clears ->full once reserved drops
+ * below ->size.
+ */
+static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
+                               u64 num_bytes)
+{
+        int ret = 0;
+        spin_lock(&block_rsv->lock);
+        if (block_rsv->reserved < num_bytes) {
+                ret = -ENOSPC;
+        } else {
+                block_rsv->reserved -= num_bytes;
+                if (block_rsv->reserved < block_rsv->size)
+                        block_rsv->full = 0;
+        }
+        spin_unlock(&block_rsv->lock);
+        return ret;
+}
+/*
+ * Add @num_bytes to @block_rsv->reserved.
+ *
+ * With @update_size set the target ->size grows by the same amount;
+ * otherwise ->full is set once reserved has reached ->size.
+ */
+static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
+                                u64 num_bytes, int update_size)
+{
+        spin_lock(&block_rsv->lock);
+        block_rsv->reserved += num_bytes;
+        if (!update_size) {
+                if (block_rsv->reserved >= block_rsv->size)
+                        block_rsv->full = 1;
+        } else {
+                block_rsv->size += num_bytes;
+        }
+        spin_unlock(&block_rsv->lock);
+}
+/*
+ * Shrink @block_rsv->size by @num_bytes ((u64)-1 means the whole size) and
+ * hand back whatever is then reserved beyond the new size.
+ *
+ * The excess goes to @dest if one is given, otherwise it is subtracted
+ * from the space info's bytes_reserved.
+ */
+void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
+                             struct btrfs_block_rsv *dest, u64 num_bytes)
+{
+        struct btrfs_space_info *sinfo = block_rsv->space_info;
+        u64 excess = 0;
+        spin_lock(&block_rsv->lock);
+        if (num_bytes == (u64)-1)
+                num_bytes = block_rsv->size;
+        block_rsv->size -= num_bytes;
+        if (block_rsv->reserved >= block_rsv->size) {
+                excess = block_rsv->reserved - block_rsv->size;
+                block_rsv->reserved = block_rsv->size;
+                block_rsv->full = 1;
+        }
+        spin_unlock(&block_rsv->lock);
+        if (excess == 0)
+                return;
+        if (dest) {
+                block_rsv_add_bytes(dest, excess, 0);
+                return;
+        }
+        spin_lock(&sinfo->lock);
+        sinfo->bytes_reserved -= excess;
+        spin_unlock(&sinfo->lock);
+}
+/*
+ * Move @num_bytes of reserved space from @src to @dst, growing @dst's
+ * size by the same amount.  Returns 0 on success or the error from
+ * block_rsv_use_bytes() when @src does not hold enough.
+ */
+static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src,
+                                   struct btrfs_block_rsv *dst, u64 num_bytes)
+{
+        int ret = block_rsv_use_bytes(src, num_bytes);
+        if (!ret)
+                block_rsv_add_bytes(dst, num_bytes, 1);
+        return ret;
+}
+/*
+ * Put @rsv into its initial state: everything zeroed, usage count 1,
+ * priority 6, lock and list head initialised.
+ */
+void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv)
+{
+        /* must come first: it wipes every field set below */
+        memset(rsv, 0, sizeof(*rsv));
+        rsv->priority = 6;
+        atomic_set(&rsv->usage, 1);
+        INIT_LIST_HEAD(&rsv->list);
+        spin_lock_init(&rsv->lock);
+}
+/*
+ * Allocate and initialise a block reservation bound to the metadata
+ * space info of @root's filesystem.
+ *
+ * Returns the new reservation, or NULL if the allocation failed.
+ */
+struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
+{
+        struct btrfs_block_rsv *block_rsv;
+        struct btrfs_fs_info *fs_info = root->fs_info;
+        block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
+        if (!block_rsv)
+                return NULL;
+        btrfs_init_block_rsv(block_rsv);
+        /*
+         * The alloc profile used to be looked up here into a local that
+         * was never read; the lookup has been dropped.
+         */
+        block_rsv->space_info = __find_space_info(fs_info,
+                                                  BTRFS_BLOCK_GROUP_METADATA);
+        return block_rsv;
+}
+/*
+ * Drop one usage reference on @rsv (NULL is accepted).  When the last
+ * reference goes away the whole reservation is released, and the
+ * structure is kfree()d unless it is marked durable.
+ */
+void btrfs_free_block_rsv(struct btrfs_root *root,
+                          struct btrfs_block_rsv *rsv)
+{
+        if (!rsv || !atomic_dec_and_test(&rsv->usage))
+                return;
+        btrfs_block_rsv_release(root, rsv, (u64)-1);
+        if (!rsv->durable)
+                kfree(rsv);
+}
+/*
+ * Make the block_rsv struct able to capture freed space.  The captured
+ * space is added back to the block_rsv struct after transaction commit.
+ *
+ * Marks @block_rsv durable and links it onto
+ * fs_info->durable_block_rsv_list under durable_block_rsv_mutex.
+ */
+void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
+                                 struct btrfs_block_rsv *block_rsv)
+{
+        struct list_head *durable_list = &fs_info->durable_block_rsv_list;
+        block_rsv->durable = 1;
+        mutex_lock(&fs_info->durable_block_rsv_mutex);
+        list_add_tail(&block_rsv->list, durable_list);
+        mutex_unlock(&fs_info->durable_block_rsv_mutex);
+}
+/*
+ * Reserve @num_bytes of metadata space and add them to @block_rsv,
+ * growing its size accordingly.
+ *
+ * On failure should_retry_reserve() is asked whether to try again;
+ * @retries is passed through to it.  Returns 0 on success (also for a
+ * zero-byte request) or the non-positive value should_retry_reserve()
+ * gave up with.
+ */
+int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
+                        struct btrfs_root *root,
+                        struct btrfs_block_rsv *block_rsv,
+                        u64 num_bytes, int *retries)
+{
+        int ret;
+        if (num_bytes == 0)
+                return 0;
+        for (;;) {
+                ret = reserve_metadata_bytes(block_rsv, num_bytes);
+                if (!ret) {
+                        block_rsv_add_bytes(block_rsv, num_bytes, 1);
+                        return 0;
+                }
+                ret = should_retry_reserve(trans, root, block_rsv,
+                                           num_bytes, retries);
+                if (ret <= 0)
+                        return ret;
+        }
+}
+/*
+ * Make sure @block_rsv holds at least max(@min_reserved,
+ * div_factor(size, @min_factor)) reserved bytes.
+ *
+ * If it does not, the shortfall is refilled from the space info when the
+ * reservation has refill_used set.  Failing that, a durable reservation
+ * whose freed[] counters cover the shortfall triggers a transaction
+ * commit so that the freed space comes back.
+ *
+ * Returns 0 if the reservation is (or was made) sufficient or @block_rsv
+ * is NULL, -EAGAIN if a commit is required but the caller already holds
+ * @trans, the commit's error code if the commit failed, and -ENOSPC
+ * (after a warning) when nothing can be done.
+ */
+int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
+                          struct btrfs_root *root,
+                          struct btrfs_block_rsv *block_rsv,
+                          u64 min_reserved, int min_factor)
+{
+        u64 num_bytes = 0;
+        int commit_trans = 0;
+        int ret = -ENOSPC;
+        if (!block_rsv)
+                return 0;
+        spin_lock(&block_rsv->lock);
+        if (min_factor > 0)
+                num_bytes = div_factor(block_rsv->size, min_factor);
+        if (min_reserved > num_bytes)
+                num_bytes = min_reserved;
+        if (block_rsv->reserved >= num_bytes) {
+                ret = 0;
+        } else {
+                /* from here on num_bytes is the shortfall */
+                num_bytes -= block_rsv->reserved;
+                if (block_rsv->durable &&
+                    block_rsv->freed[0] + block_rsv->freed[1] >= num_bytes)
+                        commit_trans = 1;
+        }
+        spin_unlock(&block_rsv->lock);
+        if (!ret)
+                return 0;
+        if (block_rsv->refill_used) {
+                ret = reserve_metadata_bytes(block_rsv, num_bytes);
+                if (!ret) {
+                        block_rsv_add_bytes(block_rsv, num_bytes, 0);
+                        return 0;
+                }
+        }
+        if (commit_trans) {
+                if (trans)
+                        return -EAGAIN;
+                trans = btrfs_join_transaction(root, 1);
+                BUG_ON(IS_ERR(trans));
+                /*
+                 * Report a failed commit instead of claiming success: the
+                 * freed space this path counts on only comes back once the
+                 * commit has gone through.
+                 */
+                return btrfs_commit_transaction(trans, root);
+        }
+        WARN_ON(1);
+        printk(KERN_INFO"block_rsv size %llu reserved %llu freed %llu %llu\n",
+                block_rsv->size, block_rsv->reserved,
+                block_rsv->freed[0], block_rsv->freed[1]);
+        return -ENOSPC;
+}
+/*
+ * Exported wrapper around block_rsv_migrate_bytes(): moves @num_bytes of
+ * reserved space from @src_rsv to @dst_rsv and returns that helper's
+ * result unchanged.
+ */
+int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
+                            struct btrfs_block_rsv *dst_rsv,
+                            u64 num_bytes)
+{
+        return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
+}
+/*
+ * Release @num_bytes from @block_rsv.
+ *
+ * Excess reserved bytes are passed on to the global block reservation
+ * when it is not full, is not @block_rsv itself and shares its space
+ * info; otherwise they are returned to the space info.
+ */
+void btrfs_block_rsv_release(struct btrfs_root *root,
+                             struct btrfs_block_rsv *block_rsv,
+                             u64 num_bytes)
+{
+        struct btrfs_block_rsv *global = &root->fs_info->global_block_rsv;
+        struct btrfs_block_rsv *dest = NULL;
+        if (!global->full && global != block_rsv &&
+            block_rsv->space_info == global->space_info)
+                dest = global;
+        block_rsv_release_bytes(block_rsv, dest, num_bytes);
+}
+/*
+ * Helper to calculate the size of the global block reservation.
+ * The desired value is the sum of the space used by the extent tree,
+ * checksum tree and root tree; since per-tree accounting is not relied
+ * on (see below), it is estimated from the bytes used in the data and
+ * metadata space infos and capped at a third of the metadata bytes used.
+ *
+ * The result is rounded up with ALIGN() to leafsize << 10.
+ */
+static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
+{
+        struct btrfs_space_info *sinfo;
+        u64 num_bytes;
+        u64 meta_used;
+        u64 data_used;
+        int csum_size = btrfs_super_csum_size(&fs_info->super_copy);
+#if 0
+        /*
+         * per-tree used space accounting can be inaccurate, so we
+         * can't rely on it.
+         */
+        spin_lock(&fs_info->extent_root->accounting_lock);
+        num_bytes = btrfs_root_used(&fs_info->extent_root->root_item);
+        spin_unlock(&fs_info->extent_root->accounting_lock);
+        spin_lock(&fs_info->csum_root->accounting_lock);
+        num_bytes += btrfs_root_used(&fs_info->csum_root->root_item);
+        spin_unlock(&fs_info->csum_root->accounting_lock);
+        spin_lock(&fs_info->tree_root->accounting_lock);
+        num_bytes += btrfs_root_used(&fs_info->tree_root->root_item);
+        spin_unlock(&fs_info->tree_root->accounting_lock);
+#endif
+        /* sample the used byte counts, each under its own lock */
+        sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
+        spin_lock(&sinfo->lock);
+        data_used = sinfo->bytes_used;
+        spin_unlock(&sinfo->lock);
+        sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
+        spin_lock(&sinfo->lock);
+        meta_used = sinfo->bytes_used;
+        spin_unlock(&sinfo->lock);
+        /* two checksums' worth per data block, plus 1/50 of all used bytes */
+        num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) *
+                    csum_size * 2;
+        num_bytes += div64_u64(data_used + meta_used, 50);
+        /* never ask for more than a third of the metadata in use */
+        if (num_bytes * 3 > meta_used)
+                num_bytes = div64_u64(meta_used, 3);
+        return ALIGN(num_bytes, fs_info->extent_root->leafsize << 10);
+}
+/*
+ * Resize the global block reservation to calc_global_metadata_size() and
+ * top it up from whatever is unaccounted in its space info.  Anything
+ * reserved beyond the new size is handed back to the space info and the
+ * reservation is marked full.
+ *
+ * Lock order: block_rsv->lock, then space_info->lock.
+ */
+static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
+{
+        struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
+        struct btrfs_space_info *sinfo = block_rsv->space_info;
+        u64 target = calc_global_metadata_size(fs_info);
+        u64 accounted;
+        u64 delta;
+        spin_lock(&block_rsv->lock);
+        spin_lock(&sinfo->lock);
+        block_rsv->size = target;
+        accounted = sinfo->bytes_used + sinfo->bytes_pinned +
+                    sinfo->bytes_reserved + sinfo->bytes_readonly;
+        if (sinfo->total_bytes > accounted) {
+                /* grab every byte that nobody has accounted for yet */
+                delta = sinfo->total_bytes - accounted;
+                block_rsv->reserved += delta;
+                sinfo->bytes_reserved += delta;
+        }
+        if (block_rsv->reserved >= block_rsv->size) {
+                /* give back what exceeds the target size */
+                delta = block_rsv->reserved - block_rsv->size;
+                sinfo->bytes_reserved -= delta;
+                block_rsv->reserved = block_rsv->size;
+                block_rsv->full = 1;
+        }
+#if 0
+        printk(KERN_INFO"global block rsv size %llu reserved %llu\n",
+                block_rsv->size, block_rsv->reserved);
+#endif
+        spin_unlock(&sinfo->lock);
+        spin_unlock(&block_rsv->lock);
+}
+/*
+ * Wire up the filesystem-wide block reservations: bind each one to its
+ * space info (SYSTEM for the chunk reservation, METADATA for the rest),
+ * point the core tree roots at the reservation they draw from, register
+ * the global and delalloc reservations as durable, and size the global
+ * reservation.
+ */
+static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
+{
+        struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
+        struct btrfs_block_rsv *chunk_rsv = &fs_info->chunk_block_rsv;
+        struct btrfs_space_info *meta_sinfo;
+        chunk_rsv->space_info = __find_space_info(fs_info,
+                                                  BTRFS_BLOCK_GROUP_SYSTEM);
+        chunk_rsv->priority = 10;
+        meta_sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
+        global_rsv->space_info = meta_sinfo;
+        global_rsv->priority = 10;
+        global_rsv->refill_used = 1;
+        fs_info->delalloc_block_rsv.space_info = meta_sinfo;
+        fs_info->trans_block_rsv.space_info = meta_sinfo;
+        fs_info->empty_block_rsv.space_info = meta_sinfo;
+        fs_info->empty_block_rsv.priority = 10;
+        /* every core tree except the chunk tree uses the global rsv */
+        fs_info->extent_root->block_rsv = global_rsv;
+        fs_info->csum_root->block_rsv = global_rsv;
+        fs_info->dev_root->block_rsv = global_rsv;
+        fs_info->tree_root->block_rsv = global_rsv;
+        fs_info->chunk_root->block_rsv = chunk_rsv;
+        btrfs_add_durable_block_rsv(fs_info, global_rsv);
+        btrfs_add_durable_block_rsv(fs_info, &fs_info->delalloc_block_rsv);
+        update_global_block_rsv(fs_info);
+}
+/*
+ * Release the entire global block reservation back to its space info and
+ * warn if the delalloc, trans or chunk reservations still have a
+ * non-zero size or reserved count.
+ */
+static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
+{
+        struct btrfs_block_rsv *delalloc_rsv = &fs_info->delalloc_block_rsv;
+        struct btrfs_block_rsv *trans_rsv = &fs_info->trans_block_rsv;
+        struct btrfs_block_rsv *chunk_rsv = &fs_info->chunk_block_rsv;
+        block_rsv_release_bytes(&fs_info->global_block_rsv, NULL, (u64)-1);
+        WARN_ON(delalloc_rsv->size > 0);
+        WARN_ON(delalloc_rsv->reserved > 0);
+        WARN_ON(trans_rsv->size > 0);
+        WARN_ON(trans_rsv->reserved > 0);
+        WARN_ON(chunk_rsv->size > 0);
+        WARN_ON(chunk_rsv->reserved > 0);
+}
+/*
+ * Metadata bytes to reserve for touching @num_items items in @root:
+ * per item, three times (one leaf plus BTRFS_MAX_LEVEL - 1 nodes).
+ *
+ * NOTE(review): the product is evaluated in the types of leafsize,
+ * nodesize and num_items before being widened to the u64 return type --
+ * confirm it cannot overflow for large num_items.
+ */
+static u64 calc_trans_metadata_size(struct btrfs_root *root, int num_items)
+{
+        return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) *
+                3 * num_items;
+}
+/*
+ * Reserve metadata space for @num_items items on behalf of @trans.
+ *
+ * Nothing is reserved for a zero item count or for the chunk root.  On
+ * success the bytes are recorded in trans->bytes_reserved and the
+ * transaction is pointed at fs_info->trans_block_rsv.
+ *
+ * Returns 0 on success or the error from btrfs_block_rsv_add().
+ */
+int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
+                                 struct btrfs_root *root,
+                                 int num_items, int *retries)
+{
+        struct btrfs_block_rsv *trans_rsv = &root->fs_info->trans_block_rsv;
+        u64 num_bytes;
+        int ret;
+        if (num_items == 0 || root->fs_info->chunk_root == root)
+                return 0;
+        num_bytes = calc_trans_metadata_size(root, num_items);
+        ret = btrfs_block_rsv_add(trans, root, trans_rsv, num_bytes, retries);
+        if (ret)
+                return ret;
+        trans->bytes_reserved += num_bytes;
+        trans->block_rsv = trans_rsv;
+        return 0;
+}
+/*
+ * Give back whatever btrfs_trans_reserve_metadata() recorded in
+ * trans->bytes_reserved.  A handle with nothing reserved is left alone;
+ * otherwise its block_rsv must be fs_info->trans_block_rsv.
+ */
+void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
+                                  struct btrfs_root *root)
+{
+        if (trans->bytes_reserved) {
+                BUG_ON(trans->block_rsv != &root->fs_info->trans_block_rsv);
+                btrfs_block_rsv_release(root, trans->block_rsv,
+                                        trans->bytes_reserved);
+                trans->bytes_reserved = 0;
+        }
+}
+/*
+ * Move four items' worth of metadata reservation from the reservation
+ * get_block_rsv() selects for @trans into the root's orphan_block_rsv.
+ *
+ * The four units are: one for deleting the orphan item, one for updating
+ * the inode and two for calling btrfs_truncate_inode_items.
+ *
+ * btrfs_truncate_inode_items is a delete operation; in most cases it
+ * frees more space than it uses, so two units should be enough for
+ * calling it many times.  If all of the metadata space is used, the
+ * transaction can be committed and the space it freed used instead.
+ *
+ * Returns the result of block_rsv_migrate_bytes().
+ */
+int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
+                                  struct inode *inode)
+{
+        struct btrfs_root *root = BTRFS_I(inode)->root;
+        struct btrfs_block_rsv *from = get_block_rsv(trans, root);
+        struct btrfs_block_rsv *to = root->orphan_block_rsv;
+        u64 num_bytes = calc_trans_metadata_size(root, 4);
+        return block_rsv_migrate_bytes(from, to, num_bytes);
+}
+/*
+ * Release four items' worth of metadata from the orphan block
+ * reservation of @inode's root -- the same amount
+ * btrfs_orphan_reserve_metadata() migrates in.
+ */
+void btrfs_orphan_release_metadata(struct inode *inode)
+{
+        struct btrfs_root *root = BTRFS_I(inode)->root;
+        btrfs_block_rsv_release(root, root->orphan_block_rsv,
+                                calc_trans_metadata_size(root, 4));
+}
+/*
+ * Move five items' worth of metadata reservation into the pending
+ * snapshot's own block_rsv: two for root back/forward refs, two for
+ * directory entries and one for the root of the snapshot.
+ *
+ * The destination inherits the source reservation's space info.
+ * Returns the result of block_rsv_migrate_bytes().
+ */
+int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
+                                struct btrfs_pending_snapshot *pending)
+{
+        struct btrfs_root *root = pending->root;
+        struct btrfs_block_rsv *from = get_block_rsv(trans, root);
+        struct btrfs_block_rsv *to = &pending->block_rsv;
+        u64 num_bytes = calc_trans_metadata_size(root, 5);
+        to->space_info = from->space_info;
+        return block_rsv_migrate_bytes(from, to, num_bytes);
+}
+/*
+ * Metadata bytes set aside for checksums of @num_bytes of data: one
+ * eighth of the data size.  @inode is currently unused.
+ */
+static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes)
+{
+        return num_bytes >> 3;
+}
+/*
+ * Reserve metadata space for a delalloc write of @num_bytes to @inode.
+ *
+ * The reservation covers checksum space for the sector-aligned size plus
+ * space for every outstanding extent (counting the one being added) that
+ * is not yet covered by inode->reserved_extents.  On failure
+ * should_retry_reserve() decides whether to try again.
+ *
+ * Returns 0 on success, otherwise the non-positive value
+ * should_retry_reserve() gave up with.
+ */
+int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
+{
+        struct btrfs_root *root = BTRFS_I(inode)->root;
+        struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
+        u64 to_reserve;
+        int nr_extents;
+        int retries = 0;
+        int ret;
+        if (btrfs_transaction_in_commit(root->fs_info))
+                schedule_timeout(1);
+        num_bytes = ALIGN(num_bytes, root->sectorsize);
+        for (;;) {
+                spin_lock(&BTRFS_I(inode)->accounting_lock);
+                to_reserve = 0;
+                nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents) + 1;
+                if (nr_extents > BTRFS_I(inode)->reserved_extents) {
+                        nr_extents -= BTRFS_I(inode)->reserved_extents;
+                        to_reserve = calc_trans_metadata_size(root, nr_extents);
+                } else {
+                        nr_extents = 0;
+                }
+                to_reserve += calc_csum_metadata_size(inode, num_bytes);
+                ret = reserve_metadata_bytes(block_rsv, to_reserve);
+                if (!ret)
+                        break;
+                /* drop the lock before trying to make room */
+                spin_unlock(&BTRFS_I(inode)->accounting_lock);
+                ret = should_retry_reserve(NULL, root, block_rsv, to_reserve,
+                                           &retries);
+                if (ret <= 0)
+                        return ret;
+        }
+        /* accounting_lock is still held from the successful pass */
+        BTRFS_I(inode)->reserved_extents += nr_extents;
+        atomic_inc(&BTRFS_I(inode)->outstanding_extents);
+        spin_unlock(&BTRFS_I(inode)->accounting_lock);
+        block_rsv_add_bytes(block_rsv, to_reserve, 1);
+        /* keep the delalloc reservation from growing past 512MB */
+        if (block_rsv->size > 512 * 1024 * 1024)
+                shrink_delalloc(NULL, root, to_reserve);
+        return 0;
+}
+/*
+ * Undo btrfs_delalloc_reserve_metadata() for @num_bytes of @inode.
+ *
+ * Drops one outstanding extent, trims inode->reserved_extents down to
+ * the remaining outstanding count, and releases the checksum space plus
+ * the space of the extents trimmed from the delalloc block reservation.
+ */
+void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
+{
+        struct btrfs_root *root = BTRFS_I(inode)->root;
+        u64 to_free;
+        int outstanding;
+        int surplus = 0;
+        num_bytes = ALIGN(num_bytes, root->sectorsize);
+        atomic_dec(&BTRFS_I(inode)->outstanding_extents);
+        spin_lock(&BTRFS_I(inode)->accounting_lock);
+        outstanding = atomic_read(&BTRFS_I(inode)->outstanding_extents);
+        if (outstanding < BTRFS_I(inode)->reserved_extents) {
+                surplus = BTRFS_I(inode)->reserved_extents - outstanding;
+                BTRFS_I(inode)->reserved_extents -= surplus;
+        }
+        spin_unlock(&BTRFS_I(inode)->accounting_lock);
+        to_free = calc_csum_metadata_size(inode, num_bytes);
+        if (surplus > 0)
+                to_free += calc_trans_metadata_size(root, surplus);
+        btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
+                                to_free);
+}
+/*
+ * Reserve both data and metadata space for a delalloc write of
+ * @num_bytes to @inode.  If the metadata reservation fails, the data
+ * reservation made just before is rolled back.
+ *
+ * Returns 0 on success or the error from whichever step failed.
+ */
+int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
+{
+        int ret = btrfs_check_data_free_space(inode, num_bytes);
+        if (ret)
+                return ret;
+        ret = btrfs_delalloc_reserve_metadata(inode, num_bytes);
+        if (ret)
+                btrfs_free_reserved_data_space(inode, num_bytes);
+        return ret;
+}
+/*
+ * Counterpart of btrfs_delalloc_reserve_space(): releases the metadata
+ * reservation first, then the reserved data space, for @num_bytes of
+ * @inode.
+ */
+void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
+{
+        btrfs_delalloc_release_metadata(inode, num_bytes);
+        btrfs_free_reserved_data_space(inode, num_bytes);
+}
 static int update_block_group(struct btrfs_trans_handle *trans,
                              struct btrfs_root *root,
-                              u64 bytenr, u64 num_bytes, int alloc,
+                              u64 bytenr, u64 num_bytes, int alloc)
-                              int mark_free)
 {
        struct btrfs_block_group_cache *cache;
        struct btrfs_fs_info *info = root->fs_info;
+        int factor;
        u64 total = num_bytes;
        u64 old_val;
        u64 byte_in_group;
@@ -3486,6 +3798,12 @@ static int update_block_group(struct btrfs_trans_handle *trans,
                cache = btrfs_lookup_block_group(info, bytenr);
                if (!cache)
                        return -1;
+                if (cache->flags & (BTRFS_BLOCK_GROUP_DUP |
+                                    BTRFS_BLOCK_GROUP_RAID1 |
+                                    BTRFS_BLOCK_GROUP_RAID10))
+                        factor = 2;
+                else
+                        factor = 1;
                byte_in_group = bytenr - cache->key.objectid;
                WARN_ON(byte_in_group > cache->key.offset);
@@ -3498,31 +3816,24 @@ static int update_block_group(struct btrfs_trans_handle *trans,
                        old_val += num_bytes;
                        btrfs_set_block_group_used(&cache->item, old_val);
                        cache->reserved -= num_bytes;
-                        cache->space_info->bytes_used += num_bytes;
                        cache->space_info->bytes_reserved -= num_bytes;
-                        if (cache->ro)
+                        cache->space_info->bytes_used += num_bytes;
-                                cache->space_info->bytes_readonly -= num_bytes;
+                        cache->space_info->disk_used += num_bytes * factor;
                        spin_unlock(&cache->lock);
                        spin_unlock(&cache->space_info->lock);
                } else {
                        old_val -= num_bytes;
-                        cache->space_info->bytes_used -= num_bytes;
-                        if (cache->ro)
-                                cache->space_info->bytes_readonly += num_bytes;
                        btrfs_set_block_group_used(&cache->item, old_val);
+                        cache->pinned += num_bytes;
+                        cache->space_info->bytes_pinned += num_bytes;
+                        cache->space_info->bytes_used -= num_bytes;
+                        cache->space_info->disk_used -= num_bytes * factor;
                        spin_unlock(&cache->lock);
                        spin_unlock(&cache->space_info->lock);
-                        if (mark_free) {
-                                int ret;
-                                ret = btrfs_discard_extent(root, bytenr,
+                        set_extent_dirty(info->pinned_extents,
-                                                           num_bytes);
+                                         bytenr, bytenr + num_bytes - 1,
-                                WARN_ON(ret);
+                                         GFP_NOFS | __GFP_NOFAIL);
-                                ret = btrfs_add_free_space(cache, bytenr,
-                                                           num_bytes);
-                                WARN_ON(ret);
-                        }
                }
                btrfs_put_block_group(cache);
                total -= num_bytes;
@@ -3546,18 +3857,10 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
        return bytenr;
 }
-/*
+static int pin_down_extent(struct btrfs_root *root,
- * this function must be called within transaction
+                           struct btrfs_block_group_cache *cache,
- */
+                           u64 bytenr, u64 num_bytes, int reserved)
-int btrfs_pin_extent(struct btrfs_root *root,
-                     u64 bytenr, u64 num_bytes, int reserved)
 {
-        struct btrfs_fs_info *fs_info = root->fs_info;
-        struct btrfs_block_group_cache *cache;
-        cache = btrfs_lookup_block_group(fs_info, bytenr);
-        BUG_ON(!cache);
        spin_lock(&cache->space_info->lock);
        spin_lock(&cache->lock);
        cache->pinned += num_bytes;
@@ -3569,28 +3872,68 @@ int btrfs_pin_extent(struct btrfs_root *root,
        spin_unlock(&cache->lock);
        spin_unlock(&cache->space_info->lock);
-        btrfs_put_block_group(cache);
+        set_extent_dirty(root->fs_info->pinned_extents, bytenr,
+                         bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
+        return 0;
+}
+/*
+ * this function must be called within transaction
+ */
+int btrfs_pin_extent(struct btrfs_root *root,
+                     u64 bytenr, u64 num_bytes, int reserved)
+{
+        struct btrfs_block_group_cache *cache;
+        cache = btrfs_lookup_block_group(root->fs_info, bytenr);
+        BUG_ON(!cache);
+        pin_down_extent(root, cache, bytenr, num_bytes, reserved);
-        set_extent_dirty(fs_info->pinned_extents,
+        btrfs_put_block_group(cache);
-                         bytenr, bytenr + num_bytes - 1, GFP_NOFS);
        return 0;
 }
-static int update_reserved_extents(struct btrfs_block_group_cache *cache,
+/*
-                                   u64 num_bytes, int reserve)
+ * update size of reserved extents. this function may return -EAGAIN
+ * if 'reserve' is true or 'sinfo' is false.
+ */
+static int update_reserved_bytes(struct btrfs_block_group_cache *cache,
+                                 u64 num_bytes, int reserve, int sinfo)
 {
-        spin_lock(&cache->space_info->lock);
+        int ret = 0;
-        spin_lock(&cache->lock);
+        if (sinfo) {
-        if (reserve) {
+                struct btrfs_space_info *space_info = cache->space_info;
-                cache->reserved += num_bytes;
+                spin_lock(&space_info->lock);
-                cache->space_info->bytes_reserved += num_bytes;
+                spin_lock(&cache->lock);
+                if (reserve) {
+                        if (cache->ro) {
+                                ret = -EAGAIN;
+                        } else {
+                                cache->reserved += num_bytes;
+                                space_info->bytes_reserved += num_bytes;
+                        }
+                } else {
+                        if (cache->ro)
+                                space_info->bytes_readonly += num_bytes;
+                        cache->reserved -= num_bytes;
+                        space_info->bytes_reserved -= num_bytes;
+                }
+                spin_unlock(&cache->lock);
+                spin_unlock(&space_info->lock);
        } else {
-                cache->reserved -= num_bytes;
+                spin_lock(&cache->lock);
-                cache->space_info->bytes_reserved -= num_bytes;
+                if (cache->ro) {
+                        ret = -EAGAIN;
+                } else {
+                        if (reserve)
+                                cache->reserved += num_bytes;
+                        else
+                                cache->reserved -= num_bytes;
+                }
+                spin_unlock(&cache->lock);
        }
-        spin_unlock(&cache->lock);
+        return ret;
-        spin_unlock(&cache->space_info->lock);
-        return 0;
 }
 int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
@@ -3621,6 +3964,8 @@ int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
                fs_info->pinned_extents = &fs_info->freed_extents[0];
        up_write(&fs_info->extent_commit_sem);
+        update_global_block_rsv(fs_info);
        return 0;
 }
@@ -3647,14 +3992,21 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
                        btrfs_add_free_space(cache, start, len);
                }
+                start += len;
                spin_lock(&cache->space_info->lock);
                spin_lock(&cache->lock);
                cache->pinned -= len;
                cache->space_info->bytes_pinned -= len;
+                if (cache->ro) {
+                        cache->space_info->bytes_readonly += len;
+                } else if (cache->reserved_pinned > 0) {
+                        len = min(len, cache->reserved_pinned);
+                        cache->reserved_pinned -= len;
+                        cache->space_info->bytes_reserved += len;
+                }
                spin_unlock(&cache->lock);
                spin_unlock(&cache->space_info->lock);
-                start += len;
        }
        if (cache)
@@ -3667,8 +4019,11 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 {
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct extent_io_tree *unpin;
+        struct btrfs_block_rsv *block_rsv;
+        struct btrfs_block_rsv *next_rsv;
        u64 start;
        u64 end;
+        int idx;
        int ret;
        if (fs_info->pinned_extents == &fs_info->freed_extents[0])
@@ -3689,59 +4044,30 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
                cond_resched();
        }
-        return ret;
+        mutex_lock(&fs_info->durable_block_rsv_mutex);
-}
+        list_for_each_entry_safe(block_rsv, next_rsv,
+                                 &fs_info->durable_block_rsv_list, list) {
-static int pin_down_bytes(struct btrfs_trans_handle *trans,
+                idx = trans->transid & 0x1;
-                          struct btrfs_root *root,
+                if (block_rsv->freed[idx] > 0) {
-                          struct btrfs_path *path,
+                        block_rsv_add_bytes(block_rsv,
-                          u64 bytenr, u64 num_bytes,
+                                            block_rsv->freed[idx], 0);
-                          int is_data, int reserved,
+                        block_rsv->freed[idx] = 0;
-                          struct extent_buffer **must_clean)
+                }
-{
+                if (atomic_read(&block_rsv->usage) == 0) {
-        int err = 0;
+                        btrfs_block_rsv_release(root, block_rsv, (u64)-1);
-        struct extent_buffer *buf;
-        if (is_data)
-                goto pinit;
-        /*
-         * discard is sloooow, and so triggering discards on
-         * individual btree blocks isn't a good plan.  Just
-         * pin everything in discard mode.
-         */
-        if (btrfs_test_opt(root, DISCARD))
-                goto pinit;
-        buf = btrfs_find_tree_block(root, bytenr, num_bytes);
-        if (!buf)
-                goto pinit;
-        /* we can reuse a block if it hasn't been written
+                        if (block_rsv->freed[0] == 0 &&
-         * and it is from this transaction.  We can't
+                            block_rsv->freed[1] == 0) {
-         * reuse anything from the tree log root because
+                                list_del_init(&block_rsv->list);
-         * it has tiny sub-transactions.
+                                kfree(block_rsv);
-         */
+                        }
-        if (btrfs_buffer_uptodate(buf, 0) &&
+                } else {
-            btrfs_try_tree_lock(buf)) {
+                        btrfs_block_rsv_release(root, block_rsv, 0);
-                u64 header_owner = btrfs_header_owner(buf);
-                u64 header_transid = btrfs_header_generation(buf);
-                if (header_owner != BTRFS_TREE_LOG_OBJECTID &&
-                    header_transid == trans->transid &&
-                    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
-                        *must_clean = buf;
-                        return 1;
                }
-                btrfs_tree_unlock(buf);
        }
-        free_extent_buffer(buf);
+        mutex_unlock(&fs_info->durable_block_rsv_mutex);
-pinit:
-        if (path)
-                btrfs_set_path_blocking(path);
-        /* unlocks the pinned mutex */
-        btrfs_pin_extent(root, bytenr, num_bytes, reserved);
-        BUG_ON(err < 0);
        return 0;
 }
@@ -3902,9 +4228,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                        BUG_ON(ret);
                }
        } else {
-                int mark_free = 0;
-                struct extent_buffer *must_clean = NULL;
                if (found_extent) {
                        BUG_ON(is_data && refs_to_drop !=
                               extent_data_ref_count(root, path, iref));
@@ -3917,31 +4240,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                        }
                }
-                ret = pin_down_bytes(trans, root, path, bytenr,
-                                     num_bytes, is_data, 0, &must_clean);
-                if (ret > 0)
-                        mark_free = 1;
-                BUG_ON(ret < 0);
-                /*
-                 * it is going to be very rare for someone to be waiting
-                 * on the block we're freeing.  del_items might need to
-                 * schedule, so rather than get fancy, just force it
-                 * to blocking here
-                 */
-                if (must_clean)
-                        btrfs_set_lock_blocking(must_clean);
                ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
                                      num_to_del);
                BUG_ON(ret);
                btrfs_release_path(extent_root, path);
-                if (must_clean) {
-                        clean_tree_block(NULL, root, must_clean);
-                        btrfs_tree_unlock(must_clean);
-                        free_extent_buffer(must_clean);
-                }
                if (is_data) {
                        ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
                        BUG_ON(ret);
@@ -3951,8 +4254,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
                             (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT);
                }
-                ret = update_block_group(trans, root, bytenr, num_bytes, 0,
+                ret = update_block_group(trans, root, bytenr, num_bytes, 0);
-                                         mark_free);
                BUG_ON(ret);
        }
        btrfs_free_path(path);
@@ -3960,7 +4262,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 }
 /*
- * when we free an extent, it is possible (and likely) that we free the last
+ * when we free a block, it is possible (and likely) that we free the last
 * delayed ref for that extent as well.  This searches the delayed ref tree for
 * a given extent, and if there are no other delayed refs to be processed, it
 * removes it from the tree.
@@ -3972,7 +4274,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
        struct btrfs_delayed_ref_root *delayed_refs;
        struct btrfs_delayed_ref_node *ref;
        struct rb_node *node;
-        int ret;
+        int ret = 0;
        delayed_refs = &trans->transaction->delayed_refs;
        spin_lock(&delayed_refs->lock);
@@ -4024,17 +4326,99 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
        list_del_init(&head->cluster);
        spin_unlock(&delayed_refs->lock);
-        ret = run_one_delayed_ref(trans, root->fs_info->tree_root,
+        BUG_ON(head->extent_op);
-                                  &head->node, head->extent_op,
+        if (head->must_insert_reserved)
-                                  head->must_insert_reserved);
+                ret = 1;
-        BUG_ON(ret);
+        mutex_unlock(&head->mutex);
        btrfs_put_delayed_ref(&head->node);
-        return 0;
+        return ret;
 out:
        spin_unlock(&delayed_refs->lock);
        return 0;
 }
+/*
+ * Drop @root's reference on tree block @buf and, when @last_ref is set,
+ * settle the space accounting for the block.
+ *
+ * For every root except the tree log a BTRFS_DROP_DELAYED_REF delayed
+ * tree ref is queued first.  If this was not the last reference nothing
+ * else happens.  Otherwise the bytes are either handed straight back
+ * (block created in this transaction and never written) or tracked as
+ * reserved_pinned and credited to the reservation's freed[] counter for
+ * this transaction.
+ *
+ * @parent is passed through to btrfs_add_delayed_tree_ref().
+ * No value is returned; a failure to queue the delayed ref hits BUG_ON().
+ */
+void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
+                           struct btrfs_root *root,
+                           struct extent_buffer *buf,
+                           u64 parent, int last_ref)
+{
+        struct btrfs_block_rsv *block_rsv;
+        struct btrfs_block_group_cache *cache = NULL;
+        int ret;
+        /* tree-log blocks get no delayed ref; everything else queues a drop */
+        if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
+                ret = btrfs_add_delayed_tree_ref(trans, buf->start, buf->len,
+                                                parent, root->root_key.objectid,
+                                                btrfs_header_level(buf),
+                                                BTRFS_DROP_DELAYED_REF, NULL);
+                BUG_ON(ret);
+        }
+        /* other references remain: no space accounting to do */
+        if (!last_ref)
+                return;
+        block_rsv = get_block_rsv(trans, root);
+        /* takes a block group reference, dropped at "out" */
+        cache = btrfs_lookup_block_group(root->fs_info, buf->start);
+        BUG_ON(block_rsv->space_info != cache->space_info);
+        /* block carries the generation of the running transaction */
+        if (btrfs_header_generation(buf) == trans->transid) {
+                if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
+                        /*
+                         * A zero return means check_ref_cleanup() did not
+                         * report a must_insert_reserved head, so the block
+                         * takes the pin accounting path instead.
+                         */
+                        ret = check_ref_cleanup(trans, root, buf->start);
+                        if (!ret)
+                                goto pin;
+                }
+                /* already written out: pin the extent, then account it */
+                if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
+                        pin_down_extent(root, cache, buf->start, buf->len, 1);
+                        goto pin;
+                }
+                /* never written: return the range to the free space cache */
+                WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
+                btrfs_add_free_space(cache, buf->start, buf->len);
+                ret = update_reserved_bytes(cache, buf->len, 0, 0);
+                if (ret == -EAGAIN) {
+                        /* block group became read-only */
+                        update_reserved_bytes(cache, buf->len, 0, 1);
+                        goto out;
+                }
+                /*
+                 * Give the bytes back to the reservation while it is below
+                 * its size; ret stays 1 when the reservation is not topped
+                 * up.
+                 */
+                ret = 1;
+                spin_lock(&block_rsv->lock);
+                if (block_rsv->reserved < block_rsv->size) {
+                        block_rsv->reserved += buf->len;
+                        ret = 0;
+                }
+                spin_unlock(&block_rsv->lock);
+                /* reservation did not take them: drop from bytes_reserved */
+                if (ret) {
+                        spin_lock(&cache->space_info->lock);
+                        cache->space_info->bytes_reserved -= buf->len;
+                        spin_unlock(&cache->space_info->lock);
+                }
+                goto out;
+        }
+pin:
+        /* only durable reservations on writable block groups are credited */
+        if (block_rsv->durable && !cache->ro) {
+                ret = 0;
+                spin_lock(&cache->lock);
+                /* re-check ro under cache->lock; the test above was unlocked */
+                if (!cache->ro) {
+                        cache->reserved_pinned += buf->len;
+                        ret = 1;
+                }
+                spin_unlock(&cache->lock);
+                if (ret) {
+                        /* freed[] slot is chosen by the low bit of the transid */
+                        spin_lock(&block_rsv->lock);
+                        block_rsv->freed[trans->transid & 0x1] += buf->len;
+                        spin_unlock(&block_rsv->lock);
+                }
+        }
+out:
+        btrfs_put_block_group(cache);
+}
 int btrfs_free_extent(struct btrfs_trans_handle *trans,
                      struct btrfs_root *root,
                      u64 bytenr, u64 num_bytes, u64 parent,
@@ -4056,8 +4440,6 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
                                        parent, root_objectid, (int)owner,
                                        BTRFS_DROP_DELAYED_REF, NULL);
                BUG_ON(ret);
-                ret = check_ref_cleanup(trans, root, bytenr);
-                BUG_ON(ret);
        } else {
                ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
                                        parent, root_objectid, owner,
@@ -4067,21 +4449,6 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
        return ret;
 }
-int btrfs_free_tree_block(struct btrfs_trans_handle *trans,
-                          struct btrfs_root *root,
-                          u64 bytenr, u32 blocksize,
-                          u64 parent, u64 root_objectid, int level)
-{
-        u64 used;
-        spin_lock(&root->node_lock);
-        used = btrfs_root_used(&root->root_item) - blocksize;
-        btrfs_set_root_used(&root->root_item, used);
-        spin_unlock(&root->node_lock);
-        return btrfs_free_extent(trans, root, bytenr, blocksize,
-                                 parent, root_objectid, level, 0);
-}
 static u64 stripe_align(struct btrfs_root *root, u64 val)
 {
        u64 mask = ((u64)root->stripesize - 1);
@@ -4134,6 +4501,22 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
        return 0;
 }
+/*
+ * Translate a block group's profile bits into the index used to pick one
+ * of the space_info->block_groups[] lists.
+ *
+ * The profile bits are tested in the order RAID10, RAID1, DUP, RAID0 and
+ * the first one that is set decides the result (0..3).  A block group
+ * with none of those bits set gets index 4.
+ */
+static int get_block_group_index(struct btrfs_block_group_cache *cache)
+{
+        if (cache->flags & BTRFS_BLOCK_GROUP_RAID10)
+                return 0;
+        if (cache->flags & BTRFS_BLOCK_GROUP_RAID1)
+                return 1;
+        if (cache->flags & BTRFS_BLOCK_GROUP_DUP)
+                return 2;
+        if (cache->flags & BTRFS_BLOCK_GROUP_RAID0)
+                return 3;
+        /* no profile bit matched */
+        return 4;
+}
 enum btrfs_loop_type {
        LOOP_FIND_IDEAL = 0,
        LOOP_CACHING_NOWAIT = 1,
@@ -4155,7 +4538,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
                                     u64 num_bytes, u64 empty_size,
                                     u64 search_start, u64 search_end,
                                     u64 hint_byte, struct btrfs_key *ins,
-                                     u64 exclude_start, u64 exclude_nr,
                                     int data)
 {
        int ret = 0;
@@ -4168,6 +4550,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
        struct btrfs_space_info *space_info;
        int last_ptr_loop = 0;
        int loop = 0;
+        int index = 0;
        bool found_uncached_bg = false;
        bool failed_cluster_refill = false;
        bool failed_alloc = false;
@@ -4237,6 +4620,7 @@ ideal_cache:
                                btrfs_put_block_group(block_group);
                                up_read(&space_info->groups_sem);
                        } else {
+                                index = get_block_group_index(block_group);
                                goto have_block_group;
                        }
                } else if (block_group) {
@@ -4245,7 +4629,8 @@ ideal_cache:
        }
 search:
        down_read(&space_info->groups_sem);
-        list_for_each_entry(block_group, &space_info->block_groups, list) {
+        list_for_each_entry(block_group, &space_info->block_groups[index],
+                            list) {
                u64 offset;
                int cached;
@@ -4436,23 +4821,22 @@ checks:
                        goto loop;
                }
-                if (exclude_nr > 0 &&
+                ins->objectid = search_start;
-                    (search_start + num_bytes > exclude_start &&
+                ins->offset = num_bytes;
-                     search_start < exclude_start + exclude_nr)) {
-                        search_start = exclude_start + exclude_nr;
+                if (offset < search_start)
+                        btrfs_add_free_space(block_group, offset,
+                                             search_start - offset);
+                BUG_ON(offset > search_start);
+                ret = update_reserved_bytes(block_group, num_bytes, 1,
+                                            (data & BTRFS_BLOCK_GROUP_DATA));
+                if (ret == -EAGAIN) {
                        btrfs_add_free_space(block_group, offset, num_bytes);
-                        /*
-                         * if search_start is still in this block group
-                         * then we just re-search this block group
-                         */
-                        if (search_start >= block_group->key.objectid &&
-                            search_start < (block_group->key.objectid +
-                                            block_group->key.offset))
-                                goto have_block_group;
                        goto loop;
                }
+                /* we are all good, lets return */
                ins->objectid = search_start;
                ins->offset = num_bytes;
@@ -4460,18 +4844,18 @@ checks:
                        btrfs_add_free_space(block_group, offset,
                                             search_start - offset);
                BUG_ON(offset > search_start);
-                update_reserved_extents(block_group, num_bytes, 1);
-                /* we are all good, lets return */
                break;
 loop:
                failed_cluster_refill = false;
                failed_alloc = false;
+                BUG_ON(index != get_block_group_index(block_group));
                btrfs_put_block_group(block_group);
        }
        up_read(&space_info->groups_sem);
+        if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
+                goto search;
        /* LOOP_FIND_IDEAL, only search caching/cached bg's, and don't wait for
         *                      for them to make caching progress.  Also
         *                      determine the best possible bg to cache
@@ -4485,6 +4869,7 @@ loop:
        if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE &&
            (found_uncached_bg || empty_size || empty_cluster ||
             allowed_chunk_alloc)) {
+                index = 0;
                if (loop == LOOP_FIND_IDEAL && found_uncached_bg) {
                        found_uncached_bg = false;
                        loop++;
@@ -4567,31 +4952,30 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
                            int dump_block_groups)
 {
        struct btrfs_block_group_cache *cache;
+        int index = 0;
        spin_lock(&info->lock);
        printk(KERN_INFO "space_info has %llu free, is %sfull\n",
               (unsigned long long)(info->total_bytes - info->bytes_used -
                                    info->bytes_pinned - info->bytes_reserved -
-                                    info->bytes_super),
+                                    info->bytes_readonly),
               (info->full) ? "" : "not ");
-        printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu,"
+        printk(KERN_INFO "space_info total=%llu, used=%llu, pinned=%llu, "
-               " may_use=%llu, used=%llu, root=%llu, super=%llu, reserved=%llu"
+               "reserved=%llu, may_use=%llu, readonly=%llu\n",
-               "\n",
               (unsigned long long)info->total_bytes,
+               (unsigned long long)info->bytes_used,
               (unsigned long long)info->bytes_pinned,
-               (unsigned long long)info->bytes_delalloc,
+               (unsigned long long)info->bytes_reserved,
               (unsigned long long)info->bytes_may_use,
-               (unsigned long long)info->bytes_used,
+               (unsigned long long)info->bytes_readonly);
-               (unsigned long long)info->bytes_root,
-               (unsigned long long)info->bytes_super,
-               (unsigned long long)info->bytes_reserved);
        spin_unlock(&info->lock);
        if (!dump_block_groups)
                return;
        down_read(&info->groups_sem);
-        list_for_each_entry(cache, &info->block_groups, list) {
+again:
+        list_for_each_entry(cache, &info->block_groups[index], list) {
                spin_lock(&cache->lock);
                printk(KERN_INFO "block group %llu has %llu bytes, %llu used "
                       "%llu pinned %llu reserved\n",
@@ -4603,6 +4987,8 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
                btrfs_dump_free_space(cache, bytes);
                spin_unlock(&cache->lock);
        }
+        if (++index < BTRFS_NR_RAID_TYPES)
+                goto again;
        up_read(&info->groups_sem);
 }
@@ -4628,9 +5014,8 @@ again:
        WARN_ON(num_bytes < root->sectorsize);
        ret = find_free_extent(trans, root, num_bytes, empty_size,
-                               search_start, search_end, hint_byte, ins,
+                               search_start, search_end, hint_byte,
-                               trans->alloc_exclude_start,
+                               ins, data);
-                               trans->alloc_exclude_nr, data);
        if (ret == -ENOSPC && num_bytes > min_alloc_size) {
                num_bytes = num_bytes >> 1;
@@ -4668,7 +5053,7 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
        ret = btrfs_discard_extent(root, start, len);
        btrfs_add_free_space(cache, start, len);
-        update_reserved_extents(cache, len, 0);
+        update_reserved_bytes(cache, len, 0, 1);
        btrfs_put_block_group(cache);
        return ret;
@@ -4731,8 +5116,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
        btrfs_mark_buffer_dirty(path->nodes[0]);
        btrfs_free_path(path);
-        ret = update_block_group(trans, root, ins->objectid, ins->offset,
+        ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
-                                 1, 0);
        if (ret) {
                printk(KERN_ERR "btrfs update block group failed for %llu "
                       "%llu\n", (unsigned long long)ins->objectid,
@@ -4792,8 +5176,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
        btrfs_mark_buffer_dirty(leaf);
        btrfs_free_path(path);
-        ret = update_block_group(trans, root, ins->objectid, ins->offset,
+        ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
-                                 1, 0);
        if (ret) {
                printk(KERN_ERR "btrfs update block group failed for %llu "
                       "%llu\n", (unsigned long long)ins->objectid,
@@ -4869,73 +5252,14 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
                put_caching_control(caching_ctl);
        }
-        update_reserved_extents(block_group, ins->offset, 1);
+        ret = update_reserved_bytes(block_group, ins->offset, 1, 1);
+        BUG_ON(ret);
        btrfs_put_block_group(block_group);
        ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
                                         0, owner, offset, ins, 1);
        return ret;
 }
-/*
- * finds a free extent and does all the dirty work required for allocation
- * returns the key for the extent through ins, and a tree buffer for
- * the first block of the extent through buf.
- *
- * returns 0 if everything worked, non-zero otherwise.
- */
-static int alloc_tree_block(struct btrfs_trans_handle *trans,
-                            struct btrfs_root *root,
-                            u64 num_bytes, u64 parent, u64 root_objectid,
-                            struct btrfs_disk_key *key, int level,
-                            u64 empty_size, u64 hint_byte, u64 search_end,
-                            struct btrfs_key *ins)
-{
-        int ret;
-        u64 flags = 0;
-        ret = btrfs_reserve_extent(trans, root, num_bytes, num_bytes,
-                                   empty_size, hint_byte, search_end,
-                                   ins, 0);
-        if (ret)
-                return ret;
-        if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
-                if (parent == 0)
-                        parent = ins->objectid;
-                flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
-        } else
-                BUG_ON(parent > 0);
-        if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
-                struct btrfs_delayed_extent_op *extent_op;
-                extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
-                BUG_ON(!extent_op);
-                if (key)
-                        memcpy(&extent_op->key, key, sizeof(extent_op->key));
-                else
-                        memset(&extent_op->key, 0, sizeof(extent_op->key));
-                extent_op->flags_to_set = flags;
-                extent_op->update_key = 1;
-                extent_op->update_flags = 1;
-                extent_op->is_data = 0;
-                ret = btrfs_add_delayed_tree_ref(trans, ins->objectid,
-                                        ins->offset, parent, root_objectid,
-                                        level, BTRFS_ADD_DELAYED_EXTENT,
-                                        extent_op);
-                BUG_ON(ret);
-        }
-        if (root_objectid == root->root_key.objectid) {
-                u64 used;
-                spin_lock(&root->node_lock);
-                used = btrfs_root_used(&root->root_item) + num_bytes;
-                btrfs_set_root_used(&root->root_item, used);
-                spin_unlock(&root->node_lock);
-        }
-        return ret;
-}
 struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
                                            struct btrfs_root *root,
                                            u64 bytenr, u32 blocksize,
@@ -4974,8 +5298,45 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
        return buf;
 }
+/*
+ * Charge @blocksize bytes for a new tree block against the reservation
+ * that get_block_rsv() selects for @trans and @root.
+ *
+ * A reservation whose size is 0 is charged through
+ * reserve_metadata_bytes(); any other reservation is charged through
+ * block_rsv_use_bytes().
+ *
+ * Returns the reservation that was charged, ERR_PTR() of the
+ * reserve_metadata_bytes() error, or ERR_PTR(-ENOSPC) when
+ * block_rsv_use_bytes() fails (after a WARN and a dump of the
+ * reservation counters).
+ */
+static struct btrfs_block_rsv *
+use_block_rsv(struct btrfs_trans_handle *trans,
+              struct btrfs_root *root, u32 blocksize)
+{
+        struct btrfs_block_rsv *block_rsv;
+        int ret;
+        block_rsv = get_block_rsv(trans, root);
+        if (block_rsv->size == 0) {
+                ret = reserve_metadata_bytes(block_rsv, blocksize);
+                if (ret)
+                        return ERR_PTR(ret);
+                return block_rsv;
+        }
+        ret = block_rsv_use_bytes(block_rsv, blocksize);
+        if (!ret)
+                return block_rsv;
+        WARN_ON(1);
+        /* cast for %llu, matching the other space-accounting printks here */
+        printk(KERN_INFO "block_rsv size %llu reserved %llu freed %llu %llu\n",
+               (unsigned long long)block_rsv->size,
+               (unsigned long long)block_rsv->reserved,
+               (unsigned long long)block_rsv->freed[0],
+               (unsigned long long)block_rsv->freed[1]);
+        return ERR_PTR(-ENOSPC);
+}
+/*
+ * Back out a use_block_rsv() charge; btrfs_alloc_free_block() calls this
+ * when btrfs_reserve_extent() fails after the reservation was charged.
+ *
+ * @blocksize bytes are added back with block_rsv_add_bytes() and then
+ * block_rsv_release_bytes() is called with a NULL second argument and a
+ * byte count of 0.
+ * NOTE(review): presumably that second call trims whatever the
+ * reservation now holds beyond its size -- confirm against
+ * block_rsv_release_bytes().
+ */
+static void unuse_block_rsv(struct btrfs_block_rsv *block_rsv, u32 blocksize)
+{
+        block_rsv_add_bytes(block_rsv, blocksize, 0);
+        block_rsv_release_bytes(block_rsv, NULL, 0);
+}
 /*
- * helper function to allocate a block for a given tree
+ * finds a free extent and does all the dirty work required for allocation
+ * returns the key for the extent through ins, and a tree buffer for
+ * the first block of the extent through buf.
+ *
 * returns the tree buffer or NULL.
 */
 struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
@@ -4985,18 +5346,53 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
                                        u64 hint, u64 empty_size)
 {
        struct btrfs_key ins;
-        int ret;
+        struct btrfs_block_rsv *block_rsv;
        struct extent_buffer *buf;
+        u64 flags = 0;
+        int ret;
-        ret = alloc_tree_block(trans, root, blocksize, parent, root_objectid,
+        block_rsv = use_block_rsv(trans, root, blocksize);
-                               key, level, empty_size, hint, (u64)-1, &ins);
+        if (IS_ERR(block_rsv))
+                return ERR_CAST(block_rsv);
+        ret = btrfs_reserve_extent(trans, root, blocksize, blocksize,
+                                   empty_size, hint, (u64)-1, &ins, 0);
        if (ret) {
-                BUG_ON(ret > 0);
+                unuse_block_rsv(block_rsv, blocksize);
                return ERR_PTR(ret);
        }
        buf = btrfs_init_new_buffer(trans, root, ins.objectid,
                                    blocksize, level);
+        BUG_ON(IS_ERR(buf));
+        if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
+                if (parent == 0)
+                        parent = ins.objectid;
+                flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
+        } else
+                BUG_ON(parent > 0);
+        if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
+                struct btrfs_delayed_extent_op *extent_op;
+                extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
+                BUG_ON(!extent_op);
+                if (key)
+                        memcpy(&extent_op->key, key, sizeof(extent_op->key));
+                else
+                        memset(&extent_op->key, 0, sizeof(extent_op->key));
+                extent_op->flags_to_set = flags;
+                extent_op->update_key = 1;
+                extent_op->update_flags = 1;
+                extent_op->is_data = 0;
+                ret = btrfs_add_delayed_tree_ref(trans, ins.objectid,
+                                        ins.offset, parent, root_objectid,
+                                        level, BTRFS_ADD_DELAYED_EXTENT,
+                                        extent_op);
+                BUG_ON(ret);
+        }
        return buf;
 }
@@ -5321,7 +5717,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
                                 struct btrfs_path *path,
                                 struct walk_control *wc)
 {
-        int ret = 0;
+        int ret;
        int level = wc->level;
        struct extent_buffer *eb = path->nodes[level];
        u64 parent = 0;
@@ -5399,13 +5795,11 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
                               btrfs_header_owner(path->nodes[level + 1]));
        }
-        ret = btrfs_free_extent(trans, root, eb->start, eb->len, parent,
+        btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
-                                root->root_key.objectid, level, 0);
-        BUG_ON(ret);
 out:
        wc->refs[level] = 0;
        wc->flags[level] = 0;
-        return ret;
+        return 0;
 }
 static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
@@ -5483,7 +5877,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
 * also make sure backrefs for the shared block and all lower level
 * blocks are properly updated.
 */
-int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
+int btrfs_drop_snapshot(struct btrfs_root *root,
+                        struct btrfs_block_rsv *block_rsv, int update_ref)
 {
        struct btrfs_path *path;
        struct btrfs_trans_handle *trans;
@@ -5501,7 +5896,9 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
        wc = kzalloc(sizeof(*wc), GFP_NOFS);
        BUG_ON(!wc);
-        trans = btrfs_start_transaction(tree_root, 1);
+        trans = btrfs_start_transaction(tree_root, 0);
+        if (block_rsv)
+                trans->block_rsv = block_rsv;
        if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
                level = btrfs_header_level(root->node);
@@ -5589,22 +5986,16 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
                }
                BUG_ON(wc->level == 0);
-                if (trans->transaction->in_commit ||
+                if (btrfs_should_end_transaction(trans, tree_root)) {
-                    trans->transaction->delayed_refs.flushing) {
                        ret = btrfs_update_root(trans, tree_root,
                                                &root->root_key,
                                                root_item);
                        BUG_ON(ret);
-                        btrfs_end_transaction(trans, tree_root);
+                        btrfs_end_transaction_throttle(trans, tree_root);
-                        trans = btrfs_start_transaction(tree_root, 1);
+                        trans = btrfs_start_transaction(tree_root, 0);
-                } else {
+                        if (block_rsv)
-                        unsigned long update;
+                                trans->block_rsv = block_rsv;
-                        update = trans->delayed_ref_updates;
-                        trans->delayed_ref_updates = 0;
-                        if (update)
-                                btrfs_run_delayed_refs(trans, tree_root,
-                                                       update);
                }
        }
        btrfs_release_path(root, path);
@@ -5632,7 +6023,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
                kfree(root);
        }
 out:
-        btrfs_end_transaction(trans, tree_root);
+        btrfs_end_transaction_throttle(trans, tree_root);
        kfree(wc);
        btrfs_free_path(path);
        return err;
@@ -7228,48 +7619,80 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
        return flags;
 }
-static int __alloc_chunk_for_shrink(struct btrfs_root *root,
+static int set_block_group_ro(struct btrfs_block_group_cache *cache)
-                     struct btrfs_block_group_cache *shrink_block_group,
-                     int force)
 {
-        struct btrfs_trans_handle *trans;
+        struct btrfs_space_info *sinfo = cache->space_info;
-        u64 new_alloc_flags;
+        u64 num_bytes;
-        u64 calc;
+        int ret = -ENOSPC;
-        spin_lock(&shrink_block_group->lock);
+        if (cache->ro)
-        if (btrfs_block_group_used(&shrink_block_group->item) +
+                return 0;
-            shrink_block_group->reserved > 0) {
-                spin_unlock(&shrink_block_group->lock);
-                trans = btrfs_start_transaction(root, 1);
+        spin_lock(&sinfo->lock);
-                spin_lock(&shrink_block_group->lock);
+        spin_lock(&cache->lock);
+        num_bytes = cache->key.offset - cache->reserved - cache->pinned -
+                    cache->bytes_super - btrfs_block_group_used(&cache->item);
+        if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
+            sinfo->bytes_may_use + sinfo->bytes_readonly +
+            cache->reserved_pinned + num_bytes < sinfo->total_bytes) {
+                sinfo->bytes_readonly += num_bytes;
+                sinfo->bytes_reserved += cache->reserved_pinned;
+                cache->reserved_pinned = 0;
+                cache->ro = 1;
+                ret = 0;
+        }
+        spin_unlock(&cache->lock);
+        spin_unlock(&sinfo->lock);
+        return ret;
+}
-                new_alloc_flags = update_block_group_flags(root,
+int btrfs_set_block_group_ro(struct btrfs_root *root,
-                                                   shrink_block_group->flags);
+                             struct btrfs_block_group_cache *cache)
-                if (new_alloc_flags != shrink_block_group->flags) {
-                        calc =
-                             btrfs_block_group_used(&shrink_block_group->item);
-                } else {
-                        calc = shrink_block_group->key.offset;
-                }
-                spin_unlock(&shrink_block_group->lock);
-                do_chunk_alloc(trans, root->fs_info->extent_root,
+{
-                               calc + 2 * 1024 * 1024, new_alloc_flags, force);
+        struct btrfs_trans_handle *trans;
+        u64 alloc_flags;
+        int ret;
-                btrfs_end_transaction(trans, root);
+        BUG_ON(cache->ro);
-        } else
-                spin_unlock(&shrink_block_group->lock);
+        trans = btrfs_join_transaction(root, 1);
-        return 0;
+        BUG_ON(IS_ERR(trans));
-}
+        alloc_flags = update_block_group_flags(root, cache->flags);
+        if (alloc_flags != cache->flags)
+                do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1);
-int btrfs_prepare_block_group_relocation(struct btrfs_root *root,
+        ret = set_block_group_ro(cache);
-                                         struct btrfs_block_group_cache *group)
+        if (!ret)
+                goto out;
+        alloc_flags = get_alloc_profile(root, cache->space_info->flags);
+        ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1);
+        if (ret < 0)
+                goto out;
+        ret = set_block_group_ro(cache);
+out:
+        btrfs_end_transaction(trans, root);
+        return ret;
+}
+int btrfs_set_block_group_rw(struct btrfs_root *root,
+                              struct btrfs_block_group_cache *cache)
 {
-        __alloc_chunk_for_shrink(root, group, 1);
+        struct btrfs_space_info *sinfo = cache->space_info;
-        set_block_group_readonly(group);
+        u64 num_bytes;
+        BUG_ON(!cache->ro);
+        spin_lock(&sinfo->lock);
+        spin_lock(&cache->lock);
+        num_bytes = cache->key.offset - cache->reserved - cache->pinned -
+                    cache->bytes_super - btrfs_block_group_used(&cache->item);
+        sinfo->bytes_readonly -= num_bytes;
+        cache->ro = 0;
+        spin_unlock(&cache->lock);
+        spin_unlock(&sinfo->lock);
        return 0;
 }
@@ -7436,17 +7859,33 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
         */
        synchronize_rcu();
+        release_global_block_rsv(info);
        while(!list_empty(&info->space_info)) {
                space_info = list_entry(info->space_info.next,
                                        struct btrfs_space_info,
                                        list);
+                if (space_info->bytes_pinned > 0 ||
+                    space_info->bytes_reserved > 0) {
+                        WARN_ON(1);
+                        dump_space_info(space_info, 0, 0);
+                }
                list_del(&space_info->list);
                kfree(space_info);
        }
        return 0;
 }
+static void __link_block_group(struct btrfs_space_info *space_info,
+                               struct btrfs_block_group_cache *cache)
+{
+        int index = get_block_group_index(cache);
+        down_write(&space_info->groups_sem);
+        list_add_tail(&cache->list, &space_info->block_groups[index]);
+        up_write(&space_info->groups_sem);
+}
 int btrfs_read_block_groups(struct btrfs_root *root)
 {
        struct btrfs_path *path;
@@ -7468,10 +7907,8 @@ int btrfs_read_block_groups(struct btrfs_root *root)
        while (1) {
                ret = find_first_block_group(root, path, &key);
-                if (ret > 0) {
+                if (ret > 0)
-                        ret = 0;
+                        break;
-                        goto error;
-                }
                if (ret != 0)
                        goto error;
@@ -7480,7 +7917,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
                cache = kzalloc(sizeof(*cache), GFP_NOFS);
                if (!cache) {
                        ret = -ENOMEM;
-                        break;
+                        goto error;
                }
                atomic_set(&cache->count, 1);
@@ -7537,20 +7974,36 @@ int btrfs_read_block_groups(struct btrfs_root *root)
                BUG_ON(ret);
                cache->space_info = space_info;
                spin_lock(&cache->space_info->lock);
-                cache->space_info->bytes_super += cache->bytes_super;
+                cache->space_info->bytes_readonly += cache->bytes_super;
                spin_unlock(&cache->space_info->lock);
-                down_write(&space_info->groups_sem);
+                __link_block_group(space_info, cache);
-                list_add_tail(&cache->list, &space_info->block_groups);
-                up_write(&space_info->groups_sem);
                ret = btrfs_add_block_group_cache(root->fs_info, cache);
                BUG_ON(ret);
                set_avail_alloc_bits(root->fs_info, cache->flags);
                if (btrfs_chunk_readonly(root, cache->key.objectid))
-                        set_block_group_readonly(cache);
+                        set_block_group_ro(cache);
        }
+        list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) {
+                if (!(get_alloc_profile(root, space_info->flags) &
+                      (BTRFS_BLOCK_GROUP_RAID10 |
+                       BTRFS_BLOCK_GROUP_RAID1 |
+                       BTRFS_BLOCK_GROUP_DUP)))
+                        continue;
+                /*
+                 * avoid allocating from un-mirrored block group if there are
+                 * mirrored block groups.
+                 */
+                list_for_each_entry(cache, &space_info->block_groups[3], list)
+                        set_block_group_ro(cache);
+                list_for_each_entry(cache, &space_info->block_groups[4], list)
+                        set_block_group_ro(cache);
+        }
+        init_global_block_rsv(info);
        ret = 0;
 error:
        btrfs_free_path(path);
@@ -7611,12 +8064,10 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
        BUG_ON(ret);
        spin_lock(&cache->space_info->lock);
-        cache->space_info->bytes_super += cache->bytes_super;
+        cache->space_info->bytes_readonly += cache->bytes_super;
        spin_unlock(&cache->space_info->lock);
-        down_write(&cache->space_info->groups_sem);
+        __link_block_group(cache->space_info, cache);
-        list_add_tail(&cache->list, &cache->space_info->block_groups);
-        up_write(&cache->space_info->groups_sem);
        ret = btrfs_add_block_group_cache(root->fs_info, cache);
        BUG_ON(ret);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index d2d03684fab2..a4080c21ec55 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -135,7 +135,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask)
        return state;
 }
-static void free_extent_state(struct extent_state *state)
+void free_extent_state(struct extent_state *state)
 {
        if (!state)
                return;
@@ -335,21 +335,18 @@ static int merge_state(struct extent_io_tree *tree,
 }
 static int set_state_cb(struct extent_io_tree *tree,
-                         struct extent_state *state,
+                         struct extent_state *state, int *bits)
-                         unsigned long bits)
 {
        if (tree->ops && tree->ops->set_bit_hook) {
                return tree->ops->set_bit_hook(tree->mapping->host,
-                                               state->start, state->end,
+                                               state, bits);
-                                               state->state, bits);
        }
        return 0;
 }
 static void clear_state_cb(struct extent_io_tree *tree,
-                           struct extent_state *state,
+                           struct extent_state *state, int *bits)
-                           unsigned long bits)
 {
        if (tree->ops && tree->ops->clear_bit_hook)
                tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
@@ -367,9 +364,10 @@ static void clear_state_cb(struct extent_io_tree *tree,
 */
 static int insert_state(struct extent_io_tree *tree,
                        struct extent_state *state, u64 start, u64 end,
-                        int bits)
+                        int *bits)
 {
        struct rb_node *node;
+        int bits_to_set = *bits & ~EXTENT_CTLBITS;
        int ret;
        if (end < start) {
@@ -384,9 +382,9 @@ static int insert_state(struct extent_io_tree *tree,
        if (ret)
                return ret;
-        if (bits & EXTENT_DIRTY)
+        if (bits_to_set & EXTENT_DIRTY)
                tree->dirty_bytes += end - start + 1;
-        state->state |= bits;
+        state->state |= bits_to_set;
        node = tree_insert(&tree->state, end, &state->rb_node);
        if (node) {
                struct extent_state *found;
@@ -456,13 +454,13 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
 * struct is freed and removed from the tree
 */
 static int clear_state_bit(struct extent_io_tree *tree,
-                            struct extent_state *state, int bits, int wake,
+                            struct extent_state *state,
-                            int delete)
+                            int *bits, int wake)
 {
-        int bits_to_clear = bits & ~EXTENT_DO_ACCOUNTING;
+        int bits_to_clear = *bits & ~EXTENT_CTLBITS;
        int ret = state->state & bits_to_clear;
-        if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
+        if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
                u64 range = state->end - state->start + 1;
                WARN_ON(range > tree->dirty_bytes);
                tree->dirty_bytes -= range;
@@ -471,9 +469,8 @@ static int clear_state_bit(struct extent_io_tree *tree,
        state->state &= ~bits_to_clear;
        if (wake)
                wake_up(&state->wq);
-        if (delete || state->state == 0) {
+        if (state->state == 0) {
                if (state->tree) {
-                        clear_state_cb(tree, state, state->state);
                        rb_erase(&state->rb_node, &tree->state);
                        state->tree = NULL;
                        free_extent_state(state);
@@ -514,6 +511,10 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
        int set = 0;
        int clear = 0;
+        if (delete)
+                bits |= ~EXTENT_CTLBITS;
+        bits |= EXTENT_FIRST_DELALLOC;
        if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
                clear = 1;
 again:
@@ -580,8 +581,7 @@ hit_next:
                if (err)
                        goto out;
                if (state->end <= end) {
-                        set |= clear_state_bit(tree, state, bits, wake,
+                        set |= clear_state_bit(tree, state, &bits, wake);
-                                               delete);
                        if (last_end == (u64)-1)
                                goto out;
                        start = last_end + 1;
@@ -602,7 +602,7 @@ hit_next:
                if (wake)
                        wake_up(&state->wq);
-                set |= clear_state_bit(tree, prealloc, bits, wake, delete);
+                set |= clear_state_bit(tree, prealloc, &bits, wake);
                prealloc = NULL;
                goto out;
@@ -613,7 +613,7 @@ hit_next:
        else
                next_node = NULL;
-        set |= clear_state_bit(tree, state, bits, wake, delete);
+        set |= clear_state_bit(tree, state, &bits, wake);
        if (last_end == (u64)-1)
                goto out;
        start = last_end + 1;
@@ -706,19 +706,19 @@ out:
 static int set_state_bits(struct extent_io_tree *tree,
                           struct extent_state *state,
-                           int bits)
+                           int *bits)
 {
        int ret;
+        int bits_to_set = *bits & ~EXTENT_CTLBITS;
        ret = set_state_cb(tree, state, bits);
        if (ret)
                return ret;
+        if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
-        if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
                u64 range = state->end - state->start + 1;
                tree->dirty_bytes += range;
        }
-        state->state |= bits;
+        state->state |= bits_to_set;
        return 0;
 }
@@ -745,10 +745,9 @@ static void cache_state(struct extent_state *state,
 * [start, end] is inclusive This takes the tree lock.
 */
-static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
-                          int bits, int exclusive_bits, u64 *failed_start,
+                   int bits, int exclusive_bits, u64 *failed_start,
-                          struct extent_state **cached_state,
+                   struct extent_state **cached_state, gfp_t mask)
-                          gfp_t mask)
 {
        struct extent_state *state;
        struct extent_state *prealloc = NULL;
@@ -757,6 +756,7 @@ static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
        u64 last_start;
        u64 last_end;
+        bits |= EXTENT_FIRST_DELALLOC;
 again:
        if (!prealloc && (mask & __GFP_WAIT)) {
                prealloc = alloc_extent_state(mask);
@@ -778,7 +778,7 @@ again:
         */
        node = tree_search(tree, start);
        if (!node) {
-                err = insert_state(tree, prealloc, start, end, bits);
+                err = insert_state(tree, prealloc, start, end, &bits);
                prealloc = NULL;
                BUG_ON(err == -EEXIST);
                goto out;
@@ -802,7 +802,7 @@ hit_next:
                        goto out;
                }
-                err = set_state_bits(tree, state, bits);
+                err = set_state_bits(tree, state, &bits);
                if (err)
                        goto out;
@@ -852,7 +852,7 @@ hit_next:
                if (err)
                        goto out;
                if (state->end <= end) {
-                        err = set_state_bits(tree, state, bits);
+                        err = set_state_bits(tree, state, &bits);
                        if (err)
                                goto out;
                        cache_state(state, cached_state);
@@ -877,7 +877,7 @@ hit_next:
                else
                        this_end = last_start - 1;
                err = insert_state(tree, prealloc, start, this_end,
-                                   bits);
+                                   &bits);
                BUG_ON(err == -EEXIST);
                if (err) {
                        prealloc = NULL;
@@ -903,7 +903,7 @@ hit_next:
                err = split_state(tree, state, prealloc, end + 1);
                BUG_ON(err == -EEXIST);
-                err = set_state_bits(tree, prealloc, bits);
+                err = set_state_bits(tree, prealloc, &bits);
                if (err) {
                        prealloc = NULL;
                        goto out;
@@ -966,8 +966,7 @@ int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
 {
        return clear_extent_bit(tree, start, end,
                                EXTENT_DIRTY | EXTENT_DELALLOC |
-                                EXTENT_DO_ACCOUNTING, 0, 0,
+                                EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask);
-                                NULL, mask);
 }
 int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
@@ -1435,9 +1434,6 @@ int extent_clear_unlock_delalloc(struct inode *inode,
        if (op & EXTENT_CLEAR_DELALLOC)
                clear_bits |= EXTENT_DELALLOC;
-        if (op & EXTENT_CLEAR_ACCOUNTING)
-                clear_bits |= EXTENT_DO_ACCOUNTING;
        clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
        if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
                    EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK |
@@ -1916,7 +1912,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
        if (tree->ops && tree->ops->submit_bio_hook)
                tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
-                                           mirror_num, bio_flags);
+                                           mirror_num, bio_flags, start);
        else
                submit_bio(rw, bio);
        if (bio_flagged(bio, BIO_EOPNOTSUPP))
@@ -2020,6 +2016,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
        sector_t sector;
        struct extent_map *em;
        struct block_device *bdev;
+        struct btrfs_ordered_extent *ordered;
        int ret;
        int nr = 0;
        size_t page_offset = 0;
@@ -2031,7 +2028,15 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
        set_page_extent_mapped(page);
        end = page_end;
-        lock_extent(tree, start, end, GFP_NOFS);
+        while (1) {
+                lock_extent(tree, start, end, GFP_NOFS);
+                ordered = btrfs_lookup_ordered_extent(inode, start);
+                if (!ordered)
+                        break;
+                unlock_extent(tree, start, end, GFP_NOFS);
+                btrfs_start_ordered_extent(inode, ordered, 1);
+                btrfs_put_ordered_extent(ordered);
+        }
        if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
                char *userpage;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index bbab4813646f..5691c7b590da 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -16,7 +16,9 @@
 #define EXTENT_BOUNDARY (1 << 9)
 #define EXTENT_NODATASUM (1 << 10)
 #define EXTENT_DO_ACCOUNTING (1 << 11)
+#define EXTENT_FIRST_DELALLOC (1 << 12)
 #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
+#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
 /* flags for bio submission */
 #define EXTENT_BIO_COMPRESSED 1
@@ -47,7 +49,7 @@ struct extent_state;
 typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw,
                                       struct bio *bio, int mirror_num,
-                                       unsigned long bio_flags);
+                                       unsigned long bio_flags, u64 bio_offset);
 struct extent_io_ops {
        int (*fill_delalloc)(struct inode *inode, struct page *locked_page,
                             u64 start, u64 end, int *page_started,
@@ -69,10 +71,10 @@ struct extent_io_ops {
                                    struct extent_state *state);
        int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
                                      struct extent_state *state, int uptodate);
-        int (*set_bit_hook)(struct inode *inode, u64 start, u64 end,
+        int (*set_bit_hook)(struct inode *inode, struct extent_state *state,
-                            unsigned long old, unsigned long bits);
+                            int *bits);
        int (*clear_bit_hook)(struct inode *inode, struct extent_state *state,
-                              unsigned long bits);
+                              int *bits);
        int (*merge_extent_hook)(struct inode *inode,
                                 struct extent_state *new,
                                 struct extent_state *other);
@@ -176,6 +178,7 @@ u64 count_range_bits(struct extent_io_tree *tree,
                     u64 *start, u64 search_end,
                     u64 max_bytes, unsigned long bits);
+void free_extent_state(struct extent_state *state);
 int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
                   int bits, int filled, struct extent_state *cached_state);
 int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
@@ -185,6 +188,9 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
                     gfp_t mask);
 int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
                    int bits, gfp_t mask);
+int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+                   int bits, int exclusive_bits, u64 *failed_start,
+                   struct extent_state **cached_state, gfp_t mask);
 int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
                        gfp_t mask);
 int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 54a255065aa3..a562a250ae77 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -149,13 +149,14 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
 }
-int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
+static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
-                          struct bio *bio, u32 *dst)
+                                   struct inode *inode, struct bio *bio,
+                                   u64 logical_offset, u32 *dst, int dio)
 {
        u32 sum;
        struct bio_vec *bvec = bio->bi_io_vec;
        int bio_index = 0;
-        u64 offset;
+        u64 offset = 0;
        u64 item_start_offset = 0;
        u64 item_last_offset = 0;
        u64 disk_bytenr;
@@ -174,8 +175,11 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
        WARN_ON(bio->bi_vcnt <= 0);
        disk_bytenr = (u64)bio->bi_sector << 9;
+        if (dio)
+                offset = logical_offset;
        while (bio_index < bio->bi_vcnt) {
-                offset = page_offset(bvec->bv_page) + bvec->bv_offset;
+                if (!dio)
+                        offset = page_offset(bvec->bv_page) + bvec->bv_offset;
                ret = btrfs_find_ordered_sum(inode, offset, disk_bytenr, &sum);
                if (ret == 0)
                        goto found;
@@ -238,6 +242,7 @@ found:
                else
                        set_state_private(io_tree, offset, sum);
                disk_bytenr += bvec->bv_len;
+                offset += bvec->bv_len;
                bio_index++;
                bvec++;
        }
@@ -245,6 +250,18 @@ found:
        return 0;
 }
+int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
+                          struct bio *bio, u32 *dst)
+{
+        return __btrfs_lookup_bio_sums(root, inode, bio, 0, dst, 0);
+}
+int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
+                              struct bio *bio, u64 offset, u32 *dst)
+{
+        return __btrfs_lookup_bio_sums(root, inode, bio, offset, dst, 1);
+}
 int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
                             struct list_head *list)
 {
@@ -657,6 +674,9 @@ again:
                goto found;
        }
        ret = PTR_ERR(item);
+        if (ret != -EFBIG && ret != -ENOENT)
+                goto fail_unlock;
        if (ret == -EFBIG) {
                u32 item_size;
                /* we found one, but it isn't big enough yet */
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 29ff749ff4ca..787b50a16a14 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -46,32 +46,42 @@
 static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
                                         int write_bytes,
                                         struct page **prepared_pages,
-                                         const char __user *buf)
+                                         struct iov_iter *i)
 {
-        long page_fault = 0;
+        size_t copied;
-        int i;
+        int pg = 0;
        int offset = pos & (PAGE_CACHE_SIZE - 1);
-        for (i = 0; i < num_pages && write_bytes > 0; i++, offset = 0) {
+        while (write_bytes > 0) {
                size_t count = min_t(size_t,
                                     PAGE_CACHE_SIZE - offset, write_bytes);
-                struct page *page = prepared_pages[i];
+                struct page *page = prepared_pages[pg];
-                fault_in_pages_readable(buf, count);
+again:
+                if (unlikely(iov_iter_fault_in_readable(i, count)))
+                        return -EFAULT;
                /* Copy data from userspace to the current page */
-                kmap(page);
+                copied = iov_iter_copy_from_user(page, i, offset, count);
-                page_fault = __copy_from_user(page_address(page) + offset,
-                                              buf, count);
                /* Flush processor's dcache for this page */
                flush_dcache_page(page);
-                kunmap(page);
+                iov_iter_advance(i, copied);
-                buf += count;
+                write_bytes -= copied;
-                write_bytes -= count;
-                if (page_fault)
+                if (unlikely(copied == 0)) {
-                        break;
+                        count = min_t(size_t, PAGE_CACHE_SIZE - offset,
+                                      iov_iter_single_seg_count(i));
+                        goto again;
+                }
+                if (unlikely(copied < PAGE_CACHE_SIZE - offset)) {
+                        offset += copied;
+                } else {
+                        pg++;
+                        offset = 0;
+                }
        }
-        return page_fault ? -EFAULT : 0;
+        return 0;
 }
 /*
@@ -126,8 +136,7 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
        end_of_last_block = start_pos + num_bytes - 1;
        err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
                                        NULL);
-        if (err)
+        BUG_ON(err);
-                return err;
        for (i = 0; i < num_pages; i++) {
                struct page *p = pages[i];
@@ -142,7 +151,7 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
                 * at this time.
                 */
        }
-        return err;
+        return 0;
 }
 /*
@@ -823,45 +832,46 @@ again:
        return 0;
 }
-static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
+static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
-                                size_t count, loff_t *ppos)
+                                    const struct iovec *iov,
+                                    unsigned long nr_segs, loff_t pos)
 {
-        loff_t pos;
+        struct file *file = iocb->ki_filp;
+        struct inode *inode = fdentry(file)->d_inode;
+        struct btrfs_root *root = BTRFS_I(inode)->root;
+        struct page *pinned[2];
+        struct page **pages = NULL;
+        struct iov_iter i;
+        loff_t *ppos = &iocb->ki_pos;
        loff_t start_pos;
        ssize_t num_written = 0;
        ssize_t err = 0;
+        size_t count;
+        size_t ocount;
        int ret = 0;
-        struct inode *inode = fdentry(file)->d_inode;
-        struct btrfs_root *root = BTRFS_I(inode)->root;
-        struct page **pages = NULL;
        int nrptrs;
-        struct page *pinned[2];
        unsigned long first_index;
        unsigned long last_index;
        int will_write;
+        int buffered = 0;
        will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) ||
                      (file->f_flags & O_DIRECT));
-        nrptrs = min((count + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE,
-                     PAGE_CACHE_SIZE / (sizeof(struct page *)));
        pinned[0] = NULL;
        pinned[1] = NULL;
-        pos = *ppos;
        start_pos = pos;
        vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
-        /* do the reserve before the mutex lock in case we have to do some
-         * flushing.  We wouldn't deadlock, but this is more polite.
-         */
-        err = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
-        if (err)
-                goto out_nolock;
        mutex_lock(&inode->i_mutex);
+        err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
+        if (err)
+                goto out;
+        count = ocount;
        current->backing_dev_info = inode->i_mapping->backing_dev_info;
        err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
        if (err)
@@ -875,15 +885,53 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
                goto out;
        file_update_time(file);
+        BTRFS_I(inode)->sequence++;
+        if (unlikely(file->f_flags & O_DIRECT)) {
+                num_written = generic_file_direct_write(iocb, iov, &nr_segs,
+                                                        pos, ppos, count,
+                                                        ocount);
+                /*
+                 * the generic O_DIRECT will update in-memory i_size after the
+                 * DIOs are done.  But our endio handlers that update the on
+                 * disk i_size never update past the in memory i_size.  So we
+                 * need one more update here to catch any additions to the
+                 * file
+                 */
+                if (inode->i_size != BTRFS_I(inode)->disk_i_size) {
+                        btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
+                        mark_inode_dirty(inode);
+                }
+                if (num_written < 0) {
+                        ret = num_written;
+                        num_written = 0;
+                        goto out;
+                } else if (num_written == count) {
+                        /* pick up pos changes done by the generic code */
+                        pos = *ppos;
+                        goto out;
+                }
+                /*
+                 * We are going to do buffered for the rest of the range, so we
+                 * need to make sure to invalidate the buffered pages when we're
+                 * done.
+                 */
+                buffered = 1;
+                pos += num_written;
+        }
+        iov_iter_init(&i, iov, nr_segs, count, num_written);
+        nrptrs = min((iov_iter_count(&i) + PAGE_CACHE_SIZE - 1) /
+                     PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
+                     (sizeof(struct page *)));
        pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
        /* generic_write_checks can change our pos */
        start_pos = pos;
-        BTRFS_I(inode)->sequence++;
        first_index = pos >> PAGE_CACHE_SHIFT;
-        last_index = (pos + count) >> PAGE_CACHE_SHIFT;
+        last_index = (pos + iov_iter_count(&i)) >> PAGE_CACHE_SHIFT;
        /*
         * there are lots of better ways to do this, but this code
@@ -900,7 +948,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
                        unlock_page(pinned[0]);
                }
        }
-        if ((pos + count) & (PAGE_CACHE_SIZE - 1)) {
+        if ((pos + iov_iter_count(&i)) & (PAGE_CACHE_SIZE - 1)) {
                pinned[1] = grab_cache_page(inode->i_mapping, last_index);
                if (!PageUptodate(pinned[1])) {
                        ret = btrfs_readpage(NULL, pinned[1]);
@@ -911,10 +959,10 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
                }
        }
-        while (count > 0) {
+        while (iov_iter_count(&i) > 0) {
                size_t offset = pos & (PAGE_CACHE_SIZE - 1);
-                size_t write_bytes = min(count, nrptrs *
+                size_t write_bytes = min(iov_iter_count(&i),
-                                        (size_t)PAGE_CACHE_SIZE -
+                                         nrptrs * (size_t)PAGE_CACHE_SIZE -
                                         offset);
                size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >>
                                        PAGE_CACHE_SHIFT;
@@ -922,7 +970,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
                WARN_ON(num_pages > nrptrs);
                memset(pages, 0, sizeof(struct page *) * nrptrs);
-                ret = btrfs_check_data_free_space(root, inode, write_bytes);
+                ret = btrfs_delalloc_reserve_space(inode, write_bytes);
                if (ret)
                        goto out;
@@ -930,26 +978,20 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
                                    pos, first_index, last_index,
                                    write_bytes);
                if (ret) {
-                        btrfs_free_reserved_data_space(root, inode,
+                        btrfs_delalloc_release_space(inode, write_bytes);
-                                                       write_bytes);
                        goto out;
                }
                ret = btrfs_copy_from_user(pos, num_pages,
-                                           write_bytes, pages, buf);
+                                           write_bytes, pages, &i);
-                if (ret) {
+                if (ret == 0) {
-                        btrfs_free_reserved_data_space(root, inode,
+                        dirty_and_release_pages(NULL, root, file, pages,
-                                                       write_bytes);
+                                                num_pages, pos, write_bytes);
-                        btrfs_drop_pages(pages, num_pages);
-                        goto out;
                }
-                ret = dirty_and_release_pages(NULL, root, file, pages,
-                                              num_pages, pos, write_bytes);
                btrfs_drop_pages(pages, num_pages);
                if (ret) {
-                        btrfs_free_reserved_data_space(root, inode,
+                        btrfs_delalloc_release_space(inode, write_bytes);
-                                                       write_bytes);
                        goto out;
                }
@@ -965,8 +1007,6 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
                        btrfs_throttle(root);
                }
-                buf += write_bytes;
-                count -= write_bytes;
                pos += write_bytes;
                num_written += write_bytes;
@@ -976,9 +1016,7 @@ out:
        mutex_unlock(&inode->i_mutex);
        if (ret)
                err = ret;
-        btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
-out_nolock:
        kfree(pages);
        if (pinned[0])
                page_cache_release(pinned[0]);
@@ -1008,7 +1046,7 @@ out_nolock:
                        num_written = err;
                if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
-                        trans = btrfs_start_transaction(root, 1);
+                        trans = btrfs_start_transaction(root, 0);
                        ret = btrfs_log_dentry_safe(trans, root,
                                                    file->f_dentry);
                        if (ret == 0) {
@@ -1023,7 +1061,7 @@ out_nolock:
                                btrfs_end_transaction(trans, root);
                        }
                }
-                if (file->f_flags & O_DIRECT) {
+                if (file->f_flags & O_DIRECT && buffered) {
                        invalidate_mapping_pages(inode->i_mapping,
                              start_pos >> PAGE_CACHE_SHIFT,
                             (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT);
@@ -1063,8 +1101,9 @@ int btrfs_release_file(struct inode *inode, struct file *filp)
 * important optimization for directories because holding the mutex prevents
 * new operations on the dir while we write to disk.
 */
-int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
+int btrfs_sync_file(struct file *file, int datasync)
 {
+        struct dentry *dentry = file->f_path.dentry;
        struct inode *inode = dentry->d_inode;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        int ret = 0;
@@ -1104,9 +1143,9 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
        if (file && file->private_data)
                btrfs_ioctl_trans_end(file);
-        trans = btrfs_start_transaction(root, 1);
+        trans = btrfs_start_transaction(root, 0);
-        if (!trans) {
+        if (IS_ERR(trans)) {
-                ret = -ENOMEM;
+                ret = PTR_ERR(trans);
                goto out;
        }
@@ -1161,7 +1200,7 @@ const struct file_operations btrfs_file_operations = {
        .read           = do_sync_read,
        .aio_read       = generic_file_aio_read,
        .splice_read    = generic_file_splice_read,
-        .write          = btrfs_file_write,
+        .aio_write      = btrfs_file_aio_write,
        .mmap           = btrfs_file_mmap,
        .open           = generic_file_open,
        .release        = btrfs_release_file,
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 72ce3c173d6a..64f1150bb48d 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -49,6 +49,33 @@ static int find_name_in_backref(struct btrfs_path *path, const char *name,
        return 0;
 }
+struct btrfs_inode_ref *
+btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans,
+                        struct btrfs_root *root,
+                        struct btrfs_path *path,
+                        const char *name, int name_len,
+                        u64 inode_objectid, u64 ref_objectid, int mod)
+{       /*
+         * Search root for the key (inode_objectid, BTRFS_INODE_REF_KEY,
+         * ref_objectid) and look for name inside the item that is found.
+         *
+         * Returns the matching ref, NULL when the search returns > 0 or
+         * find_name_in_backref() reports no match, and ERR_PTR() when the
+         * search itself fails.  Nothing in this function releases path.
+         */
+        struct btrfs_key key;
+        struct btrfs_inode_ref *ref;
+        int ins_len = mod < 0 ? -1 : 0;        /* only a negative mod passes -1 */
+        int cow = mod != 0;                    /* any non-zero mod requests cow */
+        int ret;
+        key.objectid = inode_objectid;
+        key.type = BTRFS_INODE_REF_KEY;
+        key.offset = ref_objectid;
+        ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
+        if (ret < 0)
+                return ERR_PTR(ret);           /* pass the search error through */
+        if (ret > 0)
+                return NULL;                   /* presumably: no exact key match */
+        if (!find_name_in_backref(path, name, name_len, &ref))
+                return NULL;                   /* item exists, name not in it */
+        return ref;
+}
 int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
                           struct btrfs_root *root,
                           const char *name, int name_len,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d601629b85d1..fa6ccc1bfe2a 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -252,6 +252,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
                                   inline_len, compressed_size,
                                   compressed_pages);
        BUG_ON(ret);
+        btrfs_delalloc_release_metadata(inode, end + 1 - start);
        btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
        return 0;
 }
@@ -414,6 +415,7 @@ again:
                trans = btrfs_join_transaction(root, 1);
                BUG_ON(!trans);
                btrfs_set_trans_block_group(trans, inode);
+                trans->block_rsv = &root->fs_info->delalloc_block_rsv;
                /* lets try to make an inline extent */
                if (ret || total_in < (actual_end - start)) {
@@ -439,7 +441,6 @@ again:
                             start, end, NULL,
                             EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
                             EXTENT_CLEAR_DELALLOC |
-                             EXTENT_CLEAR_ACCOUNTING |
                             EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK);
                        btrfs_end_transaction(trans, root);
@@ -697,6 +698,38 @@ retry:
        return 0;
 }
+static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
+                                      u64 num_bytes)
+{       /*
+         * Derive an allocation hint from the inode's extent map tree: the
+         * block_start of the mapping found for [start, start + num_bytes),
+         * or, when that is not a real block number, the block_start of
+         * the mapping found by searching at offset 0.  Returns 0 when
+         * neither lookup yields a usable block_start.
+         */
+        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+        struct extent_map *em;
+        u64 alloc_hint = 0;
+        read_lock(&em_tree->lock);      /* both lookups run under the read lock */
+        em = search_extent_mapping(em_tree, start, num_bytes);
+        if (em) {
+                /*
+                 * if block start isn't an actual block number then find the
+                 * first block in this inode and use that as a hint.  If that
+                 * block is also bogus then just don't worry about it.
+                 */
+                if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+                        free_extent_map(em);
+                        em = search_extent_mapping(em_tree, 0, 0);
+                        if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
+                                alloc_hint = em->block_start;
+                        if (em)
+                                free_extent_map(em);
+                } else {
+                        alloc_hint = em->block_start;
+                        free_extent_map(em);
+                }
+        }
+        read_unlock(&em_tree->lock);
+        return alloc_hint;
+}
 /*
 * when extent_io.c finds a delayed allocation range in the file,
 * the call backs end up in this code.  The basic idea is to
@@ -734,6 +767,7 @@ static noinline int cow_file_range(struct inode *inode,
        trans = btrfs_join_transaction(root, 1);
        BUG_ON(!trans);
        btrfs_set_trans_block_group(trans, inode);
+        trans->block_rsv = &root->fs_info->delalloc_block_rsv;
        actual_end = min_t(u64, isize, end + 1);
@@ -753,7 +787,6 @@ static noinline int cow_file_range(struct inode *inode,
                                     EXTENT_CLEAR_UNLOCK_PAGE |
                                     EXTENT_CLEAR_UNLOCK |
                                     EXTENT_CLEAR_DELALLOC |
-                                     EXTENT_CLEAR_ACCOUNTING |
                                     EXTENT_CLEAR_DIRTY |
                                     EXTENT_SET_WRITEBACK |
                                     EXTENT_END_WRITEBACK);
@@ -769,29 +802,7 @@ static noinline int cow_file_range(struct inode *inode,
        BUG_ON(disk_num_bytes >
               btrfs_super_total_bytes(&root->fs_info->super_copy));
+        alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
-        read_lock(&BTRFS_I(inode)->extent_tree.lock);
-        em = search_extent_mapping(&BTRFS_I(inode)->extent_tree,
-                                   start, num_bytes);
-        if (em) {
-                /*
-                 * if block start isn't an actual block number then find the
-                 * first block in this inode and use that as a hint.  If that
-                 * block is also bogus then just don't worry about it.
-                 */
-                if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
-                        free_extent_map(em);
-                        em = search_extent_mapping(em_tree, 0, 0);
-                        if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
-                                alloc_hint = em->block_start;
-                        if (em)
-                                free_extent_map(em);
-                } else {
-                        alloc_hint = em->block_start;
-                        free_extent_map(em);
-                }
-        }
-        read_unlock(&BTRFS_I(inode)->extent_tree.lock);
        btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
        while (disk_num_bytes > 0) {
@@ -1174,6 +1185,13 @@ out_check:
                                               num_bytes, num_bytes, type);
                BUG_ON(ret);
+                if (root->root_key.objectid ==
+                    BTRFS_DATA_RELOC_TREE_OBJECTID) {
+                        ret = btrfs_reloc_clone_csums(inode, cur_offset,
+                                                      num_bytes);
+                        BUG_ON(ret);
+                }
                extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
                                cur_offset, cur_offset + num_bytes - 1,
                                locked_page, EXTENT_CLEAR_UNLOCK_PAGE |
@@ -1226,15 +1244,13 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
 }
 static int btrfs_split_extent_hook(struct inode *inode,
-                                    struct extent_state *orig, u64 split)
+                                   struct extent_state *orig, u64 split)
 {
+        /* not delalloc, ignore it */
        if (!(orig->state & EXTENT_DELALLOC))
                return 0;
-        spin_lock(&BTRFS_I(inode)->accounting_lock);
+        atomic_inc(&BTRFS_I(inode)->outstanding_extents);
-        BTRFS_I(inode)->outstanding_extents++;
-        spin_unlock(&BTRFS_I(inode)->accounting_lock);
        return 0;
 }
@@ -1252,10 +1268,7 @@ static int btrfs_merge_extent_hook(struct inode *inode,
        if (!(other->state & EXTENT_DELALLOC))
                return 0;
-        spin_lock(&BTRFS_I(inode)->accounting_lock);
+        atomic_dec(&BTRFS_I(inode)->outstanding_extents);
-        BTRFS_I(inode)->outstanding_extents--;
-        spin_unlock(&BTRFS_I(inode)->accounting_lock);
        return 0;
 }
@@ -1264,8 +1277,8 @@ static int btrfs_merge_extent_hook(struct inode *inode,
 * bytes in this file, and to maintain the list of inodes that
 * have pending delalloc work to be done.
 */
-static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
+static int btrfs_set_bit_hook(struct inode *inode,
-                       unsigned long old, unsigned long bits)
+                              struct extent_state *state, int *bits)
 {
        /*
@@ -1273,17 +1286,18 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
         * but in this case, we are only testeing for the DELALLOC
         * bit, which is only set or cleared with irqs on
         */
-        if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
+        if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
                struct btrfs_root *root = BTRFS_I(inode)->root;
+                u64 len = state->end + 1 - state->start;
-                spin_lock(&BTRFS_I(inode)->accounting_lock);
+                if (*bits & EXTENT_FIRST_DELALLOC)
-                BTRFS_I(inode)->outstanding_extents++;
+                        *bits &= ~EXTENT_FIRST_DELALLOC;
-                spin_unlock(&BTRFS_I(inode)->accounting_lock);
+                else
-                btrfs_delalloc_reserve_space(root, inode, end - start + 1);
+                        atomic_inc(&BTRFS_I(inode)->outstanding_extents);
                spin_lock(&root->fs_info->delalloc_lock);
-                BTRFS_I(inode)->delalloc_bytes += end - start + 1;
+                BTRFS_I(inode)->delalloc_bytes += len;
-                root->fs_info->delalloc_bytes += end - start + 1;
+                root->fs_info->delalloc_bytes += len;
                if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
                        list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
                                      &root->fs_info->delalloc_inodes);
@@ -1297,45 +1311,32 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
 * extent_io.c clear_bit_hook, see set_bit_hook for why
 */
 static int btrfs_clear_bit_hook(struct inode *inode,
-                                struct extent_state *state, unsigned long bits)
+                                struct extent_state *state, int *bits)
 {
        /*
         * set_bit and clear bit hooks normally require _irqsave/restore
         * but in this case, we are only testeing for the DELALLOC
         * bit, which is only set or cleared with irqs on
         */
-        if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
+        if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
                struct btrfs_root *root = BTRFS_I(inode)->root;
+                u64 len = state->end + 1 - state->start;
-                if (bits & EXTENT_DO_ACCOUNTING) {
+                if (*bits & EXTENT_FIRST_DELALLOC)
-                        spin_lock(&BTRFS_I(inode)->accounting_lock);
+                        *bits &= ~EXTENT_FIRST_DELALLOC;
-                        WARN_ON(!BTRFS_I(inode)->outstanding_extents);
+                else if (!(*bits & EXTENT_DO_ACCOUNTING))
-                        BTRFS_I(inode)->outstanding_extents--;
+                        atomic_dec(&BTRFS_I(inode)->outstanding_extents);
-                        spin_unlock(&BTRFS_I(inode)->accounting_lock);
-                        btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
+                if (*bits & EXTENT_DO_ACCOUNTING)
-                }
+                        btrfs_delalloc_release_metadata(inode, len);
+                if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID)
+                        btrfs_free_reserved_data_space(inode, len);
                spin_lock(&root->fs_info->delalloc_lock);
-                if (state->end - state->start + 1 >
+                root->fs_info->delalloc_bytes -= len;
-                    root->fs_info->delalloc_bytes) {
+                BTRFS_I(inode)->delalloc_bytes -= len;
-                        printk(KERN_INFO "btrfs warning: delalloc account "
-                               "%llu %llu\n",
-                               (unsigned long long)
-                               state->end - state->start + 1,
-                               (unsigned long long)
-                               root->fs_info->delalloc_bytes);
-                        btrfs_delalloc_free_space(root, inode, (u64)-1);
-                        root->fs_info->delalloc_bytes = 0;
-                        BTRFS_I(inode)->delalloc_bytes = 0;
-                } else {
-                        btrfs_delalloc_free_space(root, inode,
-                                                  state->end -
-                                                  state->start + 1);
-                        root->fs_info->delalloc_bytes -= state->end -
-                                state->start + 1;
-                        BTRFS_I(inode)->delalloc_bytes -= state->end -
-                                state->start + 1;
-                }
                if (BTRFS_I(inode)->delalloc_bytes == 0 &&
                    !list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
                        list_del_init(&BTRFS_I(inode)->delalloc_inodes);
@@ -1384,7 +1385,8 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
 */
 static int __btrfs_submit_bio_start(struct inode *inode, int rw,
                                    struct bio *bio, int mirror_num,
-                                    unsigned long bio_flags)
+                                    unsigned long bio_flags,
+                                    u64 bio_offset)
 {
        struct btrfs_root *root = BTRFS_I(inode)->root;
        int ret = 0;
@@ -1403,7 +1405,8 @@ static int __btrfs_submit_bio_start(struct inode *inode, int rw,
 * are inserted into the btree
 */
 static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
-                          int mirror_num, unsigned long bio_flags)
+                          int mirror_num, unsigned long bio_flags,
+                          u64 bio_offset)
 {
        struct btrfs_root *root = BTRFS_I(inode)->root;
        return btrfs_map_bio(root, rw, bio, mirror_num, 1);
@@ -1414,7 +1417,8 @@ static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
 * on write, or reading the csums from the tree before a read
 */
 static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
-                          int mirror_num, unsigned long bio_flags)
+                          int mirror_num, unsigned long bio_flags,
+                          u64 bio_offset)
 {
        struct btrfs_root *root = BTRFS_I(inode)->root;
        int ret = 0;
@@ -1439,7 +1443,8 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
                /* we're doing a write, do the async checksumming */
                return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
                                   inode, rw, bio, mirror_num,
-                                   bio_flags, __btrfs_submit_bio_start,
+                                   bio_flags, bio_offset,
+                                   __btrfs_submit_bio_start,
                                   __btrfs_submit_bio_done);
        }
@@ -1520,6 +1525,7 @@ again:
                goto again;
        }
+        BUG();
        btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state);
        ClearPageChecked(page);
 out:
@@ -1650,7 +1656,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
 static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 {
        struct btrfs_root *root = BTRFS_I(inode)->root;
-        struct btrfs_trans_handle *trans;
+        struct btrfs_trans_handle *trans = NULL;
        struct btrfs_ordered_extent *ordered_extent = NULL;
        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
        struct extent_state *cached_state = NULL;
@@ -1668,9 +1674,10 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
                ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
                if (!ret) {
                        trans = btrfs_join_transaction(root, 1);
+                        btrfs_set_trans_block_group(trans, inode);
+                        trans->block_rsv = &root->fs_info->delalloc_block_rsv;
                        ret = btrfs_update_inode(trans, root, inode);
                        BUG_ON(ret);
-                        btrfs_end_transaction(trans, root);
                }
                goto out;
        }
@@ -1680,6 +1687,8 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
                         0, &cached_state, GFP_NOFS);
        trans = btrfs_join_transaction(root, 1);
+        btrfs_set_trans_block_group(trans, inode);
+        trans->block_rsv = &root->fs_info->delalloc_block_rsv;
        if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
                compressed = 1;
@@ -1711,12 +1720,13 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
        add_pending_csums(trans, inode, ordered_extent->file_offset,
                          &ordered_extent->list);
-        /* this also removes the ordered extent from the tree */
        btrfs_ordered_update_i_size(inode, 0, ordered_extent);
        ret = btrfs_update_inode(trans, root, inode);
        BUG_ON(ret);
-        btrfs_end_transaction(trans, root);
 out:
+        btrfs_delalloc_release_metadata(inode, ordered_extent->len);
+        if (trans)
+                btrfs_end_transaction(trans, root);
        /* once for us */
        btrfs_put_ordered_extent(ordered_extent);
        /* once for the tree */
@@ -1838,7 +1848,7 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
        BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
                                                      failrec->last_mirror,
-                                                      failrec->bio_flags);
+                                                      failrec->bio_flags, 0);
        return 0;
 }
@@ -1993,32 +2003,196 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
 }
 /*
+ * Calculate the extra metadata reservation needed when snapshotting a
+ * subvolume that contains orphan files.
+ *
+ * Does nothing when pending->root has no orphan_block_rsv or its
+ * orphan_list is empty.  Otherwise *bytes_to_reserve is increased by the
+ * size of the root's orphan block reservation, plus the amount by which
+ * that reservation (reserved bytes plus the freed[] slot selected by the
+ * low bit of trans->transid) currently falls short of its size.
+ */
+void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
+                                struct btrfs_pending_snapshot *pending,
+                                u64 *bytes_to_reserve)
+{
+        struct btrfs_root *root;
+        struct btrfs_block_rsv *block_rsv;
+        u64 num_bytes;
+        int index;
+        root = pending->root;
+        if (!root->orphan_block_rsv || list_empty(&root->orphan_list))
+                return;
+        block_rsv = root->orphan_block_rsv;
+        /* orphan block reservation for the snapshot */
+        num_bytes = block_rsv->size;
+        /*
+         * after the snapshot is created, COWing tree blocks may use more
+         * space than it frees. So we should make sure there is enough
+         * reserved space: also count the source reservation's shortfall.
+         */
+        index = trans->transid & 0x1;
+        if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) {
+                num_bytes += block_rsv->size -
+                             (block_rsv->reserved + block_rsv->freed[index]);
+        }
+        *bytes_to_reserve += num_bytes;
+}
+void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
+                                struct btrfs_pending_snapshot *pending)
+{       /*
+         * Counterpart of btrfs_orphan_pre_snapshot(): when the source
+         * root has an orphan block reservation and a non-empty orphan
+         * list, top that reservation up and give the snapshot root
+         * (pending->snap) an orphan block reservation of its own, both
+         * via btrfs_block_rsv_migrate() calls that pass
+         * &pending->block_rsv first.  Every failure here is a BUG_ON().
+         */
+        struct btrfs_root *root = pending->root;
+        struct btrfs_root *snap = pending->snap;
+        struct btrfs_block_rsv *block_rsv;
+        u64 num_bytes;
+        int index;
+        int ret;
+        if (!root->orphan_block_rsv || list_empty(&root->orphan_list))
+                return;
+        /*
+         * refill source subvolume's orphan block reservation with the
+         * amount it is short of its size (same shortfall computation as
+         * in btrfs_orphan_pre_snapshot())
+         */
+        block_rsv = root->orphan_block_rsv;
+        index = trans->transid & 0x1;
+        if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) {
+                num_bytes = block_rsv->size -
+                            (block_rsv->reserved + block_rsv->freed[index]);
+                ret = btrfs_block_rsv_migrate(&pending->block_rsv,
+                                              root->orphan_block_rsv,
+                                              num_bytes);
+                BUG_ON(ret);
+        }
+        /*
+         * setup orphan block reservation for the snapshot: allocate a new
+         * block_rsv, register it as durable, attach it to snap, and
+         * migrate as many bytes as the source reservation's size
+         */
+        block_rsv = btrfs_alloc_block_rsv(snap);
+        BUG_ON(!block_rsv);
+        btrfs_add_durable_block_rsv(root->fs_info, block_rsv);
+        snap->orphan_block_rsv = block_rsv;
+        num_bytes = root->orphan_block_rsv->size;
+        ret = btrfs_block_rsv_migrate(&pending->block_rsv,
+                                      block_rsv, num_bytes);
+        BUG_ON(ret);
+#if 0
+        /* insert orphan item for the snapshot (currently compiled out) */
+        WARN_ON(!root->orphan_item_inserted);
+        ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
+                                       snap->root_key.objectid);
+        BUG_ON(ret);
+        snap->orphan_item_inserted = 1;
+#endif
+}
+enum btrfs_orphan_cleanup_state {       /* values of root->orphan_cleanup_state */
+        ORPHAN_CLEANUP_STARTED  = 1,    /* set from 0 by cmpxchg in btrfs_orphan_cleanup() */
+        ORPHAN_CLEANUP_DONE     = 2,    /* btrfs_orphan_commit_root() acts only in this state */
+};
+/*
+ * This is called at transaction commit time. If there are no orphan
+ * files in the subvolume, it removes the orphan item and frees the
+ * block_rsv structure.
+ *
+ * Nothing is done while root->orphan_list is non-empty or while
+ * root->orphan_cleanup_state is anything other than ORPHAN_CLEANUP_DONE.
+ */
+void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
+                              struct btrfs_root *root)
+{
+        int ret;
+        if (!list_empty(&root->orphan_list) ||
+            root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
+                return;
+        if (root->orphan_item_inserted &&
+            btrfs_root_refs(&root->root_item) > 0) {
+                /* the subvolume's orphan item is keyed by its root objectid in the tree root */
+                ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root,
+                                            root->root_key.objectid);
+                BUG_ON(ret);
+                root->orphan_item_inserted = 0;
+        }
+        if (root->orphan_block_rsv) {
+                /* warn if the reservation still has a non-zero size when freed */
+                WARN_ON(root->orphan_block_rsv->size > 0);
+                btrfs_free_block_rsv(root, root->orphan_block_rsv);
+                root->orphan_block_rsv = NULL;
+        }
+}
+/*
 * This creates an orphan entry for the given inode in case something goes
 * wrong in the middle of an unlink/truncate.
+ *
+ * NOTE: caller of this function should reserve 5 units of metadata for
+ *       this function.
 */
 int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
 {
        struct btrfs_root *root = BTRFS_I(inode)->root;
-        int ret = 0;
+        struct btrfs_block_rsv *block_rsv = NULL;
+        int reserve = 0;
+        int insert = 0;
+        int ret;
-        spin_lock(&root->list_lock);
+        if (!root->orphan_block_rsv) {
+                block_rsv = btrfs_alloc_block_rsv(root);
+                BUG_ON(!block_rsv);
+        }
-        /* already on the orphan list, we're good */
+        spin_lock(&root->orphan_lock);
-        if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
+        if (!root->orphan_block_rsv) {
-                spin_unlock(&root->list_lock);
+                root->orphan_block_rsv = block_rsv;
-                return 0;
+        } else if (block_rsv) {
+                btrfs_free_block_rsv(root, block_rsv);
+                block_rsv = NULL;
+        }
+        if (list_empty(&BTRFS_I(inode)->i_orphan)) {
+                list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
+#if 0
+                /*
+                 * For proper ENOSPC handling, we should do orphan
+                 * cleanup when mounting. But this introduces backward
+                 * compatibility issue.
+                 */
+                if (!xchg(&root->orphan_item_inserted, 1))
+                        insert = 2;
+                else
+                        insert = 1;
+#endif
+                insert = 1;
+        } else {
+                WARN_ON(!BTRFS_I(inode)->orphan_meta_reserved);
        }
-        list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
+        if (!BTRFS_I(inode)->orphan_meta_reserved) {
+                BTRFS_I(inode)->orphan_meta_reserved = 1;
+                reserve = 1;
+        }
+        spin_unlock(&root->orphan_lock);
-        spin_unlock(&root->list_lock);
+        if (block_rsv)
+                btrfs_add_durable_block_rsv(root->fs_info, block_rsv);
-        /*
+        /* grab metadata reservation from transaction handle */
-         * insert an orphan item to track this unlinked/truncated file
+        if (reserve) {
-         */
+                ret = btrfs_orphan_reserve_metadata(trans, inode);
-        ret = btrfs_insert_orphan_item(trans, root, inode->i_ino);
+                BUG_ON(ret);
+        }
-        return ret;
+        /* insert an orphan item to track this unlinked/truncated file */
+        if (insert >= 1) {
+                ret = btrfs_insert_orphan_item(trans, root, inode->i_ino);
+                BUG_ON(ret);
+        }
+        /* insert an orphan item to record that the subvolume has orphan files */
+        if (insert >= 2) {
+                ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
+                                               root->root_key.objectid);
+                BUG_ON(ret);
+        }
+        return 0;
 }
 /*
@@ -2028,26 +2202,31 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
 int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
 {
        struct btrfs_root *root = BTRFS_I(inode)->root;
+        int delete_item = 0;
+        int release_rsv = 0;
        int ret = 0;
-        spin_lock(&root->list_lock);
+        spin_lock(&root->orphan_lock);
+        if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
-        if (list_empty(&BTRFS_I(inode)->i_orphan)) {
+                list_del_init(&BTRFS_I(inode)->i_orphan);
-                spin_unlock(&root->list_lock);
+                delete_item = 1;
-                return 0;
        }
-        list_del_init(&BTRFS_I(inode)->i_orphan);
+        if (BTRFS_I(inode)->orphan_meta_reserved) {
-        if (!trans) {
+                BTRFS_I(inode)->orphan_meta_reserved = 0;
-                spin_unlock(&root->list_lock);
+                release_rsv = 1;
-                return 0;
        }
+        spin_unlock(&root->orphan_lock);
-        spin_unlock(&root->list_lock);
+        if (trans && delete_item) {
+                ret = btrfs_del_orphan_item(trans, root, inode->i_ino);
+                BUG_ON(ret);
+        }
-        ret = btrfs_del_orphan_item(trans, root, inode->i_ino);
+        if (release_rsv)
+                btrfs_orphan_release_metadata(inode);
-        return ret;
+        return 0;
 }
 /*
@@ -2064,7 +2243,7 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
        struct inode *inode;
        int ret = 0, nr_unlink = 0, nr_truncate = 0;
-        if (!xchg(&root->clean_orphans, 0))
+        if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
                return;
        path = btrfs_alloc_path();
@@ -2117,16 +2296,15 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
                found_key.type = BTRFS_INODE_ITEM_KEY;
                found_key.offset = 0;
                inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
-                if (IS_ERR(inode))
+                BUG_ON(IS_ERR(inode));
-                        break;
                /*
                 * add this inode to the orphan list so btrfs_orphan_del does
                 * the proper thing when we hit it
                 */
-                spin_lock(&root->list_lock);
+                spin_lock(&root->orphan_lock);
                list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
-                spin_unlock(&root->list_lock);
+                spin_unlock(&root->orphan_lock);
                /*
                 * if this is a bad inode, means we actually succeeded in
@@ -2135,7 +2313,7 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
                 * do a destroy_inode
                 */
                if (is_bad_inode(inode)) {
-                        trans = btrfs_start_transaction(root, 1);
+                        trans = btrfs_start_transaction(root, 0);
                        btrfs_orphan_del(trans, inode);
                        btrfs_end_transaction(trans, root);
                        iput(inode);
@@ -2153,13 +2331,23 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
                /* this will do delete_inode and everything for us */
                iput(inode);
        }
+        btrfs_free_path(path);
+        root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;
+        if (root->orphan_block_rsv)
+                btrfs_block_rsv_release(root, root->orphan_block_rsv,
+                                        (u64)-1);
+        if (root->orphan_block_rsv || root->orphan_item_inserted) {
+                trans = btrfs_join_transaction(root, 1);
+                btrfs_end_transaction(trans, root);
+        }
        if (nr_unlink)
                printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink);
        if (nr_truncate)
                printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate);
-        btrfs_free_path(path);
 }
 /*
@@ -2478,29 +2666,201 @@ out:
        return ret;
 }
-static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
+/*
+ * helper to check if there is any shared block in the path
+ *
+ * Walks path->nodes[] from level 0 upwards, stopping at the first empty
+ * slot.  Returns 1 as soon as a block that can be shared is found to have
+ * more than one reference, 0 otherwise.
+ *
+ * If the reference count of a block cannot be looked up, the block is
+ * conservatively reported as shared (1) instead of testing a refs value
+ * that was never filled in.
+ */
+static int check_path_shared(struct btrfs_root *root,
+                             struct btrfs_path *path)
+{
+        struct extent_buffer *eb;
+        int level;
+        int ret;
+        u64 refs;
+        for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
+                if (!path->nodes[level])
+                        break;
+                eb = path->nodes[level];
+                if (!btrfs_block_can_be_shared(root, eb))
+                        continue;
+                ret = btrfs_lookup_extent_info(NULL, root, eb->start, eb->len,
+                                               &refs, NULL);
+                /* refs is only meaningful when the lookup succeeded */
+                if (ret || refs > 1)
+                        return 1;
+        }
+        return 0;
+}
+/*
+ * helper to start transaction for unlink and rmdir.
+ *
+ * unlink and rmdir are special in btrfs, they do not always free space.
+ * so in enospc case, we should make sure they will free space before
+ * allowing them to use the global metadata reservation.
+ */
+static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
+                                                       struct dentry *dentry)
 {
-        struct btrfs_root *root;
        struct btrfs_trans_handle *trans;
+        struct btrfs_root *root = BTRFS_I(dir)->root;
+        struct btrfs_path *path;
+        struct btrfs_inode_ref *ref;
+        struct btrfs_dir_item *di;
        struct inode *inode = dentry->d_inode;
+        u64 index;
+        int check_link = 1;
+        int err = -ENOSPC;
        int ret;
-        unsigned long nr = 0;
-        root = BTRFS_I(dir)->root;
+        trans = btrfs_start_transaction(root, 10);
+        if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
+                return trans;
-        /*
+        if (inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
-         * 5 items for unlink inode
+                return ERR_PTR(-ENOSPC);
-         * 1 for orphan
-         */
+        /* check whether someone else holds a reference */
-        ret = btrfs_reserve_metadata_space(root, 6);
+        if (S_ISDIR(inode->i_mode) && atomic_read(&inode->i_count) > 1)
-        if (ret)
+                return ERR_PTR(-ENOSPC);
-                return ret;
+        if (atomic_read(&inode->i_count) > 2)
+                return ERR_PTR(-ENOSPC);
+        if (xchg(&root->fs_info->enospc_unlink, 1))
+                return ERR_PTR(-ENOSPC);
-        trans = btrfs_start_transaction(root, 1);
+        path = btrfs_alloc_path();
+        if (!path) {
+                root->fs_info->enospc_unlink = 0;
+                return ERR_PTR(-ENOMEM);
+        }
+        trans = btrfs_start_transaction(root, 0);
        if (IS_ERR(trans)) {
-                btrfs_unreserve_metadata_space(root, 6);
+                btrfs_free_path(path);
-                return PTR_ERR(trans);
+                root->fs_info->enospc_unlink = 0;
+                return trans;
+        }
+        path->skip_locking = 1;
+        path->search_commit_root = 1;
+        ret = btrfs_lookup_inode(trans, root, path,
+                                &BTRFS_I(dir)->location, 0);
+        if (ret < 0) {
+                err = ret;
+                goto out;
+        }
+        if (ret == 0) {
+                if (check_path_shared(root, path))
+                        goto out;
+        } else {
+                check_link = 0;
+        }
+        btrfs_release_path(root, path);
+        ret = btrfs_lookup_inode(trans, root, path,
+                                &BTRFS_I(inode)->location, 0);
+        if (ret < 0) {
+                err = ret;
+                goto out;
+        }
+        if (ret == 0) {
+                if (check_path_shared(root, path))
+                        goto out;
+        } else {
+                check_link = 0;
+        }
+        btrfs_release_path(root, path);
+        if (ret == 0 && S_ISREG(inode->i_mode)) {
+                ret = btrfs_lookup_file_extent(trans, root, path,
+                                               inode->i_ino, (u64)-1, 0);
+                if (ret < 0) {
+                        err = ret;
+                        goto out;
+                }
+                BUG_ON(ret == 0);
+                if (check_path_shared(root, path))
+                        goto out;
+                btrfs_release_path(root, path);
+        }
+        if (!check_link) {
+                err = 0;
+                goto out;
+        }
+        di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
+                                dentry->d_name.name, dentry->d_name.len, 0);
+        if (IS_ERR(di)) {
+                err = PTR_ERR(di);
+                goto out;
+        }
+        if (di) {
+                if (check_path_shared(root, path))
+                        goto out;
+        } else {
+                err = 0;
+                goto out;
        }
+        btrfs_release_path(root, path);
+        ref = btrfs_lookup_inode_ref(trans, root, path,
+                                dentry->d_name.name, dentry->d_name.len,
+                                inode->i_ino, dir->i_ino, 0);
+        if (IS_ERR(ref)) {
+                err = PTR_ERR(ref);
+                goto out;
+        }
+        BUG_ON(!ref);
+        if (check_path_shared(root, path))
+                goto out;
+        index = btrfs_inode_ref_index(path->nodes[0], ref);
+        btrfs_release_path(root, path);
+        di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, index,
+                                dentry->d_name.name, dentry->d_name.len, 0);
+        if (IS_ERR(di)) {
+                err = PTR_ERR(di);
+                goto out;
+        }
+        BUG_ON(ret == -ENOENT);
+        if (check_path_shared(root, path))
+                goto out;
+        err = 0;
+out:
+        btrfs_free_path(path);
+        if (err) {
+                btrfs_end_transaction(trans, root);
+                root->fs_info->enospc_unlink = 0;
+                return ERR_PTR(err);
+        }
+        trans->block_rsv = &root->fs_info->global_block_rsv;
+        return trans;
+}
+static void __unlink_end_trans(struct btrfs_trans_handle *trans,
+                               struct btrfs_root *root)
+{
+        if (trans->block_rsv == &root->fs_info->global_block_rsv) {
+                BUG_ON(!root->fs_info->enospc_unlink);
+                root->fs_info->enospc_unlink = 0;
+        }
+        btrfs_end_transaction_throttle(trans, root);
+}
+static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
+{
+        struct btrfs_root *root = BTRFS_I(dir)->root;
+        struct btrfs_trans_handle *trans;
+        struct inode *inode = dentry->d_inode;
+        int ret;
+        unsigned long nr = 0;
+        trans = __unlink_start_trans(dir, dentry);
+        if (IS_ERR(trans))
+                return PTR_ERR(trans);
        btrfs_set_trans_block_group(trans, dir);
@@ -2508,14 +2868,15 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
        ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
                                 dentry->d_name.name, dentry->d_name.len);
+        BUG_ON(ret);
-        if (inode->i_nlink == 0)
+        if (inode->i_nlink == 0) {
                ret = btrfs_orphan_add(trans, inode);
+                BUG_ON(ret);
+        }
        nr = trans->blocks_used;
+        __unlink_end_trans(trans, root);
-        btrfs_end_transaction_throttle(trans, root);
-        btrfs_unreserve_metadata_space(root, 6);
        btrfs_btree_balance_dirty(root, nr);
        return ret;
 }
@@ -2587,7 +2948,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
 {
        struct inode *inode = dentry->d_inode;
        int err = 0;
-        int ret;
        struct btrfs_root *root = BTRFS_I(dir)->root;
        struct btrfs_trans_handle *trans;
        unsigned long nr = 0;
@@ -2596,15 +2956,9 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
            inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
                return -ENOTEMPTY;
-        ret = btrfs_reserve_metadata_space(root, 5);
+        trans = __unlink_start_trans(dir, dentry);
-        if (ret)
+        if (IS_ERR(trans))
-                return ret;
-        trans = btrfs_start_transaction(root, 1);
-        if (IS_ERR(trans)) {
-                btrfs_unreserve_metadata_space(root, 5);
                return PTR_ERR(trans);
-        }
        btrfs_set_trans_block_group(trans, dir);
@@ -2627,12 +2981,9 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
                btrfs_i_size_write(inode, 0);
 out:
        nr = trans->blocks_used;
-        ret = btrfs_end_transaction_throttle(trans, root);
+        __unlink_end_trans(trans, root);
-        btrfs_unreserve_metadata_space(root, 5);
        btrfs_btree_balance_dirty(root, nr);
-        if (ret && !err)
-                err = ret;
        return err;
 }
@@ -3029,6 +3380,7 @@ out:
        if (pending_del_nr) {
                ret = btrfs_del_items(trans, root, path, pending_del_slot,
                                      pending_del_nr);
+                BUG_ON(ret);
        }
        btrfs_free_path(path);
        return err;
@@ -3056,11 +3408,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
        if ((offset & (blocksize - 1)) == 0)
                goto out;
-        ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE);
+        ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
-        if (ret)
-                goto out;
-        ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
        if (ret)
                goto out;
@@ -3068,8 +3416,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
 again:
        page = grab_cache_page(mapping, index);
        if (!page) {
-                btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
+                btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
-                btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
                goto out;
        }
@@ -3132,8 +3479,7 @@ again:
 out_unlock:
        if (ret)
-                btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
+                btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
-        btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
        unlock_page(page);
        page_cache_release(page);
 out:
@@ -3145,7 +3491,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
        struct btrfs_trans_handle *trans;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
-        struct extent_map *em;
+        struct extent_map *em = NULL;
        struct extent_state *cached_state = NULL;
        u64 mask = root->sectorsize - 1;
        u64 hole_start = (inode->i_size + mask) & ~mask;
@@ -3183,11 +3529,11 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
                        u64 hint_byte = 0;
                        hole_size = last_byte - cur_offset;
-                        err = btrfs_reserve_metadata_space(root, 2);
+                        trans = btrfs_start_transaction(root, 2);
-                        if (err)
+                        if (IS_ERR(trans)) {
+                                err = PTR_ERR(trans);
                                break;
+                        }
-                        trans = btrfs_start_transaction(root, 1);
                        btrfs_set_trans_block_group(trans, inode);
                        err = btrfs_drop_extents(trans, inode, cur_offset,
@@ -3205,14 +3551,15 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
                                        last_byte - 1, 0);
                        btrfs_end_transaction(trans, root);
-                        btrfs_unreserve_metadata_space(root, 2);
                }
                free_extent_map(em);
+                em = NULL;
                cur_offset = last_byte;
                if (cur_offset >= block_end)
                        break;
        }
+        free_extent_map(em);
        unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state,
                             GFP_NOFS);
        return err;
@@ -3239,11 +3586,10 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr)
                }
        }
-        ret = btrfs_reserve_metadata_space(root, 1);
+        trans = btrfs_start_transaction(root, 5);
-        if (ret)
+        if (IS_ERR(trans))
-                return ret;
+                return PTR_ERR(trans);
-        trans = btrfs_start_transaction(root, 1);
        btrfs_set_trans_block_group(trans, inode);
        ret = btrfs_orphan_add(trans, inode);
@@ -3251,7 +3597,6 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr)
        nr = trans->blocks_used;
        btrfs_end_transaction(trans, root);
-        btrfs_unreserve_metadata_space(root, 1);
        btrfs_btree_balance_dirty(root, nr);
        if (attr->ia_size > inode->i_size) {
@@ -3264,8 +3609,11 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr)
                i_size_write(inode, attr->ia_size);
                btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
-                trans = btrfs_start_transaction(root, 1);
+                trans = btrfs_start_transaction(root, 0);
+                BUG_ON(IS_ERR(trans));
                btrfs_set_trans_block_group(trans, inode);
+                trans->block_rsv = root->orphan_block_rsv;
+                BUG_ON(!trans->block_rsv);
                ret = btrfs_update_inode(trans, root, inode);
                BUG_ON(ret);
@@ -3345,10 +3693,21 @@ void btrfs_delete_inode(struct inode *inode)
        btrfs_i_size_write(inode, 0);
        while (1) {
-                trans = btrfs_start_transaction(root, 1);
+                trans = btrfs_start_transaction(root, 0);
+                BUG_ON(IS_ERR(trans));
                btrfs_set_trans_block_group(trans, inode);
-                ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
+                trans->block_rsv = root->orphan_block_rsv;
+                ret = btrfs_block_rsv_check(trans, root,
+                                            root->orphan_block_rsv, 0, 5);
+                if (ret) {
+                        BUG_ON(ret != -EAGAIN);
+                        ret = btrfs_commit_transaction(trans, root);
+                        BUG_ON(ret);
+                        continue;
+                }
+                ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
                if (ret != -EAGAIN)
                        break;
@@ -3356,6 +3715,7 @@ void btrfs_delete_inode(struct inode *inode)
                btrfs_end_transaction(trans, root);
                trans = NULL;
                btrfs_btree_balance_dirty(root, nr);
        }
        if (ret == 0) {
@@ -3596,40 +3956,10 @@ again:
        return 0;
 }
-static noinline void init_btrfs_i(struct inode *inode)
-{
-        struct btrfs_inode *bi = BTRFS_I(inode);
-        bi->generation = 0;
-        bi->sequence = 0;
-        bi->last_trans = 0;
-        bi->last_sub_trans = 0;
-        bi->logged_trans = 0;
-        bi->delalloc_bytes = 0;
-        bi->reserved_bytes = 0;
-        bi->disk_i_size = 0;
-        bi->flags = 0;
-        bi->index_cnt = (u64)-1;
-        bi->last_unlink_trans = 0;
-        bi->ordered_data_close = 0;
-        bi->force_compress = 0;
-        extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
-        extent_io_tree_init(&BTRFS_I(inode)->io_tree,
-                             inode->i_mapping, GFP_NOFS);
-        extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
-                             inode->i_mapping, GFP_NOFS);
-        INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes);
-        INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations);
-        RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
-        btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
-        mutex_init(&BTRFS_I(inode)->log_mutex);
-}
 static int btrfs_init_locked_inode(struct inode *inode, void *p)
 {
        struct btrfs_iget_args *args = p;
        inode->i_ino = args->ino;
-        init_btrfs_i(inode);
        BTRFS_I(inode)->root = args->root;
        btrfs_set_inode_space_info(args->root, inode);
        return 0;
@@ -3692,8 +4022,6 @@ static struct inode *new_simple_dir(struct super_block *s,
        if (!inode)
                return ERR_PTR(-ENOMEM);
-        init_btrfs_i(inode);
        BTRFS_I(inode)->root = root;
        memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
        BTRFS_I(inode)->dummy_inode = 1;
@@ -3950,7 +4278,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
        struct btrfs_trans_handle *trans;
        int ret = 0;
-        if (root->fs_info->btree_inode == inode)
+        if (BTRFS_I(inode)->dummy_inode)
                return 0;
        if (wbc->sync_mode == WB_SYNC_ALL) {
@@ -3971,10 +4299,38 @@ void btrfs_dirty_inode(struct inode *inode)
 {
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_trans_handle *trans;
+        int ret;
+        if (BTRFS_I(inode)->dummy_inode)
+                return;
        trans = btrfs_join_transaction(root, 1);
        btrfs_set_trans_block_group(trans, inode);
-        btrfs_update_inode(trans, root, inode);
+        ret = btrfs_update_inode(trans, root, inode);
+        if (ret && ret == -ENOSPC) {
+                /* whoops, let's try again with the full transaction */
+                btrfs_end_transaction(trans, root);
+                trans = btrfs_start_transaction(root, 1);
+                if (IS_ERR(trans)) {
+                        if (printk_ratelimit()) {
+                                printk(KERN_ERR "btrfs: fail to "
+                                       "dirty  inode %lu error %ld\n",
+                                       inode->i_ino, PTR_ERR(trans));
+                        }
+                        return;
+                }
+                btrfs_set_trans_block_group(trans, inode);
+                ret = btrfs_update_inode(trans, root, inode);
+                if (ret) {
+                        if (printk_ratelimit()) {
+                                printk(KERN_ERR "btrfs: fail to "
+                                       "dirty  inode %lu error %d\n",
+                                       inode->i_ino, ret);
+                        }
+                }
+        }
        btrfs_end_transaction(trans, root);
 }
@@ -4092,7 +4448,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
         * btrfs_get_inode_index_count has an explanation for the magic
         * number
         */
-        init_btrfs_i(inode);
        BTRFS_I(inode)->index_cnt = 2;
        BTRFS_I(inode)->root = root;
        BTRFS_I(inode)->generation = trans->transid;
@@ -4247,26 +4602,21 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
        if (!new_valid_dev(rdev))
                return -EINVAL;
+        err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
+        if (err)
+                return err;
        /*
         * 2 for inode item and ref
         * 2 for dir items
         * 1 for xattr if selinux is on
         */
-        err = btrfs_reserve_metadata_space(root, 5);
+        trans = btrfs_start_transaction(root, 5);
-        if (err)
+        if (IS_ERR(trans))
-                return err;
+                return PTR_ERR(trans);
-        trans = btrfs_start_transaction(root, 1);
-        if (!trans)
-                goto fail;
        btrfs_set_trans_block_group(trans, dir);
-        err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
-        if (err) {
-                err = -ENOSPC;
-                goto out_unlock;
-        }
        inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
                                dentry->d_name.len,
                                dentry->d_parent->d_inode->i_ino, objectid,
@@ -4295,13 +4645,11 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
 out_unlock:
        nr = trans->blocks_used;
        btrfs_end_transaction_throttle(trans, root);
-fail:
+        btrfs_btree_balance_dirty(root, nr);
-        btrfs_unreserve_metadata_space(root, 5);
        if (drop_inode) {
                inode_dec_link_count(inode);
                iput(inode);
        }
-        btrfs_btree_balance_dirty(root, nr);
        return err;
 }
@@ -4311,32 +4659,26 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
        struct btrfs_trans_handle *trans;
        struct btrfs_root *root = BTRFS_I(dir)->root;
        struct inode *inode = NULL;
-        int err;
        int drop_inode = 0;
+        int err;
        unsigned long nr = 0;
        u64 objectid;
        u64 index = 0;
+        err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
+        if (err)
+                return err;
        /*
         * 2 for inode item and ref
         * 2 for dir items
         * 1 for xattr if selinux is on
         */
-        err = btrfs_reserve_metadata_space(root, 5);
+        trans = btrfs_start_transaction(root, 5);
-        if (err)
+        if (IS_ERR(trans))
-                return err;
+                return PTR_ERR(trans);
-        trans = btrfs_start_transaction(root, 1);
-        if (!trans)
-                goto fail;
        btrfs_set_trans_block_group(trans, dir);
-        err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
-        if (err) {
-                err = -ENOSPC;
-                goto out_unlock;
-        }
        inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
                                dentry->d_name.len,
                                dentry->d_parent->d_inode->i_ino,
@@ -4368,8 +4710,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 out_unlock:
        nr = trans->blocks_used;
        btrfs_end_transaction_throttle(trans, root);
-fail:
-        btrfs_unreserve_metadata_space(root, 5);
        if (drop_inode) {
                inode_dec_link_count(inode);
                iput(inode);
@@ -4396,21 +4736,21 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
        if (root->objectid != BTRFS_I(inode)->root->objectid)
                return -EPERM;
-        /*
-         * 1 item for inode ref
-         * 2 items for dir items
-         */
-        err = btrfs_reserve_metadata_space(root, 3);
-        if (err)
-                return err;
        btrfs_inc_nlink(inode);
        err = btrfs_set_inode_index(dir, &index);
        if (err)
                goto fail;
-        trans = btrfs_start_transaction(root, 1);
+        /*
+         * 1 item for inode ref
+         * 2 items for dir items
+         */
+        trans = btrfs_start_transaction(root, 3);
+        if (IS_ERR(trans)) {
+                err = PTR_ERR(trans);
+                goto fail;
+        }
        btrfs_set_trans_block_group(trans, dir);
        atomic_inc(&inode->i_count);
@@ -4429,7 +4769,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
        nr = trans->blocks_used;
        btrfs_end_transaction_throttle(trans, root);
 fail:
-        btrfs_unreserve_metadata_space(root, 3);
        if (drop_inode) {
                inode_dec_link_count(inode);
                iput(inode);
@@ -4449,28 +4788,20 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
        u64 index = 0;
        unsigned long nr = 1;
+        err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
+        if (err)
+                return err;
        /*
         * 2 items for inode and ref
         * 2 items for dir items
         * 1 for xattr if selinux is on
         */
-        err = btrfs_reserve_metadata_space(root, 5);
+        trans = btrfs_start_transaction(root, 5);
-        if (err)
+        if (IS_ERR(trans))
-                return err;
+                return PTR_ERR(trans);
-        trans = btrfs_start_transaction(root, 1);
-        if (!trans) {
-                err = -ENOMEM;
-                goto out_unlock;
-        }
        btrfs_set_trans_block_group(trans, dir);
-        err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
-        if (err) {
-                err = -ENOSPC;
-                goto out_fail;
-        }
        inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
                                dentry->d_name.len,
                                dentry->d_parent->d_inode->i_ino, objectid,
@@ -4510,9 +4841,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 out_fail:
        nr = trans->blocks_used;
        btrfs_end_transaction_throttle(trans, root);
-out_unlock:
-        btrfs_unreserve_metadata_space(root, 5);
        if (drop_on_err)
                iput(inode);
        btrfs_btree_balance_dirty(root, nr);
@@ -4770,6 +5098,7 @@ again:
                        }
                        flush_dcache_page(page);
                } else if (create && PageUptodate(page)) {
+                        WARN_ON(1);
                        if (!trans) {
                                kunmap(page);
                                free_extent_map(em);
@@ -4866,11 +5195,651 @@ out:
        return em;
 }
+/*
+ * Allocate a fresh on-disk extent for a direct IO write at file offset
+ * @start of @inode.  The reservation is made under a joined transaction
+ * whose block_rsv is pointed at the delalloc block reserve, a pinned extent
+ * map describing it is inserted into the inode's extent map tree, and a DIO
+ * ordered extent is added for it.
+ *
+ * Returns the extent map on success or an ERR_PTR() on failure.  The mapped
+ * length is whatever btrfs_reserve_extent() reported in ins.offset, so
+ * callers must look at em->len rather than assume @len.
+ */
+static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
+                                                  u64 start, u64 len)
+{
+        struct btrfs_root *root = BTRFS_I(inode)->root;
+        struct btrfs_trans_handle *trans;
+        struct extent_map *em;
+        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+        struct btrfs_key ins;
+        u64 alloc_hint;
+        int ret;
+        /* throw away any cached mappings for the range we are replacing */
+        btrfs_drop_extent_cache(inode, start, start + len - 1, 0);
+        trans = btrfs_join_transaction(root, 0);
+        if (!trans)
+                return ERR_PTR(-ENOMEM);
+        trans->block_rsv = &root->fs_info->delalloc_block_rsv;
+        alloc_hint = get_extent_allocation_hint(inode, start, len);
+        ret = btrfs_reserve_extent(trans, root, len, root->sectorsize, 0,
+                                   alloc_hint, (u64)-1, &ins, 1);
+        if (ret) {
+                em = ERR_PTR(ret);
+                goto out;
+        }
+        em = alloc_extent_map(GFP_NOFS);
+        if (!em) {
+                em = ERR_PTR(-ENOMEM);
+                goto out;
+        }
+        /* describe the reservation: ins.objectid/ins.offset are used as disk start/length */
+        em->start = start;
+        em->orig_start = em->start;
+        em->len = ins.offset;
+        em->block_start = ins.objectid;
+        em->block_len = ins.offset;
+        em->bdev = root->fs_info->fs_devices->latest_bdev;
+        set_bit(EXTENT_FLAG_PINNED, &em->flags);
+        /*
+         * Keep retrying the insert while it collides with an existing
+         * mapping (-EEXIST), dropping the cached range before each retry.
+         */
+        while (1) {
+                write_lock(&em_tree->lock);
+                ret = add_extent_mapping(em_tree, em);
+                write_unlock(&em_tree->lock);
+                if (ret != -EEXIST)
+                        break;
+                btrfs_drop_extent_cache(inode, start, start + em->len - 1, 0);
+        }
+        ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
+                                           ins.offset, ins.offset, 0);
+        if (ret) {
+                /*
+                 * NOTE(review): em is overwritten with an ERR_PTR here with no
+                 * free_extent_map() in sight - looks like the map allocated
+                 * above is leaked on this path; confirm.
+                 */
+                btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
+                em = ERR_PTR(ret);
+        }
+out:
+        btrfs_end_transaction(trans, root);
+        return em;
+}
+/*
+ * Decide whether a direct IO write to [offset, offset + len) of @inode may
+ * overwrite the existing file extent in place instead of being cow'd.
+ *
+ * returns 1 when the nocow is safe, < 0 on error, 0 if the
+ * block must be cow'd
+ */
+static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,
+                                      struct inode *inode, u64 offset, u64 len)
+{
+        struct btrfs_path *path;
+        int ret;
+        struct extent_buffer *leaf;
+        struct btrfs_root *root = BTRFS_I(inode)->root;
+        struct btrfs_file_extent_item *fi;
+        struct btrfs_key key;
+        u64 disk_bytenr;
+        u64 backref_offset;
+        u64 extent_end;
+        u64 num_bytes;
+        int slot;
+        int found_type;
+        path = btrfs_alloc_path();
+        if (!path)
+                return -ENOMEM;
+        ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
+                                       offset, 0);
+        if (ret < 0)
+                goto out;
+        slot = path->slots[0];
+        /*
+         * ret == 1: the lookup did not land on an item at exactly this
+         * offset, so step back one slot and examine the preceding item
+         * (presumably the extent that starts before @offset - confirm).
+         */
+        if (ret == 1) {
+                if (slot == 0) {
+                        /* can't find the item, must cow */
+                        ret = 0;
+                        goto out;
+                }
+                slot--;
+        }
+        /* from here on every early exit means "must cow" */
+        ret = 0;
+        leaf = path->nodes[0];
+        btrfs_item_key_to_cpu(leaf, &key, slot);
+        if (key.objectid != inode->i_ino ||
+            key.type != BTRFS_EXTENT_DATA_KEY) {
+                /* not our file or wrong item type, must cow */
+                goto out;
+        }
+        if (key.offset > offset) {
+                /* Wrong offset, must cow */
+                goto out;
+        }
+        fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
+        found_type = btrfs_file_extent_type(leaf, fi);
+        if (found_type != BTRFS_FILE_EXTENT_REG &&
+            found_type != BTRFS_FILE_EXTENT_PREALLOC) {
+                /* not a regular extent, must cow */
+                goto out;
+        }
+        disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
+        backref_offset = btrfs_file_extent_offset(leaf, fi);
+        extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
+        if (extent_end < offset + len) {
+                /* extent doesn't include our full range, must cow */
+                goto out;
+        }
+        /* extent reported read-only by btrfs_extent_readonly(), must cow */
+        if (btrfs_extent_readonly(root, disk_bytenr))
+                goto out;
+        /*
+         * look for other files referencing this extent, if we
+         * find any we must cow
+         */
+        if (btrfs_cross_ref_exist(trans, root, inode->i_ino,
+                                  key.offset - backref_offset, disk_bytenr))
+                goto out;
+        /*
+         * adjust disk_bytenr and num_bytes to cover just the bytes
+         * in this extent we are about to write.  If there
+         * are any csums in that range we have to cow in order
+         * to keep the csums correct
+         */
+        disk_bytenr += backref_offset;
+        disk_bytenr += offset - key.offset;
+        num_bytes = min(offset + len, extent_end) - offset;
+        if (csum_exist_in_range(root, disk_bytenr, num_bytes))
+                                goto out;
+        /*
+         * all of the above have passed, it is safe to overwrite this extent
+         * without cow
+         */
+        ret = 1;
+out:
+        btrfs_free_path(path);
+        return ret;
+}
+/*
+ * get_block callback handed to __blockdev_direct_IO() by btrfs_direct_IO():
+ * map the file range that starts at block @iblock and is at most
+ * bh_result->b_size bytes long into @bh_result.
+ *
+ * @create is non-zero for writes.  Writes either reuse the existing extent
+ * (prealloc, or NODATACOW when can_nocow_odirect() agrees) or get a new
+ * extent from btrfs_new_extent_direct().
+ *
+ * Returns 0 on success (for a read of a hole or prealloc extent the buffer
+ * is left unmapped), -ENOTBLK for inline or compressed extents, or another
+ * negative errno.
+ */
+static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
+                                   struct buffer_head *bh_result, int create)
+{
+        struct extent_map *em;
+        struct btrfs_root *root = BTRFS_I(inode)->root;
+        u64 start = iblock << inode->i_blkbits;
+        u64 len = bh_result->b_size;
+        struct btrfs_trans_handle *trans;
+        em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
+        if (IS_ERR(em))
+                return PTR_ERR(em);
+        /*
+         * Ok for INLINE and COMPRESSED extents we need to fallback on buffered
+         * io.  INLINE is special, and we could probably kludge it in here, but
+         * it's still buffered so for safety lets just fall back to the generic
+         * buffered path.
+         *
+         * For COMPRESSED we _have_ to read the entire extent in so we can
+         * decompress it, so there will be buffering required no matter what we
+         * do, so go ahead and fallback to buffered.
+         *
+         * We return -ENOTBLK because thats what makes DIO go ahead and go back
+         * to buffered IO.  Don't blame me, this is the price we pay for using
+         * the generic code.
+         */
+        if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
+            em->block_start == EXTENT_MAP_INLINE) {
+                free_extent_map(em);
+                return -ENOTBLK;
+        }
+        /* Just a good old fashioned hole, return */
+        if (!create && (em->block_start == EXTENT_MAP_HOLE ||
+                        test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
+                free_extent_map(em);
+                /* DIO will do one hole at a time, so just unlock a sector */
+                unlock_extent(&BTRFS_I(inode)->io_tree, start,
+                              start + root->sectorsize - 1, GFP_NOFS);
+                return 0;
+        }
+        /*
+         * We don't allocate a new extent in the following cases
+         *
+         * 1) The inode is marked as NODATACOW.  In this case we'll just use the
+         * existing extent.
+         * 2) The extent is marked as PREALLOC.  We're good to go here and can
+         * just use the extent.
+         *
+         */
+        if (!create) {
+                /* plain read: map everything from start to the end of the extent */
+                len = em->len - (start - em->start);
+                goto map;
+        }
+        if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
+            ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
+             em->block_start != EXTENT_MAP_HOLE)) {
+                int type;
+                int ret;
+                u64 block_start;
+                if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+                        type = BTRFS_ORDERED_PREALLOC;
+                else
+                        type = BTRFS_ORDERED_NOCOW;
+                len = min(len, em->len - (start - em->start));
+                block_start = em->block_start + (start - em->start);
+                /*
+                 * we're not going to log anything, but we do need
+                 * to make sure the current transaction stays open
+                 * while we look for nocow cross refs
+                 */
+                trans = btrfs_join_transaction(root, 0);
+                if (!trans)
+                        goto must_cow;
+                if (can_nocow_odirect(trans, inode, start, len) == 1) {
+                        ret = btrfs_add_ordered_extent_dio(inode, start,
+                                           block_start, len, len, type);
+                        btrfs_end_transaction(trans, root);
+                        if (ret) {
+                                free_extent_map(em);
+                                return ret;
+                        }
+                        goto unlock;
+                }
+                /* 0 or an error from can_nocow_odirect(): fall through and cow */
+                btrfs_end_transaction(trans, root);
+        }
+must_cow:
+        /*
+         * this will cow the extent, reset the len in case we changed
+         * it above
+         */
+        len = bh_result->b_size;
+        free_extent_map(em);
+        em = btrfs_new_extent_direct(inode, start, len);
+        if (IS_ERR(em))
+                return PTR_ERR(em);
+        len = min(len, em->len - (start - em->start));
+unlock:
+        /* write path: clear the locked/delalloc/dirty bits on the range being mapped */
+        clear_extent_bit(&BTRFS_I(inode)->io_tree, start, start + len - 1,
+                          EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DIRTY, 1,
+                          0, NULL, GFP_NOFS);
+map:
+        /* translate the file offset into a block number relative to the extent start */
+        bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
+                inode->i_blkbits;
+        bh_result->b_size = len;
+        bh_result->b_bdev = em->bdev;
+        set_buffer_mapped(bh_result);
+        if (create && !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+                set_buffer_new(bh_result);
+        free_extent_map(em);
+        return 0;
+}
+/*
+ * Per-bio state for btrfs direct IO.  btrfs_submit_direct() allocates one,
+ * stores it in bio->bi_private, and the read/write completion handlers free
+ * it after restoring the original bi_private.
+ */
+struct btrfs_dio_private {
+        /* inode the IO is being done against */
+        struct inode *inode;
+        /* file offset of the first byte covered by the bio */
+        u64 logical_offset;
+        /* bio->bi_sector converted to bytes (<< 9) at submit time */
+        u64 disk_bytenr;
+        /* sum of bv_len over all of the bio's vecs */
+        u64 bytes;
+        /* one u32 per bio vec; NULL when the inode is NODATASUM */
+        u32 *csums;
+        /* bio->bi_private as it was before we took the bio over */
+        void *private;
+};
+/*
+ * Completion handler for direct IO read bios.  Unless the inode is
+ * NODATASUM, the data of every bio vec is checksummed and compared with the
+ * matching entry in dip->csums; any mismatch is logged and turns the
+ * completion status into -EIO.  The extent range is then unlocked, the
+ * btrfs_dio_private is freed and the bio is handed to dio_end_io() with its
+ * original bi_private restored.
+ */
+static void btrfs_endio_direct_read(struct bio *bio, int err)
+{
+        struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
+        struct bio_vec *bvec = bio->bi_io_vec;
+        struct btrfs_dio_private *dip = bio->bi_private;
+        struct inode *inode = dip->inode;
+        struct btrfs_root *root = BTRFS_I(inode)->root;
+        u64 start;
+        u32 *private = dip->csums;
+        start = dip->logical_offset;
+        do {
+                if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
+                        struct page *page = bvec->bv_page;
+                        char *kaddr;
+                        u32 csum = ~(u32)0;
+                        unsigned long flags;
+                        /* the atomic KM_IRQ0 mapping is used with interrupts disabled */
+                        local_irq_save(flags);
+                        kaddr = kmap_atomic(page, KM_IRQ0);
+                        csum = btrfs_csum_data(root, kaddr + bvec->bv_offset,
+                                               csum, bvec->bv_len);
+                        btrfs_csum_final(csum, (char *)&csum);
+                        kunmap_atomic(kaddr, KM_IRQ0);
+                        local_irq_restore(flags);
+                        flush_dcache_page(bvec->bv_page);
+                        if (csum != *private) {
+                                printk(KERN_ERR "btrfs csum failed ino %lu off"
+                                      " %llu csum %u private %u\n",
+                                      inode->i_ino, (unsigned long long)start,
+                                      csum, *private);
+                                err = -EIO;
+                        }
+                }
+                /* advance the file offset, expected csum and vec in lockstep */
+                start += bvec->bv_len;
+                private++;
+                bvec++;
+        } while (bvec <= bvec_end);
+        unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
+                      dip->logical_offset + dip->bytes - 1, GFP_NOFS);
+        /* give the bio back to the generic DIO code as we found it */
+        bio->bi_private = dip->private;
+        kfree(dip->csums);
+        kfree(dip);
+        dio_end_io(bio, err);
+}
+/*
+ * Completion handler for direct IO write bios.  When
+ * btrfs_dec_test_ordered_pending() returns non-zero for this bio's range,
+ * the ordered extent is finished here: depending on its flags the inode is
+ * just updated (NOCOW), the prealloc extent is marked written, or a new file
+ * extent item is inserted; pending csums are added and i_size/inode item
+ * updated.  In all cases the btrfs_dio_private is freed and the bio is
+ * passed to dio_end_io() with its original bi_private restored.
+ */
+static void btrfs_endio_direct_write(struct bio *bio, int err)
+{
+        struct btrfs_dio_private *dip = bio->bi_private;
+        struct inode *inode = dip->inode;
+        struct btrfs_root *root = BTRFS_I(inode)->root;
+        struct btrfs_trans_handle *trans;
+        struct btrfs_ordered_extent *ordered = NULL;
+        struct extent_state *cached_state = NULL;
+        int ret;
+        /* the IO itself failed: skip all ordered extent processing */
+        if (err)
+                goto out_done;
+        ret = btrfs_dec_test_ordered_pending(inode, &ordered,
+                                             dip->logical_offset, dip->bytes);
+        /* 0 means there is nothing to finish from this bio */
+        if (!ret)
+                goto out_done;
+        BUG_ON(!ordered);
+        trans = btrfs_join_transaction(root, 1);
+        if (!trans) {
+                /*
+                 * NOTE(review): the out label below calls
+                 * btrfs_end_transaction(trans, root) with trans == NULL on
+                 * this path - confirm that is safe.
+                 */
+                err = -ENOMEM;
+                goto out;
+        }
+        trans->block_rsv = &root->fs_info->delalloc_block_rsv;
+        if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
+                /* overwrite in place: only i_size and the inode item may change */
+                ret = btrfs_ordered_update_i_size(inode, 0, ordered);
+                if (!ret)
+                        ret = btrfs_update_inode(trans, root, inode);
+                err = ret;
+                goto out;
+        }
+        lock_extent_bits(&BTRFS_I(inode)->io_tree, ordered->file_offset,
+                         ordered->file_offset + ordered->len - 1, 0,
+                         &cached_state, GFP_NOFS);
+        if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) {
+                ret = btrfs_mark_extent_written(trans, inode,
+                                                ordered->file_offset,
+                                                ordered->file_offset +
+                                                ordered->len);
+                if (ret) {
+                        err = ret;
+                        goto out_unlock;
+                }
+        } else {
+                ret = insert_reserved_file_extent(trans, inode,
+                                                  ordered->file_offset,
+                                                  ordered->start,
+                                                  ordered->disk_len,
+                                                  ordered->len,
+                                                  ordered->len,
+                                                  0, 0, 0,
+                                                  BTRFS_FILE_EXTENT_REG);
+                /* the extent cache is unpinned whether or not the insert worked */
+                unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
+                                   ordered->file_offset, ordered->len);
+                if (ret) {
+                        err = ret;
+                        WARN_ON(1);
+                        goto out_unlock;
+                }
+        }
+        add_pending_csums(trans, inode, ordered->file_offset, &ordered->list);
+        btrfs_ordered_update_i_size(inode, 0, ordered);
+        btrfs_update_inode(trans, root, inode);
+out_unlock:
+        unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset,
+                             ordered->file_offset + ordered->len - 1,
+                             &cached_state, GFP_NOFS);
+out:
+        btrfs_delalloc_release_metadata(inode, ordered->len);
+        btrfs_end_transaction(trans, root);
+        /*
+         * NOTE(review): two puts - presumably one for the reference handed
+         * back by btrfs_dec_test_ordered_pending() and one to drop the
+         * ordered extent itself; confirm.
+         */
+        btrfs_put_ordered_extent(ordered);
+        btrfs_put_ordered_extent(ordered);
+out_done:
+        /* give the bio back to the generic DIO code as we found it */
+        bio->bi_private = dip->private;
+        kfree(dip->csums);
+        kfree(dip);
+        dio_end_io(bio, err);
+}
+/*
+ * Start hook that btrfs_submit_direct() passes to btrfs_wq_submit_bio() for
+ * checksummed direct IO writes.  It runs btrfs_csum_one_bio() on @bio at
+ * file offset @offset and BUGs if that fails, so it always returns 0.
+ * @rw, @mirror_num and @bio_flags are not used.
+ */
+static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw,
+                                    struct bio *bio, int mirror_num,
+                                    unsigned long bio_flags, u64 offset)
+{
+        int csum_ret = btrfs_csum_one_bio(BTRFS_I(inode)->root, inode, bio,
+                                          offset, 1);
+        BUG_ON(csum_ret);
+        return 0;
+}
+/*
+ * Submit hook handed to __blockdev_direct_IO() by btrfs_direct_IO(): wraps
+ * @bio in a btrfs_dio_private, installs the btrfs read or write completion
+ * handler and sends the bio down.  Unless the inode is NODATASUM, writes go
+ * through btrfs_wq_submit_bio() so they get checksummed, and reads have
+ * their expected csums looked up into dip->csums first.
+ *
+ * @rw:          bio direction flags; the BIO_RW bit selects the write path
+ * @bio:         bio built by the generic direct IO code
+ * @inode:       inode the IO targets
+ * @file_offset: file offset of the first byte covered by @bio
+ *
+ * Nothing is returned.  On failure the bio is completed with the error via
+ * bio_endio() and, for writes, the ordered extent found at @file_offset is
+ * torn down.
+ */
+static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
+                                loff_t file_offset)
+{
+        struct btrfs_root *root = BTRFS_I(inode)->root;
+        struct btrfs_dio_private *dip;
+        struct bio_vec *bvec = bio->bi_io_vec;
+        int skip_sum;
+        int write = rw & (1 << BIO_RW);
+        int ret = 0;
+        skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
+        dip = kmalloc(sizeof(*dip), GFP_NOFS);
+        if (!dip) {
+                ret = -ENOMEM;
+                goto free_ordered;
+        }
+        dip->csums = NULL;
+        if (!skip_sum) {
+                dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS);
+                if (!dip->csums) {
+                        /* the bio has not been touched yet, so only dip needs freeing */
+                        kfree(dip);
+                        ret = -ENOMEM;
+                        goto free_ordered;
+                }
+        }
+        dip->private = bio->bi_private;
+        dip->inode = inode;
+        dip->logical_offset = file_offset;
+        /* total payload of the bio: sum of all vec lengths */
+        dip->bytes = 0;
+        do {
+                dip->bytes += bvec->bv_len;
+                bvec++;
+        } while (bvec <= (bio->bi_io_vec + bio->bi_vcnt - 1));
+        dip->disk_bytenr = (u64)bio->bi_sector << 9;
+        bio->bi_private = dip;
+        if (write)
+                bio->bi_end_io = btrfs_endio_direct_write;
+        else
+                bio->bi_end_io = btrfs_endio_direct_read;
+        ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+        if (ret)
+                goto out_err;
+        if (write && !skip_sum) {
+                ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
+                                   inode, rw, bio, 0, 0,
+                                   dip->logical_offset,
+                                   __btrfs_submit_bio_start_direct_io,
+                                   __btrfs_submit_bio_done);
+                if (ret)
+                        goto out_err;
+                return;
+        } else if (!skip_sum)
+                btrfs_lookup_bio_sums_dio(root, inode, bio,
+                                          dip->logical_offset, dip->csums);
+        ret = btrfs_map_bio(root, rw, bio, 0, 1);
+        if (ret)
+                goto out_err;
+        return;
+out_err:
+        /*
+         * NOTE(review): bio->bi_private and bio->bi_end_io were already
+         * pointed at dip and the btrfs completion handlers above, so the
+         * bio_endio() below may end up handing the freed dip to them;
+         * confirm how the submit helpers leave the bio on failure.
+         */
+        kfree(dip->csums);
+        kfree(dip);
+free_ordered:
+        /*
+         * If this is a write, we need to clean up the reserved space and kill
+         * the ordered extent.
+         *
+         * dip is either NULL or already freed when we get here, so the
+         * lookup must use file_offset and never dip->logical_offset.
+         */
+        if (write) {
+                struct btrfs_ordered_extent *ordered;
+                ordered = btrfs_lookup_ordered_extent(inode, file_offset);
+                if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) &&
+                    !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
+                        btrfs_free_reserved_extent(root, ordered->start,
+                                                   ordered->disk_len);
+                btrfs_put_ordered_extent(ordered);
+                btrfs_put_ordered_extent(ordered);
+        }
+        bio_endio(bio, ret);
+}
+/*
+ * Validate the alignment of a direct IO request against the mask
+ * root->sectorsize - 1: the file offset and every iovec's base address and
+ * length must have none of those low bits set.
+ *
+ * Returns 0 when the whole request is aligned and -EINVAL otherwise.
+ * @rw and @iocb are not consulted.
+ */
+static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *iocb,
+                        const struct iovec *iov, loff_t offset,
+                        unsigned long nr_segs)
+{
+        unsigned align_mask = root->sectorsize - 1;
+        unsigned long i;
+        if (offset & align_mask)
+                return -EINVAL;
+        /* Check the memory alignment.  Blocks cannot straddle pages */
+        for (i = 0; i < nr_segs; i++) {
+                unsigned long base = (unsigned long)iov[i].iov_base;
+                size_t bytes = iov[i].iov_len;
+                if ((base | bytes) & align_mask)
+                        return -EINVAL;
+        }
+        return 0;
+}
 static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
                        const struct iovec *iov, loff_t offset,
                        unsigned long nr_segs)
 {
-        return -EINVAL;
+        /*
+         * Direct IO entry point: lock the extent range
+         * [offset, offset + count - 1], wait out any ordered extents in
+         * it, then let __blockdev_direct_IO() drive the transfer using
+         * btrfs_get_blocks_direct() and btrfs_submit_direct().  Returns
+         * whatever __blockdev_direct_IO() returns, 0 for a misaligned
+         * request, or the error from the write-side setup.
+         */
+        struct file *file = iocb->ki_filp;
+        struct inode *inode = file->f_mapping->host;
+        struct btrfs_ordered_extent *ordered;
+        struct extent_state *cached_state = NULL;
+        u64 lockstart, lockend;
+        ssize_t ret;
+        int writing = rw & WRITE;
+        int write_bits = 0;
+        size_t count = iov_length(iov, nr_segs);
+        /*
+         * A misaligned request reports 0 bytes done rather than an error -
+         * presumably so the caller falls back to buffered IO; confirm.
+         */
+        if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov,
+                            offset, nr_segs)) {
+                return 0;
+        }
+        lockstart = offset;
+        lockend = offset + count - 1;
+        if (writing) {
+                ret = btrfs_delalloc_reserve_space(inode, count);
+                if (ret)
+                        goto out;
+        }
+        /* lock the range; if an ordered extent overlaps it, unlock, wait and retry */
+        while (1) {
+                lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+                                 0, &cached_state, GFP_NOFS);
+                /*
+                 * We're concerned with the entire range that we're going to be
+                 * doing DIO to, so we need to make sure theres no ordered
+                 * extents in this range.
+                 */
+                ordered = btrfs_lookup_ordered_range(inode, lockstart,
+                                                     lockend - lockstart + 1);
+                if (!ordered)
+                        break;
+                unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+                                     &cached_state, GFP_NOFS);
+                btrfs_start_ordered_extent(inode, ordered, 1);
+                btrfs_put_ordered_extent(ordered);
+                cond_resched();
+        }
+        /*
+         * we don't use btrfs_set_extent_delalloc because we don't want
+         * the dirty or uptodate bits
+         */
+        if (writing) {
+                /* write_bits is what the cleanup paths below must clear again */
+                write_bits = EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING;
+                ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+                                     EXTENT_DELALLOC, 0, NULL, &cached_state,
+                                     GFP_NOFS);
+                if (ret) {
+                        clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
+                                         lockend, EXTENT_LOCKED | write_bits,
+                                         1, 0, &cached_state, GFP_NOFS);
+                        goto out;
+                }
+        }
+        free_extent_state(cached_state);
+        cached_state = NULL;
+        ret = __blockdev_direct_IO(rw, iocb, inode,
+                   BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
+                   iov, offset, nr_segs, btrfs_get_blocks_direct, NULL,
+                   btrfs_submit_direct, 0);
+        /* hard failure (not a queued async request): undo the whole range */
+        if (ret < 0 && ret != -EIOCBQUEUED) {
+                clear_extent_bit(&BTRFS_I(inode)->io_tree, offset,
+                              offset + iov_length(iov, nr_segs) - 1,
+                              EXTENT_LOCKED | write_bits, 1, 0,
+                              &cached_state, GFP_NOFS);
+        } else if (ret >= 0 && ret < iov_length(iov, nr_segs)) {
+                /*
+                 * We're falling back to buffered, unlock the section we didn't
+                 * do IO on.
+                 */
+                clear_extent_bit(&BTRFS_I(inode)->io_tree, offset + ret,
+                              offset + iov_length(iov, nr_segs) - 1,
+                              EXTENT_LOCKED | write_bits, 1, 0,
+                              &cached_state, GFP_NOFS);
+        }
+out:
+        free_extent_state(cached_state);
+        return ret;
 }
 static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
@@ -5034,7 +6003,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
        u64 page_start;
        u64 page_end;
-        ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE);
+        ret  = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
        if (ret) {
                if (ret == -ENOMEM)
                        ret = VM_FAULT_OOM;
@@ -5043,13 +6012,6 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
                goto out;
        }
-        ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
-        if (ret) {
-                btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
-                ret = VM_FAULT_SIGBUS;
-                goto out;
-        }
        ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
 again:
        lock_page(page);
@@ -5059,7 +6021,6 @@ again:
        if ((page->mapping != inode->i_mapping) ||
            (page_start >= size)) {
-                btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
                /* page got truncated out from underneath us */
                goto out_unlock;
        }
@@ -5100,7 +6061,6 @@ again:
                unlock_extent_cached(io_tree, page_start, page_end,
                                     &cached_state, GFP_NOFS);
                ret = VM_FAULT_SIGBUS;
-                btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
                goto out_unlock;
        }
        ret = 0;
@@ -5127,10 +6087,10 @@ again:
        unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS);
 out_unlock:
-        btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
        if (!ret)
                return VM_FAULT_LOCKED;
        unlock_page(page);
+        btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
 out:
        return ret;
 }
@@ -5155,8 +6115,10 @@ static void btrfs_truncate(struct inode *inode)
        btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
        btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
-        trans = btrfs_start_transaction(root, 1);
+        trans = btrfs_start_transaction(root, 0);
+        BUG_ON(IS_ERR(trans));
        btrfs_set_trans_block_group(trans, inode);
+        trans->block_rsv = root->orphan_block_rsv;
        /*
         * setattr is responsible for setting the ordered_data_close flag,
@@ -5179,6 +6141,23 @@ static void btrfs_truncate(struct inode *inode)
                btrfs_add_ordered_operation(trans, root, inode);
        while (1) {
+                if (!trans) {
+                        trans = btrfs_start_transaction(root, 0);
+                        BUG_ON(IS_ERR(trans));
+                        btrfs_set_trans_block_group(trans, inode);
+                        trans->block_rsv = root->orphan_block_rsv;
+                }
+                ret = btrfs_block_rsv_check(trans, root,
+                                            root->orphan_block_rsv, 0, 5);
+                if (ret) {
+                        BUG_ON(ret != -EAGAIN);
+                        ret = btrfs_commit_transaction(trans, root);
+                        BUG_ON(ret);
+                        trans = NULL;
+                        continue;
+                }
                ret = btrfs_truncate_inode_items(trans, root, inode,
                                                 inode->i_size,
                                                 BTRFS_EXTENT_DATA_KEY);
@@ -5190,10 +6169,8 @@ static void btrfs_truncate(struct inode *inode)
                nr = trans->blocks_used;
                btrfs_end_transaction(trans, root);
+                trans = NULL;
                btrfs_btree_balance_dirty(root, nr);
-                trans = btrfs_start_transaction(root, 1);
-                btrfs_set_trans_block_group(trans, inode);
        }
        if (ret == 0 && inode->i_nlink > 0) {
@@ -5254,21 +6231,47 @@ unsigned long btrfs_force_ra(struct address_space *mapping,
 struct inode *btrfs_alloc_inode(struct super_block *sb)
 {
        struct btrfs_inode *ei;
+        struct inode *inode;
        ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS);
        if (!ei)
                return NULL;
+        ei->root = NULL;
+        ei->space_info = NULL;
+        ei->generation = 0;
+        ei->sequence = 0;
        ei->last_trans = 0;
        ei->last_sub_trans = 0;
        ei->logged_trans = 0;
-        ei->outstanding_extents = 0;
+        ei->delalloc_bytes = 0;
-        ei->reserved_extents = 0;
+        ei->reserved_bytes = 0;
-        ei->root = NULL;
+        ei->disk_i_size = 0;
+        ei->flags = 0;
+        ei->index_cnt = (u64)-1;
+        ei->last_unlink_trans = 0;
        spin_lock_init(&ei->accounting_lock);
+        atomic_set(&ei->outstanding_extents, 0);
+        ei->reserved_extents = 0;
+        ei->ordered_data_close = 0;
+        ei->orphan_meta_reserved = 0;
+        ei->dummy_inode = 0;
+        ei->force_compress = 0;
+        inode = &ei->vfs_inode;
+        extent_map_tree_init(&ei->extent_tree, GFP_NOFS);
+        extent_io_tree_init(&ei->io_tree, &inode->i_data, GFP_NOFS);
+        extent_io_tree_init(&ei->io_failure_tree, &inode->i_data, GFP_NOFS);
+        mutex_init(&ei->log_mutex);
        btrfs_ordered_inode_tree_init(&ei->ordered_tree);
        INIT_LIST_HEAD(&ei->i_orphan);
+        INIT_LIST_HEAD(&ei->delalloc_inodes);
        INIT_LIST_HEAD(&ei->ordered_operations);
-        return &ei->vfs_inode;
+        RB_CLEAR_NODE(&ei->rb_node);
+        return inode;
 }
 void btrfs_destroy_inode(struct inode *inode)
@@ -5278,6 +6281,8 @@ void btrfs_destroy_inode(struct inode *inode)
        WARN_ON(!list_empty(&inode->i_dentry));
        WARN_ON(inode->i_data.nrpages);
+        WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents));
+        WARN_ON(BTRFS_I(inode)->reserved_extents);
        /*
         * This can happen where we create an inode, but somebody else also
@@ -5298,13 +6303,13 @@ void btrfs_destroy_inode(struct inode *inode)
                spin_unlock(&root->fs_info->ordered_extent_lock);
        }
-        spin_lock(&root->list_lock);
+        spin_lock(&root->orphan_lock);
        if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
                printk(KERN_INFO "BTRFS: inode %lu still on the orphan list\n",
                       inode->i_ino);
                list_del_init(&BTRFS_I(inode)->i_orphan);
        }
-        spin_unlock(&root->list_lock);
+        spin_unlock(&root->orphan_lock);
        while (1) {
                ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
@@ -5425,19 +6430,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
        if (S_ISDIR(old_inode->i_mode) && new_inode &&
            new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
                return -ENOTEMPTY;
-        /*
-         * We want to reserve the absolute worst case amount of items.  So if
-         * both inodes are subvols and we need to unlink them then that would
-         * require 4 item modifications, but if they are both normal inodes it
-         * would require 5 item modifications, so we'll assume their normal
-         * inodes.  So 5 * 2 is 10, plus 1 for the new link, so 11 total items
-         * should cover the worst case number of items we'll modify.
-         */
-        ret = btrfs_reserve_metadata_space(root, 11);
-        if (ret)
-                return ret;
        /*
         * we're using rename to replace one file with another.
         * and the replacement file is large.  Start IO on it now so
@@ -5450,8 +6442,18 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
        /* close the racy window with snapshot create/destroy ioctl */
        if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
                down_read(&root->fs_info->subvol_sem);
+        /*
+         * We want to reserve the absolute worst case amount of items.  So if
+         * both inodes are subvols and we need to unlink them then that would
+         * require 4 item modifications, but if they are both normal inodes it
+         * would require 5 item modifications, so we'll assume they're normal
+         * inodes.  So 5 * 2 is 10, plus 1 for the new link, so 11 total items
+         * should cover the worst case number of items we'll modify.
+         */
+        trans = btrfs_start_transaction(root, 20);
+        if (IS_ERR(trans))
+                return PTR_ERR(trans);
-        trans = btrfs_start_transaction(root, 1);
        btrfs_set_trans_block_group(trans, new_dir);
        if (dest != root)
@@ -5550,7 +6552,6 @@ out_fail:
        if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
                up_read(&root->fs_info->subvol_sem);
-        btrfs_unreserve_metadata_space(root, 11);
        return ret;
 }
@@ -5602,6 +6603,38 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
        return 0;
 }
+int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput)
+{       /*
+         * Write back one inode taken from the head of
+         * root->fs_info->delalloc_inodes.
+         *
+         * @root:       only its fs_info (the delalloc list and its lock) is
+         *              used here.
+         * @delay_iput: non-zero drops the inode reference through
+         *              btrfs_add_delayed_iput(), zero through iput().
+         *
+         * Returns 1 if an inode was grabbed and handed to write_inode_now()
+         * (whose result is not checked), 0 if the list ran empty without a
+         * usable inode.
+         */
+        struct btrfs_inode *binode;
+        struct inode *inode = NULL;
+        spin_lock(&root->fs_info->delalloc_lock);
+        while (!list_empty(&root->fs_info->delalloc_inodes)) {
+                binode = list_entry(root->fs_info->delalloc_inodes.next,
+                                    struct btrfs_inode, delalloc_inodes);
+                inode = igrab(&binode->vfs_inode);
+                if (inode) {    /* got a reference: rotate the entry to the
+                                 * tail of the list and stop scanning */
+                        list_move_tail(&binode->delalloc_inodes,
+                                       &root->fs_info->delalloc_inodes);
+                        break;
+                }
+                list_del_init(&binode->delalloc_inodes); /* igrab() failed: unlink the entry */
+                cond_resched_lock(&root->fs_info->delalloc_lock);
+        }
+        spin_unlock(&root->fs_info->delalloc_lock);
+        if (inode) {
+                write_inode_now(inode, 0); /* NOTE(review): sync == 0, presumably
+                                            * does not wait for the IO - confirm */
+                if (delay_iput)
+                        btrfs_add_delayed_iput(inode);
+                else
+                        iput(inode);
+                return 1;
+        }
+        return 0;
+}
 static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
                         const char *symname)
 {
@@ -5625,26 +6658,20 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
        if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
                return -ENAMETOOLONG;
+        err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
+        if (err)
+                return err;
        /*
         * 2 items for inode item and ref
         * 2 items for dir items
         * 1 item for xattr if selinux is on
         */
-        err = btrfs_reserve_metadata_space(root, 5);
+        trans = btrfs_start_transaction(root, 5);
-        if (err)
+        if (IS_ERR(trans))
-                return err;
+                return PTR_ERR(trans);
-        trans = btrfs_start_transaction(root, 1);
-        if (!trans)
-                goto out_fail;
        btrfs_set_trans_block_group(trans, dir);
-        err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
-        if (err) {
-                err = -ENOSPC;
-                goto out_unlock;
-        }
        inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
                                dentry->d_name.len,
                                dentry->d_parent->d_inode->i_ino, objectid,
@@ -5716,8 +6743,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 out_unlock:
        nr = trans->blocks_used;
        btrfs_end_transaction_throttle(trans, root);
-out_fail:
-        btrfs_unreserve_metadata_space(root, 5);
        if (drop_inode) {
                inode_dec_link_count(inode);
                iput(inode);
@@ -5726,33 +6751,28 @@ out_fail:
        return err;
 }
-static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
+int btrfs_prealloc_file_range(struct inode *inode, int mode,
-                        u64 alloc_hint, int mode, loff_t actual_len)
+                              u64 start, u64 num_bytes, u64 min_size,
+                              loff_t actual_len, u64 *alloc_hint)
 {
        struct btrfs_trans_handle *trans;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_key ins;
        u64 cur_offset = start;
-        u64 num_bytes = end - start;
        int ret = 0;
-        u64 i_size;
        while (num_bytes > 0) {
-                trans = btrfs_start_transaction(root, 1);
+                trans = btrfs_start_transaction(root, 3);
+                if (IS_ERR(trans)) {
-                ret = btrfs_reserve_extent(trans, root, num_bytes,
+                        ret = PTR_ERR(trans);
-                                           root->sectorsize, 0, alloc_hint,
+                        break;
-                                           (u64)-1, &ins, 1);
-                if (ret) {
-                        WARN_ON(1);
-                        goto stop_trans;
                }
-                ret = btrfs_reserve_metadata_space(root, 3);
+                ret = btrfs_reserve_extent(trans, root, num_bytes, min_size,
+                                           0, *alloc_hint, (u64)-1, &ins, 1);
                if (ret) {
-                        btrfs_free_reserved_extent(root, ins.objectid,
+                        btrfs_end_transaction(trans, root);
-                                                   ins.offset);
+                        break;
-                        goto stop_trans;
                }
                ret = insert_reserved_file_extent(trans, inode,
@@ -5766,34 +6786,27 @@ static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
                num_bytes -= ins.offset;
                cur_offset += ins.offset;
-                alloc_hint = ins.objectid + ins.offset;
+                *alloc_hint = ins.objectid + ins.offset;
                inode->i_ctime = CURRENT_TIME;
                BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
                if (!(mode & FALLOC_FL_KEEP_SIZE) &&
-                        (actual_len > inode->i_size) &&
+                    (actual_len > inode->i_size) &&
-                        (cur_offset > inode->i_size)) {
+                    (cur_offset > inode->i_size)) {
                        if (cur_offset > actual_len)
-                                i_size  = actual_len;
+                                i_size_write(inode, actual_len);
                        else
-                                i_size = cur_offset;
+                                i_size_write(inode, cur_offset);
-                        i_size_write(inode, i_size);
+                        i_size_write(inode, cur_offset);
-                        btrfs_ordered_update_i_size(inode, i_size, NULL);
+                        btrfs_ordered_update_i_size(inode, cur_offset, NULL);
                }
                ret = btrfs_update_inode(trans, root, inode);
                BUG_ON(ret);
                btrfs_end_transaction(trans, root);
-                btrfs_unreserve_metadata_space(root, 3);
        }
        return ret;
-stop_trans:
-        btrfs_end_transaction(trans, root);
-        return ret;
 }
 static long btrfs_fallocate(struct inode *inode, int mode,
@@ -5826,8 +6839,7 @@ static long btrfs_fallocate(struct inode *inode, int mode,
                        goto out;
        }
-        ret = btrfs_check_data_free_space(BTRFS_I(inode)->root, inode,
+        ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
-                                          alloc_end - alloc_start);
        if (ret)
                goto out;
@@ -5872,16 +6884,16 @@ static long btrfs_fallocate(struct inode *inode, int mode,
                if (em->block_start == EXTENT_MAP_HOLE ||
                    (cur_offset >= inode->i_size &&
                     !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
-                        ret = prealloc_file_range(inode,
+                        ret = btrfs_prealloc_file_range(inode, 0, cur_offset,
-                                                  cur_offset, last_byte,
+                                                        last_byte - cur_offset,
-                                                alloc_hint, mode, offset+len);
+                                                        1 << inode->i_blkbits,
+                                                        offset + len,
+                                                        &alloc_hint);
                        if (ret < 0) {
                                free_extent_map(em);
                                break;
                        }
                }
-                if (em->block_start <= EXTENT_MAP_LAST_BYTE)
-                        alloc_hint = em->block_start;
                free_extent_map(em);
                cur_offset = last_byte;
@@ -5893,8 +6905,7 @@ static long btrfs_fallocate(struct inode *inode, int mode,
        unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
                             &cached_state, GFP_NOFS);
-        btrfs_free_reserved_data_space(BTRFS_I(inode)->root, inode,
+        btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
-                                       alloc_end - alloc_start);
 out:
        mutex_unlock(&inode->i_mutex);
        return ret;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 97a97839a867..4cdb98cf26de 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -239,23 +239,19 @@ static noinline int create_subvol(struct btrfs_root *root,
        u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
        u64 index = 0;
+        ret = btrfs_find_free_objectid(NULL, root->fs_info->tree_root,
+                                       0, &objectid);
+        if (ret)
+                return ret;
        /*
         * 1 - inode item
         * 2 - refs
         * 1 - root item
         * 2 - dir items
         */
-        ret = btrfs_reserve_metadata_space(root, 6);
+        trans = btrfs_start_transaction(root, 6);
-        if (ret)
+        if (IS_ERR(trans))
-                return ret;
+                return PTR_ERR(trans);
-        trans = btrfs_start_transaction(root, 1);
-        BUG_ON(!trans);
-        ret = btrfs_find_free_objectid(trans, root->fs_info->tree_root,
-                                       0, &objectid);
-        if (ret)
-                goto fail;
        leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
                                      0, objectid, NULL, 0, 0, 0);
@@ -345,13 +341,10 @@ fail:
        err = btrfs_commit_transaction(trans, root);
        if (err && !ret)
                ret = err;
-        btrfs_unreserve_metadata_space(root, 6);
        return ret;
 }
-static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
+static int create_snapshot(struct btrfs_root *root, struct dentry *dentry)
-                           char *name, int namelen)
 {
        struct inode *inode;
        struct btrfs_pending_snapshot *pending_snapshot;
@@ -361,40 +354,33 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
        if (!root->ref_cows)
                return -EINVAL;
-        /*
-         * 1 - inode item
-         * 2 - refs
-         * 1 - root item
-         * 2 - dir items
-         */
-        ret = btrfs_reserve_metadata_space(root, 6);
-        if (ret)
-                goto fail;
        pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
-        if (!pending_snapshot) {
+        if (!pending_snapshot)
-                ret = -ENOMEM;
+                return -ENOMEM;
-                btrfs_unreserve_metadata_space(root, 6);
-                goto fail;
+        btrfs_init_block_rsv(&pending_snapshot->block_rsv);
-        }
-        pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS);
-        if (!pending_snapshot->name) {
-                ret = -ENOMEM;
-                kfree(pending_snapshot);
-                btrfs_unreserve_metadata_space(root, 6);
-                goto fail;
-        }
-        memcpy(pending_snapshot->name, name, namelen);
-        pending_snapshot->name[namelen] = '\0';
        pending_snapshot->dentry = dentry;
-        trans = btrfs_start_transaction(root, 1);
-        BUG_ON(!trans);
        pending_snapshot->root = root;
+        trans = btrfs_start_transaction(root->fs_info->extent_root, 5);
+        if (IS_ERR(trans)) {
+                ret = PTR_ERR(trans);
+                goto fail;
+        }
+        ret = btrfs_snap_reserve_metadata(trans, pending_snapshot);
+        BUG_ON(ret);
        list_add(&pending_snapshot->list,
                 &trans->transaction->pending_snapshots);
-        ret = btrfs_commit_transaction(trans, root);
+        ret = btrfs_commit_transaction(trans, root->fs_info->extent_root);
        BUG_ON(ret);
-        btrfs_unreserve_metadata_space(root, 6);
+        ret = pending_snapshot->error;
+        if (ret)
+                goto fail;
+        btrfs_orphan_cleanup(pending_snapshot->snap);
        inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry);
        if (IS_ERR(inode)) {
@@ -405,6 +391,7 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
        d_instantiate(dentry, inode);
        ret = 0;
 fail:
+        kfree(pending_snapshot);
        return ret;
 }
@@ -456,8 +443,7 @@ static noinline int btrfs_mksubvol(struct path *parent,
                goto out_up_read;
        if (snap_src) {
-                error = create_snapshot(snap_src, dentry,
+                error = create_snapshot(snap_src, dentry);
-                                        name, namelen);
        } else {
                error = create_subvol(BTRFS_I(dir)->root, dentry,
                                      name, namelen);
@@ -601,19 +587,9 @@ static int btrfs_defrag_file(struct file *file,
                if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
                        BTRFS_I(inode)->force_compress = 1;
-                ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE);
+                ret  = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
-                if (ret) {
+                if (ret)
-                        ret = -ENOSPC;
+                        goto err_unlock;
-                        break;
-                }
-                ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
-                if (ret) {
-                        btrfs_free_reserved_data_space(root, inode,
-                                                       PAGE_CACHE_SIZE);
-                        ret = -ENOSPC;
-                        break;
-                }
 again:
                if (inode->i_size == 0 ||
                    i > ((inode->i_size - 1) >> PAGE_CACHE_SHIFT)) {
@@ -622,8 +598,10 @@ again:
                }
                page = grab_cache_page(inode->i_mapping, i);
-                if (!page)
+                if (!page) {
+                        ret = -ENOMEM;
                        goto err_reservations;
+                }
                if (!PageUptodate(page)) {
                        btrfs_readpage(NULL, page);
@@ -631,6 +609,7 @@ again:
                        if (!PageUptodate(page)) {
                                unlock_page(page);
                                page_cache_release(page);
+                                ret = -EIO;
                                goto err_reservations;
                        }
                }
@@ -644,8 +623,7 @@ again:
                wait_on_page_writeback(page);
                if (PageDirty(page)) {
-                        btrfs_free_reserved_data_space(root, inode,
+                        btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
-                                                       PAGE_CACHE_SIZE);
                        goto loop_unlock;
                }
@@ -683,7 +661,6 @@ loop_unlock:
                page_cache_release(page);
                mutex_unlock(&inode->i_mutex);
-                btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
                balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1);
                i++;
        }
@@ -713,9 +690,9 @@ loop_unlock:
        return 0;
 err_reservations:
+        btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
+err_unlock:
        mutex_unlock(&inode->i_mutex);
-        btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
-        btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
        return ret;
 }
@@ -811,7 +788,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
                device->name, (unsigned long long)new_size);
        if (new_size > old_size) {
-                trans = btrfs_start_transaction(root, 1);
+                trans = btrfs_start_transaction(root, 0);
                ret = btrfs_grow_device(trans, device, new_size);
                btrfs_commit_transaction(trans, root);
        } else {
@@ -1300,7 +1277,13 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
        if (err)
                goto out_up_write;
-        trans = btrfs_start_transaction(root, 1);
+        trans = btrfs_start_transaction(root, 0);
+        if (IS_ERR(trans)) {
+                err = PTR_ERR(trans);
+                goto out;
+        }
+        trans->block_rsv = &root->fs_info->global_block_rsv;
        ret = btrfs_unlink_subvol(trans, root, dir,
                                dest->root_key.objectid,
                                dentry->d_name.name,
@@ -1314,10 +1297,12 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
        dest->root_item.drop_level = 0;
        btrfs_set_root_refs(&dest->root_item, 0);
-        ret = btrfs_insert_orphan_item(trans,
+        if (!xchg(&dest->orphan_item_inserted, 1)) {
-                                root->fs_info->tree_root,
+                ret = btrfs_insert_orphan_item(trans,
-                                dest->root_key.objectid);
+                                        root->fs_info->tree_root,
-        BUG_ON(ret);
+                                        dest->root_key.objectid);
+                BUG_ON(ret);
+        }
        ret = btrfs_commit_transaction(trans, root);
        BUG_ON(ret);
@@ -1358,8 +1343,10 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
                        ret = -EPERM;
                        goto out;
                }
-                btrfs_defrag_root(root, 0);
+                ret = btrfs_defrag_root(root, 0);
-                btrfs_defrag_root(root->fs_info->extent_root, 0);
+                if (ret)
+                        goto out;
+                ret = btrfs_defrag_root(root->fs_info->extent_root, 0);
                break;
        case S_IFREG:
                if (!(file->f_mode & FMODE_WRITE)) {
@@ -1389,9 +1376,11 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
                        /* the rest are all set to zero by kzalloc */
                        range->len = (u64)-1;
                }
-                btrfs_defrag_file(file, range);
+                ret = btrfs_defrag_file(file, range);
                kfree(range);
                break;
+        default:
+                ret = -EINVAL;
        }
 out:
        mnt_drop_write(file->f_path.mnt);
@@ -1550,12 +1539,6 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
                btrfs_wait_ordered_range(src, off, off+len);
        }
-        trans = btrfs_start_transaction(root, 1);
-        BUG_ON(!trans);
-        /* punch hole in destination first */
-        btrfs_drop_extents(trans, inode, off, off + len, &hint_byte, 1);
        /* clone data */
        key.objectid = src->i_ino;
        key.type = BTRFS_EXTENT_DATA_KEY;
@@ -1566,7 +1549,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
                 * note the key will change type as we walk through the
                 * tree.
                 */
-                ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
+                ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
                if (ret < 0)
                        goto out;
@@ -1629,12 +1612,31 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
                        new_key.objectid = inode->i_ino;
                        new_key.offset = key.offset + destoff - off;
+                        trans = btrfs_start_transaction(root, 1);
+                        if (IS_ERR(trans)) {
+                                ret = PTR_ERR(trans);
+                                goto out;
+                        }
                        if (type == BTRFS_FILE_EXTENT_REG ||
                            type == BTRFS_FILE_EXTENT_PREALLOC) {
+                                if (off > key.offset) {
+                                        datao += off - key.offset;
+                                        datal -= off - key.offset;
+                                }
+                                if (key.offset + datal > off + len)
+                                        datal = off + len - key.offset;
+                                ret = btrfs_drop_extents(trans, inode,
+                                                         new_key.offset,
+                                                         new_key.offset + datal,
+                                                         &hint_byte, 1);
+                                BUG_ON(ret);
                                ret = btrfs_insert_empty_item(trans, root, path,
                                                              &new_key, size);
-                                if (ret)
+                                BUG_ON(ret);
-                                        goto out;
                                leaf = path->nodes[0];
                                slot = path->slots[0];
@@ -1645,14 +1647,6 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
                                extent = btrfs_item_ptr(leaf, slot,
                                                struct btrfs_file_extent_item);
-                                if (off > key.offset) {
-                                        datao += off - key.offset;
-                                        datal -= off - key.offset;
-                                }
-                                if (key.offset + datal > off + len)
-                                        datal = off + len - key.offset;
                                /* disko == 0 means it's a hole */
                                if (!disko)
                                        datao = 0;
@@ -1683,14 +1677,21 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
                                if (comp && (skip || trim)) {
                                        ret = -EINVAL;
+                                        btrfs_end_transaction(trans, root);
                                        goto out;
                                }
                                size -= skip + trim;
                                datal -= skip + trim;
+                                ret = btrfs_drop_extents(trans, inode,
+                                                         new_key.offset,
+                                                         new_key.offset + datal,
+                                                         &hint_byte, 1);
+                                BUG_ON(ret);
                                ret = btrfs_insert_empty_item(trans, root, path,
                                                              &new_key, size);
-                                if (ret)
+                                BUG_ON(ret);
-                                        goto out;
                                if (skip) {
                                        u32 start =
@@ -1708,8 +1709,17 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
                        }
                        btrfs_mark_buffer_dirty(leaf);
-                }
+                        btrfs_release_path(root, path);
+                        inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+                        if (new_key.offset + datal > inode->i_size)
+                                btrfs_i_size_write(inode,
+                                                   new_key.offset + datal);
+                        BTRFS_I(inode)->flags = BTRFS_I(src)->flags;
+                        ret = btrfs_update_inode(trans, root, inode);
+                        BUG_ON(ret);
+                        btrfs_end_transaction(trans, root);
+                }
 next:
                btrfs_release_path(root, path);
                key.offset++;
@@ -1717,17 +1727,7 @@ next:
        ret = 0;
 out:
        btrfs_release_path(root, path);
-        if (ret == 0) {
-                inode->i_mtime = inode->i_ctime = CURRENT_TIME;
-                if (destoff + olen > inode->i_size)
-                        btrfs_i_size_write(inode, destoff + olen);
-                BTRFS_I(inode)->flags = BTRFS_I(src)->flags;
-                ret = btrfs_update_inode(trans, root, inode);
-        }
-        btrfs_end_transaction(trans, root);
        unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
-        if (ret)
-                vmtruncate(inode, 0);
 out_unlock:
        mutex_unlock(&src->i_mutex);
        mutex_unlock(&inode->i_mutex);
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index a127c0ebb2dc..e56c72bc5add 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -124,6 +124,15 @@ static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset)
        return 1;
 }
+static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset,
+                          u64 len)
+{       /*
+         * Returns 0 when the range starting at file_offset ends at or before
+         * the entry starts, or the entry ends at or before file_offset;
+         * returns 1 otherwise.  Ranges that merely touch end-to-start are
+         * therefore not reported as overlapping.  The additions are not
+         * checked for wrap-around.
+         */
+        if (file_offset + len <= entry->file_offset ||
+            entry->file_offset + entry->len <= file_offset)
+                return 0;
+        return 1;
+}
 /*
 * find the first ordered struct that has this offset, otherwise
 * the first one less than this offset
@@ -161,8 +170,9 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
 * The tree is given a single reference on the ordered extent that was
 * inserted.
 */
-int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
+static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
-                             u64 start, u64 len, u64 disk_len, int type)
+                                      u64 start, u64 len, u64 disk_len,
+                                      int type, int dio)
 {
        struct btrfs_ordered_inode_tree *tree;
        struct rb_node *node;
@@ -182,6 +192,9 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
        if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
                set_bit(type, &entry->flags);
+        if (dio)
+                set_bit(BTRFS_ORDERED_DIRECT, &entry->flags);
        /* one ref for the tree */
        atomic_set(&entry->refs, 1);
        init_waitqueue_head(&entry->wait);
@@ -203,6 +216,20 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
        return 0;
 }
+int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
+                             u64 start, u64 len, u64 disk_len, int type)
+{       /*
+         * Forwards every argument unchanged to __btrfs_add_ordered_extent()
+         * with dio == 0, and returns its result.
+         */
+        return __btrfs_add_ordered_extent(inode, file_offset, start, len,
+                                          disk_len, type, 0);
+}
+int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
+                                 u64 start, u64 len, u64 disk_len, int type)
+{       /*
+         * Forwards every argument unchanged to __btrfs_add_ordered_extent()
+         * with dio == 1, and returns its result.
+         */
+        return __btrfs_add_ordered_extent(inode, file_offset, start, len,
+                                          disk_len, type, 1);
+}
 /*
 * Add a struct btrfs_ordered_sum into the list of checksums to be inserted
 * when an ordered extent is finished.  If the list covers more than one
@@ -311,13 +338,6 @@ static int __btrfs_remove_ordered_extent(struct inode *inode,
        tree->last = NULL;
        set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
-        spin_lock(&BTRFS_I(inode)->accounting_lock);
-        WARN_ON(!BTRFS_I(inode)->outstanding_extents);
-        BTRFS_I(inode)->outstanding_extents--;
-        spin_unlock(&BTRFS_I(inode)->accounting_lock);
-        btrfs_unreserve_metadata_for_delalloc(BTRFS_I(inode)->root,
-                                              inode, 1);
        spin_lock(&root->fs_info->ordered_extent_lock);
        list_del_init(&entry->root_extent_list);
@@ -491,7 +511,8 @@ void btrfs_start_ordered_extent(struct inode *inode,
         * start IO on any dirty ones so the wait doesn't stall waiting
         * for pdflush to find them
         */
-        filemap_fdatawrite_range(inode->i_mapping, start, end);
+        if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
+                filemap_fdatawrite_range(inode->i_mapping, start, end);
        if (wait) {
                wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
                                                 &entry->flags));
@@ -588,6 +609,47 @@ out:
        return entry;
 }
+/* Since the DIO code tries to lock a wide area we need to look for any ordered
+ * extents that exist in the range, rather than just the start of the range.
+ *
+ * Starting from the node tree_search() returns for file_offset (or, failing
+ * that, for file_offset + len), walk forward with rb_next() until an entry
+ * overlaps the range, an entry starts at or past file_offset + len, or the
+ * tree runs out.
+ *
+ * Returns the overlapping entry with entry->refs incremented, or NULL if the
+ * walk finds none.  The whole lookup runs under tree->lock.
+ * NOTE(review): the caller presumably has to drop that reference; the release
+ * helper is not visible in this hunk - confirm.
+ */
+struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
+                                                        u64 file_offset,
+                                                        u64 len)
+{
+        struct btrfs_ordered_inode_tree *tree;
+        struct rb_node *node;
+        struct btrfs_ordered_extent *entry = NULL;
+        tree = &BTRFS_I(inode)->ordered_tree;
+        spin_lock(&tree->lock);
+        node = tree_search(tree, file_offset);
+        if (!node) {    /* nothing for the start offset: retry with the end
+                         * of the range */
+                node = tree_search(tree, file_offset + len);
+                if (!node)
+                        goto out;
+        }
+        while (1) {
+                entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+                if (range_overlaps(entry, file_offset, len))
+                        break;
+                if (entry->file_offset >= file_offset + len) { /* starts at or past the
+                                                                * end of the range: give up */
+                        entry = NULL;
+                        break;
+                }
+                entry = NULL;   /* stays NULL if rb_next() ends the walk */
+                node = rb_next(node);
+                if (!node)
+                        break;
+        }
+out:
+        if (entry)      /* take the reference before the lock is dropped */
+                atomic_inc(&entry->refs);
+        spin_unlock(&tree->lock);
+        return entry;
+}
 /*
 * lookup and return any extent before 'file_offset'.  NULL is returned
 * if none is found
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index c82f76a9f040..8ac365492a3f 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -72,6 +72,8 @@ struct btrfs_ordered_sum {
 #define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */
+#define BTRFS_ORDERED_DIRECT 5 /* set when we're doing DIO with this extent */
 struct btrfs_ordered_extent {
        /* logical offset in the file */
        u64 file_offset;
@@ -140,7 +142,9 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,
                                   struct btrfs_ordered_extent **cached,
                                   u64 file_offset, u64 io_size);
 int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
-                             u64 start, u64 len, u64 disk_len, int tyep);
+                             u64 start, u64 len, u64 disk_len, int type);
+int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
+                                 u64 start, u64 len, u64 disk_len, int type);
 int btrfs_add_ordered_sum(struct inode *inode,
                          struct btrfs_ordered_extent *entry,
                          struct btrfs_ordered_sum *sum);
@@ -151,6 +155,9 @@ void btrfs_start_ordered_extent(struct inode *inode,
 int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
 struct btrfs_ordered_extent *
 btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
+struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
+                                                        u64 file_offset,
+                                                        u64 len);
 int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
                                struct btrfs_ordered_extent *ordered);
 int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index e558dd941ded..05d41e569236 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -44,8 +44,12 @@ struct tree_entry {
 struct backref_node {
        struct rb_node rb_node;
        u64 bytenr;
-        /* objectid tree block owner */
+        u64 new_bytenr;
+        /* objectid of tree block owner, can be not uptodate */
        u64 owner;
+        /* link to pending, changed or detached list */
+        struct list_head list;
        /* list of upper level blocks reference this block */
        struct list_head upper;
        /* list of child blocks in the cache */
@@ -56,9 +60,9 @@ struct backref_node {
        struct extent_buffer *eb;
        /* level of tree block */
        unsigned int level:8;
-        /* 1 if the block is root of old snapshot */
+        /* is the block in non-reference counted tree */
-        unsigned int old_root:1;
+        unsigned int cowonly:1;
-        /* 1 if no child blocks in the cache */
+        /* 1 if no child node in the cache */
        unsigned int lowest:1;
        /* is the extent buffer locked */
        unsigned int locked:1;
@@ -66,6 +70,16 @@ struct backref_node {
        unsigned int processed:1;
        /* have backrefs of this block been checked */
        unsigned int checked:1;
+        /*
+         * 1 if corresponding block has been cowed but some upper
+         * level block pointers may not point to the new location
+         */
+        unsigned int pending:1;
+        /*
+         * 1 if the backref node isn't connected to any other
+         * backref node.
+         */
+        unsigned int detached:1;
 };
 /*
@@ -74,7 +88,6 @@ struct backref_node {
 struct backref_edge {
        struct list_head list[2];
        struct backref_node *node[2];
-        u64 blockptr;
 };
 #define LOWER   0
@@ -83,9 +96,25 @@ struct backref_edge {
 struct backref_cache {
        /* red black tree of all backref nodes in the cache */
        struct rb_root rb_root;
-        /* list of backref nodes with no child block in the cache */
+        /* for passing backref nodes to btrfs_reloc_cow_block */
+        struct backref_node *path[BTRFS_MAX_LEVEL];
+        /*
+         * list of blocks that have been cowed but some block
+         * pointers in upper level blocks may not reflect the
+         * new location
+         */
        struct list_head pending[BTRFS_MAX_LEVEL];
-        spinlock_t lock;
+        /* list of backref nodes with no child node */
+        struct list_head leaves;
+        /* list of blocks that have been cowed in current transaction */
+        struct list_head changed;
+        /* list of detached backref node. */
+        struct list_head detached;
+        u64 last_trans;
+        int nr_nodes;
+        int nr_edges;
 };
 /*
@@ -113,15 +142,6 @@ struct tree_block {
        unsigned int key_ready:1;
 };
-/* inode vector */
-#define INODEVEC_SIZE 16
-struct inodevec {
-        struct list_head list;
-        struct inode *inode[INODEVEC_SIZE];
-        int nr;
-};
 #define MAX_EXTENTS 128
 struct file_extent_cluster {
@@ -138,36 +158,43 @@ struct reloc_control {
        struct btrfs_root *extent_root;
        /* inode for moving data */
        struct inode *data_inode;
-        struct btrfs_workers workers;
+        struct btrfs_block_rsv *block_rsv;
+        struct backref_cache backref_cache;
+        struct file_extent_cluster cluster;
        /* tree blocks have been processed */
        struct extent_io_tree processed_blocks;
        /* map start of tree root to corresponding reloc tree */
        struct mapping_tree reloc_root_tree;
        /* list of reloc trees */
        struct list_head reloc_roots;
+        /* size of metadata reservation for merging reloc trees */
+        u64 merging_rsv_size;
+        /* size of relocated tree nodes */
+        u64 nodes_relocated;
        u64 search_start;
        u64 extents_found;
-        u64 extents_skipped;
-        int stage;
+        int block_rsv_retries;
-        int create_reloc_root;
+        unsigned int stage:8;
+        unsigned int create_reloc_tree:1;
+        unsigned int merge_reloc_tree:1;
        unsigned int found_file_extent:1;
-        unsigned int found_old_snapshot:1;
+        unsigned int commit_transaction:1;
 };
 /* stages of data relocation */
 #define MOVE_DATA_EXTENTS       0
 #define UPDATE_DATA_PTRS        1
-/*
+static void remove_backref_node(struct backref_cache *cache,
- * merge reloc tree to corresponding fs tree in worker threads
+                                struct backref_node *node);
- */
+static void __mark_block_processed(struct reloc_control *rc,
-struct async_merge {
+                                   struct backref_node *node);
-        struct btrfs_work work;
-        struct reloc_control *rc;
-        struct btrfs_root *root;
-        struct completion *done;
-        atomic_t *num_pending;
-};
 static void mapping_tree_init(struct mapping_tree *tree)
 {
@@ -181,15 +208,80 @@ static void backref_cache_init(struct backref_cache *cache)
        cache->rb_root = RB_ROOT;
        for (i = 0; i < BTRFS_MAX_LEVEL; i++)
                INIT_LIST_HEAD(&cache->pending[i]);
-        spin_lock_init(&cache->lock);
+        INIT_LIST_HEAD(&cache->changed);
+        INIT_LIST_HEAD(&cache->detached);
+        INIT_LIST_HEAD(&cache->leaves);
+}
+/*
+ * tear down a backref cache: call remove_backref_node() on every entry
+ * of the detached and leaves lists, reset last_trans, then assert that
+ * the pending/changed/detached lists, the rb-tree and the node/edge
+ * counters are all empty.
+ */
+static void backref_cache_cleanup(struct backref_cache *cache)
+{
+        struct backref_node *victim;
+        int level = 0;
+        while (!list_empty(&cache->detached)) {
+                victim = list_entry(cache->detached.next,
+                                    struct backref_node, list);
+                remove_backref_node(cache, victim);
+        }
+        while (!list_empty(&cache->leaves)) {
+                victim = list_entry(cache->leaves.next,
+                                    struct backref_node, lower);
+                remove_backref_node(cache, victim);
+        }
+        cache->last_trans = 0;
+        while (level < BTRFS_MAX_LEVEL) {
+                BUG_ON(!list_empty(&cache->pending[level]));
+                level++;
+        }
+        BUG_ON(!list_empty(&cache->changed));
+        BUG_ON(!list_empty(&cache->detached));
+        BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root));
+        BUG_ON(cache->nr_nodes);
+        BUG_ON(cache->nr_edges);
+}
+/*
+ * allocate a zeroed backref node with its list heads and rb_node
+ * initialized, and account for it in cache->nr_nodes.  Returns NULL
+ * if the allocation fails.
+ */
+static struct backref_node *alloc_backref_node(struct backref_cache *cache)
+{
+        struct backref_node *new_node = kzalloc(sizeof(*new_node), GFP_NOFS);
+        if (!new_node)
+                return NULL;
+        INIT_LIST_HEAD(&new_node->list);
+        INIT_LIST_HEAD(&new_node->upper);
+        INIT_LIST_HEAD(&new_node->lower);
+        RB_CLEAR_NODE(&new_node->rb_node);
+        cache->nr_nodes++;
+        return new_node;
+}
+/*
+ * free a backref node and drop it from cache->nr_nodes.  a NULL node
+ * is ignored.
+ */
+static void free_backref_node(struct backref_cache *cache,
+                              struct backref_node *node)
+{
+        if (!node)
+                return;
+        cache->nr_nodes--;
+        kfree(node);
+}
+/*
+ * allocate a zeroed backref edge and account for it in cache->nr_edges.
+ * Returns NULL if the allocation fails.
+ */
+static struct backref_edge *alloc_backref_edge(struct backref_cache *cache)
+{
+        struct backref_edge *new_edge = kzalloc(sizeof(*new_edge), GFP_NOFS);
+        if (!new_edge)
+                return NULL;
+        cache->nr_edges++;
+        return new_edge;
 }
-static void backref_node_init(struct backref_node *node)
+static void free_backref_edge(struct backref_cache *cache,
+                              struct backref_edge *edge)
 {
-        memset(node, 0, sizeof(*node));
+        if (edge) {
-        INIT_LIST_HEAD(&node->upper);
+                cache->nr_edges--;
-        INIT_LIST_HEAD(&node->lower);
+                kfree(edge);
-        RB_CLEAR_NODE(&node->rb_node);
+        }
 }
 static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
@@ -250,6 +342,7 @@ static struct backref_node *walk_up_backref(struct backref_node *node,
                edges[idx++] = edge;
                node = edge->node[UPPER];
        }
+        BUG_ON(node->detached);
        *index = idx;
        return node;
 }
@@ -281,13 +374,18 @@ static struct backref_node *walk_down_backref(struct backref_edge *edges[],
        return NULL;
 }
+/*
+ * unlock the node's extent buffer if this node holds the tree lock on
+ * it, and clear the locked flag.
+ */
+static void unlock_node_buffer(struct backref_node *node)
+{
+        if (!node->locked)
+                return;
+        btrfs_tree_unlock(node->eb);
+        node->locked = 0;
+}
 static void drop_node_buffer(struct backref_node *node)
 {
        if (node->eb) {
-                if (node->locked) {
+                unlock_node_buffer(node);
-                        btrfs_tree_unlock(node->eb);
-                        node->locked = 0;
-                }
                free_extent_buffer(node->eb);
                node->eb = NULL;
        }
@@ -296,14 +394,14 @@ static void drop_node_buffer(struct backref_node *node)
 static void drop_backref_node(struct backref_cache *tree,
                              struct backref_node *node)
 {
-        BUG_ON(!node->lowest);
        BUG_ON(!list_empty(&node->upper));
        drop_node_buffer(node);
+        list_del(&node->list);
        list_del(&node->lower);
+        if (!RB_EMPTY_NODE(&node->rb_node))
-        rb_erase(&node->rb_node, &tree->rb_root);
+                rb_erase(&node->rb_node, &tree->rb_root);
-        kfree(node);
+        free_backref_node(tree, node);
 }
 /*
@@ -318,27 +416,121 @@ static void remove_backref_node(struct backref_cache *cache,
        if (!node)
                return;
-        BUG_ON(!node->lowest);
+        BUG_ON(!node->lowest && !node->detached);
        while (!list_empty(&node->upper)) {
                edge = list_entry(node->upper.next, struct backref_edge,
                                  list[LOWER]);
                upper = edge->node[UPPER];
                list_del(&edge->list[LOWER]);
                list_del(&edge->list[UPPER]);
-                kfree(edge);
+                free_backref_edge(cache, edge);
+                if (RB_EMPTY_NODE(&upper->rb_node)) {
+                        BUG_ON(!list_empty(&node->upper));
+                        drop_backref_node(cache, node);
+                        node = upper;
+                        node->lowest = 1;
+                        continue;
+                }
                /*
-                 * add the node to pending list if no other
+                 * add the node to leaf node list if no other
                 * child block cached.
                 */
                if (list_empty(&upper->lower)) {
-                        list_add_tail(&upper->lower,
+                        list_add_tail(&upper->lower, &cache->leaves);
-                                      &cache->pending[upper->level]);
                        upper->lowest = 1;
                }
        }
        drop_backref_node(cache, node);
 }
+/*
+ * re-key a cached backref node: take it out of the rb-tree, change its
+ * bytenr and insert it again.  it is a BUG if another node already
+ * occupies the new bytenr.
+ */
+static void update_backref_node(struct backref_cache *cache,
+                                struct backref_node *node, u64 bytenr)
+{
+        struct rb_node *clash;
+        rb_erase(&node->rb_node, &cache->rb_root);
+        node->bytenr = bytenr;
+        clash = tree_insert(&cache->rb_root, bytenr, &node->rb_node);
+        BUG_ON(clash);
+}
+/*
+ * update backref cache after a transaction commit
+ *
+ * If cache->last_trans is 0 the current trans->transid is recorded and
+ * 0 is returned; if last_trans already equals trans->transid nothing is
+ * done and 0 is returned.  Otherwise every detached node is removed,
+ * every node on the changed list and every pending node whose bytenr
+ * differs from new_bytenr is re-keyed to new_bytenr, last_trans is
+ * reset to 0 and 1 is returned.
+ */
+static int update_backref_cache(struct btrfs_trans_handle *trans,
+                                struct backref_cache *cache)
+{
+        struct backref_node *node;
+        int level = 0;
+        if (cache->last_trans == 0) {
+                cache->last_trans = trans->transid;
+                return 0;
+        }
+        if (cache->last_trans == trans->transid)
+                return 0;
+        /*
+         * detached nodes are used to avoid unnecessary backref
+         * lookup. transaction commit changes the extent tree.
+         * so the detached nodes are no longer useful.
+         */
+        while (!list_empty(&cache->detached)) {
+                node = list_entry(cache->detached.next,
+                                  struct backref_node, list);
+                remove_backref_node(cache, node);
+        }
+        while (!list_empty(&cache->changed)) {
+                node = list_entry(cache->changed.next,
+                                  struct backref_node, list);
+                list_del_init(&node->list); /* take it off the changed list */
+                BUG_ON(node->pending);
+                update_backref_node(cache, node, node->new_bytenr);
+        }
+        /*
+         * some nodes can be left in the pending list if there were
+         * errors during processing the pending nodes.
+         */
+        for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
+                list_for_each_entry(node, &cache->pending[level], list) {
+                        BUG_ON(!node->pending);
+                        if (node->bytenr == node->new_bytenr)
+                                continue;
+                        update_backref_node(cache, node, node->new_bytenr);
+                }
+        }
+        cache->last_trans = 0;
+        return 1;
+}
+/*
+ * decide whether a backref node for this root is useless for
+ * relocation.  Returns 1 when the root is reference counted, has a
+ * reloc tree, and that reloc tree's last snapshot is not the
+ * transaction right before the running one; returns 0 otherwise.
+ */
+static int should_ignore_root(struct btrfs_root *root)
+{
+        struct btrfs_root *reloc = root->ref_cows ? root->reloc_root : NULL;
+        u64 prev_transid;
+        if (!reloc)
+                return 0;
+        prev_transid = root->fs_info->running_transaction->transid - 1;
+        /*
+         * a reloc tree that was created in a previous transaction can
+         * be found by backref lookup, which makes the backref node for
+         * the fs tree root useless for relocation.
+         */
+        return btrfs_root_last_snapshot(&reloc->root_item) != prev_transid;
+}
 /*
 * find reloc tree by address of tree root
 */
@@ -453,11 +645,12 @@ int find_inline_backref(struct extent_buffer *leaf, int slot,
 * for all upper level blocks that directly/indirectly reference the
 * block are also cached.
 */
-static struct backref_node *build_backref_tree(struct reloc_control *rc,
+static noinline_for_stack
-                                               struct backref_cache *cache,
+struct backref_node *build_backref_tree(struct reloc_control *rc,
-                                               struct btrfs_key *node_key,
+                                        struct btrfs_key *node_key,
-                                               int level, u64 bytenr)
+                                        int level, u64 bytenr)
 {
+        struct backref_cache *cache = &rc->backref_cache;
        struct btrfs_path *path1;
        struct btrfs_path *path2;
        struct extent_buffer *eb;
@@ -473,6 +666,8 @@ static struct backref_node *build_backref_tree(struct reloc_control *rc,
        unsigned long end;
        unsigned long ptr;
        LIST_HEAD(list);
+        LIST_HEAD(useless);
+        int cowonly;
        int ret;
        int err = 0;
@@ -483,15 +678,13 @@ static struct backref_node *build_backref_tree(struct reloc_control *rc,
                goto out;
        }
-        node = kmalloc(sizeof(*node), GFP_NOFS);
+        node = alloc_backref_node(cache);
        if (!node) {
                err = -ENOMEM;
                goto out;
        }
-        backref_node_init(node);
        node->bytenr = bytenr;
-        node->owner = 0;
        node->level = level;
        node->lowest = 1;
        cur = node;
@@ -587,17 +780,20 @@ again:
 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
                if (key.type == BTRFS_SHARED_BLOCK_REF_KEY ||
                    key.type == BTRFS_EXTENT_REF_V0_KEY) {
-                        if (key.objectid == key.offset &&
+                        if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
-                            key.type == BTRFS_EXTENT_REF_V0_KEY) {
                                struct btrfs_extent_ref_v0 *ref0;
                                ref0 = btrfs_item_ptr(eb, path1->slots[0],
                                                struct btrfs_extent_ref_v0);
                                root = find_tree_root(rc, eb, ref0);
-                                if (root)
+                                if (!root->ref_cows)
-                                        cur->root = root;
+                                        cur->cowonly = 1;
-                                else
+                                if (key.objectid == key.offset) {
-                                        cur->old_root = 1;
+                                        if (root && !should_ignore_root(root))
-                                break;
+                                                cur->root = root;
+                                        else
+                                                list_add(&cur->list, &useless);
+                                        break;
+                                }
                        }
 #else
                BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
@@ -614,22 +810,20 @@ again:
                                break;
                        }
-                        edge = kzalloc(sizeof(*edge), GFP_NOFS);
+                        edge = alloc_backref_edge(cache);
                        if (!edge) {
                                err = -ENOMEM;
                                goto out;
                        }
                        rb_node = tree_search(&cache->rb_root, key.offset);
                        if (!rb_node) {
-                                upper = kmalloc(sizeof(*upper), GFP_NOFS);
+                                upper = alloc_backref_node(cache);
                                if (!upper) {
-                                        kfree(edge);
+                                        free_backref_edge(cache, edge);
                                        err = -ENOMEM;
                                        goto out;
                                }
-                                backref_node_init(upper);
                                upper->bytenr = key.offset;
-                                upper->owner = 0;
                                upper->level = cur->level + 1;
                                /*
                                 *  backrefs for the upper level block isn't
@@ -639,11 +833,12 @@ again:
                        } else {
                                upper = rb_entry(rb_node, struct backref_node,
                                                 rb_node);
+                                BUG_ON(!upper->checked);
                                INIT_LIST_HEAD(&edge->list[UPPER]);
                        }
-                        list_add(&edge->list[LOWER], &cur->upper);
+                        list_add_tail(&edge->list[LOWER], &cur->upper);
-                        edge->node[UPPER] = upper;
                        edge->node[LOWER] = cur;
+                        edge->node[UPPER] = upper;
                        goto next;
                } else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) {
@@ -657,11 +852,17 @@ again:
                        goto out;
                }
+                if (!root->ref_cows)
+                        cur->cowonly = 1;
                if (btrfs_root_level(&root->root_item) == cur->level) {
                        /* tree root */
                        BUG_ON(btrfs_root_bytenr(&root->root_item) !=
                               cur->bytenr);
-                        cur->root = root;
+                        if (should_ignore_root(root))
+                                list_add(&cur->list, &useless);
+                        else
+                                cur->root = root;
                        break;
                }
@@ -692,11 +893,14 @@ again:
                        if (!path2->nodes[level]) {
                                BUG_ON(btrfs_root_bytenr(&root->root_item) !=
                                       lower->bytenr);
-                                lower->root = root;
+                                if (should_ignore_root(root))
+                                        list_add(&lower->list, &useless);
+                                else
+                                        lower->root = root;
                                break;
                        }
-                        edge = kzalloc(sizeof(*edge), GFP_NOFS);
+                        edge = alloc_backref_edge(cache);
                        if (!edge) {
                                err = -ENOMEM;
                                goto out;
@@ -705,16 +909,17 @@ again:
                        eb = path2->nodes[level];
                        rb_node = tree_search(&cache->rb_root, eb->start);
                        if (!rb_node) {
-                                upper = kmalloc(sizeof(*upper), GFP_NOFS);
+                                upper = alloc_backref_node(cache);
                                if (!upper) {
-                                        kfree(edge);
+                                        free_backref_edge(cache, edge);
                                        err = -ENOMEM;
                                        goto out;
                                }
-                                backref_node_init(upper);
                                upper->bytenr = eb->start;
                                upper->owner = btrfs_header_owner(eb);
                                upper->level = lower->level + 1;
+                                if (!root->ref_cows)
+                                        upper->cowonly = 1;
                                /*
                                 * if we know the block isn't shared
@@ -744,10 +949,12 @@ again:
                                                 rb_node);
                                BUG_ON(!upper->checked);
                                INIT_LIST_HEAD(&edge->list[UPPER]);
+                                if (!upper->owner)
+                                        upper->owner = btrfs_header_owner(eb);
                        }
                        list_add_tail(&edge->list[LOWER], &lower->upper);
-                        edge->node[UPPER] = upper;
                        edge->node[LOWER] = lower;
+                        edge->node[UPPER] = upper;
                        if (rb_node)
                                break;
@@ -785,8 +992,13 @@ next:
         * into the cache.
         */
        BUG_ON(!node->checked);
-        rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node);
+        cowonly = node->cowonly;
-        BUG_ON(rb_node);
+        if (!cowonly) {
+                rb_node = tree_insert(&cache->rb_root, node->bytenr,
+                                      &node->rb_node);
+                BUG_ON(rb_node);
+                list_add_tail(&node->lower, &cache->leaves);
+        }
        list_for_each_entry(edge, &node->upper, list[LOWER])
                list_add_tail(&edge->list[UPPER], &list);
@@ -795,6 +1007,14 @@ next:
                edge = list_entry(list.next, struct backref_edge, list[UPPER]);
                list_del_init(&edge->list[UPPER]);
                upper = edge->node[UPPER];
+                if (upper->detached) {
+                        list_del(&edge->list[LOWER]);
+                        lower = edge->node[LOWER];
+                        free_backref_edge(cache, edge);
+                        if (list_empty(&lower->upper))
+                                list_add(&lower->list, &useless);
+                        continue;
+                }
                if (!RB_EMPTY_NODE(&upper->rb_node)) {
                        if (upper->lowest) {
@@ -807,25 +1027,69 @@ next:
                }
                BUG_ON(!upper->checked);
-                rb_node = tree_insert(&cache->rb_root, upper->bytenr,
+                BUG_ON(cowonly != upper->cowonly);
-                                      &upper->rb_node);
+                if (!cowonly) {
-                BUG_ON(rb_node);
+                        rb_node = tree_insert(&cache->rb_root, upper->bytenr,
+                                              &upper->rb_node);
+                        BUG_ON(rb_node);
+                }
                list_add_tail(&edge->list[UPPER], &upper->lower);
                list_for_each_entry(edge, &upper->upper, list[LOWER])
                        list_add_tail(&edge->list[UPPER], &list);
        }
+        /*
+         * process useless backref nodes. backref nodes for tree leaves
+         * are deleted from the cache. backref nodes for upper level
+         * tree blocks are left in the cache to avoid unnecessary backref
+         * lookup.
+         */
+        while (!list_empty(&useless)) {
+                upper = list_entry(useless.next, struct backref_node, list);
+                list_del_init(&upper->list);
+                BUG_ON(!list_empty(&upper->upper));
+                if (upper == node)
+                        node = NULL;
+                if (upper->lowest) {
+                        list_del_init(&upper->lower);
+                        upper->lowest = 0;
+                }
+                while (!list_empty(&upper->lower)) {
+                        edge = list_entry(upper->lower.next,
+                                          struct backref_edge, list[UPPER]);
+                        list_del(&edge->list[UPPER]);
+                        list_del(&edge->list[LOWER]);
+                        lower = edge->node[LOWER];
+                        free_backref_edge(cache, edge);
+                        if (list_empty(&lower->upper))
+                                list_add(&lower->list, &useless);
+                }
+                __mark_block_processed(rc, upper);
+                if (upper->level > 0) {
+                        list_add(&upper->list, &cache->detached);
+                        upper->detached = 1;
+                } else {
+                        rb_erase(&upper->rb_node, &cache->rb_root);
+                        free_backref_node(cache, upper);
+                }
+        }
 out:
        btrfs_free_path(path1);
        btrfs_free_path(path2);
        if (err) {
-                INIT_LIST_HEAD(&list);
+                while (!list_empty(&useless)) {
+                        lower = list_entry(useless.next,
+                                           struct backref_node, upper);
+                        list_del_init(&lower->upper);
+                }
                upper = node;
+                INIT_LIST_HEAD(&list);
                while (upper) {
                        if (RB_EMPTY_NODE(&upper->rb_node)) {
                                list_splice_tail(&upper->upper, &list);
-                                kfree(upper);
+                                free_backref_node(cache, upper);
                        }
                        if (list_empty(&list))
@@ -833,15 +1097,104 @@ out:
                        edge = list_entry(list.next, struct backref_edge,
                                          list[LOWER]);
+                        list_del(&edge->list[LOWER]);
                        upper = edge->node[UPPER];
-                        kfree(edge);
+                        free_backref_edge(cache, edge);
                }
                return ERR_PTR(err);
        }
+        BUG_ON(node && node->detached);
        return node;
 }
 /*
+ * helper to add backref node for the newly created snapshot.
+ * the backref node is created by cloning backref node that
+ * corresponds to root of source tree
+ *
+ * The source node is looked up in the cache by the start of src's
+ * commit root (ignored if it is detached), falling back to the start
+ * of the commit root of src's reloc tree.  If neither is cached there
+ * is nothing to clone and 0 is returned.  Returns 0 on success and
+ * -ENOMEM if the clone or one of its edges cannot be allocated, in
+ * which case everything allocated here is freed again.
+ */
+static int clone_backref_node(struct btrfs_trans_handle *trans,
+                              struct reloc_control *rc,
+                              struct btrfs_root *src,
+                              struct btrfs_root *dest)
+{
+        struct btrfs_root *reloc_root = src->reloc_root;
+        struct backref_cache *cache = &rc->backref_cache;
+        struct backref_node *node = NULL;
+        struct backref_node *new_node;
+        struct backref_edge *edge;
+        struct backref_edge *new_edge;
+        struct rb_node *rb_node;
+        if (cache->last_trans > 0)
+                update_backref_cache(trans, cache);
+        rb_node = tree_search(&cache->rb_root, src->commit_root->start);
+        if (rb_node) {
+                node = rb_entry(rb_node, struct backref_node, rb_node);
+                if (node->detached)
+                        node = NULL;
+                else
+                        BUG_ON(node->new_bytenr != reloc_root->node->start);
+        }
+        if (!node) {
+                rb_node = tree_search(&cache->rb_root,
+                                      reloc_root->commit_root->start);
+                if (rb_node) {
+                        node = rb_entry(rb_node, struct backref_node,
+                                        rb_node);
+                        BUG_ON(node->detached);
+                }
+        }
+        if (!node)
+                return 0;
+        new_node = alloc_backref_node(cache);
+        if (!new_node)
+                return -ENOMEM;
+        new_node->bytenr = dest->node->start;
+        new_node->level = node->level;
+        new_node->lowest = node->lowest;
+        /*
+         * the clone goes straight into the rb-tree, and
+         * build_backref_tree does BUG_ON(!upper->checked) on every
+         * node it finds there, so it must be marked as checked.
+         */
+        new_node->checked = 1;
+        new_node->root = dest;
+        if (!node->lowest) {
+                list_for_each_entry(edge, &node->lower, list[UPPER]) {
+                        new_edge = alloc_backref_edge(cache);
+                        if (!new_edge)
+                                goto fail;
+                        new_edge->node[UPPER] = new_node;
+                        new_edge->node[LOWER] = edge->node[LOWER];
+                        list_add_tail(&new_edge->list[UPPER],
+                                      &new_node->lower);
+                }
+        } else {
+                /*
+                 * a clone without children has to sit on the leaves
+                 * list like any other lowest node, otherwise
+                 * backref_cache_cleanup can never reach and free it.
+                 */
+                list_add_tail(&new_node->lower, &cache->leaves);
+        }
+        rb_node = tree_insert(&cache->rb_root, new_node->bytenr,
+                              &new_node->rb_node);
+        BUG_ON(rb_node);
+        /*
+         * only hook the new edges into the lower nodes once the clone
+         * is in the rb-tree, so the failure path above never has to
+         * touch nodes that are already cached.
+         */
+        if (!new_node->lowest) {
+                list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) {
+                        list_add_tail(&new_edge->list[LOWER],
+                                      &new_edge->node[LOWER]->upper);
+                }
+        }
+        return 0;
+fail:
+        while (!list_empty(&new_node->lower)) {
+                new_edge = list_entry(new_node->lower.next,
+                                      struct backref_edge, list[UPPER]);
+                list_del(&new_edge->list[UPPER]);
+                free_backref_edge(cache, new_edge);
+        }
+        free_backref_node(cache, new_node);
+        return -ENOMEM;
+}
+/*
 * helper to add 'address of tree root -> reloc tree' mapping
 */
 static int __add_reloc_root(struct btrfs_root *root)
@@ -901,12 +1254,8 @@ static int __update_reloc_root(struct btrfs_root *root, int del)
        return 0;
 }
-/*
+static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
- * create reloc tree for a given fs tree. reloc tree is just a
+                                        struct btrfs_root *root, u64 objectid)
- * snapshot of the fs tree with special root objectid.
- */
-int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
-                          struct btrfs_root *root)
 {
        struct btrfs_root *reloc_root;
        struct extent_buffer *eb;
@@ -914,36 +1263,45 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
        struct btrfs_key root_key;
        int ret;
-        if (root->reloc_root) {
-                reloc_root = root->reloc_root;
-                reloc_root->last_trans = trans->transid;
-                return 0;
-        }
-        if (!root->fs_info->reloc_ctl ||
-            !root->fs_info->reloc_ctl->create_reloc_root ||
-            root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
-                return 0;
        root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
        BUG_ON(!root_item);
        root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
        root_key.type = BTRFS_ROOT_ITEM_KEY;
-        root_key.offset = root->root_key.objectid;
+        root_key.offset = objectid;
-        ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
+        if (root->root_key.objectid == objectid) {
-                              BTRFS_TREE_RELOC_OBJECTID);
+                /* called by btrfs_init_reloc_root */
-        BUG_ON(ret);
+                ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
+                                      BTRFS_TREE_RELOC_OBJECTID);
+                BUG_ON(ret);
+                btrfs_set_root_last_snapshot(&root->root_item,
+                                             trans->transid - 1);
+        } else {
+                /*
+                 * called by btrfs_reloc_post_snapshot_hook.
+                 * the source tree is a reloc tree, all tree blocks
+                 * modified after it was created have RELOC flag
+                 * set in their headers. so it's OK to not update
+                 * the 'last_snapshot'.
+                 */
+                ret = btrfs_copy_root(trans, root, root->node, &eb,
+                                      BTRFS_TREE_RELOC_OBJECTID);
+                BUG_ON(ret);
+        }
-        btrfs_set_root_last_snapshot(&root->root_item, trans->transid - 1);
        memcpy(root_item, &root->root_item, sizeof(*root_item));
-        btrfs_set_root_refs(root_item, 1);
        btrfs_set_root_bytenr(root_item, eb->start);
        btrfs_set_root_level(root_item, btrfs_header_level(eb));
        btrfs_set_root_generation(root_item, trans->transid);
-        memset(&root_item->drop_progress, 0, sizeof(struct btrfs_disk_key));
-        root_item->drop_level = 0;
+        if (root->root_key.objectid == objectid) {
+                btrfs_set_root_refs(root_item, 0);
+                memset(&root_item->drop_progress, 0,
+                       sizeof(struct btrfs_disk_key));
+                root_item->drop_level = 0;
+        }
        btrfs_tree_unlock(eb);
        free_extent_buffer(eb);
@@ -957,6 +1315,37 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
                                                 &root_key);
        BUG_ON(IS_ERR(reloc_root));
        reloc_root->last_trans = trans->transid;
+        return reloc_root;
+}
+/*
+ * create reloc tree for a given fs tree. reloc tree is just a
+ * snapshot of the fs tree with special root objectid.
+ */
+int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
+                          struct btrfs_root *root)
+{
+        struct btrfs_root *reloc_root;
+        struct reloc_control *rc = root->fs_info->reloc_ctl;
+        int clear_rsv = 0;
+        if (root->reloc_root) {
+                reloc_root = root->reloc_root;
+                reloc_root->last_trans = trans->transid;
+                return 0;
+        }
+        if (!rc || !rc->create_reloc_tree ||
+            root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
+                return 0;
+        if (!trans->block_rsv) {
+                trans->block_rsv = rc->block_rsv;
+                clear_rsv = 1;
+        }
+        reloc_root = create_reloc_root(trans, root, root->root_key.objectid);
+        if (clear_rsv)
+                trans->block_rsv = NULL;
        __add_reloc_root(reloc_root);
        root->reloc_root = reloc_root;
@@ -980,7 +1369,8 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
        reloc_root = root->reloc_root;
        root_item = &reloc_root->root_item;
-        if (btrfs_root_refs(root_item) == 0) {
+        if (root->fs_info->reloc_ctl->merge_reloc_tree &&
+            btrfs_root_refs(root_item) == 0) {
                root->reloc_root = NULL;
                del = 1;
        }
@@ -1102,8 +1492,7 @@ static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
                goto out;
        }
-        if (new_bytenr)
+        *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
-                *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
        ret = 0;
 out:
        btrfs_free_path(path);
@@ -1114,19 +1503,18 @@ out:
 * update file extent items in the tree leaf to point to
 * the new locations.
 */
-static int replace_file_extents(struct btrfs_trans_handle *trans,
+static noinline_for_stack
-                                struct reloc_control *rc,
+int replace_file_extents(struct btrfs_trans_handle *trans,
-                                struct btrfs_root *root,
+                         struct reloc_control *rc,
-                                struct extent_buffer *leaf,
+                         struct btrfs_root *root,
-                                struct list_head *inode_list)
+                         struct extent_buffer *leaf)
 {
        struct btrfs_key key;
        struct btrfs_file_extent_item *fi;
        struct inode *inode = NULL;
-        struct inodevec *ivec = NULL;
        u64 parent;
        u64 bytenr;
-        u64 new_bytenr;
+        u64 new_bytenr = 0;
        u64 num_bytes;
        u64 end;
        u32 nritems;
@@ -1166,21 +1554,12 @@ static int replace_file_extents(struct btrfs_trans_handle *trans,
                 * to complete and drop the extent cache
                 */
                if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
-                        if (!ivec || ivec->nr == INODEVEC_SIZE) {
-                                ivec = kmalloc(sizeof(*ivec), GFP_NOFS);
-                                BUG_ON(!ivec);
-                                ivec->nr = 0;
-                                list_add_tail(&ivec->list, inode_list);
-                        }
                        if (first) {
                                inode = find_next_inode(root, key.objectid);
-                                if (inode)
-                                        ivec->inode[ivec->nr++] = inode;
                                first = 0;
                        } else if (inode && inode->i_ino < key.objectid) {
+                                btrfs_add_delayed_iput(inode);
                                inode = find_next_inode(root, key.objectid);
-                                if (inode)
-                                        ivec->inode[ivec->nr++] = inode;
                        }
                        if (inode && inode->i_ino == key.objectid) {
                                end = key.offset +
@@ -1204,8 +1583,10 @@ static int replace_file_extents(struct btrfs_trans_handle *trans,
                ret = get_new_location(rc->data_inode, &new_bytenr,
                                       bytenr, num_bytes);
-                if (ret > 0)
+                if (ret > 0) {
+                        WARN_ON(1);
                        continue;
+                }
                BUG_ON(ret < 0);
                btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr);
@@ -1225,6 +1606,8 @@ static int replace_file_extents(struct btrfs_trans_handle *trans,
        }
        if (dirty)
                btrfs_mark_buffer_dirty(leaf);
+        if (inode)
+                btrfs_add_delayed_iput(inode);
        return 0;
 }
@@ -1248,11 +1631,11 @@ int memcmp_node_keys(struct extent_buffer *eb, int slot,
 * if no block got replaced, 0 is returned. if there are other
 * errors, a negative error number is returned.
 */
-static int replace_path(struct btrfs_trans_handle *trans,
+static noinline_for_stack
-                        struct btrfs_root *dest, struct btrfs_root *src,
+int replace_path(struct btrfs_trans_handle *trans,
-                        struct btrfs_path *path, struct btrfs_key *next_key,
+                 struct btrfs_root *dest, struct btrfs_root *src,
-                        struct extent_buffer **leaf,
+                 struct btrfs_path *path, struct btrfs_key *next_key,
-                        int lowest_level, int max_level)
+                 int lowest_level, int max_level)
 {
        struct extent_buffer *eb;
        struct extent_buffer *parent;
@@ -1263,16 +1646,16 @@ static int replace_path(struct btrfs_trans_handle *trans,
        u64 new_ptr_gen;
        u64 last_snapshot;
        u32 blocksize;
+        int cow = 0;
        int level;
        int ret;
        int slot;
        BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
        BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);
-        BUG_ON(lowest_level > 1 && leaf);
        last_snapshot = btrfs_root_last_snapshot(&src->root_item);
+again:
        slot = path->slots[lowest_level];
        btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);
@@ -1286,8 +1669,10 @@ static int replace_path(struct btrfs_trans_handle *trans,
                return 0;
        }
-        ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb);
+        if (cow) {
-        BUG_ON(ret);
+                ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb);
+                BUG_ON(ret);
+        }
        btrfs_set_lock_blocking(eb);
        if (next_key) {
@@ -1331,7 +1716,7 @@ static int replace_path(struct btrfs_trans_handle *trans,
                if (new_bytenr == 0 || old_ptr_gen > last_snapshot ||
                    memcmp_node_keys(parent, slot, path, level)) {
-                        if (level <= lowest_level && !leaf) {
+                        if (level <= lowest_level) {
                                ret = 0;
                                break;
                        }
@@ -1339,16 +1724,12 @@ static int replace_path(struct btrfs_trans_handle *trans,
                        eb = read_tree_block(dest, old_bytenr, blocksize,
                                             old_ptr_gen);
                        btrfs_tree_lock(eb);
-                        ret = btrfs_cow_block(trans, dest, eb, parent,
+                        if (cow) {
-                                              slot, &eb);
+                                ret = btrfs_cow_block(trans, dest, eb, parent,
-                        BUG_ON(ret);
+                                                      slot, &eb);
-                        btrfs_set_lock_blocking(eb);
+                                BUG_ON(ret);
-                        if (level <= lowest_level) {
-                                *leaf = eb;
-                                ret = 0;
-                                break;
                        }
+                        btrfs_set_lock_blocking(eb);
                        btrfs_tree_unlock(parent);
                        free_extent_buffer(parent);
@@ -1357,6 +1738,13 @@ static int replace_path(struct btrfs_trans_handle *trans,
                        continue;
                }
+                if (!cow) {
+                        btrfs_tree_unlock(parent);
+                        free_extent_buffer(parent);
+                        cow = 1;
+                        goto again;
+                }
                btrfs_node_key_to_cpu(path->nodes[level], &key,
                                      path->slots[level]);
                btrfs_release_path(src, path);
@@ -1562,20 +1950,6 @@ static int invalidate_extent_cache(struct btrfs_root *root,
        return 0;
 }
-static void put_inodes(struct list_head *list)
-{
-        struct inodevec *ivec;
-        while (!list_empty(list)) {
-                ivec = list_entry(list->next, struct inodevec, list);
-                list_del(&ivec->list);
-                while (ivec->nr > 0) {
-                        ivec->nr--;
-                        iput(ivec->inode[ivec->nr]);
-                }
-                kfree(ivec);
-        }
-}
 static int find_next_key(struct btrfs_path *path, int level,
                         struct btrfs_key *key)
@@ -1608,13 +1982,14 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
        struct btrfs_root *reloc_root;
        struct btrfs_root_item *root_item;
        struct btrfs_path *path;
-        struct extent_buffer *leaf = NULL;
+        struct extent_buffer *leaf;
        unsigned long nr;
        int level;
        int max_level;
        int replaced = 0;
        int ret;
        int err = 0;
+        u32 min_reserved;
        path = btrfs_alloc_path();
        if (!path)
@@ -1648,34 +2023,23 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
                btrfs_unlock_up_safe(path, 0);
        }
-        if (level == 0 && rc->stage == UPDATE_DATA_PTRS) {
+        min_reserved = root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
-                trans = btrfs_start_transaction(root, 1);
+        memset(&next_key, 0, sizeof(next_key));
-                leaf = path->nodes[0];
+        while (1) {
-                btrfs_item_key_to_cpu(leaf, &key, 0);
+                trans = btrfs_start_transaction(root, 0);
-                btrfs_release_path(reloc_root, path);
+                trans->block_rsv = rc->block_rsv;
-                ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+                ret = btrfs_block_rsv_check(trans, root, rc->block_rsv,
-                if (ret < 0) {
+                                            min_reserved, 0);
-                        err = ret;
+                if (ret) {
-                        goto out;
+                        BUG_ON(ret != -EAGAIN);
+                        ret = btrfs_commit_transaction(trans, root);
+                        BUG_ON(ret);
+                        continue;
                }
-                leaf = path->nodes[0];
-                btrfs_unlock_up_safe(path, 1);
-                ret = replace_file_extents(trans, rc, root, leaf,
-                                           &inode_list);
-                if (ret < 0)
-                        err = ret;
-                goto out;
-        }
-        memset(&next_key, 0, sizeof(next_key));
-        while (1) {
-                leaf = NULL;
                replaced = 0;
-                trans = btrfs_start_transaction(root, 1);
                max_level = level;
                ret = walk_down_reloc_tree(reloc_root, path, &level);
@@ -1689,14 +2053,9 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
                if (!find_next_key(path, level, &key) &&
                    btrfs_comp_cpu_keys(&next_key, &key) >= 0) {
                        ret = 0;
-                } else if (level == 1 && rc->stage == UPDATE_DATA_PTRS) {
-                        ret = replace_path(trans, root, reloc_root,
-                                           path, &next_key, &leaf,
-                                           level, max_level);
                } else {
-                        ret = replace_path(trans, root, reloc_root,
+                        ret = replace_path(trans, root, reloc_root, path,
-                                           path, &next_key, NULL,
+                                           &next_key, level, max_level);
-                                           level, max_level);
                }
                if (ret < 0) {
                        err = ret;
@@ -1708,16 +2067,6 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
                        btrfs_node_key_to_cpu(path->nodes[level], &key,
                                              path->slots[level]);
                        replaced = 1;
-                } else if (leaf) {
-                        /*
-                         * no block got replaced, try replacing file extents
-                         */
-                        btrfs_item_key_to_cpu(leaf, &key, 0);
-                        ret = replace_file_extents(trans, rc, root, leaf,
-                                                   &inode_list);
-                        btrfs_tree_unlock(leaf);
-                        free_extent_buffer(leaf);
-                        BUG_ON(ret < 0);
                }
                ret = walk_up_reloc_tree(reloc_root, path, &level);
@@ -1734,15 +2083,10 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
                root_item->drop_level = level;
                nr = trans->blocks_used;
-                btrfs_end_transaction(trans, root);
+                btrfs_end_transaction_throttle(trans, root);
                btrfs_btree_balance_dirty(root, nr);
-                /*
-                 * put inodes outside transaction, otherwise we may deadlock.
-                 */
-                put_inodes(&inode_list);
                if (replaced && rc->stage == UPDATE_DATA_PTRS)
                        invalidate_extent_cache(root, &key, &next_key);
        }
@@ -1765,87 +2109,125 @@ out:
                       sizeof(root_item->drop_progress));
                root_item->drop_level = 0;
                btrfs_set_root_refs(root_item, 0);
+                btrfs_update_reloc_root(trans, root);
        }
        nr = trans->blocks_used;
-        btrfs_end_transaction(trans, root);
+        btrfs_end_transaction_throttle(trans, root);
        btrfs_btree_balance_dirty(root, nr);
-        put_inodes(&inode_list);
        if (replaced && rc->stage == UPDATE_DATA_PTRS)
                invalidate_extent_cache(root, &key, &next_key);
        return err;
 }
-/*
+static noinline_for_stack
- * callback for the work threads.
+int prepare_to_merge(struct reloc_control *rc, int err)
- * this function merges reloc tree with corresponding fs tree,
- * and then drops the reloc tree.
- */
-static void merge_func(struct btrfs_work *work)
 {
-        struct btrfs_trans_handle *trans;
+        struct btrfs_root *root = rc->extent_root;
-        struct btrfs_root *root;
        struct btrfs_root *reloc_root;
-        struct async_merge *async;
+        struct btrfs_trans_handle *trans;
+        LIST_HEAD(reloc_roots);
+        u64 num_bytes = 0;
+        int ret;
+        int retries = 0;
+        mutex_lock(&root->fs_info->trans_mutex);
+        rc->merging_rsv_size += root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
+        rc->merging_rsv_size += rc->nodes_relocated * 2;
+        mutex_unlock(&root->fs_info->trans_mutex);
+again:
+        if (!err) {
+                num_bytes = rc->merging_rsv_size;
+                ret = btrfs_block_rsv_add(NULL, root, rc->block_rsv,
+                                          num_bytes, &retries);
+                if (ret)
+                        err = ret;
+        }
+        trans = btrfs_join_transaction(rc->extent_root, 1);
+        if (!err) {
+                if (num_bytes != rc->merging_rsv_size) {
+                        btrfs_end_transaction(trans, rc->extent_root);
+                        btrfs_block_rsv_release(rc->extent_root,
+                                                rc->block_rsv, num_bytes);
+                        retries = 0;
+                        goto again;
+                }
+        }
-        async = container_of(work, struct async_merge, work);
+        rc->merge_reloc_tree = 1;
-        reloc_root = async->root;
+        while (!list_empty(&rc->reloc_roots)) {
+                reloc_root = list_entry(rc->reloc_roots.next,
+                                        struct btrfs_root, root_list);
+                list_del_init(&reloc_root->root_list);
-        if (btrfs_root_refs(&reloc_root->root_item) > 0) {
                root = read_fs_root(reloc_root->fs_info,
                                    reloc_root->root_key.offset);
                BUG_ON(IS_ERR(root));
                BUG_ON(root->reloc_root != reloc_root);
-                merge_reloc_root(async->rc, root);
+                /*
+                 * set reference count to 1, so btrfs_recover_relocation
-                trans = btrfs_start_transaction(root, 1);
+                 * knows it should resume merging
+                 */
+                if (!err)
+                        btrfs_set_root_refs(&reloc_root->root_item, 1);
                btrfs_update_reloc_root(trans, root);
-                btrfs_end_transaction(trans, root);
-        }
-        btrfs_drop_snapshot(reloc_root, 0);
+                list_add(&reloc_root->root_list, &reloc_roots);
+        }
-        if (atomic_dec_and_test(async->num_pending))
+        list_splice(&reloc_roots, &rc->reloc_roots);
-                complete(async->done);
-        kfree(async);
+        if (!err)
+                btrfs_commit_transaction(trans, rc->extent_root);
+        else
+                btrfs_end_transaction(trans, rc->extent_root);
+        return err;
 }
-static int merge_reloc_roots(struct reloc_control *rc)
+static noinline_for_stack
+int merge_reloc_roots(struct reloc_control *rc)
 {
-        struct async_merge *async;
        struct btrfs_root *root;
-        struct completion done;
+        struct btrfs_root *reloc_root;
-        atomic_t num_pending;
+        LIST_HEAD(reloc_roots);
+        int found = 0;
+        int ret;
+again:
+        root = rc->extent_root;
+        mutex_lock(&root->fs_info->trans_mutex);
+        list_splice_init(&rc->reloc_roots, &reloc_roots);
+        mutex_unlock(&root->fs_info->trans_mutex);
-        init_completion(&done);
+        while (!list_empty(&reloc_roots)) {
-        atomic_set(&num_pending, 1);
+                found = 1;
+                reloc_root = list_entry(reloc_roots.next,
+                                        struct btrfs_root, root_list);
-        while (!list_empty(&rc->reloc_roots)) {
+                if (btrfs_root_refs(&reloc_root->root_item) > 0) {
-                root = list_entry(rc->reloc_roots.next,
+                        root = read_fs_root(reloc_root->fs_info,
-                                  struct btrfs_root, root_list);
+                                            reloc_root->root_key.offset);
-                list_del_init(&root->root_list);
+                        BUG_ON(IS_ERR(root));
+                        BUG_ON(root->reloc_root != reloc_root);
-                async = kmalloc(sizeof(*async), GFP_NOFS);
+                        ret = merge_reloc_root(rc, root);
-                BUG_ON(!async);
+                        BUG_ON(ret);
-                async->work.func = merge_func;
+                } else {
-                async->work.flags = 0;
+                        list_del_init(&reloc_root->root_list);
-                async->rc = rc;
+                }
-                async->root = root;
+                btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0);
-                async->done = &done;
-                async->num_pending = &num_pending;
-                atomic_inc(&num_pending);
-                btrfs_queue_worker(&rc->workers, &async->work);
        }
-        if (!atomic_dec_and_test(&num_pending))
+        if (found) {
-                wait_for_completion(&done);
+                found = 0;
+                goto again;
+        }
        BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
        return 0;
 }
@@ -1876,119 +2258,169 @@ static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
        return btrfs_record_root_in_trans(trans, root);
 }
-/*
+static noinline_for_stack
- * select one tree from trees that references the block.
+struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
- * for blocks in refernce counted trees, we preper reloc tree.
+                                     struct reloc_control *rc,
- * if no reloc tree found and reloc_only is true, NULL is returned.
+                                     struct backref_node *node,
- */
+                                     struct backref_edge *edges[], int *nr)
-static struct btrfs_root *__select_one_root(struct btrfs_trans_handle *trans,
-                                            struct backref_node *node,
-                                            struct backref_edge *edges[],
-                                            int *nr, int reloc_only)
 {
        struct backref_node *next;
        struct btrfs_root *root;
-        int index;
+        int index = 0;
-        int loop = 0;
-again:
-        index = 0;
        next = node;
        while (1) {
                cond_resched();
                next = walk_up_backref(next, edges, &index);
                root = next->root;
-                if (!root) {
+                BUG_ON(!root);
-                        BUG_ON(!node->old_root);
+                BUG_ON(!root->ref_cows);
-                        goto skip;
-                }
-                /* no other choice for non-refernce counted tree */
-                if (!root->ref_cows) {
-                        BUG_ON(reloc_only);
-                        break;
-                }
                if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
                        record_reloc_root_in_trans(trans, root);
                        break;
                }
-                if (loop) {
+                btrfs_record_root_in_trans(trans, root);
-                        btrfs_record_root_in_trans(trans, root);
+                root = root->reloc_root;
+                if (next->new_bytenr != root->node->start) {
+                        BUG_ON(next->new_bytenr);
+                        BUG_ON(!list_empty(&next->list));
+                        next->new_bytenr = root->node->start;
+                        next->root = root;
+                        list_add_tail(&next->list,
+                                      &rc->backref_cache.changed);
+                        __mark_block_processed(rc, next);
                        break;
                }
-                if (reloc_only || next != node) {
+                WARN_ON(1);
-                        if (!root->reloc_root)
-                                btrfs_record_root_in_trans(trans, root);
-                        root = root->reloc_root;
-                        /*
-                         * if the reloc tree was created in current
-                         * transation, there is no node in backref tree
-                         * corresponds to the root of the reloc tree.
-                         */
-                        if (btrfs_root_last_snapshot(&root->root_item) ==
-                            trans->transid - 1)
-                                break;
-                }
-skip:
                root = NULL;
                next = walk_down_backref(edges, &index);
                if (!next || next->level <= node->level)
                        break;
        }
+        if (!root)
+                return NULL;
-        if (!root && !loop && !reloc_only) {
+        *nr = index;
-                loop = 1;
+        next = node;
-                goto again;
+        /* set up backref node path for btrfs_reloc_cow_block */
+        while (1) {
+                rc->backref_cache.path[next->level] = next;
+                if (--index < 0)
+                        break;
+                next = edges[index]->node[UPPER];
        }
-        if (root)
-                *nr = index;
-        else
-                *nr = 0;
        return root;
 }
+/*
+ * select a tree root for relocation. return NULL if the block
+ * is reference counted. we should use do_relocation() in this
+ * case. return a tree root pointer if the block isn't reference
+ * counted. return ERR_PTR(-ENOENT) if the block is root of reloc tree.
+ */
 static noinline_for_stack
 struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans,
                                   struct backref_node *node)
 {
+        struct backref_node *next;
+        struct btrfs_root *root;
+        struct btrfs_root *fs_root = NULL;
        struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
-        int nr;
+        int index = 0;
-        return __select_one_root(trans, node, edges, &nr, 0);
+        next = node;
+        while (1) {
+                cond_resched();
+                next = walk_up_backref(next, edges, &index);
+                root = next->root;
+                BUG_ON(!root);
+                /* no other choice for non-reference counted tree */
+                if (!root->ref_cows)
+                        return root;
+                if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID)
+                        fs_root = root;
+                if (next != node)
+                        return NULL;
+                next = walk_down_backref(edges, &index);
+                if (!next || next->level <= node->level)
+                        break;
+        }
+        if (!fs_root)
+                return ERR_PTR(-ENOENT);
+        return fs_root;
 }
 static noinline_for_stack
-struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
+u64 calcu_metadata_size(struct reloc_control *rc,
-                                     struct backref_node *node,
+                        struct backref_node *node, int reserve)
-                                     struct backref_edge *edges[], int *nr)
 {
-        return __select_one_root(trans, node, edges, nr, 1);
+        struct backref_node *next = node;
+        struct backref_edge *edge;
+        struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
+        u64 num_bytes = 0;
+        int index = 0;
+        BUG_ON(reserve && node->processed);
+        while (next) {
+                cond_resched();
+                while (1) {
+                        if (next->processed && (reserve || next != node))
+                                break;
+                        num_bytes += btrfs_level_size(rc->extent_root,
+                                                      next->level);
+                        if (list_empty(&next->upper))
+                                break;
+                        edge = list_entry(next->upper.next,
+                                          struct backref_edge, list[LOWER]);
+                        edges[index++] = edge;
+                        next = edge->node[UPPER];
+                }
+                next = walk_down_backref(edges, &index);
+        }
+        return num_bytes;
 }
-static void grab_path_buffers(struct btrfs_path *path,
+static int reserve_metadata_space(struct btrfs_trans_handle *trans,
-                              struct backref_node *node,
+                                  struct reloc_control *rc,
-                              struct backref_edge *edges[], int nr)
+                                  struct backref_node *node)
 {
-        int i = 0;
+        struct btrfs_root *root = rc->extent_root;
-        while (1) {
+        u64 num_bytes;
-                drop_node_buffer(node);
+        int ret;
-                node->eb = path->nodes[node->level];
-                BUG_ON(!node->eb);
+        num_bytes = calcu_metadata_size(rc, node, 1) * 2;
-                if (path->locks[node->level])
-                        node->locked = 1;
-                path->nodes[node->level] = NULL;
-                path->locks[node->level] = 0;
-                if (i >= nr)
-                        break;
-                edges[i]->blockptr = node->eb->start;
+        trans->block_rsv = rc->block_rsv;
-                node = edges[i]->node[UPPER];
+        ret = btrfs_block_rsv_add(trans, root, rc->block_rsv, num_bytes,
-                i++;
+                                  &rc->block_rsv_retries);
+        if (ret) {
+                if (ret == -EAGAIN)
+                        rc->commit_transaction = 1;
+                return ret;
        }
+        rc->block_rsv_retries = 0;
+        return 0;
+}
+static void release_metadata_space(struct reloc_control *rc,
+                                   struct backref_node *node)
+{
+        u64 num_bytes = calcu_metadata_size(rc, node, 0) * 2;
+        btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, num_bytes);
 }
 /*
@@ -1999,6 +2431,7 @@ static void grab_path_buffers(struct btrfs_path *path,
 * in that case this function just updates pointers.
 */
 static int do_relocation(struct btrfs_trans_handle *trans,
+                         struct reloc_control *rc,
                         struct backref_node *node,
                         struct btrfs_key *key,
                         struct btrfs_path *path, int lowest)
@@ -2019,18 +2452,25 @@ static int do_relocation(struct btrfs_trans_handle *trans,
        BUG_ON(lowest && node->eb);
        path->lowest_level = node->level + 1;
+        rc->backref_cache.path[node->level] = node;
        list_for_each_entry(edge, &node->upper, list[LOWER]) {
                cond_resched();
-                if (node->eb && node->eb->start == edge->blockptr)
-                        continue;
                upper = edge->node[UPPER];
-                root = select_reloc_root(trans, upper, edges, &nr);
+                root = select_reloc_root(trans, rc, upper, edges, &nr);
-                if (!root)
+                BUG_ON(!root);
-                        continue;
+                if (upper->eb && !upper->locked) {
-                if (upper->eb && !upper->locked)
+                        if (!lowest) {
+                                ret = btrfs_bin_search(upper->eb, key,
+                                                       upper->level, &slot);
+                                BUG_ON(ret);
+                                bytenr = btrfs_node_blockptr(upper->eb, slot);
+                                if (node->eb->start == bytenr)
+                                        goto next;
+                        }
                        drop_node_buffer(upper);
+                }
                if (!upper->eb) {
                        ret = btrfs_search_slot(trans, root, key, path, 0, 1);
@@ -2040,11 +2480,17 @@ static int do_relocation(struct btrfs_trans_handle *trans,
                        }
                        BUG_ON(ret > 0);
-                        slot = path->slots[upper->level];
+                        if (!upper->eb) {
+                                upper->eb = path->nodes[upper->level];
+                                path->nodes[upper->level] = NULL;
+                        } else {
+                                BUG_ON(upper->eb != path->nodes[upper->level]);
+                        }
-                        btrfs_unlock_up_safe(path, upper->level + 1);
+                        upper->locked = 1;
-                        grab_path_buffers(path, upper, edges, nr);
+                        path->locks[upper->level] = 0;
+                        slot = path->slots[upper->level];
                        btrfs_release_path(NULL, path);
                } else {
                        ret = btrfs_bin_search(upper->eb, key, upper->level,
@@ -2053,14 +2499,11 @@ static int do_relocation(struct btrfs_trans_handle *trans,
                }
                bytenr = btrfs_node_blockptr(upper->eb, slot);
-                if (!lowest) {
+                if (lowest) {
-                        if (node->eb->start == bytenr) {
+                        BUG_ON(bytenr != node->bytenr);
-                                btrfs_tree_unlock(upper->eb);
-                                upper->locked = 0;
-                                continue;
-                        }
                } else {
-                        BUG_ON(node->bytenr != bytenr);
+                        if (node->eb->start == bytenr)
+                                goto next;
                }
                blocksize = btrfs_level_size(root, node->level);
@@ -2072,13 +2515,13 @@ static int do_relocation(struct btrfs_trans_handle *trans,
                if (!node->eb) {
                        ret = btrfs_cow_block(trans, root, eb, upper->eb,
                                              slot, &eb);
+                        btrfs_tree_unlock(eb);
+                        free_extent_buffer(eb);
                        if (ret < 0) {
                                err = ret;
-                                break;
+                                goto next;
                        }
-                        btrfs_set_lock_blocking(eb);
+                        BUG_ON(node->eb != eb);
-                        node->eb = eb;
-                        node->locked = 1;
                } else {
                        btrfs_set_node_blockptr(upper->eb, slot,
                                                node->eb->start);
@@ -2096,67 +2539,80 @@ static int do_relocation(struct btrfs_trans_handle *trans,
                        ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
                        BUG_ON(ret);
                }
-                if (!lowest) {
+next:
-                        btrfs_tree_unlock(upper->eb);
+                if (!upper->pending)
-                        upper->locked = 0;
+                        drop_node_buffer(upper);
-                }
+                else
+                        unlock_node_buffer(upper);
+                if (err)
+                        break;
        }
+        if (!err && node->pending) {
+                drop_node_buffer(node);
+                list_move_tail(&node->list, &rc->backref_cache.changed);
+                node->pending = 0;
+        }
        path->lowest_level = 0;
+        BUG_ON(err == -ENOSPC);
        return err;
 }
 static int link_to_upper(struct btrfs_trans_handle *trans,
+                         struct reloc_control *rc,
                         struct backref_node *node,
                         struct btrfs_path *path)
 {
        struct btrfs_key key;
-        if (!node->eb || list_empty(&node->upper))
-                return 0;
        btrfs_node_key_to_cpu(node->eb, &key, 0);
-        return do_relocation(trans, node, &key, path, 0);
+        return do_relocation(trans, rc, node, &key, path, 0);
 }
 static int finish_pending_nodes(struct btrfs_trans_handle *trans,
-                                struct backref_cache *cache,
+                                struct reloc_control *rc,
-                                struct btrfs_path *path)
+                                struct btrfs_path *path, int err)
 {
+        LIST_HEAD(list);
+        struct backref_cache *cache = &rc->backref_cache;
        struct backref_node *node;
        int level;
        int ret;
-        int err = 0;
        for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
                while (!list_empty(&cache->pending[level])) {
                        node = list_entry(cache->pending[level].next,
-                                          struct backref_node, lower);
+                                          struct backref_node, list);
-                        BUG_ON(node->level != level);
+                        list_move_tail(&node->list, &list);
+                        BUG_ON(!node->pending);
-                        ret = link_to_upper(trans, node, path);
+                        if (!err) {
-                        if (ret < 0)
+                                ret = link_to_upper(trans, rc, node, path);
-                                err = ret;
+                                if (ret < 0)
-                        /*
+                                        err = ret;
-                         * this remove the node from the pending list and
+                        }
-                         * may add some other nodes to the level + 1
-                         * pending list
-                         */
-                        remove_backref_node(cache, node);
                }
+                list_splice_init(&list, &cache->pending[level]);
        }
-        BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root));
        return err;
 }
 static void mark_block_processed(struct reloc_control *rc,
-                                 struct backref_node *node)
+                                 u64 bytenr, u32 blocksize)
+{
+        set_extent_bits(&rc->processed_blocks, bytenr, bytenr + blocksize - 1,
+                        EXTENT_DIRTY, GFP_NOFS);
+}
+static void __mark_block_processed(struct reloc_control *rc,
+                                   struct backref_node *node)
 {
        u32 blocksize;
        if (node->level == 0 ||
            in_block_group(node->bytenr, rc->block_group)) {
                blocksize = btrfs_level_size(rc->extent_root, node->level);
-                set_extent_bits(&rc->processed_blocks, node->bytenr,
+                mark_block_processed(rc, node->bytenr, blocksize);
-                                node->bytenr + blocksize - 1, EXTENT_DIRTY,
-                                GFP_NOFS);
        }
        node->processed = 1;
 }
@@ -2179,7 +2635,7 @@ static void update_processed_blocks(struct reloc_control *rc,
                        if (next->processed)
                                break;
-                        mark_block_processed(rc, next);
+                        __mark_block_processed(rc, next);
                        if (list_empty(&next->upper))
                                break;
@@ -2202,138 +2658,6 @@ static int tree_block_processed(u64 bytenr, u32 blocksize,
        return 0;
 }
-/*
- * check if there are any file extent pointers in the leaf point to
- * data require processing
- */
-static int check_file_extents(struct reloc_control *rc,
-                              u64 bytenr, u32 blocksize, u64 ptr_gen)
-{
-        struct btrfs_key found_key;
-        struct btrfs_file_extent_item *fi;
-        struct extent_buffer *leaf;
-        u32 nritems;
-        int i;
-        int ret = 0;
-        leaf = read_tree_block(rc->extent_root, bytenr, blocksize, ptr_gen);
-        nritems = btrfs_header_nritems(leaf);
-        for (i = 0; i < nritems; i++) {
-                cond_resched();
-                btrfs_item_key_to_cpu(leaf, &found_key, i);
-                if (found_key.type != BTRFS_EXTENT_DATA_KEY)
-                        continue;
-                fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
-                if (btrfs_file_extent_type(leaf, fi) ==
-                    BTRFS_FILE_EXTENT_INLINE)
-                        continue;
-                bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
-                if (bytenr == 0)
-                        continue;
-                if (in_block_group(bytenr, rc->block_group)) {
-                        ret = 1;
-                        break;
-                }
-        }
-        free_extent_buffer(leaf);
-        return ret;
-}
-/*
- * scan child blocks of a given block to find blocks require processing
- */
-static int add_child_blocks(struct btrfs_trans_handle *trans,
-                            struct reloc_control *rc,
-                            struct backref_node *node,
-                            struct rb_root *blocks)
-{
-        struct tree_block *block;
-        struct rb_node *rb_node;
-        u64 bytenr;
-        u64 ptr_gen;
-        u32 blocksize;
-        u32 nritems;
-        int i;
-        int err = 0;
-        nritems = btrfs_header_nritems(node->eb);
-        blocksize = btrfs_level_size(rc->extent_root, node->level - 1);
-        for (i = 0; i < nritems; i++) {
-                cond_resched();
-                bytenr = btrfs_node_blockptr(node->eb, i);
-                ptr_gen = btrfs_node_ptr_generation(node->eb, i);
-                if (ptr_gen == trans->transid)
-                        continue;
-                if (!in_block_group(bytenr, rc->block_group) &&
-                    (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS))
-                        continue;
-                if (tree_block_processed(bytenr, blocksize, rc))
-                        continue;
-                readahead_tree_block(rc->extent_root,
-                                     bytenr, blocksize, ptr_gen);
-        }
-        for (i = 0; i < nritems; i++) {
-                cond_resched();
-                bytenr = btrfs_node_blockptr(node->eb, i);
-                ptr_gen = btrfs_node_ptr_generation(node->eb, i);
-                if (ptr_gen == trans->transid)
-                        continue;
-                if (!in_block_group(bytenr, rc->block_group) &&
-                    (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS))
-                        continue;
-                if (tree_block_processed(bytenr, blocksize, rc))
-                        continue;
-                if (!in_block_group(bytenr, rc->block_group) &&
-                    !check_file_extents(rc, bytenr, blocksize, ptr_gen))
-                        continue;
-                block = kmalloc(sizeof(*block), GFP_NOFS);
-                if (!block) {
-                        err = -ENOMEM;
-                        break;
-                }
-                block->bytenr = bytenr;
-                btrfs_node_key_to_cpu(node->eb, &block->key, i);
-                block->level = node->level - 1;
-                block->key_ready = 1;
-                rb_node = tree_insert(blocks, block->bytenr, &block->rb_node);
-                BUG_ON(rb_node);
-        }
-        if (err)
-                free_block_list(blocks);
-        return err;
-}
-/*
- * find adjacent blocks require processing
- */
-static noinline_for_stack
-int add_adjacent_blocks(struct btrfs_trans_handle *trans,
-                        struct reloc_control *rc,
-                        struct backref_cache *cache,
-                        struct rb_root *blocks, int level,
-                        struct backref_node **upper)
-{
-        struct backref_node *node;
-        int ret = 0;
-        WARN_ON(!list_empty(&cache->pending[level]));
-        if (list_empty(&cache->pending[level + 1]))
-                return 1;
-        node = list_entry(cache->pending[level + 1].next,
-                          struct backref_node, lower);
-        if (node->eb)
-                ret = add_child_blocks(trans, rc, node, blocks);
-        *upper = node;
-        return ret;
-}
 static int get_tree_block_key(struct reloc_control *rc,
                              struct tree_block *block)
 {
@@ -2371,40 +2695,53 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,
                                struct btrfs_path *path)
 {
        struct btrfs_root *root;
-        int ret;
+        int release = 0;
+        int ret = 0;
+        if (!node)
+                return 0;
+        BUG_ON(node->processed);
        root = select_one_root(trans, node);
-        if (unlikely(!root)) {
+        if (root == ERR_PTR(-ENOENT)) {
-                rc->found_old_snapshot = 1;
                update_processed_blocks(rc, node);
-                return 0;
+                goto out;
        }
-        if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
+        if (!root || root->ref_cows) {
-                ret = do_relocation(trans, node, key, path, 1);
+                ret = reserve_metadata_space(trans, rc, node);
-                if (ret < 0)
+                if (ret)
-                        goto out;
-                if (node->level == 0 && rc->stage == UPDATE_DATA_PTRS) {
-                        ret = replace_file_extents(trans, rc, root,
-                                                   node->eb, NULL);
-                        if (ret < 0)
-                                goto out;
-                }
-                drop_node_buffer(node);
-        } else if (!root->ref_cows) {
-                path->lowest_level = node->level;
-                ret = btrfs_search_slot(trans, root, key, path, 0, 1);
-                btrfs_release_path(root, path);
-                if (ret < 0)
                        goto out;
-        } else if (root != node->root) {
+                release = 1;
-                WARN_ON(node->level > 0 || rc->stage != UPDATE_DATA_PTRS);
        }
-        update_processed_blocks(rc, node);
+        if (root) {
-        ret = 0;
+                if (root->ref_cows) {
+                        BUG_ON(node->new_bytenr);
+                        BUG_ON(!list_empty(&node->list));
+                        btrfs_record_root_in_trans(trans, root);
+                        root = root->reloc_root;
+                        node->new_bytenr = root->node->start;
+                        node->root = root;
+                        list_add_tail(&node->list, &rc->backref_cache.changed);
+                } else {
+                        path->lowest_level = node->level;
+                        ret = btrfs_search_slot(trans, root, key, path, 0, 1);
+                        btrfs_release_path(root, path);
+                        if (ret > 0)
+                                ret = 0;
+                }
+                if (!ret)
+                        update_processed_blocks(rc, node);
+        } else {
+                ret = do_relocation(trans, rc, node, key, path, 1);
+        }
 out:
-        drop_node_buffer(node);
+        if (ret || node->level == 0 || node->cowonly) {
+                if (release)
+                        release_metadata_space(rc, node);
+                remove_backref_node(&rc->backref_cache, node);
+        }
        return ret;
 }
@@ -2415,12 +2752,10 @@ static noinline_for_stack
 int relocate_tree_blocks(struct btrfs_trans_handle *trans,
                         struct reloc_control *rc, struct rb_root *blocks)
 {
-        struct backref_cache *cache;
        struct backref_node *node;
        struct btrfs_path *path;
        struct tree_block *block;
        struct rb_node *rb_node;
-        int level = -1;
        int ret;
        int err = 0;
@@ -2428,21 +2763,9 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
        if (!path)
                return -ENOMEM;
-        cache = kmalloc(sizeof(*cache), GFP_NOFS);
-        if (!cache) {
-                btrfs_free_path(path);
-                return -ENOMEM;
-        }
-        backref_cache_init(cache);
        rb_node = rb_first(blocks);
        while (rb_node) {
                block = rb_entry(rb_node, struct tree_block, rb_node);
-                if (level == -1)
-                        level = block->level;
-                else
-                        BUG_ON(level != block->level);
                if (!block->key_ready)
                        reada_tree_block(rc, block);
                rb_node = rb_next(rb_node);
@@ -2460,7 +2783,7 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
        while (rb_node) {
                block = rb_entry(rb_node, struct tree_block, rb_node);
-                node = build_backref_tree(rc, cache, &block->key,
+                node = build_backref_tree(rc, &block->key,
                                          block->level, block->bytenr);
                if (IS_ERR(node)) {
                        err = PTR_ERR(node);
@@ -2470,79 +2793,62 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
                ret = relocate_tree_block(trans, rc, node, &block->key,
                                          path);
                if (ret < 0) {
-                        err = ret;
+                        if (ret != -EAGAIN || rb_node == rb_first(blocks))
+                                err = ret;
                        goto out;
                }
-                remove_backref_node(cache, node);
                rb_node = rb_next(rb_node);
        }
+out:
-        if (level > 0)
-                goto out;
        free_block_list(blocks);
+        err = finish_pending_nodes(trans, rc, path, err);
-        /*
+        btrfs_free_path(path);
-         * now backrefs of some upper level tree blocks have been cached,
+        return err;
-         * try relocating blocks referenced by these upper level blocks.
+}
-         */
-        while (1) {
-                struct backref_node *upper = NULL;
-                if (trans->transaction->in_commit ||
-                    trans->transaction->delayed_refs.flushing)
-                        break;
-                ret = add_adjacent_blocks(trans, rc, cache, blocks, level,
+static noinline_for_stack
-                                          &upper);
+int prealloc_file_extent_cluster(struct inode *inode,
-                if (ret < 0)
+                                 struct file_extent_cluster *cluster)
-                        err = ret;
+{
-                if (ret != 0)
+        u64 alloc_hint = 0;
-                        break;
+        u64 start;
+        u64 end;
+        u64 offset = BTRFS_I(inode)->index_cnt;
+        u64 num_bytes;
+        int nr = 0;
+        int ret = 0;
-                rb_node = rb_first(blocks);
+        BUG_ON(cluster->start != cluster->boundary[0]);
-                while (rb_node) {
+        mutex_lock(&inode->i_mutex);
-                        block = rb_entry(rb_node, struct tree_block, rb_node);
-                        if (trans->transaction->in_commit ||
-                            trans->transaction->delayed_refs.flushing)
-                                goto out;
-                        BUG_ON(!block->key_ready);
-                        node = build_backref_tree(rc, cache, &block->key,
-                                                  level, block->bytenr);
-                        if (IS_ERR(node)) {
-                                err = PTR_ERR(node);
-                                goto out;
-                        }
-                        ret = relocate_tree_block(trans, rc, node,
+        ret = btrfs_check_data_free_space(inode, cluster->end +
-                                                  &block->key, path);
+                                          1 - cluster->start);
-                        if (ret < 0) {
+        if (ret)
-                                err = ret;
+                goto out;
-                                goto out;
-                        }
-                        remove_backref_node(cache, node);
-                        rb_node = rb_next(rb_node);
-                }
-                free_block_list(blocks);
-                if (upper) {
+        while (nr < cluster->nr) {
-                        ret = link_to_upper(trans, upper, path);
+                start = cluster->boundary[nr] - offset;
-                        if (ret < 0) {
+                if (nr + 1 < cluster->nr)
-                                err = ret;
+                        end = cluster->boundary[nr + 1] - 1 - offset;
-                                break;
+                else
-                        }
+                        end = cluster->end - offset;
-                        remove_backref_node(cache, upper);
-                }
+                lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
+                num_bytes = end + 1 - start;
+                ret = btrfs_prealloc_file_range(inode, 0, start,
+                                                num_bytes, num_bytes,
+                                                end + 1, &alloc_hint);
+                unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
+                if (ret)
+                        break;
+                nr++;
        }
+        btrfs_free_reserved_data_space(inode, cluster->end +
+                                       1 - cluster->start);
 out:
-        free_block_list(blocks);
+        mutex_unlock(&inode->i_mutex);
+        return ret;
-        ret = finish_pending_nodes(trans, cache, path);
-        if (ret < 0)
-                err = ret;
-        kfree(cache);
-        btrfs_free_path(path);
-        return err;
 }
 static noinline_for_stack
@@ -2588,7 +2894,6 @@ static int relocate_file_extent_cluster(struct inode *inode,
        u64 offset = BTRFS_I(inode)->index_cnt;
        unsigned long index;
        unsigned long last_index;
-        unsigned int dirty_page = 0;
        struct page *page;
        struct file_ra_state *ra;
        int nr = 0;
@@ -2601,21 +2906,24 @@ static int relocate_file_extent_cluster(struct inode *inode,
        if (!ra)
                return -ENOMEM;
-        index = (cluster->start - offset) >> PAGE_CACHE_SHIFT;
+        ret = prealloc_file_extent_cluster(inode, cluster);
-        last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT;
+        if (ret)
+                goto out;
-        mutex_lock(&inode->i_mutex);
+        file_ra_state_init(ra, inode->i_mapping);
-        i_size_write(inode, cluster->end + 1 - offset);
        ret = setup_extent_mapping(inode, cluster->start - offset,
                                   cluster->end - offset, cluster->start);
        if (ret)
-                goto out_unlock;
+                goto out;
-        file_ra_state_init(ra, inode->i_mapping);
-        WARN_ON(cluster->start != cluster->boundary[0]);
+        index = (cluster->start - offset) >> PAGE_CACHE_SHIFT;
+        last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT;
        while (index <= last_index) {
+                ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE);
+                if (ret)
+                        goto out;
                page = find_lock_page(inode->i_mapping, index);
                if (!page) {
                        page_cache_sync_readahead(inode->i_mapping,
@@ -2623,8 +2931,10 @@ static int relocate_file_extent_cluster(struct inode *inode,
                                                  last_index + 1 - index);
                        page = grab_cache_page(inode->i_mapping, index);
                        if (!page) {
+                                btrfs_delalloc_release_metadata(inode,
+                                                        PAGE_CACHE_SIZE);
                                ret = -ENOMEM;
-                                goto out_unlock;
+                                goto out;
                        }
                }
@@ -2640,8 +2950,10 @@ static int relocate_file_extent_cluster(struct inode *inode,
                        if (!PageUptodate(page)) {
                                unlock_page(page);
                                page_cache_release(page);
+                                btrfs_delalloc_release_metadata(inode,
+                                                        PAGE_CACHE_SIZE);
                                ret = -EIO;
-                                goto out_unlock;
+                                goto out;
                        }
                }
@@ -2660,10 +2972,9 @@ static int relocate_file_extent_cluster(struct inode *inode,
                                        EXTENT_BOUNDARY, GFP_NOFS);
                        nr++;
                }
-                btrfs_set_extent_delalloc(inode, page_start, page_end, NULL);
+                btrfs_set_extent_delalloc(inode, page_start, page_end, NULL);
                set_page_dirty(page);
-                dirty_page++;
                unlock_extent(&BTRFS_I(inode)->io_tree,
                              page_start, page_end, GFP_NOFS);
@@ -2671,20 +2982,11 @@ static int relocate_file_extent_cluster(struct inode *inode,
                page_cache_release(page);
                index++;
-                if (nr < cluster->nr &&
+                balance_dirty_pages_ratelimited(inode->i_mapping);
-                    page_end + 1 + offset == cluster->boundary[nr]) {
+                btrfs_throttle(BTRFS_I(inode)->root);
-                        balance_dirty_pages_ratelimited_nr(inode->i_mapping,
-                                                           dirty_page);
-                        dirty_page = 0;
-                }
-        }
-        if (dirty_page) {
-                balance_dirty_pages_ratelimited_nr(inode->i_mapping,
-                                                   dirty_page);
        }
        WARN_ON(nr != cluster->nr);
-out_unlock:
+out:
-        mutex_unlock(&inode->i_mutex);
        kfree(ra);
        return ret;
 }
@@ -2870,9 +3172,6 @@ out:
 static int block_use_full_backref(struct reloc_control *rc,
                                  struct extent_buffer *eb)
 {
-        struct btrfs_path *path;
-        struct btrfs_extent_item *ei;
-        struct btrfs_key key;
        u64 flags;
        int ret;
@@ -2880,28 +3179,14 @@ static int block_use_full_backref(struct reloc_control *rc,
            btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV)
                return 1;
-        path = btrfs_alloc_path();
+        ret = btrfs_lookup_extent_info(NULL, rc->extent_root,
-        BUG_ON(!path);
+                                       eb->start, eb->len, NULL, &flags);
-        key.objectid = eb->start;
-        key.type = BTRFS_EXTENT_ITEM_KEY;
-        key.offset = eb->len;
-        path->search_commit_root = 1;
-        path->skip_locking = 1;
-        ret = btrfs_search_slot(NULL, rc->extent_root,
-                                &key, path, 0, 0);
        BUG_ON(ret);
-        ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
-                            struct btrfs_extent_item);
-        flags = btrfs_extent_flags(path->nodes[0], ei);
-        BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
        if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
                ret = 1;
        else
                ret = 0;
-        btrfs_free_path(path);
        return ret;
 }
@@ -3074,22 +3359,10 @@ int add_data_references(struct reloc_control *rc,
        struct btrfs_extent_inline_ref *iref;
        unsigned long ptr;
        unsigned long end;
-        u32 blocksize;
+        u32 blocksize = btrfs_level_size(rc->extent_root, 0);
        int ret;
        int err = 0;
-        ret = get_new_location(rc->data_inode, NULL, extent_key->objectid,
-                               extent_key->offset);
-        BUG_ON(ret < 0);
-        if (ret > 0) {
-                /* the relocated data is fragmented */
-                rc->extents_skipped++;
-                btrfs_release_path(rc->extent_root, path);
-                return 0;
-        }
-        blocksize = btrfs_level_size(rc->extent_root, 0);
        eb = path->nodes[0];
        ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
        end = ptr + btrfs_item_size_nr(eb, path->slots[0]);
@@ -3170,7 +3443,8 @@ int add_data_references(struct reloc_control *rc,
 */
 static noinline_for_stack
 int find_next_extent(struct btrfs_trans_handle *trans,
-                     struct reloc_control *rc, struct btrfs_path *path)
+                     struct reloc_control *rc, struct btrfs_path *path,
+                     struct btrfs_key *extent_key)
 {
        struct btrfs_key key;
        struct extent_buffer *leaf;
@@ -3225,6 +3499,7 @@ next:
                        rc->search_start = end + 1;
                } else {
                        rc->search_start = key.objectid + key.offset;
+                        memcpy(extent_key, &key, sizeof(key));
                        return 0;
                }
        }
@@ -3262,12 +3537,49 @@ static int check_extent_flags(u64 flags)
        return 0;
 }
+static noinline_for_stack
+int prepare_to_relocate(struct reloc_control *rc)
+{
+        struct btrfs_trans_handle *trans;
+        int ret;
+        rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root);
+        if (!rc->block_rsv)
+                return -ENOMEM;
+        /*
+         * reserve some space for creating reloc trees.
+         * btrfs_init_reloc_root will use them when there
+         * is no reservation in transaction handle.
+         */
+        ret = btrfs_block_rsv_add(NULL, rc->extent_root, rc->block_rsv,
+                                  rc->extent_root->nodesize * 256,
+                                  &rc->block_rsv_retries);
+        if (ret)
+                return ret;
+        rc->block_rsv->refill_used = 1;
+        btrfs_add_durable_block_rsv(rc->extent_root->fs_info, rc->block_rsv);
+        memset(&rc->cluster, 0, sizeof(rc->cluster));
+        rc->search_start = rc->block_group->key.objectid;
+        rc->extents_found = 0;
+        rc->nodes_relocated = 0;
+        rc->merging_rsv_size = 0;
+        rc->block_rsv_retries = 0;
+        rc->create_reloc_tree = 1;
+        set_reloc_control(rc);
+        trans = btrfs_join_transaction(rc->extent_root, 1);
+        btrfs_commit_transaction(trans, rc->extent_root);
+        return 0;
+}
 static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
 {
        struct rb_root blocks = RB_ROOT;
        struct btrfs_key key;
-        struct file_extent_cluster *cluster;
        struct btrfs_trans_handle *trans = NULL;
        struct btrfs_path *path;
        struct btrfs_extent_item *ei;
@@ -3277,33 +3589,25 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
        int ret;
        int err = 0;
-        cluster = kzalloc(sizeof(*cluster), GFP_NOFS);
-        if (!cluster)
-                return -ENOMEM;
        path = btrfs_alloc_path();
-        if (!path) {
+        if (!path)
-                kfree(cluster);
                return -ENOMEM;
-        }
-        rc->extents_found = 0;
-        rc->extents_skipped = 0;
-        rc->search_start = rc->block_group->key.objectid;
-        clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
-                          GFP_NOFS);
-        rc->create_reloc_root = 1;
-        set_reloc_control(rc);
-        trans = btrfs_start_transaction(rc->extent_root, 1);
+        ret = prepare_to_relocate(rc);
-        btrfs_commit_transaction(trans, rc->extent_root);
+        if (ret) {
+                err = ret;
+                goto out_free;
+        }
        while (1) {
-                trans = btrfs_start_transaction(rc->extent_root, 1);
+                trans = btrfs_start_transaction(rc->extent_root, 0);
+                if (update_backref_cache(trans, &rc->backref_cache)) {
+                        btrfs_end_transaction(trans, rc->extent_root);
+                        continue;
+                }
-                ret = find_next_extent(trans, rc, path);
+                ret = find_next_extent(trans, rc, path, &key);
                if (ret < 0)
                        err = ret;
                if (ret != 0)
@@ -3313,9 +3617,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
                ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
                                    struct btrfs_extent_item);
-                btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+                item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
-                item_size = btrfs_item_size_nr(path->nodes[0],
-                                               path->slots[0]);
                if (item_size >= sizeof(*ei)) {
                        flags = btrfs_extent_flags(path->nodes[0], ei);
                        ret = check_extent_flags(flags);
@@ -3356,73 +3658,100 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
                if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
                        ret = add_tree_block(rc, &key, path, &blocks);
                } else if (rc->stage == UPDATE_DATA_PTRS &&
-                         (flags & BTRFS_EXTENT_FLAG_DATA)) {
+                           (flags & BTRFS_EXTENT_FLAG_DATA)) {
                        ret = add_data_references(rc, &key, path, &blocks);
                } else {
                        btrfs_release_path(rc->extent_root, path);
                        ret = 0;
                }
                if (ret < 0) {
-                        err = 0;
+                        err = ret;
                        break;
                }
                if (!RB_EMPTY_ROOT(&blocks)) {
                        ret = relocate_tree_blocks(trans, rc, &blocks);
                        if (ret < 0) {
+                                if (ret != -EAGAIN) {
+                                        err = ret;
+                                        break;
+                                }
+                                rc->extents_found--;
+                                rc->search_start = key.objectid;
+                        }
+                }
+                ret = btrfs_block_rsv_check(trans, rc->extent_root,
+                                            rc->block_rsv, 0, 5);
+                if (ret < 0) {
+                        if (ret != -EAGAIN) {
                                err = ret;
+                                WARN_ON(1);
                                break;
                        }
+                        rc->commit_transaction = 1;
                }
-                nr = trans->blocks_used;
+                if (rc->commit_transaction) {
-                btrfs_end_transaction(trans, rc->extent_root);
+                        rc->commit_transaction = 0;
+                        ret = btrfs_commit_transaction(trans, rc->extent_root);
+                        BUG_ON(ret);
+                } else {
+                        nr = trans->blocks_used;
+                        btrfs_end_transaction_throttle(trans, rc->extent_root);
+                        btrfs_btree_balance_dirty(rc->extent_root, nr);
+                }
                trans = NULL;
-                btrfs_btree_balance_dirty(rc->extent_root, nr);
                if (rc->stage == MOVE_DATA_EXTENTS &&
                    (flags & BTRFS_EXTENT_FLAG_DATA)) {
                        rc->found_file_extent = 1;
                        ret = relocate_data_extent(rc->data_inode,
-                                                   &key, cluster);
+                                                   &key, &rc->cluster);
                        if (ret < 0) {
                                err = ret;
                                break;
                        }
                }
        }
-        btrfs_free_path(path);
+        btrfs_release_path(rc->extent_root, path);
+        clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
+                          GFP_NOFS);
        if (trans) {
                nr = trans->blocks_used;
-                btrfs_end_transaction(trans, rc->extent_root);
+                btrfs_end_transaction_throttle(trans, rc->extent_root);
                btrfs_btree_balance_dirty(rc->extent_root, nr);
        }
        if (!err) {
-                ret = relocate_file_extent_cluster(rc->data_inode, cluster);
+                ret = relocate_file_extent_cluster(rc->data_inode,
+                                                   &rc->cluster);
                if (ret < 0)
                        err = ret;
        }
-        kfree(cluster);
+        rc->create_reloc_tree = 0;
+        set_reloc_control(rc);
-        rc->create_reloc_root = 0;
+        backref_cache_cleanup(&rc->backref_cache);
-        smp_mb();
+        btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1);
-        if (rc->extents_found > 0) {
+        err = prepare_to_merge(rc, err);
-                trans = btrfs_start_transaction(rc->extent_root, 1);
-                btrfs_commit_transaction(trans, rc->extent_root);
-        }
        merge_reloc_roots(rc);
+        rc->merge_reloc_tree = 0;
        unset_reloc_control(rc);
+        btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1);
        /* get rid of pinned extents */
-        trans = btrfs_start_transaction(rc->extent_root, 1);
+        trans = btrfs_join_transaction(rc->extent_root, 1);
        btrfs_commit_transaction(trans, rc->extent_root);
+out_free:
+        btrfs_free_block_rsv(rc->extent_root, rc->block_rsv);
+        btrfs_free_path(path);
        return err;
 }
@@ -3448,7 +3777,8 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
        btrfs_set_inode_generation(leaf, item, 1);
        btrfs_set_inode_size(leaf, item, 0);
        btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
-        btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS);
+        btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS |
+                                          BTRFS_INODE_PREALLOC);
        btrfs_mark_buffer_dirty(leaf);
        btrfs_release_path(root, path);
 out:
@@ -3460,8 +3790,9 @@ out:
 * helper to create inode for data relocation.
 * the inode is in data relocation tree and its link count is 0
 */
-static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
+static noinline_for_stack
-                                        struct btrfs_block_group_cache *group)
+struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
+                                 struct btrfs_block_group_cache *group)
 {
        struct inode *inode = NULL;
        struct btrfs_trans_handle *trans;
@@ -3475,8 +3806,9 @@ static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
        if (IS_ERR(root))
                return ERR_CAST(root);
-        trans = btrfs_start_transaction(root, 1);
+        trans = btrfs_start_transaction(root, 6);
-        BUG_ON(!trans);
+        if (IS_ERR(trans))
+                return ERR_CAST(trans);
        err = btrfs_find_free_objectid(trans, root, objectid, &objectid);
        if (err)
@@ -3496,7 +3828,6 @@ static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
 out:
        nr = trans->blocks_used;
        btrfs_end_transaction(trans, root);
        btrfs_btree_balance_dirty(root, nr);
        if (err) {
                if (inode)
@@ -3506,6 +3837,21 @@ out:
        return inode;
 }
+static struct reloc_control *alloc_reloc_control(void)
+{
+        struct reloc_control *rc;
+        rc = kzalloc(sizeof(*rc), GFP_NOFS);
+        if (!rc)
+                return NULL;
+        INIT_LIST_HEAD(&rc->reloc_roots);
+        backref_cache_init(&rc->backref_cache);
+        mapping_tree_init(&rc->reloc_root_tree);
+        extent_io_tree_init(&rc->processed_blocks, NULL, GFP_NOFS);
+        return rc;
+}
 /*
 * function to relocate all extents in a block group.
 */
@@ -3514,24 +3860,26 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
        struct btrfs_fs_info *fs_info = extent_root->fs_info;
        struct reloc_control *rc;
        int ret;
+        int rw = 0;
        int err = 0;
-        rc = kzalloc(sizeof(*rc), GFP_NOFS);
+        rc = alloc_reloc_control();
        if (!rc)
                return -ENOMEM;
-        mapping_tree_init(&rc->reloc_root_tree);
+        rc->extent_root = extent_root;
-        extent_io_tree_init(&rc->processed_blocks, NULL, GFP_NOFS);
-        INIT_LIST_HEAD(&rc->reloc_roots);
        rc->block_group = btrfs_lookup_block_group(fs_info, group_start);
        BUG_ON(!rc->block_group);
-        btrfs_init_workers(&rc->workers, "relocate",
+        if (!rc->block_group->ro) {
-                           fs_info->thread_pool_size, NULL);
+                ret = btrfs_set_block_group_ro(extent_root, rc->block_group);
+                if (ret) {
-        rc->extent_root = extent_root;
+                        err = ret;
-        btrfs_prepare_block_group_relocation(extent_root, rc->block_group);
+                        goto out;
+                }
+                rw = 1;
+        }
        rc->data_inode = create_reloc_inode(fs_info, rc->block_group);
        if (IS_ERR(rc->data_inode)) {
@@ -3548,9 +3896,6 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
        btrfs_wait_ordered_extents(fs_info->tree_root, 0, 0);
        while (1) {
-                rc->extents_found = 0;
-                rc->extents_skipped = 0;
                mutex_lock(&fs_info->cleaner_mutex);
                btrfs_clean_old_snapshots(fs_info->tree_root);
@@ -3559,7 +3904,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
                mutex_unlock(&fs_info->cleaner_mutex);
                if (ret < 0) {
                        err = ret;
-                        break;
+                        goto out;
                }
                if (rc->extents_found == 0)
@@ -3573,18 +3918,6 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
                        invalidate_mapping_pages(rc->data_inode->i_mapping,
                                                 0, -1);
                        rc->stage = UPDATE_DATA_PTRS;
-                } else if (rc->stage == UPDATE_DATA_PTRS &&
-                           rc->extents_skipped >= rc->extents_found) {
-                        iput(rc->data_inode);
-                        rc->data_inode = create_reloc_inode(fs_info,
-                                                            rc->block_group);
-                        if (IS_ERR(rc->data_inode)) {
-                                err = PTR_ERR(rc->data_inode);
-                                rc->data_inode = NULL;
-                                break;
-                        }
-                        rc->stage = MOVE_DATA_EXTENTS;
-                        rc->found_file_extent = 0;
                }
        }
@@ -3597,8 +3930,9 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
        WARN_ON(rc->block_group->reserved > 0);
        WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0);
 out:
+        if (err && rw)
+                btrfs_set_block_group_rw(extent_root, rc->block_group);
        iput(rc->data_inode);
-        btrfs_stop_workers(&rc->workers);
        btrfs_put_block_group(rc->block_group);
        kfree(rc);
        return err;
@@ -3609,7 +3943,7 @@ static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
        struct btrfs_trans_handle *trans;
        int ret;
-        trans = btrfs_start_transaction(root->fs_info->tree_root, 1);
+        trans = btrfs_start_transaction(root->fs_info->tree_root, 0);
        memset(&root->root_item.drop_progress, 0,
                sizeof(root->root_item.drop_progress));
@@ -3702,20 +4036,20 @@ int btrfs_recover_relocation(struct btrfs_root *root)
        if (list_empty(&reloc_roots))
                goto out;
-        rc = kzalloc(sizeof(*rc), GFP_NOFS);
+        rc = alloc_reloc_control();
        if (!rc) {
                err = -ENOMEM;
                goto out;
        }
-        mapping_tree_init(&rc->reloc_root_tree);
-        INIT_LIST_HEAD(&rc->reloc_roots);
-        btrfs_init_workers(&rc->workers, "relocate",
-                           root->fs_info->thread_pool_size, NULL);
        rc->extent_root = root->fs_info->extent_root;
        set_reloc_control(rc);
+        trans = btrfs_join_transaction(rc->extent_root, 1);
+        rc->merge_reloc_tree = 1;
        while (!list_empty(&reloc_roots)) {
                reloc_root = list_entry(reloc_roots.next,
                                        struct btrfs_root, root_list);
@@ -3735,20 +4069,16 @@ int btrfs_recover_relocation(struct btrfs_root *root)
                fs_root->reloc_root = reloc_root;
        }
-        trans = btrfs_start_transaction(rc->extent_root, 1);
        btrfs_commit_transaction(trans, rc->extent_root);
        merge_reloc_roots(rc);
        unset_reloc_control(rc);
-        trans = btrfs_start_transaction(rc->extent_root, 1);
+        trans = btrfs_join_transaction(rc->extent_root, 1);
        btrfs_commit_transaction(trans, rc->extent_root);
 out:
-        if (rc) {
+        kfree(rc);
-                btrfs_stop_workers(&rc->workers);
-                kfree(rc);
-        }
        while (!list_empty(&reloc_roots)) {
                reloc_root = list_entry(reloc_roots.next,
                                        struct btrfs_root, root_list);
@@ -3814,3 +4144,130 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
        btrfs_put_ordered_extent(ordered);
        return 0;
 }
+void btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
+                           struct btrfs_root *root, struct extent_buffer *buf,
+                           struct extent_buffer *cow)
+{
+        struct reloc_control *rc;
+        struct backref_node *node;
+        int first_cow = 0;
+        int level;
+        int ret;
+        rc = root->fs_info->reloc_ctl;
+        if (!rc)
+                return;
+        BUG_ON(rc->stage == UPDATE_DATA_PTRS &&
+               root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID);
+        level = btrfs_header_level(buf);
+        if (btrfs_header_generation(buf) <=
+            btrfs_root_last_snapshot(&root->root_item))
+                first_cow = 1;
+        if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID &&
+            rc->create_reloc_tree) {
+                WARN_ON(!first_cow && level == 0);
+                node = rc->backref_cache.path[level];
+                BUG_ON(node->bytenr != buf->start &&
+                       node->new_bytenr != buf->start);
+                drop_node_buffer(node);
+                extent_buffer_get(cow);
+                node->eb = cow;
+                node->new_bytenr = cow->start;
+                if (!node->pending) {
+                        list_move_tail(&node->list,
+                                       &rc->backref_cache.pending[level]);
+                        node->pending = 1;
+                }
+                if (first_cow)
+                        __mark_block_processed(rc, node);
+                if (first_cow && level > 0)
+                        rc->nodes_relocated += buf->len;
+        }
+        if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS) {
+                ret = replace_file_extents(trans, rc, root, cow);
+                BUG_ON(ret);
+        }
+}
+/*
+ * called before creating snapshot. it calculates metadata reservation
+ * required for relocating tree blocks in the snapshot
+ */
+void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans,
+                              struct btrfs_pending_snapshot *pending,
+                              u64 *bytes_to_reserve)
+{
+        struct btrfs_root *root;
+        struct reloc_control *rc;
+        root = pending->root;
+        if (!root->reloc_root)
+                return;
+        rc = root->fs_info->reloc_ctl;
+        if (!rc->merge_reloc_tree)
+                return;
+        root = root->reloc_root;
+        BUG_ON(btrfs_root_refs(&root->root_item) == 0);
+        /*
+         * relocation is in the stage of merging trees. the space
+         * used by merging a reloc tree is twice the size of
+         * relocated tree nodes in the worst case. half for cowing
+         * the reloc tree, half for cowing the fs tree. the space
+         * used by cowing the reloc tree will be freed after the
+         * tree is dropped. if we create snapshot, cowing the fs
+         * tree may use more space than it frees. so we need
+         * reserve extra space.
+         */
+        *bytes_to_reserve += rc->nodes_relocated;
+}
+/*
+ * called after snapshot is created. migrate block reservation
+ * and create reloc root for the newly created snapshot
+ */
+void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
+                               struct btrfs_pending_snapshot *pending)
+{
+        struct btrfs_root *root = pending->root;
+        struct btrfs_root *reloc_root;
+        struct btrfs_root *new_root;
+        struct reloc_control *rc;
+        int ret;
+        if (!root->reloc_root)
+                return;
+        rc = root->fs_info->reloc_ctl;
+        rc->merging_rsv_size += rc->nodes_relocated;
+        if (rc->merge_reloc_tree) {
+                ret = btrfs_block_rsv_migrate(&pending->block_rsv,
+                                              rc->block_rsv,
+                                              rc->nodes_relocated);
+                BUG_ON(ret);
+        }
+        new_root = pending->snap;
+        reloc_root = create_reloc_root(trans, root->reloc_root,
+                                       new_root->root_key.objectid);
+        __add_reloc_root(reloc_root);
+        new_root->reloc_root = reloc_root;
+        if (rc->create_reloc_tree) {
+                ret = clone_backref_node(trans, rc, root, reloc_root);
+                BUG_ON(ret);
+        }
+}
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 67fa2d29d663..b91ccd972644 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -259,6 +259,8 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
        struct extent_buffer *leaf;
        struct btrfs_path *path;
        struct btrfs_key key;
+        struct btrfs_key root_key;
+        struct btrfs_root *root;
        int err = 0;
        int ret;
@@ -270,6 +272,9 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
        key.type = BTRFS_ORPHAN_ITEM_KEY;
        key.offset = 0;
+        root_key.type = BTRFS_ROOT_ITEM_KEY;
+        root_key.offset = (u64)-1;
        while (1) {
                ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
                if (ret < 0) {
@@ -294,13 +299,25 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
                    key.type != BTRFS_ORPHAN_ITEM_KEY)
                        break;
-                ret = btrfs_find_dead_roots(tree_root, key.offset);
+                root_key.objectid = key.offset;
-                if (ret) {
+                key.offset++;
+                root = btrfs_read_fs_root_no_name(tree_root->fs_info,
+                                                  &root_key);
+                if (!IS_ERR(root))
+                        continue;
+                ret = PTR_ERR(root);
+                if (ret != -ENOENT) {
                        err = ret;
                        break;
                }
-                key.offset++;
+                ret = btrfs_find_dead_roots(tree_root, root_key.objectid);
+                if (ret) {
+                        err = ret;
+                        break;
+                }
        }
        btrfs_free_path(path);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 1866dff0538e..d34b2dfc9628 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -498,7 +498,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
        btrfs_start_delalloc_inodes(root, 0);
        btrfs_wait_ordered_extents(root, 0, 0);
-        trans = btrfs_start_transaction(root, 1);
+        trans = btrfs_start_transaction(root, 0);
        ret = btrfs_commit_transaction(trans, root);
        return ret;
 }
@@ -694,11 +694,11 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
                if (btrfs_super_log_root(&root->fs_info->super_copy) != 0)
                        return -EINVAL;
-                /* recover relocation */
+                ret = btrfs_cleanup_fs_roots(root->fs_info);
-                ret = btrfs_recover_relocation(root);
                WARN_ON(ret);
-                ret = btrfs_cleanup_fs_roots(root->fs_info);
+                /* recover relocation */
+                ret = btrfs_recover_relocation(root);
                WARN_ON(ret);
                sb->s_flags &= ~MS_RDONLY;
@@ -714,34 +714,18 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
        struct list_head *head = &root->fs_info->space_info;
        struct btrfs_space_info *found;
        u64 total_used = 0;
-        u64 data_used = 0;
        int bits = dentry->d_sb->s_blocksize_bits;
        __be32 *fsid = (__be32 *)root->fs_info->fsid;
        rcu_read_lock();
-        list_for_each_entry_rcu(found, head, list) {
+        list_for_each_entry_rcu(found, head, list)
-                if (found->flags & (BTRFS_BLOCK_GROUP_DUP|
+                total_used += found->disk_used;
-                                    BTRFS_BLOCK_GROUP_RAID10|
-                                    BTRFS_BLOCK_GROUP_RAID1)) {
-                        total_used += found->bytes_used;
-                        if (found->flags & BTRFS_BLOCK_GROUP_DATA)
-                                data_used += found->bytes_used;
-                        else
-                                data_used += found->total_bytes;
-                }
-                total_used += found->bytes_used;
-                if (found->flags & BTRFS_BLOCK_GROUP_DATA)
-                        data_used += found->bytes_used;
-                else
-                        data_used += found->total_bytes;
-        }
        rcu_read_unlock();
        buf->f_namelen = BTRFS_NAME_LEN;
        buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits;
        buf->f_bfree = buf->f_blocks - (total_used >> bits);
-        buf->f_bavail = buf->f_blocks - (data_used >> bits);
+        buf->f_bavail = buf->f_bfree;
        buf->f_bsize = dentry->d_sb->s_blocksize;
        buf->f_type = BTRFS_SUPER_MAGIC;
@@ -832,11 +816,14 @@ static const struct file_operations btrfs_ctl_fops = {
 };
 static struct miscdevice btrfs_misc = {
-        .minor          = MISC_DYNAMIC_MINOR,
+        .minor          = BTRFS_MINOR,
        .name           = "btrfs-control",
        .fops           = &btrfs_ctl_fops
 };
+MODULE_ALIAS_MISCDEV(BTRFS_MINOR);
+MODULE_ALIAS("devname:btrfs-control");
 static int btrfs_interface_init(void)
 {
        return misc_register(&btrfs_misc);
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 2cb116099b90..66e4c66cc63b 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -165,54 +165,89 @@ enum btrfs_trans_type {
        TRANS_USERSPACE,
 };
+static int may_wait_transaction(struct btrfs_root *root, int type)
+{
+        if (!root->fs_info->log_root_recovering &&
+            ((type == TRANS_START && !root->fs_info->open_ioctl_trans) ||
+             type == TRANS_USERSPACE))
+                return 1;
+        return 0;
+}
 static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
-                                             int num_blocks, int type)
+                                                    u64 num_items, int type)
 {
-        struct btrfs_trans_handle *h =
+        struct btrfs_trans_handle *h;
-                kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
+        struct btrfs_transaction *cur_trans;
+        int retries = 0;
        int ret;
+again:
+        h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
+        if (!h)
+                return ERR_PTR(-ENOMEM);
        mutex_lock(&root->fs_info->trans_mutex);
-        if (!root->fs_info->log_root_recovering &&
+        if (may_wait_transaction(root, type))
-            ((type == TRANS_START && !root->fs_info->open_ioctl_trans) ||
-             type == TRANS_USERSPACE))
                wait_current_trans(root);
        ret = join_transaction(root);
        BUG_ON(ret);
-        h->transid = root->fs_info->running_transaction->transid;
+        cur_trans = root->fs_info->running_transaction;
-        h->transaction = root->fs_info->running_transaction;
+        cur_trans->use_count++;
-        h->blocks_reserved = num_blocks;
+        mutex_unlock(&root->fs_info->trans_mutex);
+        h->transid = cur_trans->transid;
+        h->transaction = cur_trans;
        h->blocks_used = 0;
        h->block_group = 0;
-        h->alloc_exclude_nr = 0;
+        h->bytes_reserved = 0;
-        h->alloc_exclude_start = 0;
        h->delayed_ref_updates = 0;
+        h->block_rsv = NULL;
-        if (!current->journal_info && type != TRANS_USERSPACE)
+        smp_mb();
-                current->journal_info = h;
+        if (cur_trans->blocked && may_wait_transaction(root, type)) {
+                btrfs_commit_transaction(h, root);
+                goto again;
+        }
+        if (num_items > 0) {
+                ret = btrfs_trans_reserve_metadata(h, root, num_items,
+                                                   &retries);
+                if (ret == -EAGAIN) {
+                        btrfs_commit_transaction(h, root);
+                        goto again;
+                }
+                if (ret < 0) {
+                        btrfs_end_transaction(h, root);
+                        return ERR_PTR(ret);
+                }
+        }
-        root->fs_info->running_transaction->use_count++;
+        mutex_lock(&root->fs_info->trans_mutex);
        record_root_in_trans(h, root);
        mutex_unlock(&root->fs_info->trans_mutex);
+        if (!current->journal_info && type != TRANS_USERSPACE)
+                current->journal_info = h;
        return h;
 }
 struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
-                                                   int num_blocks)
+                                                   int num_items)
 {
-        return start_transaction(root, num_blocks, TRANS_START);
+        return start_transaction(root, num_items, TRANS_START);
 }
 struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
                                                   int num_blocks)
 {
-        return start_transaction(root, num_blocks, TRANS_JOIN);
+        return start_transaction(root, 0, TRANS_JOIN);
 }
 struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
                                                         int num_blocks)
 {
-        return start_transaction(r, num_blocks, TRANS_USERSPACE);
+        return start_transaction(r, 0, TRANS_USERSPACE);
 }
 /* wait for a transaction commit to be fully complete */
@@ -286,10 +321,36 @@ void btrfs_throttle(struct btrfs_root *root)
        mutex_unlock(&root->fs_info->trans_mutex);
 }
+static int should_end_transaction(struct btrfs_trans_handle *trans,
+                                  struct btrfs_root *root)
+{
+        int ret;
+        ret = btrfs_block_rsv_check(trans, root,
+                                    &root->fs_info->global_block_rsv, 0, 5);
+        return ret ? 1 : 0;
+}
+int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
+                                 struct btrfs_root *root)
+{
+        struct btrfs_transaction *cur_trans = trans->transaction;
+        int updates;
+        if (cur_trans->blocked || cur_trans->delayed_refs.flushing)
+                return 1;
+        updates = trans->delayed_ref_updates;
+        trans->delayed_ref_updates = 0;
+        if (updates)
+                btrfs_run_delayed_refs(trans, root, updates);
+        return should_end_transaction(trans, root);
+}
 static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
                          struct btrfs_root *root, int throttle)
 {
-        struct btrfs_transaction *cur_trans;
+        struct btrfs_transaction *cur_trans = trans->transaction;
        struct btrfs_fs_info *info = root->fs_info;
        int count = 0;
@@ -313,9 +374,21 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
                count++;
        }
+        btrfs_trans_release_metadata(trans, root);
+        if (!root->fs_info->open_ioctl_trans &&
+            should_end_transaction(trans, root))
+                trans->transaction->blocked = 1;
+        if (cur_trans->blocked && !cur_trans->in_commit) {
+                if (throttle)
+                        return btrfs_commit_transaction(trans, root);
+                else
+                        wake_up_process(info->transaction_kthread);
+        }
        mutex_lock(&info->trans_mutex);
-        cur_trans = info->running_transaction;
+        WARN_ON(cur_trans != info->running_transaction);
-        WARN_ON(cur_trans != trans->transaction);
        WARN_ON(cur_trans->num_writers < 1);
        cur_trans->num_writers--;
@@ -603,6 +676,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
                        btrfs_free_log(trans, root);
                        btrfs_update_reloc_root(trans, root);
+                        btrfs_orphan_commit_root(trans, root);
                        if (root->commit_root != root->node) {
                                switch_commit_root(root);
@@ -627,30 +701,30 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
 int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
 {
        struct btrfs_fs_info *info = root->fs_info;
-        int ret;
        struct btrfs_trans_handle *trans;
+        int ret;
        unsigned long nr;
-        smp_mb();
+        if (xchg(&root->defrag_running, 1))
-        if (root->defrag_running)
                return 0;
-        trans = btrfs_start_transaction(root, 1);
        while (1) {
-                root->defrag_running = 1;
+                trans = btrfs_start_transaction(root, 0);
+                if (IS_ERR(trans))
+                        return PTR_ERR(trans);
                ret = btrfs_defrag_leaves(trans, root, cacheonly);
                nr = trans->blocks_used;
                btrfs_end_transaction(trans, root);
                btrfs_btree_balance_dirty(info->tree_root, nr);
                cond_resched();
-                trans = btrfs_start_transaction(root, 1);
                if (root->fs_info->closing || ret != -EAGAIN)
                        break;
        }
        root->defrag_running = 0;
-        smp_mb();
+        return ret;
-        btrfs_end_transaction(trans, root);
-        return 0;
 }
 #if 0
@@ -758,47 +832,63 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
        struct btrfs_root *root = pending->root;
        struct btrfs_root *parent_root;
        struct inode *parent_inode;
+        struct dentry *dentry;
        struct extent_buffer *tmp;
        struct extent_buffer *old;
        int ret;
-        u64 objectid;
+        int retries = 0;
-        int namelen;
+        u64 to_reserve = 0;
        u64 index = 0;
+        u64 objectid;
-        parent_inode = pending->dentry->d_parent->d_inode;
-        parent_root = BTRFS_I(parent_inode)->root;
        new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
        if (!new_root_item) {
-                ret = -ENOMEM;
+                pending->error = -ENOMEM;
                goto fail;
        }
        ret = btrfs_find_free_objectid(trans, tree_root, 0, &objectid);
-        if (ret)
+        if (ret) {
+                pending->error = ret;
                goto fail;
+        }
+        btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
+        btrfs_orphan_pre_snapshot(trans, pending, &to_reserve);
+        if (to_reserve > 0) {
+                ret = btrfs_block_rsv_add(trans, root, &pending->block_rsv,
+                                          to_reserve, &retries);
+                if (ret) {
+                        pending->error = ret;
+                        goto fail;
+                }
+        }
        key.objectid = objectid;
-        /* record when the snapshot was created in key.offset */
+        key.offset = (u64)-1;
-        key.offset = trans->transid;
+        key.type = BTRFS_ROOT_ITEM_KEY;
-        btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
-        memcpy(&pending->root_key, &key, sizeof(key));
+        trans->block_rsv = &pending->block_rsv;
-        pending->root_key.offset = (u64)-1;
+        dentry = pending->dentry;
+        parent_inode = dentry->d_parent->d_inode;
+        parent_root = BTRFS_I(parent_inode)->root;
        record_root_in_trans(trans, parent_root);
        /*
         * insert the directory item
         */
-        namelen = strlen(pending->name);
        ret = btrfs_set_inode_index(parent_inode, &index);
        BUG_ON(ret);
        ret = btrfs_insert_dir_item(trans, parent_root,
-                            pending->name, namelen,
+                                dentry->d_name.name, dentry->d_name.len,
-                            parent_inode->i_ino,
+                                parent_inode->i_ino, &key,
-                            &pending->root_key, BTRFS_FT_DIR, index);
+                                BTRFS_FT_DIR, index);
        BUG_ON(ret);
-        btrfs_i_size_write(parent_inode, parent_inode->i_size + namelen * 2);
+        btrfs_i_size_write(parent_inode, parent_inode->i_size +
+                                         dentry->d_name.len * 2);
        ret = btrfs_update_inode(trans, parent_root, parent_inode);
        BUG_ON(ret);
@@ -815,22 +905,32 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
        free_extent_buffer(old);
        btrfs_set_root_node(new_root_item, tmp);
-        ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
+        /* record when the snapshot was created in key.offset */
-                                new_root_item);
+        key.offset = trans->transid;
-        BUG_ON(ret);
+        ret = btrfs_insert_root(trans, tree_root, &key, new_root_item);
        btrfs_tree_unlock(tmp);
        free_extent_buffer(tmp);
+        BUG_ON(ret);
-        ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root,
+        /*
-                                 pending->root_key.objectid,
+         * insert root back/forward references
+         */
+        ret = btrfs_add_root_ref(trans, tree_root, objectid,
                                 parent_root->root_key.objectid,
-                                 parent_inode->i_ino, index, pending->name,
+                                 parent_inode->i_ino, index,
-                                 namelen);
+                                 dentry->d_name.name, dentry->d_name.len);
        BUG_ON(ret);
+        key.offset = (u64)-1;
+        pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key);
+        BUG_ON(IS_ERR(pending->snap));
+        btrfs_reloc_post_snapshot(trans, pending);
+        btrfs_orphan_post_snapshot(trans, pending);
 fail:
        kfree(new_root_item);
-        return ret;
+        btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1);
+        return 0;
 }
 /*
@@ -878,6 +978,16 @@ int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
        return ret;
 }
+int btrfs_transaction_blocked(struct btrfs_fs_info *info)
+{
+        int ret = 0;
+        spin_lock(&info->new_trans_lock);
+        if (info->running_transaction)
+                ret = info->running_transaction->blocked;
+        spin_unlock(&info->new_trans_lock);
+        return ret;
+}
 int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root)
 {
@@ -899,6 +1009,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
        ret = btrfs_run_delayed_refs(trans, root, 0);
        BUG_ON(ret);
+        btrfs_trans_release_metadata(trans, root);
        cur_trans = trans->transaction;
        /*
         * set the flushing flag so procs in this transaction have to
@@ -951,9 +1063,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
                        snap_pending = 1;
                WARN_ON(cur_trans != trans->transaction);
-                prepare_to_wait(&cur_trans->writer_wait, &wait,
-                                TASK_UNINTERRUPTIBLE);
                if (cur_trans->num_writers > 1)
                        timeout = MAX_SCHEDULE_TIMEOUT;
                else if (should_grow)
@@ -976,6 +1085,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
                 */
                btrfs_run_ordered_operations(root, 1);
+                prepare_to_wait(&cur_trans->writer_wait, &wait,
+                                TASK_UNINTERRUPTIBLE);
                smp_mb();
                if (cur_trans->num_writers > 1 || should_grow)
                        schedule_timeout(timeout);
@@ -1103,9 +1215,9 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root)
                if (btrfs_header_backref_rev(root->node) <
                    BTRFS_MIXED_BACKREF_REV)
-                        btrfs_drop_snapshot(root, 0);
+                        btrfs_drop_snapshot(root, NULL, 0);
                else
-                        btrfs_drop_snapshot(root, 1);
+                        btrfs_drop_snapshot(root, NULL, 1);
        }
        return 0;
 }
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 93c7ccb33118..e104986d0bfd 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -45,20 +45,23 @@ struct btrfs_transaction {
 struct btrfs_trans_handle {
        u64 transid;
+        u64 block_group;
+        u64 bytes_reserved;
        unsigned long blocks_reserved;
        unsigned long blocks_used;
-        struct btrfs_transaction *transaction;
-        u64 block_group;
-        u64 alloc_exclude_start;
-        u64 alloc_exclude_nr;
        unsigned long delayed_ref_updates;
+        struct btrfs_transaction *transaction;
+        struct btrfs_block_rsv *block_rsv;
 };
 struct btrfs_pending_snapshot {
        struct dentry *dentry;
        struct btrfs_root *root;
-        char *name;
+        struct btrfs_root *snap;
-        struct btrfs_key root_key;
+        /* block reservation for the operation */
+        struct btrfs_block_rsv block_rsv;
+        /* extra metadata reservation for relocation */
+        int error;
        struct list_head list;
 };
@@ -85,11 +88,11 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
 int btrfs_end_transaction(struct btrfs_trans_handle *trans,
                          struct btrfs_root *root);
 struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
-                                                   int num_blocks);
+                                                   int num_items);
 struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
-                                                   int num_blocks);
+                                                  int num_blocks);
 struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
-                                                   int num_blocks);
+                                                         int num_blocks);
 int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
                                     struct btrfs_root *root);
 int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
@@ -103,6 +106,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root);
 int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
                                   struct btrfs_root *root);
+int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
+                                 struct btrfs_root *root);
 void btrfs_throttle(struct btrfs_root *root);
 int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root);
@@ -112,5 +117,6 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
                                struct extent_io_tree *dirty_pages, int mark);
 int btrfs_wait_marked_extents(struct btrfs_root *root,
                                struct extent_io_tree *dirty_pages, int mark);
+int btrfs_transaction_blocked(struct btrfs_fs_info *info);
 int btrfs_transaction_in_commit(struct btrfs_fs_info *info);
 #endif
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index b10eacdb1620..f7ac8e013ed7 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -117,13 +117,14 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
                                 path->nodes[1], 0,
                                 cache_only, &last_ret,
                                 &root->defrag_progress);
-        WARN_ON(ret && ret != -EAGAIN);
+        if (ret) {
+                WARN_ON(ret == -EAGAIN);
+                goto out;
+        }
        if (next_key_ret == 0) {
                memcpy(&root->defrag_progress, &key, sizeof(key));
                ret = -EAGAIN;
        }
-        btrfs_release_path(root, path);
 out:
        if (path)
                btrfs_free_path(path);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index af57dd2b43d4..fb102a9aee9c 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -135,6 +135,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
                           struct btrfs_root *root)
 {
        int ret;
+        int err = 0;
        mutex_lock(&root->log_mutex);
        if (root->log_root) {
@@ -155,17 +156,19 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
        mutex_lock(&root->fs_info->tree_log_mutex);
        if (!root->fs_info->log_root_tree) {
                ret = btrfs_init_log_root_tree(trans, root->fs_info);
-                BUG_ON(ret);
+                if (ret)
+                        err = ret;
        }
-        if (!root->log_root) {
+        if (err == 0 && !root->log_root) {
                ret = btrfs_add_log_tree(trans, root);
-                BUG_ON(ret);
+                if (ret)
+                        err = ret;
        }
        mutex_unlock(&root->fs_info->tree_log_mutex);
        root->log_batch++;
        atomic_inc(&root->log_writers);
        mutex_unlock(&root->log_mutex);
-        return 0;
+        return err;
 }
 /*
@@ -376,7 +379,7 @@ insert:
                        BUG_ON(ret);
                }
        } else if (ret) {
-                BUG();
+                return ret;
        }
        dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
                                        path->slots[0]);
@@ -1699,9 +1702,9 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
                next = btrfs_find_create_tree_block(root, bytenr, blocksize);
-                wc->process_func(root, next, wc, ptr_gen);
                if (*level == 1) {
+                        wc->process_func(root, next, wc, ptr_gen);
                        path->slots[*level]++;
                        if (wc->free) {
                                btrfs_read_buffer(next, ptr_gen);
@@ -1734,35 +1737,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
        WARN_ON(*level < 0);
        WARN_ON(*level >= BTRFS_MAX_LEVEL);
-        if (path->nodes[*level] == root->node)
+        path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);
-                parent = path->nodes[*level];
-        else
-                parent = path->nodes[*level + 1];
-        bytenr = path->nodes[*level]->start;
-        blocksize = btrfs_level_size(root, *level);
-        root_owner = btrfs_header_owner(parent);
-        root_gen = btrfs_header_generation(parent);
-        wc->process_func(root, path->nodes[*level], wc,
-                         btrfs_header_generation(path->nodes[*level]));
-        if (wc->free) {
-                next = path->nodes[*level];
-                btrfs_tree_lock(next);
-                clean_tree_block(trans, root, next);
-                btrfs_set_lock_blocking(next);
-                btrfs_wait_tree_block_writeback(next);
-                btrfs_tree_unlock(next);
-                WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
-                ret = btrfs_free_reserved_extent(root, bytenr, blocksize);
-                BUG_ON(ret);
-        }
-        free_extent_buffer(path->nodes[*level]);
-        path->nodes[*level] = NULL;
-        *level += 1;
        cond_resched();
        return 0;
@@ -1781,7 +1756,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
        for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
                slot = path->slots[i];
-                if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
+                if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
                        struct extent_buffer *node;
                        node = path->nodes[i];
                        path->slots[i]++;
@@ -2047,7 +2022,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
        mutex_unlock(&log_root_tree->log_mutex);
        ret = update_log_root(trans, log);
-        BUG_ON(ret);
        mutex_lock(&log_root_tree->log_mutex);
        if (atomic_dec_and_test(&log_root_tree->log_writers)) {
@@ -2056,6 +2030,15 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
                        wake_up(&log_root_tree->log_writer_wait);
        }
+        if (ret) {
+                BUG_ON(ret != -ENOSPC);
+                root->fs_info->last_trans_log_full_commit = trans->transid;
+                btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
+                mutex_unlock(&log_root_tree->log_mutex);
+                ret = -EAGAIN;
+                goto out;
+        }
        index2 = log_root_tree->log_transid % 2;
        if (atomic_read(&log_root_tree->log_commit[index2])) {
                btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
@@ -2129,15 +2112,10 @@ out:
        return 0;
 }
-/*
+static void free_log_tree(struct btrfs_trans_handle *trans,
- * free all the extents used by the tree log.  This should be called
+                          struct btrfs_root *log)
- * at commit time of the full transaction
- */
-int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
 {
        int ret;
-        struct btrfs_root *log;
-        struct key;
        u64 start;
        u64 end;
        struct walk_control wc = {
@@ -2145,10 +2123,6 @@ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
                .process_func = process_one_buffer
        };
-        if (!root->log_root || root->fs_info->log_root_recovering)
-                return 0;
-        log = root->log_root;
        ret = walk_log_tree(trans, log, &wc);
        BUG_ON(ret);
@@ -2162,14 +2136,30 @@ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
                                  EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS);
        }
-        if (log->log_transid > 0) {
-                ret = btrfs_del_root(trans, root->fs_info->log_root_tree,
-                                     &log->root_key);
-                BUG_ON(ret);
-        }
-        root->log_root = NULL;
        free_extent_buffer(log->node);
        kfree(log);
+}
+/*
+ * free all the extents used by the tree log.  This should be called
+ * at commit time of the full transaction
+ */
+int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
+{
+        if (root->log_root) {
+                free_log_tree(trans, root->log_root);
+                root->log_root = NULL;
+        }
+        return 0;
+}
+int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
+                             struct btrfs_fs_info *fs_info)
+{
+        if (fs_info->log_root_tree) {
+                free_log_tree(trans, fs_info->log_root_tree);
+                fs_info->log_root_tree = NULL;
+        }
        return 0;
 }
@@ -2203,6 +2193,7 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
        struct btrfs_dir_item *di;
        struct btrfs_path *path;
        int ret;
+        int err = 0;
        int bytes_del = 0;
        if (BTRFS_I(dir)->logged_trans < trans->transid)
@@ -2218,7 +2209,11 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
        path = btrfs_alloc_path();
        di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino,
                                   name, name_len, -1);
-        if (di && !IS_ERR(di)) {
+        if (IS_ERR(di)) {
+                err = PTR_ERR(di);
+                goto fail;
+        }
+        if (di) {
                ret = btrfs_delete_one_dir_name(trans, log, path, di);
                bytes_del += name_len;
                BUG_ON(ret);
@@ -2226,7 +2221,11 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
        btrfs_release_path(log, path);
        di = btrfs_lookup_dir_index_item(trans, log, path, dir->i_ino,
                                         index, name, name_len, -1);
-        if (di && !IS_ERR(di)) {
+        if (IS_ERR(di)) {
+                err = PTR_ERR(di);
+                goto fail;
+        }
+        if (di) {
                ret = btrfs_delete_one_dir_name(trans, log, path, di);
                bytes_del += name_len;
                BUG_ON(ret);
@@ -2244,6 +2243,10 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
                btrfs_release_path(log, path);
                ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
+                if (ret < 0) {
+                        err = ret;
+                        goto fail;
+                }
                if (ret == 0) {
                        struct btrfs_inode_item *item;
                        u64 i_size;
@@ -2261,9 +2264,13 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
                        ret = 0;
                btrfs_release_path(log, path);
        }
+fail:
        btrfs_free_path(path);
        mutex_unlock(&BTRFS_I(dir)->log_mutex);
+        if (ret == -ENOSPC) {
+                root->fs_info->last_trans_log_full_commit = trans->transid;
+                ret = 0;
+        }
        btrfs_end_log_trans(root);
        return 0;
@@ -2291,6 +2298,10 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
        ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino,
                                  dirid, &index);
        mutex_unlock(&BTRFS_I(inode)->log_mutex);
+        if (ret == -ENOSPC) {
+                root->fs_info->last_trans_log_full_commit = trans->transid;
+                ret = 0;
+        }
        btrfs_end_log_trans(root);
        return ret;
@@ -2318,7 +2329,8 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
        else
                key.type = BTRFS_DIR_LOG_INDEX_KEY;
        ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
-        BUG_ON(ret);
+        if (ret)
+                return ret;
        item = btrfs_item_ptr(path->nodes[0], path->slots[0],
                              struct btrfs_dir_log_item);
@@ -2343,6 +2355,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
        struct btrfs_key max_key;
        struct btrfs_root *log = root->log_root;
        struct extent_buffer *src;
+        int err = 0;
        int ret;
        int i;
        int nritems;
@@ -2405,6 +2418,10 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
                        ret = overwrite_item(trans, log, dst_path,
                                             path->nodes[0], path->slots[0],
                                             &tmp);
+                        if (ret) {
+                                err = ret;
+                                goto done;
+                        }
                }
        }
        btrfs_release_path(root, path);
@@ -2432,7 +2449,10 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
                                goto done;
                        ret = overwrite_item(trans, log, dst_path, src, i,
                                             &min_key);
-                        BUG_ON(ret);
+                        if (ret) {
+                                err = ret;
+                                goto done;
+                        }
                }
                path->slots[0] = nritems;
@@ -2454,22 +2474,30 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
                        ret = overwrite_item(trans, log, dst_path,
                                             path->nodes[0], path->slots[0],
                                             &tmp);
+                        if (ret)
-                        BUG_ON(ret);
+                                err = ret;
-                        last_offset = tmp.offset;
+                        else
+                                last_offset = tmp.offset;
                        goto done;
                }
        }
 done:
-        *last_offset_ret = last_offset;
        btrfs_release_path(root, path);
        btrfs_release_path(log, dst_path);
-        /* insert the log range keys to indicate where the log is valid */
+        if (err == 0) {
-        ret = insert_dir_log_key(trans, log, path, key_type, inode->i_ino,
+                *last_offset_ret = last_offset;
-                                 first_offset, last_offset);
+                /*
-        BUG_ON(ret);
+                 * insert the log range keys to indicate where the log
-        return 0;
+                 * is valid
+                 */
+                ret = insert_dir_log_key(trans, log, path, key_type,
+                                         inode->i_ino, first_offset,
+                                         last_offset);
+                if (ret)
+                        err = ret;
+        }
+        return err;
 }
 /*
@@ -2501,7 +2529,8 @@ again:
                ret = log_dir_items(trans, root, inode, path,
                                    dst_path, key_type, min_key,
                                    &max_key);
-                BUG_ON(ret);
+                if (ret)
+                        return ret;
                if (max_key == (u64)-1)
                        break;
                min_key = max_key + 1;
@@ -2535,8 +2564,8 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
        while (1) {
                ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
+                BUG_ON(ret == 0);
-                if (ret != 1)
+                if (ret < 0)
                        break;
                if (path->slots[0] == 0)
@@ -2554,7 +2583,7 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
                btrfs_release_path(log, path);
        }
        btrfs_release_path(log, path);
-        return 0;
+        return ret;
 }
 static noinline int copy_items(struct btrfs_trans_handle *trans,
@@ -2587,7 +2616,10 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
        }
        ret = btrfs_insert_empty_items(trans, log, dst_path,
                                       ins_keys, ins_sizes, nr);
-        BUG_ON(ret);
+        if (ret) {
+                kfree(ins_data);
+                return ret;
+        }
        for (i = 0; i < nr; i++, dst_path->slots[0]++) {
                dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
@@ -2660,16 +2692,17 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
         * we have to do this after the loop above to avoid changing the
         * log tree while trying to change the log tree.
         */
+        ret = 0;
        while (!list_empty(&ordered_sums)) {
                struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
                                                   struct btrfs_ordered_sum,
                                                   list);
-                ret = btrfs_csum_file_blocks(trans, log, sums);
+                if (!ret)
-                BUG_ON(ret);
+                        ret = btrfs_csum_file_blocks(trans, log, sums);
                list_del(&sums->list);
                kfree(sums);
        }
-        return 0;
+        return ret;
 }
 /* log a single inode in the tree log.
@@ -2697,6 +2730,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
        struct btrfs_root *log = root->log_root;
        struct extent_buffer *src = NULL;
        u32 size;
+        int err = 0;
        int ret;
        int nritems;
        int ins_start_slot = 0;
@@ -2739,7 +2773,10 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
        } else {
                ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0);
        }
-        BUG_ON(ret);
+        if (ret) {
+                err = ret;
+                goto out_unlock;
+        }
        path->keep_locks = 1;
        while (1) {
@@ -2768,7 +2805,10 @@ again:
                ret = copy_items(trans, log, dst_path, src, ins_start_slot,
                                 ins_nr, inode_only);
-                BUG_ON(ret);
+                if (ret) {
+                        err = ret;
+                        goto out_unlock;
+                }
                ins_nr = 1;
                ins_start_slot = path->slots[0];
 next_slot:
@@ -2784,7 +2824,10 @@ next_slot:
                        ret = copy_items(trans, log, dst_path, src,
                                         ins_start_slot,
                                         ins_nr, inode_only);
-                        BUG_ON(ret);
+                        if (ret) {
+                                err = ret;
+                                goto out_unlock;
+                        }
                        ins_nr = 0;
                }
                btrfs_release_path(root, path);
@@ -2802,7 +2845,10 @@ next_slot:
                ret = copy_items(trans, log, dst_path, src,
                                 ins_start_slot,
                                 ins_nr, inode_only);
-                BUG_ON(ret);
+                if (ret) {
+                        err = ret;
+                        goto out_unlock;
+                }
                ins_nr = 0;
        }
        WARN_ON(ins_nr);
@@ -2810,14 +2856,18 @@ next_slot:
                btrfs_release_path(root, path);
                btrfs_release_path(log, dst_path);
                ret = log_directory_changes(trans, root, inode, path, dst_path);
-                BUG_ON(ret);
+                if (ret) {
+                        err = ret;
+                        goto out_unlock;
+                }
        }
        BTRFS_I(inode)->logged_trans = trans->transid;
+out_unlock:
        mutex_unlock(&BTRFS_I(inode)->log_mutex);
        btrfs_free_path(path);
        btrfs_free_path(dst_path);
-        return 0;
+        return err;
 }
 /*
@@ -2942,10 +2992,13 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
                goto end_no_trans;
        }
-        start_log_trans(trans, root);
+        ret = start_log_trans(trans, root);
+        if (ret)
+                goto end_trans;
        ret = btrfs_log_inode(trans, root, inode, inode_only);
-        BUG_ON(ret);
+        if (ret)
+                goto end_trans;
        /*
         * for regular files, if its inode is already on disk, we don't
@@ -2955,8 +3008,10 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
         */
        if (S_ISREG(inode->i_mode) &&
            BTRFS_I(inode)->generation <= last_committed &&
-            BTRFS_I(inode)->last_unlink_trans <= last_committed)
+            BTRFS_I(inode)->last_unlink_trans <= last_committed) {
-                        goto no_parent;
+                ret = 0;
+                goto end_trans;
+        }
        inode_only = LOG_INODE_EXISTS;
        while (1) {
@@ -2970,15 +3025,21 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
                if (BTRFS_I(inode)->generation >
                    root->fs_info->last_trans_committed) {
                        ret = btrfs_log_inode(trans, root, inode, inode_only);
-                        BUG_ON(ret);
+                        if (ret)
+                                goto end_trans;
                }
                if (IS_ROOT(parent))
                        break;
                parent = parent->d_parent;
        }
-no_parent:
        ret = 0;
+end_trans:
+        if (ret < 0) {
+                BUG_ON(ret != -ENOSPC);
+                root->fs_info->last_trans_log_full_commit = trans->transid;
+                ret = 1;
+        }
        btrfs_end_log_trans(root);
 end_no_trans:
        return ret;
@@ -3020,7 +3081,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
        path = btrfs_alloc_path();
        BUG_ON(!path);
-        trans = btrfs_start_transaction(fs_info->tree_root, 1);
+        trans = btrfs_start_transaction(fs_info->tree_root, 0);
        wc.trans = trans;
        wc.pin = 1;
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 0776eacb5083..3dfae84c8cc8 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -25,6 +25,8 @@
 int btrfs_sync_log(struct btrfs_trans_handle *trans,
                   struct btrfs_root *root);
 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);
+int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
+                             struct btrfs_fs_info *fs_info);
 int btrfs_recover_log_trees(struct btrfs_root *tree_root);
 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
                          struct btrfs_root *root, struct dentry *dentry);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 8db7b14bbae8..d6e3af8be95b 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1097,7 +1097,7 @@ static int btrfs_rm_dev_item(struct btrfs_root *root,
        if (!path)
                return -ENOMEM;
-        trans = btrfs_start_transaction(root, 1);
+        trans = btrfs_start_transaction(root, 0);
        key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
        key.type = BTRFS_DEV_ITEM_KEY;
        key.offset = device->devid;
@@ -1486,7 +1486,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
                goto error;
        }
-        trans = btrfs_start_transaction(root, 1);
+        trans = btrfs_start_transaction(root, 0);
        lock_chunks(root);
        device->barriers = 1;
@@ -1751,9 +1751,10 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
        /* step one, relocate all the extents inside this chunk */
        ret = btrfs_relocate_block_group(extent_root, chunk_offset);
-        BUG_ON(ret);
+        if (ret)
+                return ret;
-        trans = btrfs_start_transaction(root, 1);
+        trans = btrfs_start_transaction(root, 0);
        BUG_ON(!trans);
        lock_chunks(root);
@@ -1925,7 +1926,7 @@ int btrfs_balance(struct btrfs_root *dev_root)
                        break;
                BUG_ON(ret);
-                trans = btrfs_start_transaction(dev_root, 1);
+                trans = btrfs_start_transaction(dev_root, 0);
                BUG_ON(!trans);
                ret = btrfs_grow_device(trans, device, old_size);
@@ -2094,11 +2095,7 @@ again:
        }
        /* Shrinking succeeded, else we would be at "done". */
-        trans = btrfs_start_transaction(root, 1);
+        trans = btrfs_start_transaction(root, 0);
-        if (!trans) {
-                ret = -ENOMEM;
-                goto done;
-        }
        lock_chunks(root);
        device->disk_total_bytes = new_size;
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 59acd3eb288a..88ecbb215878 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -154,15 +154,10 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans,
        if (trans)
                return do_setxattr(trans, inode, name, value, size, flags);
-        ret = btrfs_reserve_metadata_space(root, 2);
+        trans = btrfs_start_transaction(root, 2);
-        if (ret)
+        if (IS_ERR(trans))
-                return ret;
+                return PTR_ERR(trans);
-        trans = btrfs_start_transaction(root, 1);
-        if (!trans) {
-                ret = -ENOMEM;
-                goto out;
-        }
        btrfs_set_trans_block_group(trans, inode);
        ret = do_setxattr(trans, inode, name, value, size, flags);
@@ -174,7 +169,6 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans,
        BUG_ON(ret);
 out:
        btrfs_end_transaction_throttle(trans, root);
-        btrfs_unreserve_metadata_space(root, 2);
        return ret;
 }
diff --git a/fs/buffer.c b/fs/buffer.c
index e8aa7081d25c..d54812b198e9 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1949,14 +1949,11 @@ static int __block_commit_write(struct inode *inode, struct page *page,
 }
 /*
- * block_write_begin takes care of the basic task of block allocation and
+ * Filesystems implementing the new truncate sequence should use the
- * bringing partial write blocks uptodate first.
+ * _newtrunc postfix variant which won't incorrectly call vmtruncate.
- *
+ * The filesystem needs to handle block truncation upon failure.
- * If *pagep is not NULL, then block_write_begin uses the locked page
- * at *pagep rather than allocating its own. In this case, the page will
- * not be unlocked or deallocated on failure.
 */
-int block_write_begin(struct file *file, struct address_space *mapping,
+int block_write_begin_newtrunc(struct file *file, struct address_space *mapping,
                        loff_t pos, unsigned len, unsigned flags,
                        struct page **pagep, void **fsdata,
                        get_block_t *get_block)
@@ -1992,20 +1989,50 @@ int block_write_begin(struct file *file, struct address_space *mapping,
                        unlock_page(page);
                        page_cache_release(page);
                        *pagep = NULL;
-                        /*
-                         * prepare_write() may have instantiated a few blocks
-                         * outside i_size.  Trim these off again. Don't need
-                         * i_size_read because we hold i_mutex.
-                         */
-                        if (pos + len > inode->i_size)
-                                vmtruncate(inode, inode->i_size);
                }
        }
 out:
        return status;
 }
+EXPORT_SYMBOL(block_write_begin_newtrunc);
+/*
+ * block_write_begin - write_begin helper for the old truncate sequence.
+ *
+ * The basic task of block allocation and of bringing partial write
+ * blocks uptodate is delegated to block_write_begin_newtrunc(); this
+ * wrapper only adds the trimming of blocks past i_size on failure.
+ *
+ * If *pagep is not NULL, then the locked page at *pagep is used rather
+ * than a freshly allocated one. In this case, the page will not be
+ * unlocked or deallocated on failure.
+ *
+ * Returns the status of block_write_begin_newtrunc() unchanged.
+ */
+int block_write_begin(struct file *file, struct address_space *mapping,
+                        loff_t pos, unsigned len, unsigned flags,
+                        struct page **pagep, void **fsdata,
+                        get_block_t *get_block)
+{
+        struct inode *inode = mapping->host;
+        loff_t isize;
+        int err;
+        err = block_write_begin_newtrunc(file, mapping, pos, len, flags,
+                                        pagep, fsdata, get_block);
+        if (likely(!err))
+                return err;
+        /*
+         * Filesystems which pass down their own page cannot
+         * call into vmtruncate here because it would lead to lock
+         * inversion problems (*pagep is locked). This is a further
+         * example of where the old truncate sequence is inadequate.
+         */
+        if (*pagep != NULL)
+                return err;
+        /*
+         * prepare_write() may have instantiated a few blocks
+         * outside i_size.  Trim these off again. Don't need
+         * i_size_read because we hold i_mutex.
+         */
+        isize = inode->i_size;
+        if (pos + len > isize)
+                vmtruncate(inode, isize);
+        return err;
+}
 EXPORT_SYMBOL(block_write_begin);
 int block_write_end(struct file *file, struct address_space *mapping,
@@ -2324,7 +2351,7 @@ out:
 * For moronic filesystems that do not allow holes in file.
 * We may have to extend the file.
 */
-int cont_write_begin(struct file *file, struct address_space *mapping,
+int cont_write_begin_newtrunc(struct file *file, struct address_space *mapping,
                        loff_t pos, unsigned len, unsigned flags,
                        struct page **pagep, void **fsdata,
                        get_block_t *get_block, loff_t *bytes)
@@ -2345,11 +2372,30 @@ int cont_write_begin(struct file *file, struct address_space *mapping,
        }
        *pagep = NULL;
-        err = block_write_begin(file, mapping, pos, len,
+        err = block_write_begin_newtrunc(file, mapping, pos, len,
                                flags, pagep, fsdata, get_block);
 out:
        return err;
 }
+EXPORT_SYMBOL(cont_write_begin_newtrunc);
+/*
+ * cont_write_begin - old truncate sequence variant of
+ * cont_write_begin_newtrunc().
+ *
+ * All real work is done by cont_write_begin_newtrunc().  If that fails
+ * and the attempted write reached past i_size, vmtruncate() is called
+ * with the current i_size.  The status of the _newtrunc call is
+ * returned unchanged.
+ */
+int cont_write_begin(struct file *file, struct address_space *mapping,
+                        loff_t pos, unsigned len, unsigned flags,
+                        struct page **pagep, void **fsdata,
+                        get_block_t *get_block, loff_t *bytes)
+{
+        struct inode *inode = mapping->host;
+        loff_t isize;
+        int err;
+        err = cont_write_begin_newtrunc(file, mapping, pos, len, flags,
+                                        pagep, fsdata, get_block, bytes);
+        if (likely(!err))
+                return err;
+        isize = inode->i_size;
+        if (pos + len > isize)
+                vmtruncate(inode, isize);
+        return err;
+}
 EXPORT_SYMBOL(cont_write_begin);
 int block_prepare_write(struct page *page, unsigned from, unsigned to,
@@ -2381,7 +2427,7 @@ EXPORT_SYMBOL(block_commit_write);
 *
 * We are not allowed to take the i_mutex here so we have to play games to
 * protect against truncate races as the page could now be beyond EOF.  Because
- * vmtruncate() writes the inode size before removing pages, once we have the
+ * truncate writes the inode size before removing pages, once we have the
 * page lock we can determine safely if the page is beyond EOF. If it is not
 * beyond EOF, then the page is guaranteed safe against truncation until we
 * unlock the page.
@@ -2464,10 +2510,11 @@ static void attach_nobh_buffers(struct page *page, struct buffer_head *head)
 }
 /*
- * On entry, the page is fully not uptodate.
+ * Filesystems implementing the new truncate sequence should use the
- * On exit the page is fully uptodate in the areas outside (from,to)
+ * _newtrunc postfix variant which won't incorrectly call vmtruncate.
+ * The filesystem needs to handle block truncation upon failure.
 */
-int nobh_write_begin(struct file *file, struct address_space *mapping,
+int nobh_write_begin_newtrunc(struct file *file, struct address_space *mapping,
                        loff_t pos, unsigned len, unsigned flags,
                        struct page **pagep, void **fsdata,
                        get_block_t *get_block)
@@ -2500,8 +2547,8 @@ int nobh_write_begin(struct file *file, struct address_space *mapping,
                unlock_page(page);
                page_cache_release(page);
                *pagep = NULL;
-                return block_write_begin(file, mapping, pos, len, flags, pagep,
+                return block_write_begin_newtrunc(file, mapping, pos, len,
-                                        fsdata, get_block);
+                                        flags, pagep, fsdata, get_block);
        }
        if (PageMappedToDisk(page))
@@ -2605,8 +2652,34 @@ out_release:
        page_cache_release(page);
        *pagep = NULL;
-        if (pos + len > inode->i_size)
+        return ret;
-                vmtruncate(inode, inode->i_size);
+}
+EXPORT_SYMBOL(nobh_write_begin_newtrunc);
+/*
+ * nobh_write_begin - old truncate sequence variant of
+ * nobh_write_begin_newtrunc().
+ *
+ * On entry, the page is fully not uptodate.
+ * On exit the page is fully uptodate in the areas outside (from,to)
+ *
+ * The work itself is done by nobh_write_begin_newtrunc(); this wrapper
+ * only adds the vmtruncate() call on failure and returns the status of
+ * the _newtrunc call unchanged.
+ */
+int nobh_write_begin(struct file *file, struct address_space *mapping,
+                        loff_t pos, unsigned len, unsigned flags,
+                        struct page **pagep, void **fsdata,
+                        get_block_t *get_block)
+{
+        int ret;
+        ret = nobh_write_begin_newtrunc(file, mapping, pos, len, flags,
+                                        pagep, fsdata, get_block);
+        /*
+         * prepare_write() may have instantiated a few blocks
+         * outside i_size.  Trim these off again. Don't need
+         * i_size_read because we hold i_mutex.
+         *
+         * Only done when the failed write would have reached past
+         * the current i_size.
+         */
+        if (unlikely(ret)) {
+                loff_t isize = mapping->host->i_size;
+                if (pos + len > isize)
+                        vmtruncate(mapping->host, isize);
+        }
        return ret;
 }
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index a9005d862ed4..d9c60b84949a 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -274,7 +274,6 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
        struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc;
        int rc = 0;
        struct page **pages;
-        struct pagevec pvec;
        loff_t offset;
        u64 len;
@@ -297,8 +296,6 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
        if (rc < 0)
                goto out;
-        /* set uptodate and add to lru in pagevec-sized chunks */
-        pagevec_init(&pvec, 0);
        for (; !list_empty(page_list) && len > 0;
             rc -= PAGE_CACHE_SIZE, len -= PAGE_CACHE_SIZE) {
                struct page *page =
@@ -312,7 +309,7 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
                        zero_user_segment(page, s, PAGE_CACHE_SIZE);
                }
-                if (add_to_page_cache(page, mapping, page->index, GFP_NOFS)) {
+                if (add_to_page_cache_lru(page, mapping, page->index, GFP_NOFS)) {
                        page_cache_release(page);
                        dout("readpages %p add_to_page_cache failed %p\n",
                             inode, page);
@@ -323,10 +320,8 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
                flush_dcache_page(page);
                SetPageUptodate(page);
                unlock_page(page);
-                if (pagevec_add(&pvec, page) == 0)
+                page_cache_release(page);
-                        pagevec_lru_add_file(&pvec);   /* add to lru */
        }
-        pagevec_lru_add_file(&pvec);
        rc = 0;
 out:
@@ -568,7 +563,7 @@ static void writepages_finish(struct ceph_osd_request *req,
        ceph_release_pages(req->r_pages, req->r_num_pages);
        if (req->r_pages_from_pool)
                mempool_free(req->r_pages,
-                             ceph_client(inode->i_sb)->wb_pagevec_pool);
+                             ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool);
        else
                kfree(req->r_pages);
        ceph_osdc_put_request(req);
diff --git a/fs/ceph/auth.c b/fs/ceph/auth.c
index 818afe72e6c7..89490beaf537 100644
--- a/fs/ceph/auth.c
+++ b/fs/ceph/auth.c
@@ -1,7 +1,6 @@
 #include "ceph_debug.h"
 #include <linux/module.h>
-#include <linux/slab.h>
 #include <linux/err.h>
 #include <linux/slab.h>
@@ -150,7 +149,8 @@ int ceph_build_auth_request(struct ceph_auth_client *ac,
        ret = ac->ops->build_request(ac, p + sizeof(u32), end);
        if (ret < 0) {
-                pr_err("error %d building request\n", ret);
+                pr_err("error %d building auth method %s request\n", ret,
+                       ac->ops->name);
                return ret;
        }
        dout(" built request %d bytes\n", ret);
@@ -229,7 +229,7 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac,
        if (ret == -EAGAIN) {
                return ceph_build_auth_request(ac, reply_buf, reply_len);
        } else if (ret) {
-                pr_err("authentication error %d\n", ret);
+                pr_err("auth method '%s' error %d\n", ac->ops->name, ret);
                return ret;
        }
        return 0;
@@ -246,7 +246,7 @@ int ceph_build_auth(struct ceph_auth_client *ac,
        if (!ac->protocol)
                return ceph_auth_build_hello(ac, msg_buf, msg_len);
        BUG_ON(!ac->ops);
-        if (!ac->ops->is_authenticated(ac))
+        if (ac->ops->should_authenticate(ac))
                return ceph_build_auth_request(ac, msg_buf, msg_len);
        return 0;
 }
diff --git a/fs/ceph/auth.h b/fs/ceph/auth.h
index ca4f57cfb267..d38a2fb4a137 100644
--- a/fs/ceph/auth.h
+++ b/fs/ceph/auth.h
@@ -15,6 +15,8 @@ struct ceph_auth_client;
 struct ceph_authorizer;
 struct ceph_auth_client_ops {
+        const char *name;
        /*
         * true if we are authenticated and can connect to
         * services.
@@ -22,6 +24,12 @@ struct ceph_auth_client_ops {
        int (*is_authenticated)(struct ceph_auth_client *ac);
        /*
+         * true if we should (re)authenticate, e.g., when our tickets
+         * are getting old and crusty.
+         */
+        int (*should_authenticate)(struct ceph_auth_client *ac);
+        /*
         * build requests and process replies during monitor
         * handshake.  if handle_reply returns -EAGAIN, we build
         * another request.
diff --git a/fs/ceph/auth_none.c b/fs/ceph/auth_none.c
index 8cd9e3af07f7..ad1dc21286c7 100644
--- a/fs/ceph/auth_none.c
+++ b/fs/ceph/auth_none.c
@@ -31,6 +31,13 @@ static int is_authenticated(struct ceph_auth_client *ac)
        return !xi->starting;
 }
+/*
+ * Report whether an authentication exchange should be (re)started:
+ * returns the "starting" flag kept in the client's private state.
+ */
+static int should_authenticate(struct ceph_auth_client *ac)
+{
+        const struct ceph_auth_none_info *info = ac->private;
+        return info->starting;
+}
 /*
 * the generic auth code decode the global_id, and we carry no actual
 * authenticate state, so nothing happens here.
@@ -94,9 +101,11 @@ static void ceph_auth_none_destroy_authorizer(struct ceph_auth_client *ac,
 }
 static const struct ceph_auth_client_ops ceph_auth_none_ops = {
+        .name = "none",
        .reset = reset,
        .destroy = destroy,
        .is_authenticated = is_authenticated,
+        .should_authenticate = should_authenticate,
        .handle_reply = handle_reply,
        .create_authorizer = ceph_auth_none_create_authorizer,
        .destroy_authorizer = ceph_auth_none_destroy_authorizer,
diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c
index fee5a08da881..83d4d2785ffe 100644
--- a/fs/ceph/auth_x.c
+++ b/fs/ceph/auth_x.c
@@ -27,6 +27,17 @@ static int ceph_x_is_authenticated(struct ceph_auth_client *ac)
        return (ac->want_keys & xi->have_keys) == ac->want_keys;
 }
+/*
+ * Returns 1 when ceph_x_validate_tickets() reports a nonzero "need"
+ * value (presumably the mask of keys still required -- confirm against
+ * ceph_x_validate_tickets()), 0 otherwise.  The want/need/have values
+ * are logged through dout().
+ */
+static int ceph_x_should_authenticate(struct ceph_auth_client *ac)
+{
+        struct ceph_x_info *xi = ac->private;
+        int missing;
+        ceph_x_validate_tickets(ac, &missing);
+        dout("ceph_x_should_authenticate want=%d need=%d have=%d\n",
+             ac->want_keys, missing, xi->have_keys);
+        return !!missing;
+}
 static int ceph_x_encrypt_buflen(int ilen)
 {
        return sizeof(struct ceph_x_encrypt_header) + ilen + 16 +
@@ -127,7 +138,7 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
        int ret;
        char *dbuf;
        char *ticket_buf;
-        u8 struct_v;
+        u8 reply_struct_v;
        dbuf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS);
        if (!dbuf)
@@ -139,14 +150,14 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
                goto out_dbuf;
        ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
-        struct_v = ceph_decode_8(&p);
+        reply_struct_v = ceph_decode_8(&p);
-        if (struct_v != 1)
+        if (reply_struct_v != 1)
                goto bad;
        num = ceph_decode_32(&p);
        dout("%d tickets\n", num);
        while (num--) {
                int type;
-                u8 struct_v;
+                u8 tkt_struct_v, blob_struct_v;
                struct ceph_x_ticket_handler *th;
                void *dp, *dend;
                int dlen;
@@ -165,8 +176,8 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
                type = ceph_decode_32(&p);
                dout(" ticket type %d %s\n", type, ceph_entity_type_name(type));
-                struct_v = ceph_decode_8(&p);
+                tkt_struct_v = ceph_decode_8(&p);
-                if (struct_v != 1)
+                if (tkt_struct_v != 1)
                        goto bad;
                th = get_ticket_handler(ac, type);
@@ -186,8 +197,8 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
                dend = dbuf + dlen;
                dp = dbuf;
-                struct_v = ceph_decode_8(&dp);
+                tkt_struct_v = ceph_decode_8(&dp);
-                if (struct_v != 1)
+                if (tkt_struct_v != 1)
                        goto bad;
                memcpy(&old_key, &th->session_key, sizeof(old_key));
@@ -224,7 +235,7 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
                tpend = tp + dlen;
                dout(" ticket blob is %d bytes\n", dlen);
                ceph_decode_need(&tp, tpend, 1 + sizeof(u64), bad);
-                struct_v = ceph_decode_8(&tp);
+                blob_struct_v = ceph_decode_8(&tp);
                new_secret_id = ceph_decode_64(&tp);
                ret = ceph_decode_buffer(&new_ticket_blob, &tp, tpend);
                if (ret)
@@ -618,7 +629,9 @@ static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,
 static const struct ceph_auth_client_ops ceph_x_ops = {
+        .name = "x",
        .is_authenticated = ceph_x_is_authenticated,
+        .should_authenticate = ceph_x_should_authenticate,
        .build_request = ceph_x_build_request,
        .handle_reply = ceph_x_handle_reply,
        .create_authorizer = ceph_x_create_authorizer,
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index d9400534b279..ae3e3a306445 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -867,7 +867,8 @@ void __ceph_remove_cap(struct ceph_cap *cap)
 {
        struct ceph_mds_session *session = cap->session;
        struct ceph_inode_info *ci = cap->ci;
-        struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc;
+        struct ceph_mds_client *mdsc =
+                &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
        int removed = 0;
        dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);
@@ -937,9 +938,9 @@ static int send_cap_msg(struct ceph_mds_session *session,
             seq, issue_seq, mseq, follows, size, max_size,
             xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0);
-        msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), 0, 0, NULL);
+        msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), GFP_NOFS);
-        if (IS_ERR(msg))
+        if (!msg)
-                return PTR_ERR(msg);
+                return -ENOMEM;
        msg->hdr.tid = cpu_to_le64(flush_tid);
@@ -1298,7 +1299,8 @@ static void ceph_flush_snaps(struct ceph_inode_info *ci)
 */
 void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
 {
-        struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc;
+        struct ceph_mds_client *mdsc =
+                &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
        struct inode *inode = &ci->vfs_inode;
        int was = ci->i_dirty_caps;
        int dirty = 0;
@@ -1336,7 +1338,7 @@ void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
 static int __mark_caps_flushing(struct inode *inode,
                                 struct ceph_mds_session *session)
 {
-        struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
+        struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
        struct ceph_inode_info *ci = ceph_inode(inode);
        int flushing;
@@ -1663,7 +1665,7 @@ ack:
 static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session,
                          unsigned *flush_tid)
 {
-        struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
+        struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
        struct ceph_inode_info *ci = ceph_inode(inode);
        int unlock_session = session ? 0 : 1;
        int flushing = 0;
@@ -1716,10 +1718,9 @@ out_unlocked:
 static int caps_are_flushed(struct inode *inode, unsigned tid)
 {
        struct ceph_inode_info *ci = ceph_inode(inode);
-        int dirty, i, ret = 1;
+        int i, ret = 1;
        spin_lock(&inode->i_lock);
-        dirty = __ceph_caps_dirty(ci);
        for (i = 0; i < CEPH_CAP_BITS; i++)
                if ((ci->i_flushing_caps & (1 << i)) &&
                    ci->i_cap_flush_tid[i] <= tid) {
@@ -1775,9 +1776,9 @@ out:
        spin_unlock(&ci->i_unsafe_lock);
 }
-int ceph_fsync(struct file *file, struct dentry *dentry, int datasync)
+int ceph_fsync(struct file *file, int datasync)
 {
-        struct inode *inode = dentry->d_inode;
+        struct inode *inode = file->f_mapping->host;
        struct ceph_inode_info *ci = ceph_inode(inode);
        unsigned flush_tid;
        int ret;
@@ -1829,7 +1830,8 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
                        err = wait_event_interruptible(ci->i_cap_wq,
                                       caps_are_flushed(inode, flush_tid));
        } else {
-                struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
+                struct ceph_mds_client *mdsc =
+                        &ceph_sb_to_client(inode->i_sb)->mdsc;
                spin_lock(&inode->i_lock);
                if (__ceph_caps_dirty(ci))
@@ -2411,7 +2413,7 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
        __releases(inode->i_lock)
 {
        struct ceph_inode_info *ci = ceph_inode(inode);
-        struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
+        struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
        unsigned seq = le32_to_cpu(m->seq);
        int dirty = le32_to_cpu(m->dirty);
        int cleaned = 0;
diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h
index 0c2241ef3653..2fa992eaf7da 100644
--- a/fs/ceph/ceph_fs.h
+++ b/fs/ceph/ceph_fs.h
@@ -19,7 +19,7 @@
 * Ceph release version
 */
 #define CEPH_VERSION_MAJOR 0
-#define CEPH_VERSION_MINOR 19
+#define CEPH_VERSION_MINOR 20
 #define CEPH_VERSION_PATCH 0
 #define _CEPH_STRINGIFY(x) #x
@@ -36,7 +36,7 @@
 * client-facing protocol.
 */
 #define CEPH_OSD_PROTOCOL     8 /* cluster internal */
-#define CEPH_MDS_PROTOCOL     9 /* cluster internal */
+#define CEPH_MDS_PROTOCOL    12 /* cluster internal */
 #define CEPH_MON_PROTOCOL     5 /* cluster internal */
 #define CEPH_OSDC_PROTOCOL   24 /* server/client */
 #define CEPH_MDSC_PROTOCOL   32 /* server/client */
@@ -53,8 +53,18 @@
 /*
 * feature bits
 */
-#define CEPH_FEATURE_SUPPORTED  0
+#define CEPH_FEATURE_UID        1
-#define CEPH_FEATURE_REQUIRED   0
+#define CEPH_FEATURE_NOSRCADDR  2
+#define CEPH_FEATURE_FLOCK      4
+/*
+ * SUPPORTED_* / REQUIRED_* feature masks.  Compound masks are
+ * parenthesized so that each macro expands to a single operand; without
+ * the parentheses an expression such as (x & CEPH_FEATURE_SUPPORTED_MDS)
+ * or ~CEPH_FEATURE_SUPPORTED_MON would bind to the first bit only.
+ */
+#define CEPH_FEATURE_SUPPORTED_MON  (CEPH_FEATURE_UID|CEPH_FEATURE_NOSRCADDR)
+#define CEPH_FEATURE_REQUIRED_MON   CEPH_FEATURE_UID
+#define CEPH_FEATURE_SUPPORTED_MDS  (CEPH_FEATURE_UID|CEPH_FEATURE_NOSRCADDR|CEPH_FEATURE_FLOCK)
+#define CEPH_FEATURE_REQUIRED_MDS   CEPH_FEATURE_UID
+#define CEPH_FEATURE_SUPPORTED_OSD  (CEPH_FEATURE_UID|CEPH_FEATURE_NOSRCADDR)
+#define CEPH_FEATURE_REQUIRED_OSD   CEPH_FEATURE_UID
+#define CEPH_FEATURE_SUPPORTED_CLIENT CEPH_FEATURE_NOSRCADDR
+#define CEPH_FEATURE_REQUIRED_CLIENT CEPH_FEATURE_NOSRCADDR
 /*
@@ -91,6 +101,8 @@ int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
 #define CEPH_AUTH_NONE          0x1
 #define CEPH_AUTH_CEPHX         0x2
+#define CEPH_AUTH_UID_DEFAULT ((__u64) -1)
 /*********************************************
 * message layer
@@ -128,11 +140,27 @@ int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
 #define CEPH_MSG_CLIENT_SNAP            0x312
 #define CEPH_MSG_CLIENT_CAPRELEASE      0x313
+/* pool ops */
+#define CEPH_MSG_POOLOP_REPLY           48
+#define CEPH_MSG_POOLOP                 49
 /* osd */
 #define CEPH_MSG_OSD_MAP          41
 #define CEPH_MSG_OSD_OP           42
 #define CEPH_MSG_OSD_OPREPLY      43
+/*
+ * pool operations
+ *
+ * Operation codes; ceph_pool_op_name() maps each one to a printable
+ * name.  The values fall into three numeric groups: 0x0x for the pool
+ * itself, 0x1x for pool snapshots, 0x2x for unmanaged snapshots.
+ */
+enum {
+  POOL_OP_CREATE                        = 0x01,
+  POOL_OP_DELETE                        = 0x02,
+  POOL_OP_AUID_CHANGE                   = 0x03,
+  POOL_OP_CREATE_SNAP                   = 0x11,
+  POOL_OP_DELETE_SNAP                   = 0x12,
+  POOL_OP_CREATE_UNMANAGED_SNAP         = 0x21,
+  POOL_OP_DELETE_UNMANAGED_SNAP         = 0x22,
+};
 struct ceph_mon_request_header {
        __le64 have_version;
        __le16 session_mon;
@@ -155,6 +183,31 @@ struct ceph_mon_statfs_reply {
        struct ceph_statfs st;
 } __attribute__ ((packed));
+/* Printable name for a POOL_OP_* code; "???" for any other value. */
+const char *ceph_pool_op_name(int op);
+/*
+ * Pool operation request.  Packed, with little-endian fields, and
+ * starting with the common monitor request header.
+ * NOTE(review): presumably the payload of a CEPH_MSG_POOLOP message,
+ * followed by name_len bytes of name -- confirm against the sender.
+ */
+struct ceph_mon_poolop {
+        struct ceph_mon_request_header monhdr;
+        struct ceph_fsid fsid;
+        __le32 pool;
+        __le32 op;        /* presumably a POOL_OP_* code -- TODO confirm */
+        __le64 auid;
+        __le64 snapid;
+        __le32 name_len;
+} __attribute__ ((packed));
+/*
+ * Reply to a pool operation.  Packed, with little-endian fields; data[]
+ * is a zero-length trailing array for optional variable-size content.
+ * NOTE(review): has_data presumably says whether data[] is populated --
+ * confirm against the reply handler.
+ */
+struct ceph_mon_poolop_reply {
+        struct ceph_mon_request_header monhdr;
+        struct ceph_fsid fsid;
+        __le32 reply_code;
+        __le32 epoch;
+        char has_data;
+        char data[0];
+} __attribute__ ((packed));
+/* Packed wrapper around a single little-endian 64-bit snapshot id. */
+struct ceph_mon_unmanaged_snap {
+        __le64 snapid;
+} __attribute__ ((packed));
 struct ceph_osd_getmap {
        struct ceph_mon_request_header monhdr;
        struct ceph_fsid fsid;
@@ -212,16 +265,17 @@ extern const char *ceph_mds_state_name(int s);
 *  - they also define the lock ordering by the MDS
 *  - a few of these are internal to the mds
 */
-#define CEPH_LOCK_DN          1
+#define CEPH_LOCK_DVERSION    1
-#define CEPH_LOCK_ISNAP       2
+#define CEPH_LOCK_DN          2
-#define CEPH_LOCK_IVERSION    4     /* mds internal */
+#define CEPH_LOCK_ISNAP       16
-#define CEPH_LOCK_IFILE       8     /* mds internal */
+#define CEPH_LOCK_IVERSION    32    /* mds internal */
-#define CEPH_LOCK_IAUTH       32
+#define CEPH_LOCK_IFILE       64
-#define CEPH_LOCK_ILINK       64
+#define CEPH_LOCK_IAUTH       128
-#define CEPH_LOCK_IDFT        128   /* dir frag tree */
+#define CEPH_LOCK_ILINK       256
-#define CEPH_LOCK_INEST       256   /* mds internal */
+#define CEPH_LOCK_IDFT        512   /* dir frag tree */
-#define CEPH_LOCK_IXATTR      512
+#define CEPH_LOCK_INEST       1024  /* mds internal */
-#define CEPH_LOCK_INO         2048  /* immutable inode bits; not a lock */
+#define CEPH_LOCK_IXATTR      2048
+#define CEPH_LOCK_INO         8192  /* immutable inode bits; not a lock */
 /* client_session ops */
 enum {
@@ -308,6 +362,7 @@ union ceph_mds_request_args {
        struct {
                __le32 frag;                 /* which dir fragment */
                __le32 max_entries;          /* how many dentries to grab */
+                __le32 max_bytes;
        } __attribute__ ((packed)) readdir;
        struct {
                __le32 mode;
diff --git a/fs/ceph/ceph_strings.c b/fs/ceph/ceph_strings.c
index 8e4be6a80c62..7503aee828ce 100644
--- a/fs/ceph/ceph_strings.c
+++ b/fs/ceph/ceph_strings.c
@@ -10,7 +10,6 @@ const char *ceph_entity_type_name(int type)
        case CEPH_ENTITY_TYPE_OSD: return "osd";
        case CEPH_ENTITY_TYPE_MON: return "mon";
        case CEPH_ENTITY_TYPE_CLIENT: return "client";
-        case CEPH_ENTITY_TYPE_ADMIN: return "admin";
        case CEPH_ENTITY_TYPE_AUTH: return "auth";
        default: return "unknown";
        }
@@ -45,6 +44,7 @@ const char *ceph_osd_op_name(int op)
        case CEPH_OSD_OP_SETXATTRS: return "setxattrs";
        case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs";
        case CEPH_OSD_OP_RMXATTR: return "rmxattr";
+        case CEPH_OSD_OP_CMPXATTR: return "cmpxattr";
        case CEPH_OSD_OP_PULL: return "pull";
        case CEPH_OSD_OP_PUSH: return "push";
@@ -174,3 +174,17 @@ const char *ceph_snap_op_name(int o)
        }
        return "???";
 }
+/*
+ * Map a POOL_OP_* code to a short human-readable name.
+ * Any value that is not one of the known codes yields "???".
+ */
+const char *ceph_pool_op_name(int op)
+{
+        const char *name = "???";
+        switch (op) {
+        case POOL_OP_CREATE:
+                name = "create";
+                break;
+        case POOL_OP_DELETE:
+                name = "delete";
+                break;
+        case POOL_OP_AUID_CHANGE:
+                name = "auid change";
+                break;
+        case POOL_OP_CREATE_SNAP:
+                name = "create snap";
+                break;
+        case POOL_OP_DELETE_SNAP:
+                name = "delete snap";
+                break;
+        case POOL_OP_CREATE_UNMANAGED_SNAP:
+                name = "create unmanaged snap";
+                break;
+        case POOL_OP_DELETE_UNMANAGED_SNAP:
+                name = "delete unmanaged snap";
+                break;
+        }
+        return name;
+}
diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
index f7048da92acc..3be33fb066cc 100644
--- a/fs/ceph/debugfs.c
+++ b/fs/ceph/debugfs.c
@@ -113,7 +113,7 @@ static int osdmap_show(struct seq_file *s, void *p)
 static int monc_show(struct seq_file *s, void *p)
 {
        struct ceph_client *client = s->private;
-        struct ceph_mon_statfs_request *req;
+        struct ceph_mon_generic_request *req;
        struct ceph_mon_client *monc = &client->monc;
        struct rb_node *rp;
@@ -126,9 +126,14 @@ static int monc_show(struct seq_file *s, void *p)
        if (monc->want_next_osdmap)
                seq_printf(s, "want next osdmap\n");
-        for (rp = rb_first(&monc->statfs_request_tree); rp; rp = rb_next(rp)) {
+        for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) {
-                req = rb_entry(rp, struct ceph_mon_statfs_request, node);
+                __u16 op;
-                seq_printf(s, "%lld statfs\n", req->tid);
+                req = rb_entry(rp, struct ceph_mon_generic_request, node);
+                op = le16_to_cpu(req->request->hdr.type);
+                if (op == CEPH_MSG_STATFS)
+                        seq_printf(s, "%lld statfs\n", req->tid);
+                else
+                        seq_printf(s, "%lld unknown\n", req->tid);
        }
        mutex_unlock(&monc->mutex);
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 650d2db5ed26..f85719310db2 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -51,8 +51,11 @@ int ceph_init_dentry(struct dentry *dentry)
                return -ENOMEM;          /* oh well */
        spin_lock(&dentry->d_lock);
-        if (dentry->d_fsdata) /* lost a race */
+        if (dentry->d_fsdata) {
+                /* lost a race */
+                kmem_cache_free(ceph_dentry_cachep, di);
                goto out_unlock;
+        }
        di->dentry = dentry;
        di->lease_session = NULL;
        dentry->d_fsdata = di;
@@ -125,7 +128,8 @@ more:
        dentry = list_entry(p, struct dentry, d_u.d_child);
        di = ceph_dentry(dentry);
        while (1) {
-                dout(" p %p/%p d_subdirs %p/%p\n", p->prev, p->next,
+                dout(" p %p/%p %s d_subdirs %p/%p\n", p->prev, p->next,
+                     d_unhashed(dentry) ? "!hashed" : "hashed",
                     parent->d_subdirs.prev, parent->d_subdirs.next);
                if (p == &parent->d_subdirs) {
                        fi->at_end = 1;
@@ -229,6 +233,7 @@ static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
        u32 ftype;
        struct ceph_mds_reply_info_parsed *rinfo;
        const int max_entries = client->mount_args->max_readdir;
+        const int max_bytes = client->mount_args->max_readdir_bytes;
        dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off);
        if (fi->at_end)
@@ -312,6 +317,7 @@ more:
                req->r_readdir_offset = fi->next_offset;
                req->r_args.readdir.frag = cpu_to_le32(frag);
                req->r_args.readdir.max_entries = cpu_to_le32(max_entries);
+                req->r_args.readdir.max_bytes = cpu_to_le32(max_bytes);
                req->r_num_caps = max_entries + 1;
                err = ceph_mdsc_do_request(mdsc, NULL, req);
                if (err < 0) {
@@ -335,7 +341,7 @@ more:
                if (req->r_reply_info.dir_end) {
                        kfree(fi->last_name);
                        fi->last_name = NULL;
-                        fi->next_offset = 0;
+                        fi->next_offset = 2;
                } else {
                        rinfo = &req->r_reply_info;
                        err = note_last_dentry(fi,
@@ -478,7 +484,7 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin)
 struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
                                  struct dentry *dentry, int err)
 {
-        struct ceph_client *client = ceph_client(dentry->d_sb);
+        struct ceph_client *client = ceph_sb_to_client(dentry->d_sb);
        struct inode *parent = dentry->d_parent->d_inode;
        /* .snap dir? */
@@ -568,7 +574,6 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
                    !is_root_ceph_dentry(dir, dentry) &&
                    (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
                    (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) {
-                        di->offset = ci->i_max_offset++;
                        spin_unlock(&dir->i_lock);
                        dout(" dir %p complete, -ENOENT\n", dir);
                        d_add(dentry, NULL);
@@ -582,7 +587,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
                CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
        req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
        if (IS_ERR(req))
-                return ERR_PTR(PTR_ERR(req));
+                return ERR_CAST(req);
        req->r_dentry = dget(dentry);
        req->r_num_caps = 2;
        /* we only need inode linkage */
@@ -888,13 +893,22 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
                /* ensure target dentry is invalidated, despite
                   rehashing bug in vfs_rename_dir */
-                new_dentry->d_time = jiffies;
+                ceph_invalidate_dentry_lease(new_dentry);
-                ceph_dentry(new_dentry)->lease_shared_gen = 0;
        }
        ceph_mdsc_put_request(req);
        return err;
 }
+/*
+ * Ensure a dentry lease will no longer revalidate.
+ *
+ * While holding dentry->d_lock, stamp d_time with the current jiffies
+ * value and zero the lease_shared_gen kept in the ceph dentry info, so
+ * both fields are updated together under the same lock.
+ */
+void ceph_invalidate_dentry_lease(struct dentry *dentry)
+{
+        spin_lock(&dentry->d_lock);
+        dentry->d_time = jiffies;
+        ceph_dentry(dentry)->lease_shared_gen = 0;
+        spin_unlock(&dentry->d_lock);
+}
 /*
 * Check if dentry lease is valid.  If not, delete the lease.  Try to
@@ -972,8 +986,9 @@ static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
        struct inode *dir = dentry->d_parent->d_inode;
-        dout("d_revalidate %p '%.*s' inode %p\n", dentry,
+        dout("d_revalidate %p '%.*s' inode %p offset %lld\n", dentry,
-             dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
+             dentry->d_name.len, dentry->d_name.name, dentry->d_inode,
+             ceph_dentry(dentry)->offset);
        /* always trust cached snapped dentries, snapdir dentry */
        if (ceph_snap(dir) != CEPH_NOSNAP) {
@@ -1050,7 +1065,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
        struct ceph_inode_info *ci = ceph_inode(inode);
        int left;
-        if (!ceph_test_opt(ceph_client(inode->i_sb), DIRSTAT))
+        if (!ceph_test_opt(ceph_sb_to_client(inode->i_sb), DIRSTAT))
                return -EISDIR;
        if (!cf->dir_info) {
@@ -1092,10 +1107,9 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
 * an fsync() on a dir will wait for any uncommitted directory
 * operations to commit.
 */
-static int ceph_dir_fsync(struct file *file, struct dentry *dentry,
+static int ceph_dir_fsync(struct file *file, int datasync)
-                          int datasync)
 {
-        struct inode *inode = dentry->d_inode;
+        struct inode *inode = file->f_path.dentry->d_inode;
        struct ceph_inode_info *ci = ceph_inode(inode);
        struct list_head *head = &ci->i_unsafe_dirops;
        struct ceph_mds_request *req;
@@ -1152,7 +1166,7 @@ void ceph_dentry_lru_add(struct dentry *dn)
        dout("dentry_lru_add %p %p '%.*s'\n", di, dn,
             dn->d_name.len, dn->d_name.name);
        if (di) {
-                mdsc = &ceph_client(dn->d_sb)->mdsc;
+                mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc;
                spin_lock(&mdsc->dentry_lru_lock);
                list_add_tail(&di->lru, &mdsc->dentry_lru);
                mdsc->num_dentry++;
@@ -1165,10 +1179,10 @@ void ceph_dentry_lru_touch(struct dentry *dn)
        struct ceph_dentry_info *di = ceph_dentry(dn);
        struct ceph_mds_client *mdsc;
-        dout("dentry_lru_touch %p %p '%.*s'\n", di, dn,
+        dout("dentry_lru_touch %p %p '%.*s' (offset %lld)\n", di, dn,
-             dn->d_name.len, dn->d_name.name);
+             dn->d_name.len, dn->d_name.name, di->offset);
        if (di) {
-                mdsc = &ceph_client(dn->d_sb)->mdsc;
+                mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc;
                spin_lock(&mdsc->dentry_lru_lock);
                list_move_tail(&di->lru, &mdsc->dentry_lru);
                spin_unlock(&mdsc->dentry_lru_lock);
@@ -1183,7 +1197,7 @@ void ceph_dentry_lru_del(struct dentry *dn)
        dout("dentry_lru_del %p %p '%.*s'\n", di, dn,
             dn->d_name.len, dn->d_name.name);
        if (di) {
-                mdsc = &ceph_client(dn->d_sb)->mdsc;
+                mdsc = &ceph_sb_to_client(dn->d_sb)->mdsc;
                spin_lock(&mdsc->dentry_lru_lock);
                list_del_init(&di->lru);
                mdsc->num_dentry--;
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 9d67572fb328..4480cb1c63e7 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -93,11 +93,11 @@ static struct dentry *__fh_to_dentry(struct super_block *sb,
                return ERR_PTR(-ESTALE);
        dentry = d_obtain_alias(inode);
-        if (!dentry) {
+        if (IS_ERR(dentry)) {
                pr_err("fh_to_dentry %llx -- inode %p but ENOMEM\n",
                       fh->ino, inode);
                iput(inode);
-                return ERR_PTR(-ENOMEM);
+                return dentry;
        }
        err = ceph_init_dentry(dentry);
@@ -115,7 +115,7 @@ static struct dentry *__fh_to_dentry(struct super_block *sb,
 static struct dentry *__cfh_to_dentry(struct super_block *sb,
                                      struct ceph_nfs_confh *cfh)
 {
-        struct ceph_mds_client *mdsc = &ceph_client(sb)->mdsc;
+        struct ceph_mds_client *mdsc = &ceph_sb_to_client(sb)->mdsc;
        struct inode *inode;
        struct dentry *dentry;
        struct ceph_vino vino;
@@ -133,7 +133,7 @@ static struct dentry *__cfh_to_dentry(struct super_block *sb,
                req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPHASH,
                                               USE_ANY_MDS);
                if (IS_ERR(req))
-                        return ERR_PTR(PTR_ERR(req));
+                        return ERR_CAST(req);
                req->r_ino1 = vino;
                req->r_ino2.ino = cfh->parent_ino;
@@ -149,11 +149,11 @@ static struct dentry *__cfh_to_dentry(struct super_block *sb,
        }
        dentry = d_obtain_alias(inode);
-        if (!dentry) {
+        if (IS_ERR(dentry)) {
                pr_err("cfh_to_dentry %llx -- inode %p but ENOMEM\n",
                       cfh->ino, inode);
                iput(inode);
-                return ERR_PTR(-ENOMEM);
+                return dentry;
        }
        err = ceph_init_dentry(dentry);
        if (err < 0) {
@@ -202,11 +202,11 @@ static struct dentry *ceph_fh_to_parent(struct super_block *sb,
                return ERR_PTR(-ESTALE);
        dentry = d_obtain_alias(inode);
-        if (!dentry) {
+        if (IS_ERR(dentry)) {
                pr_err("fh_to_parent %llx -- inode %p but ENOMEM\n",
                       cfh->ino, inode);
                iput(inode);
-                return ERR_PTR(-ENOMEM);
+                return dentry;
        }
        err = ceph_init_dentry(dentry);
        if (err < 0) {
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 7d634938edc9..6251a1574b94 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -230,7 +230,7 @@ struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
        /* do the open */
        req = prepare_open_request(dir->i_sb, flags, mode);
        if (IS_ERR(req))
-                return ERR_PTR(PTR_ERR(req));
+                return ERR_CAST(req);
        req->r_dentry = dget(dentry);
        req->r_num_caps = 2;
        if (flags & O_CREAT) {
@@ -317,16 +317,16 @@ void ceph_release_page_vector(struct page **pages, int num_pages)
 /*
 * allocate a vector new pages
 */
-static struct page **alloc_page_vector(int num_pages)
+struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags)
 {
        struct page **pages;
        int i;
-        pages = kmalloc(sizeof(*pages) * num_pages, GFP_NOFS);
+        pages = kmalloc(sizeof(*pages) * num_pages, flags);
        if (!pages)
                return ERR_PTR(-ENOMEM);
        for (i = 0; i < num_pages; i++) {
-                pages[i] = alloc_page(GFP_NOFS);
+                pages[i] = __page_cache_alloc(flags);
                if (pages[i] == NULL) {
                        ceph_release_page_vector(pages, i);
                        return ERR_PTR(-ENOMEM);
@@ -540,7 +540,7 @@ static ssize_t ceph_sync_read(struct file *file, char __user *data,
                 * in sequence.
                 */
        } else {
-                pages = alloc_page_vector(num_pages);
+                pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
        }
        if (IS_ERR(pages))
                return PTR_ERR(pages);
@@ -649,8 +649,8 @@ more:
                                    do_sync,
                                    ci->i_truncate_seq, ci->i_truncate_size,
                                    &mtime, false, 2);
-        if (IS_ERR(req))
+        if (!req)
-                return PTR_ERR(req);
+                return -ENOMEM;
        num_pages = calc_pages_for(pos, len);
@@ -668,7 +668,7 @@ more:
                truncate_inode_pages_range(inode->i_mapping, pos, 
                                           (pos+len) | (PAGE_CACHE_SIZE-1));
        } else {
-                pages = alloc_page_vector(num_pages);
+                pages = ceph_alloc_page_vector(num_pages, GFP_NOFS);
                if (IS_ERR(pages)) {
                        ret = PTR_ERR(pages);
                        goto out;
@@ -809,7 +809,7 @@ static ssize_t ceph_aio_write(struct kiocb *iocb, const struct iovec *iov,
        struct file *file = iocb->ki_filp;
        struct inode *inode = file->f_dentry->d_inode;
        struct ceph_inode_info *ci = ceph_inode(inode);
-        struct ceph_osd_client *osdc = &ceph_client(inode->i_sb)->osdc;
+        struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc;
        loff_t endoff = pos + iov->iov_len;
        int got = 0;
        int ret, err;
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 85b4d2ffdeba..226f5a50d362 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -69,7 +69,7 @@ struct inode *ceph_get_snapdir(struct inode *parent)
        BUG_ON(!S_ISDIR(parent->i_mode));
        if (IS_ERR(inode))
-                return ERR_PTR(PTR_ERR(inode));
+                return inode;
        inode->i_mode = parent->i_mode;
        inode->i_uid = parent->i_uid;
        inode->i_gid = parent->i_gid;
@@ -384,7 +384,7 @@ void ceph_destroy_inode(struct inode *inode)
         */
        if (ci->i_snap_realm) {
                struct ceph_mds_client *mdsc =
-                        &ceph_client(ci->vfs_inode.i_sb)->mdsc;
+                        &ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
                struct ceph_snap_realm *realm = ci->i_snap_realm;
                dout(" dropping residual ref to snap realm %p\n", realm);
@@ -619,11 +619,12 @@ static int fill_inode(struct inode *inode,
                        memcpy(ci->i_xattrs.blob->vec.iov_base,
                               iinfo->xattr_data, iinfo->xattr_len);
                ci->i_xattrs.version = le64_to_cpu(info->xattr_version);
+                xattr_blob = NULL;
        }
        inode->i_mapping->a_ops = &ceph_aops;
        inode->i_mapping->backing_dev_info =
-                &ceph_client(inode->i_sb)->backing_dev_info;
+                &ceph_sb_to_client(inode->i_sb)->backing_dev_info;
        switch (inode->i_mode & S_IFMT) {
        case S_IFIFO:
@@ -674,14 +675,15 @@ static int fill_inode(struct inode *inode,
                /* set dir completion flag? */
                if (ci->i_files == 0 && ci->i_subdirs == 0 &&
                    ceph_snap(inode) == CEPH_NOSNAP &&
-                    (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED)) {
+                    (le32_to_cpu(info->cap.caps) & CEPH_CAP_FILE_SHARED) &&
+                    (ci->i_ceph_flags & CEPH_I_COMPLETE) == 0) {
                        dout(" marking %p complete (empty)\n", inode);
                        ci->i_ceph_flags |= CEPH_I_COMPLETE;
                        ci->i_max_offset = 2;
                }
                /* it may be better to set st_size in getattr instead? */
-                if (ceph_test_opt(ceph_client(inode->i_sb), RBYTES))
+                if (ceph_test_opt(ceph_sb_to_client(inode->i_sb), RBYTES))
                        inode->i_size = ci->i_rbytes;
                break;
        default:
@@ -802,6 +804,37 @@ out_unlock:
 }
 /*
+ * Set dentry's directory position based on the current dir's max, and
+ * order it in d_subdirs, so that dcache_readdir behaves.
+ */
+static void ceph_set_dentry_offset(struct dentry *dn)
+{
+        struct dentry *dir = dn->d_parent;
+        struct inode *inode = dn->d_parent->d_inode;
+        struct ceph_dentry_info *di;
+        BUG_ON(!inode);
+        di = ceph_dentry(dn);
+        spin_lock(&inode->i_lock);
+        /* no offset is handed out unless the dir is flagged complete */
+        if ((ceph_inode(inode)->i_ceph_flags & CEPH_I_COMPLETE) == 0) {
+                spin_unlock(&inode->i_lock);
+                return;
+        }
+        /* take the next directory position, under the dir's i_lock */
+        di->offset = ceph_inode(inode)->i_max_offset++;
+        spin_unlock(&inode->i_lock);
+        spin_lock(&dcache_lock);
+        spin_lock(&dn->d_lock);
+        /*
+         * Reposition dn itself within the parent's child list.  The list
+         * helpers take (entry, head); passing the parent's d_subdirs as
+         * the entry would unlink the list head and re-insert it next to
+         * dn, rotating every sibling instead of moving this one dentry.
+         * NOTE(review): head insertion assumes the cached readdir walk
+         * consumes d_subdirs starting from the tail -- confirm against
+         * __dcache_readdir.
+         */
+        list_move(&dn->d_u.d_child, &dir->d_subdirs);
+        dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset,
+             dn->d_u.d_child.prev, dn->d_u.d_child.next);
+        spin_unlock(&dn->d_lock);
+        spin_unlock(&dcache_lock);
+}
+/*
 * splice a dentry to an inode.
 * caller must hold directory i_mutex for this to be safe.
 *
@@ -814,6 +847,8 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
 {
        struct dentry *realdn;
+        BUG_ON(dn->d_inode);
        /* dn must be unhashed */
        if (!d_unhashed(dn))
                d_drop(dn);
@@ -835,44 +870,17 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
                dn = realdn;
        } else {
                BUG_ON(!ceph_dentry(dn));
                dout("dn %p attached to %p ino %llx.%llx\n",
                     dn, dn->d_inode, ceph_vinop(dn->d_inode));
        }
        if ((!prehash || *prehash) && d_unhashed(dn))
                d_rehash(dn);
+        ceph_set_dentry_offset(dn);
 out:
        return dn;
 }
 /*
- * Set dentry's directory position based on the current dir's max, and
- * order it in d_subdirs, so that dcache_readdir behaves.
- */
-static void ceph_set_dentry_offset(struct dentry *dn)
-{
-        struct dentry *dir = dn->d_parent;
-        struct inode *inode = dn->d_parent->d_inode;
-        struct ceph_dentry_info *di;
-        BUG_ON(!inode);
-        di = ceph_dentry(dn);
-        spin_lock(&inode->i_lock);
-        di->offset = ceph_inode(inode)->i_max_offset++;
-        spin_unlock(&inode->i_lock);
-        spin_lock(&dcache_lock);
-        spin_lock(&dn->d_lock);
-        list_move_tail(&dir->d_subdirs, &dn->d_u.d_child);
-        dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset,
-             dn->d_u.d_child.prev, dn->d_u.d_child.next);
-        spin_unlock(&dn->d_lock);
-        spin_unlock(&dcache_lock);
-}
-/*
 * Incorporate results into the local cache.  This is either just
 * one inode, or a directory, dentry, and possibly linked-to inode (e.g.,
 * after a lookup).
@@ -933,14 +941,8 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
        if (!rinfo->head->is_target && !rinfo->head->is_dentry) {
                dout("fill_trace reply is empty!\n");
-                if (rinfo->head->result == 0 && req->r_locked_dir) {
+                if (rinfo->head->result == 0 && req->r_locked_dir)
-                        struct ceph_inode_info *ci =
+                        ceph_invalidate_dir_request(req);
-                                ceph_inode(req->r_locked_dir);
-                        dout(" clearing %p complete (empty trace)\n",
-                             req->r_locked_dir);
-                        ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
-                        ci->i_release_count++;
-                }
                return 0;
        }
@@ -1011,13 +1013,18 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
                             req->r_old_dentry->d_name.len,
                             req->r_old_dentry->d_name.name,
                             dn, dn->d_name.len, dn->d_name.name);
                        /* ensure target dentry is invalidated, despite
                           rehashing bug in vfs_rename_dir */
-                        dn->d_time = jiffies;
+                        ceph_invalidate_dentry_lease(dn);
-                        ceph_dentry(dn)->lease_shared_gen = 0;
                        /* take overwritten dentry's readdir offset */
+                        dout("dn %p gets %p offset %lld (old offset %lld)\n",
+                             req->r_old_dentry, dn, ceph_dentry(dn)->offset,
+                             ceph_dentry(req->r_old_dentry)->offset);
                        ceph_dentry(req->r_old_dentry)->offset =
                                ceph_dentry(dn)->offset;
                        dn = req->r_old_dentry;  /* use old_dentry */
                        in = dn->d_inode;
                }
@@ -1059,7 +1066,6 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
                                goto done;
                        }
                        req->r_dentry = dn;  /* may have spliced */
-                        ceph_set_dentry_offset(dn);
                        igrab(in);
                } else if (ceph_ino(in) == vino.ino &&
                           ceph_snap(in) == vino.snap) {
@@ -1102,7 +1108,6 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
                        err = PTR_ERR(dn);
                        goto done;
                }
-                ceph_set_dentry_offset(dn);
                req->r_dentry = dn;  /* may have spliced */
                igrab(in);
                rinfo->head->is_dentry = 1;  /* fool notrace handlers */
@@ -1429,7 +1434,7 @@ void ceph_queue_vmtruncate(struct inode *inode)
 {
        struct ceph_inode_info *ci = ceph_inode(inode);
-        if (queue_work(ceph_client(inode->i_sb)->trunc_wq,
+        if (queue_work(ceph_sb_to_client(inode->i_sb)->trunc_wq,
                       &ci->i_vmtruncate_work)) {
                dout("ceph_queue_vmtruncate %p\n", inode);
                igrab(inode);
@@ -1518,7 +1523,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
        struct inode *parent_inode = dentry->d_parent->d_inode;
        const unsigned int ia_valid = attr->ia_valid;
        struct ceph_mds_request *req;
-        struct ceph_mds_client *mdsc = &ceph_client(dentry->d_sb)->mdsc;
+        struct ceph_mds_client *mdsc = &ceph_sb_to_client(dentry->d_sb)->mdsc;
        int issued;
        int release = 0, dirtied = 0;
        int mask = 0;
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index 8a5bcae62846..d085f07756b4 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -98,7 +98,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
        struct ceph_ioctl_dataloc dl;
        struct inode *inode = file->f_dentry->d_inode;
        struct ceph_inode_info *ci = ceph_inode(inode);
-        struct ceph_osd_client *osdc = &ceph_client(inode->i_sb)->osdc;
+        struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->osdc;
        u64 len = 1, olen;
        u64 tmp;
        struct ceph_object_layout ol;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 24561a557e01..b49f12822cbc 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -40,7 +40,7 @@
 static void __wake_requests(struct ceph_mds_client *mdsc,
                            struct list_head *head);
-const static struct ceph_connection_operations mds_con_ops;
+static const struct ceph_connection_operations mds_con_ops;
 /*
@@ -665,10 +665,10 @@ static struct ceph_msg *create_session_msg(u32 op, u64 seq)
        struct ceph_msg *msg;
        struct ceph_mds_session_head *h;
-        msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), 0, 0, NULL);
+        msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), GFP_NOFS);
-        if (IS_ERR(msg)) {
+        if (!msg) {
                pr_err("create_session_msg ENOMEM creating msg\n");
-                return ERR_PTR(PTR_ERR(msg));
+                return NULL;
        }
        h = msg->front.iov_base;
        h->op = cpu_to_le32(op);
@@ -687,7 +687,6 @@ static int __open_session(struct ceph_mds_client *mdsc,
        struct ceph_msg *msg;
        int mstate;
        int mds = session->s_mds;
-        int err = 0;
        /* wait for mds to go active? */
        mstate = ceph_mdsmap_get_state(mdsc->mdsmap, mds);
@@ -698,13 +697,9 @@ static int __open_session(struct ceph_mds_client *mdsc,
        /* send connect message */
        msg = create_session_msg(CEPH_SESSION_REQUEST_OPEN, session->s_seq);
-        if (IS_ERR(msg)) {
+        if (!msg)
-                err = PTR_ERR(msg);
+                return -ENOMEM;
-                goto out;
-        }
        ceph_con_send(&session->s_con, msg);
-out:
        return 0;
 }
@@ -804,12 +799,49 @@ out:
 }
 static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
-                                   void *arg)
+                                  void *arg)
 {
        struct ceph_inode_info *ci = ceph_inode(inode);
+        int drop = 0;
        dout("removing cap %p, ci is %p, inode is %p\n",
             cap, ci, &ci->vfs_inode);
-        ceph_remove_cap(cap);
+        spin_lock(&inode->i_lock);
+        __ceph_remove_cap(cap);
+        if (!__ceph_is_any_real_caps(ci)) {
+                struct ceph_mds_client *mdsc =
+                        &ceph_sb_to_client(inode->i_sb)->mdsc;
+                spin_lock(&mdsc->cap_dirty_lock);
+                if (!list_empty(&ci->i_dirty_item)) {
+                        pr_info(" dropping dirty %s state for %p %lld\n",
+                                ceph_cap_string(ci->i_dirty_caps),
+                                inode, ceph_ino(inode));
+                        ci->i_dirty_caps = 0;
+                        list_del_init(&ci->i_dirty_item);
+                        drop = 1;
+                }
+                if (!list_empty(&ci->i_flushing_item)) {
+                        pr_info(" dropping dirty+flushing %s state for %p %lld\n",
+                                ceph_cap_string(ci->i_flushing_caps),
+                                inode, ceph_ino(inode));
+                        ci->i_flushing_caps = 0;
+                        list_del_init(&ci->i_flushing_item);
+                        mdsc->num_cap_flushing--;
+                        drop = 1;
+                }
+                if (drop && ci->i_wrbuffer_ref) {
+                        pr_info(" dropping dirty data for %p %lld\n",
+                                inode, ceph_ino(inode));
+                        ci->i_wrbuffer_ref = 0;
+                        ci->i_wrbuffer_ref_head = 0;
+                        drop++;
+                }
+                spin_unlock(&mdsc->cap_dirty_lock);
+        }
+        spin_unlock(&inode->i_lock);
+        while (drop--)
+                iput(inode);
        return 0;
 }
@@ -821,6 +853,7 @@ static void remove_session_caps(struct ceph_mds_session *session)
        dout("remove_session_caps on %p\n", session);
        iterate_session_caps(session, remove_session_caps_cb, NULL);
        BUG_ON(session->s_nr_caps > 0);
+        BUG_ON(!list_empty(&session->s_cap_flushing));
        cleanup_cap_releases(session);
 }
@@ -883,8 +916,8 @@ static int send_renew_caps(struct ceph_mds_client *mdsc,
                ceph_mds_state_name(state));
        msg = create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS,
                                 ++session->s_renew_seq);
-        if (IS_ERR(msg))
+        if (!msg)
-                return PTR_ERR(msg);
+                return -ENOMEM;
        ceph_con_send(&session->s_con, msg);
        return 0;
 }
@@ -931,17 +964,15 @@ static int request_close_session(struct ceph_mds_client *mdsc,
                                 struct ceph_mds_session *session)
 {
        struct ceph_msg *msg;
-        int err = 0;
        dout("request_close_session mds%d state %s seq %lld\n",
             session->s_mds, session_state_name(session->s_state),
             session->s_seq);
        msg = create_session_msg(CEPH_SESSION_REQUEST_CLOSE, session->s_seq);
-        if (IS_ERR(msg))
+        if (!msg)
-                err = PTR_ERR(msg);
+                return -ENOMEM;
-        else
+        ceph_con_send(&session->s_con, msg);
-                ceph_con_send(&session->s_con, msg);
+        return 0;
-        return err;
 }
 /*
@@ -1059,7 +1090,7 @@ static int add_cap_releases(struct ceph_mds_client *mdsc,
        while (session->s_num_cap_releases < session->s_nr_caps + extra) {
                spin_unlock(&session->s_cap_lock);
                msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE,
-                                   0, 0, NULL);
+                                   GFP_NOFS);
                if (!msg)
                        goto out_unlocked;
                dout("add_cap_releases %p msg %p now %d\n", session, msg,
@@ -1151,10 +1182,8 @@ static void send_cap_releases(struct ceph_mds_client *mdsc,
        struct ceph_msg *msg;
        dout("send_cap_releases mds%d\n", session->s_mds);
-        while (1) {
+        spin_lock(&session->s_cap_lock);
-                spin_lock(&session->s_cap_lock);
+        while (!list_empty(&session->s_cap_releases_done)) {
-                if (list_empty(&session->s_cap_releases_done))
-                        break;
                msg = list_first_entry(&session->s_cap_releases_done,
                                 struct ceph_msg, list_head);
                list_del_init(&msg->list_head);
@@ -1162,10 +1191,49 @@ static void send_cap_releases(struct ceph_mds_client *mdsc,
                msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
                dout("send_cap_releases mds%d %p\n", session->s_mds, msg);
                ceph_con_send(&session->s_con, msg);
+                spin_lock(&session->s_cap_lock);
        }
        spin_unlock(&session->s_cap_lock);
 }
+/*
+ * Discard the cap release records queued on @session without sending
+ * them.  Every record dropped is credited back to
+ * session->s_num_cap_releases, and every message ends up on
+ * session->s_cap_releases with an empty (header-only) payload.
+ * Takes and releases session->s_cap_lock.  @mdsc is not used here.
+ */
+static void discard_cap_releases(struct ceph_mds_client *mdsc,
+                                 struct ceph_mds_session *session)
+{
+        struct ceph_msg *msg;
+        struct ceph_mds_cap_release *head;
+        unsigned num;
+        dout("discard_cap_releases mds%d\n", session->s_mds);
+        spin_lock(&session->s_cap_lock);
+        /*
+         * zero out the in-progress message, if any: list_first_entry()
+         * on an empty list does not return NULL, it returns a pointer
+         * computed from the list head, which must not be written to.
+         */
+        if (!list_empty(&session->s_cap_releases)) {
+                msg = list_first_entry(&session->s_cap_releases,
+                                       struct ceph_msg, list_head);
+                head = msg->front.iov_base;
+                num = le32_to_cpu(head->num);
+                dout("discard_cap_releases mds%d %p %u\n", session->s_mds,
+                     msg, num);
+                head->num = cpu_to_le32(0);
+                /* same payload-length reset as for completed messages */
+                msg->front.iov_len = sizeof(*head);
+                session->s_num_cap_releases += num;
+        }
+        /* requeue completed messages */
+        while (!list_empty(&session->s_cap_releases_done)) {
+                msg = list_first_entry(&session->s_cap_releases_done,
+                                 struct ceph_msg, list_head);
+                list_del_init(&msg->list_head);
+                head = msg->front.iov_base;
+                num = le32_to_cpu(head->num);
+                dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg,
+                     num);
+                session->s_num_cap_releases += num;
+                head->num = cpu_to_le32(0);
+                msg->front.iov_len = sizeof(*head);
+                list_add(&msg->list_head, &session->s_cap_releases);
+        }
+        spin_unlock(&session->s_cap_lock);
+}
 /*
 * requests
 */
@@ -1181,6 +1249,7 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
        if (!req)
                return ERR_PTR(-ENOMEM);
+        mutex_init(&req->r_fill_mutex);
        req->r_started = jiffies;
        req->r_resend_mds = -1;
        INIT_LIST_HEAD(&req->r_unsafe_dir_item);
@@ -1251,7 +1320,7 @@ retry:
                        len += 1 + temp->d_name.len;
                temp = temp->d_parent;
                if (temp == NULL) {
-                        pr_err("build_path_dentry corrupt dentry %p\n", dentry);
+                        pr_err("build_path corrupt dentry %p\n", dentry);
                        return ERR_PTR(-EINVAL);
                }
        }
@@ -1267,7 +1336,7 @@ retry:
                struct inode *inode = temp->d_inode;
                if (inode && ceph_snap(inode) == CEPH_SNAPDIR) {
-                        dout("build_path_dentry path+%d: %p SNAPDIR\n",
+                        dout("build_path path+%d: %p SNAPDIR\n",
                             pos, temp);
                } else if (stop_on_nosnap && inode &&
                           ceph_snap(inode) == CEPH_NOSNAP) {
@@ -1278,20 +1347,18 @@ retry:
                                break;
                        strncpy(path + pos, temp->d_name.name,
                                temp->d_name.len);
-                        dout("build_path_dentry path+%d: %p '%.*s'\n",
-                             pos, temp, temp->d_name.len, path + pos);
                }
                if (pos)
                        path[--pos] = '/';
                temp = temp->d_parent;
                if (temp == NULL) {
-                        pr_err("build_path_dentry corrupt dentry\n");
+                        pr_err("build_path corrupt dentry\n");
                        kfree(path);
                        return ERR_PTR(-EINVAL);
                }
        }
        if (pos != 0) {
-                pr_err("build_path_dentry did not end path lookup where "
+                pr_err("build_path did not end path lookup where "
                       "expected, namelen is %d, pos is %d\n", len, pos);
                /* presumably this is only possible if racing with a
                   rename of one of the parent directories (we can not
@@ -1303,7 +1370,7 @@ retry:
        *base = ceph_ino(temp->d_inode);
        *plen = len;
-        dout("build_path_dentry on %p %d built %llx '%.*s'\n",
+        dout("build_path on %p %d built %llx '%.*s'\n",
             dentry, atomic_read(&dentry->d_count), *base, len, path);
        return path;
 }
@@ -1426,9 +1493,11 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
        if (req->r_old_dentry_drop)
                len += req->r_old_dentry->d_name.len;
-        msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, 0, 0, NULL);
+        msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, GFP_NOFS);
-        if (IS_ERR(msg))
+        if (!msg) {
+                msg = ERR_PTR(-ENOMEM);
                goto out_free2;
+        }
        msg->hdr.tid = cpu_to_le64(req->r_tid);
@@ -1517,9 +1586,9 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
        }
        msg = create_request_message(mdsc, req, mds);
        if (IS_ERR(msg)) {
-                req->r_reply = ERR_PTR(PTR_ERR(msg));
+                req->r_err = PTR_ERR(msg);
                complete_request(mdsc, req);
-                return -PTR_ERR(msg);
+                return PTR_ERR(msg);
        }
        req->r_request = msg;
@@ -1552,7 +1621,7 @@ static int __do_request(struct ceph_mds_client *mdsc,
        int mds = -1;
        int err = -EAGAIN;
-        if (req->r_reply)
+        if (req->r_err || req->r_got_result)
                goto out;
        if (req->r_timeout &&
@@ -1609,7 +1678,7 @@ out:
        return err;
 finish:
-        req->r_reply = ERR_PTR(err);
+        req->r_err = err;
        complete_request(mdsc, req);
        goto out;
 }
@@ -1630,10 +1699,9 @@ static void __wake_requests(struct ceph_mds_client *mdsc,
 /*
 * Wake up threads with requests pending for @mds, so that they can
- * resubmit their requests to a possibly different mds.  If @all is set,
+ * resubmit their requests to a possibly different mds.
- * wake up if their requests has been forwarded to @mds, too.
 */
-static void kick_requests(struct ceph_mds_client *mdsc, int mds, int all)
+static void kick_requests(struct ceph_mds_client *mdsc, int mds)
 {
        struct ceph_mds_request *req;
        struct rb_node *p;
@@ -1689,64 +1757,78 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
        __register_request(mdsc, req, dir);
        __do_request(mdsc, req);
-        /* wait */
+        if (req->r_err) {
-        if (!req->r_reply) {
+                err = req->r_err;
-                mutex_unlock(&mdsc->mutex);
+                __unregister_request(mdsc, req);
-                if (req->r_timeout) {
+                dout("do_request early error %d\n", err);
-                        err = (long)wait_for_completion_interruptible_timeout(
+                goto out;
-                                &req->r_completion, req->r_timeout);
-                        if (err == 0)
-                                req->r_reply = ERR_PTR(-EIO);
-                        else if (err < 0)
-                                req->r_reply = ERR_PTR(err);
-                } else {
-                        err = wait_for_completion_interruptible(
-                                &req->r_completion);
-                        if (err)
-                                req->r_reply = ERR_PTR(err);
-                }
-                mutex_lock(&mdsc->mutex);
        }
-        if (IS_ERR(req->r_reply)) {
+        /* wait */
-                err = PTR_ERR(req->r_reply);
+        mutex_unlock(&mdsc->mutex);
-                req->r_reply = NULL;
+        dout("do_request waiting\n");
+        if (req->r_timeout) {
+                err = (long)wait_for_completion_killable_timeout(
+                        &req->r_completion, req->r_timeout);
+                if (err == 0)
+                        err = -EIO;
+        } else {
+                err = wait_for_completion_killable(&req->r_completion);
+        }
+        dout("do_request waited, got %d\n", err);
+        mutex_lock(&mdsc->mutex);
-                if (err == -ERESTARTSYS) {
+        /* only abort if we didn't race with a real reply */
-                        /* aborted */
+        if (req->r_got_result) {
-                        req->r_aborted = true;
+                err = le32_to_cpu(req->r_reply_info.head->result);
+        } else if (err < 0) {
+                dout("aborted request %lld with %d\n", req->r_tid, err);
-                        if (req->r_locked_dir &&
+                /*
-                            (req->r_op & CEPH_MDS_OP_WRITE)) {
+                 * ensure we aren't running concurrently with
-                                struct ceph_inode_info *ci =
+                 * ceph_fill_trace or ceph_readdir_prepopulate, which
-                                        ceph_inode(req->r_locked_dir);
+                 * rely on locks (dir mutex) held by our caller.
+                 */
+                mutex_lock(&req->r_fill_mutex);
+                req->r_err = err;
+                req->r_aborted = true;
+                mutex_unlock(&req->r_fill_mutex);
-                                dout("aborted, clearing I_COMPLETE on %p\n", 
+                if (req->r_locked_dir &&
-                                     req->r_locked_dir);
+                    (req->r_op & CEPH_MDS_OP_WRITE))
-                                spin_lock(&req->r_locked_dir->i_lock);
+                        ceph_invalidate_dir_request(req);
-                                ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
-                                ci->i_release_count++;
-                                spin_unlock(&req->r_locked_dir->i_lock);
-                        }
-                } else {
-                        /* clean up this request */
-                        __unregister_request(mdsc, req);
-                        if (!list_empty(&req->r_unsafe_item))
-                                list_del_init(&req->r_unsafe_item);
-                        complete(&req->r_safe_completion);
-                }
-        } else if (req->r_err) {
-                err = req->r_err;
        } else {
-                err = le32_to_cpu(req->r_reply_info.head->result);
+                err = req->r_err;
        }
-        mutex_unlock(&mdsc->mutex);
+out:
+        mutex_unlock(&mdsc->mutex);
        dout("do_request %p done, result %d\n", req, err);
        return err;
 }
 /*
+ * Invalidate dir I_COMPLETE, dentry lease state on an aborted MDS
+ * namespace request.
+ */
+void ceph_invalidate_dir_request(struct ceph_mds_request *req)
+{
+        struct inode *dir = req->r_locked_dir;
+        struct ceph_inode_info *dir_ci = ceph_inode(dir);
+        dout("invalidate_dir_request %p (I_COMPLETE, lease(s))\n", dir);
+        /* both updates below are made under the directory's i_lock */
+        spin_lock(&dir->i_lock);
+        dir_ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
+        dir_ci->i_release_count++;
+        spin_unlock(&dir->i_lock);
+        /* drop lease state on whichever dentries the request references */
+        if (req->r_dentry)
+                ceph_invalidate_dentry_lease(req->r_dentry);
+        if (req->r_old_dentry)
+                ceph_invalidate_dentry_lease(req->r_old_dentry);
+}
+/*
 * Handle mds reply.
 *
 * We take the session mutex and parse and process the reply immediately.
@@ -1797,6 +1879,12 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
                mutex_unlock(&mdsc->mutex);
                goto out;
        }
+        if (req->r_got_safe && !head->safe) {
+                pr_warning("got unsafe after safe on %llu from mds%d\n",
+                           tid, mds);
+                mutex_unlock(&mdsc->mutex);
+                goto out;
+        }
        result = le32_to_cpu(head->result);
@@ -1838,11 +1926,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
                        mutex_unlock(&mdsc->mutex);
                        goto out;
                }
-        }
+        } else {
-        BUG_ON(req->r_reply);
-        if (!head->safe) {
                req->r_got_unsafe = true;
                list_add_tail(&req->r_unsafe_item, &req->r_session->s_unsafe);
        }
@@ -1871,21 +1955,30 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
        }
        /* insert trace into our cache */
+        mutex_lock(&req->r_fill_mutex);
        err = ceph_fill_trace(mdsc->client->sb, req, req->r_session);
        if (err == 0) {
                if (result == 0 && rinfo->dir_nr)
                        ceph_readdir_prepopulate(req, req->r_session);
                ceph_unreserve_caps(&req->r_caps_reservation);
        }
+        mutex_unlock(&req->r_fill_mutex);
        up_read(&mdsc->snap_rwsem);
 out_err:
-        if (err) {
+        mutex_lock(&mdsc->mutex);
-                req->r_err = err;
+        if (!req->r_aborted) {
+                if (err) {
+                        req->r_err = err;
+                } else {
+                        req->r_reply = msg;
+                        ceph_msg_get(msg);
+                        req->r_got_result = true;
+                }
        } else {
-                req->r_reply = msg;
+                dout("reply arrived after request %lld was aborted\n", tid);
-                ceph_msg_get(msg);
        }
+        mutex_unlock(&mdsc->mutex);
        add_cap_releases(mdsc, req->r_session, -1);
        mutex_unlock(&session->s_mutex);
@@ -1921,16 +2014,21 @@ static void handle_forward(struct ceph_mds_client *mdsc,
        mutex_lock(&mdsc->mutex);
        req = __lookup_request(mdsc, tid);
        if (!req) {
-                dout("forward %llu to mds%d - req dne\n", tid, next_mds);
+                dout("forward tid %llu to mds%d - req dne\n", tid, next_mds);
                goto out;  /* dup reply? */
        }
-        if (fwd_seq <= req->r_num_fwd) {
+        if (req->r_aborted) {
-                dout("forward %llu to mds%d - old seq %d <= %d\n",
+                dout("forward tid %llu aborted, unregistering\n", tid);
+                __unregister_request(mdsc, req);
+        } else if (fwd_seq <= req->r_num_fwd) {
+                dout("forward tid %llu to mds%d - old seq %d <= %d\n",
                     tid, next_mds, req->r_num_fwd, fwd_seq);
        } else {
                /* resend. forward race not possible; mds would drop */
-                dout("forward %llu to mds%d (we resend)\n", tid, next_mds);
+                dout("forward tid %llu to mds%d (we resend)\n", tid, next_mds);
+                BUG_ON(req->r_err);
+                BUG_ON(req->r_got_result);
                req->r_num_fwd = fwd_seq;
                req->r_resend_mds = next_mds;
                put_request_session(req);
@@ -1984,6 +2082,8 @@ static void handle_session(struct ceph_mds_session *session,
        switch (op) {
        case CEPH_SESSION_OPEN:
+                if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
+                        pr_info("mds%d reconnect success\n", session->s_mds);
                session->s_state = CEPH_MDS_SESSION_OPEN;
                renewed_caps(mdsc, session, 0);
                wake = 1;
@@ -1997,10 +2097,12 @@ static void handle_session(struct ceph_mds_session *session,
                break;
        case CEPH_SESSION_CLOSE:
+                if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
+                        pr_info("mds%d reconnect denied\n", session->s_mds);
                remove_session_caps(session);
                wake = 1; /* for good measure */
                complete(&mdsc->session_close_waiters);
-                kick_requests(mdsc, mds, 0);      /* cur only */
+                kick_requests(mdsc, mds);
                break;
        case CEPH_SESSION_STALE:
@@ -2132,54 +2234,44 @@ out:
 *
- * called with mdsc->mutex held.
+ * called without mdsc->mutex held (it is taken here to wake waiters).
 */
-static void send_mds_reconnect(struct ceph_mds_client *mdsc, int mds)
+static void send_mds_reconnect(struct ceph_mds_client *mdsc,
+                               struct ceph_mds_session *session)
 {
-        struct ceph_mds_session *session = NULL;
        struct ceph_msg *reply;
        struct rb_node *p;
+        int mds = session->s_mds;
        int err = -ENOMEM;
        struct ceph_pagelist *pagelist;
-        pr_info("reconnect to recovering mds%d\n", mds);
+        pr_info("mds%d reconnect start\n", mds);
        pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS);
        if (!pagelist)
                goto fail_nopagelist;
        ceph_pagelist_init(pagelist);
-        reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, 0, 0, NULL);
+        reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, GFP_NOFS);
-        if (IS_ERR(reply)) {
+        if (!reply)
-                err = PTR_ERR(reply);
                goto fail_nomsg;
-        }
-        /* find session */
-        session = __ceph_lookup_mds_session(mdsc, mds);
-        mutex_unlock(&mdsc->mutex);    /* drop lock for duration */
-        if (session) {
+        mutex_lock(&session->s_mutex);
-                mutex_lock(&session->s_mutex);
+        session->s_state = CEPH_MDS_SESSION_RECONNECTING;
+        session->s_seq = 0;
-                session->s_state = CEPH_MDS_SESSION_RECONNECTING;
+        ceph_con_open(&session->s_con,
-                session->s_seq = 0;
+                      ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
-                ceph_con_open(&session->s_con,
+        /* replay unsafe requests */
-                              ceph_mdsmap_get_addr(mdsc->mdsmap, mds));
+        replay_unsafe_requests(mdsc, session);
-                /* replay unsafe requests */
-                replay_unsafe_requests(mdsc, session);
-        } else {
-                dout("no session for mds%d, will send short reconnect\n",
-                     mds);
-        }
        down_read(&mdsc->snap_rwsem);
-        if (!session)
-                goto send;
        dout("session %p state %s\n", session,
             session_state_name(session->s_state));
+        /* drop old cap expires; we're about to reestablish that state */
+        discard_cap_releases(mdsc, session);
        /* traverse this session's caps */
        err = ceph_pagelist_encode_32(pagelist, session->s_nr_caps);
        if (err)
@@ -2208,36 +2300,29 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, int mds)
                        goto fail;
        }
-send:
        reply->pagelist = pagelist;
        reply->hdr.data_len = cpu_to_le32(pagelist->length);
        reply->nr_pages = calc_pages_for(0, pagelist->length);
        ceph_con_send(&session->s_con, reply);
-        session->s_state = CEPH_MDS_SESSION_OPEN;
        mutex_unlock(&session->s_mutex);
        mutex_lock(&mdsc->mutex);
        __wake_requests(mdsc, &session->s_waiting);
        mutex_unlock(&mdsc->mutex);
-        ceph_put_mds_session(session);
        up_read(&mdsc->snap_rwsem);
-        mutex_lock(&mdsc->mutex);
        return;
 fail:
        ceph_msg_put(reply);
        up_read(&mdsc->snap_rwsem);
        mutex_unlock(&session->s_mutex);
-        ceph_put_mds_session(session);
 fail_nomsg:
        ceph_pagelist_release(pagelist);
        kfree(pagelist);
 fail_nopagelist:
        pr_err("error %d preparing reconnect for mds%d\n", err, mds);
-        mutex_lock(&mdsc->mutex);
        return;
 }
@@ -2290,7 +2375,7 @@ static void check_new_map(struct ceph_mds_client *mdsc,
                        }
                        /* kick any requests waiting on the recovering mds */
-                        kick_requests(mdsc, i, 1);
+                        kick_requests(mdsc, i);
                } else if (oldstate == newstate) {
                        continue;  /* nothing new with this mds */
                }
@@ -2299,22 +2384,21 @@ static void check_new_map(struct ceph_mds_client *mdsc,
                 * send reconnect?
                 */
                if (s->s_state == CEPH_MDS_SESSION_RESTARTING &&
-                    newstate >= CEPH_MDS_STATE_RECONNECT)
+                    newstate >= CEPH_MDS_STATE_RECONNECT) {
-                        send_mds_reconnect(mdsc, i);
+                        mutex_unlock(&mdsc->mutex);
+                        send_mds_reconnect(mdsc, s);
+                        mutex_lock(&mdsc->mutex);
+                }
                /*
-                 * kick requests on any mds that has gone active.
+                 * kick request on any mds that has gone active.
-                 *
-                 * kick requests on cur or forwarder: we may have sent
-                 * the request to mds1, mds1 told us it forwarded it
-                 * to mds2, but then we learn mds1 failed and can't be
-                 * sure it successfully forwarded our request before
-                 * it died.
                 */
                if (oldstate < CEPH_MDS_STATE_ACTIVE &&
                    newstate >= CEPH_MDS_STATE_ACTIVE) {
-                        pr_info("mds%d reconnect completed\n", s->s_mds);
+                        if (oldstate != CEPH_MDS_STATE_CREATING &&
-                        kick_requests(mdsc, i, 1);
+                            oldstate != CEPH_MDS_STATE_STARTING)
+                                pr_info("mds%d recovery completed\n", s->s_mds);
+                        kick_requests(mdsc, i);
                        ceph_kick_flushing_caps(mdsc, s);
                        wake_up_session_caps(s, 1);
                }
@@ -2457,12 +2541,12 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
        dnamelen = dentry->d_name.len;
        len += dnamelen;
-        msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, 0, 0, NULL);
+        msg = ceph_msg_new(CEPH_MSG_CLIENT_LEASE, len, GFP_NOFS);
-        if (IS_ERR(msg))
+        if (!msg)
                return;
        lease = msg->front.iov_base;
        lease->action = action;
-        lease->mask = cpu_to_le16(CEPH_LOCK_DN);
+        lease->mask = cpu_to_le16(1);
        lease->ino = cpu_to_le64(ceph_vino(inode).ino);
        lease->first = lease->last = cpu_to_le64(ceph_vino(inode).snap);
        lease->seq = cpu_to_le32(seq);
@@ -2492,7 +2576,7 @@ void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode,
        BUG_ON(inode == NULL);
        BUG_ON(dentry == NULL);
-        BUG_ON(mask != CEPH_LOCK_DN);
+        BUG_ON(mask == 0);
        /* is dentry lease valid? */
        spin_lock(&dentry->d_lock);
@@ -2603,7 +2687,9 @@ static void delayed_work(struct work_struct *work)
                else
                        ceph_con_keepalive(&s->s_con);
                add_cap_releases(mdsc, s, -1);
-                send_cap_releases(mdsc, s);
+                if (s->s_state == CEPH_MDS_SESSION_OPEN ||
+                    s->s_state == CEPH_MDS_SESSION_HUNG)
+                        send_cap_releases(mdsc, s);
                mutex_unlock(&s->s_mutex);
                ceph_put_mds_session(s);
@@ -2620,6 +2706,9 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
        mdsc->client = client;
        mutex_init(&mdsc->mutex);
        mdsc->mdsmap = kzalloc(sizeof(*mdsc->mdsmap), GFP_NOFS);
+        if (mdsc->mdsmap == NULL)
+                return -ENOMEM;
        init_completion(&mdsc->safe_umount_waiters);
        init_completion(&mdsc->session_close_waiters);
        INIT_LIST_HEAD(&mdsc->waiting_for_map);
@@ -2645,6 +2734,7 @@ int ceph_mdsc_init(struct ceph_mds_client *mdsc, struct ceph_client *client)
        init_waitqueue_head(&mdsc->cap_flushing_wq);
        spin_lock_init(&mdsc->dentry_lru_lock);
        INIT_LIST_HEAD(&mdsc->dentry_lru);
        return 0;
 }
@@ -2740,6 +2830,9 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
 {
        u64 want_tid, want_flush;
+        if (mdsc->client->mount_state == CEPH_MOUNT_SHUTDOWN)
+                return;
        dout("sync\n");
        mutex_lock(&mdsc->mutex);
        want_tid = mdsc->last_tid;
@@ -2922,9 +3015,10 @@ static void con_put(struct ceph_connection *con)
 static void peer_reset(struct ceph_connection *con)
 {
        struct ceph_mds_session *s = con->private;
+        struct ceph_mds_client *mdsc = s->s_mdsc;
-        pr_err("mds%d gave us the boot.  IMPLEMENT RECONNECT.\n",
+        pr_warning("mds%d closed our session\n", s->s_mds);
-               s->s_mds);
+        send_mds_reconnect(mdsc, s);
 }
 static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
@@ -3031,7 +3125,7 @@ static int invalidate_authorizer(struct ceph_connection *con)
        return ceph_monc_validate_auth(&mdsc->client->monc);
 }
-const static struct ceph_connection_operations mds_con_ops = {
+static const struct ceph_connection_operations mds_con_ops = {
        .get = con_get,
        .put = con_put,
        .dispatch = dispatch,
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index 961cc6f65878..d9936c4f1212 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -165,6 +165,8 @@ struct ceph_mds_request {
        struct inode *r_locked_dir; /* dir (if any) i_mutex locked by vfs */
        struct inode *r_target_inode;       /* resulting inode */
+        struct mutex r_fill_mutex;
        union ceph_mds_request_args r_args;
        int r_fmode;        /* file mode, if expecting cap */
@@ -213,7 +215,7 @@ struct ceph_mds_request {
        struct completion r_safe_completion;
        ceph_mds_request_callback_t r_callback;
        struct list_head  r_unsafe_item;  /* per-session unsafe list item */
-        bool              r_got_unsafe, r_got_safe;
+        bool              r_got_unsafe, r_got_safe, r_got_result;
        bool              r_did_prepopulate;
        u32               r_readdir_offset;
@@ -301,6 +303,8 @@ extern void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc,
                                    struct inode *inode,
                                    struct dentry *dn, int mask);
+extern void ceph_invalidate_dir_request(struct ceph_mds_request *req);
 extern struct ceph_mds_request *
 ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode);
 extern void ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index cd4fadb6491a..64b8b1f7863d 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -39,18 +39,6 @@ static void queue_con(struct ceph_connection *con);
 static void con_work(struct work_struct *);
 static void ceph_fault(struct ceph_connection *con);
-const char *ceph_name_type_str(int t)
-{
-        switch (t) {
-        case CEPH_ENTITY_TYPE_MON: return "mon";
-        case CEPH_ENTITY_TYPE_MDS: return "mds";
-        case CEPH_ENTITY_TYPE_OSD: return "osd";
-        case CEPH_ENTITY_TYPE_CLIENT: return "client";
-        case CEPH_ENTITY_TYPE_ADMIN: return "admin";
-        default: return "???";
-        }
-}
 /*
 * nicely render a sockaddr as a string.
 */
@@ -132,6 +120,12 @@ void ceph_msgr_exit(void)
        destroy_workqueue(ceph_msgr_wq);
 }
+/*
+ * Flush the messenger workqueue (ceph_msgr_wq).
+ */
+void ceph_msgr_flush(void)
+{
+        flush_workqueue(ceph_msgr_wq);
+}
 /*
 * socket callback functions
 */
@@ -340,6 +334,7 @@ static void reset_connection(struct ceph_connection *con)
                ceph_msg_put(con->out_msg);
                con->out_msg = NULL;
        }
+        con->out_keepalive_pending = false;
        con->in_seq = 0;
        con->in_seq_acked = 0;
 }
@@ -357,6 +352,7 @@ void ceph_con_close(struct ceph_connection *con)
        clear_bit(WRITE_PENDING, &con->state);
        mutex_lock(&con->mutex);
        reset_connection(con);
+        con->peer_global_seq = 0;
        cancel_delayed_work(&con->work);
        mutex_unlock(&con->mutex);
        queue_con(con);
@@ -661,7 +657,7 @@ static void prepare_write_connect(struct ceph_messenger *msgr,
        dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
             con->connect_seq, global_seq, proto);
-        con->out_connect.features = CEPH_FEATURE_SUPPORTED;
+        con->out_connect.features = CEPH_FEATURE_SUPPORTED_CLIENT;
        con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
        con->out_connect.connect_seq = cpu_to_le32(con->connect_seq);
        con->out_connect.global_seq = cpu_to_le32(global_seq);
@@ -1124,8 +1120,8 @@ static void fail_protocol(struct ceph_connection *con)
 static int process_connect(struct ceph_connection *con)
 {
-        u64 sup_feat = CEPH_FEATURE_SUPPORTED;
+        u64 sup_feat = CEPH_FEATURE_SUPPORTED_CLIENT;
-        u64 req_feat = CEPH_FEATURE_REQUIRED;
+        u64 req_feat = CEPH_FEATURE_REQUIRED_CLIENT;
        u64 server_feat = le64_to_cpu(con->in_reply.features);
        dout("process_connect on %p tag %d\n", con, (int)con->in_tag);
@@ -1233,6 +1229,7 @@ static int process_connect(struct ceph_connection *con)
                clear_bit(CONNECTING, &con->state);
                con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq);
                con->connect_seq++;
+                con->peer_features = server_feat;
                dout("process_connect got READY gseq %d cseq %d (%d)\n",
                     con->peer_global_seq,
                     le32_to_cpu(con->in_reply.connect_seq),
@@ -1402,19 +1399,17 @@ static int read_partial_message(struct ceph_connection *con)
                con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip);
                if (skip) {
                        /* skip this message */
-                        dout("alloc_msg returned NULL, skipping message\n");
+                        dout("alloc_msg said skip message\n");
                        con->in_base_pos = -front_len - middle_len - data_len -
                                sizeof(m->footer);
                        con->in_tag = CEPH_MSGR_TAG_READY;
                        con->in_seq++;
                        return 0;
                }
-                if (IS_ERR(con->in_msg)) {
+                if (!con->in_msg) {
-                        ret = PTR_ERR(con->in_msg);
-                        con->in_msg = NULL;
                        con->error_msg =
                                "error allocating memory for incoming message";
-                        return ret;
+                        return -ENOMEM;
                }
                m = con->in_msg;
                m->front.iov_len = 0;    /* haven't read it yet */
@@ -1514,14 +1509,14 @@ static void process_message(struct ceph_connection *con)
        /* if first message, set peer_name */
        if (con->peer_name.type == 0)
-                con->peer_name = msg->hdr.src.name;
+                con->peer_name = msg->hdr.src;
        con->in_seq++;
        mutex_unlock(&con->mutex);
        dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n",
             msg, le64_to_cpu(msg->hdr.seq),
-             ENTITY_NAME(msg->hdr.src.name),
+             ENTITY_NAME(msg->hdr.src),
             le16_to_cpu(msg->hdr.type),
             ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
             le32_to_cpu(msg->hdr.front_len),
@@ -1546,7 +1541,6 @@ static int try_write(struct ceph_connection *con)
        dout("try_write start %p state %lu nref %d\n", con, con->state,
             atomic_read(&con->nref));
-        mutex_lock(&con->mutex);
 more:
        dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes);
@@ -1639,7 +1633,6 @@ do_next:
 done:
        ret = 0;
 out:
-        mutex_unlock(&con->mutex);
        dout("try_write done on %p\n", con);
        return ret;
 }
@@ -1651,7 +1644,6 @@ out:
 */
 static int try_read(struct ceph_connection *con)
 {
-        struct ceph_messenger *msgr;
        int ret = -1;
        if (!con->sock)
@@ -1661,9 +1653,6 @@ static int try_read(struct ceph_connection *con)
                return 0;
        dout("try_read start on %p\n", con);
-        msgr = con->msgr;
-        mutex_lock(&con->mutex);
 more:
        dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag,
@@ -1758,7 +1747,6 @@ more:
 done:
        ret = 0;
 out:
-        mutex_unlock(&con->mutex);
        dout("try_read done on %p\n", con);
        return ret;
@@ -1830,6 +1818,8 @@ more:
        dout("con_work %p start, clearing QUEUED\n", con);
        clear_bit(QUEUED, &con->state);
+        mutex_lock(&con->mutex);
        if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */
                dout("con_work CLOSED\n");
                con_close_socket(con);
@@ -1844,11 +1834,16 @@ more:
        if (test_and_clear_bit(SOCK_CLOSED, &con->state) ||
            try_read(con) < 0 ||
            try_write(con) < 0) {
+                mutex_unlock(&con->mutex);
                backoff = 1;
                ceph_fault(con);     /* error/fault path */
+                goto done_unlocked;
        }
 done:
+        mutex_unlock(&con->mutex);
+done_unlocked:
        clear_bit(BUSY, &con->state);
        dout("con->state=%lu\n", con->state);
        if (test_bit(QUEUED, &con->state)) {
@@ -1947,7 +1942,7 @@ struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr)
        /* the zero page is needed if a request is "canceled" while the message
         * is being written over the socket */
-        msgr->zero_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+        msgr->zero_page = __page_cache_alloc(GFP_KERNEL | __GFP_ZERO);
        if (!msgr->zero_page) {
                kfree(msgr);
                return ERR_PTR(-ENOMEM);
@@ -1987,9 +1982,7 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
        }
        /* set src+dst */
-        msg->hdr.src.name = con->msgr->inst.name;
+        msg->hdr.src = con->msgr->inst.name;
-        msg->hdr.src.addr = con->msgr->my_enc_addr;
-        msg->hdr.orig_src = msg->hdr.src;
        BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len));
@@ -2083,12 +2076,11 @@ void ceph_con_keepalive(struct ceph_connection *con)
 * construct a new message with given type, size
 * the new msg has a ref count of 1.
 */
-struct ceph_msg *ceph_msg_new(int type, int front_len,
+struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags)
-                              int page_len, int page_off, struct page **pages)
 {
        struct ceph_msg *m;
-        m = kmalloc(sizeof(*m), GFP_NOFS);
+        m = kmalloc(sizeof(*m), flags);
        if (m == NULL)
                goto out;
        kref_init(&m->kref);
@@ -2100,8 +2092,8 @@ struct ceph_msg *ceph_msg_new(int type, int front_len,
        m->hdr.version = 0;
        m->hdr.front_len = cpu_to_le32(front_len);
        m->hdr.middle_len = 0;
-        m->hdr.data_len = cpu_to_le32(page_len);
+        m->hdr.data_len = 0;
-        m->hdr.data_off = cpu_to_le16(page_off);
+        m->hdr.data_off = 0;
        m->hdr.reserved = 0;
        m->footer.front_crc = 0;
        m->footer.middle_crc = 0;
@@ -2115,11 +2107,11 @@ struct ceph_msg *ceph_msg_new(int type, int front_len,
        /* front */
        if (front_len) {
                if (front_len > PAGE_CACHE_SIZE) {
-                        m->front.iov_base = __vmalloc(front_len, GFP_NOFS,
+                        m->front.iov_base = __vmalloc(front_len, flags,
                                                      PAGE_KERNEL);
                        m->front_is_vmalloc = true;
                } else {
-                        m->front.iov_base = kmalloc(front_len, GFP_NOFS);
+                        m->front.iov_base = kmalloc(front_len, flags);
                }
                if (m->front.iov_base == NULL) {
                        pr_err("msg_new can't allocate %d bytes\n",
@@ -2135,19 +2127,18 @@ struct ceph_msg *ceph_msg_new(int type, int front_len,
        m->middle = NULL;
        /* data */
-        m->nr_pages = calc_pages_for(page_off, page_len);
+        m->nr_pages = 0;
-        m->pages = pages;
+        m->pages = NULL;
        m->pagelist = NULL;
-        dout("ceph_msg_new %p page %d~%d -> %d\n", m, page_off, page_len,
+        dout("ceph_msg_new %p front %d\n", m, front_len);
-             m->nr_pages);
        return m;
 out2:
        ceph_msg_put(m);
 out:
-        pr_err("msg_new can't create type %d len %d\n", type, front_len);
+        pr_err("msg_new can't create type %d front %d\n", type, front_len);
-        return ERR_PTR(-ENOMEM);
+        return NULL;
 }
 /*
@@ -2190,29 +2181,25 @@ static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
                mutex_unlock(&con->mutex);
                msg = con->ops->alloc_msg(con, hdr, skip);
                mutex_lock(&con->mutex);
-                if (IS_ERR(msg))
+                if (!msg || *skip)
-                        return msg;
-                if (*skip)
                        return NULL;
        }
        if (!msg) {
                *skip = 0;
-                msg = ceph_msg_new(type, front_len, 0, 0, NULL);
+                msg = ceph_msg_new(type, front_len, GFP_NOFS);
                if (!msg) {
                        pr_err("unable to allocate msg type %d len %d\n",
                               type, front_len);
-                        return ERR_PTR(-ENOMEM);
+                        return NULL;
                }
        }
        memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr));
-        if (middle_len) {
+        if (middle_len && !msg->middle) {
                ret = ceph_alloc_middle(con, msg);
                if (ret < 0) {
                        ceph_msg_put(msg);
-                        return msg;
+                        return NULL;
                }
        }
diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h
index a5caf91cc971..76fbc957bc13 100644
--- a/fs/ceph/messenger.h
+++ b/fs/ceph/messenger.h
@@ -49,10 +49,8 @@ struct ceph_connection_operations {
                                        int *skip);
 };
-extern const char *ceph_name_type_str(int t);
 /* use format string %s%d */
-#define ENTITY_NAME(n) ceph_name_type_str((n).type), le64_to_cpu((n).num)
+#define ENTITY_NAME(n) ceph_entity_type_name((n).type), le64_to_cpu((n).num)
 struct ceph_messenger {
        struct ceph_entity_inst inst;    /* my name+address */
@@ -144,6 +142,7 @@ struct ceph_connection {
        struct ceph_entity_addr peer_addr; /* peer address */
        struct ceph_entity_name peer_name; /* peer name */
        struct ceph_entity_addr peer_addr_for_me;
+        unsigned peer_features;
        u32 connect_seq;      /* identify the most recent connection
                                 attempt for this connection, client */
        u32 peer_global_seq;  /* peer's global seq for this connection */
@@ -158,7 +157,6 @@ struct ceph_connection {
        struct list_head out_queue;
        struct list_head out_sent;   /* sending or sent but unacked */
        u64 out_seq;                 /* last message queued for send */
-        u64 out_seq_sent;            /* last message sent */
        bool out_keepalive_pending;
        u64 in_seq, in_seq_acked;  /* last message received, acked */
@@ -215,6 +213,7 @@ extern int ceph_parse_ips(const char *c, const char *end,
 extern int ceph_msgr_init(void);
 extern void ceph_msgr_exit(void);
+extern void ceph_msgr_flush(void);
 extern struct ceph_messenger *ceph_messenger_create(
        struct ceph_entity_addr *myaddr);
@@ -234,9 +233,7 @@ extern void ceph_con_keepalive(struct ceph_connection *con);
 extern struct ceph_connection *ceph_con_get(struct ceph_connection *con);
 extern void ceph_con_put(struct ceph_connection *con);
-extern struct ceph_msg *ceph_msg_new(int type, int front_len,
+extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags);
-                                     int page_len, int page_off,
-                                     struct page **pages);
 extern void ceph_msg_kfree(struct ceph_msg *m);
diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
index 8fdc011ca956..21c62e9b7d1d 100644
--- a/fs/ceph/mon_client.c
+++ b/fs/ceph/mon_client.c
@@ -28,7 +28,7 @@
 * resend any outstanding requests.
 */
-const static struct ceph_connection_operations mon_con_ops;
+static const struct ceph_connection_operations mon_con_ops;
 static int __validate_auth(struct ceph_mon_client *monc);
@@ -104,6 +104,7 @@ static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len)
        monc->pending_auth = 1;
        monc->m_auth->front.iov_len = len;
        monc->m_auth->hdr.front_len = cpu_to_le32(len);
+        ceph_con_revoke(monc->con, monc->m_auth);
        ceph_msg_get(monc->m_auth);  /* keep our ref */
        ceph_con_send(monc->con, monc->m_auth);
 }
@@ -187,16 +188,12 @@ static void __send_subscribe(struct ceph_mon_client *monc)
             monc->want_next_osdmap);
        if ((__sub_expired(monc) && !monc->sub_sent) ||
            monc->want_next_osdmap == 1) {
-                struct ceph_msg *msg;
+                struct ceph_msg *msg = monc->m_subscribe;
                struct ceph_mon_subscribe_item *i;
                void *p, *end;
-                msg = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, 0, 0, NULL);
-                if (!msg)
-                        return;
                p = msg->front.iov_base;
-                end = p + msg->front.iov_len;
+                end = p + msg->front_max;
                dout("__send_subscribe to 'mdsmap' %u+\n",
                     (unsigned)monc->have_mdsmap);
@@ -226,7 +223,8 @@ static void __send_subscribe(struct ceph_mon_client *monc)
                msg->front.iov_len = p - msg->front.iov_base;
                msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
-                ceph_con_send(monc->con, msg);
+                ceph_con_revoke(monc->con, msg);
+                ceph_con_send(monc->con, ceph_msg_get(msg));
                monc->sub_sent = jiffies | 1;  /* never 0 */
        }
@@ -353,14 +351,14 @@ out:
 /*
 * statfs
 */
-static struct ceph_mon_statfs_request *__lookup_statfs(
+static struct ceph_mon_generic_request *__lookup_generic_req(
        struct ceph_mon_client *monc, u64 tid)
 {
-        struct ceph_mon_statfs_request *req;
+        struct ceph_mon_generic_request *req;
-        struct rb_node *n = monc->statfs_request_tree.rb_node;
+        struct rb_node *n = monc->generic_request_tree.rb_node;
        while (n) {
-                req = rb_entry(n, struct ceph_mon_statfs_request, node);
+                req = rb_entry(n, struct ceph_mon_generic_request, node);
                if (tid < req->tid)
                        n = n->rb_left;
                else if (tid > req->tid)
@@ -371,16 +369,16 @@ static struct ceph_mon_statfs_request *__lookup_statfs(
        return NULL;
 }
-static void __insert_statfs(struct ceph_mon_client *monc,
+static void __insert_generic_request(struct ceph_mon_client *monc,
-                            struct ceph_mon_statfs_request *new)
+                            struct ceph_mon_generic_request *new)
 {
-        struct rb_node **p = &monc->statfs_request_tree.rb_node;
+        struct rb_node **p = &monc->generic_request_tree.rb_node;
        struct rb_node *parent = NULL;
-        struct ceph_mon_statfs_request *req = NULL;
+        struct ceph_mon_generic_request *req = NULL;
        while (*p) {
                parent = *p;
-                req = rb_entry(parent, struct ceph_mon_statfs_request, node);
+                req = rb_entry(parent, struct ceph_mon_generic_request, node);
                if (new->tid < req->tid)
                        p = &(*p)->rb_left;
                else if (new->tid > req->tid)
@@ -390,113 +388,157 @@ static void __insert_statfs(struct ceph_mon_client *monc,
        }
        rb_link_node(&new->node, parent, p);
-        rb_insert_color(&new->node, &monc->statfs_request_tree);
+        rb_insert_color(&new->node, &monc->generic_request_tree);
+}
+/*
+ * kref release callback for a generic monitor request: drop the references
+ * held on the reply and request messages (either may still be NULL if the
+ * request was only partially set up), then free the request itself.
+ */
+static void release_generic_request(struct kref *kref)
+{
+        struct ceph_mon_generic_request *req =
+                container_of(kref, struct ceph_mon_generic_request, kref);
+        if (req->reply)
+                ceph_msg_put(req->reply);
+        if (req->request)
+                ceph_msg_put(req->request);
+        /* req is kzalloc()ed by its creator; without this it is leaked */
+        kfree(req);
+}
+/*
+ * Drop one reference on @req; release_generic_request() is invoked when
+ * the last reference goes away.
+ */
+static void put_generic_request(struct ceph_mon_generic_request *req)
+{
+        kref_put(&req->kref, release_generic_request);
+}
+/* Take an additional reference on @req. */
+static void get_generic_request(struct ceph_mon_generic_request *req)
+{
+        kref_get(&req->kref);
+}
+/*
+ * Choose the message an incoming generic reply should be read into.  The
+ * pending request is looked up by the tid carried in @hdr.  On a hit the
+ * request's reply message is returned via ceph_msg_get(); on a miss *skip
+ * is set to 1 and NULL is returned.
+ */
+static struct ceph_msg *get_generic_reply(struct ceph_connection *con,
+                                         struct ceph_msg_header *hdr,
+                                         int *skip)
+{
+        struct ceph_mon_client *monc = con->private;
+        u64 tid = le64_to_cpu(hdr->tid);
+        struct ceph_mon_generic_request *pending;
+        struct ceph_msg *reply = NULL;
+        mutex_lock(&monc->mutex);
+        pending = __lookup_generic_req(monc, tid);
+        if (pending) {
+                dout("get_generic_reply %lld got %p\n", tid, pending->reply);
+                /*
+                 * No need to remember which connection is reading into
+                 * this reply: only one connection is ever open at a time.
+                 */
+                reply = ceph_msg_get(pending->reply);
+        } else {
+                dout("get_generic_reply %lld dne\n", tid);
+                *skip = 1;
+        }
+        mutex_unlock(&monc->mutex);
+        return reply;
 }
 static void handle_statfs_reply(struct ceph_mon_client *monc,
                                struct ceph_msg *msg)
 {
-        struct ceph_mon_statfs_request *req;
+        struct ceph_mon_generic_request *req;
        struct ceph_mon_statfs_reply *reply = msg->front.iov_base;
-        u64 tid;
+        u64 tid = le64_to_cpu(msg->hdr.tid);
        if (msg->front.iov_len != sizeof(*reply))
                goto bad;
-        tid = le64_to_cpu(msg->hdr.tid);
        dout("handle_statfs_reply %p tid %llu\n", msg, tid);
        mutex_lock(&monc->mutex);
-        req = __lookup_statfs(monc, tid);
+        req = __lookup_generic_req(monc, tid);
        if (req) {
-                *req->buf = reply->st;
+                *(struct ceph_statfs *)req->buf = reply->st;
                req->result = 0;
+                get_generic_request(req);
        }
        mutex_unlock(&monc->mutex);
-        if (req)
+        if (req) {
                complete(&req->completion);
+                put_generic_request(req);
+        }
        return;
 bad:
-        pr_err("corrupt statfs reply, no tid\n");
+        pr_err("corrupt generic reply, no tid\n");
        ceph_msg_dump(msg);
 }
 /*
- * (re)send a statfs request
+ * Do a synchronous statfs().
 */
-static int send_statfs(struct ceph_mon_client *monc,
+int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
-                       struct ceph_mon_statfs_request *req)
 {
-        struct ceph_msg *msg;
+        struct ceph_mon_generic_request *req;
        struct ceph_mon_statfs *h;
+        int err;
-        dout("send_statfs tid %llu\n", req->tid);
+        req = kzalloc(sizeof(*req), GFP_NOFS);
-        msg = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), 0, 0, NULL);
+        if (!req)
-        if (IS_ERR(msg))
+                return -ENOMEM;
-                return PTR_ERR(msg);
-        req->request = msg;
+        kref_init(&req->kref);
-        msg->hdr.tid = cpu_to_le64(req->tid);
+        req->buf = buf;
-        h = msg->front.iov_base;
+        init_completion(&req->completion);
+        err = -ENOMEM;
+        req->request = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), GFP_NOFS);
+        if (!req->request)
+                goto out;
+        req->reply = ceph_msg_new(CEPH_MSG_STATFS_REPLY, 1024, GFP_NOFS);
+        if (!req->reply)
+                goto out;
+        /* fill out request */
+        h = req->request->front.iov_base;
        h->monhdr.have_version = 0;
        h->monhdr.session_mon = cpu_to_le16(-1);
        h->monhdr.session_mon_tid = 0;
        h->fsid = monc->monmap->fsid;
-        ceph_con_send(monc->con, msg);
-        return 0;
-}
-/*
- * Do a synchronous statfs().
- */
-int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
-{
-        struct ceph_mon_statfs_request req;
-        int err;
-        req.buf = buf;
-        init_completion(&req.completion);
-        /* allocate memory for reply */
-        err = ceph_msgpool_resv(&monc->msgpool_statfs_reply, 1);
-        if (err)
-                return err;
        /* register request */
        mutex_lock(&monc->mutex);
-        req.tid = ++monc->last_tid;
+        req->tid = ++monc->last_tid;
-        req.last_attempt = jiffies;
+        req->request->hdr.tid = cpu_to_le64(req->tid);
-        req.delay = BASE_DELAY_INTERVAL;
+        __insert_generic_request(monc, req);
-        __insert_statfs(monc, &req);
+        monc->num_generic_requests++;
-        monc->num_statfs_requests++;
        mutex_unlock(&monc->mutex);
        /* send request and wait */
-        err = send_statfs(monc, &req);
+        ceph_con_send(monc->con, ceph_msg_get(req->request));
-        if (!err)
+        err = wait_for_completion_interruptible(&req->completion);
-                err = wait_for_completion_interruptible(&req.completion);
        mutex_lock(&monc->mutex);
-        rb_erase(&req.node, &monc->statfs_request_tree);
+        rb_erase(&req->node, &monc->generic_request_tree);
-        monc->num_statfs_requests--;
+        monc->num_generic_requests--;
-        ceph_msgpool_resv(&monc->msgpool_statfs_reply, -1);
        mutex_unlock(&monc->mutex);
        if (!err)
-                err = req.result;
+                err = req->result;
+out:
+        kref_put(&req->kref, release_generic_request);
        return err;
 }
 /*
 * Resend pending statfs requests.
 */
-static void __resend_statfs(struct ceph_mon_client *monc)
+static void __resend_generic_request(struct ceph_mon_client *monc)
 {
-        struct ceph_mon_statfs_request *req;
+        struct ceph_mon_generic_request *req;
        struct rb_node *p;
-        for (p = rb_first(&monc->statfs_request_tree); p; p = rb_next(p)) {
+        for (p = rb_first(&monc->generic_request_tree); p; p = rb_next(p)) {
-                req = rb_entry(p, struct ceph_mon_statfs_request, node);
+                req = rb_entry(p, struct ceph_mon_generic_request, node);
-                send_statfs(monc, req);
+                ceph_con_revoke(monc->con, req->request);
+                ceph_con_send(monc->con, ceph_msg_get(req->request));
        }
 }
@@ -586,26 +628,26 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
                CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON |
                CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS;
-        /* msg pools */
+        /* msgs */
-        err = ceph_msgpool_init(&monc->msgpool_subscribe_ack,
+        err = -ENOMEM;
-                               sizeof(struct ceph_mon_subscribe_ack), 1, false);
+        monc->m_subscribe_ack = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE_ACK,
-        if (err < 0)
+                                     sizeof(struct ceph_mon_subscribe_ack),
+                                     GFP_NOFS);
+        if (!monc->m_subscribe_ack)
                goto out_monmap;
-        err = ceph_msgpool_init(&monc->msgpool_statfs_reply,
-                                sizeof(struct ceph_mon_statfs_reply), 0, false);
+        monc->m_subscribe = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, GFP_NOFS);
-        if (err < 0)
+        if (!monc->m_subscribe)
-                goto out_pool1;
+                goto out_subscribe_ack;
-        err = ceph_msgpool_init(&monc->msgpool_auth_reply, 4096, 1, false);
-        if (err < 0)
+        monc->m_auth_reply = ceph_msg_new(CEPH_MSG_AUTH_REPLY, 4096, GFP_NOFS);
-                goto out_pool2;
+        if (!monc->m_auth_reply)
+                goto out_subscribe;
-        monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, 0, 0, NULL);
+        monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, GFP_NOFS);
        monc->pending_auth = 0;
-        if (IS_ERR(monc->m_auth)) {
+        if (!monc->m_auth)
-                err = PTR_ERR(monc->m_auth);
+                goto out_auth_reply;
-                monc->m_auth = NULL;
-                goto out_pool3;
-        }
        monc->cur_mon = -1;
        monc->hunting = true;
@@ -613,8 +655,8 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
        monc->sub_sent = 0;
        INIT_DELAYED_WORK(&monc->delayed_work, delayed_work);
-        monc->statfs_request_tree = RB_ROOT;
+        monc->generic_request_tree = RB_ROOT;
-        monc->num_statfs_requests = 0;
+        monc->num_generic_requests = 0;
        monc->last_tid = 0;
        monc->have_mdsmap = 0;
@@ -622,12 +664,12 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
        monc->want_next_osdmap = 1;
        return 0;
-out_pool3:
+out_auth_reply:
-        ceph_msgpool_destroy(&monc->msgpool_auth_reply);
+        ceph_msg_put(monc->m_auth_reply);
-out_pool2:
+out_subscribe:
-        ceph_msgpool_destroy(&monc->msgpool_subscribe_ack);
+        ceph_msg_put(monc->m_subscribe);
-out_pool1:
+out_subscribe_ack:
-        ceph_msgpool_destroy(&monc->msgpool_statfs_reply);
+        ceph_msg_put(monc->m_subscribe_ack);
 out_monmap:
        kfree(monc->monmap);
 out:
@@ -651,9 +693,9 @@ void ceph_monc_stop(struct ceph_mon_client *monc)
        ceph_auth_destroy(monc->auth);
        ceph_msg_put(monc->m_auth);
-        ceph_msgpool_destroy(&monc->msgpool_subscribe_ack);
+        ceph_msg_put(monc->m_auth_reply);
-        ceph_msgpool_destroy(&monc->msgpool_statfs_reply);
+        ceph_msg_put(monc->m_subscribe);
-        ceph_msgpool_destroy(&monc->msgpool_auth_reply);
+        ceph_msg_put(monc->m_subscribe_ack);
        kfree(monc->monmap);
 }
@@ -662,8 +704,11 @@ static void handle_auth_reply(struct ceph_mon_client *monc,
                              struct ceph_msg *msg)
 {
        int ret;
+        int was_auth = 0;
        mutex_lock(&monc->mutex);
+        if (monc->auth->ops)
+                was_auth = monc->auth->ops->is_authenticated(monc->auth);
        monc->pending_auth = 0;
        ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base,
                                     msg->front.iov_len,
@@ -674,14 +719,14 @@ static void handle_auth_reply(struct ceph_mon_client *monc,
                wake_up(&monc->client->auth_wq);
        } else if (ret > 0) {
                __send_prepared_auth_request(monc, ret);
-        } else if (monc->auth->ops->is_authenticated(monc->auth)) {
+        } else if (!was_auth && monc->auth->ops->is_authenticated(monc->auth)) {
                dout("authenticated, starting session\n");
                monc->client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT;
                monc->client->msgr->inst.name.num = monc->auth->global_id;
                __send_subscribe(monc);
-                __resend_statfs(monc);
+                __resend_generic_request(monc);
        }
        mutex_unlock(&monc->mutex);
 }
@@ -770,18 +815,17 @@ static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,
        switch (type) {
        case CEPH_MSG_MON_SUBSCRIBE_ACK:
-                m = ceph_msgpool_get(&monc->msgpool_subscribe_ack, front_len);
+                m = ceph_msg_get(monc->m_subscribe_ack);
                break;
        case CEPH_MSG_STATFS_REPLY:
-                m = ceph_msgpool_get(&monc->msgpool_statfs_reply, front_len);
+                return get_generic_reply(con, hdr, skip);
-                break;
        case CEPH_MSG_AUTH_REPLY:
-                m = ceph_msgpool_get(&monc->msgpool_auth_reply, front_len);
+                m = ceph_msg_get(monc->m_auth_reply);
                break;
        case CEPH_MSG_MON_MAP:
        case CEPH_MSG_MDS_MAP:
        case CEPH_MSG_OSD_MAP:
-                m = ceph_msg_new(type, front_len, 0, 0, NULL);
+                m = ceph_msg_new(type, front_len, GFP_NOFS);
                break;
        }
@@ -826,7 +870,7 @@ out:
        mutex_unlock(&monc->mutex);
 }
-const static struct ceph_connection_operations mon_con_ops = {
+static const struct ceph_connection_operations mon_con_ops = {
        .get = ceph_con_get,
        .put = ceph_con_put,
        .dispatch = dispatch,
diff --git a/fs/ceph/mon_client.h b/fs/ceph/mon_client.h
index b958ad5afa06..174d794321d0 100644
--- a/fs/ceph/mon_client.h
+++ b/fs/ceph/mon_client.h
@@ -2,10 +2,10 @@
 #define _FS_CEPH_MON_CLIENT_H
 #include <linux/completion.h>
+#include <linux/kref.h>
 #include <linux/rbtree.h>
 #include "messenger.h"
-#include "msgpool.h"
 struct ceph_client;
 struct ceph_mount_args;
@@ -22,7 +22,7 @@ struct ceph_monmap {
 };
 struct ceph_mon_client;
-struct ceph_mon_statfs_request;
+struct ceph_mon_generic_request;
 /*
@@ -40,17 +40,19 @@ struct ceph_mon_request {
 };
 /*
- * statfs() is done a bit differently because we need to get data back
+ * ceph_mon_generic_request is being used for the statfs and poolop requests
+ * which are being done a bit differently because we need to get data back
 * to the caller
 */
-struct ceph_mon_statfs_request {
+struct ceph_mon_generic_request {
+        struct kref kref;
        u64 tid;
        struct rb_node node;
        int result;
-        struct ceph_statfs *buf;
+        void *buf;
        struct completion completion;
-        unsigned long last_attempt, delay; /* jiffies */
        struct ceph_msg *request;  /* original request */
+        struct ceph_msg *reply;    /* and reply */
 };
 struct ceph_mon_client {
@@ -61,7 +63,7 @@ struct ceph_mon_client {
        struct delayed_work delayed_work;
        struct ceph_auth_client *auth;
-        struct ceph_msg *m_auth;
+        struct ceph_msg *m_auth, *m_auth_reply, *m_subscribe, *m_subscribe_ack;
        int pending_auth;
        bool hunting;
@@ -70,14 +72,9 @@ struct ceph_mon_client {
        struct ceph_connection *con;
        bool have_fsid;
-        /* msg pools */
+        /* pending generic requests */
-        struct ceph_msgpool msgpool_subscribe_ack;
+        struct rb_root generic_request_tree;
-        struct ceph_msgpool msgpool_statfs_reply;
+        int num_generic_requests;
-        struct ceph_msgpool msgpool_auth_reply;
-        /* pending statfs requests */
-        struct rb_root statfs_request_tree;
-        int num_statfs_requests;
        u64 last_tid;
        /* mds/osd map */
diff --git a/fs/ceph/msgpool.c b/fs/ceph/msgpool.c
index ca3b44a89f2d..dd65a6438131 100644
--- a/fs/ceph/msgpool.c
+++ b/fs/ceph/msgpool.c
@@ -7,180 +7,58 @@
 #include "msgpool.h"
-/*
+static void *alloc_fn(gfp_t gfp_mask, void *arg)
- * We use msg pools to preallocate memory for messages we expect to
+{
- * receive over the wire, to avoid getting ourselves into OOM
+        struct ceph_msgpool *pool = arg;
- * conditions at unexpected times.  We take use a few different
+        void *p;
- * strategies:
- *
- *  - for request/response type interactions, we preallocate the
- * memory needed for the response when we generate the request.
- *
- *  - for messages we can receive at any time from the MDS, we preallocate
- * a pool of messages we can re-use.
- *
- *  - for writeback, we preallocate some number of messages to use for
- * requests and their replies, so that we always make forward
- * progress.
- *
- * The msgpool behaves like a mempool_t, but keeps preallocated
- * ceph_msgs strung together on a list_head instead of using a pointer
- * vector.  This avoids vector reallocation when we adjust the number
- * of preallocated items (which happens frequently).
- */
+        p = ceph_msg_new(0, pool->front_len, gfp_mask);
+        if (!p)
+                pr_err("msgpool %s alloc failed\n", pool->name);
+        return p;
+}
-/*
+static void free_fn(void *element, void *arg)
- * Allocate or release as necessary to meet our target pool size.
- */
-static int __fill_msgpool(struct ceph_msgpool *pool)
 {
-        struct ceph_msg *msg;
+        ceph_msg_put(element);
-        while (pool->num < pool->min) {
-                dout("fill_msgpool %p %d/%d allocating\n", pool, pool->num,
-                     pool->min);
-                spin_unlock(&pool->lock);
-                msg = ceph_msg_new(0, pool->front_len, 0, 0, NULL);
-                spin_lock(&pool->lock);
-                if (IS_ERR(msg))
-                        return PTR_ERR(msg);
-                msg->pool = pool;
-                list_add(&msg->list_head, &pool->msgs);
-                pool->num++;
-        }
-        while (pool->num > pool->min) {
-                msg = list_first_entry(&pool->msgs, struct ceph_msg, list_head);
-                dout("fill_msgpool %p %d/%d releasing %p\n", pool, pool->num,
-                     pool->min, msg);
-                list_del_init(&msg->list_head);
-                pool->num--;
-                ceph_msg_kfree(msg);
-        }
-        return 0;
 }
 int ceph_msgpool_init(struct ceph_msgpool *pool,
-                      int front_len, int min, bool blocking)
+                      int front_len, int size, bool blocking, const char *name)
 {
-        int ret;
-        dout("msgpool_init %p front_len %d min %d\n", pool, front_len, min);
-        spin_lock_init(&pool->lock);
        pool->front_len = front_len;
-        INIT_LIST_HEAD(&pool->msgs);
+        pool->pool = mempool_create(size, alloc_fn, free_fn, pool);
-        pool->num = 0;
+        if (!pool->pool)
-        pool->min = min;
+                return -ENOMEM;
-        pool->blocking = blocking;
+        pool->name = name;
-        init_waitqueue_head(&pool->wait);
+        return 0;
-        spin_lock(&pool->lock);
-        ret = __fill_msgpool(pool);
-        spin_unlock(&pool->lock);
-        return ret;
 }
 void ceph_msgpool_destroy(struct ceph_msgpool *pool)
 {
-        dout("msgpool_destroy %p\n", pool);
+        mempool_destroy(pool->pool);
-        spin_lock(&pool->lock);
-        pool->min = 0;
-        __fill_msgpool(pool);
-        spin_unlock(&pool->lock);
 }
-int ceph_msgpool_resv(struct ceph_msgpool *pool, int delta)
+struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool,
+                                  int front_len)
 {
-        int ret;
+        if (front_len > pool->front_len) {
+                pr_err("msgpool_get pool %s need front %d, pool size is %d\n",
-        spin_lock(&pool->lock);
+                       pool->name, front_len, pool->front_len);
-        dout("msgpool_resv %p delta %d\n", pool, delta);
-        pool->min += delta;
-        ret = __fill_msgpool(pool);
-        spin_unlock(&pool->lock);
-        return ret;
-}
-struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool, int front_len)
-{
-        wait_queue_t wait;
-        struct ceph_msg *msg;
-        if (front_len && front_len > pool->front_len) {
-                pr_err("msgpool_get pool %p need front %d, pool size is %d\n",
-                       pool, front_len, pool->front_len);
                WARN_ON(1);
                /* try to alloc a fresh message */
-                msg = ceph_msg_new(0, front_len, 0, 0, NULL);
+                return ceph_msg_new(0, front_len, GFP_NOFS);
-                if (!IS_ERR(msg))
-                        return msg;
-        }
-        if (!front_len)
-                front_len = pool->front_len;
-        if (pool->blocking) {
-                /* mempool_t behavior; first try to alloc */
-                msg = ceph_msg_new(0, front_len, 0, 0, NULL);
-                if (!IS_ERR(msg))
-                        return msg;
        }
-        while (1) {
+        return mempool_alloc(pool->pool, GFP_NOFS);
-                spin_lock(&pool->lock);
-                if (likely(pool->num)) {
-                        msg = list_entry(pool->msgs.next, struct ceph_msg,
-                                         list_head);
-                        list_del_init(&msg->list_head);
-                        pool->num--;
-                        dout("msgpool_get %p got %p, now %d/%d\n", pool, msg,
-                             pool->num, pool->min);
-                        spin_unlock(&pool->lock);
-                        return msg;
-                }
-                pr_err("msgpool_get %p now %d/%d, %s\n", pool, pool->num,
-                       pool->min, pool->blocking ? "waiting" : "may fail");
-                spin_unlock(&pool->lock);
-                if (!pool->blocking) {
-                        WARN_ON(1);
-                        /* maybe we can allocate it now? */
-                        msg = ceph_msg_new(0, front_len, 0, 0, NULL);
-                        if (!IS_ERR(msg))
-                                return msg;
-                        pr_err("msgpool_get %p empty + alloc failed\n", pool);
-                        return ERR_PTR(-ENOMEM);
-                }
-                init_wait(&wait);
-                prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);
-                schedule();
-                finish_wait(&pool->wait, &wait);
-        }
 }
 void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg)
 {
-        spin_lock(&pool->lock);
+        /* reset msg front_len; user may have changed it */
-        if (pool->num < pool->min) {
+        msg->front.iov_len = pool->front_len;
-                /* reset msg front_len; user may have changed it */
+        msg->hdr.front_len = cpu_to_le32(pool->front_len);
-                msg->front.iov_len = pool->front_len;
-                msg->hdr.front_len = cpu_to_le32(pool->front_len);
-                kref_set(&msg->kref, 1);  /* retake a single ref */
+        kref_init(&msg->kref);  /* retake single ref */
+        /*
+         * NOTE(review): nothing here hands msg back to pool->pool (there is
+         * no mempool_free() call), although ceph_msgpool_get() obtains
+         * messages with mempool_alloc().  Confirm whether the element is
+         * returned elsewhere; otherwise the reserve is never refilled.
+         */
-                list_add(&msg->list_head, &pool->msgs);
-                pool->num++;
-                dout("msgpool_put %p reclaim %p, now %d/%d\n", pool, msg,
-                     pool->num, pool->min);
-                spin_unlock(&pool->lock);
-                wake_up(&pool->wait);
-        } else {
-                dout("msgpool_put %p drop %p, at %d/%d\n", pool, msg,
-                     pool->num, pool->min);
-                spin_unlock(&pool->lock);
-                ceph_msg_kfree(msg);
-        }
 }
diff --git a/fs/ceph/msgpool.h b/fs/ceph/msgpool.h
index bc834bfcd720..a362605f9368 100644
--- a/fs/ceph/msgpool.h
+++ b/fs/ceph/msgpool.h
@@ -1,6 +1,7 @@
 #ifndef _FS_CEPH_MSGPOOL
 #define _FS_CEPH_MSGPOOL
+#include <linux/mempool.h>
 #include "messenger.h"
 /*
@@ -8,18 +9,15 @@
 * avoid unexpected OOM conditions.
 */
 struct ceph_msgpool {
-        spinlock_t lock;
+        const char *name;
+        mempool_t *pool;
        int front_len;          /* preallocated payload size */
-        struct list_head msgs;  /* msgs in the pool; each has 1 ref */
-        int num, min;           /* cur, min # msgs in the pool */
-        bool blocking;
-        wait_queue_head_t wait;
 };
 extern int ceph_msgpool_init(struct ceph_msgpool *pool,
-                             int front_len, int size, bool blocking);
+                             int front_len, int size, bool blocking,
+                             const char *name);
 extern void ceph_msgpool_destroy(struct ceph_msgpool *pool);
-extern int ceph_msgpool_resv(struct ceph_msgpool *, int delta);
 extern struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *,
                                         int front_len);
 extern void ceph_msgpool_put(struct ceph_msgpool *, struct ceph_msg *);
diff --git a/fs/ceph/msgr.h b/fs/ceph/msgr.h
index 8aaab414f3f8..892a0298dfdf 100644
--- a/fs/ceph/msgr.h
+++ b/fs/ceph/msgr.h
@@ -50,7 +50,6 @@ struct ceph_entity_name {
 #define CEPH_ENTITY_TYPE_MDS    0x02
 #define CEPH_ENTITY_TYPE_OSD    0x04
 #define CEPH_ENTITY_TYPE_CLIENT 0x08
-#define CEPH_ENTITY_TYPE_ADMIN  0x10
 #define CEPH_ENTITY_TYPE_AUTH   0x20
 #define CEPH_ENTITY_TYPE_ANY    0xFF
@@ -120,7 +119,7 @@ struct ceph_msg_connect_reply {
 /*
 * message header
 */
-struct ceph_msg_header {
+struct ceph_msg_header_old {
        __le64 seq;       /* message seq# for this session */
        __le64 tid;       /* transaction id */
        __le16 type;      /* message type */
@@ -138,6 +137,24 @@ struct ceph_msg_header {
        __le32 crc;       /* header crc32c */
 } __attribute__ ((packed));
+struct ceph_msg_header {
+        __le64 seq;       /* message seq# for this session */
+        __le64 tid;       /* transaction id */
+        __le16 type;      /* message type */
+        __le16 priority;  /* priority.  higher value == higher priority */
+        __le16 version;   /* version of message encoding */
+        __le32 front_len; /* bytes in main payload */
+        __le32 middle_len;/* bytes in middle payload */
+        __le32 data_len;  /* bytes of data payload */
+        __le16 data_off;  /* sender: include full offset;
+                             receiver: mask against ~PAGE_MASK */
+        struct ceph_entity_name src;
+        __le32 reserved;
+        __le32 crc;       /* header crc32c */
+} __attribute__ ((packed));
 #define CEPH_MSG_PRIO_LOW     64
 #define CEPH_MSG_PRIO_DEFAULT 127
 #define CEPH_MSG_PRIO_HIGH    196
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index 3514f71ff85f..d25b4add85b4 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -16,7 +16,7 @@
 #define OSD_OP_FRONT_LEN        4096
 #define OSD_OPREPLY_FRONT_LEN   512
-const static struct ceph_connection_operations osd_con_ops;
+static const struct ceph_connection_operations osd_con_ops;
 static int __kick_requests(struct ceph_osd_client *osdc,
                          struct ceph_osd *kickosd);
@@ -147,7 +147,7 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
                req = kzalloc(sizeof(*req), GFP_NOFS);
        }
        if (req == NULL)
-                return ERR_PTR(-ENOMEM);
+                return NULL;
        req->r_osdc = osdc;
        req->r_mempool = use_mempool;
@@ -164,10 +164,10 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
                msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
        else
                msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY,
-                                   OSD_OPREPLY_FRONT_LEN, 0, 0, NULL);
+                                   OSD_OPREPLY_FRONT_LEN, GFP_NOFS);
-        if (IS_ERR(msg)) {
+        if (!msg) {
                ceph_osdc_put_request(req);
-                return ERR_PTR(PTR_ERR(msg));
+                return NULL;
        }
        req->r_reply = msg;
@@ -178,10 +178,10 @@ struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
        if (use_mempool)
                msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
        else
-                msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, 0, 0, NULL);
+                msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, GFP_NOFS);
-        if (IS_ERR(msg)) {
+        if (!msg) {
                ceph_osdc_put_request(req);
-                return ERR_PTR(PTR_ERR(msg));
+                return NULL;
        }
        msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP);
        memset(msg->front.iov_base, 0, msg->front.iov_len);
@@ -361,8 +361,13 @@ static void put_osd(struct ceph_osd *osd)
 {
        dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref),
             atomic_read(&osd->o_ref) - 1);
-        if (atomic_dec_and_test(&osd->o_ref))
+        if (atomic_dec_and_test(&osd->o_ref)) {
+                struct ceph_auth_client *ac = osd->o_osdc->client->monc.auth;
+                if (osd->o_authorizer)
+                        ac->ops->destroy_authorizer(ac, osd->o_authorizer);
                kfree(osd);
+        }
 }
 /*
@@ -715,7 +720,7 @@ static void handle_timeout(struct work_struct *work)
         * should mark the osd as failed and we should find out about
         * it from an updated osd map.
         */
-        while (!list_empty(&osdc->req_lru)) {
+        while (timeout && !list_empty(&osdc->req_lru)) {
                req = list_entry(osdc->req_lru.next, struct ceph_osd_request,
                                 r_req_lru_item);
@@ -1078,6 +1083,7 @@ done:
        if (newmap)
                kick_requests(osdc, NULL);
        up_read(&osdc->map_sem);
+        wake_up(&osdc->client->auth_wq);
        return;
 bad:
@@ -1087,45 +1093,6 @@ bad:
        return;
 }
-/*
- * A read request prepares specific pages that data is to be read into.
- * When a message is being read off the wire, we call prepare_pages to
- * find those pages.
- *  0 = success, -1 failure.
- */
-static int __prepare_pages(struct ceph_connection *con,
-                         struct ceph_msg_header *hdr,
-                         struct ceph_osd_request *req,
-                         u64 tid,
-                         struct ceph_msg *m)
-{
-        struct ceph_osd *osd = con->private;
-        struct ceph_osd_client *osdc;
-        int ret = -1;
-        int data_len = le32_to_cpu(hdr->data_len);
-        unsigned data_off = le16_to_cpu(hdr->data_off);
-        int want = calc_pages_for(data_off & ~PAGE_MASK, data_len);
-        if (!osd)
-                return -1;
-        osdc = osd->o_osdc;
-        dout("__prepare_pages on msg %p tid %llu, has %d pages, want %d\n", m,
-             tid, req->r_num_pages, want);
-        if (unlikely(req->r_num_pages < want))
-                goto out;
-        m->pages = req->r_pages;
-        m->nr_pages = req->r_num_pages;
-        ret = 0; /* success */
-out:
-        BUG_ON(ret < 0 || m->nr_pages < want);
-        return ret;
-}
 /*
 * Register request, send initial attempt.
 */
@@ -1252,11 +1219,13 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
        if (!osdc->req_mempool)
                goto out;
-        err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true);
+        err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true,
+                                "osd_op");
        if (err < 0)
                goto out_mempool;
        err = ceph_msgpool_init(&osdc->msgpool_op_reply,
-                                OSD_OPREPLY_FRONT_LEN, 10, true);
+                                OSD_OPREPLY_FRONT_LEN, 10, true,
+                                "osd_op_reply");
        if (err < 0)
                goto out_msgpool;
        return 0;
@@ -1302,8 +1271,8 @@ int ceph_osdc_readpages(struct ceph_osd_client *osdc,
                                    CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
                                    NULL, 0, truncate_seq, truncate_size, NULL,
                                    false, 1);
-        if (IS_ERR(req))
+        if (!req)
-                return PTR_ERR(req);
+                return -ENOMEM;
        /* it may be a short read due to an object boundary */
        req->r_pages = pages;
@@ -1345,8 +1314,8 @@ int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
                                    snapc, do_sync,
                                    truncate_seq, truncate_size, mtime,
                                    nofail, 1);
-        if (IS_ERR(req))
+        if (!req)
-                return PTR_ERR(req);
+                return -ENOMEM;
        /* it may be a short write due to an object boundary */
        req->r_pages = pages;
@@ -1394,7 +1363,8 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
 }
 /*
- * lookup and return message for incoming reply
+ * lookup and return message for incoming reply.  set up reply message
+ * pages.
 */
 static struct ceph_msg *get_reply(struct ceph_connection *con,
                                  struct ceph_msg_header *hdr,
@@ -1407,7 +1377,6 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
        int front = le32_to_cpu(hdr->front_len);
        int data_len = le32_to_cpu(hdr->data_len);
        u64 tid;
-        int err;
        tid = le64_to_cpu(hdr->tid);
        mutex_lock(&osdc->request_mutex);
@@ -1425,13 +1394,14 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
                     req->r_reply, req->r_con_filling_msg);
                ceph_con_revoke_message(req->r_con_filling_msg, req->r_reply);
                ceph_con_put(req->r_con_filling_msg);
+                req->r_con_filling_msg = NULL;
        }
        if (front > req->r_reply->front.iov_len) {
                pr_warning("get_reply front %d > preallocated %d\n",
                           front, (int)req->r_reply->front.iov_len);
-                m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, 0, 0, NULL);
+                m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, GFP_NOFS);
-                if (IS_ERR(m))
+                if (!m)
                        goto out;
                ceph_msg_put(req->r_reply);
                req->r_reply = m;
@@ -1439,12 +1409,19 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
        m = ceph_msg_get(req->r_reply);
        if (data_len > 0) {
-                err = __prepare_pages(con, hdr, req, tid, m);
+                unsigned data_off = le16_to_cpu(hdr->data_off);
-                if (err < 0) {
+                int want = calc_pages_for(data_off & ~PAGE_MASK, data_len);
+                if (unlikely(req->r_num_pages < want)) {
+                        pr_warning("tid %lld reply %d > expected %d pages\n",
+                                   tid, want, m->nr_pages);
                        *skip = 1;
                        ceph_msg_put(m);
-                        m = ERR_PTR(err);
+                        m = NULL;
+                        goto out;
                }
+                m->pages = req->r_pages;
+                m->nr_pages = req->r_num_pages;
        }
        *skip = 0;
        req->r_con_filling_msg = ceph_con_get(con);
@@ -1466,7 +1443,7 @@ static struct ceph_msg *alloc_msg(struct ceph_connection *con,
        switch (type) {
        case CEPH_MSG_OSD_MAP:
-                return ceph_msg_new(type, front, 0, 0, NULL);
+                return ceph_msg_new(type, front, GFP_NOFS);
        case CEPH_MSG_OSD_OPREPLY:
                return get_reply(con, hdr, skip);
        default:
@@ -1552,7 +1529,7 @@ static int invalidate_authorizer(struct ceph_connection *con)
        return ceph_monc_validate_auth(&osdc->client->monc);
 }
-const static struct ceph_connection_operations osd_con_ops = {
+static const struct ceph_connection_operations osd_con_ops = {
        .get = get_osd_con,
        .put = put_osd_con,
        .dispatch = dispatch,
diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c
index cfdd8f4388b7..ddc656fb5c05 100644
--- a/fs/ceph/osdmap.c
+++ b/fs/ceph/osdmap.c
@@ -706,7 +706,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
                     len, *p, end);
                newcrush = crush_decode(*p, min(*p+len, end));
                if (IS_ERR(newcrush))
-                        return ERR_PTR(PTR_ERR(newcrush));
+                        return ERR_CAST(newcrush);
        }
        /* new flags? */
diff --git a/fs/ceph/pagelist.c b/fs/ceph/pagelist.c
index 5f8dbf7c745a..b6859f47d364 100644
--- a/fs/ceph/pagelist.c
+++ b/fs/ceph/pagelist.c
@@ -20,7 +20,7 @@ int ceph_pagelist_release(struct ceph_pagelist *pl)
 static int ceph_pagelist_addpage(struct ceph_pagelist *pl)
 {
-        struct page *page = alloc_page(GFP_NOFS);
+        struct page *page = __page_cache_alloc(GFP_NOFS);
        if (!page)
                return -ENOMEM;
        pl->room += PAGE_SIZE;
diff --git a/fs/ceph/rados.h b/fs/ceph/rados.h
index fd56451a871f..8fcc023056c7 100644
--- a/fs/ceph/rados.h
+++ b/fs/ceph/rados.h
@@ -101,8 +101,8 @@ struct ceph_pg_pool {
        __le64 snap_seq;          /* seq for per-pool snapshot */
        __le32 snap_epoch;        /* epoch of last snap */
        __le32 num_snaps;
-        __le32 num_removed_snap_intervals;
+        __le32 num_removed_snap_intervals; /* if non-empty, NO per-pool snaps */
-        __le64 uid;
+        __le64 auid;               /* who owns the pg */
 } __attribute__ ((packed));
 /*
@@ -208,6 +208,7 @@ enum {
        /* read */
        CEPH_OSD_OP_GETXATTR  = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1,
        CEPH_OSD_OP_GETXATTRS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 2,
+        CEPH_OSD_OP_CMPXATTR  = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 3,
        /* write */
        CEPH_OSD_OP_SETXATTR  = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 1,
@@ -305,6 +306,22 @@ enum {
 #define EOLDSNAPC    ERESTART  /* ORDERSNAP flag set; writer has old snapc*/
 #define EBLACKLISTED ESHUTDOWN /* blacklisted */
+/* xattr comparison */
+enum {
+        CEPH_OSD_CMPXATTR_OP_NOP = 0,
+        CEPH_OSD_CMPXATTR_OP_EQ  = 1,
+        CEPH_OSD_CMPXATTR_OP_NE  = 2,
+        CEPH_OSD_CMPXATTR_OP_GT  = 3,
+        CEPH_OSD_CMPXATTR_OP_GTE = 4,
+        CEPH_OSD_CMPXATTR_OP_LT  = 5,
+        CEPH_OSD_CMPXATTR_OP_LTE = 6
+};
+enum {
+        CEPH_OSD_CMPXATTR_MODE_STRING = 1,
+        CEPH_OSD_CMPXATTR_MODE_U64    = 2
+};
 /*
 * an individual object operation.  each may be accompanied by some data
 * payload
@@ -321,6 +338,8 @@ struct ceph_osd_op {
                struct {
                        __le32 name_len;
                        __le32 value_len;
+                        __u8 cmp_op;       /* CEPH_OSD_CMPXATTR_OP_* */
+                        __u8 cmp_mode;     /* CEPH_OSD_CMPXATTR_MODE_* */
                } __attribute__ ((packed)) xattr;
                struct {
                        __u8 class_len;
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index d5114db70453..c0b26b6badba 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -512,7 +512,7 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
                            struct ceph_cap_snap *capsnap)
 {
        struct inode *inode = &ci->vfs_inode;
-        struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
+        struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
        BUG_ON(capsnap->writing);
        capsnap->size = inode->i_size;
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 9307bbee6fbe..4e0bee240b9d 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -8,14 +8,11 @@
 #include <linux/module.h>
 #include <linux/mount.h>
 #include <linux/parser.h>
-#include <linux/rwsem.h>
 #include <linux/sched.h>
 #include <linux/seq_file.h>
 #include <linux/slab.h>
 #include <linux/statfs.h>
 #include <linux/string.h>
-#include <linux/version.h>
-#include <linux/vmalloc.h>
 #include "decode.h"
 #include "super.h"
@@ -107,12 +104,40 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
 static int ceph_syncfs(struct super_block *sb, int wait)
 {
        dout("sync_fs %d\n", wait);
-        ceph_osdc_sync(&ceph_client(sb)->osdc);
+        ceph_osdc_sync(&ceph_sb_to_client(sb)->osdc);
-        ceph_mdsc_sync(&ceph_client(sb)->mdsc);
+        ceph_mdsc_sync(&ceph_sb_to_client(sb)->mdsc);
        dout("sync_fs %d done\n", wait);
        return 0;
 }
+static int default_congestion_kb(void)
+{
+        int congestion_kb;
+        /*
+         * Copied from NFS
+         *
+         * congestion size, scale with available memory.
+         *
+         *  64MB:    8192k
+         * 128MB:   11585k
+         * 256MB:   16384k
+         * 512MB:   23170k
+         *   1GB:   32768k
+         *   2GB:   46340k
+         *   4GB:   65536k
+         *   8GB:   92681k
+         *  16GB:  131072k
+         *
+         * This allows larger machines to have larger/more transfers.
+         * Limit the default to 256M
+         */
+        congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
+        if (congestion_kb > 256*1024)
+                congestion_kb = 256*1024;
+        return congestion_kb;
+}
 /**
 * ceph_show_options - Show mount options in /proc/mounts
@@ -138,6 +163,35 @@ static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
                seq_puts(m, ",nocrc");
        if (args->flags & CEPH_OPT_NOASYNCREADDIR)
                seq_puts(m, ",noasyncreaddir");
+        if (args->mount_timeout != CEPH_MOUNT_TIMEOUT_DEFAULT)
+                seq_printf(m, ",mount_timeout=%d", args->mount_timeout);
+        if (args->osd_idle_ttl != CEPH_OSD_IDLE_TTL_DEFAULT)
+                seq_printf(m, ",osd_idle_ttl=%d", args->osd_idle_ttl);
+        if (args->osd_timeout != CEPH_OSD_TIMEOUT_DEFAULT)
+                seq_printf(m, ",osdtimeout=%d", args->osd_timeout);
+        if (args->osd_keepalive_timeout != CEPH_OSD_KEEPALIVE_DEFAULT)
+                seq_printf(m, ",osdkeepalivetimeout=%d",
+                         args->osd_keepalive_timeout);
+        if (args->wsize)
+                seq_printf(m, ",wsize=%d", args->wsize);
+        if (args->rsize != CEPH_MOUNT_RSIZE_DEFAULT)
+                seq_printf(m, ",rsize=%d", args->rsize);
+        if (args->congestion_kb != default_congestion_kb())
+                seq_printf(m, ",write_congestion_kb=%d", args->congestion_kb);
+        if (args->caps_wanted_delay_min != CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT)
+                seq_printf(m, ",caps_wanted_delay_min=%d",
+                         args->caps_wanted_delay_min);
+        if (args->caps_wanted_delay_max != CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT)
+                seq_printf(m, ",caps_wanted_delay_max=%d",
+                           args->caps_wanted_delay_max);
+        if (args->cap_release_safety != CEPH_CAP_RELEASE_SAFETY_DEFAULT)
+                seq_printf(m, ",cap_release_safety=%d",
+                           args->cap_release_safety);
+        if (args->max_readdir != CEPH_MAX_READDIR_DEFAULT)
+                seq_printf(m, ",readdir_max_entries=%d", args->max_readdir);
+        if (args->max_readdir_bytes != CEPH_MAX_READDIR_BYTES_DEFAULT)
+                seq_printf(m, ",readdir_max_bytes=%d", args->max_readdir_bytes);
        if (strcmp(args->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
                seq_printf(m, ",snapdirname=%s", args->snapdir_name);
        if (args->name)
@@ -161,35 +215,6 @@ static void ceph_inode_init_once(void *foo)
        inode_init_once(&ci->vfs_inode);
 }
-static int default_congestion_kb(void)
-{
-        int congestion_kb;
-        /*
-         * Copied from NFS
-         *
-         * congestion size, scale with available memory.
-         *
-         *  64MB:    8192k
-         * 128MB:   11585k
-         * 256MB:   16384k
-         * 512MB:   23170k
-         *   1GB:   32768k
-         *   2GB:   46340k
-         *   4GB:   65536k
-         *   8GB:   92681k
-         *  16GB:  131072k
-         *
-         * This allows larger machines to have larger/more transfers.
-         * Limit the default to 256M
-         */
-        congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
-        if (congestion_kb > 256*1024)
-                congestion_kb = 256*1024;
-        return congestion_kb;
-}
 static int __init init_caches(void)
 {
        ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
@@ -308,7 +333,9 @@ enum {
        Opt_osd_idle_ttl,
        Opt_caps_wanted_delay_min,
        Opt_caps_wanted_delay_max,
+        Opt_cap_release_safety,
        Opt_readdir_max_entries,
+        Opt_readdir_max_bytes,
        Opt_congestion_kb,
        Opt_last_int,
        /* int args above */
@@ -339,7 +366,9 @@ static match_table_t arg_tokens = {
        {Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
        {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"},
        {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"},
+        {Opt_cap_release_safety, "cap_release_safety=%d"},
        {Opt_readdir_max_entries, "readdir_max_entries=%d"},
+        {Opt_readdir_max_bytes, "readdir_max_bytes=%d"},
        {Opt_congestion_kb, "write_congestion_kb=%d"},
        /* int args above */
        {Opt_snapdirname, "snapdirname=%s"},
@@ -388,8 +417,9 @@ static struct ceph_mount_args *parse_mount_args(int flags, char *options,
        args->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
        args->rsize = CEPH_MOUNT_RSIZE_DEFAULT;
        args->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
-        args->cap_release_safety = CEPH_CAPS_PER_RELEASE * 4;
+        args->cap_release_safety = CEPH_CAP_RELEASE_SAFETY_DEFAULT;
-        args->max_readdir = 1024;
+        args->max_readdir = CEPH_MAX_READDIR_DEFAULT;
+        args->max_readdir_bytes = CEPH_MAX_READDIR_BYTES_DEFAULT;
        args->congestion_kb = default_congestion_kb();
        /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */
@@ -497,6 +527,9 @@ static struct ceph_mount_args *parse_mount_args(int flags, char *options,
                case Opt_readdir_max_entries:
                        args->max_readdir = intval;
                        break;
+                case Opt_readdir_max_bytes:
+                        args->max_readdir_bytes = intval;
+                        break;
                case Opt_congestion_kb:
                        args->congestion_kb = intval;
                        break;
@@ -636,9 +669,17 @@ static void ceph_destroy_client(struct ceph_client *client)
        /* unmount */
        ceph_mdsc_stop(&client->mdsc);
-        ceph_monc_stop(&client->monc);
        ceph_osdc_stop(&client->osdc);
+        /*
+         * make sure mds and osd connections close out before destroying
+         * the auth module, which is needed to free those connections'
+         * ceph_authorizers.
+         */
+        ceph_msgr_flush();
+        ceph_monc_stop(&client->monc);
        ceph_adjust_min_caps(-client->min_caps);
        ceph_debugfs_client_cleanup(client);
@@ -682,9 +723,10 @@ int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid)
 /*
 * true if we have the mon map (and have thus joined the cluster)
 */
-static int have_mon_map(struct ceph_client *client)
+static int have_mon_and_osd_map(struct ceph_client *client)
 {
-        return client->monc.monmap && client->monc.monmap->epoch;
+        return client->monc.monmap && client->monc.monmap->epoch &&
+               client->osdc.osdmap && client->osdc.osdmap->epoch;
 }
 /*
@@ -704,7 +746,7 @@ static struct dentry *open_root_dentry(struct ceph_client *client,
        dout("open_root_inode opening '%s'\n", path);
        req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
        if (IS_ERR(req))
-                return ERR_PTR(PTR_ERR(req));
+                return ERR_CAST(req);
        req->r_path1 = kstrdup(path, GFP_NOFS);
        req->r_ino1.ino = CEPH_INO_ROOT;
        req->r_ino1.snap = CEPH_NOSNAP;
@@ -762,7 +804,7 @@ static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt,
        if (err < 0)
                goto out;
-        while (!have_mon_map(client)) {
+        while (!have_mon_and_osd_map(client)) {
                err = -EIO;
                if (timeout && time_after_eq(jiffies, started + timeout))
                        goto out;
@@ -770,8 +812,8 @@ static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt,
                /* wait */
                dout("mount waiting for mon_map\n");
                err = wait_event_interruptible_timeout(client->auth_wq,
-                               have_mon_map(client) || (client->auth_err < 0),
+                       have_mon_and_osd_map(client) || (client->auth_err < 0),
-                               timeout);
+                       timeout);
                if (err == -EINTR || err == -ERESTARTSYS)
                        goto out;
                if (client->auth_err < 0) {
@@ -884,6 +926,8 @@ static int ceph_compare_super(struct super_block *sb, void *data)
 /*
 * construct our own bdi so we can control readahead, etc.
 */
+static atomic_long_t bdi_seq = ATOMIC_INIT(0);
 static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client)
 {
        int err;
@@ -893,7 +937,8 @@ static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client)
                client->backing_dev_info.ra_pages =
                        (client->mount_args->rsize + PAGE_CACHE_SIZE - 1)
                        >> PAGE_SHIFT;
-        err = bdi_register_dev(&client->backing_dev_info, sb->s_dev);
+        err = bdi_register(&client->backing_dev_info, NULL, "ceph-%d",
+                           atomic_long_inc_return(&bdi_seq));
        if (!err)
                sb->s_bdi = &client->backing_dev_info;
        return err;
@@ -932,9 +977,9 @@ static int ceph_get_sb(struct file_system_type *fs_type,
                goto out;
        }
-        if (ceph_client(sb) != client) {
+        if (ceph_sb_to_client(sb) != client) {
                ceph_destroy_client(client);
-                client = ceph_client(sb);
+                client = ceph_sb_to_client(sb);
                dout("get_sb got existing client %p\n", client);
        } else {
                dout("get_sb using new client %p\n", client);
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 13513b80d87f..10a4a406e887 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -10,7 +10,6 @@
 #include <linux/fs.h>
 #include <linux/mempool.h>
 #include <linux/pagemap.h>
-#include <linux/slab.h>
 #include <linux/wait.h>
 #include <linux/writeback.h>
 #include <linux/slab.h>
@@ -52,24 +51,25 @@
 struct ceph_mount_args {
        int sb_flags;
+        int flags;
+        struct ceph_fsid fsid;
+        struct ceph_entity_addr my_addr;
        int num_mon;
        struct ceph_entity_addr *mon_addr;
-        int flags;
        int mount_timeout;
        int osd_idle_ttl;
-        int caps_wanted_delay_min, caps_wanted_delay_max;
-        struct ceph_fsid fsid;
-        struct ceph_entity_addr my_addr;
-        int wsize;
-        int rsize;            /* max readahead */
-        int max_readdir;      /* max readdir size */
-        int congestion_kb;      /* max readdir size */
        int osd_timeout;
        int osd_keepalive_timeout;
+        int wsize;
+        int rsize;            /* max readahead */
+        int congestion_kb;    /* max writeback in flight */
+        int caps_wanted_delay_min, caps_wanted_delay_max;
+        int cap_release_safety;
+        int max_readdir;       /* max readdir result (entries) */
+        int max_readdir_bytes; /* max readdir result (bytes) */
        char *snapdir_name;   /* default ".snap" */
        char *name;
        char *secret;
-        int cap_release_safety;
 };
 /*
@@ -80,13 +80,14 @@ struct ceph_mount_args {
 #define CEPH_OSD_KEEPALIVE_DEFAULT  5
 #define CEPH_OSD_IDLE_TTL_DEFAULT    60
 #define CEPH_MOUNT_RSIZE_DEFAULT    (512*1024) /* readahead */
+#define CEPH_MAX_READDIR_DEFAULT    1024
+#define CEPH_MAX_READDIR_BYTES_DEFAULT    (512*1024)
 #define CEPH_MSG_MAX_FRONT_LEN  (16*1024*1024)
 #define CEPH_MSG_MAX_DATA_LEN   (16*1024*1024)
 #define CEPH_SNAPDIRNAME_DEFAULT ".snap"
 #define CEPH_AUTH_NAME_DEFAULT   "guest"
 /*
 * Delay telling the MDS we no longer want caps, in case we reopen
 * the file.  Delay a minimum amount of time, even if we send a cap
@@ -96,6 +97,7 @@ struct ceph_mount_args {
 #define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT      5  /* cap release delay */
 #define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT     60  /* cap release delay */
+#define CEPH_CAP_RELEASE_SAFETY_DEFAULT        (CEPH_CAPS_PER_RELEASE * 4)
 /* mount state */
 enum {
@@ -160,12 +162,6 @@ struct ceph_client {
 #endif
 };
-static inline struct ceph_client *ceph_client(struct super_block *sb)
-{
-        return sb->s_fs_info;
-}
 /*
 * File i/o capability.  This tracks shared state with the metadata
 * server that allows us to cache or writeback attributes or to read
@@ -814,7 +810,7 @@ extern void ceph_put_cap(struct ceph_cap *cap);
 extern void ceph_queue_caps_release(struct inode *inode);
 extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc);
-extern int ceph_fsync(struct file *file, struct dentry *dentry, int datasync);
+extern int ceph_fsync(struct file *file, int datasync);
 extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
                                    struct ceph_mds_session *session);
 extern int ceph_get_cap_mds(struct inode *inode);
@@ -871,6 +867,7 @@ extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
 extern void ceph_dentry_lru_add(struct dentry *dn);
 extern void ceph_dentry_lru_touch(struct dentry *dn);
 extern void ceph_dentry_lru_del(struct dentry *dn);
+extern void ceph_invalidate_dentry_lease(struct dentry *dentry);
 /*
 * our d_ops vary depending on whether the inode is live,
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 2845422907fc..68aeebc69681 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -7,7 +7,8 @@
 static bool ceph_is_valid_xattr(const char *name)
 {
-        return !strncmp(name, XATTR_SECURITY_PREFIX,
+        return !strncmp(name, "ceph.", 5) ||
+               !strncmp(name, XATTR_SECURITY_PREFIX,
                        XATTR_SECURITY_PREFIX_LEN) ||
               !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) ||
               !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
@@ -76,14 +77,14 @@ static size_t ceph_vxattrcb_rctime(struct ceph_inode_info *ci, char *val,
 }
 static struct ceph_vxattr_cb ceph_dir_vxattrs[] = {
-        { true, "user.ceph.dir.entries", ceph_vxattrcb_entries},
+        { true, "ceph.dir.entries", ceph_vxattrcb_entries},
-        { true, "user.ceph.dir.files", ceph_vxattrcb_files},
+        { true, "ceph.dir.files", ceph_vxattrcb_files},
-        { true, "user.ceph.dir.subdirs", ceph_vxattrcb_subdirs},
+        { true, "ceph.dir.subdirs", ceph_vxattrcb_subdirs},
-        { true, "user.ceph.dir.rentries", ceph_vxattrcb_rentries},
+        { true, "ceph.dir.rentries", ceph_vxattrcb_rentries},
-        { true, "user.ceph.dir.rfiles", ceph_vxattrcb_rfiles},
+        { true, "ceph.dir.rfiles", ceph_vxattrcb_rfiles},
-        { true, "user.ceph.dir.rsubdirs", ceph_vxattrcb_rsubdirs},
+        { true, "ceph.dir.rsubdirs", ceph_vxattrcb_rsubdirs},
-        { true, "user.ceph.dir.rbytes", ceph_vxattrcb_rbytes},
+        { true, "ceph.dir.rbytes", ceph_vxattrcb_rbytes},
-        { true, "user.ceph.dir.rctime", ceph_vxattrcb_rctime},
+        { true, "ceph.dir.rctime", ceph_vxattrcb_rctime},
        { true, NULL, NULL }
 };
@@ -107,7 +108,7 @@ static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
 }
 static struct ceph_vxattr_cb ceph_file_vxattrs[] = {
-        { true, "user.ceph.layout", ceph_vxattrcb_layout},
+        { true, "ceph.layout", ceph_vxattrcb_layout},
        { NULL, NULL }
 };
@@ -186,12 +187,6 @@ static int __set_xattr(struct ceph_inode_info *ci,
                ci->i_xattrs.names_size -= xattr->name_len;
                ci->i_xattrs.vals_size -= xattr->val_len;
        }
-        if (!xattr) {
-                pr_err("__set_xattr ENOMEM on %p %llx.%llx xattr %s=%s\n",
-                       &ci->vfs_inode, ceph_vinop(&ci->vfs_inode), name,
-                       xattr->val);
-                return -ENOMEM;
-        }
        ci->i_xattrs.names_size += name_len;
        ci->i_xattrs.vals_size += val_len;
        if (val)
@@ -574,7 +569,7 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
             ci->i_xattrs.version, ci->i_xattrs.index_version);
        if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) &&
-            (ci->i_xattrs.index_version > ci->i_xattrs.version)) {
+            (ci->i_xattrs.index_version >= ci->i_xattrs.version)) {
                goto list_xattr;
        } else {
                spin_unlock(&inode->i_lock);
@@ -622,7 +617,7 @@ out:
 static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
                              const char *value, size_t size, int flags)
 {
-        struct ceph_client *client = ceph_client(dentry->d_sb);
+        struct ceph_client *client = ceph_sb_to_client(dentry->d_sb);
        struct inode *inode = dentry->d_inode;
        struct ceph_inode_info *ci = ceph_inode(inode);
        struct inode *parent_inode = dentry->d_parent->d_inode;
@@ -641,7 +636,7 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name,
                        return -ENOMEM;
                err = -ENOMEM;
                for (i = 0; i < nr_pages; i++) {
-                        pages[i] = alloc_page(GFP_NOFS);
+                        pages[i] = __page_cache_alloc(GFP_NOFS);
                        if (!pages[i]) {
                                nr_pages = i;
                                goto out;
@@ -779,7 +774,7 @@ out:
 static int ceph_send_removexattr(struct dentry *dentry, const char *name)
 {
-        struct ceph_client *client = ceph_client(dentry->d_sb);
+        struct ceph_client *client = ceph_sb_to_client(dentry->d_sb);
        struct ceph_mds_client *mdsc = &client->mdsc;
        struct inode *inode = dentry->d_inode;
        struct inode *parent_inode = dentry->d_parent->d_inode;
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 0242ff9cbf41..a7eb65c84b1c 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -84,7 +84,7 @@ extern ssize_t cifs_user_read(struct file *file, char __user *read_data,
 extern ssize_t cifs_user_write(struct file *file, const char __user *write_data,
                         size_t write_size, loff_t *poffset);
 extern int cifs_lock(struct file *, int, struct file_lock *);
-extern int cifs_fsync(struct file *, struct dentry *, int);
+extern int cifs_fsync(struct file *, int);
 extern int cifs_flush(struct file *, fl_owner_t id);
 extern int cifs_file_mmap(struct file * , struct vm_area_struct *);
 extern const struct file_operations cifs_dir_ops;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index a83541ec9713..f1ff785b2292 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1676,7 +1676,7 @@ static int cifs_write_end(struct file *file, struct address_space *mapping,
        return rc;
 }
-int cifs_fsync(struct file *file, struct dentry *dentry, int datasync)
+int cifs_fsync(struct file *file, int datasync)
 {
        int xid;
        int rc = 0;
@@ -1688,7 +1688,7 @@ int cifs_fsync(struct file *file, struct dentry *dentry, int datasync)
        xid = GetXid();
        cFYI(1, "Sync file - name: %s datasync: 0x%x",
-                dentry->d_name.name, datasync);
+                file->f_path.dentry->d_name.name, datasync);
        rc = filemap_write_and_wait(inode->i_mapping);
        if (rc == 0) {
diff --git a/fs/coda/coda_int.h b/fs/coda/coda_int.h
index d99860a33890..6b443ff43a19 100644
--- a/fs/coda/coda_int.h
+++ b/fs/coda/coda_int.h
@@ -11,8 +11,7 @@ extern int coda_fake_statfs;
 void coda_destroy_inodecache(void);
 int coda_init_inodecache(void);
-int coda_fsync(struct file *coda_file, struct dentry *coda_dentry,
+int coda_fsync(struct file *coda_file, int datasync);
-               int datasync);
 void coda_sysctl_init(void);
 void coda_sysctl_clean(void);
diff --git a/fs/coda/file.c b/fs/coda/file.c
index 7196077b1688..ad3cd2abeeb4 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -202,10 +202,10 @@ int coda_release(struct inode *coda_inode, struct file *coda_file)
        return 0;
 }
-int coda_fsync(struct file *coda_file, struct dentry *coda_dentry, int datasync)
+int coda_fsync(struct file *coda_file, int datasync)
 {
        struct file *host_file;
-        struct inode *coda_inode = coda_dentry->d_inode;
+        struct inode *coda_inode = coda_file->f_path.dentry->d_inode;
        struct coda_file_info *cfi;
        int err = 0;
diff --git a/fs/coda/pioctl.c b/fs/coda/pioctl.c
index 773f2ce9aa06..ca25d96d45c9 100644
--- a/fs/coda/pioctl.c
+++ b/fs/coda/pioctl.c
@@ -1,6 +1,6 @@
 /*
 * Pioctl operations for Coda.
- * Original version: (C) 1996 Peter Braam 
+ * Original version: (C) 1996 Peter Braam
 * Rewritten for Linux 2.1: (C) 1997 Carnegie Mellon University
 *
 * Carnegie Mellon encourages users of this code to contribute improvements
@@ -23,21 +23,22 @@
 #include <linux/coda_fs_i.h>
 #include <linux/coda_psdev.h>
+#include <linux/smp_lock.h>
 /* pioctl ops */
 static int coda_ioctl_permission(struct inode *inode, int mask);
-static int coda_pioctl(struct inode * inode, struct file * filp, 
+static long coda_pioctl(struct file *filp, unsigned int cmd,
-                       unsigned int cmd, unsigned long user_data);
+                        unsigned long user_data);
 /* exported from this file */
-const struct inode_operations coda_ioctl_inode_operations =
+const struct inode_operations coda_ioctl_inode_operations = {
-{
        .permission     = coda_ioctl_permission,
        .setattr        = coda_setattr,
 };
 const struct file_operations coda_ioctl_operations = {
        .owner          = THIS_MODULE,
-        .ioctl          = coda_pioctl,
+        .unlocked_ioctl = coda_pioctl,
 };
 /* the coda pioctl inode ops */
@@ -46,48 +47,53 @@ static int coda_ioctl_permission(struct inode *inode, int mask)
        return (mask & MAY_EXEC) ? -EACCES : 0;
 }
-static int coda_pioctl(struct inode * inode, struct file * filp, 
+static long coda_pioctl(struct file *filp, unsigned int cmd,
-                       unsigned int cmd, unsigned long user_data)
+                        unsigned long user_data)
 {
        struct path path;
-        int error;
+        int error;
        struct PioctlData data;
-        struct inode *target_inode = NULL;
+        struct inode *inode = filp->f_dentry->d_inode;
-        struct coda_inode_info *cnp;
+        struct inode *target_inode = NULL;
+        struct coda_inode_info *cnp;
-        /* get the Pioctl data arguments from user space */
+        lock_kernel();
-        if (copy_from_user(&data, (void __user *)user_data, sizeof(data))) {
-            return -EINVAL;
+        /* get the Pioctl data arguments from user space */
-        }
+        if (copy_from_user(&data, (void __user *)user_data, sizeof(data))) {
-       
+                error = -EINVAL;
-        /* 
+                goto out;
-         * Look up the pathname. Note that the pathname is in 
-         * user memory, and namei takes care of this
-         */
-        if (data.follow) {
-                error = user_path(data.path, &path);
-        } else {
-                error = user_lpath(data.path, &path);
        }
-                
-        if ( error ) {
+        /*
-                return error;
+         * Look up the pathname. Note that the pathname is in
-        } else {
+         * user memory, and namei takes care of this
+         */
+        if (data.follow)
+                error = user_path(data.path, &path);
+        else
+                error = user_lpath(data.path, &path);
+        if (error)
+                goto out;
+        else
                target_inode = path.dentry->d_inode;
-        }
-        
        /* return if it is not a Coda inode */
-        if ( target_inode->i_sb != inode->i_sb ) {
+        if (target_inode->i_sb != inode->i_sb) {
                path_put(&path);
-                return  -EINVAL;
+                error = -EINVAL;
+                goto out;
        }
        /* now proceed to make the upcall */
-        cnp = ITOC(target_inode);
+        cnp = ITOC(target_inode);
        error = venus_pioctl(inode->i_sb, &(cnp->c_fid), cmd, &data);
        path_put(&path);
-        return error;
-}
+out:
+        unlock_kernel();
+        return error;
+}
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index be4392ca2098..66b9cf79c5ba 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -73,8 +73,7 @@ static unsigned int coda_psdev_poll(struct file *file, poll_table * wait)
        return mask;
 }
-static int coda_psdev_ioctl(struct inode * inode, struct file * filp, 
+static long coda_psdev_ioctl(struct file * filp, unsigned int cmd, unsigned long arg)
-                            unsigned int cmd, unsigned long arg)
 {
        unsigned int data;
@@ -344,7 +343,7 @@ static const struct file_operations coda_psdev_fops = {
        .read           = coda_psdev_read,
        .write          = coda_psdev_write,
        .poll           = coda_psdev_poll,
-        .ioctl          = coda_psdev_ioctl,
+        .unlocked_ioctl = coda_psdev_ioctl,
        .open           = coda_psdev_open,
        .release        = coda_psdev_release,
 };
diff --git a/fs/compat.c b/fs/compat.c
index 05448730f840..f0b391c50552 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -568,6 +568,79 @@ out:
        return ret;
 }
+/* A write operation does a read from user space and vice versa */
+#define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ)
+ssize_t compat_rw_copy_check_uvector(int type,
+                const struct compat_iovec __user *uvector, unsigned long nr_segs,
+                unsigned long fast_segs, struct iovec *fast_pointer,
+                struct iovec **ret_pointer)
+{
+        compat_ssize_t tot_len;
+        struct iovec *iov = *ret_pointer = fast_pointer;
+        ssize_t ret = 0;
+        int seg;
+        /*
+         * Copy a compat iovec array from user space into native struct iovec
+         * entries, validating each segment on the way.  The caller-supplied
+         * fast_pointer array is used when nr_segs <= fast_segs, otherwise an
+         * array is kmalloc()ed.  *ret_pointer is left pointing at
+         * fast_pointer unless that allocation succeeded, in which case it
+         * points at the allocated array (freeing it is not done here).
+         * Returns the summed length of all segments, 0 for an empty vector,
+         * or -EINVAL / -ENOMEM / -EFAULT.
+         *
+         * SuS says "The readv() function *may* fail if the iovcnt argument
+         * was less than or equal to 0, or greater than {IOV_MAX}."  Linux has
+         * traditionally returned zero for zero segments, so...
+         *
+         * NOTE(review): nr_segs is an unsigned long, so the "nr_segs < 0"
+         * half of the range check below can never be true.
+         */
+        if (nr_segs == 0)
+                goto out;
+        ret = -EINVAL;
+        if (nr_segs > UIO_MAXIOV || nr_segs < 0)
+                goto out;
+        if (nr_segs > fast_segs) {
+                ret = -ENOMEM;
+                iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
+                if (iov == NULL) {
+                        *ret_pointer = fast_pointer;
+                        goto out;
+                }
+        }
+        *ret_pointer = iov;
+        /*
+         * Single UNIX Specification:
+         * We should return -EINVAL if an element length is negative or does
+         * not fit in an ssize_t.  The total length must fit in an ssize_t too.
+         *
+         * Be careful here because iov_len is a size_t, not an ssize_t.
+         */
+        tot_len = 0;
+        ret = -EINVAL;
+        for (seg = 0; seg < nr_segs; seg++) {
+                compat_ssize_t tmp = tot_len;
+                compat_uptr_t buf;
+                compat_ssize_t len;
+                if (__get_user(len, &uvector->iov_len) ||
+                   __get_user(buf, &uvector->iov_base)) {
+                        ret = -EFAULT;
+                        goto out;
+                }
+                if (len < 0)    /* length does not fit in a compat_ssize_t */
+                        goto out;
+                tot_len += len;
+                if (tot_len < tmp) /* running total overflowed compat_ssize_t */
+                        goto out;
+                if (!access_ok(vrfy_dir(type), buf, len)) {
+                        ret = -EFAULT;
+                        goto out;
+                }
+                iov->iov_base = compat_ptr(buf);
+                iov->iov_len = (compat_size_t) len;
+                uvector++;
+                iov++;
+        }
+        ret = tot_len;
+out:
+        return ret;
+}
 static inline long
 copy_iocb(long nr, u32 __user *ptr32, struct iocb __user * __user *ptr64)
 {
@@ -600,7 +673,7 @@ compat_sys_io_submit(aio_context_t ctx_id, int nr, u32 __user *iocb)
        iocb64 = compat_alloc_user_space(nr * sizeof(*iocb64));
        ret = copy_iocb(nr, iocb, iocb64);
        if (!ret)
-                ret = sys_io_submit(ctx_id, nr, iocb64);
+                ret = do_io_submit(ctx_id, nr, iocb64, 1);
        return ret;
 }
@@ -1077,70 +1150,21 @@ static ssize_t compat_do_readv_writev(int type, struct file *file,
 {
        compat_ssize_t tot_len;
        struct iovec iovstack[UIO_FASTIOV];
-        struct iovec *iov=iovstack, *vector;
+        struct iovec *iov;
        ssize_t ret;
-        int seg;
        io_fn_t fn;
        iov_fn_t fnv;
-        /*
-         * SuS says "The readv() function *may* fail if the iovcnt argument
-         * was less than or equal to 0, or greater than {IOV_MAX}.  Linux has
-         * traditionally returned zero for zero segments, so...
-         */
-        ret = 0;
-        if (nr_segs == 0)
-                goto out;
-        /*
-         * First get the "struct iovec" from user memory and
-         * verify all the pointers
-         */
        ret = -EINVAL;
-        if ((nr_segs > UIO_MAXIOV) || (nr_segs <= 0))
-                goto out;
        if (!file->f_op)
                goto out;
-        if (nr_segs > UIO_FASTIOV) {
-                ret = -ENOMEM;
-                iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
-                if (!iov)
-                        goto out;
-        }
        ret = -EFAULT;
        if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector)))
                goto out;
-        /*
+        tot_len = compat_rw_copy_check_uvector(type, uvector, nr_segs,
-         * Single unix specification:
+                                               UIO_FASTIOV, iovstack, &iov);
-         * We should -EINVAL if an element length is not >= 0 and fitting an
-         * ssize_t.  The total length is fitting an ssize_t
-         *
-         * Be careful here because iov_len is a size_t not an ssize_t
-         */
-        tot_len = 0;
-        vector = iov;
-        ret = -EINVAL;
-        for (seg = 0 ; seg < nr_segs; seg++) {
-                compat_ssize_t tmp = tot_len;
-                compat_ssize_t len;
-                compat_uptr_t buf;
-                if (__get_user(len, &uvector->iov_len) ||
-                    __get_user(buf, &uvector->iov_base)) {
-                        ret = -EFAULT;
-                        goto out;
-                }
-                if (len < 0)    /* size_t not fitting an compat_ssize_t .. */
-                        goto out;
-                tot_len += len;
-                if (tot_len < tmp) /* maths overflow on the compat_ssize_t */
-                        goto out;
-                vector->iov_base = compat_ptr(buf);
-                vector->iov_len = (compat_size_t) len;
-                uvector++;
-                vector++;
-        }
        if (tot_len == 0) {
                ret = 0;
                goto out;
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index c8af2d91174b..41645142b88b 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -72,16 +72,11 @@ int configfs_setattr(struct dentry * dentry, struct iattr * iattr)
        if (!sd)
                return -EINVAL;
-        sd_iattr = sd->s_iattr;
+        error = simple_setattr(dentry, iattr);
-        error = inode_change_ok(inode, iattr);
-        if (error)
-                return error;
-        error = inode_setattr(inode, iattr);
        if (error)
                return error;
+        sd_iattr = sd->s_iattr;
        if (!sd_iattr) {
                /* setting attributes for the first time, allocate now */
                sd_iattr = kzalloc(sizeof(struct iattr), GFP_KERNEL);
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 4d74fc72c195..0210898458b2 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -277,8 +277,10 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_x32, debugfs_u32_get, debugfs_u32_set, "0x%08llx\n"
 DEFINE_SIMPLE_ATTRIBUTE(fops_x32_ro, debugfs_u32_get, NULL, "0x%08llx\n");
 DEFINE_SIMPLE_ATTRIBUTE(fops_x32_wo, NULL, debugfs_u32_set, "0x%08llx\n");
+DEFINE_SIMPLE_ATTRIBUTE(fops_x64, debugfs_u64_get, debugfs_u64_set, "0x%016llx\n");
 /*
- * debugfs_create_x{8,16,32} - create a debugfs file that is used to read and write an unsigned {8,16,32}-bit value
+ * debugfs_create_x{8,16,32,64} - create a debugfs file that is used to read and write an unsigned {8,16,32,64}-bit value
 *
 * These functions are exactly the same as the above functions (but use a hex
 * output for the decimal challenged). For details look at the above unsigned
@@ -357,6 +359,23 @@ struct dentry *debugfs_create_x32(const char *name, mode_t mode,
 }
 EXPORT_SYMBOL_GPL(debugfs_create_x32);
+/**
+ * debugfs_create_x64 - create a debugfs file that is used to read and write an unsigned 64-bit value
+ * @name: a pointer to a string containing the name of the file to create.
+ * @mode: the permission that the file should have
+ * @parent: a pointer to the parent dentry for this file.  This should be a
+ *          directory dentry if set.  If this parameter is %NULL, then the
+ *          file will be created in the root of the debugfs filesystem.
+ * @value: a pointer to the variable that the file should read from and
+ *         write to.
+ *
+ * The file is created with fops_x64, so the value is formatted as
+ * zero-padded hexadecimal ("0x%016llx\n").  The return value is whatever
+ * debugfs_create_file() returns for the new file.
+ */
+struct dentry *debugfs_create_x64(const char *name, mode_t mode,
+                                 struct dentry *parent, u64 *value)
+{
+        return debugfs_create_file(name, mode, parent, value, &fops_x64);
+}
+EXPORT_SYMBOL_GPL(debugfs_create_x64);
 static int debugfs_size_t_set(void *data, u64 val)
 {
diff --git a/fs/direct-io.c b/fs/direct-io.c
index e82adc2debb7..7600aacf531d 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -82,6 +82,8 @@ struct dio {
        int reap_counter;               /* rate limit reaping */
        get_block_t *get_block;         /* block mapping function */
        dio_iodone_t *end_io;           /* IO completion function */
+        dio_submit_t *submit_io;        /* IO submission function */
+        loff_t logical_offset_in_bio;   /* current first logical block in bio */
        sector_t final_block_in_bio;    /* current final block in bio + 1 */
        sector_t next_block_for_io;     /* next block to be put under IO,
                                           in dio_blocks units */
@@ -96,6 +98,7 @@ struct dio {
        unsigned cur_page_offset;       /* Offset into it, in bytes */
        unsigned cur_page_len;          /* Nr of bytes at cur_page_offset */
        sector_t cur_page_block;        /* Where it starts */
+        loff_t cur_page_fs_offset;      /* Offset in file */
        /* BIO completion state */
        spinlock_t bio_lock;            /* protects BIO fields below */
@@ -300,6 +303,26 @@ static void dio_bio_end_io(struct bio *bio, int error)
        spin_unlock_irqrestore(&dio->bio_lock, flags);
 }
+/**
+ * dio_end_io - handle the end io action for the given bio
+ * @bio: The direct io bio that's being completed
+ * @error: Error if there was one
+ *
+ * This is meant to be called by any filesystem that uses its own dio_submit_t
+ * so that the DIO specific endio actions are dealt with after the filesystem
+ * has done its completion work.
+ *
+ * The dio is taken from @bio->bi_private.  Completion is forwarded to
+ * dio_bio_end_aio() when dio->is_async is set and to dio_bio_end_io()
+ * otherwise.
+ */
+void dio_end_io(struct bio *bio, int error)
+{
+        struct dio *dio = bio->bi_private;
+        if (dio->is_async)
+                dio_bio_end_aio(bio, error);
+        else
+                dio_bio_end_io(bio, error);
+}
+EXPORT_SYMBOL_GPL(dio_end_io);
 static int
 dio_bio_alloc(struct dio *dio, struct block_device *bdev,
                sector_t first_sector, int nr_vecs)
@@ -316,6 +339,7 @@ dio_bio_alloc(struct dio *dio, struct block_device *bdev,
                bio->bi_end_io = dio_bio_end_io;
        dio->bio = bio;
+        dio->logical_offset_in_bio = dio->cur_page_fs_offset;
        return 0;
 }
@@ -340,10 +364,15 @@ static void dio_bio_submit(struct dio *dio)
        if (dio->is_async && dio->rw == READ)
                bio_set_pages_dirty(bio);
-        submit_bio(dio->rw, bio);
+        if (dio->submit_io)
+                dio->submit_io(dio->rw, bio, dio->inode,
+                               dio->logical_offset_in_bio);
+        else
+                submit_bio(dio->rw, bio);
        dio->bio = NULL;
        dio->boundary = 0;
+        dio->logical_offset_in_bio = 0;
 }
 /*
@@ -603,10 +632,26 @@ static int dio_send_cur_page(struct dio *dio)
        int ret = 0;
        if (dio->bio) {
+                loff_t cur_offset = dio->block_in_file << dio->blkbits;
+                loff_t bio_next_offset = dio->logical_offset_in_bio +
+                        dio->bio->bi_size;
                /*
-                 * See whether this new request is contiguous with the old
+                 * See whether this new request is contiguous with the old.
+                 *
+                 * Btrfs cannot handle having logically non-contiguous requests
+                 * submitted.  For example if you have
+                 *
+                 * Logical:  [0-4095][HOLE][8192-12287]
+                 * Physical: [0-4095]      [4096-8191]
+                 *
+                 * We cannot submit those pages together as one BIO.  So if our
+                 * current logical offset in the file does not equal what would
+                 * be the next logical offset in the bio, submit the bio we
+                 * have.
                 */
-                if (dio->final_block_in_bio != dio->cur_page_block)
+                if (dio->final_block_in_bio != dio->cur_page_block ||
+                    cur_offset != bio_next_offset)
                        dio_bio_submit(dio);
                /*
                 * Submit now if the underlying fs is about to perform a
@@ -701,6 +746,7 @@ submit_page_section(struct dio *dio, struct page *page,
        dio->cur_page_offset = offset;
        dio->cur_page_len = len;
        dio->cur_page_block = blocknr;
+        dio->cur_page_fs_offset = dio->block_in_file << dio->blkbits;
 out:
        return ret;
 }
@@ -935,7 +981,7 @@ static ssize_t
 direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, 
        const struct iovec *iov, loff_t offset, unsigned long nr_segs, 
        unsigned blkbits, get_block_t get_block, dio_iodone_t end_io,
-        struct dio *dio)
+        dio_submit_t submit_io, struct dio *dio)
 {
        unsigned long user_addr; 
        unsigned long flags;
@@ -952,6 +998,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
        dio->get_block = get_block;
        dio->end_io = end_io;
+        dio->submit_io = submit_io;
        dio->final_block_in_bio = -1;
        dio->next_block_for_io = -1;
@@ -1008,7 +1055,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
                }
        } /* end iovec loop */
-        if (ret == -ENOTBLK && (rw & WRITE)) {
+        if (ret == -ENOTBLK) {
                /*
                 * The remaining part of the request will be
                 * be handled by buffered I/O when we return
@@ -1087,30 +1134,11 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
        return ret;
 }
-/*
- * This is a library function for use by filesystem drivers.
- *
- * The locking rules are governed by the flags parameter:
- *  - if the flags value contains DIO_LOCKING we use a fancy locking
- *    scheme for dumb filesystems.
- *    For writes this function is called under i_mutex and returns with
- *    i_mutex held, for reads, i_mutex is not held on entry, but it is
- *    taken and dropped again before returning.
- *    For reads and writes i_alloc_sem is taken in shared mode and released
- *    on I/O completion (which may happen asynchronously after returning to
- *    the caller).
- *
- *  - if the flags value does NOT contain DIO_LOCKING we don't use any
- *    internal locking but rather rely on the filesystem to synchronize
- *    direct I/O reads/writes versus each other and truncate.
- *    For reads and writes both i_mutex and i_alloc_sem are not held on
- *    entry and are never taken.
- */
 ssize_t
-__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
+__blockdev_direct_IO_newtrunc(int rw, struct kiocb *iocb, struct inode *inode,
        struct block_device *bdev, const struct iovec *iov, loff_t offset, 
        unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
-        int flags)
+        dio_submit_t submit_io, int flags)
 {
        int seg;
        size_t size;
@@ -1197,11 +1225,49 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
                (end > i_size_read(inode)));
        retval = direct_io_worker(rw, iocb, inode, iov, offset,
-                                nr_segs, blkbits, get_block, end_io, dio);
+                                nr_segs, blkbits, get_block, end_io,
+                                submit_io, dio);
+out:
+        return retval;
+}
+EXPORT_SYMBOL(__blockdev_direct_IO_newtrunc);
+/*
+ * This is a library function for use by filesystem drivers.
+ *
+ * The locking rules are governed by the flags parameter:
+ *  - if the flags value contains DIO_LOCKING we use a fancy locking
+ *    scheme for dumb filesystems.
+ *    For writes this function is called under i_mutex and returns with
+ *    i_mutex held, for reads, i_mutex is not held on entry, but it is
+ *    taken and dropped again before returning.
+ *    For reads and writes i_alloc_sem is taken in shared mode and released
+ *    on I/O completion (which may happen asynchronously after returning to
+ *    the caller).
+ *
+ *  - if the flags value does NOT contain DIO_LOCKING we don't use any
+ *    internal locking but rather rely on the filesystem to synchronize
+ *    direct I/O reads/writes versus each other and truncate.
+ *    For reads and writes both i_mutex and i_alloc_sem are not held on
+ *    entry and are never taken.
+ */
+ssize_t
+__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
+        struct block_device *bdev, const struct iovec *iov, loff_t offset,
+        unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
+        dio_submit_t submit_io, int flags)
+{
+        ssize_t retval;
+        retval = __blockdev_direct_IO_newtrunc(rw, iocb, inode, bdev, iov,
+                        offset, nr_segs, get_block, end_io, submit_io, flags);
        /*
         * In case of error extending write may have instantiated a few
         * blocks outside i_size. Trim these off again for DIO_LOCKING.
+         * NOTE: DIO_NO_LOCK/DIO_OWN_LOCK callers have to handle this in
+         * their own manner. This is a further example of where the old
+         * truncate sequence is inadequate.
         *
         * NOTE: filesystems with their own locking have to handle this
         * on their own.
@@ -1209,12 +1275,13 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
        if (flags & DIO_LOCKING) {
                if (unlikely((rw & WRITE) && retval < 0)) {
                        loff_t isize = i_size_read(inode);
+                        loff_t end = offset + iov_length(iov, nr_segs);
                        if (end > isize)
                                vmtruncate(inode, isize);
                }
        }
-out:
        return retval;
 }
 EXPORT_SYMBOL(__blockdev_direct_IO);
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 3bdddbcc785f..e8fcf4e2ed7d 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -274,7 +274,7 @@ static int ecryptfs_release(struct inode *inode, struct file *file)
 }
 static int
-ecryptfs_fsync(struct file *file, struct dentry *dentry, int datasync)
+ecryptfs_fsync(struct file *file, int datasync)
 {
        return vfs_fsync(ecryptfs_file_to_lower(file), datasync);
 }
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 65dee2f336ae..31ef5252f0fe 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -805,7 +805,7 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
                                    - (ia->ia_size & ~PAGE_CACHE_MASK));
                if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
-                        rc = vmtruncate(inode, ia->ia_size);
+                        rc = simple_setsize(inode, ia->ia_size);
                        if (rc)
                                goto out;
                        lower_ia->ia_size = ia->ia_size;
@@ -830,7 +830,7 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
                                goto out;
                        }
                }
-                vmtruncate(inode, ia->ia_size);
+                simple_setsize(inode, ia->ia_size);
                rc = ecryptfs_write_inode_size_to_metadata(inode);
                if (rc) {
                        printk(KERN_ERR "Problem with "
diff --git a/fs/exec.c b/fs/exec.c
index e6e94c626c2c..e19de6a80339 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -242,9 +242,10 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
         * use STACK_TOP because that can depend on attributes which aren't
         * configured yet.
         */
+        BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP);
        vma->vm_end = STACK_TOP_MAX;
        vma->vm_start = vma->vm_end - PAGE_SIZE;
-        vma->vm_flags = VM_STACK_FLAGS;
+        vma->vm_flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP;
        vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
        INIT_LIST_HEAD(&vma->anon_vma_chain);
        err = insert_vm_struct(mm, vma);
@@ -616,6 +617,7 @@ int setup_arg_pages(struct linux_binprm *bprm,
        else if (executable_stack == EXSTACK_DISABLE_X)
                vm_flags &= ~VM_EXEC;
        vm_flags |= mm->def_flags;
+        vm_flags |= VM_STACK_INCOMPLETE_SETUP;
        ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end,
                        vm_flags);
@@ -630,6 +632,9 @@ int setup_arg_pages(struct linux_binprm *bprm,
                        goto out_unlock;
        }
+        /* mprotect_fixup is overkill to remove the temporary stack flags */
+        vma->vm_flags &= ~VM_STACK_INCOMPLETE_SETUP;
        stack_expand = 131072UL; /* randomly 32*4k (or 2*64k) pages */
        stack_size = vma->vm_end - vma->vm_start;
        /*
@@ -763,7 +768,6 @@ static int de_thread(struct task_struct *tsk)
        struct signal_struct *sig = tsk->signal;
        struct sighand_struct *oldsighand = tsk->sighand;
        spinlock_t *lock = &oldsighand->siglock;
-        int count;
        if (thread_group_empty(tsk))
                goto no_thread_group;
@@ -780,13 +784,13 @@ static int de_thread(struct task_struct *tsk)
                spin_unlock_irq(lock);
                return -EAGAIN;
        }
        sig->group_exit_task = tsk;
-        zap_other_threads(tsk);
+        sig->notify_count = zap_other_threads(tsk);
+        if (!thread_group_leader(tsk))
+                sig->notify_count--;
-        /* Account for the thread group leader hanging around: */
+        while (sig->notify_count) {
-        count = thread_group_leader(tsk) ? 1 : 2;
-        sig->notify_count = count;
-        while (atomic_read(&sig->count) > count) {
                __set_current_state(TASK_UNINTERRUPTIBLE);
                spin_unlock_irq(lock);
                schedule();
@@ -1657,12 +1661,15 @@ static int coredump_wait(int exit_code, struct core_state *core_state)
        struct task_struct *tsk = current;
        struct mm_struct *mm = tsk->mm;
        struct completion *vfork_done;
-        int core_waiters;
+        int core_waiters = -EBUSY;
        init_completion(&core_state->startup);
        core_state->dumper.task = tsk;
        core_state->dumper.next = NULL;
-        core_waiters = zap_threads(tsk, mm, core_state, exit_code);
+        down_write(&mm->mmap_sem);
+        if (!mm->core_state)
+                core_waiters = zap_threads(tsk, mm, core_state, exit_code);
        up_write(&mm->mmap_sem);
        if (unlikely(core_waiters < 0))
@@ -1782,21 +1789,61 @@ static void wait_for_dump_helpers(struct file *file)
 }
+/*
+ * umh_pipe_setup
+ * Helper function to customize the process used
+ * to collect the core in userspace.  Specifically,
+ * it sets up a pipe and installs its read end as fd 0
+ * (stdin) for the process.  Returns 0 on success, or
+ * the PTR_ERR() error code on failure.
+ * Note that it also sets the core limit to 1.  This
+ * is a special value that we use to trap recursive
+ * core dumps.
+ */
+static int umh_pipe_setup(struct subprocess_info *info)
+{
+        struct file *rp, *wp;
+        struct fdtable *fdt;
+        struct coredump_params *cp = (struct coredump_params *)info->data;
+        struct files_struct *cf = current->files;
+        wp = create_write_pipe(0);
+        if (IS_ERR(wp))
+                return PTR_ERR(wp);
+        rp = create_read_pipe(wp, 0);
+        if (IS_ERR(rp)) {
+                free_write_pipe(wp);
+                return PTR_ERR(rp);
+        }
+        cp->file = wp;
+        sys_close(0);
+        fd_install(0, rp);
+        spin_lock(&cf->file_lock);
+        fdt = files_fdtable(cf);
+        FD_SET(0, fdt->open_fds);
+        FD_CLR(0, fdt->close_on_exec);
+        spin_unlock(&cf->file_lock);
+        /* and disallow core files too */
+        current->signal->rlim[RLIMIT_CORE] = (struct rlimit){1, 1};
+        return 0;
+}
 void do_coredump(long signr, int exit_code, struct pt_regs *regs)
 {
        struct core_state core_state;
        char corename[CORENAME_MAX_SIZE + 1];
        struct mm_struct *mm = current->mm;
        struct linux_binfmt * binfmt;
-        struct inode * inode;
        const struct cred *old_cred;
        struct cred *cred;
        int retval = 0;
        int flag = 0;
-        int ispipe = 0;
+        int ispipe;
-        char **helper_argv = NULL;
-        int helper_argc = 0;
-        int dump_count = 0;
        static atomic_t core_dump_count = ATOMIC_INIT(0);
        struct coredump_params cprm = {
                .signr = signr,
@@ -1815,23 +1862,12 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
        binfmt = mm->binfmt;
        if (!binfmt || !binfmt->core_dump)
                goto fail;
+        if (!__get_dumpable(cprm.mm_flags))
-        cred = prepare_creds();
-        if (!cred) {
-                retval = -ENOMEM;
                goto fail;
-        }
-        down_write(&mm->mmap_sem);
+        cred = prepare_creds();
-        /*
+        if (!cred)
-         * If another thread got here first, or we are not dumpable, bail out.
-         */
-        if (mm->core_state || !__get_dumpable(cprm.mm_flags)) {
-                up_write(&mm->mmap_sem);
-                put_cred(cred);
                goto fail;
-        }
        /*
         *      We cannot trust fsuid as being the "true" uid of the
         *      process nor do we know its entire history. We only know it
@@ -1844,10 +1880,8 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
        }
        retval = coredump_wait(exit_code, &core_state);
-        if (retval < 0) {
+        if (retval < 0)
-                put_cred(cred);
+                goto fail_creds;
-                goto fail;
-        }
        old_cred = override_creds(cred);
@@ -1865,19 +1899,19 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
        ispipe = format_corename(corename, signr);
        unlock_kernel();
-        if ((!ispipe) && (cprm.limit < binfmt->min_coredump))
-                goto fail_unlock;
        if (ispipe) {
-                if (cprm.limit == 0) {
+                int dump_count;
+                char **helper_argv;
+                if (cprm.limit == 1) {
                        /*
                         * Normally core limits are irrelevant to pipes, since
                         * we're not writing to the file system, but we use
-                         * cprm.limit of 0 here as a speacial value. Any
+                         * cprm.limit of 1 here as a special value. Any
-                         * non-zero limit gets set to RLIM_INFINITY below, but
+                         * non-1 limit gets set to RLIM_INFINITY below, but
                         * a limit of 0 skips the dump.  This is a consistent
                         * way to catch recursive crashes.  We can still crash
-                         * if the core_pattern binary sets RLIM_CORE =  !0
+                         * if the core_pattern binary sets RLIM_CORE =  !1
                         * but it runs as root, and can do lots of stupid things
                         * Note that we use task_tgid_vnr here to grab the pid
                         * of the process group leader.  That way we get the
@@ -1885,11 +1919,12 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
                         * core_pattern process dies.
                         */
                        printk(KERN_WARNING
-                                "Process %d(%s) has RLIMIT_CORE set to 0\n",
+                                "Process %d(%s) has RLIMIT_CORE set to 1\n",
                                task_tgid_vnr(current), current->comm);
                        printk(KERN_WARNING "Aborting core\n");
                        goto fail_unlock;
                }
+                cprm.limit = RLIM_INFINITY;
                dump_count = atomic_inc_return(&core_dump_count);
                if (core_pipe_limit && (core_pipe_limit < dump_count)) {
@@ -1899,71 +1934,74 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
                        goto fail_dropcount;
                }
-                helper_argv = argv_split(GFP_KERNEL, corename+1, &helper_argc);
+                helper_argv = argv_split(GFP_KERNEL, corename+1, NULL);
                if (!helper_argv) {
                        printk(KERN_WARNING "%s failed to allocate memory\n",
                               __func__);
                        goto fail_dropcount;
                }
-                cprm.limit = RLIM_INFINITY;
+                retval = call_usermodehelper_fns(helper_argv[0], helper_argv,
+                                        NULL, UMH_WAIT_EXEC, umh_pipe_setup,
-                /* SIGPIPE can happen, but it's just never processed */
+                                        NULL, &cprm);
-                if (call_usermodehelper_pipe(helper_argv[0], helper_argv, NULL,
+                argv_free(helper_argv);
-                                &cprm.file)) {
+                if (retval) {
                        printk(KERN_INFO "Core dump to %s pipe failed\n",
                               corename);
-                        goto fail_dropcount;
+                        goto close_fail;
                }
-        } else
+        } else {
+                struct inode *inode;
+                if (cprm.limit < binfmt->min_coredump)
+                        goto fail_unlock;
                cprm.file = filp_open(corename,
                                 O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag,
                                 0600);
-        if (IS_ERR(cprm.file))
+                if (IS_ERR(cprm.file))
-                goto fail_dropcount;
+                        goto fail_unlock;
-        inode = cprm.file->f_path.dentry->d_inode;
-        if (inode->i_nlink > 1)
-                goto close_fail;        /* multiple links - don't dump */
-        if (!ispipe && d_unhashed(cprm.file->f_path.dentry))
-                goto close_fail;
-        /* AK: actually i see no reason to not allow this for named pipes etc.,
-           but keep the previous behaviour for now. */
-        if (!ispipe && !S_ISREG(inode->i_mode))
-                goto close_fail;
-        /*
-         * Dont allow local users get cute and trick others to coredump
-         * into their pre-created files:
-         * Note, this is not relevant for pipes
-         */
-        if (!ispipe && (inode->i_uid != current_fsuid()))
-                goto close_fail;
-        if (!cprm.file->f_op)
-                goto close_fail;
-        if (!cprm.file->f_op->write)
-                goto close_fail;
-        if (!ispipe &&
-            do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file) != 0)
-                goto close_fail;
-        retval = binfmt->core_dump(&cprm);
+                inode = cprm.file->f_path.dentry->d_inode;
+                if (inode->i_nlink > 1)
+                        goto close_fail;
+                if (d_unhashed(cprm.file->f_path.dentry))
+                        goto close_fail;
+                /*
+                 * AK: actually i see no reason to not allow this for named
+                 * pipes etc, but keep the previous behaviour for now.
+                 */
+                if (!S_ISREG(inode->i_mode))
+                        goto close_fail;
+                /*
+                 * Don't allow local users to get cute and trick others into
+                 * dumping core into their pre-created files.
+                 */
+                if (inode->i_uid != current_fsuid())
+                        goto close_fail;
+                if (!cprm.file->f_op || !cprm.file->f_op->write)
+                        goto close_fail;
+                if (do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file))
+                        goto close_fail;
+        }
+        retval = binfmt->core_dump(&cprm);
        if (retval)
                current->signal->group_exit_code |= 0x80;
-close_fail:
        if (ispipe && core_pipe_limit)
                wait_for_dump_helpers(cprm.file);
-        filp_close(cprm.file, NULL);
+close_fail:
+        if (cprm.file)
+                filp_close(cprm.file, NULL);
 fail_dropcount:
-        if (dump_count)
+        if (ispipe)
                atomic_dec(&core_dump_count);
 fail_unlock:
-        if (helper_argv)
+        coredump_finish(mm);
-                argv_free(helper_argv);
        revert_creds(old_cred);
+fail_creds:
        put_cred(cred);
-        coredump_finish(mm);
 fail:
        return;
 }
diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c
index 4cfab1cc75c0..d91e9d829bc1 100644
--- a/fs/exofs/dir.c
+++ b/fs/exofs/dir.c
@@ -608,7 +608,7 @@ int exofs_make_empty(struct inode *inode, struct inode *parent)
        de->inode_no = cpu_to_le64(parent->i_ino);
        memcpy(de->name, PARENT_DIR, sizeof(PARENT_DIR));
        exofs_set_de_type(de, inode);
-        kunmap_atomic(page, KM_USER0);
+        kunmap_atomic(kaddr, KM_USER0);
        err = exofs_commit_chunk(page, 0, chunk_size);
 fail:
        page_cache_release(page);
diff --git a/fs/exofs/file.c b/fs/exofs/file.c
index 839b9dc1e70f..fef6899be397 100644
--- a/fs/exofs/file.c
+++ b/fs/exofs/file.c
@@ -40,12 +40,11 @@ static int exofs_release_file(struct inode *inode, struct file *filp)
        return 0;
 }
-static int exofs_file_fsync(struct file *filp, struct dentry *dentry,
+static int exofs_file_fsync(struct file *filp, int datasync)
-                            int datasync)
 {
        int ret;
        struct address_space *mapping = filp->f_mapping;
-        struct inode *inode = dentry->d_inode;
+        struct inode *inode = mapping->host;
        struct super_block *sb;
        ret = filemap_write_and_wait(mapping);
@@ -66,7 +65,7 @@ static int exofs_file_fsync(struct file *filp, struct dentry *dentry,
 static int exofs_flush(struct file *file, fl_owner_t id)
 {
-        exofs_file_fsync(file, file->f_path.dentry, 1);
+        exofs_file_fsync(file, 1);
        /* TODO: Flush the OSD target */
        return 0;
 }
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index d7c6afa79754..4bb6ef822e46 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -755,6 +755,21 @@ static int exofs_write_end(struct file *file, struct address_space *mapping,
        return ret;
 }
+static int exofs_releasepage(struct page *page, gfp_t gfp)
+{
+        EXOFS_DBGMSG("page 0x%lx\n", page->index);
+        WARN_ON(1);
+        return try_to_free_buffers(page);
+}
+static void exofs_invalidatepage(struct page *page, unsigned long offset)
+{
+        EXOFS_DBGMSG("page_has_buffers=>%d\n", page_has_buffers(page));
+        WARN_ON(1);
+        block_invalidatepage(page, offset);
+}
 const struct address_space_operations exofs_aops = {
        .readpage       = exofs_readpage,
        .readpages      = exofs_readpages,
@@ -762,6 +777,21 @@ const struct address_space_operations exofs_aops = {
        .writepages     = exofs_writepages,
        .write_begin    = exofs_write_begin_export,
        .write_end      = exofs_write_end,
+        .releasepage    = exofs_releasepage,
+        .set_page_dirty = __set_page_dirty_nobuffers,
+        .invalidatepage = exofs_invalidatepage,
+        /* Not implemented Yet */
+        .bmap           = NULL, /* TODO: use osd's OSD_ACT_READ_MAP */
+        .direct_IO      = NULL, /* TODO: Should be trivial to do */
+        /* With these NULL has special meaning or default is not exported */
+        .sync_page      = NULL,
+        .get_xip_mem    = NULL,
+        .migratepage    = NULL,
+        .launder_page   = NULL,
+        .is_partially_uptodate = NULL,
+        .error_remove_page = NULL,
 };
 /******************************************************************************
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 0b038e47ad2f..52b34f1d2738 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -122,7 +122,6 @@ extern int ext2_write_inode (struct inode *, struct writeback_control *);
 extern void ext2_delete_inode (struct inode *);
 extern int ext2_sync_inode (struct inode *);
 extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int);
-extern void ext2_truncate (struct inode *);
 extern int ext2_setattr (struct dentry *, struct iattr *);
 extern void ext2_set_inode_flags(struct inode *inode);
 extern void ext2_get_inode_flags(struct ext2_inode_info *);
@@ -155,7 +154,7 @@ extern void ext2_write_super (struct super_block *);
 extern const struct file_operations ext2_dir_operations;
 /* file.c */
-extern int ext2_fsync(struct file *file, struct dentry *dentry, int datasync);
+extern int ext2_fsync(struct file *file, int datasync);
 extern const struct inode_operations ext2_file_inode_operations;
 extern const struct file_operations ext2_file_operations;
 extern const struct file_operations ext2_xip_file_operations;
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 5d198d0697fb..49eec9456c5b 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -40,13 +40,13 @@ static int ext2_release_file (struct inode * inode, struct file * filp)
        return 0;
 }
-int ext2_fsync(struct file *file, struct dentry *dentry, int datasync)
+int ext2_fsync(struct file *file, int datasync)
 {
        int ret;
-        struct super_block *sb = dentry->d_inode->i_sb;
+        struct super_block *sb = file->f_mapping->host->i_sb;
        struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping;
-        ret = simple_fsync(file, dentry, datasync);
+        ret = generic_file_fsync(file, datasync);
        if (ret == -EIO || test_and_clear_bit(AS_EIO, &mapping->flags)) {
                /* We don't really know where the IO error happened... */
                ext2_error(sb, __func__,
@@ -95,7 +95,6 @@ const struct file_operations ext2_xip_file_operations = {
 #endif
 const struct inode_operations ext2_file_inode_operations = {
-        .truncate       = ext2_truncate,
 #ifdef CONFIG_EXT2_FS_XATTR
        .setxattr       = generic_setxattr,
        .getxattr       = generic_getxattr,
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 527c46d9bc1f..19214435b752 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -54,6 +54,18 @@ static inline int ext2_inode_is_fast_symlink(struct inode *inode)
                inode->i_blocks - ea_blocks == 0);
 }
+static void ext2_truncate_blocks(struct inode *inode, loff_t offset);
+static void ext2_write_failed(struct address_space *mapping, loff_t to)
+{
+        struct inode *inode = mapping->host;
+        if (to > inode->i_size) {
+                truncate_pagecache(inode, to, inode->i_size);
+                ext2_truncate_blocks(inode, inode->i_size);
+        }
+}
 /*
 * Called at the last iput() if i_nlink is zero.
 */
@@ -71,7 +83,7 @@ void ext2_delete_inode (struct inode * inode)
        inode->i_size = 0;
        if (inode->i_blocks)
-                ext2_truncate (inode);
+                ext2_truncate_blocks(inode, 0);
        ext2_free_inode (inode);
        return;
@@ -757,8 +769,8 @@ int __ext2_write_begin(struct file *file, struct address_space *mapping,
                loff_t pos, unsigned len, unsigned flags,
                struct page **pagep, void **fsdata)
 {
-        return block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
+        return block_write_begin_newtrunc(file, mapping, pos, len, flags,
-                                                        ext2_get_block);
+                                        pagep, fsdata, ext2_get_block);
 }
 static int
@@ -766,8 +778,25 @@ ext2_write_begin(struct file *file, struct address_space *mapping,
                loff_t pos, unsigned len, unsigned flags,
                struct page **pagep, void **fsdata)
 {
+        int ret;
        *pagep = NULL;
-        return __ext2_write_begin(file, mapping, pos, len, flags, pagep,fsdata);
+        ret = __ext2_write_begin(file, mapping, pos, len, flags, pagep, fsdata);
+        if (ret < 0)
+                ext2_write_failed(mapping, pos + len);
+        return ret;
+}
+static int ext2_write_end(struct file *file, struct address_space *mapping,
+                        loff_t pos, unsigned len, unsigned copied,
+                        struct page *page, void *fsdata)
+{
+        int ret;
+        ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
+        if (ret < len)
+                ext2_write_failed(mapping, pos + len);
+        return ret;
 }
 static int
@@ -775,13 +804,18 @@ ext2_nobh_write_begin(struct file *file, struct address_space *mapping,
                loff_t pos, unsigned len, unsigned flags,
                struct page **pagep, void **fsdata)
 {
+        int ret;
        /*
         * Dir-in-pagecache still uses ext2_write_begin. Would have to rework
         * directory handling code to pass around offsets rather than struct
         * pages in order to make this work easily.
         */
-        return nobh_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
+        ret = nobh_write_begin_newtrunc(file, mapping, pos, len, flags, pagep,
-                                                        ext2_get_block);
+                                                fsdata, ext2_get_block);
+        if (ret < 0)
+                ext2_write_failed(mapping, pos + len);
+        return ret;
 }
 static int ext2_nobh_writepage(struct page *page,
@@ -800,10 +834,15 @@ ext2_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
                        loff_t offset, unsigned long nr_segs)
 {
        struct file *file = iocb->ki_filp;
-        struct inode *inode = file->f_mapping->host;
+        struct address_space *mapping = file->f_mapping;
+        struct inode *inode = mapping->host;
-        return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
+        ssize_t ret;
-                                offset, nr_segs, ext2_get_block, NULL);
+        ret = blockdev_direct_IO_newtrunc(rw, iocb, inode, inode->i_sb->s_bdev,
+                                iov, offset, nr_segs, ext2_get_block, NULL);
+        if (ret < 0 && (rw & WRITE))
+                ext2_write_failed(mapping, offset + iov_length(iov, nr_segs));
+        return ret;
 }
 static int
@@ -818,7 +857,7 @@ const struct address_space_operations ext2_aops = {
        .writepage              = ext2_writepage,
        .sync_page              = block_sync_page,
        .write_begin            = ext2_write_begin,
-        .write_end              = generic_write_end,
+        .write_end              = ext2_write_end,
        .bmap                   = ext2_bmap,
        .direct_IO              = ext2_direct_IO,
        .writepages             = ext2_writepages,
@@ -1027,7 +1066,7 @@ static void ext2_free_branches(struct inode *inode, __le32 *p, __le32 *q, int de
                ext2_free_data(inode, p, q);
 }
-void ext2_truncate(struct inode *inode)
+static void __ext2_truncate_blocks(struct inode *inode, loff_t offset)
 {
        __le32 *i_data = EXT2_I(inode)->i_data;
        struct ext2_inode_info *ei = EXT2_I(inode);
@@ -1039,27 +1078,8 @@ void ext2_truncate(struct inode *inode)
        int n;
        long iblock;
        unsigned blocksize;
-        if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
-            S_ISLNK(inode->i_mode)))
-                return;
-        if (ext2_inode_is_fast_symlink(inode))
-                return;
-        if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
-                return;
        blocksize = inode->i_sb->s_blocksize;
-        iblock = (inode->i_size + blocksize-1)
+        iblock = (offset + blocksize-1) >> EXT2_BLOCK_SIZE_BITS(inode->i_sb);
-                                        >> EXT2_BLOCK_SIZE_BITS(inode->i_sb);
-        if (mapping_is_xip(inode->i_mapping))
-                xip_truncate_page(inode->i_mapping, inode->i_size);
-        else if (test_opt(inode->i_sb, NOBH))
-                nobh_truncate_page(inode->i_mapping,
-                                inode->i_size, ext2_get_block);
-        else
-                block_truncate_page(inode->i_mapping,
-                                inode->i_size, ext2_get_block);
        n = ext2_block_to_path(inode, iblock, offsets, NULL);
        if (n == 0)
@@ -1127,6 +1147,62 @@ do_indirects:
        ext2_discard_reservation(inode);
        mutex_unlock(&ei->truncate_mutex);
+}
+static void ext2_truncate_blocks(struct inode *inode, loff_t offset)
+{
+        /*
+         * XXX: it seems like a bug here that we don't allow
+         * IS_APPEND inode to have blocks-past-i_size trimmed off.
+         * review and fix this.
+         *
+         * Also would be nice to be able to handle IO errors and such,
+         * but that's probably too much to ask.
+         */
+        if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+            S_ISLNK(inode->i_mode)))
+                return;
+        if (ext2_inode_is_fast_symlink(inode))
+                return;
+        if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+                return;
+        __ext2_truncate_blocks(inode, offset);
+}
+int ext2_setsize(struct inode *inode, loff_t newsize)
+{
+        loff_t oldsize;
+        int error;
+        error = inode_newsize_ok(inode, newsize);
+        if (error)
+                return error;
+        if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+            S_ISLNK(inode->i_mode)))
+                return -EINVAL;
+        if (ext2_inode_is_fast_symlink(inode))
+                return -EINVAL;
+        if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+                return -EPERM;
+        if (mapping_is_xip(inode->i_mapping))
+                error = xip_truncate_page(inode->i_mapping, newsize);
+        else if (test_opt(inode->i_sb, NOBH))
+                error = nobh_truncate_page(inode->i_mapping,
+                                newsize, ext2_get_block);
+        else
+                error = block_truncate_page(inode->i_mapping,
+                                newsize, ext2_get_block);
+        if (error)
+                return error;
+        oldsize = inode->i_size;
+        i_size_write(inode, newsize);
+        truncate_pagecache(inode, oldsize, newsize);
+        __ext2_truncate_blocks(inode, newsize);
        inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
        if (inode_needs_sync(inode)) {
                sync_mapping_buffers(inode->i_mapping);
@@ -1134,6 +1210,8 @@ do_indirects:
        } else {
                mark_inode_dirty(inode);
        }
+        return 0;
 }
 static struct ext2_inode *ext2_get_inode(struct super_block *sb, ino_t ino,
@@ -1474,8 +1552,15 @@ int ext2_setattr(struct dentry *dentry, struct iattr *iattr)
                if (error)
                        return error;
        }
-        error = inode_setattr(inode, iattr);
+        if (iattr->ia_valid & ATTR_SIZE) {
-        if (!error && (iattr->ia_valid & ATTR_MODE))
+                error = ext2_setsize(inode, iattr->ia_size);
+                if (error)
+                        return error;
+        }
+        generic_setattr(inode, iattr);
+        if (iattr->ia_valid & ATTR_MODE)
                error = ext2_acl_chmod(inode);
+        mark_inode_dirty(inode);
        return error;
 }
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 71e9eb1fa696..7ff43f4a59cd 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -119,6 +119,8 @@ static void ext2_put_super (struct super_block * sb)
        int i;
        struct ext2_sb_info *sbi = EXT2_SB(sb);
+        dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
        if (sb->s_dirt)
                ext2_write_super(sb);
@@ -1063,6 +1065,12 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
        sb->s_op = &ext2_sops;
        sb->s_export_op = &ext2_export_ops;
        sb->s_xattr = ext2_xattr_handlers;
+#ifdef CONFIG_QUOTA
+        sb->dq_op = &dquot_operations;
+        sb->s_qcop = &dquot_quotactl_ops;
+#endif
        root = ext2_iget(sb, EXT2_ROOT_INO);
        if (IS_ERR(root)) {
                ret = PTR_ERR(root);
@@ -1241,6 +1249,7 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
                        spin_unlock(&sbi->s_lock);
                        return 0;
                }
                /*
                 * OK, we are remounting a valid rw partition rdonly, so set
                 * the rdonly flag and then mark the partition as valid again.
@@ -1248,6 +1257,13 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
                es->s_state = cpu_to_le16(sbi->s_mount_state);
                es->s_mtime = cpu_to_le32(get_seconds());
                spin_unlock(&sbi->s_lock);
+                err = dquot_suspend(sb, -1);
+                if (err < 0) {
+                        spin_lock(&sbi->s_lock);
+                        goto restore_opts;
+                }
                ext2_sync_super(sb, es, 1);
        } else {
                __le32 ret = EXT2_HAS_RO_COMPAT_FEATURE(sb,
@@ -1269,8 +1285,12 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
                if (!ext2_setup_super (sb, es, 0))
                        sb->s_flags &= ~MS_RDONLY;
                spin_unlock(&sbi->s_lock);
                ext2_write_super(sb);
+                dquot_resume(sb, -1);
        }
        return 0;
 restore_opts:
        sbi->s_mount_opt = old_opts.s_mount_opt;
diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index 373fa90c796a..e2e72c367cf6 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -297,7 +297,7 @@ static void free_rb_tree_fname(struct rb_root *root)
                        kfree (old);
                }
                if (!parent)
-                        root->rb_node = NULL;
+                        *root = RB_ROOT;
                else if (parent->rb_left == n)
                        parent->rb_left = NULL;
                else if (parent->rb_right == n)
diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c
index fcf7487734b6..d7e9f74dc3a6 100644
--- a/fs/ext3/fsync.c
+++ b/fs/ext3/fsync.c
@@ -43,9 +43,9 @@
 * inode to disk.
 */
-int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync)
+int ext3_sync_file(struct file *file, int datasync)
 {
-        struct inode *inode = dentry->d_inode;
+        struct inode *inode = file->f_mapping->host;
        struct ext3_inode_info *ei = EXT3_I(inode);
        journal_t *journal = EXT3_SB(inode->i_sb)->s_journal;
        int ret, needs_barrier = 0;
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 0fc1293d0e96..6c953bb255e7 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -410,6 +410,8 @@ static void ext3_put_super (struct super_block * sb)
        struct ext3_super_block *es = sbi->s_es;
        int i, err;
+        dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
        lock_kernel();
        ext3_xattr_put_super(sb);
@@ -748,7 +750,7 @@ static int ext3_release_dquot(struct dquot *dquot);
 static int ext3_mark_dquot_dirty(struct dquot *dquot);
 static int ext3_write_info(struct super_block *sb, int type);
 static int ext3_quota_on(struct super_block *sb, int type, int format_id,
-                                char *path, int remount);
+                                char *path);
 static int ext3_quota_on_mount(struct super_block *sb, int type);
 static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data,
                               size_t len, loff_t off);
@@ -767,12 +769,12 @@ static const struct dquot_operations ext3_quota_operations = {
 static const struct quotactl_ops ext3_qctl_operations = {
        .quota_on       = ext3_quota_on,
-        .quota_off      = vfs_quota_off,
+        .quota_off      = dquot_quota_off,
-        .quota_sync     = vfs_quota_sync,
+        .quota_sync     = dquot_quota_sync,
-        .get_info       = vfs_get_dqinfo,
+        .get_info       = dquot_get_dqinfo,
-        .set_info       = vfs_set_dqinfo,
+        .set_info       = dquot_set_dqinfo,
-        .get_dqblk      = vfs_get_dqblk,
+        .get_dqblk      = dquot_get_dqblk,
-        .set_dqblk      = vfs_set_dqblk
+        .set_dqblk      = dquot_set_dqblk
 };
 #endif
@@ -1527,7 +1529,7 @@ static void ext3_orphan_cleanup (struct super_block * sb,
        /* Turn quotas off */
        for (i = 0; i < MAXQUOTAS; i++) {
                if (sb_dqopt(sb)->files[i])
-                        vfs_quota_off(sb, i, 0);
+                        dquot_quota_off(sb, i);
        }
 #endif
        sb->s_flags = s_flags; /* Restore MS_RDONLY status */
@@ -2551,6 +2553,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
        ext3_fsblk_t n_blocks_count = 0;
        unsigned long old_sb_flags;
        struct ext3_mount_options old_opts;
+        int enable_quota = 0;
        int err;
 #ifdef CONFIG_QUOTA
        int i;
@@ -2597,6 +2600,10 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
                }
                if (*flags & MS_RDONLY) {
+                        err = dquot_suspend(sb, -1);
+                        if (err < 0)
+                                goto restore_opts;
                        /*
                         * First of all, the unconditional stuff we have to do
                         * to disable replay of the journal when we next remount
@@ -2651,6 +2658,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
                                goto restore_opts;
                        if (!ext3_setup_super (sb, es, 0))
                                sb->s_flags &= ~MS_RDONLY;
+                        enable_quota = 1;
                }
        }
 #ifdef CONFIG_QUOTA
@@ -2662,6 +2670,9 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
 #endif
        unlock_super(sb);
        unlock_kernel();
+        if (enable_quota)
+                dquot_resume(sb, -1);
        return 0;
 restore_opts:
        sb->s_flags = old_sb_flags;
@@ -2851,24 +2862,21 @@ static int ext3_write_info(struct super_block *sb, int type)
 */
 static int ext3_quota_on_mount(struct super_block *sb, int type)
 {
-        return vfs_quota_on_mount(sb, EXT3_SB(sb)->s_qf_names[type],
+        return dquot_quota_on_mount(sb, EXT3_SB(sb)->s_qf_names[type],
-                        EXT3_SB(sb)->s_jquota_fmt, type);
+                                        EXT3_SB(sb)->s_jquota_fmt, type);
 }
 /*
 * Standard function to be called on quota_on
 */
 static int ext3_quota_on(struct super_block *sb, int type, int format_id,
-                         char *name, int remount)
+                         char *name)
 {
        int err;
        struct path path;
        if (!test_opt(sb, QUOTA))
                return -EINVAL;
-        /* When remounting, no checks are needed and in fact, name is NULL */
-        if (remount)
-                return vfs_quota_on(sb, type, format_id, name, remount);
        err = kern_path(name, LOOKUP_FOLLOW, &path);
        if (err)
@@ -2906,7 +2914,7 @@ static int ext3_quota_on(struct super_block *sb, int type, int format_id,
                }
        }
-        err = vfs_quota_on_path(sb, type, format_id, &path);
+        err = dquot_quota_on_path(sb, type, format_id, &path);
        path_put(&path);
        return err;
 }
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index d2f37a5516c7..95b7594c76f9 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -591,14 +591,15 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
        ret = ext4_mb_new_blocks(handle, &ar, errp);
        if (count)
                *count = ar.len;
        /*
-         * Account for the allocated meta blocks
+         * Account for the allocated meta blocks.  We will never
+         * fail EDQUOT for metadata, but we do account for it.
         */
        if (!(*errp) && EXT4_I(inode)->i_delalloc_reserved_flag) {
                spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
                EXT4_I(inode)->i_allocated_meta_blocks += ar.len;
                spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
+                dquot_alloc_block_nofail(inode, ar.len);
        }
        return ret;
 }
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 538c48655084..5b6973fbf1bd 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -72,9 +72,9 @@ static int add_system_zone(struct ext4_sb_info *sbi,
                else if (start_blk >= (entry->start_blk + entry->count))
                        n = &(*n)->rb_right;
                else {
-                        if (start_blk + count > (entry->start_blk + 
+                        if (start_blk + count > (entry->start_blk +
                                                 entry->count))
-                                entry->count = (start_blk + count - 
+                                entry->count = (start_blk + count -
                                                entry->start_blk);
                        new_node = *n;
                        new_entry = rb_entry(new_node, struct ext4_system_zone,
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 86cb6d86a048..ea5e6cb7e2a5 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -83,11 +83,10 @@ int ext4_check_dir_entry(const char *function, struct inode *dir,
                error_msg = "inode out of bounds";
        if (error_msg != NULL)
-                __ext4_error(dir->i_sb, function,
+                ext4_error_inode(function, dir,
-                        "bad entry in directory #%lu: %s - block=%llu"
+                        "bad entry in directory: %s - block=%llu"
                        "offset=%u(%u), inode=%u, rec_len=%d, name_len=%d",
-                        dir->i_ino, error_msg, 
+                        error_msg, (unsigned long long) bh->b_blocknr,
-                        (unsigned long long) bh->b_blocknr,     
                        (unsigned) (offset%bh->b_size), offset,
                        le32_to_cpu(de->inode),
                        rlen, de->name_len);
@@ -111,7 +110,7 @@ static int ext4_readdir(struct file *filp,
        if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
                                    EXT4_FEATURE_COMPAT_DIR_INDEX) &&
-            ((EXT4_I(inode)->i_flags & EXT4_INDEX_FL) ||
+            ((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) ||
             ((inode->i_size >> sb->s_blocksize_bits) == 1))) {
                err = ext4_dx_readdir(filp, dirent, filldir);
                if (err != ERR_BAD_DX_DIR) {
@@ -122,20 +121,20 @@ static int ext4_readdir(struct file *filp,
                 * We don't set the inode dirty flag since it's not
                 * critical that it get flushed back to the disk.
                 */
-                EXT4_I(filp->f_path.dentry->d_inode)->i_flags &= ~EXT4_INDEX_FL;
+                ext4_clear_inode_flag(filp->f_path.dentry->d_inode, EXT4_INODE_INDEX);
        }
        stored = 0;
        offset = filp->f_pos & (sb->s_blocksize - 1);
        while (!error && !stored && filp->f_pos < inode->i_size) {
-                ext4_lblk_t blk = filp->f_pos >> EXT4_BLOCK_SIZE_BITS(sb);
+                struct ext4_map_blocks map;
-                struct buffer_head map_bh;
                struct buffer_head *bh = NULL;
-                map_bh.b_state = 0;
+                map.m_lblk = filp->f_pos >> EXT4_BLOCK_SIZE_BITS(sb);
-                err = ext4_get_blocks(NULL, inode, blk, 1, &map_bh, 0);
+                map.m_len = 1;
+                err = ext4_map_blocks(NULL, inode, &map, 0);
                if (err > 0) {
-                        pgoff_t index = map_bh.b_blocknr >>
+                        pgoff_t index = map.m_pblk >>
                                        (PAGE_CACHE_SHIFT - inode->i_blkbits);
                        if (!ra_has_index(&filp->f_ra, index))
                                page_cache_sync_readahead(
@@ -143,7 +142,7 @@ static int ext4_readdir(struct file *filp,
                                        &filp->f_ra, filp,
                                        index, 1);
                        filp->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
-                        bh = ext4_bread(NULL, inode, blk, 0, &err);
+                        bh = ext4_bread(NULL, inode, map.m_lblk, 0, &err);
                }
                /*
@@ -152,9 +151,8 @@ static int ext4_readdir(struct file *filp,
                 */
                if (!bh) {
                        if (!dir_has_error) {
-                                ext4_error(sb, "directory #%lu "
+                                EXT4_ERROR_INODE(inode, "directory "
                                           "contains a hole at offset %Lu",
-                                           inode->i_ino,
                                           (unsigned long long) filp->f_pos);
                                dir_has_error = 1;
                        }
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index bf938cf7c5f0..19a4de57128a 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -29,6 +29,9 @@
 #include <linux/wait.h>
 #include <linux/blockgroup_lock.h>
 #include <linux/percpu_counter.h>
+#ifdef __KERNEL__
+#include <linux/compat.h>
+#endif
 /*
 * The fourth extended filesystem constants/structures
@@ -54,10 +57,10 @@
 #endif
 #define EXT4_ERROR_INODE(inode, fmt, a...) \
-        ext4_error_inode(__func__, (inode), (fmt), ## a);
+        ext4_error_inode(__func__, (inode), (fmt), ## a)
 #define EXT4_ERROR_FILE(file, fmt, a...)        \
-        ext4_error_file(__func__, (file), (fmt), ## a);
+        ext4_error_file(__func__, (file), (fmt), ## a)
 /* data type for block offset of block group */
 typedef int ext4_grpblk_t;
@@ -72,7 +75,7 @@ typedef __u32 ext4_lblk_t;
 typedef unsigned int ext4_group_t;
 /*
- * Flags used in mballoc's allocation_context flags field.  
+ * Flags used in mballoc's allocation_context flags field.
 *
 * Also used to show what's going on for debugging purposes when the
 * flag field is exported via the traceport interface
@@ -126,6 +129,29 @@ struct ext4_allocation_request {
 };
 /*
+ * Logical to physical block mapping, used by ext4_map_blocks()
+ *
+ * This structure is used to pass requests into ext4_map_blocks() as
+ * well as to store the information returned by ext4_map_blocks().  It
+ * takes less room on the stack than a struct buffer_head.
+ */
+#define EXT4_MAP_NEW            (1 << BH_New)
+#define EXT4_MAP_MAPPED         (1 << BH_Mapped)
+#define EXT4_MAP_UNWRITTEN      (1 << BH_Unwritten)
+#define EXT4_MAP_BOUNDARY       (1 << BH_Boundary)
+#define EXT4_MAP_UNINIT         (1 << BH_Uninit)
+#define EXT4_MAP_FLAGS          (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\
+                                 EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\
+                                 EXT4_MAP_UNINIT)
+struct ext4_map_blocks {
+        ext4_fsblk_t m_pblk;
+        ext4_lblk_t m_lblk;
+        unsigned int m_len;
+        unsigned int m_flags;
+};
+/*
 * For delayed allocation tracking
 */
 struct mpage_da_data {
@@ -321,6 +347,83 @@ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
                return flags & EXT4_OTHER_FLMASK;
 }
+/*
+ * Inode flags used for atomic set/get
+ */
+enum {
+        EXT4_INODE_SECRM        = 0,    /* Secure deletion */
+        EXT4_INODE_UNRM         = 1,    /* Undelete */
+        EXT4_INODE_COMPR        = 2,    /* Compress file */
+        EXT4_INODE_SYNC         = 3,    /* Synchronous updates */
+        EXT4_INODE_IMMUTABLE    = 4,    /* Immutable file */
+        EXT4_INODE_APPEND       = 5,    /* writes to file may only append */
+        EXT4_INODE_NODUMP       = 6,    /* do not dump file */
+        EXT4_INODE_NOATIME      = 7,    /* do not update atime */
+/* Reserved for compression usage... */
+        EXT4_INODE_DIRTY        = 8,
+        EXT4_INODE_COMPRBLK     = 9,    /* One or more compressed clusters */
+        EXT4_INODE_NOCOMPR      = 10,   /* Don't compress */
+        EXT4_INODE_ECOMPR       = 11,   /* Compression error */
+/* End compression flags --- maybe not all used */
+        EXT4_INODE_INDEX        = 12,   /* hash-indexed directory */
+        EXT4_INODE_IMAGIC       = 13,   /* AFS directory */
+        EXT4_INODE_JOURNAL_DATA = 14,   /* file data should be journaled */
+        EXT4_INODE_NOTAIL       = 15,   /* file tail should not be merged */
+        EXT4_INODE_DIRSYNC      = 16,   /* dirsync behaviour (directories only) */
+        EXT4_INODE_TOPDIR       = 17,   /* Top of directory hierarchies*/
+        EXT4_INODE_HUGE_FILE    = 18,   /* Set to each huge file */
+        EXT4_INODE_EXTENTS      = 19,   /* Inode uses extents */
+        EXT4_INODE_EA_INODE     = 21,   /* Inode used for large EA */
+        EXT4_INODE_EOFBLOCKS    = 22,   /* Blocks allocated beyond EOF */
+        EXT4_INODE_RESERVED     = 31,   /* reserved for ext4 lib */
+};
+#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1 << EXT4_INODE_##FLAG))
+#define CHECK_FLAG_VALUE(FLAG) if (!TEST_FLAG_VALUE(FLAG)) { \
+        printk(KERN_EMERG "EXT4 flag fail: " #FLAG ": %d %d\n", \
+                EXT4_##FLAG##_FL, EXT4_INODE_##FLAG); BUG_ON(1); }
+/*
+ * Since it's pretty easy to mix up bit numbers and hex values, and we
+ * can't do a compile-time test for ENUM values, we use a run-time
+ * test to make sure that EXT4_XXX_FL is consistent with respect to
+ * EXT4_INODE_XXX.  If all is well the printk and BUG_ON will all drop
+ * out so it won't cost any extra space in the compiled kernel image.
+ * But it's important that these values are the same, since we are
+ * using EXT4_INODE_XXX to test for the flag values, but EXT4_XX_FL
+ * must be consistent with the values of FS_XXX_FL defined in
+ * include/linux/fs.h and the on-disk values found in ext2, ext3, and
+ * ext4 filesystems, and of course the values defined in e2fsprogs.
+ *
+ * It's not paranoia if Murphy's Law really *is* out to get you.  :-)
+ */
+static inline void ext4_check_flag_values(void)
+{
+        CHECK_FLAG_VALUE(SECRM);
+        CHECK_FLAG_VALUE(UNRM);
+        CHECK_FLAG_VALUE(COMPR);
+        CHECK_FLAG_VALUE(SYNC);
+        CHECK_FLAG_VALUE(IMMUTABLE);
+        CHECK_FLAG_VALUE(APPEND);
+        CHECK_FLAG_VALUE(NODUMP);
+        CHECK_FLAG_VALUE(NOATIME);
+        CHECK_FLAG_VALUE(DIRTY);
+        CHECK_FLAG_VALUE(COMPRBLK);
+        CHECK_FLAG_VALUE(NOCOMPR);
+        CHECK_FLAG_VALUE(ECOMPR);
+        CHECK_FLAG_VALUE(INDEX);
+        CHECK_FLAG_VALUE(IMAGIC);
+        CHECK_FLAG_VALUE(JOURNAL_DATA);
+        CHECK_FLAG_VALUE(NOTAIL);
+        CHECK_FLAG_VALUE(DIRSYNC);
+        CHECK_FLAG_VALUE(TOPDIR);
+        CHECK_FLAG_VALUE(HUGE_FILE);
+        CHECK_FLAG_VALUE(EXTENTS);
+        CHECK_FLAG_VALUE(EA_INODE);
+        CHECK_FLAG_VALUE(EOFBLOCKS);
+        CHECK_FLAG_VALUE(RESERVED);
+}
 /* Used to pass group descriptor data when online resize is done */
 struct ext4_new_group_input {
        __u32 group;            /* Group number for this data */
@@ -332,6 +435,18 @@ struct ext4_new_group_input {
        __u16 unused;
 };
+#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
+struct compat_ext4_new_group_input {
+        u32 group;
+        compat_u64 block_bitmap;
+        compat_u64 inode_bitmap;
+        compat_u64 inode_table;
+        u32 blocks_count;
+        u16 reserved_blocks;
+        u16 unused;
+};
+#endif
 /* The struct ext4_new_group_input in kernel space, with free_blocks_count */
 struct ext4_new_group_data {
        __u32 group;
@@ -355,7 +470,7 @@ struct ext4_new_group_data {
 #define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT       (EXT4_GET_BLOCKS_UNINIT_EXT|\
                                                 EXT4_GET_BLOCKS_CREATE)
        /* Caller is from the delayed allocation writeout path,
-           so set the magic i_delalloc_reserve_flag after taking the 
+           so set the magic i_delalloc_reserve_flag after taking the
           inode allocation semaphore for */
 #define EXT4_GET_BLOCKS_DELALLOC_RESERVE        0x0004
        /* caller is from the direct IO path, request to creation of an
@@ -398,6 +513,7 @@ struct ext4_new_group_data {
 #define EXT4_IOC_ALLOC_DA_BLKS          _IO('f', 12)
 #define EXT4_IOC_MOVE_EXT               _IOWR('f', 15, struct move_extent)
+#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
 /*
 * ioctl commands in 32 bit emulation
 */
@@ -408,11 +524,13 @@ struct ext4_new_group_data {
 #define EXT4_IOC32_GETRSVSZ             _IOR('f', 5, int)
 #define EXT4_IOC32_SETRSVSZ             _IOW('f', 6, int)
 #define EXT4_IOC32_GROUP_EXTEND         _IOW('f', 7, unsigned int)
+#define EXT4_IOC32_GROUP_ADD            _IOW('f', 8, struct compat_ext4_new_group_input)
 #ifdef CONFIG_JBD2_DEBUG
 #define EXT4_IOC32_WAIT_FOR_READONLY    _IOR('f', 99, int)
 #endif
 #define EXT4_IOC32_GETVERSION_OLD       FS_IOC32_GETVERSION
 #define EXT4_IOC32_SETVERSION_OLD       FS_IOC32_SETVERSION
+#endif
 /*
@@ -616,9 +734,8 @@ struct ext4_ext_cache {
 */
 struct ext4_inode_info {
        __le32  i_data[15];     /* unconverted */
-        __u32   i_flags;
-        ext4_fsblk_t    i_file_acl;
        __u32   i_dtime;
+        ext4_fsblk_t    i_file_acl;
        /*
         * i_block_group is the number of the block group which contains
@@ -629,6 +746,7 @@ struct ext4_inode_info {
         */
        ext4_group_t    i_block_group;
        unsigned long   i_state_flags;          /* Dynamic state flags */
+        unsigned long   i_flags;
        ext4_lblk_t             i_dir_start_lookup;
 #ifdef CONFIG_EXT4_FS_XATTR
@@ -1062,22 +1180,25 @@ enum {
        EXT4_STATE_DA_ALLOC_CLOSE,      /* Alloc DA blks on close */
        EXT4_STATE_EXT_MIGRATE,         /* Inode is migrating */
        EXT4_STATE_DIO_UNWRITTEN,       /* need convert on dio done*/
+        EXT4_STATE_NEWENTRY,            /* File just added to dir */
 };
-static inline int ext4_test_inode_state(struct inode *inode, int bit)
+#define EXT4_INODE_BIT_FNS(name, field)                                 \
-{
+static inline int ext4_test_inode_##name(struct inode *inode, int bit)  \
-        return test_bit(bit, &EXT4_I(inode)->i_state_flags);
+{                                                                       \
-}
+        return test_bit(bit, &EXT4_I(inode)->i_##field);                \
+}                                                                       \
-static inline void ext4_set_inode_state(struct inode *inode, int bit)
+static inline void ext4_set_inode_##name(struct inode *inode, int bit)  \
-{
+{                                                                       \
-        set_bit(bit, &EXT4_I(inode)->i_state_flags);
+        set_bit(bit, &EXT4_I(inode)->i_##field);                        \
+}                                                                       \
+static inline void ext4_clear_inode_##name(struct inode *inode, int bit) \
+{                                                                       \
+        clear_bit(bit, &EXT4_I(inode)->i_##field);                      \
 }
-static inline void ext4_clear_inode_state(struct inode *inode, int bit)
+EXT4_INODE_BIT_FNS(flag, flags)
-{
+EXT4_INODE_BIT_FNS(state, state_flags)
-        clear_bit(bit, &EXT4_I(inode)->i_state_flags);
-}
 #else
 /* Assume that user mode programs are passing in an ext4fs superblock, not
 * a kernel struct super_block.  This will allow us to call the feature-test
@@ -1264,7 +1385,7 @@ struct ext4_dir_entry_2 {
 #define is_dx(dir) (EXT4_HAS_COMPAT_FEATURE(dir->i_sb, \
                                      EXT4_FEATURE_COMPAT_DIR_INDEX) && \
-                      (EXT4_I(dir)->i_flags & EXT4_INDEX_FL))
+                    ext4_test_inode_flag((dir), EXT4_INODE_INDEX))
 #define EXT4_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT4_LINK_MAX)
 #define EXT4_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1)
@@ -1398,7 +1519,7 @@ extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
 extern void ext4_htree_free_dir_info(struct dir_private_info *p);
 /* fsync.c */
-extern int ext4_sync_file(struct file *, struct dentry *, int);
+extern int ext4_sync_file(struct file *, int);
 /* hash.c */
 extern int ext4fs_dirhash(const char *name, int len, struct
@@ -1678,6 +1799,7 @@ struct ext4_group_info {
        ext4_grpblk_t   bb_first_free;  /* first free block */
        ext4_grpblk_t   bb_free;        /* total free blocks */
        ext4_grpblk_t   bb_fragments;   /* nr of freespace fragments */
+        ext4_grpblk_t   bb_largest_free_order;/* order of largest frag in BG */
        struct          list_head bb_prealloc_list;
 #ifdef DOUBLE_CHECK
        void            *bb_bitmap;
@@ -1772,9 +1894,8 @@ extern int ext4_ext_tree_init(handle_t *handle, struct inode *);
 extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
 extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
                                       int chunk);
-extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
+extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
-                               ext4_lblk_t iblock, unsigned int max_blocks,
+                               struct ext4_map_blocks *map, int flags);
-                               struct buffer_head *bh_result, int flags);
 extern void ext4_ext_truncate(struct inode *);
 extern void ext4_ext_init(struct super_block *);
 extern void ext4_ext_release(struct super_block *);
@@ -1782,6 +1903,8 @@ extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
                          loff_t len);
 extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
                          ssize_t len);
+extern int ext4_map_blocks(handle_t *handle, struct inode *inode,
+                           struct ext4_map_blocks *map, int flags);
 extern int ext4_get_blocks(handle_t *handle, struct inode *inode,
                           sector_t block, unsigned int max_blocks,
                           struct buffer_head *bh, int flags);
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index b79ad5126468..dade0c024797 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -273,7 +273,7 @@ static inline int ext4_should_journal_data(struct inode *inode)
                return 1;
        if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
                return 1;
-        if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL)
+        if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
                return 1;
        return 0;
 }
@@ -284,7 +284,7 @@ static inline int ext4_should_order_data(struct inode *inode)
                return 0;
        if (!S_ISREG(inode->i_mode))
                return 0;
-        if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL)
+        if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
                return 0;
        if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
                return 1;
@@ -297,7 +297,7 @@ static inline int ext4_should_writeback_data(struct inode *inode)
                return 0;
        if (EXT4_JOURNAL(inode) == NULL)
                return 1;
-        if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL)
+        if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
                return 0;
        if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
                return 1;
@@ -321,7 +321,7 @@ static inline int ext4_should_dioread_nolock(struct inode *inode)
                return 0;
        if (!S_ISREG(inode->i_mode))
                return 0;
-        if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+        if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
                return 0;
        if (ext4_should_journal_data(inode))
                return 0;
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 236b834b4ca8..377309c1af65 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -107,11 +107,8 @@ static int ext4_ext_truncate_extend_restart(handle_t *handle,
        if (err <= 0)
                return err;
        err = ext4_truncate_restart_trans(handle, inode, needed);
-        /*
+        if (err == 0)
-         * We have dropped i_data_sem so someone might have cached again
+                err = -EAGAIN;
-         * an extent we are going to truncate.
-         */
-        ext4_ext_invalidate_cache(inode);
        return err;
 }
@@ -185,10 +182,10 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
        if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
                /*
                 * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME
-                 * block groups per flexgroup, reserve the first block 
+                 * block groups per flexgroup, reserve the first block
-                 * group for directories and special files.  Regular 
+                 * group for directories and special files.  Regular
                 * files will start at the second block group.  This
-                 * tends to speed up directory access and improves 
+                 * tends to speed up directory access and improves
                 * fsck times.
                 */
                block_group &= ~(flex_size-1);
@@ -439,10 +436,10 @@ static int __ext4_ext_check(const char *function, struct inode *inode,
        return 0;
 corrupted:
-        __ext4_error(inode->i_sb, function,
+        ext4_error_inode(function, inode,
-                        "bad header/extent in inode #%lu: %s - magic %x, "
+                        "bad header/extent: %s - magic %x, "
                        "entries %u, max %u(%u), depth %u(%u)",
-                        inode->i_ino, error_msg, le16_to_cpu(eh->eh_magic),
+                        error_msg, le16_to_cpu(eh->eh_magic),
                        le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
                        max, le16_to_cpu(eh->eh_depth), depth);
@@ -1622,9 +1619,7 @@ int ext4_ext_try_to_merge(struct inode *inode,
                merge_done = 1;
                WARN_ON(eh->eh_entries == 0);
                if (!eh->eh_entries)
-                        ext4_error(inode->i_sb,
+                        EXT4_ERROR_INODE(inode, "eh->eh_entries = 0!");
-                                   "inode#%lu, eh->eh_entries = 0!",
-                                   inode->i_ino);
        }
        return merge_done;
@@ -2039,7 +2034,7 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
        struct ext4_ext_cache *cex;
        int ret = EXT4_EXT_CACHE_NO;
-        /* 
+        /*
         * We borrow i_block_reservation_lock to protect i_cached_extent
         */
        spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
@@ -2361,7 +2356,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
        int depth = ext_depth(inode);
        struct ext4_ext_path *path;
        handle_t *handle;
-        int i = 0, err = 0;
+        int i, err;
        ext_debug("truncate since %u\n", start);
@@ -2370,23 +2365,26 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
        if (IS_ERR(handle))
                return PTR_ERR(handle);
+again:
        ext4_ext_invalidate_cache(inode);
        /*
         * We start scanning from right side, freeing all the blocks
         * after i_size and walking into the tree depth-wise.
         */
+        depth = ext_depth(inode);
        path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_NOFS);
        if (path == NULL) {
                ext4_journal_stop(handle);
                return -ENOMEM;
        }
+        path[0].p_depth = depth;
        path[0].p_hdr = ext_inode_hdr(inode);
        if (ext4_ext_check(inode, path[0].p_hdr, depth)) {
                err = -EIO;
                goto out;
        }
-        path[0].p_depth = depth;
+        i = err = 0;
        while (i >= 0 && err == 0) {
                if (i == depth) {
@@ -2480,6 +2478,8 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
 out:
        ext4_ext_drop_refs(path);
        kfree(path);
+        if (err == -EAGAIN)
+                goto again;
        ext4_journal_stop(handle);
        return err;
@@ -2544,7 +2544,7 @@ static void bi_complete(struct bio *bio, int error)
 /* FIXME!! we need to try to merge to left or right after zero-out  */
 static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
 {
-        int ret = -EIO;
+        int ret;
        struct bio *bio;
        int blkbits, blocksize;
        sector_t ee_pblock;
@@ -2568,6 +2568,9 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
                        len = ee_len;
                bio = bio_alloc(GFP_NOIO, len);
+                if (!bio)
+                        return -ENOMEM;
                bio->bi_sector = ee_pblock;
                bio->bi_bdev   = inode->i_sb->s_bdev;
@@ -2595,22 +2598,20 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
                submit_bio(WRITE, bio);
                wait_for_completion(&event);
-                if (test_bit(BIO_UPTODATE, &bio->bi_flags))
+                if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
-                        ret = 0;
+                        bio_put(bio);
-                else {
+                        return -EIO;
-                        ret = -EIO;
-                        break;
                }
                bio_put(bio);
                ee_len    -= done;
                ee_pblock += done  << (blkbits - 9);
        }
-        return ret;
+        return 0;
 }
 #define EXT4_EXT_ZERO_LEN 7
 /*
- * This function is called by ext4_ext_get_blocks() if someone tries to write
+ * This function is called by ext4_ext_map_blocks() if someone tries to write
 * to an uninitialized extent. It may result in splitting the uninitialized
 * extent into multiple extents (upto three - one initialized and two
 * uninitialized).
@@ -2620,39 +2621,55 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
 *   c> Splits in three extents: Somone is writing in middle of the extent
 */
 static int ext4_ext_convert_to_initialized(handle_t *handle,
-                                                struct inode *inode,
+                                           struct inode *inode,
-                                                struct ext4_ext_path *path,
+                                           struct ext4_map_blocks *map,
-                                                ext4_lblk_t iblock,
+                                           struct ext4_ext_path *path)
-                                                unsigned int max_blocks)
 {
        struct ext4_extent *ex, newex, orig_ex;
        struct ext4_extent *ex1 = NULL;
        struct ext4_extent *ex2 = NULL;
        struct ext4_extent *ex3 = NULL;
        struct ext4_extent_header *eh;
-        ext4_lblk_t ee_block;
+        ext4_lblk_t ee_block, eof_block;
        unsigned int allocated, ee_len, depth;
        ext4_fsblk_t newblock;
        int err = 0;
        int ret = 0;
+        int may_zeroout;
+        ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical"
+                "block %llu, max_blocks %u\n", inode->i_ino,
+                (unsigned long long)map->m_lblk, map->m_len);
+        eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
+                inode->i_sb->s_blocksize_bits;
+        if (eof_block < map->m_lblk + map->m_len)
+                eof_block = map->m_lblk + map->m_len;
        depth = ext_depth(inode);
        eh = path[depth].p_hdr;
        ex = path[depth].p_ext;
        ee_block = le32_to_cpu(ex->ee_block);
        ee_len = ext4_ext_get_actual_len(ex);
-        allocated = ee_len - (iblock - ee_block);
+        allocated = ee_len - (map->m_lblk - ee_block);
-        newblock = iblock - ee_block + ext_pblock(ex);
+        newblock = map->m_lblk - ee_block + ext_pblock(ex);
        ex2 = ex;
        orig_ex.ee_block = ex->ee_block;
        orig_ex.ee_len   = cpu_to_le16(ee_len);
        ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
+        /*
+         * It is safe to convert extent to initialized via explicit
+         * zeroout only if extent is fully inside i_size or new_size.
+         */
+        may_zeroout = ee_block + ee_len <= eof_block;
        err = ext4_ext_get_access(handle, inode, path + depth);
        if (err)
                goto out;
        /* If extent has less than 2*EXT4_EXT_ZERO_LEN zerout directly */
-        if (ee_len <= 2*EXT4_EXT_ZERO_LEN) {
+        if (ee_len <= 2*EXT4_EXT_ZERO_LEN && may_zeroout) {
                err =  ext4_ext_zeroout(inode, &orig_ex);
                if (err)
                        goto fix_extent_len;
@@ -2665,10 +2682,10 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
                return allocated;
        }
-        /* ex1: ee_block to iblock - 1 : uninitialized */
+        /* ex1: ee_block to map->m_lblk - 1 : uninitialized */
-        if (iblock > ee_block) {
+        if (map->m_lblk > ee_block) {
                ex1 = ex;
-                ex1->ee_len = cpu_to_le16(iblock - ee_block);
+                ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
                ext4_ext_mark_uninitialized(ex1);
                ex2 = &newex;
        }
@@ -2677,15 +2694,15 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
         * we insert ex3, if ex1 is NULL. This is to avoid temporary
         * overlap of blocks.
         */
-        if (!ex1 && allocated > max_blocks)
+        if (!ex1 && allocated > map->m_len)
-                ex2->ee_len = cpu_to_le16(max_blocks);
+                ex2->ee_len = cpu_to_le16(map->m_len);
        /* ex3: to ee_block + ee_len : uninitialised */
-        if (allocated > max_blocks) {
+        if (allocated > map->m_len) {
                unsigned int newdepth;
                /* If extent has less than EXT4_EXT_ZERO_LEN zerout directly */
-                if (allocated <= EXT4_EXT_ZERO_LEN) {
+                if (allocated <= EXT4_EXT_ZERO_LEN && may_zeroout) {
                        /*
-                         * iblock == ee_block is handled by the zerouout
+                         * map->m_lblk == ee_block is handled by the zeroout
                         * at the beginning.
                         * Mark first half uninitialized.
                         * Mark second half initialized and zero out the
@@ -2698,7 +2715,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
                        ext4_ext_dirty(handle, inode, path + depth);
                        ex3 = &newex;
-                        ex3->ee_block = cpu_to_le32(iblock);
+                        ex3->ee_block = cpu_to_le32(map->m_lblk);
                        ext4_ext_store_pblock(ex3, newblock);
                        ex3->ee_len = cpu_to_le16(allocated);
                        err = ext4_ext_insert_extent(handle, inode, path,
@@ -2711,7 +2728,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
                                ex->ee_len   = orig_ex.ee_len;
                                ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
                                ext4_ext_dirty(handle, inode, path + depth);
-                                /* blocks available from iblock */
+                                /* blocks available from map->m_lblk */
                                return allocated;
                        } else if (err)
@@ -2733,8 +2750,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
                                 */
                                depth = ext_depth(inode);
                                ext4_ext_drop_refs(path);
-                                path = ext4_ext_find_extent(inode,
+                                path = ext4_ext_find_extent(inode, map->m_lblk,
-                                                                iblock, path);
+                                                            path);
                                if (IS_ERR(path)) {
                                        err = PTR_ERR(path);
                                        return err;
@@ -2754,12 +2771,12 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
                        return allocated;
                }
                ex3 = &newex;
-                ex3->ee_block = cpu_to_le32(iblock + max_blocks);
+                ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len);
-                ext4_ext_store_pblock(ex3, newblock + max_blocks);
+                ext4_ext_store_pblock(ex3, newblock + map->m_len);
-                ex3->ee_len = cpu_to_le16(allocated - max_blocks);
+                ex3->ee_len = cpu_to_le16(allocated - map->m_len);
                ext4_ext_mark_uninitialized(ex3);
                err = ext4_ext_insert_extent(handle, inode, path, ex3, 0);
-                if (err == -ENOSPC) {
+                if (err == -ENOSPC && may_zeroout) {
                        err =  ext4_ext_zeroout(inode, &orig_ex);
                        if (err)
                                goto fix_extent_len;
@@ -2769,7 +2786,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
                        ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
                        ext4_ext_dirty(handle, inode, path + depth);
                        /* zeroed the full extent */
-                        /* blocks available from iblock */
+                        /* blocks available from map->m_lblk */
                        return allocated;
                } else if (err)
@@ -2783,11 +2800,13 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
                 * update the extent length after successful insert of the
                 * split extent
                 */
-                orig_ex.ee_len = cpu_to_le16(ee_len -
+                ee_len -= ext4_ext_get_actual_len(ex3);
-                                                ext4_ext_get_actual_len(ex3));
+                orig_ex.ee_len = cpu_to_le16(ee_len);
+                may_zeroout = ee_block + ee_len <= eof_block;
                depth = newdepth;
                ext4_ext_drop_refs(path);
-                path = ext4_ext_find_extent(inode, iblock, path);
+                path = ext4_ext_find_extent(inode, map->m_lblk, path);
                if (IS_ERR(path)) {
                        err = PTR_ERR(path);
                        goto out;
@@ -2801,14 +2820,14 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
                if (err)
                        goto out;
-                allocated = max_blocks;
+                allocated = map->m_len;
                /* If extent has less than EXT4_EXT_ZERO_LEN and we are trying
                 * to insert a extent in the middle zerout directly
                 * otherwise give the extent a chance to merge to left
                 */
                if (le16_to_cpu(orig_ex.ee_len) <= EXT4_EXT_ZERO_LEN &&
-                                                        iblock != ee_block) {
+                        map->m_lblk != ee_block && may_zeroout) {
                        err =  ext4_ext_zeroout(inode, &orig_ex);
                        if (err)
                                goto fix_extent_len;
@@ -2818,7 +2837,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
                        ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
                        ext4_ext_dirty(handle, inode, path + depth);
                        /* zero out the first half */
-                        /* blocks available from iblock */
+                        /* blocks available from map->m_lblk */
                        return allocated;
                }
        }
@@ -2829,12 +2848,12 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
         */
        if (ex1 && ex1 != ex) {
                ex1 = ex;
-                ex1->ee_len = cpu_to_le16(iblock - ee_block);
+                ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
                ext4_ext_mark_uninitialized(ex1);
                ex2 = &newex;
        }
-        /* ex2: iblock to iblock + maxblocks-1 : initialised */
+        /* ex2: map->m_lblk to map->m_lblk + map->m_len - 1 : initialised */
-        ex2->ee_block = cpu_to_le32(iblock);
+        ex2->ee_block = cpu_to_le32(map->m_lblk);
        ext4_ext_store_pblock(ex2, newblock);
        ex2->ee_len = cpu_to_le16(allocated);
        if (ex2 != ex)
@@ -2877,7 +2896,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
        goto out;
 insert:
        err = ext4_ext_insert_extent(handle, inode, path, &newex, 0);
-        if (err == -ENOSPC) {
+        if (err == -ENOSPC && may_zeroout) {
                err =  ext4_ext_zeroout(inode, &orig_ex);
                if (err)
                        goto fix_extent_len;
@@ -2904,7 +2923,7 @@ fix_extent_len:
 }
 /*
- * This function is called by ext4_ext_get_blocks() from
+ * This function is called by ext4_ext_map_blocks() from
 * ext4_get_blocks_dio_write() when DIO to write
 * to an uninitialized extent.
 *
@@ -2927,9 +2946,8 @@ fix_extent_len:
 */
 static int ext4_split_unwritten_extents(handle_t *handle,
                                        struct inode *inode,
+                                        struct ext4_map_blocks *map,
                                        struct ext4_ext_path *path,
-                                        ext4_lblk_t iblock,
-                                        unsigned int max_blocks,
                                        int flags)
 {
        struct ext4_extent *ex, newex, orig_ex;
@@ -2937,41 +2955,55 @@ static int ext4_split_unwritten_extents(handle_t *handle,
        struct ext4_extent *ex2 = NULL;
        struct ext4_extent *ex3 = NULL;
        struct ext4_extent_header *eh;
-        ext4_lblk_t ee_block;
+        ext4_lblk_t ee_block, eof_block;
        unsigned int allocated, ee_len, depth;
        ext4_fsblk_t newblock;
        int err = 0;
+        int may_zeroout;
+        ext_debug("ext4_split_unwritten_extents: inode %lu, logical"
+                "block %llu, max_blocks %u\n", inode->i_ino,
+                (unsigned long long)map->m_lblk, map->m_len);
+        eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
+                inode->i_sb->s_blocksize_bits;
+        if (eof_block < map->m_lblk + map->m_len)
+                eof_block = map->m_lblk + map->m_len;
-        ext_debug("ext4_split_unwritten_extents: inode %lu,"
-                  "iblock %llu, max_blocks %u\n", inode->i_ino,
-                  (unsigned long long)iblock, max_blocks);
        depth = ext_depth(inode);
        eh = path[depth].p_hdr;
        ex = path[depth].p_ext;
        ee_block = le32_to_cpu(ex->ee_block);
        ee_len = ext4_ext_get_actual_len(ex);
-        allocated = ee_len - (iblock - ee_block);
+        allocated = ee_len - (map->m_lblk - ee_block);
-        newblock = iblock - ee_block + ext_pblock(ex);
+        newblock = map->m_lblk - ee_block + ext_pblock(ex);
        ex2 = ex;
        orig_ex.ee_block = ex->ee_block;
        orig_ex.ee_len   = cpu_to_le16(ee_len);
        ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
        /*
+         * It is safe to convert extent to initialized via explicit
+         * zeroout only if extent is fully inside i_size or new_size.
+         */
+        may_zeroout = ee_block + ee_len <= eof_block;
+        /*
         * If the uninitialized extent begins at the same logical
         * block where the write begins, and the write completely
         * covers the extent, then we don't need to split it.
         */
-        if ((iblock == ee_block) && (allocated <= max_blocks))
+        if ((map->m_lblk == ee_block) && (allocated <= map->m_len))
                return allocated;
        err = ext4_ext_get_access(handle, inode, path + depth);
        if (err)
                goto out;
-        /* ex1: ee_block to iblock - 1 : uninitialized */
+        /* ex1: ee_block to map->m_lblk - 1 : uninitialized */
-        if (iblock > ee_block) {
+        if (map->m_lblk > ee_block) {
                ex1 = ex;
-                ex1->ee_len = cpu_to_le16(iblock - ee_block);
+                ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
                ext4_ext_mark_uninitialized(ex1);
                ex2 = &newex;
        }
@@ -2980,18 +3012,18 @@ static int ext4_split_unwritten_extents(handle_t *handle,
         * we insert ex3, if ex1 is NULL. This is to avoid temporary
         * overlap of blocks.
         */
-        if (!ex1 && allocated > max_blocks)
+        if (!ex1 && allocated > map->m_len)
-                ex2->ee_len = cpu_to_le16(max_blocks);
+                ex2->ee_len = cpu_to_le16(map->m_len);
        /* ex3: to ee_block + ee_len : uninitialised */
-        if (allocated > max_blocks) {
+        if (allocated > map->m_len) {
                unsigned int newdepth;
                ex3 = &newex;
-                ex3->ee_block = cpu_to_le32(iblock + max_blocks);
+                ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len);
-                ext4_ext_store_pblock(ex3, newblock + max_blocks);
+                ext4_ext_store_pblock(ex3, newblock + map->m_len);
-                ex3->ee_len = cpu_to_le16(allocated - max_blocks);
+                ex3->ee_len = cpu_to_le16(allocated - map->m_len);
                ext4_ext_mark_uninitialized(ex3);
                err = ext4_ext_insert_extent(handle, inode, path, ex3, flags);
-                if (err == -ENOSPC) {
+                if (err == -ENOSPC && may_zeroout) {
                        err =  ext4_ext_zeroout(inode, &orig_ex);
                        if (err)
                                goto fix_extent_len;
@@ -3001,7 +3033,7 @@ static int ext4_split_unwritten_extents(handle_t *handle,
                        ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
                        ext4_ext_dirty(handle, inode, path + depth);
                        /* zeroed the full extent */
-                        /* blocks available from iblock */
+                        /* blocks available from map->m_lblk */
                        return allocated;
                } else if (err)
@@ -3015,11 +3047,13 @@ static int ext4_split_unwritten_extents(handle_t *handle,
                 * update the extent length after successful insert of the
                 * split extent
                 */
-                orig_ex.ee_len = cpu_to_le16(ee_len -
+                ee_len -= ext4_ext_get_actual_len(ex3);
-                                                ext4_ext_get_actual_len(ex3));
+                orig_ex.ee_len = cpu_to_le16(ee_len);
+                may_zeroout = ee_block + ee_len <= eof_block;
                depth = newdepth;
                ext4_ext_drop_refs(path);
-                path = ext4_ext_find_extent(inode, iblock, path);
+                path = ext4_ext_find_extent(inode, map->m_lblk, path);
                if (IS_ERR(path)) {
                        err = PTR_ERR(path);
                        goto out;
@@ -3033,7 +3067,7 @@ static int ext4_split_unwritten_extents(handle_t *handle,
                if (err)
                        goto out;
-                allocated = max_blocks;
+                allocated = map->m_len;
        }
        /*
         * If there was a change of depth as part of the
@@ -3042,15 +3076,15 @@ static int ext4_split_unwritten_extents(handle_t *handle,
         */
        if (ex1 && ex1 != ex) {
                ex1 = ex;
-                ex1->ee_len = cpu_to_le16(iblock - ee_block);
+                ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
                ext4_ext_mark_uninitialized(ex1);
                ex2 = &newex;
        }
        /*
-         * ex2: iblock to iblock + maxblocks-1 : to be direct IO written,
+         * ex2: map->m_lblk to map->m_lblk + map->m_len-1 : to be written
-         * uninitialised still.
+         * using direct I/O, uninitialised still.
         */
-        ex2->ee_block = cpu_to_le32(iblock);
+        ex2->ee_block = cpu_to_le32(map->m_lblk);
        ext4_ext_store_pblock(ex2, newblock);
        ex2->ee_len = cpu_to_le16(allocated);
        ext4_ext_mark_uninitialized(ex2);
@@ -3062,7 +3096,7 @@ static int ext4_split_unwritten_extents(handle_t *handle,
        goto out;
 insert:
        err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
-        if (err == -ENOSPC) {
+        if (err == -ENOSPC && may_zeroout) {
                err =  ext4_ext_zeroout(inode, &orig_ex);
                if (err)
                        goto fix_extent_len;
@@ -3152,10 +3186,9 @@ static void unmap_underlying_metadata_blocks(struct block_device *bdev,
 static int
 ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
-                        ext4_lblk_t iblock, unsigned int max_blocks,
+                        struct ext4_map_blocks *map,
                        struct ext4_ext_path *path, int flags,
-                        unsigned int allocated, struct buffer_head *bh_result,
+                        unsigned int allocated, ext4_fsblk_t newblock)
-                        ext4_fsblk_t newblock)
 {
        int ret = 0;
        int err = 0;
@@ -3163,15 +3196,14 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
        ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical"
                  "block %llu, max_blocks %u, flags %d, allocated %u",
-                  inode->i_ino, (unsigned long long)iblock, max_blocks,
+                  inode->i_ino, (unsigned long long)map->m_lblk, map->m_len,
                  flags, allocated);
        ext4_ext_show_leaf(inode, path);
        /* get_block() before submit the IO, split the extent */
        if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
-                ret = ext4_split_unwritten_extents(handle,
+                ret = ext4_split_unwritten_extents(handle, inode, map,
-                                                inode, path, iblock,
+                                                   path, flags);
-                                                max_blocks, flags);
                /*
                 * Flag the inode(non aio case) or end_io struct (aio case)
                 * that this IO needs to convertion to written when IO is
@@ -3182,7 +3214,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
                else
                        ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
                if (ext4_should_dioread_nolock(inode))
-                        set_buffer_uninit(bh_result);
+                        map->m_flags |= EXT4_MAP_UNINIT;
                goto out;
        }
        /* IO end_io complete, convert the filled extent to written */
@@ -3210,14 +3242,12 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
                 * the buffer head will be unmapped so that
                 * a read from the block returns 0s.
                 */
-                set_buffer_unwritten(bh_result);
+                map->m_flags |= EXT4_MAP_UNWRITTEN;
                goto out1;
        }
        /* buffered write, writepage time, convert*/
-        ret = ext4_ext_convert_to_initialized(handle, inode,
+        ret = ext4_ext_convert_to_initialized(handle, inode, map, path);
-                                                path, iblock,
-                                                max_blocks);
        if (ret >= 0)
                ext4_update_inode_fsync_trans(handle, inode, 1);
 out:
@@ -3226,7 +3256,7 @@ out:
                goto out2;
        } else
                allocated = ret;
-        set_buffer_new(bh_result);
+        map->m_flags |= EXT4_MAP_NEW;
        /*
         * if we allocated more blocks than requested
         * we need to make sure we unmap the extra block
@@ -3234,11 +3264,11 @@ out:
         * unmapped later when we find the buffer_head marked
         * new.
         */
-        if (allocated > max_blocks) {
+        if (allocated > map->m_len) {
                unmap_underlying_metadata_blocks(inode->i_sb->s_bdev,
-                                        newblock + max_blocks,
+                                        newblock + map->m_len,
-                                        allocated - max_blocks);
+                                        allocated - map->m_len);
-                allocated = max_blocks;
+                allocated = map->m_len;
        }
        /*
@@ -3252,13 +3282,13 @@ out:
                ext4_da_update_reserve_space(inode, allocated, 0);
 map_out:
-        set_buffer_mapped(bh_result);
+        map->m_flags |= EXT4_MAP_MAPPED;
 out1:
-        if (allocated > max_blocks)
+        if (allocated > map->m_len)
-                allocated = max_blocks;
+                allocated = map->m_len;
        ext4_ext_show_leaf(inode, path);
-        bh_result->b_bdev = inode->i_sb->s_bdev;
+        map->m_pblk = newblock;
-        bh_result->b_blocknr = newblock;
+        map->m_len = allocated;
 out2:
        if (path) {
                ext4_ext_drop_refs(path);
@@ -3284,26 +3314,23 @@ out2:
 *
 * return < 0, error case.
 */
-int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
+int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
-                        ext4_lblk_t iblock,
+                        struct ext4_map_blocks *map, int flags)
-                        unsigned int max_blocks, struct buffer_head *bh_result,
-                        int flags)
 {
        struct ext4_ext_path *path = NULL;
        struct ext4_extent_header *eh;
        struct ext4_extent newex, *ex, *last_ex;
        ext4_fsblk_t newblock;
-        int err = 0, depth, ret, cache_type;
+        int i, err = 0, depth, ret, cache_type;
        unsigned int allocated = 0;
        struct ext4_allocation_request ar;
        ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
-        __clear_bit(BH_New, &bh_result->b_state);
        ext_debug("blocks %u/%u requested for inode %lu\n",
-                        iblock, max_blocks, inode->i_ino);
+                  map->m_lblk, map->m_len, inode->i_ino);
        /* check in cache */
-        cache_type = ext4_ext_in_cache(inode, iblock, &newex);
+        cache_type = ext4_ext_in_cache(inode, map->m_lblk, &newex);
        if (cache_type) {
                if (cache_type == EXT4_EXT_CACHE_GAP) {
                        if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
@@ -3316,12 +3343,12 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
                        /* we should allocate requested block */
                } else if (cache_type == EXT4_EXT_CACHE_EXTENT) {
                        /* block is already allocated */
-                        newblock = iblock
+                        newblock = map->m_lblk
                                   - le32_to_cpu(newex.ee_block)
                                   + ext_pblock(&newex);
                        /* number of remaining blocks in the extent */
                        allocated = ext4_ext_get_actual_len(&newex) -
-                                        (iblock - le32_to_cpu(newex.ee_block));
+                                (map->m_lblk - le32_to_cpu(newex.ee_block));
                        goto out;
                } else {
                        BUG();
@@ -3329,7 +3356,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
        }
        /* find extent for this block */
-        path = ext4_ext_find_extent(inode, iblock, NULL);
+        path = ext4_ext_find_extent(inode, map->m_lblk, NULL);
        if (IS_ERR(path)) {
                err = PTR_ERR(path);
                path = NULL;
@@ -3345,8 +3372,9 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
         */
        if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
                EXT4_ERROR_INODE(inode, "bad extent address "
-                                 "iblock: %d, depth: %d pblock %lld",
+                                 "lblock: %lu, depth: %d pblock %lld",
-                                 iblock, depth, path[depth].p_block);
+                                 (unsigned long) map->m_lblk, depth,
+                                 path[depth].p_block);
                err = -EIO;
                goto out2;
        }
@@ -3364,12 +3392,12 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
                 */
                ee_len = ext4_ext_get_actual_len(ex);
                /* if found extent covers block, simply return it */
-                if (in_range(iblock, ee_block, ee_len)) {
+                if (in_range(map->m_lblk, ee_block, ee_len)) {
-                        newblock = iblock - ee_block + ee_start;
+                        newblock = map->m_lblk - ee_block + ee_start;
                        /* number of remaining blocks in the extent */
-                        allocated = ee_len - (iblock - ee_block);
+                        allocated = ee_len - (map->m_lblk - ee_block);
-                        ext_debug("%u fit into %u:%d -> %llu\n", iblock,
+                        ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
-                                        ee_block, ee_len, newblock);
+                                  ee_block, ee_len, newblock);
                        /* Do not put uninitialized extent in the cache */
                        if (!ext4_ext_is_uninitialized(ex)) {
@@ -3379,8 +3407,8 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
                                goto out;
                        }
                        ret = ext4_ext_handle_uninitialized_extents(handle,
-                                        inode, iblock, max_blocks, path,
+                                        inode, map, path, flags, allocated,
-                                        flags, allocated, bh_result, newblock);
+                                        newblock);
                        return ret;
                }
        }
@@ -3394,7 +3422,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
                 * put just found gap into cache to speed up
                 * subsequent requests
                 */
-                ext4_ext_put_gap_in_cache(inode, path, iblock);
+                ext4_ext_put_gap_in_cache(inode, path, map->m_lblk);
                goto out2;
        }
        /*
@@ -3402,11 +3430,11 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
         */
        /* find neighbour allocated blocks */
-        ar.lleft = iblock;
+        ar.lleft = map->m_lblk;
        err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft);
        if (err)
                goto out2;
-        ar.lright = iblock;
+        ar.lright = map->m_lblk;
        err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright);
        if (err)
                goto out2;
@@ -3417,26 +3445,26 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
         * EXT_INIT_MAX_LEN and for an uninitialized extent this limit is
         * EXT_UNINIT_MAX_LEN.
         */
-        if (max_blocks > EXT_INIT_MAX_LEN &&
+        if (map->m_len > EXT_INIT_MAX_LEN &&
            !(flags & EXT4_GET_BLOCKS_UNINIT_EXT))
-                max_blocks = EXT_INIT_MAX_LEN;
+                map->m_len = EXT_INIT_MAX_LEN;
-        else if (max_blocks > EXT_UNINIT_MAX_LEN &&
+        else if (map->m_len > EXT_UNINIT_MAX_LEN &&
                 (flags & EXT4_GET_BLOCKS_UNINIT_EXT))
-                max_blocks = EXT_UNINIT_MAX_LEN;
+                map->m_len = EXT_UNINIT_MAX_LEN;
-        /* Check if we can really insert (iblock)::(iblock+max_blocks) extent */
+        /* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */
-        newex.ee_block = cpu_to_le32(iblock);
+        newex.ee_block = cpu_to_le32(map->m_lblk);
-        newex.ee_len = cpu_to_le16(max_blocks);
+        newex.ee_len = cpu_to_le16(map->m_len);
        err = ext4_ext_check_overlap(inode, &newex, path);
        if (err)
                allocated = ext4_ext_get_actual_len(&newex);
        else
-                allocated = max_blocks;
+                allocated = map->m_len;
        /* allocate new block */
        ar.inode = inode;
-        ar.goal = ext4_ext_find_goal(inode, path, iblock);
+        ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk);
-        ar.logical = iblock;
+        ar.logical = map->m_lblk;
        ar.len = allocated;
        if (S_ISREG(inode->i_mode))
                ar.flags = EXT4_MB_HINT_DATA;
@@ -3470,21 +3498,33 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
                                                     EXT4_STATE_DIO_UNWRITTEN);
                }
                if (ext4_should_dioread_nolock(inode))
-                        set_buffer_uninit(bh_result);
+                        map->m_flags |= EXT4_MAP_UNINIT;
        }
-        if (unlikely(EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL)) {
+        if (unlikely(ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))) {
                if (unlikely(!eh->eh_entries)) {
                        EXT4_ERROR_INODE(inode,
-                                         "eh->eh_entries == 0 ee_block %d",
+                                         "eh->eh_entries == 0 and "
-                                         ex->ee_block);
+                                         "EOFBLOCKS_FL set");
                        err = -EIO;
                        goto out2;
                }
                last_ex = EXT_LAST_EXTENT(eh);
-                if (iblock + ar.len > le32_to_cpu(last_ex->ee_block)
+                /*
-                    + ext4_ext_get_actual_len(last_ex))
+                 * If the current leaf block was reached by looking at
-                        EXT4_I(inode)->i_flags &= ~EXT4_EOFBLOCKS_FL;
+                 * the last index block all the way down the tree, and
+                 * we are extending the inode beyond the last extent
+                 * in the current leaf block, then clear the
+                 * EOFBLOCKS_FL flag.
+                 */
+                for (i = depth-1; i >= 0; i--) {
+                        if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
+                                break;
+                }
+                if ((i < 0) &&
+                    (map->m_lblk + ar.len > le32_to_cpu(last_ex->ee_block) +
+                     ext4_ext_get_actual_len(last_ex)))
+                        ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
        }
        err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
        if (err) {
@@ -3500,9 +3540,9 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
        /* previous routine could use block we allocated */
        newblock = ext_pblock(&newex);
        allocated = ext4_ext_get_actual_len(&newex);
-        if (allocated > max_blocks)
+        if (allocated > map->m_len)
-                allocated = max_blocks;
+                allocated = map->m_len;
-        set_buffer_new(bh_result);
+        map->m_flags |= EXT4_MAP_NEW;
        /*
         * Update reserved blocks/metadata blocks after successful
@@ -3516,18 +3556,18 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
         * when it is _not_ an uninitialized extent.
         */
        if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) {
-                ext4_ext_put_in_cache(inode, iblock, allocated, newblock,
+                ext4_ext_put_in_cache(inode, map->m_lblk, allocated, newblock,
                                                EXT4_EXT_CACHE_EXTENT);
                ext4_update_inode_fsync_trans(handle, inode, 1);
        } else
                ext4_update_inode_fsync_trans(handle, inode, 0);
 out:
-        if (allocated > max_blocks)
+        if (allocated > map->m_len)
-                allocated = max_blocks;
+                allocated = map->m_len;
        ext4_ext_show_leaf(inode, path);
-        set_buffer_mapped(bh_result);
+        map->m_flags |= EXT4_MAP_MAPPED;
-        bh_result->b_bdev = inode->i_sb->s_bdev;
+        map->m_pblk = newblock;
-        bh_result->b_blocknr = newblock;
+        map->m_len = allocated;
 out2:
        if (path) {
                ext4_ext_drop_refs(path);
@@ -3625,7 +3665,7 @@ static void ext4_falloc_update_inode(struct inode *inode,
                 * can proceed even if the new size is the same as i_size.
                 */
                if (new_size > i_size_read(inode))
-                        EXT4_I(inode)->i_flags |= EXT4_EOFBLOCKS_FL;
+                        ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
        }
 }
@@ -3640,55 +3680,57 @@ static void ext4_falloc_update_inode(struct inode *inode,
 long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
 {
        handle_t *handle;
-        ext4_lblk_t block;
        loff_t new_size;
        unsigned int max_blocks;
        int ret = 0;
        int ret2 = 0;
        int retries = 0;
-        struct buffer_head map_bh;
+        struct ext4_map_blocks map;
        unsigned int credits, blkbits = inode->i_blkbits;
        /*
         * currently supporting (pre)allocate mode for extent-based
         * files _only_
         */
-        if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+        if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
                return -EOPNOTSUPP;
        /* preallocation to directories is currently not supported */
        if (S_ISDIR(inode->i_mode))
                return -ENODEV;
-        block = offset >> blkbits;
+        map.m_lblk = offset >> blkbits;
        /*
         * We can't just convert len to max_blocks because
         * If blocksize = 4096 offset = 3072 and len = 2048
         */
        max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
-                                                        - block;
+                - map.m_lblk;
        /*
         * credits to insert 1 extent into extent tree
         */
        credits = ext4_chunk_trans_blocks(inode, max_blocks);
        mutex_lock(&inode->i_mutex);
+        ret = inode_newsize_ok(inode, (len + offset));
+        if (ret) {
+                mutex_unlock(&inode->i_mutex);
+                return ret;
+        }
 retry:
        while (ret >= 0 && ret < max_blocks) {
-                block = block + ret;
+                map.m_lblk = map.m_lblk + ret;
-                max_blocks = max_blocks - ret;
+                map.m_len = max_blocks = max_blocks - ret;
                handle = ext4_journal_start(inode, credits);
                if (IS_ERR(handle)) {
                        ret = PTR_ERR(handle);
                        break;
                }
-                map_bh.b_state = 0;
+                ret = ext4_map_blocks(handle, inode, &map,
-                ret = ext4_get_blocks(handle, inode, block,
-                                      max_blocks, &map_bh,
                                      EXT4_GET_BLOCKS_CREATE_UNINIT_EXT);
                if (ret <= 0) {
 #ifdef EXT4FS_DEBUG
                        WARN_ON(ret <= 0);
-                        printk(KERN_ERR "%s: ext4_ext_get_blocks "
+                        printk(KERN_ERR "%s: ext4_ext_map_blocks "
                                    "returned error inode#%lu, block=%u, "
                                    "max_blocks=%u", __func__,
                                    inode->i_ino, block, max_blocks);
@@ -3697,14 +3739,14 @@ retry:
                        ret2 = ext4_journal_stop(handle);
                        break;
                }
-                if ((block + ret) >= (EXT4_BLOCK_ALIGN(offset + len,
+                if ((map.m_lblk + ret) >= (EXT4_BLOCK_ALIGN(offset + len,
                                                blkbits) >> blkbits))
                        new_size = offset + len;
                else
-                        new_size = (block + ret) << blkbits;
+                        new_size = (map.m_lblk + ret) << blkbits;
                ext4_falloc_update_inode(inode, mode, new_size,
-                                                buffer_new(&map_bh));
+                                         (map.m_flags & EXT4_MAP_NEW));
                ext4_mark_inode_dirty(handle, inode);
                ret2 = ext4_journal_stop(handle);
                if (ret2)
@@ -3733,42 +3775,39 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
                                    ssize_t len)
 {
        handle_t *handle;
-        ext4_lblk_t block;
        unsigned int max_blocks;
        int ret = 0;
        int ret2 = 0;
-        struct buffer_head map_bh;
+        struct ext4_map_blocks map;
        unsigned int credits, blkbits = inode->i_blkbits;
-        block = offset >> blkbits;
+        map.m_lblk = offset >> blkbits;
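        /*
         * We can't just convert len to max_blocks, because the range may
         * straddle a block boundary: e.g. if blocksize = 4096, offset = 3072
         * and len = 2048, two blocks are touched although len < blocksize.
         */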
-        max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
+        max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) -
-                                                        - block;
+                      map.m_lblk);
        /*
         * credits to insert 1 extent into extent tree
         */
        credits = ext4_chunk_trans_blocks(inode, max_blocks);
        while (ret >= 0 && ret < max_blocks) {
-                block = block + ret;
+                map.m_lblk += ret;
-                max_blocks = max_blocks - ret;
+                map.m_len = (max_blocks -= ret);
                handle = ext4_journal_start(inode, credits);
                if (IS_ERR(handle)) {
                        ret = PTR_ERR(handle);
                        break;
                }
-                map_bh.b_state = 0;
+                ret = ext4_map_blocks(handle, inode, &map,
-                ret = ext4_get_blocks(handle, inode, block,
-                                      max_blocks, &map_bh,
                                      EXT4_GET_BLOCKS_IO_CONVERT_EXT);
                if (ret <= 0) {
                        WARN_ON(ret <= 0);
-                        printk(KERN_ERR "%s: ext4_ext_get_blocks "
+                        printk(KERN_ERR "%s: ext4_ext_map_blocks "
                                    "returned error inode#%lu, block=%u, "
                                    "max_blocks=%u", __func__,
-                                    inode->i_ino, block, max_blocks);
+                                    inode->i_ino, map.m_lblk, map.m_len);
                }
                ext4_mark_inode_dirty(handle, inode);
                ret2 = ext4_journal_stop(handle);
@@ -3898,7 +3937,7 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
        int error = 0;
        /* fallback to generic here if not in extents fmt */
-        if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+        if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
                return generic_block_fiemap(inode, fieinfo, start, len,
                        ext4_get_block);
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index d0776e410f34..5313ae4cda2d 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -66,7 +66,7 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
         * is smaller than s_maxbytes, which is for extent-mapped files.
         */
-        if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) {
+        if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
                struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
                size_t length = iov_length(iov, nr_segs);
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index ef3d980e67cb..592adf2e546e 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -35,6 +35,29 @@
 #include <trace/events/ext4.h>
 /*
+ * If we're not journaling and this is a just-created file, we have to
+ * sync our parent directory (if it was freshly created) since
+ * otherwise it will only be written by writeback, leaving a huge
+ * window during which a crash may lose the file.  This may apply for
+ * the parent directory's parent as well, and so on recursively, if
+ * they are also freshly created.
+ */
+static void ext4_sync_parent(struct inode *inode)
+{
+        struct dentry *dentry;
+        /*
+         * Walk up the directory chain for as long as the current inode is
+         * still flagged EXT4_STATE_NEWENTRY.  The flag is cleared on each
+         * inode before moving on to its parent.
+         */
+        while (inode && ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) {
+                ext4_clear_inode_state(inode, EXT4_STATE_NEWENTRY);
+                /*
+                 * list_entry() is plain pointer arithmetic and never yields
+                 * NULL, so an empty alias list has to be detected on the
+                 * list head itself before the first entry is dereferenced.
+                 */
+                if (list_empty(&inode->i_dentry))
+                        break;
+                dentry = list_entry(inode->i_dentry.next,
+                                    struct dentry, d_alias);
+                /* Stop at a dentry without a parent that has an inode. */
+                if (!dentry->d_parent || !dentry->d_parent->d_inode)
+                        break;
+                inode = dentry->d_parent->d_inode;
+                /* Sync the buffers attached to the parent's mapping. */
+                sync_mapping_buffers(inode->i_mapping);
+        }
+}
+/*
 * akpm: A new design for ext4_sync_file().
 *
 * This is only called from sys_fsync(), sys_fdatasync() and sys_msync().
@@ -48,9 +71,9 @@
 * i_mutex lock is held when entering and exiting this function
 */
-int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
+int ext4_sync_file(struct file *file, int datasync)
 {
-        struct inode *inode = dentry->d_inode;
+        struct inode *inode = file->f_mapping->host;
        struct ext4_inode_info *ei = EXT4_I(inode);
        journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
        int ret;
@@ -58,7 +81,7 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
        J_ASSERT(ext4_journal_current_handle() == NULL);
-        trace_ext4_sync_file(file, dentry, datasync);
+        trace_ext4_sync_file(file, datasync);
        if (inode->i_sb->s_flags & MS_RDONLY)
                return 0;
@@ -66,9 +89,13 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
        ret = flush_completed_IO(inode);
        if (ret < 0)
                return ret;
-        
-        if (!journal)
+        if (!journal) {
-                return simple_fsync(file, dentry, datasync);
+                ret = generic_file_fsync(file, datasync);
+                if (!ret && !list_empty(&inode->i_dentry))
+                        ext4_sync_parent(inode);
+                return ret;
+        }
        /*
         * data=writeback,ordered:
@@ -102,7 +129,7 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
                    (journal->j_flags & JBD2_BARRIER))
                        blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL,
                                        NULL, BLKDEV_IFL_WAIT);
-                jbd2_log_wait_commit(journal, commit_tid);
+                ret = jbd2_log_wait_commit(journal, commit_tid);
        } else if (journal->j_flags & JBD2_BARRIER)
                blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL,
                        BLKDEV_IFL_WAIT);
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 1a0e183a2f04..25c4b3173fd9 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -240,56 +240,49 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
        if (fatal)
                goto error_return;
-        /* Ok, now we can actually update the inode bitmaps.. */
+        fatal = -ESRCH;
-        cleared = ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group),
+        gdp = ext4_get_group_desc(sb, block_group, &bh2);
-                                        bit, bitmap_bh->b_data);
+        if (gdp) {
-        if (!cleared)
-                ext4_error(sb, "bit already cleared for inode %lu", ino);
-        else {
-                gdp = ext4_get_group_desc(sb, block_group, &bh2);
                BUFFER_TRACE(bh2, "get_write_access");
                fatal = ext4_journal_get_write_access(handle, bh2);
-                if (fatal) goto error_return;
+        }
+        ext4_lock_group(sb, block_group);
-                if (gdp) {
+        cleared = ext4_clear_bit(bit, bitmap_bh->b_data);
-                        ext4_lock_group(sb, block_group);
+        if (fatal || !cleared) {
-                        count = ext4_free_inodes_count(sb, gdp) + 1;
+                ext4_unlock_group(sb, block_group);
-                        ext4_free_inodes_set(sb, gdp, count);
+                goto out;
-                        if (is_directory) {
+        }
-                                count = ext4_used_dirs_count(sb, gdp) - 1;
-                                ext4_used_dirs_set(sb, gdp, count);
-                                if (sbi->s_log_groups_per_flex) {
-                                        ext4_group_t f;
-                                        f = ext4_flex_group(sbi, block_group);
-                                        atomic_dec(&sbi->s_flex_groups[f].used_dirs);
-                                }
-                        }
+        count = ext4_free_inodes_count(sb, gdp) + 1;
-                        gdp->bg_checksum = ext4_group_desc_csum(sbi,
+        ext4_free_inodes_set(sb, gdp, count);
-                                                        block_group, gdp);
+        if (is_directory) {
-                        ext4_unlock_group(sb, block_group);
+                count = ext4_used_dirs_count(sb, gdp) - 1;
-                        percpu_counter_inc(&sbi->s_freeinodes_counter);
+                ext4_used_dirs_set(sb, gdp, count);
-                        if (is_directory)
+                percpu_counter_dec(&sbi->s_dirs_counter);
-                                percpu_counter_dec(&sbi->s_dirs_counter);
-                        if (sbi->s_log_groups_per_flex) {
-                                ext4_group_t f;
-                                f = ext4_flex_group(sbi, block_group);
-                                atomic_inc(&sbi->s_flex_groups[f].free_inodes);
-                        }
-                }
-                BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
-                err = ext4_handle_dirty_metadata(handle, NULL, bh2);
-                if (!fatal) fatal = err;
        }
-        BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata");
+        gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp);
-        err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
+        ext4_unlock_group(sb, block_group);
-        if (!fatal)
-                fatal = err;
+        percpu_counter_inc(&sbi->s_freeinodes_counter);
-        sb->s_dirt = 1;
+        if (sbi->s_log_groups_per_flex) {
+                ext4_group_t f = ext4_flex_group(sbi, block_group);
+                atomic_inc(&sbi->s_flex_groups[f].free_inodes);
+                if (is_directory)
+                        atomic_dec(&sbi->s_flex_groups[f].used_dirs);
+        }
+        BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
+        fatal = ext4_handle_dirty_metadata(handle, NULL, bh2);
+out:
+        if (cleared) {
+                BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata");
+                err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
+                if (!fatal)
+                        fatal = err;
+                sb->s_dirt = 1;
+        } else
+                ext4_error(sb, "bit already cleared for inode %lu", ino);
 error_return:
        brelse(bitmap_bh);
        ext4_std_error(sb, fatal);
@@ -499,7 +492,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
        if (S_ISDIR(mode) &&
            ((parent == sb->s_root->d_inode) ||
-             (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL))) {
+             (ext4_test_inode_flag(parent, EXT4_INODE_TOPDIR)))) {
                int best_ndir = inodes_per_group;
                int ret = -1;
@@ -1041,7 +1034,7 @@ got:
        if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
                /* set extent flag only for directory, file and normal symlink*/
                if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) {
-                        EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL;
+                        ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS);
                        ext4_ext_tree_init(handle, inode);
                }
        }
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 3e0f6af9d08d..19df61c321fd 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -149,7 +149,7 @@ int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
        int ret;
        /*
-         * Drop i_data_sem to avoid deadlock with ext4_get_blocks At this
+         * Drop i_data_sem to avoid deadlock with ext4_map_blocks.  At this
         * moment, get_block can be called only for blocks inside i_size since
         * page cache has been already dropped and writes are blocked by
         * i_mutex. So we can safely drop the i_data_sem here.
@@ -348,9 +348,8 @@ static int __ext4_check_blockref(const char *function, struct inode *inode,
                if (blk &&
                    unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb),
                                                    blk, 1))) {
-                        __ext4_error(inode->i_sb, function,
+                        ext4_error_inode(function, inode,
-                                   "invalid block reference %u "
+                                         "invalid block reference %u", blk);
-                                   "in inode #%lu", blk, inode->i_ino);
                        return -EIO;
                }
        }
@@ -785,7 +784,7 @@ failed:
        /* Allocation failed, free what we already allocated */
        ext4_free_blocks(handle, inode, 0, new_blocks[0], 1, 0);
        for (i = 1; i <= n ; i++) {
-                /* 
+                /*
                 * branch[i].bh is newly allocated, so there is no
                 * need to revoke the block, which is why we don't
                 * need to set EXT4_FREE_BLOCKS_METADATA.
@@ -875,7 +874,7 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode,
 err_out:
        for (i = 1; i <= num; i++) {
-                /* 
+                /*
                 * branch[i].bh is newly allocated, so there is no
                 * need to revoke the block, which is why we don't
                 * need to set EXT4_FREE_BLOCKS_METADATA.
@@ -890,9 +889,9 @@ err_out:
 }
 /*
- * The ext4_ind_get_blocks() function handles non-extents inodes
+ * The ext4_ind_map_blocks() function handles non-extents inodes
 * (i.e., using the traditional indirect/double-indirect i_blocks
- * scheme) for ext4_get_blocks().
+ * scheme) for ext4_map_blocks().
 *
 * Allocation strategy is simple: if we have to allocate something, we will
 * have to go the whole way to leaf. So let's do it before attaching anything
@@ -917,9 +916,8 @@ err_out:
 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system
 * blocks.
 */
-static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
+static int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
-                               ext4_lblk_t iblock, unsigned int maxblocks,
+                               struct ext4_map_blocks *map,
-                               struct buffer_head *bh_result,
                               int flags)
 {
        int err = -EIO;
@@ -933,9 +931,9 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
        int count = 0;
        ext4_fsblk_t first_block = 0;
-        J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL));
+        J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)));
        J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);
-        depth = ext4_block_to_path(inode, iblock, offsets,
+        depth = ext4_block_to_path(inode, map->m_lblk, offsets,
                                   &blocks_to_boundary);
        if (depth == 0)
@@ -946,10 +944,9 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
        /* Simplest case - block found, no allocation needed */
        if (!partial) {
                first_block = le32_to_cpu(chain[depth - 1].key);
-                clear_buffer_new(bh_result);
                count++;
                /*map more blocks*/
-                while (count < maxblocks && count <= blocks_to_boundary) {
+                while (count < map->m_len && count <= blocks_to_boundary) {
                        ext4_fsblk_t blk;
                        blk = le32_to_cpu(*(chain[depth-1].p + count));
@@ -969,7 +966,7 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
        /*
         * Okay, we need to do block allocation.
        */
-        goal = ext4_find_goal(inode, iblock, partial);
+        goal = ext4_find_goal(inode, map->m_lblk, partial);
        /* the number of blocks need to allocate for [d,t]indirect blocks */
        indirect_blks = (chain + depth) - partial - 1;
@@ -979,11 +976,11 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
         * direct blocks to allocate for this branch.
         */
        count = ext4_blks_to_allocate(partial, indirect_blks,
-                                        maxblocks, blocks_to_boundary);
+                                      map->m_len, blocks_to_boundary);
        /*
         * Block out ext4_truncate while we alter the tree
         */
-        err = ext4_alloc_branch(handle, inode, iblock, indirect_blks,
+        err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks,
                                &count, goal,
                                offsets + (partial - chain), partial);
@@ -995,18 +992,20 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
         * may need to return -EAGAIN upwards in the worst case.  --sct
         */
        if (!err)
-                err = ext4_splice_branch(handle, inode, iblock,
+                err = ext4_splice_branch(handle, inode, map->m_lblk,
                                         partial, indirect_blks, count);
        if (err)
                goto cleanup;
-        set_buffer_new(bh_result);
+        map->m_flags |= EXT4_MAP_NEW;
        ext4_update_inode_fsync_trans(handle, inode, 1);
 got_it:
-        map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
+        map->m_flags |= EXT4_MAP_MAPPED;
+        map->m_pblk = le32_to_cpu(chain[depth-1].key);
+        map->m_len = count;
        if (count > blocks_to_boundary)
-                set_buffer_boundary(bh_result);
+                map->m_flags |= EXT4_MAP_BOUNDARY;
        err = count;
        /* Clean up and exit */
        partial = chain + depth - 1;    /* the whole chain */
@@ -1016,7 +1015,6 @@ cleanup:
                brelse(partial->bh);
                partial--;
        }
-        BUFFER_TRACE(bh_result, "returned");
 out:
        return err;
 }
@@ -1061,7 +1059,7 @@ static int ext4_indirect_calc_metadata_amount(struct inode *inode,
 */
 static int ext4_calc_metadata_amount(struct inode *inode, sector_t lblock)
 {
-        if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
+        if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
                return ext4_ext_calc_metadata_amount(inode, lblock);
        return ext4_indirect_calc_metadata_amount(inode, lblock);
@@ -1076,7 +1074,6 @@ void ext4_da_update_reserve_space(struct inode *inode,
 {
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        struct ext4_inode_info *ei = EXT4_I(inode);
-        int mdb_free = 0, allocated_meta_blocks = 0;
        spin_lock(&ei->i_block_reservation_lock);
        trace_ext4_da_update_reserve_space(inode, used);
@@ -1091,11 +1088,10 @@ void ext4_da_update_reserve_space(struct inode *inode,
        /* Update per-inode reservations */
        ei->i_reserved_data_blocks -= used;
-        used += ei->i_allocated_meta_blocks;
        ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks;
-        allocated_meta_blocks = ei->i_allocated_meta_blocks;
+        percpu_counter_sub(&sbi->s_dirtyblocks_counter,
+                           used + ei->i_allocated_meta_blocks);
        ei->i_allocated_meta_blocks = 0;
-        percpu_counter_sub(&sbi->s_dirtyblocks_counter, used);
        if (ei->i_reserved_data_blocks == 0) {
                /*
@@ -1103,30 +1099,23 @@ void ext4_da_update_reserve_space(struct inode *inode,
                 * only when we have written all of the delayed
                 * allocation blocks.
                 */
-                mdb_free = ei->i_reserved_meta_blocks;
+                percpu_counter_sub(&sbi->s_dirtyblocks_counter,
+                                   ei->i_reserved_meta_blocks);
                ei->i_reserved_meta_blocks = 0;
                ei->i_da_metadata_calc_len = 0;
-                percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free);
        }
        spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
-        /* Update quota subsystem */
+        /* Update quota subsystem for data blocks */
-        if (quota_claim) {
+        if (quota_claim)
                dquot_claim_block(inode, used);
-                if (mdb_free)
+        else {
-                        dquot_release_reservation_block(inode, mdb_free);
-        } else {
                /*
                 * We did fallocate with an offset that is already delayed
                 * allocated. So on delayed allocated writeback we should
-                 * not update the quota for allocated blocks. But then
+                 * not re-claim the quota for fallocated blocks.
-                 * converting an fallocate region to initialized region would
-                 * have caused a metadata allocation. So claim quota for
-                 * that
                 */
-                if (allocated_meta_blocks)
+                dquot_release_reservation_block(inode, used);
-                        dquot_claim_block(inode, allocated_meta_blocks);
-                dquot_release_reservation_block(inode, mdb_free + used);
        }
        /*
@@ -1139,15 +1128,15 @@ void ext4_da_update_reserve_space(struct inode *inode,
                ext4_discard_preallocations(inode);
 }
-static int check_block_validity(struct inode *inode, const char *msg,
+static int check_block_validity(struct inode *inode, const char *func,
-                                sector_t logical, sector_t phys, int len)
+                                struct ext4_map_blocks *map)
 {
-        if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) {
+        if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), map->m_pblk,
-                __ext4_error(inode->i_sb, msg,
+                                   map->m_len)) {
-                           "inode #%lu logical block %llu mapped to %llu "
+                ext4_error_inode(func, inode,
-                           "(size %d)", inode->i_ino,
+                           "lblock %lu mapped to illegal pblock %llu "
-                           (unsigned long long) logical,
+                           "(length %d)", (unsigned long) map->m_lblk,
-                           (unsigned long long) phys, len);
+                                 map->m_pblk, map->m_len);
                return -EIO;
        }
        return 0;
@@ -1212,15 +1201,15 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
 }
 /*
- * The ext4_get_blocks() function tries to look up the requested blocks,
+ * The ext4_map_blocks() function tries to look up the requested blocks,
 * and returns if the blocks are already mapped.
 *
 * Otherwise it takes the write lock of the i_data_sem, allocates blocks,
 * stores the allocated blocks in the result buffer head and marks it
 * mapped.
 *
- * If file type is extents based, it will call ext4_ext_get_blocks(),
+ * If file type is extents based, it will call ext4_ext_map_blocks(),
- * Otherwise, call with ext4_ind_get_blocks() to handle indirect mapping
+ * otherwise it calls ext4_ind_map_blocks() to handle indirect mapping
 * based files
 *
 * On success, it returns the number of blocks being mapped or allocated.
@@ -1233,35 +1222,29 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
 *
 * It returns the error in case of allocation failure.
 */
-int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
+int ext4_map_blocks(handle_t *handle, struct inode *inode,
-                    unsigned int max_blocks, struct buffer_head *bh,
+                    struct ext4_map_blocks *map, int flags)
-                    int flags)
 {
        int retval;
-        clear_buffer_mapped(bh);
+        map->m_flags = 0;
-        clear_buffer_unwritten(bh);
+        ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u,"
+                  "logical block %lu\n", inode->i_ino, flags, map->m_len,
-        ext_debug("ext4_get_blocks(): inode %lu, flag %d, max_blocks %u,"
+                  (unsigned long) map->m_lblk);
-                  "logical block %lu\n", inode->i_ino, flags, max_blocks,
-                  (unsigned long)block);
        /*
         * Try to see if we can get the block without requesting a new
         * file system block.
         */
        down_read((&EXT4_I(inode)->i_data_sem));
-        if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
+        if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
-                retval =  ext4_ext_get_blocks(handle, inode, block, max_blocks,
+                retval = ext4_ext_map_blocks(handle, inode, map, 0);
-                                bh, 0);
        } else {
-                retval = ext4_ind_get_blocks(handle, inode, block, max_blocks,
+                retval = ext4_ind_map_blocks(handle, inode, map, 0);
-                                             bh, 0);
        }
        up_read((&EXT4_I(inode)->i_data_sem));
-        if (retval > 0 && buffer_mapped(bh)) {
+        if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
-                int ret = check_block_validity(inode, "file system corruption",
+                int ret = check_block_validity(inode, __func__, map);
-                                               block, bh->b_blocknr, retval);
                if (ret != 0)
                        return ret;
        }
@@ -1277,7 +1260,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
         * ext4_ext_get_block() returns th create = 0
         * with buffer head unmapped.
         */
-        if (retval > 0 && buffer_mapped(bh))
+        if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
                return retval;
        /*
@@ -1290,7 +1273,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
         * of BH_Unwritten and BH_Mapped flags being simultaneously
         * set on the buffer_head.
         */
-        clear_buffer_unwritten(bh);
+        map->m_flags &= ~EXT4_MAP_UNWRITTEN;
        /*
         * New blocks allocate and/or writing to uninitialized extent
@@ -1312,14 +1295,12 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
         * We need to check for EXT4 here because migrate
         * could have changed the inode type in between
         */
-        if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
+        if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
-                retval =  ext4_ext_get_blocks(handle, inode, block, max_blocks,
+                retval = ext4_ext_map_blocks(handle, inode, map, flags);
-                                              bh, flags);
        } else {
-                retval = ext4_ind_get_blocks(handle, inode, block,
+                retval = ext4_ind_map_blocks(handle, inode, map, flags);
-                                             max_blocks, bh, flags);
-                if (retval > 0 && buffer_new(bh)) {
+                if (retval > 0 && map->m_flags & EXT4_MAP_NEW) {
                        /*
                         * We allocated new blocks which will result in
                         * i_data's format changing.  Force the migrate
@@ -1342,10 +1323,10 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
                EXT4_I(inode)->i_delalloc_reserved_flag = 0;
        up_write((&EXT4_I(inode)->i_data_sem));
-        if (retval > 0 && buffer_mapped(bh)) {
+        if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
-                int ret = check_block_validity(inode, "file system "
+                int ret = check_block_validity(inode,
-                                               "corruption after allocation",
+                                               "ext4_map_blocks_after_alloc",
-                                               block, bh->b_blocknr, retval);
+                                               map);
                if (ret != 0)
                        return ret;
        }
@@ -1355,109 +1336,109 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
 /* Maximum number of blocks we map for direct IO at once. */
 #define DIO_MAX_BLOCKS 4096
+/*
+ * Map the blocks starting at logical block iblock of inode into bh.
+ * The number of blocks requested is taken from bh->b_size, and flags
+ * is passed straight through to ext4_map_blocks().  If flags is
+ * non-zero and no journal handle is current, a handle is started here
+ * (with the request capped at DIO_MAX_BLOCKS) and stopped again before
+ * returning.
+ *
+ * Returns 0 when ext4_map_blocks() returns >= 0 (bh is only filled in
+ * when at least one block was mapped), otherwise the error from
+ * ext4_journal_start() or ext4_map_blocks().
+ */
-int ext4_get_block(struct inode *inode, sector_t iblock,
+static int _ext4_get_block(struct inode *inode, sector_t iblock,
-                   struct buffer_head *bh_result, int create)
+                           struct buffer_head *bh, int flags)
 {
        handle_t *handle = ext4_journal_current_handle();
+        struct ext4_map_blocks map;
        int ret = 0, started = 0;
-        unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
        int dio_credits;
-        if (create && !handle) {
+        map.m_lblk = iblock;
+        map.m_len = bh->b_size >> inode->i_blkbits;
+        if (flags && !handle) {
                /* Direct IO write... */
-                if (max_blocks > DIO_MAX_BLOCKS)
+                if (map.m_len > DIO_MAX_BLOCKS)
-                        max_blocks = DIO_MAX_BLOCKS;
+                        map.m_len = DIO_MAX_BLOCKS;
-                dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
+                dio_credits = ext4_chunk_trans_blocks(inode, map.m_len);
                handle = ext4_journal_start(inode, dio_credits);
                if (IS_ERR(handle)) {
                        ret = PTR_ERR(handle);
-                        goto out;
+                        return ret;
                }
                started = 1;
        }
-        ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
+        ret = ext4_map_blocks(handle, inode, &map, flags);
-                              create ? EXT4_GET_BLOCKS_CREATE : 0);
        if (ret > 0) {
-                bh_result->b_size = (ret << inode->i_blkbits);
+                map_bh(bh, inode->i_sb, map.m_pblk);
+                /* replace only the EXT4_MAP_FLAGS bits of b_state with the new mapping's flags */
+                bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
+                bh->b_size = inode->i_sb->s_blocksize * map.m_len;
                ret = 0;
        }
        if (started)
                ext4_journal_stop(handle);
-out:
        return ret;
 }
+/*
+ * Map iblock of inode into bh through _ext4_get_block().  A non-zero
+ * create requests EXT4_GET_BLOCKS_CREATE; zero passes no flags at all.
+ */
+int ext4_get_block(struct inode *inode, sector_t iblock,
+                   struct buffer_head *bh, int create)
+{
+        int flags = 0;
+        if (create)
+                flags = EXT4_GET_BLOCKS_CREATE;
+        return _ext4_get_block(inode, iblock, bh, flags);
+}
 /*
 * `handle' can be NULL if create is zero
+ *
+ * Returns the buffer_head for logical block `block' of `inode', or
+ * NULL.  *errp is written on every return path: 0 on success and
+ * when the block is a hole, -EIO when sb_getblk() fails, otherwise
+ * the error from ext4_map_blocks() or from journalling a newly
+ * allocated buffer.
 */
 struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
                                ext4_lblk_t block, int create, int *errp)
 {
-        struct buffer_head dummy;
+        struct ext4_map_blocks map;
+        struct buffer_head *bh;
        int fatal = 0, err;
-        int flags = 0;
        J_ASSERT(handle != NULL || create == 0);
-        dummy.b_state = 0;
+        map.m_lblk = block;
-        dummy.b_blocknr = -1000;
+        map.m_len = 1;
-        buffer_trace_init(&dummy.b_history);
+        err = ext4_map_blocks(handle, inode, &map,
-        if (create)
+                              create ? EXT4_GET_BLOCKS_CREATE : 0);
-                flags |= EXT4_GET_BLOCKS_CREATE;
-        err = ext4_get_blocks(handle, inode, block, 1, &dummy, flags);
+        /*
+         * A return of 0 means a hole, which is not an error.  Give
+         * *errp a defined value before any early return so that a
+         * caller that only checks for NULL never reads garbage; the
+         * code this replaces always stored err here as well.
+         */
+        *errp = err < 0 ? err : 0;
-        /*
-         * ext4_get_blocks() returns number of blocks mapped. 0 in
+        if (err <= 0)
-         * case of a HOLE.
+                return NULL;
-         */
-        if (err > 0) {
-                if (err > 1)
+        bh = sb_getblk(inode->i_sb, map.m_pblk);
-                        WARN_ON(1);
+        if (!bh) {
-                err = 0;
+                *errp = -EIO;
+                return NULL;
        }
-        *errp = err;
+        if (map.m_flags & EXT4_MAP_NEW) {
-        if (!err && buffer_mapped(&dummy)) {
+                J_ASSERT(create != 0);
-                struct buffer_head *bh;
+                J_ASSERT(handle != NULL);
-                bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
-                if (!bh) {
-                        *errp = -EIO;
-                        goto err;
-                }
-                if (buffer_new(&dummy)) {
-                        J_ASSERT(create != 0);
-                        J_ASSERT(handle != NULL);
-                        /*
+                /*
-                         * Now that we do not always journal data, we should
+                 * Now that we do not always journal data, we should
-                         * keep in mind whether this should always journal the
+                 * keep in mind whether this should always journal the
-                         * new buffer as metadata.  For now, regular file
+                 * new buffer as metadata.  For now, regular file
-                         * writes use ext4_get_block instead, so it's not a
+                 * writes use ext4_get_block instead, so it's not a
-                         * problem.
+                 * problem.
-                         */
+                 */
-                        lock_buffer(bh);
+                lock_buffer(bh);
-                        BUFFER_TRACE(bh, "call get_create_access");
+                BUFFER_TRACE(bh, "call get_create_access");
-                        fatal = ext4_journal_get_create_access(handle, bh);
+                fatal = ext4_journal_get_create_access(handle, bh);
-                        if (!fatal && !buffer_uptodate(bh)) {
+                if (!fatal && !buffer_uptodate(bh)) {
-                                memset(bh->b_data, 0, inode->i_sb->s_blocksize);
+                        memset(bh->b_data, 0, inode->i_sb->s_blocksize);
-                                set_buffer_uptodate(bh);
+                        set_buffer_uptodate(bh);
-                        }
-                        unlock_buffer(bh);
-                        BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
-                        err = ext4_handle_dirty_metadata(handle, inode, bh);
-                        if (!fatal)
-                                fatal = err;
-                } else {
-                        BUFFER_TRACE(bh, "not a new buffer");
-                }
-                if (fatal) {
-                        *errp = fatal;
-                        brelse(bh);
-                        bh = NULL;
                }
-                return bh;
+                unlock_buffer(bh);
+                BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
+                err = ext4_handle_dirty_metadata(handle, inode, bh);
+                if (!fatal)
+                        fatal = err;
+        } else {
+                BUFFER_TRACE(bh, "not a new buffer");
        }
-err:
+        if (fatal) {
-        return NULL;
+                *errp = fatal;
+                brelse(bh);
+                bh = NULL;
+        }
+        return bh;
 }
 struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
@@ -1860,7 +1841,7 @@ static int ext4_da_reserve_space(struct inode *inode, sector_t lblock)
        int retries = 0;
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        struct ext4_inode_info *ei = EXT4_I(inode);
-        unsigned long md_needed, md_reserved;
+        unsigned long md_needed;
        int ret;
        /*
@@ -1870,22 +1851,24 @@ static int ext4_da_reserve_space(struct inode *inode, sector_t lblock)
         */
 repeat:
        spin_lock(&ei->i_block_reservation_lock);
-        md_reserved = ei->i_reserved_meta_blocks;
        md_needed = ext4_calc_metadata_amount(inode, lblock);
        trace_ext4_da_reserve_space(inode, md_needed);
        spin_unlock(&ei->i_block_reservation_lock);
        /*
-         * Make quota reservation here to prevent quota overflow
+         * We will charge metadata quota at writeout time; this saves
-         * later. Real quota accounting is done at pages writeout
+         * us from metadata over-estimation, though we may go over by
-         * time.
+         * a small amount in the end.  Here we just reserve for data.
         */
-        ret = dquot_reserve_block(inode, md_needed + 1);
+        ret = dquot_reserve_block(inode, 1);
        if (ret)
                return ret;
+        /*
+         * We do still charge estimated metadata to the sb though;
+         * we cannot afford to run out of free blocks.
+         */
        if (ext4_claim_free_blocks(sbi, md_needed + 1)) {
-                dquot_release_reservation_block(inode, md_needed + 1);
+                dquot_release_reservation_block(inode, 1);
                if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
                        yield();
                        goto repeat;
@@ -1910,6 +1893,7 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
        spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
+        trace_ext4_da_release_space(inode, to_free);
        if (unlikely(to_free > ei->i_reserved_data_blocks)) {
                /*
                 * if there aren't enough reserved blocks, then the
@@ -1932,12 +1916,13 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
                 * only when we have written all of the delayed
                 * allocation blocks.
                 */
-                to_free += ei->i_reserved_meta_blocks;
+                percpu_counter_sub(&sbi->s_dirtyblocks_counter,
+                                   ei->i_reserved_meta_blocks);
                ei->i_reserved_meta_blocks = 0;
                ei->i_da_metadata_calc_len = 0;
        }
-        /* update fs dirty blocks counter */
+        /* update fs dirty data blocks counter */
        percpu_counter_sub(&sbi->s_dirtyblocks_counter, to_free);
        spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
@@ -2042,28 +2027,23 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
 /*
 * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers
 *
- * @mpd->inode - inode to walk through
- * @exbh->b_blocknr - first block on a disk
- * @exbh->b_size - amount of space in bytes
- * @logical - first logical block to start assignment with
- *
 * the function goes through all passed space and put actual disk
 * block numbers into buffer heads, dropping BH_Delay and BH_Unwritten
 */
-static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
+static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd,
-                                 struct buffer_head *exbh)
+                                 struct ext4_map_blocks *map)
 {
        struct inode *inode = mpd->inode;
        struct address_space *mapping = inode->i_mapping;
-        int blocks = exbh->b_size >> inode->i_blkbits;
+        int blocks = map->m_len;
-        sector_t pblock = exbh->b_blocknr, cur_logical;
+        sector_t pblock = map->m_pblk, cur_logical;
        struct buffer_head *head, *bh;
        pgoff_t index, end;
        struct pagevec pvec;
        int nr_pages, i;
-        index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
+        index = map->m_lblk >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
-        end = (logical + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
+        end = (map->m_lblk + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
        cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
        pagevec_init(&pvec, 0);
@@ -2090,17 +2070,16 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
                        /* skip blocks out of the range */
                        do {
-                                if (cur_logical >= logical)
+                                if (cur_logical >= map->m_lblk)
                                        break;
                                cur_logical++;
                        } while ((bh = bh->b_this_page) != head);
                        do {
-                                if (cur_logical >= logical + blocks)
+                                if (cur_logical >= map->m_lblk + blocks)
                                        break;
-                                if (buffer_delay(bh) ||
+                                if (buffer_delay(bh) || buffer_unwritten(bh)) {
-                                                buffer_unwritten(bh)) {
                                        BUG_ON(bh->b_bdev != inode->i_sb->s_bdev);
@@ -2119,7 +2098,7 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
                                } else if (buffer_mapped(bh))
                                        BUG_ON(bh->b_blocknr != pblock);
-                                if (buffer_uninit(exbh))
+                                if (map->m_flags & EXT4_MAP_UNINIT)
                                        set_buffer_uninit(bh);
                                cur_logical++;
                                pblock++;
@@ -2130,21 +2109,6 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
 }
-/*
- * __unmap_underlying_blocks - just a helper function to unmap
- * set of blocks described by @bh
- */
-static inline void __unmap_underlying_blocks(struct inode *inode,
-                                             struct buffer_head *bh)
-{
-        struct block_device *bdev = inode->i_sb->s_bdev;
-        int blocks, i;
-        blocks = bh->b_size >> inode->i_blkbits;
-        for (i = 0; i < blocks; i++)
-                unmap_underlying_metadata(bdev, bh->b_blocknr + i);
-}
 static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
                                        sector_t logical, long blk_cnt)
 {
@@ -2206,7 +2170,7 @@ static void ext4_print_free_blocks(struct inode *inode)
 static int mpage_da_map_blocks(struct mpage_da_data *mpd)
 {
        int err, blks, get_blocks_flags;
-        struct buffer_head new;
+        struct ext4_map_blocks map;
        sector_t next = mpd->b_blocknr;
        unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits;
        loff_t disksize = EXT4_I(mpd->inode)->i_disksize;
@@ -2247,15 +2211,15 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
         * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting
         * variables are updated after the blocks have been allocated.
         */
-        new.b_state = 0;
+        map.m_lblk = next;
+        map.m_len = max_blocks;
        get_blocks_flags = EXT4_GET_BLOCKS_CREATE;
        if (ext4_should_dioread_nolock(mpd->inode))
                get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
        if (mpd->b_state & (1 << BH_Delay))
                get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
-        blks = ext4_get_blocks(handle, mpd->inode, next, max_blocks,
+        blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags);
-                               &new, get_blocks_flags);
        if (blks < 0) {
                err = blks;
                /*
@@ -2282,7 +2246,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
                ext4_msg(mpd->inode->i_sb, KERN_CRIT,
                         "delayed block allocation failed for inode %lu at "
                         "logical offset %llu with max blocks %zd with "
-                         "error %d\n", mpd->inode->i_ino,
+                         "error %d", mpd->inode->i_ino,
                         (unsigned long long) next,
                         mpd->b_size >> mpd->inode->i_blkbits, err);
                printk(KERN_CRIT "This should not happen!!  "
@@ -2297,10 +2261,13 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
        }
        BUG_ON(blks == 0);
-        new.b_size = (blks << mpd->inode->i_blkbits);
+        if (map.m_flags & EXT4_MAP_NEW) {
+                struct block_device *bdev = mpd->inode->i_sb->s_bdev;
+                int i;
-        if (buffer_new(&new))
+                for (i = 0; i < map.m_len; i++)
-                __unmap_underlying_blocks(mpd->inode, &new);
+                        unmap_underlying_metadata(bdev, map.m_pblk + i);
+        }
        /*
         * If blocks are delayed marked, we need to
@@ -2308,7 +2275,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
         */
        if ((mpd->b_state & (1 << BH_Delay)) ||
            (mpd->b_state & (1 << BH_Unwritten)))
-                mpage_put_bnr_to_bhs(mpd, next, &new);
+                mpage_put_bnr_to_bhs(mpd, &map);
        if (ext4_should_order_data(mpd->inode)) {
                err = ext4_jbd2_file_inode(handle, mpd->inode);
@@ -2349,8 +2316,17 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
        sector_t next;
        int nrblocks = mpd->b_size >> mpd->inode->i_blkbits;
+        /*
+         * XXX Don't go larger than mballoc is willing to allocate
+         * This is a stopgap solution.  We eventually need to fold
+         * mpage_da_submit_io() into this function and then call
+         * ext4_get_blocks() multiple times in a loop
+         */
+        if (nrblocks >= 8*1024*1024/mpd->inode->i_sb->s_blocksize)
+                goto flush_it;
        /* check if thereserved journal credits might overflow */
-        if (!(EXT4_I(mpd->inode)->i_flags & EXT4_EXTENTS_FL)) {
+        if (!(ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS))) {
                if (nrblocks >= EXT4_MAX_TRANS_DATA) {
                        /*
                         * With non-extent format we are limited by the journal
@@ -2423,17 +2399,6 @@ static int __mpage_da_writepage(struct page *page,
        struct buffer_head *bh, *head;
        sector_t logical;
-        if (mpd->io_done) {
-                /*
-                 * Rest of the page in the page_vec
-                 * redirty then and skip then. We will
-                 * try to write them again after
-                 * starting a new transaction
-                 */
-                redirty_page_for_writepage(wbc, page);
-                unlock_page(page);
-                return MPAGE_DA_EXTENT_TAIL;
-        }
        /*
         * Can we merge this page to current extent?
         */
@@ -2528,8 +2493,9 @@ static int __mpage_da_writepage(struct page *page,
 * initialized properly.
 */
 static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
-                                  struct buffer_head *bh_result, int create)
+                                  struct buffer_head *bh, int create)
 {
+        struct ext4_map_blocks map;
        int ret = 0;
        sector_t invalid_block = ~((sector_t) 0xffff);
@@ -2537,16 +2503,22 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
                invalid_block = ~0;
        BUG_ON(create == 0);
-        BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
+        BUG_ON(bh->b_size != inode->i_sb->s_blocksize);
+        map.m_lblk = iblock;
+        map.m_len = 1;
        /*
         * first, we need to know whether the block is allocated already
         * preallocated blocks are unmapped but should treated
         * the same as allocated blocks.
         */
-        ret = ext4_get_blocks(NULL, inode, iblock, 1,  bh_result, 0);
+        ret = ext4_map_blocks(NULL, inode, &map, 0);
-        if ((ret == 0) && !buffer_delay(bh_result)) {
+        if (ret < 0)
-                /* the block isn't (pre)allocated yet, let's reserve space */
+                return ret;
+        if (ret == 0) {
+                if (buffer_delay(bh))
+                        return 0; /* Not sure this could or should happen */
                /*
                 * XXX: __block_prepare_write() unmaps passed block,
                 * is it OK?
@@ -2556,26 +2528,26 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
                        /* not enough space to reserve */
                        return ret;
-                map_bh(bh_result, inode->i_sb, invalid_block);
+                map_bh(bh, inode->i_sb, invalid_block);
-                set_buffer_new(bh_result);
+                set_buffer_new(bh);
-                set_buffer_delay(bh_result);
+                set_buffer_delay(bh);
-        } else if (ret > 0) {
+                return 0;
-                bh_result->b_size = (ret << inode->i_blkbits);
-                if (buffer_unwritten(bh_result)) {
-                        /* A delayed write to unwritten bh should
-                         * be marked new and mapped.  Mapped ensures
-                         * that we don't do get_block multiple times
-                         * when we write to the same offset and new
-                         * ensures that we do proper zero out for
-                         * partial write.
-                         */
-                        set_buffer_new(bh_result);
-                        set_buffer_mapped(bh_result);
-                }
-                ret = 0;
        }
-        return ret;
+        map_bh(bh, inode->i_sb, map.m_pblk);
+        bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
+        if (buffer_unwritten(bh)) {
+                /* A delayed write to unwritten bh should be marked
+                 * new and mapped.  Mapped ensures that we don't do
+                 * get_block multiple times when we write to the same
+                 * offset and new ensures that we do proper zero out
+                 * for partial write.
+                 */
+                set_buffer_new(bh);
+                set_buffer_mapped(bh);
+        }
+        return 0;
 }
 /*
@@ -2597,21 +2569,8 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
 static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
                                   struct buffer_head *bh_result, int create)
 {
-        int ret = 0;
-        unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
        BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
+        /*
+         * flags == 0: only look up an existing mapping, never allocate.
+         * The create argument is deliberately ignored.
+         */
+        return _ext4_get_block(inode, iblock, bh_result, 0);
-        /*
-         * we don't want to do block allocation in writepage
-         * so call get_block_wrap with create = 0
-         */
-        ret = ext4_get_blocks(NULL, inode, iblock, max_blocks, bh_result, 0);
-        if (ret > 0) {
-                bh_result->b_size = (ret << inode->i_blkbits);
-                ret = 0;
-        }
-        return ret;
 }
 static int bget_one(handle_t *handle, struct buffer_head *bh)
@@ -2821,13 +2780,131 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
         * number of contiguous block. So we will limit
         * number of contiguous block to a sane value
         */
-        if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) &&
+        if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) &&
            (max_blocks > EXT4_MAX_TRANS_DATA))
                max_blocks = EXT4_MAX_TRANS_DATA;
        return ext4_chunk_trans_blocks(inode, max_blocks);
 }
+/*
+ * write_cache_pages_da - walk the list of dirty pages of the given
+ * address space and call __mpage_da_writepage() on each of them.
+ *
+ * This is a forked version of write_cache_pages().  Differences:
+ *      Range cyclic is ignored.
+ *      no_nrwrite_index_update is always presumed true
+ *
+ * @mapping: address space whose dirty-tagged pages are walked
+ * @wbc: supplies range_start/range_end, sync_mode and nr_to_write;
+ *       this function itself never writes wbc->nr_to_write, it
+ *       counts down a local copy instead
+ * @mpd: handed to __mpage_da_writepage() for every page
+ *
+ * Returns 0, or the first non-zero value other than
+ * AOP_WRITEPAGE_ACTIVATE that __mpage_da_writepage() returns; such a
+ * value also ends the walk.
+ */
+static int write_cache_pages_da(struct address_space *mapping,
+                                struct writeback_control *wbc,
+                                struct mpage_da_data *mpd)
+{
+        int ret = 0;
+        int done = 0;
+        struct pagevec pvec;
+        int nr_pages;
+        pgoff_t index;
+        pgoff_t end;            /* Inclusive */
+        long nr_to_write = wbc->nr_to_write;
+        pagevec_init(&pvec, 0);
+        index = wbc->range_start >> PAGE_CACHE_SHIFT;
+        end = wbc->range_end >> PAGE_CACHE_SHIFT;
+        while (!done && (index <= end)) {
+                int i;
+                nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+                              PAGECACHE_TAG_DIRTY,
+                              min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
+                if (nr_pages == 0)
+                        break;
+                for (i = 0; i < nr_pages; i++) {
+                        struct page *page = pvec.pages[i];
+                        /*
+                         * At this point, the page may be truncated or
+                         * invalidated (changing page->mapping to NULL), or
+                         * even swizzled back from swapper_space to tmpfs file
+                         * mapping. However, page->index will not change
+                         * because we have a reference on the page.
+                         */
+                        if (page->index > end) {
+                                done = 1;
+                                break;
+                        }
+                        lock_page(page);
+                        /*
+                         * Page truncated or invalidated. We can freely skip it
+                         * then, even for data integrity operations: the page
+                         * has disappeared concurrently, so there could be no
+                         * real expectation of this data integrity operation
+                         * even if there is now a new, dirty page at the same
+                         * pagecache address.
+                         */
+                        if (unlikely(page->mapping != mapping)) {
+continue_unlock:
+                                unlock_page(page);
+                                continue;
+                        }
+                        if (!PageDirty(page)) {
+                                /* someone wrote it for us */
+                                goto continue_unlock;
+                        }
+                        if (PageWriteback(page)) {
+                                if (wbc->sync_mode != WB_SYNC_NONE)
+                                        wait_on_page_writeback(page);
+                                else
+                                        goto continue_unlock;
+                        }
+                        BUG_ON(PageWriteback(page));
+                        if (!clear_page_dirty_for_io(page))
+                                goto continue_unlock;
+                        ret = __mpage_da_writepage(page, wbc, mpd);
+                        if (unlikely(ret)) {
+                                if (ret == AOP_WRITEPAGE_ACTIVATE) {
+                                        unlock_page(page);
+                                        ret = 0;
+                                } else {
+                                        done = 1;
+                                        break;
+                                }
+                        }
+                        if (nr_to_write > 0) {
+                                nr_to_write--;
+                                if (nr_to_write == 0 &&
+                                    wbc->sync_mode == WB_SYNC_NONE) {
+                                        /*
+                                         * We stop writing back only if we are
+                                         * not doing integrity sync. In case of
+                                         * integrity sync we have to keep going
+                                         * because someone may be concurrently
+                                         * dirtying pages, and we might have
+                                         * synced a lot of newly appeared dirty
+                                         * pages, but have not synced all of the
+                                         * old dirty pages.
+                                         */
+                                        done = 1;
+                                        break;
+                                }
+                        }
+                }
+                pagevec_release(&pvec);
+                cond_resched();
+        }
+        return ret;
+}
 static int ext4_da_writepages(struct address_space *mapping,
                              struct writeback_control *wbc)
 {
@@ -2836,7 +2913,6 @@ static int ext4_da_writepages(struct address_space *mapping,
        handle_t *handle = NULL;
        struct mpage_da_data mpd;
        struct inode *inode = mapping->host;
-        int no_nrwrite_index_update;
        int pages_written = 0;
        long pages_skipped;
        unsigned int max_pages;
@@ -2916,12 +2992,6 @@ static int ext4_da_writepages(struct address_space *mapping,
        mpd.wbc = wbc;
        mpd.inode = mapping->host;
-        /*
-         * we don't want write_cache_pages to update
-         * nr_to_write and writeback_index
-         */
-        no_nrwrite_index_update = wbc->no_nrwrite_index_update;
-        wbc->no_nrwrite_index_update = 1;
        pages_skipped = wbc->pages_skipped;
 retry:
@@ -2941,7 +3011,7 @@ retry:
                if (IS_ERR(handle)) {
                        ret = PTR_ERR(handle);
                        ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
-                               "%ld pages, ino %lu; err %d\n", __func__,
+                               "%ld pages, ino %lu; err %d", __func__,
                                wbc->nr_to_write, inode->i_ino, ret);
                        goto out_writepages;
                }
@@ -2963,8 +3033,7 @@ retry:
                mpd.io_done = 0;
                mpd.pages_written = 0;
                mpd.retval = 0;
-                ret = write_cache_pages(mapping, wbc, __mpage_da_writepage,
+                ret = write_cache_pages_da(mapping, wbc, &mpd);
-                                        &mpd);
                /*
                 * If we have a contiguous extent of pages and we
                 * haven't done the I/O yet, map the blocks and submit
@@ -3016,7 +3085,7 @@ retry:
        if (pages_skipped != wbc->pages_skipped)
                ext4_msg(inode->i_sb, KERN_CRIT,
                         "This should not happen leaving %s "
-                         "with nr_to_write = %ld ret = %d\n",
+                         "with nr_to_write = %ld ret = %d",
                         __func__, wbc->nr_to_write, ret);
        /* Update index */
@@ -3030,8 +3099,6 @@ retry:
                mapping->writeback_index = index;
 out_writepages:
-        if (!no_nrwrite_index_update)
-                wbc->no_nrwrite_index_update = 0;
        wbc->nr_to_write -= nr_to_writebump;
        wbc->range_start = range_start;
        trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
@@ -3076,7 +3143,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
                               loff_t pos, unsigned len, unsigned flags,
                               struct page **pagep, void **fsdata)
 {
-        int ret, retries = 0, quota_retries = 0;
+        int ret, retries = 0;
        struct page *page;
        pgoff_t index;
        unsigned from, to;
@@ -3135,22 +3202,6 @@ retry:
        if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
                goto retry;
-        if ((ret == -EDQUOT) &&
-            EXT4_I(inode)->i_reserved_meta_blocks &&
-            (quota_retries++ < 3)) {
-                /*
-                 * Since we often over-estimate the number of meta
-                 * data blocks required, we may sometimes get a
-                 * spurios out of quota error even though there would
-                 * be enough space once we write the data blocks and
-                 * find out how many meta data blocks were _really_
-                 * required.  So try forcing the inode write to see if
-                 * that helps.
-                 */
-                write_inode_now(inode, (quota_retries == 3));
-                goto retry;
-        }
 out:
        return ret;
 }
@@ -3546,46 +3597,18 @@ out:
        return ret;
 }
+/*
+ * ext4_get_block used when preparing for a DIO write or buffer write.
+ * We allocate an uninitialized extent if blocks haven't been allocated.
+ * The extent will be converted to initialized after the IO is complete.
+ */
 static int ext4_get_block_write(struct inode *inode, sector_t iblock,
                   struct buffer_head *bh_result, int create)
 {
-        handle_t *handle = ext4_journal_current_handle();
-        int ret = 0;
-        unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
-        int dio_credits;
-        int started = 0;
        ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n",
                   inode->i_ino, create);
-        /*
+        return _ext4_get_block(inode, iblock, bh_result,
-         * ext4_get_block in prepare for a DIO write or buffer write.
+                               EXT4_GET_BLOCKS_IO_CREATE_EXT);
-         * We allocate an uinitialized extent if blocks haven't been allocated.
-         * The extent will be converted to initialized after IO complete.
-         */
-        create = EXT4_GET_BLOCKS_IO_CREATE_EXT;
-        if (!handle) {
-                if (max_blocks > DIO_MAX_BLOCKS)
-                        max_blocks = DIO_MAX_BLOCKS;
-                dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
-                handle = ext4_journal_start(inode, dio_credits);
-                if (IS_ERR(handle)) {
-                        ret = PTR_ERR(handle);
-                        goto out;
-                }
-                started = 1;
-        }
-        ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
-                              create);
-        if (ret > 0) {
-                bh_result->b_size = (ret << inode->i_blkbits);
-                ret = 0;
-        }
-        if (started)
-                ext4_journal_stop(handle);
-out:
-        return ret;
 }
 static void dump_completed_IO(struct inode * inode)
@@ -3973,7 +3996,7 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
        struct file *file = iocb->ki_filp;
        struct inode *inode = file->f_mapping->host;
-        if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
+        if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
                return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
        return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
@@ -4302,10 +4325,9 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
        if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free,
                                   count)) {
-                ext4_error(inode->i_sb, "inode #%lu: "
+                EXT4_ERROR_INODE(inode, "attempt to clear invalid "
-                           "attempt to clear blocks %llu len %lu, invalid",
+                                 "blocks %llu len %lu",
-                           inode->i_ino, (unsigned long long) block_to_free,
+                                 (unsigned long long) block_to_free, count);
-                           count);
                return 1;
        }
@@ -4410,11 +4432,10 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
                if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh))
                        ext4_handle_dirty_metadata(handle, inode, this_bh);
                else
-                        ext4_error(inode->i_sb,
+                        EXT4_ERROR_INODE(inode,
-                                   "circular indirect block detected, "
+                                         "circular indirect block detected at "
-                                   "inode=%lu, block=%llu",
+                                         "block %llu",
-                                   inode->i_ino,
+                                (unsigned long long) this_bh->b_blocknr);
-                                   (unsigned long long) this_bh->b_blocknr);
        }
 }
@@ -4452,11 +4473,10 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
                        if (!ext4_data_block_valid(EXT4_SB(inode->i_sb),
                                                   nr, 1)) {
-                                ext4_error(inode->i_sb,
+                                EXT4_ERROR_INODE(inode,
-                                           "indirect mapped block in inode "
+                                                 "invalid indirect mapped "
-                                           "#%lu invalid (level %d, blk #%lu)",
+                                                 "block %lu (level %d)",
-                                           inode->i_ino, depth,
+                                                 (unsigned long) nr, depth);
-                                           (unsigned long) nr);
                                break;
                        }
@@ -4468,9 +4488,9 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
                         * (should be rare).
                         */
                        if (!bh) {
-                                ext4_error(inode->i_sb,
+                                EXT4_ERROR_INODE(inode,
-                                           "Read failure, inode=%lu, block=%llu",
+                                                 "Read failure block=%llu",
-                                           inode->i_ino, nr);
+                                                 (unsigned long long) nr);
                                continue;
                        }
@@ -4612,12 +4632,12 @@ void ext4_truncate(struct inode *inode)
        if (!ext4_can_truncate(inode))
                return;
-        EXT4_I(inode)->i_flags &= ~EXT4_EOFBLOCKS_FL;
+        ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
        if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
                ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
-        if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
+        if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
                ext4_ext_truncate(inode);
                return;
        }
@@ -4785,8 +4805,8 @@ static int __ext4_get_inode_loc(struct inode *inode,
        bh = sb_getblk(sb, block);
        if (!bh) {
-                ext4_error(sb, "unable to read inode block - "
+                EXT4_ERROR_INODE(inode, "unable to read inode block - "
-                           "inode=%lu, block=%llu", inode->i_ino, block);
+                                 "block %llu", block);
                return -EIO;
        }
        if (!buffer_uptodate(bh)) {
@@ -4884,8 +4904,8 @@ make_io:
                submit_bh(READ_META, bh);
                wait_on_buffer(bh);
                if (!buffer_uptodate(bh)) {
-                        ext4_error(sb, "unable to read inode block - inode=%lu,"
+                        EXT4_ERROR_INODE(inode, "unable to read inode "
-                                   " block=%llu", inode->i_ino, block);
+                                         "block %llu", block);
                        brelse(bh);
                        return -EIO;
                }
@@ -5096,8 +5116,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
        ret = 0;
        if (ei->i_file_acl &&
            !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) {
-                ext4_error(sb, "bad extended attribute block %llu inode #%lu",
+                EXT4_ERROR_INODE(inode, "bad extended attribute block %llu",
-                           ei->i_file_acl, inode->i_ino);
+                                 ei->i_file_acl);
                ret = -EIO;
                goto bad_inode;
        } else if (ei->i_flags & EXT4_EXTENTS_FL) {
@@ -5142,8 +5162,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
                           new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
        } else {
                ret = -EIO;
-                ext4_error(inode->i_sb, "bogus i_mode (%o) for inode=%lu",
+                EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode);
-                           inode->i_mode, inode->i_ino);
                goto bad_inode;
        }
        brelse(iloc.bh);
@@ -5381,9 +5400,9 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
                if (wbc->sync_mode == WB_SYNC_ALL)
                        sync_dirty_buffer(iloc.bh);
                if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
-                        ext4_error(inode->i_sb, "IO error syncing inode, "
+                        EXT4_ERROR_INODE(inode,
-                                   "inode=%lu, block=%llu", inode->i_ino,
+                                "IO error syncing inode (block=%llu)",
-                                   (unsigned long long)iloc.bh->b_blocknr);
+                                (unsigned long long) iloc.bh->b_blocknr);
                        err = -EIO;
                }
                brelse(iloc.bh);
@@ -5455,7 +5474,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
        }
        if (attr->ia_valid & ATTR_SIZE) {
-                if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) {
+                if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
                        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
                        if (attr->ia_size > sbi->s_bitmap_maxbytes) {
@@ -5468,7 +5487,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
        if (S_ISREG(inode->i_mode) &&
            attr->ia_valid & ATTR_SIZE &&
            (attr->ia_size < inode->i_size ||
-             (EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL))) {
+             (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))) {
                handle_t *handle;
                handle = ext4_journal_start(inode, 3);
@@ -5500,7 +5519,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
                        }
                }
                /* ext4_truncate will clear the flag */
-                if ((EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL))
+                if ((ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))
                        ext4_truncate(inode);
        }
@@ -5576,7 +5595,7 @@ static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks,
 static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
 {
-        if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+        if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
                return ext4_indirect_trans_blocks(inode, nrblocks, chunk);
        return ext4_ext_index_trans_blocks(inode, nrblocks, chunk);
 }
@@ -5911,9 +5930,9 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
         */
        if (val)
-                EXT4_I(inode)->i_flags |= EXT4_JOURNAL_DATA_FL;
+                ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
        else
-                EXT4_I(inode)->i_flags &= ~EXT4_JOURNAL_DATA_FL;
+                ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
        ext4_set_aops(inode);
        jbd2_journal_unlock_updates(journal);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 016d0249294f..bf5ae883b1bd 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -258,7 +258,7 @@ setversion_out:
                if (me.moved_len > 0)
                        file_remove_suid(donor_filp);
-                if (copy_to_user((struct move_extent __user *)arg, 
+                if (copy_to_user((struct move_extent __user *)arg,
                                 &me, sizeof(me)))
                        err = -EFAULT;
 mext_out:
@@ -373,7 +373,30 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
        case EXT4_IOC32_SETRSVSZ:
                cmd = EXT4_IOC_SETRSVSZ;
                break;
-        case EXT4_IOC_GROUP_ADD:
+        case EXT4_IOC32_GROUP_ADD: {
+                struct compat_ext4_new_group_input __user *uinput;
+                struct ext4_new_group_input input;
+                mm_segment_t old_fs;
+                int err;
+                uinput = compat_ptr(arg);
+                err = get_user(input.group, &uinput->group);
+                err |= get_user(input.block_bitmap, &uinput->block_bitmap);
+                err |= get_user(input.inode_bitmap, &uinput->inode_bitmap);
+                err |= get_user(input.inode_table, &uinput->inode_table);
+                err |= get_user(input.blocks_count, &uinput->blocks_count);
+                err |= get_user(input.reserved_blocks,
+                                &uinput->reserved_blocks);
+                if (err)
+                        return -EFAULT;
+                old_fs = get_fs();
+                set_fs(KERNEL_DS);
+                err = ext4_ioctl(file, EXT4_IOC_GROUP_ADD,
+                                 (unsigned long) &input);
+                set_fs(old_fs);
+                return err;
+        }
+        case EXT4_IOC_MOVE_EXT:
                break;
        default:
                return -ENOIOCTLCMD;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index b423a364dca3..12b3bc026a68 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -658,6 +658,27 @@ static void ext4_mb_mark_free_simple(struct super_block *sb,
        }
 }
+/*
+ * Remember, in grp->bb_largest_free_order, the highest buddy order for which
+ * this block group still has a free extent.  The cached value is -1 when no
+ * order has a positive counter.
+ */
+static void
+mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp)
+{
+        int order = sb->s_blocksize_bits + 1;
+        /* Scan downwards from the top order, skipping empty counters. */
+        while (order >= 0 && !(grp->bb_counters[order] > 0))
+                order--;
+        /* order has run down to -1 here if every counter was empty */
+        grp->bb_largest_free_order = order;
+}
 static noinline_for_stack
 void ext4_mb_generate_buddy(struct super_block *sb,
                                void *buddy, void *bitmap, ext4_group_t group)
@@ -700,6 +721,7 @@ void ext4_mb_generate_buddy(struct super_block *sb,
                 */
                grp->bb_free = free;
        }
+        mb_set_largest_free_order(sb, grp);
        clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
@@ -725,6 +747,9 @@ void ext4_mb_generate_buddy(struct super_block *sb,
 * contain blocks_per_page (PAGE_CACHE_SIZE / blocksize)  blocks.
 * So it can have information regarding groups_per_page which
 * is blocks_per_page/2
+ *
+ * Locking note:  This routine takes the block group lock of all groups
+ * for this page; do not hold this lock when calling this routine!
 */
 static int ext4_mb_init_cache(struct page *page, char *incore)
@@ -865,6 +890,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
                        BUG_ON(incore == NULL);
                        mb_debug(1, "put buddy for group %u in page %lu/%x\n",
                                group, page->index, i * blocksize);
+                        trace_ext4_mb_buddy_bitmap_load(sb, group);
                        grinfo = ext4_get_group_info(sb, group);
                        grinfo->bb_fragments = 0;
                        memset(grinfo->bb_counters, 0,
@@ -882,6 +908,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
                        BUG_ON(incore != NULL);
                        mb_debug(1, "put bitmap for group %u in page %lu/%x\n",
                                group, page->index, i * blocksize);
+                        trace_ext4_mb_bitmap_load(sb, group);
                        /* see comments in ext4_mb_put_pa() */
                        ext4_lock_group(sb, group);
@@ -910,6 +937,11 @@ out:
        return err;
 }
+/*
+ * Locking note:  This routine calls ext4_mb_init_cache(), which takes the
+ * block group lock of all groups for this page; do not hold the BG lock when
+ * calling this routine!
+ */
 static noinline_for_stack
 int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
 {
@@ -1004,6 +1036,11 @@ err:
        return ret;
 }
+/*
+ * Locking note:  This routine calls ext4_mb_init_cache(), which takes the
+ * block group lock of all groups for this page; do not hold the BG lock when
+ * calling this routine!
+ */
 static noinline_for_stack int
 ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
                                        struct ext4_buddy *e4b)
@@ -1150,7 +1187,7 @@ err:
        return ret;
 }
-static void ext4_mb_release_desc(struct ext4_buddy *e4b)
+static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
 {
        if (e4b->bd_bitmap_page)
                page_cache_release(e4b->bd_bitmap_page);
@@ -1299,6 +1336,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
                        buddy = buddy2;
                } while (1);
        }
+        mb_set_largest_free_order(sb, e4b->bd_info);
        mb_check_buddy(e4b);
 }
@@ -1427,6 +1465,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
                e4b->bd_info->bb_counters[ord]++;
                e4b->bd_info->bb_counters[ord]++;
        }
+        mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info);
        mb_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0);
        mb_check_buddy(e4b);
@@ -1617,7 +1656,7 @@ int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
        }
        ext4_unlock_group(ac->ac_sb, group);
-        ext4_mb_release_desc(e4b);
+        ext4_mb_unload_buddy(e4b);
        return 0;
 }
@@ -1672,7 +1711,7 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
                ext4_mb_use_best_found(ac, e4b);
        }
        ext4_unlock_group(ac->ac_sb, group);
-        ext4_mb_release_desc(e4b);
+        ext4_mb_unload_buddy(e4b);
        return 0;
 }
@@ -1821,16 +1860,22 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
        }
 }
+/* This is now called BEFORE we load the buddy bitmap. */
 static int ext4_mb_good_group(struct ext4_allocation_context *ac,
                                ext4_group_t group, int cr)
 {
        unsigned free, fragments;
-        unsigned i, bits;
        int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
        struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
        BUG_ON(cr < 0 || cr >= 4);
-        BUG_ON(EXT4_MB_GRP_NEED_INIT(grp));
+        /* We only do this if the grp has never been initialized */
+        if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
+                int ret = ext4_mb_init_group(ac->ac_sb, group);
+                if (ret)
+                        return 0;
+        }
        free = grp->bb_free;
        fragments = grp->bb_fragments;
@@ -1843,17 +1888,16 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
        case 0:
                BUG_ON(ac->ac_2order == 0);
+                if (grp->bb_largest_free_order < ac->ac_2order)
+                        return 0;
                /* Avoid using the first bg of a flexgroup for data files */
                if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
                    (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) &&
                    ((group % flex_size) == 0))
                        return 0;
-                bits = ac->ac_sb->s_blocksize_bits + 1;
+                return 1;
-                for (i = ac->ac_2order; i <= bits; i++)
-                        if (grp->bb_counters[i] > 0)
-                                return 1;
-                break;
        case 1:
                if ((free / fragments) >= ac->ac_g_ex.fe_len)
                        return 1;
@@ -1964,7 +2008,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
        sbi = EXT4_SB(sb);
        ngroups = ext4_get_groups_count(sb);
        /* non-extent files are limited to low blocks/groups */
-        if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL))
+        if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)))
                ngroups = sbi->s_blockfile_groups;
        BUG_ON(ac->ac_status == AC_STATUS_FOUND);
@@ -2024,15 +2068,11 @@ repeat:
                group = ac->ac_g_ex.fe_group;
                for (i = 0; i < ngroups; group++, i++) {
-                        struct ext4_group_info *grp;
-                        struct ext4_group_desc *desc;
                        if (group == ngroups)
                                group = 0;
-                        /* quick check to skip empty groups */
+                        /* This now checks without needing the buddy page */
-                        grp = ext4_get_group_info(sb, group);
+                        if (!ext4_mb_good_group(ac, group, cr))
-                        if (grp->bb_free == 0)
                                continue;
                        err = ext4_mb_load_buddy(sb, group, &e4b);
@@ -2040,15 +2080,18 @@ repeat:
                                goto out;
                        ext4_lock_group(sb, group);
+                        /*
+                         * We need to check again after locking the
+                         * block group
+                         */
                        if (!ext4_mb_good_group(ac, group, cr)) {
-                                /* someone did allocation from this group */
                                ext4_unlock_group(sb, group);
-                                ext4_mb_release_desc(&e4b);
+                                ext4_mb_unload_buddy(&e4b);
                                continue;
                        }
                        ac->ac_groups_scanned++;
-                        desc = ext4_get_group_desc(sb, group, NULL);
                        if (cr == 0)
                                ext4_mb_simple_scan_group(ac, &e4b);
                        else if (cr == 1 &&
@@ -2058,7 +2101,7 @@ repeat:
                                ext4_mb_complex_scan_group(ac, &e4b);
                        ext4_unlock_group(sb, group);
-                        ext4_mb_release_desc(&e4b);
+                        ext4_mb_unload_buddy(&e4b);
                        if (ac->ac_status != AC_STATUS_CONTINUE)
                                break;
@@ -2148,7 +2191,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
        ext4_lock_group(sb, group);
        memcpy(&sg, ext4_get_group_info(sb, group), i);
        ext4_unlock_group(sb, group);
-        ext4_mb_release_desc(&e4b);
+        ext4_mb_unload_buddy(&e4b);
        seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free,
                        sg.info.bb_fragments, sg.info.bb_first_free);
@@ -2255,6 +2298,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
        INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
        init_rwsem(&meta_group_info[i]->alloc_sem);
        meta_group_info[i]->bb_free_root = RB_ROOT;
+        meta_group_info[i]->bb_largest_free_order = -1;  /* uninit */
 #ifdef DOUBLE_CHECK
        {
@@ -2536,6 +2580,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
                         entry->count, entry->group, entry);
                if (test_opt(sb, DISCARD)) {
+                        int ret;
                        ext4_fsblk_t discard_block;
                        discard_block = entry->start_blk +
@@ -2543,7 +2588,12 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
                        trace_ext4_discard_blocks(sb,
                                        (unsigned long long)discard_block,
                                        entry->count);
-                        sb_issue_discard(sb, discard_block, entry->count);
+                        ret = sb_issue_discard(sb, discard_block, entry->count);
+                        /*
+                         * Errors are reported as negative errno values, so
+                         * the "not supported" case is -EOPNOTSUPP.  When the
+                         * device cannot discard, warn once and turn the
+                         * mount option off so we stop issuing requests.
+                         */
+                        if (ret == -EOPNOTSUPP) {
+                                ext4_warning(sb,
+                                        "discard not supported, disabling");
+                                clear_opt(EXT4_SB(sb)->s_mount_opt, DISCARD);
+                        }
                }
                err = ext4_mb_load_buddy(sb, entry->group, &e4b);
@@ -2568,7 +2618,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
                }
                ext4_unlock_group(sb, entry->group);
                kmem_cache_free(ext4_free_ext_cachep, entry);
-                ext4_mb_release_desc(&e4b);
+                ext4_mb_unload_buddy(&e4b);
        }
        mb_debug(1, "freed %u blocks in %u structures\n", count, count2);
@@ -2641,7 +2691,7 @@ int __init init_ext4_mballoc(void)
 void exit_ext4_mballoc(void)
 {
-        /* 
+        /*
         * Wait for completion of call_rcu()'s on ext4_pspace_cachep
         * before destroying the slab cache.
         */
@@ -2981,7 +3031,7 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
        if (sbi->s_mb_stats && ac->ac_g_ex.fe_len > 1) {
                atomic_inc(&sbi->s_bal_reqs);
                atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated);
-                if (ac->ac_o_ex.fe_len >= ac->ac_g_ex.fe_len)
+                if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len)
                        atomic_inc(&sbi->s_bal_success);
                atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned);
                if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
@@ -3123,7 +3173,7 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
                        continue;
                /* non-extent files can't have physical blocks past 2^32 */
-                if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL) &&
+                if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) &&
                        pa->pa_pstart + pa->pa_len > EXT4_MAX_BLOCK_FILE_PHYS)
                        continue;
@@ -3280,7 +3330,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
        spin_unlock(&pa->pa_lock);
        grp_blk = pa->pa_pstart;
-        /* 
+        /*
         * If doing group-based preallocation, pa_pstart may be in the
         * next group when pa is used up
         */
@@ -3697,7 +3747,7 @@ out:
        ext4_unlock_group(sb, group);
        if (ac)
                kmem_cache_free(ext4_ac_cachep, ac);
-        ext4_mb_release_desc(&e4b);
+        ext4_mb_unload_buddy(&e4b);
        put_bh(bitmap_bh);
        return free;
 }
@@ -3801,7 +3851,7 @@ repeat:
                if (bitmap_bh == NULL) {
                        ext4_error(sb, "Error reading block bitmap for %u",
                                        group);
-                        ext4_mb_release_desc(&e4b);
+                        ext4_mb_unload_buddy(&e4b);
                        continue;
                }
@@ -3810,7 +3860,7 @@ repeat:
                ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac);
                ext4_unlock_group(sb, group);
-                ext4_mb_release_desc(&e4b);
+                ext4_mb_unload_buddy(&e4b);
                put_bh(bitmap_bh);
                list_del(&pa->u.pa_tmp_list);
@@ -4074,7 +4124,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
                ext4_mb_release_group_pa(&e4b, pa, ac);
                ext4_unlock_group(sb, group);
-                ext4_mb_release_desc(&e4b);
+                ext4_mb_unload_buddy(&e4b);
                list_del(&pa->u.pa_tmp_list);
                call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
        }
@@ -4484,12 +4534,12 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
                        if (!bh)
                                tbh = sb_find_get_block(inode->i_sb,
                                                        block + i);
-                        ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA, 
+                        ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
                                    inode, tbh, block + i);
                }
        }
-        /* 
+        /*
         * We need to make sure we don't reuse the freed block until
         * after the transaction is committed, which we can do by
         * treating the block as metadata, below.  We make an
@@ -4610,7 +4660,7 @@ do_more:
                atomic_add(count, &sbi->s_flex_groups[flex_group].free_blocks);
        }
-        ext4_mb_release_desc(&e4b);
+        ext4_mb_unload_buddy(&e4b);
        freed += count;
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 34dcfc52ef44..6f3a27ec30bf 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -475,7 +475,7 @@ int ext4_ext_migrate(struct inode *inode)
         */
        if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb,
                                       EXT4_FEATURE_INCOMPAT_EXTENTS) ||
-            (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+            (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
                return -EINVAL;
        if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0)
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index d1fc662cc311..3a6c92ac131c 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -482,6 +482,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
        int depth = ext_depth(orig_inode);
        int ret;
+        start_ext.ee_block = end_ext.ee_block = 0;
        o_start = o_end = oext = orig_path[depth].p_ext;
        oext_alen = ext4_ext_get_actual_len(oext);
        start_ext.ee_len = end_ext.ee_len = 0;
@@ -529,7 +530,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
         * new_ext       |-------|
         */
        if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) {
-                ext4_error(orig_inode->i_sb,
+                EXT4_ERROR_INODE(orig_inode,
                        "new_ext_end(%u) should be less than or equal to "
                        "oext->ee_block(%u) + oext_alen(%d) - 1",
                        new_ext_end, le32_to_cpu(oext->ee_block),
@@ -692,12 +693,12 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
        while (1) {
                /* The extent for donor must be found. */
                if (!dext) {
-                        ext4_error(donor_inode->i_sb,
+                        EXT4_ERROR_INODE(donor_inode,
                                   "The extent for donor must be found");
                        *err = -EIO;
                        goto out;
                } else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) {
-                        ext4_error(donor_inode->i_sb,
+                        EXT4_ERROR_INODE(donor_inode,
                                "Donor offset(%u) and the first block of donor "
                                "extent(%u) should be equal",
                                donor_off,
@@ -976,11 +977,11 @@ mext_check_arguments(struct inode *orig_inode,
        }
        /* Ext4 move extent supports only extent based file */
-        if (!(EXT4_I(orig_inode)->i_flags & EXT4_EXTENTS_FL)) {
+        if (!(ext4_test_inode_flag(orig_inode, EXT4_INODE_EXTENTS))) {
                ext4_debug("ext4 move extent: orig file is not extents "
                        "based file [ino:orig %lu]\n", orig_inode->i_ino);
                return -EOPNOTSUPP;
-        } else if (!(EXT4_I(donor_inode)->i_flags & EXT4_EXTENTS_FL)) {
+        } else if (!(ext4_test_inode_flag(donor_inode, EXT4_INODE_EXTENTS))) {
                ext4_debug("ext4 move extent: donor file is not extents "
                        "based file [ino:donor %lu]\n", donor_inode->i_ino);
                return -EOPNOTSUPP;
@@ -1354,7 +1355,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
                        if (ret1 < 0)
                                break;
                        if (*moved_len > len) {
-                                ext4_error(orig_inode->i_sb,
+                                EXT4_ERROR_INODE(orig_inode,
                                        "We replaced blocks too much! "
                                        "sum of replaced: %llu requested: %llu",
                                        *moved_len, len);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 0c070fabd108..a43e6617b351 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -187,7 +187,7 @@ unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize)
                return blocksize;
        return (len & 65532) | ((len & 3) << 16);
 }
-  
 __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
 {
        if ((len > blocksize) || (blocksize > (1 << 18)) || (len & 3))
@@ -197,7 +197,7 @@ __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
        if (len == blocksize) {
                if (blocksize == 65536)
                        return cpu_to_le16(EXT4_MAX_REC_LEN);
-                else 
+                else
                        return cpu_to_le16(0);
        }
        return cpu_to_le16((len & 65532) | ((len >> 16) & 3));
@@ -349,7 +349,7 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
                brelse(bh);
        }
        if (bcount)
-                printk(KERN_DEBUG "%snames %u, fullness %u (%u%%)\n", 
+                printk(KERN_DEBUG "%snames %u, fullness %u (%u%%)\n",
                       levels ? "" : "   ", names, space/bcount,
                       (space/bcount)*100/blocksize);
        return (struct stats) { names, space, bcount};
@@ -653,10 +653,10 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
        int ret, err;
        __u32 hashval;
-        dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n", 
+        dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n",
                       start_hash, start_minor_hash));
        dir = dir_file->f_path.dentry->d_inode;
-        if (!(EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) {
+        if (!(ext4_test_inode_flag(dir, EXT4_INODE_INDEX))) {
                hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
                if (hinfo.hash_version <= DX_HASH_TEA)
                        hinfo.hash_version +=
@@ -801,7 +801,7 @@ static void ext4_update_dx_flag(struct inode *inode)
 {
        if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
                                     EXT4_FEATURE_COMPAT_DIR_INDEX))
-                EXT4_I(inode)->i_flags &= ~EXT4_INDEX_FL;
+                ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
 }
 /*
@@ -943,8 +943,8 @@ restart:
                wait_on_buffer(bh);
                if (!buffer_uptodate(bh)) {
                        /* read error, skip block & hope for the best */
-                        ext4_error(sb, "reading directory #%lu offset %lu",
+                        EXT4_ERROR_INODE(dir, "reading directory lblock %lu",
-                                   dir->i_ino, (unsigned long)block);
+                                         (unsigned long) block);
                        brelse(bh);
                        goto next;
                }
@@ -1066,15 +1066,15 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
                __u32 ino = le32_to_cpu(de->inode);
                brelse(bh);
                if (!ext4_valid_inum(dir->i_sb, ino)) {
-                        ext4_error(dir->i_sb, "bad inode number: %u", ino);
+                        EXT4_ERROR_INODE(dir, "bad inode number: %u", ino);
                        return ERR_PTR(-EIO);
                }
                inode = ext4_iget(dir->i_sb, ino);
                if (unlikely(IS_ERR(inode))) {
                        if (PTR_ERR(inode) == -ESTALE) {
-                                ext4_error(dir->i_sb,
+                                EXT4_ERROR_INODE(dir,
-                                                "deleted inode referenced: %u",
+                                                 "deleted inode referenced: %u",
-                                                ino);
+                                                 ino);
                                return ERR_PTR(-EIO);
                        } else {
                                return ERR_CAST(inode);
@@ -1104,8 +1104,8 @@ struct dentry *ext4_get_parent(struct dentry *child)
        brelse(bh);
        if (!ext4_valid_inum(child->d_inode->i_sb, ino)) {
-                ext4_error(child->d_inode->i_sb,
+                EXT4_ERROR_INODE(child->d_inode,
-                           "bad inode number: %u", ino);
+                                 "bad parent inode number: %u", ino);
                return ERR_PTR(-EIO);
        }
@@ -1141,7 +1141,7 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count,
        unsigned rec_len = 0;
        while (count--) {
-                struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) 
+                struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *)
                                                (from + (map->offs<<2));
                rec_len = EXT4_DIR_REC_LEN(de->name_len);
                memcpy (to, de, rec_len);
@@ -1404,9 +1404,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
        de = (struct ext4_dir_entry_2 *)((char *)fde +
                ext4_rec_len_from_disk(fde->rec_len, blocksize));
        if ((char *) de >= (((char *) root) + blocksize)) {
-                ext4_error(dir->i_sb,
+                EXT4_ERROR_INODE(dir, "invalid rec_len for '..'");
-                           "invalid rec_len for '..' in inode %lu",
-                           dir->i_ino);
                brelse(bh);
                return -EIO;
        }
@@ -1418,7 +1416,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
                brelse(bh);
                return retval;
        }
-        EXT4_I(dir)->i_flags |= EXT4_INDEX_FL;
+        ext4_set_inode_flag(dir, EXT4_INODE_INDEX);
        data1 = bh2->b_data;
        memcpy (data1, de, len);
@@ -1491,7 +1489,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
                retval = ext4_dx_add_entry(handle, dentry, inode);
                if (!retval || (retval != ERR_BAD_DX_DIR))
                        return retval;
-                EXT4_I(dir)->i_flags &= ~EXT4_INDEX_FL;
+                ext4_clear_inode_flag(dir, EXT4_INODE_INDEX);
                dx_fallback++;
                ext4_mark_inode_dirty(handle, dir);
        }
@@ -1519,6 +1517,8 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
        de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize);
        retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
        brelse(bh);
+        if (retval == 0)
+                ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY);
        return retval;
 }
@@ -1915,9 +1915,8 @@ static int empty_dir(struct inode *inode)
        if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) ||
            !(bh = ext4_bread(NULL, inode, 0, 0, &err))) {
                if (err)
-                        ext4_error(inode->i_sb,
+                        EXT4_ERROR_INODE(inode,
-                                   "error %d reading directory #%lu offset 0",
+                                "error %d reading directory lblock 0", err);
-                                   err, inode->i_ino);
                else
                        ext4_warning(inode->i_sb,
                                     "bad directory (dir #%lu) - no data block",
@@ -1941,17 +1940,17 @@ static int empty_dir(struct inode *inode)
        de = ext4_next_entry(de1, sb->s_blocksize);
        while (offset < inode->i_size) {
                if (!bh ||
-                        (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
+                    (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
+                        unsigned int lblock;
                        err = 0;
                        brelse(bh);
-                        bh = ext4_bread(NULL, inode,
+                        lblock = offset >> EXT4_BLOCK_SIZE_BITS(sb);
-                                offset >> EXT4_BLOCK_SIZE_BITS(sb), 0, &err);
+                        bh = ext4_bread(NULL, inode, lblock, 0, &err);
                        if (!bh) {
                                if (err)
-                                        ext4_error(sb,
+                                        EXT4_ERROR_INODE(inode,
-                                                   "error %d reading directory"
+                                                "error %d reading directory "
-                                                   " #%lu offset %u",
+                                                "lblock %u", err, lblock);
-                                                   err, inode->i_ino, offset);
                                offset += sb->s_blocksize;
                                continue;
                        }
@@ -2297,7 +2296,7 @@ retry:
                }
        } else {
                /* clear the extent format for fast symlink */
-                EXT4_I(inode)->i_flags &= ~EXT4_EXTENTS_FL;
+                ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS);
                inode->i_op = &ext4_fast_symlink_inode_operations;
                memcpy((char *)&EXT4_I(inode)->i_data, symname, l);
                inode->i_size = l-1;
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 5692c48754a0..6df797eb9aeb 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -911,7 +911,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
        percpu_counter_add(&sbi->s_freeinodes_counter,
                           EXT4_INODES_PER_GROUP(sb));
-        if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
+        if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG) &&
+            sbi->s_log_groups_per_flex) {
                ext4_group_t flex_group;
                flex_group = ext4_flex_group(sbi, input->group);
                atomic_add(input->free_blocks_count,
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index e14d22c170d5..4e8983a9811b 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -241,6 +241,7 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
        if (sb->s_flags & MS_RDONLY)
                return ERR_PTR(-EROFS);
+        vfs_check_frozen(sb, SB_FREEZE_WRITE);
        /* Special case here: if the journal has aborted behind our
         * backs (eg. EIO in the commit thread), then we still need to
         * take the FS itself readonly cleanly. */
@@ -645,6 +646,8 @@ static void ext4_put_super(struct super_block *sb)
        struct ext4_super_block *es = sbi->s_es;
        int i, err;
+        dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
        flush_workqueue(sbi->dio_unwritten_wq);
        destroy_workqueue(sbi->dio_unwritten_wq);
@@ -941,6 +944,8 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
        seq_puts(seq, test_opt(sb, BARRIER) ? "1" : "0");
        if (test_opt(sb, JOURNAL_ASYNC_COMMIT))
                seq_puts(seq, ",journal_async_commit");
+        else if (test_opt(sb, JOURNAL_CHECKSUM))
+                seq_puts(seq, ",journal_checksum");
        if (test_opt(sb, NOBH))
                seq_puts(seq, ",nobh");
        if (test_opt(sb, I_VERSION))
@@ -1059,7 +1064,7 @@ static int ext4_release_dquot(struct dquot *dquot);
 static int ext4_mark_dquot_dirty(struct dquot *dquot);
 static int ext4_write_info(struct super_block *sb, int type);
 static int ext4_quota_on(struct super_block *sb, int type, int format_id,
-                                char *path, int remount);
+                                char *path);
 static int ext4_quota_on_mount(struct super_block *sb, int type);
 static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
                               size_t len, loff_t off);
@@ -1081,12 +1086,12 @@ static const struct dquot_operations ext4_quota_operations = {
 static const struct quotactl_ops ext4_qctl_operations = {
        .quota_on       = ext4_quota_on,
-        .quota_off      = vfs_quota_off,
+        .quota_off      = dquot_quota_off,
-        .quota_sync     = vfs_quota_sync,
+        .quota_sync     = dquot_quota_sync,
-        .get_info       = vfs_get_dqinfo,
+        .get_info       = dquot_get_dqinfo,
-        .set_info       = vfs_set_dqinfo,
+        .set_info       = dquot_set_dqinfo,
-        .get_dqblk      = vfs_get_dqblk,
+        .get_dqblk      = dquot_get_dqblk,
-        .set_dqblk      = vfs_set_dqblk
+        .set_dqblk      = dquot_set_dqblk
 };
 #endif
@@ -2051,7 +2056,7 @@ static void ext4_orphan_cleanup(struct super_block *sb,
        /* Turn quotas off */
        for (i = 0; i < MAXQUOTAS; i++) {
                if (sb_dqopt(sb)->files[i])
-                        vfs_quota_off(sb, i, 0);
+                        dquot_quota_off(sb, i);
        }
 #endif
        sb->s_flags = s_flags; /* Restore MS_RDONLY status */
@@ -2213,7 +2218,7 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
 struct ext4_attr {
        struct attribute attr;
        ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *);
-        ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *, 
+        ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *,
                         const char *, size_t);
        int offset;
 };
@@ -2430,6 +2435,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                                __releases(kernel_lock)
                                __acquires(kernel_lock)
 {
+        char *orig_data = kstrdup(data, GFP_KERNEL);
        struct buffer_head *bh;
        struct ext4_super_block *es = NULL;
        struct ext4_sb_info *sbi;
@@ -2793,24 +2799,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        get_random_bytes(&sbi->s_next_generation, sizeof(u32));
        spin_lock_init(&sbi->s_next_gen_lock);
-        err = percpu_counter_init(&sbi->s_freeblocks_counter,
-                        ext4_count_free_blocks(sb));
-        if (!err) {
-                err = percpu_counter_init(&sbi->s_freeinodes_counter,
-                                ext4_count_free_inodes(sb));
-        }
-        if (!err) {
-                err = percpu_counter_init(&sbi->s_dirs_counter,
-                                ext4_count_dirs(sb));
-        }
-        if (!err) {
-                err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
-        }
-        if (err) {
-                ext4_msg(sb, KERN_ERR, "insufficient memory");
-                goto failed_mount3;
-        }
        sbi->s_stripe = ext4_get_stripe_size(sbi);
        sbi->s_max_writeback_mb_bump = 128;
@@ -2910,6 +2898,20 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
 no_journal:
+        err = percpu_counter_init(&sbi->s_freeblocks_counter,
+                                  ext4_count_free_blocks(sb));
+        if (!err)
+                err = percpu_counter_init(&sbi->s_freeinodes_counter,
+                                          ext4_count_free_inodes(sb));
+        if (!err)
+                err = percpu_counter_init(&sbi->s_dirs_counter,
+                                          ext4_count_dirs(sb));
+        if (!err)
+                err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
+        if (err) {
+                ext4_msg(sb, KERN_ERR, "insufficient memory");
+                goto failed_mount_wq;
+        }
        if (test_opt(sb, NOBH)) {
                if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) {
                        ext4_msg(sb, KERN_WARNING, "Ignoring nobh option - "
@@ -3001,7 +3003,7 @@ no_journal:
        err = ext4_setup_system_zone(sb);
        if (err) {
                ext4_msg(sb, KERN_ERR, "failed to initialize system "
-                         "zone (%d)\n", err);
+                         "zone (%d)", err);
                goto failed_mount4;
        }
@@ -3040,9 +3042,11 @@ no_journal:
        } else
                descr = "out journal";
-        ext4_msg(sb, KERN_INFO, "mounted filesystem with%s", descr);
+        ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. "
+                "Opts: %s", descr, orig_data);
        lock_kernel();
+        kfree(orig_data);
        return 0;
 cantfind_ext4:
@@ -3059,6 +3063,10 @@ failed_mount_wq:
                jbd2_journal_destroy(sbi->s_journal);
                sbi->s_journal = NULL;
        }
+        percpu_counter_destroy(&sbi->s_freeblocks_counter);
+        percpu_counter_destroy(&sbi->s_freeinodes_counter);
+        percpu_counter_destroy(&sbi->s_dirs_counter);
+        percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
 failed_mount3:
        if (sbi->s_flex_groups) {
                if (is_vmalloc_addr(sbi->s_flex_groups))
@@ -3066,10 +3074,6 @@ failed_mount3:
                else
                        kfree(sbi->s_flex_groups);
        }
-        percpu_counter_destroy(&sbi->s_freeblocks_counter);
-        percpu_counter_destroy(&sbi->s_freeinodes_counter);
-        percpu_counter_destroy(&sbi->s_dirs_counter);
-        percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
 failed_mount2:
        for (i = 0; i < db_count; i++)
                brelse(sbi->s_group_desc[i]);
@@ -3089,6 +3093,7 @@ out_fail:
        kfree(sbi->s_blockgroup_lock);
        kfree(sbi);
        lock_kernel();
+        kfree(orig_data);
        return ret;
 }
@@ -3380,7 +3385,7 @@ static int ext4_commit_super(struct super_block *sb, int sync)
        if (!(sb->s_flags & MS_RDONLY))
                es->s_wtime = cpu_to_le32(get_seconds());
        es->s_kbytes_written =
-                cpu_to_le64(EXT4_SB(sb)->s_kbytes_written + 
+                cpu_to_le64(EXT4_SB(sb)->s_kbytes_written +
                            ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
                              EXT4_SB(sb)->s_sectors_written_start) >> 1));
        ext4_free_blocks_count_set(es, percpu_counter_sum_positive(
@@ -3485,8 +3490,10 @@ int ext4_force_commit(struct super_block *sb)
                return 0;
        journal = EXT4_SB(sb)->s_journal;
-        if (journal)
+        if (journal) {
+                vfs_check_frozen(sb, SB_FREEZE_WRITE);
                ret = ext4_journal_force_commit(journal);
+        }
        return ret;
 }
@@ -3535,18 +3542,16 @@ static int ext4_freeze(struct super_block *sb)
         * the journal.
         */
        error = jbd2_journal_flush(journal);
-        if (error < 0) {
+        if (error < 0)
-        out:
+                goto out;
-                jbd2_journal_unlock_updates(journal);
-                return error;
-        }
        /* Journal blocked and flushed, clear needs_recovery flag. */
        EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
        error = ext4_commit_super(sb, 1);
-        if (error)
+out:
-                goto out;
+        /* we rely on s_frozen to stop further updates */
-        return 0;
+        jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+        return error;
 }
 /*
@@ -3563,7 +3568,6 @@ static int ext4_unfreeze(struct super_block *sb)
        EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
        ext4_commit_super(sb, 1);
        unlock_super(sb);
-        jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
        return 0;
 }
@@ -3574,12 +3578,14 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
        ext4_fsblk_t n_blocks_count = 0;
        unsigned long old_sb_flags;
        struct ext4_mount_options old_opts;
+        int enable_quota = 0;
        ext4_group_t g;
        unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
        int err;
 #ifdef CONFIG_QUOTA
        int i;
 #endif
+        char *orig_data = kstrdup(data, GFP_KERNEL);
        lock_kernel();
@@ -3630,6 +3636,10 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
                }
                if (*flags & MS_RDONLY) {
+                        err = dquot_suspend(sb, -1);
+                        if (err < 0)
+                                goto restore_opts;
                        /*
                         * First of all, the unconditional stuff we have to do
                         * to disable replay of the journal when we next remount
@@ -3698,6 +3708,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
                                goto restore_opts;
                        if (!ext4_setup_super(sb, es, 0))
                                sb->s_flags &= ~MS_RDONLY;
+                        enable_quota = 1;
                }
        }
        ext4_setup_system_zone(sb);
@@ -3713,6 +3724,11 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 #endif
        unlock_super(sb);
        unlock_kernel();
+        if (enable_quota)
+                dquot_resume(sb, -1);
+        ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s", orig_data);
+        kfree(orig_data);
        return 0;
 restore_opts:
@@ -3734,6 +3750,7 @@ restore_opts:
 #endif
        unlock_super(sb);
        unlock_kernel();
+        kfree(orig_data);
        return err;
 }
@@ -3906,24 +3923,21 @@ static int ext4_write_info(struct super_block *sb, int type)
 */
 static int ext4_quota_on_mount(struct super_block *sb, int type)
 {
-        return vfs_quota_on_mount(sb, EXT4_SB(sb)->s_qf_names[type],
+        return dquot_quota_on_mount(sb, EXT4_SB(sb)->s_qf_names[type],
-                                  EXT4_SB(sb)->s_jquota_fmt, type);
+                                        EXT4_SB(sb)->s_jquota_fmt, type);
 }
 /*
 * Standard function to be called on quota_on
 */
 static int ext4_quota_on(struct super_block *sb, int type, int format_id,
-                         char *name, int remount)
+                         char *name)
 {
        int err;
        struct path path;
        if (!test_opt(sb, QUOTA))
                return -EINVAL;
-        /* When remounting, no checks are needed and in fact, name is NULL */
-        if (remount)
-                return vfs_quota_on(sb, type, format_id, name, remount);
        err = kern_path(name, LOOKUP_FOLLOW, &path);
        if (err)
@@ -3962,7 +3976,7 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
                }
        }
-        err = vfs_quota_on_path(sb, type, format_id, &path);
+        err = dquot_quota_on_path(sb, type, format_id, &path);
        path_put(&path);
        return err;
 }
@@ -4141,6 +4155,7 @@ static int __init init_ext4_fs(void)
 {
        int err;
+        ext4_check_flag_values();
        err = init_ext4_system_zone();
        if (err)
                return err;
diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c
index 00740cb32be3..ed9354aff279 100644
--- a/fs/ext4/symlink.c
+++ b/fs/ext4/symlink.c
@@ -34,6 +34,7 @@ const struct inode_operations ext4_symlink_inode_operations = {
        .readlink       = generic_readlink,
        .follow_link    = page_follow_link_light,
        .put_link       = page_put_link,
+        .setattr        = ext4_setattr,
 #ifdef CONFIG_EXT4_FS_XATTR
        .setxattr       = generic_setxattr,
        .getxattr       = generic_getxattr,
@@ -45,6 +46,7 @@ const struct inode_operations ext4_symlink_inode_operations = {
 const struct inode_operations ext4_fast_symlink_inode_operations = {
        .readlink       = generic_readlink,
        .follow_link    = ext4_follow_link,
+        .setattr        = ext4_setattr,
 #ifdef CONFIG_EXT4_FS_XATTR
        .setxattr       = generic_setxattr,
        .getxattr       = generic_getxattr,
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 2de0e9515089..04338009793a 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -228,9 +228,8 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
                atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
        if (ext4_xattr_check_block(bh)) {
 bad_block:
-                ext4_error(inode->i_sb,
+                EXT4_ERROR_INODE(inode, "bad block %llu",
-                           "inode %lu: bad block %llu", inode->i_ino,
+                                 EXT4_I(inode)->i_file_acl);
-                           EXT4_I(inode)->i_file_acl);
                error = -EIO;
                goto cleanup;
        }
@@ -372,9 +371,8 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
        ea_bdebug(bh, "b_count=%d, refcount=%d",
                atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
        if (ext4_xattr_check_block(bh)) {
-                ext4_error(inode->i_sb,
+                EXT4_ERROR_INODE(inode, "bad block %llu",
-                           "inode %lu: bad block %llu", inode->i_ino,
+                                 EXT4_I(inode)->i_file_acl);
-                           EXT4_I(inode)->i_file_acl);
                error = -EIO;
                goto cleanup;
        }
@@ -666,8 +664,8 @@ ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i,
                        atomic_read(&(bs->bh->b_count)),
                        le32_to_cpu(BHDR(bs->bh)->h_refcount));
                if (ext4_xattr_check_block(bs->bh)) {
-                        ext4_error(sb, "inode %lu: bad block %llu",
+                        EXT4_ERROR_INODE(inode, "bad block %llu",
-                                   inode->i_ino, EXT4_I(inode)->i_file_acl);
+                                         EXT4_I(inode)->i_file_acl);
                        error = -EIO;
                        goto cleanup;
                }
@@ -820,7 +818,7 @@ inserted:
                                                EXT4_I(inode)->i_block_group);
                        /* non-extent files can't have physical blocks past 2^32 */
-                        if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+                        if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
                                goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;
                        block = ext4_new_meta_blocks(handle, inode,
@@ -828,7 +826,7 @@ inserted:
                        if (error)
                                goto cleanup;
-                        if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+                        if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
                                BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS);
                        ea_idebug(inode, "creating block %d", block);
@@ -880,8 +878,8 @@ cleanup_dquot:
        goto cleanup;
 bad_block:
-        ext4_error(inode->i_sb, "inode %lu: bad block %llu",
+        EXT4_ERROR_INODE(inode, "bad block %llu",
-                   inode->i_ino, EXT4_I(inode)->i_file_acl);
+                         EXT4_I(inode)->i_file_acl);
        goto cleanup;
 #undef header
@@ -1194,8 +1192,8 @@ retry:
                if (!bh)
                        goto cleanup;
                if (ext4_xattr_check_block(bh)) {
-                        ext4_error(inode->i_sb, "inode %lu: bad block %llu",
+                        EXT4_ERROR_INODE(inode, "bad block %llu",
-                                   inode->i_ino, EXT4_I(inode)->i_file_acl);
+                                         EXT4_I(inode)->i_file_acl);
                        error = -EIO;
                        goto cleanup;
                }
@@ -1372,14 +1370,14 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode)
                goto cleanup;
        bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
        if (!bh) {
-                ext4_error(inode->i_sb, "inode %lu: block %llu read error",
+                EXT4_ERROR_INODE(inode, "block %llu read error",
-                           inode->i_ino, EXT4_I(inode)->i_file_acl);
+                                 EXT4_I(inode)->i_file_acl);
                goto cleanup;
        }
        if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) ||
            BHDR(bh)->h_blocks != cpu_to_le32(1)) {
-                ext4_error(inode->i_sb, "inode %lu: bad block %llu",
+                EXT4_ERROR_INODE(inode, "bad block %llu",
-                           inode->i_ino, EXT4_I(inode)->i_file_acl);
+                                 EXT4_I(inode)->i_file_acl);
                goto cleanup;
        }
        ext4_xattr_release_block(handle, inode, bh);
@@ -1504,9 +1502,8 @@ again:
                }
                bh = sb_bread(inode->i_sb, ce->e_block);
                if (!bh) {
-                        ext4_error(inode->i_sb,
+                        EXT4_ERROR_INODE(inode, "block %lu read error",
-                                "inode %lu: block %lu read error",
+                                         (unsigned long) ce->e_block);
-                                inode->i_ino, (unsigned long) ce->e_block);
                } else if (le32_to_cpu(BHDR(bh)->h_refcount) >=
                                EXT4_XATTR_REFCOUNT_MAX) {
                        ea_idebug(inode, "block %lu refcount %d>=%d",
diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index 113f0a1e565d..ae8200f84e39 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -242,9 +242,10 @@ int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus)
        while (*fclus < cluster) {
                /* prevent the infinite loop of cluster chain */
                if (*fclus > limit) {
-                        fat_fs_error(sb, "%s: detected the cluster chain loop"
+                        fat_fs_error_ratelimit(sb,
-                                     " (i_pos %lld)", __func__,
+                                        "%s: detected the cluster chain loop"
-                                     MSDOS_I(inode)->i_pos);
+                                        " (i_pos %lld)", __func__,
+                                        MSDOS_I(inode)->i_pos);
                        nr = -EIO;
                        goto out;
                }
@@ -253,9 +254,9 @@ int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus)
                if (nr < 0)
                        goto out;
                else if (nr == FAT_ENT_FREE) {
-                        fat_fs_error(sb, "%s: invalid cluster chain"
+                        fat_fs_error_ratelimit(sb, "%s: invalid cluster chain"
-                                     " (i_pos %lld)", __func__,
+                                               " (i_pos %lld)", __func__,
-                                     MSDOS_I(inode)->i_pos);
+                                               MSDOS_I(inode)->i_pos);
                        nr = -EIO;
                        goto out;
                } else if (nr == FAT_ENT_EOF) {
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 530b4ca01510..ee42b9e0b16a 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -19,6 +19,7 @@
 #include <linux/buffer_head.h>
 #include <linux/compat.h>
 #include <asm/uaccess.h>
+#include <linux/kernel.h>
 #include "fat.h"
 /*
@@ -140,28 +141,22 @@ static int uni16_to_x8(unsigned char *ascii, const wchar_t *uni, int len,
 {
        const wchar_t *ip;
        wchar_t ec;
-        unsigned char *op, nc;
+        unsigned char *op;
        int charlen;
-        int k;
        ip = uni;
        op = ascii;
        while (*ip && ((len - NLS_MAX_CHARSET_SIZE) > 0)) {
                ec = *ip++;
-                if ( (charlen = nls->uni2char(ec, op, NLS_MAX_CHARSET_SIZE)) > 0) {
+                if ((charlen = nls->uni2char(ec, op, NLS_MAX_CHARSET_SIZE)) > 0) {
                        op += charlen;
                        len -= charlen;
                } else {
                        if (uni_xlate == 1) {
-                                *op = ':';
+                                *op++ = ':';
-                                for (k = 4; k > 0; k--) {
+                                op = pack_hex_byte(op, ec >> 8);
-                                        nc = ec & 0xF;
+                                op = pack_hex_byte(op, ec);
-                                        op[k] = nc > 9  ? nc + ('a' - 10)
-                                                        : nc + '0';
-                                        ec >>= 4;
-                                }
-                                op += 5;
                                len -= 5;
                        } else {
                                *op++ = '?';
@@ -758,9 +753,10 @@ static int fat_ioctl_readdir(struct inode *inode, struct file *filp,
        return ret;
 }
-static int fat_dir_ioctl(struct inode *inode, struct file *filp,
+static long fat_dir_ioctl(struct file *filp, unsigned int cmd,
-                         unsigned int cmd, unsigned long arg)
+                          unsigned long arg)
 {
+        struct inode *inode = filp->f_path.dentry->d_inode;
        struct __fat_dirent __user *d1 = (struct __fat_dirent __user *)arg;
        int short_only, both;
@@ -774,7 +770,7 @@ static int fat_dir_ioctl(struct inode *inode, struct file *filp,
                both = 1;
                break;
        default:
-                return fat_generic_ioctl(inode, filp, cmd, arg);
+                return fat_generic_ioctl(filp, cmd, arg);
        }
        if (!access_ok(VERIFY_WRITE, d1, sizeof(struct __fat_dirent[2])))
@@ -814,7 +810,7 @@ static long fat_compat_dir_ioctl(struct file *filp, unsigned cmd,
                both = 1;
                break;
        default:
-                return -ENOIOCTLCMD;
+                return fat_generic_ioctl(filp, cmd, (unsigned long)arg);
        }
        if (!access_ok(VERIFY_WRITE, d1, sizeof(struct compat_dirent[2])))
@@ -836,7 +832,7 @@ const struct file_operations fat_dir_operations = {
        .llseek         = generic_file_llseek,
        .read           = generic_read_dir,
        .readdir        = fat_readdir,
-        .ioctl          = fat_dir_ioctl,
+        .unlocked_ioctl = fat_dir_ioctl,
 #ifdef CONFIG_COMPAT
        .compat_ioctl   = fat_compat_dir_ioctl,
 #endif
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index e6efdfa0f6db..27ac25725954 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -6,6 +6,7 @@
 #include <linux/nls.h>
 #include <linux/fs.h>
 #include <linux/mutex.h>
+#include <linux/ratelimit.h>
 #include <linux/msdos_fs.h>
 /*
@@ -82,6 +83,8 @@ struct msdos_sb_info {
        struct fatent_operations *fatent_ops;
        struct inode *fat_inode;
+        struct ratelimit_state ratelimit;
        spinlock_t inode_hash_lock;
        struct hlist_head inode_hashtable[FAT_HASH_SIZE];
 };
@@ -298,16 +301,16 @@ extern int fat_free_clusters(struct inode *inode, int cluster);
 extern int fat_count_free_clusters(struct super_block *sb);
 /* fat/file.c */
-extern int fat_generic_ioctl(struct inode *inode, struct file *filp,
+extern long fat_generic_ioctl(struct file *filp, unsigned int cmd,
-                             unsigned int cmd, unsigned long arg);
+                              unsigned long arg);
 extern const struct file_operations fat_file_operations;
 extern const struct inode_operations fat_file_inode_operations;
 extern int fat_setattr(struct dentry * dentry, struct iattr * attr);
-extern void fat_truncate(struct inode *inode);
+extern int fat_setsize(struct inode *inode, loff_t offset);
+extern void fat_truncate_blocks(struct inode *inode, loff_t offset);
 extern int fat_getattr(struct vfsmount *mnt, struct dentry *dentry,
                       struct kstat *stat);
-extern int fat_file_fsync(struct file *file, struct dentry *dentry,
+extern int fat_file_fsync(struct file *file, int datasync);
-                          int datasync);
 /* fat/inode.c */
 extern void fat_attach(struct inode *inode, loff_t i_pos);
@@ -322,8 +325,13 @@ extern int fat_fill_super(struct super_block *sb, void *data, int silent,
 extern int fat_flush_inodes(struct super_block *sb, struct inode *i1,
                            struct inode *i2);
 /* fat/misc.c */
-extern void fat_fs_error(struct super_block *s, const char *fmt, ...)
+extern void
-        __attribute__ ((format (printf, 2, 3))) __cold;
+__fat_fs_error(struct super_block *s, int report, const char *fmt, ...)
+        __attribute__ ((format (printf, 3, 4))) __cold;
+#define fat_fs_error(s, fmt, args...)           \
+        __fat_fs_error(s, 1, fmt , ## args)
+#define fat_fs_error_ratelimit(s, fmt, args...) \
+        __fat_fs_error(s, __ratelimit(&MSDOS_SB(s)->ratelimit), fmt , ## args)
 extern int fat_clusters_flush(struct super_block *sb);
 extern int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster);
 extern void fat_time_fat2unix(struct msdos_sb_info *sbi, struct timespec *ts,
diff --git a/fs/fat/file.c b/fs/fat/file.c
index e8c159de236b..990dfae022e5 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -8,6 +8,7 @@
 #include <linux/capability.h>
 #include <linux/module.h>
+#include <linux/compat.h>
 #include <linux/mount.h>
 #include <linux/time.h>
 #include <linux/buffer_head.h>
@@ -114,9 +115,9 @@ out:
        return err;
 }
-int fat_generic_ioctl(struct inode *inode, struct file *filp,
+long fat_generic_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
-                      unsigned int cmd, unsigned long arg)
 {
+        struct inode *inode = filp->f_path.dentry->d_inode;
        u32 __user *user_attr = (u32 __user *)arg;
        switch (cmd) {
@@ -129,6 +130,15 @@ int fat_generic_ioctl(struct inode *inode, struct file *filp,
        }
 }
+#ifdef CONFIG_COMPAT
+static long fat_generic_compat_ioctl(struct file *filp, unsigned int cmd,
+                                      unsigned long arg)
+{
+        return fat_generic_ioctl(filp, cmd, (unsigned long)compat_ptr(arg));
+}
+#endif
 static int fat_file_release(struct inode *inode, struct file *filp)
 {
        if ((filp->f_mode & FMODE_WRITE) &&
@@ -139,12 +149,12 @@ static int fat_file_release(struct inode *inode, struct file *filp)
        return 0;
 }
-int fat_file_fsync(struct file *filp, struct dentry *dentry, int datasync)
+int fat_file_fsync(struct file *filp, int datasync)
 {
-        struct inode *inode = dentry->d_inode;
+        struct inode *inode = filp->f_mapping->host;
        int res, err;
-        res = simple_fsync(filp, dentry, datasync);
+        res = generic_file_fsync(filp, datasync);
        err = sync_mapping_buffers(MSDOS_SB(inode->i_sb)->fat_inode->i_mapping);
        return res ? res : err;
@@ -159,7 +169,10 @@ const struct file_operations fat_file_operations = {
        .aio_write      = generic_file_aio_write,
        .mmap           = generic_file_mmap,
        .release        = fat_file_release,
-        .ioctl          = fat_generic_ioctl,
+        .unlocked_ioctl = fat_generic_ioctl,
+#ifdef CONFIG_COMPAT
+        .compat_ioctl   = fat_generic_compat_ioctl,
+#endif
        .fsync          = fat_file_fsync,
        .splice_read    = generic_file_splice_read,
 };
@@ -270,7 +283,7 @@ static int fat_free(struct inode *inode, int skip)
        return fat_free_clusters(inode, free_start);
 }
-void fat_truncate(struct inode *inode)
+void fat_truncate_blocks(struct inode *inode, loff_t offset)
 {
        struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
        const unsigned int cluster_size = sbi->cluster_size;
@@ -280,10 +293,10 @@ void fat_truncate(struct inode *inode)
         * This protects against truncating a file bigger than it was then
         * trying to write into the hole.
         */
-        if (MSDOS_I(inode)->mmu_private > inode->i_size)
+        if (MSDOS_I(inode)->mmu_private > offset)
-                MSDOS_I(inode)->mmu_private = inode->i_size;
+                MSDOS_I(inode)->mmu_private = offset;
-        nr_clusters = (inode->i_size + (cluster_size - 1)) >> sbi->cluster_bits;
+        nr_clusters = (offset + (cluster_size - 1)) >> sbi->cluster_bits;
        fat_free(inode, nr_clusters);
        fat_flush_inodes(inode->i_sb, inode, NULL);
@@ -351,6 +364,18 @@ static int fat_allow_set_time(struct msdos_sb_info *sbi, struct inode *inode)
        return 0;
 }
+int fat_setsize(struct inode *inode, loff_t offset)
+{
+        int error;
+        error = simple_setsize(inode, offset);
+        if (error)
+                return error;
+        fat_truncate_blocks(inode, offset);
+        return error;
+}
 #define TIMES_SET_FLAGS (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)
 /* valid file mode bits */
 #define FAT_VALID_MODE  (S_IFREG | S_IFDIR | S_IRWXUGO)
@@ -365,7 +390,8 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
        /*
         * Expand the file. Since inode_setattr() updates ->i_size
         * before calling the ->truncate(), but FAT needs to fill the
-         * hole before it.
+         * hole before it. XXX: this is no longer true with new truncate
+         * sequence.
         */
        if (attr->ia_valid & ATTR_SIZE) {
                if (attr->ia_size > inode->i_size) {
@@ -414,15 +440,20 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
                        attr->ia_valid &= ~ATTR_MODE;
        }
-        if (attr->ia_valid)
+        if (attr->ia_valid & ATTR_SIZE) {
-                error = inode_setattr(inode, attr);
+                error = fat_setsize(inode, attr->ia_size);
+                if (error)
+                        goto out;
+        }
+        generic_setattr(inode, attr);
+        mark_inode_dirty(inode);
 out:
        return error;
 }
 EXPORT_SYMBOL_GPL(fat_setattr);
 const struct inode_operations fat_file_inode_operations = {
-        .truncate       = fat_truncate,
        .setattr        = fat_setattr,
        .getattr        = fat_getattr,
 };
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 0ce143bd7d56..7bf45aee56d7 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -142,14 +142,29 @@ static int fat_readpages(struct file *file, struct address_space *mapping,
        return mpage_readpages(mapping, pages, nr_pages, fat_get_block);
 }
+static void fat_write_failed(struct address_space *mapping, loff_t to)
+{
+        struct inode *inode = mapping->host;
+        if (to > inode->i_size) {
+                truncate_pagecache(inode, to, inode->i_size);
+                fat_truncate_blocks(inode, inode->i_size);
+        }
+}
 static int fat_write_begin(struct file *file, struct address_space *mapping,
                        loff_t pos, unsigned len, unsigned flags,
                        struct page **pagep, void **fsdata)
 {
+        int err;
        *pagep = NULL;
-        return cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
+        err = cont_write_begin_newtrunc(file, mapping, pos, len, flags,
-                                fat_get_block,
+                                pagep, fsdata, fat_get_block,
                                &MSDOS_I(mapping->host)->mmu_private);
+        if (err < 0)
+                fat_write_failed(mapping, pos + len);
+        return err;
 }
 static int fat_write_end(struct file *file, struct address_space *mapping,
@@ -159,6 +174,8 @@ static int fat_write_end(struct file *file, struct address_space *mapping,
        struct inode *inode = mapping->host;
        int err;
        err = generic_write_end(file, mapping, pos, len, copied, pagep, fsdata);
+        if (err < len)
+                fat_write_failed(mapping, pos + len);
        if (!(err < 0) && !(MSDOS_I(inode)->i_attrs & ATTR_ARCH)) {
                inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
                MSDOS_I(inode)->i_attrs |= ATTR_ARCH;
@@ -172,7 +189,9 @@ static ssize_t fat_direct_IO(int rw, struct kiocb *iocb,
                             loff_t offset, unsigned long nr_segs)
 {
        struct file *file = iocb->ki_filp;
-        struct inode *inode = file->f_mapping->host;
+        struct address_space *mapping = file->f_mapping;
+        struct inode *inode = mapping->host;
+        ssize_t ret;
        if (rw == WRITE) {
                /*
@@ -193,8 +212,12 @@ static ssize_t fat_direct_IO(int rw, struct kiocb *iocb,
         * FAT need to use the DIO_LOCKING for avoiding the race
         * condition of fat_get_block() and ->truncate().
         */
-        return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
+        ret = blockdev_direct_IO_newtrunc(rw, iocb, inode, inode->i_sb->s_bdev,
-                                  offset, nr_segs, fat_get_block, NULL);
+                                iov, offset, nr_segs, fat_get_block, NULL);
+        if (ret < 0 && (rw & WRITE))
+                fat_write_failed(mapping, offset + iov_length(iov, nr_segs));
+        return ret;
 }
 static sector_t _fat_bmap(struct address_space *mapping, sector_t block)
@@ -429,7 +452,7 @@ static void fat_delete_inode(struct inode *inode)
 {
        truncate_inode_pages(&inode->i_data, 0);
        inode->i_size = 0;
-        fat_truncate(inode);
+        fat_truncate_blocks(inode, 0);
        clear_inode(inode);
 }
@@ -1250,6 +1273,8 @@ int fat_fill_super(struct super_block *sb, void *data, int silent,
        sb->s_op = &fat_sops;
        sb->s_export_op = &fat_export_ops;
        sbi->dir_ops = fs_dir_inode_ops;
+        ratelimit_state_init(&sbi->ratelimit, DEFAULT_RATELIMIT_INTERVAL,
+                             DEFAULT_RATELIMIT_BURST);
        error = parse_options(data, isvfat, silent, &debug, &sbi->options);
        if (error)
@@ -1497,10 +1522,8 @@ out_fail:
                iput(fat_inode);
        if (root_inode)
                iput(root_inode);
-        if (sbi->nls_io)
+        unload_nls(sbi->nls_io);
-                unload_nls(sbi->nls_io);
+        unload_nls(sbi->nls_disk);
-        if (sbi->nls_disk)
-                unload_nls(sbi->nls_disk);
        if (sbi->options.iocharset != fat_default_iocharset)
                kfree(sbi->options.iocharset);
        sb->s_fs_info = NULL;
diff --git a/fs/fat/misc.c b/fs/fat/misc.c
index d3da05f26465..1fa23f6ffba5 100644
--- a/fs/fat/misc.c
+++ b/fs/fat/misc.c
@@ -20,27 +20,29 @@
 * In case the file system is remounted read-only, it can be made writable
 * again by remounting it.
 */
-void fat_fs_error(struct super_block *s, const char *fmt, ...)
+void __fat_fs_error(struct super_block *s, int report, const char *fmt, ...)
 {
        struct fat_mount_options *opts = &MSDOS_SB(s)->options;
        va_list args;
-        printk(KERN_ERR "FAT: Filesystem error (dev %s)\n", s->s_id);
+        if (report) {
+                printk(KERN_ERR "FAT: Filesystem error (dev %s)\n", s->s_id);
-        printk(KERN_ERR "    ");
+                printk(KERN_ERR "    ");
-        va_start(args, fmt);
+                va_start(args, fmt);
-        vprintk(fmt, args);
+                vprintk(fmt, args);
-        va_end(args);
+                va_end(args);
-        printk("\n");
+                printk("\n");
+        }
        if (opts->errors == FAT_ERRORS_PANIC)
-                panic("    FAT fs panic from previous error\n");
+                panic("FAT: fs panic from previous error\n");
        else if (opts->errors == FAT_ERRORS_RO && !(s->s_flags & MS_RDONLY)) {
                s->s_flags |= MS_RDONLY;
-                printk(KERN_ERR "    File system has been set read-only\n");
+                printk(KERN_ERR "FAT: Filesystem has been set read-only\n");
        }
 }
-EXPORT_SYMBOL_GPL(fat_fs_error);
+EXPORT_SYMBOL_GPL(__fat_fs_error);
 /* Flushes the number of free clusters on FAT32 */
 /* XXX: Need to write one per FSINFO block.  Currently only writes 1 */
diff --git a/fs/file_table.c b/fs/file_table.c
index 32d12b78bac8..5c7d10ead4ad 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -194,14 +194,6 @@ struct file *alloc_file(struct path *path, fmode_t mode,
 }
 EXPORT_SYMBOL(alloc_file);
-void fput(struct file *file)
-{
-        if (atomic_long_dec_and_test(&file->f_count))
-                __fput(file);
-}
-EXPORT_SYMBOL(fput);
 /**
 * drop_file_write_access - give up ability to write to a file
 * @file: the file to which we will stop writing
@@ -227,10 +219,9 @@ void drop_file_write_access(struct file *file)
 }
 EXPORT_SYMBOL_GPL(drop_file_write_access);
-/* __fput is called from task context when aio completion releases the last
+/* the real guts of fput() - releasing the last reference to file
- * last use of a struct file *.  Do not use otherwise.
 */
-void __fput(struct file *file)
+static void __fput(struct file *file)
 {
        struct dentry *dentry = file->f_path.dentry;
        struct vfsmount *mnt = file->f_path.mnt;
@@ -268,6 +259,14 @@ void __fput(struct file *file)
        mntput(mnt);
 }
+void fput(struct file *file)
+{
+        if (atomic_long_dec_and_test(&file->f_count))
+                __fput(file);
+}
+EXPORT_SYMBOL(fput);
 struct file *fget(unsigned int fd)
 {
        struct file *file;
diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c
index aee049cb9f84..0ec7bb2c95c6 100644
--- a/fs/freevxfs/vxfs_lookup.c
+++ b/fs/freevxfs/vxfs_lookup.c
@@ -57,6 +57,8 @@ const struct inode_operations vxfs_dir_inode_ops = {
 };
 const struct file_operations vxfs_dir_operations = {
+        .llseek =               generic_file_llseek,
+        .read =                 generic_read_dir,
        .readdir =              vxfs_readdir,
 };
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 408a7877b79d..1d1088f48bc2 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -398,11 +398,11 @@ static void inode_wait_for_writeback(struct inode *inode)
        wait_queue_head_t *wqh;
        wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
-        do {
+         while (inode->i_state & I_SYNC) {
                spin_unlock(&inode_lock);
                __wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE);
                spin_lock(&inode_lock);
-        } while (inode->i_state & I_SYNC);
+        }
 }
 /*
diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c
index 1e1f286dd70e..4a8eb31c5338 100644
--- a/fs/fscache/object-list.c
+++ b/fs/fscache/object-list.c
@@ -103,7 +103,7 @@ static struct fscache_object *fscache_objlist_lookup(loff_t *_pos)
        /* banners (can't represent line 0 by pos 0 as that would involve
         * returning a NULL pointer) */
        if (pos == 0)
-                return (struct fscache_object *) ++(*_pos);
+                return (struct fscache_object *)(long)++(*_pos);
        if (pos < 3)
                return (struct fscache_object *)pos;
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index eb7e9423691f..9424796d6634 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -16,8 +16,12 @@
 #include <linux/pagemap.h>
 #include <linux/file.h>
 #include <linux/slab.h>
+#include <linux/pipe_fs_i.h>
+#include <linux/swap.h>
+#include <linux/splice.h>
 MODULE_ALIAS_MISCDEV(FUSE_MINOR);
+MODULE_ALIAS("devname:fuse");
 static struct kmem_cache *fuse_req_cachep;
@@ -498,6 +502,9 @@ struct fuse_copy_state {
        int write;
        struct fuse_req *req;
        const struct iovec *iov;
+        struct pipe_buffer *pipebufs;
+        struct pipe_buffer *currbuf;
+        struct pipe_inode_info *pipe;
        unsigned long nr_segs;
        unsigned long seglen;
        unsigned long addr;
@@ -505,16 +512,16 @@ struct fuse_copy_state {
        void *mapaddr;
        void *buf;
        unsigned len;
+        unsigned move_pages:1;
 };
 static void fuse_copy_init(struct fuse_copy_state *cs, struct fuse_conn *fc,
-                           int write, struct fuse_req *req,
+                           int write,
                           const struct iovec *iov, unsigned long nr_segs)
 {
        memset(cs, 0, sizeof(*cs));
        cs->fc = fc;
        cs->write = write;
-        cs->req = req;
        cs->iov = iov;
        cs->nr_segs = nr_segs;
 }
@@ -522,7 +529,18 @@ static void fuse_copy_init(struct fuse_copy_state *cs, struct fuse_conn *fc,
 /* Unmap and put previous page of userspace buffer */
 static void fuse_copy_finish(struct fuse_copy_state *cs)
 {
-        if (cs->mapaddr) {
+        if (cs->currbuf) {
+                struct pipe_buffer *buf = cs->currbuf;
+                if (!cs->write) {
+                        buf->ops->unmap(cs->pipe, buf, cs->mapaddr);
+                } else {
+                        kunmap_atomic(cs->mapaddr, KM_USER0);
+                        buf->len = PAGE_SIZE - cs->len;
+                }
+                cs->currbuf = NULL;
+                cs->mapaddr = NULL;
+        } else if (cs->mapaddr) {
                kunmap_atomic(cs->mapaddr, KM_USER0);
                if (cs->write) {
                        flush_dcache_page(cs->pg);
@@ -544,26 +562,61 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
        unlock_request(cs->fc, cs->req);
        fuse_copy_finish(cs);
-        if (!cs->seglen) {
+        if (cs->pipebufs) {
-                BUG_ON(!cs->nr_segs);
+                struct pipe_buffer *buf = cs->pipebufs;
-                cs->seglen = cs->iov[0].iov_len;
-                cs->addr = (unsigned long) cs->iov[0].iov_base;
+                if (!cs->write) {
-                cs->iov++;
+                        err = buf->ops->confirm(cs->pipe, buf);
-                cs->nr_segs--;
+                        if (err)
+                                return err;
+                        BUG_ON(!cs->nr_segs);
+                        cs->currbuf = buf;
+                        cs->mapaddr = buf->ops->map(cs->pipe, buf, 1);
+                        cs->len = buf->len;
+                        cs->buf = cs->mapaddr + buf->offset;
+                        cs->pipebufs++;
+                        cs->nr_segs--;
+                } else {
+                        struct page *page;
+                        if (cs->nr_segs == cs->pipe->buffers)
+                                return -EIO;
+                        page = alloc_page(GFP_HIGHUSER);
+                        if (!page)
+                                return -ENOMEM;
+                        buf->page = page;
+                        buf->offset = 0;
+                        buf->len = 0;
+                        cs->currbuf = buf;
+                        cs->mapaddr = kmap_atomic(page, KM_USER0);
+                        cs->buf = cs->mapaddr;
+                        cs->len = PAGE_SIZE;
+                        cs->pipebufs++;
+                        cs->nr_segs++;
+                }
+        } else {
+                if (!cs->seglen) {
+                        BUG_ON(!cs->nr_segs);
+                        cs->seglen = cs->iov[0].iov_len;
+                        cs->addr = (unsigned long) cs->iov[0].iov_base;
+                        cs->iov++;
+                        cs->nr_segs--;
+                }
+                err = get_user_pages_fast(cs->addr, 1, cs->write, &cs->pg);
+                if (err < 0)
+                        return err;
+                BUG_ON(err != 1);
+                offset = cs->addr % PAGE_SIZE;
+                cs->mapaddr = kmap_atomic(cs->pg, KM_USER0);
+                cs->buf = cs->mapaddr + offset;
+                cs->len = min(PAGE_SIZE - offset, cs->seglen);
+                cs->seglen -= cs->len;
+                cs->addr += cs->len;
        }
-        down_read(&current->mm->mmap_sem);
-        err = get_user_pages(current, current->mm, cs->addr, 1, cs->write, 0,
-                             &cs->pg, NULL);
-        up_read(&current->mm->mmap_sem);
-        if (err < 0)
-                return err;
-        BUG_ON(err != 1);
-        offset = cs->addr % PAGE_SIZE;
-        cs->mapaddr = kmap_atomic(cs->pg, KM_USER0);
-        cs->buf = cs->mapaddr + offset;
-        cs->len = min(PAGE_SIZE - offset, cs->seglen);
-        cs->seglen -= cs->len;
-        cs->addr += cs->len;
        return lock_request(cs->fc, cs->req);
 }
@@ -585,23 +638,178 @@ static int fuse_copy_do(struct fuse_copy_state *cs, void **val, unsigned *size)
        return ncpy;
 }
+static int fuse_check_page(struct page *page)
+{
+        if (page_mapcount(page) ||
+            page->mapping != NULL ||
+            page_count(page) != 1 ||
+            (page->flags & PAGE_FLAGS_CHECK_AT_PREP &
+             ~(1 << PG_locked |
+               1 << PG_referenced |
+               1 << PG_uptodate |
+               1 << PG_lru |
+               1 << PG_active |
+               1 << PG_reclaim))) {
+                printk(KERN_WARNING "fuse: trying to steal weird page\n");
+                printk(KERN_WARNING "  page=%p index=%li flags=%08lx, count=%i, mapcount=%i, mapping=%p\n", page, page->index, page->flags, page_count(page), page_mapcount(page), page->mapping);
+                return 1;
+        }
+        return 0;
+}
+static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
+{
+        int err;
+        struct page *oldpage = *pagep;
+        struct page *newpage;
+        struct pipe_buffer *buf = cs->pipebufs;
+        struct address_space *mapping;
+        pgoff_t index;
+        unlock_request(cs->fc, cs->req);
+        fuse_copy_finish(cs);
+        err = buf->ops->confirm(cs->pipe, buf);
+        if (err)
+                return err;
+        BUG_ON(!cs->nr_segs);
+        cs->currbuf = buf;
+        cs->len = buf->len;
+        cs->pipebufs++;
+        cs->nr_segs--;
+        if (cs->len != PAGE_SIZE)
+                goto out_fallback;
+        if (buf->ops->steal(cs->pipe, buf) != 0)
+                goto out_fallback;
+        newpage = buf->page;
+        if (WARN_ON(!PageUptodate(newpage)))
+                return -EIO;
+        ClearPageMappedToDisk(newpage);
+        if (fuse_check_page(newpage) != 0)
+                goto out_fallback_unlock;
+        mapping = oldpage->mapping;
+        index = oldpage->index;
+        /*
+         * This is a new and locked page, it shouldn't be mapped or
+         * have any special flags on it
+         */
+        if (WARN_ON(page_mapped(oldpage)))
+                goto out_fallback_unlock;
+        if (WARN_ON(page_has_private(oldpage)))
+                goto out_fallback_unlock;
+        if (WARN_ON(PageDirty(oldpage) || PageWriteback(oldpage)))
+                goto out_fallback_unlock;
+        if (WARN_ON(PageMlocked(oldpage)))
+                goto out_fallback_unlock;
+        remove_from_page_cache(oldpage);
+        page_cache_release(oldpage);
+        err = add_to_page_cache_locked(newpage, mapping, index, GFP_KERNEL);
+        if (err) {
+                printk(KERN_WARNING "fuse_try_move_page: failed to add page");
+                goto out_fallback_unlock;
+        }
+        page_cache_get(newpage);
+        if (!(buf->flags & PIPE_BUF_FLAG_LRU))
+                lru_cache_add_file(newpage);
+        err = 0;
+        spin_lock(&cs->fc->lock);
+        if (cs->req->aborted)
+                err = -ENOENT;
+        else
+                *pagep = newpage;
+        spin_unlock(&cs->fc->lock);
+        if (err) {
+                unlock_page(newpage);
+                page_cache_release(newpage);
+                return err;
+        }
+        unlock_page(oldpage);
+        page_cache_release(oldpage);
+        cs->len = 0;
+        return 0;
+out_fallback_unlock:
+        unlock_page(newpage);
+out_fallback:
+        cs->mapaddr = buf->ops->map(cs->pipe, buf, 1);
+        cs->buf = cs->mapaddr + buf->offset;
+        err = lock_request(cs->fc, cs->req);
+        if (err)
+                return err;
+        return 1;
+}
+static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page,
+                         unsigned offset, unsigned count)
+{
+        struct pipe_buffer *buf;
+        if (cs->nr_segs == cs->pipe->buffers)
+                return -EIO;
+        unlock_request(cs->fc, cs->req);
+        fuse_copy_finish(cs);
+        buf = cs->pipebufs;
+        page_cache_get(page);
+        buf->page = page;
+        buf->offset = offset;
+        buf->len = count;
+        cs->pipebufs++;
+        cs->nr_segs++;
+        cs->len = 0;
+        return 0;
+}
 /*
 * Copy a page in the request to/from the userspace buffer.  Must be
 * done atomically
 */
-static int fuse_copy_page(struct fuse_copy_state *cs, struct page *page,
+static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep,
                          unsigned offset, unsigned count, int zeroing)
 {
+        int err;
+        struct page *page = *pagep;
        if (page && zeroing && count < PAGE_SIZE) {
                void *mapaddr = kmap_atomic(page, KM_USER1);
                memset(mapaddr, 0, PAGE_SIZE);
                kunmap_atomic(mapaddr, KM_USER1);
        }
        while (count) {
-                if (!cs->len) {
+                if (cs->write && cs->pipebufs && page) {
-                        int err = fuse_copy_fill(cs);
+                        return fuse_ref_page(cs, page, offset, count);
-                        if (err)
+                } else if (!cs->len) {
-                                return err;
+                        if (cs->move_pages && page &&
+                            offset == 0 && count == PAGE_SIZE) {
+                                err = fuse_try_move_page(cs, pagep);
+                                if (err <= 0)
+                                        return err;
+                        } else {
+                                err = fuse_copy_fill(cs);
+                                if (err)
+                                        return err;
+                        }
                }
                if (page) {
                        void *mapaddr = kmap_atomic(page, KM_USER1);
@@ -626,8 +834,10 @@ static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes,
        unsigned count = min(nbytes, (unsigned) PAGE_SIZE - offset);
        for (i = 0; i < req->num_pages && (nbytes || zeroing); i++) {
-                struct page *page = req->pages[i];
+                int err;
-                int err = fuse_copy_page(cs, page, offset, count, zeroing);
+                err = fuse_copy_page(cs, &req->pages[i], offset, count,
+                                     zeroing);
                if (err)
                        return err;
@@ -704,11 +914,10 @@ __acquires(&fc->lock)
 *
 * Called with fc->lock held, releases it
 */
-static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_req *req,
+static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_copy_state *cs,
-                               const struct iovec *iov, unsigned long nr_segs)
+                               size_t nbytes, struct fuse_req *req)
 __releases(&fc->lock)
 {
-        struct fuse_copy_state cs;
        struct fuse_in_header ih;
        struct fuse_interrupt_in arg;
        unsigned reqsize = sizeof(ih) + sizeof(arg);
@@ -724,14 +933,13 @@ __releases(&fc->lock)
        arg.unique = req->in.h.unique;
        spin_unlock(&fc->lock);
-        if (iov_length(iov, nr_segs) < reqsize)
+        if (nbytes < reqsize)
                return -EINVAL;
-        fuse_copy_init(&cs, fc, 1, NULL, iov, nr_segs);
+        err = fuse_copy_one(cs, &ih, sizeof(ih));
-        err = fuse_copy_one(&cs, &ih, sizeof(ih));
        if (!err)
-                err = fuse_copy_one(&cs, &arg, sizeof(arg));
+                err = fuse_copy_one(cs, &arg, sizeof(arg));
-        fuse_copy_finish(&cs);
+        fuse_copy_finish(cs);
        return err ? err : reqsize;
 }
@@ -745,18 +953,13 @@ __releases(&fc->lock)
 * request_end().  Otherwise add it to the processing list, and set
 * the 'sent' flag.
 */
-static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
+static ssize_t fuse_dev_do_read(struct fuse_conn *fc, struct file *file,
-                              unsigned long nr_segs, loff_t pos)
+                                struct fuse_copy_state *cs, size_t nbytes)
 {
        int err;
        struct fuse_req *req;
        struct fuse_in *in;
-        struct fuse_copy_state cs;
        unsigned reqsize;
-        struct file *file = iocb->ki_filp;
-        struct fuse_conn *fc = fuse_get_conn(file);
-        if (!fc)
-                return -EPERM;
 restart:
        spin_lock(&fc->lock);
@@ -776,7 +979,7 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
        if (!list_empty(&fc->interrupts)) {
                req = list_entry(fc->interrupts.next, struct fuse_req,
                                 intr_entry);
-                return fuse_read_interrupt(fc, req, iov, nr_segs);
+                return fuse_read_interrupt(fc, cs, nbytes, req);
        }
        req = list_entry(fc->pending.next, struct fuse_req, list);
@@ -786,7 +989,7 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
        in = &req->in;
        reqsize = in->h.len;
        /* If request is too large, reply with an error and restart the read */
-        if (iov_length(iov, nr_segs) < reqsize) {
+        if (nbytes < reqsize) {
                req->out.h.error = -EIO;
                /* SETXATTR is special, since it may contain too large data */
                if (in->h.opcode == FUSE_SETXATTR)
@@ -795,12 +998,12 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
                goto restart;
        }
        spin_unlock(&fc->lock);
-        fuse_copy_init(&cs, fc, 1, req, iov, nr_segs);
+        cs->req = req;
-        err = fuse_copy_one(&cs, &in->h, sizeof(in->h));
+        err = fuse_copy_one(cs, &in->h, sizeof(in->h));
        if (!err)
-                err = fuse_copy_args(&cs, in->numargs, in->argpages,
+                err = fuse_copy_args(cs, in->numargs, in->argpages,
                                     (struct fuse_arg *) in->args, 0);
-        fuse_copy_finish(&cs);
+        fuse_copy_finish(cs);
        spin_lock(&fc->lock);
        req->locked = 0;
        if (req->aborted) {
@@ -828,6 +1031,110 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
        return err;
 }
+static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
+                              unsigned long nr_segs, loff_t pos)
+{
+        struct fuse_copy_state cs;
+        struct file *file = iocb->ki_filp;
+        struct fuse_conn *fc = fuse_get_conn(file);
+        if (!fc)
+                return -EPERM;
+        fuse_copy_init(&cs, fc, 1, iov, nr_segs);
+        return fuse_dev_do_read(fc, file, &cs, iov_length(iov, nr_segs));
+}
+static int fuse_dev_pipe_buf_steal(struct pipe_inode_info *pipe,
+                                   struct pipe_buffer *buf)
+{
+        return 1;
+}
+static const struct pipe_buf_operations fuse_dev_pipe_buf_ops = {
+        .can_merge = 0,
+        .map = generic_pipe_buf_map,
+        .unmap = generic_pipe_buf_unmap,
+        .confirm = generic_pipe_buf_confirm,
+        .release = generic_pipe_buf_release,
+        .steal = fuse_dev_pipe_buf_steal,
+        .get = generic_pipe_buf_get,
+};
+static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos,
+                                    struct pipe_inode_info *pipe,
+                                    size_t len, unsigned int flags)
+{
+        int ret;
+        int page_nr = 0;
+        int do_wakeup = 0;
+        struct pipe_buffer *bufs;
+        struct fuse_copy_state cs;
+        struct fuse_conn *fc = fuse_get_conn(in);
+        if (!fc)
+                return -EPERM;
+        bufs = kmalloc(pipe->buffers * sizeof (struct pipe_buffer), GFP_KERNEL);
+        if (!bufs)
+                return -ENOMEM;
+        fuse_copy_init(&cs, fc, 1, NULL, 0);
+        cs.pipebufs = bufs;
+        cs.pipe = pipe;
+        ret = fuse_dev_do_read(fc, in, &cs, len);
+        if (ret < 0)
+                goto out;
+        ret = 0;
+        pipe_lock(pipe);
+        if (!pipe->readers) {
+                send_sig(SIGPIPE, current, 0);
+                if (!ret)
+                        ret = -EPIPE;
+                goto out_unlock;
+        }
+        if (pipe->nrbufs + cs.nr_segs > pipe->buffers) {
+                ret = -EIO;
+                goto out_unlock;
+        }
+        while (page_nr < cs.nr_segs) {
+                int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
+                struct pipe_buffer *buf = pipe->bufs + newbuf;
+                buf->page = bufs[page_nr].page;
+                buf->offset = bufs[page_nr].offset;
+                buf->len = bufs[page_nr].len;
+                buf->ops = &fuse_dev_pipe_buf_ops;
+                pipe->nrbufs++;
+                page_nr++;
+                ret += buf->len;
+                if (pipe->inode)
+                        do_wakeup = 1;
+        }
+out_unlock:
+        pipe_unlock(pipe);
+        if (do_wakeup) {
+                smp_mb();
+                if (waitqueue_active(&pipe->wait))
+                        wake_up_interruptible(&pipe->wait);
+                kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
+        }
+out:
+        for (; page_nr < cs.nr_segs; page_nr++)
+                page_cache_release(bufs[page_nr].page);
+        kfree(bufs);
+        return ret;
+}
 static int fuse_notify_poll(struct fuse_conn *fc, unsigned int size,
                            struct fuse_copy_state *cs)
 {
@@ -987,23 +1294,17 @@ static int copy_out_args(struct fuse_copy_state *cs, struct fuse_out *out,
 * it from the list and copy the rest of the buffer to the request.
 * The request is finished by calling request_end()
 */
-static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
+static ssize_t fuse_dev_do_write(struct fuse_conn *fc,
-                               unsigned long nr_segs, loff_t pos)
+                                 struct fuse_copy_state *cs, size_t nbytes)
 {
        int err;
-        size_t nbytes = iov_length(iov, nr_segs);
        struct fuse_req *req;
        struct fuse_out_header oh;
-        struct fuse_copy_state cs;
-        struct fuse_conn *fc = fuse_get_conn(iocb->ki_filp);
-        if (!fc)
-                return -EPERM;
-        fuse_copy_init(&cs, fc, 0, NULL, iov, nr_segs);
        if (nbytes < sizeof(struct fuse_out_header))
                return -EINVAL;
-        err = fuse_copy_one(&cs, &oh, sizeof(oh));
+        err = fuse_copy_one(cs, &oh, sizeof(oh));
        if (err)
                goto err_finish;
@@ -1016,7 +1317,7 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
         * and error contains notification code.
         */
        if (!oh.unique) {
-                err = fuse_notify(fc, oh.error, nbytes - sizeof(oh), &cs);
+                err = fuse_notify(fc, oh.error, nbytes - sizeof(oh), cs);
                return err ? err : nbytes;
        }
@@ -1035,7 +1336,7 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
        if (req->aborted) {
                spin_unlock(&fc->lock);
-                fuse_copy_finish(&cs);
+                fuse_copy_finish(cs);
                spin_lock(&fc->lock);
                request_end(fc, req);
                return -ENOENT;
@@ -1052,7 +1353,7 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
                        queue_interrupt(fc, req);
                spin_unlock(&fc->lock);
-                fuse_copy_finish(&cs);
+                fuse_copy_finish(cs);
                return nbytes;
        }
@@ -1060,11 +1361,13 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
        list_move(&req->list, &fc->io);
        req->out.h = oh;
        req->locked = 1;
-        cs.req = req;
+        cs->req = req;
+        if (!req->out.page_replace)
+                cs->move_pages = 0;
        spin_unlock(&fc->lock);
-        err = copy_out_args(&cs, &req->out, nbytes);
+        err = copy_out_args(cs, &req->out, nbytes);
-        fuse_copy_finish(&cs);
+        fuse_copy_finish(cs);
        spin_lock(&fc->lock);
        req->locked = 0;
@@ -1080,10 +1383,101 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
 err_unlock:
        spin_unlock(&fc->lock);
 err_finish:
-        fuse_copy_finish(&cs);
+        fuse_copy_finish(cs);
        return err;
 }
+static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
+                              unsigned long nr_segs, loff_t pos)
+{
+        struct fuse_copy_state cs;
+        struct fuse_conn *fc = fuse_get_conn(iocb->ki_filp);
+        if (!fc)
+                return -EPERM;
+        fuse_copy_init(&cs, fc, 0, iov, nr_segs);
+        return fuse_dev_do_write(fc, &cs, iov_length(iov, nr_segs));
+}
+static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
+                                     struct file *out, loff_t *ppos,
+                                     size_t len, unsigned int flags)
+{
+        unsigned nbuf;
+        unsigned idx;
+        struct pipe_buffer *bufs;
+        struct fuse_copy_state cs;
+        struct fuse_conn *fc;
+        size_t rem;
+        ssize_t ret;
+        fc = fuse_get_conn(out);
+        if (!fc)
+                return -EPERM;
+        bufs = kmalloc(pipe->buffers * sizeof (struct pipe_buffer), GFP_KERNEL);
+        if (!bufs)
+                return -ENOMEM;
+        pipe_lock(pipe);
+        nbuf = 0;
+        rem = 0;
+        for (idx = 0; idx < pipe->nrbufs && rem < len; idx++)
+                rem += pipe->bufs[(pipe->curbuf + idx) & (pipe->buffers - 1)].len;
+        ret = -EINVAL;
+        if (rem < len) {
+                pipe_unlock(pipe);
+                goto out;
+        }
+        rem = len;
+        while (rem) {
+                struct pipe_buffer *ibuf;
+                struct pipe_buffer *obuf;
+                BUG_ON(nbuf >= pipe->buffers);
+                BUG_ON(!pipe->nrbufs);
+                ibuf = &pipe->bufs[pipe->curbuf];
+                obuf = &bufs[nbuf];
+                if (rem >= ibuf->len) {
+                        *obuf = *ibuf;
+                        ibuf->ops = NULL;
+                        pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
+                        pipe->nrbufs--;
+                } else {
+                        ibuf->ops->get(pipe, ibuf);
+                        *obuf = *ibuf;
+                        obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
+                        obuf->len = rem;
+                        ibuf->offset += obuf->len;
+                        ibuf->len -= obuf->len;
+                }
+                nbuf++;
+                rem -= obuf->len;
+        }
+        pipe_unlock(pipe);
+        fuse_copy_init(&cs, fc, 0, NULL, nbuf);
+        cs.pipebufs = bufs;
+        cs.pipe = pipe;
+        if (flags & SPLICE_F_MOVE)
+                cs.move_pages = 1;
+        ret = fuse_dev_do_write(fc, &cs, len);
+        for (idx = 0; idx < nbuf; idx++) {
+                struct pipe_buffer *buf = &bufs[idx];
+                buf->ops->release(pipe, buf);
+        }
+out:
+        kfree(bufs);
+        return ret;
+}
 static unsigned fuse_dev_poll(struct file *file, poll_table *wait)
 {
        unsigned mask = POLLOUT | POLLWRNORM;
@@ -1225,8 +1619,10 @@ const struct file_operations fuse_dev_operations = {
        .llseek         = no_llseek,
        .read           = do_sync_read,
        .aio_read       = fuse_dev_read,
+        .splice_read    = fuse_dev_splice_read,
        .write          = do_sync_write,
        .aio_write      = fuse_dev_write,
+        .splice_write   = fuse_dev_splice_write,
        .poll           = fuse_dev_poll,
        .release        = fuse_dev_release,
        .fasync         = fuse_dev_fasync,
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 4787ae6c5c1c..3cdc5f78a406 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1156,10 +1156,9 @@ static int fuse_dir_release(struct inode *inode, struct file *file)
        return 0;
 }
-static int fuse_dir_fsync(struct file *file, struct dentry *de, int datasync)
+static int fuse_dir_fsync(struct file *file, int datasync)
 {
-        /* nfsd can call this with no file */
+        return fuse_fsync_common(file, datasync, 1);
-        return file ? fuse_fsync_common(file, de, datasync, 1) : 0;
 }
 static bool update_mtime(unsigned ivalid)
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index a9f5e137f1d3..ada0adeb3bb5 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -351,10 +351,9 @@ static void fuse_sync_writes(struct inode *inode)
        fuse_release_nowrite(inode);
 }
-int fuse_fsync_common(struct file *file, struct dentry *de, int datasync,
+int fuse_fsync_common(struct file *file, int datasync, int isdir)
-                      int isdir)
 {
-        struct inode *inode = de->d_inode;
+        struct inode *inode = file->f_mapping->host;
        struct fuse_conn *fc = get_fuse_conn(inode);
        struct fuse_file *ff = file->private_data;
        struct fuse_req *req;
@@ -403,9 +402,9 @@ int fuse_fsync_common(struct file *file, struct dentry *de, int datasync,
        return err;
 }
-static int fuse_fsync(struct file *file, struct dentry *de, int datasync)
+static int fuse_fsync(struct file *file, int datasync)
 {
-        return fuse_fsync_common(file, de, datasync, 0);
+        return fuse_fsync_common(file, datasync, 0);
 }
 void fuse_read_fill(struct fuse_req *req, struct file *file, loff_t pos,
@@ -517,17 +516,26 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req)
        int i;
        size_t count = req->misc.read.in.size;
        size_t num_read = req->out.args[0].size;
-        struct inode *inode = req->pages[0]->mapping->host;
+        struct address_space *mapping = NULL;
-        /*
+        for (i = 0; mapping == NULL && i < req->num_pages; i++)
-         * Short read means EOF.  If file size is larger, truncate it
+                mapping = req->pages[i]->mapping;
-         */
-        if (!req->out.h.error && num_read < count) {
-                loff_t pos = page_offset(req->pages[0]) + num_read;
-                fuse_read_update_size(inode, pos, req->misc.read.attr_ver);
-        }
-        fuse_invalidate_attr(inode); /* atime changed */
+        if (mapping) {
+                struct inode *inode = mapping->host;
+                /*
+                 * Short read means EOF. If file size is larger, truncate it
+                 */
+                if (!req->out.h.error && num_read < count) {
+                        loff_t pos;
+                        pos = page_offset(req->pages[0]) + num_read;
+                        fuse_read_update_size(inode, pos,
+                                              req->misc.read.attr_ver);
+                }
+                fuse_invalidate_attr(inode); /* atime changed */
+        }
        for (i = 0; i < req->num_pages; i++) {
                struct page *page = req->pages[i];
@@ -536,6 +544,7 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req)
                else
                        SetPageError(page);
                unlock_page(page);
+                page_cache_release(page);
        }
        if (req->ff)
                fuse_file_put(req->ff);
@@ -550,6 +559,7 @@ static void fuse_send_readpages(struct fuse_req *req, struct file *file)
        req->out.argpages = 1;
        req->out.page_zeroing = 1;
+        req->out.page_replace = 1;
        fuse_read_fill(req, file, pos, count, FUSE_READ);
        req->misc.read.attr_ver = fuse_get_attr_version(fc);
        if (fc->async_read) {
@@ -589,6 +599,7 @@ static int fuse_readpages_fill(void *_data, struct page *page)
                        return PTR_ERR(req);
                }
        }
+        page_cache_get(page);
        req->pages[req->num_pages] = page;
        req->num_pages++;
        return 0;
@@ -994,10 +1005,7 @@ static int fuse_get_user_pages(struct fuse_req *req, const char __user *buf,
        nbytes = min_t(size_t, nbytes, FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT);
        npages = (nbytes + offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
        npages = clamp(npages, 1, FUSE_MAX_PAGES_PER_REQ);
-        down_read(&current->mm->mmap_sem);
+        npages = get_user_pages_fast(user_addr, npages, !write, req->pages);
-        npages = get_user_pages(current, current->mm, user_addr, npages, !write,
-                                0, req->pages, NULL);
-        up_read(&current->mm->mmap_sem);
        if (npages < 0)
                return npages;
@@ -1580,9 +1588,9 @@ static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov,
        while (iov_iter_count(&ii)) {
                struct page *page = pages[page_idx++];
                size_t todo = min_t(size_t, PAGE_SIZE, iov_iter_count(&ii));
-                void *kaddr, *map;
+                void *kaddr;
-                kaddr = map = kmap(page);
+                kaddr = kmap(page);
                while (todo) {
                        char __user *uaddr = ii.iov->iov_base + ii.iov_offset;
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 01cc462ff45d..8f309f04064e 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -177,6 +177,9 @@ struct fuse_out {
        /** Zero partially or not copied pages */
        unsigned page_zeroing:1;
+        /** Pages may be replaced with new ones */
+        unsigned page_replace:1;
        /** Number or arguments */
        unsigned numargs;
@@ -568,8 +571,7 @@ void fuse_release_common(struct file *file, int opcode);
 /**
 * Send FSYNC or FSYNCDIR request
 */
-int fuse_fsync_common(struct file *file, struct dentry *de, int datasync,
+int fuse_fsync_common(struct file *file, int datasync, int isdir);
-                      int isdir);
 /**
 * Notify poll wakeup
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 9fb76b0a0485..48171f4c943d 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -236,10 +236,14 @@ static int gfs2_xattr_system_get(struct dentry *dentry, const char *name,
                                 void *buffer, size_t size, int xtype)
 {
        struct inode *inode = dentry->d_inode;
+        struct gfs2_sbd *sdp = GFS2_SB(inode);
        struct posix_acl *acl;
        int type;
        int error;
+        if (!sdp->sd_args.ar_posix_acl)
+                return -EOPNOTSUPP;
        type = gfs2_acl_type(name);
        if (type < 0)
                return type;
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index a739a0a48067..9f8b52500d63 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -700,8 +700,14 @@ out:
                return 0;
        page_cache_release(page);
+        /*
+         * XXX(hch): the call below should probably be replaced with
+         * a call to the gfs2-specific truncate blocks helper to actually
+         * release disk blocks..
+         */
        if (pos + len > ip->i_inode.i_size)
-                vmtruncate(&ip->i_inode, ip->i_inode.i_size);
+                simple_setsize(&ip->i_inode, ip->i_inode.i_size);
 out_endtrans:
        gfs2_trans_end(sdp);
 out_trans_fail:
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index e6dd2aec6f82..ed9a94f0ef15 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -218,6 +218,11 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
        if (error)
                goto out_drop_write;
+        error = -EACCES;
+        if (!is_owner_or_cap(inode))
+                goto out;
+        error = 0;
        flags = ip->i_diskflags;
        new_flags = (flags & ~mask) | (reqflags & mask);
        if ((new_flags ^ flags) == 0)
@@ -275,8 +280,10 @@ static int gfs2_set_flags(struct file *filp, u32 __user *ptr)
 {
        struct inode *inode = filp->f_path.dentry->d_inode;
        u32 fsflags, gfsflags;
        if (get_user(fsflags, ptr))
                return -EFAULT;
        gfsflags = fsflags_cvt(fsflags_to_gfs2, fsflags);
        if (!S_ISDIR(inode->i_mode)) {
                if (gfsflags & GFS2_DIF_INHERIT_JDATA)
@@ -547,9 +554,9 @@ static int gfs2_close(struct inode *inode, struct file *file)
 * Returns: errno
 */
-static int gfs2_fsync(struct file *file, struct dentry *dentry, int datasync)
+static int gfs2_fsync(struct file *file, int datasync)
 {
-        struct inode *inode = dentry->d_inode;
+        struct inode *inode = file->f_mapping->host;
        int sync_state = inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC);
        int ret = 0;
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 51d8061fa07a..b5612cbb62a5 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -242,34 +242,38 @@ fail:
 }
 /**
- * gfs2_unlinked_inode_lookup - Lookup an unlinked inode for reclamation
+ * gfs2_process_unlinked_inode - Lookup an unlinked inode for reclamation
+ *                               and try to reclaim it by doing iput.
+ *
+ * This function assumes no rgrp locks are currently held.
+ *
 * @sb: The super block
 * no_addr: The inode number
- * @@inode: A pointer to the inode found, if any
 *
- * Returns: 0 and *inode if no errors occurred.  If an error occurs,
- *          the resulting *inode may or may not be NULL.
 */
-int gfs2_unlinked_inode_lookup(struct super_block *sb, u64 no_addr,
+void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr)
-                               struct inode **inode)
 {
        struct gfs2_sbd *sdp;
        struct gfs2_inode *ip;
        struct gfs2_glock *io_gl;
        int error;
        struct gfs2_holder gh;
+        struct inode *inode;
-        *inode = gfs2_iget_skip(sb, no_addr);
+        inode = gfs2_iget_skip(sb, no_addr);
-        if (!(*inode))
+        if (!inode)
-                return -ENOBUFS;
+                return;
-        if (!((*inode)->i_state & I_NEW))
+        /* If it's not a new inode, someone's using it, so leave it alone. */
-                return -ENOBUFS;
+        if (!(inode->i_state & I_NEW)) {
+                iput(inode);
+                return;
+        }
-        ip = GFS2_I(*inode);
+        ip = GFS2_I(inode);
-        sdp = GFS2_SB(*inode);
+        sdp = GFS2_SB(inode);
        ip->i_no_formal_ino = -1;
        error = gfs2_glock_get(sdp, no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl);
@@ -284,15 +288,13 @@ int gfs2_unlinked_inode_lookup(struct super_block *sb, u64 no_addr,
        set_bit(GIF_INVALID, &ip->i_flags);
        error = gfs2_glock_nq_init(io_gl, LM_ST_SHARED, LM_FLAG_TRY | GL_EXACT,
                                   &ip->i_iopen_gh);
-        if (unlikely(error)) {
+        if (unlikely(error))
-                if (error == GLR_TRYFAILED)
-                        error = 0;
                goto fail_iopen;
-        }
        ip->i_iopen_gh.gh_gl->gl_object = ip;
        gfs2_glock_put(io_gl);
-        (*inode)->i_mode = DT2IF(DT_UNKNOWN);
+        inode->i_mode = DT2IF(DT_UNKNOWN);
        /*
         * We must read the inode in order to work out its type in
@@ -303,16 +305,17 @@ int gfs2_unlinked_inode_lookup(struct super_block *sb, u64 no_addr,
         */
        error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, LM_FLAG_TRY,
                                   &gh);
-        if (unlikely(error)) {
+        if (unlikely(error))
-                if (error == GLR_TRYFAILED)
-                        error = 0;
                goto fail_glock;
-        }
        /* Inode is now uptodate */
        gfs2_glock_dq_uninit(&gh);
-        gfs2_set_iop(*inode);
+        gfs2_set_iop(inode);
+        /* The iput will cause it to be deleted. */
+        iput(inode);
+        return;
-        return 0;
 fail_glock:
        gfs2_glock_dq(&ip->i_iopen_gh);
 fail_iopen:
@@ -321,7 +324,8 @@ fail_put:
        ip->i_gl->gl_object = NULL;
        gfs2_glock_put(ip->i_gl);
 fail:
-        return error;
+        iget_failed(inode);
+        return;
 }
 static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf)
diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h
index e161461d4c57..300ada3f21de 100644
--- a/fs/gfs2/inode.h
+++ b/fs/gfs2/inode.h
@@ -84,8 +84,7 @@ static inline void gfs2_inum_out(const struct gfs2_inode *ip,
 extern void gfs2_set_iop(struct inode *inode);
 extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, 
                                       u64 no_addr, u64 no_formal_ino);
-extern int gfs2_unlinked_inode_lookup(struct super_block *sb, u64 no_addr,
+extern void gfs2_process_unlinked_inode(struct super_block *sb, u64 no_addr);
-                                      struct inode **inode);
 extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr);
 extern int gfs2_inode_refresh(struct gfs2_inode *ip);
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index b593f0e28f25..6a857e24f947 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -696,7 +696,7 @@ static void gfs2_ordered_wait(struct gfs2_sbd *sdp)
 *
 */
-void __gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
+void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
 {
        struct gfs2_ail *ai;
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
index eb570b4ad443..0d007f920234 100644
--- a/fs/gfs2/log.h
+++ b/fs/gfs2/log.h
@@ -47,28 +47,21 @@ static inline void gfs2_log_pointers_init(struct gfs2_sbd *sdp,
        sdp->sd_log_head = sdp->sd_log_tail = value;
 }
-unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct,
+extern unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct,
                            unsigned int ssize);
-int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks);
+extern int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks);
-void gfs2_log_incr_head(struct gfs2_sbd *sdp);
+extern void gfs2_log_incr_head(struct gfs2_sbd *sdp);
-struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp);
+extern struct buffer_head *gfs2_log_get_buf(struct gfs2_sbd *sdp);
-struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp,
+extern struct buffer_head *gfs2_log_fake_buf(struct gfs2_sbd *sdp,
                                      struct buffer_head *real);
-void __gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl);
+extern void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl);
+extern void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans);
+extern void gfs2_remove_from_ail(struct gfs2_bufdata *bd);
-static inline void gfs2_log_flush(struct gfs2_sbd *sbd, struct gfs2_glock *gl)
+extern void gfs2_log_shutdown(struct gfs2_sbd *sdp);
-{
+extern void gfs2_meta_syncfs(struct gfs2_sbd *sdp);
-        if (!gl || test_bit(GLF_LFLUSH, &gl->gl_flags))
+extern int gfs2_logd(void *data);
-                __gfs2_log_flush(sbd, gl);
-}
-void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans);
-void gfs2_remove_from_ail(struct gfs2_bufdata *bd);
-void gfs2_log_shutdown(struct gfs2_sbd *sdp);
-void gfs2_meta_syncfs(struct gfs2_sbd *sdp);
-int gfs2_logd(void *data);
 #endif /* __LOG_DOT_H__ */
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 4e64352d49de..98cdd05f3316 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -1071,6 +1071,9 @@ int gfs2_permission(struct inode *inode, int mask)
        return error;
 }
+/*
+ * XXX: should be changed to have proper ordering by opencoding simple_setsize
+ */
 static int setattr_size(struct inode *inode, struct iattr *attr)
 {
        struct gfs2_inode *ip = GFS2_I(inode);
@@ -1081,7 +1084,7 @@ static int setattr_size(struct inode *inode, struct iattr *attr)
                error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks);
                if (error)
                        return error;
-                error = vmtruncate(inode, attr->ia_size);
+                error = simple_setsize(inode, attr->ia_size);
                gfs2_trans_end(sdp);
                if (error) 
                        return error;
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 117fa4171f62..171a744f8e45 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -1192,7 +1192,6 @@ int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, unsigned int line)
 {
        struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
        struct gfs2_alloc *al = ip->i_alloc;
-        struct inode *inode;
        int error = 0;
        u64 last_unlinked = NO_BLOCK, unlinked;
@@ -1210,22 +1209,27 @@ try_again:
        if (error)
                return error;
+        /* Find an rgrp suitable for allocation.  If it encounters any unlinked
+           dinodes along the way, error will equal -EAGAIN and unlinked will
+           contains it block address. We then need to look up that inode and
+           try to free it, and try the allocation again. */
        error = get_local_rgrp(ip, &unlinked, &last_unlinked);
        if (error) {
                if (ip != GFS2_I(sdp->sd_rindex))
                        gfs2_glock_dq_uninit(&al->al_ri_gh);
                if (error != -EAGAIN)
                        return error;
-                error = gfs2_unlinked_inode_lookup(ip->i_inode.i_sb,
-                                                   unlinked, &inode);
+                gfs2_process_unlinked_inode(ip->i_inode.i_sb, unlinked);
-                if (inode)
+                /* regardless of whether or not gfs2_process_unlinked_inode
-                        iput(inode);
+                   was successful, we don't want to repeat it again. */
+                last_unlinked = unlinked;
                gfs2_log_flush(sdp, NULL);
-                if (error == GLR_TRYFAILED)
+                error = 0;
-                        error = 0;
                goto try_again;
        }
+        /* no error, so we have the rgrp set in the inode's allocation. */
        al->al_file = file;
        al->al_line = line;
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index 5f4023678251..764fd1bdca88 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -494,7 +494,7 @@ const struct inode_operations hfsplus_dir_inode_operations = {
 const struct file_operations hfsplus_dir_operations = {
        .read           = generic_read_dir,
        .readdir        = hfsplus_readdir,
-        .ioctl          = hfsplus_ioctl,
+        .unlocked_ioctl = hfsplus_ioctl,
        .llseek         = generic_file_llseek,
        .release        = hfsplus_dir_release,
 };
diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h
index 5c10d803d9df..6505c30ad965 100644
--- a/fs/hfsplus/hfsplus_fs.h
+++ b/fs/hfsplus/hfsplus_fs.h
@@ -337,8 +337,7 @@ struct inode *hfsplus_new_inode(struct super_block *, int);
 void hfsplus_delete_inode(struct inode *);
 /* ioctl.c */
-int hfsplus_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
+long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
-                  unsigned long arg);
 int hfsplus_setxattr(struct dentry *dentry, const char *name,
                     const void *value, size_t size, int flags);
 ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name,
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 1bcf597c0562..9bbb82924a22 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -285,7 +285,7 @@ static const struct file_operations hfsplus_file_operations = {
        .fsync          = file_fsync,
        .open           = hfsplus_file_open,
        .release        = hfsplus_file_release,
-        .ioctl          = hfsplus_ioctl,
+        .unlocked_ioctl = hfsplus_ioctl,
 };
 struct inode *hfsplus_new_inode(struct super_block *sb, int mode)
diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c
index f457d2ca51ab..ac405f099026 100644
--- a/fs/hfsplus/ioctl.c
+++ b/fs/hfsplus/ioctl.c
@@ -17,14 +17,16 @@
 #include <linux/mount.h>
 #include <linux/sched.h>
 #include <linux/xattr.h>
+#include <linux/smp_lock.h>
 #include <asm/uaccess.h>
 #include "hfsplus_fs.h"
-int hfsplus_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
+long hfsplus_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
-                  unsigned long arg)
 {
+        struct inode *inode = filp->f_path.dentry->d_inode;
        unsigned int flags;
+        lock_kernel();
        switch (cmd) {
        case HFSPLUS_IOC_EXT2_GETFLAGS:
                flags = 0;
@@ -38,8 +40,10 @@ int hfsplus_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
        case HFSPLUS_IOC_EXT2_SETFLAGS: {
                int err = 0;
                err = mnt_want_write(filp->f_path.mnt);
-                if (err)
+                if (err) {
+                        unlock_kernel();
                        return err;
+                }
                if (!is_owner_or_cap(inode)) {
                        err = -EACCES;
@@ -85,9 +89,11 @@ int hfsplus_ioctl(struct inode *inode, struct file *filp, unsigned int cmd,
                mark_inode_dirty(inode);
 setflags_out:
                mnt_drop_write(filp->f_path.mnt);
+                unlock_kernel();
                return err;
        }
        default:
+                unlock_kernel();
                return -ENOTTY;
        }
 }
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 3a029d8f4cf1..87ac1891a185 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -411,9 +411,9 @@ int hostfs_file_open(struct inode *ino, struct file *file)
        return 0;
 }
-int hostfs_fsync(struct file *file, struct dentry *dentry, int datasync)
+int hostfs_fsync(struct file *file, int datasync)
 {
-        return fsync_file(HOSTFS_I(dentry->d_inode)->fd, datasync);
+        return fsync_file(HOSTFS_I(file->f_mapping->host)->fd, datasync);
 }
 static const struct file_operations hostfs_file_fops = {
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index 3efabff00367..a9ae9bfa752f 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -19,9 +19,9 @@ static int hpfs_file_release(struct inode *inode, struct file *file)
        return 0;
 }
-int hpfs_file_fsync(struct file *file, struct dentry *dentry, int datasync)
+int hpfs_file_fsync(struct file *file, int datasync)
 {
-        /*return file_fsync(file, dentry);*/
+        /*return file_fsync(file, datasync);*/
        return 0; /* Don't fsync :-) */
 }
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index 97bf738cd5d6..75f9d4324851 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -268,7 +268,7 @@ void hpfs_set_ea(struct inode *, struct fnode *, const char *,
 /* file.c */
-int hpfs_file_fsync(struct file *, struct dentry *, int);
+int hpfs_file_fsync(struct file *, int);
 extern const struct file_operations hpfs_file_ops;
 extern const struct inode_operations hpfs_file_iops;
 extern const struct address_space_operations hpfs_aops;
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index 2e4dfa8593da..826c3f9d29ac 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -587,7 +587,7 @@ static int hppfs_readdir(struct file *file, void *ent, filldir_t filldir)
        return err;
 }
-static int hppfs_fsync(struct file *file, struct dentry *dentry, int datasync)
+static int hppfs_fsync(struct file *file, int datasync)
 {
        return 0;
 }
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index a0bbd3d1b41a..a4e9a7ec3691 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -688,7 +688,7 @@ static void init_once(void *foo)
 const struct file_operations hugetlbfs_file_operations = {
        .read                   = hugetlbfs_read,
        .mmap                   = hugetlbfs_file_mmap,
-        .fsync                  = simple_sync_file,
+        .fsync                  = noop_fsync,
        .get_unmapped_area      = hugetlb_get_unmapped_area,
 };
diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c
index b9ab69b3a482..e0aca9a0ac68 100644
--- a/fs/isofs/dir.c
+++ b/fs/isofs/dir.c
@@ -272,6 +272,7 @@ static int isofs_readdir(struct file *filp,
 const struct file_operations isofs_dir_operations =
 {
+        .llseek = generic_file_llseek,
        .read = generic_read_dir,
        .readdir = isofs_readdir,
 };
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index bfc70f57900f..e214d68620ac 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -1311,7 +1311,6 @@ int jbd2_journal_stop(handle_t *handle)
        if (handle->h_sync)
                transaction->t_synchronous_commit = 1;
        current->journal_info = NULL;
-        spin_lock(&journal->j_state_lock);
        spin_lock(&transaction->t_handle_lock);
        transaction->t_outstanding_credits -= handle->h_buffer_credits;
        transaction->t_updates--;
@@ -1340,8 +1339,7 @@ int jbd2_journal_stop(handle_t *handle)
                jbd_debug(2, "transaction too old, requesting commit for "
                                        "handle %p\n", handle);
                /* This is non-blocking */
-                __jbd2_log_start_commit(journal, transaction->t_tid);
+                jbd2_log_start_commit(journal, transaction->t_tid);
-                spin_unlock(&journal->j_state_lock);
                /*
                 * Special case: JBD2_SYNC synchronous updates require us
@@ -1351,7 +1349,6 @@ int jbd2_journal_stop(handle_t *handle)
                        err = jbd2_log_wait_commit(journal, tid);
        } else {
                spin_unlock(&transaction->t_handle_lock);
-                spin_unlock(&journal->j_state_lock);
        }
        lock_map_release(&handle->h_lockdep_map);
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index e7291c161a19..813497024437 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -26,9 +26,9 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
                        struct page **pagep, void **fsdata);
 static int jffs2_readpage (struct file *filp, struct page *pg);
-int jffs2_fsync(struct file *filp, struct dentry *dentry, int datasync)
+int jffs2_fsync(struct file *filp, int datasync)
 {
-        struct inode *inode = dentry->d_inode;
+        struct inode *inode = filp->f_mapping->host;
        struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
        /* Trigger GC to flush any pending writes for this inode */
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 86e0821fc989..8bc2c80ab159 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -169,13 +169,13 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr)
        mutex_unlock(&f->sem);
        jffs2_complete_reservation(c);
-        /* We have to do the vmtruncate() without f->sem held, since
+        /* We have to do the simple_setsize() without f->sem held, since
           some pages may be locked and waiting for it in readpage().
           We are protected from a simultaneous write() extending i_size
           back past iattr->ia_size, because do_truncate() holds the
           generic inode semaphore. */
        if (ivalid & ATTR_SIZE && inode->i_size > iattr->ia_size) {
-                vmtruncate(inode, iattr->ia_size);      
+                simple_setsize(inode, iattr->ia_size);
                inode->i_blocks = (inode->i_size + 511) >> 9;
        }       
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index 035a767f958b..4791aacf3084 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -158,7 +158,7 @@ extern const struct inode_operations jffs2_dir_inode_operations;
 extern const struct file_operations jffs2_file_operations;
 extern const struct inode_operations jffs2_file_inode_operations;
 extern const struct address_space_operations jffs2_file_address_operations;
-int jffs2_fsync(struct file *, struct dentry *, int);
+int jffs2_fsync(struct file *, int);
 int jffs2_do_readpage_unlock (struct inode *inode, struct page *pg);
 /* ioctl.c */
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index 85d9ec659225..127263cc8657 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -27,9 +27,9 @@
 #include "jfs_acl.h"
 #include "jfs_debug.h"
-int jfs_fsync(struct file *file, struct dentry *dentry, int datasync)
+int jfs_fsync(struct file *file, int datasync)
 {
-        struct inode *inode = dentry->d_inode;
+        struct inode *inode = file->f_mapping->host;
        int rc = 0;
        if (!(inode->i_state & I_DIRTY) ||
diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h
index 9e6bda30a6e8..11042b1f44b5 100644
--- a/fs/jfs/jfs_inode.h
+++ b/fs/jfs/jfs_inode.h
@@ -21,7 +21,7 @@
 struct fid;
 extern struct inode *ialloc(struct inode *, umode_t);
-extern int jfs_fsync(struct file *, struct dentry *, int);
+extern int jfs_fsync(struct file *, int);
 extern long jfs_ioctl(struct file *, unsigned int, unsigned long);
 extern long jfs_compat_ioctl(struct file *, unsigned int, unsigned long);
 extern struct inode *jfs_iget(struct super_block *, unsigned long);
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index b66832ac33ac..b38f96bef829 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -179,6 +179,8 @@ static void jfs_put_super(struct super_block *sb)
        jfs_info("In jfs_put_super");
+        dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
        lock_kernel();
        rc = jfs_umount(sb);
@@ -396,10 +398,20 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data)
                JFS_SBI(sb)->flag = flag;
                ret = jfs_mount_rw(sb, 1);
+                /* mark the fs r/w for quota activity */
+                sb->s_flags &= ~MS_RDONLY;
                unlock_kernel();
+                dquot_resume(sb, -1);
                return ret;
        }
        if ((!(sb->s_flags & MS_RDONLY)) && (*flags & MS_RDONLY)) {
+                rc = dquot_suspend(sb, -1);
+                if (rc < 0) {
+                        unlock_kernel();
+                        return rc;
+                }
                rc = jfs_umount_rw(sb);
                JFS_SBI(sb)->flag = flag;
                unlock_kernel();
@@ -469,6 +481,10 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
         */
        sb->s_op = &jfs_super_operations;
        sb->s_export_op = &jfs_export_operations;
+#ifdef CONFIG_QUOTA
+        sb->dq_op = &dquot_operations;
+        sb->s_qcop = &dquot_quotactl_ops;
+#endif
        /*
         * Initialize direct-mapping inode/address-space
diff --git a/fs/libfs.c b/fs/libfs.c
index 232bea425b09..09e1016eb774 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -8,6 +8,7 @@
 #include <linux/slab.h>
 #include <linux/mount.h>
 #include <linux/vfs.h>
+#include <linux/quotaops.h>
 #include <linux/mutex.h>
 #include <linux/exportfs.h>
 #include <linux/writeback.h>
@@ -58,11 +59,6 @@ struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, struct na
        return NULL;
 }
-int simple_sync_file(struct file * file, struct dentry *dentry, int datasync)
-{
-        return 0;
-}
- 
 int dcache_dir_open(struct inode *inode, struct file *file)
 {
        static struct qstr cursor_name = {.len = 1, .name = "."};
@@ -190,7 +186,7 @@ const struct file_operations simple_dir_operations = {
        .llseek         = dcache_dir_lseek,
        .read           = generic_read_dir,
        .readdir        = dcache_readdir,
-        .fsync          = simple_sync_file,
+        .fsync          = noop_fsync,
 };
 const struct inode_operations simple_dir_inode_operations = {
@@ -330,6 +326,81 @@ int simple_rename(struct inode *old_dir, struct dentry *old_dentry,
        return 0;
 }
+/**
+ * simple_setsize - handle core mm and vfs requirements for file size change
+ * @inode: inode
+ * @newsize: new file size
+ *
+ * Returns 0 on success, -error on failure.
+ *
+ * simple_setsize must be called with the inode's i_mutex held.
+ *
+ * simple_setsize will check that the requested new size is OK (see
+ * inode_newsize_ok), and then will perform the necessary i_size update
+ * and pagecache truncation (if necessary). It will typically be called
+ * from the filesystem's setattr function when ATTR_SIZE is passed in.
+ *
+ * The inode itself must have correct permissions and attributes to allow
+ * i_size to be changed, this function then just checks that the new size
+ * requested is valid.
+ *
+ * In the case of simple in-memory filesystems with inodes stored solely
+ * in the inode cache, and file data in the pagecache, nothing more needs
+ * to be done to satisfy a truncate request. Filesystems with on-disk
+ * blocks for example will need to free them in the case of truncate, in
+ * that case it may be easier not to use simple_setsize (but each of its
+ * components will likely be required at some point to update pagecache
+ * and inode etc).
+ */
+int simple_setsize(struct inode *inode, loff_t newsize)
+{
+        loff_t oldsize;
+        int error;
+        error = inode_newsize_ok(inode, newsize);
+        if (error)
+                return error;
+        oldsize = inode->i_size;
+        i_size_write(inode, newsize);
+        truncate_pagecache(inode, oldsize, newsize);
+        return error;
+}
+EXPORT_SYMBOL(simple_setsize);
+/**
+ * simple_setattr - setattr for simple in-memory filesystem
+ * @dentry: dentry
+ * @iattr: iattr structure
+ *
+ * Returns 0 on success, -error on failure.
+ *
+ * simple_setattr implements setattr for an in-memory filesystem which
+ * does not store its own file data or metadata (eg. uses the page cache
+ * and inode cache as its data store).
+ */
+int simple_setattr(struct dentry *dentry, struct iattr *iattr)
+{
+        struct inode *inode = dentry->d_inode;
+        int error;
+        error = inode_change_ok(inode, iattr);
+        if (error)
+                return error;
+        if (iattr->ia_valid & ATTR_SIZE) {
+                error = simple_setsize(inode, iattr->ia_size);
+                if (error)
+                        return error;
+        }
+        generic_setattr(inode, iattr);
+        return error;
+}
+EXPORT_SYMBOL(simple_setattr);
 int simple_readpage(struct file *file, struct page *page)
 {
        clear_highpage(page);
@@ -851,13 +922,22 @@ struct dentry *generic_fh_to_parent(struct super_block *sb, struct fid *fid,
 }
 EXPORT_SYMBOL_GPL(generic_fh_to_parent);
-int simple_fsync(struct file *file, struct dentry *dentry, int datasync)
+/**
+ * generic_file_fsync - generic fsync implementation for simple filesystems
+ * @file:       file to synchronize
+ * @datasync:   only synchronize essential metadata if true
+ *
+ * This is a generic implementation of the fsync method for simple
+ * filesystems which track all non-inode metadata in the buffers list
+ * hanging off the address_space structure.
+ */
+int generic_file_fsync(struct file *file, int datasync)
 {
        struct writeback_control wbc = {
                .sync_mode = WB_SYNC_ALL,
                .nr_to_write = 0, /* metadata-only; caller takes care of data */
        };
-        struct inode *inode = dentry->d_inode;
+        struct inode *inode = file->f_mapping->host;
        int err;
        int ret;
@@ -872,7 +952,15 @@ int simple_fsync(struct file *file, struct dentry *dentry, int datasync)
                ret = err;
        return ret;
 }
-EXPORT_SYMBOL(simple_fsync);
+EXPORT_SYMBOL(generic_file_fsync);
+/*
+ * No-op implementation of ->fsync for in-memory filesystems.
+ */
+int noop_fsync(struct file *file, int datasync)
+{
+        return 0;
+}
 EXPORT_SYMBOL(dcache_dir_close);
 EXPORT_SYMBOL(dcache_dir_lseek);
@@ -895,7 +983,7 @@ EXPORT_SYMBOL(simple_release_fs);
 EXPORT_SYMBOL(simple_rename);
 EXPORT_SYMBOL(simple_rmdir);
 EXPORT_SYMBOL(simple_statfs);
-EXPORT_SYMBOL(simple_sync_file);
+EXPORT_SYMBOL(noop_fsync);
 EXPORT_SYMBOL(simple_unlink);
 EXPORT_SYMBOL(simple_read_from_buffer);
 EXPORT_SYMBOL(simple_write_to_buffer);
diff --git a/fs/logfs/file.c b/fs/logfs/file.c
index 0de524071870..abe1cafbd4c2 100644
--- a/fs/logfs/file.c
+++ b/fs/logfs/file.c
@@ -219,9 +219,9 @@ int logfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
        }
 }
-int logfs_fsync(struct file *file, struct dentry *dentry, int datasync)
+int logfs_fsync(struct file *file, int datasync)
 {
-        struct super_block *sb = dentry->d_inode->i_sb;
+        struct super_block *sb = file->f_mapping->host->i_sb;
        logfs_write_anchor(sb);
        return 0;
diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h
index 1a9db84f8d8f..c838c4d72111 100644
--- a/fs/logfs/logfs.h
+++ b/fs/logfs/logfs.h
@@ -506,7 +506,7 @@ extern const struct address_space_operations logfs_reg_aops;
 int logfs_readpage(struct file *file, struct page *page);
 int logfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
                unsigned long arg);
-int logfs_fsync(struct file *file, struct dentry *dentry, int datasync);
+int logfs_fsync(struct file *file, int datasync);
 /* gc.c */
 u32 get_best_cand(struct super_block *sb, struct candidate_list *list, u32 *ec);
diff --git a/fs/minix/dir.c b/fs/minix/dir.c
index 6198731d7fcd..91969589131c 100644
--- a/fs/minix/dir.c
+++ b/fs/minix/dir.c
@@ -22,7 +22,7 @@ const struct file_operations minix_dir_operations = {
        .llseek         = generic_file_llseek,
        .read           = generic_read_dir,
        .readdir        = minix_readdir,
-        .fsync          = simple_fsync,
+        .fsync          = generic_file_fsync,
 };
 static inline void dir_put_page(struct page *page)
@@ -72,11 +72,8 @@ static struct page * dir_get_page(struct inode *dir, unsigned long n)
 {
        struct address_space *mapping = dir->i_mapping;
        struct page *page = read_mapping_page(mapping, n, NULL);
-        if (!IS_ERR(page)) {
+        if (!IS_ERR(page))
                kmap(page);
-                if (!PageUptodate(page))
-                        goto fail;
-        }
        return page;
 fail:
diff --git a/fs/minix/file.c b/fs/minix/file.c
index 3eec3e607a87..d5320ff23faf 100644
--- a/fs/minix/file.c
+++ b/fs/minix/file.c
@@ -19,7 +19,7 @@ const struct file_operations minix_file_operations = {
        .write          = do_sync_write,
        .aio_write      = generic_file_aio_write,
        .mmap           = generic_file_mmap,
-        .fsync          = simple_fsync,
+        .fsync          = generic_file_fsync,
        .splice_read    = generic_file_splice_read,
 };
diff --git a/fs/minix/itree_v2.c b/fs/minix/itree_v2.c
index f23010969369..13487ad16894 100644
--- a/fs/minix/itree_v2.c
+++ b/fs/minix/itree_v2.c
@@ -20,6 +20,9 @@ static inline block_t *i_data(struct inode *inode)
        return (block_t *)minix_i(inode)->u.i2_data;
 }
+#define DIRCOUNT 7
+#define INDIRCOUNT(sb) (1 << ((sb)->s_blocksize_bits - 2))
 static int block_to_path(struct inode * inode, long block, int offsets[DEPTH])
 {
        int n = 0;
@@ -34,21 +37,21 @@ static int block_to_path(struct inode * inode, long block, int offsets[DEPTH])
                        printk("MINIX-fs: block_to_path: "
                               "block %ld too big on dev %s\n",
                                block, bdevname(sb->s_bdev, b));
-        } else if (block < 7) {
+        } else if (block < DIRCOUNT) {
                offsets[n++] = block;
-        } else if ((block -= 7) < 256) {
+        } else if ((block -= DIRCOUNT) < INDIRCOUNT(sb)) {
-                offsets[n++] = 7;
+                offsets[n++] = DIRCOUNT;
                offsets[n++] = block;
-        } else if ((block -= 256) < 256*256) {
+        } else if ((block -= INDIRCOUNT(sb)) < INDIRCOUNT(sb) * INDIRCOUNT(sb)) {
-                offsets[n++] = 8;
+                offsets[n++] = DIRCOUNT + 1;
-                offsets[n++] = block>>8;
+                offsets[n++] = block / INDIRCOUNT(sb);
-                offsets[n++] = block & 255;
+                offsets[n++] = block % INDIRCOUNT(sb);
        } else {
-                block -= 256*256;
+                block -= INDIRCOUNT(sb) * INDIRCOUNT(sb);
-                offsets[n++] = 9;
+                offsets[n++] = DIRCOUNT + 2;
-                offsets[n++] = block>>16;
+                offsets[n++] = (block / INDIRCOUNT(sb)) / INDIRCOUNT(sb);
-                offsets[n++] = (block>>8) & 255;
+                offsets[n++] = (block / INDIRCOUNT(sb)) % INDIRCOUNT(sb);
-                offsets[n++] = block & 255;
+                offsets[n++] = block % INDIRCOUNT(sb);
        }
        return n;
 }
diff --git a/fs/namei.c b/fs/namei.c
index 48e1f60520ea..868d0cb9d473 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1621,6 +1621,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
        case LAST_DOTDOT:
                follow_dotdot(nd);
                dir = nd->path.dentry;
+        case LAST_DOT:
                if (nd->path.mnt->mnt_sb->s_type->fs_flags & FS_REVAL_DOT) {
                        if (!dir->d_op->d_revalidate(dir, nd)) {
                                error = -ESTALE;
@@ -1628,7 +1629,6 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
                        }
                }
                /* fallthrough */
-        case LAST_DOT:
        case LAST_ROOT:
                if (open_flag & O_CREAT)
                        goto exit;
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index 7edfcd4d5e52..9578cbe0cd58 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -49,9 +49,10 @@ extern int ncp_symlink(struct inode *, struct dentry *, const char *);
                      
 const struct file_operations ncp_dir_operations =
 {
+        .llseek         = generic_file_llseek,
        .read           = generic_read_dir,
        .readdir        = ncp_readdir,
-        .ioctl          = ncp_ioctl,
+        .unlocked_ioctl = ncp_ioctl,
 #ifdef CONFIG_COMPAT
        .compat_ioctl   = ncp_compat_ioctl,
 #endif
diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c
index 1daabb90e0a5..3639cc5cbdae 100644
--- a/fs/ncpfs/file.c
+++ b/fs/ncpfs/file.c
@@ -22,7 +22,7 @@
 #include <linux/ncp_fs.h>
 #include "ncplib_kernel.h"
-static int ncp_fsync(struct file *file, struct dentry *dentry, int datasync)
+static int ncp_fsync(struct file *file, int datasync)
 {
        return 0;
 }
@@ -295,7 +295,7 @@ const struct file_operations ncp_file_operations =
        .llseek         = ncp_remote_llseek,
        .read           = ncp_file_read,
        .write          = ncp_file_write,
-        .ioctl          = ncp_ioctl,
+        .unlocked_ioctl = ncp_ioctl,
 #ifdef CONFIG_COMPAT
        .compat_ioctl   = ncp_compat_ioctl,
 #endif
diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c
index 60a5e2864ea8..023c03d02070 100644
--- a/fs/ncpfs/ioctl.c
+++ b/fs/ncpfs/ioctl.c
@@ -20,6 +20,7 @@
 #include <linux/smp_lock.h>
 #include <linux/vmalloc.h>
 #include <linux/sched.h>
+#include <linux/smp_lock.h>
 #include <linux/ncp_fs.h>
@@ -261,9 +262,9 @@ ncp_get_charsets(struct ncp_server* server, struct ncp_nls_ioctl __user *arg)
 }
 #endif /* CONFIG_NCPFS_NLS */
-static int __ncp_ioctl(struct inode *inode, struct file *filp,
+static long __ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
-              unsigned int cmd, unsigned long arg)
 {
+        struct inode *inode = filp->f_dentry->d_inode;
        struct ncp_server *server = NCP_SERVER(inode);
        int result;
        struct ncp_ioctl_request request;
@@ -841,11 +842,11 @@ static int ncp_ioctl_need_write(unsigned int cmd)
        }
 }
-int ncp_ioctl(struct inode *inode, struct file *filp,
+long ncp_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
-              unsigned int cmd, unsigned long arg)
 {
-        int ret;
+        long ret;
+        lock_kernel();
        if (ncp_ioctl_need_write(cmd)) {
                /*
                 * inside the ioctl(), any failures which
@@ -853,24 +854,28 @@ int ncp_ioctl(struct inode *inode, struct file *filp,
                 * -EACCESS, so it seems consistent to keep
                 *  that here.
                 */
-                if (mnt_want_write(filp->f_path.mnt))
+                if (mnt_want_write(filp->f_path.mnt)) {
-                        return -EACCES;
+                        ret = -EACCES;
+                        goto out;
+                }
        }
-        ret = __ncp_ioctl(inode, filp, cmd, arg);
+        ret = __ncp_ioctl(filp, cmd, arg);
        if (ncp_ioctl_need_write(cmd))
                mnt_drop_write(filp->f_path.mnt);
+out:
+        unlock_kernel();
        return ret;
 }
 #ifdef CONFIG_COMPAT
 long ncp_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
-        struct inode *inode = file->f_path.dentry->d_inode;
+        long ret;
-        int ret;
        lock_kernel();
        arg = (unsigned long) compat_ptr(arg);
-        ret = ncp_ioctl(inode, file, cmd, arg);
+        ret = ncp_ioctl(file, cmd, arg);
        unlock_kernel();
        return ret;
 }
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index ee9a179ebdf3..782b431ef91c 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -53,7 +53,7 @@ static int nfs_link(struct dentry *, struct inode *, struct dentry *);
 static int nfs_mknod(struct inode *, struct dentry *, int, dev_t);
 static int nfs_rename(struct inode *, struct dentry *,
                      struct inode *, struct dentry *);
-static int nfs_fsync_dir(struct file *, struct dentry *, int);
+static int nfs_fsync_dir(struct file *, int);
 static loff_t nfs_llseek_dir(struct file *, loff_t, int);
 const struct file_operations nfs_dir_operations = {
@@ -641,8 +641,10 @@ out:
 * All directory operations under NFS are synchronous, so fsync()
 * is a dummy operation.
 */
-static int nfs_fsync_dir(struct file *filp, struct dentry *dentry, int datasync)
+static int nfs_fsync_dir(struct file *filp, int datasync)
 {
+        struct dentry *dentry = filp->f_path.dentry;
        dfprintk(FILE, "NFS: fsync dir(%s/%s) datasync %d\n",
                        dentry->d_parent->d_name.name, dentry->d_name.name,
                        datasync);
@@ -1741,6 +1743,7 @@ remove_lru_entry:
                        clear_bit(NFS_INO_ACL_LRU_SET, &nfsi->flags);
                        smp_mb__after_clear_bit();
                }
+                spin_unlock(&inode->i_lock);
        }
        spin_unlock(&nfs_access_lru_lock);
        nfs_access_free_list(&head);
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index cac96bcc91e4..36a5e74f51b4 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -53,7 +53,7 @@ static ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe,
 static ssize_t nfs_file_write(struct kiocb *, const struct iovec *iov,
                                unsigned long nr_segs, loff_t pos);
 static int  nfs_file_flush(struct file *, fl_owner_t id);
-static int  nfs_file_fsync(struct file *, struct dentry *dentry, int datasync);
+static int  nfs_file_fsync(struct file *, int datasync);
 static int nfs_check_flags(int flags);
 static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl);
 static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl);
@@ -322,8 +322,9 @@ nfs_file_mmap(struct file * file, struct vm_area_struct * vma)
 * whether any write errors occurred for this process.
 */
 static int
-nfs_file_fsync(struct file *file, struct dentry *dentry, int datasync)
+nfs_file_fsync(struct file *file, int datasync)
 {
+        struct dentry *dentry = file->f_path.dentry;
        struct nfs_open_context *ctx = nfs_file_open_context(file);
        struct inode *inode = dentry->d_inode;
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 2f8b1157daa2..04214fc5c304 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1060,7 +1060,7 @@ static int nfs_parse_mount_options(char *raw,
                                goto out_nomem;
                        rc = strict_strtoul(string, 10, &option);
                        kfree(string);
-                        if (rc != 0 || option > USHORT_MAX)
+                        if (rc != 0 || option > USHRT_MAX)
                                goto out_invalid_value;
                        mnt->nfs_server.port = option;
                        break;
@@ -1181,7 +1181,7 @@ static int nfs_parse_mount_options(char *raw,
                                goto out_nomem;
                        rc = strict_strtoul(string, 10, &option);
                        kfree(string);
-                        if (rc != 0 || option > USHORT_MAX)
+                        if (rc != 0 || option > USHRT_MAX)
                                goto out_invalid_value;
                        mnt->mount_server.port = option;
                        break;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 3aea3ca98ab7..91679e2631ee 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1386,7 +1386,7 @@ static int nfs_commit_inode(struct inode *inode, int how)
        int res = 0;
        if (!nfs_commit_set_lock(NFS_I(inode), may_wait))
-                goto out;
+                goto out_mark_dirty;
        spin_lock(&inode->i_lock);
        res = nfs_scan_commit(inode, &head, 0, 0);
        spin_unlock(&inode->i_lock);
@@ -1398,9 +1398,18 @@ static int nfs_commit_inode(struct inode *inode, int how)
                        wait_on_bit(&NFS_I(inode)->flags, NFS_INO_COMMIT,
                                        nfs_wait_bit_killable,
                                        TASK_KILLABLE);
+                else
+                        goto out_mark_dirty;
        } else
                nfs_commit_clear_lock(NFS_I(inode));
-out:
+        return res;
+        /* Note: If we exit without ensuring that the commit is complete,
+         * we must mark the inode as dirty. Otherwise, future calls to
+         * sync_inode() with the WB_SYNC_ALL flag set will fail to ensure
+         * that the data is on the disk.
+         */
+out_mark_dirty:
+        __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
        return res;
 }
@@ -1509,14 +1518,17 @@ int nfs_wb_page(struct inode *inode, struct page *page)
        };
        int ret;
-        while(PagePrivate(page)) {
+        for (;;) {
                wait_on_page_writeback(page);
                if (clear_page_dirty_for_io(page)) {
                        ret = nfs_writepage_locked(page, &wbc);
                        if (ret < 0)
                                goto out_error;
+                        continue;
                }
-                ret = sync_inode(inode, &wbc);
+                if (!PagePrivate(page))
+                        break;
+                ret = nfs_commit_inode(inode, FLUSH_SYNC);
                if (ret < 0)
                        goto out_error;
        }
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index bc3194ea01f5..508941c23af7 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -998,7 +998,7 @@ static ssize_t __write_ports_addxprt(char *buf)
        if (sscanf(buf, "%15s %4u", transport, &port) != 2)
                return -EINVAL;
-        if (port < 1 || port > USHORT_MAX)
+        if (port < 1 || port > USHRT_MAX)
                return -EINVAL;
        err = nfsd_create_serv();
@@ -1040,7 +1040,7 @@ static ssize_t __write_ports_delxprt(char *buf)
        if (sscanf(&buf[1], "%15s %4u", transport, &port) != 2)
                return -EINVAL;
-        if (port < 1 || port > USHORT_MAX || nfsd_serv == NULL)
+        if (port < 1 || port > USHRT_MAX || nfsd_serv == NULL)
                return -EINVAL;
        xprt = svc_find_xprt(nfsd_serv, transport, AF_UNSPEC, port);
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 30292df443ce..c9a30d7ff6fc 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -27,7 +27,7 @@
 #include "nilfs.h"
 #include "segment.h"
-int nilfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
+int nilfs_sync_file(struct file *file, int datasync)
 {
        /*
         * Called from fsync() system call
@@ -37,7 +37,7 @@ int nilfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
         * This function should be implemented when the writeback function
         * will be implemented.
         */
-        struct inode *inode = dentry->d_inode;
+        struct inode *inode = file->f_mapping->host;
        int err;
        if (!nilfs_inode_dirty(inode))
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 8723e5bfd071..47d6d7928122 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -228,7 +228,7 @@ extern void nilfs_set_link(struct inode *, struct nilfs_dir_entry *,
                           struct page *, struct inode *);
 /* file.c */
-extern int nilfs_sync_file(struct file *, struct dentry *, int);
+extern int nilfs_sync_file(struct file *, int);
 /* ioctl.c */
 long nilfs_ioctl(struct file *, unsigned int, unsigned long);
diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c
index fe44d3feee4a..0f48e7c5d9e1 100644
--- a/fs/ntfs/dir.c
+++ b/fs/ntfs/dir.c
@@ -1527,10 +1527,9 @@ static int ntfs_dir_open(struct inode *vi, struct file *filp)
 * this problem for now.  We do write the $BITMAP attribute if it is present
 * which is the important one for a directory so things are not too bad.
 */
-static int ntfs_dir_fsync(struct file *filp, struct dentry *dentry,
+static int ntfs_dir_fsync(struct file *filp, int datasync)
-                int datasync)
 {
-        struct inode *bmp_vi, *vi = dentry->d_inode;
+        struct inode *bmp_vi, *vi = filp->f_mapping->host;
        int err, ret;
        ntfs_attr na;
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 8804f093ba75..113ebd9f25a4 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -98,9 +98,6 @@ static int ntfs_file_open(struct inode *vi, struct file *filp)
 * the page at all.  For a more detailed explanation see ntfs_truncate() in
 * fs/ntfs/inode.c.
 *
- * @cached_page and @lru_pvec are just optimizations for dealing with multiple
- * pages.
- *
 * Return 0 on success and -errno on error.  In the case that an error is
 * encountered it is possible that the initialized size will already have been
 * incremented some way towards @new_init_size but it is guaranteed that if
@@ -110,8 +107,7 @@ static int ntfs_file_open(struct inode *vi, struct file *filp)
 * Locking: i_mutex on the vfs inode corrseponsind to the ntfs inode @ni must be
 *          held by the caller.
 */
-static int ntfs_attr_extend_initialized(ntfs_inode *ni, const s64 new_init_size,
+static int ntfs_attr_extend_initialized(ntfs_inode *ni, const s64 new_init_size)
-                struct page **cached_page, struct pagevec *lru_pvec)
 {
        s64 old_init_size;
        loff_t old_i_size;
@@ -403,18 +399,13 @@ static inline void ntfs_fault_in_pages_readable_iovec(const struct iovec *iov,
 * Obtain @nr_pages locked page cache pages from the mapping @mapping and
 * starting at index @index.
 *
- * If a page is newly created, increment its refcount and add it to the
+ * If a page is newly created, add it to lru list
- * caller's lru-buffering pagevec @lru_pvec.
- *
- * This is the same as mm/filemap.c::__grab_cache_page(), except that @nr_pages
- * are obtained at once instead of just one page and that 0 is returned on
- * success and -errno on error.
 *
 * Note, the page locks are obtained in ascending page index order.
 */
 static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
                pgoff_t index, const unsigned nr_pages, struct page **pages,
-                struct page **cached_page, struct pagevec *lru_pvec)
+                struct page **cached_page)
 {
        int err, nr;
@@ -430,7 +421,7 @@ static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
                                        goto err_out;
                                }
                        }
-                        err = add_to_page_cache(*cached_page, mapping, index,
+                        err = add_to_page_cache_lru(*cached_page, mapping, index,
                                        GFP_KERNEL);
                        if (unlikely(err)) {
                                if (err == -EEXIST)
@@ -438,9 +429,6 @@ static inline int __ntfs_grab_cache_pages(struct address_space *mapping,
                                goto err_out;
                        }
                        pages[nr] = *cached_page;
-                        page_cache_get(*cached_page);
-                        if (unlikely(!pagevec_add(lru_pvec, *cached_page)))
-                                __pagevec_lru_add_file(lru_pvec);
                        *cached_page = NULL;
                }
                index++;
@@ -1800,7 +1788,6 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
        ssize_t status, written;
        unsigned nr_pages;
        int err;
-        struct pagevec lru_pvec;
        ntfs_debug("Entering for i_ino 0x%lx, attribute type 0x%x, "
                        "pos 0x%llx, count 0x%lx.",
@@ -1912,7 +1899,6 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
                        }
                }
        }
-        pagevec_init(&lru_pvec, 0);
        written = 0;
        /*
         * If the write starts beyond the initialized size, extend it up to the
@@ -1925,8 +1911,7 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
        ll = ni->initialized_size;
        read_unlock_irqrestore(&ni->size_lock, flags);
        if (pos > ll) {
-                err = ntfs_attr_extend_initialized(ni, pos, &cached_page,
+                err = ntfs_attr_extend_initialized(ni, pos);
-                                &lru_pvec);
                if (err < 0) {
                        ntfs_error(vol->sb, "Cannot perform write to inode "
                                        "0x%lx, attribute type 0x%x, because "
@@ -2012,7 +1997,7 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb,
                        ntfs_fault_in_pages_readable_iovec(iov, iov_ofs, bytes);
                /* Get and lock @do_pages starting at index @start_idx. */
                status = __ntfs_grab_cache_pages(mapping, start_idx, do_pages,
-                                pages, &cached_page, &lru_pvec);
+                                pages, &cached_page);
                if (unlikely(status))
                        break;
                /*
@@ -2077,7 +2062,6 @@ err_out:
        *ppos = pos;
        if (cached_page)
                page_cache_release(cached_page);
-        pagevec_lru_add_file(&lru_pvec);
        ntfs_debug("Done.  Returning %s (written 0x%lx, status %li).",
                        written ? "written" : "status", (unsigned long)written,
                        (long)status);
@@ -2149,7 +2133,6 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 /**
 * ntfs_file_fsync - sync a file to disk
 * @filp:       file to be synced
- * @dentry:     dentry describing the file to sync
 * @datasync:   if non-zero only flush user data and not metadata
 *
 * Data integrity sync of a file to disk.  Used for fsync, fdatasync, and msync
@@ -2165,19 +2148,15 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 * Also, if @datasync is true, we do not wait on the inode to be written out
 * but we always wait on the page cache pages to be written out.
 *
- * Note: In the past @filp could be NULL so we ignore it as we don't need it
- * anyway.
- *
 * Locking: Caller must hold i_mutex on the inode.
 *
 * TODO: We should probably also write all attribute/index inodes associated
 * with this inode but since we have no simple way of getting to them we ignore
 * this problem for now.
 */
-static int ntfs_file_fsync(struct file *filp, struct dentry *dentry,
+static int ntfs_file_fsync(struct file *filp, int datasync)
-                int datasync)
 {
-        struct inode *vi = dentry->d_inode;
+        struct inode *vi = filp->f_mapping->host;
        int err, ret = 0;
        ntfs_debug("Entering for inode 0x%lx.", vi->i_ino);
diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c
index b7428c5d0d3b..ec6d12339593 100644
--- a/fs/ocfs2/blockcheck.c
+++ b/fs/ocfs2/blockcheck.c
@@ -403,7 +403,7 @@ void ocfs2_block_check_compute(void *data, size_t blocksize,
         * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no
         * larger than 16 bits.
         */
-        BUG_ON(ecc > USHORT_MAX);
+        BUG_ON(ecc > USHRT_MAX);
        bc->bc_crc32e = cpu_to_le32(crc);
        bc->bc_ecc = cpu_to_le16((u16)ecc);
@@ -508,7 +508,7 @@ void ocfs2_block_check_compute_bhs(struct buffer_head **bhs, int nr,
         * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no
         * larger than 16 bits.
         */
-        BUG_ON(ecc > USHORT_MAX);
+        BUG_ON(ecc > USHRT_MAX);
        bc->bc_crc32e = cpu_to_le32(crc);
        bc->bc_ecc = cpu_to_le16((u16)ecc);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 97e54b9e654b..6a13ea64c447 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -175,13 +175,12 @@ static int ocfs2_dir_release(struct inode *inode, struct file *file)
        return 0;
 }
-static int ocfs2_sync_file(struct file *file,
+static int ocfs2_sync_file(struct file *file, int datasync)
-                           struct dentry *dentry,
-                           int datasync)
 {
        int err = 0;
        journal_t *journal;
-        struct inode *inode = dentry->d_inode;
+        struct dentry *dentry = file->f_path.dentry;
+        struct inode *inode = file->f_mapping->host;
        struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
        mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync,
@@ -1053,7 +1052,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
        }
        /*
-         * This will intentionally not wind up calling vmtruncate(),
+         * This will intentionally not wind up calling simple_setsize(),
         * since all the work for a size change has been done above.
         * Otherwise, we could get into problems with truncate as
         * ip_alloc_sem is used there to protect against i_size
@@ -2119,9 +2118,13 @@ relock:
                         * direct write may have instantiated a few
                         * blocks outside i_size. Trim these off again.
                         * Don't need i_size_read because we hold i_mutex.
+                         *
+                         * XXX(hch): this looks buggy because ocfs2 did not
+                         * actually implement ->truncate.  Take a look at
+                         * the new truncate sequence and update this accordingly
                         */
                        if (*ppos + count > inode->i_size)
-                                vmtruncate(inode, inode->i_size);
+                                simple_setsize(inode, inode->i_size);
                        ret = written;
                        goto out_dio;
                }
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 2c26ce251cb3..0eaa929a4dbf 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -879,13 +879,15 @@ static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend)
                if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
                        continue;
                if (unsuspend)
-                        status = vfs_quota_enable(
+                        status = dquot_resume(sb, type);
-                                        sb_dqopt(sb)->files[type],
+                else {
-                                        type, QFMT_OCFS2,
+                        struct ocfs2_mem_dqinfo *oinfo;
-                                        DQUOT_SUSPENDED);
-                else
+                        /* Cancel periodic syncing before suspending */
-                        status = vfs_quota_disable(sb, type,
+                        oinfo = sb_dqinfo(sb, type)->dqi_priv;
-                                                   DQUOT_SUSPENDED);
+                        cancel_delayed_work_sync(&oinfo->dqi_sync_work);
+                        status = dquot_suspend(sb, type);
+                }
                if (status < 0)
                        break;
        }
@@ -916,8 +918,8 @@ static int ocfs2_enable_quotas(struct ocfs2_super *osb)
                        status = -ENOENT;
                        goto out_quota_off;
                }
-                status = vfs_quota_enable(inode[type], type, QFMT_OCFS2,
+                status = dquot_enable(inode[type], type, QFMT_OCFS2,
-                                                DQUOT_USAGE_ENABLED);
+                                      DQUOT_USAGE_ENABLED);
                if (status < 0)
                        goto out_quota_off;
        }
@@ -952,8 +954,8 @@ static void ocfs2_disable_quotas(struct ocfs2_super *osb)
                /* Turn off quotas. This will remove all dquot structures from
                 * memory and so they will be automatically synced to global
                 * quota files */
-                vfs_quota_disable(sb, type, DQUOT_USAGE_ENABLED |
+                dquot_disable(sb, type, DQUOT_USAGE_ENABLED |
-                                            DQUOT_LIMITS_ENABLED);
+                                        DQUOT_LIMITS_ENABLED);
                if (!inode)
                        continue;
                iput(inode);
@@ -962,7 +964,7 @@ static void ocfs2_disable_quotas(struct ocfs2_super *osb)
 /* Handle quota on quotactl */
 static int ocfs2_quota_on(struct super_block *sb, int type, int format_id,
-                          char *path, int remount)
+                          char *path)
 {
        unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
                                             OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
@@ -970,30 +972,24 @@ static int ocfs2_quota_on(struct super_block *sb, int type, int format_id,
        if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
                return -EINVAL;
-        if (remount)
+        return dquot_enable(sb_dqopt(sb)->files[type], type,
-                return 0;       /* Just ignore it has been handled in
+                            format_id, DQUOT_LIMITS_ENABLED);
-                                 * ocfs2_remount() */
-        return vfs_quota_enable(sb_dqopt(sb)->files[type], type,
-                                    format_id, DQUOT_LIMITS_ENABLED);
 }
 /* Handle quota off quotactl */
-static int ocfs2_quota_off(struct super_block *sb, int type, int remount)
+static int ocfs2_quota_off(struct super_block *sb, int type)
 {
-        if (remount)
+        return dquot_disable(sb, type, DQUOT_LIMITS_ENABLED);
-                return 0;       /* Ignore now and handle later in
-                                 * ocfs2_remount() */
-        return vfs_quota_disable(sb, type, DQUOT_LIMITS_ENABLED);
 }
 static const struct quotactl_ops ocfs2_quotactl_ops = {
        .quota_on       = ocfs2_quota_on,
        .quota_off      = ocfs2_quota_off,
-        .quota_sync     = vfs_quota_sync,
+        .quota_sync     = dquot_quota_sync,
-        .get_info       = vfs_get_dqinfo,
+        .get_info       = dquot_get_dqinfo,
-        .set_info       = vfs_set_dqinfo,
+        .set_info       = dquot_set_dqinfo,
-        .get_dqblk      = vfs_get_dqblk,
+        .get_dqblk      = dquot_get_dqblk,
-        .set_dqblk      = vfs_set_dqblk,
+        .set_dqblk      = dquot_set_dqblk,
 };
 static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index 399487c09364..6e7a3291bbe8 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -329,7 +329,7 @@ const struct file_operations omfs_file_operations = {
        .aio_read = generic_file_aio_read,
        .aio_write = generic_file_aio_write,
        .mmap = generic_file_mmap,
-        .fsync = simple_fsync,
+        .fsync = generic_file_fsync,
        .splice_read = generic_file_splice_read,
 };
diff --git a/fs/partitions/ldm.c b/fs/partitions/ldm.c
index 3ceca05b668c..648c9d8f3357 100644
--- a/fs/partitions/ldm.c
+++ b/fs/partitions/ldm.c
@@ -26,6 +26,7 @@
 #include <linux/slab.h>
 #include <linux/pagemap.h>
 #include <linux/stringify.h>
+#include <linux/kernel.h>
 #include "ldm.h"
 #include "check.h"
 #include "msdos.h"
@@ -77,17 +78,16 @@ static int ldm_parse_hexbyte (const u8 *src)
        int h;
        /* high part */
-        if      ((x = src[0] - '0') <= '9'-'0') h = x;
+        x = h = hex_to_bin(src[0]);
-        else if ((x = src[0] - 'a') <= 'f'-'a') h = x+10;
+        if (h < 0)
-        else if ((x = src[0] - 'A') <= 'F'-'A') h = x+10;
+                return -1;
-        else return -1;
-        h <<= 4;
        /* low part */
-        if ((x = src[1] - '0') <= '9'-'0') return h | x;
+        h = hex_to_bin(src[1]);
-        if ((x = src[1] - 'a') <= 'f'-'a') return h | (x+10);
+        if (h < 0)
-        if ((x = src[1] - 'A') <= 'F'-'A') return h | (x+10);
+                return -1;
-        return -1;
+        return (x << 4) + h;
 }
 /**
diff --git a/fs/pipe.c b/fs/pipe.c
index bdd3f96054b9..541d6626f9d9 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -230,6 +230,7 @@ void *generic_pipe_buf_map(struct pipe_inode_info *pipe,
        return kmap(buf->page);
 }
+EXPORT_SYMBOL(generic_pipe_buf_map);
 /**
 * generic_pipe_buf_unmap - unmap a previously mapped pipe buffer
@@ -249,6 +250,7 @@ void generic_pipe_buf_unmap(struct pipe_inode_info *pipe,
        } else
                kunmap(buf->page);
 }
+EXPORT_SYMBOL(generic_pipe_buf_unmap);
 /**
 * generic_pipe_buf_steal - attempt to take ownership of a &pipe_buffer
@@ -279,6 +281,7 @@ int generic_pipe_buf_steal(struct pipe_inode_info *pipe,
        return 1;
 }
+EXPORT_SYMBOL(generic_pipe_buf_steal);
 /**
 * generic_pipe_buf_get - get a reference to a &struct pipe_buffer
@@ -294,6 +297,7 @@ void generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
 {
        page_cache_get(buf->page);
 }
+EXPORT_SYMBOL(generic_pipe_buf_get);
 /**
 * generic_pipe_buf_confirm - verify contents of the pipe buffer
@@ -309,6 +313,7 @@ int generic_pipe_buf_confirm(struct pipe_inode_info *info,
 {
        return 0;
 }
+EXPORT_SYMBOL(generic_pipe_buf_confirm);
 /**
 * generic_pipe_buf_release - put a reference to a &struct pipe_buffer
@@ -323,6 +328,7 @@ void generic_pipe_buf_release(struct pipe_inode_info *pipe,
 {
        page_cache_release(buf->page);
 }
+EXPORT_SYMBOL(generic_pipe_buf_release);
 static const struct pipe_buf_operations anon_pipe_buf_ops = {
        .can_merge = 1,
@@ -1172,16 +1178,20 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
                nr_pages = (arg + PAGE_SIZE - 1) >> PAGE_SHIFT;
                nr_pages = roundup_pow_of_two(nr_pages);
-                if (!capable(CAP_SYS_ADMIN) && nr_pages > pipe_max_pages)
+                if (!capable(CAP_SYS_ADMIN) && nr_pages > pipe_max_pages) {
-                        return -EPERM;
+                        ret = -EPERM;
+                        goto out;
+                }
                /*
                 * The pipe needs to be at least 2 pages large to
                 * guarantee POSIX behaviour.
                 */
-                if (nr_pages < 2)
+                if (arg < 2) {
-                        return -EINVAL;
+                        ret = -EINVAL;
-                ret = pipe_set_size(pipe, nr_pages);
+                        goto out;
+                }
+                ret = pipe_set_size(pipe, arg);
                break;
                }
        case F_GETPIPE_SZ:
@@ -1192,6 +1202,7 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
                break;
        }
+out:
        mutex_unlock(&pipe->inode->i_mutex);
        return ret;
 }
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 885ab5513ac5..9b58d38bc911 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -267,7 +267,7 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p)
                shpending = p->signal->shared_pending.signal;
                blocked = p->blocked;
                collect_sigign_sigcatch(p, &ignored, &caught);
-                num_threads = atomic_read(&p->signal->count);
+                num_threads = get_nr_threads(p);
                rcu_read_lock();  /* FIXME: is this correct? */
                qsize = atomic_read(&__task_cred(p)->user->sigpending);
                rcu_read_unlock();
@@ -410,7 +410,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
                        tty_nr = new_encode_dev(tty_devnum(sig->tty));
                }
-                num_threads = atomic_read(&sig->count);
+                num_threads = get_nr_threads(task);
                collect_sigign_sigcatch(task, &sigign, &sigcatch);
                cmin_flt = sig->cmin_flt;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index c7f9f23449dc..acb7ef80ea4f 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -166,18 +166,6 @@ static int get_fs_path(struct task_struct *task, struct path *path, bool root)
        return result;
 }
-static int get_nr_threads(struct task_struct *tsk)
-{
-        unsigned long flags;
-        int count = 0;
-        if (lock_task_sighand(tsk, &flags)) {
-                count = atomic_read(&tsk->signal->count);
-                unlock_task_sighand(tsk, &flags);
-        }
-        return count;
-}
 static int proc_cwd_link(struct inode *inode, struct path *path)
 {
        struct task_struct *task = get_proc_task(inode);
@@ -2444,7 +2432,7 @@ static struct dentry *proc_base_instantiate(struct inode *dir,
        const struct pid_entry *p = ptr;
        struct inode *inode;
        struct proc_inode *ei;
-        struct dentry *error = ERR_PTR(-EINVAL);
+        struct dentry *error;
        /* Allocate the inode */
        error = ERR_PTR(-ENOMEM);
@@ -2794,7 +2782,7 @@ out:
 struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
 {
-        struct dentry *result = ERR_PTR(-ENOENT);
+        struct dentry *result;
        struct task_struct *task;
        unsigned tgid;
        struct pid_namespace *ns;
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 43c127490606..2791907744ed 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -343,21 +343,6 @@ static DEFINE_SPINLOCK(proc_inum_lock); /* protects the above */
 /*
 * Return an inode number between PROC_DYNAMIC_FIRST and
 * 0xffffffff, or zero on failure.
- *
- * Current inode allocations in the proc-fs (hex-numbers):
- *
- * 00000000             reserved
- * 00000001-00000fff    static entries  (goners)
- *      001             root-ino
- *
- * 00001000-00001fff    unused
- * 0001xxxx-7fffxxxx    pid-dir entries for pid 1-7fff
- * 80000000-efffffff    unused
- * f0000000-ffffffff    dynamic entries
- *
- * Goal:
- *      Once we split the thing into several virtual filesystems,
- *      we will get rid of magical ranges (and this comment, BTW).
 */
 static unsigned int get_inode_number(void)
 {
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index c837a77351be..6f37c391468d 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -588,7 +588,7 @@ static struct kcore_list kcore_text;
 */
 static void __init proc_kcore_text_init(void)
 {
-        kclist_add(&kcore_text, _stext, _end - _stext, KCORE_TEXT);
+        kclist_add(&kcore_text, _text, _end - _text, KCORE_TEXT);
 }
 #else
 static void __init proc_kcore_text_init(void)
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 757c069f2a65..4258384ed22d 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -110,7 +110,6 @@ void __init proc_root_init(void)
        if (err)
                return;
        proc_mnt = kern_mount_data(&proc_fs_type, &init_pid_ns);
-        err = PTR_ERR(proc_mnt);
        if (IS_ERR(proc_mnt)) {
                unregister_filesystem(&proc_fs_type);
                return;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 47f5b145f56e..aea1d3f1ffb5 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -634,6 +634,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
        return err;
 }
+#ifdef CONFIG_HUGETLB_PAGE
 static u64 huge_pte_to_pagemap_entry(pte_t pte, int offset)
 {
        u64 pme = 0;
@@ -664,6 +665,7 @@ static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask,
        return err;
 }
+#endif /* HUGETLB_PAGE */
 /*
 * /proc/pid/pagemap - an array mapping virtual pages to pfns
@@ -733,7 +735,9 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
        pagemap_walk.pmd_entry = pagemap_pte_range;
        pagemap_walk.pte_hole = pagemap_pte_hole;
+#ifdef CONFIG_HUGETLB_PAGE
        pagemap_walk.hugetlb_entry = pagemap_hugetlb_range;
+#endif
        pagemap_walk.mm = mm;
        pagemap_walk.private = &pm;
diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c
index 6f30c3d5bcbf..6e8fc62b40a8 100644
--- a/fs/qnx4/dir.c
+++ b/fs/qnx4/dir.c
@@ -77,9 +77,10 @@ out:
 const struct file_operations qnx4_dir_operations =
 {
+        .llseek         = generic_file_llseek,
        .read           = generic_read_dir,
        .readdir        = qnx4_readdir,
-        .fsync          = simple_fsync,
+        .fsync          = generic_file_fsync,
 };
 const struct inode_operations qnx4_dir_inode_operations =
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 655a4c52b8c3..12c233da1b6b 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -228,10 +228,6 @@ static struct hlist_head *dquot_hash;
 struct dqstats dqstats;
 EXPORT_SYMBOL(dqstats);
-#ifdef CONFIG_SMP
-struct dqstats *dqstats_pcpu;
-EXPORT_SYMBOL(dqstats_pcpu);
-#endif
 static qsize_t inode_get_rsv_space(struct inode *inode);
 static void __dquot_initialize(struct inode *inode, int type);
@@ -584,7 +580,7 @@ out:
 }
 EXPORT_SYMBOL(dquot_scan_active);
-int vfs_quota_sync(struct super_block *sb, int type, int wait)
+int dquot_quota_sync(struct super_block *sb, int type, int wait)
 {
        struct list_head *dirty;
        struct dquot *dquot;
@@ -656,7 +652,7 @@ int vfs_quota_sync(struct super_block *sb, int type, int wait)
        return 0;
 }
-EXPORT_SYMBOL(vfs_quota_sync);
+EXPORT_SYMBOL(dquot_quota_sync);
 /* Free unused dquots from cache */
 static void prune_dqcache(int count)
@@ -676,27 +672,10 @@ static void prune_dqcache(int count)
        }
 }
-static int dqstats_read(unsigned int type)
-{
-        int count = 0;
-#ifdef CONFIG_SMP
-        int cpu;
-        for_each_possible_cpu(cpu)
-                count += per_cpu_ptr(dqstats_pcpu, cpu)->stat[type];
-        /* Statistics reading is racy, but absolute accuracy isn't required */
-        if (count < 0)
-                count = 0;
-#else
-        count = dqstats.stat[type];
-#endif
-        return count;
-}
 /*
 * This is called from kswapd when we think we need some
 * more memory
 */
 static int shrink_dqcache_memory(int nr, gfp_t gfp_mask)
 {
        if (nr) {
@@ -704,7 +683,9 @@ static int shrink_dqcache_memory(int nr, gfp_t gfp_mask)
                prune_dqcache(nr);
                spin_unlock(&dq_list_lock);
        }
-        return (dqstats_read(DQST_FREE_DQUOTS)/100) * sysctl_vfs_cache_pressure;
+        return ((unsigned)
+                percpu_counter_read_positive(&dqstats.counter[DQST_FREE_DQUOTS])
+                /100) * sysctl_vfs_cache_pressure;
 }
 static struct shrinker dqcache_shrinker = {
@@ -1514,11 +1495,13 @@ static void inode_decr_space(struct inode *inode, qsize_t number, int reserve)
 /*
 * This operation can block, but only after everything is updated
 */
-int __dquot_alloc_space(struct inode *inode, qsize_t number,
+int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags)
-                int warn, int reserve)
 {
        int cnt, ret = 0;
        char warntype[MAXQUOTAS];
+        int warn = flags & DQUOT_SPACE_WARN;
+        int reserve = flags & DQUOT_SPACE_RESERVE;
+        int nofail = flags & DQUOT_SPACE_NOFAIL;
        /*
         * First test before acquiring mutex - solves deadlocks when we
@@ -1539,7 +1522,7 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number,
                        continue;
                ret = check_bdq(inode->i_dquot[cnt], number, !warn,
                                warntype+cnt);
-                if (ret) {
+                if (ret && !nofail) {
                        spin_unlock(&dq_data_lock);
                        goto out_flush_warn;
                }
@@ -1638,10 +1621,11 @@ EXPORT_SYMBOL(dquot_claim_space_nodirty);
 /*
 * This operation can block, but only after everything is updated
 */
-void __dquot_free_space(struct inode *inode, qsize_t number, int reserve)
+void __dquot_free_space(struct inode *inode, qsize_t number, int flags)
 {
        unsigned int cnt;
        char warntype[MAXQUOTAS];
+        int reserve = flags & DQUOT_SPACE_RESERVE;
        /* First test before acquiring mutex - solves deadlocks when we
         * re-enter the quota code and are already holding the mutex */
@@ -1812,7 +1796,7 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
        if (iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid)
                transfer_to[USRQUOTA] = dqget(sb, iattr->ia_uid, USRQUOTA);
        if (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)
-                transfer_to[GRPQUOTA] = dqget(sb, iattr->ia_uid, GRPQUOTA);
+                transfer_to[GRPQUOTA] = dqget(sb, iattr->ia_gid, GRPQUOTA);
        ret = __dquot_transfer(inode, transfer_to);
        dqput_all(transfer_to);
@@ -1847,6 +1831,7 @@ const struct dquot_operations dquot_operations = {
        .alloc_dquot    = dquot_alloc,
        .destroy_dquot  = dquot_destroy,
 };
+EXPORT_SYMBOL(dquot_operations);
 /*
 * Generic helper for ->open on filesystems supporting disk quotas.
@@ -1865,7 +1850,7 @@ EXPORT_SYMBOL(dquot_file_open);
 /*
 * Turn quota off on a device. type == -1 ==> quotaoff for all types (umount)
 */
-int vfs_quota_disable(struct super_block *sb, int type, unsigned int flags)
+int dquot_disable(struct super_block *sb, int type, unsigned int flags)
 {
        int cnt, ret = 0;
        struct quota_info *dqopt = sb_dqopt(sb);
@@ -1995,14 +1980,15 @@ put_inodes:
                }
        return ret;
 }
-EXPORT_SYMBOL(vfs_quota_disable);
+EXPORT_SYMBOL(dquot_disable);
-int vfs_quota_off(struct super_block *sb, int type, int remount)
+int dquot_quota_off(struct super_block *sb, int type)
 {
-        return vfs_quota_disable(sb, type, remount ? DQUOT_SUSPENDED :
+        return dquot_disable(sb, type,
-                                 (DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED));
+                             DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
 }
-EXPORT_SYMBOL(vfs_quota_off);
+EXPORT_SYMBOL(dquot_quota_off);
 /*
 *      Turn quotas on on a device
 */
@@ -2120,36 +2106,43 @@ out_fmt:
 }
 /* Reenable quotas on remount RW */
-static int vfs_quota_on_remount(struct super_block *sb, int type)
+int dquot_resume(struct super_block *sb, int type)
 {
        struct quota_info *dqopt = sb_dqopt(sb);
        struct inode *inode;
-        int ret;
+        int ret = 0, cnt;
        unsigned int flags;
-        mutex_lock(&dqopt->dqonoff_mutex);
+        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
-        if (!sb_has_quota_suspended(sb, type)) {
+                if (type != -1 && cnt != type)
+                        continue;
+                mutex_lock(&dqopt->dqonoff_mutex);
+                if (!sb_has_quota_suspended(sb, cnt)) {
+                        mutex_unlock(&dqopt->dqonoff_mutex);
+                        continue;
+                }
+                inode = dqopt->files[cnt];
+                dqopt->files[cnt] = NULL;
+                spin_lock(&dq_state_lock);
+                flags = dqopt->flags & dquot_state_flag(DQUOT_USAGE_ENABLED |
+                                                        DQUOT_LIMITS_ENABLED,
+                                                        cnt);
+                dqopt->flags &= ~dquot_state_flag(DQUOT_STATE_FLAGS, cnt);
+                spin_unlock(&dq_state_lock);
                mutex_unlock(&dqopt->dqonoff_mutex);
-                return 0;
-        }
-        inode = dqopt->files[type];
-        dqopt->files[type] = NULL;
-        spin_lock(&dq_state_lock);
-        flags = dqopt->flags & dquot_state_flag(DQUOT_USAGE_ENABLED |
-                                                DQUOT_LIMITS_ENABLED, type);
-        dqopt->flags &= ~dquot_state_flag(DQUOT_STATE_FLAGS, type);
-        spin_unlock(&dq_state_lock);
-        mutex_unlock(&dqopt->dqonoff_mutex);
-        flags = dquot_generic_flag(flags, type);
+                flags = dquot_generic_flag(flags, cnt);
-        ret = vfs_load_quota_inode(inode, type, dqopt->info[type].dqi_fmt_id,
+                ret = vfs_load_quota_inode(inode, cnt,
-                                   flags);
+                                dqopt->info[cnt].dqi_fmt_id, flags);
-        iput(inode);
+                iput(inode);
+        }
        return ret;
 }
+EXPORT_SYMBOL(dquot_resume);
-int vfs_quota_on_path(struct super_block *sb, int type, int format_id,
+int dquot_quota_on_path(struct super_block *sb, int type, int format_id,
                      struct path *path)
 {
        int error = security_quota_on(path->dentry);
@@ -2164,40 +2157,36 @@ int vfs_quota_on_path(struct super_block *sb, int type, int format_id,
                                             DQUOT_LIMITS_ENABLED);
        return error;
 }
-EXPORT_SYMBOL(vfs_quota_on_path);
+EXPORT_SYMBOL(dquot_quota_on_path);
-int vfs_quota_on(struct super_block *sb, int type, int format_id, char *name,
+int dquot_quota_on(struct super_block *sb, int type, int format_id, char *name)
-                 int remount)
 {
        struct path path;
        int error;
-        if (remount)
-                return vfs_quota_on_remount(sb, type);
        error = kern_path(name, LOOKUP_FOLLOW, &path);
        if (!error) {
-                error = vfs_quota_on_path(sb, type, format_id, &path);
+                error = dquot_quota_on_path(sb, type, format_id, &path);
                path_put(&path);
        }
        return error;
 }
-EXPORT_SYMBOL(vfs_quota_on);
+EXPORT_SYMBOL(dquot_quota_on);
 /*
 * More powerful function for turning on quotas allowing setting
 * of individual quota flags
 */
-int vfs_quota_enable(struct inode *inode, int type, int format_id,
+int dquot_enable(struct inode *inode, int type, int format_id,
-                unsigned int flags)
+                 unsigned int flags)
 {
        int ret = 0;
        struct super_block *sb = inode->i_sb;
        struct quota_info *dqopt = sb_dqopt(sb);
        /* Just unsuspend quotas? */
-        if (flags & DQUOT_SUSPENDED)
+        BUG_ON(flags & DQUOT_SUSPENDED);
-                return vfs_quota_on_remount(sb, type);
        if (!flags)
                return 0;
        /* Just updating flags needed? */
@@ -2229,13 +2218,13 @@ out_lock:
 load_quota:
        return vfs_load_quota_inode(inode, type, format_id, flags);
 }
-EXPORT_SYMBOL(vfs_quota_enable);
+EXPORT_SYMBOL(dquot_enable);
 /*
 * This function is used when filesystem needs to initialize quotas
 * during mount time.
 */
-int vfs_quota_on_mount(struct super_block *sb, char *qf_name,
+int dquot_quota_on_mount(struct super_block *sb, char *qf_name,
                int format_id, int type)
 {
        struct dentry *dentry;
@@ -2261,24 +2250,7 @@ out:
        dput(dentry);
        return error;
 }
-EXPORT_SYMBOL(vfs_quota_on_mount);
+EXPORT_SYMBOL(dquot_quota_on_mount);
-/* Wrapper to turn on quotas when remounting rw */
-int vfs_dq_quota_on_remount(struct super_block *sb)
-{
-        int cnt;
-        int ret = 0, err;
-        if (!sb->s_qcop || !sb->s_qcop->quota_on)
-                return -ENOSYS;
-        for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
-                err = sb->s_qcop->quota_on(sb, cnt, 0, NULL, 1);
-                if (err < 0 && !ret)
-                        ret = err;
-        }
-        return ret;
-}
-EXPORT_SYMBOL(vfs_dq_quota_on_remount);
 static inline qsize_t qbtos(qsize_t blocks)
 {
@@ -2313,8 +2285,8 @@ static void do_get_dqblk(struct dquot *dquot, struct fs_disk_quota *di)
        spin_unlock(&dq_data_lock);
 }
-int vfs_get_dqblk(struct super_block *sb, int type, qid_t id,
+int dquot_get_dqblk(struct super_block *sb, int type, qid_t id,
-                  struct fs_disk_quota *di)
+                    struct fs_disk_quota *di)
 {
        struct dquot *dquot;
@@ -2326,7 +2298,7 @@ int vfs_get_dqblk(struct super_block *sb, int type, qid_t id,
        return 0;
 }
-EXPORT_SYMBOL(vfs_get_dqblk);
+EXPORT_SYMBOL(dquot_get_dqblk);
 #define VFS_FS_DQ_MASK \
        (FS_DQ_BCOUNT | FS_DQ_BSOFT | FS_DQ_BHARD | \
@@ -2425,7 +2397,7 @@ static int do_set_dqblk(struct dquot *dquot, struct fs_disk_quota *di)
        return 0;
 }
-int vfs_set_dqblk(struct super_block *sb, int type, qid_t id,
+int dquot_set_dqblk(struct super_block *sb, int type, qid_t id,
                  struct fs_disk_quota *di)
 {
        struct dquot *dquot;
@@ -2441,10 +2413,10 @@ int vfs_set_dqblk(struct super_block *sb, int type, qid_t id,
 out:
        return rc;
 }
-EXPORT_SYMBOL(vfs_set_dqblk);
+EXPORT_SYMBOL(dquot_set_dqblk);
 /* Generic routine for getting common part of quota file information */
-int vfs_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
+int dquot_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
 {
        struct mem_dqinfo *mi;
  
@@ -2463,10 +2435,10 @@ int vfs_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
        mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
        return 0;
 }
-EXPORT_SYMBOL(vfs_get_dqinfo);
+EXPORT_SYMBOL(dquot_get_dqinfo);
 /* Generic routine for setting common part of quota file information */
-int vfs_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
+int dquot_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
 {
        struct mem_dqinfo *mi;
        int err = 0;
@@ -2493,27 +2465,27 @@ out:
        mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
        return err;
 }
-EXPORT_SYMBOL(vfs_set_dqinfo);
+EXPORT_SYMBOL(dquot_set_dqinfo);
-const struct quotactl_ops vfs_quotactl_ops = {
+const struct quotactl_ops dquot_quotactl_ops = {
-        .quota_on       = vfs_quota_on,
+        .quota_on       = dquot_quota_on,
-        .quota_off      = vfs_quota_off,
+        .quota_off      = dquot_quota_off,
-        .quota_sync     = vfs_quota_sync,
+        .quota_sync     = dquot_quota_sync,
-        .get_info       = vfs_get_dqinfo,
+        .get_info       = dquot_get_dqinfo,
-        .set_info       = vfs_set_dqinfo,
+        .set_info       = dquot_set_dqinfo,
-        .get_dqblk      = vfs_get_dqblk,
+        .get_dqblk      = dquot_get_dqblk,
-        .set_dqblk      = vfs_set_dqblk
+        .set_dqblk      = dquot_set_dqblk
 };
+EXPORT_SYMBOL(dquot_quotactl_ops);
 static int do_proc_dqstats(struct ctl_table *table, int write,
                     void __user *buffer, size_t *lenp, loff_t *ppos)
 {
-#ifdef CONFIG_SMP
-        /* Update global table */
        unsigned int type = (int *)table->data - dqstats.stat;
-        dqstats.stat[type] = dqstats_read(type);
-#endif
+        /* Update global table */
+        dqstats.stat[type] =
+                        percpu_counter_sum_positive(&dqstats.counter[type]);
        return proc_dointvec(table, write, buffer, lenp, ppos);
 }
@@ -2606,7 +2578,7 @@ static ctl_table sys_table[] = {
 static int __init dquot_init(void)
 {
-        int i;
+        int i, ret;
        unsigned long nr_hash, order;
        printk(KERN_NOTICE "VFS: Disk quotas %s\n", __DQUOT_VERSION__);
@@ -2624,12 +2596,11 @@ static int __init dquot_init(void)
        if (!dquot_hash)
                panic("Cannot create dquot hash table");
-#ifdef CONFIG_SMP
+        for (i = 0; i < _DQST_DQSTAT_LAST; i++) {
-        dqstats_pcpu = alloc_percpu(struct dqstats);
+                ret = percpu_counter_init(&dqstats.counter[i], 0);
-        if (!dqstats_pcpu)
+                if (ret)
-                panic("Cannot create dquot stats table");
+                        panic("Cannot create dquot stat counters");
-#endif
+        }
-        memset(&dqstats, 0, sizeof(struct dqstats));
        /* Find power-of-two hlist_heads which can fit into allocation */
        nr_hash = (1UL << order) * PAGE_SIZE / sizeof(struct hlist_head);
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index ce3dfd066f59..b299961e1edb 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -73,7 +73,7 @@ static int quota_quotaon(struct super_block *sb, int type, int cmd, qid_t id,
        if (IS_ERR(pathname))
                return PTR_ERR(pathname);
        if (sb->s_qcop->quota_on)
-                ret = sb->s_qcop->quota_on(sb, type, id, pathname, 0);
+                ret = sb->s_qcop->quota_on(sb, type, id, pathname);
        putname(pathname);
        return ret;
 }
@@ -260,7 +260,7 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
        case Q_QUOTAOFF:
                if (!sb->s_qcop->quota_off)
                        return -ENOSYS;
-                return sb->s_qcop->quota_off(sb, type, 0);
+                return sb->s_qcop->quota_off(sb, type);
        case Q_GETFMT:
                return quota_getfmt(sb, type, addr);
        case Q_GETINFO:
diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c
index 78f613cb9c76..4884ac5ae9be 100644
--- a/fs/ramfs/file-mmu.c
+++ b/fs/ramfs/file-mmu.c
@@ -43,12 +43,13 @@ const struct file_operations ramfs_file_operations = {
        .write          = do_sync_write,
        .aio_write      = generic_file_aio_write,
        .mmap           = generic_file_mmap,
-        .fsync          = simple_sync_file,
+        .fsync          = noop_fsync,
        .splice_read    = generic_file_splice_read,
        .splice_write   = generic_file_splice_write,
        .llseek         = generic_file_llseek,
 };
 const struct inode_operations ramfs_file_inode_operations = {
+        .setattr        = simple_setattr,
        .getattr        = simple_getattr,
 };
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 5ea4ad81a429..d532c20fc179 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -42,7 +42,7 @@ const struct file_operations ramfs_file_operations = {
        .aio_read               = generic_file_aio_read,
        .write                  = do_sync_write,
        .aio_write              = generic_file_aio_write,
-        .fsync                  = simple_sync_file,
+        .fsync                  = noop_fsync,
        .splice_read            = generic_file_splice_read,
        .splice_write           = generic_file_splice_write,
        .llseek                 = generic_file_llseek,
@@ -146,7 +146,7 @@ static int ramfs_nommu_resize(struct inode *inode, loff_t newsize, loff_t size)
                        return ret;
        }
-        ret = vmtruncate(inode, newsize);
+        ret = simple_setsize(inode, newsize);
        return ret;
 }
@@ -169,7 +169,8 @@ static int ramfs_nommu_setattr(struct dentry *dentry, struct iattr *ia)
        /* pick out size-changing events */
        if (ia->ia_valid & ATTR_SIZE) {
-                loff_t size = i_size_read(inode);
+                loff_t size = inode->i_size;
                if (ia->ia_size != size) {
                        ret = ramfs_nommu_resize(inode, ia->ia_size, size);
                        if (ret < 0 || ia->ia_valid == ATTR_SIZE)
@@ -182,7 +183,7 @@ static int ramfs_nommu_setattr(struct dentry *dentry, struct iattr *ia)
                }
        }
-        ret = inode_setattr(inode, ia);
+        generic_setattr(inode, ia);
 out:
        ia->ia_valid = old_ia_valid;
        return ret;
diff --git a/fs/read_write.c b/fs/read_write.c
index 113386d6fd2d..9c0485236e68 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -97,6 +97,23 @@ loff_t generic_file_llseek(struct file *file, loff_t offset, int origin)
 }
 EXPORT_SYMBOL(generic_file_llseek);
+/**
+ * noop_llseek - No Operation Performed llseek implementation
+ * @file:       file structure to seek on
+ * @offset:     file offset to seek to (ignored)
+ * @origin:     type of seek (ignored)
+ *
+ * This is an implementation of ->llseek usable for the rare special case when
+ * userspace expects the seek to succeed but the (device) file is actually not
+ * able to perform the seek. In this case you use noop_llseek() instead of
+ * falling back to the default implementation of ->llseek.
+ *
+ * Return: the current value of file->f_pos; the position is not modified.
+ */
+loff_t noop_llseek(struct file *file, loff_t offset, int origin)
+{
+        return file->f_pos;
+}
+EXPORT_SYMBOL(noop_llseek);
 loff_t no_llseek(struct file *file, loff_t offset, int origin)
 {
        return -ESPIPE;
diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
index 07930449a958..198dabf1b2bb 100644
--- a/fs/reiserfs/dir.c
+++ b/fs/reiserfs/dir.c
@@ -14,10 +14,10 @@
 extern const struct reiserfs_key MIN_KEY;
 static int reiserfs_readdir(struct file *, void *, filldir_t);
-static int reiserfs_dir_fsync(struct file *filp, struct dentry *dentry,
+static int reiserfs_dir_fsync(struct file *filp, int datasync);
-                              int datasync);
 const struct file_operations reiserfs_dir_operations = {
+        .llseek = generic_file_llseek,
        .read = generic_read_dir,
        .readdir = reiserfs_readdir,
        .fsync = reiserfs_dir_fsync,
@@ -27,10 +27,9 @@ const struct file_operations reiserfs_dir_operations = {
 #endif
 };
-static int reiserfs_dir_fsync(struct file *filp, struct dentry *dentry,
+static int reiserfs_dir_fsync(struct file *filp, int datasync)
-                              int datasync)
 {
-        struct inode *inode = dentry->d_inode;
+        struct inode *inode = filp->f_mapping->host;
        int err;
        reiserfs_write_lock(inode->i_sb);
        err = reiserfs_commit_for_inode(inode);
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 9977df9f3a54..b82cdd8a45dd 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -134,10 +134,9 @@ static void reiserfs_vfs_truncate_file(struct inode *inode)
 * be removed...
 */
-static int reiserfs_sync_file(struct file *filp,
+static int reiserfs_sync_file(struct file *filp, int datasync)
-                              struct dentry *dentry, int datasync)
 {
-        struct inode *inode = dentry->d_inode;
+        struct inode *inode = filp->f_mapping->host;
        int err;
        int barrier_done;
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 59125fb36d42..9822fa15118b 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -158,6 +158,7 @@ static int finish_unfinished(struct super_block *s)
 #ifdef CONFIG_QUOTA
        int i;
        int ms_active_set;
+        int quota_enabled[MAXQUOTAS];
 #endif
        /* compose key to look for "save" links */
@@ -179,8 +180,15 @@ static int finish_unfinished(struct super_block *s)
        }
        /* Turn on quotas so that they are updated correctly */
        for (i = 0; i < MAXQUOTAS; i++) {
+                quota_enabled[i] = 1;
                if (REISERFS_SB(s)->s_qf_names[i]) {
-                        int ret = reiserfs_quota_on_mount(s, i);
+                        int ret;
+                        if (sb_has_quota_active(s, i)) {
+                                quota_enabled[i] = 0;
+                                continue;
+                        }
+                        ret = reiserfs_quota_on_mount(s, i);
                        if (ret < 0)
                                reiserfs_warning(s, "reiserfs-2500",
                                                 "cannot turn on journaled "
@@ -304,8 +312,8 @@ static int finish_unfinished(struct super_block *s)
 #ifdef CONFIG_QUOTA
        /* Turn quotas off */
        for (i = 0; i < MAXQUOTAS; i++) {
-                if (sb_dqopt(s)->files[i])
+                if (sb_dqopt(s)->files[i] && quota_enabled[i])
-                        vfs_quota_off(s, i, 0);
+                        dquot_quota_off(s, i);
        }
        if (ms_active_set)
                /* Restore the flag back */
@@ -466,6 +474,8 @@ static void reiserfs_put_super(struct super_block *s)
        struct reiserfs_transaction_handle th;
        th.t_trans_id = 0;
+        dquot_disable(s, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
        reiserfs_write_lock(s);
        if (s->s_dirt)
@@ -620,7 +630,7 @@ static int reiserfs_acquire_dquot(struct dquot *);
 static int reiserfs_release_dquot(struct dquot *);
 static int reiserfs_mark_dquot_dirty(struct dquot *);
 static int reiserfs_write_info(struct super_block *, int);
-static int reiserfs_quota_on(struct super_block *, int, int, char *, int);
+static int reiserfs_quota_on(struct super_block *, int, int, char *);
 static const struct dquot_operations reiserfs_quota_operations = {
        .write_dquot = reiserfs_write_dquot,
@@ -634,12 +644,12 @@ static const struct dquot_operations reiserfs_quota_operations = {
 static const struct quotactl_ops reiserfs_qctl_operations = {
        .quota_on = reiserfs_quota_on,
-        .quota_off = vfs_quota_off,
+        .quota_off = dquot_quota_off,
-        .quota_sync = vfs_quota_sync,
+        .quota_sync = dquot_quota_sync,
-        .get_info = vfs_get_dqinfo,
+        .get_info = dquot_get_dqinfo,
-        .set_info = vfs_set_dqinfo,
+        .set_info = dquot_set_dqinfo,
-        .get_dqblk = vfs_get_dqblk,
+        .get_dqblk = dquot_get_dqblk,
-        .set_dqblk = vfs_set_dqblk,
+        .set_dqblk = dquot_set_dqblk,
 };
 #endif
@@ -1242,6 +1252,11 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
                if (s->s_flags & MS_RDONLY)
                        /* it is read-only already */
                        goto out_ok;
+                err = dquot_suspend(s, -1);
+                if (err < 0)
+                        goto out_err;
                /* try to remount file system with read-only permissions */
                if (sb_umount_state(rs) == REISERFS_VALID_FS
                    || REISERFS_SB(s)->s_mount_state != REISERFS_VALID_FS) {
@@ -1295,6 +1310,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
        s->s_dirt = 0;
        if (!(*mount_flags & MS_RDONLY)) {
+                dquot_resume(s, -1);
                finish_unfinished(s);
                reiserfs_xattr_init(s, *mount_flags);
        }
@@ -2022,15 +2038,15 @@ static int reiserfs_write_info(struct super_block *sb, int type)
 */
 static int reiserfs_quota_on_mount(struct super_block *sb, int type)
 {
-        return vfs_quota_on_mount(sb, REISERFS_SB(sb)->s_qf_names[type],
+        return dquot_quota_on_mount(sb, REISERFS_SB(sb)->s_qf_names[type],
-                                  REISERFS_SB(sb)->s_jquota_fmt, type);
+                                        REISERFS_SB(sb)->s_jquota_fmt, type);
 }
 /*
 * Standard function to be called on quota_on
 */
 static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
-                             char *name, int remount)
+                             char *name)
 {
        int err;
        struct path path;
@@ -2039,9 +2055,7 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
        if (!(REISERFS_SB(sb)->s_mount_opt & (1 << REISERFS_QUOTA)))
                return -EINVAL;
-        /* No more checks needed? Path and format_id are bogus anyway... */
-        if (remount)
-                return vfs_quota_on(sb, type, format_id, name, 1);
        err = kern_path(name, LOOKUP_FOLLOW, &path);
        if (err)
                return err;
@@ -2085,7 +2099,7 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
                if (err)
                        goto out;
        }
-        err = vfs_quota_on_path(sb, type, format_id, &path);
+        err = dquot_quota_on_path(sb, type, format_id, &path);
 out:
        path_put(&path);
        return err;
diff --git a/fs/smbfs/dir.c b/fs/smbfs/dir.c
index 3e4803b4427e..00a70cab1f36 100644
--- a/fs/smbfs/dir.c
+++ b/fs/smbfs/dir.c
@@ -37,9 +37,10 @@ static int smb_link(struct dentry *, struct inode *, struct dentry *);
 const struct file_operations smb_dir_operations =
 {
+        .llseek         = generic_file_llseek,
        .read           = generic_read_dir,
        .readdir        = smb_readdir,
-        .ioctl          = smb_ioctl,
+        .unlocked_ioctl = smb_ioctl,
        .open           = smb_dir_open,
 };
diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c
index dbf6548bbf06..8e187a0f94bb 100644
--- a/fs/smbfs/file.c
+++ b/fs/smbfs/file.c
@@ -28,8 +28,9 @@
 #include "proto.h"
 static int
-smb_fsync(struct file *file, struct dentry * dentry, int datasync)
+smb_fsync(struct file *file, int datasync)
 {
+        struct dentry *dentry = file->f_path.dentry;
        struct smb_sb_info *server = server_from_dentry(dentry);
        int result;
@@ -437,7 +438,7 @@ const struct file_operations smb_file_operations =
        .aio_read       = smb_file_aio_read,
        .write          = do_sync_write,
        .aio_write      = smb_file_aio_write,
-        .ioctl          = smb_ioctl,
+        .unlocked_ioctl = smb_ioctl,
        .mmap           = smb_file_mmap,
        .open           = smb_file_open,
        .release        = smb_file_release,
diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c
index dfa1d67f8fca..9551cb6f7fe4 100644
--- a/fs/smbfs/inode.c
+++ b/fs/smbfs/inode.c
@@ -714,7 +714,7 @@ smb_notify_change(struct dentry *dentry, struct iattr *attr)
                error = server->ops->truncate(inode, attr->ia_size);
                if (error)
                        goto out;
-                error = vmtruncate(inode, attr->ia_size);
+                error = simple_setsize(inode, attr->ia_size);
                if (error)
                        goto out;
                refresh = 1;
diff --git a/fs/smbfs/ioctl.c b/fs/smbfs/ioctl.c
index dbae1f8ea26f..07215312ad39 100644
--- a/fs/smbfs/ioctl.c
+++ b/fs/smbfs/ioctl.c
@@ -13,6 +13,7 @@
 #include <linux/time.h>
 #include <linux/mm.h>
 #include <linux/highuid.h>
+#include <linux/smp_lock.h>
 #include <linux/net.h>
 #include <linux/smb_fs.h>
@@ -22,14 +23,14 @@
 #include "proto.h"
-int
+long
-smb_ioctl(struct inode *inode, struct file *filp,
+smb_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
-          unsigned int cmd, unsigned long arg)
 {
-        struct smb_sb_info *server = server_from_inode(inode);
+        struct smb_sb_info *server = server_from_inode(filp->f_path.dentry->d_inode);
        struct smb_conn_opt opt;
        int result = -EINVAL;
+        lock_kernel();
        switch (cmd) {
                uid16_t uid16;
                uid_t uid32;
@@ -62,6 +63,7 @@ smb_ioctl(struct inode *inode, struct file *filp,
        default:
                break;
        }
+        unlock_kernel();
        return result;
 }
diff --git a/fs/smbfs/proto.h b/fs/smbfs/proto.h
index 03f456c1b7d4..05939a6f43e6 100644
--- a/fs/smbfs/proto.h
+++ b/fs/smbfs/proto.h
@@ -67,7 +67,7 @@ extern const struct address_space_operations smb_file_aops;
 extern const struct file_operations smb_file_operations;
 extern const struct inode_operations smb_file_inode_operations;
 /* ioctl.c */
-extern int smb_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg);
+extern long smb_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
 /* smbiod.c */
 extern void smbiod_wake_up(void);
 extern int smbiod_register_server(struct smb_sb_info *server);
diff --git a/fs/smbfs/symlink.c b/fs/smbfs/symlink.c
index 54350b59046b..00b2909bd469 100644
--- a/fs/smbfs/symlink.c
+++ b/fs/smbfs/symlink.c
@@ -15,7 +15,6 @@
 #include <linux/pagemap.h>
 #include <linux/net.h>
 #include <linux/namei.h>
-#include <linux/slab.h>
 #include <asm/uaccess.h>
 #include <asm/system.h>
diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig
index 25a00d19d686..cc6ce8a84c21 100644
--- a/fs/squashfs/Kconfig
+++ b/fs/squashfs/Kconfig
@@ -26,6 +26,17 @@ config SQUASHFS
          If unsure, say N.
+config SQUASHFS_XATTRS
+        bool "Squashfs XATTR support"
+        depends on SQUASHFS
+        default n
+        help
+          Saying Y here includes support for extended attributes (xattrs).
+          Xattrs are name:value pairs associated with inodes by
+          the kernel or by users (see the attr(5) manual page).
+          If unsure, say N.
 config SQUASHFS_EMBEDDED
        bool "Additional option for memory-constrained systems" 
diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile
index df8a19ef870d..2cee3e9fa452 100644
--- a/fs/squashfs/Makefile
+++ b/fs/squashfs/Makefile
@@ -5,3 +5,5 @@
 obj-$(CONFIG_SQUASHFS) += squashfs.o
 squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o
 squashfs-y += namei.o super.o symlink.o zlib_wrapper.o decompressor.o
+squashfs-$(CONFIG_SQUASHFS_XATTRS) += xattr.o xattr_id.o
diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c
index 49daaf669e41..62e63ad25075 100644
--- a/fs/squashfs/inode.c
+++ b/fs/squashfs/inode.c
@@ -40,11 +40,13 @@
 #include <linux/fs.h>
 #include <linux/vfs.h>
+#include <linux/xattr.h>
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
 #include "squashfs_fs_i.h"
 #include "squashfs.h"
+#include "xattr.h"
 /*
 * Initialise VFS inode with the base inode information common to all
@@ -111,6 +113,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
        int err, type, offset = SQUASHFS_INODE_OFFSET(ino);
        union squashfs_inode squashfs_ino;
        struct squashfs_base_inode *sqshb_ino = &squashfs_ino.base;
+        int xattr_id = SQUASHFS_INVALID_XATTR;
        TRACE("Entered squashfs_read_inode\n");
@@ -199,8 +202,10 @@ int squashfs_read_inode(struct inode *inode, long long ino)
                        frag_offset = 0;
                }
+                xattr_id = le32_to_cpu(sqsh_ino->xattr);
                inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
                inode->i_size = le64_to_cpu(sqsh_ino->file_size);
+                inode->i_op = &squashfs_inode_ops;
                inode->i_fop = &generic_ro_fops;
                inode->i_mode |= S_IFREG;
                inode->i_blocks = ((inode->i_size -
@@ -251,6 +256,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
                if (err < 0)
                        goto failed_read;
+                xattr_id = le32_to_cpu(sqsh_ino->xattr);
                inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
                inode->i_size = le32_to_cpu(sqsh_ino->file_size);
                inode->i_op = &squashfs_dir_inode_ops;
@@ -280,21 +286,33 @@ int squashfs_read_inode(struct inode *inode, long long ino)
                inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
                inode->i_size = le32_to_cpu(sqsh_ino->symlink_size);
-                inode->i_op = &page_symlink_inode_operations;
+                inode->i_op = &squashfs_symlink_inode_ops;
                inode->i_data.a_ops = &squashfs_symlink_aops;
                inode->i_mode |= S_IFLNK;
                squashfs_i(inode)->start = block;
                squashfs_i(inode)->offset = offset;
+                if (type == SQUASHFS_LSYMLINK_TYPE) {
+                        __le32 xattr;
+                        err = squashfs_read_metadata(sb, NULL, &block,
+                                                &offset, inode->i_size);
+                        if (err < 0)
+                                goto failed_read;
+                        err = squashfs_read_metadata(sb, &xattr, &block,
+                                                &offset, sizeof(xattr));
+                        if (err < 0)
+                                goto failed_read;
+                        xattr_id = le32_to_cpu(xattr);
+                }
                TRACE("Symbolic link inode %x:%x, start_block %llx, offset "
                                "%x\n", SQUASHFS_INODE_BLK(ino), offset,
                                block, offset);
                break;
        }
        case SQUASHFS_BLKDEV_TYPE:
-        case SQUASHFS_CHRDEV_TYPE:
+        case SQUASHFS_CHRDEV_TYPE: {
-        case SQUASHFS_LBLKDEV_TYPE:
-        case SQUASHFS_LCHRDEV_TYPE: {
                struct squashfs_dev_inode *sqsh_ino = &squashfs_ino.dev;
                unsigned int rdev;
@@ -315,10 +333,32 @@ int squashfs_read_inode(struct inode *inode, long long ino)
                                SQUASHFS_INODE_BLK(ino), offset, rdev);
                break;
        }
+        case SQUASHFS_LBLKDEV_TYPE:
+        case SQUASHFS_LCHRDEV_TYPE: {
+                struct squashfs_ldev_inode *sqsh_ino = &squashfs_ino.ldev;
+                unsigned int rdev;
+                err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
+                                sizeof(*sqsh_ino));
+                if (err < 0)
+                        goto failed_read;
+                if (type == SQUASHFS_LCHRDEV_TYPE)
+                        inode->i_mode |= S_IFCHR;
+                else
+                        inode->i_mode |= S_IFBLK;
+                xattr_id = le32_to_cpu(sqsh_ino->xattr);
+                inode->i_op = &squashfs_inode_ops;
+                inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
+                rdev = le32_to_cpu(sqsh_ino->rdev);
+                init_special_inode(inode, inode->i_mode, new_decode_dev(rdev));
+                TRACE("Device inode %x:%x, rdev %x\n",
+                                SQUASHFS_INODE_BLK(ino), offset, rdev);
+                break;
+        }
        case SQUASHFS_FIFO_TYPE:
-        case SQUASHFS_SOCKET_TYPE:
+        case SQUASHFS_SOCKET_TYPE: {
-        case SQUASHFS_LFIFO_TYPE:
-        case SQUASHFS_LSOCKET_TYPE: {
                struct squashfs_ipc_inode *sqsh_ino = &squashfs_ino.ipc;
                err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
@@ -334,14 +374,52 @@ int squashfs_read_inode(struct inode *inode, long long ino)
                init_special_inode(inode, inode->i_mode, 0);
                break;
        }
+        case SQUASHFS_LFIFO_TYPE:
+        case SQUASHFS_LSOCKET_TYPE: {
+                struct squashfs_lipc_inode *sqsh_ino = &squashfs_ino.lipc;
+                err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
+                                sizeof(*sqsh_ino));
+                if (err < 0)
+                        goto failed_read;
+                if (type == SQUASHFS_LFIFO_TYPE)
+                        inode->i_mode |= S_IFIFO;
+                else
+                        inode->i_mode |= S_IFSOCK;
+                xattr_id = le32_to_cpu(sqsh_ino->xattr);
+                inode->i_op = &squashfs_inode_ops;
+                inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
+                init_special_inode(inode, inode->i_mode, 0);
+                break;
+        }
        default:
                ERROR("Unknown inode type %d in squashfs_iget!\n", type);
                return -EINVAL;
        }
+        if (xattr_id != SQUASHFS_INVALID_XATTR && msblk->xattr_id_table) {
+                err = squashfs_xattr_lookup(sb, xattr_id,
+                                        &squashfs_i(inode)->xattr_count,
+                                        &squashfs_i(inode)->xattr_size,
+                                        &squashfs_i(inode)->xattr);
+                if (err < 0)
+                        goto failed_read;
+                inode->i_blocks += ((squashfs_i(inode)->xattr_size - 1) >> 9)
+                                + 1;
+        } else
+                squashfs_i(inode)->xattr_count = 0;
        return 0;
 failed_read:
        ERROR("Unable to read inode 0x%llx\n", ino);
        return err;
 }
+const struct inode_operations squashfs_inode_ops = {    /* xattr access only; set in squashfs_read_inode() */
+        .getxattr = generic_getxattr,   /* not defined in squashfs; presumably from <linux/xattr.h> - confirm */
+        .listxattr = squashfs_listxattr /* squashfs's own lister, declared in squashfs.h (xattr.c) */
+};
diff --git a/fs/squashfs/namei.c b/fs/squashfs/namei.c
index 5266bd8ad932..7a9464d08cf6 100644
--- a/fs/squashfs/namei.c
+++ b/fs/squashfs/namei.c
@@ -57,11 +57,13 @@
 #include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/dcache.h>
+#include <linux/xattr.h>
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
 #include "squashfs_fs_i.h"
 #include "squashfs.h"
+#include "xattr.h"
 /*
 * Lookup name in the directory index, returning the location of the metadata
@@ -237,5 +239,7 @@ failed:
 const struct inode_operations squashfs_dir_inode_ops = {
-        .lookup = squashfs_lookup
+        .lookup = squashfs_lookup,
+        .getxattr = generic_getxattr,
+        .listxattr = squashfs_listxattr
 };
diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h
index fe2587af5512..733a17c42945 100644
--- a/fs/squashfs/squashfs.h
+++ b/fs/squashfs/squashfs.h
@@ -73,8 +73,11 @@ extern struct inode *squashfs_iget(struct super_block *, long long,
                                unsigned int);
 extern int squashfs_read_inode(struct inode *, long long);
+/* xattr.c */
+extern ssize_t squashfs_listxattr(struct dentry *, char *, size_t);
 /*
- * Inodes, files and decompressor operations
+ * Inodes, files, decompressor and xattr operations
 */
 /* dir.c */
@@ -86,11 +89,18 @@ extern const struct export_operations squashfs_export_ops;
 /* file.c */
 extern const struct address_space_operations squashfs_aops;
+/* inode.c */
+extern const struct inode_operations squashfs_inode_ops;
 /* namei.c */
 extern const struct inode_operations squashfs_dir_inode_ops;
 /* symlink.c */
 extern const struct address_space_operations squashfs_symlink_aops;
+extern const struct inode_operations squashfs_symlink_inode_ops;
+/* xattr.c */
+extern const struct xattr_handler *squashfs_xattr_handlers[];
 /* zlib_wrapper.c */
 extern const struct squashfs_decompressor squashfs_zlib_comp_ops;
diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h
index 79024245ea00..8eabb808b78d 100644
--- a/fs/squashfs/squashfs_fs.h
+++ b/fs/squashfs/squashfs_fs.h
@@ -46,6 +46,7 @@
 #define SQUASHFS_NAME_LEN               256
 #define SQUASHFS_INVALID_FRAG           (0xffffffffU)
+#define SQUASHFS_INVALID_XATTR          (0xffffffffU)
 #define SQUASHFS_INVALID_BLK            (-1LL)
 /* Filesystem flags */
@@ -96,6 +97,13 @@
 #define SQUASHFS_LFIFO_TYPE             13
 #define SQUASHFS_LSOCKET_TYPE           14
+/*
+ * Xattr types.  The low byte of an entry's on-disk type
+ * (SQUASHFS_XATTR_PREFIX_MASK) selects the namespace prefix;
+ * SQUASHFS_XATTR_VALUE_OOL is a flag bit above that byte which marks the
+ * value as stored out of line (the entry then holds a reference to it).
+ */
+#define SQUASHFS_XATTR_USER             0
+#define SQUASHFS_XATTR_TRUSTED          1
+#define SQUASHFS_XATTR_SECURITY         2
+#define SQUASHFS_XATTR_VALUE_OOL        256
+#define SQUASHFS_XATTR_PREFIX_MASK      0xff
 /* Flag whether block is compressed or uncompressed, bit is set if block is
 * uncompressed */
 #define SQUASHFS_COMPRESSED_BIT         (1 << 15)
@@ -174,6 +182,24 @@
 #define SQUASHFS_ID_BLOCK_BYTES(A)      (SQUASHFS_ID_BLOCKS(A) *\
                                        sizeof(u64))
+/*
+ * xattr id lookup table defines
+ *
+ * A is an xattr id (or a count of ids).  The id table is an array of
+ * struct squashfs_xattr_id split into SQUASHFS_METADATA_SIZE sized blocks.
+ */
+/* byte offset of entry A in the table / bytes occupied by A entries */
+#define SQUASHFS_XATTR_BYTES(A)         ((A) * sizeof(struct squashfs_xattr_id))
+/* index of the metadata block holding entry A */
+#define SQUASHFS_XATTR_BLOCK(A)         (SQUASHFS_XATTR_BYTES(A) / \
+                                        SQUASHFS_METADATA_SIZE)
+/* byte offset of entry A inside that metadata block */
+#define SQUASHFS_XATTR_BLOCK_OFFSET(A)  (SQUASHFS_XATTR_BYTES(A) % \
+                                        SQUASHFS_METADATA_SIZE)
+/* number of metadata blocks needed for A entries, rounded up */
+#define SQUASHFS_XATTR_BLOCKS(A)        ((SQUASHFS_XATTR_BYTES(A) + \
+                                        SQUASHFS_METADATA_SIZE - 1) / \
+                                        SQUASHFS_METADATA_SIZE)
+/* size in bytes of the u64 index that locates those metadata blocks */
+#define SQUASHFS_XATTR_BLOCK_BYTES(A)   (SQUASHFS_XATTR_BLOCKS(A) *\
+                                        sizeof(u64))
+/* split a packed xattr reference: bits above 16 are the block part ... */
+#define SQUASHFS_XATTR_BLK(A)           ((unsigned int) ((A) >> 16))
+/* ... and the low 16 bits are the offset part */
+#define SQUASHFS_XATTR_OFFSET(A)        ((unsigned int) ((A) & 0xffff))
 /* cached data constants for filesystem */
 #define SQUASHFS_CACHED_BLKS            8
@@ -228,7 +254,7 @@ struct squashfs_super_block {
        __le64                  root_inode;
        __le64                  bytes_used;
        __le64                  id_table_start;
-        __le64                  xattr_table_start;
+        __le64                  xattr_id_table_start;
        __le64                  inode_table_start;
        __le64                  directory_table_start;
        __le64                  fragment_table_start;
@@ -261,6 +287,17 @@ struct squashfs_ipc_inode {
        __le32                  nlink;
 };
+/*
+ * On-disk inode layout ending in an xattr field; reachable as the "lipc"
+ * member of union squashfs_inode.
+ * NOTE(review): presumably the extended form used for the
+ * SQUASHFS_LFIFO_TYPE / SQUASHFS_LSOCKET_TYPE inodes - confirm in inode.c.
+ */
+struct squashfs_lipc_inode {
+        __le16                  inode_type;
+        __le16                  mode;
+        __le16                  uid;
+        __le16                  guid;
+        __le32                  mtime;
+        __le32                  inode_number;
+        __le32                  nlink;
+        /* presumably an xattr id, SQUASHFS_INVALID_XATTR if none - confirm */
+        __le32                  xattr;
+};
 struct squashfs_dev_inode {
        __le16                  inode_type;
        __le16                  mode;
@@ -272,6 +309,18 @@ struct squashfs_dev_inode {
        __le32                  rdev;
 };
+/*
+ * On-disk device inode layout with a trailing xattr field; reachable as the
+ * "ldev" member of union squashfs_inode.
+ * NOTE(review): presumably the extended counterpart of
+ * struct squashfs_dev_inode - confirm in inode.c.
+ */
+struct squashfs_ldev_inode {
+        __le16                  inode_type;
+        __le16                  mode;
+        __le16                  uid;
+        __le16                  guid;
+        __le32                  mtime;
+        __le32                  inode_number;
+        __le32                  nlink;
+        __le32                  rdev;
+        /* presumably an xattr id, SQUASHFS_INVALID_XATTR if none - confirm */
+        __le32                  xattr;
+};
 struct squashfs_symlink_inode {
        __le16                  inode_type;
        __le16                  mode;
@@ -349,12 +398,14 @@ struct squashfs_ldir_inode {
 union squashfs_inode {
        struct squashfs_base_inode              base;
        struct squashfs_dev_inode               dev;
+        struct squashfs_ldev_inode              ldev;
        struct squashfs_symlink_inode           symlink;
        struct squashfs_reg_inode               reg;
        struct squashfs_lreg_inode              lreg;
        struct squashfs_dir_inode               dir;
        struct squashfs_ldir_inode              ldir;
        struct squashfs_ipc_inode               ipc;
+        struct squashfs_lipc_inode              lipc;
 };
 struct squashfs_dir_entry {
@@ -377,4 +428,27 @@ struct squashfs_fragment_entry {
        unsigned int            unused;
 };
+/*
+ * On-disk header of one xattr entry.
+ *   type - namespace prefix index in the low byte, optionally or'ed with
+ *          SQUASHFS_XATTR_VALUE_OOL
+ *   size - length in bytes of the name that follows (no prefix, no NUL)
+ *   data - the name bytes; a C99 flexible array member, so sizeof() still
+ *          covers only the fixed header
+ */
+struct squashfs_xattr_entry {
+        __le16                  type;
+        __le16                  size;
+        char                    data[];
+};
+/*
+ * On-disk header of an xattr value.
+ *   vsize - length in bytes of the value that follows
+ *   value - the value bytes; a C99 flexible array member, so sizeof() still
+ *           covers only the fixed header
+ */
+struct squashfs_xattr_val {
+        __le32                  vsize;
+        char                    value[];
+};
+/*
+ * One entry of the xattr id table, as read by squashfs_xattr_lookup().
+ *   xattr - packed reference to the first xattr entry (decoded with
+ *           SQUASHFS_XATTR_BLK / SQUASHFS_XATTR_OFFSET)
+ *   count - number of xattr entries stored at that location
+ *   size  - size value handed back to the caller of squashfs_xattr_lookup()
+ */
+struct squashfs_xattr_id {
+        __le64                  xattr;
+        __le32                  count;
+        __le32                  size;
+};
+/*
+ * Header found at the superblock's xattr_id_table_start.  It is followed
+ * on disk by the u64 index read by squashfs_read_xattr_id_table().
+ *   xattr_table_start - start of the xattr metadata area
+ *   xattr_ids         - number of entries in the xattr id table
+ */
+struct squashfs_xattr_id_table {
+        __le64                  xattr_table_start;
+        __le32                  xattr_ids;
+        __le32                  unused;
+};
 #endif
diff --git a/fs/squashfs/squashfs_fs_i.h b/fs/squashfs/squashfs_fs_i.h
index fbfca30c0c68..d3e3a37f28a1 100644
--- a/fs/squashfs/squashfs_fs_i.h
+++ b/fs/squashfs/squashfs_fs_i.h
@@ -26,6 +26,9 @@
 struct squashfs_inode_info {
        u64             start;
        int             offset;
+        u64             xattr;
+        unsigned int    xattr_size;
+        int             xattr_count;
        union {
                struct {
                        u64             fragment_block;
diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h
index 2e77dc547e25..d9037a5215f0 100644
--- a/fs/squashfs/squashfs_fs_sb.h
+++ b/fs/squashfs/squashfs_fs_sb.h
@@ -61,6 +61,7 @@ struct squashfs_sb_info {
        int                                     next_meta_index;
        __le64                                  *id_table;
        __le64                                  *fragment_index;
+        __le64                                  *xattr_id_table;
        struct mutex                            read_data_mutex;
        struct mutex                            meta_index_mutex;
        struct meta_index                       *meta_index;
@@ -68,9 +69,11 @@ struct squashfs_sb_info {
        __le64                                  *inode_lookup_table;
        u64                                     inode_table;
        u64                                     directory_table;
+        u64                                     xattr_table;
        unsigned int                            block_size;
        unsigned short                          block_log;
        long long                               bytes_used;
        unsigned int                            inodes;
+        int                                     xattr_ids;
 };
 #endif
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 48b6f4a385a6..88b4f8606652 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -36,12 +36,14 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/magic.h>
+#include <linux/xattr.h>
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
 #include "squashfs_fs_i.h"
 #include "squashfs.h"
 #include "decompressor.h"
+#include "xattr.h"
 static struct file_system_type squashfs_fs_type;
 static const struct super_operations squashfs_super_ops;
@@ -82,7 +84,7 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
        long long root_inode;
        unsigned short flags;
        unsigned int fragments;
-        u64 lookup_table_start;
+        u64 lookup_table_start, xattr_id_table_start;
        int err;
        TRACE("Entered squashfs_fill_superblock\n");
@@ -139,13 +141,6 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
        if (msblk->decompressor == NULL)
                goto failed_mount;
-        /*
-         * Check if there's xattrs in the filesystem.  These are not
-         * supported in this version, so warn that they will be ignored.
-         */
-        if (le64_to_cpu(sblk->xattr_table_start) != SQUASHFS_INVALID_BLK)
-                ERROR("Xattrs in filesystem, these will be ignored\n");
        /* Check the filesystem does not extend beyond the end of the
           block device */
        msblk->bytes_used = le64_to_cpu(sblk->bytes_used);
@@ -253,7 +248,7 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
 allocate_lookup_table:
        lookup_table_start = le64_to_cpu(sblk->lookup_table_start);
        if (lookup_table_start == SQUASHFS_INVALID_BLK)
-                goto allocate_root;
+                goto allocate_xattr_table;
        /* Allocate and read inode lookup table */
        msblk->inode_lookup_table = squashfs_read_inode_lookup_table(sb,
@@ -266,6 +261,21 @@ allocate_lookup_table:
        sb->s_export_op = &squashfs_export_ops;
+allocate_xattr_table:
+        sb->s_xattr = squashfs_xattr_handlers;
+        xattr_id_table_start = le64_to_cpu(sblk->xattr_id_table_start);
+        if (xattr_id_table_start == SQUASHFS_INVALID_BLK)
+                goto allocate_root;
+        /* Allocate and read xattr id lookup table */
+        msblk->xattr_id_table = squashfs_read_xattr_id_table(sb,
+                xattr_id_table_start, &msblk->xattr_table, &msblk->xattr_ids);
+        if (IS_ERR(msblk->xattr_id_table)) {
+                err = PTR_ERR(msblk->xattr_id_table);
+                msblk->xattr_id_table = NULL;
+                if (err != -ENOTSUPP)
+                        goto failed_mount;
+        }
 allocate_root:
        root = new_inode(sb);
        if (!root) {
@@ -301,6 +311,7 @@ failed_mount:
        kfree(msblk->inode_lookup_table);
        kfree(msblk->fragment_index);
        kfree(msblk->id_table);
+        kfree(msblk->xattr_id_table);
        kfree(sb->s_fs_info);
        sb->s_fs_info = NULL;
        kfree(sblk);
@@ -355,6 +366,7 @@ static void squashfs_put_super(struct super_block *sb)
                kfree(sbi->fragment_index);
                kfree(sbi->meta_index);
                kfree(sbi->inode_lookup_table);
+                kfree(sbi->xattr_id_table);
                kfree(sb->s_fs_info);
                sb->s_fs_info = NULL;
        }
diff --git a/fs/squashfs/symlink.c b/fs/squashfs/symlink.c
index 32b911f4ee39..ec86434921e1 100644
--- a/fs/squashfs/symlink.c
+++ b/fs/squashfs/symlink.c
@@ -35,11 +35,13 @@
 #include <linux/kernel.h>
 #include <linux/string.h>
 #include <linux/pagemap.h>
+#include <linux/xattr.h>
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
 #include "squashfs_fs_i.h"
 #include "squashfs.h"
+#include "xattr.h"
 static int squashfs_symlink_readpage(struct file *file, struct page *page)
 {
@@ -114,3 +116,12 @@ error_out:
 const struct address_space_operations squashfs_symlink_aops = {
        .readpage = squashfs_symlink_readpage
 };
+/* Symlink inode operations: generic link handling plus xattr read access */
+const struct inode_operations squashfs_symlink_inode_ops = {
+        .listxattr = squashfs_listxattr,
+        .getxattr = generic_getxattr,
+        .put_link = page_put_link,
+        .follow_link = page_follow_link_light,
+        .readlink = generic_readlink,
+};
diff --git a/fs/squashfs/xattr.c b/fs/squashfs/xattr.c
new file mode 100644
index 000000000000..c7655e8b31cd
--- /dev/null
+++ b/fs/squashfs/xattr.c
@@ -0,0 +1,323 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2010
+ * Phillip Lougher <phillip@lougher.demon.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * xattr.c
+ */
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/fs.h>
+#include <linux/vfs.h>
+#include <linux/xattr.h>
+#include <linux/slab.h>
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs_fs_i.h"
+#include "squashfs.h"
+static const struct xattr_handler *squashfs_xattr_handler(int);
+/*
+ * List the xattr names of the inode behind dentry d.
+ *
+ * Walks the inode's xattr_count entries, starting at the location packed in
+ * squashfs_i(inode)->xattr.  For every entry whose handler reports a
+ * non-zero prefix length, prefix + name + '\0' is accounted for and, when
+ * buffer is non-NULL, written to it.  Entries without a handler, or whose
+ * handler returns 0, are skipped.
+ *
+ * Returns the number of bytes accounted for, -EOPNOTSUPP if the filesystem
+ * has no xattr id table, -ERANGE if a non-NULL buffer is too small, or the
+ * error returned by squashfs_read_metadata().
+ */
+ssize_t squashfs_listxattr(struct dentry *d, char *buffer,
+        size_t buffer_size)
+{
+        struct inode *inode = d->d_inode;
+        struct super_block *sb = inode->i_sb;
+        struct squashfs_sb_info *msblk = sb->s_fs_info;
+        u64 start = SQUASHFS_XATTR_BLK(squashfs_i(inode)->xattr)
+                                                 + msblk->xattr_table;
+        int offset = SQUASHFS_XATTR_OFFSET(squashfs_i(inode)->xattr);
+        int count = squashfs_i(inode)->xattr_count;
+        size_t rest = buffer_size;
+        int err;
+        /* check that the file system has xattrs */
+        if (msblk->xattr_id_table == NULL)
+                return -EOPNOTSUPP;
+        /* loop reading each xattr name */
+        while (count--) {
+                struct squashfs_xattr_entry entry;
+                struct squashfs_xattr_val val;
+                const struct xattr_handler *handler;
+                int name_size, prefix_size = 0;
+                err = squashfs_read_metadata(sb, &entry, &start, &offset,
+                                                        sizeof(entry));
+                if (err < 0)
+                        goto failed;
+                name_size = le16_to_cpu(entry.size);
+                handler = squashfs_xattr_handler(le16_to_cpu(entry.type));
+                /* the handler writes its prefix if it fits, returns its length */
+                if (handler)
+                        prefix_size = handler->list(d, buffer, rest, NULL,
+                                name_size, handler->flags);
+                if (prefix_size) {
+                        if (buffer) {
+                                if (prefix_size + name_size + 1 > rest) {
+                                        err = -ERANGE;
+                                        goto failed;
+                                }
+                                buffer += prefix_size;
+                        }
+                        /* buffer is passed through even when it is NULL */
+                        err = squashfs_read_metadata(sb, buffer, &start,
+                                &offset, name_size);
+                        if (err < 0)
+                                goto failed;
+                        if (buffer) {
+                                buffer[name_size] = '\0';
+                                buffer += name_size + 1;
+                        }
+                        /*
+                         * rest is unsigned and is reduced even when buffer is
+                         * NULL; it may wrap, but buffer_size - rest below
+                         * still yields the total.
+                         */
+                        rest -= prefix_size + name_size + 1;
+                } else  {
+                        /* no handler or insufficient privileges, so skip */
+                        err = squashfs_read_metadata(sb, NULL, &start,
+                                &offset, name_size);
+                        if (err < 0)
+                                goto failed;
+                }
+                /* skip remaining xattr entry */
+                err = squashfs_read_metadata(sb, &val, &start, &offset,
+                                                sizeof(val));
+                if (err < 0)
+                        goto failed;
+                err = squashfs_read_metadata(sb, NULL, &start, &offset,
+                                                le32_to_cpu(val.vsize));
+                if (err < 0)
+                        goto failed;
+        }
+        err = buffer_size - rest;
+failed:
+        return err;
+}
+/*
+ * Look up one xattr of inode and optionally copy out its value.
+ *
+ * name_index is the SQUASHFS_XATTR_* prefix index and name the attribute
+ * name without its prefix.  The inode's xattr entries are scanned in order;
+ * an entry matches when its prefix index, name length and name bytes all
+ * agree.  If the matching entry has SQUASHFS_XATTR_VALUE_OOL set, the
+ * stored value is a 64-bit reference which is followed to the real value.
+ *
+ * Returns the value size on a match (the value is copied only when buffer
+ * is non-NULL), -ENODATA if no entry matches, -ERANGE if a non-NULL buffer
+ * is smaller than the value, -ENOMEM if the name scratch buffer cannot be
+ * allocated, or the error returned by squashfs_read_metadata().
+ */
+static int squashfs_xattr_get(struct inode *inode, int name_index,
+        const char *name, void *buffer, size_t buffer_size)
+{
+        struct super_block *sb = inode->i_sb;
+        struct squashfs_sb_info *msblk = sb->s_fs_info;
+        u64 start = SQUASHFS_XATTR_BLK(squashfs_i(inode)->xattr)
+                                                 + msblk->xattr_table;
+        int offset = SQUASHFS_XATTR_OFFSET(squashfs_i(inode)->xattr);
+        int count = squashfs_i(inode)->xattr_count;
+        int name_len = strlen(name);
+        int err, vsize;
+        /* scratch space for candidate names; only same-length names are read */
+        char *target = kmalloc(name_len, GFP_KERNEL);
+        if (target == NULL)
+                return  -ENOMEM;
+        /* loop reading each xattr name */
+        for (; count; count--) {
+                struct squashfs_xattr_entry entry;
+                struct squashfs_xattr_val val;
+                int type, prefix, name_size;
+                err = squashfs_read_metadata(sb, &entry, &start, &offset,
+                                                        sizeof(entry));
+                if (err < 0)
+                        goto failed;
+                name_size = le16_to_cpu(entry.size);
+                type = le16_to_cpu(entry.type);
+                prefix = type & SQUASHFS_XATTR_PREFIX_MASK;
+                /* read the name only if it can match, otherwise skip over it */
+                if (prefix == name_index && name_size == name_len)
+                        err = squashfs_read_metadata(sb, target, &start,
+                                                &offset, name_size);
+                else
+                        err = squashfs_read_metadata(sb, NULL, &start,
+                                                &offset, name_size);
+                if (err < 0)
+                        goto failed;
+                if (prefix == name_index && name_size == name_len &&
+                                        strncmp(target, name, name_size) == 0) {
+                        /* found xattr */
+                        if (type & SQUASHFS_XATTR_VALUE_OOL) {
+                                __le64 xattr;
+                                /* val is a reference to the real location */
+                                err = squashfs_read_metadata(sb, &val, &start,
+                                                &offset, sizeof(val));
+                                if (err < 0)
+                                        goto failed;
+                                err = squashfs_read_metadata(sb, &xattr, &start,
+                                         &offset, sizeof(xattr));
+                                if (err < 0)
+                                        goto failed;
+                                /* redirect the cursor to the out-of-line value */
+                                xattr = le64_to_cpu(xattr);
+                                start = SQUASHFS_XATTR_BLK(xattr) +
+                                                        msblk->xattr_table;
+                                offset = SQUASHFS_XATTR_OFFSET(xattr);
+                        }
+                        /* read xattr value */
+                        err = squashfs_read_metadata(sb, &val, &start, &offset,
+                                                        sizeof(val));
+                        if (err < 0)
+                                goto failed;
+                        vsize = le32_to_cpu(val.vsize);
+                        if (buffer) {
+                                if (vsize > buffer_size) {
+                                        err = -ERANGE;
+                                        goto failed;
+                                }
+                                err = squashfs_read_metadata(sb, buffer, &start,
+                                         &offset, vsize);
+                                if (err < 0)
+                                        goto failed;
+                        }
+                        /* leaving the loop early keeps count non-zero */
+                        break;
+                }
+                /* no match, skip remaining xattr entry */
+                err = squashfs_read_metadata(sb, &val, &start, &offset,
+                                                        sizeof(val));
+                if (err < 0)
+                        goto failed;
+                err = squashfs_read_metadata(sb, NULL, &start, &offset,
+                                                le32_to_cpu(val.vsize));
+                if (err < 0)
+                        goto failed;
+        }
+        /* count reaches zero only if every entry was examined without a match */
+        err = count ? vsize : -ENODATA;
+failed:
+        kfree(target);
+        return err;
+}
+/*
+ * User namespace support
+ */
+/*
+ * Emit the XATTR_USER_PREFIX prefix into list when a buffer is supplied and
+ * is large enough; always report the prefix length.
+ */
+static size_t squashfs_user_list(struct dentry *d, char *list, size_t list_size,
+        const char *name, size_t name_len, int type)
+{
+        const size_t prefix_len = XATTR_USER_PREFIX_LEN;
+        if (list != NULL && list_size >= prefix_len)
+                memcpy(list, XATTR_USER_PREFIX, prefix_len);
+        return prefix_len;
+}
+/* Fetch a SQUASHFS_XATTR_USER attribute; an empty name is rejected */
+static int squashfs_user_get(struct dentry *d, const char *name, void *buffer,
+        size_t size, int type)
+{
+        struct inode *inode;
+        if (*name == '\0')
+                return -EINVAL;
+        inode = d->d_inode;
+        return squashfs_xattr_get(inode, SQUASHFS_XATTR_USER, name, buffer,
+                size);
+}
+/* Handler for the XATTR_USER_PREFIX namespace; only list and get are set */
+static const struct xattr_handler squashfs_xattr_user_handler = {
+        .get    = squashfs_user_get,
+        .list   = squashfs_user_list,
+        .prefix = XATTR_USER_PREFIX,
+};
+/*
+ * Trusted namespace support
+ */
+/*
+ * Emit the XATTR_TRUSTED_PREFIX prefix into list when a buffer is supplied
+ * and is large enough, and report the prefix length.  Callers lacking
+ * CAP_SYS_ADMIN get 0, which makes squashfs_listxattr() skip the entry.
+ */
+static size_t squashfs_trusted_list(struct dentry *d, char *list,
+        size_t list_size, const char *name, size_t name_len, int type)
+{
+        const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
+        if (!capable(CAP_SYS_ADMIN))
+                return 0;
+        if (list != NULL && list_size >= prefix_len)
+                memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len);
+        return prefix_len;
+}
+/* Fetch a SQUASHFS_XATTR_TRUSTED attribute; an empty name is rejected */
+static int squashfs_trusted_get(struct dentry *d, const char *name,
+        void *buffer, size_t size, int type)
+{
+        struct inode *inode;
+        if (*name == '\0')
+                return -EINVAL;
+        inode = d->d_inode;
+        return squashfs_xattr_get(inode, SQUASHFS_XATTR_TRUSTED, name, buffer,
+                size);
+}
+/* Handler for the XATTR_TRUSTED_PREFIX namespace; only list and get are set */
+static const struct xattr_handler squashfs_xattr_trusted_handler = {
+        .get    = squashfs_trusted_get,
+        .list   = squashfs_trusted_list,
+        .prefix = XATTR_TRUSTED_PREFIX,
+};
+/*
+ * Security namespace support
+ */
+/*
+ * Emit the XATTR_SECURITY_PREFIX prefix into list when a buffer is supplied
+ * and is large enough; always report the prefix length.
+ */
+static size_t squashfs_security_list(struct dentry *d, char *list,
+        size_t list_size, const char *name, size_t name_len, int type)
+{
+        const size_t prefix_len = XATTR_SECURITY_PREFIX_LEN;
+        if (list != NULL && list_size >= prefix_len)
+                memcpy(list, XATTR_SECURITY_PREFIX, prefix_len);
+        return prefix_len;
+}
+/* Fetch a SQUASHFS_XATTR_SECURITY attribute; an empty name is rejected */
+static int squashfs_security_get(struct dentry *d, const char *name,
+        void *buffer, size_t size, int type)
+{
+        struct inode *inode;
+        if (*name == '\0')
+                return -EINVAL;
+        inode = d->d_inode;
+        return squashfs_xattr_get(inode, SQUASHFS_XATTR_SECURITY, name, buffer,
+                size);
+}
+/* Handler for the XATTR_SECURITY_PREFIX namespace; only list and get are set */
+static const struct xattr_handler squashfs_xattr_security_handler = {
+        .get    = squashfs_security_get,
+        .list   = squashfs_security_list,
+        .prefix = XATTR_SECURITY_PREFIX,
+};
+/*
+ * Map an on-disk xattr type to its handler.  Returns NULL when the type
+ * carries bits outside the prefix mask and the out-of-line flag, or when
+ * the prefix index is not one of the known namespaces.
+ */
+static inline const struct xattr_handler *squashfs_xattr_handler(int type)
+{
+        const int known_bits = SQUASHFS_XATTR_PREFIX_MASK |
+                                        SQUASHFS_XATTR_VALUE_OOL;
+        const struct xattr_handler *handler = NULL;
+        /* only look at the prefix when no unrecognised bits are set */
+        if ((type & ~known_bits) == 0) {
+                switch (type & SQUASHFS_XATTR_PREFIX_MASK) {
+                case SQUASHFS_XATTR_USER:
+                        handler = &squashfs_xattr_user_handler;
+                        break;
+                case SQUASHFS_XATTR_TRUSTED:
+                        handler = &squashfs_xattr_trusted_handler;
+                        break;
+                case SQUASHFS_XATTR_SECURITY:
+                        handler = &squashfs_xattr_security_handler;
+                        break;
+                default:
+                        /* ignore unrecognised type */
+                        break;
+                }
+        }
+        return handler;
+}
+/*
+ * NULL-terminated table of the supported xattr namespaces; installed as
+ * sb->s_xattr by squashfs_fill_super().
+ */
+const struct xattr_handler *squashfs_xattr_handlers[] = {
+        &squashfs_xattr_user_handler,
+        &squashfs_xattr_trusted_handler,
+        &squashfs_xattr_security_handler,
+        NULL
+};
diff --git a/fs/squashfs/xattr.h b/fs/squashfs/xattr.h
new file mode 100644
index 000000000000..9da071ae181c
--- /dev/null
+++ b/fs/squashfs/xattr.h
@@ -0,0 +1,46 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2010
+ * Phillip Lougher <phillip@lougher.demon.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * xattr.h
+ */
+/*
+ * Prototypes for the xattr id table code, with fallbacks used when xattr
+ * support is configured out.
+ */
+#ifdef CONFIG_SQUASHFS_XATTRS
+extern __le64 *squashfs_read_xattr_id_table(struct super_block *, u64,
+                u64 *, int *);
+/*
+ * The size argument is unsigned int * so that this prototype matches the
+ * definition in xattr_id.c and the unsigned int xattr_size field passed in.
+ */
+extern int squashfs_xattr_lookup(struct super_block *, unsigned int, int *,
+                unsigned int *, unsigned long long *);
+#else
+/* Without xattr support: report the table as unsupported so mount goes on */
+static inline __le64 *squashfs_read_xattr_id_table(struct super_block *sb,
+                u64 start, u64 *xattr_table_start, int *xattr_ids)
+{
+        ERROR("Xattrs in filesystem, these will be ignored\n");
+        return ERR_PTR(-ENOTSUPP);
+}
+/* Without xattr support: succeed without touching the output arguments */
+static inline int squashfs_xattr_lookup(struct super_block *sb,
+                unsigned int index, int *count, unsigned int *size,
+                unsigned long long *xattr)
+{
+        return 0;
+}
+/* the inode operation tables and sb->s_xattr get NULL entries instead */
+#define squashfs_listxattr NULL
+#define generic_getxattr NULL
+#define squashfs_xattr_handlers NULL
+#endif
diff --git a/fs/squashfs/xattr_id.c b/fs/squashfs/xattr_id.c
new file mode 100644
index 000000000000..cfb41106098f
--- /dev/null
+++ b/fs/squashfs/xattr_id.c
@@ -0,0 +1,100 @@
+/*
+ * Squashfs - a compressed read only filesystem for Linux
+ *
+ * Copyright (c) 2010
+ * Phillip Lougher <phillip@lougher.demon.co.uk>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2,
+ * or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ *
+ * xattr_id.c
+ */
+/*
+ * This file implements code to map the 32-bit xattr id stored in the inode
+ * into the on disk location of the xattr data.
+ */
+#include <linux/fs.h>
+#include <linux/vfs.h>
+#include <linux/slab.h>
+#include "squashfs_fs.h"
+#include "squashfs_fs_sb.h"
+#include "squashfs_fs_i.h"
+#include "squashfs.h"
+/*
+ * Map xattr id using the xattr id look up table
+ *
+ * index is the xattr id taken from the on-disk inode.  On success the
+ * packed xattr location, the size and the entry count stored for that id
+ * are returned through xattr, size and count, and 0 is returned.
+ *
+ * Returns -EINVAL if the filesystem has no xattr id table or index is not
+ * below the number of ids recorded at mount time, otherwise the error from
+ * squashfs_read_metadata().
+ */
+int squashfs_xattr_lookup(struct super_block *sb, unsigned int index,
+                int *count, unsigned int *size, unsigned long long *xattr)
+{
+        struct squashfs_sb_info *msblk = sb->s_fs_info;
+        int block = SQUASHFS_XATTR_BLOCK(index);
+        int offset = SQUASHFS_XATTR_BLOCK_OFFSET(index);
+        u64 start_block;
+        struct squashfs_xattr_id id;
+        int err;
+        /*
+         * index comes straight from disk: refuse anything that would read
+         * outside (or through a missing) xattr_id_table.
+         */
+        if (msblk->xattr_id_table == NULL || msblk->xattr_ids <= 0 ||
+                        index >= (unsigned int) msblk->xattr_ids)
+                return -EINVAL;
+        start_block = le64_to_cpu(msblk->xattr_id_table[block]);
+        err = squashfs_read_metadata(sb, &id, &start_block, &offset,
+                                                        sizeof(id));
+        if (err < 0)
+                return err;
+        *xattr = le64_to_cpu(id.xattr);
+        *size = le32_to_cpu(id.size);
+        *count = le32_to_cpu(id.count);
+        return 0;
+}
+/*
+ * Read uncompressed xattr id lookup table indexes from disk into memory
+ *
+ * start is the location of the struct squashfs_xattr_id_table header; the
+ * index follows it directly.  The start of the xattr metadata area and the
+ * number of xattr ids are returned through xattr_table_start and xattr_ids.
+ *
+ * Returns the kmalloc'ed index (the caller frees it with kfree()), or an
+ * ERR_PTR(): -EINVAL if the on-disk id count is not positive, -ENOMEM on
+ * allocation failure, or the error from squashfs_read_table().
+ */
+__le64 *squashfs_read_xattr_id_table(struct super_block *sb, u64 start,
+                u64 *xattr_table_start, int *xattr_ids)
+{
+        unsigned int len;
+        __le64 *xid_table;
+        struct squashfs_xattr_id_table id_table;
+        int err;
+        err = squashfs_read_table(sb, &id_table, start, sizeof(id_table));
+        if (err < 0) {
+                ERROR("unable to read xattr id table\n");
+                return ERR_PTR(err);
+        }
+        *xattr_table_start = le64_to_cpu(id_table.xattr_table_start);
+        *xattr_ids = le32_to_cpu(id_table.xattr_ids);
+        /*
+         * The count is read from disk and stored in an int: a zero or
+         * negative value would give an empty or bogus index allocation
+         * that later lookups would index into.
+         */
+        if (*xattr_ids <= 0) {
+                ERROR("invalid number of xattr ids in xattr id table\n");
+                return ERR_PTR(-EINVAL);
+        }
+        len = SQUASHFS_XATTR_BLOCK_BYTES(*xattr_ids);
+        TRACE("In read_xattr_index_table, length %d\n", len);
+        /* Allocate xattr id lookup table indexes */
+        xid_table = kmalloc(len, GFP_KERNEL);
+        if (xid_table == NULL) {
+                ERROR("Failed to allocate xattr id index table\n");
+                return ERR_PTR(-ENOMEM);
+        }
+        err = squashfs_read_table(sb, xid_table, start + sizeof(id_table), len);
+        if (err < 0) {
+                ERROR("unable to read xattr id index table\n");
+                kfree(xid_table);
+                return ERR_PTR(err);
+        }
+        return xid_table;
+}
diff --git a/fs/super.c b/fs/super.c
index 69688b15f1fa..5c35bc7a499e 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -24,7 +24,6 @@
 #include <linux/slab.h>
 #include <linux/acct.h>
 #include <linux/blkdev.h>
-#include <linux/quotaops.h>
 #include <linux/mount.h>
 #include <linux/security.h>
 #include <linux/writeback.h>            /* for the emergency remount stuff */
@@ -94,8 +93,6 @@ static struct super_block *alloc_super(struct file_system_type *type)
                init_rwsem(&s->s_dquot.dqptr_sem);
                init_waitqueue_head(&s->s_wait_unfrozen);
                s->s_maxbytes = MAX_NON_LFS;
-                s->dq_op = sb_dquot_ops;
-                s->s_qcop = sb_quotactl_ops;
                s->s_op = &default_op;
                s->s_time_gran = 1000000000;
        }
@@ -160,7 +157,6 @@ void deactivate_locked_super(struct super_block *s)
 {
        struct file_system_type *fs = s->s_type;
        if (atomic_dec_and_test(&s->s_active)) {
-                vfs_dq_off(s, 0);
                fs->kill_sb(s);
                put_filesystem(fs);
                put_super(s);
@@ -524,7 +520,7 @@ rescan:
 int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
 {
        int retval;
-        int remount_rw, remount_ro;
+        int remount_ro;
        if (sb->s_frozen != SB_UNFROZEN)
                return -EBUSY;
@@ -540,7 +536,6 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
        sync_filesystem(sb);
        remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY);
-        remount_rw = !(flags & MS_RDONLY) && (sb->s_flags & MS_RDONLY);
        /* If we are remounting RDONLY and current sb is read/write,
           make sure there are no rw files opened */
@@ -549,9 +544,6 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
                        mark_files_ro(sb);
                else if (!fs_may_remount_ro(sb))
                        return -EBUSY;
-                retval = vfs_dq_off(sb, 1);
-                if (retval < 0 && retval != -ENOSYS)
-                        return -EBUSY;
        }
        if (sb->s_op->remount_fs) {
@@ -560,8 +552,7 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
                        return retval;
        }
        sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK);
-        if (remount_rw)
-                vfs_dq_quota_on_remount(sb);
        /*
         * Some filesystems modify their metadata via some other path than the
         * bdev buffer cache (eg. use a private mapping, or directories in
@@ -946,8 +937,8 @@ out:
 EXPORT_SYMBOL_GPL(vfs_kern_mount);
 /**
- * freeze_super -- lock the filesystem and force it into a consistent state
+ * freeze_super - lock the filesystem and force it into a consistent state
- * @super: the super to lock
+ * @sb: the super to lock
 *
 * Syncs the super to make sure the filesystem is consistent and calls the fs's
 * freeze_fs.  Subsequent calls to this without first thawing the fs will return
diff --git a/fs/sync.c b/fs/sync.c
index 5a537ccd2e85..15aa6f03b2da 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -130,12 +130,10 @@ void emergency_sync(void)
 /*
 * Generic function to fsync a file.
- *
- * filp may be NULL if called via the msync of a vma.
 */
-int file_fsync(struct file *filp, struct dentry *dentry, int datasync)
+int file_fsync(struct file *filp, int datasync)
 {
-        struct inode * inode = dentry->d_inode;
+        struct inode *inode = filp->f_mapping->host;
        struct super_block * sb;
        int ret, err;
@@ -183,7 +181,7 @@ int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync)
         * livelocks in fsync_buffers_list().
         */
        mutex_lock(&mapping->host->i_mutex);
-        err = file->f_op->fsync(file, file->f_path.dentry, datasync);
+        err = file->f_op->fsync(file, datasync);
        if (!ret)
                ret = err;
        mutex_unlock(&mapping->host->i_mutex);
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index bbd77e95cf7f..bde1a4c3679a 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -117,13 +117,11 @@ int sysfs_setattr(struct dentry *dentry, struct iattr *iattr)
        if (error)
                goto out;
-        iattr->ia_valid &= ~ATTR_SIZE; /* ignore size changes */
+        /* this ignores size changes */
+        generic_setattr(inode, iattr);
-        error = inode_setattr(inode, iattr);
-        if (error)
-                goto out;
        error = sysfs_sd_setattr(sd, iattr);
 out:
        mutex_unlock(&sysfs_mutex);
        return error;
diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c
index 1dabed286b4c..79941e4964a4 100644
--- a/fs/sysv/dir.c
+++ b/fs/sysv/dir.c
@@ -24,7 +24,7 @@ const struct file_operations sysv_dir_operations = {
        .llseek         = generic_file_llseek,
        .read           = generic_read_dir,
        .readdir        = sysv_readdir,
-        .fsync          = simple_fsync,
+        .fsync          = generic_file_fsync,
 };
 static inline void dir_put_page(struct page *page)
diff --git a/fs/sysv/file.c b/fs/sysv/file.c
index 96340c01f4a7..750cc22349bd 100644
--- a/fs/sysv/file.c
+++ b/fs/sysv/file.c
@@ -26,7 +26,7 @@ const struct file_operations sysv_file_operations = {
        .write          = do_sync_write,
        .aio_write      = generic_file_aio_write,
        .mmap           = generic_file_mmap,
-        .fsync          = simple_fsync,
+        .fsync          = generic_file_fsync,
        .splice_read    = generic_file_splice_read,
 };
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index 4573734d723d..d4a5380b5669 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -43,6 +43,7 @@ static int sysv_sync_fs(struct super_block *sb, int wait)
         * then attach current time stamp.
         * But if the filesystem was marked clean, keep it clean.
         */
+        sb->s_dirt = 0;
        old_time = fs32_to_cpu(sbi, *sbi->s_sb_time);
        if (sbi->s_type == FSTYPE_SYSV4) {
                if (*sbi->s_sb_state == cpu_to_fs32(sbi, 0x7c269d38 - old_time))
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 5692cf72b807..12f445cee9f7 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -967,12 +967,15 @@ static int do_writepage(struct page *page, int len)
 * the page locked, and it locks @ui_mutex. However, write-back does take inode
 * @i_mutex, which means other VFS operations may be run on this inode at the
 * same time. And the problematic one is truncation to smaller size, from where
- * we have to call 'vmtruncate()', which first changes @inode->i_size, then
+ * we have to call 'simple_setsize()', which first changes @inode->i_size, then
 * drops the truncated pages. And while dropping the pages, it takes the page
- * lock. This means that 'do_truncation()' cannot call 'vmtruncate()' with
+ * lock. This means that 'do_truncation()' cannot call 'simple_setsize()' with
 * @ui_mutex locked, because it would deadlock with 'ubifs_writepage()'. This
 * means that @inode->i_size is changed while @ui_mutex is unlocked.
 *
+ * XXX: with the new truncate the above is not true anymore, the simple_setsize
+ * calls can be replaced with the individual components.
+ *
 * But in 'ubifs_writepage()' we have to guarantee that we do not write beyond
 * inode size. How do we do this if @inode->i_size may became smaller while we
 * are in the middle of 'ubifs_writepage()'? The UBIFS solution is the
@@ -1125,7 +1128,7 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode,
                budgeted = 0;
        }
-        err = vmtruncate(inode, new_size);
+        err = simple_setsize(inode, new_size);
        if (err)
                goto out_budg;
@@ -1214,7 +1217,7 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode,
        if (attr->ia_valid & ATTR_SIZE) {
                dbg_gen("size %lld -> %lld", inode->i_size, new_size);
-                err = vmtruncate(inode, new_size);
+                err = simple_setsize(inode, new_size);
                if (err)
                        goto out;
        }
@@ -1223,7 +1226,7 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode,
        if (attr->ia_valid & ATTR_SIZE) {
                /* Truncation changes inode [mc]time */
                inode->i_mtime = inode->i_ctime = ubifs_current_time(inode);
-                /* 'vmtruncate()' changed @i_size, update @ui_size */
+                /* 'simple_setsize()' changed @i_size, update @ui_size */
                ui->ui_size = inode->i_size;
        }
@@ -1304,9 +1307,9 @@ static void *ubifs_follow_link(struct dentry *dentry, struct nameidata *nd)
        return NULL;
 }
-int ubifs_fsync(struct file *file, struct dentry *dentry, int datasync)
+int ubifs_fsync(struct file *file, int datasync)
 {
-        struct inode *inode = dentry->d_inode;
+        struct inode *inode = file->f_mapping->host;
        struct ubifs_info *c = inode->i_sb->s_fs_info;
        int err;
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index bd2542dad014..2eef553d50c8 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -379,7 +379,7 @@ struct ubifs_gced_idx_leb {
 * The @ui_size is a "shadow" variable for @inode->i_size and UBIFS uses
 * @ui_size instead of @inode->i_size. The reason for this is that UBIFS cannot
 * make sure @inode->i_size is always changed under @ui_mutex, because it
- * cannot call 'vmtruncate()' with @ui_mutex locked, because it would deadlock
+ * cannot call 'simple_setsize()' with @ui_mutex locked, because it would deadlock
 * with 'ubifs_writepage()' (see file.c). All the other inode fields are
 * changed under @ui_mutex, so they do not need "shadow" fields. Note, one
 * could consider to rework locking and base it on "shadow" fields.
@@ -1678,7 +1678,7 @@ const struct ubifs_lprops *ubifs_fast_find_frdi_idx(struct ubifs_info *c);
 int ubifs_calc_dark(const struct ubifs_info *c, int spc);
 /* file.c */
-int ubifs_fsync(struct file *file, struct dentry *dentry, int datasync);
+int ubifs_fsync(struct file *file, int datasync);
 int ubifs_setattr(struct dentry *dentry, struct iattr *attr);
 /* dir.c */
diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index 9a9378b4eb5a..b608efaa4cee 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -21,7 +21,6 @@
 #include "udfdecl.h"
-#include <linux/quotaops.h>
 #include <linux/buffer_head.h>
 #include <linux/bitops.h>
@@ -159,8 +158,6 @@ static void udf_bitmap_free_blocks(struct super_block *sb,
                                udf_debug("byte=%2x\n",
                                        ((char *)bh->b_data)[(bit + i) >> 3]);
                        } else {
-                                if (inode)
-                                        dquot_free_block(inode, 1);
                                udf_add_free_space(sb, sbi->s_partition, 1);
                        }
                }
@@ -210,15 +207,8 @@ static int udf_bitmap_prealloc_blocks(struct super_block *sb,
                bit = block % (sb->s_blocksize << 3);
                while (bit < (sb->s_blocksize << 3) && block_count > 0) {
-                        if (!udf_test_bit(bit, bh->b_data))
+                        if (!udf_clear_bit(bit, bh->b_data))
                                goto out;
-                        else if (dquot_prealloc_block(inode, 1))
-                                goto out;
-                        else if (!udf_clear_bit(bit, bh->b_data)) {
-                                udf_debug("bit already cleared for block %d\n", bit);
-                                dquot_free_block(inode, 1);
-                                goto out;
-                        }
                        block_count--;
                        alloc_count++;
                        bit++;
@@ -338,20 +328,6 @@ search_back:
        }
 got_block:
-        /*
-         * Check quota for allocation of this block.
-         */
-        if (inode) {
-                int ret = dquot_alloc_block(inode, 1);
-                if (ret) {
-                        mutex_unlock(&sbi->s_alloc_mutex);
-                        *err = ret;
-                        return 0;
-                }
-        }
        newblock = bit + (block_group << (sb->s_blocksize_bits + 3)) -
                (sizeof(struct spaceBitmapDesc) << 3);
@@ -401,10 +377,6 @@ static void udf_table_free_blocks(struct super_block *sb,
        }
        iinfo = UDF_I(table);
-        /* We do this up front - There are some error conditions that
-           could occure, but.. oh well */
-        if (inode)
-                dquot_free_block(inode, count);
        udf_add_free_space(sb, sbi->s_partition, count);
        start = bloc->logicalBlockNum + offset;
@@ -649,10 +621,7 @@ static int udf_table_prealloc_blocks(struct super_block *sb,
                epos.offset -= adsize;
                alloc_count = (elen >> sb->s_blocksize_bits);
-                if (inode && dquot_prealloc_block(inode,
+                if (alloc_count > block_count) {
-                        alloc_count > block_count ? block_count : alloc_count))
-                        alloc_count = 0;
-                else if (alloc_count > block_count) {
                        alloc_count = block_count;
                        eloc.logicalBlockNum += alloc_count;
                        elen -= (alloc_count << sb->s_blocksize_bits);
@@ -752,14 +721,6 @@ static int udf_table_new_block(struct super_block *sb,
        newblock = goal_eloc.logicalBlockNum;
        goal_eloc.logicalBlockNum++;
        goal_elen -= sb->s_blocksize;
-        if (inode) {
-                *err = dquot_alloc_block(inode, 1);
-                if (*err) {
-                        brelse(goal_epos.bh);
-                        mutex_unlock(&sbi->s_alloc_mutex);
-                        return 0;
-                }
-        }
        if (goal_elen)
                udf_write_aext(table, &goal_epos, &goal_eloc, goal_elen, 1);
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index 3a84455c2a77..51552bf50225 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -207,8 +207,9 @@ static int udf_readdir(struct file *filp, void *dirent, filldir_t filldir)
 /* readdir and lookup functions */
 const struct file_operations udf_dir_operations = {
+        .llseek                 = generic_file_llseek,
        .read                   = generic_read_dir,
        .readdir                = udf_readdir,
        .unlocked_ioctl         = udf_ioctl,
-        .fsync                  = simple_fsync,
+        .fsync                  = generic_file_fsync,
 };
diff --git a/fs/udf/file.c b/fs/udf/file.c
index baae3a723946..94e06d6bddbd 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -34,7 +34,6 @@
 #include <linux/errno.h>
 #include <linux/smp_lock.h>
 #include <linux/pagemap.h>
-#include <linux/quotaops.h>
 #include <linux/buffer_head.h>
 #include <linux/aio.h>
 #include <linux/smp_lock.h>
@@ -219,39 +218,16 @@ const struct file_operations udf_file_operations = {
        .read                   = do_sync_read,
        .aio_read               = generic_file_aio_read,
        .unlocked_ioctl         = udf_ioctl,
-        .open                   = dquot_file_open,
+        .open                   = generic_file_open,
        .mmap                   = generic_file_mmap,
        .write                  = do_sync_write,
        .aio_write              = udf_file_aio_write,
        .release                = udf_release_file,
-        .fsync                  = simple_fsync,
+        .fsync                  = generic_file_fsync,
        .splice_read            = generic_file_splice_read,
        .llseek                 = generic_file_llseek,
 };
-int udf_setattr(struct dentry *dentry, struct iattr *iattr)
-{
-        struct inode *inode = dentry->d_inode;
-        int error;
-        error = inode_change_ok(inode, iattr);
-        if (error)
-                return error;
-        if (is_quota_modification(inode, iattr))
-                dquot_initialize(inode);
-        if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) ||
-            (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) {
-                error = dquot_transfer(inode, iattr);
-                if (error)
-                        return error;
-        }
-        return inode_setattr(inode, iattr);
-}
 const struct inode_operations udf_file_inode_operations = {
        .truncate               = udf_truncate,
-        .setattr                = udf_setattr,
 };
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index 2b5586c7f02a..18cd7111185d 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -20,7 +20,6 @@
 #include "udfdecl.h"
 #include <linux/fs.h>
-#include <linux/quotaops.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
@@ -32,13 +31,6 @@ void udf_free_inode(struct inode *inode)
        struct super_block *sb = inode->i_sb;
        struct udf_sb_info *sbi = UDF_SB(sb);
-        /*
-         * Note: we must free any quota before locking the superblock,
-         * as writing the quota to disk may need the lock as well.
-         */
-        dquot_free_inode(inode);
-        dquot_drop(inode);
        clear_inode(inode);
        mutex_lock(&sbi->s_alloc_mutex);
@@ -61,7 +53,7 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
        struct super_block *sb = dir->i_sb;
        struct udf_sb_info *sbi = UDF_SB(sb);
        struct inode *inode;
-        int block, ret;
+        int block;
        uint32_t start = UDF_I(dir)->i_location.logicalBlockNum;
        struct udf_inode_info *iinfo;
        struct udf_inode_info *dinfo = UDF_I(dir);
@@ -146,17 +138,6 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
        insert_inode_hash(inode);
        mark_inode_dirty(inode);
-        dquot_initialize(inode);
-        ret = dquot_alloc_inode(inode);
-        if (ret) {
-                dquot_drop(inode);
-                inode->i_flags |= S_NOQUOTA;
-                inode->i_nlink = 0;
-                iput(inode);
-                *err = ret;
-                return NULL;
-        }
        *err = 0;
        return inode;
 }
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 8a3fbd177cab..124852bcf6fe 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -36,7 +36,6 @@
 #include <linux/pagemap.h>
 #include <linux/buffer_head.h>
 #include <linux/writeback.h>
-#include <linux/quotaops.h>
 #include <linux/slab.h>
 #include <linux/crc-itu-t.h>
@@ -71,9 +70,6 @@ static int udf_get_block(struct inode *, sector_t, struct buffer_head *, int);
 void udf_delete_inode(struct inode *inode)
 {
-        if (!is_bad_inode(inode))
-                dquot_initialize(inode);
        truncate_inode_pages(&inode->i_data, 0);
        if (is_bad_inode(inode))
@@ -113,7 +109,6 @@ void udf_clear_inode(struct inode *inode)
                        (unsigned long long)iinfo->i_lenExtents);
        }
-        dquot_drop(inode);
        kfree(iinfo->i_ext.i_data);
        iinfo->i_ext.i_data = NULL;
 }
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 585f733615dc..bf5fc674193c 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -27,7 +27,6 @@
 #include <linux/errno.h>
 #include <linux/mm.h>
 #include <linux/slab.h>
-#include <linux/quotaops.h>
 #include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
 #include <linux/sched.h>
@@ -563,8 +562,6 @@ static int udf_create(struct inode *dir, struct dentry *dentry, int mode,
        int err;
        struct udf_inode_info *iinfo;
-        dquot_initialize(dir);
        lock_kernel();
        inode = udf_new_inode(dir, mode, &err);
        if (!inode) {
@@ -617,8 +614,6 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, int mode,
        if (!old_valid_dev(rdev))
                return -EINVAL;
-        dquot_initialize(dir);
        lock_kernel();
        err = -EIO;
        inode = udf_new_inode(dir, mode, &err);
@@ -664,8 +659,6 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode)
        struct udf_inode_info *dinfo = UDF_I(dir);
        struct udf_inode_info *iinfo;
-        dquot_initialize(dir);
        lock_kernel();
        err = -EMLINK;
        if (dir->i_nlink >= (256 << sizeof(dir->i_nlink)) - 1)
@@ -800,8 +793,6 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry)
        struct fileIdentDesc *fi, cfi;
        struct kernel_lb_addr tloc;
-        dquot_initialize(dir);
        retval = -ENOENT;
        lock_kernel();
        fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi);
@@ -848,8 +839,6 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry)
        struct fileIdentDesc cfi;
        struct kernel_lb_addr tloc;
-        dquot_initialize(dir);
        retval = -ENOENT;
        lock_kernel();
        fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi);
@@ -904,8 +893,6 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
        struct buffer_head *bh;
        struct udf_inode_info *iinfo;
-        dquot_initialize(dir);
        lock_kernel();
        inode = udf_new_inode(dir, S_IFLNK | S_IRWXUGO, &err);
        if (!inode)
@@ -1075,8 +1062,6 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir,
        int err;
        struct buffer_head *bh;
-        dquot_initialize(dir);
        lock_kernel();
        if (inode->i_nlink >= (256 << sizeof(inode->i_nlink)) - 1) {
                unlock_kernel();
@@ -1139,9 +1124,6 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
        struct kernel_lb_addr tloc;
        struct udf_inode_info *old_iinfo = UDF_I(old_inode);
-        dquot_initialize(old_dir);
-        dquot_initialize(new_dir);
        lock_kernel();
        ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi);
        if (ofi) {
@@ -1387,7 +1369,6 @@ const struct export_operations udf_export_ops = {
 const struct inode_operations udf_dir_inode_operations = {
        .lookup                         = udf_lookup,
        .create                         = udf_create,
-        .setattr                        = udf_setattr,
        .link                           = udf_link,
        .unlink                         = udf_unlink,
        .symlink                        = udf_symlink,
@@ -1400,5 +1381,4 @@ const struct inode_operations udf_symlink_inode_operations = {
        .readlink       = generic_readlink,
        .follow_link    = page_follow_link_light,
        .put_link       = page_put_link,
-        .setattr        = udf_setattr,
 };
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 1e4543cbcd27..612d1e2e285a 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -557,6 +557,7 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
 {
        struct udf_options uopt;
        struct udf_sb_info *sbi = UDF_SB(sb);
+        int error = 0;
        uopt.flags = sbi->s_flags;
        uopt.uid   = sbi->s_uid;
@@ -582,17 +583,17 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
                        *flags |= MS_RDONLY;
        }
-        if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) {
+        if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
-                unlock_kernel();
+                goto out_unlock;
-                return 0;
-        }
        if (*flags & MS_RDONLY)
                udf_close_lvid(sb);
        else
                udf_open_lvid(sb);
+out_unlock:
        unlock_kernel();
-        return 0;
+        return error;
 }
 /* Check Volume Structure Descriptors (ECMA 167 2/9.1) */
@@ -1939,7 +1940,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
        /* Fill in the rest of the superblock */
        sb->s_op = &udf_sb_ops;
        sb->s_export_op = &udf_export_ops;
-        sb->dq_op = NULL;
        sb->s_dirt = 0;
        sb->s_magic = UDF_SUPER_MAGIC;
        sb->s_time_gran = 1000;
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index 9079ff7d6255..2bac0354891f 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -131,7 +131,6 @@ extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *,
 /* file.c */
 extern long udf_ioctl(struct file *, unsigned int, unsigned long);
-extern int udf_setattr(struct dentry *dentry, struct iattr *iattr);
 /* inode.c */
 extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *);
 extern int udf_sync_inode(struct inode *);
diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index 5cfa4d85ccf2..048484fb10d2 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -12,7 +12,6 @@
 #include <linux/stat.h>
 #include <linux/time.h>
 #include <linux/string.h>
-#include <linux/quotaops.h>
 #include <linux/buffer_head.h>
 #include <linux/capability.h>
 #include <linux/bitops.h>
@@ -85,9 +84,6 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count)
                                   "bit already cleared for fragment %u", i);
        }
        
-        dquot_free_block(inode, count);
-        
        fs32_add(sb, &ucg->cg_cs.cs_nffree, count);
        uspi->cs_total.cs_nffree += count;
        fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count);
@@ -195,7 +191,6 @@ do_more:
                ubh_setblock(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno);
                if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD)
                        ufs_clusteracct (sb, ucpi, blkno, 1);
-                dquot_free_block(inode, uspi->s_fpb);
                fs32_add(sb, &ucg->cg_cs.cs_nbfree, 1);
                uspi->cs_total.cs_nbfree++;
@@ -511,7 +506,6 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment,
        struct ufs_cg_private_info * ucpi;
        struct ufs_cylinder_group * ucg;
        unsigned cgno, fragno, fragoff, count, fragsize, i;
-        int ret;
        
        UFSD("ENTER, fragment %llu, oldcount %u, newcount %u\n",
             (unsigned long long)fragment, oldcount, newcount);
@@ -557,11 +551,6 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment,
                fs32_add(sb, &ucg->cg_frsum[fragsize - count], 1);
        for (i = oldcount; i < newcount; i++)
                ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i);
-        ret = dquot_alloc_block(inode, count);
-        if (ret) {
-                *err = ret;
-                return 0;
-        }
        fs32_sub(sb, &ucg->cg_cs.cs_nffree, count);
        fs32_sub(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count);
@@ -598,7 +587,6 @@ static u64 ufs_alloc_fragments(struct inode *inode, unsigned cgno,
        struct ufs_cylinder_group * ucg;
        unsigned oldcg, i, j, k, allocsize;
        u64 result;
-        int ret;
        
        UFSD("ENTER, ino %lu, cgno %u, goal %llu, count %u\n",
             inode->i_ino, cgno, (unsigned long long)goal, count);
@@ -667,7 +655,6 @@ cg_found:
                for (i = count; i < uspi->s_fpb; i++)
                        ubh_setbit (UCPI_UBH(ucpi), ucpi->c_freeoff, goal + i);
                i = uspi->s_fpb - count;
-                dquot_free_block(inode, i);
                fs32_add(sb, &ucg->cg_cs.cs_nffree, i);
                uspi->cs_total.cs_nffree += i;
@@ -679,11 +666,6 @@ cg_found:
        result = ufs_bitmap_search (sb, ucpi, goal, allocsize);
        if (result == INVBLOCK)
                return 0;
-        ret = dquot_alloc_block(inode, count);
-        if (ret) {
-                *err = ret;
-                return 0;
-        }
        for (i = 0; i < count; i++)
                ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, result + i);
        
@@ -718,7 +700,6 @@ static u64 ufs_alloccg_block(struct inode *inode,
        struct ufs_super_block_first * usb1;
        struct ufs_cylinder_group * ucg;
        u64 result, blkno;
-        int ret;
        UFSD("ENTER, goal %llu\n", (unsigned long long)goal);
@@ -752,11 +733,6 @@ gotit:
        ubh_clrblock (UCPI_UBH(ucpi), ucpi->c_freeoff, blkno);
        if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD)
                ufs_clusteracct (sb, ucpi, blkno, -1);
-        ret = dquot_alloc_block(inode, uspi->s_fpb);
-        if (ret) {
-                *err = ret;
-                return INVBLOCK;
-        }
        fs32_sub(sb, &ucg->cg_cs.cs_nbfree, 1);
        uspi->cs_total.cs_nbfree--;
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index 317a0d444f6b..ec784756dc65 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -666,6 +666,6 @@ not_empty:
 const struct file_operations ufs_dir_operations = {
        .read           = generic_read_dir,
        .readdir        = ufs_readdir,
-        .fsync          = simple_fsync,
+        .fsync          = generic_file_fsync,
        .llseek         = generic_file_llseek,
 };
diff --git a/fs/ufs/file.c b/fs/ufs/file.c
index a8962cecde5b..33afa20d4509 100644
--- a/fs/ufs/file.c
+++ b/fs/ufs/file.c
@@ -24,7 +24,6 @@
 */
 #include <linux/fs.h>
-#include <linux/quotaops.h>
 #include "ufs_fs.h"
 #include "ufs.h"
@@ -41,7 +40,7 @@ const struct file_operations ufs_file_operations = {
        .write          = do_sync_write,
        .aio_write      = generic_file_aio_write,
        .mmap           = generic_file_mmap,
-        .open           = dquot_file_open,
+        .open           = generic_file_open,
-        .fsync          = simple_fsync,
+        .fsync          = generic_file_fsync,
        .splice_read    = generic_file_splice_read,
 };
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index 3a959d55084d..594480e537d2 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -27,7 +27,6 @@
 #include <linux/time.h>
 #include <linux/stat.h>
 #include <linux/string.h>
-#include <linux/quotaops.h>
 #include <linux/buffer_head.h>
 #include <linux/sched.h>
 #include <linux/bitops.h>
@@ -95,9 +94,6 @@ void ufs_free_inode (struct inode * inode)
        is_directory = S_ISDIR(inode->i_mode);
-        dquot_free_inode(inode);
-        dquot_drop(inode);
        clear_inode (inode);
        if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_iusedoff, bit))
@@ -347,21 +343,12 @@ cg_found:
        unlock_super (sb);
-        dquot_initialize(inode);
-        err = dquot_alloc_inode(inode);
-        if (err) {
-                dquot_drop(inode);
-                goto fail_without_unlock;
-        }
        UFSD("allocating inode %lu\n", inode->i_ino);
        UFSD("EXIT\n");
        return inode;
 fail_remove_inode:
        unlock_super(sb);
-fail_without_unlock:
-        inode->i_flags |= S_NOQUOTA;
        inode->i_nlink = 0;
        iput(inode);
        UFSD("EXIT (FAILED): err %d\n", err);
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index cffa756f1047..73fe773aa034 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -37,7 +37,6 @@
 #include <linux/smp_lock.h>
 #include <linux/buffer_head.h>
 #include <linux/writeback.h>
-#include <linux/quotaops.h>
 #include "ufs_fs.h"
 #include "ufs.h"
@@ -910,9 +909,6 @@ void ufs_delete_inode (struct inode * inode)
 {
        loff_t old_i_size;
-        if (!is_bad_inode(inode))
-                dquot_initialize(inode);
        truncate_inode_pages(&inode->i_data, 0);
        if (is_bad_inode(inode))
                goto no_delete;
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index eabc02eb1294..b056f02b1fb3 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -30,7 +30,6 @@
 #include <linux/time.h>
 #include <linux/fs.h>
 #include <linux/smp_lock.h>
-#include <linux/quotaops.h>
 #include "ufs_fs.h"
 #include "ufs.h"
@@ -86,8 +85,6 @@ static int ufs_create (struct inode * dir, struct dentry * dentry, int mode,
        UFSD("BEGIN\n");
-        dquot_initialize(dir);
        inode = ufs_new_inode(dir, mode);
        err = PTR_ERR(inode);
@@ -112,8 +109,6 @@ static int ufs_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_t
        if (!old_valid_dev(rdev))
                return -EINVAL;
-        dquot_initialize(dir);
        inode = ufs_new_inode(dir, mode);
        err = PTR_ERR(inode);
        if (!IS_ERR(inode)) {
@@ -138,8 +133,6 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry,
        if (l > sb->s_blocksize)
                goto out_notlocked;
-        dquot_initialize(dir);
        lock_kernel();
        inode = ufs_new_inode(dir, S_IFLNK | S_IRWXUGO);
        err = PTR_ERR(inode);
@@ -185,8 +178,6 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir,
                return -EMLINK;
        }
-        dquot_initialize(dir);
        inode->i_ctime = CURRENT_TIME_SEC;
        inode_inc_link_count(inode);
        atomic_inc(&inode->i_count);
@@ -204,8 +195,6 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, int mode)
        if (dir->i_nlink >= UFS_LINK_MAX)
                goto out;
-        dquot_initialize(dir);
        lock_kernel();
        inode_inc_link_count(dir);
@@ -250,8 +239,6 @@ static int ufs_unlink(struct inode *dir, struct dentry *dentry)
        struct page *page;
        int err = -ENOENT;
-        dquot_initialize(dir);
        de = ufs_find_entry(dir, &dentry->d_name, &page);
        if (!de)
                goto out;
@@ -296,9 +283,6 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
        struct ufs_dir_entry *old_de;
        int err = -ENOENT;
-        dquot_initialize(old_dir);
-        dquot_initialize(new_dir);
        old_de = ufs_find_entry(old_dir, &old_dentry->d_name, &old_page);
        if (!old_de)
                goto out;
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 14743d935a93..3ec5a9eb6efb 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -77,7 +77,6 @@
 #include <linux/errno.h>
 #include <linux/fs.h>
-#include <linux/quotaops.h>
 #include <linux/slab.h>
 #include <linux/time.h>
 #include <linux/stat.h>
@@ -918,6 +917,7 @@ again:
        sbi->s_bytesex = BYTESEX_LE;
        switch ((uspi->fs_magic = fs32_to_cpu(sb, usb3->fs_magic))) {
                case UFS_MAGIC:
+                case UFS_MAGIC_BW:
                case UFS2_MAGIC:
                case UFS_MAGIC_LFN:
                case UFS_MAGIC_FEA:
@@ -927,6 +927,7 @@ again:
        sbi->s_bytesex = BYTESEX_BE;
        switch ((uspi->fs_magic = fs32_to_cpu(sb, usb3->fs_magic))) {
                case UFS_MAGIC:
+                case UFS_MAGIC_BW:
                case UFS2_MAGIC:
                case UFS_MAGIC_LFN:
                case UFS_MAGIC_FEA:
@@ -1045,7 +1046,7 @@ magic_found:
         */
        sb->s_op = &ufs_super_ops;
        sb->s_export_op = &ufs_export_ops;
-        sb->dq_op = NULL; /***/
        sb->s_magic = fs32_to_cpu(sb, usb3->fs_magic);
        uspi->s_sblkno = fs32_to_cpu(sb, usb1->fs_sblkno);
@@ -1435,126 +1436,19 @@ static void destroy_inodecache(void)
        kmem_cache_destroy(ufs_inode_cachep);
 }
-static void ufs_clear_inode(struct inode *inode)
-{
-        dquot_drop(inode);
-}
-#ifdef CONFIG_QUOTA
-static ssize_t ufs_quota_read(struct super_block *, int, char *,size_t, loff_t);
-static ssize_t ufs_quota_write(struct super_block *, int, const char *, size_t, loff_t);
-#endif
 static const struct super_operations ufs_super_ops = {
        .alloc_inode    = ufs_alloc_inode,
        .destroy_inode  = ufs_destroy_inode,
        .write_inode    = ufs_write_inode,
        .delete_inode   = ufs_delete_inode,
-        .clear_inode    = ufs_clear_inode,
        .put_super      = ufs_put_super,
        .write_super    = ufs_write_super,
        .sync_fs        = ufs_sync_fs,
        .statfs         = ufs_statfs,
        .remount_fs     = ufs_remount,
        .show_options   = ufs_show_options,
-#ifdef CONFIG_QUOTA
-        .quota_read     = ufs_quota_read,
-        .quota_write    = ufs_quota_write,
-#endif
 };
-#ifdef CONFIG_QUOTA
-/* Read data from quotafile - avoid pagecache and such because we cannot afford
- * acquiring the locks... As quota files are never truncated and quota code
- * itself serializes the operations (and noone else should touch the files)
- * we don't have to be afraid of races */
-static ssize_t ufs_quota_read(struct super_block *sb, int type, char *data,
-                               size_t len, loff_t off)
-{
-        struct inode *inode = sb_dqopt(sb)->files[type];
-        sector_t blk = off >> sb->s_blocksize_bits;
-        int err = 0;
-        int offset = off & (sb->s_blocksize - 1);
-        int tocopy;
-        size_t toread;
-        struct buffer_head *bh;
-        loff_t i_size = i_size_read(inode);
-        if (off > i_size)
-                return 0;
-        if (off+len > i_size)
-                len = i_size-off;
-        toread = len;
-        while (toread > 0) {
-                tocopy = sb->s_blocksize - offset < toread ?
-                                sb->s_blocksize - offset : toread;
-                bh = ufs_bread(inode, blk, 0, &err);
-                if (err)
-                        return err;
-                if (!bh)        /* A hole? */
-                        memset(data, 0, tocopy);
-                else {
-                        memcpy(data, bh->b_data+offset, tocopy);
-                        brelse(bh);
-                }
-                offset = 0;
-                toread -= tocopy;
-                data += tocopy;
-                blk++;
-        }
-        return len;
-}
-/* Write to quotafile */
-static ssize_t ufs_quota_write(struct super_block *sb, int type,
-                                const char *data, size_t len, loff_t off)
-{
-        struct inode *inode = sb_dqopt(sb)->files[type];
-        sector_t blk = off >> sb->s_blocksize_bits;
-        int err = 0;
-        int offset = off & (sb->s_blocksize - 1);
-        int tocopy;
-        size_t towrite = len;
-        struct buffer_head *bh;
-        mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
-        while (towrite > 0) {
-                tocopy = sb->s_blocksize - offset < towrite ?
-                                sb->s_blocksize - offset : towrite;
-                bh = ufs_bread(inode, blk, 1, &err);
-                if (!bh)
-                        goto out;
-                lock_buffer(bh);
-                memcpy(bh->b_data+offset, data, tocopy);
-                flush_dcache_page(bh->b_page);
-                set_buffer_uptodate(bh);
-                mark_buffer_dirty(bh);
-                unlock_buffer(bh);
-                brelse(bh);
-                offset = 0;
-                towrite -= tocopy;
-                data += tocopy;
-                blk++;
-        }
-out:
-        if (len == towrite) {
-                mutex_unlock(&inode->i_mutex);
-                return err;
-        }
-        if (inode->i_size < off+len-towrite)
-                i_size_write(inode, off+len-towrite);
-        inode->i_version++;
-        inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
-        mark_inode_dirty(inode);
-        mutex_unlock(&inode->i_mutex);
-        return len - towrite;
-}
-#endif
 static int ufs_get_sb(struct file_system_type *fs_type,
        int flags, const char *dev_name, void *data, struct vfsmount *mnt)
 {
diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c
index f294c44577dc..589e01a465ba 100644
--- a/fs/ufs/truncate.c
+++ b/fs/ufs/truncate.c
@@ -44,7 +44,6 @@
 #include <linux/buffer_head.h>
 #include <linux/blkdev.h>
 #include <linux/sched.h>
-#include <linux/quotaops.h>
 #include "ufs_fs.h"
 #include "ufs.h"
@@ -501,12 +500,10 @@ out:
        return err;
 }
 /*
- * We don't define our `inode->i_op->truncate', and call it here,
+ * TODO:
- * because of:
+ *      - truncate case should use proper ordering instead of using
- * - there is no way to know old size
+ *        simple_setsize
- * - there is no way inform user about error, if it happens in `truncate'
 */
 int ufs_setattr(struct dentry *dentry, struct iattr *attr)
 {
@@ -518,19 +515,10 @@ int ufs_setattr(struct dentry *dentry, struct iattr *attr)
        if (error)
                return error;
-        if (is_quota_modification(inode, attr))
-                dquot_initialize(inode);
-        if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
-            (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
-                error = dquot_transfer(inode, attr);
-                if (error)
-                        return error;
-        }
        if (ia_valid & ATTR_SIZE && attr->ia_size != inode->i_size) {
                loff_t old_i_size = inode->i_size;
-                error = vmtruncate(inode, attr->ia_size);
+                error = simple_setsize(inode, attr->ia_size);
                if (error)
                        return error;
                error = ufs_truncate(inode, old_i_size);
diff --git a/fs/ufs/ufs_fs.h b/fs/ufs/ufs_fs.h
index 6943ec677c0b..8aba544f9fad 100644
--- a/fs/ufs/ufs_fs.h
+++ b/fs/ufs/ufs_fs.h
@@ -48,6 +48,7 @@ typedef __u16 __bitwise __fs16;
 #define UFS_SECTOR_SIZE 512
 #define UFS_SECTOR_BITS 9
 #define UFS_MAGIC  0x00011954
+#define UFS_MAGIC_BW 0x0f242697
 #define UFS2_MAGIC 0x19540119
 #define UFS_CIGAM  0x54190100 /* byteswapped MAGIC */
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index b4769e40e8bc..c8fb13f83b3f 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -77,6 +77,7 @@ xfs-y				+= xfs_alloc.o \
                                   xfs_itable.o \
                                   xfs_dfrag.o \
                                   xfs_log.o \
+                                   xfs_log_cil.o \
                                   xfs_log_recover.o \
                                   xfs_mount.o \
                                   xfs_mru_cache.o \
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index f01de3c55c43..649ade8ef598 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -37,6 +37,7 @@
 #include "xfs_sb.h"
 #include "xfs_inum.h"
+#include "xfs_log.h"
 #include "xfs_ag.h"
 #include "xfs_dmapi.h"
 #include "xfs_mount.h"
@@ -850,6 +851,12 @@ xfs_buf_lock_value(
 *      Note that this in no way locks the underlying pages, so it is only
 *      useful for synchronizing concurrent use of buffer objects, not for
 *      synchronizing independent access to the underlying pages.
+ *
+ *      If we come across a stale, pinned, locked buffer, we know that we
+ *      are being asked to lock a buffer that has been reallocated. Because
+ *      it is pinned, we know that the log has not been pushed to disk and
+ *      hence it will still be locked. Rather than sleeping until someone
+ *      else pushes the log, push it ourselves before trying to get the lock.
 */
 void
 xfs_buf_lock(
@@ -857,6 +864,8 @@ xfs_buf_lock(
 {
        trace_xfs_buf_lock(bp, _RET_IP_);
+        if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE))
+                xfs_log_force(bp->b_mount, 0);
        if (atomic_read(&bp->b_io_remaining))
                blk_run_address_space(bp->b_target->bt_mapping);
        down(&bp->b_sema);
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index d8fb1b5d6cb5..257a56b127cf 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -100,10 +100,10 @@ xfs_iozero(
 STATIC int
 xfs_file_fsync(
        struct file             *file,
-        struct dentry           *dentry,
        int                     datasync)
 {
-        struct xfs_inode        *ip = XFS_I(dentry->d_inode);
+        struct inode            *inode = file->f_mapping->host;
+        struct xfs_inode        *ip = XFS_I(inode);
        struct xfs_trans        *tp;
        int                     error = 0;
        int                     log_flushed = 0;
@@ -140,8 +140,8 @@ xfs_file_fsync(
         * might gets cleared when the inode gets written out via the AIL
         * or xfs_iflush_cluster.
         */
-        if (((dentry->d_inode->i_state & I_DIRTY_DATASYNC) ||
+        if (((inode->i_state & I_DIRTY_DATASYNC) ||
-            ((dentry->d_inode->i_state & I_DIRTY_SYNC) && !datasync)) &&
+            ((inode->i_state & I_DIRTY_SYNC) && !datasync)) &&
            ip->i_update_core) {
                /*
                 * Kick off a transaction to log the inode core to get the
@@ -868,7 +868,7 @@ write_retry:
                        mutex_lock(&inode->i_mutex);
                xfs_ilock(ip, iolock);
-                error2 = -xfs_file_fsync(file, file->f_path.dentry,
+                error2 = -xfs_file_fsync(file,
                                         (file->f_flags & __O_SYNC) ? 0 : 1);
                if (!error)
                        error = error2;
diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/linux-2.6/xfs_quotaops.c
index e31bf21fe5d3..9ac8aea91529 100644
--- a/fs/xfs/linux-2.6/xfs_quotaops.c
+++ b/fs/xfs/linux-2.6/xfs_quotaops.c
@@ -19,6 +19,7 @@
 #include "xfs_dmapi.h"
 #include "xfs_sb.h"
 #include "xfs_inum.h"
+#include "xfs_log.h"
 #include "xfs_ag.h"
 #include "xfs_mount.h"
 #include "xfs_quota.h"
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index f24dbe5efde3..f2d1718c9165 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -119,6 +119,8 @@ mempool_t *xfs_ioend_pool;
 #define MNTOPT_DMAPI    "dmapi"         /* DMI enabled (DMAPI / XDSM) */
 #define MNTOPT_XDSM     "xdsm"          /* DMI enabled (DMAPI / XDSM) */
 #define MNTOPT_DMI      "dmi"           /* DMI enabled (DMAPI / XDSM) */
+#define MNTOPT_DELAYLOG   "delaylog"    /* Delayed logging enabled */
+#define MNTOPT_NODELAYLOG "nodelaylog"  /* Delayed logging disabled */
 /*
 * Table driven mount option parser.
@@ -374,6 +376,13 @@ xfs_parseargs(
                        mp->m_flags |= XFS_MOUNT_DMAPI;
                } else if (!strcmp(this_char, MNTOPT_DMI)) {
                        mp->m_flags |= XFS_MOUNT_DMAPI;
+                } else if (!strcmp(this_char, MNTOPT_DELAYLOG)) {
+                        mp->m_flags |= XFS_MOUNT_DELAYLOG;
+                        cmn_err(CE_WARN,
+                                "Enabling EXPERIMENTAL delayed logging feature "
+                                "- use at your own risk.\n");
+                } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) {
+                        mp->m_flags &= ~XFS_MOUNT_DELAYLOG;
                } else if (!strcmp(this_char, "ihashsize")) {
                        cmn_err(CE_WARN,
        "XFS: ihashsize no longer used, option is deprecated.");
@@ -535,6 +544,7 @@ xfs_showargs(
                { XFS_MOUNT_FILESTREAMS,        "," MNTOPT_FILESTREAM },
                { XFS_MOUNT_DMAPI,              "," MNTOPT_DMAPI },
                { XFS_MOUNT_GRPID,              "," MNTOPT_GRPID },
+                { XFS_MOUNT_DELAYLOG,           "," MNTOPT_DELAYLOG },
                { 0, NULL }
        };
        static struct proc_xfs_info xfs_info_unset[] = {
@@ -1755,7 +1765,7 @@ xfs_init_zones(void)
         * but it is much faster.
         */
        xfs_buf_item_zone = kmem_zone_init((sizeof(xfs_buf_log_item_t) +
-                                (((XFS_MAX_BLOCKSIZE / XFS_BLI_CHUNK) /
+                                (((XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK) /
                                  NBWORD) * sizeof(int))), "xfs_buf_item");
        if (!xfs_buf_item_zone)
                goto out_destroy_trans_zone;
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index 8a319cfd2901..ff6bc797baf2 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -1059,83 +1059,112 @@ TRACE_EVENT(xfs_bunmap,
 );
+#define XFS_BUSY_SYNC \
+        { 0,    "async" }, \
+        { 1,    "sync" }
 TRACE_EVENT(xfs_alloc_busy,
-        TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno,
+        TP_PROTO(struct xfs_trans *trans, xfs_agnumber_t agno,
-                 xfs_extlen_t len, int slot),
+                 xfs_agblock_t agbno, xfs_extlen_t len, int sync),
-        TP_ARGS(mp, agno, agbno, len, slot),
+        TP_ARGS(trans, agno, agbno, len, sync),
        TP_STRUCT__entry(
                __field(dev_t, dev)
+                __field(struct xfs_trans *, tp)
+                __field(int, tid)
                __field(xfs_agnumber_t, agno)
                __field(xfs_agblock_t, agbno)
                __field(xfs_extlen_t, len)
-                __field(int, slot)
+                __field(int, sync)
        ),
        TP_fast_assign(
-                __entry->dev = mp->m_super->s_dev;
+                __entry->dev = trans->t_mountp->m_super->s_dev;
+                __entry->tp = trans;
+                __entry->tid = trans->t_ticket->t_tid;
                __entry->agno = agno;
                __entry->agbno = agbno;
                __entry->len = len;
-                __entry->slot = slot;
+                __entry->sync = sync;
        ),
-        TP_printk("dev %d:%d agno %u agbno %u len %u slot %d",
+        TP_printk("dev %d:%d trans 0x%p tid 0x%x agno %u agbno %u len %u %s",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
+                  __entry->tp,
+                  __entry->tid,
                  __entry->agno,
                  __entry->agbno,
                  __entry->len,
-                  __entry->slot)
+                  __print_symbolic(__entry->sync, XFS_BUSY_SYNC))
 );
-#define XFS_BUSY_STATES \
-        { 0,    "found" }, \
-        { 1,    "missing" }
 TRACE_EVENT(xfs_alloc_unbusy,
        TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
-                 int slot, int found),
+                 xfs_agblock_t agbno, xfs_extlen_t len),
-        TP_ARGS(mp, agno, slot, found),
+        TP_ARGS(mp, agno, agbno, len),
        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(xfs_agnumber_t, agno)
-                __field(int, slot)
+                __field(xfs_agblock_t, agbno)
-                __field(int, found)
+                __field(xfs_extlen_t, len)
        ),
        TP_fast_assign(
                __entry->dev = mp->m_super->s_dev;
                __entry->agno = agno;
-                __entry->slot = slot;
+                __entry->agbno = agbno;
-                __entry->found = found;
+                __entry->len = len;
        ),
-        TP_printk("dev %d:%d agno %u slot %d %s",
+        TP_printk("dev %d:%d agno %u agbno %u len %u",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->agno,
-                  __entry->slot,
+                  __entry->agbno,
-                  __print_symbolic(__entry->found, XFS_BUSY_STATES))
+                  __entry->len)
 );
+#define XFS_BUSY_STATES \
+        { 0,    "missing" }, \
+        { 1,    "found" }
 TRACE_EVENT(xfs_alloc_busysearch,
-        TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno,
+        TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
-                 xfs_extlen_t len, xfs_lsn_t lsn),
+                 xfs_agblock_t agbno, xfs_extlen_t len, int found),
-        TP_ARGS(mp, agno, agbno, len, lsn),
+        TP_ARGS(mp, agno, agbno, len, found),
        TP_STRUCT__entry(
                __field(dev_t, dev)
                __field(xfs_agnumber_t, agno)
                __field(xfs_agblock_t, agbno)
                __field(xfs_extlen_t, len)
-                __field(xfs_lsn_t, lsn)
+                __field(int, found)
        ),
        TP_fast_assign(
                __entry->dev = mp->m_super->s_dev;
                __entry->agno = agno;
                __entry->agbno = agbno;
                __entry->len = len;
-                __entry->lsn = lsn;
+                __entry->found = found;
        ),
-        TP_printk("dev %d:%d agno %u agbno %u len %u force lsn 0x%llx",
+        TP_printk("dev %d:%d agno %u agbno %u len %u %s",
                  MAJOR(__entry->dev), MINOR(__entry->dev),
                  __entry->agno,
                  __entry->agbno,
                  __entry->len,
+                  __print_symbolic(__entry->found, XFS_BUSY_STATES))
+);
+TRACE_EVENT(xfs_trans_commit_lsn,
+        TP_PROTO(struct xfs_trans *trans),
+        TP_ARGS(trans),
+        TP_STRUCT__entry(
+                __field(dev_t, dev)
+                __field(struct xfs_trans *, tp)
+                __field(xfs_lsn_t, lsn)
+        ),
+        TP_fast_assign(
+                __entry->dev = trans->t_mountp->m_super->s_dev;
+                __entry->tp = trans;
+                __entry->lsn = trans->t_commit_lsn;
+        ),
+        TP_printk("dev %d:%d trans 0x%p commit_lsn 0x%llx",
+                  MAJOR(__entry->dev), MINOR(__entry->dev),
+                  __entry->tp,
                  __entry->lsn)
 );
diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c
index b89ec5df0129..585e7633dfc7 100644
--- a/fs/xfs/quota/xfs_dquot.c
+++ b/fs/xfs/quota/xfs_dquot.c
@@ -344,9 +344,9 @@ xfs_qm_init_dquot_blk(
        for (i = 0; i < q->qi_dqperchunk; i++, d++, curid++)
                xfs_qm_dqinit_core(curid, type, d);
        xfs_trans_dquot_buf(tp, bp,
-                            (type & XFS_DQ_USER ? XFS_BLI_UDQUOT_BUF :
+                            (type & XFS_DQ_USER ? XFS_BLF_UDQUOT_BUF :
-                            ((type & XFS_DQ_PROJ) ? XFS_BLI_PDQUOT_BUF :
+                            ((type & XFS_DQ_PROJ) ? XFS_BLF_PDQUOT_BUF :
-                             XFS_BLI_GDQUOT_BUF)));
+                             XFS_BLF_GDQUOT_BUF)));
        xfs_trans_log_buf(tp, bp, 0, BBTOB(q->qi_dqchunklen) - 1);
 }
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index abb8222b88c9..401f364ad36c 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -175,14 +175,20 @@ typedef struct xfs_agfl {
 } xfs_agfl_t;
 /*
- * Busy block/extent entry.  Used in perag to mark blocks that have been freed
+ * Busy block/extent entry.  Indexed by a rbtree in perag to mark blocks that
- * but whose transactions aren't committed to disk yet.
+ * have been freed but whose transactions aren't committed to disk yet.
+ *
+ * Note that we use the transaction ID to record the transaction, not the
+ * transaction structure itself. See xfs_alloc_busy_insert() for details.
 */
-typedef struct xfs_perag_busy {
+struct xfs_busy_extent {
-        xfs_agblock_t   busy_start;
+        struct rb_node  rb_node;        /* ag by-bno indexed search tree */
-        xfs_extlen_t    busy_length;
+        struct list_head list;          /* transaction busy extent list */
-        struct xfs_trans *busy_tp;      /* transaction that did the free */
+        xfs_agnumber_t  agno;
-} xfs_perag_busy_t;
+        xfs_agblock_t   bno;
+        xfs_extlen_t    length;
+        xlog_tid_t      tid;            /* transaction that created this */
+};
 /*
 * Per-ag incore structure, copies of information in agf and agi,
@@ -216,7 +222,8 @@ typedef struct xfs_perag {
        xfs_agino_t     pagl_leftrec;
        xfs_agino_t     pagl_rightrec;
 #ifdef __KERNEL__
-        spinlock_t      pagb_lock;      /* lock for pagb_list */
+        spinlock_t      pagb_lock;      /* lock for pagb_tree */
+        struct rb_root  pagb_tree;      /* ordered tree of busy extents */
        atomic_t        pagf_fstrms;    /* # of filestreams active in this AG */
@@ -226,7 +233,6 @@ typedef struct xfs_perag {
        int             pag_ici_reclaimable;    /* reclaimable inodes */
 #endif
        int             pagb_count;     /* pagb slots in use */
-        xfs_perag_busy_t pagb_list[XFS_PAGB_NUM_SLOTS]; /* unstable blocks */
 } xfs_perag_t;
 /*
diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c
index 94cddbfb2560..a7fbe8a99b12 100644
--- a/fs/xfs/xfs_alloc.c
+++ b/fs/xfs/xfs_alloc.c
@@ -46,11 +46,9 @@
 #define XFSA_FIXUP_BNO_OK       1
 #define XFSA_FIXUP_CNT_OK       2
-STATIC void
+static int
-xfs_alloc_search_busy(xfs_trans_t *tp,
+xfs_alloc_busy_search(struct xfs_mount *mp, xfs_agnumber_t agno,
-                    xfs_agnumber_t agno,
+                    xfs_agblock_t bno, xfs_extlen_t len);
-                    xfs_agblock_t bno,
-                    xfs_extlen_t len);
 /*
 * Prototypes for per-ag allocation routines
@@ -540,9 +538,16 @@ xfs_alloc_ag_vextent(
                                be32_to_cpu(agf->agf_length));
                        xfs_alloc_log_agf(args->tp, args->agbp,
                                                XFS_AGF_FREEBLKS);
-                        /* search the busylist for these blocks */
+                        /*
-                        xfs_alloc_search_busy(args->tp, args->agno,
+                         * Search the busylist for these blocks and mark the
-                                        args->agbno, args->len);
+                         * transaction as synchronous if blocks are found. This
+                         * avoids the need to block due to a synchronous log
+                         * force to ensure correct ordering as the synchronous
+                         * transaction will guarantee that for us.
+                         */
+                        if (xfs_alloc_busy_search(args->mp, args->agno,
+                                                args->agbno, args->len))
+                                xfs_trans_set_sync(args->tp);
                }
                if (!args->isfl)
                        xfs_trans_mod_sb(args->tp,
@@ -1693,7 +1698,7 @@ xfs_free_ag_extent(
         * when the iclog commits to disk.  If a busy block is allocated,
         * the iclog is pushed up to the LSN that freed the block.
         */
-        xfs_alloc_mark_busy(tp, agno, bno, len);
+        xfs_alloc_busy_insert(tp, agno, bno, len);
        return 0;
 error0:
@@ -1989,14 +1994,20 @@ xfs_alloc_get_freelist(
        *bnop = bno;
        /*
-         * As blocks are freed, they are added to the per-ag busy list
+         * As blocks are freed, they are added to the per-ag busy list and
-         * and remain there until the freeing transaction is committed to
+         * remain there until the freeing transaction is committed to disk.
-         * disk.  Now that we have allocated blocks, this list must be
+         * Now that we have allocated blocks, this list must be searched to see
-         * searched to see if a block is being reused.  If one is, then
+         * if a block is being reused.  If one is, then the freeing transaction
-         * the freeing transaction must be pushed to disk NOW by forcing
+         * must be pushed to disk before this transaction.
-         * to disk all iclogs up that transaction's LSN.
+         *
+         * We do this by setting the current transaction to a sync transaction
+         * which guarantees that the freeing transaction is on disk before this
+         * transaction. This is done instead of a synchronous log force here so
+         * that we don't sit and wait with the AGF locked in the transaction
+         * during the log force.
         */
-        xfs_alloc_search_busy(tp, be32_to_cpu(agf->agf_seqno), bno, 1);
+        if (xfs_alloc_busy_search(mp, be32_to_cpu(agf->agf_seqno), bno, 1))
+                xfs_trans_set_sync(tp);
        return 0;
 }
@@ -2201,7 +2212,7 @@ xfs_alloc_read_agf(
                        be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]);
                spin_lock_init(&pag->pagb_lock);
                pag->pagb_count = 0;
-                memset(pag->pagb_list, 0, sizeof(pag->pagb_list));
+                pag->pagb_tree = RB_ROOT;
                pag->pagf_init = 1;
        }
 #ifdef DEBUG
@@ -2479,127 +2490,263 @@ error0:
 * list is reused, the transaction that freed it must be forced to disk
 * before continuing to use the block.
 *
- * xfs_alloc_mark_busy - add to the per-ag busy list
+ * xfs_alloc_busy_insert - add to the per-ag busy list
- * xfs_alloc_clear_busy - remove an item from the per-ag busy list
+ * xfs_alloc_busy_clear - remove an item from the per-ag busy list
+ * xfs_alloc_busy_search - search for a busy extent
+ */
+/*
+ * Insert a new extent into the busy tree.
+ *
+ * The busy extent tree is indexed by the start block of the busy extent.
+ * There can be multiple overlapping ranges in the busy extent tree but only
+ * ever one entry at a given start block. The reason for this is that
+ * multi-block extents can be freed, then smaller chunks of that extent
+ * allocated and freed again before the first transaction commit is on disk.
+ * If the exact same start block is freed a second time, we have to wait for
+ * that busy extent to pass out of the tree before the new extent is inserted.
+ * There are two main cases we have to handle here.
+ *
+ * The first case is a transaction that triggers a "free - allocate - free"
+ * cycle. This can occur during btree manipulations as a btree block is freed
+ * to the freelist, then allocated from the free list, then freed again. In
+ * this case, the second extent free is what triggers the duplicate and as
+ * such the transaction IDs should match. Because the extent was allocated in
+ * this transaction, the transaction must be marked as synchronous. This is
+ * true for all cases where the free/alloc/free occurs in the one transaction,
+ * hence the addition of the ASSERT(tp->t_flags & XFS_TRANS_SYNC) to this case.
+ * This serves to catch violations of the second case quite effectively.
+ *
+ * The second case is where the free/alloc/free occur in different
+ * transactions. In this case, the thread freeing the extent the second time
+ * can't mark the extent busy immediately because it is already tracked in a
+ * transaction that may be committing.  When the log commit for the existing
+ * busy extent completes, the busy extent will be removed from the tree. If we
+ * allow the second busy insert to continue using that busy extent structure,
+ * it can be freed before this transaction is safely in the log.  Hence our
+ * only option in this case is to force the log to remove the existing busy
+ * extent from the list before we insert the new one with the current
+ * transaction ID.
+ *
+ * The problem we are trying to avoid in the free-alloc-free in separate
+ * transactions is most easily described with a timeline:
+ *
+ *      Thread 1        Thread 2        Thread 3        xfslogd
+ *      xact alloc
+ *      free X
+ *      mark busy
+ *      commit xact
+ *      free xact
+ *                      xact alloc
+ *                      alloc X
+ *                      busy search
+ *                      mark xact sync
+ *                      commit xact
+ *                      free xact
+ *                      force log
+ *                      checkpoint starts
+ *                      ....
+ *                                      xact alloc
+ *                                      free X
+ *                                      mark busy
+ *                                      finds match
+ *                                      *** KABOOM! ***
+ *                                      ....
+ *                                                      log IO completes
+ *                                                      unbusy X
+ *                      checkpoint completes
+ *
+ * By issuing a log force in thread 3 @ "KABOOM", the thread will block until
+ * the checkpoint completes, and the busy extent it matched will have been
+ * removed from the tree when it is woken. Hence it can then continue safely.
+ *
+ * However, to ensure this matching process is robust, we need to use the
+ * transaction ID for identifying transactions, as delayed logging results in
+ * the busy extent and transaction lifecycles being different. i.e. the busy
+ * extent is active for a lot longer than the transaction.  Hence the
+ * transaction structure can be freed and reallocated, then mark the same
+ * extent busy again in the new transaction. In this case the new transaction
+ * will have a different tid but can have the same address, and hence we need
+ * to check against the tid.
+ *
+ * Future: for delayed logging, we could avoid the log force if the extent was
+ * first freed in the current checkpoint sequence. This, however, requires the
+ * ability to pin the current checkpoint in memory until this transaction
+ * commits to ensure that both the original free and the current one combine
+ * logically into the one checkpoint. If the checkpoint sequences are
+ * different, however, we still need to wait on a log force.
 */
 void
-xfs_alloc_mark_busy(xfs_trans_t *tp,
+xfs_alloc_busy_insert(
-                    xfs_agnumber_t agno,
+        struct xfs_trans        *tp,
-                    xfs_agblock_t bno,
+        xfs_agnumber_t          agno,
-                    xfs_extlen_t len)
+        xfs_agblock_t           bno,
+        xfs_extlen_t            len)
 {
-        xfs_perag_busy_t        *bsy;
+        struct xfs_busy_extent  *new;
+        struct xfs_busy_extent  *busyp;
        struct xfs_perag        *pag;
-        int                     n;
+        struct rb_node          **rbp;
+        struct rb_node          *parent;
+        int                     match;
-        pag = xfs_perag_get(tp->t_mountp, agno);
-        spin_lock(&pag->pagb_lock);
-        /* search pagb_list for an open slot */
+        new = kmem_zalloc(sizeof(struct xfs_busy_extent), KM_MAYFAIL);
-        for (bsy = pag->pagb_list, n = 0;
+        if (!new) {
-             n < XFS_PAGB_NUM_SLOTS;
+                /*
-             bsy++, n++) {
+                 * No Memory!  Since it is now not possible to track the free
-                if (bsy->busy_tp == NULL) {
+                 * block, make this a synchronous transaction to ensure that
-                        break;
+                 * the block is not reused before this transaction commits.
-                }
+                 */
+                trace_xfs_alloc_busy(tp, agno, bno, len, 1);
+                xfs_trans_set_sync(tp);
+                return;
        }
-        trace_xfs_alloc_busy(tp->t_mountp, agno, bno, len, n);
+        new->agno = agno;
+        new->bno = bno;
+        new->length = len;
+        new->tid = xfs_log_get_trans_ident(tp);
-        if (n < XFS_PAGB_NUM_SLOTS) {
+        INIT_LIST_HEAD(&new->list);
-                bsy = &pag->pagb_list[n];
-                pag->pagb_count++;
+        /* trace before insert to be able to see failed inserts */
-                bsy->busy_start = bno;
+        trace_xfs_alloc_busy(tp, agno, bno, len, 0);
-                bsy->busy_length = len;
-                bsy->busy_tp = tp;
+        pag = xfs_perag_get(tp->t_mountp, new->agno);
-                xfs_trans_add_busy(tp, agno, n);
+restart:
-        } else {
+        spin_lock(&pag->pagb_lock);
+        rbp = &pag->pagb_tree.rb_node;
+        parent = NULL;
+        busyp = NULL;
+        match = 0;
+        while (*rbp && match >= 0) {
+                parent = *rbp;
+                busyp = rb_entry(parent, struct xfs_busy_extent, rb_node);
+                if (new->bno < busyp->bno) {
+                        /* may overlap, but exact start block is lower */
+                        rbp = &(*rbp)->rb_left;
+                        if (new->bno + new->length > busyp->bno)
+                                match = busyp->tid == new->tid ? 1 : -1;
+                } else if (new->bno > busyp->bno) {
+                        /* may overlap, but exact start block is higher */
+                        rbp = &(*rbp)->rb_right;
+                        if (bno < busyp->bno + busyp->length)
+                                match = busyp->tid == new->tid ? 1 : -1;
+                } else {
+                        match = busyp->tid == new->tid ? 1 : -1;
+                        break;
+                }
+        }
+        if (match < 0) {
+                /* overlap marked busy in different transaction */
+                spin_unlock(&pag->pagb_lock);
+                xfs_log_force(tp->t_mountp, XFS_LOG_SYNC);
+                goto restart;
+        }
+        if (match > 0) {
                /*
-                 * The busy list is full!  Since it is now not possible to
+                 * overlap marked busy in same transaction. Update if exact
-                 * track the free block, make this a synchronous transaction
+                 * start block match, otherwise combine the busy extents into
-                 * to insure that the block is not reused before this
+                 * a single range.
-                 * transaction commits.
                 */
-                xfs_trans_set_sync(tp);
+                if (busyp->bno == new->bno) {
-        }
+                        busyp->length = max(busyp->length, new->length);
+                        spin_unlock(&pag->pagb_lock);
+                        ASSERT(tp->t_flags & XFS_TRANS_SYNC);
+                        xfs_perag_put(pag);
+                        kmem_free(new);
+                        return;
+                }
+                rb_erase(&busyp->rb_node, &pag->pagb_tree);
+                new->length = max(busyp->bno + busyp->length,
+                                        new->bno + new->length) -
+                                min(busyp->bno, new->bno);
+                new->bno = min(busyp->bno, new->bno);
+        } else
+                busyp = NULL;
+        rb_link_node(&new->rb_node, parent, rbp);
+        rb_insert_color(&new->rb_node, &pag->pagb_tree);
+        list_add(&new->list, &tp->t_busy);
        spin_unlock(&pag->pagb_lock);
        xfs_perag_put(pag);
+        kmem_free(busyp);
 }
-void
+/*
-xfs_alloc_clear_busy(xfs_trans_t *tp,
+ * Search for a busy extent within the range of the extent we are about to
-                     xfs_agnumber_t agno,
+ * allocate.  You need to be holding the busy extent tree lock when calling
-                     int idx)
+ * xfs_alloc_busy_search(). This function returns 0 for no overlapping busy
+ * extent, -1 for an overlapping but not exact busy extent, and 1 for an exact
+ * match. This is done so that a non-zero return indicates an overlap that
+ * will require a synchronous transaction, but it can still be
+ * used to distinguish between a partial or exact match.
+ */
+static int
+xfs_alloc_busy_search(
+        struct xfs_mount        *mp,
+        xfs_agnumber_t          agno,
+        xfs_agblock_t           bno,
+        xfs_extlen_t            len)
 {
        struct xfs_perag        *pag;
-        xfs_perag_busy_t        *list;
+        struct rb_node          *rbp;
+        struct xfs_busy_extent  *busyp;
+        int                     match = 0;
-        ASSERT(idx < XFS_PAGB_NUM_SLOTS);
+        pag = xfs_perag_get(mp, agno);
-        pag = xfs_perag_get(tp->t_mountp, agno);
        spin_lock(&pag->pagb_lock);
-        list = pag->pagb_list;
-        trace_xfs_alloc_unbusy(tp->t_mountp, agno, idx, list[idx].busy_tp == tp);
+        rbp = pag->pagb_tree.rb_node;
-        if (list[idx].busy_tp == tp) {
+        /* find closest start bno overlap */
-                list[idx].busy_tp = NULL;
+        while (rbp) {
-                pag->pagb_count--;
+                busyp = rb_entry(rbp, struct xfs_busy_extent, rb_node);
+                if (bno < busyp->bno) {
+                        /* may overlap, but exact start block is lower */
+                        if (bno + len > busyp->bno)
+                                match = -1;
+                        rbp = rbp->rb_left;
+                } else if (bno > busyp->bno) {
+                        /* may overlap, but exact start block is higher */
+                        if (bno < busyp->bno + busyp->length)
+                                match = -1;
+                        rbp = rbp->rb_right;
+                } else {
+                        /* bno matches busyp, length determines exact match */
+                        match = (busyp->length == len) ? 1 : -1;
+                        break;
+                }
        }
        spin_unlock(&pag->pagb_lock);
+        trace_xfs_alloc_busysearch(mp, agno, bno, len, !!match);
        xfs_perag_put(pag);
+        return match;
 }
+void
-/*
+xfs_alloc_busy_clear(
- * If we find the extent in the busy list, force the log out to get the
+        struct xfs_mount        *mp,
- * extent out of the busy list so the caller can use it straight away.
+        struct xfs_busy_extent  *busyp)
- */
-STATIC void
-xfs_alloc_search_busy(xfs_trans_t *tp,
-                    xfs_agnumber_t agno,
-                    xfs_agblock_t bno,
-                    xfs_extlen_t len)
 {
        struct xfs_perag        *pag;
-        xfs_perag_busy_t        *bsy;
-        xfs_agblock_t           uend, bend;
-        xfs_lsn_t               lsn = 0;
-        int                     cnt;
-        pag = xfs_perag_get(tp->t_mountp, agno);
+        trace_xfs_alloc_unbusy(mp, busyp->agno, busyp->bno,
-        spin_lock(&pag->pagb_lock);
+                                                busyp->length);
-        cnt = pag->pagb_count;
-        /*
+        ASSERT(xfs_alloc_busy_search(mp, busyp->agno, busyp->bno,
-         * search pagb_list for this slot, skipping open slots. We have to
+                                                busyp->length) == 1);
-         * search the entire array as there may be multiple overlaps and
-         * we have to get the most recent LSN for the log force to push out
-         * all the transactions that span the range.
-         */
-        uend = bno + len - 1;
-        for (cnt = 0; cnt < pag->pagb_count; cnt++) {
-                bsy = &pag->pagb_list[cnt];
-                if (!bsy->busy_tp)
-                        continue;
-                bend = bsy->busy_start + bsy->busy_length - 1;
+        list_del_init(&busyp->list);
-                if (bno > bend || uend < bsy->busy_start)
-                        continue;
-                /* (start1,length1) within (start2, length2) */
+        pag = xfs_perag_get(mp, busyp->agno);
-                if (XFS_LSN_CMP(bsy->busy_tp->t_commit_lsn, lsn) > 0)
+        spin_lock(&pag->pagb_lock);
-                        lsn = bsy->busy_tp->t_commit_lsn;
+        rb_erase(&busyp->rb_node, &pag->pagb_tree);
-        }
        spin_unlock(&pag->pagb_lock);
        xfs_perag_put(pag);
-        trace_xfs_alloc_busysearch(tp->t_mountp, agno, bno, len, lsn);
-        /*
+        kmem_free(busyp);
-         * If a block was found, force the log through the LSN of the
-         * transaction that freed the block
-         */
-        if (lsn)
-                xfs_log_force_lsn(tp->t_mountp, lsn, XFS_LOG_SYNC);
 }
diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h
index 599bffa39784..6d05199b667c 100644
--- a/fs/xfs/xfs_alloc.h
+++ b/fs/xfs/xfs_alloc.h
@@ -22,6 +22,7 @@ struct xfs_buf;
 struct xfs_mount;
 struct xfs_perag;
 struct xfs_trans;
+struct xfs_busy_extent;
 /*
 * Freespace allocation types.  Argument to xfs_alloc_[v]extent.
@@ -119,15 +120,13 @@ xfs_alloc_longest_free_extent(struct xfs_mount *mp,
 #ifdef __KERNEL__
 void
-xfs_alloc_mark_busy(xfs_trans_t *tp,
+xfs_alloc_busy_insert(xfs_trans_t *tp,
                xfs_agnumber_t agno,
                xfs_agblock_t bno,
                xfs_extlen_t len);
 void
-xfs_alloc_clear_busy(xfs_trans_t *tp,
+xfs_alloc_busy_clear(struct xfs_mount *mp, struct xfs_busy_extent *busyp);
-                xfs_agnumber_t ag,
-                int idx);
 #endif  /* __KERNEL__ */
diff --git a/fs/xfs/xfs_alloc_btree.c b/fs/xfs/xfs_alloc_btree.c
index b726e10d2c1c..83f494218759 100644
--- a/fs/xfs/xfs_alloc_btree.c
+++ b/fs/xfs/xfs_alloc_btree.c
@@ -134,7 +134,7 @@ xfs_allocbt_free_block(
         * disk. If a busy block is allocated, the iclog is pushed up to the
         * LSN that freed the block.
         */
-        xfs_alloc_mark_busy(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1);
+        xfs_alloc_busy_insert(cur->bc_tp, be32_to_cpu(agf->agf_seqno), bno, 1);
        xfs_trans_agbtree_delta(cur->bc_tp, -1);
        return 0;
 }
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 240340a4727b..02a80984aa05 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -64,7 +64,7 @@ xfs_buf_item_log_debug(
        nbytes = last - first + 1;
        bfset(bip->bli_logged, first, nbytes);
        for (x = 0; x < nbytes; x++) {
-                chunk_num = byte >> XFS_BLI_SHIFT;
+                chunk_num = byte >> XFS_BLF_SHIFT;
                word_num = chunk_num >> BIT_TO_WORD_SHIFT;
                bit_num = chunk_num & (NBWORD - 1);
                wordp = &(bip->bli_format.blf_data_map[word_num]);
@@ -166,7 +166,7 @@ xfs_buf_item_size(
                 * cancel flag in it.
                 */
                trace_xfs_buf_item_size_stale(bip);
-                ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL);
+                ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
                return 1;
        }
@@ -197,9 +197,9 @@ xfs_buf_item_size(
                } else if (next_bit != last_bit + 1) {
                        last_bit = next_bit;
                        nvecs++;
-                } else if (xfs_buf_offset(bp, next_bit * XFS_BLI_CHUNK) !=
+                } else if (xfs_buf_offset(bp, next_bit * XFS_BLF_CHUNK) !=
-                           (xfs_buf_offset(bp, last_bit * XFS_BLI_CHUNK) +
+                           (xfs_buf_offset(bp, last_bit * XFS_BLF_CHUNK) +
-                            XFS_BLI_CHUNK)) {
+                            XFS_BLF_CHUNK)) {
                        last_bit = next_bit;
                        nvecs++;
                } else {
@@ -254,6 +254,20 @@ xfs_buf_item_format(
        vecp++;
        nvecs = 1;
+        /*
+         * If it is an inode buffer, transfer the in-memory state to the
+         * format flags and clear the in-memory state. We do not transfer
+         * this state if the inode buffer allocation has not yet been committed
+         * to the log as setting the XFS_BLI_INODE_BUF flag will prevent
+         * correct replay of the inode allocation.
+         */
+        if (bip->bli_flags & XFS_BLI_INODE_BUF) {
+                if (!((bip->bli_flags & XFS_BLI_INODE_ALLOC_BUF) &&
+                      xfs_log_item_in_current_chkpt(&bip->bli_item)))
+                        bip->bli_format.blf_flags |= XFS_BLF_INODE_BUF;
+                bip->bli_flags &= ~XFS_BLI_INODE_BUF;
+        }
        if (bip->bli_flags & XFS_BLI_STALE) {
                /*
                 * The buffer is stale, so all we need to log
@@ -261,7 +275,7 @@ xfs_buf_item_format(
                 * cancel flag in it.
                 */
                trace_xfs_buf_item_format_stale(bip);
-                ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL);
+                ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
                bip->bli_format.blf_size = nvecs;
                return;
        }
@@ -294,28 +308,28 @@ xfs_buf_item_format(
                 * keep counting and scanning.
                 */
                if (next_bit == -1) {
-                        buffer_offset = first_bit * XFS_BLI_CHUNK;
+                        buffer_offset = first_bit * XFS_BLF_CHUNK;
                        vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
-                        vecp->i_len = nbits * XFS_BLI_CHUNK;
+                        vecp->i_len = nbits * XFS_BLF_CHUNK;
                        vecp->i_type = XLOG_REG_TYPE_BCHUNK;
                        nvecs++;
                        break;
                } else if (next_bit != last_bit + 1) {
-                        buffer_offset = first_bit * XFS_BLI_CHUNK;
+                        buffer_offset = first_bit * XFS_BLF_CHUNK;
                        vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
-                        vecp->i_len = nbits * XFS_BLI_CHUNK;
+                        vecp->i_len = nbits * XFS_BLF_CHUNK;
                        vecp->i_type = XLOG_REG_TYPE_BCHUNK;
                        nvecs++;
                        vecp++;
                        first_bit = next_bit;
                        last_bit = next_bit;
                        nbits = 1;
-                } else if (xfs_buf_offset(bp, next_bit << XFS_BLI_SHIFT) !=
+                } else if (xfs_buf_offset(bp, next_bit << XFS_BLF_SHIFT) !=
-                           (xfs_buf_offset(bp, last_bit << XFS_BLI_SHIFT) +
+                           (xfs_buf_offset(bp, last_bit << XFS_BLF_SHIFT) +
-                            XFS_BLI_CHUNK)) {
+                            XFS_BLF_CHUNK)) {
-                        buffer_offset = first_bit * XFS_BLI_CHUNK;
+                        buffer_offset = first_bit * XFS_BLF_CHUNK;
                        vecp->i_addr = xfs_buf_offset(bp, buffer_offset);
-                        vecp->i_len = nbits * XFS_BLI_CHUNK;
+                        vecp->i_len = nbits * XFS_BLF_CHUNK;
                        vecp->i_type = XLOG_REG_TYPE_BCHUNK;
 /* You would think we need to bump the nvecs here too, but we do not
 * this number is used by recovery, and it gets confused by the boundary
@@ -341,10 +355,15 @@ xfs_buf_item_format(
 }
 /*
- * This is called to pin the buffer associated with the buf log
+ * This is called to pin the buffer associated with the buf log item in memory
- * item in memory so it cannot be written out.  Simply call bpin()
+ * so it cannot be written out.  Simply call bpin() on the buffer to do this.
- * on the buffer to do this.
+ *
+ * We also always take a reference to the buffer log item here so that the bli
+ * is held while the item is pinned in memory. This means that we can
+ * unconditionally drop the reference count a transaction holds when the
+ * transaction is completed.
 */
 STATIC void
 xfs_buf_item_pin(
        xfs_buf_log_item_t      *bip)
@@ -356,6 +375,7 @@ xfs_buf_item_pin(
        ASSERT(atomic_read(&bip->bli_refcount) > 0);
        ASSERT((bip->bli_flags & XFS_BLI_LOGGED) ||
               (bip->bli_flags & XFS_BLI_STALE));
+        atomic_inc(&bip->bli_refcount);
        trace_xfs_buf_item_pin(bip);
        xfs_bpin(bp);
 }
@@ -393,7 +413,7 @@ xfs_buf_item_unpin(
                ASSERT(XFS_BUF_VALUSEMA(bp) <= 0);
                ASSERT(!(XFS_BUF_ISDELAYWRITE(bp)));
                ASSERT(XFS_BUF_ISSTALE(bp));
-                ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL);
+                ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
                trace_xfs_buf_item_unpin_stale(bip);
                /*
@@ -489,20 +509,23 @@ xfs_buf_item_trylock(
 }
 /*
- * Release the buffer associated with the buf log item.
+ * Release the buffer associated with the buf log item.  If there is no dirty
- * If there is no dirty logged data associated with the
+ * logged data associated with the buffer recorded in the buf log item, then
- * buffer recorded in the buf log item, then free the
+ * free the buf log item and remove the reference to it in the buffer.
- * buf log item and remove the reference to it in the
+ *
- * buffer.
+ * This call ignores the recursion count.  It is only called when the buffer
+ * should REALLY be unlocked, regardless of the recursion count.
 *
- * This call ignores the recursion count.  It is only called
+ * We unconditionally drop the transaction's reference to the log item. If the
- * when the buffer should REALLY be unlocked, regardless
+ * item was logged, then another reference was taken when it was pinned, so we
- * of the recursion count.
+ * can safely drop the transaction reference now.  This also allows us to avoid
+ * potential races with the unpin code freeing the bli by not referencing the
+ * bli after we've dropped the reference count.
 *
- * If the XFS_BLI_HOLD flag is set in the buf log item, then
+ * If the XFS_BLI_HOLD flag is set in the buf log item, then free the log item
- * free the log item if necessary but do not unlock the buffer.
+ * if necessary but do not unlock the buffer.  This is for support of
- * This is for support of xfs_trans_bhold(). Make sure the
+ * xfs_trans_bhold(). Make sure the XFS_BLI_HOLD field is cleared if we don't
- * XFS_BLI_HOLD field is cleared if we don't free the item.
+ * free the item.
 */
 STATIC void
 xfs_buf_item_unlock(
@@ -514,73 +537,54 @@ xfs_buf_item_unlock(
        bp = bip->bli_buf;
-        /*
+        /* Clear the buffer's association with this transaction. */
-         * Clear the buffer's association with this transaction.
-         */
        XFS_BUF_SET_FSPRIVATE2(bp, NULL);
        /*
-         * If this is a transaction abort, don't return early.
+         * If this is a transaction abort, don't return early.  Instead, allow
-         * Instead, allow the brelse to happen.
+         * the brelse to happen.  Normally it would be done for stale
-         * Normally it would be done for stale (cancelled) buffers
+         * (cancelled) buffers at unpin time, but we'll never go through the
-         * at unpin time, but we'll never go through the pin/unpin
+         * pin/unpin cycle if we abort inside commit.
-         * cycle if we abort inside commit.
         */
        aborted = (bip->bli_item.li_flags & XFS_LI_ABORTED) != 0;
        /*
-         * If the buf item is marked stale, then don't do anything.
+         * Before possibly freeing the buf item, determine if we should
-         * We'll unlock the buffer and free the buf item when the
+         * release the buffer at the end of this routine.
-         * buffer is unpinned for the last time.
         */
-        if (bip->bli_flags & XFS_BLI_STALE) {
+        hold = bip->bli_flags & XFS_BLI_HOLD;
-                bip->bli_flags &= ~XFS_BLI_LOGGED;
-                trace_xfs_buf_item_unlock_stale(bip);
+        /* Clear the per transaction state. */
-                ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL);
+        bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_HOLD);
-                if (!aborted)
-                        return;
-        }
        /*
-         * Drop the transaction's reference to the log item if
+         * If the buf item is marked stale, then don't do anything.  We'll
-         * it was not logged as part of the transaction.  Otherwise
+         * unlock the buffer and free the buf item when the buffer is unpinned
-         * we'll drop the reference in xfs_buf_item_unpin() when
+         * for the last time.
-         * the transaction is really through with the buffer.
         */
-        if (!(bip->bli_flags & XFS_BLI_LOGGED)) {
+        if (bip->bli_flags & XFS_BLI_STALE) {
-                atomic_dec(&bip->bli_refcount);
+                trace_xfs_buf_item_unlock_stale(bip);
-        } else {
+                ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
-                /*
+                if (!aborted) {
-                 * Clear the logged flag since this is per
+                        atomic_dec(&bip->bli_refcount);
-                 * transaction state.
+                        return;
-                 */
+                }
-                bip->bli_flags &= ~XFS_BLI_LOGGED;
        }
-        /*
-         * Before possibly freeing the buf item, determine if we should
-         * release the buffer at the end of this routine.
-         */
-        hold = bip->bli_flags & XFS_BLI_HOLD;
        trace_xfs_buf_item_unlock(bip);
        /*
-         * If the buf item isn't tracking any data, free it.
+         * If the buf item isn't tracking any data, free it, otherwise drop the
-         * Otherwise, if XFS_BLI_HOLD is set clear it.
+         * reference we hold to it.
         */
        if (xfs_bitmap_empty(bip->bli_format.blf_data_map,
-                             bip->bli_format.blf_map_size)) {
+                             bip->bli_format.blf_map_size))
                xfs_buf_item_relse(bp);
-        } else if (hold) {
+        else
-                bip->bli_flags &= ~XFS_BLI_HOLD;
+                atomic_dec(&bip->bli_refcount);
-        }
-        /*
+        if (!hold)
-         * Release the buffer if XFS_BLI_HOLD was not set.
-         */
-        if (!hold) {
                xfs_buf_relse(bp);
-        }
 }
 /*
@@ -717,12 +721,12 @@ xfs_buf_item_init(
        }
        /*
-         * chunks is the number of XFS_BLI_CHUNK size pieces
+         * chunks is the number of XFS_BLF_CHUNK size pieces
         * the buffer can be divided into. Make sure not to
         * truncate any pieces.  map_size is the size of the
         * bitmap needed to describe the chunks of the buffer.
         */
-        chunks = (int)((XFS_BUF_COUNT(bp) + (XFS_BLI_CHUNK - 1)) >> XFS_BLI_SHIFT);
+        chunks = (int)((XFS_BUF_COUNT(bp) + (XFS_BLF_CHUNK - 1)) >> XFS_BLF_SHIFT);
        map_size = (int)((chunks + NBWORD) >> BIT_TO_WORD_SHIFT);
        bip = (xfs_buf_log_item_t*)kmem_zone_zalloc(xfs_buf_item_zone,
@@ -790,8 +794,8 @@ xfs_buf_item_log(
        /*
         * Convert byte offsets to bit numbers.
         */
-        first_bit = first >> XFS_BLI_SHIFT;
+        first_bit = first >> XFS_BLF_SHIFT;
-        last_bit = last >> XFS_BLI_SHIFT;
+        last_bit = last >> XFS_BLF_SHIFT;
        /*
         * Calculate the total number of bits to be set.
diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h
index df4454511f73..f20bb472d582 100644
--- a/fs/xfs/xfs_buf_item.h
+++ b/fs/xfs/xfs_buf_item.h
@@ -41,22 +41,22 @@ typedef struct xfs_buf_log_format {
 * This flag indicates that the buffer contains on disk inodes
 * and requires special recovery handling.
 */
-#define XFS_BLI_INODE_BUF       0x1
+#define XFS_BLF_INODE_BUF       0x1
 /*
 * This flag indicates that the buffer should not be replayed
 * during recovery because its blocks are being freed.
 */
-#define XFS_BLI_CANCEL          0x2
+#define XFS_BLF_CANCEL          0x2
 /*
 * This flag indicates that the buffer contains on disk
 * user or group dquots and may require special recovery handling.
 */
-#define XFS_BLI_UDQUOT_BUF      0x4
+#define XFS_BLF_UDQUOT_BUF      0x4
-#define XFS_BLI_PDQUOT_BUF      0x8
+#define XFS_BLF_PDQUOT_BUF      0x8
-#define XFS_BLI_GDQUOT_BUF      0x10
+#define XFS_BLF_GDQUOT_BUF      0x10
-#define XFS_BLI_CHUNK           128
+#define XFS_BLF_CHUNK           128
-#define XFS_BLI_SHIFT           7
+#define XFS_BLF_SHIFT           7
 #define BIT_TO_WORD_SHIFT       5
 #define NBWORD                  (NBBY * sizeof(unsigned int))
@@ -69,6 +69,7 @@ typedef struct xfs_buf_log_format {
 #define XFS_BLI_LOGGED          0x08
 #define XFS_BLI_INODE_ALLOC_BUF 0x10
 #define XFS_BLI_STALE_INODE     0x20
+#define XFS_BLI_INODE_BUF       0x40
 #define XFS_BLI_FLAGS \
        { XFS_BLI_HOLD,         "HOLD" }, \
@@ -76,7 +77,8 @@ typedef struct xfs_buf_log_format {
        { XFS_BLI_STALE,        "STALE" }, \
        { XFS_BLI_LOGGED,       "LOGGED" }, \
        { XFS_BLI_INODE_ALLOC_BUF, "INODE_ALLOC" }, \
-        { XFS_BLI_STALE_INODE,  "STALE_INODE" }
+        { XFS_BLI_STALE_INODE,  "STALE_INODE" }, \
+        { XFS_BLI_INODE_BUF,    "INODE_BUF" }
 #ifdef __KERNEL__
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index ef96175c0744..047b8a8e5c29 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -170,7 +170,7 @@ xfs_cmn_err(int panic_tag, int level, xfs_mount_t *mp, char *fmt, ...)
        va_list ap;
 #ifdef DEBUG
-        xfs_panic_mask |= XFS_PTAG_SHUTDOWN_CORRUPT;
+        xfs_panic_mask |= (XFS_PTAG_SHUTDOWN_CORRUPT | XFS_PTAG_LOGRES);
 #endif
        if (xfs_panic_mask && (xfs_panic_mask & panic_tag)
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 3038dd52c72a..5215abc8023a 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -54,9 +54,6 @@ STATIC xlog_t *  xlog_alloc_log(xfs_mount_t	*mp,
 STATIC int       xlog_space_left(xlog_t *log, int cycle, int bytes);
 STATIC int       xlog_sync(xlog_t *log, xlog_in_core_t *iclog);
 STATIC void      xlog_dealloc_log(xlog_t *log);
-STATIC int       xlog_write(struct log *log, struct xfs_log_vec *log_vector,
-                            struct xlog_ticket *tic, xfs_lsn_t *start_lsn,
-                            xlog_in_core_t **commit_iclog, uint flags);
 /* local state machine functions */
 STATIC void xlog_state_done_syncing(xlog_in_core_t *iclog, int);
@@ -86,14 +83,6 @@ STATIC int xlog_regrant_write_log_space(xlog_t		*log,
 STATIC void xlog_ungrant_log_space(xlog_t        *log,
                                   xlog_ticket_t *ticket);
-/* local ticket functions */
-STATIC xlog_ticket_t    *xlog_ticket_alloc(xlog_t *log,
-                                         int    unit_bytes,
-                                         int    count,
-                                         char   clientid,
-                                         uint   flags);
 #if defined(DEBUG)
 STATIC void     xlog_verify_dest_ptr(xlog_t *log, char *ptr);
 STATIC void     xlog_verify_grant_head(xlog_t *log, int equals);
@@ -360,6 +349,15 @@ xfs_log_reserve(
                ASSERT(flags & XFS_LOG_PERM_RESERV);
                internal_ticket = *ticket;
+                /*
+                 * this is a new transaction on the ticket, so we need to
+                 * change the transaction ID so that the next transaction has a
+                 * different TID in the log. Just add one to the existing tid
+                 * so that we can see chains of rolling transactions in the log
+                 * easily.
+                 */
+                internal_ticket->t_tid++;
                trace_xfs_log_reserve(log, internal_ticket);
                xlog_grant_push_ail(mp, internal_ticket->t_unit_res);
@@ -367,7 +365,8 @@ xfs_log_reserve(
        } else {
                /* may sleep if need to allocate more tickets */
                internal_ticket = xlog_ticket_alloc(log, unit_bytes, cnt,
-                                                  client, flags);
+                                                  client, flags,
+                                                  KM_SLEEP|KM_MAYFAIL);
                if (!internal_ticket)
                        return XFS_ERROR(ENOMEM);
                internal_ticket->t_trans_type = t_type;
@@ -452,6 +451,13 @@ xfs_log_mount(
        /* Normal transactions can now occur */
        mp->m_log->l_flags &= ~XLOG_ACTIVE_RECOVERY;
+        /*
+         * Now the log has been fully initialised and we know where our
+         * space grant counters are, we can initialise the permanent ticket
+         * needed for delayed logging to work.
+         */
+        xlog_cil_init_post_recovery(mp->m_log);
        return 0;
 out_destroy_ail:
@@ -658,6 +664,10 @@ xfs_log_item_init(
        item->li_ailp = mp->m_ail;
        item->li_type = type;
        item->li_ops = ops;
+        item->li_lv = NULL;
+        INIT_LIST_HEAD(&item->li_ail);
+        INIT_LIST_HEAD(&item->li_cil);
 }
 /*
@@ -1168,6 +1178,9 @@ xlog_alloc_log(xfs_mount_t	*mp,
        *iclogp = log->l_iclog;                 /* complete ring */
        log->l_iclog->ic_prev = prev_iclog;     /* re-write 1st prev ptr */
+        error = xlog_cil_init(log);
+        if (error)
+                goto out_free_iclog;
        return log;
 out_free_iclog:
@@ -1494,6 +1507,8 @@ xlog_dealloc_log(xlog_t *log)
        xlog_in_core_t  *iclog, *next_iclog;
        int             i;
+        xlog_cil_destroy(log);
        iclog = log->l_iclog;
        for (i=0; i<log->l_iclog_bufs; i++) {
                sv_destroy(&iclog->ic_force_wait);
@@ -1536,8 +1551,10 @@ xlog_state_finish_copy(xlog_t		*log,
 * print out info relating to regions written which consume
 * the reservation
 */
-STATIC void
+void
-xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket)
+xlog_print_tic_res(
+        struct xfs_mount        *mp,
+        struct xlog_ticket      *ticket)
 {
        uint i;
        uint ophdr_spc = ticket->t_res_num_ophdrs * (uint)sizeof(xlog_op_header_t);
@@ -1637,6 +1654,10 @@ xlog_print_tic_res(xfs_mount_t *mp, xlog_ticket_t *ticket)
                            "bad-rtype" : res_type_str[r_type-1]),
                            ticket->t_res_arr[i].r_len);
        }
+        xfs_cmn_err(XFS_PTAG_LOGRES, CE_ALERT, mp,
+                "xfs_log_write: reservation ran out. Need to up reservation");
+        xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
 }
 /*
@@ -1865,7 +1886,7 @@ xlog_write_copy_finish(
 *      we don't update ic_offset until the end when we know exactly how many
 *      bytes have been written out.
 */
-STATIC int
+int
 xlog_write(
        struct log              *log,
        struct xfs_log_vec      *log_vector,
@@ -1889,22 +1910,26 @@ xlog_write(
        *start_lsn = 0;
        len = xlog_write_calc_vec_length(ticket, log_vector);
-        if (ticket->t_curr_res < len) {
+        if (log->l_cilp) {
-                xlog_print_tic_res(log->l_mp, ticket);
+                /*
-#ifdef DEBUG
+                 * Region headers and bytes are already accounted for.
-                xlog_panic(
+                 * We only need to take into account start records and
-        "xfs_log_write: reservation ran out. Need to up reservation");
+                 * split regions in this function.
-#else
+                 */
-                /* Customer configurable panic */
+                if (ticket->t_flags & XLOG_TIC_INITED)
-                xfs_cmn_err(XFS_PTAG_LOGRES, CE_ALERT, log->l_mp,
+                        ticket->t_curr_res -= sizeof(xlog_op_header_t);
-        "xfs_log_write: reservation ran out. Need to up reservation");
-                /* If we did not panic, shutdown the filesystem */
+                /*
-                xfs_force_shutdown(log->l_mp, SHUTDOWN_CORRUPT_INCORE);
+                 * Commit record headers need to be accounted for. These
-#endif
+                 * come in as separate writes so are easy to detect.
-        }
+                 */
+                if (flags & (XLOG_COMMIT_TRANS | XLOG_UNMOUNT_TRANS))
+                        ticket->t_curr_res -= sizeof(xlog_op_header_t);
+        } else
+                ticket->t_curr_res -= len;
-        ticket->t_curr_res -= len;
+        if (ticket->t_curr_res < 0)
+                xlog_print_tic_res(log->l_mp, ticket);
        index = 0;
        lv = log_vector;
@@ -3000,6 +3025,8 @@ _xfs_log_force(
        XFS_STATS_INC(xs_log_force);
+        xlog_cil_push(log, 1);
        spin_lock(&log->l_icloglock);
        iclog = log->l_iclog;
@@ -3149,6 +3176,12 @@ _xfs_log_force_lsn(
        XFS_STATS_INC(xs_log_force);
+        if (log->l_cilp) {
+                lsn = xlog_cil_push_lsn(log, lsn);
+                if (lsn == NULLCOMMITLSN)
+                        return 0;
+        }
 try_again:
        spin_lock(&log->l_icloglock);
        iclog = log->l_iclog;
@@ -3313,22 +3346,30 @@ xfs_log_ticket_get(
        return ticket;
 }
+xlog_tid_t
+xfs_log_get_trans_ident(
+        struct xfs_trans        *tp)
+{
+        return tp->t_ticket->t_tid;
+}
 /*
 * Allocate and initialise a new log ticket.
 */
-STATIC xlog_ticket_t *
+xlog_ticket_t *
 xlog_ticket_alloc(
        struct log      *log,
        int             unit_bytes,
        int             cnt,
        char            client,
-        uint            xflags)
+        uint            xflags,
+        int             alloc_flags)
 {
        struct xlog_ticket *tic;
        uint            num_headers;
        int             iclog_space;
-        tic = kmem_zone_zalloc(xfs_log_ticket_zone, KM_SLEEP|KM_MAYFAIL);
+        tic = kmem_zone_zalloc(xfs_log_ticket_zone, alloc_flags);
        if (!tic)
                return NULL;
@@ -3647,6 +3688,11 @@ xlog_state_ioerror(
 *      c. nothing new gets queued up after (a) and (b) are done.
 *      d. if !logerror, flush the iclogs to disk, then seal them off
 *         for business.
+ *
+ * Note: for delayed logging the !logerror case needs to flush the regions
+ * held in memory out to the iclogs before flushing them to disk. This needs
+ * to be done before the log is marked as shutdown, otherwise the flush to the
+ * iclogs will fail.
 */
 int
 xfs_log_force_umount(
@@ -3680,6 +3726,16 @@ xfs_log_force_umount(
                return 1;
        }
        retval = 0;
+        /*
+         * Flush the in memory commit item list before marking the log as
+         * being shut down. We need to do it in this order to ensure all the
+         * completed transactions are flushed to disk with the xfs_log_force()
+         * call below.
+         */
+        if (!logerror && (mp->m_flags & XFS_MOUNT_DELAYLOG))
+                xlog_cil_push(log, 1);
        /*
         * We must hold both the GRANT lock and the LOG lock,
         * before we mark the filesystem SHUTDOWN and wake
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 229d1f36ba9a..04c78e642cc8 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -19,7 +19,6 @@
 #define __XFS_LOG_H__
 /* get lsn fields */
 #define CYCLE_LSN(lsn) ((uint)((lsn)>>32))
 #define BLOCK_LSN(lsn) ((uint)(lsn))
@@ -114,6 +113,9 @@ struct xfs_log_vec {
        struct xfs_log_vec      *lv_next;       /* next lv in build list */
        int                     lv_niovecs;     /* number of iovecs in lv */
        struct xfs_log_iovec    *lv_iovecp;     /* iovec array */
+        struct xfs_log_item     *lv_item;       /* owner */
+        char                    *lv_buf;        /* formatted buffer */
+        int                     lv_buf_len;     /* size of formatted buffer */
 };
 /*
@@ -134,6 +136,7 @@ struct xlog_in_core;
 struct xlog_ticket;
 struct xfs_log_item;
 struct xfs_item_ops;
+struct xfs_trans;
 void    xfs_log_item_init(struct xfs_mount      *mp,
                        struct xfs_log_item     *item,
@@ -187,9 +190,16 @@ int	  xfs_log_need_covered(struct xfs_mount *mp);
 void      xlog_iodone(struct xfs_buf *);
-struct xlog_ticket * xfs_log_ticket_get(struct xlog_ticket *ticket);
+struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket);
 void      xfs_log_ticket_put(struct xlog_ticket *ticket);
+xlog_tid_t xfs_log_get_trans_ident(struct xfs_trans *tp);
+int     xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
+                                struct xfs_log_vec *log_vector,
+                                xfs_lsn_t *commit_lsn, int flags);
+bool    xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
 #endif
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
new file mode 100644
index 000000000000..bb17cc044bf3
--- /dev/null
+++ b/fs/xfs/xfs_log_cil.c
@@ -0,0 +1,725 @@
+/*
+ * Copyright (c) 2010 Red Hat, Inc. All Rights Reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_types.h"
+#include "xfs_bit.h"
+#include "xfs_log.h"
+#include "xfs_inum.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_log_priv.h"
+#include "xfs_sb.h"
+#include "xfs_ag.h"
+#include "xfs_dir2.h"
+#include "xfs_dmapi.h"
+#include "xfs_mount.h"
+#include "xfs_error.h"
+#include "xfs_alloc.h"
+/*
+ * Set up the Committed Item List (CIL) for a log.
+ *
+ * log->l_cilp is always cleared first, so that on filesystems mounted
+ * without delayed logging it stays NULL and can be used as the "are we doing
+ * delayed logging?" test elsewhere.
+ *
+ * Returns 0 on success (including when delayed logging is disabled) and
+ * ENOMEM if either the CIL or its first checkpoint context cannot be
+ * allocated.
+ */
+int
+xlog_cil_init(
+        struct log      *log)
+{
+        struct xfs_cil          *new_cil;
+        struct xfs_cil_ctx      *first_ctx;
+        log->l_cilp = NULL;
+        if ((log->l_mp->m_flags & XFS_MOUNT_DELAYLOG) == 0)
+                return 0;
+        new_cil = kmem_zalloc(sizeof(*new_cil), KM_SLEEP|KM_MAYFAIL);
+        if (new_cil == NULL)
+                return ENOMEM;
+        first_ctx = kmem_zalloc(sizeof(*first_ctx), KM_SLEEP|KM_MAYFAIL);
+        if (first_ctx == NULL) {
+                kmem_free(new_cil);
+                return ENOMEM;
+        }
+        /* the CIL itself: item list, committing list, locks and wait queue */
+        INIT_LIST_HEAD(&new_cil->xc_cil);
+        INIT_LIST_HEAD(&new_cil->xc_committing);
+        spin_lock_init(&new_cil->xc_cil_lock);
+        init_rwsem(&new_cil->xc_ctx_lock);
+        sv_init(&new_cil->xc_commit_wait, SV_DEFAULT, "cilwait");
+        /* the first checkpoint context starts at sequence 1 */
+        INIT_LIST_HEAD(&first_ctx->committing);
+        INIT_LIST_HEAD(&first_ctx->busy_extents);
+        first_ctx->sequence = 1;
+        first_ctx->cil = new_cil;
+        /* link context, CIL and log together */
+        new_cil->xc_ctx = first_ctx;
+        new_cil->xc_log = log;
+        log->l_cilp = new_cil;
+        return 0;
+}
+/*
+ * Tear down the CIL attached to the log, if there is one. The ticket held by
+ * the current checkpoint context is dropped and the context freed before the
+ * CIL itself is freed. The CIL item list is expected to be empty by now.
+ */
+void
+xlog_cil_destroy(
+        struct log      *log)
+{
+        struct xfs_cil          *cil = log->l_cilp;
+        struct xfs_cil_ctx      *ctx;
+        if (!cil)
+                return;
+        ctx = cil->xc_ctx;
+        if (ctx) {
+                if (ctx->ticket)
+                        xfs_log_ticket_put(ctx->ticket);
+                kmem_free(ctx);
+        }
+        ASSERT(list_empty(&cil->xc_cil));
+        kmem_free(cil);
+}
+/*
+ * Allocate the ticket used by a checkpoint.
+ *
+ * Recovering from a failed ticket allocation would be really hard, so the
+ * allocation is not allowed to fail. It also happens in a context from which
+ * we do not want transactions to be issued, and the allocation flags tell the
+ * allocator so.
+ *
+ * No log space is reserved for the ticket; whatever is needed is stolen from
+ * transactions as they commit. The current reservation is therefore forced
+ * to zero, which is how the insert code recognises that the basic
+ * transaction overhead still has to be stolen from the first commit.
+ */
+static struct xlog_ticket *
+xlog_cil_ticket_alloc(
+        struct log      *log)
+{
+        struct xlog_ticket      *ckpt_tic;
+        ckpt_tic = xlog_ticket_alloc(log, 0, 1, XFS_TRANSACTION, 0,
+                                     KM_SLEEP|KM_NOFS);
+        /* zero reservation: the first commit has to donate the overhead */
+        ckpt_tic->t_curr_res = 0;
+        ckpt_tic->t_trans_type = XFS_TRANS_CHECKPOINT;
+        return ckpt_tic;
+}
+/*
+ * Finish setting up the first CIL checkpoint context.
+ *
+ * This can only run once the first stage of log recovery has located the
+ * head and tail of the log, because the context is seeded from the current
+ * log position.
+ *
+ * A log ticket is allocated here to track space usage during a CIL push. It
+ * is handed to xlog_write() directly, so that space used by log headers and
+ * by the extra region headers of split regions is accounted for and log
+ * space is not slowly leaked.
+ */
+void
+xlog_cil_init_post_recovery(
+        struct log      *log)
+{
+        struct xfs_cil_ctx      *ctx;
+        /* nothing to do when delayed logging is not in use */
+        if (!log->l_cilp)
+                return;
+        ctx = log->l_cilp->xc_ctx;
+        ctx->ticket = xlog_cil_ticket_alloc(log);
+        ctx->sequence = 1;
+        ctx->commit_lsn = xlog_assign_lsn(log->l_curr_cycle,
+                                          log->l_curr_block);
+}
+/*
+ * Insert the log item into the CIL and calculate the difference in space
+ * consumed by the item. Add the space to the checkpoint ticket and calculate
+ * if the change requires additional log metadata. If it does, take that space
+ * as well. Remove the amount of space we added to the checkpoint ticket from
+ * the current transaction ticket so that the accounting works out correctly.
+ *
+ * If this is the first time the item is being placed into the CIL in this
+ * context, pin it so it can't be written to disk until the CIL is flushed to
+ * the iclog and the iclog written to disk.
+ *
+ * log:    log whose CIL (log->l_cilp) the item is inserted into
+ * ticket: ticket of the committing transaction; space is taken from it
+ * item:   log item being committed (the only caller in this file passes
+ *         lv->lv_item, so item and lv->lv_item refer to the same object)
+ * lv:     newly formatted log vector for the item; it replaces, and causes
+ *         the freeing of, any log vector already attached to the item
+ */
+static void
+xlog_cil_insert(
+        struct log              *log,
+        struct xlog_ticket      *ticket,
+        struct xfs_log_item     *item,
+        struct xfs_log_vec      *lv)
+{
+        struct xfs_cil          *cil = log->l_cilp;
+        struct xfs_log_vec      *old = lv->lv_item->li_lv;
+        struct xfs_cil_ctx      *ctx = cil->xc_ctx;
+        int                     len;
+        int                     diff_iovecs;
+        int                     iclog_space;
+        if (old) {
+                /* existing lv on log item, space used is a delta */
+                ASSERT(!list_empty(&item->li_cil));
+                ASSERT(old->lv_buf && old->lv_buf_len && old->lv_niovecs);
+                len = lv->lv_buf_len - old->lv_buf_len;
+                diff_iovecs = lv->lv_niovecs - old->lv_niovecs;
+                kmem_free(old->lv_buf);
+                kmem_free(old);
+        } else {
+                /* new lv, must pin the log item */
+                ASSERT(!lv->lv_item->li_lv);
+                ASSERT(list_empty(&item->li_cil));
+                len = lv->lv_buf_len;
+                diff_iovecs = lv->lv_niovecs;
+                IOP_PIN(lv->lv_item);
+        }
+        /* one op header is accounted for each region added (or removed) */
+        len += diff_iovecs * sizeof(xlog_op_header_t);
+        /* attach new log vector to log item */
+        lv->lv_item->li_lv = lv;
+        spin_lock(&cil->xc_cil_lock);
+        /* (re)queue the item at the tail of the CIL, new or relogged alike */
+        list_move_tail(&item->li_cil, &cil->xc_cil);
+        ctx->nvecs += diff_iovecs;
+        /*
+         * If this is the first time the item is being committed to the CIL,
+         * store the sequence number on the log item so we can tell
+         * in future commits whether this is the first checkpoint the item is
+         * being committed into.
+         */
+        if (!item->li_seq)
+                item->li_seq = ctx->sequence;
+        /*
+         * Now transfer enough transaction reservation to the context ticket
+         * for the checkpoint. The context ticket is special - the unit
+         * reservation has to grow as well as the current reservation as we
+         * steal from tickets so we can correctly determine the space used
+         * during the transaction commit.
+         */
+        if (ctx->ticket->t_curr_res == 0) {
+                /* first commit in checkpoint, steal the header reservation */
+                ASSERT(ticket->t_curr_res >= ctx->ticket->t_unit_res + len);
+                ctx->ticket->t_curr_res = ctx->ticket->t_unit_res;
+                ticket->t_curr_res -= ctx->ticket->t_unit_res;
+        }
+        /*
+         * do we need space for more log record headers? This is the case
+         * when adding len moves space_used across a multiple of the usable
+         * iclog payload size (iclog size minus iclog header size).
+         */
+        iclog_space = log->l_iclog_size - log->l_iclog_hsize;
+        if (len > 0 && (ctx->space_used / iclog_space !=
+                                (ctx->space_used + len) / iclog_space)) {
+                int hdrs;
+                /* number of iclogs that len can span, rounded up */
+                hdrs = (len + iclog_space - 1) / iclog_space;
+                /* need to take into account split region headers, too */
+                hdrs *= log->l_iclog_hsize + sizeof(struct xlog_op_header);
+                ctx->ticket->t_unit_res += hdrs;
+                ctx->ticket->t_curr_res += hdrs;
+                ticket->t_curr_res -= hdrs;
+                ASSERT(ticket->t_curr_res >= len);
+        }
+        /* finally charge the transaction for the item space itself */
+        ticket->t_curr_res -= len;
+        ctx->space_used += len;
+        spin_unlock(&cil->xc_cil_lock);
+}
+/*
+ * Format log items into flat buffers
+ *
+ * For delayed logging, we need to hold a formatted buffer containing all the
+ * changes on the log item. This enables us to relog the item in memory and
+ * write it out asynchronously without needing to relock the object that was
+ * modified at the time it gets written into the iclog.
+ *
+ * This function builds a vector for the changes in each log item in the
+ * transaction. It then works out the length of the buffer needed for each log
+ * item, allocates them and formats the vector for the item into the buffer.
+ * The buffer is then attached to the log item, and the log item is inserted
+ * into the Committed Item List for tracking until the next checkpoint is
+ * written out.
+ *
+ * We don't set up region headers during this process; we simply copy the
+ * regions into the flat buffer. We can do this because we still have to do a
+ * formatting step to write the regions into the iclog buffer.  Writing the
+ * ophdrs during the iclog write means that we can support splitting large
+ * regions across iclog boundaries without needing a change in the format of
+ * the item/region encapsulation.
+ *
+ * Hence what we need to do now is rewrite the vector array to point
+ * to the copied region inside the buffer we just allocated. This allows us to
+ * format the regions into the iclog as though they are being formatted
+ * directly out of the objects themselves.
+ *
+ * log:        log whose CIL receives the items
+ * log_vector: chain (linked through lv_next) of log vectors to format
+ * ticket:     transaction ticket that pays for the space used
+ * start_lsn:  optional; when non-NULL it is set to the sequence number of the
+ *             current checkpoint context
+ */
+static void
+xlog_cil_format_items(
+        struct log              *log,
+        struct xfs_log_vec      *log_vector,
+        struct xlog_ticket      *ticket,
+        xfs_lsn_t               *start_lsn)
+{
+        struct xfs_log_vec *lv;
+        if (start_lsn)
+                *start_lsn = log->l_cilp->xc_ctx->sequence;
+        ASSERT(log_vector);
+        for (lv = log_vector; lv; lv = lv->lv_next) {
+                void    *ptr;
+                int     index;
+                int     len = 0;
+                /* build the vector array and calculate its length */
+                IOP_FORMAT(lv->lv_item, lv->lv_iovecp);
+                for (index = 0; index < lv->lv_niovecs; index++)
+                        len += lv->lv_iovecp[index].i_len;
+                lv->lv_buf_len = len;
+                lv->lv_buf = kmem_zalloc(lv->lv_buf_len, KM_SLEEP|KM_NOFS);
+                ptr = lv->lv_buf;
+                /* copy each region and repoint its iovec at the copy */
+                for (index = 0; index < lv->lv_niovecs; index++) {
+                        struct xfs_log_iovec *vec = &lv->lv_iovecp[index];
+                        memcpy(ptr, vec->i_addr, vec->i_len);
+                        vec->i_addr = ptr;
+                        ptr += vec->i_len;
+                }
+                /* the copies must have filled the buffer exactly */
+                ASSERT(ptr == lv->lv_buf + lv->lv_buf_len);
+                xlog_cil_insert(log, ticket, lv->lv_item, lv);
+        }
+}
+/*
+ * Free every log vector on the chain starting at log_vector, together with
+ * the formatted buffer attached to each one.
+ */
+static void
+xlog_cil_free_logvec(
+        struct xfs_log_vec      *log_vector)
+{
+        struct xfs_log_vec      *cur = log_vector;
+        while (cur) {
+                /* grab the link before the vector is freed */
+                struct xfs_log_vec *following = cur->lv_next;
+                kmem_free(cur->lv_buf);
+                kmem_free(cur);
+                cur = following;
+        }
+}
+/*
+ * Commit a transaction with the given vector to the Committed Item List.
+ *
+ * To do this, we need to format the item, pin it in memory if required and
+ * account for the space used by the transaction. Once we have done that we
+ * need to release the unused reservation for the transaction, attach the
+ * transaction to the checkpoint context so we carry the busy extents through
+ * to checkpoint completion, and then unlock all the items in the transaction.
+ *
+ * For more specific information about the order of operations in
+ * xfs_log_commit_cil() please refer to the comments in
+ * xfs_trans_commit_iclog().
+ *
+ * The CIL context lock is taken in read mode here to lock out background
+ * commit while the items are inserted, and is dropped again before any
+ * background push is triggered.
+ *
+ * mp:         mount the transaction belongs to
+ * tp:         transaction being committed
+ * log_vector: chain of log vectors describing the transaction's items; it is
+ *             freed here if the log has already been shut down
+ * commit_lsn: optional; when non-NULL it receives the sequence number of the
+ *             checkpoint the transaction was committed into
+ * flags:      XFS_TRANS_RELEASE_LOG_RES requests release of a permanent log
+ *             reservation
+ *
+ * Returns 0 on success, or EIO if the log has been forcibly shut down.
+ */
+int
+xfs_log_commit_cil(
+        struct xfs_mount        *mp,
+        struct xfs_trans        *tp,
+        struct xfs_log_vec      *log_vector,
+        xfs_lsn_t               *commit_lsn,
+        int                     flags)
+{
+        struct log              *log = mp->m_log;
+        int                     log_flags = 0;
+        int                     push = 0;
+        if (flags & XFS_TRANS_RELEASE_LOG_RES)
+                log_flags = XFS_LOG_REL_PERM_RESERV;
+        if (XLOG_FORCED_SHUTDOWN(log)) {
+                xlog_cil_free_logvec(log_vector);
+                return XFS_ERROR(EIO);
+        }
+        /* lock out background commit */
+        down_read(&log->l_cilp->xc_ctx_lock);
+        xlog_cil_format_items(log, log_vector, tp->t_ticket, commit_lsn);
+        /* check we didn't blow the reservation */
+        if (tp->t_ticket->t_curr_res < 0)
+                xlog_print_tic_res(log->l_mp, tp->t_ticket);
+        /* attach the transaction to the CIL if it has any busy extents */
+        if (!list_empty(&tp->t_busy)) {
+                spin_lock(&log->l_cilp->xc_cil_lock);
+                list_splice_init(&tp->t_busy,
+                                        &log->l_cilp->xc_ctx->busy_extents);
+                spin_unlock(&log->l_cilp->xc_cil_lock);
+        }
+        /*
+         * Record the checkpoint sequence on the transaction. Read it from the
+         * context rather than through commit_lsn: xlog_cil_format_items()
+         * treats that pointer as optional, so it must not be dereferenced
+         * unconditionally here. The context cannot be switched while we hold
+         * the context lock, so this is the same value that was stored in
+         * *commit_lsn when the caller supplied one.
+         */
+        tp->t_commit_lsn = log->l_cilp->xc_ctx->sequence;
+        xfs_log_done(mp, tp->t_ticket, NULL, log_flags);
+        xfs_trans_unreserve_and_mod_sb(tp);
+        /* check for background commit before unlock */
+        if (log->l_cilp->xc_ctx->space_used > XLOG_CIL_SPACE_LIMIT(log))
+                push = 1;
+        up_read(&log->l_cilp->xc_ctx_lock);
+        /*
+         * We need to push CIL every so often so we don't cache more than we
+         * can fit in the log. The limit really is that a checkpoint can't be
+         * more than half the log (the current checkpoint is not allowed to
+         * overwrite the previous checkpoint), but commit latency and memory
+         * usage limit this to a smaller size in most cases.
+         */
+        if (push)
+                xlog_cil_push(log, 0);
+        return 0;
+}
+/*
+ * Mark all items committed and clear busy extents. We free the log vector
+ * chains in a separate pass so that we unpin the log items as quickly as
+ * possible.
+ *
+ * This is installed as the iclog callback for a checkpoint by
+ * xlog_cil_push(), which also calls it directly when a push is aborted.
+ *
+ * args:  the checkpoint context (struct xfs_cil_ctx) that has completed;
+ *        it is freed before returning
+ * abort: non-zero if the checkpoint was aborted, in which case
+ *        XFS_LI_ABORTED is passed on to each log item
+ */
+static void
+xlog_cil_committed(
+        void    *args,
+        int     abort)
+{
+        struct xfs_cil_ctx      *ctx = args;
+        struct xfs_log_vec      *lv;
+        int                     abortflag = abort ? XFS_LI_ABORTED : 0;
+        struct xfs_busy_extent  *busyp, *n;
+        /* unpin all the log items */
+        for (lv = ctx->lv_chain; lv; lv = lv->lv_next ) {
+                xfs_trans_item_committed(lv->lv_item, ctx->start_lsn,
+                                                        abortflag);
+        }
+        /* clear every busy extent that was carried by this checkpoint */
+        list_for_each_entry_safe(busyp, n, &ctx->busy_extents, list)
+                xfs_alloc_busy_clear(ctx->cil->xc_log->l_mp, busyp);
+        /* the context is done, take it off the CIL's committing list */
+        spin_lock(&ctx->cil->xc_cil_lock);
+        list_del(&ctx->committing);
+        spin_unlock(&ctx->cil->xc_cil_lock);
+        xlog_cil_free_logvec(ctx->lv_chain);
+        kmem_free(ctx);
+}
+/*
+ * Push the Committed Item List to the log. If the push_now flag is not set,
+ * then it is a background flush and so we can choose to ignore it.
+ *
+ * log:      log whose CIL is to be pushed
+ * push_now: non-zero to force the push and block on the context lock; zero
+ *           for a background push, which is skipped if the context lock is
+ *           contended or the CIL is still below XLOG_CIL_SPACE_LIMIT()
+ *
+ * Returns 0 if there is no CIL or the push was skipped, the result of
+ * xfs_log_release_iclog() once the checkpoint has been written, or EIO if
+ * writing the checkpoint failed and it was aborted.
+ */
+int
+xlog_cil_push(
+        struct log              *log,
+        int                     push_now)
+{
+        struct xfs_cil          *cil = log->l_cilp;
+        struct xfs_log_vec      *lv;
+        struct xfs_cil_ctx      *ctx;
+        struct xfs_cil_ctx      *new_ctx;
+        struct xlog_in_core     *commit_iclog;
+        struct xlog_ticket      *tic;
+        int                     num_lv;
+        int                     num_iovecs;
+        int                     len;
+        int                     error = 0;
+        struct xfs_trans_header thdr;
+        struct xfs_log_iovec    lhdr;
+        struct xfs_log_vec      lvhdr = { NULL };
+        xfs_lsn_t               commit_lsn;
+        if (!cil)
+                return 0;
+        /*
+         * Allocate the replacement context and its ticket up front, before
+         * the context lock is taken; both are released again at
+         * out_free_ticket if the push turns out to be unnecessary.
+         */
+        new_ctx = kmem_zalloc(sizeof(*new_ctx), KM_SLEEP|KM_NOFS);
+        new_ctx->ticket = xlog_cil_ticket_alloc(log);
+        /* lock out transaction commit, but don't block on background push */
+        if (!down_write_trylock(&cil->xc_ctx_lock)) {
+                if (!push_now)
+                        goto out_free_ticket;
+                down_write(&cil->xc_ctx_lock);
+        }
+        ctx = cil->xc_ctx;
+        /* check if we've anything to push */
+        if (list_empty(&cil->xc_cil))
+                goto out_skip;
+        /* check for spurious background flush */
+        if (!push_now && cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log))
+                goto out_skip;
+        /*
+         * pull all the log vectors off the items in the CIL, and
+         * remove the items from the CIL. We don't need the CIL lock
+         * here because it's only needed on the transaction commit
+         * side which is currently locked out by the flush lock.
+         */
+        lv = NULL;
+        num_lv = 0;
+        num_iovecs = 0;
+        len = 0;
+        while (!list_empty(&cil->xc_cil)) {
+                struct xfs_log_item     *item;
+                int                     i;
+                item = list_first_entry(&cil->xc_cil,
+                                        struct xfs_log_item, li_cil);
+                list_del_init(&item->li_cil);
+                /* chain the item's log vector onto the tail of lv_chain */
+                if (!ctx->lv_chain)
+                        ctx->lv_chain = item->li_lv;
+                else
+                        lv->lv_next = item->li_lv;
+                lv = item->li_lv;
+                item->li_lv = NULL;
+                num_lv++;
+                num_iovecs += lv->lv_niovecs;
+                for (i = 0; i < lv->lv_niovecs; i++)
+                        len += lv->lv_iovecp[i].i_len;
+        }
+        /*
+         * initialise the new context and attach it to the CIL. Then attach
+         * the current context to the CIL committing list so it can be found
+         * during log forces to extract the commit lsn of the sequence that
+         * needs to be forced.
+         */
+        INIT_LIST_HEAD(&new_ctx->committing);
+        INIT_LIST_HEAD(&new_ctx->busy_extents);
+        new_ctx->sequence = ctx->sequence + 1;
+        new_ctx->cil = cil;
+        cil->xc_ctx = new_ctx;
+        /*
+         * The switch is now done, so we can drop the context lock and move out
+         * of a shared context. We can't just go straight to the commit record,
+         * though - we need to synchronise with previous and future commits so
+         * that the commit records are correctly ordered in the log to ensure
+         * that we process items during log IO completion in the correct order.
+         *
+         * For example, if we get an EFI in one checkpoint and the EFD in the
+         * next (e.g. due to log forces), we do not want the checkpoint with
+         * the EFD to be committed before the checkpoint with the EFI.  Hence
+         * we must strictly order the commit records of the checkpoints so
+         * that: a) the checkpoint callbacks are attached to the iclogs in the
+         * correct order; and b) the checkpoints are replayed in correct order
+         * in log recovery.
+         *
+         * Hence we need to add this context to the committing context list so
+         * that higher sequences will wait for us to write out a commit record
+         * before they do.
+         */
+        spin_lock(&cil->xc_cil_lock);
+        list_add(&ctx->committing, &cil->xc_committing);
+        spin_unlock(&cil->xc_cil_lock);
+        up_write(&cil->xc_ctx_lock);
+        /*
+         * Build a checkpoint transaction header and write it to the log to
+         * begin the transaction. We need to account for the space used by the
+         * transaction header here as it is not accounted for in xlog_write().
+         *
+         * The LSN we need to pass to the log items on transaction commit is
+         * the LSN reported by the first log vector write. If we use the commit
+         * record lsn then we can move the tail beyond the grant write head.
+         */
+        tic = ctx->ticket;
+        thdr.th_magic = XFS_TRANS_HEADER_MAGIC;
+        thdr.th_type = XFS_TRANS_CHECKPOINT;
+        thdr.th_tid = tic->t_tid;
+        thdr.th_num_items = num_iovecs;
+        lhdr.i_addr = (xfs_caddr_t)&thdr;
+        lhdr.i_len = sizeof(xfs_trans_header_t);
+        lhdr.i_type = XLOG_REG_TYPE_TRANSHDR;
+        tic->t_curr_res -= lhdr.i_len + sizeof(xlog_op_header_t);
+        /* the on-stack header vector goes first, followed by the item chain */
+        lvhdr.lv_niovecs = 1;
+        lvhdr.lv_iovecp = &lhdr;
+        lvhdr.lv_next = ctx->lv_chain;
+        error = xlog_write(log, &lvhdr, tic, &ctx->start_lsn, NULL, 0);
+        if (error)
+                goto out_abort;
+        /*
+         * now that we've written the checkpoint into the log, strictly
+         * order the commit records so replay will get them in the right order.
+         *
+         * Note that new_ctx is reused below purely as the list cursor; the
+         * context it originally pointed to is already installed as
+         * cil->xc_ctx.
+         */
+restart:
+        spin_lock(&cil->xc_cil_lock);
+        list_for_each_entry(new_ctx, &cil->xc_committing, committing) {
+                /*
+                 * Higher sequences will wait for this one so skip them.
+                 * Don't wait for our own sequence, either.
+                 */
+                if (new_ctx->sequence >= ctx->sequence)
+                        continue;
+                if (!new_ctx->commit_lsn) {
+                        /*
+                         * It is still being pushed! Wait for the push to
+                         * complete, then start again from the beginning.
+                         */
+                        sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0);
+                        goto restart;
+                }
+        }
+        spin_unlock(&cil->xc_cil_lock);
+        commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, 0);
+        /*
+         * NOTE(review): error is necessarily zero here (it was checked right
+         * after xlog_write()), so only the commit_lsn test can trigger.
+         */
+        if (error || commit_lsn == -1)
+                goto out_abort;
+        /* attach all the transactions w/ busy extents to iclog */
+        ctx->log_cb.cb_func = xlog_cil_committed;
+        ctx->log_cb.cb_arg = ctx;
+        error = xfs_log_notify(log->l_mp, commit_iclog, &ctx->log_cb);
+        if (error)
+                goto out_abort;
+        /*
+         * now the checkpoint commit is complete and we've attached the
+         * callbacks to the iclog we can assign the commit LSN to the context
+         * and wake up anyone who is waiting for the commit to complete.
+         */
+        spin_lock(&cil->xc_cil_lock);
+        ctx->commit_lsn = commit_lsn;
+        sv_broadcast(&cil->xc_commit_wait);
+        spin_unlock(&cil->xc_cil_lock);
+        /* release the hounds! */
+        return xfs_log_release_iclog(log->l_mp, commit_iclog);
+out_skip:
+        up_write(&cil->xc_ctx_lock);
+out_free_ticket:
+        /* nothing was pushed: discard the unused replacement context */
+        xfs_log_ticket_put(new_ctx->ticket);
+        kmem_free(new_ctx);
+        return 0;
+out_abort:
+        /* run the completion processing directly, flagging the abort */
+        xlog_cil_committed(ctx, XFS_LI_ABORTED);
+        return XFS_ERROR(EIO);
+}
+/*
+ * Conditionally push the CIL based on the sequence passed in.
+ *
+ * We only need to push if we haven't already pushed the sequence
+ * number given. Hence the only time we will trigger a push here is
+ * if the push sequence is the same as the current context.
+ *
+ * We return the current commit lsn to allow the callers to determine if a
+ * iclog flush is necessary following this call.
+ *
+ * XXX: Initially, just push the CIL unconditionally and return whatever
+ * commit lsn is there. It'll be empty, so this is broken for now.
+ *
+ * NOTE(review): the XXX above looks stale - the code below only pushes when
+ * push_seq matches the current context sequence, and it does look up the
+ * commit lsn on the committing list. Confirm and remove.
+ *
+ * log:      log whose CIL is to be pushed; log->l_cilp must not be NULL
+ * push_seq: checkpoint sequence number that has to be committed
+ *
+ * Returns the commit record LSN of the context whose sequence is push_seq,
+ * or NULLCOMMITLSN if no such context is on the committing list.
+ */
+xfs_lsn_t
+xlog_cil_push_lsn(
+        struct log      *log,
+        xfs_lsn_t       push_seq)
+{
+        struct xfs_cil          *cil = log->l_cilp;
+        struct xfs_cil_ctx      *ctx;
+        xfs_lsn_t               commit_lsn = NULLCOMMITLSN;
+restart:
+        down_write(&cil->xc_ctx_lock);
+        ASSERT(push_seq <= cil->xc_ctx->sequence);
+        /* check to see if we need to force out the current context */
+        if (push_seq == cil->xc_ctx->sequence) {
+                up_write(&cil->xc_ctx_lock);
+                xlog_cil_push(log, 1);
+                /* re-evaluate from the top against the new current context */
+                goto restart;
+        }
+        /*
+         * See if we can find a previous sequence still committing.
+         * We can drop the flush lock as soon as we have the cil lock
+         * because we are now only comparing contexts protected by
+         * the cil lock.
+         *
+         * We need to wait for all previous sequence commits to complete
+         * before allowing the force of push_seq to go ahead. Hence block
+         * on commits for those as well.
+         */
+        spin_lock(&cil->xc_cil_lock);
+        up_write(&cil->xc_ctx_lock);
+        list_for_each_entry(ctx, &cil->xc_committing, committing) {
+                /* later checkpoints are of no interest to this force */
+                if (ctx->sequence > push_seq)
+                        continue;
+                if (!ctx->commit_lsn) {
+                        /*
+                         * It is still being pushed! Wait for the push to
+                         * complete, then start again from the beginning.
+                         */
+                        sv_wait(&cil->xc_commit_wait, 0, &cil->xc_cil_lock, 0);
+                        goto restart;
+                }
+                if (ctx->sequence != push_seq)
+                        continue;
+                /* found it! */
+                commit_lsn = ctx->commit_lsn;
+        }
+        spin_unlock(&cil->xc_cil_lock);
+        return commit_lsn;
+}
+/*
+ * Report whether a log item was first committed in the checkpoint sequence
+ * that is currently being built.
+ *
+ * Being on the CIL is not sufficient evidence on its own; the commit
+ * sequence number recorded on the item has to be compared as well.
+ *
+ * Note: this is only free of races when CIL flushing is locked out, so it
+ * should only be used during transaction commit, when deciding what to
+ * format into the item.
+ */
+bool
+xfs_log_item_in_current_chkpt(
+        struct xfs_log_item *lip)
+{
+        struct xfs_cil_ctx *cur_ctx;
+        /* no delayed logging means no CIL and hence no checkpoint */
+        if ((lip->li_mountp->m_flags & XFS_MOUNT_DELAYLOG) == 0)
+                return false;
+        /* an item that is not on the CIL cannot be in this checkpoint */
+        if (list_empty(&lip->li_cil))
+                return false;
+        cur_ctx = lip->li_mountp->m_log->l_cilp->xc_ctx;
+        /*
+         * li_seq is stamped on the first commit of a log item and records
+         * the first checkpoint it was written to, so any difference from the
+         * current sequence means the item belongs to an earlier checkpoint.
+         */
+        return XFS_LSN_CMP(lip->li_seq, cur_ctx->sequence) == 0;
+}
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 9cf695154451..8c072618965c 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -152,8 +152,6 @@ static inline uint xlog_get_client_id(__be32 i)
 #define XLOG_RECOVERY_NEEDED    0x4     /* log was recovered */
 #define XLOG_IO_ERROR           0x8     /* log hit an I/O error, and being
                                           shutdown */
-typedef __uint32_t xlog_tid_t;
 #ifdef __KERNEL__
 /*
@@ -379,6 +377,99 @@ typedef struct xlog_in_core {
 } xlog_in_core_t;
 /*
+ * The CIL context is used to aggregate per-transaction details and is also
+ * passed to the iclog for checkpoint post-commit processing.  After being
+ * passed to the iclog, another context needs to be allocated for tracking the
+ * next set of transactions to be aggregated into a checkpoint.
+ */
+struct xfs_cil;        /* forward declaration; the full definition follows */
+struct xfs_cil_ctx {
+        struct xfs_cil          *cil;           /* CIL for this ctx -- presumably its owner; TODO confirm */
+        xfs_lsn_t               sequence;       /* chkpt sequence # */
+        xfs_lsn_t               start_lsn;      /* first LSN of chkpt commit */
+        xfs_lsn_t               commit_lsn;     /* chkpt commit record lsn */
+        struct xlog_ticket      *ticket;        /* chkpt ticket */
+        int                     nvecs;          /* number of regions */
+        int                     space_used;     /* aggregate size of regions */
+        struct list_head        busy_extents;   /* busy extents in chkpt */
+        struct xfs_log_vec      *lv_chain;      /* logvecs being pushed */
+        xfs_log_callback_t      log_cb;         /* completion callback hook. */
+        struct list_head        committing;     /* ctx committing list */
+};
+/*
+ * Committed Item List structure
+ *
+ * This structure is used to track log items that have been committed but not
+ * yet written into the log. It is used only when the delayed logging mount
+ * option is enabled.
+ *
+ * This structure tracks the list of committing checkpoint contexts so
+ * we can avoid the problem of having to hold out new transactions during a
+ * flush until we have the commit record LSN of the checkpoint. We can
+ * traverse the list of committing contexts in xlog_cil_push_lsn() to find a
+ * sequence match and extract the commit LSN directly from there. If the
+ * checkpoint is still in the process of committing, we can block waiting for
+ * the commit LSN to be determined as well. This should make synchronous
+ * operations almost as efficient as the old logging methods.
+ */
+struct xfs_cil {
+        struct log              *xc_log;        /* log for this CIL; presumably pairs with log->l_cilp */
+        struct list_head        xc_cil;         /* list head; presumably links log items via li_cil */
+        spinlock_t              xc_cil_lock;    /* spinlock; handed to sv_wait() on xc_commit_wait */
+        struct xfs_cil_ctx      *xc_ctx;        /* current checkpoint context */
+        struct rw_semaphore     xc_ctx_lock;    /* NOTE(review): presumably guards xc_ctx -- confirm */
+        struct list_head        xc_committing;  /* list of committing contexts */
+        sv_t                    xc_commit_wait; /* slept on (sv_wait) until a ctx commit completes */
+};
+/*
+ * The amount of log space we should allow the CIL to aggregate is difficult to
+ * size. Whatever we choose, we have to make sure that we can get a reservation
+ * for the log space effectively, that it is large enough to capture sufficient
+ * relogging to reduce log buffer IO significantly, and that it is not too
+ * large for the log and does not induce too much latency when writing out
+ * through the iclogs. We track both space consumed and the number of vectors
+ * in the checkpoint context, so we need to decide which to use for limiting.
+ *
+ * Every log buffer we write out during a push needs a header reserved, which
+ * is at least one sector and more for v2 logs. Hence we need a reservation of
+ * at least 512 bytes per 32k of log space just for the LR headers. That means
+ * 16KB of reservation per megabyte of delayed logging space we will consume,
+ * plus various headers.  The number of headers will vary based on the num of
+ * io vectors, so limiting on a specific number of vectors is going to result
+ * in transactions of varying size. IOWs, it is more consistent to track and
+ * limit space consumed in the log rather than by the number of objects being
+ * logged in order to prevent checkpoint ticket overruns.
+ *
+ * Further, use of static reservations through the log grant mechanism is
+ * problematic. It introduces a lot of complexity (e.g. reserve grant vs write
+ * grant) and a significant deadlock potential because regranting write space
+ * can block on log pushes. Hence if we have to regrant log space during a log
+ * push, we can deadlock.
+ *
+ * However, we can avoid this by use of a dynamic "reservation stealing"
+ * technique during transaction commit whereby unused reservation space in the
+ * transaction ticket is transferred to the CIL ctx commit ticket to cover the
+ * space needed by the checkpoint transaction. This means that we never need to
+ * specifically reserve space for the CIL checkpoint transaction, nor do we
+ * need to regrant space once the checkpoint completes. This also means the
+ * checkpoint transaction ticket is specific to the checkpoint context, rather
+ * than the CIL itself.
+ *
+ * With dynamic reservations, we can basically make up arbitrary limits for the
+ * checkpoint size so long as they don't violate any other size rules.  Hence
+ * the initial maximum size for the checkpoint transaction will be set to a
+ * quarter of the log or 8MB, whichever is smaller. 8MB is an arbitrary limit
+ * right now based on the latency of writing out a large amount of data through
+ * the circular iclog buffers.
+ */
+/* min(l_logsize >> 2, 8MB); the argument is parenthesized for safe expansion */
+#define XLOG_CIL_SPACE_LIMIT(log)       \
+        (min(((log)->l_logsize >> 2), (8 * 1024 * 1024)))
+/*
 * The reservation head lsn is not made up of a cycle number and block number.
 * Instead, it uses a cycle number and byte number.  Logs don't expect to
 * overflow 31 bits worth of byte offset, so using a byte number will mean
@@ -388,6 +479,7 @@ typedef struct log {
        /* The following fields don't need locking */
        struct xfs_mount        *l_mp;          /* mount point */
        struct xfs_ail          *l_ailp;        /* AIL log is working with */
+        struct xfs_cil          *l_cilp;        /* CIL log is working with */
        struct xfs_buf          *l_xbuf;        /* extra buffer for log
                                                 * wrapping */
        struct xfs_buftarg      *l_targ;        /* buftarg of log */
@@ -438,14 +530,17 @@ typedef struct log {
 #define XLOG_FORCED_SHUTDOWN(log)       ((log)->l_flags & XLOG_IO_ERROR)
 /* common routines */
 extern xfs_lsn_t xlog_assign_tail_lsn(struct xfs_mount *mp);
 extern int       xlog_recover(xlog_t *log);
 extern int       xlog_recover_finish(xlog_t *log);
 extern void      xlog_pack_data(xlog_t *log, xlog_in_core_t *iclog, int);
-extern kmem_zone_t      *xfs_log_ticket_zone;
+extern kmem_zone_t *xfs_log_ticket_zone;
+struct xlog_ticket *xlog_ticket_alloc(struct log *log, int unit_bytes,
+                                int count, char client, uint xflags,
+                                int alloc_flags);
 static inline void
 xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes)
@@ -455,6 +550,21 @@ xlog_write_adv_cnt(void **ptr, int *len, int *off, size_t bytes)
        *off += bytes;
 }
+void    xlog_print_tic_res(struct xfs_mount *mp, struct xlog_ticket *ticket);
+int     xlog_write(struct log *log, struct xfs_log_vec *log_vector,
+                                struct xlog_ticket *tic, xfs_lsn_t *start_lsn,
+                                xlog_in_core_t **commit_iclog, uint flags);
+/*
+ * Committed Item List interfaces
+ */
+int     xlog_cil_init(struct log *log);
+void    xlog_cil_init_post_recovery(struct log *log);
+void    xlog_cil_destroy(struct log *log);
+int     xlog_cil_push(struct log *log, int push_now);
+xfs_lsn_t xlog_cil_push_lsn(struct log *log, xfs_lsn_t push_sequence);
 /*
 * Unmount record type is used as a pseudo transaction type for the ticket.
 * It's value must be outside the range of XFS_TRANS_* values.
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 0de08e366315..14a69aec2c0b 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -1576,7 +1576,7 @@ xlog_recover_reorder_trans(
                switch (ITEM_TYPE(item)) {
                case XFS_LI_BUF:
-                        if (!(buf_f->blf_flags & XFS_BLI_CANCEL)) {
+                        if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) {
                                trace_xfs_log_recover_item_reorder_head(log,
                                                        trans, item, pass);
                                list_move(&item->ri_list, &trans->r_itemq);
@@ -1638,7 +1638,7 @@ xlog_recover_do_buffer_pass1(
        /*
         * If this isn't a cancel buffer item, then just return.
         */
-        if (!(flags & XFS_BLI_CANCEL)) {
+        if (!(flags & XFS_BLF_CANCEL)) {
                trace_xfs_log_recover_buf_not_cancel(log, buf_f);
                return;
        }
@@ -1696,7 +1696,7 @@ xlog_recover_do_buffer_pass1(
 * Check to see whether the buffer being recovered has a corresponding
 * entry in the buffer cancel record table.  If it does then return 1
 * so that it will be cancelled, otherwise return 0.  If the buffer is
- * actually a buffer cancel item (XFS_BLI_CANCEL is set), then decrement
+ * actually a buffer cancel item (XFS_BLF_CANCEL is set), then decrement
 * the refcount on the entry in the table and remove it from the table
 * if this is the last reference.
 *
@@ -1721,7 +1721,7 @@ xlog_check_buffer_cancelled(
                 * There is nothing in the table built in pass one,
                 * so this buffer must not be cancelled.
                 */
-                ASSERT(!(flags & XFS_BLI_CANCEL));
+                ASSERT(!(flags & XFS_BLF_CANCEL));
                return 0;
        }
@@ -1733,7 +1733,7 @@ xlog_check_buffer_cancelled(
                 * There is no corresponding entry in the table built
                 * in pass one, so this buffer has not been cancelled.
                 */
-                ASSERT(!(flags & XFS_BLI_CANCEL));
+                ASSERT(!(flags & XFS_BLF_CANCEL));
                return 0;
        }
@@ -1752,7 +1752,7 @@ xlog_check_buffer_cancelled(
                         * one in the table and remove it if this is the
                         * last reference.
                         */
-                        if (flags & XFS_BLI_CANCEL) {
+                        if (flags & XFS_BLF_CANCEL) {
                                bcp->bc_refcount--;
                                if (bcp->bc_refcount == 0) {
                                        if (prevp == NULL) {
@@ -1772,7 +1772,7 @@ xlog_check_buffer_cancelled(
         * We didn't find a corresponding entry in the table, so
         * return 0 so that the buffer is NOT cancelled.
         */
-        ASSERT(!(flags & XFS_BLI_CANCEL));
+        ASSERT(!(flags & XFS_BLF_CANCEL));
        return 0;
 }
@@ -1874,8 +1874,8 @@ xlog_recover_do_inode_buffer(
                        nbits = xfs_contig_bits(data_map, map_size,
                                                         bit);
                        ASSERT(nbits > 0);
-                        reg_buf_offset = bit << XFS_BLI_SHIFT;
+                        reg_buf_offset = bit << XFS_BLF_SHIFT;
-                        reg_buf_bytes = nbits << XFS_BLI_SHIFT;
+                        reg_buf_bytes = nbits << XFS_BLF_SHIFT;
                        item_index++;
                }
@@ -1889,7 +1889,7 @@ xlog_recover_do_inode_buffer(
                }
                ASSERT(item->ri_buf[item_index].i_addr != NULL);
-                ASSERT((item->ri_buf[item_index].i_len % XFS_BLI_CHUNK) == 0);
+                ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0);
                ASSERT((reg_buf_offset + reg_buf_bytes) <= XFS_BUF_COUNT(bp));
                /*
@@ -1955,9 +1955,9 @@ xlog_recover_do_reg_buffer(
                nbits = xfs_contig_bits(data_map, map_size, bit);
                ASSERT(nbits > 0);
                ASSERT(item->ri_buf[i].i_addr != NULL);
-                ASSERT(item->ri_buf[i].i_len % XFS_BLI_CHUNK == 0);
+                ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0);
                ASSERT(XFS_BUF_COUNT(bp) >=
-                       ((uint)bit << XFS_BLI_SHIFT)+(nbits<<XFS_BLI_SHIFT));
+                       ((uint)bit << XFS_BLF_SHIFT)+(nbits<<XFS_BLF_SHIFT));
                /*
                 * Do a sanity check if this is a dquot buffer. Just checking
@@ -1966,7 +1966,7 @@ xlog_recover_do_reg_buffer(
                 */
                error = 0;
                if (buf_f->blf_flags &
-                   (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) {
+                   (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
                        if (item->ri_buf[i].i_addr == NULL) {
                                cmn_err(CE_ALERT,
                                        "XFS: NULL dquot in %s.", __func__);
@@ -1987,9 +1987,9 @@ xlog_recover_do_reg_buffer(
                }
                memcpy(xfs_buf_offset(bp,
-                        (uint)bit << XFS_BLI_SHIFT),    /* dest */
+                        (uint)bit << XFS_BLF_SHIFT),    /* dest */
                        item->ri_buf[i].i_addr,         /* source */
-                        nbits<<XFS_BLI_SHIFT);          /* length */
+                        nbits<<XFS_BLF_SHIFT);          /* length */
 next:
                i++;
                bit += nbits;
@@ -2148,11 +2148,11 @@ xlog_recover_do_dquot_buffer(
        }
        type = 0;
-        if (buf_f->blf_flags & XFS_BLI_UDQUOT_BUF)
+        if (buf_f->blf_flags & XFS_BLF_UDQUOT_BUF)
                type |= XFS_DQ_USER;
-        if (buf_f->blf_flags & XFS_BLI_PDQUOT_BUF)
+        if (buf_f->blf_flags & XFS_BLF_PDQUOT_BUF)
                type |= XFS_DQ_PROJ;
-        if (buf_f->blf_flags & XFS_BLI_GDQUOT_BUF)
+        if (buf_f->blf_flags & XFS_BLF_GDQUOT_BUF)
                type |= XFS_DQ_GROUP;
        /*
         * This type of quotas was turned off, so ignore this buffer
@@ -2173,7 +2173,7 @@ xlog_recover_do_dquot_buffer(
 * here which overlaps that may be stale.
 *
 * When meta-data buffers are freed at run time we log a buffer item
- * with the XFS_BLI_CANCEL bit set to indicate that previous copies
+ * with the XFS_BLF_CANCEL bit set to indicate that previous copies
 * of the buffer in the log should not be replayed at recovery time.
 * This is so that if the blocks covered by the buffer are reused for
 * file data before we crash we don't end up replaying old, freed
@@ -2207,7 +2207,7 @@ xlog_recover_do_buffer_trans(
        if (pass == XLOG_RECOVER_PASS1) {
                /*
                 * In this pass we're only looking for buf items
-                 * with the XFS_BLI_CANCEL bit set.
+                 * with the XFS_BLF_CANCEL bit set.
                 */
                xlog_recover_do_buffer_pass1(log, buf_f);
                return 0;
@@ -2244,7 +2244,7 @@ xlog_recover_do_buffer_trans(
        mp = log->l_mp;
        buf_flags = XBF_LOCK;
-        if (!(flags & XFS_BLI_INODE_BUF))
+        if (!(flags & XFS_BLF_INODE_BUF))
                buf_flags |= XBF_MAPPED;
        bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, buf_flags);
@@ -2257,10 +2257,10 @@ xlog_recover_do_buffer_trans(
        }
        error = 0;
-        if (flags & XFS_BLI_INODE_BUF) {
+        if (flags & XFS_BLF_INODE_BUF) {
                error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
        } else if (flags &
-                  (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) {
+                  (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
                xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
        } else {
                xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
diff --git a/fs/xfs/xfs_log_recover.h b/fs/xfs/xfs_log_recover.h
index 75d749207258..1c55ccbb379d 100644
--- a/fs/xfs/xfs_log_recover.h
+++ b/fs/xfs/xfs_log_recover.h
@@ -28,7 +28,7 @@
 #define XLOG_RHASH(tid) \
        ((((__uint32_t)tid)>>XLOG_RHASH_SHIFT) & (XLOG_RHASH_SIZE-1))
-#define XLOG_MAX_REGIONS_IN_ITEM   (XFS_MAX_BLOCKSIZE / XFS_BLI_CHUNK / 2 + 1)
+#define XLOG_MAX_REGIONS_IN_ITEM   (XFS_MAX_BLOCKSIZE / XFS_BLF_CHUNK / 2 + 1)
 /*
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 9ff48a16a7ee..1d2c7eed4eda 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -268,6 +268,7 @@ typedef struct xfs_mount {
 #define XFS_MOUNT_WSYNC         (1ULL << 0)     /* for nfs - all metadata ops
                                                   must be synchronous except
                                                   for space allocations */
+#define XFS_MOUNT_DELAYLOG      (1ULL << 1)     /* delayed logging is enabled */
 #define XFS_MOUNT_DMAPI         (1ULL << 2)     /* dmapi is enabled */
 #define XFS_MOUNT_WAS_CLEAN     (1ULL << 3)
 #define XFS_MOUNT_FS_SHUTDOWN   (1ULL << 4)     /* atomic stop of all filesystem
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index be578ecb4af2..ce558efa2ea0 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -44,6 +44,7 @@
 #include "xfs_trans_priv.h"
 #include "xfs_trans_space.h"
 #include "xfs_inode_item.h"
+#include "xfs_trace.h"
 kmem_zone_t     *xfs_trans_zone;
@@ -243,9 +244,8 @@ _xfs_trans_alloc(
        tp->t_type = type;
        tp->t_mountp = mp;
        tp->t_items_free = XFS_LIC_NUM_SLOTS;
-        tp->t_busy_free = XFS_LBC_NUM_SLOTS;
        xfs_lic_init(&(tp->t_items));
-        XFS_LBC_INIT(&(tp->t_busy));
+        INIT_LIST_HEAD(&tp->t_busy);
        return tp;
 }
@@ -255,8 +255,13 @@ _xfs_trans_alloc(
 */
 STATIC void
 xfs_trans_free(
-        xfs_trans_t     *tp)
+        struct xfs_trans        *tp)
 {
+        struct xfs_busy_extent  *busyp, *n;
+        list_for_each_entry_safe(busyp, n, &tp->t_busy, list)
+                xfs_alloc_busy_clear(tp->t_mountp, busyp);
        atomic_dec(&tp->t_mountp->m_active_trans);
        xfs_trans_free_dqinfo(tp);
        kmem_zone_free(xfs_trans_zone, tp);
@@ -285,9 +290,8 @@ xfs_trans_dup(
        ntp->t_type = tp->t_type;
        ntp->t_mountp = tp->t_mountp;
        ntp->t_items_free = XFS_LIC_NUM_SLOTS;
-        ntp->t_busy_free = XFS_LBC_NUM_SLOTS;
        xfs_lic_init(&(ntp->t_items));
-        XFS_LBC_INIT(&(ntp->t_busy));
+        INIT_LIST_HEAD(&ntp->t_busy);
        ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
        ASSERT(tp->t_ticket != NULL);
@@ -423,7 +427,6 @@ undo_blocks:
        return error;
 }
 /*
 * Record the indicated change to the given field for application
 * to the file system's superblock when the transaction commits.
@@ -652,7 +655,7 @@ xfs_trans_apply_sb_deltas(
 * XFS_TRANS_SB_DIRTY will not be set when the transaction is updated but we
 * still need to update the incore superblock with the changes.
 */
-STATIC void
+void
 xfs_trans_unreserve_and_mod_sb(
        xfs_trans_t     *tp)
 {
@@ -880,7 +883,7 @@ xfs_trans_fill_vecs(
 * they could be immediately flushed and we'd have to race with the flusher
 * trying to pull the item from the AIL as we add it.
 */
-static void
+void
 xfs_trans_item_committed(
        struct xfs_log_item     *lip,
        xfs_lsn_t               commit_lsn,
@@ -930,26 +933,6 @@ xfs_trans_item_committed(
        IOP_UNPIN(lip);
 }
-/* Clear all the per-AG busy list items listed in this transaction */
-static void
-xfs_trans_clear_busy_extents(
-        struct xfs_trans        *tp)
-{
-        xfs_log_busy_chunk_t    *lbcp;
-        xfs_log_busy_slot_t     *lbsp;
-        int                     i;
-        for (lbcp = &tp->t_busy; lbcp != NULL; lbcp = lbcp->lbc_next) {
-                i = 0;
-                for (lbsp = lbcp->lbc_busy; i < lbcp->lbc_unused; i++, lbsp++) {
-                        if (XFS_LBC_ISFREE(lbcp, i))
-                                continue;
-                        xfs_alloc_clear_busy(tp, lbsp->lbc_ag, lbsp->lbc_idx);
-                }
-        }
-        xfs_trans_free_busy(tp);
-}
 /*
 * This is typically called by the LM when a transaction has been fully
 * committed to disk.  It needs to unpin the items which have
@@ -984,7 +967,6 @@ xfs_trans_committed(
                kmem_free(licp);
        }
-        xfs_trans_clear_busy_extents(tp);
        xfs_trans_free(tp);
 }
@@ -1012,8 +994,7 @@ xfs_trans_uncommit(
        xfs_trans_unreserve_and_mod_sb(tp);
        xfs_trans_unreserve_and_mod_dquots(tp);
-        xfs_trans_free_items(tp, flags);
+        xfs_trans_free_items(tp, NULLCOMMITLSN, flags);
-        xfs_trans_free_busy(tp);
        xfs_trans_free(tp);
 }
@@ -1075,6 +1056,8 @@ xfs_trans_commit_iclog(
        *commit_lsn = xfs_log_done(mp, tp->t_ticket, &commit_iclog, log_flags);
        tp->t_commit_lsn = *commit_lsn;
+        trace_xfs_trans_commit_lsn(tp);
        if (nvec > XFS_TRANS_LOGVEC_COUNT)
                kmem_free(log_vector);
@@ -1161,6 +1144,93 @@ xfs_trans_commit_iclog(
        return xfs_log_release_iclog(mp, commit_iclog);
 }
+/*
+ * Walk the log items and allocate log vector structures for
+ * each item large enough to fit all the vectors they require.
+ * Note that this format differs from the old log vector format in
+ * that there is no transaction header in these log vectors.
+ *
+ * tp is the transaction whose log item descriptors are walked.  As a side
+ * effect, lid_size of every dirty descriptor is set from IOP_SIZE().
+ *
+ * Returns the head of a chain of log vectors linked through lv_next, in
+ * the order the items were walked.  Returns NULL if the transaction has no
+ * item descriptors at all (this also trips ASSERT(0)) or if every item was
+ * skipped.  The caller, xfs_trans_commit_cil(), reports a NULL return as
+ * ENOMEM.
+ *
+ * Each vector is allocated with kmem_zalloc(..., KM_SLEEP) and the result
+ * is dereferenced without a NULL check in this function.
+ */
+STATIC struct xfs_log_vec *
+xfs_trans_alloc_log_vecs(
+        xfs_trans_t     *tp)
+{
+        xfs_log_item_desc_t     *lidp;
+        struct xfs_log_vec      *lv = NULL;     /* tail of the chain built so far */
+        struct xfs_log_vec      *ret_lv = NULL; /* head of the chain; the return value */
+        lidp = xfs_trans_first_item(tp);
+        /* Bail out if we didn't find a log item.  */
+        if (!lidp) {
+                ASSERT(0);
+                return NULL;
+        }
+        while (lidp != NULL) {
+                struct xfs_log_vec *new_lv;
+                /* Skip items which aren't dirty in this transaction. */
+                if (!(lidp->lid_flags & XFS_LID_DIRTY)) {
+                        lidp = xfs_trans_next_item(tp, lidp);
+                        continue;
+                }
+                /* Skip items that do not have any vectors for writing */
+                lidp->lid_size = IOP_SIZE(lidp->lid_item);
+                if (!lidp->lid_size) {
+                        lidp = xfs_trans_next_item(tp, lidp);
+                        continue;
+                }
+                /*
+                 * One allocation holds the xfs_log_vec header followed by
+                 * lid_size xfs_log_iovec entries.
+                 */
+                new_lv = kmem_zalloc(sizeof(*new_lv) +
+                                lidp->lid_size * sizeof(struct xfs_log_iovec),
+                                KM_SLEEP);
+                /* The allocated iovec region lies beyond the log vector. */
+                new_lv->lv_iovecp = (struct xfs_log_iovec *)&new_lv[1];
+                new_lv->lv_niovecs = lidp->lid_size;
+                new_lv->lv_item = lidp->lid_item;
+                if (!ret_lv)
+                        ret_lv = new_lv;        /* first vector becomes the head */
+                else
+                        lv->lv_next = new_lv;   /* append after the current tail */
+                lv = new_lv;
+                lidp = xfs_trans_next_item(tp, lidp);
+        }
+        return ret_lv;
+}
+static int
+xfs_trans_commit_cil(
+        struct xfs_mount        *mp,            /* passed to xfs_log_commit_cil() */
+        struct xfs_trans        *tp,            /* transaction being committed */
+        xfs_lsn_t               *commit_lsn,    /* filled in via xfs_log_commit_cil() */
+        int                     flags)          /* passed to xfs_log_commit_cil() */
+{
+        struct xfs_log_vec      *log_vector;
+        int                     error;
+        /*
+         * Commit path taken by _xfs_trans_commit() when the mount has
+         * XFS_MOUNT_DELAYLOG set.
+         *
+         * Get each log item to allocate a vector structure for
+         * the log item to pass to the log write code. The
+         * CIL commit code will format the vector and save it away.
+         *
+         * A NULL vector chain is returned to the caller as ENOMEM, which
+         * _xfs_trans_commit() turns into a forced shutdown and EIO.
+         */
+        log_vector = xfs_trans_alloc_log_vecs(tp);
+        if (!log_vector)
+                return ENOMEM;
+        error = xfs_log_commit_cil(mp, tp, log_vector, commit_lsn, flags);
+        /*
+         * On failure neither tp nor log_vector is released here.
+         * NOTE(review): presumably xfs_log_commit_cil() disposes of the
+         * vector chain on error -- confirm against its definition.
+         */
+        if (error)
+                return error;
+        current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
+        /*
+         * xfs_trans_free_items() unlocks them first.  Success path only:
+         * the items and then the transaction itself are freed here.
+         */
+        xfs_trans_free_items(tp, *commit_lsn, 0);
+        xfs_trans_free(tp);
+        return 0;
+}
 /*
 * xfs_trans_commit
@@ -1221,7 +1291,11 @@ _xfs_trans_commit(
                xfs_trans_apply_sb_deltas(tp);
        xfs_trans_apply_dquot_deltas(tp);
-        error = xfs_trans_commit_iclog(mp, tp, &commit_lsn, flags);
+        if (mp->m_flags & XFS_MOUNT_DELAYLOG)
+                error = xfs_trans_commit_cil(mp, tp, &commit_lsn, flags);
+        else
+                error = xfs_trans_commit_iclog(mp, tp, &commit_lsn, flags);
        if (error == ENOMEM) {
                xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
                error = XFS_ERROR(EIO);
@@ -1259,8 +1333,7 @@ out_unreserve:
                        error = XFS_ERROR(EIO);
        }
        current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
-        xfs_trans_free_items(tp, error ? XFS_TRANS_ABORT : 0);
+        xfs_trans_free_items(tp, NULLCOMMITLSN, error ? XFS_TRANS_ABORT : 0);
-        xfs_trans_free_busy(tp);
        xfs_trans_free(tp);
        XFS_STATS_INC(xs_trans_empty);
@@ -1338,8 +1411,7 @@ xfs_trans_cancel(
        /* mark this thread as no longer being in a transaction */
        current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
-        xfs_trans_free_items(tp, flags);
+        xfs_trans_free_items(tp, NULLCOMMITLSN, flags);
-        xfs_trans_free_busy(tp);
        xfs_trans_free(tp);
 }
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index c62beee0921e..8c69e7824f68 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -106,7 +106,8 @@ typedef struct xfs_trans_header {
 #define XFS_TRANS_GROWFSRT_FREE         39
 #define XFS_TRANS_SWAPEXT               40
 #define XFS_TRANS_SB_COUNT              41
-#define XFS_TRANS_TYPE_MAX              41
+#define XFS_TRANS_CHECKPOINT            42
+#define XFS_TRANS_TYPE_MAX              42
 /* new transaction types need to be reflected in xfs_logprint(8) */
 #define XFS_TRANS_TYPES \
@@ -148,6 +149,7 @@ typedef struct xfs_trans_header {
        { XFS_TRANS_GROWFSRT_FREE,      "GROWFSRT_FREE" }, \
        { XFS_TRANS_SWAPEXT,            "SWAPEXT" }, \
        { XFS_TRANS_SB_COUNT,           "SB_COUNT" }, \
+        { XFS_TRANS_CHECKPOINT,         "CHECKPOINT" }, \
        { XFS_TRANS_DUMMY1,             "DUMMY1" }, \
        { XFS_TRANS_DUMMY2,             "DUMMY2" }, \
        { XLOG_UNMOUNT_REC_TYPE,        "UNMOUNT" }
@@ -813,6 +815,7 @@ struct xfs_log_item_desc;
 struct xfs_mount;
 struct xfs_trans;
 struct xfs_dquot_acct;
+struct xfs_busy_extent;
 typedef struct xfs_log_item {
        struct list_head                li_ail;         /* AIL pointers */
@@ -828,6 +831,11 @@ typedef struct xfs_log_item {
                                                        /* buffer item iodone */
                                                        /* callback func */
        struct xfs_item_ops             *li_ops;        /* function list */
+        /* delayed logging */
+        struct list_head                li_cil;         /* CIL pointers */
+        struct xfs_log_vec              *li_lv;         /* active log vector */
+        xfs_lsn_t                       li_seq;         /* CIL commit seq */
 } xfs_log_item_t;
 #define XFS_LI_IN_AIL   0x1
@@ -872,34 +880,6 @@ typedef struct xfs_item_ops {
 #define XFS_ITEM_PUSHBUF        3
 /*
- * This structure is used to maintain a list of block ranges that have been
- * freed in the transaction.  The ranges are listed in the perag[] busy list
- * between when they're freed and the transaction is committed to disk.
- */
-typedef struct xfs_log_busy_slot {
-        xfs_agnumber_t          lbc_ag;
-        ushort                  lbc_idx;        /* index in perag.busy[] */
-} xfs_log_busy_slot_t;
-#define XFS_LBC_NUM_SLOTS       31
-typedef struct xfs_log_busy_chunk {
-        struct xfs_log_busy_chunk       *lbc_next;
-        uint                            lbc_free;       /* free slots bitmask */
-        ushort                          lbc_unused;     /* first unused */
-        xfs_log_busy_slot_t             lbc_busy[XFS_LBC_NUM_SLOTS];
-} xfs_log_busy_chunk_t;
-#define XFS_LBC_MAX_SLOT        (XFS_LBC_NUM_SLOTS - 1)
-#define XFS_LBC_FREEMASK        ((1U << XFS_LBC_NUM_SLOTS) - 1)
-#define XFS_LBC_INIT(cp)        ((cp)->lbc_free = XFS_LBC_FREEMASK)
-#define XFS_LBC_CLAIM(cp, slot) ((cp)->lbc_free &= ~(1 << (slot)))
-#define XFS_LBC_SLOT(cp, slot)  (&((cp)->lbc_busy[(slot)]))
-#define XFS_LBC_VACANCY(cp)     (((cp)->lbc_free) & XFS_LBC_FREEMASK)
-#define XFS_LBC_ISFREE(cp, slot) ((cp)->lbc_free & (1 << (slot)))
-/*
 * This is the type of function which can be given to xfs_trans_callback()
 * to be called upon the transaction's commit to disk.
 */
@@ -950,8 +930,7 @@ typedef struct xfs_trans {
        unsigned int            t_items_free;   /* log item descs free */
        xfs_log_item_chunk_t    t_items;        /* first log item desc chunk */
        xfs_trans_header_t      t_header;       /* header for in-log trans */
-        unsigned int            t_busy_free;    /* busy descs free */
+        struct list_head        t_busy;         /* list of busy extents */
-        xfs_log_busy_chunk_t    t_busy;         /* busy/async free blocks */
        unsigned long           t_pflags;       /* saved process flags state */
 } xfs_trans_t;
@@ -1025,9 +1004,6 @@ int		_xfs_trans_commit(xfs_trans_t *,
 void            xfs_trans_cancel(xfs_trans_t *, int);
 int             xfs_trans_ail_init(struct xfs_mount *);
 void            xfs_trans_ail_destroy(struct xfs_mount *);
-xfs_log_busy_slot_t *xfs_trans_add_busy(xfs_trans_t *tp,
-                                        xfs_agnumber_t ag,
-                                        xfs_extlen_t idx);
 extern kmem_zone_t      *xfs_trans_zone;
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 9cd809025f3a..63d81a22f4fd 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -114,7 +114,7 @@ _xfs_trans_bjoin(
        xfs_buf_item_init(bp, tp->t_mountp);
        bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
        ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
-        ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
+        ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL));
        ASSERT(!(bip->bli_flags & XFS_BLI_LOGGED));
        if (reset_recur)
                bip->bli_recur = 0;
@@ -511,7 +511,7 @@ xfs_trans_brelse(xfs_trans_t	*tp,
        bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
        ASSERT(bip->bli_item.li_type == XFS_LI_BUF);
        ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
-        ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
+        ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL));
        ASSERT(atomic_read(&bip->bli_refcount) > 0);
        /*
@@ -619,7 +619,7 @@ xfs_trans_bhold(xfs_trans_t	*tp,
        bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
        ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
-        ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
+        ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL));
        ASSERT(atomic_read(&bip->bli_refcount) > 0);
        bip->bli_flags |= XFS_BLI_HOLD;
        trace_xfs_trans_bhold(bip);
@@ -641,7 +641,7 @@ xfs_trans_bhold_release(xfs_trans_t	*tp,
        bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
        ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
-        ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_CANCEL));
+        ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_CANCEL));
        ASSERT(atomic_read(&bip->bli_refcount) > 0);
        ASSERT(bip->bli_flags & XFS_BLI_HOLD);
        bip->bli_flags &= ~XFS_BLI_HOLD;
@@ -704,7 +704,7 @@ xfs_trans_log_buf(xfs_trans_t	*tp,
                bip->bli_flags &= ~XFS_BLI_STALE;
                ASSERT(XFS_BUF_ISSTALE(bp));
                XFS_BUF_UNSTALE(bp);
-                bip->bli_format.blf_flags &= ~XFS_BLI_CANCEL;
+                bip->bli_format.blf_flags &= ~XFS_BLF_CANCEL;
        }
        lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)bip);
@@ -762,8 +762,8 @@ xfs_trans_binval(
                ASSERT(!(XFS_BUF_ISDELAYWRITE(bp)));
                ASSERT(XFS_BUF_ISSTALE(bp));
                ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY)));
-                ASSERT(!(bip->bli_format.blf_flags & XFS_BLI_INODE_BUF));
+                ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_INODE_BUF));
-                ASSERT(bip->bli_format.blf_flags & XFS_BLI_CANCEL);
+                ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
                ASSERT(lidp->lid_flags & XFS_LID_DIRTY);
                ASSERT(tp->t_flags & XFS_TRANS_DIRTY);
                return;
@@ -774,7 +774,7 @@ xfs_trans_binval(
         * in the buf log item.  The STALE flag will be used in
         * xfs_buf_item_unpin() to determine if it should clean up
         * when the last reference to the buf item is given up.
-         * We set the XFS_BLI_CANCEL flag in the buf log format structure
+         * We set the XFS_BLF_CANCEL flag in the buf log format structure
         * and log the buf item.  This will be used at recovery time
         * to determine that copies of the buffer in the log before
         * this should not be replayed.
@@ -792,9 +792,9 @@ xfs_trans_binval(
        XFS_BUF_UNDELAYWRITE(bp);
        XFS_BUF_STALE(bp);
        bip->bli_flags |= XFS_BLI_STALE;
-        bip->bli_flags &= ~(XFS_BLI_LOGGED | XFS_BLI_DIRTY);
+        bip->bli_flags &= ~(XFS_BLI_INODE_BUF | XFS_BLI_LOGGED | XFS_BLI_DIRTY);
-        bip->bli_format.blf_flags &= ~XFS_BLI_INODE_BUF;
+        bip->bli_format.blf_flags &= ~XFS_BLF_INODE_BUF;
-        bip->bli_format.blf_flags |= XFS_BLI_CANCEL;
+        bip->bli_format.blf_flags |= XFS_BLF_CANCEL;
        memset((char *)(bip->bli_format.blf_data_map), 0,
              (bip->bli_format.blf_map_size * sizeof(uint)));
        lidp->lid_flags |= XFS_LID_DIRTY;
@@ -802,16 +802,16 @@ xfs_trans_binval(
 }
 /*
- * This call is used to indicate that the buffer contains on-disk
+ * This call is used to indicate that the buffer contains on-disk inodes which
- * inodes which must be handled specially during recovery.  They
+ * must be handled specially during recovery.  They require special handling
- * require special handling because only the di_next_unlinked from
+ * because only the di_next_unlinked from the inodes in the buffer should be
- * the inodes in the buffer should be recovered.  The rest of the
+ * recovered.  The rest of the data in the buffer is logged via the inodes
- * data in the buffer is logged via the inodes themselves.
+ * themselves.
 *
- * All we do is set the XFS_BLI_INODE_BUF flag in the buffer's log
+ * All we do is set the XFS_BLI_INODE_BUF flag in the item's flags so it can be
- * format structure so that we'll know what to do at recovery time.
+ * transferred to the buffer's log format structure so that we'll know what to
+ * do at recovery time.
 */
-/* ARGSUSED */
 void
 xfs_trans_inode_buf(
        xfs_trans_t     *tp,
@@ -826,7 +826,7 @@ xfs_trans_inode_buf(
        bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
        ASSERT(atomic_read(&bip->bli_refcount) > 0);
-        bip->bli_format.blf_flags |= XFS_BLI_INODE_BUF;
+        bip->bli_flags |= XFS_BLI_INODE_BUF;
 }
 /*
@@ -908,9 +908,9 @@ xfs_trans_dquot_buf(
        ASSERT(XFS_BUF_ISBUSY(bp));
        ASSERT(XFS_BUF_FSPRIVATE2(bp, xfs_trans_t *) == tp);
        ASSERT(XFS_BUF_FSPRIVATE(bp, void *) != NULL);
-        ASSERT(type == XFS_BLI_UDQUOT_BUF ||
+        ASSERT(type == XFS_BLF_UDQUOT_BUF ||
-               type == XFS_BLI_PDQUOT_BUF ||
+               type == XFS_BLF_PDQUOT_BUF ||
-               type == XFS_BLI_GDQUOT_BUF);
+               type == XFS_BLF_GDQUOT_BUF);
        bip = XFS_BUF_FSPRIVATE(bp, xfs_buf_log_item_t *);
        ASSERT(atomic_read(&bip->bli_refcount) > 0);
diff --git a/fs/xfs/xfs_trans_item.c b/fs/xfs/xfs_trans_item.c
index eb3fc57f9eef..f11d37d06dcc 100644
--- a/fs/xfs/xfs_trans_item.c
+++ b/fs/xfs/xfs_trans_item.c
@@ -299,6 +299,7 @@ xfs_trans_next_item(xfs_trans_t *tp, xfs_log_item_desc_t *lidp)
 void
 xfs_trans_free_items(
        xfs_trans_t     *tp,
+        xfs_lsn_t       commit_lsn,
        int             flags)
 {
        xfs_log_item_chunk_t    *licp;
@@ -311,7 +312,7 @@ xfs_trans_free_items(
         * Special case the embedded chunk so we don't free it below.
         */
        if (!xfs_lic_are_all_free(licp)) {
-                (void) xfs_trans_unlock_chunk(licp, 1, abort, NULLCOMMITLSN);
+                (void) xfs_trans_unlock_chunk(licp, 1, abort, commit_lsn);
                xfs_lic_all_free(licp);
                licp->lic_unused = 0;
        }
@@ -322,7 +323,7 @@ xfs_trans_free_items(
         */
        while (licp != NULL) {
                ASSERT(!xfs_lic_are_all_free(licp));
-                (void) xfs_trans_unlock_chunk(licp, 1, abort, NULLCOMMITLSN);
+                (void) xfs_trans_unlock_chunk(licp, 1, abort, commit_lsn);
                next_licp = licp->lic_next;
                kmem_free(licp);
                licp = next_licp;
@@ -438,112 +439,3 @@ xfs_trans_unlock_chunk(
        return freed;
 }
-/*
- * This is called to add the given busy item to the transaction's
- * list of busy items.  It must find a free busy item descriptor
- * or allocate a new one and add the item to that descriptor.
- * The function returns a pointer to busy descriptor used to point
- * to the new busy entry.  The log busy entry will now point to its new
- * descriptor with its ???? field.
- */
-xfs_log_busy_slot_t *
-xfs_trans_add_busy(xfs_trans_t *tp, xfs_agnumber_t ag, xfs_extlen_t idx)
-{
-        xfs_log_busy_chunk_t    *lbcp;
-        xfs_log_busy_slot_t     *lbsp;
-        int                     i=0;
-        /*
-         * If there are no free descriptors, allocate a new chunk
-         * of them and put it at the front of the chunk list.
-         */
-        if (tp->t_busy_free == 0) {
-                lbcp = (xfs_log_busy_chunk_t*)
-                       kmem_alloc(sizeof(xfs_log_busy_chunk_t), KM_SLEEP);
-                ASSERT(lbcp != NULL);
-                /*
-                 * Initialize the chunk, and then
-                 * claim the first slot in the newly allocated chunk.
-                 */
-                XFS_LBC_INIT(lbcp);
-                XFS_LBC_CLAIM(lbcp, 0);
-                lbcp->lbc_unused = 1;
-                lbsp = XFS_LBC_SLOT(lbcp, 0);
-                /*
-                 * Link in the new chunk and update the free count.
-                 */
-                lbcp->lbc_next = tp->t_busy.lbc_next;
-                tp->t_busy.lbc_next = lbcp;
-                tp->t_busy_free = XFS_LIC_NUM_SLOTS - 1;
-                /*
-                 * Initialize the descriptor and the generic portion
-                 * of the log item.
-                 *
-                 * Point the new slot at this item and return it.
-                 * Also point the log item at its currently active
-                 * descriptor and set the item's mount pointer.
-                 */
-                lbsp->lbc_ag = ag;
-                lbsp->lbc_idx = idx;
-                return lbsp;
-        }
-        /*
-         * Find the free descriptor. It is somewhere in the chunklist
-         * of descriptors.
-         */
-        lbcp = &tp->t_busy;
-        while (lbcp != NULL) {
-                if (XFS_LBC_VACANCY(lbcp)) {
-                        if (lbcp->lbc_unused <= XFS_LBC_MAX_SLOT) {
-                                i = lbcp->lbc_unused;
-                                break;
-                        } else {
-                                /* out-of-order vacancy */
-                                cmn_err(CE_DEBUG, "OOO vacancy lbcp 0x%p\n", lbcp);
-                                ASSERT(0);
-                        }
-                }
-                lbcp = lbcp->lbc_next;
-        }
-        ASSERT(lbcp != NULL);
-        /*
-         * If we find a free descriptor, claim it,
-         * initialize it, and return it.
-         */
-        XFS_LBC_CLAIM(lbcp, i);
-        if (lbcp->lbc_unused <= i) {
-                lbcp->lbc_unused = i + 1;
-        }
-        lbsp = XFS_LBC_SLOT(lbcp, i);
-        tp->t_busy_free--;
-        lbsp->lbc_ag = ag;
-        lbsp->lbc_idx = idx;
-        return lbsp;
-}
-/*
- * xfs_trans_free_busy
- * Free all of the busy lists from a transaction
- */
-void
-xfs_trans_free_busy(xfs_trans_t *tp)
-{
-        xfs_log_busy_chunk_t    *lbcp;
-        xfs_log_busy_chunk_t    *lbcq;
-        lbcp = tp->t_busy.lbc_next;
-        while (lbcp != NULL) {
-                lbcq = lbcp->lbc_next;
-                kmem_free(lbcp);
-                lbcp = lbcq;
-        }
-        XFS_LBC_INIT(&tp->t_busy);
-        tp->t_busy.lbc_unused = 0;
-}
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index 73e2ad397432..c6e4f2c8de6e 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -35,13 +35,14 @@ struct xfs_log_item_desc	*xfs_trans_find_item(struct xfs_trans *,
 struct xfs_log_item_desc        *xfs_trans_first_item(struct xfs_trans *);
 struct xfs_log_item_desc        *xfs_trans_next_item(struct xfs_trans *,
                                             struct xfs_log_item_desc *);
-void                            xfs_trans_free_items(struct xfs_trans *, int);
-void                            xfs_trans_unlock_items(struct xfs_trans *,
+void    xfs_trans_unlock_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn);
-                                                        xfs_lsn_t);
+void    xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn,
-void                            xfs_trans_free_busy(xfs_trans_t *tp);
+                                int flags);
-xfs_log_busy_slot_t             *xfs_trans_add_busy(xfs_trans_t *tp,
-                                                    xfs_agnumber_t ag,
+void    xfs_trans_item_committed(struct xfs_log_item *lip,
-                                                    xfs_extlen_t idx);
+                                xfs_lsn_t commit_lsn, int aborted);
+void    xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp);
 /*
 * AIL traversal cursor.
diff --git a/fs/xfs/xfs_types.h b/fs/xfs/xfs_types.h
index b09904555d07..320775295e32 100644
--- a/fs/xfs/xfs_types.h
+++ b/fs/xfs/xfs_types.h
@@ -75,6 +75,8 @@ typedef	__uint32_t	xfs_dahash_t;	/* dir/attr hash value */
 typedef __uint16_t      xfs_prid_t;     /* prid_t truncated to 16bits in XFS */
+typedef __uint32_t      xlog_tid_t;     /* transaction ID type */
 /*
 * These types are 64 bits on disk but are either 32 or 64 bits in memory.
 * Disk based types:
author	Jens Axboe <jaxboe@fusionio.com>	2010-06-01 06:42:12 -0400
committer	Jens Axboe <jaxboe@fusionio.com>	2010-06-01 06:42:12 -0400
commit	b4ca761577535b2b4d153689ee97342797dfff05 (patch)
tree	29054d55508f1faa22ec32acf7c245751af03348 /fs
parent	28f4197e5d4707311febeec8a0eb97cb5fd93c97 (diff)
parent	67a3e12b05e055c0415c556a315a3d3eb637e29e (diff)