path: root/fs
author		David S. Miller <davem@davemloft.net>	2010-06-26 13:27:00 -0400
committer	David S. Miller <davem@davemloft.net>	2010-06-26 13:27:00 -0400
commit		c67dda14389205f0a223c5089307495290939b3b (patch)
tree		fad0bb26b28703d02a22ebdd44d94eabac4a2ade /fs
parent		43bc2db47292a824152145253b1dd2847e7312a3 (diff)
parent		7e27d6e778cd87b6f2415515d7127eba53fe5d02 (diff)
Merge branch 'master' of /home/davem/src/GIT/linux-2.6/
Diffstat (limited to 'fs')
-rw-r--r--	fs/9p/vfs_file.c	6
-rw-r--r--	fs/adfs/dir.c	2
-rw-r--r--	fs/adfs/file.c	2
-rw-r--r--	fs/adfs/inode.c	3
-rw-r--r--	fs/affs/affs.h	2
-rw-r--r--	fs/affs/file.c	4
-rw-r--r--	fs/affs/namei.c	2
-rw-r--r--	fs/afs/internal.h	2
-rw-r--r--	fs/afs/server.c	5
-rw-r--r--	fs/afs/write.c	3
-rw-r--r--	fs/aio.c	71
-rw-r--r--	fs/anon_inodes.c	2
-rw-r--r--	fs/attr.c	50
-rw-r--r--	fs/autofs/root.c	1
-rw-r--r--	fs/autofs4/dev-ioctl.c	13
-rw-r--r--	fs/bad_inode.c	3
-rw-r--r--	fs/bfs/dir.c	2
-rw-r--r--	fs/binfmt_elf_fdpic.c	26
-rw-r--r--	fs/binfmt_flat.c	25
-rw-r--r--	fs/block_dev.c	88
-rw-r--r--	fs/btrfs/acl.c	8
-rw-r--r--	fs/btrfs/async-thread.c	1
-rw-r--r--	fs/btrfs/btrfs_inode.h	3
-rw-r--r--	fs/btrfs/ctree.c	109
-rw-r--r--	fs/btrfs/ctree.h	165
-rw-r--r--	fs/btrfs/delayed-ref.c	101
-rw-r--r--	fs/btrfs/delayed-ref.h	3
-rw-r--r--	fs/btrfs/disk-io.c	180
-rw-r--r--	fs/btrfs/disk-io.h	4
-rw-r--r--	fs/btrfs/extent-tree.c	2256
-rw-r--r--	fs/btrfs/extent_io.c	85
-rw-r--r--	fs/btrfs/extent_io.h	14
-rw-r--r--	fs/btrfs/file-item.c	28
-rw-r--r--	fs/btrfs/file.c	181
-rw-r--r--	fs/btrfs/inode-item.c	27
-rw-r--r--	fs/btrfs/inode.c	1713
-rw-r--r--	fs/btrfs/ioctl.c	208
-rw-r--r--	fs/btrfs/ordered-data.c	82
-rw-r--r--	fs/btrfs/ordered-data.h	9
-rw-r--r--	fs/btrfs/relocation.c	1974
-rw-r--r--	fs/btrfs/root-tree.c	26
-rw-r--r--	fs/btrfs/super.c	36
-rw-r--r--	fs/btrfs/transaction.c	232
-rw-r--r--	fs/btrfs/transaction.h	24
-rw-r--r--	fs/btrfs/tree-defrag.c	7
-rw-r--r--	fs/btrfs/tree-log.c	241
-rw-r--r--	fs/btrfs/tree-log.h	2
-rw-r--r--	fs/btrfs/volumes.c	17
-rw-r--r--	fs/btrfs/xattr.c	12
-rw-r--r--	fs/buffer.c	123
-rw-r--r--	fs/ceph/auth.c	7
-rw-r--r--	fs/ceph/auth.h	6
-rw-r--r--	fs/ceph/auth_none.c	8
-rw-r--r--	fs/ceph/auth_x.c	12
-rw-r--r--	fs/ceph/caps.c	97
-rw-r--r--	fs/ceph/ceph_fs.h	21
-rw-r--r--	fs/ceph/dir.c	7
-rw-r--r--	fs/ceph/export.c	2
-rw-r--r--	fs/ceph/file.c	2
-rw-r--r--	fs/ceph/inode.c	4
-rw-r--r--	fs/ceph/mds_client.c	49
-rw-r--r--	fs/ceph/mds_client.h	6
-rw-r--r--	fs/ceph/messenger.c	6
-rw-r--r--	fs/ceph/messenger.h	1
-rw-r--r--	fs/ceph/mon_client.c	7
-rw-r--r--	fs/ceph/osd_client.c	7
-rw-r--r--	fs/ceph/osdmap.c	2
-rw-r--r--	fs/ceph/super.c	16
-rw-r--r--	fs/ceph/super.h	3
-rw-r--r--	fs/cifs/cifsfs.h	2
-rw-r--r--	fs/cifs/file.c	5
-rw-r--r--	fs/coda/coda_int.h	3
-rw-r--r--	fs/coda/file.c	4
-rw-r--r--	fs/compat.c	132
-rw-r--r--	fs/configfs/inode.c	14
-rw-r--r--	fs/debugfs/file.c	21
-rw-r--r--	fs/direct-io.c	123
-rw-r--r--	fs/ecryptfs/file.c	2
-rw-r--r--	fs/ecryptfs/inode.c	4
-rw-r--r--	fs/exec.c	195
-rw-r--r--	fs/exofs/file.c	7
-rw-r--r--	fs/ext2/ext2.h	3
-rw-r--r--	fs/ext2/file.c	7
-rw-r--r--	fs/ext2/inode.c	153
-rw-r--r--	fs/ext2/super.c	20
-rw-r--r--	fs/ext3/dir.c	2
-rw-r--r--	fs/ext3/fsync.c	4
-rw-r--r--	fs/ext3/super.c	38
-rw-r--r--	fs/ext4/balloc.c	5
-rw-r--r--	fs/ext4/block_validity.c	4
-rw-r--r--	fs/ext4/dir.c	26
-rw-r--r--	fs/ext4/ext4.h	169
-rw-r--r--	fs/ext4/ext4_jbd2.h	8
-rw-r--r--	fs/ext4/extents.c	417
-rw-r--r--	fs/ext4/file.c	2
-rw-r--r--	fs/ext4/fsync.c	41
-rw-r--r--	fs/ext4/ialloc.c	89
-rw-r--r--	fs/ext4/inode.c	763
-rw-r--r--	fs/ext4/ioctl.c	27
-rw-r--r--	fs/ext4/mballoc.c	120
-rw-r--r--	fs/ext4/migrate.c	2
-rw-r--r--	fs/ext4/move_extent.c	16
-rw-r--r--	fs/ext4/namei.c	61
-rw-r--r--	fs/ext4/resize.c	3
-rw-r--r--	fs/ext4/super.c	117
-rw-r--r--	fs/ext4/symlink.c	2
-rw-r--r--	fs/ext4/xattr.c	39
-rw-r--r--	fs/fat/fat.h	6
-rw-r--r--	fs/fat/file.c	40
-rw-r--r--	fs/fat/inode.c	35
-rw-r--r--	fs/fcntl.c	7
-rw-r--r--	fs/file_table.c	21
-rw-r--r--	fs/freevxfs/vxfs_lookup.c	2
-rw-r--r--	fs/fs-writeback.c	64
-rw-r--r--	fs/fscache/object-list.c	2
-rw-r--r--	fs/fscache/page.c	36
-rw-r--r--	fs/fuse/dev.c	527
-rw-r--r--	fs/fuse/dir.c	5
-rw-r--r--	fs/fuse/file.c	48
-rw-r--r--	fs/fuse/fuse_i.h	6
-rw-r--r--	fs/gfs2/aops.c	8
-rw-r--r--	fs/gfs2/file.c	4
-rw-r--r--	fs/gfs2/ops_inode.c	5
-rw-r--r--	fs/hostfs/hostfs_kern.c	4
-rw-r--r--	fs/hpfs/file.c	4
-rw-r--r--	fs/hpfs/hpfs_fn.h	2
-rw-r--r--	fs/hppfs/hppfs.c	2
-rw-r--r--	fs/hugetlbfs/inode.c	2
-rw-r--r--	fs/isofs/dir.c	1
-rw-r--r--	fs/jbd2/transaction.c	5
-rw-r--r--	fs/jffs2/acl.c	3
-rw-r--r--	fs/jffs2/dir.c	127
-rw-r--r--	fs/jffs2/file.c	4
-rw-r--r--	fs/jffs2/fs.c	11
-rw-r--r--	fs/jffs2/os-linux.h	2
-rw-r--r--	fs/jfs/file.c	4
-rw-r--r--	fs/jfs/jfs_inode.h	2
-rw-r--r--	fs/jfs/super.c	16
-rw-r--r--	fs/libfs.c	111
-rw-r--r--	fs/logfs/file.c	4
-rw-r--r--	fs/logfs/logfs.h	2
-rw-r--r--	fs/minix/dir.c	11
-rw-r--r--	fs/minix/file.c	2
-rw-r--r--	fs/minix/itree_v2.c	27
-rw-r--r--	fs/namei.c	2
-rw-r--r--	fs/ncpfs/dir.c	1
-rw-r--r--	fs/ncpfs/file.c	2
-rw-r--r--	fs/nfs/dir.c	7
-rw-r--r--	fs/nfs/file.c	5
-rw-r--r--	fs/nfs/write.c	20
-rw-r--r--	fs/nfsd/nfs4state.c	2
-rw-r--r--	fs/nfsd/vfs.c	3
-rw-r--r--	fs/nilfs2/btree.h	2
-rw-r--r--	fs/nilfs2/file.c	4
-rw-r--r--	fs/nilfs2/nilfs.h	2
-rw-r--r--	fs/nilfs2/segbuf.h	2
-rw-r--r--	fs/nilfs2/segment.h	2
-rw-r--r--	fs/nilfs2/super.c	8
-rw-r--r--	fs/ntfs/dir.c	5
-rw-r--r--	fs/ntfs/file.c	9
-rw-r--r--	fs/ocfs2/file.c	15
-rw-r--r--	fs/ocfs2/super.c	50
-rw-r--r--	fs/omfs/file.c	2
-rw-r--r--	fs/pipe.c	102
-rw-r--r--	fs/proc/array.c	4
-rw-r--r--	fs/proc/base.c	16
-rw-r--r--	fs/proc/generic.c	15
-rw-r--r--	fs/proc/kcore.c	2
-rw-r--r--	fs/proc/root.c	1
-rw-r--r--	fs/qnx4/dir.c	3
-rw-r--r--	fs/quota/dquot.c	201
-rw-r--r--	fs/quota/quota.c	4
-rw-r--r--	fs/ramfs/file-mmu.c	3
-rw-r--r--	fs/ramfs/file-nommu.c	9
-rw-r--r--	fs/read_write.c	17
-rw-r--r--	fs/reiserfs/dir.c	9
-rw-r--r--	fs/reiserfs/file.c	5
-rw-r--r--	fs/reiserfs/super.c	48
-rw-r--r--	fs/smbfs/dir.c	1
-rw-r--r--	fs/smbfs/file.c	3
-rw-r--r--	fs/smbfs/inode.c	2
-rw-r--r--	fs/splice.c	2
-rw-r--r--	fs/squashfs/Kconfig	11
-rw-r--r--	fs/squashfs/Makefile	2
-rw-r--r--	fs/squashfs/inode.c	92
-rw-r--r--	fs/squashfs/namei.c	6
-rw-r--r--	fs/squashfs/squashfs.h	12
-rw-r--r--	fs/squashfs/squashfs_fs.h	76
-rw-r--r--	fs/squashfs/squashfs_fs_i.h	3
-rw-r--r--	fs/squashfs/squashfs_fs_sb.h	3
-rw-r--r--	fs/squashfs/super.c	30
-rw-r--r--	fs/squashfs/symlink.c	11
-rw-r--r--	fs/squashfs/xattr.c	323
-rw-r--r--	fs/squashfs/xattr.h	46
-rw-r--r--	fs/squashfs/xattr_id.c	100
-rw-r--r--	fs/super.c	17
-rw-r--r--	fs/sync.c	10
-rw-r--r--	fs/sysfs/inode.c	8
-rw-r--r--	fs/sysv/dir.c	2
-rw-r--r--	fs/sysv/file.c	2
-rw-r--r--	fs/sysv/inode.c	1
-rw-r--r--	fs/ubifs/file.c	17
-rw-r--r--	fs/ubifs/ubifs.h	4
-rw-r--r--	fs/udf/balloc.c	43
-rw-r--r--	fs/udf/dir.c	3
-rw-r--r--	fs/udf/file.c	28
-rw-r--r--	fs/udf/ialloc.c	21
-rw-r--r--	fs/udf/inode.c	5
-rw-r--r--	fs/udf/namei.c	20
-rw-r--r--	fs/udf/super.c	13
-rw-r--r--	fs/udf/udfdecl.h	1
-rw-r--r--	fs/ufs/balloc.c	24
-rw-r--r--	fs/ufs/dir.c	2
-rw-r--r--	fs/ufs/file.c	5
-rw-r--r--	fs/ufs/ialloc.c	13
-rw-r--r--	fs/ufs/inode.c	4
-rw-r--r--	fs/ufs/namei.c	16
-rw-r--r--	fs/ufs/super.c	112
-rw-r--r--	fs/ufs/truncate.c	20
-rw-r--r--	fs/ufs/ufs_fs.h	1
-rw-r--r--	fs/xfs/linux-2.6/xfs_aops.c	23
-rw-r--r--	fs/xfs/linux-2.6/xfs_file.c	10
-rw-r--r--	fs/xfs/linux-2.6/xfs_iops.c	16
-rw-r--r--	fs/xfs/linux-2.6/xfs_quotaops.c	1
-rw-r--r--	fs/xfs/linux-2.6/xfs_sync.c	9
-rw-r--r--	fs/xfs/linux-2.6/xfs_trace.c	1
-rw-r--r--	fs/xfs/linux-2.6/xfs_trace.h	356
-rw-r--r--	fs/xfs/quota/xfs_qm.c	4
-rw-r--r--	fs/xfs/xfs_ag.h	1
-rw-r--r--	fs/xfs/xfs_iget.c	29
-rw-r--r--	fs/xfs/xfs_inode.c	144
-rw-r--r--	fs/xfs/xfs_log_recover.c	11
-rw-r--r--	fs/xfs/xfs_mount.c	68
-rw-r--r--	fs/xfs/xfs_rtalloc.c	4
-rw-r--r--	fs/xfs/xfs_rtalloc.h	11
-rw-r--r--	fs/xfs/xfs_trans.c	446
-rw-r--r--	fs/xfs/xfs_trans.h	411
-rw-r--r--	fs/xfs/xfs_vnodeops.c	2
238 files changed, 9711 insertions, 5791 deletions
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 25b300e1c9d7..2bedc6c94fc2 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -257,15 +257,13 @@ v9fs_file_write(struct file *filp, const char __user * data,
 	return total;
 }
 
-static int v9fs_file_fsync(struct file *filp, struct dentry *dentry,
-			   int datasync)
+static int v9fs_file_fsync(struct file *filp, int datasync)
 {
 	struct p9_fid *fid;
 	struct p9_wstat wstat;
 	int retval;
 
-	P9_DPRINTK(P9_DEBUG_VFS, "filp %p dentry %p datasync %x\n", filp,
-		   dentry, datasync);
+	P9_DPRINTK(P9_DEBUG_VFS, "filp %p datasync %x\n", filp, datasync);
 
 	fid = filp->private_data;
 	v9fs_blank_wstat(&wstat);
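
Note on the API change above: this merge switches the ->fsync() file operation from taking (file, dentry, datasync) to just (file, datasync). A filesystem that still needs the inode or dentry can recover both from the struct file, as the affs and afs hunks below do. A minimal sketch of the new shape (foofs_fsync is a hypothetical example, not part of this patch):

	static int foofs_fsync(struct file *file, int datasync)
	{
		/* the inode is reachable through the file's address space ... */
		struct inode *inode = file->f_mapping->host;
		/* ... and the dentry through f_path, if it is still needed */
		struct dentry *dentry = file->f_path.dentry;

		/* filesystem-specific writeback, honouring datasync, goes here */
		return 0;
	}
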
diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c
index 23aa52f548a0..f4287e4de744 100644
--- a/fs/adfs/dir.c
+++ b/fs/adfs/dir.c
@@ -197,7 +197,7 @@ const struct file_operations adfs_dir_operations = {
 	.read		= generic_read_dir,
 	.llseek		= generic_file_llseek,
 	.readdir	= adfs_readdir,
-	.fsync		= simple_fsync,
+	.fsync		= generic_file_fsync,
 };
 
 static int
diff --git a/fs/adfs/file.c b/fs/adfs/file.c
index 005ea34d1758..a36da5382b40 100644
--- a/fs/adfs/file.c
+++ b/fs/adfs/file.c
@@ -26,7 +26,7 @@ const struct file_operations adfs_file_operations = {
 	.read		= do_sync_read,
 	.aio_read	= generic_file_aio_read,
 	.mmap		= generic_file_mmap,
-	.fsync		= simple_fsync,
+	.fsync		= generic_file_fsync,
 	.write		= do_sync_write,
 	.aio_write	= generic_file_aio_write,
 	.splice_read	= generic_file_splice_read,
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index 0f5e30978135..6f850b06ab62 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -322,8 +322,9 @@ adfs_notify_change(struct dentry *dentry, struct iattr *attr)
 	if (error)
 		goto out;
 
+	/* XXX: this is missing some actual on-disk truncation.. */
 	if (ia_valid & ATTR_SIZE)
-		error = vmtruncate(inode, attr->ia_size);
+		error = simple_setsize(inode, attr->ia_size);
 
 	if (error)
 		goto out;
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index 861dae68ac12..f05b6155ccc8 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -183,7 +183,7 @@ extern int affs_add_entry(struct inode *dir, struct inode *inode, struct dent
 
 void	affs_free_prealloc(struct inode *inode);
 extern void	affs_truncate(struct inode *);
-int	affs_file_fsync(struct file *, struct dentry *, int);
+int	affs_file_fsync(struct file *, int);
 
 /* dir.c */
 
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 184e55c1c9ba..322710c3eedf 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -916,9 +916,9 @@ affs_truncate(struct inode *inode)
 	affs_free_prealloc(inode);
 }
 
-int affs_file_fsync(struct file *filp, struct dentry *dentry, int datasync)
+int affs_file_fsync(struct file *filp, int datasync)
 {
-	struct inode * inode = dentry->d_inode;
+	struct inode *inode = filp->f_mapping->host;
 	int ret, err;
 
 	ret = write_inode_now(inode, 0);
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index d70bbbac6b7b..914d1c0bc07a 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -224,7 +224,7 @@ affs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
 		affs_brelse(bh);
 		inode = affs_iget(sb, ino);
 		if (IS_ERR(inode))
-			return ERR_PTR(PTR_ERR(inode));
+			return ERR_CAST(inode);
 	}
 	dentry->d_op = AFFS_SB(sb)->s_flags & SF_INTL ? &affs_intl_dentry_operations : &affs_dentry_operations;
 	d_add(dentry, inode);
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 807f284cc75e..5f679b77ce24 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -740,7 +740,7 @@ extern void afs_pages_written_back(struct afs_vnode *, struct afs_call *);
 extern ssize_t afs_file_write(struct kiocb *, const struct iovec *,
 			      unsigned long, loff_t);
 extern int afs_writeback_all(struct afs_vnode *);
-extern int afs_fsync(struct file *, struct dentry *, int);
+extern int afs_fsync(struct file *, int);
 
 
 /*****************************************************************************/
diff --git a/fs/afs/server.c b/fs/afs/server.c
index f49099516675..9fdc7fe3a7bc 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -91,9 +91,10 @@ static struct afs_server *afs_alloc_server(struct afs_cell *cell,
 
 		memcpy(&server->addr, addr, sizeof(struct in_addr));
 		server->addr.s_addr = addr->s_addr;
+		_leave(" = %p{%d}", server, atomic_read(&server->usage));
+	} else {
+		_leave(" = NULL [nomem]");
 	}
-
-	_leave(" = %p{%d}", server, atomic_read(&server->usage));
 	return server;
 }
 
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 3bed54a294d4..3dab9e9948d0 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -701,8 +701,9 @@ int afs_writeback_all(struct afs_vnode *vnode)
  * - the return status from this call provides a reliable indication of
  *   whether any write errors occurred for this process.
  */
-int afs_fsync(struct file *file, struct dentry *dentry, int datasync)
+int afs_fsync(struct file *file, int datasync)
 {
+	struct dentry *dentry = file->f_path.dentry;
 	struct afs_writeback *wb, *xwb;
 	struct afs_vnode *vnode = AFS_FS_I(dentry->d_inode);
 	int ret;
diff --git a/fs/aio.c b/fs/aio.c
index 1cf12b3dd83a..1ccf25cef1f0 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -36,6 +36,7 @@
 #include <linux/blkdev.h>
 #include <linux/mempool.h>
 #include <linux/hash.h>
+#include <linux/compat.h>
 
 #include <asm/kmap_types.h>
 #include <asm/uaccess.h>
@@ -526,7 +527,7 @@ static void aio_fput_routine(struct work_struct *data)
 
 	/* Complete the fput(s) */
 	if (req->ki_filp != NULL)
-		__fput(req->ki_filp);
+		fput(req->ki_filp);
 
 	/* Link the iocb into the context's free list */
 	spin_lock_irq(&ctx->ctx_lock);
@@ -559,11 +560,11 @@ static int __aio_put_req(struct kioctx *ctx, struct kiocb *req)
 
 	/*
 	 * Try to optimize the aio and eventfd file* puts, by avoiding to
-	 * schedule work in case it is not __fput() time. In normal cases,
+	 * schedule work in case it is not final fput() time. In normal cases,
 	 * we would not be holding the last reference to the file*, so
 	 * this function will be executed w/out any aio kthread wakeup.
 	 */
-	if (unlikely(atomic_long_dec_and_test(&req->ki_filp->f_count))) {
+	if (unlikely(!fput_atomic(req->ki_filp))) {
 		get_ioctx(ctx);
 		spin_lock(&fput_lock);
 		list_add(&req->ki_list, &fput_head);
@@ -1384,13 +1385,22 @@ static ssize_t aio_fsync(struct kiocb *iocb)
 	return ret;
 }
 
-static ssize_t aio_setup_vectored_rw(int type, struct kiocb *kiocb)
+static ssize_t aio_setup_vectored_rw(int type, struct kiocb *kiocb, bool compat)
 {
 	ssize_t ret;
 
-	ret = rw_copy_check_uvector(type, (struct iovec __user *)kiocb->ki_buf,
-				    kiocb->ki_nbytes, 1,
-				    &kiocb->ki_inline_vec, &kiocb->ki_iovec);
+#ifdef CONFIG_COMPAT
+	if (compat)
+		ret = compat_rw_copy_check_uvector(type,
+				(struct compat_iovec __user *)kiocb->ki_buf,
+				kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec,
+				&kiocb->ki_iovec);
+	else
+#endif
+		ret = rw_copy_check_uvector(type,
+				(struct iovec __user *)kiocb->ki_buf,
+				kiocb->ki_nbytes, 1, &kiocb->ki_inline_vec,
+				&kiocb->ki_iovec);
 	if (ret < 0)
 		goto out;
 
@@ -1420,7 +1430,7 @@ static ssize_t aio_setup_single_vector(struct kiocb *kiocb)
  * Performs the initial checks and aio retry method
  * setup for the kiocb at the time of io submission.
  */
-static ssize_t aio_setup_iocb(struct kiocb *kiocb)
+static ssize_t aio_setup_iocb(struct kiocb *kiocb, bool compat)
 {
 	struct file *file = kiocb->ki_filp;
 	ssize_t ret = 0;
@@ -1469,7 +1479,7 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb)
 		ret = security_file_permission(file, MAY_READ);
 		if (unlikely(ret))
 			break;
-		ret = aio_setup_vectored_rw(READ, kiocb);
+		ret = aio_setup_vectored_rw(READ, kiocb, compat);
 		if (ret)
 			break;
 		ret = -EINVAL;
@@ -1483,7 +1493,7 @@ static ssize_t aio_setup_iocb(struct kiocb *kiocb)
 		ret = security_file_permission(file, MAY_WRITE);
 		if (unlikely(ret))
 			break;
-		ret = aio_setup_vectored_rw(WRITE, kiocb);
+		ret = aio_setup_vectored_rw(WRITE, kiocb, compat);
 		if (ret)
 			break;
 		ret = -EINVAL;
@@ -1548,7 +1558,8 @@ static void aio_batch_free(struct hlist_head *batch_hash)
 }
 
 static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
-			 struct iocb *iocb, struct hlist_head *batch_hash)
+			 struct iocb *iocb, struct hlist_head *batch_hash,
+			 bool compat)
 {
 	struct kiocb *req;
 	struct file *file;
@@ -1609,7 +1620,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
 	req->ki_left = req->ki_nbytes = iocb->aio_nbytes;
 	req->ki_opcode = iocb->aio_lio_opcode;
 
-	ret = aio_setup_iocb(req);
+	ret = aio_setup_iocb(req, compat);
 
 	if (ret)
 		goto out_put_req;
@@ -1637,20 +1648,8 @@ out_put_req:
 	return ret;
 }
 
-/* sys_io_submit:
- *	Queue the nr iocbs pointed to by iocbpp for processing.  Returns
- *	the number of iocbs queued.  May return -EINVAL if the aio_context
- *	specified by ctx_id is invalid, if nr is < 0, if the iocb at
- *	*iocbpp[0] is not properly initialized, if the operation specified
- *	is invalid for the file descriptor in the iocb.  May fail with
- *	-EFAULT if any of the data structures point to invalid data.  May
- *	fail with -EBADF if the file descriptor specified in the first
- *	iocb is invalid.  May fail with -EAGAIN if insufficient resources
- *	are available to queue any iocbs.  Will return 0 if nr is 0.  Will
- *	fail with -ENOSYS if not implemented.
- */
-SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
-		struct iocb __user * __user *, iocbpp)
+long do_io_submit(aio_context_t ctx_id, long nr,
+		  struct iocb __user *__user *iocbpp, bool compat)
 {
 	struct kioctx *ctx;
 	long ret = 0;
@@ -1687,7 +1686,7 @@ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
 			break;
 		}
 
-		ret = io_submit_one(ctx, user_iocb, &tmp, batch_hash);
+		ret = io_submit_one(ctx, user_iocb, &tmp, batch_hash, compat);
 		if (ret)
 			break;
 	}
@@ -1697,6 +1696,24 @@ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
 	return i ? i : ret;
 }
 
+/* sys_io_submit:
+ *	Queue the nr iocbs pointed to by iocbpp for processing.  Returns
+ *	the number of iocbs queued.  May return -EINVAL if the aio_context
+ *	specified by ctx_id is invalid, if nr is < 0, if the iocb at
+ *	*iocbpp[0] is not properly initialized, if the operation specified
+ *	is invalid for the file descriptor in the iocb.  May fail with
+ *	-EFAULT if any of the data structures point to invalid data.  May
+ *	fail with -EBADF if the file descriptor specified in the first
+ *	iocb is invalid.  May fail with -EAGAIN if insufficient resources
+ *	are available to queue any iocbs.  Will return 0 if nr is 0.  Will
+ *	fail with -ENOSYS if not implemented.
+ */
+SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
+		struct iocb __user * __user *, iocbpp)
+{
+	return do_io_submit(ctx_id, nr, iocbpp, 0);
+}
+
 /* lookup_kiocb
  *	Finds a given iocb for cancellation.
  */
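
The refactoring above splits the submission path so a 32-bit entry point can share it: do_io_submit() takes an explicit compat flag, and the native syscall passes 0. A sketch of how a compat caller could sit on top of it; the diffstat lists fs/compat.c among the touched files, but its hunks are not shown here, so copy_iocb() and MAX_AIO_SUBMITS below are assumptions standing in for whatever helpers the compat layer provides:

	asmlinkage long compat_sys_io_submit(aio_context_t ctx_id, int nr,
					     u32 __user *iocbpp)
	{
		struct iocb __user * __user *iocb64;
		long ret;

		if (unlikely(nr < 0))
			return -EINVAL;
		if (nr > MAX_AIO_SUBMITS)		/* assumed cap */
			nr = MAX_AIO_SUBMITS;

		/* widen the 32-bit iocb pointers into a native pointer array */
		iocb64 = compat_alloc_user_space(nr * sizeof(*iocb64));
		ret = copy_iocb(nr, iocbpp, iocb64);	/* assumed helper */
		if (!ret)
			ret = do_io_submit(ctx_id, nr, iocb64, 1);
		return ret;
	}
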
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 9bd4b3876c99..e4b75d6eda83 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -205,7 +205,7 @@ static struct inode *anon_inode_mkinode(void)
 	 * that it already _is_ on the dirty list.
 	 */
 	inode->i_state = I_DIRTY;
-	inode->i_mode = S_IFREG | S_IRUSR | S_IWUSR;
+	inode->i_mode = S_IRUSR | S_IWUSR;
 	inode->i_uid = current_fsuid();
 	inode->i_gid = current_fsgid();
 	inode->i_flags |= S_PRIVATE;
diff --git a/fs/attr.c b/fs/attr.c
index 0815e93bb487..b4fa3b0aa596 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -67,14 +67,14 @@ EXPORT_SYMBOL(inode_change_ok);
  * @offset:	the new size to assign to the inode
  * @Returns:	0 on success, -ve errno on failure
  *
+ * inode_newsize_ok must be called with i_mutex held.
+ *
  * inode_newsize_ok will check filesystem limits and ulimits to check that the
  * new inode size is within limits. inode_newsize_ok will also send SIGXFSZ
  * when necessary. Caller must not proceed with inode size change if failure is
  * returned. @inode must be a file (not directory), with appropriate
  * permissions to allow truncate (inode_newsize_ok does NOT check these
  * conditions).
- *
- * inode_newsize_ok must be called with i_mutex held.
  */
 int inode_newsize_ok(const struct inode *inode, loff_t offset)
 {
@@ -104,17 +104,25 @@ out_big:
 }
 EXPORT_SYMBOL(inode_newsize_ok);
 
-int inode_setattr(struct inode * inode, struct iattr * attr)
+/**
+ * generic_setattr - copy simple metadata updates into the generic inode
+ * @inode:	the inode to be updated
+ * @attr:	the new attributes
+ *
+ * generic_setattr must be called with i_mutex held.
+ *
+ * generic_setattr updates the inode's metadata with that specified
+ * in attr. Noticably missing is inode size update, which is more complex
+ * as it requires pagecache updates. See simple_setsize.
+ *
+ * The inode is not marked as dirty after this operation. The rationale is
+ * that for "simple" filesystems, the struct inode is the inode storage.
+ * The caller is free to mark the inode dirty afterwards if needed.
+ */
+void generic_setattr(struct inode *inode, const struct iattr *attr)
 {
 	unsigned int ia_valid = attr->ia_valid;
 
-	if (ia_valid & ATTR_SIZE &&
-	    attr->ia_size != i_size_read(inode)) {
-		int error = vmtruncate(inode, attr->ia_size);
-		if (error)
-			return error;
-	}
-
 	if (ia_valid & ATTR_UID)
 		inode->i_uid = attr->ia_uid;
 	if (ia_valid & ATTR_GID)
@@ -135,6 +143,28 @@ int inode_setattr(struct inode * inode, struct iattr * attr)
 		mode &= ~S_ISGID;
 		inode->i_mode = mode;
 	}
+}
+EXPORT_SYMBOL(generic_setattr);
+
+/*
+ * note this function is deprecated, the new truncate sequence should be
+ * used instead -- see eg. simple_setsize, generic_setattr.
+ */
+int inode_setattr(struct inode *inode, const struct iattr *attr)
+{
+	unsigned int ia_valid = attr->ia_valid;
+
+	if (ia_valid & ATTR_SIZE &&
+	    attr->ia_size != i_size_read(inode)) {
+		int error;
+
+		error = vmtruncate(inode, attr->ia_size);
+		if (error)
+			return error;
+	}
+
+	generic_setattr(inode, attr);
+
 	mark_inode_dirty(inode);
 
 	return 0;
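
The attr.c hunks split the old monolithic inode_setattr() into a size-free generic_setattr() plus an explicit truncate step. A sketch of how a "simple" filesystem might wire these up under the new sequence (foofs_setattr is hypothetical; inode_change_ok, simple_setsize, and generic_setattr are the real helpers this patch works with):

	static int foofs_setattr(struct dentry *dentry, struct iattr *attr)
	{
		struct inode *inode = dentry->d_inode;
		int error;

		error = inode_change_ok(inode, attr);
		if (error)
			return error;

		if (attr->ia_valid & ATTR_SIZE) {
			/* size changes go through the new truncate path */
			error = simple_setsize(inode, attr->ia_size);
			if (error)
				return error;
		}

		/* everything else is plain metadata */
		generic_setattr(inode, attr);
		mark_inode_dirty(inode);
		return 0;
	}
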
diff --git a/fs/autofs/root.c b/fs/autofs/root.c
index 8713c7cfbc79..9a0520b50663 100644
--- a/fs/autofs/root.c
+++ b/fs/autofs/root.c
@@ -28,6 +28,7 @@ static int autofs_root_mkdir(struct inode *,struct dentry *,int);
 static int autofs_root_ioctl(struct inode *, struct file *,unsigned int,unsigned long);
 
 const struct file_operations autofs_root_operations = {
+	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
 	.readdir	= autofs_root_readdir,
 	.ioctl		= autofs_root_ioctl,
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index d832062869f6..ba4a38b9c22f 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -95,7 +95,7 @@ static int check_dev_ioctl_version(int cmd, struct autofs_dev_ioctl *param)
  */
 static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *in)
 {
-	struct autofs_dev_ioctl tmp, *ads;
+	struct autofs_dev_ioctl tmp;
 
 	if (copy_from_user(&tmp, in, sizeof(tmp)))
 		return ERR_PTR(-EFAULT);
@@ -103,16 +103,7 @@ static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *i
 	if (tmp.size < sizeof(tmp))
 		return ERR_PTR(-EINVAL);
 
-	ads = kmalloc(tmp.size, GFP_KERNEL);
-	if (!ads)
-		return ERR_PTR(-ENOMEM);
-
-	if (copy_from_user(ads, in, tmp.size)) {
-		kfree(ads);
-		return ERR_PTR(-EFAULT);
-	}
-
-	return ads;
+	return memdup_user(in, tmp.size);
 }
 
 static inline void free_dev_ioctl(struct autofs_dev_ioctl *param)
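
memdup_user(), used above, folds the allocate-then-copy pattern the removed lines spelled out by hand, returning either the new kernel buffer or an ERR_PTR. Roughly, its behaviour (a simplified sketch, not the exact mm/util.c implementation):

	void *memdup_user(const void __user *src, size_t len)
	{
		void *p;

		p = kmalloc(len, GFP_KERNEL);
		if (!p)
			return ERR_PTR(-ENOMEM);

		if (copy_from_user(p, src, len)) {
			kfree(p);
			return ERR_PTR(-EFAULT);
		}
		return p;
	}
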
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index a05287a23f62..52e59bf4aa5f 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -93,8 +93,7 @@ static int bad_file_release(struct inode *inode, struct file *filp)
 	return -EIO;
 }
 
-static int bad_file_fsync(struct file *file, struct dentry *dentry,
-			  int datasync)
+static int bad_file_fsync(struct file *file, int datasync)
 {
 	return -EIO;
 }
diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c
index 8f73841fc974..d967e052b779 100644
--- a/fs/bfs/dir.c
+++ b/fs/bfs/dir.c
@@ -78,7 +78,7 @@ static int bfs_readdir(struct file *f, void *dirent, filldir_t filldir)
 const struct file_operations bfs_dir_operations = {
 	.read		= generic_read_dir,
 	.readdir	= bfs_readdir,
-	.fsync		= simple_fsync,
+	.fsync		= generic_file_fsync,
 	.llseek		= generic_file_llseek,
 };
 
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 2c5f9a0e5d72..63039ed9576f 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -990,10 +990,9 @@ static int elf_fdpic_map_file_constdisp_on_uclinux(
 
 	/* clear any space allocated but not loaded */
 	if (phdr->p_filesz < phdr->p_memsz) {
-		ret = clear_user((void *) (seg->addr + phdr->p_filesz),
-				 phdr->p_memsz - phdr->p_filesz);
-		if (ret)
-			return ret;
+		if (clear_user((void *) (seg->addr + phdr->p_filesz),
+			       phdr->p_memsz - phdr->p_filesz))
+			return -EFAULT;
 	}
 
 	if (mm) {
@@ -1027,7 +1026,7 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
 	struct elf32_fdpic_loadseg *seg;
 	struct elf32_phdr *phdr;
 	unsigned long load_addr, delta_vaddr;
-	int loop, dvset, ret;
+	int loop, dvset;
 
 	load_addr = params->load_addr;
 	delta_vaddr = 0;
@@ -1127,9 +1126,8 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
 		 * PT_LOAD */
 		if (prot & PROT_WRITE && disp > 0) {
 			kdebug("clear[%d] ad=%lx sz=%lx", loop, maddr, disp);
-			ret = clear_user((void __user *) maddr, disp);
-			if (ret)
-				return ret;
+			if (clear_user((void __user *) maddr, disp))
+				return -EFAULT;
 			maddr += disp;
 		}
 
@@ -1164,19 +1162,17 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params,
 		if (prot & PROT_WRITE && excess1 > 0) {
 			kdebug("clear[%d] ad=%lx sz=%lx",
 			       loop, maddr + phdr->p_filesz, excess1);
-			ret = clear_user((void __user *) maddr + phdr->p_filesz,
-					 excess1);
-			if (ret)
-				return ret;
+			if (clear_user((void __user *) maddr + phdr->p_filesz,
+				       excess1))
+				return -EFAULT;
 		}
 
 #else
 		if (excess > 0) {
 			kdebug("clear[%d] ad=%lx sz=%lx",
 			       loop, maddr + phdr->p_filesz, excess);
-			ret = clear_user((void *) maddr + phdr->p_filesz, excess);
-			if (ret)
-				return ret;
+			if (clear_user((void *) maddr + phdr->p_filesz, excess))
+				return -EFAULT;
 		}
 #endif
 
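
The pattern behind these fdpic fixes: clear_user() returns the number of bytes it could not clear, not a negative errno, so propagating its return value upward reports a bogus positive "error". The corrected idiom (uaddr and len stand for any user pointer and size):

	/* non-zero means some bytes faulted; report -EFAULT, not the count */
	if (clear_user(uaddr, len))
		return -EFAULT;
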
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index 49566c1687d8..b6ab27ccf214 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -56,15 +56,22 @@
 #endif
 
 /*
- * User data (stack, data section and bss) needs to be aligned
- * for the same reasons as SLAB memory is, and to the same amount.
- * Avoid duplicating architecture specific code by using the same
- * macro as with SLAB allocation:
+ * User data (data section and bss) needs to be aligned.
+ * We pick 0x20 here because it is the max value elf2flt has always
+ * used in producing FLAT files, and because it seems to be large
+ * enough to make all the gcc alignment related tests happy.
+ */
+#define FLAT_DATA_ALIGN	(0x20)
+
+/*
+ * User data (stack) also needs to be aligned.
+ * Here we can be a bit looser than the data sections since this
+ * needs to only meet arch ABI requirements.
  */
 #ifdef ARCH_SLAB_MINALIGN
-#define FLAT_DATA_ALIGN	(ARCH_SLAB_MINALIGN)
+#define FLAT_STACK_ALIGN	(ARCH_SLAB_MINALIGN)
 #else
-#define FLAT_DATA_ALIGN	(sizeof(void *))
+#define FLAT_STACK_ALIGN	(sizeof(void *))
 #endif
 
 #define RELOC_FAILED 0xff00ff01	/* Relocation incorrect somewhere */
@@ -129,7 +136,7 @@ static unsigned long create_flat_tables(
 
 	sp = (unsigned long *)p;
 	sp -= (envc + argc + 2) + 1 + (flat_argvp_envp_on_stack() ? 2 : 0);
-	sp = (unsigned long *) ((unsigned long)sp & -FLAT_DATA_ALIGN);
+	sp = (unsigned long *) ((unsigned long)sp & -FLAT_STACK_ALIGN);
 	argv = sp + 1 + (flat_argvp_envp_on_stack() ? 2 : 0);
 	envp = argv + (argc + 1);
 
@@ -589,7 +596,7 @@ static int load_flat_file(struct linux_binprm * bprm,
 	if (IS_ERR_VALUE(result)) {
 		printk("Unable to read data+bss, errno %d\n", (int)-result);
 		do_munmap(current->mm, textpos, text_len);
-		do_munmap(current->mm, realdatastart, data_len + extra);
+		do_munmap(current->mm, realdatastart, len);
 		ret = result;
 		goto err;
 	}
@@ -876,7 +883,7 @@ static int load_flat_binary(struct linux_binprm * bprm, struct pt_regs * regs)
 	stack_len = TOP_OF_ARGS - bprm->p;             /* the strings */
 	stack_len += (bprm->argc + 1) * sizeof(char *); /* the argv array */
 	stack_len += (bprm->envc + 1) * sizeof(char *); /* the envp array */
-	stack_len += FLAT_DATA_ALIGN - 1;  /* reserve for upcoming alignment */
+	stack_len += FLAT_STACK_ALIGN - 1;  /* reserve for upcoming alignment */
 
 	res = load_flat_file(bprm, &libinfo, 0, &stack_len);
 	if (IS_ERR_VALUE(res))
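
On the alignment math used in create_flat_tables() above: masking with the negated alignment rounds an address down to an alignment boundary, so the stack pointer only ever shrinks to meet FLAT_STACK_ALIGN. A worked example, assuming FLAT_STACK_ALIGN is 8:

	/* -8 is ~7, so e.g. 0x1007 & ~7 == 0x1000: rounded down to 8 bytes */
	sp = (unsigned long *) ((unsigned long)sp & -FLAT_STACK_ALIGN);
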
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 26e5f5026620..99d6af811747 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -172,8 +172,9 @@ blkdev_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file->f_mapping->host;
 
-	return blockdev_direct_IO_no_locking(rw, iocb, inode, I_BDEV(inode),
-				iov, offset, nr_segs, blkdev_get_blocks, NULL);
+	return blockdev_direct_IO_no_locking_newtrunc(rw, iocb, inode,
+				I_BDEV(inode), iov, offset, nr_segs,
+				blkdev_get_blocks, NULL);
 }
 
 int __sync_blockdev(struct block_device *bdev, int wait)
@@ -309,8 +310,8 @@ static int blkdev_write_begin(struct file *file, struct address_space *mapping,
 			struct page **pagep, void **fsdata)
 {
 	*pagep = NULL;
-	return block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
-				blkdev_get_block);
+	return block_write_begin_newtrunc(file, mapping, pos, len, flags,
+				pagep, fsdata, blkdev_get_block);
 }
 
 static int blkdev_write_end(struct file *file, struct address_space *mapping,
@@ -358,12 +359,7 @@ static loff_t block_llseek(struct file *file, loff_t offset, int origin)
 	return retval;
 }
 
-/*
- * Filp is never NULL; the only case when ->fsync() is called with
- * NULL first argument is nfsd_sync_dir() and that's not a directory.
- */
-
-int blkdev_fsync(struct file *filp, struct dentry *dentry, int datasync)
+int blkdev_fsync(struct file *filp, int datasync)
 {
 	struct inode *bd_inode = filp->f_mapping->host;
 	struct block_device *bdev = I_BDEV(bd_inode);
@@ -710,8 +706,13 @@ retry:
  * @bdev is about to be opened exclusively.  Check @bdev can be opened
  * exclusively and mark that an exclusive open is in progress.  Each
  * successful call to this function must be matched with a call to
- * either bd_claim() or bd_abort_claiming().  If this function
- * succeeds, the matching bd_claim() is guaranteed to succeed.
+ * either bd_finish_claiming() or bd_abort_claiming() (which do not
+ * fail).
+ *
+ * This function is used to gain exclusive access to the block device
+ * without actually causing other exclusive open attempts to fail. It
+ * should be used when the open sequence itself requires exclusive
+ * access but may subsequently fail.
  *
  * CONTEXT:
  * Might sleep.
@@ -738,6 +739,7 @@ static struct block_device *bd_start_claiming(struct block_device *bdev,
 		return ERR_PTR(-ENXIO);
 
 	whole = bdget_disk(disk, 0);
+	module_put(disk->fops->owner);
 	put_disk(disk);
 	if (!whole)
 		return ERR_PTR(-ENOMEM);
@@ -786,15 +788,46 @@ static void bd_abort_claiming(struct block_device *whole, void *holder)
 	__bd_abort_claiming(whole, holder);	/* releases bdev_lock */
 }
 
+/* increment holders when we have a legitimate claim. requires bdev_lock */
+static void __bd_claim(struct block_device *bdev, struct block_device *whole,
+			void *holder)
+{
+	/* note that for a whole device bd_holders
+	 * will be incremented twice, and bd_holder will
+	 * be set to bd_claim before being set to holder
+	 */
+	whole->bd_holders++;
+	whole->bd_holder = bd_claim;
+	bdev->bd_holders++;
+	bdev->bd_holder = holder;
+}
+
+/**
+ * bd_finish_claiming - finish claiming a block device
+ * @bdev: block device of interest (passed to bd_start_claiming())
+ * @whole: whole block device returned by bd_start_claiming()
+ * @holder: holder trying to claim @bdev
+ *
+ * Finish a claiming block started by bd_start_claiming().
+ *
+ * CONTEXT:
+ * Grabs and releases bdev_lock.
+ */
+static void bd_finish_claiming(struct block_device *bdev,
+				struct block_device *whole, void *holder)
+{
+	spin_lock(&bdev_lock);
+	BUG_ON(!bd_may_claim(bdev, whole, holder));
+	__bd_claim(bdev, whole, holder);
+	__bd_abort_claiming(whole, holder); /* not actually an abort */
+}
+
 /**
  * bd_claim - claim a block device
  * @bdev: block device to claim
  * @holder: holder trying to claim @bdev
  *
- * Try to claim @bdev which must have been opened successfully.  This
- * function may be called with or without preceding
- * blk_start_claiming().  In the former case, this function is always
- * successful and terminates the claiming block.
+ * Try to claim @bdev which must have been opened successfully.
  *
  * CONTEXT:
  * Might sleep.
@@ -810,23 +843,10 @@ int bd_claim(struct block_device *bdev, void *holder)
 	might_sleep();
 
 	spin_lock(&bdev_lock);
-
 	res = bd_prepare_to_claim(bdev, whole, holder);
-	if (res == 0) {
-		/* note that for a whole device bd_holders
-		 * will be incremented twice, and bd_holder will
-		 * be set to bd_claim before being set to holder
-		 */
-		whole->bd_holders++;
-		whole->bd_holder = bd_claim;
-		bdev->bd_holders++;
-		bdev->bd_holder = holder;
-	}
-
-	if (whole->bd_claiming)
-		__bd_abort_claiming(whole, holder);	/* releases bdev_lock */
-	else
-		spin_unlock(&bdev_lock);
+	if (res == 0)
+		__bd_claim(bdev, whole, holder);
+	spin_unlock(&bdev_lock);
 
 	return res;
 }
@@ -1480,7 +1500,7 @@ static int blkdev_open(struct inode * inode, struct file * filp)
 
 	if (whole) {
 		if (res == 0)
-			BUG_ON(bd_claim(bdev, filp) != 0);
+			bd_finish_claiming(bdev, whole, filp);
 		else
 			bd_abort_claiming(whole, filp);
 	}
@@ -1716,7 +1736,7 @@ struct block_device *open_bdev_exclusive(const char *path, fmode_t mode, void *h
 	if ((mode & FMODE_WRITE) && bdev_read_only(bdev))
 		goto out_blkdev_put;
 
-	BUG_ON(bd_claim(bdev, holder) != 0);
+	bd_finish_claiming(bdev, whole, holder);
 	return bdev;
 
 out_blkdev_put:
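
Taken together, the block_dev.c hunks replace the old "bd_claim() after a successful open cannot fail" convention with an explicit three-step sequence. The shape of a claiming open, with error handling trimmed and open_device() standing in for the driver-specific open step (a sketch, not a hunk from this patch):

	whole = bd_start_claiming(bdev, holder);	/* may sleep or fail */
	if (IS_ERR(whole))
		return PTR_ERR(whole);

	res = open_device(bdev);		/* the step that may fail */
	if (res == 0)
		bd_finish_claiming(bdev, whole, holder);	/* cannot fail */
	else
		bd_abort_claiming(whole, holder);
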
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 8d432cd9d580..2222d161c7b6 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -60,6 +60,8 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
 	size = __btrfs_getxattr(inode, name, value, size);
 	if (size > 0) {
 		acl = posix_acl_from_xattr(value, size);
+		if (IS_ERR(acl))
+			return acl;
 		set_cached_acl(inode, type, acl);
 	}
 	kfree(value);
@@ -160,6 +162,12 @@ static int btrfs_xattr_acl_set(struct dentry *dentry, const char *name,
 	int ret;
 	struct posix_acl *acl = NULL;
 
+	if (!is_owner_or_cap(dentry->d_inode))
+		return -EPERM;
+
+	if (!IS_POSIXACL(dentry->d_inode))
+		return -EOPNOTSUPP;
+
 	if (value) {
 		acl = posix_acl_from_xattr(value, size);
 		if (acl == NULL) {
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 462859a30141..7ec14097fef1 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -377,6 +377,7 @@ again:
 		if (!list_empty(&worker->pending) ||
 		    !list_empty(&worker->prio_pending)) {
 			spin_unlock_irq(&worker->lock);
+			set_current_state(TASK_RUNNING);
 			goto again;
 		}
 
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 7a4dee199832..6ad63f17eca0 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -137,8 +137,8 @@ struct btrfs_inode {
 	 * of extent items we've reserved metadata for.
 	 */
 	spinlock_t accounting_lock;
+	atomic_t outstanding_extents;
 	int reserved_extents;
-	int outstanding_extents;
 
 	/*
 	 * ordered_data_close is set by truncate when a file that used
@@ -151,6 +151,7 @@ struct btrfs_inode {
 	 * of these.
 	 */
 	unsigned ordered_data_close:1;
+	unsigned orphan_meta_reserved:1;
 	unsigned dummy_inode:1;
 
 	/*
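
Making outstanding_extents an atomic_t lets it be adjusted without taking accounting_lock, while orphan_meta_reserved gets a dedicated bit alongside the existing flags. A sketch of the counter idiom this enables (hypothetical helper, assuming typical atomic usage; not a hunk from this patch):

	static void foo_note_reserved_extent(struct inode *inode)
	{
		/* safe without accounting_lock now that the field is atomic */
		atomic_inc(&BTRFS_I(inode)->outstanding_extents);
	}
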
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 6795a713b205..0d1d966b0fe4 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -280,7 +280,8 @@ int btrfs_block_can_be_shared(struct btrfs_root *root,
 static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
 				       struct btrfs_root *root,
 				       struct extent_buffer *buf,
-				       struct extent_buffer *cow)
+				       struct extent_buffer *cow,
+				       int *last_ref)
 {
 	u64 refs;
 	u64 owner;
@@ -366,6 +367,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
 			BUG_ON(ret);
 		}
 		clean_tree_block(trans, root, buf);
+		*last_ref = 1;
 	}
 	return 0;
 }
@@ -392,6 +394,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 	struct btrfs_disk_key disk_key;
 	struct extent_buffer *cow;
 	int level;
+	int last_ref = 0;
 	int unlock_orig = 0;
 	u64 parent_start;
 
@@ -442,7 +445,10 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 			    (unsigned long)btrfs_header_fsid(cow),
 			    BTRFS_FSID_SIZE);
 
-	update_ref_for_cow(trans, root, buf, cow);
+	update_ref_for_cow(trans, root, buf, cow, &last_ref);
+
+	if (root->ref_cows)
+		btrfs_reloc_cow_block(trans, root, buf, cow);
 
 	if (buf == root->node) {
 		WARN_ON(parent && parent != buf);
@@ -457,8 +463,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 		extent_buffer_get(cow);
 		spin_unlock(&root->node_lock);
 
-		btrfs_free_tree_block(trans, root, buf->start, buf->len,
-				parent_start, root->root_key.objectid, level);
+		btrfs_free_tree_block(trans, root, buf, parent_start,
+				      last_ref);
 		free_extent_buffer(buf);
 		add_root_to_dirty_list(root);
 	} else {
@@ -473,8 +479,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 		btrfs_set_node_ptr_generation(parent, parent_slot,
 					      trans->transid);
 		btrfs_mark_buffer_dirty(parent);
-		btrfs_free_tree_block(trans, root, buf->start, buf->len,
-				parent_start, root->root_key.objectid, level);
+		btrfs_free_tree_block(trans, root, buf, parent_start,
+				      last_ref);
 	}
 	if (unlock_orig)
 		btrfs_tree_unlock(buf);
@@ -949,6 +955,22 @@ int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
 	return bin_search(eb, key, level, slot);
 }
 
+static void root_add_used(struct btrfs_root *root, u32 size)
+{
+	spin_lock(&root->accounting_lock);
+	btrfs_set_root_used(&root->root_item,
+			    btrfs_root_used(&root->root_item) + size);
+	spin_unlock(&root->accounting_lock);
+}
+
+static void root_sub_used(struct btrfs_root *root, u32 size)
+{
+	spin_lock(&root->accounting_lock);
+	btrfs_set_root_used(&root->root_item,
+			    btrfs_root_used(&root->root_item) - size);
+	spin_unlock(&root->accounting_lock);
+}
+
 /* given a node and slot number, this reads the blocks it points to.  The
  * extent buffer is returned with a reference taken (but unlocked).
  * NULL is returned on error.
@@ -1019,7 +1041,11 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		btrfs_tree_lock(child);
 		btrfs_set_lock_blocking(child);
 		ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
-		BUG_ON(ret);
+		if (ret) {
+			btrfs_tree_unlock(child);
+			free_extent_buffer(child);
+			goto enospc;
+		}
 
 		spin_lock(&root->node_lock);
 		root->node = child;
@@ -1034,11 +1060,12 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		btrfs_tree_unlock(mid);
 		/* once for the path */
 		free_extent_buffer(mid);
-		ret = btrfs_free_tree_block(trans, root, mid->start, mid->len,
-					    0, root->root_key.objectid, level);
+
+		root_sub_used(root, mid->len);
+		btrfs_free_tree_block(trans, root, mid, 0, 1);
 		/* once for the root ptr */
 		free_extent_buffer(mid);
-		return ret;
+		return 0;
 	}
 	if (btrfs_header_nritems(mid) >
 	    BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
@@ -1088,23 +1115,16 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 		if (wret < 0 && wret != -ENOSPC)
 			ret = wret;
 		if (btrfs_header_nritems(right) == 0) {
-			u64 bytenr = right->start;
-			u32 blocksize = right->len;
-
 			clean_tree_block(trans, root, right);
 			btrfs_tree_unlock(right);
-			free_extent_buffer(right);
-			right = NULL;
 			wret = del_ptr(trans, root, path, level + 1, pslot +
 				       1);
 			if (wret)
 				ret = wret;
-			wret = btrfs_free_tree_block(trans, root,
-						     bytenr, blocksize, 0,
-						     root->root_key.objectid,
-						     level);
-			if (wret)
-				ret = wret;
+			root_sub_used(root, right->len);
+			btrfs_free_tree_block(trans, root, right, 0, 1);
+			free_extent_buffer(right);
+			right = NULL;
 		} else {
 			struct btrfs_disk_key right_key;
 			btrfs_node_key(right, &right_key, 0);
@@ -1136,21 +1156,15 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 			BUG_ON(wret == 1);
 		}
 		if (btrfs_header_nritems(mid) == 0) {
-			/* we've managed to empty the middle node, drop it */
-			u64 bytenr = mid->start;
-			u32 blocksize = mid->len;
-
 			clean_tree_block(trans, root, mid);
 			btrfs_tree_unlock(mid);
-			free_extent_buffer(mid);
-			mid = NULL;
 			wret = del_ptr(trans, root, path, level + 1, pslot);
 			if (wret)
 				ret = wret;
-			wret = btrfs_free_tree_block(trans, root, bytenr, blocksize,
-						     0, root->root_key.objectid, level);
-			if (wret)
-				ret = wret;
+			root_sub_used(root, mid->len);
+			btrfs_free_tree_block(trans, root, mid, 0, 1);
+			free_extent_buffer(mid);
+			mid = NULL;
 		} else {
 			/* update the parent key to reflect our changes */
 			struct btrfs_disk_key mid_key;
@@ -1590,7 +1604,7 @@ read_block_for_search(struct btrfs_trans_handle *trans,
 	btrfs_release_path(NULL, p);
 
 	ret = -EAGAIN;
-	tmp = read_tree_block(root, blocknr, blocksize, gen);
+	tmp = read_tree_block(root, blocknr, blocksize, 0);
 	if (tmp) {
 		/*
 		 * If the read above didn't mark this buffer up to date,
@@ -1740,7 +1754,6 @@ again:
 					  p->nodes[level + 1],
 					  p->slots[level + 1], &b);
 			if (err) {
-				free_extent_buffer(b);
 				ret = err;
 				goto done;
 			}
@@ -2076,6 +2089,8 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
 	if (IS_ERR(c))
 		return PTR_ERR(c);
 
+	root_add_used(root, root->nodesize);
+
 	memset_extent_buffer(c, 0, 0, sizeof(struct btrfs_header));
 	btrfs_set_header_nritems(c, 1);
 	btrfs_set_header_level(c, level);
@@ -2134,6 +2149,7 @@ static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root
 	int nritems;
 
 	BUG_ON(!path->nodes[level]);
+	btrfs_assert_tree_locked(path->nodes[level]);
 	lower = path->nodes[level];
 	nritems = btrfs_header_nritems(lower);
 	BUG_ON(slot > nritems);
@@ -2202,6 +2218,8 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
 	if (IS_ERR(split))
 		return PTR_ERR(split);
 
+	root_add_used(root, root->nodesize);
+
 	memset_extent_buffer(split, 0, 0, sizeof(struct btrfs_header));
 	btrfs_set_header_level(split, btrfs_header_level(c));
 	btrfs_set_header_bytenr(split, split->start);
@@ -2415,6 +2433,9 @@ static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
 
 	if (left_nritems)
 		btrfs_mark_buffer_dirty(left);
+	else
+		clean_tree_block(trans, root, left);
+
 	btrfs_mark_buffer_dirty(right);
 
 	btrfs_item_key(right, &disk_key, 0);
@@ -2660,6 +2681,8 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
 	btrfs_mark_buffer_dirty(left);
 	if (right_nritems)
 		btrfs_mark_buffer_dirty(right);
+	else
+		clean_tree_block(trans, root, right);
 
 	btrfs_item_key(right, &disk_key, 0);
 	wret = fixup_low_keys(trans, root, path, &disk_key, 1);
@@ -2669,8 +2692,6 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
 	/* then fixup the leaf pointer in the path */
 	if (path->slots[0] < push_items) {
 		path->slots[0] += old_left_nritems;
-		if (btrfs_header_nritems(path->nodes[0]) == 0)
-			clean_tree_block(trans, root, path->nodes[0]);
 		btrfs_tree_unlock(path->nodes[0]);
 		free_extent_buffer(path->nodes[0]);
 		path->nodes[0] = left;
@@ -2932,10 +2953,10 @@ again:
 	right = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
 					root->root_key.objectid,
 					&disk_key, 0, l->start, 0);
-	if (IS_ERR(right)) {
-		BUG_ON(1);
+	if (IS_ERR(right))
 		return PTR_ERR(right);
-	}
+
+	root_add_used(root, root->leafsize);
 
 	memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header));
 	btrfs_set_header_bytenr(right, right->start);
@@ -3054,7 +3075,8 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
 
 	btrfs_set_path_blocking(path);
 	ret = split_leaf(trans, root, &key, path, ins_len, 1);
-	BUG_ON(ret);
+	if (ret)
+		goto err;
 
 	path->keep_locks = 0;
 	btrfs_unlock_up_safe(path, 1);
@@ -3796,9 +3818,10 @@ static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
 	 */
 	btrfs_unlock_up_safe(path, 0);
 
-	ret = btrfs_free_tree_block(trans, root, leaf->start, leaf->len,
-				    0, root->root_key.objectid, 0);
-	return ret;
+	root_sub_used(root, leaf->len);
+
+	btrfs_free_tree_block(trans, root, leaf, 0, 1);
+	return 0;
 }
 /*
  * delete the item at the leaf level in path. If that empties
@@ -3865,6 +3888,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		if (leaf == root->node) {
 			btrfs_set_header_level(leaf, 0);
 		} else {
+			btrfs_set_path_blocking(path);
+			clean_tree_block(trans, root, leaf);
 			ret = btrfs_del_leaf(trans, root, path, leaf);
 			BUG_ON(ret);
 		}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 746a7248678e..29c20092847e 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -34,6 +34,7 @@
 
 struct btrfs_trans_handle;
 struct btrfs_transaction;
+struct btrfs_pending_snapshot;
 extern struct kmem_cache *btrfs_trans_handle_cachep;
 extern struct kmem_cache *btrfs_transaction_cachep;
 extern struct kmem_cache *btrfs_bit_radix_cachep;
@@ -663,6 +664,7 @@ struct btrfs_csum_item {
 #define BTRFS_BLOCK_GROUP_RAID1    (1 << 4)
 #define BTRFS_BLOCK_GROUP_DUP	   (1 << 5)
 #define BTRFS_BLOCK_GROUP_RAID10   (1 << 6)
+#define BTRFS_NR_RAID_TYPES	   5
 
 struct btrfs_block_group_item {
 	__le64 used;
@@ -674,42 +676,46 @@ struct btrfs_space_info {
 	u64 flags;
 
 	u64 total_bytes;	/* total bytes in the space */
-	u64 bytes_used;		/* total bytes used on disk */
+	u64 bytes_used;		/* total bytes used,
+				   this does't take mirrors into account */
 	u64 bytes_pinned;	/* total bytes pinned, will be freed when the
 				   transaction finishes */
 	u64 bytes_reserved;	/* total bytes the allocator has reserved for
 				   current allocations */
 	u64 bytes_readonly;	/* total bytes that are read only */
-	u64 bytes_super;	/* total bytes reserved for the super blocks */
-	u64 bytes_root;		/* the number of bytes needed to commit a
-				   transaction */
+
 	u64 bytes_may_use;	/* number of bytes that may be used for
 				   delalloc/allocations */
-	u64 bytes_delalloc;	/* number of bytes currently reserved for
-				   delayed allocation */
+	u64 disk_used;		/* total bytes used on disk */
 
 	int full;		/* indicates that we cannot allocate any more
 				   chunks for this space */
 	int force_alloc;	/* set if we need to force a chunk alloc for
 				   this space */
-	int force_delalloc;	/* make people start doing filemap_flush until
-				   we're under a threshold */
 
 	struct list_head list;
 
-	/* for controlling how we free up space for allocations */
-	wait_queue_head_t allocate_wait;
-	wait_queue_head_t flush_wait;
-	int allocating_chunk;
-	int flushing;
-
 	/* for block groups in our same type */
-	struct list_head block_groups;
+	struct list_head block_groups[BTRFS_NR_RAID_TYPES];
 	spinlock_t lock;
 	struct rw_semaphore groups_sem;
 	atomic_t caching_threads;
 };
 
+struct btrfs_block_rsv {
+	u64 size;
+	u64 reserved;
+	u64 freed[2];
+	struct btrfs_space_info *space_info;
+	struct list_head list;
+	spinlock_t lock;
+	atomic_t usage;
+	unsigned int priority:8;
+	unsigned int durable:1;
+	unsigned int refill_used:1;
+	unsigned int full:1;
+};
+
 /*
  * free clusters are used to claim free space in relatively large chunks,
  * allowing us to do less seeky writes. They are used for all metadata
@@ -760,6 +766,7 @@ struct btrfs_block_group_cache {
 	spinlock_t lock;
 	u64 pinned;
 	u64 reserved;
+	u64 reserved_pinned;
 	u64 bytes_super;
 	u64 flags;
 	u64 sectorsize;
@@ -825,6 +832,22 @@ struct btrfs_fs_info {
 	/* logical->physical extent mapping */
 	struct btrfs_mapping_tree mapping_tree;
 
+	/* block reservation for extent, checksum and root tree */
+	struct btrfs_block_rsv global_block_rsv;
+	/* block reservation for delay allocation */
+	struct btrfs_block_rsv delalloc_block_rsv;
+	/* block reservation for metadata operations */
+	struct btrfs_block_rsv trans_block_rsv;
+	/* block reservation for chunk tree */
+	struct btrfs_block_rsv chunk_block_rsv;
+
+	struct btrfs_block_rsv empty_block_rsv;
+
+	/* list of block reservations that cross multiple transactions */
+	struct list_head durable_block_rsv_list;
+
+	struct mutex durable_block_rsv_mutex;
+
 	u64 generation;
 	u64 last_trans_committed;
 
@@ -927,7 +950,6 @@ struct btrfs_fs_info {
 	struct btrfs_workers endio_meta_write_workers;
 	struct btrfs_workers endio_write_workers;
 	struct btrfs_workers submit_workers;
-	struct btrfs_workers enospc_workers;
 	/*
 	 * fixup workers take dirty pages that didn't properly go through
 	 * the cow mechanism and make them safe to write. It happens
@@ -943,6 +965,7 @@ struct btrfs_fs_info {
 	int do_barriers;
 	int closing;
 	int log_root_recovering;
+	int enospc_unlink;
 
 	u64 total_pinned;
 
@@ -1012,6 +1035,9 @@ struct btrfs_root {
 	struct completion kobj_unregister;
 	struct mutex objectid_mutex;
 
+	spinlock_t accounting_lock;
+	struct btrfs_block_rsv *block_rsv;
+
 	struct mutex log_mutex;
 	wait_queue_head_t log_writer_wait;
 	wait_queue_head_t log_commit_wait[2];
@@ -1043,7 +1069,6 @@ struct btrfs_root {
 	int ref_cows;
 	int track_dirty;
 	int in_radix;
-	int clean_orphans;
 
 	u64 defrag_trans_start;
 	struct btrfs_key defrag_progress;
@@ -1057,8 +1082,11 @@ struct btrfs_root {
 
 	struct list_head root_list;
 
-	spinlock_t list_lock;
+	spinlock_t orphan_lock;
 	struct list_head orphan_list;
+	struct btrfs_block_rsv *orphan_block_rsv;
+	int orphan_item_inserted;
+	int orphan_cleanup_state;
 
 	spinlock_t inode_lock;
 	/* red-black tree that keeps track of in-memory inodes */
@@ -1965,6 +1993,9 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root, unsigned long count);
 int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
+int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
+			     struct btrfs_root *root, u64 bytenr,
+			     u64 num_bytes, u64 *refs, u64 *flags);
 int btrfs_pin_extent(struct btrfs_root *root,
 		     u64 bytenr, u64 num, int reserved);
 int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
@@ -1984,10 +2015,10 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 					    u64 parent, u64 root_objectid,
 					    struct btrfs_disk_key *key, int level,
 					    u64 hint, u64 empty_size);
-int btrfs_free_tree_block(struct btrfs_trans_handle *trans,
+void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root,
-			  u64 bytenr, u32 blocksize,
-			  u64 parent, u64 root_objectid, int level);
+			  struct extent_buffer *buf,
+			  u64 parent, int last_ref);
 struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
 						struct btrfs_root *root,
 						u64 bytenr, u32 blocksize,
@@ -2041,27 +2072,49 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 				   u64 size);
 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root, u64 group_start);
-int btrfs_prepare_block_group_relocation(struct btrfs_root *root,
-				struct btrfs_block_group_cache *group);
-
 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
 void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde);
 void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
-
-int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items);
-int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items);
-int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
-					  struct inode *inode, int num_items);
-int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root,
-					struct inode *inode, int num_items);
-int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
-				u64 bytes);
-void btrfs_free_reserved_data_space(struct btrfs_root *root,
-				    struct inode *inode, u64 bytes);
-void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
-				  u64 bytes);
-void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
-			       u64 bytes);
+int btrfs_check_data_free_space(struct inode *inode, u64 bytes);
+void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
+int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *root,
+				 int num_items, int *retries);
+void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
+				  struct btrfs_root *root);
+int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
+				  struct inode *inode);
+void btrfs_orphan_release_metadata(struct inode *inode);
+int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
+				struct btrfs_pending_snapshot *pending);
+int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes);
+void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes);
+int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes);
+void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes);
+void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv);
+struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root);
+void btrfs_free_block_rsv(struct btrfs_root *root,
+			  struct btrfs_block_rsv *rsv);
+void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
+				 struct btrfs_block_rsv *rsv);
+int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
+			struct btrfs_root *root,
+			struct btrfs_block_rsv *block_rsv,
+			u64 num_bytes, int *retries);
+int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
+			  struct btrfs_root *root,
+			  struct btrfs_block_rsv *block_rsv,
+			  u64 min_reserved, int min_factor);
+int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
+			    struct btrfs_block_rsv *dst_rsv,
+			    u64 num_bytes);
+void btrfs_block_rsv_release(struct btrfs_root *root,
+			     struct btrfs_block_rsv *block_rsv,
+			     u64 num_bytes);
+int btrfs_set_block_group_ro(struct btrfs_root *root,
+			     struct btrfs_block_group_cache *cache);
+int btrfs_set_block_group_rw(struct btrfs_root *root,
+			     struct btrfs_block_group_cache *cache);
 /* ctree.c */
 int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
 		     int level, int *slot);
@@ -2152,7 +2205,8 @@ static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans,
 int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path);
 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path);
 int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf);
-int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref);
+int btrfs_drop_snapshot(struct btrfs_root *root,
+			struct btrfs_block_rsv *block_rsv, int update_ref);
 int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root,
 		       struct extent_buffer *node,
@@ -2245,6 +2299,12 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root,
 			   const char *name, int name_len,
 			   u64 inode_objectid, u64 ref_objectid, u64 *index);
+struct btrfs_inode_ref *
+btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans,
+			struct btrfs_root *root,
+			struct btrfs_path *path,
+			const char *name, int name_len,
+			u64 inode_objectid, u64 ref_objectid, int mod);
 int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
 			     struct btrfs_path *path, u64 objectid);
@@ -2257,6 +2317,8 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root, u64 bytenr, u64 len);
 int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
 			  struct bio *bio, u32 *dst);
+int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
+			      struct bio *bio, u64 logical_offset, u32 *dst);
 int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
 			     u64 objectid, u64 pos,
@@ -2311,6 +2373,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 			       u32 min_type);
 
 int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
+int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput);
 int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
 			      struct extent_state **cached_state);
 int btrfs_writepages(struct address_space *mapping,
@@ -2349,10 +2412,20 @@ int btrfs_update_inode(struct btrfs_trans_handle *trans,
 int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode);
 int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
 void btrfs_orphan_cleanup(struct btrfs_root *root);
+void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
+				struct btrfs_pending_snapshot *pending,
+				u64 *bytes_to_reserve);
+void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
+				struct btrfs_pending_snapshot *pending);
+void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
+			      struct btrfs_root *root);
 int btrfs_cont_expand(struct inode *inode, loff_t size);
 int btrfs_invalidate_inodes(struct btrfs_root *root);
 void btrfs_add_delayed_iput(struct inode *inode);
 void btrfs_run_delayed_iputs(struct btrfs_root *root);
+int btrfs_prealloc_file_range(struct inode *inode, int mode,
+			      u64 start, u64 num_bytes, u64 min_size,
+			      loff_t actual_len, u64 *alloc_hint);
 extern const struct dentry_operations btrfs_dentry_operations;
 
 /* ioctl.c */
@@ -2361,7 +2434,7 @@ void btrfs_update_iflags(struct inode *inode);
 void btrfs_inherit_iflags(struct inode *inode, struct inode *dir);
 
 /* file.c */
-int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync);
+int btrfs_sync_file(struct file *file, int datasync);
 int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 			    int skip_pinned);
 int btrfs_check_file(struct btrfs_root *root, struct inode *inode);
@@ -2409,4 +2482,12 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *root);
 int btrfs_recover_relocation(struct btrfs_root *root);
 int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len);
+void btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *root, struct extent_buffer *buf,
+			   struct extent_buffer *cow);
+void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans,
+			      struct btrfs_pending_snapshot *pending,
+			      u64 *bytes_to_reserve);
+void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
+			       struct btrfs_pending_snapshot *pending);
 #endif
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 902ce507c4e3..e807b143b857 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -319,107 +319,6 @@ out:
 }
 
 /*
- * helper function to lookup reference count and flags of extent.
- *
- * the head node for delayed ref is used to store the sum of all the
- * reference count modifications queued up in the rbtree. the head
- * node may also store the extent flags to set. This way you can check
- * to see what the reference count and extent flags would be if all of
- * the delayed refs are not processed.
- */
-int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
-			     struct btrfs_root *root, u64 bytenr,
-			     u64 num_bytes, u64 *refs, u64 *flags)
-{
-	struct btrfs_delayed_ref_node *ref;
-	struct btrfs_delayed_ref_head *head;
-	struct btrfs_delayed_ref_root *delayed_refs;
-	struct btrfs_path *path;
-	struct btrfs_extent_item *ei;
-	struct extent_buffer *leaf;
-	struct btrfs_key key;
-	u32 item_size;
-	u64 num_refs;
-	u64 extent_flags;
-	int ret;
-
-	path = btrfs_alloc_path();
-	if (!path)
-		return -ENOMEM;
-
-	key.objectid = bytenr;
-	key.type = BTRFS_EXTENT_ITEM_KEY;
-	key.offset = num_bytes;
-	delayed_refs = &trans->transaction->delayed_refs;
-again:
-	ret = btrfs_search_slot(trans, root->fs_info->extent_root,
-				&key, path, 0, 0);
-	if (ret < 0)
-		goto out;
-
-	if (ret == 0) {
-		leaf = path->nodes[0];
-		item_size = btrfs_item_size_nr(leaf, path->slots[0]);
-		if (item_size >= sizeof(*ei)) {
-			ei = btrfs_item_ptr(leaf, path->slots[0],
-					    struct btrfs_extent_item);
-			num_refs = btrfs_extent_refs(leaf, ei);
-			extent_flags = btrfs_extent_flags(leaf, ei);
-		} else {
-#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
-			struct btrfs_extent_item_v0 *ei0;
-			BUG_ON(item_size != sizeof(*ei0));
-			ei0 = btrfs_item_ptr(leaf, path->slots[0],
-					     struct btrfs_extent_item_v0);
-			num_refs = btrfs_extent_refs_v0(leaf, ei0);
-			/* FIXME: this isn't correct for data */
-			extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
-#else
-			BUG();
-#endif
-		}
-		BUG_ON(num_refs == 0);
-	} else {
-		num_refs = 0;
-		extent_flags = 0;
-		ret = 0;
-	}
-
-	spin_lock(&delayed_refs->lock);
-	ref = find_ref_head(&delayed_refs->root, bytenr, NULL);
-	if (ref) {
-		head = btrfs_delayed_node_to_head(ref);
-		if (!mutex_trylock(&head->mutex)) {
-			atomic_inc(&ref->refs);
-			spin_unlock(&delayed_refs->lock);
-
-			btrfs_release_path(root->fs_info->extent_root, path);
-
-			mutex_lock(&head->mutex);
-			mutex_unlock(&head->mutex);
-			btrfs_put_delayed_ref(ref);
-			goto again;
-		}
-		if (head->extent_op && head->extent_op->update_flags)
-			extent_flags |= head->extent_op->flags_to_set;
-		else
-			BUG_ON(num_refs == 0);
-
-		num_refs += ref->ref_mod;
-		mutex_unlock(&head->mutex);
-	}
-	WARN_ON(num_refs == 0);
-	if (refs)
-		*refs = num_refs;
-	if (flags)
-		*flags = extent_flags;
-out:
-	spin_unlock(&delayed_refs->lock);
-	btrfs_free_path(path);
-	return ret;
-}
-
-/*
  * helper function to update an extent delayed ref in the
  * rbtree. existing and update must both have the same
  * bytenr and parent
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index f6fc67ddad36..50e3cf92fbda 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -167,9 +167,6 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
 struct btrfs_delayed_ref_head *
 btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr);
 int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr);
-int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
-			     struct btrfs_root *root, u64 bytenr,
-			     u64 num_bytes, u64 *refs, u64 *flags);
 int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
 			  u64 bytenr, u64 num_bytes, u64 orig_parent,
 			  u64 parent, u64 orig_ref_root, u64 ref_root,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index feca04197d02..34f7c375567e 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -74,6 +74,11 @@ struct async_submit_bio {
 	int rw;
 	int mirror_num;
 	unsigned long bio_flags;
+	/*
+	 * bio_offset is optional, can be used if the pages in the bio
+	 * can't tell us where in the file the bio should go
+	 */
+	u64 bio_offset;
 	struct btrfs_work work;
 };
 
@@ -534,7 +539,8 @@ static void run_one_async_start(struct btrfs_work *work)
 	async = container_of(work, struct async_submit_bio, work);
 	fs_info = BTRFS_I(async->inode)->root->fs_info;
 	async->submit_bio_start(async->inode, async->rw, async->bio,
-			       async->mirror_num, async->bio_flags);
+			       async->mirror_num, async->bio_flags,
+			       async->bio_offset);
 }
 
 static void run_one_async_done(struct btrfs_work *work)
@@ -556,7 +562,8 @@ static void run_one_async_done(struct btrfs_work *work)
 	wake_up(&fs_info->async_submit_wait);
 
 	async->submit_bio_done(async->inode, async->rw, async->bio,
-			       async->mirror_num, async->bio_flags);
+			       async->mirror_num, async->bio_flags,
+			       async->bio_offset);
 }
 
 static void run_one_async_free(struct btrfs_work *work)
@@ -570,6 +577,7 @@ static void run_one_async_free(struct btrfs_work *work)
 int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 			int rw, struct bio *bio, int mirror_num,
 			unsigned long bio_flags,
+			u64 bio_offset,
 			extent_submit_bio_hook_t *submit_bio_start,
 			extent_submit_bio_hook_t *submit_bio_done)
 {
@@ -592,6 +600,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 
 	async->work.flags = 0;
 	async->bio_flags = bio_flags;
+	async->bio_offset = bio_offset;
 
 	atomic_inc(&fs_info->nr_async_submits);
 
@@ -627,7 +636,8 @@ static int btree_csum_one_bio(struct bio *bio)
 
 static int __btree_submit_bio_start(struct inode *inode, int rw,
 				    struct bio *bio, int mirror_num,
-				    unsigned long bio_flags)
+				    unsigned long bio_flags,
+				    u64 bio_offset)
 {
 	/*
 	 * when we're called for a write, we're already in the async
@@ -638,7 +648,8 @@ static int __btree_submit_bio_start(struct inode *inode, int rw,
 }
 
 static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
-				 int mirror_num, unsigned long bio_flags)
+				 int mirror_num, unsigned long bio_flags,
+				 u64 bio_offset)
 {
 	/*
 	 * when we're called for a write, we're already in the async
@@ -648,7 +659,8 @@ static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
 }
 
 static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
-				 int mirror_num, unsigned long bio_flags)
+				 int mirror_num, unsigned long bio_flags,
+				 u64 bio_offset)
 {
 	int ret;
 
@@ -671,6 +683,7 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 	 */
 	return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
 				   inode, rw, bio, mirror_num, 0,
+				   bio_offset,
 				   __btree_submit_bio_start,
 				   __btree_submit_bio_done);
 }
@@ -894,7 +907,8 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 	root->ref_cows = 0;
 	root->track_dirty = 0;
 	root->in_radix = 0;
-	root->clean_orphans = 0;
+	root->orphan_item_inserted = 0;
+	root->orphan_cleanup_state = 0;
 
 	root->fs_info = fs_info;
 	root->objectid = objectid;
@@ -903,13 +917,16 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
 	root->name = NULL;
 	root->in_sysfs = 0;
 	root->inode_tree = RB_ROOT;
+	root->block_rsv = NULL;
+	root->orphan_block_rsv = NULL;
 
 	INIT_LIST_HEAD(&root->dirty_list);
 	INIT_LIST_HEAD(&root->orphan_list);
 	INIT_LIST_HEAD(&root->root_list);
 	spin_lock_init(&root->node_lock);
-	spin_lock_init(&root->list_lock);
+	spin_lock_init(&root->orphan_lock);
 	spin_lock_init(&root->inode_lock);
+	spin_lock_init(&root->accounting_lock);
 	mutex_init(&root->objectid_mutex);
 	mutex_init(&root->log_mutex);
 	init_waitqueue_head(&root->log_writer_wait);
@@ -968,42 +985,6 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
 	return 0;
 }
 
-int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
-			     struct btrfs_fs_info *fs_info)
-{
-	struct extent_buffer *eb;
-	struct btrfs_root *log_root_tree = fs_info->log_root_tree;
-	u64 start = 0;
-	u64 end = 0;
-	int ret;
-
-	if (!log_root_tree)
-		return 0;
-
-	while (1) {
-		ret = find_first_extent_bit(&log_root_tree->dirty_log_pages,
-				0, &start, &end, EXTENT_DIRTY | EXTENT_NEW);
-		if (ret)
-			break;
-
-		clear_extent_bits(&log_root_tree->dirty_log_pages, start, end,
-				  EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS);
-	}
-	eb = fs_info->log_root_tree->node;
-
-	WARN_ON(btrfs_header_level(eb) != 0);
-	WARN_ON(btrfs_header_nritems(eb) != 0);
-
-	ret = btrfs_free_reserved_extent(fs_info->tree_root,
-					 eb->start, eb->len);
-	BUG_ON(ret);
-
-	free_extent_buffer(eb);
-	kfree(fs_info->log_root_tree);
-	fs_info->log_root_tree = NULL;
-	return 0;
-}
-
 static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
 					 struct btrfs_fs_info *fs_info)
 {
@@ -1191,19 +1172,23 @@ again:
 	if (root)
 		return root;
 
-	ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid);
-	if (ret == 0)
-		ret = -ENOENT;
-	if (ret < 0)
-		return ERR_PTR(ret);
-
 	root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location);
 	if (IS_ERR(root))
 		return root;
 
-	WARN_ON(btrfs_root_refs(&root->root_item) == 0);
 	set_anon_super(&root->anon_super, NULL);
 
+	if (btrfs_root_refs(&root->root_item) == 0) {
+		ret = -ENOENT;
+		goto fail;
+	}
+
+	ret = btrfs_find_orphan_item(fs_info->tree_root, location->objectid);
+	if (ret < 0)
+		goto fail;
+	if (ret == 0)
+		root->orphan_item_inserted = 1;
+
 	ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
 	if (ret)
 		goto fail;
@@ -1212,10 +1197,9 @@ again:
 	ret = radix_tree_insert(&fs_info->fs_roots_radix,
 				(unsigned long)root->root_key.objectid,
 				root);
-	if (ret == 0) {
+	if (ret == 0)
 		root->in_radix = 1;
-		root->clean_orphans = 1;
-	}
+
 	spin_unlock(&fs_info->fs_roots_radix_lock);
 	radix_tree_preload_end();
 	if (ret) {
@@ -1461,10 +1445,6 @@ static int cleaner_kthread(void *arg)
 	struct btrfs_root *root = arg;
 
 	do {
-		smp_mb();
-		if (root->fs_info->closing)
-			break;
-
 		vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
 
 		if (!(root->fs_info->sb->s_flags & MS_RDONLY) &&
@@ -1477,11 +1457,9 @@ static int cleaner_kthread(void *arg)
 		if (freezing(current)) {
 			refrigerator();
 		} else {
-			smp_mb();
-			if (root->fs_info->closing)
-				break;
 			set_current_state(TASK_INTERRUPTIBLE);
-			schedule();
+			if (!kthread_should_stop())
+				schedule();
 			__set_current_state(TASK_RUNNING);
 		}
 	} while (!kthread_should_stop());
@@ -1493,36 +1471,40 @@ static int transaction_kthread(void *arg)
 	struct btrfs_root *root = arg;
 	struct btrfs_trans_handle *trans;
 	struct btrfs_transaction *cur;
+	u64 transid;
 	unsigned long now;
 	unsigned long delay;
 	int ret;
 
 	do {
-		smp_mb();
-		if (root->fs_info->closing)
-			break;
-
 		delay = HZ * 30;
 		vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
 		mutex_lock(&root->fs_info->transaction_kthread_mutex);
 
-		mutex_lock(&root->fs_info->trans_mutex);
+		spin_lock(&root->fs_info->new_trans_lock);
 		cur = root->fs_info->running_transaction;
 		if (!cur) {
-			mutex_unlock(&root->fs_info->trans_mutex);
+			spin_unlock(&root->fs_info->new_trans_lock);
 			goto sleep;
 		}
 
 		now = get_seconds();
-		if (now < cur->start_time || now - cur->start_time < 30) {
-			mutex_unlock(&root->fs_info->trans_mutex);
+		if (!cur->blocked &&
+		    (now < cur->start_time || now - cur->start_time < 30)) {
+			spin_unlock(&root->fs_info->new_trans_lock);
 			delay = HZ * 5;
 			goto sleep;
 		}
-		mutex_unlock(&root->fs_info->trans_mutex);
-		trans = btrfs_start_transaction(root, 1);
-		ret = btrfs_commit_transaction(trans, root);
+		transid = cur->transid;
+		spin_unlock(&root->fs_info->new_trans_lock);
 
+		trans = btrfs_join_transaction(root, 1);
+		if (transid == trans->transid) {
+			ret = btrfs_commit_transaction(trans, root);
+			BUG_ON(ret);
+		} else {
+			btrfs_end_transaction(trans, root);
+		}
 sleep:
 		wake_up_process(root->fs_info->cleaner_kthread);
 		mutex_unlock(&root->fs_info->transaction_kthread_mutex);
@@ -1530,10 +1512,10 @@ sleep:
 		if (freezing(current)) {
 			refrigerator();
 		} else {
-			if (root->fs_info->closing)
-				break;
 			set_current_state(TASK_INTERRUPTIBLE);
-			schedule_timeout(delay);
+			if (!kthread_should_stop() &&
+			    !btrfs_transaction_blocked(root->fs_info))
+				schedule_timeout(delay);
 			__set_current_state(TASK_RUNNING);
 		}
 	} while (!kthread_should_stop());
@@ -1620,6 +1602,13 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
 	INIT_LIST_HEAD(&fs_info->space_info);
 	btrfs_mapping_init(&fs_info->mapping_tree);
+	btrfs_init_block_rsv(&fs_info->global_block_rsv);
+	btrfs_init_block_rsv(&fs_info->delalloc_block_rsv);
+	btrfs_init_block_rsv(&fs_info->trans_block_rsv);
+	btrfs_init_block_rsv(&fs_info->chunk_block_rsv);
+	btrfs_init_block_rsv(&fs_info->empty_block_rsv);
+	INIT_LIST_HEAD(&fs_info->durable_block_rsv_list);
+	mutex_init(&fs_info->durable_block_rsv_mutex);
 	atomic_set(&fs_info->nr_async_submits, 0);
 	atomic_set(&fs_info->async_delalloc_pages, 0);
 	atomic_set(&fs_info->async_submit_draining, 0);
@@ -1759,9 +1748,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 			   min_t(u64, fs_devices->num_devices,
 			   fs_info->thread_pool_size),
 			   &fs_info->generic_worker);
-	btrfs_init_workers(&fs_info->enospc_workers, "enospc",
-			   fs_info->thread_pool_size,
-			   &fs_info->generic_worker);
 
 	/* a higher idle thresh on the submit workers makes it much more
 	 * likely that bios will be send down in a sane order to the
@@ -1809,7 +1795,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	btrfs_start_workers(&fs_info->endio_meta_workers, 1);
 	btrfs_start_workers(&fs_info->endio_meta_write_workers, 1);
 	btrfs_start_workers(&fs_info->endio_write_workers, 1);
-	btrfs_start_workers(&fs_info->enospc_workers, 1);
 
 	fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
 	fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
@@ -1912,17 +1897,18 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 
 	csum_root->track_dirty = 1;
 
+	fs_info->generation = generation;
+	fs_info->last_trans_committed = generation;
+	fs_info->data_alloc_profile = (u64)-1;
+	fs_info->metadata_alloc_profile = (u64)-1;
+	fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
+
 	ret = btrfs_read_block_groups(extent_root);
 	if (ret) {
 		printk(KERN_ERR "Failed to read block groups: %d\n", ret);
 		goto fail_block_groups;
 	}
 
-	fs_info->generation = generation;
-	fs_info->last_trans_committed = generation;
-	fs_info->data_alloc_profile = (u64)-1;
-	fs_info->metadata_alloc_profile = (u64)-1;
-	fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
 	fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root,
 					       "btrfs-cleaner");
 	if (IS_ERR(fs_info->cleaner_kthread))
@@ -1955,8 +1941,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 					      btrfs_level_size(tree_root,
 					      btrfs_super_log_root_level(disk_super));
 
-		log_tree_root = kzalloc(sizeof(struct btrfs_root),
-					GFP_NOFS);
+		log_tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
+		if (!log_tree_root) {
+			err = -ENOMEM;
+			goto fail_trans_kthread;
+		}
 
 		__setup_root(nodesize, leafsize, sectorsize, stripesize,
 			     log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID);
@@ -1977,6 +1966,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 		BUG_ON(ret);
 
 		if (!(sb->s_flags & MS_RDONLY)) {
+			ret = btrfs_cleanup_fs_roots(fs_info);
+			BUG_ON(ret);
+
 			ret = btrfs_recover_relocation(tree_root);
 			if (ret < 0) {
 				printk(KERN_WARNING
@@ -1993,6 +1985,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location);
 	if (!fs_info->fs_root)
 		goto fail_trans_kthread;
+	if (IS_ERR(fs_info->fs_root)) {
+		err = PTR_ERR(fs_info->fs_root);
+		goto fail_trans_kthread;
+	}
 
 	if (!(sb->s_flags & MS_RDONLY)) {
 		down_read(&fs_info->cleanup_work_sem);
@@ -2040,7 +2036,6 @@ fail_sb_buffer:
 	btrfs_stop_workers(&fs_info->endio_meta_write_workers);
 	btrfs_stop_workers(&fs_info->endio_write_workers);
 	btrfs_stop_workers(&fs_info->submit_workers);
-	btrfs_stop_workers(&fs_info->enospc_workers);
 fail_iput:
 	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
 	iput(fs_info->btree_inode);
@@ -2405,11 +2400,11 @@ int btrfs_commit_super(struct btrfs_root *root)
 	down_write(&root->fs_info->cleanup_work_sem);
 	up_write(&root->fs_info->cleanup_work_sem);
 
-	trans = btrfs_start_transaction(root, 1);
+	trans = btrfs_join_transaction(root, 1);
 	ret = btrfs_commit_transaction(trans, root);
 	BUG_ON(ret);
 	/* run commit again to drop the original snapshot */
-	trans = btrfs_start_transaction(root, 1);
+	trans = btrfs_join_transaction(root, 1);
 	btrfs_commit_transaction(trans, root);
 	ret = btrfs_write_and_wait_transaction(NULL, root);
 	BUG_ON(ret);
@@ -2426,15 +2421,15 @@ int close_ctree(struct btrfs_root *root)
 	fs_info->closing = 1;
 	smp_mb();
 
-	kthread_stop(root->fs_info->transaction_kthread);
-	kthread_stop(root->fs_info->cleaner_kthread);
-
 	if (!(fs_info->sb->s_flags & MS_RDONLY)) {
 		ret = btrfs_commit_super(root);
 		if (ret)
 			printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
 	}
 
+	kthread_stop(root->fs_info->transaction_kthread);
+	kthread_stop(root->fs_info->cleaner_kthread);
+
 	fs_info->closing = 2;
 	smp_mb();
 
@@ -2473,7 +2468,6 @@ int close_ctree(struct btrfs_root *root)
 	btrfs_stop_workers(&fs_info->endio_meta_write_workers);
 	btrfs_stop_workers(&fs_info->endio_write_workers);
 	btrfs_stop_workers(&fs_info->submit_workers);
-	btrfs_stop_workers(&fs_info->enospc_workers);
 
 	btrfs_close_devices(fs_info->fs_devices);
 	btrfs_mapping_tree_free(&fs_info->mapping_tree);
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index c958ecbc1916..88e825a0bf21 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -87,7 +87,7 @@ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
 			int metadata);
 int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
 			int rw, struct bio *bio, int mirror_num,
-			unsigned long bio_flags,
+			unsigned long bio_flags, u64 bio_offset,
 			extent_submit_bio_hook_t *submit_bio_start,
 			extent_submit_bio_hook_t *submit_bio_done);
 
@@ -95,8 +95,6 @@ int btrfs_congested_async(struct btrfs_fs_info *info, int iodone);
 unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info);
 int btrfs_write_tree_block(struct extent_buffer *buf);
 int btrfs_wait_tree_block_writeback(struct extent_buffer *buf);
-int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
-			     struct btrfs_fs_info *fs_info);
 int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
 			     struct btrfs_fs_info *fs_info);
 int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index c6a4f459ad76..32d094002a57 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -35,10 +35,9 @@
 
 static int update_block_group(struct btrfs_trans_handle *trans,
 			      struct btrfs_root *root,
-			      u64 bytenr, u64 num_bytes, int alloc,
-			      int mark_free);
-static int update_reserved_extents(struct btrfs_block_group_cache *cache,
-				   u64 num_bytes, int reserve);
+			      u64 bytenr, u64 num_bytes, int alloc);
+static int update_reserved_bytes(struct btrfs_block_group_cache *cache,
+				 u64 num_bytes, int reserve, int sinfo);
 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *root,
 			       u64 bytenr, u64 num_bytes, u64 parent,
@@ -61,12 +60,6 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *extent_root, u64 alloc_bytes,
 			  u64 flags, int force);
-static int pin_down_bytes(struct btrfs_trans_handle *trans,
-			  struct btrfs_root *root,
-			  struct btrfs_path *path,
-			  u64 bytenr, u64 num_bytes,
-			  int is_data, int reserved,
-			  struct extent_buffer **must_clean);
 static int find_next_key(struct btrfs_path *path, int level,
 			 struct btrfs_key *key);
 static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
@@ -91,8 +84,12 @@ void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
 
 void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
 {
-	if (atomic_dec_and_test(&cache->count))
+	if (atomic_dec_and_test(&cache->count)) {
+		WARN_ON(cache->pinned > 0);
+		WARN_ON(cache->reserved > 0);
+		WARN_ON(cache->reserved_pinned > 0);
 		kfree(cache);
+	}
 }
 
 /*
@@ -319,7 +316,7 @@ static int caching_kthread(void *data)
 
 	exclude_super_stripes(extent_root, block_group);
 	spin_lock(&block_group->space_info->lock);
-	block_group->space_info->bytes_super += block_group->bytes_super;
+	block_group->space_info->bytes_readonly += block_group->bytes_super;
 	spin_unlock(&block_group->space_info->lock);
 
 	last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
@@ -507,6 +504,9 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
 	struct list_head *head = &info->space_info;
 	struct btrfs_space_info *found;
 
+	flags &= BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_SYSTEM |
+		 BTRFS_BLOCK_GROUP_METADATA;
+
 	rcu_read_lock();
 	list_for_each_entry_rcu(found, head, list) {
 		if (found->flags == flags) {
@@ -610,6 +610,113 @@ int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
610} 610}
611 611
612/* 612/*
613 * helper function to lookup reference count and flags of extent.
614 *
615 * the head node for delayed ref is used to store the sum of all the
616 * reference count modifications queued up in the rbtree. the head
617 * node may also store the extent flags to set. This way you can check
618 * to see what the reference count and extent flags would be if all of
619 * the delayed refs are not processed.
620 */
621int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
622 struct btrfs_root *root, u64 bytenr,
623 u64 num_bytes, u64 *refs, u64 *flags)
624{
625 struct btrfs_delayed_ref_head *head;
626 struct btrfs_delayed_ref_root *delayed_refs;
627 struct btrfs_path *path;
628 struct btrfs_extent_item *ei;
629 struct extent_buffer *leaf;
630 struct btrfs_key key;
631 u32 item_size;
632 u64 num_refs;
633 u64 extent_flags;
634 int ret;
635
636 path = btrfs_alloc_path();
637 if (!path)
638 return -ENOMEM;
639
640 key.objectid = bytenr;
641 key.type = BTRFS_EXTENT_ITEM_KEY;
642 key.offset = num_bytes;
643 if (!trans) {
644 path->skip_locking = 1;
645 path->search_commit_root = 1;
646 }
647again:
648 ret = btrfs_search_slot(trans, root->fs_info->extent_root,
649 &key, path, 0, 0);
650 if (ret < 0)
651 goto out_free;
652
653 if (ret == 0) {
654 leaf = path->nodes[0];
655 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
656 if (item_size >= sizeof(*ei)) {
657 ei = btrfs_item_ptr(leaf, path->slots[0],
658 struct btrfs_extent_item);
659 num_refs = btrfs_extent_refs(leaf, ei);
660 extent_flags = btrfs_extent_flags(leaf, ei);
661 } else {
662#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
663 struct btrfs_extent_item_v0 *ei0;
664 BUG_ON(item_size != sizeof(*ei0));
665 ei0 = btrfs_item_ptr(leaf, path->slots[0],
666 struct btrfs_extent_item_v0);
667 num_refs = btrfs_extent_refs_v0(leaf, ei0);
668 /* FIXME: this isn't correct for data */
669 extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
670#else
671 BUG();
672#endif
673 }
674 BUG_ON(num_refs == 0);
675 } else {
676 num_refs = 0;
677 extent_flags = 0;
678 ret = 0;
679 }
680
681 if (!trans)
682 goto out;
683
684 delayed_refs = &trans->transaction->delayed_refs;
685 spin_lock(&delayed_refs->lock);
686 head = btrfs_find_delayed_ref_head(trans, bytenr);
687 if (head) {
688 if (!mutex_trylock(&head->mutex)) {
689 atomic_inc(&head->node.refs);
690 spin_unlock(&delayed_refs->lock);
691
692 btrfs_release_path(root->fs_info->extent_root, path);
693
694 mutex_lock(&head->mutex);
695 mutex_unlock(&head->mutex);
696 btrfs_put_delayed_ref(&head->node);
697 goto again;
698 }
699 if (head->extent_op && head->extent_op->update_flags)
700 extent_flags |= head->extent_op->flags_to_set;
701 else
702 BUG_ON(num_refs == 0);
703
704 num_refs += head->node.ref_mod;
705 mutex_unlock(&head->mutex);
706 }
707 spin_unlock(&delayed_refs->lock);
708out:
709 WARN_ON(num_refs == 0);
710 if (refs)
711 *refs = num_refs;
712 if (flags)
713 *flags = extent_flags;
714out_free:
715 btrfs_free_path(path);
716 return ret;
717}
718
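The comment above describes the trick: the effective reference count is the on-disk count plus the signed sum (ref_mod) kept in the delayed-ref head, and a pending flag update is OR'd in the same way. A minimal userspace sketch of that summation (toy types, not the kernel structures):

#include <stdint.h>
#include <stdio.h>

/* toy stand-ins for the on-disk item and the delayed-ref head */
struct toy_extent_item { uint64_t refs; uint64_t flags; };
struct toy_ref_head    { int64_t ref_mod; uint64_t flags_to_set; int update_flags; };

/* effective count/flags as if all queued delayed refs had been run */
static void lookup_extent_info(const struct toy_extent_item *ei,
                               const struct toy_ref_head *head,
                               uint64_t *refs, uint64_t *flags)
{
        *refs = ei->refs;
        *flags = ei->flags;
        if (head) {
                *refs += head->ref_mod;        /* sum of queued +1/-1 mods */
                if (head->update_flags)
                        *flags |= head->flags_to_set;
        }
}

int main(void)
{
        struct toy_extent_item ei = { .refs = 3, .flags = 0 };
        struct toy_ref_head head = { .ref_mod = -2, .flags_to_set = 0x1,
                                     .update_flags = 1 };
        uint64_t refs, flags;

        lookup_extent_info(&ei, &head, &refs, &flags);
        printf("effective refs=%llu flags=0x%llx\n",
               (unsigned long long)refs, (unsigned long long)flags);
        return 0;
}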
719/*
613 * Back reference rules. Back refs have three main goals: 720 * Back reference rules. Back refs have three main goals:
614 * 721 *
615 * 1) differentiate between all holders of references to an extent so that 722 * 1) differentiate between all holders of references to an extent so that
@@ -1871,7 +1978,6 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
1871 return ret; 1978 return ret;
1872} 1979}
1873 1980
1874
1875/* helper function to actually process a single delayed ref entry */ 1981/* helper function to actually process a single delayed ref entry */
1876static int run_one_delayed_ref(struct btrfs_trans_handle *trans, 1982static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
1877 struct btrfs_root *root, 1983 struct btrfs_root *root,
@@ -1891,32 +1997,14 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
1891 BUG_ON(extent_op); 1997 BUG_ON(extent_op);
1892 head = btrfs_delayed_node_to_head(node); 1998 head = btrfs_delayed_node_to_head(node);
1893 if (insert_reserved) { 1999 if (insert_reserved) {
1894 int mark_free = 0; 2000 btrfs_pin_extent(root, node->bytenr,
1895 struct extent_buffer *must_clean = NULL; 2001 node->num_bytes, 1);
1896
1897 ret = pin_down_bytes(trans, root, NULL,
1898 node->bytenr, node->num_bytes,
1899 head->is_data, 1, &must_clean);
1900 if (ret > 0)
1901 mark_free = 1;
1902
1903 if (must_clean) {
1904 clean_tree_block(NULL, root, must_clean);
1905 btrfs_tree_unlock(must_clean);
1906 free_extent_buffer(must_clean);
1907 }
1908 if (head->is_data) { 2002 if (head->is_data) {
1909 ret = btrfs_del_csums(trans, root, 2003 ret = btrfs_del_csums(trans, root,
1910 node->bytenr, 2004 node->bytenr,
1911 node->num_bytes); 2005 node->num_bytes);
1912 BUG_ON(ret); 2006 BUG_ON(ret);
1913 } 2007 }
1914 if (mark_free) {
1915 ret = btrfs_free_reserved_extent(root,
1916 node->bytenr,
1917 node->num_bytes);
1918 BUG_ON(ret);
1919 }
1920 } 2008 }
1921 mutex_unlock(&head->mutex); 2009 mutex_unlock(&head->mutex);
1922 return 0; 2010 return 0;
@@ -2347,6 +2435,8 @@ int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
2347 ret = 0; 2435 ret = 0;
2348out: 2436out:
2349 btrfs_free_path(path); 2437 btrfs_free_path(path);
2438 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
2439 WARN_ON(ret > 0);
2350 return ret; 2440 return ret;
2351} 2441}
2352 2442
@@ -2660,12 +2750,21 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
2660 struct btrfs_space_info **space_info) 2750 struct btrfs_space_info **space_info)
2661{ 2751{
2662 struct btrfs_space_info *found; 2752 struct btrfs_space_info *found;
2753 int i;
2754 int factor;
2755
2756 if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
2757 BTRFS_BLOCK_GROUP_RAID10))
2758 factor = 2;
2759 else
2760 factor = 1;
2663 2761
2664 found = __find_space_info(info, flags); 2762 found = __find_space_info(info, flags);
2665 if (found) { 2763 if (found) {
2666 spin_lock(&found->lock); 2764 spin_lock(&found->lock);
2667 found->total_bytes += total_bytes; 2765 found->total_bytes += total_bytes;
2668 found->bytes_used += bytes_used; 2766 found->bytes_used += bytes_used;
2767 found->disk_used += bytes_used * factor;
2669 found->full = 0; 2768 found->full = 0;
2670 spin_unlock(&found->lock); 2769 spin_unlock(&found->lock);
2671 *space_info = found; 2770 *space_info = found;
@@ -2675,18 +2774,20 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
2675 if (!found) 2774 if (!found)
2676 return -ENOMEM; 2775 return -ENOMEM;
2677 2776
2678 INIT_LIST_HEAD(&found->block_groups); 2777 for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
2778 INIT_LIST_HEAD(&found->block_groups[i]);
2679 init_rwsem(&found->groups_sem); 2779 init_rwsem(&found->groups_sem);
2680 init_waitqueue_head(&found->flush_wait);
2681 init_waitqueue_head(&found->allocate_wait);
2682 spin_lock_init(&found->lock); 2780 spin_lock_init(&found->lock);
2683 found->flags = flags; 2781 found->flags = flags & (BTRFS_BLOCK_GROUP_DATA |
2782 BTRFS_BLOCK_GROUP_SYSTEM |
2783 BTRFS_BLOCK_GROUP_METADATA);
2684 found->total_bytes = total_bytes; 2784 found->total_bytes = total_bytes;
2685 found->bytes_used = bytes_used; 2785 found->bytes_used = bytes_used;
2786 found->disk_used = bytes_used * factor;
2686 found->bytes_pinned = 0; 2787 found->bytes_pinned = 0;
2687 found->bytes_reserved = 0; 2788 found->bytes_reserved = 0;
2688 found->bytes_readonly = 0; 2789 found->bytes_readonly = 0;
2689 found->bytes_delalloc = 0; 2790 found->bytes_may_use = 0;
2690 found->full = 0; 2791 found->full = 0;
2691 found->force_alloc = 0; 2792 found->force_alloc = 0;
2692 *space_info = found; 2793 *space_info = found;
@@ -2711,19 +2812,6 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
2711 } 2812 }
2712} 2813}
2713 2814
2714static void set_block_group_readonly(struct btrfs_block_group_cache *cache)
2715{
2716 spin_lock(&cache->space_info->lock);
2717 spin_lock(&cache->lock);
2718 if (!cache->ro) {
2719 cache->space_info->bytes_readonly += cache->key.offset -
2720 btrfs_block_group_used(&cache->item);
2721 cache->ro = 1;
2722 }
2723 spin_unlock(&cache->lock);
2724 spin_unlock(&cache->space_info->lock);
2725}
2726
2727u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) 2815u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
2728{ 2816{
2729 u64 num_devices = root->fs_info->fs_devices->rw_devices; 2817 u64 num_devices = root->fs_info->fs_devices->rw_devices;
@@ -2752,491 +2840,50 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
2752 return flags; 2840 return flags;
2753} 2841}
2754 2842
2755static u64 btrfs_get_alloc_profile(struct btrfs_root *root, u64 data) 2843static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
2756{
2757 struct btrfs_fs_info *info = root->fs_info;
2758 u64 alloc_profile;
2759
2760 if (data) {
2761 alloc_profile = info->avail_data_alloc_bits &
2762 info->data_alloc_profile;
2763 data = BTRFS_BLOCK_GROUP_DATA | alloc_profile;
2764 } else if (root == root->fs_info->chunk_root) {
2765 alloc_profile = info->avail_system_alloc_bits &
2766 info->system_alloc_profile;
2767 data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile;
2768 } else {
2769 alloc_profile = info->avail_metadata_alloc_bits &
2770 info->metadata_alloc_profile;
2771 data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile;
2772 }
2773
2774 return btrfs_reduce_alloc_profile(root, data);
2775}
2776
2777void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
2778{
2779 u64 alloc_target;
2780
2781 alloc_target = btrfs_get_alloc_profile(root, 1);
2782 BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
2783 alloc_target);
2784}
2785
2786static u64 calculate_bytes_needed(struct btrfs_root *root, int num_items)
2787{
2788 u64 num_bytes;
2789 int level;
2790
2791 level = BTRFS_MAX_LEVEL - 2;
2792 /*
2793 * NOTE: these calculations are absolutely the worst possible case.
2794 * This assumes that _every_ item we insert will require a new leaf, and
2795 * that the tree has grown to its maximum level size.
2796 */
2797
2798 /*
2799 * for every item we insert we could insert both an extent item and an
2800 * extent ref item. Then for every item we insert, we will need to cow
2801 * both the original leaf, plus the leaf to the left and right of it.
2802 *
2803 * Unless we are talking about the extent root, then we just want the
2804 * number of items * 2, since we just need the extent item plus its ref.
2805 */
2806 if (root == root->fs_info->extent_root)
2807 num_bytes = num_items * 2;
2808 else
2809 num_bytes = (num_items + (2 * num_items)) * 3;
2810
2811 /*
2812 * num_bytes is total number of leaves we could need times the leaf
2813 * size, and then for every leaf we could end up cow'ing 2 nodes per
2814 * level, down to the leaf level.
2815 */
2816 num_bytes = (num_bytes * root->leafsize) +
2817 (num_bytes * (level * 2)) * root->nodesize;
2818
2819 return num_bytes;
2820}
2821
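As a sanity check on the worst-case arithmetic in this (removed) helper: with 4 KiB leaves and nodes, level = BTRFS_MAX_LEVEL - 2 = 6 and one item on a non-extent-root tree, num_bytes starts at (1 + 2) * 3 = 9 leaves, giving 9 * 4096 + (9 * 12) * 4096 = 479232 bytes, roughly 468 KiB reserved per item. The same formula as a standalone sketch (constants assumed, not taken from the headers):

#include <stdint.h>
#include <stdio.h>

#define TOY_MAX_LEVEL 8   /* BTRFS_MAX_LEVEL in this era of the code */

static uint64_t worst_case_bytes(uint64_t num_items, uint64_t leafsize,
                                 uint64_t nodesize, int is_extent_root)
{
        int level = TOY_MAX_LEVEL - 2;
        uint64_t num_bytes;

        /* extent root: item plus its ref; others: 3 leaves per item, x3 for cow */
        if (is_extent_root)
                num_bytes = num_items * 2;
        else
                num_bytes = (num_items + 2 * num_items) * 3;

        /* the leaves themselves, plus 2 cow'd nodes per level for each leaf */
        return num_bytes * leafsize + (num_bytes * (level * 2)) * nodesize;
}

int main(void)
{
        printf("%llu\n", (unsigned long long)worst_case_bytes(1, 4096, 4096, 0));
        return 0;   /* prints 479232 */
}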
2822/*
2823 * Unreserve metadata space for delalloc. If we have fewer reserved credits than
2824 * we have extents, this function does nothing.
2825 */
2826int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
2827 struct inode *inode, int num_items)
2828{
2829 struct btrfs_fs_info *info = root->fs_info;
2830 struct btrfs_space_info *meta_sinfo;
2831 u64 num_bytes;
2832 u64 alloc_target;
2833 bool bug = false;
2834
2835 /* get the space info for where the metadata will live */
2836 alloc_target = btrfs_get_alloc_profile(root, 0);
2837 meta_sinfo = __find_space_info(info, alloc_target);
2838
2839 num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
2840 num_items);
2841
2842 spin_lock(&meta_sinfo->lock);
2843 spin_lock(&BTRFS_I(inode)->accounting_lock);
2844 if (BTRFS_I(inode)->reserved_extents <=
2845 BTRFS_I(inode)->outstanding_extents) {
2846 spin_unlock(&BTRFS_I(inode)->accounting_lock);
2847 spin_unlock(&meta_sinfo->lock);
2848 return 0;
2849 }
2850 spin_unlock(&BTRFS_I(inode)->accounting_lock);
2851
2852 BTRFS_I(inode)->reserved_extents -= num_items;
2853 BUG_ON(BTRFS_I(inode)->reserved_extents < 0);
2854
2855 if (meta_sinfo->bytes_delalloc < num_bytes) {
2856 bug = true;
2857 meta_sinfo->bytes_delalloc = 0;
2858 } else {
2859 meta_sinfo->bytes_delalloc -= num_bytes;
2860 }
2861 spin_unlock(&meta_sinfo->lock);
2862
2863 BUG_ON(bug);
2864
2865 return 0;
2866}
2867
2868static void check_force_delalloc(struct btrfs_space_info *meta_sinfo)
2869{ 2844{
2870 u64 thresh; 2845 if (flags & BTRFS_BLOCK_GROUP_DATA)
2871 2846 flags |= root->fs_info->avail_data_alloc_bits &
2872 thresh = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + 2847 root->fs_info->data_alloc_profile;
2873 meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly + 2848 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
2874 meta_sinfo->bytes_super + meta_sinfo->bytes_root + 2849 flags |= root->fs_info->avail_system_alloc_bits &
2875 meta_sinfo->bytes_may_use; 2850 root->fs_info->system_alloc_profile;
2876 2851 else if (flags & BTRFS_BLOCK_GROUP_METADATA)
2877 thresh = meta_sinfo->total_bytes - thresh; 2852 flags |= root->fs_info->avail_metadata_alloc_bits &
2878 thresh *= 80; 2853 root->fs_info->metadata_alloc_profile;
2879 do_div(thresh, 100); 2854 return btrfs_reduce_alloc_profile(root, flags);
2880 if (thresh <= meta_sinfo->bytes_delalloc)
2881 meta_sinfo->force_delalloc = 1;
2882 else
2883 meta_sinfo->force_delalloc = 0;
2884} 2855}
2885 2856
2886struct async_flush { 2857static u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
2887 struct btrfs_root *root;
2888 struct btrfs_space_info *info;
2889 struct btrfs_work work;
2890};
2891
2892static noinline void flush_delalloc_async(struct btrfs_work *work)
2893{ 2858{
2894 struct async_flush *async; 2859 u64 flags;
2895 struct btrfs_root *root;
2896 struct btrfs_space_info *info;
2897
2898 async = container_of(work, struct async_flush, work);
2899 root = async->root;
2900 info = async->info;
2901
2902 btrfs_start_delalloc_inodes(root, 0);
2903 wake_up(&info->flush_wait);
2904 btrfs_wait_ordered_extents(root, 0, 0);
2905
2906 spin_lock(&info->lock);
2907 info->flushing = 0;
2908 spin_unlock(&info->lock);
2909 wake_up(&info->flush_wait);
2910
2911 kfree(async);
2912}
2913
2914static void wait_on_flush(struct btrfs_space_info *info)
2915{
2916 DEFINE_WAIT(wait);
2917 u64 used;
2918
2919 while (1) {
2920 prepare_to_wait(&info->flush_wait, &wait,
2921 TASK_UNINTERRUPTIBLE);
2922 spin_lock(&info->lock);
2923 if (!info->flushing) {
2924 spin_unlock(&info->lock);
2925 break;
2926 }
2927
2928 used = info->bytes_used + info->bytes_reserved +
2929 info->bytes_pinned + info->bytes_readonly +
2930 info->bytes_super + info->bytes_root +
2931 info->bytes_may_use + info->bytes_delalloc;
2932 if (used < info->total_bytes) {
2933 spin_unlock(&info->lock);
2934 break;
2935 }
2936 spin_unlock(&info->lock);
2937 schedule();
2938 }
2939 finish_wait(&info->flush_wait, &wait);
2940}
2941
2942static void flush_delalloc(struct btrfs_root *root,
2943 struct btrfs_space_info *info)
2944{
2945 struct async_flush *async;
2946 bool wait = false;
2947
2948 spin_lock(&info->lock);
2949 2860
2950 if (!info->flushing) 2861 if (data)
2951 info->flushing = 1; 2862 flags = BTRFS_BLOCK_GROUP_DATA;
2863 else if (root == root->fs_info->chunk_root)
2864 flags = BTRFS_BLOCK_GROUP_SYSTEM;
2952 else 2865 else
2953 wait = true; 2866 flags = BTRFS_BLOCK_GROUP_METADATA;
2954
2955 spin_unlock(&info->lock);
2956
2957 if (wait) {
2958 wait_on_flush(info);
2959 return;
2960 }
2961
2962 async = kzalloc(sizeof(*async), GFP_NOFS);
2963 if (!async)
2964 goto flush;
2965
2966 async->root = root;
2967 async->info = info;
2968 async->work.func = flush_delalloc_async;
2969
2970 btrfs_queue_worker(&root->fs_info->enospc_workers,
2971 &async->work);
2972 wait_on_flush(info);
2973 return;
2974
2975flush:
2976 btrfs_start_delalloc_inodes(root, 0);
2977 btrfs_wait_ordered_extents(root, 0, 0);
2978
2979 spin_lock(&info->lock);
2980 info->flushing = 0;
2981 spin_unlock(&info->lock);
2982 wake_up(&info->flush_wait);
2983}
2984
2985static int maybe_allocate_chunk(struct btrfs_root *root,
2986 struct btrfs_space_info *info)
2987{
2988 struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
2989 struct btrfs_trans_handle *trans;
2990 bool wait = false;
2991 int ret = 0;
2992 u64 min_metadata;
2993 u64 free_space;
2994
2995 free_space = btrfs_super_total_bytes(disk_super);
2996 /*
2997 * we allow the metadata to grow to a max of either 10gb or 5% of the
2998 * space in the volume.
2999 */
3000 min_metadata = min((u64)10 * 1024 * 1024 * 1024,
3001 div64_u64(free_space * 5, 100));
3002 if (info->total_bytes >= min_metadata) {
3003 spin_unlock(&info->lock);
3004 return 0;
3005 }
3006
3007 if (info->full) {
3008 spin_unlock(&info->lock);
3009 return 0;
3010 }
3011
3012 if (!info->allocating_chunk) {
3013 info->force_alloc = 1;
3014 info->allocating_chunk = 1;
3015 } else {
3016 wait = true;
3017 }
3018
3019 spin_unlock(&info->lock);
3020
3021 if (wait) {
3022 wait_event(info->allocate_wait,
3023 !info->allocating_chunk);
3024 return 1;
3025 }
3026
3027 trans = btrfs_start_transaction(root, 1);
3028 if (!trans) {
3029 ret = -ENOMEM;
3030 goto out;
3031 }
3032
3033 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3034 4096 + 2 * 1024 * 1024,
3035 info->flags, 0);
3036 btrfs_end_transaction(trans, root);
3037 if (ret)
3038 goto out;
3039out:
3040 spin_lock(&info->lock);
3041 info->allocating_chunk = 0;
3042 spin_unlock(&info->lock);
3043 wake_up(&info->allocate_wait);
3044
3045 if (ret)
3046 return 0;
3047 return 1;
3048}
3049
3050/*
3051 * Reserve metadata space for delalloc.
3052 */
3053int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root,
3054 struct inode *inode, int num_items)
3055{
3056 struct btrfs_fs_info *info = root->fs_info;
3057 struct btrfs_space_info *meta_sinfo;
3058 u64 num_bytes;
3059 u64 used;
3060 u64 alloc_target;
3061 int flushed = 0;
3062 int force_delalloc;
3063
3064 /* get the space info for where the metadata will live */
3065 alloc_target = btrfs_get_alloc_profile(root, 0);
3066 meta_sinfo = __find_space_info(info, alloc_target);
3067
3068 num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
3069 num_items);
3070again:
3071 spin_lock(&meta_sinfo->lock);
3072
3073 force_delalloc = meta_sinfo->force_delalloc;
3074
3075 if (unlikely(!meta_sinfo->bytes_root))
3076 meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
3077
3078 if (!flushed)
3079 meta_sinfo->bytes_delalloc += num_bytes;
3080
3081 used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
3082 meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
3083 meta_sinfo->bytes_super + meta_sinfo->bytes_root +
3084 meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
3085
3086 if (used > meta_sinfo->total_bytes) {
3087 flushed++;
3088
3089 if (flushed == 1) {
3090 if (maybe_allocate_chunk(root, meta_sinfo))
3091 goto again;
3092 flushed++;
3093 } else {
3094 spin_unlock(&meta_sinfo->lock);
3095 }
3096
3097 if (flushed == 2) {
3098 filemap_flush(inode->i_mapping);
3099 goto again;
3100 } else if (flushed == 3) {
3101 flush_delalloc(root, meta_sinfo);
3102 goto again;
3103 }
3104 spin_lock(&meta_sinfo->lock);
3105 meta_sinfo->bytes_delalloc -= num_bytes;
3106 spin_unlock(&meta_sinfo->lock);
3107 printk(KERN_ERR "enospc, has %d, reserved %d\n",
3108 BTRFS_I(inode)->outstanding_extents,
3109 BTRFS_I(inode)->reserved_extents);
3110 dump_space_info(meta_sinfo, 0, 0);
3111 return -ENOSPC;
3112 }
3113 2867
3114 BTRFS_I(inode)->reserved_extents += num_items; 2868 return get_alloc_profile(root, flags);
3115 check_force_delalloc(meta_sinfo);
3116 spin_unlock(&meta_sinfo->lock);
3117
3118 if (!flushed && force_delalloc)
3119 filemap_flush(inode->i_mapping);
3120
3121 return 0;
3122} 2869}
3123 2870
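The removed reservation path above escalated through progressively heavier remedies, first allocating a chunk, then flushing the inode's own pages, then flushing all delalloc, before giving up with -ENOSPC. A generic sketch of that escalation pattern (the remedy hooks are stand-ins, not kernel calls):

#include <stdio.h>

/* stand-in remedies, ordered from cheapest to most expensive */
static int try_allocate_chunk(void) { return 0; }  /* 1: grow the pool  */
static int flush_this_inode(void)   { return 0; }  /* 2: filemap_flush  */
static int flush_all_delalloc(void) { return 1; }  /* 3: flush_delalloc */

static int reserve(long need, long *available)
{
        int attempt = 0;

        for (;;) {
                if (*available >= need) {
                        *available -= need;
                        return 0;
                }
                switch (attempt++) {
                case 0: if (try_allocate_chunk()) *available += 1024; break;
                case 1: if (flush_this_inode())   *available += 256;  break;
                case 2: if (flush_all_delalloc()) *available += 512;  break;
                default:
                        return -1;              /* -ENOSPC */
                }
        }
}

int main(void)
{
        long avail = 100;
        printf("ret=%d avail=%ld\n", reserve(400, &avail), avail);
        return 0;
}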
3124/* 2871void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
3125 * unreserve num_items number of items worth of metadata space. This needs to
3126 * be paired with btrfs_reserve_metadata_space.
3127 *
3128 * NOTE: if you have the option, run this _AFTER_ you do a
3129 * btrfs_end_transaction, since btrfs_end_transaction will run delayed ref
3130 * operations which will result in more used metadata, so we want to make sure we
3131 * can do that without issue.
3132 */
3133int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items)
3134{
3135 struct btrfs_fs_info *info = root->fs_info;
3136 struct btrfs_space_info *meta_sinfo;
3137 u64 num_bytes;
3138 u64 alloc_target;
3139 bool bug = false;
3140
3141 /* get the space info for where the metadata will live */
3142 alloc_target = btrfs_get_alloc_profile(root, 0);
3143 meta_sinfo = __find_space_info(info, alloc_target);
3144
3145 num_bytes = calculate_bytes_needed(root, num_items);
3146
3147 spin_lock(&meta_sinfo->lock);
3148 if (meta_sinfo->bytes_may_use < num_bytes) {
3149 bug = true;
3150 meta_sinfo->bytes_may_use = 0;
3151 } else {
3152 meta_sinfo->bytes_may_use -= num_bytes;
3153 }
3154 spin_unlock(&meta_sinfo->lock);
3155
3156 BUG_ON(bug);
3157
3158 return 0;
3159}
3160
3161/*
3162 * Reserve some metadata space for use. We'll calculate the worst-case number
3163 * of bytes that would be needed to modify num_items number of items. If we
3164 * have space, fantastic, if not, you get -ENOSPC. Please call
3165 * btrfs_unreserve_metadata_space when you are done for the _SAME_ number of
3166 * items you reserved, since whatever metadata you needed should have already
3167 * been allocated.
3168 *
3169 * This will commit the transaction to make more space if we don't have enough
3170 * metadata space. The only time we don't do this is if we're reserving space
3171 * inside of a transaction, then we will just return -ENOSPC and it is the
3172 * caller's responsibility to handle it properly.
3173 */
3174int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items)
3175{ 2872{
3176 struct btrfs_fs_info *info = root->fs_info; 2873 BTRFS_I(inode)->space_info = __find_space_info(root->fs_info,
3177 struct btrfs_space_info *meta_sinfo; 2874 BTRFS_BLOCK_GROUP_DATA);
3178 u64 num_bytes;
3179 u64 used;
3180 u64 alloc_target;
3181 int retries = 0;
3182
3183 /* get the space info for where the metadata will live */
3184 alloc_target = btrfs_get_alloc_profile(root, 0);
3185 meta_sinfo = __find_space_info(info, alloc_target);
3186
3187 num_bytes = calculate_bytes_needed(root, num_items);
3188again:
3189 spin_lock(&meta_sinfo->lock);
3190
3191 if (unlikely(!meta_sinfo->bytes_root))
3192 meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
3193
3194 if (!retries)
3195 meta_sinfo->bytes_may_use += num_bytes;
3196
3197 used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
3198 meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
3199 meta_sinfo->bytes_super + meta_sinfo->bytes_root +
3200 meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
3201
3202 if (used > meta_sinfo->total_bytes) {
3203 retries++;
3204 if (retries == 1) {
3205 if (maybe_allocate_chunk(root, meta_sinfo))
3206 goto again;
3207 retries++;
3208 } else {
3209 spin_unlock(&meta_sinfo->lock);
3210 }
3211
3212 if (retries == 2) {
3213 flush_delalloc(root, meta_sinfo);
3214 goto again;
3215 }
3216 spin_lock(&meta_sinfo->lock);
3217 meta_sinfo->bytes_may_use -= num_bytes;
3218 spin_unlock(&meta_sinfo->lock);
3219
3220 dump_space_info(meta_sinfo, 0, 0);
3221 return -ENOSPC;
3222 }
3223
3224 check_force_delalloc(meta_sinfo);
3225 spin_unlock(&meta_sinfo->lock);
3226
3227 return 0;
3228} 2875}
3229 2876
3230/* 2877/*
3231 * This will check the space that the inode allocates from to make sure we have 2878 * This will check the space that the inode allocates from to make sure we have
3232 * enough space for bytes. 2879 * enough space for bytes.
3233 */ 2880 */
3234int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode, 2881int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
3235 u64 bytes)
3236{ 2882{
3237 struct btrfs_space_info *data_sinfo; 2883 struct btrfs_space_info *data_sinfo;
2884 struct btrfs_root *root = BTRFS_I(inode)->root;
3238 u64 used; 2885 u64 used;
3239 int ret = 0, committed = 0, flushed = 0; 2886 int ret = 0, committed = 0;
3240 2887
3241 /* make sure bytes are sectorsize aligned */ 2888 /* make sure bytes are sectorsize aligned */
3242 bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); 2889 bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
@@ -3248,21 +2895,13 @@ int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
3248again: 2895again:
3249 /* make sure we have enough space to handle the data first */ 2896 /* make sure we have enough space to handle the data first */
3250 spin_lock(&data_sinfo->lock); 2897 spin_lock(&data_sinfo->lock);
3251 used = data_sinfo->bytes_used + data_sinfo->bytes_delalloc + 2898 used = data_sinfo->bytes_used + data_sinfo->bytes_reserved +
3252 data_sinfo->bytes_reserved + data_sinfo->bytes_pinned + 2899 data_sinfo->bytes_pinned + data_sinfo->bytes_readonly +
3253 data_sinfo->bytes_readonly + data_sinfo->bytes_may_use + 2900 data_sinfo->bytes_may_use;
3254 data_sinfo->bytes_super;
3255 2901
3256 if (used + bytes > data_sinfo->total_bytes) { 2902 if (used + bytes > data_sinfo->total_bytes) {
3257 struct btrfs_trans_handle *trans; 2903 struct btrfs_trans_handle *trans;
3258 2904
3259 if (!flushed) {
3260 spin_unlock(&data_sinfo->lock);
3261 flush_delalloc(root, data_sinfo);
3262 flushed = 1;
3263 goto again;
3264 }
3265
3266 /* 2905 /*
3267 * if we don't have enough free bytes in this space then we need 2906 * if we don't have enough free bytes in this space then we need
3268 * to alloc a new chunk. 2907 * to alloc a new chunk.
@@ -3274,15 +2913,15 @@ again:
3274 spin_unlock(&data_sinfo->lock); 2913 spin_unlock(&data_sinfo->lock);
3275alloc: 2914alloc:
3276 alloc_target = btrfs_get_alloc_profile(root, 1); 2915 alloc_target = btrfs_get_alloc_profile(root, 1);
3277 trans = btrfs_start_transaction(root, 1); 2916 trans = btrfs_join_transaction(root, 1);
3278 if (!trans) 2917 if (IS_ERR(trans))
3279 return -ENOMEM; 2918 return PTR_ERR(trans);
3280 2919
3281 ret = do_chunk_alloc(trans, root->fs_info->extent_root, 2920 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3282 bytes + 2 * 1024 * 1024, 2921 bytes + 2 * 1024 * 1024,
3283 alloc_target, 0); 2922 alloc_target, 0);
3284 btrfs_end_transaction(trans, root); 2923 btrfs_end_transaction(trans, root);
3285 if (ret) 2924 if (ret < 0)
3286 return ret; 2925 return ret;
3287 2926
3288 if (!data_sinfo) { 2927 if (!data_sinfo) {
@@ -3297,25 +2936,26 @@ alloc:
3297 if (!committed && !root->fs_info->open_ioctl_trans) { 2936 if (!committed && !root->fs_info->open_ioctl_trans) {
3298 committed = 1; 2937 committed = 1;
3299 trans = btrfs_join_transaction(root, 1); 2938 trans = btrfs_join_transaction(root, 1);
3300 if (!trans) 2939 if (IS_ERR(trans))
3301 return -ENOMEM; 2940 return PTR_ERR(trans);
3302 ret = btrfs_commit_transaction(trans, root); 2941 ret = btrfs_commit_transaction(trans, root);
3303 if (ret) 2942 if (ret)
3304 return ret; 2943 return ret;
3305 goto again; 2944 goto again;
3306 } 2945 }
3307 2946
3308 printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes" 2947#if 0 /* I hope we never need this code again, just in case */
3309 ", %llu bytes_used, %llu bytes_reserved, " 2948 printk(KERN_ERR "no space left, need %llu, %llu bytes_used, "
3310 "%llu bytes_pinned, %llu bytes_readonly, %llu may use " 2949 "%llu bytes_reserved, " "%llu bytes_pinned, "
3311 "%llu total\n", (unsigned long long)bytes, 2950 "%llu bytes_readonly, %llu may use %llu total\n",
3312 (unsigned long long)data_sinfo->bytes_delalloc, 2951 (unsigned long long)bytes,
3313 (unsigned long long)data_sinfo->bytes_used, 2952 (unsigned long long)data_sinfo->bytes_used,
3314 (unsigned long long)data_sinfo->bytes_reserved, 2953 (unsigned long long)data_sinfo->bytes_reserved,
3315 (unsigned long long)data_sinfo->bytes_pinned, 2954 (unsigned long long)data_sinfo->bytes_pinned,
3316 (unsigned long long)data_sinfo->bytes_readonly, 2955 (unsigned long long)data_sinfo->bytes_readonly,
3317 (unsigned long long)data_sinfo->bytes_may_use, 2956 (unsigned long long)data_sinfo->bytes_may_use,
3318 (unsigned long long)data_sinfo->total_bytes); 2957 (unsigned long long)data_sinfo->total_bytes);
2958#endif
3319 return -ENOSPC; 2959 return -ENOSPC;
3320 } 2960 }
3321 data_sinfo->bytes_may_use += bytes; 2961 data_sinfo->bytes_may_use += bytes;
@@ -3326,12 +2966,13 @@ alloc:
3326} 2966}
3327 2967
3328/* 2968/*
3329 * if there was an error for whatever reason after calling 2969 * called when we are clearing a delalloc extent from the
3330 * btrfs_check_data_free_space, call this so we can clean up the counters. 2970 * inode's io_tree or there was an error for whatever reason
2971 * after calling btrfs_check_data_free_space
3331 */ 2972 */
3332void btrfs_free_reserved_data_space(struct btrfs_root *root, 2973void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
3333 struct inode *inode, u64 bytes)
3334{ 2974{
2975 struct btrfs_root *root = BTRFS_I(inode)->root;
3335 struct btrfs_space_info *data_sinfo; 2976 struct btrfs_space_info *data_sinfo;
3336 2977
3337 /* make sure bytes are sectorsize aligned */ 2978 /* make sure bytes are sectorsize aligned */
@@ -3344,48 +2985,6 @@ void btrfs_free_reserved_data_space(struct btrfs_root *root,
3344 spin_unlock(&data_sinfo->lock); 2985 spin_unlock(&data_sinfo->lock);
3345} 2986}
3346 2987
3347/* called when we are adding a delalloc extent to the inode's io_tree */
3348void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode,
3349 u64 bytes)
3350{
3351 struct btrfs_space_info *data_sinfo;
3352
3353 /* get the space info for where this inode will be storing its data */
3354 data_sinfo = BTRFS_I(inode)->space_info;
3355
3356 /* make sure we have enough space to handle the data first */
3357 spin_lock(&data_sinfo->lock);
3358 data_sinfo->bytes_delalloc += bytes;
3359
3360 /*
3361 * we are adding a delalloc extent without calling
3362 * btrfs_check_data_free_space first. This happens on a weird
3363 * writepage condition, but shouldn't hurt our accounting
3364 */
3365 if (unlikely(bytes > BTRFS_I(inode)->reserved_bytes)) {
3366 data_sinfo->bytes_may_use -= BTRFS_I(inode)->reserved_bytes;
3367 BTRFS_I(inode)->reserved_bytes = 0;
3368 } else {
3369 data_sinfo->bytes_may_use -= bytes;
3370 BTRFS_I(inode)->reserved_bytes -= bytes;
3371 }
3372
3373 spin_unlock(&data_sinfo->lock);
3374}
3375
3376/* called when we are clearing a delalloc extent from the inode's io_tree */
3377void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode,
3378 u64 bytes)
3379{
3380 struct btrfs_space_info *info;
3381
3382 info = BTRFS_I(inode)->space_info;
3383
3384 spin_lock(&info->lock);
3385 info->bytes_delalloc -= bytes;
3386 spin_unlock(&info->lock);
3387}
3388
3389static void force_metadata_allocation(struct btrfs_fs_info *info) 2988static void force_metadata_allocation(struct btrfs_fs_info *info)
3390{ 2989{
3391 struct list_head *head = &info->space_info; 2990 struct list_head *head = &info->space_info;
@@ -3399,13 +2998,28 @@ static void force_metadata_allocation(struct btrfs_fs_info *info)
3399 rcu_read_unlock(); 2998 rcu_read_unlock();
3400} 2999}
3401 3000
3001static int should_alloc_chunk(struct btrfs_space_info *sinfo,
3002 u64 alloc_bytes)
3003{
3004 u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
3005
3006 if (sinfo->bytes_used + sinfo->bytes_reserved +
3007 alloc_bytes + 256 * 1024 * 1024 < num_bytes)
3008 return 0;
3009
3010 if (sinfo->bytes_used + sinfo->bytes_reserved +
3011 alloc_bytes < div_factor(num_bytes, 8))
3012 return 0;
3013
3014 return 1;
3015}
3016
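should_alloc_chunk() asks for a new chunk only when the space is both within 256 MiB of full and more than 80% committed (div_factor(n, 8) is n * 8 / 10 in this code base). A standalone restatement of the heuristic:

#include <stdint.h>
#include <stdio.h>

static uint64_t div_factor(uint64_t num, int factor)
{
        return num * factor / 10;    /* mirrors btrfs's helper */
}

static int should_alloc_chunk(uint64_t total, uint64_t readonly,
                              uint64_t used, uint64_t reserved,
                              uint64_t alloc_bytes)
{
        uint64_t usable = total - readonly;

        if (used + reserved + alloc_bytes + 256ULL * 1024 * 1024 < usable)
                return 0;    /* still plenty of headroom */
        if (used + reserved + alloc_bytes < div_factor(usable, 8))
                return 0;    /* under 80% committed */
        return 1;
}

int main(void)
{
        uint64_t gib = 1024ULL * 1024 * 1024;
        printf("%d\n", should_alloc_chunk(gib, 0, 900 * gib / 1024, 0, gib / 64));
        return 0;   /* prints 1: near full and over 80% committed */
}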
3402static int do_chunk_alloc(struct btrfs_trans_handle *trans, 3017static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3403 struct btrfs_root *extent_root, u64 alloc_bytes, 3018 struct btrfs_root *extent_root, u64 alloc_bytes,
3404 u64 flags, int force) 3019 u64 flags, int force)
3405{ 3020{
3406 struct btrfs_space_info *space_info; 3021 struct btrfs_space_info *space_info;
3407 struct btrfs_fs_info *fs_info = extent_root->fs_info; 3022 struct btrfs_fs_info *fs_info = extent_root->fs_info;
3408 u64 thresh;
3409 int ret = 0; 3023 int ret = 0;
3410 3024
3411 mutex_lock(&fs_info->chunk_mutex); 3025 mutex_lock(&fs_info->chunk_mutex);
@@ -3428,11 +3042,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3428 goto out; 3042 goto out;
3429 } 3043 }
3430 3044
3431 thresh = space_info->total_bytes - space_info->bytes_readonly; 3045 if (!force && !should_alloc_chunk(space_info, alloc_bytes)) {
3432 thresh = div_factor(thresh, 8);
3433 if (!force &&
3434 (space_info->bytes_used + space_info->bytes_pinned +
3435 space_info->bytes_reserved + alloc_bytes) < thresh) {
3436 spin_unlock(&space_info->lock); 3046 spin_unlock(&space_info->lock);
3437 goto out; 3047 goto out;
3438 } 3048 }
@@ -3454,6 +3064,8 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3454 spin_lock(&space_info->lock); 3064 spin_lock(&space_info->lock);
3455 if (ret) 3065 if (ret)
3456 space_info->full = 1; 3066 space_info->full = 1;
3067 else
3068 ret = 1;
3457 space_info->force_alloc = 0; 3069 space_info->force_alloc = 0;
3458 spin_unlock(&space_info->lock); 3070 spin_unlock(&space_info->lock);
3459out: 3071out:
@@ -3461,13 +3073,713 @@ out:
3461 return ret; 3073 return ret;
3462} 3074}
3463 3075
3076static int maybe_allocate_chunk(struct btrfs_trans_handle *trans,
3077 struct btrfs_root *root,
3078 struct btrfs_space_info *sinfo, u64 num_bytes)
3079{
3080 int ret;
3081 int end_trans = 0;
3082
3083 if (sinfo->full)
3084 return 0;
3085
3086 spin_lock(&sinfo->lock);
3087 ret = should_alloc_chunk(sinfo, num_bytes + 2 * 1024 * 1024);
3088 spin_unlock(&sinfo->lock);
3089 if (!ret)
3090 return 0;
3091
3092 if (!trans) {
3093 trans = btrfs_join_transaction(root, 1);
3094 BUG_ON(IS_ERR(trans));
3095 end_trans = 1;
3096 }
3097
3098 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3099 num_bytes + 2 * 1024 * 1024,
3100 get_alloc_profile(root, sinfo->flags), 0);
3101
3102 if (end_trans)
3103 btrfs_end_transaction(trans, root);
3104
3105 return ret == 1 ? 1 : 0;
3106}
3107
3108/*
3109 * shrink metadata reservation for delalloc
3110 */
3111static int shrink_delalloc(struct btrfs_trans_handle *trans,
3112 struct btrfs_root *root, u64 to_reclaim)
3113{
3114 struct btrfs_block_rsv *block_rsv;
3115 u64 reserved;
3116 u64 max_reclaim;
3117 u64 reclaimed = 0;
3118 int pause = 1;
3119 int ret;
3120
3121 block_rsv = &root->fs_info->delalloc_block_rsv;
3122 spin_lock(&block_rsv->lock);
3123 reserved = block_rsv->reserved;
3124 spin_unlock(&block_rsv->lock);
3125
3126 if (reserved == 0)
3127 return 0;
3128
3129 max_reclaim = min(reserved, to_reclaim);
3130
3131 while (1) {
3132 ret = btrfs_start_one_delalloc_inode(root, trans ? 1 : 0);
3133 if (!ret) {
3134 __set_current_state(TASK_INTERRUPTIBLE);
3135 schedule_timeout(pause);
3136 pause <<= 1;
3137 if (pause > HZ / 10)
3138 pause = HZ / 10;
3139 } else {
3140 pause = 1;
3141 }
3142
3143 spin_lock(&block_rsv->lock);
3144 if (reserved > block_rsv->reserved)
3145 reclaimed = reserved - block_rsv->reserved;
3146 reserved = block_rsv->reserved;
3147 spin_unlock(&block_rsv->lock);
3148
3149 if (reserved == 0 || reclaimed >= max_reclaim)
3150 break;
3151
3152 if (trans && trans->transaction->blocked)
3153 return -EAGAIN;
3154 }
3155 return reclaimed >= to_reclaim;
3156}
3157
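shrink_delalloc() polls the reservation while kicking delalloc writeback, doubling its sleep up to HZ/10 whenever no inode could be started and resetting it on progress. The backoff on its own looks like this (HZ and the flush hook are assumptions of the sketch):

#include <stdio.h>

#define HZ 1000   /* assumed tick rate for the sketch */

/* returns nonzero if an inode was queued for writeback */
static int start_one_delalloc_inode(int i) { return i % 3 != 0; }

int main(void)
{
        int pause = 1;

        for (int i = 0; i < 10; i++) {
                if (!start_one_delalloc_inode(i)) {
                        /* nothing to flush: sleep, then back off exponentially */
                        pause <<= 1;
                        if (pause > HZ / 10)
                                pause = HZ / 10;
                } else {
                        pause = 1;  /* made progress: reset to the minimum */
                }
                printf("iter %d pause %d\n", i, pause);
        }
        return 0;
}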
3158static int should_retry_reserve(struct btrfs_trans_handle *trans,
3159 struct btrfs_root *root,
3160 struct btrfs_block_rsv *block_rsv,
3161 u64 num_bytes, int *retries)
3162{
3163 struct btrfs_space_info *space_info = block_rsv->space_info;
3164 int ret;
3165
3166 if ((*retries) > 2)
3167 return -ENOSPC;
3168
3169 ret = maybe_allocate_chunk(trans, root, space_info, num_bytes);
3170 if (ret)
3171 return 1;
3172
3173 if (trans && trans->transaction->in_commit)
3174 return -ENOSPC;
3175
3176 ret = shrink_delalloc(trans, root, num_bytes);
3177 if (ret)
3178 return ret;
3179
3180 spin_lock(&space_info->lock);
3181 if (space_info->bytes_pinned < num_bytes)
3182 ret = 1;
3183 spin_unlock(&space_info->lock);
3184 if (ret)
3185 return -ENOSPC;
3186
3187 (*retries)++;
3188
3189 if (trans)
3190 return -EAGAIN;
3191
3192 trans = btrfs_join_transaction(root, 1);
3193 BUG_ON(IS_ERR(trans));
3194 ret = btrfs_commit_transaction(trans, root);
3195 BUG_ON(ret);
3196
3197 return 1;
3198}
3199
3200static int reserve_metadata_bytes(struct btrfs_block_rsv *block_rsv,
3201 u64 num_bytes)
3202{
3203 struct btrfs_space_info *space_info = block_rsv->space_info;
3204 u64 unused;
3205 int ret = -ENOSPC;
3206
3207 spin_lock(&space_info->lock);
3208 unused = space_info->bytes_used + space_info->bytes_reserved +
3209 space_info->bytes_pinned + space_info->bytes_readonly;
3210
3211 if (unused < space_info->total_bytes)
3212 unused = space_info->total_bytes - unused;
3213 else
3214 unused = 0;
3215
3216 if (unused >= num_bytes) {
3217 if (block_rsv->priority >= 10) {
3218 space_info->bytes_reserved += num_bytes;
3219 ret = 0;
3220 } else {
3221 if ((unused + block_rsv->reserved) *
3222 block_rsv->priority >=
3223 (num_bytes + block_rsv->reserved) * 10) {
3224 space_info->bytes_reserved += num_bytes;
3225 ret = 0;
3226 }
3227 }
3228 }
3229 spin_unlock(&space_info->lock);
3230
3231 return ret;
3232}
3233
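reserve_metadata_bytes() lets a reservation with priority >= 10 take any free space, while lower priorities may only claim a proportional share: (unused + reserved) * priority must be at least (num_bytes + reserved) * 10. With nothing reserved yet, the default priority of 6 can therefore claim at most 60% of the free space. A sketch of just that rule:

#include <stdint.h>
#include <stdio.h>

/* lower-priority reservations may only consume a proportional share of
 * the unused space; priority >= 10 always wins */
static int may_reserve(uint64_t unused, uint64_t reserved,
                       uint64_t num_bytes, int priority)
{
        if (unused < num_bytes)
                return 0;
        if (priority >= 10)
                return 1;
        return (unused + reserved) * priority >=
               (num_bytes + reserved) * 10;
}

int main(void)
{
        /* with nothing reserved yet, priority 6 may claim up to 60% of free */
        printf("%d %d\n",
               may_reserve(100, 0, 60, 6),   /* 1: 100*6 >= 60*10 */
               may_reserve(100, 0, 61, 6));  /* 0: 100*6 <  61*10 */
        return 0;
}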
3234static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans,
3235 struct btrfs_root *root)
3236{
3237 struct btrfs_block_rsv *block_rsv;
3238 if (root->ref_cows)
3239 block_rsv = trans->block_rsv;
3240 else
3241 block_rsv = root->block_rsv;
3242
3243 if (!block_rsv)
3244 block_rsv = &root->fs_info->empty_block_rsv;
3245
3246 return block_rsv;
3247}
3248
3249static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
3250 u64 num_bytes)
3251{
3252 int ret = -ENOSPC;
3253 spin_lock(&block_rsv->lock);
3254 if (block_rsv->reserved >= num_bytes) {
3255 block_rsv->reserved -= num_bytes;
3256 if (block_rsv->reserved < block_rsv->size)
3257 block_rsv->full = 0;
3258 ret = 0;
3259 }
3260 spin_unlock(&block_rsv->lock);
3261 return ret;
3262}
3263
3264static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
3265 u64 num_bytes, int update_size)
3266{
3267 spin_lock(&block_rsv->lock);
3268 block_rsv->reserved += num_bytes;
3269 if (update_size)
3270 block_rsv->size += num_bytes;
3271 else if (block_rsv->reserved >= block_rsv->size)
3272 block_rsv->full = 1;
3273 spin_unlock(&block_rsv->lock);
3274}
3275
3276void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
3277 struct btrfs_block_rsv *dest, u64 num_bytes)
3278{
3279 struct btrfs_space_info *space_info = block_rsv->space_info;
3280
3281 spin_lock(&block_rsv->lock);
3282 if (num_bytes == (u64)-1)
3283 num_bytes = block_rsv->size;
3284 block_rsv->size -= num_bytes;
3285 if (block_rsv->reserved >= block_rsv->size) {
3286 num_bytes = block_rsv->reserved - block_rsv->size;
3287 block_rsv->reserved = block_rsv->size;
3288 block_rsv->full = 1;
3289 } else {
3290 num_bytes = 0;
3291 }
3292 spin_unlock(&block_rsv->lock);
3293
3294 if (num_bytes > 0) {
3295 if (dest) {
3296 block_rsv_add_bytes(dest, num_bytes, 0);
3297 } else {
3298 spin_lock(&space_info->lock);
3299 space_info->bytes_reserved -= num_bytes;
3300 spin_unlock(&space_info->lock);
3301 }
3302 }
3303}
3304
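block_rsv_release_bytes() shrinks the reservation's target size first; only if the bytes actually held now exceed the new size does the excess move, either into the destination rsv (the global one, in btrfs_block_rsv_release() below) or back to the space_info. A toy model of that spill:

#include <stdint.h>
#include <stdio.h>

struct toy_rsv { uint64_t size, reserved; };

/* shrink 'rsv' by num_bytes; anything actually held beyond the new
 * size spills into 'dest' instead of being lost */
static void release_bytes(struct toy_rsv *rsv, struct toy_rsv *dest,
                          uint64_t num_bytes)
{
        uint64_t excess = 0;

        rsv->size -= num_bytes;
        if (rsv->reserved > rsv->size) {
                excess = rsv->reserved - rsv->size;
                rsv->reserved = rsv->size;
        }
        if (excess && dest)
                dest->reserved += excess;
}

int main(void)
{
        struct toy_rsv rsv = { .size = 100, .reserved = 90 };
        struct toy_rsv global = { .size = 50, .reserved = 10 };

        release_bytes(&rsv, &global, 40);
        printf("rsv %llu/%llu global reserved %llu\n",
               (unsigned long long)rsv.reserved,
               (unsigned long long)rsv.size,
               (unsigned long long)global.reserved);
        return 0;   /* rsv 60/60, global reserved 40 */
}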
3305static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src,
3306 struct btrfs_block_rsv *dst, u64 num_bytes)
3307{
3308 int ret;
3309
3310 ret = block_rsv_use_bytes(src, num_bytes);
3311 if (ret)
3312 return ret;
3313
3314 block_rsv_add_bytes(dst, num_bytes, 1);
3315 return 0;
3316}
3317
3318void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv)
3319{
3320 memset(rsv, 0, sizeof(*rsv));
3321 spin_lock_init(&rsv->lock);
3322 atomic_set(&rsv->usage, 1);
3323 rsv->priority = 6;
3324 INIT_LIST_HEAD(&rsv->list);
3325}
3326
3327struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
3328{
3329 struct btrfs_block_rsv *block_rsv;
3330 struct btrfs_fs_info *fs_info = root->fs_info;
3331 u64 alloc_target;
3332
3333 block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
3334 if (!block_rsv)
3335 return NULL;
3336
3337 btrfs_init_block_rsv(block_rsv);
3338
3339 alloc_target = btrfs_get_alloc_profile(root, 0);
3340 block_rsv->space_info = __find_space_info(fs_info,
3341 BTRFS_BLOCK_GROUP_METADATA);
3342
3343 return block_rsv;
3344}
3345
3346void btrfs_free_block_rsv(struct btrfs_root *root,
3347 struct btrfs_block_rsv *rsv)
3348{
3349 if (rsv && atomic_dec_and_test(&rsv->usage)) {
3350 btrfs_block_rsv_release(root, rsv, (u64)-1);
3351 if (!rsv->durable)
3352 kfree(rsv);
3353 }
3354}
3355
3356/*
3357 * make the block_rsv struct be able to capture freed space.
3358 * the captured space will be re-added to the block_rsv struct
3359 * after transaction commit
3360 */
3361void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
3362 struct btrfs_block_rsv *block_rsv)
3363{
3364 block_rsv->durable = 1;
3365 mutex_lock(&fs_info->durable_block_rsv_mutex);
3366 list_add_tail(&block_rsv->list, &fs_info->durable_block_rsv_list);
3367 mutex_unlock(&fs_info->durable_block_rsv_mutex);
3368}
3369
3370int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
3371 struct btrfs_root *root,
3372 struct btrfs_block_rsv *block_rsv,
3373 u64 num_bytes, int *retries)
3374{
3375 int ret;
3376
3377 if (num_bytes == 0)
3378 return 0;
3379again:
3380 ret = reserve_metadata_bytes(block_rsv, num_bytes);
3381 if (!ret) {
3382 block_rsv_add_bytes(block_rsv, num_bytes, 1);
3383 return 0;
3384 }
3385
3386 ret = should_retry_reserve(trans, root, block_rsv, num_bytes, retries);
3387 if (ret > 0)
3388 goto again;
3389
3390 return ret;
3391}
3392
3393int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
3394 struct btrfs_root *root,
3395 struct btrfs_block_rsv *block_rsv,
3396 u64 min_reserved, int min_factor)
3397{
3398 u64 num_bytes = 0;
3399 int commit_trans = 0;
3400 int ret = -ENOSPC;
3401
3402 if (!block_rsv)
3403 return 0;
3404
3405 spin_lock(&block_rsv->lock);
3406 if (min_factor > 0)
3407 num_bytes = div_factor(block_rsv->size, min_factor);
3408 if (min_reserved > num_bytes)
3409 num_bytes = min_reserved;
3410
3411 if (block_rsv->reserved >= num_bytes) {
3412 ret = 0;
3413 } else {
3414 num_bytes -= block_rsv->reserved;
3415 if (block_rsv->durable &&
3416 block_rsv->freed[0] + block_rsv->freed[1] >= num_bytes)
3417 commit_trans = 1;
3418 }
3419 spin_unlock(&block_rsv->lock);
3420 if (!ret)
3421 return 0;
3422
3423 if (block_rsv->refill_used) {
3424 ret = reserve_metadata_bytes(block_rsv, num_bytes);
3425 if (!ret) {
3426 block_rsv_add_bytes(block_rsv, num_bytes, 0);
3427 return 0;
3428 }
3429 }
3430
3431 if (commit_trans) {
3432 if (trans)
3433 return -EAGAIN;
3434
3435 trans = btrfs_join_transaction(root, 1);
3436 BUG_ON(IS_ERR(trans));
3437 ret = btrfs_commit_transaction(trans, root);
3438 return 0;
3439 }
3440
3441 WARN_ON(1);
3442 printk(KERN_INFO"block_rsv size %llu reserved %llu freed %llu %llu\n",
3443 block_rsv->size, block_rsv->reserved,
3444 block_rsv->freed[0], block_rsv->freed[1]);
3445
3446 return -ENOSPC;
3447}
3448
3449int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
3450 struct btrfs_block_rsv *dst_rsv,
3451 u64 num_bytes)
3452{
3453 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
3454}
3455
3456void btrfs_block_rsv_release(struct btrfs_root *root,
3457 struct btrfs_block_rsv *block_rsv,
3458 u64 num_bytes)
3459{
3460 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
3461 if (global_rsv->full || global_rsv == block_rsv ||
3462 block_rsv->space_info != global_rsv->space_info)
3463 global_rsv = NULL;
3464 block_rsv_release_bytes(block_rsv, global_rsv, num_bytes);
3465}
3466
3467/*
3468 * helper to calculate the size of the global block reservation.
3469 * the desired value is the sum of the space used by the extent tree,
3470 * the checksum tree and the root tree
3471 */
3472static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
3473{
3474 struct btrfs_space_info *sinfo;
3475 u64 num_bytes;
3476 u64 meta_used;
3477 u64 data_used;
3478 int csum_size = btrfs_super_csum_size(&fs_info->super_copy);
3479#if 0
3480 /*
3481 * per tree used space accounting can be inaccurate, so we
3482 * can't rely on it.
3483 */
3484 spin_lock(&fs_info->extent_root->accounting_lock);
3485 num_bytes = btrfs_root_used(&fs_info->extent_root->root_item);
3486 spin_unlock(&fs_info->extent_root->accounting_lock);
3487
3488 spin_lock(&fs_info->csum_root->accounting_lock);
3489 num_bytes += btrfs_root_used(&fs_info->csum_root->root_item);
3490 spin_unlock(&fs_info->csum_root->accounting_lock);
3491
3492 spin_lock(&fs_info->tree_root->accounting_lock);
3493 num_bytes += btrfs_root_used(&fs_info->tree_root->root_item);
3494 spin_unlock(&fs_info->tree_root->accounting_lock);
3495#endif
3496 sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
3497 spin_lock(&sinfo->lock);
3498 data_used = sinfo->bytes_used;
3499 spin_unlock(&sinfo->lock);
3500
3501 sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
3502 spin_lock(&sinfo->lock);
3503 meta_used = sinfo->bytes_used;
3504 spin_unlock(&sinfo->lock);
3505
3506 num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) *
3507 csum_size * 2;
3508 num_bytes += div64_u64(data_used + meta_used, 50);
3509
3510 if (num_bytes * 3 > meta_used)
3511 num_bytes = div64_u64(meta_used, 3);
3512
3513 return ALIGN(num_bytes, fs_info->extent_root->leafsize << 10);
3514}
3515
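Reading the formula out of calc_global_metadata_size(): two copies of a checksum for every data block, plus 2% of all used space, capped at one third of the metadata in use and rounded up to leafsize << 10. A standalone sketch with assumed parameters (4 KiB blocks, 4-byte crc32c checksums, 4 KiB leaves):

#include <stdint.h>
#include <stdio.h>

#define TOY_ALIGN(x, a) (((x) + (a) - 1) / (a) * (a))

static uint64_t global_rsv_size(uint64_t data_used, uint64_t meta_used)
{
        int blocksize_bits = 12, csum_size = 4;
        uint64_t leafsize = 4096;
        uint64_t num_bytes;

        /* two copies of every data block's checksum ... */
        num_bytes = (data_used >> blocksize_bits) * csum_size * 2;
        /* ... plus 2% of everything that is already used */
        num_bytes += (data_used + meta_used) / 50;

        /* never ask for more than a third of the metadata in use */
        if (num_bytes * 3 > meta_used)
                num_bytes = meta_used / 3;

        return TOY_ALIGN(num_bytes, leafsize << 10);
}

int main(void)
{
        uint64_t gib = 1024ULL * 1024 * 1024;
        printf("%llu MiB\n",
               (unsigned long long)(global_rsv_size(100 * gib, 4 * gib) >> 20));
        return 0;
}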
3516static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
3517{
3518 struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
3519 struct btrfs_space_info *sinfo = block_rsv->space_info;
3520 u64 num_bytes;
3521
3522 num_bytes = calc_global_metadata_size(fs_info);
3523
3524 spin_lock(&block_rsv->lock);
3525 spin_lock(&sinfo->lock);
3526
3527 block_rsv->size = num_bytes;
3528
3529 num_bytes = sinfo->bytes_used + sinfo->bytes_pinned +
3530 sinfo->bytes_reserved + sinfo->bytes_readonly;
3531
3532 if (sinfo->total_bytes > num_bytes) {
3533 num_bytes = sinfo->total_bytes - num_bytes;
3534 block_rsv->reserved += num_bytes;
3535 sinfo->bytes_reserved += num_bytes;
3536 }
3537
3538 if (block_rsv->reserved >= block_rsv->size) {
3539 num_bytes = block_rsv->reserved - block_rsv->size;
3540 sinfo->bytes_reserved -= num_bytes;
3541 block_rsv->reserved = block_rsv->size;
3542 block_rsv->full = 1;
3543 }
3544#if 0
3545 printk(KERN_INFO"global block rsv size %llu reserved %llu\n",
3546 block_rsv->size, block_rsv->reserved);
3547#endif
3548 spin_unlock(&sinfo->lock);
3549 spin_unlock(&block_rsv->lock);
3550}
3551
3552static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
3553{
3554 struct btrfs_space_info *space_info;
3555
3556 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
3557 fs_info->chunk_block_rsv.space_info = space_info;
3558 fs_info->chunk_block_rsv.priority = 10;
3559
3560 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
3561 fs_info->global_block_rsv.space_info = space_info;
3562 fs_info->global_block_rsv.priority = 10;
3563 fs_info->global_block_rsv.refill_used = 1;
3564 fs_info->delalloc_block_rsv.space_info = space_info;
3565 fs_info->trans_block_rsv.space_info = space_info;
3566 fs_info->empty_block_rsv.space_info = space_info;
3567 fs_info->empty_block_rsv.priority = 10;
3568
3569 fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
3570 fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
3571 fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
3572 fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
3573 fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
3574
3575 btrfs_add_durable_block_rsv(fs_info, &fs_info->global_block_rsv);
3576
3577 btrfs_add_durable_block_rsv(fs_info, &fs_info->delalloc_block_rsv);
3578
3579 update_global_block_rsv(fs_info);
3580}
3581
3582static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
3583{
3584 block_rsv_release_bytes(&fs_info->global_block_rsv, NULL, (u64)-1);
3585 WARN_ON(fs_info->delalloc_block_rsv.size > 0);
3586 WARN_ON(fs_info->delalloc_block_rsv.reserved > 0);
3587 WARN_ON(fs_info->trans_block_rsv.size > 0);
3588 WARN_ON(fs_info->trans_block_rsv.reserved > 0);
3589 WARN_ON(fs_info->chunk_block_rsv.size > 0);
3590 WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
3591}
3592
3593static u64 calc_trans_metadata_size(struct btrfs_root *root, int num_items)
3594{
3595 return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) *
3596 3 * num_items;
3597}
3598
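calc_trans_metadata_size() charges a full tree path, one leaf plus one node per remaining level, times a factor of three, per item. With 4 KiB leaves and nodes and BTRFS_MAX_LEVEL = 8, that is (4096 + 4096 * 7) * 3 = 96 KiB per item, so the four items reserved for orphan handling below come to 384 KiB. As a sketch (the level constant is assumed):

#include <stdint.h>
#include <stdio.h>

#define TOY_MAX_LEVEL 8

static uint64_t trans_metadata_size(uint64_t leafsize, uint64_t nodesize,
                                    int num_items)
{
        /* a full path (leaf + one node per upper level), x3, per item */
        return (leafsize + nodesize * (TOY_MAX_LEVEL - 1)) * 3 * num_items;
}

int main(void)
{
        printf("%llu KiB\n",
               (unsigned long long)(trans_metadata_size(4096, 4096, 4) >> 10));
        return 0;   /* 384 KiB for the 4 items of the orphan reservation */
}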
3599int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
3600 struct btrfs_root *root,
3601 int num_items, int *retries)
3602{
3603 u64 num_bytes;
3604 int ret;
3605
3606 if (num_items == 0 || root->fs_info->chunk_root == root)
3607 return 0;
3608
3609 num_bytes = calc_trans_metadata_size(root, num_items);
3610 ret = btrfs_block_rsv_add(trans, root, &root->fs_info->trans_block_rsv,
3611 num_bytes, retries);
3612 if (!ret) {
3613 trans->bytes_reserved += num_bytes;
3614 trans->block_rsv = &root->fs_info->trans_block_rsv;
3615 }
3616 return ret;
3617}
3618
3619void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
3620 struct btrfs_root *root)
3621{
3622 if (!trans->bytes_reserved)
3623 return;
3624
3625 BUG_ON(trans->block_rsv != &root->fs_info->trans_block_rsv);
3626 btrfs_block_rsv_release(root, trans->block_rsv,
3627 trans->bytes_reserved);
3628 trans->bytes_reserved = 0;
3629}
3630
3631int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
3632 struct inode *inode)
3633{
3634 struct btrfs_root *root = BTRFS_I(inode)->root;
3635 struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
3636 struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv;
3637
3638 /*
3639 * one for deleting the orphan item, one for updating the inode and
3640 * two for calling btrfs_truncate_inode_items.
3641 *
3642 * btrfs_truncate_inode_items is a delete operation; it frees
3643 * more space than it uses in most cases. So two units of
3644 * metadata space should be enough for calling it many times.
3645 * If all of the metadata space is used, we can commit the
3646 * transaction and use the space it freed.
3647 */
3648 u64 num_bytes = calc_trans_metadata_size(root, 4);
3649 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
3650}
3651
3652void btrfs_orphan_release_metadata(struct inode *inode)
3653{
3654 struct btrfs_root *root = BTRFS_I(inode)->root;
3655 u64 num_bytes = calc_trans_metadata_size(root, 4);
3656 btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
3657}
3658
3659int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
3660 struct btrfs_pending_snapshot *pending)
3661{
3662 struct btrfs_root *root = pending->root;
3663 struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
3664 struct btrfs_block_rsv *dst_rsv = &pending->block_rsv;
3665 /*
3666 * two for root back/forward refs, two for directory entries
3667 * and one for the root of the snapshot.
3668 */
3669 u64 num_bytes = calc_trans_metadata_size(root, 5);
3670 dst_rsv->space_info = src_rsv->space_info;
3671 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
3672}
3673
3674static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes)
3675{
3676 return num_bytes >>= 3;
3677}
3678
3679int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
3680{
3681 struct btrfs_root *root = BTRFS_I(inode)->root;
3682 struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
3683 u64 to_reserve;
3684 int nr_extents;
3685 int retries = 0;
3686 int ret;
3687
3688 if (btrfs_transaction_in_commit(root->fs_info))
3689 schedule_timeout(1);
3690
3691 num_bytes = ALIGN(num_bytes, root->sectorsize);
3692again:
3693 spin_lock(&BTRFS_I(inode)->accounting_lock);
3694 nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents) + 1;
3695 if (nr_extents > BTRFS_I(inode)->reserved_extents) {
3696 nr_extents -= BTRFS_I(inode)->reserved_extents;
3697 to_reserve = calc_trans_metadata_size(root, nr_extents);
3698 } else {
3699 nr_extents = 0;
3700 to_reserve = 0;
3701 }
3702
3703 to_reserve += calc_csum_metadata_size(inode, num_bytes);
3704 ret = reserve_metadata_bytes(block_rsv, to_reserve);
3705 if (ret) {
3706 spin_unlock(&BTRFS_I(inode)->accounting_lock);
3707 ret = should_retry_reserve(NULL, root, block_rsv, to_reserve,
3708 &retries);
3709 if (ret > 0)
3710 goto again;
3711 return ret;
3712 }
3713
3714 BTRFS_I(inode)->reserved_extents += nr_extents;
3715 atomic_inc(&BTRFS_I(inode)->outstanding_extents);
3716 spin_unlock(&BTRFS_I(inode)->accounting_lock);
3717
3718 block_rsv_add_bytes(block_rsv, to_reserve, 1);
3719
3720 if (block_rsv->size > 512 * 1024 * 1024)
3721 shrink_delalloc(NULL, root, to_reserve);
3722
3723 return 0;
3724}
3725
3726void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
3727{
3728 struct btrfs_root *root = BTRFS_I(inode)->root;
3729 u64 to_free;
3730 int nr_extents;
3731
3732 num_bytes = ALIGN(num_bytes, root->sectorsize);
3733 atomic_dec(&BTRFS_I(inode)->outstanding_extents);
3734
3735 spin_lock(&BTRFS_I(inode)->accounting_lock);
3736 nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents);
3737 if (nr_extents < BTRFS_I(inode)->reserved_extents) {
3738 nr_extents = BTRFS_I(inode)->reserved_extents - nr_extents;
3739 BTRFS_I(inode)->reserved_extents -= nr_extents;
3740 } else {
3741 nr_extents = 0;
3742 }
3743 spin_unlock(&BTRFS_I(inode)->accounting_lock);
3744
3745 to_free = calc_csum_metadata_size(inode, num_bytes);
3746 if (nr_extents > 0)
3747 to_free += calc_trans_metadata_size(root, nr_extents);
3748
3749 btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
3750 to_free);
3751}
3752
3753int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
3754{
3755 int ret;
3756
3757 ret = btrfs_check_data_free_space(inode, num_bytes);
3758 if (ret)
3759 return ret;
3760
3761 ret = btrfs_delalloc_reserve_metadata(inode, num_bytes);
3762 if (ret) {
3763 btrfs_free_reserved_data_space(inode, num_bytes);
3764 return ret;
3765 }
3766
3767 return 0;
3768}
3769
3770void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
3771{
3772 btrfs_delalloc_release_metadata(inode, num_bytes);
3773 btrfs_free_reserved_data_space(inode, num_bytes);
3774}
3775
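btrfs_delalloc_reserve_space() and its release counterpart make the data and metadata reservations an all-or-nothing pair: data first, then metadata, undoing the data half when the metadata half fails. The metadata side is sized at roughly 1/8 of the data bytes for checksums (calc_csum_metadata_size() above is num_bytes >> 3) plus per-extent overhead. The same acquire-both-or-neither shape in a generic sketch:

#include <stdio.h>

static long data_pool = 1000, meta_pool = 50;

static int take(long *pool, long n) { if (*pool < n) return -1; *pool -= n; return 0; }
static void put(long *pool, long n) { *pool += n; }

/* acquire both or neither: undo the first if the second fails */
static int reserve_space(long bytes)
{
        if (take(&data_pool, bytes))
                return -1;
        if (take(&meta_pool, bytes / 8)) {    /* ~1/8 extra for csums */
                put(&data_pool, bytes);
                return -1;
        }
        return 0;
}

static void release_space(long bytes)
{
        put(&meta_pool, bytes / 8);
        put(&data_pool, bytes);
}

int main(void)
{
        printf("first: %d\n", reserve_space(300));   /* ok: 37 meta left over */
        printf("second: %d\n", reserve_space(300));  /* meta exhausted: rolled back */
        release_space(300);
        printf("data=%ld meta=%ld\n", data_pool, meta_pool);
        return 0;
}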
3464static int update_block_group(struct btrfs_trans_handle *trans, 3776static int update_block_group(struct btrfs_trans_handle *trans,
3465 struct btrfs_root *root, 3777 struct btrfs_root *root,
3466 u64 bytenr, u64 num_bytes, int alloc, 3778 u64 bytenr, u64 num_bytes, int alloc)
3467 int mark_free)
3468{ 3779{
3469 struct btrfs_block_group_cache *cache; 3780 struct btrfs_block_group_cache *cache;
3470 struct btrfs_fs_info *info = root->fs_info; 3781 struct btrfs_fs_info *info = root->fs_info;
3782 int factor;
3471 u64 total = num_bytes; 3783 u64 total = num_bytes;
3472 u64 old_val; 3784 u64 old_val;
3473 u64 byte_in_group; 3785 u64 byte_in_group;
@@ -3486,6 +3798,12 @@ static int update_block_group(struct btrfs_trans_handle *trans,
3486 cache = btrfs_lookup_block_group(info, bytenr); 3798 cache = btrfs_lookup_block_group(info, bytenr);
3487 if (!cache) 3799 if (!cache)
3488 return -1; 3800 return -1;
3801 if (cache->flags & (BTRFS_BLOCK_GROUP_DUP |
3802 BTRFS_BLOCK_GROUP_RAID1 |
3803 BTRFS_BLOCK_GROUP_RAID10))
3804 factor = 2;
3805 else
3806 factor = 1;
3489 byte_in_group = bytenr - cache->key.objectid; 3807 byte_in_group = bytenr - cache->key.objectid;
3490 WARN_ON(byte_in_group > cache->key.offset); 3808 WARN_ON(byte_in_group > cache->key.offset);
3491 3809
@@ -3498,31 +3816,24 @@ static int update_block_group(struct btrfs_trans_handle *trans,
3498 old_val += num_bytes; 3816 old_val += num_bytes;
3499 btrfs_set_block_group_used(&cache->item, old_val); 3817 btrfs_set_block_group_used(&cache->item, old_val);
3500 cache->reserved -= num_bytes; 3818 cache->reserved -= num_bytes;
3501 cache->space_info->bytes_used += num_bytes;
3502 cache->space_info->bytes_reserved -= num_bytes; 3819 cache->space_info->bytes_reserved -= num_bytes;
3503 if (cache->ro) 3820 cache->space_info->bytes_used += num_bytes;
3504 cache->space_info->bytes_readonly -= num_bytes; 3821 cache->space_info->disk_used += num_bytes * factor;
3505 spin_unlock(&cache->lock); 3822 spin_unlock(&cache->lock);
3506 spin_unlock(&cache->space_info->lock); 3823 spin_unlock(&cache->space_info->lock);
3507 } else { 3824 } else {
3508 old_val -= num_bytes; 3825 old_val -= num_bytes;
3509 cache->space_info->bytes_used -= num_bytes;
3510 if (cache->ro)
3511 cache->space_info->bytes_readonly += num_bytes;
3512 btrfs_set_block_group_used(&cache->item, old_val); 3826 btrfs_set_block_group_used(&cache->item, old_val);
3827 cache->pinned += num_bytes;
3828 cache->space_info->bytes_pinned += num_bytes;
3829 cache->space_info->bytes_used -= num_bytes;
3830 cache->space_info->disk_used -= num_bytes * factor;
3513 spin_unlock(&cache->lock); 3831 spin_unlock(&cache->lock);
3514 spin_unlock(&cache->space_info->lock); 3832 spin_unlock(&cache->space_info->lock);
3515 if (mark_free) {
3516 int ret;
3517
3518 ret = btrfs_discard_extent(root, bytenr,
3519 num_bytes);
3520 WARN_ON(ret);
3521 3833
3522 ret = btrfs_add_free_space(cache, bytenr, 3834 set_extent_dirty(info->pinned_extents,
3523 num_bytes); 3835 bytenr, bytenr + num_bytes - 1,
3524 WARN_ON(ret); 3836 GFP_NOFS | __GFP_NOFAIL);
3525 }
3526 } 3837 }
3527 btrfs_put_block_group(cache); 3838 btrfs_put_block_group(cache);
3528 total -= num_bytes; 3839 total -= num_bytes;
@@ -3546,18 +3857,10 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
3546 return bytenr; 3857 return bytenr;
3547} 3858}
3548 3859
3549/* 3860static int pin_down_extent(struct btrfs_root *root,
3550 * this function must be called within transaction 3861 struct btrfs_block_group_cache *cache,
3551 */ 3862 u64 bytenr, u64 num_bytes, int reserved)
3552int btrfs_pin_extent(struct btrfs_root *root,
3553 u64 bytenr, u64 num_bytes, int reserved)
3554{ 3863{
3555 struct btrfs_fs_info *fs_info = root->fs_info;
3556 struct btrfs_block_group_cache *cache;
3557
3558 cache = btrfs_lookup_block_group(fs_info, bytenr);
3559 BUG_ON(!cache);
3560
3561 spin_lock(&cache->space_info->lock); 3864 spin_lock(&cache->space_info->lock);
3562 spin_lock(&cache->lock); 3865 spin_lock(&cache->lock);
3563 cache->pinned += num_bytes; 3866 cache->pinned += num_bytes;
@@ -3569,28 +3872,68 @@ int btrfs_pin_extent(struct btrfs_root *root,
3569 spin_unlock(&cache->lock); 3872 spin_unlock(&cache->lock);
3570 spin_unlock(&cache->space_info->lock); 3873 spin_unlock(&cache->space_info->lock);
3571 3874
3572 btrfs_put_block_group(cache); 3875 set_extent_dirty(root->fs_info->pinned_extents, bytenr,
3876 bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
3877 return 0;
3878}
3573 3879
3574 set_extent_dirty(fs_info->pinned_extents, 3880/*
3575 bytenr, bytenr + num_bytes - 1, GFP_NOFS); 3881 * this function must be called within transaction
3882 */
3883int btrfs_pin_extent(struct btrfs_root *root,
3884 u64 bytenr, u64 num_bytes, int reserved)
3885{
3886 struct btrfs_block_group_cache *cache;
3887
3888 cache = btrfs_lookup_block_group(root->fs_info, bytenr);
3889 BUG_ON(!cache);
3890
3891 pin_down_extent(root, cache, bytenr, num_bytes, reserved);
3892
3893 btrfs_put_block_group(cache);
3576 return 0; 3894 return 0;
3577} 3895}
3578 3896
3579static int update_reserved_extents(struct btrfs_block_group_cache *cache, 3897/*
3580 u64 num_bytes, int reserve) 3898 * update size of reserved extents. this function may return -EAGAIN
3899 * when the block group is read-only, which can only happen if 'reserve' is true or 'sinfo' is false.
3900 */
3901static int update_reserved_bytes(struct btrfs_block_group_cache *cache,
3902 u64 num_bytes, int reserve, int sinfo)
3581{ 3903{
3582 spin_lock(&cache->space_info->lock); 3904 int ret = 0;
3583 spin_lock(&cache->lock); 3905 if (sinfo) {
3584 if (reserve) { 3906 struct btrfs_space_info *space_info = cache->space_info;
3585 cache->reserved += num_bytes; 3907 spin_lock(&space_info->lock);
3586 cache->space_info->bytes_reserved += num_bytes; 3908 spin_lock(&cache->lock);
3909 if (reserve) {
3910 if (cache->ro) {
3911 ret = -EAGAIN;
3912 } else {
3913 cache->reserved += num_bytes;
3914 space_info->bytes_reserved += num_bytes;
3915 }
3916 } else {
3917 if (cache->ro)
3918 space_info->bytes_readonly += num_bytes;
3919 cache->reserved -= num_bytes;
3920 space_info->bytes_reserved -= num_bytes;
3921 }
3922 spin_unlock(&cache->lock);
3923 spin_unlock(&space_info->lock);
3587 } else { 3924 } else {
3588 cache->reserved -= num_bytes; 3925 spin_lock(&cache->lock);
3589 cache->space_info->bytes_reserved -= num_bytes; 3926 if (cache->ro) {
3927 ret = -EAGAIN;
3928 } else {
3929 if (reserve)
3930 cache->reserved += num_bytes;
3931 else
3932 cache->reserved -= num_bytes;
3933 }
3934 spin_unlock(&cache->lock);
3590 } 3935 }
3591 spin_unlock(&cache->lock); 3936 return ret;
3592 spin_unlock(&cache->space_info->lock);
3593 return 0;
3594} 3937}
3595 3938
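
The reserve/release rules in update_reserved_bytes() compress to a small
state machine: reserving from a read-only group fails with -EAGAIN, and
releasing into one routes the bytes to bytes_readonly. A user-space
miniature of the sinfo path, with simplified fields rather than the
kernel's struct:

	#include <assert.h>
	#include <errno.h>
	#include <stdint.h>
	#include <stdio.h>

	struct group {
		int ro;			/* block group has been set read-only */
		uint64_t reserved;
		uint64_t bytes_readonly;
	};

	static int update_reserved(struct group *g, uint64_t n, int reserve)
	{
		if (reserve) {
			if (g->ro)
				return -EAGAIN;	/* caller falls back elsewhere */
			g->reserved += n;
		} else {
			if (g->ro)
				g->bytes_readonly += n;	/* space stays unusable */
			assert(g->reserved >= n);
			g->reserved -= n;
		}
		return 0;
	}

	int main(void)
	{
		struct group g = { .ro = 0 };

		update_reserved(&g, 4096, 1);	/* reserve while writable: ok */
		g.ro = 1;			/* group flips to read-only */
		printf("reserve on ro group -> %d\n",
		       update_reserved(&g, 4096, 1));
		return 0;
	}
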
3596int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, 3939int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
@@ -3621,6 +3964,8 @@ int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
3621 fs_info->pinned_extents = &fs_info->freed_extents[0]; 3964 fs_info->pinned_extents = &fs_info->freed_extents[0];
3622 3965
3623 up_write(&fs_info->extent_commit_sem); 3966 up_write(&fs_info->extent_commit_sem);
3967
3968 update_global_block_rsv(fs_info);
3624 return 0; 3969 return 0;
3625} 3970}
3626 3971
@@ -3647,14 +3992,21 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
3647 btrfs_add_free_space(cache, start, len); 3992 btrfs_add_free_space(cache, start, len);
3648 } 3993 }
3649 3994
3995 start += len;
3996
3650 spin_lock(&cache->space_info->lock); 3997 spin_lock(&cache->space_info->lock);
3651 spin_lock(&cache->lock); 3998 spin_lock(&cache->lock);
3652 cache->pinned -= len; 3999 cache->pinned -= len;
3653 cache->space_info->bytes_pinned -= len; 4000 cache->space_info->bytes_pinned -= len;
4001 if (cache->ro) {
4002 cache->space_info->bytes_readonly += len;
4003 } else if (cache->reserved_pinned > 0) {
4004 len = min(len, cache->reserved_pinned);
4005 cache->reserved_pinned -= len;
4006 cache->space_info->bytes_reserved += len;
4007 }
3654 spin_unlock(&cache->lock); 4008 spin_unlock(&cache->lock);
3655 spin_unlock(&cache->space_info->lock); 4009 spin_unlock(&cache->space_info->lock);
3656
3657 start += len;
3658 } 4010 }
3659 4011
3660 if (cache) 4012 if (cache)
@@ -3667,8 +4019,11 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
3667{ 4019{
3668 struct btrfs_fs_info *fs_info = root->fs_info; 4020 struct btrfs_fs_info *fs_info = root->fs_info;
3669 struct extent_io_tree *unpin; 4021 struct extent_io_tree *unpin;
4022 struct btrfs_block_rsv *block_rsv;
4023 struct btrfs_block_rsv *next_rsv;
3670 u64 start; 4024 u64 start;
3671 u64 end; 4025 u64 end;
4026 int idx;
3672 int ret; 4027 int ret;
3673 4028
3674 if (fs_info->pinned_extents == &fs_info->freed_extents[0]) 4029 if (fs_info->pinned_extents == &fs_info->freed_extents[0])
@@ -3689,59 +4044,30 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
3689 cond_resched(); 4044 cond_resched();
3690 } 4045 }
3691 4046
3692 return ret; 4047 mutex_lock(&fs_info->durable_block_rsv_mutex);
3693} 4048 list_for_each_entry_safe(block_rsv, next_rsv,
3694 4049 &fs_info->durable_block_rsv_list, list) {
3695static int pin_down_bytes(struct btrfs_trans_handle *trans,
3696 struct btrfs_root *root,
3697 struct btrfs_path *path,
3698 u64 bytenr, u64 num_bytes,
3699 int is_data, int reserved,
3700 struct extent_buffer **must_clean)
3701{
3702 int err = 0;
3703 struct extent_buffer *buf;
3704 4050
3705 if (is_data) 4051 idx = trans->transid & 0x1;
3706 goto pinit; 4052 if (block_rsv->freed[idx] > 0) {
3707 4053 block_rsv_add_bytes(block_rsv,
3708 /* 4054 block_rsv->freed[idx], 0);
3709 * discard is sloooow, and so triggering discards on 4055 block_rsv->freed[idx] = 0;
3710 * individual btree blocks isn't a good plan. Just 4056 }
3711 * pin everything in discard mode. 4057 if (atomic_read(&block_rsv->usage) == 0) {
3712 */ 4058 btrfs_block_rsv_release(root, block_rsv, (u64)-1);
3713 if (btrfs_test_opt(root, DISCARD))
3714 goto pinit;
3715
3716 buf = btrfs_find_tree_block(root, bytenr, num_bytes);
3717 if (!buf)
3718 goto pinit;
3719 4059
3720 /* we can reuse a block if it hasn't been written 4060 if (block_rsv->freed[0] == 0 &&
3721 * and it is from this transaction. We can't 4061 block_rsv->freed[1] == 0) {
3722 * reuse anything from the tree log root because 4062 list_del_init(&block_rsv->list);
3723 * it has tiny sub-transactions. 4063 kfree(block_rsv);
3724 */ 4064 }
3725 if (btrfs_buffer_uptodate(buf, 0) && 4065 } else {
3726 btrfs_try_tree_lock(buf)) { 4066 btrfs_block_rsv_release(root, block_rsv, 0);
3727 u64 header_owner = btrfs_header_owner(buf);
3728 u64 header_transid = btrfs_header_generation(buf);
3729 if (header_owner != BTRFS_TREE_LOG_OBJECTID &&
3730 header_transid == trans->transid &&
3731 !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
3732 *must_clean = buf;
3733 return 1;
3734 } 4067 }
3735 btrfs_tree_unlock(buf);
3736 } 4068 }
3737 free_extent_buffer(buf); 4069 mutex_unlock(&fs_info->durable_block_rsv_mutex);
3738pinit:
3739 if (path)
3740 btrfs_set_path_blocking(path);
3741 /* unlocks the pinned mutex */
3742 btrfs_pin_extent(root, bytenr, num_bytes, reserved);
3743 4070
3744 BUG_ON(err < 0);
3745 return 0; 4071 return 0;
3746} 4072}
3747 4073
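
The freed[idx] handling above relies on a two-slot double buffer keyed by
transaction parity: bytes freed during transaction N accumulate in
freed[N & 0x1] and only rejoin the reservation when that transaction
commits. The same pattern in isolation (names are illustrative):

	#include <stdint.h>
	#include <stdio.h>

	struct rsv {
		uint64_t reserved;
		uint64_t freed[2];	/* indexed by transid & 0x1 */
	};

	static void free_block(struct rsv *r, uint64_t transid, uint64_t len)
	{
		r->freed[transid & 0x1] += len;	/* defer until commit */
	}

	static void commit(struct rsv *r, uint64_t transid)
	{
		int idx = transid & 0x1;

		r->reserved += r->freed[idx];	/* space is reusable now */
		r->freed[idx] = 0;
	}

	int main(void)
	{
		struct rsv r = {0};

		free_block(&r, 7, 4096);	/* freed in transaction 7 */
		commit(&r, 7);
		printf("reserved=%llu\n", (unsigned long long)r.reserved);
		return 0;
	}
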
@@ -3902,9 +4228,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
3902 BUG_ON(ret); 4228 BUG_ON(ret);
3903 } 4229 }
3904 } else { 4230 } else {
3905 int mark_free = 0;
3906 struct extent_buffer *must_clean = NULL;
3907
3908 if (found_extent) { 4231 if (found_extent) {
3909 BUG_ON(is_data && refs_to_drop != 4232 BUG_ON(is_data && refs_to_drop !=
3910 extent_data_ref_count(root, path, iref)); 4233 extent_data_ref_count(root, path, iref));
@@ -3917,31 +4240,11 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
3917 } 4240 }
3918 } 4241 }
3919 4242
3920 ret = pin_down_bytes(trans, root, path, bytenr,
3921 num_bytes, is_data, 0, &must_clean);
3922 if (ret > 0)
3923 mark_free = 1;
3924 BUG_ON(ret < 0);
3925 /*
3926 * it is going to be very rare for someone to be waiting
3927 * on the block we're freeing. del_items might need to
3928 * schedule, so rather than get fancy, just force it
3929 * to blocking here
3930 */
3931 if (must_clean)
3932 btrfs_set_lock_blocking(must_clean);
3933
3934 ret = btrfs_del_items(trans, extent_root, path, path->slots[0], 4243 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
3935 num_to_del); 4244 num_to_del);
3936 BUG_ON(ret); 4245 BUG_ON(ret);
3937 btrfs_release_path(extent_root, path); 4246 btrfs_release_path(extent_root, path);
3938 4247
3939 if (must_clean) {
3940 clean_tree_block(NULL, root, must_clean);
3941 btrfs_tree_unlock(must_clean);
3942 free_extent_buffer(must_clean);
3943 }
3944
3945 if (is_data) { 4248 if (is_data) {
3946 ret = btrfs_del_csums(trans, root, bytenr, num_bytes); 4249 ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
3947 BUG_ON(ret); 4250 BUG_ON(ret);
@@ -3951,8 +4254,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
3951 (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT); 4254 (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT);
3952 } 4255 }
3953 4256
3954 ret = update_block_group(trans, root, bytenr, num_bytes, 0, 4257 ret = update_block_group(trans, root, bytenr, num_bytes, 0);
3955 mark_free);
3956 BUG_ON(ret); 4258 BUG_ON(ret);
3957 } 4259 }
3958 btrfs_free_path(path); 4260 btrfs_free_path(path);
@@ -3960,7 +4262,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
3960} 4262}
3961 4263
3962/* 4264/*
3963 * when we free an extent, it is possible (and likely) that we free the last 4265 * when we free a block, it is possible (and likely) that we free the last
3964 * delayed ref for that extent as well. This searches the delayed ref tree for 4266 * delayed ref for that extent as well. This searches the delayed ref tree for
3965 * a given extent, and if there are no other delayed refs to be processed, it 4267 * a given extent, and if there are no other delayed refs to be processed, it
3966 * removes it from the tree. 4268 * removes it from the tree.
@@ -3972,7 +4274,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
3972 struct btrfs_delayed_ref_root *delayed_refs; 4274 struct btrfs_delayed_ref_root *delayed_refs;
3973 struct btrfs_delayed_ref_node *ref; 4275 struct btrfs_delayed_ref_node *ref;
3974 struct rb_node *node; 4276 struct rb_node *node;
3975 int ret; 4277 int ret = 0;
3976 4278
3977 delayed_refs = &trans->transaction->delayed_refs; 4279 delayed_refs = &trans->transaction->delayed_refs;
3978 spin_lock(&delayed_refs->lock); 4280 spin_lock(&delayed_refs->lock);
@@ -4024,17 +4326,100 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
4024 list_del_init(&head->cluster); 4326 list_del_init(&head->cluster);
4025 spin_unlock(&delayed_refs->lock); 4327 spin_unlock(&delayed_refs->lock);
4026 4328
4027 ret = run_one_delayed_ref(trans, root->fs_info->tree_root, 4329 BUG_ON(head->extent_op);
4028 &head->node, head->extent_op, 4330 if (head->must_insert_reserved)
4029 head->must_insert_reserved); 4331 ret = 1;
4030 BUG_ON(ret); 4332
4333 mutex_unlock(&head->mutex);
4031 btrfs_put_delayed_ref(&head->node); 4334 btrfs_put_delayed_ref(&head->node);
4032 return 0; 4335 return ret;
4033out: 4336out:
4034 spin_unlock(&delayed_refs->lock); 4337 spin_unlock(&delayed_refs->lock);
4035 return 0; 4338 return 0;
4036} 4339}
4037 4340
4341void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
4342 struct btrfs_root *root,
4343 struct extent_buffer *buf,
4344 u64 parent, int last_ref)
4345{
4346 struct btrfs_block_rsv *block_rsv;
4347 struct btrfs_block_group_cache *cache = NULL;
4348 int ret;
4349
4350 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
4351 ret = btrfs_add_delayed_tree_ref(trans, buf->start, buf->len,
4352 parent, root->root_key.objectid,
4353 btrfs_header_level(buf),
4354 BTRFS_DROP_DELAYED_REF, NULL);
4355 BUG_ON(ret);
4356 }
4357
4358 if (!last_ref)
4359 return;
4360
4361 block_rsv = get_block_rsv(trans, root);
4362 cache = btrfs_lookup_block_group(root->fs_info, buf->start);
4363 if (block_rsv->space_info != cache->space_info)
4364 goto out;
4365
4366 if (btrfs_header_generation(buf) == trans->transid) {
4367 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
4368 ret = check_ref_cleanup(trans, root, buf->start);
4369 if (!ret)
4370 goto pin;
4371 }
4372
4373 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
4374 pin_down_extent(root, cache, buf->start, buf->len, 1);
4375 goto pin;
4376 }
4377
4378 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
4379
4380 btrfs_add_free_space(cache, buf->start, buf->len);
4381 ret = update_reserved_bytes(cache, buf->len, 0, 0);
4382 if (ret == -EAGAIN) {
4383 /* block group became read-only */
4384 update_reserved_bytes(cache, buf->len, 0, 1);
4385 goto out;
4386 }
4387
4388 ret = 1;
4389 spin_lock(&block_rsv->lock);
4390 if (block_rsv->reserved < block_rsv->size) {
4391 block_rsv->reserved += buf->len;
4392 ret = 0;
4393 }
4394 spin_unlock(&block_rsv->lock);
4395
4396 if (ret) {
4397 spin_lock(&cache->space_info->lock);
4398 cache->space_info->bytes_reserved -= buf->len;
4399 spin_unlock(&cache->space_info->lock);
4400 }
4401 goto out;
4402 }
4403pin:
4404 if (block_rsv->durable && !cache->ro) {
4405 ret = 0;
4406 spin_lock(&cache->lock);
4407 if (!cache->ro) {
4408 cache->reserved_pinned += buf->len;
4409 ret = 1;
4410 }
4411 spin_unlock(&cache->lock);
4412
4413 if (ret) {
4414 spin_lock(&block_rsv->lock);
4415 block_rsv->freed[trans->transid & 0x1] += buf->len;
4416 spin_unlock(&block_rsv->lock);
4417 }
4418 }
4419out:
4420 btrfs_put_block_group(cache);
4421}
4422
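
btrfs_free_tree_block() above picks between immediate reuse and pinning
until commit. Reduced to a pure decision function — a user-space toy
where the enum and parameter names are illustrative, and the real code
additionally consults the block reservation:

	#include <stdio.h>

	enum disposition { REUSE_NOW, PIN_UNTIL_COMMIT };

	static enum disposition classify(int same_transid, int written,
					 int has_other_refs)
	{
		if (!same_transid)	/* older block: must survive until commit */
			return PIN_UNTIL_COMMIT;
		if (has_other_refs)	/* delayed refs still outstanding */
			return PIN_UNTIL_COMMIT;
		if (written)		/* already on disk this transaction */
			return PIN_UNTIL_COMMIT;
		return REUSE_NOW;	/* never written: space reusable at once */
	}

	int main(void)
	{
		printf("fresh unwritten block -> %s\n",
		       classify(1, 0, 0) == REUSE_NOW ? "reuse" : "pin");
		return 0;
	}
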
4038int btrfs_free_extent(struct btrfs_trans_handle *trans, 4423int btrfs_free_extent(struct btrfs_trans_handle *trans,
4039 struct btrfs_root *root, 4424 struct btrfs_root *root,
4040 u64 bytenr, u64 num_bytes, u64 parent, 4425 u64 bytenr, u64 num_bytes, u64 parent,
@@ -4056,8 +4441,6 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
4056 parent, root_objectid, (int)owner, 4441 parent, root_objectid, (int)owner,
4057 BTRFS_DROP_DELAYED_REF, NULL); 4442 BTRFS_DROP_DELAYED_REF, NULL);
4058 BUG_ON(ret); 4443 BUG_ON(ret);
4059 ret = check_ref_cleanup(trans, root, bytenr);
4060 BUG_ON(ret);
4061 } else { 4444 } else {
4062 ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes, 4445 ret = btrfs_add_delayed_data_ref(trans, bytenr, num_bytes,
4063 parent, root_objectid, owner, 4446 parent, root_objectid, owner,
@@ -4067,21 +4450,6 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
4067 return ret; 4450 return ret;
4068} 4451}
4069 4452
4070int btrfs_free_tree_block(struct btrfs_trans_handle *trans,
4071 struct btrfs_root *root,
4072 u64 bytenr, u32 blocksize,
4073 u64 parent, u64 root_objectid, int level)
4074{
4075 u64 used;
4076 spin_lock(&root->node_lock);
4077 used = btrfs_root_used(&root->root_item) - blocksize;
4078 btrfs_set_root_used(&root->root_item, used);
4079 spin_unlock(&root->node_lock);
4080
4081 return btrfs_free_extent(trans, root, bytenr, blocksize,
4082 parent, root_objectid, level, 0);
4083}
4084
4085static u64 stripe_align(struct btrfs_root *root, u64 val) 4453static u64 stripe_align(struct btrfs_root *root, u64 val)
4086{ 4454{
4087 u64 mask = ((u64)root->stripesize - 1); 4455 u64 mask = ((u64)root->stripesize - 1);
@@ -4134,6 +4502,22 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
4134 return 0; 4502 return 0;
4135} 4503}
4136 4504
4505static int get_block_group_index(struct btrfs_block_group_cache *cache)
4506{
4507 int index;
4508 if (cache->flags & BTRFS_BLOCK_GROUP_RAID10)
4509 index = 0;
4510 else if (cache->flags & BTRFS_BLOCK_GROUP_RAID1)
4511 index = 1;
4512 else if (cache->flags & BTRFS_BLOCK_GROUP_DUP)
4513 index = 2;
4514 else if (cache->flags & BTRFS_BLOCK_GROUP_RAID0)
4515 index = 3;
4516 else
4517 index = 4;
4518 return index;
4519}
4520
4137enum btrfs_loop_type { 4521enum btrfs_loop_type {
4138 LOOP_FIND_IDEAL = 0, 4522 LOOP_FIND_IDEAL = 0,
4139 LOOP_CACHING_NOWAIT = 1, 4523 LOOP_CACHING_NOWAIT = 1,
@@ -4155,7 +4539,6 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
4155 u64 num_bytes, u64 empty_size, 4539 u64 num_bytes, u64 empty_size,
4156 u64 search_start, u64 search_end, 4540 u64 search_start, u64 search_end,
4157 u64 hint_byte, struct btrfs_key *ins, 4541 u64 hint_byte, struct btrfs_key *ins,
4158 u64 exclude_start, u64 exclude_nr,
4159 int data) 4542 int data)
4160{ 4543{
4161 int ret = 0; 4544 int ret = 0;
@@ -4168,6 +4551,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
4168 struct btrfs_space_info *space_info; 4551 struct btrfs_space_info *space_info;
4169 int last_ptr_loop = 0; 4552 int last_ptr_loop = 0;
4170 int loop = 0; 4553 int loop = 0;
4554 int index = 0;
4171 bool found_uncached_bg = false; 4555 bool found_uncached_bg = false;
4172 bool failed_cluster_refill = false; 4556 bool failed_cluster_refill = false;
4173 bool failed_alloc = false; 4557 bool failed_alloc = false;
@@ -4237,6 +4621,7 @@ ideal_cache:
4237 btrfs_put_block_group(block_group); 4621 btrfs_put_block_group(block_group);
4238 up_read(&space_info->groups_sem); 4622 up_read(&space_info->groups_sem);
4239 } else { 4623 } else {
4624 index = get_block_group_index(block_group);
4240 goto have_block_group; 4625 goto have_block_group;
4241 } 4626 }
4242 } else if (block_group) { 4627 } else if (block_group) {
@@ -4245,7 +4630,8 @@ ideal_cache:
4245 } 4630 }
4246search: 4631search:
4247 down_read(&space_info->groups_sem); 4632 down_read(&space_info->groups_sem);
4248 list_for_each_entry(block_group, &space_info->block_groups, list) { 4633 list_for_each_entry(block_group, &space_info->block_groups[index],
4634 list) {
4249 u64 offset; 4635 u64 offset;
4250 int cached; 4636 int cached;
4251 4637
@@ -4436,23 +4822,22 @@ checks:
4436 goto loop; 4822 goto loop;
4437 } 4823 }
4438 4824
4439 if (exclude_nr > 0 && 4825 ins->objectid = search_start;
4440 (search_start + num_bytes > exclude_start && 4826 ins->offset = num_bytes;
4441 search_start < exclude_start + exclude_nr)) { 4827
4442 search_start = exclude_start + exclude_nr; 4828 if (offset < search_start)
4829 btrfs_add_free_space(block_group, offset,
4830 search_start - offset);
4831 BUG_ON(offset > search_start);
4443 4832
4833 ret = update_reserved_bytes(block_group, num_bytes, 1,
4834 (data & BTRFS_BLOCK_GROUP_DATA));
4835 if (ret == -EAGAIN) {
4444 btrfs_add_free_space(block_group, offset, num_bytes); 4836 btrfs_add_free_space(block_group, offset, num_bytes);
4445 /*
4446 * if search_start is still in this block group
4447 * then we just re-search this block group
4448 */
4449 if (search_start >= block_group->key.objectid &&
4450 search_start < (block_group->key.objectid +
4451 block_group->key.offset))
4452 goto have_block_group;
4453 goto loop; 4837 goto loop;
4454 } 4838 }
4455 4839
4840 /* we are all good, let's return */
4456 ins->objectid = search_start; 4841 ins->objectid = search_start;
4457 ins->offset = num_bytes; 4842 ins->offset = num_bytes;
4458 4843
@@ -4460,18 +4845,18 @@ checks:
4460 btrfs_add_free_space(block_group, offset, 4845 btrfs_add_free_space(block_group, offset,
4461 search_start - offset); 4846 search_start - offset);
4462 BUG_ON(offset > search_start); 4847 BUG_ON(offset > search_start);
4463
4464 update_reserved_extents(block_group, num_bytes, 1);
4465
4466 /* we are all good, lets return */
4467 break; 4848 break;
4468loop: 4849loop:
4469 failed_cluster_refill = false; 4850 failed_cluster_refill = false;
4470 failed_alloc = false; 4851 failed_alloc = false;
4852 BUG_ON(index != get_block_group_index(block_group));
4471 btrfs_put_block_group(block_group); 4853 btrfs_put_block_group(block_group);
4472 } 4854 }
4473 up_read(&space_info->groups_sem); 4855 up_read(&space_info->groups_sem);
4474 4856
4857 if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
4858 goto search;
4859
4475 /* LOOP_FIND_IDEAL, only search caching/cached bg's, and don't wait for 4860 /* LOOP_FIND_IDEAL, only search caching/cached bg's, and don't wait for
4476 * them to make caching progress. Also 4861 * them to make caching progress. Also
4477 * determine the best possible bg to cache 4862 * determine the best possible bg to cache
@@ -4485,6 +4870,7 @@ loop:
4485 if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE && 4870 if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE &&
4486 (found_uncached_bg || empty_size || empty_cluster || 4871 (found_uncached_bg || empty_size || empty_cluster ||
4487 allowed_chunk_alloc)) { 4872 allowed_chunk_alloc)) {
4873 index = 0;
4488 if (loop == LOOP_FIND_IDEAL && found_uncached_bg) { 4874 if (loop == LOOP_FIND_IDEAL && found_uncached_bg) {
4489 found_uncached_bg = false; 4875 found_uncached_bg = false;
4490 loop++; 4876 loop++;
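
With block_groups[] now split per raid type, the allocator retries the
whole search at the next index when the current list yields nothing;
get_block_group_index() fixes the order RAID10, RAID1, DUP, RAID0,
single. A standalone sketch of that fallback loop, with made-up helpers
standing in for the real search:

	#include <stdio.h>

	#define NR_RAID_TYPES 5

	/* pretend only the "single" list (index 4) has a usable group */
	static int try_alloc_from(int index)
	{
		return index == 4;
	}

	static int find_free_extent_index(void)
	{
		int index = 0;

		while (index < NR_RAID_TYPES) {
			if (try_alloc_from(index))
				return index;	/* ins->objectid set in the kernel */
			index++;		/* fall back to the next raid type */
		}
		return -1;
	}

	int main(void)
	{
		printf("allocated from list %d\n", find_free_extent_index());
		return 0;
	}
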
@@ -4567,31 +4953,30 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
4567 int dump_block_groups) 4953 int dump_block_groups)
4568{ 4954{
4569 struct btrfs_block_group_cache *cache; 4955 struct btrfs_block_group_cache *cache;
4956 int index = 0;
4570 4957
4571 spin_lock(&info->lock); 4958 spin_lock(&info->lock);
4572 printk(KERN_INFO "space_info has %llu free, is %sfull\n", 4959 printk(KERN_INFO "space_info has %llu free, is %sfull\n",
4573 (unsigned long long)(info->total_bytes - info->bytes_used - 4960 (unsigned long long)(info->total_bytes - info->bytes_used -
4574 info->bytes_pinned - info->bytes_reserved - 4961 info->bytes_pinned - info->bytes_reserved -
4575 info->bytes_super), 4962 info->bytes_readonly),
4576 (info->full) ? "" : "not "); 4963 (info->full) ? "" : "not ");
4577 printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu," 4964 printk(KERN_INFO "space_info total=%llu, used=%llu, pinned=%llu, "
4578 " may_use=%llu, used=%llu, root=%llu, super=%llu, reserved=%llu" 4965 "reserved=%llu, may_use=%llu, readonly=%llu\n",
4579 "\n",
4580 (unsigned long long)info->total_bytes, 4966 (unsigned long long)info->total_bytes,
4967 (unsigned long long)info->bytes_used,
4581 (unsigned long long)info->bytes_pinned, 4968 (unsigned long long)info->bytes_pinned,
4582 (unsigned long long)info->bytes_delalloc, 4969 (unsigned long long)info->bytes_reserved,
4583 (unsigned long long)info->bytes_may_use, 4970 (unsigned long long)info->bytes_may_use,
4584 (unsigned long long)info->bytes_used, 4971 (unsigned long long)info->bytes_readonly);
4585 (unsigned long long)info->bytes_root,
4586 (unsigned long long)info->bytes_super,
4587 (unsigned long long)info->bytes_reserved);
4588 spin_unlock(&info->lock); 4972 spin_unlock(&info->lock);
4589 4973
4590 if (!dump_block_groups) 4974 if (!dump_block_groups)
4591 return; 4975 return;
4592 4976
4593 down_read(&info->groups_sem); 4977 down_read(&info->groups_sem);
4594 list_for_each_entry(cache, &info->block_groups, list) { 4978again:
4979 list_for_each_entry(cache, &info->block_groups[index], list) {
4595 spin_lock(&cache->lock); 4980 spin_lock(&cache->lock);
4596 printk(KERN_INFO "block group %llu has %llu bytes, %llu used " 4981 printk(KERN_INFO "block group %llu has %llu bytes, %llu used "
4597 "%llu pinned %llu reserved\n", 4982 "%llu pinned %llu reserved\n",
@@ -4603,6 +4988,8 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
4603 btrfs_dump_free_space(cache, bytes); 4988 btrfs_dump_free_space(cache, bytes);
4604 spin_unlock(&cache->lock); 4989 spin_unlock(&cache->lock);
4605 } 4990 }
4991 if (++index < BTRFS_NR_RAID_TYPES)
4992 goto again;
4606 up_read(&info->groups_sem); 4993 up_read(&info->groups_sem);
4607} 4994}
4608 4995
@@ -4628,9 +5015,8 @@ again:
4628 5015
4629 WARN_ON(num_bytes < root->sectorsize); 5016 WARN_ON(num_bytes < root->sectorsize);
4630 ret = find_free_extent(trans, root, num_bytes, empty_size, 5017 ret = find_free_extent(trans, root, num_bytes, empty_size,
4631 search_start, search_end, hint_byte, ins, 5018 search_start, search_end, hint_byte,
4632 trans->alloc_exclude_start, 5019 ins, data);
4633 trans->alloc_exclude_nr, data);
4634 5020
4635 if (ret == -ENOSPC && num_bytes > min_alloc_size) { 5021 if (ret == -ENOSPC && num_bytes > min_alloc_size) {
4636 num_bytes = num_bytes >> 1; 5022 num_bytes = num_bytes >> 1;
@@ -4668,7 +5054,7 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
4668 ret = btrfs_discard_extent(root, start, len); 5054 ret = btrfs_discard_extent(root, start, len);
4669 5055
4670 btrfs_add_free_space(cache, start, len); 5056 btrfs_add_free_space(cache, start, len);
4671 update_reserved_extents(cache, len, 0); 5057 update_reserved_bytes(cache, len, 0, 1);
4672 btrfs_put_block_group(cache); 5058 btrfs_put_block_group(cache);
4673 5059
4674 return ret; 5060 return ret;
@@ -4731,8 +5117,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
4731 btrfs_mark_buffer_dirty(path->nodes[0]); 5117 btrfs_mark_buffer_dirty(path->nodes[0]);
4732 btrfs_free_path(path); 5118 btrfs_free_path(path);
4733 5119
4734 ret = update_block_group(trans, root, ins->objectid, ins->offset, 5120 ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
4735 1, 0);
4736 if (ret) { 5121 if (ret) {
4737 printk(KERN_ERR "btrfs update block group failed for %llu " 5122 printk(KERN_ERR "btrfs update block group failed for %llu "
4738 "%llu\n", (unsigned long long)ins->objectid, 5123 "%llu\n", (unsigned long long)ins->objectid,
@@ -4792,8 +5177,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
4792 btrfs_mark_buffer_dirty(leaf); 5177 btrfs_mark_buffer_dirty(leaf);
4793 btrfs_free_path(path); 5178 btrfs_free_path(path);
4794 5179
4795 ret = update_block_group(trans, root, ins->objectid, ins->offset, 5180 ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
4796 1, 0);
4797 if (ret) { 5181 if (ret) {
4798 printk(KERN_ERR "btrfs update block group failed for %llu " 5182 printk(KERN_ERR "btrfs update block group failed for %llu "
4799 "%llu\n", (unsigned long long)ins->objectid, 5183 "%llu\n", (unsigned long long)ins->objectid,
@@ -4869,73 +5253,14 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
4869 put_caching_control(caching_ctl); 5253 put_caching_control(caching_ctl);
4870 } 5254 }
4871 5255
4872 update_reserved_extents(block_group, ins->offset, 1); 5256 ret = update_reserved_bytes(block_group, ins->offset, 1, 1);
5257 BUG_ON(ret);
4873 btrfs_put_block_group(block_group); 5258 btrfs_put_block_group(block_group);
4874 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, 5259 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
4875 0, owner, offset, ins, 1); 5260 0, owner, offset, ins, 1);
4876 return ret; 5261 return ret;
4877} 5262}
4878 5263
4879/*
4880 * finds a free extent and does all the dirty work required for allocation
4881 * returns the key for the extent through ins, and a tree buffer for
4882 * the first block of the extent through buf.
4883 *
4884 * returns 0 if everything worked, non-zero otherwise.
4885 */
4886static int alloc_tree_block(struct btrfs_trans_handle *trans,
4887 struct btrfs_root *root,
4888 u64 num_bytes, u64 parent, u64 root_objectid,
4889 struct btrfs_disk_key *key, int level,
4890 u64 empty_size, u64 hint_byte, u64 search_end,
4891 struct btrfs_key *ins)
4892{
4893 int ret;
4894 u64 flags = 0;
4895
4896 ret = btrfs_reserve_extent(trans, root, num_bytes, num_bytes,
4897 empty_size, hint_byte, search_end,
4898 ins, 0);
4899 if (ret)
4900 return ret;
4901
4902 if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
4903 if (parent == 0)
4904 parent = ins->objectid;
4905 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
4906 } else
4907 BUG_ON(parent > 0);
4908
4909 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
4910 struct btrfs_delayed_extent_op *extent_op;
4911 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
4912 BUG_ON(!extent_op);
4913 if (key)
4914 memcpy(&extent_op->key, key, sizeof(extent_op->key));
4915 else
4916 memset(&extent_op->key, 0, sizeof(extent_op->key));
4917 extent_op->flags_to_set = flags;
4918 extent_op->update_key = 1;
4919 extent_op->update_flags = 1;
4920 extent_op->is_data = 0;
4921
4922 ret = btrfs_add_delayed_tree_ref(trans, ins->objectid,
4923 ins->offset, parent, root_objectid,
4924 level, BTRFS_ADD_DELAYED_EXTENT,
4925 extent_op);
4926 BUG_ON(ret);
4927 }
4928
4929 if (root_objectid == root->root_key.objectid) {
4930 u64 used;
4931 spin_lock(&root->node_lock);
4932 used = btrfs_root_used(&root->root_item) + num_bytes;
4933 btrfs_set_root_used(&root->root_item, used);
4934 spin_unlock(&root->node_lock);
4935 }
4936 return ret;
4937}
4938
4939struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, 5264struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
4940 struct btrfs_root *root, 5265 struct btrfs_root *root,
4941 u64 bytenr, u32 blocksize, 5266 u64 bytenr, u32 blocksize,
@@ -4974,8 +5299,45 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
4974 return buf; 5299 return buf;
4975} 5300}
4976 5301
5302static struct btrfs_block_rsv *
5303use_block_rsv(struct btrfs_trans_handle *trans,
5304 struct btrfs_root *root, u32 blocksize)
5305{
5306 struct btrfs_block_rsv *block_rsv;
5307 int ret;
5308
5309 block_rsv = get_block_rsv(trans, root);
5310
5311 if (block_rsv->size == 0) {
5312 ret = reserve_metadata_bytes(block_rsv, blocksize);
5313 if (ret)
5314 return ERR_PTR(ret);
5315 return block_rsv;
5316 }
5317
5318 ret = block_rsv_use_bytes(block_rsv, blocksize);
5319 if (!ret)
5320 return block_rsv;
5321
5322 WARN_ON(1);
5323 printk(KERN_INFO "block_rsv size %llu reserved %llu freed %llu %llu\n",
5324 (unsigned long long)block_rsv->size, (unsigned long long)block_rsv->reserved,
5325 (unsigned long long)block_rsv->freed[0], (unsigned long long)block_rsv->freed[1]);
5326
5327 return ERR_PTR(-ENOSPC);
5328}
5329
5330static void unuse_block_rsv(struct btrfs_block_rsv *block_rsv, u32 blocksize)
5331{
5332 block_rsv_add_bytes(block_rsv, blocksize, 0);
5333 block_rsv_release_bytes(block_rsv, NULL, 0);
5334}
5335
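
use_block_rsv() above has two regimes: an unsized reservation reserves
metadata bytes on demand, while a sized one must already hold the
blocksize. The shape of it in user-space miniature, where reserve_fresh()
stands in for reserve_metadata_bytes() and assumes global space exists:

	#include <errno.h>
	#include <stdint.h>
	#include <stdio.h>

	struct rsv {
		uint64_t size;		/* 0 means "reserve on demand" */
		uint64_t reserved;
	};

	static int reserve_fresh(struct rsv *r, uint64_t n)
	{
		r->reserved += n;	/* assume space is available */
		return 0;
	}

	static int use_bytes(struct rsv *r, uint64_t n)
	{
		if (r->size == 0)		/* unsized rsv: reserve now */
			return reserve_fresh(r, n);
		if (r->reserved >= n) {		/* sized rsv: consume pre-reserved */
			r->reserved -= n;
			return 0;
		}
		return -ENOSPC;
	}

	int main(void)
	{
		struct rsv r = { .size = 8192, .reserved = 4096 };

		printf("first: %d, second: %d\n",
		       use_bytes(&r, 4096), use_bytes(&r, 4096));
		return 0;
	}
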
4977/* 5336/*
4978 * helper function to allocate a block for a given tree 5337 * finds a free extent and does all the dirty work required for allocation
5338 * returns the key for the extent through ins, and a tree buffer for
5339 * the first block of the extent through buf.
5340 *
4979 * returns the tree buffer or NULL. 5341 * returns the tree buffer or an ERR_PTR on failure.
4980 */ 5342 */
4981struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, 5343struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
@@ -4985,18 +5347,53 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
4985 u64 hint, u64 empty_size) 5347 u64 hint, u64 empty_size)
4986{ 5348{
4987 struct btrfs_key ins; 5349 struct btrfs_key ins;
4988 int ret; 5350 struct btrfs_block_rsv *block_rsv;
4989 struct extent_buffer *buf; 5351 struct extent_buffer *buf;
5352 u64 flags = 0;
5353 int ret;
5354
4990 5355
4991 ret = alloc_tree_block(trans, root, blocksize, parent, root_objectid, 5356 block_rsv = use_block_rsv(trans, root, blocksize);
4992 key, level, empty_size, hint, (u64)-1, &ins); 5357 if (IS_ERR(block_rsv))
5358 return ERR_CAST(block_rsv);
5359
5360 ret = btrfs_reserve_extent(trans, root, blocksize, blocksize,
5361 empty_size, hint, (u64)-1, &ins, 0);
4993 if (ret) { 5362 if (ret) {
4994 BUG_ON(ret > 0); 5363 unuse_block_rsv(block_rsv, blocksize);
4995 return ERR_PTR(ret); 5364 return ERR_PTR(ret);
4996 } 5365 }
4997 5366
4998 buf = btrfs_init_new_buffer(trans, root, ins.objectid, 5367 buf = btrfs_init_new_buffer(trans, root, ins.objectid,
4999 blocksize, level); 5368 blocksize, level);
5369 BUG_ON(IS_ERR(buf));
5370
5371 if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
5372 if (parent == 0)
5373 parent = ins.objectid;
5374 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
5375 } else
5376 BUG_ON(parent > 0);
5377
5378 if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
5379 struct btrfs_delayed_extent_op *extent_op;
5380 extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
5381 BUG_ON(!extent_op);
5382 if (key)
5383 memcpy(&extent_op->key, key, sizeof(extent_op->key));
5384 else
5385 memset(&extent_op->key, 0, sizeof(extent_op->key));
5386 extent_op->flags_to_set = flags;
5387 extent_op->update_key = 1;
5388 extent_op->update_flags = 1;
5389 extent_op->is_data = 0;
5390
5391 ret = btrfs_add_delayed_tree_ref(trans, ins.objectid,
5392 ins.offset, parent, root_objectid,
5393 level, BTRFS_ADD_DELAYED_EXTENT,
5394 extent_op);
5395 BUG_ON(ret);
5396 }
5000 return buf; 5397 return buf;
5001} 5398}
5002 5399
@@ -5321,7 +5718,7 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
5321 struct btrfs_path *path, 5718 struct btrfs_path *path,
5322 struct walk_control *wc) 5719 struct walk_control *wc)
5323{ 5720{
5324 int ret = 0; 5721 int ret;
5325 int level = wc->level; 5722 int level = wc->level;
5326 struct extent_buffer *eb = path->nodes[level]; 5723 struct extent_buffer *eb = path->nodes[level];
5327 u64 parent = 0; 5724 u64 parent = 0;
@@ -5399,13 +5796,11 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
5399 btrfs_header_owner(path->nodes[level + 1])); 5796 btrfs_header_owner(path->nodes[level + 1]));
5400 } 5797 }
5401 5798
5402 ret = btrfs_free_extent(trans, root, eb->start, eb->len, parent, 5799 btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
5403 root->root_key.objectid, level, 0);
5404 BUG_ON(ret);
5405out: 5800out:
5406 wc->refs[level] = 0; 5801 wc->refs[level] = 0;
5407 wc->flags[level] = 0; 5802 wc->flags[level] = 0;
5408 return ret; 5803 return 0;
5409} 5804}
5410 5805
5411static noinline int walk_down_tree(struct btrfs_trans_handle *trans, 5806static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
@@ -5483,7 +5878,8 @@ static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
5483 * also make sure backrefs for the shared block and all lower level 5878 * also make sure backrefs for the shared block and all lower level
5484 * blocks are properly updated. 5879 * blocks are properly updated.
5485 */ 5880 */
5486int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref) 5881int btrfs_drop_snapshot(struct btrfs_root *root,
5882 struct btrfs_block_rsv *block_rsv, int update_ref)
5487{ 5883{
5488 struct btrfs_path *path; 5884 struct btrfs_path *path;
5489 struct btrfs_trans_handle *trans; 5885 struct btrfs_trans_handle *trans;
@@ -5501,7 +5897,9 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
5501 wc = kzalloc(sizeof(*wc), GFP_NOFS); 5897 wc = kzalloc(sizeof(*wc), GFP_NOFS);
5502 BUG_ON(!wc); 5898 BUG_ON(!wc);
5503 5899
5504 trans = btrfs_start_transaction(tree_root, 1); 5900 trans = btrfs_start_transaction(tree_root, 0);
5901 if (block_rsv)
5902 trans->block_rsv = block_rsv;
5505 5903
5506 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { 5904 if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
5507 level = btrfs_header_level(root->node); 5905 level = btrfs_header_level(root->node);
@@ -5589,22 +5987,16 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
5589 } 5987 }
5590 5988
5591 BUG_ON(wc->level == 0); 5989 BUG_ON(wc->level == 0);
5592 if (trans->transaction->in_commit || 5990 if (btrfs_should_end_transaction(trans, tree_root)) {
5593 trans->transaction->delayed_refs.flushing) {
5594 ret = btrfs_update_root(trans, tree_root, 5991 ret = btrfs_update_root(trans, tree_root,
5595 &root->root_key, 5992 &root->root_key,
5596 root_item); 5993 root_item);
5597 BUG_ON(ret); 5994 BUG_ON(ret);
5598 5995
5599 btrfs_end_transaction(trans, tree_root); 5996 btrfs_end_transaction_throttle(trans, tree_root);
5600 trans = btrfs_start_transaction(tree_root, 1); 5997 trans = btrfs_start_transaction(tree_root, 0);
5601 } else { 5998 if (block_rsv)
5602 unsigned long update; 5999 trans->block_rsv = block_rsv;
5603 update = trans->delayed_ref_updates;
5604 trans->delayed_ref_updates = 0;
5605 if (update)
5606 btrfs_run_delayed_refs(trans, tree_root,
5607 update);
5608 } 6000 }
5609 } 6001 }
5610 btrfs_release_path(root, path); 6002 btrfs_release_path(root, path);
@@ -5632,7 +6024,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref)
5632 kfree(root); 6024 kfree(root);
5633 } 6025 }
5634out: 6026out:
5635 btrfs_end_transaction(trans, tree_root); 6027 btrfs_end_transaction_throttle(trans, tree_root);
5636 kfree(wc); 6028 kfree(wc);
5637 btrfs_free_path(path); 6029 btrfs_free_path(path);
5638 return err; 6030 return err;
@@ -7228,48 +7620,80 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
7228 return flags; 7620 return flags;
7229} 7621}
7230 7622
7231static int __alloc_chunk_for_shrink(struct btrfs_root *root, 7623static int set_block_group_ro(struct btrfs_block_group_cache *cache)
7232 struct btrfs_block_group_cache *shrink_block_group,
7233 int force)
7234{ 7624{
7235 struct btrfs_trans_handle *trans; 7625 struct btrfs_space_info *sinfo = cache->space_info;
7236 u64 new_alloc_flags; 7626 u64 num_bytes;
7237 u64 calc; 7627 int ret = -ENOSPC;
7238 7628
7239 spin_lock(&shrink_block_group->lock); 7629 if (cache->ro)
7240 if (btrfs_block_group_used(&shrink_block_group->item) + 7630 return 0;
7241 shrink_block_group->reserved > 0) {
7242 spin_unlock(&shrink_block_group->lock);
7243 7631
7244 trans = btrfs_start_transaction(root, 1); 7632 spin_lock(&sinfo->lock);
7245 spin_lock(&shrink_block_group->lock); 7633 spin_lock(&cache->lock);
7634 num_bytes = cache->key.offset - cache->reserved - cache->pinned -
7635 cache->bytes_super - btrfs_block_group_used(&cache->item);
7636
7637 if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
7638 sinfo->bytes_may_use + sinfo->bytes_readonly +
7639 cache->reserved_pinned + num_bytes < sinfo->total_bytes) {
7640 sinfo->bytes_readonly += num_bytes;
7641 sinfo->bytes_reserved += cache->reserved_pinned;
7642 cache->reserved_pinned = 0;
7643 cache->ro = 1;
7644 ret = 0;
7645 }
7646 spin_unlock(&cache->lock);
7647 spin_unlock(&sinfo->lock);
7648 return ret;
7649}
7246 7650
7247 new_alloc_flags = update_block_group_flags(root, 7651int btrfs_set_block_group_ro(struct btrfs_root *root,
7248 shrink_block_group->flags); 7652 struct btrfs_block_group_cache *cache)
7249 if (new_alloc_flags != shrink_block_group->flags) {
7250 calc =
7251 btrfs_block_group_used(&shrink_block_group->item);
7252 } else {
7253 calc = shrink_block_group->key.offset;
7254 }
7255 spin_unlock(&shrink_block_group->lock);
7256 7653
7257 do_chunk_alloc(trans, root->fs_info->extent_root, 7654{
7258 calc + 2 * 1024 * 1024, new_alloc_flags, force); 7655 struct btrfs_trans_handle *trans;
7656 u64 alloc_flags;
7657 int ret;
7259 7658
7260 btrfs_end_transaction(trans, root); 7659 BUG_ON(cache->ro);
7261 } else 7660
7262 spin_unlock(&shrink_block_group->lock); 7661 trans = btrfs_join_transaction(root, 1);
7263 return 0; 7662 BUG_ON(IS_ERR(trans));
7264}
7265 7663
7664 alloc_flags = update_block_group_flags(root, cache->flags);
7665 if (alloc_flags != cache->flags)
7666 do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1);
7266 7667
7267int btrfs_prepare_block_group_relocation(struct btrfs_root *root, 7668 ret = set_block_group_ro(cache);
7268 struct btrfs_block_group_cache *group) 7669 if (!ret)
7670 goto out;
7671 alloc_flags = get_alloc_profile(root, cache->space_info->flags);
7672 ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1);
7673 if (ret < 0)
7674 goto out;
7675 ret = set_block_group_ro(cache);
7676out:
7677 btrfs_end_transaction(trans, root);
7678 return ret;
7679}
7269 7680
7681int btrfs_set_block_group_rw(struct btrfs_root *root,
7682 struct btrfs_block_group_cache *cache)
7270{ 7683{
7271 __alloc_chunk_for_shrink(root, group, 1); 7684 struct btrfs_space_info *sinfo = cache->space_info;
7272 set_block_group_readonly(group); 7685 u64 num_bytes;
7686
7687 BUG_ON(!cache->ro);
7688
7689 spin_lock(&sinfo->lock);
7690 spin_lock(&cache->lock);
7691 num_bytes = cache->key.offset - cache->reserved - cache->pinned -
7692 cache->bytes_super - btrfs_block_group_used(&cache->item);
7693 sinfo->bytes_readonly -= num_bytes;
7694 cache->ro = 0;
7695 spin_unlock(&cache->lock);
7696 spin_unlock(&sinfo->lock);
7273 return 0; 7697 return 0;
7274} 7698}
7275 7699
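
set_block_group_ro() above only flips the group when the rest of the
space info can absorb its free bytes as read-only. The condition,
condensed into a standalone check with illustrative fields:

	#include <stdint.h>
	#include <stdio.h>

	struct sinfo {
		uint64_t total, used, reserved, pinned, may_use, readonly;
	};

	/* num_bytes: free space inside the group that would become read-only */
	static int can_set_ro(const struct sinfo *s, uint64_t num_bytes)
	{
		return s->used + s->reserved + s->pinned +
		       s->may_use + s->readonly + num_bytes < s->total;
	}

	int main(void)
	{
		struct sinfo s = { .total = 1 << 20, .used = 1 << 19 };

		printf("can_set_ro: %d\n", can_set_ro(&s, 1 << 18));
		return 0;
	}
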
@@ -7436,17 +7860,33 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
7436 */ 7860 */
7437 synchronize_rcu(); 7861 synchronize_rcu();
7438 7862
7863 release_global_block_rsv(info);
7864
7439 while (!list_empty(&info->space_info)) { 7865 while (!list_empty(&info->space_info)) {
7440 space_info = list_entry(info->space_info.next, 7866 space_info = list_entry(info->space_info.next,
7441 struct btrfs_space_info, 7867 struct btrfs_space_info,
7442 list); 7868 list);
7443 7869 if (space_info->bytes_pinned > 0 ||
7870 space_info->bytes_reserved > 0) {
7871 WARN_ON(1);
7872 dump_space_info(space_info, 0, 0);
7873 }
7444 list_del(&space_info->list); 7874 list_del(&space_info->list);
7445 kfree(space_info); 7875 kfree(space_info);
7446 } 7876 }
7447 return 0; 7877 return 0;
7448} 7878}
7449 7879
7880static void __link_block_group(struct btrfs_space_info *space_info,
7881 struct btrfs_block_group_cache *cache)
7882{
7883 int index = get_block_group_index(cache);
7884
7885 down_write(&space_info->groups_sem);
7886 list_add_tail(&cache->list, &space_info->block_groups[index]);
7887 up_write(&space_info->groups_sem);
7888}
7889
7450int btrfs_read_block_groups(struct btrfs_root *root) 7890int btrfs_read_block_groups(struct btrfs_root *root)
7451{ 7891{
7452 struct btrfs_path *path; 7892 struct btrfs_path *path;
@@ -7468,10 +7908,8 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7468 7908
7469 while (1) { 7909 while (1) {
7470 ret = find_first_block_group(root, path, &key); 7910 ret = find_first_block_group(root, path, &key);
7471 if (ret > 0) { 7911 if (ret > 0)
7472 ret = 0; 7912 break;
7473 goto error;
7474 }
7475 if (ret != 0) 7913 if (ret != 0)
7476 goto error; 7914 goto error;
7477 7915
@@ -7480,7 +7918,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7480 cache = kzalloc(sizeof(*cache), GFP_NOFS); 7918 cache = kzalloc(sizeof(*cache), GFP_NOFS);
7481 if (!cache) { 7919 if (!cache) {
7482 ret = -ENOMEM; 7920 ret = -ENOMEM;
7483 break; 7921 goto error;
7484 } 7922 }
7485 7923
7486 atomic_set(&cache->count, 1); 7924 atomic_set(&cache->count, 1);
@@ -7537,20 +7975,36 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7537 BUG_ON(ret); 7975 BUG_ON(ret);
7538 cache->space_info = space_info; 7976 cache->space_info = space_info;
7539 spin_lock(&cache->space_info->lock); 7977 spin_lock(&cache->space_info->lock);
7540 cache->space_info->bytes_super += cache->bytes_super; 7978 cache->space_info->bytes_readonly += cache->bytes_super;
7541 spin_unlock(&cache->space_info->lock); 7979 spin_unlock(&cache->space_info->lock);
7542 7980
7543 down_write(&space_info->groups_sem); 7981 __link_block_group(space_info, cache);
7544 list_add_tail(&cache->list, &space_info->block_groups);
7545 up_write(&space_info->groups_sem);
7546 7982
7547 ret = btrfs_add_block_group_cache(root->fs_info, cache); 7983 ret = btrfs_add_block_group_cache(root->fs_info, cache);
7548 BUG_ON(ret); 7984 BUG_ON(ret);
7549 7985
7550 set_avail_alloc_bits(root->fs_info, cache->flags); 7986 set_avail_alloc_bits(root->fs_info, cache->flags);
7551 if (btrfs_chunk_readonly(root, cache->key.objectid)) 7987 if (btrfs_chunk_readonly(root, cache->key.objectid))
7552 set_block_group_readonly(cache); 7988 set_block_group_ro(cache);
7553 } 7989 }
7990
7991 list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) {
7992 if (!(get_alloc_profile(root, space_info->flags) &
7993 (BTRFS_BLOCK_GROUP_RAID10 |
7994 BTRFS_BLOCK_GROUP_RAID1 |
7995 BTRFS_BLOCK_GROUP_DUP)))
7996 continue;
7997 /*
7998 * avoid allocating from un-mirrored block groups if there are
7999 * mirrored block groups.
8000 */
8001 list_for_each_entry(cache, &space_info->block_groups[3], list)
8002 set_block_group_ro(cache);
8003 list_for_each_entry(cache, &space_info->block_groups[4], list)
8004 set_block_group_ro(cache);
8005 }
8006
8007 init_global_block_rsv(info);
7554 ret = 0; 8008 ret = 0;
7555error: 8009error:
7556 btrfs_free_path(path); 8010 btrfs_free_path(path);
@@ -7611,12 +8065,10 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
7611 BUG_ON(ret); 8065 BUG_ON(ret);
7612 8066
7613 spin_lock(&cache->space_info->lock); 8067 spin_lock(&cache->space_info->lock);
7614 cache->space_info->bytes_super += cache->bytes_super; 8068 cache->space_info->bytes_readonly += cache->bytes_super;
7615 spin_unlock(&cache->space_info->lock); 8069 spin_unlock(&cache->space_info->lock);
7616 8070
7617 down_write(&cache->space_info->groups_sem); 8071 __link_block_group(cache->space_info, cache);
7618 list_add_tail(&cache->list, &cache->space_info->block_groups);
7619 up_write(&cache->space_info->groups_sem);
7620 8072
7621 ret = btrfs_add_block_group_cache(root->fs_info, cache); 8073 ret = btrfs_add_block_group_cache(root->fs_info, cache);
7622 BUG_ON(ret); 8074 BUG_ON(ret);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index d2d03684fab2..a4080c21ec55 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -135,7 +135,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask)
135 return state; 135 return state;
136} 136}
137 137
138static void free_extent_state(struct extent_state *state) 138void free_extent_state(struct extent_state *state)
139{ 139{
140 if (!state) 140 if (!state)
141 return; 141 return;
@@ -335,21 +335,18 @@ static int merge_state(struct extent_io_tree *tree,
335} 335}
336 336
337static int set_state_cb(struct extent_io_tree *tree, 337static int set_state_cb(struct extent_io_tree *tree,
338 struct extent_state *state, 338 struct extent_state *state, int *bits)
339 unsigned long bits)
340{ 339{
341 if (tree->ops && tree->ops->set_bit_hook) { 340 if (tree->ops && tree->ops->set_bit_hook) {
342 return tree->ops->set_bit_hook(tree->mapping->host, 341 return tree->ops->set_bit_hook(tree->mapping->host,
343 state->start, state->end, 342 state, bits);
344 state->state, bits);
345 } 343 }
346 344
347 return 0; 345 return 0;
348} 346}
349 347
350static void clear_state_cb(struct extent_io_tree *tree, 348static void clear_state_cb(struct extent_io_tree *tree,
351 struct extent_state *state, 349 struct extent_state *state, int *bits)
352 unsigned long bits)
353{ 350{
354 if (tree->ops && tree->ops->clear_bit_hook) 351 if (tree->ops && tree->ops->clear_bit_hook)
355 tree->ops->clear_bit_hook(tree->mapping->host, state, bits); 352 tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
@@ -367,9 +364,10 @@ static void clear_state_cb(struct extent_io_tree *tree,
367 */ 364 */
368static int insert_state(struct extent_io_tree *tree, 365static int insert_state(struct extent_io_tree *tree,
369 struct extent_state *state, u64 start, u64 end, 366 struct extent_state *state, u64 start, u64 end,
370 int bits) 367 int *bits)
371{ 368{
372 struct rb_node *node; 369 struct rb_node *node;
370 int bits_to_set = *bits & ~EXTENT_CTLBITS;
373 int ret; 371 int ret;
374 372
375 if (end < start) { 373 if (end < start) {
@@ -384,9 +382,9 @@ static int insert_state(struct extent_io_tree *tree,
384 if (ret) 382 if (ret)
385 return ret; 383 return ret;
386 384
387 if (bits & EXTENT_DIRTY) 385 if (bits_to_set & EXTENT_DIRTY)
388 tree->dirty_bytes += end - start + 1; 386 tree->dirty_bytes += end - start + 1;
389 state->state |= bits; 387 state->state |= bits_to_set;
390 node = tree_insert(&tree->state, end, &state->rb_node); 388 node = tree_insert(&tree->state, end, &state->rb_node);
391 if (node) { 389 if (node) {
392 struct extent_state *found; 390 struct extent_state *found;
@@ -456,13 +454,13 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
456 * struct is freed and removed from the tree 454 * struct is freed and removed from the tree
457 */ 455 */
458static int clear_state_bit(struct extent_io_tree *tree, 456static int clear_state_bit(struct extent_io_tree *tree,
459 struct extent_state *state, int bits, int wake, 457 struct extent_state *state,
460 int delete) 458 int *bits, int wake)
461{ 459{
462 int bits_to_clear = bits & ~EXTENT_DO_ACCOUNTING; 460 int bits_to_clear = *bits & ~EXTENT_CTLBITS;
463 int ret = state->state & bits_to_clear; 461 int ret = state->state & bits_to_clear;
464 462
465 if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { 463 if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) {
466 u64 range = state->end - state->start + 1; 464 u64 range = state->end - state->start + 1;
467 WARN_ON(range > tree->dirty_bytes); 465 WARN_ON(range > tree->dirty_bytes);
468 tree->dirty_bytes -= range; 466 tree->dirty_bytes -= range;
@@ -471,9 +469,8 @@ static int clear_state_bit(struct extent_io_tree *tree,
471 state->state &= ~bits_to_clear; 469 state->state &= ~bits_to_clear;
472 if (wake) 470 if (wake)
473 wake_up(&state->wq); 471 wake_up(&state->wq);
474 if (delete || state->state == 0) { 472 if (state->state == 0) {
475 if (state->tree) { 473 if (state->tree) {
476 clear_state_cb(tree, state, state->state);
477 rb_erase(&state->rb_node, &tree->state); 474 rb_erase(&state->rb_node, &tree->state);
478 state->tree = NULL; 475 state->tree = NULL;
479 free_extent_state(state); 476 free_extent_state(state);
@@ -514,6 +511,10 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
514 int set = 0; 511 int set = 0;
515 int clear = 0; 512 int clear = 0;
516 513
514 if (delete)
515 bits |= ~EXTENT_CTLBITS;
516 bits |= EXTENT_FIRST_DELALLOC;
517
517 if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY)) 518 if (bits & (EXTENT_IOBITS | EXTENT_BOUNDARY))
518 clear = 1; 519 clear = 1;
519again: 520again:
@@ -580,8 +581,7 @@ hit_next:
580 if (err) 581 if (err)
581 goto out; 582 goto out;
582 if (state->end <= end) { 583 if (state->end <= end) {
583 set |= clear_state_bit(tree, state, bits, wake, 584 set |= clear_state_bit(tree, state, &bits, wake);
584 delete);
585 if (last_end == (u64)-1) 585 if (last_end == (u64)-1)
586 goto out; 586 goto out;
587 start = last_end + 1; 587 start = last_end + 1;
@@ -602,7 +602,7 @@ hit_next:
602 if (wake) 602 if (wake)
603 wake_up(&state->wq); 603 wake_up(&state->wq);
604 604
605 set |= clear_state_bit(tree, prealloc, bits, wake, delete); 605 set |= clear_state_bit(tree, prealloc, &bits, wake);
606 606
607 prealloc = NULL; 607 prealloc = NULL;
608 goto out; 608 goto out;
@@ -613,7 +613,7 @@ hit_next:
613 else 613 else
614 next_node = NULL; 614 next_node = NULL;
615 615
616 set |= clear_state_bit(tree, state, bits, wake, delete); 616 set |= clear_state_bit(tree, state, &bits, wake);
617 if (last_end == (u64)-1) 617 if (last_end == (u64)-1)
618 goto out; 618 goto out;
619 start = last_end + 1; 619 start = last_end + 1;
@@ -706,19 +706,19 @@ out:
706 706
707static int set_state_bits(struct extent_io_tree *tree, 707static int set_state_bits(struct extent_io_tree *tree,
708 struct extent_state *state, 708 struct extent_state *state,
709 int bits) 709 int *bits)
710{ 710{
711 int ret; 711 int ret;
712 int bits_to_set = *bits & ~EXTENT_CTLBITS;
712 713
713 ret = set_state_cb(tree, state, bits); 714 ret = set_state_cb(tree, state, bits);
714 if (ret) 715 if (ret)
715 return ret; 716 return ret;
716 717 if ((bits_to_set & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
717 if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
718 u64 range = state->end - state->start + 1; 718 u64 range = state->end - state->start + 1;
719 tree->dirty_bytes += range; 719 tree->dirty_bytes += range;
720 } 720 }
721 state->state |= bits; 721 state->state |= bits_to_set;
722 722
723 return 0; 723 return 0;
724} 724}
@@ -745,10 +745,9 @@ static void cache_state(struct extent_state *state,
745 * [start, end] is inclusive This takes the tree lock. 745 * [start, end] is inclusive This takes the tree lock.
746 */ 746 */
747 747
748static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, 748int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
749 int bits, int exclusive_bits, u64 *failed_start, 749 int bits, int exclusive_bits, u64 *failed_start,
750 struct extent_state **cached_state, 750 struct extent_state **cached_state, gfp_t mask)
751 gfp_t mask)
752{ 751{
753 struct extent_state *state; 752 struct extent_state *state;
754 struct extent_state *prealloc = NULL; 753 struct extent_state *prealloc = NULL;
@@ -757,6 +756,7 @@ static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
757 u64 last_start; 756 u64 last_start;
758 u64 last_end; 757 u64 last_end;
759 758
759 bits |= EXTENT_FIRST_DELALLOC;
760again: 760again:
761 if (!prealloc && (mask & __GFP_WAIT)) { 761 if (!prealloc && (mask & __GFP_WAIT)) {
762 prealloc = alloc_extent_state(mask); 762 prealloc = alloc_extent_state(mask);
@@ -778,7 +778,7 @@ again:
778 */ 778 */
779 node = tree_search(tree, start); 779 node = tree_search(tree, start);
780 if (!node) { 780 if (!node) {
781 err = insert_state(tree, prealloc, start, end, bits); 781 err = insert_state(tree, prealloc, start, end, &bits);
782 prealloc = NULL; 782 prealloc = NULL;
783 BUG_ON(err == -EEXIST); 783 BUG_ON(err == -EEXIST);
784 goto out; 784 goto out;
@@ -802,7 +802,7 @@ hit_next:
802 goto out; 802 goto out;
803 } 803 }
804 804
805 err = set_state_bits(tree, state, bits); 805 err = set_state_bits(tree, state, &bits);
806 if (err) 806 if (err)
807 goto out; 807 goto out;
808 808
@@ -852,7 +852,7 @@ hit_next:
852 if (err) 852 if (err)
853 goto out; 853 goto out;
854 if (state->end <= end) { 854 if (state->end <= end) {
855 err = set_state_bits(tree, state, bits); 855 err = set_state_bits(tree, state, &bits);
856 if (err) 856 if (err)
857 goto out; 857 goto out;
858 cache_state(state, cached_state); 858 cache_state(state, cached_state);
@@ -877,7 +877,7 @@ hit_next:
877 else 877 else
878 this_end = last_start - 1; 878 this_end = last_start - 1;
879 err = insert_state(tree, prealloc, start, this_end, 879 err = insert_state(tree, prealloc, start, this_end,
880 bits); 880 &bits);
881 BUG_ON(err == -EEXIST); 881 BUG_ON(err == -EEXIST);
882 if (err) { 882 if (err) {
883 prealloc = NULL; 883 prealloc = NULL;
@@ -903,7 +903,7 @@ hit_next:
903 err = split_state(tree, state, prealloc, end + 1); 903 err = split_state(tree, state, prealloc, end + 1);
904 BUG_ON(err == -EEXIST); 904 BUG_ON(err == -EEXIST);
905 905
906 err = set_state_bits(tree, prealloc, bits); 906 err = set_state_bits(tree, prealloc, &bits);
907 if (err) { 907 if (err) {
908 prealloc = NULL; 908 prealloc = NULL;
909 goto out; 909 goto out;
@@ -966,8 +966,7 @@ int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
966{ 966{
967 return clear_extent_bit(tree, start, end, 967 return clear_extent_bit(tree, start, end,
968 EXTENT_DIRTY | EXTENT_DELALLOC | 968 EXTENT_DIRTY | EXTENT_DELALLOC |
969 EXTENT_DO_ACCOUNTING, 0, 0, 969 EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask);
970 NULL, mask);
971} 970}
972 971
973int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, 972int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
@@ -1435,9 +1434,6 @@ int extent_clear_unlock_delalloc(struct inode *inode,
1435 if (op & EXTENT_CLEAR_DELALLOC) 1434 if (op & EXTENT_CLEAR_DELALLOC)
1436 clear_bits |= EXTENT_DELALLOC; 1435 clear_bits |= EXTENT_DELALLOC;
1437 1436
1438 if (op & EXTENT_CLEAR_ACCOUNTING)
1439 clear_bits |= EXTENT_DO_ACCOUNTING;
1440
1441 clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS); 1437 clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
1442 if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY | 1438 if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
1443 EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK | 1439 EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK |
@@ -1916,7 +1912,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
1916 1912
1917 if (tree->ops && tree->ops->submit_bio_hook) 1913 if (tree->ops && tree->ops->submit_bio_hook)
1918 tree->ops->submit_bio_hook(page->mapping->host, rw, bio, 1914 tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
1919 mirror_num, bio_flags); 1915 mirror_num, bio_flags, start);
1920 else 1916 else
1921 submit_bio(rw, bio); 1917 submit_bio(rw, bio);
1922 if (bio_flagged(bio, BIO_EOPNOTSUPP)) 1918 if (bio_flagged(bio, BIO_EOPNOTSUPP))
@@ -2020,6 +2016,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
2020 sector_t sector; 2016 sector_t sector;
2021 struct extent_map *em; 2017 struct extent_map *em;
2022 struct block_device *bdev; 2018 struct block_device *bdev;
2019 struct btrfs_ordered_extent *ordered;
2023 int ret; 2020 int ret;
2024 int nr = 0; 2021 int nr = 0;
2025 size_t page_offset = 0; 2022 size_t page_offset = 0;
@@ -2031,7 +2028,15 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
2031 set_page_extent_mapped(page); 2028 set_page_extent_mapped(page);
2032 2029
2033 end = page_end; 2030 end = page_end;
2034 lock_extent(tree, start, end, GFP_NOFS); 2031 while (1) {
2032 lock_extent(tree, start, end, GFP_NOFS);
2033 ordered = btrfs_lookup_ordered_extent(inode, start);
2034 if (!ordered)
2035 break;
2036 unlock_extent(tree, start, end, GFP_NOFS);
2037 btrfs_start_ordered_extent(inode, ordered, 1);
2038 btrfs_put_ordered_extent(ordered);
2039 }
2035 2040
2036 if (page->index == last_byte >> PAGE_CACHE_SHIFT) { 2041 if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
2037 char *userpage; 2042 char *userpage;
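
The new lock/lookup/unlock/wait loop in __extent_read_full_page() is a
drain pattern: take the range lock, drop it if an ordered extent still
covers the range, wait that extent out, and retry until the range is
clean. The skeleton with stub helpers standing in for the btrfs calls:

	#include <stdio.h>

	static int pending = 2;		/* fake ordered extents remaining */

	static int lookup_ordered(void)	{ return pending; }
	static void wait_ordered(void)	{ pending--; }
	static void lock_range(void)	{ }
	static void unlock_range(void)	{ }

	static void lock_after_drain(void)
	{
		while (1) {
			lock_range();
			if (!lookup_ordered())
				break;		/* range is clean, keep the lock */
			unlock_range();
			wait_ordered();		/* may block; retry from scratch */
		}
	}

	int main(void)
	{
		lock_after_drain();
		printf("range locked with no ordered extents pending\n");
		return 0;
	}
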
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index bbab4813646f..5691c7b590da 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -16,7 +16,9 @@
 #define EXTENT_BOUNDARY (1 << 9)
 #define EXTENT_NODATASUM (1 << 10)
 #define EXTENT_DO_ACCOUNTING (1 << 11)
+#define EXTENT_FIRST_DELALLOC (1 << 12)
 #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
+#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)

 /* flags for bio submission */
 #define EXTENT_BIO_COMPRESSED 1
@@ -47,7 +49,7 @@ struct extent_state;

 typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw,
 				       struct bio *bio, int mirror_num,
-				       unsigned long bio_flags);
+				       unsigned long bio_flags, u64 bio_offset);
 struct extent_io_ops {
 	int (*fill_delalloc)(struct inode *inode, struct page *locked_page,
 			     u64 start, u64 end, int *page_started,
@@ -69,10 +71,10 @@ struct extent_io_ops {
 			      struct extent_state *state);
 	int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
 				      struct extent_state *state, int uptodate);
-	int (*set_bit_hook)(struct inode *inode, u64 start, u64 end,
-			    unsigned long old, unsigned long bits);
+	int (*set_bit_hook)(struct inode *inode, struct extent_state *state,
+			    int *bits);
 	int (*clear_bit_hook)(struct inode *inode, struct extent_state *state,
-			      unsigned long bits);
+			      int *bits);
 	int (*merge_extent_hook)(struct inode *inode,
 				 struct extent_state *new,
 				 struct extent_state *other);
@@ -176,6 +178,7 @@ u64 count_range_bits(struct extent_io_tree *tree,
 		     u64 *start, u64 search_end,
 		     u64 max_bytes, unsigned long bits);

+void free_extent_state(struct extent_state *state);
 int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
 		   int bits, int filled, struct extent_state *cached_state);
 int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
@@ -185,6 +188,9 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 		     gfp_t mask);
 int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
 		    int bits, gfp_t mask);
+int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+		   int bits, int exclusive_bits, u64 *failed_start,
+		   struct extent_state **cached_state, gfp_t mask);
 int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
 			gfp_t mask);
 int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
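A note on the two new flags above: EXTENT_FIRST_DELALLOC is a control bit carried through the new int *bits hook parameters rather than state stored in the tree, and EXTENT_CTLBITS names the bits that must be masked off before they land in state->state. A toy model of the consume-once behaviour, with made-up flag values:

#include <assert.h>

#define DELALLOC        (1 << 0)
#define FIRST_DELALLOC  (1 << 1)   /* control bit: set only on the first range */
#define CTLBITS         (FIRST_DELALLOC)

static int outstanding;

/* Model of a set_bit hook taking "int *bits": the control bit is
 * stripped by the first consumer so later merges and splits do not
 * double-count the extent. */
static void set_bit_hook(int *bits)
{
	if (!(*bits & DELALLOC))
		return;
	if (*bits & FIRST_DELALLOC)
		*bits &= ~FIRST_DELALLOC;   /* consumed exactly once */
	else
		outstanding++;
}

int main(void)
{
	int bits = DELALLOC | FIRST_DELALLOC;

	set_bit_hook(&bits);    /* first range: accounted at reservation time */
	set_bit_hook(&bits);    /* later ranges bump the extent count */
	assert(outstanding == 1 && !(bits & CTLBITS));
	return 0;
}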
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 54a255065aa3..a562a250ae77 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -149,13 +149,14 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
 }


-int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
-			  struct bio *bio, u32 *dst)
+static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
+				   struct inode *inode, struct bio *bio,
+				   u64 logical_offset, u32 *dst, int dio)
 {
 	u32 sum;
 	struct bio_vec *bvec = bio->bi_io_vec;
 	int bio_index = 0;
-	u64 offset;
+	u64 offset = 0;
 	u64 item_start_offset = 0;
 	u64 item_last_offset = 0;
 	u64 disk_bytenr;
@@ -174,8 +175,11 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
 	WARN_ON(bio->bi_vcnt <= 0);

 	disk_bytenr = (u64)bio->bi_sector << 9;
+	if (dio)
+		offset = logical_offset;
 	while (bio_index < bio->bi_vcnt) {
-		offset = page_offset(bvec->bv_page) + bvec->bv_offset;
+		if (!dio)
+			offset = page_offset(bvec->bv_page) + bvec->bv_offset;
 		ret = btrfs_find_ordered_sum(inode, offset, disk_bytenr, &sum);
 		if (ret == 0)
 			goto found;
@@ -238,6 +242,7 @@ found:
 		else
 			set_state_private(io_tree, offset, sum);
 		disk_bytenr += bvec->bv_len;
+		offset += bvec->bv_len;
 		bio_index++;
 		bvec++;
 	}
@@ -245,6 +250,18 @@ found:
 	return 0;
 }

+int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
+			  struct bio *bio, u32 *dst)
+{
+	return __btrfs_lookup_bio_sums(root, inode, bio, 0, dst, 0);
+}
+
+int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
+			      struct bio *bio, u64 offset, u32 *dst)
+{
+	return __btrfs_lookup_bio_sums(root, inode, bio, offset, dst, 1);
+}
+
 int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
 			     struct list_head *list)
 {
@@ -657,6 +674,9 @@ again:
 		goto found;
 	}
 	ret = PTR_ERR(item);
+	if (ret != -EFBIG && ret != -ENOENT)
+		goto fail_unlock;
+
 	if (ret == -EFBIG) {
 		u32 item_size;
 		/* we found one, but it isn't big enough yet */
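The worker/wrapper split above exists because buffered and direct-I/O bios learn their file offsets differently: a buffered bio's pages belong to the page cache, so page_offset() is authoritative, while a direct-I/O bio carries a caller-supplied logical offset that must be advanced by bv_len per segment. A userspace sketch of just that offset bookkeeping (struct vec and walk_bio are illustrative, not kernel types):

#include <stdio.h>

struct vec { long page_off; int len; };

/* Model of __btrfs_lookup_bio_sums's offset handling: buffered bios
 * derive the file offset from each page, direct-I/O bios start from a
 * caller-supplied logical offset and advance it manually. */
static void walk_bio(const struct vec *v, int n, long logical, int dio)
{
	long offset = dio ? logical : 0;
	int i;

	for (i = 0; i < n; i++) {
		if (!dio)
			offset = v[i].page_off;    /* the page cache knows it */
		printf("csum lookup at offset %ld\n", offset);
		offset += v[i].len;                /* dio: advance by hand */
	}
}

int main(void)
{
	struct vec bio[2] = { { 4096, 4096 }, { 8192, 4096 } };

	walk_bio(bio, 2, 0, 0);        /* buffered: offsets from pages */
	walk_bio(bio, 2, 1 << 20, 1);  /* direct I/O: offsets from caller */
	return 0;
}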
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 29ff749ff4ca..e354c33df082 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -46,32 +46,42 @@
 static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
 					 int write_bytes,
 					 struct page **prepared_pages,
-					 const char __user *buf)
+					 struct iov_iter *i)
 {
-	long page_fault = 0;
-	int i;
+	size_t copied;
+	int pg = 0;
 	int offset = pos & (PAGE_CACHE_SIZE - 1);

-	for (i = 0; i < num_pages && write_bytes > 0; i++, offset = 0) {
+	while (write_bytes > 0) {
 		size_t count = min_t(size_t,
 				     PAGE_CACHE_SIZE - offset, write_bytes);
-		struct page *page = prepared_pages[i];
-		fault_in_pages_readable(buf, count);
+		struct page *page = prepared_pages[pg];
+again:
+		if (unlikely(iov_iter_fault_in_readable(i, count)))
+			return -EFAULT;

 		/* Copy data from userspace to the current page */
-		kmap(page);
-		page_fault = __copy_from_user(page_address(page) + offset,
-					      buf, count);
+		copied = iov_iter_copy_from_user(page, i, offset, count);
+
 		/* Flush processor's dcache for this page */
 		flush_dcache_page(page);
-		kunmap(page);
-		buf += count;
-		write_bytes -= count;
+		iov_iter_advance(i, copied);
+		write_bytes -= copied;

-		if (page_fault)
-			break;
+		if (unlikely(copied == 0)) {
+			count = min_t(size_t, PAGE_CACHE_SIZE - offset,
+				      iov_iter_single_seg_count(i));
+			goto again;
+		}
+
+		if (unlikely(copied < PAGE_CACHE_SIZE - offset)) {
+			offset += copied;
+		} else {
+			pg++;
+			offset = 0;
+		}
 	}
-	return page_fault ? -EFAULT : 0;
+	return 0;
 }

 /*
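The rewritten copy loop has to tolerate short copies: iov_iter_copy_from_user() may transfer fewer bytes than requested when it crosses an iovec boundary or faults. The sketch below models the advance/stay-on-page bookkeeping with memcpy standing in for the iov_iter helpers; the copied == 0 fault-retry branch (which refetches with a single-segment count) is elided:

#include <assert.h>
#include <string.h>

#define PG 16   /* toy page size */

/* Model of the rewritten copy loop: a short copy either keeps filling
 * the current page from the new offset, or, once the page is full,
 * moves on to the next one. */
static int copy_to_pages(char pages[][PG], const char *src, int len)
{
	int pg = 0, offset = 0;

	while (len > 0) {
		int count = PG - offset < len ? PG - offset : len;
		/* pretend the "iterator" only hands us 5 bytes at a time */
		int copied = count < 5 ? count : 5;

		memcpy(pages[pg] + offset, src, copied);
		src += copied;
		len -= copied;

		if (copied < PG - offset) {
			offset += copied;   /* stay on this page */
		} else {
			pg++;               /* page filled, move on */
			offset = 0;
		}
	}
	return 0;
}

int main(void)
{
	char pages[2][PG];
	const char msg[] = "abcdefghijklmnopqrstuvwxyz";

	copy_to_pages(pages, msg, 26);
	assert(memcmp(pages[0], msg, PG) == 0);
	assert(memcmp(pages[1], msg + PG, 26 - PG) == 0);
	return 0;
}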
@@ -126,8 +136,7 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
 	end_of_last_block = start_pos + num_bytes - 1;
 	err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
 					NULL);
-	if (err)
-		return err;
+	BUG_ON(err);

 	for (i = 0; i < num_pages; i++) {
 		struct page *p = pages[i];
@@ -142,7 +151,7 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
 		 * at this time.
 		 */
 	}
-	return err;
+	return 0;
 }

 /*
@@ -823,45 +832,46 @@ again:
 	return 0;
 }

-static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
-				size_t count, loff_t *ppos)
+static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
+				    const struct iovec *iov,
+				    unsigned long nr_segs, loff_t pos)
 {
-	loff_t pos;
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = fdentry(file)->d_inode;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct page *pinned[2];
+	struct page **pages = NULL;
+	struct iov_iter i;
+	loff_t *ppos = &iocb->ki_pos;
 	loff_t start_pos;
 	ssize_t num_written = 0;
 	ssize_t err = 0;
+	size_t count;
+	size_t ocount;
 	int ret = 0;
-	struct inode *inode = fdentry(file)->d_inode;
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct page **pages = NULL;
 	int nrptrs;
-	struct page *pinned[2];
 	unsigned long first_index;
 	unsigned long last_index;
 	int will_write;
+	int buffered = 0;

 	will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) ||
 		      (file->f_flags & O_DIRECT));

-	nrptrs = min((count + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE,
-		     PAGE_CACHE_SIZE / (sizeof(struct page *)));
 	pinned[0] = NULL;
 	pinned[1] = NULL;

-	pos = *ppos;
 	start_pos = pos;

 	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);

-	/* do the reserve before the mutex lock in case we have to do some
-	 * flushing.  We wouldn't deadlock, but this is more polite.
-	 */
-	err = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
-	if (err)
-		goto out_nolock;
-
 	mutex_lock(&inode->i_mutex);

+	err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
+	if (err)
+		goto out;
+	count = ocount;
+
 	current->backing_dev_info = inode->i_mapping->backing_dev_info;
 	err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
 	if (err)
@@ -875,15 +885,53 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 		goto out;

 	file_update_time(file);
+	BTRFS_I(inode)->sequence++;
+
+	if (unlikely(file->f_flags & O_DIRECT)) {
+		num_written = generic_file_direct_write(iocb, iov, &nr_segs,
+							pos, ppos, count,
+							ocount);
+		/*
+		 * the generic O_DIRECT will update in-memory i_size after the
+		 * DIOs are done.  But our endio handlers that update the on
+		 * disk i_size never update past the in memory i_size.  So we
+		 * need one more update here to catch any additions to the
+		 * file
+		 */
+		if (inode->i_size != BTRFS_I(inode)->disk_i_size) {
+			btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
+			mark_inode_dirty(inode);
+		}

+		if (num_written < 0) {
+			ret = num_written;
+			num_written = 0;
+			goto out;
+		} else if (num_written == count) {
+			/* pick up pos changes done by the generic code */
+			pos = *ppos;
+			goto out;
+		}
+		/*
+		 * We are going to do buffered for the rest of the range, so we
+		 * need to make sure to invalidate the buffered pages when we're
+		 * done.
+		 */
+		buffered = 1;
+		pos += num_written;
+	}
+
+	iov_iter_init(&i, iov, nr_segs, count, num_written);
+	nrptrs = min((iov_iter_count(&i) + PAGE_CACHE_SIZE - 1) /
+		     PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
+		     (sizeof(struct page *)));
 	pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);

 	/* generic_write_checks can change our pos */
 	start_pos = pos;

-	BTRFS_I(inode)->sequence++;
 	first_index = pos >> PAGE_CACHE_SHIFT;
-	last_index = (pos + count) >> PAGE_CACHE_SHIFT;
+	last_index = (pos + iov_iter_count(&i)) >> PAGE_CACHE_SHIFT;

 	/*
 	 * there are lots of better ways to do this, but this code
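The O_DIRECT branch above treats a short direct write as a signal to finish the request through the page cache, recording that fact so the buffered pages can be invalidated afterwards. Reduced to its control flow (write_direct and write_buffered are stand-ins, not kernel APIs):

#include <assert.h>

/* Stand-ins: a direct write that completes only half of the request. */
static long write_direct(long pos, long len)   { return len / 2; }
static long write_buffered(long pos, long len) { return len; }

/* Mirrors btrfs_file_aio_write: try O_DIRECT first; a short result is
 * not an error, the remainder goes through the page cache and the
 * caller remembers to invalidate those buffered pages later. */
static long do_write(long pos, long count, int *buffered)
{
	long written = write_direct(pos, count);

	if (written < 0)
		return written;         /* hard error */
	if (written == count)
		return written;         /* fully done, all direct */

	*buffered = 1;                  /* tail went through the cache */
	pos += written;
	written += write_buffered(pos, count - written);
	return written;
}

int main(void)
{
	int buffered = 0;
	long n = do_write(0, 4096, &buffered);

	assert(n == 4096 && buffered == 1);
	return 0;
}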
@@ -900,7 +948,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 			unlock_page(pinned[0]);
 		}
 	}
-	if ((pos + count) & (PAGE_CACHE_SIZE - 1)) {
+	if ((pos + iov_iter_count(&i)) & (PAGE_CACHE_SIZE - 1)) {
 		pinned[1] = grab_cache_page(inode->i_mapping, last_index);
 		if (!PageUptodate(pinned[1])) {
 			ret = btrfs_readpage(NULL, pinned[1]);
@@ -911,10 +959,10 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 		}
 	}

-	while (count > 0) {
+	while (iov_iter_count(&i) > 0) {
 		size_t offset = pos & (PAGE_CACHE_SIZE - 1);
-		size_t write_bytes = min(count, nrptrs *
-					(size_t)PAGE_CACHE_SIZE -
-					offset);
+		size_t write_bytes = min(iov_iter_count(&i),
+					 nrptrs * (size_t)PAGE_CACHE_SIZE -
+					 offset);
 		size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >>
 					PAGE_CACHE_SHIFT;
@@ -922,7 +970,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 		WARN_ON(num_pages > nrptrs);
 		memset(pages, 0, sizeof(struct page *) * nrptrs);

-		ret = btrfs_check_data_free_space(root, inode, write_bytes);
+		ret = btrfs_delalloc_reserve_space(inode, write_bytes);
 		if (ret)
 			goto out;

@@ -930,26 +978,20 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 					pos, first_index, last_index,
 					write_bytes);
 		if (ret) {
-			btrfs_free_reserved_data_space(root, inode,
-						       write_bytes);
+			btrfs_delalloc_release_space(inode, write_bytes);
 			goto out;
 		}

 		ret = btrfs_copy_from_user(pos, num_pages,
-					   write_bytes, pages, buf);
-		if (ret) {
-			btrfs_free_reserved_data_space(root, inode,
-						       write_bytes);
-			btrfs_drop_pages(pages, num_pages);
-			goto out;
+					   write_bytes, pages, &i);
+		if (ret == 0) {
+			dirty_and_release_pages(NULL, root, file, pages,
+						num_pages, pos, write_bytes);
 		}

-		ret = dirty_and_release_pages(NULL, root, file, pages,
-					      num_pages, pos, write_bytes);
 		btrfs_drop_pages(pages, num_pages);
 		if (ret) {
-			btrfs_free_reserved_data_space(root, inode,
-						       write_bytes);
+			btrfs_delalloc_release_space(inode, write_bytes);
 			goto out;
 		}

@@ -965,8 +1007,6 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
 			btrfs_throttle(root);
 		}

-		buf += write_bytes;
-		count -= write_bytes;
 		pos += write_bytes;
 		num_written += write_bytes;

@@ -976,9 +1016,7 @@ out:
 	mutex_unlock(&inode->i_mutex);
 	if (ret)
 		err = ret;
-	btrfs_unreserve_metadata_for_delalloc(root, inode, 1);

-out_nolock:
 	kfree(pages);
 	if (pinned[0])
 		page_cache_release(pinned[0]);
@@ -1008,7 +1046,7 @@ out_nolock:
 		num_written = err;

 	if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
-		trans = btrfs_start_transaction(root, 1);
+		trans = btrfs_start_transaction(root, 0);
 		ret = btrfs_log_dentry_safe(trans, root,
 					    file->f_dentry);
 		if (ret == 0) {
@@ -1023,7 +1061,7 @@ out_nolock:
 			btrfs_end_transaction(trans, root);
 		}
 	}
-	if (file->f_flags & O_DIRECT) {
+	if (file->f_flags & O_DIRECT && buffered) {
 		invalidate_mapping_pages(inode->i_mapping,
 			start_pos >> PAGE_CACHE_SHIFT,
 			(start_pos + num_written - 1) >> PAGE_CACHE_SHIFT);
@@ -1063,8 +1101,9 @@ int btrfs_release_file(struct inode *inode, struct file *filp)
  * important optimization for directories because holding the mutex prevents
  * new operations on the dir while we write to disk.
  */
-int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
+int btrfs_sync_file(struct file *file, int datasync)
 {
+	struct dentry *dentry = file->f_path.dentry;
 	struct inode *inode = dentry->d_inode;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	int ret = 0;
@@ -1101,12 +1140,12 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
 	/*
 	 * ok we haven't committed the transaction yet, lets do a commit
 	 */
-	if (file && file->private_data)
+	if (file->private_data)
 		btrfs_ioctl_trans_end(file);

-	trans = btrfs_start_transaction(root, 1);
-	if (!trans) {
-		ret = -ENOMEM;
+	trans = btrfs_start_transaction(root, 0);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
 		goto out;
 	}

@@ -1151,17 +1190,25 @@ static const struct vm_operations_struct btrfs_file_vm_ops = {

 static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
 {
-	vma->vm_ops = &btrfs_file_vm_ops;
+	struct address_space *mapping = filp->f_mapping;
+
+	if (!mapping->a_ops->readpage)
+		return -ENOEXEC;
+
 	file_accessed(filp);
+	vma->vm_ops = &btrfs_file_vm_ops;
+	vma->vm_flags |= VM_CAN_NONLINEAR;
+
 	return 0;
 }

 const struct file_operations btrfs_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= do_sync_read,
+	.write		= do_sync_write,
 	.aio_read	= generic_file_aio_read,
 	.splice_read	= generic_file_splice_read,
-	.write		= btrfs_file_write,
+	.aio_write	= btrfs_file_aio_write,
 	.mmap		= btrfs_file_mmap,
 	.open		= generic_file_open,
 	.release	= btrfs_release_file,
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 72ce3c173d6a..64f1150bb48d 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -49,6 +49,33 @@ static int find_name_in_backref(struct btrfs_path *path, const char *name,
 	return 0;
 }

+struct btrfs_inode_ref *
+btrfs_lookup_inode_ref(struct btrfs_trans_handle *trans,
+		       struct btrfs_root *root,
+		       struct btrfs_path *path,
+		       const char *name, int name_len,
+		       u64 inode_objectid, u64 ref_objectid, int mod)
+{
+	struct btrfs_key key;
+	struct btrfs_inode_ref *ref;
+	int ins_len = mod < 0 ? -1 : 0;
+	int cow = mod != 0;
+	int ret;
+
+	key.objectid = inode_objectid;
+	key.type = BTRFS_INODE_REF_KEY;
+	key.offset = ref_objectid;
+
+	ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
+	if (ret < 0)
+		return ERR_PTR(ret);
+	if (ret > 0)
+		return NULL;
+	if (!find_name_in_backref(path, name, name_len, &ref))
+		return NULL;
+	return ref;
+}
+
 int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root,
 			const char *name, int name_len,
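The new btrfs_lookup_inode_ref follows the convention of ERR_PTR() for hard failures and NULL for a clean miss, which the unlink fast path added later in this diff depends on distinguishing. A self-contained caller skeleton under those conventions (the lookup and error values are illustrative):

#include <errno.h>
#include <stdio.h>

/* Minimal ERR_PTR machinery in the style of include/linux/err.h. */
static inline void *ERR_PTR(long e) { return (void *)e; }
static inline long PTR_ERR(void *p) { return (long)p; }
static inline int IS_ERR(void *p)
{
	return (unsigned long)p >= (unsigned long)-4095;
}

struct ref { int index; };

/* Stand-in lookup: errors become ERR_PTR, "not found" becomes NULL. */
static struct ref *lookup_ref(int key)
{
	static struct ref r = { 42 };

	if (key < 0)
		return ERR_PTR(-EIO);
	if (key == 0)
		return NULL;
	return &r;
}

int main(void)
{
	struct ref *ref = lookup_ref(1);

	if (IS_ERR(ref))
		return (int)-PTR_ERR(ref);  /* hard failure */
	if (!ref)
		return 0;                   /* clean miss */
	printf("index %d\n", ref->index);
	return 0;
}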
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d601629b85d1..1bff92ad4744 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -252,6 +252,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
 				   inline_len, compressed_size,
 				   compressed_pages);
 	BUG_ON(ret);
+	btrfs_delalloc_release_metadata(inode, end + 1 - start);
 	btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
 	return 0;
 }
@@ -414,6 +415,7 @@ again:
 		trans = btrfs_join_transaction(root, 1);
 		BUG_ON(!trans);
 		btrfs_set_trans_block_group(trans, inode);
+		trans->block_rsv = &root->fs_info->delalloc_block_rsv;

 		/* lets try to make an inline extent */
 		if (ret || total_in < (actual_end - start)) {
@@ -439,7 +441,6 @@ again:
 				     start, end, NULL,
 				     EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
 				     EXTENT_CLEAR_DELALLOC |
-				     EXTENT_CLEAR_ACCOUNTING |
 				     EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK);

 			btrfs_end_transaction(trans, root);
@@ -697,6 +698,38 @@ retry:
 	return 0;
 }

+static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
+				      u64 num_bytes)
+{
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	struct extent_map *em;
+	u64 alloc_hint = 0;
+
+	read_lock(&em_tree->lock);
+	em = search_extent_mapping(em_tree, start, num_bytes);
+	if (em) {
+		/*
+		 * if block start isn't an actual block number then find the
+		 * first block in this inode and use that as a hint.  If that
+		 * block is also bogus then just don't worry about it.
+		 */
+		if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
+			free_extent_map(em);
+			em = search_extent_mapping(em_tree, 0, 0);
+			if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
+				alloc_hint = em->block_start;
+			if (em)
+				free_extent_map(em);
+		} else {
+			alloc_hint = em->block_start;
+			free_extent_map(em);
+		}
+	}
+	read_unlock(&em_tree->lock);
+
+	return alloc_hint;
+}
+
 /*
  * when extent_io.c finds a delayed allocation range in the file,
  * the call backs end up in this code.  The basic idea is to
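get_extent_allocation_hint, factored out above, is a three-step preference: a real block number for the range being written, else the inode's first real mapping, else no hint. The same decision tree over plain values (EXTENT_MAP_LAST_BYTE modeled as a sentinel):

#include <assert.h>

#define LAST_BYTE 1000   /* models EXTENT_MAP_LAST_BYTE: not a real block */

struct em { long block_start; };

/* Hint selection as in get_extent_allocation_hint: use the mapping for
 * this range if its block number is real, else try the inode's first
 * mapping, else offer no hint at all. */
static long pick_hint(const struct em *range, const struct em *first)
{
	if (range && range->block_start < LAST_BYTE)
		return range->block_start;
	if (first && first->block_start < LAST_BYTE)
		return first->block_start;
	return 0;
}

int main(void)
{
	struct em real = { 512 }, bogus = { LAST_BYTE + 1 };

	assert(pick_hint(&real, NULL) == 512);      /* direct hit */
	assert(pick_hint(&bogus, &real) == 512);    /* fall back */
	assert(pick_hint(&bogus, &bogus) == 0);     /* give up */
	return 0;
}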
@@ -734,6 +767,7 @@ static noinline int cow_file_range(struct inode *inode,
 	trans = btrfs_join_transaction(root, 1);
 	BUG_ON(!trans);
 	btrfs_set_trans_block_group(trans, inode);
+	trans->block_rsv = &root->fs_info->delalloc_block_rsv;

 	actual_end = min_t(u64, isize, end + 1);

@@ -753,7 +787,6 @@ static noinline int cow_file_range(struct inode *inode,
 				     EXTENT_CLEAR_UNLOCK_PAGE |
 				     EXTENT_CLEAR_UNLOCK |
 				     EXTENT_CLEAR_DELALLOC |
-				     EXTENT_CLEAR_ACCOUNTING |
 				     EXTENT_CLEAR_DIRTY |
 				     EXTENT_SET_WRITEBACK |
 				     EXTENT_END_WRITEBACK);
@@ -769,29 +802,7 @@ static noinline int cow_file_range(struct inode *inode,
 	BUG_ON(disk_num_bytes >
 	       btrfs_super_total_bytes(&root->fs_info->super_copy));

-
-	read_lock(&BTRFS_I(inode)->extent_tree.lock);
-	em = search_extent_mapping(&BTRFS_I(inode)->extent_tree,
-				   start, num_bytes);
-	if (em) {
-		/*
-		 * if block start isn't an actual block number then find the
-		 * first block in this inode and use that as a hint.  If that
-		 * block is also bogus then just don't worry about it.
-		 */
-		if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
-			free_extent_map(em);
-			em = search_extent_mapping(em_tree, 0, 0);
-			if (em && em->block_start < EXTENT_MAP_LAST_BYTE)
-				alloc_hint = em->block_start;
-			if (em)
-				free_extent_map(em);
-		} else {
-			alloc_hint = em->block_start;
-			free_extent_map(em);
-		}
-	}
-	read_unlock(&BTRFS_I(inode)->extent_tree.lock);
+	alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
 	btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);

 	while (disk_num_bytes > 0) {
@@ -1174,6 +1185,13 @@ out_check:
 					       num_bytes, num_bytes, type);
 		BUG_ON(ret);

+		if (root->root_key.objectid ==
+		    BTRFS_DATA_RELOC_TREE_OBJECTID) {
+			ret = btrfs_reloc_clone_csums(inode, cur_offset,
+						      num_bytes);
+			BUG_ON(ret);
+		}
+
 		extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
 				cur_offset, cur_offset + num_bytes - 1,
 				locked_page, EXTENT_CLEAR_UNLOCK_PAGE |
@@ -1226,15 +1244,13 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
 }

 static int btrfs_split_extent_hook(struct inode *inode,
 				   struct extent_state *orig, u64 split)
 {
+	/* not delalloc, ignore it */
 	if (!(orig->state & EXTENT_DELALLOC))
 		return 0;

-	spin_lock(&BTRFS_I(inode)->accounting_lock);
-	BTRFS_I(inode)->outstanding_extents++;
-	spin_unlock(&BTRFS_I(inode)->accounting_lock);
-
+	atomic_inc(&BTRFS_I(inode)->outstanding_extents);
 	return 0;
 }

@@ -1252,10 +1268,7 @@ static int btrfs_merge_extent_hook(struct inode *inode,
 	if (!(other->state & EXTENT_DELALLOC))
 		return 0;

-	spin_lock(&BTRFS_I(inode)->accounting_lock);
-	BTRFS_I(inode)->outstanding_extents--;
-	spin_unlock(&BTRFS_I(inode)->accounting_lock);
-
+	atomic_dec(&BTRFS_I(inode)->outstanding_extents);
 	return 0;
 }

@@ -1264,8 +1277,8 @@ static int btrfs_merge_extent_hook(struct inode *inode,
  * bytes in this file, and to maintain the list of inodes that
  * have pending delalloc work to be done.
  */
-static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
-			      unsigned long old, unsigned long bits)
+static int btrfs_set_bit_hook(struct inode *inode,
+			      struct extent_state *state, int *bits)
 {

 	/*
@@ -1273,17 +1286,18 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
 	 * but in this case, we are only testeing for the DELALLOC
 	 * bit, which is only set or cleared with irqs on
 	 */
-	if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
+	if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
 		struct btrfs_root *root = BTRFS_I(inode)->root;
+		u64 len = state->end + 1 - state->start;

-		spin_lock(&BTRFS_I(inode)->accounting_lock);
-		BTRFS_I(inode)->outstanding_extents++;
-		spin_unlock(&BTRFS_I(inode)->accounting_lock);
-		btrfs_delalloc_reserve_space(root, inode, end - start + 1);
+		if (*bits & EXTENT_FIRST_DELALLOC)
+			*bits &= ~EXTENT_FIRST_DELALLOC;
+		else
+			atomic_inc(&BTRFS_I(inode)->outstanding_extents);

 		spin_lock(&root->fs_info->delalloc_lock);
-		BTRFS_I(inode)->delalloc_bytes += end - start + 1;
-		root->fs_info->delalloc_bytes += end - start + 1;
+		BTRFS_I(inode)->delalloc_bytes += len;
+		root->fs_info->delalloc_bytes += len;
 		if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
 			list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
 				      &root->fs_info->delalloc_inodes);
@@ -1297,45 +1311,32 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
  * extent_io.c clear_bit_hook, see set_bit_hook for why
  */
 static int btrfs_clear_bit_hook(struct inode *inode,
-				struct extent_state *state, unsigned long bits)
+				struct extent_state *state, int *bits)
 {
 	/*
 	 * set_bit and clear bit hooks normally require _irqsave/restore
 	 * but in this case, we are only testeing for the DELALLOC
 	 * bit, which is only set or cleared with irqs on
 	 */
-	if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
+	if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
 		struct btrfs_root *root = BTRFS_I(inode)->root;
+		u64 len = state->end + 1 - state->start;

-		if (bits & EXTENT_DO_ACCOUNTING) {
-			spin_lock(&BTRFS_I(inode)->accounting_lock);
-			WARN_ON(!BTRFS_I(inode)->outstanding_extents);
-			BTRFS_I(inode)->outstanding_extents--;
-			spin_unlock(&BTRFS_I(inode)->accounting_lock);
-			btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
-		}
+		if (*bits & EXTENT_FIRST_DELALLOC)
+			*bits &= ~EXTENT_FIRST_DELALLOC;
+		else if (!(*bits & EXTENT_DO_ACCOUNTING))
+			atomic_dec(&BTRFS_I(inode)->outstanding_extents);
+
+		if (*bits & EXTENT_DO_ACCOUNTING)
+			btrfs_delalloc_release_metadata(inode, len);
+
+		if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID)
+			btrfs_free_reserved_data_space(inode, len);

 		spin_lock(&root->fs_info->delalloc_lock);
-		if (state->end - state->start + 1 >
-		    root->fs_info->delalloc_bytes) {
-			printk(KERN_INFO "btrfs warning: delalloc account "
-			       "%llu %llu\n",
-			       (unsigned long long)
-			       state->end - state->start + 1,
-			       (unsigned long long)
-			       root->fs_info->delalloc_bytes);
-			btrfs_delalloc_free_space(root, inode, (u64)-1);
-			root->fs_info->delalloc_bytes = 0;
-			BTRFS_I(inode)->delalloc_bytes = 0;
-		} else {
-			btrfs_delalloc_free_space(root, inode,
-						  state->end -
-						  state->start + 1);
-			root->fs_info->delalloc_bytes -= state->end -
-				state->start + 1;
-			BTRFS_I(inode)->delalloc_bytes -= state->end -
-				state->start + 1;
-		}
+		root->fs_info->delalloc_bytes -= len;
+		BTRFS_I(inode)->delalloc_bytes -= len;
+
 		if (BTRFS_I(inode)->delalloc_bytes == 0 &&
 		    !list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
 			list_del_init(&BTRFS_I(inode)->delalloc_inodes);
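On the clear side, the rewritten hook decrements the atomic extent count only for ordinary clears — a clear flagged EXTENT_DO_ACCOUNTING instead releases the metadata reservation, and EXTENT_FIRST_DELALLOC is again consumed rather than counted. A compact model of that branching (flag values invented for the sketch):

#include <assert.h>

#define DELALLOC       (1 << 0)
#define FIRST_DELALLOC (1 << 1)
#define DO_ACCOUNTING  (1 << 2)

static int outstanding;
static long reserved;

/* Clear-side mirror of btrfs_clear_bit_hook: the extent count drops
 * only for a plain clear; an accounting clear releases the metadata
 * reservation instead of touching the count. */
static void clear_bit_hook(int *bits, long len)
{
	if (!(*bits & DELALLOC))
		return;
	if (*bits & FIRST_DELALLOC)
		*bits &= ~FIRST_DELALLOC;
	else if (!(*bits & DO_ACCOUNTING))
		outstanding--;
	if (*bits & DO_ACCOUNTING)
		reserved -= len;
}

int main(void)
{
	int bits;

	outstanding = 1;
	reserved = 4096;

	bits = DELALLOC | DO_ACCOUNTING;
	clear_bit_hook(&bits, 4096);   /* releases reservation, keeps count */
	assert(outstanding == 1 && reserved == 0);

	bits = DELALLOC;
	clear_bit_hook(&bits, 0);      /* plain clear drops the count */
	assert(outstanding == 0);
	return 0;
}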
@@ -1384,7 +1385,8 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
  */
 static int __btrfs_submit_bio_start(struct inode *inode, int rw,
 				    struct bio *bio, int mirror_num,
-				    unsigned long bio_flags)
+				    unsigned long bio_flags,
+				    u64 bio_offset)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	int ret = 0;
@@ -1403,7 +1405,8 @@ static int __btrfs_submit_bio_start(struct inode *inode, int rw,
  * are inserted into the btree
  */
 static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio,
-			  int mirror_num, unsigned long bio_flags)
+			  int mirror_num, unsigned long bio_flags,
+			  u64 bio_offset)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	return btrfs_map_bio(root, rw, bio, mirror_num, 1);
@@ -1414,7 +1417,8 @@ static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio
  * on write, or reading the csums from the tree before a read
  */
 static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
-			  int mirror_num, unsigned long bio_flags)
+			  int mirror_num, unsigned long bio_flags,
+			  u64 bio_offset)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	int ret = 0;
@@ -1439,7 +1443,8 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 		/* we're doing a write, do the async checksumming */
 		return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
 				   inode, rw, bio, mirror_num,
-				   bio_flags, __btrfs_submit_bio_start,
+				   bio_flags, bio_offset,
+				   __btrfs_submit_bio_start,
 				   __btrfs_submit_bio_done);
 	}

@@ -1520,6 +1525,7 @@ again:
 		goto again;
 	}

+	BUG();
 	btrfs_set_extent_delalloc(inode, page_start, page_end, &cached_state);
 	ClearPageChecked(page);
 out:
@@ -1650,7 +1656,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
 static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct btrfs_trans_handle *trans;
+	struct btrfs_trans_handle *trans = NULL;
 	struct btrfs_ordered_extent *ordered_extent = NULL;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	struct extent_state *cached_state = NULL;
@@ -1668,9 +1674,10 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 		ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
 		if (!ret) {
 			trans = btrfs_join_transaction(root, 1);
+			btrfs_set_trans_block_group(trans, inode);
+			trans->block_rsv = &root->fs_info->delalloc_block_rsv;
 			ret = btrfs_update_inode(trans, root, inode);
 			BUG_ON(ret);
-			btrfs_end_transaction(trans, root);
 		}
 		goto out;
 	}
@@ -1680,6 +1687,8 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 			 0, &cached_state, GFP_NOFS);

 	trans = btrfs_join_transaction(root, 1);
+	btrfs_set_trans_block_group(trans, inode);
+	trans->block_rsv = &root->fs_info->delalloc_block_rsv;

 	if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
 		compressed = 1;
@@ -1711,12 +1720,13 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 	add_pending_csums(trans, inode, ordered_extent->file_offset,
 			  &ordered_extent->list);

-	/* this also removes the ordered extent from the tree */
 	btrfs_ordered_update_i_size(inode, 0, ordered_extent);
 	ret = btrfs_update_inode(trans, root, inode);
 	BUG_ON(ret);
-	btrfs_end_transaction(trans, root);
 out:
+	btrfs_delalloc_release_metadata(inode, ordered_extent->len);
+	if (trans)
+		btrfs_end_transaction(trans, root);
 	/* once for us */
 	btrfs_put_ordered_extent(ordered_extent);
 	/* once for the tree */
@@ -1838,7 +1848,7 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,

 	BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
 						failrec->last_mirror,
-						failrec->bio_flags);
+						failrec->bio_flags, 0);
 	return 0;
 }

@@ -1993,32 +2003,196 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
 }

 /*
+ * calculate extra metadata reservation when snapshotting a subvolume
+ * contains orphan files.
+ */
+void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
+				struct btrfs_pending_snapshot *pending,
+				u64 *bytes_to_reserve)
+{
+	struct btrfs_root *root;
+	struct btrfs_block_rsv *block_rsv;
+	u64 num_bytes;
+	int index;
+
+	root = pending->root;
+	if (!root->orphan_block_rsv || list_empty(&root->orphan_list))
+		return;
+
+	block_rsv = root->orphan_block_rsv;
+
+	/* orphan block reservation for the snapshot */
+	num_bytes = block_rsv->size;
+
+	/*
+	 * after the snapshot is created, COWing tree blocks may use more
+	 * space than it frees. So we should make sure there is enough
+	 * reserved space.
+	 */
+	index = trans->transid & 0x1;
+	if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) {
+		num_bytes += block_rsv->size -
+			     (block_rsv->reserved + block_rsv->freed[index]);
+	}
+
+	*bytes_to_reserve += num_bytes;
+}
+
+void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
+				struct btrfs_pending_snapshot *pending)
+{
+	struct btrfs_root *root = pending->root;
+	struct btrfs_root *snap = pending->snap;
+	struct btrfs_block_rsv *block_rsv;
+	u64 num_bytes;
+	int index;
+	int ret;
+
+	if (!root->orphan_block_rsv || list_empty(&root->orphan_list))
+		return;
+
+	/* refill source subvolume's orphan block reservation */
+	block_rsv = root->orphan_block_rsv;
+	index = trans->transid & 0x1;
+	if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) {
+		num_bytes = block_rsv->size -
+			    (block_rsv->reserved + block_rsv->freed[index]);
+		ret = btrfs_block_rsv_migrate(&pending->block_rsv,
+					      root->orphan_block_rsv,
+					      num_bytes);
+		BUG_ON(ret);
+	}
+
+	/* setup orphan block reservation for the snapshot */
+	block_rsv = btrfs_alloc_block_rsv(snap);
+	BUG_ON(!block_rsv);
+
+	btrfs_add_durable_block_rsv(root->fs_info, block_rsv);
+	snap->orphan_block_rsv = block_rsv;
+
+	num_bytes = root->orphan_block_rsv->size;
+	ret = btrfs_block_rsv_migrate(&pending->block_rsv,
+				      block_rsv, num_bytes);
+	BUG_ON(ret);
+
+#if 0
+	/* insert orphan item for the snapshot */
+	WARN_ON(!root->orphan_item_inserted);
+	ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
+				       snap->root_key.objectid);
+	BUG_ON(ret);
+	snap->orphan_item_inserted = 1;
+#endif
+}
+
+enum btrfs_orphan_cleanup_state {
+	ORPHAN_CLEANUP_STARTED	= 1,
+	ORPHAN_CLEANUP_DONE	= 2,
+};
+
+/*
+ * This is called in transaction commmit time. If there are no orphan
+ * files in the subvolume, it removes orphan item and frees block_rsv
+ * structure.
+ */
+void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
+			      struct btrfs_root *root)
+{
+	int ret;
+
+	if (!list_empty(&root->orphan_list) ||
+	    root->orphan_cleanup_state != ORPHAN_CLEANUP_DONE)
+		return;
+
+	if (root->orphan_item_inserted &&
+	    btrfs_root_refs(&root->root_item) > 0) {
+		ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root,
+					    root->root_key.objectid);
+		BUG_ON(ret);
+		root->orphan_item_inserted = 0;
+	}
+
+	if (root->orphan_block_rsv) {
+		WARN_ON(root->orphan_block_rsv->size > 0);
+		btrfs_free_block_rsv(root, root->orphan_block_rsv);
+		root->orphan_block_rsv = NULL;
+	}
+}
+
+/*
  * This creates an orphan entry for the given inode in case something goes
  * wrong in the middle of an unlink/truncate.
+ *
+ * NOTE: caller of this function should reserve 5 units of metadata for
+ * this function.
  */
 int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	int ret = 0;
+	struct btrfs_block_rsv *block_rsv = NULL;
+	int reserve = 0;
+	int insert = 0;
+	int ret;

-	spin_lock(&root->list_lock);
+	if (!root->orphan_block_rsv) {
+		block_rsv = btrfs_alloc_block_rsv(root);
+		BUG_ON(!block_rsv);
+	}

-	/* already on the orphan list, we're good */
-	if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
-		spin_unlock(&root->list_lock);
-		return 0;
+	spin_lock(&root->orphan_lock);
+	if (!root->orphan_block_rsv) {
+		root->orphan_block_rsv = block_rsv;
+	} else if (block_rsv) {
+		btrfs_free_block_rsv(root, block_rsv);
+		block_rsv = NULL;
+	}
+
+	if (list_empty(&BTRFS_I(inode)->i_orphan)) {
+		list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
+#if 0
+		/*
+		 * For proper ENOSPC handling, we should do orphan
+		 * cleanup when mounting. But this introduces backward
+		 * compatibility issue.
+		 */
+		if (!xchg(&root->orphan_item_inserted, 1))
+			insert = 2;
+		else
+			insert = 1;
+#endif
+		insert = 1;
+	} else {
+		WARN_ON(!BTRFS_I(inode)->orphan_meta_reserved);
 	}

-	list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
+	if (!BTRFS_I(inode)->orphan_meta_reserved) {
+		BTRFS_I(inode)->orphan_meta_reserved = 1;
+		reserve = 1;
+	}
+	spin_unlock(&root->orphan_lock);

-	spin_unlock(&root->list_lock);
+	if (block_rsv)
+		btrfs_add_durable_block_rsv(root->fs_info, block_rsv);

-	/*
-	 * insert an orphan item to track this unlinked/truncated file
-	 */
-	ret = btrfs_insert_orphan_item(trans, root, inode->i_ino);
+	/* grab metadata reservation from transaction handle */
+	if (reserve) {
+		ret = btrfs_orphan_reserve_metadata(trans, inode);
+		BUG_ON(ret);
+	}

-	return ret;
+	/* insert an orphan item to track this unlinked/truncated file */
+	if (insert >= 1) {
+		ret = btrfs_insert_orphan_item(trans, root, inode->i_ino);
+		BUG_ON(ret);
+	}
+
+	/* insert an orphan item to track subvolume contains orphan files */
+	if (insert >= 2) {
+		ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
+					       root->root_key.objectid);
+		BUG_ON(ret);
+	}
+	return 0;
 }

 /*
@@ -2028,26 +2202,31 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
 int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
+	int delete_item = 0;
+	int release_rsv = 0;
 	int ret = 0;

-	spin_lock(&root->list_lock);
-
-	if (list_empty(&BTRFS_I(inode)->i_orphan)) {
-		spin_unlock(&root->list_lock);
-		return 0;
+	spin_lock(&root->orphan_lock);
+	if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
+		list_del_init(&BTRFS_I(inode)->i_orphan);
+		delete_item = 1;
 	}

-	list_del_init(&BTRFS_I(inode)->i_orphan);
-	if (!trans) {
-		spin_unlock(&root->list_lock);
-		return 0;
+	if (BTRFS_I(inode)->orphan_meta_reserved) {
+		BTRFS_I(inode)->orphan_meta_reserved = 0;
+		release_rsv = 1;
 	}
+	spin_unlock(&root->orphan_lock);

-	spin_unlock(&root->list_lock);
+	if (trans && delete_item) {
+		ret = btrfs_del_orphan_item(trans, root, inode->i_ino);
+		BUG_ON(ret);
+	}

-	ret = btrfs_del_orphan_item(trans, root, inode->i_ino);
+	if (release_rsv)
+		btrfs_orphan_release_metadata(inode);

-	return ret;
+	return 0;
 }

 /*
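Both orphan helpers above follow the same shape: decide everything under orphan_lock by flipping flags, then do the expensive tree and reservation work after dropping it, which is what makes repeated add/del calls harmless. A reduced model with a pthread mutex standing in for the spinlock and comments marking where the real calls would go:

#include <assert.h>
#include <pthread.h>

static pthread_mutex_t orphan_lock = PTHREAD_MUTEX_INITIALIZER;
static int on_list, meta_reserved;

/* Decide under the lock, act outside it — the shape of
 * btrfs_orphan_add in the hunk above. */
static void orphan_add(void)
{
	int insert = 0, reserve = 0;

	pthread_mutex_lock(&orphan_lock);
	if (!on_list) {
		on_list = 1;
		insert = 1;
	}
	if (!meta_reserved) {
		meta_reserved = 1;
		reserve = 1;
	}
	pthread_mutex_unlock(&orphan_lock);

	if (reserve) { /* btrfs_orphan_reserve_metadata(...) */ }
	if (insert)  { /* btrfs_insert_orphan_item(...) */ }
}

/* Mirror image of orphan_add, as in btrfs_orphan_del. */
static void orphan_del(void)
{
	int del = 0, release = 0;

	pthread_mutex_lock(&orphan_lock);
	if (on_list) {
		on_list = 0;
		del = 1;
	}
	if (meta_reserved) {
		meta_reserved = 0;
		release = 1;
	}
	pthread_mutex_unlock(&orphan_lock);

	if (del)     { /* btrfs_del_orphan_item(...) */ }
	if (release) { /* btrfs_orphan_release_metadata(...) */ }
}

int main(void)
{
	orphan_add();
	orphan_add();            /* second add is a no-op */
	assert(on_list && meta_reserved);
	orphan_del();
	assert(!on_list && !meta_reserved);
	return 0;
}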
@@ -2064,7 +2243,7 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
 	struct inode *inode;
 	int ret = 0, nr_unlink = 0, nr_truncate = 0;

-	if (!xchg(&root->clean_orphans, 0))
+	if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
 		return;

 	path = btrfs_alloc_path();
@@ -2117,16 +2296,15 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
 		found_key.type = BTRFS_INODE_ITEM_KEY;
 		found_key.offset = 0;
 		inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
-		if (IS_ERR(inode))
-			break;
+		BUG_ON(IS_ERR(inode));

 		/*
 		 * add this inode to the orphan list so btrfs_orphan_del does
 		 * the proper thing when we hit it
 		 */
-		spin_lock(&root->list_lock);
+		spin_lock(&root->orphan_lock);
 		list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
-		spin_unlock(&root->list_lock);
+		spin_unlock(&root->orphan_lock);

 		/*
 		 * if this is a bad inode, means we actually succeeded in
@@ -2135,7 +2313,7 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
 		 * do a destroy_inode
 		 */
 		if (is_bad_inode(inode)) {
-			trans = btrfs_start_transaction(root, 1);
+			trans = btrfs_start_transaction(root, 0);
 			btrfs_orphan_del(trans, inode);
 			btrfs_end_transaction(trans, root);
 			iput(inode);
@@ -2153,13 +2331,23 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
 		/* this will do delete_inode and everything for us */
 		iput(inode);
 	}
+	btrfs_free_path(path);
+
+	root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;
+
+	if (root->orphan_block_rsv)
+		btrfs_block_rsv_release(root, root->orphan_block_rsv,
+					(u64)-1);
+
+	if (root->orphan_block_rsv || root->orphan_item_inserted) {
+		trans = btrfs_join_transaction(root, 1);
+		btrfs_end_transaction(trans, root);
+	}

 	if (nr_unlink)
 		printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink);
 	if (nr_truncate)
 		printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate);
-
-	btrfs_free_path(path);
 }

 /*
@@ -2478,29 +2666,201 @@ out:
2478 return ret; 2666 return ret;
2479} 2667}
2480 2668
2481static int btrfs_unlink(struct inode *dir, struct dentry *dentry) 2669/* helper to check if there is any shared block in the path */
2670static int check_path_shared(struct btrfs_root *root,
2671 struct btrfs_path *path)
2672{
2673 struct extent_buffer *eb;
2674 int level;
2675 int ret;
2676 u64 refs = 1;
2677
2678 for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
2679 if (!path->nodes[level])
2680 break;
2681 eb = path->nodes[level];
2682 if (!btrfs_block_can_be_shared(root, eb))
2683 continue;
2684 ret = btrfs_lookup_extent_info(NULL, root, eb->start, eb->len,
2685 &refs, NULL);
2686 if (refs > 1)
2687 return 1;
2688 }
2689 return 0;
2690}
2691
2692/*
2693 * helper to start transaction for unlink and rmdir.
2694 *
2695 * unlink and rmdir are special in btrfs, they do not always free space.
2696 * so in enospc case, we should make sure they will free space before
2697 * allowing them to use the global metadata reservation.
2698 */
2699static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
2700 struct dentry *dentry)
2482{ 2701{
2483 struct btrfs_root *root;
2484 struct btrfs_trans_handle *trans; 2702 struct btrfs_trans_handle *trans;
2703 struct btrfs_root *root = BTRFS_I(dir)->root;
2704 struct btrfs_path *path;
2705 struct btrfs_inode_ref *ref;
2706 struct btrfs_dir_item *di;
2485 struct inode *inode = dentry->d_inode; 2707 struct inode *inode = dentry->d_inode;
2708 u64 index;
2709 int check_link = 1;
2710 int err = -ENOSPC;
2486 int ret; 2711 int ret;
2487 unsigned long nr = 0;
2488 2712
2489 root = BTRFS_I(dir)->root; 2713 trans = btrfs_start_transaction(root, 10);
2714 if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
2715 return trans;
2490 2716
2491 /* 2717 if (inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
2492 * 5 items for unlink inode 2718 return ERR_PTR(-ENOSPC);
2493 * 1 for orphan 2719
2494 */ 2720 /* check if there is someone else holds reference */
2495 ret = btrfs_reserve_metadata_space(root, 6); 2721 if (S_ISDIR(inode->i_mode) && atomic_read(&inode->i_count) > 1)
2496 if (ret) 2722 return ERR_PTR(-ENOSPC);
2497 return ret; 2723
2724 if (atomic_read(&inode->i_count) > 2)
2725 return ERR_PTR(-ENOSPC);
2726
2727 if (xchg(&root->fs_info->enospc_unlink, 1))
2728 return ERR_PTR(-ENOSPC);
2498 2729
2499 trans = btrfs_start_transaction(root, 1); 2730 path = btrfs_alloc_path();
2731 if (!path) {
2732 root->fs_info->enospc_unlink = 0;
2733 return ERR_PTR(-ENOMEM);
2734 }
2735
2736 trans = btrfs_start_transaction(root, 0);
2500 if (IS_ERR(trans)) { 2737 if (IS_ERR(trans)) {
2501 btrfs_unreserve_metadata_space(root, 6); 2738 btrfs_free_path(path);
2502 return PTR_ERR(trans); 2739 root->fs_info->enospc_unlink = 0;
2740 return trans;
2741 }
2742
2743 path->skip_locking = 1;
2744 path->search_commit_root = 1;
2745
2746 ret = btrfs_lookup_inode(trans, root, path,
2747 &BTRFS_I(dir)->location, 0);
2748 if (ret < 0) {
2749 err = ret;
2750 goto out;
2751 }
2752 if (ret == 0) {
2753 if (check_path_shared(root, path))
2754 goto out;
2755 } else {
2756 check_link = 0;
2757 }
2758 btrfs_release_path(root, path);
2759
2760 ret = btrfs_lookup_inode(trans, root, path,
2761 &BTRFS_I(inode)->location, 0);
2762 if (ret < 0) {
2763 err = ret;
2764 goto out;
2765 }
2766 if (ret == 0) {
2767 if (check_path_shared(root, path))
2768 goto out;
2769 } else {
2770 check_link = 0;
2771 }
2772 btrfs_release_path(root, path);
2773
2774 if (ret == 0 && S_ISREG(inode->i_mode)) {
2775 ret = btrfs_lookup_file_extent(trans, root, path,
2776 inode->i_ino, (u64)-1, 0);
2777 if (ret < 0) {
2778 err = ret;
2779 goto out;
2780 }
2781 BUG_ON(ret == 0);
2782 if (check_path_shared(root, path))
2783 goto out;
2784 btrfs_release_path(root, path);
2785 }
2786
2787 if (!check_link) {
2788 err = 0;
2789 goto out;
2790 }
2791
2792 di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
2793 dentry->d_name.name, dentry->d_name.len, 0);
2794 if (IS_ERR(di)) {
2795 err = PTR_ERR(di);
2796 goto out;
2797 }
2798 if (di) {
2799 if (check_path_shared(root, path))
2800 goto out;
2801 } else {
2802 err = 0;
2803 goto out;
2503 } 2804 }
2805 btrfs_release_path(root, path);
2806
2807 ref = btrfs_lookup_inode_ref(trans, root, path,
2808 dentry->d_name.name, dentry->d_name.len,
2809 inode->i_ino, dir->i_ino, 0);
2810 if (IS_ERR(ref)) {
2811 err = PTR_ERR(ref);
2812 goto out;
2813 }
2814 BUG_ON(!ref);
2815 if (check_path_shared(root, path))
2816 goto out;
2817 index = btrfs_inode_ref_index(path->nodes[0], ref);
2818 btrfs_release_path(root, path);
2819
2820 di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, index,
2821 dentry->d_name.name, dentry->d_name.len, 0);
2822 if (IS_ERR(di)) {
2823 err = PTR_ERR(di);
2824 goto out;
2825 }
2826 BUG_ON(ret == -ENOENT);
2827 if (check_path_shared(root, path))
2828 goto out;
2829
2830 err = 0;
2831 out:
2832 btrfs_free_path(path);
2833 if (err) {
2834 btrfs_end_transaction(trans, root);
2835 root->fs_info->enospc_unlink = 0;
2836 return ERR_PTR(err);
2837 }
2838
2839 trans->block_rsv = &root->fs_info->global_block_rsv;
2840 return trans;
2841 }
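
The new __unlink_start_trans() never returns NULL: it hands back either a usable transaction handle or an errno encoded into the pointer itself, which is why its callers only test IS_ERR() and unwrap with PTR_ERR(). A minimal user-space sketch of that kernel idiom, with ERR_PTR() and friends re-implemented for illustration (start_transaction and struct trans are stand-ins, not the kernel API):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

#define MAX_ERRNO 4095

static inline void *ERR_PTR(long error)     { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
	/* errno-encoded pointers occupy the top 4095 addresses */
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

struct trans { int num_items; };

static struct trans *start_transaction(int num_items)
{
	struct trans *t;

	if (num_items > 5)              /* pretend the reservation fails */
		return ERR_PTR(-ENOSPC);
	t = malloc(sizeof(*t));
	if (!t)
		return ERR_PTR(-ENOMEM);
	t->num_items = num_items;
	return t;
}

int main(void)
{
	struct trans *t = start_transaction(10);

	if (IS_ERR(t))
		printf("start_transaction failed: %ld\n", PTR_ERR(t));
	else
		free(t);
	return 0;
}
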
2842
2843 static void __unlink_end_trans(struct btrfs_trans_handle *trans,
2844 struct btrfs_root *root)
2845 {
2846 if (trans->block_rsv == &root->fs_info->global_block_rsv) {
2847 BUG_ON(!root->fs_info->enospc_unlink);
2848 root->fs_info->enospc_unlink = 0;
2849 }
2850 btrfs_end_transaction_throttle(trans, root);
2851 }
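
The xchg() on root->fs_info->enospc_unlink above acts as a one-word try-lock: only one unlink at a time may walk the expensive ENOSPC checks, and any concurrent caller is simply bounced with -ENOSPC until __unlink_end_trans() clears the word. The same shape in portable C11 atomics (toy names, not the kernel's primitives):

#include <stdatomic.h>
#include <stdio.h>

static atomic_int enospc_unlink;   /* stands in for fs_info->enospc_unlink */

/* Returns 1 if this caller won the flag, 0 if somebody already holds it. */
static int try_enter_enospc_path(void)
{
	return atomic_exchange(&enospc_unlink, 1) == 0;
}

static void leave_enospc_path(void)
{
	atomic_store(&enospc_unlink, 0);
}

int main(void)
{
	if (try_enter_enospc_path()) {
		printf("first caller owns the slow path\n");
		if (!try_enter_enospc_path())
			printf("second caller would get -ENOSPC\n");
		leave_enospc_path();
	}
	return 0;
}
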
2852
2853 static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
2854 {
2855 struct btrfs_root *root = BTRFS_I(dir)->root;
2856 struct btrfs_trans_handle *trans;
2857 struct inode *inode = dentry->d_inode;
2858 int ret;
2859 unsigned long nr = 0;
2860
2861 trans = __unlink_start_trans(dir, dentry);
2862 if (IS_ERR(trans))
2863 return PTR_ERR(trans);
2504 2864
2505 btrfs_set_trans_block_group(trans, dir); 2865 btrfs_set_trans_block_group(trans, dir);
2506 2866
@@ -2508,14 +2868,15 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
2508 2868
2509 ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, 2869 ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
2510 dentry->d_name.name, dentry->d_name.len); 2870 dentry->d_name.name, dentry->d_name.len);
2871 BUG_ON(ret);
2511 2872
2512 if (inode->i_nlink == 0) 2873 if (inode->i_nlink == 0) {
2513 ret = btrfs_orphan_add(trans, inode); 2874 ret = btrfs_orphan_add(trans, inode);
2875 BUG_ON(ret);
2876 }
2514 2877
2515 nr = trans->blocks_used; 2878 nr = trans->blocks_used;
2516 2879 __unlink_end_trans(trans, root);
2517 btrfs_end_transaction_throttle(trans, root);
2518 btrfs_unreserve_metadata_space(root, 6);
2519 btrfs_btree_balance_dirty(root, nr); 2880 btrfs_btree_balance_dirty(root, nr);
2520 return ret; 2881 return ret;
2521 } 2882 }
@@ -2587,7 +2948,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
2587 { 2948 {
2588 struct inode *inode = dentry->d_inode; 2949 struct inode *inode = dentry->d_inode;
2589 int err = 0; 2950 int err = 0;
2590 int ret;
2591 struct btrfs_root *root = BTRFS_I(dir)->root; 2951 struct btrfs_root *root = BTRFS_I(dir)->root;
2592 struct btrfs_trans_handle *trans; 2952 struct btrfs_trans_handle *trans;
2593 unsigned long nr = 0; 2953 unsigned long nr = 0;
@@ -2596,15 +2956,9 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
2596 inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) 2956 inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
2597 return -ENOTEMPTY; 2957 return -ENOTEMPTY;
2598 2958
2599 ret = btrfs_reserve_metadata_space(root, 5); 2959 trans = __unlink_start_trans(dir, dentry);
2600 if (ret) 2960 if (IS_ERR(trans))
2601 return ret;
2602
2603 trans = btrfs_start_transaction(root, 1);
2604 if (IS_ERR(trans)) {
2605 btrfs_unreserve_metadata_space(root, 5);
2606 return PTR_ERR(trans); 2961 return PTR_ERR(trans);
2607 }
2608 2962
2609 btrfs_set_trans_block_group(trans, dir); 2963 btrfs_set_trans_block_group(trans, dir);
2610 2964
@@ -2627,12 +2981,9 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
2627 btrfs_i_size_write(inode, 0); 2981 btrfs_i_size_write(inode, 0);
2629 out: 2982 out:
2629 nr = trans->blocks_used; 2983 nr = trans->blocks_used;
2630 ret = btrfs_end_transaction_throttle(trans, root); 2984 __unlink_end_trans(trans, root);
2631 btrfs_unreserve_metadata_space(root, 5);
2632 btrfs_btree_balance_dirty(root, nr); 2985 btrfs_btree_balance_dirty(root, nr);
2633 2986
2634 if (ret && !err)
2635 err = ret;
2636 return err; 2987 return err;
2637 } 2988 }
2638 2989
@@ -3029,6 +3380,7 @@ out:
3029 if (pending_del_nr) { 3380 if (pending_del_nr) {
3030 ret = btrfs_del_items(trans, root, path, pending_del_slot, 3381 ret = btrfs_del_items(trans, root, path, pending_del_slot,
3031 pending_del_nr); 3382 pending_del_nr);
3383 BUG_ON(ret);
3032 } 3384 }
3033 btrfs_free_path(path); 3385 btrfs_free_path(path);
3034 return err; 3386 return err;
@@ -3056,11 +3408,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
3056 3408
3057 if ((offset & (blocksize - 1)) == 0) 3409 if ((offset & (blocksize - 1)) == 0)
3058 goto out; 3410 goto out;
3059 ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE); 3411 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
3060 if (ret)
3061 goto out;
3062
3063 ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
3064 if (ret) 3412 if (ret)
3065 goto out; 3413 goto out;
3066 3414
@@ -3068,8 +3416,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
3068 again: 3416 again:
3069 page = grab_cache_page(mapping, index); 3417 page = grab_cache_page(mapping, index);
3070 if (!page) { 3418 if (!page) {
3071 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); 3419 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
3072 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
3073 goto out; 3420 goto out;
3074 } 3421 }
3075 3422
@@ -3132,8 +3479,7 @@ again:
3132 3479
3133 out_unlock: 3480 out_unlock:
3134 if (ret) 3481 if (ret)
3135 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); 3482 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
3136 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
3137 unlock_page(page); 3483 unlock_page(page);
3138 page_cache_release(page); 3484 page_cache_release(page);
3139 out: 3485 out:
@@ -3145,7 +3491,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
3145 struct btrfs_trans_handle *trans; 3491 struct btrfs_trans_handle *trans;
3146 struct btrfs_root *root = BTRFS_I(inode)->root; 3492 struct btrfs_root *root = BTRFS_I(inode)->root;
3147 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 3493 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
3148 struct extent_map *em; 3494 struct extent_map *em = NULL;
3149 struct extent_state *cached_state = NULL; 3495 struct extent_state *cached_state = NULL;
3150 u64 mask = root->sectorsize - 1; 3496 u64 mask = root->sectorsize - 1;
3151 u64 hole_start = (inode->i_size + mask) & ~mask; 3497 u64 hole_start = (inode->i_size + mask) & ~mask;
@@ -3183,11 +3529,11 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
3183 u64 hint_byte = 0; 3529 u64 hint_byte = 0;
3184 hole_size = last_byte - cur_offset; 3530 hole_size = last_byte - cur_offset;
3185 3531
3186 err = btrfs_reserve_metadata_space(root, 2); 3532 trans = btrfs_start_transaction(root, 2);
3187 if (err) 3533 if (IS_ERR(trans)) {
3534 err = PTR_ERR(trans);
3188 break; 3535 break;
3189 3536 }
3190 trans = btrfs_start_transaction(root, 1);
3191 btrfs_set_trans_block_group(trans, inode); 3537 btrfs_set_trans_block_group(trans, inode);
3192 3538
3193 err = btrfs_drop_extents(trans, inode, cur_offset, 3539 err = btrfs_drop_extents(trans, inode, cur_offset,
@@ -3205,14 +3551,15 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
3205 last_byte - 1, 0); 3551 last_byte - 1, 0);
3206 3552
3207 btrfs_end_transaction(trans, root); 3553 btrfs_end_transaction(trans, root);
3208 btrfs_unreserve_metadata_space(root, 2);
3209 } 3554 }
3210 free_extent_map(em); 3555 free_extent_map(em);
3556 em = NULL;
3211 cur_offset = last_byte; 3557 cur_offset = last_byte;
3212 if (cur_offset >= block_end) 3558 if (cur_offset >= block_end)
3213 break; 3559 break;
3214 } 3560 }
3215 3561
3562 free_extent_map(em);
3216 unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state, 3563 unlock_extent_cached(io_tree, hole_start, block_end - 1, &cached_state,
3217 GFP_NOFS); 3564 GFP_NOFS);
3218 return err; 3565 return err;
@@ -3239,11 +3586,10 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr)
3239 } 3586 }
3240 } 3587 }
3241 3588
3242 ret = btrfs_reserve_metadata_space(root, 1); 3589 trans = btrfs_start_transaction(root, 5);
3243 if (ret) 3590 if (IS_ERR(trans))
3244 return ret; 3591 return PTR_ERR(trans);
3245 3592
3246 trans = btrfs_start_transaction(root, 1);
3247 btrfs_set_trans_block_group(trans, inode); 3593 btrfs_set_trans_block_group(trans, inode);
3248 3594
3249 ret = btrfs_orphan_add(trans, inode); 3595 ret = btrfs_orphan_add(trans, inode);
@@ -3251,7 +3597,6 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr)
3251 3597
3252 nr = trans->blocks_used; 3598 nr = trans->blocks_used;
3253 btrfs_end_transaction(trans, root); 3599 btrfs_end_transaction(trans, root);
3254 btrfs_unreserve_metadata_space(root, 1);
3255 btrfs_btree_balance_dirty(root, nr); 3600 btrfs_btree_balance_dirty(root, nr);
3256 3601
3257 if (attr->ia_size > inode->i_size) { 3602 if (attr->ia_size > inode->i_size) {
@@ -3264,8 +3609,11 @@ static int btrfs_setattr_size(struct inode *inode, struct iattr *attr)
3264 i_size_write(inode, attr->ia_size); 3609 i_size_write(inode, attr->ia_size);
3265 btrfs_ordered_update_i_size(inode, inode->i_size, NULL); 3610 btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
3266 3611
3267 trans = btrfs_start_transaction(root, 1); 3612 trans = btrfs_start_transaction(root, 0);
3613 BUG_ON(IS_ERR(trans));
3268 btrfs_set_trans_block_group(trans, inode); 3614 btrfs_set_trans_block_group(trans, inode);
3615 trans->block_rsv = root->orphan_block_rsv;
3616 BUG_ON(!trans->block_rsv);
3269 3617
3270 ret = btrfs_update_inode(trans, root, inode); 3618 ret = btrfs_update_inode(trans, root, inode);
3271 BUG_ON(ret); 3619 BUG_ON(ret);
@@ -3345,10 +3693,21 @@ void btrfs_delete_inode(struct inode *inode)
3345 btrfs_i_size_write(inode, 0); 3693 btrfs_i_size_write(inode, 0);
3346 3694
3347 while (1) { 3695 while (1) {
3348 trans = btrfs_start_transaction(root, 1); 3696 trans = btrfs_start_transaction(root, 0);
3697 BUG_ON(IS_ERR(trans));
3349 btrfs_set_trans_block_group(trans, inode); 3698 btrfs_set_trans_block_group(trans, inode);
3350 ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0); 3699 trans->block_rsv = root->orphan_block_rsv;
3700
3701 ret = btrfs_block_rsv_check(trans, root,
3702 root->orphan_block_rsv, 0, 5);
3703 if (ret) {
3704 BUG_ON(ret != -EAGAIN);
3705 ret = btrfs_commit_transaction(trans, root);
3706 BUG_ON(ret);
3707 continue;
3708 }
3351 3709
3710 ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
3352 if (ret != -EAGAIN) 3711 if (ret != -EAGAIN)
3353 break; 3712 break;
3354 3713
@@ -3356,6 +3715,7 @@ void btrfs_delete_inode(struct inode *inode)
3356 btrfs_end_transaction(trans, root); 3715 btrfs_end_transaction(trans, root);
3357 trans = NULL; 3716 trans = NULL;
3358 btrfs_btree_balance_dirty(root, nr); 3717 btrfs_btree_balance_dirty(root, nr);
3718
3359 } 3719 }
3360 3720
3361 if (ret == 0) { 3721 if (ret == 0) {
@@ -3596,40 +3956,10 @@ again:
3596 return 0; 3956 return 0;
3597 } 3957 }
3598 3958
3599 static noinline void init_btrfs_i(struct inode *inode)
3600 {
3601 struct btrfs_inode *bi = BTRFS_I(inode);
3602
3603 bi->generation = 0;
3604 bi->sequence = 0;
3605 bi->last_trans = 0;
3606 bi->last_sub_trans = 0;
3607 bi->logged_trans = 0;
3608 bi->delalloc_bytes = 0;
3609 bi->reserved_bytes = 0;
3610 bi->disk_i_size = 0;
3611 bi->flags = 0;
3612 bi->index_cnt = (u64)-1;
3613 bi->last_unlink_trans = 0;
3614 bi->ordered_data_close = 0;
3615 bi->force_compress = 0;
3616 extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
3617 extent_io_tree_init(&BTRFS_I(inode)->io_tree,
3618 inode->i_mapping, GFP_NOFS);
3619 extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
3620 inode->i_mapping, GFP_NOFS);
3621 INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes);
3622 INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations);
3623 RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
3624 btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
3625 mutex_init(&BTRFS_I(inode)->log_mutex);
3626 }
3627
3628 static int btrfs_init_locked_inode(struct inode *inode, void *p) 3959 static int btrfs_init_locked_inode(struct inode *inode, void *p)
3629 { 3960 {
3630 struct btrfs_iget_args *args = p; 3961 struct btrfs_iget_args *args = p;
3631 inode->i_ino = args->ino; 3962 inode->i_ino = args->ino;
3632 init_btrfs_i(inode);
3633 BTRFS_I(inode)->root = args->root; 3963 BTRFS_I(inode)->root = args->root;
3634 btrfs_set_inode_space_info(args->root, inode); 3964 btrfs_set_inode_space_info(args->root, inode);
3635 return 0; 3965 return 0;
@@ -3692,8 +4022,6 @@ static struct inode *new_simple_dir(struct super_block *s,
3692 if (!inode) 4022 if (!inode)
3693 return ERR_PTR(-ENOMEM); 4023 return ERR_PTR(-ENOMEM);
3694 4024
3695 init_btrfs_i(inode);
3696
3697 BTRFS_I(inode)->root = root; 4025 BTRFS_I(inode)->root = root;
3698 memcpy(&BTRFS_I(inode)->location, key, sizeof(*key)); 4026 memcpy(&BTRFS_I(inode)->location, key, sizeof(*key));
3699 BTRFS_I(inode)->dummy_inode = 1; 4027 BTRFS_I(inode)->dummy_inode = 1;
@@ -3950,7 +4278,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
3950 struct btrfs_trans_handle *trans; 4278 struct btrfs_trans_handle *trans;
3951 int ret = 0; 4279 int ret = 0;
3952 4280
3953 if (root->fs_info->btree_inode == inode) 4281 if (BTRFS_I(inode)->dummy_inode)
3954 return 0; 4282 return 0;
3955 4283
3956 if (wbc->sync_mode == WB_SYNC_ALL) { 4284 if (wbc->sync_mode == WB_SYNC_ALL) {
@@ -3971,10 +4299,38 @@ void btrfs_dirty_inode(struct inode *inode)
3971 { 4299 {
3972 struct btrfs_root *root = BTRFS_I(inode)->root; 4300 struct btrfs_root *root = BTRFS_I(inode)->root;
3973 struct btrfs_trans_handle *trans; 4301 struct btrfs_trans_handle *trans;
4302 int ret;
4303
4304 if (BTRFS_I(inode)->dummy_inode)
4305 return;
3974 4306
3975 trans = btrfs_join_transaction(root, 1); 4307 trans = btrfs_join_transaction(root, 1);
3976 btrfs_set_trans_block_group(trans, inode); 4308 btrfs_set_trans_block_group(trans, inode);
3977 btrfs_update_inode(trans, root, inode); 4309
4310 ret = btrfs_update_inode(trans, root, inode);
4311 if (ret && ret == -ENOSPC) {
4312 /* whoops, let's try again with the full transaction */
4313 btrfs_end_transaction(trans, root);
4314 trans = btrfs_start_transaction(root, 1);
4315 if (IS_ERR(trans)) {
4316 if (printk_ratelimit()) {
4317 printk(KERN_ERR "btrfs: failed to "
4318 "dirty inode %lu error %ld\n",
4319 inode->i_ino, PTR_ERR(trans));
4320 }
4321 return;
4322 }
4323 btrfs_set_trans_block_group(trans, inode);
4324
4325 ret = btrfs_update_inode(trans, root, inode);
4326 if (ret) {
4327 if (printk_ratelimit()) {
4328 printk(KERN_ERR "btrfs: failed to "
4329 "dirty inode %lu error %d\n",
4330 inode->i_ino, ret);
4331 }
4332 }
4333 }
3978 btrfs_end_transaction(trans, root); 4334 btrfs_end_transaction(trans, root);
3979 } 4335 }
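
btrfs_dirty_inode() now tries the cheap join-transaction path first and only restarts with a fully reserved transaction when the update fails for lack of space. A schematic of that two-step retry (the two update functions are placeholders that fake the first failure):

#include <errno.h>
#include <stdio.h>

static int update_inode_cheap(void)    { return -ENOSPC; } /* pretend it fails */
static int update_inode_reserved(void) { return 0; }

static void dirty_inode(unsigned long ino)
{
	int ret = update_inode_cheap();

	if (ret == -ENOSPC) {
		/* whoops, retry with a real reservation */
		ret = update_inode_reserved();
		if (ret)
			fprintf(stderr, "failed to dirty inode %lu: %d\n",
				ino, ret);
	}
}

int main(void)
{
	dirty_inode(42);
	return 0;
}
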
3980 4336
@@ -4092,7 +4448,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
4092 * btrfs_get_inode_index_count has an explanation for the magic 4448 * btrfs_get_inode_index_count has an explanation for the magic
4093 * number 4449 * number
4094 */ 4450 */
4095 init_btrfs_i(inode);
4096 BTRFS_I(inode)->index_cnt = 2; 4451 BTRFS_I(inode)->index_cnt = 2;
4097 BTRFS_I(inode)->root = root; 4452 BTRFS_I(inode)->root = root;
4098 BTRFS_I(inode)->generation = trans->transid; 4453 BTRFS_I(inode)->generation = trans->transid;
@@ -4247,26 +4602,21 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
4247 if (!new_valid_dev(rdev)) 4602 if (!new_valid_dev(rdev))
4248 return -EINVAL; 4603 return -EINVAL;
4249 4604
4605 err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
4606 if (err)
4607 return err;
4608
4250 /* 4609 /*
4251 * 2 for inode item and ref 4610 * 2 for inode item and ref
4252 * 2 for dir items 4611 * 2 for dir items
4253 * 1 for xattr if selinux is on 4612 * 1 for xattr if selinux is on
4254 */ 4613 */
4255 err = btrfs_reserve_metadata_space(root, 5); 4614 trans = btrfs_start_transaction(root, 5);
4256 if (err) 4615 if (IS_ERR(trans))
4257 return err; 4616 return PTR_ERR(trans);
4258 4617
4259 trans = btrfs_start_transaction(root, 1);
4260 if (!trans)
4261 goto fail;
4262 btrfs_set_trans_block_group(trans, dir); 4618 btrfs_set_trans_block_group(trans, dir);
4263 4619
4264 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
4265 if (err) {
4266 err = -ENOSPC;
4267 goto out_unlock;
4268 }
4269
4270 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 4620 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
4271 dentry->d_name.len, 4621 dentry->d_name.len,
4272 dentry->d_parent->d_inode->i_ino, objectid, 4622 dentry->d_parent->d_inode->i_ino, objectid,
@@ -4295,13 +4645,11 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
4295 out_unlock: 4645 out_unlock:
4296 nr = trans->blocks_used; 4646 nr = trans->blocks_used;
4297 btrfs_end_transaction_throttle(trans, root); 4647 btrfs_end_transaction_throttle(trans, root);
4298 fail: 4648 btrfs_btree_balance_dirty(root, nr);
4299 btrfs_unreserve_metadata_space(root, 5);
4300 if (drop_inode) { 4649 if (drop_inode) {
4301 inode_dec_link_count(inode); 4650 inode_dec_link_count(inode);
4302 iput(inode); 4651 iput(inode);
4303 } 4652 }
4304 btrfs_btree_balance_dirty(root, nr);
4305 return err; 4653 return err;
4306 } 4654 }
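
The counts handed to btrfs_start_transaction() come straight from the comment above: one unit per metadata item the operation may touch, reserved up front so mknod cannot die of ENOSPC halfway through. Making that arithmetic explicit (the enum is purely illustrative, not a kernel type):

#include <stdio.h>

enum {
	ITEMS_INODE_AND_REF = 2, /* inode item + inode ref */
	ITEMS_DIR_ENTRIES   = 2, /* dir item + dir index item */
	ITEMS_SELINUX_XATTR = 1, /* security xattr, if selinux is on */
};

int main(void)
{
	/* matches the "5" passed to btrfs_start_transaction() above */
	printf("reserve %d items for mknod\n",
	       ITEMS_INODE_AND_REF + ITEMS_DIR_ENTRIES + ITEMS_SELINUX_XATTR);
	return 0;
}
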
4307 4655
@@ -4311,32 +4659,26 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
4311 struct btrfs_trans_handle *trans; 4659 struct btrfs_trans_handle *trans;
4312 struct btrfs_root *root = BTRFS_I(dir)->root; 4660 struct btrfs_root *root = BTRFS_I(dir)->root;
4313 struct inode *inode = NULL; 4661 struct inode *inode = NULL;
4314 int err;
4315 int drop_inode = 0; 4662 int drop_inode = 0;
4663 int err;
4316 unsigned long nr = 0; 4664 unsigned long nr = 0;
4317 u64 objectid; 4665 u64 objectid;
4318 u64 index = 0; 4666 u64 index = 0;
4319 4667
4668 err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
4669 if (err)
4670 return err;
4320 /* 4671 /*
4321 * 2 for inode item and ref 4672 * 2 for inode item and ref
4322 * 2 for dir items 4673 * 2 for dir items
4323 * 1 for xattr if selinux is on 4674 * 1 for xattr if selinux is on
4324 */ 4675 */
4325 err = btrfs_reserve_metadata_space(root, 5); 4676 trans = btrfs_start_transaction(root, 5);
4326 if (err) 4677 if (IS_ERR(trans))
4327 return err; 4678 return PTR_ERR(trans);
4328 4679
4329 trans = btrfs_start_transaction(root, 1);
4330 if (!trans)
4331 goto fail;
4332 btrfs_set_trans_block_group(trans, dir); 4680 btrfs_set_trans_block_group(trans, dir);
4333 4681
4334 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
4335 if (err) {
4336 err = -ENOSPC;
4337 goto out_unlock;
4338 }
4339
4340 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 4682 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
4341 dentry->d_name.len, 4683 dentry->d_name.len,
4342 dentry->d_parent->d_inode->i_ino, 4684 dentry->d_parent->d_inode->i_ino,
@@ -4368,8 +4710,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
4368 out_unlock: 4710 out_unlock:
4369 nr = trans->blocks_used; 4711 nr = trans->blocks_used;
4370 btrfs_end_transaction_throttle(trans, root); 4712 btrfs_end_transaction_throttle(trans, root);
4371 fail:
4372 btrfs_unreserve_metadata_space(root, 5);
4373 if (drop_inode) { 4713 if (drop_inode) {
4374 inode_dec_link_count(inode); 4714 inode_dec_link_count(inode);
4375 iput(inode); 4715 iput(inode);
@@ -4396,21 +4736,21 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
4396 if (root->objectid != BTRFS_I(inode)->root->objectid) 4736 if (root->objectid != BTRFS_I(inode)->root->objectid)
4397 return -EPERM; 4737 return -EPERM;
4398 4738
4399 /*
4400 * 1 item for inode ref
4401 * 2 items for dir items
4402 */
4403 err = btrfs_reserve_metadata_space(root, 3);
4404 if (err)
4405 return err;
4406
4407 btrfs_inc_nlink(inode); 4739 btrfs_inc_nlink(inode);
4408 4740
4409 err = btrfs_set_inode_index(dir, &index); 4741 err = btrfs_set_inode_index(dir, &index);
4410 if (err) 4742 if (err)
4411 goto fail; 4743 goto fail;
4412 4744
4413 trans = btrfs_start_transaction(root, 1); 4745 /*
4746 * 1 item for inode ref
4747 * 2 items for dir items
4748 */
4749 trans = btrfs_start_transaction(root, 3);
4750 if (IS_ERR(trans)) {
4751 err = PTR_ERR(trans);
4752 goto fail;
4753 }
4414 4754
4415 btrfs_set_trans_block_group(trans, dir); 4755 btrfs_set_trans_block_group(trans, dir);
4416 atomic_inc(&inode->i_count); 4756 atomic_inc(&inode->i_count);
@@ -4429,7 +4769,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
4429 nr = trans->blocks_used; 4769 nr = trans->blocks_used;
4430 btrfs_end_transaction_throttle(trans, root); 4770 btrfs_end_transaction_throttle(trans, root);
4431 fail: 4771 fail:
4432 btrfs_unreserve_metadata_space(root, 3);
4433 if (drop_inode) { 4772 if (drop_inode) {
4434 inode_dec_link_count(inode); 4773 inode_dec_link_count(inode);
4435 iput(inode); 4774 iput(inode);
@@ -4449,28 +4788,20 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
4449 u64 index = 0; 4788 u64 index = 0;
4450 unsigned long nr = 1; 4789 unsigned long nr = 1;
4451 4790
4791 err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
4792 if (err)
4793 return err;
4794
4452 /* 4795 /*
4453 * 2 items for inode and ref 4796 * 2 items for inode and ref
4454 * 2 items for dir items 4797 * 2 items for dir items
4455 * 1 for xattr if selinux is on 4798 * 1 for xattr if selinux is on
4456 */ 4799 */
4457 err = btrfs_reserve_metadata_space(root, 5); 4800 trans = btrfs_start_transaction(root, 5);
4458 if (err) 4801 if (IS_ERR(trans))
4459 return err; 4802 return PTR_ERR(trans);
4460
4461 trans = btrfs_start_transaction(root, 1);
4462 if (!trans) {
4463 err = -ENOMEM;
4464 goto out_unlock;
4465 }
4466 btrfs_set_trans_block_group(trans, dir); 4803 btrfs_set_trans_block_group(trans, dir);
4467 4804
4468 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
4469 if (err) {
4470 err = -ENOSPC;
4471 goto out_fail;
4472 }
4473
4474 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 4805 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
4475 dentry->d_name.len, 4806 dentry->d_name.len,
4476 dentry->d_parent->d_inode->i_ino, objectid, 4807 dentry->d_parent->d_inode->i_ino, objectid,
@@ -4510,9 +4841,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
4510 out_fail: 4841 out_fail:
4511 nr = trans->blocks_used; 4842 nr = trans->blocks_used;
4512 btrfs_end_transaction_throttle(trans, root); 4843 btrfs_end_transaction_throttle(trans, root);
4513
4514 out_unlock:
4515 btrfs_unreserve_metadata_space(root, 5);
4516 if (drop_on_err) 4844 if (drop_on_err)
4517 iput(inode); 4845 iput(inode);
4518 btrfs_btree_balance_dirty(root, nr); 4846 btrfs_btree_balance_dirty(root, nr);
@@ -4770,6 +5098,7 @@ again:
4770 } 5098 }
4771 flush_dcache_page(page); 5099 flush_dcache_page(page);
4772 } else if (create && PageUptodate(page)) { 5100 } else if (create && PageUptodate(page)) {
5101 WARN_ON(1);
4773 if (!trans) { 5102 if (!trans) {
4774 kunmap(page); 5103 kunmap(page);
4775 free_extent_map(em); 5104 free_extent_map(em);
@@ -4866,11 +5195,651 @@ out:
4866 return em; 5195 return em;
4867 } 5196 }
4868 5197
5198 static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
5199 u64 start, u64 len)
5200 {
5201 struct btrfs_root *root = BTRFS_I(inode)->root;
5202 struct btrfs_trans_handle *trans;
5203 struct extent_map *em;
5204 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
5205 struct btrfs_key ins;
5206 u64 alloc_hint;
5207 int ret;
5208
5209 btrfs_drop_extent_cache(inode, start, start + len - 1, 0);
5210
5211 trans = btrfs_join_transaction(root, 0);
5212 if (!trans)
5213 return ERR_PTR(-ENOMEM);
5214
5215 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
5216
5217 alloc_hint = get_extent_allocation_hint(inode, start, len);
5218 ret = btrfs_reserve_extent(trans, root, len, root->sectorsize, 0,
5219 alloc_hint, (u64)-1, &ins, 1);
5220 if (ret) {
5221 em = ERR_PTR(ret);
5222 goto out;
5223 }
5224
5225 em = alloc_extent_map(GFP_NOFS);
5226 if (!em) {
5227 em = ERR_PTR(-ENOMEM);
5228 goto out;
5229 }
5230
5231 em->start = start;
5232 em->orig_start = em->start;
5233 em->len = ins.offset;
5234
5235 em->block_start = ins.objectid;
5236 em->block_len = ins.offset;
5237 em->bdev = root->fs_info->fs_devices->latest_bdev;
5238 set_bit(EXTENT_FLAG_PINNED, &em->flags);
5239
5240 while (1) {
5241 write_lock(&em_tree->lock);
5242 ret = add_extent_mapping(em_tree, em);
5243 write_unlock(&em_tree->lock);
5244 if (ret != -EEXIST)
5245 break;
5246 btrfs_drop_extent_cache(inode, start, start + em->len - 1, 0);
5247 }
5248
5249 ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
5250 ins.offset, ins.offset, 0);
5251 if (ret) {
5252 btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
5253 em = ERR_PTR(ret);
5254 }
5255 out:
5256 btrfs_end_transaction(trans, root);
5257 return em;
5258 }
5259
5260 /*
5261 * returns 1 when the nocow is safe, < 0 on error, 0 if the
5262 * block must be cow'd
5263 */
5264 static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,
5265 struct inode *inode, u64 offset, u64 len)
5266 {
5267 struct btrfs_path *path;
5268 int ret;
5269 struct extent_buffer *leaf;
5270 struct btrfs_root *root = BTRFS_I(inode)->root;
5271 struct btrfs_file_extent_item *fi;
5272 struct btrfs_key key;
5273 u64 disk_bytenr;
5274 u64 backref_offset;
5275 u64 extent_end;
5276 u64 num_bytes;
5277 int slot;
5278 int found_type;
5279
5280 path = btrfs_alloc_path();
5281 if (!path)
5282 return -ENOMEM;
5283
5284 ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
5285 offset, 0);
5286 if (ret < 0)
5287 goto out;
5288
5289 slot = path->slots[0];
5290 if (ret == 1) {
5291 if (slot == 0) {
5292 /* can't find the item, must cow */
5293 ret = 0;
5294 goto out;
5295 }
5296 slot--;
5297 }
5298 ret = 0;
5299 leaf = path->nodes[0];
5300 btrfs_item_key_to_cpu(leaf, &key, slot);
5301 if (key.objectid != inode->i_ino ||
5302 key.type != BTRFS_EXTENT_DATA_KEY) {
5303 /* not our file or wrong item type, must cow */
5304 goto out;
5305 }
5306
5307 if (key.offset > offset) {
5308 /* Wrong offset, must cow */
5309 goto out;
5310 }
5311
5312 fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
5313 found_type = btrfs_file_extent_type(leaf, fi);
5314 if (found_type != BTRFS_FILE_EXTENT_REG &&
5315 found_type != BTRFS_FILE_EXTENT_PREALLOC) {
5316 /* not a regular extent, must cow */
5317 goto out;
5318 }
5319 disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
5320 backref_offset = btrfs_file_extent_offset(leaf, fi);
5321
5322 extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
5323 if (extent_end < offset + len) {
5324 /* extent doesn't include our full range, must cow */
5325 goto out;
5326 }
5327
5328 if (btrfs_extent_readonly(root, disk_bytenr))
5329 goto out;
5330
5331 /*
5332 * look for other files referencing this extent, if we
5333 * find any we must cow
5334 */
5335 if (btrfs_cross_ref_exist(trans, root, inode->i_ino,
5336 key.offset - backref_offset, disk_bytenr))
5337 goto out;
5338
5339 /*
5340 * adjust disk_bytenr and num_bytes to cover just the bytes
5341 * in this extent we are about to write. If there
5342 * are any csums in that range we have to cow in order
5343 * to keep the csums correct
5344 */
5345 disk_bytenr += backref_offset;
5346 disk_bytenr += offset - key.offset;
5347 num_bytes = min(offset + len, extent_end) - offset;
5348 if (csum_exist_in_range(root, disk_bytenr, num_bytes))
5349 goto out;
5350 /*
5351 * all of the above have passed, it is safe to overwrite this extent
5352 * without cow
5353 */
5354 ret = 1;
5355 out:
5356 btrfs_free_path(path);
5357 return ret;
5358 }
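
The heart of can_nocow_odirect() is range arithmetic: the file extent found in the tree must start at or before the write and end at or after it, and the disk target of the write is the extent's disk bytenr shifted by the offset into the extent. A simplified stand-alone model of that math (the struct only approximates the btrfs file extent item):

#include <stdint.h>
#include <stdio.h>

struct file_extent {
	uint64_t key_offset;     /* file offset where the item starts */
	uint64_t num_bytes;      /* logical length */
	uint64_t disk_bytenr;    /* where the data lives on disk */
	uint64_t backref_offset; /* offset into the on-disk extent */
};

/*
 * The write [offset, offset + len) may reuse the extent only if the
 * extent fully covers it; *where then receives the disk address of
 * the first byte actually overwritten.
 */
static int covers(const struct file_extent *fe, uint64_t offset,
		  uint64_t len, uint64_t *where)
{
	uint64_t extent_end = fe->key_offset + fe->num_bytes;

	if (fe->key_offset > offset || extent_end < offset + len)
		return 0; /* partial cover: must cow */
	*where = fe->disk_bytenr + fe->backref_offset +
		 (offset - fe->key_offset);
	return 1;
}

int main(void)
{
	struct file_extent fe = { 4096, 16384, 1 << 20, 0 };
	uint64_t where;

	if (covers(&fe, 8192, 4096, &where))
		printf("nocow write lands at disk bytenr %llu\n",
		       (unsigned long long)where);
	return 0;
}
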
5359
5360 static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
5361 struct buffer_head *bh_result, int create)
5362 {
5363 struct extent_map *em;
5364 struct btrfs_root *root = BTRFS_I(inode)->root;
5365 u64 start = iblock << inode->i_blkbits;
5366 u64 len = bh_result->b_size;
5367 struct btrfs_trans_handle *trans;
5368
5369 em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
5370 if (IS_ERR(em))
5371 return PTR_ERR(em);
5372
5373 /*
5374 * Ok, for INLINE and COMPRESSED extents we need to fall back on buffered
5375 * io. INLINE is special, and we could probably kludge it in here, but
5376 * it's still buffered so for safety let's just fall back to the generic
5377 * buffered path.
5378 *
5379 * For COMPRESSED we _have_ to read the entire extent in so we can
5380 * decompress it, so there will be buffering required no matter what we
5381 * do, so go ahead and fall back to buffered.
5382 *
5383 * We return -ENOTBLK because that's what makes DIO go ahead and go back
5384 * to buffered IO. Don't blame me, this is the price we pay for using
5385 * the generic code.
5386 */
5387 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
5388 em->block_start == EXTENT_MAP_INLINE) {
5389 free_extent_map(em);
5390 return -ENOTBLK;
5391 }
5392
5393 /* Just a good old-fashioned hole, return */
5394 if (!create && (em->block_start == EXTENT_MAP_HOLE ||
5395 test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
5396 free_extent_map(em);
5397 /* DIO will do one hole at a time, so just unlock a sector */
5398 unlock_extent(&BTRFS_I(inode)->io_tree, start,
5399 start + root->sectorsize - 1, GFP_NOFS);
5400 return 0;
5401 }
5402
5403 /*
5404 * We don't allocate a new extent in the following cases
5405 *
5406 * 1) The inode is marked as NODATACOW. In this case we'll just use the
5407 * existing extent.
5408 * 2) The extent is marked as PREALLOC. We're good to go here and can
5409 * just use the extent.
5410 *
5411 */
5412 if (!create) {
5413 len = em->len - (start - em->start);
5414 goto map;
5415 }
5416
5417 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) ||
5418 ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) &&
5419 em->block_start != EXTENT_MAP_HOLE)) {
5420 int type;
5421 int ret;
5422 u64 block_start;
5423
5424 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
5425 type = BTRFS_ORDERED_PREALLOC;
5426 else
5427 type = BTRFS_ORDERED_NOCOW;
5428 len = min(len, em->len - (start - em->start));
5429 block_start = em->block_start + (start - em->start);
5430
5431 /*
5432 * we're not going to log anything, but we do need
5433 * to make sure the current transaction stays open
5434 * while we look for nocow cross refs
5435 */
5436 trans = btrfs_join_transaction(root, 0);
5437 if (!trans)
5438 goto must_cow;
5439
5440 if (can_nocow_odirect(trans, inode, start, len) == 1) {
5441 ret = btrfs_add_ordered_extent_dio(inode, start,
5442 block_start, len, len, type);
5443 btrfs_end_transaction(trans, root);
5444 if (ret) {
5445 free_extent_map(em);
5446 return ret;
5447 }
5448 goto unlock;
5449 }
5450 btrfs_end_transaction(trans, root);
5451 }
5452 must_cow:
5453 /*
5454 * this will cow the extent, reset the len in case we changed
5455 * it above
5456 */
5457 len = bh_result->b_size;
5458 free_extent_map(em);
5459 em = btrfs_new_extent_direct(inode, start, len);
5460 if (IS_ERR(em))
5461 return PTR_ERR(em);
5462 len = min(len, em->len - (start - em->start));
5463 unlock:
5464 clear_extent_bit(&BTRFS_I(inode)->io_tree, start, start + len - 1,
5465 EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DIRTY, 1,
5466 0, NULL, GFP_NOFS);
5467 map:
5468 bh_result->b_blocknr = (em->block_start + (start - em->start)) >>
5469 inode->i_blkbits;
5470 bh_result->b_size = len;
5471 bh_result->b_bdev = em->bdev;
5472 set_buffer_mapped(bh_result);
5473 if (create && !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
5474 set_buffer_new(bh_result);
5475
5476 free_extent_map(em);
5477
5478 return 0;
5479 }
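
At the map: label above, an extent-map hit is converted into the buffer_head fields that __blockdev_direct_IO() consumes: b_blocknr is the on-disk byte address of the requested file position scaled down to block units. The same arithmetic in isolation (toy struct; 4K blocks, i.e. blkbits = 12, assumed):

#include <stdint.h>
#include <stdio.h>

struct toy_extent_map { uint64_t start, len, block_start; };

static uint64_t map_to_blocknr(const struct toy_extent_map *em,
			       uint64_t file_pos, unsigned int blkbits)
{
	/* disk byte address of file_pos, then scaled to block units */
	return (em->block_start + (file_pos - em->start)) >> blkbits;
}

int main(void)
{
	struct toy_extent_map em = { .start = 0x10000, .len = 0x8000,
				     .block_start = 0x400000 };

	printf("b_blocknr = %llu\n",
	       (unsigned long long)map_to_blocknr(&em, 0x12000, 12));
	return 0;
}
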
5480
5481 struct btrfs_dio_private {
5482 struct inode *inode;
5483 u64 logical_offset;
5484 u64 disk_bytenr;
5485 u64 bytes;
5486 u32 *csums;
5487 void *private;
5488 };
5489
5490 static void btrfs_endio_direct_read(struct bio *bio, int err)
5491 {
5492 struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
5493 struct bio_vec *bvec = bio->bi_io_vec;
5494 struct btrfs_dio_private *dip = bio->bi_private;
5495 struct inode *inode = dip->inode;
5496 struct btrfs_root *root = BTRFS_I(inode)->root;
5497 u64 start;
5498 u32 *private = dip->csums;
5499
5500 start = dip->logical_offset;
5501 do {
5502 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
5503 struct page *page = bvec->bv_page;
5504 char *kaddr;
5505 u32 csum = ~(u32)0;
5506 unsigned long flags;
5507
5508 local_irq_save(flags);
5509 kaddr = kmap_atomic(page, KM_IRQ0);
5510 csum = btrfs_csum_data(root, kaddr + bvec->bv_offset,
5511 csum, bvec->bv_len);
5512 btrfs_csum_final(csum, (char *)&csum);
5513 kunmap_atomic(kaddr, KM_IRQ0);
5514 local_irq_restore(flags);
5515
5516 flush_dcache_page(bvec->bv_page);
5517 if (csum != *private) {
5518 printk(KERN_ERR "btrfs csum failed ino %lu off"
5519 " %llu csum %u private %u\n",
5520 inode->i_ino, (unsigned long long)start,
5521 csum, *private);
5522 err = -EIO;
5523 }
5524 }
5525
5526 start += bvec->bv_len;
5527 private++;
5528 bvec++;
5529 } while (bvec <= bvec_end);
5530
5531 unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
5532 dip->logical_offset + dip->bytes - 1, GFP_NOFS);
5533 bio->bi_private = dip->private;
5534
5535 kfree(dip->csums);
5536 kfree(dip);
5537 dio_end_io(bio, err);
5538 }
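
btrfs_endio_direct_read() walks every segment of the completed bio, recomputes the data checksum, and compares it with the value looked up at submit time, turning any mismatch into -EIO. A user-space sketch of that verification walk, with a toy rotate-xor checksum standing in for btrfs_csum_data()/crc32c:

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

struct segment { const uint8_t *buf; size_t len; };

static uint32_t toy_csum(const uint8_t *p, size_t len)
{
	uint32_t c = ~0u;

	while (len--)
		c = (c << 1 | c >> 31) ^ *p++;
	return ~c;
}

static int verify_segments(const struct segment *seg, int nr,
			   const uint32_t *expected)
{
	int err = 0;

	for (int i = 0; i < nr; i++) {
		uint32_t c = toy_csum(seg[i].buf, seg[i].len);

		if (c != expected[i]) {
			fprintf(stderr, "csum failed seg %d: %u vs %u\n",
				i, c, expected[i]);
			err = -EIO; /* keep checking the rest, like the bio loop */
		}
	}
	return err;
}

int main(void)
{
	uint8_t data[16] = "direct io data!";
	struct segment seg[1] = { { data, sizeof(data) } };
	uint32_t good[1] = { toy_csum(data, sizeof(data)) };

	return verify_segments(seg, 1, good) ? 1 : 0;
}
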
5539
5540 static void btrfs_endio_direct_write(struct bio *bio, int err)
5541 {
5542 struct btrfs_dio_private *dip = bio->bi_private;
5543 struct inode *inode = dip->inode;
5544 struct btrfs_root *root = BTRFS_I(inode)->root;
5545 struct btrfs_trans_handle *trans;
5546 struct btrfs_ordered_extent *ordered = NULL;
5547 struct extent_state *cached_state = NULL;
5548 int ret;
5549
5550 if (err)
5551 goto out_done;
5552
5553 ret = btrfs_dec_test_ordered_pending(inode, &ordered,
5554 dip->logical_offset, dip->bytes);
5555 if (!ret)
5556 goto out_done;
5557
5558 BUG_ON(!ordered);
5559
5560 trans = btrfs_join_transaction(root, 1);
5561 if (!trans) {
5562 err = -ENOMEM;
5563 goto out;
5564 }
5565 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
5566
5567 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
5568 ret = btrfs_ordered_update_i_size(inode, 0, ordered);
5569 if (!ret)
5570 ret = btrfs_update_inode(trans, root, inode);
5571 err = ret;
5572 goto out;
5573 }
5574
5575 lock_extent_bits(&BTRFS_I(inode)->io_tree, ordered->file_offset,
5576 ordered->file_offset + ordered->len - 1, 0,
5577 &cached_state, GFP_NOFS);
5578
5579 if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) {
5580 ret = btrfs_mark_extent_written(trans, inode,
5581 ordered->file_offset,
5582 ordered->file_offset +
5583 ordered->len);
5584 if (ret) {
5585 err = ret;
5586 goto out_unlock;
5587 }
5588 } else {
5589 ret = insert_reserved_file_extent(trans, inode,
5590 ordered->file_offset,
5591 ordered->start,
5592 ordered->disk_len,
5593 ordered->len,
5594 ordered->len,
5595 0, 0, 0,
5596 BTRFS_FILE_EXTENT_REG);
5597 unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
5598 ordered->file_offset, ordered->len);
5599 if (ret) {
5600 err = ret;
5601 WARN_ON(1);
5602 goto out_unlock;
5603 }
5604 }
5605
5606 add_pending_csums(trans, inode, ordered->file_offset, &ordered->list);
5607 btrfs_ordered_update_i_size(inode, 0, ordered);
5608 btrfs_update_inode(trans, root, inode);
5609 out_unlock:
5610 unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset,
5611 ordered->file_offset + ordered->len - 1,
5612 &cached_state, GFP_NOFS);
5613 out:
5614 btrfs_delalloc_release_metadata(inode, ordered->len);
5615 btrfs_end_transaction(trans, root);
5616 btrfs_put_ordered_extent(ordered);
5617 btrfs_put_ordered_extent(ordered);
5618 out_done:
5619 bio->bi_private = dip->private;
5620
5621 kfree(dip->csums);
5622 kfree(dip);
5623 dio_end_io(bio, err);
5624 }
5625
5626 static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw,
5627 struct bio *bio, int mirror_num,
5628 unsigned long bio_flags, u64 offset)
5629 {
5630 int ret;
5631 struct btrfs_root *root = BTRFS_I(inode)->root;
5632 ret = btrfs_csum_one_bio(root, inode, bio, offset, 1);
5633 BUG_ON(ret);
5634 return 0;
5635 }
5636
5637 static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
5638 loff_t file_offset)
5639 {
5640 struct btrfs_root *root = BTRFS_I(inode)->root;
5641 struct btrfs_dio_private *dip;
5642 struct bio_vec *bvec = bio->bi_io_vec;
5643 u64 start;
5644 int skip_sum;
5645 int write = rw & (1 << BIO_RW);
5646 int ret = 0;
5647
5648 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
5649
5650 dip = kmalloc(sizeof(*dip), GFP_NOFS);
5651 if (!dip) {
5652 ret = -ENOMEM;
5653 goto free_ordered;
5654 }
5655 dip->csums = NULL;
5656
5657 if (!skip_sum) {
5658 dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS);
5659 if (!dip->csums) {
5660 ret = -ENOMEM;
5661 goto free_ordered;
5662 }
5663 }
5664
5665 dip->private = bio->bi_private;
5666 dip->inode = inode;
5667 dip->logical_offset = file_offset;
5668
5669 start = dip->logical_offset;
5670 dip->bytes = 0;
5671 do {
5672 dip->bytes += bvec->bv_len;
5673 bvec++;
5674 } while (bvec <= (bio->bi_io_vec + bio->bi_vcnt - 1));
5675
5676 dip->disk_bytenr = (u64)bio->bi_sector << 9;
5677 bio->bi_private = dip;
5678
5679 if (write)
5680 bio->bi_end_io = btrfs_endio_direct_write;
5681 else
5682 bio->bi_end_io = btrfs_endio_direct_read;
5683
5684 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
5685 if (ret)
5686 goto out_err;
5687
5688 if (write && !skip_sum) {
5689 ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
5690 inode, rw, bio, 0, 0,
5691 dip->logical_offset,
5692 __btrfs_submit_bio_start_direct_io,
5693 __btrfs_submit_bio_done);
5694 if (ret)
5695 goto out_err;
5696 return;
5697 } else if (!skip_sum)
5698 btrfs_lookup_bio_sums_dio(root, inode, bio,
5699 dip->logical_offset, dip->csums);
5700
5701 ret = btrfs_map_bio(root, rw, bio, 0, 1);
5702 if (ret)
5703 goto out_err;
5704 return;
5705 out_err:
5706 kfree(dip->csums);
5707 kfree(dip);
5708 free_ordered:
5709 /*
5710 * If this is a write, we need to clean up the reserved space and kill
5711 * the ordered extent.
5712 */
5713 if (write) {
5714 struct btrfs_ordered_extent *ordered;
5715 ordered = btrfs_lookup_ordered_extent(inode,
5716 dip->logical_offset);
5717 if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) &&
5718 !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
5719 btrfs_free_reserved_extent(root, ordered->start,
5720 ordered->disk_len);
5721 btrfs_put_ordered_extent(ordered);
5722 btrfs_put_ordered_extent(ordered);
5723 }
5724 bio_endio(bio, ret);
5725 }
5726
5727 static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *iocb,
5728 const struct iovec *iov, loff_t offset,
5729 unsigned long nr_segs)
5730 {
5731 int seg;
5732 size_t size;
5733 unsigned long addr;
5734 unsigned blocksize_mask = root->sectorsize - 1;
5735 ssize_t retval = -EINVAL;
5736 loff_t end = offset;
5737
5738 if (offset & blocksize_mask)
5739 goto out;
5740
5741 /* Check the memory alignment. Blocks cannot straddle pages */
5742 for (seg = 0; seg < nr_segs; seg++) {
5743 addr = (unsigned long)iov[seg].iov_base;
5744 size = iov[seg].iov_len;
5745 end += size;
5746 if ((addr & blocksize_mask) || (size & blocksize_mask))
5747 goto out;
5748 }
5749 retval = 0;
5750 out:
5751 return retval;
5752 }
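
check_direct_IO() enforces the O_DIRECT contract: the file offset and every iovec base address and length must be sector-aligned, so no block can straddle a page. The same test as a stand-alone function (sectorsize assumed to be a power of two):

#include <stdio.h>
#include <sys/uio.h>

static int dio_aligned(unsigned int sectorsize, long long offset,
		       const struct iovec *iov, unsigned long nr_segs)
{
	unsigned long mask = sectorsize - 1;

	if (offset & mask)
		return 0;
	for (unsigned long seg = 0; seg < nr_segs; seg++) {
		if (((unsigned long)iov[seg].iov_base & mask) ||
		    (iov[seg].iov_len & mask))
			return 0;
	}
	return 1;
}

int main(void)
{
	char buf[8192] __attribute__((aligned(4096)));
	struct iovec iov = { buf, 4096 };

	printf("aligned: %d\n", dio_aligned(4096, 0, &iov, 1));
	printf("misaligned: %d\n", dio_aligned(4096, 512, &iov, 1));
	return 0;
}
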
4869 static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, 5753 static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
4870 const struct iovec *iov, loff_t offset, 5754 const struct iovec *iov, loff_t offset,
4871 unsigned long nr_segs) 5755 unsigned long nr_segs)
4872 { 5756 {
4873 return -EINVAL; 5757 struct file *file = iocb->ki_filp;
5758 struct inode *inode = file->f_mapping->host;
5759 struct btrfs_ordered_extent *ordered;
5760 struct extent_state *cached_state = NULL;
5761 u64 lockstart, lockend;
5762 ssize_t ret;
5763 int writing = rw & WRITE;
5764 int write_bits = 0;
5765 size_t count = iov_length(iov, nr_segs);
5766
5767 if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov,
5768 offset, nr_segs)) {
5769 return 0;
5770 }
5771
5772 lockstart = offset;
5773 lockend = offset + count - 1;
5774
5775 if (writing) {
5776 ret = btrfs_delalloc_reserve_space(inode, count);
5777 if (ret)
5778 goto out;
5779 }
5780
5781 while (1) {
5782 lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
5783 0, &cached_state, GFP_NOFS);
5784 /*
5785 * We're concerned with the entire range that we're going to be
5786 * doing DIO to, so we need to make sure there are no ordered
5787 * extents in this range.
5788 */
5789 ordered = btrfs_lookup_ordered_range(inode, lockstart,
5790 lockend - lockstart + 1);
5791 if (!ordered)
5792 break;
5793 unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
5794 &cached_state, GFP_NOFS);
5795 btrfs_start_ordered_extent(inode, ordered, 1);
5796 btrfs_put_ordered_extent(ordered);
5797 cond_resched();
5798 }
5799
5800 /*
5801 * we don't use btrfs_set_extent_delalloc because we don't want
5802 * the dirty or uptodate bits
5803 */
5804 if (writing) {
5805 write_bits = EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING;
5806 ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend,
5807 EXTENT_DELALLOC, 0, NULL, &cached_state,
5808 GFP_NOFS);
5809 if (ret) {
5810 clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
5811 lockend, EXTENT_LOCKED | write_bits,
5812 1, 0, &cached_state, GFP_NOFS);
5813 goto out;
5814 }
5815 }
5816
5817 free_extent_state(cached_state);
5818 cached_state = NULL;
5819
5820 ret = __blockdev_direct_IO(rw, iocb, inode,
5821 BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev,
5822 iov, offset, nr_segs, btrfs_get_blocks_direct, NULL,
5823 btrfs_submit_direct, 0);
5824
5825 if (ret < 0 && ret != -EIOCBQUEUED) {
5826 clear_extent_bit(&BTRFS_I(inode)->io_tree, offset,
5827 offset + iov_length(iov, nr_segs) - 1,
5828 EXTENT_LOCKED | write_bits, 1, 0,
5829 &cached_state, GFP_NOFS);
5830 } else if (ret >= 0 && ret < iov_length(iov, nr_segs)) {
5831 /*
5832 * We're falling back to buffered, unlock the section we didn't
5833 * do IO on.
5834 */
5835 clear_extent_bit(&BTRFS_I(inode)->io_tree, offset + ret,
5836 offset + iov_length(iov, nr_segs) - 1,
5837 EXTENT_LOCKED | write_bits, 1, 0,
5838 &cached_state, GFP_NOFS);
5839 }
5840 out:
5841 free_extent_state(cached_state);
5842 return ret;
4874 } 5843 }
4875 5844
4876 static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 5845 static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
@@ -5034,7 +6003,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
5034 u64 page_start; 6003 u64 page_start;
5035 u64 page_end; 6004 u64 page_end;
5036 6005
5037 ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE); 6006 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
5038 if (ret) { 6007 if (ret) {
5039 if (ret == -ENOMEM) 6008 if (ret == -ENOMEM)
5040 ret = VM_FAULT_OOM; 6009 ret = VM_FAULT_OOM;
@@ -5043,13 +6012,6 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
5043 goto out; 6012 goto out;
5044 } 6013 }
5045 6014
5046 ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
5047 if (ret) {
5048 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
5049 ret = VM_FAULT_SIGBUS;
5050 goto out;
5051 }
5052
5053 ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */ 6015 ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
5054 again: 6016 again:
5055 lock_page(page); 6017 lock_page(page);
@@ -5059,7 +6021,6 @@ again:
5059 6021
5060 if ((page->mapping != inode->i_mapping) || 6022 if ((page->mapping != inode->i_mapping) ||
5061 (page_start >= size)) { 6023 (page_start >= size)) {
5062 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
5063 /* page got truncated out from underneath us */ 6024 /* page got truncated out from underneath us */
5064 goto out_unlock; 6025 goto out_unlock;
5065 } 6026 }
@@ -5100,7 +6061,6 @@ again:
5100 unlock_extent_cached(io_tree, page_start, page_end, 6061 unlock_extent_cached(io_tree, page_start, page_end,
5101 &cached_state, GFP_NOFS); 6062 &cached_state, GFP_NOFS);
5102 ret = VM_FAULT_SIGBUS; 6063 ret = VM_FAULT_SIGBUS;
5103 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
5104 goto out_unlock; 6064 goto out_unlock;
5105 } 6065 }
5106 ret = 0; 6066 ret = 0;
@@ -5127,10 +6087,10 @@ again:
5127 unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS); 6087 unlock_extent_cached(io_tree, page_start, page_end, &cached_state, GFP_NOFS);
5128 6088
5129 out_unlock: 6089 out_unlock:
5130 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
5131 if (!ret) 6090 if (!ret)
5132 return VM_FAULT_LOCKED; 6091 return VM_FAULT_LOCKED;
5133 unlock_page(page); 6092 unlock_page(page);
6093 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
5134 out: 6094 out:
5135 return ret; 6095 return ret;
5136 } 6096 }
@@ -5155,8 +6115,10 @@ static void btrfs_truncate(struct inode *inode)
5155 btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); 6115 btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
5156 btrfs_ordered_update_i_size(inode, inode->i_size, NULL); 6116 btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
5157 6117
5158 trans = btrfs_start_transaction(root, 1); 6118 trans = btrfs_start_transaction(root, 0);
6119 BUG_ON(IS_ERR(trans));
5159 btrfs_set_trans_block_group(trans, inode); 6120 btrfs_set_trans_block_group(trans, inode);
6121 trans->block_rsv = root->orphan_block_rsv;
5160 6122
5161 /* 6123 /*
5162 * setattr is responsible for setting the ordered_data_close flag, 6124 * setattr is responsible for setting the ordered_data_close flag,
@@ -5179,6 +6141,23 @@ static void btrfs_truncate(struct inode *inode)
5179 btrfs_add_ordered_operation(trans, root, inode); 6141 btrfs_add_ordered_operation(trans, root, inode);
5180 6142
5181 while (1) { 6143 while (1) {
6144 if (!trans) {
6145 trans = btrfs_start_transaction(root, 0);
6146 BUG_ON(IS_ERR(trans));
6147 btrfs_set_trans_block_group(trans, inode);
6148 trans->block_rsv = root->orphan_block_rsv;
6149 }
6150
6151 ret = btrfs_block_rsv_check(trans, root,
6152 root->orphan_block_rsv, 0, 5);
6153 if (ret) {
6154 BUG_ON(ret != -EAGAIN);
6155 ret = btrfs_commit_transaction(trans, root);
6156 BUG_ON(ret);
6157 trans = NULL;
6158 continue;
6159 }
6160
5182 ret = btrfs_truncate_inode_items(trans, root, inode, 6161 ret = btrfs_truncate_inode_items(trans, root, inode,
5183 inode->i_size, 6162 inode->i_size,
5184 BTRFS_EXTENT_DATA_KEY); 6163 BTRFS_EXTENT_DATA_KEY);
@@ -5190,10 +6169,8 @@ static void btrfs_truncate(struct inode *inode)
5190 6169
5191 nr = trans->blocks_used; 6170 nr = trans->blocks_used;
5192 btrfs_end_transaction(trans, root); 6171 btrfs_end_transaction(trans, root);
6172 trans = NULL;
5193 btrfs_btree_balance_dirty(root, nr); 6173 btrfs_btree_balance_dirty(root, nr);
5194
5195 trans = btrfs_start_transaction(root, 1);
5196 btrfs_set_trans_block_group(trans, inode);
5197 } 6174 }
5198 6175
5199 if (ret == 0 && inode->i_nlink > 0) { 6176 if (ret == 0 && inode->i_nlink > 0) {
@@ -5254,21 +6231,47 @@ unsigned long btrfs_force_ra(struct address_space *mapping,
5254 struct inode *btrfs_alloc_inode(struct super_block *sb) 6231 struct inode *btrfs_alloc_inode(struct super_block *sb)
5255{ 6232{
5256 struct btrfs_inode *ei; 6233 struct btrfs_inode *ei;
6234 struct inode *inode;
5257 6235
5258 ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS); 6236 ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS);
5259 if (!ei) 6237 if (!ei)
5260 return NULL; 6238 return NULL;
6239
6240 ei->root = NULL;
6241 ei->space_info = NULL;
6242 ei->generation = 0;
6243 ei->sequence = 0;
5261 ei->last_trans = 0; 6244 ei->last_trans = 0;
5262 ei->last_sub_trans = 0; 6245 ei->last_sub_trans = 0;
5263 ei->logged_trans = 0; 6246 ei->logged_trans = 0;
5264 ei->outstanding_extents = 0; 6247 ei->delalloc_bytes = 0;
5265 ei->reserved_extents = 0; 6248 ei->reserved_bytes = 0;
5266 ei->root = NULL; 6249 ei->disk_i_size = 0;
6250 ei->flags = 0;
6251 ei->index_cnt = (u64)-1;
6252 ei->last_unlink_trans = 0;
6253
5267 spin_lock_init(&ei->accounting_lock); 6254 spin_lock_init(&ei->accounting_lock);
6255 atomic_set(&ei->outstanding_extents, 0);
6256 ei->reserved_extents = 0;
6257
6258 ei->ordered_data_close = 0;
6259 ei->orphan_meta_reserved = 0;
6260 ei->dummy_inode = 0;
6261 ei->force_compress = 0;
6262
6263 inode = &ei->vfs_inode;
6264 extent_map_tree_init(&ei->extent_tree, GFP_NOFS);
6265 extent_io_tree_init(&ei->io_tree, &inode->i_data, GFP_NOFS);
6266 extent_io_tree_init(&ei->io_failure_tree, &inode->i_data, GFP_NOFS);
6267 mutex_init(&ei->log_mutex);
5268 btrfs_ordered_inode_tree_init(&ei->ordered_tree); 6268 btrfs_ordered_inode_tree_init(&ei->ordered_tree);
5269 INIT_LIST_HEAD(&ei->i_orphan); 6269 INIT_LIST_HEAD(&ei->i_orphan);
6270 INIT_LIST_HEAD(&ei->delalloc_inodes);
5270 INIT_LIST_HEAD(&ei->ordered_operations); 6271 INIT_LIST_HEAD(&ei->ordered_operations);
5271 return &ei->vfs_inode; 6272 RB_CLEAR_NODE(&ei->rb_node);
6273
6274 return inode;
5272 } 6275 }
5273 6276
5274 void btrfs_destroy_inode(struct inode *inode) 6277 void btrfs_destroy_inode(struct inode *inode)
@@ -5278,6 +6281,8 @@ void btrfs_destroy_inode(struct inode *inode)
5278 6281
5279 WARN_ON(!list_empty(&inode->i_dentry)); 6282 WARN_ON(!list_empty(&inode->i_dentry));
5280 WARN_ON(inode->i_data.nrpages); 6283 WARN_ON(inode->i_data.nrpages);
6284 WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents));
6285 WARN_ON(BTRFS_I(inode)->reserved_extents);
5281 6286
5282 /* 6287 /*
5283 * This can happen where we create an inode, but somebody else also 6288 * This can happen where we create an inode, but somebody else also
@@ -5298,13 +6303,13 @@ void btrfs_destroy_inode(struct inode *inode)
5298 spin_unlock(&root->fs_info->ordered_extent_lock); 6303 spin_unlock(&root->fs_info->ordered_extent_lock);
5299 } 6304 }
5300 6305
5301 spin_lock(&root->list_lock); 6306 spin_lock(&root->orphan_lock);
5302 if (!list_empty(&BTRFS_I(inode)->i_orphan)) { 6307 if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
5303 printk(KERN_INFO "BTRFS: inode %lu still on the orphan list\n", 6308 printk(KERN_INFO "BTRFS: inode %lu still on the orphan list\n",
5304 inode->i_ino); 6309 inode->i_ino);
5305 list_del_init(&BTRFS_I(inode)->i_orphan); 6310 list_del_init(&BTRFS_I(inode)->i_orphan);
5306 } 6311 }
5307 spin_unlock(&root->list_lock); 6312 spin_unlock(&root->orphan_lock);
5308 6313
5309 while (1) { 6314 while (1) {
5310 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); 6315 ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
@@ -5425,19 +6430,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
5425 if (S_ISDIR(old_inode->i_mode) && new_inode && 6430 if (S_ISDIR(old_inode->i_mode) && new_inode &&
5426 new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) 6431 new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
5427 return -ENOTEMPTY; 6432 return -ENOTEMPTY;
5428
5429 /*
5430 * We want to reserve the absolute worst case amount of items. So if
5431 * both inodes are subvols and we need to unlink them then that would
5432 * require 4 item modifications, but if they are both normal inodes it
5433 * would require 5 item modifications, so we'll assume they're normal
5434 * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items
5435 * should cover the worst case number of items we'll modify.
5436 */
5437 ret = btrfs_reserve_metadata_space(root, 11);
5438 if (ret)
5439 return ret;
5440
5441 /* 6433 /*
5442 * we're using rename to replace one file with another. 6434 * we're using rename to replace one file with another.
5443 * and the replacement file is large. Start IO on it now so 6435 * and the replacement file is large. Start IO on it now so
@@ -5450,8 +6442,18 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
5450 /* close the racy window with snapshot create/destroy ioctl */ 6442 /* close the racy window with snapshot create/destroy ioctl */
5451 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) 6443 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
5452 down_read(&root->fs_info->subvol_sem); 6444 down_read(&root->fs_info->subvol_sem);
6445 /*
6446 * We want to reserve the absolute worst case amount of items. So if
6447 * both inodes are subvols and we need to unlink them then that would
6448 * require 4 item modifications, but if they are both normal inodes it
6449 * would require 5 item modifications, so we'll assume their normal
6450 * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items
6451 * should cover the worst case number of items we'll modify.
6452 */
6453 trans = btrfs_start_transaction(root, 20);
6454 if (IS_ERR(trans))
6455 return PTR_ERR(trans);
5453 6456
5454 trans = btrfs_start_transaction(root, 1);
5455 btrfs_set_trans_block_group(trans, new_dir); 6457 btrfs_set_trans_block_group(trans, new_dir);
5456 6458
5457 if (dest != root) 6459 if (dest != root)
@@ -5550,7 +6552,6 @@ out_fail:
5550 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) 6552 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
5551 up_read(&root->fs_info->subvol_sem); 6553 up_read(&root->fs_info->subvol_sem);
5552 6554
5553 btrfs_unreserve_metadata_space(root, 11);
5554 return ret; 6555 return ret;
5555 } 6556 }
5556 6557
@@ -5602,6 +6603,38 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
5602 return 0; 6603 return 0;
5603 } 6604 }
5604 6605
6606 int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput)
6607 {
6608 struct btrfs_inode *binode;
6609 struct inode *inode = NULL;
6610
6611 spin_lock(&root->fs_info->delalloc_lock);
6612 while (!list_empty(&root->fs_info->delalloc_inodes)) {
6613 binode = list_entry(root->fs_info->delalloc_inodes.next,
6614 struct btrfs_inode, delalloc_inodes);
6615 inode = igrab(&binode->vfs_inode);
6616 if (inode) {
6617 list_move_tail(&binode->delalloc_inodes,
6618 &root->fs_info->delalloc_inodes);
6619 break;
6620 }
6621
6622 list_del_init(&binode->delalloc_inodes);
6623 cond_resched_lock(&root->fs_info->delalloc_lock);
6624 }
6625 spin_unlock(&root->fs_info->delalloc_lock);
6626
6627 if (inode) {
6628 write_inode_now(inode, 0);
6629 if (delay_iput)
6630 btrfs_add_delayed_iput(inode);
6631 else
6632 iput(inode);
6633 return 1;
6634 }
6635 return 0;
6636 }
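
btrfs_start_one_delalloc_inode() holds the shared lock only long enough to pick a single inode off the delalloc list, re-queueing it at the tail, and does the actual writeback after the lock is dropped. A pthread skeleton of the same take-one-under-the-lock pattern (toy list node; the kernel pins the inode with igrab()/iput() under a spinlock):

#include <pthread.h>
#include <stdio.h>

struct node { struct node *next; int id; int live; };

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct node *head;

static struct node *take_one(void)
{
	struct node *n = NULL;

	pthread_mutex_lock(&list_lock);
	while (head) {
		if (head->live) {       /* igrab() succeeded, in the original */
			n = head;
			break;
		}
		head = head->next;      /* stale entry: drop it, keep looking */
	}
	pthread_mutex_unlock(&list_lock);
	return n;
}

int main(void)
{
	struct node b = { NULL, 2, 1 }, a = { &b, 1, 0 };
	struct node *n;

	head = &a;
	n = take_one();
	if (n)
		printf("flushing inode %d outside the lock\n", n->id);
	return 0;
}
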
6637
5605 static int btrfs_symlink(struct inode *dir, struct dentry *dentry, 6638 static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
5606 const char *symname) 6639 const char *symname)
5607 { 6640 {
@@ -5625,26 +6658,20 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
5625 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root)) 6658 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
5626 return -ENAMETOOLONG; 6659 return -ENAMETOOLONG;
5627 6660
6661 err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
6662 if (err)
6663 return err;
5628 /* 6664 /*
5629 * 2 items for inode item and ref 6665 * 2 items for inode item and ref
5630 * 2 items for dir items 6666 * 2 items for dir items
5631 * 1 item for xattr if selinux is on 6667 * 1 item for xattr if selinux is on
5632 */ 6668 */
5633 err = btrfs_reserve_metadata_space(root, 5); 6669 trans = btrfs_start_transaction(root, 5);
5634 if (err) 6670 if (IS_ERR(trans))
5635 return err; 6671 return PTR_ERR(trans);
5636 6672
5637 trans = btrfs_start_transaction(root, 1);
5638 if (!trans)
5639 goto out_fail;
5640 btrfs_set_trans_block_group(trans, dir); 6673 btrfs_set_trans_block_group(trans, dir);
5641 6674
5642 err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
5643 if (err) {
5644 err = -ENOSPC;
5645 goto out_unlock;
5646 }
5647
5648 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 6675 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
5649 dentry->d_name.len, 6676 dentry->d_name.len,
5650 dentry->d_parent->d_inode->i_ino, objectid, 6677 dentry->d_parent->d_inode->i_ino, objectid,
@@ -5716,8 +6743,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
5716out_unlock: 6743out_unlock:
5717 nr = trans->blocks_used; 6744 nr = trans->blocks_used;
5718 btrfs_end_transaction_throttle(trans, root); 6745 btrfs_end_transaction_throttle(trans, root);
5719out_fail:
5720 btrfs_unreserve_metadata_space(root, 5);
5721 if (drop_inode) { 6746 if (drop_inode) {
5722 inode_dec_link_count(inode); 6747 inode_dec_link_count(inode);
5723 iput(inode); 6748 iput(inode);
@@ -5726,33 +6751,28 @@ out_fail:
5726 return err; 6751 return err;
5727} 6752}
5728 6753
5729 static int prealloc_file_range(struct inode *inode, u64 start, u64 end, 6754 int btrfs_prealloc_file_range(struct inode *inode, int mode,
5730 u64 alloc_hint, int mode, loff_t actual_len) 6755 u64 start, u64 num_bytes, u64 min_size,
6756 loff_t actual_len, u64 *alloc_hint)
5731 { 6757 {
5732 struct btrfs_trans_handle *trans; 6758 struct btrfs_trans_handle *trans;
5733 struct btrfs_root *root = BTRFS_I(inode)->root; 6759 struct btrfs_root *root = BTRFS_I(inode)->root;
5734 struct btrfs_key ins; 6760 struct btrfs_key ins;
5735 u64 cur_offset = start; 6761 u64 cur_offset = start;
5736 u64 num_bytes = end - start;
5737 int ret = 0; 6762 int ret = 0;
5738 u64 i_size;
5739 6763
5740 while (num_bytes > 0) { 6764 while (num_bytes > 0) {
5741 trans = btrfs_start_transaction(root, 1); 6765 trans = btrfs_start_transaction(root, 3);
5742 6766 if (IS_ERR(trans)) {
5743 ret = btrfs_reserve_extent(trans, root, num_bytes, 6767 ret = PTR_ERR(trans);
5744 root->sectorsize, 0, alloc_hint, 6768 break;
5745 (u64)-1, &ins, 1);
5746 if (ret) {
5747 WARN_ON(1);
5748 goto stop_trans;
5749 } 6769 }
5750 6770
5751 ret = btrfs_reserve_metadata_space(root, 3); 6771 ret = btrfs_reserve_extent(trans, root, num_bytes, min_size,
6772 0, *alloc_hint, (u64)-1, &ins, 1);
5752 if (ret) { 6773 if (ret) {
5753 btrfs_free_reserved_extent(root, ins.objectid, 6774 btrfs_end_transaction(trans, root);
5754 ins.offset); 6775 break;
5755 goto stop_trans;
5756 } 6776 }
5757 6777
5758 ret = insert_reserved_file_extent(trans, inode, 6778 ret = insert_reserved_file_extent(trans, inode,
@@ -5766,34 +6786,27 @@ static int prealloc_file_range(struct inode *inode, u64 start, u64 end,
5766 6786
5767 num_bytes -= ins.offset; 6787 num_bytes -= ins.offset;
5768 cur_offset += ins.offset; 6788 cur_offset += ins.offset;
5769 alloc_hint = ins.objectid + ins.offset; 6789 *alloc_hint = ins.objectid + ins.offset;
5770 6790
5771 inode->i_ctime = CURRENT_TIME; 6791 inode->i_ctime = CURRENT_TIME;
5772 BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC; 6792 BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC;
5773 if (!(mode & FALLOC_FL_KEEP_SIZE) && 6793 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
5774 (actual_len > inode->i_size) && 6794 (actual_len > inode->i_size) &&
5775 (cur_offset > inode->i_size)) { 6795 (cur_offset > inode->i_size)) {
5776
5777 if (cur_offset > actual_len) 6796 if (cur_offset > actual_len)
5778 i_size = actual_len; 6797 i_size_write(inode, actual_len);
5779 else 6798 else
5780 i_size = cur_offset; 6799 i_size_write(inode, cur_offset);
5781 i_size_write(inode, i_size); 6800 i_size_write(inode, cur_offset);
5782 btrfs_ordered_update_i_size(inode, i_size, NULL); 6801 btrfs_ordered_update_i_size(inode, cur_offset, NULL);
5783 } 6802 }
5784 6803
5785 ret = btrfs_update_inode(trans, root, inode); 6804 ret = btrfs_update_inode(trans, root, inode);
5786 BUG_ON(ret); 6805 BUG_ON(ret);
5787 6806
5788 btrfs_end_transaction(trans, root); 6807 btrfs_end_transaction(trans, root);
5789 btrfs_unreserve_metadata_space(root, 3);
5790 } 6808 }
5791 return ret; 6809 return ret;
5792
5793stop_trans:
5794 btrfs_end_transaction(trans, root);
5795 return ret;
5796
5797} 6810}
5798 6811
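prealloc_file_range() becomes the exported btrfs_prealloc_file_range(): the (start, end) pair turns into (start, num_bytes), a min_size for the smallest acceptable extent is added, and the allocation hint is passed by pointer so it carries over between calls. A hedged sketch of a caller; the sizes and mode below are illustrative only:

        u64 alloc_hint = 0;
        int ret;

        /* preallocate 1MiB at offset 0, illustrative values */
        ret = btrfs_prealloc_file_range(inode, 0 /* mode */, 0 /* start */,
                                        1024 * 1024 /* num_bytes */,
                                        BTRFS_I(inode)->root->sectorsize,
                                        1024 * 1024 /* actual_len */,
                                        &alloc_hint);
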
5799static long btrfs_fallocate(struct inode *inode, int mode, 6812static long btrfs_fallocate(struct inode *inode, int mode,
@@ -5826,8 +6839,7 @@ static long btrfs_fallocate(struct inode *inode, int mode,
5826 goto out; 6839 goto out;
5827 } 6840 }
5828 6841
5829 ret = btrfs_check_data_free_space(BTRFS_I(inode)->root, inode, 6842 ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
5830 alloc_end - alloc_start);
5831 if (ret) 6843 if (ret)
5832 goto out; 6844 goto out;
5833 6845
@@ -5872,16 +6884,16 @@ static long btrfs_fallocate(struct inode *inode, int mode,
5872 if (em->block_start == EXTENT_MAP_HOLE || 6884 if (em->block_start == EXTENT_MAP_HOLE ||
5873 (cur_offset >= inode->i_size && 6885 (cur_offset >= inode->i_size &&
5874 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { 6886 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
5875 ret = prealloc_file_range(inode, 6887 ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
5876 cur_offset, last_byte, 6888 last_byte - cur_offset,
5877 alloc_hint, mode, offset+len); 6889 1 << inode->i_blkbits,
6890 offset + len,
6891 &alloc_hint);
5878 if (ret < 0) { 6892 if (ret < 0) {
5879 free_extent_map(em); 6893 free_extent_map(em);
5880 break; 6894 break;
5881 } 6895 }
5882 } 6896 }
5883 if (em->block_start <= EXTENT_MAP_LAST_BYTE)
5884 alloc_hint = em->block_start;
5885 free_extent_map(em); 6897 free_extent_map(em);
5886 6898
5887 cur_offset = last_byte; 6899 cur_offset = last_byte;
@@ -5893,8 +6905,7 @@ static long btrfs_fallocate(struct inode *inode, int mode,
5893 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, 6905 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
5894 &cached_state, GFP_NOFS); 6906 &cached_state, GFP_NOFS);
5895 6907
5896 btrfs_free_reserved_data_space(BTRFS_I(inode)->root, inode, 6908 btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
5897 alloc_end - alloc_start);
5898out: 6909out:
5899 mutex_unlock(&inode->i_mutex); 6910 mutex_unlock(&inode->i_mutex);
5900 return ret; 6911 return ret;
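btrfs_check_data_free_space() and btrfs_free_reserved_data_space() drop their root argument and take just the inode and a byte count, as the fallocate hunks above show; a reservation must still be matched by a release on every path:

        ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
        if (ret)
                goto out;
        /* ... allocate the extents ... */
        btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
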
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 97a97839a867..4dbaf89b1337 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -239,23 +239,19 @@ static noinline int create_subvol(struct btrfs_root *root,
239 u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID; 239 u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
240 u64 index = 0; 240 u64 index = 0;
241 241
242 ret = btrfs_find_free_objectid(NULL, root->fs_info->tree_root,
243 0, &objectid);
244 if (ret)
245 return ret;
242 /* 246 /*
243 * 1 - inode item 247 * 1 - inode item
244 * 2 - refs 248 * 2 - refs
245 * 1 - root item 249 * 1 - root item
246 * 2 - dir items 250 * 2 - dir items
247 */ 251 */
248 ret = btrfs_reserve_metadata_space(root, 6); 252 trans = btrfs_start_transaction(root, 6);
249 if (ret) 253 if (IS_ERR(trans))
250 return ret; 254 return PTR_ERR(trans);
251
252 trans = btrfs_start_transaction(root, 1);
253 BUG_ON(!trans);
254
255 ret = btrfs_find_free_objectid(trans, root->fs_info->tree_root,
256 0, &objectid);
257 if (ret)
258 goto fail;
259 255
260 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 256 leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
261 0, objectid, NULL, 0, 0, 0); 257 0, objectid, NULL, 0, 0, 0);
@@ -345,13 +341,10 @@ fail:
345 err = btrfs_commit_transaction(trans, root); 341 err = btrfs_commit_transaction(trans, root);
346 if (err && !ret) 342 if (err && !ret)
347 ret = err; 343 ret = err;
348
349 btrfs_unreserve_metadata_space(root, 6);
350 return ret; 344 return ret;
351} 345}
352 346
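create_subvol() gets the same conversion: the objectid lookup moves ahead of the transaction (passing a NULL handle), so a lookup failure needs no transaction unwind, and the six-item reservation rides on btrfs_start_transaction():

        ret = btrfs_find_free_objectid(NULL, root->fs_info->tree_root,
                                       0, &objectid);
        if (ret)
                return ret;

        /* 1 inode item + 2 refs + 1 root item + 2 dir items */
        trans = btrfs_start_transaction(root, 6);
        if (IS_ERR(trans))
                return PTR_ERR(trans);
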
353static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, 347static int create_snapshot(struct btrfs_root *root, struct dentry *dentry)
354 char *name, int namelen)
355{ 348{
356 struct inode *inode; 349 struct inode *inode;
357 struct btrfs_pending_snapshot *pending_snapshot; 350 struct btrfs_pending_snapshot *pending_snapshot;
@@ -361,40 +354,33 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
361 if (!root->ref_cows) 354 if (!root->ref_cows)
362 return -EINVAL; 355 return -EINVAL;
363 356
364 /*
365 * 1 - inode item
366 * 2 - refs
367 * 1 - root item
368 * 2 - dir items
369 */
370 ret = btrfs_reserve_metadata_space(root, 6);
371 if (ret)
372 goto fail;
373
374 pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS); 357 pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
375 if (!pending_snapshot) { 358 if (!pending_snapshot)
376 ret = -ENOMEM; 359 return -ENOMEM;
377 btrfs_unreserve_metadata_space(root, 6); 360
378 goto fail; 361 btrfs_init_block_rsv(&pending_snapshot->block_rsv);
379 }
380 pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS);
381 if (!pending_snapshot->name) {
382 ret = -ENOMEM;
383 kfree(pending_snapshot);
384 btrfs_unreserve_metadata_space(root, 6);
385 goto fail;
386 }
387 memcpy(pending_snapshot->name, name, namelen);
388 pending_snapshot->name[namelen] = '\0';
389 pending_snapshot->dentry = dentry; 362 pending_snapshot->dentry = dentry;
390 trans = btrfs_start_transaction(root, 1);
391 BUG_ON(!trans);
392 pending_snapshot->root = root; 363 pending_snapshot->root = root;
364
365 trans = btrfs_start_transaction(root->fs_info->extent_root, 5);
366 if (IS_ERR(trans)) {
367 ret = PTR_ERR(trans);
368 goto fail;
369 }
370
371 ret = btrfs_snap_reserve_metadata(trans, pending_snapshot);
372 BUG_ON(ret);
373
393 list_add(&pending_snapshot->list, 374 list_add(&pending_snapshot->list,
394 &trans->transaction->pending_snapshots); 375 &trans->transaction->pending_snapshots);
395 ret = btrfs_commit_transaction(trans, root); 376 ret = btrfs_commit_transaction(trans, root->fs_info->extent_root);
396 BUG_ON(ret); 377 BUG_ON(ret);
397 btrfs_unreserve_metadata_space(root, 6); 378
379 ret = pending_snapshot->error;
380 if (ret)
381 goto fail;
382
383 btrfs_orphan_cleanup(pending_snapshot->snap);
398 384
399 inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry); 385 inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry);
400 if (IS_ERR(inode)) { 386 if (IS_ERR(inode)) {
@@ -405,6 +391,7 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
405 d_instantiate(dentry, inode); 391 d_instantiate(dentry, inode);
406 ret = 0; 392 ret = 0;
407fail: 393fail:
394 kfree(pending_snapshot);
408 return ret; 395 return ret;
409} 396}
410 397
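create_snapshot() loses its name/namelen arguments, since the dentry already carries the name, and struct btrfs_pending_snapshot gains a block reservation plus an error field filled in by the commit path. The new flow, condensed from the hunk above:

        btrfs_init_block_rsv(&pending_snapshot->block_rsv);
        pending_snapshot->dentry = dentry;      /* name comes from the dentry */
        pending_snapshot->root = root;

        ret = btrfs_snap_reserve_metadata(trans, pending_snapshot);
        list_add(&pending_snapshot->list,
                 &trans->transaction->pending_snapshots);
        ret = btrfs_commit_transaction(trans, root->fs_info->extent_root);
        if (!ret)
                ret = pending_snapshot->error;  /* recorded during commit */
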
@@ -456,8 +443,7 @@ static noinline int btrfs_mksubvol(struct path *parent,
456 goto out_up_read; 443 goto out_up_read;
457 444
458 if (snap_src) { 445 if (snap_src) {
459 error = create_snapshot(snap_src, dentry, 446 error = create_snapshot(snap_src, dentry);
460 name, namelen);
461 } else { 447 } else {
462 error = create_subvol(BTRFS_I(dir)->root, dentry, 448 error = create_subvol(BTRFS_I(dir)->root, dentry,
463 name, namelen); 449 name, namelen);
@@ -601,19 +587,9 @@ static int btrfs_defrag_file(struct file *file,
601 if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) 587 if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
602 BTRFS_I(inode)->force_compress = 1; 588 BTRFS_I(inode)->force_compress = 1;
603 589
604 ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE); 590 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
605 if (ret) { 591 if (ret)
606 ret = -ENOSPC; 592 goto err_unlock;
607 break;
608 }
609
610 ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
611 if (ret) {
612 btrfs_free_reserved_data_space(root, inode,
613 PAGE_CACHE_SIZE);
614 ret = -ENOSPC;
615 break;
616 }
617again: 593again:
618 if (inode->i_size == 0 || 594 if (inode->i_size == 0 ||
619 i > ((inode->i_size - 1) >> PAGE_CACHE_SHIFT)) { 595 i > ((inode->i_size - 1) >> PAGE_CACHE_SHIFT)) {
@@ -622,8 +598,10 @@ again:
622 } 598 }
623 599
624 page = grab_cache_page(inode->i_mapping, i); 600 page = grab_cache_page(inode->i_mapping, i);
625 if (!page) 601 if (!page) {
602 ret = -ENOMEM;
626 goto err_reservations; 603 goto err_reservations;
604 }
627 605
628 if (!PageUptodate(page)) { 606 if (!PageUptodate(page)) {
629 btrfs_readpage(NULL, page); 607 btrfs_readpage(NULL, page);
@@ -631,6 +609,7 @@ again:
631 if (!PageUptodate(page)) { 609 if (!PageUptodate(page)) {
632 unlock_page(page); 610 unlock_page(page);
633 page_cache_release(page); 611 page_cache_release(page);
612 ret = -EIO;
634 goto err_reservations; 613 goto err_reservations;
635 } 614 }
636 } 615 }
@@ -644,8 +623,7 @@ again:
644 wait_on_page_writeback(page); 623 wait_on_page_writeback(page);
645 624
646 if (PageDirty(page)) { 625 if (PageDirty(page)) {
647 btrfs_free_reserved_data_space(root, inode, 626 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
648 PAGE_CACHE_SIZE);
649 goto loop_unlock; 627 goto loop_unlock;
650 } 628 }
651 629
@@ -683,7 +661,6 @@ loop_unlock:
683 page_cache_release(page); 661 page_cache_release(page);
684 mutex_unlock(&inode->i_mutex); 662 mutex_unlock(&inode->i_mutex);
685 663
686 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
687 balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1); 664 balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1);
688 i++; 665 i++;
689 } 666 }
@@ -713,9 +690,9 @@ loop_unlock:
713 return 0; 690 return 0;
714 691
715err_reservations: 692err_reservations:
693 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
694err_unlock:
716 mutex_unlock(&inode->i_mutex); 695 mutex_unlock(&inode->i_mutex);
717 btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
718 btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
719 return ret; 696 return ret;
720} 697}
721 698
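In the defrag loop the separate data-space check and per-inode metadata reservation collapse into the combined delalloc helpers; one call now reserves both, and one call releases both on failure:

        ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE);
        if (ret)
                goto err_unlock;
        /* ... page work; on any later failure ... */
        btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
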
@@ -811,7 +788,7 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
811 device->name, (unsigned long long)new_size); 788 device->name, (unsigned long long)new_size);
812 789
813 if (new_size > old_size) { 790 if (new_size > old_size) {
814 trans = btrfs_start_transaction(root, 1); 791 trans = btrfs_start_transaction(root, 0);
815 ret = btrfs_grow_device(trans, device, new_size); 792 ret = btrfs_grow_device(trans, device, new_size);
816 btrfs_commit_transaction(trans, root); 793 btrfs_commit_transaction(trans, root);
817 } else { 794 } else {
@@ -1300,7 +1277,13 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
1300 if (err) 1277 if (err)
1301 goto out_up_write; 1278 goto out_up_write;
1302 1279
1303 trans = btrfs_start_transaction(root, 1); 1280 trans = btrfs_start_transaction(root, 0);
1281 if (IS_ERR(trans)) {
1282 err = PTR_ERR(trans);
1283 goto out_up_write;
1284 }
1285 trans->block_rsv = &root->fs_info->global_block_rsv;
1286
1304 ret = btrfs_unlink_subvol(trans, root, dir, 1287 ret = btrfs_unlink_subvol(trans, root, dir,
1305 dest->root_key.objectid, 1288 dest->root_key.objectid,
1306 dentry->d_name.name, 1289 dentry->d_name.name,
@@ -1314,10 +1297,12 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
1314 dest->root_item.drop_level = 0; 1297 dest->root_item.drop_level = 0;
1315 btrfs_set_root_refs(&dest->root_item, 0); 1298 btrfs_set_root_refs(&dest->root_item, 0);
1316 1299
1317 ret = btrfs_insert_orphan_item(trans, 1300 if (!xchg(&dest->orphan_item_inserted, 1)) {
1318 root->fs_info->tree_root, 1301 ret = btrfs_insert_orphan_item(trans,
1319 dest->root_key.objectid); 1302 root->fs_info->tree_root,
1320 BUG_ON(ret); 1303 dest->root_key.objectid);
1304 BUG_ON(ret);
1305 }
1321 1306
1322 ret = btrfs_commit_transaction(trans, root); 1307 ret = btrfs_commit_transaction(trans, root);
1323 BUG_ON(ret); 1308 BUG_ON(ret);
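The xchg() above is a once-only guard: the first caller to flip orphan_item_inserted from 0 to 1 inserts the orphan item, and anyone who finds the flag already set skips the insert, so the subvolume never gets a duplicate orphan item:

        if (!xchg(&dest->orphan_item_inserted, 1)) {
                ret = btrfs_insert_orphan_item(trans,
                                               root->fs_info->tree_root,
                                               dest->root_key.objectid);
                BUG_ON(ret);
        }
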
@@ -1358,8 +1343,10 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
1358 ret = -EPERM; 1343 ret = -EPERM;
1359 goto out; 1344 goto out;
1360 } 1345 }
1361 btrfs_defrag_root(root, 0); 1346 ret = btrfs_defrag_root(root, 0);
1362 btrfs_defrag_root(root->fs_info->extent_root, 0); 1347 if (ret)
1348 goto out;
1349 ret = btrfs_defrag_root(root->fs_info->extent_root, 0);
1363 break; 1350 break;
1364 case S_IFREG: 1351 case S_IFREG:
1365 if (!(file->f_mode & FMODE_WRITE)) { 1352 if (!(file->f_mode & FMODE_WRITE)) {
@@ -1389,9 +1376,11 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
1389 /* the rest are all set to zero by kzalloc */ 1376 /* the rest are all set to zero by kzalloc */
1390 range->len = (u64)-1; 1377 range->len = (u64)-1;
1391 } 1378 }
1392 btrfs_defrag_file(file, range); 1379 ret = btrfs_defrag_file(file, range);
1393 kfree(range); 1380 kfree(range);
1394 break; 1381 break;
1382 default:
1383 ret = -EINVAL;
1395 } 1384 }
1396out: 1385out:
1397 mnt_drop_write(file->f_path.mnt); 1386 mnt_drop_write(file->f_path.mnt);
@@ -1550,12 +1539,6 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1550 btrfs_wait_ordered_range(src, off, off+len); 1539 btrfs_wait_ordered_range(src, off, off+len);
1551 } 1540 }
1552 1541
1553 trans = btrfs_start_transaction(root, 1);
1554 BUG_ON(!trans);
1555
1556 /* punch hole in destination first */
1557 btrfs_drop_extents(trans, inode, off, off + len, &hint_byte, 1);
1558
1559 /* clone data */ 1542 /* clone data */
1560 key.objectid = src->i_ino; 1543 key.objectid = src->i_ino;
1561 key.type = BTRFS_EXTENT_DATA_KEY; 1544 key.type = BTRFS_EXTENT_DATA_KEY;
@@ -1566,7 +1549,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1566 * note the key will change type as we walk through the 1549 * note the key will change type as we walk through the
1567 * tree. 1550 * tree.
1568 */ 1551 */
1569 ret = btrfs_search_slot(trans, root, &key, path, 0, 0); 1552 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1570 if (ret < 0) 1553 if (ret < 0)
1571 goto out; 1554 goto out;
1572 1555
@@ -1629,12 +1612,31 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1629 new_key.objectid = inode->i_ino; 1612 new_key.objectid = inode->i_ino;
1630 new_key.offset = key.offset + destoff - off; 1613 new_key.offset = key.offset + destoff - off;
1631 1614
1615 trans = btrfs_start_transaction(root, 1);
1616 if (IS_ERR(trans)) {
1617 ret = PTR_ERR(trans);
1618 goto out;
1619 }
1620
1632 if (type == BTRFS_FILE_EXTENT_REG || 1621 if (type == BTRFS_FILE_EXTENT_REG ||
1633 type == BTRFS_FILE_EXTENT_PREALLOC) { 1622 type == BTRFS_FILE_EXTENT_PREALLOC) {
1623 if (off > key.offset) {
1624 datao += off - key.offset;
1625 datal -= off - key.offset;
1626 }
1627
1628 if (key.offset + datal > off + len)
1629 datal = off + len - key.offset;
1630
1631 ret = btrfs_drop_extents(trans, inode,
1632 new_key.offset,
1633 new_key.offset + datal,
1634 &hint_byte, 1);
1635 BUG_ON(ret);
1636
1634 ret = btrfs_insert_empty_item(trans, root, path, 1637 ret = btrfs_insert_empty_item(trans, root, path,
1635 &new_key, size); 1638 &new_key, size);
1636 if (ret) 1639 BUG_ON(ret);
1637 goto out;
1638 1640
1639 leaf = path->nodes[0]; 1641 leaf = path->nodes[0];
1640 slot = path->slots[0]; 1642 slot = path->slots[0];
@@ -1645,14 +1647,6 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1645 extent = btrfs_item_ptr(leaf, slot, 1647 extent = btrfs_item_ptr(leaf, slot,
1646 struct btrfs_file_extent_item); 1648 struct btrfs_file_extent_item);
1647 1649
1648 if (off > key.offset) {
1649 datao += off - key.offset;
1650 datal -= off - key.offset;
1651 }
1652
1653 if (key.offset + datal > off + len)
1654 datal = off + len - key.offset;
1655
1656 /* disko == 0 means it's a hole */ 1650 /* disko == 0 means it's a hole */
1657 if (!disko) 1651 if (!disko)
1658 datao = 0; 1652 datao = 0;
@@ -1683,14 +1677,21 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1683 1677
1684 if (comp && (skip || trim)) { 1678 if (comp && (skip || trim)) {
1685 ret = -EINVAL; 1679 ret = -EINVAL;
1680 btrfs_end_transaction(trans, root);
1686 goto out; 1681 goto out;
1687 } 1682 }
1688 size -= skip + trim; 1683 size -= skip + trim;
1689 datal -= skip + trim; 1684 datal -= skip + trim;
1685
1686 ret = btrfs_drop_extents(trans, inode,
1687 new_key.offset,
1688 new_key.offset + datal,
1689 &hint_byte, 1);
1690 BUG_ON(ret);
1691
1690 ret = btrfs_insert_empty_item(trans, root, path, 1692 ret = btrfs_insert_empty_item(trans, root, path,
1691 &new_key, size); 1693 &new_key, size);
1692 if (ret) 1694 BUG_ON(ret);
1693 goto out;
1694 1695
1695 if (skip) { 1696 if (skip) {
1696 u32 start = 1697 u32 start =
@@ -1708,8 +1709,17 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1708 } 1709 }
1709 1710
1710 btrfs_mark_buffer_dirty(leaf); 1711 btrfs_mark_buffer_dirty(leaf);
1711 } 1712 btrfs_release_path(root, path);
1712 1713
1714 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
1715 if (new_key.offset + datal > inode->i_size)
1716 btrfs_i_size_write(inode,
1717 new_key.offset + datal);
1718 BTRFS_I(inode)->flags = BTRFS_I(src)->flags;
1719 ret = btrfs_update_inode(trans, root, inode);
1720 BUG_ON(ret);
1721 btrfs_end_transaction(trans, root);
1722 }
1713next: 1723next:
1714 btrfs_release_path(root, path); 1724 btrfs_release_path(root, path);
1715 key.offset++; 1725 key.offset++;
@@ -1717,17 +1727,7 @@ next:
1717 ret = 0; 1727 ret = 0;
1718out: 1728out:
1719 btrfs_release_path(root, path); 1729 btrfs_release_path(root, path);
1720 if (ret == 0) {
1721 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
1722 if (destoff + olen > inode->i_size)
1723 btrfs_i_size_write(inode, destoff + olen);
1724 BTRFS_I(inode)->flags = BTRFS_I(src)->flags;
1725 ret = btrfs_update_inode(trans, root, inode);
1726 }
1727 btrfs_end_transaction(trans, root);
1728 unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS); 1730 unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
1729 if (ret)
1730 vmtruncate(inode, 0);
1731out_unlock: 1731out_unlock:
1732 mutex_unlock(&src->i_mutex); 1732 mutex_unlock(&src->i_mutex);
1733 mutex_unlock(&inode->i_mutex); 1733 mutex_unlock(&inode->i_mutex);
@@ -1845,7 +1845,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
1845 dir_id = btrfs_super_root_dir(&root->fs_info->super_copy); 1845 dir_id = btrfs_super_root_dir(&root->fs_info->super_copy);
1846 di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path, 1846 di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path,
1847 dir_id, "default", 7, 1); 1847 dir_id, "default", 7, 1);
1848 if (!di) { 1848 if (IS_ERR_OR_NULL(di)) {
1849 btrfs_free_path(path); 1849 btrfs_free_path(path);
1850 btrfs_end_transaction(trans, root); 1850 btrfs_end_transaction(trans, root);
1851 printk(KERN_ERR "Umm, you don't have the default dir item, " 1851 printk(KERN_ERR "Umm, you don't have the default dir item, "
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index a127c0ebb2dc..e56c72bc5add 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -124,6 +124,15 @@ static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset)
124 return 1; 124 return 1;
125} 125}
126 126
127static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset,
128 u64 len)
129{
130 if (file_offset + len <= entry->file_offset ||
131 entry->file_offset + entry->len <= file_offset)
132 return 0;
133 return 1;
134}
135
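range_overlaps() treats an ordered extent as the half-open byte range [file_offset, file_offset + len): two ranges overlap exactly when neither ends at or before the other begins. A standalone userspace illustration of the same predicate (not from the patch):

        #include <assert.h>

        static int ranges_overlap(unsigned long long a, unsigned long long alen,
                                  unsigned long long b, unsigned long long blen)
        {
                /* same test as range_overlaps() above, on plain integers */
                return !(a + alen <= b || b + blen <= a);
        }

        int main(void)
        {
                assert(ranges_overlap(0, 10, 5, 10));   /* [0,10) vs [5,15) */
                assert(!ranges_overlap(0, 10, 10, 5));  /* touching ends do not */
                return 0;
        }
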
127/* 136/*
128 * look for the first ordered struct that has this offset, otherwise 137
129 * the first one less than this offset 138 * the first one less than this offset
@@ -161,8 +170,9 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
161 * The tree is given a single reference on the ordered extent that was 170 * The tree is given a single reference on the ordered extent that was
162 * inserted. 171 * inserted.
163 */ 172 */
164int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, 173static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
165 u64 start, u64 len, u64 disk_len, int type) 174 u64 start, u64 len, u64 disk_len,
175 int type, int dio)
166{ 176{
167 struct btrfs_ordered_inode_tree *tree; 177 struct btrfs_ordered_inode_tree *tree;
168 struct rb_node *node; 178 struct rb_node *node;
@@ -182,6 +192,9 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
182 if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE) 192 if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
183 set_bit(type, &entry->flags); 193 set_bit(type, &entry->flags);
184 194
195 if (dio)
196 set_bit(BTRFS_ORDERED_DIRECT, &entry->flags);
197
185 /* one ref for the tree */ 198 /* one ref for the tree */
186 atomic_set(&entry->refs, 1); 199 atomic_set(&entry->refs, 1);
187 init_waitqueue_head(&entry->wait); 200 init_waitqueue_head(&entry->wait);
@@ -203,6 +216,20 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
203 return 0; 216 return 0;
204} 217}
205 218
219int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
220 u64 start, u64 len, u64 disk_len, int type)
221{
222 return __btrfs_add_ordered_extent(inode, file_offset, start, len,
223 disk_len, type, 0);
224}
225
226int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
227 u64 start, u64 len, u64 disk_len, int type)
228{
229 return __btrfs_add_ordered_extent(inode, file_offset, start, len,
230 disk_len, type, 1);
231}
232
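The two exported wrappers differ only in the trailing dio flag, which __btrfs_add_ordered_extent() turns into the new BTRFS_ORDERED_DIRECT bit; a later hunk in this file then uses that bit in btrfs_start_ordered_extent() to skip kicking writeback, since direct I/O bypasses the page cache:

        if (dio)
                set_bit(BTRFS_ORDERED_DIRECT, &entry->flags);
        /* ... in btrfs_start_ordered_extent(): */
        if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
                filemap_fdatawrite_range(inode->i_mapping, start, end);
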
206/* 233/*
207 * Add a struct btrfs_ordered_sum into the list of checksums to be inserted 234 * Add a struct btrfs_ordered_sum into the list of checksums to be inserted
208 * when an ordered extent is finished. If the list covers more than one 235 * when an ordered extent is finished. If the list covers more than one
@@ -311,13 +338,6 @@ static int __btrfs_remove_ordered_extent(struct inode *inode,
311 tree->last = NULL; 338 tree->last = NULL;
312 set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); 339 set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
313 340
314 spin_lock(&BTRFS_I(inode)->accounting_lock);
315 WARN_ON(!BTRFS_I(inode)->outstanding_extents);
316 BTRFS_I(inode)->outstanding_extents--;
317 spin_unlock(&BTRFS_I(inode)->accounting_lock);
318 btrfs_unreserve_metadata_for_delalloc(BTRFS_I(inode)->root,
319 inode, 1);
320
321 spin_lock(&root->fs_info->ordered_extent_lock); 341 spin_lock(&root->fs_info->ordered_extent_lock);
322 list_del_init(&entry->root_extent_list); 342 list_del_init(&entry->root_extent_list);
323 343
@@ -491,7 +511,8 @@ void btrfs_start_ordered_extent(struct inode *inode,
491 * start IO on any dirty ones so the wait doesn't stall waiting 511 * start IO on any dirty ones so the wait doesn't stall waiting
492 * for pdflush to find them 512 * for pdflush to find them
493 */ 513 */
494 filemap_fdatawrite_range(inode->i_mapping, start, end); 514 if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags))
515 filemap_fdatawrite_range(inode->i_mapping, start, end);
495 if (wait) { 516 if (wait) {
496 wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, 517 wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
497 &entry->flags)); 518 &entry->flags));
@@ -588,6 +609,47 @@ out:
588 return entry; 609 return entry;
589} 610}
590 611
612/* Since the DIO code tries to lock a wide area, we need to look for any ordered
613 * extents that exist in the range, rather than just the start of the range.
614 */
615struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
616 u64 file_offset,
617 u64 len)
618{
619 struct btrfs_ordered_inode_tree *tree;
620 struct rb_node *node;
621 struct btrfs_ordered_extent *entry = NULL;
622
623 tree = &BTRFS_I(inode)->ordered_tree;
624 spin_lock(&tree->lock);
625 node = tree_search(tree, file_offset);
626 if (!node) {
627 node = tree_search(tree, file_offset + len);
628 if (!node)
629 goto out;
630 }
631
632 while (1) {
633 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
634 if (range_overlaps(entry, file_offset, len))
635 break;
636
637 if (entry->file_offset >= file_offset + len) {
638 entry = NULL;
639 break;
640 }
641 entry = NULL;
642 node = rb_next(node);
643 if (!node)
644 break;
645 }
646out:
647 if (entry)
648 atomic_inc(&entry->refs);
649 spin_unlock(&tree->lock);
650 return entry;
651}
652
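A hedged sketch of how a direct-I/O caller might use the new range lookup; lockstart and locklen are placeholders, and the wait and put calls are the existing ordered-data API, not new code. The reference the lookup returns must be dropped with btrfs_put_ordered_extent():

        struct btrfs_ordered_extent *ordered;

        ordered = btrfs_lookup_ordered_range(inode, lockstart, locklen);
        if (ordered) {
                btrfs_start_ordered_extent(inode, ordered, 1 /* wait */);
                btrfs_put_ordered_extent(ordered);
        }
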
591/* 653/*
592 * lookup and return any extent before 'file_offset'. NULL is returned 654 * lookup and return any extent before 'file_offset'. NULL is returned
593 * if none is found 655 * if none is found
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index c82f76a9f040..8ac365492a3f 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -72,6 +72,8 @@ struct btrfs_ordered_sum {
72 72
73#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */ 73#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */
74 74
75#define BTRFS_ORDERED_DIRECT 5 /* set when we're doing DIO with this extent */
76
75struct btrfs_ordered_extent { 77struct btrfs_ordered_extent {
76 /* logical offset in the file */ 78 /* logical offset in the file */
77 u64 file_offset; 79 u64 file_offset;
@@ -140,7 +142,9 @@ int btrfs_dec_test_ordered_pending(struct inode *inode,
140 struct btrfs_ordered_extent **cached, 142 struct btrfs_ordered_extent **cached,
141 u64 file_offset, u64 io_size); 143 u64 file_offset, u64 io_size);
142int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, 144int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
143 u64 start, u64 len, u64 disk_len, int tyep); 145 u64 start, u64 len, u64 disk_len, int type);
146int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
147 u64 start, u64 len, u64 disk_len, int type);
144int btrfs_add_ordered_sum(struct inode *inode, 148int btrfs_add_ordered_sum(struct inode *inode,
145 struct btrfs_ordered_extent *entry, 149 struct btrfs_ordered_extent *entry,
146 struct btrfs_ordered_sum *sum); 150 struct btrfs_ordered_sum *sum);
@@ -151,6 +155,9 @@ void btrfs_start_ordered_extent(struct inode *inode,
151int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len); 155int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
152struct btrfs_ordered_extent * 156struct btrfs_ordered_extent *
153btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset); 157btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
158struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode,
159 u64 file_offset,
160 u64 len);
154int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, 161int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
155 struct btrfs_ordered_extent *ordered); 162 struct btrfs_ordered_extent *ordered);
156int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum); 163int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum);
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index e558dd941ded..b37d723b9d4a 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -44,8 +44,12 @@ struct tree_entry {
44struct backref_node { 44struct backref_node {
45 struct rb_node rb_node; 45 struct rb_node rb_node;
46 u64 bytenr; 46 u64 bytenr;
47 /* objectid tree block owner */ 47
48 u64 new_bytenr;
49 /* objectid of tree block owner; may not be up to date */
48 u64 owner; 50 u64 owner;
51 /* link to pending, changed or detached list */
52 struct list_head list;
49 /* list of upper level blocks reference this block */ 53 /* list of upper level blocks reference this block */
50 struct list_head upper; 54 struct list_head upper;
51 /* list of child blocks in the cache */ 55 /* list of child blocks in the cache */
@@ -56,9 +60,9 @@ struct backref_node {
56 struct extent_buffer *eb; 60 struct extent_buffer *eb;
57 /* level of tree block */ 61 /* level of tree block */
58 unsigned int level:8; 62 unsigned int level:8;
59 /* 1 if the block is root of old snapshot */ 63 /* 1 if the block is in a non-reference-counted tree */
60 unsigned int old_root:1; 64 unsigned int cowonly:1;
61 /* 1 if no child blocks in the cache */ 65 /* 1 if no child node in the cache */
62 unsigned int lowest:1; 66 unsigned int lowest:1;
63 /* is the extent buffer locked */ 67 /* is the extent buffer locked */
64 unsigned int locked:1; 68 unsigned int locked:1;
@@ -66,6 +70,16 @@ struct backref_node {
66 unsigned int processed:1; 70 unsigned int processed:1;
67 /* have backrefs of this block been checked */ 71 /* have backrefs of this block been checked */
68 unsigned int checked:1; 72 unsigned int checked:1;
73 /*
74 * 1 if corresponding block has been cowed but some upper
75 * level block pointers may not point to the new location
76 */
77 unsigned int pending:1;
78 /*
79 * 1 if the backref node isn't connected to any other
80 * backref node.
81 */
82 unsigned int detached:1;
69}; 83};
70 84
71/* 85/*
@@ -74,7 +88,6 @@ struct backref_node {
74struct backref_edge { 88struct backref_edge {
75 struct list_head list[2]; 89 struct list_head list[2];
76 struct backref_node *node[2]; 90 struct backref_node *node[2];
77 u64 blockptr;
78}; 91};
79 92
80#define LOWER 0 93#define LOWER 0
@@ -83,9 +96,25 @@ struct backref_edge {
83struct backref_cache { 96struct backref_cache {
84 /* red black tree of all backref nodes in the cache */ 97 /* red black tree of all backref nodes in the cache */
85 struct rb_root rb_root; 98 struct rb_root rb_root;
86 /* list of backref nodes with no child block in the cache */ 99 /* for passing backref nodes to btrfs_reloc_cow_block */
100 struct backref_node *path[BTRFS_MAX_LEVEL];
101 /*
102 * list of blocks that have been cowed but some block
103 * pointers in upper level blocks may not reflect the
104 * new location
105 */
87 struct list_head pending[BTRFS_MAX_LEVEL]; 106 struct list_head pending[BTRFS_MAX_LEVEL];
88 spinlock_t lock; 107 /* list of backref nodes with no child node */
108 struct list_head leaves;
109 /* list of blocks that have been cowed in current transaction */
110 struct list_head changed;
111 /* list of detached backref nodes */
112 struct list_head detached;
113
114 u64 last_trans;
115
116 int nr_nodes;
117 int nr_edges;
89}; 118};
90 119
91/* 120/*
@@ -113,15 +142,6 @@ struct tree_block {
113 unsigned int key_ready:1; 142 unsigned int key_ready:1;
114}; 143};
115 144
116/* inode vector */
117#define INODEVEC_SIZE 16
118
119struct inodevec {
120 struct list_head list;
121 struct inode *inode[INODEVEC_SIZE];
122 int nr;
123};
124
125#define MAX_EXTENTS 128 145#define MAX_EXTENTS 128
126 146
127struct file_extent_cluster { 147struct file_extent_cluster {
@@ -138,36 +158,43 @@ struct reloc_control {
138 struct btrfs_root *extent_root; 158 struct btrfs_root *extent_root;
139 /* inode for moving data */ 159 /* inode for moving data */
140 struct inode *data_inode; 160 struct inode *data_inode;
141 struct btrfs_workers workers; 161
162 struct btrfs_block_rsv *block_rsv;
163
164 struct backref_cache backref_cache;
165
166 struct file_extent_cluster cluster;
142 /* tree blocks have been processed */ 167 /* tree blocks have been processed */
143 struct extent_io_tree processed_blocks; 168 struct extent_io_tree processed_blocks;
144 /* map start of tree root to corresponding reloc tree */ 169 /* map start of tree root to corresponding reloc tree */
145 struct mapping_tree reloc_root_tree; 170 struct mapping_tree reloc_root_tree;
146 /* list of reloc trees */ 171 /* list of reloc trees */
147 struct list_head reloc_roots; 172 struct list_head reloc_roots;
173 /* size of metadata reservation for merging reloc trees */
174 u64 merging_rsv_size;
175 /* size of relocated tree nodes */
176 u64 nodes_relocated;
177
148 u64 search_start; 178 u64 search_start;
149 u64 extents_found; 179 u64 extents_found;
150 u64 extents_skipped; 180
151 int stage; 181 int block_rsv_retries;
152 int create_reloc_root; 182
183 unsigned int stage:8;
184 unsigned int create_reloc_tree:1;
185 unsigned int merge_reloc_tree:1;
153 unsigned int found_file_extent:1; 186 unsigned int found_file_extent:1;
154 unsigned int found_old_snapshot:1; 187 unsigned int commit_transaction:1;
155}; 188};
156 189
157/* stages of data relocation */ 190/* stages of data relocation */
158#define MOVE_DATA_EXTENTS 0 191#define MOVE_DATA_EXTENTS 0
159#define UPDATE_DATA_PTRS 1 192#define UPDATE_DATA_PTRS 1
160 193
161/* 194static void remove_backref_node(struct backref_cache *cache,
162 * merge reloc tree to corresponding fs tree in worker threads 195 struct backref_node *node);
163 */ 196static void __mark_block_processed(struct reloc_control *rc,
164struct async_merge { 197 struct backref_node *node);
165 struct btrfs_work work;
166 struct reloc_control *rc;
167 struct btrfs_root *root;
168 struct completion *done;
169 atomic_t *num_pending;
170};
171 198
172static void mapping_tree_init(struct mapping_tree *tree) 199static void mapping_tree_init(struct mapping_tree *tree)
173{ 200{
@@ -181,15 +208,80 @@ static void backref_cache_init(struct backref_cache *cache)
181 cache->rb_root = RB_ROOT; 208 cache->rb_root = RB_ROOT;
182 for (i = 0; i < BTRFS_MAX_LEVEL; i++) 209 for (i = 0; i < BTRFS_MAX_LEVEL; i++)
183 INIT_LIST_HEAD(&cache->pending[i]); 210 INIT_LIST_HEAD(&cache->pending[i]);
184 spin_lock_init(&cache->lock); 211 INIT_LIST_HEAD(&cache->changed);
212 INIT_LIST_HEAD(&cache->detached);
213 INIT_LIST_HEAD(&cache->leaves);
214}
215
216static void backref_cache_cleanup(struct backref_cache *cache)
217{
218 struct backref_node *node;
219 int i;
220
221 while (!list_empty(&cache->detached)) {
222 node = list_entry(cache->detached.next,
223 struct backref_node, list);
224 remove_backref_node(cache, node);
225 }
226
227 while (!list_empty(&cache->leaves)) {
228 node = list_entry(cache->leaves.next,
229 struct backref_node, lower);
230 remove_backref_node(cache, node);
231 }
232
233 cache->last_trans = 0;
234
235 for (i = 0; i < BTRFS_MAX_LEVEL; i++)
236 BUG_ON(!list_empty(&cache->pending[i]));
237 BUG_ON(!list_empty(&cache->changed));
238 BUG_ON(!list_empty(&cache->detached));
239 BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root));
240 BUG_ON(cache->nr_nodes);
241 BUG_ON(cache->nr_edges);
242}
243
244static struct backref_node *alloc_backref_node(struct backref_cache *cache)
245{
246 struct backref_node *node;
247
248 node = kzalloc(sizeof(*node), GFP_NOFS);
249 if (node) {
250 INIT_LIST_HEAD(&node->list);
251 INIT_LIST_HEAD(&node->upper);
252 INIT_LIST_HEAD(&node->lower);
253 RB_CLEAR_NODE(&node->rb_node);
254 cache->nr_nodes++;
255 }
256 return node;
257}
258
259static void free_backref_node(struct backref_cache *cache,
260 struct backref_node *node)
261{
262 if (node) {
263 cache->nr_nodes--;
264 kfree(node);
265 }
266}
267
268static struct backref_edge *alloc_backref_edge(struct backref_cache *cache)
269{
270 struct backref_edge *edge;
271
272 edge = kzalloc(sizeof(*edge), GFP_NOFS);
273 if (edge)
274 cache->nr_edges++;
275 return edge;
185} 276}
186 277
187static void backref_node_init(struct backref_node *node) 278static void free_backref_edge(struct backref_cache *cache,
279 struct backref_edge *edge)
188{ 280{
189 memset(node, 0, sizeof(*node)); 281 if (edge) {
190 INIT_LIST_HEAD(&node->upper); 282 cache->nr_edges--;
191 INIT_LIST_HEAD(&node->lower); 283 kfree(edge);
192 RB_CLEAR_NODE(&node->rb_node); 284 }
193} 285}
194 286
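All backref node and edge memory now flows through these four helpers so the cache can count live objects; backref_cache_cleanup() above BUG_ON()s if either counter is nonzero, which turns a leak into a loud failure. Both free helpers accept NULL, so error unwinding stays simple:

        node = alloc_backref_node(cache);
        edge = alloc_backref_edge(cache);
        if (!node || !edge) {
                free_backref_edge(cache, edge); /* NULL-safe */
                free_backref_node(cache, node); /* NULL-safe */
                return -ENOMEM;
        }
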
195static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr, 287static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
@@ -250,6 +342,7 @@ static struct backref_node *walk_up_backref(struct backref_node *node,
250 edges[idx++] = edge; 342 edges[idx++] = edge;
251 node = edge->node[UPPER]; 343 node = edge->node[UPPER];
252 } 344 }
345 BUG_ON(node->detached);
253 *index = idx; 346 *index = idx;
254 return node; 347 return node;
255} 348}
@@ -281,13 +374,18 @@ static struct backref_node *walk_down_backref(struct backref_edge *edges[],
281 return NULL; 374 return NULL;
282} 375}
283 376
377static void unlock_node_buffer(struct backref_node *node)
378{
379 if (node->locked) {
380 btrfs_tree_unlock(node->eb);
381 node->locked = 0;
382 }
383}
384
284static void drop_node_buffer(struct backref_node *node) 385static void drop_node_buffer(struct backref_node *node)
285{ 386{
286 if (node->eb) { 387 if (node->eb) {
287 if (node->locked) { 388 unlock_node_buffer(node);
288 btrfs_tree_unlock(node->eb);
289 node->locked = 0;
290 }
291 free_extent_buffer(node->eb); 389 free_extent_buffer(node->eb);
292 node->eb = NULL; 390 node->eb = NULL;
293 } 391 }
@@ -296,14 +394,14 @@ static void drop_node_buffer(struct backref_node *node)
296static void drop_backref_node(struct backref_cache *tree, 394static void drop_backref_node(struct backref_cache *tree,
297 struct backref_node *node) 395 struct backref_node *node)
298{ 396{
299 BUG_ON(!node->lowest);
300 BUG_ON(!list_empty(&node->upper)); 397 BUG_ON(!list_empty(&node->upper));
301 398
302 drop_node_buffer(node); 399 drop_node_buffer(node);
400 list_del(&node->list);
303 list_del(&node->lower); 401 list_del(&node->lower);
304 402 if (!RB_EMPTY_NODE(&node->rb_node))
305 rb_erase(&node->rb_node, &tree->rb_root); 403 rb_erase(&node->rb_node, &tree->rb_root);
306 kfree(node); 404 free_backref_node(tree, node);
307} 405}
308 406
309/* 407/*
@@ -318,27 +416,121 @@ static void remove_backref_node(struct backref_cache *cache,
318 if (!node) 416 if (!node)
319 return; 417 return;
320 418
321 BUG_ON(!node->lowest); 419 BUG_ON(!node->lowest && !node->detached);
322 while (!list_empty(&node->upper)) { 420 while (!list_empty(&node->upper)) {
323 edge = list_entry(node->upper.next, struct backref_edge, 421 edge = list_entry(node->upper.next, struct backref_edge,
324 list[LOWER]); 422 list[LOWER]);
325 upper = edge->node[UPPER]; 423 upper = edge->node[UPPER];
326 list_del(&edge->list[LOWER]); 424 list_del(&edge->list[LOWER]);
327 list_del(&edge->list[UPPER]); 425 list_del(&edge->list[UPPER]);
328 kfree(edge); 426 free_backref_edge(cache, edge);
427
428 if (RB_EMPTY_NODE(&upper->rb_node)) {
429 BUG_ON(!list_empty(&node->upper));
430 drop_backref_node(cache, node);
431 node = upper;
432 node->lowest = 1;
433 continue;
434 }
329 /* 435 /*
330 * add the node to pending list if no other 436 * add the node to leaf node list if no other
331 * child block cached. 437 * child block cached.
332 */ 438 */
333 if (list_empty(&upper->lower)) { 439 if (list_empty(&upper->lower)) {
334 list_add_tail(&upper->lower, 440 list_add_tail(&upper->lower, &cache->leaves);
335 &cache->pending[upper->level]);
336 upper->lowest = 1; 441 upper->lowest = 1;
337 } 442 }
338 } 443 }
444
339 drop_backref_node(cache, node); 445 drop_backref_node(cache, node);
340} 446}
341 447
448static void update_backref_node(struct backref_cache *cache,
449 struct backref_node *node, u64 bytenr)
450{
451 struct rb_node *rb_node;
452 rb_erase(&node->rb_node, &cache->rb_root);
453 node->bytenr = bytenr;
454 rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node);
455 BUG_ON(rb_node);
456}
457
458/*
459 * update backref cache after a transaction commit
460 */
461static int update_backref_cache(struct btrfs_trans_handle *trans,
462 struct backref_cache *cache)
463{
464 struct backref_node *node;
465 int level = 0;
466
467 if (cache->last_trans == 0) {
468 cache->last_trans = trans->transid;
469 return 0;
470 }
471
472 if (cache->last_trans == trans->transid)
473 return 0;
474
475 /*
476 * detached nodes are used to avoid unnecessary backref
477 * lookup. a transaction commit changes the extent tree,
478 * so the detached nodes are no longer useful.
479 */
480 while (!list_empty(&cache->detached)) {
481 node = list_entry(cache->detached.next,
482 struct backref_node, list);
483 remove_backref_node(cache, node);
484 }
485
486 while (!list_empty(&cache->changed)) {
487 node = list_entry(cache->changed.next,
488 struct backref_node, list);
489 list_del_init(&node->list);
490 BUG_ON(node->pending);
491 update_backref_node(cache, node, node->new_bytenr);
492 }
493
494 /*
495 * some nodes can be left in the pending list if there were
496 * errors during processing the pending nodes.
497 */
498 for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
499 list_for_each_entry(node, &cache->pending[level], list) {
500 BUG_ON(!node->pending);
501 if (node->bytenr == node->new_bytenr)
502 continue;
503 update_backref_node(cache, node, node->new_bytenr);
504 }
505 }
506
507 cache->last_trans = 0;
508 return 1;
509}
510
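update_backref_cache() returns 1 when a transaction commit since last_trans has invalidated cached state: detached nodes are dropped and changed or pending nodes are re-keyed to new_bytenr. A hedged, hypothetical caller pattern would restart its search when that happens:

        if (update_backref_cache(trans, &rc->backref_cache)) {
                /* cached block addresses were remapped; restart the walk */
                btrfs_release_path(rc->extent_root, path);
                goto again;     /* hypothetical restart label */
        }
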
511static int should_ignore_root(struct btrfs_root *root)
512{
513 struct btrfs_root *reloc_root;
514
515 if (!root->ref_cows)
516 return 0;
517
518 reloc_root = root->reloc_root;
519 if (!reloc_root)
520 return 0;
521
522 if (btrfs_root_last_snapshot(&reloc_root->root_item) ==
523 root->fs_info->running_transaction->transid - 1)
524 return 0;
525 /*
526 * if there is a reloc tree and it was created in a previous
527 * transaction, backref lookup can find the reloc tree,
528 * so the backref node for the fs tree root is useless for
529 * relocation.
530 */
531 return 1;
532}
533
342/* 534/*
343 * find reloc tree by address of tree root 535 * find reloc tree by address of tree root
344 */ 536 */
@@ -453,11 +645,12 @@ int find_inline_backref(struct extent_buffer *leaf, int slot,
453 * for all upper level blocks that directly/indirectly reference the 645 * for all upper level blocks that directly/indirectly reference the
454 * block are also cached. 646 * block are also cached.
455 */ 647 */
456static struct backref_node *build_backref_tree(struct reloc_control *rc, 648static noinline_for_stack
457 struct backref_cache *cache, 649struct backref_node *build_backref_tree(struct reloc_control *rc,
458 struct btrfs_key *node_key, 650 struct btrfs_key *node_key,
459 int level, u64 bytenr) 651 int level, u64 bytenr)
460{ 652{
653 struct backref_cache *cache = &rc->backref_cache;
461 struct btrfs_path *path1; 654 struct btrfs_path *path1;
462 struct btrfs_path *path2; 655 struct btrfs_path *path2;
463 struct extent_buffer *eb; 656 struct extent_buffer *eb;
@@ -473,6 +666,8 @@ static struct backref_node *build_backref_tree(struct reloc_control *rc,
473 unsigned long end; 666 unsigned long end;
474 unsigned long ptr; 667 unsigned long ptr;
475 LIST_HEAD(list); 668 LIST_HEAD(list);
669 LIST_HEAD(useless);
670 int cowonly;
476 int ret; 671 int ret;
477 int err = 0; 672 int err = 0;
478 673
@@ -483,15 +678,13 @@ static struct backref_node *build_backref_tree(struct reloc_control *rc,
483 goto out; 678 goto out;
484 } 679 }
485 680
486 node = kmalloc(sizeof(*node), GFP_NOFS); 681 node = alloc_backref_node(cache);
487 if (!node) { 682 if (!node) {
488 err = -ENOMEM; 683 err = -ENOMEM;
489 goto out; 684 goto out;
490 } 685 }
491 686
492 backref_node_init(node);
493 node->bytenr = bytenr; 687 node->bytenr = bytenr;
494 node->owner = 0;
495 node->level = level; 688 node->level = level;
496 node->lowest = 1; 689 node->lowest = 1;
497 cur = node; 690 cur = node;
@@ -587,17 +780,21 @@ again:
587#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 780#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
588 if (key.type == BTRFS_SHARED_BLOCK_REF_KEY || 781 if (key.type == BTRFS_SHARED_BLOCK_REF_KEY ||
589 key.type == BTRFS_EXTENT_REF_V0_KEY) { 782 key.type == BTRFS_EXTENT_REF_V0_KEY) {
590 if (key.objectid == key.offset && 783 if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
591 key.type == BTRFS_EXTENT_REF_V0_KEY) {
592 struct btrfs_extent_ref_v0 *ref0; 784 struct btrfs_extent_ref_v0 *ref0;
593 ref0 = btrfs_item_ptr(eb, path1->slots[0], 785 ref0 = btrfs_item_ptr(eb, path1->slots[0],
594 struct btrfs_extent_ref_v0); 786 struct btrfs_extent_ref_v0);
595 root = find_tree_root(rc, eb, ref0); 787 if (key.objectid == key.offset) {
596 if (root) 788 root = find_tree_root(rc, eb, ref0);
597 cur->root = root; 789 if (root && !should_ignore_root(root))
598 else 790 cur->root = root;
599 cur->old_root = 1; 791 else
600 break; 792 list_add(&cur->list, &useless);
793 break;
794 }
795 if (is_cowonly_root(btrfs_ref_root_v0(eb,
796 ref0)))
797 cur->cowonly = 1;
601 } 798 }
602#else 799#else
603 BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY); 800 BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
@@ -614,22 +811,20 @@ again:
614 break; 811 break;
615 } 812 }
616 813
617 edge = kzalloc(sizeof(*edge), GFP_NOFS); 814 edge = alloc_backref_edge(cache);
618 if (!edge) { 815 if (!edge) {
619 err = -ENOMEM; 816 err = -ENOMEM;
620 goto out; 817 goto out;
621 } 818 }
622 rb_node = tree_search(&cache->rb_root, key.offset); 819 rb_node = tree_search(&cache->rb_root, key.offset);
623 if (!rb_node) { 820 if (!rb_node) {
624 upper = kmalloc(sizeof(*upper), GFP_NOFS); 821 upper = alloc_backref_node(cache);
625 if (!upper) { 822 if (!upper) {
626 kfree(edge); 823 free_backref_edge(cache, edge);
627 err = -ENOMEM; 824 err = -ENOMEM;
628 goto out; 825 goto out;
629 } 826 }
630 backref_node_init(upper);
631 upper->bytenr = key.offset; 827 upper->bytenr = key.offset;
632 upper->owner = 0;
633 upper->level = cur->level + 1; 828 upper->level = cur->level + 1;
634 /* 829 /*
635 * backrefs for the upper level block isn't 830 * backrefs for the upper level block isn't
@@ -639,11 +834,12 @@ again:
639 } else { 834 } else {
640 upper = rb_entry(rb_node, struct backref_node, 835 upper = rb_entry(rb_node, struct backref_node,
641 rb_node); 836 rb_node);
837 BUG_ON(!upper->checked);
642 INIT_LIST_HEAD(&edge->list[UPPER]); 838 INIT_LIST_HEAD(&edge->list[UPPER]);
643 } 839 }
644 list_add(&edge->list[LOWER], &cur->upper); 840 list_add_tail(&edge->list[LOWER], &cur->upper);
645 edge->node[UPPER] = upper;
646 edge->node[LOWER] = cur; 841 edge->node[LOWER] = cur;
842 edge->node[UPPER] = upper;
647 843
648 goto next; 844 goto next;
649 } else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) { 845 } else if (key.type != BTRFS_TREE_BLOCK_REF_KEY) {
@@ -657,11 +853,17 @@ again:
657 goto out; 853 goto out;
658 } 854 }
659 855
856 if (!root->ref_cows)
857 cur->cowonly = 1;
858
660 if (btrfs_root_level(&root->root_item) == cur->level) { 859 if (btrfs_root_level(&root->root_item) == cur->level) {
661 /* tree root */ 860 /* tree root */
662 BUG_ON(btrfs_root_bytenr(&root->root_item) != 861 BUG_ON(btrfs_root_bytenr(&root->root_item) !=
663 cur->bytenr); 862 cur->bytenr);
664 cur->root = root; 863 if (should_ignore_root(root))
864 list_add(&cur->list, &useless);
865 else
866 cur->root = root;
665 break; 867 break;
666 } 868 }
667 869
@@ -692,11 +894,14 @@ again:
692 if (!path2->nodes[level]) { 894 if (!path2->nodes[level]) {
693 BUG_ON(btrfs_root_bytenr(&root->root_item) != 895 BUG_ON(btrfs_root_bytenr(&root->root_item) !=
694 lower->bytenr); 896 lower->bytenr);
695 lower->root = root; 897 if (should_ignore_root(root))
898 list_add(&lower->list, &useless);
899 else
900 lower->root = root;
696 break; 901 break;
697 } 902 }
698 903
699 edge = kzalloc(sizeof(*edge), GFP_NOFS); 904 edge = alloc_backref_edge(cache);
700 if (!edge) { 905 if (!edge) {
701 err = -ENOMEM; 906 err = -ENOMEM;
702 goto out; 907 goto out;
@@ -705,16 +910,17 @@ again:
705 eb = path2->nodes[level]; 910 eb = path2->nodes[level];
706 rb_node = tree_search(&cache->rb_root, eb->start); 911 rb_node = tree_search(&cache->rb_root, eb->start);
707 if (!rb_node) { 912 if (!rb_node) {
708 upper = kmalloc(sizeof(*upper), GFP_NOFS); 913 upper = alloc_backref_node(cache);
709 if (!upper) { 914 if (!upper) {
710 kfree(edge); 915 free_backref_edge(cache, edge);
711 err = -ENOMEM; 916 err = -ENOMEM;
712 goto out; 917 goto out;
713 } 918 }
714 backref_node_init(upper);
715 upper->bytenr = eb->start; 919 upper->bytenr = eb->start;
716 upper->owner = btrfs_header_owner(eb); 920 upper->owner = btrfs_header_owner(eb);
717 upper->level = lower->level + 1; 921 upper->level = lower->level + 1;
922 if (!root->ref_cows)
923 upper->cowonly = 1;
718 924
719 /* 925 /*
720 * if we know the block isn't shared 926 * if we know the block isn't shared
@@ -744,10 +950,12 @@ again:
744 rb_node); 950 rb_node);
745 BUG_ON(!upper->checked); 951 BUG_ON(!upper->checked);
746 INIT_LIST_HEAD(&edge->list[UPPER]); 952 INIT_LIST_HEAD(&edge->list[UPPER]);
953 if (!upper->owner)
954 upper->owner = btrfs_header_owner(eb);
747 } 955 }
748 list_add_tail(&edge->list[LOWER], &lower->upper); 956 list_add_tail(&edge->list[LOWER], &lower->upper);
749 edge->node[UPPER] = upper;
750 edge->node[LOWER] = lower; 957 edge->node[LOWER] = lower;
958 edge->node[UPPER] = upper;
751 959
752 if (rb_node) 960 if (rb_node)
753 break; 961 break;
@@ -785,8 +993,13 @@ next:
785 * into the cache. 993 * into the cache.
786 */ 994 */
787 BUG_ON(!node->checked); 995 BUG_ON(!node->checked);
788 rb_node = tree_insert(&cache->rb_root, node->bytenr, &node->rb_node); 996 cowonly = node->cowonly;
789 BUG_ON(rb_node); 997 if (!cowonly) {
998 rb_node = tree_insert(&cache->rb_root, node->bytenr,
999 &node->rb_node);
1000 BUG_ON(rb_node);
1001 list_add_tail(&node->lower, &cache->leaves);
1002 }
790 1003
791 list_for_each_entry(edge, &node->upper, list[LOWER]) 1004 list_for_each_entry(edge, &node->upper, list[LOWER])
792 list_add_tail(&edge->list[UPPER], &list); 1005 list_add_tail(&edge->list[UPPER], &list);
@@ -795,6 +1008,14 @@ next:
795 edge = list_entry(list.next, struct backref_edge, list[UPPER]); 1008 edge = list_entry(list.next, struct backref_edge, list[UPPER]);
796 list_del_init(&edge->list[UPPER]); 1009 list_del_init(&edge->list[UPPER]);
797 upper = edge->node[UPPER]; 1010 upper = edge->node[UPPER];
1011 if (upper->detached) {
1012 list_del(&edge->list[LOWER]);
1013 lower = edge->node[LOWER];
1014 free_backref_edge(cache, edge);
1015 if (list_empty(&lower->upper))
1016 list_add(&lower->list, &useless);
1017 continue;
1018 }
798 1019
799 if (!RB_EMPTY_NODE(&upper->rb_node)) { 1020 if (!RB_EMPTY_NODE(&upper->rb_node)) {
800 if (upper->lowest) { 1021 if (upper->lowest) {
@@ -807,25 +1028,69 @@ next:
807 } 1028 }
808 1029
809 BUG_ON(!upper->checked); 1030 BUG_ON(!upper->checked);
810 rb_node = tree_insert(&cache->rb_root, upper->bytenr, 1031 BUG_ON(cowonly != upper->cowonly);
811 &upper->rb_node); 1032 if (!cowonly) {
812 BUG_ON(rb_node); 1033 rb_node = tree_insert(&cache->rb_root, upper->bytenr,
1034 &upper->rb_node);
1035 BUG_ON(rb_node);
1036 }
813 1037
814 list_add_tail(&edge->list[UPPER], &upper->lower); 1038 list_add_tail(&edge->list[UPPER], &upper->lower);
815 1039
816 list_for_each_entry(edge, &upper->upper, list[LOWER]) 1040 list_for_each_entry(edge, &upper->upper, list[LOWER])
817 list_add_tail(&edge->list[UPPER], &list); 1041 list_add_tail(&edge->list[UPPER], &list);
818 } 1042 }
1043 /*
1044 * process useless backref nodes. backref nodes for tree leaves
1045 * are deleted from the cache. backref nodes for upper level
1046 * tree blocks are left in the cache to avoid unnecessary backref
1047 * lookup.
1048 */
1049 while (!list_empty(&useless)) {
1050 upper = list_entry(useless.next, struct backref_node, list);
1051 list_del_init(&upper->list);
1052 BUG_ON(!list_empty(&upper->upper));
1053 if (upper == node)
1054 node = NULL;
1055 if (upper->lowest) {
1056 list_del_init(&upper->lower);
1057 upper->lowest = 0;
1058 }
1059 while (!list_empty(&upper->lower)) {
1060 edge = list_entry(upper->lower.next,
1061 struct backref_edge, list[UPPER]);
1062 list_del(&edge->list[UPPER]);
1063 list_del(&edge->list[LOWER]);
1064 lower = edge->node[LOWER];
1065 free_backref_edge(cache, edge);
1066
1067 if (list_empty(&lower->upper))
1068 list_add(&lower->list, &useless);
1069 }
1070 __mark_block_processed(rc, upper);
1071 if (upper->level > 0) {
1072 list_add(&upper->list, &cache->detached);
1073 upper->detached = 1;
1074 } else {
1075 rb_erase(&upper->rb_node, &cache->rb_root);
1076 free_backref_node(cache, upper);
1077 }
1078 }
819out: 1079out:
820 btrfs_free_path(path1); 1080 btrfs_free_path(path1);
821 btrfs_free_path(path2); 1081 btrfs_free_path(path2);
822 if (err) { 1082 if (err) {
823 INIT_LIST_HEAD(&list); 1083 while (!list_empty(&useless)) {
1084 lower = list_entry(useless.next,
1085 struct backref_node, upper);
1086 list_del_init(&lower->upper);
1087 }
824 upper = node; 1088 upper = node;
1089 INIT_LIST_HEAD(&list);
825 while (upper) { 1090 while (upper) {
826 if (RB_EMPTY_NODE(&upper->rb_node)) { 1091 if (RB_EMPTY_NODE(&upper->rb_node)) {
827 list_splice_tail(&upper->upper, &list); 1092 list_splice_tail(&upper->upper, &list);
828 kfree(upper); 1093 free_backref_node(cache, upper);
829 } 1094 }
830 1095
831 if (list_empty(&list)) 1096 if (list_empty(&list))
@@ -833,15 +1098,104 @@ out:
833 1098
834 edge = list_entry(list.next, struct backref_edge, 1099 edge = list_entry(list.next, struct backref_edge,
835 list[LOWER]); 1100 list[LOWER]);
1101 list_del(&edge->list[LOWER]);
836 upper = edge->node[UPPER]; 1102 upper = edge->node[UPPER];
837 kfree(edge); 1103 free_backref_edge(cache, edge);
838 } 1104 }
839 return ERR_PTR(err); 1105 return ERR_PTR(err);
840 } 1106 }
1107 BUG_ON(node && node->detached);
841 return node; 1108 return node;
842} 1109}
843 1110
844/* 1111/*
1112 * helper to add a backref node for the newly created snapshot.
1113 * the backref node is created by cloning the backref node that
1114 * corresponds to the root of the source tree
1115 */
1116static int clone_backref_node(struct btrfs_trans_handle *trans,
1117 struct reloc_control *rc,
1118 struct btrfs_root *src,
1119 struct btrfs_root *dest)
1120{
1121 struct btrfs_root *reloc_root = src->reloc_root;
1122 struct backref_cache *cache = &rc->backref_cache;
1123 struct backref_node *node = NULL;
1124 struct backref_node *new_node;
1125 struct backref_edge *edge;
1126 struct backref_edge *new_edge;
1127 struct rb_node *rb_node;
1128
1129 if (cache->last_trans > 0)
1130 update_backref_cache(trans, cache);
1131
1132 rb_node = tree_search(&cache->rb_root, src->commit_root->start);
1133 if (rb_node) {
1134 node = rb_entry(rb_node, struct backref_node, rb_node);
1135 if (node->detached)
1136 node = NULL;
1137 else
1138 BUG_ON(node->new_bytenr != reloc_root->node->start);
1139 }
1140
1141 if (!node) {
1142 rb_node = tree_search(&cache->rb_root,
1143 reloc_root->commit_root->start);
1144 if (rb_node) {
1145 node = rb_entry(rb_node, struct backref_node,
1146 rb_node);
1147 BUG_ON(node->detached);
1148 }
1149 }
1150
1151 if (!node)
1152 return 0;
1153
1154 new_node = alloc_backref_node(cache);
1155 if (!new_node)
1156 return -ENOMEM;
1157
1158 new_node->bytenr = dest->node->start;
1159 new_node->level = node->level;
1160 new_node->lowest = node->lowest;
1161 new_node->root = dest;
1162
1163 if (!node->lowest) {
1164 list_for_each_entry(edge, &node->lower, list[UPPER]) {
1165 new_edge = alloc_backref_edge(cache);
1166 if (!new_edge)
1167 goto fail;
1168
1169 new_edge->node[UPPER] = new_node;
1170 new_edge->node[LOWER] = edge->node[LOWER];
1171 list_add_tail(&new_edge->list[UPPER],
1172 &new_node->lower);
1173 }
1174 }
1175
1176 rb_node = tree_insert(&cache->rb_root, new_node->bytenr,
1177 &new_node->rb_node);
1178 BUG_ON(rb_node);
1179
1180 if (!new_node->lowest) {
1181 list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) {
1182 list_add_tail(&new_edge->list[LOWER],
1183 &new_edge->node[LOWER]->upper);
1184 }
1185 }
1186 return 0;
1187fail:
1188 while (!list_empty(&new_node->lower)) {
1189 new_edge = list_entry(new_node->lower.next,
1190 struct backref_edge, list[UPPER]);
1191 list_del(&new_edge->list[UPPER]);
1192 free_backref_edge(cache, new_edge);
1193 }
1194 free_backref_node(cache, new_node);
1195 return -ENOMEM;
1196}
1197
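To see why clone_backref_node() links list[UPPER] while building the new node but defers list[LOWER] until after the rb-tree insert succeeds, it helps to remember that each backref_edge lives on two lists at once. A minimal user-space sketch of that double linkage follows; the list helpers and struct node/struct edge are simplified stand-ins for the kernel's list.h and btrfs types, not the real API.

#include <stdio.h>
#include <stdlib.h>

struct list_head { struct list_head *next, *prev; };

static void list_init(struct list_head *h) { h->next = h->prev = h; }
static void list_add_tail(struct list_head *e, struct list_head *h)
{
	e->prev = h->prev; e->next = h;
	h->prev->next = e; h->prev = e;
}

enum { LOWER = 0, UPPER = 1 };

struct node {
	int level;
	struct list_head upper;		/* edges to parents */
	struct list_head lower;		/* edges to children */
};

struct edge {
	struct node *node[2];
	struct list_head list[2];	/* list[UPPER] sits on upper->lower,
					   list[LOWER] on lower->upper */
};

static void link_edge(struct edge *e, struct node *up, struct node *down)
{
	e->node[UPPER] = up;
	e->node[LOWER] = down;
	list_add_tail(&e->list[UPPER], &up->lower);
	list_add_tail(&e->list[LOWER], &down->upper);
}

int main(void)
{
	struct node root = { .level = 1 }, leaf = { .level = 0 };
	struct edge e;

	list_init(&root.upper); list_init(&root.lower);
	list_init(&leaf.upper); list_init(&leaf.lower);

	link_edge(&e, &root, &leaf);
	printf("leaf has parent edge: %d\n", leaf.upper.next == &e.list[LOWER]);
	return 0;
}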
1198/*
845 * helper to add 'address of tree root -> reloc tree' mapping 1199 * helper to add 'address of tree root -> reloc tree' mapping
846 */ 1200 */
847static int __add_reloc_root(struct btrfs_root *root) 1201static int __add_reloc_root(struct btrfs_root *root)
@@ -901,12 +1255,8 @@ static int __update_reloc_root(struct btrfs_root *root, int del)
901 return 0; 1255 return 0;
902} 1256}
903 1257
904/* 1258static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
905 * create reloc tree for a given fs tree. reloc tree is just a 1259 struct btrfs_root *root, u64 objectid)
906 * snapshot of the fs tree with special root objectid.
907 */
908int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
909 struct btrfs_root *root)
910{ 1260{
911 struct btrfs_root *reloc_root; 1261 struct btrfs_root *reloc_root;
912 struct extent_buffer *eb; 1262 struct extent_buffer *eb;
@@ -914,36 +1264,45 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
914 struct btrfs_key root_key; 1264 struct btrfs_key root_key;
915 int ret; 1265 int ret;
916 1266
917 if (root->reloc_root) {
918 reloc_root = root->reloc_root;
919 reloc_root->last_trans = trans->transid;
920 return 0;
921 }
922
923 if (!root->fs_info->reloc_ctl ||
924 !root->fs_info->reloc_ctl->create_reloc_root ||
925 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
926 return 0;
927
928 root_item = kmalloc(sizeof(*root_item), GFP_NOFS); 1267 root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
929 BUG_ON(!root_item); 1268 BUG_ON(!root_item);
930 1269
931 root_key.objectid = BTRFS_TREE_RELOC_OBJECTID; 1270 root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
932 root_key.type = BTRFS_ROOT_ITEM_KEY; 1271 root_key.type = BTRFS_ROOT_ITEM_KEY;
933 root_key.offset = root->root_key.objectid; 1272 root_key.offset = objectid;
934 1273
935 ret = btrfs_copy_root(trans, root, root->commit_root, &eb, 1274 if (root->root_key.objectid == objectid) {
936 BTRFS_TREE_RELOC_OBJECTID); 1275 /* called by btrfs_init_reloc_root */
937 BUG_ON(ret); 1276 ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
1277 BTRFS_TREE_RELOC_OBJECTID);
1278 BUG_ON(ret);
1279
1280 btrfs_set_root_last_snapshot(&root->root_item,
1281 trans->transid - 1);
1282 } else {
1283 /*
1284 * called by btrfs_reloc_post_snapshot_hook.
1285 * the source tree is a reloc tree; all tree blocks
1286 * modified after it was created have the RELOC flag
1287 * set in their headers, so it's OK not to update
1288 * the 'last_snapshot'.
1289 */
1290 ret = btrfs_copy_root(trans, root, root->node, &eb,
1291 BTRFS_TREE_RELOC_OBJECTID);
1292 BUG_ON(ret);
1293 }
938 1294
939 btrfs_set_root_last_snapshot(&root->root_item, trans->transid - 1);
940 memcpy(root_item, &root->root_item, sizeof(*root_item)); 1295 memcpy(root_item, &root->root_item, sizeof(*root_item));
941 btrfs_set_root_refs(root_item, 1);
942 btrfs_set_root_bytenr(root_item, eb->start); 1296 btrfs_set_root_bytenr(root_item, eb->start);
943 btrfs_set_root_level(root_item, btrfs_header_level(eb)); 1297 btrfs_set_root_level(root_item, btrfs_header_level(eb));
944 btrfs_set_root_generation(root_item, trans->transid); 1298 btrfs_set_root_generation(root_item, trans->transid);
945 memset(&root_item->drop_progress, 0, sizeof(struct btrfs_disk_key)); 1299
946 root_item->drop_level = 0; 1300 if (root->root_key.objectid == objectid) {
1301 btrfs_set_root_refs(root_item, 0);
1302 memset(&root_item->drop_progress, 0,
1303 sizeof(struct btrfs_disk_key));
1304 root_item->drop_level = 0;
1305 }
947 1306
948 btrfs_tree_unlock(eb); 1307 btrfs_tree_unlock(eb);
949 free_extent_buffer(eb); 1308 free_extent_buffer(eb);
@@ -957,6 +1316,37 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
957 &root_key); 1316 &root_key);
958 BUG_ON(IS_ERR(reloc_root)); 1317 BUG_ON(IS_ERR(reloc_root));
959 reloc_root->last_trans = trans->transid; 1318 reloc_root->last_trans = trans->transid;
1319 return reloc_root;
1320}
1321
1322/*
1323 * create reloc tree for a given fs tree. a reloc tree is just a
1324 * snapshot of the fs tree with a special root objectid.
1325 */
1326int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
1327 struct btrfs_root *root)
1328{
1329 struct btrfs_root *reloc_root;
1330 struct reloc_control *rc = root->fs_info->reloc_ctl;
1331 int clear_rsv = 0;
1332
1333 if (root->reloc_root) {
1334 reloc_root = root->reloc_root;
1335 reloc_root->last_trans = trans->transid;
1336 return 0;
1337 }
1338
1339 if (!rc || !rc->create_reloc_tree ||
1340 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1341 return 0;
1342
1343 if (!trans->block_rsv) {
1344 trans->block_rsv = rc->block_rsv;
1345 clear_rsv = 1;
1346 }
1347 reloc_root = create_reloc_root(trans, root, root->root_key.objectid);
1348 if (clear_rsv)
1349 trans->block_rsv = NULL;
960 1350
961 __add_reloc_root(reloc_root); 1351 __add_reloc_root(reloc_root);
962 root->reloc_root = reloc_root; 1352 root->reloc_root = reloc_root;
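The clear_rsv flag above lets btrfs_init_reloc_root() borrow rc->block_rsv only when the caller attached no reservation to the handle, and restores the handle before returning. A user-space sketch of that borrow-and-restore shape; the struct names are hypothetical stand-ins, not the btrfs API.

#include <stdio.h>

struct rsv { const char *name; };
struct handle { struct rsv *block_rsv; };

static void create_snapshot(struct handle *trans, struct rsv *rc_rsv)
{
	int clear_rsv = 0;

	if (!trans->block_rsv) {		/* borrow only if unset */
		trans->block_rsv = rc_rsv;
		clear_rsv = 1;
	}
	printf("allocating from %s\n", trans->block_rsv->name);
	if (clear_rsv)				/* restore caller's state */
		trans->block_rsv = NULL;
}

int main(void)
{
	struct rsv rc_rsv = { "rc->block_rsv" };
	struct handle trans = { 0 };

	create_snapshot(&trans, &rc_rsv);
	return 0;
}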
@@ -980,7 +1370,8 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
980 reloc_root = root->reloc_root; 1370 reloc_root = root->reloc_root;
981 root_item = &reloc_root->root_item; 1371 root_item = &reloc_root->root_item;
982 1372
983 if (btrfs_root_refs(root_item) == 0) { 1373 if (root->fs_info->reloc_ctl->merge_reloc_tree &&
1374 btrfs_root_refs(root_item) == 0) {
984 root->reloc_root = NULL; 1375 root->reloc_root = NULL;
985 del = 1; 1376 del = 1;
986 } 1377 }
@@ -1102,8 +1493,7 @@ static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
1102 goto out; 1493 goto out;
1103 } 1494 }
1104 1495
1105 if (new_bytenr) 1496 *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1106 *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1107 ret = 0; 1497 ret = 0;
1108out: 1498out:
1109 btrfs_free_path(path); 1499 btrfs_free_path(path);
@@ -1114,19 +1504,18 @@ out:
1114 * update file extent items in the tree leaf to point to 1504 * update file extent items in the tree leaf to point to
1115 * the new locations. 1505 * the new locations.
1116 */ 1506 */
1117static int replace_file_extents(struct btrfs_trans_handle *trans, 1507static noinline_for_stack
1118 struct reloc_control *rc, 1508int replace_file_extents(struct btrfs_trans_handle *trans,
1119 struct btrfs_root *root, 1509 struct reloc_control *rc,
1120 struct extent_buffer *leaf, 1510 struct btrfs_root *root,
1121 struct list_head *inode_list) 1511 struct extent_buffer *leaf)
1122{ 1512{
1123 struct btrfs_key key; 1513 struct btrfs_key key;
1124 struct btrfs_file_extent_item *fi; 1514 struct btrfs_file_extent_item *fi;
1125 struct inode *inode = NULL; 1515 struct inode *inode = NULL;
1126 struct inodevec *ivec = NULL;
1127 u64 parent; 1516 u64 parent;
1128 u64 bytenr; 1517 u64 bytenr;
1129 u64 new_bytenr; 1518 u64 new_bytenr = 0;
1130 u64 num_bytes; 1519 u64 num_bytes;
1131 u64 end; 1520 u64 end;
1132 u32 nritems; 1521 u32 nritems;
@@ -1166,21 +1555,12 @@ static int replace_file_extents(struct btrfs_trans_handle *trans,
1166 * to complete and drop the extent cache 1555 * to complete and drop the extent cache
1167 */ 1556 */
1168 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { 1557 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
1169 if (!ivec || ivec->nr == INODEVEC_SIZE) {
1170 ivec = kmalloc(sizeof(*ivec), GFP_NOFS);
1171 BUG_ON(!ivec);
1172 ivec->nr = 0;
1173 list_add_tail(&ivec->list, inode_list);
1174 }
1175 if (first) { 1558 if (first) {
1176 inode = find_next_inode(root, key.objectid); 1559 inode = find_next_inode(root, key.objectid);
1177 if (inode)
1178 ivec->inode[ivec->nr++] = inode;
1179 first = 0; 1560 first = 0;
1180 } else if (inode && inode->i_ino < key.objectid) { 1561 } else if (inode && inode->i_ino < key.objectid) {
1562 btrfs_add_delayed_iput(inode);
1181 inode = find_next_inode(root, key.objectid); 1563 inode = find_next_inode(root, key.objectid);
1182 if (inode)
1183 ivec->inode[ivec->nr++] = inode;
1184 } 1564 }
1185 if (inode && inode->i_ino == key.objectid) { 1565 if (inode && inode->i_ino == key.objectid) {
1186 end = key.offset + 1566 end = key.offset +
@@ -1204,8 +1584,10 @@ static int replace_file_extents(struct btrfs_trans_handle *trans,
1204 1584
1205 ret = get_new_location(rc->data_inode, &new_bytenr, 1585 ret = get_new_location(rc->data_inode, &new_bytenr,
1206 bytenr, num_bytes); 1586 bytenr, num_bytes);
1207 if (ret > 0) 1587 if (ret > 0) {
1588 WARN_ON(1);
1208 continue; 1589 continue;
1590 }
1209 BUG_ON(ret < 0); 1591 BUG_ON(ret < 0);
1210 1592
1211 btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr); 1593 btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr);
@@ -1225,6 +1607,8 @@ static int replace_file_extents(struct btrfs_trans_handle *trans,
1225 } 1607 }
1226 if (dirty) 1608 if (dirty)
1227 btrfs_mark_buffer_dirty(leaf); 1609 btrfs_mark_buffer_dirty(leaf);
1610 if (inode)
1611 btrfs_add_delayed_iput(inode);
1228 return 0; 1612 return 0;
1229} 1613}
1230 1614
@@ -1248,11 +1632,11 @@ int memcmp_node_keys(struct extent_buffer *eb, int slot,
1248 * if no block got replaced, 0 is returned. if there are other 1632 * if no block got replaced, 0 is returned. if there are other
1249 * errors, a negative error number is returned. 1633 * errors, a negative error number is returned.
1250 */ 1634 */
1251static int replace_path(struct btrfs_trans_handle *trans, 1635static noinline_for_stack
1252 struct btrfs_root *dest, struct btrfs_root *src, 1636int replace_path(struct btrfs_trans_handle *trans,
1253 struct btrfs_path *path, struct btrfs_key *next_key, 1637 struct btrfs_root *dest, struct btrfs_root *src,
1254 struct extent_buffer **leaf, 1638 struct btrfs_path *path, struct btrfs_key *next_key,
1255 int lowest_level, int max_level) 1639 int lowest_level, int max_level)
1256{ 1640{
1257 struct extent_buffer *eb; 1641 struct extent_buffer *eb;
1258 struct extent_buffer *parent; 1642 struct extent_buffer *parent;
@@ -1263,16 +1647,16 @@ static int replace_path(struct btrfs_trans_handle *trans,
1263 u64 new_ptr_gen; 1647 u64 new_ptr_gen;
1264 u64 last_snapshot; 1648 u64 last_snapshot;
1265 u32 blocksize; 1649 u32 blocksize;
1650 int cow = 0;
1266 int level; 1651 int level;
1267 int ret; 1652 int ret;
1268 int slot; 1653 int slot;
1269 1654
1270 BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); 1655 BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
1271 BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID); 1656 BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);
1272 BUG_ON(lowest_level > 1 && leaf);
1273 1657
1274 last_snapshot = btrfs_root_last_snapshot(&src->root_item); 1658 last_snapshot = btrfs_root_last_snapshot(&src->root_item);
1275 1659again:
1276 slot = path->slots[lowest_level]; 1660 slot = path->slots[lowest_level];
1277 btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot); 1661 btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);
1278 1662
@@ -1286,8 +1670,10 @@ static int replace_path(struct btrfs_trans_handle *trans,
1286 return 0; 1670 return 0;
1287 } 1671 }
1288 1672
1289 ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb); 1673 if (cow) {
1290 BUG_ON(ret); 1674 ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb);
1675 BUG_ON(ret);
1676 }
1291 btrfs_set_lock_blocking(eb); 1677 btrfs_set_lock_blocking(eb);
1292 1678
1293 if (next_key) { 1679 if (next_key) {
@@ -1331,7 +1717,7 @@ static int replace_path(struct btrfs_trans_handle *trans,
1331 1717
1332 if (new_bytenr == 0 || old_ptr_gen > last_snapshot || 1718 if (new_bytenr == 0 || old_ptr_gen > last_snapshot ||
1333 memcmp_node_keys(parent, slot, path, level)) { 1719 memcmp_node_keys(parent, slot, path, level)) {
1334 if (level <= lowest_level && !leaf) { 1720 if (level <= lowest_level) {
1335 ret = 0; 1721 ret = 0;
1336 break; 1722 break;
1337 } 1723 }
@@ -1339,16 +1725,12 @@ static int replace_path(struct btrfs_trans_handle *trans,
1339 eb = read_tree_block(dest, old_bytenr, blocksize, 1725 eb = read_tree_block(dest, old_bytenr, blocksize,
1340 old_ptr_gen); 1726 old_ptr_gen);
1341 btrfs_tree_lock(eb); 1727 btrfs_tree_lock(eb);
1342 ret = btrfs_cow_block(trans, dest, eb, parent, 1728 if (cow) {
1343 slot, &eb); 1729 ret = btrfs_cow_block(trans, dest, eb, parent,
1344 BUG_ON(ret); 1730 slot, &eb);
1345 btrfs_set_lock_blocking(eb); 1731 BUG_ON(ret);
1346
1347 if (level <= lowest_level) {
1348 *leaf = eb;
1349 ret = 0;
1350 break;
1351 } 1732 }
1733 btrfs_set_lock_blocking(eb);
1352 1734
1353 btrfs_tree_unlock(parent); 1735 btrfs_tree_unlock(parent);
1354 free_extent_buffer(parent); 1736 free_extent_buffer(parent);
@@ -1357,6 +1739,13 @@ static int replace_path(struct btrfs_trans_handle *trans,
1357 continue; 1739 continue;
1358 } 1740 }
1359 1741
1742 if (!cow) {
1743 btrfs_tree_unlock(parent);
1744 free_extent_buffer(parent);
1745 cow = 1;
1746 goto again;
1747 }
1748
1360 btrfs_node_key_to_cpu(path->nodes[level], &key, 1749 btrfs_node_key_to_cpu(path->nodes[level], &key,
1361 path->slots[level]); 1750 path->slots[level]);
1362 btrfs_release_path(src, path); 1751 btrfs_release_path(src, path);
@@ -1562,20 +1951,6 @@ static int invalidate_extent_cache(struct btrfs_root *root,
1562 return 0; 1951 return 0;
1563} 1952}
1564 1953
1565static void put_inodes(struct list_head *list)
1566{
1567 struct inodevec *ivec;
1568 while (!list_empty(list)) {
1569 ivec = list_entry(list->next, struct inodevec, list);
1570 list_del(&ivec->list);
1571 while (ivec->nr > 0) {
1572 ivec->nr--;
1573 iput(ivec->inode[ivec->nr]);
1574 }
1575 kfree(ivec);
1576 }
1577}
1578
1579static int find_next_key(struct btrfs_path *path, int level, 1954static int find_next_key(struct btrfs_path *path, int level,
1580 struct btrfs_key *key) 1955 struct btrfs_key *key)
1581 1956
@@ -1608,13 +1983,14 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
1608 struct btrfs_root *reloc_root; 1983 struct btrfs_root *reloc_root;
1609 struct btrfs_root_item *root_item; 1984 struct btrfs_root_item *root_item;
1610 struct btrfs_path *path; 1985 struct btrfs_path *path;
1611 struct extent_buffer *leaf = NULL; 1986 struct extent_buffer *leaf;
1612 unsigned long nr; 1987 unsigned long nr;
1613 int level; 1988 int level;
1614 int max_level; 1989 int max_level;
1615 int replaced = 0; 1990 int replaced = 0;
1616 int ret; 1991 int ret;
1617 int err = 0; 1992 int err = 0;
1993 u32 min_reserved;
1618 1994
1619 path = btrfs_alloc_path(); 1995 path = btrfs_alloc_path();
1620 if (!path) 1996 if (!path)
@@ -1648,34 +2024,23 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
1648 btrfs_unlock_up_safe(path, 0); 2024 btrfs_unlock_up_safe(path, 0);
1649 } 2025 }
1650 2026
1651 if (level == 0 && rc->stage == UPDATE_DATA_PTRS) { 2027 min_reserved = root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
1652 trans = btrfs_start_transaction(root, 1); 2028 memset(&next_key, 0, sizeof(next_key));
1653 2029
1654 leaf = path->nodes[0]; 2030 while (1) {
1655 btrfs_item_key_to_cpu(leaf, &key, 0); 2031 trans = btrfs_start_transaction(root, 0);
1656 btrfs_release_path(reloc_root, path); 2032 trans->block_rsv = rc->block_rsv;
1657 2033
1658 ret = btrfs_search_slot(trans, root, &key, path, 0, 1); 2034 ret = btrfs_block_rsv_check(trans, root, rc->block_rsv,
1659 if (ret < 0) { 2035 min_reserved, 0);
1660 err = ret; 2036 if (ret) {
1661 goto out; 2037 BUG_ON(ret != -EAGAIN);
2038 ret = btrfs_commit_transaction(trans, root);
2039 BUG_ON(ret);
2040 continue;
1662 } 2041 }
1663 2042
1664 leaf = path->nodes[0];
1665 btrfs_unlock_up_safe(path, 1);
1666 ret = replace_file_extents(trans, rc, root, leaf,
1667 &inode_list);
1668 if (ret < 0)
1669 err = ret;
1670 goto out;
1671 }
1672
1673 memset(&next_key, 0, sizeof(next_key));
1674
1675 while (1) {
1676 leaf = NULL;
1677 replaced = 0; 2043 replaced = 0;
1678 trans = btrfs_start_transaction(root, 1);
1679 max_level = level; 2044 max_level = level;
1680 2045
1681 ret = walk_down_reloc_tree(reloc_root, path, &level); 2046 ret = walk_down_reloc_tree(reloc_root, path, &level);
@@ -1689,14 +2054,9 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
1689 if (!find_next_key(path, level, &key) && 2054 if (!find_next_key(path, level, &key) &&
1690 btrfs_comp_cpu_keys(&next_key, &key) >= 0) { 2055 btrfs_comp_cpu_keys(&next_key, &key) >= 0) {
1691 ret = 0; 2056 ret = 0;
1692 } else if (level == 1 && rc->stage == UPDATE_DATA_PTRS) {
1693 ret = replace_path(trans, root, reloc_root,
1694 path, &next_key, &leaf,
1695 level, max_level);
1696 } else { 2057 } else {
1697 ret = replace_path(trans, root, reloc_root, 2058 ret = replace_path(trans, root, reloc_root, path,
1698 path, &next_key, NULL, 2059 &next_key, level, max_level);
1699 level, max_level);
1700 } 2060 }
1701 if (ret < 0) { 2061 if (ret < 0) {
1702 err = ret; 2062 err = ret;
@@ -1708,16 +2068,6 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
1708 btrfs_node_key_to_cpu(path->nodes[level], &key, 2068 btrfs_node_key_to_cpu(path->nodes[level], &key,
1709 path->slots[level]); 2069 path->slots[level]);
1710 replaced = 1; 2070 replaced = 1;
1711 } else if (leaf) {
1712 /*
1713 * no block got replaced, try replacing file extents
1714 */
1715 btrfs_item_key_to_cpu(leaf, &key, 0);
1716 ret = replace_file_extents(trans, rc, root, leaf,
1717 &inode_list);
1718 btrfs_tree_unlock(leaf);
1719 free_extent_buffer(leaf);
1720 BUG_ON(ret < 0);
1721 } 2071 }
1722 2072
1723 ret = walk_up_reloc_tree(reloc_root, path, &level); 2073 ret = walk_up_reloc_tree(reloc_root, path, &level);
@@ -1734,15 +2084,10 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
1734 root_item->drop_level = level; 2084 root_item->drop_level = level;
1735 2085
1736 nr = trans->blocks_used; 2086 nr = trans->blocks_used;
1737 btrfs_end_transaction(trans, root); 2087 btrfs_end_transaction_throttle(trans, root);
1738 2088
1739 btrfs_btree_balance_dirty(root, nr); 2089 btrfs_btree_balance_dirty(root, nr);
1740 2090
1741 /*
1742 * put inodes outside transaction, otherwise we may deadlock.
1743 */
1744 put_inodes(&inode_list);
1745
1746 if (replaced && rc->stage == UPDATE_DATA_PTRS) 2091 if (replaced && rc->stage == UPDATE_DATA_PTRS)
1747 invalidate_extent_cache(root, &key, &next_key); 2092 invalidate_extent_cache(root, &key, &next_key);
1748 } 2093 }
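The transaction loop above treats -EAGAIN from btrfs_block_rsv_check() as "commit to free space, then retry" rather than as a failure. A toy model of that retry shape; the space accounting below is invented purely for illustration.

#include <stdio.h>

#define EAGAIN 11

static int space;				/* pretend free metadata space */

static int rsv_check(int need)			/* stand-in for the rsv check */
{
	return space >= need ? 0 : -EAGAIN;
}

static void commit_transaction(void)		/* committing frees space */
{
	space += 4;
}

int main(void)
{
	int min_reserved = 10;

	while (1) {
		/* start a transaction, then make sure the rsv is full */
		if (rsv_check(min_reserved)) {
			commit_transaction();	/* -EAGAIN: commit and retry */
			continue;
		}
		puts("reservation ok, merge one tree path");
		break;
	}
	return 0;
}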
@@ -1765,87 +2110,125 @@ out:
1765 sizeof(root_item->drop_progress)); 2110 sizeof(root_item->drop_progress));
1766 root_item->drop_level = 0; 2111 root_item->drop_level = 0;
1767 btrfs_set_root_refs(root_item, 0); 2112 btrfs_set_root_refs(root_item, 0);
2113 btrfs_update_reloc_root(trans, root);
1768 } 2114 }
1769 2115
1770 nr = trans->blocks_used; 2116 nr = trans->blocks_used;
1771 btrfs_end_transaction(trans, root); 2117 btrfs_end_transaction_throttle(trans, root);
1772 2118
1773 btrfs_btree_balance_dirty(root, nr); 2119 btrfs_btree_balance_dirty(root, nr);
1774 2120
1775 put_inodes(&inode_list);
1776
1777 if (replaced && rc->stage == UPDATE_DATA_PTRS) 2121 if (replaced && rc->stage == UPDATE_DATA_PTRS)
1778 invalidate_extent_cache(root, &key, &next_key); 2122 invalidate_extent_cache(root, &key, &next_key);
1779 2123
1780 return err; 2124 return err;
1781} 2125}
1782 2126
1783/* 2127static noinline_for_stack
1784 * callback for the work threads. 2128int prepare_to_merge(struct reloc_control *rc, int err)
1785 * this function merges reloc tree with corresponding fs tree,
1786 * and then drops the reloc tree.
1787 */
1788static void merge_func(struct btrfs_work *work)
1789{ 2129{
1790 struct btrfs_trans_handle *trans; 2130 struct btrfs_root *root = rc->extent_root;
1791 struct btrfs_root *root;
1792 struct btrfs_root *reloc_root; 2131 struct btrfs_root *reloc_root;
1793 struct async_merge *async; 2132 struct btrfs_trans_handle *trans;
2133 LIST_HEAD(reloc_roots);
2134 u64 num_bytes = 0;
2135 int ret;
2136 int retries = 0;
2137
2138 mutex_lock(&root->fs_info->trans_mutex);
2139 rc->merging_rsv_size += root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
2140 rc->merging_rsv_size += rc->nodes_relocated * 2;
2141 mutex_unlock(&root->fs_info->trans_mutex);
2142again:
2143 if (!err) {
2144 num_bytes = rc->merging_rsv_size;
2145 ret = btrfs_block_rsv_add(NULL, root, rc->block_rsv,
2146 num_bytes, &retries);
2147 if (ret)
2148 err = ret;
2149 }
2150
2151 trans = btrfs_join_transaction(rc->extent_root, 1);
2152
2153 if (!err) {
2154 if (num_bytes != rc->merging_rsv_size) {
2155 btrfs_end_transaction(trans, rc->extent_root);
2156 btrfs_block_rsv_release(rc->extent_root,
2157 rc->block_rsv, num_bytes);
2158 retries = 0;
2159 goto again;
2160 }
2161 }
1794 2162
1795 async = container_of(work, struct async_merge, work); 2163 rc->merge_reloc_tree = 1;
1796 reloc_root = async->root; 2164
2165 while (!list_empty(&rc->reloc_roots)) {
2166 reloc_root = list_entry(rc->reloc_roots.next,
2167 struct btrfs_root, root_list);
2168 list_del_init(&reloc_root->root_list);
1797 2169
1798 if (btrfs_root_refs(&reloc_root->root_item) > 0) {
1799 root = read_fs_root(reloc_root->fs_info, 2170 root = read_fs_root(reloc_root->fs_info,
1800 reloc_root->root_key.offset); 2171 reloc_root->root_key.offset);
1801 BUG_ON(IS_ERR(root)); 2172 BUG_ON(IS_ERR(root));
1802 BUG_ON(root->reloc_root != reloc_root); 2173 BUG_ON(root->reloc_root != reloc_root);
1803 2174
1804 merge_reloc_root(async->rc, root); 2175 /*
1805 2176 * set reference count to 1, so btrfs_recover_relocation
1806 trans = btrfs_start_transaction(root, 1); 2177 * knows it should resume merging
2178 */
2179 if (!err)
2180 btrfs_set_root_refs(&reloc_root->root_item, 1);
1807 btrfs_update_reloc_root(trans, root); 2181 btrfs_update_reloc_root(trans, root);
1808 btrfs_end_transaction(trans, root);
1809 }
1810 2182
1811 btrfs_drop_snapshot(reloc_root, 0); 2183 list_add(&reloc_root->root_list, &reloc_roots);
2184 }
1812 2185
1813 if (atomic_dec_and_test(async->num_pending)) 2186 list_splice(&reloc_roots, &rc->reloc_roots);
1814 complete(async->done);
1815 2187
1816 kfree(async); 2188 if (!err)
2189 btrfs_commit_transaction(trans, rc->extent_root);
2190 else
2191 btrfs_end_transaction(trans, rc->extent_root);
2192 return err;
1817} 2193}
1818 2194
1819static int merge_reloc_roots(struct reloc_control *rc) 2195static noinline_for_stack
2196int merge_reloc_roots(struct reloc_control *rc)
1820{ 2197{
1821 struct async_merge *async;
1822 struct btrfs_root *root; 2198 struct btrfs_root *root;
1823 struct completion done; 2199 struct btrfs_root *reloc_root;
1824 atomic_t num_pending; 2200 LIST_HEAD(reloc_roots);
2201 int found = 0;
2202 int ret;
2203again:
2204 root = rc->extent_root;
2205 mutex_lock(&root->fs_info->trans_mutex);
2206 list_splice_init(&rc->reloc_roots, &reloc_roots);
2207 mutex_unlock(&root->fs_info->trans_mutex);
1825 2208
1826 init_completion(&done); 2209 while (!list_empty(&reloc_roots)) {
1827 atomic_set(&num_pending, 1); 2210 found = 1;
2211 reloc_root = list_entry(reloc_roots.next,
2212 struct btrfs_root, root_list);
1828 2213
1829 while (!list_empty(&rc->reloc_roots)) { 2214 if (btrfs_root_refs(&reloc_root->root_item) > 0) {
1830 root = list_entry(rc->reloc_roots.next, 2215 root = read_fs_root(reloc_root->fs_info,
1831 struct btrfs_root, root_list); 2216 reloc_root->root_key.offset);
1832 list_del_init(&root->root_list); 2217 BUG_ON(IS_ERR(root));
2218 BUG_ON(root->reloc_root != reloc_root);
1833 2219
1834 async = kmalloc(sizeof(*async), GFP_NOFS); 2220 ret = merge_reloc_root(rc, root);
1835 BUG_ON(!async); 2221 BUG_ON(ret);
1836 async->work.func = merge_func; 2222 } else {
1837 async->work.flags = 0; 2223 list_del_init(&reloc_root->root_list);
1838 async->rc = rc; 2224 }
1839 async->root = root; 2225 btrfs_drop_snapshot(reloc_root, rc->block_rsv, 0);
1840 async->done = &done;
1841 async->num_pending = &num_pending;
1842 atomic_inc(&num_pending);
1843 btrfs_queue_worker(&rc->workers, &async->work);
1844 } 2226 }
1845 2227
1846 if (!atomic_dec_and_test(&num_pending)) 2228 if (found) {
1847 wait_for_completion(&done); 2229 found = 0;
1848 2230 goto again;
2231 }
1849 BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root)); 2232 BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
1850 return 0; 2233 return 0;
1851} 2234}
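merge_reloc_roots() drains rc->reloc_roots by splicing the whole list out under trans_mutex, merging each root, then looping in case more roots were queued meanwhile. A compact user-space model of that splice-and-retry drain; the counter stands in for the shared list.

#include <stdio.h>

static int pending = 3;			/* roots queued for merging */

static int splice_init(void)		/* take everything off the shared list */
{
	int n = pending;
	pending = 0;
	return n;
}

int main(void)
{
	int found;
again:
	found = 0;
	for (int n = splice_init(); n > 0; n--) {
		found = 1;
		puts("merge one reloc root");
	}
	if (found)			/* merging may have re-queued roots */
		goto again;
	return 0;
}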
@@ -1876,119 +2259,169 @@ static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
1876 return btrfs_record_root_in_trans(trans, root); 2259 return btrfs_record_root_in_trans(trans, root);
1877} 2260}
1878 2261
1879/* 2262static noinline_for_stack
1880 * select one tree from trees that reference the block. 2263struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
1881 * for blocks in reference counted trees, we prefer the reloc tree. 2264 struct reloc_control *rc,
1882 * if no reloc tree found and reloc_only is true, NULL is returned. 2265 struct backref_node *node,
1883 */ 2266 struct backref_edge *edges[], int *nr)
1884static struct btrfs_root *__select_one_root(struct btrfs_trans_handle *trans,
1885 struct backref_node *node,
1886 struct backref_edge *edges[],
1887 int *nr, int reloc_only)
1888{ 2267{
1889 struct backref_node *next; 2268 struct backref_node *next;
1890 struct btrfs_root *root; 2269 struct btrfs_root *root;
1891 int index; 2270 int index = 0;
1892 int loop = 0; 2271
1893again:
1894 index = 0;
1895 next = node; 2272 next = node;
1896 while (1) { 2273 while (1) {
1897 cond_resched(); 2274 cond_resched();
1898 next = walk_up_backref(next, edges, &index); 2275 next = walk_up_backref(next, edges, &index);
1899 root = next->root; 2276 root = next->root;
1900 if (!root) { 2277 BUG_ON(!root);
1901 BUG_ON(!node->old_root); 2278 BUG_ON(!root->ref_cows);
1902 goto skip;
1903 }
1904
1905 /* no other choice for non-reference counted tree */
1906 if (!root->ref_cows) {
1907 BUG_ON(reloc_only);
1908 break;
1909 }
1910 2279
1911 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { 2280 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
1912 record_reloc_root_in_trans(trans, root); 2281 record_reloc_root_in_trans(trans, root);
1913 break; 2282 break;
1914 } 2283 }
1915 2284
1916 if (loop) { 2285 btrfs_record_root_in_trans(trans, root);
1917 btrfs_record_root_in_trans(trans, root); 2286 root = root->reloc_root;
2287
2288 if (next->new_bytenr != root->node->start) {
2289 BUG_ON(next->new_bytenr);
2290 BUG_ON(!list_empty(&next->list));
2291 next->new_bytenr = root->node->start;
2292 next->root = root;
2293 list_add_tail(&next->list,
2294 &rc->backref_cache.changed);
2295 __mark_block_processed(rc, next);
1918 break; 2296 break;
1919 } 2297 }
1920 2298
1921 if (reloc_only || next != node) { 2299 WARN_ON(1);
1922 if (!root->reloc_root)
1923 btrfs_record_root_in_trans(trans, root);
1924 root = root->reloc_root;
1925 /*
1926 * if the reloc tree was created in current
1927 * transaction, there is no node in the backref tree
1928 * that corresponds to the root of the reloc tree.
1929 */
1930 if (btrfs_root_last_snapshot(&root->root_item) ==
1931 trans->transid - 1)
1932 break;
1933 }
1934skip:
1935 root = NULL; 2300 root = NULL;
1936 next = walk_down_backref(edges, &index); 2301 next = walk_down_backref(edges, &index);
1937 if (!next || next->level <= node->level) 2302 if (!next || next->level <= node->level)
1938 break; 2303 break;
1939 } 2304 }
2305 if (!root)
2306 return NULL;
1940 2307
1941 if (!root && !loop && !reloc_only) { 2308 *nr = index;
1942 loop = 1; 2309 next = node;
1943 goto again; 2310 /* setup backref node path for btrfs_reloc_cow_block */
2311 while (1) {
2312 rc->backref_cache.path[next->level] = next;
2313 if (--index < 0)
2314 break;
2315 next = edges[index]->node[UPPER];
1944 } 2316 }
1945
1946 if (root)
1947 *nr = index;
1948 else
1949 *nr = 0;
1950
1951 return root; 2317 return root;
1952} 2318}
1953 2319
2320/*
2321 * select a tree root for relocation. return NULL if the block
2322 * is reference counted. we should use do_relocation() in this
2323 * case. return a tree root pointer if the block isn't reference
2324 * counted. return -ENOENT if the block is the root of a reloc tree.
2325 */
1954static noinline_for_stack 2326static noinline_for_stack
1955struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans, 2327struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans,
1956 struct backref_node *node) 2328 struct backref_node *node)
1957{ 2329{
2330 struct backref_node *next;
2331 struct btrfs_root *root;
2332 struct btrfs_root *fs_root = NULL;
1958 struct backref_edge *edges[BTRFS_MAX_LEVEL - 1]; 2333 struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
1959 int nr; 2334 int index = 0;
1960 return __select_one_root(trans, node, edges, &nr, 0); 2335
2336 next = node;
2337 while (1) {
2338 cond_resched();
2339 next = walk_up_backref(next, edges, &index);
2340 root = next->root;
2341 BUG_ON(!root);
2342
2343 /* no other choice for non-reference counted tree */
2344 if (!root->ref_cows)
2345 return root;
2346
2347 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID)
2348 fs_root = root;
2349
2350 if (next != node)
2351 return NULL;
2352
2353 next = walk_down_backref(edges, &index);
2354 if (!next || next->level <= node->level)
2355 break;
2356 }
2357
2358 if (!fs_root)
2359 return ERR_PTR(-ENOENT);
2360 return fs_root;
1961} 2361}
1962 2362
1963static noinline_for_stack 2363static noinline_for_stack
1964struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, 2364u64 calcu_metadata_size(struct reloc_control *rc,
1965 struct backref_node *node, 2365 struct backref_node *node, int reserve)
1966 struct backref_edge *edges[], int *nr)
1967{ 2366{
1968 return __select_one_root(trans, node, edges, nr, 1); 2367 struct backref_node *next = node;
2368 struct backref_edge *edge;
2369 struct backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2370 u64 num_bytes = 0;
2371 int index = 0;
2372
2373 BUG_ON(reserve && node->processed);
2374
2375 while (next) {
2376 cond_resched();
2377 while (1) {
2378 if (next->processed && (reserve || next != node))
2379 break;
2380
2381 num_bytes += btrfs_level_size(rc->extent_root,
2382 next->level);
2383
2384 if (list_empty(&next->upper))
2385 break;
2386
2387 edge = list_entry(next->upper.next,
2388 struct backref_edge, list[LOWER]);
2389 edges[index++] = edge;
2390 next = edge->node[UPPER];
2391 }
2392 next = walk_down_backref(edges, &index);
2393 }
2394 return num_bytes;
1969} 2395}
1970 2396
1971static void grab_path_buffers(struct btrfs_path *path, 2397static int reserve_metadata_space(struct btrfs_trans_handle *trans,
1972 struct backref_node *node, 2398 struct reloc_control *rc,
1973 struct backref_edge *edges[], int nr) 2399 struct backref_node *node)
1974{ 2400{
1975 int i = 0; 2401 struct btrfs_root *root = rc->extent_root;
1976 while (1) { 2402 u64 num_bytes;
1977 drop_node_buffer(node); 2403 int ret;
1978 node->eb = path->nodes[node->level]; 2404
1979 BUG_ON(!node->eb); 2405 num_bytes = calcu_metadata_size(rc, node, 1) * 2;
1980 if (path->locks[node->level])
1981 node->locked = 1;
1982 path->nodes[node->level] = NULL;
1983 path->locks[node->level] = 0;
1984
1985 if (i >= nr)
1986 break;
1987 2406
1988 edges[i]->blockptr = node->eb->start; 2407 trans->block_rsv = rc->block_rsv;
1989 node = edges[i]->node[UPPER]; 2408 ret = btrfs_block_rsv_add(trans, root, rc->block_rsv, num_bytes,
1990 i++; 2409 &rc->block_rsv_retries);
2410 if (ret) {
2411 if (ret == -EAGAIN)
2412 rc->commit_transaction = 1;
2413 return ret;
1991 } 2414 }
2415
2416 rc->block_rsv_retries = 0;
2417 return 0;
2418}
2419
2420static void release_metadata_space(struct reloc_control *rc,
2421 struct backref_node *node)
2422{
2423 u64 num_bytes = calcu_metadata_size(rc, node, 0) * 2;
2424 btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, num_bytes);
1992} 2425}
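calcu_metadata_size() sums the block size of every node on the backref path, and reserve_metadata_space() doubles the result because each block may be COWed in both the fs tree and the reloc tree. A back-of-envelope version of that sizing; the 4K nodesize is an assumption for illustration, not read from any filesystem.

#include <stdio.h>

int main(void)
{
	unsigned long nodesize = 4096;	/* assumed metadata block size */
	int path_blocks = 3;		/* blocks on the backref path */
	unsigned long rsv = nodesize * path_blocks * 2;

	printf("reserve %lu bytes\n", rsv);	/* 24576 */
	return 0;
}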
1993 2426
1994/* 2427/*
@@ -1999,6 +2432,7 @@ static void grab_path_buffers(struct btrfs_path *path,
1999 * in that case this function just updates pointers. 2432 * in that case this function just updates pointers.
2000 */ 2433 */
2001static int do_relocation(struct btrfs_trans_handle *trans, 2434static int do_relocation(struct btrfs_trans_handle *trans,
2435 struct reloc_control *rc,
2002 struct backref_node *node, 2436 struct backref_node *node,
2003 struct btrfs_key *key, 2437 struct btrfs_key *key,
2004 struct btrfs_path *path, int lowest) 2438 struct btrfs_path *path, int lowest)
@@ -2019,18 +2453,25 @@ static int do_relocation(struct btrfs_trans_handle *trans,
2019 BUG_ON(lowest && node->eb); 2453 BUG_ON(lowest && node->eb);
2020 2454
2021 path->lowest_level = node->level + 1; 2455 path->lowest_level = node->level + 1;
2456 rc->backref_cache.path[node->level] = node;
2022 list_for_each_entry(edge, &node->upper, list[LOWER]) { 2457 list_for_each_entry(edge, &node->upper, list[LOWER]) {
2023 cond_resched(); 2458 cond_resched();
2024 if (node->eb && node->eb->start == edge->blockptr)
2025 continue;
2026 2459
2027 upper = edge->node[UPPER]; 2460 upper = edge->node[UPPER];
2028 root = select_reloc_root(trans, upper, edges, &nr); 2461 root = select_reloc_root(trans, rc, upper, edges, &nr);
2029 if (!root) 2462 BUG_ON(!root);
2030 continue; 2463
2031 2464 if (upper->eb && !upper->locked) {
2032 if (upper->eb && !upper->locked) 2465 if (!lowest) {
2466 ret = btrfs_bin_search(upper->eb, key,
2467 upper->level, &slot);
2468 BUG_ON(ret);
2469 bytenr = btrfs_node_blockptr(upper->eb, slot);
2470 if (node->eb->start == bytenr)
2471 goto next;
2472 }
2033 drop_node_buffer(upper); 2473 drop_node_buffer(upper);
2474 }
2034 2475
2035 if (!upper->eb) { 2476 if (!upper->eb) {
2036 ret = btrfs_search_slot(trans, root, key, path, 0, 1); 2477 ret = btrfs_search_slot(trans, root, key, path, 0, 1);
@@ -2040,11 +2481,17 @@ static int do_relocation(struct btrfs_trans_handle *trans,
2040 } 2481 }
2041 BUG_ON(ret > 0); 2482 BUG_ON(ret > 0);
2042 2483
2043 slot = path->slots[upper->level]; 2484 if (!upper->eb) {
2485 upper->eb = path->nodes[upper->level];
2486 path->nodes[upper->level] = NULL;
2487 } else {
2488 BUG_ON(upper->eb != path->nodes[upper->level]);
2489 }
2044 2490
2045 btrfs_unlock_up_safe(path, upper->level + 1); 2491 upper->locked = 1;
2046 grab_path_buffers(path, upper, edges, nr); 2492 path->locks[upper->level] = 0;
2047 2493
2494 slot = path->slots[upper->level];
2048 btrfs_release_path(NULL, path); 2495 btrfs_release_path(NULL, path);
2049 } else { 2496 } else {
2050 ret = btrfs_bin_search(upper->eb, key, upper->level, 2497 ret = btrfs_bin_search(upper->eb, key, upper->level,
@@ -2053,14 +2500,11 @@ static int do_relocation(struct btrfs_trans_handle *trans,
2053 } 2500 }
2054 2501
2055 bytenr = btrfs_node_blockptr(upper->eb, slot); 2502 bytenr = btrfs_node_blockptr(upper->eb, slot);
2056 if (!lowest) { 2503 if (lowest) {
2057 if (node->eb->start == bytenr) { 2504 BUG_ON(bytenr != node->bytenr);
2058 btrfs_tree_unlock(upper->eb);
2059 upper->locked = 0;
2060 continue;
2061 }
2062 } else { 2505 } else {
2063 BUG_ON(node->bytenr != bytenr); 2506 if (node->eb->start == bytenr)
2507 goto next;
2064 } 2508 }
2065 2509
2066 blocksize = btrfs_level_size(root, node->level); 2510 blocksize = btrfs_level_size(root, node->level);
@@ -2072,13 +2516,13 @@ static int do_relocation(struct btrfs_trans_handle *trans,
2072 if (!node->eb) { 2516 if (!node->eb) {
2073 ret = btrfs_cow_block(trans, root, eb, upper->eb, 2517 ret = btrfs_cow_block(trans, root, eb, upper->eb,
2074 slot, &eb); 2518 slot, &eb);
2519 btrfs_tree_unlock(eb);
2520 free_extent_buffer(eb);
2075 if (ret < 0) { 2521 if (ret < 0) {
2076 err = ret; 2522 err = ret;
2077 break; 2523 goto next;
2078 } 2524 }
2079 btrfs_set_lock_blocking(eb); 2525 BUG_ON(node->eb != eb);
2080 node->eb = eb;
2081 node->locked = 1;
2082 } else { 2526 } else {
2083 btrfs_set_node_blockptr(upper->eb, slot, 2527 btrfs_set_node_blockptr(upper->eb, slot,
2084 node->eb->start); 2528 node->eb->start);
@@ -2096,67 +2540,80 @@ static int do_relocation(struct btrfs_trans_handle *trans,
2096 ret = btrfs_drop_subtree(trans, root, eb, upper->eb); 2540 ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
2097 BUG_ON(ret); 2541 BUG_ON(ret);
2098 } 2542 }
2099 if (!lowest) { 2543next:
2100 btrfs_tree_unlock(upper->eb); 2544 if (!upper->pending)
2101 upper->locked = 0; 2545 drop_node_buffer(upper);
2102 } 2546 else
2547 unlock_node_buffer(upper);
2548 if (err)
2549 break;
2103 } 2550 }
2551
2552 if (!err && node->pending) {
2553 drop_node_buffer(node);
2554 list_move_tail(&node->list, &rc->backref_cache.changed);
2555 node->pending = 0;
2556 }
2557
2104 path->lowest_level = 0; 2558 path->lowest_level = 0;
2559 BUG_ON(err == -ENOSPC);
2105 return err; 2560 return err;
2106} 2561}
2107 2562
2108static int link_to_upper(struct btrfs_trans_handle *trans, 2563static int link_to_upper(struct btrfs_trans_handle *trans,
2564 struct reloc_control *rc,
2109 struct backref_node *node, 2565 struct backref_node *node,
2110 struct btrfs_path *path) 2566 struct btrfs_path *path)
2111{ 2567{
2112 struct btrfs_key key; 2568 struct btrfs_key key;
2113 if (!node->eb || list_empty(&node->upper))
2114 return 0;
2115 2569
2116 btrfs_node_key_to_cpu(node->eb, &key, 0); 2570 btrfs_node_key_to_cpu(node->eb, &key, 0);
2117 return do_relocation(trans, node, &key, path, 0); 2571 return do_relocation(trans, rc, node, &key, path, 0);
2118} 2572}
2119 2573
2120static int finish_pending_nodes(struct btrfs_trans_handle *trans, 2574static int finish_pending_nodes(struct btrfs_trans_handle *trans,
2121 struct backref_cache *cache, 2575 struct reloc_control *rc,
2122 struct btrfs_path *path) 2576 struct btrfs_path *path, int err)
2123{ 2577{
2578 LIST_HEAD(list);
2579 struct backref_cache *cache = &rc->backref_cache;
2124 struct backref_node *node; 2580 struct backref_node *node;
2125 int level; 2581 int level;
2126 int ret; 2582 int ret;
2127 int err = 0;
2128 2583
2129 for (level = 0; level < BTRFS_MAX_LEVEL; level++) { 2584 for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
2130 while (!list_empty(&cache->pending[level])) { 2585 while (!list_empty(&cache->pending[level])) {
2131 node = list_entry(cache->pending[level].next, 2586 node = list_entry(cache->pending[level].next,
2132 struct backref_node, lower); 2587 struct backref_node, list);
2133 BUG_ON(node->level != level); 2588 list_move_tail(&node->list, &list);
2589 BUG_ON(!node->pending);
2134 2590
2135 ret = link_to_upper(trans, node, path); 2591 if (!err) {
2136 if (ret < 0) 2592 ret = link_to_upper(trans, rc, node, path);
2137 err = ret; 2593 if (ret < 0)
2138 /* 2594 err = ret;
2139 * this remove the node from the pending list and 2595 }
2140 * may add some other nodes to the level + 1
2141 * pending list
2142 */
2143 remove_backref_node(cache, node);
2144 } 2596 }
2597 list_splice_init(&list, &cache->pending[level]);
2145 } 2598 }
2146 BUG_ON(!RB_EMPTY_ROOT(&cache->rb_root));
2147 return err; 2599 return err;
2148} 2600}
2149 2601
2150static void mark_block_processed(struct reloc_control *rc, 2602static void mark_block_processed(struct reloc_control *rc,
2151 struct backref_node *node) 2603 u64 bytenr, u32 blocksize)
2604{
2605 set_extent_bits(&rc->processed_blocks, bytenr, bytenr + blocksize - 1,
2606 EXTENT_DIRTY, GFP_NOFS);
2607}
2608
2609static void __mark_block_processed(struct reloc_control *rc,
2610 struct backref_node *node)
2152{ 2611{
2153 u32 blocksize; 2612 u32 blocksize;
2154 if (node->level == 0 || 2613 if (node->level == 0 ||
2155 in_block_group(node->bytenr, rc->block_group)) { 2614 in_block_group(node->bytenr, rc->block_group)) {
2156 blocksize = btrfs_level_size(rc->extent_root, node->level); 2615 blocksize = btrfs_level_size(rc->extent_root, node->level);
2157 set_extent_bits(&rc->processed_blocks, node->bytenr, 2616 mark_block_processed(rc, node->bytenr, blocksize);
2158 node->bytenr + blocksize - 1, EXTENT_DIRTY,
2159 GFP_NOFS);
2160 } 2617 }
2161 node->processed = 1; 2618 node->processed = 1;
2162} 2619}
@@ -2179,7 +2636,7 @@ static void update_processed_blocks(struct reloc_control *rc,
2179 if (next->processed) 2636 if (next->processed)
2180 break; 2637 break;
2181 2638
2182 mark_block_processed(rc, next); 2639 __mark_block_processed(rc, next);
2183 2640
2184 if (list_empty(&next->upper)) 2641 if (list_empty(&next->upper))
2185 break; 2642 break;
@@ -2202,138 +2659,6 @@ static int tree_block_processed(u64 bytenr, u32 blocksize,
2202 return 0; 2659 return 0;
2203} 2660}
2204 2661
2205/*
2206 * check if there are any file extent pointers in the leaf that point
2207 * to data requiring processing
2208 */
2209static int check_file_extents(struct reloc_control *rc,
2210 u64 bytenr, u32 blocksize, u64 ptr_gen)
2211{
2212 struct btrfs_key found_key;
2213 struct btrfs_file_extent_item *fi;
2214 struct extent_buffer *leaf;
2215 u32 nritems;
2216 int i;
2217 int ret = 0;
2218
2219 leaf = read_tree_block(rc->extent_root, bytenr, blocksize, ptr_gen);
2220
2221 nritems = btrfs_header_nritems(leaf);
2222 for (i = 0; i < nritems; i++) {
2223 cond_resched();
2224 btrfs_item_key_to_cpu(leaf, &found_key, i);
2225 if (found_key.type != BTRFS_EXTENT_DATA_KEY)
2226 continue;
2227 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
2228 if (btrfs_file_extent_type(leaf, fi) ==
2229 BTRFS_FILE_EXTENT_INLINE)
2230 continue;
2231 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
2232 if (bytenr == 0)
2233 continue;
2234 if (in_block_group(bytenr, rc->block_group)) {
2235 ret = 1;
2236 break;
2237 }
2238 }
2239 free_extent_buffer(leaf);
2240 return ret;
2241}
2242
2243/*
2244 * scan child blocks of a given block to find blocks that require processing
2245 */
2246static int add_child_blocks(struct btrfs_trans_handle *trans,
2247 struct reloc_control *rc,
2248 struct backref_node *node,
2249 struct rb_root *blocks)
2250{
2251 struct tree_block *block;
2252 struct rb_node *rb_node;
2253 u64 bytenr;
2254 u64 ptr_gen;
2255 u32 blocksize;
2256 u32 nritems;
2257 int i;
2258 int err = 0;
2259
2260 nritems = btrfs_header_nritems(node->eb);
2261 blocksize = btrfs_level_size(rc->extent_root, node->level - 1);
2262 for (i = 0; i < nritems; i++) {
2263 cond_resched();
2264 bytenr = btrfs_node_blockptr(node->eb, i);
2265 ptr_gen = btrfs_node_ptr_generation(node->eb, i);
2266 if (ptr_gen == trans->transid)
2267 continue;
2268 if (!in_block_group(bytenr, rc->block_group) &&
2269 (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS))
2270 continue;
2271 if (tree_block_processed(bytenr, blocksize, rc))
2272 continue;
2273
2274 readahead_tree_block(rc->extent_root,
2275 bytenr, blocksize, ptr_gen);
2276 }
2277
2278 for (i = 0; i < nritems; i++) {
2279 cond_resched();
2280 bytenr = btrfs_node_blockptr(node->eb, i);
2281 ptr_gen = btrfs_node_ptr_generation(node->eb, i);
2282 if (ptr_gen == trans->transid)
2283 continue;
2284 if (!in_block_group(bytenr, rc->block_group) &&
2285 (node->level > 1 || rc->stage == MOVE_DATA_EXTENTS))
2286 continue;
2287 if (tree_block_processed(bytenr, blocksize, rc))
2288 continue;
2289 if (!in_block_group(bytenr, rc->block_group) &&
2290 !check_file_extents(rc, bytenr, blocksize, ptr_gen))
2291 continue;
2292
2293 block = kmalloc(sizeof(*block), GFP_NOFS);
2294 if (!block) {
2295 err = -ENOMEM;
2296 break;
2297 }
2298 block->bytenr = bytenr;
2299 btrfs_node_key_to_cpu(node->eb, &block->key, i);
2300 block->level = node->level - 1;
2301 block->key_ready = 1;
2302 rb_node = tree_insert(blocks, block->bytenr, &block->rb_node);
2303 BUG_ON(rb_node);
2304 }
2305 if (err)
2306 free_block_list(blocks);
2307 return err;
2308}
2309
2310/*
2311 * find adjacent blocks that require processing
2312 */
2313static noinline_for_stack
2314int add_adjacent_blocks(struct btrfs_trans_handle *trans,
2315 struct reloc_control *rc,
2316 struct backref_cache *cache,
2317 struct rb_root *blocks, int level,
2318 struct backref_node **upper)
2319{
2320 struct backref_node *node;
2321 int ret = 0;
2322
2323 WARN_ON(!list_empty(&cache->pending[level]));
2324
2325 if (list_empty(&cache->pending[level + 1]))
2326 return 1;
2327
2328 node = list_entry(cache->pending[level + 1].next,
2329 struct backref_node, lower);
2330 if (node->eb)
2331 ret = add_child_blocks(trans, rc, node, blocks);
2332
2333 *upper = node;
2334 return ret;
2335}
2336
2337static int get_tree_block_key(struct reloc_control *rc, 2662static int get_tree_block_key(struct reloc_control *rc,
2338 struct tree_block *block) 2663 struct tree_block *block)
2339{ 2664{
@@ -2371,40 +2696,53 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,
2371 struct btrfs_path *path) 2696 struct btrfs_path *path)
2372{ 2697{
2373 struct btrfs_root *root; 2698 struct btrfs_root *root;
2374 int ret; 2699 int release = 0;
2700 int ret = 0;
2375 2701
2702 if (!node)
2703 return 0;
2704
2705 BUG_ON(node->processed);
2376 root = select_one_root(trans, node); 2706 root = select_one_root(trans, node);
2377 if (unlikely(!root)) { 2707 if (root == ERR_PTR(-ENOENT)) {
2378 rc->found_old_snapshot = 1;
2379 update_processed_blocks(rc, node); 2708 update_processed_blocks(rc, node);
2380 return 0; 2709 goto out;
2381 } 2710 }
2382 2711
2383 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { 2712 if (!root || root->ref_cows) {
2384 ret = do_relocation(trans, node, key, path, 1); 2713 ret = reserve_metadata_space(trans, rc, node);
2385 if (ret < 0) 2714 if (ret)
2386 goto out;
2387 if (node->level == 0 && rc->stage == UPDATE_DATA_PTRS) {
2388 ret = replace_file_extents(trans, rc, root,
2389 node->eb, NULL);
2390 if (ret < 0)
2391 goto out;
2392 }
2393 drop_node_buffer(node);
2394 } else if (!root->ref_cows) {
2395 path->lowest_level = node->level;
2396 ret = btrfs_search_slot(trans, root, key, path, 0, 1);
2397 btrfs_release_path(root, path);
2398 if (ret < 0)
2399 goto out; 2715 goto out;
2400 } else if (root != node->root) { 2716 release = 1;
2401 WARN_ON(node->level > 0 || rc->stage != UPDATE_DATA_PTRS);
2402 } 2717 }
2403 2718
2404 update_processed_blocks(rc, node); 2719 if (root) {
2405 ret = 0; 2720 if (root->ref_cows) {
2721 BUG_ON(node->new_bytenr);
2722 BUG_ON(!list_empty(&node->list));
2723 btrfs_record_root_in_trans(trans, root);
2724 root = root->reloc_root;
2725 node->new_bytenr = root->node->start;
2726 node->root = root;
2727 list_add_tail(&node->list, &rc->backref_cache.changed);
2728 } else {
2729 path->lowest_level = node->level;
2730 ret = btrfs_search_slot(trans, root, key, path, 0, 1);
2731 btrfs_release_path(root, path);
2732 if (ret > 0)
2733 ret = 0;
2734 }
2735 if (!ret)
2736 update_processed_blocks(rc, node);
2737 } else {
2738 ret = do_relocation(trans, rc, node, key, path, 1);
2739 }
2406out: 2740out:
2407 drop_node_buffer(node); 2741 if (ret || node->level == 0 || node->cowonly) {
2742 if (release)
2743 release_metadata_space(rc, node);
2744 remove_backref_node(&rc->backref_cache, node);
2745 }
2408 return ret; 2746 return ret;
2409} 2747}
2410 2748
@@ -2415,12 +2753,10 @@ static noinline_for_stack
2415int relocate_tree_blocks(struct btrfs_trans_handle *trans, 2753int relocate_tree_blocks(struct btrfs_trans_handle *trans,
2416 struct reloc_control *rc, struct rb_root *blocks) 2754 struct reloc_control *rc, struct rb_root *blocks)
2417{ 2755{
2418 struct backref_cache *cache;
2419 struct backref_node *node; 2756 struct backref_node *node;
2420 struct btrfs_path *path; 2757 struct btrfs_path *path;
2421 struct tree_block *block; 2758 struct tree_block *block;
2422 struct rb_node *rb_node; 2759 struct rb_node *rb_node;
2423 int level = -1;
2424 int ret; 2760 int ret;
2425 int err = 0; 2761 int err = 0;
2426 2762
@@ -2428,21 +2764,9 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
2428 if (!path) 2764 if (!path)
2429 return -ENOMEM; 2765 return -ENOMEM;
2430 2766
2431 cache = kmalloc(sizeof(*cache), GFP_NOFS);
2432 if (!cache) {
2433 btrfs_free_path(path);
2434 return -ENOMEM;
2435 }
2436
2437 backref_cache_init(cache);
2438
2439 rb_node = rb_first(blocks); 2767 rb_node = rb_first(blocks);
2440 while (rb_node) { 2768 while (rb_node) {
2441 block = rb_entry(rb_node, struct tree_block, rb_node); 2769 block = rb_entry(rb_node, struct tree_block, rb_node);
2442 if (level == -1)
2443 level = block->level;
2444 else
2445 BUG_ON(level != block->level);
2446 if (!block->key_ready) 2770 if (!block->key_ready)
2447 reada_tree_block(rc, block); 2771 reada_tree_block(rc, block);
2448 rb_node = rb_next(rb_node); 2772 rb_node = rb_next(rb_node);
@@ -2460,7 +2784,7 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
2460 while (rb_node) { 2784 while (rb_node) {
2461 block = rb_entry(rb_node, struct tree_block, rb_node); 2785 block = rb_entry(rb_node, struct tree_block, rb_node);
2462 2786
2463 node = build_backref_tree(rc, cache, &block->key, 2787 node = build_backref_tree(rc, &block->key,
2464 block->level, block->bytenr); 2788 block->level, block->bytenr);
2465 if (IS_ERR(node)) { 2789 if (IS_ERR(node)) {
2466 err = PTR_ERR(node); 2790 err = PTR_ERR(node);
@@ -2470,79 +2794,62 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans,
2470 ret = relocate_tree_block(trans, rc, node, &block->key, 2794 ret = relocate_tree_block(trans, rc, node, &block->key,
2471 path); 2795 path);
2472 if (ret < 0) { 2796 if (ret < 0) {
2473 err = ret; 2797 if (ret != -EAGAIN || rb_node == rb_first(blocks))
2798 err = ret;
2474 goto out; 2799 goto out;
2475 } 2800 }
2476 remove_backref_node(cache, node);
2477 rb_node = rb_next(rb_node); 2801 rb_node = rb_next(rb_node);
2478 } 2802 }
2479 2803out:
2480 if (level > 0)
2481 goto out;
2482
2483 free_block_list(blocks); 2804 free_block_list(blocks);
2805 err = finish_pending_nodes(trans, rc, path, err);
2484 2806
2485 /* 2807 btrfs_free_path(path);
2486 * now backrefs of some upper level tree blocks have been cached, 2808 return err;
2487 * try relocating blocks referenced by these upper level blocks. 2809}
2488 */
2489 while (1) {
2490 struct backref_node *upper = NULL;
2491 if (trans->transaction->in_commit ||
2492 trans->transaction->delayed_refs.flushing)
2493 break;
2494 2810
2495 ret = add_adjacent_blocks(trans, rc, cache, blocks, level, 2811static noinline_for_stack
2496 &upper); 2812int prealloc_file_extent_cluster(struct inode *inode,
2497 if (ret < 0) 2813 struct file_extent_cluster *cluster)
2498 err = ret; 2814{
2499 if (ret != 0) 2815 u64 alloc_hint = 0;
2500 break; 2816 u64 start;
2817 u64 end;
2818 u64 offset = BTRFS_I(inode)->index_cnt;
2819 u64 num_bytes;
2820 int nr = 0;
2821 int ret = 0;
2501 2822
2502 rb_node = rb_first(blocks); 2823 BUG_ON(cluster->start != cluster->boundary[0]);
2503 while (rb_node) { 2824 mutex_lock(&inode->i_mutex);
2504 block = rb_entry(rb_node, struct tree_block, rb_node);
2505 if (trans->transaction->in_commit ||
2506 trans->transaction->delayed_refs.flushing)
2507 goto out;
2508 BUG_ON(!block->key_ready);
2509 node = build_backref_tree(rc, cache, &block->key,
2510 level, block->bytenr);
2511 if (IS_ERR(node)) {
2512 err = PTR_ERR(node);
2513 goto out;
2514 }
2515 2825
2516 ret = relocate_tree_block(trans, rc, node, 2826 ret = btrfs_check_data_free_space(inode, cluster->end +
2517 &block->key, path); 2827 1 - cluster->start);
2518 if (ret < 0) { 2828 if (ret)
2519 err = ret; 2829 goto out;
2520 goto out;
2521 }
2522 remove_backref_node(cache, node);
2523 rb_node = rb_next(rb_node);
2524 }
2525 free_block_list(blocks);
2526 2830
2527 if (upper) { 2831 while (nr < cluster->nr) {
2528 ret = link_to_upper(trans, upper, path); 2832 start = cluster->boundary[nr] - offset;
2529 if (ret < 0) { 2833 if (nr + 1 < cluster->nr)
2530 err = ret; 2834 end = cluster->boundary[nr + 1] - 1 - offset;
2531 break; 2835 else
2532 } 2836 end = cluster->end - offset;
2533 remove_backref_node(cache, upper); 2837
2534 } 2838 lock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
2839 num_bytes = end + 1 - start;
2840 ret = btrfs_prealloc_file_range(inode, 0, start,
2841 num_bytes, num_bytes,
2842 end + 1, &alloc_hint);
2843 unlock_extent(&BTRFS_I(inode)->io_tree, start, end, GFP_NOFS);
2844 if (ret)
2845 break;
2846 nr++;
2535 } 2847 }
2848 btrfs_free_reserved_data_space(inode, cluster->end +
2849 1 - cluster->start);
2536out: 2850out:
2537 free_block_list(blocks); 2851 mutex_unlock(&inode->i_mutex);
2538 2852 return ret;
2539 ret = finish_pending_nodes(trans, cache, path);
2540 if (ret < 0)
2541 err = ret;
2542
2543 kfree(cache);
2544 btrfs_free_path(path);
2545 return err;
2546} 2853}
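The boundary arithmetic in prealloc_file_extent_cluster() maps each extent boundary back into file offsets by subtracting the inode's index_cnt. A standalone sketch with made-up cluster numbers shows the ranges it would preallocate.

#include <stdio.h>

int main(void)
{
	unsigned long offset = 1000;	/* inode's index_cnt */
	unsigned long boundary[] = { 1000, 5096 };
	unsigned long cl_end = 9191;	/* cluster->end */
	int nr, n = 2;

	for (nr = 0; nr < n; nr++) {
		unsigned long start = boundary[nr] - offset;
		unsigned long end = (nr + 1 < n) ?
			boundary[nr + 1] - 1 - offset : cl_end - offset;
		printf("prealloc file range [%lu, %lu]\n", start, end);
	}
	return 0;
}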
2547 2854
2548static noinline_for_stack 2855static noinline_for_stack
@@ -2588,7 +2895,6 @@ static int relocate_file_extent_cluster(struct inode *inode,
2588 u64 offset = BTRFS_I(inode)->index_cnt; 2895 u64 offset = BTRFS_I(inode)->index_cnt;
2589 unsigned long index; 2896 unsigned long index;
2590 unsigned long last_index; 2897 unsigned long last_index;
2591 unsigned int dirty_page = 0;
2592 struct page *page; 2898 struct page *page;
2593 struct file_ra_state *ra; 2899 struct file_ra_state *ra;
2594 int nr = 0; 2900 int nr = 0;
@@ -2601,21 +2907,24 @@ static int relocate_file_extent_cluster(struct inode *inode,
2601 if (!ra) 2907 if (!ra)
2602 return -ENOMEM; 2908 return -ENOMEM;
2603 2909
2604 index = (cluster->start - offset) >> PAGE_CACHE_SHIFT; 2910 ret = prealloc_file_extent_cluster(inode, cluster);
2605 last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT; 2911 if (ret)
2912 goto out;
2606 2913
2607 mutex_lock(&inode->i_mutex); 2914 file_ra_state_init(ra, inode->i_mapping);
2608 2915
2609 i_size_write(inode, cluster->end + 1 - offset);
2610 ret = setup_extent_mapping(inode, cluster->start - offset, 2916 ret = setup_extent_mapping(inode, cluster->start - offset,
2611 cluster->end - offset, cluster->start); 2917 cluster->end - offset, cluster->start);
2612 if (ret) 2918 if (ret)
2613 goto out_unlock; 2919 goto out;
2614
2615 file_ra_state_init(ra, inode->i_mapping);
2616 2920
2617 WARN_ON(cluster->start != cluster->boundary[0]); 2921 index = (cluster->start - offset) >> PAGE_CACHE_SHIFT;
2922 last_index = (cluster->end - offset) >> PAGE_CACHE_SHIFT;
2618 while (index <= last_index) { 2923 while (index <= last_index) {
2924 ret = btrfs_delalloc_reserve_metadata(inode, PAGE_CACHE_SIZE);
2925 if (ret)
2926 goto out;
2927
2619 page = find_lock_page(inode->i_mapping, index); 2928 page = find_lock_page(inode->i_mapping, index);
2620 if (!page) { 2929 if (!page) {
2621 page_cache_sync_readahead(inode->i_mapping, 2930 page_cache_sync_readahead(inode->i_mapping,
@@ -2623,8 +2932,10 @@ static int relocate_file_extent_cluster(struct inode *inode,
2623 last_index + 1 - index); 2932 last_index + 1 - index);
2624 page = grab_cache_page(inode->i_mapping, index); 2933 page = grab_cache_page(inode->i_mapping, index);
2625 if (!page) { 2934 if (!page) {
2935 btrfs_delalloc_release_metadata(inode,
2936 PAGE_CACHE_SIZE);
2626 ret = -ENOMEM; 2937 ret = -ENOMEM;
2627 goto out_unlock; 2938 goto out;
2628 } 2939 }
2629 } 2940 }
2630 2941
@@ -2640,8 +2951,10 @@ static int relocate_file_extent_cluster(struct inode *inode,
2640 if (!PageUptodate(page)) { 2951 if (!PageUptodate(page)) {
2641 unlock_page(page); 2952 unlock_page(page);
2642 page_cache_release(page); 2953 page_cache_release(page);
2954 btrfs_delalloc_release_metadata(inode,
2955 PAGE_CACHE_SIZE);
2643 ret = -EIO; 2956 ret = -EIO;
2644 goto out_unlock; 2957 goto out;
2645 } 2958 }
2646 } 2959 }
2647 2960
@@ -2660,10 +2973,9 @@ static int relocate_file_extent_cluster(struct inode *inode,
2660 EXTENT_BOUNDARY, GFP_NOFS); 2973 EXTENT_BOUNDARY, GFP_NOFS);
2661 nr++; 2974 nr++;
2662 } 2975 }
2663 btrfs_set_extent_delalloc(inode, page_start, page_end, NULL);
2664 2976
2977 btrfs_set_extent_delalloc(inode, page_start, page_end, NULL);
2665 set_page_dirty(page); 2978 set_page_dirty(page);
2666 dirty_page++;
2667 2979
2668 unlock_extent(&BTRFS_I(inode)->io_tree, 2980 unlock_extent(&BTRFS_I(inode)->io_tree,
2669 page_start, page_end, GFP_NOFS); 2981 page_start, page_end, GFP_NOFS);
@@ -2671,20 +2983,11 @@ static int relocate_file_extent_cluster(struct inode *inode,
2671 page_cache_release(page); 2983 page_cache_release(page);
2672 2984
2673 index++; 2985 index++;
2674 if (nr < cluster->nr && 2986 balance_dirty_pages_ratelimited(inode->i_mapping);
2675 page_end + 1 + offset == cluster->boundary[nr]) { 2987 btrfs_throttle(BTRFS_I(inode)->root);
2676 balance_dirty_pages_ratelimited_nr(inode->i_mapping,
2677 dirty_page);
2678 dirty_page = 0;
2679 }
2680 }
2681 if (dirty_page) {
2682 balance_dirty_pages_ratelimited_nr(inode->i_mapping,
2683 dirty_page);
2684 } 2988 }
2685 WARN_ON(nr != cluster->nr); 2989 WARN_ON(nr != cluster->nr);
2686out_unlock: 2990out:
2687 mutex_unlock(&inode->i_mutex);
2688 kfree(ra); 2991 kfree(ra);
2689 return ret; 2992 return ret;
2690} 2993}
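
The reworked cluster loop above pairs btrfs_delalloc_reserve_metadata() with a matching release on every early-exit path, instead of batching dirty-page accounting by hand. A minimal stand-alone sketch of that reserve/act/release-on-failure shape, with hypothetical stubs standing in for the btrfs helpers:

#include <errno.h>
#include <stdio.h>

#define PAGE_SIZE 4096UL

/* hypothetical stand-ins for btrfs_delalloc_{reserve,release}_metadata() */
static int reserve_metadata(unsigned long bytes) { (void)bytes; return 0; }
static void release_metadata(unsigned long bytes) { (void)bytes; }
static int process_page(unsigned long index) { return index == 3 ? -EIO : 0; }

static int relocate_pages(unsigned long index, unsigned long last_index)
{
	int ret = 0;

	while (index <= last_index) {
		/* reserve before touching the page, as the patch does */
		ret = reserve_metadata(PAGE_SIZE);
		if (ret)
			break;
		ret = process_page(index);
		if (ret) {
			/* the failed page never went delalloc: give it back */
			release_metadata(PAGE_SIZE);
			break;
		}
		/* on success the reservation stays with the dirty page */
		index++;
	}
	return ret;
}

int main(void)
{
	printf("ret = %d\n", relocate_pages(0, 7)); /* -EIO at page 3 */
	return 0;
}
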
@@ -2870,9 +3173,6 @@ out:
2870static int block_use_full_backref(struct reloc_control *rc, 3173static int block_use_full_backref(struct reloc_control *rc,
2871 struct extent_buffer *eb) 3174 struct extent_buffer *eb)
2872{ 3175{
2873 struct btrfs_path *path;
2874 struct btrfs_extent_item *ei;
2875 struct btrfs_key key;
2876 u64 flags; 3176 u64 flags;
2877 int ret; 3177 int ret;
2878 3178
@@ -2880,28 +3180,14 @@ static int block_use_full_backref(struct reloc_control *rc,
2880 btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV) 3180 btrfs_header_backref_rev(eb) < BTRFS_MIXED_BACKREF_REV)
2881 return 1; 3181 return 1;
2882 3182
2883 path = btrfs_alloc_path(); 3183 ret = btrfs_lookup_extent_info(NULL, rc->extent_root,
2884 BUG_ON(!path); 3184 eb->start, eb->len, NULL, &flags);
2885
2886 key.objectid = eb->start;
2887 key.type = BTRFS_EXTENT_ITEM_KEY;
2888 key.offset = eb->len;
2889
2890 path->search_commit_root = 1;
2891 path->skip_locking = 1;
2892 ret = btrfs_search_slot(NULL, rc->extent_root,
2893 &key, path, 0, 0);
2894 BUG_ON(ret); 3185 BUG_ON(ret);
2895 3186
2896 ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
2897 struct btrfs_extent_item);
2898 flags = btrfs_extent_flags(path->nodes[0], ei);
2899 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
2900 if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) 3187 if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)
2901 ret = 1; 3188 ret = 1;
2902 else 3189 else
2903 ret = 0; 3190 ret = 0;
2904 btrfs_free_path(path);
2905 return ret; 3191 return ret;
2906} 3192}
2907 3193
@@ -3074,22 +3360,10 @@ int add_data_references(struct reloc_control *rc,
3074 struct btrfs_extent_inline_ref *iref; 3360 struct btrfs_extent_inline_ref *iref;
3075 unsigned long ptr; 3361 unsigned long ptr;
3076 unsigned long end; 3362 unsigned long end;
3077 u32 blocksize; 3363 u32 blocksize = btrfs_level_size(rc->extent_root, 0);
3078 int ret; 3364 int ret;
3079 int err = 0; 3365 int err = 0;
3080 3366
3081 ret = get_new_location(rc->data_inode, NULL, extent_key->objectid,
3082 extent_key->offset);
3083 BUG_ON(ret < 0);
3084 if (ret > 0) {
3085 /* the relocated data is fragmented */
3086 rc->extents_skipped++;
3087 btrfs_release_path(rc->extent_root, path);
3088 return 0;
3089 }
3090
3091 blocksize = btrfs_level_size(rc->extent_root, 0);
3092
3093 eb = path->nodes[0]; 3367 eb = path->nodes[0];
3094 ptr = btrfs_item_ptr_offset(eb, path->slots[0]); 3368 ptr = btrfs_item_ptr_offset(eb, path->slots[0]);
3095 end = ptr + btrfs_item_size_nr(eb, path->slots[0]); 3369 end = ptr + btrfs_item_size_nr(eb, path->slots[0]);
@@ -3170,7 +3444,8 @@ int add_data_references(struct reloc_control *rc,
3170 */ 3444 */
3171static noinline_for_stack 3445static noinline_for_stack
3172int find_next_extent(struct btrfs_trans_handle *trans, 3446int find_next_extent(struct btrfs_trans_handle *trans,
3173 struct reloc_control *rc, struct btrfs_path *path) 3447 struct reloc_control *rc, struct btrfs_path *path,
3448 struct btrfs_key *extent_key)
3174{ 3449{
3175 struct btrfs_key key; 3450 struct btrfs_key key;
3176 struct extent_buffer *leaf; 3451 struct extent_buffer *leaf;
@@ -3225,6 +3500,7 @@ next:
3225 rc->search_start = end + 1; 3500 rc->search_start = end + 1;
3226 } else { 3501 } else {
3227 rc->search_start = key.objectid + key.offset; 3502 rc->search_start = key.objectid + key.offset;
3503 memcpy(extent_key, &key, sizeof(key));
3228 return 0; 3504 return 0;
3229 } 3505 }
3230 } 3506 }
@@ -3262,12 +3538,49 @@ static int check_extent_flags(u64 flags)
3262 return 0; 3538 return 0;
3263} 3539}
3264 3540
3541static noinline_for_stack
3542int prepare_to_relocate(struct reloc_control *rc)
3543{
3544 struct btrfs_trans_handle *trans;
3545 int ret;
3546
3547 rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root);
3548 if (!rc->block_rsv)
3549 return -ENOMEM;
3550
3551 /*
3552 * reserve some space for creating reloc trees.
3553 * btrfs_init_reloc_root will use this space when there
3554 * is no reservation in the transaction handle.
3555 */
3556 ret = btrfs_block_rsv_add(NULL, rc->extent_root, rc->block_rsv,
3557 rc->extent_root->nodesize * 256,
3558 &rc->block_rsv_retries);
3559 if (ret)
3560 return ret;
3561
3562 rc->block_rsv->refill_used = 1;
3563 btrfs_add_durable_block_rsv(rc->extent_root->fs_info, rc->block_rsv);
3564
3565 memset(&rc->cluster, 0, sizeof(rc->cluster));
3566 rc->search_start = rc->block_group->key.objectid;
3567 rc->extents_found = 0;
3568 rc->nodes_relocated = 0;
3569 rc->merging_rsv_size = 0;
3570 rc->block_rsv_retries = 0;
3571
3572 rc->create_reloc_tree = 1;
3573 set_reloc_control(rc);
3574
3575 trans = btrfs_join_transaction(rc->extent_root, 1);
3576 btrfs_commit_transaction(trans, rc->extent_root);
3577 return 0;
3578}
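
prepare_to_relocate() sizes its up-front reservation at nodesize * 256 and threads a retry counter through btrfs_block_rsv_add(). A rough, stand-alone sketch of that bounded-retry reservation; rsv_add() below is a made-up stand-in, not the real API:

#include <errno.h>
#include <stdio.h>

/* made-up stand-in: succeeds once some space has been "flushed" */
static int rsv_add(unsigned long long bytes, int *attempts)
{
	(void)bytes;
	return (*attempts)++ < 2 ? -EAGAIN : 0;
}

int main(void)
{
	unsigned long long nodesize = 4096;
	unsigned long long want = nodesize * 256;	/* as in the patch */
	int attempts = 0;
	int ret;

	do {
		ret = rsv_add(want, &attempts);	/* -EAGAIN: flush and retry */
	} while (ret == -EAGAIN);

	printf("reserved %llu bytes, %d attempts, ret=%d\n",
	       want, attempts, ret);
	return ret;
}
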
3265 3579
3266static noinline_for_stack int relocate_block_group(struct reloc_control *rc) 3580static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3267{ 3581{
3268 struct rb_root blocks = RB_ROOT; 3582 struct rb_root blocks = RB_ROOT;
3269 struct btrfs_key key; 3583 struct btrfs_key key;
3270 struct file_extent_cluster *cluster;
3271 struct btrfs_trans_handle *trans = NULL; 3584 struct btrfs_trans_handle *trans = NULL;
3272 struct btrfs_path *path; 3585 struct btrfs_path *path;
3273 struct btrfs_extent_item *ei; 3586 struct btrfs_extent_item *ei;
@@ -3277,33 +3590,25 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3277 int ret; 3590 int ret;
3278 int err = 0; 3591 int err = 0;
3279 3592
3280 cluster = kzalloc(sizeof(*cluster), GFP_NOFS);
3281 if (!cluster)
3282 return -ENOMEM;
3283
3284 path = btrfs_alloc_path(); 3593 path = btrfs_alloc_path();
3285 if (!path) { 3594 if (!path)
3286 kfree(cluster);
3287 return -ENOMEM; 3595 return -ENOMEM;
3288 }
3289
3290 rc->extents_found = 0;
3291 rc->extents_skipped = 0;
3292
3293 rc->search_start = rc->block_group->key.objectid;
3294 clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
3295 GFP_NOFS);
3296
3297 rc->create_reloc_root = 1;
3298 set_reloc_control(rc);
3299 3596
3300 trans = btrfs_start_transaction(rc->extent_root, 1); 3597 ret = prepare_to_relocate(rc);
3301 btrfs_commit_transaction(trans, rc->extent_root); 3598 if (ret) {
3599 err = ret;
3600 goto out_free;
3601 }
3302 3602
3303 while (1) { 3603 while (1) {
3304 trans = btrfs_start_transaction(rc->extent_root, 1); 3604 trans = btrfs_start_transaction(rc->extent_root, 0);
3605
3606 if (update_backref_cache(trans, &rc->backref_cache)) {
3607 btrfs_end_transaction(trans, rc->extent_root);
3608 continue;
3609 }
3305 3610
3306 ret = find_next_extent(trans, rc, path); 3611 ret = find_next_extent(trans, rc, path, &key);
3307 if (ret < 0) 3612 if (ret < 0)
3308 err = ret; 3613 err = ret;
3309 if (ret != 0) 3614 if (ret != 0)
@@ -3313,9 +3618,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3313 3618
3314 ei = btrfs_item_ptr(path->nodes[0], path->slots[0], 3619 ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
3315 struct btrfs_extent_item); 3620 struct btrfs_extent_item);
3316 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); 3621 item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
3317 item_size = btrfs_item_size_nr(path->nodes[0],
3318 path->slots[0]);
3319 if (item_size >= sizeof(*ei)) { 3622 if (item_size >= sizeof(*ei)) {
3320 flags = btrfs_extent_flags(path->nodes[0], ei); 3623 flags = btrfs_extent_flags(path->nodes[0], ei);
3321 ret = check_extent_flags(flags); 3624 ret = check_extent_flags(flags);
@@ -3356,73 +3659,100 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3356 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { 3659 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
3357 ret = add_tree_block(rc, &key, path, &blocks); 3660 ret = add_tree_block(rc, &key, path, &blocks);
3358 } else if (rc->stage == UPDATE_DATA_PTRS && 3661 } else if (rc->stage == UPDATE_DATA_PTRS &&
3359 (flags & BTRFS_EXTENT_FLAG_DATA)) { 3662 (flags & BTRFS_EXTENT_FLAG_DATA)) {
3360 ret = add_data_references(rc, &key, path, &blocks); 3663 ret = add_data_references(rc, &key, path, &blocks);
3361 } else { 3664 } else {
3362 btrfs_release_path(rc->extent_root, path); 3665 btrfs_release_path(rc->extent_root, path);
3363 ret = 0; 3666 ret = 0;
3364 } 3667 }
3365 if (ret < 0) { 3668 if (ret < 0) {
3366 err = 0; 3669 err = ret;
3367 break; 3670 break;
3368 } 3671 }
3369 3672
3370 if (!RB_EMPTY_ROOT(&blocks)) { 3673 if (!RB_EMPTY_ROOT(&blocks)) {
3371 ret = relocate_tree_blocks(trans, rc, &blocks); 3674 ret = relocate_tree_blocks(trans, rc, &blocks);
3372 if (ret < 0) { 3675 if (ret < 0) {
3676 if (ret != -EAGAIN) {
3677 err = ret;
3678 break;
3679 }
3680 rc->extents_found--;
3681 rc->search_start = key.objectid;
3682 }
3683 }
3684
3685 ret = btrfs_block_rsv_check(trans, rc->extent_root,
3686 rc->block_rsv, 0, 5);
3687 if (ret < 0) {
3688 if (ret != -EAGAIN) {
3373 err = ret; 3689 err = ret;
3690 WARN_ON(1);
3374 break; 3691 break;
3375 } 3692 }
3693 rc->commit_transaction = 1;
3376 } 3694 }
3377 3695
3378 nr = trans->blocks_used; 3696 if (rc->commit_transaction) {
3379 btrfs_end_transaction(trans, rc->extent_root); 3697 rc->commit_transaction = 0;
3698 ret = btrfs_commit_transaction(trans, rc->extent_root);
3699 BUG_ON(ret);
3700 } else {
3701 nr = trans->blocks_used;
3702 btrfs_end_transaction_throttle(trans, rc->extent_root);
3703 btrfs_btree_balance_dirty(rc->extent_root, nr);
3704 }
3380 trans = NULL; 3705 trans = NULL;
3381 btrfs_btree_balance_dirty(rc->extent_root, nr);
3382 3706
3383 if (rc->stage == MOVE_DATA_EXTENTS && 3707 if (rc->stage == MOVE_DATA_EXTENTS &&
3384 (flags & BTRFS_EXTENT_FLAG_DATA)) { 3708 (flags & BTRFS_EXTENT_FLAG_DATA)) {
3385 rc->found_file_extent = 1; 3709 rc->found_file_extent = 1;
3386 ret = relocate_data_extent(rc->data_inode, 3710 ret = relocate_data_extent(rc->data_inode,
3387 &key, cluster); 3711 &key, &rc->cluster);
3388 if (ret < 0) { 3712 if (ret < 0) {
3389 err = ret; 3713 err = ret;
3390 break; 3714 break;
3391 } 3715 }
3392 } 3716 }
3393 } 3717 }
3394 btrfs_free_path(path); 3718
3719 btrfs_release_path(rc->extent_root, path);
3720 clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
3721 GFP_NOFS);
3395 3722
3396 if (trans) { 3723 if (trans) {
3397 nr = trans->blocks_used; 3724 nr = trans->blocks_used;
3398 btrfs_end_transaction(trans, rc->extent_root); 3725 btrfs_end_transaction_throttle(trans, rc->extent_root);
3399 btrfs_btree_balance_dirty(rc->extent_root, nr); 3726 btrfs_btree_balance_dirty(rc->extent_root, nr);
3400 } 3727 }
3401 3728
3402 if (!err) { 3729 if (!err) {
3403 ret = relocate_file_extent_cluster(rc->data_inode, cluster); 3730 ret = relocate_file_extent_cluster(rc->data_inode,
3731 &rc->cluster);
3404 if (ret < 0) 3732 if (ret < 0)
3405 err = ret; 3733 err = ret;
3406 } 3734 }
3407 3735
3408 kfree(cluster); 3736 rc->create_reloc_tree = 0;
3737 set_reloc_control(rc);
3409 3738
3410 rc->create_reloc_root = 0; 3739 backref_cache_cleanup(&rc->backref_cache);
3411 smp_mb(); 3740 btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1);
3412 3741
3413 if (rc->extents_found > 0) { 3742 err = prepare_to_merge(rc, err);
3414 trans = btrfs_start_transaction(rc->extent_root, 1);
3415 btrfs_commit_transaction(trans, rc->extent_root);
3416 }
3417 3743
3418 merge_reloc_roots(rc); 3744 merge_reloc_roots(rc);
3419 3745
3746 rc->merge_reloc_tree = 0;
3420 unset_reloc_control(rc); 3747 unset_reloc_control(rc);
3748 btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1);
3421 3749
3422 /* get rid of pinned extents */ 3750 /* get rid of pinned extents */
3423 trans = btrfs_start_transaction(rc->extent_root, 1); 3751 trans = btrfs_join_transaction(rc->extent_root, 1);
3424 btrfs_commit_transaction(trans, rc->extent_root); 3752 btrfs_commit_transaction(trans, rc->extent_root);
3425 3753out_free:
3754 btrfs_free_block_rsv(rc->extent_root, rc->block_rsv);
3755 btrfs_free_path(path);
3426 return err; 3756 return err;
3427} 3757}
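
Two -EAGAIN sources get distinct treatment in the loop above: a failed relocate_tree_blocks() rewinds the search to retry the same extent, while reservation pressure from btrfs_block_rsv_check() merely flags a commit for the next iteration. A compact control-flow sketch with hypothetical stubs:

#include <errno.h>
#include <stdio.h>

static int relocate_one(int i)  { return i == 2 ? -EAGAIN : 0; }
static int rsv_check(int i)     { return i == 4 ? -EAGAIN : 0; }

int main(void)
{
	int commit_wanted = 0;
	int found = 0;
	int i, ret;

	for (i = 0; i < 6; i++) {
		found++;
		ret = relocate_one(i);
		if (ret == -EAGAIN) {
			found--;		/* retry this extent later */
			printf("rewind at %d\n", i);
		} else if (ret < 0) {
			return ret;		/* fatal: bail out */
		}

		if (rsv_check(i) == -EAGAIN)
			commit_wanted = 1;	/* commit instead of throttling */

		if (commit_wanted) {
			commit_wanted = 0;
			printf("commit after %d\n", i);
		}
	}
	printf("found=%d\n", found);
	return 0;
}
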
3428 3758
@@ -3448,7 +3778,8 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
3448 btrfs_set_inode_generation(leaf, item, 1); 3778 btrfs_set_inode_generation(leaf, item, 1);
3449 btrfs_set_inode_size(leaf, item, 0); 3779 btrfs_set_inode_size(leaf, item, 0);
3450 btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); 3780 btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
3451 btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS); 3781 btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS |
3782 BTRFS_INODE_PREALLOC);
3452 btrfs_mark_buffer_dirty(leaf); 3783 btrfs_mark_buffer_dirty(leaf);
3453 btrfs_release_path(root, path); 3784 btrfs_release_path(root, path);
3454out: 3785out:
@@ -3460,8 +3791,9 @@ out:
3460 * helper to create inode for data relocation. 3791 * helper to create inode for data relocation.
3461 * the inode is in data relocation tree and its link count is 0 3792 * the inode is in data relocation tree and its link count is 0
3462 */ 3793 */
3463static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, 3794static noinline_for_stack
3464 struct btrfs_block_group_cache *group) 3795struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
3796 struct btrfs_block_group_cache *group)
3465{ 3797{
3466 struct inode *inode = NULL; 3798 struct inode *inode = NULL;
3467 struct btrfs_trans_handle *trans; 3799 struct btrfs_trans_handle *trans;
@@ -3475,8 +3807,9 @@ static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
3475 if (IS_ERR(root)) 3807 if (IS_ERR(root))
3476 return ERR_CAST(root); 3808 return ERR_CAST(root);
3477 3809
3478 trans = btrfs_start_transaction(root, 1); 3810 trans = btrfs_start_transaction(root, 6);
3479 BUG_ON(!trans); 3811 if (IS_ERR(trans))
3812 return ERR_CAST(trans);
3480 3813
3481 err = btrfs_find_free_objectid(trans, root, objectid, &objectid); 3814 err = btrfs_find_free_objectid(trans, root, objectid, &objectid);
3482 if (err) 3815 if (err)
@@ -3496,7 +3829,6 @@ static struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
3496out: 3829out:
3497 nr = trans->blocks_used; 3830 nr = trans->blocks_used;
3498 btrfs_end_transaction(trans, root); 3831 btrfs_end_transaction(trans, root);
3499
3500 btrfs_btree_balance_dirty(root, nr); 3832 btrfs_btree_balance_dirty(root, nr);
3501 if (err) { 3833 if (err) {
3502 if (inode) 3834 if (inode)
@@ -3506,6 +3838,21 @@ out:
3506 return inode; 3838 return inode;
3507} 3839}
3508 3840
3841static struct reloc_control *alloc_reloc_control(void)
3842{
3843 struct reloc_control *rc;
3844
3845 rc = kzalloc(sizeof(*rc), GFP_NOFS);
3846 if (!rc)
3847 return NULL;
3848
3849 INIT_LIST_HEAD(&rc->reloc_roots);
3850 backref_cache_init(&rc->backref_cache);
3851 mapping_tree_init(&rc->reloc_root_tree);
3852 extent_io_tree_init(&rc->processed_blocks, NULL, GFP_NOFS);
3853 return rc;
3854}
3855
3509/* 3856/*
3510 * function to relocate all extents in a block group. 3857 * function to relocate all extents in a block group.
3511 */ 3858 */
@@ -3514,24 +3861,26 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3514 struct btrfs_fs_info *fs_info = extent_root->fs_info; 3861 struct btrfs_fs_info *fs_info = extent_root->fs_info;
3515 struct reloc_control *rc; 3862 struct reloc_control *rc;
3516 int ret; 3863 int ret;
3864 int rw = 0;
3517 int err = 0; 3865 int err = 0;
3518 3866
3519 rc = kzalloc(sizeof(*rc), GFP_NOFS); 3867 rc = alloc_reloc_control();
3520 if (!rc) 3868 if (!rc)
3521 return -ENOMEM; 3869 return -ENOMEM;
3522 3870
3523 mapping_tree_init(&rc->reloc_root_tree); 3871 rc->extent_root = extent_root;
3524 extent_io_tree_init(&rc->processed_blocks, NULL, GFP_NOFS);
3525 INIT_LIST_HEAD(&rc->reloc_roots);
3526 3872
3527 rc->block_group = btrfs_lookup_block_group(fs_info, group_start); 3873 rc->block_group = btrfs_lookup_block_group(fs_info, group_start);
3528 BUG_ON(!rc->block_group); 3874 BUG_ON(!rc->block_group);
3529 3875
3530 btrfs_init_workers(&rc->workers, "relocate", 3876 if (!rc->block_group->ro) {
3531 fs_info->thread_pool_size, NULL); 3877 ret = btrfs_set_block_group_ro(extent_root, rc->block_group);
3532 3878 if (ret) {
3533 rc->extent_root = extent_root; 3879 err = ret;
3534 btrfs_prepare_block_group_relocation(extent_root, rc->block_group); 3880 goto out;
3881 }
3882 rw = 1;
3883 }
3535 3884
3536 rc->data_inode = create_reloc_inode(fs_info, rc->block_group); 3885 rc->data_inode = create_reloc_inode(fs_info, rc->block_group);
3537 if (IS_ERR(rc->data_inode)) { 3886 if (IS_ERR(rc->data_inode)) {
@@ -3548,9 +3897,6 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3548 btrfs_wait_ordered_extents(fs_info->tree_root, 0, 0); 3897 btrfs_wait_ordered_extents(fs_info->tree_root, 0, 0);
3549 3898
3550 while (1) { 3899 while (1) {
3551 rc->extents_found = 0;
3552 rc->extents_skipped = 0;
3553
3554 mutex_lock(&fs_info->cleaner_mutex); 3900 mutex_lock(&fs_info->cleaner_mutex);
3555 3901
3556 btrfs_clean_old_snapshots(fs_info->tree_root); 3902 btrfs_clean_old_snapshots(fs_info->tree_root);
@@ -3559,7 +3905,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3559 mutex_unlock(&fs_info->cleaner_mutex); 3905 mutex_unlock(&fs_info->cleaner_mutex);
3560 if (ret < 0) { 3906 if (ret < 0) {
3561 err = ret; 3907 err = ret;
3562 break; 3908 goto out;
3563 } 3909 }
3564 3910
3565 if (rc->extents_found == 0) 3911 if (rc->extents_found == 0)
@@ -3573,18 +3919,6 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3573 invalidate_mapping_pages(rc->data_inode->i_mapping, 3919 invalidate_mapping_pages(rc->data_inode->i_mapping,
3574 0, -1); 3920 0, -1);
3575 rc->stage = UPDATE_DATA_PTRS; 3921 rc->stage = UPDATE_DATA_PTRS;
3576 } else if (rc->stage == UPDATE_DATA_PTRS &&
3577 rc->extents_skipped >= rc->extents_found) {
3578 iput(rc->data_inode);
3579 rc->data_inode = create_reloc_inode(fs_info,
3580 rc->block_group);
3581 if (IS_ERR(rc->data_inode)) {
3582 err = PTR_ERR(rc->data_inode);
3583 rc->data_inode = NULL;
3584 break;
3585 }
3586 rc->stage = MOVE_DATA_EXTENTS;
3587 rc->found_file_extent = 0;
3588 } 3922 }
3589 } 3923 }
3590 3924
@@ -3597,8 +3931,9 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3597 WARN_ON(rc->block_group->reserved > 0); 3931 WARN_ON(rc->block_group->reserved > 0);
3598 WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0); 3932 WARN_ON(btrfs_block_group_used(&rc->block_group->item) > 0);
3599out: 3933out:
3934 if (err && rw)
3935 btrfs_set_block_group_rw(extent_root, rc->block_group);
3600 iput(rc->data_inode); 3936 iput(rc->data_inode);
3601 btrfs_stop_workers(&rc->workers);
3602 btrfs_put_block_group(rc->block_group); 3937 btrfs_put_block_group(rc->block_group);
3603 kfree(rc); 3938 kfree(rc);
3604 return err; 3939 return err;
@@ -3609,7 +3944,7 @@ static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
3609 struct btrfs_trans_handle *trans; 3944 struct btrfs_trans_handle *trans;
3610 int ret; 3945 int ret;
3611 3946
3612 trans = btrfs_start_transaction(root->fs_info->tree_root, 1); 3947 trans = btrfs_start_transaction(root->fs_info->tree_root, 0);
3613 3948
3614 memset(&root->root_item.drop_progress, 0, 3949 memset(&root->root_item.drop_progress, 0,
3615 sizeof(root->root_item.drop_progress)); 3950 sizeof(root->root_item.drop_progress));
@@ -3702,20 +4037,20 @@ int btrfs_recover_relocation(struct btrfs_root *root)
3702 if (list_empty(&reloc_roots)) 4037 if (list_empty(&reloc_roots))
3703 goto out; 4038 goto out;
3704 4039
3705 rc = kzalloc(sizeof(*rc), GFP_NOFS); 4040 rc = alloc_reloc_control();
3706 if (!rc) { 4041 if (!rc) {
3707 err = -ENOMEM; 4042 err = -ENOMEM;
3708 goto out; 4043 goto out;
3709 } 4044 }
3710 4045
3711 mapping_tree_init(&rc->reloc_root_tree);
3712 INIT_LIST_HEAD(&rc->reloc_roots);
3713 btrfs_init_workers(&rc->workers, "relocate",
3714 root->fs_info->thread_pool_size, NULL);
3715 rc->extent_root = root->fs_info->extent_root; 4046 rc->extent_root = root->fs_info->extent_root;
3716 4047
3717 set_reloc_control(rc); 4048 set_reloc_control(rc);
3718 4049
4050 trans = btrfs_join_transaction(rc->extent_root, 1);
4051
4052 rc->merge_reloc_tree = 1;
4053
3719 while (!list_empty(&reloc_roots)) { 4054 while (!list_empty(&reloc_roots)) {
3720 reloc_root = list_entry(reloc_roots.next, 4055 reloc_root = list_entry(reloc_roots.next,
3721 struct btrfs_root, root_list); 4056 struct btrfs_root, root_list);
@@ -3735,20 +4070,16 @@ int btrfs_recover_relocation(struct btrfs_root *root)
3735 fs_root->reloc_root = reloc_root; 4070 fs_root->reloc_root = reloc_root;
3736 } 4071 }
3737 4072
3738 trans = btrfs_start_transaction(rc->extent_root, 1);
3739 btrfs_commit_transaction(trans, rc->extent_root); 4073 btrfs_commit_transaction(trans, rc->extent_root);
3740 4074
3741 merge_reloc_roots(rc); 4075 merge_reloc_roots(rc);
3742 4076
3743 unset_reloc_control(rc); 4077 unset_reloc_control(rc);
3744 4078
3745 trans = btrfs_start_transaction(rc->extent_root, 1); 4079 trans = btrfs_join_transaction(rc->extent_root, 1);
3746 btrfs_commit_transaction(trans, rc->extent_root); 4080 btrfs_commit_transaction(trans, rc->extent_root);
3747out: 4081out:
3748 if (rc) { 4082 kfree(rc);
3749 btrfs_stop_workers(&rc->workers);
3750 kfree(rc);
3751 }
3752 while (!list_empty(&reloc_roots)) { 4083 while (!list_empty(&reloc_roots)) {
3753 reloc_root = list_entry(reloc_roots.next, 4084 reloc_root = list_entry(reloc_roots.next,
3754 struct btrfs_root, root_list); 4085 struct btrfs_root, root_list);
@@ -3814,3 +4145,130 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
3814 btrfs_put_ordered_extent(ordered); 4145 btrfs_put_ordered_extent(ordered);
3815 return 0; 4146 return 0;
3816} 4147}
4148
4149void btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
4150 struct btrfs_root *root, struct extent_buffer *buf,
4151 struct extent_buffer *cow)
4152{
4153 struct reloc_control *rc;
4154 struct backref_node *node;
4155 int first_cow = 0;
4156 int level;
4157 int ret;
4158
4159 rc = root->fs_info->reloc_ctl;
4160 if (!rc)
4161 return;
4162
4163 BUG_ON(rc->stage == UPDATE_DATA_PTRS &&
4164 root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID);
4165
4166 level = btrfs_header_level(buf);
4167 if (btrfs_header_generation(buf) <=
4168 btrfs_root_last_snapshot(&root->root_item))
4169 first_cow = 1;
4170
4171 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID &&
4172 rc->create_reloc_tree) {
4173 WARN_ON(!first_cow && level == 0);
4174
4175 node = rc->backref_cache.path[level];
4176 BUG_ON(node->bytenr != buf->start &&
4177 node->new_bytenr != buf->start);
4178
4179 drop_node_buffer(node);
4180 extent_buffer_get(cow);
4181 node->eb = cow;
4182 node->new_bytenr = cow->start;
4183
4184 if (!node->pending) {
4185 list_move_tail(&node->list,
4186 &rc->backref_cache.pending[level]);
4187 node->pending = 1;
4188 }
4189
4190 if (first_cow)
4191 __mark_block_processed(rc, node);
4192
4193 if (first_cow && level > 0)
4194 rc->nodes_relocated += buf->len;
4195 }
4196
4197 if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS) {
4198 ret = replace_file_extents(trans, rc, root, cow);
4199 BUG_ON(ret);
4200 }
4201}
4202
4203/*
4204 * called before creating snapshot. it calculates metadata reservation
4205 * required for relocating tree blocks in the snapshot
4206 */
4207void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans,
4208 struct btrfs_pending_snapshot *pending,
4209 u64 *bytes_to_reserve)
4210{
4211 struct btrfs_root *root;
4212 struct reloc_control *rc;
4213
4214 root = pending->root;
4215 if (!root->reloc_root)
4216 return;
4217
4218 rc = root->fs_info->reloc_ctl;
4219 if (!rc->merge_reloc_tree)
4220 return;
4221
4222 root = root->reloc_root;
4223 BUG_ON(btrfs_root_refs(&root->root_item) == 0);
4224 /*
4225 * relocation is in the stage of merging trees. the space
4226 * used by merging a reloc tree is twice the size of
4227 * relocated tree nodes in the worst case. half for cowing
4228 * the reloc tree, half for cowing the fs tree. the space
4229 * used by cowing the reloc tree will be freed after the
4230 * tree is dropped. if we create snapshot, cowing the fs
4231 * tree may use more space than it frees. so we need
4232 * to reserve extra space.
4233 */
4234 *bytes_to_reserve += rc->nodes_relocated;
4235}
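
Concretely, the comment's worst case works out to exactly rc->nodes_relocated extra bytes: merging costs about twice the relocated node bytes, the reloc-tree half comes back when that tree is dropped, and a snapshot can consume the fs-tree half without freeing it. A worked one-liner (the sizes are made up):

#include <stdio.h>

int main(void)
{
	unsigned long long nodes_relocated = 8ULL << 20;	/* say, 8 MiB */
	unsigned long long merge_cost  = 2 * nodes_relocated;	/* both halves */
	unsigned long long freed_later = nodes_relocated;	/* reloc-tree half */
	unsigned long long extra = merge_cost - freed_later;	/* fs-tree half */

	/* matches: *bytes_to_reserve += rc->nodes_relocated; */
	printf("extra reservation: %llu bytes\n", extra);
	return 0;
}
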
4236
4237/*
4238 * called after snapshot is created. migrate block reservation
4239 * and create reloc root for the newly created snapshot
4240 */
4241void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
4242 struct btrfs_pending_snapshot *pending)
4243{
4244 struct btrfs_root *root = pending->root;
4245 struct btrfs_root *reloc_root;
4246 struct btrfs_root *new_root;
4247 struct reloc_control *rc;
4248 int ret;
4249
4250 if (!root->reloc_root)
4251 return;
4252
4253 rc = root->fs_info->reloc_ctl;
4254 rc->merging_rsv_size += rc->nodes_relocated;
4255
4256 if (rc->merge_reloc_tree) {
4257 ret = btrfs_block_rsv_migrate(&pending->block_rsv,
4258 rc->block_rsv,
4259 rc->nodes_relocated);
4260 BUG_ON(ret);
4261 }
4262
4263 new_root = pending->snap;
4264 reloc_root = create_reloc_root(trans, root->reloc_root,
4265 new_root->root_key.objectid);
4266
4267 __add_reloc_root(reloc_root);
4268 new_root->reloc_root = reloc_root;
4269
4270 if (rc->create_reloc_tree) {
4271 ret = clone_backref_node(trans, rc, root, reloc_root);
4272 BUG_ON(ret);
4273 }
4274}
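
btrfs_reloc_post_snapshot() then moves those same bytes from the snapshot's reservation into the relocation reservation. A toy sketch under the simplifying assumption that a reservation is a bare byte counter (the real btrfs_block_rsv carries much more state):

#include <errno.h>
#include <stdio.h>

struct rsv { unsigned long long reserved; };

static int rsv_migrate(struct rsv *src, struct rsv *dst,
		       unsigned long long bytes)
{
	if (src->reserved < bytes)
		return -ENOSPC;		/* nothing silently partial */
	src->reserved -= bytes;
	dst->reserved += bytes;
	return 0;
}

int main(void)
{
	struct rsv pending = { .reserved = 16 << 20 };
	struct rsv reloc   = { .reserved = 0 };
	unsigned long long nodes_relocated = 8 << 20;

	int ret = rsv_migrate(&pending, &reloc, nodes_relocated);
	printf("ret=%d pending=%llu reloc=%llu\n",
	       ret, pending.reserved, reloc.reserved);
	return 0;
}
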
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 67fa2d29d663..2d958be761c8 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -259,6 +259,8 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
259 struct extent_buffer *leaf; 259 struct extent_buffer *leaf;
260 struct btrfs_path *path; 260 struct btrfs_path *path;
261 struct btrfs_key key; 261 struct btrfs_key key;
262 struct btrfs_key root_key;
263 struct btrfs_root *root;
262 int err = 0; 264 int err = 0;
263 int ret; 265 int ret;
264 266
@@ -270,6 +272,9 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
270 key.type = BTRFS_ORPHAN_ITEM_KEY; 272 key.type = BTRFS_ORPHAN_ITEM_KEY;
271 key.offset = 0; 273 key.offset = 0;
272 274
275 root_key.type = BTRFS_ROOT_ITEM_KEY;
276 root_key.offset = (u64)-1;
277
273 while (1) { 278 while (1) {
274 ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0); 279 ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0);
275 if (ret < 0) { 280 if (ret < 0) {
@@ -294,13 +299,25 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
294 key.type != BTRFS_ORPHAN_ITEM_KEY) 299 key.type != BTRFS_ORPHAN_ITEM_KEY)
295 break; 300 break;
296 301
297 ret = btrfs_find_dead_roots(tree_root, key.offset); 302 root_key.objectid = key.offset;
298 if (ret) { 303 key.offset++;
304
305 root = btrfs_read_fs_root_no_name(tree_root->fs_info,
306 &root_key);
307 if (!IS_ERR(root))
308 continue;
309
310 ret = PTR_ERR(root);
311 if (ret != -ENOENT) {
299 err = ret; 312 err = ret;
300 break; 313 break;
301 } 314 }
302 315
303 key.offset++; 316 ret = btrfs_find_dead_roots(tree_root, root_key.objectid);
317 if (ret) {
318 err = ret;
319 break;
320 }
304 } 321 }
305 322
306 btrfs_free_path(path); 323 btrfs_free_path(path);
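
The rewritten loop above no longer assumes every orphan item names a dead root: it first tries to read the root, skips roots that still exist, treats -ENOENT as genuinely dead, and propagates any other error. A stand-alone sketch of that classification, with read_root() as a hypothetical stand-in:

#include <errno.h>
#include <stdio.h>

/* hypothetical stand-in: returns 0 if the root still exists */
static int read_root(unsigned long long id)
{
	if (id == 7)
		return -ENOENT;		/* genuinely dead */
	if (id == 9)
		return -EIO;		/* real failure */
	return 0;			/* alive: orphan item is stale */
}

int main(void)
{
	unsigned long long ids[] = { 5, 7, 9 };
	int err = 0;

	for (int i = 0; i < 3; i++) {
		int ret = read_root(ids[i]);
		if (ret == 0)
			continue;		/* still referenced, skip */
		if (ret != -ENOENT) {
			err = ret;		/* unexpected: stop */
			break;
		}
		printf("cleaning dead root %llu\n", ids[i]);
	}
	return err;
}
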
@@ -313,7 +330,6 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
313{ 330{
314 struct btrfs_path *path; 331 struct btrfs_path *path;
315 int ret; 332 int ret;
316 u32 refs;
317 struct btrfs_root_item *ri; 333 struct btrfs_root_item *ri;
318 struct extent_buffer *leaf; 334 struct extent_buffer *leaf;
319 335
@@ -327,8 +343,6 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
327 leaf = path->nodes[0]; 343 leaf = path->nodes[0];
328 ri = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_item); 344 ri = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_item);
329 345
330 refs = btrfs_disk_root_refs(leaf, ri);
331 BUG_ON(refs != 0);
332 ret = btrfs_del_item(trans, root, path); 346 ret = btrfs_del_item(trans, root, path);
333out: 347out:
334 btrfs_free_path(path); 348 btrfs_free_path(path);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 2909a03e5230..f2393b390318 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -360,6 +360,8 @@ static struct dentry *get_default_root(struct super_block *sb,
360 */ 360 */
361 dir_id = btrfs_super_root_dir(&root->fs_info->super_copy); 361 dir_id = btrfs_super_root_dir(&root->fs_info->super_copy);
362 di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0); 362 di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0);
363 if (IS_ERR(di))
364 return ERR_CAST(di);
363 if (!di) { 365 if (!di) {
364 /* 366 /*
365 * Ok the default dir item isn't there. This is weird since 367 * Ok the default dir item isn't there. This is weird since
@@ -390,8 +392,8 @@ setup_root:
390 location.offset = 0; 392 location.offset = 0;
391 393
392 inode = btrfs_iget(sb, &location, new_root, &new); 394 inode = btrfs_iget(sb, &location, new_root, &new);
393 if (!inode) 395 if (IS_ERR(inode))
394 return ERR_PTR(-ENOMEM); 396 return ERR_CAST(inode);
395 397
396 /* 398 /*
397 * If we're just mounting the root most subvol put the inode and return 399 * If we're just mounting the root most subvol put the inode and return
@@ -498,7 +500,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
498 btrfs_start_delalloc_inodes(root, 0); 500 btrfs_start_delalloc_inodes(root, 0);
499 btrfs_wait_ordered_extents(root, 0, 0); 501 btrfs_wait_ordered_extents(root, 0, 0);
500 502
501 trans = btrfs_start_transaction(root, 1); 503 trans = btrfs_start_transaction(root, 0);
502 ret = btrfs_commit_transaction(trans, root); 504 ret = btrfs_commit_transaction(trans, root);
503 return ret; 505 return ret;
504} 506}
@@ -694,11 +696,11 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
694 if (btrfs_super_log_root(&root->fs_info->super_copy) != 0) 696 if (btrfs_super_log_root(&root->fs_info->super_copy) != 0)
695 return -EINVAL; 697 return -EINVAL;
696 698
697 /* recover relocation */ 699 ret = btrfs_cleanup_fs_roots(root->fs_info);
698 ret = btrfs_recover_relocation(root);
699 WARN_ON(ret); 700 WARN_ON(ret);
700 701
701 ret = btrfs_cleanup_fs_roots(root->fs_info); 702 /* recover relocation */
703 ret = btrfs_recover_relocation(root);
702 WARN_ON(ret); 704 WARN_ON(ret);
703 705
704 sb->s_flags &= ~MS_RDONLY; 706 sb->s_flags &= ~MS_RDONLY;
@@ -714,34 +716,18 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
714 struct list_head *head = &root->fs_info->space_info; 716 struct list_head *head = &root->fs_info->space_info;
715 struct btrfs_space_info *found; 717 struct btrfs_space_info *found;
716 u64 total_used = 0; 718 u64 total_used = 0;
717 u64 data_used = 0;
718 int bits = dentry->d_sb->s_blocksize_bits; 719 int bits = dentry->d_sb->s_blocksize_bits;
719 __be32 *fsid = (__be32 *)root->fs_info->fsid; 720 __be32 *fsid = (__be32 *)root->fs_info->fsid;
720 721
721 rcu_read_lock(); 722 rcu_read_lock();
722 list_for_each_entry_rcu(found, head, list) { 723 list_for_each_entry_rcu(found, head, list)
723 if (found->flags & (BTRFS_BLOCK_GROUP_DUP| 724 total_used += found->disk_used;
724 BTRFS_BLOCK_GROUP_RAID10|
725 BTRFS_BLOCK_GROUP_RAID1)) {
726 total_used += found->bytes_used;
727 if (found->flags & BTRFS_BLOCK_GROUP_DATA)
728 data_used += found->bytes_used;
729 else
730 data_used += found->total_bytes;
731 }
732
733 total_used += found->bytes_used;
734 if (found->flags & BTRFS_BLOCK_GROUP_DATA)
735 data_used += found->bytes_used;
736 else
737 data_used += found->total_bytes;
738 }
739 rcu_read_unlock(); 725 rcu_read_unlock();
740 726
741 buf->f_namelen = BTRFS_NAME_LEN; 727 buf->f_namelen = BTRFS_NAME_LEN;
742 buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits; 728 buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits;
743 buf->f_bfree = buf->f_blocks - (total_used >> bits); 729 buf->f_bfree = buf->f_blocks - (total_used >> bits);
744 buf->f_bavail = buf->f_blocks - (data_used >> bits); 730 buf->f_bavail = buf->f_bfree;
745 buf->f_bsize = dentry->d_sb->s_blocksize; 731 buf->f_bsize = dentry->d_sb->s_blocksize;
746 buf->f_type = BTRFS_SUPER_MAGIC; 732 buf->f_type = BTRFS_SUPER_MAGIC;
747 733
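
With per-profile accounting folded into each space_info's disk_used, statfs reduces to a single sum, and free and available become the same number. A minimal model of the arithmetic (device size and usage figures are invented):

#include <stdio.h>

int main(void)
{
	unsigned long long total_bytes = 100ULL << 30;	/* 100 GiB device */
	unsigned long long disk_used[] = { 30ULL << 30, 2ULL << 30 };
	int bits = 12;					/* 4 KiB blocks */

	unsigned long long total_used = 0;
	for (int i = 0; i < 2; i++)
		total_used += disk_used[i];

	unsigned long long f_blocks = total_bytes >> bits;
	unsigned long long f_bfree  = f_blocks - (total_used >> bits);
	unsigned long long f_bavail = f_bfree;	/* as in the patch */

	printf("blocks=%llu bfree=%llu bavail=%llu\n",
	       f_blocks, f_bfree, f_bavail);
	return 0;
}
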
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 2cb116099b90..66e4c66cc63b 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -165,54 +165,89 @@ enum btrfs_trans_type {
165 TRANS_USERSPACE, 165 TRANS_USERSPACE,
166}; 166};
167 167
168static int may_wait_transaction(struct btrfs_root *root, int type)
169{
170 if (!root->fs_info->log_root_recovering &&
171 ((type == TRANS_START && !root->fs_info->open_ioctl_trans) ||
172 type == TRANS_USERSPACE))
173 return 1;
174 return 0;
175}
176
168static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, 177static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
169 int num_blocks, int type) 178 u64 num_items, int type)
170{ 179{
171 struct btrfs_trans_handle *h = 180 struct btrfs_trans_handle *h;
172 kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); 181 struct btrfs_transaction *cur_trans;
182 int retries = 0;
173 int ret; 183 int ret;
184again:
185 h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
186 if (!h)
187 return ERR_PTR(-ENOMEM);
174 188
175 mutex_lock(&root->fs_info->trans_mutex); 189 mutex_lock(&root->fs_info->trans_mutex);
176 if (!root->fs_info->log_root_recovering && 190 if (may_wait_transaction(root, type))
177 ((type == TRANS_START && !root->fs_info->open_ioctl_trans) ||
178 type == TRANS_USERSPACE))
179 wait_current_trans(root); 191 wait_current_trans(root);
192
180 ret = join_transaction(root); 193 ret = join_transaction(root);
181 BUG_ON(ret); 194 BUG_ON(ret);
182 195
183 h->transid = root->fs_info->running_transaction->transid; 196 cur_trans = root->fs_info->running_transaction;
184 h->transaction = root->fs_info->running_transaction; 197 cur_trans->use_count++;
185 h->blocks_reserved = num_blocks; 198 mutex_unlock(&root->fs_info->trans_mutex);
199
200 h->transid = cur_trans->transid;
201 h->transaction = cur_trans;
186 h->blocks_used = 0; 202 h->blocks_used = 0;
187 h->block_group = 0; 203 h->block_group = 0;
188 h->alloc_exclude_nr = 0; 204 h->bytes_reserved = 0;
189 h->alloc_exclude_start = 0;
190 h->delayed_ref_updates = 0; 205 h->delayed_ref_updates = 0;
206 h->block_rsv = NULL;
191 207
192 if (!current->journal_info && type != TRANS_USERSPACE) 208 smp_mb();
193 current->journal_info = h; 209 if (cur_trans->blocked && may_wait_transaction(root, type)) {
210 btrfs_commit_transaction(h, root);
211 goto again;
212 }
213
214 if (num_items > 0) {
215 ret = btrfs_trans_reserve_metadata(h, root, num_items,
216 &retries);
217 if (ret == -EAGAIN) {
218 btrfs_commit_transaction(h, root);
219 goto again;
220 }
221 if (ret < 0) {
222 btrfs_end_transaction(h, root);
223 return ERR_PTR(ret);
224 }
225 }
194 226
195 root->fs_info->running_transaction->use_count++; 227 mutex_lock(&root->fs_info->trans_mutex);
196 record_root_in_trans(h, root); 228 record_root_in_trans(h, root);
197 mutex_unlock(&root->fs_info->trans_mutex); 229 mutex_unlock(&root->fs_info->trans_mutex);
230
231 if (!current->journal_info && type != TRANS_USERSPACE)
232 current->journal_info = h;
198 return h; 233 return h;
199} 234}
200 235
201struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, 236struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
202 int num_blocks) 237 int num_items)
203{ 238{
204 return start_transaction(root, num_blocks, TRANS_START); 239 return start_transaction(root, num_items, TRANS_START);
205} 240}
206struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root, 241struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
207 int num_blocks) 242 int num_blocks)
208{ 243{
209 return start_transaction(root, num_blocks, TRANS_JOIN); 244 return start_transaction(root, 0, TRANS_JOIN);
210} 245}
211 246
212struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r, 247struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
213 int num_blocks) 248 int num_blocks)
214{ 249{
215 return start_transaction(r, num_blocks, TRANS_USERSPACE); 250 return start_transaction(r, 0, TRANS_USERSPACE);
216} 251}
217 252
218/* wait for a transaction commit to be fully complete */ 253/* wait for a transaction commit to be fully complete */
@@ -286,10 +321,36 @@ void btrfs_throttle(struct btrfs_root *root)
286 mutex_unlock(&root->fs_info->trans_mutex); 321 mutex_unlock(&root->fs_info->trans_mutex);
287} 322}
288 323
324static int should_end_transaction(struct btrfs_trans_handle *trans,
325 struct btrfs_root *root)
326{
327 int ret;
328 ret = btrfs_block_rsv_check(trans, root,
329 &root->fs_info->global_block_rsv, 0, 5);
330 return ret ? 1 : 0;
331}
332
333int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
334 struct btrfs_root *root)
335{
336 struct btrfs_transaction *cur_trans = trans->transaction;
337 int updates;
338
339 if (cur_trans->blocked || cur_trans->delayed_refs.flushing)
340 return 1;
341
342 updates = trans->delayed_ref_updates;
343 trans->delayed_ref_updates = 0;
344 if (updates)
345 btrfs_run_delayed_refs(trans, root, updates);
346
347 return should_end_transaction(trans, root);
348}
349
289static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, 350static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
290 struct btrfs_root *root, int throttle) 351 struct btrfs_root *root, int throttle)
291{ 352{
292 struct btrfs_transaction *cur_trans; 353 struct btrfs_transaction *cur_trans = trans->transaction;
293 struct btrfs_fs_info *info = root->fs_info; 354 struct btrfs_fs_info *info = root->fs_info;
294 int count = 0; 355 int count = 0;
295 356
@@ -313,9 +374,21 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
313 count++; 374 count++;
314 } 375 }
315 376
377 btrfs_trans_release_metadata(trans, root);
378
379 if (!root->fs_info->open_ioctl_trans &&
380 should_end_transaction(trans, root))
381 trans->transaction->blocked = 1;
382
383 if (cur_trans->blocked && !cur_trans->in_commit) {
384 if (throttle)
385 return btrfs_commit_transaction(trans, root);
386 else
387 wake_up_process(info->transaction_kthread);
388 }
389
316 mutex_lock(&info->trans_mutex); 390 mutex_lock(&info->trans_mutex);
317 cur_trans = info->running_transaction; 391 WARN_ON(cur_trans != info->running_transaction);
318 WARN_ON(cur_trans != trans->transaction);
319 WARN_ON(cur_trans->num_writers < 1); 392 WARN_ON(cur_trans->num_writers < 1);
320 cur_trans->num_writers--; 393 cur_trans->num_writers--;
321 394
@@ -603,6 +676,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
603 676
604 btrfs_free_log(trans, root); 677 btrfs_free_log(trans, root);
605 btrfs_update_reloc_root(trans, root); 678 btrfs_update_reloc_root(trans, root);
679 btrfs_orphan_commit_root(trans, root);
606 680
607 if (root->commit_root != root->node) { 681 if (root->commit_root != root->node) {
608 switch_commit_root(root); 682 switch_commit_root(root);
@@ -627,30 +701,30 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
627int btrfs_defrag_root(struct btrfs_root *root, int cacheonly) 701int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
628{ 702{
629 struct btrfs_fs_info *info = root->fs_info; 703 struct btrfs_fs_info *info = root->fs_info;
630 int ret;
631 struct btrfs_trans_handle *trans; 704 struct btrfs_trans_handle *trans;
705 int ret;
632 unsigned long nr; 706 unsigned long nr;
633 707
634 smp_mb(); 708 if (xchg(&root->defrag_running, 1))
635 if (root->defrag_running)
636 return 0; 709 return 0;
637 trans = btrfs_start_transaction(root, 1); 710
638 while (1) { 711 while (1) {
639 root->defrag_running = 1; 712 trans = btrfs_start_transaction(root, 0);
713 if (IS_ERR(trans))
714 return PTR_ERR(trans);
715
640 ret = btrfs_defrag_leaves(trans, root, cacheonly); 716 ret = btrfs_defrag_leaves(trans, root, cacheonly);
717
641 nr = trans->blocks_used; 718 nr = trans->blocks_used;
642 btrfs_end_transaction(trans, root); 719 btrfs_end_transaction(trans, root);
643 btrfs_btree_balance_dirty(info->tree_root, nr); 720 btrfs_btree_balance_dirty(info->tree_root, nr);
644 cond_resched(); 721 cond_resched();
645 722
646 trans = btrfs_start_transaction(root, 1);
647 if (root->fs_info->closing || ret != -EAGAIN) 723 if (root->fs_info->closing || ret != -EAGAIN)
648 break; 724 break;
649 } 725 }
650 root->defrag_running = 0; 726 root->defrag_running = 0;
651 smp_mb(); 727 return ret;
652 btrfs_end_transaction(trans, root);
653 return 0;
654} 728}
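
The xchg() above turns the old racy check-then-set on defrag_running into an atomic test-and-set, so at most one defragmenter enters. A user-space sketch of the same guard, using C11 atomics as a stand-in for the kernel's xchg():

#include <stdatomic.h>
#include <stdio.h>

static atomic_int defrag_running;

static int defrag(void)
{
	/* atomically set the flag; bail if it was already set */
	if (atomic_exchange(&defrag_running, 1))
		return 0;

	puts("defragging...");	/* the loop of short transactions goes here */

	atomic_store(&defrag_running, 0);
	return 0;
}

int main(void)
{
	defrag();			/* runs */
	atomic_store(&defrag_running, 1);
	defrag();			/* skipped: another defrag is in flight */
	return 0;
}
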
655 729
656#if 0 730#if 0
@@ -758,47 +832,63 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
758 struct btrfs_root *root = pending->root; 832 struct btrfs_root *root = pending->root;
759 struct btrfs_root *parent_root; 833 struct btrfs_root *parent_root;
760 struct inode *parent_inode; 834 struct inode *parent_inode;
835 struct dentry *dentry;
761 struct extent_buffer *tmp; 836 struct extent_buffer *tmp;
762 struct extent_buffer *old; 837 struct extent_buffer *old;
763 int ret; 838 int ret;
764 u64 objectid; 839 int retries = 0;
765 int namelen; 840 u64 to_reserve = 0;
766 u64 index = 0; 841 u64 index = 0;
767 842 u64 objectid;
768 parent_inode = pending->dentry->d_parent->d_inode;
769 parent_root = BTRFS_I(parent_inode)->root;
770 843
771 new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS); 844 new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
772 if (!new_root_item) { 845 if (!new_root_item) {
773 ret = -ENOMEM; 846 pending->error = -ENOMEM;
774 goto fail; 847 goto fail;
775 } 848 }
849
776 ret = btrfs_find_free_objectid(trans, tree_root, 0, &objectid); 850 ret = btrfs_find_free_objectid(trans, tree_root, 0, &objectid);
777 if (ret) 851 if (ret) {
852 pending->error = ret;
778 goto fail; 853 goto fail;
854 }
855
856 btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
857 btrfs_orphan_pre_snapshot(trans, pending, &to_reserve);
858
859 if (to_reserve > 0) {
860 ret = btrfs_block_rsv_add(trans, root, &pending->block_rsv,
861 to_reserve, &retries);
862 if (ret) {
863 pending->error = ret;
864 goto fail;
865 }
866 }
779 867
780 key.objectid = objectid; 868 key.objectid = objectid;
781 /* record when the snapshot was created in key.offset */ 869 key.offset = (u64)-1;
782 key.offset = trans->transid; 870 key.type = BTRFS_ROOT_ITEM_KEY;
783 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
784 871
785 memcpy(&pending->root_key, &key, sizeof(key)); 872 trans->block_rsv = &pending->block_rsv;
786 pending->root_key.offset = (u64)-1;
787 873
874 dentry = pending->dentry;
875 parent_inode = dentry->d_parent->d_inode;
876 parent_root = BTRFS_I(parent_inode)->root;
788 record_root_in_trans(trans, parent_root); 877 record_root_in_trans(trans, parent_root);
878
789 /* 879 /*
790 * insert the directory item 880 * insert the directory item
791 */ 881 */
792 namelen = strlen(pending->name);
793 ret = btrfs_set_inode_index(parent_inode, &index); 882 ret = btrfs_set_inode_index(parent_inode, &index);
794 BUG_ON(ret); 883 BUG_ON(ret);
795 ret = btrfs_insert_dir_item(trans, parent_root, 884 ret = btrfs_insert_dir_item(trans, parent_root,
796 pending->name, namelen, 885 dentry->d_name.name, dentry->d_name.len,
797 parent_inode->i_ino, 886 parent_inode->i_ino, &key,
798 &pending->root_key, BTRFS_FT_DIR, index); 887 BTRFS_FT_DIR, index);
799 BUG_ON(ret); 888 BUG_ON(ret);
800 889
801 btrfs_i_size_write(parent_inode, parent_inode->i_size + namelen * 2); 890 btrfs_i_size_write(parent_inode, parent_inode->i_size +
891 dentry->d_name.len * 2);
802 ret = btrfs_update_inode(trans, parent_root, parent_inode); 892 ret = btrfs_update_inode(trans, parent_root, parent_inode);
803 BUG_ON(ret); 893 BUG_ON(ret);
804 894
@@ -815,22 +905,32 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
815 free_extent_buffer(old); 905 free_extent_buffer(old);
816 906
817 btrfs_set_root_node(new_root_item, tmp); 907 btrfs_set_root_node(new_root_item, tmp);
818 ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, 908 /* record when the snapshot was created in key.offset */
819 new_root_item); 909 key.offset = trans->transid;
820 BUG_ON(ret); 910 ret = btrfs_insert_root(trans, tree_root, &key, new_root_item);
821 btrfs_tree_unlock(tmp); 911 btrfs_tree_unlock(tmp);
822 free_extent_buffer(tmp); 912 free_extent_buffer(tmp);
913 BUG_ON(ret);
823 914
824 ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root, 915 /*
825 pending->root_key.objectid, 916 * insert root back/forward references
917 */
918 ret = btrfs_add_root_ref(trans, tree_root, objectid,
826 parent_root->root_key.objectid, 919 parent_root->root_key.objectid,
827 parent_inode->i_ino, index, pending->name, 920 parent_inode->i_ino, index,
828 namelen); 921 dentry->d_name.name, dentry->d_name.len);
829 BUG_ON(ret); 922 BUG_ON(ret);
830 923
924 key.offset = (u64)-1;
925 pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key);
926 BUG_ON(IS_ERR(pending->snap));
927
928 btrfs_reloc_post_snapshot(trans, pending);
929 btrfs_orphan_post_snapshot(trans, pending);
831fail: 930fail:
832 kfree(new_root_item); 931 kfree(new_root_item);
833 return ret; 932 btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1);
933 return 0;
834} 934}
835 935
836/* 936/*
@@ -878,6 +978,16 @@ int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
878 return ret; 978 return ret;
879} 979}
880 980
981int btrfs_transaction_blocked(struct btrfs_fs_info *info)
982{
983 int ret = 0;
984 spin_lock(&info->new_trans_lock);
985 if (info->running_transaction)
986 ret = info->running_transaction->blocked;
987 spin_unlock(&info->new_trans_lock);
988 return ret;
989}
990
881int btrfs_commit_transaction(struct btrfs_trans_handle *trans, 991int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
882 struct btrfs_root *root) 992 struct btrfs_root *root)
883{ 993{
@@ -899,6 +1009,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
899 ret = btrfs_run_delayed_refs(trans, root, 0); 1009 ret = btrfs_run_delayed_refs(trans, root, 0);
900 BUG_ON(ret); 1010 BUG_ON(ret);
901 1011
1012 btrfs_trans_release_metadata(trans, root);
1013
902 cur_trans = trans->transaction; 1014 cur_trans = trans->transaction;
903 /* 1015 /*
904 * set the flushing flag so procs in this transaction have to 1016 * set the flushing flag so procs in this transaction have to
@@ -951,9 +1063,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
951 snap_pending = 1; 1063 snap_pending = 1;
952 1064
953 WARN_ON(cur_trans != trans->transaction); 1065 WARN_ON(cur_trans != trans->transaction);
954 prepare_to_wait(&cur_trans->writer_wait, &wait,
955 TASK_UNINTERRUPTIBLE);
956
957 if (cur_trans->num_writers > 1) 1066 if (cur_trans->num_writers > 1)
958 timeout = MAX_SCHEDULE_TIMEOUT; 1067 timeout = MAX_SCHEDULE_TIMEOUT;
959 else if (should_grow) 1068 else if (should_grow)
@@ -976,6 +1085,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
976 */ 1085 */
977 btrfs_run_ordered_operations(root, 1); 1086 btrfs_run_ordered_operations(root, 1);
978 1087
1088 prepare_to_wait(&cur_trans->writer_wait, &wait,
1089 TASK_UNINTERRUPTIBLE);
1090
979 smp_mb(); 1091 smp_mb();
980 if (cur_trans->num_writers > 1 || should_grow) 1092 if (cur_trans->num_writers > 1 || should_grow)
981 schedule_timeout(timeout); 1093 schedule_timeout(timeout);
@@ -1103,9 +1215,9 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root)
1103 1215
1104 if (btrfs_header_backref_rev(root->node) < 1216 if (btrfs_header_backref_rev(root->node) <
1105 BTRFS_MIXED_BACKREF_REV) 1217 BTRFS_MIXED_BACKREF_REV)
1106 btrfs_drop_snapshot(root, 0); 1218 btrfs_drop_snapshot(root, NULL, 0);
1107 else 1219 else
1108 btrfs_drop_snapshot(root, 1); 1220 btrfs_drop_snapshot(root, NULL, 1);
1109 } 1221 }
1110 return 0; 1222 return 0;
1111} 1223}
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 93c7ccb33118..e104986d0bfd 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -45,20 +45,23 @@ struct btrfs_transaction {
45 45
46struct btrfs_trans_handle { 46struct btrfs_trans_handle {
47 u64 transid; 47 u64 transid;
48 u64 block_group;
49 u64 bytes_reserved;
48 unsigned long blocks_reserved; 50 unsigned long blocks_reserved;
49 unsigned long blocks_used; 51 unsigned long blocks_used;
50 struct btrfs_transaction *transaction;
51 u64 block_group;
52 u64 alloc_exclude_start;
53 u64 alloc_exclude_nr;
54 unsigned long delayed_ref_updates; 52 unsigned long delayed_ref_updates;
53 struct btrfs_transaction *transaction;
54 struct btrfs_block_rsv *block_rsv;
55}; 55};
56 56
57struct btrfs_pending_snapshot { 57struct btrfs_pending_snapshot {
58 struct dentry *dentry; 58 struct dentry *dentry;
59 struct btrfs_root *root; 59 struct btrfs_root *root;
60 char *name; 60 struct btrfs_root *snap;
61 struct btrfs_key root_key; 61 /* block reservation for the operation */
62 struct btrfs_block_rsv block_rsv;
63 /* extra metadata reservation for relocation */
64 int error;
62 struct list_head list; 65 struct list_head list;
63}; 66};
64 67
@@ -85,11 +88,11 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
85int btrfs_end_transaction(struct btrfs_trans_handle *trans, 88int btrfs_end_transaction(struct btrfs_trans_handle *trans,
86 struct btrfs_root *root); 89 struct btrfs_root *root);
87struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, 90struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
88 int num_blocks); 91 int num_items);
89struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root, 92struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root,
90 int num_blocks); 93 int num_blocks);
91struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r, 94struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r,
92 int num_blocks); 95 int num_blocks);
93int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, 96int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
94 struct btrfs_root *root); 97 struct btrfs_root *root);
95int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans, 98int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
@@ -103,6 +106,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
103 struct btrfs_root *root); 106 struct btrfs_root *root);
104int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, 107int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
105 struct btrfs_root *root); 108 struct btrfs_root *root);
109int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
110 struct btrfs_root *root);
106void btrfs_throttle(struct btrfs_root *root); 111void btrfs_throttle(struct btrfs_root *root);
107int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, 112int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
108 struct btrfs_root *root); 113 struct btrfs_root *root);
@@ -112,5 +117,6 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
112 struct extent_io_tree *dirty_pages, int mark); 117 struct extent_io_tree *dirty_pages, int mark);
113int btrfs_wait_marked_extents(struct btrfs_root *root, 118int btrfs_wait_marked_extents(struct btrfs_root *root,
114 struct extent_io_tree *dirty_pages, int mark); 119 struct extent_io_tree *dirty_pages, int mark);
120int btrfs_transaction_blocked(struct btrfs_fs_info *info);
115int btrfs_transaction_in_commit(struct btrfs_fs_info *info); 121int btrfs_transaction_in_commit(struct btrfs_fs_info *info);
116#endif 122#endif
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index b10eacdb1620..f7ac8e013ed7 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -117,13 +117,14 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
117 path->nodes[1], 0, 117 path->nodes[1], 0,
118 cache_only, &last_ret, 118 cache_only, &last_ret,
119 &root->defrag_progress); 119 &root->defrag_progress);
120 WARN_ON(ret && ret != -EAGAIN); 120 if (ret) {
121 WARN_ON(ret == -EAGAIN);
122 goto out;
123 }
121 if (next_key_ret == 0) { 124 if (next_key_ret == 0) {
122 memcpy(&root->defrag_progress, &key, sizeof(key)); 125 memcpy(&root->defrag_progress, &key, sizeof(key));
123 ret = -EAGAIN; 126 ret = -EAGAIN;
124 } 127 }
125
126 btrfs_release_path(root, path);
127out: 128out:
128 if (path) 129 if (path)
129 btrfs_free_path(path); 130 btrfs_free_path(path);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index af57dd2b43d4..fb102a9aee9c 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -135,6 +135,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
135 struct btrfs_root *root) 135 struct btrfs_root *root)
136{ 136{
137 int ret; 137 int ret;
138 int err = 0;
138 139
139 mutex_lock(&root->log_mutex); 140 mutex_lock(&root->log_mutex);
140 if (root->log_root) { 141 if (root->log_root) {
@@ -155,17 +156,19 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
155 mutex_lock(&root->fs_info->tree_log_mutex); 156 mutex_lock(&root->fs_info->tree_log_mutex);
156 if (!root->fs_info->log_root_tree) { 157 if (!root->fs_info->log_root_tree) {
157 ret = btrfs_init_log_root_tree(trans, root->fs_info); 158 ret = btrfs_init_log_root_tree(trans, root->fs_info);
158 BUG_ON(ret); 159 if (ret)
160 err = ret;
159 } 161 }
160 if (!root->log_root) { 162 if (err == 0 && !root->log_root) {
161 ret = btrfs_add_log_tree(trans, root); 163 ret = btrfs_add_log_tree(trans, root);
162 BUG_ON(ret); 164 if (ret)
165 err = ret;
163 } 166 }
164 mutex_unlock(&root->fs_info->tree_log_mutex); 167 mutex_unlock(&root->fs_info->tree_log_mutex);
165 root->log_batch++; 168 root->log_batch++;
166 atomic_inc(&root->log_writers); 169 atomic_inc(&root->log_writers);
167 mutex_unlock(&root->log_mutex); 170 mutex_unlock(&root->log_mutex);
168 return 0; 171 return err;
169} 172}
170 173
171/* 174/*
@@ -376,7 +379,7 @@ insert:
376 BUG_ON(ret); 379 BUG_ON(ret);
377 } 380 }
378 } else if (ret) { 381 } else if (ret) {
379 BUG(); 382 return ret;
380 } 383 }
381 dst_ptr = btrfs_item_ptr_offset(path->nodes[0], 384 dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
382 path->slots[0]); 385 path->slots[0]);
@@ -1699,9 +1702,9 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1699 1702
1700 next = btrfs_find_create_tree_block(root, bytenr, blocksize); 1703 next = btrfs_find_create_tree_block(root, bytenr, blocksize);
1701 1704
1702 wc->process_func(root, next, wc, ptr_gen);
1703
1704 if (*level == 1) { 1705 if (*level == 1) {
1706 wc->process_func(root, next, wc, ptr_gen);
1707
1705 path->slots[*level]++; 1708 path->slots[*level]++;
1706 if (wc->free) { 1709 if (wc->free) {
1707 btrfs_read_buffer(next, ptr_gen); 1710 btrfs_read_buffer(next, ptr_gen);
@@ -1734,35 +1737,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1734 WARN_ON(*level < 0); 1737 WARN_ON(*level < 0);
1735 WARN_ON(*level >= BTRFS_MAX_LEVEL); 1738 WARN_ON(*level >= BTRFS_MAX_LEVEL);
1736 1739
1737 if (path->nodes[*level] == root->node) 1740 path->slots[*level] = btrfs_header_nritems(path->nodes[*level]);
1738 parent = path->nodes[*level];
1739 else
1740 parent = path->nodes[*level + 1];
1741
1742 bytenr = path->nodes[*level]->start;
1743
1744 blocksize = btrfs_level_size(root, *level);
1745 root_owner = btrfs_header_owner(parent);
1746 root_gen = btrfs_header_generation(parent);
1747
1748 wc->process_func(root, path->nodes[*level], wc,
1749 btrfs_header_generation(path->nodes[*level]));
1750
1751 if (wc->free) {
1752 next = path->nodes[*level];
1753 btrfs_tree_lock(next);
1754 clean_tree_block(trans, root, next);
1755 btrfs_set_lock_blocking(next);
1756 btrfs_wait_tree_block_writeback(next);
1757 btrfs_tree_unlock(next);
1758
1759 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
1760 ret = btrfs_free_reserved_extent(root, bytenr, blocksize);
1761 BUG_ON(ret);
1762 }
1763 free_extent_buffer(path->nodes[*level]);
1764 path->nodes[*level] = NULL;
1765 *level += 1;
1766 1741
1767 cond_resched(); 1742 cond_resched();
1768 return 0; 1743 return 0;
@@ -1781,7 +1756,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
1781 1756
1782 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { 1757 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
1783 slot = path->slots[i]; 1758 slot = path->slots[i];
1784 if (slot < btrfs_header_nritems(path->nodes[i]) - 1) { 1759 if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
1785 struct extent_buffer *node; 1760 struct extent_buffer *node;
1786 node = path->nodes[i]; 1761 node = path->nodes[i];
1787 path->slots[i]++; 1762 path->slots[i]++;
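
The small-looking change in walk_up_log_tree() guards an unsigned underflow: btrfs_header_nritems() yields a u32, so for an empty node the old test "slot < nritems - 1" compares against 0 - 1 wrapped to 0xffffffff and is effectively always true, while "slot + 1 < nritems" stays on the safe side. A standalone demonstration of the two forms:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	int slot = 0;
	uint32_t nritems = 0;	/* an empty node */

	/* old form: nritems - 1 wraps to UINT32_MAX, test wrongly passes */
	if (slot < nritems - 1)
		printf("old test: claims another slot exists\n");

	/* new form: 1 < 0 is false, as intended */
	if (slot + 1 < nritems)
		printf("new test: claims another slot exists\n");
	else
		printf("new test: node exhausted\n");
	return 0;
}
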
@@ -2047,7 +2022,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2047 mutex_unlock(&log_root_tree->log_mutex); 2022 mutex_unlock(&log_root_tree->log_mutex);
2048 2023
2049 ret = update_log_root(trans, log); 2024 ret = update_log_root(trans, log);
2050 BUG_ON(ret);
2051 2025
2052 mutex_lock(&log_root_tree->log_mutex); 2026 mutex_lock(&log_root_tree->log_mutex);
2053 if (atomic_dec_and_test(&log_root_tree->log_writers)) { 2027 if (atomic_dec_and_test(&log_root_tree->log_writers)) {
@@ -2056,6 +2030,15 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2056 wake_up(&log_root_tree->log_writer_wait); 2030 wake_up(&log_root_tree->log_writer_wait);
2057 } 2031 }
2058 2032
2033 if (ret) {
2034 BUG_ON(ret != -ENOSPC);
2035 root->fs_info->last_trans_log_full_commit = trans->transid;
2036 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2037 mutex_unlock(&log_root_tree->log_mutex);
2038 ret = -EAGAIN;
2039 goto out;
2040 }
2041
2059 index2 = log_root_tree->log_transid % 2; 2042 index2 = log_root_tree->log_transid % 2;
2060 if (atomic_read(&log_root_tree->log_commit[index2])) { 2043 if (atomic_read(&log_root_tree->log_commit[index2])) {
2061 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2044 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
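
The error block added to btrfs_sync_log() is the recovery strategy behind most of these conversions: an -ENOSPC on the log path is not fatal, it merely disqualifies the tree log for the rest of this transaction. Writing trans->transid into last_trans_log_full_commit is the signal; fast-path code is expected to notice it and degrade to a full transaction commit. A hedged sketch of a consumer of that flag, not the literal fsync code:

	if (root->fs_info->last_trans_log_full_commit == trans->transid) {
		/* the log is off limits this transaction; take the
		 * slower but always-correct path */
		ret = btrfs_commit_transaction(trans, root);
	} else {
		ret = btrfs_sync_log(trans, root);
	}
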
@@ -2129,15 +2112,10 @@ out:
2129 return 0; 2112 return 0;
2130} 2113}
2131 2114
2132/* 2115static void free_log_tree(struct btrfs_trans_handle *trans,
2133 * free all the extents used by the tree log. This should be called 2116 struct btrfs_root *log)
2134 * at commit time of the full transaction
2135 */
2136int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
2137{ 2117{
2138 int ret; 2118 int ret;
2139 struct btrfs_root *log;
2140 struct key;
2141 u64 start; 2119 u64 start;
2142 u64 end; 2120 u64 end;
2143 struct walk_control wc = { 2121 struct walk_control wc = {
@@ -2145,10 +2123,6 @@ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
2145 .process_func = process_one_buffer 2123 .process_func = process_one_buffer
2146 }; 2124 };
2147 2125
2148 if (!root->log_root || root->fs_info->log_root_recovering)
2149 return 0;
2150
2151 log = root->log_root;
2152 ret = walk_log_tree(trans, log, &wc); 2126 ret = walk_log_tree(trans, log, &wc);
2153 BUG_ON(ret); 2127 BUG_ON(ret);
2154 2128
@@ -2162,14 +2136,30 @@ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
2162 EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS); 2136 EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS);
2163 } 2137 }
2164 2138
2165 if (log->log_transid > 0) {
2166 ret = btrfs_del_root(trans, root->fs_info->log_root_tree,
2167 &log->root_key);
2168 BUG_ON(ret);
2169 }
2170 root->log_root = NULL;
2171 free_extent_buffer(log->node); 2139 free_extent_buffer(log->node);
2172 kfree(log); 2140 kfree(log);
2141}
2142
2143/*
2144 * free all the extents used by the tree log. This should be called
2145 * at commit time of the full transaction
2146 */
2147int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
2148{
2149 if (root->log_root) {
2150 free_log_tree(trans, root->log_root);
2151 root->log_root = NULL;
2152 }
2153 return 0;
2154}
2155
2156int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
2157 struct btrfs_fs_info *fs_info)
2158{
2159 if (fs_info->log_root_tree) {
2160 free_log_tree(trans, fs_info->log_root_tree);
2161 fs_info->log_root_tree = NULL;
2162 }
2173 return 0; 2163 return 0;
2174} 2164}
2175 2165
@@ -2203,6 +2193,7 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2203 struct btrfs_dir_item *di; 2193 struct btrfs_dir_item *di;
2204 struct btrfs_path *path; 2194 struct btrfs_path *path;
2205 int ret; 2195 int ret;
2196 int err = 0;
2206 int bytes_del = 0; 2197 int bytes_del = 0;
2207 2198
2208 if (BTRFS_I(dir)->logged_trans < trans->transid) 2199 if (BTRFS_I(dir)->logged_trans < trans->transid)
@@ -2218,7 +2209,11 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2218 path = btrfs_alloc_path(); 2209 path = btrfs_alloc_path();
2219 di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino, 2210 di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino,
2220 name, name_len, -1); 2211 name, name_len, -1);
2221 if (di && !IS_ERR(di)) { 2212 if (IS_ERR(di)) {
2213 err = PTR_ERR(di);
2214 goto fail;
2215 }
2216 if (di) {
2222 ret = btrfs_delete_one_dir_name(trans, log, path, di); 2217 ret = btrfs_delete_one_dir_name(trans, log, path, di);
2223 bytes_del += name_len; 2218 bytes_del += name_len;
2224 BUG_ON(ret); 2219 BUG_ON(ret);
@@ -2226,7 +2221,11 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2226 btrfs_release_path(log, path); 2221 btrfs_release_path(log, path);
2227 di = btrfs_lookup_dir_index_item(trans, log, path, dir->i_ino, 2222 di = btrfs_lookup_dir_index_item(trans, log, path, dir->i_ino,
2228 index, name, name_len, -1); 2223 index, name, name_len, -1);
2229 if (di && !IS_ERR(di)) { 2224 if (IS_ERR(di)) {
2225 err = PTR_ERR(di);
2226 goto fail;
2227 }
2228 if (di) {
2230 ret = btrfs_delete_one_dir_name(trans, log, path, di); 2229 ret = btrfs_delete_one_dir_name(trans, log, path, di);
2231 bytes_del += name_len; 2230 bytes_del += name_len;
2232 BUG_ON(ret); 2231 BUG_ON(ret);
@@ -2244,6 +2243,10 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2244 btrfs_release_path(log, path); 2243 btrfs_release_path(log, path);
2245 2244
2246 ret = btrfs_search_slot(trans, log, &key, path, 0, 1); 2245 ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
2246 if (ret < 0) {
2247 err = ret;
2248 goto fail;
2249 }
2247 if (ret == 0) { 2250 if (ret == 0) {
2248 struct btrfs_inode_item *item; 2251 struct btrfs_inode_item *item;
2249 u64 i_size; 2252 u64 i_size;
@@ -2261,9 +2264,13 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2261 ret = 0; 2264 ret = 0;
2262 btrfs_release_path(log, path); 2265 btrfs_release_path(log, path);
2263 } 2266 }
2264 2267fail:
2265 btrfs_free_path(path); 2268 btrfs_free_path(path);
2266 mutex_unlock(&BTRFS_I(dir)->log_mutex); 2269 mutex_unlock(&BTRFS_I(dir)->log_mutex);
2270 if (ret == -ENOSPC) {
2271 root->fs_info->last_trans_log_full_commit = trans->transid;
2272 ret = 0;
2273 }
2267 btrfs_end_log_trans(root); 2274 btrfs_end_log_trans(root);
2268 2275
2269 return 0; 2276 return 0;
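
Both lookups in btrfs_del_dir_entries_in_log() move from the easy-to-misread "if (di && !IS_ERR(di))", which silently swallowed real errors, to an explicit three-way split: error pointer, NULL meaning "not found" (fine here), or a usable item. A standalone re-creation of the <linux/err.h> helpers to show that contract:

#include <errno.h>
#include <stdio.h>

#define MAX_ERRNO 4095

/* userspace re-creations of the <linux/err.h> helpers */
static void *ERR_PTR(long error) { return (void *)error; }
static long PTR_ERR(const void *ptr) { return (long)ptr; }
static int IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

static int item;	/* stand-in for a found directory item */

/* hypothetical lookup: error pointer on failure, NULL when absent */
static void *lookup_dir_item(int key)
{
	if (key < 0)
		return ERR_PTR(-EIO);
	return key ? (void *)&item : NULL;
}

int main(void)
{
	void *di = lookup_dir_item(-1);

	if (IS_ERR(di)) {		/* hard failure: propagate it */
		printf("error %ld\n", PTR_ERR(di));
		return 1;
	}
	if (di)				/* only act on a real item */
		printf("found, deleting\n");
	else				/* absent: nothing to delete */
		printf("not found, nothing to do\n");
	return 0;
}
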
@@ -2291,6 +2298,10 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
2291 ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino, 2298 ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino,
2292 dirid, &index); 2299 dirid, &index);
2293 mutex_unlock(&BTRFS_I(inode)->log_mutex); 2300 mutex_unlock(&BTRFS_I(inode)->log_mutex);
2301 if (ret == -ENOSPC) {
2302 root->fs_info->last_trans_log_full_commit = trans->transid;
2303 ret = 0;
2304 }
2294 btrfs_end_log_trans(root); 2305 btrfs_end_log_trans(root);
2295 2306
2296 return ret; 2307 return ret;
@@ -2318,7 +2329,8 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
2318 else 2329 else
2319 key.type = BTRFS_DIR_LOG_INDEX_KEY; 2330 key.type = BTRFS_DIR_LOG_INDEX_KEY;
2320 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); 2331 ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
2321 BUG_ON(ret); 2332 if (ret)
2333 return ret;
2322 2334
2323 item = btrfs_item_ptr(path->nodes[0], path->slots[0], 2335 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2324 struct btrfs_dir_log_item); 2336 struct btrfs_dir_log_item);
@@ -2343,6 +2355,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2343 struct btrfs_key max_key; 2355 struct btrfs_key max_key;
2344 struct btrfs_root *log = root->log_root; 2356 struct btrfs_root *log = root->log_root;
2345 struct extent_buffer *src; 2357 struct extent_buffer *src;
2358 int err = 0;
2346 int ret; 2359 int ret;
2347 int i; 2360 int i;
2348 int nritems; 2361 int nritems;
@@ -2405,6 +2418,10 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2405 ret = overwrite_item(trans, log, dst_path, 2418 ret = overwrite_item(trans, log, dst_path,
2406 path->nodes[0], path->slots[0], 2419 path->nodes[0], path->slots[0],
2407 &tmp); 2420 &tmp);
2421 if (ret) {
2422 err = ret;
2423 goto done;
2424 }
2408 } 2425 }
2409 } 2426 }
2410 btrfs_release_path(root, path); 2427 btrfs_release_path(root, path);
@@ -2432,7 +2449,10 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2432 goto done; 2449 goto done;
2433 ret = overwrite_item(trans, log, dst_path, src, i, 2450 ret = overwrite_item(trans, log, dst_path, src, i,
2434 &min_key); 2451 &min_key);
2435 BUG_ON(ret); 2452 if (ret) {
2453 err = ret;
2454 goto done;
2455 }
2436 } 2456 }
2437 path->slots[0] = nritems; 2457 path->slots[0] = nritems;
2438 2458
@@ -2454,22 +2474,30 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2454 ret = overwrite_item(trans, log, dst_path, 2474 ret = overwrite_item(trans, log, dst_path,
2455 path->nodes[0], path->slots[0], 2475 path->nodes[0], path->slots[0],
2456 &tmp); 2476 &tmp);
2457 2477 if (ret)
2458 BUG_ON(ret); 2478 err = ret;
2459 last_offset = tmp.offset; 2479 else
2480 last_offset = tmp.offset;
2460 goto done; 2481 goto done;
2461 } 2482 }
2462 } 2483 }
2463done: 2484done:
2464 *last_offset_ret = last_offset;
2465 btrfs_release_path(root, path); 2485 btrfs_release_path(root, path);
2466 btrfs_release_path(log, dst_path); 2486 btrfs_release_path(log, dst_path);
2467 2487
2468 /* insert the log range keys to indicate where the log is valid */ 2488 if (err == 0) {
2469 ret = insert_dir_log_key(trans, log, path, key_type, inode->i_ino, 2489 *last_offset_ret = last_offset;
2470 first_offset, last_offset); 2490 /*
2471 BUG_ON(ret); 2491 * insert the log range keys to indicate where the log
2472 return 0; 2492 * is valid
2493 */
2494 ret = insert_dir_log_key(trans, log, path, key_type,
2495 inode->i_ino, first_offset,
2496 last_offset);
2497 if (ret)
2498 err = ret;
2499 }
2500 return err;
2473} 2501}
2474 2502
2475/* 2503/*
@@ -2501,7 +2529,8 @@ again:
2501 ret = log_dir_items(trans, root, inode, path, 2529 ret = log_dir_items(trans, root, inode, path,
2502 dst_path, key_type, min_key, 2530 dst_path, key_type, min_key,
2503 &max_key); 2531 &max_key);
2504 BUG_ON(ret); 2532 if (ret)
2533 return ret;
2505 if (max_key == (u64)-1) 2534 if (max_key == (u64)-1)
2506 break; 2535 break;
2507 min_key = max_key + 1; 2536 min_key = max_key + 1;
@@ -2535,8 +2564,8 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
2535 2564
2536 while (1) { 2565 while (1) {
2537 ret = btrfs_search_slot(trans, log, &key, path, -1, 1); 2566 ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
2538 2567 BUG_ON(ret == 0);
2539 if (ret != 1) 2568 if (ret < 0)
2540 break; 2569 break;
2541 2570
2542 if (path->slots[0] == 0) 2571 if (path->slots[0] == 0)
@@ -2554,7 +2583,7 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
2554 btrfs_release_path(log, path); 2583 btrfs_release_path(log, path);
2555 } 2584 }
2556 btrfs_release_path(log, path); 2585 btrfs_release_path(log, path);
2557 return 0; 2586 return ret;
2558} 2587}
2559 2588
2560static noinline int copy_items(struct btrfs_trans_handle *trans, 2589static noinline int copy_items(struct btrfs_trans_handle *trans,
@@ -2587,7 +2616,10 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2587 } 2616 }
2588 ret = btrfs_insert_empty_items(trans, log, dst_path, 2617 ret = btrfs_insert_empty_items(trans, log, dst_path,
2589 ins_keys, ins_sizes, nr); 2618 ins_keys, ins_sizes, nr);
2590 BUG_ON(ret); 2619 if (ret) {
2620 kfree(ins_data);
2621 return ret;
2622 }
2591 2623
2592 for (i = 0; i < nr; i++, dst_path->slots[0]++) { 2624 for (i = 0; i < nr; i++, dst_path->slots[0]++) {
2593 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], 2625 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
@@ -2660,16 +2692,17 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2660 * we have to do this after the loop above to avoid changing the 2692 * we have to do this after the loop above to avoid changing the
2661 * log tree while trying to change the log tree. 2693 * log tree while trying to change the log tree.
2662 */ 2694 */
2695 ret = 0;
2663 while (!list_empty(&ordered_sums)) { 2696 while (!list_empty(&ordered_sums)) {
2664 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, 2697 struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
2665 struct btrfs_ordered_sum, 2698 struct btrfs_ordered_sum,
2666 list); 2699 list);
2667 ret = btrfs_csum_file_blocks(trans, log, sums); 2700 if (!ret)
2668 BUG_ON(ret); 2701 ret = btrfs_csum_file_blocks(trans, log, sums);
2669 list_del(&sums->list); 2702 list_del(&sums->list);
2670 kfree(sums); 2703 kfree(sums);
2671 } 2704 }
2672 return 0; 2705 return ret;
2673} 2706}
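
The ordered_sums loop in copy_items() keeps walking the list after a failure: the "if (!ret)" guard stops issuing btrfs_csum_file_blocks() calls once one fails, but every queued entry is still unlinked and freed, so an early error cannot leak the remainder. The same drain-on-error pattern standalone:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

struct sum {
	struct sum *next;
	int value;
};

static int process(struct sum *s)
{
	return s->value == 2 ? -EIO : 0;	/* pretend entry 2 fails */
}

int main(void)
{
	struct sum *head = NULL, *s;
	int ret = 0;
	int i;

	for (i = 3; i >= 1; i--) {		/* build list 1 -> 2 -> 3 */
		s = malloc(sizeof(*s));
		s->value = i;
		s->next = head;
		head = s;
	}

	while (head) {				/* drain unconditionally */
		s = head;
		head = s->next;
		if (!ret)			/* stop work after the   */
			ret = process(s);	/* first failure ...     */
		free(s);			/* ... but free them all */
	}
	printf("ret = %d\n", ret);		/* first error survives */
	return 0;
}
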
2674 2707
2675/* log a single inode in the tree log. 2708/* log a single inode in the tree log.
@@ -2697,6 +2730,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
2697 struct btrfs_root *log = root->log_root; 2730 struct btrfs_root *log = root->log_root;
2698 struct extent_buffer *src = NULL; 2731 struct extent_buffer *src = NULL;
2699 u32 size; 2732 u32 size;
2733 int err = 0;
2700 int ret; 2734 int ret;
2701 int nritems; 2735 int nritems;
2702 int ins_start_slot = 0; 2736 int ins_start_slot = 0;
@@ -2739,7 +2773,10 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
2739 } else { 2773 } else {
2740 ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0); 2774 ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0);
2741 } 2775 }
2742 BUG_ON(ret); 2776 if (ret) {
2777 err = ret;
2778 goto out_unlock;
2779 }
2743 path->keep_locks = 1; 2780 path->keep_locks = 1;
2744 2781
2745 while (1) { 2782 while (1) {
@@ -2768,7 +2805,10 @@ again:
2768 2805
2769 ret = copy_items(trans, log, dst_path, src, ins_start_slot, 2806 ret = copy_items(trans, log, dst_path, src, ins_start_slot,
2770 ins_nr, inode_only); 2807 ins_nr, inode_only);
2771 BUG_ON(ret); 2808 if (ret) {
2809 err = ret;
2810 goto out_unlock;
2811 }
2772 ins_nr = 1; 2812 ins_nr = 1;
2773 ins_start_slot = path->slots[0]; 2813 ins_start_slot = path->slots[0];
2774next_slot: 2814next_slot:
@@ -2784,7 +2824,10 @@ next_slot:
2784 ret = copy_items(trans, log, dst_path, src, 2824 ret = copy_items(trans, log, dst_path, src,
2785 ins_start_slot, 2825 ins_start_slot,
2786 ins_nr, inode_only); 2826 ins_nr, inode_only);
2787 BUG_ON(ret); 2827 if (ret) {
2828 err = ret;
2829 goto out_unlock;
2830 }
2788 ins_nr = 0; 2831 ins_nr = 0;
2789 } 2832 }
2790 btrfs_release_path(root, path); 2833 btrfs_release_path(root, path);
@@ -2802,7 +2845,10 @@ next_slot:
2802 ret = copy_items(trans, log, dst_path, src, 2845 ret = copy_items(trans, log, dst_path, src,
2803 ins_start_slot, 2846 ins_start_slot,
2804 ins_nr, inode_only); 2847 ins_nr, inode_only);
2805 BUG_ON(ret); 2848 if (ret) {
2849 err = ret;
2850 goto out_unlock;
2851 }
2806 ins_nr = 0; 2852 ins_nr = 0;
2807 } 2853 }
2808 WARN_ON(ins_nr); 2854 WARN_ON(ins_nr);
@@ -2810,14 +2856,18 @@ next_slot:
2810 btrfs_release_path(root, path); 2856 btrfs_release_path(root, path);
2811 btrfs_release_path(log, dst_path); 2857 btrfs_release_path(log, dst_path);
2812 ret = log_directory_changes(trans, root, inode, path, dst_path); 2858 ret = log_directory_changes(trans, root, inode, path, dst_path);
2813 BUG_ON(ret); 2859 if (ret) {
2860 err = ret;
2861 goto out_unlock;
2862 }
2814 } 2863 }
2815 BTRFS_I(inode)->logged_trans = trans->transid; 2864 BTRFS_I(inode)->logged_trans = trans->transid;
2865out_unlock:
2816 mutex_unlock(&BTRFS_I(inode)->log_mutex); 2866 mutex_unlock(&BTRFS_I(inode)->log_mutex);
2817 2867
2818 btrfs_free_path(path); 2868 btrfs_free_path(path);
2819 btrfs_free_path(dst_path); 2869 btrfs_free_path(dst_path);
2820 return 0; 2870 return err;
2821} 2871}
2822 2872
2823/* 2873/*
@@ -2942,10 +2992,13 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
2942 goto end_no_trans; 2992 goto end_no_trans;
2943 } 2993 }
2944 2994
2945 start_log_trans(trans, root); 2995 ret = start_log_trans(trans, root);
2996 if (ret)
2997 goto end_trans;
2946 2998
2947 ret = btrfs_log_inode(trans, root, inode, inode_only); 2999 ret = btrfs_log_inode(trans, root, inode, inode_only);
2948 BUG_ON(ret); 3000 if (ret)
3001 goto end_trans;
2949 3002
2950 /* 3003 /*
2951 * for regular files, if its inode is already on disk, we don't 3004 * for regular files, if its inode is already on disk, we don't
@@ -2955,8 +3008,10 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
2955 */ 3008 */
2956 if (S_ISREG(inode->i_mode) && 3009 if (S_ISREG(inode->i_mode) &&
2957 BTRFS_I(inode)->generation <= last_committed && 3010 BTRFS_I(inode)->generation <= last_committed &&
2958 BTRFS_I(inode)->last_unlink_trans <= last_committed) 3011 BTRFS_I(inode)->last_unlink_trans <= last_committed) {
2959 goto no_parent; 3012 ret = 0;
3013 goto end_trans;
3014 }
2960 3015
2961 inode_only = LOG_INODE_EXISTS; 3016 inode_only = LOG_INODE_EXISTS;
2962 while (1) { 3017 while (1) {
@@ -2970,15 +3025,21 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
2970 if (BTRFS_I(inode)->generation > 3025 if (BTRFS_I(inode)->generation >
2971 root->fs_info->last_trans_committed) { 3026 root->fs_info->last_trans_committed) {
2972 ret = btrfs_log_inode(trans, root, inode, inode_only); 3027 ret = btrfs_log_inode(trans, root, inode, inode_only);
2973 BUG_ON(ret); 3028 if (ret)
3029 goto end_trans;
2974 } 3030 }
2975 if (IS_ROOT(parent)) 3031 if (IS_ROOT(parent))
2976 break; 3032 break;
2977 3033
2978 parent = parent->d_parent; 3034 parent = parent->d_parent;
2979 } 3035 }
2980no_parent:
2981 ret = 0; 3036 ret = 0;
3037end_trans:
3038 if (ret < 0) {
3039 BUG_ON(ret != -ENOSPC);
3040 root->fs_info->last_trans_log_full_commit = trans->transid;
3041 ret = 1;
3042 }
2982 btrfs_end_log_trans(root); 3043 btrfs_end_log_trans(root);
2983end_no_trans: 3044end_no_trans:
2984 return ret; 3045 return ret;
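
The new end_trans label funnels every failure in btrfs_log_inode_parent() through one policy: assert that only -ENOSPC escaped, set the full-commit flag, and return 1 rather than a negative errno. The positive value reads as a directive, the return-value twin of the flag shown earlier; a hypothetical caller would react like this:

	/* hypothetical caller; log_the_inode() is a stand-in */
	ret = log_the_inode(trans, root, inode);
	if (ret == 0)
		ret = btrfs_sync_log(trans, root);	     /* fast path */
	else if (ret > 0)
		ret = btrfs_commit_transaction(trans, root); /* degrade */
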
@@ -3020,7 +3081,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
3020 path = btrfs_alloc_path(); 3081 path = btrfs_alloc_path();
3021 BUG_ON(!path); 3082 BUG_ON(!path);
3022 3083
3023 trans = btrfs_start_transaction(fs_info->tree_root, 1); 3084 trans = btrfs_start_transaction(fs_info->tree_root, 0);
3024 3085
3025 wc.trans = trans; 3086 wc.trans = trans;
3026 wc.pin = 1; 3087 wc.pin = 1;
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 0776eacb5083..3dfae84c8cc8 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -25,6 +25,8 @@
25int btrfs_sync_log(struct btrfs_trans_handle *trans, 25int btrfs_sync_log(struct btrfs_trans_handle *trans,
26 struct btrfs_root *root); 26 struct btrfs_root *root);
27int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); 27int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);
28int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
29 struct btrfs_fs_info *fs_info);
28int btrfs_recover_log_trees(struct btrfs_root *tree_root); 30int btrfs_recover_log_trees(struct btrfs_root *tree_root);
29int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 31int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
30 struct btrfs_root *root, struct dentry *dentry); 32 struct btrfs_root *root, struct dentry *dentry);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 8db7b14bbae8..d6e3af8be95b 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1097,7 +1097,7 @@ static int btrfs_rm_dev_item(struct btrfs_root *root,
1097 if (!path) 1097 if (!path)
1098 return -ENOMEM; 1098 return -ENOMEM;
1099 1099
1100 trans = btrfs_start_transaction(root, 1); 1100 trans = btrfs_start_transaction(root, 0);
1101 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1101 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1102 key.type = BTRFS_DEV_ITEM_KEY; 1102 key.type = BTRFS_DEV_ITEM_KEY;
1103 key.offset = device->devid; 1103 key.offset = device->devid;
@@ -1486,7 +1486,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1486 goto error; 1486 goto error;
1487 } 1487 }
1488 1488
1489 trans = btrfs_start_transaction(root, 1); 1489 trans = btrfs_start_transaction(root, 0);
1490 lock_chunks(root); 1490 lock_chunks(root);
1491 1491
1492 device->barriers = 1; 1492 device->barriers = 1;
@@ -1751,9 +1751,10 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
1751 1751
1752 /* step one, relocate all the extents inside this chunk */ 1752 /* step one, relocate all the extents inside this chunk */
1753 ret = btrfs_relocate_block_group(extent_root, chunk_offset); 1753 ret = btrfs_relocate_block_group(extent_root, chunk_offset);
1754 BUG_ON(ret); 1754 if (ret)
1755 return ret;
1755 1756
1756 trans = btrfs_start_transaction(root, 1); 1757 trans = btrfs_start_transaction(root, 0);
1757 BUG_ON(!trans); 1758 BUG_ON(!trans);
1758 1759
1759 lock_chunks(root); 1760 lock_chunks(root);
@@ -1925,7 +1926,7 @@ int btrfs_balance(struct btrfs_root *dev_root)
1925 break; 1926 break;
1926 BUG_ON(ret); 1927 BUG_ON(ret);
1927 1928
1928 trans = btrfs_start_transaction(dev_root, 1); 1929 trans = btrfs_start_transaction(dev_root, 0);
1929 BUG_ON(!trans); 1930 BUG_ON(!trans);
1930 1931
1931 ret = btrfs_grow_device(trans, device, old_size); 1932 ret = btrfs_grow_device(trans, device, old_size);
@@ -2094,11 +2095,7 @@ again:
2094 } 2095 }
2095 2096
2096 /* Shrinking succeeded, else we would be at "done". */ 2097 /* Shrinking succeeded, else we would be at "done". */
2097 trans = btrfs_start_transaction(root, 1); 2098 trans = btrfs_start_transaction(root, 0);
2098 if (!trans) {
2099 ret = -ENOMEM;
2100 goto done;
2101 }
2102 lock_chunks(root); 2099 lock_chunks(root);
2103 2100
2104 device->disk_total_bytes = new_size; 2101 device->disk_total_bytes = new_size;
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 59acd3eb288a..88ecbb215878 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -154,15 +154,10 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans,
154 if (trans) 154 if (trans)
155 return do_setxattr(trans, inode, name, value, size, flags); 155 return do_setxattr(trans, inode, name, value, size, flags);
156 156
157 ret = btrfs_reserve_metadata_space(root, 2); 157 trans = btrfs_start_transaction(root, 2);
158 if (ret) 158 if (IS_ERR(trans))
159 return ret; 159 return PTR_ERR(trans);
160 160
161 trans = btrfs_start_transaction(root, 1);
162 if (!trans) {
163 ret = -ENOMEM;
164 goto out;
165 }
166 btrfs_set_trans_block_group(trans, inode); 161 btrfs_set_trans_block_group(trans, inode);
167 162
168 ret = do_setxattr(trans, inode, name, value, size, flags); 163 ret = do_setxattr(trans, inode, name, value, size, flags);
@@ -174,7 +169,6 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans,
174 BUG_ON(ret); 169 BUG_ON(ret);
175out: 170out:
176 btrfs_end_transaction_throttle(trans, root); 171 btrfs_end_transaction_throttle(trans, root);
177 btrfs_unreserve_metadata_space(root, 2);
178 return ret; 172 return ret;
179} 173}
180 174
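
The __btrfs_setxattr() hunk is the clearest statement of the transaction API change also running through volumes.c above: btrfs_start_transaction() now takes the number of items to reserve metadata space for (0 meaning no reservation) and reports failure as an error pointer rather than NULL, replacing the separate btrfs_reserve_metadata_space()/btrfs_unreserve_metadata_space() pair. A hedged usage sketch:

	/* reserve room for two tree items up front */
	trans = btrfs_start_transaction(root, 2);
	if (IS_ERR(trans))
		return PTR_ERR(trans);		/* commonly -ENOSPC */

	/* ... modify up to two items under this handle ... */

	btrfs_end_transaction(trans, root);	/* presumably drops the
						   reservation as well */
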
diff --git a/fs/buffer.c b/fs/buffer.c
index e8aa7081d25c..d54812b198e9 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1949,14 +1949,11 @@ static int __block_commit_write(struct inode *inode, struct page *page,
1949} 1949}
1950 1950
1951/* 1951/*
1952 * block_write_begin takes care of the basic task of block allocation and 1952 * Filesystems implementing the new truncate sequence should use the
1953 * bringing partial write blocks uptodate first. 1953 * _newtrunc postfix variant which won't incorrectly call vmtruncate.
1954 * 1954 * The filesystem needs to handle block truncation upon failure.
1955 * If *pagep is not NULL, then block_write_begin uses the locked page
1956 * at *pagep rather than allocating its own. In this case, the page will
1957 * not be unlocked or deallocated on failure.
1958 */ 1955 */
1959int block_write_begin(struct file *file, struct address_space *mapping, 1956int block_write_begin_newtrunc(struct file *file, struct address_space *mapping,
1960 loff_t pos, unsigned len, unsigned flags, 1957 loff_t pos, unsigned len, unsigned flags,
1961 struct page **pagep, void **fsdata, 1958 struct page **pagep, void **fsdata,
1962 get_block_t *get_block) 1959 get_block_t *get_block)
@@ -1992,20 +1989,50 @@ int block_write_begin(struct file *file, struct address_space *mapping,
1992 unlock_page(page); 1989 unlock_page(page);
1993 page_cache_release(page); 1990 page_cache_release(page);
1994 *pagep = NULL; 1991 *pagep = NULL;
1995
1996 /*
1997 * prepare_write() may have instantiated a few blocks
1998 * outside i_size. Trim these off again. Don't need
1999 * i_size_read because we hold i_mutex.
2000 */
2001 if (pos + len > inode->i_size)
2002 vmtruncate(inode, inode->i_size);
2003 } 1992 }
2004 } 1993 }
2005 1994
2006out: 1995out:
2007 return status; 1996 return status;
2008} 1997}
1998EXPORT_SYMBOL(block_write_begin_newtrunc);
1999
2000/*
2001 * block_write_begin takes care of the basic task of block allocation and
2002 * bringing partial write blocks uptodate first.
2003 *
2004 * If *pagep is not NULL, then block_write_begin uses the locked page
2005 * at *pagep rather than allocating its own. In this case, the page will
2006 * not be unlocked or deallocated on failure.
2007 */
2008int block_write_begin(struct file *file, struct address_space *mapping,
2009 loff_t pos, unsigned len, unsigned flags,
2010 struct page **pagep, void **fsdata,
2011 get_block_t *get_block)
2012{
2013 int ret;
2014
2015 ret = block_write_begin_newtrunc(file, mapping, pos, len, flags,
2016 pagep, fsdata, get_block);
2017
2018 /*
2019 * prepare_write() may have instantiated a few blocks
2020 * outside i_size. Trim these off again. Don't need
2021 * i_size_read because we hold i_mutex.
2022 *
2023 * Filesystems which pass down their own page also cannot
2024 * call into vmtruncate here because it would lead to lock
2025 * inversion problems (*pagep is locked). This is a further
2026 * example of where the old truncate sequence is inadequate.
2027 */
2028 if (unlikely(ret) && *pagep == NULL) {
2029 loff_t isize = mapping->host->i_size;
2030 if (pos + len > isize)
2031 vmtruncate(mapping->host, isize);
2032 }
2033
2034 return ret;
2035}
2009EXPORT_SYMBOL(block_write_begin); 2036EXPORT_SYMBOL(block_write_begin);
2010 2037
2011int block_write_end(struct file *file, struct address_space *mapping, 2038int block_write_end(struct file *file, struct address_space *mapping,
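
Every buffer.c change follows one template: the body of a write_begin helper moves into a *_newtrunc variant that reports failure without trimming blocks past i_size, and the old name becomes a thin wrapper preserving the legacy vmtruncate()-on-failure behaviour for unconverted filesystems. A converted filesystem would call the _newtrunc variant and do its own trimming; a hypothetical sketch (the myfs_* names are invented):

static int myfs_write_begin(struct file *file, struct address_space *mapping,
			    loff_t pos, unsigned len, unsigned flags,
			    struct page **pagep, void **fsdata)
{
	int ret;

	ret = block_write_begin_newtrunc(file, mapping, pos, len, flags,
					 pagep, fsdata, myfs_get_block);
	if (ret)
		/* trim blocks instantiated beyond i_size with the
		 * filesystem's own truncate path, not vmtruncate() */
		myfs_truncate_failed_write(mapping->host);
	return ret;
}
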
@@ -2324,7 +2351,7 @@ out:
2324 * For moronic filesystems that do not allow holes in file. 2351 * For moronic filesystems that do not allow holes in file.
2325 * We may have to extend the file. 2352 * We may have to extend the file.
2326 */ 2353 */
2327int cont_write_begin(struct file *file, struct address_space *mapping, 2354int cont_write_begin_newtrunc(struct file *file, struct address_space *mapping,
2328 loff_t pos, unsigned len, unsigned flags, 2355 loff_t pos, unsigned len, unsigned flags,
2329 struct page **pagep, void **fsdata, 2356 struct page **pagep, void **fsdata,
2330 get_block_t *get_block, loff_t *bytes) 2357 get_block_t *get_block, loff_t *bytes)
@@ -2345,11 +2372,30 @@ int cont_write_begin(struct file *file, struct address_space *mapping,
2345 } 2372 }
2346 2373
2347 *pagep = NULL; 2374 *pagep = NULL;
2348 err = block_write_begin(file, mapping, pos, len, 2375 err = block_write_begin_newtrunc(file, mapping, pos, len,
2349 flags, pagep, fsdata, get_block); 2376 flags, pagep, fsdata, get_block);
2350out: 2377out:
2351 return err; 2378 return err;
2352} 2379}
2380EXPORT_SYMBOL(cont_write_begin_newtrunc);
2381
2382int cont_write_begin(struct file *file, struct address_space *mapping,
2383 loff_t pos, unsigned len, unsigned flags,
2384 struct page **pagep, void **fsdata,
2385 get_block_t *get_block, loff_t *bytes)
2386{
2387 int ret;
2388
2389 ret = cont_write_begin_newtrunc(file, mapping, pos, len, flags,
2390 pagep, fsdata, get_block, bytes);
2391 if (unlikely(ret)) {
2392 loff_t isize = mapping->host->i_size;
2393 if (pos + len > isize)
2394 vmtruncate(mapping->host, isize);
2395 }
2396
2397 return ret;
2398}
2353EXPORT_SYMBOL(cont_write_begin); 2399EXPORT_SYMBOL(cont_write_begin);
2354 2400
2355int block_prepare_write(struct page *page, unsigned from, unsigned to, 2401int block_prepare_write(struct page *page, unsigned from, unsigned to,
@@ -2381,7 +2427,7 @@ EXPORT_SYMBOL(block_commit_write);
2381 * 2427 *
2382 * We are not allowed to take the i_mutex here so we have to play games to 2428 * We are not allowed to take the i_mutex here so we have to play games to
2383 * protect against truncate races as the page could now be beyond EOF. Because 2429 * protect against truncate races as the page could now be beyond EOF. Because
2384 * vmtruncate() writes the inode size before removing pages, once we have the 2430 * truncate writes the inode size before removing pages, once we have the
2385 * page lock we can determine safely if the page is beyond EOF. If it is not 2431 * page lock we can determine safely if the page is beyond EOF. If it is not
2386 * beyond EOF, then the page is guaranteed safe against truncation until we 2432 * beyond EOF, then the page is guaranteed safe against truncation until we
2387 * unlock the page. 2433 * unlock the page.
@@ -2464,10 +2510,11 @@ static void attach_nobh_buffers(struct page *page, struct buffer_head *head)
2464} 2510}
2465 2511
2466/* 2512/*
2467 * On entry, the page is fully not uptodate. 2513 * Filesystems implementing the new truncate sequence should use the
2468 * On exit the page is fully uptodate in the areas outside (from,to) 2514 * _newtrunc postfix variant which won't incorrectly call vmtruncate.
2515 * The filesystem needs to handle block truncation upon failure.
2469 */ 2516 */
2470int nobh_write_begin(struct file *file, struct address_space *mapping, 2517int nobh_write_begin_newtrunc(struct file *file, struct address_space *mapping,
2471 loff_t pos, unsigned len, unsigned flags, 2518 loff_t pos, unsigned len, unsigned flags,
2472 struct page **pagep, void **fsdata, 2519 struct page **pagep, void **fsdata,
2473 get_block_t *get_block) 2520 get_block_t *get_block)
@@ -2500,8 +2547,8 @@ int nobh_write_begin(struct file *file, struct address_space *mapping,
2500 unlock_page(page); 2547 unlock_page(page);
2501 page_cache_release(page); 2548 page_cache_release(page);
2502 *pagep = NULL; 2549 *pagep = NULL;
2503 return block_write_begin(file, mapping, pos, len, flags, pagep, 2550 return block_write_begin_newtrunc(file, mapping, pos, len,
2504 fsdata, get_block); 2551 flags, pagep, fsdata, get_block);
2505 } 2552 }
2506 2553
2507 if (PageMappedToDisk(page)) 2554 if (PageMappedToDisk(page))
@@ -2605,8 +2652,34 @@ out_release:
2605 page_cache_release(page); 2652 page_cache_release(page);
2606 *pagep = NULL; 2653 *pagep = NULL;
2607 2654
2608 if (pos + len > inode->i_size) 2655 return ret;
2609 vmtruncate(inode, inode->i_size); 2656}
2657EXPORT_SYMBOL(nobh_write_begin_newtrunc);
2658
2659/*
2660 * On entry, the page is fully not uptodate.
2661 * On exit the page is fully uptodate in the areas outside (from,to)
2662 */
2663int nobh_write_begin(struct file *file, struct address_space *mapping,
2664 loff_t pos, unsigned len, unsigned flags,
2665 struct page **pagep, void **fsdata,
2666 get_block_t *get_block)
2667{
2668 int ret;
2669
2670 ret = nobh_write_begin_newtrunc(file, mapping, pos, len, flags,
2671 pagep, fsdata, get_block);
2672
2673 /*
2674 * prepare_write() may have instantiated a few blocks
2675 * outside i_size. Trim these off again. Don't need
2676 * i_size_read because we hold i_mutex.
2677 */
2678 if (unlikely(ret)) {
2679 loff_t isize = mapping->host->i_size;
2680 if (pos + len > isize)
2681 vmtruncate(mapping->host, isize);
2682 }
2610 2683
2611 return ret; 2684 return ret;
2612} 2685}
diff --git a/fs/ceph/auth.c b/fs/ceph/auth.c
index 9f46de2ba7a7..89490beaf537 100644
--- a/fs/ceph/auth.c
+++ b/fs/ceph/auth.c
@@ -1,7 +1,6 @@
1#include "ceph_debug.h" 1#include "ceph_debug.h"
2 2
3#include <linux/module.h> 3#include <linux/module.h>
4#include <linux/slab.h>
5#include <linux/err.h> 4#include <linux/err.h>
6#include <linux/slab.h> 5#include <linux/slab.h>
7 6
@@ -217,8 +216,8 @@ int ceph_handle_auth_reply(struct ceph_auth_client *ac,
217 if (ac->protocol != protocol) { 216 if (ac->protocol != protocol) {
218 ret = ceph_auth_init_protocol(ac, protocol); 217 ret = ceph_auth_init_protocol(ac, protocol);
219 if (ret) { 218 if (ret) {
220 pr_err("error %d on auth method %s init\n", 219 pr_err("error %d on auth protocol %d init\n",
221 ret, ac->ops->name); 220 ret, protocol);
222 goto out; 221 goto out;
223 } 222 }
224 } 223 }
@@ -247,7 +246,7 @@ int ceph_build_auth(struct ceph_auth_client *ac,
247 if (!ac->protocol) 246 if (!ac->protocol)
248 return ceph_auth_build_hello(ac, msg_buf, msg_len); 247 return ceph_auth_build_hello(ac, msg_buf, msg_len);
249 BUG_ON(!ac->ops); 248 BUG_ON(!ac->ops);
250 if (!ac->ops->is_authenticated(ac)) 249 if (ac->ops->should_authenticate(ac))
251 return ceph_build_auth_request(ac, msg_buf, msg_len); 250 return ceph_build_auth_request(ac, msg_buf, msg_len);
252 return 0; 251 return 0;
253} 252}
diff --git a/fs/ceph/auth.h b/fs/ceph/auth.h
index 4429a707c021..d38a2fb4a137 100644
--- a/fs/ceph/auth.h
+++ b/fs/ceph/auth.h
@@ -24,6 +24,12 @@ struct ceph_auth_client_ops {
24 int (*is_authenticated)(struct ceph_auth_client *ac); 24 int (*is_authenticated)(struct ceph_auth_client *ac);
25 25
26 /* 26 /*
27 * true if we should (re)authenticate, e.g., when our tickets
28 * are getting old and crusty.
29 */
30 int (*should_authenticate)(struct ceph_auth_client *ac);
31
32 /*
27 * build requests and process replies during monitor 33 * build requests and process replies during monitor
28 * handshake. if handle_reply returns -EAGAIN, we build 34 * handshake. if handle_reply returns -EAGAIN, we build
29 * another request. 35 * another request.
diff --git a/fs/ceph/auth_none.c b/fs/ceph/auth_none.c
index 24407c119291..ad1dc21286c7 100644
--- a/fs/ceph/auth_none.c
+++ b/fs/ceph/auth_none.c
@@ -31,6 +31,13 @@ static int is_authenticated(struct ceph_auth_client *ac)
31 return !xi->starting; 31 return !xi->starting;
32} 32}
33 33
34static int should_authenticate(struct ceph_auth_client *ac)
35{
36 struct ceph_auth_none_info *xi = ac->private;
37
38 return xi->starting;
39}
40
34/* 41/*
35 * the generic auth code decodes the global_id, and we carry no actual 42 * the generic auth code decodes the global_id, and we carry no actual
36 * authentication state, so nothing happens here. 43 * authentication state, so nothing happens here.
@@ -98,6 +105,7 @@ static const struct ceph_auth_client_ops ceph_auth_none_ops = {
98 .reset = reset, 105 .reset = reset,
99 .destroy = destroy, 106 .destroy = destroy,
100 .is_authenticated = is_authenticated, 107 .is_authenticated = is_authenticated,
108 .should_authenticate = should_authenticate,
101 .handle_reply = handle_reply, 109 .handle_reply = handle_reply,
102 .create_authorizer = ceph_auth_none_create_authorizer, 110 .create_authorizer = ceph_auth_none_create_authorizer,
103 .destroy_authorizer = ceph_auth_none_destroy_authorizer, 111 .destroy_authorizer = ceph_auth_none_destroy_authorizer,
diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c
index 7b206231566d..83d4d2785ffe 100644
--- a/fs/ceph/auth_x.c
+++ b/fs/ceph/auth_x.c
@@ -27,6 +27,17 @@ static int ceph_x_is_authenticated(struct ceph_auth_client *ac)
27 return (ac->want_keys & xi->have_keys) == ac->want_keys; 27 return (ac->want_keys & xi->have_keys) == ac->want_keys;
28} 28}
29 29
30static int ceph_x_should_authenticate(struct ceph_auth_client *ac)
31{
32 struct ceph_x_info *xi = ac->private;
33 int need;
34
35 ceph_x_validate_tickets(ac, &need);
36 dout("ceph_x_should_authenticate want=%d need=%d have=%d\n",
37 ac->want_keys, need, xi->have_keys);
38 return need != 0;
39}
40
30static int ceph_x_encrypt_buflen(int ilen) 41static int ceph_x_encrypt_buflen(int ilen)
31{ 42{
32 return sizeof(struct ceph_x_encrypt_header) + ilen + 16 + 43 return sizeof(struct ceph_x_encrypt_header) + ilen + 16 +
@@ -620,6 +631,7 @@ static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,
620static const struct ceph_auth_client_ops ceph_x_ops = { 631static const struct ceph_auth_client_ops ceph_x_ops = {
621 .name = "x", 632 .name = "x",
622 .is_authenticated = ceph_x_is_authenticated, 633 .is_authenticated = ceph_x_is_authenticated,
634 .should_authenticate = ceph_x_should_authenticate,
623 .build_request = ceph_x_build_request, 635 .build_request = ceph_x_build_request,
624 .handle_reply = ceph_x_handle_reply, 636 .handle_reply = ceph_x_handle_reply,
625 .create_authorizer = ceph_x_create_authorizer, 637 .create_authorizer = ceph_x_create_authorizer,
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 0dd0b81e64f7..619b61655ee5 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -981,6 +981,46 @@ static int send_cap_msg(struct ceph_mds_session *session,
981 return 0; 981 return 0;
982} 982}
983 983
984static void __queue_cap_release(struct ceph_mds_session *session,
985 u64 ino, u64 cap_id, u32 migrate_seq,
986 u32 issue_seq)
987{
988 struct ceph_msg *msg;
989 struct ceph_mds_cap_release *head;
990 struct ceph_mds_cap_item *item;
991
992 spin_lock(&session->s_cap_lock);
993 BUG_ON(!session->s_num_cap_releases);
994 msg = list_first_entry(&session->s_cap_releases,
995 struct ceph_msg, list_head);
996
997 dout(" adding %llx release to mds%d msg %p (%d left)\n",
998 ino, session->s_mds, msg, session->s_num_cap_releases);
999
1000 BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE);
1001 head = msg->front.iov_base;
1002 head->num = cpu_to_le32(le32_to_cpu(head->num) + 1);
1003 item = msg->front.iov_base + msg->front.iov_len;
1004 item->ino = cpu_to_le64(ino);
1005 item->cap_id = cpu_to_le64(cap_id);
1006 item->migrate_seq = cpu_to_le32(migrate_seq);
1007 item->seq = cpu_to_le32(issue_seq);
1008
1009 session->s_num_cap_releases--;
1010
1011 msg->front.iov_len += sizeof(*item);
1012 if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) {
1013 dout(" release msg %p full\n", msg);
1014 list_move_tail(&msg->list_head, &session->s_cap_releases_done);
1015 } else {
1016 dout(" release msg %p at %d/%d (%d)\n", msg,
1017 (int)le32_to_cpu(head->num),
1018 (int)CEPH_CAPS_PER_RELEASE,
1019 (int)msg->front.iov_len);
1020 }
1021 spin_unlock(&session->s_cap_lock);
1022}
1023
984/* 1024/*
985 * Queue cap releases when an inode is dropped from our cache. Since 1025 * Queue cap releases when an inode is dropped from our cache. Since
986 * inode is about to be destroyed, there is no need for i_lock. 1026 * inode is about to be destroyed, there is no need for i_lock.
@@ -994,41 +1034,9 @@ void ceph_queue_caps_release(struct inode *inode)
994 while (p) { 1034 while (p) {
995 struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node); 1035 struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node);
996 struct ceph_mds_session *session = cap->session; 1036 struct ceph_mds_session *session = cap->session;
997 struct ceph_msg *msg;
998 struct ceph_mds_cap_release *head;
999 struct ceph_mds_cap_item *item;
1000 1037
1001 spin_lock(&session->s_cap_lock); 1038 __queue_cap_release(session, ceph_ino(inode), cap->cap_id,
1002 BUG_ON(!session->s_num_cap_releases); 1039 cap->mseq, cap->issue_seq);
1003 msg = list_first_entry(&session->s_cap_releases,
1004 struct ceph_msg, list_head);
1005
1006 dout(" adding %p release to mds%d msg %p (%d left)\n",
1007 inode, session->s_mds, msg, session->s_num_cap_releases);
1008
1009 BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE);
1010 head = msg->front.iov_base;
1011 head->num = cpu_to_le32(le32_to_cpu(head->num) + 1);
1012 item = msg->front.iov_base + msg->front.iov_len;
1013 item->ino = cpu_to_le64(ceph_ino(inode));
1014 item->cap_id = cpu_to_le64(cap->cap_id);
1015 item->migrate_seq = cpu_to_le32(cap->mseq);
1016 item->seq = cpu_to_le32(cap->issue_seq);
1017
1018 session->s_num_cap_releases--;
1019
1020 msg->front.iov_len += sizeof(*item);
1021 if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) {
1022 dout(" release msg %p full\n", msg);
1023 list_move_tail(&msg->list_head,
1024 &session->s_cap_releases_done);
1025 } else {
1026 dout(" release msg %p at %d/%d (%d)\n", msg,
1027 (int)le32_to_cpu(head->num),
1028 (int)CEPH_CAPS_PER_RELEASE,
1029 (int)msg->front.iov_len);
1030 }
1031 spin_unlock(&session->s_cap_lock);
1032 p = rb_next(p); 1040 p = rb_next(p);
1033 __ceph_remove_cap(cap); 1041 __ceph_remove_cap(cap);
1034 } 1042 }
@@ -1776,9 +1784,9 @@ out:
1776 spin_unlock(&ci->i_unsafe_lock); 1784 spin_unlock(&ci->i_unsafe_lock);
1777} 1785}
1778 1786
1779int ceph_fsync(struct file *file, struct dentry *dentry, int datasync) 1787int ceph_fsync(struct file *file, int datasync)
1780{ 1788{
1781 struct inode *inode = dentry->d_inode; 1789 struct inode *inode = file->f_mapping->host;
1782 struct ceph_inode_info *ci = ceph_inode(inode); 1790 struct ceph_inode_info *ci = ceph_inode(inode);
1783 unsigned flush_tid; 1791 unsigned flush_tid;
1784 int ret; 1792 int ret;
@@ -2655,7 +2663,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
2655 struct ceph_mds_caps *h; 2663 struct ceph_mds_caps *h;
2656 int mds = session->s_mds; 2664 int mds = session->s_mds;
2657 int op; 2665 int op;
2658 u32 seq; 2666 u32 seq, mseq;
2659 struct ceph_vino vino; 2667 struct ceph_vino vino;
2660 u64 cap_id; 2668 u64 cap_id;
2661 u64 size, max_size; 2669 u64 size, max_size;
@@ -2675,6 +2683,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
2675 vino.snap = CEPH_NOSNAP; 2683 vino.snap = CEPH_NOSNAP;
2676 cap_id = le64_to_cpu(h->cap_id); 2684 cap_id = le64_to_cpu(h->cap_id);
2677 seq = le32_to_cpu(h->seq); 2685 seq = le32_to_cpu(h->seq);
2686 mseq = le32_to_cpu(h->migrate_seq);
2678 size = le64_to_cpu(h->size); 2687 size = le64_to_cpu(h->size);
2679 max_size = le64_to_cpu(h->max_size); 2688 max_size = le64_to_cpu(h->max_size);
2680 2689
@@ -2689,6 +2698,18 @@ void ceph_handle_caps(struct ceph_mds_session *session,
2689 vino.snap, inode); 2698 vino.snap, inode);
2690 if (!inode) { 2699 if (!inode) {
2691 dout(" i don't have ino %llx\n", vino.ino); 2700 dout(" i don't have ino %llx\n", vino.ino);
2701
2702 if (op == CEPH_CAP_OP_IMPORT)
2703 __queue_cap_release(session, vino.ino, cap_id,
2704 mseq, seq);
2705
2706 /*
2707 * send any full release message to try to move things
2708 * along for the mds (who clearly thinks we still have this
2709 * cap).
2710 */
2711 ceph_add_cap_releases(mdsc, session, -1);
2712 ceph_send_cap_releases(mdsc, session);
2692 goto done; 2713 goto done;
2693 } 2714 }
2694 2715
@@ -2714,7 +2735,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
2714 spin_lock(&inode->i_lock); 2735 spin_lock(&inode->i_lock);
2715 cap = __get_cap_for_mds(ceph_inode(inode), mds); 2736 cap = __get_cap_for_mds(ceph_inode(inode), mds);
2716 if (!cap) { 2737 if (!cap) {
2717 dout("no cap on %p ino %llx.%llx from mds%d, releasing\n", 2738 dout(" no cap on %p ino %llx.%llx from mds%d\n",
2718 inode, ceph_ino(inode), ceph_snap(inode), mds); 2739 inode, ceph_ino(inode), ceph_snap(inode), mds);
2719 spin_unlock(&inode->i_lock); 2740 spin_unlock(&inode->i_lock);
2720 goto done; 2741 goto done;
diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h
index 3b9eeed097b3..2fa992eaf7da 100644
--- a/fs/ceph/ceph_fs.h
+++ b/fs/ceph/ceph_fs.h
@@ -265,16 +265,17 @@ extern const char *ceph_mds_state_name(int s);
265 * - they also define the lock ordering by the MDS 265 * - they also define the lock ordering by the MDS
266 * - a few of these are internal to the mds 266 * - a few of these are internal to the mds
267 */ 267 */
268#define CEPH_LOCK_DN 1 268#define CEPH_LOCK_DVERSION 1
269#define CEPH_LOCK_ISNAP 2 269#define CEPH_LOCK_DN 2
270#define CEPH_LOCK_IVERSION 4 /* mds internal */ 270#define CEPH_LOCK_ISNAP 16
271#define CEPH_LOCK_IFILE 8 /* mds internal */ 271#define CEPH_LOCK_IVERSION 32 /* mds internal */
272#define CEPH_LOCK_IAUTH 32 272#define CEPH_LOCK_IFILE 64
273#define CEPH_LOCK_ILINK 64 273#define CEPH_LOCK_IAUTH 128
274#define CEPH_LOCK_IDFT 128 /* dir frag tree */ 274#define CEPH_LOCK_ILINK 256
275#define CEPH_LOCK_INEST 256 /* mds internal */ 275#define CEPH_LOCK_IDFT 512 /* dir frag tree */
276#define CEPH_LOCK_IXATTR 512 276#define CEPH_LOCK_INEST 1024 /* mds internal */
277#define CEPH_LOCK_INO 2048 /* immutable inode bits; not a lock */ 277#define CEPH_LOCK_IXATTR 2048
278#define CEPH_LOCK_INO 8192 /* immutable inode bits; not a lock */
278 279
279/* client_session ops */ 280/* client_session ops */
280enum { 281enum {
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 4fd30900eff7..f85719310db2 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -587,7 +587,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
587 CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP; 587 CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
588 req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS); 588 req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
589 if (IS_ERR(req)) 589 if (IS_ERR(req))
590 return ERR_PTR(PTR_ERR(req)); 590 return ERR_CAST(req);
591 req->r_dentry = dget(dentry); 591 req->r_dentry = dget(dentry);
592 req->r_num_caps = 2; 592 req->r_num_caps = 2;
593 /* we only need inode linkage */ 593 /* we only need inode linkage */
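
The ERR_PTR(PTR_ERR(req)) to ERR_CAST(req) conversions here and in export.c and file.c below are purely cosmetic: both forward an error pointer unchanged while converting its type, and ERR_CAST says so in one self-documenting step. Its <linux/err.h> definition is essentially:

	static inline void *ERR_CAST(const void *ptr)
	{
		/* cast away the const; the encoded errno value
		 * passes through untouched */
		return (void *)ptr;
	}
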
@@ -1107,10 +1107,9 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
1107 * an fsync() on a dir will wait for any uncommitted directory 1107 * an fsync() on a dir will wait for any uncommitted directory
1108 * operations to commit. 1108 * operations to commit.
1109 */ 1109 */
1110static int ceph_dir_fsync(struct file *file, struct dentry *dentry, 1110static int ceph_dir_fsync(struct file *file, int datasync)
1111 int datasync)
1112{ 1111{
1113 struct inode *inode = dentry->d_inode; 1112 struct inode *inode = file->f_path.dentry->d_inode;
1114 struct ceph_inode_info *ci = ceph_inode(inode); 1113 struct ceph_inode_info *ci = ceph_inode(inode);
1115 struct list_head *head = &ci->i_unsafe_dirops; 1114 struct list_head *head = &ci->i_unsafe_dirops;
1116 struct ceph_mds_request *req; 1115 struct ceph_mds_request *req;
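
ceph_fsync() above and ceph_dir_fsync() here track the VFS prototype change in this merge window that drops the dentry argument from ->fsync; the inode is taken from the struct file instead. Any implementation converts the same way; a minimal hypothetical sketch:

	static int myfs_fsync(struct file *file, int datasync)
	{
		struct inode *inode = file->f_mapping->host;

		/* ... write back and wait on this inode's data, and
		 * its metadata too unless datasync says otherwise ... */
		return 0;
	}
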
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index 17447644d675..4480cb1c63e7 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -133,7 +133,7 @@ static struct dentry *__cfh_to_dentry(struct super_block *sb,
133 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPHASH, 133 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPHASH,
134 USE_ANY_MDS); 134 USE_ANY_MDS);
135 if (IS_ERR(req)) 135 if (IS_ERR(req))
136 return ERR_PTR(PTR_ERR(req)); 136 return ERR_CAST(req);
137 137
138 req->r_ino1 = vino; 138 req->r_ino1 = vino;
139 req->r_ino2.ino = cfh->parent_ino; 139 req->r_ino2.ino = cfh->parent_ino;
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 6512b6701b9e..6251a1574b94 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -230,7 +230,7 @@ struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
230 /* do the open */ 230 /* do the open */
231 req = prepare_open_request(dir->i_sb, flags, mode); 231 req = prepare_open_request(dir->i_sb, flags, mode);
232 if (IS_ERR(req)) 232 if (IS_ERR(req))
233 return ERR_PTR(PTR_ERR(req)); 233 return ERR_CAST(req);
234 req->r_dentry = dget(dentry); 234 req->r_dentry = dget(dentry);
235 req->r_num_caps = 2; 235 req->r_num_caps = 2;
236 if (flags & O_CREAT) { 236 if (flags & O_CREAT) {
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index a81b8b662c7b..ab47f46ca282 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -69,7 +69,7 @@ struct inode *ceph_get_snapdir(struct inode *parent)
69 69
70 BUG_ON(!S_ISDIR(parent->i_mode)); 70 BUG_ON(!S_ISDIR(parent->i_mode));
71 if (IS_ERR(inode)) 71 if (IS_ERR(inode))
72 return ERR_PTR(PTR_ERR(inode)); 72 return inode;
73 inode->i_mode = parent->i_mode; 73 inode->i_mode = parent->i_mode;
74 inode->i_uid = parent->i_uid; 74 inode->i_uid = parent->i_uid;
75 inode->i_gid = parent->i_gid; 75 inode->i_gid = parent->i_gid;
@@ -827,7 +827,7 @@ static void ceph_set_dentry_offset(struct dentry *dn)
827 827
828 spin_lock(&dcache_lock); 828 spin_lock(&dcache_lock);
829 spin_lock(&dn->d_lock); 829 spin_lock(&dn->d_lock);
830 list_move_tail(&dir->d_subdirs, &dn->d_u.d_child); 830 list_move(&dn->d_u.d_child, &dir->d_subdirs);
831 dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset, 831 dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset,
832 dn->d_u.d_child.prev, dn->d_u.d_child.next); 832 dn->d_u.d_child.prev, dn->d_u.d_child.next);
833 spin_unlock(&dn->d_lock); 833 spin_unlock(&dn->d_lock);
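
The ceph_set_dentry_offset() fix is a reminder of the list_move() argument order: list_move(entry, head) re-queues entry at the front of head, and list_move_tail() at the back. The old call passed the parent's d_subdirs list head as the entry, splicing the head itself onto one child's sibling linkage:

	/* fixed form: the dentry is the entry, d_subdirs is the head */
	list_move(&dn->d_u.d_child, &dir->d_subdirs);

	/* buggy form (removed): arguments effectively reversed */
	/* list_move_tail(&dir->d_subdirs, &dn->d_u.d_child); */
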
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 885aa5710cfd..1766947fc07a 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1066,9 +1066,9 @@ static int trim_caps(struct ceph_mds_client *mdsc,
1066 * 1066 *
1067 * Called under s_mutex. 1067 * Called under s_mutex.
1068 */ 1068 */
1069static int add_cap_releases(struct ceph_mds_client *mdsc, 1069int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
1070 struct ceph_mds_session *session, 1070 struct ceph_mds_session *session,
1071 int extra) 1071 int extra)
1072{ 1072{
1073 struct ceph_msg *msg; 1073 struct ceph_msg *msg;
1074 struct ceph_mds_cap_release *head; 1074 struct ceph_mds_cap_release *head;
@@ -1176,8 +1176,8 @@ static int check_cap_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq)
1176/* 1176/*
1177 * called under s_mutex 1177 * called under s_mutex
1178 */ 1178 */
1179static void send_cap_releases(struct ceph_mds_client *mdsc, 1179void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
1180 struct ceph_mds_session *session) 1180 struct ceph_mds_session *session)
1181{ 1181{
1182 struct ceph_msg *msg; 1182 struct ceph_msg *msg;
1183 1183
@@ -1768,12 +1768,12 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
1768 mutex_unlock(&mdsc->mutex); 1768 mutex_unlock(&mdsc->mutex);
1769 dout("do_request waiting\n"); 1769 dout("do_request waiting\n");
1770 if (req->r_timeout) { 1770 if (req->r_timeout) {
1771 err = (long)wait_for_completion_interruptible_timeout( 1771 err = (long)wait_for_completion_killable_timeout(
1772 &req->r_completion, req->r_timeout); 1772 &req->r_completion, req->r_timeout);
1773 if (err == 0) 1773 if (err == 0)
1774 err = -EIO; 1774 err = -EIO;
1775 } else { 1775 } else {
1776 err = wait_for_completion_interruptible(&req->r_completion); 1776 err = wait_for_completion_killable(&req->r_completion);
1777 } 1777 }
1778 dout("do_request waited, got %d\n", err); 1778 dout("do_request waited, got %d\n", err);
1779 mutex_lock(&mdsc->mutex); 1779 mutex_lock(&mdsc->mutex);
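
The switch from interruptible to killable waits means an ordinary signal no longer aborts an in-flight MDS request mid-protocol; only a fatal signal (SIGKILL) breaks the wait. The return conventions are otherwise the same, sketched here with a hypothetical struct completion "done":

	if (timeout) {
		long t = wait_for_completion_killable_timeout(&done, timeout);
		if (t == 0)
			err = -EIO;	/* timed out */
		else if (t < 0)
			err = t;	/* -ERESTARTSYS: fatally signalled */
		else
			err = 0;	/* completed, t jiffies remained */
	} else {
		err = wait_for_completion_killable(&done);
	}
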
@@ -1980,7 +1980,7 @@ out_err:
1980 } 1980 }
1981 mutex_unlock(&mdsc->mutex); 1981 mutex_unlock(&mdsc->mutex);
1982 1982
1983 add_cap_releases(mdsc, req->r_session, -1); 1983 ceph_add_cap_releases(mdsc, req->r_session, -1);
1984 mutex_unlock(&session->s_mutex); 1984 mutex_unlock(&session->s_mutex);
1985 1985
1986 /* kick calling process */ 1986 /* kick calling process */
@@ -2014,16 +2014,21 @@ static void handle_forward(struct ceph_mds_client *mdsc,
2014 mutex_lock(&mdsc->mutex); 2014 mutex_lock(&mdsc->mutex);
2015 req = __lookup_request(mdsc, tid); 2015 req = __lookup_request(mdsc, tid);
2016 if (!req) { 2016 if (!req) {
2017 dout("forward %llu to mds%d - req dne\n", tid, next_mds); 2017 dout("forward tid %llu to mds%d - req dne\n", tid, next_mds);
2018 goto out; /* dup reply? */ 2018 goto out; /* dup reply? */
2019 } 2019 }
2020 2020
2021 if (fwd_seq <= req->r_num_fwd) { 2021 if (req->r_aborted) {
2022 dout("forward %llu to mds%d - old seq %d <= %d\n", 2022 dout("forward tid %llu aborted, unregistering\n", tid);
2023 __unregister_request(mdsc, req);
2024 } else if (fwd_seq <= req->r_num_fwd) {
2025 dout("forward tid %llu to mds%d - old seq %d <= %d\n",
2023 tid, next_mds, req->r_num_fwd, fwd_seq); 2026 tid, next_mds, req->r_num_fwd, fwd_seq);
2024 } else { 2027 } else {
2025 /* resend. forward race not possible; mds would drop */ 2028 /* resend. forward race not possible; mds would drop */
2026 dout("forward %llu to mds%d (we resend)\n", tid, next_mds); 2029 dout("forward tid %llu to mds%d (we resend)\n", tid, next_mds);
2030 BUG_ON(req->r_err);
2031 BUG_ON(req->r_got_result);
2027 req->r_num_fwd = fwd_seq; 2032 req->r_num_fwd = fwd_seq;
2028 req->r_resend_mds = next_mds; 2033 req->r_resend_mds = next_mds;
2029 put_request_session(req); 2034 put_request_session(req);
@@ -2428,6 +2433,7 @@ static void handle_lease(struct ceph_mds_client *mdsc,
2428 struct ceph_dentry_info *di; 2433 struct ceph_dentry_info *di;
2429 int mds = session->s_mds; 2434 int mds = session->s_mds;
2430 struct ceph_mds_lease *h = msg->front.iov_base; 2435 struct ceph_mds_lease *h = msg->front.iov_base;
2436 u32 seq;
2431 struct ceph_vino vino; 2437 struct ceph_vino vino;
2432 int mask; 2438 int mask;
2433 struct qstr dname; 2439 struct qstr dname;
@@ -2441,6 +2447,7 @@ static void handle_lease(struct ceph_mds_client *mdsc,
2441 vino.ino = le64_to_cpu(h->ino); 2447 vino.ino = le64_to_cpu(h->ino);
2442 vino.snap = CEPH_NOSNAP; 2448 vino.snap = CEPH_NOSNAP;
2443 mask = le16_to_cpu(h->mask); 2449 mask = le16_to_cpu(h->mask);
2450 seq = le32_to_cpu(h->seq);
2444 dname.name = (void *)h + sizeof(*h) + sizeof(u32); 2451 dname.name = (void *)h + sizeof(*h) + sizeof(u32);
2445 dname.len = msg->front.iov_len - sizeof(*h) - sizeof(u32); 2452 dname.len = msg->front.iov_len - sizeof(*h) - sizeof(u32);
2446 if (dname.len != get_unaligned_le32(h+1)) 2453 if (dname.len != get_unaligned_le32(h+1))
@@ -2451,8 +2458,9 @@ static void handle_lease(struct ceph_mds_client *mdsc,
2451 2458
2452 /* lookup inode */ 2459 /* lookup inode */
2453 inode = ceph_find_inode(sb, vino); 2460 inode = ceph_find_inode(sb, vino);
2454 dout("handle_lease '%s', mask %d, ino %llx %p\n", 2461 dout("handle_lease %s, mask %d, ino %llx %p %.*s\n",
2455 ceph_lease_op_name(h->action), mask, vino.ino, inode); 2462 ceph_lease_op_name(h->action), mask, vino.ino, inode,
2463 dname.len, dname.name);
2456 if (inode == NULL) { 2464 if (inode == NULL) {
2457 dout("handle_lease no inode %llx\n", vino.ino); 2465 dout("handle_lease no inode %llx\n", vino.ino);
2458 goto release; 2466 goto release;
@@ -2477,7 +2485,8 @@ static void handle_lease(struct ceph_mds_client *mdsc,
2477 switch (h->action) { 2485 switch (h->action) {
2478 case CEPH_MDS_LEASE_REVOKE: 2486 case CEPH_MDS_LEASE_REVOKE:
2479 if (di && di->lease_session == session) { 2487 if (di && di->lease_session == session) {
2480 h->seq = cpu_to_le32(di->lease_seq); 2488 if (ceph_seq_cmp(di->lease_seq, seq) > 0)
2489 h->seq = cpu_to_le32(di->lease_seq);
2481 __ceph_mdsc_drop_dentry_lease(dentry); 2490 __ceph_mdsc_drop_dentry_lease(dentry);
2482 } 2491 }
2483 release = 1; 2492 release = 1;
@@ -2491,7 +2500,7 @@ static void handle_lease(struct ceph_mds_client *mdsc,
2491 unsigned long duration = 2500 unsigned long duration =
2492 le32_to_cpu(h->duration_ms) * HZ / 1000; 2501 le32_to_cpu(h->duration_ms) * HZ / 1000;
2493 2502
2494 di->lease_seq = le32_to_cpu(h->seq); 2503 di->lease_seq = seq;
2495 dentry->d_time = di->lease_renew_from + duration; 2504 dentry->d_time = di->lease_renew_from + duration;
2496 di->lease_renew_after = di->lease_renew_from + 2505 di->lease_renew_after = di->lease_renew_from +
2497 (duration >> 1); 2506 (duration >> 1);
@@ -2541,7 +2550,7 @@ void ceph_mdsc_lease_send_msg(struct ceph_mds_session *session,
2541 return; 2550 return;
2542 lease = msg->front.iov_base; 2551 lease = msg->front.iov_base;
2543 lease->action = action; 2552 lease->action = action;
2544 lease->mask = cpu_to_le16(CEPH_LOCK_DN); 2553 lease->mask = cpu_to_le16(1);
2545 lease->ino = cpu_to_le64(ceph_vino(inode).ino); 2554 lease->ino = cpu_to_le64(ceph_vino(inode).ino);
2546 lease->first = lease->last = cpu_to_le64(ceph_vino(inode).snap); 2555 lease->first = lease->last = cpu_to_le64(ceph_vino(inode).snap);
2547 lease->seq = cpu_to_le32(seq); 2556 lease->seq = cpu_to_le32(seq);
@@ -2571,7 +2580,7 @@ void ceph_mdsc_lease_release(struct ceph_mds_client *mdsc, struct inode *inode,
2571 2580
2572 BUG_ON(inode == NULL); 2581 BUG_ON(inode == NULL);
2573 BUG_ON(dentry == NULL); 2582 BUG_ON(dentry == NULL);
2574 BUG_ON(mask != CEPH_LOCK_DN); 2583 BUG_ON(mask == 0);
2575 2584
2576 /* is dentry lease valid? */ 2585 /* is dentry lease valid? */
2577 spin_lock(&dentry->d_lock); 2586 spin_lock(&dentry->d_lock);
@@ -2681,10 +2690,10 @@ static void delayed_work(struct work_struct *work)
2681 send_renew_caps(mdsc, s); 2690 send_renew_caps(mdsc, s);
2682 else 2691 else
2683 ceph_con_keepalive(&s->s_con); 2692 ceph_con_keepalive(&s->s_con);
2684 add_cap_releases(mdsc, s, -1); 2693 ceph_add_cap_releases(mdsc, s, -1);
2685 if (s->s_state == CEPH_MDS_SESSION_OPEN || 2694 if (s->s_state == CEPH_MDS_SESSION_OPEN ||
2686 s->s_state == CEPH_MDS_SESSION_HUNG) 2695 s->s_state == CEPH_MDS_SESSION_HUNG)
2687 send_cap_releases(mdsc, s); 2696 ceph_send_cap_releases(mdsc, s);
2688 mutex_unlock(&s->s_mutex); 2697 mutex_unlock(&s->s_mutex);
2689 ceph_put_mds_session(s); 2698 ceph_put_mds_session(s);
2690 2699
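
The CEPH_MDS_LEASE_REVOKE change above stops the client from blindly echoing its cached lease_seq back to the MDS: h->seq is now overwritten only when the locally cached seq is strictly newer. A minimal userspace sketch of the wraparound-safe comparison this relies on, assuming ceph_seq_cmp() follows the usual serial-number idiom (signed difference of u32 values); the name seq_cmp below is illustrative, not the kernel's:

	#include <stdio.h>
	#include <stdint.h>

	/* Signed difference of two u32 sequence numbers: positive when a is
	 * newer than b, even across 2^32 wraparound. */
	static int seq_cmp(uint32_t a, uint32_t b)
	{
		return (int32_t)(a - b);
	}

	int main(void)
	{
		printf("%d\n", seq_cmp(5, 3) > 0);           /* 1: plainly newer */
		printf("%d\n", seq_cmp(2, 0xfffffffeu) > 0); /* 1: newer despite wrap */
		return 0;
	}
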
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index d9936c4f1212..b292fa42a66d 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -322,6 +322,12 @@ static inline void ceph_mdsc_put_request(struct ceph_mds_request *req)
322 kref_put(&req->r_kref, ceph_mdsc_release_request); 322 kref_put(&req->r_kref, ceph_mdsc_release_request);
323} 323}
324 324
325extern int ceph_add_cap_releases(struct ceph_mds_client *mdsc,
326 struct ceph_mds_session *session,
327 int extra);
328extern void ceph_send_cap_releases(struct ceph_mds_client *mdsc,
329 struct ceph_mds_session *session);
330
325extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc); 331extern void ceph_mdsc_pre_umount(struct ceph_mds_client *mdsc);
326 332
327extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base, 333extern char *ceph_mdsc_build_path(struct dentry *dentry, int *plen, u64 *base,
diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
index 60b74839ebec..64b8b1f7863d 100644
--- a/fs/ceph/messenger.c
+++ b/fs/ceph/messenger.c
@@ -120,6 +120,12 @@ void ceph_msgr_exit(void)
120 destroy_workqueue(ceph_msgr_wq); 120 destroy_workqueue(ceph_msgr_wq);
121} 121}
122 122
123void ceph_msgr_flush(void)
124{
125 flush_workqueue(ceph_msgr_wq);
126}
127
128
123/* 129/*
124 * socket callback functions 130 * socket callback functions
125 */ 131 */
diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h
index 00a9430b1ffc..76fbc957bc13 100644
--- a/fs/ceph/messenger.h
+++ b/fs/ceph/messenger.h
@@ -213,6 +213,7 @@ extern int ceph_parse_ips(const char *c, const char *end,
213 213
214extern int ceph_msgr_init(void); 214extern int ceph_msgr_init(void);
215extern void ceph_msgr_exit(void); 215extern void ceph_msgr_exit(void);
216extern void ceph_msgr_flush(void);
216 217
217extern struct ceph_messenger *ceph_messenger_create( 218extern struct ceph_messenger *ceph_messenger_create(
218 struct ceph_entity_addr *myaddr); 219 struct ceph_entity_addr *myaddr);
diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
index f6510a476e7e..07a539906e67 100644
--- a/fs/ceph/mon_client.c
+++ b/fs/ceph/mon_client.c
@@ -400,6 +400,8 @@ static void release_generic_request(struct kref *kref)
400 ceph_msg_put(req->reply); 400 ceph_msg_put(req->reply);
401 if (req->request) 401 if (req->request)
402 ceph_msg_put(req->request); 402 ceph_msg_put(req->request);
403
404 kfree(req);
403} 405}
404 406
405static void put_generic_request(struct ceph_mon_generic_request *req) 407static void put_generic_request(struct ceph_mon_generic_request *req)
@@ -704,8 +706,11 @@ static void handle_auth_reply(struct ceph_mon_client *monc,
704 struct ceph_msg *msg) 706 struct ceph_msg *msg)
705{ 707{
706 int ret; 708 int ret;
709 int was_auth = 0;
707 710
708 mutex_lock(&monc->mutex); 711 mutex_lock(&monc->mutex);
712 if (monc->auth->ops)
713 was_auth = monc->auth->ops->is_authenticated(monc->auth);
709 monc->pending_auth = 0; 714 monc->pending_auth = 0;
710 ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base, 715 ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base,
711 msg->front.iov_len, 716 msg->front.iov_len,
@@ -716,7 +721,7 @@ static void handle_auth_reply(struct ceph_mon_client *monc,
716 wake_up(&monc->client->auth_wq); 721 wake_up(&monc->client->auth_wq);
717 } else if (ret > 0) { 722 } else if (ret > 0) {
718 __send_prepared_auth_request(monc, ret); 723 __send_prepared_auth_request(monc, ret);
719 } else if (monc->auth->ops->is_authenticated(monc->auth)) { 724 } else if (!was_auth && monc->auth->ops->is_authenticated(monc->auth)) {
720 dout("authenticated, starting session\n"); 725 dout("authenticated, starting session\n");
721 726
722 monc->client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT; 727 monc->client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT;
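
The was_auth change above makes the "authenticated, starting session" path edge-triggered: the state is sampled before ceph_handle_auth_reply() updates it, so session setup runs only on the transition into the authenticated state, not on every reply that arrives while already authenticated. A small runnable sketch of the same pattern (all names here are illustrative, not the kernel's):

	#include <stdbool.h>
	#include <stdio.h>

	static bool authed;

	static void handle_reply(bool now_authed)
	{
		bool was_auth = authed;   /* sample state before the update */

		authed = now_authed;
		if (!was_auth && now_authed)
			printf("authenticated, starting session\n"); /* runs once */
	}

	int main(void)
	{
		handle_reply(true);   /* prints */
		handle_reply(true);   /* silent: already authenticated */
		return 0;
	}
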
diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
index afa7bb3895c4..d25b4add85b4 100644
--- a/fs/ceph/osd_client.c
+++ b/fs/ceph/osd_client.c
@@ -361,8 +361,13 @@ static void put_osd(struct ceph_osd *osd)
361{ 361{
362 dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref), 362 dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref),
363 atomic_read(&osd->o_ref) - 1); 363 atomic_read(&osd->o_ref) - 1);
364 if (atomic_dec_and_test(&osd->o_ref)) 364 if (atomic_dec_and_test(&osd->o_ref)) {
365 struct ceph_auth_client *ac = osd->o_osdc->client->monc.auth;
366
367 if (osd->o_authorizer)
368 ac->ops->destroy_authorizer(ac, osd->o_authorizer);
365 kfree(osd); 369 kfree(osd);
370 }
366} 371}
367 372
368/* 373/*
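
The put_osd() fix follows the standard drop-to-zero rule: whichever caller takes the refcount to zero owns the teardown, and dependent resources (here the authorizer) are released before the object itself is freed. A compilable userspace sketch of the shape, with atomic_fetch_sub standing in for the kernel's atomic_dec_and_test and stand-in field names:

	#include <stdatomic.h>
	#include <stdlib.h>

	struct osd {
		atomic_int o_ref;
		void *o_authorizer;   /* stands in for the auth handle */
	};

	static void put_osd(struct osd *osd)
	{
		/* fetch_sub returns the old value; 1 means we dropped the last ref */
		if (atomic_fetch_sub(&osd->o_ref, 1) == 1) {
			free(osd->o_authorizer);   /* dependents first */
			free(osd);
		}
	}

	int main(void)
	{
		struct osd *osd = calloc(1, sizeof(*osd));
		atomic_init(&osd->o_ref, 1);
		osd->o_authorizer = malloc(16);
		put_osd(osd);   /* last ref: frees authorizer, then osd */
		return 0;
	}
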
diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c
index cfdd8f4388b7..ddc656fb5c05 100644
--- a/fs/ceph/osdmap.c
+++ b/fs/ceph/osdmap.c
@@ -706,7 +706,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
706 len, *p, end); 706 len, *p, end);
707 newcrush = crush_decode(*p, min(*p+len, end)); 707 newcrush = crush_decode(*p, min(*p+len, end));
708 if (IS_ERR(newcrush)) 708 if (IS_ERR(newcrush))
709 return ERR_PTR(PTR_ERR(newcrush)); 709 return ERR_CAST(newcrush);
710 } 710 }
711 711
712 /* new flags? */ 712 /* new flags? */
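
ERR_CAST() is the idiomatic spelling when an encoded error pointer merely changes pointer type: ERR_PTR(PTR_ERR(x)) decodes the error to a long and re-encodes it, while ERR_CAST(x) is a plain cast of the same bits. A self-contained sketch of the encoding, simplified from the <linux/err.h> idiom (MAX_ERRNO and the macros below are pared down for illustration):

	#include <stdio.h>

	#define MAX_ERRNO	4095
	#define ERR_PTR(err)	((void *)(long)(err))
	#define PTR_ERR(ptr)	((long)(ptr))
	#define IS_ERR(ptr)	((unsigned long)(ptr) >= (unsigned long)-MAX_ERRNO)
	#define ERR_CAST(ptr)	((void *)(ptr))

	int main(void)
	{
		void *crush = ERR_PTR(-12);    /* e.g. -ENOMEM from a decoder */
		void *map   = ERR_CAST(crush); /* same bits, new pointer type */

		printf("%d %ld\n", IS_ERR(map), PTR_ERR(map)); /* 1 -12 */
		return 0;
	}
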
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index 7c663d9b9f81..fa87f51e38e1 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -89,7 +89,7 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
89 89
90 buf->f_files = le64_to_cpu(st.num_objects); 90 buf->f_files = le64_to_cpu(st.num_objects);
91 buf->f_ffree = -1; 91 buf->f_ffree = -1;
92 buf->f_namelen = PATH_MAX; 92 buf->f_namelen = NAME_MAX;
93 buf->f_frsize = PAGE_CACHE_SIZE; 93 buf->f_frsize = PAGE_CACHE_SIZE;
94 94
95 /* leave fsid little-endian, regardless of host endianness */ 95 /* leave fsid little-endian, regardless of host endianness */
@@ -669,9 +669,17 @@ static void ceph_destroy_client(struct ceph_client *client)
669 669
670 /* unmount */ 670 /* unmount */
671 ceph_mdsc_stop(&client->mdsc); 671 ceph_mdsc_stop(&client->mdsc);
672 ceph_monc_stop(&client->monc);
673 ceph_osdc_stop(&client->osdc); 672 ceph_osdc_stop(&client->osdc);
674 673
674 /*
675 * make sure mds and osd connections close out before destroying
676 * the auth module, which is needed to free those connections'
677 * ceph_authorizers.
678 */
679 ceph_msgr_flush();
680
681 ceph_monc_stop(&client->monc);
682
675 ceph_adjust_min_caps(-client->min_caps); 683 ceph_adjust_min_caps(-client->min_caps);
676 684
677 ceph_debugfs_client_cleanup(client); 685 ceph_debugfs_client_cleanup(client);
@@ -738,7 +746,7 @@ static struct dentry *open_root_dentry(struct ceph_client *client,
738 dout("open_root_inode opening '%s'\n", path); 746 dout("open_root_inode opening '%s'\n", path);
739 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS); 747 req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
740 if (IS_ERR(req)) 748 if (IS_ERR(req))
741 return ERR_PTR(PTR_ERR(req)); 749 return ERR_CAST(req);
742 req->r_path1 = kstrdup(path, GFP_NOFS); 750 req->r_path1 = kstrdup(path, GFP_NOFS);
743 req->r_ino1.ino = CEPH_INO_ROOT; 751 req->r_ino1.ino = CEPH_INO_ROOT;
744 req->r_ino1.snap = CEPH_NOSNAP; 752 req->r_ino1.snap = CEPH_NOSNAP;
@@ -918,7 +926,7 @@ static int ceph_compare_super(struct super_block *sb, void *data)
918/* 926/*
919 * construct our own bdi so we can control readahead, etc. 927 * construct our own bdi so we can control readahead, etc.
920 */ 928 */
921static atomic_long_t bdi_seq = ATOMIC_INIT(0); 929static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
922 930
923static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client) 931static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client)
924{ 932{
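
The reordering in ceph_destroy_client() encodes a dependency rule worth spelling out: stop the producers of connection work (mds, osd), drain the messenger workqueue so nothing in flight still needs the auth module, and only then stop the monitor client that owns it. A stubbed, compilable sketch of that ordering (every name below is an illustrative stand-in):

	#include <stdio.h>

	static void mdsc_stop(void)  { puts("mds client stopped"); }
	static void osdc_stop(void)  { puts("osd client stopped"); }
	static void msgr_flush(void) { puts("messenger workqueue drained"); }
	static void monc_stop(void)  { puts("mon client + auth destroyed"); }

	int main(void)
	{
		mdsc_stop();
		osdc_stop();
		msgr_flush(); /* barrier: in-flight con work may use authorizers */
		monc_stop();  /* safe only after the flush */
		return 0;
	}
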
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 3725c9ee9d08..10a4a406e887 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -10,7 +10,6 @@
10#include <linux/fs.h> 10#include <linux/fs.h>
11#include <linux/mempool.h> 11#include <linux/mempool.h>
12#include <linux/pagemap.h> 12#include <linux/pagemap.h>
13#include <linux/slab.h>
14#include <linux/wait.h> 13#include <linux/wait.h>
15#include <linux/writeback.h> 14#include <linux/writeback.h>
16#include <linux/slab.h> 15#include <linux/slab.h>
@@ -811,7 +810,7 @@ extern void ceph_put_cap(struct ceph_cap *cap);
811 810
812extern void ceph_queue_caps_release(struct inode *inode); 811extern void ceph_queue_caps_release(struct inode *inode);
813extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc); 812extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc);
814extern int ceph_fsync(struct file *file, struct dentry *dentry, int datasync); 813extern int ceph_fsync(struct file *file, int datasync);
815extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, 814extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
816 struct ceph_mds_session *session); 815 struct ceph_mds_session *session);
817extern int ceph_get_cap_mds(struct inode *inode); 816extern int ceph_get_cap_mds(struct inode *inode);
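
The ceph_fsync prototype change here is part of a tree-wide conversion visible throughout this merge (cifs, coda, ecryptfs, exofs, ext2 below): ->fsync loses its dentry argument, and implementations derive what they need from the file instead. A minimal sketch of the adaptation, using stand-in types rather than the real VFS structs:

	#include <stdio.h>

	struct inode { unsigned long i_ino; };
	struct file  { struct inode *host; };  /* stands in for f_mapping->host */

	/* old: int my_fsync(struct file *file, struct dentry *dentry, int datasync) */
	static int my_fsync(struct file *file, int datasync)
	{
		struct inode *inode = file->host;  /* was dentry->d_inode */

		printf("fsync ino %lu datasync=%d\n", inode->i_ino, datasync);
		return 0;
	}

	int main(void)
	{
		struct inode ino = { .i_ino = 42 };
		struct file f = { .host = &ino };
		return my_fsync(&f, 1);
	}
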
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 0242ff9cbf41..a7eb65c84b1c 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -84,7 +84,7 @@ extern ssize_t cifs_user_read(struct file *file, char __user *read_data,
84extern ssize_t cifs_user_write(struct file *file, const char __user *write_data, 84extern ssize_t cifs_user_write(struct file *file, const char __user *write_data,
85 size_t write_size, loff_t *poffset); 85 size_t write_size, loff_t *poffset);
86extern int cifs_lock(struct file *, int, struct file_lock *); 86extern int cifs_lock(struct file *, int, struct file_lock *);
87extern int cifs_fsync(struct file *, struct dentry *, int); 87extern int cifs_fsync(struct file *, int);
88extern int cifs_flush(struct file *, fl_owner_t id); 88extern int cifs_flush(struct file *, fl_owner_t id);
89extern int cifs_file_mmap(struct file * , struct vm_area_struct *); 89extern int cifs_file_mmap(struct file * , struct vm_area_struct *);
90extern const struct file_operations cifs_dir_ops; 90extern const struct file_operations cifs_dir_ops;
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index a83541ec9713..75541af4b3db 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1676,7 +1676,7 @@ static int cifs_write_end(struct file *file, struct address_space *mapping,
1676 return rc; 1676 return rc;
1677} 1677}
1678 1678
1679int cifs_fsync(struct file *file, struct dentry *dentry, int datasync) 1679int cifs_fsync(struct file *file, int datasync)
1680{ 1680{
1681 int xid; 1681 int xid;
1682 int rc = 0; 1682 int rc = 0;
@@ -1688,7 +1688,7 @@ int cifs_fsync(struct file *file, struct dentry *dentry, int datasync)
1688 xid = GetXid(); 1688 xid = GetXid();
1689 1689
1690 cFYI(1, "Sync file - name: %s datasync: 0x%x", 1690 cFYI(1, "Sync file - name: %s datasync: 0x%x",
1691 dentry->d_name.name, datasync); 1691 file->f_path.dentry->d_name.name, datasync);
1692 1692
1693 rc = filemap_write_and_wait(inode->i_mapping); 1693 rc = filemap_write_and_wait(inode->i_mapping);
1694 if (rc == 0) { 1694 if (rc == 0) {
@@ -1952,6 +1952,7 @@ static void cifs_copy_cache_pages(struct address_space *mapping,
1952 bytes_read -= PAGE_CACHE_SIZE; 1952 bytes_read -= PAGE_CACHE_SIZE;
1953 continue; 1953 continue;
1954 } 1954 }
1955 page_cache_release(page);
1955 1956
1956 target = kmap_atomic(page, KM_USER0); 1957 target = kmap_atomic(page, KM_USER0);
1957 1958
diff --git a/fs/coda/coda_int.h b/fs/coda/coda_int.h
index d99860a33890..6b443ff43a19 100644
--- a/fs/coda/coda_int.h
+++ b/fs/coda/coda_int.h
@@ -11,8 +11,7 @@ extern int coda_fake_statfs;
11 11
12void coda_destroy_inodecache(void); 12void coda_destroy_inodecache(void);
13int coda_init_inodecache(void); 13int coda_init_inodecache(void);
14int coda_fsync(struct file *coda_file, struct dentry *coda_dentry, 14int coda_fsync(struct file *coda_file, int datasync);
15 int datasync);
16void coda_sysctl_init(void); 15void coda_sysctl_init(void);
17void coda_sysctl_clean(void); 16void coda_sysctl_clean(void);
18 17
diff --git a/fs/coda/file.c b/fs/coda/file.c
index 7196077b1688..ad3cd2abeeb4 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -202,10 +202,10 @@ int coda_release(struct inode *coda_inode, struct file *coda_file)
202 return 0; 202 return 0;
203} 203}
204 204
205int coda_fsync(struct file *coda_file, struct dentry *coda_dentry, int datasync) 205int coda_fsync(struct file *coda_file, int datasync)
206{ 206{
207 struct file *host_file; 207 struct file *host_file;
208 struct inode *coda_inode = coda_dentry->d_inode; 208 struct inode *coda_inode = coda_file->f_path.dentry->d_inode;
209 struct coda_file_info *cfi; 209 struct coda_file_info *cfi;
210 int err = 0; 210 int err = 0;
211 211
diff --git a/fs/compat.c b/fs/compat.c
index 05448730f840..6490d2134ff3 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -568,6 +568,79 @@ out:
568 return ret; 568 return ret;
569} 569}
570 570
571/* A write operation does a read from user space and vice versa */
572#define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ)
573
574ssize_t compat_rw_copy_check_uvector(int type,
575 const struct compat_iovec __user *uvector, unsigned long nr_segs,
576 unsigned long fast_segs, struct iovec *fast_pointer,
577 struct iovec **ret_pointer)
578{
579 compat_ssize_t tot_len;
580 struct iovec *iov = *ret_pointer = fast_pointer;
581 ssize_t ret = 0;
582 int seg;
583
584 /*
585 * SuS says "The readv() function *may* fail if the iovcnt argument
586 * was less than or equal to 0, or greater than {IOV_MAX}. Linux has
587 * traditionally returned zero for zero segments, so...
588 */
589 if (nr_segs == 0)
590 goto out;
591
592 ret = -EINVAL;
593 if (nr_segs > UIO_MAXIOV || nr_segs < 0)
594 goto out;
595 if (nr_segs > fast_segs) {
596 ret = -ENOMEM;
597 iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
598 if (iov == NULL) {
599 *ret_pointer = fast_pointer;
600 goto out;
601 }
602 }
603 *ret_pointer = iov;
604
605 /*
606 * Single unix specification:
607 * We should -EINVAL if an element length is not >= 0 and fitting an
608 * ssize_t. The total length is fitting an ssize_t
609 *
610 * Be careful here because iov_len is a size_t not an ssize_t
611 */
612 tot_len = 0;
613 ret = -EINVAL;
614 for (seg = 0; seg < nr_segs; seg++) {
615 compat_ssize_t tmp = tot_len;
616 compat_uptr_t buf;
617 compat_ssize_t len;
618
619 if (__get_user(len, &uvector->iov_len) ||
620 __get_user(buf, &uvector->iov_base)) {
621 ret = -EFAULT;
622 goto out;
623 }
624 if (len < 0) /* size_t not fitting in compat_ssize_t .. */
625 goto out;
626 tot_len += len;
627 if (tot_len < tmp) /* maths overflow on the compat_ssize_t */
628 goto out;
629 if (!access_ok(vrfy_dir(type), compat_ptr(buf), len)) {
630 ret = -EFAULT;
631 goto out;
632 }
633 iov->iov_base = compat_ptr(buf);
634 iov->iov_len = (compat_size_t) len;
635 uvector++;
636 iov++;
637 }
638 ret = tot_len;
639
640out:
641 return ret;
642}
643
571static inline long 644static inline long
572copy_iocb(long nr, u32 __user *ptr32, struct iocb __user * __user *ptr64) 645copy_iocb(long nr, u32 __user *ptr32, struct iocb __user * __user *ptr64)
573{ 646{
@@ -600,7 +673,7 @@ compat_sys_io_submit(aio_context_t ctx_id, int nr, u32 __user *iocb)
600 iocb64 = compat_alloc_user_space(nr * sizeof(*iocb64)); 673 iocb64 = compat_alloc_user_space(nr * sizeof(*iocb64));
601 ret = copy_iocb(nr, iocb, iocb64); 674 ret = copy_iocb(nr, iocb, iocb64);
602 if (!ret) 675 if (!ret)
603 ret = sys_io_submit(ctx_id, nr, iocb64); 676 ret = do_io_submit(ctx_id, nr, iocb64, 1);
604 return ret; 677 return ret;
605} 678}
606 679
@@ -1077,70 +1150,21 @@ static ssize_t compat_do_readv_writev(int type, struct file *file,
1077{ 1150{
1078 compat_ssize_t tot_len; 1151 compat_ssize_t tot_len;
1079 struct iovec iovstack[UIO_FASTIOV]; 1152 struct iovec iovstack[UIO_FASTIOV];
1080 struct iovec *iov=iovstack, *vector; 1153 struct iovec *iov;
1081 ssize_t ret; 1154 ssize_t ret;
1082 int seg;
1083 io_fn_t fn; 1155 io_fn_t fn;
1084 iov_fn_t fnv; 1156 iov_fn_t fnv;
1085 1157
1086 /*
1087 * SuS says "The readv() function *may* fail if the iovcnt argument
1088 * was less than or equal to 0, or greater than {IOV_MAX}. Linux has
1089 * traditionally returned zero for zero segments, so...
1090 */
1091 ret = 0;
1092 if (nr_segs == 0)
1093 goto out;
1094
1095 /*
1096 * First get the "struct iovec" from user memory and
1097 * verify all the pointers
1098 */
1099 ret = -EINVAL; 1158 ret = -EINVAL;
1100 if ((nr_segs > UIO_MAXIOV) || (nr_segs <= 0))
1101 goto out;
1102 if (!file->f_op) 1159 if (!file->f_op)
1103 goto out; 1160 goto out;
1104 if (nr_segs > UIO_FASTIOV) { 1161
1105 ret = -ENOMEM;
1106 iov = kmalloc(nr_segs*sizeof(struct iovec), GFP_KERNEL);
1107 if (!iov)
1108 goto out;
1109 }
1110 ret = -EFAULT; 1162 ret = -EFAULT;
1111 if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector))) 1163 if (!access_ok(VERIFY_READ, uvector, nr_segs*sizeof(*uvector)))
1112 goto out; 1164 goto out;
1113 1165
1114 /* 1166 tot_len = compat_rw_copy_check_uvector(type, uvector, nr_segs,
1115 * Single unix specification: 1167 UIO_FASTIOV, iovstack, &iov);
1116 * We should -EINVAL if an element length is not >= 0 and fitting an
1117 * ssize_t. The total length is fitting an ssize_t
1118 *
1119 * Be careful here because iov_len is a size_t not an ssize_t
1120 */
1121 tot_len = 0;
1122 vector = iov;
1123 ret = -EINVAL;
1124 for (seg = 0 ; seg < nr_segs; seg++) {
1125 compat_ssize_t tmp = tot_len;
1126 compat_ssize_t len;
1127 compat_uptr_t buf;
1128
1129 if (__get_user(len, &uvector->iov_len) ||
1130 __get_user(buf, &uvector->iov_base)) {
1131 ret = -EFAULT;
1132 goto out;
1133 }
1134 if (len < 0) /* size_t not fitting an compat_ssize_t .. */
1135 goto out;
1136 tot_len += len;
1137 if (tot_len < tmp) /* maths overflow on the compat_ssize_t */
1138 goto out;
1139 vector->iov_base = compat_ptr(buf);
1140 vector->iov_len = (compat_size_t) len;
1141 uvector++;
1142 vector++;
1143 }
1144 if (tot_len == 0) { 1168 if (tot_len == 0) {
1145 ret = 0; 1169 ret = 0;
1146 goto out; 1170 goto out;
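
The factored-out compat_rw_copy_check_uvector() keeps the old overflow discipline when summing user-supplied iovec lengths. The kernel form ("tot_len += len; if (tot_len < tmp)") relies on wrapping signed arithmetic, which the kernel guarantees by building with -fno-strict-overflow; a portable userspace sketch of the same guard, checking before the addition instead:

	#include <stdio.h>
	#include <limits.h>

	int main(void)
	{
		long tot_len = 0;
		long lens[] = { LONG_MAX - 100, 200 }; /* second segment overflows */

		for (int seg = 0; seg < 2; seg++) {
			long len = lens[seg];

			if (len < 0 || len > LONG_MAX - tot_len) {
				printf("EINVAL at segment %d\n", seg); /* maths overflow */
				return 1;
			}
			tot_len += len;
		}
		printf("total %ld\n", tot_len);
		return 0;
	}
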
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index c8af2d91174b..cf78d44a8d6a 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -73,15 +73,6 @@ int configfs_setattr(struct dentry * dentry, struct iattr * iattr)
73 return -EINVAL; 73 return -EINVAL;
74 74
75 sd_iattr = sd->s_iattr; 75 sd_iattr = sd->s_iattr;
76
77 error = inode_change_ok(inode, iattr);
78 if (error)
79 return error;
80
81 error = inode_setattr(inode, iattr);
82 if (error)
83 return error;
84
85 if (!sd_iattr) { 76 if (!sd_iattr) {
86 /* setting attributes for the first time, allocate now */ 77 /* setting attributes for the first time, allocate now */
87 sd_iattr = kzalloc(sizeof(struct iattr), GFP_KERNEL); 78 sd_iattr = kzalloc(sizeof(struct iattr), GFP_KERNEL);
@@ -94,9 +85,12 @@ int configfs_setattr(struct dentry * dentry, struct iattr * iattr)
94 sd_iattr->ia_atime = sd_iattr->ia_mtime = sd_iattr->ia_ctime = CURRENT_TIME; 85 sd_iattr->ia_atime = sd_iattr->ia_mtime = sd_iattr->ia_ctime = CURRENT_TIME;
95 sd->s_iattr = sd_iattr; 86 sd->s_iattr = sd_iattr;
96 } 87 }
97
98 /* attributes were changed at least once in the past */ 88 /* attributes were changed at least once in the past */
99 89
90 error = simple_setattr(dentry, iattr);
91 if (error)
92 return error;
93
100 if (ia_valid & ATTR_UID) 94 if (ia_valid & ATTR_UID)
101 sd_iattr->ia_uid = iattr->ia_uid; 95 sd_iattr->ia_uid = iattr->ia_uid;
102 if (ia_valid & ATTR_GID) 96 if (ia_valid & ATTR_GID)
diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c
index 4d74fc72c195..0210898458b2 100644
--- a/fs/debugfs/file.c
+++ b/fs/debugfs/file.c
@@ -277,8 +277,10 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_x32, debugfs_u32_get, debugfs_u32_set, "0x%08llx\n"
277DEFINE_SIMPLE_ATTRIBUTE(fops_x32_ro, debugfs_u32_get, NULL, "0x%08llx\n"); 277DEFINE_SIMPLE_ATTRIBUTE(fops_x32_ro, debugfs_u32_get, NULL, "0x%08llx\n");
278DEFINE_SIMPLE_ATTRIBUTE(fops_x32_wo, NULL, debugfs_u32_set, "0x%08llx\n"); 278DEFINE_SIMPLE_ATTRIBUTE(fops_x32_wo, NULL, debugfs_u32_set, "0x%08llx\n");
279 279
280DEFINE_SIMPLE_ATTRIBUTE(fops_x64, debugfs_u64_get, debugfs_u64_set, "0x%016llx\n");
281
280/* 282/*
281 * debugfs_create_x{8,16,32} - create a debugfs file that is used to read and write an unsigned {8,16,32}-bit value 283 * debugfs_create_x{8,16,32,64} - create a debugfs file that is used to read and write an unsigned {8,16,32,64}-bit value
282 * 284 *
283 * These functions are exactly the same as the above functions (but use a hex 285 * These functions are exactly the same as the above functions (but use a hex
284 * output for the decimal challenged). For details look at the above unsigned 286 * output for the decimal challenged). For details look at the above unsigned
@@ -357,6 +359,23 @@ struct dentry *debugfs_create_x32(const char *name, mode_t mode,
357} 359}
358EXPORT_SYMBOL_GPL(debugfs_create_x32); 360EXPORT_SYMBOL_GPL(debugfs_create_x32);
359 361
362/**
363 * debugfs_create_x64 - create a debugfs file that is used to read and write an unsigned 64-bit value
364 * @name: a pointer to a string containing the name of the file to create.
365 * @mode: the permission that the file should have
366 * @parent: a pointer to the parent dentry for this file. This should be a
367 * directory dentry if set. If this parameter is %NULL, then the
368 * file will be created in the root of the debugfs filesystem.
369 * @value: a pointer to the variable that the file should read to and write
370 * from.
371 */
372struct dentry *debugfs_create_x64(const char *name, mode_t mode,
373 struct dentry *parent, u64 *value)
374{
375 return debugfs_create_file(name, mode, parent, value, &fops_x64);
376}
377EXPORT_SYMBOL_GPL(debugfs_create_x64);
378
360 379
361static int debugfs_size_t_set(void *data, u64 val) 380static int debugfs_size_t_set(void *data, u64 val)
362{ 381{
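
A usage sketch for the new debugfs_create_x64() helper, as a minimal module; the directory name, file name, and the my_reg variable are illustrative, not part of any real driver:

	#include <linux/module.h>
	#include <linux/debugfs.h>

	static u64 my_reg;            /* exposed as 0x%016llx */
	static struct dentry *dir;

	static int __init xdbg_init(void)
	{
		dir = debugfs_create_dir("mydrv", NULL);
		debugfs_create_x64("reg", 0644, dir, &my_reg);
		return 0;
	}

	static void __exit xdbg_exit(void)
	{
		debugfs_remove_recursive(dir);
	}

	module_init(xdbg_init);
	module_exit(xdbg_exit);
	MODULE_LICENSE("GPL");
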
diff --git a/fs/direct-io.c b/fs/direct-io.c
index e82adc2debb7..7600aacf531d 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -82,6 +82,8 @@ struct dio {
82 int reap_counter; /* rate limit reaping */ 82 int reap_counter; /* rate limit reaping */
83 get_block_t *get_block; /* block mapping function */ 83 get_block_t *get_block; /* block mapping function */
84 dio_iodone_t *end_io; /* IO completion function */ 84 dio_iodone_t *end_io; /* IO completion function */
85 dio_submit_t *submit_io; /* IO submission function */
86 loff_t logical_offset_in_bio; /* current first logical block in bio */
85 sector_t final_block_in_bio; /* current final block in bio + 1 */ 87 sector_t final_block_in_bio; /* current final block in bio + 1 */
86 sector_t next_block_for_io; /* next block to be put under IO, 88 sector_t next_block_for_io; /* next block to be put under IO,
87 in dio_blocks units */ 89 in dio_blocks units */
@@ -96,6 +98,7 @@ struct dio {
96 unsigned cur_page_offset; /* Offset into it, in bytes */ 98 unsigned cur_page_offset; /* Offset into it, in bytes */
97 unsigned cur_page_len; /* Nr of bytes at cur_page_offset */ 99 unsigned cur_page_len; /* Nr of bytes at cur_page_offset */
98 sector_t cur_page_block; /* Where it starts */ 100 sector_t cur_page_block; /* Where it starts */
101 loff_t cur_page_fs_offset; /* Offset in file */
99 102
100 /* BIO completion state */ 103 /* BIO completion state */
101 spinlock_t bio_lock; /* protects BIO fields below */ 104 spinlock_t bio_lock; /* protects BIO fields below */
@@ -300,6 +303,26 @@ static void dio_bio_end_io(struct bio *bio, int error)
300 spin_unlock_irqrestore(&dio->bio_lock, flags); 303 spin_unlock_irqrestore(&dio->bio_lock, flags);
301} 304}
302 305
306/**
307 * dio_end_io - handle the end io action for the given bio
308 * @bio: The direct io bio that's being completed
309 * @error: Error if there was one
310 *
311 * This is meant to be called by any filesystem that uses its own dio_submit_t
312 * so that the DIO specific endio actions are dealt with after the filesystem
313 * has done its completion work.
314 */
315void dio_end_io(struct bio *bio, int error)
316{
317 struct dio *dio = bio->bi_private;
318
319 if (dio->is_async)
320 dio_bio_end_aio(bio, error);
321 else
322 dio_bio_end_io(bio, error);
323}
324EXPORT_SYMBOL_GPL(dio_end_io);
325
303static int 326static int
304dio_bio_alloc(struct dio *dio, struct block_device *bdev, 327dio_bio_alloc(struct dio *dio, struct block_device *bdev,
305 sector_t first_sector, int nr_vecs) 328 sector_t first_sector, int nr_vecs)
@@ -316,6 +339,7 @@ dio_bio_alloc(struct dio *dio, struct block_device *bdev,
316 bio->bi_end_io = dio_bio_end_io; 339 bio->bi_end_io = dio_bio_end_io;
317 340
318 dio->bio = bio; 341 dio->bio = bio;
342 dio->logical_offset_in_bio = dio->cur_page_fs_offset;
319 return 0; 343 return 0;
320} 344}
321 345
@@ -340,10 +364,15 @@ static void dio_bio_submit(struct dio *dio)
340 if (dio->is_async && dio->rw == READ) 364 if (dio->is_async && dio->rw == READ)
341 bio_set_pages_dirty(bio); 365 bio_set_pages_dirty(bio);
342 366
343 submit_bio(dio->rw, bio); 367 if (dio->submit_io)
368 dio->submit_io(dio->rw, bio, dio->inode,
369 dio->logical_offset_in_bio);
370 else
371 submit_bio(dio->rw, bio);
344 372
345 dio->bio = NULL; 373 dio->bio = NULL;
346 dio->boundary = 0; 374 dio->boundary = 0;
375 dio->logical_offset_in_bio = 0;
347} 376}
348 377
349/* 378/*
@@ -603,10 +632,26 @@ static int dio_send_cur_page(struct dio *dio)
603 int ret = 0; 632 int ret = 0;
604 633
605 if (dio->bio) { 634 if (dio->bio) {
635 loff_t cur_offset = dio->block_in_file << dio->blkbits;
636 loff_t bio_next_offset = dio->logical_offset_in_bio +
637 dio->bio->bi_size;
638
606 /* 639 /*
607 * See whether this new request is contiguous with the old 640 * See whether this new request is contiguous with the old.
641 *
642 * Btrfs cannot handle having logically non-contiguous requests
643 * submitted. For example if you have
644 *
645 * Logical: [0-4095][HOLE][8192-12287]
646 * Physical: [0-4095] [4096-8191]
647 *
648 * We cannot submit those pages together as one BIO. So if our
649 * current logical offset in the file does not equal what would
650 * be the next logical offset in the bio, submit the bio we
651 * have.
608 */ 652 */
609 if (dio->final_block_in_bio != dio->cur_page_block) 653 if (dio->final_block_in_bio != dio->cur_page_block ||
654 cur_offset != bio_next_offset)
610 dio_bio_submit(dio); 655 dio_bio_submit(dio);
611 /* 656 /*
612 * Submit now if the underlying fs is about to perform a 657 * Submit now if the underlying fs is about to perform a
@@ -701,6 +746,7 @@ submit_page_section(struct dio *dio, struct page *page,
701 dio->cur_page_offset = offset; 746 dio->cur_page_offset = offset;
702 dio->cur_page_len = len; 747 dio->cur_page_len = len;
703 dio->cur_page_block = blocknr; 748 dio->cur_page_block = blocknr;
749 dio->cur_page_fs_offset = dio->block_in_file << dio->blkbits;
704out: 750out:
705 return ret; 751 return ret;
706} 752}
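
The numbers behind the new contiguity test in dio_send_cur_page() are worth working once. With 4K blocks (blkbits = 12), a bio whose first logical offset is 0 and which already holds one 4096-byte page can only absorb a page at file offset 4096; the hole example in the comment puts the next page at 8192, so the bio must be submitted first. A runnable sketch of exactly that check (the variables mirror the dio fields but are plain locals here):

	#include <stdio.h>

	int main(void)
	{
		unsigned blkbits = 12;                /* 4096-byte blocks */
		long long logical_offset_in_bio = 0;  /* bio starts at file offset 0 */
		unsigned bi_size = 4096;              /* one page already queued */
		long long block_in_file = 2;          /* next page: block 2 = offset 8192 */

		long long cur_offset = block_in_file << blkbits;
		long long bio_next_offset = logical_offset_in_bio + bi_size;

		if (cur_offset != bio_next_offset)    /* 8192 != 4096: hole */
			printf("submit bio before adding page at %lld\n", cur_offset);
		return 0;
	}
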
@@ -935,7 +981,7 @@ static ssize_t
935direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, 981direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
936 const struct iovec *iov, loff_t offset, unsigned long nr_segs, 982 const struct iovec *iov, loff_t offset, unsigned long nr_segs,
937 unsigned blkbits, get_block_t get_block, dio_iodone_t end_io, 983 unsigned blkbits, get_block_t get_block, dio_iodone_t end_io,
938 struct dio *dio) 984 dio_submit_t submit_io, struct dio *dio)
939{ 985{
940 unsigned long user_addr; 986 unsigned long user_addr;
941 unsigned long flags; 987 unsigned long flags;
@@ -952,6 +998,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
952 998
953 dio->get_block = get_block; 999 dio->get_block = get_block;
954 dio->end_io = end_io; 1000 dio->end_io = end_io;
1001 dio->submit_io = submit_io;
955 dio->final_block_in_bio = -1; 1002 dio->final_block_in_bio = -1;
956 dio->next_block_for_io = -1; 1003 dio->next_block_for_io = -1;
957 1004
@@ -1008,7 +1055,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
1008 } 1055 }
1009 } /* end iovec loop */ 1056 } /* end iovec loop */
1010 1057
1011 if (ret == -ENOTBLK && (rw & WRITE)) { 1058 if (ret == -ENOTBLK) {
1012 /* 1059 /*
1013 * The remaining part of the request will be 1060 * The remaining part of the request will be
1014 * be handled by buffered I/O when we return 1061 * be handled by buffered I/O when we return
@@ -1087,30 +1134,11 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
1087 return ret; 1134 return ret;
1088} 1135}
1089 1136
1090/*
1091 * This is a library function for use by filesystem drivers.
1092 *
1093 * The locking rules are governed by the flags parameter:
1094 * - if the flags value contains DIO_LOCKING we use a fancy locking
1095 * scheme for dumb filesystems.
1096 * For writes this function is called under i_mutex and returns with
1097 * i_mutex held, for reads, i_mutex is not held on entry, but it is
1098 * taken and dropped again before returning.
1099 * For reads and writes i_alloc_sem is taken in shared mode and released
1100 * on I/O completion (which may happen asynchronously after returning to
1101 * the caller).
1102 *
1103 * - if the flags value does NOT contain DIO_LOCKING we don't use any
1104 * internal locking but rather rely on the filesystem to synchronize
1105 * direct I/O reads/writes versus each other and truncate.
1106 * For reads and writes both i_mutex and i_alloc_sem are not held on
1107 * entry and are never taken.
1108 */
1109ssize_t 1137ssize_t
1110__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, 1138__blockdev_direct_IO_newtrunc(int rw, struct kiocb *iocb, struct inode *inode,
1111 struct block_device *bdev, const struct iovec *iov, loff_t offset, 1139 struct block_device *bdev, const struct iovec *iov, loff_t offset,
1112 unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io, 1140 unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
1113 int flags) 1141 dio_submit_t submit_io, int flags)
1114{ 1142{
1115 int seg; 1143 int seg;
1116 size_t size; 1144 size_t size;
@@ -1197,11 +1225,49 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1197 (end > i_size_read(inode))); 1225 (end > i_size_read(inode)));
1198 1226
1199 retval = direct_io_worker(rw, iocb, inode, iov, offset, 1227 retval = direct_io_worker(rw, iocb, inode, iov, offset,
1200 nr_segs, blkbits, get_block, end_io, dio); 1228 nr_segs, blkbits, get_block, end_io,
1229 submit_io, dio);
1230
1231out:
1232 return retval;
1233}
1234EXPORT_SYMBOL(__blockdev_direct_IO_newtrunc);
1235
1236/*
1237 * This is a library function for use by filesystem drivers.
1238 *
1239 * The locking rules are governed by the flags parameter:
1240 * - if the flags value contains DIO_LOCKING we use a fancy locking
1241 * scheme for dumb filesystems.
1242 * For writes this function is called under i_mutex and returns with
1243 * i_mutex held, for reads, i_mutex is not held on entry, but it is
1244 * taken and dropped again before returning.
1245 * For reads and writes i_alloc_sem is taken in shared mode and released
1246 * on I/O completion (which may happen asynchronously after returning to
1247 * the caller).
1248 *
1249 * - if the flags value does NOT contain DIO_LOCKING we don't use any
1250 * internal locking but rather rely on the filesystem to synchronize
1251 * direct I/O reads/writes versus each other and truncate.
1252 * For reads and writes both i_mutex and i_alloc_sem are not held on
1253 * entry and are never taken.
1254 */
1255ssize_t
1256__blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1257 struct block_device *bdev, const struct iovec *iov, loff_t offset,
1258 unsigned long nr_segs, get_block_t get_block, dio_iodone_t end_io,
1259 dio_submit_t submit_io, int flags)
1260{
1261 ssize_t retval;
1201 1262
1263 retval = __blockdev_direct_IO_newtrunc(rw, iocb, inode, bdev, iov,
1264 offset, nr_segs, get_block, end_io, submit_io, flags);
1202 /* 1265 /*
1203 * In case of error extending write may have instantiated a few 1266 * In case of error extending write may have instantiated a few
1204 * blocks outside i_size. Trim these off again for DIO_LOCKING. 1267 * blocks outside i_size. Trim these off again for DIO_LOCKING.
1268 * NOTE: DIO_NO_LOCK/DIO_OWN_LOCK callers have to handle this in
1269 * their own manner. This is a further example of where the old
1270 * truncate sequence is inadequate.
1205 * 1271 *
1206 * NOTE: filesystems with their own locking have to handle this 1272 * NOTE: filesystems with their own locking have to handle this
1207 * on their own. 1273 * on their own.
@@ -1209,12 +1275,13 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
1209 if (flags & DIO_LOCKING) { 1275 if (flags & DIO_LOCKING) {
1210 if (unlikely((rw & WRITE) && retval < 0)) { 1276 if (unlikely((rw & WRITE) && retval < 0)) {
1211 loff_t isize = i_size_read(inode); 1277 loff_t isize = i_size_read(inode);
1278 loff_t end = offset + iov_length(iov, nr_segs);
1279
1212 if (end > isize) 1280 if (end > isize)
1213 vmtruncate(inode, isize); 1281 vmtruncate(inode, isize);
1214 } 1282 }
1215 } 1283 }
1216 1284
1217out:
1218 return retval; 1285 return retval;
1219} 1286}
1220EXPORT_SYMBOL(__blockdev_direct_IO); 1287EXPORT_SYMBOL(__blockdev_direct_IO);
diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c
index 3bdddbcc785f..e8fcf4e2ed7d 100644
--- a/fs/ecryptfs/file.c
+++ b/fs/ecryptfs/file.c
@@ -274,7 +274,7 @@ static int ecryptfs_release(struct inode *inode, struct file *file)
274} 274}
275 275
276static int 276static int
277ecryptfs_fsync(struct file *file, struct dentry *dentry, int datasync) 277ecryptfs_fsync(struct file *file, int datasync)
278{ 278{
279 return vfs_fsync(ecryptfs_file_to_lower(file), datasync); 279 return vfs_fsync(ecryptfs_file_to_lower(file), datasync);
280} 280}
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index 65dee2f336ae..31ef5252f0fe 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -805,7 +805,7 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
805 - (ia->ia_size & ~PAGE_CACHE_MASK)); 805 - (ia->ia_size & ~PAGE_CACHE_MASK));
806 806
807 if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { 807 if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) {
808 rc = vmtruncate(inode, ia->ia_size); 808 rc = simple_setsize(inode, ia->ia_size);
809 if (rc) 809 if (rc)
810 goto out; 810 goto out;
811 lower_ia->ia_size = ia->ia_size; 811 lower_ia->ia_size = ia->ia_size;
@@ -830,7 +830,7 @@ static int truncate_upper(struct dentry *dentry, struct iattr *ia,
830 goto out; 830 goto out;
831 } 831 }
832 } 832 }
833 vmtruncate(inode, ia->ia_size); 833 simple_setsize(inode, ia->ia_size);
834 rc = ecryptfs_write_inode_size_to_metadata(inode); 834 rc = ecryptfs_write_inode_size_to_metadata(inode);
835 if (rc) { 835 if (rc) {
836 printk(KERN_ERR "Problem with " 836 printk(KERN_ERR "Problem with "
diff --git a/fs/exec.c b/fs/exec.c
index 9badbc0bfb1d..e19de6a80339 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -768,7 +768,6 @@ static int de_thread(struct task_struct *tsk)
768 struct signal_struct *sig = tsk->signal; 768 struct signal_struct *sig = tsk->signal;
769 struct sighand_struct *oldsighand = tsk->sighand; 769 struct sighand_struct *oldsighand = tsk->sighand;
770 spinlock_t *lock = &oldsighand->siglock; 770 spinlock_t *lock = &oldsighand->siglock;
771 int count;
772 771
773 if (thread_group_empty(tsk)) 772 if (thread_group_empty(tsk))
774 goto no_thread_group; 773 goto no_thread_group;
@@ -785,13 +784,13 @@ static int de_thread(struct task_struct *tsk)
785 spin_unlock_irq(lock); 784 spin_unlock_irq(lock);
786 return -EAGAIN; 785 return -EAGAIN;
787 } 786 }
787
788 sig->group_exit_task = tsk; 788 sig->group_exit_task = tsk;
789 zap_other_threads(tsk); 789 sig->notify_count = zap_other_threads(tsk);
790 if (!thread_group_leader(tsk))
791 sig->notify_count--;
790 792
791 /* Account for the thread group leader hanging around: */ 793 while (sig->notify_count) {
792 count = thread_group_leader(tsk) ? 1 : 2;
793 sig->notify_count = count;
794 while (atomic_read(&sig->count) > count) {
795 __set_current_state(TASK_UNINTERRUPTIBLE); 794 __set_current_state(TASK_UNINTERRUPTIBLE);
796 spin_unlock_irq(lock); 795 spin_unlock_irq(lock);
797 schedule(); 796 schedule();
@@ -1662,12 +1661,15 @@ static int coredump_wait(int exit_code, struct core_state *core_state)
1662 struct task_struct *tsk = current; 1661 struct task_struct *tsk = current;
1663 struct mm_struct *mm = tsk->mm; 1662 struct mm_struct *mm = tsk->mm;
1664 struct completion *vfork_done; 1663 struct completion *vfork_done;
1665 int core_waiters; 1664 int core_waiters = -EBUSY;
1666 1665
1667 init_completion(&core_state->startup); 1666 init_completion(&core_state->startup);
1668 core_state->dumper.task = tsk; 1667 core_state->dumper.task = tsk;
1669 core_state->dumper.next = NULL; 1668 core_state->dumper.next = NULL;
1670 core_waiters = zap_threads(tsk, mm, core_state, exit_code); 1669
1670 down_write(&mm->mmap_sem);
1671 if (!mm->core_state)
1672 core_waiters = zap_threads(tsk, mm, core_state, exit_code);
1671 up_write(&mm->mmap_sem); 1673 up_write(&mm->mmap_sem);
1672 1674
1673 if (unlikely(core_waiters < 0)) 1675 if (unlikely(core_waiters < 0))
@@ -1787,21 +1789,61 @@ static void wait_for_dump_helpers(struct file *file)
1787} 1789}
1788 1790
1789 1791
1792/*
1793 * umh_pipe_setup
1794 * helper function to customize the process used
1795 * to collect the core in userspace. Specifically
1796 * it sets up a pipe and installs it as fd 0 (stdin)
1797 * for the process. Returns 0 on success, or a
1798 * negative error code (via PTR_ERR) on failure.
1799 * Note that it also sets the core limit to 1. This
1800 * is a special value that we use to trap recursive
1801 * core dumps
1802 */
1803static int umh_pipe_setup(struct subprocess_info *info)
1804{
1805 struct file *rp, *wp;
1806 struct fdtable *fdt;
1807 struct coredump_params *cp = (struct coredump_params *)info->data;
1808 struct files_struct *cf = current->files;
1809
1810 wp = create_write_pipe(0);
1811 if (IS_ERR(wp))
1812 return PTR_ERR(wp);
1813
1814 rp = create_read_pipe(wp, 0);
1815 if (IS_ERR(rp)) {
1816 free_write_pipe(wp);
1817 return PTR_ERR(rp);
1818 }
1819
1820 cp->file = wp;
1821
1822 sys_close(0);
1823 fd_install(0, rp);
1824 spin_lock(&cf->file_lock);
1825 fdt = files_fdtable(cf);
1826 FD_SET(0, fdt->open_fds);
1827 FD_CLR(0, fdt->close_on_exec);
1828 spin_unlock(&cf->file_lock);
1829
1830 /* and disallow core files too */
1831 current->signal->rlim[RLIMIT_CORE] = (struct rlimit){1, 1};
1832
1833 return 0;
1834}
1835
1790void do_coredump(long signr, int exit_code, struct pt_regs *regs) 1836void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1791{ 1837{
1792 struct core_state core_state; 1838 struct core_state core_state;
1793 char corename[CORENAME_MAX_SIZE + 1]; 1839 char corename[CORENAME_MAX_SIZE + 1];
1794 struct mm_struct *mm = current->mm; 1840 struct mm_struct *mm = current->mm;
1795 struct linux_binfmt * binfmt; 1841 struct linux_binfmt * binfmt;
1796 struct inode * inode;
1797 const struct cred *old_cred; 1842 const struct cred *old_cred;
1798 struct cred *cred; 1843 struct cred *cred;
1799 int retval = 0; 1844 int retval = 0;
1800 int flag = 0; 1845 int flag = 0;
1801 int ispipe = 0; 1846 int ispipe;
1802 char **helper_argv = NULL;
1803 int helper_argc = 0;
1804 int dump_count = 0;
1805 static atomic_t core_dump_count = ATOMIC_INIT(0); 1847 static atomic_t core_dump_count = ATOMIC_INIT(0);
1806 struct coredump_params cprm = { 1848 struct coredump_params cprm = {
1807 .signr = signr, 1849 .signr = signr,
@@ -1820,23 +1862,12 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1820 binfmt = mm->binfmt; 1862 binfmt = mm->binfmt;
1821 if (!binfmt || !binfmt->core_dump) 1863 if (!binfmt || !binfmt->core_dump)
1822 goto fail; 1864 goto fail;
1823 1865 if (!__get_dumpable(cprm.mm_flags))
1824 cred = prepare_creds();
1825 if (!cred) {
1826 retval = -ENOMEM;
1827 goto fail; 1866 goto fail;
1828 }
1829 1867
1830 down_write(&mm->mmap_sem); 1868 cred = prepare_creds();
1831 /* 1869 if (!cred)
1832 * If another thread got here first, or we are not dumpable, bail out.
1833 */
1834 if (mm->core_state || !__get_dumpable(cprm.mm_flags)) {
1835 up_write(&mm->mmap_sem);
1836 put_cred(cred);
1837 goto fail; 1870 goto fail;
1838 }
1839
1840 /* 1871 /*
1841 * We cannot trust fsuid as being the "true" uid of the 1872 * We cannot trust fsuid as being the "true" uid of the
1842 * process nor do we know its entire history. We only know it 1873 * process nor do we know its entire history. We only know it
@@ -1849,10 +1880,8 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1849 } 1880 }
1850 1881
1851 retval = coredump_wait(exit_code, &core_state); 1882 retval = coredump_wait(exit_code, &core_state);
1852 if (retval < 0) { 1883 if (retval < 0)
1853 put_cred(cred); 1884 goto fail_creds;
1854 goto fail;
1855 }
1856 1885
1857 old_cred = override_creds(cred); 1886 old_cred = override_creds(cred);
1858 1887
@@ -1870,19 +1899,19 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1870 ispipe = format_corename(corename, signr); 1899 ispipe = format_corename(corename, signr);
1871 unlock_kernel(); 1900 unlock_kernel();
1872 1901
1873 if ((!ispipe) && (cprm.limit < binfmt->min_coredump))
1874 goto fail_unlock;
1875
1876 if (ispipe) { 1902 if (ispipe) {
1877 if (cprm.limit == 0) { 1903 int dump_count;
1904 char **helper_argv;
1905
1906 if (cprm.limit == 1) {
1878 /* 1907 /*
1879 * Normally core limits are irrelevant to pipes, since 1908 * Normally core limits are irrelevant to pipes, since
1880 * we're not writing to the file system, but we use 1909 * we're not writing to the file system, but we use
1881 * cprm.limit of 0 here as a special value. Any 1910 * cprm.limit of 1 here as a special value. Any
1882 * non-zero limit gets set to RLIM_INFINITY below, but 1911 * non-1 limit gets set to RLIM_INFINITY below, but
1883 * a limit of 0 skips the dump. This is a consistent 1912 * a limit of 0 skips the dump. This is a consistent
1884 * way to catch recursive crashes. We can still crash 1913 * way to catch recursive crashes. We can still crash
1885 * if the core_pattern binary sets RLIM_CORE = !0 1914 * if the core_pattern binary sets RLIM_CORE = !1
1886 * but it runs as root, and can do lots of stupid things 1915 * but it runs as root, and can do lots of stupid things
1887 * Note that we use task_tgid_vnr here to grab the pid 1916 * Note that we use task_tgid_vnr here to grab the pid
1888 * of the process group leader. That way we get the 1917 * of the process group leader. That way we get the
@@ -1890,11 +1919,12 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1890 * core_pattern process dies. 1919 * core_pattern process dies.
1891 */ 1920 */
1892 printk(KERN_WARNING 1921 printk(KERN_WARNING
1893 "Process %d(%s) has RLIMIT_CORE set to 0\n", 1922 "Process %d(%s) has RLIMIT_CORE set to 1\n",
1894 task_tgid_vnr(current), current->comm); 1923 task_tgid_vnr(current), current->comm);
1895 printk(KERN_WARNING "Aborting core\n"); 1924 printk(KERN_WARNING "Aborting core\n");
1896 goto fail_unlock; 1925 goto fail_unlock;
1897 } 1926 }
1927 cprm.limit = RLIM_INFINITY;
1898 1928
1899 dump_count = atomic_inc_return(&core_dump_count); 1929 dump_count = atomic_inc_return(&core_dump_count);
1900 if (core_pipe_limit && (core_pipe_limit < dump_count)) { 1930 if (core_pipe_limit && (core_pipe_limit < dump_count)) {
@@ -1904,71 +1934,74 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs)
1904 goto fail_dropcount; 1934 goto fail_dropcount;
1905 } 1935 }
1906 1936
1907 helper_argv = argv_split(GFP_KERNEL, corename+1, &helper_argc); 1937 helper_argv = argv_split(GFP_KERNEL, corename+1, NULL);
1908 if (!helper_argv) { 1938 if (!helper_argv) {
1909 printk(KERN_WARNING "%s failed to allocate memory\n", 1939 printk(KERN_WARNING "%s failed to allocate memory\n",
1910 __func__); 1940 __func__);
1911 goto fail_dropcount; 1941 goto fail_dropcount;
1912 } 1942 }
1913 1943
1914 cprm.limit = RLIM_INFINITY; 1944 retval = call_usermodehelper_fns(helper_argv[0], helper_argv,
1915 1945 NULL, UMH_WAIT_EXEC, umh_pipe_setup,
1916 /* SIGPIPE can happen, but it's just never processed */ 1946 NULL, &cprm);
1917 if (call_usermodehelper_pipe(helper_argv[0], helper_argv, NULL, 1947 argv_free(helper_argv);
1918 &cprm.file)) { 1948 if (retval) {
1919 printk(KERN_INFO "Core dump to %s pipe failed\n", 1949 printk(KERN_INFO "Core dump to %s pipe failed\n",
1920 corename); 1950 corename);
1921 goto fail_dropcount; 1951 goto close_fail;
1922 } 1952 }
1923 } else 1953 } else {
1954 struct inode *inode;
1955
1956 if (cprm.limit < binfmt->min_coredump)
1957 goto fail_unlock;
1958
1924 cprm.file = filp_open(corename, 1959 cprm.file = filp_open(corename,
1925 O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag, 1960 O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag,
1926 0600); 1961 0600);
1927 if (IS_ERR(cprm.file)) 1962 if (IS_ERR(cprm.file))
1928 goto fail_dropcount; 1963 goto fail_unlock;
1929 inode = cprm.file->f_path.dentry->d_inode;
1930 if (inode->i_nlink > 1)
1931 goto close_fail; /* multiple links - don't dump */
1932 if (!ispipe && d_unhashed(cprm.file->f_path.dentry))
1933 goto close_fail;
1934
1935 /* AK: actually i see no reason to not allow this for named pipes etc.,
1936 but keep the previous behaviour for now. */
1937 if (!ispipe && !S_ISREG(inode->i_mode))
1938 goto close_fail;
1939 /*
1940 * Dont allow local users get cute and trick others to coredump
1941 * into their pre-created files:
1942 * Note, this is not relevant for pipes
1943 */
1944 if (!ispipe && (inode->i_uid != current_fsuid()))
1945 goto close_fail;
1946 if (!cprm.file->f_op)
1947 goto close_fail;
1948 if (!cprm.file->f_op->write)
1949 goto close_fail;
1950 if (!ispipe &&
1951 do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file) != 0)
1952 goto close_fail;
1953 1964
1954 retval = binfmt->core_dump(&cprm); 1965 inode = cprm.file->f_path.dentry->d_inode;
1966 if (inode->i_nlink > 1)
1967 goto close_fail;
1968 if (d_unhashed(cprm.file->f_path.dentry))
1969 goto close_fail;
1970 /*
1971 * AK: actually I see no reason not to allow this for named
1972 * pipes etc, but keep the previous behaviour for now.
1973 */
1974 if (!S_ISREG(inode->i_mode))
1975 goto close_fail;
1976 /*
1977 * Don't allow local users to get cute and trick others into
1978 * coredumping into their pre-created files.
1979 */
1980 if (inode->i_uid != current_fsuid())
1981 goto close_fail;
1982 if (!cprm.file->f_op || !cprm.file->f_op->write)
1983 goto close_fail;
1984 if (do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file))
1985 goto close_fail;
1986 }
1955 1987
1988 retval = binfmt->core_dump(&cprm);
1956 if (retval) 1989 if (retval)
1957 current->signal->group_exit_code |= 0x80; 1990 current->signal->group_exit_code |= 0x80;
1958close_fail: 1991
1959 if (ispipe && core_pipe_limit) 1992 if (ispipe && core_pipe_limit)
1960 wait_for_dump_helpers(cprm.file); 1993 wait_for_dump_helpers(cprm.file);
1961 filp_close(cprm.file, NULL); 1994close_fail:
1995 if (cprm.file)
1996 filp_close(cprm.file, NULL);
1962fail_dropcount: 1997fail_dropcount:
1963 if (dump_count) 1998 if (ispipe)
1964 atomic_dec(&core_dump_count); 1999 atomic_dec(&core_dump_count);
1965fail_unlock: 2000fail_unlock:
1966 if (helper_argv) 2001 coredump_finish(mm);
1967 argv_free(helper_argv);
1968
1969 revert_creds(old_cred); 2002 revert_creds(old_cred);
2003fail_creds:
1970 put_cred(cred); 2004 put_cred(cred);
1971 coredump_finish(mm);
1972fail: 2005fail:
1973 return; 2006 return;
1974} 2007}
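
Since umh_pipe_setup() hands the helper the core dump on fd 0, a core_pattern pipe consumer is just a stdin reader. A sketch of such a helper, assuming it was registered as something like "|/usr/local/bin/core-catcher %p"; the path, argv layout, and output location are all illustrative:

	#include <stdio.h>

	int main(int argc, char **argv)
	{
		char path[256], buf[8192];
		size_t n;

		/* argv[1] is the crashing pid when registered with %p */
		snprintf(path, sizeof(path), "/tmp/core.%s",
			 argc > 1 ? argv[1] : "unknown");
		FILE *out = fopen(path, "w");
		if (!out)
			return 1;
		while ((n = fread(buf, 1, sizeof(buf), stdin)) > 0) /* fd 0 = dump */
			fwrite(buf, 1, n, out);
		fclose(out);
		return 0;
	}
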
diff --git a/fs/exofs/file.c b/fs/exofs/file.c
index 839b9dc1e70f..fef6899be397 100644
--- a/fs/exofs/file.c
+++ b/fs/exofs/file.c
@@ -40,12 +40,11 @@ static int exofs_release_file(struct inode *inode, struct file *filp)
40 return 0; 40 return 0;
41} 41}
42 42
43static int exofs_file_fsync(struct file *filp, struct dentry *dentry, 43static int exofs_file_fsync(struct file *filp, int datasync)
44 int datasync)
45{ 44{
46 int ret; 45 int ret;
47 struct address_space *mapping = filp->f_mapping; 46 struct address_space *mapping = filp->f_mapping;
48 struct inode *inode = dentry->d_inode; 47 struct inode *inode = mapping->host;
49 struct super_block *sb; 48 struct super_block *sb;
50 49
51 ret = filemap_write_and_wait(mapping); 50 ret = filemap_write_and_wait(mapping);
@@ -66,7 +65,7 @@ static int exofs_file_fsync(struct file *filp, struct dentry *dentry,
66 65
67static int exofs_flush(struct file *file, fl_owner_t id) 66static int exofs_flush(struct file *file, fl_owner_t id)
68{ 67{
69 exofs_file_fsync(file, file->f_path.dentry, 1); 68 exofs_file_fsync(file, 1);
70 /* TODO: Flush the OSD target */ 69 /* TODO: Flush the OSD target */
71 return 0; 70 return 0;
72} 71}
diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h
index 0b038e47ad2f..52b34f1d2738 100644
--- a/fs/ext2/ext2.h
+++ b/fs/ext2/ext2.h
@@ -122,7 +122,6 @@ extern int ext2_write_inode (struct inode *, struct writeback_control *);
122extern void ext2_delete_inode (struct inode *); 122extern void ext2_delete_inode (struct inode *);
123extern int ext2_sync_inode (struct inode *); 123extern int ext2_sync_inode (struct inode *);
124extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int); 124extern int ext2_get_block(struct inode *, sector_t, struct buffer_head *, int);
125extern void ext2_truncate (struct inode *);
126extern int ext2_setattr (struct dentry *, struct iattr *); 125extern int ext2_setattr (struct dentry *, struct iattr *);
127extern void ext2_set_inode_flags(struct inode *inode); 126extern void ext2_set_inode_flags(struct inode *inode);
128extern void ext2_get_inode_flags(struct ext2_inode_info *); 127extern void ext2_get_inode_flags(struct ext2_inode_info *);
@@ -155,7 +154,7 @@ extern void ext2_write_super (struct super_block *);
155extern const struct file_operations ext2_dir_operations; 154extern const struct file_operations ext2_dir_operations;
156 155
157/* file.c */ 156/* file.c */
158extern int ext2_fsync(struct file *file, struct dentry *dentry, int datasync); 157extern int ext2_fsync(struct file *file, int datasync);
159extern const struct inode_operations ext2_file_inode_operations; 158extern const struct inode_operations ext2_file_inode_operations;
160extern const struct file_operations ext2_file_operations; 159extern const struct file_operations ext2_file_operations;
161extern const struct file_operations ext2_xip_file_operations; 160extern const struct file_operations ext2_xip_file_operations;
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 5d198d0697fb..49eec9456c5b 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -40,13 +40,13 @@ static int ext2_release_file (struct inode * inode, struct file * filp)
40 return 0; 40 return 0;
41} 41}
42 42
43int ext2_fsync(struct file *file, struct dentry *dentry, int datasync) 43int ext2_fsync(struct file *file, int datasync)
44{ 44{
45 int ret; 45 int ret;
46 struct super_block *sb = dentry->d_inode->i_sb; 46 struct super_block *sb = file->f_mapping->host->i_sb;
47 struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping; 47 struct address_space *mapping = sb->s_bdev->bd_inode->i_mapping;
48 48
49 ret = simple_fsync(file, dentry, datasync); 49 ret = generic_file_fsync(file, datasync);
50 if (ret == -EIO || test_and_clear_bit(AS_EIO, &mapping->flags)) { 50 if (ret == -EIO || test_and_clear_bit(AS_EIO, &mapping->flags)) {
51 /* We don't really know where the IO error happened... */ 51 /* We don't really know where the IO error happened... */
52 ext2_error(sb, __func__, 52 ext2_error(sb, __func__,
@@ -95,7 +95,6 @@ const struct file_operations ext2_xip_file_operations = {
95#endif 95#endif
96 96
97const struct inode_operations ext2_file_inode_operations = { 97const struct inode_operations ext2_file_inode_operations = {
98 .truncate = ext2_truncate,
99#ifdef CONFIG_EXT2_FS_XATTR 98#ifdef CONFIG_EXT2_FS_XATTR
100 .setxattr = generic_setxattr, 99 .setxattr = generic_setxattr,
101 .getxattr = generic_getxattr, 100 .getxattr = generic_getxattr,
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 527c46d9bc1f..3675088cb88c 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -54,6 +54,18 @@ static inline int ext2_inode_is_fast_symlink(struct inode *inode)
54 inode->i_blocks - ea_blocks == 0); 54 inode->i_blocks - ea_blocks == 0);
55} 55}
56 56
57static void ext2_truncate_blocks(struct inode *inode, loff_t offset);
58
59static void ext2_write_failed(struct address_space *mapping, loff_t to)
60{
61 struct inode *inode = mapping->host;
62
63 if (to > inode->i_size) {
64 truncate_pagecache(inode, to, inode->i_size);
65 ext2_truncate_blocks(inode, inode->i_size);
66 }
67}
68
57/* 69/*
58 * Called at the last iput() if i_nlink is zero. 70 * Called at the last iput() if i_nlink is zero.
59 */ 71 */
@@ -71,7 +83,7 @@ void ext2_delete_inode (struct inode * inode)
71 83
72 inode->i_size = 0; 84 inode->i_size = 0;
73 if (inode->i_blocks) 85 if (inode->i_blocks)
74 ext2_truncate (inode); 86 ext2_truncate_blocks(inode, 0);
75 ext2_free_inode (inode); 87 ext2_free_inode (inode);
76 88
77 return; 89 return;
@@ -757,8 +769,8 @@ int __ext2_write_begin(struct file *file, struct address_space *mapping,
757 loff_t pos, unsigned len, unsigned flags, 769 loff_t pos, unsigned len, unsigned flags,
758 struct page **pagep, void **fsdata) 770 struct page **pagep, void **fsdata)
759{ 771{
760 return block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 772 return block_write_begin_newtrunc(file, mapping, pos, len, flags,
761 ext2_get_block); 773 pagep, fsdata, ext2_get_block);
762} 774}
763 775
764static int 776static int
@@ -766,8 +778,25 @@ ext2_write_begin(struct file *file, struct address_space *mapping,
766 loff_t pos, unsigned len, unsigned flags, 778 loff_t pos, unsigned len, unsigned flags,
767 struct page **pagep, void **fsdata) 779 struct page **pagep, void **fsdata)
768{ 780{
781 int ret;
782
769 *pagep = NULL; 783 *pagep = NULL;
770 return __ext2_write_begin(file, mapping, pos, len, flags, pagep,fsdata); 784 ret = __ext2_write_begin(file, mapping, pos, len, flags, pagep, fsdata);
785 if (ret < 0)
786 ext2_write_failed(mapping, pos + len);
787 return ret;
788}
789
790static int ext2_write_end(struct file *file, struct address_space *mapping,
791 loff_t pos, unsigned len, unsigned copied,
792 struct page *page, void *fsdata)
793{
794 int ret;
795
796 ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata);
797 if (ret < len)
798 ext2_write_failed(mapping, pos + len);
799 return ret;
771} 800}
772 801
773static int 802static int
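
Both the buffered and direct write paths in this file now funnel errors through ext2_write_failed(), which implements the "new truncate" rule: if a failed write may have instantiated pagecache or blocks beyond i_size, roll both back to i_size. A stubbed, compilable sketch of the flow (the two trunc_* helpers stand in for truncate_pagecache() and ext2_truncate_blocks()):

	#include <stdio.h>

	struct inode { long long i_size; };

	static void trunc_pagecache(long long to, long long size)
	{
		printf("pagecache: drop (%lld, %lld]\n", size, to);
	}

	static void trunc_blocks(long long size)
	{
		printf("blocks: free everything past %lld\n", size);
	}

	static void write_failed(struct inode *inode, long long to)
	{
		if (to > inode->i_size) { /* only extending writes leave debris */
			trunc_pagecache(to, inode->i_size);
			trunc_blocks(inode->i_size);
		}
	}

	int main(void)
	{
		struct inode i = { .i_size = 4096 };
		write_failed(&i, 12288); /* failed write covering [4096, 12288) */
		return 0;
	}
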
@@ -775,13 +804,18 @@ ext2_nobh_write_begin(struct file *file, struct address_space *mapping,
775 loff_t pos, unsigned len, unsigned flags, 804 loff_t pos, unsigned len, unsigned flags,
776 struct page **pagep, void **fsdata) 805 struct page **pagep, void **fsdata)
777{ 806{
807 int ret;
808
778 /* 809 /*
779 * Dir-in-pagecache still uses ext2_write_begin. Would have to rework 810 * Dir-in-pagecache still uses ext2_write_begin. Would have to rework
780 * directory handling code to pass around offsets rather than struct 811 * directory handling code to pass around offsets rather than struct
781 * pages in order to make this work easily. 812 * pages in order to make this work easily.
782 */ 813 */
783 return nobh_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 814 ret = nobh_write_begin_newtrunc(file, mapping, pos, len, flags, pagep,
784 ext2_get_block); 815 fsdata, ext2_get_block);
816 if (ret < 0)
817 ext2_write_failed(mapping, pos + len);
818 return ret;
785} 819}
786 820
787static int ext2_nobh_writepage(struct page *page, 821static int ext2_nobh_writepage(struct page *page,
@@ -800,10 +834,15 @@ ext2_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
800 loff_t offset, unsigned long nr_segs) 834 loff_t offset, unsigned long nr_segs)
801{ 835{
802 struct file *file = iocb->ki_filp; 836 struct file *file = iocb->ki_filp;
803 struct inode *inode = file->f_mapping->host; 837 struct address_space *mapping = file->f_mapping;
804 838 struct inode *inode = mapping->host;
805 return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, 839 ssize_t ret;
806 offset, nr_segs, ext2_get_block, NULL); 840
841 ret = blockdev_direct_IO_newtrunc(rw, iocb, inode, inode->i_sb->s_bdev,
842 iov, offset, nr_segs, ext2_get_block, NULL);
843 if (ret < 0 && (rw & WRITE))
844 ext2_write_failed(mapping, offset + iov_length(iov, nr_segs));
845 return ret;
807} 846}
808 847
809static int 848static int
@@ -818,7 +857,7 @@ const struct address_space_operations ext2_aops = {
818 .writepage = ext2_writepage, 857 .writepage = ext2_writepage,
819 .sync_page = block_sync_page, 858 .sync_page = block_sync_page,
820 .write_begin = ext2_write_begin, 859 .write_begin = ext2_write_begin,
821 .write_end = generic_write_end, 860 .write_end = ext2_write_end,
822 .bmap = ext2_bmap, 861 .bmap = ext2_bmap,
823 .direct_IO = ext2_direct_IO, 862 .direct_IO = ext2_direct_IO,
824 .writepages = ext2_writepages, 863 .writepages = ext2_writepages,
@@ -1027,7 +1066,7 @@ static void ext2_free_branches(struct inode *inode, __le32 *p, __le32 *q, int de
1027 ext2_free_data(inode, p, q); 1066 ext2_free_data(inode, p, q);
1028} 1067}
1029 1068
1030void ext2_truncate(struct inode *inode) 1069static void __ext2_truncate_blocks(struct inode *inode, loff_t offset)
1031{ 1070{
1032 __le32 *i_data = EXT2_I(inode)->i_data; 1071 __le32 *i_data = EXT2_I(inode)->i_data;
1033 struct ext2_inode_info *ei = EXT2_I(inode); 1072 struct ext2_inode_info *ei = EXT2_I(inode);
@@ -1039,27 +1078,8 @@ void ext2_truncate(struct inode *inode)
1039 int n; 1078 int n;
1040 long iblock; 1079 long iblock;
1041 unsigned blocksize; 1080 unsigned blocksize;
1042
1043 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
1044 S_ISLNK(inode->i_mode)))
1045 return;
1046 if (ext2_inode_is_fast_symlink(inode))
1047 return;
1048 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
1049 return;
1050
1051 blocksize = inode->i_sb->s_blocksize; 1081 blocksize = inode->i_sb->s_blocksize;
1052 iblock = (inode->i_size + blocksize-1) 1082 iblock = (offset + blocksize-1) >> EXT2_BLOCK_SIZE_BITS(inode->i_sb);
1053 >> EXT2_BLOCK_SIZE_BITS(inode->i_sb);
1054
1055 if (mapping_is_xip(inode->i_mapping))
1056 xip_truncate_page(inode->i_mapping, inode->i_size);
1057 else if (test_opt(inode->i_sb, NOBH))
1058 nobh_truncate_page(inode->i_mapping,
1059 inode->i_size, ext2_get_block);
1060 else
1061 block_truncate_page(inode->i_mapping,
1062 inode->i_size, ext2_get_block);
1063 1083
1064 n = ext2_block_to_path(inode, iblock, offsets, NULL); 1084 n = ext2_block_to_path(inode, iblock, offsets, NULL);
1065 if (n == 0) 1085 if (n == 0)
@@ -1127,6 +1147,62 @@ do_indirects:
1127 ext2_discard_reservation(inode); 1147 ext2_discard_reservation(inode);
1128 1148
1129 mutex_unlock(&ei->truncate_mutex); 1149 mutex_unlock(&ei->truncate_mutex);
1150}
1151
1152static void ext2_truncate_blocks(struct inode *inode, loff_t offset)
1153{
1154 /*
 1155	 * XXX: it seems like a bug that we don't allow an
 1156	 * IS_APPEND inode to have blocks-past-i_size trimmed off.
 1157	 * Review and fix this.
1158 *
1159 * Also would be nice to be able to handle IO errors and such,
1160 * but that's probably too much to ask.
1161 */
1162 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
1163 S_ISLNK(inode->i_mode)))
1164 return;
1165 if (ext2_inode_is_fast_symlink(inode))
1166 return;
1167 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
1168 return;
1169 __ext2_truncate_blocks(inode, offset);
1170}
1171
1172int ext2_setsize(struct inode *inode, loff_t newsize)
1173{
1174 loff_t oldsize;
1175 int error;
1176
1177 error = inode_newsize_ok(inode, newsize);
1178 if (error)
1179 return error;
1180
1181 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
1182 S_ISLNK(inode->i_mode)))
1183 return -EINVAL;
1184 if (ext2_inode_is_fast_symlink(inode))
1185 return -EINVAL;
1186 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
1187 return -EPERM;
1188
1189 if (mapping_is_xip(inode->i_mapping))
1190 error = xip_truncate_page(inode->i_mapping, newsize);
1191 else if (test_opt(inode->i_sb, NOBH))
1192 error = nobh_truncate_page(inode->i_mapping,
1193 newsize, ext2_get_block);
1194 else
1195 error = block_truncate_page(inode->i_mapping,
1196 newsize, ext2_get_block);
1197 if (error)
1198 return error;
1199
1200 oldsize = inode->i_size;
1201 i_size_write(inode, newsize);
1202 truncate_pagecache(inode, oldsize, newsize);
1203
1204 __ext2_truncate_blocks(inode, newsize);
1205
1130 inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; 1206 inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
1131 if (inode_needs_sync(inode)) { 1207 if (inode_needs_sync(inode)) {
1132 sync_mapping_buffers(inode->i_mapping); 1208 sync_mapping_buffers(inode->i_mapping);
@@ -1134,6 +1210,8 @@ do_indirects:
1134 } else { 1210 } else {
1135 mark_inode_dirty(inode); 1211 mark_inode_dirty(inode);
1136 } 1212 }
1213
1214 return 0;
1137} 1215}
1138 1216
1139static struct ext2_inode *ext2_get_inode(struct super_block *sb, ino_t ino, 1217static struct ext2_inode *ext2_get_inode(struct super_block *sb, ino_t ino,
@@ -1474,8 +1552,15 @@ int ext2_setattr(struct dentry *dentry, struct iattr *iattr)
1474 if (error) 1552 if (error)
1475 return error; 1553 return error;
1476 } 1554 }
1477 error = inode_setattr(inode, iattr); 1555 if (iattr->ia_valid & ATTR_SIZE && iattr->ia_size != inode->i_size) {
1478 if (!error && (iattr->ia_valid & ATTR_MODE)) 1556 error = ext2_setsize(inode, iattr->ia_size);
1557 if (error)
1558 return error;
1559 }
1560 generic_setattr(inode, iattr);
1561 if (iattr->ia_valid & ATTR_MODE)
1479 error = ext2_acl_chmod(inode); 1562 error = ext2_acl_chmod(inode);
1563 mark_inode_dirty(inode);
1564
1480 return error; 1565 return error;
1481} 1566}
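
The ext2_write_failed()/ext2_setsize() hunks above all enforce one invariant: if a write or truncate leaves blocks instantiated beyond i_size, those blocks are trimmed before anyone can observe them, and on truncate the new size is committed before the old blocks are freed. Below is a minimal userspace sketch of that invariant; the toy_* names and the 4 KiB block size are illustrative assumptions, not kernel API.

#include <assert.h>
#include <stdio.h>

/* Toy inode: i_size in bytes, blocks allocated in 4 KiB units. */
struct toy_inode {
	long long i_size;
	long long allocated_blocks;
};

/* Free any blocks instantiated beyond @offset, mirroring
 * __ext2_truncate_blocks() in spirit. */
static void toy_truncate_blocks(struct toy_inode *inode, long long offset)
{
	long long keep = (offset + 4095) / 4096;

	if (inode->allocated_blocks > keep)
		inode->allocated_blocks = keep;
}

/* Model of ext2_write_failed(): a write into pos..pos+len failed after
 * some blocks were instantiated; trim everything past i_size so no
 * stale blocks survive the failed write. */
static void toy_write_failed(struct toy_inode *inode, long long to)
{
	if (to > inode->i_size)
		toy_truncate_blocks(inode, inode->i_size);
}

int main(void)
{
	struct toy_inode ino = { .i_size = 4096, .allocated_blocks = 1 };

	/* A 3-block write at offset 4096 instantiates blocks, then fails. */
	ino.allocated_blocks = 4;
	toy_write_failed(&ino, 4096 + 3 * 4096);
	assert(ino.allocated_blocks == 1);
	printf("blocks after failed write: %lld\n", ino.allocated_blocks);
	return 0;
}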
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 71e9eb1fa696..7ff43f4a59cd 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -119,6 +119,8 @@ static void ext2_put_super (struct super_block * sb)
119 int i; 119 int i;
120 struct ext2_sb_info *sbi = EXT2_SB(sb); 120 struct ext2_sb_info *sbi = EXT2_SB(sb);
121 121
122 dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
123
122 if (sb->s_dirt) 124 if (sb->s_dirt)
123 ext2_write_super(sb); 125 ext2_write_super(sb);
124 126
@@ -1063,6 +1065,12 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent)
1063 sb->s_op = &ext2_sops; 1065 sb->s_op = &ext2_sops;
1064 sb->s_export_op = &ext2_export_ops; 1066 sb->s_export_op = &ext2_export_ops;
1065 sb->s_xattr = ext2_xattr_handlers; 1067 sb->s_xattr = ext2_xattr_handlers;
1068
1069#ifdef CONFIG_QUOTA
1070 sb->dq_op = &dquot_operations;
1071 sb->s_qcop = &dquot_quotactl_ops;
1072#endif
1073
1066 root = ext2_iget(sb, EXT2_ROOT_INO); 1074 root = ext2_iget(sb, EXT2_ROOT_INO);
1067 if (IS_ERR(root)) { 1075 if (IS_ERR(root)) {
1068 ret = PTR_ERR(root); 1076 ret = PTR_ERR(root);
@@ -1241,6 +1249,7 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
1241 spin_unlock(&sbi->s_lock); 1249 spin_unlock(&sbi->s_lock);
1242 return 0; 1250 return 0;
1243 } 1251 }
1252
1244 /* 1253 /*
1245 * OK, we are remounting a valid rw partition rdonly, so set 1254 * OK, we are remounting a valid rw partition rdonly, so set
1246 * the rdonly flag and then mark the partition as valid again. 1255 * the rdonly flag and then mark the partition as valid again.
@@ -1248,6 +1257,13 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
1248 es->s_state = cpu_to_le16(sbi->s_mount_state); 1257 es->s_state = cpu_to_le16(sbi->s_mount_state);
1249 es->s_mtime = cpu_to_le32(get_seconds()); 1258 es->s_mtime = cpu_to_le32(get_seconds());
1250 spin_unlock(&sbi->s_lock); 1259 spin_unlock(&sbi->s_lock);
1260
1261 err = dquot_suspend(sb, -1);
1262 if (err < 0) {
1263 spin_lock(&sbi->s_lock);
1264 goto restore_opts;
1265 }
1266
1251 ext2_sync_super(sb, es, 1); 1267 ext2_sync_super(sb, es, 1);
1252 } else { 1268 } else {
1253 __le32 ret = EXT2_HAS_RO_COMPAT_FEATURE(sb, 1269 __le32 ret = EXT2_HAS_RO_COMPAT_FEATURE(sb,
@@ -1269,8 +1285,12 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
1269 if (!ext2_setup_super (sb, es, 0)) 1285 if (!ext2_setup_super (sb, es, 0))
1270 sb->s_flags &= ~MS_RDONLY; 1286 sb->s_flags &= ~MS_RDONLY;
1271 spin_unlock(&sbi->s_lock); 1287 spin_unlock(&sbi->s_lock);
1288
1272 ext2_write_super(sb); 1289 ext2_write_super(sb);
1290
1291 dquot_resume(sb, -1);
1273 } 1292 }
1293
1274 return 0; 1294 return 0;
1275restore_opts: 1295restore_opts:
1276 sbi->s_mount_opt = old_opts.s_mount_opt; 1296 sbi->s_mount_opt = old_opts.s_mount_opt;
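
The remount hunk above encodes an ordering subtlety: quota must be suspended before the superblock is synced read-only (the quota files cannot be dirtied afterwards), and resumed only once the filesystem is writable again. A small userspace model of that ordering follows, with toy_* stand-ins for dquot_suspend()/dquot_resume(); none of these names are kernel API.

#include <stdbool.h>
#include <stdio.h>

/* Toy model of the remount ordering the ext2/ext3 hunks introduce. */
enum quota_state { QUOTA_ON, QUOTA_SUSPENDED };

struct toy_sb {
	bool rdonly;
	enum quota_state quota;
};

static int toy_remount(struct toy_sb *sb, bool want_rdonly)
{
	if (want_rdonly && !sb->rdonly) {
		sb->quota = QUOTA_SUSPENDED;   /* dquot_suspend(sb, -1) first */
		sb->rdonly = true;             /* then sync super read-only */
	} else if (!want_rdonly && sb->rdonly) {
		sb->rdonly = false;            /* make writable first */
		sb->quota = QUOTA_ON;          /* then dquot_resume(sb, -1) */
	}
	return 0;
}

int main(void)
{
	struct toy_sb sb = { .rdonly = false, .quota = QUOTA_ON };

	toy_remount(&sb, true);
	printf("ro: quota %s\n", sb.quota == QUOTA_SUSPENDED ? "suspended" : "on");
	toy_remount(&sb, false);
	printf("rw: quota %s\n", sb.quota == QUOTA_ON ? "on" : "suspended");
	return 0;
}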
diff --git a/fs/ext3/dir.c b/fs/ext3/dir.c
index 373fa90c796a..e2e72c367cf6 100644
--- a/fs/ext3/dir.c
+++ b/fs/ext3/dir.c
@@ -297,7 +297,7 @@ static void free_rb_tree_fname(struct rb_root *root)
297 kfree (old); 297 kfree (old);
298 } 298 }
299 if (!parent) 299 if (!parent)
300 root->rb_node = NULL; 300 *root = RB_ROOT;
301 else if (parent->rb_left == n) 301 else if (parent->rb_left == n)
302 parent->rb_left = NULL; 302 parent->rb_left = NULL;
303 else if (parent->rb_right == n) 303 else if (parent->rb_right == n)
diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c
index fcf7487734b6..d7e9f74dc3a6 100644
--- a/fs/ext3/fsync.c
+++ b/fs/ext3/fsync.c
@@ -43,9 +43,9 @@
43 * inode to disk. 43 * inode to disk.
44 */ 44 */
45 45
46int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync) 46int ext3_sync_file(struct file *file, int datasync)
47{ 47{
48 struct inode *inode = dentry->d_inode; 48 struct inode *inode = file->f_mapping->host;
49 struct ext3_inode_info *ei = EXT3_I(inode); 49 struct ext3_inode_info *ei = EXT3_I(inode);
50 journal_t *journal = EXT3_SB(inode->i_sb)->s_journal; 50 journal_t *journal = EXT3_SB(inode->i_sb)->s_journal;
51 int ret, needs_barrier = 0; 51 int ret, needs_barrier = 0;
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 0fc1293d0e96..6c953bb255e7 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -410,6 +410,8 @@ static void ext3_put_super (struct super_block * sb)
410 struct ext3_super_block *es = sbi->s_es; 410 struct ext3_super_block *es = sbi->s_es;
411 int i, err; 411 int i, err;
412 412
413 dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
414
413 lock_kernel(); 415 lock_kernel();
414 416
415 ext3_xattr_put_super(sb); 417 ext3_xattr_put_super(sb);
@@ -748,7 +750,7 @@ static int ext3_release_dquot(struct dquot *dquot);
748static int ext3_mark_dquot_dirty(struct dquot *dquot); 750static int ext3_mark_dquot_dirty(struct dquot *dquot);
749static int ext3_write_info(struct super_block *sb, int type); 751static int ext3_write_info(struct super_block *sb, int type);
750static int ext3_quota_on(struct super_block *sb, int type, int format_id, 752static int ext3_quota_on(struct super_block *sb, int type, int format_id,
751 char *path, int remount); 753 char *path);
752static int ext3_quota_on_mount(struct super_block *sb, int type); 754static int ext3_quota_on_mount(struct super_block *sb, int type);
753static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data, 755static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data,
754 size_t len, loff_t off); 756 size_t len, loff_t off);
@@ -767,12 +769,12 @@ static const struct dquot_operations ext3_quota_operations = {
767 769
768static const struct quotactl_ops ext3_qctl_operations = { 770static const struct quotactl_ops ext3_qctl_operations = {
769 .quota_on = ext3_quota_on, 771 .quota_on = ext3_quota_on,
770 .quota_off = vfs_quota_off, 772 .quota_off = dquot_quota_off,
771 .quota_sync = vfs_quota_sync, 773 .quota_sync = dquot_quota_sync,
772 .get_info = vfs_get_dqinfo, 774 .get_info = dquot_get_dqinfo,
773 .set_info = vfs_set_dqinfo, 775 .set_info = dquot_set_dqinfo,
774 .get_dqblk = vfs_get_dqblk, 776 .get_dqblk = dquot_get_dqblk,
775 .set_dqblk = vfs_set_dqblk 777 .set_dqblk = dquot_set_dqblk
776}; 778};
777#endif 779#endif
778 780
@@ -1527,7 +1529,7 @@ static void ext3_orphan_cleanup (struct super_block * sb,
1527 /* Turn quotas off */ 1529 /* Turn quotas off */
1528 for (i = 0; i < MAXQUOTAS; i++) { 1530 for (i = 0; i < MAXQUOTAS; i++) {
1529 if (sb_dqopt(sb)->files[i]) 1531 if (sb_dqopt(sb)->files[i])
1530 vfs_quota_off(sb, i, 0); 1532 dquot_quota_off(sb, i);
1531 } 1533 }
1532#endif 1534#endif
1533 sb->s_flags = s_flags; /* Restore MS_RDONLY status */ 1535 sb->s_flags = s_flags; /* Restore MS_RDONLY status */
@@ -2551,6 +2553,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
2551 ext3_fsblk_t n_blocks_count = 0; 2553 ext3_fsblk_t n_blocks_count = 0;
2552 unsigned long old_sb_flags; 2554 unsigned long old_sb_flags;
2553 struct ext3_mount_options old_opts; 2555 struct ext3_mount_options old_opts;
2556 int enable_quota = 0;
2554 int err; 2557 int err;
2555#ifdef CONFIG_QUOTA 2558#ifdef CONFIG_QUOTA
2556 int i; 2559 int i;
@@ -2597,6 +2600,10 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
2597 } 2600 }
2598 2601
2599 if (*flags & MS_RDONLY) { 2602 if (*flags & MS_RDONLY) {
2603 err = dquot_suspend(sb, -1);
2604 if (err < 0)
2605 goto restore_opts;
2606
2600 /* 2607 /*
2601 * First of all, the unconditional stuff we have to do 2608 * First of all, the unconditional stuff we have to do
2602 * to disable replay of the journal when we next remount 2609 * to disable replay of the journal when we next remount
@@ -2651,6 +2658,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
2651 goto restore_opts; 2658 goto restore_opts;
2652 if (!ext3_setup_super (sb, es, 0)) 2659 if (!ext3_setup_super (sb, es, 0))
2653 sb->s_flags &= ~MS_RDONLY; 2660 sb->s_flags &= ~MS_RDONLY;
2661 enable_quota = 1;
2654 } 2662 }
2655 } 2663 }
2656#ifdef CONFIG_QUOTA 2664#ifdef CONFIG_QUOTA
@@ -2662,6 +2670,9 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data)
2662#endif 2670#endif
2663 unlock_super(sb); 2671 unlock_super(sb);
2664 unlock_kernel(); 2672 unlock_kernel();
2673
2674 if (enable_quota)
2675 dquot_resume(sb, -1);
2665 return 0; 2676 return 0;
2666restore_opts: 2677restore_opts:
2667 sb->s_flags = old_sb_flags; 2678 sb->s_flags = old_sb_flags;
@@ -2851,24 +2862,21 @@ static int ext3_write_info(struct super_block *sb, int type)
2851 */ 2862 */
2852static int ext3_quota_on_mount(struct super_block *sb, int type) 2863static int ext3_quota_on_mount(struct super_block *sb, int type)
2853{ 2864{
2854 return vfs_quota_on_mount(sb, EXT3_SB(sb)->s_qf_names[type], 2865 return dquot_quota_on_mount(sb, EXT3_SB(sb)->s_qf_names[type],
2855 EXT3_SB(sb)->s_jquota_fmt, type); 2866 EXT3_SB(sb)->s_jquota_fmt, type);
2856} 2867}
2857 2868
2858/* 2869/*
2859 * Standard function to be called on quota_on 2870 * Standard function to be called on quota_on
2860 */ 2871 */
2861static int ext3_quota_on(struct super_block *sb, int type, int format_id, 2872static int ext3_quota_on(struct super_block *sb, int type, int format_id,
2862 char *name, int remount) 2873 char *name)
2863{ 2874{
2864 int err; 2875 int err;
2865 struct path path; 2876 struct path path;
2866 2877
2867 if (!test_opt(sb, QUOTA)) 2878 if (!test_opt(sb, QUOTA))
2868 return -EINVAL; 2879 return -EINVAL;
2869 /* When remounting, no checks are needed and in fact, name is NULL */
2870 if (remount)
2871 return vfs_quota_on(sb, type, format_id, name, remount);
2872 2880
2873 err = kern_path(name, LOOKUP_FOLLOW, &path); 2881 err = kern_path(name, LOOKUP_FOLLOW, &path);
2874 if (err) 2882 if (err)
@@ -2906,7 +2914,7 @@ static int ext3_quota_on(struct super_block *sb, int type, int format_id,
2906 } 2914 }
2907 } 2915 }
2908 2916
2909 err = vfs_quota_on_path(sb, type, format_id, &path); 2917 err = dquot_quota_on_path(sb, type, format_id, &path);
2910 path_put(&path); 2918 path_put(&path);
2911 return err; 2919 return err;
2912} 2920}
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index d2f37a5516c7..95b7594c76f9 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -591,14 +591,15 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode,
591 ret = ext4_mb_new_blocks(handle, &ar, errp); 591 ret = ext4_mb_new_blocks(handle, &ar, errp);
592 if (count) 592 if (count)
593 *count = ar.len; 593 *count = ar.len;
594
595 /* 594 /*
596 * Account for the allocated meta blocks 595 * Account for the allocated meta blocks. We will never
 596	 * fail EDQUOT for metadata, but we do account for it.
597 */ 597 */
598 if (!(*errp) && EXT4_I(inode)->i_delalloc_reserved_flag) { 598 if (!(*errp) && EXT4_I(inode)->i_delalloc_reserved_flag) {
599 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 599 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
600 EXT4_I(inode)->i_allocated_meta_blocks += ar.len; 600 EXT4_I(inode)->i_allocated_meta_blocks += ar.len;
601 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 601 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
602 dquot_alloc_block_nofail(inode, ar.len);
602 } 603 }
603 return ret; 604 return ret;
604} 605}
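
The comment added above distinguishes two quota-charging policies: data allocations may fail with EDQUOT, while metadata charged via dquot_alloc_block_nofail() is always accounted even if that pushes usage past the limit. A compact userspace sketch of the two policies (the toy_* names are illustrative, not kernel API):

#include <stdio.h>

struct toy_quota {
	long long used;
	long long limit;
};

/* Data blocks: may fail with -1 (think -EDQUOT). */
static int toy_alloc_block(struct toy_quota *q, long long n)
{
	if (q->used + n > q->limit)
		return -1;
	q->used += n;
	return 0;
}

/* Metadata blocks: always charged, never failed. */
static void toy_alloc_block_nofail(struct toy_quota *q, long long n)
{
	q->used += n;
}

int main(void)
{
	struct toy_quota q = { .used = 9, .limit = 10 };

	printf("data alloc of 2: %d\n", toy_alloc_block(&q, 2));   /* fails */
	toy_alloc_block_nofail(&q, 2);                             /* charged anyway */
	printf("used after metadata alloc: %lld/%lld\n", q.used, q.limit);
	return 0;
}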
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
index 538c48655084..5b6973fbf1bd 100644
--- a/fs/ext4/block_validity.c
+++ b/fs/ext4/block_validity.c
@@ -72,9 +72,9 @@ static int add_system_zone(struct ext4_sb_info *sbi,
72 else if (start_blk >= (entry->start_blk + entry->count)) 72 else if (start_blk >= (entry->start_blk + entry->count))
73 n = &(*n)->rb_right; 73 n = &(*n)->rb_right;
74 else { 74 else {
75 if (start_blk + count > (entry->start_blk + 75 if (start_blk + count > (entry->start_blk +
76 entry->count)) 76 entry->count))
77 entry->count = (start_blk + count - 77 entry->count = (start_blk + count -
78 entry->start_blk); 78 entry->start_blk);
79 new_node = *n; 79 new_node = *n;
80 new_entry = rb_entry(new_node, struct ext4_system_zone, 80 new_entry = rb_entry(new_node, struct ext4_system_zone,
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 86cb6d86a048..ea5e6cb7e2a5 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -83,11 +83,10 @@ int ext4_check_dir_entry(const char *function, struct inode *dir,
83 error_msg = "inode out of bounds"; 83 error_msg = "inode out of bounds";
84 84
85 if (error_msg != NULL) 85 if (error_msg != NULL)
86 __ext4_error(dir->i_sb, function, 86 ext4_error_inode(function, dir,
87 "bad entry in directory #%lu: %s - block=%llu" 87 "bad entry in directory: %s - block=%llu"
88 "offset=%u(%u), inode=%u, rec_len=%d, name_len=%d", 88 "offset=%u(%u), inode=%u, rec_len=%d, name_len=%d",
89 dir->i_ino, error_msg, 89 error_msg, (unsigned long long) bh->b_blocknr,
90 (unsigned long long) bh->b_blocknr,
91 (unsigned) (offset%bh->b_size), offset, 90 (unsigned) (offset%bh->b_size), offset,
92 le32_to_cpu(de->inode), 91 le32_to_cpu(de->inode),
93 rlen, de->name_len); 92 rlen, de->name_len);
@@ -111,7 +110,7 @@ static int ext4_readdir(struct file *filp,
111 110
112 if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb, 111 if (EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
113 EXT4_FEATURE_COMPAT_DIR_INDEX) && 112 EXT4_FEATURE_COMPAT_DIR_INDEX) &&
114 ((EXT4_I(inode)->i_flags & EXT4_INDEX_FL) || 113 ((ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) ||
115 ((inode->i_size >> sb->s_blocksize_bits) == 1))) { 114 ((inode->i_size >> sb->s_blocksize_bits) == 1))) {
116 err = ext4_dx_readdir(filp, dirent, filldir); 115 err = ext4_dx_readdir(filp, dirent, filldir);
117 if (err != ERR_BAD_DX_DIR) { 116 if (err != ERR_BAD_DX_DIR) {
@@ -122,20 +121,20 @@ static int ext4_readdir(struct file *filp,
122 * We don't set the inode dirty flag since it's not 121 * We don't set the inode dirty flag since it's not
123 * critical that it get flushed back to the disk. 122 * critical that it get flushed back to the disk.
124 */ 123 */
125 EXT4_I(filp->f_path.dentry->d_inode)->i_flags &= ~EXT4_INDEX_FL; 124 ext4_clear_inode_flag(filp->f_path.dentry->d_inode, EXT4_INODE_INDEX);
126 } 125 }
127 stored = 0; 126 stored = 0;
128 offset = filp->f_pos & (sb->s_blocksize - 1); 127 offset = filp->f_pos & (sb->s_blocksize - 1);
129 128
130 while (!error && !stored && filp->f_pos < inode->i_size) { 129 while (!error && !stored && filp->f_pos < inode->i_size) {
131 ext4_lblk_t blk = filp->f_pos >> EXT4_BLOCK_SIZE_BITS(sb); 130 struct ext4_map_blocks map;
132 struct buffer_head map_bh;
133 struct buffer_head *bh = NULL; 131 struct buffer_head *bh = NULL;
134 132
135 map_bh.b_state = 0; 133 map.m_lblk = filp->f_pos >> EXT4_BLOCK_SIZE_BITS(sb);
136 err = ext4_get_blocks(NULL, inode, blk, 1, &map_bh, 0); 134 map.m_len = 1;
135 err = ext4_map_blocks(NULL, inode, &map, 0);
137 if (err > 0) { 136 if (err > 0) {
138 pgoff_t index = map_bh.b_blocknr >> 137 pgoff_t index = map.m_pblk >>
139 (PAGE_CACHE_SHIFT - inode->i_blkbits); 138 (PAGE_CACHE_SHIFT - inode->i_blkbits);
140 if (!ra_has_index(&filp->f_ra, index)) 139 if (!ra_has_index(&filp->f_ra, index))
141 page_cache_sync_readahead( 140 page_cache_sync_readahead(
@@ -143,7 +142,7 @@ static int ext4_readdir(struct file *filp,
143 &filp->f_ra, filp, 142 &filp->f_ra, filp,
144 index, 1); 143 index, 1);
145 filp->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; 144 filp->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
146 bh = ext4_bread(NULL, inode, blk, 0, &err); 145 bh = ext4_bread(NULL, inode, map.m_lblk, 0, &err);
147 } 146 }
148 147
149 /* 148 /*
@@ -152,9 +151,8 @@ static int ext4_readdir(struct file *filp,
152 */ 151 */
153 if (!bh) { 152 if (!bh) {
154 if (!dir_has_error) { 153 if (!dir_has_error) {
155 ext4_error(sb, "directory #%lu " 154 EXT4_ERROR_INODE(inode, "directory "
156 "contains a hole at offset %Lu", 155 "contains a hole at offset %Lu",
157 inode->i_ino,
158 (unsigned long long) filp->f_pos); 156 (unsigned long long) filp->f_pos);
159 dir_has_error = 1; 157 dir_has_error = 1;
160 } 158 }
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index bf938cf7c5f0..19a4de57128a 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -29,6 +29,9 @@
29#include <linux/wait.h> 29#include <linux/wait.h>
30#include <linux/blockgroup_lock.h> 30#include <linux/blockgroup_lock.h>
31#include <linux/percpu_counter.h> 31#include <linux/percpu_counter.h>
32#ifdef __KERNEL__
33#include <linux/compat.h>
34#endif
32 35
33/* 36/*
34 * The fourth extended filesystem constants/structures 37 * The fourth extended filesystem constants/structures
@@ -54,10 +57,10 @@
54#endif 57#endif
55 58
56#define EXT4_ERROR_INODE(inode, fmt, a...) \ 59#define EXT4_ERROR_INODE(inode, fmt, a...) \
57 ext4_error_inode(__func__, (inode), (fmt), ## a); 60 ext4_error_inode(__func__, (inode), (fmt), ## a)
58 61
59#define EXT4_ERROR_FILE(file, fmt, a...) \ 62#define EXT4_ERROR_FILE(file, fmt, a...) \
60 ext4_error_file(__func__, (file), (fmt), ## a); 63 ext4_error_file(__func__, (file), (fmt), ## a)
61 64
62/* data type for block offset of block group */ 65/* data type for block offset of block group */
63typedef int ext4_grpblk_t; 66typedef int ext4_grpblk_t;
@@ -72,7 +75,7 @@ typedef __u32 ext4_lblk_t;
72typedef unsigned int ext4_group_t; 75typedef unsigned int ext4_group_t;
73 76
74/* 77/*
75 * Flags used in mballoc's allocation_context flags field. 78 * Flags used in mballoc's allocation_context flags field.
76 * 79 *
77 * Also used to show what's going on for debugging purposes when the 80 * Also used to show what's going on for debugging purposes when the
 78 * flag field is exported via the tracepoint interface 81 *
@@ -126,6 +129,29 @@ struct ext4_allocation_request {
126}; 129};
127 130
128/* 131/*
132 * Logical to physical block mapping, used by ext4_map_blocks()
133 *
134 * This structure is used to pass requests into ext4_map_blocks() as
135 * well as to store the information returned by ext4_map_blocks(). It
136 * takes less room on the stack than a struct buffer_head.
137 */
138#define EXT4_MAP_NEW (1 << BH_New)
139#define EXT4_MAP_MAPPED (1 << BH_Mapped)
140#define EXT4_MAP_UNWRITTEN (1 << BH_Unwritten)
141#define EXT4_MAP_BOUNDARY (1 << BH_Boundary)
142#define EXT4_MAP_UNINIT (1 << BH_Uninit)
143#define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\
144 EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\
145 EXT4_MAP_UNINIT)
146
147struct ext4_map_blocks {
148 ext4_fsblk_t m_pblk;
149 ext4_lblk_t m_lblk;
150 unsigned int m_len;
151 unsigned int m_flags;
152};
153
154/*
129 * For delayed allocation tracking 155 * For delayed allocation tracking
130 */ 156 */
131struct mpage_da_data { 157struct mpage_da_data {
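
struct ext4_map_blocks is a pure request/response pair: callers fill m_lblk/m_len, ext4_map_blocks() fills m_pblk/m_flags and returns the number of blocks mapped, exactly the shape used in the ext4_readdir() hunk earlier. A self-contained userspace sketch of that calling convention; the toy_* names and the identity-style mapping are assumptions for illustration only.

#include <stdio.h>

typedef unsigned long long fsblk_t;

struct toy_map_blocks {
	fsblk_t m_pblk;        /* out: first physical block */
	unsigned int m_lblk;   /* in:  first logical block */
	unsigned int m_len;    /* in:  number of blocks wanted */
	unsigned int m_flags;  /* out: MAPPED/NEW/... style flags */
};

#define TOY_MAP_MAPPED 0x1

/* Pretend every logical block maps into an extent starting at pblk 1000. */
static int toy_map_blocks(struct toy_map_blocks *map)
{
	map->m_pblk = 1000 + map->m_lblk;
	map->m_flags = TOY_MAP_MAPPED;
	return (int)map->m_len;          /* number of blocks mapped */
}

int main(void)
{
	struct toy_map_blocks map = { .m_lblk = 7, .m_len = 1 };
	int n = toy_map_blocks(&map);

	printf("mapped %d block(s): lblk %u -> pblk %llu (flags 0x%x)\n",
	       n, map.m_lblk, map.m_pblk, map.m_flags);
	return 0;
}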
@@ -321,6 +347,83 @@ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
321 return flags & EXT4_OTHER_FLMASK; 347 return flags & EXT4_OTHER_FLMASK;
322} 348}
323 349
350/*
351 * Inode flags used for atomic set/get
352 */
353enum {
354 EXT4_INODE_SECRM = 0, /* Secure deletion */
355 EXT4_INODE_UNRM = 1, /* Undelete */
356 EXT4_INODE_COMPR = 2, /* Compress file */
357 EXT4_INODE_SYNC = 3, /* Synchronous updates */
358 EXT4_INODE_IMMUTABLE = 4, /* Immutable file */
359 EXT4_INODE_APPEND = 5, /* writes to file may only append */
360 EXT4_INODE_NODUMP = 6, /* do not dump file */
361 EXT4_INODE_NOATIME = 7, /* do not update atime */
362/* Reserved for compression usage... */
363 EXT4_INODE_DIRTY = 8,
364 EXT4_INODE_COMPRBLK = 9, /* One or more compressed clusters */
365 EXT4_INODE_NOCOMPR = 10, /* Don't compress */
366 EXT4_INODE_ECOMPR = 11, /* Compression error */
367/* End compression flags --- maybe not all used */
368 EXT4_INODE_INDEX = 12, /* hash-indexed directory */
369 EXT4_INODE_IMAGIC = 13, /* AFS directory */
370 EXT4_INODE_JOURNAL_DATA = 14, /* file data should be journaled */
371 EXT4_INODE_NOTAIL = 15, /* file tail should not be merged */
372 EXT4_INODE_DIRSYNC = 16, /* dirsync behaviour (directories only) */
373 EXT4_INODE_TOPDIR = 17, /* Top of directory hierarchies*/
374 EXT4_INODE_HUGE_FILE = 18, /* Set to each huge file */
375 EXT4_INODE_EXTENTS = 19, /* Inode uses extents */
376 EXT4_INODE_EA_INODE = 21, /* Inode used for large EA */
377 EXT4_INODE_EOFBLOCKS = 22, /* Blocks allocated beyond EOF */
378 EXT4_INODE_RESERVED = 31, /* reserved for ext4 lib */
379};
380
381#define TEST_FLAG_VALUE(FLAG) (EXT4_##FLAG##_FL == (1 << EXT4_INODE_##FLAG))
382#define CHECK_FLAG_VALUE(FLAG) if (!TEST_FLAG_VALUE(FLAG)) { \
383 printk(KERN_EMERG "EXT4 flag fail: " #FLAG ": %d %d\n", \
384 EXT4_##FLAG##_FL, EXT4_INODE_##FLAG); BUG_ON(1); }
385
386/*
387 * Since it's pretty easy to mix up bit numbers and hex values, and we
388 * can't do a compile-time test for ENUM values, we use a run-time
389 * test to make sure that EXT4_XXX_FL is consistent with respect to
390 * EXT4_INODE_XXX. If all is well the printk and BUG_ON will all drop
391 * out so it won't cost any extra space in the compiled kernel image.
392 * But it's important that these values are the same, since we are
393 * using EXT4_INODE_XXX to test for the flag values, but EXT4_XX_FL
394 * must be consistent with the values of FS_XXX_FL defined in
395 * include/linux/fs.h and the on-disk values found in ext2, ext3, and
396 * ext4 filesystems, and of course the values defined in e2fsprogs.
397 *
 398 * It's not paranoia if Murphy's Law really *is* out to get you. :-)
399 */
400static inline void ext4_check_flag_values(void)
401{
402 CHECK_FLAG_VALUE(SECRM);
403 CHECK_FLAG_VALUE(UNRM);
404 CHECK_FLAG_VALUE(COMPR);
405 CHECK_FLAG_VALUE(SYNC);
406 CHECK_FLAG_VALUE(IMMUTABLE);
407 CHECK_FLAG_VALUE(APPEND);
408 CHECK_FLAG_VALUE(NODUMP);
409 CHECK_FLAG_VALUE(NOATIME);
410 CHECK_FLAG_VALUE(DIRTY);
411 CHECK_FLAG_VALUE(COMPRBLK);
412 CHECK_FLAG_VALUE(NOCOMPR);
413 CHECK_FLAG_VALUE(ECOMPR);
414 CHECK_FLAG_VALUE(INDEX);
415 CHECK_FLAG_VALUE(IMAGIC);
416 CHECK_FLAG_VALUE(JOURNAL_DATA);
417 CHECK_FLAG_VALUE(NOTAIL);
418 CHECK_FLAG_VALUE(DIRSYNC);
419 CHECK_FLAG_VALUE(TOPDIR);
420 CHECK_FLAG_VALUE(HUGE_FILE);
421 CHECK_FLAG_VALUE(EXTENTS);
422 CHECK_FLAG_VALUE(EA_INODE);
423 CHECK_FLAG_VALUE(EOFBLOCKS);
424 CHECK_FLAG_VALUE(RESERVED);
425}
426
324/* Used to pass group descriptor data when online resize is done */ 427/* Used to pass group descriptor data when online resize is done */
325struct ext4_new_group_input { 428struct ext4_new_group_input {
326 __u32 group; /* Group number for this data */ 429 __u32 group; /* Group number for this data */
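
The CHECK_FLAG_VALUE trick above works because the comparison between a mask and a shifted bit number is a compile-time constant: when the values agree, the branch folds away and costs nothing in the compiled image. The same pattern in miniature, as a standalone program (the TOY_* names are illustrative stand-ins):

#include <stdio.h>
#include <stdlib.h>

/* Bit numbers (TOY_INODE_x) must stay consistent with bit masks
 * (TOY_x_FL); when they match, the check is constant-folded to nothing. */
#define TOY_SYNC_FL		0x00000008
#define TOY_INODE_SYNC		3

#define TOY_TEST_FLAG_VALUE(FLAG) (TOY_##FLAG##_FL == (1 << TOY_INODE_##FLAG))
#define TOY_CHECK_FLAG_VALUE(FLAG) do { \
	if (!TOY_TEST_FLAG_VALUE(FLAG)) { \
		fprintf(stderr, "flag mismatch: " #FLAG "\n"); \
		abort(); \
	} \
} while (0)

int main(void)
{
	TOY_CHECK_FLAG_VALUE(SYNC); /* folds to nothing when consistent */
	puts("flag values consistent");
	return 0;
}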
@@ -332,6 +435,18 @@ struct ext4_new_group_input {
332 __u16 unused; 435 __u16 unused;
333}; 436};
334 437
438#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
439struct compat_ext4_new_group_input {
440 u32 group;
441 compat_u64 block_bitmap;
442 compat_u64 inode_bitmap;
443 compat_u64 inode_table;
444 u32 blocks_count;
445 u16 reserved_blocks;
446 u16 unused;
447};
448#endif
449
335/* The struct ext4_new_group_input in kernel space, with free_blocks_count */ 450/* The struct ext4_new_group_input in kernel space, with free_blocks_count */
336struct ext4_new_group_data { 451struct ext4_new_group_data {
337 __u32 group; 452 __u32 group;
@@ -355,7 +470,7 @@ struct ext4_new_group_data {
355#define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT (EXT4_GET_BLOCKS_UNINIT_EXT|\ 470#define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT (EXT4_GET_BLOCKS_UNINIT_EXT|\
356 EXT4_GET_BLOCKS_CREATE) 471 EXT4_GET_BLOCKS_CREATE)
357 /* Caller is from the delayed allocation writeout path, 472 /* Caller is from the delayed allocation writeout path,
358 so set the magic i_delalloc_reserve_flag after taking the 473 so set the magic i_delalloc_reserve_flag after taking the
359 inode allocation semaphore for */ 474 inode allocation semaphore for */
360#define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004 475#define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004
361 /* caller is from the direct IO path, request to creation of an 476 /* caller is from the direct IO path, request to creation of an
@@ -398,6 +513,7 @@ struct ext4_new_group_data {
398#define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12) 513#define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12)
399#define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent) 514#define EXT4_IOC_MOVE_EXT _IOWR('f', 15, struct move_extent)
400 515
516#if defined(__KERNEL__) && defined(CONFIG_COMPAT)
401/* 517/*
402 * ioctl commands in 32 bit emulation 518 * ioctl commands in 32 bit emulation
403 */ 519 */
@@ -408,11 +524,13 @@ struct ext4_new_group_data {
408#define EXT4_IOC32_GETRSVSZ _IOR('f', 5, int) 524#define EXT4_IOC32_GETRSVSZ _IOR('f', 5, int)
409#define EXT4_IOC32_SETRSVSZ _IOW('f', 6, int) 525#define EXT4_IOC32_SETRSVSZ _IOW('f', 6, int)
410#define EXT4_IOC32_GROUP_EXTEND _IOW('f', 7, unsigned int) 526#define EXT4_IOC32_GROUP_EXTEND _IOW('f', 7, unsigned int)
527#define EXT4_IOC32_GROUP_ADD _IOW('f', 8, struct compat_ext4_new_group_input)
411#ifdef CONFIG_JBD2_DEBUG 528#ifdef CONFIG_JBD2_DEBUG
412#define EXT4_IOC32_WAIT_FOR_READONLY _IOR('f', 99, int) 529#define EXT4_IOC32_WAIT_FOR_READONLY _IOR('f', 99, int)
413#endif 530#endif
414#define EXT4_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION 531#define EXT4_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION
415#define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION 532#define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION
533#endif
416 534
417 535
418/* 536/*
@@ -616,9 +734,8 @@ struct ext4_ext_cache {
616 */ 734 */
617struct ext4_inode_info { 735struct ext4_inode_info {
618 __le32 i_data[15]; /* unconverted */ 736 __le32 i_data[15]; /* unconverted */
619 __u32 i_flags;
620 ext4_fsblk_t i_file_acl;
621 __u32 i_dtime; 737 __u32 i_dtime;
738 ext4_fsblk_t i_file_acl;
622 739
623 /* 740 /*
624 * i_block_group is the number of the block group which contains 741 * i_block_group is the number of the block group which contains
@@ -629,6 +746,7 @@ struct ext4_inode_info {
629 */ 746 */
630 ext4_group_t i_block_group; 747 ext4_group_t i_block_group;
631 unsigned long i_state_flags; /* Dynamic state flags */ 748 unsigned long i_state_flags; /* Dynamic state flags */
749 unsigned long i_flags;
632 750
633 ext4_lblk_t i_dir_start_lookup; 751 ext4_lblk_t i_dir_start_lookup;
634#ifdef CONFIG_EXT4_FS_XATTR 752#ifdef CONFIG_EXT4_FS_XATTR
@@ -1062,22 +1180,25 @@ enum {
1062 EXT4_STATE_DA_ALLOC_CLOSE, /* Alloc DA blks on close */ 1180 EXT4_STATE_DA_ALLOC_CLOSE, /* Alloc DA blks on close */
1063 EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */ 1181 EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */
1064 EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/ 1182 EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/
1183 EXT4_STATE_NEWENTRY, /* File just added to dir */
1065}; 1184};
1066 1185
1067static inline int ext4_test_inode_state(struct inode *inode, int bit) 1186#define EXT4_INODE_BIT_FNS(name, field) \
1068{ 1187static inline int ext4_test_inode_##name(struct inode *inode, int bit) \
1069 return test_bit(bit, &EXT4_I(inode)->i_state_flags); 1188{ \
1070} 1189 return test_bit(bit, &EXT4_I(inode)->i_##field); \
1071 1190} \
1072static inline void ext4_set_inode_state(struct inode *inode, int bit) 1191static inline void ext4_set_inode_##name(struct inode *inode, int bit) \
1073{ 1192{ \
1074 set_bit(bit, &EXT4_I(inode)->i_state_flags); 1193 set_bit(bit, &EXT4_I(inode)->i_##field); \
1194} \
1195static inline void ext4_clear_inode_##name(struct inode *inode, int bit) \
1196{ \
1197 clear_bit(bit, &EXT4_I(inode)->i_##field); \
1075} 1198}
1076 1199
1077static inline void ext4_clear_inode_state(struct inode *inode, int bit) 1200EXT4_INODE_BIT_FNS(flag, flags)
1078{ 1201EXT4_INODE_BIT_FNS(state, state_flags)
1079 clear_bit(bit, &EXT4_I(inode)->i_state_flags);
1080}
1081#else 1202#else
1082/* Assume that user mode programs are passing in an ext4fs superblock, not 1203/* Assume that user mode programs are passing in an ext4fs superblock, not
1083 * a kernel struct super_block. This will allow us to call the feature-test 1204 * a kernel struct super_block. This will allow us to call the feature-test
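
EXT4_INODE_BIT_FNS stamps out a test/set/clear triplet per flags word, so i_flags and i_state_flags get identical accessors from one definition. A userspace re-creation of the pattern, using plain bit operations in place of the kernel's atomic test_bit()/set_bit()/clear_bit(); the toy_* names are illustrative only.

#include <stdio.h>

struct toy_inode {
	unsigned long flags;
	unsigned long state_flags;
};

#define TOY_INODE_BIT_FNS(name, field)					\
static int toy_test_inode_##name(struct toy_inode *i, int bit)		\
{ return (i->field >> bit) & 1UL; }					\
static void toy_set_inode_##name(struct toy_inode *i, int bit)		\
{ i->field |= 1UL << bit; }						\
static void toy_clear_inode_##name(struct toy_inode *i, int bit)	\
{ i->field &= ~(1UL << bit); }

TOY_INODE_BIT_FNS(flag, flags)
TOY_INODE_BIT_FNS(state, state_flags)

int main(void)
{
	struct toy_inode ino = { 0, 0 };

	toy_set_inode_flag(&ino, 12);          /* e.g. the INDEX bit */
	printf("flag 12: %d\n", toy_test_inode_flag(&ino, 12));
	toy_clear_inode_flag(&ino, 12);
	printf("flag 12: %d\n", toy_test_inode_flag(&ino, 12));
	/* the state triplet is generated the same way */
	(void)toy_test_inode_state;
	(void)toy_set_inode_state;
	(void)toy_clear_inode_state;
	return 0;
}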
@@ -1264,7 +1385,7 @@ struct ext4_dir_entry_2 {
1264 1385
1265#define is_dx(dir) (EXT4_HAS_COMPAT_FEATURE(dir->i_sb, \ 1386#define is_dx(dir) (EXT4_HAS_COMPAT_FEATURE(dir->i_sb, \
1266 EXT4_FEATURE_COMPAT_DIR_INDEX) && \ 1387 EXT4_FEATURE_COMPAT_DIR_INDEX) && \
1267 (EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) 1388 ext4_test_inode_flag((dir), EXT4_INODE_INDEX))
1268#define EXT4_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT4_LINK_MAX) 1389#define EXT4_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT4_LINK_MAX)
1269#define EXT4_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1) 1390#define EXT4_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1)
1270 1391
@@ -1398,7 +1519,7 @@ extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash,
1398extern void ext4_htree_free_dir_info(struct dir_private_info *p); 1519extern void ext4_htree_free_dir_info(struct dir_private_info *p);
1399 1520
1400/* fsync.c */ 1521/* fsync.c */
1401extern int ext4_sync_file(struct file *, struct dentry *, int); 1522extern int ext4_sync_file(struct file *, int);
1402 1523
1403/* hash.c */ 1524/* hash.c */
1404extern int ext4fs_dirhash(const char *name, int len, struct 1525extern int ext4fs_dirhash(const char *name, int len, struct
@@ -1678,6 +1799,7 @@ struct ext4_group_info {
1678 ext4_grpblk_t bb_first_free; /* first free block */ 1799 ext4_grpblk_t bb_first_free; /* first free block */
1679 ext4_grpblk_t bb_free; /* total free blocks */ 1800 ext4_grpblk_t bb_free; /* total free blocks */
1680 ext4_grpblk_t bb_fragments; /* nr of freespace fragments */ 1801 ext4_grpblk_t bb_fragments; /* nr of freespace fragments */
1802 ext4_grpblk_t bb_largest_free_order;/* order of largest frag in BG */
1681 struct list_head bb_prealloc_list; 1803 struct list_head bb_prealloc_list;
1682#ifdef DOUBLE_CHECK 1804#ifdef DOUBLE_CHECK
1683 void *bb_bitmap; 1805 void *bb_bitmap;
@@ -1772,9 +1894,8 @@ extern int ext4_ext_tree_init(handle_t *handle, struct inode *);
1772extern int ext4_ext_writepage_trans_blocks(struct inode *, int); 1894extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
1773extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, 1895extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
1774 int chunk); 1896 int chunk);
1775extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, 1897extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
1776 ext4_lblk_t iblock, unsigned int max_blocks, 1898 struct ext4_map_blocks *map, int flags);
1777 struct buffer_head *bh_result, int flags);
1778extern void ext4_ext_truncate(struct inode *); 1899extern void ext4_ext_truncate(struct inode *);
1779extern void ext4_ext_init(struct super_block *); 1900extern void ext4_ext_init(struct super_block *);
1780extern void ext4_ext_release(struct super_block *); 1901extern void ext4_ext_release(struct super_block *);
@@ -1782,6 +1903,8 @@ extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
1782 loff_t len); 1903 loff_t len);
1783extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, 1904extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
1784 ssize_t len); 1905 ssize_t len);
1906extern int ext4_map_blocks(handle_t *handle, struct inode *inode,
1907 struct ext4_map_blocks *map, int flags);
1785extern int ext4_get_blocks(handle_t *handle, struct inode *inode, 1908extern int ext4_get_blocks(handle_t *handle, struct inode *inode,
1786 sector_t block, unsigned int max_blocks, 1909 sector_t block, unsigned int max_blocks,
1787 struct buffer_head *bh, int flags); 1910 struct buffer_head *bh, int flags);
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index b79ad5126468..dade0c024797 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -273,7 +273,7 @@ static inline int ext4_should_journal_data(struct inode *inode)
273 return 1; 273 return 1;
274 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) 274 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
275 return 1; 275 return 1;
276 if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) 276 if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
277 return 1; 277 return 1;
278 return 0; 278 return 0;
279} 279}
@@ -284,7 +284,7 @@ static inline int ext4_should_order_data(struct inode *inode)
284 return 0; 284 return 0;
285 if (!S_ISREG(inode->i_mode)) 285 if (!S_ISREG(inode->i_mode))
286 return 0; 286 return 0;
287 if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) 287 if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
288 return 0; 288 return 0;
289 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) 289 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
290 return 1; 290 return 1;
@@ -297,7 +297,7 @@ static inline int ext4_should_writeback_data(struct inode *inode)
297 return 0; 297 return 0;
298 if (EXT4_JOURNAL(inode) == NULL) 298 if (EXT4_JOURNAL(inode) == NULL)
299 return 1; 299 return 1;
300 if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) 300 if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
301 return 0; 301 return 0;
302 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) 302 if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
303 return 1; 303 return 1;
@@ -321,7 +321,7 @@ static inline int ext4_should_dioread_nolock(struct inode *inode)
321 return 0; 321 return 0;
322 if (!S_ISREG(inode->i_mode)) 322 if (!S_ISREG(inode->i_mode))
323 return 0; 323 return 0;
324 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) 324 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
325 return 0; 325 return 0;
326 if (ext4_should_journal_data(inode)) 326 if (ext4_should_journal_data(inode))
327 return 0; 327 return 0;
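
The four helpers above share one decision chain: the per-inode EXT4_INODE_JOURNAL_DATA flag overrides the mount-wide data mode, and non-regular files are always journaled. A toy model of that chain (the toy_* names and the simplified mode enum are assumptions, not kernel API):

#include <stdbool.h>
#include <stdio.h>

enum toy_mode { TOY_JOURNAL, TOY_ORDERED, TOY_WRITEBACK };

struct toy_inode {
	bool is_reg;
	bool journal_data_flag; /* models EXT4_INODE_JOURNAL_DATA */
	enum toy_mode mount_mode;
};

static bool toy_should_journal_data(const struct toy_inode *i)
{
	if (!i->is_reg)
		return true; /* directories etc. are always journaled */
	if (i->mount_mode == TOY_JOURNAL)
		return true;
	if (i->journal_data_flag)
		return true; /* per-inode flag wins over mount mode */
	return false;
}

int main(void)
{
	struct toy_inode a = { true, false, TOY_ORDERED };
	struct toy_inode b = { true, true, TOY_WRITEBACK };

	printf("ordered file journals data: %d\n", toy_should_journal_data(&a));
	printf("flagged file journals data: %d\n", toy_should_journal_data(&b));
	return 0;
}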
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 236b834b4ca8..377309c1af65 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -107,11 +107,8 @@ static int ext4_ext_truncate_extend_restart(handle_t *handle,
107 if (err <= 0) 107 if (err <= 0)
108 return err; 108 return err;
109 err = ext4_truncate_restart_trans(handle, inode, needed); 109 err = ext4_truncate_restart_trans(handle, inode, needed);
110 /* 110 if (err == 0)
111 * We have dropped i_data_sem so someone might have cached again 111 err = -EAGAIN;
112 * an extent we are going to truncate.
113 */
114 ext4_ext_invalidate_cache(inode);
115 112
116 return err; 113 return err;
117} 114}
@@ -185,10 +182,10 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
185 if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) { 182 if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
186 /* 183 /*
187 * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME 184 * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME
188 * block groups per flexgroup, reserve the first block 185 * block groups per flexgroup, reserve the first block
189 * group for directories and special files. Regular 186 * group for directories and special files. Regular
190 * files will start at the second block group. This 187 * files will start at the second block group. This
191 * tends to speed up directory access and improves 188 * tends to speed up directory access and improves
192 * fsck times. 189 * fsck times.
193 */ 190 */
194 block_group &= ~(flex_size-1); 191 block_group &= ~(flex_size-1);
@@ -439,10 +436,10 @@ static int __ext4_ext_check(const char *function, struct inode *inode,
439 return 0; 436 return 0;
440 437
441corrupted: 438corrupted:
442 __ext4_error(inode->i_sb, function, 439 ext4_error_inode(function, inode,
443 "bad header/extent in inode #%lu: %s - magic %x, " 440 "bad header/extent: %s - magic %x, "
444 "entries %u, max %u(%u), depth %u(%u)", 441 "entries %u, max %u(%u), depth %u(%u)",
445 inode->i_ino, error_msg, le16_to_cpu(eh->eh_magic), 442 error_msg, le16_to_cpu(eh->eh_magic),
446 le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max), 443 le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
447 max, le16_to_cpu(eh->eh_depth), depth); 444 max, le16_to_cpu(eh->eh_depth), depth);
448 445
@@ -1622,9 +1619,7 @@ int ext4_ext_try_to_merge(struct inode *inode,
1622 merge_done = 1; 1619 merge_done = 1;
1623 WARN_ON(eh->eh_entries == 0); 1620 WARN_ON(eh->eh_entries == 0);
1624 if (!eh->eh_entries) 1621 if (!eh->eh_entries)
1625 ext4_error(inode->i_sb, 1622 EXT4_ERROR_INODE(inode, "eh->eh_entries = 0!");
1626 "inode#%lu, eh->eh_entries = 0!",
1627 inode->i_ino);
1628 } 1623 }
1629 1624
1630 return merge_done; 1625 return merge_done;
@@ -2039,7 +2034,7 @@ ext4_ext_in_cache(struct inode *inode, ext4_lblk_t block,
2039 struct ext4_ext_cache *cex; 2034 struct ext4_ext_cache *cex;
2040 int ret = EXT4_EXT_CACHE_NO; 2035 int ret = EXT4_EXT_CACHE_NO;
2041 2036
2042 /* 2037 /*
2043 * We borrow i_block_reservation_lock to protect i_cached_extent 2038 * We borrow i_block_reservation_lock to protect i_cached_extent
2044 */ 2039 */
2045 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 2040 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
@@ -2361,7 +2356,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
2361 int depth = ext_depth(inode); 2356 int depth = ext_depth(inode);
2362 struct ext4_ext_path *path; 2357 struct ext4_ext_path *path;
2363 handle_t *handle; 2358 handle_t *handle;
2364 int i = 0, err = 0; 2359 int i, err;
2365 2360
2366 ext_debug("truncate since %u\n", start); 2361 ext_debug("truncate since %u\n", start);
2367 2362
@@ -2370,23 +2365,26 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
2370 if (IS_ERR(handle)) 2365 if (IS_ERR(handle))
2371 return PTR_ERR(handle); 2366 return PTR_ERR(handle);
2372 2367
2368again:
2373 ext4_ext_invalidate_cache(inode); 2369 ext4_ext_invalidate_cache(inode);
2374 2370
2375 /* 2371 /*
2376 * We start scanning from right side, freeing all the blocks 2372 * We start scanning from right side, freeing all the blocks
2377 * after i_size and walking into the tree depth-wise. 2373 * after i_size and walking into the tree depth-wise.
2378 */ 2374 */
2375 depth = ext_depth(inode);
2379 path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_NOFS); 2376 path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_NOFS);
2380 if (path == NULL) { 2377 if (path == NULL) {
2381 ext4_journal_stop(handle); 2378 ext4_journal_stop(handle);
2382 return -ENOMEM; 2379 return -ENOMEM;
2383 } 2380 }
2381 path[0].p_depth = depth;
2384 path[0].p_hdr = ext_inode_hdr(inode); 2382 path[0].p_hdr = ext_inode_hdr(inode);
2385 if (ext4_ext_check(inode, path[0].p_hdr, depth)) { 2383 if (ext4_ext_check(inode, path[0].p_hdr, depth)) {
2386 err = -EIO; 2384 err = -EIO;
2387 goto out; 2385 goto out;
2388 } 2386 }
2389 path[0].p_depth = depth; 2387 i = err = 0;
2390 2388
2391 while (i >= 0 && err == 0) { 2389 while (i >= 0 && err == 0) {
2392 if (i == depth) { 2390 if (i == depth) {
@@ -2480,6 +2478,8 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
2480out: 2478out:
2481 ext4_ext_drop_refs(path); 2479 ext4_ext_drop_refs(path);
2482 kfree(path); 2480 kfree(path);
2481 if (err == -EAGAIN)
2482 goto again;
2483 ext4_journal_stop(handle); 2483 ext4_journal_stop(handle);
2484 2484
2485 return err; 2485 return err;
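
The retry plumbing above replaces an in-place cache invalidation with an explicit restart: when ext4_truncate_restart_trans() succeeds, the caller turns that into -EAGAIN, drops the path, and jumps back to the again: label to re-read the tree depth under fresh state. A minimal userspace model of the same restart idiom (the toy_* names are illustrative):

#include <errno.h>
#include <stdio.h>

static int attempts_left = 2;

/* A helper that had to drop locks signals "state may be stale". */
static int toy_do_work(void)
{
	if (attempts_left-- > 0)
		return -EAGAIN;	/* caller must restart from scratch */
	return 0;
}

static int toy_remove_space(void)
{
	int err;

again:
	/* re-read depth, invalidate caches, rebuild the path here */
	err = toy_do_work();
	if (err == -EAGAIN)
		goto again;
	return err;
}

int main(void)
{
	printf("toy_remove_space() = %d\n", toy_remove_space());
	return 0;
}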
@@ -2544,7 +2544,7 @@ static void bi_complete(struct bio *bio, int error)
2544/* FIXME!! we need to try to merge to left or right after zero-out */ 2544/* FIXME!! we need to try to merge to left or right after zero-out */
2545static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) 2545static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
2546{ 2546{
2547 int ret = -EIO; 2547 int ret;
2548 struct bio *bio; 2548 struct bio *bio;
2549 int blkbits, blocksize; 2549 int blkbits, blocksize;
2550 sector_t ee_pblock; 2550 sector_t ee_pblock;
@@ -2568,6 +2568,9 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
2568 len = ee_len; 2568 len = ee_len;
2569 2569
2570 bio = bio_alloc(GFP_NOIO, len); 2570 bio = bio_alloc(GFP_NOIO, len);
2571 if (!bio)
2572 return -ENOMEM;
2573
2571 bio->bi_sector = ee_pblock; 2574 bio->bi_sector = ee_pblock;
2572 bio->bi_bdev = inode->i_sb->s_bdev; 2575 bio->bi_bdev = inode->i_sb->s_bdev;
2573 2576
@@ -2595,22 +2598,20 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
2595 submit_bio(WRITE, bio); 2598 submit_bio(WRITE, bio);
2596 wait_for_completion(&event); 2599 wait_for_completion(&event);
2597 2600
2598 if (test_bit(BIO_UPTODATE, &bio->bi_flags)) 2601 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
2599 ret = 0; 2602 bio_put(bio);
2600 else { 2603 return -EIO;
2601 ret = -EIO;
2602 break;
2603 } 2604 }
2604 bio_put(bio); 2605 bio_put(bio);
2605 ee_len -= done; 2606 ee_len -= done;
2606 ee_pblock += done << (blkbits - 9); 2607 ee_pblock += done << (blkbits - 9);
2607 } 2608 }
2608 return ret; 2609 return 0;
2609} 2610}
2610 2611
2611#define EXT4_EXT_ZERO_LEN 7 2612#define EXT4_EXT_ZERO_LEN 7
2612/* 2613/*
2613 * This function is called by ext4_ext_get_blocks() if someone tries to write 2614 * This function is called by ext4_ext_map_blocks() if someone tries to write
2614 * to an uninitialized extent. It may result in splitting the uninitialized 2615 * to an uninitialized extent. It may result in splitting the uninitialized
 2615 * extent into multiple extents (up to three - one initialized and two 2616 * extent into multiple extents (up to three - one initialized and two
2616 * uninitialized). 2617 * uninitialized).
@@ -2620,39 +2621,55 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
 2620 * c> Splits in three extents: Someone is writing in the middle of the extent 2621 * c> Splits in three extents: Someone is writing in the middle of the extent
2621 */ 2622 */
2622static int ext4_ext_convert_to_initialized(handle_t *handle, 2623static int ext4_ext_convert_to_initialized(handle_t *handle,
2623 struct inode *inode, 2624 struct inode *inode,
2624 struct ext4_ext_path *path, 2625 struct ext4_map_blocks *map,
2625 ext4_lblk_t iblock, 2626 struct ext4_ext_path *path)
2626 unsigned int max_blocks)
2627{ 2627{
2628 struct ext4_extent *ex, newex, orig_ex; 2628 struct ext4_extent *ex, newex, orig_ex;
2629 struct ext4_extent *ex1 = NULL; 2629 struct ext4_extent *ex1 = NULL;
2630 struct ext4_extent *ex2 = NULL; 2630 struct ext4_extent *ex2 = NULL;
2631 struct ext4_extent *ex3 = NULL; 2631 struct ext4_extent *ex3 = NULL;
2632 struct ext4_extent_header *eh; 2632 struct ext4_extent_header *eh;
2633 ext4_lblk_t ee_block; 2633 ext4_lblk_t ee_block, eof_block;
2634 unsigned int allocated, ee_len, depth; 2634 unsigned int allocated, ee_len, depth;
2635 ext4_fsblk_t newblock; 2635 ext4_fsblk_t newblock;
2636 int err = 0; 2636 int err = 0;
2637 int ret = 0; 2637 int ret = 0;
2638 int may_zeroout;
2639
2640 ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical"
2641 "block %llu, max_blocks %u\n", inode->i_ino,
2642 (unsigned long long)map->m_lblk, map->m_len);
2643
2644 eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
2645 inode->i_sb->s_blocksize_bits;
2646 if (eof_block < map->m_lblk + map->m_len)
2647 eof_block = map->m_lblk + map->m_len;
2638 2648
2639 depth = ext_depth(inode); 2649 depth = ext_depth(inode);
2640 eh = path[depth].p_hdr; 2650 eh = path[depth].p_hdr;
2641 ex = path[depth].p_ext; 2651 ex = path[depth].p_ext;
2642 ee_block = le32_to_cpu(ex->ee_block); 2652 ee_block = le32_to_cpu(ex->ee_block);
2643 ee_len = ext4_ext_get_actual_len(ex); 2653 ee_len = ext4_ext_get_actual_len(ex);
2644 allocated = ee_len - (iblock - ee_block); 2654 allocated = ee_len - (map->m_lblk - ee_block);
2645 newblock = iblock - ee_block + ext_pblock(ex); 2655 newblock = map->m_lblk - ee_block + ext_pblock(ex);
2656
2646 ex2 = ex; 2657 ex2 = ex;
2647 orig_ex.ee_block = ex->ee_block; 2658 orig_ex.ee_block = ex->ee_block;
2648 orig_ex.ee_len = cpu_to_le16(ee_len); 2659 orig_ex.ee_len = cpu_to_le16(ee_len);
2649 ext4_ext_store_pblock(&orig_ex, ext_pblock(ex)); 2660 ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
2650 2661
2662 /*
 2663	 * It is safe to convert an extent to initialized via explicit
 2664	 * zeroout only if the extent is fully inside i_size or new_size.
2665 */
2666 may_zeroout = ee_block + ee_len <= eof_block;
2667
2651 err = ext4_ext_get_access(handle, inode, path + depth); 2668 err = ext4_ext_get_access(handle, inode, path + depth);
2652 if (err) 2669 if (err)
2653 goto out; 2670 goto out;
 2654	 /* If extent has less than 2*EXT4_EXT_ZERO_LEN, zero out directly */ 2671	 /* If extent has less than 2*EXT4_EXT_ZERO_LEN, zero out directly */
2655 if (ee_len <= 2*EXT4_EXT_ZERO_LEN) { 2672 if (ee_len <= 2*EXT4_EXT_ZERO_LEN && may_zeroout) {
2656 err = ext4_ext_zeroout(inode, &orig_ex); 2673 err = ext4_ext_zeroout(inode, &orig_ex);
2657 if (err) 2674 if (err)
2658 goto fix_extent_len; 2675 goto fix_extent_len;
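
may_zeroout above gates every zeroout fallback: converting an uninitialized extent by zero-filling is only safe when the extent lies entirely below EOF (with eof_block extended to cover the write in flight), otherwise blocks past i_size would silently become initialized. The predicate in isolation, as a checkable userspace sketch (the block size and toy types are assumptions):

#include <stdbool.h>
#include <stdio.h>

#define BLOCKSIZE_BITS 12 /* 4 KiB blocks */

static bool may_zeroout(unsigned long long i_size,
			unsigned int m_lblk, unsigned int m_len,
			unsigned int ee_block, unsigned int ee_len)
{
	/* eof_block = i_size rounded up to a block... */
	unsigned long long eof_block =
		(i_size + (1ULL << BLOCKSIZE_BITS) - 1) >> BLOCKSIZE_BITS;

	/* ...extended to cover the block range being written */
	if (eof_block < (unsigned long long)m_lblk + m_len)
		eof_block = (unsigned long long)m_lblk + m_len;

	return (unsigned long long)ee_block + ee_len <= eof_block;
}

int main(void)
{
	/* extent [100, 110) with i_size at block 105: not fully inside */
	printf("%d\n", may_zeroout(105ULL << 12, 100, 2, 100, 10));  /* 0 */
	/* same extent once the write in flight extends to block 110 */
	printf("%d\n", may_zeroout(105ULL << 12, 100, 10, 100, 10)); /* 1 */
	return 0;
}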
@@ -2665,10 +2682,10 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2665 return allocated; 2682 return allocated;
2666 } 2683 }
2667 2684
2668 /* ex1: ee_block to iblock - 1 : uninitialized */ 2685 /* ex1: ee_block to map->m_lblk - 1 : uninitialized */
2669 if (iblock > ee_block) { 2686 if (map->m_lblk > ee_block) {
2670 ex1 = ex; 2687 ex1 = ex;
2671 ex1->ee_len = cpu_to_le16(iblock - ee_block); 2688 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
2672 ext4_ext_mark_uninitialized(ex1); 2689 ext4_ext_mark_uninitialized(ex1);
2673 ex2 = &newex; 2690 ex2 = &newex;
2674 } 2691 }
@@ -2677,15 +2694,15 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2677 * we insert ex3, if ex1 is NULL. This is to avoid temporary 2694 * we insert ex3, if ex1 is NULL. This is to avoid temporary
2678 * overlap of blocks. 2695 * overlap of blocks.
2679 */ 2696 */
2680 if (!ex1 && allocated > max_blocks) 2697 if (!ex1 && allocated > map->m_len)
2681 ex2->ee_len = cpu_to_le16(max_blocks); 2698 ex2->ee_len = cpu_to_le16(map->m_len);
 2682	 /* ex3: to ee_block + ee_len : uninitialized */ 2699	 /* ex3: to ee_block + ee_len : uninitialized */
2683 if (allocated > max_blocks) { 2700 if (allocated > map->m_len) {
2684 unsigned int newdepth; 2701 unsigned int newdepth;
 2685	 /* If extent has less than EXT4_EXT_ZERO_LEN, zero out directly */ 2702	 /* If extent has less than EXT4_EXT_ZERO_LEN, zero out directly */
2686 if (allocated <= EXT4_EXT_ZERO_LEN) { 2703 if (allocated <= EXT4_EXT_ZERO_LEN && may_zeroout) {
2687 /* 2704 /*
 2688	 * iblock == ee_block is handled by the zeroout 2705	 * map->m_lblk == ee_block is handled by the zeroout
2689 * at the beginning. 2706 * at the beginning.
2690 * Mark first half uninitialized. 2707 * Mark first half uninitialized.
2691 * Mark second half initialized and zero out the 2708 * Mark second half initialized and zero out the
@@ -2698,7 +2715,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2698 ext4_ext_dirty(handle, inode, path + depth); 2715 ext4_ext_dirty(handle, inode, path + depth);
2699 2716
2700 ex3 = &newex; 2717 ex3 = &newex;
2701 ex3->ee_block = cpu_to_le32(iblock); 2718 ex3->ee_block = cpu_to_le32(map->m_lblk);
2702 ext4_ext_store_pblock(ex3, newblock); 2719 ext4_ext_store_pblock(ex3, newblock);
2703 ex3->ee_len = cpu_to_le16(allocated); 2720 ex3->ee_len = cpu_to_le16(allocated);
2704 err = ext4_ext_insert_extent(handle, inode, path, 2721 err = ext4_ext_insert_extent(handle, inode, path,
@@ -2711,7 +2728,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2711 ex->ee_len = orig_ex.ee_len; 2728 ex->ee_len = orig_ex.ee_len;
2712 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 2729 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2713 ext4_ext_dirty(handle, inode, path + depth); 2730 ext4_ext_dirty(handle, inode, path + depth);
2714 /* blocks available from iblock */ 2731 /* blocks available from map->m_lblk */
2715 return allocated; 2732 return allocated;
2716 2733
2717 } else if (err) 2734 } else if (err)
@@ -2733,8 +2750,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2733 */ 2750 */
2734 depth = ext_depth(inode); 2751 depth = ext_depth(inode);
2735 ext4_ext_drop_refs(path); 2752 ext4_ext_drop_refs(path);
2736 path = ext4_ext_find_extent(inode, 2753 path = ext4_ext_find_extent(inode, map->m_lblk,
2737 iblock, path); 2754 path);
2738 if (IS_ERR(path)) { 2755 if (IS_ERR(path)) {
2739 err = PTR_ERR(path); 2756 err = PTR_ERR(path);
2740 return err; 2757 return err;
@@ -2754,12 +2771,12 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2754 return allocated; 2771 return allocated;
2755 } 2772 }
2756 ex3 = &newex; 2773 ex3 = &newex;
2757 ex3->ee_block = cpu_to_le32(iblock + max_blocks); 2774 ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len);
2758 ext4_ext_store_pblock(ex3, newblock + max_blocks); 2775 ext4_ext_store_pblock(ex3, newblock + map->m_len);
2759 ex3->ee_len = cpu_to_le16(allocated - max_blocks); 2776 ex3->ee_len = cpu_to_le16(allocated - map->m_len);
2760 ext4_ext_mark_uninitialized(ex3); 2777 ext4_ext_mark_uninitialized(ex3);
2761 err = ext4_ext_insert_extent(handle, inode, path, ex3, 0); 2778 err = ext4_ext_insert_extent(handle, inode, path, ex3, 0);
2762 if (err == -ENOSPC) { 2779 if (err == -ENOSPC && may_zeroout) {
2763 err = ext4_ext_zeroout(inode, &orig_ex); 2780 err = ext4_ext_zeroout(inode, &orig_ex);
2764 if (err) 2781 if (err)
2765 goto fix_extent_len; 2782 goto fix_extent_len;
@@ -2769,7 +2786,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2769 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 2786 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2770 ext4_ext_dirty(handle, inode, path + depth); 2787 ext4_ext_dirty(handle, inode, path + depth);
2771 /* zeroed the full extent */ 2788 /* zeroed the full extent */
2772 /* blocks available from iblock */ 2789 /* blocks available from map->m_lblk */
2773 return allocated; 2790 return allocated;
2774 2791
2775 } else if (err) 2792 } else if (err)
@@ -2783,11 +2800,13 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2783 * update the extent length after successful insert of the 2800 * update the extent length after successful insert of the
2784 * split extent 2801 * split extent
2785 */ 2802 */
2786 orig_ex.ee_len = cpu_to_le16(ee_len - 2803 ee_len -= ext4_ext_get_actual_len(ex3);
2787 ext4_ext_get_actual_len(ex3)); 2804 orig_ex.ee_len = cpu_to_le16(ee_len);
2805 may_zeroout = ee_block + ee_len <= eof_block;
2806
2788 depth = newdepth; 2807 depth = newdepth;
2789 ext4_ext_drop_refs(path); 2808 ext4_ext_drop_refs(path);
2790 path = ext4_ext_find_extent(inode, iblock, path); 2809 path = ext4_ext_find_extent(inode, map->m_lblk, path);
2791 if (IS_ERR(path)) { 2810 if (IS_ERR(path)) {
2792 err = PTR_ERR(path); 2811 err = PTR_ERR(path);
2793 goto out; 2812 goto out;
@@ -2801,14 +2820,14 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2801 if (err) 2820 if (err)
2802 goto out; 2821 goto out;
2803 2822
2804 allocated = max_blocks; 2823 allocated = map->m_len;
2805 2824
2806 /* If extent has less than EXT4_EXT_ZERO_LEN and we are trying 2825 /* If extent has less than EXT4_EXT_ZERO_LEN and we are trying
 2807	 * to insert an extent in the middle, zero out directly 2826	 * to insert an extent in the middle, zero out directly
2808 * otherwise give the extent a chance to merge to left 2827 * otherwise give the extent a chance to merge to left
2809 */ 2828 */
2810 if (le16_to_cpu(orig_ex.ee_len) <= EXT4_EXT_ZERO_LEN && 2829 if (le16_to_cpu(orig_ex.ee_len) <= EXT4_EXT_ZERO_LEN &&
2811 iblock != ee_block) { 2830 map->m_lblk != ee_block && may_zeroout) {
2812 err = ext4_ext_zeroout(inode, &orig_ex); 2831 err = ext4_ext_zeroout(inode, &orig_ex);
2813 if (err) 2832 if (err)
2814 goto fix_extent_len; 2833 goto fix_extent_len;
@@ -2818,7 +2837,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2818 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 2837 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
2819 ext4_ext_dirty(handle, inode, path + depth); 2838 ext4_ext_dirty(handle, inode, path + depth);
2820 /* zero out the first half */ 2839 /* zero out the first half */
2821 /* blocks available from iblock */ 2840 /* blocks available from map->m_lblk */
2822 return allocated; 2841 return allocated;
2823 } 2842 }
2824 } 2843 }
@@ -2829,12 +2848,12 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2829 */ 2848 */
2830 if (ex1 && ex1 != ex) { 2849 if (ex1 && ex1 != ex) {
2831 ex1 = ex; 2850 ex1 = ex;
2832 ex1->ee_len = cpu_to_le16(iblock - ee_block); 2851 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
2833 ext4_ext_mark_uninitialized(ex1); 2852 ext4_ext_mark_uninitialized(ex1);
2834 ex2 = &newex; 2853 ex2 = &newex;
2835 } 2854 }
2836 /* ex2: iblock to iblock + maxblocks-1 : initialised */ 2855 /* ex2: map->m_lblk to map->m_lblk + maxblocks-1 : initialised */
2837 ex2->ee_block = cpu_to_le32(iblock); 2856 ex2->ee_block = cpu_to_le32(map->m_lblk);
2838 ext4_ext_store_pblock(ex2, newblock); 2857 ext4_ext_store_pblock(ex2, newblock);
2839 ex2->ee_len = cpu_to_le16(allocated); 2858 ex2->ee_len = cpu_to_le16(allocated);
2840 if (ex2 != ex) 2859 if (ex2 != ex)
@@ -2877,7 +2896,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
2877 goto out; 2896 goto out;
2878insert: 2897insert:
2879 err = ext4_ext_insert_extent(handle, inode, path, &newex, 0); 2898 err = ext4_ext_insert_extent(handle, inode, path, &newex, 0);
2880 if (err == -ENOSPC) { 2899 if (err == -ENOSPC && may_zeroout) {
2881 err = ext4_ext_zeroout(inode, &orig_ex); 2900 err = ext4_ext_zeroout(inode, &orig_ex);
2882 if (err) 2901 if (err)
2883 goto fix_extent_len; 2902 goto fix_extent_len;
@@ -2904,7 +2923,7 @@ fix_extent_len:
2904} 2923}
2905 2924
2906/* 2925/*
2907 * This function is called by ext4_ext_get_blocks() from 2926 * This function is called by ext4_ext_map_blocks() from
2908 * ext4_get_blocks_dio_write() when a DIO write targets 2927 * ext4_get_blocks_dio_write() when a DIO write targets
2909 * an uninitialized extent. 2928 * an uninitialized extent.
2910 * 2929 *
@@ -2927,9 +2946,8 @@ fix_extent_len:
2927 */ 2946 */
2928static int ext4_split_unwritten_extents(handle_t *handle, 2947static int ext4_split_unwritten_extents(handle_t *handle,
2929 struct inode *inode, 2948 struct inode *inode,
2949 struct ext4_map_blocks *map,
2930 struct ext4_ext_path *path, 2950 struct ext4_ext_path *path,
2931 ext4_lblk_t iblock,
2932 unsigned int max_blocks,
2933 int flags) 2951 int flags)
2934{ 2952{
2935 struct ext4_extent *ex, newex, orig_ex; 2953 struct ext4_extent *ex, newex, orig_ex;
@@ -2937,41 +2955,55 @@ static int ext4_split_unwritten_extents(handle_t *handle,
2937 struct ext4_extent *ex2 = NULL; 2955 struct ext4_extent *ex2 = NULL;
2938 struct ext4_extent *ex3 = NULL; 2956 struct ext4_extent *ex3 = NULL;
2939 struct ext4_extent_header *eh; 2957 struct ext4_extent_header *eh;
2940 ext4_lblk_t ee_block; 2958 ext4_lblk_t ee_block, eof_block;
2941 unsigned int allocated, ee_len, depth; 2959 unsigned int allocated, ee_len, depth;
2942 ext4_fsblk_t newblock; 2960 ext4_fsblk_t newblock;
2943 int err = 0; 2961 int err = 0;
2962 int may_zeroout;
2963
2964 ext_debug("ext4_split_unwritten_extents: inode %lu, logical"
2965 "block %llu, max_blocks %u\n", inode->i_ino,
2966 (unsigned long long)map->m_lblk, map->m_len);
2967
2968 eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
2969 inode->i_sb->s_blocksize_bits;
2970 if (eof_block < map->m_lblk + map->m_len)
2971 eof_block = map->m_lblk + map->m_len;
2944 2972
2945 ext_debug("ext4_split_unwritten_extents: inode %lu,"
2946 "iblock %llu, max_blocks %u\n", inode->i_ino,
2947 (unsigned long long)iblock, max_blocks);
2948 depth = ext_depth(inode); 2973 depth = ext_depth(inode);
2949 eh = path[depth].p_hdr; 2974 eh = path[depth].p_hdr;
2950 ex = path[depth].p_ext; 2975 ex = path[depth].p_ext;
2951 ee_block = le32_to_cpu(ex->ee_block); 2976 ee_block = le32_to_cpu(ex->ee_block);
2952 ee_len = ext4_ext_get_actual_len(ex); 2977 ee_len = ext4_ext_get_actual_len(ex);
2953 allocated = ee_len - (iblock - ee_block); 2978 allocated = ee_len - (map->m_lblk - ee_block);
2954 newblock = iblock - ee_block + ext_pblock(ex); 2979 newblock = map->m_lblk - ee_block + ext_pblock(ex);
2980
2955 ex2 = ex; 2981 ex2 = ex;
2956 orig_ex.ee_block = ex->ee_block; 2982 orig_ex.ee_block = ex->ee_block;
2957 orig_ex.ee_len = cpu_to_le16(ee_len); 2983 orig_ex.ee_len = cpu_to_le16(ee_len);
2958 ext4_ext_store_pblock(&orig_ex, ext_pblock(ex)); 2984 ext4_ext_store_pblock(&orig_ex, ext_pblock(ex));
2959 2985
2960 /* 2986 /*
2987 * It is safe to convert an extent to initialized via explicit
2988 * zeroout only if the extent lies fully inside i_size or new_size.
2989 */
2990 may_zeroout = ee_block + ee_len <= eof_block;
2991
2992 /*
2961 * If the uninitialized extent begins at the same logical 2993 * If the uninitialized extent begins at the same logical
2962 * block where the write begins, and the write completely 2994 * block where the write begins, and the write completely
2963 * covers the extent, then we don't need to split it. 2995 * covers the extent, then we don't need to split it.
2964 */ 2996 */
2965 if ((iblock == ee_block) && (allocated <= max_blocks)) 2997 if ((map->m_lblk == ee_block) && (allocated <= map->m_len))
2966 return allocated; 2998 return allocated;
2967 2999
2968 err = ext4_ext_get_access(handle, inode, path + depth); 3000 err = ext4_ext_get_access(handle, inode, path + depth);
2969 if (err) 3001 if (err)
2970 goto out; 3002 goto out;
2971 /* ex1: ee_block to iblock - 1 : uninitialized */ 3003 /* ex1: ee_block to map->m_lblk - 1 : uninitialized */
2972 if (iblock > ee_block) { 3004 if (map->m_lblk > ee_block) {
2973 ex1 = ex; 3005 ex1 = ex;
2974 ex1->ee_len = cpu_to_le16(iblock - ee_block); 3006 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
2975 ext4_ext_mark_uninitialized(ex1); 3007 ext4_ext_mark_uninitialized(ex1);
2976 ex2 = &newex; 3008 ex2 = &newex;
2977 } 3009 }
@@ -2980,18 +3012,18 @@ static int ext4_split_unwritten_extents(handle_t *handle,
2980 * we insert ex3, if ex1 is NULL. This is to avoid temporary 3012 * we insert ex3, if ex1 is NULL. This is to avoid temporary
2981 * overlap of blocks. 3013 * overlap of blocks.
2982 */ 3014 */
2983 if (!ex1 && allocated > max_blocks) 3015 if (!ex1 && allocated > map->m_len)
2984 ex2->ee_len = cpu_to_le16(max_blocks); 3016 ex2->ee_len = cpu_to_le16(map->m_len);
2985 /* ex3: to ee_block + ee_len : uninitialised */ 3017 /* ex3: to ee_block + ee_len : uninitialised */
2986 if (allocated > max_blocks) { 3018 if (allocated > map->m_len) {
2987 unsigned int newdepth; 3019 unsigned int newdepth;
2988 ex3 = &newex; 3020 ex3 = &newex;
2989 ex3->ee_block = cpu_to_le32(iblock + max_blocks); 3021 ex3->ee_block = cpu_to_le32(map->m_lblk + map->m_len);
2990 ext4_ext_store_pblock(ex3, newblock + max_blocks); 3022 ext4_ext_store_pblock(ex3, newblock + map->m_len);
2991 ex3->ee_len = cpu_to_le16(allocated - max_blocks); 3023 ex3->ee_len = cpu_to_le16(allocated - map->m_len);
2992 ext4_ext_mark_uninitialized(ex3); 3024 ext4_ext_mark_uninitialized(ex3);
2993 err = ext4_ext_insert_extent(handle, inode, path, ex3, flags); 3025 err = ext4_ext_insert_extent(handle, inode, path, ex3, flags);
2994 if (err == -ENOSPC) { 3026 if (err == -ENOSPC && may_zeroout) {
2995 err = ext4_ext_zeroout(inode, &orig_ex); 3027 err = ext4_ext_zeroout(inode, &orig_ex);
2996 if (err) 3028 if (err)
2997 goto fix_extent_len; 3029 goto fix_extent_len;
@@ -3001,7 +3033,7 @@ static int ext4_split_unwritten_extents(handle_t *handle,
3001 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); 3033 ext4_ext_store_pblock(ex, ext_pblock(&orig_ex));
3002 ext4_ext_dirty(handle, inode, path + depth); 3034 ext4_ext_dirty(handle, inode, path + depth);
3003 /* zeroed the full extent */ 3035 /* zeroed the full extent */
3004 /* blocks available from iblock */ 3036 /* blocks available from map->m_lblk */
3005 return allocated; 3037 return allocated;
3006 3038
3007 } else if (err) 3039 } else if (err)
@@ -3015,11 +3047,13 @@ static int ext4_split_unwritten_extents(handle_t *handle,
3015 * update the extent length after successful insert of the 3047 * update the extent length after successful insert of the
3016 * split extent 3048 * split extent
3017 */ 3049 */
3018 orig_ex.ee_len = cpu_to_le16(ee_len - 3050 ee_len -= ext4_ext_get_actual_len(ex3);
3019 ext4_ext_get_actual_len(ex3)); 3051 orig_ex.ee_len = cpu_to_le16(ee_len);
3052 may_zeroout = ee_block + ee_len <= eof_block;
3053
3020 depth = newdepth; 3054 depth = newdepth;
3021 ext4_ext_drop_refs(path); 3055 ext4_ext_drop_refs(path);
3022 path = ext4_ext_find_extent(inode, iblock, path); 3056 path = ext4_ext_find_extent(inode, map->m_lblk, path);
3023 if (IS_ERR(path)) { 3057 if (IS_ERR(path)) {
3024 err = PTR_ERR(path); 3058 err = PTR_ERR(path);
3025 goto out; 3059 goto out;
@@ -3033,7 +3067,7 @@ static int ext4_split_unwritten_extents(handle_t *handle,
3033 if (err) 3067 if (err)
3034 goto out; 3068 goto out;
3035 3069
3036 allocated = max_blocks; 3070 allocated = map->m_len;
3037 } 3071 }
3038 /* 3072 /*
3039 * If there was a change of depth as part of the 3073 * If there was a change of depth as part of the
@@ -3042,15 +3076,15 @@ static int ext4_split_unwritten_extents(handle_t *handle,
3042 */ 3076 */
3043 if (ex1 && ex1 != ex) { 3077 if (ex1 && ex1 != ex) {
3044 ex1 = ex; 3078 ex1 = ex;
3045 ex1->ee_len = cpu_to_le16(iblock - ee_block); 3079 ex1->ee_len = cpu_to_le16(map->m_lblk - ee_block);
3046 ext4_ext_mark_uninitialized(ex1); 3080 ext4_ext_mark_uninitialized(ex1);
3047 ex2 = &newex; 3081 ex2 = &newex;
3048 } 3082 }
3049 /* 3083 /*
3050 * ex2: iblock to iblock + maxblocks-1 : to be direct IO written, 3084 * ex2: map->m_lblk to map->m_lblk + map->m_len-1 : to be written
3051 * uninitialised still. 3085 * using direct I/O, uninitialised still.
3052 */ 3086 */
3053 ex2->ee_block = cpu_to_le32(iblock); 3087 ex2->ee_block = cpu_to_le32(map->m_lblk);
3054 ext4_ext_store_pblock(ex2, newblock); 3088 ext4_ext_store_pblock(ex2, newblock);
3055 ex2->ee_len = cpu_to_le16(allocated); 3089 ex2->ee_len = cpu_to_le16(allocated);
3056 ext4_ext_mark_uninitialized(ex2); 3090 ext4_ext_mark_uninitialized(ex2);
@@ -3062,7 +3096,7 @@ static int ext4_split_unwritten_extents(handle_t *handle,
3062 goto out; 3096 goto out;
3063insert: 3097insert:
3064 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); 3098 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
3065 if (err == -ENOSPC) { 3099 if (err == -ENOSPC && may_zeroout) {
3066 err = ext4_ext_zeroout(inode, &orig_ex); 3100 err = ext4_ext_zeroout(inode, &orig_ex);
3067 if (err) 3101 if (err)
3068 goto fix_extent_len; 3102 goto fix_extent_len;
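Both split paths now gate zeroout on the same test. As a reading aid, here is a minimal standalone sketch of that guard in plain C, using ordinary integer types as stand-ins for ext4_lblk_t and the inode fields (an assumption for illustration only):

#include <stdint.h>

/* Model of the may_zeroout test introduced above: an uninitialized
 * extent may be converted to initialized by zeroing it out only if it
 * lies entirely below EOF (rounded up to a block boundary), where the
 * region being written also counts as inside EOF. */
static int may_zeroout_extent(uint64_t i_size, unsigned int blkbits,
			      uint32_t m_lblk, uint32_t m_len,
			      uint32_t ee_block, uint16_t ee_len)
{
	uint64_t eof_block = (i_size + (1ULL << blkbits) - 1) >> blkbits;

	if (eof_block < (uint64_t)m_lblk + m_len)
		eof_block = (uint64_t)m_lblk + m_len;

	return (uint64_t)ee_block + ee_len <= eof_block;
}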
@@ -3152,10 +3186,9 @@ static void unmap_underlying_metadata_blocks(struct block_device *bdev,
3152 3186
3153static int 3187static int
3154ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, 3188ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3155 ext4_lblk_t iblock, unsigned int max_blocks, 3189 struct ext4_map_blocks *map,
3156 struct ext4_ext_path *path, int flags, 3190 struct ext4_ext_path *path, int flags,
3157 unsigned int allocated, struct buffer_head *bh_result, 3191 unsigned int allocated, ext4_fsblk_t newblock)
3158 ext4_fsblk_t newblock)
3159{ 3192{
3160 int ret = 0; 3193 int ret = 0;
3161 int err = 0; 3194 int err = 0;
@@ -3163,15 +3196,14 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3163 3196
3164 ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical" 3197 ext_debug("ext4_ext_handle_uninitialized_extents: inode %lu, logical"
3165 "block %llu, max_blocks %u, flags %d, allocated %u", 3198 "block %llu, max_blocks %u, flags %d, allocated %u",
3166 inode->i_ino, (unsigned long long)iblock, max_blocks, 3199 inode->i_ino, (unsigned long long)map->m_lblk, map->m_len,
3167 flags, allocated); 3200 flags, allocated);
3168 ext4_ext_show_leaf(inode, path); 3201 ext4_ext_show_leaf(inode, path);
3169 3202
3170 /* get_block() before submit the IO, split the extent */ 3203 /* get_block() before submit the IO, split the extent */
3171 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) { 3204 if ((flags & EXT4_GET_BLOCKS_PRE_IO)) {
3172 ret = ext4_split_unwritten_extents(handle, 3205 ret = ext4_split_unwritten_extents(handle, inode, map,
3173 inode, path, iblock, 3206 path, flags);
3174 max_blocks, flags);
3175 /* 3207 /*
3176 * Flag the inode (non-AIO case) or the end_io struct (AIO case) 3208 * Flag the inode (non-AIO case) or the end_io struct (AIO case)
3177 * that this IO needs conversion to written when the IO is 3209 * that this IO needs conversion to written when the IO is
@@ -3182,7 +3214,7 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3182 else 3214 else
3183 ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); 3215 ext4_set_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN);
3184 if (ext4_should_dioread_nolock(inode)) 3216 if (ext4_should_dioread_nolock(inode))
3185 set_buffer_uninit(bh_result); 3217 map->m_flags |= EXT4_MAP_UNINIT;
3186 goto out; 3218 goto out;
3187 } 3219 }
3188 /* IO end_io complete, convert the filled extent to written */ 3220 /* IO end_io complete, convert the filled extent to written */
@@ -3210,14 +3242,12 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode,
3210 * the buffer head will be unmapped so that 3242 * the buffer head will be unmapped so that
3211 * a read from the block returns 0s. 3243 * a read from the block returns 0s.
3212 */ 3244 */
3213 set_buffer_unwritten(bh_result); 3245 map->m_flags |= EXT4_MAP_UNWRITTEN;
3214 goto out1; 3246 goto out1;
3215 } 3247 }
3216 3248
3217 /* buffered write, writepage time, convert*/ 3249 /* buffered write, writepage time, convert*/
3218 ret = ext4_ext_convert_to_initialized(handle, inode, 3250 ret = ext4_ext_convert_to_initialized(handle, inode, map, path);
3219 path, iblock,
3220 max_blocks);
3221 if (ret >= 0) 3251 if (ret >= 0)
3222 ext4_update_inode_fsync_trans(handle, inode, 1); 3252 ext4_update_inode_fsync_trans(handle, inode, 1);
3223out: 3253out:
@@ -3226,7 +3256,7 @@ out:
3226 goto out2; 3256 goto out2;
3227 } else 3257 } else
3228 allocated = ret; 3258 allocated = ret;
3229 set_buffer_new(bh_result); 3259 map->m_flags |= EXT4_MAP_NEW;
3230 /* 3260 /*
3231 * if we allocated more blocks than requested 3261 * if we allocated more blocks than requested
3232 * we need to make sure we unmap the extra block 3262 * we need to make sure we unmap the extra block
@@ -3234,11 +3264,11 @@ out:
3234 * unmapped later when we find the buffer_head marked 3264 * unmapped later when we find the buffer_head marked
3235 * new. 3265 * new.
3236 */ 3266 */
3237 if (allocated > max_blocks) { 3267 if (allocated > map->m_len) {
3238 unmap_underlying_metadata_blocks(inode->i_sb->s_bdev, 3268 unmap_underlying_metadata_blocks(inode->i_sb->s_bdev,
3239 newblock + max_blocks, 3269 newblock + map->m_len,
3240 allocated - max_blocks); 3270 allocated - map->m_len);
3241 allocated = max_blocks; 3271 allocated = map->m_len;
3242 } 3272 }
3243 3273
3244 /* 3274 /*
@@ -3252,13 +3282,13 @@ out:
3252 ext4_da_update_reserve_space(inode, allocated, 0); 3282 ext4_da_update_reserve_space(inode, allocated, 0);
3253 3283
3254map_out: 3284map_out:
3255 set_buffer_mapped(bh_result); 3285 map->m_flags |= EXT4_MAP_MAPPED;
3256out1: 3286out1:
3257 if (allocated > max_blocks) 3287 if (allocated > map->m_len)
3258 allocated = max_blocks; 3288 allocated = map->m_len;
3259 ext4_ext_show_leaf(inode, path); 3289 ext4_ext_show_leaf(inode, path);
3260 bh_result->b_bdev = inode->i_sb->s_bdev; 3290 map->m_pblk = newblock;
3261 bh_result->b_blocknr = newblock; 3291 map->m_len = allocated;
3262out2: 3292out2:
3263 if (path) { 3293 if (path) {
3264 ext4_ext_drop_refs(path); 3294 ext4_ext_drop_refs(path);
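The hunks above reduce ext4_ext_handle_uninitialized_extents() to four strategies keyed off the get_blocks flags. A hand-written model of that dispatch, with placeholder flag values standing in for the EXT4_GET_BLOCKS_* constants (the real values live in ext4.h):

/* Placeholder flag bits, not the real EXT4_GET_BLOCKS_* values. */
#define GB_PRE_IO  0x1	/* stands in for EXT4_GET_BLOCKS_PRE_IO */
#define GB_CONVERT 0x2	/* stands in for EXT4_GET_BLOCKS_CONVERT */
#define GB_CREATE  0x4	/* stands in for EXT4_GET_BLOCKS_CREATE */

enum uninit_action {
	SPLIT_BEFORE_IO,	/* DIO get_block(): split, flag DIO_UNWRITTEN */
	CONVERT_AFTER_IO,	/* DIO end_io: convert the filled extent to written */
	REPORT_UNWRITTEN,	/* read without create: report unwritten, map nothing */
	CONVERT_BUFFERED	/* buffered writepage: convert to initialized now */
};

static enum uninit_action classify(int flags)
{
	if (flags & GB_PRE_IO)
		return SPLIT_BEFORE_IO;
	if (flags & GB_CONVERT)
		return CONVERT_AFTER_IO;
	if (!(flags & GB_CREATE))
		return REPORT_UNWRITTEN;
	return CONVERT_BUFFERED;
}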
@@ -3284,26 +3314,23 @@ out2:
3284 * 3314 *
3285 * return < 0, error case. 3315 * return < 0, error case.
3286 */ 3316 */
3287int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, 3317int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
3288 ext4_lblk_t iblock, 3318 struct ext4_map_blocks *map, int flags)
3289 unsigned int max_blocks, struct buffer_head *bh_result,
3290 int flags)
3291{ 3319{
3292 struct ext4_ext_path *path = NULL; 3320 struct ext4_ext_path *path = NULL;
3293 struct ext4_extent_header *eh; 3321 struct ext4_extent_header *eh;
3294 struct ext4_extent newex, *ex, *last_ex; 3322 struct ext4_extent newex, *ex, *last_ex;
3295 ext4_fsblk_t newblock; 3323 ext4_fsblk_t newblock;
3296 int err = 0, depth, ret, cache_type; 3324 int i, err = 0, depth, ret, cache_type;
3297 unsigned int allocated = 0; 3325 unsigned int allocated = 0;
3298 struct ext4_allocation_request ar; 3326 struct ext4_allocation_request ar;
3299 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio; 3327 ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
3300 3328
3301 __clear_bit(BH_New, &bh_result->b_state);
3302 ext_debug("blocks %u/%u requested for inode %lu\n", 3329 ext_debug("blocks %u/%u requested for inode %lu\n",
3303 iblock, max_blocks, inode->i_ino); 3330 map->m_lblk, map->m_len, inode->i_ino);
3304 3331
3305 /* check in cache */ 3332 /* check in cache */
3306 cache_type = ext4_ext_in_cache(inode, iblock, &newex); 3333 cache_type = ext4_ext_in_cache(inode, map->m_lblk, &newex);
3307 if (cache_type) { 3334 if (cache_type) {
3308 if (cache_type == EXT4_EXT_CACHE_GAP) { 3335 if (cache_type == EXT4_EXT_CACHE_GAP) {
3309 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { 3336 if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
@@ -3316,12 +3343,12 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3316 /* we should allocate requested block */ 3343 /* we should allocate requested block */
3317 } else if (cache_type == EXT4_EXT_CACHE_EXTENT) { 3344 } else if (cache_type == EXT4_EXT_CACHE_EXTENT) {
3318 /* block is already allocated */ 3345 /* block is already allocated */
3319 newblock = iblock 3346 newblock = map->m_lblk
3320 - le32_to_cpu(newex.ee_block) 3347 - le32_to_cpu(newex.ee_block)
3321 + ext_pblock(&newex); 3348 + ext_pblock(&newex);
3322 /* number of remaining blocks in the extent */ 3349 /* number of remaining blocks in the extent */
3323 allocated = ext4_ext_get_actual_len(&newex) - 3350 allocated = ext4_ext_get_actual_len(&newex) -
3324 (iblock - le32_to_cpu(newex.ee_block)); 3351 (map->m_lblk - le32_to_cpu(newex.ee_block));
3325 goto out; 3352 goto out;
3326 } else { 3353 } else {
3327 BUG(); 3354 BUG();
@@ -3329,7 +3356,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3329 } 3356 }
3330 3357
3331 /* find extent for this block */ 3358 /* find extent for this block */
3332 path = ext4_ext_find_extent(inode, iblock, NULL); 3359 path = ext4_ext_find_extent(inode, map->m_lblk, NULL);
3333 if (IS_ERR(path)) { 3360 if (IS_ERR(path)) {
3334 err = PTR_ERR(path); 3361 err = PTR_ERR(path);
3335 path = NULL; 3362 path = NULL;
@@ -3345,8 +3372,9 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3345 */ 3372 */
3346 if (unlikely(path[depth].p_ext == NULL && depth != 0)) { 3373 if (unlikely(path[depth].p_ext == NULL && depth != 0)) {
3347 EXT4_ERROR_INODE(inode, "bad extent address " 3374 EXT4_ERROR_INODE(inode, "bad extent address "
3348 "iblock: %d, depth: %d pblock %lld", 3375 "lblock: %lu, depth: %d pblock %lld",
3349 iblock, depth, path[depth].p_block); 3376 (unsigned long) map->m_lblk, depth,
3377 path[depth].p_block);
3350 err = -EIO; 3378 err = -EIO;
3351 goto out2; 3379 goto out2;
3352 } 3380 }
@@ -3364,12 +3392,12 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3364 */ 3392 */
3365 ee_len = ext4_ext_get_actual_len(ex); 3393 ee_len = ext4_ext_get_actual_len(ex);
3366 /* if found extent covers block, simply return it */ 3394 /* if found extent covers block, simply return it */
3367 if (in_range(iblock, ee_block, ee_len)) { 3395 if (in_range(map->m_lblk, ee_block, ee_len)) {
3368 newblock = iblock - ee_block + ee_start; 3396 newblock = map->m_lblk - ee_block + ee_start;
3369 /* number of remaining blocks in the extent */ 3397 /* number of remaining blocks in the extent */
3370 allocated = ee_len - (iblock - ee_block); 3398 allocated = ee_len - (map->m_lblk - ee_block);
3371 ext_debug("%u fit into %u:%d -> %llu\n", iblock, 3399 ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
3372 ee_block, ee_len, newblock); 3400 ee_block, ee_len, newblock);
3373 3401
3374 /* Do not put uninitialized extent in the cache */ 3402 /* Do not put uninitialized extent in the cache */
3375 if (!ext4_ext_is_uninitialized(ex)) { 3403 if (!ext4_ext_is_uninitialized(ex)) {
@@ -3379,8 +3407,8 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3379 goto out; 3407 goto out;
3380 } 3408 }
3381 ret = ext4_ext_handle_uninitialized_extents(handle, 3409 ret = ext4_ext_handle_uninitialized_extents(handle,
3382 inode, iblock, max_blocks, path, 3410 inode, map, path, flags, allocated,
3383 flags, allocated, bh_result, newblock); 3411 newblock);
3384 return ret; 3412 return ret;
3385 } 3413 }
3386 } 3414 }
@@ -3394,7 +3422,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3394 * put just found gap into cache to speed up 3422 * put just found gap into cache to speed up
3395 * subsequent requests 3423 * subsequent requests
3396 */ 3424 */
3397 ext4_ext_put_gap_in_cache(inode, path, iblock); 3425 ext4_ext_put_gap_in_cache(inode, path, map->m_lblk);
3398 goto out2; 3426 goto out2;
3399 } 3427 }
3400 /* 3428 /*
@@ -3402,11 +3430,11 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3402 */ 3430 */
3403 3431
3404 /* find neighbour allocated blocks */ 3432 /* find neighbour allocated blocks */
3405 ar.lleft = iblock; 3433 ar.lleft = map->m_lblk;
3406 err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft); 3434 err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft);
3407 if (err) 3435 if (err)
3408 goto out2; 3436 goto out2;
3409 ar.lright = iblock; 3437 ar.lright = map->m_lblk;
3410 err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright); 3438 err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright);
3411 if (err) 3439 if (err)
3412 goto out2; 3440 goto out2;
@@ -3417,26 +3445,26 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3417 * EXT_INIT_MAX_LEN and for an uninitialized extent this limit is 3445 * EXT_INIT_MAX_LEN and for an uninitialized extent this limit is
3418 * EXT_UNINIT_MAX_LEN. 3446 * EXT_UNINIT_MAX_LEN.
3419 */ 3447 */
3420 if (max_blocks > EXT_INIT_MAX_LEN && 3448 if (map->m_len > EXT_INIT_MAX_LEN &&
3421 !(flags & EXT4_GET_BLOCKS_UNINIT_EXT)) 3449 !(flags & EXT4_GET_BLOCKS_UNINIT_EXT))
3422 max_blocks = EXT_INIT_MAX_LEN; 3450 map->m_len = EXT_INIT_MAX_LEN;
3423 else if (max_blocks > EXT_UNINIT_MAX_LEN && 3451 else if (map->m_len > EXT_UNINIT_MAX_LEN &&
3424 (flags & EXT4_GET_BLOCKS_UNINIT_EXT)) 3452 (flags & EXT4_GET_BLOCKS_UNINIT_EXT))
3425 max_blocks = EXT_UNINIT_MAX_LEN; 3453 map->m_len = EXT_UNINIT_MAX_LEN;
3426 3454
3427 /* Check if we can really insert (iblock)::(iblock+max_blocks) extent */ 3455 /* Check if we can really insert (m_lblk)::(m_lblk + m_len) extent */
3428 newex.ee_block = cpu_to_le32(iblock); 3456 newex.ee_block = cpu_to_le32(map->m_lblk);
3429 newex.ee_len = cpu_to_le16(max_blocks); 3457 newex.ee_len = cpu_to_le16(map->m_len);
3430 err = ext4_ext_check_overlap(inode, &newex, path); 3458 err = ext4_ext_check_overlap(inode, &newex, path);
3431 if (err) 3459 if (err)
3432 allocated = ext4_ext_get_actual_len(&newex); 3460 allocated = ext4_ext_get_actual_len(&newex);
3433 else 3461 else
3434 allocated = max_blocks; 3462 allocated = map->m_len;
3435 3463
3436 /* allocate new block */ 3464 /* allocate new block */
3437 ar.inode = inode; 3465 ar.inode = inode;
3438 ar.goal = ext4_ext_find_goal(inode, path, iblock); 3466 ar.goal = ext4_ext_find_goal(inode, path, map->m_lblk);
3439 ar.logical = iblock; 3467 ar.logical = map->m_lblk;
3440 ar.len = allocated; 3468 ar.len = allocated;
3441 if (S_ISREG(inode->i_mode)) 3469 if (S_ISREG(inode->i_mode))
3442 ar.flags = EXT4_MB_HINT_DATA; 3470 ar.flags = EXT4_MB_HINT_DATA;
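Before calling the multiblock allocator, the function assembles an allocation request from the neighbour search above. A sketch of the fields being filled in, as a simplified local stand-in for struct ext4_allocation_request (this definition is illustrative, not the kernel's):

#include <stdint.h>

struct alloc_request {
	uint32_t logical;	/* ar.logical: first logical block (map->m_lblk) */
	uint32_t len;		/* ar.len: blocks wanted, clamped to EXT_*_MAX_LEN */
	uint64_t goal;		/* ar.goal: preferred physical block */
	uint32_t lleft, lright;	/* ar.lleft/ar.lright: neighbouring logical blocks */
	uint64_t pleft, pright;	/* ar.pleft/ar.pright: their physical blocks */
	unsigned int flags;	/* ar.flags: e.g. EXT4_MB_HINT_DATA for regular files */
};

The clamp to EXT_INIT_MAX_LEN / EXT_UNINIT_MAX_LEN seen above exists because an extent's on-disk length is a 16-bit field whose top bit marks the extent uninitialized, so a single extent cannot describe more blocks than those limits.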
@@ -3470,21 +3498,33 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3470 EXT4_STATE_DIO_UNWRITTEN); 3498 EXT4_STATE_DIO_UNWRITTEN);
3471 } 3499 }
3472 if (ext4_should_dioread_nolock(inode)) 3500 if (ext4_should_dioread_nolock(inode))
3473 set_buffer_uninit(bh_result); 3501 map->m_flags |= EXT4_MAP_UNINIT;
3474 } 3502 }
3475 3503
3476 if (unlikely(EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL)) { 3504 if (unlikely(ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))) {
3477 if (unlikely(!eh->eh_entries)) { 3505 if (unlikely(!eh->eh_entries)) {
3478 EXT4_ERROR_INODE(inode, 3506 EXT4_ERROR_INODE(inode,
3479 "eh->eh_entries == 0 ee_block %d", 3507 "eh->eh_entries == 0 and "
3480 ex->ee_block); 3508 "EOFBLOCKS_FL set");
3481 err = -EIO; 3509 err = -EIO;
3482 goto out2; 3510 goto out2;
3483 } 3511 }
3484 last_ex = EXT_LAST_EXTENT(eh); 3512 last_ex = EXT_LAST_EXTENT(eh);
3485 if (iblock + ar.len > le32_to_cpu(last_ex->ee_block) 3513 /*
3486 + ext4_ext_get_actual_len(last_ex)) 3514 * If the current leaf block was reached by looking at
3487 EXT4_I(inode)->i_flags &= ~EXT4_EOFBLOCKS_FL; 3515 * the last index block all the way down the tree, and
3516 * we are extending the inode beyond the last extent
3517 * in the current leaf block, then clear the
3518 * EOFBLOCKS_FL flag.
3519 */
3520 for (i = depth-1; i >= 0; i--) {
3521 if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
3522 break;
3523 }
3524 if ((i < 0) &&
3525 (map->m_lblk + ar.len > le32_to_cpu(last_ex->ee_block) +
3526 ext4_ext_get_actual_len(last_ex)))
3527 ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
3488 } 3528 }
3489 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); 3529 err = ext4_ext_insert_extent(handle, inode, path, &newex, flags);
3490 if (err) { 3530 if (err) {
@@ -3500,9 +3540,9 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3500 /* previous routine could use block we allocated */ 3540 /* previous routine could use block we allocated */
3501 newblock = ext_pblock(&newex); 3541 newblock = ext_pblock(&newex);
3502 allocated = ext4_ext_get_actual_len(&newex); 3542 allocated = ext4_ext_get_actual_len(&newex);
3503 if (allocated > max_blocks) 3543 if (allocated > map->m_len)
3504 allocated = max_blocks; 3544 allocated = map->m_len;
3505 set_buffer_new(bh_result); 3545 map->m_flags |= EXT4_MAP_NEW;
3506 3546
3507 /* 3547 /*
3508 * Update reserved blocks/metadata blocks after successful 3548 * Update reserved blocks/metadata blocks after successful
@@ -3516,18 +3556,18 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
3516 * when it is _not_ an uninitialized extent. 3556 * when it is _not_ an uninitialized extent.
3517 */ 3557 */
3518 if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) { 3558 if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) {
3519 ext4_ext_put_in_cache(inode, iblock, allocated, newblock, 3559 ext4_ext_put_in_cache(inode, map->m_lblk, allocated, newblock,
3520 EXT4_EXT_CACHE_EXTENT); 3560 EXT4_EXT_CACHE_EXTENT);
3521 ext4_update_inode_fsync_trans(handle, inode, 1); 3561 ext4_update_inode_fsync_trans(handle, inode, 1);
3522 } else 3562 } else
3523 ext4_update_inode_fsync_trans(handle, inode, 0); 3563 ext4_update_inode_fsync_trans(handle, inode, 0);
3524out: 3564out:
3525 if (allocated > max_blocks) 3565 if (allocated > map->m_len)
3526 allocated = max_blocks; 3566 allocated = map->m_len;
3527 ext4_ext_show_leaf(inode, path); 3567 ext4_ext_show_leaf(inode, path);
3528 set_buffer_mapped(bh_result); 3568 map->m_flags |= EXT4_MAP_MAPPED;
3529 bh_result->b_bdev = inode->i_sb->s_bdev; 3569 map->m_pblk = newblock;
3530 bh_result->b_blocknr = newblock; 3570 map->m_len = allocated;
3531out2: 3571out2:
3532 if (path) { 3572 if (path) {
3533 ext4_ext_drop_refs(path); 3573 ext4_ext_drop_refs(path);
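From here on, results flow back through the mapping descriptor instead of a buffer_head. A sketch of that descriptor as this patch uses it (an illustrative definition; the real struct ext4_map_blocks and EXT4_MAP_* flags are declared in ext4.h):

#include <stdint.h>

struct map_blocks {
	uint64_t m_pblk;	/* out: first physical block */
	uint32_t m_lblk;	/* in:  first logical block requested */
	uint32_t m_len;		/* in:  blocks wanted; out: blocks mapped */
	uint32_t m_flags;	/* out: EXT4_MAP_{NEW,MAPPED,UNWRITTEN,UNINIT,BOUNDARY} */
};

#define MAP_MAPPED 0x1		/* placeholder for EXT4_MAP_MAPPED */

/* The shape of both 'out' paths above once a mapping exists. */
static void publish_mapping(struct map_blocks *map,
			    uint64_t newblock, uint32_t allocated)
{
	if (allocated > map->m_len)	/* never report more than asked for */
		allocated = map->m_len;
	map->m_flags |= MAP_MAPPED;
	map->m_pblk = newblock;
	map->m_len = allocated;
}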
@@ -3625,7 +3665,7 @@ static void ext4_falloc_update_inode(struct inode *inode,
3625 * can proceed even if the new size is the same as i_size. 3665 * can proceed even if the new size is the same as i_size.
3626 */ 3666 */
3627 if (new_size > i_size_read(inode)) 3667 if (new_size > i_size_read(inode))
3628 EXT4_I(inode)->i_flags |= EXT4_EOFBLOCKS_FL; 3668 ext4_set_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
3629 } 3669 }
3630 3670
3631} 3671}
@@ -3640,55 +3680,57 @@ static void ext4_falloc_update_inode(struct inode *inode,
3640long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len) 3680long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len)
3641{ 3681{
3642 handle_t *handle; 3682 handle_t *handle;
3643 ext4_lblk_t block;
3644 loff_t new_size; 3683 loff_t new_size;
3645 unsigned int max_blocks; 3684 unsigned int max_blocks;
3646 int ret = 0; 3685 int ret = 0;
3647 int ret2 = 0; 3686 int ret2 = 0;
3648 int retries = 0; 3687 int retries = 0;
3649 struct buffer_head map_bh; 3688 struct ext4_map_blocks map;
3650 unsigned int credits, blkbits = inode->i_blkbits; 3689 unsigned int credits, blkbits = inode->i_blkbits;
3651 3690
3652 /* 3691 /*
3653 * currently supporting (pre)allocate mode for extent-based 3692 * currently supporting (pre)allocate mode for extent-based
3654 * files _only_ 3693 * files _only_
3655 */ 3694 */
3656 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) 3695 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
3657 return -EOPNOTSUPP; 3696 return -EOPNOTSUPP;
3658 3697
3659 /* preallocation to directories is currently not supported */ 3698 /* preallocation to directories is currently not supported */
3660 if (S_ISDIR(inode->i_mode)) 3699 if (S_ISDIR(inode->i_mode))
3661 return -ENODEV; 3700 return -ENODEV;
3662 3701
3663 block = offset >> blkbits; 3702 map.m_lblk = offset >> blkbits;
3664 /* 3703 /*
3665 * We can't just convert len to max_blocks because 3704 * We can't just convert len to max_blocks because
3666 * If blocksize = 4096 offset = 3072 and len = 2048 3705 * If blocksize = 4096 offset = 3072 and len = 2048
3667 */ 3706 */
3668 max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) 3707 max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits)
3669 - block; 3708 - map.m_lblk;
3670 /* 3709 /*
3671 * credits to insert 1 extent into extent tree 3710 * credits to insert 1 extent into extent tree
3672 */ 3711 */
3673 credits = ext4_chunk_trans_blocks(inode, max_blocks); 3712 credits = ext4_chunk_trans_blocks(inode, max_blocks);
3674 mutex_lock(&inode->i_mutex); 3713 mutex_lock(&inode->i_mutex);
3714 ret = inode_newsize_ok(inode, (len + offset));
3715 if (ret) {
3716 mutex_unlock(&inode->i_mutex);
3717 return ret;
3718 }
3675retry: 3719retry:
3676 while (ret >= 0 && ret < max_blocks) { 3720 while (ret >= 0 && ret < max_blocks) {
3677 block = block + ret; 3721 map.m_lblk = map.m_lblk + ret;
3678 max_blocks = max_blocks - ret; 3722 map.m_len = max_blocks = max_blocks - ret;
3679 handle = ext4_journal_start(inode, credits); 3723 handle = ext4_journal_start(inode, credits);
3680 if (IS_ERR(handle)) { 3724 if (IS_ERR(handle)) {
3681 ret = PTR_ERR(handle); 3725 ret = PTR_ERR(handle);
3682 break; 3726 break;
3683 } 3727 }
3684 map_bh.b_state = 0; 3728 ret = ext4_map_blocks(handle, inode, &map,
3685 ret = ext4_get_blocks(handle, inode, block,
3686 max_blocks, &map_bh,
3687 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT); 3729 EXT4_GET_BLOCKS_CREATE_UNINIT_EXT);
3688 if (ret <= 0) { 3730 if (ret <= 0) {
3689#ifdef EXT4FS_DEBUG 3731#ifdef EXT4FS_DEBUG
3690 WARN_ON(ret <= 0); 3732 WARN_ON(ret <= 0);
3691 printk(KERN_ERR "%s: ext4_ext_get_blocks " 3733 printk(KERN_ERR "%s: ext4_ext_map_blocks "
3692 "returned error inode#%lu, block=%u, " 3734 "returned error inode#%lu, block=%u, "
3693 "max_blocks=%u", __func__, 3735 "max_blocks=%u", __func__,
3694 inode->i_ino, block, max_blocks); 3736 inode->i_ino, map.m_lblk, max_blocks);
@@ -3697,14 +3739,14 @@ retry:
3697 ret2 = ext4_journal_stop(handle); 3739 ret2 = ext4_journal_stop(handle);
3698 break; 3740 break;
3699 } 3741 }
3700 if ((block + ret) >= (EXT4_BLOCK_ALIGN(offset + len, 3742 if ((map.m_lblk + ret) >= (EXT4_BLOCK_ALIGN(offset + len,
3701 blkbits) >> blkbits)) 3743 blkbits) >> blkbits))
3702 new_size = offset + len; 3744 new_size = offset + len;
3703 else 3745 else
3704 new_size = (block + ret) << blkbits; 3746 new_size = (map.m_lblk + ret) << blkbits;
3705 3747
3706 ext4_falloc_update_inode(inode, mode, new_size, 3748 ext4_falloc_update_inode(inode, mode, new_size,
3707 buffer_new(&map_bh)); 3749 (map.m_flags & EXT4_MAP_NEW));
3708 ext4_mark_inode_dirty(handle, inode); 3750 ext4_mark_inode_dirty(handle, inode);
3709 ret2 = ext4_journal_stop(handle); 3751 ret2 = ext4_journal_stop(handle);
3710 if (ret2) 3752 if (ret2)
@@ -3733,42 +3775,39 @@ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset,
3733 ssize_t len) 3775 ssize_t len)
3734{ 3776{
3735 handle_t *handle; 3777 handle_t *handle;
3736 ext4_lblk_t block;
3737 unsigned int max_blocks; 3778 unsigned int max_blocks;
3738 int ret = 0; 3779 int ret = 0;
3739 int ret2 = 0; 3780 int ret2 = 0;
3740 struct buffer_head map_bh; 3781 struct ext4_map_blocks map;
3741 unsigned int credits, blkbits = inode->i_blkbits; 3782 unsigned int credits, blkbits = inode->i_blkbits;
3742 3783
3743 block = offset >> blkbits; 3784 map.m_lblk = offset >> blkbits;
3744 /* 3785 /*
3745 * We can't just convert len to max_blocks because 3786 * We can't just convert len to max_blocks because
3746 * If blocksize = 4096 offset = 3072 and len = 2048 3787 * If blocksize = 4096 offset = 3072 and len = 2048
3747 */ 3788 */
3748 max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) 3789 max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) -
3749 - block; 3790 map.m_lblk);
3750 /* 3791 /*
3751 * credits to insert 1 extent into extent tree 3792 * credits to insert 1 extent into extent tree
3752 */ 3793 */
3753 credits = ext4_chunk_trans_blocks(inode, max_blocks); 3794 credits = ext4_chunk_trans_blocks(inode, max_blocks);
3754 while (ret >= 0 && ret < max_blocks) { 3795 while (ret >= 0 && ret < max_blocks) {
3755 block = block + ret; 3796 map.m_lblk += ret;
3756 max_blocks = max_blocks - ret; 3797 map.m_len = (max_blocks -= ret);
3757 handle = ext4_journal_start(inode, credits); 3798 handle = ext4_journal_start(inode, credits);
3758 if (IS_ERR(handle)) { 3799 if (IS_ERR(handle)) {
3759 ret = PTR_ERR(handle); 3800 ret = PTR_ERR(handle);
3760 break; 3801 break;
3761 } 3802 }
3762 map_bh.b_state = 0; 3803 ret = ext4_map_blocks(handle, inode, &map,
3763 ret = ext4_get_blocks(handle, inode, block,
3764 max_blocks, &map_bh,
3765 EXT4_GET_BLOCKS_IO_CONVERT_EXT); 3804 EXT4_GET_BLOCKS_IO_CONVERT_EXT);
3766 if (ret <= 0) { 3805 if (ret <= 0) {
3767 WARN_ON(ret <= 0); 3806 WARN_ON(ret <= 0);
3768 printk(KERN_ERR "%s: ext4_ext_get_blocks " 3807 printk(KERN_ERR "%s: ext4_ext_map_blocks "
3769 "returned error inode#%lu, block=%u, " 3808 "returned error inode#%lu, block=%u, "
3770 "max_blocks=%u", __func__, 3809 "max_blocks=%u", __func__,
3771 inode->i_ino, block, max_blocks); 3810 inode->i_ino, map.m_lblk, map.m_len);
3772 } 3811 }
3773 ext4_mark_inode_dirty(handle, inode); 3812 ext4_mark_inode_dirty(handle, inode);
3774 ret2 = ext4_journal_stop(handle); 3813 ret2 = ext4_journal_stop(handle);
@@ -3898,7 +3937,7 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
3898 int error = 0; 3937 int error = 0;
3899 3938
3900 /* fallback to generic here if not in extents fmt */ 3939 /* fallback to generic here if not in extents fmt */
3901 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) 3940 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
3902 return generic_block_fiemap(inode, fieinfo, start, len, 3941 return generic_block_fiemap(inode, fieinfo, start, len,
3903 ext4_get_block); 3942 ext4_get_block);
3904 3943
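The fallocate and convert-unwritten hunks above share one calling convention: the caller owns a struct ext4_map_blocks, advances m_lblk by however many blocks the previous call handled, and shrinks the remaining count. A user-space sketch of that loop, with a stub in place of ext4_map_blocks() (everything here is a model, not kernel code):

#include <stdint.h>

struct map_blocks { uint64_t m_pblk; uint32_t m_lblk, m_len, m_flags; };

/* Stub: pretend every request is satisfied in full. The real call
 * returns blocks mapped (> 0), 0 for a hole, or a negative errno. */
static int map_blocks_stub(struct map_blocks *map) { return (int)map->m_len; }

static int allocate_range(uint64_t offset, uint64_t len, unsigned int blkbits)
{
	struct map_blocks map;
	uint32_t max_blocks;
	int ret = 0;

	map.m_lblk = offset >> blkbits;
	/* Round the end up to a block boundary: with 4096-byte blocks,
	 * offset = 3072 and len = 2048 touch bytes 3072..5119, i.e.
	 * blocks 0 and 1, so max_blocks must be 2 even though
	 * len >> blkbits == 0 -- the case the comment above warns about. */
	max_blocks = ((offset + len + (1ULL << blkbits) - 1) >> blkbits)
			- map.m_lblk;

	while (ret >= 0 && (uint32_t)ret < max_blocks) {
		map.m_lblk += ret;		/* skip what is done already */
		map.m_len = max_blocks -= ret;	/* request what remains */
		ret = map_blocks_stub(&map);
		if (ret <= 0)
			break;			/* error or unexpected hole */
	}
	return ret < 0 ? ret : 0;
}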
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index d0776e410f34..5313ae4cda2d 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -66,7 +66,7 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov,
66 * is smaller than s_maxbytes, which is for extent-mapped files. 66 * is smaller than s_maxbytes, which is for extent-mapped files.
67 */ 67 */
68 68
69 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) { 69 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
70 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 70 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
71 size_t length = iov_length(iov, nr_segs); 71 size_t length = iov_length(iov, nr_segs);
72 72
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index ef3d980e67cb..592adf2e546e 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -35,6 +35,29 @@
35#include <trace/events/ext4.h> 35#include <trace/events/ext4.h>
36 36
37/* 37/*
38 * If we're not journaling and this is a just-created file, we have to
39 * sync our parent directory (if it was freshly created) since
40 * otherwise it will only be written by writeback, leaving a huge
41 * window during which a crash may lose the file. This may apply for
42 * the parent directory's parent as well, and so on recursively, if
43 * they are also freshly created.
44 */
45static void ext4_sync_parent(struct inode *inode)
46{
47 struct dentry *dentry = NULL;
48
49 while (inode && ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) {
50 ext4_clear_inode_state(inode, EXT4_STATE_NEWENTRY);
51 dentry = list_entry(inode->i_dentry.next,
52 struct dentry, d_alias);
53 if (!dentry || !dentry->d_parent || !dentry->d_parent->d_inode)
54 break;
55 inode = dentry->d_parent->d_inode;
56 sync_mapping_buffers(inode->i_mapping);
57 }
58}
59
60/*
38 * akpm: A new design for ext4_sync_file(). 61 * akpm: A new design for ext4_sync_file().
39 * 62 *
40 * This is only called from sys_fsync(), sys_fdatasync() and sys_msync(). 63 * This is only called from sys_fsync(), sys_fdatasync() and sys_msync().
@@ -48,9 +71,9 @@
48 * i_mutex lock is held when entering and exiting this function 71 * i_mutex lock is held when entering and exiting this function
49 */ 72 */
50 73
51int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) 74int ext4_sync_file(struct file *file, int datasync)
52{ 75{
53 struct inode *inode = dentry->d_inode; 76 struct inode *inode = file->f_mapping->host;
54 struct ext4_inode_info *ei = EXT4_I(inode); 77 struct ext4_inode_info *ei = EXT4_I(inode);
55 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; 78 journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
56 int ret; 79 int ret;
@@ -58,7 +81,7 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
58 81
59 J_ASSERT(ext4_journal_current_handle() == NULL); 82 J_ASSERT(ext4_journal_current_handle() == NULL);
60 83
61 trace_ext4_sync_file(file, dentry, datasync); 84 trace_ext4_sync_file(file, datasync);
62 85
63 if (inode->i_sb->s_flags & MS_RDONLY) 86 if (inode->i_sb->s_flags & MS_RDONLY)
64 return 0; 87 return 0;
@@ -66,9 +89,13 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
66 ret = flush_completed_IO(inode); 89 ret = flush_completed_IO(inode);
67 if (ret < 0) 90 if (ret < 0)
68 return ret; 91 return ret;
69 92
70 if (!journal) 93 if (!journal) {
71 return simple_fsync(file, dentry, datasync); 94 ret = generic_file_fsync(file, datasync);
95 if (!ret && !list_empty(&inode->i_dentry))
96 ext4_sync_parent(inode);
97 return ret;
98 }
72 99
73 /* 100 /*
74 * data=writeback,ordered: 101 * data=writeback,ordered:
@@ -102,7 +129,7 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync)
102 (journal->j_flags & JBD2_BARRIER)) 129 (journal->j_flags & JBD2_BARRIER))
103 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, 130 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL,
104 NULL, BLKDEV_IFL_WAIT); 131 NULL, BLKDEV_IFL_WAIT);
105 jbd2_log_wait_commit(journal, commit_tid); 132 ret = jbd2_log_wait_commit(journal, commit_tid);
106 } else if (journal->j_flags & JBD2_BARRIER) 133 } else if (journal->j_flags & JBD2_BARRIER)
107 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL, 134 blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL,
108 BLKDEV_IFL_WAIT); 135 BLKDEV_IFL_WAIT);
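The fsync changes split cleanly into a no-journal path and a journal path. A condensed model of the resulting control flow, with trivial stubs standing in for the kernel primitives named above (a reading aid under those assumptions, not the real ext4_sync_file()):

/* No-op stubs for flush_completed_IO(), generic_file_fsync(),
 * ext4_sync_parent() and jbd2_log_wait_commit(). */
static int  flush_completed_io(void)      { return 0; }
static int  generic_fsync(void)           { return 0; }
static void sync_parent_dirs(void)        { }
static int  journal_commit_and_wait(void) { return 0; }

static int fsync_model(int have_journal)
{
	int ret = flush_completed_io();
	if (ret < 0)
		return ret;

	if (!have_journal) {
		/* No journal: generic fsync, then sync freshly created
		 * parent directories so a crash cannot lose the new
		 * file's directory entry (ext4_sync_parent above). */
		ret = generic_fsync();
		if (!ret)
			sync_parent_dirs();
		return ret;
	}
	/* Journaled: wait for the commit covering this inode. Note the
	 * hunk above now propagates jbd2_log_wait_commit()'s return
	 * value instead of discarding it. */
	return journal_commit_and_wait();
}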
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 1a0e183a2f04..25c4b3173fd9 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -240,56 +240,49 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
240 if (fatal) 240 if (fatal)
241 goto error_return; 241 goto error_return;
242 242
243 /* Ok, now we can actually update the inode bitmaps.. */ 243 fatal = -ESRCH;
244 cleared = ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group), 244 gdp = ext4_get_group_desc(sb, block_group, &bh2);
245 bit, bitmap_bh->b_data); 245 if (gdp) {
246 if (!cleared)
247 ext4_error(sb, "bit already cleared for inode %lu", ino);
248 else {
249 gdp = ext4_get_group_desc(sb, block_group, &bh2);
250
251 BUFFER_TRACE(bh2, "get_write_access"); 246 BUFFER_TRACE(bh2, "get_write_access");
252 fatal = ext4_journal_get_write_access(handle, bh2); 247 fatal = ext4_journal_get_write_access(handle, bh2);
253 if (fatal) goto error_return; 248 }
254 249 ext4_lock_group(sb, block_group);
255 if (gdp) { 250 cleared = ext4_clear_bit(bit, bitmap_bh->b_data);
256 ext4_lock_group(sb, block_group); 251 if (fatal || !cleared) {
257 count = ext4_free_inodes_count(sb, gdp) + 1; 252 ext4_unlock_group(sb, block_group);
258 ext4_free_inodes_set(sb, gdp, count); 253 goto out;
259 if (is_directory) { 254 }
260 count = ext4_used_dirs_count(sb, gdp) - 1;
261 ext4_used_dirs_set(sb, gdp, count);
262 if (sbi->s_log_groups_per_flex) {
263 ext4_group_t f;
264
265 f = ext4_flex_group(sbi, block_group);
266 atomic_dec(&sbi->s_flex_groups[f].used_dirs);
267 }
268 255
269 } 256 count = ext4_free_inodes_count(sb, gdp) + 1;
270 gdp->bg_checksum = ext4_group_desc_csum(sbi, 257 ext4_free_inodes_set(sb, gdp, count);
271 block_group, gdp); 258 if (is_directory) {
272 ext4_unlock_group(sb, block_group); 259 count = ext4_used_dirs_count(sb, gdp) - 1;
273 percpu_counter_inc(&sbi->s_freeinodes_counter); 260 ext4_used_dirs_set(sb, gdp, count);
274 if (is_directory) 261 percpu_counter_dec(&sbi->s_dirs_counter);
275 percpu_counter_dec(&sbi->s_dirs_counter);
276
277 if (sbi->s_log_groups_per_flex) {
278 ext4_group_t f;
279
280 f = ext4_flex_group(sbi, block_group);
281 atomic_inc(&sbi->s_flex_groups[f].free_inodes);
282 }
283 }
284 BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
285 err = ext4_handle_dirty_metadata(handle, NULL, bh2);
286 if (!fatal) fatal = err;
287 } 262 }
288 BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata"); 263 gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp);
289 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); 264 ext4_unlock_group(sb, block_group);
290 if (!fatal) 265
291 fatal = err; 266 percpu_counter_inc(&sbi->s_freeinodes_counter);
292 sb->s_dirt = 1; 267 if (sbi->s_log_groups_per_flex) {
268 ext4_group_t f = ext4_flex_group(sbi, block_group);
269
270 atomic_inc(&sbi->s_flex_groups[f].free_inodes);
271 if (is_directory)
272 atomic_dec(&sbi->s_flex_groups[f].used_dirs);
273 }
274 BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
275 fatal = ext4_handle_dirty_metadata(handle, NULL, bh2);
276out:
277 if (cleared) {
278 BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata");
279 err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
280 if (!fatal)
281 fatal = err;
282 sb->s_dirt = 1;
283 } else
284 ext4_error(sb, "bit already cleared for inode %lu", ino);
285
293error_return: 286error_return:
294 brelse(bitmap_bh); 287 brelse(bitmap_bh);
295 ext4_std_error(sb, fatal); 288 ext4_std_error(sb, fatal);
@@ -499,7 +492,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
499 492
500 if (S_ISDIR(mode) && 493 if (S_ISDIR(mode) &&
501 ((parent == sb->s_root->d_inode) || 494 ((parent == sb->s_root->d_inode) ||
502 (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL))) { 495 (ext4_test_inode_flag(parent, EXT4_INODE_TOPDIR)))) {
503 int best_ndir = inodes_per_group; 496 int best_ndir = inodes_per_group;
504 int ret = -1; 497 int ret = -1;
505 498
@@ -1041,7 +1034,7 @@ got:
1041 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { 1034 if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
1042 /* set extent flag only for directory, file and normal symlink*/ 1035 /* set extent flag only for directory, file and normal symlink*/
1043 if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) { 1036 if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) {
1044 EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL; 1037 ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS);
1045 ext4_ext_tree_init(handle, inode); 1038 ext4_ext_tree_init(handle, inode);
1046 } 1039 }
1047 } 1040 }
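The ext4_free_inode() rewrite is mostly about ordering: journal write access is obtained before the group lock, the bitmap bit is cleared with plain ext4_clear_bit() under ext4_lock_group() (the atomic variant is no longer needed), and both failure cases leave through a single unlock. A skeleton of that ordering with no-op stubs for the ext4 helpers (a sketch, not the kernel code):

static int  journal_write_access(void) { return 0; }  /* 0 = ok */
static int  clear_bitmap_bit(void)     { return 1; }  /* 1 = bit was set */
static void lock_group(void)           { }
static void unlock_group(void)         { }

static int free_inode_model(void)
{
	int fatal, cleared;

	fatal = journal_write_access();	/* before taking the group lock */
	lock_group();
	cleared = clear_bitmap_bit();	/* non-atomic clear, under the lock */
	if (fatal || !cleared) {
		unlock_group();		/* one exit for both failures */
		goto out;
	}
	/* ...update free-inode / used-dirs counts while still locked... */
	unlock_group();
	/* ...then the percpu and flex-group counters, unlocked... */
out:
	return !cleared ? -1 /* double free */ : fatal;
}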
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 3e0f6af9d08d..42272d67955a 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -149,7 +149,7 @@ int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode,
149 int ret; 149 int ret;
150 150
151 /* 151 /*
152 * Drop i_data_sem to avoid deadlock with ext4_get_blocks At this 152 * Drop i_data_sem to avoid deadlock with ext4_map_blocks. At this
153 * moment, get_block can be called only for blocks inside i_size since 153 * moment, get_block can be called only for blocks inside i_size since
154 * page cache has been already dropped and writes are blocked by 154 * page cache has been already dropped and writes are blocked by
155 * i_mutex. So we can safely drop the i_data_sem here. 155 * i_mutex. So we can safely drop the i_data_sem here.
@@ -348,9 +348,8 @@ static int __ext4_check_blockref(const char *function, struct inode *inode,
348 if (blk && 348 if (blk &&
349 unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb), 349 unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb),
350 blk, 1))) { 350 blk, 1))) {
351 __ext4_error(inode->i_sb, function, 351 ext4_error_inode(function, inode,
352 "invalid block reference %u " 352 "invalid block reference %u", blk);
353 "in inode #%lu", blk, inode->i_ino);
354 return -EIO; 353 return -EIO;
355 } 354 }
356 } 355 }
@@ -785,7 +784,7 @@ failed:
785 /* Allocation failed, free what we already allocated */ 784 /* Allocation failed, free what we already allocated */
786 ext4_free_blocks(handle, inode, 0, new_blocks[0], 1, 0); 785 ext4_free_blocks(handle, inode, 0, new_blocks[0], 1, 0);
787 for (i = 1; i <= n ; i++) { 786 for (i = 1; i <= n ; i++) {
788 /* 787 /*
789 * branch[i].bh is newly allocated, so there is no 788 * branch[i].bh is newly allocated, so there is no
790 * need to revoke the block, which is why we don't 789 * need to revoke the block, which is why we don't
791 * need to set EXT4_FREE_BLOCKS_METADATA. 790 * need to set EXT4_FREE_BLOCKS_METADATA.
@@ -875,7 +874,7 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode,
875 874
876err_out: 875err_out:
877 for (i = 1; i <= num; i++) { 876 for (i = 1; i <= num; i++) {
878 /* 877 /*
879 * branch[i].bh is newly allocated, so there is no 878 * branch[i].bh is newly allocated, so there is no
880 * need to revoke the block, which is why we don't 879 * need to revoke the block, which is why we don't
881 * need to set EXT4_FREE_BLOCKS_METADATA. 880 * need to set EXT4_FREE_BLOCKS_METADATA.
@@ -890,9 +889,9 @@ err_out:
890} 889}
891 890
892/* 891/*
893 * The ext4_ind_get_blocks() function handles non-extents inodes 892 * The ext4_ind_map_blocks() function handles non-extents inodes
894 * (i.e., using the traditional indirect/double-indirect i_blocks 893 * (i.e., using the traditional indirect/double-indirect i_blocks
895 * scheme) for ext4_get_blocks(). 894 * scheme) for ext4_map_blocks().
896 * 895 *
897 * Allocation strategy is simple: if we have to allocate something, we will 896 * Allocation strategy is simple: if we have to allocate something, we will
898 * have to go the whole way to leaf. So let's do it before attaching anything 897 * have to go the whole way to leaf. So let's do it before attaching anything
@@ -917,9 +916,8 @@ err_out:
917 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system 916 * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system
918 * blocks. 917 * blocks.
919 */ 918 */
920static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode, 919static int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
921 ext4_lblk_t iblock, unsigned int maxblocks, 920 struct ext4_map_blocks *map,
922 struct buffer_head *bh_result,
923 int flags) 921 int flags)
924{ 922{
925 int err = -EIO; 923 int err = -EIO;
@@ -933,9 +931,9 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
933 int count = 0; 931 int count = 0;
934 ext4_fsblk_t first_block = 0; 932 ext4_fsblk_t first_block = 0;
935 933
936 J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)); 934 J_ASSERT(!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)));
937 J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0); 935 J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);
938 depth = ext4_block_to_path(inode, iblock, offsets, 936 depth = ext4_block_to_path(inode, map->m_lblk, offsets,
939 &blocks_to_boundary); 937 &blocks_to_boundary);
940 938
941 if (depth == 0) 939 if (depth == 0)
@@ -946,10 +944,9 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
946 /* Simplest case - block found, no allocation needed */ 944 /* Simplest case - block found, no allocation needed */
947 if (!partial) { 945 if (!partial) {
948 first_block = le32_to_cpu(chain[depth - 1].key); 946 first_block = le32_to_cpu(chain[depth - 1].key);
949 clear_buffer_new(bh_result);
950 count++; 947 count++;
951 /*map more blocks*/ 948 /*map more blocks*/
952 while (count < maxblocks && count <= blocks_to_boundary) { 949 while (count < map->m_len && count <= blocks_to_boundary) {
953 ext4_fsblk_t blk; 950 ext4_fsblk_t blk;
954 951
955 blk = le32_to_cpu(*(chain[depth-1].p + count)); 952 blk = le32_to_cpu(*(chain[depth-1].p + count));
@@ -969,7 +966,7 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
969 /* 966 /*
970 * Okay, we need to do block allocation. 967 * Okay, we need to do block allocation.
971 */ 968 */
972 goal = ext4_find_goal(inode, iblock, partial); 969 goal = ext4_find_goal(inode, map->m_lblk, partial);
973 970
974 /* the number of blocks need to allocate for [d,t]indirect blocks */ 971 /* the number of blocks need to allocate for [d,t]indirect blocks */
975 indirect_blks = (chain + depth) - partial - 1; 972 indirect_blks = (chain + depth) - partial - 1;
@@ -979,11 +976,11 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
979 * direct blocks to allocate for this branch. 976 * direct blocks to allocate for this branch.
980 */ 977 */
981 count = ext4_blks_to_allocate(partial, indirect_blks, 978 count = ext4_blks_to_allocate(partial, indirect_blks,
982 maxblocks, blocks_to_boundary); 979 map->m_len, blocks_to_boundary);
983 /* 980 /*
984 * Block out ext4_truncate while we alter the tree 981 * Block out ext4_truncate while we alter the tree
985 */ 982 */
986 err = ext4_alloc_branch(handle, inode, iblock, indirect_blks, 983 err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks,
987 &count, goal, 984 &count, goal,
988 offsets + (partial - chain), partial); 985 offsets + (partial - chain), partial);
989 986
@@ -995,18 +992,20 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
995 * may need to return -EAGAIN upwards in the worst case. --sct 992 * may need to return -EAGAIN upwards in the worst case. --sct
996 */ 993 */
997 if (!err) 994 if (!err)
998 err = ext4_splice_branch(handle, inode, iblock, 995 err = ext4_splice_branch(handle, inode, map->m_lblk,
999 partial, indirect_blks, count); 996 partial, indirect_blks, count);
1000 if (err) 997 if (err)
1001 goto cleanup; 998 goto cleanup;
1002 999
1003 set_buffer_new(bh_result); 1000 map->m_flags |= EXT4_MAP_NEW;
1004 1001
1005 ext4_update_inode_fsync_trans(handle, inode, 1); 1002 ext4_update_inode_fsync_trans(handle, inode, 1);
1006got_it: 1003got_it:
1007 map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key)); 1004 map->m_flags |= EXT4_MAP_MAPPED;
1005 map->m_pblk = le32_to_cpu(chain[depth-1].key);
1006 map->m_len = count;
1008 if (count > blocks_to_boundary) 1007 if (count > blocks_to_boundary)
1009 set_buffer_boundary(bh_result); 1008 map->m_flags |= EXT4_MAP_BOUNDARY;
1010 err = count; 1009 err = count;
1011 /* Clean up and exit */ 1010 /* Clean up and exit */
1012 partial = chain + depth - 1; /* the whole chain */ 1011 partial = chain + depth - 1; /* the whole chain */
@@ -1016,7 +1015,6 @@ cleanup:
1016 brelse(partial->bh); 1015 brelse(partial->bh);
1017 partial--; 1016 partial--;
1018 } 1017 }
1019 BUFFER_TRACE(bh_result, "returned");
1020out: 1018out:
1021 return err; 1019 return err;
1022} 1020}
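The indirect path completes the buffer_head-to-map conversion: each set_buffer_*() call from the old code has a corresponding bit in map->m_flags. A sketch of the correspondence and of the got_it path above (flag values are placeholders; the real EXT4_MAP_* constants are defined alongside struct ext4_map_blocks):

#include <stdint.h>

/* Placeholder bits mirroring the substitutions made in this patch. */
#define MAP_NEW       0x01	/* was set_buffer_new(bh)       */
#define MAP_MAPPED    0x02	/* was map_bh()/set_buffer_mapped(bh) */
#define MAP_UNWRITTEN 0x04	/* was set_buffer_unwritten(bh) */
#define MAP_BOUNDARY  0x08	/* was set_buffer_boundary(bh)  */
#define MAP_UNINIT    0x10	/* was set_buffer_uninit(bh)    */

struct map_blocks { uint64_t m_pblk; uint32_t m_lblk, m_len, m_flags; };

/* The got_it path above, restated. */
static void got_it_model(struct map_blocks *map, uint64_t pblk,
			 uint32_t count, uint32_t blocks_to_boundary)
{
	map->m_flags |= MAP_MAPPED;
	map->m_pblk = pblk;
	map->m_len = count;
	if (count > blocks_to_boundary)
		map->m_flags |= MAP_BOUNDARY;
}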
@@ -1061,7 +1059,7 @@ static int ext4_indirect_calc_metadata_amount(struct inode *inode,
1061 */ 1059 */
1062static int ext4_calc_metadata_amount(struct inode *inode, sector_t lblock) 1060static int ext4_calc_metadata_amount(struct inode *inode, sector_t lblock)
1063{ 1061{
1064 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) 1062 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
1065 return ext4_ext_calc_metadata_amount(inode, lblock); 1063 return ext4_ext_calc_metadata_amount(inode, lblock);
1066 1064
1067 return ext4_indirect_calc_metadata_amount(inode, lblock); 1065 return ext4_indirect_calc_metadata_amount(inode, lblock);
@@ -1076,7 +1074,6 @@ void ext4_da_update_reserve_space(struct inode *inode,
1076{ 1074{
1077 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1075 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1078 struct ext4_inode_info *ei = EXT4_I(inode); 1076 struct ext4_inode_info *ei = EXT4_I(inode);
1079 int mdb_free = 0, allocated_meta_blocks = 0;
1080 1077
1081 spin_lock(&ei->i_block_reservation_lock); 1078 spin_lock(&ei->i_block_reservation_lock);
1082 trace_ext4_da_update_reserve_space(inode, used); 1079 trace_ext4_da_update_reserve_space(inode, used);
@@ -1091,11 +1088,10 @@ void ext4_da_update_reserve_space(struct inode *inode,
1091 1088
1092 /* Update per-inode reservations */ 1089 /* Update per-inode reservations */
1093 ei->i_reserved_data_blocks -= used; 1090 ei->i_reserved_data_blocks -= used;
1094 used += ei->i_allocated_meta_blocks;
1095 ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks; 1091 ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks;
1096 allocated_meta_blocks = ei->i_allocated_meta_blocks; 1092 percpu_counter_sub(&sbi->s_dirtyblocks_counter,
1093 used + ei->i_allocated_meta_blocks);
1097 ei->i_allocated_meta_blocks = 0; 1094 ei->i_allocated_meta_blocks = 0;
1098 percpu_counter_sub(&sbi->s_dirtyblocks_counter, used);
1099 1095
1100 if (ei->i_reserved_data_blocks == 0) { 1096 if (ei->i_reserved_data_blocks == 0) {
1101 /* 1097 /*
@@ -1103,30 +1099,23 @@ void ext4_da_update_reserve_space(struct inode *inode,
1103 * only when we have written all of the delayed 1099 * only when we have written all of the delayed
1104 * allocation blocks. 1100 * allocation blocks.
1105 */ 1101 */
1106 mdb_free = ei->i_reserved_meta_blocks; 1102 percpu_counter_sub(&sbi->s_dirtyblocks_counter,
1103 ei->i_reserved_meta_blocks);
1107 ei->i_reserved_meta_blocks = 0; 1104 ei->i_reserved_meta_blocks = 0;
1108 ei->i_da_metadata_calc_len = 0; 1105 ei->i_da_metadata_calc_len = 0;
1109 percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free);
1110 } 1106 }
1111 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1107 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
1112 1108
1113 /* Update quota subsystem */ 1109 /* Update quota subsystem for data blocks */
1114 if (quota_claim) { 1110 if (quota_claim)
1115 dquot_claim_block(inode, used); 1111 dquot_claim_block(inode, used);
1116 if (mdb_free) 1112 else {
1117 dquot_release_reservation_block(inode, mdb_free);
1118 } else {
1119 /* 1113 /*
1120 * We did fallocate with an offset that was already delayed 1114 * We did fallocate with an offset that was already delayed
1121 * allocated. So on delayed-allocation writeback we should 1115 * allocated. So on delayed-allocation writeback we should
1122 * not update the quota for allocated blocks. But then 1116 * not re-claim the quota for fallocated blocks.
1123 * converting an fallocate region to initialized region would
1124 * have caused a metadata allocation. So claim quota for
1125 * that
1126 */ 1117 */
1127 if (allocated_meta_blocks) 1118 dquot_release_reservation_block(inode, used);
1128 dquot_claim_block(inode, allocated_meta_blocks);
1129 dquot_release_reservation_block(inode, mdb_free + used);
1130 } 1119 }
1131 1120
1132 /* 1121 /*
@@ -1139,15 +1128,15 @@ void ext4_da_update_reserve_space(struct inode *inode,
1139 ext4_discard_preallocations(inode); 1128 ext4_discard_preallocations(inode);
1140} 1129}
1141 1130
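The rework above collapses the old mdb_free / allocated_meta_blocks bookkeeping into two cases keyed off quota_claim. The quota lifecycle of a delalloc data block now runs, schematically (a hedged sketch of the call sequence implied by this hunk, not literal kernel code):

        /* at buffered-write time, in ext4_da_reserve_space() */
        dquot_reserve_block(inode, 1);          /* reserve quota for one data block */

        /* at writeback, once the blocks are physically allocated */
        if (quota_claim)
                dquot_claim_block(inode, used); /* convert the reservation into usage */
        else
                dquot_release_reservation_block(inode, used); /* fallocate was already charged */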
1142static int check_block_validity(struct inode *inode, const char *msg, 1131static int check_block_validity(struct inode *inode, const char *func,
1143 sector_t logical, sector_t phys, int len) 1132 struct ext4_map_blocks *map)
1144{ 1133{
1145 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) { 1134 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), map->m_pblk,
1146 __ext4_error(inode->i_sb, msg, 1135 map->m_len)) {
1147 "inode #%lu logical block %llu mapped to %llu " 1136 ext4_error_inode(func, inode,
1148 "(size %d)", inode->i_ino, 1137 "lblock %lu mapped to illegal pblock %llu "
1149 (unsigned long long) logical, 1138 "(length %d)", (unsigned long) map->m_lblk,
1150 (unsigned long long) phys, len); 1139 map->m_pblk, map->m_len);
1151 return -EIO; 1140 return -EIO;
1152 } 1141 }
1153 return 0; 1142 return 0;
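check_block_validity() now receives the whole mapping descriptor instead of loose (logical, physical, length) arguments. For orientation, the ext4_map_blocks structure this series threads through inode.c carries roughly these fields (sketch; the authoritative definition is added to fs/ext4/ext4.h elsewhere in this merge):

        struct ext4_map_blocks {
                ext4_fsblk_t m_pblk;    /* first physical block of the mapping */
                ext4_lblk_t m_lblk;     /* first logical block requested */
                unsigned int m_len;     /* number of blocks requested/mapped */
                unsigned int m_flags;   /* EXT4_MAP_NEW, _MAPPED, _UNWRITTEN, ... */
        };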
@@ -1212,15 +1201,15 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
1212} 1201}
1213 1202
1214/* 1203/*
1215 * The ext4_get_blocks() function tries to look up the requested blocks, 1204 * The ext4_map_blocks() function tries to look up the requested blocks,
1216 * and returns if the blocks are already mapped. 1205 * and returns if the blocks are already mapped.
1217 * 1206 *
1218 * Otherwise it takes the write lock of the i_data_sem and allocates blocks, 1207 * Otherwise it takes the write lock of the i_data_sem and allocates blocks,
1219 * stores the allocated blocks in the result buffer head and marks it 1208 * stores the allocated blocks in the result buffer head and marks it
1220 * mapped. 1209 * mapped.
1221 * 1210 *
1222 * If the file type is extents based, it will call ext4_ext_get_blocks(); 1211 * If the file type is extents based, it will call ext4_ext_map_blocks();
1223 * otherwise it calls ext4_ind_get_blocks() to handle indirect mapping 1212 * otherwise it calls ext4_ind_map_blocks() to handle indirect mapping
1224 * based files 1213 * based files
1225 * 1214 *
1226 * On success, it returns the number of blocks being mapped or allocated. 1215 * On success, it returns the number of blocks being mapped or allocated.
@@ -1233,35 +1222,29 @@ static pgoff_t ext4_num_dirty_pages(struct inode *inode, pgoff_t idx,
1233 * 1222 *
1234 * It returns the error in case of allocation failure. 1223 * It returns the error in case of allocation failure.
1235 */ 1224 */
1236int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, 1225int ext4_map_blocks(handle_t *handle, struct inode *inode,
1237 unsigned int max_blocks, struct buffer_head *bh, 1226 struct ext4_map_blocks *map, int flags)
1238 int flags)
1239{ 1227{
1240 int retval; 1228 int retval;
1241 1229
1242 clear_buffer_mapped(bh); 1230 map->m_flags = 0;
1243 clear_buffer_unwritten(bh); 1231 ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u,"
1244 1232 "logical block %lu\n", inode->i_ino, flags, map->m_len,
1245 ext_debug("ext4_get_blocks(): inode %lu, flag %d, max_blocks %u," 1233 (unsigned long) map->m_lblk);
1246 "logical block %lu\n", inode->i_ino, flags, max_blocks,
1247 (unsigned long)block);
1248 /* 1234 /*
1249 * Try to see if we can get the block without requesting a new 1235 * Try to see if we can get the block without requesting a new
1250 * file system block. 1236 * file system block.
1251 */ 1237 */
1252 down_read((&EXT4_I(inode)->i_data_sem)); 1238 down_read((&EXT4_I(inode)->i_data_sem));
1253 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { 1239 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
1254 retval = ext4_ext_get_blocks(handle, inode, block, max_blocks, 1240 retval = ext4_ext_map_blocks(handle, inode, map, 0);
1255 bh, 0);
1256 } else { 1241 } else {
1257 retval = ext4_ind_get_blocks(handle, inode, block, max_blocks, 1242 retval = ext4_ind_map_blocks(handle, inode, map, 0);
1258 bh, 0);
1259 } 1243 }
1260 up_read((&EXT4_I(inode)->i_data_sem)); 1244 up_read((&EXT4_I(inode)->i_data_sem));
1261 1245
1262 if (retval > 0 && buffer_mapped(bh)) { 1246 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
1263 int ret = check_block_validity(inode, "file system corruption", 1247 int ret = check_block_validity(inode, __func__, map);
1264 block, bh->b_blocknr, retval);
1265 if (ret != 0) 1248 if (ret != 0)
1266 return ret; 1249 return ret;
1267 } 1250 }
@@ -1277,7 +1260,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1277 * In the create == 0 case, ext4_ext_get_block() returns 1260 * In the create == 0 case, ext4_ext_get_block() returns
1278 * with the buffer head unmapped. 1261 * with the buffer head unmapped.
1279 */ 1262 */
1280 if (retval > 0 && buffer_mapped(bh)) 1263 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED)
1281 return retval; 1264 return retval;
1282 1265
1283 /* 1266 /*
@@ -1290,7 +1273,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1290 * of BH_Unwritten and BH_Mapped flags being simultaneously 1273 * of BH_Unwritten and BH_Mapped flags being simultaneously
1291 * set on the buffer_head. 1274 * set on the buffer_head.
1292 */ 1275 */
1293 clear_buffer_unwritten(bh); 1276 map->m_flags &= ~EXT4_MAP_UNWRITTEN;
1294 1277
1295 /* 1278 /*
1296 * New block allocation and/or writing to an uninitialized extent 1279 * New block allocation and/or writing to an uninitialized extent
@@ -1312,14 +1295,12 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1312 * We need to check for EXT4 here because migrate 1295 * We need to check for EXT4 here because migrate
1313 * could have changed the inode type in between 1296 * could have changed the inode type in between
1314 */ 1297 */
1315 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { 1298 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
1316 retval = ext4_ext_get_blocks(handle, inode, block, max_blocks, 1299 retval = ext4_ext_map_blocks(handle, inode, map, flags);
1317 bh, flags);
1318 } else { 1300 } else {
1319 retval = ext4_ind_get_blocks(handle, inode, block, 1301 retval = ext4_ind_map_blocks(handle, inode, map, flags);
1320 max_blocks, bh, flags);
1321 1302
1322 if (retval > 0 && buffer_new(bh)) { 1303 if (retval > 0 && map->m_flags & EXT4_MAP_NEW) {
1323 /* 1304 /*
1324 * We allocated new blocks which will result in 1305 * We allocated new blocks which will result in
1325 * i_data's format changing. Force the migrate 1306 * i_data's format changing. Force the migrate
@@ -1342,10 +1323,10 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1342 EXT4_I(inode)->i_delalloc_reserved_flag = 0; 1323 EXT4_I(inode)->i_delalloc_reserved_flag = 0;
1343 1324
1344 up_write((&EXT4_I(inode)->i_data_sem)); 1325 up_write((&EXT4_I(inode)->i_data_sem));
1345 if (retval > 0 && buffer_mapped(bh)) { 1326 if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
1346 int ret = check_block_validity(inode, "file system " 1327 int ret = check_block_validity(inode,
1347 "corruption after allocation", 1328 "ext4_map_blocks_after_alloc",
1348 block, bh->b_blocknr, retval); 1329 map);
1349 if (ret != 0) 1330 if (ret != 0)
1350 return ret; 1331 return ret;
1351 } 1332 }
@@ -1355,109 +1336,109 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
1355/* Maximum number of blocks we map for direct IO at once. */ 1336/* Maximum number of blocks we map for direct IO at once. */
1356#define DIO_MAX_BLOCKS 4096 1337#define DIO_MAX_BLOCKS 4096
1357 1338
1358int ext4_get_block(struct inode *inode, sector_t iblock, 1339static int _ext4_get_block(struct inode *inode, sector_t iblock,
1359 struct buffer_head *bh_result, int create) 1340 struct buffer_head *bh, int flags)
1360{ 1341{
1361 handle_t *handle = ext4_journal_current_handle(); 1342 handle_t *handle = ext4_journal_current_handle();
1343 struct ext4_map_blocks map;
1362 int ret = 0, started = 0; 1344 int ret = 0, started = 0;
1363 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
1364 int dio_credits; 1345 int dio_credits;
1365 1346
1366 if (create && !handle) { 1347 map.m_lblk = iblock;
1348 map.m_len = bh->b_size >> inode->i_blkbits;
1349
1350 if (flags && !handle) {
1367 /* Direct IO write... */ 1351 /* Direct IO write... */
1368 if (max_blocks > DIO_MAX_BLOCKS) 1352 if (map.m_len > DIO_MAX_BLOCKS)
1369 max_blocks = DIO_MAX_BLOCKS; 1353 map.m_len = DIO_MAX_BLOCKS;
1370 dio_credits = ext4_chunk_trans_blocks(inode, max_blocks); 1354 dio_credits = ext4_chunk_trans_blocks(inode, map.m_len);
1371 handle = ext4_journal_start(inode, dio_credits); 1355 handle = ext4_journal_start(inode, dio_credits);
1372 if (IS_ERR(handle)) { 1356 if (IS_ERR(handle)) {
1373 ret = PTR_ERR(handle); 1357 ret = PTR_ERR(handle);
1374 goto out; 1358 return ret;
1375 } 1359 }
1376 started = 1; 1360 started = 1;
1377 } 1361 }
1378 1362
1379 ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result, 1363 ret = ext4_map_blocks(handle, inode, &map, flags);
1380 create ? EXT4_GET_BLOCKS_CREATE : 0);
1381 if (ret > 0) { 1364 if (ret > 0) {
1382 bh_result->b_size = (ret << inode->i_blkbits); 1365 map_bh(bh, inode->i_sb, map.m_pblk);
1366 bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
1367 bh->b_size = inode->i_sb->s_blocksize * map.m_len;
1383 ret = 0; 1368 ret = 0;
1384 } 1369 }
1385 if (started) 1370 if (started)
1386 ext4_journal_stop(handle); 1371 ext4_journal_stop(handle);
1387out:
1388 return ret; 1372 return ret;
1389} 1373}
1390 1374
1375int ext4_get_block(struct inode *inode, sector_t iblock,
1376 struct buffer_head *bh, int create)
1377{
1378 return _ext4_get_block(inode, iblock, bh,
1379 create ? EXT4_GET_BLOCKS_CREATE : 0);
1380}
1381
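The thin ext4_get_block() wrapper preserves the get_block_t signature that generic buffer-cache code expects, while _ext4_get_block() copies the map result back into the buffer_head. A hypothetical caller, shown only to illustrate the contract (use_block() is made up):

        struct buffer_head bh = { .b_size = inode->i_sb->s_blocksize };
        int err;

        err = ext4_get_block(inode, iblock, &bh, 1);    /* create = 1: allocate on hole */
        if (!err && buffer_mapped(&bh))
                use_block(bh.b_blocknr);        /* physical block backing iblock */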
1391/* 1382/*
1392 * `handle' can be NULL if create is zero 1383 * `handle' can be NULL if create is zero
1393 */ 1384 */
1394struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, 1385struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
1395 ext4_lblk_t block, int create, int *errp) 1386 ext4_lblk_t block, int create, int *errp)
1396{ 1387{
1397 struct buffer_head dummy; 1388 struct ext4_map_blocks map;
1389 struct buffer_head *bh;
1398 int fatal = 0, err; 1390 int fatal = 0, err;
1399 int flags = 0;
1400 1391
1401 J_ASSERT(handle != NULL || create == 0); 1392 J_ASSERT(handle != NULL || create == 0);
1402 1393
1403 dummy.b_state = 0; 1394 map.m_lblk = block;
1404 dummy.b_blocknr = -1000; 1395 map.m_len = 1;
1405 buffer_trace_init(&dummy.b_history); 1396 err = ext4_map_blocks(handle, inode, &map,
1406 if (create) 1397 create ? EXT4_GET_BLOCKS_CREATE : 0);
1407 flags |= EXT4_GET_BLOCKS_CREATE; 1398
1408 err = ext4_get_blocks(handle, inode, block, 1, &dummy, flags); 1399 if (err < 0)
1409 /* 1400 *errp = err;
1410 * ext4_get_blocks() returns number of blocks mapped. 0 in 1401 if (err <= 0)
1411 * case of a HOLE. 1402 return NULL;
1412 */ 1403 *errp = 0;
1413 if (err > 0) { 1404
1414 if (err > 1) 1405 bh = sb_getblk(inode->i_sb, map.m_pblk);
1415 WARN_ON(1); 1406 if (!bh) {
1416 err = 0; 1407 *errp = -EIO;
1408 return NULL;
1417 } 1409 }
1418 *errp = err; 1410 if (map.m_flags & EXT4_MAP_NEW) {
1419 if (!err && buffer_mapped(&dummy)) { 1411 J_ASSERT(create != 0);
1420 struct buffer_head *bh; 1412 J_ASSERT(handle != NULL);
1421 bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
1422 if (!bh) {
1423 *errp = -EIO;
1424 goto err;
1425 }
1426 if (buffer_new(&dummy)) {
1427 J_ASSERT(create != 0);
1428 J_ASSERT(handle != NULL);
1429 1413
1430 /* 1414 /*
1431 * Now that we do not always journal data, we should 1415 * Now that we do not always journal data, we should
1432 * keep in mind whether this should always journal the 1416 * keep in mind whether this should always journal the
1433 * new buffer as metadata. For now, regular file 1417 * new buffer as metadata. For now, regular file
1434 * writes use ext4_get_block instead, so it's not a 1418 * writes use ext4_get_block instead, so it's not a
1435 * problem. 1419 * problem.
1436 */ 1420 */
1437 lock_buffer(bh); 1421 lock_buffer(bh);
1438 BUFFER_TRACE(bh, "call get_create_access"); 1422 BUFFER_TRACE(bh, "call get_create_access");
1439 fatal = ext4_journal_get_create_access(handle, bh); 1423 fatal = ext4_journal_get_create_access(handle, bh);
1440 if (!fatal && !buffer_uptodate(bh)) { 1424 if (!fatal && !buffer_uptodate(bh)) {
1441 memset(bh->b_data, 0, inode->i_sb->s_blocksize); 1425 memset(bh->b_data, 0, inode->i_sb->s_blocksize);
1442 set_buffer_uptodate(bh); 1426 set_buffer_uptodate(bh);
1443 }
1444 unlock_buffer(bh);
1445 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
1446 err = ext4_handle_dirty_metadata(handle, inode, bh);
1447 if (!fatal)
1448 fatal = err;
1449 } else {
1450 BUFFER_TRACE(bh, "not a new buffer");
1451 }
1452 if (fatal) {
1453 *errp = fatal;
1454 brelse(bh);
1455 bh = NULL;
1456 } 1427 }
1457 return bh; 1428 unlock_buffer(bh);
1429 BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
1430 err = ext4_handle_dirty_metadata(handle, inode, bh);
1431 if (!fatal)
1432 fatal = err;
1433 } else {
1434 BUFFER_TRACE(bh, "not a new buffer");
1458 } 1435 }
1459err: 1436 if (fatal) {
1460 return NULL; 1437 *errp = fatal;
1438 brelse(bh);
1439 bh = NULL;
1440 }
1441 return bh;
1461} 1442}
1462 1443
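Note the new return convention: ext4_getblk() returns NULL both on error (with *errp < 0) and on a hole (*errp left untouched), so callers must pre-initialize the error variable. An illustrative caller:

        int err = 0;
        struct buffer_head *bh = ext4_getblk(handle, inode, block, 0, &err);

        if (!bh && err)
                return err;     /* mapping failed */
        if (!bh)
                return 0;       /* hole: nothing mapped at `block' */
        /* ... use bh ... */
        brelse(bh);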
1463struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, 1444struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
@@ -1860,7 +1841,7 @@ static int ext4_da_reserve_space(struct inode *inode, sector_t lblock)
1860 int retries = 0; 1841 int retries = 0;
1861 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 1842 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
1862 struct ext4_inode_info *ei = EXT4_I(inode); 1843 struct ext4_inode_info *ei = EXT4_I(inode);
1863 unsigned long md_needed, md_reserved; 1844 unsigned long md_needed;
1864 int ret; 1845 int ret;
1865 1846
1866 /* 1847 /*
@@ -1870,22 +1851,24 @@ static int ext4_da_reserve_space(struct inode *inode, sector_t lblock)
1870 */ 1851 */
1871repeat: 1852repeat:
1872 spin_lock(&ei->i_block_reservation_lock); 1853 spin_lock(&ei->i_block_reservation_lock);
1873 md_reserved = ei->i_reserved_meta_blocks;
1874 md_needed = ext4_calc_metadata_amount(inode, lblock); 1854 md_needed = ext4_calc_metadata_amount(inode, lblock);
1875 trace_ext4_da_reserve_space(inode, md_needed); 1855 trace_ext4_da_reserve_space(inode, md_needed);
1876 spin_unlock(&ei->i_block_reservation_lock); 1856 spin_unlock(&ei->i_block_reservation_lock);
1877 1857
1878 /* 1858 /*
1879 * Make quota reservation here to prevent quota overflow 1859 * We will charge metadata quota at writeout time; this saves
1880 * later. Real quota accounting is done at pages writeout 1860 * us from metadata over-estimation, though we may go over by
1881 * time. 1861 * a small amount in the end. Here we just reserve for data.
1882 */ 1862 */
1883 ret = dquot_reserve_block(inode, md_needed + 1); 1863 ret = dquot_reserve_block(inode, 1);
1884 if (ret) 1864 if (ret)
1885 return ret; 1865 return ret;
1886 1866 /*
1867 * We do still charge estimated metadata to the sb though;
1868 * we cannot afford to run out of free blocks.
1869 */
1887 if (ext4_claim_free_blocks(sbi, md_needed + 1)) { 1870 if (ext4_claim_free_blocks(sbi, md_needed + 1)) {
1888 dquot_release_reservation_block(inode, md_needed + 1); 1871 dquot_release_reservation_block(inode, 1);
1889 if (ext4_should_retry_alloc(inode->i_sb, &retries)) { 1872 if (ext4_should_retry_alloc(inode->i_sb, &retries)) {
1890 yield(); 1873 yield();
1891 goto repeat; 1874 goto repeat;
@@ -1910,6 +1893,7 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
1910 1893
1911 spin_lock(&EXT4_I(inode)->i_block_reservation_lock); 1894 spin_lock(&EXT4_I(inode)->i_block_reservation_lock);
1912 1895
1896 trace_ext4_da_release_space(inode, to_free);
1913 if (unlikely(to_free > ei->i_reserved_data_blocks)) { 1897 if (unlikely(to_free > ei->i_reserved_data_blocks)) {
1914 /* 1898 /*
1915 * if there aren't enough reserved blocks, then the 1899 * if there aren't enough reserved blocks, then the
@@ -1932,12 +1916,13 @@ static void ext4_da_release_space(struct inode *inode, int to_free)
1932 * only when we have written all of the delayed 1916 * only when we have written all of the delayed
1933 * allocation blocks. 1917 * allocation blocks.
1934 */ 1918 */
1935 to_free += ei->i_reserved_meta_blocks; 1919 percpu_counter_sub(&sbi->s_dirtyblocks_counter,
1920 ei->i_reserved_meta_blocks);
1936 ei->i_reserved_meta_blocks = 0; 1921 ei->i_reserved_meta_blocks = 0;
1937 ei->i_da_metadata_calc_len = 0; 1922 ei->i_da_metadata_calc_len = 0;
1938 } 1923 }
1939 1924
1940 /* update fs dirty blocks counter */ 1925 /* update fs dirty data blocks counter */
1941 percpu_counter_sub(&sbi->s_dirtyblocks_counter, to_free); 1926 percpu_counter_sub(&sbi->s_dirtyblocks_counter, to_free);
1942 1927
1943 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); 1928 spin_unlock(&EXT4_I(inode)->i_block_reservation_lock);
@@ -2042,28 +2027,23 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
2042/* 2027/*
2043 * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers 2028 * mpage_put_bnr_to_bhs - walk blocks and assign them actual numbers
2044 * 2029 *
2045 * @mpd->inode - inode to walk through
2046 * @exbh->b_blocknr - first block on a disk
2047 * @exbh->b_size - amount of space in bytes
2048 * @logical - first logical block to start assignment with
2049 *
2050 * the function goes through all passed space and puts actual disk 2030 * the function goes through all passed space and puts actual disk
2051 * block numbers into buffer heads, dropping BH_Delay and BH_Unwritten 2031 * block numbers into buffer heads, dropping BH_Delay and BH_Unwritten
2052 */ 2032 */
2053static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical, 2033static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd,
2054 struct buffer_head *exbh) 2034 struct ext4_map_blocks *map)
2055{ 2035{
2056 struct inode *inode = mpd->inode; 2036 struct inode *inode = mpd->inode;
2057 struct address_space *mapping = inode->i_mapping; 2037 struct address_space *mapping = inode->i_mapping;
2058 int blocks = exbh->b_size >> inode->i_blkbits; 2038 int blocks = map->m_len;
2059 sector_t pblock = exbh->b_blocknr, cur_logical; 2039 sector_t pblock = map->m_pblk, cur_logical;
2060 struct buffer_head *head, *bh; 2040 struct buffer_head *head, *bh;
2061 pgoff_t index, end; 2041 pgoff_t index, end;
2062 struct pagevec pvec; 2042 struct pagevec pvec;
2063 int nr_pages, i; 2043 int nr_pages, i;
2064 2044
2065 index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits); 2045 index = map->m_lblk >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
2066 end = (logical + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits); 2046 end = (map->m_lblk + blocks - 1) >> (PAGE_CACHE_SHIFT - inode->i_blkbits);
2067 cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits); 2047 cur_logical = index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
2068 2048
2069 pagevec_init(&pvec, 0); 2049 pagevec_init(&pvec, 0);
@@ -2090,17 +2070,16 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
2090 2070
2091 /* skip blocks out of the range */ 2071 /* skip blocks out of the range */
2092 do { 2072 do {
2093 if (cur_logical >= logical) 2073 if (cur_logical >= map->m_lblk)
2094 break; 2074 break;
2095 cur_logical++; 2075 cur_logical++;
2096 } while ((bh = bh->b_this_page) != head); 2076 } while ((bh = bh->b_this_page) != head);
2097 2077
2098 do { 2078 do {
2099 if (cur_logical >= logical + blocks) 2079 if (cur_logical >= map->m_lblk + blocks)
2100 break; 2080 break;
2101 2081
2102 if (buffer_delay(bh) || 2082 if (buffer_delay(bh) || buffer_unwritten(bh)) {
2103 buffer_unwritten(bh)) {
2104 2083
2105 BUG_ON(bh->b_bdev != inode->i_sb->s_bdev); 2084 BUG_ON(bh->b_bdev != inode->i_sb->s_bdev);
2106 2085
@@ -2119,7 +2098,7 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
2119 } else if (buffer_mapped(bh)) 2098 } else if (buffer_mapped(bh))
2120 BUG_ON(bh->b_blocknr != pblock); 2099 BUG_ON(bh->b_blocknr != pblock);
2121 2100
2122 if (buffer_uninit(exbh)) 2101 if (map->m_flags & EXT4_MAP_UNINIT)
2123 set_buffer_uninit(bh); 2102 set_buffer_uninit(bh);
2124 cur_logical++; 2103 cur_logical++;
2125 pblock++; 2104 pblock++;
@@ -2130,21 +2109,6 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
2130} 2109}
2131 2110
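The index arithmetic above divides logical blocks by blocks-per-page. A quick worked example with 4K pages (PAGE_CACHE_SHIFT = 12) and 1K filesystem blocks (i_blkbits = 10):

        pgoff_t index = 13 >> (12 - 10);                /* logical block 13 lives in page 3 */
        sector_t cur_logical = index << (12 - 10);      /* page 3 starts at logical block 12 */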
2132 2111
2133/*
2134 * __unmap_underlying_blocks - just a helper function to unmap
2135 * set of blocks described by @bh
2136 */
2137static inline void __unmap_underlying_blocks(struct inode *inode,
2138 struct buffer_head *bh)
2139{
2140 struct block_device *bdev = inode->i_sb->s_bdev;
2141 int blocks, i;
2142
2143 blocks = bh->b_size >> inode->i_blkbits;
2144 for (i = 0; i < blocks; i++)
2145 unmap_underlying_metadata(bdev, bh->b_blocknr + i);
2146}
2147
2148static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd, 2112static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd,
2149 sector_t logical, long blk_cnt) 2113 sector_t logical, long blk_cnt)
2150{ 2114{
@@ -2206,7 +2170,7 @@ static void ext4_print_free_blocks(struct inode *inode)
2206static int mpage_da_map_blocks(struct mpage_da_data *mpd) 2170static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2207{ 2171{
2208 int err, blks, get_blocks_flags; 2172 int err, blks, get_blocks_flags;
2209 struct buffer_head new; 2173 struct ext4_map_blocks map;
2210 sector_t next = mpd->b_blocknr; 2174 sector_t next = mpd->b_blocknr;
2211 unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits; 2175 unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits;
2212 loff_t disksize = EXT4_I(mpd->inode)->i_disksize; 2176 loff_t disksize = EXT4_I(mpd->inode)->i_disksize;
@@ -2247,15 +2211,15 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2247 * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting 2211 * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting
2248 * variables are updated after the blocks have been allocated. 2212 * variables are updated after the blocks have been allocated.
2249 */ 2213 */
2250 new.b_state = 0; 2214 map.m_lblk = next;
2215 map.m_len = max_blocks;
2251 get_blocks_flags = EXT4_GET_BLOCKS_CREATE; 2216 get_blocks_flags = EXT4_GET_BLOCKS_CREATE;
2252 if (ext4_should_dioread_nolock(mpd->inode)) 2217 if (ext4_should_dioread_nolock(mpd->inode))
2253 get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; 2218 get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
2254 if (mpd->b_state & (1 << BH_Delay)) 2219 if (mpd->b_state & (1 << BH_Delay))
2255 get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; 2220 get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE;
2256 2221
2257 blks = ext4_get_blocks(handle, mpd->inode, next, max_blocks, 2222 blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags);
2258 &new, get_blocks_flags);
2259 if (blks < 0) { 2223 if (blks < 0) {
2260 err = blks; 2224 err = blks;
2261 /* 2225 /*
@@ -2282,7 +2246,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2282 ext4_msg(mpd->inode->i_sb, KERN_CRIT, 2246 ext4_msg(mpd->inode->i_sb, KERN_CRIT,
2283 "delayed block allocation failed for inode %lu at " 2247 "delayed block allocation failed for inode %lu at "
2284 "logical offset %llu with max blocks %zd with " 2248 "logical offset %llu with max blocks %zd with "
2285 "error %d\n", mpd->inode->i_ino, 2249 "error %d", mpd->inode->i_ino,
2286 (unsigned long long) next, 2250 (unsigned long long) next,
2287 mpd->b_size >> mpd->inode->i_blkbits, err); 2251 mpd->b_size >> mpd->inode->i_blkbits, err);
2288 printk(KERN_CRIT "This should not happen!! " 2252 printk(KERN_CRIT "This should not happen!! "
@@ -2297,10 +2261,13 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2297 } 2261 }
2298 BUG_ON(blks == 0); 2262 BUG_ON(blks == 0);
2299 2263
2300 new.b_size = (blks << mpd->inode->i_blkbits); 2264 if (map.m_flags & EXT4_MAP_NEW) {
2265 struct block_device *bdev = mpd->inode->i_sb->s_bdev;
2266 int i;
2301 2267
2302 if (buffer_new(&new)) 2268 for (i = 0; i < map.m_len; i++)
2303 __unmap_underlying_blocks(mpd->inode, &new); 2269 unmap_underlying_metadata(bdev, map.m_pblk + i);
2270 }
2304 2271
2305 /* 2272 /*
2306 * If blocks are delayed marked, we need to 2273 * If blocks are delayed marked, we need to
@@ -2308,7 +2275,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
2308 */ 2275 */
2309 if ((mpd->b_state & (1 << BH_Delay)) || 2276 if ((mpd->b_state & (1 << BH_Delay)) ||
2310 (mpd->b_state & (1 << BH_Unwritten))) 2277 (mpd->b_state & (1 << BH_Unwritten)))
2311 mpage_put_bnr_to_bhs(mpd, next, &new); 2278 mpage_put_bnr_to_bhs(mpd, &map);
2312 2279
2313 if (ext4_should_order_data(mpd->inode)) { 2280 if (ext4_should_order_data(mpd->inode)) {
2314 err = ext4_jbd2_file_inode(handle, mpd->inode); 2281 err = ext4_jbd2_file_inode(handle, mpd->inode);
@@ -2349,8 +2316,17 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
2349 sector_t next; 2316 sector_t next;
2350 int nrblocks = mpd->b_size >> mpd->inode->i_blkbits; 2317 int nrblocks = mpd->b_size >> mpd->inode->i_blkbits;
2351 2318
2319 /*
2320 * XXX Don't go larger than mballoc is willing to allocate
2321 * This is a stopgap solution. We eventually need to fold
2322 * mpage_da_submit_io() into this function and then call
2323 * ext4_get_blocks() multiple times in a loop
2324 */
2325 if (nrblocks >= 8*1024*1024/mpd->inode->i_sb->s_blocksize)
2326 goto flush_it;
2327
2352 /* check if the reserved journal credits might overflow */ 2328 /* check if the reserved journal credits might overflow */
2353 if (!(EXT4_I(mpd->inode)->i_flags & EXT4_EXTENTS_FL)) { 2329 if (!(ext4_test_inode_flag(mpd->inode, EXT4_INODE_EXTENTS))) {
2354 if (nrblocks >= EXT4_MAX_TRANS_DATA) { 2330 if (nrblocks >= EXT4_MAX_TRANS_DATA) {
2355 /* 2331 /*
2356 * With non-extent format we are limited by the journal 2332 * With non-extent format we are limited by the journal
@@ -2423,17 +2399,6 @@ static int __mpage_da_writepage(struct page *page,
2423 struct buffer_head *bh, *head; 2399 struct buffer_head *bh, *head;
2424 sector_t logical; 2400 sector_t logical;
2425 2401
2426 if (mpd->io_done) {
2427 /*
2428 * Rest of the page in the page_vec
2429 * redirty then and skip then. We will
2430 * try to write them again after
2431 * starting a new transaction
2432 */
2433 redirty_page_for_writepage(wbc, page);
2434 unlock_page(page);
2435 return MPAGE_DA_EXTENT_TAIL;
2436 }
2437 /* 2402 /*
2438 * Can we merge this page to current extent? 2403 * Can we merge this page to current extent?
2439 */ 2404 */
@@ -2528,8 +2493,9 @@ static int __mpage_da_writepage(struct page *page,
2528 * initialized properly. 2493 * initialized properly.
2529 */ 2494 */
2530static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, 2495static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2531 struct buffer_head *bh_result, int create) 2496 struct buffer_head *bh, int create)
2532{ 2497{
2498 struct ext4_map_blocks map;
2533 int ret = 0; 2499 int ret = 0;
2534 sector_t invalid_block = ~((sector_t) 0xffff); 2500 sector_t invalid_block = ~((sector_t) 0xffff);
2535 2501
@@ -2537,16 +2503,22 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2537 invalid_block = ~0; 2503 invalid_block = ~0;
2538 2504
2539 BUG_ON(create == 0); 2505 BUG_ON(create == 0);
2540 BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize); 2506 BUG_ON(bh->b_size != inode->i_sb->s_blocksize);
2507
2508 map.m_lblk = iblock;
2509 map.m_len = 1;
2541 2510
2542 /* 2511 /*
2543 * first, we need to know whether the block is allocated already 2512 * first, we need to know whether the block is allocated already
2544 * preallocated blocks are unmapped but should be treated 2513 * preallocated blocks are unmapped but should be treated
2545 * the same as allocated blocks. 2514 * the same as allocated blocks.
2546 */ 2515 */
2547 ret = ext4_get_blocks(NULL, inode, iblock, 1, bh_result, 0); 2516 ret = ext4_map_blocks(NULL, inode, &map, 0);
2548 if ((ret == 0) && !buffer_delay(bh_result)) { 2517 if (ret < 0)
2549 /* the block isn't (pre)allocated yet, let's reserve space */ 2518 return ret;
2519 if (ret == 0) {
2520 if (buffer_delay(bh))
2521 return 0; /* Not sure this could or should happen */
2550 /* 2522 /*
2551 * XXX: __block_prepare_write() unmaps passed block, 2523 * XXX: __block_prepare_write() unmaps passed block,
2552 * is it OK? 2524 * is it OK?
@@ -2556,26 +2528,26 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2556 /* not enough space to reserve */ 2528 /* not enough space to reserve */
2557 return ret; 2529 return ret;
2558 2530
2559 map_bh(bh_result, inode->i_sb, invalid_block); 2531 map_bh(bh, inode->i_sb, invalid_block);
2560 set_buffer_new(bh_result); 2532 set_buffer_new(bh);
2561 set_buffer_delay(bh_result); 2533 set_buffer_delay(bh);
2562 } else if (ret > 0) { 2534 return 0;
2563 bh_result->b_size = (ret << inode->i_blkbits);
2564 if (buffer_unwritten(bh_result)) {
2565 /* A delayed write to unwritten bh should
2566 * be marked new and mapped. Mapped ensures
2567 * that we don't do get_block multiple times
2568 * when we write to the same offset and new
2569 * ensures that we do proper zero out for
2570 * partial write.
2571 */
2572 set_buffer_new(bh_result);
2573 set_buffer_mapped(bh_result);
2574 }
2575 ret = 0;
2576 } 2535 }
2577 2536
2578 return ret; 2537 map_bh(bh, inode->i_sb, map.m_pblk);
2538 bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags;
2539
2540 if (buffer_unwritten(bh)) {
2541 /* A delayed write to unwritten bh should be marked
2542 * new and mapped. Mapped ensures that we don't do
2543 * get_block multiple times when we write to the same
2544 * offset and new ensures that we do proper zero out
2545 * for partial write.
2546 */
2547 set_buffer_new(bh);
2548 set_buffer_mapped(bh);
2549 }
2550 return 0;
2579} 2551}
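For the hole-with-reservation case, ext4_da_get_block_prep() maps the buffer to a sentinel block number; the flag combination is what writeback later keys on. The convention in isolation (sketch; invalid_block is the placeholder defined above):

        map_bh(bh, inode->i_sb, invalid_block); /* placeholder pblk, never submitted for I/O */
        set_buffer_new(bh);     /* forces zero-out on partial writes */
        set_buffer_delay(bh);   /* real allocation is deferred to writeback */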
2580 2552
2581/* 2553/*
@@ -2597,21 +2569,8 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
2597static int noalloc_get_block_write(struct inode *inode, sector_t iblock, 2569static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
2598 struct buffer_head *bh_result, int create) 2570 struct buffer_head *bh_result, int create)
2599{ 2571{
2600 int ret = 0;
2601 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
2602
2603 BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize); 2572 BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
2604 2573 return _ext4_get_block(inode, iblock, bh_result, 0);
2605 /*
2606 * we don't want to do block allocation in writepage
2607 * so call get_block_wrap with create = 0
2608 */
2609 ret = ext4_get_blocks(NULL, inode, iblock, max_blocks, bh_result, 0);
2610 if (ret > 0) {
2611 bh_result->b_size = (ret << inode->i_blkbits);
2612 ret = 0;
2613 }
2614 return ret;
2615} 2574}
2616 2575
2617static int bget_one(handle_t *handle, struct buffer_head *bh) 2576static int bget_one(handle_t *handle, struct buffer_head *bh)
@@ -2821,13 +2780,131 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
2821 * number of contiguous block. So we will limit 2780 * number of contiguous block. So we will limit
2822 * number of contiguous block to a sane value 2781 * number of contiguous block to a sane value
2823 */ 2782 */
2824 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) && 2783 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) &&
2825 (max_blocks > EXT4_MAX_TRANS_DATA)) 2784 (max_blocks > EXT4_MAX_TRANS_DATA))
2826 max_blocks = EXT4_MAX_TRANS_DATA; 2785 max_blocks = EXT4_MAX_TRANS_DATA;
2827 2786
2828 return ext4_chunk_trans_blocks(inode, max_blocks); 2787 return ext4_chunk_trans_blocks(inode, max_blocks);
2829} 2788}
2830 2789
2790/*
2791 * write_cache_pages_da - walk the list of dirty pages of the given
2792 * address space and call the callback function (which usually writes
2793 * the pages).
2794 *
2795 * This is a forked version of write_cache_pages(). Differences:
2796 * Range cyclic is ignored.
2797 * no_nrwrite_index_update is always presumed true
2798 */
2799static int write_cache_pages_da(struct address_space *mapping,
2800 struct writeback_control *wbc,
2801 struct mpage_da_data *mpd)
2802{
2803 int ret = 0;
2804 int done = 0;
2805 struct pagevec pvec;
2806 int nr_pages;
2807 pgoff_t index;
2808 pgoff_t end; /* Inclusive */
2809 long nr_to_write = wbc->nr_to_write;
2810
2811 pagevec_init(&pvec, 0);
2812 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2813 end = wbc->range_end >> PAGE_CACHE_SHIFT;
2814
2815 while (!done && (index <= end)) {
2816 int i;
2817
2818 nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
2819 PAGECACHE_TAG_DIRTY,
2820 min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1);
2821 if (nr_pages == 0)
2822 break;
2823
2824 for (i = 0; i < nr_pages; i++) {
2825 struct page *page = pvec.pages[i];
2826
2827 /*
2828 * At this point, the page may be truncated or
2829 * invalidated (changing page->mapping to NULL), or
2830 * even swizzled back from swapper_space to tmpfs file
2831 * mapping. However, page->index will not change
2832 * because we have a reference on the page.
2833 */
2834 if (page->index > end) {
2835 done = 1;
2836 break;
2837 }
2838
2839 lock_page(page);
2840
2841 /*
2842 * Page truncated or invalidated. We can freely skip it
2843 * then, even for data integrity operations: the page
2844 * has disappeared concurrently, so there could be no
2845 * real expectation of this data integrity operation
2846 * even if there is now a new, dirty page at the same
2847 * pagecache address.
2848 */
2849 if (unlikely(page->mapping != mapping)) {
2850continue_unlock:
2851 unlock_page(page);
2852 continue;
2853 }
2854
2855 if (!PageDirty(page)) {
2856 /* someone wrote it for us */
2857 goto continue_unlock;
2858 }
2859
2860 if (PageWriteback(page)) {
2861 if (wbc->sync_mode != WB_SYNC_NONE)
2862 wait_on_page_writeback(page);
2863 else
2864 goto continue_unlock;
2865 }
2866
2867 BUG_ON(PageWriteback(page));
2868 if (!clear_page_dirty_for_io(page))
2869 goto continue_unlock;
2870
2871 ret = __mpage_da_writepage(page, wbc, mpd);
2872 if (unlikely(ret)) {
2873 if (ret == AOP_WRITEPAGE_ACTIVATE) {
2874 unlock_page(page);
2875 ret = 0;
2876 } else {
2877 done = 1;
2878 break;
2879 }
2880 }
2881
2882 if (nr_to_write > 0) {
2883 nr_to_write--;
2884 if (nr_to_write == 0 &&
2885 wbc->sync_mode == WB_SYNC_NONE) {
2886 /*
2887 * We stop writing back only if we are
2888 * not doing integrity sync. In case of
2889 * integrity sync we have to keep going
2890 * because someone may be concurrently
2891 * dirtying pages, and we might have
2892 * synced a lot of newly appeared dirty
2893 * pages, but have not synced all of the
2894 * old dirty pages.
2895 */
2896 done = 1;
2897 break;
2898 }
2899 }
2900 }
2901 pagevec_release(&pvec);
2902 cond_resched();
2903 }
2904 return ret;
2905}
2906
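Stripped of the skip-and-retry details, write_cache_pages_da() is the standard tagged-pagevec walk; its skeleton looks like this (sketch, per-page work elided):

        struct pagevec pvec;
        pgoff_t index = wbc->range_start >> PAGE_CACHE_SHIFT;
        pgoff_t end = wbc->range_end >> PAGE_CACHE_SHIFT;
        int i, nr_pages;

        pagevec_init(&pvec, 0);
        while (index <= end) {
                nr_pages = pagevec_lookup_tag(&pvec, mapping, &index,
                                              PAGECACHE_TAG_DIRTY, PAGEVEC_SIZE);
                if (nr_pages == 0)
                        break;
                for (i = 0; i < nr_pages; i++) {
                        /* lock pvec.pages[i], re-check mapping and dirtiness,
                         * then hand the page to __mpage_da_writepage() */
                }
                pagevec_release(&pvec);
                cond_resched();
        }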
2907
2831static int ext4_da_writepages(struct address_space *mapping, 2908static int ext4_da_writepages(struct address_space *mapping,
2832 struct writeback_control *wbc) 2909 struct writeback_control *wbc)
2833{ 2910{
@@ -2836,7 +2913,6 @@ static int ext4_da_writepages(struct address_space *mapping,
2836 handle_t *handle = NULL; 2913 handle_t *handle = NULL;
2837 struct mpage_da_data mpd; 2914 struct mpage_da_data mpd;
2838 struct inode *inode = mapping->host; 2915 struct inode *inode = mapping->host;
2839 int no_nrwrite_index_update;
2840 int pages_written = 0; 2916 int pages_written = 0;
2841 long pages_skipped; 2917 long pages_skipped;
2842 unsigned int max_pages; 2918 unsigned int max_pages;
@@ -2916,12 +2992,6 @@ static int ext4_da_writepages(struct address_space *mapping,
2916 mpd.wbc = wbc; 2992 mpd.wbc = wbc;
2917 mpd.inode = mapping->host; 2993 mpd.inode = mapping->host;
2918 2994
2919 /*
2920 * we don't want write_cache_pages to update
2921 * nr_to_write and writeback_index
2922 */
2923 no_nrwrite_index_update = wbc->no_nrwrite_index_update;
2924 wbc->no_nrwrite_index_update = 1;
2925 pages_skipped = wbc->pages_skipped; 2995 pages_skipped = wbc->pages_skipped;
2926 2996
2927retry: 2997retry:
@@ -2941,7 +3011,7 @@ retry:
2941 if (IS_ERR(handle)) { 3011 if (IS_ERR(handle)) {
2942 ret = PTR_ERR(handle); 3012 ret = PTR_ERR(handle);
2943 ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: " 3013 ext4_msg(inode->i_sb, KERN_CRIT, "%s: jbd2_start: "
2944 "%ld pages, ino %lu; err %d\n", __func__, 3014 "%ld pages, ino %lu; err %d", __func__,
2945 wbc->nr_to_write, inode->i_ino, ret); 3015 wbc->nr_to_write, inode->i_ino, ret);
2946 goto out_writepages; 3016 goto out_writepages;
2947 } 3017 }
@@ -2963,8 +3033,7 @@ retry:
2963 mpd.io_done = 0; 3033 mpd.io_done = 0;
2964 mpd.pages_written = 0; 3034 mpd.pages_written = 0;
2965 mpd.retval = 0; 3035 mpd.retval = 0;
2966 ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, 3036 ret = write_cache_pages_da(mapping, wbc, &mpd);
2967 &mpd);
2968 /* 3037 /*
2969 * If we have a contiguous extent of pages and we 3038 * If we have a contiguous extent of pages and we
2970 * haven't done the I/O yet, map the blocks and submit 3039 * haven't done the I/O yet, map the blocks and submit
@@ -3016,7 +3085,7 @@ retry:
3016 if (pages_skipped != wbc->pages_skipped) 3085 if (pages_skipped != wbc->pages_skipped)
3017 ext4_msg(inode->i_sb, KERN_CRIT, 3086 ext4_msg(inode->i_sb, KERN_CRIT,
3018 "This should not happen leaving %s " 3087 "This should not happen leaving %s "
3019 "with nr_to_write = %ld ret = %d\n", 3088 "with nr_to_write = %ld ret = %d",
3020 __func__, wbc->nr_to_write, ret); 3089 __func__, wbc->nr_to_write, ret);
3021 3090
3022 /* Update index */ 3091 /* Update index */
@@ -3030,8 +3099,6 @@ retry:
3030 mapping->writeback_index = index; 3099 mapping->writeback_index = index;
3031 3100
3032out_writepages: 3101out_writepages:
3033 if (!no_nrwrite_index_update)
3034 wbc->no_nrwrite_index_update = 0;
3035 wbc->nr_to_write -= nr_to_writebump; 3102 wbc->nr_to_write -= nr_to_writebump;
3036 wbc->range_start = range_start; 3103 wbc->range_start = range_start;
3037 trace_ext4_da_writepages_result(inode, wbc, ret, pages_written); 3104 trace_ext4_da_writepages_result(inode, wbc, ret, pages_written);
@@ -3076,7 +3143,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
3076 loff_t pos, unsigned len, unsigned flags, 3143 loff_t pos, unsigned len, unsigned flags,
3077 struct page **pagep, void **fsdata) 3144 struct page **pagep, void **fsdata)
3078{ 3145{
3079 int ret, retries = 0, quota_retries = 0; 3146 int ret, retries = 0;
3080 struct page *page; 3147 struct page *page;
3081 pgoff_t index; 3148 pgoff_t index;
3082 unsigned from, to; 3149 unsigned from, to;
@@ -3135,22 +3202,6 @@ retry:
3135 3202
3136 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) 3203 if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
3137 goto retry; 3204 goto retry;
3138
3139 if ((ret == -EDQUOT) &&
3140 EXT4_I(inode)->i_reserved_meta_blocks &&
3141 (quota_retries++ < 3)) {
3142 /*
3143 * Since we often over-estimate the number of meta
3144 * data blocks required, we may sometimes get a
3145 * spurios out of quota error even though there would
3146 * be enough space once we write the data blocks and
3147 * find out how many meta data blocks were _really_
3148 * required. So try forcing the inode write to see if
3149 * that helps.
3150 */
3151 write_inode_now(inode, (quota_retries == 3));
3152 goto retry;
3153 }
3154out: 3205out:
3155 return ret; 3206 return ret;
3156} 3207}
@@ -3546,46 +3597,18 @@ out:
3546 return ret; 3597 return ret;
3547} 3598}
3548 3599
3600/*
3601 * ext4_get_block used when preparing for a DIO write or buffer write.
3602 * We allocate an uninitialized extent if blocks haven't been allocated.
3603 * The extent will be converted to initialized after the IO is complete.
3604 */
3549static int ext4_get_block_write(struct inode *inode, sector_t iblock, 3605static int ext4_get_block_write(struct inode *inode, sector_t iblock,
3550 struct buffer_head *bh_result, int create) 3606 struct buffer_head *bh_result, int create)
3551{ 3607{
3552 handle_t *handle = ext4_journal_current_handle();
3553 int ret = 0;
3554 unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
3555 int dio_credits;
3556 int started = 0;
3557
3558 ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n", 3608 ext4_debug("ext4_get_block_write: inode %lu, create flag %d\n",
3559 inode->i_ino, create); 3609 inode->i_ino, create);
3560 /* 3610 return _ext4_get_block(inode, iblock, bh_result,
3561 * ext4_get_block in prepare for a DIO write or buffer write. 3611 EXT4_GET_BLOCKS_IO_CREATE_EXT);
3562 * We allocate an uinitialized extent if blocks haven't been allocated.
3563 * The extent will be converted to initialized after IO complete.
3564 */
3565 create = EXT4_GET_BLOCKS_IO_CREATE_EXT;
3566
3567 if (!handle) {
3568 if (max_blocks > DIO_MAX_BLOCKS)
3569 max_blocks = DIO_MAX_BLOCKS;
3570 dio_credits = ext4_chunk_trans_blocks(inode, max_blocks);
3571 handle = ext4_journal_start(inode, dio_credits);
3572 if (IS_ERR(handle)) {
3573 ret = PTR_ERR(handle);
3574 goto out;
3575 }
3576 started = 1;
3577 }
3578
3579 ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
3580 create);
3581 if (ret > 0) {
3582 bh_result->b_size = (ret << inode->i_blkbits);
3583 ret = 0;
3584 }
3585 if (started)
3586 ext4_journal_stop(handle);
3587out:
3588 return ret;
3589} 3612}
3590 3613
3591static void dump_completed_IO(struct inode * inode) 3614static void dump_completed_IO(struct inode * inode)
@@ -3973,7 +3996,7 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
3973 struct file *file = iocb->ki_filp; 3996 struct file *file = iocb->ki_filp;
3974 struct inode *inode = file->f_mapping->host; 3997 struct inode *inode = file->f_mapping->host;
3975 3998
3976 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) 3999 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
3977 return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs); 4000 return ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
3978 4001
3979 return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs); 4002 return ext4_ind_direct_IO(rw, iocb, iov, offset, nr_segs);
@@ -4302,10 +4325,9 @@ static int ext4_clear_blocks(handle_t *handle, struct inode *inode,
4302 4325
4303 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free, 4326 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), block_to_free,
4304 count)) { 4327 count)) {
4305 ext4_error(inode->i_sb, "inode #%lu: " 4328 EXT4_ERROR_INODE(inode, "attempt to clear invalid "
4306 "attempt to clear blocks %llu len %lu, invalid", 4329 "blocks %llu len %lu",
4307 inode->i_ino, (unsigned long long) block_to_free, 4330 (unsigned long long) block_to_free, count);
4308 count);
4309 return 1; 4331 return 1;
4310 } 4332 }
4311 4333
@@ -4410,11 +4432,10 @@ static void ext4_free_data(handle_t *handle, struct inode *inode,
4410 if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh)) 4432 if ((EXT4_JOURNAL(inode) == NULL) || bh2jh(this_bh))
4411 ext4_handle_dirty_metadata(handle, inode, this_bh); 4433 ext4_handle_dirty_metadata(handle, inode, this_bh);
4412 else 4434 else
4413 ext4_error(inode->i_sb, 4435 EXT4_ERROR_INODE(inode,
4414 "circular indirect block detected, " 4436 "circular indirect block detected at "
4415 "inode=%lu, block=%llu", 4437 "block %llu",
4416 inode->i_ino, 4438 (unsigned long long) this_bh->b_blocknr);
4417 (unsigned long long) this_bh->b_blocknr);
4418 } 4439 }
4419} 4440}
4420 4441
@@ -4452,11 +4473,10 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
4452 4473
4453 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), 4474 if (!ext4_data_block_valid(EXT4_SB(inode->i_sb),
4454 nr, 1)) { 4475 nr, 1)) {
4455 ext4_error(inode->i_sb, 4476 EXT4_ERROR_INODE(inode,
4456 "indirect mapped block in inode " 4477 "invalid indirect mapped "
4457 "#%lu invalid (level %d, blk #%lu)", 4478 "block %lu (level %d)",
4458 inode->i_ino, depth, 4479 (unsigned long) nr, depth);
4459 (unsigned long) nr);
4460 break; 4480 break;
4461 } 4481 }
4462 4482
@@ -4468,9 +4488,9 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode,
4468 * (should be rare). 4488 * (should be rare).
4469 */ 4489 */
4470 if (!bh) { 4490 if (!bh) {
4471 ext4_error(inode->i_sb, 4491 EXT4_ERROR_INODE(inode,
4472 "Read failure, inode=%lu, block=%llu", 4492 "Read failure block=%llu",
4473 inode->i_ino, nr); 4493 (unsigned long long) nr);
4474 continue; 4494 continue;
4475 } 4495 }
4476 4496
@@ -4612,12 +4632,12 @@ void ext4_truncate(struct inode *inode)
4612 if (!ext4_can_truncate(inode)) 4632 if (!ext4_can_truncate(inode))
4613 return; 4633 return;
4614 4634
4615 EXT4_I(inode)->i_flags &= ~EXT4_EOFBLOCKS_FL; 4635 ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
4616 4636
4617 if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC)) 4637 if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
4618 ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE); 4638 ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
4619 4639
4620 if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { 4640 if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
4621 ext4_ext_truncate(inode); 4641 ext4_ext_truncate(inode);
4622 return; 4642 return;
4623 } 4643 }
@@ -4785,8 +4805,8 @@ static int __ext4_get_inode_loc(struct inode *inode,
4785 4805
4786 bh = sb_getblk(sb, block); 4806 bh = sb_getblk(sb, block);
4787 if (!bh) { 4807 if (!bh) {
4788 ext4_error(sb, "unable to read inode block - " 4808 EXT4_ERROR_INODE(inode, "unable to read inode block - "
4789 "inode=%lu, block=%llu", inode->i_ino, block); 4809 "block %llu", block);
4790 return -EIO; 4810 return -EIO;
4791 } 4811 }
4792 if (!buffer_uptodate(bh)) { 4812 if (!buffer_uptodate(bh)) {
@@ -4884,8 +4904,8 @@ make_io:
4884 submit_bh(READ_META, bh); 4904 submit_bh(READ_META, bh);
4885 wait_on_buffer(bh); 4905 wait_on_buffer(bh);
4886 if (!buffer_uptodate(bh)) { 4906 if (!buffer_uptodate(bh)) {
4887 ext4_error(sb, "unable to read inode block - inode=%lu," 4907 EXT4_ERROR_INODE(inode, "unable to read inode "
4888 " block=%llu", inode->i_ino, block); 4908 "block %llu", block);
4889 brelse(bh); 4909 brelse(bh);
4890 return -EIO; 4910 return -EIO;
4891 } 4911 }
@@ -4922,20 +4942,26 @@ void ext4_set_inode_flags(struct inode *inode)
4922/* Propagate flags from i_flags to EXT4_I(inode)->i_flags */ 4942/* Propagate flags from i_flags to EXT4_I(inode)->i_flags */
4923void ext4_get_inode_flags(struct ext4_inode_info *ei) 4943void ext4_get_inode_flags(struct ext4_inode_info *ei)
4924{ 4944{
4925 unsigned int flags = ei->vfs_inode.i_flags; 4945 unsigned int vfs_fl;
4926 4946 unsigned long old_fl, new_fl;
4927 ei->i_flags &= ~(EXT4_SYNC_FL|EXT4_APPEND_FL| 4947
4928 EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL|EXT4_DIRSYNC_FL); 4948 do {
4929 if (flags & S_SYNC) 4949 vfs_fl = ei->vfs_inode.i_flags;
4930 ei->i_flags |= EXT4_SYNC_FL; 4950 old_fl = ei->i_flags;
4931 if (flags & S_APPEND) 4951 new_fl = old_fl & ~(EXT4_SYNC_FL|EXT4_APPEND_FL|
4932 ei->i_flags |= EXT4_APPEND_FL; 4952 EXT4_IMMUTABLE_FL|EXT4_NOATIME_FL|
4933 if (flags & S_IMMUTABLE) 4953 EXT4_DIRSYNC_FL);
4934 ei->i_flags |= EXT4_IMMUTABLE_FL; 4954 if (vfs_fl & S_SYNC)
4935 if (flags & S_NOATIME) 4955 new_fl |= EXT4_SYNC_FL;
4936 ei->i_flags |= EXT4_NOATIME_FL; 4956 if (vfs_fl & S_APPEND)
4937 if (flags & S_DIRSYNC) 4957 new_fl |= EXT4_APPEND_FL;
4938 ei->i_flags |= EXT4_DIRSYNC_FL; 4958 if (vfs_fl & S_IMMUTABLE)
4959 new_fl |= EXT4_IMMUTABLE_FL;
4960 if (vfs_fl & S_NOATIME)
4961 new_fl |= EXT4_NOATIME_FL;
4962 if (vfs_fl & S_DIRSYNC)
4963 new_fl |= EXT4_DIRSYNC_FL;
4964 } while (cmpxchg(&ei->i_flags, old_fl, new_fl) != old_fl);
4939} 4965}
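The cmpxchg() loop replaces a non-atomic read-modify-write, so a racing i_flags update can no longer be lost: each pass recomputes new_fl from a fresh snapshot and commits only if nobody changed the word in between. The idiom in isolation (sketch; compute_new() is a hypothetical helper):

        unsigned long old, new;

        do {
                old = ei->i_flags;      /* snapshot the current value */
                new = compute_new(old); /* derive the desired value from it */
        } while (cmpxchg(&ei->i_flags, old, new) != old);       /* retry on race */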
4940 4966
4941static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode, 4967static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
@@ -5096,8 +5122,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
5096 ret = 0; 5122 ret = 0;
5097 if (ei->i_file_acl && 5123 if (ei->i_file_acl &&
5098 !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) { 5124 !ext4_data_block_valid(EXT4_SB(sb), ei->i_file_acl, 1)) {
5099 ext4_error(sb, "bad extended attribute block %llu inode #%lu", 5125 EXT4_ERROR_INODE(inode, "bad extended attribute block %llu",
5100 ei->i_file_acl, inode->i_ino); 5126 ei->i_file_acl);
5101 ret = -EIO; 5127 ret = -EIO;
5102 goto bad_inode; 5128 goto bad_inode;
5103 } else if (ei->i_flags & EXT4_EXTENTS_FL) { 5129 } else if (ei->i_flags & EXT4_EXTENTS_FL) {
@@ -5142,8 +5168,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
5142 new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); 5168 new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
5143 } else { 5169 } else {
5144 ret = -EIO; 5170 ret = -EIO;
5145 ext4_error(inode->i_sb, "bogus i_mode (%o) for inode=%lu", 5171 EXT4_ERROR_INODE(inode, "bogus i_mode (%o)", inode->i_mode);
5146 inode->i_mode, inode->i_ino);
5147 goto bad_inode; 5172 goto bad_inode;
5148 } 5173 }
5149 brelse(iloc.bh); 5174 brelse(iloc.bh);
@@ -5172,7 +5197,7 @@ static int ext4_inode_blocks_set(handle_t *handle,
5172 */ 5197 */
5173 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); 5198 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
5174 raw_inode->i_blocks_high = 0; 5199 raw_inode->i_blocks_high = 0;
5175 ei->i_flags &= ~EXT4_HUGE_FILE_FL; 5200 ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
5176 return 0; 5201 return 0;
5177 } 5202 }
5178 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) 5203 if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE))
@@ -5185,9 +5210,9 @@ static int ext4_inode_blocks_set(handle_t *handle,
5185 */ 5210 */
5186 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); 5211 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
5187 raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); 5212 raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32);
5188 ei->i_flags &= ~EXT4_HUGE_FILE_FL; 5213 ext4_clear_inode_flag(inode, EXT4_INODE_HUGE_FILE);
5189 } else { 5214 } else {
5190 ei->i_flags |= EXT4_HUGE_FILE_FL; 5215 ext4_set_inode_flag(inode, EXT4_INODE_HUGE_FILE);
5191 /* i_block is stored in file system block size */ 5216 /* i_block is stored in file system block size */
5192 i_blocks = i_blocks >> (inode->i_blkbits - 9); 5217 i_blocks = i_blocks >> (inode->i_blkbits - 9);
5193 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); 5218 raw_inode->i_blocks_lo = cpu_to_le32(i_blocks);
@@ -5381,9 +5406,9 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc)
5381 if (wbc->sync_mode == WB_SYNC_ALL) 5406 if (wbc->sync_mode == WB_SYNC_ALL)
5382 sync_dirty_buffer(iloc.bh); 5407 sync_dirty_buffer(iloc.bh);
5383 if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { 5408 if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) {
5384 ext4_error(inode->i_sb, "IO error syncing inode, " 5409 EXT4_ERROR_INODE(inode,
5385 "inode=%lu, block=%llu", inode->i_ino, 5410 "IO error syncing inode (block=%llu)",
5386 (unsigned long long)iloc.bh->b_blocknr); 5411 (unsigned long long) iloc.bh->b_blocknr);
5387 err = -EIO; 5412 err = -EIO;
5388 } 5413 }
5389 brelse(iloc.bh); 5414 brelse(iloc.bh);
@@ -5455,7 +5480,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5455 } 5480 }
5456 5481
5457 if (attr->ia_valid & ATTR_SIZE) { 5482 if (attr->ia_valid & ATTR_SIZE) {
5458 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) { 5483 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
5459 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); 5484 struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
5460 5485
5461 if (attr->ia_size > sbi->s_bitmap_maxbytes) { 5486 if (attr->ia_size > sbi->s_bitmap_maxbytes) {
@@ -5468,7 +5493,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5468 if (S_ISREG(inode->i_mode) && 5493 if (S_ISREG(inode->i_mode) &&
5469 attr->ia_valid & ATTR_SIZE && 5494 attr->ia_valid & ATTR_SIZE &&
5470 (attr->ia_size < inode->i_size || 5495 (attr->ia_size < inode->i_size ||
5471 (EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL))) { 5496 (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))) {
5472 handle_t *handle; 5497 handle_t *handle;
5473 5498
5474 handle = ext4_journal_start(inode, 3); 5499 handle = ext4_journal_start(inode, 3);
@@ -5500,7 +5525,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
5500 } 5525 }
5501 } 5526 }
5502 /* ext4_truncate will clear the flag */ 5527 /* ext4_truncate will clear the flag */
5503 if ((EXT4_I(inode)->i_flags & EXT4_EOFBLOCKS_FL)) 5528 if ((ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS)))
5504 ext4_truncate(inode); 5529 ext4_truncate(inode);
5505 } 5530 }
5506 5531
@@ -5576,7 +5601,7 @@ static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks,
5576 5601
5577static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) 5602static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
5578{ 5603{
5579 if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) 5604 if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
5580 return ext4_indirect_trans_blocks(inode, nrblocks, chunk); 5605 return ext4_indirect_trans_blocks(inode, nrblocks, chunk);
5581 return ext4_ext_index_trans_blocks(inode, nrblocks, chunk); 5606 return ext4_ext_index_trans_blocks(inode, nrblocks, chunk);
5582} 5607}
@@ -5911,9 +5936,9 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val)
5911 */ 5936 */
5912 5937
5913 if (val) 5938 if (val)
5914 EXT4_I(inode)->i_flags |= EXT4_JOURNAL_DATA_FL; 5939 ext4_set_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
5915 else 5940 else
5916 EXT4_I(inode)->i_flags &= ~EXT4_JOURNAL_DATA_FL; 5941 ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA);
5917 ext4_set_aops(inode); 5942 ext4_set_aops(inode);
5918 5943
5919 jbd2_journal_unlock_updates(journal); 5944 jbd2_journal_unlock_updates(journal);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 016d0249294f..bf5ae883b1bd 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -258,7 +258,7 @@ setversion_out:
258 if (me.moved_len > 0) 258 if (me.moved_len > 0)
259 file_remove_suid(donor_filp); 259 file_remove_suid(donor_filp);
260 260
261 if (copy_to_user((struct move_extent __user *)arg, 261 if (copy_to_user((struct move_extent __user *)arg,
262 &me, sizeof(me))) 262 &me, sizeof(me)))
263 err = -EFAULT; 263 err = -EFAULT;
264mext_out: 264mext_out:
@@ -373,7 +373,30 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
373 case EXT4_IOC32_SETRSVSZ: 373 case EXT4_IOC32_SETRSVSZ:
374 cmd = EXT4_IOC_SETRSVSZ; 374 cmd = EXT4_IOC_SETRSVSZ;
375 break; 375 break;
376 case EXT4_IOC_GROUP_ADD: 376 case EXT4_IOC32_GROUP_ADD: {
377 struct compat_ext4_new_group_input __user *uinput;
378 struct ext4_new_group_input input;
379 mm_segment_t old_fs;
380 int err;
381
382 uinput = compat_ptr(arg);
383 err = get_user(input.group, &uinput->group);
384 err |= get_user(input.block_bitmap, &uinput->block_bitmap);
385 err |= get_user(input.inode_bitmap, &uinput->inode_bitmap);
386 err |= get_user(input.inode_table, &uinput->inode_table);
387 err |= get_user(input.blocks_count, &uinput->blocks_count);
388 err |= get_user(input.reserved_blocks,
389 &uinput->reserved_blocks);
390 if (err)
391 return -EFAULT;
392 old_fs = get_fs();
393 set_fs(KERNEL_DS);
394 err = ext4_ioctl(file, EXT4_IOC_GROUP_ADD,
395 (unsigned long) &input);
396 set_fs(old_fs);
397 return err;
398 }
399 case EXT4_IOC_MOVE_EXT:
377 break; 400 break;
378 default: 401 default:
379 return -ENOIOCTLCMD; 402 return -ENOIOCTLCMD;
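The new EXT4_IOC32_GROUP_ADD case copies the 32-bit layout field by field into a native struct and then re-enters the regular ioctl handler with a kernel pointer, using the classic set_fs(KERNEL_DS) trick. Reduced to its shape (sketch):

        mm_segment_t old_fs = get_fs();

        set_fs(KERNEL_DS);      /* let copy_from_user() accept the kernel pointer */
        err = ext4_ioctl(file, EXT4_IOC_GROUP_ADD, (unsigned long) &input);
        set_fs(old_fs);         /* always restore the old segment limit */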
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index b423a364dca3..12b3bc026a68 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -658,6 +658,27 @@ static void ext4_mb_mark_free_simple(struct super_block *sb,
658 } 658 }
659} 659}
660 660
661/*
662 * Cache the order of the largest free extent we have available in this block
663 * group.
664 */
665static void
666mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp)
667{
668 int i;
669 int bits;
670
671 grp->bb_largest_free_order = -1; /* uninit */
672
673 bits = sb->s_blocksize_bits + 1;
674 for (i = bits; i >= 0; i--) {
675 if (grp->bb_counters[i] > 0) {
676 grp->bb_largest_free_order = i;
677 break;
678 }
679 }
680}
681
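Caching the largest free-extent order lets the allocator reject a block group in O(1) instead of rescanning bb_counters[]; conceptually the scan loop gains a check of this shape (hedged sketch; the actual criteria live in the group-selection logic later in this patch):

        /* can this group possibly hold a free extent of order `order'? */
        if (grp->bb_largest_free_order < order)
                continue;       /* skip the group without touching its buddy bitmap */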
661static noinline_for_stack 682static noinline_for_stack
662void ext4_mb_generate_buddy(struct super_block *sb, 683void ext4_mb_generate_buddy(struct super_block *sb,
663 void *buddy, void *bitmap, ext4_group_t group) 684 void *buddy, void *bitmap, ext4_group_t group)
@@ -700,6 +721,7 @@ void ext4_mb_generate_buddy(struct super_block *sb,
700 */ 721 */
701 grp->bb_free = free; 722 grp->bb_free = free;
702 } 723 }
724 mb_set_largest_free_order(sb, grp);
703 725
704 clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); 726 clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state));
705 727
@@ -725,6 +747,9 @@ void ext4_mb_generate_buddy(struct super_block *sb,
  * contain blocks_per_page (PAGE_CACHE_SIZE / blocksize) blocks.
  * So it can have information regarding groups_per_page which
  * is blocks_per_page/2
+ *
+ * Locking note: This routine takes the block group lock of all groups
+ * for this page; do not hold this lock when calling this routine!
  */

 static int ext4_mb_init_cache(struct page *page, char *incore)
@@ -865,6 +890,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
 			BUG_ON(incore == NULL);
 			mb_debug(1, "put buddy for group %u in page %lu/%x\n",
 				group, page->index, i * blocksize);
+			trace_ext4_mb_buddy_bitmap_load(sb, group);
 			grinfo = ext4_get_group_info(sb, group);
 			grinfo->bb_fragments = 0;
 			memset(grinfo->bb_counters, 0,
@@ -882,6 +908,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
 			BUG_ON(incore != NULL);
 			mb_debug(1, "put bitmap for group %u in page %lu/%x\n",
 				group, page->index, i * blocksize);
+			trace_ext4_mb_bitmap_load(sb, group);

 			/* see comments in ext4_mb_put_pa() */
 			ext4_lock_group(sb, group);
@@ -910,6 +937,11 @@ out:
 	return err;
 }

+/*
+ * Locking note: This routine calls ext4_mb_init_cache(), which takes the
+ * block group lock of all groups for this page; do not hold the BG lock when
+ * calling this routine!
+ */
 static noinline_for_stack
 int ext4_mb_init_group(struct super_block *sb, ext4_group_t group)
 {
@@ -1004,6 +1036,11 @@ err:
 	return ret;
 }

+/*
+ * Locking note: This routine calls ext4_mb_init_cache(), which takes the
+ * block group lock of all groups for this page; do not hold the BG lock when
+ * calling this routine!
+ */
 static noinline_for_stack int
 ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group,
 					struct ext4_buddy *e4b)
@@ -1150,7 +1187,7 @@ err:
 	return ret;
 }

-static void ext4_mb_release_desc(struct ext4_buddy *e4b)
+static void ext4_mb_unload_buddy(struct ext4_buddy *e4b)
 {
 	if (e4b->bd_bitmap_page)
 		page_cache_release(e4b->bd_bitmap_page);
@@ -1299,6 +1336,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
 			buddy = buddy2;
 		} while (1);
 	}
+	mb_set_largest_free_order(sb, e4b->bd_info);
 	mb_check_buddy(e4b);
 }

@@ -1427,6 +1465,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
 		e4b->bd_info->bb_counters[ord]++;
 		e4b->bd_info->bb_counters[ord]++;
 	}
+	mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info);

 	mb_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0);
 	mb_check_buddy(e4b);
@@ -1617,7 +1656,7 @@ int ext4_mb_try_best_found(struct ext4_allocation_context *ac,
 	}

 	ext4_unlock_group(ac->ac_sb, group);
-	ext4_mb_release_desc(e4b);
+	ext4_mb_unload_buddy(e4b);

 	return 0;
 }
@@ -1672,7 +1711,7 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
 		ext4_mb_use_best_found(ac, e4b);
 	}
 	ext4_unlock_group(ac->ac_sb, group);
-	ext4_mb_release_desc(e4b);
+	ext4_mb_unload_buddy(e4b);

 	return 0;
 }
@@ -1821,16 +1860,22 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
 	}
 }

+/* This is now called BEFORE we load the buddy bitmap. */
 static int ext4_mb_good_group(struct ext4_allocation_context *ac,
 				ext4_group_t group, int cr)
 {
 	unsigned free, fragments;
-	unsigned i, bits;
 	int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
 	struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);

 	BUG_ON(cr < 0 || cr >= 4);
-	BUG_ON(EXT4_MB_GRP_NEED_INIT(grp));
+
+	/* We only do this if the grp has never been initialized */
+	if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) {
+		int ret = ext4_mb_init_group(ac->ac_sb, group);
+		if (ret)
+			return 0;
+	}

 	free = grp->bb_free;
 	fragments = grp->bb_fragments;
@@ -1843,17 +1888,16 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
 	case 0:
 		BUG_ON(ac->ac_2order == 0);

+		if (grp->bb_largest_free_order < ac->ac_2order)
+			return 0;
+
 		/* Avoid using the first bg of a flexgroup for data files */
 		if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
 		    (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) &&
 		    ((group % flex_size) == 0))
 			return 0;

-		bits = ac->ac_sb->s_blocksize_bits + 1;
-		for (i = ac->ac_2order; i <= bits; i++)
-			if (grp->bb_counters[i] > 0)
-				return 1;
-		break;
+		return 1;
 	case 1:
 		if ((free / fragments) >= ac->ac_g_ex.fe_len)
 			return 1;
@@ -1964,7 +2008,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
 	sbi = EXT4_SB(sb);
 	ngroups = ext4_get_groups_count(sb);
 	/* non-extent files are limited to low blocks/groups */
-	if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL))
+	if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)))
 		ngroups = sbi->s_blockfile_groups;

 	BUG_ON(ac->ac_status == AC_STATUS_FOUND);
@@ -2024,15 +2068,11 @@ repeat:
 		group = ac->ac_g_ex.fe_group;

 		for (i = 0; i < ngroups; group++, i++) {
-			struct ext4_group_info *grp;
-			struct ext4_group_desc *desc;
-
 			if (group == ngroups)
 				group = 0;

-			/* quick check to skip empty groups */
-			grp = ext4_get_group_info(sb, group);
-			if (grp->bb_free == 0)
+			/* This now checks without needing the buddy page */
+			if (!ext4_mb_good_group(ac, group, cr))
 				continue;

 			err = ext4_mb_load_buddy(sb, group, &e4b);
@@ -2040,15 +2080,18 @@ repeat:
 				goto out;

 			ext4_lock_group(sb, group);
+
+			/*
+			 * We need to check again after locking the
+			 * block group
+			 */
 			if (!ext4_mb_good_group(ac, group, cr)) {
-				/* someone did allocation from this group */
 				ext4_unlock_group(sb, group);
-				ext4_mb_release_desc(&e4b);
+				ext4_mb_unload_buddy(&e4b);
 				continue;
 			}

 			ac->ac_groups_scanned++;
-			desc = ext4_get_group_desc(sb, group, NULL);
 			if (cr == 0)
 				ext4_mb_simple_scan_group(ac, &e4b);
 			else if (cr == 1 &&
@@ -2058,7 +2101,7 @@ repeat:
 				ext4_mb_complex_scan_group(ac, &e4b);

 			ext4_unlock_group(sb, group);
-			ext4_mb_release_desc(&e4b);
+			ext4_mb_unload_buddy(&e4b);

 			if (ac->ac_status != AC_STATUS_CONTINUE)
 				break;
@@ -2148,7 +2191,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
 	ext4_lock_group(sb, group);
 	memcpy(&sg, ext4_get_group_info(sb, group), i);
 	ext4_unlock_group(sb, group);
-	ext4_mb_release_desc(&e4b);
+	ext4_mb_unload_buddy(&e4b);

 	seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free,
 			sg.info.bb_fragments, sg.info.bb_first_free);
@@ -2255,6 +2298,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group,
 	INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list);
 	init_rwsem(&meta_group_info[i]->alloc_sem);
 	meta_group_info[i]->bb_free_root = RB_ROOT;
+	meta_group_info[i]->bb_largest_free_order = -1;  /* uninit */

 #ifdef DOUBLE_CHECK
 	{
@@ -2536,6 +2580,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
 			 entry->count, entry->group, entry);

 		if (test_opt(sb, DISCARD)) {
+			int ret;
 			ext4_fsblk_t discard_block;

 			discard_block = entry->start_blk +
@@ -2543,7 +2588,12 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
 			trace_ext4_discard_blocks(sb,
 					(unsigned long long)discard_block,
 					entry->count);
-			sb_issue_discard(sb, discard_block, entry->count);
+			ret = sb_issue_discard(sb, discard_block, entry->count);
+			if (ret == EOPNOTSUPP) {
+				ext4_warning(sb,
+					"discard not supported, disabling");
+				clear_opt(EXT4_SB(sb)->s_mount_opt, DISCARD);
+			}
 		}

 		err = ext4_mb_load_buddy(sb, entry->group, &e4b);
@@ -2568,7 +2618,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
 		}
 		ext4_unlock_group(sb, entry->group);
 		kmem_cache_free(ext4_free_ext_cachep, entry);
-		ext4_mb_release_desc(&e4b);
+		ext4_mb_unload_buddy(&e4b);
 	}

 	mb_debug(1, "freed %u blocks in %u structures\n", count, count2);
@@ -2641,7 +2691,7 @@ int __init init_ext4_mballoc(void)

 void exit_ext4_mballoc(void)
 {
 	/*
 	 * Wait for completion of call_rcu()'s on ext4_pspace_cachep
 	 * before destroying the slab cache.
 	 */
@@ -2981,7 +3031,7 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
 	if (sbi->s_mb_stats && ac->ac_g_ex.fe_len > 1) {
 		atomic_inc(&sbi->s_bal_reqs);
 		atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated);
-		if (ac->ac_o_ex.fe_len >= ac->ac_g_ex.fe_len)
+		if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len)
 			atomic_inc(&sbi->s_bal_success);
 		atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned);
 		if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
@@ -3123,7 +3173,7 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
 			continue;

 		/* non-extent files can't have physical blocks past 2^32 */
-		if (!(EXT4_I(ac->ac_inode)->i_flags & EXT4_EXTENTS_FL) &&
+		if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) &&
 		    pa->pa_pstart + pa->pa_len > EXT4_MAX_BLOCK_FILE_PHYS)
 			continue;

@@ -3280,7 +3330,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
 	spin_unlock(&pa->pa_lock);

 	grp_blk = pa->pa_pstart;
 	/*
 	 * If doing group-based preallocation, pa_pstart may be in the
 	 * next group when pa is used up
 	 */
@@ -3697,7 +3747,7 @@ out:
 	ext4_unlock_group(sb, group);
 	if (ac)
 		kmem_cache_free(ext4_ac_cachep, ac);
-	ext4_mb_release_desc(&e4b);
+	ext4_mb_unload_buddy(&e4b);
 	put_bh(bitmap_bh);
 	return free;
 }
@@ -3801,7 +3851,7 @@ repeat:
 		if (bitmap_bh == NULL) {
 			ext4_error(sb, "Error reading block bitmap for %u",
 				   group);
-			ext4_mb_release_desc(&e4b);
+			ext4_mb_unload_buddy(&e4b);
 			continue;
 		}

@@ -3810,7 +3860,7 @@ repeat:
 			ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac);
 		ext4_unlock_group(sb, group);

-		ext4_mb_release_desc(&e4b);
+		ext4_mb_unload_buddy(&e4b);
 		put_bh(bitmap_bh);

 		list_del(&pa->u.pa_tmp_list);
@@ -4074,7 +4124,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
 		ext4_mb_release_group_pa(&e4b, pa, ac);
 		ext4_unlock_group(sb, group);

-		ext4_mb_release_desc(&e4b);
+		ext4_mb_unload_buddy(&e4b);
 		list_del(&pa->u.pa_tmp_list);
 		call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback);
 	}
@@ -4484,12 +4534,12 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
 			if (!bh)
 				tbh = sb_find_get_block(inode->i_sb,
 							block + i);
 			ext4_forget(handle, flags & EXT4_FREE_BLOCKS_METADATA,
 				    inode, tbh, block + i);
 		}
 	}

 	/*
 	 * We need to make sure we don't reuse the freed block until
 	 * after the transaction is committed, which we can do by
 	 * treating the block as metadata, below.  We make an
@@ -4610,7 +4660,7 @@ do_more:
 		atomic_add(count, &sbi->s_flex_groups[flex_group].free_blocks);
 	}

-	ext4_mb_release_desc(&e4b);
+	ext4_mb_unload_buddy(&e4b);

 	freed += count;

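The recurring mballoc change above is the new per-group cache bb_largest_free_order, which lets ext4_mb_good_group() reject a group for a power-of-two request in O(1) instead of scanning bb_counters[] each time, and which in turn makes it safe to call the check before the buddy bitmap is loaded. A standalone model of the idea (simplified, not the kernel code; bb_counters[i] counts free extents of order i, i.e. 2^i blocks):

#define MAX_ORDER_SLOTS 16

struct group_info {
	int bb_counters[MAX_ORDER_SLOTS];	/* free extents per order */
	int bb_largest_free_order;		/* cached; -1 = none/uninit */
};

/* Recompute the cache whenever the counters change
 * (cf. mb_set_largest_free_order above). */
static void set_largest_free_order(struct group_info *grp, int max_order)
{
	int i;

	grp->bb_largest_free_order = -1;
	for (i = max_order; i >= 0; i--) {
		if (grp->bb_counters[i] > 0) {
			grp->bb_largest_free_order = i;
			break;
		}
	}
}

/* O(1) rejection test while scanning groups for a 2^order allocation. */
static int group_can_satisfy(const struct group_info *grp, int order)
{
	return grp->bb_largest_free_order >= order;
}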
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 34dcfc52ef44..6f3a27ec30bf 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -475,7 +475,7 @@ int ext4_ext_migrate(struct inode *inode)
 	 */
 	if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb,
 				       EXT4_FEATURE_INCOMPAT_EXTENTS) ||
-	    (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+	    (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
 		return -EINVAL;

 	if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0)
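This hunk is one of many in the merge that convert open-coded tests of EXT4_I(inode)->i_flags into ext4_test_inode_flag() and friends, indexed by bit number (EXT4_INODE_EXTENTS) rather than by mask (EXT4_EXTENTS_FL). The helpers themselves are outside this diff; presumably they are thin wrappers over the generic bitops, roughly along these lines (a sketch, not the actual ext4 header code):

/* Hedged sketch of bit-numbered inode flag helpers. */
static inline int ext4_test_inode_flag(struct inode *inode, int bit)
{
	return test_bit(bit, &EXT4_I(inode)->i_flags);
}

static inline void ext4_set_inode_flag(struct inode *inode, int bit)
{
	set_bit(bit, &EXT4_I(inode)->i_flags);
}

static inline void ext4_clear_inode_flag(struct inode *inode, int bit)
{
	clear_bit(bit, &EXT4_I(inode)->i_flags);
}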
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index d1fc662cc311..52abfa12762a 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -482,6 +482,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
 	int depth = ext_depth(orig_inode);
 	int ret;

+	start_ext.ee_block = end_ext.ee_block = 0;
 	o_start = o_end = oext = orig_path[depth].p_ext;
 	oext_alen = ext4_ext_get_actual_len(oext);
 	start_ext.ee_len = end_ext.ee_len = 0;
@@ -529,7 +530,7 @@ mext_leaf_block(handle_t *handle, struct inode *orig_inode,
 	 *   new_ext       |-------|
 	 */
 	if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) {
-		ext4_error(orig_inode->i_sb,
+		EXT4_ERROR_INODE(orig_inode,
 			"new_ext_end(%u) should be less than or equal to "
 			"oext->ee_block(%u) + oext_alen(%d) - 1",
 			new_ext_end, le32_to_cpu(oext->ee_block),
@@ -692,12 +693,12 @@ mext_replace_branches(handle_t *handle, struct inode *orig_inode,
 	while (1) {
 		/* The extent for donor must be found. */
 		if (!dext) {
-			ext4_error(donor_inode->i_sb,
+			EXT4_ERROR_INODE(donor_inode,
 				"The extent for donor must be found");
 			*err = -EIO;
 			goto out;
 		} else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) {
-			ext4_error(donor_inode->i_sb,
+			EXT4_ERROR_INODE(donor_inode,
 				"Donor offset(%u) and the first block of donor "
 				"extent(%u) should be equal",
 				donor_off,
@@ -959,6 +960,9 @@ mext_check_arguments(struct inode *orig_inode,
 		return -EINVAL;
 	}

+	if (IS_IMMUTABLE(donor_inode) || IS_APPEND(donor_inode))
+		return -EPERM;
+
 	/* Ext4 move extent does not support swapfile */
 	if (IS_SWAPFILE(orig_inode) || IS_SWAPFILE(donor_inode)) {
 		ext4_debug("ext4 move extent: The argument files should "
@@ -976,11 +980,11 @@ mext_check_arguments(struct inode *orig_inode,
 	}

 	/* Ext4 move extent supports only extent based file */
-	if (!(EXT4_I(orig_inode)->i_flags & EXT4_EXTENTS_FL)) {
+	if (!(ext4_test_inode_flag(orig_inode, EXT4_INODE_EXTENTS))) {
 		ext4_debug("ext4 move extent: orig file is not extents "
 			"based file [ino:orig %lu]\n", orig_inode->i_ino);
 		return -EOPNOTSUPP;
-	} else if (!(EXT4_I(donor_inode)->i_flags & EXT4_EXTENTS_FL)) {
+	} else if (!(ext4_test_inode_flag(donor_inode, EXT4_INODE_EXTENTS))) {
 		ext4_debug("ext4 move extent: donor file is not extents "
 			"based file [ino:donor %lu]\n", donor_inode->i_ino);
 		return -EOPNOTSUPP;
@@ -1354,7 +1358,7 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
 		if (ret1 < 0)
 			break;
 		if (*moved_len > len) {
-			ext4_error(orig_inode->i_sb,
+			EXT4_ERROR_INODE(orig_inode,
 				"We replaced blocks too much! "
 				"sum of replaced: %llu requested: %llu",
 				*moved_len, len);
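The other recurring conversion, here and in namei.c and xattr.c below, replaces ext4_error(sb, "inode %lu: ...", inode->i_ino, ...) with EXT4_ERROR_INODE(inode, ...), which folds the inode number into the report automatically. The macro's definition is not part of this diff; it is presumably a small forwarding wrapper of this shape (a sketch, assuming an ext4_error_inode() helper):

/* Sketch only: the real definition lives in ext4.h, outside this diff. */
#define EXT4_ERROR_INODE(inode, fmt, a...) \
	ext4_error_inode(__func__, (inode), (fmt), ## a)

extern void ext4_error_inode(const char *function, struct inode *inode,
			     const char *fmt, ...);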
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 0c070fabd108..a43e6617b351 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -187,7 +187,7 @@ unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize)
 		return blocksize;
 	return (len & 65532) | ((len & 3) << 16);
 }

 __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
 {
 	if ((len > blocksize) || (blocksize > (1 << 18)) || (len & 3))
@@ -197,7 +197,7 @@ __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
 	if (len == blocksize) {
 		if (blocksize == 65536)
 			return cpu_to_le16(EXT4_MAX_REC_LEN);
 		else
 			return cpu_to_le16(0);
 	}
 	return cpu_to_le16((len & 65532) | ((len >> 16) & 3));
@@ -349,7 +349,7 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir,
 		brelse(bh);
 	}
 	if (bcount)
 		printk(KERN_DEBUG "%snames %u, fullness %u (%u%%)\n",
 		       levels ? "" : " ", names, space/bcount,
 		       (space/bcount)*100/blocksize);
 	return (struct stats) { names, space, bcount};
@@ -653,10 +653,10 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
 	int ret, err;
 	__u32 hashval;

 	dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n",
 		       start_hash, start_minor_hash));
 	dir = dir_file->f_path.dentry->d_inode;
-	if (!(EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) {
+	if (!(ext4_test_inode_flag(dir, EXT4_INODE_INDEX))) {
 		hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
 		if (hinfo.hash_version <= DX_HASH_TEA)
 			hinfo.hash_version +=
@@ -801,7 +801,7 @@ static void ext4_update_dx_flag(struct inode *inode)
 {
 	if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb,
 				     EXT4_FEATURE_COMPAT_DIR_INDEX))
-		EXT4_I(inode)->i_flags &= ~EXT4_INDEX_FL;
+		ext4_clear_inode_flag(inode, EXT4_INODE_INDEX);
 }

 /*
@@ -943,8 +943,8 @@ restart:
 			wait_on_buffer(bh);
 			if (!buffer_uptodate(bh)) {
 				/* read error, skip block & hope for the best */
-				ext4_error(sb, "reading directory #%lu offset %lu",
-					   dir->i_ino, (unsigned long)block);
+				EXT4_ERROR_INODE(dir, "reading directory lblock %lu",
+						 (unsigned long) block);
 				brelse(bh);
 				goto next;
 			}
@@ -1066,15 +1066,15 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
 		__u32 ino = le32_to_cpu(de->inode);
 		brelse(bh);
 		if (!ext4_valid_inum(dir->i_sb, ino)) {
-			ext4_error(dir->i_sb, "bad inode number: %u", ino);
+			EXT4_ERROR_INODE(dir, "bad inode number: %u", ino);
 			return ERR_PTR(-EIO);
 		}
 		inode = ext4_iget(dir->i_sb, ino);
 		if (unlikely(IS_ERR(inode))) {
 			if (PTR_ERR(inode) == -ESTALE) {
-				ext4_error(dir->i_sb,
+				EXT4_ERROR_INODE(dir,
 					"deleted inode referenced: %u",
 					ino);
 				return ERR_PTR(-EIO);
 			} else {
 				return ERR_CAST(inode);
@@ -1104,8 +1104,8 @@ struct dentry *ext4_get_parent(struct dentry *child)
 	brelse(bh);

 	if (!ext4_valid_inum(child->d_inode->i_sb, ino)) {
-		ext4_error(child->d_inode->i_sb,
-			   "bad inode number: %u", ino);
+		EXT4_ERROR_INODE(child->d_inode,
+				 "bad parent inode number: %u", ino);
 		return ERR_PTR(-EIO);
 	}

@@ -1141,7 +1141,7 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count,
 	unsigned rec_len = 0;

 	while (count--) {
 		struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *)
 						(from + (map->offs<<2));
 		rec_len = EXT4_DIR_REC_LEN(de->name_len);
 		memcpy (to, de, rec_len);
@@ -1404,9 +1404,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
 	de = (struct ext4_dir_entry_2 *)((char *)fde +
 		ext4_rec_len_from_disk(fde->rec_len, blocksize));
 	if ((char *) de >= (((char *) root) + blocksize)) {
-		ext4_error(dir->i_sb,
-			   "invalid rec_len for '..' in inode %lu",
-			   dir->i_ino);
+		EXT4_ERROR_INODE(dir, "invalid rec_len for '..'");
 		brelse(bh);
 		return -EIO;
 	}
@@ -1418,7 +1416,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
 		brelse(bh);
 		return retval;
 	}
-	EXT4_I(dir)->i_flags |= EXT4_INDEX_FL;
+	ext4_set_inode_flag(dir, EXT4_INODE_INDEX);
 	data1 = bh2->b_data;

 	memcpy (data1, de, len);
@@ -1491,7 +1489,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
 		retval = ext4_dx_add_entry(handle, dentry, inode);
 		if (!retval || (retval != ERR_BAD_DX_DIR))
 			return retval;
-		EXT4_I(dir)->i_flags &= ~EXT4_INDEX_FL;
+		ext4_clear_inode_flag(dir, EXT4_INODE_INDEX);
 		dx_fallback++;
 		ext4_mark_inode_dirty(handle, dir);
 	}
@@ -1519,6 +1517,8 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
 	de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize);
 	retval = add_dirent_to_buf(handle, dentry, inode, de, bh);
 	brelse(bh);
+	if (retval == 0)
+		ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY);
 	return retval;
 }

@@ -1915,9 +1915,8 @@ static int empty_dir(struct inode *inode)
 	if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) ||
 	    !(bh = ext4_bread(NULL, inode, 0, 0, &err))) {
 		if (err)
-			ext4_error(inode->i_sb,
-				   "error %d reading directory #%lu offset 0",
-				   err, inode->i_ino);
+			EXT4_ERROR_INODE(inode,
+					 "error %d reading directory lblock 0", err);
 		else
 			ext4_warning(inode->i_sb,
 				     "bad directory (dir #%lu) - no data block",
@@ -1941,17 +1940,17 @@ static int empty_dir(struct inode *inode)
 	de = ext4_next_entry(de1, sb->s_blocksize);
 	while (offset < inode->i_size) {
 		if (!bh ||
 		    (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
+			unsigned int lblock;
 			err = 0;
 			brelse(bh);
-			bh = ext4_bread(NULL, inode,
-					offset >> EXT4_BLOCK_SIZE_BITS(sb), 0, &err);
+			lblock = offset >> EXT4_BLOCK_SIZE_BITS(sb);
+			bh = ext4_bread(NULL, inode, lblock, 0, &err);
 			if (!bh) {
 				if (err)
-					ext4_error(sb,
-						   "error %d reading directory"
-						   " #%lu offset %u",
-						   err, inode->i_ino, offset);
+					EXT4_ERROR_INODE(inode,
+						"error %d reading directory "
+						"lblock %u", err, lblock);
 				offset += sb->s_blocksize;
 				continue;
 			}
@@ -2297,7 +2296,7 @@ retry:
 		}
 	} else {
 		/* clear the extent format for fast symlink */
-		EXT4_I(inode)->i_flags &= ~EXT4_EXTENTS_FL;
+		ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS);
 		inode->i_op = &ext4_fast_symlink_inode_operations;
 		memcpy((char *)&EXT4_I(inode)->i_data, symname, l);
 		inode->i_size = l-1;
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 5692c48754a0..6df797eb9aeb 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -911,7 +911,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
 	percpu_counter_add(&sbi->s_freeinodes_counter,
 			   EXT4_INODES_PER_GROUP(sb));

-	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
+	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG) &&
+	    sbi->s_log_groups_per_flex) {
 		ext4_group_t flex_group;
 		flex_group = ext4_flex_group(sbi, input->group);
 		atomic_add(input->free_blocks_count,
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index e14d22c170d5..4e8983a9811b 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -241,6 +241,7 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
 	if (sb->s_flags & MS_RDONLY)
 		return ERR_PTR(-EROFS);

+	vfs_check_frozen(sb, SB_FREEZE_WRITE);
 	/* Special case here: if the journal has aborted behind our
 	 * backs (eg. EIO in the commit thread), then we still need to
 	 * take the FS itself readonly cleanly. */
@@ -645,6 +646,8 @@ static void ext4_put_super(struct super_block *sb)
 	struct ext4_super_block *es = sbi->s_es;
 	int i, err;

+	dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
+
 	flush_workqueue(sbi->dio_unwritten_wq);
 	destroy_workqueue(sbi->dio_unwritten_wq);

@@ -941,6 +944,8 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
 	seq_puts(seq, test_opt(sb, BARRIER) ? "1" : "0");
 	if (test_opt(sb, JOURNAL_ASYNC_COMMIT))
 		seq_puts(seq, ",journal_async_commit");
+	else if (test_opt(sb, JOURNAL_CHECKSUM))
+		seq_puts(seq, ",journal_checksum");
 	if (test_opt(sb, NOBH))
 		seq_puts(seq, ",nobh");
 	if (test_opt(sb, I_VERSION))
@@ -1059,7 +1064,7 @@ static int ext4_release_dquot(struct dquot *dquot);
 static int ext4_mark_dquot_dirty(struct dquot *dquot);
 static int ext4_write_info(struct super_block *sb, int type);
 static int ext4_quota_on(struct super_block *sb, int type, int format_id,
-			 char *path, int remount);
+			 char *path);
 static int ext4_quota_on_mount(struct super_block *sb, int type);
 static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data,
 			       size_t len, loff_t off);
@@ -1081,12 +1086,12 @@ static const struct dquot_operations ext4_quota_operations = {

 static const struct quotactl_ops ext4_qctl_operations = {
 	.quota_on	= ext4_quota_on,
-	.quota_off	= vfs_quota_off,
-	.quota_sync	= vfs_quota_sync,
-	.get_info	= vfs_get_dqinfo,
-	.set_info	= vfs_set_dqinfo,
-	.get_dqblk	= vfs_get_dqblk,
-	.set_dqblk	= vfs_set_dqblk
+	.quota_off	= dquot_quota_off,
+	.quota_sync	= dquot_quota_sync,
+	.get_info	= dquot_get_dqinfo,
+	.set_info	= dquot_set_dqinfo,
+	.get_dqblk	= dquot_get_dqblk,
+	.set_dqblk	= dquot_set_dqblk
 };
 #endif

@@ -2051,7 +2056,7 @@ static void ext4_orphan_cleanup(struct super_block *sb,
 	/* Turn quotas off */
 	for (i = 0; i < MAXQUOTAS; i++) {
 		if (sb_dqopt(sb)->files[i])
-			vfs_quota_off(sb, i, 0);
+			dquot_quota_off(sb, i);
 	}
 #endif
 	sb->s_flags = s_flags; /* Restore MS_RDONLY status */
@@ -2213,7 +2218,7 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
 struct ext4_attr {
 	struct attribute attr;
 	ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *);
 	ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *,
 			 const char *, size_t);
 	int offset;
 };
@@ -2430,6 +2435,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	__releases(kernel_lock)
 	__acquires(kernel_lock)
 {
+	char *orig_data = kstrdup(data, GFP_KERNEL);
 	struct buffer_head *bh;
 	struct ext4_super_block *es = NULL;
 	struct ext4_sb_info *sbi;
@@ -2793,24 +2799,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	get_random_bytes(&sbi->s_next_generation, sizeof(u32));
 	spin_lock_init(&sbi->s_next_gen_lock);

-	err = percpu_counter_init(&sbi->s_freeblocks_counter,
-			ext4_count_free_blocks(sb));
-	if (!err) {
-		err = percpu_counter_init(&sbi->s_freeinodes_counter,
-				ext4_count_free_inodes(sb));
-	}
-	if (!err) {
-		err = percpu_counter_init(&sbi->s_dirs_counter,
-				ext4_count_dirs(sb));
-	}
-	if (!err) {
-		err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
-	}
-	if (err) {
-		ext4_msg(sb, KERN_ERR, "insufficient memory");
-		goto failed_mount3;
-	}
-
 	sbi->s_stripe = ext4_get_stripe_size(sbi);
 	sbi->s_max_writeback_mb_bump = 128;

@@ -2910,6 +2898,20 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);

 no_journal:
+	err = percpu_counter_init(&sbi->s_freeblocks_counter,
+				  ext4_count_free_blocks(sb));
+	if (!err)
+		err = percpu_counter_init(&sbi->s_freeinodes_counter,
+					  ext4_count_free_inodes(sb));
+	if (!err)
+		err = percpu_counter_init(&sbi->s_dirs_counter,
+					  ext4_count_dirs(sb));
+	if (!err)
+		err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
+	if (err) {
+		ext4_msg(sb, KERN_ERR, "insufficient memory");
+		goto failed_mount_wq;
+	}
 	if (test_opt(sb, NOBH)) {
 		if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) {
 			ext4_msg(sb, KERN_WARNING, "Ignoring nobh option - "
@@ -3001,7 +3003,7 @@ no_journal:
 	err = ext4_setup_system_zone(sb);
 	if (err) {
 		ext4_msg(sb, KERN_ERR, "failed to initialize system "
-			 "zone (%d)\n", err);
+			 "zone (%d)", err);
 		goto failed_mount4;
 	}

@@ -3040,9 +3042,11 @@ no_journal:
 	} else
 		descr = "out journal";

-	ext4_msg(sb, KERN_INFO, "mounted filesystem with%s", descr);
+	ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. "
+		 "Opts: %s", descr, orig_data);

 	lock_kernel();
+	kfree(orig_data);
 	return 0;

 cantfind_ext4:
@@ -3059,6 +3063,10 @@ failed_mount_wq:
 		jbd2_journal_destroy(sbi->s_journal);
 		sbi->s_journal = NULL;
 	}
+	percpu_counter_destroy(&sbi->s_freeblocks_counter);
+	percpu_counter_destroy(&sbi->s_freeinodes_counter);
+	percpu_counter_destroy(&sbi->s_dirs_counter);
+	percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
 failed_mount3:
 	if (sbi->s_flex_groups) {
 		if (is_vmalloc_addr(sbi->s_flex_groups))
@@ -3066,10 +3074,6 @@ failed_mount3:
 		else
 			kfree(sbi->s_flex_groups);
 	}
-	percpu_counter_destroy(&sbi->s_freeblocks_counter);
-	percpu_counter_destroy(&sbi->s_freeinodes_counter);
-	percpu_counter_destroy(&sbi->s_dirs_counter);
-	percpu_counter_destroy(&sbi->s_dirtyblocks_counter);
 failed_mount2:
 	for (i = 0; i < db_count; i++)
 		brelse(sbi->s_group_desc[i]);
@@ -3089,6 +3093,7 @@ out_fail:
 	kfree(sbi->s_blockgroup_lock);
 	kfree(sbi);
 	lock_kernel();
+	kfree(orig_data);
 	return ret;
 }

@@ -3380,7 +3385,7 @@ static int ext4_commit_super(struct super_block *sb, int sync)
 	if (!(sb->s_flags & MS_RDONLY))
 		es->s_wtime = cpu_to_le32(get_seconds());
 	es->s_kbytes_written =
 		cpu_to_le64(EXT4_SB(sb)->s_kbytes_written +
 			    ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
 			      EXT4_SB(sb)->s_sectors_written_start) >> 1));
 	ext4_free_blocks_count_set(es, percpu_counter_sum_positive(
@@ -3485,8 +3490,10 @@ int ext4_force_commit(struct super_block *sb)
 		return 0;

 	journal = EXT4_SB(sb)->s_journal;
-	if (journal)
+	if (journal) {
+		vfs_check_frozen(sb, SB_FREEZE_WRITE);
 		ret = ext4_journal_force_commit(journal);
+	}

 	return ret;
 }
@@ -3535,18 +3542,16 @@ static int ext4_freeze(struct super_block *sb)
 	 * the journal.
 	 */
 	error = jbd2_journal_flush(journal);
-	if (error < 0) {
- out:
-		jbd2_journal_unlock_updates(journal);
-		return error;
-	}
+	if (error < 0)
+		goto out;

 	/* Journal blocked and flushed, clear needs_recovery flag. */
 	EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
 	error = ext4_commit_super(sb, 1);
-	if (error)
-		goto out;
-	return 0;
+out:
+	/* we rely on s_frozen to stop further updates */
+	jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+	return error;
 }

 /*
@@ -3563,7 +3568,6 @@ static int ext4_unfreeze(struct super_block *sb)
 	EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
 	ext4_commit_super(sb, 1);
 	unlock_super(sb);
-	jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
 	return 0;
 }

@@ -3574,12 +3578,14 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 	ext4_fsblk_t n_blocks_count = 0;
 	unsigned long old_sb_flags;
 	struct ext4_mount_options old_opts;
+	int enable_quota = 0;
 	ext4_group_t g;
 	unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
 	int err;
 #ifdef CONFIG_QUOTA
 	int i;
 #endif
+	char *orig_data = kstrdup(data, GFP_KERNEL);

 	lock_kernel();

@@ -3630,6 +3636,10 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 	}

 	if (*flags & MS_RDONLY) {
+		err = dquot_suspend(sb, -1);
+		if (err < 0)
+			goto restore_opts;
+
 		/*
 		 * First of all, the unconditional stuff we have to do
 		 * to disable replay of the journal when we next remount
@@ -3698,6 +3708,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 				goto restore_opts;
 			if (!ext4_setup_super(sb, es, 0))
 				sb->s_flags &= ~MS_RDONLY;
+			enable_quota = 1;
 		}
 	}
 	ext4_setup_system_zone(sb);
@@ -3713,6 +3724,11 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 #endif
 	unlock_super(sb);
 	unlock_kernel();
+	if (enable_quota)
+		dquot_resume(sb, -1);
+
+	ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s", orig_data);
+	kfree(orig_data);
 	return 0;

 restore_opts:
@@ -3734,6 +3750,7 @@ restore_opts:
 #endif
 	unlock_super(sb);
 	unlock_kernel();
+	kfree(orig_data);
 	return err;
 }

@@ -3906,24 +3923,21 @@ static int ext4_write_info(struct super_block *sb, int type)
  */
 static int ext4_quota_on_mount(struct super_block *sb, int type)
 {
-	return vfs_quota_on_mount(sb, EXT4_SB(sb)->s_qf_names[type],
+	return dquot_quota_on_mount(sb, EXT4_SB(sb)->s_qf_names[type],
 				  EXT4_SB(sb)->s_jquota_fmt, type);
 }

 /*
  * Standard function to be called on quota_on
  */
 static int ext4_quota_on(struct super_block *sb, int type, int format_id,
-			 char *name, int remount)
+			 char *name)
 {
 	int err;
 	struct path path;

 	if (!test_opt(sb, QUOTA))
 		return -EINVAL;
-	/* When remounting, no checks are needed and in fact, name is NULL */
-	if (remount)
-		return vfs_quota_on(sb, type, format_id, name, remount);

 	err = kern_path(name, LOOKUP_FOLLOW, &path);
 	if (err)
@@ -3962,7 +3976,7 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
 		}
 	}

-	err = vfs_quota_on_path(sb, type, format_id, &path);
+	err = dquot_quota_on_path(sb, type, format_id, &path);
 	path_put(&path);
 	return err;
 }
@@ -4141,6 +4155,7 @@ static int __init init_ext4_fs(void)
 {
 	int err;

+	ext4_check_flag_values();
 	err = init_ext4_system_zone();
 	if (err)
 		return err;
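Taken together, the super.c freeze changes make ext4_freeze() always drop the journal-updates lock (the old code leaked it on the ext4_commit_super() error path and otherwise held it for the whole frozen window), and instead fence writers with sb->s_frozen via the vfs_check_frozen() calls added to ext4_journal_start_sb() and ext4_force_commit(). Roughly, the write side now cooperates like this (a sketch using the 2.6.34-era APIs, not code from the patch):

/* Sketch: a write path blocking on a frozen filesystem before starting
 * a journal handle, so ext4_freeze() need not keep journal updates locked. */
handle_t *journal_start_sketch(struct super_block *sb, int nblocks)
{
	/* Sleeps while sb->s_frozen >= SB_FREEZE_WRITE; new handles can
	 * only start once the filesystem has been thawed. */
	vfs_check_frozen(sb, SB_FREEZE_WRITE);
	return jbd2_journal_start(EXT4_SB(sb)->s_journal, nblocks);
}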
diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c
index 00740cb32be3..ed9354aff279 100644
--- a/fs/ext4/symlink.c
+++ b/fs/ext4/symlink.c
@@ -34,6 +34,7 @@ const struct inode_operations ext4_symlink_inode_operations = {
 	.readlink	= generic_readlink,
 	.follow_link	= page_follow_link_light,
 	.put_link	= page_put_link,
+	.setattr	= ext4_setattr,
 #ifdef CONFIG_EXT4_FS_XATTR
 	.setxattr	= generic_setxattr,
 	.getxattr	= generic_getxattr,
@@ -45,6 +46,7 @@ const struct inode_operations ext4_symlink_inode_operations = {
 const struct inode_operations ext4_fast_symlink_inode_operations = {
 	.readlink	= generic_readlink,
 	.follow_link	= ext4_follow_link,
+	.setattr	= ext4_setattr,
 #ifdef CONFIG_EXT4_FS_XATTR
 	.setxattr	= generic_setxattr,
 	.getxattr	= generic_getxattr,
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 2de0e9515089..04338009793a 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -228,9 +228,8 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name,
 		atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
 	if (ext4_xattr_check_block(bh)) {
 bad_block:
-		ext4_error(inode->i_sb,
-			   "inode %lu: bad block %llu", inode->i_ino,
-			   EXT4_I(inode)->i_file_acl);
+		EXT4_ERROR_INODE(inode, "bad block %llu",
+				 EXT4_I(inode)->i_file_acl);
 		error = -EIO;
 		goto cleanup;
 	}
@@ -372,9 +371,8 @@ ext4_xattr_block_list(struct dentry *dentry, char *buffer, size_t buffer_size)
 	ea_bdebug(bh, "b_count=%d, refcount=%d",
 		atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount));
 	if (ext4_xattr_check_block(bh)) {
-		ext4_error(inode->i_sb,
-			   "inode %lu: bad block %llu", inode->i_ino,
-			   EXT4_I(inode)->i_file_acl);
+		EXT4_ERROR_INODE(inode, "bad block %llu",
+				 EXT4_I(inode)->i_file_acl);
 		error = -EIO;
 		goto cleanup;
 	}
@@ -666,8 +664,8 @@ ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i,
 			atomic_read(&(bs->bh->b_count)),
 			le32_to_cpu(BHDR(bs->bh)->h_refcount));
 		if (ext4_xattr_check_block(bs->bh)) {
-			ext4_error(sb, "inode %lu: bad block %llu",
-				   inode->i_ino, EXT4_I(inode)->i_file_acl);
+			EXT4_ERROR_INODE(inode, "bad block %llu",
+					 EXT4_I(inode)->i_file_acl);
 			error = -EIO;
 			goto cleanup;
 		}
@@ -820,7 +818,7 @@ inserted:
 				EXT4_I(inode)->i_block_group);

 			/* non-extent files can't have physical blocks past 2^32 */
-			if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+			if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
 				goal = goal & EXT4_MAX_BLOCK_FILE_PHYS;

 			block = ext4_new_meta_blocks(handle, inode,
@@ -828,7 +826,7 @@ inserted:
 			if (error)
 				goto cleanup;

-			if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL))
+			if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
 				BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS);

 			ea_idebug(inode, "creating block %d", block);
@@ -880,8 +878,8 @@ cleanup_dquot:
 	goto cleanup;

 bad_block:
-	ext4_error(inode->i_sb, "inode %lu: bad block %llu",
-		   inode->i_ino, EXT4_I(inode)->i_file_acl);
+	EXT4_ERROR_INODE(inode, "bad block %llu",
+			 EXT4_I(inode)->i_file_acl);
 	goto cleanup;

 #undef header
@@ -1194,8 +1192,8 @@ retry:
 		if (!bh)
 			goto cleanup;
 		if (ext4_xattr_check_block(bh)) {
-			ext4_error(inode->i_sb, "inode %lu: bad block %llu",
-				   inode->i_ino, EXT4_I(inode)->i_file_acl);
+			EXT4_ERROR_INODE(inode, "bad block %llu",
+					 EXT4_I(inode)->i_file_acl);
 			error = -EIO;
 			goto cleanup;
 		}
@@ -1372,14 +1370,14 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode)
 		goto cleanup;
 	bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
 	if (!bh) {
-		ext4_error(inode->i_sb, "inode %lu: block %llu read error",
-			   inode->i_ino, EXT4_I(inode)->i_file_acl);
+		EXT4_ERROR_INODE(inode, "block %llu read error",
+				 EXT4_I(inode)->i_file_acl);
 		goto cleanup;
 	}
 	if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) ||
 	    BHDR(bh)->h_blocks != cpu_to_le32(1)) {
-		ext4_error(inode->i_sb, "inode %lu: bad block %llu",
-			   inode->i_ino, EXT4_I(inode)->i_file_acl);
+		EXT4_ERROR_INODE(inode, "bad block %llu",
+				 EXT4_I(inode)->i_file_acl);
 		goto cleanup;
 	}
 	ext4_xattr_release_block(handle, inode, bh);
@@ -1504,9 +1502,8 @@ again:
 		}
 		bh = sb_bread(inode->i_sb, ce->e_block);
 		if (!bh) {
-			ext4_error(inode->i_sb,
-				   "inode %lu: block %lu read error",
-				   inode->i_ino, (unsigned long) ce->e_block);
+			EXT4_ERROR_INODE(inode, "block %lu read error",
+					 (unsigned long) ce->e_block);
 		} else if (le32_to_cpu(BHDR(bh)->h_refcount) >=
 				EXT4_XATTR_REFCOUNT_MAX) {
 			ea_idebug(inode, "block %lu refcount %d>=%d",
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index 53dba57b49a1..27ac25725954 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -306,11 +306,11 @@ extern long fat_generic_ioctl(struct file *filp, unsigned int cmd,
 extern const struct file_operations fat_file_operations;
 extern const struct inode_operations fat_file_inode_operations;
 extern int fat_setattr(struct dentry * dentry, struct iattr * attr);
-extern void fat_truncate(struct inode *inode);
+extern int fat_setsize(struct inode *inode, loff_t offset);
+extern void fat_truncate_blocks(struct inode *inode, loff_t offset);
 extern int fat_getattr(struct vfsmount *mnt, struct dentry *dentry,
 		       struct kstat *stat);
-extern int fat_file_fsync(struct file *file, struct dentry *dentry,
-			  int datasync);
+extern int fat_file_fsync(struct file *file, int datasync);

 /* fat/inode.c */
 extern void fat_attach(struct inode *inode, loff_t i_pos);
diff --git a/fs/fat/file.c b/fs/fat/file.c
index a14c2f6a489e..990dfae022e5 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -149,12 +149,12 @@ static int fat_file_release(struct inode *inode, struct file *filp)
149 return 0; 149 return 0;
150} 150}
151 151
152int fat_file_fsync(struct file *filp, struct dentry *dentry, int datasync) 152int fat_file_fsync(struct file *filp, int datasync)
153{ 153{
154 struct inode *inode = dentry->d_inode; 154 struct inode *inode = filp->f_mapping->host;
155 int res, err; 155 int res, err;
156 156
157 res = simple_fsync(filp, dentry, datasync); 157 res = generic_file_fsync(filp, datasync);
158 err = sync_mapping_buffers(MSDOS_SB(inode->i_sb)->fat_inode->i_mapping); 158 err = sync_mapping_buffers(MSDOS_SB(inode->i_sb)->fat_inode->i_mapping);
159 159
160 return res ? res : err; 160 return res ? res : err;
@@ -283,7 +283,7 @@ static int fat_free(struct inode *inode, int skip)
283 return fat_free_clusters(inode, free_start); 283 return fat_free_clusters(inode, free_start);
284} 284}
285 285
286void fat_truncate(struct inode *inode) 286void fat_truncate_blocks(struct inode *inode, loff_t offset)
287{ 287{
288 struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); 288 struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
289 const unsigned int cluster_size = sbi->cluster_size; 289 const unsigned int cluster_size = sbi->cluster_size;
@@ -293,10 +293,10 @@ void fat_truncate(struct inode *inode)
 293 * This protects against truncating a file bigger than it was, then 293 * This protects against truncating a file bigger than it was, then
294 * trying to write into the hole. 294 * trying to write into the hole.
295 */ 295 */
296 if (MSDOS_I(inode)->mmu_private > inode->i_size) 296 if (MSDOS_I(inode)->mmu_private > offset)
297 MSDOS_I(inode)->mmu_private = inode->i_size; 297 MSDOS_I(inode)->mmu_private = offset;
298 298
299 nr_clusters = (inode->i_size + (cluster_size - 1)) >> sbi->cluster_bits; 299 nr_clusters = (offset + (cluster_size - 1)) >> sbi->cluster_bits;
300 300
301 fat_free(inode, nr_clusters); 301 fat_free(inode, nr_clusters);
302 fat_flush_inodes(inode->i_sb, inode, NULL); 302 fat_flush_inodes(inode->i_sb, inode, NULL);
@@ -364,6 +364,18 @@ static int fat_allow_set_time(struct msdos_sb_info *sbi, struct inode *inode)
364 return 0; 364 return 0;
365} 365}
366 366
367int fat_setsize(struct inode *inode, loff_t offset)
368{
369 int error;
370
371 error = simple_setsize(inode, offset);
372 if (error)
373 return error;
374 fat_truncate_blocks(inode, offset);
375
376 return error;
377}
378
367#define TIMES_SET_FLAGS (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET) 379#define TIMES_SET_FLAGS (ATTR_MTIME_SET | ATTR_ATIME_SET | ATTR_TIMES_SET)
368/* valid file mode bits */ 380/* valid file mode bits */
369#define FAT_VALID_MODE (S_IFREG | S_IFDIR | S_IRWXUGO) 381#define FAT_VALID_MODE (S_IFREG | S_IFDIR | S_IRWXUGO)
@@ -378,7 +390,8 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
378 /* 390 /*
379 * Expand the file. Since inode_setattr() updates ->i_size 391 * Expand the file. Since inode_setattr() updates ->i_size
380 * before calling the ->truncate(), but FAT needs to fill the 392 * before calling the ->truncate(), but FAT needs to fill the
381 * hole before it. 393 * hole before it. XXX: this is no longer true with new truncate
394 * sequence.
382 */ 395 */
383 if (attr->ia_valid & ATTR_SIZE) { 396 if (attr->ia_valid & ATTR_SIZE) {
384 if (attr->ia_size > inode->i_size) { 397 if (attr->ia_size > inode->i_size) {
@@ -427,15 +440,20 @@ int fat_setattr(struct dentry *dentry, struct iattr *attr)
427 attr->ia_valid &= ~ATTR_MODE; 440 attr->ia_valid &= ~ATTR_MODE;
428 } 441 }
429 442
430 if (attr->ia_valid) 443 if (attr->ia_valid & ATTR_SIZE) {
431 error = inode_setattr(inode, attr); 444 error = fat_setsize(inode, attr->ia_size);
445 if (error)
446 goto out;
447 }
448
449 generic_setattr(inode, attr);
450 mark_inode_dirty(inode);
432out: 451out:
433 return error; 452 return error;
434} 453}
435EXPORT_SYMBOL_GPL(fat_setattr); 454EXPORT_SYMBOL_GPL(fat_setattr);
436 455
437const struct inode_operations fat_file_inode_operations = { 456const struct inode_operations fat_file_inode_operations = {
438 .truncate = fat_truncate,
439 .setattr = fat_setattr, 457 .setattr = fat_setattr,
440 .getattr = fat_getattr, 458 .getattr = fat_getattr,
441}; 459};
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index ed33904926ee..7bf45aee56d7 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -142,14 +142,29 @@ static int fat_readpages(struct file *file, struct address_space *mapping,
142 return mpage_readpages(mapping, pages, nr_pages, fat_get_block); 142 return mpage_readpages(mapping, pages, nr_pages, fat_get_block);
143} 143}
144 144
145static void fat_write_failed(struct address_space *mapping, loff_t to)
146{
147 struct inode *inode = mapping->host;
148
149 if (to > inode->i_size) {
150 truncate_pagecache(inode, to, inode->i_size);
151 fat_truncate_blocks(inode, inode->i_size);
152 }
153}
154
145static int fat_write_begin(struct file *file, struct address_space *mapping, 155static int fat_write_begin(struct file *file, struct address_space *mapping,
146 loff_t pos, unsigned len, unsigned flags, 156 loff_t pos, unsigned len, unsigned flags,
147 struct page **pagep, void **fsdata) 157 struct page **pagep, void **fsdata)
148{ 158{
159 int err;
160
149 *pagep = NULL; 161 *pagep = NULL;
150 return cont_write_begin(file, mapping, pos, len, flags, pagep, fsdata, 162 err = cont_write_begin_newtrunc(file, mapping, pos, len, flags,
151 fat_get_block, 163 pagep, fsdata, fat_get_block,
152 &MSDOS_I(mapping->host)->mmu_private); 164 &MSDOS_I(mapping->host)->mmu_private);
165 if (err < 0)
166 fat_write_failed(mapping, pos + len);
167 return err;
153} 168}
154 169
155static int fat_write_end(struct file *file, struct address_space *mapping, 170static int fat_write_end(struct file *file, struct address_space *mapping,
@@ -159,6 +174,8 @@ static int fat_write_end(struct file *file, struct address_space *mapping,
159 struct inode *inode = mapping->host; 174 struct inode *inode = mapping->host;
160 int err; 175 int err;
161 err = generic_write_end(file, mapping, pos, len, copied, pagep, fsdata); 176 err = generic_write_end(file, mapping, pos, len, copied, pagep, fsdata);
177 if (err < len)
178 fat_write_failed(mapping, pos + len);
162 if (!(err < 0) && !(MSDOS_I(inode)->i_attrs & ATTR_ARCH)) { 179 if (!(err < 0) && !(MSDOS_I(inode)->i_attrs & ATTR_ARCH)) {
163 inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; 180 inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
164 MSDOS_I(inode)->i_attrs |= ATTR_ARCH; 181 MSDOS_I(inode)->i_attrs |= ATTR_ARCH;
@@ -172,7 +189,9 @@ static ssize_t fat_direct_IO(int rw, struct kiocb *iocb,
172 loff_t offset, unsigned long nr_segs) 189 loff_t offset, unsigned long nr_segs)
173{ 190{
174 struct file *file = iocb->ki_filp; 191 struct file *file = iocb->ki_filp;
175 struct inode *inode = file->f_mapping->host; 192 struct address_space *mapping = file->f_mapping;
193 struct inode *inode = mapping->host;
194 ssize_t ret;
176 195
177 if (rw == WRITE) { 196 if (rw == WRITE) {
178 /* 197 /*
@@ -193,8 +212,12 @@ static ssize_t fat_direct_IO(int rw, struct kiocb *iocb,
 193 * FAT needs to use DIO_LOCKING to avoid the race 212 * FAT needs to use DIO_LOCKING to avoid the race
194 * condition of fat_get_block() and ->truncate(). 213 * condition of fat_get_block() and ->truncate().
195 */ 214 */
196 return blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov, 215 ret = blockdev_direct_IO_newtrunc(rw, iocb, inode, inode->i_sb->s_bdev,
197 offset, nr_segs, fat_get_block, NULL); 216 iov, offset, nr_segs, fat_get_block, NULL);
217 if (ret < 0 && (rw & WRITE))
218 fat_write_failed(mapping, offset + iov_length(iov, nr_segs));
219
220 return ret;
198} 221}
199 222
200static sector_t _fat_bmap(struct address_space *mapping, sector_t block) 223static sector_t _fat_bmap(struct address_space *mapping, sector_t block)
@@ -429,7 +452,7 @@ static void fat_delete_inode(struct inode *inode)
429{ 452{
430 truncate_inode_pages(&inode->i_data, 0); 453 truncate_inode_pages(&inode->i_data, 0);
431 inode->i_size = 0; 454 inode->i_size = 0;
432 fat_truncate(inode); 455 fat_truncate_blocks(inode, 0);
433 clear_inode(inode); 456 clear_inode(inode);
434} 457}
435 458
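Note: taken together, the FAT hunks above implement the new truncate sequence, where simple_setsize() first updates i_size and truncates the page cache, and only then are on-disk clusters released via fat_truncate_blocks(); failed writes are unwound through fat_write_failed(). A minimal sketch of the same ordering in a hypothetical filesystem's ->setattr() (the demo_* names are illustrative, not part of the patch):

static int demo_setattr(struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = dentry->d_inode;
	int error;

	error = inode_change_ok(inode, attr);
	if (error)
		return error;

	if (attr->ia_valid & ATTR_SIZE) {
		/* step 1: update i_size and truncate the page cache */
		error = simple_setsize(inode, attr->ia_size);
		if (error)
			return error;
		/* step 2: only now free the on-disk blocks */
		demo_truncate_blocks(inode, attr->ia_size);
	}

	generic_setattr(inode, attr);	/* copy the remaining attributes */
	mark_inode_dirty(inode);
	return 0;
}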
diff --git a/fs/fcntl.c b/fs/fcntl.c
index f74d270ba155..51e11bf5708f 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -274,7 +274,7 @@ static int f_setown_ex(struct file *filp, unsigned long arg)
274 274
275 ret = copy_from_user(&owner, owner_p, sizeof(owner)); 275 ret = copy_from_user(&owner, owner_p, sizeof(owner));
276 if (ret) 276 if (ret)
277 return ret; 277 return -EFAULT;
278 278
279 switch (owner.type) { 279 switch (owner.type) {
280 case F_OWNER_TID: 280 case F_OWNER_TID:
@@ -332,8 +332,11 @@ static int f_getown_ex(struct file *filp, unsigned long arg)
332 } 332 }
333 read_unlock(&filp->f_owner.lock); 333 read_unlock(&filp->f_owner.lock);
334 334
335 if (!ret) 335 if (!ret) {
336 ret = copy_to_user(owner_p, &owner, sizeof(owner)); 336 ret = copy_to_user(owner_p, &owner, sizeof(owner));
337 if (ret)
338 ret = -EFAULT;
339 }
337 return ret; 340 return ret;
338} 341}
339 342
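Note: the fcntl.c fix matters because copy_from_user() and copy_to_user() return the number of bytes left uncopied, not an errno, so the raw return value must never reach userspace. The corrected pattern in isolation (owner_p as in f_setown_ex() above):

	struct f_owner_ex owner;

	if (copy_from_user(&owner, owner_p, sizeof(owner)))
		return -EFAULT;		/* nonzero == bytes not copied */
	/* ... modify owner ... */
	if (copy_to_user(owner_p, &owner, sizeof(owner)))
		return -EFAULT;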
diff --git a/fs/file_table.c b/fs/file_table.c
index 32d12b78bac8..5c7d10ead4ad 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -194,14 +194,6 @@ struct file *alloc_file(struct path *path, fmode_t mode,
194} 194}
195EXPORT_SYMBOL(alloc_file); 195EXPORT_SYMBOL(alloc_file);
196 196
197void fput(struct file *file)
198{
199 if (atomic_long_dec_and_test(&file->f_count))
200 __fput(file);
201}
202
203EXPORT_SYMBOL(fput);
204
205/** 197/**
206 * drop_file_write_access - give up ability to write to a file 198 * drop_file_write_access - give up ability to write to a file
207 * @file: the file to which we will stop writing 199 * @file: the file to which we will stop writing
@@ -227,10 +219,9 @@ void drop_file_write_access(struct file *file)
227} 219}
228EXPORT_SYMBOL_GPL(drop_file_write_access); 220EXPORT_SYMBOL_GPL(drop_file_write_access);
229 221
230/* __fput is called from task context when aio completion releases the last 222/* the real guts of fput() - releasing the last reference to file
231 * last use of a struct file *. Do not use otherwise.
232 */ 223 */
233void __fput(struct file *file) 224static void __fput(struct file *file)
234{ 225{
235 struct dentry *dentry = file->f_path.dentry; 226 struct dentry *dentry = file->f_path.dentry;
236 struct vfsmount *mnt = file->f_path.mnt; 227 struct vfsmount *mnt = file->f_path.mnt;
@@ -268,6 +259,14 @@ void __fput(struct file *file)
268 mntput(mnt); 259 mntput(mnt);
269} 260}
270 261
262void fput(struct file *file)
263{
264 if (atomic_long_dec_and_test(&file->f_count))
265 __fput(file);
266}
267
268EXPORT_SYMBOL(fput);
269
271struct file *fget(unsigned int fd) 270struct file *fget(unsigned int fd)
272{ 271{
273 struct file *file; 272 struct file *file;
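Note: with __fput() now static, fput() is the only way to drop a file reference, and whichever caller drops the last one runs the full teardown. The usual pairing, as a sketch:

	struct file *file = fget(fd);	/* take a reference, NULL if fd is bad */
	if (!file)
		return -EBADF;
	/* ... use file ... */
	fput(file);	/* the final reference triggers __fput() teardown */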
diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c
index aee049cb9f84..0ec7bb2c95c6 100644
--- a/fs/freevxfs/vxfs_lookup.c
+++ b/fs/freevxfs/vxfs_lookup.c
@@ -57,6 +57,8 @@ const struct inode_operations vxfs_dir_inode_ops = {
57}; 57};
58 58
59const struct file_operations vxfs_dir_operations = { 59const struct file_operations vxfs_dir_operations = {
60 .llseek = generic_file_llseek,
61 .read = generic_read_dir,
60 .readdir = vxfs_readdir, 62 .readdir = vxfs_readdir,
61}; 63};
62 64
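Note: vxfs (and isofs further down) now give directory file_operations explicit generic handlers instead of relying on the defaults. The resulting shape, with a hypothetical readdir:

const struct file_operations demo_dir_operations = {
	.llseek		= generic_file_llseek,	/* well-defined directory offsets */
	.read		= generic_read_dir,	/* read(2) on a directory returns -EISDIR */
	.readdir	= demo_readdir,		/* illustrative name */
};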
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index ea8592b90696..1d1088f48bc2 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -45,7 +45,6 @@ struct wb_writeback_args {
45 unsigned int for_kupdate:1; 45 unsigned int for_kupdate:1;
46 unsigned int range_cyclic:1; 46 unsigned int range_cyclic:1;
47 unsigned int for_background:1; 47 unsigned int for_background:1;
48 unsigned int sb_pinned:1;
49}; 48};
50 49
51/* 50/*
@@ -193,8 +192,7 @@ static void bdi_wait_on_work_clear(struct bdi_work *work)
193} 192}
194 193
195static void bdi_alloc_queue_work(struct backing_dev_info *bdi, 194static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
196 struct wb_writeback_args *args, 195 struct wb_writeback_args *args)
197 int wait)
198{ 196{
199 struct bdi_work *work; 197 struct bdi_work *work;
200 198
@@ -206,8 +204,6 @@ static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
206 if (work) { 204 if (work) {
207 bdi_work_init(work, args); 205 bdi_work_init(work, args);
208 bdi_queue_work(bdi, work); 206 bdi_queue_work(bdi, work);
209 if (wait)
210 bdi_wait_on_work_clear(work);
211 } else { 207 } else {
212 struct bdi_writeback *wb = &bdi->wb; 208 struct bdi_writeback *wb = &bdi->wb;
213 209
@@ -234,11 +230,6 @@ static void bdi_sync_writeback(struct backing_dev_info *bdi,
234 .sync_mode = WB_SYNC_ALL, 230 .sync_mode = WB_SYNC_ALL,
235 .nr_pages = LONG_MAX, 231 .nr_pages = LONG_MAX,
236 .range_cyclic = 0, 232 .range_cyclic = 0,
237 /*
238 * Setting sb_pinned is not necessary for WB_SYNC_ALL, but
239 * lets make it explicitly clear.
240 */
241 .sb_pinned = 1,
242 }; 233 };
243 struct bdi_work work; 234 struct bdi_work work;
244 235
@@ -254,23 +245,21 @@ static void bdi_sync_writeback(struct backing_dev_info *bdi,
254 * @bdi: the backing device to write from 245 * @bdi: the backing device to write from
255 * @sb: write inodes from this super_block 246 * @sb: write inodes from this super_block
256 * @nr_pages: the number of pages to write 247 * @nr_pages: the number of pages to write
257 * @sb_locked: caller already holds sb umount sem.
258 * 248 *
259 * Description: 249 * Description:
260 * This does WB_SYNC_NONE opportunistic writeback. The IO is only 250 * This does WB_SYNC_NONE opportunistic writeback. The IO is only
 261 started when this function returns, we make no guarantees on 251 started when this function returns, we make no guarantees on
262 * completion. Caller specifies whether sb umount sem is held already or not. 252 * completion. Caller need not hold sb s_umount semaphore.
263 * 253 *
264 */ 254 */
265void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb, 255void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
266 long nr_pages, int sb_locked) 256 long nr_pages)
267{ 257{
268 struct wb_writeback_args args = { 258 struct wb_writeback_args args = {
269 .sb = sb, 259 .sb = sb,
270 .sync_mode = WB_SYNC_NONE, 260 .sync_mode = WB_SYNC_NONE,
271 .nr_pages = nr_pages, 261 .nr_pages = nr_pages,
272 .range_cyclic = 1, 262 .range_cyclic = 1,
273 .sb_pinned = sb_locked,
274 }; 263 };
275 264
276 /* 265 /*
@@ -282,7 +271,7 @@ void bdi_start_writeback(struct backing_dev_info *bdi, struct super_block *sb,
282 args.for_background = 1; 271 args.for_background = 1;
283 } 272 }
284 273
285 bdi_alloc_queue_work(bdi, &args, sb_locked); 274 bdi_alloc_queue_work(bdi, &args);
286} 275}
287 276
288/* 277/*
@@ -595,7 +584,7 @@ static enum sb_pin_state pin_sb_for_writeback(struct writeback_control *wbc,
595 /* 584 /*
596 * Caller must already hold the ref for this 585 * Caller must already hold the ref for this
597 */ 586 */
598 if (wbc->sync_mode == WB_SYNC_ALL || wbc->sb_pinned) { 587 if (wbc->sync_mode == WB_SYNC_ALL) {
599 WARN_ON(!rwsem_is_locked(&sb->s_umount)); 588 WARN_ON(!rwsem_is_locked(&sb->s_umount));
600 return SB_NOT_PINNED; 589 return SB_NOT_PINNED;
601 } 590 }
@@ -769,7 +758,6 @@ static long wb_writeback(struct bdi_writeback *wb,
769 .for_kupdate = args->for_kupdate, 758 .for_kupdate = args->for_kupdate,
770 .for_background = args->for_background, 759 .for_background = args->for_background,
771 .range_cyclic = args->range_cyclic, 760 .range_cyclic = args->range_cyclic,
772 .sb_pinned = args->sb_pinned,
773 }; 761 };
774 unsigned long oldest_jif; 762 unsigned long oldest_jif;
775 long wrote = 0; 763 long wrote = 0;
@@ -912,7 +900,6 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
912 900
913 while ((work = get_next_work_item(bdi, wb)) != NULL) { 901 while ((work = get_next_work_item(bdi, wb)) != NULL) {
914 struct wb_writeback_args args = work->args; 902 struct wb_writeback_args args = work->args;
915 int post_clear;
916 903
917 /* 904 /*
918 * Override sync mode, in case we must wait for completion 905 * Override sync mode, in case we must wait for completion
@@ -920,13 +907,11 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
920 if (force_wait) 907 if (force_wait)
921 work->args.sync_mode = args.sync_mode = WB_SYNC_ALL; 908 work->args.sync_mode = args.sync_mode = WB_SYNC_ALL;
922 909
923 post_clear = WB_SYNC_ALL || args.sb_pinned;
924
925 /* 910 /*
926 * If this isn't a data integrity operation, just notify 911 * If this isn't a data integrity operation, just notify
927 * that we have seen this work and we are now starting it. 912 * that we have seen this work and we are now starting it.
928 */ 913 */
929 if (!post_clear) 914 if (args.sync_mode == WB_SYNC_NONE)
930 wb_clear_pending(wb, work); 915 wb_clear_pending(wb, work);
931 916
932 wrote += wb_writeback(wb, &args); 917 wrote += wb_writeback(wb, &args);
@@ -935,7 +920,7 @@ long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
935 * This is a data integrity writeback, so only do the 920 * This is a data integrity writeback, so only do the
936 * notification when we have completed the work. 921 * notification when we have completed the work.
937 */ 922 */
938 if (post_clear) 923 if (args.sync_mode == WB_SYNC_ALL)
939 wb_clear_pending(wb, work); 924 wb_clear_pending(wb, work);
940 } 925 }
941 926
@@ -1011,7 +996,7 @@ static void bdi_writeback_all(struct super_block *sb, long nr_pages)
1011 if (!bdi_has_dirty_io(bdi)) 996 if (!bdi_has_dirty_io(bdi))
1012 continue; 997 continue;
1013 998
1014 bdi_alloc_queue_work(bdi, &args, 0); 999 bdi_alloc_queue_work(bdi, &args);
1015 } 1000 }
1016 1001
1017 rcu_read_unlock(); 1002 rcu_read_unlock();
@@ -1220,18 +1205,6 @@ static void wait_sb_inodes(struct super_block *sb)
1220 iput(old_inode); 1205 iput(old_inode);
1221} 1206}
1222 1207
1223static void __writeback_inodes_sb(struct super_block *sb, int sb_locked)
1224{
1225 unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
1226 unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
1227 long nr_to_write;
1228
1229 nr_to_write = nr_dirty + nr_unstable +
1230 (inodes_stat.nr_inodes - inodes_stat.nr_unused);
1231
1232 bdi_start_writeback(sb->s_bdi, sb, nr_to_write, sb_locked);
1233}
1234
1235/** 1208/**
1236 * writeback_inodes_sb - writeback dirty inodes from given super_block 1209 * writeback_inodes_sb - writeback dirty inodes from given super_block
1237 * @sb: the superblock 1210 * @sb: the superblock
@@ -1243,21 +1216,16 @@ static void __writeback_inodes_sb(struct super_block *sb, int sb_locked)
1243 */ 1216 */
1244void writeback_inodes_sb(struct super_block *sb) 1217void writeback_inodes_sb(struct super_block *sb)
1245{ 1218{
1246 __writeback_inodes_sb(sb, 0); 1219 unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
1247} 1220 unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
1248EXPORT_SYMBOL(writeback_inodes_sb); 1221 long nr_to_write;
1249 1222
1250/** 1223 nr_to_write = nr_dirty + nr_unstable +
1251 * writeback_inodes_sb_locked - writeback dirty inodes from given super_block 1224 (inodes_stat.nr_inodes - inodes_stat.nr_unused);
1252 * @sb: the superblock 1225
1253 * 1226 bdi_start_writeback(sb->s_bdi, sb, nr_to_write);
1254 * Like writeback_inodes_sb(), except the caller already holds the
1255 * sb umount sem.
1256 */
1257void writeback_inodes_sb_locked(struct super_block *sb)
1258{
1259 __writeback_inodes_sb(sb, 1);
1260} 1227}
1228EXPORT_SYMBOL(writeback_inodes_sb);
1261 1229
1262/** 1230/**
1263 * writeback_inodes_sb_if_idle - start writeback if none underway 1231 * writeback_inodes_sb_if_idle - start writeback if none underway
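Note: with sb_pinned gone, the writeback rule is simply that WB_SYNC_ALL callers must already hold sb->s_umount (hence the WARN_ON in pin_sb_for_writeback()), while WB_SYNC_NONE callers need no pin at all. Typical call sites after this change, as a sketch:

	/* opportunistic writeback of one superblock; IO is only started,
	 * not waited for, by the time this returns */
	writeback_inodes_sb(sb);

	/* or push an explicit page budget at a backing device */
	bdi_start_writeback(sb->s_bdi, sb, nr_to_write);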
diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c
index 1e1f286dd70e..4a8eb31c5338 100644
--- a/fs/fscache/object-list.c
+++ b/fs/fscache/object-list.c
@@ -103,7 +103,7 @@ static struct fscache_object *fscache_objlist_lookup(loff_t *_pos)
103 /* banners (can't represent line 0 by pos 0 as that would involve 103 /* banners (can't represent line 0 by pos 0 as that would involve
104 * returning a NULL pointer) */ 104 * returning a NULL pointer) */
105 if (pos == 0) 105 if (pos == 0)
106 return (struct fscache_object *) ++(*_pos); 106 return (struct fscache_object *)(long)++(*_pos);
107 if (pos < 3) 107 if (pos < 3)
108 return (struct fscache_object *)pos; 108 return (struct fscache_object *)pos;
109 109
diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index 47aefd376e54..723b889fd219 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -710,30 +710,26 @@ static void fscache_write_op(struct fscache_operation *_op)
710 goto superseded; 710 goto superseded;
711 } 711 }
712 712
713 if (page) { 713 radix_tree_tag_set(&cookie->stores, page->index,
714 radix_tree_tag_set(&cookie->stores, page->index, 714 FSCACHE_COOKIE_STORING_TAG);
715 FSCACHE_COOKIE_STORING_TAG); 715 radix_tree_tag_clear(&cookie->stores, page->index,
716 radix_tree_tag_clear(&cookie->stores, page->index, 716 FSCACHE_COOKIE_PENDING_TAG);
717 FSCACHE_COOKIE_PENDING_TAG);
718 }
719 717
720 spin_unlock(&cookie->stores_lock); 718 spin_unlock(&cookie->stores_lock);
721 spin_unlock(&object->lock); 719 spin_unlock(&object->lock);
722 720
723 if (page) { 721 fscache_set_op_state(&op->op, "Store");
724 fscache_set_op_state(&op->op, "Store"); 722 fscache_stat(&fscache_n_store_pages);
725 fscache_stat(&fscache_n_store_pages); 723 fscache_stat(&fscache_n_cop_write_page);
726 fscache_stat(&fscache_n_cop_write_page); 724 ret = object->cache->ops->write_page(op, page);
727 ret = object->cache->ops->write_page(op, page); 725 fscache_stat_d(&fscache_n_cop_write_page);
728 fscache_stat_d(&fscache_n_cop_write_page); 726 fscache_set_op_state(&op->op, "EndWrite");
729 fscache_set_op_state(&op->op, "EndWrite"); 727 fscache_end_page_write(object, page);
730 fscache_end_page_write(object, page); 728 if (ret < 0) {
731 if (ret < 0) { 729 fscache_set_op_state(&op->op, "Abort");
732 fscache_set_op_state(&op->op, "Abort"); 730 fscache_abort_object(object);
733 fscache_abort_object(object); 731 } else {
734 } else { 732 fscache_enqueue_operation(&op->op);
735 fscache_enqueue_operation(&op->op);
736 }
737 } 733 }
738 734
739 _leave(""); 735 _leave("");
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index e53df5ebb2b8..9424796d6634 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -16,6 +16,9 @@
16#include <linux/pagemap.h> 16#include <linux/pagemap.h>
17#include <linux/file.h> 17#include <linux/file.h>
18#include <linux/slab.h> 18#include <linux/slab.h>
19#include <linux/pipe_fs_i.h>
20#include <linux/swap.h>
21#include <linux/splice.h>
19 22
20MODULE_ALIAS_MISCDEV(FUSE_MINOR); 23MODULE_ALIAS_MISCDEV(FUSE_MINOR);
21MODULE_ALIAS("devname:fuse"); 24MODULE_ALIAS("devname:fuse");
@@ -499,6 +502,9 @@ struct fuse_copy_state {
499 int write; 502 int write;
500 struct fuse_req *req; 503 struct fuse_req *req;
501 const struct iovec *iov; 504 const struct iovec *iov;
505 struct pipe_buffer *pipebufs;
506 struct pipe_buffer *currbuf;
507 struct pipe_inode_info *pipe;
502 unsigned long nr_segs; 508 unsigned long nr_segs;
503 unsigned long seglen; 509 unsigned long seglen;
504 unsigned long addr; 510 unsigned long addr;
@@ -506,16 +512,16 @@ struct fuse_copy_state {
506 void *mapaddr; 512 void *mapaddr;
507 void *buf; 513 void *buf;
508 unsigned len; 514 unsigned len;
515 unsigned move_pages:1;
509}; 516};
510 517
511static void fuse_copy_init(struct fuse_copy_state *cs, struct fuse_conn *fc, 518static void fuse_copy_init(struct fuse_copy_state *cs, struct fuse_conn *fc,
512 int write, struct fuse_req *req, 519 int write,
513 const struct iovec *iov, unsigned long nr_segs) 520 const struct iovec *iov, unsigned long nr_segs)
514{ 521{
515 memset(cs, 0, sizeof(*cs)); 522 memset(cs, 0, sizeof(*cs));
516 cs->fc = fc; 523 cs->fc = fc;
517 cs->write = write; 524 cs->write = write;
518 cs->req = req;
519 cs->iov = iov; 525 cs->iov = iov;
520 cs->nr_segs = nr_segs; 526 cs->nr_segs = nr_segs;
521} 527}
@@ -523,7 +529,18 @@ static void fuse_copy_init(struct fuse_copy_state *cs, struct fuse_conn *fc,
523/* Unmap and put previous page of userspace buffer */ 529/* Unmap and put previous page of userspace buffer */
524static void fuse_copy_finish(struct fuse_copy_state *cs) 530static void fuse_copy_finish(struct fuse_copy_state *cs)
525{ 531{
526 if (cs->mapaddr) { 532 if (cs->currbuf) {
533 struct pipe_buffer *buf = cs->currbuf;
534
535 if (!cs->write) {
536 buf->ops->unmap(cs->pipe, buf, cs->mapaddr);
537 } else {
538 kunmap_atomic(cs->mapaddr, KM_USER0);
539 buf->len = PAGE_SIZE - cs->len;
540 }
541 cs->currbuf = NULL;
542 cs->mapaddr = NULL;
543 } else if (cs->mapaddr) {
527 kunmap_atomic(cs->mapaddr, KM_USER0); 544 kunmap_atomic(cs->mapaddr, KM_USER0);
528 if (cs->write) { 545 if (cs->write) {
529 flush_dcache_page(cs->pg); 546 flush_dcache_page(cs->pg);
@@ -545,26 +562,61 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
545 562
546 unlock_request(cs->fc, cs->req); 563 unlock_request(cs->fc, cs->req);
547 fuse_copy_finish(cs); 564 fuse_copy_finish(cs);
548 if (!cs->seglen) { 565 if (cs->pipebufs) {
549 BUG_ON(!cs->nr_segs); 566 struct pipe_buffer *buf = cs->pipebufs;
550 cs->seglen = cs->iov[0].iov_len; 567
551 cs->addr = (unsigned long) cs->iov[0].iov_base; 568 if (!cs->write) {
552 cs->iov++; 569 err = buf->ops->confirm(cs->pipe, buf);
553 cs->nr_segs--; 570 if (err)
571 return err;
572
573 BUG_ON(!cs->nr_segs);
574 cs->currbuf = buf;
575 cs->mapaddr = buf->ops->map(cs->pipe, buf, 1);
576 cs->len = buf->len;
577 cs->buf = cs->mapaddr + buf->offset;
578 cs->pipebufs++;
579 cs->nr_segs--;
580 } else {
581 struct page *page;
582
583 if (cs->nr_segs == cs->pipe->buffers)
584 return -EIO;
585
586 page = alloc_page(GFP_HIGHUSER);
587 if (!page)
588 return -ENOMEM;
589
590 buf->page = page;
591 buf->offset = 0;
592 buf->len = 0;
593
594 cs->currbuf = buf;
595 cs->mapaddr = kmap_atomic(page, KM_USER0);
596 cs->buf = cs->mapaddr;
597 cs->len = PAGE_SIZE;
598 cs->pipebufs++;
599 cs->nr_segs++;
600 }
601 } else {
602 if (!cs->seglen) {
603 BUG_ON(!cs->nr_segs);
604 cs->seglen = cs->iov[0].iov_len;
605 cs->addr = (unsigned long) cs->iov[0].iov_base;
606 cs->iov++;
607 cs->nr_segs--;
608 }
609 err = get_user_pages_fast(cs->addr, 1, cs->write, &cs->pg);
610 if (err < 0)
611 return err;
612 BUG_ON(err != 1);
613 offset = cs->addr % PAGE_SIZE;
614 cs->mapaddr = kmap_atomic(cs->pg, KM_USER0);
615 cs->buf = cs->mapaddr + offset;
616 cs->len = min(PAGE_SIZE - offset, cs->seglen);
617 cs->seglen -= cs->len;
618 cs->addr += cs->len;
554 } 619 }
555 down_read(&current->mm->mmap_sem);
556 err = get_user_pages(current, current->mm, cs->addr, 1, cs->write, 0,
557 &cs->pg, NULL);
558 up_read(&current->mm->mmap_sem);
559 if (err < 0)
560 return err;
561 BUG_ON(err != 1);
562 offset = cs->addr % PAGE_SIZE;
563 cs->mapaddr = kmap_atomic(cs->pg, KM_USER0);
564 cs->buf = cs->mapaddr + offset;
565 cs->len = min(PAGE_SIZE - offset, cs->seglen);
566 cs->seglen -= cs->len;
567 cs->addr += cs->len;
568 620
569 return lock_request(cs->fc, cs->req); 621 return lock_request(cs->fc, cs->req);
570} 622}
@@ -586,23 +638,178 @@ static int fuse_copy_do(struct fuse_copy_state *cs, void **val, unsigned *size)
586 return ncpy; 638 return ncpy;
587} 639}
588 640
641static int fuse_check_page(struct page *page)
642{
643 if (page_mapcount(page) ||
644 page->mapping != NULL ||
645 page_count(page) != 1 ||
646 (page->flags & PAGE_FLAGS_CHECK_AT_PREP &
647 ~(1 << PG_locked |
648 1 << PG_referenced |
649 1 << PG_uptodate |
650 1 << PG_lru |
651 1 << PG_active |
652 1 << PG_reclaim))) {
653 printk(KERN_WARNING "fuse: trying to steal weird page\n");
654 printk(KERN_WARNING " page=%p index=%li flags=%08lx, count=%i, mapcount=%i, mapping=%p\n", page, page->index, page->flags, page_count(page), page_mapcount(page), page->mapping);
655 return 1;
656 }
657 return 0;
658}
659
660static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
661{
662 int err;
663 struct page *oldpage = *pagep;
664 struct page *newpage;
665 struct pipe_buffer *buf = cs->pipebufs;
666 struct address_space *mapping;
667 pgoff_t index;
668
669 unlock_request(cs->fc, cs->req);
670 fuse_copy_finish(cs);
671
672 err = buf->ops->confirm(cs->pipe, buf);
673 if (err)
674 return err;
675
676 BUG_ON(!cs->nr_segs);
677 cs->currbuf = buf;
678 cs->len = buf->len;
679 cs->pipebufs++;
680 cs->nr_segs--;
681
682 if (cs->len != PAGE_SIZE)
683 goto out_fallback;
684
685 if (buf->ops->steal(cs->pipe, buf) != 0)
686 goto out_fallback;
687
688 newpage = buf->page;
689
690 if (WARN_ON(!PageUptodate(newpage)))
691 return -EIO;
692
693 ClearPageMappedToDisk(newpage);
694
695 if (fuse_check_page(newpage) != 0)
696 goto out_fallback_unlock;
697
698 mapping = oldpage->mapping;
699 index = oldpage->index;
700
701 /*
702 * This is a new and locked page, it shouldn't be mapped or
703 * have any special flags on it
704 */
705 if (WARN_ON(page_mapped(oldpage)))
706 goto out_fallback_unlock;
707 if (WARN_ON(page_has_private(oldpage)))
708 goto out_fallback_unlock;
709 if (WARN_ON(PageDirty(oldpage) || PageWriteback(oldpage)))
710 goto out_fallback_unlock;
711 if (WARN_ON(PageMlocked(oldpage)))
712 goto out_fallback_unlock;
713
714 remove_from_page_cache(oldpage);
715 page_cache_release(oldpage);
716
717 err = add_to_page_cache_locked(newpage, mapping, index, GFP_KERNEL);
718 if (err) {
719 printk(KERN_WARNING "fuse_try_move_page: failed to add page");
720 goto out_fallback_unlock;
721 }
722 page_cache_get(newpage);
723
724 if (!(buf->flags & PIPE_BUF_FLAG_LRU))
725 lru_cache_add_file(newpage);
726
727 err = 0;
728 spin_lock(&cs->fc->lock);
729 if (cs->req->aborted)
730 err = -ENOENT;
731 else
732 *pagep = newpage;
733 spin_unlock(&cs->fc->lock);
734
735 if (err) {
736 unlock_page(newpage);
737 page_cache_release(newpage);
738 return err;
739 }
740
741 unlock_page(oldpage);
742 page_cache_release(oldpage);
743 cs->len = 0;
744
745 return 0;
746
747out_fallback_unlock:
748 unlock_page(newpage);
749out_fallback:
750 cs->mapaddr = buf->ops->map(cs->pipe, buf, 1);
751 cs->buf = cs->mapaddr + buf->offset;
752
753 err = lock_request(cs->fc, cs->req);
754 if (err)
755 return err;
756
757 return 1;
758}
759
760static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page,
761 unsigned offset, unsigned count)
762{
763 struct pipe_buffer *buf;
764
765 if (cs->nr_segs == cs->pipe->buffers)
766 return -EIO;
767
768 unlock_request(cs->fc, cs->req);
769 fuse_copy_finish(cs);
770
771 buf = cs->pipebufs;
772 page_cache_get(page);
773 buf->page = page;
774 buf->offset = offset;
775 buf->len = count;
776
777 cs->pipebufs++;
778 cs->nr_segs++;
779 cs->len = 0;
780
781 return 0;
782}
783
589/* 784/*
590 * Copy a page in the request to/from the userspace buffer. Must be 785 * Copy a page in the request to/from the userspace buffer. Must be
591 * done atomically 786 * done atomically
592 */ 787 */
593static int fuse_copy_page(struct fuse_copy_state *cs, struct page *page, 788static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep,
594 unsigned offset, unsigned count, int zeroing) 789 unsigned offset, unsigned count, int zeroing)
595{ 790{
791 int err;
792 struct page *page = *pagep;
793
596 if (page && zeroing && count < PAGE_SIZE) { 794 if (page && zeroing && count < PAGE_SIZE) {
597 void *mapaddr = kmap_atomic(page, KM_USER1); 795 void *mapaddr = kmap_atomic(page, KM_USER1);
598 memset(mapaddr, 0, PAGE_SIZE); 796 memset(mapaddr, 0, PAGE_SIZE);
599 kunmap_atomic(mapaddr, KM_USER1); 797 kunmap_atomic(mapaddr, KM_USER1);
600 } 798 }
601 while (count) { 799 while (count) {
602 if (!cs->len) { 800 if (cs->write && cs->pipebufs && page) {
603 int err = fuse_copy_fill(cs); 801 return fuse_ref_page(cs, page, offset, count);
604 if (err) 802 } else if (!cs->len) {
605 return err; 803 if (cs->move_pages && page &&
804 offset == 0 && count == PAGE_SIZE) {
805 err = fuse_try_move_page(cs, pagep);
806 if (err <= 0)
807 return err;
808 } else {
809 err = fuse_copy_fill(cs);
810 if (err)
811 return err;
812 }
606 } 813 }
607 if (page) { 814 if (page) {
608 void *mapaddr = kmap_atomic(page, KM_USER1); 815 void *mapaddr = kmap_atomic(page, KM_USER1);
@@ -627,8 +834,10 @@ static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes,
627 unsigned count = min(nbytes, (unsigned) PAGE_SIZE - offset); 834 unsigned count = min(nbytes, (unsigned) PAGE_SIZE - offset);
628 835
629 for (i = 0; i < req->num_pages && (nbytes || zeroing); i++) { 836 for (i = 0; i < req->num_pages && (nbytes || zeroing); i++) {
630 struct page *page = req->pages[i]; 837 int err;
631 int err = fuse_copy_page(cs, page, offset, count, zeroing); 838
839 err = fuse_copy_page(cs, &req->pages[i], offset, count,
840 zeroing);
632 if (err) 841 if (err)
633 return err; 842 return err;
634 843
@@ -705,11 +914,10 @@ __acquires(&fc->lock)
705 * 914 *
706 * Called with fc->lock held, releases it 915 * Called with fc->lock held, releases it
707 */ 916 */
708static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_req *req, 917static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_copy_state *cs,
709 const struct iovec *iov, unsigned long nr_segs) 918 size_t nbytes, struct fuse_req *req)
710__releases(&fc->lock) 919__releases(&fc->lock)
711{ 920{
712 struct fuse_copy_state cs;
713 struct fuse_in_header ih; 921 struct fuse_in_header ih;
714 struct fuse_interrupt_in arg; 922 struct fuse_interrupt_in arg;
715 unsigned reqsize = sizeof(ih) + sizeof(arg); 923 unsigned reqsize = sizeof(ih) + sizeof(arg);
@@ -725,14 +933,13 @@ __releases(&fc->lock)
725 arg.unique = req->in.h.unique; 933 arg.unique = req->in.h.unique;
726 934
727 spin_unlock(&fc->lock); 935 spin_unlock(&fc->lock);
728 if (iov_length(iov, nr_segs) < reqsize) 936 if (nbytes < reqsize)
729 return -EINVAL; 937 return -EINVAL;
730 938
731 fuse_copy_init(&cs, fc, 1, NULL, iov, nr_segs); 939 err = fuse_copy_one(cs, &ih, sizeof(ih));
732 err = fuse_copy_one(&cs, &ih, sizeof(ih));
733 if (!err) 940 if (!err)
734 err = fuse_copy_one(&cs, &arg, sizeof(arg)); 941 err = fuse_copy_one(cs, &arg, sizeof(arg));
735 fuse_copy_finish(&cs); 942 fuse_copy_finish(cs);
736 943
737 return err ? err : reqsize; 944 return err ? err : reqsize;
738} 945}
@@ -746,18 +953,13 @@ __releases(&fc->lock)
746 * request_end(). Otherwise add it to the processing list, and set 953 * request_end(). Otherwise add it to the processing list, and set
747 * the 'sent' flag. 954 * the 'sent' flag.
748 */ 955 */
749static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov, 956static ssize_t fuse_dev_do_read(struct fuse_conn *fc, struct file *file,
750 unsigned long nr_segs, loff_t pos) 957 struct fuse_copy_state *cs, size_t nbytes)
751{ 958{
752 int err; 959 int err;
753 struct fuse_req *req; 960 struct fuse_req *req;
754 struct fuse_in *in; 961 struct fuse_in *in;
755 struct fuse_copy_state cs;
756 unsigned reqsize; 962 unsigned reqsize;
757 struct file *file = iocb->ki_filp;
758 struct fuse_conn *fc = fuse_get_conn(file);
759 if (!fc)
760 return -EPERM;
761 963
762 restart: 964 restart:
763 spin_lock(&fc->lock); 965 spin_lock(&fc->lock);
@@ -777,7 +979,7 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
777 if (!list_empty(&fc->interrupts)) { 979 if (!list_empty(&fc->interrupts)) {
778 req = list_entry(fc->interrupts.next, struct fuse_req, 980 req = list_entry(fc->interrupts.next, struct fuse_req,
779 intr_entry); 981 intr_entry);
780 return fuse_read_interrupt(fc, req, iov, nr_segs); 982 return fuse_read_interrupt(fc, cs, nbytes, req);
781 } 983 }
782 984
783 req = list_entry(fc->pending.next, struct fuse_req, list); 985 req = list_entry(fc->pending.next, struct fuse_req, list);
@@ -787,7 +989,7 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
787 in = &req->in; 989 in = &req->in;
788 reqsize = in->h.len; 990 reqsize = in->h.len;
789 /* If request is too large, reply with an error and restart the read */ 991 /* If request is too large, reply with an error and restart the read */
790 if (iov_length(iov, nr_segs) < reqsize) { 992 if (nbytes < reqsize) {
791 req->out.h.error = -EIO; 993 req->out.h.error = -EIO;
792 /* SETXATTR is special, since it may contain too large data */ 994 /* SETXATTR is special, since it may contain too large data */
793 if (in->h.opcode == FUSE_SETXATTR) 995 if (in->h.opcode == FUSE_SETXATTR)
@@ -796,12 +998,12 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
796 goto restart; 998 goto restart;
797 } 999 }
798 spin_unlock(&fc->lock); 1000 spin_unlock(&fc->lock);
799 fuse_copy_init(&cs, fc, 1, req, iov, nr_segs); 1001 cs->req = req;
800 err = fuse_copy_one(&cs, &in->h, sizeof(in->h)); 1002 err = fuse_copy_one(cs, &in->h, sizeof(in->h));
801 if (!err) 1003 if (!err)
802 err = fuse_copy_args(&cs, in->numargs, in->argpages, 1004 err = fuse_copy_args(cs, in->numargs, in->argpages,
803 (struct fuse_arg *) in->args, 0); 1005 (struct fuse_arg *) in->args, 0);
804 fuse_copy_finish(&cs); 1006 fuse_copy_finish(cs);
805 spin_lock(&fc->lock); 1007 spin_lock(&fc->lock);
806 req->locked = 0; 1008 req->locked = 0;
807 if (req->aborted) { 1009 if (req->aborted) {
@@ -829,6 +1031,110 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
829 return err; 1031 return err;
830} 1032}
831 1033
1034static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov,
1035 unsigned long nr_segs, loff_t pos)
1036{
1037 struct fuse_copy_state cs;
1038 struct file *file = iocb->ki_filp;
1039 struct fuse_conn *fc = fuse_get_conn(file);
1040 if (!fc)
1041 return -EPERM;
1042
1043 fuse_copy_init(&cs, fc, 1, iov, nr_segs);
1044
1045 return fuse_dev_do_read(fc, file, &cs, iov_length(iov, nr_segs));
1046}
1047
1048static int fuse_dev_pipe_buf_steal(struct pipe_inode_info *pipe,
1049 struct pipe_buffer *buf)
1050{
1051 return 1;
1052}
1053
1054static const struct pipe_buf_operations fuse_dev_pipe_buf_ops = {
1055 .can_merge = 0,
1056 .map = generic_pipe_buf_map,
1057 .unmap = generic_pipe_buf_unmap,
1058 .confirm = generic_pipe_buf_confirm,
1059 .release = generic_pipe_buf_release,
1060 .steal = fuse_dev_pipe_buf_steal,
1061 .get = generic_pipe_buf_get,
1062};
1063
1064static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos,
1065 struct pipe_inode_info *pipe,
1066 size_t len, unsigned int flags)
1067{
1068 int ret;
1069 int page_nr = 0;
1070 int do_wakeup = 0;
1071 struct pipe_buffer *bufs;
1072 struct fuse_copy_state cs;
1073 struct fuse_conn *fc = fuse_get_conn(in);
1074 if (!fc)
1075 return -EPERM;
1076
1077 bufs = kmalloc(pipe->buffers * sizeof (struct pipe_buffer), GFP_KERNEL);
1078 if (!bufs)
1079 return -ENOMEM;
1080
1081 fuse_copy_init(&cs, fc, 1, NULL, 0);
1082 cs.pipebufs = bufs;
1083 cs.pipe = pipe;
1084 ret = fuse_dev_do_read(fc, in, &cs, len);
1085 if (ret < 0)
1086 goto out;
1087
1088 ret = 0;
1089 pipe_lock(pipe);
1090
1091 if (!pipe->readers) {
1092 send_sig(SIGPIPE, current, 0);
1093 if (!ret)
1094 ret = -EPIPE;
1095 goto out_unlock;
1096 }
1097
1098 if (pipe->nrbufs + cs.nr_segs > pipe->buffers) {
1099 ret = -EIO;
1100 goto out_unlock;
1101 }
1102
1103 while (page_nr < cs.nr_segs) {
1104 int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
1105 struct pipe_buffer *buf = pipe->bufs + newbuf;
1106
1107 buf->page = bufs[page_nr].page;
1108 buf->offset = bufs[page_nr].offset;
1109 buf->len = bufs[page_nr].len;
1110 buf->ops = &fuse_dev_pipe_buf_ops;
1111
1112 pipe->nrbufs++;
1113 page_nr++;
1114 ret += buf->len;
1115
1116 if (pipe->inode)
1117 do_wakeup = 1;
1118 }
1119
1120out_unlock:
1121 pipe_unlock(pipe);
1122
1123 if (do_wakeup) {
1124 smp_mb();
1125 if (waitqueue_active(&pipe->wait))
1126 wake_up_interruptible(&pipe->wait);
1127 kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
1128 }
1129
1130out:
1131 for (; page_nr < cs.nr_segs; page_nr++)
1132 page_cache_release(bufs[page_nr].page);
1133
1134 kfree(bufs);
1135 return ret;
1136}
1137
832static int fuse_notify_poll(struct fuse_conn *fc, unsigned int size, 1138static int fuse_notify_poll(struct fuse_conn *fc, unsigned int size,
833 struct fuse_copy_state *cs) 1139 struct fuse_copy_state *cs)
834{ 1140{
@@ -988,23 +1294,17 @@ static int copy_out_args(struct fuse_copy_state *cs, struct fuse_out *out,
988 * it from the list and copy the rest of the buffer to the request. 1294 * it from the list and copy the rest of the buffer to the request.
989 * The request is finished by calling request_end() 1295 * The request is finished by calling request_end()
990 */ 1296 */
991static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov, 1297static ssize_t fuse_dev_do_write(struct fuse_conn *fc,
992 unsigned long nr_segs, loff_t pos) 1298 struct fuse_copy_state *cs, size_t nbytes)
993{ 1299{
994 int err; 1300 int err;
995 size_t nbytes = iov_length(iov, nr_segs);
996 struct fuse_req *req; 1301 struct fuse_req *req;
997 struct fuse_out_header oh; 1302 struct fuse_out_header oh;
998 struct fuse_copy_state cs;
999 struct fuse_conn *fc = fuse_get_conn(iocb->ki_filp);
1000 if (!fc)
1001 return -EPERM;
1002 1303
1003 fuse_copy_init(&cs, fc, 0, NULL, iov, nr_segs);
1004 if (nbytes < sizeof(struct fuse_out_header)) 1304 if (nbytes < sizeof(struct fuse_out_header))
1005 return -EINVAL; 1305 return -EINVAL;
1006 1306
1007 err = fuse_copy_one(&cs, &oh, sizeof(oh)); 1307 err = fuse_copy_one(cs, &oh, sizeof(oh));
1008 if (err) 1308 if (err)
1009 goto err_finish; 1309 goto err_finish;
1010 1310
@@ -1017,7 +1317,7 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
1017 * and error contains notification code. 1317 * and error contains notification code.
1018 */ 1318 */
1019 if (!oh.unique) { 1319 if (!oh.unique) {
1020 err = fuse_notify(fc, oh.error, nbytes - sizeof(oh), &cs); 1320 err = fuse_notify(fc, oh.error, nbytes - sizeof(oh), cs);
1021 return err ? err : nbytes; 1321 return err ? err : nbytes;
1022 } 1322 }
1023 1323
@@ -1036,7 +1336,7 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
1036 1336
1037 if (req->aborted) { 1337 if (req->aborted) {
1038 spin_unlock(&fc->lock); 1338 spin_unlock(&fc->lock);
1039 fuse_copy_finish(&cs); 1339 fuse_copy_finish(cs);
1040 spin_lock(&fc->lock); 1340 spin_lock(&fc->lock);
1041 request_end(fc, req); 1341 request_end(fc, req);
1042 return -ENOENT; 1342 return -ENOENT;
@@ -1053,7 +1353,7 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
1053 queue_interrupt(fc, req); 1353 queue_interrupt(fc, req);
1054 1354
1055 spin_unlock(&fc->lock); 1355 spin_unlock(&fc->lock);
1056 fuse_copy_finish(&cs); 1356 fuse_copy_finish(cs);
1057 return nbytes; 1357 return nbytes;
1058 } 1358 }
1059 1359
@@ -1061,11 +1361,13 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
1061 list_move(&req->list, &fc->io); 1361 list_move(&req->list, &fc->io);
1062 req->out.h = oh; 1362 req->out.h = oh;
1063 req->locked = 1; 1363 req->locked = 1;
1064 cs.req = req; 1364 cs->req = req;
1365 if (!req->out.page_replace)
1366 cs->move_pages = 0;
1065 spin_unlock(&fc->lock); 1367 spin_unlock(&fc->lock);
1066 1368
1067 err = copy_out_args(&cs, &req->out, nbytes); 1369 err = copy_out_args(cs, &req->out, nbytes);
1068 fuse_copy_finish(&cs); 1370 fuse_copy_finish(cs);
1069 1371
1070 spin_lock(&fc->lock); 1372 spin_lock(&fc->lock);
1071 req->locked = 0; 1373 req->locked = 0;
@@ -1081,10 +1383,101 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
1081 err_unlock: 1383 err_unlock:
1082 spin_unlock(&fc->lock); 1384 spin_unlock(&fc->lock);
1083 err_finish: 1385 err_finish:
1084 fuse_copy_finish(&cs); 1386 fuse_copy_finish(cs);
1085 return err; 1387 return err;
1086} 1388}
1087 1389
1390static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov,
1391 unsigned long nr_segs, loff_t pos)
1392{
1393 struct fuse_copy_state cs;
1394 struct fuse_conn *fc = fuse_get_conn(iocb->ki_filp);
1395 if (!fc)
1396 return -EPERM;
1397
1398 fuse_copy_init(&cs, fc, 0, iov, nr_segs);
1399
1400 return fuse_dev_do_write(fc, &cs, iov_length(iov, nr_segs));
1401}
1402
1403static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
1404 struct file *out, loff_t *ppos,
1405 size_t len, unsigned int flags)
1406{
1407 unsigned nbuf;
1408 unsigned idx;
1409 struct pipe_buffer *bufs;
1410 struct fuse_copy_state cs;
1411 struct fuse_conn *fc;
1412 size_t rem;
1413 ssize_t ret;
1414
1415 fc = fuse_get_conn(out);
1416 if (!fc)
1417 return -EPERM;
1418
1419 bufs = kmalloc(pipe->buffers * sizeof (struct pipe_buffer), GFP_KERNEL);
1420 if (!bufs)
1421 return -ENOMEM;
1422
1423 pipe_lock(pipe);
1424 nbuf = 0;
1425 rem = 0;
1426 for (idx = 0; idx < pipe->nrbufs && rem < len; idx++)
1427 rem += pipe->bufs[(pipe->curbuf + idx) & (pipe->buffers - 1)].len;
1428
1429 ret = -EINVAL;
1430 if (rem < len) {
1431 pipe_unlock(pipe);
1432 goto out;
1433 }
1434
1435 rem = len;
1436 while (rem) {
1437 struct pipe_buffer *ibuf;
1438 struct pipe_buffer *obuf;
1439
1440 BUG_ON(nbuf >= pipe->buffers);
1441 BUG_ON(!pipe->nrbufs);
1442 ibuf = &pipe->bufs[pipe->curbuf];
1443 obuf = &bufs[nbuf];
1444
1445 if (rem >= ibuf->len) {
1446 *obuf = *ibuf;
1447 ibuf->ops = NULL;
1448 pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
1449 pipe->nrbufs--;
1450 } else {
1451 ibuf->ops->get(pipe, ibuf);
1452 *obuf = *ibuf;
1453 obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
1454 obuf->len = rem;
1455 ibuf->offset += obuf->len;
1456 ibuf->len -= obuf->len;
1457 }
1458 nbuf++;
1459 rem -= obuf->len;
1460 }
1461 pipe_unlock(pipe);
1462
1463 fuse_copy_init(&cs, fc, 0, NULL, nbuf);
1464 cs.pipebufs = bufs;
1465 cs.pipe = pipe;
1466
1467 if (flags & SPLICE_F_MOVE)
1468 cs.move_pages = 1;
1469
1470 ret = fuse_dev_do_write(fc, &cs, len);
1471
1472 for (idx = 0; idx < nbuf; idx++) {
1473 struct pipe_buffer *buf = &bufs[idx];
1474 buf->ops->release(pipe, buf);
1475 }
1476out:
1477 kfree(bufs);
1478 return ret;
1479}
1480
1088static unsigned fuse_dev_poll(struct file *file, poll_table *wait) 1481static unsigned fuse_dev_poll(struct file *file, poll_table *wait)
1089{ 1482{
1090 unsigned mask = POLLOUT | POLLWRNORM; 1483 unsigned mask = POLLOUT | POLLWRNORM;
@@ -1226,8 +1619,10 @@ const struct file_operations fuse_dev_operations = {
1226 .llseek = no_llseek, 1619 .llseek = no_llseek,
1227 .read = do_sync_read, 1620 .read = do_sync_read,
1228 .aio_read = fuse_dev_read, 1621 .aio_read = fuse_dev_read,
1622 .splice_read = fuse_dev_splice_read,
1229 .write = do_sync_write, 1623 .write = do_sync_write,
1230 .aio_write = fuse_dev_write, 1624 .aio_write = fuse_dev_write,
1625 .splice_write = fuse_dev_splice_write,
1231 .poll = fuse_dev_poll, 1626 .poll = fuse_dev_poll,
1232 .release = fuse_dev_release, 1627 .release = fuse_dev_release,
1233 .fasync = fuse_dev_fasync, 1628 .fasync = fuse_dev_fasync,
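Note: the fuse device now supports splice() in both directions through the reworked fuse_copy_state, which walks either an iovec or a pipe_buffer array, and with SPLICE_F_MOVE whole pages can be stolen rather than copied. A hedged sketch of the userspace side, moving one raw request from /dev/fuse into a pipe (error handling abbreviated; fuse_fd is an open /dev/fuse descriptor, pfd comes from pipe(2)):

#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>

static ssize_t demo_read_request(int fuse_fd, int pfd[2],
				 char *buf, size_t bufsize)
{
	/* move the request into the pipe without a userspace copy */
	ssize_t n = splice(fuse_fd, NULL, pfd[1], NULL, bufsize,
			   SPLICE_F_MOVE);
	if (n < 0)
		return n;
	/* drain the pipe into a plain buffer for parsing */
	return read(pfd[0], buf, (size_t)n);
}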
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 4787ae6c5c1c..3cdc5f78a406 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1156,10 +1156,9 @@ static int fuse_dir_release(struct inode *inode, struct file *file)
1156 return 0; 1156 return 0;
1157} 1157}
1158 1158
1159static int fuse_dir_fsync(struct file *file, struct dentry *de, int datasync) 1159static int fuse_dir_fsync(struct file *file, int datasync)
1160{ 1160{
1161 /* nfsd can call this with no file */ 1161 return fuse_fsync_common(file, datasync, 1);
1162 return file ? fuse_fsync_common(file, de, datasync, 1) : 0;
1163} 1162}
1164 1163
1165static bool update_mtime(unsigned ivalid) 1164static bool update_mtime(unsigned ivalid)
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index a9f5e137f1d3..ada0adeb3bb5 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -351,10 +351,9 @@ static void fuse_sync_writes(struct inode *inode)
351 fuse_release_nowrite(inode); 351 fuse_release_nowrite(inode);
352} 352}
353 353
354int fuse_fsync_common(struct file *file, struct dentry *de, int datasync, 354int fuse_fsync_common(struct file *file, int datasync, int isdir)
355 int isdir)
356{ 355{
357 struct inode *inode = de->d_inode; 356 struct inode *inode = file->f_mapping->host;
358 struct fuse_conn *fc = get_fuse_conn(inode); 357 struct fuse_conn *fc = get_fuse_conn(inode);
359 struct fuse_file *ff = file->private_data; 358 struct fuse_file *ff = file->private_data;
360 struct fuse_req *req; 359 struct fuse_req *req;
@@ -403,9 +402,9 @@ int fuse_fsync_common(struct file *file, struct dentry *de, int datasync,
403 return err; 402 return err;
404} 403}
405 404
406static int fuse_fsync(struct file *file, struct dentry *de, int datasync) 405static int fuse_fsync(struct file *file, int datasync)
407{ 406{
408 return fuse_fsync_common(file, de, datasync, 0); 407 return fuse_fsync_common(file, datasync, 0);
409} 408}
410 409
411void fuse_read_fill(struct fuse_req *req, struct file *file, loff_t pos, 410void fuse_read_fill(struct fuse_req *req, struct file *file, loff_t pos,
@@ -517,17 +516,26 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req)
517 int i; 516 int i;
518 size_t count = req->misc.read.in.size; 517 size_t count = req->misc.read.in.size;
519 size_t num_read = req->out.args[0].size; 518 size_t num_read = req->out.args[0].size;
520 struct inode *inode = req->pages[0]->mapping->host; 519 struct address_space *mapping = NULL;
521 520
522 /* 521 for (i = 0; mapping == NULL && i < req->num_pages; i++)
523 * Short read means EOF. If file size is larger, truncate it 522 mapping = req->pages[i]->mapping;
524 */
525 if (!req->out.h.error && num_read < count) {
526 loff_t pos = page_offset(req->pages[0]) + num_read;
527 fuse_read_update_size(inode, pos, req->misc.read.attr_ver);
528 }
529 523
530 fuse_invalidate_attr(inode); /* atime changed */ 524 if (mapping) {
525 struct inode *inode = mapping->host;
526
527 /*
528 * Short read means EOF. If file size is larger, truncate it
529 */
530 if (!req->out.h.error && num_read < count) {
531 loff_t pos;
532
533 pos = page_offset(req->pages[0]) + num_read;
534 fuse_read_update_size(inode, pos,
535 req->misc.read.attr_ver);
536 }
537 fuse_invalidate_attr(inode); /* atime changed */
538 }
531 539
532 for (i = 0; i < req->num_pages; i++) { 540 for (i = 0; i < req->num_pages; i++) {
533 struct page *page = req->pages[i]; 541 struct page *page = req->pages[i];
@@ -536,6 +544,7 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req)
536 else 544 else
537 SetPageError(page); 545 SetPageError(page);
538 unlock_page(page); 546 unlock_page(page);
547 page_cache_release(page);
539 } 548 }
540 if (req->ff) 549 if (req->ff)
541 fuse_file_put(req->ff); 550 fuse_file_put(req->ff);
@@ -550,6 +559,7 @@ static void fuse_send_readpages(struct fuse_req *req, struct file *file)
550 559
551 req->out.argpages = 1; 560 req->out.argpages = 1;
552 req->out.page_zeroing = 1; 561 req->out.page_zeroing = 1;
562 req->out.page_replace = 1;
553 fuse_read_fill(req, file, pos, count, FUSE_READ); 563 fuse_read_fill(req, file, pos, count, FUSE_READ);
554 req->misc.read.attr_ver = fuse_get_attr_version(fc); 564 req->misc.read.attr_ver = fuse_get_attr_version(fc);
555 if (fc->async_read) { 565 if (fc->async_read) {
@@ -589,6 +599,7 @@ static int fuse_readpages_fill(void *_data, struct page *page)
589 return PTR_ERR(req); 599 return PTR_ERR(req);
590 } 600 }
591 } 601 }
602 page_cache_get(page);
592 req->pages[req->num_pages] = page; 603 req->pages[req->num_pages] = page;
593 req->num_pages++; 604 req->num_pages++;
594 return 0; 605 return 0;
@@ -994,10 +1005,7 @@ static int fuse_get_user_pages(struct fuse_req *req, const char __user *buf,
994 nbytes = min_t(size_t, nbytes, FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT); 1005 nbytes = min_t(size_t, nbytes, FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT);
995 npages = (nbytes + offset + PAGE_SIZE - 1) >> PAGE_SHIFT; 1006 npages = (nbytes + offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
996 npages = clamp(npages, 1, FUSE_MAX_PAGES_PER_REQ); 1007 npages = clamp(npages, 1, FUSE_MAX_PAGES_PER_REQ);
997 down_read(&current->mm->mmap_sem); 1008 npages = get_user_pages_fast(user_addr, npages, !write, req->pages);
998 npages = get_user_pages(current, current->mm, user_addr, npages, !write,
999 0, req->pages, NULL);
1000 up_read(&current->mm->mmap_sem);
1001 if (npages < 0) 1009 if (npages < 0)
1002 return npages; 1010 return npages;
1003 1011
@@ -1580,9 +1588,9 @@ static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov,
1580 while (iov_iter_count(&ii)) { 1588 while (iov_iter_count(&ii)) {
1581 struct page *page = pages[page_idx++]; 1589 struct page *page = pages[page_idx++];
1582 size_t todo = min_t(size_t, PAGE_SIZE, iov_iter_count(&ii)); 1590 size_t todo = min_t(size_t, PAGE_SIZE, iov_iter_count(&ii));
1583 void *kaddr, *map; 1591 void *kaddr;
1584 1592
1585 kaddr = map = kmap(page); 1593 kaddr = kmap(page);
1586 1594
1587 while (todo) { 1595 while (todo) {
1588 char __user *uaddr = ii.iov->iov_base + ii.iov_offset; 1596 char __user *uaddr = ii.iov->iov_base + ii.iov_offset;
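Note: fuse's direct-IO path also swaps the open-coded mmap_sem locking around get_user_pages() for get_user_pages_fast(), which handles the locking internally. The transformation in isolation:

	/* before: caller manages mmap_sem */
	down_read(&current->mm->mmap_sem);
	npages = get_user_pages(current, current->mm, user_addr, npages,
				!write, 0, req->pages, NULL);
	up_read(&current->mm->mmap_sem);

	/* after: one call, locking (and a lockless fast path) inside */
	npages = get_user_pages_fast(user_addr, npages, !write, req->pages);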
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 01cc462ff45d..8f309f04064e 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -177,6 +177,9 @@ struct fuse_out {
177 /** Zero partially or not copied pages */ 177 /** Zero partially or not copied pages */
178 unsigned page_zeroing:1; 178 unsigned page_zeroing:1;
179 179
180 /** Pages may be replaced with new ones */
181 unsigned page_replace:1;
182
180 /** Number or arguments */ 183 /** Number or arguments */
181 unsigned numargs; 184 unsigned numargs;
182 185
@@ -568,8 +571,7 @@ void fuse_release_common(struct file *file, int opcode);
568/** 571/**
569 * Send FSYNC or FSYNCDIR request 572 * Send FSYNC or FSYNCDIR request
570 */ 573 */
571int fuse_fsync_common(struct file *file, struct dentry *de, int datasync, 574int fuse_fsync_common(struct file *file, int datasync, int isdir);
572 int isdir);
573 575
574/** 576/**
575 * Notify poll wakeup 577 * Notify poll wakeup
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index a739a0a48067..9f8b52500d63 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -700,8 +700,14 @@ out:
700 return 0; 700 return 0;
701 701
702 page_cache_release(page); 702 page_cache_release(page);
703
704 /*
705 * XXX(hch): the call below should probably be replaced with
706 * a call to the gfs2-specific truncate blocks helper to actually
707 * release disk blocks..
708 */
703 if (pos + len > ip->i_inode.i_size) 709 if (pos + len > ip->i_inode.i_size)
704 vmtruncate(&ip->i_inode, ip->i_inode.i_size); 710 simple_setsize(&ip->i_inode, ip->i_inode.i_size);
705out_endtrans: 711out_endtrans:
706 gfs2_trans_end(sdp); 712 gfs2_trans_end(sdp);
707out_trans_fail: 713out_trans_fail:
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index b20bfcc9fa2d..ed9a94f0ef15 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -554,9 +554,9 @@ static int gfs2_close(struct inode *inode, struct file *file)
554 * Returns: errno 554 * Returns: errno
555 */ 555 */
556 556
557static int gfs2_fsync(struct file *file, struct dentry *dentry, int datasync) 557static int gfs2_fsync(struct file *file, int datasync)
558{ 558{
559 struct inode *inode = dentry->d_inode; 559 struct inode *inode = file->f_mapping->host;
560 int sync_state = inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC); 560 int sync_state = inode->i_state & (I_DIRTY_SYNC|I_DIRTY_DATASYNC);
561 int ret = 0; 561 int ret = 0;
562 562
diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c
index 4e64352d49de..98cdd05f3316 100644
--- a/fs/gfs2/ops_inode.c
+++ b/fs/gfs2/ops_inode.c
@@ -1071,6 +1071,9 @@ int gfs2_permission(struct inode *inode, int mask)
1071 return error; 1071 return error;
1072} 1072}
1073 1073
1074/*
1075 * XXX: should be changed to have proper ordering by opencoding simple_setsize
1076 */
1074static int setattr_size(struct inode *inode, struct iattr *attr) 1077static int setattr_size(struct inode *inode, struct iattr *attr)
1075{ 1078{
1076 struct gfs2_inode *ip = GFS2_I(inode); 1079 struct gfs2_inode *ip = GFS2_I(inode);
@@ -1081,7 +1084,7 @@ static int setattr_size(struct inode *inode, struct iattr *attr)
1081 error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks); 1084 error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks);
1082 if (error) 1085 if (error)
1083 return error; 1086 return error;
1084 error = vmtruncate(inode, attr->ia_size); 1087 error = simple_setsize(inode, attr->ia_size);
1085 gfs2_trans_end(sdp); 1088 gfs2_trans_end(sdp);
1086 if (error) 1089 if (error)
1087 return error; 1090 return error;
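Note: gfs2 replaces vmtruncate() with simple_setsize(), which updates i_size and truncates the page cache but, unlike vmtruncate(), does not call back into ->truncate(); as the XXX comments say, block freeing now has to be ordered explicitly by the filesystem. A sketch of the intended shape (assumption: the block-release step is filesystem-specific):

	error = simple_setsize(inode, attr->ia_size);
	if (error)
		return error;
	/* on-disk block release would follow here, under the
	 * filesystem's own transaction and locking rules */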
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 3a029d8f4cf1..87ac1891a185 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -411,9 +411,9 @@ int hostfs_file_open(struct inode *ino, struct file *file)
411 return 0; 411 return 0;
412} 412}
413 413
414int hostfs_fsync(struct file *file, struct dentry *dentry, int datasync) 414int hostfs_fsync(struct file *file, int datasync)
415{ 415{
416 return fsync_file(HOSTFS_I(dentry->d_inode)->fd, datasync); 416 return fsync_file(HOSTFS_I(file->f_mapping->host)->fd, datasync);
417} 417}
418 418
419static const struct file_operations hostfs_file_fops = { 419static const struct file_operations hostfs_file_fops = {
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index 3efabff00367..a9ae9bfa752f 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -19,9 +19,9 @@ static int hpfs_file_release(struct inode *inode, struct file *file)
19 return 0; 19 return 0;
20} 20}
21 21
22int hpfs_file_fsync(struct file *file, struct dentry *dentry, int datasync) 22int hpfs_file_fsync(struct file *file, int datasync)
23{ 23{
24 /*return file_fsync(file, dentry);*/ 24 /*return file_fsync(file, datasync);*/
25 return 0; /* Don't fsync :-) */ 25 return 0; /* Don't fsync :-) */
26} 26}
27 27
diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h
index 97bf738cd5d6..75f9d4324851 100644
--- a/fs/hpfs/hpfs_fn.h
+++ b/fs/hpfs/hpfs_fn.h
@@ -268,7 +268,7 @@ void hpfs_set_ea(struct inode *, struct fnode *, const char *,
268 268
269/* file.c */ 269/* file.c */
270 270
271int hpfs_file_fsync(struct file *, struct dentry *, int); 271int hpfs_file_fsync(struct file *, int);
272extern const struct file_operations hpfs_file_ops; 272extern const struct file_operations hpfs_file_ops;
273extern const struct inode_operations hpfs_file_iops; 273extern const struct inode_operations hpfs_file_iops;
274extern const struct address_space_operations hpfs_aops; 274extern const struct address_space_operations hpfs_aops;
diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c
index 2e4dfa8593da..826c3f9d29ac 100644
--- a/fs/hppfs/hppfs.c
+++ b/fs/hppfs/hppfs.c
@@ -587,7 +587,7 @@ static int hppfs_readdir(struct file *file, void *ent, filldir_t filldir)
587 return err; 587 return err;
588} 588}
589 589
590static int hppfs_fsync(struct file *file, struct dentry *dentry, int datasync) 590static int hppfs_fsync(struct file *file, int datasync)
591{ 591{
592 return 0; 592 return 0;
593} 593}
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index a0bbd3d1b41a..a4e9a7ec3691 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -688,7 +688,7 @@ static void init_once(void *foo)
688const struct file_operations hugetlbfs_file_operations = { 688const struct file_operations hugetlbfs_file_operations = {
689 .read = hugetlbfs_read, 689 .read = hugetlbfs_read,
690 .mmap = hugetlbfs_file_mmap, 690 .mmap = hugetlbfs_file_mmap,
691 .fsync = simple_sync_file, 691 .fsync = noop_fsync,
692 .get_unmapped_area = hugetlb_get_unmapped_area, 692 .get_unmapped_area = hugetlb_get_unmapped_area,
693}; 693};
694 694
diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c
index b9ab69b3a482..e0aca9a0ac68 100644
--- a/fs/isofs/dir.c
+++ b/fs/isofs/dir.c
@@ -272,6 +272,7 @@ static int isofs_readdir(struct file *filp,
272 272
273const struct file_operations isofs_dir_operations = 273const struct file_operations isofs_dir_operations =
274{ 274{
275 .llseek = generic_file_llseek,
275 .read = generic_read_dir, 276 .read = generic_read_dir,
276 .readdir = isofs_readdir, 277 .readdir = isofs_readdir,
277}; 278};
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index bfc70f57900f..e214d68620ac 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -1311,7 +1311,6 @@ int jbd2_journal_stop(handle_t *handle)
1311 if (handle->h_sync) 1311 if (handle->h_sync)
1312 transaction->t_synchronous_commit = 1; 1312 transaction->t_synchronous_commit = 1;
1313 current->journal_info = NULL; 1313 current->journal_info = NULL;
1314 spin_lock(&journal->j_state_lock);
1315 spin_lock(&transaction->t_handle_lock); 1314 spin_lock(&transaction->t_handle_lock);
1316 transaction->t_outstanding_credits -= handle->h_buffer_credits; 1315 transaction->t_outstanding_credits -= handle->h_buffer_credits;
1317 transaction->t_updates--; 1316 transaction->t_updates--;
@@ -1340,8 +1339,7 @@ int jbd2_journal_stop(handle_t *handle)
1340 jbd_debug(2, "transaction too old, requesting commit for " 1339 jbd_debug(2, "transaction too old, requesting commit for "
1341 "handle %p\n", handle); 1340 "handle %p\n", handle);
1342 /* This is non-blocking */ 1341 /* This is non-blocking */
1343 __jbd2_log_start_commit(journal, transaction->t_tid); 1342 jbd2_log_start_commit(journal, transaction->t_tid);
1344 spin_unlock(&journal->j_state_lock);
1345 1343
1346 /* 1344 /*
1347 * Special case: JBD2_SYNC synchronous updates require us 1345 * Special case: JBD2_SYNC synchronous updates require us
@@ -1351,7 +1349,6 @@ int jbd2_journal_stop(handle_t *handle)
1351 err = jbd2_log_wait_commit(journal, tid); 1349 err = jbd2_log_wait_commit(journal, tid);
1352 } else { 1350 } else {
1353 spin_unlock(&transaction->t_handle_lock); 1351 spin_unlock(&transaction->t_handle_lock);
1354 spin_unlock(&journal->j_state_lock);
1355 } 1352 }
1356 1353
1357 lock_map_release(&handle->h_lockdep_map); 1354 lock_map_release(&handle->h_lockdep_map);
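The jbd2 hunk drops the explicit j_state_lock round-trip from jbd2_journal_stop() by calling the locked wrapper jbd2_log_start_commit() rather than the __-prefixed variant. A sketch of that common kernel convention, with example_ names standing in for the jbd2 pair (the real functions also check whether a commit for that tid is already requested):

#include <linux/jbd2.h>

/* double-underscore variant: caller must hold journal->j_state_lock */
static int __example_log_start_commit(journal_t *journal, tid_t tid)
{
	journal->j_commit_request = tid;	/* ask for a commit of tid */
	wake_up(&journal->j_wait_commit);	/* kick the commit thread */
	return 1;
}

/* plain variant: self-locking, safe for lock-free callers such as the
 * reworked jbd2_journal_stop() above */
static int example_log_start_commit(journal_t *journal, tid_t tid)
{
	int ret;

	spin_lock(&journal->j_state_lock);
	ret = __example_log_start_commit(journal, tid);
	spin_unlock(&journal->j_state_lock);
	return ret;
}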
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index a33aab6b5e68..54a92fd02bbd 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -234,8 +234,9 @@ static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
234 if (inode->i_mode != mode) { 234 if (inode->i_mode != mode) {
235 struct iattr attr; 235 struct iattr attr;
236 236
237 attr.ia_valid = ATTR_MODE; 237 attr.ia_valid = ATTR_MODE | ATTR_CTIME;
238 attr.ia_mode = mode; 238 attr.ia_mode = mode;
239 attr.ia_ctime = CURRENT_TIME_SEC;
239 rc = jffs2_do_setattr(inode, &attr); 240 rc = jffs2_do_setattr(inode, &attr);
240 if (rc < 0) 241 if (rc < 0)
241 return rc; 242 return rc;
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index 7aa4417e085f..166062a68230 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -222,15 +222,18 @@ static int jffs2_create(struct inode *dir_i, struct dentry *dentry, int mode,
222 dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(ri->ctime)); 222 dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(ri->ctime));
223 223
224 jffs2_free_raw_inode(ri); 224 jffs2_free_raw_inode(ri);
225 d_instantiate(dentry, inode);
226 225
227 D1(printk(KERN_DEBUG "jffs2_create: Created ino #%lu with mode %o, nlink %d(%d). nrpages %ld\n", 226 D1(printk(KERN_DEBUG "jffs2_create: Created ino #%lu with mode %o, nlink %d(%d). nrpages %ld\n",
228 inode->i_ino, inode->i_mode, inode->i_nlink, 227 inode->i_ino, inode->i_mode, inode->i_nlink,
229 f->inocache->pino_nlink, inode->i_mapping->nrpages)); 228 f->inocache->pino_nlink, inode->i_mapping->nrpages));
229
230 d_instantiate(dentry, inode);
231 unlock_new_inode(inode);
230 return 0; 232 return 0;
231 233
232 fail: 234 fail:
233 make_bad_inode(inode); 235 make_bad_inode(inode);
236 unlock_new_inode(inode);
234 iput(inode); 237 iput(inode);
235 jffs2_free_raw_inode(ri); 238 jffs2_free_raw_inode(ri);
236 return ret; 239 return ret;
@@ -360,8 +363,8 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
360 /* Eeek. Wave bye bye */ 363 /* Eeek. Wave bye bye */
361 mutex_unlock(&f->sem); 364 mutex_unlock(&f->sem);
362 jffs2_complete_reservation(c); 365 jffs2_complete_reservation(c);
363 jffs2_clear_inode(inode); 366 ret = PTR_ERR(fn);
364 return PTR_ERR(fn); 367 goto fail;
365 } 368 }
366 369
367 /* We use f->target field to store the target path. */ 370 /* We use f->target field to store the target path. */
@@ -370,8 +373,8 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
370 printk(KERN_WARNING "Can't allocate %d bytes of memory\n", targetlen + 1); 373 printk(KERN_WARNING "Can't allocate %d bytes of memory\n", targetlen + 1);
371 mutex_unlock(&f->sem); 374 mutex_unlock(&f->sem);
372 jffs2_complete_reservation(c); 375 jffs2_complete_reservation(c);
373 jffs2_clear_inode(inode); 376 ret = -ENOMEM;
374 return -ENOMEM; 377 goto fail;
375 } 378 }
376 379
377 memcpy(f->target, target, targetlen + 1); 380 memcpy(f->target, target, targetlen + 1);
@@ -386,30 +389,24 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
386 jffs2_complete_reservation(c); 389 jffs2_complete_reservation(c);
387 390
388 ret = jffs2_init_security(inode, dir_i); 391 ret = jffs2_init_security(inode, dir_i);
389 if (ret) { 392 if (ret)
390 jffs2_clear_inode(inode); 393 goto fail;
391 return ret; 394
392 }
393 ret = jffs2_init_acl_post(inode); 395 ret = jffs2_init_acl_post(inode);
394 if (ret) { 396 if (ret)
395 jffs2_clear_inode(inode); 397 goto fail;
396 return ret;
397 }
398 398
399 ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen, 399 ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen,
400 ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen)); 400 ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen));
401 if (ret) { 401 if (ret)
402 /* Eep. */ 402 goto fail;
403 jffs2_clear_inode(inode);
404 return ret;
405 }
406 403
407 rd = jffs2_alloc_raw_dirent(); 404 rd = jffs2_alloc_raw_dirent();
408 if (!rd) { 405 if (!rd) {
409 /* Argh. Now we treat it like a normal delete */ 406 /* Argh. Now we treat it like a normal delete */
410 jffs2_complete_reservation(c); 407 jffs2_complete_reservation(c);
411 jffs2_clear_inode(inode); 408 ret = -ENOMEM;
412 return -ENOMEM; 409 goto fail;
413 } 410 }
414 411
415 dir_f = JFFS2_INODE_INFO(dir_i); 412 dir_f = JFFS2_INODE_INFO(dir_i);
@@ -437,8 +434,8 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
437 jffs2_complete_reservation(c); 434 jffs2_complete_reservation(c);
438 jffs2_free_raw_dirent(rd); 435 jffs2_free_raw_dirent(rd);
439 mutex_unlock(&dir_f->sem); 436 mutex_unlock(&dir_f->sem);
440 jffs2_clear_inode(inode); 437 ret = PTR_ERR(fd);
441 return PTR_ERR(fd); 438 goto fail;
442 } 439 }
443 440
444 dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime)); 441 dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime));
@@ -453,7 +450,14 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char
453 jffs2_complete_reservation(c); 450 jffs2_complete_reservation(c);
454 451
455 d_instantiate(dentry, inode); 452 d_instantiate(dentry, inode);
453 unlock_new_inode(inode);
456 return 0; 454 return 0;
455
456 fail:
457 make_bad_inode(inode);
458 unlock_new_inode(inode);
459 iput(inode);
460 return ret;
457} 461}
458 462
459 463
@@ -519,8 +523,8 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode)
519 /* Eeek. Wave bye bye */ 523 /* Eeek. Wave bye bye */
520 mutex_unlock(&f->sem); 524 mutex_unlock(&f->sem);
521 jffs2_complete_reservation(c); 525 jffs2_complete_reservation(c);
522 jffs2_clear_inode(inode); 526 ret = PTR_ERR(fn);
523 return PTR_ERR(fn); 527 goto fail;
524 } 528 }
525 /* No data here. Only a metadata node, which will be 529 /* No data here. Only a metadata node, which will be
526 obsoleted by the first data write 530 obsoleted by the first data write
@@ -531,30 +535,24 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode)
531 jffs2_complete_reservation(c); 535 jffs2_complete_reservation(c);
532 536
533 ret = jffs2_init_security(inode, dir_i); 537 ret = jffs2_init_security(inode, dir_i);
534 if (ret) { 538 if (ret)
535 jffs2_clear_inode(inode); 539 goto fail;
536 return ret; 540
537 }
538 ret = jffs2_init_acl_post(inode); 541 ret = jffs2_init_acl_post(inode);
539 if (ret) { 542 if (ret)
540 jffs2_clear_inode(inode); 543 goto fail;
541 return ret;
542 }
543 544
544 ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen, 545 ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen,
545 ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen)); 546 ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen));
546 if (ret) { 547 if (ret)
547 /* Eep. */ 548 goto fail;
548 jffs2_clear_inode(inode);
549 return ret;
550 }
551 549
552 rd = jffs2_alloc_raw_dirent(); 550 rd = jffs2_alloc_raw_dirent();
553 if (!rd) { 551 if (!rd) {
554 /* Argh. Now we treat it like a normal delete */ 552 /* Argh. Now we treat it like a normal delete */
555 jffs2_complete_reservation(c); 553 jffs2_complete_reservation(c);
556 jffs2_clear_inode(inode); 554 ret = -ENOMEM;
557 return -ENOMEM; 555 goto fail;
558 } 556 }
559 557
560 dir_f = JFFS2_INODE_INFO(dir_i); 558 dir_f = JFFS2_INODE_INFO(dir_i);
@@ -582,8 +580,8 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode)
582 jffs2_complete_reservation(c); 580 jffs2_complete_reservation(c);
583 jffs2_free_raw_dirent(rd); 581 jffs2_free_raw_dirent(rd);
584 mutex_unlock(&dir_f->sem); 582 mutex_unlock(&dir_f->sem);
585 jffs2_clear_inode(inode); 583 ret = PTR_ERR(fd);
586 return PTR_ERR(fd); 584 goto fail;
587 } 585 }
588 586
589 dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime)); 587 dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime));
@@ -599,7 +597,14 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode)
599 jffs2_complete_reservation(c); 597 jffs2_complete_reservation(c);
600 598
601 d_instantiate(dentry, inode); 599 d_instantiate(dentry, inode);
600 unlock_new_inode(inode);
602 return 0; 601 return 0;
602
603 fail:
604 make_bad_inode(inode);
605 unlock_new_inode(inode);
606 iput(inode);
607 return ret;
603} 608}
604 609
605static int jffs2_rmdir (struct inode *dir_i, struct dentry *dentry) 610static int jffs2_rmdir (struct inode *dir_i, struct dentry *dentry)
@@ -693,8 +698,8 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de
693 /* Eeek. Wave bye bye */ 698 /* Eeek. Wave bye bye */
694 mutex_unlock(&f->sem); 699 mutex_unlock(&f->sem);
695 jffs2_complete_reservation(c); 700 jffs2_complete_reservation(c);
696 jffs2_clear_inode(inode); 701 ret = PTR_ERR(fn);
697 return PTR_ERR(fn); 702 goto fail;
698 } 703 }
699 /* No data here. Only a metadata node, which will be 704 /* No data here. Only a metadata node, which will be
700 obsoleted by the first data write 705 obsoleted by the first data write
@@ -705,30 +710,24 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de
705 jffs2_complete_reservation(c); 710 jffs2_complete_reservation(c);
706 711
707 ret = jffs2_init_security(inode, dir_i); 712 ret = jffs2_init_security(inode, dir_i);
708 if (ret) { 713 if (ret)
709 jffs2_clear_inode(inode); 714 goto fail;
710 return ret; 715
711 }
712 ret = jffs2_init_acl_post(inode); 716 ret = jffs2_init_acl_post(inode);
713 if (ret) { 717 if (ret)
714 jffs2_clear_inode(inode); 718 goto fail;
715 return ret;
716 }
717 719
718 ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen, 720 ret = jffs2_reserve_space(c, sizeof(*rd)+namelen, &alloclen,
719 ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen)); 721 ALLOC_NORMAL, JFFS2_SUMMARY_DIRENT_SIZE(namelen));
720 if (ret) { 722 if (ret)
721 /* Eep. */ 723 goto fail;
722 jffs2_clear_inode(inode);
723 return ret;
724 }
725 724
726 rd = jffs2_alloc_raw_dirent(); 725 rd = jffs2_alloc_raw_dirent();
727 if (!rd) { 726 if (!rd) {
728 /* Argh. Now we treat it like a normal delete */ 727 /* Argh. Now we treat it like a normal delete */
729 jffs2_complete_reservation(c); 728 jffs2_complete_reservation(c);
730 jffs2_clear_inode(inode); 729 ret = -ENOMEM;
731 return -ENOMEM; 730 goto fail;
732 } 731 }
733 732
734 dir_f = JFFS2_INODE_INFO(dir_i); 733 dir_f = JFFS2_INODE_INFO(dir_i);
@@ -759,8 +758,8 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de
759 jffs2_complete_reservation(c); 758 jffs2_complete_reservation(c);
760 jffs2_free_raw_dirent(rd); 759 jffs2_free_raw_dirent(rd);
761 mutex_unlock(&dir_f->sem); 760 mutex_unlock(&dir_f->sem);
762 jffs2_clear_inode(inode); 761 ret = PTR_ERR(fd);
763 return PTR_ERR(fd); 762 goto fail;
764 } 763 }
765 764
766 dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime)); 765 dir_i->i_mtime = dir_i->i_ctime = ITIME(je32_to_cpu(rd->mctime));
@@ -775,8 +774,14 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de
775 jffs2_complete_reservation(c); 774 jffs2_complete_reservation(c);
776 775
777 d_instantiate(dentry, inode); 776 d_instantiate(dentry, inode);
778 777 unlock_new_inode(inode);
779 return 0; 778 return 0;
779
780 fail:
781 make_bad_inode(inode);
782 unlock_new_inode(inode);
783 iput(inode);
784 return ret;
780} 785}
781 786
782static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry, 787static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry,
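All four jffs2 creation paths (create, symlink, mkdir, mknod) converge on the same shape in these hunks: the scattered jffs2_clear_inode() calls become a single fail: label, and d_instantiate() moves to the end so the dentry is only published once the inode is complete, paired with the unlock_new_inode() that the insert_inode_locked() change in fs.c below requires. A condensed sketch of the pattern with invented names:

#include <linux/fs.h>

static int examplefs_setup(struct inode *inode);	/* hypothetical step */

static int examplefs_create_tail(struct inode *dir, struct dentry *dentry,
				 struct inode *inode)
{
	int ret;

	ret = examplefs_setup(inode);		/* any post-allocation work */
	if (ret)
		goto fail;

	d_instantiate(dentry, inode);		/* publish only when complete */
	unlock_new_inode(inode);		/* pairs with insert_inode_locked() */
	return 0;

 fail:
	make_bad_inode(inode);			/* concurrent iget sees a dud */
	unlock_new_inode(inode);
	iput(inode);				/* final reference drop */
	return ret;
}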
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index e7291c161a19..813497024437 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -26,9 +26,9 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping,
26 struct page **pagep, void **fsdata); 26 struct page **pagep, void **fsdata);
27static int jffs2_readpage (struct file *filp, struct page *pg); 27static int jffs2_readpage (struct file *filp, struct page *pg);
28 28
29int jffs2_fsync(struct file *filp, struct dentry *dentry, int datasync) 29int jffs2_fsync(struct file *filp, int datasync)
30{ 30{
31 struct inode *inode = dentry->d_inode; 31 struct inode *inode = filp->f_mapping->host;
32 struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb); 32 struct jffs2_sb_info *c = JFFS2_SB_INFO(inode->i_sb);
33 33
34 /* Trigger GC to flush any pending writes for this inode */ 34 /* Trigger GC to flush any pending writes for this inode */
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 86e0821fc989..459d39d1ea0b 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -169,13 +169,13 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr)
169 mutex_unlock(&f->sem); 169 mutex_unlock(&f->sem);
170 jffs2_complete_reservation(c); 170 jffs2_complete_reservation(c);
171 171
172 /* We have to do the vmtruncate() without f->sem held, since 172 /* We have to do the simple_setsize() without f->sem held, since
173 some pages may be locked and waiting for it in readpage(). 173 some pages may be locked and waiting for it in readpage().
174 We are protected from a simultaneous write() extending i_size 174 We are protected from a simultaneous write() extending i_size
175 back past iattr->ia_size, because do_truncate() holds the 175 back past iattr->ia_size, because do_truncate() holds the
176 generic inode semaphore. */ 176 generic inode semaphore. */
177 if (ivalid & ATTR_SIZE && inode->i_size > iattr->ia_size) { 177 if (ivalid & ATTR_SIZE && inode->i_size > iattr->ia_size) {
178 vmtruncate(inode, iattr->ia_size); 178 simple_setsize(inode, iattr->ia_size);
179 inode->i_blocks = (inode->i_size + 511) >> 9; 179 inode->i_blocks = (inode->i_size + 511) >> 9;
180 } 180 }
181 181
@@ -465,7 +465,12 @@ struct inode *jffs2_new_inode (struct inode *dir_i, int mode, struct jffs2_raw_i
465 inode->i_blocks = 0; 465 inode->i_blocks = 0;
466 inode->i_size = 0; 466 inode->i_size = 0;
467 467
468 insert_inode_hash(inode); 468 if (insert_inode_locked(inode) < 0) {
469 make_bad_inode(inode);
470 unlock_new_inode(inode);
471 iput(inode);
472 return ERR_PTR(-EINVAL);
473 }
469 474
470 return inode; 475 return inode;
471} 476}
diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h
index 035a767f958b..4791aacf3084 100644
--- a/fs/jffs2/os-linux.h
+++ b/fs/jffs2/os-linux.h
@@ -158,7 +158,7 @@ extern const struct inode_operations jffs2_dir_inode_operations;
158extern const struct file_operations jffs2_file_operations; 158extern const struct file_operations jffs2_file_operations;
159extern const struct inode_operations jffs2_file_inode_operations; 159extern const struct inode_operations jffs2_file_inode_operations;
160extern const struct address_space_operations jffs2_file_address_operations; 160extern const struct address_space_operations jffs2_file_address_operations;
161int jffs2_fsync(struct file *, struct dentry *, int); 161int jffs2_fsync(struct file *, int);
162int jffs2_do_readpage_unlock (struct inode *inode, struct page *pg); 162int jffs2_do_readpage_unlock (struct inode *inode, struct page *pg);
163 163
164/* ioctl.c */ 164/* ioctl.c */
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index 85d9ec659225..127263cc8657 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -27,9 +27,9 @@
27#include "jfs_acl.h" 27#include "jfs_acl.h"
28#include "jfs_debug.h" 28#include "jfs_debug.h"
29 29
30int jfs_fsync(struct file *file, struct dentry *dentry, int datasync) 30int jfs_fsync(struct file *file, int datasync)
31{ 31{
32 struct inode *inode = dentry->d_inode; 32 struct inode *inode = file->f_mapping->host;
33 int rc = 0; 33 int rc = 0;
34 34
35 if (!(inode->i_state & I_DIRTY) || 35 if (!(inode->i_state & I_DIRTY) ||
diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h
index 9e6bda30a6e8..11042b1f44b5 100644
--- a/fs/jfs/jfs_inode.h
+++ b/fs/jfs/jfs_inode.h
@@ -21,7 +21,7 @@
21struct fid; 21struct fid;
22 22
23extern struct inode *ialloc(struct inode *, umode_t); 23extern struct inode *ialloc(struct inode *, umode_t);
24extern int jfs_fsync(struct file *, struct dentry *, int); 24extern int jfs_fsync(struct file *, int);
25extern long jfs_ioctl(struct file *, unsigned int, unsigned long); 25extern long jfs_ioctl(struct file *, unsigned int, unsigned long);
26extern long jfs_compat_ioctl(struct file *, unsigned int, unsigned long); 26extern long jfs_compat_ioctl(struct file *, unsigned int, unsigned long);
27extern struct inode *jfs_iget(struct super_block *, unsigned long); 27extern struct inode *jfs_iget(struct super_block *, unsigned long);
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index b66832ac33ac..b38f96bef829 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -179,6 +179,8 @@ static void jfs_put_super(struct super_block *sb)
179 179
180 jfs_info("In jfs_put_super"); 180 jfs_info("In jfs_put_super");
181 181
182 dquot_disable(sb, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
183
182 lock_kernel(); 184 lock_kernel();
183 185
184 rc = jfs_umount(sb); 186 rc = jfs_umount(sb);
@@ -396,10 +398,20 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data)
396 398
397 JFS_SBI(sb)->flag = flag; 399 JFS_SBI(sb)->flag = flag;
398 ret = jfs_mount_rw(sb, 1); 400 ret = jfs_mount_rw(sb, 1);
401
402 /* mark the fs r/w for quota activity */
403 sb->s_flags &= ~MS_RDONLY;
404
399 unlock_kernel(); 405 unlock_kernel();
406 dquot_resume(sb, -1);
400 return ret; 407 return ret;
401 } 408 }
402 if ((!(sb->s_flags & MS_RDONLY)) && (*flags & MS_RDONLY)) { 409 if ((!(sb->s_flags & MS_RDONLY)) && (*flags & MS_RDONLY)) {
410 rc = dquot_suspend(sb, -1);
411 if (rc < 0) {
412 unlock_kernel();
413 return rc;
414 }
403 rc = jfs_umount_rw(sb); 415 rc = jfs_umount_rw(sb);
404 JFS_SBI(sb)->flag = flag; 416 JFS_SBI(sb)->flag = flag;
405 unlock_kernel(); 417 unlock_kernel();
@@ -469,6 +481,10 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
469 */ 481 */
470 sb->s_op = &jfs_super_operations; 482 sb->s_op = &jfs_super_operations;
471 sb->s_export_op = &jfs_export_operations; 483 sb->s_export_op = &jfs_export_operations;
484#ifdef CONFIG_QUOTA
485 sb->dq_op = &dquot_operations;
486 sb->s_qcop = &dquot_quotactl_ops;
487#endif
472 488
473 /* 489 /*
474 * Initialize direct-mapping inode/address-space 490 * Initialize direct-mapping inode/address-space
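jfs is converted here to the renamed generic quota API: it wires sb->dq_op and sb->s_qcop itself, disables quotas in put_super, and brackets remount with dquot_suspend()/dquot_resume(). A simplified sketch of that remount choreography; the lock_kernel() juggling visible above is omitted and the example_ helpers are invented:

#include <linux/fs.h>
#include <linux/quotaops.h>

static int example_mount_rw(struct super_block *sb);	/* hypothetical */
static int example_umount_rw(struct super_block *sb);	/* hypothetical */

static int example_remount(struct super_block *sb, int *flags)
{
	if ((sb->s_flags & MS_RDONLY) && !(*flags & MS_RDONLY)) {
		int ret = example_mount_rw(sb);

		sb->s_flags &= ~MS_RDONLY;	/* quota writes need r/w */
		dquot_resume(sb, -1);		/* -1: all quota types */
		return ret;
	}
	if (!(sb->s_flags & MS_RDONLY) && (*flags & MS_RDONLY)) {
		int ret = dquot_suspend(sb, -1);/* before write access goes */

		if (ret < 0)
			return ret;
		return example_umount_rw(sb);
	}
	return 0;
}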
diff --git a/fs/libfs.c b/fs/libfs.c
index 232bea425b09..dcaf972cbf1b 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -8,6 +8,7 @@
8#include <linux/slab.h> 8#include <linux/slab.h>
9#include <linux/mount.h> 9#include <linux/mount.h>
10#include <linux/vfs.h> 10#include <linux/vfs.h>
11#include <linux/quotaops.h>
11#include <linux/mutex.h> 12#include <linux/mutex.h>
12#include <linux/exportfs.h> 13#include <linux/exportfs.h>
13#include <linux/writeback.h> 14#include <linux/writeback.h>
@@ -58,11 +59,6 @@ struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, struct na
58 return NULL; 59 return NULL;
59} 60}
60 61
61int simple_sync_file(struct file * file, struct dentry *dentry, int datasync)
62{
63 return 0;
64}
65
66int dcache_dir_open(struct inode *inode, struct file *file) 62int dcache_dir_open(struct inode *inode, struct file *file)
67{ 63{
68 static struct qstr cursor_name = {.len = 1, .name = "."}; 64 static struct qstr cursor_name = {.len = 1, .name = "."};
@@ -190,7 +186,7 @@ const struct file_operations simple_dir_operations = {
190 .llseek = dcache_dir_lseek, 186 .llseek = dcache_dir_lseek,
191 .read = generic_read_dir, 187 .read = generic_read_dir,
192 .readdir = dcache_readdir, 188 .readdir = dcache_readdir,
193 .fsync = simple_sync_file, 189 .fsync = noop_fsync,
194}; 190};
195 191
196const struct inode_operations simple_dir_inode_operations = { 192const struct inode_operations simple_dir_inode_operations = {
@@ -330,6 +326,81 @@ int simple_rename(struct inode *old_dir, struct dentry *old_dentry,
330 return 0; 326 return 0;
331} 327}
332 328
329/**
330 * simple_setsize - handle core mm and vfs requirements for file size change
331 * @inode: inode
332 * @newsize: new file size
333 *
334 * Returns 0 on success, -error on failure.
335 *
336 * simple_setsize must be called with inode_mutex held.
337 *
338 * simple_setsize will check that the requested new size is OK (see
339 * inode_newsize_ok), and then will perform the necessary i_size update
340 * and pagecache truncation (if necessary). It will typically be called
341 * from the filesystem's setattr function when ATTR_SIZE is passed in.
342 *
343 * The inode itself must have correct permissions and attributes to allow
344 * i_size to be changed; this function then just checks that the new size
345 * requested is valid.
346 *
347 * In the case of simple in-memory filesystems with inodes stored solely
348 * in the inode cache, and file data in the pagecache, nothing more needs
349 * to be done to satisfy a truncate request. Filesystems with on-disk
350 * blocks, for example, will need to free them in the case of truncate; in
351 * that case it may be easier not to use simple_setsize (but each of its
352 * components will likely be required at some point to update pagecache
353 * and inode etc).
354 */
355int simple_setsize(struct inode *inode, loff_t newsize)
356{
357 loff_t oldsize;
358 int error;
359
360 error = inode_newsize_ok(inode, newsize);
361 if (error)
362 return error;
363
364 oldsize = inode->i_size;
365 i_size_write(inode, newsize);
366 truncate_pagecache(inode, oldsize, newsize);
367
368 return error;
369}
370EXPORT_SYMBOL(simple_setsize);
371
372/**
373 * simple_setattr - setattr for simple in-memory filesystem
374 * @dentry: dentry
375 * @iattr: iattr structure
376 *
377 * Returns 0 on success, -error on failure.
378 *
379 * simple_setattr implements setattr for an in-memory filesystem which
380 * does not store its own file data or metadata (e.g. uses the page cache
381 * and inode cache as its data store).
382 */
383int simple_setattr(struct dentry *dentry, struct iattr *iattr)
384{
385 struct inode *inode = dentry->d_inode;
386 int error;
387
388 error = inode_change_ok(inode, iattr);
389 if (error)
390 return error;
391
392 if (iattr->ia_valid & ATTR_SIZE) {
393 error = simple_setsize(inode, iattr->ia_size);
394 if (error)
395 return error;
396 }
397
398 generic_setattr(inode, iattr);
399
400 return error;
401}
402EXPORT_SYMBOL(simple_setattr);
403
333int simple_readpage(struct file *file, struct page *page) 404int simple_readpage(struct file *file, struct page *page)
334{ 405{
335 clear_highpage(page); 406 clear_highpage(page);
@@ -418,7 +489,8 @@ int simple_write_end(struct file *file, struct address_space *mapping,
418 * unique inode values later for this filesystem, then you must take care 489 * unique inode values later for this filesystem, then you must take care
419 * to pass it an appropriate max_reserved value to avoid collisions. 490 * to pass it an appropriate max_reserved value to avoid collisions.
420 */ 491 */
421int simple_fill_super(struct super_block *s, int magic, struct tree_descr *files) 492int simple_fill_super(struct super_block *s, unsigned long magic,
493 struct tree_descr *files)
422{ 494{
423 struct inode *inode; 495 struct inode *inode;
424 struct dentry *root; 496 struct dentry *root;
@@ -851,13 +923,22 @@ struct dentry *generic_fh_to_parent(struct super_block *sb, struct fid *fid,
851} 923}
852EXPORT_SYMBOL_GPL(generic_fh_to_parent); 924EXPORT_SYMBOL_GPL(generic_fh_to_parent);
853 925
854int simple_fsync(struct file *file, struct dentry *dentry, int datasync) 926/**
927 * generic_file_fsync - generic fsync implementation for simple filesystems
928 * @file: file to synchronize
929 * @datasync: only synchronize essential metadata if true
930 *
931 * This is a generic implementation of the fsync method for simple
932 * filesystems which track all non-inode metadata in the buffers list
933 * hanging off the address_space structure.
934 */
935int generic_file_fsync(struct file *file, int datasync)
855{ 936{
856 struct writeback_control wbc = { 937 struct writeback_control wbc = {
857 .sync_mode = WB_SYNC_ALL, 938 .sync_mode = WB_SYNC_ALL,
858 .nr_to_write = 0, /* metadata-only; caller takes care of data */ 939 .nr_to_write = 0, /* metadata-only; caller takes care of data */
859 }; 940 };
860 struct inode *inode = dentry->d_inode; 941 struct inode *inode = file->f_mapping->host;
861 int err; 942 int err;
862 int ret; 943 int ret;
863 944
@@ -872,7 +953,15 @@ int simple_fsync(struct file *file, struct dentry *dentry, int datasync)
872 ret = err; 953 ret = err;
873 return ret; 954 return ret;
874} 955}
875EXPORT_SYMBOL(simple_fsync); 956EXPORT_SYMBOL(generic_file_fsync);
957
958/*
959 * No-op implementation of ->fsync for in-memory filesystems.
960 */
961int noop_fsync(struct file *file, int datasync)
962{
963 return 0;
964}
876 965
877EXPORT_SYMBOL(dcache_dir_close); 966EXPORT_SYMBOL(dcache_dir_close);
878EXPORT_SYMBOL(dcache_dir_lseek); 967EXPORT_SYMBOL(dcache_dir_lseek);
@@ -895,7 +984,7 @@ EXPORT_SYMBOL(simple_release_fs);
895EXPORT_SYMBOL(simple_rename); 984EXPORT_SYMBOL(simple_rename);
896EXPORT_SYMBOL(simple_rmdir); 985EXPORT_SYMBOL(simple_rmdir);
897EXPORT_SYMBOL(simple_statfs); 986EXPORT_SYMBOL(simple_statfs);
898EXPORT_SYMBOL(simple_sync_file); 987EXPORT_SYMBOL(noop_fsync);
899EXPORT_SYMBOL(simple_unlink); 988EXPORT_SYMBOL(simple_unlink);
900EXPORT_SYMBOL(simple_read_from_buffer); 989EXPORT_SYMBOL(simple_read_from_buffer);
901EXPORT_SYMBOL(simple_write_to_buffer); 990EXPORT_SYMBOL(simple_write_to_buffer);
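Summarizing the libfs renames above for consumers: simple_fsync becomes generic_file_fsync (it writes inode metadata via the buffer list, so it suits block-backed filesystems), simple_sync_file becomes noop_fsync (for purely in-memory filesystems), and the new simple_setsize/simple_setattr pair covers size changes that free no on-disk blocks. A sketch of how a filesystem would wire them up; the ops tables are illustrative, not from this diff:

#include <linux/fs.h>

/* block-device backed: fsync must push inode metadata buffers */
static const struct file_operations example_disk_file_ops = {
	.read		= do_sync_read,
	.aio_read	= generic_file_aio_read,
	.mmap		= generic_file_mmap,
	.fsync		= generic_file_fsync,	/* was simple_fsync */
};

/* purely in-memory: nothing to write back on fsync */
static const struct file_operations example_ram_file_ops = {
	.read		= do_sync_read,
	.aio_read	= generic_file_aio_read,
	.mmap		= generic_file_mmap,
	.fsync		= noop_fsync,		/* was simple_sync_file */
};

/* truncate == i_size update + pagecache trim, no disk blocks to free */
static const struct inode_operations example_ram_inode_ops = {
	.setattr	= simple_setattr,
};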
diff --git a/fs/logfs/file.c b/fs/logfs/file.c
index 0de524071870..abe1cafbd4c2 100644
--- a/fs/logfs/file.c
+++ b/fs/logfs/file.c
@@ -219,9 +219,9 @@ int logfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
219 } 219 }
220} 220}
221 221
222int logfs_fsync(struct file *file, struct dentry *dentry, int datasync) 222int logfs_fsync(struct file *file, int datasync)
223{ 223{
224 struct super_block *sb = dentry->d_inode->i_sb; 224 struct super_block *sb = file->f_mapping->host->i_sb;
225 225
226 logfs_write_anchor(sb); 226 logfs_write_anchor(sb);
227 return 0; 227 return 0;
diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h
index 1a9db84f8d8f..c838c4d72111 100644
--- a/fs/logfs/logfs.h
+++ b/fs/logfs/logfs.h
@@ -506,7 +506,7 @@ extern const struct address_space_operations logfs_reg_aops;
506int logfs_readpage(struct file *file, struct page *page); 506int logfs_readpage(struct file *file, struct page *page);
507int logfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd, 507int logfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
508 unsigned long arg); 508 unsigned long arg);
509int logfs_fsync(struct file *file, struct dentry *dentry, int datasync); 509int logfs_fsync(struct file *file, int datasync);
510 510
511/* gc.c */ 511/* gc.c */
512u32 get_best_cand(struct super_block *sb, struct candidate_list *list, u32 *ec); 512u32 get_best_cand(struct super_block *sb, struct candidate_list *list, u32 *ec);
diff --git a/fs/minix/dir.c b/fs/minix/dir.c
index 6198731d7fcd..1dbf921ca44b 100644
--- a/fs/minix/dir.c
+++ b/fs/minix/dir.c
@@ -22,7 +22,7 @@ const struct file_operations minix_dir_operations = {
22 .llseek = generic_file_llseek, 22 .llseek = generic_file_llseek,
23 .read = generic_read_dir, 23 .read = generic_read_dir,
24 .readdir = minix_readdir, 24 .readdir = minix_readdir,
25 .fsync = simple_fsync, 25 .fsync = generic_file_fsync,
26}; 26};
27 27
28static inline void dir_put_page(struct page *page) 28static inline void dir_put_page(struct page *page)
@@ -72,16 +72,9 @@ static struct page * dir_get_page(struct inode *dir, unsigned long n)
72{ 72{
73 struct address_space *mapping = dir->i_mapping; 73 struct address_space *mapping = dir->i_mapping;
74 struct page *page = read_mapping_page(mapping, n, NULL); 74 struct page *page = read_mapping_page(mapping, n, NULL);
75 if (!IS_ERR(page)) { 75 if (!IS_ERR(page))
76 kmap(page); 76 kmap(page);
77 if (!PageUptodate(page))
78 goto fail;
79 }
80 return page; 77 return page;
81
82fail:
83 dir_put_page(page);
84 return ERR_PTR(-EIO);
85} 78}
86 79
87static inline void *minix_next_entry(void *de, struct minix_sb_info *sbi) 80static inline void *minix_next_entry(void *de, struct minix_sb_info *sbi)
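The dir_get_page() simplification works because read_mapping_page() resolves to read_cache_page(), which already waits for the I/O and returns ERR_PTR(-EIO) itself if the page never became uptodate; the open-coded PageUptodate() check was redundant. The resulting idiom, sketched with an invented name:

#include <linux/pagemap.h>
#include <linux/highmem.h>

static struct page *example_get_page(struct address_space *mapping,
				     unsigned long n)
{
	/* returns ERR_PTR on read failure; no PageUptodate() check needed */
	struct page *page = read_mapping_page(mapping, n, NULL);

	if (!IS_ERR(page))
		kmap(page);	/* caller kunmap()s and releases the page */
	return page;
}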
diff --git a/fs/minix/file.c b/fs/minix/file.c
index 3eec3e607a87..d5320ff23faf 100644
--- a/fs/minix/file.c
+++ b/fs/minix/file.c
@@ -19,7 +19,7 @@ const struct file_operations minix_file_operations = {
19 .write = do_sync_write, 19 .write = do_sync_write,
20 .aio_write = generic_file_aio_write, 20 .aio_write = generic_file_aio_write,
21 .mmap = generic_file_mmap, 21 .mmap = generic_file_mmap,
22 .fsync = simple_fsync, 22 .fsync = generic_file_fsync,
23 .splice_read = generic_file_splice_read, 23 .splice_read = generic_file_splice_read,
24}; 24};
25 25
diff --git a/fs/minix/itree_v2.c b/fs/minix/itree_v2.c
index f23010969369..13487ad16894 100644
--- a/fs/minix/itree_v2.c
+++ b/fs/minix/itree_v2.c
@@ -20,6 +20,9 @@ static inline block_t *i_data(struct inode *inode)
20 return (block_t *)minix_i(inode)->u.i2_data; 20 return (block_t *)minix_i(inode)->u.i2_data;
21} 21}
22 22
23#define DIRCOUNT 7
24#define INDIRCOUNT(sb) (1 << ((sb)->s_blocksize_bits - 2))
25
23static int block_to_path(struct inode * inode, long block, int offsets[DEPTH]) 26static int block_to_path(struct inode * inode, long block, int offsets[DEPTH])
24{ 27{
25 int n = 0; 28 int n = 0;
@@ -34,21 +37,21 @@ static int block_to_path(struct inode * inode, long block, int offsets[DEPTH])
34 printk("MINIX-fs: block_to_path: " 37 printk("MINIX-fs: block_to_path: "
35 "block %ld too big on dev %s\n", 38 "block %ld too big on dev %s\n",
36 block, bdevname(sb->s_bdev, b)); 39 block, bdevname(sb->s_bdev, b));
37 } else if (block < 7) { 40 } else if (block < DIRCOUNT) {
38 offsets[n++] = block; 41 offsets[n++] = block;
39 } else if ((block -= 7) < 256) { 42 } else if ((block -= DIRCOUNT) < INDIRCOUNT(sb)) {
40 offsets[n++] = 7; 43 offsets[n++] = DIRCOUNT;
41 offsets[n++] = block; 44 offsets[n++] = block;
42 } else if ((block -= 256) < 256*256) { 45 } else if ((block -= INDIRCOUNT(sb)) < INDIRCOUNT(sb) * INDIRCOUNT(sb)) {
43 offsets[n++] = 8; 46 offsets[n++] = DIRCOUNT + 1;
44 offsets[n++] = block>>8; 47 offsets[n++] = block / INDIRCOUNT(sb);
45 offsets[n++] = block & 255; 48 offsets[n++] = block % INDIRCOUNT(sb);
46 } else { 49 } else {
47 block -= 256*256; 50 block -= INDIRCOUNT(sb) * INDIRCOUNT(sb);
48 offsets[n++] = 9; 51 offsets[n++] = DIRCOUNT + 2;
49 offsets[n++] = block>>16; 52 offsets[n++] = (block / INDIRCOUNT(sb)) / INDIRCOUNT(sb);
50 offsets[n++] = (block>>8) & 255; 53 offsets[n++] = (block / INDIRCOUNT(sb)) % INDIRCOUNT(sb);
51 offsets[n++] = block & 255; 54 offsets[n++] = block % INDIRCOUNT(sb);
52 } 55 }
53 return n; 56 return n;
54} 57}
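The minix change replaces the hard-coded 7/256 layout with constants derived from the block size: V2 block pointers are 4 bytes, so a block holds 1 << (s_blocksize_bits - 2) of them, which is 256 for the historical 1 KiB blocks and 512 for 2 KiB. A standalone, runnable rendition of the same arithmetic (plain C, not kernel code):

#include <stdio.h>

#define DIRCOUNT 7
#define INDIRCOUNT(bits) (1ul << ((bits) - 2))	/* 4-byte pointers per block */

static int block_to_path(long block, int offsets[4], int blocksize_bits)
{
	unsigned long per = INDIRCOUNT(blocksize_bits);
	int n = 0;

	if (block < DIRCOUNT) {				/* direct */
		offsets[n++] = block;
	} else if ((block -= DIRCOUNT) < per) {		/* single indirect */
		offsets[n++] = DIRCOUNT;
		offsets[n++] = block;
	} else if ((block -= per) < per * per) {	/* double indirect */
		offsets[n++] = DIRCOUNT + 1;
		offsets[n++] = block / per;
		offsets[n++] = block % per;
	} else {					/* triple indirect */
		block -= per * per;
		offsets[n++] = DIRCOUNT + 2;
		offsets[n++] = (block / per) / per;
		offsets[n++] = (block / per) % per;
		offsets[n++] = block % per;
	}
	return n;
}

int main(void)
{
	int off[4], n, i;

	/* block 268 with 1 KiB blocks: past 7 direct and 256 single-indirect
	 * slots, landing in the 6th double-indirect slot -> path "8 0 5" */
	n = block_to_path(268, off, 10);
	for (i = 0; i < n; i++)
		printf("%d ", off[i]);
	printf("\n");
	return 0;
}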
diff --git a/fs/namei.c b/fs/namei.c
index 48e1f60520ea..868d0cb9d473 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1621,6 +1621,7 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
1621 case LAST_DOTDOT: 1621 case LAST_DOTDOT:
1622 follow_dotdot(nd); 1622 follow_dotdot(nd);
1623 dir = nd->path.dentry; 1623 dir = nd->path.dentry;
1624 case LAST_DOT:
1624 if (nd->path.mnt->mnt_sb->s_type->fs_flags & FS_REVAL_DOT) { 1625 if (nd->path.mnt->mnt_sb->s_type->fs_flags & FS_REVAL_DOT) {
1625 if (!dir->d_op->d_revalidate(dir, nd)) { 1626 if (!dir->d_op->d_revalidate(dir, nd)) {
1626 error = -ESTALE; 1627 error = -ESTALE;
@@ -1628,7 +1629,6 @@ static struct file *do_last(struct nameidata *nd, struct path *path,
1628 } 1629 }
1629 } 1630 }
1630 /* fallthrough */ 1631 /* fallthrough */
1631 case LAST_DOT:
1632 case LAST_ROOT: 1632 case LAST_ROOT:
1633 if (open_flag & O_CREAT) 1633 if (open_flag & O_CREAT)
1634 goto exit; 1634 goto exit;
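The namei.c hunk moves the LAST_DOT label so that a path ending in "." takes the FS_REVAL_DOT revalidation that previously only ".." received; both then fall through to the O_CREAT rejection shared with LAST_ROOT. The resulting control flow, condensed (variables as in do_last() above):

	switch (nd->last_type) {
	case LAST_DOTDOT:
		follow_dotdot(nd);
		dir = nd->path.dentry;
		/* fall through: "." is now revalidated like ".." */
	case LAST_DOT:
		if (nd->path.mnt->mnt_sb->s_type->fs_flags & FS_REVAL_DOT) {
			if (!dir->d_op->d_revalidate(dir, nd)) {
				error = -ESTALE;
				goto exit;
			}
		}
		/* fall through */
	case LAST_ROOT:
		if (open_flag & O_CREAT)
			goto exit;	/* cannot O_CREAT ".", ".." or "/" */
		break;
	}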
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index 92dde6f8d893..9578cbe0cd58 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -49,6 +49,7 @@ extern int ncp_symlink(struct inode *, struct dentry *, const char *);
49 49
50const struct file_operations ncp_dir_operations = 50const struct file_operations ncp_dir_operations =
51{ 51{
52 .llseek = generic_file_llseek,
52 .read = generic_read_dir, 53 .read = generic_read_dir,
53 .readdir = ncp_readdir, 54 .readdir = ncp_readdir,
54 .unlocked_ioctl = ncp_ioctl, 55 .unlocked_ioctl = ncp_ioctl,
diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c
index b93870892892..3639cc5cbdae 100644
--- a/fs/ncpfs/file.c
+++ b/fs/ncpfs/file.c
@@ -22,7 +22,7 @@
22#include <linux/ncp_fs.h> 22#include <linux/ncp_fs.h>
23#include "ncplib_kernel.h" 23#include "ncplib_kernel.h"
24 24
25static int ncp_fsync(struct file *file, struct dentry *dentry, int datasync) 25static int ncp_fsync(struct file *file, int datasync)
26{ 26{
27 return 0; 27 return 0;
28} 28}
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index ee9a179ebdf3..782b431ef91c 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -53,7 +53,7 @@ static int nfs_link(struct dentry *, struct inode *, struct dentry *);
53static int nfs_mknod(struct inode *, struct dentry *, int, dev_t); 53static int nfs_mknod(struct inode *, struct dentry *, int, dev_t);
54static int nfs_rename(struct inode *, struct dentry *, 54static int nfs_rename(struct inode *, struct dentry *,
55 struct inode *, struct dentry *); 55 struct inode *, struct dentry *);
56static int nfs_fsync_dir(struct file *, struct dentry *, int); 56static int nfs_fsync_dir(struct file *, int);
57static loff_t nfs_llseek_dir(struct file *, loff_t, int); 57static loff_t nfs_llseek_dir(struct file *, loff_t, int);
58 58
59const struct file_operations nfs_dir_operations = { 59const struct file_operations nfs_dir_operations = {
@@ -641,8 +641,10 @@ out:
641 * All directory operations under NFS are synchronous, so fsync() 641 * All directory operations under NFS are synchronous, so fsync()
642 * is a dummy operation. 642 * is a dummy operation.
643 */ 643 */
644static int nfs_fsync_dir(struct file *filp, struct dentry *dentry, int datasync) 644static int nfs_fsync_dir(struct file *filp, int datasync)
645{ 645{
646 struct dentry *dentry = filp->f_path.dentry;
647
646 dfprintk(FILE, "NFS: fsync dir(%s/%s) datasync %d\n", 648 dfprintk(FILE, "NFS: fsync dir(%s/%s) datasync %d\n",
647 dentry->d_parent->d_name.name, dentry->d_name.name, 649 dentry->d_parent->d_name.name, dentry->d_name.name,
648 datasync); 650 datasync);
@@ -1741,6 +1743,7 @@ remove_lru_entry:
1741 clear_bit(NFS_INO_ACL_LRU_SET, &nfsi->flags); 1743 clear_bit(NFS_INO_ACL_LRU_SET, &nfsi->flags);
1742 smp_mb__after_clear_bit(); 1744 smp_mb__after_clear_bit();
1743 } 1745 }
1746 spin_unlock(&inode->i_lock);
1744 } 1747 }
1745 spin_unlock(&nfs_access_lru_lock); 1748 spin_unlock(&nfs_access_lru_lock);
1746 nfs_access_free_list(&head); 1749 nfs_access_free_list(&head);
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index cac96bcc91e4..36a5e74f51b4 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -53,7 +53,7 @@ static ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe,
53static ssize_t nfs_file_write(struct kiocb *, const struct iovec *iov, 53static ssize_t nfs_file_write(struct kiocb *, const struct iovec *iov,
54 unsigned long nr_segs, loff_t pos); 54 unsigned long nr_segs, loff_t pos);
55static int nfs_file_flush(struct file *, fl_owner_t id); 55static int nfs_file_flush(struct file *, fl_owner_t id);
56static int nfs_file_fsync(struct file *, struct dentry *dentry, int datasync); 56static int nfs_file_fsync(struct file *, int datasync);
57static int nfs_check_flags(int flags); 57static int nfs_check_flags(int flags);
58static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl); 58static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl);
59static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl); 59static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl);
@@ -322,8 +322,9 @@ nfs_file_mmap(struct file * file, struct vm_area_struct * vma)
322 * whether any write errors occurred for this process. 322 * whether any write errors occurred for this process.
323 */ 323 */
324static int 324static int
325nfs_file_fsync(struct file *file, struct dentry *dentry, int datasync) 325nfs_file_fsync(struct file *file, int datasync)
326{ 326{
327 struct dentry *dentry = file->f_path.dentry;
327 struct nfs_open_context *ctx = nfs_file_open_context(file); 328 struct nfs_open_context *ctx = nfs_file_open_context(file);
328 struct inode *inode = dentry->d_inode; 329 struct inode *inode = dentry->d_inode;
329 330
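NFS shows the companion idiom to the file->f_mapping->host conversion: where the implementation still wants the dentry (here only for debug output), it is recovered from file->f_path.dentry rather than being passed in. A minimal sketch with invented names:

#include <linux/kernel.h>
#include <linux/fs.h>

static int example_sync_inode(struct inode *inode, int datasync);	/* hypothetical */

static int example_fsync(struct file *file, int datasync)
{
	struct dentry *dentry = file->f_path.dentry;	/* for naming only */

	pr_debug("fsync %s/%s datasync %d\n",
		 dentry->d_parent->d_name.name, dentry->d_name.name,
		 datasync);
	return example_sync_inode(dentry->d_inode, datasync);
}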
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 3aea3ca98ab7..91679e2631ee 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1386,7 +1386,7 @@ static int nfs_commit_inode(struct inode *inode, int how)
1386 int res = 0; 1386 int res = 0;
1387 1387
1388 if (!nfs_commit_set_lock(NFS_I(inode), may_wait)) 1388 if (!nfs_commit_set_lock(NFS_I(inode), may_wait))
1389 goto out; 1389 goto out_mark_dirty;
1390 spin_lock(&inode->i_lock); 1390 spin_lock(&inode->i_lock);
1391 res = nfs_scan_commit(inode, &head, 0, 0); 1391 res = nfs_scan_commit(inode, &head, 0, 0);
1392 spin_unlock(&inode->i_lock); 1392 spin_unlock(&inode->i_lock);
@@ -1398,9 +1398,18 @@ static int nfs_commit_inode(struct inode *inode, int how)
1398 wait_on_bit(&NFS_I(inode)->flags, NFS_INO_COMMIT, 1398 wait_on_bit(&NFS_I(inode)->flags, NFS_INO_COMMIT,
1399 nfs_wait_bit_killable, 1399 nfs_wait_bit_killable,
1400 TASK_KILLABLE); 1400 TASK_KILLABLE);
1401 else
1402 goto out_mark_dirty;
1401 } else 1403 } else
1402 nfs_commit_clear_lock(NFS_I(inode)); 1404 nfs_commit_clear_lock(NFS_I(inode));
1403out: 1405 return res;
1406 /* Note: If we exit without ensuring that the commit is complete,
1407 * we must mark the inode as dirty. Otherwise, future calls to
1408 * sync_inode() with the WB_SYNC_ALL flag set will fail to ensure
1409 * that the data is on the disk.
1410 */
1411out_mark_dirty:
1412 __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
1404 return res; 1413 return res;
1405} 1414}
1406 1415
@@ -1509,14 +1518,17 @@ int nfs_wb_page(struct inode *inode, struct page *page)
1509 }; 1518 };
1510 int ret; 1519 int ret;
1511 1520
1512 while(PagePrivate(page)) { 1521 for (;;) {
1513 wait_on_page_writeback(page); 1522 wait_on_page_writeback(page);
1514 if (clear_page_dirty_for_io(page)) { 1523 if (clear_page_dirty_for_io(page)) {
1515 ret = nfs_writepage_locked(page, &wbc); 1524 ret = nfs_writepage_locked(page, &wbc);
1516 if (ret < 0) 1525 if (ret < 0)
1517 goto out_error; 1526 goto out_error;
1527 continue;
1518 } 1528 }
1519 ret = sync_inode(inode, &wbc); 1529 if (!PagePrivate(page))
1530 break;
1531 ret = nfs_commit_inode(inode, FLUSH_SYNC);
1520 if (ret < 0) 1532 if (ret < 0)
1521 goto out_error; 1533 goto out_error;
1522 } 1534 }
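The nfs_commit_inode() rework enforces one invariant: any path that returns without having confirmed the commit must re-dirty the inode, otherwise a later sync_inode(WB_SYNC_ALL) would see a clean inode and wrongly report the data durable (the block comment above says as much). An illustrative skeleton of that shape, with example_ helpers standing in for the RPC machinery:

#include <linux/fs.h>

static int example_trylock_commit(struct inode *inode);	/* hypothetical */
static int example_send_commit(struct inode *inode);	/* hypothetical */
static void example_wait_commit(struct inode *inode);	/* hypothetical */

static int example_commit(struct inode *inode, int may_wait)
{
	int res = 0;

	if (!example_trylock_commit(inode))
		goto out_mark_dirty;		/* another committer active */

	res = example_send_commit(inode);
	if (res < 0)
		goto out_mark_dirty;		/* nothing sent, still dirty */
	if (may_wait) {
		example_wait_commit(inode);
		return res;			/* commit confirmed on disk */
	}
	/* async: commit in flight but unconfirmed */
out_mark_dirty:
	__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
	return res;
}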
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 12f7109720c2..4a2734758778 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -4122,8 +4122,8 @@ nfs4_state_shutdown(void)
4122 nfs4_lock_state(); 4122 nfs4_lock_state();
4123 nfs4_release_reclaim(); 4123 nfs4_release_reclaim();
4124 __nfs4_state_shutdown(); 4124 __nfs4_state_shutdown();
4125 nfsd4_destroy_callback_queue();
4126 nfs4_unlock_state(); 4125 nfs4_unlock_state();
4126 nfsd4_destroy_callback_queue();
4127} 4127}
4128 4128
4129/* 4129/*
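The nfs4state.c reordering follows a classic lock-ordering rule: a workqueue must not be flushed or destroyed while holding a lock that its work items may take, because destroying it waits for queued work to finish. Sketched generically with hypothetical names:

#include <linux/mutex.h>
#include <linux/workqueue.h>

static DEFINE_MUTEX(example_state_mutex);
static struct workqueue_struct *example_callback_wq;

static void example_shutdown(void)
{
	mutex_lock(&example_state_mutex);
	/* ... tear down state ... */
	mutex_unlock(&example_state_mutex);

	/*
	 * Destroy only after dropping the lock: destroy_workqueue()
	 * flushes pending callbacks, and those callbacks may themselves
	 * take example_state_mutex.
	 */
	destroy_workqueue(example_callback_wq);
}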
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index ebbf3b6b2457..3c111120b619 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -443,8 +443,7 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
443 if (size_change) 443 if (size_change)
444 put_write_access(inode); 444 put_write_access(inode);
445 if (!err) 445 if (!err)
446 if (EX_ISSYNC(fhp->fh_export)) 446 commit_metadata(fhp);
447 write_inode_now(inode, 1);
448out: 447out:
449 return err; 448 return err;
450 449
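nfsd_setattr now defers to the commit_metadata() helper already used by the other nfsd write paths. From memory, so treat the body as approximate: the helper folds in the EX_ISSYNC() test and lets an export supply its own metadata-commit hook instead of a blanket write_inode_now():

#include <linux/fs.h>

static int example_commit_metadata(struct svc_fh *fhp)
{
	struct inode *inode = fhp->fh_dentry->d_inode;

	if (!EX_ISSYNC(fhp->fh_export))
		return 0;		/* async export: nothing to force */

	if (inode->i_sb->s_export_op->commit_metadata)
		return inode->i_sb->s_export_op->commit_metadata(inode);
	return write_inode_now(inode, 1);	/* generic fallback */
}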
diff --git a/fs/nilfs2/btree.h b/fs/nilfs2/btree.h
index af638d59e3bf..43c8c5b541fd 100644
--- a/fs/nilfs2/btree.h
+++ b/fs/nilfs2/btree.h
@@ -75,8 +75,6 @@ struct nilfs_btree_path {
75 75
76extern struct kmem_cache *nilfs_btree_path_cache; 76extern struct kmem_cache *nilfs_btree_path_cache;
77 77
78int nilfs_btree_path_cache_init(void);
79void nilfs_btree_path_cache_destroy(void);
80int nilfs_btree_init(struct nilfs_bmap *); 78int nilfs_btree_init(struct nilfs_bmap *);
81int nilfs_btree_convert_and_insert(struct nilfs_bmap *, __u64, __u64, 79int nilfs_btree_convert_and_insert(struct nilfs_bmap *, __u64, __u64,
82 const __u64 *, const __u64 *, int); 80 const __u64 *, const __u64 *, int);
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 30292df443ce..c9a30d7ff6fc 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -27,7 +27,7 @@
27#include "nilfs.h" 27#include "nilfs.h"
28#include "segment.h" 28#include "segment.h"
29 29
30int nilfs_sync_file(struct file *file, struct dentry *dentry, int datasync) 30int nilfs_sync_file(struct file *file, int datasync)
31{ 31{
32 /* 32 /*
33 * Called from fsync() system call 33 * Called from fsync() system call
@@ -37,7 +37,7 @@ int nilfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
37 * This function should be implemented when the writeback function 37 * This function should be implemented when the writeback function
38 * will be implemented. 38 * will be implemented.
39 */ 39 */
40 struct inode *inode = dentry->d_inode; 40 struct inode *inode = file->f_mapping->host;
41 int err; 41 int err;
42 42
43 if (!nilfs_inode_dirty(inode)) 43 if (!nilfs_inode_dirty(inode))
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 8723e5bfd071..47d6d7928122 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -228,7 +228,7 @@ extern void nilfs_set_link(struct inode *, struct nilfs_dir_entry *,
228 struct page *, struct inode *); 228 struct page *, struct inode *);
229 229
230/* file.c */ 230/* file.c */
231extern int nilfs_sync_file(struct file *, struct dentry *, int); 231extern int nilfs_sync_file(struct file *, int);
232 232
233/* ioctl.c */ 233/* ioctl.c */
234long nilfs_ioctl(struct file *, unsigned int, unsigned long); 234long nilfs_ioctl(struct file *, unsigned int, unsigned long);
diff --git a/fs/nilfs2/segbuf.h b/fs/nilfs2/segbuf.h
index fdf1c3b6d673..85fbb66455e2 100644
--- a/fs/nilfs2/segbuf.h
+++ b/fs/nilfs2/segbuf.h
@@ -127,8 +127,6 @@ struct nilfs_segment_buffer {
127 127
128extern struct kmem_cache *nilfs_segbuf_cachep; 128extern struct kmem_cache *nilfs_segbuf_cachep;
129 129
130int __init nilfs_init_segbuf_cache(void);
131void nilfs_destroy_segbuf_cache(void);
132struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *); 130struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *);
133void nilfs_segbuf_free(struct nilfs_segment_buffer *); 131void nilfs_segbuf_free(struct nilfs_segment_buffer *);
134void nilfs_segbuf_map(struct nilfs_segment_buffer *, __u64, unsigned long, 132void nilfs_segbuf_map(struct nilfs_segment_buffer *, __u64, unsigned long,
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
index dca142361ccf..01e20dbb217d 100644
--- a/fs/nilfs2/segment.h
+++ b/fs/nilfs2/segment.h
@@ -221,8 +221,6 @@ enum {
221extern struct kmem_cache *nilfs_transaction_cachep; 221extern struct kmem_cache *nilfs_transaction_cachep;
222 222
223/* segment.c */ 223/* segment.c */
224extern int nilfs_init_transaction_cache(void);
225extern void nilfs_destroy_transaction_cache(void);
226extern void nilfs_relax_pressure_in_lock(struct super_block *); 224extern void nilfs_relax_pressure_in_lock(struct super_block *);
227 225
228extern int nilfs_construct_segment(struct super_block *); 226extern int nilfs_construct_segment(struct super_block *);
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 03b34b738993..414ef68931cf 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -1130,13 +1130,13 @@ static void nilfs_segbuf_init_once(void *obj)
1130 1130
1131static void nilfs_destroy_cachep(void) 1131static void nilfs_destroy_cachep(void)
1132{ 1132{
1133 if (nilfs_inode_cachep) 1133 if (nilfs_inode_cachep)
1134 kmem_cache_destroy(nilfs_inode_cachep); 1134 kmem_cache_destroy(nilfs_inode_cachep);
1135 if (nilfs_transaction_cachep) 1135 if (nilfs_transaction_cachep)
1136 kmem_cache_destroy(nilfs_transaction_cachep); 1136 kmem_cache_destroy(nilfs_transaction_cachep);
1137 if (nilfs_segbuf_cachep) 1137 if (nilfs_segbuf_cachep)
1138 kmem_cache_destroy(nilfs_segbuf_cachep); 1138 kmem_cache_destroy(nilfs_segbuf_cachep);
1139 if (nilfs_btree_path_cache) 1139 if (nilfs_btree_path_cache)
1140 kmem_cache_destroy(nilfs_btree_path_cache); 1140 kmem_cache_destroy(nilfs_btree_path_cache);
1141} 1141}
1142 1142
diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c
index fe44d3feee4a..0f48e7c5d9e1 100644
--- a/fs/ntfs/dir.c
+++ b/fs/ntfs/dir.c
@@ -1527,10 +1527,9 @@ static int ntfs_dir_open(struct inode *vi, struct file *filp)
1527 * this problem for now. We do write the $BITMAP attribute if it is present 1527 * this problem for now. We do write the $BITMAP attribute if it is present
1528 * which is the important one for a directory so things are not too bad. 1528 * which is the important one for a directory so things are not too bad.
1529 */ 1529 */
1530static int ntfs_dir_fsync(struct file *filp, struct dentry *dentry, 1530static int ntfs_dir_fsync(struct file *filp, int datasync)
1531 int datasync)
1532{ 1531{
1533 struct inode *bmp_vi, *vi = dentry->d_inode; 1532 struct inode *bmp_vi, *vi = filp->f_mapping->host;
1534 int err, ret; 1533 int err, ret;
1535 ntfs_attr na; 1534 ntfs_attr na;
1536 1535
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index a1924a0d2ab0..113ebd9f25a4 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -2133,7 +2133,6 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2133/** 2133/**
2134 * ntfs_file_fsync - sync a file to disk 2134 * ntfs_file_fsync - sync a file to disk
2135 * @filp: file to be synced 2135 * @filp: file to be synced
2136 * @dentry: dentry describing the file to sync
2137 * @datasync: if non-zero only flush user data and not metadata 2136 * @datasync: if non-zero only flush user data and not metadata
2138 * 2137 *
2139 * Data integrity sync of a file to disk. Used for fsync, fdatasync, and msync 2138 * Data integrity sync of a file to disk. Used for fsync, fdatasync, and msync
@@ -2149,19 +2148,15 @@ static ssize_t ntfs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
2149 * Also, if @datasync is true, we do not wait on the inode to be written out 2148 * Also, if @datasync is true, we do not wait on the inode to be written out
2150 * but we always wait on the page cache pages to be written out. 2149 * but we always wait on the page cache pages to be written out.
2151 * 2150 *
2152 * Note: In the past @filp could be NULL so we ignore it as we don't need it
2153 * anyway.
2154 *
2155 * Locking: Caller must hold i_mutex on the inode. 2151 * Locking: Caller must hold i_mutex on the inode.
2156 * 2152 *
2157 * TODO: We should probably also write all attribute/index inodes associated 2153 * TODO: We should probably also write all attribute/index inodes associated
2158 * with this inode but since we have no simple way of getting to them we ignore 2154 * with this inode but since we have no simple way of getting to them we ignore
2159 * this problem for now. 2155 * this problem for now.
2160 */ 2156 */
2161static int ntfs_file_fsync(struct file *filp, struct dentry *dentry, 2157static int ntfs_file_fsync(struct file *filp, int datasync)
2162 int datasync)
2163{ 2158{
2164 struct inode *vi = dentry->d_inode; 2159 struct inode *vi = filp->f_mapping->host;
2165 int err, ret = 0; 2160 int err, ret = 0;
2166 2161
2167 ntfs_debug("Entering for inode 0x%lx.", vi->i_ino); 2162 ntfs_debug("Entering for inode 0x%lx.", vi->i_ino);
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 97e54b9e654b..6a13ea64c447 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -175,13 +175,12 @@ static int ocfs2_dir_release(struct inode *inode, struct file *file)
175 return 0; 175 return 0;
176} 176}
177 177
178static int ocfs2_sync_file(struct file *file, 178static int ocfs2_sync_file(struct file *file, int datasync)
179 struct dentry *dentry,
180 int datasync)
181{ 179{
182 int err = 0; 180 int err = 0;
183 journal_t *journal; 181 journal_t *journal;
184 struct inode *inode = dentry->d_inode; 182 struct dentry *dentry = file->f_path.dentry;
183 struct inode *inode = file->f_mapping->host;
185 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 184 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
186 185
187 mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync, 186 mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync,
@@ -1053,7 +1052,7 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1053 } 1052 }
1054 1053
1055 /* 1054 /*
1056 * This will intentionally not wind up calling vmtruncate(), 1055 * This will intentionally not wind up calling simple_setsize(),
1057 * since all the work for a size change has been done above. 1056 * since all the work for a size change has been done above.
1058 * Otherwise, we could get into problems with truncate as 1057 * Otherwise, we could get into problems with truncate as
1059 * ip_alloc_sem is used there to protect against i_size 1058 * ip_alloc_sem is used there to protect against i_size
@@ -2119,9 +2118,13 @@ relock:
2119 * direct write may have instantiated a few 2118 * direct write may have instantiated a few
2120 * blocks outside i_size. Trim these off again. 2119 * blocks outside i_size. Trim these off again.
2121 * Don't need i_size_read because we hold i_mutex. 2120 * Don't need i_size_read because we hold i_mutex.
2121 *
2122 * XXX(hch): this looks buggy because ocfs2 did not
2123 * actually implement ->truncate. Take a look at
2124 * the new truncate sequence and update this accordingly
2122 */ 2125 */
2123 if (*ppos + count > inode->i_size) 2126 if (*ppos + count > inode->i_size)
2124 vmtruncate(inode, inode->i_size); 2127 simple_setsize(inode, inode->i_size);
2125 ret = written; 2128 ret = written;
2126 goto out_dio; 2129 goto out_dio;
2127 } 2130 }
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 2c26ce251cb3..0eaa929a4dbf 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -879,13 +879,15 @@ static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend)
 		if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
 			continue;
 		if (unsuspend)
-			status = vfs_quota_enable(
-					sb_dqopt(sb)->files[type],
-					type, QFMT_OCFS2,
-					DQUOT_SUSPENDED);
-		else
-			status = vfs_quota_disable(sb, type,
-					DQUOT_SUSPENDED);
+			status = dquot_resume(sb, type);
+		else {
+			struct ocfs2_mem_dqinfo *oinfo;
+
+			/* Cancel periodic syncing before suspending */
+			oinfo = sb_dqinfo(sb, type)->dqi_priv;
+			cancel_delayed_work_sync(&oinfo->dqi_sync_work);
+			status = dquot_suspend(sb, type);
+		}
 		if (status < 0)
 			break;
 	}
@@ -916,8 +918,8 @@ static int ocfs2_enable_quotas(struct ocfs2_super *osb)
 			status = -ENOENT;
 			goto out_quota_off;
 		}
-		status = vfs_quota_enable(inode[type], type, QFMT_OCFS2,
-					DQUOT_USAGE_ENABLED);
+		status = dquot_enable(inode[type], type, QFMT_OCFS2,
+					DQUOT_USAGE_ENABLED);
 		if (status < 0)
 			goto out_quota_off;
 	}
@@ -952,8 +954,8 @@ static void ocfs2_disable_quotas(struct ocfs2_super *osb)
 		/* Turn off quotas. This will remove all dquot structures from
 		 * memory and so they will be automatically synced to global
 		 * quota files */
-		vfs_quota_disable(sb, type, DQUOT_USAGE_ENABLED |
-					DQUOT_LIMITS_ENABLED);
+		dquot_disable(sb, type, DQUOT_USAGE_ENABLED |
+					DQUOT_LIMITS_ENABLED);
 		if (!inode)
 			continue;
 		iput(inode);
@@ -962,7 +964,7 @@ static void ocfs2_disable_quotas(struct ocfs2_super *osb)
 
 /* Handle quota on quotactl */
 static int ocfs2_quota_on(struct super_block *sb, int type, int format_id,
-			  char *path, int remount)
+			  char *path)
 {
 	unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA,
 					     OCFS2_FEATURE_RO_COMPAT_GRPQUOTA};
@@ -970,30 +972,24 @@ static int ocfs2_quota_on(struct super_block *sb, int type, int format_id,
 	if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type]))
 		return -EINVAL;
 
-	if (remount)
-		return 0;	/* Just ignore it has been handled in
-				 * ocfs2_remount() */
-	return vfs_quota_enable(sb_dqopt(sb)->files[type], type,
-				format_id, DQUOT_LIMITS_ENABLED);
+	return dquot_enable(sb_dqopt(sb)->files[type], type,
+			    format_id, DQUOT_LIMITS_ENABLED);
 }
 
 /* Handle quota off quotactl */
-static int ocfs2_quota_off(struct super_block *sb, int type, int remount)
+static int ocfs2_quota_off(struct super_block *sb, int type)
 {
-	if (remount)
-		return 0;	/* Ignore now and handle later in
-				 * ocfs2_remount() */
-	return vfs_quota_disable(sb, type, DQUOT_LIMITS_ENABLED);
+	return dquot_disable(sb, type, DQUOT_LIMITS_ENABLED);
 }
 
 static const struct quotactl_ops ocfs2_quotactl_ops = {
 	.quota_on	= ocfs2_quota_on,
 	.quota_off	= ocfs2_quota_off,
-	.quota_sync	= vfs_quota_sync,
-	.get_info	= vfs_get_dqinfo,
-	.set_info	= vfs_set_dqinfo,
-	.get_dqblk	= vfs_get_dqblk,
-	.set_dqblk	= vfs_set_dqblk,
+	.quota_sync	= dquot_quota_sync,
+	.get_info	= dquot_get_dqinfo,
+	.set_info	= dquot_set_dqinfo,
+	.get_dqblk	= dquot_get_dqblk,
+	.set_dqblk	= dquot_set_dqblk,
 };
 
 static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
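
Worth noting in ocfs2_susp_quotas() above: the periodic quota sync work is cancelled before dquot_suspend(), so no delayed sync can run against a suspended quota file. The suspend leg of the new code, restated with comments (same logic as the hunk):

if (unsuspend)
	status = dquot_resume(sb, type);
else {
	struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv;

	/* stop the delayed work before the quota file is parked */
	cancel_delayed_work_sync(&oinfo->dqi_sync_work);
	status = dquot_suspend(sb, type);
}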
diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index 399487c09364..6e7a3291bbe8 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -329,7 +329,7 @@ const struct file_operations omfs_file_operations = {
 	.aio_read = generic_file_aio_read,
 	.aio_write = generic_file_aio_write,
 	.mmap = generic_file_mmap,
-	.fsync = simple_fsync,
+	.fsync = generic_file_fsync,
 	.splice_read = generic_file_splice_read,
 };
 
diff --git a/fs/pipe.c b/fs/pipe.c
index d79872eba09a..279eef96c51c 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -26,9 +26,14 @@
 
 /*
  * The max size that a non-root user is allowed to grow the pipe. Can
- * be set by root in /proc/sys/fs/pipe-max-pages
+ * be set by root in /proc/sys/fs/pipe-max-size
  */
-unsigned int pipe_max_pages = PIPE_DEF_BUFFERS * 16;
+unsigned int pipe_max_size = 1048576;
+
+/*
+ * Minimum pipe size, as required by POSIX
+ */
+unsigned int pipe_min_size = PAGE_SIZE;
 
 /*
  * We use a start+len construction, which provides full use of the
@@ -230,6 +235,7 @@ void *generic_pipe_buf_map(struct pipe_inode_info *pipe,
 
 	return kmap(buf->page);
 }
+EXPORT_SYMBOL(generic_pipe_buf_map);
 
 /**
  * generic_pipe_buf_unmap - unmap a previously mapped pipe buffer
@@ -249,6 +255,7 @@ void generic_pipe_buf_unmap(struct pipe_inode_info *pipe,
 	} else
 		kunmap(buf->page);
 }
+EXPORT_SYMBOL(generic_pipe_buf_unmap);
 
 /**
  * generic_pipe_buf_steal - attempt to take ownership of a &pipe_buffer
@@ -279,6 +286,7 @@ int generic_pipe_buf_steal(struct pipe_inode_info *pipe,
 
 	return 1;
 }
+EXPORT_SYMBOL(generic_pipe_buf_steal);
 
 /**
  * generic_pipe_buf_get - get a reference to a &struct pipe_buffer
@@ -294,6 +302,7 @@ void generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
 {
 	page_cache_get(buf->page);
 }
+EXPORT_SYMBOL(generic_pipe_buf_get);
 
 /**
  * generic_pipe_buf_confirm - verify contents of the pipe buffer
@@ -309,6 +318,7 @@ int generic_pipe_buf_confirm(struct pipe_inode_info *info,
 {
 	return 0;
 }
+EXPORT_SYMBOL(generic_pipe_buf_confirm);
 
 /**
  * generic_pipe_buf_release - put a reference to a &struct pipe_buffer
@@ -323,6 +333,7 @@ void generic_pipe_buf_release(struct pipe_inode_info *pipe,
 {
 	page_cache_release(buf->page);
 }
+EXPORT_SYMBOL(generic_pipe_buf_release);
 
 static const struct pipe_buf_operations anon_pipe_buf_ops = {
 	.can_merge = 1,
@@ -1112,26 +1123,20 @@ SYSCALL_DEFINE1(pipe, int __user *, fildes)
  * Allocate a new array of pipe buffers and copy the info over. Returns the
  * pipe size if successful, or return -ERROR on error.
  */
-static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
+static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages)
 {
 	struct pipe_buffer *bufs;
 
 	/*
-	 * Must be a power-of-2 currently
-	 */
-	if (!is_power_of_2(arg))
-		return -EINVAL;
-
-	/*
 	 * We can shrink the pipe, if arg >= pipe->nrbufs. Since we don't
 	 * expect a lot of shrink+grow operations, just free and allocate
 	 * again like we would do for growing. If the pipe currently
 	 * contains more buffers than arg, then return busy.
 	 */
-	if (arg < pipe->nrbufs)
+	if (nr_pages < pipe->nrbufs)
 		return -EBUSY;
 
-	bufs = kcalloc(arg, sizeof(struct pipe_buffer), GFP_KERNEL);
+	bufs = kcalloc(nr_pages, sizeof(struct pipe_buffer), GFP_KERNEL);
 	if (unlikely(!bufs))
 		return -ENOMEM;
 
@@ -1140,20 +1145,56 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
 	 * and adjust the indexes.
 	 */
 	if (pipe->nrbufs) {
-		const unsigned int tail = pipe->nrbufs & (pipe->buffers - 1);
-		const unsigned int head = pipe->nrbufs - tail;
+		unsigned int tail;
+		unsigned int head;
 
+		tail = pipe->curbuf + pipe->nrbufs;
+		if (tail < pipe->buffers)
+			tail = 0;
+		else
+			tail &= (pipe->buffers - 1);
+
+		head = pipe->nrbufs - tail;
 		if (head)
 			memcpy(bufs, pipe->bufs + pipe->curbuf, head * sizeof(struct pipe_buffer));
 		if (tail)
-			memcpy(bufs + head, pipe->bufs + pipe->curbuf, tail * sizeof(struct pipe_buffer));
+			memcpy(bufs + head, pipe->bufs, tail * sizeof(struct pipe_buffer));
 	}
 
 	pipe->curbuf = 0;
 	kfree(pipe->bufs);
 	pipe->bufs = bufs;
-	pipe->buffers = arg;
-	return arg;
+	pipe->buffers = nr_pages;
+	return nr_pages * PAGE_SIZE;
+}
+
+/*
+ * Currently we rely on the pipe array holding a power-of-2 number
+ * of pages.
+ */
+static inline unsigned int round_pipe_size(unsigned int size)
+{
+	unsigned long nr_pages;
+
+	nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	return roundup_pow_of_two(nr_pages) << PAGE_SHIFT;
+}
+
+/*
+ * This should work even if CONFIG_PROC_FS isn't set, as proc_dointvec_minmax
+ * will return an error.
+ */
+int pipe_proc_fn(struct ctl_table *table, int write, void __user *buf,
+		 size_t *lenp, loff_t *ppos)
+{
+	int ret;
+
+	ret = proc_dointvec_minmax(table, write, buf, lenp, ppos);
+	if (ret < 0 || !write)
+		return ret;
+
+	pipe_max_size = round_pipe_size(pipe_max_size);
+	return ret;
 }
 
 long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
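
round_pipe_size() converts a byte count into a power-of-two number of pages and returns the result in bytes. A quick standalone check of the arithmetic (hypothetical userspace re-implementation, assuming 4 KiB pages):

#include <stdio.h>

#define PAGE_SIZE  4096u
#define PAGE_SHIFT 12

/* round a nonzero 32-bit value up to the next power of two */
static unsigned int roundup_pow_of_two(unsigned int v)
{
	v--;
	v |= v >> 1;  v |= v >> 2;  v |= v >> 4;
	v |= v >> 8;  v |= v >> 16;
	return v + 1;
}

static unsigned int round_pipe_size(unsigned int size)
{
	unsigned int nr_pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;

	return roundup_pow_of_two(nr_pages) << PAGE_SHIFT;
}

int main(void)
{
	/* 100000 bytes -> 25 pages -> 32 pages -> 131072 bytes */
	printf("%u\n", round_pipe_size(100000));
	return 0;
}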
@@ -1168,25 +1209,32 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
 	mutex_lock(&pipe->inode->i_mutex);
 
 	switch (cmd) {
-	case F_SETPIPE_SZ:
-		if (!capable(CAP_SYS_ADMIN) && arg > pipe_max_pages)
-			return -EINVAL;
-		/*
-		 * The pipe needs to be at least 2 pages large to
-		 * guarantee POSIX behaviour.
-		 */
-		if (arg < 2)
-			return -EINVAL;
-		ret = pipe_set_size(pipe, arg);
+	case F_SETPIPE_SZ: {
+		unsigned int size, nr_pages;
+
+		size = round_pipe_size(arg);
+		nr_pages = size >> PAGE_SHIFT;
+
+		ret = -EINVAL;
+		if (!nr_pages)
+			goto out;
+
+		if (!capable(CAP_SYS_RESOURCE) && size > pipe_max_size) {
+			ret = -EPERM;
+			goto out;
+		}
+		ret = pipe_set_size(pipe, nr_pages);
 		break;
+	}
 	case F_GETPIPE_SZ:
-		ret = pipe->buffers;
+		ret = pipe->buffers * PAGE_SIZE;
 		break;
 	default:
 		ret = -EINVAL;
 		break;
 	}
 
+out:
 	mutex_unlock(&pipe->inode->i_mutex);
 	return ret;
 }
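
With this change F_SETPIPE_SZ takes, and F_GETPIPE_SZ reports, a size in bytes instead of a buffer count; requests are rounded up by round_pipe_size(), and only CAP_SYS_RESOURCE may exceed /proc/sys/fs/pipe-max-size (-EPERM otherwise). A minimal userspace sketch (error handling trimmed):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fds[2];
	long sz;

	if (pipe(fds) < 0)
		return 1;

	/* ask for 100000 bytes; the kernel rounds up to a power-of-two
	 * number of pages, so expect 131072 with 4 KiB pages */
	sz = fcntl(fds[1], F_SETPIPE_SZ, 100000);
	printf("granted: %ld\n", sz);

	/* read back the current capacity, also in bytes now */
	sz = fcntl(fds[1], F_GETPIPE_SZ);
	printf("current: %ld\n", sz);
	return 0;
}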
diff --git a/fs/proc/array.c b/fs/proc/array.c
index 885ab5513ac5..9b58d38bc911 100644
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -267,7 +267,7 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p)
 	shpending = p->signal->shared_pending.signal;
 	blocked = p->blocked;
 	collect_sigign_sigcatch(p, &ignored, &caught);
-	num_threads = atomic_read(&p->signal->count);
+	num_threads = get_nr_threads(p);
 	rcu_read_lock();  /* FIXME: is this correct? */
 	qsize = atomic_read(&__task_cred(p)->user->sigpending);
 	rcu_read_unlock();
@@ -410,7 +410,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 		tty_nr = new_encode_dev(tty_devnum(sig->tty));
 	}
 
-	num_threads = atomic_read(&sig->count);
+	num_threads = get_nr_threads(task);
 	collect_sigign_sigcatch(task, &sigign, &sigcatch);
 
 	cmin_flt = sig->cmin_flt;
diff --git a/fs/proc/base.c b/fs/proc/base.c
index c7f9f23449dc..acb7ef80ea4f 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -166,18 +166,6 @@ static int get_fs_path(struct task_struct *task, struct path *path, bool root)
 	return result;
 }
 
-static int get_nr_threads(struct task_struct *tsk)
-{
-	unsigned long flags;
-	int count = 0;
-
-	if (lock_task_sighand(tsk, &flags)) {
-		count = atomic_read(&tsk->signal->count);
-		unlock_task_sighand(tsk, &flags);
-	}
-	return count;
-}
-
 static int proc_cwd_link(struct inode *inode, struct path *path)
 {
 	struct task_struct *task = get_proc_task(inode);
@@ -2444,7 +2432,7 @@ static struct dentry *proc_base_instantiate(struct inode *dir,
 	const struct pid_entry *p = ptr;
 	struct inode *inode;
 	struct proc_inode *ei;
-	struct dentry *error = ERR_PTR(-EINVAL);
+	struct dentry *error;
 
 	/* Allocate the inode */
 	error = ERR_PTR(-ENOMEM);
@@ -2794,7 +2782,7 @@ out:
 
 struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
 {
-	struct dentry *result = ERR_PTR(-ENOENT);
+	struct dentry *result;
 	struct task_struct *task;
 	unsigned tgid;
 	struct pid_namespace *ns;
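
Both proc files now share one get_nr_threads() helper instead of open-coding atomic_read(&sig->count); the private, siglock-taking copy in base.c above is deleted. The value ends up in the "Threads:" line of /proc/<pid>/status and field 20 of /proc/<pid>/stat, which is easy to observe from userspace (sketch):

#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/proc/self/status", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f))
		if (strncmp(line, "Threads:", 8) == 0)
			fputs(line, stdout);	/* fed by get_nr_threads() */
	fclose(f);
	return 0;
}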
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 43c127490606..2791907744ed 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -343,21 +343,6 @@ static DEFINE_SPINLOCK(proc_inum_lock); /* protects the above */
 /*
  * Return an inode number between PROC_DYNAMIC_FIRST and
  * 0xffffffff, or zero on failure.
- *
- * Current inode allocations in the proc-fs (hex-numbers):
- *
- * 00000000		reserved
- * 00000001-00000fff	static entries	(goners)
- *      001		root-ino
- *
- * 00001000-00001fff	unused
- * 0001xxxx-7fffxxxx	pid-dir entries for pid 1-7fff
- * 80000000-efffffff	unused
- * f0000000-ffffffff	dynamic entries
- *
- * Goal:
- *	Once we split the thing into several virtual filesystems,
- *	we will get rid of magical ranges (and this comment, BTW).
  */
 static unsigned int get_inode_number(void)
 {
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index c837a77351be..6f37c391468d 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -588,7 +588,7 @@ static struct kcore_list kcore_text;
  */
 static void __init proc_kcore_text_init(void)
 {
-	kclist_add(&kcore_text, _stext, _end - _stext, KCORE_TEXT);
+	kclist_add(&kcore_text, _text, _end - _text, KCORE_TEXT);
 }
 #else
 static void __init proc_kcore_text_init(void)
diff --git a/fs/proc/root.c b/fs/proc/root.c
index 757c069f2a65..4258384ed22d 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -110,7 +110,6 @@ void __init proc_root_init(void)
 	if (err)
 		return;
 	proc_mnt = kern_mount_data(&proc_fs_type, &init_pid_ns);
-	err = PTR_ERR(proc_mnt);
 	if (IS_ERR(proc_mnt)) {
 		unregister_filesystem(&proc_fs_type);
 		return;
diff --git a/fs/qnx4/dir.c b/fs/qnx4/dir.c
index 6f30c3d5bcbf..6e8fc62b40a8 100644
--- a/fs/qnx4/dir.c
+++ b/fs/qnx4/dir.c
@@ -77,9 +77,10 @@ out:
 
 const struct file_operations qnx4_dir_operations =
 {
+	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
 	.readdir	= qnx4_readdir,
-	.fsync		= simple_fsync,
+	.fsync		= generic_file_fsync,
 };
 
 const struct inode_operations qnx4_dir_inode_operations =
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 655a4c52b8c3..12c233da1b6b 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -228,10 +228,6 @@ static struct hlist_head *dquot_hash;
 
 struct dqstats dqstats;
 EXPORT_SYMBOL(dqstats);
-#ifdef CONFIG_SMP
-struct dqstats *dqstats_pcpu;
-EXPORT_SYMBOL(dqstats_pcpu);
-#endif
 
 static qsize_t inode_get_rsv_space(struct inode *inode);
 static void __dquot_initialize(struct inode *inode, int type);
@@ -584,7 +580,7 @@ out:
 }
 EXPORT_SYMBOL(dquot_scan_active);
 
-int vfs_quota_sync(struct super_block *sb, int type, int wait)
+int dquot_quota_sync(struct super_block *sb, int type, int wait)
 {
 	struct list_head *dirty;
 	struct dquot *dquot;
@@ -656,7 +652,7 @@ int vfs_quota_sync(struct super_block *sb, int type, int wait)
 
 	return 0;
 }
-EXPORT_SYMBOL(vfs_quota_sync);
+EXPORT_SYMBOL(dquot_quota_sync);
 
 /* Free unused dquots from cache */
 static void prune_dqcache(int count)
@@ -676,27 +672,10 @@ static void prune_dqcache(int count)
 	}
 }
 
-static int dqstats_read(unsigned int type)
-{
-	int count = 0;
-#ifdef CONFIG_SMP
-	int cpu;
-	for_each_possible_cpu(cpu)
-		count += per_cpu_ptr(dqstats_pcpu, cpu)->stat[type];
-	/* Statistics reading is racy, but absolute accuracy isn't required */
-	if (count < 0)
-		count = 0;
-#else
-	count = dqstats.stat[type];
-#endif
-	return count;
-}
-
 /*
  * This is called from kswapd when we think we need some
  * more memory
  */
-
 static int shrink_dqcache_memory(int nr, gfp_t gfp_mask)
 {
 	if (nr) {
@@ -704,7 +683,9 @@ static int shrink_dqcache_memory(int nr, gfp_t gfp_mask)
 		prune_dqcache(nr);
 		spin_unlock(&dq_list_lock);
 	}
-	return (dqstats_read(DQST_FREE_DQUOTS)/100) * sysctl_vfs_cache_pressure;
+	return ((unsigned)
+		percpu_counter_read_positive(&dqstats.counter[DQST_FREE_DQUOTS])
+		/100) * sysctl_vfs_cache_pressure;
 }
 
 static struct shrinker dqcache_shrinker = {
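
dqstats thus moves from a hand-rolled per-CPU copy to the generic percpu_counter API: the shrinker uses percpu_counter_read_positive() (cheap but approximate), while the sysctl path further down folds the per-CPU deltas with percpu_counter_sum_positive() (exact but slower). The pattern in isolation (hedged sketch; the counter name is illustrative):

#include <linux/percpu_counter.h>

static struct percpu_counter free_dquots;	/* one counter per statistic */

static int __init stats_init(void)
{
	return percpu_counter_init(&free_dquots, 0);
}

static void on_dquot_freed(void)
{
	percpu_counter_inc(&free_dquots);	/* per-CPU, no global lock */
}

static s64 stats_fast_read(void)
{
	/* approximate; good enough for shrinker heuristics */
	return percpu_counter_read_positive(&free_dquots);
}

static s64 stats_exact_read(void)
{
	/* sums all per-CPU deltas; used when reporting to userspace */
	return percpu_counter_sum_positive(&free_dquots);
}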
@@ -1514,11 +1495,13 @@ static void inode_decr_space(struct inode *inode, qsize_t number, int reserve)
 /*
  * This operation can block, but only after everything is updated
  */
-int __dquot_alloc_space(struct inode *inode, qsize_t number,
-		int warn, int reserve)
+int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags)
 {
 	int cnt, ret = 0;
 	char warntype[MAXQUOTAS];
+	int warn = flags & DQUOT_SPACE_WARN;
+	int reserve = flags & DQUOT_SPACE_RESERVE;
+	int nofail = flags & DQUOT_SPACE_NOFAIL;
 
 	/*
 	 * First test before acquiring mutex - solves deadlocks when we
@@ -1539,7 +1522,7 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number,
 			continue;
 		ret = check_bdq(inode->i_dquot[cnt], number, !warn,
 				warntype+cnt);
-		if (ret) {
+		if (ret && !nofail) {
 			spin_unlock(&dq_data_lock);
 			goto out_flush_warn;
 		}
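
__dquot_alloc_space() now takes one flags word instead of separate warn/reserve ints, and grows DQUOT_SPACE_NOFAIL so that callers which cannot back out (e.g. mid-transaction metadata allocations) still get the usage accounted even over quota. Typical call shapes (sketch; the quotaops.h wrappers presumably build these flags):

/* ordinary allocation: warn the user if this goes over a limit */
err = __dquot_alloc_space(inode, nr_bytes, DQUOT_SPACE_WARN);

/* delayed-allocation reservation: no warning yet */
err = __dquot_alloc_space(inode, nr_bytes, DQUOT_SPACE_RESERVE);

/* must-not-fail path: account usage even if limits are exceeded */
err = __dquot_alloc_space(inode, nr_bytes,
			  DQUOT_SPACE_WARN | DQUOT_SPACE_NOFAIL);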
@@ -1638,10 +1621,11 @@ EXPORT_SYMBOL(dquot_claim_space_nodirty);
 /*
  * This operation can block, but only after everything is updated
  */
-void __dquot_free_space(struct inode *inode, qsize_t number, int reserve)
+void __dquot_free_space(struct inode *inode, qsize_t number, int flags)
 {
 	unsigned int cnt;
 	char warntype[MAXQUOTAS];
+	int reserve = flags & DQUOT_SPACE_RESERVE;
 
 	/* First test before acquiring mutex - solves deadlocks when we
 	 * re-enter the quota code and are already holding the mutex */
@@ -1812,7 +1796,7 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr)
 	if (iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid)
 		transfer_to[USRQUOTA] = dqget(sb, iattr->ia_uid, USRQUOTA);
 	if (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)
-		transfer_to[GRPQUOTA] = dqget(sb, iattr->ia_uid, GRPQUOTA);
+		transfer_to[GRPQUOTA] = dqget(sb, iattr->ia_gid, GRPQUOTA);
 
 	ret = __dquot_transfer(inode, transfer_to);
 	dqput_all(transfer_to);
@@ -1847,6 +1831,7 @@ const struct dquot_operations dquot_operations = {
 	.alloc_dquot	= dquot_alloc,
 	.destroy_dquot	= dquot_destroy,
 };
+EXPORT_SYMBOL(dquot_operations);
 
 /*
  * Generic helper for ->open on filesystems supporting disk quotas.
@@ -1865,7 +1850,7 @@ EXPORT_SYMBOL(dquot_file_open);
 /*
  * Turn quota off on a device. type == -1 ==> quotaoff for all types (umount)
  */
-int vfs_quota_disable(struct super_block *sb, int type, unsigned int flags)
+int dquot_disable(struct super_block *sb, int type, unsigned int flags)
 {
 	int cnt, ret = 0;
 	struct quota_info *dqopt = sb_dqopt(sb);
@@ -1995,14 +1980,15 @@ put_inodes:
 	}
 	return ret;
 }
-EXPORT_SYMBOL(vfs_quota_disable);
+EXPORT_SYMBOL(dquot_disable);
 
-int vfs_quota_off(struct super_block *sb, int type, int remount)
+int dquot_quota_off(struct super_block *sb, int type)
 {
-	return vfs_quota_disable(sb, type, remount ? DQUOT_SUSPENDED :
-				 (DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED));
+	return dquot_disable(sb, type,
+			     DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
 }
-EXPORT_SYMBOL(vfs_quota_off);
+EXPORT_SYMBOL(dquot_quota_off);
+
 /*
  * Turn quotas on on a device
  */
@@ -2120,36 +2106,43 @@ out_fmt:
 }
 
 /* Reenable quotas on remount RW */
-static int vfs_quota_on_remount(struct super_block *sb, int type)
+int dquot_resume(struct super_block *sb, int type)
 {
 	struct quota_info *dqopt = sb_dqopt(sb);
 	struct inode *inode;
-	int ret;
+	int ret = 0, cnt;
 	unsigned int flags;
 
-	mutex_lock(&dqopt->dqonoff_mutex);
-	if (!sb_has_quota_suspended(sb, type)) {
+	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+		if (type != -1 && cnt != type)
+			continue;
+
+		mutex_lock(&dqopt->dqonoff_mutex);
+		if (!sb_has_quota_suspended(sb, cnt)) {
+			mutex_unlock(&dqopt->dqonoff_mutex);
+			continue;
+		}
+		inode = dqopt->files[cnt];
+		dqopt->files[cnt] = NULL;
+		spin_lock(&dq_state_lock);
+		flags = dqopt->flags & dquot_state_flag(DQUOT_USAGE_ENABLED |
+							DQUOT_LIMITS_ENABLED,
+							cnt);
+		dqopt->flags &= ~dquot_state_flag(DQUOT_STATE_FLAGS, cnt);
+		spin_unlock(&dq_state_lock);
 		mutex_unlock(&dqopt->dqonoff_mutex);
-		return 0;
-	}
-	inode = dqopt->files[type];
-	dqopt->files[type] = NULL;
-	spin_lock(&dq_state_lock);
-	flags = dqopt->flags & dquot_state_flag(DQUOT_USAGE_ENABLED |
-						DQUOT_LIMITS_ENABLED, type);
-	dqopt->flags &= ~dquot_state_flag(DQUOT_STATE_FLAGS, type);
-	spin_unlock(&dq_state_lock);
-	mutex_unlock(&dqopt->dqonoff_mutex);
 
-	flags = dquot_generic_flag(flags, type);
-	ret = vfs_load_quota_inode(inode, type, dqopt->info[type].dqi_fmt_id,
-				   flags);
-	iput(inode);
+		flags = dquot_generic_flag(flags, cnt);
+		ret = vfs_load_quota_inode(inode, cnt,
+				dqopt->info[cnt].dqi_fmt_id, flags);
+		iput(inode);
+	}
 
 	return ret;
 }
+EXPORT_SYMBOL(dquot_resume);
 
-int vfs_quota_on_path(struct super_block *sb, int type, int format_id,
+int dquot_quota_on_path(struct super_block *sb, int type, int format_id,
 		struct path *path)
 {
 	int error = security_quota_on(path->dentry);
@@ -2164,40 +2157,36 @@ int vfs_quota_on_path(struct super_block *sb, int type, int format_id,
 				 DQUOT_LIMITS_ENABLED);
 	return error;
 }
-EXPORT_SYMBOL(vfs_quota_on_path);
+EXPORT_SYMBOL(dquot_quota_on_path);
 
-int vfs_quota_on(struct super_block *sb, int type, int format_id, char *name,
-		 int remount)
+int dquot_quota_on(struct super_block *sb, int type, int format_id, char *name)
 {
 	struct path path;
 	int error;
 
-	if (remount)
-		return vfs_quota_on_remount(sb, type);
-
 	error = kern_path(name, LOOKUP_FOLLOW, &path);
 	if (!error) {
-		error = vfs_quota_on_path(sb, type, format_id, &path);
+		error = dquot_quota_on_path(sb, type, format_id, &path);
 		path_put(&path);
 	}
 	return error;
 }
-EXPORT_SYMBOL(vfs_quota_on);
+EXPORT_SYMBOL(dquot_quota_on);
 
 /*
  * More powerful function for turning on quotas allowing setting
  * of individual quota flags
  */
-int vfs_quota_enable(struct inode *inode, int type, int format_id,
+int dquot_enable(struct inode *inode, int type, int format_id,
 	unsigned int flags)
 {
 	int ret = 0;
 	struct super_block *sb = inode->i_sb;
 	struct quota_info *dqopt = sb_dqopt(sb);
 
 	/* Just unsuspend quotas? */
-	if (flags & DQUOT_SUSPENDED)
-		return vfs_quota_on_remount(sb, type);
+	BUG_ON(flags & DQUOT_SUSPENDED);
+
 	if (!flags)
 		return 0;
 	/* Just updating flags needed? */
@@ -2229,13 +2218,13 @@ out_lock:
 load_quota:
 	return vfs_load_quota_inode(inode, type, format_id, flags);
 }
-EXPORT_SYMBOL(vfs_quota_enable);
+EXPORT_SYMBOL(dquot_enable);
 
 /*
  * This function is used when filesystem needs to initialize quotas
  * during mount time.
  */
-int vfs_quota_on_mount(struct super_block *sb, char *qf_name,
+int dquot_quota_on_mount(struct super_block *sb, char *qf_name,
 		int format_id, int type)
 {
 	struct dentry *dentry;
@@ -2261,24 +2250,7 @@ out:
 	dput(dentry);
 	return error;
 }
-EXPORT_SYMBOL(vfs_quota_on_mount);
-
-/* Wrapper to turn on quotas when remounting rw */
-int vfs_dq_quota_on_remount(struct super_block *sb)
-{
-	int cnt;
-	int ret = 0, err;
-
-	if (!sb->s_qcop || !sb->s_qcop->quota_on)
-		return -ENOSYS;
-	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
-		err = sb->s_qcop->quota_on(sb, cnt, 0, NULL, 1);
-		if (err < 0 && !ret)
-			ret = err;
-	}
-	return ret;
-}
-EXPORT_SYMBOL(vfs_dq_quota_on_remount);
+EXPORT_SYMBOL(dquot_quota_on_mount);
 
 static inline qsize_t qbtos(qsize_t blocks)
 {
@@ -2313,8 +2285,8 @@ static void do_get_dqblk(struct dquot *dquot, struct fs_disk_quota *di)
 	spin_unlock(&dq_data_lock);
 }
 
-int vfs_get_dqblk(struct super_block *sb, int type, qid_t id,
-		  struct fs_disk_quota *di)
+int dquot_get_dqblk(struct super_block *sb, int type, qid_t id,
+		    struct fs_disk_quota *di)
 {
 	struct dquot *dquot;
 
@@ -2326,7 +2298,7 @@ int vfs_get_dqblk(struct super_block *sb, int type, qid_t id,
 
 	return 0;
 }
-EXPORT_SYMBOL(vfs_get_dqblk);
+EXPORT_SYMBOL(dquot_get_dqblk);
 
 #define VFS_FS_DQ_MASK \
 	(FS_DQ_BCOUNT | FS_DQ_BSOFT | FS_DQ_BHARD | \
@@ -2425,7 +2397,7 @@ static int do_set_dqblk(struct dquot *dquot, struct fs_disk_quota *di)
 	return 0;
 }
 
-int vfs_set_dqblk(struct super_block *sb, int type, qid_t id,
-		  struct fs_disk_quota *di)
+int dquot_set_dqblk(struct super_block *sb, int type, qid_t id,
+		    struct fs_disk_quota *di)
 {
 	struct dquot *dquot;
@@ -2441,10 +2413,10 @@ int vfs_set_dqblk(struct super_block *sb, int type, qid_t id,
 out:
 	return rc;
 }
-EXPORT_SYMBOL(vfs_set_dqblk);
+EXPORT_SYMBOL(dquot_set_dqblk);
 
 /* Generic routine for getting common part of quota file information */
-int vfs_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
+int dquot_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
 {
 	struct mem_dqinfo *mi;
 
@@ -2463,10 +2435,10 @@ int vfs_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
 	mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
 	return 0;
 }
-EXPORT_SYMBOL(vfs_get_dqinfo);
+EXPORT_SYMBOL(dquot_get_dqinfo);
 
 /* Generic routine for setting common part of quota file information */
-int vfs_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
+int dquot_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii)
 {
 	struct mem_dqinfo *mi;
 	int err = 0;
@@ -2493,27 +2465,27 @@ out:
 	mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex);
 	return err;
 }
-EXPORT_SYMBOL(vfs_set_dqinfo);
+EXPORT_SYMBOL(dquot_set_dqinfo);
 
-const struct quotactl_ops vfs_quotactl_ops = {
-	.quota_on	= vfs_quota_on,
-	.quota_off	= vfs_quota_off,
-	.quota_sync	= vfs_quota_sync,
-	.get_info	= vfs_get_dqinfo,
-	.set_info	= vfs_set_dqinfo,
-	.get_dqblk	= vfs_get_dqblk,
-	.set_dqblk	= vfs_set_dqblk
+const struct quotactl_ops dquot_quotactl_ops = {
+	.quota_on	= dquot_quota_on,
+	.quota_off	= dquot_quota_off,
+	.quota_sync	= dquot_quota_sync,
+	.get_info	= dquot_get_dqinfo,
+	.set_info	= dquot_set_dqinfo,
+	.get_dqblk	= dquot_get_dqblk,
+	.set_dqblk	= dquot_set_dqblk
 };
-
+EXPORT_SYMBOL(dquot_quotactl_ops);
 
 static int do_proc_dqstats(struct ctl_table *table, int write,
 		void __user *buffer, size_t *lenp, loff_t *ppos)
 {
-#ifdef CONFIG_SMP
-	/* Update global table */
 	unsigned int type = (int *)table->data - dqstats.stat;
-	dqstats.stat[type] = dqstats_read(type);
-#endif
+
+	/* Update global table */
+	dqstats.stat[type] =
+	percpu_counter_sum_positive(&dqstats.counter[type]);
 	return proc_dointvec(table, write, buffer, lenp, ppos);
 }
 
@@ -2606,7 +2578,7 @@ static ctl_table sys_table[] = {
 
 static int __init dquot_init(void)
 {
-	int i;
+	int i, ret;
 	unsigned long nr_hash, order;
 
 	printk(KERN_NOTICE "VFS: Disk quotas %s\n", __DQUOT_VERSION__);
@@ -2624,12 +2596,11 @@ static int __init dquot_init(void)
 	if (!dquot_hash)
 		panic("Cannot create dquot hash table");
 
-#ifdef CONFIG_SMP
-	dqstats_pcpu = alloc_percpu(struct dqstats);
-	if (!dqstats_pcpu)
-		panic("Cannot create dquot stats table");
-#endif
-	memset(&dqstats, 0, sizeof(struct dqstats));
+	for (i = 0; i < _DQST_DQSTAT_LAST; i++) {
+		ret = percpu_counter_init(&dqstats.counter[i], 0);
+		if (ret)
+			panic("Cannot create dquot stat counters");
+	}
 
 	/* Find power-of-two hlist_heads which can fit into allocation */
 	nr_hash = (1UL << order) * PAGE_SIZE / sizeof(struct hlist_head);
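
After this rename the generic entry points are all exported, and dquot_suspend()/dquot_resume() replace the old remount special-casing inside quota_on/quota_off (passing type == -1 covers every quota type). A filesystem's remount path then reduces to roughly (sketch; mirrors what the reiserfs hunks below do):

static int example_remount(struct super_block *sb, int *flags, char *data)
{
	int err;

	if (*flags & MS_RDONLY) {
		/* going read-only: park quota state instead of disabling */
		err = dquot_suspend(sb, -1);	/* -1 = all types */
		if (err < 0)
			return err;
	} else {
		/* going read-write: re-enable whatever was suspended */
		dquot_resume(sb, -1);
	}
	return 0;
}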
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index ce3dfd066f59..b299961e1edb 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -73,7 +73,7 @@ static int quota_quotaon(struct super_block *sb, int type, int cmd, qid_t id,
 	if (IS_ERR(pathname))
 		return PTR_ERR(pathname);
 	if (sb->s_qcop->quota_on)
-		ret = sb->s_qcop->quota_on(sb, type, id, pathname, 0);
+		ret = sb->s_qcop->quota_on(sb, type, id, pathname);
 	putname(pathname);
 	return ret;
 }
@@ -260,7 +260,7 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id,
 	case Q_QUOTAOFF:
 		if (!sb->s_qcop->quota_off)
 			return -ENOSYS;
-		return sb->s_qcop->quota_off(sb, type, 0);
+		return sb->s_qcop->quota_off(sb, type);
 	case Q_GETFMT:
 		return quota_getfmt(sb, type, addr);
 	case Q_GETINFO:
diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c
index 78f613cb9c76..4884ac5ae9be 100644
--- a/fs/ramfs/file-mmu.c
+++ b/fs/ramfs/file-mmu.c
@@ -43,12 +43,13 @@ const struct file_operations ramfs_file_operations = {
 	.write		= do_sync_write,
 	.aio_write	= generic_file_aio_write,
 	.mmap		= generic_file_mmap,
-	.fsync		= simple_sync_file,
+	.fsync		= noop_fsync,
 	.splice_read	= generic_file_splice_read,
 	.splice_write	= generic_file_splice_write,
 	.llseek		= generic_file_llseek,
 };
 
 const struct inode_operations ramfs_file_inode_operations = {
+	.setattr	= simple_setattr,
 	.getattr	= simple_getattr,
 };
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 5ea4ad81a429..d532c20fc179 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -42,7 +42,7 @@ const struct file_operations ramfs_file_operations = {
42 .aio_read = generic_file_aio_read, 42 .aio_read = generic_file_aio_read,
43 .write = do_sync_write, 43 .write = do_sync_write,
44 .aio_write = generic_file_aio_write, 44 .aio_write = generic_file_aio_write,
45 .fsync = simple_sync_file, 45 .fsync = noop_fsync,
46 .splice_read = generic_file_splice_read, 46 .splice_read = generic_file_splice_read,
47 .splice_write = generic_file_splice_write, 47 .splice_write = generic_file_splice_write,
48 .llseek = generic_file_llseek, 48 .llseek = generic_file_llseek,
@@ -146,7 +146,7 @@ static int ramfs_nommu_resize(struct inode *inode, loff_t newsize, loff_t size)
146 return ret; 146 return ret;
147 } 147 }
148 148
149 ret = vmtruncate(inode, newsize); 149 ret = simple_setsize(inode, newsize);
150 150
151 return ret; 151 return ret;
152} 152}
@@ -169,7 +169,8 @@ static int ramfs_nommu_setattr(struct dentry *dentry, struct iattr *ia)
169 169
170 /* pick out size-changing events */ 170 /* pick out size-changing events */
171 if (ia->ia_valid & ATTR_SIZE) { 171 if (ia->ia_valid & ATTR_SIZE) {
172 loff_t size = i_size_read(inode); 172 loff_t size = inode->i_size;
173
173 if (ia->ia_size != size) { 174 if (ia->ia_size != size) {
174 ret = ramfs_nommu_resize(inode, ia->ia_size, size); 175 ret = ramfs_nommu_resize(inode, ia->ia_size, size);
175 if (ret < 0 || ia->ia_valid == ATTR_SIZE) 176 if (ret < 0 || ia->ia_valid == ATTR_SIZE)
@@ -182,7 +183,7 @@ static int ramfs_nommu_setattr(struct dentry *dentry, struct iattr *ia)
182 } 183 }
183 } 184 }
184 185
185 ret = inode_setattr(inode, ia); 186 generic_setattr(inode, ia);
186 out: 187 out:
187 ia->ia_valid = old_ia_valid; 188 ia->ia_valid = old_ia_valid;
188 return ret; 189 return ret;
diff --git a/fs/read_write.c b/fs/read_write.c
index 113386d6fd2d..9c0485236e68 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -97,6 +97,23 @@ loff_t generic_file_llseek(struct file *file, loff_t offset, int origin)
 }
 EXPORT_SYMBOL(generic_file_llseek);
 
+/**
+ * noop_llseek - No Operation Performed llseek implementation
+ * @file:	file structure to seek on
+ * @offset:	file offset to seek to
+ * @origin:	type of seek
+ *
+ * This is an implementation of ->llseek useable for the rare special case when
+ * userspace expects the seek to succeed but the (device) file is actually not
+ * able to perform the seek. In this case you use noop_llseek() instead of
+ * falling back to the default implementation of ->llseek.
+ */
+loff_t noop_llseek(struct file *file, loff_t offset, int origin)
+{
+	return file->f_pos;
+}
+EXPORT_SYMBOL(noop_llseek);
+
 loff_t no_llseek(struct file *file, loff_t offset, int origin)
 {
 	return -ESPIPE;
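
noop_llseek() succeeds while leaving f_pos untouched, for (device) files where userspace expects lseek() to succeed although seeking means nothing; contrast no_llseek() just above, which returns -ESPIPE. Wiring it up in a hypothetical driver:

static const struct file_operations example_dev_fops = {
	.owner	= THIS_MODULE,
	.read	= example_read,		/* hypothetical handlers */
	.write	= example_write,
	/* lseek() "succeeds" but never moves the position; use
	 * no_llseek instead if callers should see -ESPIPE */
	.llseek	= noop_llseek,
};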
diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
index 07930449a958..198dabf1b2bb 100644
--- a/fs/reiserfs/dir.c
+++ b/fs/reiserfs/dir.c
@@ -14,10 +14,10 @@
 extern const struct reiserfs_key MIN_KEY;
 
 static int reiserfs_readdir(struct file *, void *, filldir_t);
-static int reiserfs_dir_fsync(struct file *filp, struct dentry *dentry,
-			      int datasync);
+static int reiserfs_dir_fsync(struct file *filp, int datasync);
 
 const struct file_operations reiserfs_dir_operations = {
+	.llseek = generic_file_llseek,
 	.read = generic_read_dir,
 	.readdir = reiserfs_readdir,
 	.fsync = reiserfs_dir_fsync,
@@ -27,10 +27,9 @@ const struct file_operations reiserfs_dir_operations = {
 #endif
 };
 
-static int reiserfs_dir_fsync(struct file *filp, struct dentry *dentry,
-			      int datasync)
+static int reiserfs_dir_fsync(struct file *filp, int datasync)
 {
-	struct inode *inode = dentry->d_inode;
+	struct inode *inode = filp->f_mapping->host;
 	int err;
 	reiserfs_write_lock(inode->i_sb);
 	err = reiserfs_commit_for_inode(inode);
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 9977df9f3a54..b82cdd8a45dd 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -134,10 +134,9 @@ static void reiserfs_vfs_truncate_file(struct inode *inode)
134 * be removed... 134 * be removed...
135 */ 135 */
136 136
137static int reiserfs_sync_file(struct file *filp, 137static int reiserfs_sync_file(struct file *filp, int datasync)
138 struct dentry *dentry, int datasync)
139{ 138{
140 struct inode *inode = dentry->d_inode; 139 struct inode *inode = filp->f_mapping->host;
141 int err; 140 int err;
142 int barrier_done; 141 int barrier_done;
143 142
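
These reiserfs hunks (like the smbfs ones further down) track the VFS-wide ->fsync prototype change: the dentry argument is gone, so implementations recover the inode from file->f_mapping->host. The new method shape (sketch):

/* new prototype: int (*fsync)(struct file *, int datasync) */
static int example_fsync(struct file *filp, int datasync)
{
	/* no dentry parameter any more; the mapping knows the inode */
	struct inode *inode = filp->f_mapping->host;

	/* ... write back file data, then metadata; when datasync is
	 * set, non-essential metadata may be skipped ... */
	return 0;
}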
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 59125fb36d42..9822fa15118b 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -158,6 +158,7 @@ static int finish_unfinished(struct super_block *s)
 #ifdef CONFIG_QUOTA
 	int i;
 	int ms_active_set;
+	int quota_enabled[MAXQUOTAS];
 #endif
 
 	/* compose key to look for "save" links */
@@ -179,8 +180,15 @@ static int finish_unfinished(struct super_block *s)
 	}
 	/* Turn on quotas so that they are updated correctly */
 	for (i = 0; i < MAXQUOTAS; i++) {
+		quota_enabled[i] = 1;
 		if (REISERFS_SB(s)->s_qf_names[i]) {
-			int ret = reiserfs_quota_on_mount(s, i);
+			int ret;
+
+			if (sb_has_quota_active(s, i)) {
+				quota_enabled[i] = 0;
+				continue;
+			}
+			ret = reiserfs_quota_on_mount(s, i);
 			if (ret < 0)
 				reiserfs_warning(s, "reiserfs-2500",
 						 "cannot turn on journaled "
@@ -304,8 +312,8 @@ static int finish_unfinished(struct super_block *s)
 #ifdef CONFIG_QUOTA
 	/* Turn quotas off */
 	for (i = 0; i < MAXQUOTAS; i++) {
-		if (sb_dqopt(s)->files[i])
-			vfs_quota_off(s, i, 0);
+		if (sb_dqopt(s)->files[i] && quota_enabled[i])
+			dquot_quota_off(s, i);
 	}
 	if (ms_active_set)
 		/* Restore the flag back */
@@ -466,6 +474,8 @@ static void reiserfs_put_super(struct super_block *s)
 	struct reiserfs_transaction_handle th;
 	th.t_trans_id = 0;
 
+	dquot_disable(s, -1, DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED);
+
 	reiserfs_write_lock(s);
 
 	if (s->s_dirt)
@@ -620,7 +630,7 @@ static int reiserfs_acquire_dquot(struct dquot *);
 static int reiserfs_release_dquot(struct dquot *);
 static int reiserfs_mark_dquot_dirty(struct dquot *);
 static int reiserfs_write_info(struct super_block *, int);
-static int reiserfs_quota_on(struct super_block *, int, int, char *, int);
+static int reiserfs_quota_on(struct super_block *, int, int, char *);
 
 static const struct dquot_operations reiserfs_quota_operations = {
 	.write_dquot = reiserfs_write_dquot,
@@ -634,12 +644,12 @@ static const struct dquot_operations reiserfs_quota_operations = {
 
 static const struct quotactl_ops reiserfs_qctl_operations = {
 	.quota_on = reiserfs_quota_on,
-	.quota_off = vfs_quota_off,
-	.quota_sync = vfs_quota_sync,
-	.get_info = vfs_get_dqinfo,
-	.set_info = vfs_set_dqinfo,
-	.get_dqblk = vfs_get_dqblk,
-	.set_dqblk = vfs_set_dqblk,
+	.quota_off = dquot_quota_off,
+	.quota_sync = dquot_quota_sync,
+	.get_info = dquot_get_dqinfo,
+	.set_info = dquot_set_dqinfo,
+	.get_dqblk = dquot_get_dqblk,
+	.set_dqblk = dquot_set_dqblk,
 };
 #endif
 
@@ -1242,6 +1252,11 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
 		if (s->s_flags & MS_RDONLY)
 			/* it is read-only already */
 			goto out_ok;
+
+		err = dquot_suspend(s, -1);
+		if (err < 0)
+			goto out_err;
+
 		/* try to remount file system with read-only permissions */
 		if (sb_umount_state(rs) == REISERFS_VALID_FS
 		    || REISERFS_SB(s)->s_mount_state != REISERFS_VALID_FS) {
@@ -1295,6 +1310,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg)
 	s->s_dirt = 0;
 
 	if (!(*mount_flags & MS_RDONLY)) {
+		dquot_resume(s, -1);
 		finish_unfinished(s);
 		reiserfs_xattr_init(s, *mount_flags);
 	}
@@ -2022,15 +2038,15 @@ static int reiserfs_write_info(struct super_block *sb, int type)
  */
 static int reiserfs_quota_on_mount(struct super_block *sb, int type)
 {
-	return vfs_quota_on_mount(sb, REISERFS_SB(sb)->s_qf_names[type],
-				  REISERFS_SB(sb)->s_jquota_fmt, type);
+	return dquot_quota_on_mount(sb, REISERFS_SB(sb)->s_qf_names[type],
+					REISERFS_SB(sb)->s_jquota_fmt, type);
 }
 
 /*
  * Standard function to be called on quota_on
  */
 static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
-			     char *name, int remount)
+			     char *name)
 {
 	int err;
 	struct path path;
@@ -2039,9 +2055,7 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
 
 	if (!(REISERFS_SB(sb)->s_mount_opt & (1 << REISERFS_QUOTA)))
 		return -EINVAL;
-	/* No more checks needed? Path and format_id are bogus anyway... */
-	if (remount)
-		return vfs_quota_on(sb, type, format_id, name, 1);
+
 	err = kern_path(name, LOOKUP_FOLLOW, &path);
 	if (err)
 		return err;
@@ -2085,7 +2099,7 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id,
 		if (err)
 			goto out;
 	}
-	err = vfs_quota_on_path(sb, type, format_id, &path);
+	err = dquot_quota_on_path(sb, type, format_id, &path);
 out:
 	path_put(&path);
 	return err;
diff --git a/fs/smbfs/dir.c b/fs/smbfs/dir.c
index 6c978428892d..00a70cab1f36 100644
--- a/fs/smbfs/dir.c
+++ b/fs/smbfs/dir.c
@@ -37,6 +37,7 @@ static int smb_link(struct dentry *, struct inode *, struct dentry *);
 
 const struct file_operations smb_dir_operations =
 {
+	.llseek = generic_file_llseek,
 	.read = generic_read_dir,
 	.readdir = smb_readdir,
 	.unlocked_ioctl = smb_ioctl,
diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c
index 84ecf0e43f91..8e187a0f94bb 100644
--- a/fs/smbfs/file.c
+++ b/fs/smbfs/file.c
@@ -28,8 +28,9 @@
28#include "proto.h" 28#include "proto.h"
29 29
30static int 30static int
31smb_fsync(struct file *file, struct dentry * dentry, int datasync) 31smb_fsync(struct file *file, int datasync)
32{ 32{
33 struct dentry *dentry = file->f_path.dentry;
33 struct smb_sb_info *server = server_from_dentry(dentry); 34 struct smb_sb_info *server = server_from_dentry(dentry);
34 int result; 35 int result;
35 36
diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c
index dfa1d67f8fca..9551cb6f7fe4 100644
--- a/fs/smbfs/inode.c
+++ b/fs/smbfs/inode.c
@@ -714,7 +714,7 @@ smb_notify_change(struct dentry *dentry, struct iattr *attr)
714 error = server->ops->truncate(inode, attr->ia_size); 714 error = server->ops->truncate(inode, attr->ia_size);
715 if (error) 715 if (error)
716 goto out; 716 goto out;
717 error = vmtruncate(inode, attr->ia_size); 717 error = simple_setsize(inode, attr->ia_size);
718 if (error) 718 if (error)
719 goto out; 719 goto out;
720 refresh = 1; 720 refresh = 1;
diff --git a/fs/splice.c b/fs/splice.c
index ac22b00d86c3..740e6b9faf7a 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -354,7 +354,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
354 break; 354 break;
355 355
356 error = add_to_page_cache_lru(page, mapping, index, 356 error = add_to_page_cache_lru(page, mapping, index,
357 mapping_gfp_mask(mapping)); 357 GFP_KERNEL);
358 if (unlikely(error)) { 358 if (unlikely(error)) {
359 page_cache_release(page); 359 page_cache_release(page);
360 if (error == -EEXIST) 360 if (error == -EEXIST)
diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig
index 25a00d19d686..cc6ce8a84c21 100644
--- a/fs/squashfs/Kconfig
+++ b/fs/squashfs/Kconfig
@@ -26,6 +26,17 @@ config SQUASHFS
 
 	  If unsure, say N.
 
+config SQUASHFS_XATTRS
+	bool "Squashfs XATTR support"
+	depends on SQUASHFS
+	default n
+	help
+	  Saying Y here includes support for extended attributes (xattrs).
+	  Xattrs are name:value pairs associated with inodes by
+	  the kernel or by users (see the attr(5) manual page).
+
+	  If unsure, say N.
+
 config SQUASHFS_EMBEDDED
 
 	bool "Additional option for memory-constrained systems"
diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile
index df8a19ef870d..2cee3e9fa452 100644
--- a/fs/squashfs/Makefile
+++ b/fs/squashfs/Makefile
@@ -5,3 +5,5 @@
 obj-$(CONFIG_SQUASHFS) += squashfs.o
 squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o
 squashfs-y += namei.o super.o symlink.o zlib_wrapper.o decompressor.o
+squashfs-$(CONFIG_SQUASHFS_XATTRS) += xattr.o xattr_id.o
+
diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c
index 49daaf669e41..62e63ad25075 100644
--- a/fs/squashfs/inode.c
+++ b/fs/squashfs/inode.c
@@ -40,11 +40,13 @@
40 40
41#include <linux/fs.h> 41#include <linux/fs.h>
42#include <linux/vfs.h> 42#include <linux/vfs.h>
43#include <linux/xattr.h>
43 44
44#include "squashfs_fs.h" 45#include "squashfs_fs.h"
45#include "squashfs_fs_sb.h" 46#include "squashfs_fs_sb.h"
46#include "squashfs_fs_i.h" 47#include "squashfs_fs_i.h"
47#include "squashfs.h" 48#include "squashfs.h"
49#include "xattr.h"
48 50
49/* 51/*
50 * Initialise VFS inode with the base inode information common to all 52 * Initialise VFS inode with the base inode information common to all
@@ -111,6 +113,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
111 int err, type, offset = SQUASHFS_INODE_OFFSET(ino); 113 int err, type, offset = SQUASHFS_INODE_OFFSET(ino);
112 union squashfs_inode squashfs_ino; 114 union squashfs_inode squashfs_ino;
113 struct squashfs_base_inode *sqshb_ino = &squashfs_ino.base; 115 struct squashfs_base_inode *sqshb_ino = &squashfs_ino.base;
116 int xattr_id = SQUASHFS_INVALID_XATTR;
114 117
115 TRACE("Entered squashfs_read_inode\n"); 118 TRACE("Entered squashfs_read_inode\n");
116 119
@@ -199,8 +202,10 @@ int squashfs_read_inode(struct inode *inode, long long ino)
199 frag_offset = 0; 202 frag_offset = 0;
200 } 203 }
201 204
205 xattr_id = le32_to_cpu(sqsh_ino->xattr);
202 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); 206 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
203 inode->i_size = le64_to_cpu(sqsh_ino->file_size); 207 inode->i_size = le64_to_cpu(sqsh_ino->file_size);
208 inode->i_op = &squashfs_inode_ops;
204 inode->i_fop = &generic_ro_fops; 209 inode->i_fop = &generic_ro_fops;
205 inode->i_mode |= S_IFREG; 210 inode->i_mode |= S_IFREG;
206 inode->i_blocks = ((inode->i_size - 211 inode->i_blocks = ((inode->i_size -
@@ -251,6 +256,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
251 if (err < 0) 256 if (err < 0)
252 goto failed_read; 257 goto failed_read;
253 258
259 xattr_id = le32_to_cpu(sqsh_ino->xattr);
254 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); 260 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
255 inode->i_size = le32_to_cpu(sqsh_ino->file_size); 261 inode->i_size = le32_to_cpu(sqsh_ino->file_size);
256 inode->i_op = &squashfs_dir_inode_ops; 262 inode->i_op = &squashfs_dir_inode_ops;
@@ -280,21 +286,33 @@ int squashfs_read_inode(struct inode *inode, long long ino)
280 286
281 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); 287 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
282 inode->i_size = le32_to_cpu(sqsh_ino->symlink_size); 288 inode->i_size = le32_to_cpu(sqsh_ino->symlink_size);
283 inode->i_op = &page_symlink_inode_operations; 289 inode->i_op = &squashfs_symlink_inode_ops;
284 inode->i_data.a_ops = &squashfs_symlink_aops; 290 inode->i_data.a_ops = &squashfs_symlink_aops;
285 inode->i_mode |= S_IFLNK; 291 inode->i_mode |= S_IFLNK;
286 squashfs_i(inode)->start = block; 292 squashfs_i(inode)->start = block;
287 squashfs_i(inode)->offset = offset; 293 squashfs_i(inode)->offset = offset;
288 294
295 if (type == SQUASHFS_LSYMLINK_TYPE) {
296 __le32 xattr;
297
298 err = squashfs_read_metadata(sb, NULL, &block,
299 &offset, inode->i_size);
300 if (err < 0)
301 goto failed_read;
302 err = squashfs_read_metadata(sb, &xattr, &block,
303 &offset, sizeof(xattr));
304 if (err < 0)
305 goto failed_read;
306 xattr_id = le32_to_cpu(xattr);
307 }
308
289 TRACE("Symbolic link inode %x:%x, start_block %llx, offset " 309 TRACE("Symbolic link inode %x:%x, start_block %llx, offset "
290 "%x\n", SQUASHFS_INODE_BLK(ino), offset, 310 "%x\n", SQUASHFS_INODE_BLK(ino), offset,
291 block, offset); 311 block, offset);
292 break; 312 break;
293 } 313 }
294 case SQUASHFS_BLKDEV_TYPE: 314 case SQUASHFS_BLKDEV_TYPE:
295 case SQUASHFS_CHRDEV_TYPE: 315 case SQUASHFS_CHRDEV_TYPE: {
296 case SQUASHFS_LBLKDEV_TYPE:
297 case SQUASHFS_LCHRDEV_TYPE: {
298 struct squashfs_dev_inode *sqsh_ino = &squashfs_ino.dev; 316 struct squashfs_dev_inode *sqsh_ino = &squashfs_ino.dev;
299 unsigned int rdev; 317 unsigned int rdev;
300 318
@@ -315,10 +333,32 @@ int squashfs_read_inode(struct inode *inode, long long ino)
315 SQUASHFS_INODE_BLK(ino), offset, rdev); 333 SQUASHFS_INODE_BLK(ino), offset, rdev);
316 break; 334 break;
317 } 335 }
336 case SQUASHFS_LBLKDEV_TYPE:
337 case SQUASHFS_LCHRDEV_TYPE: {
338 struct squashfs_ldev_inode *sqsh_ino = &squashfs_ino.ldev;
339 unsigned int rdev;
340
341 err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
342 sizeof(*sqsh_ino));
343 if (err < 0)
344 goto failed_read;
345
346 if (type == SQUASHFS_LCHRDEV_TYPE)
347 inode->i_mode |= S_IFCHR;
348 else
349 inode->i_mode |= S_IFBLK;
350 xattr_id = le32_to_cpu(sqsh_ino->xattr);
351 inode->i_op = &squashfs_inode_ops;
352 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
353 rdev = le32_to_cpu(sqsh_ino->rdev);
354 init_special_inode(inode, inode->i_mode, new_decode_dev(rdev));
355
356 TRACE("Device inode %x:%x, rdev %x\n",
357 SQUASHFS_INODE_BLK(ino), offset, rdev);
358 break;
359 }
318 case SQUASHFS_FIFO_TYPE: 360 case SQUASHFS_FIFO_TYPE:
319 case SQUASHFS_SOCKET_TYPE: 361 case SQUASHFS_SOCKET_TYPE: {
320 case SQUASHFS_LFIFO_TYPE:
321 case SQUASHFS_LSOCKET_TYPE: {
322 struct squashfs_ipc_inode *sqsh_ino = &squashfs_ino.ipc; 362 struct squashfs_ipc_inode *sqsh_ino = &squashfs_ino.ipc;
323 363
324 err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, 364 err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
@@ -334,14 +374,52 @@ int squashfs_read_inode(struct inode *inode, long long ino)
334 init_special_inode(inode, inode->i_mode, 0); 374 init_special_inode(inode, inode->i_mode, 0);
335 break; 375 break;
336 } 376 }
377 case SQUASHFS_LFIFO_TYPE:
378 case SQUASHFS_LSOCKET_TYPE: {
379 struct squashfs_lipc_inode *sqsh_ino = &squashfs_ino.lipc;
380
381 err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset,
382 sizeof(*sqsh_ino));
383 if (err < 0)
384 goto failed_read;
385
386 if (type == SQUASHFS_LFIFO_TYPE)
387 inode->i_mode |= S_IFIFO;
388 else
389 inode->i_mode |= S_IFSOCK;
390 xattr_id = le32_to_cpu(sqsh_ino->xattr);
391 inode->i_op = &squashfs_inode_ops;
392 inode->i_nlink = le32_to_cpu(sqsh_ino->nlink);
393 init_special_inode(inode, inode->i_mode, 0);
394 break;
395 }
337 default: 396 default:
338 ERROR("Unknown inode type %d in squashfs_iget!\n", type); 397 ERROR("Unknown inode type %d in squashfs_iget!\n", type);
339 return -EINVAL; 398 return -EINVAL;
340 } 399 }
341 400
401 if (xattr_id != SQUASHFS_INVALID_XATTR && msblk->xattr_id_table) {
402 err = squashfs_xattr_lookup(sb, xattr_id,
403 &squashfs_i(inode)->xattr_count,
404 &squashfs_i(inode)->xattr_size,
405 &squashfs_i(inode)->xattr);
406 if (err < 0)
407 goto failed_read;
408 inode->i_blocks += ((squashfs_i(inode)->xattr_size - 1) >> 9)
409 + 1;
410 } else
411 squashfs_i(inode)->xattr_count = 0;
412
342 return 0; 413 return 0;
343 414
344failed_read: 415failed_read:
345 ERROR("Unable to read inode 0x%llx\n", ino); 416 ERROR("Unable to read inode 0x%llx\n", ino);
346 return err; 417 return err;
347} 418}
419
420
421const struct inode_operations squashfs_inode_ops = {
422 .getxattr = generic_getxattr,
423 .listxattr = squashfs_listxattr
424};
425
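The i_blocks update above charges an inode's xattr data to the inode in
512-byte units, hence the ((xattr_size - 1) >> 9) + 1 rounding. With
hypothetical sizes:

	xattr_size = 1   -> ((1   - 1) >> 9) + 1 = 1 sector
	xattr_size = 512 -> ((512 - 1) >> 9) + 1 = 1 sector
	xattr_size = 513 -> ((513 - 1) >> 9) + 1 = 2 sectors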
diff --git a/fs/squashfs/namei.c b/fs/squashfs/namei.c
index 5266bd8ad932..7a9464d08cf6 100644
--- a/fs/squashfs/namei.c
+++ b/fs/squashfs/namei.c
@@ -57,11 +57,13 @@
57#include <linux/slab.h> 57#include <linux/slab.h>
58#include <linux/string.h> 58#include <linux/string.h>
59#include <linux/dcache.h> 59#include <linux/dcache.h>
60#include <linux/xattr.h>
60 61
61#include "squashfs_fs.h" 62#include "squashfs_fs.h"
62#include "squashfs_fs_sb.h" 63#include "squashfs_fs_sb.h"
63#include "squashfs_fs_i.h" 64#include "squashfs_fs_i.h"
64#include "squashfs.h" 65#include "squashfs.h"
66#include "xattr.h"
65 67
66/* 68/*
67 * Lookup name in the directory index, returning the location of the metadata 69 * Lookup name in the directory index, returning the location of the metadata
@@ -237,5 +239,7 @@ failed:
237 239
238 240
239const struct inode_operations squashfs_dir_inode_ops = { 241const struct inode_operations squashfs_dir_inode_ops = {
240 .lookup = squashfs_lookup 242 .lookup = squashfs_lookup,
243 .getxattr = generic_getxattr,
244 .listxattr = squashfs_listxattr
241}; 245};
diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h
index fe2587af5512..733a17c42945 100644
--- a/fs/squashfs/squashfs.h
+++ b/fs/squashfs/squashfs.h
@@ -73,8 +73,11 @@ extern struct inode *squashfs_iget(struct super_block *, long long,
73 unsigned int); 73 unsigned int);
74extern int squashfs_read_inode(struct inode *, long long); 74extern int squashfs_read_inode(struct inode *, long long);
75 75
76/* xattr.c */
77extern ssize_t squashfs_listxattr(struct dentry *, char *, size_t);
78
76/* 79/*
77 * Inodes, files and decompressor operations 80 * Inodes, files, decompressor and xattr operations
78 */ 81 */
79 82
80/* dir.c */ 83/* dir.c */
@@ -86,11 +89,18 @@ extern const struct export_operations squashfs_export_ops;
86/* file.c */ 89/* file.c */
87extern const struct address_space_operations squashfs_aops; 90extern const struct address_space_operations squashfs_aops;
88 91
92/* inode.c */
93extern const struct inode_operations squashfs_inode_ops;
94
89/* namei.c */ 95/* namei.c */
90extern const struct inode_operations squashfs_dir_inode_ops; 96extern const struct inode_operations squashfs_dir_inode_ops;
91 97
92/* symlink.c */ 98/* symlink.c */
93extern const struct address_space_operations squashfs_symlink_aops; 99extern const struct address_space_operations squashfs_symlink_aops;
100extern const struct inode_operations squashfs_symlink_inode_ops;
101
102/* xattr.c */
103extern const struct xattr_handler *squashfs_xattr_handlers[];
94 104
95/* zlib_wrapper.c */ 105/* zlib_wrapper.c */
96extern const struct squashfs_decompressor squashfs_zlib_comp_ops; 106extern const struct squashfs_decompressor squashfs_zlib_comp_ops;
diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h
index 79024245ea00..8eabb808b78d 100644
--- a/fs/squashfs/squashfs_fs.h
+++ b/fs/squashfs/squashfs_fs.h
@@ -46,6 +46,7 @@
46#define SQUASHFS_NAME_LEN 256 46#define SQUASHFS_NAME_LEN 256
47 47
48#define SQUASHFS_INVALID_FRAG (0xffffffffU) 48#define SQUASHFS_INVALID_FRAG (0xffffffffU)
49#define SQUASHFS_INVALID_XATTR (0xffffffffU)
49#define SQUASHFS_INVALID_BLK (-1LL) 50#define SQUASHFS_INVALID_BLK (-1LL)
50 51
51/* Filesystem flags */ 52/* Filesystem flags */
@@ -96,6 +97,13 @@
96#define SQUASHFS_LFIFO_TYPE 13 97#define SQUASHFS_LFIFO_TYPE 13
97#define SQUASHFS_LSOCKET_TYPE 14 98#define SQUASHFS_LSOCKET_TYPE 14
98 99
100/* Xattr types */
101#define SQUASHFS_XATTR_USER 0
102#define SQUASHFS_XATTR_TRUSTED 1
103#define SQUASHFS_XATTR_SECURITY 2
104#define SQUASHFS_XATTR_VALUE_OOL 256
105#define SQUASHFS_XATTR_PREFIX_MASK 0xff
106
99/* Flag whether block is compressed or uncompressed, bit is set if block is 107/* Flag whether block is compressed or uncompressed, bit is set if block is
100 * uncompressed */ 108 * uncompressed */
101#define SQUASHFS_COMPRESSED_BIT (1 << 15) 109#define SQUASHFS_COMPRESSED_BIT (1 << 15)
@@ -174,6 +182,24 @@
174 182
175#define SQUASHFS_ID_BLOCK_BYTES(A) (SQUASHFS_ID_BLOCKS(A) *\ 183#define SQUASHFS_ID_BLOCK_BYTES(A) (SQUASHFS_ID_BLOCKS(A) *\
176 sizeof(u64)) 184 sizeof(u64))
185/* xattr id lookup table defines */
186#define SQUASHFS_XATTR_BYTES(A) ((A) * sizeof(struct squashfs_xattr_id))
187
188#define SQUASHFS_XATTR_BLOCK(A) (SQUASHFS_XATTR_BYTES(A) / \
189 SQUASHFS_METADATA_SIZE)
190
191#define SQUASHFS_XATTR_BLOCK_OFFSET(A) (SQUASHFS_XATTR_BYTES(A) % \
192 SQUASHFS_METADATA_SIZE)
193
194#define SQUASHFS_XATTR_BLOCKS(A) ((SQUASHFS_XATTR_BYTES(A) + \
195 SQUASHFS_METADATA_SIZE - 1) / \
196 SQUASHFS_METADATA_SIZE)
197
198#define SQUASHFS_XATTR_BLOCK_BYTES(A) (SQUASHFS_XATTR_BLOCKS(A) *\
199 sizeof(u64))
200#define SQUASHFS_XATTR_BLK(A) ((unsigned int) ((A) >> 16))
201
202#define SQUASHFS_XATTR_OFFSET(A) ((unsigned int) ((A) & 0xffff))
177 203
178/* cached data constants for filesystem */ 204/* cached data constants for filesystem */
179#define SQUASHFS_CACHED_BLKS 8 205#define SQUASHFS_CACHED_BLKS 8
@@ -228,7 +254,7 @@ struct squashfs_super_block {
228 __le64 root_inode; 254 __le64 root_inode;
229 __le64 bytes_used; 255 __le64 bytes_used;
230 __le64 id_table_start; 256 __le64 id_table_start;
231 __le64 xattr_table_start; 257 __le64 xattr_id_table_start;
232 __le64 inode_table_start; 258 __le64 inode_table_start;
233 __le64 directory_table_start; 259 __le64 directory_table_start;
234 __le64 fragment_table_start; 260 __le64 fragment_table_start;
@@ -261,6 +287,17 @@ struct squashfs_ipc_inode {
261 __le32 nlink; 287 __le32 nlink;
262}; 288};
263 289
290struct squashfs_lipc_inode {
291 __le16 inode_type;
292 __le16 mode;
293 __le16 uid;
294 __le16 guid;
295 __le32 mtime;
296 __le32 inode_number;
297 __le32 nlink;
298 __le32 xattr;
299};
300
264struct squashfs_dev_inode { 301struct squashfs_dev_inode {
265 __le16 inode_type; 302 __le16 inode_type;
266 __le16 mode; 303 __le16 mode;
@@ -272,6 +309,18 @@ struct squashfs_dev_inode {
272 __le32 rdev; 309 __le32 rdev;
273}; 310};
274 311
312struct squashfs_ldev_inode {
313 __le16 inode_type;
314 __le16 mode;
315 __le16 uid;
316 __le16 guid;
317 __le32 mtime;
318 __le32 inode_number;
319 __le32 nlink;
320 __le32 rdev;
321 __le32 xattr;
322};
323
275struct squashfs_symlink_inode { 324struct squashfs_symlink_inode {
276 __le16 inode_type; 325 __le16 inode_type;
277 __le16 mode; 326 __le16 mode;
@@ -349,12 +398,14 @@ struct squashfs_ldir_inode {
349union squashfs_inode { 398union squashfs_inode {
350 struct squashfs_base_inode base; 399 struct squashfs_base_inode base;
351 struct squashfs_dev_inode dev; 400 struct squashfs_dev_inode dev;
401 struct squashfs_ldev_inode ldev;
352 struct squashfs_symlink_inode symlink; 402 struct squashfs_symlink_inode symlink;
353 struct squashfs_reg_inode reg; 403 struct squashfs_reg_inode reg;
354 struct squashfs_lreg_inode lreg; 404 struct squashfs_lreg_inode lreg;
355 struct squashfs_dir_inode dir; 405 struct squashfs_dir_inode dir;
356 struct squashfs_ldir_inode ldir; 406 struct squashfs_ldir_inode ldir;
357 struct squashfs_ipc_inode ipc; 407 struct squashfs_ipc_inode ipc;
408 struct squashfs_lipc_inode lipc;
358}; 409};
359 410
360struct squashfs_dir_entry { 411struct squashfs_dir_entry {
@@ -377,4 +428,27 @@ struct squashfs_fragment_entry {
377 unsigned int unused; 428 unsigned int unused;
378}; 429};
379 430
431struct squashfs_xattr_entry {
432 __le16 type;
433 __le16 size;
434 char data[0];
435};
436
437struct squashfs_xattr_val {
438 __le32 vsize;
439 char value[0];
440};
441
442struct squashfs_xattr_id {
443 __le64 xattr;
444 __le32 count;
445 __le32 size;
446};
447
448struct squashfs_xattr_id_table {
449 __le64 xattr_table_start;
450 __le32 xattr_ids;
451 __le32 unused;
452};
453
380#endif 454#endif
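The SQUASHFS_XATTR_BLK/SQUASHFS_XATTR_OFFSET macros added above treat a
64-bit xattr reference as a packed pair: the top 48 bits give the position
of the metadata block (relative to the start of the xattr table, to which
xattr.c later adds msblk->xattr_table), the bottom 16 bits the byte offset
within the uncompressed block. A minimal standalone sketch of that
arithmetic, not kernel code, with a made-up sample value:

	#include <stdint.h>
	#include <stdio.h>

	static unsigned int xattr_blk(uint64_t ref)	/* SQUASHFS_XATTR_BLK */
	{
		return (unsigned int)(ref >> 16);
	}

	static unsigned int xattr_off(uint64_t ref)	/* SQUASHFS_XATTR_OFFSET */
	{
		return (unsigned int)(ref & 0xffff);
	}

	int main(void)
	{
		uint64_t ref = ((uint64_t)0x1a000 << 16) | 0x123; /* hypothetical */

		printf("metadata block %#x, offset %#x\n",
		       xattr_blk(ref), xattr_off(ref));
		return 0;
	}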
diff --git a/fs/squashfs/squashfs_fs_i.h b/fs/squashfs/squashfs_fs_i.h
index fbfca30c0c68..d3e3a37f28a1 100644
--- a/fs/squashfs/squashfs_fs_i.h
+++ b/fs/squashfs/squashfs_fs_i.h
@@ -26,6 +26,9 @@
26struct squashfs_inode_info { 26struct squashfs_inode_info {
27 u64 start; 27 u64 start;
28 int offset; 28 int offset;
29 u64 xattr;
30 unsigned int xattr_size;
31 int xattr_count;
29 union { 32 union {
30 struct { 33 struct {
31 u64 fragment_block; 34 u64 fragment_block;
diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h
index 2e77dc547e25..d9037a5215f0 100644
--- a/fs/squashfs/squashfs_fs_sb.h
+++ b/fs/squashfs/squashfs_fs_sb.h
@@ -61,6 +61,7 @@ struct squashfs_sb_info {
61 int next_meta_index; 61 int next_meta_index;
62 __le64 *id_table; 62 __le64 *id_table;
63 __le64 *fragment_index; 63 __le64 *fragment_index;
64 __le64 *xattr_id_table;
64 struct mutex read_data_mutex; 65 struct mutex read_data_mutex;
65 struct mutex meta_index_mutex; 66 struct mutex meta_index_mutex;
66 struct meta_index *meta_index; 67 struct meta_index *meta_index;
@@ -68,9 +69,11 @@ struct squashfs_sb_info {
68 __le64 *inode_lookup_table; 69 __le64 *inode_lookup_table;
69 u64 inode_table; 70 u64 inode_table;
70 u64 directory_table; 71 u64 directory_table;
72 u64 xattr_table;
71 unsigned int block_size; 73 unsigned int block_size;
72 unsigned short block_log; 74 unsigned short block_log;
73 long long bytes_used; 75 long long bytes_used;
74 unsigned int inodes; 76 unsigned int inodes;
77 int xattr_ids;
75}; 78};
76#endif 79#endif
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 48b6f4a385a6..88b4f8606652 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -36,12 +36,14 @@
36#include <linux/init.h> 36#include <linux/init.h>
37#include <linux/module.h> 37#include <linux/module.h>
38#include <linux/magic.h> 38#include <linux/magic.h>
39#include <linux/xattr.h>
39 40
40#include "squashfs_fs.h" 41#include "squashfs_fs.h"
41#include "squashfs_fs_sb.h" 42#include "squashfs_fs_sb.h"
42#include "squashfs_fs_i.h" 43#include "squashfs_fs_i.h"
43#include "squashfs.h" 44#include "squashfs.h"
44#include "decompressor.h" 45#include "decompressor.h"
46#include "xattr.h"
45 47
46static struct file_system_type squashfs_fs_type; 48static struct file_system_type squashfs_fs_type;
47static const struct super_operations squashfs_super_ops; 49static const struct super_operations squashfs_super_ops;
@@ -82,7 +84,7 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
82 long long root_inode; 84 long long root_inode;
83 unsigned short flags; 85 unsigned short flags;
84 unsigned int fragments; 86 unsigned int fragments;
85 u64 lookup_table_start; 87 u64 lookup_table_start, xattr_id_table_start;
86 int err; 88 int err;
87 89
88 TRACE("Entered squashfs_fill_superblock\n"); 90 TRACE("Entered squashfs_fill_superblock\n");
@@ -139,13 +141,6 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
139 if (msblk->decompressor == NULL) 141 if (msblk->decompressor == NULL)
140 goto failed_mount; 142 goto failed_mount;
141 143
142 /*
143 * Check if there's xattrs in the filesystem. These are not
144 * supported in this version, so warn that they will be ignored.
145 */
146 if (le64_to_cpu(sblk->xattr_table_start) != SQUASHFS_INVALID_BLK)
147 ERROR("Xattrs in filesystem, these will be ignored\n");
148
149 /* Check the filesystem does not extend beyond the end of the 144 /* Check the filesystem does not extend beyond the end of the
150 block device */ 145 block device */
151 msblk->bytes_used = le64_to_cpu(sblk->bytes_used); 146 msblk->bytes_used = le64_to_cpu(sblk->bytes_used);
@@ -253,7 +248,7 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
253allocate_lookup_table: 248allocate_lookup_table:
254 lookup_table_start = le64_to_cpu(sblk->lookup_table_start); 249 lookup_table_start = le64_to_cpu(sblk->lookup_table_start);
255 if (lookup_table_start == SQUASHFS_INVALID_BLK) 250 if (lookup_table_start == SQUASHFS_INVALID_BLK)
256 goto allocate_root; 251 goto allocate_xattr_table;
257 252
258 /* Allocate and read inode lookup table */ 253 /* Allocate and read inode lookup table */
259 msblk->inode_lookup_table = squashfs_read_inode_lookup_table(sb, 254 msblk->inode_lookup_table = squashfs_read_inode_lookup_table(sb,
@@ -266,6 +261,21 @@ allocate_lookup_table:
266 261
267 sb->s_export_op = &squashfs_export_ops; 262 sb->s_export_op = &squashfs_export_ops;
268 263
264allocate_xattr_table:
265 sb->s_xattr = squashfs_xattr_handlers;
266 xattr_id_table_start = le64_to_cpu(sblk->xattr_id_table_start);
267 if (xattr_id_table_start == SQUASHFS_INVALID_BLK)
268 goto allocate_root;
269
270 /* Allocate and read xattr id lookup table */
271 msblk->xattr_id_table = squashfs_read_xattr_id_table(sb,
272 xattr_id_table_start, &msblk->xattr_table, &msblk->xattr_ids);
273 if (IS_ERR(msblk->xattr_id_table)) {
274 err = PTR_ERR(msblk->xattr_id_table);
275 msblk->xattr_id_table = NULL;
276 if (err != -ENOTSUPP)
277 goto failed_mount;
278 }
269allocate_root: 279allocate_root:
270 root = new_inode(sb); 280 root = new_inode(sb);
271 if (!root) { 281 if (!root) {
@@ -301,6 +311,7 @@ failed_mount:
301 kfree(msblk->inode_lookup_table); 311 kfree(msblk->inode_lookup_table);
302 kfree(msblk->fragment_index); 312 kfree(msblk->fragment_index);
303 kfree(msblk->id_table); 313 kfree(msblk->id_table);
314 kfree(msblk->xattr_id_table);
304 kfree(sb->s_fs_info); 315 kfree(sb->s_fs_info);
305 sb->s_fs_info = NULL; 316 sb->s_fs_info = NULL;
306 kfree(sblk); 317 kfree(sblk);
@@ -355,6 +366,7 @@ static void squashfs_put_super(struct super_block *sb)
355 kfree(sbi->fragment_index); 366 kfree(sbi->fragment_index);
356 kfree(sbi->meta_index); 367 kfree(sbi->meta_index);
357 kfree(sbi->inode_lookup_table); 368 kfree(sbi->inode_lookup_table);
369 kfree(sbi->xattr_id_table);
358 kfree(sb->s_fs_info); 370 kfree(sb->s_fs_info);
359 sb->s_fs_info = NULL; 371 sb->s_fs_info = NULL;
360 } 372 }
diff --git a/fs/squashfs/symlink.c b/fs/squashfs/symlink.c
index 32b911f4ee39..ec86434921e1 100644
--- a/fs/squashfs/symlink.c
+++ b/fs/squashfs/symlink.c
@@ -35,11 +35,13 @@
35#include <linux/kernel.h> 35#include <linux/kernel.h>
36#include <linux/string.h> 36#include <linux/string.h>
37#include <linux/pagemap.h> 37#include <linux/pagemap.h>
38#include <linux/xattr.h>
38 39
39#include "squashfs_fs.h" 40#include "squashfs_fs.h"
40#include "squashfs_fs_sb.h" 41#include "squashfs_fs_sb.h"
41#include "squashfs_fs_i.h" 42#include "squashfs_fs_i.h"
42#include "squashfs.h" 43#include "squashfs.h"
44#include "xattr.h"
43 45
44static int squashfs_symlink_readpage(struct file *file, struct page *page) 46static int squashfs_symlink_readpage(struct file *file, struct page *page)
45{ 47{
@@ -114,3 +116,12 @@ error_out:
114const struct address_space_operations squashfs_symlink_aops = { 116const struct address_space_operations squashfs_symlink_aops = {
115 .readpage = squashfs_symlink_readpage 117 .readpage = squashfs_symlink_readpage
116}; 118};
119
120const struct inode_operations squashfs_symlink_inode_ops = {
121 .readlink = generic_readlink,
122 .follow_link = page_follow_link_light,
123 .put_link = page_put_link,
124 .getxattr = generic_getxattr,
125 .listxattr = squashfs_listxattr
126};
127
diff --git a/fs/squashfs/xattr.c b/fs/squashfs/xattr.c
new file mode 100644
index 000000000000..c7655e8b31cd
--- /dev/null
+++ b/fs/squashfs/xattr.c
@@ -0,0 +1,323 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2010
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * xattr.c
22 */
23
24#include <linux/init.h>
25#include <linux/module.h>
26#include <linux/string.h>
27#include <linux/fs.h>
28#include <linux/vfs.h>
29#include <linux/xattr.h>
30#include <linux/slab.h>
31
32#include "squashfs_fs.h"
33#include "squashfs_fs_sb.h"
34#include "squashfs_fs_i.h"
35#include "squashfs.h"
36
37static const struct xattr_handler *squashfs_xattr_handler(int);
38
39ssize_t squashfs_listxattr(struct dentry *d, char *buffer,
40 size_t buffer_size)
41{
42 struct inode *inode = d->d_inode;
43 struct super_block *sb = inode->i_sb;
44 struct squashfs_sb_info *msblk = sb->s_fs_info;
45 u64 start = SQUASHFS_XATTR_BLK(squashfs_i(inode)->xattr)
46 + msblk->xattr_table;
47 int offset = SQUASHFS_XATTR_OFFSET(squashfs_i(inode)->xattr);
48 int count = squashfs_i(inode)->xattr_count;
49 size_t rest = buffer_size;
50 int err;
51
52 /* check that the file system has xattrs */
53 if (msblk->xattr_id_table == NULL)
54 return -EOPNOTSUPP;
55
56 /* loop reading each xattr name */
57 while (count--) {
58 struct squashfs_xattr_entry entry;
59 struct squashfs_xattr_val val;
60 const struct xattr_handler *handler;
61 int name_size, prefix_size = 0;
62
63 err = squashfs_read_metadata(sb, &entry, &start, &offset,
64 sizeof(entry));
65 if (err < 0)
66 goto failed;
67
68 name_size = le16_to_cpu(entry.size);
69 handler = squashfs_xattr_handler(le16_to_cpu(entry.type));
70 if (handler)
71 prefix_size = handler->list(d, buffer, rest, NULL,
72 name_size, handler->flags);
73 if (prefix_size) {
74 if (buffer) {
75 if (prefix_size + name_size + 1 > rest) {
76 err = -ERANGE;
77 goto failed;
78 }
79 buffer += prefix_size;
80 }
81 err = squashfs_read_metadata(sb, buffer, &start,
82 &offset, name_size);
83 if (err < 0)
84 goto failed;
85 if (buffer) {
86 buffer[name_size] = '\0';
87 buffer += name_size + 1;
88 }
89 rest -= prefix_size + name_size + 1;
90 } else {
91 /* no handler or insufficient privileges, so skip */
92 err = squashfs_read_metadata(sb, NULL, &start,
93 &offset, name_size);
94 if (err < 0)
95 goto failed;
96 }
97
98
99 /* skip remaining xattr entry */
100 err = squashfs_read_metadata(sb, &val, &start, &offset,
101 sizeof(val));
102 if (err < 0)
103 goto failed;
104
105 err = squashfs_read_metadata(sb, NULL, &start, &offset,
106 le32_to_cpu(val.vsize));
107 if (err < 0)
108 goto failed;
109 }
110 err = buffer_size - rest;
111
112failed:
113 return err;
114}
115
116
117static int squashfs_xattr_get(struct inode *inode, int name_index,
118 const char *name, void *buffer, size_t buffer_size)
119{
120 struct super_block *sb = inode->i_sb;
121 struct squashfs_sb_info *msblk = sb->s_fs_info;
122 u64 start = SQUASHFS_XATTR_BLK(squashfs_i(inode)->xattr)
123 + msblk->xattr_table;
124 int offset = SQUASHFS_XATTR_OFFSET(squashfs_i(inode)->xattr);
125 int count = squashfs_i(inode)->xattr_count;
126 int name_len = strlen(name);
127 int err, vsize;
128 char *target = kmalloc(name_len, GFP_KERNEL);
129
130 if (target == NULL)
131 return -ENOMEM;
132
133 /* loop reading each xattr name */
134 for (; count; count--) {
135 struct squashfs_xattr_entry entry;
136 struct squashfs_xattr_val val;
137 int type, prefix, name_size;
138
139 err = squashfs_read_metadata(sb, &entry, &start, &offset,
140 sizeof(entry));
141 if (err < 0)
142 goto failed;
143
144 name_size = le16_to_cpu(entry.size);
145 type = le16_to_cpu(entry.type);
146 prefix = type & SQUASHFS_XATTR_PREFIX_MASK;
147
148 if (prefix == name_index && name_size == name_len)
149 err = squashfs_read_metadata(sb, target, &start,
150 &offset, name_size);
151 else
152 err = squashfs_read_metadata(sb, NULL, &start,
153 &offset, name_size);
154 if (err < 0)
155 goto failed;
156
157 if (prefix == name_index && name_size == name_len &&
158 strncmp(target, name, name_size) == 0) {
159 /* found xattr */
160 if (type & SQUASHFS_XATTR_VALUE_OOL) {
161 __le64 xattr;
162 /* val is a reference to the real location */
163 err = squashfs_read_metadata(sb, &val, &start,
164 &offset, sizeof(val));
165 if (err < 0)
166 goto failed;
167 err = squashfs_read_metadata(sb, &xattr, &start,
168 &offset, sizeof(xattr));
169 if (err < 0)
170 goto failed;
171 /* the reference is stored little-endian on disk */
172 start = SQUASHFS_XATTR_BLK(le64_to_cpu(xattr)) +
173 msblk->xattr_table;
174 offset = SQUASHFS_XATTR_OFFSET(le64_to_cpu(xattr));
175 }
176 /* read xattr value */
177 err = squashfs_read_metadata(sb, &val, &start, &offset,
178 sizeof(val));
179 if (err < 0)
180 goto failed;
181
182 vsize = le32_to_cpu(val.vsize);
183 if (buffer) {
184 if (vsize > buffer_size) {
185 err = -ERANGE;
186 goto failed;
187 }
188 err = squashfs_read_metadata(sb, buffer, &start,
189 &offset, vsize);
190 if (err < 0)
191 goto failed;
192 }
193 break;
194 }
195
196 /* no match, skip remaining xattr entry */
197 err = squashfs_read_metadata(sb, &val, &start, &offset,
198 sizeof(val));
199 if (err < 0)
200 goto failed;
201 err = squashfs_read_metadata(sb, NULL, &start, &offset,
202 le32_to_cpu(val.vsize));
203 if (err < 0)
204 goto failed;
205 }
206 err = count ? vsize : -ENODATA;
207
208failed:
209 kfree(target);
210 return err;
211}
212
213
214/*
215 * User namespace support
216 */
217static size_t squashfs_user_list(struct dentry *d, char *list, size_t list_size,
218 const char *name, size_t name_len, int type)
219{
220 if (list && XATTR_USER_PREFIX_LEN <= list_size)
221 memcpy(list, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN);
222 return XATTR_USER_PREFIX_LEN;
223}
224
225static int squashfs_user_get(struct dentry *d, const char *name, void *buffer,
226 size_t size, int type)
227{
228 if (name[0] == '\0')
229 return -EINVAL;
230
231 return squashfs_xattr_get(d->d_inode, SQUASHFS_XATTR_USER, name,
232 buffer, size);
233}
234
235static const struct xattr_handler squashfs_xattr_user_handler = {
236 .prefix = XATTR_USER_PREFIX,
237 .list = squashfs_user_list,
238 .get = squashfs_user_get
239};
240
241/*
242 * Trusted namespace support
243 */
244static size_t squashfs_trusted_list(struct dentry *d, char *list,
245 size_t list_size, const char *name, size_t name_len, int type)
246{
247 if (!capable(CAP_SYS_ADMIN))
248 return 0;
249
250 if (list && XATTR_TRUSTED_PREFIX_LEN <= list_size)
251 memcpy(list, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN);
252 return XATTR_TRUSTED_PREFIX_LEN;
253}
254
255static int squashfs_trusted_get(struct dentry *d, const char *name,
256 void *buffer, size_t size, int type)
257{
258 if (name[0] == '\0')
259 return -EINVAL;
260
261 return squashfs_xattr_get(d->d_inode, SQUASHFS_XATTR_TRUSTED, name,
262 buffer, size);
263}
264
265static const struct xattr_handler squashfs_xattr_trusted_handler = {
266 .prefix = XATTR_TRUSTED_PREFIX,
267 .list = squashfs_trusted_list,
268 .get = squashfs_trusted_get
269};
270
271/*
272 * Security namespace support
273 */
274static size_t squashfs_security_list(struct dentry *d, char *list,
275 size_t list_size, const char *name, size_t name_len, int type)
276{
277 if (list && XATTR_SECURITY_PREFIX_LEN <= list_size)
278 memcpy(list, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN);
279 return XATTR_SECURITY_PREFIX_LEN;
280}
281
282static int squashfs_security_get(struct dentry *d, const char *name,
283 void *buffer, size_t size, int type)
284{
285 if (name[0] == '\0')
286 return -EINVAL;
287
288 return squashfs_xattr_get(d->d_inode, SQUASHFS_XATTR_SECURITY, name,
289 buffer, size);
290}
291
292static const struct xattr_handler squashfs_xattr_security_handler = {
293 .prefix = XATTR_SECURITY_PREFIX,
294 .list = squashfs_security_list,
295 .get = squashfs_security_get
296};
297
298static inline const struct xattr_handler *squashfs_xattr_handler(int type)
299{
300 if (type & ~(SQUASHFS_XATTR_PREFIX_MASK | SQUASHFS_XATTR_VALUE_OOL))
301 /* ignore unrecognised type */
302 return NULL;
303
304 switch (type & SQUASHFS_XATTR_PREFIX_MASK) {
305 case SQUASHFS_XATTR_USER:
306 return &squashfs_xattr_user_handler;
307 case SQUASHFS_XATTR_TRUSTED:
308 return &squashfs_xattr_trusted_handler;
309 case SQUASHFS_XATTR_SECURITY:
310 return &squashfs_xattr_security_handler;
311 default:
312 /* ignore unrecognised type */
313 return NULL;
314 }
315}
316
317const struct xattr_handler *squashfs_xattr_handlers[] = {
318 &squashfs_xattr_user_handler,
319 &squashfs_xattr_trusted_handler,
320 &squashfs_xattr_security_handler,
321 NULL
322};
323
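For context, the handlers above are ultimately driven from userspace
through the listxattr(2) and getxattr(2) syscalls. A minimal user-side
sketch; the mount point is hypothetical and nothing here is part of the
patch:

	#include <stdio.h>
	#include <string.h>
	#include <sys/xattr.h>

	int main(void)
	{
		const char *path = "/mnt/squashfs/file"; /* hypothetical mount */
		char names[1024], value[256];
		ssize_t n, v;
		size_t off = 0;

		n = listxattr(path, names, sizeof(names)); /* -> squashfs_listxattr */
		if (n < 0) {
			perror("listxattr");
			return 1;
		}
		while (off < (size_t)n) {
			const char *name = names + off;

			v = getxattr(path, name, value, sizeof(value)); /* -> handler->get */
			if (v >= 0)
				printf("%s = %.*s\n", name, (int)v, value);
			off += strlen(name) + 1;
		}
		return 0;
	}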
diff --git a/fs/squashfs/xattr.h b/fs/squashfs/xattr.h
new file mode 100644
index 000000000000..9da071ae181c
--- /dev/null
+++ b/fs/squashfs/xattr.h
@@ -0,0 +1,46 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2010
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * xattr.h
22 */
23
24#ifdef CONFIG_SQUASHFS_XATTRS
25extern __le64 *squashfs_read_xattr_id_table(struct super_block *, u64,
26 u64 *, int *);
27extern int squashfs_xattr_lookup(struct super_block *, unsigned int, int *,
28 unsigned int *, unsigned long long *);
29#else
30static inline __le64 *squashfs_read_xattr_id_table(struct super_block *sb,
31 u64 start, u64 *xattr_table_start, int *xattr_ids)
32{
33 ERROR("Xattrs in filesystem, these will be ignored\n");
34 return ERR_PTR(-ENOTSUPP);
35}
36
37static inline int squashfs_xattr_lookup(struct super_block *sb,
38 unsigned int index, int *count, unsigned int *size,
39 unsigned long long *xattr)
40{
41 return 0;
42}
43#define squashfs_listxattr NULL
44#define generic_getxattr NULL
45#define squashfs_xattr_handlers NULL
46#endif
diff --git a/fs/squashfs/xattr_id.c b/fs/squashfs/xattr_id.c
new file mode 100644
index 000000000000..cfb41106098f
--- /dev/null
+++ b/fs/squashfs/xattr_id.c
@@ -0,0 +1,100 @@
1/*
2 * Squashfs - a compressed read only filesystem for Linux
3 *
4 * Copyright (c) 2010
5 * Phillip Lougher <phillip@lougher.demon.co.uk>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2,
10 * or (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
20 *
21 * xattr_id.c
22 */
23
24/*
25 * This file implements code to map the 32-bit xattr id stored in the inode
26 * into the on-disk location of the xattr data.
27 */
28
29#include <linux/fs.h>
30#include <linux/vfs.h>
31#include <linux/slab.h>
32
33#include "squashfs_fs.h"
34#include "squashfs_fs_sb.h"
35#include "squashfs_fs_i.h"
36#include "squashfs.h"
37
38/*
39 * Map xattr id using the xattr id lookup table
40 */
41int squashfs_xattr_lookup(struct super_block *sb, unsigned int index,
42 int *count, unsigned int *size, unsigned long long *xattr)
43{
44 struct squashfs_sb_info *msblk = sb->s_fs_info;
45 int block = SQUASHFS_XATTR_BLOCK(index);
46 int offset = SQUASHFS_XATTR_BLOCK_OFFSET(index);
47 u64 start_block = le64_to_cpu(msblk->xattr_id_table[block]);
48 struct squashfs_xattr_id id;
49 int err;
50
51 err = squashfs_read_metadata(sb, &id, &start_block, &offset,
52 sizeof(id));
53 if (err < 0)
54 return err;
55
56 *xattr = le64_to_cpu(id.xattr);
57 *size = le32_to_cpu(id.size);
58 *count = le32_to_cpu(id.count);
59 return 0;
60}
61
62
63/*
64 * Read uncompressed xattr id lookup table indexes from disk into memory
65 */
66__le64 *squashfs_read_xattr_id_table(struct super_block *sb, u64 start,
67 u64 *xattr_table_start, int *xattr_ids)
68{
69 unsigned int len;
70 __le64 *xid_table;
71 struct squashfs_xattr_id_table id_table;
72 int err;
73
74 err = squashfs_read_table(sb, &id_table, start, sizeof(id_table));
75 if (err < 0) {
76 ERROR("unable to read xattr id table\n");
77 return ERR_PTR(err);
78 }
79 *xattr_table_start = le64_to_cpu(id_table.xattr_table_start);
80 *xattr_ids = le32_to_cpu(id_table.xattr_ids);
81 len = SQUASHFS_XATTR_BLOCK_BYTES(*xattr_ids);
82
83 TRACE("In read_xattr_index_table, length %d\n", len);
84
85 /* Allocate xattr id lookup table indexes */
86 xid_table = kmalloc(len, GFP_KERNEL);
87 if (xid_table == NULL) {
88 ERROR("Failed to allocate xattr id index table\n");
89 return ERR_PTR(-ENOMEM);
90 }
91
92 err = squashfs_read_table(sb, xid_table, start + sizeof(id_table), len);
93 if (err < 0) {
94 ERROR("unable to read xattr id index table\n");
95 kfree(xid_table);
96 return ERR_PTR(err);
97 }
98
99 return xid_table;
100}
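The two-step mapping squashfs_xattr_lookup() performs can be followed
with concrete, made-up numbers: each struct squashfs_xattr_id is 16 bytes
and metadata blocks are 8192 bytes (SQUASHFS_METADATA_SIZE), so xattr id
1000 lives 16000 bytes into the table, i.e. in index block 1 at offset
7808. A standalone restatement of that arithmetic:

	#include <stdio.h>

	#define ID_SIZE       16   /* sizeof(struct squashfs_xattr_id) */
	#define METADATA_SIZE 8192 /* SQUASHFS_METADATA_SIZE */

	int main(void)
	{
		unsigned int index = 1000;                   /* hypothetical xattr id */
		unsigned int bytes = index * ID_SIZE;        /* SQUASHFS_XATTR_BYTES */
		unsigned int block = bytes / METADATA_SIZE;  /* SQUASHFS_XATTR_BLOCK */
		unsigned int offset = bytes % METADATA_SIZE; /* SQUASHFS_XATTR_BLOCK_OFFSET */

		/* msblk->xattr_id_table[block] then gives the on-disk start of
		 * that metadata block; the id struct is read at `offset`. */
		printf("id %u -> index block %u, offset %u\n", index, block, offset);
		return 0;
	}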
diff --git a/fs/super.c b/fs/super.c
index 69688b15f1fa..5c35bc7a499e 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -24,7 +24,6 @@
24#include <linux/slab.h> 24#include <linux/slab.h>
25#include <linux/acct.h> 25#include <linux/acct.h>
26#include <linux/blkdev.h> 26#include <linux/blkdev.h>
27#include <linux/quotaops.h>
28#include <linux/mount.h> 27#include <linux/mount.h>
29#include <linux/security.h> 28#include <linux/security.h>
30#include <linux/writeback.h> /* for the emergency remount stuff */ 29#include <linux/writeback.h> /* for the emergency remount stuff */
@@ -94,8 +93,6 @@ static struct super_block *alloc_super(struct file_system_type *type)
94 init_rwsem(&s->s_dquot.dqptr_sem); 93 init_rwsem(&s->s_dquot.dqptr_sem);
95 init_waitqueue_head(&s->s_wait_unfrozen); 94 init_waitqueue_head(&s->s_wait_unfrozen);
96 s->s_maxbytes = MAX_NON_LFS; 95 s->s_maxbytes = MAX_NON_LFS;
97 s->dq_op = sb_dquot_ops;
98 s->s_qcop = sb_quotactl_ops;
99 s->s_op = &default_op; 96 s->s_op = &default_op;
100 s->s_time_gran = 1000000000; 97 s->s_time_gran = 1000000000;
101 } 98 }
@@ -160,7 +157,6 @@ void deactivate_locked_super(struct super_block *s)
160{ 157{
161 struct file_system_type *fs = s->s_type; 158 struct file_system_type *fs = s->s_type;
162 if (atomic_dec_and_test(&s->s_active)) { 159 if (atomic_dec_and_test(&s->s_active)) {
163 vfs_dq_off(s, 0);
164 fs->kill_sb(s); 160 fs->kill_sb(s);
165 put_filesystem(fs); 161 put_filesystem(fs);
166 put_super(s); 162 put_super(s);
@@ -524,7 +520,7 @@ rescan:
524int do_remount_sb(struct super_block *sb, int flags, void *data, int force) 520int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
525{ 521{
526 int retval; 522 int retval;
527 int remount_rw, remount_ro; 523 int remount_ro;
528 524
529 if (sb->s_frozen != SB_UNFROZEN) 525 if (sb->s_frozen != SB_UNFROZEN)
530 return -EBUSY; 526 return -EBUSY;
@@ -540,7 +536,6 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
540 sync_filesystem(sb); 536 sync_filesystem(sb);
541 537
542 remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY); 538 remount_ro = (flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY);
543 remount_rw = !(flags & MS_RDONLY) && (sb->s_flags & MS_RDONLY);
544 539
545 /* If we are remounting RDONLY and current sb is read/write, 540 /* If we are remounting RDONLY and current sb is read/write,
546 make sure there are no rw files opened */ 541 make sure there are no rw files opened */
@@ -549,9 +544,6 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
549 mark_files_ro(sb); 544 mark_files_ro(sb);
550 else if (!fs_may_remount_ro(sb)) 545 else if (!fs_may_remount_ro(sb))
551 return -EBUSY; 546 return -EBUSY;
552 retval = vfs_dq_off(sb, 1);
553 if (retval < 0 && retval != -ENOSYS)
554 return -EBUSY;
555 } 547 }
556 548
557 if (sb->s_op->remount_fs) { 549 if (sb->s_op->remount_fs) {
@@ -560,8 +552,7 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
560 return retval; 552 return retval;
561 } 553 }
562 sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK); 554 sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK);
563 if (remount_rw) 555
564 vfs_dq_quota_on_remount(sb);
565 /* 556 /*
566 * Some filesystems modify their metadata via some other path than the 557 * Some filesystems modify their metadata via some other path than the
567 * bdev buffer cache (eg. use a private mapping, or directories in 558 * bdev buffer cache (eg. use a private mapping, or directories in
@@ -946,8 +937,8 @@ out:
946EXPORT_SYMBOL_GPL(vfs_kern_mount); 937EXPORT_SYMBOL_GPL(vfs_kern_mount);
947 938
948/** 939/**
949 * freeze_super -- lock the filesystem and force it into a consistent state 940 * freeze_super - lock the filesystem and force it into a consistent state
950 * @super: the super to lock 941 * @sb: the super to lock
951 * 942 *
952 * Syncs the super to make sure the filesystem is consistent and calls the fs's 943 * Syncs the super to make sure the filesystem is consistent and calls the fs's
953 * freeze_fs. Subsequent calls to this without first thawing the fs will return 944 * freeze_fs. Subsequent calls to this without first thawing the fs will return
diff --git a/fs/sync.c b/fs/sync.c
index e8cbd415e50a..15aa6f03b2da 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -42,7 +42,7 @@ static int __sync_filesystem(struct super_block *sb, int wait)
42 if (wait) 42 if (wait)
43 sync_inodes_sb(sb); 43 sync_inodes_sb(sb);
44 else 44 else
45 writeback_inodes_sb_locked(sb); 45 writeback_inodes_sb(sb);
46 46
47 if (sb->s_op->sync_fs) 47 if (sb->s_op->sync_fs)
48 sb->s_op->sync_fs(sb, wait); 48 sb->s_op->sync_fs(sb, wait);
@@ -130,12 +130,10 @@ void emergency_sync(void)
130 130
131/* 131/*
132 * Generic function to fsync a file. 132 * Generic function to fsync a file.
133 *
134 * filp may be NULL if called via the msync of a vma.
135 */ 133 */
136int file_fsync(struct file *filp, struct dentry *dentry, int datasync) 134int file_fsync(struct file *filp, int datasync)
137{ 135{
138 struct inode * inode = dentry->d_inode; 136 struct inode *inode = filp->f_mapping->host;
139 struct super_block * sb; 137 struct super_block * sb;
140 int ret, err; 138 int ret, err;
141 139
@@ -183,7 +181,7 @@ int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync)
183 * livelocks in fsync_buffers_list(). 181 * livelocks in fsync_buffers_list().
184 */ 182 */
185 mutex_lock(&mapping->host->i_mutex); 183 mutex_lock(&mapping->host->i_mutex);
186 err = file->f_op->fsync(file, file->f_path.dentry, datasync); 184 err = file->f_op->fsync(file, datasync);
187 if (!ret) 185 if (!ret)
188 ret = err; 186 ret = err;
189 mutex_unlock(&mapping->host->i_mutex); 187 mutex_unlock(&mapping->host->i_mutex);
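The prototype change visible here, file_fsync() and the ->fsync method
losing their dentry argument, ripples through every filesystem in this
merge: the inode is now derived from the struct file. A sketch of what an
fsync method looks like after the change; the myfs names are invented,
and generic_file_fsync() is the helper the conversions below switch to:

	#include <linux/fs.h>

	/* hypothetical example, not part of this series */
	static int myfs_fsync(struct file *file, int datasync)
	{
		/* the inode now comes from the file, not from a dentry */
		struct inode *inode = file->f_mapping->host;

		mark_inode_dirty(inode);	/* stand-in for fs-specific work */
		return generic_file_fsync(file, datasync);
	}

	static const struct file_operations myfs_file_operations = {
		.fsync = myfs_fsync,
	};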
diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c
index bbd77e95cf7f..0835a3b70e03 100644
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -117,13 +117,13 @@ int sysfs_setattr(struct dentry *dentry, struct iattr *iattr)
117 if (error) 117 if (error)
118 goto out; 118 goto out;
119 119
120 iattr->ia_valid &= ~ATTR_SIZE; /* ignore size changes */ 120 error = sysfs_sd_setattr(sd, iattr);
121
122 error = inode_setattr(inode, iattr);
123 if (error) 121 if (error)
124 goto out; 122 goto out;
125 123
126 error = sysfs_sd_setattr(sd, iattr); 124 /* this ignores size changes */
125 generic_setattr(inode, iattr);
126
127out: 127out:
128 mutex_unlock(&sysfs_mutex); 128 mutex_unlock(&sysfs_mutex);
129 return error; 129 return error;
diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c
index 1dabed286b4c..79941e4964a4 100644
--- a/fs/sysv/dir.c
+++ b/fs/sysv/dir.c
@@ -24,7 +24,7 @@ const struct file_operations sysv_dir_operations = {
24 .llseek = generic_file_llseek, 24 .llseek = generic_file_llseek,
25 .read = generic_read_dir, 25 .read = generic_read_dir,
26 .readdir = sysv_readdir, 26 .readdir = sysv_readdir,
27 .fsync = simple_fsync, 27 .fsync = generic_file_fsync,
28}; 28};
29 29
30static inline void dir_put_page(struct page *page) 30static inline void dir_put_page(struct page *page)
diff --git a/fs/sysv/file.c b/fs/sysv/file.c
index 96340c01f4a7..750cc22349bd 100644
--- a/fs/sysv/file.c
+++ b/fs/sysv/file.c
@@ -26,7 +26,7 @@ const struct file_operations sysv_file_operations = {
26 .write = do_sync_write, 26 .write = do_sync_write,
27 .aio_write = generic_file_aio_write, 27 .aio_write = generic_file_aio_write,
28 .mmap = generic_file_mmap, 28 .mmap = generic_file_mmap,
29 .fsync = simple_fsync, 29 .fsync = generic_file_fsync,
30 .splice_read = generic_file_splice_read, 30 .splice_read = generic_file_splice_read,
31}; 31};
32 32
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index 4573734d723d..d4a5380b5669 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -43,6 +43,7 @@ static int sysv_sync_fs(struct super_block *sb, int wait)
43 * then attach current time stamp. 43 * then attach current time stamp.
44 * But if the filesystem was marked clean, keep it clean. 44 * But if the filesystem was marked clean, keep it clean.
45 */ 45 */
46 sb->s_dirt = 0;
46 old_time = fs32_to_cpu(sbi, *sbi->s_sb_time); 47 old_time = fs32_to_cpu(sbi, *sbi->s_sb_time);
47 if (sbi->s_type == FSTYPE_SYSV4) { 48 if (sbi->s_type == FSTYPE_SYSV4) {
48 if (*sbi->s_sb_state == cpu_to_fs32(sbi, 0x7c269d38 - old_time)) 49 if (*sbi->s_sb_state == cpu_to_fs32(sbi, 0x7c269d38 - old_time))
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 5692cf72b807..12f445cee9f7 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -967,12 +967,15 @@ static int do_writepage(struct page *page, int len)
967 * the page locked, and it locks @ui_mutex. However, write-back does take inode 967 * the page locked, and it locks @ui_mutex. However, write-back does take inode
968 * @i_mutex, which means other VFS operations may be run on this inode at the 968 * @i_mutex, which means other VFS operations may be run on this inode at the
969 * same time. And the problematic one is truncation to smaller size, from where 969 * same time. And the problematic one is truncation to smaller size, from where
970 * we have to call 'vmtruncate()', which first changes @inode->i_size, then 970 * we have to call 'simple_setsize()', which first changes @inode->i_size, then
971 * drops the truncated pages. And while dropping the pages, it takes the page 971 * drops the truncated pages. And while dropping the pages, it takes the page
972 * lock. This means that 'do_truncation()' cannot call 'vmtruncate()' with 972 * lock. This means that 'do_truncation()' cannot call 'simple_setsize()' with
973 * @ui_mutex locked, because it would deadlock with 'ubifs_writepage()'. This 973 * @ui_mutex locked, because it would deadlock with 'ubifs_writepage()'. This
974 * means that @inode->i_size is changed while @ui_mutex is unlocked. 974 * means that @inode->i_size is changed while @ui_mutex is unlocked.
975 * 975 *
976 * XXX: with the new truncate the above is not true anymore, the simple_setsize
977 * calls can be replaced with the individual components.
978 *
976 * But in 'ubifs_writepage()' we have to guarantee that we do not write beyond 979 * But in 'ubifs_writepage()' we have to guarantee that we do not write beyond
977 * inode size. How do we do this if @inode->i_size may became smaller while we 980 * inode size. How do we do this if @inode->i_size may became smaller while we
978 * are in the middle of 'ubifs_writepage()'? The UBIFS solution is the 981 * are in the middle of 'ubifs_writepage()'? The UBIFS solution is the
@@ -1125,7 +1128,7 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode,
1125 budgeted = 0; 1128 budgeted = 0;
1126 } 1129 }
1127 1130
1128 err = vmtruncate(inode, new_size); 1131 err = simple_setsize(inode, new_size);
1129 if (err) 1132 if (err)
1130 goto out_budg; 1133 goto out_budg;
1131 1134
@@ -1214,7 +1217,7 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode,
1214 1217
1215 if (attr->ia_valid & ATTR_SIZE) { 1218 if (attr->ia_valid & ATTR_SIZE) {
1216 dbg_gen("size %lld -> %lld", inode->i_size, new_size); 1219 dbg_gen("size %lld -> %lld", inode->i_size, new_size);
1217 err = vmtruncate(inode, new_size); 1220 err = simple_setsize(inode, new_size);
1218 if (err) 1221 if (err)
1219 goto out; 1222 goto out;
1220 } 1223 }
@@ -1223,7 +1226,7 @@ static int do_setattr(struct ubifs_info *c, struct inode *inode,
1223 if (attr->ia_valid & ATTR_SIZE) { 1226 if (attr->ia_valid & ATTR_SIZE) {
1224 /* Truncation changes inode [mc]time */ 1227 /* Truncation changes inode [mc]time */
1225 inode->i_mtime = inode->i_ctime = ubifs_current_time(inode); 1228 inode->i_mtime = inode->i_ctime = ubifs_current_time(inode);
1226 /* 'vmtruncate()' changed @i_size, update @ui_size */ 1229 /* 'simple_setsize()' changed @i_size, update @ui_size */
1227 ui->ui_size = inode->i_size; 1230 ui->ui_size = inode->i_size;
1228 } 1231 }
1229 1232
@@ -1304,9 +1307,9 @@ static void *ubifs_follow_link(struct dentry *dentry, struct nameidata *nd)
1304 return NULL; 1307 return NULL;
1305} 1308}
1306 1309
1307int ubifs_fsync(struct file *file, struct dentry *dentry, int datasync) 1310int ubifs_fsync(struct file *file, int datasync)
1308{ 1311{
1309 struct inode *inode = dentry->d_inode; 1312 struct inode *inode = file->f_mapping->host;
1310 struct ubifs_info *c = inode->i_sb->s_fs_info; 1313 struct ubifs_info *c = inode->i_sb->s_fs_info;
1311 int err; 1314 int err;
1312 1315
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index bd2542dad014..2eef553d50c8 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -379,7 +379,7 @@ struct ubifs_gced_idx_leb {
379 * The @ui_size is a "shadow" variable for @inode->i_size and UBIFS uses 379 * The @ui_size is a "shadow" variable for @inode->i_size and UBIFS uses
380 * @ui_size instead of @inode->i_size. The reason for this is that UBIFS cannot 380 * @ui_size instead of @inode->i_size. The reason for this is that UBIFS cannot
381 * make sure @inode->i_size is always changed under @ui_mutex, because it 381 * make sure @inode->i_size is always changed under @ui_mutex, because it
382 * cannot call 'vmtruncate()' with @ui_mutex locked, because it would deadlock 382 * cannot call 'simple_setsize()' with @ui_mutex locked, because it would deadlock
383 * with 'ubifs_writepage()' (see file.c). All the other inode fields are 383 * with 'ubifs_writepage()' (see file.c). All the other inode fields are
384 * changed under @ui_mutex, so they do not need "shadow" fields. Note, one 384 * changed under @ui_mutex, so they do not need "shadow" fields. Note, one
385 * could consider to rework locking and base it on "shadow" fields. 385 * could consider to rework locking and base it on "shadow" fields.
@@ -1678,7 +1678,7 @@ const struct ubifs_lprops *ubifs_fast_find_frdi_idx(struct ubifs_info *c);
1678int ubifs_calc_dark(const struct ubifs_info *c, int spc); 1678int ubifs_calc_dark(const struct ubifs_info *c, int spc);
1679 1679
1680/* file.c */ 1680/* file.c */
1681int ubifs_fsync(struct file *file, struct dentry *dentry, int datasync); 1681int ubifs_fsync(struct file *file, int datasync);
1682int ubifs_setattr(struct dentry *dentry, struct iattr *attr); 1682int ubifs_setattr(struct dentry *dentry, struct iattr *attr);
1683 1683
1684/* dir.c */ 1684/* dir.c */
diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index 9a9378b4eb5a..b608efaa4cee 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -21,7 +21,6 @@
21 21
22#include "udfdecl.h" 22#include "udfdecl.h"
23 23
24#include <linux/quotaops.h>
25#include <linux/buffer_head.h> 24#include <linux/buffer_head.h>
26#include <linux/bitops.h> 25#include <linux/bitops.h>
27 26
@@ -159,8 +158,6 @@ static void udf_bitmap_free_blocks(struct super_block *sb,
159 udf_debug("byte=%2x\n", 158 udf_debug("byte=%2x\n",
160 ((char *)bh->b_data)[(bit + i) >> 3]); 159 ((char *)bh->b_data)[(bit + i) >> 3]);
161 } else { 160 } else {
162 if (inode)
163 dquot_free_block(inode, 1);
164 udf_add_free_space(sb, sbi->s_partition, 1); 161 udf_add_free_space(sb, sbi->s_partition, 1);
165 } 162 }
166 } 163 }
@@ -210,15 +207,8 @@ static int udf_bitmap_prealloc_blocks(struct super_block *sb,
210 bit = block % (sb->s_blocksize << 3); 207 bit = block % (sb->s_blocksize << 3);
211 208
212 while (bit < (sb->s_blocksize << 3) && block_count > 0) { 209 while (bit < (sb->s_blocksize << 3) && block_count > 0) {
213 if (!udf_test_bit(bit, bh->b_data)) 210 if (!udf_clear_bit(bit, bh->b_data))
214 goto out; 211 goto out;
215 else if (dquot_prealloc_block(inode, 1))
216 goto out;
217 else if (!udf_clear_bit(bit, bh->b_data)) {
218 udf_debug("bit already cleared for block %d\n", bit);
219 dquot_free_block(inode, 1);
220 goto out;
221 }
222 block_count--; 212 block_count--;
223 alloc_count++; 213 alloc_count++;
224 bit++; 214 bit++;
@@ -338,20 +328,6 @@ search_back:
338 } 328 }
339 329
340got_block: 330got_block:
341
342 /*
343 * Check quota for allocation of this block.
344 */
345 if (inode) {
346 int ret = dquot_alloc_block(inode, 1);
347
348 if (ret) {
349 mutex_unlock(&sbi->s_alloc_mutex);
350 *err = ret;
351 return 0;
352 }
353 }
354
355 newblock = bit + (block_group << (sb->s_blocksize_bits + 3)) - 331 newblock = bit + (block_group << (sb->s_blocksize_bits + 3)) -
356 (sizeof(struct spaceBitmapDesc) << 3); 332 (sizeof(struct spaceBitmapDesc) << 3);
357 333
@@ -401,10 +377,6 @@ static void udf_table_free_blocks(struct super_block *sb,
401 } 377 }
402 378
403 iinfo = UDF_I(table); 379 iinfo = UDF_I(table);
404 /* We do this up front - There are some error conditions that
405 could occure, but.. oh well */
406 if (inode)
407 dquot_free_block(inode, count);
408 udf_add_free_space(sb, sbi->s_partition, count); 380 udf_add_free_space(sb, sbi->s_partition, count);
409 381
410 start = bloc->logicalBlockNum + offset; 382 start = bloc->logicalBlockNum + offset;
@@ -649,10 +621,7 @@ static int udf_table_prealloc_blocks(struct super_block *sb,
649 epos.offset -= adsize; 621 epos.offset -= adsize;
650 622
651 alloc_count = (elen >> sb->s_blocksize_bits); 623 alloc_count = (elen >> sb->s_blocksize_bits);
652 if (inode && dquot_prealloc_block(inode, 624 if (alloc_count > block_count) {
653 alloc_count > block_count ? block_count : alloc_count))
654 alloc_count = 0;
655 else if (alloc_count > block_count) {
656 alloc_count = block_count; 625 alloc_count = block_count;
657 eloc.logicalBlockNum += alloc_count; 626 eloc.logicalBlockNum += alloc_count;
658 elen -= (alloc_count << sb->s_blocksize_bits); 627 elen -= (alloc_count << sb->s_blocksize_bits);
@@ -752,14 +721,6 @@ static int udf_table_new_block(struct super_block *sb,
752 newblock = goal_eloc.logicalBlockNum; 721 newblock = goal_eloc.logicalBlockNum;
753 goal_eloc.logicalBlockNum++; 722 goal_eloc.logicalBlockNum++;
754 goal_elen -= sb->s_blocksize; 723 goal_elen -= sb->s_blocksize;
755 if (inode) {
756 *err = dquot_alloc_block(inode, 1);
757 if (*err) {
758 brelse(goal_epos.bh);
759 mutex_unlock(&sbi->s_alloc_mutex);
760 return 0;
761 }
762 }
763 724
764 if (goal_elen) 725 if (goal_elen)
765 udf_write_aext(table, &goal_epos, &goal_eloc, goal_elen, 1); 726 udf_write_aext(table, &goal_epos, &goal_eloc, goal_elen, 1);
diff --git a/fs/udf/dir.c b/fs/udf/dir.c
index 3a84455c2a77..51552bf50225 100644
--- a/fs/udf/dir.c
+++ b/fs/udf/dir.c
@@ -207,8 +207,9 @@ static int udf_readdir(struct file *filp, void *dirent, filldir_t filldir)
207 207
208/* readdir and lookup functions */ 208/* readdir and lookup functions */
209const struct file_operations udf_dir_operations = { 209const struct file_operations udf_dir_operations = {
210 .llseek = generic_file_llseek,
210 .read = generic_read_dir, 211 .read = generic_read_dir,
211 .readdir = udf_readdir, 212 .readdir = udf_readdir,
212 .unlocked_ioctl = udf_ioctl, 213 .unlocked_ioctl = udf_ioctl,
213 .fsync = simple_fsync, 214 .fsync = generic_file_fsync,
214}; 215};
diff --git a/fs/udf/file.c b/fs/udf/file.c
index baae3a723946..94e06d6bddbd 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -34,7 +34,6 @@
34#include <linux/errno.h> 34#include <linux/errno.h>
35#include <linux/smp_lock.h> 35#include <linux/smp_lock.h>
36#include <linux/pagemap.h> 36#include <linux/pagemap.h>
37#include <linux/quotaops.h>
38#include <linux/buffer_head.h> 37#include <linux/buffer_head.h>
39#include <linux/aio.h> 38#include <linux/aio.h>
40#include <linux/smp_lock.h> 39#include <linux/smp_lock.h>
@@ -219,39 +218,16 @@ const struct file_operations udf_file_operations = {
219 .read = do_sync_read, 218 .read = do_sync_read,
220 .aio_read = generic_file_aio_read, 219 .aio_read = generic_file_aio_read,
221 .unlocked_ioctl = udf_ioctl, 220 .unlocked_ioctl = udf_ioctl,
222 .open = dquot_file_open, 221 .open = generic_file_open,
223 .mmap = generic_file_mmap, 222 .mmap = generic_file_mmap,
224 .write = do_sync_write, 223 .write = do_sync_write,
225 .aio_write = udf_file_aio_write, 224 .aio_write = udf_file_aio_write,
226 .release = udf_release_file, 225 .release = udf_release_file,
227 .fsync = simple_fsync, 226 .fsync = generic_file_fsync,
228 .splice_read = generic_file_splice_read, 227 .splice_read = generic_file_splice_read,
229 .llseek = generic_file_llseek, 228 .llseek = generic_file_llseek,
230}; 229};
231 230
232int udf_setattr(struct dentry *dentry, struct iattr *iattr)
233{
234 struct inode *inode = dentry->d_inode;
235 int error;
236
237 error = inode_change_ok(inode, iattr);
238 if (error)
239 return error;
240
241 if (is_quota_modification(inode, iattr))
242 dquot_initialize(inode);
243
244 if ((iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) ||
245 (iattr->ia_valid & ATTR_GID && iattr->ia_gid != inode->i_gid)) {
246 error = dquot_transfer(inode, iattr);
247 if (error)
248 return error;
249 }
250
251 return inode_setattr(inode, iattr);
252}
253
254const struct inode_operations udf_file_inode_operations = { 231const struct inode_operations udf_file_inode_operations = {
255 .truncate = udf_truncate, 232 .truncate = udf_truncate,
256 .setattr = udf_setattr,
257}; 233};
diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index 2b5586c7f02a..18cd7111185d 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -20,7 +20,6 @@
20 20
21#include "udfdecl.h" 21#include "udfdecl.h"
22#include <linux/fs.h> 22#include <linux/fs.h>
23#include <linux/quotaops.h>
24#include <linux/sched.h> 23#include <linux/sched.h>
25#include <linux/slab.h> 24#include <linux/slab.h>
26 25
@@ -32,13 +31,6 @@ void udf_free_inode(struct inode *inode)
32 struct super_block *sb = inode->i_sb; 31 struct super_block *sb = inode->i_sb;
33 struct udf_sb_info *sbi = UDF_SB(sb); 32 struct udf_sb_info *sbi = UDF_SB(sb);
34 33
35 /*
36 * Note: we must free any quota before locking the superblock,
37 * as writing the quota to disk may need the lock as well.
38 */
39 dquot_free_inode(inode);
40 dquot_drop(inode);
41
42 clear_inode(inode); 34 clear_inode(inode);
43 35
44 mutex_lock(&sbi->s_alloc_mutex); 36 mutex_lock(&sbi->s_alloc_mutex);
@@ -61,7 +53,7 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
61 struct super_block *sb = dir->i_sb; 53 struct super_block *sb = dir->i_sb;
62 struct udf_sb_info *sbi = UDF_SB(sb); 54 struct udf_sb_info *sbi = UDF_SB(sb);
63 struct inode *inode; 55 struct inode *inode;
64 int block, ret; 56 int block;
65 uint32_t start = UDF_I(dir)->i_location.logicalBlockNum; 57 uint32_t start = UDF_I(dir)->i_location.logicalBlockNum;
66 struct udf_inode_info *iinfo; 58 struct udf_inode_info *iinfo;
67 struct udf_inode_info *dinfo = UDF_I(dir); 59 struct udf_inode_info *dinfo = UDF_I(dir);
@@ -146,17 +138,6 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err)
146 insert_inode_hash(inode); 138 insert_inode_hash(inode);
147 mark_inode_dirty(inode); 139 mark_inode_dirty(inode);
148 140
149 dquot_initialize(inode);
150 ret = dquot_alloc_inode(inode);
151 if (ret) {
152 dquot_drop(inode);
153 inode->i_flags |= S_NOQUOTA;
154 inode->i_nlink = 0;
155 iput(inode);
156 *err = ret;
157 return NULL;
158 }
159
160 *err = 0; 141 *err = 0;
161 return inode; 142 return inode;
162} 143}
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 8a3fbd177cab..124852bcf6fe 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -36,7 +36,6 @@
36#include <linux/pagemap.h> 36#include <linux/pagemap.h>
37#include <linux/buffer_head.h> 37#include <linux/buffer_head.h>
38#include <linux/writeback.h> 38#include <linux/writeback.h>
39#include <linux/quotaops.h>
40#include <linux/slab.h> 39#include <linux/slab.h>
41#include <linux/crc-itu-t.h> 40#include <linux/crc-itu-t.h>
42 41
@@ -71,9 +70,6 @@ static int udf_get_block(struct inode *, sector_t, struct buffer_head *, int);
71 70
72void udf_delete_inode(struct inode *inode) 71void udf_delete_inode(struct inode *inode)
73{ 72{
74 if (!is_bad_inode(inode))
75 dquot_initialize(inode);
76
77 truncate_inode_pages(&inode->i_data, 0); 73 truncate_inode_pages(&inode->i_data, 0);
78 74
79 if (is_bad_inode(inode)) 75 if (is_bad_inode(inode))
@@ -113,7 +109,6 @@ void udf_clear_inode(struct inode *inode)
113 (unsigned long long)iinfo->i_lenExtents); 109 (unsigned long long)iinfo->i_lenExtents);
114 } 110 }
115 111
116 dquot_drop(inode);
117 kfree(iinfo->i_ext.i_data); 112 kfree(iinfo->i_ext.i_data);
118 iinfo->i_ext.i_data = NULL; 113 iinfo->i_ext.i_data = NULL;
119} 114}
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 585f733615dc..bf5fc674193c 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -27,7 +27,6 @@
27#include <linux/errno.h> 27#include <linux/errno.h>
28#include <linux/mm.h> 28#include <linux/mm.h>
29#include <linux/slab.h> 29#include <linux/slab.h>
30#include <linux/quotaops.h>
31#include <linux/smp_lock.h> 30#include <linux/smp_lock.h>
32#include <linux/buffer_head.h> 31#include <linux/buffer_head.h>
33#include <linux/sched.h> 32#include <linux/sched.h>
@@ -563,8 +562,6 @@ static int udf_create(struct inode *dir, struct dentry *dentry, int mode,
563 int err; 562 int err;
564 struct udf_inode_info *iinfo; 563 struct udf_inode_info *iinfo;
565 564
566 dquot_initialize(dir);
567
568 lock_kernel(); 565 lock_kernel();
569 inode = udf_new_inode(dir, mode, &err); 566 inode = udf_new_inode(dir, mode, &err);
570 if (!inode) { 567 if (!inode) {
@@ -617,8 +614,6 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, int mode,
617 if (!old_valid_dev(rdev)) 614 if (!old_valid_dev(rdev))
618 return -EINVAL; 615 return -EINVAL;
619 616
620 dquot_initialize(dir);
621
622 lock_kernel(); 617 lock_kernel();
623 err = -EIO; 618 err = -EIO;
624 inode = udf_new_inode(dir, mode, &err); 619 inode = udf_new_inode(dir, mode, &err);
@@ -664,8 +659,6 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, int mode)
664 struct udf_inode_info *dinfo = UDF_I(dir); 659 struct udf_inode_info *dinfo = UDF_I(dir);
665 struct udf_inode_info *iinfo; 660 struct udf_inode_info *iinfo;
666 661
667 dquot_initialize(dir);
668
669 lock_kernel(); 662 lock_kernel();
670 err = -EMLINK; 663 err = -EMLINK;
671 if (dir->i_nlink >= (256 << sizeof(dir->i_nlink)) - 1) 664 if (dir->i_nlink >= (256 << sizeof(dir->i_nlink)) - 1)
@@ -800,8 +793,6 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry)
800 struct fileIdentDesc *fi, cfi; 793 struct fileIdentDesc *fi, cfi;
801 struct kernel_lb_addr tloc; 794 struct kernel_lb_addr tloc;
802 795
803 dquot_initialize(dir);
804
805 retval = -ENOENT; 796 retval = -ENOENT;
806 lock_kernel(); 797 lock_kernel();
807 fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi); 798 fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi);
@@ -848,8 +839,6 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry)
848 struct fileIdentDesc cfi; 839 struct fileIdentDesc cfi;
849 struct kernel_lb_addr tloc; 840 struct kernel_lb_addr tloc;
850 841
851 dquot_initialize(dir);
852
853 retval = -ENOENT; 842 retval = -ENOENT;
854 lock_kernel(); 843 lock_kernel();
855 fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi); 844 fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi);
@@ -904,8 +893,6 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
904 struct buffer_head *bh; 893 struct buffer_head *bh;
905 struct udf_inode_info *iinfo; 894 struct udf_inode_info *iinfo;
906 895
907 dquot_initialize(dir);
908
909 lock_kernel(); 896 lock_kernel();
910 inode = udf_new_inode(dir, S_IFLNK | S_IRWXUGO, &err); 897 inode = udf_new_inode(dir, S_IFLNK | S_IRWXUGO, &err);
911 if (!inode) 898 if (!inode)
@@ -1075,8 +1062,6 @@ static int udf_link(struct dentry *old_dentry, struct inode *dir,
1075 int err; 1062 int err;
1076 struct buffer_head *bh; 1063 struct buffer_head *bh;
1077 1064
1078 dquot_initialize(dir);
1079
1080 lock_kernel(); 1065 lock_kernel();
1081 if (inode->i_nlink >= (256 << sizeof(inode->i_nlink)) - 1) { 1066 if (inode->i_nlink >= (256 << sizeof(inode->i_nlink)) - 1) {
1082 unlock_kernel(); 1067 unlock_kernel();
@@ -1139,9 +1124,6 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry,
1139 struct kernel_lb_addr tloc; 1124 struct kernel_lb_addr tloc;
1140 struct udf_inode_info *old_iinfo = UDF_I(old_inode); 1125 struct udf_inode_info *old_iinfo = UDF_I(old_inode);
1141 1126
1142 dquot_initialize(old_dir);
1143 dquot_initialize(new_dir);
1144
1145 lock_kernel(); 1127 lock_kernel();
1146 ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi); 1128 ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi);
1147 if (ofi) { 1129 if (ofi) {
@@ -1387,7 +1369,6 @@ const struct export_operations udf_export_ops = {
1387const struct inode_operations udf_dir_inode_operations = { 1369const struct inode_operations udf_dir_inode_operations = {
1388 .lookup = udf_lookup, 1370 .lookup = udf_lookup,
1389 .create = udf_create, 1371 .create = udf_create,
1390 .setattr = udf_setattr,
1391 .link = udf_link, 1372 .link = udf_link,
1392 .unlink = udf_unlink, 1373 .unlink = udf_unlink,
1393 .symlink = udf_symlink, 1374 .symlink = udf_symlink,
@@ -1400,5 +1381,4 @@ const struct inode_operations udf_symlink_inode_operations = {
1400 .readlink = generic_readlink, 1381 .readlink = generic_readlink,
1401 .follow_link = page_follow_link_light, 1382 .follow_link = page_follow_link_light,
1402 .put_link = page_put_link, 1383 .put_link = page_put_link,
1403 .setattr = udf_setattr,
1404}; 1384};
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 1e4543cbcd27..612d1e2e285a 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -557,6 +557,7 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
557{ 557{
558 struct udf_options uopt; 558 struct udf_options uopt;
559 struct udf_sb_info *sbi = UDF_SB(sb); 559 struct udf_sb_info *sbi = UDF_SB(sb);
560 int error = 0;
560 561
561 uopt.flags = sbi->s_flags; 562 uopt.flags = sbi->s_flags;
562 uopt.uid = sbi->s_uid; 563 uopt.uid = sbi->s_uid;
@@ -582,17 +583,17 @@ static int udf_remount_fs(struct super_block *sb, int *flags, char *options)
582 *flags |= MS_RDONLY; 583 *flags |= MS_RDONLY;
583 } 584 }
584 585
585 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) { 586 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
586 unlock_kernel(); 587 goto out_unlock;
587 return 0; 588
588 }
589 if (*flags & MS_RDONLY) 589 if (*flags & MS_RDONLY)
590 udf_close_lvid(sb); 590 udf_close_lvid(sb);
591 else 591 else
592 udf_open_lvid(sb); 592 udf_open_lvid(sb);
593 593
594out_unlock:
594 unlock_kernel(); 595 unlock_kernel();
595 return 0; 596 return error;
596} 597}
597 598
598/* Check Volume Structure Descriptors (ECMA 167 2/9.1) */ 599/* Check Volume Structure Descriptors (ECMA 167 2/9.1) */
@@ -1939,7 +1940,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
1939 /* Fill in the rest of the superblock */ 1940 /* Fill in the rest of the superblock */
1940 sb->s_op = &udf_sb_ops; 1941 sb->s_op = &udf_sb_ops;
1941 sb->s_export_op = &udf_export_ops; 1942 sb->s_export_op = &udf_export_ops;
1942 sb->dq_op = NULL; 1943
1943 sb->s_dirt = 0; 1944 sb->s_dirt = 0;
1944 sb->s_magic = UDF_SUPER_MAGIC; 1945 sb->s_magic = UDF_SUPER_MAGIC;
1945 sb->s_time_gran = 1000; 1946 sb->s_time_gran = 1000;
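
The remount rework above replaces the early "unlock_kernel(); return 0;" with
a single labelled exit, so the big kernel lock is dropped in exactly one place
and the new error variable gives later changes somewhere to report failures.
A minimal userspace sketch of the same single-exit pattern, all names
illustrative:

	#include <pthread.h>

	static pthread_mutex_t remount_lock = PTHREAD_MUTEX_INITIALIZER;

	static int remount(int want_ro, int is_ro)
	{
		int error = 0;

		pthread_mutex_lock(&remount_lock);

		if (want_ro == is_ro)		/* nothing to change */
			goto out_unlock;

		/* ... switch the volume between read-only and read-write ... */

	out_unlock:
		pthread_mutex_unlock(&remount_lock);	/* one unlock site for every path */
		return error;
	}
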
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index 9079ff7d6255..2bac0354891f 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -131,7 +131,6 @@ extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *,
131 131
132/* file.c */ 132/* file.c */
133extern long udf_ioctl(struct file *, unsigned int, unsigned long); 133extern long udf_ioctl(struct file *, unsigned int, unsigned long);
134extern int udf_setattr(struct dentry *dentry, struct iattr *iattr);
135/* inode.c */ 134/* inode.c */
136extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *); 135extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *);
137extern int udf_sync_inode(struct inode *); 136extern int udf_sync_inode(struct inode *);
diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c
index 5cfa4d85ccf2..048484fb10d2 100644
--- a/fs/ufs/balloc.c
+++ b/fs/ufs/balloc.c
@@ -12,7 +12,6 @@
12#include <linux/stat.h> 12#include <linux/stat.h>
13#include <linux/time.h> 13#include <linux/time.h>
14#include <linux/string.h> 14#include <linux/string.h>
15#include <linux/quotaops.h>
16#include <linux/buffer_head.h> 15#include <linux/buffer_head.h>
17#include <linux/capability.h> 16#include <linux/capability.h>
18#include <linux/bitops.h> 17#include <linux/bitops.h>
@@ -85,9 +84,6 @@ void ufs_free_fragments(struct inode *inode, u64 fragment, unsigned count)
85 "bit already cleared for fragment %u", i); 84 "bit already cleared for fragment %u", i);
86 } 85 }
87 86
88 dquot_free_block(inode, count);
89
90
91 fs32_add(sb, &ucg->cg_cs.cs_nffree, count); 87 fs32_add(sb, &ucg->cg_cs.cs_nffree, count);
92 uspi->cs_total.cs_nffree += count; 88 uspi->cs_total.cs_nffree += count;
93 fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count); 89 fs32_add(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count);
@@ -195,7 +191,6 @@ do_more:
195 ubh_setblock(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno); 191 ubh_setblock(UCPI_UBH(ucpi), ucpi->c_freeoff, blkno);
196 if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD) 192 if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD)
197 ufs_clusteracct (sb, ucpi, blkno, 1); 193 ufs_clusteracct (sb, ucpi, blkno, 1);
198 dquot_free_block(inode, uspi->s_fpb);
199 194
200 fs32_add(sb, &ucg->cg_cs.cs_nbfree, 1); 195 fs32_add(sb, &ucg->cg_cs.cs_nbfree, 1);
201 uspi->cs_total.cs_nbfree++; 196 uspi->cs_total.cs_nbfree++;
@@ -511,7 +506,6 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment,
511 struct ufs_cg_private_info * ucpi; 506 struct ufs_cg_private_info * ucpi;
512 struct ufs_cylinder_group * ucg; 507 struct ufs_cylinder_group * ucg;
513 unsigned cgno, fragno, fragoff, count, fragsize, i; 508 unsigned cgno, fragno, fragoff, count, fragsize, i;
514 int ret;
515 509
516 UFSD("ENTER, fragment %llu, oldcount %u, newcount %u\n", 510 UFSD("ENTER, fragment %llu, oldcount %u, newcount %u\n",
517 (unsigned long long)fragment, oldcount, newcount); 511 (unsigned long long)fragment, oldcount, newcount);
@@ -557,11 +551,6 @@ static u64 ufs_add_fragments(struct inode *inode, u64 fragment,
557 fs32_add(sb, &ucg->cg_frsum[fragsize - count], 1); 551 fs32_add(sb, &ucg->cg_frsum[fragsize - count], 1);
558 for (i = oldcount; i < newcount; i++) 552 for (i = oldcount; i < newcount; i++)
559 ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i); 553 ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, fragno + i);
560 ret = dquot_alloc_block(inode, count);
561 if (ret) {
562 *err = ret;
563 return 0;
564 }
565 554
566 fs32_sub(sb, &ucg->cg_cs.cs_nffree, count); 555 fs32_sub(sb, &ucg->cg_cs.cs_nffree, count);
567 fs32_sub(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count); 556 fs32_sub(sb, &UFS_SB(sb)->fs_cs(cgno).cs_nffree, count);
@@ -598,7 +587,6 @@ static u64 ufs_alloc_fragments(struct inode *inode, unsigned cgno,
598 struct ufs_cylinder_group * ucg; 587 struct ufs_cylinder_group * ucg;
599 unsigned oldcg, i, j, k, allocsize; 588 unsigned oldcg, i, j, k, allocsize;
600 u64 result; 589 u64 result;
601 int ret;
602 590
603 UFSD("ENTER, ino %lu, cgno %u, goal %llu, count %u\n", 591 UFSD("ENTER, ino %lu, cgno %u, goal %llu, count %u\n",
604 inode->i_ino, cgno, (unsigned long long)goal, count); 592 inode->i_ino, cgno, (unsigned long long)goal, count);
@@ -667,7 +655,6 @@ cg_found:
667 for (i = count; i < uspi->s_fpb; i++) 655 for (i = count; i < uspi->s_fpb; i++)
668 ubh_setbit (UCPI_UBH(ucpi), ucpi->c_freeoff, goal + i); 656 ubh_setbit (UCPI_UBH(ucpi), ucpi->c_freeoff, goal + i);
669 i = uspi->s_fpb - count; 657 i = uspi->s_fpb - count;
670 dquot_free_block(inode, i);
671 658
672 fs32_add(sb, &ucg->cg_cs.cs_nffree, i); 659 fs32_add(sb, &ucg->cg_cs.cs_nffree, i);
673 uspi->cs_total.cs_nffree += i; 660 uspi->cs_total.cs_nffree += i;
@@ -679,11 +666,6 @@ cg_found:
679 result = ufs_bitmap_search (sb, ucpi, goal, allocsize); 666 result = ufs_bitmap_search (sb, ucpi, goal, allocsize);
680 if (result == INVBLOCK) 667 if (result == INVBLOCK)
681 return 0; 668 return 0;
682 ret = dquot_alloc_block(inode, count);
683 if (ret) {
684 *err = ret;
685 return 0;
686 }
687 for (i = 0; i < count; i++) 669 for (i = 0; i < count; i++)
688 ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, result + i); 670 ubh_clrbit (UCPI_UBH(ucpi), ucpi->c_freeoff, result + i);
689 671
@@ -718,7 +700,6 @@ static u64 ufs_alloccg_block(struct inode *inode,
718 struct ufs_super_block_first * usb1; 700 struct ufs_super_block_first * usb1;
719 struct ufs_cylinder_group * ucg; 701 struct ufs_cylinder_group * ucg;
720 u64 result, blkno; 702 u64 result, blkno;
721 int ret;
722 703
723 UFSD("ENTER, goal %llu\n", (unsigned long long)goal); 704 UFSD("ENTER, goal %llu\n", (unsigned long long)goal);
724 705
@@ -752,11 +733,6 @@ gotit:
752 ubh_clrblock (UCPI_UBH(ucpi), ucpi->c_freeoff, blkno); 733 ubh_clrblock (UCPI_UBH(ucpi), ucpi->c_freeoff, blkno);
753 if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD) 734 if ((UFS_SB(sb)->s_flags & UFS_CG_MASK) == UFS_CG_44BSD)
754 ufs_clusteracct (sb, ucpi, blkno, -1); 735 ufs_clusteracct (sb, ucpi, blkno, -1);
755 ret = dquot_alloc_block(inode, uspi->s_fpb);
756 if (ret) {
757 *err = ret;
758 return INVBLOCK;
759 }
760 736
761 fs32_sub(sb, &ucg->cg_cs.cs_nbfree, 1); 737 fs32_sub(sb, &ucg->cg_cs.cs_nbfree, 1);
762 uspi->cs_total.cs_nbfree--; 738 uspi->cs_total.cs_nbfree--;
diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c
index 317a0d444f6b..ec784756dc65 100644
--- a/fs/ufs/dir.c
+++ b/fs/ufs/dir.c
@@ -666,6 +666,6 @@ not_empty:
666const struct file_operations ufs_dir_operations = { 666const struct file_operations ufs_dir_operations = {
667 .read = generic_read_dir, 667 .read = generic_read_dir,
668 .readdir = ufs_readdir, 668 .readdir = ufs_readdir,
669 .fsync = simple_fsync, 669 .fsync = generic_file_fsync,
670 .llseek = generic_file_llseek, 670 .llseek = generic_file_llseek,
671}; 671};
diff --git a/fs/ufs/file.c b/fs/ufs/file.c
index a8962cecde5b..33afa20d4509 100644
--- a/fs/ufs/file.c
+++ b/fs/ufs/file.c
@@ -24,7 +24,6 @@
24 */ 24 */
25 25
26#include <linux/fs.h> 26#include <linux/fs.h>
27#include <linux/quotaops.h>
28 27
29#include "ufs_fs.h" 28#include "ufs_fs.h"
30#include "ufs.h" 29#include "ufs.h"
@@ -41,7 +40,7 @@ const struct file_operations ufs_file_operations = {
41 .write = do_sync_write, 40 .write = do_sync_write,
42 .aio_write = generic_file_aio_write, 41 .aio_write = generic_file_aio_write,
43 .mmap = generic_file_mmap, 42 .mmap = generic_file_mmap,
44 .open = dquot_file_open, 43 .open = generic_file_open,
45 .fsync = simple_fsync, 44 .fsync = generic_file_fsync,
46 .splice_read = generic_file_splice_read, 45 .splice_read = generic_file_splice_read,
47}; 46};
diff --git a/fs/ufs/ialloc.c b/fs/ufs/ialloc.c
index 3a959d55084d..594480e537d2 100644
--- a/fs/ufs/ialloc.c
+++ b/fs/ufs/ialloc.c
@@ -27,7 +27,6 @@
27#include <linux/time.h> 27#include <linux/time.h>
28#include <linux/stat.h> 28#include <linux/stat.h>
29#include <linux/string.h> 29#include <linux/string.h>
30#include <linux/quotaops.h>
31#include <linux/buffer_head.h> 30#include <linux/buffer_head.h>
32#include <linux/sched.h> 31#include <linux/sched.h>
33#include <linux/bitops.h> 32#include <linux/bitops.h>
@@ -95,9 +94,6 @@ void ufs_free_inode (struct inode * inode)
95 94
96 is_directory = S_ISDIR(inode->i_mode); 95 is_directory = S_ISDIR(inode->i_mode);
97 96
98 dquot_free_inode(inode);
99 dquot_drop(inode);
100
101 clear_inode (inode); 97 clear_inode (inode);
102 98
103 if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_iusedoff, bit)) 99 if (ubh_isclr (UCPI_UBH(ucpi), ucpi->c_iusedoff, bit))
@@ -347,21 +343,12 @@ cg_found:
347 343
348 unlock_super (sb); 344 unlock_super (sb);
349 345
350 dquot_initialize(inode);
351 err = dquot_alloc_inode(inode);
352 if (err) {
353 dquot_drop(inode);
354 goto fail_without_unlock;
355 }
356
357 UFSD("allocating inode %lu\n", inode->i_ino); 346 UFSD("allocating inode %lu\n", inode->i_ino);
358 UFSD("EXIT\n"); 347 UFSD("EXIT\n");
359 return inode; 348 return inode;
360 349
361fail_remove_inode: 350fail_remove_inode:
362 unlock_super(sb); 351 unlock_super(sb);
363fail_without_unlock:
364 inode->i_flags |= S_NOQUOTA;
365 inode->i_nlink = 0; 352 inode->i_nlink = 0;
366 iput(inode); 353 iput(inode);
367 UFSD("EXIT (FAILED): err %d\n", err); 354 UFSD("EXIT (FAILED): err %d\n", err);
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index cffa756f1047..73fe773aa034 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -37,7 +37,6 @@
37#include <linux/smp_lock.h> 37#include <linux/smp_lock.h>
38#include <linux/buffer_head.h> 38#include <linux/buffer_head.h>
39#include <linux/writeback.h> 39#include <linux/writeback.h>
40#include <linux/quotaops.h>
41 40
42#include "ufs_fs.h" 41#include "ufs_fs.h"
43#include "ufs.h" 42#include "ufs.h"
@@ -910,9 +909,6 @@ void ufs_delete_inode (struct inode * inode)
910{ 909{
911 loff_t old_i_size; 910 loff_t old_i_size;
912 911
913 if (!is_bad_inode(inode))
914 dquot_initialize(inode);
915
916 truncate_inode_pages(&inode->i_data, 0); 912 truncate_inode_pages(&inode->i_data, 0);
917 if (is_bad_inode(inode)) 913 if (is_bad_inode(inode))
918 goto no_delete; 914 goto no_delete;
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index eabc02eb1294..b056f02b1fb3 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -30,7 +30,6 @@
30#include <linux/time.h> 30#include <linux/time.h>
31#include <linux/fs.h> 31#include <linux/fs.h>
32#include <linux/smp_lock.h> 32#include <linux/smp_lock.h>
33#include <linux/quotaops.h>
34 33
35#include "ufs_fs.h" 34#include "ufs_fs.h"
36#include "ufs.h" 35#include "ufs.h"
@@ -86,8 +85,6 @@ static int ufs_create (struct inode * dir, struct dentry * dentry, int mode,
86 85
87 UFSD("BEGIN\n"); 86 UFSD("BEGIN\n");
88 87
89 dquot_initialize(dir);
90
91 inode = ufs_new_inode(dir, mode); 88 inode = ufs_new_inode(dir, mode);
92 err = PTR_ERR(inode); 89 err = PTR_ERR(inode);
93 90
@@ -112,8 +109,6 @@ static int ufs_mknod (struct inode * dir, struct dentry *dentry, int mode, dev_t
112 if (!old_valid_dev(rdev)) 109 if (!old_valid_dev(rdev))
113 return -EINVAL; 110 return -EINVAL;
114 111
115 dquot_initialize(dir);
116
117 inode = ufs_new_inode(dir, mode); 112 inode = ufs_new_inode(dir, mode);
118 err = PTR_ERR(inode); 113 err = PTR_ERR(inode);
119 if (!IS_ERR(inode)) { 114 if (!IS_ERR(inode)) {
@@ -138,8 +133,6 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry,
138 if (l > sb->s_blocksize) 133 if (l > sb->s_blocksize)
139 goto out_notlocked; 134 goto out_notlocked;
140 135
141 dquot_initialize(dir);
142
143 lock_kernel(); 136 lock_kernel();
144 inode = ufs_new_inode(dir, S_IFLNK | S_IRWXUGO); 137 inode = ufs_new_inode(dir, S_IFLNK | S_IRWXUGO);
145 err = PTR_ERR(inode); 138 err = PTR_ERR(inode);
@@ -185,8 +178,6 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir,
185 return -EMLINK; 178 return -EMLINK;
186 } 179 }
187 180
188 dquot_initialize(dir);
189
190 inode->i_ctime = CURRENT_TIME_SEC; 181 inode->i_ctime = CURRENT_TIME_SEC;
191 inode_inc_link_count(inode); 182 inode_inc_link_count(inode);
192 atomic_inc(&inode->i_count); 183 atomic_inc(&inode->i_count);
@@ -204,8 +195,6 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, int mode)
204 if (dir->i_nlink >= UFS_LINK_MAX) 195 if (dir->i_nlink >= UFS_LINK_MAX)
205 goto out; 196 goto out;
206 197
207 dquot_initialize(dir);
208
209 lock_kernel(); 198 lock_kernel();
210 inode_inc_link_count(dir); 199 inode_inc_link_count(dir);
211 200
@@ -250,8 +239,6 @@ static int ufs_unlink(struct inode *dir, struct dentry *dentry)
250 struct page *page; 239 struct page *page;
251 int err = -ENOENT; 240 int err = -ENOENT;
252 241
253 dquot_initialize(dir);
254
255 de = ufs_find_entry(dir, &dentry->d_name, &page); 242 de = ufs_find_entry(dir, &dentry->d_name, &page);
256 if (!de) 243 if (!de)
257 goto out; 244 goto out;
@@ -296,9 +283,6 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry,
296 struct ufs_dir_entry *old_de; 283 struct ufs_dir_entry *old_de;
297 int err = -ENOENT; 284 int err = -ENOENT;
298 285
299 dquot_initialize(old_dir);
300 dquot_initialize(new_dir);
301
302 old_de = ufs_find_entry(old_dir, &old_dentry->d_name, &old_page); 286 old_de = ufs_find_entry(old_dir, &old_dentry->d_name, &old_page);
303 if (!old_de) 287 if (!old_de)
304 goto out; 288 goto out;
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index 14743d935a93..3ec5a9eb6efb 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -77,7 +77,6 @@
77 77
78#include <linux/errno.h> 78#include <linux/errno.h>
79#include <linux/fs.h> 79#include <linux/fs.h>
80#include <linux/quotaops.h>
81#include <linux/slab.h> 80#include <linux/slab.h>
82#include <linux/time.h> 81#include <linux/time.h>
83#include <linux/stat.h> 82#include <linux/stat.h>
@@ -918,6 +917,7 @@ again:
918 sbi->s_bytesex = BYTESEX_LE; 917 sbi->s_bytesex = BYTESEX_LE;
919 switch ((uspi->fs_magic = fs32_to_cpu(sb, usb3->fs_magic))) { 918 switch ((uspi->fs_magic = fs32_to_cpu(sb, usb3->fs_magic))) {
920 case UFS_MAGIC: 919 case UFS_MAGIC:
920 case UFS_MAGIC_BW:
921 case UFS2_MAGIC: 921 case UFS2_MAGIC:
922 case UFS_MAGIC_LFN: 922 case UFS_MAGIC_LFN:
923 case UFS_MAGIC_FEA: 923 case UFS_MAGIC_FEA:
@@ -927,6 +927,7 @@ again:
927 sbi->s_bytesex = BYTESEX_BE; 927 sbi->s_bytesex = BYTESEX_BE;
928 switch ((uspi->fs_magic = fs32_to_cpu(sb, usb3->fs_magic))) { 928 switch ((uspi->fs_magic = fs32_to_cpu(sb, usb3->fs_magic))) {
929 case UFS_MAGIC: 929 case UFS_MAGIC:
930 case UFS_MAGIC_BW:
930 case UFS2_MAGIC: 931 case UFS2_MAGIC:
931 case UFS_MAGIC_LFN: 932 case UFS_MAGIC_LFN:
932 case UFS_MAGIC_FEA: 933 case UFS_MAGIC_FEA:
@@ -1045,7 +1046,7 @@ magic_found:
1045 */ 1046 */
1046 sb->s_op = &ufs_super_ops; 1047 sb->s_op = &ufs_super_ops;
1047 sb->s_export_op = &ufs_export_ops; 1048 sb->s_export_op = &ufs_export_ops;
1048 sb->dq_op = NULL; /***/ 1049
1049 sb->s_magic = fs32_to_cpu(sb, usb3->fs_magic); 1050 sb->s_magic = fs32_to_cpu(sb, usb3->fs_magic);
1050 1051
1051 uspi->s_sblkno = fs32_to_cpu(sb, usb1->fs_sblkno); 1052 uspi->s_sblkno = fs32_to_cpu(sb, usb1->fs_sblkno);
@@ -1435,126 +1436,19 @@ static void destroy_inodecache(void)
1435 kmem_cache_destroy(ufs_inode_cachep); 1436 kmem_cache_destroy(ufs_inode_cachep);
1436} 1437}
1437 1438
1438static void ufs_clear_inode(struct inode *inode)
1439{
1440 dquot_drop(inode);
1441}
1442
1443#ifdef CONFIG_QUOTA
1444static ssize_t ufs_quota_read(struct super_block *, int, char *,size_t, loff_t);
1445static ssize_t ufs_quota_write(struct super_block *, int, const char *, size_t, loff_t);
1446#endif
1447
1448static const struct super_operations ufs_super_ops = { 1439static const struct super_operations ufs_super_ops = {
1449 .alloc_inode = ufs_alloc_inode, 1440 .alloc_inode = ufs_alloc_inode,
1450 .destroy_inode = ufs_destroy_inode, 1441 .destroy_inode = ufs_destroy_inode,
1451 .write_inode = ufs_write_inode, 1442 .write_inode = ufs_write_inode,
1452 .delete_inode = ufs_delete_inode, 1443 .delete_inode = ufs_delete_inode,
1453 .clear_inode = ufs_clear_inode,
1454 .put_super = ufs_put_super, 1444 .put_super = ufs_put_super,
1455 .write_super = ufs_write_super, 1445 .write_super = ufs_write_super,
1456 .sync_fs = ufs_sync_fs, 1446 .sync_fs = ufs_sync_fs,
1457 .statfs = ufs_statfs, 1447 .statfs = ufs_statfs,
1458 .remount_fs = ufs_remount, 1448 .remount_fs = ufs_remount,
1459 .show_options = ufs_show_options, 1449 .show_options = ufs_show_options,
1460#ifdef CONFIG_QUOTA
1461 .quota_read = ufs_quota_read,
1462 .quota_write = ufs_quota_write,
1463#endif
1464}; 1450};
1465 1451
1466#ifdef CONFIG_QUOTA
1467
1468/* Read data from quotafile - avoid pagecache and such because we cannot afford
1469 * acquiring the locks... As quota files are never truncated and quota code
1470 * itself serializes the operations (and no one else should touch the files)
1471 * we don't have to be afraid of races */
1472static ssize_t ufs_quota_read(struct super_block *sb, int type, char *data,
1473 size_t len, loff_t off)
1474{
1475 struct inode *inode = sb_dqopt(sb)->files[type];
1476 sector_t blk = off >> sb->s_blocksize_bits;
1477 int err = 0;
1478 int offset = off & (sb->s_blocksize - 1);
1479 int tocopy;
1480 size_t toread;
1481 struct buffer_head *bh;
1482 loff_t i_size = i_size_read(inode);
1483
1484 if (off > i_size)
1485 return 0;
1486 if (off+len > i_size)
1487 len = i_size-off;
1488 toread = len;
1489 while (toread > 0) {
1490 tocopy = sb->s_blocksize - offset < toread ?
1491 sb->s_blocksize - offset : toread;
1492
1493 bh = ufs_bread(inode, blk, 0, &err);
1494 if (err)
1495 return err;
1496 if (!bh) /* A hole? */
1497 memset(data, 0, tocopy);
1498 else {
1499 memcpy(data, bh->b_data+offset, tocopy);
1500 brelse(bh);
1501 }
1502 offset = 0;
1503 toread -= tocopy;
1504 data += tocopy;
1505 blk++;
1506 }
1507 return len;
1508}
1509
1510/* Write to quotafile */
1511static ssize_t ufs_quota_write(struct super_block *sb, int type,
1512 const char *data, size_t len, loff_t off)
1513{
1514 struct inode *inode = sb_dqopt(sb)->files[type];
1515 sector_t blk = off >> sb->s_blocksize_bits;
1516 int err = 0;
1517 int offset = off & (sb->s_blocksize - 1);
1518 int tocopy;
1519 size_t towrite = len;
1520 struct buffer_head *bh;
1521
1522 mutex_lock_nested(&inode->i_mutex, I_MUTEX_QUOTA);
1523 while (towrite > 0) {
1524 tocopy = sb->s_blocksize - offset < towrite ?
1525 sb->s_blocksize - offset : towrite;
1526
1527 bh = ufs_bread(inode, blk, 1, &err);
1528 if (!bh)
1529 goto out;
1530 lock_buffer(bh);
1531 memcpy(bh->b_data+offset, data, tocopy);
1532 flush_dcache_page(bh->b_page);
1533 set_buffer_uptodate(bh);
1534 mark_buffer_dirty(bh);
1535 unlock_buffer(bh);
1536 brelse(bh);
1537 offset = 0;
1538 towrite -= tocopy;
1539 data += tocopy;
1540 blk++;
1541 }
1542out:
1543 if (len == towrite) {
1544 mutex_unlock(&inode->i_mutex);
1545 return err;
1546 }
1547 if (inode->i_size < off+len-towrite)
1548 i_size_write(inode, off+len-towrite);
1549 inode->i_version++;
1550 inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
1551 mark_inode_dirty(inode);
1552 mutex_unlock(&inode->i_mutex);
1553 return len - towrite;
1554}
1555
1556#endif
1557
1558static int ufs_get_sb(struct file_system_type *fs_type, 1452static int ufs_get_sb(struct file_system_type *fs_type,
1559 int flags, const char *dev_name, void *data, struct vfsmount *mnt) 1453 int flags, const char *dev_name, void *data, struct vfsmount *mnt)
1560{ 1454{
diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c
index f294c44577dc..589e01a465ba 100644
--- a/fs/ufs/truncate.c
+++ b/fs/ufs/truncate.c
@@ -44,7 +44,6 @@
44#include <linux/buffer_head.h> 44#include <linux/buffer_head.h>
45#include <linux/blkdev.h> 45#include <linux/blkdev.h>
46#include <linux/sched.h> 46#include <linux/sched.h>
47#include <linux/quotaops.h>
48 47
49#include "ufs_fs.h" 48#include "ufs_fs.h"
50#include "ufs.h" 49#include "ufs.h"
@@ -501,12 +500,10 @@ out:
501 return err; 500 return err;
502} 501}
503 502
504
505/* 503/*
506 * We don't define our `inode->i_op->truncate', and call it here, 504 * TODO:
507 * because of: 505 * - truncate case should use proper ordering instead of using
508 * - there is no way to know old size 506 * simple_setsize
509 * - there is no way to inform the user about an error, if it happens in `truncate'
510 */ 507 */
511int ufs_setattr(struct dentry *dentry, struct iattr *attr) 508int ufs_setattr(struct dentry *dentry, struct iattr *attr)
512{ 509{
@@ -518,19 +515,10 @@ int ufs_setattr(struct dentry *dentry, struct iattr *attr)
518 if (error) 515 if (error)
519 return error; 516 return error;
520 517
521 if (is_quota_modification(inode, attr))
522 dquot_initialize(inode);
523
524 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
525 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
526 error = dquot_transfer(inode, attr);
527 if (error)
528 return error;
529 }
530 if (ia_valid & ATTR_SIZE && attr->ia_size != inode->i_size) { 518 if (ia_valid & ATTR_SIZE && attr->ia_size != inode->i_size) {
531 loff_t old_i_size = inode->i_size; 519 loff_t old_i_size = inode->i_size;
532 520
533 error = vmtruncate(inode, attr->ia_size); 521 error = simple_setsize(inode, attr->ia_size);
534 if (error) 522 if (error)
535 return error; 523 return error;
536 error = ufs_truncate(inode, old_i_size); 524 error = ufs_truncate(inode, old_i_size);
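
ufs_setattr() thus shrinks to the generic checks plus the size change, and
vmtruncate() is replaced by simple_setsize(), which validates the new size
before touching anything. Roughly what that helper does in this era, as a
sketch (inode_newsize_ok, i_size_write and truncate_pagecache are the real
API; the body is an approximation, not the exact implementation):

	static int setsize_sketch(struct inode *inode, loff_t newsize)
	{
		loff_t oldsize;
		int error;

		error = inode_newsize_ok(inode, newsize);	/* rlimit/s_maxbytes */
		if (error)
			return error;

		oldsize = inode->i_size;
		i_size_write(inode, newsize);			/* publish new size */
		truncate_pagecache(inode, oldsize, newsize);	/* drop pages past EOF */
		return 0;
	}
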
diff --git a/fs/ufs/ufs_fs.h b/fs/ufs/ufs_fs.h
index 6943ec677c0b..8aba544f9fad 100644
--- a/fs/ufs/ufs_fs.h
+++ b/fs/ufs/ufs_fs.h
@@ -48,6 +48,7 @@ typedef __u16 __bitwise __fs16;
48#define UFS_SECTOR_SIZE 512 48#define UFS_SECTOR_SIZE 512
49#define UFS_SECTOR_BITS 9 49#define UFS_SECTOR_BITS 9
50#define UFS_MAGIC 0x00011954 50#define UFS_MAGIC 0x00011954
51#define UFS_MAGIC_BW 0x0f242697
51#define UFS2_MAGIC 0x19540119 52#define UFS2_MAGIC 0x19540119
52#define UFS_CIGAM 0x54190100 /* byteswapped MAGIC */ 53#define UFS_CIGAM 0x54190100 /* byteswapped MAGIC */
53 54
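
UFS_MAGIC_BW feeds both byte-order probes in the super.c hunks above. The
dual-endian magic check, as a standalone toy sketch (the constants come from
this header; everything else is illustrative):

	#include <stdbool.h>
	#include <stdint.h>

	#define UFS_MAGIC	0x00011954u
	#define UFS_MAGIC_BW	0x0f242697u
	#define UFS2_MAGIC	0x19540119u

	static uint32_t swab32(uint32_t v)
	{
		return (v >> 24) | ((v >> 8) & 0xff00) |
		       ((v << 8) & 0xff0000) | (v << 24);
	}

	static bool magic_ok(uint32_t m)
	{
		return m == UFS_MAGIC || m == UFS_MAGIC_BW || m == UFS2_MAGIC;
	}

	/* Accept the magic in either byte order, as the mount probe does. */
	static bool probe_magic(uint32_t raw)
	{
		return magic_ok(raw) || magic_ok(swab32(raw));
	}
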
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 089eaca860b4..34640d6dbdcb 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -1333,6 +1333,21 @@ xfs_vm_writepage(
1333 trace_xfs_writepage(inode, page, 0); 1333 trace_xfs_writepage(inode, page, 0);
1334 1334
1335 /* 1335 /*
1336 * Refuse to write the page out if we are called from reclaim context.
1337 *
 1338 * This is primarily to avoid stack overflows when called from deeply
 1339 * nested stacks in random callers doing direct reclaim, but disabling
 1340 * reclaim for kswapd is a nice side effect too, as kswapd causes
 1341 * rather suboptimal I/O patterns.
1342 *
1343 * This should really be done by the core VM, but until that happens
1344 * filesystems like XFS, btrfs and ext4 have to take care of this
1345 * by themselves.
1346 */
1347 if (current->flags & PF_MEMALLOC)
1348 goto out_fail;
1349
1350 /*
1336 * We need a transaction if: 1351 * We need a transaction if:
1337 * 1. There are delalloc buffers on the page 1352 * 1. There are delalloc buffers on the page
1338 * 2. The page is uptodate and we have unmapped buffers 1353 * 2. The page is uptodate and we have unmapped buffers
@@ -1366,14 +1381,6 @@ xfs_vm_writepage(
1366 if (!page_has_buffers(page)) 1381 if (!page_has_buffers(page))
1367 create_empty_buffers(page, 1 << inode->i_blkbits, 0); 1382 create_empty_buffers(page, 1 << inode->i_blkbits, 0);
1368 1383
1369
1370 /*
1371 * VM calculation for nr_to_write seems off. Bump it way
1372 * up, this gets simple streaming writes zippy again.
1373 * To be reviewed again after Jens' writeback changes.
1374 */
1375 wbc->nr_to_write *= 4;
1376
1377 /* 1384 /*
1378 * Convert delayed allocate, unwritten or unmapped space 1385 * Convert delayed allocate, unwritten or unmapped space
1379 * to real space and flush out to disk. 1386 * to real space and flush out to disk.
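
The PF_MEMALLOC check added above is the standard shape for refusing
writeback when entered from memory reclaim. What the refused path
conventionally does in a ->writepage implementation, as a sketch
(redirty_page_for_writepage and unlock_page are the real page-cache helpers;
the surrounding function is illustrative):

	static int writepage_sketch(struct page *page,
				    struct writeback_control *wbc)
	{
		if (current->flags & PF_MEMALLOC) {
			redirty_page_for_writepage(wbc, page);	/* keep it queued */
			unlock_page(page);			/* hand it back */
			return 0;				/* refused, not an error */
		}

		/* ... convert delalloc/unwritten space and submit the I/O ... */
		return 0;
	}
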
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index d8fb1b5d6cb5..257a56b127cf 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -100,10 +100,10 @@ xfs_iozero(
100STATIC int 100STATIC int
101xfs_file_fsync( 101xfs_file_fsync(
102 struct file *file, 102 struct file *file,
103 struct dentry *dentry,
104 int datasync) 103 int datasync)
105{ 104{
106 struct xfs_inode *ip = XFS_I(dentry->d_inode); 105 struct inode *inode = file->f_mapping->host;
106 struct xfs_inode *ip = XFS_I(inode);
107 struct xfs_trans *tp; 107 struct xfs_trans *tp;
108 int error = 0; 108 int error = 0;
109 int log_flushed = 0; 109 int log_flushed = 0;
@@ -140,8 +140,8 @@ xfs_file_fsync(
 140 * might get cleared when the inode gets written out via the AIL 140 * might get cleared when the inode gets written out via the AIL
141 * or xfs_iflush_cluster. 141 * or xfs_iflush_cluster.
142 */ 142 */
143 if (((dentry->d_inode->i_state & I_DIRTY_DATASYNC) || 143 if (((inode->i_state & I_DIRTY_DATASYNC) ||
144 ((dentry->d_inode->i_state & I_DIRTY_SYNC) && !datasync)) && 144 ((inode->i_state & I_DIRTY_SYNC) && !datasync)) &&
145 ip->i_update_core) { 145 ip->i_update_core) {
146 /* 146 /*
147 * Kick off a transaction to log the inode core to get the 147 * Kick off a transaction to log the inode core to get the
@@ -868,7 +868,7 @@ write_retry:
868 mutex_lock(&inode->i_mutex); 868 mutex_lock(&inode->i_mutex);
869 xfs_ilock(ip, iolock); 869 xfs_ilock(ip, iolock);
870 870
871 error2 = -xfs_file_fsync(file, file->f_path.dentry, 871 error2 = -xfs_file_fsync(file,
872 (file->f_flags & __O_SYNC) ? 0 : 1); 872 (file->f_flags & __O_SYNC) ? 0 : 1);
873 if (!error) 873 if (!error)
874 error = error2; 874 error = error2;
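
Both hunks track the VFS change that dropped the dentry argument from
->fsync; the inode is now reached through the file itself. The minimal shape,
as a sketch:

	static int fsync_sketch(struct file *file, int datasync)
	{
		struct inode *inode = file->f_mapping->host;	/* backing inode */

		/* ... flush dirty data, then log the inode if it is dirty ... */
		return 0;
	}
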
diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c
index 9c8019c78c92..44f0b2de153e 100644
--- a/fs/xfs/linux-2.6/xfs_iops.c
+++ b/fs/xfs/linux-2.6/xfs_iops.c
@@ -585,11 +585,20 @@ xfs_vn_fallocate(
585 bf.l_len = len; 585 bf.l_len = len;
586 586
587 xfs_ilock(ip, XFS_IOLOCK_EXCL); 587 xfs_ilock(ip, XFS_IOLOCK_EXCL);
588
589 /* check the new inode size is valid before allocating */
590 if (!(mode & FALLOC_FL_KEEP_SIZE) &&
591 offset + len > i_size_read(inode)) {
592 new_size = offset + len;
593 error = inode_newsize_ok(inode, new_size);
594 if (error)
595 goto out_unlock;
596 }
597
588 error = -xfs_change_file_space(ip, XFS_IOC_RESVSP, &bf, 598 error = -xfs_change_file_space(ip, XFS_IOC_RESVSP, &bf,
589 0, XFS_ATTR_NOLOCK); 599 0, XFS_ATTR_NOLOCK);
590 if (!error && !(mode & FALLOC_FL_KEEP_SIZE) && 600 if (error)
591 offset + len > i_size_read(inode)) 601 goto out_unlock;
592 new_size = offset + len;
593 602
594 /* Change file size if needed */ 603 /* Change file size if needed */
595 if (new_size) { 604 if (new_size) {
@@ -600,6 +609,7 @@ xfs_vn_fallocate(
600 error = -xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK); 609 error = -xfs_setattr(ip, &iattr, XFS_ATTR_NOLOCK);
601 } 610 }
602 611
612out_unlock:
603 xfs_iunlock(ip, XFS_IOLOCK_EXCL); 613 xfs_iunlock(ip, XFS_IOLOCK_EXCL);
604out_error: 614out_error:
605 return error; 615 return error;
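
The reordering above runs inode_newsize_ok() before any blocks are reserved,
so a request past the rlimit or s_maxbytes fails cleanly instead of
allocating space that must then be backed out. The essential ordering, as a
sketch (i_size_read and inode_newsize_ok are the real API; reserve_space and
update_isize are hypothetical stand-ins):

	static int fallocate_sketch(struct inode *inode, int keep_size,
				    loff_t offset, loff_t len)
	{
		loff_t new_size = 0;
		int error;

		if (!keep_size && offset + len > i_size_read(inode)) {
			new_size = offset + len;
			error = inode_newsize_ok(inode, new_size); /* limits first */
			if (error)
				return error;	/* nothing reserved yet */
		}

		error = reserve_space(inode, offset, len);	/* hypothetical */
		if (error)
			return error;

		if (new_size)
			error = update_isize(inode, new_size);	/* hypothetical */
		return error;
	}
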
diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/linux-2.6/xfs_quotaops.c
index 9ac8aea91529..067cafbfc635 100644
--- a/fs/xfs/linux-2.6/xfs_quotaops.c
+++ b/fs/xfs/linux-2.6/xfs_quotaops.c
@@ -23,7 +23,6 @@
23#include "xfs_ag.h" 23#include "xfs_ag.h"
24#include "xfs_mount.h" 24#include "xfs_mount.h"
25#include "xfs_quota.h" 25#include "xfs_quota.h"
26#include "xfs_log.h"
27#include "xfs_trans.h" 26#include "xfs_trans.h"
28#include "xfs_bmap_btree.h" 27#include "xfs_bmap_btree.h"
29#include "xfs_inode.h" 28#include "xfs_inode.h"
diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c
index 3884e20bc14e..ef7f0218bccb 100644
--- a/fs/xfs/linux-2.6/xfs_sync.c
+++ b/fs/xfs/linux-2.6/xfs_sync.c
@@ -164,10 +164,6 @@ xfs_inode_ag_iterator(
164 struct xfs_perag *pag; 164 struct xfs_perag *pag;
165 165
166 pag = xfs_perag_get(mp, ag); 166 pag = xfs_perag_get(mp, ag);
167 if (!pag->pag_ici_init) {
168 xfs_perag_put(pag);
169 continue;
170 }
171 error = xfs_inode_ag_walk(mp, pag, execute, flags, tag, 167 error = xfs_inode_ag_walk(mp, pag, execute, flags, tag,
172 exclusive, &nr); 168 exclusive, &nr);
173 xfs_perag_put(pag); 169 xfs_perag_put(pag);
@@ -867,12 +863,7 @@ xfs_reclaim_inode_shrink(
867 down_read(&xfs_mount_list_lock); 863 down_read(&xfs_mount_list_lock);
868 list_for_each_entry(mp, &xfs_mount_list, m_mplist) { 864 list_for_each_entry(mp, &xfs_mount_list, m_mplist) {
869 for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) { 865 for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) {
870
871 pag = xfs_perag_get(mp, ag); 866 pag = xfs_perag_get(mp, ag);
872 if (!pag->pag_ici_init) {
873 xfs_perag_put(pag);
874 continue;
875 }
876 reclaimable += pag->pag_ici_reclaimable; 867 reclaimable += pag->pag_ici_reclaimable;
877 xfs_perag_put(pag); 868 xfs_perag_put(pag);
878 } 869 }
diff --git a/fs/xfs/linux-2.6/xfs_trace.c b/fs/xfs/linux-2.6/xfs_trace.c
index 207fa77f63ae..d12be8470cba 100644
--- a/fs/xfs/linux-2.6/xfs_trace.c
+++ b/fs/xfs/linux-2.6/xfs_trace.c
@@ -50,7 +50,6 @@
50#include "quota/xfs_dquot_item.h" 50#include "quota/xfs_dquot_item.h"
51#include "quota/xfs_dquot.h" 51#include "quota/xfs_dquot.h"
52#include "xfs_log_recover.h" 52#include "xfs_log_recover.h"
53#include "xfs_buf_item.h"
54#include "xfs_inode_item.h" 53#include "xfs_inode_item.h"
55 54
56/* 55/*
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index ff6bc797baf2..73d5aa117384 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -82,33 +82,6 @@ DECLARE_EVENT_CLASS(xfs_attr_list_class,
82 ) 82 )
83) 83)
84 84
85#define DEFINE_PERAG_REF_EVENT(name) \
86TRACE_EVENT(name, \
87 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount, \
88 unsigned long caller_ip), \
89 TP_ARGS(mp, agno, refcount, caller_ip), \
90 TP_STRUCT__entry( \
91 __field(dev_t, dev) \
92 __field(xfs_agnumber_t, agno) \
93 __field(int, refcount) \
94 __field(unsigned long, caller_ip) \
95 ), \
96 TP_fast_assign( \
97 __entry->dev = mp->m_super->s_dev; \
98 __entry->agno = agno; \
99 __entry->refcount = refcount; \
100 __entry->caller_ip = caller_ip; \
101 ), \
102 TP_printk("dev %d:%d agno %u refcount %d caller %pf", \
103 MAJOR(__entry->dev), MINOR(__entry->dev), \
104 __entry->agno, \
105 __entry->refcount, \
106 (char *)__entry->caller_ip) \
107);
108
109DEFINE_PERAG_REF_EVENT(xfs_perag_get)
110DEFINE_PERAG_REF_EVENT(xfs_perag_put)
111
112#define DEFINE_ATTR_LIST_EVENT(name) \ 85#define DEFINE_ATTR_LIST_EVENT(name) \
113DEFINE_EVENT(xfs_attr_list_class, name, \ 86DEFINE_EVENT(xfs_attr_list_class, name, \
114 TP_PROTO(struct xfs_attr_list_context *ctx), \ 87 TP_PROTO(struct xfs_attr_list_context *ctx), \
@@ -122,6 +95,37 @@ DEFINE_ATTR_LIST_EVENT(xfs_attr_list_add);
122DEFINE_ATTR_LIST_EVENT(xfs_attr_list_wrong_blk); 95DEFINE_ATTR_LIST_EVENT(xfs_attr_list_wrong_blk);
123DEFINE_ATTR_LIST_EVENT(xfs_attr_list_notfound); 96DEFINE_ATTR_LIST_EVENT(xfs_attr_list_notfound);
124 97
98DECLARE_EVENT_CLASS(xfs_perag_class,
99 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount,
100 unsigned long caller_ip),
101 TP_ARGS(mp, agno, refcount, caller_ip),
102 TP_STRUCT__entry(
103 __field(dev_t, dev)
104 __field(xfs_agnumber_t, agno)
105 __field(int, refcount)
106 __field(unsigned long, caller_ip)
107 ),
108 TP_fast_assign(
109 __entry->dev = mp->m_super->s_dev;
110 __entry->agno = agno;
111 __entry->refcount = refcount;
112 __entry->caller_ip = caller_ip;
113 ),
114 TP_printk("dev %d:%d agno %u refcount %d caller %pf",
115 MAJOR(__entry->dev), MINOR(__entry->dev),
116 __entry->agno,
117 __entry->refcount,
118 (char *)__entry->caller_ip)
119);
120
121#define DEFINE_PERAG_REF_EVENT(name) \
122DEFINE_EVENT(xfs_perag_class, name, \
123 TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, int refcount, \
124 unsigned long caller_ip), \
125 TP_ARGS(mp, agno, refcount, caller_ip))
126DEFINE_PERAG_REF_EVENT(xfs_perag_get);
127DEFINE_PERAG_REF_EVENT(xfs_perag_put);
128
125TRACE_EVENT(xfs_attr_list_node_descend, 129TRACE_EVENT(xfs_attr_list_node_descend,
126 TP_PROTO(struct xfs_attr_list_context *ctx, 130 TP_PROTO(struct xfs_attr_list_context *ctx,
127 struct xfs_da_node_entry *btree), 131 struct xfs_da_node_entry *btree),
@@ -775,165 +779,181 @@ DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_enter);
775DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_exit); 779DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_exit);
776DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_sub); 780DEFINE_LOGGRANT_EVENT(xfs_log_ungrant_sub);
777 781
778#define DEFINE_RW_EVENT(name) \ 782DECLARE_EVENT_CLASS(xfs_file_class,
779TRACE_EVENT(name, \ 783 TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags),
780 TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags), \ 784 TP_ARGS(ip, count, offset, flags),
781 TP_ARGS(ip, count, offset, flags), \ 785 TP_STRUCT__entry(
782 TP_STRUCT__entry( \ 786 __field(dev_t, dev)
783 __field(dev_t, dev) \ 787 __field(xfs_ino_t, ino)
784 __field(xfs_ino_t, ino) \ 788 __field(xfs_fsize_t, size)
785 __field(xfs_fsize_t, size) \ 789 __field(xfs_fsize_t, new_size)
786 __field(xfs_fsize_t, new_size) \ 790 __field(loff_t, offset)
787 __field(loff_t, offset) \ 791 __field(size_t, count)
788 __field(size_t, count) \ 792 __field(int, flags)
789 __field(int, flags) \ 793 ),
790 ), \ 794 TP_fast_assign(
791 TP_fast_assign( \ 795 __entry->dev = VFS_I(ip)->i_sb->s_dev;
792 __entry->dev = VFS_I(ip)->i_sb->s_dev; \ 796 __entry->ino = ip->i_ino;
793 __entry->ino = ip->i_ino; \ 797 __entry->size = ip->i_d.di_size;
794 __entry->size = ip->i_d.di_size; \ 798 __entry->new_size = ip->i_new_size;
795 __entry->new_size = ip->i_new_size; \ 799 __entry->offset = offset;
796 __entry->offset = offset; \ 800 __entry->count = count;
797 __entry->count = count; \ 801 __entry->flags = flags;
798 __entry->flags = flags; \ 802 ),
799 ), \ 803 TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx "
800 TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " \ 804 "offset 0x%llx count 0x%zx ioflags %s",
801 "offset 0x%llx count 0x%zx ioflags %s", \ 805 MAJOR(__entry->dev), MINOR(__entry->dev),
802 MAJOR(__entry->dev), MINOR(__entry->dev), \ 806 __entry->ino,
803 __entry->ino, \ 807 __entry->size,
804 __entry->size, \ 808 __entry->new_size,
805 __entry->new_size, \ 809 __entry->offset,
806 __entry->offset, \ 810 __entry->count,
807 __entry->count, \ 811 __print_flags(__entry->flags, "|", XFS_IO_FLAGS))
808 __print_flags(__entry->flags, "|", XFS_IO_FLAGS)) \
809) 812)
813
814#define DEFINE_RW_EVENT(name) \
815DEFINE_EVENT(xfs_file_class, name, \
816 TP_PROTO(struct xfs_inode *ip, size_t count, loff_t offset, int flags), \
817 TP_ARGS(ip, count, offset, flags))
810DEFINE_RW_EVENT(xfs_file_read); 818DEFINE_RW_EVENT(xfs_file_read);
811DEFINE_RW_EVENT(xfs_file_buffered_write); 819DEFINE_RW_EVENT(xfs_file_buffered_write);
812DEFINE_RW_EVENT(xfs_file_direct_write); 820DEFINE_RW_EVENT(xfs_file_direct_write);
813DEFINE_RW_EVENT(xfs_file_splice_read); 821DEFINE_RW_EVENT(xfs_file_splice_read);
814DEFINE_RW_EVENT(xfs_file_splice_write); 822DEFINE_RW_EVENT(xfs_file_splice_write);
815 823
816 824DECLARE_EVENT_CLASS(xfs_page_class,
817#define DEFINE_PAGE_EVENT(name) \ 825 TP_PROTO(struct inode *inode, struct page *page, unsigned long off),
818TRACE_EVENT(name, \ 826 TP_ARGS(inode, page, off),
819 TP_PROTO(struct inode *inode, struct page *page, unsigned long off), \ 827 TP_STRUCT__entry(
820 TP_ARGS(inode, page, off), \ 828 __field(dev_t, dev)
821 TP_STRUCT__entry( \ 829 __field(xfs_ino_t, ino)
822 __field(dev_t, dev) \ 830 __field(pgoff_t, pgoff)
823 __field(xfs_ino_t, ino) \ 831 __field(loff_t, size)
824 __field(pgoff_t, pgoff) \ 832 __field(unsigned long, offset)
825 __field(loff_t, size) \ 833 __field(int, delalloc)
826 __field(unsigned long, offset) \ 834 __field(int, unmapped)
827 __field(int, delalloc) \ 835 __field(int, unwritten)
828 __field(int, unmapped) \ 836 ),
829 __field(int, unwritten) \ 837 TP_fast_assign(
830 ), \ 838 int delalloc = -1, unmapped = -1, unwritten = -1;
831 TP_fast_assign( \ 839
832 int delalloc = -1, unmapped = -1, unwritten = -1; \ 840 if (page_has_buffers(page))
833 \ 841 xfs_count_page_state(page, &delalloc,
834 if (page_has_buffers(page)) \ 842 &unmapped, &unwritten);
835 xfs_count_page_state(page, &delalloc, \ 843 __entry->dev = inode->i_sb->s_dev;
836 &unmapped, &unwritten); \ 844 __entry->ino = XFS_I(inode)->i_ino;
837 __entry->dev = inode->i_sb->s_dev; \ 845 __entry->pgoff = page_offset(page);
838 __entry->ino = XFS_I(inode)->i_ino; \ 846 __entry->size = i_size_read(inode);
839 __entry->pgoff = page_offset(page); \ 847 __entry->offset = off;
840 __entry->size = i_size_read(inode); \ 848 __entry->delalloc = delalloc;
841 __entry->offset = off; \ 849 __entry->unmapped = unmapped;
842 __entry->delalloc = delalloc; \ 850 __entry->unwritten = unwritten;
843 __entry->unmapped = unmapped; \ 851 ),
844 __entry->unwritten = unwritten; \ 852 TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx "
845 ), \ 853 "delalloc %d unmapped %d unwritten %d",
846 TP_printk("dev %d:%d ino 0x%llx pgoff 0x%lx size 0x%llx offset %lx " \ 854 MAJOR(__entry->dev), MINOR(__entry->dev),
847 "delalloc %d unmapped %d unwritten %d", \ 855 __entry->ino,
848 MAJOR(__entry->dev), MINOR(__entry->dev), \ 856 __entry->pgoff,
849 __entry->ino, \ 857 __entry->size,
850 __entry->pgoff, \ 858 __entry->offset,
851 __entry->size, \ 859 __entry->delalloc,
852 __entry->offset, \ 860 __entry->unmapped,
853 __entry->delalloc, \ 861 __entry->unwritten)
854 __entry->unmapped, \
855 __entry->unwritten) \
856) 862)
863
864#define DEFINE_PAGE_EVENT(name) \
865DEFINE_EVENT(xfs_page_class, name, \
866 TP_PROTO(struct inode *inode, struct page *page, unsigned long off), \
867 TP_ARGS(inode, page, off))
857DEFINE_PAGE_EVENT(xfs_writepage); 868DEFINE_PAGE_EVENT(xfs_writepage);
858DEFINE_PAGE_EVENT(xfs_releasepage); 869DEFINE_PAGE_EVENT(xfs_releasepage);
859DEFINE_PAGE_EVENT(xfs_invalidatepage); 870DEFINE_PAGE_EVENT(xfs_invalidatepage);
860 871
861#define DEFINE_IOMAP_EVENT(name) \ 872DECLARE_EVENT_CLASS(xfs_iomap_class,
862TRACE_EVENT(name, \ 873 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,
863 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \ 874 int flags, struct xfs_bmbt_irec *irec),
864 int flags, struct xfs_bmbt_irec *irec), \ 875 TP_ARGS(ip, offset, count, flags, irec),
865 TP_ARGS(ip, offset, count, flags, irec), \ 876 TP_STRUCT__entry(
866 TP_STRUCT__entry( \ 877 __field(dev_t, dev)
867 __field(dev_t, dev) \ 878 __field(xfs_ino_t, ino)
868 __field(xfs_ino_t, ino) \ 879 __field(loff_t, size)
869 __field(loff_t, size) \ 880 __field(loff_t, new_size)
870 __field(loff_t, new_size) \ 881 __field(loff_t, offset)
871 __field(loff_t, offset) \ 882 __field(size_t, count)
872 __field(size_t, count) \ 883 __field(int, flags)
873 __field(int, flags) \ 884 __field(xfs_fileoff_t, startoff)
874 __field(xfs_fileoff_t, startoff) \ 885 __field(xfs_fsblock_t, startblock)
875 __field(xfs_fsblock_t, startblock) \ 886 __field(xfs_filblks_t, blockcount)
876 __field(xfs_filblks_t, blockcount) \ 887 ),
877 ), \ 888 TP_fast_assign(
878 TP_fast_assign( \ 889 __entry->dev = VFS_I(ip)->i_sb->s_dev;
879 __entry->dev = VFS_I(ip)->i_sb->s_dev; \ 890 __entry->ino = ip->i_ino;
880 __entry->ino = ip->i_ino; \ 891 __entry->size = ip->i_d.di_size;
881 __entry->size = ip->i_d.di_size; \ 892 __entry->new_size = ip->i_new_size;
882 __entry->new_size = ip->i_new_size; \ 893 __entry->offset = offset;
883 __entry->offset = offset; \ 894 __entry->count = count;
884 __entry->count = count; \ 895 __entry->flags = flags;
885 __entry->flags = flags; \ 896 __entry->startoff = irec ? irec->br_startoff : 0;
886 __entry->startoff = irec ? irec->br_startoff : 0; \ 897 __entry->startblock = irec ? irec->br_startblock : 0;
887 __entry->startblock = irec ? irec->br_startblock : 0; \ 898 __entry->blockcount = irec ? irec->br_blockcount : 0;
888 __entry->blockcount = irec ? irec->br_blockcount : 0; \ 899 ),
889 ), \ 900 TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx "
890 TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " \ 901 "offset 0x%llx count %zd flags %s "
891 "offset 0x%llx count %zd flags %s " \ 902 "startoff 0x%llx startblock %lld blockcount 0x%llx",
892 "startoff 0x%llx startblock %lld blockcount 0x%llx", \ 903 MAJOR(__entry->dev), MINOR(__entry->dev),
893 MAJOR(__entry->dev), MINOR(__entry->dev), \ 904 __entry->ino,
894 __entry->ino, \ 905 __entry->size,
895 __entry->size, \ 906 __entry->new_size,
896 __entry->new_size, \ 907 __entry->offset,
897 __entry->offset, \ 908 __entry->count,
898 __entry->count, \ 909 __print_flags(__entry->flags, "|", BMAPI_FLAGS),
899 __print_flags(__entry->flags, "|", BMAPI_FLAGS), \ 910 __entry->startoff,
900 __entry->startoff, \ 911 (__int64_t)__entry->startblock,
901 (__int64_t)__entry->startblock, \ 912 __entry->blockcount)
902 __entry->blockcount) \
903) 913)
914
915#define DEFINE_IOMAP_EVENT(name) \
916DEFINE_EVENT(xfs_iomap_class, name, \
917 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, \
918 int flags, struct xfs_bmbt_irec *irec), \
919 TP_ARGS(ip, offset, count, flags, irec))
904DEFINE_IOMAP_EVENT(xfs_iomap_enter); 920DEFINE_IOMAP_EVENT(xfs_iomap_enter);
905DEFINE_IOMAP_EVENT(xfs_iomap_found); 921DEFINE_IOMAP_EVENT(xfs_iomap_found);
906DEFINE_IOMAP_EVENT(xfs_iomap_alloc); 922DEFINE_IOMAP_EVENT(xfs_iomap_alloc);
907 923
908#define DEFINE_SIMPLE_IO_EVENT(name) \ 924DECLARE_EVENT_CLASS(xfs_simple_io_class,
909TRACE_EVENT(name, \ 925 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count),
910 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), \ 926 TP_ARGS(ip, offset, count),
911 TP_ARGS(ip, offset, count), \ 927 TP_STRUCT__entry(
912 TP_STRUCT__entry( \ 928 __field(dev_t, dev)
913 __field(dev_t, dev) \ 929 __field(xfs_ino_t, ino)
914 __field(xfs_ino_t, ino) \ 930 __field(loff_t, size)
915 __field(loff_t, size) \ 931 __field(loff_t, new_size)
916 __field(loff_t, new_size) \ 932 __field(loff_t, offset)
917 __field(loff_t, offset) \ 933 __field(size_t, count)
918 __field(size_t, count) \ 934 ),
919 ), \ 935 TP_fast_assign(
920 TP_fast_assign( \ 936 __entry->dev = VFS_I(ip)->i_sb->s_dev;
921 __entry->dev = VFS_I(ip)->i_sb->s_dev; \ 937 __entry->ino = ip->i_ino;
922 __entry->ino = ip->i_ino; \ 938 __entry->size = ip->i_d.di_size;
923 __entry->size = ip->i_d.di_size; \ 939 __entry->new_size = ip->i_new_size;
924 __entry->new_size = ip->i_new_size; \ 940 __entry->offset = offset;
925 __entry->offset = offset; \ 941 __entry->count = count;
926 __entry->count = count; \ 942 ),
927 ), \ 943 TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx "
928 TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " \ 944 "offset 0x%llx count %zd",
929 "offset 0x%llx count %zd", \ 945 MAJOR(__entry->dev), MINOR(__entry->dev),
930 MAJOR(__entry->dev), MINOR(__entry->dev), \ 946 __entry->ino,
931 __entry->ino, \ 947 __entry->size,
932 __entry->size, \ 948 __entry->new_size,
933 __entry->new_size, \ 949 __entry->offset,
934 __entry->offset, \ 950 __entry->count)
935 __entry->count) \
936); 951);
952
953#define DEFINE_SIMPLE_IO_EVENT(name) \
954DEFINE_EVENT(xfs_simple_io_class, name, \
955 TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), \
956 TP_ARGS(ip, offset, count))
937DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc); 957DEFINE_SIMPLE_IO_EVENT(xfs_delalloc_enospc);
938DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert); 958DEFINE_SIMPLE_IO_EVENT(xfs_unwritten_convert);
939 959
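
The four conversions above are one mechanical transform: the TRACE_EVENT body
moves into a DECLARE_EVENT_CLASS that is emitted once, and each tracepoint
becomes a cheap DEFINE_EVENT stamp, shrinking the generated tracing code. The
bare shape of the idiom, with made-up names:

	DECLARE_EVENT_CLASS(demo_class,
		TP_PROTO(int value),
		TP_ARGS(value),
		TP_STRUCT__entry(
			__field(int, value)
		),
		TP_fast_assign(
			__entry->value = value;
		),
		TP_printk("value %d", __entry->value)
	);

	#define DEFINE_DEMO_EVENT(name) \
	DEFINE_EVENT(demo_class, name, \
		TP_PROTO(int value), \
		TP_ARGS(value))
	DEFINE_DEMO_EVENT(demo_event_start);
	DEFINE_DEMO_EVENT(demo_event_done);
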
diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c
index 38e764146644..2d8b7bc792c9 100644
--- a/fs/xfs/quota/xfs_qm.c
+++ b/fs/xfs/quota/xfs_qm.c
@@ -249,8 +249,10 @@ xfs_qm_hold_quotafs_ref(
249 249
250 if (!xfs_Gqm) { 250 if (!xfs_Gqm) {
251 xfs_Gqm = xfs_Gqm_init(); 251 xfs_Gqm = xfs_Gqm_init();
252 if (!xfs_Gqm) 252 if (!xfs_Gqm) {
253 mutex_unlock(&xfs_Gqm_lock);
253 return ENOMEM; 254 return ENOMEM;
255 }
254 } 256 }
255 257
256 /* 258 /*
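
The xfs_qm change above is the classic lock-leak repair: the ENOMEM early
return inside the critical section previously left xfs_Gqm_lock held. The bug
class in a self-contained userspace sketch, all names illustrative:

	#include <errno.h>
	#include <pthread.h>
	#include <stddef.h>

	static pthread_mutex_t g_lock = PTHREAD_MUTEX_INITIALIZER;
	static void *g_state;

	static int hold_ref(void *(*init_once)(void))
	{
		pthread_mutex_lock(&g_lock);
		if (!g_state) {
			g_state = init_once();
			if (!g_state) {
				/* the previously missing unlock */
				pthread_mutex_unlock(&g_lock);
				return ENOMEM;
			}
		}
		/* ... take the reference while still holding the lock ... */
		pthread_mutex_unlock(&g_lock);
		return 0;
	}
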
diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h
index 401f364ad36c..4917d4eed4ed 100644
--- a/fs/xfs/xfs_ag.h
+++ b/fs/xfs/xfs_ag.h
@@ -227,7 +227,6 @@ typedef struct xfs_perag {
227 227
228 atomic_t pagf_fstrms; /* # of filestreams active in this AG */ 228 atomic_t pagf_fstrms; /* # of filestreams active in this AG */
229 229
230 int pag_ici_init; /* incore inode cache initialised */
231 rwlock_t pag_ici_lock; /* incore inode lock */ 230 rwlock_t pag_ici_lock; /* incore inode lock */
232 struct radix_tree_root pag_ici_root; /* incore inode cache root */ 231 struct radix_tree_root pag_ici_root; /* incore inode cache root */
233 int pag_ici_reclaimable; /* reclaimable inodes */ 232 int pag_ici_reclaimable; /* reclaimable inodes */
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c
index 6845db90818f..75df75f43d48 100644
--- a/fs/xfs/xfs_iget.c
+++ b/fs/xfs/xfs_iget.c
@@ -382,9 +382,6 @@ xfs_iget(
382 382
383 /* get the perag structure and ensure that it's inode capable */ 383 /* get the perag structure and ensure that it's inode capable */
384 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino)); 384 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ino));
385 if (!pag->pagi_inodeok)
386 return EINVAL;
387 ASSERT(pag->pag_ici_init);
388 agino = XFS_INO_TO_AGINO(mp, ino); 385 agino = XFS_INO_TO_AGINO(mp, ino);
389 386
390again: 387again:
@@ -744,30 +741,24 @@ xfs_ilock_demote(
744} 741}
745 742
746#ifdef DEBUG 743#ifdef DEBUG
747/*
748 * Debug-only routine, without additional rw_semaphore APIs, we can
749 * now only answer requests regarding whether we hold the lock for write
750 * (reader state is outside our visibility, we only track writer state).
751 *
752 * Note: this means !xfs_isilocked would give false positives, so don't do that.
753 */
754int 744int
755xfs_isilocked( 745xfs_isilocked(
756 xfs_inode_t *ip, 746 xfs_inode_t *ip,
757 uint lock_flags) 747 uint lock_flags)
758{ 748{
759 if ((lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) == 749 if (lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) {
760 XFS_ILOCK_EXCL) { 750 if (!(lock_flags & XFS_ILOCK_SHARED))
761 if (!ip->i_lock.mr_writer) 751 return !!ip->i_lock.mr_writer;
762 return 0; 752 return rwsem_is_locked(&ip->i_lock.mr_lock);
763 } 753 }
764 754
765 if ((lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) == 755 if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) {
766 XFS_IOLOCK_EXCL) { 756 if (!(lock_flags & XFS_IOLOCK_SHARED))
767 if (!ip->i_iolock.mr_writer) 757 return !!ip->i_iolock.mr_writer;
768 return 0; 758 return rwsem_is_locked(&ip->i_iolock.mr_lock);
769 } 759 }
770 760
771 return 1; 761 ASSERT(0);
762 return 0;
772} 763}
773#endif 764#endif
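
The rewritten xfs_isilocked() can finally answer shared queries: an
exclusive-only request still consults the mrlock writer flag, while a
shared-or-exclusive request falls back to rwsem_is_locked(), which is true
for any holder. The decision logic in a standalone toy form (the struct is a
stand-in, not the kernel's mrlock_t):

	#include <stdbool.h>

	struct mrlock_toy {
		bool	writer;		/* stand-in for mr_writer */
		int	readers;	/* reader state the old code never saw */
	};

	static bool toy_rwsem_is_locked(const struct mrlock_toy *l)
	{
		return l->writer || l->readers > 0;	/* any holder counts */
	}

	/* EXCL-only wants a writer; if SHARED is acceptable, any holder will do. */
	static bool toy_isilocked(const struct mrlock_toy *l, bool shared_ok)
	{
		if (!shared_ok)
			return l->writer;
		return toy_rwsem_is_locked(l);
	}
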
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 8cd6e8d8fe9c..d53c39de7d05 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -1940,10 +1940,10 @@ xfs_ifree_cluster(
1940 int blks_per_cluster; 1940 int blks_per_cluster;
1941 int nbufs; 1941 int nbufs;
1942 int ninodes; 1942 int ninodes;
1943 int i, j, found, pre_flushed; 1943 int i, j;
1944 xfs_daddr_t blkno; 1944 xfs_daddr_t blkno;
1945 xfs_buf_t *bp; 1945 xfs_buf_t *bp;
1946 xfs_inode_t *ip, **ip_found; 1946 xfs_inode_t *ip;
1947 xfs_inode_log_item_t *iip; 1947 xfs_inode_log_item_t *iip;
1948 xfs_log_item_t *lip; 1948 xfs_log_item_t *lip;
1949 struct xfs_perag *pag; 1949 struct xfs_perag *pag;
@@ -1960,114 +1960,97 @@ xfs_ifree_cluster(
1960 nbufs = XFS_IALLOC_BLOCKS(mp) / blks_per_cluster; 1960 nbufs = XFS_IALLOC_BLOCKS(mp) / blks_per_cluster;
1961 } 1961 }
1962 1962
1963 ip_found = kmem_alloc(ninodes * sizeof(xfs_inode_t *), KM_NOFS);
1964
1965 for (j = 0; j < nbufs; j++, inum += ninodes) { 1963 for (j = 0; j < nbufs; j++, inum += ninodes) {
1964 int found = 0;
1965
1966 blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum), 1966 blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum),
1967 XFS_INO_TO_AGBNO(mp, inum)); 1967 XFS_INO_TO_AGBNO(mp, inum));
1968 1968
1969 /*
1970 * We obtain and lock the backing buffer first in the process
1971 * here, as we have to ensure that any dirty inode that we
1972 * can't get the flush lock on is attached to the buffer.
1973 * If we scan the in-memory inodes first, then buffer IO can
1974 * complete before we get a lock on it, and hence we may fail
1975 * to mark all the active inodes on the buffer stale.
1976 */
1977 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno,
1978 mp->m_bsize * blks_per_cluster,
1979 XBF_LOCK);
1980
1981 /*
1982 * Walk the inodes already attached to the buffer and mark them
1983 * stale. These will all have the flush locks held, so an
1984 * in-memory inode walk can't lock them.
1985 */
1986 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *);
1987 while (lip) {
1988 if (lip->li_type == XFS_LI_INODE) {
1989 iip = (xfs_inode_log_item_t *)lip;
1990 ASSERT(iip->ili_logged == 1);
1991 lip->li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*)) xfs_istale_done;
1992 xfs_trans_ail_copy_lsn(mp->m_ail,
1993 &iip->ili_flush_lsn,
1994 &iip->ili_item.li_lsn);
1995 xfs_iflags_set(iip->ili_inode, XFS_ISTALE);
1996 found++;
1997 }
1998 lip = lip->li_bio_list;
1999 }
1969 2000
1970 /* 2001 /*
1971 * Look for each inode in memory and attempt to lock it, 2002 * For each inode in memory attempt to add it to the inode
 1972 * we can be racing with flush and tail pushing here; 2003 * buffer and set it up for being staled on buffer IO
1973 * any inode we get the locks on, add to an array of 2004 * completion. This is safe as we've locked out tail pushing
1974 * inode items to process later. 2005 * and flushing by locking the buffer.
1975 * 2006 *
 1976 * If we get the buffer lock, we could beat a flush 2007 * We have already marked every inode that was part of a
1977 * or tail pushing thread to the lock here, in which 2008 * transaction stale above, which means there is no point in
1978 * case they will go looking for the inode buffer 2009 * even trying to lock them.
1979 * and fail, we need some other form of interlock
1980 * here.
1981 */ 2010 */
1982 found = 0;
1983 for (i = 0; i < ninodes; i++) { 2011 for (i = 0; i < ninodes; i++) {
1984 read_lock(&pag->pag_ici_lock); 2012 read_lock(&pag->pag_ici_lock);
1985 ip = radix_tree_lookup(&pag->pag_ici_root, 2013 ip = radix_tree_lookup(&pag->pag_ici_root,
1986 XFS_INO_TO_AGINO(mp, (inum + i))); 2014 XFS_INO_TO_AGINO(mp, (inum + i)));
1987 2015
1988 /* Inode not in memory or we found it already, 2016 /* Inode not in memory or stale, nothing to do */
1989 * nothing to do
1990 */
1991 if (!ip || xfs_iflags_test(ip, XFS_ISTALE)) { 2017 if (!ip || xfs_iflags_test(ip, XFS_ISTALE)) {
1992 read_unlock(&pag->pag_ici_lock); 2018 read_unlock(&pag->pag_ici_lock);
1993 continue; 2019 continue;
1994 } 2020 }
1995 2021
1996 if (xfs_inode_clean(ip)) { 2022 /* don't try to lock/unlock the current inode */
1997 read_unlock(&pag->pag_ici_lock); 2023 if (ip != free_ip &&
1998 continue; 2024 !xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
1999 }
2000
2001 /* If we can get the locks then add it to the
2002 * list, otherwise by the time we get the bp lock
2003 * below it will already be attached to the
2004 * inode buffer.
2005 */
2006
2007 /* This inode will already be locked - by us, lets
2008 * keep it that way.
2009 */
2010
2011 if (ip == free_ip) {
2012 if (xfs_iflock_nowait(ip)) {
2013 xfs_iflags_set(ip, XFS_ISTALE);
2014 if (xfs_inode_clean(ip)) {
2015 xfs_ifunlock(ip);
2016 } else {
2017 ip_found[found++] = ip;
2018 }
2019 }
2020 read_unlock(&pag->pag_ici_lock); 2025 read_unlock(&pag->pag_ici_lock);
2021 continue; 2026 continue;
2022 } 2027 }
2028 read_unlock(&pag->pag_ici_lock);
2023 2029
2024 if (xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) { 2030 if (!xfs_iflock_nowait(ip)) {
2025 if (xfs_iflock_nowait(ip)) { 2031 if (ip != free_ip)
2026 xfs_iflags_set(ip, XFS_ISTALE);
2027
2028 if (xfs_inode_clean(ip)) {
2029 xfs_ifunlock(ip);
2030 xfs_iunlock(ip, XFS_ILOCK_EXCL);
2031 } else {
2032 ip_found[found++] = ip;
2033 }
2034 } else {
2035 xfs_iunlock(ip, XFS_ILOCK_EXCL); 2032 xfs_iunlock(ip, XFS_ILOCK_EXCL);
2036 } 2033 continue;
2037 } 2034 }
2038 read_unlock(&pag->pag_ici_lock);
2039 }
2040 2035
2041 bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, 2036 xfs_iflags_set(ip, XFS_ISTALE);
2042 mp->m_bsize * blks_per_cluster, 2037 if (xfs_inode_clean(ip)) {
2043 XBF_LOCK); 2038 ASSERT(ip != free_ip);
2044 2039 xfs_ifunlock(ip);
2045 pre_flushed = 0; 2040 xfs_iunlock(ip, XFS_ILOCK_EXCL);
2046 lip = XFS_BUF_FSPRIVATE(bp, xfs_log_item_t *); 2041 continue;
2047 while (lip) {
2048 if (lip->li_type == XFS_LI_INODE) {
2049 iip = (xfs_inode_log_item_t *)lip;
2050 ASSERT(iip->ili_logged == 1);
2051 lip->li_cb = (void(*)(xfs_buf_t*,xfs_log_item_t*)) xfs_istale_done;
2052 xfs_trans_ail_copy_lsn(mp->m_ail,
2053 &iip->ili_flush_lsn,
2054 &iip->ili_item.li_lsn);
2055 xfs_iflags_set(iip->ili_inode, XFS_ISTALE);
2056 pre_flushed++;
2057 } 2042 }
2058 lip = lip->li_bio_list;
2059 }
2060 2043
2061 for (i = 0; i < found; i++) {
2062 ip = ip_found[i];
2063 iip = ip->i_itemp; 2044 iip = ip->i_itemp;
2064
2065 if (!iip) { 2045 if (!iip) {
2046 /* inode with unlogged changes only */
2047 ASSERT(ip != free_ip);
2066 ip->i_update_core = 0; 2048 ip->i_update_core = 0;
2067 xfs_ifunlock(ip); 2049 xfs_ifunlock(ip);
2068 xfs_iunlock(ip, XFS_ILOCK_EXCL); 2050 xfs_iunlock(ip, XFS_ILOCK_EXCL);
2069 continue; 2051 continue;
2070 } 2052 }
2053 found++;
2071 2054
2072 iip->ili_last_fields = iip->ili_format.ilf_fields; 2055 iip->ili_last_fields = iip->ili_format.ilf_fields;
2073 iip->ili_format.ilf_fields = 0; 2056 iip->ili_format.ilf_fields = 0;
@@ -2078,17 +2061,16 @@ xfs_ifree_cluster(
2078 xfs_buf_attach_iodone(bp, 2061 xfs_buf_attach_iodone(bp,
2079 (void(*)(xfs_buf_t*,xfs_log_item_t*)) 2062 (void(*)(xfs_buf_t*,xfs_log_item_t*))
2080 xfs_istale_done, (xfs_log_item_t *)iip); 2063 xfs_istale_done, (xfs_log_item_t *)iip);
2081 if (ip != free_ip) { 2064
2065 if (ip != free_ip)
2082 xfs_iunlock(ip, XFS_ILOCK_EXCL); 2066 xfs_iunlock(ip, XFS_ILOCK_EXCL);
2083 }
2084 } 2067 }
2085 2068
2086 if (found || pre_flushed) 2069 if (found)
2087 xfs_trans_stale_inode_buf(tp, bp); 2070 xfs_trans_stale_inode_buf(tp, bp);
2088 xfs_trans_binval(tp, bp); 2071 xfs_trans_binval(tp, bp);
2089 } 2072 }
2090 2073
2091 kmem_free(ip_found);
2092 xfs_perag_put(pag); 2074 xfs_perag_put(pag);
2093} 2075}
2094 2076
@@ -2649,8 +2631,6 @@ xfs_iflush_cluster(
2649 int i; 2631 int i;
2650 2632
2651 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino)); 2633 pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
2652 ASSERT(pag->pagi_inodeok);
2653 ASSERT(pag->pag_ici_init);
2654 2634
2655 inodes_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog; 2635 inodes_per_cluster = XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog;
2656 ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *); 2636 ilist_size = inodes_per_cluster * sizeof(xfs_inode_t *);
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 14a69aec2c0b..ed0684cc50ee 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -132,15 +132,10 @@ xlog_align(
132 int nbblks, 132 int nbblks,
133 xfs_buf_t *bp) 133 xfs_buf_t *bp)
134{ 134{
135 xfs_daddr_t offset; 135 xfs_daddr_t offset = blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1);
136 xfs_caddr_t ptr;
137 136
138 offset = blk_no & ((xfs_daddr_t) log->l_sectBBsize - 1); 137 ASSERT(BBTOB(offset + nbblks) <= XFS_BUF_SIZE(bp));
139 ptr = XFS_BUF_PTR(bp) + BBTOB(offset); 138 return XFS_BUF_PTR(bp) + BBTOB(offset);
140
141 ASSERT(ptr + BBTOB(nbblks) <= XFS_BUF_PTR(bp) + XFS_BUF_SIZE(bp));
142
143 return ptr;
144} 139}
145 140
146 141
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index d7bf38c8cd1c..d59f4e8bedcf 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -268,10 +268,10 @@ xfs_sb_validate_fsb_count(
268 268
269#if XFS_BIG_BLKNOS /* Limited by ULONG_MAX of page cache index */ 269#if XFS_BIG_BLKNOS /* Limited by ULONG_MAX of page cache index */
270 if (nblocks >> (PAGE_CACHE_SHIFT - sbp->sb_blocklog) > ULONG_MAX) 270 if (nblocks >> (PAGE_CACHE_SHIFT - sbp->sb_blocklog) > ULONG_MAX)
271 return E2BIG; 271 return EFBIG;
272#else /* Limited by UINT_MAX of sectors */ 272#else /* Limited by UINT_MAX of sectors */
273 if (nblocks << (sbp->sb_blocklog - BBSHIFT) > UINT_MAX) 273 if (nblocks << (sbp->sb_blocklog - BBSHIFT) > UINT_MAX)
274 return E2BIG; 274 return EFBIG;
275#endif 275#endif
276 return 0; 276 return 0;
277} 277}
@@ -393,7 +393,7 @@ xfs_mount_validate_sb(
393 xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) { 393 xfs_sb_validate_fsb_count(sbp, sbp->sb_rblocks)) {
394 xfs_fs_mount_cmn_err(flags, 394 xfs_fs_mount_cmn_err(flags,
395 "file system too large to be mounted on this system."); 395 "file system too large to be mounted on this system.");
396 return XFS_ERROR(E2BIG); 396 return XFS_ERROR(EFBIG);
397 } 397 }
398 398
399 if (unlikely(sbp->sb_inprogress)) { 399 if (unlikely(sbp->sb_inprogress)) {
@@ -413,17 +413,6 @@ xfs_mount_validate_sb(
413 return 0; 413 return 0;
414} 414}
415 415
416STATIC void
417xfs_initialize_perag_icache(
418 xfs_perag_t *pag)
419{
420 if (!pag->pag_ici_init) {
421 rwlock_init(&pag->pag_ici_lock);
422 INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
423 pag->pag_ici_init = 1;
424 }
425}
426
427int 416int
428xfs_initialize_perag( 417xfs_initialize_perag(
429 xfs_mount_t *mp, 418 xfs_mount_t *mp,
@@ -436,13 +425,8 @@ xfs_initialize_perag(
436 xfs_agino_t agino; 425 xfs_agino_t agino;
437 xfs_ino_t ino; 426 xfs_ino_t ino;
438 xfs_sb_t *sbp = &mp->m_sb; 427 xfs_sb_t *sbp = &mp->m_sb;
439 xfs_ino_t max_inum = XFS_MAXINUMBER_32;
440 int error = -ENOMEM; 428 int error = -ENOMEM;
441 429
442 /* Check to see if the filesystem can overflow 32 bit inodes */
443 agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0);
444 ino = XFS_AGINO_TO_INO(mp, agcount - 1, agino);
445
446 /* 430 /*
447 * Walk the current per-ag tree so we don't try to initialise AGs 431 * Walk the current per-ag tree so we don't try to initialise AGs
448 * that already exist (growfs case). Allocate and insert all the 432 * that already exist (growfs case). Allocate and insert all the
@@ -456,11 +440,18 @@ xfs_initialize_perag(
456 } 440 }
457 if (!first_initialised) 441 if (!first_initialised)
458 first_initialised = index; 442 first_initialised = index;
443
459 pag = kmem_zalloc(sizeof(*pag), KM_MAYFAIL); 444 pag = kmem_zalloc(sizeof(*pag), KM_MAYFAIL);
460 if (!pag) 445 if (!pag)
461 goto out_unwind; 446 goto out_unwind;
447 pag->pag_agno = index;
448 pag->pag_mount = mp;
449 rwlock_init(&pag->pag_ici_lock);
450 INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
451
462 if (radix_tree_preload(GFP_NOFS)) 452 if (radix_tree_preload(GFP_NOFS))
463 goto out_unwind; 453 goto out_unwind;
454
464 spin_lock(&mp->m_perag_lock); 455 spin_lock(&mp->m_perag_lock);
465 if (radix_tree_insert(&mp->m_perag_tree, index, pag)) { 456 if (radix_tree_insert(&mp->m_perag_tree, index, pag)) {
466 BUG(); 457 BUG();
@@ -469,25 +460,26 @@ xfs_initialize_perag(
469 error = -EEXIST; 460 error = -EEXIST;
470 goto out_unwind; 461 goto out_unwind;
471 } 462 }
472 pag->pag_agno = index;
473 pag->pag_mount = mp;
474 spin_unlock(&mp->m_perag_lock); 463 spin_unlock(&mp->m_perag_lock);
475 radix_tree_preload_end(); 464 radix_tree_preload_end();
476 } 465 }
477 466
478 /* Clear the mount flag if no inode can overflow 32 bits 467 /*
479 * on this filesystem, or if specifically requested.. 468 * If we mount with the inode64 option, or no inode overflows
469 * the legacy 32-bit address space clear the inode32 option.
480 */ 470 */
481 if ((mp->m_flags & XFS_MOUNT_SMALL_INUMS) && ino > max_inum) { 471 agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0);
472 ino = XFS_AGINO_TO_INO(mp, agcount - 1, agino);
473
474 if ((mp->m_flags & XFS_MOUNT_SMALL_INUMS) && ino > XFS_MAXINUMBER_32)
482 mp->m_flags |= XFS_MOUNT_32BITINODES; 475 mp->m_flags |= XFS_MOUNT_32BITINODES;
483 } else { 476 else
484 mp->m_flags &= ~XFS_MOUNT_32BITINODES; 477 mp->m_flags &= ~XFS_MOUNT_32BITINODES;
485 }
486 478
487 /* If we can overflow then setup the ag headers accordingly */
488 if (mp->m_flags & XFS_MOUNT_32BITINODES) { 479 if (mp->m_flags & XFS_MOUNT_32BITINODES) {
489 /* Calculate how much should be reserved for inodes to 480 /*
490 * meet the max inode percentage. 481 * Calculate how much should be reserved for inodes to meet
482 * the max inode percentage.
491 */ 483 */
492 if (mp->m_maxicount) { 484 if (mp->m_maxicount) {
493 __uint64_t icount; 485 __uint64_t icount;
@@ -500,30 +492,28 @@ xfs_initialize_perag(
500 } else { 492 } else {
501 max_metadata = agcount; 493 max_metadata = agcount;
502 } 494 }
495
503 for (index = 0; index < agcount; index++) { 496 for (index = 0; index < agcount; index++) {
504 ino = XFS_AGINO_TO_INO(mp, index, agino); 497 ino = XFS_AGINO_TO_INO(mp, index, agino);
505 if (ino > max_inum) { 498 if (ino > XFS_MAXINUMBER_32) {
506 index++; 499 index++;
507 break; 500 break;
508 } 501 }
509 502
510 /* This ag is preferred for inodes */
511 pag = xfs_perag_get(mp, index); 503 pag = xfs_perag_get(mp, index);
512 pag->pagi_inodeok = 1; 504 pag->pagi_inodeok = 1;
513 if (index < max_metadata) 505 if (index < max_metadata)
514 pag->pagf_metadata = 1; 506 pag->pagf_metadata = 1;
515 xfs_initialize_perag_icache(pag);
516 xfs_perag_put(pag); 507 xfs_perag_put(pag);
517 } 508 }
518 } else { 509 } else {
519 /* Setup default behavior for smaller filesystems */
520 for (index = 0; index < agcount; index++) { 510 for (index = 0; index < agcount; index++) {
521 pag = xfs_perag_get(mp, index); 511 pag = xfs_perag_get(mp, index);
522 pag->pagi_inodeok = 1; 512 pag->pagi_inodeok = 1;
523 xfs_initialize_perag_icache(pag);
524 xfs_perag_put(pag); 513 xfs_perag_put(pag);
525 } 514 }
526 } 515 }
516
527 if (maxagi) 517 if (maxagi)
528 *maxagi = index; 518 *maxagi = index;
529 return 0; 519 return 0;
@@ -1009,7 +999,7 @@ xfs_check_sizes(xfs_mount_t *mp)
1009 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks); 999 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
1010 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) { 1000 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) {
1011 cmn_err(CE_WARN, "XFS: size check 1 failed"); 1001 cmn_err(CE_WARN, "XFS: size check 1 failed");
1012 return XFS_ERROR(E2BIG); 1002 return XFS_ERROR(EFBIG);
1013 } 1003 }
1014 error = xfs_read_buf(mp, mp->m_ddev_targp, 1004 error = xfs_read_buf(mp, mp->m_ddev_targp,
1015 d - XFS_FSS_TO_BB(mp, 1), 1005 d - XFS_FSS_TO_BB(mp, 1),
@@ -1019,7 +1009,7 @@ xfs_check_sizes(xfs_mount_t *mp)
1019 } else { 1009 } else {
1020 cmn_err(CE_WARN, "XFS: size check 2 failed"); 1010 cmn_err(CE_WARN, "XFS: size check 2 failed");
1021 if (error == ENOSPC) 1011 if (error == ENOSPC)
1022 error = XFS_ERROR(E2BIG); 1012 error = XFS_ERROR(EFBIG);
1023 return error; 1013 return error;
1024 } 1014 }
1025 1015
@@ -1027,7 +1017,7 @@ xfs_check_sizes(xfs_mount_t *mp)
1027 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks); 1017 d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks);
1028 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) { 1018 if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) {
1029 cmn_err(CE_WARN, "XFS: size check 3 failed"); 1019 cmn_err(CE_WARN, "XFS: size check 3 failed");
1030 return XFS_ERROR(E2BIG); 1020 return XFS_ERROR(EFBIG);
1031 } 1021 }
1032 error = xfs_read_buf(mp, mp->m_logdev_targp, 1022 error = xfs_read_buf(mp, mp->m_logdev_targp,
1033 d - XFS_FSB_TO_BB(mp, 1), 1023 d - XFS_FSB_TO_BB(mp, 1),
@@ -1037,7 +1027,7 @@ xfs_check_sizes(xfs_mount_t *mp)
1037 } else { 1027 } else {
1038 cmn_err(CE_WARN, "XFS: size check 3 failed"); 1028 cmn_err(CE_WARN, "XFS: size check 3 failed");
1039 if (error == ENOSPC) 1029 if (error == ENOSPC)
1040 error = XFS_ERROR(E2BIG); 1030 error = XFS_ERROR(EFBIG);
1041 return error; 1031 return error;
1042 } 1032 }
1043 } 1033 }
@@ -1254,7 +1244,7 @@ xfs_mountfs(
1254 * Allocate and initialize the per-ag data. 1244 * Allocate and initialize the per-ag data.
1255 */ 1245 */
1256 spin_lock_init(&mp->m_perag_lock); 1246 spin_lock_init(&mp->m_perag_lock);
1257 INIT_RADIX_TREE(&mp->m_perag_tree, GFP_NOFS); 1247 INIT_RADIX_TREE(&mp->m_perag_tree, GFP_ATOMIC);
1258 error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi); 1248 error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi);
1259 if (error) { 1249 if (error) {
1260 cmn_err(CE_WARN, "XFS: Failed per-ag init: %d", error); 1250 cmn_err(CE_WARN, "XFS: Failed per-ag init: %d", error);
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index 6be05f756d59..16445518506d 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -2247,7 +2247,7 @@ xfs_rtmount_init(
2247 cmn_err(CE_WARN, "XFS: realtime mount -- %llu != %llu", 2247 cmn_err(CE_WARN, "XFS: realtime mount -- %llu != %llu",
2248 (unsigned long long) XFS_BB_TO_FSB(mp, d), 2248 (unsigned long long) XFS_BB_TO_FSB(mp, d),
2249 (unsigned long long) mp->m_sb.sb_rblocks); 2249 (unsigned long long) mp->m_sb.sb_rblocks);
2250 return XFS_ERROR(E2BIG); 2250 return XFS_ERROR(EFBIG);
2251 } 2251 }
2252 error = xfs_read_buf(mp, mp->m_rtdev_targp, 2252 error = xfs_read_buf(mp, mp->m_rtdev_targp,
2253 d - XFS_FSB_TO_BB(mp, 1), 2253 d - XFS_FSB_TO_BB(mp, 1),
@@ -2256,7 +2256,7 @@ xfs_rtmount_init(
2256 cmn_err(CE_WARN, 2256 cmn_err(CE_WARN,
2257 "XFS: realtime mount -- xfs_read_buf failed, returned %d", error); 2257 "XFS: realtime mount -- xfs_read_buf failed, returned %d", error);
2258 if (error == ENOSPC) 2258 if (error == ENOSPC)
2259 return XFS_ERROR(E2BIG); 2259 return XFS_ERROR(EFBIG);
2260 return error; 2260 return error;
2261 } 2261 }
2262 xfs_buf_relse(bp); 2262 xfs_buf_relse(bp);
diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h
index b2d67adb6a08..ff614c29b441 100644
--- a/fs/xfs/xfs_rtalloc.h
+++ b/fs/xfs/xfs_rtalloc.h
@@ -147,7 +147,16 @@ xfs_growfs_rt(
147# define xfs_rtfree_extent(t,b,l) (ENOSYS) 147# define xfs_rtfree_extent(t,b,l) (ENOSYS)
148# define xfs_rtpick_extent(m,t,l,rb) (ENOSYS) 148# define xfs_rtpick_extent(m,t,l,rb) (ENOSYS)
149# define xfs_growfs_rt(mp,in) (ENOSYS) 149# define xfs_growfs_rt(mp,in) (ENOSYS)
150# define xfs_rtmount_init(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS)) 150static inline int /* error */
151xfs_rtmount_init(
152 xfs_mount_t *mp) /* file system mount structure */
153{
154 if (mp->m_sb.sb_rblocks == 0)
155 return 0;
156
157 cmn_err(CE_WARN, "XFS: Not built with CONFIG_XFS_RT");
158 return ENOSYS;
159}
151# define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS)) 160# define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS))
152# define xfs_rtunmount_inodes(m) 161# define xfs_rtunmount_inodes(m)
153#endif /* CONFIG_XFS_RT */ 162#endif /* CONFIG_XFS_RT */
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index ce558efa2ea0..28547dfce037 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -48,134 +48,489 @@
48 48
49kmem_zone_t *xfs_trans_zone; 49kmem_zone_t *xfs_trans_zone;
50 50
51
51/* 52/*
52 * Reservation functions here avoid a huge stack in xfs_trans_init 53 * Various log reservation values.
53 * due to register overflow from temporaries in the calculations. 54 *
55 * These are based on the size of the file system block because that is what
56 * most transactions manipulate. Each adds in an additional 128 bytes per
57 * item logged to try to account for the overhead of the transaction mechanism.
58 *
59 * Note: Most of the reservations underestimate the number of allocation
60 * groups into which they could free extents in the xfs_bmap_finish() call.
61 * This is because the number in the worst case is quite high and quite
62 * unusual. In order to fix this we need to change xfs_bmap_finish() to free
63 * extents in only a single AG at a time. This will require changes to the
64 * EFI code as well, however, so that the EFI for the extents not freed is
65 * logged again in each transaction. See SGI PV #261917.
66 *
67 * Reservation functions here avoid a huge stack in xfs_trans_init due to
68 * register overflow from temporaries in the calculations.
69 */
70
71
72/*
73 * In a write transaction we can allocate a maximum of 2
74 * extents. This gives:
75 * the inode getting the new extents: inode size
76 * the inode's bmap btree: max depth * block size
77 * the agfs of the ags from which the extents are allocated: 2 * sector
78 * the superblock free block counter: sector size
79 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
80 * And the bmap_finish transaction can free bmap blocks in a join:
81 * the agfs of the ags containing the blocks: 2 * sector size
82 * the agfls of the ags containing the blocks: 2 * sector size
83 * the super block free block counter: sector size
84 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
54 */ 85 */
55STATIC uint 86STATIC uint
56xfs_calc_write_reservation(xfs_mount_t *mp) 87xfs_calc_write_reservation(
88 struct xfs_mount *mp)
57{ 89{
58 return XFS_CALC_WRITE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); 90 return XFS_DQUOT_LOGRES(mp) +
91 MAX((mp->m_sb.sb_inodesize +
92 XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) +
93 2 * mp->m_sb.sb_sectsize +
94 mp->m_sb.sb_sectsize +
95 XFS_ALLOCFREE_LOG_RES(mp, 2) +
96 128 * (4 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) +
97 XFS_ALLOCFREE_LOG_COUNT(mp, 2))),
98 (2 * mp->m_sb.sb_sectsize +
99 2 * mp->m_sb.sb_sectsize +
100 mp->m_sb.sb_sectsize +
101 XFS_ALLOCFREE_LOG_RES(mp, 2) +
102 128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))));
59} 103}
60 104
105/*
106 * In truncating a file we free up to two extents at once. We can modify:
107 * the inode being truncated: inode size
108 * the inode's bmap btree: (max depth + 1) * block size
109 * And the bmap_finish transaction can free the blocks and bmap blocks:
110 * the agf for each of the ags: 4 * sector size
111 * the agfl for each of the ags: 4 * sector size
112 * the super block to reflect the freed blocks: sector size
113 * worst case split in allocation btrees per extent assuming 4 extents:
114 * 4 exts * 2 trees * (2 * max depth - 1) * block size
115 * the inode btree: max depth * blocksize
116 * the allocation btrees: 2 trees * (max depth - 1) * block size
117 */
61STATIC uint 118STATIC uint
62xfs_calc_itruncate_reservation(xfs_mount_t *mp) 119xfs_calc_itruncate_reservation(
120 struct xfs_mount *mp)
63{ 121{
64 return XFS_CALC_ITRUNCATE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); 122 return XFS_DQUOT_LOGRES(mp) +
123 MAX((mp->m_sb.sb_inodesize +
124 XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1) +
125 128 * (2 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK))),
126 (4 * mp->m_sb.sb_sectsize +
127 4 * mp->m_sb.sb_sectsize +
128 mp->m_sb.sb_sectsize +
129 XFS_ALLOCFREE_LOG_RES(mp, 4) +
130 128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4)) +
131 128 * 5 +
132 XFS_ALLOCFREE_LOG_RES(mp, 1) +
133 128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels +
134 XFS_ALLOCFREE_LOG_COUNT(mp, 1))));
65} 135}
66 136
137/*
138 * In renaming a files we can modify:
139 * the four inodes involved: 4 * inode size
140 * the two directory btrees: 2 * (max depth + v2) * dir block size
141 * the two directory bmap btrees: 2 * max depth * block size
142 * And the bmap_finish transaction can free dir and bmap blocks (two sets
143 * of bmap blocks) giving:
144 * the agf for the ags in which the blocks live: 3 * sector size
145 * the agfl for the ags in which the blocks live: 3 * sector size
146 * the superblock for the free block count: sector size
147 * the allocation btrees: 3 exts * 2 trees * (2 * max depth - 1) * block size
148 */
67STATIC uint 149STATIC uint
68xfs_calc_rename_reservation(xfs_mount_t *mp) 150xfs_calc_rename_reservation(
151 struct xfs_mount *mp)
69{ 152{
70 return XFS_CALC_RENAME_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); 153 return XFS_DQUOT_LOGRES(mp) +
154 MAX((4 * mp->m_sb.sb_inodesize +
155 2 * XFS_DIROP_LOG_RES(mp) +
156 128 * (4 + 2 * XFS_DIROP_LOG_COUNT(mp))),
157 (3 * mp->m_sb.sb_sectsize +
158 3 * mp->m_sb.sb_sectsize +
159 mp->m_sb.sb_sectsize +
160 XFS_ALLOCFREE_LOG_RES(mp, 3) +
161 128 * (7 + XFS_ALLOCFREE_LOG_COUNT(mp, 3))));
71} 162}
72 163
164/*
165 * For creating a link to an inode:
166 * the parent directory inode: inode size
167 * the linked inode: inode size
168 * the directory btree could split: (max depth + v2) * dir block size
169 * the directory bmap btree could join or split: (max depth + v2) * blocksize
170 * And the bmap_finish transaction can free some bmap blocks giving:
171 * the agf for the ag in which the blocks live: sector size
172 * the agfl for the ag in which the blocks live: sector size
173 * the superblock for the free block count: sector size
174 * the allocation btrees: 2 trees * (2 * max depth - 1) * block size
175 */
73STATIC uint 176STATIC uint
74xfs_calc_link_reservation(xfs_mount_t *mp) 177xfs_calc_link_reservation(
178 struct xfs_mount *mp)
75{ 179{
76 return XFS_CALC_LINK_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); 180 return XFS_DQUOT_LOGRES(mp) +
181 MAX((mp->m_sb.sb_inodesize +
182 mp->m_sb.sb_inodesize +
183 XFS_DIROP_LOG_RES(mp) +
184 128 * (2 + XFS_DIROP_LOG_COUNT(mp))),
185 (mp->m_sb.sb_sectsize +
186 mp->m_sb.sb_sectsize +
187 mp->m_sb.sb_sectsize +
188 XFS_ALLOCFREE_LOG_RES(mp, 1) +
189 128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1))));
77} 190}
78 191
192/*
193 * For removing a directory entry we can modify:
194 * the parent directory inode: inode size
195 * the removed inode: inode size
196 * the directory btree could join: (max depth + v2) * dir block size
197 * the directory bmap btree could join or split: (max depth + v2) * blocksize
198 * And the bmap_finish transaction can free the dir and bmap blocks giving:
199 * the agf for the ag in which the blocks live: 2 * sector size
200 * the agfl for the ag in which the blocks live: 2 * sector size
201 * the superblock for the free block count: sector size
202 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
203 */
79STATIC uint 204STATIC uint
80xfs_calc_remove_reservation(xfs_mount_t *mp) 205xfs_calc_remove_reservation(
206 struct xfs_mount *mp)
81{ 207{
82 return XFS_CALC_REMOVE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); 208 return XFS_DQUOT_LOGRES(mp) +
209 MAX((mp->m_sb.sb_inodesize +
210 mp->m_sb.sb_inodesize +
211 XFS_DIROP_LOG_RES(mp) +
212 128 * (2 + XFS_DIROP_LOG_COUNT(mp))),
213 (2 * mp->m_sb.sb_sectsize +
214 2 * mp->m_sb.sb_sectsize +
215 mp->m_sb.sb_sectsize +
216 XFS_ALLOCFREE_LOG_RES(mp, 2) +
217 128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))));
83} 218}
84 219
220/*
221 * For symlink we can modify:
222 * the parent directory inode: inode size
223 * the new inode: inode size
224 * the inode btree entry: 1 block
225 * the directory btree: (max depth + v2) * dir block size
226 * the directory inode's bmap btree: (max depth + v2) * block size
227 * the blocks for the symlink: 1 kB
228 * Or in the first xact we allocate some inodes giving:
229 * the agi and agf of the ag getting the new inodes: 2 * sectorsize
230 * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
231 * the inode btree: max depth * blocksize
232 * the allocation btrees: 2 trees * (2 * max depth - 1) * block size
233 */
85STATIC uint 234STATIC uint
86xfs_calc_symlink_reservation(xfs_mount_t *mp) 235xfs_calc_symlink_reservation(
236 struct xfs_mount *mp)
87{ 237{
88 return XFS_CALC_SYMLINK_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); 238 return XFS_DQUOT_LOGRES(mp) +
239 MAX((mp->m_sb.sb_inodesize +
240 mp->m_sb.sb_inodesize +
241 XFS_FSB_TO_B(mp, 1) +
242 XFS_DIROP_LOG_RES(mp) +
243 1024 +
244 128 * (4 + XFS_DIROP_LOG_COUNT(mp))),
245 (2 * mp->m_sb.sb_sectsize +
246 XFS_FSB_TO_B(mp, XFS_IALLOC_BLOCKS(mp)) +
247 XFS_FSB_TO_B(mp, mp->m_in_maxlevels) +
248 XFS_ALLOCFREE_LOG_RES(mp, 1) +
249 128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels +
250 XFS_ALLOCFREE_LOG_COUNT(mp, 1))));
89} 251}
90 252
253/*
254 * For create we can modify:
255 * the parent directory inode: inode size
256 * the new inode: inode size
257 * the inode btree entry: block size
258 * the superblock for the nlink flag: sector size
259 * the directory btree: (max depth + v2) * dir block size
260 * the directory inode's bmap btree: (max depth + v2) * block size
261 * Or in the first xact we allocate some inodes giving:
262 * the agi and agf of the ag getting the new inodes: 2 * sectorsize
263 * the superblock for the nlink flag: sector size
264 * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
265 * the inode btree: max depth * blocksize
266 * the allocation btrees: 2 trees * (max depth - 1) * block size
267 */
91STATIC uint 268STATIC uint
92xfs_calc_create_reservation(xfs_mount_t *mp) 269xfs_calc_create_reservation(
270 struct xfs_mount *mp)
93{ 271{
94 return XFS_CALC_CREATE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); 272 return XFS_DQUOT_LOGRES(mp) +
273 MAX((mp->m_sb.sb_inodesize +
274 mp->m_sb.sb_inodesize +
275 mp->m_sb.sb_sectsize +
276 XFS_FSB_TO_B(mp, 1) +
277 XFS_DIROP_LOG_RES(mp) +
278 128 * (3 + XFS_DIROP_LOG_COUNT(mp))),
279 (3 * mp->m_sb.sb_sectsize +
280 XFS_FSB_TO_B(mp, XFS_IALLOC_BLOCKS(mp)) +
281 XFS_FSB_TO_B(mp, mp->m_in_maxlevels) +
282 XFS_ALLOCFREE_LOG_RES(mp, 1) +
283 128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels +
284 XFS_ALLOCFREE_LOG_COUNT(mp, 1))));
95} 285}
96 286
287/*
288 * Making a new directory is the same as creating a new file.
289 */
97STATIC uint 290STATIC uint
98xfs_calc_mkdir_reservation(xfs_mount_t *mp) 291xfs_calc_mkdir_reservation(
292 struct xfs_mount *mp)
99{ 293{
100 return XFS_CALC_MKDIR_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); 294 return xfs_calc_create_reservation(mp);
101} 295}
102 296
297/*
298 * In freeing an inode we can modify:
299 * the inode being freed: inode size
300 * the super block free inode counter: sector size
301 * the agi hash list and counters: sector size
302 * the inode btree entry: block size
303 * the on disk inode before ours in the agi hash list: inode cluster size
304 * the inode btree: max depth * blocksize
305 * the allocation btrees: 2 trees * (max depth - 1) * block size
306 */
103STATIC uint 307STATIC uint
104xfs_calc_ifree_reservation(xfs_mount_t *mp) 308xfs_calc_ifree_reservation(
309 struct xfs_mount *mp)
105{ 310{
106 return XFS_CALC_IFREE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); 311 return XFS_DQUOT_LOGRES(mp) +
312 mp->m_sb.sb_inodesize +
313 mp->m_sb.sb_sectsize +
314 mp->m_sb.sb_sectsize +
315 XFS_FSB_TO_B(mp, 1) +
316 MAX((__uint16_t)XFS_FSB_TO_B(mp, 1),
317 XFS_INODE_CLUSTER_SIZE(mp)) +
318 128 * 5 +
319 XFS_ALLOCFREE_LOG_RES(mp, 1) +
320 128 * (2 + XFS_IALLOC_BLOCKS(mp) + mp->m_in_maxlevels +
321 XFS_ALLOCFREE_LOG_COUNT(mp, 1));
107} 322}
108 323
324/*
325 * When only changing the inode we log the inode and possibly the superblock
326 * We also add a bit of slop for the transaction stuff.
327 */
109STATIC uint 328STATIC uint
110xfs_calc_ichange_reservation(xfs_mount_t *mp) 329xfs_calc_ichange_reservation(
330 struct xfs_mount *mp)
111{ 331{
112 return XFS_CALC_ICHANGE_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); 332 return XFS_DQUOT_LOGRES(mp) +
333 mp->m_sb.sb_inodesize +
334 mp->m_sb.sb_sectsize +
335 512;
336
113} 337}
114 338
339/*
340 * Growing the data section of the filesystem.
341 * superblock
342 * agi and agf
343 * allocation btrees
344 */
115STATIC uint 345STATIC uint
116xfs_calc_growdata_reservation(xfs_mount_t *mp) 346xfs_calc_growdata_reservation(
347 struct xfs_mount *mp)
117{ 348{
118 return XFS_CALC_GROWDATA_LOG_RES(mp); 349 return mp->m_sb.sb_sectsize * 3 +
350 XFS_ALLOCFREE_LOG_RES(mp, 1) +
351 128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1));
119} 352}
120 353
354/*
355 * Growing the rt section of the filesystem.
356 * In the first set of transactions (ALLOC) we allocate space to the
357 * bitmap or summary files.
358 * superblock: sector size
359 * agf of the ag from which the extent is allocated: sector size
360 * bmap btree for bitmap/summary inode: max depth * blocksize
361 * bitmap/summary inode: inode size
362 * allocation btrees for 1 block alloc: 2 * (2 * maxdepth - 1) * blocksize
363 */
121STATIC uint 364STATIC uint
122xfs_calc_growrtalloc_reservation(xfs_mount_t *mp) 365xfs_calc_growrtalloc_reservation(
366 struct xfs_mount *mp)
123{ 367{
124 return XFS_CALC_GROWRTALLOC_LOG_RES(mp); 368 return 2 * mp->m_sb.sb_sectsize +
369 XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) +
370 mp->m_sb.sb_inodesize +
371 XFS_ALLOCFREE_LOG_RES(mp, 1) +
372 128 * (3 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) +
373 XFS_ALLOCFREE_LOG_COUNT(mp, 1));
125} 374}
126 375
376/*
377 * Growing the rt section of the filesystem.
378 * In the second set of transactions (ZERO) we zero the new metadata blocks.
379 * one bitmap/summary block: blocksize
380 */
127STATIC uint 381STATIC uint
128xfs_calc_growrtzero_reservation(xfs_mount_t *mp) 382xfs_calc_growrtzero_reservation(
383 struct xfs_mount *mp)
129{ 384{
130 return XFS_CALC_GROWRTZERO_LOG_RES(mp); 385 return mp->m_sb.sb_blocksize + 128;
131} 386}
132 387
388/*
389 * Growing the rt section of the filesystem.
390 * In the third set of transactions (FREE) we update metadata without
391 * allocating any new blocks.
392 * superblock: sector size
393 * bitmap inode: inode size
394 * summary inode: inode size
395 * one bitmap block: blocksize
396 * summary blocks: new summary size
397 */
133STATIC uint 398STATIC uint
134xfs_calc_growrtfree_reservation(xfs_mount_t *mp) 399xfs_calc_growrtfree_reservation(
400 struct xfs_mount *mp)
135{ 401{
136 return XFS_CALC_GROWRTFREE_LOG_RES(mp); 402 return mp->m_sb.sb_sectsize +
403 2 * mp->m_sb.sb_inodesize +
404 mp->m_sb.sb_blocksize +
405 mp->m_rsumsize +
406 128 * 5;
137} 407}
138 408
409/*
410 * Logging the inode modification timestamp on a synchronous write.
411 * inode
412 */
139STATIC uint 413STATIC uint
140xfs_calc_swrite_reservation(xfs_mount_t *mp) 414xfs_calc_swrite_reservation(
415 struct xfs_mount *mp)
141{ 416{
142 return XFS_CALC_SWRITE_LOG_RES(mp); 417 return mp->m_sb.sb_inodesize + 128;
143} 418}
144 419
420/*
421 * Logging the inode mode bits when writing a setuid/setgid file
422 * inode
423 */
145STATIC uint 424STATIC uint
146xfs_calc_writeid_reservation(xfs_mount_t *mp) 425xfs_calc_writeid_reservation(xfs_mount_t *mp)
147{ 426{
148 return XFS_CALC_WRITEID_LOG_RES(mp); 427 return mp->m_sb.sb_inodesize + 128;
149} 428}
150 429
430/*
431 * Converting the inode from non-attributed to attributed.
432 * the inode being converted: inode size
433 * agf block and superblock (for block allocation)
434 * the new block (directory sized)
435 * bmap blocks for the new directory block
436 * allocation btrees
437 */
151STATIC uint 438STATIC uint
152xfs_calc_addafork_reservation(xfs_mount_t *mp) 439xfs_calc_addafork_reservation(
440 struct xfs_mount *mp)
153{ 441{
154 return XFS_CALC_ADDAFORK_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); 442 return XFS_DQUOT_LOGRES(mp) +
443 mp->m_sb.sb_inodesize +
444 mp->m_sb.sb_sectsize * 2 +
445 mp->m_dirblksize +
446 XFS_FSB_TO_B(mp, XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1) +
447 XFS_ALLOCFREE_LOG_RES(mp, 1) +
448 128 * (4 + XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1 +
449 XFS_ALLOCFREE_LOG_COUNT(mp, 1));
155} 450}
156 451
452/*
453 * Removing the attribute fork of a file
454 * the inode being truncated: inode size
455 * the inode's bmap btree: max depth * block size
456 * And the bmap_finish transaction can free the blocks and bmap blocks:
457 * the agf for each of the ags: 4 * sector size
458 * the agfl for each of the ags: 4 * sector size
459 * the super block to reflect the freed blocks: sector size
460 * worst case split in allocation btrees per extent assuming 4 extents:
461 * 4 exts * 2 trees * (2 * max depth - 1) * block size
462 */
157STATIC uint 463STATIC uint
158xfs_calc_attrinval_reservation(xfs_mount_t *mp) 464xfs_calc_attrinval_reservation(
465 struct xfs_mount *mp)
159{ 466{
160 return XFS_CALC_ATTRINVAL_LOG_RES(mp); 467 return MAX((mp->m_sb.sb_inodesize +
468 XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) +
469 128 * (1 + XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK))),
470 (4 * mp->m_sb.sb_sectsize +
471 4 * mp->m_sb.sb_sectsize +
472 mp->m_sb.sb_sectsize +
473 XFS_ALLOCFREE_LOG_RES(mp, 4) +
474 128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4))));
161} 475}
162 476
477/*
478 * Setting an attribute.
479 * the inode getting the attribute
480 * the superblock for allocations
481 * the agfs extents are allocated from
482 * the attribute btree * max depth
483 * the inode allocation btree
484 * Since attribute transaction space is dependent on the size of the attribute,
485 * the calculation is done partially at mount time and partially at runtime.
486 */
163STATIC uint 487STATIC uint
164xfs_calc_attrset_reservation(xfs_mount_t *mp) 488xfs_calc_attrset_reservation(
489 struct xfs_mount *mp)
165{ 490{
166 return XFS_CALC_ATTRSET_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); 491 return XFS_DQUOT_LOGRES(mp) +
492 mp->m_sb.sb_inodesize +
493 mp->m_sb.sb_sectsize +
494 XFS_FSB_TO_B(mp, XFS_DA_NODE_MAXDEPTH) +
495 128 * (2 + XFS_DA_NODE_MAXDEPTH);
167} 496}
168 497
498/*
499 * Removing an attribute.
500 * the inode: inode size
501 * the attribute btree could join: max depth * block size
502 * the inode bmap btree could join or split: max depth * block size
503 * And the bmap_finish transaction can free the attr blocks freed giving:
504 * the agf for the ag in which the blocks live: 2 * sector size
505 * the agfl for the ag in which the blocks live: 2 * sector size
506 * the superblock for the free block count: sector size
507 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
508 */
169STATIC uint 509STATIC uint
170xfs_calc_attrrm_reservation(xfs_mount_t *mp) 510xfs_calc_attrrm_reservation(
511 struct xfs_mount *mp)
171{ 512{
172 return XFS_CALC_ATTRRM_LOG_RES(mp) + XFS_DQUOT_LOGRES(mp); 513 return XFS_DQUOT_LOGRES(mp) +
514 MAX((mp->m_sb.sb_inodesize +
515 XFS_FSB_TO_B(mp, XFS_DA_NODE_MAXDEPTH) +
516 XFS_FSB_TO_B(mp, XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) +
517 128 * (1 + XFS_DA_NODE_MAXDEPTH +
518 XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK))),
519 (2 * mp->m_sb.sb_sectsize +
520 2 * mp->m_sb.sb_sectsize +
521 mp->m_sb.sb_sectsize +
522 XFS_ALLOCFREE_LOG_RES(mp, 2) +
523 128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))));
173} 524}
174 525
526/*
527 * Clearing a bad agino number in an agi hash bucket.
528 */
175STATIC uint 529STATIC uint
176xfs_calc_clear_agi_bucket_reservation(xfs_mount_t *mp) 530xfs_calc_clear_agi_bucket_reservation(
531 struct xfs_mount *mp)
177{ 532{
178 return XFS_CALC_CLEAR_AGI_BUCKET_LOG_RES(mp); 533 return mp->m_sb.sb_sectsize + 128;
179} 534}
180 535
181/* 536/*
@@ -184,11 +539,10 @@ xfs_calc_clear_agi_bucket_reservation(xfs_mount_t *mp)
184 */ 539 */
185void 540void
186xfs_trans_init( 541xfs_trans_init(
187 xfs_mount_t *mp) 542 struct xfs_mount *mp)
188{ 543{
189 xfs_trans_reservations_t *resp; 544 struct xfs_trans_reservations *resp = &mp->m_reservations;
190 545
191 resp = &(mp->m_reservations);
192 resp->tr_write = xfs_calc_write_reservation(mp); 546 resp->tr_write = xfs_calc_write_reservation(mp);
193 resp->tr_itruncate = xfs_calc_itruncate_reservation(mp); 547 resp->tr_itruncate = xfs_calc_itruncate_reservation(mp);
194 resp->tr_rename = xfs_calc_rename_reservation(mp); 548 resp->tr_rename = xfs_calc_rename_reservation(mp);
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 8c69e7824f68..e639e8e9a2a9 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -300,24 +300,6 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
300 300
301 301
302/* 302/*
303 * Various log reservation values.
304 * These are based on the size of the file system block
305 * because that is what most transactions manipulate.
306 * Each adds in an additional 128 bytes per item logged to
307 * try to account for the overhead of the transaction mechanism.
308 *
309 * Note:
310 * Most of the reservations underestimate the number of allocation
311 * groups into which they could free extents in the xfs_bmap_finish()
312 * call. This is because the number in the worst case is quite high
313 * and quite unusual. In order to fix this we need to change
314 * xfs_bmap_finish() to free extents in only a single AG at a time.
315 * This will require changes to the EFI code as well, however, so that
316 * the EFI for the extents not freed is logged again in each transaction.
317 * See bug 261917.
318 */
319
320/*
321 * Per-extent log reservation for the allocation btree changes 303 * Per-extent log reservation for the allocation btree changes
322 * involved in freeing or allocating an extent. 304 * involved in freeing or allocating an extent.
323 * 2 trees * (2 blocks/level * max depth - 1) * block size 305 * 2 trees * (2 blocks/level * max depth - 1) * block size
@@ -341,429 +323,36 @@ xfs_lic_desc_to_chunk(xfs_log_item_desc_t *dp)
341 (XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK) + \ 323 (XFS_DAENTER_BLOCKS(mp, XFS_DATA_FORK) + \
342 XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1) 324 XFS_DAENTER_BMAPS(mp, XFS_DATA_FORK) + 1)
343 325
344/*
345 * In a write transaction we can allocate a maximum of 2
346 * extents. This gives:
347 * the inode getting the new extents: inode size
348 * the inode's bmap btree: max depth * block size
349 * the agfs of the ags from which the extents are allocated: 2 * sector
350 * the superblock free block counter: sector size
351 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
352 * And the bmap_finish transaction can free bmap blocks in a join:
353 * the agfs of the ags containing the blocks: 2 * sector size
354 * the agfls of the ags containing the blocks: 2 * sector size
355 * the super block free block counter: sector size
356 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
357 */
358#define XFS_CALC_WRITE_LOG_RES(mp) \
359 (MAX( \
360 ((mp)->m_sb.sb_inodesize + \
361 XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) + \
362 (2 * (mp)->m_sb.sb_sectsize) + \
363 (mp)->m_sb.sb_sectsize + \
364 XFS_ALLOCFREE_LOG_RES(mp, 2) + \
365 (128 * (4 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + XFS_ALLOCFREE_LOG_COUNT(mp, 2)))),\
366 ((2 * (mp)->m_sb.sb_sectsize) + \
367 (2 * (mp)->m_sb.sb_sectsize) + \
368 (mp)->m_sb.sb_sectsize + \
369 XFS_ALLOCFREE_LOG_RES(mp, 2) + \
370 (128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))))))
371 326
372#define XFS_WRITE_LOG_RES(mp) ((mp)->m_reservations.tr_write) 327#define XFS_WRITE_LOG_RES(mp) ((mp)->m_reservations.tr_write)
373
374/*
375 * In truncating a file we free up to two extents at once. We can modify:
376 * the inode being truncated: inode size
377 * the inode's bmap btree: (max depth + 1) * block size
378 * And the bmap_finish transaction can free the blocks and bmap blocks:
379 * the agf for each of the ags: 4 * sector size
380 * the agfl for each of the ags: 4 * sector size
381 * the super block to reflect the freed blocks: sector size
382 * worst case split in allocation btrees per extent assuming 4 extents:
383 * 4 exts * 2 trees * (2 * max depth - 1) * block size
384 * the inode btree: max depth * blocksize
385 * the allocation btrees: 2 trees * (max depth - 1) * block size
386 */
387#define XFS_CALC_ITRUNCATE_LOG_RES(mp) \
388 (MAX( \
389 ((mp)->m_sb.sb_inodesize + \
390 XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1) + \
391 (128 * (2 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)))), \
392 ((4 * (mp)->m_sb.sb_sectsize) + \
393 (4 * (mp)->m_sb.sb_sectsize) + \
394 (mp)->m_sb.sb_sectsize + \
395 XFS_ALLOCFREE_LOG_RES(mp, 4) + \
396 (128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4))) + \
397 (128 * 5) + \
398 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
399 (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \
400 XFS_ALLOCFREE_LOG_COUNT(mp, 1))))))
401
402#define XFS_ITRUNCATE_LOG_RES(mp) ((mp)->m_reservations.tr_itruncate) 328#define XFS_ITRUNCATE_LOG_RES(mp) ((mp)->m_reservations.tr_itruncate)
403
404/*
405 * In renaming a files we can modify:
406 * the four inodes involved: 4 * inode size
407 * the two directory btrees: 2 * (max depth + v2) * dir block size
408 * the two directory bmap btrees: 2 * max depth * block size
409 * And the bmap_finish transaction can free dir and bmap blocks (two sets
410 * of bmap blocks) giving:
411 * the agf for the ags in which the blocks live: 3 * sector size
412 * the agfl for the ags in which the blocks live: 3 * sector size
413 * the superblock for the free block count: sector size
414 * the allocation btrees: 3 exts * 2 trees * (2 * max depth - 1) * block size
415 */
416#define XFS_CALC_RENAME_LOG_RES(mp) \
417 (MAX( \
418 ((4 * (mp)->m_sb.sb_inodesize) + \
419 (2 * XFS_DIROP_LOG_RES(mp)) + \
420 (128 * (4 + 2 * XFS_DIROP_LOG_COUNT(mp)))), \
421 ((3 * (mp)->m_sb.sb_sectsize) + \
422 (3 * (mp)->m_sb.sb_sectsize) + \
423 (mp)->m_sb.sb_sectsize + \
424 XFS_ALLOCFREE_LOG_RES(mp, 3) + \
425 (128 * (7 + XFS_ALLOCFREE_LOG_COUNT(mp, 3))))))
426
427#define XFS_RENAME_LOG_RES(mp) ((mp)->m_reservations.tr_rename) 329#define XFS_RENAME_LOG_RES(mp) ((mp)->m_reservations.tr_rename)
428
429/*
430 * For creating a link to an inode:
431 * the parent directory inode: inode size
432 * the linked inode: inode size
433 * the directory btree could split: (max depth + v2) * dir block size
434 * the directory bmap btree could join or split: (max depth + v2) * blocksize
435 * And the bmap_finish transaction can free some bmap blocks giving:
436 * the agf for the ag in which the blocks live: sector size
437 * the agfl for the ag in which the blocks live: sector size
438 * the superblock for the free block count: sector size
439 * the allocation btrees: 2 trees * (2 * max depth - 1) * block size
440 */
441#define XFS_CALC_LINK_LOG_RES(mp) \
442 (MAX( \
443 ((mp)->m_sb.sb_inodesize + \
444 (mp)->m_sb.sb_inodesize + \
445 XFS_DIROP_LOG_RES(mp) + \
446 (128 * (2 + XFS_DIROP_LOG_COUNT(mp)))), \
447 ((mp)->m_sb.sb_sectsize + \
448 (mp)->m_sb.sb_sectsize + \
449 (mp)->m_sb.sb_sectsize + \
450 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
451 (128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1))))))
452
453#define XFS_LINK_LOG_RES(mp) ((mp)->m_reservations.tr_link) 330#define XFS_LINK_LOG_RES(mp) ((mp)->m_reservations.tr_link)
454
455/*
456 * For removing a directory entry we can modify:
457 * the parent directory inode: inode size
458 * the removed inode: inode size
459 * the directory btree could join: (max depth + v2) * dir block size
460 * the directory bmap btree could join or split: (max depth + v2) * blocksize
461 * And the bmap_finish transaction can free the dir and bmap blocks giving:
462 * the agf for the ag in which the blocks live: 2 * sector size
463 * the agfl for the ag in which the blocks live: 2 * sector size
464 * the superblock for the free block count: sector size
465 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
466 */
467#define XFS_CALC_REMOVE_LOG_RES(mp) \
468 (MAX( \
469 ((mp)->m_sb.sb_inodesize + \
470 (mp)->m_sb.sb_inodesize + \
471 XFS_DIROP_LOG_RES(mp) + \
472 (128 * (2 + XFS_DIROP_LOG_COUNT(mp)))), \
473 ((2 * (mp)->m_sb.sb_sectsize) + \
474 (2 * (mp)->m_sb.sb_sectsize) + \
475 (mp)->m_sb.sb_sectsize + \
476 XFS_ALLOCFREE_LOG_RES(mp, 2) + \
477 (128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))))))
478
479#define XFS_REMOVE_LOG_RES(mp) ((mp)->m_reservations.tr_remove) 331#define XFS_REMOVE_LOG_RES(mp) ((mp)->m_reservations.tr_remove)
480
481/*
482 * For symlink we can modify:
483 * the parent directory inode: inode size
484 * the new inode: inode size
485 * the inode btree entry: 1 block
486 * the directory btree: (max depth + v2) * dir block size
487 * the directory inode's bmap btree: (max depth + v2) * block size
488 * the blocks for the symlink: 1 kB
489 * Or in the first xact we allocate some inodes giving:
490 * the agi and agf of the ag getting the new inodes: 2 * sectorsize
491 * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
492 * the inode btree: max depth * blocksize
493 * the allocation btrees: 2 trees * (2 * max depth - 1) * block size
494 */
495#define XFS_CALC_SYMLINK_LOG_RES(mp) \
496 (MAX( \
497 ((mp)->m_sb.sb_inodesize + \
498 (mp)->m_sb.sb_inodesize + \
499 XFS_FSB_TO_B(mp, 1) + \
500 XFS_DIROP_LOG_RES(mp) + \
501 1024 + \
502 (128 * (4 + XFS_DIROP_LOG_COUNT(mp)))), \
503 (2 * (mp)->m_sb.sb_sectsize + \
504 XFS_FSB_TO_B((mp), XFS_IALLOC_BLOCKS((mp))) + \
505 XFS_FSB_TO_B((mp), (mp)->m_in_maxlevels) + \
506 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
507 (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \
508 XFS_ALLOCFREE_LOG_COUNT(mp, 1))))))
509
510#define XFS_SYMLINK_LOG_RES(mp) ((mp)->m_reservations.tr_symlink) 332#define XFS_SYMLINK_LOG_RES(mp) ((mp)->m_reservations.tr_symlink)
511
512/*
513 * For create we can modify:
514 * the parent directory inode: inode size
515 * the new inode: inode size
516 * the inode btree entry: block size
517 * the superblock for the nlink flag: sector size
518 * the directory btree: (max depth + v2) * dir block size
519 * the directory inode's bmap btree: (max depth + v2) * block size
520 * Or in the first xact we allocate some inodes giving:
521 * the agi and agf of the ag getting the new inodes: 2 * sectorsize
522 * the superblock for the nlink flag: sector size
523 * the inode blocks allocated: XFS_IALLOC_BLOCKS * blocksize
524 * the inode btree: max depth * blocksize
525 * the allocation btrees: 2 trees * (max depth - 1) * block size
526 */
527#define XFS_CALC_CREATE_LOG_RES(mp) \
528 (MAX( \
529 ((mp)->m_sb.sb_inodesize + \
530 (mp)->m_sb.sb_inodesize + \
531 (mp)->m_sb.sb_sectsize + \
532 XFS_FSB_TO_B(mp, 1) + \
533 XFS_DIROP_LOG_RES(mp) + \
534 (128 * (3 + XFS_DIROP_LOG_COUNT(mp)))), \
535 (3 * (mp)->m_sb.sb_sectsize + \
536 XFS_FSB_TO_B((mp), XFS_IALLOC_BLOCKS((mp))) + \
537 XFS_FSB_TO_B((mp), (mp)->m_in_maxlevels) + \
538 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
539 (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \
540 XFS_ALLOCFREE_LOG_COUNT(mp, 1))))))
541
542#define XFS_CREATE_LOG_RES(mp) ((mp)->m_reservations.tr_create) 333#define XFS_CREATE_LOG_RES(mp) ((mp)->m_reservations.tr_create)
543
544/*
545 * Making a new directory is the same as creating a new file.
546 */
547#define XFS_CALC_MKDIR_LOG_RES(mp) XFS_CALC_CREATE_LOG_RES(mp)
548
549#define XFS_MKDIR_LOG_RES(mp) ((mp)->m_reservations.tr_mkdir) 334#define XFS_MKDIR_LOG_RES(mp) ((mp)->m_reservations.tr_mkdir)
550
551/*
552 * In freeing an inode we can modify:
553 * the inode being freed: inode size
554 * the super block free inode counter: sector size
555 * the agi hash list and counters: sector size
556 * the inode btree entry: block size
557 * the on disk inode before ours in the agi hash list: inode cluster size
558 * the inode btree: max depth * blocksize
559 * the allocation btrees: 2 trees * (max depth - 1) * block size
560 */
561#define XFS_CALC_IFREE_LOG_RES(mp) \
562 ((mp)->m_sb.sb_inodesize + \
563 (mp)->m_sb.sb_sectsize + \
564 (mp)->m_sb.sb_sectsize + \
565 XFS_FSB_TO_B((mp), 1) + \
566 MAX((__uint16_t)XFS_FSB_TO_B((mp), 1), XFS_INODE_CLUSTER_SIZE(mp)) + \
567 (128 * 5) + \
568 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
569 (128 * (2 + XFS_IALLOC_BLOCKS(mp) + (mp)->m_in_maxlevels + \
570 XFS_ALLOCFREE_LOG_COUNT(mp, 1))))
571
572
573#define XFS_IFREE_LOG_RES(mp) ((mp)->m_reservations.tr_ifree) 335#define XFS_IFREE_LOG_RES(mp) ((mp)->m_reservations.tr_ifree)
574
575/*
576 * When only changing the inode we log the inode and possibly the superblock
577 * We also add a bit of slop for the transaction stuff.
578 */
579#define XFS_CALC_ICHANGE_LOG_RES(mp) ((mp)->m_sb.sb_inodesize + \
580 (mp)->m_sb.sb_sectsize + 512)
581
582#define XFS_ICHANGE_LOG_RES(mp) ((mp)->m_reservations.tr_ichange) 336#define XFS_ICHANGE_LOG_RES(mp) ((mp)->m_reservations.tr_ichange)
583
584/*
585 * Growing the data section of the filesystem.
586 * superblock
587 * agi and agf
588 * allocation btrees
589 */
590#define XFS_CALC_GROWDATA_LOG_RES(mp) \
591 ((mp)->m_sb.sb_sectsize * 3 + \
592 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
593 (128 * (3 + XFS_ALLOCFREE_LOG_COUNT(mp, 1))))
594
595#define XFS_GROWDATA_LOG_RES(mp) ((mp)->m_reservations.tr_growdata) 337#define XFS_GROWDATA_LOG_RES(mp) ((mp)->m_reservations.tr_growdata)
596
597/*
598 * Growing the rt section of the filesystem.
599 * In the first set of transactions (ALLOC) we allocate space to the
600 * bitmap or summary files.
601 * superblock: sector size
602 * agf of the ag from which the extent is allocated: sector size
603 * bmap btree for bitmap/summary inode: max depth * blocksize
604 * bitmap/summary inode: inode size
605 * allocation btrees for 1 block alloc: 2 * (2 * maxdepth - 1) * blocksize
606 */
607#define XFS_CALC_GROWRTALLOC_LOG_RES(mp) \
608 (2 * (mp)->m_sb.sb_sectsize + \
609 XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)) + \
610 (mp)->m_sb.sb_inodesize + \
611 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
612 (128 * \
613 (3 + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + \
614 XFS_ALLOCFREE_LOG_COUNT(mp, 1))))
615
616#define XFS_GROWRTALLOC_LOG_RES(mp) ((mp)->m_reservations.tr_growrtalloc) 338#define XFS_GROWRTALLOC_LOG_RES(mp) ((mp)->m_reservations.tr_growrtalloc)
617
618/*
619 * Growing the rt section of the filesystem.
620 * In the second set of transactions (ZERO) we zero the new metadata blocks.
621 * one bitmap/summary block: blocksize
622 */
623#define XFS_CALC_GROWRTZERO_LOG_RES(mp) \
624 ((mp)->m_sb.sb_blocksize + 128)
625
626#define XFS_GROWRTZERO_LOG_RES(mp) ((mp)->m_reservations.tr_growrtzero) 339#define XFS_GROWRTZERO_LOG_RES(mp) ((mp)->m_reservations.tr_growrtzero)
627
628/*
629 * Growing the rt section of the filesystem.
630 * In the third set of transactions (FREE) we update metadata without
631 * allocating any new blocks.
632 * superblock: sector size
633 * bitmap inode: inode size
634 * summary inode: inode size
635 * one bitmap block: blocksize
636 * summary blocks: new summary size
637 */
638#define XFS_CALC_GROWRTFREE_LOG_RES(mp) \
639 ((mp)->m_sb.sb_sectsize + \
640 2 * (mp)->m_sb.sb_inodesize + \
641 (mp)->m_sb.sb_blocksize + \
642 (mp)->m_rsumsize + \
643 (128 * 5))
644
645#define XFS_GROWRTFREE_LOG_RES(mp) ((mp)->m_reservations.tr_growrtfree) 340#define XFS_GROWRTFREE_LOG_RES(mp) ((mp)->m_reservations.tr_growrtfree)
646
647/*
648 * Logging the inode modification timestamp on a synchronous write.
649 * inode
650 */
651#define XFS_CALC_SWRITE_LOG_RES(mp) \
652 ((mp)->m_sb.sb_inodesize + 128)
653
654#define XFS_SWRITE_LOG_RES(mp) ((mp)->m_reservations.tr_swrite) 341#define XFS_SWRITE_LOG_RES(mp) ((mp)->m_reservations.tr_swrite)
655
656/* 342/*
657 * Logging the inode timestamps on an fsync -- same as SWRITE 343 * Logging the inode timestamps on an fsync -- same as SWRITE
658 * as long as SWRITE logs the entire inode core 344 * as long as SWRITE logs the entire inode core
659 */ 345 */
660#define XFS_FSYNC_TS_LOG_RES(mp) ((mp)->m_reservations.tr_swrite) 346#define XFS_FSYNC_TS_LOG_RES(mp) ((mp)->m_reservations.tr_swrite)
661
662/*
663 * Logging the inode mode bits when writing a setuid/setgid file
664 * inode
665 */
666#define XFS_CALC_WRITEID_LOG_RES(mp) \
667 ((mp)->m_sb.sb_inodesize + 128)
668
669#define XFS_WRITEID_LOG_RES(mp) ((mp)->m_reservations.tr_swrite) 347#define XFS_WRITEID_LOG_RES(mp) ((mp)->m_reservations.tr_swrite)
670
671/*
672 * Converting the inode from non-attributed to attributed.
673 * the inode being converted: inode size
674 * agf block and superblock (for block allocation)
675 * the new block (directory sized)
676 * bmap blocks for the new directory block
677 * allocation btrees
678 */
679#define XFS_CALC_ADDAFORK_LOG_RES(mp) \
680 ((mp)->m_sb.sb_inodesize + \
681 (mp)->m_sb.sb_sectsize * 2 + \
682 (mp)->m_dirblksize + \
683 XFS_FSB_TO_B(mp, (XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1)) + \
684 XFS_ALLOCFREE_LOG_RES(mp, 1) + \
685 (128 * (4 + (XFS_DAENTER_BMAP1B(mp, XFS_DATA_FORK) + 1) + \
686 XFS_ALLOCFREE_LOG_COUNT(mp, 1))))
687
688#define XFS_ADDAFORK_LOG_RES(mp) ((mp)->m_reservations.tr_addafork) 348#define XFS_ADDAFORK_LOG_RES(mp) ((mp)->m_reservations.tr_addafork)
689
690/*
691 * Removing the attribute fork of a file
692 * the inode being truncated: inode size
693 * the inode's bmap btree: max depth * block size
694 * And the bmap_finish transaction can free the blocks and bmap blocks:
695 * the agf for each of the ags: 4 * sector size
696 * the agfl for each of the ags: 4 * sector size
697 * the super block to reflect the freed blocks: sector size
698 * worst case split in allocation btrees per extent assuming 4 extents:
699 * 4 exts * 2 trees * (2 * max depth - 1) * block size
700 */
701#define XFS_CALC_ATTRINVAL_LOG_RES(mp) \
702 (MAX( \
703 ((mp)->m_sb.sb_inodesize + \
704 XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) + \
705 (128 * (1 + XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)))), \
706 ((4 * (mp)->m_sb.sb_sectsize) + \
707 (4 * (mp)->m_sb.sb_sectsize) + \
708 (mp)->m_sb.sb_sectsize + \
709 XFS_ALLOCFREE_LOG_RES(mp, 4) + \
710 (128 * (9 + XFS_ALLOCFREE_LOG_COUNT(mp, 4))))))
711
712#define XFS_ATTRINVAL_LOG_RES(mp) ((mp)->m_reservations.tr_attrinval) 349#define XFS_ATTRINVAL_LOG_RES(mp) ((mp)->m_reservations.tr_attrinval)
713
714/*
715 * Setting an attribute.
716 * the inode getting the attribute
717 * the superblock for allocations
718 * the agfs extents are allocated from
719 * the attribute btree * max depth
720 * the inode allocation btree
721 * Since attribute transaction space is dependent on the size of the attribute,
722 * the calculation is done partially at mount time and partially at runtime.
723 */
724#define XFS_CALC_ATTRSET_LOG_RES(mp) \
725 ((mp)->m_sb.sb_inodesize + \
726 (mp)->m_sb.sb_sectsize + \
727 XFS_FSB_TO_B((mp), XFS_DA_NODE_MAXDEPTH) + \
728 (128 * (2 + XFS_DA_NODE_MAXDEPTH)))
729
730#define XFS_ATTRSET_LOG_RES(mp, ext) \ 350#define XFS_ATTRSET_LOG_RES(mp, ext) \
731 ((mp)->m_reservations.tr_attrset + \ 351 ((mp)->m_reservations.tr_attrset + \
732 (ext * (mp)->m_sb.sb_sectsize) + \ 352 (ext * (mp)->m_sb.sb_sectsize) + \
733 (ext * XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK))) + \ 353 (ext * XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK))) + \
734 (128 * (ext + (ext * XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK))))) 354 (128 * (ext + (ext * XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)))))
735
736/*
737 * Removing an attribute.
738 * the inode: inode size
739 * the attribute btree could join: max depth * block size
740 * the inode bmap btree could join or split: max depth * block size
741 * And the bmap_finish transaction can free the attr blocks freed giving:
742 * the agf for the ag in which the blocks live: 2 * sector size
743 * the agfl for the ag in which the blocks live: 2 * sector size
744 * the superblock for the free block count: sector size
745 * the allocation btrees: 2 exts * 2 trees * (2 * max depth - 1) * block size
746 */
747#define XFS_CALC_ATTRRM_LOG_RES(mp) \
748 (MAX( \
749 ((mp)->m_sb.sb_inodesize + \
750 XFS_FSB_TO_B((mp), XFS_DA_NODE_MAXDEPTH) + \
751 XFS_FSB_TO_B((mp), XFS_BM_MAXLEVELS(mp, XFS_ATTR_FORK)) + \
752 (128 * (1 + XFS_DA_NODE_MAXDEPTH + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK)))), \
753 ((2 * (mp)->m_sb.sb_sectsize) + \
754 (2 * (mp)->m_sb.sb_sectsize) + \
755 (mp)->m_sb.sb_sectsize + \
756 XFS_ALLOCFREE_LOG_RES(mp, 2) + \
757 (128 * (5 + XFS_ALLOCFREE_LOG_COUNT(mp, 2))))))
758
759#define XFS_ATTRRM_LOG_RES(mp) ((mp)->m_reservations.tr_attrrm) 355#define XFS_ATTRRM_LOG_RES(mp) ((mp)->m_reservations.tr_attrrm)
760
761/*
762 * Clearing a bad agino number in an agi hash bucket.
763 */
764#define XFS_CALC_CLEAR_AGI_BUCKET_LOG_RES(mp) \
765 ((mp)->m_sb.sb_sectsize + 128)
766
767#define XFS_CLEAR_AGI_BUCKET_LOG_RES(mp) ((mp)->m_reservations.tr_clearagi) 356#define XFS_CLEAR_AGI_BUCKET_LOG_RES(mp) ((mp)->m_reservations.tr_clearagi)
768 357
769 358
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 9d376be0ea38..a06bd62504fc 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -267,7 +267,7 @@ xfs_setattr(
267 if (code) { 267 if (code) {
268 ASSERT(tp == NULL); 268 ASSERT(tp == NULL);
269 lock_flags &= ~XFS_ILOCK_EXCL; 269 lock_flags &= ~XFS_ILOCK_EXCL;
270 ASSERT(lock_flags == XFS_IOLOCK_EXCL); 270 ASSERT(lock_flags == XFS_IOLOCK_EXCL || !need_iolock);
271 goto error_return; 271 goto error_return;
272 } 272 }
273 tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE); 273 tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE);