Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6 into for-linus

author: Felix Blyakher <felixb@sgi.com> 2009-04-01 17:58:39 -0400
committer: Felix Blyakher <felixb@sgi.com> 2009-04-01 17:58:39 -0400
commit: f36345ff9a4a77f2cc576a2777b6256d5c8798fa (patch)
tree: 7ae4c607f6baae74060c2e385f744e171fbbf92b /fs
parent: 1aacc064e029f0017384e463121b98f06d3a2cc3 (diff)
parent: 8b53ef33d9d8fa5f771ae11cc6a6e7bc0182beec (diff)
92 files changed, 5259 insertions, 3261 deletions
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index a76803108d06..b7ff33c63101 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -186,6 +186,8 @@ int autofs4_expire_wait(struct dentry *dentry);
 int autofs4_expire_run(struct super_block *, struct vfsmount *,
                        struct autofs_sb_info *,
                        struct autofs_packet_expire __user *);
+int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
+                            struct autofs_sb_info *sbi, int when);
 int autofs4_expire_multi(struct super_block *, struct vfsmount *,
                        struct autofs_sb_info *, int __user *);
 struct dentry *autofs4_expire_direct(struct super_block *sb,
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index 025e105bffea..9e5ae8a4f5c8 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -525,40 +525,13 @@ static int autofs_dev_ioctl_expire(struct file *fp,
                                   struct autofs_sb_info *sbi,
                                   struct autofs_dev_ioctl *param)
 {
-        struct dentry *dentry;
        struct vfsmount *mnt;
-        int err = -EAGAIN;
        int how;
        how = param->expire.how;
        mnt = fp->f_path.mnt;
-        if (autofs_type_trigger(sbi->type))
+        return autofs4_do_expire_multi(sbi->sb, mnt, sbi, how);
-                dentry = autofs4_expire_direct(sbi->sb, mnt, sbi, how);
-        else
-                dentry = autofs4_expire_indirect(sbi->sb, mnt, sbi, how);
-        if (dentry) {
-                struct autofs_info *ino = autofs4_dentry_ino(dentry);
-                /*
-                 * This is synchronous because it makes the daemon a
-                 * little easier
-                */
-                err = autofs4_wait(sbi, dentry, NFY_EXPIRE);
-                spin_lock(&sbi->fs_lock);
-                if (ino->flags & AUTOFS_INF_MOUNTPOINT) {
-                        ino->flags &= ~AUTOFS_INF_MOUNTPOINT;
-                        sbi->sb->s_root->d_mounted++;
-                }
-                ino->flags &= ~AUTOFS_INF_EXPIRING;
-                complete_all(&ino->expire_complete);
-                spin_unlock(&sbi->fs_lock);
-                dput(dentry);
-        }
-        return err;
 }
 /* Check if autofs mount point is in use */
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index e3bd50776f9e..75f7ddacf7d6 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -478,22 +478,16 @@ int autofs4_expire_run(struct super_block *sb,
        return ret;
 }
-/* Call repeatedly until it returns -EAGAIN, meaning there's nothing
+int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
-   more to be done */
+                            struct autofs_sb_info *sbi, int when)
-int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt,
-                        struct autofs_sb_info *sbi, int __user *arg)
 {
        struct dentry *dentry;
        int ret = -EAGAIN;
-        int do_now = 0;
-        if (arg && get_user(do_now, arg))
-                return -EFAULT;
        if (autofs_type_trigger(sbi->type))
-                dentry = autofs4_expire_direct(sb, mnt, sbi, do_now);
+                dentry = autofs4_expire_direct(sb, mnt, sbi, when);
        else
-                dentry = autofs4_expire_indirect(sb, mnt, sbi, do_now);
+                dentry = autofs4_expire_indirect(sb, mnt, sbi, when);
        if (dentry) {
                struct autofs_info *ino = autofs4_dentry_ino(dentry);
@@ -516,3 +510,16 @@ int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt,
        return ret;
 }
+/* Call repeatedly until it returns -EAGAIN, meaning there's nothing
+   more to be done */
+int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt,
+                        struct autofs_sb_info *sbi, int __user *arg)
+{
+        int do_now = 0;
+        if (arg && get_user(do_now, arg))
+                return -EFAULT;
+        return autofs4_do_expire_multi(sb, mnt, sbi, do_now);
+}
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 74b1469a9504..e383bf0334f1 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -485,22 +485,6 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
        DPRINTK("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d",
                 current->pid, task_pgrp_nr(current), sbi->catatonic, oz_mode);
-        expiring = autofs4_lookup_expiring(sbi, dentry->d_parent, &dentry->d_name);
-        if (expiring) {
-                /*
-                 * If we are racing with expire the request might not
-                 * be quite complete but the directory has been removed
-                 * so it must have been successful, so just wait for it.
-                 */
-                ino = autofs4_dentry_ino(expiring);
-                autofs4_expire_wait(expiring);
-                spin_lock(&sbi->lookup_lock);
-                if (!list_empty(&ino->expiring))
-                        list_del_init(&ino->expiring);
-                spin_unlock(&sbi->lookup_lock);
-                dput(expiring);
-        }
        unhashed = autofs4_lookup_active(sbi, dentry->d_parent, &dentry->d_name);
        if (unhashed)
                dentry = unhashed;
@@ -538,14 +522,31 @@ static struct dentry *autofs4_lookup(struct inode *dir, struct dentry *dentry, s
        }
        if (!oz_mode) {
+                mutex_unlock(&dir->i_mutex);
+                expiring = autofs4_lookup_expiring(sbi,
+                                                   dentry->d_parent,
+                                                   &dentry->d_name);
+                if (expiring) {
+                        /*
+                         * If we are racing with expire the request might not
+                         * be quite complete but the directory has been removed
+                         * so it must have been successful, so just wait for it.
+                         */
+                        ino = autofs4_dentry_ino(expiring);
+                        autofs4_expire_wait(expiring);
+                        spin_lock(&sbi->lookup_lock);
+                        if (!list_empty(&ino->expiring))
+                                list_del_init(&ino->expiring);
+                        spin_unlock(&sbi->lookup_lock);
+                        dput(expiring);
+                }
                spin_lock(&dentry->d_lock);
                dentry->d_flags |= DCACHE_AUTOFS_PENDING;
                spin_unlock(&dentry->d_lock);
-                if (dentry->d_op && dentry->d_op->d_revalidate) {
+                if (dentry->d_op && dentry->d_op->d_revalidate)
-                        mutex_unlock(&dir->i_mutex);
                        (dentry->d_op->d_revalidate)(dentry, nd);
-                        mutex_lock(&dir->i_mutex);
+                mutex_lock(&dir->i_mutex);
-                }
        }
        /*
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index d2cf5a54a4b8..9adf5e4f7e96 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -8,7 +8,7 @@ btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
           extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
           extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
           ref-cache.o export.o tree-log.o acl.o free-space-cache.o zlib.o \
-           compression.o
+           compression.o delayed-ref.o
 else
 # Normal Makefile
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 72677ce2b74f..b30986f00b9d 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -66,6 +66,12 @@ struct btrfs_inode {
         */
        struct list_head delalloc_inodes;
+        /*
+         * list for tracking inodes that must be sent to disk before a
+         * rename or truncate commit
+         */
+        struct list_head ordered_operations;
        /* the space_info for where this inode's data allocations are done */
        struct btrfs_space_info *space_info;
@@ -86,12 +92,6 @@ struct btrfs_inode {
         */
        u64 logged_trans;
-        /*
-         * trans that last made a change that should be fully fsync'd.  This
-         * gets reset to zero each time the inode is logged
-         */
-        u64 log_dirty_trans;
        /* total number of bytes pending delalloc, used by stat to calc the
         * real block usage of the file
         */
@@ -121,6 +121,25 @@ struct btrfs_inode {
        /* the start of block group preferred for allocations. */
        u64 block_group;
+        /* the fsync log has some corner cases that mean we have to check
+         * directories to see if any unlinks have been done before
+         * the directory was logged.  See tree-log.c for all the
+         * details
+         */
+        u64 last_unlink_trans;
+        /*
+         * ordered_data_close is set by truncate when a file that used
+         * to have good data has been truncated to zero.  When it is set
+         * the btrfs file release call will add this inode to the
+         * ordered operations list so that we make sure to flush out any
+         * new data the application may have written before commit.
+         *
+         * yes, it's silly to have a single bitflag, but we might grow more
+         * of these.
+         */
+        unsigned ordered_data_close:1;
        struct inode vfs_inode;
 };
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 37f31b5529aa..dbb724124633 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -254,18 +254,13 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
 * empty_size -- a hint that you plan on doing more cow.  This is the size in
 * bytes the allocator should try to find free next to the block it returns.
 * This is just a hint and may be ignored by the allocator.
- *
- * prealloc_dest -- if you have already reserved a destination for the cow,
- * this uses that block instead of allocating a new one.
- * btrfs_alloc_reserved_extent is used to finish the allocation.
 */
 static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root,
                             struct extent_buffer *buf,
                             struct extent_buffer *parent, int parent_slot,
                             struct extent_buffer **cow_ret,
-                             u64 search_start, u64 empty_size,
+                             u64 search_start, u64 empty_size)
-                             u64 prealloc_dest)
 {
        u64 parent_start;
        struct extent_buffer *cow;
@@ -291,26 +286,10 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
        level = btrfs_header_level(buf);
        nritems = btrfs_header_nritems(buf);
-        if (prealloc_dest) {
+        cow = btrfs_alloc_free_block(trans, root, buf->len,
-                struct btrfs_key ins;
+                                     parent_start, root->root_key.objectid,
+                                     trans->transid, level,
-                ins.objectid = prealloc_dest;
+                                     search_start, empty_size);
-                ins.offset = buf->len;
-                ins.type = BTRFS_EXTENT_ITEM_KEY;
-                ret = btrfs_alloc_reserved_extent(trans, root, parent_start,
-                                                  root->root_key.objectid,
-                                                  trans->transid, level, &ins);
-                BUG_ON(ret);
-                cow = btrfs_init_new_buffer(trans, root, prealloc_dest,
-                                            buf->len, level);
-        } else {
-                cow = btrfs_alloc_free_block(trans, root, buf->len,
-                                             parent_start,
-                                             root->root_key.objectid,
-                                             trans->transid, level,
-                                             search_start, empty_size);
-        }
        if (IS_ERR(cow))
                return PTR_ERR(cow);
@@ -413,7 +392,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
                    struct btrfs_root *root, struct extent_buffer *buf,
                    struct extent_buffer *parent, int parent_slot,
-                    struct extent_buffer **cow_ret, u64 prealloc_dest)
+                    struct extent_buffer **cow_ret)
 {
        u64 search_start;
        int ret;
@@ -436,7 +415,6 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
            btrfs_header_owner(buf) == root->root_key.objectid &&
            !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
                *cow_ret = buf;
-                WARN_ON(prealloc_dest);
                return 0;
        }
@@ -447,8 +425,7 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
        btrfs_set_lock_blocking(buf);
        ret = __btrfs_cow_block(trans, root, buf, parent,
-                                 parent_slot, cow_ret, search_start, 0,
+                                 parent_slot, cow_ret, search_start, 0);
-                                 prealloc_dest);
        return ret;
 }
@@ -617,7 +594,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
                err = __btrfs_cow_block(trans, root, cur, parent, i,
                                        &cur, search_start,
                                        min(16 * blocksize,
-                                            (end_slot - i) * blocksize), 0);
+                                            (end_slot - i) * blocksize));
                if (err) {
                        btrfs_tree_unlock(cur);
                        free_extent_buffer(cur);
@@ -937,7 +914,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
                BUG_ON(!child);
                btrfs_tree_lock(child);
                btrfs_set_lock_blocking(child);
-                ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 0);
+                ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
                BUG_ON(ret);
                spin_lock(&root->node_lock);
@@ -945,6 +922,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
                spin_unlock(&root->node_lock);
                ret = btrfs_update_extent_ref(trans, root, child->start,
+                                              child->len,
                                              mid->start, child->start,
                                              root->root_key.objectid,
                                              trans->transid, level - 1);
@@ -971,6 +949,10 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
            BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
                return 0;
+        if (trans->transaction->delayed_refs.flushing &&
+            btrfs_header_nritems(mid) > 2)
+                return 0;
        if (btrfs_header_nritems(mid) < 2)
                err_on_enospc = 1;
@@ -979,7 +961,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
                btrfs_tree_lock(left);
                btrfs_set_lock_blocking(left);
                wret = btrfs_cow_block(trans, root, left,
-                                       parent, pslot - 1, &left, 0);
+                                       parent, pslot - 1, &left);
                if (wret) {
                        ret = wret;
                        goto enospc;
@@ -990,7 +972,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
                btrfs_tree_lock(right);
                btrfs_set_lock_blocking(right);
                wret = btrfs_cow_block(trans, root, right,
-                                       parent, pslot + 1, &right, 0);
+                                       parent, pslot + 1, &right);
                if (wret) {
                        ret = wret;
                        goto enospc;
@@ -1171,7 +1153,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
                        wret = 1;
                } else {
                        ret = btrfs_cow_block(trans, root, left, parent,
-                                              pslot - 1, &left, 0);
+                                              pslot - 1, &left);
                        if (ret)
                                wret = 1;
                        else {
@@ -1222,7 +1204,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
                } else {
                        ret = btrfs_cow_block(trans, root, right,
                                              parent, pslot + 1,
-                                              &right, 0);
+                                              &right);
                        if (ret)
                                wret = 1;
                        else {
@@ -1492,7 +1474,6 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
        u8 lowest_level = 0;
        u64 blocknr;
        u64 gen;
-        struct btrfs_key prealloc_block;
        lowest_level = p->lowest_level;
        WARN_ON(lowest_level && ins_len > 0);
@@ -1501,8 +1482,6 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
        if (ins_len < 0)
                lowest_unlock = 2;
-        prealloc_block.objectid = 0;
 again:
        if (p->skip_locking)
                b = btrfs_root_node(root);
@@ -1529,44 +1508,11 @@ again:
                            !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) {
                                goto cow_done;
                        }
-                        /* ok, we have to cow, is our old prealloc the right
-                         * size?
-                         */
-                        if (prealloc_block.objectid &&
-                            prealloc_block.offset != b->len) {
-                                btrfs_release_path(root, p);
-                                btrfs_free_reserved_extent(root,
-                                           prealloc_block.objectid,
-                                           prealloc_block.offset);
-                                prealloc_block.objectid = 0;
-                                goto again;
-                        }
-                        /*
-                         * for higher level blocks, try not to allocate blocks
-                         * with the block and the parent locks held.
-                         */
-                        if (level > 0 && !prealloc_block.objectid) {
-                                u32 size = b->len;
-                                u64 hint = b->start;
-                                btrfs_release_path(root, p);
-                                ret = btrfs_reserve_extent(trans, root,
-                                                           size, size, 0,
-                                                           hint, (u64)-1,
-                                                           &prealloc_block, 0);
-                                BUG_ON(ret);
-                                goto again;
-                        }
                        btrfs_set_path_blocking(p);
                        wret = btrfs_cow_block(trans, root, b,
                                               p->nodes[level + 1],
-                                               p->slots[level + 1],
+                                               p->slots[level + 1], &b);
-                                               &b, prealloc_block.objectid);
-                        prealloc_block.objectid = 0;
                        if (wret) {
                                free_extent_buffer(b);
                                ret = wret;
@@ -1742,12 +1688,8 @@ done:
         * we don't really know what they plan on doing with the path
         * from here on, so for now just mark it as blocking
         */
-        btrfs_set_path_blocking(p);
+        if (!p->leave_spinning)
-        if (prealloc_block.objectid) {
+                btrfs_set_path_blocking(p);
-                btrfs_free_reserved_extent(root,
-                           prealloc_block.objectid,
-                           prealloc_block.offset);
-        }
        return ret;
 }
@@ -1768,7 +1710,7 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans,
        int ret;
        eb = btrfs_lock_root_node(root);
-        ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb, 0);
+        ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb);
        BUG_ON(ret);
        btrfs_set_lock_blocking(eb);
@@ -1826,7 +1768,7 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans,
                        }
                        ret = btrfs_cow_block(trans, root, eb, parent, slot,
-                                              &eb, 0);
+                                              &eb);
                        BUG_ON(ret);
                        if (root->root_key.objectid ==
@@ -2139,7 +2081,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
        spin_unlock(&root->node_lock);
        ret = btrfs_update_extent_ref(trans, root, lower->start,
-                                      lower->start, c->start,
+                                      lower->len, lower->start, c->start,
                                      root->root_key.objectid,
                                      trans->transid, level - 1);
        BUG_ON(ret);
@@ -2221,7 +2163,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
                ret = insert_new_root(trans, root, path, level + 1);
                if (ret)
                        return ret;
-        } else {
+        } else if (!trans->transaction->delayed_refs.flushing) {
                ret = push_nodes_for_insert(trans, root, path, level);
                c = path->nodes[level];
                if (!ret && btrfs_header_nritems(c) <
@@ -2329,66 +2271,27 @@ noinline int btrfs_leaf_free_space(struct btrfs_root *root,
        return ret;
 }
-/*
+static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
- * push some data in the path leaf to the right, trying to free up at
+                                      struct btrfs_root *root,
- * least data_size bytes.  returns zero if the push worked, nonzero otherwise
+                                      struct btrfs_path *path,
- *
+                                      int data_size, int empty,
- * returns 1 if the push failed because the other node didn't have enough
+                                      struct extent_buffer *right,
- * room, 0 if everything worked out and < 0 if there were major errors.
+                                      int free_space, u32 left_nritems)
- */
-static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
-                           *root, struct btrfs_path *path, int data_size,
-                           int empty)
 {
        struct extent_buffer *left = path->nodes[0];
-        struct extent_buffer *right;
+        struct extent_buffer *upper = path->nodes[1];
-        struct extent_buffer *upper;
        struct btrfs_disk_key disk_key;
        int slot;
        u32 i;
-        int free_space;
        int push_space = 0;
        int push_items = 0;
        struct btrfs_item *item;
-        u32 left_nritems;
        u32 nr;
        u32 right_nritems;
        u32 data_end;
        u32 this_item_size;
        int ret;
-        slot = path->slots[1];
-        if (!path->nodes[1])
-                return 1;
-        upper = path->nodes[1];
-        if (slot >= btrfs_header_nritems(upper) - 1)
-                return 1;
-        btrfs_assert_tree_locked(path->nodes[1]);
-        right = read_node_slot(root, upper, slot + 1);
-        btrfs_tree_lock(right);
-        btrfs_set_lock_blocking(right);
-        free_space = btrfs_leaf_free_space(root, right);
-        if (free_space < data_size)
-                goto out_unlock;
-        /* cow and double check */
-        ret = btrfs_cow_block(trans, root, right, upper,
-                              slot + 1, &right, 0);
-        if (ret)
-                goto out_unlock;
-        free_space = btrfs_leaf_free_space(root, right);
-        if (free_space < data_size)
-                goto out_unlock;
-        left_nritems = btrfs_header_nritems(left);
-        if (left_nritems == 0)
-                goto out_unlock;
        if (empty)
                nr = 0;
        else
@@ -2397,6 +2300,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
        if (path->slots[0] >= left_nritems)
                push_space += data_size;
+        slot = path->slots[1];
        i = left_nritems - 1;
        while (i >= nr) {
                item = btrfs_item_nr(left, i);
@@ -2528,24 +2432,82 @@ out_unlock:
 }
 /*
+ * push some data in the path leaf to the right, trying to free up at
+ * least data_size bytes.  returns zero if the push worked, nonzero otherwise
+ *
+ * returns 1 if the push failed because the other node didn't have enough
+ * room, 0 if everything worked out and < 0 if there were major errors.
+ */
+static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
+                           *root, struct btrfs_path *path, int data_size,
+                           int empty)
+{
+        struct extent_buffer *left = path->nodes[0];
+        struct extent_buffer *right;
+        struct extent_buffer *upper;
+        int slot;
+        int free_space;
+        u32 left_nritems;
+        int ret;
+        if (!path->nodes[1])
+                return 1;
+        slot = path->slots[1];
+        upper = path->nodes[1];
+        if (slot >= btrfs_header_nritems(upper) - 1)
+                return 1;
+        btrfs_assert_tree_locked(path->nodes[1]);
+        right = read_node_slot(root, upper, slot + 1);
+        btrfs_tree_lock(right);
+        btrfs_set_lock_blocking(right);
+        free_space = btrfs_leaf_free_space(root, right);
+        if (free_space < data_size)
+                goto out_unlock;
+        /* cow and double check */
+        ret = btrfs_cow_block(trans, root, right, upper,
+                              slot + 1, &right);
+        if (ret)
+                goto out_unlock;
+        free_space = btrfs_leaf_free_space(root, right);
+        if (free_space < data_size)
+                goto out_unlock;
+        left_nritems = btrfs_header_nritems(left);
+        if (left_nritems == 0)
+                goto out_unlock;
+        return __push_leaf_right(trans, root, path, data_size, empty,
+                                right, free_space, left_nritems);
+out_unlock:
+        btrfs_tree_unlock(right);
+        free_extent_buffer(right);
+        return 1;
+}
+/*
 * push some data in the path leaf to the left, trying to free up at
 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
 */
-static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
+static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
-                          *root, struct btrfs_path *path, int data_size,
+                                     struct btrfs_root *root,
-                          int empty)
+                                     struct btrfs_path *path, int data_size,
+                                     int empty, struct extent_buffer *left,
+                                     int free_space, int right_nritems)
 {
        struct btrfs_disk_key disk_key;
        struct extent_buffer *right = path->nodes[0];
-        struct extent_buffer *left;
        int slot;
        int i;
-        int free_space;
        int push_space = 0;
        int push_items = 0;
        struct btrfs_item *item;
        u32 old_left_nritems;
-        u32 right_nritems;
        u32 nr;
        int ret = 0;
        int wret;
@@ -2553,41 +2515,6 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
        u32 old_left_item_size;
        slot = path->slots[1];
-        if (slot == 0)
-                return 1;
-        if (!path->nodes[1])
-                return 1;
-        right_nritems = btrfs_header_nritems(right);
-        if (right_nritems == 0)
-                return 1;
-        btrfs_assert_tree_locked(path->nodes[1]);
-        left = read_node_slot(root, path->nodes[1], slot - 1);
-        btrfs_tree_lock(left);
-        btrfs_set_lock_blocking(left);
-        free_space = btrfs_leaf_free_space(root, left);
-        if (free_space < data_size) {
-                ret = 1;
-                goto out;
-        }
-        /* cow and double check */
-        ret = btrfs_cow_block(trans, root, left,
-                              path->nodes[1], slot - 1, &left, 0);
-        if (ret) {
-                /* we hit -ENOSPC, but it isn't fatal here */
-                ret = 1;
-                goto out;
-        }
-        free_space = btrfs_leaf_free_space(root, left);
-        if (free_space < data_size) {
-                ret = 1;
-                goto out;
-        }
        if (empty)
                nr = right_nritems;
@@ -2755,6 +2682,154 @@ out:
 }
 /*
+ * push some data in the path leaf to the left, trying to free up at
+ * least data_size bytes.  returns zero if the push worked, nonzero otherwise
+ *
+ * This wrapper only does the checks and the setup.  It returns 1 right away
+ * when the leaf sits in slot 0 of its parent (nothing to the left of it),
+ * when the path has no parent node, or when the leaf has no items.
+ * Otherwise it reads and locks the left sibling, COWs it, and returns 1 if
+ * the sibling has less than data_size bytes free (checked both before and
+ * after the COW) or if the COW returns an error.  The items themselves are
+ * moved by __push_leaf_left(), which is handed the locked left sibling along
+ * with its free space and the item count of the right leaf.
+ *
+ * NOTE(review): on the __push_leaf_left() path this function does not
+ * unlock or free left itself, so the helper presumably does - confirm.
+ */
+static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
+                          *root, struct btrfs_path *path, int data_size,
+                          int empty)
+{
+        struct extent_buffer *right = path->nodes[0];
+        struct extent_buffer *left;
+        int slot;
+        int free_space;
+        u32 right_nritems;
+        int ret = 0;
+        slot = path->slots[1];
+        if (slot == 0)
+                return 1;
+        if (!path->nodes[1])
+                return 1;
+        right_nritems = btrfs_header_nritems(right);
+        if (right_nritems == 0)
+                return 1;
+        btrfs_assert_tree_locked(path->nodes[1]);
+        left = read_node_slot(root, path->nodes[1], slot - 1);
+        btrfs_tree_lock(left);
+        btrfs_set_lock_blocking(left);
+        free_space = btrfs_leaf_free_space(root, left);
+        if (free_space < data_size) {
+                ret = 1;
+                goto out;
+        }
+        /* cow and double check */
+        ret = btrfs_cow_block(trans, root, left,
+                              path->nodes[1], slot - 1, &left);
+        if (ret) {
+                /* we hit -ENOSPC, but it isn't fatal here */
+                ret = 1;
+                goto out;
+        }
+        free_space = btrfs_leaf_free_space(root, left);
+        if (free_space < data_size) {
+                ret = 1;
+                goto out;
+        }
+        return __push_leaf_left(trans, root, path, data_size,
+                               empty, left, free_space, right_nritems);
+out:    /* early exit: unlock the left sibling and release the buffer */
+        btrfs_tree_unlock(left);
+        free_extent_buffer(left);
+        return ret;
+}
+/*
+ * helper for split_leaf: move the items from slot mid to the end of leaf l
+ * into the new leaf right, and link right into the tree.
+ *
+ * The item headers and the item data for the slots from mid onwards are
+ * copied from l to right, each copied item's data offset is shifted by
+ * rt_data_off, and l's item count is cut down to mid.  The key of right's
+ * first item is then passed to insert_ptr() together with right->start and
+ * slot path->slots[1] + 1.  If mid <= slot the path is switched over to
+ * right (the buffer in path->nodes[0] is unlocked and released, and the
+ * slots are adjusted); otherwise right is unlocked and released.
+ *
+ * nritems is the item count of l on entry.
+ *
+ * returns the result of btrfs_update_ref(), which is BUG_ON()ed to be zero.
+ * NOTE(review): an error from insert_ptr() is stored in ret but is then
+ * overwritten by the btrfs_update_ref() result, so it is never returned.
+ */
+static noinline int copy_for_split(struct btrfs_trans_handle *trans,
+                               struct btrfs_root *root,
+                               struct btrfs_path *path,
+                               struct extent_buffer *l,
+                               struct extent_buffer *right,
+                               int slot, int mid, int nritems)
+{
+        int data_copy_size;
+        int rt_data_off;
+        int i;
+        int ret = 0;
+        int wret;
+        struct btrfs_disk_key disk_key;
+        nritems = nritems - mid; /* number of items that go to right */
+        btrfs_set_header_nritems(right, nritems);
+        data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);
+        copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
+                           btrfs_item_nr_offset(mid),
+                           nritems * sizeof(struct btrfs_item));
+        copy_extent_buffer(right, l,
+                     btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
+                     data_copy_size, btrfs_leaf_data(l) +
+                     leaf_data_end(root, l), data_copy_size);
+        rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
+                      btrfs_item_end_nr(l, mid);
+        for (i = 0; i < nritems; i++) {
+                struct btrfs_item *item = btrfs_item_nr(right, i);
+                u32 ioff;
+                if (!right->map_token) {
+                        map_extent_buffer(right, (unsigned long)item,
+                                        sizeof(struct btrfs_item),
+                                        &right->map_token, &right->kaddr,
+                                        &right->map_start, &right->map_len,
+                                        KM_USER1);
+                }
+                ioff = btrfs_item_offset(right, item);
+                btrfs_set_item_offset(right, item, ioff + rt_data_off);
+        }
+        if (right->map_token) {
+                unmap_extent_buffer(right, right->map_token, KM_USER1);
+                right->map_token = NULL;
+        }
+        btrfs_set_header_nritems(l, mid);
+        ret = 0;
+        btrfs_item_key(right, &disk_key, 0);
+        wret = insert_ptr(trans, root, path, &disk_key, right->start,
+                          path->slots[1] + 1, 1);
+        if (wret)
+                ret = wret;
+        btrfs_mark_buffer_dirty(right);
+        btrfs_mark_buffer_dirty(l);
+        BUG_ON(path->slots[0] != slot);
+        ret = btrfs_update_ref(trans, root, l, right, 0, nritems);
+        BUG_ON(ret);
+        if (mid <= slot) {
+                btrfs_tree_unlock(path->nodes[0]);
+                free_extent_buffer(path->nodes[0]);
+                path->nodes[0] = right;
+                path->slots[0] -= mid;
+                path->slots[1] += 1;
+        } else {
+                btrfs_tree_unlock(right);
+                free_extent_buffer(right);
+        }
+        BUG_ON(path->slots[0] < 0);
+        return ret;
+}
+/*
 * split the path's leaf in two, making sure there is at least data_size
 * available for the resulting leaf level of the path.
 *
@@ -2771,17 +2846,14 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
        int mid;
        int slot;
        struct extent_buffer *right;
-        int data_copy_size;
-        int rt_data_off;
-        int i;
        int ret = 0;
        int wret;
        int double_split;
        int num_doubles = 0;
-        struct btrfs_disk_key disk_key;
        /* first try to make some room by pushing left and right */
-        if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) {
+        if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY &&
+            !trans->transaction->delayed_refs.flushing) {
                wret = push_leaf_right(trans, root, path, data_size, 0);
                if (wret < 0)
                        return wret;
@@ -2830,11 +2902,14 @@ again:
        write_extent_buffer(right, root->fs_info->chunk_tree_uuid,
                            (unsigned long)btrfs_header_chunk_tree_uuid(right),
                            BTRFS_UUID_SIZE);
        if (mid <= slot) {
                if (nritems == 1 ||
                    leaf_space_used(l, mid, nritems - mid) + data_size >
                        BTRFS_LEAF_DATA_SIZE(root)) {
                        if (slot >= nritems) {
+                                struct btrfs_disk_key disk_key;
                                btrfs_cpu_key_to_disk(&disk_key, ins_key);
                                btrfs_set_header_nritems(right, 0);
                                wret = insert_ptr(trans, root, path,
@@ -2862,6 +2937,8 @@ again:
                if (leaf_space_used(l, 0, mid) + data_size >
                        BTRFS_LEAF_DATA_SIZE(root)) {
                        if (!extend && data_size && slot == 0) {
+                                struct btrfs_disk_key disk_key;
                                btrfs_cpu_key_to_disk(&disk_key, ins_key);
                                btrfs_set_header_nritems(right, 0);
                                wret = insert_ptr(trans, root, path,
@@ -2894,76 +2971,16 @@ again:
                        }
                }
        }
-        nritems = nritems - mid;
-        btrfs_set_header_nritems(right, nritems);
-        data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);
-        copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
-                           btrfs_item_nr_offset(mid),
-                           nritems * sizeof(struct btrfs_item));
-        copy_extent_buffer(right, l,
-                     btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
-                     data_copy_size, btrfs_leaf_data(l) +
-                     leaf_data_end(root, l), data_copy_size);
-        rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
-                      btrfs_item_end_nr(l, mid);
-        for (i = 0; i < nritems; i++) {
-                struct btrfs_item *item = btrfs_item_nr(right, i);
-                u32 ioff;
-                if (!right->map_token) {
-                        map_extent_buffer(right, (unsigned long)item,
-                                        sizeof(struct btrfs_item),
-                                        &right->map_token, &right->kaddr,
-                                        &right->map_start, &right->map_len,
-                                        KM_USER1);
-                }
-                ioff = btrfs_item_offset(right, item);
-                btrfs_set_item_offset(right, item, ioff + rt_data_off);
-        }
-        if (right->map_token) {
-                unmap_extent_buffer(right, right->map_token, KM_USER1);
-                right->map_token = NULL;
-        }
-        btrfs_set_header_nritems(l, mid);
-        ret = 0;
-        btrfs_item_key(right, &disk_key, 0);
-        wret = insert_ptr(trans, root, path, &disk_key, right->start,
-                          path->slots[1] + 1, 1);
-        if (wret)
-                ret = wret;
-        btrfs_mark_buffer_dirty(right);
-        btrfs_mark_buffer_dirty(l);
-        BUG_ON(path->slots[0] != slot);
-        ret = btrfs_update_ref(trans, root, l, right, 0, nritems);
+        ret = copy_for_split(trans, root, path, l, right, slot, mid, nritems);
        BUG_ON(ret);
-        if (mid <= slot) {
-                btrfs_tree_unlock(path->nodes[0]);
-                free_extent_buffer(path->nodes[0]);
-                path->nodes[0] = right;
-                path->slots[0] -= mid;
-                path->slots[1] += 1;
-        } else {
-                btrfs_tree_unlock(right);
-                free_extent_buffer(right);
-        }
-        BUG_ON(path->slots[0] < 0);
        if (double_split) {
                BUG_ON(num_doubles != 0);
                num_doubles++;
                goto again;
        }
        return ret;
 }
@@ -3021,26 +3038,27 @@ int btrfs_split_item(struct btrfs_trans_handle *trans,
                return -EAGAIN;
        }
+        btrfs_set_path_blocking(path);
        ret = split_leaf(trans, root, &orig_key, path,
                         sizeof(struct btrfs_item), 1);
        path->keep_locks = 0;
        BUG_ON(ret);
+        btrfs_unlock_up_safe(path, 1);
+        leaf = path->nodes[0];
+        BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
+split:
        /*
         * make sure any changes to the path from split_leaf leave it
         * in a blocking state
         */
        btrfs_set_path_blocking(path);
-        leaf = path->nodes[0];
-        BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
-split:
        item = btrfs_item_nr(leaf, path->slots[0]);
        orig_offset = btrfs_item_offset(leaf, item);
        item_size = btrfs_item_size(leaf, item);
        buf = kmalloc(item_size, GFP_NOFS);
        read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
                            path->slots[0]), item_size);
@@ -3445,39 +3463,27 @@ out:
 }
 /*
- * Given a key and some data, insert items into the tree.
+ * this is a helper for btrfs_insert_empty_items, the main goal here is
- * This does all the path init required, making room in the tree if needed.
+ * to save stack depth by doing the bulk of the work in a function
+ * that doesn't call btrfs_search_slot
 */
-int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
+static noinline_for_stack int
-                            struct btrfs_root *root,
+setup_items_for_insert(struct btrfs_trans_handle *trans,
-                            struct btrfs_path *path,
+                      struct btrfs_root *root, struct btrfs_path *path,
-                            struct btrfs_key *cpu_key, u32 *data_size,
+                      struct btrfs_key *cpu_key, u32 *data_size,
-                            int nr)
+                      u32 total_data, u32 total_size, int nr)
 {
-        struct extent_buffer *leaf;
        struct btrfs_item *item;
-        int ret = 0;
-        int slot;
-        int slot_orig;
        int i;
        u32 nritems;
-        u32 total_size = 0;
-        u32 total_data = 0;
        unsigned int data_end;
        struct btrfs_disk_key disk_key;
+        int ret;
+        struct extent_buffer *leaf;
+        int slot;
-        for (i = 0; i < nr; i++)
-                total_data += data_size[i];
-        total_size = total_data + (nr * sizeof(struct btrfs_item));
-        ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
-        if (ret == 0)
-                return -EEXIST;
-        if (ret < 0)
-                goto out;
-        slot_orig = path->slots[0];
        leaf = path->nodes[0];
+        slot = path->slots[0];
        nritems = btrfs_header_nritems(leaf);
        data_end = leaf_data_end(root, leaf);
@@ -3489,9 +3495,6 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
                BUG();
        }
-        slot = path->slots[0];
-        BUG_ON(slot < 0);
        if (slot != nritems) {
                unsigned int old_data = btrfs_item_end_nr(leaf, slot);
@@ -3547,21 +3550,60 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
                data_end -= data_size[i];
                btrfs_set_item_size(leaf, item, data_size[i]);
        }
        btrfs_set_header_nritems(leaf, nritems + nr);
-        btrfs_mark_buffer_dirty(leaf);
        ret = 0;
        if (slot == 0) {
+                struct btrfs_disk_key disk_key;
                btrfs_cpu_key_to_disk(&disk_key, cpu_key);
                ret = fixup_low_keys(trans, root, path, &disk_key, 1);
        }
+        btrfs_unlock_up_safe(path, 1);
+        btrfs_mark_buffer_dirty(leaf);
        if (btrfs_leaf_free_space(root, leaf) < 0) {
                btrfs_print_leaf(root, leaf);
                BUG();
        }
+        return ret;
+}
+/*
+ * Given a key and some data, insert items into the tree.
+ * This does all the path init required, making room in the tree if needed.
+ *
+ * data_size is an array of nr item data sizes.  Their sum plus
+ * nr * sizeof(struct btrfs_item) is what btrfs_search_slot() is asked
+ * for when looking up cpu_key.
+ *
+ * returns -EEXIST when btrfs_search_slot() returns 0, a negative value when
+ * the search itself fails, and otherwise whatever setup_items_for_insert()
+ * returns.  leaf is assigned from the path here but not otherwise used in
+ * this function; slot is only read by the BUG_ON.
+ */
+int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
+                            struct btrfs_root *root,
+                            struct btrfs_path *path,
+                            struct btrfs_key *cpu_key, u32 *data_size,
+                            int nr)
+{
+        struct extent_buffer *leaf;
+        int ret = 0;
+        int slot;
+        int i;
+        u32 total_size = 0;
+        u32 total_data = 0;
+        for (i = 0; i < nr; i++)
+                total_data += data_size[i];
+        total_size = total_data + (nr * sizeof(struct btrfs_item));
+        ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
+        if (ret == 0)
+                return -EEXIST;
+        if (ret < 0)
+                goto out;
+        leaf = path->nodes[0];
+        slot = path->slots[0];
+        BUG_ON(slot < 0);
+        ret = setup_items_for_insert(trans, root, path, cpu_key, data_size,
+                               total_data, total_size, nr);
 out:
-        btrfs_unlock_up_safe(path, 1);
         return ret;
 }
@@ -3749,7 +3791,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                }
                /* delete the leaf if it is mostly empty */
-                if (used < BTRFS_LEAF_DATA_SIZE(root) / 4) {
+                if (used < BTRFS_LEAF_DATA_SIZE(root) / 4 &&
+                    !trans->transaction->delayed_refs.flushing) {
                        /* push_leaf_left fixes the path.
                         * make sure the path still points to our leaf
                         * for possible call to del_ptr below
@@ -3757,6 +3800,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                        slot = path->slots[1];
                        extent_buffer_get(leaf);
+                        btrfs_set_path_blocking(path);
                        wret = push_leaf_left(trans, root, path, 1, 1);
                        if (wret < 0 && wret != -ENOSPC)
                                ret = wret;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 5e1d4e30e9d8..9417713542a2 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -45,6 +45,13 @@ struct btrfs_ordered_sum;
 #define BTRFS_MAX_LEVEL 8
+/*
+ * files bigger than this get some pre-flushing when they are added
+ * to the ordered operations list.  That way we limit the total
+ * work done by the commit
+ */
+#define BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT (8 * 1024 * 1024)
 /* holds pointers to all of the tree roots */
 #define BTRFS_ROOT_TREE_OBJECTID 1ULL
@@ -401,15 +408,16 @@ struct btrfs_path {
        int locks[BTRFS_MAX_LEVEL];
        int reada;
        /* keep some upper locks as we walk down */
-        int keep_locks;
-        int skip_locking;
        int lowest_level;
        /*
         * set by btrfs_split_item, tells search_slot to keep all locks
         * and to force calls to keep space in the nodes
         */
-        int search_for_split;
+        unsigned int search_for_split:1;
+        unsigned int keep_locks:1;
+        unsigned int skip_locking:1;
+        unsigned int leave_spinning:1;
 };
 /*
@@ -688,15 +696,18 @@ struct btrfs_fs_info {
        struct rb_root block_group_cache_tree;
        struct extent_io_tree pinned_extents;
-        struct extent_io_tree pending_del;
-        struct extent_io_tree extent_ins;
        /* logical->physical extent mapping */
        struct btrfs_mapping_tree mapping_tree;
        u64 generation;
        u64 last_trans_committed;
-        u64 last_trans_new_blockgroup;
+        /*
+         * this is updated to the current trans every time a full commit
+         * is required instead of the faster short fsync log commits
+         */
+        u64 last_trans_log_full_commit;
        u64 open_ioctl_trans;
        unsigned long mount_opt;
        u64 max_extent;
@@ -717,12 +728,21 @@ struct btrfs_fs_info {
        struct mutex tree_log_mutex;
        struct mutex transaction_kthread_mutex;
        struct mutex cleaner_mutex;
-        struct mutex extent_ins_mutex;
        struct mutex pinned_mutex;
        struct mutex chunk_mutex;
        struct mutex drop_mutex;
        struct mutex volume_mutex;
        struct mutex tree_reloc_mutex;
+        /*
+         * this protects the ordered operations list only while we are
+         * processing all of the entries on it.  This way we make
+         * sure the commit code doesn't find the list temporarily empty
+         * because another function happens to be doing non-waiting preflush
+         * before jumping into the main commit.
+         */
+        struct mutex ordered_operations_mutex;
        struct list_head trans_list;
        struct list_head hashers;
        struct list_head dead_roots;
@@ -737,10 +757,29 @@ struct btrfs_fs_info {
         * ordered extents
         */
        spinlock_t ordered_extent_lock;
+        /*
+         * all of the data=ordered extents pending writeback
+         * these can span multiple transactions and basically include
+         * every dirty data page that isn't from nodatacow
+         */
        struct list_head ordered_extents;
+        /*
+         * all of the inodes that have delalloc bytes.  It is possible for
+         * this list to be empty even when there is still dirty data=ordered
+         * extents waiting to finish IO.
+         */
        struct list_head delalloc_inodes;
        /*
+         * special rename and truncate targets that must be on disk before
+         * we're allowed to commit.  This is basically the ext3 style
+         * data=ordered list.
+         */
+        struct list_head ordered_operations;
+        /*
         * there is a pool of worker threads for checksumming during writes
         * and a pool for checksumming after reads.  This is because readers
         * can run with FS locks held, and the writers may be waiting for
@@ -781,6 +820,11 @@ struct btrfs_fs_info {
        atomic_t throttle_gen;
        u64 total_pinned;
+        /* protected by the delalloc lock, used to keep from writing
+         * metadata until there is a nice batch
+         */
+        u64 dirty_metadata_bytes;
        struct list_head dirty_cowonly_roots;
        struct btrfs_fs_devices *fs_devices;
@@ -1704,18 +1748,15 @@ static inline struct dentry *fdentry(struct file *file)
 }
 /* extent-tree.c */
+int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
+                           struct btrfs_root *root, unsigned long count);
 int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
-int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
-                            struct btrfs_root *root, u64 bytenr,
-                            u64 num_bytes, u32 *refs);
 int btrfs_update_pinned_extents(struct btrfs_root *root,
                                u64 bytenr, u64 num, int pin);
 int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
                        struct btrfs_root *root, struct extent_buffer *leaf);
 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
                          struct btrfs_root *root, u64 objectid, u64 bytenr);
-int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
-                         struct btrfs_root *root);
 int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy);
 struct btrfs_block_group_cache *btrfs_lookup_block_group(
                                                 struct btrfs_fs_info *info,
@@ -1777,7 +1818,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
                         u64 root_objectid, u64 ref_generation,
                         u64 owner_objectid);
 int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
-                            struct btrfs_root *root, u64 bytenr,
+                            struct btrfs_root *root, u64 bytenr, u64 num_bytes,
                            u64 orig_parent, u64 parent,
                            u64 root_objectid, u64 ref_generation,
                            u64 owner_objectid);
@@ -1838,7 +1879,7 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
 int btrfs_cow_block(struct btrfs_trans_handle *trans,
                    struct btrfs_root *root, struct extent_buffer *buf,
                    struct extent_buffer *parent, int parent_slot,
-                    struct extent_buffer **cow_ret, u64 prealloc_dest);
+                    struct extent_buffer **cow_ret);
 int btrfs_copy_root(struct btrfs_trans_handle *trans,
                      struct btrfs_root *root,
                      struct extent_buffer *buf,
@@ -2060,7 +2101,7 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
 unsigned long btrfs_force_ra(struct address_space *mapping,
                              struct file_ra_state *ra, struct file *file,
                              pgoff_t offset, pgoff_t last_index);
-int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page);
+int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 int btrfs_readpage(struct file *file, struct page *page);
 void btrfs_delete_inode(struct inode *inode);
 void btrfs_put_inode(struct inode *inode);
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
new file mode 100644
index 000000000000..cbf7dc8ae3ec
--- /dev/null
+++ b/fs/btrfs/delayed-ref.c
@@ -0,0 +1,669 @@
+/*
+ * Copyright (C) 2009 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+#include <linux/sched.h>
+#include <linux/sort.h>
+#include <linux/ftrace.h>
+#include "ctree.h"
+#include "delayed-ref.h"
+#include "transaction.h"
+/*
+ * delayed back reference update tracking.  For subvolume trees
+ * we queue up extent allocations and backref maintenance for
+ * delayed processing.   This avoids deep call chains where we
+ * add extents in the middle of btrfs_search_slot, and it allows
+ * us to buffer up frequently modified backrefs in an rb tree instead
+ * of hammering updates on the extent allocation tree.
+ *
+ * Right now this code is only used for reference counted trees, but
+ * the long term goal is to get rid of the similar code for delayed
+ * extent tree modifications.
+ */
+/*
+ * entries in the rb tree are ordered by the byte number of the extent
+ * and by the byte number of the parent block.
+ *
+ * returns -1 if the (bytenr, parent) pair passed in sorts before the pair
+ * stored in ref, 1 if it sorts after, and 0 if both fields are equal.
+ * bytenr is compared first; parent is only looked at when the bytenrs match.
+ */
+static int comp_entry(struct btrfs_delayed_ref_node *ref,
+                      u64 bytenr, u64 parent)
+{
+        if (bytenr < ref->bytenr)
+                return -1;
+        if (bytenr > ref->bytenr)
+                return 1;
+        if (parent < ref->parent)
+                return -1;
+        if (parent > ref->parent)
+                return 1;
+        return 0;
+}
+/*
+ * insert a new ref into the rbtree.  This returns any existing refs
+ * for the same (bytenr,parent) tuple, or NULL if the new node was properly
+ * inserted.
+ *
+ * node is the rb_node embedded in the struct btrfs_delayed_ref_node being
+ * added, and (bytenr, parent) is the key used to find its position with
+ * comp_entry().  When a matching entry is already present the tree is left
+ * unmodified and that entry is returned.
+ *
+ * NOTE(review): the assignment to entry just before rb_link_node() is never
+ * read afterwards.
+ */
+static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root,
+                                                  u64 bytenr, u64 parent,
+                                                  struct rb_node *node)
+{
+        struct rb_node **p = &root->rb_node;
+        struct rb_node *parent_node = NULL;
+        struct btrfs_delayed_ref_node *entry;
+        int cmp;
+        while (*p) {
+                parent_node = *p;
+                entry = rb_entry(parent_node, struct btrfs_delayed_ref_node,
+                                 rb_node);
+                cmp = comp_entry(entry, bytenr, parent);
+                if (cmp < 0)
+                        p = &(*p)->rb_left;
+                else if (cmp > 0)
+                        p = &(*p)->rb_right;
+                else
+                        return entry;
+        }
+        entry = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
+        rb_link_node(node, parent_node, p);
+        rb_insert_color(node, root);
+        return NULL;
+}
+/*
+ * find an entry based on (bytenr,parent).  This returns the delayed
+ * ref if it was able to find one, or NULL if nothing was in that spot
+ *
+ * if last is non-NULL, *last is set to each entry visited on the way down,
+ * so after a miss it holds the final node that was examined.  It is left
+ * untouched when the tree is empty.  A WARN_ON fires for any visited entry
+ * whose in_tree field is zero.
+ */
+static struct btrfs_delayed_ref_node *tree_search(struct rb_root *root,
+                                  u64 bytenr, u64 parent,
+                                  struct btrfs_delayed_ref_node **last)
+{
+        struct rb_node *n = root->rb_node;
+        struct btrfs_delayed_ref_node *entry;
+        int cmp;
+        while (n) {
+                entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
+                WARN_ON(!entry->in_tree);
+                if (last)
+                        *last = entry;
+                cmp = comp_entry(entry, bytenr, parent);
+                if (cmp < 0)
+                        n = n->rb_left;
+                else if (cmp > 0)
+                        n = n->rb_right;
+                else
+                        return entry;
+        }
+        return NULL;
+}
+int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
+                           struct btrfs_delayed_ref_head *head)
+{       /*
+         * Take head->mutex.  The caller must hold the delayed_refs
+         * spinlock of the running transaction (asserted below), and it
+         * is held again on every return path.
+         *
+         * If the mutex can be taken without sleeping this returns 0 right
+         * away.  Otherwise a reference is taken on the head's node, the
+         * spinlock is dropped while we block on the mutex, and the
+         * spinlock is taken again afterwards.  The reference is
+         * presumably what keeps the head allocated while the spinlock is
+         * not held - confirm.
+         *
+         * returns 0 with head->mutex held, or -EAGAIN with the mutex
+         * released if the head's in_tree field was found to be zero
+         * after the sleep.
+         */
+        struct btrfs_delayed_ref_root *delayed_refs;
+        delayed_refs = &trans->transaction->delayed_refs;
+        assert_spin_locked(&delayed_refs->lock);
+        if (mutex_trylock(&head->mutex))
+                return 0;
+        atomic_inc(&head->node.refs);
+        spin_unlock(&delayed_refs->lock);
+        mutex_lock(&head->mutex);
+        spin_lock(&delayed_refs->lock);
+        if (!head->node.in_tree) {
+                mutex_unlock(&head->mutex);
+                btrfs_put_delayed_ref(&head->node);
+                return -EAGAIN;
+        }
+        btrfs_put_delayed_ref(&head->node);
+        return 0;
+}
+int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
+                           struct list_head *cluster, u64 start)
+{       /*
+         * Collect up to 32 delayed ref heads from the transaction's
+         * delayed ref rbtree onto the cluster list.
+         *
+         * With start == 0 the scan begins at the first node of the tree.
+         * Otherwise tree_search() is called for (start, (u64)-1) and the
+         * scan begins at the last node it visited, moved backwards for
+         * as long as the preceding node's bytenr is still >= start.  If
+         * the search visited nothing the scan begins at the first node.
+         *
+         * Only head nodes whose ->cluster list entry is empty are added.
+         * For each head added, run_delayed_start is set to its bytenr and
+         * num_heads_ready is decremented.
+         *
+         * returns 0 if at least one head was added and 1 if none were.
+         * When start was nonzero, 1 is only returned after the scan has
+         * been repeated from the first node of the tree.
+         *
+         * NOTE(review): no lock is taken here; presumably the caller
+         * holds delayed_refs->lock - confirm.
+         */
+        int count = 0;
+        struct btrfs_delayed_ref_root *delayed_refs;
+        struct rb_node *node;
+        struct btrfs_delayed_ref_node *ref;
+        struct btrfs_delayed_ref_head *head;
+        delayed_refs = &trans->transaction->delayed_refs;
+        if (start == 0) {
+                node = rb_first(&delayed_refs->root);
+        } else {
+                ref = NULL;
+                tree_search(&delayed_refs->root, start, (u64)-1, &ref);
+                if (ref) {
+                        struct btrfs_delayed_ref_node *tmp;
+                        node = rb_prev(&ref->rb_node);
+                        while (node) {
+                                tmp = rb_entry(node,
+                                               struct btrfs_delayed_ref_node,
+                                               rb_node);
+                                if (tmp->bytenr < start)
+                                        break;
+                                ref = tmp;
+                                node = rb_prev(&ref->rb_node);
+                        }
+                        node = &ref->rb_node;
+                } else
+                        node = rb_first(&delayed_refs->root);
+        }
+again:
+        while (node && count < 32) {
+                ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
+                if (btrfs_delayed_ref_is_head(ref)) {
+                        head = btrfs_delayed_node_to_head(ref);
+                        if (list_empty(&head->cluster)) {
+                                list_add_tail(&head->cluster, cluster);
+                                delayed_refs->run_delayed_start =
+                                        head->node.bytenr;
+                                count++;
+                                WARN_ON(delayed_refs->num_heads_ready == 0);
+                                delayed_refs->num_heads_ready--;
+                        } else if (count) {
+                                /* the goal of the clustering is to find extents
+                                 * that are likely to end up in the same extent
+                                 * leaf on disk.  So, we don't want them spread
+                                 * all over the tree.  Stop now if we've hit
+                                 * a head that was already in use
+                                 */
+                                break;
+                        }
+                }
+                node = rb_next(node);
+        }
+        if (count) {
+                return 0;
+        } else if (start) {
+                /*
+                 * we've gone to the end of the rbtree without finding any
+                 * clusters.  start from the beginning and try again
+                 */
+                start = 0;
+                node = rb_first(&delayed_refs->root);
+                goto again;
+        }
+        return 1;
+}
+/*
+ * This checks to see if there are any delayed refs in the
+ * btree for a given bytenr.  It returns one if it finds any
+ * and zero otherwise.
+ *
+ * If it only finds a head node, it returns 0.
+ *
+ * The idea is to use this when deciding if you can safely delete an
+ * extent from the extent allocation tree.  There may be a pending
+ * ref in the rbtree that adds or removes references, so as long as this
+ * returns one you need to leave the BTRFS_EXTENT_ITEM in the extent
+ * allocation tree.
+ *
+ * The lookup runs under delayed_refs->lock.  It searches for the exact key
+ * (bytenr, (u64)-1) and, when that entry exists, looks at the node just
+ * before it in the rbtree; one is returned only when that previous node has
+ * the same bytenr.  Zero is also returned when the exact key is not found.
+ *
+ * NOTE(review): presumably (bytenr, (u64)-1) is the key of the head node,
+ * which would make it sort after every other ref for the extent - confirm.
+ */
+int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr)
+{
+        struct btrfs_delayed_ref_node *ref;
+        struct btrfs_delayed_ref_root *delayed_refs;
+        struct rb_node *prev_node;
+        int ret = 0;
+        delayed_refs = &trans->transaction->delayed_refs;
+        spin_lock(&delayed_refs->lock);
+        ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL);
+        if (ref) {
+                prev_node = rb_prev(&ref->rb_node);
+                if (!prev_node)
+                        goto out;
+                ref = rb_entry(prev_node, struct btrfs_delayed_ref_node,
+                               rb_node);
+                if (ref->bytenr == bytenr)
+                        ret = 1;
+        }
+out:
+        spin_unlock(&delayed_refs->lock);
+        return ret;
+}
+/*
+ * helper function to lookup reference count
+ *
+ * the head node for delayed ref is used to store the sum of all the
+ * reference count modifications queued up in the rbtree.  This way you
+ * can check to see what the reference count would be if all of the
+ * delayed refs are processed.
+ *
+ * The on-disk count is read from the extent item keyed by
+ * (bytenr, BTRFS_EXTENT_ITEM_KEY, num_bytes) in the extent root, or taken
+ * as zero when no such item exists.  The head node's ref_mod is then
+ * added on top, if a head is queued for this bytenr.
+ *
+ * Returns 0 with *refs filled in on success.  Returns -ENOMEM if no path
+ * could be allocated, or the negative error from btrfs_search_slot();
+ * *refs is left untouched in both error cases.
+ */
+int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
+                            struct btrfs_root *root, u64 bytenr,
+                            u64 num_bytes, u32 *refs)
+{
+        struct btrfs_delayed_ref_node *ref;
+        struct btrfs_delayed_ref_head *head;
+        struct btrfs_delayed_ref_root *delayed_refs;
+        struct btrfs_path *path;
+        struct extent_buffer *leaf;
+        struct btrfs_extent_item *ei;
+        struct btrfs_key key;
+        u32 num_refs;
+        int ret;
+        path = btrfs_alloc_path();
+        if (!path)
+                return -ENOMEM;
+        key.objectid = bytenr;
+        key.type = BTRFS_EXTENT_ITEM_KEY;
+        key.offset = num_bytes;
+        delayed_refs = &trans->transaction->delayed_refs;
+again:
+        ret = btrfs_search_slot(trans, root->fs_info->extent_root,
+                                &key, path, 0, 0);
+        /*
+         * the delayed ref spinlock has not been taken yet, so the error
+         * exit must skip the unlock
+         */
+        if (ret < 0)
+                goto out_free;
+        if (ret == 0) {
+                leaf = path->nodes[0];
+                ei = btrfs_item_ptr(leaf, path->slots[0],
+                                    struct btrfs_extent_item);
+                num_refs = btrfs_extent_refs(leaf, ei);
+        } else {
+                /* no extent item on disk, start counting from zero */
+                num_refs = 0;
+                ret = 0;
+        }
+        spin_lock(&delayed_refs->lock);
+        ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL);
+        if (ref) {
+                head = btrfs_delayed_node_to_head(ref);
+                if (mutex_trylock(&head->mutex)) {
+                        num_refs += ref->ref_mod;
+                        mutex_unlock(&head->mutex);
+                        *refs = num_refs;
+                        goto out;
+                }
+                /*
+                 * somebody else holds the head mutex.  Pin the head, drop
+                 * the spinlock and the path, wait for the mutex holder to
+                 * finish and then redo the whole lookup.
+                 */
+                atomic_inc(&ref->refs);
+                spin_unlock(&delayed_refs->lock);
+                btrfs_release_path(root->fs_info->extent_root, path);
+                mutex_lock(&head->mutex);
+                mutex_unlock(&head->mutex);
+                btrfs_put_delayed_ref(ref);
+                goto again;
+        } else {
+                *refs = num_refs;
+        }
+out:
+        spin_unlock(&delayed_refs->lock);
+out_free:
+        btrfs_free_path(path);
+        return ret;
+}
+/*
+ * Merge a freshly built delayed ref into the entry that already sits in
+ * the rbtree.  existing and update must both have the same bytenr and
+ * parent.
+ *
+ * When the update undoes the existing operation and the modification
+ * count reaches zero, existing is removed from the rbtree and its
+ * reference is dropped, which may free it.  The caller must not touch
+ * existing after this returns.
+ */
+static noinline void
+update_existing_ref(struct btrfs_trans_handle *trans,
+                    struct btrfs_delayed_ref_root *delayed_refs,
+                    struct btrfs_delayed_ref_node *existing,
+                    struct btrfs_delayed_ref_node *update)
+{
+        struct btrfs_delayed_ref *queued;
+        struct btrfs_delayed_ref *incoming;
+        queued = btrfs_delayed_node_to_ref(existing);
+        incoming = btrfs_delayed_node_to_ref(update);
+        /* a pin request is sticky once any update asked for it */
+        if (incoming->pin)
+                queued->pin = 1;
+        if (incoming->action != queued->action) {
+                /*
+                 * opposite actions: this effectively undoes an add or a
+                 * drop.  Lower the ref_mod, and once it hits zero delete
+                 * the entry without ever changing the extent allocation
+                 * tree.
+                 */
+                existing->ref_mod--;
+                if (existing->ref_mod != 0)
+                        return;
+                rb_erase(&existing->rb_node, &delayed_refs->root);
+                existing->in_tree = 0;
+                btrfs_put_delayed_ref(existing);
+                delayed_refs->num_entries--;
+                if (trans->delayed_ref_updates)
+                        trans->delayed_ref_updates--;
+                return;
+        }
+        if (queued->action == BTRFS_ADD_DELAYED_REF) {
+                /*
+                 * when adding refs, take all the details from the
+                 * update.  The extent could have been totally freed and
+                 * reallocated by a different owner before the delayed
+                 * ref entries were removed.
+                 */
+                queued->owner_objectid = incoming->owner_objectid;
+                queued->generation = incoming->generation;
+                queued->root = incoming->root;
+                existing->num_bytes = update->num_bytes;
+        }
+        /*
+         * same action on both sides: accumulate the modification count
+         * so the backref that is eventually added/removed carries the
+         * correct reference count
+         */
+        existing->ref_mod += update->ref_mod;
+}
+/*
+ * Fold a newly built head node into the head that is already in the
+ * rbtree.  existing and update must have the same bytenr; only existing
+ * is modified.
+ */
+static noinline void
+update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
+                         struct btrfs_delayed_ref_node *update)
+{
+        struct btrfs_delayed_ref_head *queued_head;
+        struct btrfs_delayed_ref_head *incoming_head;
+        queued_head = btrfs_delayed_node_to_head(existing);
+        incoming_head = btrfs_delayed_node_to_head(update);
+        if (incoming_head->must_insert_reserved) {
+                /*
+                 * the extent may have been freed and then reallocated
+                 * before the delayed ref entries were processed, which
+                 * leaves a head in the tree without the
+                 * must_insert_reserved flag.  Raise it again, and take
+                 * over num_bytes so the accounting is done correctly.
+                 */
+                queued_head->must_insert_reserved = 1;
+                existing->num_bytes = update->num_bytes;
+        }
+        /* the head carries the running sum of all queued modifications */
+        existing->ref_mod += update->ref_mod;
+}
+/*
+ * Initialize @ref and insert it into the delayed ref rbtree of the
+ * running transaction, or merge it into an entry that is already queued
+ * for the same (bytenr, parent).
+ *
+ * A parent of (u64)-1 marks @ref as a head node; the ref_root,
+ * ref_generation, owner_objectid and pin arguments are only stored for
+ * regular refs.  When an existing entry absorbs the update, @ref is
+ * freed here.  Always returns 0.
+ *
+ * This does not take the delayed ref spinlock itself; the callers in
+ * this file hold it around the call.
+ */
+static noinline int __btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
+                          struct btrfs_delayed_ref_node *ref,
+                          u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root,
+                          u64 ref_generation, u64 owner_objectid, int action,
+                          int pin)
+{
+        struct btrfs_delayed_ref_root *delayed_refs;
+        struct btrfs_delayed_ref_node *existing;
+        struct btrfs_delayed_ref_head *head_ref;
+        struct btrfs_delayed_ref *full_ref;
+        int must_insert_reserved;
+        int count_mod = 1;
+        int is_head;
+        /*
+         * the head node stores the sum of all the mods: a drop lowers
+         * the sum by one and a pure head update leaves it alone.
+         * Regular refs always start with a count of one.
+         */
+        if (parent == (u64)-1) {
+                switch (action) {
+                case BTRFS_DROP_DELAYED_REF:
+                        count_mod = -1;
+                        break;
+                case BTRFS_UPDATE_DELAYED_HEAD:
+                        count_mod = 0;
+                        break;
+                default:
+                        break;
+                }
+        }
+        /*
+         * BTRFS_ADD_DELAYED_EXTENT means the reserved accounting needs
+         * an update when the extent is finally added, or when a later
+         * modification deletes the delayed ref without ever inserting
+         * the extent into the extent allocation tree.  That is recorded
+         * in must_insert_reserved; afterwards the action is handled like
+         * a plain BTRFS_ADD_DELAYED_REF.
+         */
+        must_insert_reserved = (action == BTRFS_ADD_DELAYED_EXTENT);
+        if (must_insert_reserved)
+                action = BTRFS_ADD_DELAYED_REF;
+        delayed_refs = &trans->transaction->delayed_refs;
+        /* fields shared by head nodes and regular refs */
+        atomic_set(&ref->refs, 1);
+        ref->bytenr = bytenr;
+        ref->parent = parent;
+        ref->ref_mod = count_mod;
+        ref->in_tree = 1;
+        ref->num_bytes = num_bytes;
+        is_head = btrfs_delayed_ref_is_head(ref);
+        if (is_head) {
+                head_ref = btrfs_delayed_node_to_head(ref);
+                head_ref->must_insert_reserved = must_insert_reserved;
+                INIT_LIST_HEAD(&head_ref->cluster);
+                mutex_init(&head_ref->mutex);
+        } else {
+                full_ref = btrfs_delayed_node_to_ref(ref);
+                full_ref->root = ref_root;
+                full_ref->generation = ref_generation;
+                full_ref->owner_objectid = owner_objectid;
+                full_ref->pin = pin;
+                full_ref->action = action;
+        }
+        existing = tree_insert(&delayed_refs->root, bytenr,
+                               parent, &ref->rb_node);
+        if (!existing) {
+                /* brand new entry, account for it */
+                if (is_head) {
+                        delayed_refs->num_heads++;
+                        delayed_refs->num_heads_ready++;
+                }
+                delayed_refs->num_entries++;
+                trans->delayed_ref_updates++;
+                return 0;
+        }
+        if (is_head)
+                update_existing_head_ref(existing, ref);
+        else
+                update_existing_ref(trans, delayed_refs, existing, ref);
+        /*
+         * the queued entry absorbed the update, so the newly allocated
+         * ref is no longer needed
+         */
+        kfree(ref);
+        return 0;
+}
+/*
+ * Queue one delayed ref operation for the extent at @bytenr.
+ *
+ * Both a head node and a regular ref are allocated and handed to
+ * __btrfs_add_delayed_ref() under a single hold of the delayed ref
+ * spinlock.  Returns 0 on success or -ENOMEM when either allocation
+ * fails, in which case nothing is queued.
+ */
+int btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
+                          u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root,
+                          u64 ref_generation, u64 owner_objectid, int action,
+                          int pin)
+{
+        struct btrfs_delayed_ref_root *delayed_refs;
+        struct btrfs_delayed_ref_head *new_head;
+        struct btrfs_delayed_ref *new_ref;
+        int ret;
+        new_ref = kmalloc(sizeof(*new_ref), GFP_NOFS);
+        if (!new_ref)
+                return -ENOMEM;
+        new_head = kmalloc(sizeof(*new_head), GFP_NOFS);
+        if (!new_head) {
+                kfree(new_ref);
+                return -ENOMEM;
+        }
+        /*
+         * parent == 0 comes from callers that don't actually know the
+         * parent yet.  It will get updated later via an add/drop pair.
+         */
+        if (!parent)
+                parent = bytenr;
+        delayed_refs = &trans->transaction->delayed_refs;
+        spin_lock(&delayed_refs->lock);
+        /*
+         * the head node goes in first, then the new ref, all without
+         * dropping the spin lock
+         */
+        ret = __btrfs_add_delayed_ref(trans, &new_head->node, bytenr,
+                                      num_bytes, (u64)-1, 0, 0, 0, action,
+                                      pin);
+        BUG_ON(ret);
+        ret = __btrfs_add_delayed_ref(trans, &new_ref->node, bytenr,
+                                      num_bytes, parent, ref_root,
+                                      ref_generation, owner_objectid, action,
+                                      pin);
+        BUG_ON(ret);
+        spin_unlock(&delayed_refs->lock);
+        return 0;
+}
+/*
+ * Simple search for the head node of the extent at @bytenr in the
+ * running transaction.  It must be called with the delayed ref spinlock
+ * held; no locking is done here.  Returns the head node if one was
+ * found, or NULL if not.
+ */
+struct btrfs_delayed_ref_head *
+btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
+{
+        struct btrfs_delayed_ref_root *delayed_refs =
+                &trans->transaction->delayed_refs;
+        struct btrfs_delayed_ref_node *node;
+        /* head nodes are keyed with a parent of (u64)-1 */
+        node = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL);
+        return node ? btrfs_delayed_node_to_head(node) : NULL;
+}
+/*
+ * Replace one backreference on the extent at @bytenr with another.
+ *
+ * A head update, an add of the new backref (parent, ref_root,
+ * ref_generation) and a drop of the old one (orig_parent, orig_ref_root,
+ * orig_ref_generation) are queued in a single shot, taking the delayed
+ * ref spinlock only once.  The head is queued with
+ * BTRFS_UPDATE_DELAYED_HEAD, which leaves the summed reference count on
+ * the head unchanged.
+ *
+ * It is the same as doing a ref add and delete in two separate calls.
+ * @pin is only applied to the drop.  Returns 0 on success or -ENOMEM if
+ * any of the three allocations fails, in which case nothing is queued.
+ */
+int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
+                          u64 bytenr, u64 num_bytes, u64 orig_parent,
+                          u64 parent, u64 orig_ref_root, u64 ref_root,
+                          u64 orig_ref_generation, u64 ref_generation,
+                          u64 owner_objectid, int pin)
+{
+        struct btrfs_delayed_ref_root *delayed_refs;
+        struct btrfs_delayed_ref_head *new_head;
+        struct btrfs_delayed_ref *add_ref;
+        struct btrfs_delayed_ref *drop_ref;
+        int ret;
+        add_ref = kmalloc(sizeof(*add_ref), GFP_NOFS);
+        if (!add_ref)
+                return -ENOMEM;
+        drop_ref = kmalloc(sizeof(*drop_ref), GFP_NOFS);
+        if (!drop_ref)
+                goto free_add;
+        new_head = kmalloc(sizeof(*new_head), GFP_NOFS);
+        if (!new_head)
+                goto free_drop;
+        /*
+         * a parent of 0 comes from callers that don't actually know the
+         * parent yet.  It will get updated later via an add/drop pair.
+         */
+        if (!parent)
+                parent = bytenr;
+        if (!orig_parent)
+                orig_parent = bytenr;
+        delayed_refs = &trans->transaction->delayed_refs;
+        spin_lock(&delayed_refs->lock);
+        /*
+         * head first, then the add, then the drop, all without dropping
+         * the spin lock
+         */
+        ret = __btrfs_add_delayed_ref(trans, &new_head->node, bytenr,
+                                      num_bytes, (u64)-1, 0, 0, 0,
+                                      BTRFS_UPDATE_DELAYED_HEAD, 0);
+        BUG_ON(ret);
+        ret = __btrfs_add_delayed_ref(trans, &add_ref->node, bytenr,
+                                      num_bytes, parent, ref_root,
+                                      ref_generation, owner_objectid,
+                                      BTRFS_ADD_DELAYED_REF, 0);
+        BUG_ON(ret);
+        ret = __btrfs_add_delayed_ref(trans, &drop_ref->node, bytenr,
+                                      num_bytes, orig_parent, orig_ref_root,
+                                      orig_ref_generation, owner_objectid,
+                                      BTRFS_DROP_DELAYED_REF, pin);
+        BUG_ON(ret);
+        spin_unlock(&delayed_refs->lock);
+        return 0;
+free_drop:
+        kfree(drop_ref);
+free_add:
+        kfree(add_ref);
+        return -ENOMEM;
+}
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
new file mode 100644
index 000000000000..3bec2ff0b15c
--- /dev/null
+++ b/fs/btrfs/delayed-ref.h
@@ -0,0 +1,193 @@
+/*
+ * Copyright (C) 2008 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+#ifndef __DELAYED_REF__
+#define __DELAYED_REF__
+/*
+ * these are the possible values of struct btrfs_delayed_ref->action,
+ * and of the action argument used when queueing delayed refs.
+ *
+ * BTRFS_ADD_DELAYED_EXTENT is never stored in a ref: when queued it is
+ * recorded as must_insert_reserved on the head and the action becomes
+ * BTRFS_ADD_DELAYED_REF.  BTRFS_UPDATE_DELAYED_HEAD queues a head node
+ * with a ref_mod of zero.
+ */
+#define BTRFS_ADD_DELAYED_REF    1 /* add one backref to the tree */
+#define BTRFS_DROP_DELAYED_REF   2 /* delete one backref from the tree */
+#define BTRFS_ADD_DELAYED_EXTENT 3 /* record a full extent allocation */
+#define BTRFS_UPDATE_DELAYED_HEAD 4 /* not changing ref count on head ref */
+/*
+ * common part of every entry in the delayed ref rbtree.  It is embedded
+ * as the 'node' member of both struct btrfs_delayed_ref_head and
+ * struct btrfs_delayed_ref; a parent of (u64)-1 identifies a head.
+ */
+struct btrfs_delayed_ref_node {
+        /* links this entry into btrfs_delayed_ref_root->root */
+        struct rb_node rb_node;
+        /* the starting bytenr of the extent */
+        u64 bytenr;
+        /* the parent our backref will point to, (u64)-1 for head nodes */
+        u64 parent;
+        /* the size of the extent */
+        u64 num_bytes;
+        /*
+         * ref count on this data structure, dropped with
+         * btrfs_put_delayed_ref()
+         */
+        atomic_t refs;
+        /*
+         * how many refs is this entry adding or deleting.  For
+         * head refs, this may be a negative number because it is keeping
+         * track of the total mods done to the reference count.
+         * For individual refs, this will always be a positive number
+         *
+         * It may be more than one, since it is possible for a single
+         * parent to have more than one ref on an extent
+         */
+        int ref_mod;
+        /* is this node still in the rbtree? */
+        unsigned int in_tree:1;
+};
+/*
+ * the head refs are used to hold a lock on a given extent, which allows us
+ * to make sure that only one process is running the delayed refs
+ * at a time for a single extent.  They also store the sum of all the
+ * reference count modifications we've queued up.
+ */
+struct btrfs_delayed_ref_head {
+        /*
+         * must stay the first member: entries are kfree()d through a
+         * pointer to this embedded node
+         */
+        struct btrfs_delayed_ref_node node;
+        /*
+         * the mutex is held while running the refs, and it is also
+         * held when checking the sum of reference modifications.
+         */
+        struct mutex mutex;
+        /*
+         * list linkage for this head.
+         * NOTE(review): presumably links the head into the cluster list
+         * built by btrfs_find_ref_cluster() -- confirm there.
+         */
+        struct list_head cluster;
+        /*
+         * when a new extent is allocated, it is just reserved in memory
+         * The actual extent isn't inserted into the extent allocation tree
+         * until the delayed ref is processed.  must_insert_reserved is
+         * used to flag a delayed ref so the accounting can be updated
+         * when a full insert is done.
+         *
+         * It is possible the extent will be freed before it is ever
+         * inserted into the extent allocation tree.  In this case
+         * we need to update the in ram accounting to properly reflect
+         * the free has happened.
+         */
+        unsigned int must_insert_reserved:1;
+};
+/*
+ * a regular (non-head) delayed ref: one queued add or drop of a backref
+ * for the extent described by the embedded node.
+ */
+struct btrfs_delayed_ref {
+        /*
+         * must stay the first member: entries are kfree()d through a
+         * pointer to this embedded node
+         */
+        struct btrfs_delayed_ref_node node;
+        /* the root objectid our ref will point to */
+        u64 root;
+        /* the generation for the backref */
+        u64 generation;
+        /* owner_objectid of the backref  */
+        u64 owner_objectid;
+        /*
+         * operation done by this entry in the rbtree, one of the
+         * BTRFS_*_DELAYED_* values
+         */
+        u8 action;
+        /* if pin == 1, when the extent is freed it will be pinned until
+         * transaction commit
+         */
+        unsigned int pin:1;
+};
+/*
+ * per-transaction container for all queued delayed refs; reached via
+ * trans->transaction->delayed_refs.
+ */
+struct btrfs_delayed_ref_root {
+        /* rbtree of btrfs_delayed_ref_node entries, heads and refs alike */
+        struct rb_root root;
+        /* this spin lock protects the rbtree and the entries inside */
+        spinlock_t lock;
+        /* how many delayed ref updates we've queued, used by the
+         * throttling code
+         */
+        unsigned long num_entries;
+        /* total number of head nodes in tree */
+        unsigned long num_heads;
+        /*
+         * total number of head nodes ready for processing.  Bumped when
+         * a new head is inserted and dropped when a head is picked up
+         * by the cluster search.
+         */
+        unsigned long num_heads_ready;
+        /*
+         * set when the tree is flushing before a transaction commit,
+         * used by the throttling code to decide if new updates need
+         * to be run right away
+         */
+        int flushing;
+        /*
+         * bytenr of a head recorded by the cluster search.
+         * NOTE(review): looks like the resume point for the next
+         * search -- confirm against btrfs_find_ref_cluster().
+         */
+        u64 run_delayed_start;
+};
+/*
+ * Drop one reference on a delayed ref node and free it once the last
+ * reference is gone.  Warns if the count was already zero, or if the
+ * node being freed is still flagged as in the rbtree.
+ */
+static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
+{
+        WARN_ON(atomic_read(&ref->refs) == 0);
+        if (!atomic_dec_and_test(&ref->refs))
+                return;
+        /* last reference: the node must have left the rbtree by now */
+        WARN_ON(ref->in_tree);
+        kfree(ref);
+}
+int btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
+                          u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root,
+                          u64 ref_generation, u64 owner_objectid, int action,
+                          int pin);
+struct btrfs_delayed_ref_head *
+btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr);
+int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr);
+int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
+                            struct btrfs_root *root, u64 bytenr,
+                            u64 num_bytes, u32 *refs);
+int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
+                          u64 bytenr, u64 num_bytes, u64 orig_parent,
+                          u64 parent, u64 orig_ref_root, u64 ref_root,
+                          u64 orig_ref_generation, u64 ref_generation,
+                          u64 owner_objectid, int pin);
+int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
+                           struct btrfs_delayed_ref_head *head);
+int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
+                           struct list_head *cluster, u64 search_start);
+/*
+ * a node might live in a head or a regular ref, this lets you
+ * test for the proper type to use.
+ *
+ * Returns non-zero when the node belongs to a head, which is marked by
+ * a parent of (u64)-1.  Declared inline like the other helpers in this
+ * header so that files including it without calling it do not end up
+ * with an unused static function.
+ */
+static inline int btrfs_delayed_ref_is_head(struct btrfs_delayed_ref_node *node)
+{
+        return node->parent == (u64)-1;
+}
+/*
+ * Cast a node to the regular delayed ref that contains it.  Warns when
+ * the node is actually a head; the cast is performed either way.
+ */
+static inline struct btrfs_delayed_ref *
+btrfs_delayed_node_to_ref(struct btrfs_delayed_ref_node *node)
+{
+        int is_head = btrfs_delayed_ref_is_head(node);
+        WARN_ON(is_head);
+        return container_of(node, struct btrfs_delayed_ref, node);
+}
+/*
+ * Cast a node to the delayed ref head that contains it.  Warns when the
+ * node is not a head; the cast is performed either way.
+ */
+static inline struct btrfs_delayed_ref_head *
+btrfs_delayed_node_to_head(struct btrfs_delayed_ref_node *node)
+{
+        int is_head = btrfs_delayed_ref_is_head(node);
+        WARN_ON(!is_head);
+        return container_of(node, struct btrfs_delayed_ref_head, node);
+}
+#endif
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 926a0b287a7d..1d70236ba00c 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -145,7 +145,10 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
        key.objectid = dir;
        btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
        key.offset = btrfs_name_hash(name, name_len);
        path = btrfs_alloc_path();
+        path->leave_spinning = 1;
        data_size = sizeof(*dir_item) + name_len;
        dir_item = insert_with_overflow(trans, root, path, &key, data_size,
                                        name, name_len);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 6ec80c0fc869..92d73929d381 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -668,14 +668,31 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
 static int btree_writepage(struct page *page, struct writeback_control *wbc)
 {
        struct extent_io_tree *tree;
+        struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
+        struct extent_buffer *eb;
+        int was_dirty;
        tree = &BTRFS_I(page->mapping->host)->io_tree;
+        if (!(current->flags & PF_MEMALLOC)) {
+                return extent_write_full_page(tree, page,
+                                              btree_get_extent, wbc);
+        }
-        if (current->flags & PF_MEMALLOC) {
+        redirty_page_for_writepage(wbc, page);
-                redirty_page_for_writepage(wbc, page);
+        eb = btrfs_find_tree_block(root, page_offset(page),
-                unlock_page(page);
+                                      PAGE_CACHE_SIZE);
-                return 0;
+        WARN_ON(!eb);
+        was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
+        if (!was_dirty) {
+                spin_lock(&root->fs_info->delalloc_lock);
+                root->fs_info->dirty_metadata_bytes += PAGE_CACHE_SIZE;
+                spin_unlock(&root->fs_info->delalloc_lock);
        }
-        return extent_write_full_page(tree, page, btree_get_extent, wbc);
+        free_extent_buffer(eb);
+        unlock_page(page);
+        return 0;
 }
 static int btree_writepages(struct address_space *mapping,
@@ -684,15 +701,15 @@ static int btree_writepages(struct address_space *mapping,
        struct extent_io_tree *tree;
        tree = &BTRFS_I(mapping->host)->io_tree;
        if (wbc->sync_mode == WB_SYNC_NONE) {
+                struct btrfs_root *root = BTRFS_I(mapping->host)->root;
                u64 num_dirty;
-                u64 start = 0;
                unsigned long thresh = 32 * 1024 * 1024;
                if (wbc->for_kupdate)
                        return 0;
-                num_dirty = count_range_bits(tree, &start, (u64)-1,
+                /* this is a bit racy, but that's ok */
-                                             thresh, EXTENT_DIRTY);
+                num_dirty = root->fs_info->dirty_metadata_bytes;
                if (num_dirty < thresh)
                        return 0;
        }
@@ -859,9 +876,17 @@ int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
            root->fs_info->running_transaction->transid) {
                btrfs_assert_tree_locked(buf);
-                /* ugh, clear_extent_buffer_dirty can be expensive */
+                if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) {
-                btrfs_set_lock_blocking(buf);
+                        spin_lock(&root->fs_info->delalloc_lock);
+                        if (root->fs_info->dirty_metadata_bytes >= buf->len)
+                                root->fs_info->dirty_metadata_bytes -= buf->len;
+                        else
+                                WARN_ON(1);
+                        spin_unlock(&root->fs_info->delalloc_lock);
+                }
+                /* ugh, clear_extent_buffer_dirty needs to lock the page */
+                btrfs_set_lock_blocking(buf);
                clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree,
                                          buf);
        }
@@ -1471,12 +1496,6 @@ static int transaction_kthread(void *arg)
                vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
                mutex_lock(&root->fs_info->transaction_kthread_mutex);
-                if (root->fs_info->total_ref_cache_size > 20 * 1024 * 1024) {
-                        printk(KERN_INFO "btrfs: total reference cache "
-                               "size %llu\n",
-                               root->fs_info->total_ref_cache_size);
-                }
                mutex_lock(&root->fs_info->trans_mutex);
                cur = root->fs_info->running_transaction;
                if (!cur) {
@@ -1493,6 +1512,7 @@ static int transaction_kthread(void *arg)
                mutex_unlock(&root->fs_info->trans_mutex);
                trans = btrfs_start_transaction(root, 1);
                ret = btrfs_commit_transaction(trans, root);
 sleep:
                wake_up_process(root->fs_info->cleaner_kthread);
                mutex_unlock(&root->fs_info->transaction_kthread_mutex);
@@ -1552,6 +1572,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
        INIT_LIST_HEAD(&fs_info->dead_roots);
        INIT_LIST_HEAD(&fs_info->hashers);
        INIT_LIST_HEAD(&fs_info->delalloc_inodes);
+        INIT_LIST_HEAD(&fs_info->ordered_operations);
        spin_lock_init(&fs_info->delalloc_lock);
        spin_lock_init(&fs_info->new_trans_lock);
        spin_lock_init(&fs_info->ref_cache_lock);
@@ -1611,10 +1632,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
        extent_io_tree_init(&fs_info->pinned_extents,
                             fs_info->btree_inode->i_mapping, GFP_NOFS);
-        extent_io_tree_init(&fs_info->pending_del,
-                             fs_info->btree_inode->i_mapping, GFP_NOFS);
-        extent_io_tree_init(&fs_info->extent_ins,
-                             fs_info->btree_inode->i_mapping, GFP_NOFS);
        fs_info->do_barriers = 1;
        INIT_LIST_HEAD(&fs_info->dead_reloc_roots);
@@ -1627,9 +1644,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
        insert_inode_hash(fs_info->btree_inode);
        mutex_init(&fs_info->trans_mutex);
+        mutex_init(&fs_info->ordered_operations_mutex);
        mutex_init(&fs_info->tree_log_mutex);
        mutex_init(&fs_info->drop_mutex);
-        mutex_init(&fs_info->extent_ins_mutex);
        mutex_init(&fs_info->pinned_mutex);
        mutex_init(&fs_info->chunk_mutex);
        mutex_init(&fs_info->transaction_kthread_mutex);
@@ -2358,8 +2375,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
        struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
        u64 transid = btrfs_header_generation(buf);
        struct inode *btree_inode = root->fs_info->btree_inode;
+        int was_dirty;
-        btrfs_set_lock_blocking(buf);
        btrfs_assert_tree_locked(buf);
        if (transid != root->fs_info->generation) {
@@ -2370,7 +2386,13 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
                        (unsigned long long)root->fs_info->generation);
                WARN_ON(1);
        }
-        set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, buf);
+        was_dirty = set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree,
+                                            buf);
+        if (!was_dirty) {
+                spin_lock(&root->fs_info->delalloc_lock);
+                root->fs_info->dirty_metadata_bytes += buf->len;
+                spin_unlock(&root->fs_info->delalloc_lock);
+        }
 }
 void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
@@ -2410,6 +2432,7 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
 int btree_lock_page_hook(struct page *page)
 {
        struct inode *inode = page->mapping->host;
+        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
        struct extent_buffer *eb;
        unsigned long len;
@@ -2425,6 +2448,16 @@ int btree_lock_page_hook(struct page *page)
        btrfs_tree_lock(eb);
        btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
+        if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
+                spin_lock(&root->fs_info->delalloc_lock);
+                if (root->fs_info->dirty_metadata_bytes >= eb->len)
+                        root->fs_info->dirty_metadata_bytes -= eb->len;
+                else
+                        WARN_ON(1);
+                spin_unlock(&root->fs_info->delalloc_lock);
+        }
        btrfs_tree_unlock(eb);
        free_extent_buffer(eb);
 out:
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 95029db227be..c958ecbc1916 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -72,6 +72,7 @@ int btrfs_insert_dev_radix(struct btrfs_root *root,
 void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr);
 int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root);
 void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
+void btrfs_mark_buffer_dirty_nonblocking(struct extent_buffer *buf);
 int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid);
 int btrfs_set_buffer_uptodate(struct extent_buffer *buf);
 int wait_on_tree_block_writeback(struct btrfs_root *root,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index fefe83ad2059..f5e7cae63d80 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -49,17 +49,23 @@ struct pending_extent_op {
        int del;
 };
-static int finish_current_insert(struct btrfs_trans_handle *trans,
+static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
-                                 struct btrfs_root *extent_root, int all);
+                                         struct btrfs_root *root, u64 parent,
-static int del_pending_extents(struct btrfs_trans_handle *trans,
+                                         u64 root_objectid, u64 ref_generation,
-                               struct btrfs_root *extent_root, int all);
+                                         u64 owner, struct btrfs_key *ins,
-static int pin_down_bytes(struct btrfs_trans_handle *trans,
+                                         int ref_mod);
-                          struct btrfs_root *root,
+static int update_reserved_extents(struct btrfs_root *root,
-                          u64 bytenr, u64 num_bytes, int is_data);
+                                   u64 bytenr, u64 num, int reserve);
 static int update_block_group(struct btrfs_trans_handle *trans,
                              struct btrfs_root *root,
                              u64 bytenr, u64 num_bytes, int alloc,
                              int mark_free);
+static noinline int __btrfs_free_extent(struct btrfs_trans_handle *trans,
+                                        struct btrfs_root *root,
+                                        u64 bytenr, u64 num_bytes, u64 parent,
+                                        u64 root_objectid, u64 ref_generation,
+                                        u64 owner_objectid, int pin,
+                                        int ref_to_drop);
 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
                          struct btrfs_root *extent_root, u64 alloc_bytes,
@@ -554,262 +560,13 @@ out:
        return ret;
 }
-/*
- * updates all the backrefs that are pending on update_list for the
- * extent_root
- */
-static noinline int update_backrefs(struct btrfs_trans_handle *trans,
-                                    struct btrfs_root *extent_root,
-                                    struct btrfs_path *path,
-                                    struct list_head *update_list)
-{
-        struct btrfs_key key;
-        struct btrfs_extent_ref *ref;
-        struct btrfs_fs_info *info = extent_root->fs_info;
-        struct pending_extent_op *op;
-        struct extent_buffer *leaf;
-        int ret = 0;
-        struct list_head *cur = update_list->next;
-        u64 ref_objectid;
-        u64 ref_root = extent_root->root_key.objectid;
-        op = list_entry(cur, struct pending_extent_op, list);
-search:
-        key.objectid = op->bytenr;
-        key.type = BTRFS_EXTENT_REF_KEY;
-        key.offset = op->orig_parent;
-        ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 1);
-        BUG_ON(ret);
-        leaf = path->nodes[0];
-loop:
-        ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
-        ref_objectid = btrfs_ref_objectid(leaf, ref);
-        if (btrfs_ref_root(leaf, ref) != ref_root ||
-            btrfs_ref_generation(leaf, ref) != op->orig_generation ||
-            (ref_objectid != op->level &&
-             ref_objectid != BTRFS_MULTIPLE_OBJECTIDS)) {
-                printk(KERN_ERR "btrfs couldn't find %llu, parent %llu, "
-                       "root %llu, owner %u\n",
-                       (unsigned long long)op->bytenr,
-                       (unsigned long long)op->orig_parent,
-                       (unsigned long long)ref_root, op->level);
-                btrfs_print_leaf(extent_root, leaf);
-                BUG();
-        }
-        key.objectid = op->bytenr;
-        key.offset = op->parent;
-        key.type = BTRFS_EXTENT_REF_KEY;
-        ret = btrfs_set_item_key_safe(trans, extent_root, path, &key);
-        BUG_ON(ret);
-        ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
-        btrfs_set_ref_generation(leaf, ref, op->generation);
-        cur = cur->next;
-        list_del_init(&op->list);
-        unlock_extent(&info->extent_ins, op->bytenr,
-                      op->bytenr + op->num_bytes - 1, GFP_NOFS);
-        kfree(op);
-        if (cur == update_list) {
-                btrfs_mark_buffer_dirty(path->nodes[0]);
-                btrfs_release_path(extent_root, path);
-                goto out;
-        }
-        op = list_entry(cur, struct pending_extent_op, list);
-        path->slots[0]++;
-        while (path->slots[0] < btrfs_header_nritems(leaf)) {
-                btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
-                if (key.objectid == op->bytenr &&
-                    key.type == BTRFS_EXTENT_REF_KEY)
-                        goto loop;
-                path->slots[0]++;
-        }
-        btrfs_mark_buffer_dirty(path->nodes[0]);
-        btrfs_release_path(extent_root, path);
-        goto search;
-out:
-        return 0;
-}
-static noinline int insert_extents(struct btrfs_trans_handle *trans,
-                                   struct btrfs_root *extent_root,
-                                   struct btrfs_path *path,
-                                   struct list_head *insert_list, int nr)
-{
-        struct btrfs_key *keys;
-        u32 *data_size;
-        struct pending_extent_op *op;
-        struct extent_buffer *leaf;
-        struct list_head *cur = insert_list->next;
-        struct btrfs_fs_info *info = extent_root->fs_info;
-        u64 ref_root = extent_root->root_key.objectid;
-        int i = 0, last = 0, ret;
-        int total = nr * 2;
-        if (!nr)
-                return 0;
-        keys = kzalloc(total * sizeof(struct btrfs_key), GFP_NOFS);
-        if (!keys)
-                return -ENOMEM;
-        data_size = kzalloc(total * sizeof(u32), GFP_NOFS);
-        if (!data_size) {
-                kfree(keys);
-                return -ENOMEM;
-        }
-        list_for_each_entry(op, insert_list, list) {
-                keys[i].objectid = op->bytenr;
-                keys[i].offset = op->num_bytes;
-                keys[i].type = BTRFS_EXTENT_ITEM_KEY;
-                data_size[i] = sizeof(struct btrfs_extent_item);
-                i++;
-                keys[i].objectid = op->bytenr;
-                keys[i].offset = op->parent;
-                keys[i].type = BTRFS_EXTENT_REF_KEY;
-                data_size[i] = sizeof(struct btrfs_extent_ref);
-                i++;
-        }
-        op = list_entry(cur, struct pending_extent_op, list);
-        i = 0;
-        while (i < total) {
-                int c;
-                ret = btrfs_insert_some_items(trans, extent_root, path,
-                                              keys+i, data_size+i, total-i);
-                BUG_ON(ret < 0);
-                if (last && ret > 1)
-                        BUG();
-                leaf = path->nodes[0];
-                for (c = 0; c < ret; c++) {
-                        int ref_first = keys[i].type == BTRFS_EXTENT_REF_KEY;
-                        /*
-                         * if the first item we inserted was a backref, then
-                         * the EXTENT_ITEM will be the odd c's, else it will
-                         * be the even c's
-                         */
-                        if ((ref_first && (c % 2)) ||
-                            (!ref_first && !(c % 2))) {
-                                struct btrfs_extent_item *itm;
-                                itm = btrfs_item_ptr(leaf, path->slots[0] + c,
-                                                     struct btrfs_extent_item);
-                                btrfs_set_extent_refs(path->nodes[0], itm, 1);
-                                op->del++;
-                        } else {
-                                struct btrfs_extent_ref *ref;
-                                ref = btrfs_item_ptr(leaf, path->slots[0] + c,
-                                                     struct btrfs_extent_ref);
-                                btrfs_set_ref_root(leaf, ref, ref_root);
-                                btrfs_set_ref_generation(leaf, ref,
-                                                         op->generation);
-                                btrfs_set_ref_objectid(leaf, ref, op->level);
-                                btrfs_set_ref_num_refs(leaf, ref, 1);
-                                op->del++;
-                        }
-                        /*
-                         * using del to see when its ok to free up the
-                         * pending_extent_op.  In the case where we insert the
-                         * last item on the list in order to help do batching
-                         * we need to not free the extent op until we actually
-                         * insert the extent_item
-                         */
-                        if (op->del == 2) {
-                                unlock_extent(&info->extent_ins, op->bytenr,
-                                              op->bytenr + op->num_bytes - 1,
-                                              GFP_NOFS);
-                                cur = cur->next;
-                                list_del_init(&op->list);
-                                kfree(op);
-                                if (cur != insert_list)
-                                        op = list_entry(cur,
-                                                struct pending_extent_op,
-                                                list);
-                        }
-                }
-                btrfs_mark_buffer_dirty(leaf);
-                btrfs_release_path(extent_root, path);
-                /*
-                 * Ok backref's and items usually go right next to eachother,
-                 * but if we could only insert 1 item that means that we
-                 * inserted on the end of a leaf, and we have no idea what may
-                 * be on the next leaf so we just play it safe.  In order to
-                 * try and help this case we insert the last thing on our
-                 * insert list so hopefully it will end up being the last
-                 * thing on the leaf and everything else will be before it,
-                 * which will let us insert a whole bunch of items at the same
-                 * time.
-                 */
-                if (ret == 1 && !last && (i + ret < total)) {
-                        /*
-                         * last: where we will pick up the next time around
-                         * i: our current key to insert, will be total - 1
-                         * cur: the current op we are screwing with
-                         * op: duh
-                         */
-                        last = i + ret;
-                        i = total - 1;
-                        cur = insert_list->prev;
-                        op = list_entry(cur, struct pending_extent_op, list);
-                } else if (last) {
-                        /*
-                         * ok we successfully inserted the last item on the
-                         * list, lets reset everything
-                         *
-                         * i: our current key to insert, so where we left off
-                         *    last time
-                         * last: done with this
-                         * cur: the op we are messing with
-                         * op: duh
-                         * total: since we inserted the last key, we need to
-                         *        decrement total so we dont overflow
-                         */
-                        i = last;
-                        last = 0;
-                        total--;
-                        if (i < total) {
-                                cur = insert_list->next;
-                                op = list_entry(cur, struct pending_extent_op,
-                                                list);
-                        }
-                } else {
-                        i += ret;
-                }
-                cond_resched();
-        }
-        ret = 0;
-        kfree(keys);
-        kfree(data_size);
-        return ret;
-}
 static noinline int insert_extent_backref(struct btrfs_trans_handle *trans,
                                          struct btrfs_root *root,
                                          struct btrfs_path *path,
                                          u64 bytenr, u64 parent,
                                          u64 ref_root, u64 ref_generation,
-                                          u64 owner_objectid)
+                                          u64 owner_objectid,
+                                          int refs_to_add)
 {
        struct btrfs_key key;
        struct extent_buffer *leaf;
@@ -829,9 +586,10 @@ static noinline int insert_extent_backref(struct btrfs_trans_handle *trans,
                btrfs_set_ref_root(leaf, ref, ref_root);
                btrfs_set_ref_generation(leaf, ref, ref_generation);
                btrfs_set_ref_objectid(leaf, ref, owner_objectid);
-                btrfs_set_ref_num_refs(leaf, ref, 1);
+                btrfs_set_ref_num_refs(leaf, ref, refs_to_add);
        } else if (ret == -EEXIST) {
                u64 existing_owner;
                BUG_ON(owner_objectid < BTRFS_FIRST_FREE_OBJECTID);
                leaf = path->nodes[0];
                ref = btrfs_item_ptr(leaf, path->slots[0],
@@ -845,7 +603,7 @@ static noinline int insert_extent_backref(struct btrfs_trans_handle *trans,
                num_refs = btrfs_ref_num_refs(leaf, ref);
                BUG_ON(num_refs == 0);
-                btrfs_set_ref_num_refs(leaf, ref, num_refs + 1);
+                btrfs_set_ref_num_refs(leaf, ref, num_refs + refs_to_add);
                existing_owner = btrfs_ref_objectid(leaf, ref);
                if (existing_owner != owner_objectid &&
@@ -857,6 +615,7 @@ static noinline int insert_extent_backref(struct btrfs_trans_handle *trans,
        } else {
                goto out;
        }
+        btrfs_unlock_up_safe(path, 1);
        btrfs_mark_buffer_dirty(path->nodes[0]);
 out:
        btrfs_release_path(root, path);
@@ -865,7 +624,8 @@ out:
 static noinline int remove_extent_backref(struct btrfs_trans_handle *trans,
                                          struct btrfs_root *root,
-                                          struct btrfs_path *path)
+                                          struct btrfs_path *path,
+                                          int refs_to_drop)
 {
        struct extent_buffer *leaf;
        struct btrfs_extent_ref *ref;
@@ -875,8 +635,8 @@ static noinline int remove_extent_backref(struct btrfs_trans_handle *trans,
        leaf = path->nodes[0];
        ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
        num_refs = btrfs_ref_num_refs(leaf, ref);
-        BUG_ON(num_refs == 0);
+        BUG_ON(num_refs < refs_to_drop);
-        num_refs -= 1;
+        num_refs -= refs_to_drop;
        if (num_refs == 0) {
                ret = btrfs_del_item(trans, root, path);
        } else {
@@ -927,332 +687,28 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
 #endif
 }
-static noinline int free_extents(struct btrfs_trans_handle *trans,
-                                 struct btrfs_root *extent_root,
-                                 struct list_head *del_list)
-{
-        struct btrfs_fs_info *info = extent_root->fs_info;
-        struct btrfs_path *path;
-        struct btrfs_key key, found_key;
-        struct extent_buffer *leaf;
-        struct list_head *cur;
-        struct pending_extent_op *op;
-        struct btrfs_extent_item *ei;
-        int ret, num_to_del, extent_slot = 0, found_extent = 0;
-        u32 refs;
-        u64 bytes_freed = 0;
-        path = btrfs_alloc_path();
-        if (!path)
-                return -ENOMEM;
-        path->reada = 1;
-search:
-        /* search for the backref for the current ref we want to delete */
-        cur = del_list->next;
-        op = list_entry(cur, struct pending_extent_op, list);
-        ret = lookup_extent_backref(trans, extent_root, path, op->bytenr,
-                                    op->orig_parent,
-                                    extent_root->root_key.objectid,
-                                    op->orig_generation, op->level, 1);
-        if (ret) {
-                printk(KERN_ERR "btrfs unable to find backref byte nr %llu "
-                       "root %llu gen %llu owner %u\n",
-                       (unsigned long long)op->bytenr,
-                       (unsigned long long)extent_root->root_key.objectid,
-                       (unsigned long long)op->orig_generation, op->level);
-                btrfs_print_leaf(extent_root, path->nodes[0]);
-                WARN_ON(1);
-                goto out;
-        }
-        extent_slot = path->slots[0];
-        num_to_del = 1;
-        found_extent = 0;
-        /*
-         * if we aren't the first item on the leaf we can move back one and see
-         * if our ref is right next to our extent item
-         */
-        if (likely(extent_slot)) {
-                extent_slot--;
-                btrfs_item_key_to_cpu(path->nodes[0], &found_key,
-                                      extent_slot);
-                if (found_key.objectid == op->bytenr &&
-                    found_key.type == BTRFS_EXTENT_ITEM_KEY &&
-                    found_key.offset == op->num_bytes) {
-                        num_to_del++;
-                        found_extent = 1;
-                }
-        }
-        /*
-         * if we didn't find the extent we need to delete the backref and then
-         * search for the extent item key so we can update its ref count
-         */
-        if (!found_extent) {
-                key.objectid = op->bytenr;
-                key.type = BTRFS_EXTENT_ITEM_KEY;
-                key.offset = op->num_bytes;
-                ret = remove_extent_backref(trans, extent_root, path);
-                BUG_ON(ret);
-                btrfs_release_path(extent_root, path);
-                ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1);
-                BUG_ON(ret);
-                extent_slot = path->slots[0];
-        }
-        /* this is where we update the ref count for the extent */
-        leaf = path->nodes[0];
-        ei = btrfs_item_ptr(leaf, extent_slot, struct btrfs_extent_item);
-        refs = btrfs_extent_refs(leaf, ei);
-        BUG_ON(refs == 0);
-        refs--;
-        btrfs_set_extent_refs(leaf, ei, refs);
-        btrfs_mark_buffer_dirty(leaf);
-        /*
-         * This extent needs deleting.  The reason cur_slot is extent_slot +
-         * num_to_del is because extent_slot points to the slot where the extent
-         * is, and if the backref was not right next to the extent we will be
-         * deleting at least 1 item, and will want to start searching at the
-         * slot directly next to extent_slot.  However if we did find the
-         * backref next to the extent item them we will be deleting at least 2
-         * items and will want to start searching directly after the ref slot
-         */
-        if (!refs) {
-                struct list_head *pos, *n, *end;
-                int cur_slot = extent_slot+num_to_del;
-                u64 super_used;
-                u64 root_used;
-                path->slots[0] = extent_slot;
-                bytes_freed = op->num_bytes;
-                mutex_lock(&info->pinned_mutex);
-                ret = pin_down_bytes(trans, extent_root, op->bytenr,
-                                     op->num_bytes, op->level >=
-                                     BTRFS_FIRST_FREE_OBJECTID);
-                mutex_unlock(&info->pinned_mutex);
-                BUG_ON(ret < 0);
-                op->del = ret;
-                /*
-                 * we need to see if we can delete multiple things at once, so
-                 * start looping through the list of extents we are wanting to
-                 * delete and see if their extent/backref's are right next to
-                 * eachother and the extents only have 1 ref
-                 */
-                for (pos = cur->next; pos != del_list; pos = pos->next) {
-                        struct pending_extent_op *tmp;
-                        tmp = list_entry(pos, struct pending_extent_op, list);
-                        /* we only want to delete extent+ref at this stage */
-                        if (cur_slot >= btrfs_header_nritems(leaf) - 1)
-                                break;
-                        btrfs_item_key_to_cpu(leaf, &found_key, cur_slot);
-                        if (found_key.objectid != tmp->bytenr ||
-                            found_key.type != BTRFS_EXTENT_ITEM_KEY ||
-                            found_key.offset != tmp->num_bytes)
-                                break;
-                        /* check to make sure this extent only has one ref */
-                        ei = btrfs_item_ptr(leaf, cur_slot,
-                                            struct btrfs_extent_item);
-                        if (btrfs_extent_refs(leaf, ei) != 1)
-                                break;
-                        btrfs_item_key_to_cpu(leaf, &found_key, cur_slot+1);
-                        if (found_key.objectid != tmp->bytenr ||
-                            found_key.type != BTRFS_EXTENT_REF_KEY ||
-                            found_key.offset != tmp->orig_parent)
-                                break;
-                        /*
-                         * the ref is right next to the extent, we can set the
-                         * ref count to 0 since we will delete them both now
-                         */
-                        btrfs_set_extent_refs(leaf, ei, 0);
-                        /* pin down the bytes for this extent */
-                        mutex_lock(&info->pinned_mutex);
-                        ret = pin_down_bytes(trans, extent_root, tmp->bytenr,
-                                             tmp->num_bytes, tmp->level >=
-                                             BTRFS_FIRST_FREE_OBJECTID);
-                        mutex_unlock(&info->pinned_mutex);
-                        BUG_ON(ret < 0);
-                        /*
-                         * use the del field to tell if we need to go ahead and
-                         * free up the extent when we delete the item or not.
-                         */
-                        tmp->del = ret;
-                        bytes_freed += tmp->num_bytes;
-                        num_to_del += 2;
-                        cur_slot += 2;
-                }
-                end = pos;
-                /* update the free space counters */
-                spin_lock(&info->delalloc_lock);
-                super_used = btrfs_super_bytes_used(&info->super_copy);
-                btrfs_set_super_bytes_used(&info->super_copy,
-                                           super_used - bytes_freed);
-                root_used = btrfs_root_used(&extent_root->root_item);
-                btrfs_set_root_used(&extent_root->root_item,
-                                    root_used - bytes_freed);
-                spin_unlock(&info->delalloc_lock);
-                /* delete the items */
-                ret = btrfs_del_items(trans, extent_root, path,
-                                      path->slots[0], num_to_del);
-                BUG_ON(ret);
-                /*
-                 * loop through the extents we deleted and do the cleanup work
-                 * on them
-                 */
-                for (pos = cur, n = pos->next; pos != end;
-                     pos = n, n = pos->next) {
-                        struct pending_extent_op *tmp;
-                        tmp = list_entry(pos, struct pending_extent_op, list);
-                        /*
-                         * remember tmp->del tells us wether or not we pinned
-                         * down the extent
-                         */
-                        ret = update_block_group(trans, extent_root,
-                                                 tmp->bytenr, tmp->num_bytes, 0,
-                                                 tmp->del);
-                        BUG_ON(ret);
-                        list_del_init(&tmp->list);
-                        unlock_extent(&info->extent_ins, tmp->bytenr,
-                                      tmp->bytenr + tmp->num_bytes - 1,
-                                      GFP_NOFS);
-                        kfree(tmp);
-                }
-        } else if (refs && found_extent) {
-                /*
-                 * the ref and extent were right next to eachother, but the
-                 * extent still has a ref, so just free the backref and keep
-                 * going
-                 */
-                ret = remove_extent_backref(trans, extent_root, path);
-                BUG_ON(ret);
-                list_del_init(&op->list);
-                unlock_extent(&info->extent_ins, op->bytenr,
-                              op->bytenr + op->num_bytes - 1, GFP_NOFS);
-                kfree(op);
-        } else {
-                /*
-                 * the extent has multiple refs and the backref we were looking
-                 * for was not right next to it, so just unlock and go next,
-                 * we're good to go
-                 */
-                list_del_init(&op->list);
-                unlock_extent(&info->extent_ins, op->bytenr,
-                              op->bytenr + op->num_bytes - 1, GFP_NOFS);
-                kfree(op);
-        }
-        btrfs_release_path(extent_root, path);
-        if (!list_empty(del_list))
-                goto search;
-out:
-        btrfs_free_path(path);
-        return ret;
-}
 static int __btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
                                     struct btrfs_root *root, u64 bytenr,
+                                     u64 num_bytes,
                                     u64 orig_parent, u64 parent,
                                     u64 orig_root, u64 ref_root,
                                     u64 orig_generation, u64 ref_generation,
                                     u64 owner_objectid)
 {
        int ret;
-        struct btrfs_root *extent_root = root->fs_info->extent_root;
+        int pin = owner_objectid < BTRFS_FIRST_FREE_OBJECTID;
-        struct btrfs_path *path;
-        if (root == root->fs_info->extent_root) {
-                struct pending_extent_op *extent_op;
-                u64 num_bytes;
-                BUG_ON(owner_objectid >= BTRFS_MAX_LEVEL);
-                num_bytes = btrfs_level_size(root, (int)owner_objectid);
-                mutex_lock(&root->fs_info->extent_ins_mutex);
-                if (test_range_bit(&root->fs_info->extent_ins, bytenr,
-                                bytenr + num_bytes - 1, EXTENT_WRITEBACK, 0)) {
-                        u64 priv;
-                        ret = get_state_private(&root->fs_info->extent_ins,
-                                                bytenr, &priv);
-                        BUG_ON(ret);
-                        extent_op = (struct pending_extent_op *)
-                                                        (unsigned long)priv;
-                        BUG_ON(extent_op->parent != orig_parent);
-                        BUG_ON(extent_op->generation != orig_generation);
-                        extent_op->parent = parent;
+        ret = btrfs_update_delayed_ref(trans, bytenr, num_bytes,
-                        extent_op->generation = ref_generation;
+                                       orig_parent, parent, orig_root,
-                } else {
+                                       ref_root, orig_generation,
-                        extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
+                                       ref_generation, owner_objectid, pin);
-                        BUG_ON(!extent_op);
-                        extent_op->type = PENDING_BACKREF_UPDATE;
-                        extent_op->bytenr = bytenr;
-                        extent_op->num_bytes = num_bytes;
-                        extent_op->parent = parent;
-                        extent_op->orig_parent = orig_parent;
-                        extent_op->generation = ref_generation;
-                        extent_op->orig_generation = orig_generation;
-                        extent_op->level = (int)owner_objectid;
-                        INIT_LIST_HEAD(&extent_op->list);
-                        extent_op->del = 0;
-                        set_extent_bits(&root->fs_info->extent_ins,
-                                        bytenr, bytenr + num_bytes - 1,
-                                        EXTENT_WRITEBACK, GFP_NOFS);
-                        set_state_private(&root->fs_info->extent_ins,
-                                          bytenr, (unsigned long)extent_op);
-                }
-                mutex_unlock(&root->fs_info->extent_ins_mutex);
-                return 0;
-        }
-        path = btrfs_alloc_path();
-        if (!path)
-                return -ENOMEM;
-        ret = lookup_extent_backref(trans, extent_root, path,
-                                    bytenr, orig_parent, orig_root,
-                                    orig_generation, owner_objectid, 1);
-        if (ret)
-                goto out;
-        ret = remove_extent_backref(trans, extent_root, path);
-        if (ret)
-                goto out;
-        ret = insert_extent_backref(trans, extent_root, path, bytenr,
-                                    parent, ref_root, ref_generation,
-                                    owner_objectid);
        BUG_ON(ret);
-        finish_current_insert(trans, extent_root, 0);
-        del_pending_extents(trans, extent_root, 0);
-out:
-        btrfs_free_path(path);
        return ret;
 }
 int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
                            struct btrfs_root *root, u64 bytenr,
-                            u64 orig_parent, u64 parent,
+                            u64 num_bytes, u64 orig_parent, u64 parent,
                            u64 ref_root, u64 ref_generation,
                            u64 owner_objectid)
 {
@@ -1260,20 +716,36 @@ int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
        if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
            owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
                return 0;
-        ret = __btrfs_update_extent_ref(trans, root, bytenr, orig_parent,
-                                        parent, ref_root, ref_root,
+        ret = __btrfs_update_extent_ref(trans, root, bytenr, num_bytes,
-                                        ref_generation, ref_generation,
+                                        orig_parent, parent, ref_root,
-                                        owner_objectid);
+                                        ref_root, ref_generation,
+                                        ref_generation, owner_objectid);
        return ret;
 }
 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
                                   struct btrfs_root *root, u64 bytenr,
+                                  u64 num_bytes,
                                   u64 orig_parent, u64 parent,
                                   u64 orig_root, u64 ref_root,
                                   u64 orig_generation, u64 ref_generation,
                                   u64 owner_objectid)
 {
+        int ret;        /* result of btrfs_add_delayed_ref() */
+        ret = btrfs_add_delayed_ref(trans, bytenr, num_bytes, parent, ref_root,
+                                    ref_generation, owner_objectid,
+                                    BTRFS_ADD_DELAYED_REF, 0); /* hand the add to the delayed ref code */
+        BUG_ON(ret);    /* any nonzero result is treated as fatal */
+        return ret;     /* root, orig_parent, orig_root and orig_generation are not used in this body */
+}
+static noinline_for_stack int add_extent_ref(struct btrfs_trans_handle *trans,
+                          struct btrfs_root *root, u64 bytenr,
+                          u64 num_bytes, u64 parent, u64 ref_root,
+                          u64 ref_generation, u64 owner_objectid,
+                          int refs_to_add)
+{
        struct btrfs_path *path;
        int ret;
        struct btrfs_key key;
@@ -1286,17 +758,24 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
                return -ENOMEM;
        path->reada = 1;
+        path->leave_spinning = 1;
        key.objectid = bytenr;
        key.type = BTRFS_EXTENT_ITEM_KEY;
-        key.offset = (u64)-1;
+        key.offset = num_bytes;
-        ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path,
+        /* first find the extent item and update its reference count */
-                                0, 1);
+        ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
-        if (ret < 0)
+                                path, 0, 1);
+        if (ret < 0) {
+                btrfs_set_path_blocking(path);
                return ret;
-        BUG_ON(ret == 0 || path->slots[0] == 0);
+        }
-        path->slots[0]--;
+        if (ret > 0) {
+                WARN_ON(1);
+                btrfs_free_path(path);
+                return -EIO;
+        }
        l = path->nodes[0];
        btrfs_item_key_to_cpu(l, &key, path->slots[0]);
@@ -1310,21 +789,24 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
        BUG_ON(key.type != BTRFS_EXTENT_ITEM_KEY);
        item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
        refs = btrfs_extent_refs(l, item);
-        btrfs_set_extent_refs(l, item, refs + 1);
+        btrfs_set_extent_refs(l, item, refs + refs_to_add);
+        btrfs_unlock_up_safe(path, 1);
        btrfs_mark_buffer_dirty(path->nodes[0]);
        btrfs_release_path(root->fs_info->extent_root, path);
        path->reada = 1;
+        path->leave_spinning = 1;
+        /* now insert the actual backref */
        ret = insert_extent_backref(trans, root->fs_info->extent_root,
                                    path, bytenr, parent,
                                    ref_root, ref_generation,
-                                    owner_objectid);
+                                    owner_objectid, refs_to_add);
        BUG_ON(ret);
-        finish_current_insert(trans, root->fs_info->extent_root, 0);
-        del_pending_extents(trans, root->fs_info->extent_root, 0);
        btrfs_free_path(path);
        return 0;
 }
@@ -1339,68 +821,278 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
        if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
            owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
                return 0;
-        ret = __btrfs_inc_extent_ref(trans, root, bytenr, 0, parent,
+        ret = __btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, parent,
                                     0, ref_root, 0, ref_generation,
                                     owner_objectid);
        return ret;
 }
-int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
+static int drop_delayed_ref(struct btrfs_trans_handle *trans,
-                         struct btrfs_root *root)
+                                        struct btrfs_root *root,
+                                        struct btrfs_delayed_ref_node *node)
+{
+        int ret = 0;
+        struct btrfs_delayed_ref *ref = btrfs_delayed_node_to_ref(node);
+        BUG_ON(node->ref_mod == 0);     /* a drop with a zero ref_mod is treated as a bug */
+        ret = __btrfs_free_extent(trans, root, node->bytenr, node->num_bytes,
+                                  node->parent, ref->root, ref->generation,
+                                  ref->owner_objectid, ref->pin, node->ref_mod); /* ref_mod is presumably the number of refs to drop -- TODO confirm */
+        return ret;     /* result of __btrfs_free_extent(), passed through unchanged */
+}
+/*
+ * helper function to actually process a single delayed ref entry
+ *
+ * node is either a head node (node->parent == (u64)-1) or an individual
+ * ref.  For a head node nothing is written to the extent tree: if
+ * insert_reserved is set, update_reserved_extents() is called for the
+ * range, then head->mutex is unlocked and 0 is returned.
+ *
+ * For BTRFS_ADD_DELAYED_REF, insert_reserved selects between recording
+ * the whole extent with __btrfs_alloc_reserved_extent() and adding
+ * node->ref_mod references with add_extent_ref(); a nonzero result from
+ * either one trips BUG_ON().  For BTRFS_DROP_DELAYED_REF the work is
+ * handed to drop_delayed_ref().
+ *
+ * Always returns 0.  NOTE(review): the value returned by
+ * drop_delayed_ref() is stored in ret but never checked or propagated;
+ * confirm that ignoring it is intended.
+ */
+static noinline int run_one_delayed_ref(struct btrfs_trans_handle *trans,
+                                        struct btrfs_root *root,
+                                        struct btrfs_delayed_ref_node *node,
+                                        int insert_reserved)
 {
-        u64 start;
-        u64 end;
        int ret;
+        struct btrfs_delayed_ref *ref;
+        if (node->parent == (u64)-1) {
+                struct btrfs_delayed_ref_head *head;
+                /*
+                 * we've hit the end of the chain and we were supposed
+                 * to insert this extent into the tree.  But, it got
+                 * deleted before we ever needed to insert it, so all
+                 * we have to do is clean up the accounting
+                 */
+                if (insert_reserved) {
+                        update_reserved_extents(root, node->bytenr,
+                                                node->num_bytes, 0);
+                }
+                head = btrfs_delayed_node_to_head(node);
+                mutex_unlock(&head->mutex);
+                return 0;
+        }
-        while(1) {
+        ref = btrfs_delayed_node_to_ref(node);
-                finish_current_insert(trans, root->fs_info->extent_root, 1);
+        if (ref->action == BTRFS_ADD_DELAYED_REF) {
-                del_pending_extents(trans, root->fs_info->extent_root, 1);
+                if (insert_reserved) {
+                        struct btrfs_key ins;
-                /* is there more work to do? */
+                        ins.objectid = node->bytenr;
-                ret = find_first_extent_bit(&root->fs_info->pending_del,
+                        ins.offset = node->num_bytes;
-                                            0, &start, &end, EXTENT_WRITEBACK);
+                        ins.type = BTRFS_EXTENT_ITEM_KEY;
-                if (!ret)
-                        continue;
+                        /* record the full extent allocation */
-                ret = find_first_extent_bit(&root->fs_info->extent_ins,
+                        ret = __btrfs_alloc_reserved_extent(trans, root,
-                                            0, &start, &end, EXTENT_WRITEBACK);
+                                        node->parent, ref->root,
-                if (!ret)
+                                        ref->generation, ref->owner_objectid,
-                        continue;
+                                        &ins, node->ref_mod);
-                break;
+                        update_reserved_extents(root, node->bytenr,
+                                                node->num_bytes, 0);
+                } else {
+                        /* add node->ref_mod references to the existing extent item */
+                        ret = add_extent_ref(trans, root, node->bytenr,
+                                     node->num_bytes,
+                                     node->parent, ref->root, ref->generation,
+                                     ref->owner_objectid, node->ref_mod);
+                }
+                BUG_ON(ret);
+        } else if (ref->action == BTRFS_DROP_DELAYED_REF) {
+                WARN_ON(insert_reserved);
+                ret = drop_delayed_ref(trans, root, node); /* NOTE(review): ret is not checked on this path */
        }
        return 0;
 }
-int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
+static noinline struct btrfs_delayed_ref_node *
-                            struct btrfs_root *root, u64 bytenr,
+select_delayed_ref(struct btrfs_delayed_ref_head *head)
-                            u64 num_bytes, u32 *refs)
 {
-        struct btrfs_path *path;
+        struct rb_node *node;
+        struct btrfs_delayed_ref_node *ref;
+        int action = BTRFS_ADD_DELAYED_REF;
+again:
+        /*
+         * select delayed ref of type BTRFS_ADD_DELAYED_REF first.
+         * this prevents the ref count from going down to zero when
+         * there are still pending delayed refs.
+         *
+         * Each pass walks backwards from the head node with rb_prev()
+         * and stops at the start of the tree or at the first node
+         * whose bytenr differs from the head.  The first pass returns
+         * the first BTRFS_ADD_DELAYED_REF it meets; if there is none,
+         * a second pass does the same for BTRFS_DROP_DELAYED_REF.
+         * NULL is returned when neither pass finds a ref.
+         */
+        node = rb_prev(&head->node.rb_node);
+        while (1) {
+                if (!node)
+                        break;
+                ref = rb_entry(node, struct btrfs_delayed_ref_node,
+                                rb_node);
+                if (ref->bytenr != head->node.bytenr)
+                        break;
+                if (btrfs_delayed_node_to_ref(ref)->action == action)
+                        return ref;
+                node = rb_prev(node);
+        }
+        if (action == BTRFS_ADD_DELAYED_REF) {
+                action = BTRFS_DROP_DELAYED_REF;
+                goto again;
+        }
+        return NULL;
+}
+static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
+                                       struct btrfs_root *root,
+                                       struct list_head *cluster)
+{
+        struct btrfs_delayed_ref_root *delayed_refs;
+        struct btrfs_delayed_ref_node *ref;
+        struct btrfs_delayed_ref_head *locked_ref = NULL;
        int ret;
-        struct btrfs_key key;
+        int count = 0;
-        struct extent_buffer *l;
+        int must_insert_reserved = 0;
-        struct btrfs_extent_item *item;
-        WARN_ON(num_bytes < root->sectorsize);
+        delayed_refs = &trans->transaction->delayed_refs;
-        path = btrfs_alloc_path();
+        while (1) {
-        path->reada = 1;
+                if (!locked_ref) {
-        key.objectid = bytenr;
+                        /*
+                         * pick a new head ref from the cluster list; the
+                         * loop ends once the list is empty
+                         */
-        key.offset = num_bytes;
+                        if (list_empty(cluster))
-        btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
+                                break;
-        ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path,
-                                0, 0);
+                        locked_ref = list_entry(cluster->next,
-        if (ret < 0)
+                                     struct btrfs_delayed_ref_head, cluster);
-                goto out;
-        if (ret != 0) {
+                        /* grab the lock that says we are going to process
-                btrfs_print_leaf(root, path->nodes[0]);
+                         * all the refs for this head */
-                printk(KERN_INFO "btrfs failed to find block number %llu\n",
+                        ret = btrfs_delayed_ref_lock(trans, locked_ref);
-                       (unsigned long long)bytenr);
-                BUG();
+                        /*
+                         * we may have dropped the spin lock to get the head
+                         * mutex lock, and that might have given someone else
+                         * time to free the head.  If that's true, it has been
+                         * removed from our list and we can move on.
+                         */
+                        if (ret == -EAGAIN) {
+                                locked_ref = NULL;
+                                count++;
+                                continue;
+                        }
+                }
+                /*
+                 * record the must insert reserved flag before we
+                 * drop the spin lock.
+                 *
+                 * The spin lock is delayed_refs->lock.  This function
+                 * never takes it before the first spin_unlock() below,
+                 * so it appears to rely on the caller holding it on
+                 * entry (TODO confirm).  It is retaken at the bottom of
+                 * every iteration, so it is held again when the loop
+                 * exits.
+                 */
+                must_insert_reserved = locked_ref->must_insert_reserved;
+                locked_ref->must_insert_reserved = 0;
+                /*
+                 * locked_ref is the head node, so we have to go one
+                 * node back for any delayed ref updates
+                 */
+                ref = select_delayed_ref(locked_ref);
+                if (!ref) {
+                        /* All delayed refs have been processed.  Go ahead
+                         * and send the head node to run_one_delayed_ref,
+                         * so that any accounting fixes can happen
+                         */
+                        ref = &locked_ref->node;
+                        list_del_init(&locked_ref->cluster);
+                        locked_ref = NULL;
+                }
+                ref->in_tree = 0;
+                rb_erase(&ref->rb_node, &delayed_refs->root);
+                delayed_refs->num_entries--;
+                spin_unlock(&delayed_refs->lock);
+                ret = run_one_delayed_ref(trans, root, ref,
+                                          must_insert_reserved);
+                BUG_ON(ret);
+                btrfs_put_delayed_ref(ref);
+                count++;
+                cond_resched();
+                spin_lock(&delayed_refs->lock);
+        }
+        return count;   /* nodes run (refs and heads) plus heads skipped on -EAGAIN */
+}
+/*
+ * this starts processing the delayed reference count updates and
+ * extent insertions we have queued up so far.
+ *
+ * count selects how much work is done:
+ *   0                  process everything in the tree at the start of
+ *                      the run (but not newly added entries); the
+ *                      target is set to twice delayed_refs->num_entries
+ *   (unsigned long)-1  run_all: after the main loop, keep retrying
+ *                      until the rbtree is empty, waiting on the mutex
+ *                      of any head that is still in the tree
+ *   anything else      a target number to process; in this mode the
+ *                      loop also stops as soon as fewer than 64 heads
+ *                      are ready
+ *
+ * If root is the extent root, the tree root is used instead.
+ * delayed_refs->lock is taken here and released before returning.
+ * Always returns 0; a negative result from run_clustered_refs() trips
+ * BUG_ON().
+ */
+int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
+                           struct btrfs_root *root, unsigned long count)
+{
+        struct rb_node *node;
+        struct btrfs_delayed_ref_root *delayed_refs;
+        struct btrfs_delayed_ref_node *ref;
+        struct list_head cluster;
+        int ret;
+        int run_all = count == (unsigned long)-1;
+        int run_most = 0;
+        if (root == root->fs_info->extent_root)
+                root = root->fs_info->tree_root;
+        delayed_refs = &trans->transaction->delayed_refs;
+        INIT_LIST_HEAD(&cluster);
+again:
+        spin_lock(&delayed_refs->lock);
+        if (count == 0) {
+                count = delayed_refs->num_entries * 2;
+                run_most = 1;
+        }
+        while (1) {
+                if (!(run_all || run_most) &&
+                    delayed_refs->num_heads_ready < 64)
+                        break;
+                /*
+                 * go find something we can process in the rbtree.  We start at
+                 * the beginning of the tree, and then build a cluster
+                 * of refs to process starting at the first one we are able to
+                 * lock
+                 */
+                ret = btrfs_find_ref_cluster(trans, &cluster,
+                                             delayed_refs->run_delayed_start);
+                if (ret)
+                        break;
+                ret = run_clustered_refs(trans, root, &cluster);
+                BUG_ON(ret < 0);
+                count -= min_t(unsigned long, ret, count);      /* never drops below zero */
+                if (count == 0)
+                        break;
+        }
+        if (run_all) {
+                node = rb_first(&delayed_refs->root);
+                if (!node)
+                        goto out;
+                count = (unsigned long)-1;
+                while (node) {
+                        ref = rb_entry(node, struct btrfs_delayed_ref_node,
+                                       rb_node);
+                        if (btrfs_delayed_ref_is_head(ref)) {
+                                struct btrfs_delayed_ref_head *head;
+                                head = btrfs_delayed_node_to_head(ref);
+                                atomic_inc(&ref->refs); /* keep the node alive while the spin lock is dropped */
+                                spin_unlock(&delayed_refs->lock);
+                                mutex_lock(&head->mutex);
+                                mutex_unlock(&head->mutex);     /* lock and unlock only to wait for the current holder */
+                                btrfs_put_delayed_ref(ref);
+                                cond_resched();
+                                goto again;
+                        }
+                        node = rb_next(node);
+                }
+                spin_unlock(&delayed_refs->lock);
+                schedule_timeout(1);    /* NOTE(review): no task state is set in the visible code before this call; confirm it really sleeps */
+                goto again;
        }
-        l = path->nodes[0];
-        item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
-        *refs = btrfs_extent_refs(l, item);
 out:
-        btrfs_free_path(path);
+        spin_unlock(&delayed_refs->lock);
        return 0;
 }
@@ -1624,7 +1316,7 @@ noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans,
        int refi = 0;
        int slot;
        int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
-                            u64, u64, u64, u64, u64, u64, u64, u64);
+                            u64, u64, u64, u64, u64, u64, u64, u64, u64);
        ref_root = btrfs_header_owner(buf);
        ref_generation = btrfs_header_generation(buf);
@@ -1696,12 +1388,19 @@ noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans,
                if (level == 0) {
                        btrfs_item_key_to_cpu(buf, &key, slot);
+                        fi = btrfs_item_ptr(buf, slot,
+                                            struct btrfs_file_extent_item);
+                        bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
+                        if (bytenr == 0)
+                                continue;
                        ret = process_func(trans, root, bytenr,
-                                           orig_buf->start, buf->start,
+                                   btrfs_file_extent_disk_num_bytes(buf, fi),
-                                           orig_root, ref_root,
+                                   orig_buf->start, buf->start,
-                                           orig_generation, ref_generation,
+                                   orig_root, ref_root,
-                                           key.objectid);
+                                   orig_generation, ref_generation,
+                                   key.objectid);
                        if (ret) {
                                faili = slot;
@@ -1709,7 +1408,7 @@ noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans,
                                goto fail;
                        }
                } else {
-                        ret = process_func(trans, root, bytenr,
+                        ret = process_func(trans, root, bytenr, buf->len,
                                           orig_buf->start, buf->start,
                                           orig_root, ref_root,
                                           orig_generation, ref_generation,
@@ -1786,17 +1485,17 @@ int btrfs_update_ref(struct btrfs_trans_handle *trans,
                        if (bytenr == 0)
                                continue;
                        ret = __btrfs_update_extent_ref(trans, root, bytenr,
-                                            orig_buf->start, buf->start,
+                                    btrfs_file_extent_disk_num_bytes(buf, fi),
-                                            orig_root, ref_root,
+                                    orig_buf->start, buf->start,
-                                            orig_generation, ref_generation,
+                                    orig_root, ref_root, orig_generation,
-                                            key.objectid);
+                                    ref_generation, key.objectid);
                        if (ret)
                                goto fail;
                } else {
                        bytenr = btrfs_node_blockptr(buf, slot);
                        ret = __btrfs_update_extent_ref(trans, root, bytenr,
-                                            orig_buf->start, buf->start,
+                                            buf->len, orig_buf->start,
-                                            orig_root, ref_root,
+                                            buf->start, orig_root, ref_root,
                                            orig_generation, ref_generation,
                                            level - 1);
                        if (ret)
@@ -1815,7 +1514,6 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans,
                                 struct btrfs_block_group_cache *cache)
 {
        int ret;
-        int pending_ret;
        struct btrfs_root *extent_root = root->fs_info->extent_root;
        unsigned long bi;
        struct extent_buffer *leaf;
@@ -1831,12 +1529,8 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans,
        btrfs_mark_buffer_dirty(leaf);
        btrfs_release_path(extent_root, path);
 fail:
-        finish_current_insert(trans, extent_root, 0);
-        pending_ret = del_pending_extents(trans, extent_root, 0);
        if (ret)
                return ret;
-        if (pending_ret)
-                return pending_ret;
        return 0;
 }
@@ -2361,6 +2055,8 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
                clear_extent_dirty(&fs_info->pinned_extents,
                                bytenr, bytenr + num - 1, GFP_NOFS);
        }
+        mutex_unlock(&root->fs_info->pinned_mutex);
        while (num > 0) {
                cache = btrfs_lookup_block_group(fs_info, bytenr);
                BUG_ON(!cache);
@@ -2452,8 +2148,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
        u64 end;
        int ret;
-        mutex_lock(&root->fs_info->pinned_mutex);
        while (1) {
+                mutex_lock(&root->fs_info->pinned_mutex);
                ret = find_first_extent_bit(unpin, 0, &start, &end,
                                            EXTENT_DIRTY);
                if (ret)
@@ -2461,209 +2157,21 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
                ret = btrfs_discard_extent(root, start, end + 1 - start);
+                /* unlocks the pinned mutex */
                btrfs_update_pinned_extents(root, start, end + 1 - start, 0);
                clear_extent_dirty(unpin, start, end, GFP_NOFS);
-                if (need_resched()) {
+                cond_resched();
-                        mutex_unlock(&root->fs_info->pinned_mutex);
-                        cond_resched();
-                        mutex_lock(&root->fs_info->pinned_mutex);
-                }
        }
        mutex_unlock(&root->fs_info->pinned_mutex);
        return ret;
 }
-static int finish_current_insert(struct btrfs_trans_handle *trans,
-                                 struct btrfs_root *extent_root, int all)
-{
-        u64 start;
-        u64 end;
-        u64 priv;
-        u64 search = 0;
-        struct btrfs_fs_info *info = extent_root->fs_info;
-        struct btrfs_path *path;
-        struct pending_extent_op *extent_op, *tmp;
-        struct list_head insert_list, update_list;
-        int ret;
-        int num_inserts = 0, max_inserts, restart = 0;
-        path = btrfs_alloc_path();
-        INIT_LIST_HEAD(&insert_list);
-        INIT_LIST_HEAD(&update_list);
-        max_inserts = extent_root->leafsize /
-                (2 * sizeof(struct btrfs_key) + 2 * sizeof(struct btrfs_item) +
-                 sizeof(struct btrfs_extent_ref) +
-                 sizeof(struct btrfs_extent_item));
-again:
-        mutex_lock(&info->extent_ins_mutex);
-        while (1) {
-                ret = find_first_extent_bit(&info->extent_ins, search, &start,
-                                            &end, EXTENT_WRITEBACK);
-                if (ret) {
-                        if (restart && !num_inserts &&
-                            list_empty(&update_list)) {
-                                restart = 0;
-                                search = 0;
-                                continue;
-                        }
-                        break;
-                }
-                ret = try_lock_extent(&info->extent_ins, start, end, GFP_NOFS);
-                if (!ret) {
-                        if (all)
-                                restart = 1;
-                        search = end + 1;
-                        if (need_resched()) {
-                                mutex_unlock(&info->extent_ins_mutex);
-                                cond_resched();
-                                mutex_lock(&info->extent_ins_mutex);
-                        }
-                        continue;
-                }
-                ret = get_state_private(&info->extent_ins, start, &priv);
-                BUG_ON(ret);
-                extent_op = (struct pending_extent_op *)(unsigned long) priv;
-                if (extent_op->type == PENDING_EXTENT_INSERT) {
-                        num_inserts++;
-                        list_add_tail(&extent_op->list, &insert_list);
-                        search = end + 1;
-                        if (num_inserts == max_inserts) {
-                                restart = 1;
-                                break;
-                        }
-                } else if (extent_op->type == PENDING_BACKREF_UPDATE) {
-                        list_add_tail(&extent_op->list, &update_list);
-                        search = end + 1;
-                } else {
-                        BUG();
-                }
-        }
-        /*
-         * process the update list, clear the writeback bit for it, and if
-         * somebody marked this thing for deletion then just unlock it and be
-         * done, the free_extents will handle it
-         */
-        list_for_each_entry_safe(extent_op, tmp, &update_list, list) {
-                clear_extent_bits(&info->extent_ins, extent_op->bytenr,
-                                  extent_op->bytenr + extent_op->num_bytes - 1,
-                                  EXTENT_WRITEBACK, GFP_NOFS);
-                if (extent_op->del) {
-                        list_del_init(&extent_op->list);
-                        unlock_extent(&info->extent_ins, extent_op->bytenr,
-                                      extent_op->bytenr + extent_op->num_bytes
-                                      - 1, GFP_NOFS);
-                        kfree(extent_op);
-                }
-        }
-        mutex_unlock(&info->extent_ins_mutex);
-        /*
-         * still have things left on the update list, go ahead an update
-         * everything
-         */
-        if (!list_empty(&update_list)) {
-                ret = update_backrefs(trans, extent_root, path, &update_list);
-                BUG_ON(ret);
-                /* we may have COW'ed new blocks, so lets start over */
-                if (all)
-                        restart = 1;
-        }
-        /*
-         * if no inserts need to be done, but we skipped some extents and we
-         * need to make sure everything is cleaned then reset everything and
-         * go back to the beginning
-         */
-        if (!num_inserts && restart) {
-                search = 0;
-                restart = 0;
-                INIT_LIST_HEAD(&update_list);
-                INIT_LIST_HEAD(&insert_list);
-                goto again;
-        } else if (!num_inserts) {
-                goto out;
-        }
-        /*
-         * process the insert extents list.  Again if we are deleting this
-         * extent, then just unlock it, pin down the bytes if need be, and be
-         * done with it.  Saves us from having to actually insert the extent
-         * into the tree and then subsequently come along and delete it
-         */
-        mutex_lock(&info->extent_ins_mutex);
-        list_for_each_entry_safe(extent_op, tmp, &insert_list, list) {
-                clear_extent_bits(&info->extent_ins, extent_op->bytenr,
-                                  extent_op->bytenr + extent_op->num_bytes - 1,
-                                  EXTENT_WRITEBACK, GFP_NOFS);
-                if (extent_op->del) {
-                        u64 used;
-                        list_del_init(&extent_op->list);
-                        unlock_extent(&info->extent_ins, extent_op->bytenr,
-                                      extent_op->bytenr + extent_op->num_bytes
-                                      - 1, GFP_NOFS);
-                        mutex_lock(&extent_root->fs_info->pinned_mutex);
-                        ret = pin_down_bytes(trans, extent_root,
-                                             extent_op->bytenr,
-                                             extent_op->num_bytes, 0);
-                        mutex_unlock(&extent_root->fs_info->pinned_mutex);
-                        spin_lock(&info->delalloc_lock);
-                        used = btrfs_super_bytes_used(&info->super_copy);
-                        btrfs_set_super_bytes_used(&info->super_copy,
-                                        used - extent_op->num_bytes);
-                        used = btrfs_root_used(&extent_root->root_item);
-                        btrfs_set_root_used(&extent_root->root_item,
-                                        used - extent_op->num_bytes);
-                        spin_unlock(&info->delalloc_lock);
-                        ret = update_block_group(trans, extent_root,
-                                                 extent_op->bytenr,
-                                                 extent_op->num_bytes,
-                                                 0, ret > 0);
-                        BUG_ON(ret);
-                        kfree(extent_op);
-                        num_inserts--;
-                }
-        }
-        mutex_unlock(&info->extent_ins_mutex);
-        ret = insert_extents(trans, extent_root, path, &insert_list,
-                             num_inserts);
-        BUG_ON(ret);
-        /*
-         * if restart is set for whatever reason we need to go back and start
-         * searching through the pending list again.
-         *
-         * We just inserted some extents, which could have resulted in new
-         * blocks being allocated, which would result in new blocks needing
-         * updates, so if all is set we _must_ restart to get the updated
-         * blocks.
-         */
-        if (restart || all) {
-                INIT_LIST_HEAD(&insert_list);
-                INIT_LIST_HEAD(&update_list);
-                search = 0;
-                restart = 0;
-                num_inserts = 0;
-                goto again;
-        }
-out:
-        btrfs_free_path(path);
-        return 0;
-}
 static int pin_down_bytes(struct btrfs_trans_handle *trans,
                          struct btrfs_root *root,
-                          u64 bytenr, u64 num_bytes, int is_data)
+                          struct btrfs_path *path,
+                          u64 bytenr, u64 num_bytes, int is_data,
+                          struct extent_buffer **must_clean)
 {
        int err = 0;
        struct extent_buffer *buf;
@@ -2686,17 +2194,19 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans,
                u64 header_transid = btrfs_header_generation(buf);
                if (header_owner != BTRFS_TREE_LOG_OBJECTID &&
                    header_owner != BTRFS_TREE_RELOC_OBJECTID &&
+                    header_owner != BTRFS_DATA_RELOC_TREE_OBJECTID &&
                    header_transid == trans->transid &&
                    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
-                        clean_tree_block(NULL, root, buf);
+                        *must_clean = buf;
-                        btrfs_tree_unlock(buf);
-                        free_extent_buffer(buf);
                        return 1;
                }
                btrfs_tree_unlock(buf);
        }
        free_extent_buffer(buf);
 pinit:
+        btrfs_set_path_blocking(path);
+        mutex_lock(&root->fs_info->pinned_mutex);
+        /* unlocks the pinned mutex */
        btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
        BUG_ON(err < 0);
@@ -2710,7 +2220,8 @@ static int __free_extent(struct btrfs_trans_handle *trans,
                         struct btrfs_root *root,
                         u64 bytenr, u64 num_bytes, u64 parent,
                         u64 root_objectid, u64 ref_generation,
-                         u64 owner_objectid, int pin, int mark_free)
+                         u64 owner_objectid, int pin, int mark_free,
+                         int refs_to_drop)
 {
        struct btrfs_path *path;
        struct btrfs_key key;
@@ -2732,6 +2243,7 @@ static int __free_extent(struct btrfs_trans_handle *trans,
                return -ENOMEM;
        path->reada = 1;
+        path->leave_spinning = 1;
        ret = lookup_extent_backref(trans, extent_root, path,
                                    bytenr, parent, root_objectid,
                                    ref_generation, owner_objectid, 1);
@@ -2753,9 +2265,11 @@ static int __free_extent(struct btrfs_trans_handle *trans,
                                break;
                }
                if (!found_extent) {
-                        ret = remove_extent_backref(trans, extent_root, path);
+                        ret = remove_extent_backref(trans, extent_root, path,
+                                                    refs_to_drop);
                        BUG_ON(ret);
                        btrfs_release_path(extent_root, path);
+                        path->leave_spinning = 1;
                        ret = btrfs_search_slot(trans, extent_root,
                                                &key, path, -1, 1);
                        if (ret) {
@@ -2771,8 +2285,9 @@ static int __free_extent(struct btrfs_trans_handle *trans,
                btrfs_print_leaf(extent_root, path->nodes[0]);
                WARN_ON(1);
                printk(KERN_ERR "btrfs unable to find ref byte nr %llu "
-                       "root %llu gen %llu owner %llu\n",
+                       "parent %llu root %llu gen %llu owner %llu\n",
                       (unsigned long long)bytenr,
+                       (unsigned long long)parent,
                       (unsigned long long)root_objectid,
                       (unsigned long long)ref_generation,
                       (unsigned long long)owner_objectid);
@@ -2782,17 +2297,23 @@ static int __free_extent(struct btrfs_trans_handle *trans,
        ei = btrfs_item_ptr(leaf, extent_slot,
                            struct btrfs_extent_item);
        refs = btrfs_extent_refs(leaf, ei);
-        BUG_ON(refs == 0);
-        refs -= 1;
-        btrfs_set_extent_refs(leaf, ei, refs);
+        /*
+         * we're not allowed to delete the extent item if there
+         * are other delayed ref updates pending
+         */
+        BUG_ON(refs < refs_to_drop);
+        refs -= refs_to_drop;
+        btrfs_set_extent_refs(leaf, ei, refs);
        btrfs_mark_buffer_dirty(leaf);
-        if (refs == 0 && found_extent && path->slots[0] == extent_slot + 1) {
+        if (refs == 0 && found_extent &&
+            path->slots[0] == extent_slot + 1) {
                struct btrfs_extent_ref *ref;
                ref = btrfs_item_ptr(leaf, path->slots[0],
                                     struct btrfs_extent_ref);
-                BUG_ON(btrfs_ref_num_refs(leaf, ref) != 1);
+                BUG_ON(btrfs_ref_num_refs(leaf, ref) != refs_to_drop);
                /* if the back ref and the extent are next to each other
                 * they get deleted below in one shot
                 */
@@ -2800,11 +2321,13 @@ static int __free_extent(struct btrfs_trans_handle *trans,
                num_to_del = 2;
        } else if (found_extent) {
                /* otherwise delete the extent back ref */
-                ret = remove_extent_backref(trans, extent_root, path);
+                ret = remove_extent_backref(trans, extent_root, path,
+                                            refs_to_drop);
                BUG_ON(ret);
                /* if refs are 0, we need to setup the path for deletion */
                if (refs == 0) {
                        btrfs_release_path(extent_root, path);
+                        path->leave_spinning = 1;
                        ret = btrfs_search_slot(trans, extent_root, &key, path,
                                                -1, 1);
                        BUG_ON(ret);
@@ -2814,16 +2337,18 @@ static int __free_extent(struct btrfs_trans_handle *trans,
        if (refs == 0) {
                u64 super_used;
                u64 root_used;
+                struct extent_buffer *must_clean = NULL;
                if (pin) {
-                        mutex_lock(&root->fs_info->pinned_mutex);
+                        ret = pin_down_bytes(trans, root, path,
-                        ret = pin_down_bytes(trans, root, bytenr, num_bytes,
+                                bytenr, num_bytes,
-                                owner_objectid >= BTRFS_FIRST_FREE_OBJECTID);
+                                owner_objectid >= BTRFS_FIRST_FREE_OBJECTID,
-                        mutex_unlock(&root->fs_info->pinned_mutex);
+                                &must_clean);
                        if (ret > 0)
                                mark_free = 1;
                        BUG_ON(ret < 0);
                }
                /* block accounting for super block */
                spin_lock(&info->delalloc_lock);
                super_used = btrfs_super_bytes_used(&info->super_copy);
@@ -2835,14 +2360,34 @@ static int __free_extent(struct btrfs_trans_handle *trans,
                btrfs_set_root_used(&root->root_item,
                                           root_used - num_bytes);
                spin_unlock(&info->delalloc_lock);
+                /*
+                 * it is going to be very rare for someone to be waiting
+                 * on the block we're freeing.  del_items might need to
+                 * schedule, so rather than get fancy, just force it
+                 * to blocking here
+                 */
+                if (must_clean)
+                        btrfs_set_lock_blocking(must_clean);
                ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
                                      num_to_del);
                BUG_ON(ret);
                btrfs_release_path(extent_root, path);
+                if (must_clean) {
+                        clean_tree_block(NULL, root, must_clean);
+                        btrfs_tree_unlock(must_clean);
+                        free_extent_buffer(must_clean);
+                }
                if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
                        ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
                        BUG_ON(ret);
+                } else {
+                        invalidate_mapping_pages(info->btree_inode->i_mapping,
+                             bytenr >> PAGE_CACHE_SHIFT,
+                             (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT);
                }
                ret = update_block_group(trans, root, bytenr, num_bytes, 0,
@@ -2850,218 +2395,103 @@ static int __free_extent(struct btrfs_trans_handle *trans,
                BUG_ON(ret);
        }
        btrfs_free_path(path);
-        finish_current_insert(trans, extent_root, 0);
        return ret;
 }
 /*
- * find all the blocks marked as pending in the radix tree and remove
+ * remove an extent from the root, returns 0 on success
- * them from the extent map
 */
-static int del_pending_extents(struct btrfs_trans_handle *trans,
+static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
-                               struct btrfs_root *extent_root, int all)
+                                        struct btrfs_root *root,
+                                        u64 bytenr, u64 num_bytes, u64 parent,
+                                        u64 root_objectid, u64 ref_generation,
+                                        u64 owner_objectid, int pin,
+                                        int refs_to_drop)
 {
-        int ret;
+        WARN_ON(num_bytes < root->sectorsize);
-        int err = 0;
-        u64 start;
-        u64 end;
-        u64 priv;
-        u64 search = 0;
-        int nr = 0, skipped = 0;
-        struct extent_io_tree *pending_del;
-        struct extent_io_tree *extent_ins;
-        struct pending_extent_op *extent_op;
-        struct btrfs_fs_info *info = extent_root->fs_info;
-        struct list_head delete_list;
-        INIT_LIST_HEAD(&delete_list);
-        extent_ins = &extent_root->fs_info->extent_ins;
-        pending_del = &extent_root->fs_info->pending_del;
-again:
-        mutex_lock(&info->extent_ins_mutex);
-        while (1) {
-                ret = find_first_extent_bit(pending_del, search, &start, &end,
-                                            EXTENT_WRITEBACK);
-                if (ret) {
-                        if (all && skipped && !nr) {
-                                search = 0;
-                                skipped = 0;
-                                continue;
-                        }
-                        mutex_unlock(&info->extent_ins_mutex);
-                        break;
-                }
-                ret = try_lock_extent(extent_ins, start, end, GFP_NOFS);
-                if (!ret) {
-                        search = end+1;
-                        skipped = 1;
-                        if (need_resched()) {
-                                mutex_unlock(&info->extent_ins_mutex);
-                                cond_resched();
-                                mutex_lock(&info->extent_ins_mutex);
-                        }
-                        continue;
-                }
-                BUG_ON(ret < 0);
-                ret = get_state_private(pending_del, start, &priv);
-                BUG_ON(ret);
-                extent_op = (struct pending_extent_op *)(unsigned long)priv;
-                clear_extent_bits(pending_del, start, end, EXTENT_WRITEBACK,
-                                  GFP_NOFS);
-                if (!test_range_bit(extent_ins, start, end,
-                                    EXTENT_WRITEBACK, 0)) {
-                        list_add_tail(&extent_op->list, &delete_list);
-                        nr++;
-                } else {
-                        kfree(extent_op);
-                        ret = get_state_private(&info->extent_ins, start,
-                                                &priv);
-                        BUG_ON(ret);
-                        extent_op = (struct pending_extent_op *)
-                                                (unsigned long)priv;
-                        clear_extent_bits(&info->extent_ins, start, end,
-                                          EXTENT_WRITEBACK, GFP_NOFS);
-                        if (extent_op->type == PENDING_BACKREF_UPDATE) {
-                                list_add_tail(&extent_op->list, &delete_list);
-                                search = end + 1;
-                                nr++;
-                                continue;
-                        }
-                        mutex_lock(&extent_root->fs_info->pinned_mutex);
-                        ret = pin_down_bytes(trans, extent_root, start,
-                                             end + 1 - start, 0);
-                        mutex_unlock(&extent_root->fs_info->pinned_mutex);
-                        ret = update_block_group(trans, extent_root, start,
-                                                end + 1 - start, 0, ret > 0);
-                        unlock_extent(extent_ins, start, end, GFP_NOFS);
-                        BUG_ON(ret);
-                        kfree(extent_op);
-                }
-                if (ret)
-                        err = ret;
-                search = end + 1;
-                if (need_resched()) {
-                        mutex_unlock(&info->extent_ins_mutex);
-                        cond_resched();
-                        mutex_lock(&info->extent_ins_mutex);
-                }
-        }
-        if (nr) {
+        /*
-                ret = free_extents(trans, extent_root, &delete_list);
+         * if metadata always pin
-                BUG_ON(ret);
+         * if data pin when any transaction has committed this
-        }
+         */
+        if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID ||
+            ref_generation != trans->transid)
+                pin = 1;
-        if (all && skipped) {
+        if (ref_generation != trans->transid)
-                INIT_LIST_HEAD(&delete_list);
+                pin = 1;
-                search = 0;
-                nr = 0;
-                goto again;
-        }
-        if (!err)
+        return __free_extent(trans, root, bytenr, num_bytes, parent,
-                finish_current_insert(trans, extent_root, 0);
+                            root_objectid, ref_generation,
-        return err;
+                            owner_objectid, pin, pin == 0, refs_to_drop);
 }
 /*
- * remove an extent from the root, returns 0 on success
+ * when we free an extent, it is possible (and likely) that we free the last
+ * delayed ref for that extent as well.  This searches the delayed ref tree for
+ * a given extent, and if there are no other delayed refs to be processed, it
+ * removes it from the tree.
 */
-static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
+static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
-                               struct btrfs_root *root,
+                                      struct btrfs_root *root, u64 bytenr)
-                               u64 bytenr, u64 num_bytes, u64 parent,
-                               u64 root_objectid, u64 ref_generation,
-                               u64 owner_objectid, int pin)
 {
-        struct btrfs_root *extent_root = root->fs_info->extent_root;
+        struct btrfs_delayed_ref_head *head;
-        int pending_ret;
+        struct btrfs_delayed_ref_root *delayed_refs;
+        struct btrfs_delayed_ref_node *ref;
+        struct rb_node *node;
        int ret;
-        WARN_ON(num_bytes < root->sectorsize);
+        delayed_refs = &trans->transaction->delayed_refs;
-        if (root == extent_root) {
+        spin_lock(&delayed_refs->lock);
-                struct pending_extent_op *extent_op = NULL;
+        head = btrfs_find_delayed_ref_head(trans, bytenr);
+        if (!head)
-                mutex_lock(&root->fs_info->extent_ins_mutex);
+                goto out;
-                if (test_range_bit(&root->fs_info->extent_ins, bytenr,
-                                bytenr + num_bytes - 1, EXTENT_WRITEBACK, 0)) {
-                        u64 priv;
-                        ret = get_state_private(&root->fs_info->extent_ins,
-                                                bytenr, &priv);
-                        BUG_ON(ret);
-                        extent_op = (struct pending_extent_op *)
-                                                (unsigned long)priv;
-                        extent_op->del = 1;
+        node = rb_prev(&head->node.rb_node);
-                        if (extent_op->type == PENDING_EXTENT_INSERT) {
+        if (!node)
-                                mutex_unlock(&root->fs_info->extent_ins_mutex);
+                goto out;
-                                return 0;
-                        }
-                }
-                if (extent_op) {
+        ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
-                        ref_generation = extent_op->orig_generation;
-                        parent = extent_op->orig_parent;
-                }
-                extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
+        /* there are still entries for this ref, we can't drop it */
-                BUG_ON(!extent_op);
+        if (ref->bytenr == bytenr)
+                goto out;
-                extent_op->type = PENDING_EXTENT_DELETE;
-                extent_op->bytenr = bytenr;
-                extent_op->num_bytes = num_bytes;
-                extent_op->parent = parent;
-                extent_op->orig_parent = parent;
-                extent_op->generation = ref_generation;
-                extent_op->orig_generation = ref_generation;
-                extent_op->level = (int)owner_objectid;
-                INIT_LIST_HEAD(&extent_op->list);
-                extent_op->del = 0;
-                set_extent_bits(&root->fs_info->pending_del,
-                                bytenr, bytenr + num_bytes - 1,
-                                EXTENT_WRITEBACK, GFP_NOFS);
-                set_state_private(&root->fs_info->pending_del,
-                                  bytenr, (unsigned long)extent_op);
-                mutex_unlock(&root->fs_info->extent_ins_mutex);
-                return 0;
-        }
-        /* if metadata always pin */
-        if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
-                if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
-                        mutex_lock(&root->fs_info->pinned_mutex);
-                        btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
-                        mutex_unlock(&root->fs_info->pinned_mutex);
-                        update_reserved_extents(root, bytenr, num_bytes, 0);
-                        return 0;
-                }
-                pin = 1;
-        }
-        /* if data pin when any transaction has committed this */
+        /*
-        if (ref_generation != trans->transid)
+         * waiting for the lock here would deadlock.  If someone else has it
-                pin = 1;
+         * locked they are already in the process of dropping it anyway
+         */
+        if (!mutex_trylock(&head->mutex))
+                goto out;
-        ret = __free_extent(trans, root, bytenr, num_bytes, parent,
+        /*
-                            root_objectid, ref_generation,
+         * at this point we have a head with no other entries.  Go
-                            owner_objectid, pin, pin == 0);
+         * ahead and process it.
+         */
+        head->node.in_tree = 0;
+        rb_erase(&head->node.rb_node, &delayed_refs->root);
-        finish_current_insert(trans, root->fs_info->extent_root, 0);
+        delayed_refs->num_entries--;
-        pending_ret = del_pending_extents(trans, root->fs_info->extent_root, 0);
-        return ret ? ret : pending_ret;
+        /*
+         * we don't take a ref on the node because we're removing it from the
+         * tree, so we just steal the ref the tree was holding.
+         */
+        delayed_refs->num_heads--;
+        if (list_empty(&head->cluster))
+                delayed_refs->num_heads_ready--;
+        list_del_init(&head->cluster);
+        spin_unlock(&delayed_refs->lock);
+        ret = run_one_delayed_ref(trans, root->fs_info->tree_root,
+                                  &head->node, head->must_insert_reserved);
+        BUG_ON(ret);
+        btrfs_put_delayed_ref(&head->node);
+        return 0;
+out:
+        spin_unlock(&delayed_refs->lock);
+        return 0;
 }
 int btrfs_free_extent(struct btrfs_trans_handle *trans,
@@ -3072,9 +2502,30 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
 {
        int ret;
-        ret = __btrfs_free_extent(trans, root, bytenr, num_bytes, parent,
+        /*
-                                  root_objectid, ref_generation,
+         * tree log blocks never actually go into the extent allocation
-                                  owner_objectid, pin);
+         * tree, just update pinning info and exit early.
+         *
+         * data extents referenced by the tree log do need to have
+         * their reference counts bumped.
+         */
+        if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID &&
+            owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
+                mutex_lock(&root->fs_info->pinned_mutex);
+                /* unlocks the pinned mutex */
+                btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
+                update_reserved_extents(root, bytenr, num_bytes, 0);
+                ret = 0;
+        } else {
+                ret = btrfs_add_delayed_ref(trans, bytenr, num_bytes, parent,
+                                       root_objectid, ref_generation,
+                                       owner_objectid,
+                                       BTRFS_DROP_DELAYED_REF, 1);
+                BUG_ON(ret);
+                ret = check_ref_cleanup(trans, root, bytenr);
+                BUG_ON(ret);
+        }
        return ret;
 }
@@ -3475,10 +2926,10 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
 static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
                                         struct btrfs_root *root, u64 parent,
                                         u64 root_objectid, u64 ref_generation,
-                                         u64 owner, struct btrfs_key *ins)
+                                         u64 owner, struct btrfs_key *ins,
+                                         int ref_mod)
 {
        int ret;
-        int pending_ret;
        u64 super_used;
        u64 root_used;
        u64 num_bytes = ins->offset;
@@ -3503,33 +2954,6 @@ static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
        btrfs_set_root_used(&root->root_item, root_used + num_bytes);
        spin_unlock(&info->delalloc_lock);
-        if (root == extent_root) {
-                struct pending_extent_op *extent_op;
-                extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
-                BUG_ON(!extent_op);
-                extent_op->type = PENDING_EXTENT_INSERT;
-                extent_op->bytenr = ins->objectid;
-                extent_op->num_bytes = ins->offset;
-                extent_op->parent = parent;
-                extent_op->orig_parent = 0;
-                extent_op->generation = ref_generation;
-                extent_op->orig_generation = 0;
-                extent_op->level = (int)owner;
-                INIT_LIST_HEAD(&extent_op->list);
-                extent_op->del = 0;
-                mutex_lock(&root->fs_info->extent_ins_mutex);
-                set_extent_bits(&root->fs_info->extent_ins, ins->objectid,
-                                ins->objectid + ins->offset - 1,
-                                EXTENT_WRITEBACK, GFP_NOFS);
-                set_state_private(&root->fs_info->extent_ins,
-                                  ins->objectid, (unsigned long)extent_op);
-                mutex_unlock(&root->fs_info->extent_ins_mutex);
-                goto update_block;
-        }
        memcpy(&keys[0], ins, sizeof(*ins));
        keys[1].objectid = ins->objectid;
        keys[1].type = BTRFS_EXTENT_REF_KEY;
@@ -3540,37 +2964,31 @@ static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
        path = btrfs_alloc_path();
        BUG_ON(!path);
+        path->leave_spinning = 1;
        ret = btrfs_insert_empty_items(trans, extent_root, path, keys,
                                       sizes, 2);
        BUG_ON(ret);
        extent_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
                                     struct btrfs_extent_item);
-        btrfs_set_extent_refs(path->nodes[0], extent_item, 1);
+        btrfs_set_extent_refs(path->nodes[0], extent_item, ref_mod);
        ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
                             struct btrfs_extent_ref);
        btrfs_set_ref_root(path->nodes[0], ref, root_objectid);
        btrfs_set_ref_generation(path->nodes[0], ref, ref_generation);
        btrfs_set_ref_objectid(path->nodes[0], ref, owner);
-        btrfs_set_ref_num_refs(path->nodes[0], ref, 1);
+        btrfs_set_ref_num_refs(path->nodes[0], ref, ref_mod);
        btrfs_mark_buffer_dirty(path->nodes[0]);
        trans->alloc_exclude_start = 0;
        trans->alloc_exclude_nr = 0;
        btrfs_free_path(path);
-        finish_current_insert(trans, extent_root, 0);
-        pending_ret = del_pending_extents(trans, extent_root, 0);
        if (ret)
                goto out;
-        if (pending_ret) {
-                ret = pending_ret;
-                goto out;
-        }
-update_block:
        ret = update_block_group(trans, root, ins->objectid,
                                 ins->offset, 1, 0);
        if (ret) {
@@ -3592,9 +3010,12 @@ int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
        if (root_objectid == BTRFS_TREE_LOG_OBJECTID)
                return 0;
-        ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid,
-                                            ref_generation, owner, ins);
+        ret = btrfs_add_delayed_ref(trans, ins->objectid,
-        update_reserved_extents(root, ins->objectid, ins->offset, 0);
+                                    ins->offset, parent, root_objectid,
+                                    ref_generation, owner,
+                                    BTRFS_ADD_DELAYED_EXTENT, 0);
+        BUG_ON(ret);
        return ret;
 }
@@ -3621,7 +3042,7 @@ int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
        BUG_ON(ret);
        put_block_group(block_group);
        ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid,
-                                            ref_generation, owner, ins);
+                                            ref_generation, owner, ins, 1);
        return ret;
 }
@@ -3640,20 +3061,18 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
                       u64 search_end, struct btrfs_key *ins, u64 data)
 {
        int ret;
        ret = __btrfs_reserve_extent(trans, root, num_bytes,
                                     min_alloc_size, empty_size, hint_byte,
                                     search_end, ins, data);
        BUG_ON(ret);
        if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
-                ret = __btrfs_alloc_reserved_extent(trans, root, parent,
+                ret = btrfs_add_delayed_ref(trans, ins->objectid,
-                                        root_objectid, ref_generation,
+                                            ins->offset, parent, root_objectid,
-                                        owner_objectid, ins);
+                                            ref_generation, owner_objectid,
+                                            BTRFS_ADD_DELAYED_EXTENT, 0);
                BUG_ON(ret);
-        } else {
-                update_reserved_extents(root, ins->objectid, ins->offset, 1);
        }
+        update_reserved_extents(root, ins->objectid, ins->offset, 1);
        return ret;
 }
@@ -3789,7 +3208,7 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
                fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
-                ret = __btrfs_free_extent(trans, root, disk_bytenr,
+                ret = btrfs_free_extent(trans, root, disk_bytenr,
                                btrfs_file_extent_disk_num_bytes(leaf, fi),
                                leaf->start, leaf_owner, leaf_generation,
                                key.objectid, 0);
@@ -3829,7 +3248,7 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
         */
        for (i = 0; i < ref->nritems; i++) {
                info = ref->extents + sorted[i].slot;
-                ret = __btrfs_free_extent(trans, root, info->bytenr,
+                ret = btrfs_free_extent(trans, root, info->bytenr,
                                          info->num_bytes, ref->bytenr,
                                          ref->owner, ref->generation,
                                          info->objectid, 0);
@@ -3846,12 +3265,13 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
        return 0;
 }
-static int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start,
+static int drop_snap_lookup_refcount(struct btrfs_trans_handle *trans,
+                                     struct btrfs_root *root, u64 start,
                                     u64 len, u32 *refs)
 {
        int ret;
-        ret = btrfs_lookup_extent_ref(NULL, root, start, len, refs);
+        ret = btrfs_lookup_extent_ref(trans, root, start, len, refs);
        BUG_ON(ret);
 #if 0 /* some debugging code in case we see problems here */
@@ -3959,7 +3379,8 @@ static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans,
                 * we just decrement it below and don't update any
                 * of the refs the leaf points to.
                 */
-                ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs);
+                ret = drop_snap_lookup_refcount(trans, root, bytenr,
+                                                blocksize, &refs);
                BUG_ON(ret);
                if (refs != 1)
                        continue;
@@ -4010,7 +3431,7 @@ static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans,
         */
        for (i = 0; i < refi; i++) {
                bytenr = sorted[i].bytenr;
-                ret = __btrfs_free_extent(trans, root, bytenr,
+                ret = btrfs_free_extent(trans, root, bytenr,
                                        blocksize, eb->start,
                                        root_owner, root_gen, 0, 1);
                BUG_ON(ret);
@@ -4053,7 +3474,7 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
        WARN_ON(*level < 0);
        WARN_ON(*level >= BTRFS_MAX_LEVEL);
-        ret = drop_snap_lookup_refcount(root, path->nodes[*level]->start,
+        ret = drop_snap_lookup_refcount(trans, root, path->nodes[*level]->start,
                                path->nodes[*level]->len, &refs);
        BUG_ON(ret);
        if (refs > 1)
@@ -4104,7 +3525,8 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
                ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
                blocksize = btrfs_level_size(root, *level - 1);
-                ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs);
+                ret = drop_snap_lookup_refcount(trans, root, bytenr,
+                                                blocksize, &refs);
                BUG_ON(ret);
                /*
@@ -4119,7 +3541,7 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
                        root_gen = btrfs_header_generation(parent);
                        path->slots[*level]++;
-                        ret = __btrfs_free_extent(trans, root, bytenr,
+                        ret = btrfs_free_extent(trans, root, bytenr,
                                                blocksize, parent->start,
                                                root_owner, root_gen,
                                                *level - 1, 1);
@@ -4165,7 +3587,7 @@ out:
         * cleanup and free the reference on the last node
         * we processed
         */
-        ret = __btrfs_free_extent(trans, root, bytenr, blocksize,
+        ret = btrfs_free_extent(trans, root, bytenr, blocksize,
                                  parent->start, root_owner, root_gen,
                                  *level, 1);
        free_extent_buffer(path->nodes[*level]);
@@ -4354,6 +3776,7 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
        struct btrfs_path *path;
        int i;
        int orig_level;
+        int update_count;
        struct btrfs_root_item *root_item = &root->root_item;
        WARN_ON(!mutex_is_locked(&root->fs_info->drop_mutex));
@@ -4395,6 +3818,7 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
                }
        }
        while (1) {
+                unsigned long update;
                wret = walk_down_tree(trans, root, path, &level);
                if (wret > 0)
                        break;
@@ -4407,12 +3831,21 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
                        break;
                if (wret < 0)
                        ret = wret;
-                if (trans->transaction->in_commit) {
+                if (trans->transaction->in_commit ||
+                    trans->transaction->delayed_refs.flushing) {
                        ret = -EAGAIN;
                        break;
                }
                atomic_inc(&root->fs_info->throttle_gen);
                wake_up(&root->fs_info->transaction_throttle);
+                for (update_count = 0; update_count < 16; update_count++) {
+                        update = trans->delayed_ref_updates;
+                        trans->delayed_ref_updates = 0;
+                        if (update)
+                                btrfs_run_delayed_refs(trans, root, update);
+                        else
+                                break;
+                }
        }
        for (i = 0; i <= orig_level; i++) {
                if (path->nodes[i]) {
@@ -5457,6 +4890,7 @@ static noinline int replace_extents_in_leaf(struct btrfs_trans_handle *trans,
                                        root->root_key.objectid,
                                        trans->transid, key.objectid);
                BUG_ON(ret);
                ret = btrfs_free_extent(trans, root,
                                        bytenr, num_bytes, leaf->start,
                                        btrfs_header_owner(leaf),
@@ -5768,9 +5202,6 @@ static noinline int relocate_tree_block(struct btrfs_trans_handle *trans,
                                ref_path, NULL, NULL);
        BUG_ON(ret);
-        if (root == root->fs_info->extent_root)
-                btrfs_extent_post_op(trans, root);
        return 0;
 }
@@ -6038,6 +5469,7 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
        if (!path)
                return -ENOMEM;
+        path->leave_spinning = 1;
        ret = btrfs_insert_empty_inode(trans, root, path, objectid);
        if (ret)
                goto out;
@@ -6208,6 +5640,9 @@ again:
        btrfs_remove_leaf_refs(info->tree_root, (u64)-1, 1);
        mutex_unlock(&root->fs_info->cleaner_mutex);
+        trans = btrfs_start_transaction(info->tree_root, 1);
+        btrfs_commit_transaction(trans, info->tree_root);
        while (1) {
                ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
                if (ret < 0)
@@ -6466,7 +5901,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
        extent_root = root->fs_info->extent_root;
-        root->fs_info->last_trans_new_blockgroup = trans->transid;
+        root->fs_info->last_trans_log_full_commit = trans->transid;
        cache = kzalloc(sizeof(*cache), GFP_NOFS);
        if (!cache)
@@ -6500,9 +5935,6 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
                                sizeof(cache->item));
        BUG_ON(ret);
-        finish_current_insert(trans, extent_root, 0);
-        ret = del_pending_extents(trans, extent_root, 0);
-        BUG_ON(ret);
        set_avail_alloc_bits(extent_root->fs_info, type);
        return 0;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index ebe6b29e6069..08085af089e2 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3124,20 +3124,15 @@ void free_extent_buffer(struct extent_buffer *eb)
 int clear_extent_buffer_dirty(struct extent_io_tree *tree,
                              struct extent_buffer *eb)
 {
-        int set;
        unsigned long i;
        unsigned long num_pages;
        struct page *page;
-        u64 start = eb->start;
-        u64 end = start + eb->len - 1;
-        set = clear_extent_dirty(tree, start, end, GFP_NOFS);
        num_pages = num_extent_pages(eb->start, eb->len);
        for (i = 0; i < num_pages; i++) {
                page = extent_buffer_page(eb, i);
-                if (!set && !PageDirty(page))
+                if (!PageDirty(page))
                        continue;
                lock_page(page);
@@ -3146,22 +3141,6 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
                else
                        set_page_private(page, EXTENT_PAGE_PRIVATE);
-                /*
-                 * if we're on the last page or the first page and the
-                 * block isn't aligned on a page boundary, do extra checks
-                 * to make sure we don't clean page that is partially dirty
-                 */
-                if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
-                    ((i == num_pages - 1) &&
-                     ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
-                        start = (u64)page->index << PAGE_CACHE_SHIFT;
-                        end  = start + PAGE_CACHE_SIZE - 1;
-                        if (test_range_bit(tree, start, end,
-                                           EXTENT_DIRTY, 0)) {
-                                unlock_page(page);
-                                continue;
-                        }
-                }
                clear_page_dirty_for_io(page);
                spin_lock_irq(&page->mapping->tree_lock);
                if (!PageDirty(page)) {
@@ -3187,29 +3166,13 @@ int set_extent_buffer_dirty(struct extent_io_tree *tree,
 {
        unsigned long i;
        unsigned long num_pages;
+        int was_dirty = 0;
+        was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
        num_pages = num_extent_pages(eb->start, eb->len);
-        for (i = 0; i < num_pages; i++) {
+        for (i = 0; i < num_pages; i++)
-                struct page *page = extent_buffer_page(eb, i);
-                /* writepage may need to do something special for the
-                 * first page, we have to make sure page->private is
-                 * properly set.  releasepage may drop page->private
-                 * on us if the page isn't already dirty.
-                 */
-                lock_page(page);
-                if (i == 0) {
-                        set_page_extent_head(page, eb->len);
-                } else if (PagePrivate(page) &&
-                           page->private != EXTENT_PAGE_PRIVATE) {
-                        set_page_extent_mapped(page);
-                }
                __set_page_dirty_nobuffers(extent_buffer_page(eb, i));
-                set_extent_dirty(tree, page_offset(page),
+        return was_dirty;
-                                 page_offset(page) + PAGE_CACHE_SIZE - 1,
-                                 GFP_NOFS);
-                unlock_page(page);
-        }
-        return 0;
 }
 int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
@@ -3789,6 +3752,10 @@ int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
                ret = 0;
                goto out;
        }
+        if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
+                ret = 0;
+                goto out;
+        }
        /* at this point we can safely release the extent buffer */
        num_pages = num_extent_pages(eb->start, eb->len);
        for (i = 0; i < num_pages; i++)
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 1f9df88afbf6..5bc20abf3f3d 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -25,6 +25,7 @@
 /* these are bit numbers for test/set bit */
 #define EXTENT_BUFFER_UPTODATE 0
 #define EXTENT_BUFFER_BLOCKING 1
+#define EXTENT_BUFFER_DIRTY 2
 /*
 * page->private values.  Every page that is controlled by the extent
@@ -254,6 +255,8 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
                              struct extent_buffer *eb);
 int set_extent_buffer_dirty(struct extent_io_tree *tree,
                             struct extent_buffer *eb);
+int test_extent_buffer_dirty(struct extent_io_tree *tree,
+                             struct extent_buffer *eb);
 int set_extent_buffer_uptodate(struct extent_io_tree *tree,
                               struct extent_buffer *eb);
 int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 964652435fd1..9b99886562d0 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -52,6 +52,7 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
        file_key.offset = pos;
        btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY);
+        path->leave_spinning = 1;
        ret = btrfs_insert_empty_item(trans, root, path, &file_key,
                                      sizeof(*item));
        if (ret < 0)
@@ -523,6 +524,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
                key.offset = end_byte - 1;
                key.type = BTRFS_EXTENT_CSUM_KEY;
+                path->leave_spinning = 1;
                ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
                if (ret > 0) {
                        if (path->slots[0] == 0)
@@ -757,8 +759,10 @@ insert:
        } else {
                ins_size = csum_size;
        }
+        path->leave_spinning = 1;
        ret = btrfs_insert_empty_item(trans, root, path, &file_key,
                                      ins_size);
+        path->leave_spinning = 0;
        if (ret < 0)
                goto fail_unlock;
        if (ret != 0) {
@@ -776,7 +780,6 @@ found:
        item_end = (struct btrfs_csum_item *)((unsigned char *)item_end +
                                      btrfs_item_size_nr(leaf, path->slots[0]));
        eb_token = NULL;
-        cond_resched();
 next_sector:
        if (!eb_token ||
@@ -817,9 +820,9 @@ next_sector:
                eb_token = NULL;
        }
        btrfs_mark_buffer_dirty(path->nodes[0]);
-        cond_resched();
        if (total_bytes < sums->len) {
                btrfs_release_path(root, path);
+                cond_resched();
                goto again;
        }
 out:
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index dc78954861b3..9c9fb46ccd08 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -606,6 +606,7 @@ next_slot:
                        btrfs_set_key_type(&ins, BTRFS_EXTENT_DATA_KEY);
                        btrfs_release_path(root, path);
+                        path->leave_spinning = 1;
                        ret = btrfs_insert_empty_item(trans, root, path, &ins,
                                                      sizeof(*extent));
                        BUG_ON(ret);
@@ -639,17 +640,22 @@ next_slot:
                                                        ram_bytes);
                        btrfs_set_file_extent_type(leaf, extent, found_type);
+                        btrfs_unlock_up_safe(path, 1);
                        btrfs_mark_buffer_dirty(path->nodes[0]);
+                        btrfs_set_lock_blocking(path->nodes[0]);
                        if (disk_bytenr != 0) {
                                ret = btrfs_update_extent_ref(trans, root,
-                                                disk_bytenr, orig_parent,
+                                                disk_bytenr,
+                                                le64_to_cpu(old.disk_num_bytes),
+                                                orig_parent,
                                                leaf->start,
                                                root->root_key.objectid,
                                                trans->transid, ins.objectid);
                                BUG_ON(ret);
                        }
+                        path->leave_spinning = 0;
                        btrfs_release_path(root, path);
                        if (disk_bytenr != 0)
                                inode_add_bytes(inode, extent_end - end);
@@ -912,7 +918,7 @@ again:
        btrfs_set_file_extent_other_encoding(leaf, fi, 0);
        if (orig_parent != leaf->start) {
-                ret = btrfs_update_extent_ref(trans, root, bytenr,
+                ret = btrfs_update_extent_ref(trans, root, bytenr, num_bytes,
                                              orig_parent, leaf->start,
                                              root->root_key.objectid,
                                              trans->transid, inode->i_ino);
@@ -1155,6 +1161,20 @@ out_nolock:
                page_cache_release(pinned[1]);
        *ppos = pos;
+        /*
+         * we want to make sure fsync finds this change
+         * but we haven't joined a transaction running right now.
+         *
+         * Later on, someone is sure to update the inode and get the
+         * real transid recorded.
+         *
+         * We set last_trans now to the fs_info generation + 1,
+         * this will either be one more than the running transaction
+         * or the generation used for the next transaction if there isn't
+         * one running right now.
+         */
+        BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
        if (num_written > 0 && will_write) {
                struct btrfs_trans_handle *trans;
@@ -1167,8 +1187,11 @@ out_nolock:
                        ret = btrfs_log_dentry_safe(trans, root,
                                                    file->f_dentry);
                        if (ret == 0) {
-                                btrfs_sync_log(trans, root);
+                                ret = btrfs_sync_log(trans, root);
-                                btrfs_end_transaction(trans, root);
+                                if (ret == 0)
+                                        btrfs_end_transaction(trans, root);
+                                else
+                                        btrfs_commit_transaction(trans, root);
                        } else {
                                btrfs_commit_transaction(trans, root);
                        }
@@ -1185,6 +1208,18 @@ out_nolock:
 int btrfs_release_file(struct inode *inode, struct file *filp)
 {
+        /*
+         * ordered_data_close is set by setattr when we are about to truncate
+         * a file from a non-zero size to a zero size.  This tries to
+         * flush down new bytes that may have been written if the
+         * application were using truncate to replace a file in place.
+         */
+        if (BTRFS_I(inode)->ordered_data_close) {
+                BTRFS_I(inode)->ordered_data_close = 0;
+                btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode);
+                if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
+                        filemap_flush(inode->i_mapping);
+        }
        if (filp->private_data)
                btrfs_ioctl_trans_end(filp);
        return 0;
@@ -1260,8 +1295,11 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
        if (ret > 0) {
                ret = btrfs_commit_transaction(trans, root);
        } else {
-                btrfs_sync_log(trans, root);
+                ret = btrfs_sync_log(trans, root);
-                ret = btrfs_end_transaction(trans, root);
+                if (ret == 0)
+                        ret = btrfs_end_transaction(trans, root);
+                else
+                        ret = btrfs_commit_transaction(trans, root);
        }
        mutex_lock(&dentry->d_inode->i_mutex);
 out:
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 3d46fa1f29a4..6b627c611808 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -73,6 +73,8 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
        if (!path)
                return -ENOMEM;
+        path->leave_spinning = 1;
        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
        if (ret > 0) {
                ret = -ENOENT;
@@ -127,6 +129,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
        if (!path)
                return -ENOMEM;
+        path->leave_spinning = 1;
        ret = btrfs_insert_empty_item(trans, root, path, &key,
                                      ins_len);
        if (ret == -EEXIST) {
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 7d4f948bc22a..06d8db5afb08 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -134,6 +134,7 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
        if (!path)
                return -ENOMEM;
+        path->leave_spinning = 1;
        btrfs_set_trans_block_group(trans, inode);
        key.objectid = inode->i_ino;
@@ -167,9 +168,9 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
                        cur_size = min_t(unsigned long, compressed_size,
                                       PAGE_CACHE_SIZE);
-                        kaddr = kmap(cpage);
+                        kaddr = kmap_atomic(cpage, KM_USER0);
                        write_extent_buffer(leaf, kaddr, ptr, cur_size);
-                        kunmap(cpage);
+                        kunmap_atomic(kaddr, KM_USER0);
                        i++;
                        ptr += cur_size;
@@ -204,7 +205,7 @@ fail:
 * does the checks required to make sure the data is small enough
 * to fit as an inline extent.
 */
-static int cow_file_range_inline(struct btrfs_trans_handle *trans,
+static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
                                 struct btrfs_root *root,
                                 struct inode *inode, u64 start, u64 end,
                                 size_t compressed_size,
@@ -854,11 +855,6 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
        u64 cur_end;
        int limit = 10 * 1024 * 1042;
-        if (!btrfs_test_opt(root, COMPRESS)) {
-                return cow_file_range(inode, locked_page, start, end,
-                                      page_started, nr_written, 1);
-        }
        clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED |
                         EXTENT_DELALLOC, 1, 0, GFP_NOFS);
        while (start < end) {
@@ -935,7 +931,8 @@ static noinline int csum_exist_in_range(struct btrfs_root *root,
 * If no cow copies or snapshots exist, we write directly to the existing
 * blocks on disk
 */
-static int run_delalloc_nocow(struct inode *inode, struct page *locked_page,
+static noinline int run_delalloc_nocow(struct inode *inode,
+                                       struct page *locked_page,
                              u64 start, u64 end, int *page_started, int force,
                              unsigned long *nr_written)
 {
@@ -1133,6 +1130,7 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
                              unsigned long *nr_written)
 {
        int ret;
+        struct btrfs_root *root = BTRFS_I(inode)->root;
        if (btrfs_test_flag(inode, NODATACOW))
                ret = run_delalloc_nocow(inode, locked_page, start, end,
@@ -1140,10 +1138,12 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
        else if (btrfs_test_flag(inode, PREALLOC))
                ret = run_delalloc_nocow(inode, locked_page, start, end,
                                         page_started, 0, nr_written);
+        else if (!btrfs_test_opt(root, COMPRESS))
+                ret = cow_file_range(inode, locked_page, start, end,
+                                      page_started, nr_written, 1);
        else
                ret = cow_file_range_async(inode, locked_page, start, end,
                                           page_started, nr_written);
        return ret;
 }
@@ -1453,6 +1453,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
        path = btrfs_alloc_path();
        BUG_ON(!path);
+        path->leave_spinning = 1;
        ret = btrfs_drop_extents(trans, root, inode, file_pos,
                                 file_pos + num_bytes, file_pos, &hint);
        BUG_ON(ret);
@@ -1475,6 +1476,10 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
        btrfs_set_file_extent_compression(leaf, fi, compression);
        btrfs_set_file_extent_encryption(leaf, fi, encryption);
        btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
+        btrfs_unlock_up_safe(path, 1);
+        btrfs_set_lock_blocking(leaf);
        btrfs_mark_buffer_dirty(leaf);
        inode_add_bytes(inode, num_bytes);
@@ -1487,11 +1492,35 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
                                          root->root_key.objectid,
                                          trans->transid, inode->i_ino, &ins);
        BUG_ON(ret);
        btrfs_free_path(path);
        return 0;
 }
+/*
+ * helper function for btrfs_finish_ordered_io, this
+ * just reads in some of the csum leaves to prime them into ram
+ * before we start the transaction.  It limits the amount of btree
+ * reads required while inside the transaction.
+ *
+ * root:           root whose fs_info supplies the csum_root to search
+ * path:           path handed to btrfs_lookup_csum for the search; the
+ *                 visible caller releases it before the call and frees
+ *                 it afterwards
+ * ordered_extent: ordered extent whose first queued btrfs_ordered_sum
+ *                 provides the bytenr to look up
+ *
+ * The first entry is taken from ordered_extent->list without an
+ * emptiness check, so the list must be non-empty on entry (the visible
+ * caller tests list_empty() first).  Nothing is returned.
+ */
+static noinline void reada_csum(struct btrfs_root *root,
+                                struct btrfs_path *path,
+                                struct btrfs_ordered_extent *ordered_extent)
+{
+        struct btrfs_ordered_sum *sum;
+        u64 bytenr;
+        sum = list_entry(ordered_extent->list.next, struct btrfs_ordered_sum,
+                         list);
+        bytenr = sum->sums[0].bytenr;
+        /*
+         * we don't care about the results, the point of this search is
+         * just to get the btree leaves into ram.  Only the bytenr of the
+         * first sums[] slot of the first list entry is searched for, and
+         * the return value of the lookup is discarded.  NULL is passed
+         * as the first argument and 0 as the last; presumably that means
+         * no transaction handle and no COW -- confirm against
+         * btrfs_lookup_csum.
+         */
+        btrfs_lookup_csum(NULL, root->fs_info->csum_root, path, bytenr, 0);
+}
 /* as ordered data IO finishes, this gets called so we can finish
 * an ordered extent if the range of bytes in the file it covers are
 * fully written.
@@ -1500,8 +1529,9 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 {
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct btrfs_trans_handle *trans;
-        struct btrfs_ordered_extent *ordered_extent;
+        struct btrfs_ordered_extent *ordered_extent = NULL;
        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+        struct btrfs_path *path;
        int compressed = 0;
        int ret;
@@ -1509,9 +1539,33 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
        if (!ret)
                return 0;
+        /*
+         * before we join the transaction, try to do some of our IO.
+         * This will limit the amount of IO that we have to do with
+         * the transaction running.  We're unlikely to need to do any
+         * IO if the file extents are new, the disk_i_size check
+         * covers the most common case.
+         */
+        if (start < BTRFS_I(inode)->disk_i_size) {
+                path = btrfs_alloc_path();
+                if (path) {
+                        ret = btrfs_lookup_file_extent(NULL, root, path,
+                                                       inode->i_ino,
+                                                       start, 0);
+                        ordered_extent = btrfs_lookup_ordered_extent(inode,
+                                                                     start);
+                        if (!list_empty(&ordered_extent->list)) {
+                                btrfs_release_path(root, path);
+                                reada_csum(root, path, ordered_extent);
+                        }
+                        btrfs_free_path(path);
+                }
+        }
        trans = btrfs_join_transaction(root, 1);
-        ordered_extent = btrfs_lookup_ordered_extent(inode, start);
+        if (!ordered_extent)
+                ordered_extent = btrfs_lookup_ordered_extent(inode, start);
        BUG_ON(!ordered_extent);
        if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags))
                goto nocow;
@@ -2101,6 +2155,7 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
        path = btrfs_alloc_path();
        BUG_ON(!path);
+        path->leave_spinning = 1;
        ret = btrfs_lookup_inode(trans, root, path,
                                 &BTRFS_I(inode)->location, 1);
        if (ret) {
@@ -2147,6 +2202,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
                goto err;
        }
+        path->leave_spinning = 1;
        di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
                                    name, name_len, -1);
        if (IS_ERR(di)) {
@@ -2190,8 +2246,6 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
        ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len,
                                         inode, dir->i_ino);
        BUG_ON(ret != 0 && ret != -ENOENT);
-        if (ret != -ENOENT)
-                BTRFS_I(dir)->log_dirty_trans = trans->transid;
        ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len,
                                           dir, index);
@@ -2224,6 +2278,9 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
        trans = btrfs_start_transaction(root, 1);
        btrfs_set_trans_block_group(trans, dir);
+        btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0);
        ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
                                 dentry->d_name.name, dentry->d_name.len);
@@ -2498,6 +2555,7 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
        key.type = (u8)-1;
 search_again:
+        path->leave_spinning = 1;
        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
        if (ret < 0)
                goto error;
@@ -2644,6 +2702,7 @@ delete:
                        break;
                }
                if (found_extent) {
+                        btrfs_set_path_blocking(path);
                        ret = btrfs_free_extent(trans, root, extent_start,
                                                extent_num_bytes,
                                                leaf->start, root_owner,
@@ -2848,11 +2907,21 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
        if (err)
                return err;
-        if (S_ISREG(inode->i_mode) &&
+        if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
-            attr->ia_valid & ATTR_SIZE && attr->ia_size > inode->i_size) {
+                if (attr->ia_size > inode->i_size) {
-                err = btrfs_cont_expand(inode, attr->ia_size);
+                        err = btrfs_cont_expand(inode, attr->ia_size);
-                if (err)
+                        if (err)
-                        return err;
+                                return err;
+                } else if (inode->i_size > 0 &&
+                           attr->ia_size == 0) {
+                        /* we're truncating a file that used to have good
+                         * data down to zero.  Make sure it gets into
+                         * the ordered flush list so that any new writes
+                         * get down to disk quickly.
+                         */
+                        BTRFS_I(inode)->ordered_data_close = 1;
+                }
        }
        err = inode_setattr(inode, attr);
@@ -2984,13 +3053,14 @@ static noinline void init_btrfs_i(struct inode *inode)
        bi->disk_i_size = 0;
        bi->flags = 0;
        bi->index_cnt = (u64)-1;
-        bi->log_dirty_trans = 0;
+        bi->last_unlink_trans = 0;
        extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
        extent_io_tree_init(&BTRFS_I(inode)->io_tree,
                             inode->i_mapping, GFP_NOFS);
        extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
                             inode->i_mapping, GFP_NOFS);
        INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes);
+        INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations);
        btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
        mutex_init(&BTRFS_I(inode)->extent_mutex);
        mutex_init(&BTRFS_I(inode)->log_mutex);
@@ -3449,6 +3519,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
        sizes[0] = sizeof(struct btrfs_inode_item);
        sizes[1] = name_len + sizeof(*ref);
+        path->leave_spinning = 1;
        ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2);
        if (ret != 0)
                goto fail;
@@ -3727,6 +3798,8 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
                drop_inode = 1;
        nr = trans->blocks_used;
+        btrfs_log_new_name(trans, inode, NULL, dentry->d_parent);
        btrfs_end_transaction_throttle(trans, root);
 fail:
        if (drop_inode) {
@@ -4292,8 +4365,9 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset)
 * beyond EOF, then the page is guaranteed safe against truncation until we
 * unlock the page.
 */
-int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
+        struct page *page = vmf->page;
        struct inode *inode = fdentry(vma->vm_file)->d_inode;
        struct btrfs_root *root = BTRFS_I(inode)->root;
        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
@@ -4306,10 +4380,15 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page)
        u64 page_end;
        ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE);
-        if (ret)
+        if (ret) {
+                if (ret == -ENOMEM)
+                        ret = VM_FAULT_OOM;
+                else /* -ENOSPC, -EIO, etc */
+                        ret = VM_FAULT_SIGBUS;
                goto out;
+        }
-        ret = -EINVAL;
+        ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
 again:
        lock_page(page);
        size = i_size_read(inode);
@@ -4357,6 +4436,8 @@ again:
        }
        ClearPageChecked(page);
        set_page_dirty(page);
+        BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
        unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
 out_unlock:
@@ -4382,6 +4463,27 @@ static void btrfs_truncate(struct inode *inode)
        btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
        trans = btrfs_start_transaction(root, 1);
+        /*
+         * setattr is responsible for setting the ordered_data_close flag,
+         * but that is only tested during the last file release.  That
+         * could happen well after the next commit, leaving a great big
+         * window where new writes may get lost if someone chooses to write
+         * to this file after truncating to zero.
+         *
+         * The inode doesn't have any dirty data here, and so if we commit
+         * this is a noop.  If someone immediately starts writing to the inode
+         * it is very likely we'll catch some of their writes in this
+         * transaction, and the commit will find this file on the ordered
+         * data list with good things to send down.
+         *
+         * This is a best effort solution, there is still a window where
+         * using truncate to replace the contents of the file will
+         * end up with a zero length file after a crash.
+         */
+        if (inode->i_size == 0 && BTRFS_I(inode)->ordered_data_close)
+                btrfs_add_ordered_operation(trans, root, inode);
        btrfs_set_trans_block_group(trans, inode);
        btrfs_i_size_write(inode, inode->i_size);
@@ -4458,12 +4560,15 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
        ei->i_acl = BTRFS_ACL_NOT_CACHED;
        ei->i_default_acl = BTRFS_ACL_NOT_CACHED;
        INIT_LIST_HEAD(&ei->i_orphan);
+        INIT_LIST_HEAD(&ei->ordered_operations);
        return &ei->vfs_inode;
 }
 void btrfs_destroy_inode(struct inode *inode)
 {
        struct btrfs_ordered_extent *ordered;
+        struct btrfs_root *root = BTRFS_I(inode)->root;
        WARN_ON(!list_empty(&inode->i_dentry));
        WARN_ON(inode->i_data.nrpages);
@@ -4474,13 +4579,24 @@ void btrfs_destroy_inode(struct inode *inode)
            BTRFS_I(inode)->i_default_acl != BTRFS_ACL_NOT_CACHED)
                posix_acl_release(BTRFS_I(inode)->i_default_acl);
-        spin_lock(&BTRFS_I(inode)->root->list_lock);
+        /*
+         * Make sure we're properly removed from the ordered operation
+         * lists.
+         */
+        smp_mb();
+        if (!list_empty(&BTRFS_I(inode)->ordered_operations)) {
+                spin_lock(&root->fs_info->ordered_extent_lock);
+                list_del_init(&BTRFS_I(inode)->ordered_operations);
+                spin_unlock(&root->fs_info->ordered_extent_lock);
+        }
+        spin_lock(&root->list_lock);
        if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
                printk(KERN_ERR "BTRFS: inode %lu: inode still on the orphan"
                       " list\n", inode->i_ino);
                dump_stack();
        }
-        spin_unlock(&BTRFS_I(inode)->root->list_lock);
+        spin_unlock(&root->list_lock);
        while (1) {
                ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
@@ -4605,8 +4721,36 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
        if (ret)
                goto out_unlock;
+        /*
+         * we're using rename to replace one file with another.
+         * and the replacement file is large.  Start IO on it now so
+         * we don't add too much work to the end of the transaction
+         */
+        if (new_inode && old_inode && S_ISREG(old_inode->i_mode) &&
+            new_inode->i_size &&
+            old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
+                filemap_flush(old_inode->i_mapping);
        trans = btrfs_start_transaction(root, 1);
+        /*
+         * make sure the inode gets flushed if it is replacing
+         * something.
+         */
+        if (new_inode && new_inode->i_size &&
+            old_inode && S_ISREG(old_inode->i_mode)) {
+                btrfs_add_ordered_operation(trans, root, old_inode);
+        }
+        /*
+         * this is an ugly little race, but the rename is required to make
+         * sure that if we crash, the inode is either at the old name
+         * or the new one.  pinning the log transaction lets us make sure
+         * we don't allow a log commit to come in after we unlink the
+         * name but before we add the new name back in.
+         */
+        btrfs_pin_log_trans(root);
        btrfs_set_trans_block_group(trans, new_dir);
        btrfs_inc_nlink(old_dentry->d_inode);
@@ -4614,6 +4758,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
        new_dir->i_ctime = new_dir->i_mtime = ctime;
        old_inode->i_ctime = ctime;
+        if (old_dentry->d_parent != new_dentry->d_parent)
+                btrfs_record_unlink_dir(trans, old_dir, old_inode, 1);
        ret = btrfs_unlink_inode(trans, root, old_dir, old_dentry->d_inode,
                                 old_dentry->d_name.name,
                                 old_dentry->d_name.len);
@@ -4645,7 +4792,14 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
        if (ret)
                goto out_fail;
+        btrfs_log_new_name(trans, old_inode, old_dir,
+                                       new_dentry->d_parent);
 out_fail:
+        /* this btrfs_end_log_trans just allows the current
+         * log-sub transaction to complete
+         */
+        btrfs_end_log_trans(root);
        btrfs_end_transaction_throttle(trans, root);
 out_unlock:
        return ret;
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 47b0a88c12a2..a5310c0f41e2 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -71,12 +71,13 @@ void btrfs_clear_lock_blocking(struct extent_buffer *eb)
 static int btrfs_spin_on_block(struct extent_buffer *eb)
 {
        int i;
        for (i = 0; i < 512; i++) {
-                cpu_relax();
                if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
                        return 1;
                if (need_resched())
                        break;
+                cpu_relax();
        }
        return 0;
 }
@@ -95,13 +96,15 @@ int btrfs_try_spin_lock(struct extent_buffer *eb)
 {
        int i;
-        spin_nested(eb);
+        if (btrfs_spin_on_block(eb)) {
-        if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
+                spin_nested(eb);
-                return 1;
+                if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
-        spin_unlock(&eb->lock);
+                        return 1;
+                spin_unlock(&eb->lock);
+        }
        /* spin for a bit on the BLOCKING flag */
        for (i = 0; i < 2; i++) {
+                cpu_relax();
                if (!btrfs_spin_on_block(eb))
                        break;
@@ -148,6 +151,9 @@ int btrfs_tree_lock(struct extent_buffer *eb)
        DEFINE_WAIT(wait);
        wait.func = btrfs_wake_function;
+        if (!btrfs_spin_on_block(eb))
+                goto sleep;
        while(1) {
                spin_nested(eb);
@@ -165,9 +171,10 @@ int btrfs_tree_lock(struct extent_buffer *eb)
                 * spin for a bit, and if the blocking flag goes away,
                 * loop around
                 */
+                cpu_relax();
                if (btrfs_spin_on_block(eb))
                        continue;
+sleep:
                prepare_to_wait_exclusive(&eb->lock_wq, &wait,
                                          TASK_UNINTERRUPTIBLE);
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 77c2411a5f0f..53c87b197d70 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -310,6 +310,16 @@ int btrfs_remove_ordered_extent(struct inode *inode,
        spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
        list_del_init(&entry->root_extent_list);
+        /*
+         * we have no more ordered extents for this inode and
+         * no dirty pages.  We can safely remove it from the
+         * list of ordered operations
+         */
+        if (RB_EMPTY_ROOT(&tree->tree) &&
+            !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
+                list_del_init(&BTRFS_I(inode)->ordered_operations);
+        }
        spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
        mutex_unlock(&tree->mutex);
@@ -370,6 +380,68 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only)
 }
 /*
+ * this is used during transaction commit to write all the inodes
+ * added to the ordered operation list.  These files must be fully on
+ * disk before the transaction commits.
+ *
+ * we have two modes here, one is to just start the IO via filemap_flush
+ * and the other is to wait for all the io.  When we wait, we have an
+ * extra check to make sure the ordered operation list really is empty
+ * before we return
+ *
+ * root is only used to reach root->fs_info, which owns the list, the
+ * ordered_extent_lock spinlock protecting it and the
+ * ordered_operations_mutex held across the whole call.
+ *
+ * wait == 0: each inode gets filemap_flush() and is put back on the
+ * tail of the fs_info list.
+ * wait != 0: each inode gets btrfs_wait_ordered_range() over the whole
+ * file and is left off the list; the walk repeats until the fs_info
+ * list is found empty.
+ *
+ * Always returns 0.
+ */
+int btrfs_run_ordered_operations(struct btrfs_root *root, int wait)
+{
+        struct btrfs_inode *btrfs_inode;
+        struct inode *inode;
+        struct list_head splice;
+        INIT_LIST_HEAD(&splice);
+        mutex_lock(&root->fs_info->ordered_operations_mutex);
+        spin_lock(&root->fs_info->ordered_extent_lock);
+again:
+        /* move everything queued so far onto our private list */
+        list_splice_init(&root->fs_info->ordered_operations, &splice);
+        while (!list_empty(&splice)) {
+                btrfs_inode = list_entry(splice.next, struct btrfs_inode,
+                                   ordered_operations);
+                inode = &btrfs_inode->vfs_inode;
+                list_del_init(&btrfs_inode->ordered_operations);
+                /*
+                 * the inode may be getting freed (in sys_unlink path).
+                 * igrab() returns NULL in that case and the inode is
+                 * simply left off every list.
+                 */
+                inode = igrab(inode);
+                /*
+                 * in the flush-only mode the inode goes back on the
+                 * fs_info list before we drop the spinlock
+                 */
+                if (!wait && inode) {
+                        list_add_tail(&BTRFS_I(inode)->ordered_operations,
+                              &root->fs_info->ordered_operations);
+                }
+                /* the spinlock is not held across the IO calls below */
+                spin_unlock(&root->fs_info->ordered_extent_lock);
+                if (inode) {
+                        if (wait)
+                                btrfs_wait_ordered_range(inode, 0, (u64)-1);
+                        else
+                                filemap_flush(inode->i_mapping);
+                        iput(inode);
+                }
+                cond_resched();
+                spin_lock(&root->fs_info->ordered_extent_lock);
+        }
+        /*
+         * entries may have been added to the fs_info list while the
+         * spinlock was dropped; in wait mode go around again for them
+         */
+        if (wait && !list_empty(&root->fs_info->ordered_operations))
+                goto again;
+        spin_unlock(&root->fs_info->ordered_extent_lock);
+        mutex_unlock(&root->fs_info->ordered_operations_mutex);
+        return 0;
+}
+/*
 * Used to start IO or wait for a given ordered extent to finish.
 *
 * If wait is one, this effectively waits on page writeback for all the pages
@@ -726,3 +798,49 @@ int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
        return ret;
 }
+/*
+ * Queue an inode on fs_info->ordered_operations so that its data is
+ * pushed to disk before a transaction commit finishes.
+ *
+ * This gives ext3 style data=ordered behaviour, and is mostly used to
+ * make sure renamed files are fully on disk.
+ *
+ * Nothing is done when the inode has not been modified since the last
+ * committed transaction.
+ *
+ * When trans is not NULL and the running transaction is already
+ * blocked, the inode is not queued; instead we wait for its whole
+ * ordered range ourselves before returning.
+ *
+ * Always returns 0.
+ */
+int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
+                                struct btrfs_root *root,
+                                struct inode *inode)
+{
+        struct btrfs_fs_info *fs_info = root->fs_info;
+        struct btrfs_inode *bi = BTRFS_I(inode);
+        u64 newest = max(bi->generation, bi->last_trans);
+        /* unchanged since the last commit: already safely on disk */
+        if (newest < fs_info->last_trans_committed)
+                return 0;
+        /*
+         * the commit is already under way, so skip the list and wait
+         * for this inode's ordered data right here
+         */
+        if (trans && fs_info->running_transaction->blocked) {
+                btrfs_wait_ordered_range(inode, 0, (u64)-1);
+                return 0;
+        }
+        /* queue the inode once; a second add while queued is a noop */
+        spin_lock(&fs_info->ordered_extent_lock);
+        if (list_empty(&bi->ordered_operations))
+                list_add_tail(&bi->ordered_operations,
+                              &fs_info->ordered_operations);
+        spin_unlock(&fs_info->ordered_extent_lock);
+        return 0;
+}
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index ab66d5e8d6d6..3d31c8827b01 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -155,4 +155,8 @@ int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
 int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
                           loff_t end, int sync_mode);
 int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only);
+int btrfs_run_ordered_operations(struct btrfs_root *root, int wait);
+int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
+                                struct btrfs_root *root,
+                                struct inode *inode);
 #endif
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 4112d53d4f4d..664782c6a2df 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -65,6 +65,15 @@ static noinline int join_transaction(struct btrfs_root *root)
                cur_trans->use_count = 1;
                cur_trans->commit_done = 0;
                cur_trans->start_time = get_seconds();
+                cur_trans->delayed_refs.root.rb_node = NULL;
+                cur_trans->delayed_refs.num_entries = 0;
+                cur_trans->delayed_refs.num_heads_ready = 0;
+                cur_trans->delayed_refs.num_heads = 0;
+                cur_trans->delayed_refs.flushing = 0;
+                cur_trans->delayed_refs.run_delayed_start = 0;
+                spin_lock_init(&cur_trans->delayed_refs.lock);
                INIT_LIST_HEAD(&cur_trans->pending_snapshots);
                list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
                extent_io_tree_init(&cur_trans->dirty_pages,
@@ -182,6 +191,8 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
        h->block_group = 0;
        h->alloc_exclude_nr = 0;
        h->alloc_exclude_start = 0;
+        h->delayed_ref_updates = 0;
        root->fs_info->running_transaction->use_count++;
        mutex_unlock(&root->fs_info->trans_mutex);
        return h;
@@ -271,7 +282,6 @@ void btrfs_throttle(struct btrfs_root *root)
        if (!root->fs_info->open_ioctl_trans)
                wait_current_trans(root);
        mutex_unlock(&root->fs_info->trans_mutex);
        throttle_on_drops(root);
 }
@@ -280,6 +290,27 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 {
        struct btrfs_transaction *cur_trans;
        struct btrfs_fs_info *info = root->fs_info;
+        int count = 0;
+        while (count < 4) {
+                unsigned long cur = trans->delayed_ref_updates;
+                trans->delayed_ref_updates = 0;
+                if (cur &&
+                    trans->transaction->delayed_refs.num_heads_ready > 64) {
+                        trans->delayed_ref_updates = 0;
+                        /*
+                         * do a full flush if the transaction is trying
+                         * to close
+                         */
+                        if (trans->transaction->delayed_refs.flushing)
+                                cur = 0;
+                        btrfs_run_delayed_refs(trans, root, cur);
+                } else {
+                        break;
+                }
+                count++;
+        }
        mutex_lock(&info->trans_mutex);
        cur_trans = info->running_transaction;
@@ -424,9 +455,10 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
        u64 old_root_bytenr;
        struct btrfs_root *tree_root = root->fs_info->tree_root;
-        btrfs_extent_post_op(trans, root);
        btrfs_write_dirty_block_groups(trans, root);
-        btrfs_extent_post_op(trans, root);
+        ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+        BUG_ON(ret);
        while (1) {
                old_root_bytenr = btrfs_root_bytenr(&root->root_item);
@@ -438,14 +470,14 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
                                     btrfs_header_level(root->node));
                btrfs_set_root_generation(&root->root_item, trans->transid);
-                btrfs_extent_post_op(trans, root);
                ret = btrfs_update_root(trans, tree_root,
                                        &root->root_key,
                                        &root->root_item);
                BUG_ON(ret);
                btrfs_write_dirty_block_groups(trans, root);
-                btrfs_extent_post_op(trans, root);
+                ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+                BUG_ON(ret);
        }
        return 0;
 }
@@ -459,15 +491,18 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct list_head *next;
        struct extent_buffer *eb;
+        int ret;
-        btrfs_extent_post_op(trans, fs_info->tree_root);
+        ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+        BUG_ON(ret);
        eb = btrfs_lock_root_node(fs_info->tree_root);
-        btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb, 0);
+        btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb);
        btrfs_tree_unlock(eb);
        free_extent_buffer(eb);
-        btrfs_extent_post_op(trans, fs_info->tree_root);
+        ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+        BUG_ON(ret);
        while (!list_empty(&fs_info->dirty_cowonly_roots)) {
                next = fs_info->dirty_cowonly_roots.next;
@@ -475,6 +510,9 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
                root = list_entry(next, struct btrfs_root, dirty_list);
                update_cowonly_root(trans, root);
+                ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+                BUG_ON(ret);
        }
        return 0;
 }
@@ -635,6 +673,31 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
 }
 /*
+ * when dropping snapshots, we generate a ton of delayed refs, and it makes
+ * sense not to join the transaction while it is trying to flush the current
+ * queue of delayed refs out.
+ *
+ * This is used by the drop snapshot code only
+ *
+ * Sleeps uninterruptibly on info->transaction_wait for as long as there
+ * is a running transaction whose delayed_refs.flushing flag is set.
+ * Always returns 0.
+ */
+static noinline int wait_transaction_pre_flush(struct btrfs_fs_info *info)
+{
+        DEFINE_WAIT(wait);
+        mutex_lock(&info->trans_mutex);
+        while (info->running_transaction &&
+               info->running_transaction->delayed_refs.flushing) {
+                /*
+                 * get on the wait queue while still holding trans_mutex,
+                 * then drop the mutex for the duration of the sleep
+                 */
+                prepare_to_wait(&info->transaction_wait, &wait,
+                                TASK_UNINTERRUPTIBLE);
+                mutex_unlock(&info->trans_mutex);
+                schedule();
+                /* retake the mutex before rechecking the loop condition */
+                mutex_lock(&info->trans_mutex);
+                finish_wait(&info->transaction_wait, &wait);
+        }
+        mutex_unlock(&info->trans_mutex);
+        return 0;
+}
+/*
 * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on
 * all of them
 */
@@ -661,7 +724,22 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
                atomic_inc(&root->fs_info->throttles);
                while (1) {
+                        /*
+                         * we don't want to jump in and create a bunch of
+                         * delayed refs if the transaction is starting to close
+                         */
+                        wait_transaction_pre_flush(tree_root->fs_info);
                        trans = btrfs_start_transaction(tree_root, 1);
+                        /*
+                         * we've joined a transaction, make sure it isn't
+                         * closing right now
+                         */
+                        if (trans->transaction->delayed_refs.flushing) {
+                                btrfs_end_transaction(trans, tree_root);
+                                continue;
+                        }
                        mutex_lock(&root->fs_info->drop_mutex);
                        ret = btrfs_drop_snapshot(trans, dirty->root);
                        if (ret != -EAGAIN)
@@ -766,7 +844,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
        btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
        old = btrfs_lock_root_node(root);
-        btrfs_cow_block(trans, root, old, NULL, 0, &old, 0);
+        btrfs_cow_block(trans, root, old, NULL, 0, &old);
        btrfs_copy_root(trans, root, old, &tmp, objectid);
        btrfs_tree_unlock(old);
@@ -894,12 +972,31 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
        struct extent_io_tree *pinned_copy;
        DEFINE_WAIT(wait);
        int ret;
+        int should_grow = 0;
+        unsigned long now = get_seconds();
+        btrfs_run_ordered_operations(root, 0);
+        /* make a pass through all the delayed refs we have so far;
+         * any running procs may add more while we are here
+         */
+        ret = btrfs_run_delayed_refs(trans, root, 0);
+        BUG_ON(ret);
+        cur_trans = trans->transaction;
+        /*
+         * set the flushing flag so procs in this transaction have to
+         * start sending their work down.
+         */
+        cur_trans->delayed_refs.flushing = 1;
+        ret = btrfs_run_delayed_refs(trans, root, 0);
+        BUG_ON(ret);
-        INIT_LIST_HEAD(&dirty_fs_roots);
        mutex_lock(&root->fs_info->trans_mutex);
-        if (trans->transaction->in_commit) {
+        INIT_LIST_HEAD(&dirty_fs_roots);
-                cur_trans = trans->transaction;
+        if (cur_trans->in_commit) {
-                trans->transaction->use_count++;
+                cur_trans->use_count++;
                mutex_unlock(&root->fs_info->trans_mutex);
                btrfs_end_transaction(trans, root);
@@ -922,7 +1019,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
        trans->transaction->in_commit = 1;
        trans->transaction->blocked = 1;
-        cur_trans = trans->transaction;
        if (cur_trans->list.prev != &root->fs_info->trans_list) {
                prev_trans = list_entry(cur_trans->list.prev,
                                        struct btrfs_transaction, list);
@@ -937,6 +1033,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
                }
        }
+        if (now < cur_trans->start_time || now - cur_trans->start_time < 1)
+                should_grow = 1;
        do {
                int snap_pending = 0;
                joined = cur_trans->num_joined;
@@ -949,7 +1048,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
                if (cur_trans->num_writers > 1)
                        timeout = MAX_SCHEDULE_TIMEOUT;
-                else
+                else if (should_grow)
                        timeout = 1;
                mutex_unlock(&root->fs_info->trans_mutex);
@@ -959,16 +1058,30 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
                        BUG_ON(ret);
                }
-                schedule_timeout(timeout);
+                /*
+                 * rename doesn't use btrfs_join_transaction, so, once we
+                 * set the transaction to blocked above, we aren't going
+                 * to get any new ordered operations.  We can safely run
+                 * it here and know for sure that nothing new will be added
+                 * to the list
+                 */
+                btrfs_run_ordered_operations(root, 1);
+                smp_mb();
+                if (cur_trans->num_writers > 1 || should_grow)
+                        schedule_timeout(timeout);
                mutex_lock(&root->fs_info->trans_mutex);
                finish_wait(&cur_trans->writer_wait, &wait);
        } while (cur_trans->num_writers > 1 ||
-                 (cur_trans->num_joined != joined));
+                 (should_grow && cur_trans->num_joined != joined));
        ret = create_pending_snapshots(trans, root->fs_info);
        BUG_ON(ret);
+        ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+        BUG_ON(ret);
        WARN_ON(cur_trans != trans->transaction);
        /* btrfs_commit_tree_roots is responsible for getting the
@@ -1032,6 +1145,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
        btrfs_copy_pinned(root, pinned_copy);
        trans->transaction->blocked = 0;
        wake_up(&root->fs_info->transaction_throttle);
        wake_up(&root->fs_info->transaction_wait);
@@ -1058,6 +1172,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
        mutex_lock(&root->fs_info->trans_mutex);
        cur_trans->commit_done = 1;
        root->fs_info->last_trans_committed = cur_trans->transid;
        wake_up(&cur_trans->commit_wait);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index ea292117f882..94f5bde2b58d 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -19,10 +19,16 @@
 #ifndef __BTRFS_TRANSACTION__
 #define __BTRFS_TRANSACTION__
 #include "btrfs_inode.h"
+#include "delayed-ref.h"
 struct btrfs_transaction {
        u64 transid;
+        /*
+         * total writers in this transaction, it must be zero before the
+         * transaction can end
+         */
        unsigned long num_writers;
        unsigned long num_joined;
        int in_commit;
        int use_count;
@@ -34,6 +40,7 @@ struct btrfs_transaction {
        wait_queue_head_t writer_wait;
        wait_queue_head_t commit_wait;
        struct list_head pending_snapshots;
+        struct btrfs_delayed_ref_root delayed_refs;
 };
 struct btrfs_trans_handle {
@@ -44,6 +51,7 @@ struct btrfs_trans_handle {
        u64 block_group;
        u64 alloc_exclude_start;
        u64 alloc_exclude_nr;
+        unsigned long delayed_ref_updates;
 };
 struct btrfs_pending_snapshot {
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index 98d25fa4570e..b10eacdb1620 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -124,8 +124,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
        }
        btrfs_release_path(root, path);
-        if (is_extent)
-                btrfs_extent_post_op(trans, root);
 out:
        if (path)
                btrfs_free_path(path);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 9c462fbd60fa..fc9b87a7975b 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -35,6 +35,49 @@
 #define LOG_INODE_EXISTS 1
 /*
+ * directory trouble cases
+ *
+ * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
+ * log, we must force a full commit before doing an fsync of the directory
+ * where the unlink was done.
+ * ---> record transid of last unlink/rename per directory
+ *
+ * mkdir foo/some_dir
+ * normal commit
+ * rename foo/some_dir foo2/some_dir
+ * mkdir foo/some_dir
+ * fsync foo/some_dir/some_file
+ *
+ * The fsync above will unlink the original some_dir without recording
+ * it in its new location (foo2).  After a crash, some_dir will be gone
+ * unless the fsync of some_file forces a full commit
+ *
+ * 2) we must log any new names for any file or dir that is in the fsync
+ * log. ---> check inode while renaming/linking.
+ *
+ * 2a) we must log any new names for any file or dir during rename
+ * when the directory they are being removed from was logged.
+ * ---> check inode and old parent dir during rename
+ *
+ *  2a is actually the more important variant.  Without the extra logging
+ *  a crash might unlink the old name without recreating the new one
+ *
+ * 3) after a crash, we must go through any directories with a link count
+ * of zero and redo the rm -rf
+ *
+ * mkdir f1/foo
+ * normal commit
+ * rm -rf f1/foo
+ * fsync(f1)
+ *
+ * The directory f1/foo was fully removed from the FS, but fsync was never
+ * called on f1/foo, only its parent dir (f1).  After a crash the rm -rf must
+ * be replayed.  This must be able to recurse down the entire
+ * directory tree.  The inode link count fixup code takes care of the
+ * ugly details.
+ */
+/*
 * stages for the tree walking.  The first
 * stage (0) is to only pin down the blocks we find
 * the second stage (1) is to make sure that all the inodes
@@ -47,12 +90,17 @@
 #define LOG_WALK_REPLAY_INODES 1
 #define LOG_WALK_REPLAY_ALL 2
-static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
+static int btrfs_log_inode(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root, struct inode *inode,
                             int inode_only);
 static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root,
                             struct btrfs_path *path, u64 objectid);
+static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
+                                       struct btrfs_root *root,
+                                       struct btrfs_root *log,
+                                       struct btrfs_path *path,
+                                       u64 dirid, int del_all);
 /*
 * tree logging is a special write ahead log used to make sure that
@@ -133,10 +181,25 @@ static int join_running_log_trans(struct btrfs_root *root)
 }
 /*
+ * This either makes the current running log transaction wait
+ * until you call btrfs_end_log_trans() or it makes any future
+ * log transactions wait until you call btrfs_end_log_trans()
+ *
+ * The pin is a reference on root->log_writers, taken while holding
+ * root->log_mutex.  Every call must be paired with a call to
+ * btrfs_end_log_trans(), which drops that reference.
+ *
+ * Nothing in here can fail, so this always returns 0.
+ */
+int btrfs_pin_log_trans(struct btrfs_root *root)
+{
+        mutex_lock(&root->log_mutex);
+        atomic_inc(&root->log_writers);
+        mutex_unlock(&root->log_mutex);
+        /* report success; there is no failure path in this function */
+        return 0;
+}
+/*
 * indicate we're done making changes to the log tree
 * and wake up anyone waiting to do a sync
 */
-static int end_log_trans(struct btrfs_root *root)
+int btrfs_end_log_trans(struct btrfs_root *root)
 {
        if (atomic_dec_and_test(&root->log_writers)) {
                smp_mb();
@@ -203,7 +266,6 @@ static int process_one_buffer(struct btrfs_root *log,
                mutex_lock(&log->fs_info->pinned_mutex);
                btrfs_update_pinned_extents(log->fs_info->extent_root,
                                            eb->start, eb->len, 1);
-                mutex_unlock(&log->fs_info->pinned_mutex);
        }
        if (btrfs_buffer_uptodate(eb, gen)) {
@@ -603,6 +665,7 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
        ret = link_to_fixup_dir(trans, root, path, location.objectid);
        BUG_ON(ret);
        ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
        BUG_ON(ret);
        kfree(name);
@@ -804,6 +867,7 @@ conflict_again:
                                            victim_name_len)) {
                                btrfs_inc_nlink(inode);
                                btrfs_release_path(root, path);
                                ret = btrfs_unlink_inode(trans, root, dir,
                                                         inode, victim_name,
                                                         victim_name_len);
@@ -922,13 +986,20 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
                key.offset--;
                btrfs_release_path(root, path);
        }
-        btrfs_free_path(path);
+        btrfs_release_path(root, path);
        if (nlink != inode->i_nlink) {
                inode->i_nlink = nlink;
                btrfs_update_inode(trans, root, inode);
        }
        BTRFS_I(inode)->index_cnt = (u64)-1;
+        if (inode->i_nlink == 0 && S_ISDIR(inode->i_mode)) {
+                ret = replay_dir_deletes(trans, root, NULL, path,
+                                         inode->i_ino, 1);
+                BUG_ON(ret);
+        }
+        btrfs_free_path(path);
        return 0;
 }
@@ -971,9 +1042,12 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
                iput(inode);
-                if (key.offset == 0)
+                /*
-                        break;
+                 * fixup on a directory may create new entries,
-                key.offset--;
+                 * make sure we always look for the highest possible
+                 * offset
+                 */
+                key.offset = (u64)-1;
        }
        btrfs_release_path(root, path);
        return 0;
@@ -1313,11 +1387,11 @@ again:
                read_extent_buffer(eb, name, (unsigned long)(di + 1),
                                  name_len);
                log_di = NULL;
-                if (dir_key->type == BTRFS_DIR_ITEM_KEY) {
+                if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) {
                        log_di = btrfs_lookup_dir_item(trans, log, log_path,
                                                       dir_key->objectid,
                                                       name, name_len, 0);
-                } else if (dir_key->type == BTRFS_DIR_INDEX_KEY) {
+                } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) {
                        log_di = btrfs_lookup_dir_index_item(trans, log,
                                                     log_path,
                                                     dir_key->objectid,
@@ -1378,7 +1452,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
                                       struct btrfs_root *root,
                                       struct btrfs_root *log,
                                       struct btrfs_path *path,
-                                       u64 dirid)
+                                       u64 dirid, int del_all)
 {
        u64 range_start;
        u64 range_end;
@@ -1408,10 +1482,14 @@ again:
        range_start = 0;
        range_end = 0;
        while (1) {
-                ret = find_dir_range(log, path, dirid, key_type,
+                if (del_all)
-                                     &range_start, &range_end);
+                        range_end = (u64)-1;
-                if (ret != 0)
+                else {
-                        break;
+                        ret = find_dir_range(log, path, dirid, key_type,
+                                             &range_start, &range_end);
+                        if (ret != 0)
+                                break;
+                }
                dir_key.offset = range_start;
                while (1) {
@@ -1437,7 +1515,8 @@ again:
                                break;
                        ret = check_item_in_log(trans, root, log, path,
-                                                log_path, dir, &found_key);
+                                                log_path, dir,
+                                                &found_key);
                        BUG_ON(ret);
                        if (found_key.offset == (u64)-1)
                                break;
@@ -1514,7 +1593,7 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
                        mode = btrfs_inode_mode(eb, inode_item);
                        if (S_ISDIR(mode)) {
                                ret = replay_dir_deletes(wc->trans,
-                                         root, log, path, key.objectid);
+                                         root, log, path, key.objectid, 0);
                                BUG_ON(ret);
                        }
                        ret = overwrite_item(wc->trans, root, path,
@@ -1533,6 +1612,17 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
                                        root, inode, inode->i_size,
                                        BTRFS_EXTENT_DATA_KEY);
                                BUG_ON(ret);
+                                /* if the nlink count is zero here, the iput
+                                 * will free the inode.  We bump it to make
+                                 * sure it doesn't get freed until the link
+                                 * count fixup is done
+                                 */
+                                if (inode->i_nlink == 0) {
+                                        btrfs_inc_nlink(inode);
+                                        btrfs_update_inode(wc->trans,
+                                                           root, inode);
+                                }
                                iput(inode);
                        }
                        ret = link_to_fixup_dir(wc->trans, root,
@@ -1840,7 +1930,8 @@ static int update_log_root(struct btrfs_trans_handle *trans,
        return ret;
 }
-static int wait_log_commit(struct btrfs_root *root, unsigned long transid)
+static int wait_log_commit(struct btrfs_trans_handle *trans,
+                           struct btrfs_root *root, unsigned long transid)
 {
        DEFINE_WAIT(wait);
        int index = transid % 2;
@@ -1854,9 +1945,12 @@ static int wait_log_commit(struct btrfs_root *root, unsigned long transid)
                prepare_to_wait(&root->log_commit_wait[index],
                                &wait, TASK_UNINTERRUPTIBLE);
                mutex_unlock(&root->log_mutex);
-                if (root->log_transid < transid + 2 &&
+                if (root->fs_info->last_trans_log_full_commit !=
+                    trans->transid && root->log_transid < transid + 2 &&
                    atomic_read(&root->log_commit[index]))
                        schedule();
                finish_wait(&root->log_commit_wait[index], &wait);
                mutex_lock(&root->log_mutex);
        } while (root->log_transid < transid + 2 &&
@@ -1864,14 +1958,16 @@ static int wait_log_commit(struct btrfs_root *root, unsigned long transid)
        return 0;
 }
-static int wait_for_writer(struct btrfs_root *root)
+static int wait_for_writer(struct btrfs_trans_handle *trans,
+                           struct btrfs_root *root)
 {
        DEFINE_WAIT(wait);
        while (atomic_read(&root->log_writers)) {
                prepare_to_wait(&root->log_writer_wait,
                                &wait, TASK_UNINTERRUPTIBLE);
                mutex_unlock(&root->log_mutex);
-                if (atomic_read(&root->log_writers))
+                if (root->fs_info->last_trans_log_full_commit !=
+                    trans->transid && atomic_read(&root->log_writers))
                        schedule();
                mutex_lock(&root->log_mutex);
                finish_wait(&root->log_writer_wait, &wait);
@@ -1882,7 +1978,14 @@ static int wait_for_writer(struct btrfs_root *root)
 /*
 * btrfs_sync_log does sends a given tree log down to the disk and
 * updates the super blocks to record it.  When this call is done,
- * you know that any inodes previously logged are safely on disk
+ * you know that any inodes previously logged are safely on disk only
+ * if it returns 0.
+ *
+ * Any other return value means you need to call btrfs_commit_transaction.
+ * Some of the edge cases for fsyncing directories that have had unlinks
+ * or renames done in the past mean that sometimes the only safe
+ * fsync is to commit the whole FS.  When btrfs_sync_log returns -EAGAIN,
+ * that has happened.
 */
 int btrfs_sync_log(struct btrfs_trans_handle *trans,
                   struct btrfs_root *root)
@@ -1896,7 +1999,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
        mutex_lock(&root->log_mutex);
        index1 = root->log_transid % 2;
        if (atomic_read(&root->log_commit[index1])) {
-                wait_log_commit(root, root->log_transid);
+                wait_log_commit(trans, root, root->log_transid);
                mutex_unlock(&root->log_mutex);
                return 0;
        }
@@ -1904,18 +2007,26 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
        /* wait for previous tree log sync to complete */
        if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
-                wait_log_commit(root, root->log_transid - 1);
+                wait_log_commit(trans, root, root->log_transid - 1);
        while (1) {
                unsigned long batch = root->log_batch;
                mutex_unlock(&root->log_mutex);
                schedule_timeout_uninterruptible(1);
                mutex_lock(&root->log_mutex);
-                wait_for_writer(root);
+                wait_for_writer(trans, root);
                if (batch == root->log_batch)
                        break;
        }
+        /* bail out if we need to do a full commit */
+        if (root->fs_info->last_trans_log_full_commit == trans->transid) {
+                ret = -EAGAIN;
+                mutex_unlock(&root->log_mutex);
+                goto out;
+        }
        ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages);
        BUG_ON(ret);
@@ -1951,16 +2062,29 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
        index2 = log_root_tree->log_transid % 2;
        if (atomic_read(&log_root_tree->log_commit[index2])) {
-                wait_log_commit(log_root_tree, log_root_tree->log_transid);
+                wait_log_commit(trans, log_root_tree,
+                                log_root_tree->log_transid);
                mutex_unlock(&log_root_tree->log_mutex);
                goto out;
        }
        atomic_set(&log_root_tree->log_commit[index2], 1);
-        if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2]))
+        if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
-                wait_log_commit(log_root_tree, log_root_tree->log_transid - 1);
+                wait_log_commit(trans, log_root_tree,
+                                log_root_tree->log_transid - 1);
+        }
+        wait_for_writer(trans, log_root_tree);
-        wait_for_writer(log_root_tree);
+        /*
+         * now that we've moved on to the tree of log tree roots,
+         * check the full commit flag again
+         */
+        if (root->fs_info->last_trans_log_full_commit == trans->transid) {
+                mutex_unlock(&log_root_tree->log_mutex);
+                ret = -EAGAIN;
+                goto out_wake_log_root;
+        }
        ret = btrfs_write_and_wait_marked_extents(log_root_tree,
                                &log_root_tree->dirty_log_pages);
@@ -1985,7 +2109,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
         * in and cause problems either.
         */
        write_ctree_super(trans, root->fs_info->tree_root, 2);
+        ret = 0;
+out_wake_log_root:
        atomic_set(&log_root_tree->log_commit[index2], 0);
        smp_mb();
        if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
@@ -1998,7 +2124,8 @@ out:
        return 0;
 }
-/* * free all the extents used by the tree log.  This should be called
+/*
+ * free all the extents used by the tree log.  This should be called
 * at commit time of the full transaction
 */
 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
@@ -2132,7 +2259,7 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
        btrfs_free_path(path);
        mutex_unlock(&BTRFS_I(dir)->log_mutex);
-        end_log_trans(root);
+        btrfs_end_log_trans(root);
        return 0;
 }
@@ -2159,7 +2286,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
        ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino,
                                  dirid, &index);
        mutex_unlock(&BTRFS_I(inode)->log_mutex);
-        end_log_trans(root);
+        btrfs_end_log_trans(root);
        return ret;
 }
@@ -2559,7 +2686,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
 *
 * This handles both files and directories.
 */
-static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
+static int btrfs_log_inode(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root, struct inode *inode,
                             int inode_only)
 {
@@ -2585,28 +2712,17 @@ static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
        min_key.offset = 0;
        max_key.objectid = inode->i_ino;
+        /* today the code can only do partial logging of directories */
+        if (!S_ISDIR(inode->i_mode))
+            inode_only = LOG_INODE_ALL;
        if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode))
                max_key.type = BTRFS_XATTR_ITEM_KEY;
        else
                max_key.type = (u8)-1;
        max_key.offset = (u64)-1;
-        /*
-         * if this inode has already been logged and we're in inode_only
-         * mode, we don't want to delete the things that have already
-         * been written to the log.
-         *
-         * But, if the inode has been through an inode_only log,
-         * the logged_trans field is not set.  This allows us to catch
-         * any new names for this inode in the backrefs by logging it
-         * again
-         */
-        if (inode_only == LOG_INODE_EXISTS &&
-            BTRFS_I(inode)->logged_trans == trans->transid) {
-                btrfs_free_path(path);
-                btrfs_free_path(dst_path);
-                goto out;
-        }
        mutex_lock(&BTRFS_I(inode)->log_mutex);
        /*
@@ -2693,7 +2809,6 @@ next_slot:
        if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
                btrfs_release_path(root, path);
                btrfs_release_path(log, dst_path);
-                BTRFS_I(inode)->log_dirty_trans = 0;
                ret = log_directory_changes(trans, root, inode, path, dst_path);
                BUG_ON(ret);
        }
@@ -2702,19 +2817,69 @@ next_slot:
        btrfs_free_path(path);
        btrfs_free_path(dst_path);
-out:
        return 0;
 }
-int btrfs_log_inode(struct btrfs_trans_handle *trans,
+/*
-                    struct btrfs_root *root, struct inode *inode,
+ * follow the dentry parent pointers up the chain and see if any
-                    int inode_only)
+ * of the directories in it require a full commit before they can
+ * be logged.  Returns zero if nothing special needs to be done or 1 if
+ * a full commit is required.
+ */
+static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
+                                               struct inode *inode,
+                                               struct dentry *parent,
+                                               struct super_block *sb,
+                                               u64 last_committed)
 {
-        int ret;
+        int ret = 0;
+        struct btrfs_root *root;
-        start_log_trans(trans, root);
+        /*
-        ret = __btrfs_log_inode(trans, root, inode, inode_only);
+         * for regular files, if its inode is already on disk, we don't
-        end_log_trans(root);
+         * have to worry about the parents at all.  This is because
+         * we can use the last_unlink_trans field to record renames
+         * and other fun in this file.
+         */
+        if (S_ISREG(inode->i_mode) &&
+            BTRFS_I(inode)->generation <= last_committed &&
+            BTRFS_I(inode)->last_unlink_trans <= last_committed)
+                        goto out;
+        if (!S_ISDIR(inode->i_mode)) {
+                if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
+                        goto out;
+                inode = parent->d_inode;
+        }
+        while (1) {
+                BTRFS_I(inode)->logged_trans = trans->transid;
+                smp_mb();
+                if (BTRFS_I(inode)->last_unlink_trans > last_committed) {
+                        root = BTRFS_I(inode)->root;
+                        /*
+                         * make sure any commits to the log are forced
+                         * to be full commits
+                         */
+                        root->fs_info->last_trans_log_full_commit =
+                                trans->transid;
+                        ret = 1;
+                        break;
+                }
+                if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
+                        break;
+                if (parent == sb->s_root)
+                        break;
+                parent = parent->d_parent;
+                inode = parent->d_inode;
+        }
+out:
        return ret;
 }
@@ -2724,31 +2889,65 @@ int btrfs_log_inode(struct btrfs_trans_handle *trans,
 * only logging is done of any parent directories that are older than
 * the last committed transaction
 */
-int btrfs_log_dentry(struct btrfs_trans_handle *trans,
+int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
-                    struct btrfs_root *root, struct dentry *dentry)
+                    struct btrfs_root *root, struct inode *inode,
+                    struct dentry *parent, int exists_only)
 {
-        int inode_only = LOG_INODE_ALL;
+        int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
        struct super_block *sb;
-        int ret;
+        int ret = 0;
+        u64 last_committed = root->fs_info->last_trans_committed;
+        sb = inode->i_sb;
+        if (root->fs_info->last_trans_log_full_commit >
+            root->fs_info->last_trans_committed) {
+                ret = 1;
+                goto end_no_trans;
+        }
+        ret = check_parent_dirs_for_sync(trans, inode, parent,
+                                         sb, last_committed);
+        if (ret)
+                goto end_no_trans;
        start_log_trans(trans, root);
-        sb = dentry->d_inode->i_sb;
-        while (1) {
-                ret = __btrfs_log_inode(trans, root, dentry->d_inode,
-                                        inode_only);
-                BUG_ON(ret);
-                inode_only = LOG_INODE_EXISTS;
-                dentry = dentry->d_parent;
+        ret = btrfs_log_inode(trans, root, inode, inode_only);
-                if (!dentry || !dentry->d_inode || sb != dentry->d_inode->i_sb)
+        BUG_ON(ret);
+        /*
+         * for regular files, if its inode is already on disk, we don't
+         * have to worry about the parents at all.  This is because
+         * we can use the last_unlink_trans field to record renames
+         * and other fun in this file.
+         */
+        if (S_ISREG(inode->i_mode) &&
+            BTRFS_I(inode)->generation <= last_committed &&
+            BTRFS_I(inode)->last_unlink_trans <= last_committed)
+                        goto no_parent;
+        inode_only = LOG_INODE_EXISTS;
+        while (1) {
+                if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
                        break;
-                if (BTRFS_I(dentry->d_inode)->generation <=
+                inode = parent->d_inode;
-                    root->fs_info->last_trans_committed)
+                if (BTRFS_I(inode)->generation >
+                    root->fs_info->last_trans_committed) {
+                        ret = btrfs_log_inode(trans, root, inode, inode_only);
+                        BUG_ON(ret);
+                }
+                if (parent == sb->s_root)
                        break;
+                parent = parent->d_parent;
        }
-        end_log_trans(root);
+no_parent:
-        return 0;
+        ret = 0;
+        btrfs_end_log_trans(root);
+end_no_trans:
+        return ret;
 }
 /*
@@ -2760,12 +2959,8 @@ int btrfs_log_dentry(struct btrfs_trans_handle *trans,
 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
                          struct btrfs_root *root, struct dentry *dentry)
 {
-        u64 gen;
+        return btrfs_log_inode_parent(trans, root, dentry->d_inode,
-        gen = root->fs_info->last_trans_new_blockgroup;
+                                      dentry->d_parent, 0);
-        if (gen > root->fs_info->last_trans_committed)
-                return 1;
-        else
-                return btrfs_log_dentry(trans, root, dentry);
 }
 /*
@@ -2884,3 +3079,94 @@ again:
        kfree(log_root_tree);
        return 0;
 }
+/*
+ * there are some corner cases where we want to force a full
+ * commit instead of allowing a directory to be logged.
+ *
+ * They revolve around files that were unlinked from the directory, and
+ * this function updates the parent directory so that a full commit is
+ * properly done if it is fsync'd later after the unlinks are done.
+ */
+void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
+                             struct inode *dir, struct inode *inode,
+                             int for_rename)
+{
+        /*
+         * when we're logging a file, if it hasn't been renamed
+         * or unlinked, and its inode is fully committed on disk,
+         * we don't have to worry about walking up the directory chain
+         * to log its parents.
+         *
+         * So, we use the last_unlink_trans field to put this transid
+         * into the file.  When the file is logged we check it and
+         * don't log the parents if the file is fully on disk.
+         */
+        if (S_ISREG(inode->i_mode))
+                BTRFS_I(inode)->last_unlink_trans = trans->transid;
+        /*
+         * if this directory was already logged any new
+         * names for this file/dir will get recorded
+         */
+        smp_mb();
+        if (BTRFS_I(dir)->logged_trans == trans->transid)
+                return;
+        /*
+         * if the inode we're about to unlink was logged,
+         * the log will be properly updated for any new names
+         */
+        if (BTRFS_I(inode)->logged_trans == trans->transid)
+                return;
+        /*
+         * when renaming files across directories, if the directory
+         * that we're unlinking from gets fsync'd later on, there's
+         * no way to find the destination directory later and fsync it
+         * properly.  So, we have to be conservative and force commits
+         * so the new name gets discovered.
+         */
+        if (for_rename)
+                goto record;
+        /* we can safely do the unlink without any special recording */
+        return;
+record:
+        BTRFS_I(dir)->last_unlink_trans = trans->transid;
+}
+/*
+ * Call this after adding a new name for a file and it will properly
+ * update the log to reflect the new name.
+ *
+ * It will return zero if all goes well, and it will return 1 if a
+ * full transaction commit is required.
+ */
+int btrfs_log_new_name(struct btrfs_trans_handle *trans,
+                        struct inode *inode, struct inode *old_dir,
+                        struct dentry *parent)
+{
+        struct btrfs_root * root = BTRFS_I(inode)->root;
+        /*
+         * this will force the logging code to walk the dentry chain
+         * up for the file
+         */
+        if (S_ISREG(inode->i_mode))
+                BTRFS_I(inode)->last_unlink_trans = trans->transid;
+        /*
+         * if this inode hasn't been logged and the directory we're renaming
+         * it from hasn't been logged, we don't need to log it
+         */
+        if (BTRFS_I(inode)->logged_trans <=
+            root->fs_info->last_trans_committed &&
+            (!old_dir || BTRFS_I(old_dir)->logged_trans <=
+                    root->fs_info->last_trans_committed))
+                return 0;
+        return btrfs_log_inode_parent(trans, root, inode, parent, 1);
+}
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index b9409b32ed02..d09c7609e16b 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -22,14 +22,9 @@
 int btrfs_sync_log(struct btrfs_trans_handle *trans,
                   struct btrfs_root *root);
 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);
-int btrfs_log_dentry(struct btrfs_trans_handle *trans,
-                    struct btrfs_root *root, struct dentry *dentry);
 int btrfs_recover_log_trees(struct btrfs_root *tree_root);
 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
                          struct btrfs_root *root, struct dentry *dentry);
-int btrfs_log_inode(struct btrfs_trans_handle *trans,
-                    struct btrfs_root *root, struct inode *inode,
-                    int inode_only);
 int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
                                 struct btrfs_root *root,
                                 const char *name, int name_len,
@@ -38,4 +33,16 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
                               struct btrfs_root *root,
                               const char *name, int name_len,
                               struct inode *inode, u64 dirid);
+int btrfs_join_running_log_trans(struct btrfs_root *root);
+int btrfs_end_log_trans(struct btrfs_root *root);
+int btrfs_pin_log_trans(struct btrfs_root *root);
+int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
+                    struct btrfs_root *root, struct inode *inode,
+                    struct dentry *parent, int exists_only);
+void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
+                             struct inode *dir, struct inode *inode,
+                             int for_rename);
+int btrfs_log_new_name(struct btrfs_trans_handle *trans,
+                        struct inode *inode, struct inode *old_dir,
+                        struct dentry *parent);
 #endif
diff --git a/fs/buffer.c b/fs/buffer.c
index a2fd743d97cb..f5f8b15a6e40 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -290,7 +290,7 @@ static void free_more_memory(void)
                                                &zone);
                if (zone)
                        try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0,
-                                                GFP_NOFS);
+                                                GFP_NOFS, NULL);
        }
 }
@@ -547,6 +547,39 @@ repeat:
        return err;
 }
+void do_thaw_all(unsigned long unused)
+{
+        struct super_block *sb;
+        char b[BDEVNAME_SIZE];
+        spin_lock(&sb_lock);
+restart:
+        list_for_each_entry(sb, &super_blocks, s_list) {
+                sb->s_count++;
+                spin_unlock(&sb_lock);
+                down_read(&sb->s_umount);
+                while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb))
+                        printk(KERN_WARNING "Emergency Thaw on %s\n",
+                               bdevname(sb->s_bdev, b));
+                up_read(&sb->s_umount);
+                spin_lock(&sb_lock);
+                if (__put_super_and_need_restart(sb))
+                        goto restart;
+        }
+        spin_unlock(&sb_lock);
+        printk(KERN_WARNING "Emergency Thaw complete\n");
+}
+/**
+ * emergency_thaw_all -- forcibly thaw every frozen filesystem
+ *
+ * Used for emergency unfreeze of all filesystems via SysRq
+ */
+void emergency_thaw_all(void)
+{
+        pdflush_operation(do_thaw_all, 0);
+}
 /**
 * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
 * @mapping: the mapping which wants those buffers written
@@ -621,14 +654,7 @@ static void __set_page_dirty(struct page *page,
        spin_lock_irq(&mapping->tree_lock);
        if (page->mapping) {    /* Race with truncate? */
                WARN_ON_ONCE(warn && !PageUptodate(page));
+                account_page_dirtied(page, mapping);
-                if (mapping_cap_account_dirty(mapping)) {
-                        __inc_zone_page_state(page, NR_FILE_DIRTY);
-                        __inc_bdi_stat(mapping->backing_dev_info,
-                                        BDI_RECLAIMABLE);
-                        task_dirty_inc(current);
-                        task_io_account_write(PAGE_CACHE_SIZE);
-                }
                radix_tree_tag_set(&mapping->page_tree,
                                page_index(page), PAGECACHE_TAG_DIRTY);
        }
@@ -2320,13 +2346,14 @@ int block_commit_write(struct page *page, unsigned from, unsigned to)
 * unlock the page.
 */
 int
-block_page_mkwrite(struct vm_area_struct *vma, struct page *page,
+block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
                   get_block_t get_block)
 {
+        struct page *page = vmf->page;
        struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
        unsigned long end;
        loff_t size;
-        int ret = -EINVAL;
+        int ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
        lock_page(page);
        size = i_size_read(inode);
@@ -2346,6 +2373,13 @@ block_page_mkwrite(struct vm_area_struct *vma, struct page *page,
        if (!ret)
                ret = block_commit_write(page, 0, end);
+        if (unlikely(ret)) {
+                if (ret == -ENOMEM)
+                        ret = VM_FAULT_OOM;
+                else /* -ENOSPC, -EIO, etc */
+                        ret = VM_FAULT_SIGBUS;
+        }
 out_unlock:
        unlock_page(page);
        return ret;
diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c
index e4a6223c3145..af737bb56cb7 100644
--- a/fs/ecryptfs/keystore.c
+++ b/fs/ecryptfs/keystore.c
@@ -740,8 +740,7 @@ ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes,
 out_release_free_unlock:
        crypto_free_hash(s->hash_desc.tfm);
 out_free_unlock:
-        memset(s->block_aligned_filename, 0, s->block_aligned_filename_size);
+        kzfree(s->block_aligned_filename);
-        kfree(s->block_aligned_filename);
 out_unlock:
        mutex_unlock(s->tfm_mutex);
 out:
diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c
index 96ef51489e01..295e7fa56755 100644
--- a/fs/ecryptfs/messaging.c
+++ b/fs/ecryptfs/messaging.c
@@ -291,8 +291,7 @@ int ecryptfs_exorcise_daemon(struct ecryptfs_daemon *daemon)
        if (daemon->user_ns)
                put_user_ns(daemon->user_ns);
        mutex_unlock(&daemon->mux);
-        memset(daemon, 0, sizeof(*daemon));
+        kzfree(daemon);
-        kfree(daemon);
 out:
        return rc;
 }
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 5de2c2db3aa2..2a701d593d35 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -28,6 +28,7 @@ struct eventfd_ctx {
         * issue a wakeup.
         */
        __u64 count;
+        unsigned int flags;
 };
 /*
@@ -50,7 +51,7 @@ int eventfd_signal(struct file *file, int n)
                n = (int) (ULLONG_MAX - ctx->count);
        ctx->count += n;
        if (waitqueue_active(&ctx->wqh))
-                wake_up_locked(&ctx->wqh);
+                wake_up_locked_poll(&ctx->wqh, POLLIN);
        spin_unlock_irqrestore(&ctx->wqh.lock, flags);
        return n;
@@ -87,22 +88,20 @@ static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count,
 {
        struct eventfd_ctx *ctx = file->private_data;
        ssize_t res;
-        __u64 ucnt;
+        __u64 ucnt = 0;
        DECLARE_WAITQUEUE(wait, current);
        if (count < sizeof(ucnt))
                return -EINVAL;
        spin_lock_irq(&ctx->wqh.lock);
        res = -EAGAIN;
-        ucnt = ctx->count;
+        if (ctx->count > 0)
-        if (ucnt > 0)
                res = sizeof(ucnt);
        else if (!(file->f_flags & O_NONBLOCK)) {
                __add_wait_queue(&ctx->wqh, &wait);
                for (res = 0;;) {
                        set_current_state(TASK_INTERRUPTIBLE);
                        if (ctx->count > 0) {
-                                ucnt = ctx->count;
                                res = sizeof(ucnt);
                                break;
                        }
@@ -117,10 +116,11 @@ static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count,
                __remove_wait_queue(&ctx->wqh, &wait);
                __set_current_state(TASK_RUNNING);
        }
-        if (res > 0) {
+        if (likely(res > 0)) {
-                ctx->count = 0;
+                ucnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count;
+                ctx->count -= ucnt;
                if (waitqueue_active(&ctx->wqh))
-                        wake_up_locked(&ctx->wqh);
+                        wake_up_locked_poll(&ctx->wqh, POLLOUT);
        }
        spin_unlock_irq(&ctx->wqh.lock);
        if (res > 0 && put_user(ucnt, (__u64 __user *) buf))
@@ -166,10 +166,10 @@ static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t c
                __remove_wait_queue(&ctx->wqh, &wait);
                __set_current_state(TASK_RUNNING);
        }
-        if (res > 0) {
+        if (likely(res > 0)) {
                ctx->count += ucnt;
                if (waitqueue_active(&ctx->wqh))
-                        wake_up_locked(&ctx->wqh);
+                        wake_up_locked_poll(&ctx->wqh, POLLIN);
        }
        spin_unlock_irq(&ctx->wqh.lock);
@@ -207,7 +207,7 @@ SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
        BUILD_BUG_ON(EFD_CLOEXEC != O_CLOEXEC);
        BUILD_BUG_ON(EFD_NONBLOCK != O_NONBLOCK);
-        if (flags & ~(EFD_CLOEXEC | EFD_NONBLOCK))
+        if (flags & ~EFD_FLAGS_SET)
                return -EINVAL;
        ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
@@ -216,13 +216,14 @@ SYSCALL_DEFINE2(eventfd2, unsigned int, count, int, flags)
        init_waitqueue_head(&ctx->wqh);
        ctx->count = count;
+        ctx->flags = flags;
        /*
         * When we call this, the initialization must be complete, since
         * anon_inode_getfd() will install the fd.
         */
        fd = anon_inode_getfd("[eventfd]", &eventfd_fops, ctx,
-                              flags & (O_CLOEXEC | O_NONBLOCK));
+                              flags & EFD_SHARED_FCNTL_FLAGS);
        if (fd < 0)
                kfree(ctx);
        return fd;
@@ -232,3 +233,4 @@ SYSCALL_DEFINE1(eventfd, unsigned int, count)
 {
        return sys_eventfd2(count, 0);
 }
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index c5c424f23fd5..a89f370fadb5 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1,6 +1,6 @@
 /*
- *  fs/eventpoll.c (Efficent event polling implementation)
+ *  fs/eventpoll.c (Efficient event retrieval implementation)
- *  Copyright (C) 2001,...,2007  Davide Libenzi
+ *  Copyright (C) 2001,...,2009  Davide Libenzi
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
@@ -71,29 +71,11 @@
 * a better scalability.
 */
-#define DEBUG_EPOLL 0
-#if DEBUG_EPOLL > 0
-#define DPRINTK(x) printk x
-#define DNPRINTK(n, x) do { if ((n) <= DEBUG_EPOLL) printk x; } while (0)
-#else /* #if DEBUG_EPOLL > 0 */
-#define DPRINTK(x) (void) 0
-#define DNPRINTK(n, x) (void) 0
-#endif /* #if DEBUG_EPOLL > 0 */
-#define DEBUG_EPI 0
-#if DEBUG_EPI != 0
-#define EPI_SLAB_DEBUG (SLAB_DEBUG_FREE | SLAB_RED_ZONE /* | SLAB_POISON */)
-#else /* #if DEBUG_EPI != 0 */
-#define EPI_SLAB_DEBUG 0
-#endif /* #if DEBUG_EPI != 0 */
 /* Epoll private bits inside the event mask */
 #define EP_PRIVATE_BITS (EPOLLONESHOT | EPOLLET)
-/* Maximum number of poll wake up nests we are allowing */
+/* Maximum nesting depth allowed inside epoll sets */
-#define EP_MAX_POLLWAKE_NESTS 4
+#define EP_MAX_NESTS 4
 /* Maximum msec timeout value storeable in a long int */
 #define EP_MAX_MSTIMEO min(1000ULL * MAX_SCHEDULE_TIMEOUT / HZ, (LONG_MAX - 999ULL) / HZ)
@@ -110,24 +92,21 @@ struct epoll_filefd {
 };
 /*
- * Node that is linked into the "wake_task_list" member of the "struct poll_safewake".
+ * Structure used to track possible nested calls, for too deep recursions
- * It is used to keep track on all tasks that are currently inside the wake_up() code
+ * and loop cycles.
- * to 1) short-circuit the one coming from the same task and same wait queue head
- * (loop) 2) allow a maximum number of epoll descriptors inclusion nesting
- * 3) let go the ones coming from other tasks.
 */
-struct wake_task_node {
+struct nested_call_node {
        struct list_head llink;
-        struct task_struct *task;
+        void *cookie;
-        wait_queue_head_t *wq;
+        int cpu;
 };
 /*
- * This is used to implement the safe poll wake up avoiding to reenter
+ * This structure is used as collector for nested calls, to check for
- * the poll callback from inside wake_up().
+ * maximum recursion depth and loop cycles.
 */
-struct poll_safewake {
+struct nested_calls {
-        struct list_head wake_task_list;
+        struct list_head tasks_call_list;
        spinlock_t lock;
 };
@@ -213,7 +192,7 @@ struct eppoll_entry {
        struct list_head llink;
        /* The "base" pointer is set to the container "struct epitem" */
-        void *base;
+        struct epitem *base;
        /*
         * Wait queue item that will be linked to the target file wait
@@ -231,6 +210,12 @@ struct ep_pqueue {
        struct epitem *epi;
 };
+/* Used by the ep_send_events() function as callback private data */
+struct ep_send_events_data {
+        int maxevents;
+        struct epoll_event __user *events;
+};
 /*
 * Configuration options available inside /proc/sys/fs/epoll/
 */
@@ -242,8 +227,11 @@ static int max_user_watches __read_mostly;
 */
 static DEFINE_MUTEX(epmutex);
-/* Safe wake up implementation */
+/* Used for safe wake up implementation */
-static struct poll_safewake psw;
+static struct nested_calls poll_safewake_ncalls;
+/* Used to call file's f_op->poll() under the nested calls boundaries */
+static struct nested_calls poll_readywalk_ncalls;
 /* Slab cache used to allocate "struct epitem" */
 static struct kmem_cache *epi_cache __read_mostly;
@@ -312,89 +300,230 @@ static inline int ep_op_has_event(int op)
 }
 /* Initialize the poll safe wake up structure */
-static void ep_poll_safewake_init(struct poll_safewake *psw)
+static void ep_nested_calls_init(struct nested_calls *ncalls)
 {
+        INIT_LIST_HEAD(&ncalls->tasks_call_list);
-        INIT_LIST_HEAD(&psw->wake_task_list);
+        spin_lock_init(&ncalls->lock);
-        spin_lock_init(&psw->lock);
 }
-/*
+/**
- * Perform a safe wake up of the poll wait list. The problem is that
+ * ep_call_nested - Perform a bound (possibly) nested call, by checking
- * with the new callback'd wake up system, it is possible that the
+ *                  that the recursion limit is not exceeded, and that
- * poll callback is reentered from inside the call to wake_up() done
+ *                  the same nested call (by the meaning of same cookie) is
- * on the poll wait queue head. The rule is that we cannot reenter the
+ *                  not re-entered.
- * wake up code from the same task more than EP_MAX_POLLWAKE_NESTS times,
+ *
- * and we cannot reenter the same wait queue head at all. This will
+ * @ncalls: Pointer to the nested_calls structure to be used for this call.
- * enable to have a hierarchy of epoll file descriptor of no more than
+ * @max_nests: Maximum number of allowed nesting calls.
- * EP_MAX_POLLWAKE_NESTS deep. We need the irq version of the spin lock
+ * @nproc: Nested call core function pointer.
- * because this one gets called by the poll callback, that in turn is called
+ * @priv: Opaque data to be passed to the @nproc callback.
- * from inside a wake_up(), that might be called from irq context.
+ * @cookie: Cookie to be used to identify this nested call.
+ *
+ * Returns: The code returned by the @nproc callback, or -1 if a loop was
+ *          detected or the maximum recursion limit has been exceeded.
 */
-static void ep_poll_safewake(struct poll_safewake *psw, wait_queue_head_t *wq)
+static int ep_call_nested(struct nested_calls *ncalls, int max_nests,
+                          int (*nproc)(void *, void *, int), void *priv,
+                          void *cookie)
 {
-        int wake_nests = 0;
+        int error, call_nests = 0;
        unsigned long flags;
-        struct task_struct *this_task = current;
+        int this_cpu = get_cpu();
-        struct list_head *lsthead = &psw->wake_task_list;
+        struct list_head *lsthead = &ncalls->tasks_call_list;
-        struct wake_task_node *tncur;
+        struct nested_call_node *tncur;
-        struct wake_task_node tnode;
+        struct nested_call_node tnode;
-        spin_lock_irqsave(&psw->lock, flags);
+        spin_lock_irqsave(&ncalls->lock, flags);
-        /* Try to see if the current task is already inside this wakeup call */
+        /*
+         * Try to see if the current task is already inside this wakeup call.
+         * We use a list here, since the population inside this set is always
+         * very much limited.
+         */
        list_for_each_entry(tncur, lsthead, llink) {
+                if (tncur->cpu == this_cpu &&
-                if (tncur->wq == wq ||
+                    (tncur->cookie == cookie || ++call_nests > max_nests)) {
-                    (tncur->task == this_task && ++wake_nests > EP_MAX_POLLWAKE_NESTS)) {
                        /*
                         * Ops ... loop detected or maximum nest level reached.
                         * We abort this wake by breaking the cycle itself.
                         */
-                        spin_unlock_irqrestore(&psw->lock, flags);
+                        error = -1;
-                        return;
+                        goto out_unlock;
                }
        }
-        /* Add the current task to the list */
+        /* Add the current task and cookie to the list */
-        tnode.task = this_task;
+        tnode.cpu = this_cpu;
-        tnode.wq = wq;
+        tnode.cookie = cookie;
        list_add(&tnode.llink, lsthead);
-        spin_unlock_irqrestore(&psw->lock, flags);
+        spin_unlock_irqrestore(&ncalls->lock, flags);
-        /* Do really wake up now */
+        /* Call the nested function */
-        wake_up_nested(wq, 1 + wake_nests);
+        error = (*nproc)(priv, cookie, call_nests);
        /* Remove the current task from the list */
-        spin_lock_irqsave(&psw->lock, flags);
+        spin_lock_irqsave(&ncalls->lock, flags);
        list_del(&tnode.llink);
-        spin_unlock_irqrestore(&psw->lock, flags);
+ out_unlock:
+        spin_unlock_irqrestore(&ncalls->lock, flags);
+        put_cpu();
+        return error;
+}
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+static inline void ep_wake_up_nested(wait_queue_head_t *wqueue,
+                                     unsigned long events, int subclass)
+{
+        unsigned long flags;
+        spin_lock_irqsave_nested(&wqueue->lock, flags, subclass);
+        wake_up_locked_poll(wqueue, events);
+        spin_unlock_irqrestore(&wqueue->lock, flags);
+}
+#else
+static inline void ep_wake_up_nested(wait_queue_head_t *wqueue,
+                                     unsigned long events, int subclass)
+{
+        wake_up_poll(wqueue, events);
+}
+#endif
+static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests)
+{
+        ep_wake_up_nested((wait_queue_head_t *) cookie, POLLIN,
+                          1 + call_nests);
+        return 0;
+}
+/*
+ * Perform a safe wake up of the poll wait list. The problem is that
+ * with the new callback'd wake up system, it is possible that the
+ * poll callback is reentered from inside the call to wake_up() done
+ * on the poll wait queue head. The rule is that we cannot reenter the
+ * wake up code from the same task more than EP_MAX_NESTS times,
+ * and we cannot reenter the same wait queue head at all. This will
+ * enable to have a hierarchy of epoll file descriptor of no more than
+ * EP_MAX_NESTS deep.
+ */
+static void ep_poll_safewake(wait_queue_head_t *wq)
+{
+        ep_call_nested(&poll_safewake_ncalls, EP_MAX_NESTS,
+                       ep_poll_wakeup_proc, NULL, wq);
 }
 /*
- * This function unregister poll callbacks from the associated file descriptor.
+ * This function unregisters poll callbacks from the associated file
- * Since this must be called without holding "ep->lock" the atomic exchange trick
+ * descriptor.  Must be called with "mtx" held (or "epmutex" if called from
- * will protect us from multiple unregister.
+ * ep_free).
 */
 static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi)
 {
-        int nwait;
        struct list_head *lsthead = &epi->pwqlist;
        struct eppoll_entry *pwq;
-        /* This is called without locks, so we need the atomic exchange */
+        while (!list_empty(lsthead)) {
-        nwait = xchg(&epi->nwait, 0);
+                pwq = list_first_entry(lsthead, struct eppoll_entry, llink);
-        if (nwait) {
+                list_del(&pwq->llink);
-                while (!list_empty(lsthead)) {
+                remove_wait_queue(pwq->whead, &pwq->wait);
-                        pwq = list_first_entry(lsthead, struct eppoll_entry, llink);
+                kmem_cache_free(pwq_cache, pwq);
+        }
+}
-                        list_del_init(&pwq->llink);
+/**
-                        remove_wait_queue(pwq->whead, &pwq->wait);
+ * ep_scan_ready_list - Scans the ready list in a way that makes it possible for
-                        kmem_cache_free(pwq_cache, pwq);
+ *                      the scan code to call f_op->poll(). Also allows for
-                }
+ *                      O(NumReady) performance.
+ *
+ * @ep: Pointer to the epoll private data structure.
+ * @sproc: Pointer to the scan callback.
+ * @priv: Private opaque data passed to the @sproc callback.
+ *
+ * Returns: The same integer error code returned by the @sproc callback.
+ */
+static int ep_scan_ready_list(struct eventpoll *ep,
+                              int (*sproc)(struct eventpoll *,
+                                           struct list_head *, void *),
+                              void *priv)
+{
+        int error, pwake = 0;
+        unsigned long flags;
+        struct epitem *epi, *nepi;
+        LIST_HEAD(txlist);
+        /*
+         * We need to lock this because we could be hit by
+         * eventpoll_release_file() and epoll_ctl().
+         */
+        mutex_lock(&ep->mtx);
+        /*
+         * Steal the ready list, and re-init the original one to the
+         * empty list. Also, set ep->ovflist to NULL so that events
+         * happening while looping w/out locks, are not lost. We cannot
+         * have the poll callback to queue directly on ep->rdllist,
+         * because we want the "sproc" callback to be able to do it
+         * in a lockless way.
+         */
+        spin_lock_irqsave(&ep->lock, flags);
+        list_splice_init(&ep->rdllist, &txlist);
+        ep->ovflist = NULL;
+        spin_unlock_irqrestore(&ep->lock, flags);
+        /*
+         * Now call the callback function.
+         */
+        error = (*sproc)(ep, &txlist, priv);
+        spin_lock_irqsave(&ep->lock, flags);
+        /*
+         * During the time we spent inside the "sproc" callback, some
+         * other events might have been queued by the poll callback.
+         * We re-insert them inside the main ready-list here.
+         */
+        for (nepi = ep->ovflist; (epi = nepi) != NULL;
+             nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
+                /*
+                 * We need to check if the item is already in the list.
+                 * During the "sproc" callback execution time, items are
+                 * queued into ->ovflist but the "txlist" might already
+                 * contain them, and the list_splice() below takes care of them.
+                 */
+                if (!ep_is_linked(&epi->rdllink))
+                        list_add_tail(&epi->rdllink, &ep->rdllist);
+        }
+        /*
+         * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after
+         * releasing the lock, events will be queued in the normal way inside
+         * ep->rdllist.
+         */
+        ep->ovflist = EP_UNACTIVE_PTR;
+        /*
+         * Quickly re-inject items left on "txlist".
+         */
+        list_splice(&txlist, &ep->rdllist);
+        if (!list_empty(&ep->rdllist)) {
+                /*
+                 * Wake up (if active) both the eventpoll wait list and
+                 * the ->poll() wait list (delayed after we release the lock).
+                 */
+                if (waitqueue_active(&ep->wq))
+                        wake_up_locked(&ep->wq);
+                if (waitqueue_active(&ep->poll_wait))
+                        pwake++;
        }
+        spin_unlock_irqrestore(&ep->lock, flags);
+        mutex_unlock(&ep->mtx);
+        /* We have to call this outside the lock */
+        if (pwake)
+                ep_poll_safewake(&ep->poll_wait);
+        return error;
 }
 /*
@@ -434,9 +563,6 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi)
        atomic_dec(&ep->user->epoll_watches);
-        DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_remove(%p, %p)\n",
-                     current, ep, file));
        return 0;
 }
@@ -447,7 +573,7 @@ static void ep_free(struct eventpoll *ep)
        /* We need to release all tasks waiting for these file */
        if (waitqueue_active(&ep->poll_wait))
-                ep_poll_safewake(&psw, &ep->poll_wait);
+                ep_poll_safewake(&ep->poll_wait);
        /*
         * We need to lock this because we could be hit by
@@ -492,26 +618,54 @@ static int ep_eventpoll_release(struct inode *inode, struct file *file)
        if (ep)
                ep_free(ep);
-        DNPRINTK(3, (KERN_INFO "[%p] eventpoll: close() ep=%p\n", current, ep));
        return 0;
 }
+static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head,
+                               void *priv)
+{
+        struct epitem *epi, *tmp;
+        list_for_each_entry_safe(epi, tmp, head, rdllink) {
+                if (epi->ffd.file->f_op->poll(epi->ffd.file, NULL) &
+                    epi->event.events)
+                        return POLLIN | POLLRDNORM;
+                else {
+                        /*
+                         * Item has been dropped into the ready list by the poll
+                         * callback, but it's not actually ready, as far as
+                         * caller requested events goes. We can remove it here.
+                         */
+                        list_del_init(&epi->rdllink);
+                }
+        }
+        return 0;
+}
+static int ep_poll_readyevents_proc(void *priv, void *cookie, int call_nests)
+{
+        return ep_scan_ready_list(priv, ep_read_events_proc, NULL);
+}
 static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait)
 {
-        unsigned int pollflags = 0;
+        int pollflags;
-        unsigned long flags;
        struct eventpoll *ep = file->private_data;
        /* Insert inside our poll wait queue */
        poll_wait(file, &ep->poll_wait, wait);
-        /* Check our condition */
+        /*
-        spin_lock_irqsave(&ep->lock, flags);
+         * Proceed to find out if wanted events are really available inside
-        if (!list_empty(&ep->rdllist))
+         * the ready list. This needs to be done under ep_call_nested()
-                pollflags = POLLIN | POLLRDNORM;
+         * supervision, since the call to f_op->poll() done on listed files
-        spin_unlock_irqrestore(&ep->lock, flags);
+         * could re-enter here.
+         */
+        pollflags = ep_call_nested(&poll_readywalk_ncalls, EP_MAX_NESTS,
+                                   ep_poll_readyevents_proc, ep, ep);
-        return pollflags;
+        return pollflags != -1 ? pollflags : 0;
 }
 /* File callbacks that implement the eventpoll file behaviour */
@@ -541,7 +695,7 @@ void eventpoll_release_file(struct file *file)
         * We don't want to get "file->f_lock" because it is not
         * necessary. It is not necessary because we're in the "struct file"
         * cleanup path, and this means that noone is using this file anymore.
-         * So, for example, epoll_ctl() cannot hit here sicne if we reach this
+         * So, for example, epoll_ctl() cannot hit here since if we reach this
         * point, the file counter already went to zero and fget() would fail.
         * The only hit might come from ep_free() but by holding the mutex
         * will correctly serialize the operation. We do need to acquire
@@ -588,8 +742,6 @@ static int ep_alloc(struct eventpoll **pep)
        *pep = ep;
-        DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_alloc() ep=%p\n",
-                     current, ep));
        return 0;
 free_uid:
@@ -623,9 +775,6 @@ static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd)
                }
        }
-        DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_find(%p) -> %p\n",
-                     current, file, epir));
        return epir;
 }
@@ -641,9 +790,6 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
        struct epitem *epi = ep_item_from_wait(wait);
        struct eventpoll *ep = epi->ep;
-        DNPRINTK(3, (KERN_INFO "[%p] eventpoll: poll_callback(%p) epi=%p ep=%p\n",
-                     current, epi->ffd.file, epi, ep));
        spin_lock_irqsave(&ep->lock, flags);
        /*
@@ -656,6 +802,15 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
                goto out_unlock;
        /*
+         * Check the events coming with the callback. At this stage, not
+         * every device reports the events in the "key" parameter of the
+         * callback. We need to be able to handle both cases here, hence the
+         * test for "key" != NULL before the event match test.
+         */
+        if (key && !((unsigned long) key & epi->event.events))
+                goto out_unlock;
+        /*
         * If we are trasfering events to userspace, we can hold no locks
         * (because we're accessing user memory, and because of linux f_op->poll()
         * semantics). All the events that happens during that period of time are
@@ -670,12 +825,9 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
        }
        /* If this file is already in the ready list we exit soon */
-        if (ep_is_linked(&epi->rdllink))
+        if (!ep_is_linked(&epi->rdllink))
-                goto is_linked;
+                list_add_tail(&epi->rdllink, &ep->rdllist);
-        list_add_tail(&epi->rdllink, &ep->rdllist);
-is_linked:
        /*
         * Wake up ( if active ) both the eventpoll wait list and the ->poll()
         * wait list.
@@ -690,7 +842,7 @@ out_unlock:
        /* We have to call this outside the lock */
        if (pwake)
-                ep_poll_safewake(&psw, &ep->poll_wait);
+                ep_poll_safewake(&ep->poll_wait);
        return 1;
 }
@@ -817,10 +969,7 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event,
        /* We have to call this outside the lock */
        if (pwake)
-                ep_poll_safewake(&psw, &ep->poll_wait);
+                ep_poll_safewake(&ep->poll_wait);
-        DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_insert(%p, %p, %d)\n",
-                     current, ep, tfile, fd));
        return 0;
@@ -851,15 +1000,14 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
 {
        int pwake = 0;
        unsigned int revents;
-        unsigned long flags;
        /*
-         * Set the new event interest mask before calling f_op->poll(), otherwise
+         * Set the new event interest mask before calling f_op->poll();
-         * a potential race might occur. In fact if we do this operation inside
+         * otherwise we might miss an event that happens between the
-         * the lock, an event might happen between the f_op->poll() call and the
+         * f_op->poll() call and the new event set registering.
-         * new event set registering.
         */
        epi->event.events = event->events;
+        epi->event.data = event->data; /* protected by mtx */
        /*
         * Get current event bits. We can safely use the file* here because
@@ -867,16 +1015,12 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
         */
        revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL);
-        spin_lock_irqsave(&ep->lock, flags);
-        /* Copy the data member from inside the lock */
-        epi->event.data = event->data;
        /*
         * If the item is "hot" and it is not registered inside the ready
         * list, push it inside.
         */
        if (revents & event->events) {
+                spin_lock_irq(&ep->lock);
                if (!ep_is_linked(&epi->rdllink)) {
                        list_add_tail(&epi->rdllink, &ep->rdllist);
@@ -886,142 +1030,84 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even
                        if (waitqueue_active(&ep->poll_wait))
                                pwake++;
                }
+                spin_unlock_irq(&ep->lock);
        }
-        spin_unlock_irqrestore(&ep->lock, flags);
        /* We have to call this outside the lock */
        if (pwake)
-                ep_poll_safewake(&psw, &ep->poll_wait);
+                ep_poll_safewake(&ep->poll_wait);
        return 0;
 }
-static int ep_send_events(struct eventpoll *ep, struct epoll_event __user *events,
+static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head,
-                          int maxevents)
+                               void *priv)
 {
-        int eventcnt, error = -EFAULT, pwake = 0;
+        struct ep_send_events_data *esed = priv;
+        int eventcnt;
        unsigned int revents;
-        unsigned long flags;
+        struct epitem *epi;
-        struct epitem *epi, *nepi;
+        struct epoll_event __user *uevent;
-        struct list_head txlist;
-        INIT_LIST_HEAD(&txlist);
-        /*
-         * We need to lock this because we could be hit by
-         * eventpoll_release_file() and epoll_ctl(EPOLL_CTL_DEL).
-         */
-        mutex_lock(&ep->mtx);
-        /*
-         * Steal the ready list, and re-init the original one to the
-         * empty list. Also, set ep->ovflist to NULL so that events
-         * happening while looping w/out locks, are not lost. We cannot
-         * have the poll callback to queue directly on ep->rdllist,
-         * because we are doing it in the loop below, in a lockless way.
-         */
-        spin_lock_irqsave(&ep->lock, flags);
-        list_splice(&ep->rdllist, &txlist);
-        INIT_LIST_HEAD(&ep->rdllist);
-        ep->ovflist = NULL;
-        spin_unlock_irqrestore(&ep->lock, flags);
        /*
-         * We can loop without lock because this is a task private list.
+         * We can loop without lock because we are passed a task private list.
-         * We just splice'd out the ep->rdllist in ep_collect_ready_items().
+         * Items cannot vanish during the loop because ep_scan_ready_list() is
-         * Items cannot vanish during the loop because we are holding "mtx".
+         * holding "mtx" during this call.
         */
-        for (eventcnt = 0; !list_empty(&txlist) && eventcnt < maxevents;) {
+        for (eventcnt = 0, uevent = esed->events;
-                epi = list_first_entry(&txlist, struct epitem, rdllink);
+             !list_empty(head) && eventcnt < esed->maxevents;) {
+                epi = list_first_entry(head, struct epitem, rdllink);
                list_del_init(&epi->rdllink);
-                /*
+                revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL) &
-                 * Get the ready file event set. We can safely use the file
+                        epi->event.events;
-                 * because we are holding the "mtx" and this will guarantee
-                 * that both the file and the item will not vanish.
-                 */
-                revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL);
-                revents &= epi->event.events;
                /*
-                 * Is the event mask intersect the caller-requested one,
+                 * If the event mask intersects the caller-requested one,
-                 * deliver the event to userspace. Again, we are holding
+                 * deliver the event to userspace. Again, ep_scan_ready_list()
-                 * "mtx", so no operations coming from userspace can change
+                 * is holding "mtx", so no operations coming from userspace
-                 * the item.
+                 * can change the item.
                 */
                if (revents) {
-                        if (__put_user(revents,
+                        if (__put_user(revents, &uevent->events) ||
-                                       &events[eventcnt].events) ||
+                            __put_user(epi->event.data, &uevent->data)) {
-                            __put_user(epi->event.data,
+                                list_add(&epi->rdllink, head);
-                                       &events[eventcnt].data))
+                                return eventcnt ? eventcnt : -EFAULT;
-                                goto errxit;
+                        }
+                        eventcnt++;
+                        uevent++;
                        if (epi->event.events & EPOLLONESHOT)
                                epi->event.events &= EP_PRIVATE_BITS;
-                        eventcnt++;
+                        else if (!(epi->event.events & EPOLLET)) {
+                                /*
+                                 * If this file has been added with Level
+                                 * Trigger mode, we need to insert back inside
+                                 * the ready list, so that the next call to
+                                 * epoll_wait() will check again the events
+                                 * availability. At this point, no one can insert
+                                 * into ep->rdllist besides us. The epoll_ctl()
+                                 * callers are locked out by
+                                 * ep_scan_ready_list() holding "mtx" and the
+                                 * poll callback will queue them in ep->ovflist.
+                                 */
+                                list_add_tail(&epi->rdllink, &ep->rdllist);
+                        }
                }
-                /*
-                 * At this point, noone can insert into ep->rdllist besides
-                 * us. The epoll_ctl() callers are locked out by us holding
-                 * "mtx" and the poll callback will queue them in ep->ovflist.
-                 */
-                if (!(epi->event.events & EPOLLET) &&
-                    (revents & epi->event.events))
-                        list_add_tail(&epi->rdllink, &ep->rdllist);
-        }
-        error = 0;
-errxit:
-        spin_lock_irqsave(&ep->lock, flags);
-        /*
-         * During the time we spent in the loop above, some other events
-         * might have been queued by the poll callback. We re-insert them
-         * inside the main ready-list here.
-         */
-        for (nepi = ep->ovflist; (epi = nepi) != NULL;
-             nepi = epi->next, epi->next = EP_UNACTIVE_PTR) {
-                /*
-                 * If the above loop quit with errors, the epoll item might still
-                 * be linked to "txlist", and the list_splice() done below will
-                 * take care of those cases.
-                 */
-                if (!ep_is_linked(&epi->rdllink))
-                        list_add_tail(&epi->rdllink, &ep->rdllist);
        }
-        /*
-         * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after
-         * releasing the lock, events will be queued in the normal way inside
-         * ep->rdllist.
-         */
-        ep->ovflist = EP_UNACTIVE_PTR;
-        /*
+        return eventcnt;
-         * In case of error in the event-send loop, or in case the number of
+}
-         * ready events exceeds the userspace limit, we need to splice the
-         * "txlist" back inside ep->rdllist.
-         */
-        list_splice(&txlist, &ep->rdllist);
-        if (!list_empty(&ep->rdllist)) {
-                /*
-                 * Wake up (if active) both the eventpoll wait list and the ->poll()
-                 * wait list (delayed after we release the lock).
-                 */
-                if (waitqueue_active(&ep->wq))
-                        wake_up_locked(&ep->wq);
-                if (waitqueue_active(&ep->poll_wait))
-                        pwake++;
-        }
-        spin_unlock_irqrestore(&ep->lock, flags);
-        mutex_unlock(&ep->mtx);
+static int ep_send_events(struct eventpoll *ep,
+                          struct epoll_event __user *events, int maxevents)
+{
+        struct ep_send_events_data esed;
-        /* We have to call this outside the lock */
+        esed.maxevents = maxevents;
-        if (pwake)
+        esed.events = events;
-                ep_poll_safewake(&psw, &ep->poll_wait);
-        return eventcnt == 0 ? error: eventcnt;
+        return ep_scan_ready_list(ep, ep_send_events_proc, &esed);
 }
 static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
@@ -1033,7 +1119,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
        wait_queue_t wait;
        /*
-         * Calculate the timeout by checking for the "infinite" value ( -1 )
+         * Calculate the timeout by checking for the "infinite" value (-1)
         * and the overflow condition. The passed timeout is in milliseconds,
         * that's why (t * HZ) / 1000.
         */
@@ -1076,9 +1162,8 @@ retry:
                set_current_state(TASK_RUNNING);
        }
        /* Is it worth to try to dig for events ? */
-        eavail = !list_empty(&ep->rdllist);
+        eavail = !list_empty(&ep->rdllist) || ep->ovflist != EP_UNACTIVE_PTR;
        spin_unlock_irqrestore(&ep->lock, flags);
@@ -1099,41 +1184,30 @@ retry:
 */
 SYSCALL_DEFINE1(epoll_create1, int, flags)
 {
-        int error, fd = -1;
+        int error;
-        struct eventpoll *ep;
+        struct eventpoll *ep = NULL;
        /* Check the EPOLL_* constant for consistency.  */
        BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC);
        if (flags & ~EPOLL_CLOEXEC)
                return -EINVAL;
-        DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d)\n",
-                     current, flags));
        /*
-         * Create the internal data structure ( "struct eventpoll" ).
+         * Create the internal data structure ("struct eventpoll").
         */
        error = ep_alloc(&ep);
-        if (error < 0) {
+        if (error < 0)
-                fd = error;
+                return error;
-                goto error_return;
-        }
        /*
         * Creates all the items needed to setup an eventpoll file. That is,
         * a file structure and a free file descriptor.
         */
-        fd = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep,
+        error = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep,
-                              flags & O_CLOEXEC);
+                                 flags & O_CLOEXEC);
-        if (fd < 0)
+        if (error < 0)
                ep_free(ep);
-error_return:
+        return error;
-        DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n",
-                     current, flags, fd));
-        return fd;
 }
 SYSCALL_DEFINE1(epoll_create, int, size)
@@ -1158,9 +1232,6 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
        struct epitem *epi;
        struct epoll_event epds;
-        DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p)\n",
-                     current, epfd, op, fd, event));
        error = -EFAULT;
        if (ep_op_has_event(op) &&
            copy_from_user(&epds, event, sizeof(struct epoll_event)))
@@ -1211,7 +1282,6 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
        case EPOLL_CTL_ADD:
                if (!epi) {
                        epds.events |= POLLERR | POLLHUP;
                        error = ep_insert(ep, &epds, tfile, fd);
                } else
                        error = -EEXIST;
@@ -1237,8 +1307,6 @@ error_tgt_fput:
 error_fput:
        fput(file);
 error_return:
-        DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p) = %d\n",
-                     current, epfd, op, fd, event, error));
        return error;
 }
@@ -1254,9 +1322,6 @@ SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
        struct file *file;
        struct eventpoll *ep;
-        DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d)\n",
-                     current, epfd, events, maxevents, timeout));
        /* The maximum number of event must be greater than zero */
        if (maxevents <= 0 || maxevents > EP_MAX_EVENTS)
                return -EINVAL;
@@ -1293,8 +1358,6 @@ SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events,
 error_fput:
        fput(file);
 error_return:
-        DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d) = %d\n",
-                     current, epfd, events, maxevents, timeout, error));
        return error;
 }
@@ -1359,17 +1422,18 @@ static int __init eventpoll_init(void)
                EP_ITEM_COST;
        /* Initialize the structure used to perform safe poll wait head wake ups */
-        ep_poll_safewake_init(&psw);
+        ep_nested_calls_init(&poll_safewake_ncalls);
+        /* Initialize the structure used to perform file's f_op->poll() calls */
+        ep_nested_calls_init(&poll_readywalk_ncalls);
        /* Allocates slab cache used to allocate "struct epitem" items */
        epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem),
-                        0, SLAB_HWCACHE_ALIGN|EPI_SLAB_DEBUG|SLAB_PANIC,
+                        0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
-                        NULL);
        /* Allocates slab cache used to allocate "struct eppoll_entry" */
        pwq_cache = kmem_cache_create("eventpoll_pwq",
-                        sizeof(struct eppoll_entry), 0,
+                        sizeof(struct eppoll_entry), 0, SLAB_PANIC, NULL);
-                        EPI_SLAB_DEBUG|SLAB_PANIC, NULL);
        return 0;
 }
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 38f40d55899c..53c72ad85877 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -55,7 +55,8 @@ static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block,
 }
 static int ext4_group_used_meta_blocks(struct super_block *sb,
-                                ext4_group_t block_group)
+                                       ext4_group_t block_group,
+                                       struct ext4_group_desc *gdp)
 {
        ext4_fsblk_t tmp;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -63,10 +64,6 @@ static int ext4_group_used_meta_blocks(struct super_block *sb,
        int used_blocks = sbi->s_itb_per_group + 2;
        if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
-                struct ext4_group_desc *gdp;
-                struct buffer_head *bh;
-                gdp = ext4_get_group_desc(sb, block_group, &bh);
                if (!ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp),
                                        block_group))
                        used_blocks--;
@@ -177,7 +174,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
                 */
                mark_bitmap_end(group_blocks, sb->s_blocksize * 8, bh->b_data);
        }
-        return free_blocks - ext4_group_used_meta_blocks(sb, block_group);
+        return free_blocks - ext4_group_used_meta_blocks(sb, block_group, gdp);
 }
@@ -473,9 +470,8 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
        if (sbi->s_log_groups_per_flex) {
                ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
-                spin_lock(sb_bgl_lock(sbi, flex_group));
+                atomic_add(blocks_freed,
-                sbi->s_flex_groups[flex_group].free_blocks += blocks_freed;
+                           &sbi->s_flex_groups[flex_group].free_blocks);
-                spin_unlock(sb_bgl_lock(sbi, flex_group));
        }
        /*
         * request to reload the buddy with the
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 2df2e40b01af..b64789929a65 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -67,7 +67,8 @@ int ext4_check_dir_entry(const char *function, struct inode *dir,
                         unsigned int offset)
 {
        const char *error_msg = NULL;
-        const int rlen = ext4_rec_len_from_disk(de->rec_len);
+        const int rlen = ext4_rec_len_from_disk(de->rec_len,
+                                                dir->i_sb->s_blocksize);
        if (rlen < EXT4_DIR_REC_LEN(1))
                error_msg = "rec_len is smaller than minimal";
@@ -178,10 +179,11 @@ revalidate:
                                 * least that it is non-zero.  A
                                 * failure will be detected in the
                                 * dirent test below. */
-                                if (ext4_rec_len_from_disk(de->rec_len)
+                                if (ext4_rec_len_from_disk(de->rec_len,
-                                                < EXT4_DIR_REC_LEN(1))
+                                        sb->s_blocksize) < EXT4_DIR_REC_LEN(1))
                                        break;
-                                i += ext4_rec_len_from_disk(de->rec_len);
+                                i += ext4_rec_len_from_disk(de->rec_len,
+                                                            sb->s_blocksize);
                        }
                        offset = i;
                        filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1))
@@ -203,7 +205,8 @@ revalidate:
                                ret = stored;
                                goto out;
                        }
-                        offset += ext4_rec_len_from_disk(de->rec_len);
+                        offset += ext4_rec_len_from_disk(de->rec_len,
+                                        sb->s_blocksize);
                        if (le32_to_cpu(de->inode)) {
                                /* We might block in the next section
                                 * if the data destination is
@@ -225,7 +228,8 @@ revalidate:
                                        goto revalidate;
                                stored++;
                        }
-                        filp->f_pos += ext4_rec_len_from_disk(de->rec_len);
+                        filp->f_pos += ext4_rec_len_from_disk(de->rec_len,
+                                                sb->s_blocksize);
                }
                offset = 0;
                brelse(bh);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 6083bb38057b..d0f15ef56de1 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -33,14 +33,6 @@
 #undef EXT4FS_DEBUG
 /*
- * Define EXT4_RESERVATION to reserve data blocks for expanding files
- */
-#define EXT4_DEFAULT_RESERVE_BLOCKS     8
-/*max window size: 1024(direct blocks) + 3([t,d]indirect blocks) */
-#define EXT4_MAX_RESERVE_BLOCKS         1027
-#define EXT4_RESERVE_WINDOW_NOT_ALLOCATED 0
-/*
 * Debug code
 */
 #ifdef EXT4FS_DEBUG
@@ -54,8 +46,6 @@
 #define ext4_debug(f, a...)     do {} while (0)
 #endif
-#define EXT4_MULTIBLOCK_ALLOCATOR       1
 /* prefer goal again. length */
 #define EXT4_MB_HINT_MERGE              1
 /* blocks already reserved */
@@ -180,8 +170,9 @@ struct ext4_group_desc
 */
 struct flex_groups {
-        __u32 free_inodes;
+        atomic_t free_inodes;
-        __u32 free_blocks;
+        atomic_t free_blocks;
+        atomic_t used_dirs;
 };
 #define EXT4_BG_INODE_UNINIT    0x0001 /* Inode table/bitmap not in use */
@@ -249,6 +240,30 @@ struct flex_groups {
 #define EXT4_FL_USER_VISIBLE            0x000BDFFF /* User visible flags */
 #define EXT4_FL_USER_MODIFIABLE         0x000B80FF /* User modifiable flags */
+/* Flags that should be inherited by new inodes from their parent. */
+#define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\
+                           EXT4_SYNC_FL | EXT4_IMMUTABLE_FL | EXT4_APPEND_FL |\
+                           EXT4_NODUMP_FL | EXT4_NOATIME_FL |\
+                           EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\
+                           EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL)
+/* Flags that are appropriate for regular files (all but dir-specific ones). */
+#define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL))
+/* Flags that are appropriate for non-directory, non-regular-file inodes. */
+#define EXT4_OTHER_FLMASK (EXT4_NODUMP_FL | EXT4_NOATIME_FL)
+/*
+ * Mask out flags that are inappropriate for the given type of inode:
+ * directories keep every flag, regular files keep those in
+ * EXT4_REG_FLMASK, and every other inode type keeps only those in
+ * EXT4_OTHER_FLMASK.
+ */
+static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
+{
+        switch (mode & S_IFMT) {
+        case S_IFDIR:
+                return flags;
+        case S_IFREG:
+                return flags & EXT4_REG_FLMASK;
+        default:
+                return flags & EXT4_OTHER_FLMASK;
+        }
+}
 /*
 * Inode dynamic state flags
 */
@@ -256,6 +271,7 @@ struct flex_groups {
 #define EXT4_STATE_NEW                  0x00000002 /* inode is newly created */
 #define EXT4_STATE_XATTR                0x00000004 /* has in-inode xattrs */
 #define EXT4_STATE_NO_EXPAND            0x00000008 /* No space for expansion */
+#define EXT4_STATE_DA_ALLOC_CLOSE       0x00000010 /* Alloc DA blks on close */
 /* Used to pass group descriptor data when online resize is done */
 struct ext4_new_group_input {
@@ -303,7 +319,9 @@ struct ext4_new_group_data {
 #define EXT4_IOC_GROUP_EXTEND           _IOW('f', 7, unsigned long)
 #define EXT4_IOC_GROUP_ADD              _IOW('f', 8, struct ext4_new_group_input)
 #define EXT4_IOC_MIGRATE                _IO('f', 9)
+ /* note ioctl 10 reserved for an early version of the FIEMAP ioctl */
 /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */
+#define EXT4_IOC_ALLOC_DA_BLKS          _IO('f', 12)
 /*
 * ioctl commands in 32 bit emulation
@@ -531,7 +549,7 @@ do {									       \
 #define EXT4_MOUNT_NO_UID32             0x02000  /* Disable 32-bit UIDs */
 #define EXT4_MOUNT_XATTR_USER           0x04000 /* Extended user attributes */
 #define EXT4_MOUNT_POSIX_ACL            0x08000 /* POSIX Access Control Lists */
-#define EXT4_MOUNT_RESERVATION          0x10000 /* Preallocation */
+#define EXT4_MOUNT_NO_AUTO_DA_ALLOC     0x10000 /* No auto delalloc mapping */
 #define EXT4_MOUNT_BARRIER              0x20000 /* Use block barriers */
 #define EXT4_MOUNT_NOBH                 0x40000 /* No bufferheads */
 #define EXT4_MOUNT_QUOTA                0x80000 /* Some quota option set */
@@ -666,7 +684,8 @@ struct ext4_super_block {
        __u8    s_log_groups_per_flex;  /* FLEX_BG group size */
        __u8    s_reserved_char_pad2;
        __le16  s_reserved_pad;
-        __u32   s_reserved[162];        /* Padding to the end of the block */
+        __le64  s_kbytes_written;       /* nr of lifetime kilobytes written */
+        __u32   s_reserved[160];        /* Padding to the end of the block */
 };
 #ifdef __KERNEL__
@@ -814,6 +833,12 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
 #define EXT4_DEF_MAX_BATCH_TIME 15000 /* 15ms */
 /*
+ * Minimum number of groups in a flexgroup before we separate out
+ * directories into the first block group of a flexgroup
+ */
+#define EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME 4
+/*
 * Structure of a directory entry
 */
 #define EXT4_NAME_LEN 255
@@ -865,24 +890,6 @@ struct ext4_dir_entry_2 {
                                         ~EXT4_DIR_ROUND)
 #define EXT4_MAX_REC_LEN                ((1<<16)-1)
-static inline unsigned ext4_rec_len_from_disk(__le16 dlen)
-{
-        unsigned len = le16_to_cpu(dlen);
-        if (len == EXT4_MAX_REC_LEN || len == 0)
-                return 1 << 16;
-        return len;
-}
-static inline __le16 ext4_rec_len_to_disk(unsigned len)
-{
-        if (len == (1 << 16))
-                return cpu_to_le16(EXT4_MAX_REC_LEN);
-        else if (len > (1 << 16))
-                BUG();
-        return cpu_to_le16(len);
-}
 /*
 * Hash Tree Directory indexing
 * (c) Daniel Phillips, 2001
@@ -970,22 +977,6 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
 extern struct proc_dir_entry *ext4_proc_root;
-#ifdef CONFIG_PROC_FS
-extern const struct file_operations ext4_ui_proc_fops;
-#define EXT4_PROC_HANDLER(name, var)                                    \
-do {                                                                    \
-        proc = proc_create_data(name, mode, sbi->s_proc,                \
-                                &ext4_ui_proc_fops, &sbi->s_##var);     \
-        if (proc == NULL) {                                             \
-                printk(KERN_ERR "EXT4-fs: can't create %s\n", name);    \
-                goto err_out;                                           \
-        }                                                               \
-} while (0)
-#else
-#define EXT4_PROC_HANDLER(name, var)
-#endif
 /*
 * Function prototypes
 */
@@ -1092,13 +1083,14 @@ extern int ext4_can_truncate(struct inode *inode);
 extern void ext4_truncate(struct inode *);
 extern void ext4_set_inode_flags(struct inode *);
 extern void ext4_get_inode_flags(struct ext4_inode_info *);
+extern int ext4_alloc_da_blocks(struct inode *inode);
 extern void ext4_set_aops(struct inode *inode);
 extern int ext4_writepage_trans_blocks(struct inode *);
 extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int idxblocks);
 extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
 extern int ext4_block_truncate_page(handle_t *handle,
                struct address_space *mapping, loff_t from);
-extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page);
+extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 extern qsize_t ext4_get_reserved_space(struct inode *inode);
 /* ioctl.c */
@@ -1107,7 +1099,10 @@ extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
 /* migrate.c */
 extern int ext4_ext_migrate(struct inode *);
 /* namei.c */
+extern unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize);
+extern __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize);
 extern int ext4_orphan_add(handle_t *, struct inode *);
 extern int ext4_orphan_del(handle_t *, struct inode *);
 extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 18cb67b2cbbc..f0c3ec85bd48 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -241,5 +241,6 @@ extern int ext4_ext_search_left(struct inode *, struct ext4_ext_path *,
 extern int ext4_ext_search_right(struct inode *, struct ext4_ext_path *,
                                                ext4_lblk_t *, ext4_fsblk_t *);
 extern void ext4_ext_drop_refs(struct ext4_ext_path *);
+extern int ext4_ext_check_inode(struct inode *inode);
 #endif /* _EXT4_EXTENTS */
diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h
index e69acc16f5c4..4ce2187123aa 100644
--- a/fs/ext4/ext4_i.h
+++ b/fs/ext4/ext4_i.h
@@ -33,9 +33,6 @@ typedef __u32 ext4_lblk_t;
 /* data type for block group number */
 typedef unsigned int ext4_group_t;
-#define rsv_start rsv_window._rsv_start
-#define rsv_end rsv_window._rsv_end
 /*
 * storage for cached extent
 */
@@ -125,6 +122,9 @@ struct ext4_inode_info {
        struct list_head i_prealloc_list;
        spinlock_t i_prealloc_lock;
+        /* ialloc */
+        ext4_group_t    i_last_alloc_group;
        /* allocation reservation info for delalloc */
        unsigned int i_reserved_data_blocks;
        unsigned int i_reserved_meta_blocks;
diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
index 039b6ea1a042..57b71fefbccf 100644
--- a/fs/ext4/ext4_sb.h
+++ b/fs/ext4/ext4_sb.h
@@ -62,12 +62,10 @@ struct ext4_sb_info {
        struct percpu_counter s_freeinodes_counter;
        struct percpu_counter s_dirs_counter;
        struct percpu_counter s_dirtyblocks_counter;
-        struct blockgroup_lock s_blockgroup_lock;
+        struct blockgroup_lock *s_blockgroup_lock;
        struct proc_dir_entry *s_proc;
+        struct kobject s_kobj;
-        /* root of the per fs reservation window tree */
+        struct completion s_kobj_unregister;
-        spinlock_t s_rsv_window_lock;
-        struct rb_root s_rsv_window_root;
        /* Journaling */
        struct inode *s_journal_inode;
@@ -146,6 +144,10 @@ struct ext4_sb_info {
        /* locality groups */
        struct ext4_locality_group *s_locality_groups;
+        /* for write statistics */
+        unsigned long s_sectors_written_start;
+        u64 s_kbytes_written;
        unsigned int s_log_groups_per_flex;
        struct flex_groups *s_flex_groups;
 };
@@ -153,7 +155,7 @@ struct ext4_sb_info {
 static inline spinlock_t *
 sb_bgl_lock(struct ext4_sb_info *sbi, unsigned int block_group)
 {
-        return bgl_lock_ptr(&sbi->s_blockgroup_lock, block_group);
+        return bgl_lock_ptr(sbi->s_blockgroup_lock, block_group);
 }
 #endif  /* _EXT4_SB */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index e0aa4fe4f596..ac77d8b8251d 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -152,6 +152,8 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
        ext4_fsblk_t bg_start;
        ext4_fsblk_t last_block;
        ext4_grpblk_t colour;
+        ext4_group_t block_group;
+        int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
        int depth;
        if (path) {
@@ -170,10 +172,31 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
        }
        /* OK. use inode's group */
-        bg_start = (ei->i_block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) +
+        block_group = ei->i_block_group;
+        if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
+                /*
+                 * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME
+                 * block groups per flexgroup, reserve the first block 
+                 * group for directories and special files.  Regular 
+                 * files will start at the second block group.  This
+                 * tends to speed up directory access and improves 
+                 * fsck times.
+                 */
+                block_group &= ~(flex_size-1);
+                if (S_ISREG(inode->i_mode))
+                        block_group++;
+        }
+        bg_start = (block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) +
                le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_first_data_block);
        last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
+        /*
+         * If we are doing delayed allocation, we don't need to take
+         * colour into account.
+         */
+        if (test_opt(inode->i_sb, DELALLOC))
+                return bg_start;
        if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
                colour = (current->pid % 16) *
                        (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
@@ -301,7 +324,64 @@ ext4_ext_max_entries(struct inode *inode, int depth)
        return max;
 }
-static int __ext4_ext_check_header(const char *function, struct inode *inode,
+/*
+ * Check that the physical block range covered by a leaf extent lies
+ * inside the filesystem: it must not start before s_first_data_block
+ * and must not extend past ext4_blocks_count().
+ * Returns 1 if the extent is acceptable, 0 otherwise.
+ */
+static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
+{
+        struct ext4_super_block *sblk = EXT4_SB(inode->i_sb)->s_es;
+        ext4_fsblk_t first = ext_pblock(ext);
+        int count = ext4_ext_get_actual_len(ext);
+        if (unlikely(first < le32_to_cpu(sblk->s_first_data_block)))
+                return 0;
+        /* first + count is one past the last block the extent uses */
+        if (unlikely((first + count) > ext4_blocks_count(sblk)))
+                return 0;
+        return 1;
+}
+/*
+ * Check that the block an index entry points at lies inside the
+ * filesystem.  Valid block numbers run from s_first_data_block up to
+ * ext4_blocks_count() - 1, so a pointer equal to the block count is
+ * already one past the end and must be rejected as well.
+ * Returns 1 if the index entry is acceptable, 0 otherwise.
+ */
+static int ext4_valid_extent_idx(struct inode *inode,
+                                struct ext4_extent_idx *ext_idx)
+{
+        ext4_fsblk_t block = idx_pblock(ext_idx);
+        struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
+        if (unlikely(block < le32_to_cpu(es->s_first_data_block) ||
+                        (block >= ext4_blocks_count(es))))
+                return 0;
+        return 1;
+}
+/*
+ * Validate every entry that follows an extent header.  At depth 0 the
+ * entries are leaf extents and are checked with ext4_valid_extent();
+ * at any other depth they are index entries and are checked with
+ * ext4_valid_extent_idx().  A header with no entries is trivially valid.
+ * Returns 1 if all entries pass, 0 as soon as one fails.
+ */
+static int ext4_valid_extent_entries(struct inode *inode,
+                                struct ext4_extent_header *eh,
+                                int depth)
+{
+        struct ext4_extent_idx *index;
+        struct ext4_extent *leaf;
+        unsigned short left;
+        if (eh->eh_entries == 0)
+                return 1;
+        left = le16_to_cpu(eh->eh_entries);
+        if (depth == 0) {
+                /* leaf entries */
+                for (leaf = EXT_FIRST_EXTENT(eh); left; leaf++, left--) {
+                        if (!ext4_valid_extent(inode, leaf))
+                                return 0;
+                }
+                return 1;
+        }
+        /* index entries */
+        for (index = EXT_FIRST_INDEX(eh); left; index++, left--) {
+                if (!ext4_valid_extent_idx(inode, index))
+                        return 0;
+        }
+        return 1;
+}
+static int __ext4_ext_check(const char *function, struct inode *inode,
                                        struct ext4_extent_header *eh,
                                        int depth)
 {
@@ -329,11 +409,15 @@ static int __ext4_ext_check_header(const char *function, struct inode *inode,
                error_msg = "invalid eh_entries";
                goto corrupted;
        }
+        if (!ext4_valid_extent_entries(inode, eh, depth)) {
+                error_msg = "invalid extent entries";
+                goto corrupted;
+        }
        return 0;
 corrupted:
        ext4_error(inode->i_sb, function,
-                        "bad header in inode #%lu: %s - magic %x, "
+                        "bad header/extent in inode #%lu: %s - magic %x, "
                        "entries %u, max %u(%u), depth %u(%u)",
                        inode->i_ino, error_msg, le16_to_cpu(eh->eh_magic),
                        le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
@@ -342,8 +426,13 @@ corrupted:
        return -EIO;
 }
-#define ext4_ext_check_header(inode, eh, depth) \
+#define ext4_ext_check(inode, eh, depth)        \
-        __ext4_ext_check_header(__func__, inode, eh, depth)
+        __ext4_ext_check(__func__, inode, eh, depth)
+/*
+ * Run ext4_ext_check() on the extent header returned by
+ * ext_inode_hdr(), using the depth reported by ext_depth().
+ * Returns the result of that check unchanged.
+ */
+int ext4_ext_check_inode(struct inode *inode)
+{
+        struct ext4_extent_header *root = ext_inode_hdr(inode);
+        int root_depth = ext_depth(inode);
+        return ext4_ext_check(inode, root, root_depth);
+}
 #ifdef EXT_DEBUG
 static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
@@ -547,9 +636,6 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
        eh = ext_inode_hdr(inode);
        depth = ext_depth(inode);
-        if (ext4_ext_check_header(inode, eh, depth))
-                return ERR_PTR(-EIO);
        /* account possible depth increase */
        if (!path) {
@@ -565,6 +651,8 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
        i = depth;
        /* walk through the tree */
        while (i) {
+                int need_to_validate = 0;
                ext_debug("depth %d: num %d, max %d\n",
                          ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
@@ -573,10 +661,17 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
                path[ppos].p_depth = i;
                path[ppos].p_ext = NULL;
-                bh = sb_bread(inode->i_sb, path[ppos].p_block);
+                bh = sb_getblk(inode->i_sb, path[ppos].p_block);
-                if (!bh)
+                if (unlikely(!bh))
                        goto err;
+                if (!bh_uptodate_or_lock(bh)) {
+                        if (bh_submit_read(bh) < 0) {
+                                put_bh(bh);
+                                goto err;
+                        }
+                        /* validate the extent entries */
+                        need_to_validate = 1;
+                }
                eh = ext_block_hdr(bh);
                ppos++;
                BUG_ON(ppos > depth);
@@ -584,7 +679,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
                path[ppos].p_hdr = eh;
                i--;
-                if (ext4_ext_check_header(inode, eh, i))
+                if (need_to_validate && ext4_ext_check(inode, eh, i))
                        goto err;
        }
@@ -1181,7 +1276,7 @@ got_index:
                        return -EIO;
                eh = ext_block_hdr(bh);
                /* subtract from p_depth to get proper eh_depth */
-                if (ext4_ext_check_header(inode, eh, path->p_depth - depth)) {
+                if (ext4_ext_check(inode, eh, path->p_depth - depth)) {
                        put_bh(bh);
                        return -EIO;
                }
@@ -1194,7 +1289,7 @@ got_index:
        if (bh == NULL)
                return -EIO;
        eh = ext_block_hdr(bh);
-        if (ext4_ext_check_header(inode, eh, path->p_depth - depth)) {
+        if (ext4_ext_check(inode, eh, path->p_depth - depth)) {
                put_bh(bh);
                return -EIO;
        }
@@ -2137,7 +2232,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
                return -ENOMEM;
        }
        path[0].p_hdr = ext_inode_hdr(inode);
-        if (ext4_ext_check_header(inode, path[0].p_hdr, depth)) {
+        if (ext4_ext_check(inode, path[0].p_hdr, depth)) {
                err = -EIO;
                goto out;
        }
@@ -2191,7 +2286,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
                                err = -EIO;
                                break;
                        }
-                        if (ext4_ext_check_header(inode, ext_block_hdr(bh),
+                        if (ext4_ext_check(inode, ext_block_hdr(bh),
                                                        depth - i - 1)) {
                                err = -EIO;
                                break;
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index f731cb545a03..588af8c77246 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -33,9 +33,14 @@
 */
 static int ext4_release_file(struct inode *inode, struct file *filp)
 {
+        if (EXT4_I(inode)->i_state & EXT4_STATE_DA_ALLOC_CLOSE) {
+                ext4_alloc_da_blocks(inode);
+                EXT4_I(inode)->i_state &= ~EXT4_STATE_DA_ALLOC_CLOSE;
+        }
        /* if we are the last writer on the inode, drop the block reservation */
        if ((filp->f_mode & FMODE_WRITE) &&
-                        (atomic_read(&inode->i_writecount) == 1))
+                        (atomic_read(&inode->i_writecount) == 1) &&
+                        !EXT4_I(inode)->i_reserved_data_blocks)
        {
                down_write(&EXT4_I(inode)->i_data_sem);
                ext4_discard_preallocations(inode);
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index fb51b40e3e8f..47b84e8df568 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -189,7 +189,6 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
        struct ext4_super_block *es;
        struct ext4_sb_info *sbi;
        int fatal = 0, err, count, cleared;
-        ext4_group_t flex_group;
        if (atomic_read(&inode->i_count) > 1) {
                printk(KERN_ERR "ext4_free_inode: inode has count=%d\n",
@@ -268,6 +267,13 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
                        if (is_directory) {
                                count = ext4_used_dirs_count(sb, gdp) - 1;
                                ext4_used_dirs_set(sb, gdp, count);
+                                if (sbi->s_log_groups_per_flex) {
+                                        ext4_group_t f;
+                                        f = ext4_flex_group(sbi, block_group);
+                                        atomic_dec(&sbi->s_flex_groups[f].free_inodes);
+                                }
                        }
                        gdp->bg_checksum = ext4_group_desc_csum(sbi,
                                                        block_group, gdp);
@@ -277,10 +283,10 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
                                percpu_counter_dec(&sbi->s_dirs_counter);
                        if (sbi->s_log_groups_per_flex) {
-                                flex_group = ext4_flex_group(sbi, block_group);
+                                ext4_group_t f;
-                                spin_lock(sb_bgl_lock(sbi, flex_group));
-                                sbi->s_flex_groups[flex_group].free_inodes++;
+                                f = ext4_flex_group(sbi, block_group);
-                                spin_unlock(sb_bgl_lock(sbi, flex_group));
+                                atomic_inc(&sbi->s_flex_groups[f].free_inodes);
                        }
                }
                BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
@@ -360,9 +366,9 @@ static int find_group_flex(struct super_block *sb, struct inode *parent,
                sbi->s_log_groups_per_flex;
 find_close_to_parent:
-        flexbg_free_blocks = flex_group[best_flex].free_blocks;
+        flexbg_free_blocks = atomic_read(&flex_group[best_flex].free_blocks);
        flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex;
-        if (flex_group[best_flex].free_inodes &&
+        if (atomic_read(&flex_group[best_flex].free_inodes) &&
            flex_freeb_ratio > free_block_ratio)
                goto found_flexbg;
@@ -375,24 +381,24 @@ find_close_to_parent:
                if (i == parent_fbg_group || i == parent_fbg_group - 1)
                        continue;
-                flexbg_free_blocks = flex_group[i].free_blocks;
+                flexbg_free_blocks = atomic_read(&flex_group[i].free_blocks);
                flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex;
                if (flex_freeb_ratio > free_block_ratio &&
-                    flex_group[i].free_inodes) {
+                    (atomic_read(&flex_group[i].free_inodes))) {
                        best_flex = i;
                        goto found_flexbg;
                }
-                if (flex_group[best_flex].free_inodes == 0 ||
+                if ((atomic_read(&flex_group[best_flex].free_inodes) == 0) ||
-                    (flex_group[i].free_blocks >
+                    ((atomic_read(&flex_group[i].free_blocks) >
-                     flex_group[best_flex].free_blocks &&
+                      atomic_read(&flex_group[best_flex].free_blocks)) &&
-                     flex_group[i].free_inodes))
+                     atomic_read(&flex_group[i].free_inodes)))
                        best_flex = i;
        }
-        if (!flex_group[best_flex].free_inodes ||
+        if (!atomic_read(&flex_group[best_flex].free_inodes) ||
-            !flex_group[best_flex].free_blocks)
+            !atomic_read(&flex_group[best_flex].free_blocks))
                return -1;
 found_flexbg:
@@ -410,6 +416,42 @@ out:
        return 0;
 }
+struct orlov_stats {
+        __u32 free_inodes;
+        __u32 free_blocks;
+        __u32 used_dirs;
+};
+/*
+ * Helper function for Orlov's allocator; returns critical information
+ * for a particular block group or flex_bg.  If flex_size is 1, then g
+ * is a block group number; otherwise it is flex_bg number.
+ */
+void get_orlov_stats(struct super_block *sb, ext4_group_t g,
+                       int flex_size, struct orlov_stats *stats)
+{
+        struct ext4_group_desc *desc;
+        struct flex_groups *flex_group = EXT4_SB(sb)->s_flex_groups;
+        if (flex_size > 1) {
+                stats->free_inodes = atomic_read(&flex_group[g].free_inodes);
+                stats->free_blocks = atomic_read(&flex_group[g].free_blocks);
+                stats->used_dirs = atomic_read(&flex_group[g].used_dirs);
+                return;
+        }
+        desc = ext4_get_group_desc(sb, g, NULL);
+        if (desc) {
+                stats->free_inodes = ext4_free_inodes_count(sb, desc);
+                stats->free_blocks = ext4_free_blks_count(sb, desc);
+                stats->used_dirs = ext4_used_dirs_count(sb, desc);
+        } else {
+                stats->free_inodes = 0;
+                stats->free_blocks = 0;
+                stats->used_dirs = 0;
+        }
+}
 /*
 * Orlov's allocator for directories.
 *
@@ -425,35 +467,34 @@ out:
 * it has too many directories already (max_dirs) or
 * it has too few free inodes left (min_inodes) or
 * it has too few free blocks left (min_blocks) or
- * it's already running too large debt (max_debt).
 * Parent's group is preferred, if it doesn't satisfy these
 * conditions we search cyclically through the rest. If none
 * of the groups look good we just look for a group with more
 * free inodes than average (starting at parent's group).
- *
- * Debt is incremented each time we allocate a directory and decremented
- * when we allocate an inode, within 0--255.
 */
-#define INODE_COST 64
-#define BLOCK_COST 256
 static int find_group_orlov(struct super_block *sb, struct inode *parent,
-                                ext4_group_t *group)
+                            ext4_group_t *group, int mode)
 {
        ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
-        struct ext4_super_block *es = sbi->s_es;
        ext4_group_t ngroups = sbi->s_groups_count;
        int inodes_per_group = EXT4_INODES_PER_GROUP(sb);
        unsigned int freei, avefreei;
        ext4_fsblk_t freeb, avefreeb;
-        ext4_fsblk_t blocks_per_dir;
        unsigned int ndirs;
-        int max_debt, max_dirs, min_inodes;
+        int max_dirs, min_inodes;
        ext4_grpblk_t min_blocks;
-        ext4_group_t i;
+        ext4_group_t i, grp, g;
        struct ext4_group_desc *desc;
+        struct orlov_stats stats;
+        int flex_size = ext4_flex_bg_size(sbi);
+        if (flex_size > 1) {
+                ngroups = (ngroups + flex_size - 1) >>
+                        sbi->s_log_groups_per_flex;
+                parent_group >>= sbi->s_log_groups_per_flex;
+        }
        freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter);
        avefreei = freei / ngroups;
@@ -462,71 +503,97 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
        do_div(avefreeb, ngroups);
        ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter);
-        if ((parent == sb->s_root->d_inode) ||
+        if (S_ISDIR(mode) &&
-            (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL)) {
+            ((parent == sb->s_root->d_inode) ||
+             (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL))) {
                int best_ndir = inodes_per_group;
-                ext4_group_t grp;
                int ret = -1;
                get_random_bytes(&grp, sizeof(grp));
                parent_group = (unsigned)grp % ngroups;
                for (i = 0; i < ngroups; i++) {
-                        grp = (parent_group + i) % ngroups;
+                        g = (parent_group + i) % ngroups;
-                        desc = ext4_get_group_desc(sb, grp, NULL);
+                        get_orlov_stats(sb, g, flex_size, &stats);
-                        if (!desc || !ext4_free_inodes_count(sb, desc))
+                        if (!stats.free_inodes)
                                continue;
-                        if (ext4_used_dirs_count(sb, desc) >= best_ndir)
+                        if (stats.used_dirs >= best_ndir)
                                continue;
-                        if (ext4_free_inodes_count(sb, desc) < avefreei)
+                        if (stats.free_inodes < avefreei)
                                continue;
-                        if (ext4_free_blks_count(sb, desc) < avefreeb)
+                        if (stats.free_blocks < avefreeb)
                                continue;
-                        *group = grp;
+                        grp = g;
                        ret = 0;
-                        best_ndir = ext4_used_dirs_count(sb, desc);
+                        best_ndir = stats.used_dirs;
+                }
+                if (ret)
+                        goto fallback;
+        found_flex_bg:
+                if (flex_size == 1) {
+                        *group = grp;
+                        return 0;
+                }
+                /*
+                 * We pack inodes at the beginning of the flexgroup's
+                 * inode tables.  Block allocation decisions will do
+                 * something similar, although regular files will
+                 * start at 2nd block group of the flexgroup.  See
+                 * ext4_ext_find_goal() and ext4_find_near().
+                 */
+                grp *= flex_size;
+                for (i = 0; i < flex_size; i++) {
+                        if (grp+i >= sbi->s_groups_count)
+                                break;
+                        desc = ext4_get_group_desc(sb, grp+i, NULL);
+                        if (desc && ext4_free_inodes_count(sb, desc)) {
+                                *group = grp+i;
+                                return 0;
+                        }
                }
-                if (ret == 0)
-                        return ret;
                goto fallback;
        }
-        blocks_per_dir = ext4_blocks_count(es) - freeb;
-        do_div(blocks_per_dir, ndirs);
        max_dirs = ndirs / ngroups + inodes_per_group / 16;
-        min_inodes = avefreei - inodes_per_group / 4;
+        min_inodes = avefreei - inodes_per_group*flex_size / 4;
-        min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb) / 4;
+        if (min_inodes < 1)
+                min_inodes = 1;
-        max_debt = EXT4_BLOCKS_PER_GROUP(sb);
+        min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb)*flex_size / 4;
-        max_debt /= max_t(int, blocks_per_dir, BLOCK_COST);
-        if (max_debt * INODE_COST > inodes_per_group)
+        /*
-                max_debt = inodes_per_group / INODE_COST;
+         * Start looking in the flex group where we last allocated an
-        if (max_debt > 255)
+         * inode for this parent directory
-                max_debt = 255;
+         */
-        if (max_debt == 0)
+        if (EXT4_I(parent)->i_last_alloc_group != ~0) {
-                max_debt = 1;
+                parent_group = EXT4_I(parent)->i_last_alloc_group;
+                if (flex_size > 1)
+                        parent_group >>= sbi->s_log_groups_per_flex;
+        }
        for (i = 0; i < ngroups; i++) {
-                *group = (parent_group + i) % ngroups;
+                grp = (parent_group + i) % ngroups;
-                desc = ext4_get_group_desc(sb, *group, NULL);
+                get_orlov_stats(sb, grp, flex_size, &stats);
-                if (!desc || !ext4_free_inodes_count(sb, desc))
+                if (stats.used_dirs >= max_dirs)
-                        continue;
-                if (ext4_used_dirs_count(sb, desc) >= max_dirs)
                        continue;
-                if (ext4_free_inodes_count(sb, desc) < min_inodes)
+                if (stats.free_inodes < min_inodes)
                        continue;
-                if (ext4_free_blks_count(sb, desc) < min_blocks)
+                if (stats.free_blocks < min_blocks)
                        continue;
-                return 0;
+                goto found_flex_bg;
        }
 fallback:
+        ngroups = sbi->s_groups_count;
+        avefreei = freei / ngroups;
+        parent_group = EXT4_I(parent)->i_block_group;
        for (i = 0; i < ngroups; i++) {
-                *group = (parent_group + i) % ngroups;
+                grp = (parent_group + i) % ngroups;
-                desc = ext4_get_group_desc(sb, *group, NULL);
+                desc = ext4_get_group_desc(sb, grp, NULL);
                if (desc && ext4_free_inodes_count(sb, desc) &&
-                        ext4_free_inodes_count(sb, desc) >= avefreei)
+                    ext4_free_inodes_count(sb, desc) >= avefreei) {
+                        *group = grp;
                        return 0;
+                }
        }
        if (avefreei) {
@@ -542,12 +609,51 @@ fallback:
 }
 static int find_group_other(struct super_block *sb, struct inode *parent,
-                                ext4_group_t *group)
+                            ext4_group_t *group, int mode)
 {
        ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
        ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
        struct ext4_group_desc *desc;
-        ext4_group_t i;
+        ext4_group_t i, last;
+        int flex_size = ext4_flex_bg_size(EXT4_SB(sb));
+        /*
+         * Try to place the inode in the same flex group as its
+         * parent.  If we can't find space, use the Orlov algorithm to
+         * find another flex group, and store that information in the
+         * parent directory's inode information so that we use that flex
+         * group for future allocations.
+         */
+        if (flex_size > 1) {
+                int retry = 0;
+        try_again:
+                parent_group &= ~(flex_size-1);
+                last = parent_group + flex_size;
+                if (last > ngroups)
+                        last = ngroups;
+                for  (i = parent_group; i < last; i++) {
+                        desc = ext4_get_group_desc(sb, i, NULL);
+                        if (desc && ext4_free_inodes_count(sb, desc)) {
+                                *group = i;
+                                return 0;
+                        }
+                }
+                if (!retry && EXT4_I(parent)->i_last_alloc_group != ~0) {
+                        retry = 1;
+                        parent_group = EXT4_I(parent)->i_last_alloc_group;
+                        goto try_again;
+                }
+                /*
+                 * If this didn't work, use the Orlov search algorithm
+                 * to find a new flex group; we pass in the mode to
+                 * avoid the topdir algorithms.
+                 */
+                *group = parent_group + flex_size;
+                if (*group > ngroups)
+                        *group = 0;
+                return find_group_orlov(sb, parent, group, mode);
+        }
        /*
         * Try to place the inode in its parent directory
@@ -665,6 +771,11 @@ static int ext4_claim_inode(struct super_block *sb,
        if (S_ISDIR(mode)) {
                count = ext4_used_dirs_count(sb, gdp) + 1;
                ext4_used_dirs_set(sb, gdp, count);
+                if (sbi->s_log_groups_per_flex) {
+                        ext4_group_t f = ext4_flex_group(sbi, group);
+                        atomic_inc(&sbi->s_flex_groups[f].free_inodes);
+                }
        }
        gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
 err_ret:
@@ -716,10 +827,10 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
        sbi = EXT4_SB(sb);
        es = sbi->s_es;
-        if (sbi->s_log_groups_per_flex) {
+        if (sbi->s_log_groups_per_flex && test_opt(sb, OLDALLOC)) {
                ret2 = find_group_flex(sb, dir, &group);
                if (ret2 == -1) {
-                        ret2 = find_group_other(sb, dir, &group);
+                        ret2 = find_group_other(sb, dir, &group, mode);
                        if (ret2 == 0 && once)
                                once = 0;
                                printk(KERN_NOTICE "ext4: find_group_flex "
@@ -733,11 +844,12 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
                if (test_opt(sb, OLDALLOC))
                        ret2 = find_group_dir(sb, dir, &group);
                else
-                        ret2 = find_group_orlov(sb, dir, &group);
+                        ret2 = find_group_orlov(sb, dir, &group, mode);
        } else
-                ret2 = find_group_other(sb, dir, &group);
+                ret2 = find_group_other(sb, dir, &group, mode);
 got_group:
+        EXT4_I(dir)->i_last_alloc_group = group;
        err = -ENOSPC;
        if (ret2 == -1)
                goto out;
@@ -858,9 +970,7 @@ got:
        if (sbi->s_log_groups_per_flex) {
                flex_group = ext4_flex_group(sbi, group);
-                spin_lock(sb_bgl_lock(sbi, flex_group));
+                atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes);
-                sbi->s_flex_groups[flex_group].free_inodes--;
-                spin_unlock(sb_bgl_lock(sbi, flex_group));
        }
        inode->i_uid = current_fsuid();
@@ -885,19 +995,16 @@ got:
        ei->i_disksize = 0;
        /*
-         * Don't inherit extent flag from directory. We set extent flag on
+         * Don't inherit extent flag from directory, amongst others. We set
-         * newly created directory and file only if -o extent mount option is
+         * extent flag on newly created directory and file only if -o extent
-         * specified
+         * mount option is specified
         */
-        ei->i_flags = EXT4_I(dir)->i_flags & ~(EXT4_INDEX_FL|EXT4_EXTENTS_FL);
+        ei->i_flags =
-        if (S_ISLNK(mode))
+                ext4_mask_flags(mode, EXT4_I(dir)->i_flags & EXT4_FL_INHERITED);
-                ei->i_flags &= ~(EXT4_IMMUTABLE_FL|EXT4_APPEND_FL);
-        /* dirsync only applies to directories */
-        if (!S_ISDIR(mode))
-                ei->i_flags &= ~EXT4_DIRSYNC_FL;
        ei->i_file_acl = 0;
        ei->i_dtime = 0;
        ei->i_block_group = group;
+        ei->i_last_alloc_group = ~0;
        ext4_set_inode_flags(inode);
        if (IS_DIRSYNC(inode))
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 71d3ecd5db79..a2e7952bc5f9 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -371,6 +371,34 @@ static int ext4_block_to_path(struct inode *inode,
        return n;
 }
+static int __ext4_check_blockref(const char *function, struct inode *inode,
+                                 unsigned int *p, unsigned int max) {
+        unsigned int maxblocks = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es);
+        unsigned int *bref = p;
+        while (bref < p+max) {
+                if (unlikely(*bref >= maxblocks)) {
+                        ext4_error(inode->i_sb, function,
+                                   "block reference %u >= max (%u) "
+                                   "in inode #%lu, offset=%d",
+                                   *bref, maxblocks,
+                                   inode->i_ino, (int)(bref-p));
+                        return -EIO;
+                }
+                bref++;
+        }
+        return 0;
+}
+#define ext4_check_indirect_blockref(inode, bh)                         \
+        __ext4_check_blockref(__func__, inode, (__le32 *)(bh)->b_data,  \
+                              EXT4_ADDR_PER_BLOCK((inode)->i_sb))
+#define ext4_check_inode_blockref(inode)                                \
+        __ext4_check_blockref(__func__, inode, EXT4_I(inode)->i_data,   \
+                              EXT4_NDIR_BLOCKS)
 /**
 *      ext4_get_branch - read the chain of indirect blocks leading to data
 *      @inode: inode in question
@@ -415,9 +443,22 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth,
        if (!p->key)
                goto no_block;
        while (--depth) {
-                bh = sb_bread(sb, le32_to_cpu(p->key));
+                bh = sb_getblk(sb, le32_to_cpu(p->key));
-                if (!bh)
+                if (unlikely(!bh))
                        goto failure;
+                  
+                if (!bh_uptodate_or_lock(bh)) {
+                        if (bh_submit_read(bh) < 0) {
+                                put_bh(bh);
+                                goto failure;
+                        }
+                        /* validate block references */
+                        if (ext4_check_indirect_blockref(inode, bh)) {
+                                put_bh(bh);
+                                goto failure;
+                        }
+                }
+                
                add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets);
                /* Reader: end */
                if (!p->key)
@@ -459,6 +500,8 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
        ext4_fsblk_t bg_start;
        ext4_fsblk_t last_block;
        ext4_grpblk_t colour;
+        ext4_group_t block_group;
+        int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
        /* Try to find previous block */
        for (p = ind->p - 1; p >= start; p--) {
@@ -474,9 +517,22 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
         * It is going to be referred to from the inode itself? OK, just put it
         * into the same cylinder group then.
         */
-        bg_start = ext4_group_first_block_no(inode->i_sb, ei->i_block_group);
+        block_group = ei->i_block_group;
+        if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
+                block_group &= ~(flex_size-1);
+                if (S_ISREG(inode->i_mode))
+                        block_group++;
+        }
+        bg_start = ext4_group_first_block_no(inode->i_sb, block_group);
        last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
+        /*
+         * If we are doing delayed allocation, we don't need to take
+         * colour into account.
+         */
+        if (test_opt(inode->i_sb, DELALLOC))
+                return bg_start;
        if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
                colour = (current->pid % 16) *
                        (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
@@ -1052,9 +1108,16 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
        /*
         * free those over-booking quota for metadata blocks
         */
        if (mdb_free)
                vfs_dq_release_reservation_block(inode, mdb_free);
+        /*
+         * If we have done all the pending block allocations and if
+         * there aren't any writers on the inode, we can discard the
+         * inode's preallocations.
+         */
+        if (!total && (atomic_read(&inode->i_writecount) == 0))
+                ext4_discard_preallocations(inode);
 }
 /*
@@ -1688,9 +1751,10 @@ static void ext4_da_page_release_reservation(struct page *page,
 struct mpage_da_data {
        struct inode *inode;
-        struct buffer_head lbh;                 /* extent of blocks */
+        sector_t b_blocknr;             /* start block number of extent */
+        size_t b_size;                  /* size of extent */
+        unsigned long b_state;          /* state of the extent */
        unsigned long first_page, next_page;    /* extent of pages */
-        get_block_t *get_block;
        struct writeback_control *wbc;
        int io_done;
        int pages_written;
@@ -1704,7 +1768,6 @@ struct mpage_da_data {
 * @mpd->inode: inode
 * @mpd->first_page: first page of the extent
 * @mpd->next_page: page after the last page of the extent
- * @mpd->get_block: the filesystem's block mapper function
 *
 * By the time mpage_da_submit_io() is called we expect all blocks
 * to be allocated. this may be wrong if allocation failed.
@@ -1724,7 +1787,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
        /*
         * We need to start from the first_page to the next_page - 1
         * to make sure we also write the mapped dirty buffer_heads.
-         * If we look at mpd->lbh.b_blocknr we would only be looking
+         * If we look at mpd->b_blocknr we would only be looking
         * at the currently mapped buffer_heads.
         */
        index = mpd->first_page;
@@ -1914,68 +1977,111 @@ static void ext4_print_free_blocks(struct inode *inode)
        return;
 }
+#define         EXT4_DELALLOC_RSVED     1
+static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
+                                   struct buffer_head *bh_result, int create)
+{
+        int ret;
+        unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
+        loff_t disksize = EXT4_I(inode)->i_disksize;
+        handle_t *handle = NULL;
+        handle = ext4_journal_current_handle();
+        BUG_ON(!handle);
+        ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
+                                   bh_result, create, 0, EXT4_DELALLOC_RSVED);
+        if (ret <= 0)
+                return ret;
+        bh_result->b_size = (ret << inode->i_blkbits);
+        if (ext4_should_order_data(inode)) {
+                int retval;
+                retval = ext4_jbd2_file_inode(handle, inode);
+                if (retval)
+                        /*
+                         * Failed to add inode for ordered mode. Don't
+                         * update file size
+                         */
+                        return retval;
+        }
+        /*
+         * Update on-disk size along with block allocation; we don't
+         * use 'extend_disksize' as size may change within already
+         * allocated block -bzzz
+         */
+        disksize = ((loff_t) iblock + ret) << inode->i_blkbits;
+        if (disksize > i_size_read(inode))
+                disksize = i_size_read(inode);
+        if (disksize > EXT4_I(inode)->i_disksize) {
+                ext4_update_i_disksize(inode, disksize);
+                ret = ext4_mark_inode_dirty(handle, inode);
+                return ret;
+        }
+        return 0;
+}
 /*
 * mpage_da_map_blocks - go through given space
 *
- * @mpd->lbh - bh describing space
+ * @mpd - describes the space to map (b_blocknr, b_size, b_state)
- * @mpd->get_block - the filesystem's block mapper function
 *
 * The function skips space we know is already mapped to disk blocks.
 *
 */
-static int  mpage_da_map_blocks(struct mpage_da_data *mpd)
+static int mpage_da_map_blocks(struct mpage_da_data *mpd)
 {
        int err = 0;
        struct buffer_head new;
-        struct buffer_head *lbh = &mpd->lbh;
        sector_t next;
        /*
         * We consider only non-mapped and non-allocated blocks
         */
-        if (buffer_mapped(lbh) && !buffer_delay(lbh))
+        if ((mpd->b_state  & (1 << BH_Mapped)) &&
+            !(mpd->b_state & (1 << BH_Delay)))
                return 0;
-        new.b_state = lbh->b_state;
+        new.b_state = mpd->b_state;
        new.b_blocknr = 0;
-        new.b_size = lbh->b_size;
+        new.b_size = mpd->b_size;
-        next = lbh->b_blocknr;
+        next = mpd->b_blocknr;
        /*
         * If we didn't accumulate anything
         * to write simply return
         */
        if (!new.b_size)
                return 0;
-        err = mpd->get_block(mpd->inode, next, &new, 1);
-        if (err) {
-                /* If get block returns with error
+        err = ext4_da_get_block_write(mpd->inode, next, &new, 1);
-                 * we simply return. Later writepage
+        if (err) {
-                 * will redirty the page and writepages
+                /*
-                 * will find the dirty page again
+                 * If get block returns with error we simply
+                 * return. Later writepage will redirty the page and
+                 * writepages will find the dirty page again
                 */
                if (err == -EAGAIN)
                        return 0;
                if (err == -ENOSPC &&
-                                ext4_count_free_blocks(mpd->inode->i_sb)) {
+                    ext4_count_free_blocks(mpd->inode->i_sb)) {
                        mpd->retval = err;
                        return 0;
                }
                /*
-                 * get block failure will cause us
+                 * get block failure will cause us to loop in
-                 * to loop in writepages. Because
+                 * writepages, because a_ops->writepage won't be able
-                 * a_ops->writepage won't be able to
+                 * to make progress. The page will be redirtied by
-                 * make progress. The page will be redirtied
+                 * writepage and writepages will again try to write
-                 * by writepage and writepages will again
+                 * the same.
-                 * try to write the same.
                 */
                printk(KERN_EMERG "%s block allocation failed for inode %lu "
                                  "at logical offset %llu with max blocks "
                                  "%zd with error %d\n",
                                  __func__, mpd->inode->i_ino,
                                  (unsigned long long)next,
-                                  lbh->b_size >> mpd->inode->i_blkbits, err);
+                                  mpd->b_size >> mpd->inode->i_blkbits, err);
                printk(KERN_EMERG "This should not happen.!! "
                                        "Data will be lost\n");
                if (err == -ENOSPC) {
@@ -1983,7 +2089,7 @@ static int  mpage_da_map_blocks(struct mpage_da_data *mpd)
                }
                /* invlaidate all the pages */
                ext4_da_block_invalidatepages(mpd, next,
-                                lbh->b_size >> mpd->inode->i_blkbits);
+                                mpd->b_size >> mpd->inode->i_blkbits);
                return err;
        }
        BUG_ON(new.b_size == 0);
@@ -1995,7 +2101,8 @@ static int  mpage_da_map_blocks(struct mpage_da_data *mpd)
         * If blocks are delayed marked, we need to
         * put actual blocknr and drop delayed bit
         */
-        if (buffer_delay(lbh) || buffer_unwritten(lbh))
+        if ((mpd->b_state & (1 << BH_Delay)) ||
+            (mpd->b_state & (1 << BH_Unwritten)))
                mpage_put_bnr_to_bhs(mpd, next, &new);
        return 0;
@@ -2014,12 +2121,11 @@ static int  mpage_da_map_blocks(struct mpage_da_data *mpd)
 * the function is used to collect contig. blocks in same state
 */
 static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
-                                   sector_t logical, struct buffer_head *bh)
+                                   sector_t logical, size_t b_size,
+                                   unsigned long b_state)
 {
        sector_t next;
-        size_t b_size = bh->b_size;
+        int nrblocks = mpd->b_size >> mpd->inode->i_blkbits;
-        struct buffer_head *lbh = &mpd->lbh;
-        int nrblocks = lbh->b_size >> mpd->inode->i_blkbits;
        /* check if thereserved journal credits might overflow */
        if (!(EXT4_I(mpd->inode)->i_flags & EXT4_EXTENTS_FL)) {
@@ -2046,19 +2152,19 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
        /*
         * First block in the extent
         */
-        if (lbh->b_size == 0) {
+        if (mpd->b_size == 0) {
-                lbh->b_blocknr = logical;
+                mpd->b_blocknr = logical;
-                lbh->b_size = b_size;
+                mpd->b_size = b_size;
-                lbh->b_state = bh->b_state & BH_FLAGS;
+                mpd->b_state = b_state & BH_FLAGS;
                return;
        }
-        next = lbh->b_blocknr + nrblocks;
+        next = mpd->b_blocknr + nrblocks;
        /*
         * Can we merge the block to our big extent?
         */
-        if (logical == next && (bh->b_state & BH_FLAGS) == lbh->b_state) {
+        if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) {
-                lbh->b_size += b_size;
+                mpd->b_size += b_size;
                return;
        }
@@ -2087,7 +2193,7 @@ static int __mpage_da_writepage(struct page *page,
 {
        struct mpage_da_data *mpd = data;
        struct inode *inode = mpd->inode;
-        struct buffer_head *bh, *head, fake;
+        struct buffer_head *bh, *head;
        sector_t logical;
        if (mpd->io_done) {
@@ -2129,9 +2235,9 @@ static int __mpage_da_writepage(struct page *page,
                /*
                 * ... and blocks
                 */
-                mpd->lbh.b_size = 0;
+                mpd->b_size = 0;
-                mpd->lbh.b_state = 0;
+                mpd->b_state = 0;
-                mpd->lbh.b_blocknr = 0;
+                mpd->b_blocknr = 0;
        }
        mpd->next_page = page->index + 1;
@@ -2139,16 +2245,8 @@ static int __mpage_da_writepage(struct page *page,
                  (PAGE_CACHE_SHIFT - inode->i_blkbits);
        if (!page_has_buffers(page)) {
-                /*
+                mpage_add_bh_to_extent(mpd, logical, PAGE_CACHE_SIZE,
-                 * There is no attached buffer heads yet (mmap?)
+                                       (1 << BH_Dirty) | (1 << BH_Uptodate));
-                 * we treat the page asfull of dirty blocks
-                 */
-                bh = &fake;
-                bh->b_size = PAGE_CACHE_SIZE;
-                bh->b_state = 0;
-                set_buffer_dirty(bh);
-                set_buffer_uptodate(bh);
-                mpage_add_bh_to_extent(mpd, logical, bh);
                if (mpd->io_done)
                        return MPAGE_DA_EXTENT_TAIL;
        } else {
@@ -2166,8 +2264,10 @@ static int __mpage_da_writepage(struct page *page,
                         * with the page in ext4_da_writepage
                         */
                        if (buffer_dirty(bh) &&
-                                (!buffer_mapped(bh) || buffer_delay(bh))) {
+                            (!buffer_mapped(bh) || buffer_delay(bh))) {
-                                mpage_add_bh_to_extent(mpd, logical, bh);
+                                mpage_add_bh_to_extent(mpd, logical,
+                                                       bh->b_size,
+                                                       bh->b_state);
                                if (mpd->io_done)
                                        return MPAGE_DA_EXTENT_TAIL;
                        } else if (buffer_dirty(bh) && (buffer_mapped(bh))) {
@@ -2179,9 +2279,8 @@ static int __mpage_da_writepage(struct page *page,
                                 * unmapped buffer_head later we need to
                                 * use the b_state flag of that buffer_head.
                                 */
-                                if (mpd->lbh.b_size == 0)
+                                if (mpd->b_size == 0)
-                                        mpd->lbh.b_state =
+                                        mpd->b_state = bh->b_state & BH_FLAGS;
-                                                bh->b_state & BH_FLAGS;
                        }
                        logical++;
                } while ((bh = bh->b_this_page) != head);
@@ -2191,51 +2290,6 @@ static int __mpage_da_writepage(struct page *page,
 }
 /*
- * mpage_da_writepages - walk the list of dirty pages of the given
- * address space, allocates non-allocated blocks, maps newly-allocated
- * blocks to existing bhs and issue IO them
- *
- * @mapping: address space structure to write
- * @wbc: subtract the number of written pages from *@wbc->nr_to_write
- * @get_block: the filesystem's block mapper function.
- *
- * This is a library function, which implements the writepages()
- * address_space_operation.
- */
-static int mpage_da_writepages(struct address_space *mapping,
-                               struct writeback_control *wbc,
-                               struct mpage_da_data *mpd)
-{
-        int ret;
-        if (!mpd->get_block)
-                return generic_writepages(mapping, wbc);
-        mpd->lbh.b_size = 0;
-        mpd->lbh.b_state = 0;
-        mpd->lbh.b_blocknr = 0;
-        mpd->first_page = 0;
-        mpd->next_page = 0;
-        mpd->io_done = 0;
-        mpd->pages_written = 0;
-        mpd->retval = 0;
-        ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, mpd);
-        /*
-         * Handle last extent of pages
-         */
-        if (!mpd->io_done && mpd->next_page != mpd->first_page) {
-                if (mpage_da_map_blocks(mpd) == 0)
-                        mpage_da_submit_io(mpd);
-                mpd->io_done = 1;
-                ret = MPAGE_DA_EXTENT_TAIL;
-        }
-        wbc->nr_to_write -= mpd->pages_written;
-        return ret;
-}
-/*
 * this is a special callback for ->write_begin() only
 * it's intention is to return mapped block or reserve space
 */
@@ -2274,51 +2328,6 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
        return ret;
 }
-#define         EXT4_DELALLOC_RSVED     1
-static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
-                                   struct buffer_head *bh_result, int create)
-{
-        int ret;
-        unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
-        loff_t disksize = EXT4_I(inode)->i_disksize;
-        handle_t *handle = NULL;
-        handle = ext4_journal_current_handle();
-        BUG_ON(!handle);
-        ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
-                        bh_result, create, 0, EXT4_DELALLOC_RSVED);
-        if (ret > 0) {
-                bh_result->b_size = (ret << inode->i_blkbits);
-                if (ext4_should_order_data(inode)) {
-                        int retval;
-                        retval = ext4_jbd2_file_inode(handle, inode);
-                        if (retval)
-                                /*
-                                 * Failed to add inode for ordered
-                                 * mode. Don't update file size
-                                 */
-                                return retval;
-                }
-                /*
-                 * Update on-disk size along with block allocation
-                 * we don't use 'extend_disksize' as size may change
-                 * within already allocated block -bzzz
-                 */
-                disksize = ((loff_t) iblock + ret) << inode->i_blkbits;
-                if (disksize > i_size_read(inode))
-                        disksize = i_size_read(inode);
-                if (disksize > EXT4_I(inode)->i_disksize) {
-                        ext4_update_i_disksize(inode, disksize);
-                        ret = ext4_mark_inode_dirty(handle, inode);
-                        return ret;
-                }
-                ret = 0;
-        }
-        return ret;
-}
 static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
 {
@@ -2569,8 +2578,38 @@ retry:
                        dump_stack();
                        goto out_writepages;
                }
-                mpd.get_block = ext4_da_get_block_write;
-                ret = mpage_da_writepages(mapping, wbc, &mpd);
+                /*
+                 * Now call __mpage_da_writepage to find the next
+                 * contiguous region of logical blocks that need
+                 * blocks to be allocated by ext4.  We don't actually
+                 * submit the blocks for I/O here, even though
+                 * write_cache_pages thinks it will, and will set the
+                 * pages as clean for write before calling
+                 * __mpage_da_writepage().
+                 */
+                mpd.b_size = 0;
+                mpd.b_state = 0;
+                mpd.b_blocknr = 0;
+                mpd.first_page = 0;
+                mpd.next_page = 0;
+                mpd.io_done = 0;
+                mpd.pages_written = 0;
+                mpd.retval = 0;
+                ret = write_cache_pages(mapping, wbc, __mpage_da_writepage,
+                                        &mpd);
+                /*
+                 * If we have a contiguous extent of pages and we
+                 * haven't done the I/O yet, map the blocks and submit
+                 * them for I/O.
+                 */
+                if (!mpd.io_done && mpd.next_page != mpd.first_page) {
+                        if (mpage_da_map_blocks(&mpd) == 0)
+                                mpage_da_submit_io(&mpd);
+                        mpd.io_done = 1;
+                        ret = MPAGE_DA_EXTENT_TAIL;
+                }
+                wbc->nr_to_write -= mpd.pages_written;
                ext4_journal_stop(handle);
@@ -2846,6 +2885,48 @@ out:
        return;
 }
+/*
+ * Force all delayed allocation blocks to be allocated for a given inode.
+ */
+int ext4_alloc_da_blocks(struct inode *inode)
+{
+        if (!EXT4_I(inode)->i_reserved_data_blocks &&
+            !EXT4_I(inode)->i_reserved_meta_blocks)
+                return 0;
+        /*
+         * We do something simple for now.  The filemap_flush() will
+         * also start triggering a write of the data blocks, which is
+         * not strictly speaking necessary (and for users of
+         * laptop_mode, not even desirable).  However, to do otherwise
+         * would require replicating code paths in:
+         * 
+         * ext4_da_writepages() ->
+         *    write_cache_pages() ---> (via passed in callback function)
+         *        __mpage_da_writepage() -->
+         *           mpage_add_bh_to_extent()
+         *           mpage_da_map_blocks()
+         *
+         * The problem is that write_cache_pages(), located in
+         * mm/page-writeback.c, marks pages clean in preparation for
+         * doing I/O, which is not desirable if we're not planning on
+         * doing I/O at all.
+         *
+         * We could call write_cache_pages(), and then redirty all of
+         * the pages by calling redirty_page_for_writepage() but that
+         * would be ugly in the extreme.  So instead we would need to
+         * replicate parts of the code in the above functions,
+         * simplifying them because we wouldn't actually intend to
+         * write out the pages, but rather only collect contiguous
+         * logical block extents, call the multi-block allocator, and
+         * then update the buffer heads with the block allocations.
+         * 
+         * For now, though, we'll cheat by calling filemap_flush(),
+         * which will map the blocks, and start the I/O, but not
+         * actually wait for the I/O to complete.
+         */
+        return filemap_flush(inode->i_mapping);
+}
 /*
 * bmap() is special.  It gets used by applications such as lilo and by
@@ -3868,6 +3949,9 @@ void ext4_truncate(struct inode *inode)
        if (!ext4_can_truncate(inode))
                return;
+        if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
+                ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE;
        if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
                ext4_ext_truncate(inode);
                return;
@@ -4110,12 +4194,7 @@ make_io:
                        unsigned num;
                        table = ext4_inode_table(sb, gdp);
-                        /* Make sure s_inode_readahead_blks is a power of 2 */
+                        /* s_inode_readahead_blks is always a power of 2 */
-                        while (EXT4_SB(sb)->s_inode_readahead_blks &
-                               (EXT4_SB(sb)->s_inode_readahead_blks-1))
-                                EXT4_SB(sb)->s_inode_readahead_blks = 
-                                   (EXT4_SB(sb)->s_inode_readahead_blks &
-                                    (EXT4_SB(sb)->s_inode_readahead_blks-1));
                        b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1);
                        if (table > b)
                                b = table;
@@ -4287,6 +4366,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
        ei->i_disksize = inode->i_size;
        inode->i_generation = le32_to_cpu(raw_inode->i_generation);
        ei->i_block_group = iloc.block_group;
+        ei->i_last_alloc_group = ~0;
        /*
         * NOTE! The in-memory inode i_data array is in little-endian order
         * even on big-endian machines: we do NOT byteswap the block numbers!
@@ -4329,6 +4409,20 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
                        (__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
        }
+        if (ei->i_flags & EXT4_EXTENTS_FL) {
+                /* Validate extent which is part of inode */
+                ret = ext4_ext_check_inode(inode);
+        } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+                   (S_ISLNK(inode->i_mode) &&
+                    !ext4_inode_is_fast_symlink(inode))) {
+                /* Validate block references which are part of inode */
+                ret = ext4_check_inode_blockref(inode);
+        }
+        if (ret) {
+                brelse(bh);
+                goto bad_inode;
+        }
        if (S_ISREG(inode->i_mode)) {
                inode->i_op = &ext4_file_inode_operations;
                inode->i_fop = &ext4_file_operations;
@@ -4345,7 +4439,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
                        inode->i_op = &ext4_symlink_inode_operations;
                        ext4_set_aops(inode);
                }
-        } else {
+        } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
+              S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
                inode->i_op = &ext4_special_inode_operations;
                if (raw_inode->i_block[0])
                        init_special_inode(inode, inode->i_mode,
@@ -4353,6 +4448,13 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
                else
                        init_special_inode(inode, inode->i_mode,
                           new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
+        } else {
+                brelse(bh);
+                ret = -EIO;
+                ext4_error(inode->i_sb, __func__, 
+                           "bogus i_mode (%o) for inode=%lu",
+                           inode->i_mode, inode->i_ino);
+                goto bad_inode;
        }
        brelse(iloc.bh);
        ext4_set_inode_flags(inode);
@@ -5146,8 +5248,9 @@ static int ext4_bh_unmapped(handle_t *handle, struct buffer_head *bh)
        return !buffer_mapped(bh);
 }
-int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
+        struct page *page = vmf->page;
        loff_t size;
        unsigned long len;
        int ret = -EINVAL;
@@ -5199,6 +5302,8 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page)
                goto out_unlock;
        ret = 0;
 out_unlock:
+        if (ret)
+                ret = VM_FAULT_SIGBUS;
        up_read(&inode->i_alloc_sem);
        return ret;
 }
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 42dc83fb247a..91e75f7a9e73 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -48,8 +48,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
                if (err)
                        return err;
-                if (!S_ISDIR(inode->i_mode))
+                flags = ext4_mask_flags(inode->i_mode, flags);
-                        flags &= ~EXT4_DIRSYNC_FL;
                err = -EPERM;
                mutex_lock(&inode->i_mutex);
@@ -263,6 +262,20 @@ setversion_out:
                return err;
        }
+        case EXT4_IOC_ALLOC_DA_BLKS:
+        {
+                int err;
+                if (!is_owner_or_cap(inode))
+                        return -EACCES;
+                err = mnt_want_write(filp->f_path.mnt);
+                if (err)
+                        return err;
+                err = ext4_alloc_da_blocks(inode);
+                mnt_drop_write(filp->f_path.mnt);
+                return err;
+        }
        default:
                return -ENOTTY;
        }
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index b038188bd039..f871677a7984 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -46,22 +46,23 @@
 * The allocation request involve request for multiple number of blocks
 * near to the goal(block) value specified.
 *
- * During initialization phase of the allocator we decide to use the group
+ * During initialization phase of the allocator we decide to use the
- * preallocation or inode preallocation depending on the size file. The
+ * group preallocation or inode preallocation depending on the size of
- * size of the file could be the resulting file size we would have after
+ * the file. The size of the file could be the resulting file size we
- * allocation or the current file size which ever is larger. If the size is
+ * would have after allocation, or the current file size, whichever
- * less that sbi->s_mb_stream_request we select the group
+ * is larger. If the size is less than sbi->s_mb_stream_request we
- * preallocation. The default value of s_mb_stream_request is 16
+ * select to use the group preallocation. The default value of
- * blocks. This can also be tuned via
+ * s_mb_stream_request is 16 blocks. This can also be tuned via
- * /proc/fs/ext4/<partition>/stream_req. The value is represented in terms
+ * /sys/fs/ext4/<partition>/mb_stream_req. The value is represented in
- * of number of blocks.
+ * terms of number of blocks.
 *
 * The main motivation for having small file use group preallocation is to
- * ensure that we have small file closer in the disk.
+ * ensure that we have small files closer together on the disk.
 *
- * First stage the allocator looks at the inode prealloc list
+ * First stage the allocator looks at the inode prealloc list,
- * ext4_inode_info->i_prealloc_list contain list of prealloc spaces for
+ * ext4_inode_info->i_prealloc_list, which contains the list of prealloc
- * this particular inode. The inode prealloc space is represented as:
+ * spaces for this particular inode. The inode prealloc space is
+ * represented as:
 *
 * pa_lstart -> the logical start block for this prealloc space
 * pa_pstart -> the physical start block for this prealloc space
@@ -121,29 +122,29 @@
 * list. In case of inode preallocation we follow a list of heuristics
 * based on file size. This can be found in ext4_mb_normalize_request. If
 * we are doing a group prealloc we try to normalize the request to
- * sbi->s_mb_group_prealloc. Default value of s_mb_group_prealloc is set to
+ * sbi->s_mb_group_prealloc. Default value of s_mb_group_prealloc is
 * 512 blocks. This can be tuned via
- * /proc/fs/ext4/<partition/group_prealloc. The value is represented in
+ * /sys/fs/ext4/<partition>/mb_group_prealloc. The value is represented in
 * terms of number of blocks. If we have mounted the file system with -O
 * stripe=<value> option the group prealloc request is normalized to the
 * stripe value (sbi->s_stripe)
 *
- * The regular allocator(using the buddy cache) support few tunables.
+ * The regular allocator (using the buddy cache) supports a few tunables.
 *
- * /proc/fs/ext4/<partition>/min_to_scan
+ * /sys/fs/ext4/<partition>/mb_min_to_scan
- * /proc/fs/ext4/<partition>/max_to_scan
+ * /sys/fs/ext4/<partition>/mb_max_to_scan
- * /proc/fs/ext4/<partition>/order2_req
+ * /sys/fs/ext4/<partition>/mb_order2_req
 *
- * The regular allocator use buddy scan only if the request len is power of
+ * The regular allocator uses buddy scan only if the request len is power of
 * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The
 * value of s_mb_order2_reqs can be tuned via
- * /proc/fs/ext4/<partition>/order2_req.  If the request len is equal to
+ * /sys/fs/ext4/<partition>/mb_order2_req.  If the request len is equal to
 * stripe size (sbi->s_stripe), we try to search for contigous block in
- * stripe size. This should result in better allocation on RAID setup. If
+ * stripe size. This should result in better allocation on RAID setups. If
- * not we search in the specific group using bitmap for best extents. The
+ * not, we search in the specific group using bitmap for best extents. The
- * tunable min_to_scan and max_to_scan controll the behaviour here.
+ * tunable min_to_scan and max_to_scan control the behaviour here.
 * min_to_scan indicate how long the mballoc __must__ look for a best
- * extent and max_to_scanindicate how long the mballoc __can__ look for a
+ * extent and max_to_scan indicates how long the mballoc __can__ look for a
 * best extent in the found extents. Searching for the blocks starts with
 * the group specified as the goal value in allocation context via
 * ac_g_ex. Each group is first checked based on the criteria whether it
@@ -337,8 +338,6 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
                                        ext4_group_t group);
 static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
                                                ext4_group_t group);
-static int ext4_mb_init_per_dev_proc(struct super_block *sb);
-static int ext4_mb_destroy_per_dev_proc(struct super_block *sb);
 static void release_blocks_on_commit(journal_t *journal, transaction_t *txn);
@@ -1726,6 +1725,7 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
 {
        unsigned free, fragments;
        unsigned i, bits;
+        int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
        struct ext4_group_desc *desc;
        struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
@@ -1747,6 +1747,12 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
                if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))
                        return 0;
+                /* Avoid using the first bg of a flexgroup for data files */
+                if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
+                    (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) &&
+                    ((group % flex_size) == 0))
+                        return 0;
                bits = ac->ac_sb->s_blocksize_bits + 1;
                for (i = ac->ac_2order; i <= bits; i++)
                        if (grp->bb_counters[i] > 0)
@@ -1971,7 +1977,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
        /*
         * We search using buddy data only if the order of the request
         * is greater than equal to the sbi_s_mb_order2_reqs
-         * You can tune it via /proc/fs/ext4/<partition>/order2_req
+         * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req
         */
        if (i >= sbi->s_mb_order2_reqs) {
                /*
@@ -2693,7 +2699,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
        i = (sb->s_blocksize_bits + 2) * sizeof(unsigned int);
        sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
        if (sbi->s_mb_maxs == NULL) {
-                kfree(sbi->s_mb_maxs);
+                kfree(sbi->s_mb_offsets);
                return -ENOMEM;
        }
@@ -2746,7 +2752,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
                spin_lock_init(&lg->lg_prealloc_lock);
        }
-        ext4_mb_init_per_dev_proc(sb);
        ext4_mb_history_init(sb);
        if (sbi->s_journal)
@@ -2829,7 +2834,6 @@ int ext4_mb_release(struct super_block *sb)
        free_percpu(sbi->s_locality_groups);
        ext4_mb_history_release(sb);
-        ext4_mb_destroy_per_dev_proc(sb);
        return 0;
 }
@@ -2890,62 +2894,6 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
        mb_debug("freed %u blocks in %u structures\n", count, count2);
 }
-#define EXT4_MB_STATS_NAME              "stats"
-#define EXT4_MB_MAX_TO_SCAN_NAME        "max_to_scan"
-#define EXT4_MB_MIN_TO_SCAN_NAME        "min_to_scan"
-#define EXT4_MB_ORDER2_REQ              "order2_req"
-#define EXT4_MB_STREAM_REQ              "stream_req"
-#define EXT4_MB_GROUP_PREALLOC          "group_prealloc"
-static int ext4_mb_init_per_dev_proc(struct super_block *sb)
-{
-#ifdef CONFIG_PROC_FS
-        mode_t mode = S_IFREG | S_IRUGO | S_IWUSR;
-        struct ext4_sb_info *sbi = EXT4_SB(sb);
-        struct proc_dir_entry *proc;
-        if (sbi->s_proc == NULL)
-                return -EINVAL;
-        EXT4_PROC_HANDLER(EXT4_MB_STATS_NAME, mb_stats);
-        EXT4_PROC_HANDLER(EXT4_MB_MAX_TO_SCAN_NAME, mb_max_to_scan);
-        EXT4_PROC_HANDLER(EXT4_MB_MIN_TO_SCAN_NAME, mb_min_to_scan);
-        EXT4_PROC_HANDLER(EXT4_MB_ORDER2_REQ, mb_order2_reqs);
-        EXT4_PROC_HANDLER(EXT4_MB_STREAM_REQ, mb_stream_request);
-        EXT4_PROC_HANDLER(EXT4_MB_GROUP_PREALLOC, mb_group_prealloc);
-        return 0;
-err_out:
-        remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_proc);
-        remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_proc);
-        remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_proc);
-        remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc);
-        remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc);
-        remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc);
-        return -ENOMEM;
-#else
-        return 0;
-#endif
-}
-static int ext4_mb_destroy_per_dev_proc(struct super_block *sb)
-{
-#ifdef CONFIG_PROC_FS
-        struct ext4_sb_info *sbi = EXT4_SB(sb);
-        if (sbi->s_proc == NULL)
-                return -EINVAL;
-        remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_proc);
-        remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_proc);
-        remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_proc);
-        remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc);
-        remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc);
-        remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc);
-#endif
-        return 0;
-}
 int __init init_ext4_mballoc(void)
 {
        ext4_pspace_cachep =
@@ -3096,9 +3044,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
        if (sbi->s_log_groups_per_flex) {
                ext4_group_t flex_group = ext4_flex_group(sbi,
                                                          ac->ac_b_ex.fe_group);
-                spin_lock(sb_bgl_lock(sbi, flex_group));
+                atomic_sub(ac->ac_b_ex.fe_len,
-                sbi->s_flex_groups[flex_group].free_blocks -= ac->ac_b_ex.fe_len;
+                           &sbi->s_flex_groups[flex_group].free_blocks);
-                spin_unlock(sb_bgl_lock(sbi, flex_group));
        }
        err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
@@ -3116,7 +3063,7 @@ out_err:
 * here we normalize request for locality group
 * Group request are normalized to s_strip size if we set the same via mount
 * option. If not we set it to s_mb_group_prealloc which can be configured via
- * /proc/fs/ext4/<partition>/group_prealloc
+ * /sys/fs/ext4/<partition>/mb_group_prealloc
 *
 * XXX: should we try to preallocate more than the group has now?
 */
@@ -3608,8 +3555,11 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
        spin_unlock(&pa->pa_lock);
        grp_blk = pa->pa_pstart;
-        /* If linear, pa_pstart may be in the next group when pa is used up */
+        /* 
-        if (pa->pa_linear)
+         * If doing group-based preallocation, pa_pstart may be in the
+         * next group when pa is used up
+         */
+        if (pa->pa_type == MB_GROUP_PA)
                grp_blk--;
        ext4_get_group_no_and_offset(sb, grp_blk, &grp, NULL);
@@ -3704,7 +3654,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
        INIT_LIST_HEAD(&pa->pa_inode_list);
        INIT_LIST_HEAD(&pa->pa_group_list);
        pa->pa_deleted = 0;
-        pa->pa_linear = 0;
+        pa->pa_type = MB_INODE_PA;
        mb_debug("new inode pa %p: %llu/%u for %u\n", pa,
                        pa->pa_pstart, pa->pa_len, pa->pa_lstart);
@@ -3767,7 +3717,7 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
        INIT_LIST_HEAD(&pa->pa_inode_list);
        INIT_LIST_HEAD(&pa->pa_group_list);
        pa->pa_deleted = 0;
-        pa->pa_linear = 1;
+        pa->pa_type = MB_GROUP_PA;
        mb_debug("new group pa %p: %llu/%u for %u\n", pa,
                 pa->pa_pstart, pa->pa_len, pa->pa_lstart);
@@ -4021,7 +3971,7 @@ repeat:
                list_del_rcu(&pa->pa_inode_list);
                spin_unlock(pa->pa_obj_lock);
-                if (pa->pa_linear)
+                if (pa->pa_type == MB_GROUP_PA)
                        ext4_mb_release_group_pa(&e4b, pa, ac);
                else
                        ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac);
@@ -4121,7 +4071,7 @@ repeat:
        spin_unlock(&ei->i_prealloc_lock);
        list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
-                BUG_ON(pa->pa_linear != 0);
+                BUG_ON(pa->pa_type != MB_INODE_PA);
                ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
                err = ext4_mb_load_buddy(sb, group, &e4b);
@@ -4232,7 +4182,7 @@ static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac)
 * file is determined by the current size or the resulting size after
 * allocation which ever is larger
 *
- * One can tune this size via /proc/fs/ext4/<partition>/stream_req
+ * One can tune this size via /sys/fs/ext4/<partition>/mb_stream_req
 */
 static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
 {
@@ -4373,7 +4323,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
                        continue;
                }
                /* only lg prealloc space */
-                BUG_ON(!pa->pa_linear);
+                BUG_ON(pa->pa_type != MB_GROUP_PA);
                /* seems this one can be freed ... */
                pa->pa_deleted = 1;
@@ -4442,7 +4392,7 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
                                                pa_inode_list) {
                spin_lock(&tmp_pa->pa_lock);
                if (tmp_pa->pa_deleted) {
-                        spin_unlock(&pa->pa_lock);
+                        spin_unlock(&tmp_pa->pa_lock);
                        continue;
                }
                if (!added && pa->pa_free < tmp_pa->pa_free) {
@@ -4479,7 +4429,7 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
 {
        struct ext4_prealloc_space *pa = ac->ac_pa;
        if (pa) {
-                if (pa->pa_linear) {
+                if (pa->pa_type == MB_GROUP_PA) {
                        /* see comment in ext4_mb_use_group_pa() */
                        spin_lock(&pa->pa_lock);
                        pa->pa_pstart += ac->ac_b_ex.fe_len;
@@ -4499,7 +4449,7 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
                 * doesn't grow big.  We need to release
                 * alloc_semp before calling ext4_mb_add_n_trim()
                 */
-                if (pa->pa_linear && likely(pa->pa_free)) {
+                if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) {
                        spin_lock(pa->pa_obj_lock);
                        list_del_rcu(&pa->pa_inode_list);
                        spin_unlock(pa->pa_obj_lock);
@@ -4936,9 +4886,7 @@ do_more:
        if (sbi->s_log_groups_per_flex) {
                ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
-                spin_lock(sb_bgl_lock(sbi, flex_group));
+                atomic_add(count, &sbi->s_flex_groups[flex_group].free_blocks);
-                sbi->s_flex_groups[flex_group].free_blocks += count;
-                spin_unlock(sb_bgl_lock(sbi, flex_group));
        }
        ext4_mb_release_desc(&e4b);
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 10a2921baf14..dd9e6cd5f6cf 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -132,12 +132,15 @@ struct ext4_prealloc_space {
        ext4_lblk_t             pa_lstart;      /* log. block */
        unsigned short          pa_len;         /* len of preallocated chunk */
        unsigned short          pa_free;        /* how many blocks are free */
-        unsigned short          pa_linear;      /* consumed in one direction
+        unsigned short          pa_type;        /* pa type. inode or group */
-                                                 * strictly, for grp prealloc */
        spinlock_t              *pa_obj_lock;
        struct inode            *pa_inode;      /* hack, for history only */
 };
+enum {
+        MB_INODE_PA = 0,
+        MB_GROUP_PA = 1
+};
 struct ext4_free_extent {
        ext4_lblk_t fe_logical;
@@ -247,7 +250,6 @@ static inline void ext4_mb_store_history(struct ext4_allocation_context *ac)
 #define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
-struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t);
 static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
                                        struct ext4_free_extent *fex)
 {
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 83410244d3ee..22098e1cd085 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -161,12 +161,12 @@ static struct dx_frame *dx_probe(const struct qstr *d_name,
                                 struct dx_frame *frame,
                                 int *err);
 static void dx_release(struct dx_frame *frames);
-static int dx_make_map(struct ext4_dir_entry_2 *de, int size,
+static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize,
                       struct dx_hash_info *hinfo, struct dx_map_entry map[]);
 static void dx_sort_map(struct dx_map_entry *map, unsigned count);
 static struct ext4_dir_entry_2 *dx_move_dirents(char *from, char *to,
-                struct dx_map_entry *offsets, int count);
+                struct dx_map_entry *offsets, int count, unsigned blocksize);
-static struct ext4_dir_entry_2* dx_pack_dirents(char *base, int size);
+static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize);
 static void dx_insert_block(struct dx_frame *frame,
                                        u32 hash, ext4_lblk_t block);
 static int ext4_htree_next_block(struct inode *dir, __u32 hash,
@@ -180,14 +180,38 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
 static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
                             struct inode *inode);
+unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize)
+{
+        unsigned len = le16_to_cpu(dlen);
+        if (len == EXT4_MAX_REC_LEN || len == 0)
+                return blocksize;
+        return (len & 65532) | ((len & 3) << 16);
+}
+  
+__le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
+{
+        if ((len > blocksize) || (blocksize > (1 << 18)) || (len & 3))
+                BUG();
+        if (len < 65536)
+                return cpu_to_le16(len);
+        if (len == blocksize) {
+                if (blocksize == 65536)
+                        return cpu_to_le16(EXT4_MAX_REC_LEN);
+                else 
+                        return cpu_to_le16(0);
+        }
+        return cpu_to_le16((len & 65532) | ((len >> 16) & 3));
+}
 /*
 * p is at least 6 bytes before the end of page
 */
 static inline struct ext4_dir_entry_2 *
-ext4_next_entry(struct ext4_dir_entry_2 *p)
+ext4_next_entry(struct ext4_dir_entry_2 *p, unsigned long blocksize)
 {
        return (struct ext4_dir_entry_2 *)((char *)p +
-                ext4_rec_len_from_disk(p->rec_len));
+                ext4_rec_len_from_disk(p->rec_len, blocksize));
 }
 /*
@@ -294,7 +318,7 @@ static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext4_dir_ent
                        space += EXT4_DIR_REC_LEN(de->name_len);
                        names++;
                }
-                de = ext4_next_entry(de);
+                de = ext4_next_entry(de, size);
        }
        printk("(%i)\n", names);
        return (struct stats) { names, space, 1 };
@@ -585,7 +609,7 @@ static int htree_dirblock_to_tree(struct file *dir_file,
        top = (struct ext4_dir_entry_2 *) ((char *) de +
                                           dir->i_sb->s_blocksize -
                                           EXT4_DIR_REC_LEN(0));
-        for (; de < top; de = ext4_next_entry(de)) {
+        for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) {
                if (!ext4_check_dir_entry("htree_dirblock_to_tree", dir, de, bh,
                                        (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb))
                                                +((char *)de - bh->b_data))) {
@@ -663,7 +687,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
        }
        if (start_hash < 2 || (start_hash ==2 && start_minor_hash==0)) {
                de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data;
-                de = ext4_next_entry(de);
+                de = ext4_next_entry(de, dir->i_sb->s_blocksize);
                if ((err = ext4_htree_store_dirent(dir_file, 2, 0, de)) != 0)
                        goto errout;
                count++;
@@ -713,15 +737,15 @@ errout:
 * Create map of hash values, offsets, and sizes, stored at end of block.
 * Returns number of entries mapped.
 */
-static int dx_make_map (struct ext4_dir_entry_2 *de, int size,
+static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize,
-                        struct dx_hash_info *hinfo, struct dx_map_entry *map_tail)
+                       struct dx_hash_info *hinfo,
+                       struct dx_map_entry *map_tail)
 {
        int count = 0;
        char *base = (char *) de;
        struct dx_hash_info h = *hinfo;
-        while ((char *) de < base + size)
+        while ((char *) de < base + blocksize) {
-        {
                if (de->name_len && de->inode) {
                        ext4fs_dirhash(de->name, de->name_len, &h);
                        map_tail--;
@@ -732,7 +756,7 @@ static int dx_make_map (struct ext4_dir_entry_2 *de, int size,
                        cond_resched();
                }
                /* XXX: do we need to check rec_len == 0 case? -Chris */
-                de = ext4_next_entry(de);
+                de = ext4_next_entry(de, blocksize);
        }
        return count;
 }
@@ -832,7 +856,8 @@ static inline int search_dirblock(struct buffer_head *bh,
                        return 1;
                }
                /* prevent looping on a bad block */
-                de_len = ext4_rec_len_from_disk(de->rec_len);
+                de_len = ext4_rec_len_from_disk(de->rec_len,
+                                                dir->i_sb->s_blocksize);
                if (de_len <= 0)
                        return -1;
                offset += de_len;
@@ -996,7 +1021,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q
                de = (struct ext4_dir_entry_2 *) bh->b_data;
                top = (struct ext4_dir_entry_2 *) ((char *) de + sb->s_blocksize -
                                       EXT4_DIR_REC_LEN(0));
-                for (; de < top; de = ext4_next_entry(de)) {
+                for (; de < top; de = ext4_next_entry(de, sb->s_blocksize)) {
                        int off = (block << EXT4_BLOCK_SIZE_BITS(sb))
                                  + ((char *) de - bh->b_data);
@@ -1052,8 +1077,16 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
                        return ERR_PTR(-EIO);
                }
                inode = ext4_iget(dir->i_sb, ino);
-                if (IS_ERR(inode))
+                if (unlikely(IS_ERR(inode))) {
-                        return ERR_CAST(inode);
+                        if (PTR_ERR(inode) == -ESTALE) {
+                                ext4_error(dir->i_sb, __func__,
+                                                "deleted inode referenced: %u",
+                                                ino);
+                                return ERR_PTR(-EIO);
+                        } else {
+                                return ERR_CAST(inode);
+                        }
+                }
        }
        return d_splice_alias(inode, dentry);
 }
@@ -1109,7 +1142,8 @@ static inline void ext4_set_de_type(struct super_block *sb,
 * Returns pointer to last entry moved.
 */
 static struct ext4_dir_entry_2 *
-dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count)
+dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count,
+                unsigned blocksize)
 {
        unsigned rec_len = 0;
@@ -1118,7 +1152,7 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count)
                rec_len = EXT4_DIR_REC_LEN(de->name_len);
                memcpy (to, de, rec_len);
                ((struct ext4_dir_entry_2 *) to)->rec_len =
-                                ext4_rec_len_to_disk(rec_len);
+                                ext4_rec_len_to_disk(rec_len, blocksize);
                de->inode = 0;
                map++;
                to += rec_len;
@@ -1130,19 +1164,19 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count)
 * Compact each dir entry in the range to the minimal rec_len.
 * Returns pointer to last entry in range.
 */
-static struct ext4_dir_entry_2* dx_pack_dirents(char *base, int size)
+static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize)
 {
        struct ext4_dir_entry_2 *next, *to, *prev, *de = (struct ext4_dir_entry_2 *) base;
        unsigned rec_len = 0;
        prev = to = de;
-        while ((char*)de < base + size) {
+        while ((char*)de < base + blocksize) {
-                next = ext4_next_entry(de);
+                next = ext4_next_entry(de, blocksize);
                if (de->inode && de->name_len) {
                        rec_len = EXT4_DIR_REC_LEN(de->name_len);
                        if (de > to)
                                memmove(to, de, rec_len);
-                        to->rec_len = ext4_rec_len_to_disk(rec_len);
+                        to->rec_len = ext4_rec_len_to_disk(rec_len, blocksize);
                        prev = to;
                        to = (struct ext4_dir_entry_2 *) (((char *) to) + rec_len);
                }
@@ -1215,10 +1249,12 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
                                        hash2, split, count-split));
        /* Fancy dance to stay within two buffers */
-        de2 = dx_move_dirents(data1, data2, map + split, count - split);
+        de2 = dx_move_dirents(data1, data2, map + split, count - split, blocksize);
        de = dx_pack_dirents(data1, blocksize);
-        de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de);
+        de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de,
-        de2->rec_len = ext4_rec_len_to_disk(data2 + blocksize - (char *) de2);
+                                           blocksize);
+        de2->rec_len = ext4_rec_len_to_disk(data2 + blocksize - (char *) de2,
+                                            blocksize);
        dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1));
        dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1));
@@ -1268,6 +1304,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
        const char      *name = dentry->d_name.name;
        int             namelen = dentry->d_name.len;
        unsigned int    offset = 0;
+        unsigned int    blocksize = dir->i_sb->s_blocksize;
        unsigned short  reclen;
        int             nlen, rlen, err;
        char            *top;
@@ -1275,7 +1312,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
        reclen = EXT4_DIR_REC_LEN(namelen);
        if (!de) {
                de = (struct ext4_dir_entry_2 *)bh->b_data;
-                top = bh->b_data + dir->i_sb->s_blocksize - reclen;
+                top = bh->b_data + blocksize - reclen;
                while ((char *) de <= top) {
                        if (!ext4_check_dir_entry("ext4_add_entry", dir, de,
                                                  bh, offset)) {
@@ -1287,7 +1324,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
                                return -EEXIST;
                        }
                        nlen = EXT4_DIR_REC_LEN(de->name_len);
-                        rlen = ext4_rec_len_from_disk(de->rec_len);
+                        rlen = ext4_rec_len_from_disk(de->rec_len, blocksize);
                        if ((de->inode? rlen - nlen: rlen) >= reclen)
                                break;
                        de = (struct ext4_dir_entry_2 *)((char *)de + rlen);
@@ -1306,11 +1343,11 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
        /* By now the buffer is marked for journaling */
        nlen = EXT4_DIR_REC_LEN(de->name_len);
-        rlen = ext4_rec_len_from_disk(de->rec_len);
+        rlen = ext4_rec_len_from_disk(de->rec_len, blocksize);
        if (de->inode) {
                struct ext4_dir_entry_2 *de1 = (struct ext4_dir_entry_2 *)((char *)de + nlen);
-                de1->rec_len = ext4_rec_len_to_disk(rlen - nlen);
+                de1->rec_len = ext4_rec_len_to_disk(rlen - nlen, blocksize);
-                de->rec_len = ext4_rec_len_to_disk(nlen);
+                de->rec_len = ext4_rec_len_to_disk(nlen, blocksize);
                de = de1;
        }
        de->file_type = EXT4_FT_UNKNOWN;
@@ -1380,7 +1417,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
        /* The 0th block becomes the root, move the dirents out */
        fde = &root->dotdot;
        de = (struct ext4_dir_entry_2 *)((char *)fde +
-                ext4_rec_len_from_disk(fde->rec_len));
+                ext4_rec_len_from_disk(fde->rec_len, blocksize));
        if ((char *) de >= (((char *) root) + blocksize)) {
                ext4_error(dir->i_sb, __func__,
                           "invalid rec_len for '..' in inode %lu",
@@ -1402,12 +1439,14 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
        memcpy (data1, de, len);
        de = (struct ext4_dir_entry_2 *) data1;
        top = data1 + len;
-        while ((char *)(de2 = ext4_next_entry(de)) < top)
+        while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top)
                de = de2;
-        de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de);
+        de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de,
+                                           blocksize);
        /* Initialize the root; the dot dirents already exist */
        de = (struct ext4_dir_entry_2 *) (&root->dotdot);
-        de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(2));
+        de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(2),
+                                           blocksize);
        memset (&root->info, 0, sizeof(root->info));
        root->info.info_length = sizeof(root->info);
        root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
@@ -1488,7 +1527,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
                return retval;
        de = (struct ext4_dir_entry_2 *) bh->b_data;
        de->inode = 0;
-        de->rec_len = ext4_rec_len_to_disk(blocksize);
+        de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize);
        return add_dirent_to_buf(handle, dentry, inode, de, bh);
 }
@@ -1551,7 +1590,8 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
                        goto cleanup;
                node2 = (struct dx_node *)(bh2->b_data);
                entries2 = node2->entries;
-                node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize);
+                node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize,
+                                                           sb->s_blocksize);
                node2->fake.inode = 0;
                BUFFER_TRACE(frame->bh, "get_write_access");
                err = ext4_journal_get_write_access(handle, frame->bh);
@@ -1639,6 +1679,7 @@ static int ext4_delete_entry(handle_t *handle,
                             struct buffer_head *bh)
 {
        struct ext4_dir_entry_2 *de, *pde;
+        unsigned int blocksize = dir->i_sb->s_blocksize;
        int i;
        i = 0;
@@ -1652,8 +1693,11 @@ static int ext4_delete_entry(handle_t *handle,
                        ext4_journal_get_write_access(handle, bh);
                        if (pde)
                                pde->rec_len = ext4_rec_len_to_disk(
-                                        ext4_rec_len_from_disk(pde->rec_len) +
+                                        ext4_rec_len_from_disk(pde->rec_len,
-                                        ext4_rec_len_from_disk(de->rec_len));
+                                                               blocksize) +
+                                        ext4_rec_len_from_disk(de->rec_len,
+                                                               blocksize),
+                                        blocksize);
                        else
                                de->inode = 0;
                        dir->i_version++;
@@ -1661,9 +1705,9 @@ static int ext4_delete_entry(handle_t *handle,
                        ext4_handle_dirty_metadata(handle, dir, bh);
                        return 0;
                }
-                i += ext4_rec_len_from_disk(de->rec_len);
+                i += ext4_rec_len_from_disk(de->rec_len, blocksize);
                pde = de;
-                de = ext4_next_entry(de);
+                de = ext4_next_entry(de, blocksize);
        }
        return -ENOENT;
 }
@@ -1793,6 +1837,7 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode)
        struct inode *inode;
        struct buffer_head *dir_block;
        struct ext4_dir_entry_2 *de;
+        unsigned int blocksize = dir->i_sb->s_blocksize;
        int err, retries = 0;
        if (EXT4_DIR_LINK_MAX(dir))
@@ -1824,13 +1869,14 @@ retry:
        de = (struct ext4_dir_entry_2 *) dir_block->b_data;
        de->inode = cpu_to_le32(inode->i_ino);
        de->name_len = 1;
-        de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len));
+        de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len),
+                                           blocksize);
        strcpy(de->name, ".");
        ext4_set_de_type(dir->i_sb, de, S_IFDIR);
-        de = ext4_next_entry(de);
+        de = ext4_next_entry(de, blocksize);
        de->inode = cpu_to_le32(dir->i_ino);
-        de->rec_len = ext4_rec_len_to_disk(inode->i_sb->s_blocksize -
+        de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(1),
-                                                EXT4_DIR_REC_LEN(1));
+                                           blocksize);
        de->name_len = 2;
        strcpy(de->name, "..");
        ext4_set_de_type(dir->i_sb, de, S_IFDIR);
@@ -1885,7 +1931,7 @@ static int empty_dir(struct inode *inode)
                return 1;
        }
        de = (struct ext4_dir_entry_2 *) bh->b_data;
-        de1 = ext4_next_entry(de);
+        de1 = ext4_next_entry(de, sb->s_blocksize);
        if (le32_to_cpu(de->inode) != inode->i_ino ||
                        !le32_to_cpu(de1->inode) ||
                        strcmp(".", de->name) ||
@@ -1896,9 +1942,9 @@ static int empty_dir(struct inode *inode)
                brelse(bh);
                return 1;
        }
-        offset = ext4_rec_len_from_disk(de->rec_len) +
+        offset = ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize) +
-                 ext4_rec_len_from_disk(de1->rec_len);
+                 ext4_rec_len_from_disk(de1->rec_len, sb->s_blocksize);
-        de = ext4_next_entry(de1);
+        de = ext4_next_entry(de1, sb->s_blocksize);
        while (offset < inode->i_size) {
                if (!bh ||
                        (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
@@ -1927,8 +1973,8 @@ static int empty_dir(struct inode *inode)
                        brelse(bh);
                        return 0;
                }
-                offset += ext4_rec_len_from_disk(de->rec_len);
+                offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize);
-                de = ext4_next_entry(de);
+                de = ext4_next_entry(de, sb->s_blocksize);
        }
        brelse(bh);
        return 1;
@@ -2297,8 +2343,8 @@ retry:
        return err;
 }
-#define PARENT_INO(buffer) \
+#define PARENT_INO(buffer, size) \
-        (ext4_next_entry((struct ext4_dir_entry_2 *)(buffer))->inode)
+        (ext4_next_entry((struct ext4_dir_entry_2 *)(buffer), size)->inode)
 /*
 * Anybody can rename anything with this: the permission checks are left to the
@@ -2311,7 +2357,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
        struct inode *old_inode, *new_inode;
        struct buffer_head *old_bh, *new_bh, *dir_bh;
        struct ext4_dir_entry_2 *old_de, *new_de;
-        int retval;
+        int retval, force_da_alloc = 0;
        old_bh = new_bh = dir_bh = NULL;
@@ -2358,7 +2404,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
                dir_bh = ext4_bread(handle, old_inode, 0, 0, &retval);
                if (!dir_bh)
                        goto end_rename;
-                if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino)
+                if (le32_to_cpu(PARENT_INO(dir_bh->b_data,
+                                old_dir->i_sb->s_blocksize)) != old_dir->i_ino)
                        goto end_rename;
                retval = -EMLINK;
                if (!new_inode && new_dir != old_dir &&
@@ -2430,7 +2477,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
        if (dir_bh) {
                BUFFER_TRACE(dir_bh, "get_write_access");
                ext4_journal_get_write_access(handle, dir_bh);
-                PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino);
+                PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) =
+                                                cpu_to_le32(new_dir->i_ino);
                BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata");
                ext4_handle_dirty_metadata(handle, old_dir, dir_bh);
                ext4_dec_count(handle, old_dir);
@@ -2449,6 +2497,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
                ext4_mark_inode_dirty(handle, new_inode);
                if (!new_inode->i_nlink)
                        ext4_orphan_add(handle, new_inode);
+                if (!test_opt(new_dir->i_sb, NO_AUTO_DA_ALLOC))
+                        force_da_alloc = 1;
        }
        retval = 0;
@@ -2457,6 +2507,8 @@ end_rename:
        brelse(old_bh);
        brelse(new_bh);
        ext4_journal_stop(handle);
+        if (retval == 0 && force_da_alloc)
+                ext4_alloc_da_blocks(old_inode);
        return retval;
 }
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index c06886abd658..546c7dd869e1 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -938,10 +938,10 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
        if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
                ext4_group_t flex_group;
                flex_group = ext4_flex_group(sbi, input->group);
-                sbi->s_flex_groups[flex_group].free_blocks +=
+                atomic_add(input->free_blocks_count,
-                        input->free_blocks_count;
+                           &sbi->s_flex_groups[flex_group].free_blocks);
-                sbi->s_flex_groups[flex_group].free_inodes +=
+                atomic_add(EXT4_INODES_PER_GROUP(sb),
-                        EXT4_INODES_PER_GROUP(sb);
+                           &sbi->s_flex_groups[flex_group].free_inodes);
        }
        ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index f7371a6a923d..9987bba99db3 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -35,6 +35,7 @@
 #include <linux/quotaops.h>
 #include <linux/seq_file.h>
 #include <linux/proc_fs.h>
+#include <linux/ctype.h>
 #include <linux/marker.h>
 #include <linux/log2.h>
 #include <linux/crc16.h>
@@ -48,6 +49,7 @@
 #include "group.h"
 struct proc_dir_entry *ext4_proc_root;
+static struct kset *ext4_kset;
 static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
                             unsigned long journal_devnum);
@@ -577,9 +579,9 @@ static void ext4_put_super(struct super_block *sb)
                ext4_commit_super(sb, es, 1);
        }
        if (sbi->s_proc) {
-                remove_proc_entry("inode_readahead_blks", sbi->s_proc);
                remove_proc_entry(sb->s_id, ext4_proc_root);
        }
+        kobject_del(&sbi->s_kobj);
        for (i = 0; i < sbi->s_gdb_count; i++)
                brelse(sbi->s_group_desc[i]);
@@ -615,6 +617,17 @@ static void ext4_put_super(struct super_block *sb)
                ext4_blkdev_remove(sbi);
        }
        sb->s_fs_info = NULL;
+        /*
+         * Now that we are completely done shutting down the
+         * superblock, we need to actually destroy the kobject.
+         */
+        unlock_kernel();
+        unlock_super(sb);
+        kobject_put(&sbi->s_kobj);
+        wait_for_completion(&sbi->s_kobj_unregister);
+        lock_super(sb);
+        lock_kernel();
+        kfree(sbi->s_blockgroup_lock);
        kfree(sbi);
        return;
 }
@@ -803,8 +816,6 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
        if (!test_opt(sb, POSIX_ACL) && (def_mount_opts & EXT4_DEFM_ACL))
                seq_puts(seq, ",noacl");
 #endif
-        if (!test_opt(sb, RESERVATION))
-                seq_puts(seq, ",noreservation");
        if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) {
                seq_printf(seq, ",commit=%u",
                           (unsigned) (sbi->s_commit_interval / HZ));
@@ -855,6 +866,9 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
        if (test_opt(sb, DATA_ERR_ABORT))
                seq_puts(seq, ",data_err=abort");
+        if (test_opt(sb, NO_AUTO_DA_ALLOC))
+                seq_puts(seq, ",noauto_da_alloc");
        ext4_show_quota_options(seq, sb);
        return 0;
 }
@@ -1004,7 +1018,7 @@ enum {
        Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro,
        Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov,
        Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
-        Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh,
+        Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload, Opt_nobh, Opt_bh,
        Opt_commit, Opt_min_batch_time, Opt_max_batch_time,
        Opt_journal_update, Opt_journal_dev,
        Opt_journal_checksum, Opt_journal_async_commit,
@@ -1012,8 +1026,8 @@ enum {
        Opt_data_err_abort, Opt_data_err_ignore,
        Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
        Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
-        Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
+        Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, Opt_resize,
-        Opt_grpquota, Opt_i_version,
+        Opt_usrquota, Opt_grpquota, Opt_i_version,
        Opt_stripe, Opt_delalloc, Opt_nodelalloc,
        Opt_inode_readahead_blks, Opt_journal_ioprio
 };
@@ -1039,8 +1053,6 @@ static const match_table_t tokens = {
        {Opt_nouser_xattr, "nouser_xattr"},
        {Opt_acl, "acl"},
        {Opt_noacl, "noacl"},
-        {Opt_reservation, "reservation"},
-        {Opt_noreservation, "noreservation"},
        {Opt_noload, "noload"},
        {Opt_nobh, "nobh"},
        {Opt_bh, "bh"},
@@ -1068,6 +1080,8 @@ static const match_table_t tokens = {
        {Opt_quota, "quota"},
        {Opt_usrquota, "usrquota"},
        {Opt_barrier, "barrier=%u"},
+        {Opt_barrier, "barrier"},
+        {Opt_nobarrier, "nobarrier"},
        {Opt_i_version, "i_version"},
        {Opt_stripe, "stripe=%u"},
        {Opt_resize, "resize"},
@@ -1075,6 +1089,9 @@ static const match_table_t tokens = {
        {Opt_nodelalloc, "nodelalloc"},
        {Opt_inode_readahead_blks, "inode_readahead_blks=%u"},
        {Opt_journal_ioprio, "journal_ioprio=%u"},
+        {Opt_auto_da_alloc, "auto_da_alloc=%u"},
+        {Opt_auto_da_alloc, "auto_da_alloc"},
+        {Opt_noauto_da_alloc, "noauto_da_alloc"},
        {Opt_err, NULL},
 };
@@ -1207,12 +1224,6 @@ static int parse_options(char *options, struct super_block *sb,
                               "not supported\n");
                        break;
 #endif
-                case Opt_reservation:
-                        set_opt(sbi->s_mount_opt, RESERVATION);
-                        break;
-                case Opt_noreservation:
-                        clear_opt(sbi->s_mount_opt, RESERVATION);
-                        break;
                case Opt_journal_update:
                        /* @@@ FIXME */
                        /* Eventually we will want to be able to create
@@ -1415,9 +1426,14 @@ set_qf_format:
                case Opt_abort:
                        set_opt(sbi->s_mount_opt, ABORT);
                        break;
+                case Opt_nobarrier:
+                        clear_opt(sbi->s_mount_opt, BARRIER);
+                        break;
                case Opt_barrier:
-                        if (match_int(&args[0], &option))
+                        if (match_int(&args[0], &option)) {
-                                return 0;
+                                set_opt(sbi->s_mount_opt, BARRIER);
+                                break;
+                        }
                        if (option)
                                set_opt(sbi->s_mount_opt, BARRIER);
                        else
@@ -1463,6 +1479,11 @@ set_qf_format:
                                return 0;
                        if (option < 0 || option > (1 << 30))
                                return 0;
+                        if (option & (option - 1)) {
+                                printk(KERN_ERR "EXT4-fs: inode_readahead_blks"
+                                       " must be a power of 2\n");
+                                return 0;
+                        }
                        sbi->s_inode_readahead_blks = option;
                        break;
                case Opt_journal_ioprio:
@@ -1473,6 +1494,19 @@ set_qf_format:
                        *journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE,
                                                            option);
                        break;
+                case Opt_noauto_da_alloc:
+                        set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC);
+                        break;
+                case Opt_auto_da_alloc:
+                        if (match_int(&args[0], &option)) {
+                                clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC);
+                                break;
+                        }
+                        if (option)
+                                clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC);
+                        else
+                                set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC);
+                        break;
                default:
                        printk(KERN_ERR
                               "EXT4-fs: Unrecognized mount option \"%s\" "
@@ -1612,10 +1646,12 @@ static int ext4_fill_flex_info(struct super_block *sb)
                gdp = ext4_get_group_desc(sb, i, &bh);
                flex_group = ext4_flex_group(sbi, i);
-                sbi->s_flex_groups[flex_group].free_inodes +=
+                atomic_set(&sbi->s_flex_groups[flex_group].free_inodes,
-                        ext4_free_inodes_count(sb, gdp);
+                           ext4_free_inodes_count(sb, gdp));
-                sbi->s_flex_groups[flex_group].free_blocks +=
+                atomic_set(&sbi->s_flex_groups[flex_group].free_blocks,
-                        ext4_free_blks_count(sb, gdp);
+                           ext4_free_blks_count(sb, gdp));
+                atomic_set(&sbi->s_flex_groups[flex_group].used_dirs,
+                           ext4_used_dirs_count(sb, gdp));
        }
        return 1;
@@ -1991,6 +2027,181 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
        return 0;
 }
+/* sysfs support */
+/*
+ * Descriptor for one ext4 sysfs attribute: the generic attribute plus
+ * ext4-specific callbacks that are handed the owning ext4_sb_info.
+ */
+struct ext4_attr {
+        struct attribute attr;
+        /* Formats the attribute value into the supplied buffer. */
+        ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *);
+        /* Parses the supplied buffer of the given length and applies it. */
+        ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *, 
+                         const char *, size_t);
+        /* Byte offset into struct ext4_sb_info, filled in by EXT4_ATTR_OFFSET. */
+        int offset;
+};
+/*
+ * Parse an unsigned long from buf, tolerating surrounding whitespace.
+ * The parsed number is always written to *value.  Returns 0 on success,
+ * or -EINVAL when non-whitespace text follows the number or the number
+ * is larger than max.
+ */
+static int parse_strtoul(const char *buf,
+                unsigned long max, unsigned long *value)
+{
+        char *tail;
+        unsigned long parsed;
+        /* Skip leading whitespace. */
+        for (; *buf && isspace(*buf); buf++)
+                ;
+        parsed = simple_strtoul(buf, &tail, 0);
+        *value = parsed;
+        /* Skip trailing whitespace; anything left over is an error. */
+        for (; *tail && isspace(*tail); tail++)
+                ;
+        if (*tail != '\0')
+                return -EINVAL;
+        return parsed > max ? -EINVAL : 0;
+}
+/* Show the current sum of the s_dirtyblocks_counter per-cpu counter. */
+static ssize_t delayed_allocation_blocks_show(struct ext4_attr *a,
+                                              struct ext4_sb_info *sbi,
+                                              char *buf)
+{
+        s64 dirty = (s64) percpu_counter_sum(&sbi->s_dirtyblocks_counter);
+        return snprintf(buf, PAGE_SIZE, "%llu\n", dirty);
+}
+/*
+ * Show the growth of the partition's sectors[1] statistic since
+ * s_sectors_written_start was recorded, shifted right by one.
+ */
+static ssize_t session_write_kbytes_show(struct ext4_attr *a,
+                                         struct ext4_sb_info *sbi, char *buf)
+{
+        return snprintf(buf, PAGE_SIZE, "%lu\n",
+                        (part_stat_read(sbi->s_buddy_cache->i_sb->s_bdev->bd_part,
+                                        sectors[1]) -
+                         sbi->s_sectors_written_start) >> 1);
+}
+/*
+ * Show s_kbytes_written plus the growth of the partition's sectors[1]
+ * statistic since s_sectors_written_start, the latter shifted right by one.
+ */
+static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a,
+                                          struct ext4_sb_info *sbi, char *buf)
+{
+        struct super_block *super = sbi->s_buddy_cache->i_sb;
+        return snprintf(buf, PAGE_SIZE, "%llu\n",
+                        ((part_stat_read(super->s_bdev->bd_part, sectors[1]) -
+                          EXT4_SB(super)->s_sectors_written_start) >> 1) +
+                        sbi->s_kbytes_written);
+}
+/*
+ * Store handler for inode_readahead_blks.  Returns count on success and
+ * -EINVAL when the input is rejected.
+ */
+static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
+                                          struct ext4_sb_info *sbi,
+                                          const char *buf, size_t count)
+{
+        unsigned long blks;
+        /*
+         * Reject input that fails to parse, exceeds 0x40000000, or has more
+         * than one bit set (inode_readahead_blks must be a power of 2).
+         */
+        if (parse_strtoul(buf, 0x40000000, &blks) || (blks & (blks - 1)))
+                return -EINVAL;
+        sbi->s_inode_readahead_blks = blks;
+        return count;
+}
+/* Show the unsigned int found a->offset bytes into the ext4_sb_info. */
+static ssize_t sbi_ui_show(struct ext4_attr *a,
+                                struct ext4_sb_info *sbi, char *buf)
+{
+        char *field = (char *) sbi + a->offset;
+        return snprintf(buf, PAGE_SIZE, "%u\n", *(unsigned int *) field);
+}
+/*
+ * Parse buf as a number no larger than 0xffffffff and write it to the
+ * unsigned int found a->offset bytes into the ext4_sb_info.  Returns
+ * count on success, -EINVAL when parsing fails.
+ */
+static ssize_t sbi_ui_store(struct ext4_attr *a,
+                            struct ext4_sb_info *sbi,
+                            const char *buf, size_t count)
+{
+        unsigned long parsed;
+        if (parse_strtoul(buf, 0xffffffff, &parsed) != 0)
+                return -EINVAL;
+        *(unsigned int *) ((char *) sbi + a->offset) = parsed;
+        return count;
+}
+/*
+ * Define ext4_attr_<_name> with explicit show/store handlers and record
+ * the offset of field _elname within struct ext4_sb_info.
+ */
+#define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname) \
+static struct ext4_attr ext4_attr_##_name = {                   \
+        .attr = {.name = __stringify(_name), .mode = _mode },   \
+        .show   = _show,                                        \
+        .store  = _store,                                       \
+        .offset = offsetof(struct ext4_sb_info, _elname),       \
+}
+/* Define ext4_attr_<name> from __ATTR(); no offset is recorded. */
+#define EXT4_ATTR(name, mode, show, store) \
+static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
+/* Mode 0444 attribute using <name>_show and no store handler. */
+#define EXT4_RO_ATTR(name) EXT4_ATTR(name, 0444, name##_show, NULL)
+/* Mode 0644 attribute using <name>_show and <name>_store. */
+#define EXT4_RW_ATTR(name) EXT4_ATTR(name, 0644, name##_show, name##_store)
+/* Mode 0644 attribute served by sbi_ui_show/sbi_ui_store on field elname. */
+#define EXT4_RW_ATTR_SBI_UI(name, elname)       \
+        EXT4_ATTR_OFFSET(name, 0644, sbi_ui_show, sbi_ui_store, elname)
+/* Address of the struct attribute embedded in ext4_attr_<name>. */
+#define ATTR_LIST(name) &ext4_attr_##name.attr
+/* Read-only attributes, each backed by its own <name>_show function. */
+EXT4_RO_ATTR(delayed_allocation_blocks);
+EXT4_RO_ATTR(session_write_kbytes);
+EXT4_RO_ATTR(lifetime_write_kbytes);
+/* Generic show, but a dedicated store that validates the new value. */
+EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show,
+                 inode_readahead_blks_store, s_inode_readahead_blks);
+/*
+ * Read-write attributes accessed as unsigned int through
+ * sbi_ui_show/sbi_ui_store; the second argument names the ext4_sb_info field.
+ */
+EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats);
+EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan);
+EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
+EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
+EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
+EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
+/* NULL-terminated attribute list used as default_attrs of ext4_ktype. */
+static struct attribute *ext4_attrs[] = {
+        ATTR_LIST(delayed_allocation_blocks),
+        ATTR_LIST(session_write_kbytes),
+        ATTR_LIST(lifetime_write_kbytes),
+        ATTR_LIST(inode_readahead_blks),
+        ATTR_LIST(mb_stats),
+        ATTR_LIST(mb_max_to_scan),
+        ATTR_LIST(mb_min_to_scan),
+        ATTR_LIST(mb_order2_req),
+        ATTR_LIST(mb_stream_req),
+        ATTR_LIST(mb_group_prealloc),
+        NULL,
+};
+/*
+ * sysfs show entry point: recover the ext4_attr and ext4_sb_info from the
+ * embedded attribute/kobject and forward to the attribute's show handler.
+ * Returns 0 when the attribute has no show handler.
+ */
+static ssize_t ext4_attr_show(struct kobject *kobj,
+                              struct attribute *attr, char *buf)
+{
+        struct ext4_attr *a = container_of(attr, struct ext4_attr, attr);
+        struct ext4_sb_info *sbi;
+        if (!a->show)
+                return 0;
+        sbi = container_of(kobj, struct ext4_sb_info, s_kobj);
+        return a->show(a, sbi, buf);
+}
+/*
+ * sysfs store entry point: recover the ext4_attr and ext4_sb_info from the
+ * embedded attribute/kobject and forward to the attribute's store handler.
+ * Returns 0 when the attribute has no store handler.
+ */
+static ssize_t ext4_attr_store(struct kobject *kobj,
+                               struct attribute *attr,
+                               const char *buf, size_t len)
+{
+        struct ext4_attr *a = container_of(attr, struct ext4_attr, attr);
+        struct ext4_sb_info *sbi;
+        if (!a->store)
+                return 0;
+        sbi = container_of(kobj, struct ext4_sb_info, s_kobj);
+        return a->store(a, sbi, buf, len);
+}
+/*
+ * kobject release callback: signal s_kobj_unregister on the ext4_sb_info
+ * that embeds this kobject.
+ */
+static void ext4_sb_release(struct kobject *kobj)
+{
+        complete(&container_of(kobj, struct ext4_sb_info,
+                               s_kobj)->s_kobj_unregister);
+}
+/* Routes sysfs reads and writes to ext4_attr_show/ext4_attr_store. */
+static struct sysfs_ops ext4_attr_ops = {
+        .show   = ext4_attr_show,
+        .store  = ext4_attr_store,
+};
+/*
+ * kobj_type for the s_kobj embedded in ext4_sb_info: default attribute
+ * list, sysfs dispatch ops, and the release callback.
+ */
+static struct kobj_type ext4_ktype = {
+        .default_attrs  = ext4_attrs,
+        .sysfs_ops      = &ext4_attr_ops,
+        .release        = ext4_sb_release,
+};
 static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                                __releases(kernel_lock)
                                __acquires(kernel_lock)
@@ -2021,12 +2232,21 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
        if (!sbi)
                return -ENOMEM;
+        sbi->s_blockgroup_lock =
+                kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
+        if (!sbi->s_blockgroup_lock) {
+                kfree(sbi);
+                return -ENOMEM;
+        }
        sb->s_fs_info = sbi;
        sbi->s_mount_opt = 0;
        sbi->s_resuid = EXT4_DEF_RESUID;
        sbi->s_resgid = EXT4_DEF_RESGID;
        sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
        sbi->s_sb_block = sb_block;
+        sbi->s_sectors_written_start = part_stat_read(sb->s_bdev->bd_part,
+                                                      sectors[1]);
        unlock_kernel();
@@ -2064,6 +2284,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        sb->s_magic = le16_to_cpu(es->s_magic);
        if (sb->s_magic != EXT4_SUPER_MAGIC)
                goto cantfind_ext4;
+        sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written);
        /* Set defaults before we parse the mount options */
        def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
@@ -2101,7 +2322,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
        sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
-        set_opt(sbi->s_mount_opt, RESERVATION);
        set_opt(sbi->s_mount_opt, BARRIER);
        /*
@@ -2325,14 +2545,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 #ifdef CONFIG_PROC_FS
        if (ext4_proc_root)
                sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root);
-        if (sbi->s_proc)
-                proc_create_data("inode_readahead_blks", 0644, sbi->s_proc,
-                                 &ext4_ui_proc_fops,
-                                 &sbi->s_inode_readahead_blks);
 #endif
-        bgl_lock_init(&sbi->s_blockgroup_lock);
+        bgl_lock_init(sbi->s_blockgroup_lock);
        for (i = 0; i < db_count; i++) {
                block = descriptor_loc(sb, logical_sb_block, i);
@@ -2564,6 +2779,16 @@ no_journal:
                goto failed_mount4;
        }
+        sbi->s_kobj.kset = ext4_kset;
+        init_completion(&sbi->s_kobj_unregister);
+        err = kobject_init_and_add(&sbi->s_kobj, &ext4_ktype, NULL,
+                                   "%s", sb->s_id);
+        if (err) {
+                ext4_mb_release(sb);
+                ext4_ext_release(sb);
+                goto failed_mount4;
+        };
        /*
         * akpm: core read_super() calls in here with the superblock locked.
         * That deadlocks, because orphan cleanup needs to lock the superblock
@@ -2618,7 +2843,6 @@ failed_mount2:
        kfree(sbi->s_group_desc);
 failed_mount:
        if (sbi->s_proc) {
-                remove_proc_entry("inode_readahead_blks", sbi->s_proc);
                remove_proc_entry(sb->s_id, ext4_proc_root);
        }
 #ifdef CONFIG_QUOTA
@@ -2913,6 +3137,10 @@ static int ext4_commit_super(struct super_block *sb,
                set_buffer_uptodate(sbh);
        }
        es->s_wtime = cpu_to_le32(get_seconds());
+        es->s_kbytes_written =
+                cpu_to_le64(EXT4_SB(sb)->s_kbytes_written + 
+                            ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
+                              EXT4_SB(sb)->s_sectors_written_start) >> 1));
        ext4_free_blocks_count_set(es, percpu_counter_sum_positive(
                                        &EXT4_SB(sb)->s_freeblocks_counter));
        es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive(
@@ -3647,45 +3875,6 @@ static int ext4_get_sb(struct file_system_type *fs_type,
        return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt);
 }
-#ifdef CONFIG_PROC_FS
-static int ext4_ui_proc_show(struct seq_file *m, void *v)
-{
-        unsigned int *p = m->private;
-        seq_printf(m, "%u\n", *p);
-        return 0;
-}
-static int ext4_ui_proc_open(struct inode *inode, struct file *file)
-{
-        return single_open(file, ext4_ui_proc_show, PDE(inode)->data);
-}
-static ssize_t ext4_ui_proc_write(struct file *file, const char __user *buf,
-                               size_t cnt, loff_t *ppos)
-{
-        unsigned long *p = PDE(file->f_path.dentry->d_inode)->data;
-        char str[32];
-        if (cnt >= sizeof(str))
-                return -EINVAL;
-        if (copy_from_user(str, buf, cnt))
-                return -EFAULT;
-        *p = simple_strtoul(str, NULL, 0);
-        return cnt;
-}
-const struct file_operations ext4_ui_proc_fops = {
-        .owner          = THIS_MODULE,
-        .open           = ext4_ui_proc_open,
-        .read           = seq_read,
-        .llseek         = seq_lseek,
-        .release        = single_release,
-        .write          = ext4_ui_proc_write,
-};
-#endif
 static struct file_system_type ext4_fs_type = {
        .owner          = THIS_MODULE,
        .name           = "ext4",
@@ -3719,6 +3908,9 @@ static int __init init_ext4_fs(void)
 {
        int err;
+        ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj);
+        if (!ext4_kset)
+                return -ENOMEM;
        ext4_proc_root = proc_mkdir("fs/ext4", NULL);
        err = init_ext4_mballoc();
        if (err)
@@ -3760,6 +3952,7 @@ static void __exit exit_ext4_fs(void)
        exit_ext4_xattr();
        exit_ext4_mballoc();
        remove_proc_entry("fs/ext4", NULL);
+        kset_unregister(ext4_kset);
 }
 MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 821d10f719bd..4e340fedf768 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1234,8 +1234,9 @@ static void fuse_vma_close(struct vm_area_struct *vma)
 * - sync(2)
 * - try_to_free_pages() with order > PAGE_ALLOC_COSTLY_ORDER
 */
-static int fuse_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+static int fuse_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
+        struct page *page = vmf->page;
        /*
         * Don't use page->mapping as it may become NULL from a
         * concurrent truncate.
diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index 3b9e8de3500b..70b9b8548945 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -337,8 +337,9 @@ static int gfs2_allocate_page_backing(struct page *page)
 * blocks allocated on disk to back that page.
 */
-static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
+        struct page *page = vmf->page;
        struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
        struct gfs2_inode *ip = GFS2_I(inode);
        struct gfs2_sbd *sdp = GFS2_SB(inode);
@@ -412,6 +413,8 @@ out_unlock:
        gfs2_glock_dq(&gh);
 out:
        gfs2_holder_uninit(&gh);
+        if (ret)
+                ret = VM_FAULT_SIGBUS;
        return ret;
 }
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 9b800d97a687..23a3c76711e0 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -943,14 +943,13 @@ static struct vfsmount *hugetlbfs_vfsmount;
+/*
+ * Non-zero when the caller has CAP_IPC_LOCK or is a member of
+ * sysctl_hugetlb_shm_group.
+ */
 static int can_do_hugetlb_shm(void)
 {
-        return likely(capable(CAP_IPC_LOCK) ||
+        return capable(CAP_IPC_LOCK) || in_group_p(sysctl_hugetlb_shm_group);
-                        in_group_p(sysctl_hugetlb_shm_group) ||
-                        can_do_mlock());
 }
 struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag)
 {
        int error = -ENOMEM;
+        int unlock_shm = 0;
        struct file *file;
        struct inode *inode;
        struct dentry *dentry, *root;
@@ -960,11 +959,14 @@ struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag)
        if (!hugetlbfs_vfsmount)
                return ERR_PTR(-ENOENT);
-        if (!can_do_hugetlb_shm())
+        if (!can_do_hugetlb_shm()) {
-                return ERR_PTR(-EPERM);
+                if (user_shm_lock(size, user)) {
+                        unlock_shm = 1;
-        if (!user_shm_lock(size, user))
+                        WARN_ONCE(1,
-                return ERR_PTR(-ENOMEM);
+                          "Using mlock ulimits for SHM_HUGETLB deprecated\n");
+                } else
+                        return ERR_PTR(-EPERM);
+        }
        root = hugetlbfs_vfsmount->mnt_root;
        quick_string.name = name;
@@ -1004,7 +1006,8 @@ out_inode:
 out_dentry:
        dput(dentry);
 out_shm_unlock:
-        user_shm_unlock(size, user);
+        if (unlock_shm)
+                user_shm_unlock(size, user);
        return ERR_PTR(error);
 }
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 62804e57a44c..4ea72377c7a2 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -367,6 +367,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
        int tag_bytes = journal_tag_bytes(journal);
        struct buffer_head *cbh = NULL; /* For transactional checksums */
        __u32 crc32_sum = ~0;
+        int write_op = WRITE;
        /*
         * First job: lock down the current transaction and wait for
@@ -401,6 +402,8 @@ void jbd2_journal_commit_transaction(journal_t *journal)
        spin_lock(&journal->j_state_lock);
        commit_transaction->t_state = T_LOCKED;
+        if (commit_transaction->t_synchronous_commit)
+                write_op = WRITE_SYNC;
        stats.u.run.rs_wait = commit_transaction->t_max_wait;
        stats.u.run.rs_locked = jiffies;
        stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
@@ -680,7 +683,7 @@ start_journal_io:
                                clear_buffer_dirty(bh);
                                set_buffer_uptodate(bh);
                                bh->b_end_io = journal_end_buffer_io_sync;
-                                submit_bh(WRITE, bh);
+                                submit_bh(write_op, bh);
                        }
                        cond_resched();
                        stats.u.run.rs_blocks_logged += bufs;
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index 257ff2625765..bbe6d592d8b3 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -55,6 +55,25 @@
 *                      need do nothing.
 * RevokeValid set, Revoked set:
 *                      buffer has been revoked.
+ *
+ * Locking rules:
+ * We keep two hash tables of revoke records. One hashtable belongs to the
+ * running transaction (is pointed to by journal->j_revoke), the other one
+ * belongs to the committing transaction. Accesses to the second hash table
+ * happen only from the kjournald and no other thread touches this table.  Also
+ * journal_switch_revoke_table() which switches which hashtable belongs to the
+ * running and which to the committing transaction is called only from
+ * kjournald. Therefore we need no locks when accessing the hashtable belonging
+ * to the committing transaction.
+ *
+ * All users operating on the hash table belonging to the running transaction
+ * have a handle to the transaction. Therefore they are safe from kjournald
+ * switching hash tables under them. For operations on the lists of entries in
+ * the hash table j_revoke_lock is used.
+ *
+ * Finally, the replay code also uses the hash tables, but at that moment no one else
+ * can touch them (filesystem isn't mounted yet) and hence no locking is
+ * needed.
 */
 #ifndef __KERNEL__
@@ -401,8 +420,6 @@ int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr,
 * the second time we would still have a pending revoke to cancel.  So,
 * do not trust the Revoked bit on buffers unless RevokeValid is also
 * set.
- *
- * The caller must have the journal locked.
 */
 int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
 {
@@ -480,10 +497,7 @@ void jbd2_journal_switch_revoke_table(journal_t *journal)
 /*
 * Write revoke records to the journal for all entries in the current
 * revoke hash, deleting the entries as we go.
- *
- * Called with the journal lock held.
 */
 void jbd2_journal_write_revoke_records(journal_t *journal,
                                  transaction_t *transaction)
 {
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 28ce21d8598e..996ffda06bf3 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -1315,6 +1315,8 @@ int jbd2_journal_stop(handle_t *handle)
                }
        }
+        if (handle->h_sync)
+                transaction->t_synchronous_commit = 1;
        current->journal_info = NULL;
        spin_lock(&journal->j_state_lock);
        spin_lock(&transaction->t_handle_lock);
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index aedc47a264c1..1f3b0fc0d351 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -139,55 +139,6 @@ int nlmclnt_block(struct nlm_wait *block, struct nlm_rqst *req, long timeout)
        return 0;
 }
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-static const struct in6_addr *nlmclnt_map_v4addr(const struct sockaddr *sap,
-                                                 struct in6_addr *addr_mapped)
-{
-        const struct sockaddr_in *sin = (const struct sockaddr_in *)sap;
-        switch (sap->sa_family) {
-        case AF_INET6:
-                return &((const struct sockaddr_in6 *)sap)->sin6_addr;
-        case AF_INET:
-                ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, addr_mapped);
-                return addr_mapped;
-        }
-        return NULL;
-}
-/*
- * If lockd is using a PF_INET6 listener, all incoming requests appear
- * to come from AF_INET6 remotes.  The address of AF_INET remotes are
- * mapped to AF_INET6 automatically by the network layer.  In case the
- * user passed an AF_INET server address at mount time, ensure both
- * addresses are AF_INET6 before comparing them.
- */
-static int nlmclnt_cmp_addr(const struct nlm_host *host,
-                            const struct sockaddr *sap)
-{
-        const struct in6_addr *addr1;
-        const struct in6_addr *addr2;
-        struct in6_addr addr1_mapped;
-        struct in6_addr addr2_mapped;
-        addr1 = nlmclnt_map_v4addr(nlm_addr(host), &addr1_mapped);
-        if (likely(addr1 != NULL)) {
-                addr2 = nlmclnt_map_v4addr(sap, &addr2_mapped);
-                if (likely(addr2 != NULL))
-                        return ipv6_addr_equal(addr1, addr2);
-        }
-        return 0;
-}
-#else   /* !(CONFIG_IPV6 || CONFIG_IPV6_MODULE) */
-static int nlmclnt_cmp_addr(const struct nlm_host *host,
-                            const struct sockaddr *sap)
-{
-        return nlm_cmp_addr(nlm_addr(host), sap);
-}
-#endif  /* !(CONFIG_IPV6 || CONFIG_IPV6_MODULE) */
 /*
 * The server lockd has called us back to tell us the lock was granted
 */
@@ -215,7 +166,7 @@ __be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock)
                 */
                if (fl_blocked->fl_u.nfs_fl.owner->pid != lock->svid)
                        continue;
-                if (!nlmclnt_cmp_addr(block->b_host, addr))
+                if (!nlm_cmp_addr(nlm_addr(block->b_host), addr))
                        continue;
                if (nfs_compare_fh(NFS_FH(fl_blocked->fl_file->f_path.dentry->d_inode) ,fh) != 0)
                        continue;
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 5e2c4d5ac827..6d5d4a4169e5 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -16,6 +16,8 @@
 #include <linux/sunrpc/svc.h>
 #include <linux/lockd/lockd.h>
+#include <asm/unaligned.h>
 #define NLMDBG_FACILITY         NLMDBG_MONITOR
 #define NSM_PROGRAM             100024
 #define NSM_VERSION             1
@@ -274,10 +276,12 @@ static void nsm_init_private(struct nsm_handle *nsm)
 {
        u64 *p = (u64 *)&nsm->sm_priv.data;
        struct timespec ts;
+        s64 ns;
        ktime_get_ts(&ts);
-        *p++ = timespec_to_ns(&ts);
+        ns = timespec_to_ns(&ts);
-        *p = (unsigned long)nsm;
+        put_unaligned(ns, p);
+        put_unaligned((unsigned long)nsm, p + 1);
 }
 static struct nsm_handle *nsm_create_handle(const struct sockaddr *sap,
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 64f1c31b5853..abf83881f68a 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -53,17 +53,6 @@ static struct svc_rqst		*nlmsvc_rqst;
 unsigned long                   nlmsvc_timeout;
 /*
- * If the kernel has IPv6 support available, always listen for
- * both AF_INET and AF_INET6 requests.
- */
-#if (defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)) && \
-        defined(CONFIG_SUNRPC_REGISTER_V4)
-static const sa_family_t        nlmsvc_family = AF_INET6;
-#else   /* (CONFIG_IPV6 || CONFIG_IPV6_MODULE) && CONFIG_SUNRPC_REGISTER_V4 */
-static const sa_family_t        nlmsvc_family = AF_INET;
-#endif  /* (CONFIG_IPV6 || CONFIG_IPV6_MODULE) && CONFIG_SUNRPC_REGISTER_V4 */
-/*
 * These can be set at insmod time (useful for NFS as root filesystem),
 * and also changed through the sysctl interface.  -- Jamie Lokier, Aug 2003
 */
@@ -204,19 +193,30 @@ lockd(void *vrqstp)
        return 0;
 }
+/*
+ * Make sure serv has a transport of the given name and address family.
+ * Returns 0 when svc_find_xprt() already finds one (the reference it took
+ * is dropped again); otherwise returns the result of svc_create_xprt().
+ */
-static int create_lockd_listener(struct svc_serv *serv, char *name,
+static int create_lockd_listener(struct svc_serv *serv, const char *name,
-                                 unsigned short port)
+                                 const int family, const unsigned short port)
 {
         struct svc_xprt *xprt;
-        xprt = svc_find_xprt(serv, name, 0, 0);
+        xprt = svc_find_xprt(serv, name, family, 0);
         if (xprt == NULL)
-                return svc_create_xprt(serv, name, port, SVC_SOCK_DEFAULTS);
+                return svc_create_xprt(serv, name, family, port,
+                                                SVC_SOCK_DEFAULTS);
         svc_xprt_put(xprt);
         return 0;
 }
+/*
+ * Set up the "udp" and then the "tcp" lockd listener for one address
+ * family.  The tcp listener is only attempted when the udp one did not
+ * return a negative error; the last result obtained is returned.
+ */
+static int create_lockd_family(struct svc_serv *serv, const int family)
+{
+        int err = create_lockd_listener(serv, "udp", family, nlm_udpport);
+        if (err >= 0)
+                err = create_lockd_listener(serv, "tcp", family, nlm_tcpport);
+        return err;
+}
 /*
 * Ensure there are active UDP and TCP listeners for lockd.
 *
@@ -232,13 +232,15 @@ static int make_socks(struct svc_serv *serv)
        static int warned;
        int err;
-        err = create_lockd_listener(serv, "udp", nlm_udpport);
+        err = create_lockd_family(serv, PF_INET);
        if (err < 0)
                goto out_err;
-        err = create_lockd_listener(serv, "tcp", nlm_tcpport);
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-        if (err < 0)
+        err = create_lockd_family(serv, PF_INET6);
+        if (err < 0 && err != -EAFNOSUPPORT)
                goto out_err;
+#endif  /* CONFIG_IPV6 || CONFIG_IPV6_MODULE */
        warned = 0;
        return 0;
@@ -274,7 +276,7 @@ int lockd_up(void)
                        "lockd_up: no pid, %d users??\n", nlmsvc_users);
        error = -ENOMEM;
-        serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, nlmsvc_family, NULL);
+        serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, NULL);
        if (!serv) {
                printk(KERN_WARNING "lockd_up: create service failed\n");
                goto out;
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 3e634f2a1083..a886e692ddd0 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -38,19 +38,10 @@ static struct svc_program nfs4_callback_program;
 unsigned int nfs_callback_set_tcpport;
 unsigned short nfs_callback_tcpport;
+unsigned short nfs_callback_tcpport6;
 static const int nfs_set_port_min = 0;
 static const int nfs_set_port_max = 65535;
-/*
- * If the kernel has IPv6 support available, always listen for
- * both AF_INET and AF_INET6 requests.
- */
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-static const sa_family_t        nfs_callback_family = AF_INET6;
-#else
-static const sa_family_t        nfs_callback_family = AF_INET;
-#endif
 static int param_set_port(const char *val, struct kernel_param *kp)
 {
        char *endp;
@@ -116,19 +107,29 @@ int nfs_callback_up(void)
        mutex_lock(&nfs_callback_mutex);
        if (nfs_callback_info.users++ || nfs_callback_info.task != NULL)
                goto out;
-        serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE,
+        serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, NULL);
-                                nfs_callback_family, NULL);
        ret = -ENOMEM;
        if (!serv)
                goto out_err;
-        ret = svc_create_xprt(serv, "tcp", nfs_callback_set_tcpport,
+        ret = svc_create_xprt(serv, "tcp", PF_INET,
-                              SVC_SOCK_ANONYMOUS);
+                                nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS);
        if (ret <= 0)
                goto out_err;
        nfs_callback_tcpport = ret;
        dprintk("NFS: Callback listener port = %u (af %u)\n",
-                        nfs_callback_tcpport, nfs_callback_family);
+                        nfs_callback_tcpport, PF_INET);
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+        ret = svc_create_xprt(serv, "tcp", PF_INET6,
+                                nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS);
+        if (ret > 0) {
+                nfs_callback_tcpport6 = ret;
+                dprintk("NFS: Callback listener port = %u (af %u)\n",
+                                nfs_callback_tcpport6, PF_INET6);
+        } else if (ret != -EAFNOSUPPORT)
+                goto out_err;
+#endif  /* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */
        nfs_callback_info.rqst = svc_prepare_thread(serv, &serv->sv_pools[0]);
        if (IS_ERR(nfs_callback_info.rqst)) {
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index bb25d2135ff1..e110e286a262 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -72,5 +72,6 @@ extern void nfs_callback_down(void);
 extern unsigned int nfs_callback_set_tcpport;
 extern unsigned short nfs_callback_tcpport;
+extern unsigned short nfs_callback_tcpport6;
 #endif /* __LINUX_FS_NFS_CALLBACK_H */
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 2277421656e7..aba38017bdef 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -224,38 +224,6 @@ void nfs_put_client(struct nfs_client *clp)
 }
 #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-static const struct in6_addr *nfs_map_ipv4_addr(const struct sockaddr *sa, struct in6_addr *addr_mapped)
-{
-        switch (sa->sa_family) {
-                default:
-                        return NULL;
-                case AF_INET6:
-                        return &((const struct sockaddr_in6 *)sa)->sin6_addr;
-                        break;
-                case AF_INET:
-                        ipv6_addr_set_v4mapped(((const struct sockaddr_in *)sa)->sin_addr.s_addr,
-                                        addr_mapped);
-                        return addr_mapped;
-        }
-}
-static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1,
-                const struct sockaddr *sa2)
-{
-        const struct in6_addr *addr1;
-        const struct in6_addr *addr2;
-        struct in6_addr addr1_mapped;
-        struct in6_addr addr2_mapped;
-        addr1 = nfs_map_ipv4_addr(sa1, &addr1_mapped);
-        if (likely(addr1 != NULL)) {
-                addr2 = nfs_map_ipv4_addr(sa2, &addr2_mapped);
-                if (likely(addr2 != NULL))
-                        return ipv6_addr_equal(addr1, addr2);
-        }
-        return 0;
-}
 /*
 * Test if two ip6 socket addresses refer to the same socket by
 * comparing relevant fields. The padding bytes specifically, are not
@@ -267,38 +235,21 @@ static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1,
 *
 * The caller should ensure both socket addresses are AF_INET6.
 */
-static int nfs_sockaddr_cmp_ip6(const struct sockaddr *sa1,
+static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1,
-                                const struct sockaddr *sa2)
+                                      const struct sockaddr *sa2)
 {
-        const struct sockaddr_in6 *saddr1 = (const struct sockaddr_in6 *)sa1;
+        const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sa1;
-        const struct sockaddr_in6 *saddr2 = (const struct sockaddr_in6 *)sa2;
+        const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sa2;
-        if (!ipv6_addr_equal(&saddr1->sin6_addr,
+        /* Link-local addresses only match when their scope ids agree. */
+        if (ipv6_addr_scope(&sin1->sin6_addr) == IPV6_ADDR_SCOPE_LINKLOCAL &&
-                             &saddr1->sin6_addr))
+            sin1->sin6_scope_id != sin2->sin6_scope_id)
                 return 0;
-        if (ipv6_addr_scope(&saddr1->sin6_addr) == IPV6_ADDR_SCOPE_LINKLOCAL &&
-            saddr1->sin6_scope_id != saddr2->sin6_scope_id)
-                return 0;
-        return saddr1->sin6_port == saddr2->sin6_port;
-}
-#else
-static int nfs_sockaddr_match_ipaddr4(const struct sockaddr_in *sa1,
-                                 const struct sockaddr_in *sa2)
-{
-        return sa1->sin_addr.s_addr == sa2->sin_addr.s_addr;
-}
-static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1,
+        /* Compare sa1's address with sa2's; comparing sin1 with itself always matches. */
+        return ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr);
-                                 const struct sockaddr *sa2)
-{
-        if (unlikely(sa1->sa_family != AF_INET || sa2->sa_family != AF_INET))
-                return 0;
-        return nfs_sockaddr_match_ipaddr4((const struct sockaddr_in *)sa1,
-                        (const struct sockaddr_in *)sa2);
 }
+#else   /* !defined(CONFIG_IPV6) && !defined(CONFIG_IPV6_MODULE) */
+/* Variant for builds without IPv6 support: never reports a match. */
-static int nfs_sockaddr_cmp_ip6(const struct sockaddr * sa1,
+static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1,
-                                const struct sockaddr * sa2)
+                                      const struct sockaddr *sa2)
 {
         return 0;
 }
@@ -311,20 +262,57 @@ static int nfs_sockaddr_cmp_ip6(const struct sockaddr * sa1,
 *
 * The caller should ensure both socket addresses are AF_INET.
 */
+static int nfs_sockaddr_match_ipaddr4(const struct sockaddr *sa1,
+                                      const struct sockaddr *sa2)
+{
+        const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sa1;
+        const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sa2;
+        return sin1->sin_addr.s_addr == sin2->sin_addr.s_addr;        /* address only; sin_port is not compared */
+}
+static int nfs_sockaddr_cmp_ip6(const struct sockaddr *sa1,
+                                const struct sockaddr *sa2)
+{
+        const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sa1;
+        const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sa2;
+        return nfs_sockaddr_match_ipaddr6(sa1, sa2) &&        /* IPv6 address test is delegated to the helper ... */
+                (sin1->sin6_port == sin2->sin6_port);        /* ... and the ports must be equal too */
+}
 static int nfs_sockaddr_cmp_ip4(const struct sockaddr *sa1,
                                 const struct sockaddr *sa2)
 {
-        const struct sockaddr_in *saddr1 = (const struct sockaddr_in *)sa1;
+        const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sa1;
-        const struct sockaddr_in *saddr2 = (const struct sockaddr_in *)sa2;
+        const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sa2;
-        if (saddr1->sin_addr.s_addr != saddr2->sin_addr.s_addr)
+        return nfs_sockaddr_match_ipaddr4(sa1, sa2) &&        /* IPv4 address test is delegated to the helper ... */
+                (sin1->sin_port == sin2->sin_port);        /* ... and the ports must be equal too */
+}
+/*
+ * Test if two socket addresses refer to the same IP address, by
+ * comparing (only) relevant fields; the port number is not compared.
+ *
+ * Returns 0 if the address families differ or the family is neither
+ * AF_INET nor AF_INET6; otherwise returns the result of the matching
+ * per-family address comparison helper.
+ */
+static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1,
+                                     const struct sockaddr *sa2)
+{
+        if (sa1->sa_family != sa2->sa_family)
                 return 0;
-        return saddr1->sin_port == saddr2->sin_port;
+        switch (sa1->sa_family) {
+        case AF_INET:
+                return nfs_sockaddr_match_ipaddr4(sa1, sa2);
+        case AF_INET6:
+                return nfs_sockaddr_match_ipaddr6(sa1, sa2);
+        }
+        return 0;
 }
 /*
 * Test if two socket addresses represent the same actual socket,
- * by comparing (only) relevant fields.
+ * by comparing (only) relevant fields, including the port number.
 */
 static int nfs_sockaddr_cmp(const struct sockaddr *sa1,
                            const struct sockaddr *sa2)
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 78bf72fc1db3..370b190a09d1 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1624,8 +1624,7 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
                } else if (atomic_read(&new_dentry->d_count) > 1)
                        /* dentry still busy? */
                        goto out;
-        } else
+        }
-                nfs_drop_nlink(new_inode);
 go_ahead:
        /*
@@ -1638,10 +1637,8 @@ go_ahead:
        }
        nfs_inode_return_delegation(old_inode);
-        if (new_inode != NULL) {
+        if (new_inode != NULL)
                nfs_inode_return_delegation(new_inode);
-                d_delete(new_dentry);
-        }
        error = NFS_PROTO(old_dir)->rename(old_dir, &old_dentry->d_name,
                                           new_dir, &new_dentry->d_name);
@@ -1650,6 +1647,8 @@ out:
        if (rehash)
                d_rehash(rehash);
        if (!error) {
+                if (new_inode != NULL)
+                        nfs_drop_nlink(new_inode);
                d_move(old_dentry, new_dentry);
                nfs_set_verifier(new_dentry,
                                        nfs_save_change_attribute(new_dir));
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 90f292b520d2..0abf3f331f56 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -64,11 +64,7 @@ const struct file_operations nfs_file_operations = {
        .write          = do_sync_write,
        .aio_read       = nfs_file_read,
        .aio_write      = nfs_file_write,
-#ifdef CONFIG_MMU
        .mmap           = nfs_file_mmap,
-#else
-        .mmap           = generic_file_mmap,
-#endif
        .open           = nfs_file_open,
        .flush          = nfs_file_flush,
        .release        = nfs_file_release,
@@ -141,9 +137,6 @@ nfs_file_release(struct inode *inode, struct file *filp)
                        dentry->d_parent->d_name.name,
                        dentry->d_name.name);
-        /* Ensure that dirty pages are flushed out with the right creds */
-        if (filp->f_mode & FMODE_WRITE)
-                nfs_wb_all(dentry->d_inode);
        nfs_inc_stats(inode, NFSIOS_VFSRELEASE);
        return nfs_release(inode, filp);
 }
@@ -235,7 +228,6 @@ nfs_file_flush(struct file *file, fl_owner_t id)
        struct nfs_open_context *ctx = nfs_file_open_context(file);
        struct dentry   *dentry = file->f_path.dentry;
        struct inode    *inode = dentry->d_inode;
-        int             status;
        dprintk("NFS: flush(%s/%s)\n",
                        dentry->d_parent->d_name.name,
@@ -245,11 +237,8 @@ nfs_file_flush(struct file *file, fl_owner_t id)
                return 0;
        nfs_inc_stats(inode, NFSIOS_VFSFLUSH);
-        /* Ensure that data+attribute caches are up to date after close() */
+        /* Flush writes to the server and return any errors */
-        status = nfs_do_fsync(ctx, inode);
+        return nfs_do_fsync(ctx, inode);
-        if (!status)
-                nfs_revalidate_inode(NFS_SERVER(inode), inode);
-        return status;
 }
 static ssize_t
@@ -304,11 +293,13 @@ nfs_file_mmap(struct file * file, struct vm_area_struct * vma)
        dprintk("NFS: mmap(%s/%s)\n",
                dentry->d_parent->d_name.name, dentry->d_name.name);
-        status = nfs_revalidate_mapping(inode, file->f_mapping);
+        /* Note: generic_file_mmap() returns ENOSYS on nommu systems
+         *       so we call that before revalidating the mapping
+         */
+        status = generic_file_mmap(file, vma);
        if (!status) {
                vma->vm_ops = &nfs_file_vm_ops;
-                vma->vm_flags |= VM_CAN_NONLINEAR;
+                status = nfs_revalidate_mapping(inode, file->f_mapping);
-                file_accessed(file);
        }
        return status;
 }
@@ -354,6 +345,15 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
                file->f_path.dentry->d_name.name,
                mapping->host->i_ino, len, (long long) pos);
+        /*
+         * Prevent starvation issues if someone is doing a consistency
+         * sync-to-disk
+         */
+        ret = wait_on_bit(&NFS_I(mapping->host)->flags, NFS_INO_FLUSHING,
+                        nfs_wait_bit_killable, TASK_KILLABLE);
+        if (ret)
+                return ret;
        page = grab_cache_page_write_begin(mapping, index, flags);
        if (!page)
                return -ENOMEM;
@@ -451,8 +451,9 @@ const struct address_space_operations nfs_file_aops = {
        .launder_page = nfs_launder_page,
 };
-static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
+        struct page *page = vmf->page;
        struct file *filp = vma->vm_file;
        struct dentry *dentry = filp->f_path.dentry;
        unsigned pagelen;
@@ -483,6 +484,8 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page)
                ret = pagelen;
 out_unlock:
        unlock_page(page);
+        if (ret)
+                ret = VM_FAULT_SIGBUS;
        return ret;
 }
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index b7c9b2df1f29..46177cb87064 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -156,7 +156,7 @@ int nfs4_path_walk(struct nfs_server *server,
                return ret;
        }
-        if (fattr.type != NFDIR) {
+        if (!S_ISDIR(fattr.mode)) {
                printk(KERN_ERR "nfs4_get_root:"
                       " getroot encountered non-directory\n");
                return -ENOTDIR;
@@ -213,7 +213,7 @@ eat_dot_dir:
                return ret;
        }
-        if (fattr.type != NFDIR) {
+        if (!S_ISDIR(fattr.mode)) {
                printk(KERN_ERR "nfs4_get_root:"
                       " lookupfh encountered non-directory\n");
                return -ENOTDIR;
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 0c381686171e..a834d1d850b7 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -66,6 +66,18 @@ nfs_fattr_to_ino_t(struct nfs_fattr *fattr)
 }
 /**
+ * nfs_wait_bit_killable - helper for functions that are sleeping on bit locks
+ * @word: long word containing the bit lock (not examined by this helper)
+ *
+ * Returns -ERESTARTSYS without sleeping if a fatal signal is pending for
+ * the current task; otherwise calls schedule() once and returns 0.
+ */
+int nfs_wait_bit_killable(void *word)
+{
+        if (fatal_signal_pending(current))
+                return -ERESTARTSYS;
+        schedule();
+        return 0;
+}
+/**
 * nfs_compat_user_ino64 - returns the user-visible inode number
 * @fileid: 64-bit fileid
 *
@@ -249,13 +261,10 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
        struct inode *inode = ERR_PTR(-ENOENT);
        unsigned long hash;
-        if ((fattr->valid & NFS_ATTR_FATTR) == 0)
+        if ((fattr->valid & NFS_ATTR_FATTR_FILEID) == 0)
                goto out_no_inode;
+        if ((fattr->valid & NFS_ATTR_FATTR_TYPE) == 0)
-        if (!fattr->nlink) {
-                printk("NFS: Buggy server - nlink == 0!\n");
                goto out_no_inode;
-        }
        hash = nfs_fattr_to_ino_t(fattr);
@@ -291,7 +300,8 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
                            && fattr->size <= NFS_LIMIT_READDIRPLUS)
                                set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
                        /* Deal with crossing mountpoints */
-                        if (!nfs_fsid_equal(&NFS_SB(sb)->fsid, &fattr->fsid)) {
+                        if ((fattr->valid & NFS_ATTR_FATTR_FSID)
+                                        && !nfs_fsid_equal(&NFS_SB(sb)->fsid, &fattr->fsid)) {
                                if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL)
                                        inode->i_op = &nfs_referral_inode_operations;
                                else
@@ -304,28 +314,45 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
                else
                        init_special_inode(inode, inode->i_mode, fattr->rdev);
+                memset(&inode->i_atime, 0, sizeof(inode->i_atime));
+                memset(&inode->i_mtime, 0, sizeof(inode->i_mtime));
+                memset(&inode->i_ctime, 0, sizeof(inode->i_ctime));
+                nfsi->change_attr = 0;
+                inode->i_size = 0;
+                inode->i_nlink = 0;
+                inode->i_uid = -2;
+                inode->i_gid = -2;
+                inode->i_blocks = 0;
+                memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
                nfsi->read_cache_jiffies = fattr->time_start;
                nfsi->attr_gencount = fattr->gencount;
-                inode->i_atime = fattr->atime;
+                if (fattr->valid & NFS_ATTR_FATTR_ATIME)
-                inode->i_mtime = fattr->mtime;
+                        inode->i_atime = fattr->atime;
-                inode->i_ctime = fattr->ctime;
+                if (fattr->valid & NFS_ATTR_FATTR_MTIME)
-                if (fattr->valid & NFS_ATTR_FATTR_V4)
+                        inode->i_mtime = fattr->mtime;
+                if (fattr->valid & NFS_ATTR_FATTR_CTIME)
+                        inode->i_ctime = fattr->ctime;
+                if (fattr->valid & NFS_ATTR_FATTR_CHANGE)
                        nfsi->change_attr = fattr->change_attr;
-                inode->i_size = nfs_size_to_loff_t(fattr->size);
+                if (fattr->valid & NFS_ATTR_FATTR_SIZE)
-                inode->i_nlink = fattr->nlink;
+                        inode->i_size = nfs_size_to_loff_t(fattr->size);
-                inode->i_uid = fattr->uid;
+                if (fattr->valid & NFS_ATTR_FATTR_NLINK)
-                inode->i_gid = fattr->gid;
+                        inode->i_nlink = fattr->nlink;
-                if (fattr->valid & (NFS_ATTR_FATTR_V3 | NFS_ATTR_FATTR_V4)) {
+                if (fattr->valid & NFS_ATTR_FATTR_OWNER)
+                        inode->i_uid = fattr->uid;
+                if (fattr->valid & NFS_ATTR_FATTR_GROUP)
+                        inode->i_gid = fattr->gid;
+                if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED)
+                        inode->i_blocks = fattr->du.nfs2.blocks;
+                if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) {
                        /*
                         * report the blocks in 512byte units
                         */
                        inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used);
-                } else {
-                        inode->i_blocks = fattr->du.nfs2.blocks;
                }
                nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
                nfsi->attrtimeo_timestamp = now;
-                memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
                nfsi->access_cache = RB_ROOT;
                unlock_new_inode(inode);
@@ -514,6 +541,32 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
        return err;
 }
+/**
+ * nfs_close_context - Common close_context() routine NFSv2/v3
+ * @ctx: pointer to context
+ * @is_sync: is this a synchronous close
+ *
+ * always ensure that the attributes are up to date if we're mounted
+ * with close-to-open semantics
+ *
+ * The inode is revalidated only when all of the following hold: the
+ * context mode has FMODE_WRITE set, @is_sync is non-zero, the inode's
+ * open_files list is empty, and NFS_MOUNT_NOCTO is not set in the server
+ * flags. In every other case the function returns without doing anything.
+ */
+void nfs_close_context(struct nfs_open_context *ctx, int is_sync)
+{
+        struct inode *inode;
+        struct nfs_server *server;
+        if (!(ctx->mode & FMODE_WRITE))
+                return;
+        if (!is_sync)
+                return;
+        inode = ctx->path.dentry->d_inode;
+        if (!list_empty(&NFS_I(inode)->open_files))
+                return;
+        server = NFS_SERVER(inode);
+        if (server->flags & NFS_MOUNT_NOCTO)
+                return;
+        nfs_revalidate_inode(server, inode);
+}
 static struct nfs_open_context *alloc_nfs_open_context(struct vfsmount *mnt, struct dentry *dentry, struct rpc_cred *cred)
 {
        struct nfs_open_context *ctx;
@@ -540,24 +593,15 @@ struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx)
        return ctx;
 }
-static void __put_nfs_open_context(struct nfs_open_context *ctx, int wait)
+static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync)
 {
-        struct inode *inode;
+        struct inode *inode = ctx->path.dentry->d_inode;
-        if (ctx == NULL)
-                return;
-        inode = ctx->path.dentry->d_inode;
        if (!atomic_dec_and_lock(&ctx->count, &inode->i_lock))
                return;
        list_del(&ctx->list);
        spin_unlock(&inode->i_lock);
-        if (ctx->state != NULL) {
+        NFS_PROTO(inode)->close_context(ctx, is_sync);
-                if (wait)
-                        nfs4_close_sync(&ctx->path, ctx->state, ctx->mode);
-                else
-                        nfs4_close_state(&ctx->path, ctx->state, ctx->mode);
-        }
        if (ctx->cred != NULL)
                put_rpccred(ctx->cred);
        path_put(&ctx->path);
@@ -670,9 +714,6 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
        if (NFS_STALE(inode))
                goto out;
-        if (NFS_STALE(inode))
-                goto out;
        nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE);
        status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), &fattr);
        if (status != 0) {
@@ -815,25 +856,31 @@ static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
 {
        struct nfs_inode *nfsi = NFS_I(inode);
-        if ((fattr->valid & NFS_ATTR_WCC_V4) != 0 &&
+        if ((fattr->valid & NFS_ATTR_FATTR_PRECHANGE)
-                        nfsi->change_attr == fattr->pre_change_attr) {
+                        && (fattr->valid & NFS_ATTR_FATTR_CHANGE)
+                        && nfsi->change_attr == fattr->pre_change_attr) {
                nfsi->change_attr = fattr->change_attr;
                if (S_ISDIR(inode->i_mode))
                        nfsi->cache_validity |= NFS_INO_INVALID_DATA;
        }
        /* If we have atomic WCC data, we may update some attributes */
-        if ((fattr->valid & NFS_ATTR_WCC) != 0) {
+        if ((fattr->valid & NFS_ATTR_FATTR_PRECTIME)
-                if (timespec_equal(&inode->i_ctime, &fattr->pre_ctime))
+                        && (fattr->valid & NFS_ATTR_FATTR_CTIME)
+                        && timespec_equal(&inode->i_ctime, &fattr->pre_ctime))
                        memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));
-                if (timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) {
+        if ((fattr->valid & NFS_ATTR_FATTR_PREMTIME)
+                        && (fattr->valid & NFS_ATTR_FATTR_MTIME)
+                        && timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) {
                        memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
                        if (S_ISDIR(inode->i_mode))
                                nfsi->cache_validity |= NFS_INO_INVALID_DATA;
-                }
-                if (i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size) &&
-                    nfsi->npages == 0)
-                        i_size_write(inode, nfs_size_to_loff_t(fattr->size));
        }
+        if ((fattr->valid & NFS_ATTR_FATTR_PRESIZE)
+                        && (fattr->valid & NFS_ATTR_FATTR_SIZE)
+                        && i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size)
+                        && nfsi->npages == 0)
+                        i_size_write(inode, nfs_size_to_loff_t(fattr->size));
 }
 /**
@@ -853,35 +900,39 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
        /* Has the inode gone and changed behind our back? */
-        if (nfsi->fileid != fattr->fileid
+        if ((fattr->valid & NFS_ATTR_FATTR_FILEID) && nfsi->fileid != fattr->fileid)
-                        || (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) {
+                return -EIO;
+        if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT))
                return -EIO;
-        }
-        if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 &&
+        if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 &&
                        nfsi->change_attr != fattr->change_attr)
                invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
        /* Verify a few of the more important attributes */
-        if (!timespec_equal(&inode->i_mtime, &fattr->mtime))
+        if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec_equal(&inode->i_mtime, &fattr->mtime))
                invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
-        cur_size = i_size_read(inode);
+        if (fattr->valid & NFS_ATTR_FATTR_SIZE) {
-        new_isize = nfs_size_to_loff_t(fattr->size);
+                cur_size = i_size_read(inode);
-        if (cur_size != new_isize && nfsi->npages == 0)
+                new_isize = nfs_size_to_loff_t(fattr->size);
-                invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
+                if (cur_size != new_isize && nfsi->npages == 0)
+                        invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
+        }
        /* Have any file permissions changed? */
-        if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)
+        if ((fattr->valid & NFS_ATTR_FATTR_MODE) && (inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO))
-                        || inode->i_uid != fattr->uid
+                invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL;
-                        || inode->i_gid != fattr->gid)
+        if ((fattr->valid & NFS_ATTR_FATTR_OWNER) && inode->i_uid != fattr->uid)
+                invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL;
+        if ((fattr->valid & NFS_ATTR_FATTR_GROUP) && inode->i_gid != fattr->gid)
                invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL;
        /* Has the link count changed? */
-        if (inode->i_nlink != fattr->nlink)
+        if ((fattr->valid & NFS_ATTR_FATTR_NLINK) && inode->i_nlink != fattr->nlink)
                invalid |= NFS_INO_INVALID_ATTR;
-        if (!timespec_equal(&inode->i_atime, &fattr->atime))
+        if ((fattr->valid & NFS_ATTR_FATTR_ATIME) && !timespec_equal(&inode->i_atime, &fattr->atime))
                invalid |= NFS_INO_INVALID_ATIME;
        if (invalid != 0)
@@ -893,11 +944,15 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
 static int nfs_ctime_need_update(const struct inode *inode, const struct nfs_fattr *fattr)
 {
+        if (!(fattr->valid & NFS_ATTR_FATTR_CTIME))        /* fattr carries no ctime: report "no update needed" */
+                return 0;
        return timespec_compare(&fattr->ctime, &inode->i_ctime) > 0;        /* non-zero when fattr's ctime compares greater than the cached i_ctime */
 }
 static int nfs_size_need_update(const struct inode *inode, const struct nfs_fattr *fattr)
 {
+        if (!(fattr->valid & NFS_ATTR_FATTR_SIZE))        /* fattr carries no size: report "no update needed" */
+                return 0;
        return nfs_size_to_loff_t(fattr->size) > i_size_read(inode);        /* non-zero when fattr's size exceeds the cached i_size */
 }
@@ -1033,20 +1088,31 @@ int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fa
        /* Don't do a WCC update if these attributes are already stale */
        if ((fattr->valid & NFS_ATTR_FATTR) == 0 ||
                        !nfs_inode_attrs_need_update(inode, fattr)) {
-                fattr->valid &= ~(NFS_ATTR_WCC_V4|NFS_ATTR_WCC);
+                fattr->valid &= ~(NFS_ATTR_FATTR_PRECHANGE
+                                | NFS_ATTR_FATTR_PRESIZE
+                                | NFS_ATTR_FATTR_PREMTIME
+                                | NFS_ATTR_FATTR_PRECTIME);
                goto out_noforce;
        }
-        if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 &&
+        if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 &&
-                        (fattr->valid & NFS_ATTR_WCC_V4) == 0) {
+                        (fattr->valid & NFS_ATTR_FATTR_PRECHANGE) == 0) {
                fattr->pre_change_attr = NFS_I(inode)->change_attr;
-                fattr->valid |= NFS_ATTR_WCC_V4;
+                fattr->valid |= NFS_ATTR_FATTR_PRECHANGE;
        }
-        if ((fattr->valid & NFS_ATTR_FATTR) != 0 &&
+        if ((fattr->valid & NFS_ATTR_FATTR_CTIME) != 0 &&
-                        (fattr->valid & NFS_ATTR_WCC) == 0) {
+                        (fattr->valid & NFS_ATTR_FATTR_PRECTIME) == 0) {
                memcpy(&fattr->pre_ctime, &inode->i_ctime, sizeof(fattr->pre_ctime));
+                fattr->valid |= NFS_ATTR_FATTR_PRECTIME;
+        }
+        if ((fattr->valid & NFS_ATTR_FATTR_MTIME) != 0 &&
+                        (fattr->valid & NFS_ATTR_FATTR_PREMTIME) == 0) {
                memcpy(&fattr->pre_mtime, &inode->i_mtime, sizeof(fattr->pre_mtime));
+                fattr->valid |= NFS_ATTR_FATTR_PREMTIME;
+        }
+        if ((fattr->valid & NFS_ATTR_FATTR_SIZE) != 0 &&
+                        (fattr->valid & NFS_ATTR_FATTR_PRESIZE) == 0) {
                fattr->pre_size = i_size_read(inode);
-                fattr->valid |= NFS_ATTR_WCC;
+                fattr->valid |= NFS_ATTR_FATTR_PRESIZE;
        }
 out_noforce:
        status = nfs_post_op_update_inode_locked(inode, fattr);
@@ -1078,18 +1144,18 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
                        __func__, inode->i_sb->s_id, inode->i_ino,
                        atomic_read(&inode->i_count), fattr->valid);
-        if (nfsi->fileid != fattr->fileid)
+        if ((fattr->valid & NFS_ATTR_FATTR_FILEID) && nfsi->fileid != fattr->fileid)
                goto out_fileid;
        /*
         * Make sure the inode's type hasn't changed.
         */
-        if ((inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT))
+        if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT))
                goto out_changed;
        server = NFS_SERVER(inode);
        /* Update the fsid? */
-        if (S_ISDIR(inode->i_mode) &&
+        if (S_ISDIR(inode->i_mode) && (fattr->valid & NFS_ATTR_FATTR_FSID) &&
                        !nfs_fsid_equal(&server->fsid, &fattr->fsid) &&
                        !test_bit(NFS_INO_MOUNTPOINT, &nfsi->flags))
                server->fsid = fattr->fsid;
@@ -1099,14 +1165,27 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
         */
        nfsi->read_cache_jiffies = fattr->time_start;
-        nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ATIME
+        if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) || (fattr->valid & (NFS_ATTR_FATTR_MTIME|NFS_ATTR_FATTR_CTIME)))
-                        | NFS_INO_REVAL_PAGECACHE);
+            nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR
+                    | NFS_INO_INVALID_ATIME
+                    | NFS_INO_REVAL_PAGECACHE);
        /* Do atomic weak cache consistency updates */
        nfs_wcc_update_inode(inode, fattr);
        /* More cache consistency checks */
-        if (!(fattr->valid & NFS_ATTR_FATTR_V4)) {
+        if (fattr->valid & NFS_ATTR_FATTR_CHANGE) {
+                if (nfsi->change_attr != fattr->change_attr) {
+                        dprintk("NFS: change_attr change on server for file %s/%ld\n",
+                                        inode->i_sb->s_id, inode->i_ino);
+                        invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
+                        if (S_ISDIR(inode->i_mode))
+                                nfs_force_lookup_revalidate(inode);
+                        nfsi->change_attr = fattr->change_attr;
+                }
+        }
+        if (fattr->valid & NFS_ATTR_FATTR_MTIME) {
                /* NFSv2/v3: Check if the mtime agrees */
                if (!timespec_equal(&inode->i_mtime, &fattr->mtime)) {
                        dprintk("NFS: mtime change on server for file %s/%ld\n",
@@ -1114,59 +1193,80 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
                        invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
                        if (S_ISDIR(inode->i_mode))
                                nfs_force_lookup_revalidate(inode);
+                        memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
                }
+        }
+        if (fattr->valid & NFS_ATTR_FATTR_CTIME) {
                /* If ctime has changed we should definitely clear access+acl caches */
-                if (!timespec_equal(&inode->i_ctime, &fattr->ctime))
+                if (!timespec_equal(&inode->i_ctime, &fattr->ctime)) {
                        invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
-        } else if (nfsi->change_attr != fattr->change_attr) {
+                        /* and probably clear data for a directory too as utimes can cause
-                dprintk("NFS: change_attr change on server for file %s/%ld\n",
+                         * havoc with our cache.
-                                inode->i_sb->s_id, inode->i_ino);
+                         */
-                invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
+                        if (S_ISDIR(inode->i_mode)) {
-                if (S_ISDIR(inode->i_mode))
+                                invalid |= NFS_INO_INVALID_DATA;
-                        nfs_force_lookup_revalidate(inode);
+                                nfs_force_lookup_revalidate(inode);
+                        }
+                        memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));
+                }
        }
        /* Check if our cached file size is stale */
-        new_isize = nfs_size_to_loff_t(fattr->size);
+        if (fattr->valid & NFS_ATTR_FATTR_SIZE) {
-        cur_isize = i_size_read(inode);
+                new_isize = nfs_size_to_loff_t(fattr->size);
-        if (new_isize != cur_isize) {
+                cur_isize = i_size_read(inode);
-                /* Do we perhaps have any outstanding writes, or has
+                if (new_isize != cur_isize) {
-                 * the file grown beyond our last write? */
+                        /* Do we perhaps have any outstanding writes, or has
-                if (nfsi->npages == 0 || new_isize > cur_isize) {
+                         * the file grown beyond our last write? */
-                        i_size_write(inode, new_isize);
+                        if (nfsi->npages == 0 || new_isize > cur_isize) {
-                        invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
+                                i_size_write(inode, new_isize);
+                                invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
+                        }
+                        dprintk("NFS: isize change on server for file %s/%ld\n",
+                                        inode->i_sb->s_id, inode->i_ino);
                }
-                dprintk("NFS: isize change on server for file %s/%ld\n",
-                                inode->i_sb->s_id, inode->i_ino);
        }
-        memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
+        if (fattr->valid & NFS_ATTR_FATTR_ATIME)
-        memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));
+                memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime));
-        memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime));
-        nfsi->change_attr = fattr->change_attr;
-        if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO) ||
-            inode->i_uid != fattr->uid ||
-            inode->i_gid != fattr->gid)
-                invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
-        if (inode->i_nlink != fattr->nlink)
+        if (fattr->valid & NFS_ATTR_FATTR_MODE) {
-                invalid |= NFS_INO_INVALID_ATTR;
+                if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) {
+                        invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
+                        inode->i_mode = fattr->mode;
+                }
+        }
+        if (fattr->valid & NFS_ATTR_FATTR_OWNER) {
+                if (inode->i_uid != fattr->uid) {
+                        invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
+                        inode->i_uid = fattr->uid;
+                }
+        }
+        if (fattr->valid & NFS_ATTR_FATTR_GROUP) {
+                if (inode->i_gid != fattr->gid) {
+                        invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
+                        inode->i_gid = fattr->gid;
+                }
+        }
-        inode->i_mode = fattr->mode;
+        if (fattr->valid & NFS_ATTR_FATTR_NLINK) {
-        inode->i_nlink = fattr->nlink;
+                if (inode->i_nlink != fattr->nlink) {
-        inode->i_uid = fattr->uid;
+                        invalid |= NFS_INO_INVALID_ATTR;
-        inode->i_gid = fattr->gid;
+                        if (S_ISDIR(inode->i_mode))
+                                invalid |= NFS_INO_INVALID_DATA;
+                        inode->i_nlink = fattr->nlink;
+                }
+        }
-        if (fattr->valid & (NFS_ATTR_FATTR_V3 | NFS_ATTR_FATTR_V4)) {
+        if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) {
                /*
                 * report the blocks in 512byte units
                 */
                inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used);
-        } else {
-                inode->i_blocks = fattr->du.nfs2.blocks;
        }
+        if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED)
+                inode->i_blocks = fattr->du.nfs2.blocks;
        /* Update attrtimeo value if we're out of the unstable period */
        if (invalid & NFS_INO_INVALID_ATTR) {
@@ -1274,7 +1374,6 @@ static void init_once(void *foo)
        INIT_LIST_HEAD(&nfsi->access_cache_entry_lru);
        INIT_LIST_HEAD(&nfsi->access_cache_inode_lru);
        INIT_RADIX_TREE(&nfsi->nfs_page_tree, GFP_ATOMIC);
-        nfsi->ncommit = 0;
        nfsi->npages = 0;
        atomic_set(&nfsi->silly_count, 1);
        INIT_HLIST_HEAD(&nfsi->silly_list);
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 340ede8f608f..2041f68ff1cc 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -152,6 +152,9 @@ extern __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus);
 extern struct rpc_procinfo nfs4_procedures[];
 #endif
+/* proc.c */
+void nfs_close_context(struct nfs_open_context *ctx, int is_sync);
 /* dir.c */
 extern int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask);
@@ -165,6 +168,7 @@ extern void nfs_clear_inode(struct inode *);
 extern void nfs4_clear_inode(struct inode *);
 #endif
 void nfs_zap_acl_cache(struct inode *inode);
+extern int nfs_wait_bit_killable(void *word);
 /* super.c */
 void nfs_parse_ip_address(char *, size_t, struct sockaddr *, size_t *);
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 28bab67d1519..c862c9340f9a 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -120,8 +120,8 @@ xdr_decode_time(__be32 *p, struct timespec *timep)
 static __be32 *
 xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr)
 {
-        u32 rdev;
+        u32 rdev, type;
-        fattr->type = (enum nfs_ftype) ntohl(*p++);
+        type = ntohl(*p++);
        fattr->mode = ntohl(*p++);
        fattr->nlink = ntohl(*p++);
        fattr->uid = ntohl(*p++);
@@ -136,10 +136,9 @@ xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr)
        p = xdr_decode_time(p, &fattr->atime);
        p = xdr_decode_time(p, &fattr->mtime);
        p = xdr_decode_time(p, &fattr->ctime);
-        fattr->valid |= NFS_ATTR_FATTR;
+        fattr->valid |= NFS_ATTR_FATTR_V2;
        fattr->rdev = new_decode_dev(rdev);
-        if (fattr->type == NFCHR && rdev == NFS2_FIFO_DEV) {
+        if (type == NFCHR && rdev == NFS2_FIFO_DEV) {
-                fattr->type = NFFIFO;
                fattr->mode = (fattr->mode & ~S_IFMT) | S_IFIFO;
                fattr->rdev = 0;
        }
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index c55be7a7679e..b82fe6847f14 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -834,4 +834,5 @@ const struct nfs_rpc_ops nfs_v3_clientops = {
        .commit_done    = nfs3_commit_done,
        .lock           = nfs3_proc_lock,
        .clear_acl_cache = nfs3_forget_cached_acls,
+        .close_context  = nfs_close_context,
 };
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 6cdeacffde46..e6a1932c7110 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -91,19 +91,15 @@
 /*
 * Map file type to S_IFMT bits
 */
-static struct {
+static const umode_t nfs_type2fmt[] = {
-        unsigned int    mode;
+        [NF3BAD] = 0,
-        unsigned int    nfs2type;
+        [NF3REG] = S_IFREG,
-} nfs_type2fmt[] = {
+        [NF3DIR] = S_IFDIR,
-      { 0,              NFNON   },
+        [NF3BLK] = S_IFBLK,
-      { S_IFREG,        NFREG   },
+        [NF3CHR] = S_IFCHR,
-      { S_IFDIR,        NFDIR   },
+        [NF3LNK] = S_IFLNK,
-      { S_IFBLK,        NFBLK   },
+        [NF3SOCK] = S_IFSOCK,
-      { S_IFCHR,        NFCHR   },
+        [NF3FIFO] = S_IFIFO,
-      { S_IFLNK,        NFLNK   },
-      { S_IFSOCK,       NFSOCK  },
-      { S_IFIFO,        NFFIFO  },
-      { 0,              NFBAD   }
 };
 /*
@@ -148,13 +144,12 @@ static __be32 *
 xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr)
 {
        unsigned int    type, major, minor;
-        int             fmode;
+        umode_t         fmode;
        type = ntohl(*p++);
-        if (type >= NF3BAD)
+        if (type > NF3FIFO)
-                type = NF3BAD;
+                type = NF3NON;
-        fmode = nfs_type2fmt[type].mode;
+        fmode = nfs_type2fmt[type];
-        fattr->type = nfs_type2fmt[type].nfs2type;
        fattr->mode = (ntohl(*p++) & ~S_IFMT) | fmode;
        fattr->nlink = ntohl(*p++);
        fattr->uid = ntohl(*p++);
@@ -177,7 +172,7 @@ xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr)
        p = xdr_decode_time3(p, &fattr->ctime);
        /* Update the mode bits */
-        fattr->valid |= (NFS_ATTR_FATTR | NFS_ATTR_FATTR_V3);
+        fattr->valid |= NFS_ATTR_FATTR_V3;
        return p;
 }
@@ -233,7 +228,9 @@ xdr_decode_wcc_attr(__be32 *p, struct nfs_fattr *fattr)
        p = xdr_decode_hyper(p, &fattr->pre_size);
        p = xdr_decode_time3(p, &fattr->pre_mtime);
        p = xdr_decode_time3(p, &fattr->pre_ctime);
-        fattr->valid |= NFS_ATTR_WCC;
+        fattr->valid |= NFS_ATTR_FATTR_PRESIZE
+                | NFS_ATTR_FATTR_PREMTIME
+                | NFS_ATTR_FATTR_PRECTIME;
        return p;
 }
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 8dde84b988d9..97bacccff579 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -193,14 +193,6 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent
        kunmap_atomic(start, KM_USER0);
 }
-static int nfs4_wait_bit_killable(void *word)
-{
-        if (fatal_signal_pending(current))
-                return -ERESTARTSYS;
-        schedule();
-        return 0;
-}
 static int nfs4_wait_clnt_recover(struct nfs_client *clp)
 {
        int res;
@@ -208,7 +200,7 @@ static int nfs4_wait_clnt_recover(struct nfs_client *clp)
        might_sleep();
        res = wait_on_bit(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING,
-                        nfs4_wait_bit_killable, TASK_KILLABLE);
+                        nfs_wait_bit_killable, TASK_KILLABLE);
        return res;
 }
@@ -1439,7 +1431,7 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait)
        if (calldata->arg.seqid == NULL)
                goto out_free_calldata;
        calldata->arg.fmode = 0;
-        calldata->arg.bitmask = server->attr_bitmask;
+        calldata->arg.bitmask = server->cache_consistency_bitmask;
        calldata->res.fattr = &calldata->fattr;
        calldata->res.seqid = calldata->arg.seqid;
        calldata->res.server = server;
@@ -1580,6 +1572,15 @@ out_drop:
        return 0;
 }
+void nfs4_close_context(struct nfs_open_context *ctx, int is_sync)
+{
+        if (ctx->state == NULL)
+                return;
+        if (is_sync)
+                nfs4_close_sync(&ctx->path, ctx->state, ctx->mode);
+        else
+                nfs4_close_state(&ctx->path, ctx->state, ctx->mode);
+}
 static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle)
 {
@@ -1600,6 +1601,9 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
                        server->caps |= NFS_CAP_HARDLINKS;
                if (res.has_symlinks != 0)
                        server->caps |= NFS_CAP_SYMLINKS;
+                memcpy(server->cache_consistency_bitmask, res.attr_bitmask, sizeof(server->cache_consistency_bitmask));
+                server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE;
+                server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY;
                server->acl_bitmask = res.acl_bitmask;
        }
        return status;
@@ -2079,7 +2083,7 @@ static void nfs4_proc_unlink_setup(struct rpc_message *msg, struct inode *dir)
        struct nfs_removeargs *args = msg->rpc_argp;
        struct nfs_removeres *res = msg->rpc_resp;
-        args->bitmask = server->attr_bitmask;
+        args->bitmask = server->cache_consistency_bitmask;
        res->server = server;
        msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE];
 }
@@ -2323,7 +2327,7 @@ static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
                .pages = &page,
                .pgbase = 0,
                .count = count,
-                .bitmask = NFS_SERVER(dentry->d_inode)->attr_bitmask,
+                .bitmask = NFS_SERVER(dentry->d_inode)->cache_consistency_bitmask,
        };
        struct nfs4_readdir_res res;
        struct rpc_message msg = {
@@ -2552,7 +2556,7 @@ static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_messag
 {
        struct nfs_server *server = NFS_SERVER(data->inode);
-        data->args.bitmask = server->attr_bitmask;
+        data->args.bitmask = server->cache_consistency_bitmask;
        data->res.server = server;
        data->timestamp   = jiffies;
@@ -2575,7 +2579,7 @@ static void nfs4_proc_commit_setup(struct nfs_write_data *data, struct rpc_messa
 {
        struct nfs_server *server = NFS_SERVER(data->inode);
        
-        data->args.bitmask = server->attr_bitmask;
+        data->args.bitmask = server->cache_consistency_bitmask;
        data->res.server = server;
        msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT];
 }
@@ -3678,6 +3682,19 @@ ssize_t nfs4_listxattr(struct dentry *dentry, char *buf, size_t buflen)
        return len;
 }
+static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr)
+{
+        if (!((fattr->valid & NFS_ATTR_FATTR_FILEID) &&
+                (fattr->valid & NFS_ATTR_FATTR_FSID) &&
+                (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL)))
+                return;
+        fattr->valid |= NFS_ATTR_FATTR_TYPE | NFS_ATTR_FATTR_MODE |
+                NFS_ATTR_FATTR_NLINK;
+        fattr->mode = S_IFDIR | S_IRUGO | S_IXUGO;
+        fattr->nlink = 2;
+}
 int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
                struct nfs4_fs_locations *fs_locations, struct page *page)
 {
@@ -3704,6 +3721,7 @@ int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
        fs_locations->server = server;
        fs_locations->nlocations = 0;
        status = rpc_call_sync(server->client, &msg, 0);
+        nfs_fixup_referral_attributes(&fs_locations->fattr);
        dprintk("%s: returned status = %d\n", __func__, status);
        return status;
 }
@@ -3767,6 +3785,7 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
        .commit_done    = nfs4_commit_done,
        .lock           = nfs4_proc_lock,
        .clear_acl_cache = nfs4_zap_acl_attr,
+        .close_context  = nfs4_close_context,
 };
 /*
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 2022fe47966f..0298e909559f 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -62,8 +62,14 @@ static LIST_HEAD(nfs4_clientid_list);
 static int nfs4_init_client(struct nfs_client *clp, struct rpc_cred *cred)
 {
-        int status = nfs4_proc_setclientid(clp, NFS4_CALLBACK,
+        unsigned short port;
-                        nfs_callback_tcpport, cred);
+        int status;
+        port = nfs_callback_tcpport;
+        if (clp->cl_addr.ss_family == AF_INET6)
+                port = nfs_callback_tcpport6;
+        status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, port, cred);
        if (status == 0)
                status = nfs4_proc_setclientid_confirm(clp, cred);
        if (status == 0)
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index d1e4c8f8a0a9..1690f0e44b91 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -522,20 +522,17 @@ static int nfs4_stat_to_errno(int);
                                 decode_lookup_maxsz + \
                                 decode_fs_locations_maxsz)
-static struct {
+static const umode_t nfs_type2fmt[] = {
-        unsigned int    mode;
+        [NF4BAD] = 0,
-        unsigned int    nfs2type;
+        [NF4REG] = S_IFREG,
-} nfs_type2fmt[] = {
+        [NF4DIR] = S_IFDIR,
-        { 0,            NFNON        },
+        [NF4BLK] = S_IFBLK,
-        { S_IFREG,      NFREG        },
+        [NF4CHR] = S_IFCHR,
-        { S_IFDIR,      NFDIR        },
+        [NF4LNK] = S_IFLNK,
-        { S_IFBLK,      NFBLK        },
+        [NF4SOCK] = S_IFSOCK,
-        { S_IFCHR,      NFCHR        },
+        [NF4FIFO] = S_IFIFO,
-        { S_IFLNK,      NFLNK        },
+        [NF4ATTRDIR] = 0,
-        { S_IFSOCK,     NFSOCK       },
+        [NF4NAMEDATTR] = 0,
-        { S_IFIFO,      NFFIFO       },
-        { 0,            NFNON        },
-        { 0,            NFNON        },
 };
 struct compound_hdr {
@@ -2160,6 +2157,7 @@ static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint3
 static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *type)
 {
        __be32 *p;
+        int ret = 0;
        *type = 0;
        if (unlikely(bitmap[0] & (FATTR4_WORD0_TYPE - 1U)))
@@ -2172,14 +2170,16 @@ static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *
                        return -EIO;
                }
                bitmap[0] &= ~FATTR4_WORD0_TYPE;
+                ret = NFS_ATTR_FATTR_TYPE;
        }
-        dprintk("%s: type=0%o\n", __func__, nfs_type2fmt[*type].nfs2type);
+        dprintk("%s: type=0%o\n", __func__, nfs_type2fmt[*type]);
-        return 0;
+        return ret;
 }
 static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *change)
 {
        __be32 *p;
+        int ret = 0;
        *change = 0;
        if (unlikely(bitmap[0] & (FATTR4_WORD0_CHANGE - 1U)))
@@ -2188,15 +2188,17 @@ static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t
                READ_BUF(8);
                READ64(*change);
                bitmap[0] &= ~FATTR4_WORD0_CHANGE;
+                ret = NFS_ATTR_FATTR_CHANGE;
        }
        dprintk("%s: change attribute=%Lu\n", __func__,
                        (unsigned long long)*change);
-        return 0;
+        return ret;
 }
 static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *size)
 {
        __be32 *p;
+        int ret = 0;
        *size = 0;
        if (unlikely(bitmap[0] & (FATTR4_WORD0_SIZE - 1U)))
@@ -2205,9 +2207,10 @@ static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *
                READ_BUF(8);
                READ64(*size);
                bitmap[0] &= ~FATTR4_WORD0_SIZE;
+                ret = NFS_ATTR_FATTR_SIZE;
        }
        dprintk("%s: file size=%Lu\n", __func__, (unsigned long long)*size);
-        return 0;
+        return ret;
 }
 static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -2245,6 +2248,7 @@ static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap,
 static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fsid *fsid)
 {
        __be32 *p;
+        int ret = 0;
        fsid->major = 0;
        fsid->minor = 0;
@@ -2255,11 +2259,12 @@ static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs
                READ64(fsid->major);
                READ64(fsid->minor);
                bitmap[0] &= ~FATTR4_WORD0_FSID;
+                ret = NFS_ATTR_FATTR_FSID;
        }
        dprintk("%s: fsid=(0x%Lx/0x%Lx)\n", __func__,
                        (unsigned long long)fsid->major,
                        (unsigned long long)fsid->minor);
-        return 0;
+        return ret;
 }
 static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -2297,6 +2302,7 @@ static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint
 static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid)
 {
        __be32 *p;
+        int ret = 0;
        *fileid = 0;
        if (unlikely(bitmap[0] & (FATTR4_WORD0_FILEID - 1U)))
@@ -2305,14 +2311,16 @@ static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t
                READ_BUF(8);
                READ64(*fileid);
                bitmap[0] &= ~FATTR4_WORD0_FILEID;
+                ret = NFS_ATTR_FATTR_FILEID;
        }
        dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid);
-        return 0;
+        return ret;
 }
 static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid)
 {
        __be32 *p;
+        int ret = 0;
        *fileid = 0;
        if (unlikely(bitmap[1] & (FATTR4_WORD1_MOUNTED_ON_FILEID - 1U)))
@@ -2321,9 +2329,10 @@ static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitma
                READ_BUF(8);
                READ64(*fileid);
                bitmap[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID;
+                ret = NFS_ATTR_FATTR_FILEID;
        }
        dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid);
-        return 0;
+        return ret;
 }
 static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -2479,6 +2488,8 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
                if (res->nlocations < NFS4_FS_LOCATIONS_MAXENTRIES)
                        res->nlocations++;
        }
+        if (res->nlocations != 0)
+                status = NFS_ATTR_FATTR_V4_REFERRAL;
 out:
        dprintk("%s: fs_locations done, error = %d\n", __func__, status);
        return status;
@@ -2580,26 +2591,30 @@ static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32
        return status;
 }
-static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *mode)
+static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *mode)
 {
+        uint32_t tmp;
        __be32 *p;
+        int ret = 0;
        *mode = 0;
        if (unlikely(bitmap[1] & (FATTR4_WORD1_MODE - 1U)))
                return -EIO;
        if (likely(bitmap[1] & FATTR4_WORD1_MODE)) {
                READ_BUF(4);
-                READ32(*mode);
+                READ32(tmp);
-                *mode &= ~S_IFMT;
+                *mode = tmp & ~S_IFMT;
                bitmap[1] &= ~FATTR4_WORD1_MODE;
+                ret = NFS_ATTR_FATTR_MODE;
        }
        dprintk("%s: file mode=0%o\n", __func__, (unsigned int)*mode);
-        return 0;
+        return ret;
 }
 static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *nlink)
 {
        __be32 *p;
+        int ret = 0;
        *nlink = 1;
        if (unlikely(bitmap[1] & (FATTR4_WORD1_NUMLINKS - 1U)))
@@ -2608,15 +2623,17 @@ static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t
                READ_BUF(4);
                READ32(*nlink);
                bitmap[1] &= ~FATTR4_WORD1_NUMLINKS;
+                ret = NFS_ATTR_FATTR_NLINK;
        }
        dprintk("%s: nlink=%u\n", __func__, (unsigned int)*nlink);
-        return 0;
+        return ret;
 }
 static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, uint32_t *uid)
 {
        uint32_t len;
        __be32 *p;
+        int ret = 0;
        *uid = -2;
        if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER - 1U)))
@@ -2626,7 +2643,9 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
                READ32(len);
                READ_BUF(len);
                if (len < XDR_MAX_NETOBJ) {
-                        if (nfs_map_name_to_uid(clp, (char *)p, len, uid) != 0)
+                        if (nfs_map_name_to_uid(clp, (char *)p, len, uid) == 0)
+                                ret = NFS_ATTR_FATTR_OWNER;
+                        else
                                dprintk("%s: nfs_map_name_to_uid failed!\n",
                                                __func__);
                } else
@@ -2635,13 +2654,14 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
                bitmap[1] &= ~FATTR4_WORD1_OWNER;
        }
        dprintk("%s: uid=%d\n", __func__, (int)*uid);
-        return 0;
+        return ret;
 }
 static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, uint32_t *gid)
 {
        uint32_t len;
        __be32 *p;
+        int ret = 0;
        *gid = -2;
        if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER_GROUP - 1U)))
@@ -2651,7 +2671,9 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
                READ32(len);
                READ_BUF(len);
                if (len < XDR_MAX_NETOBJ) {
-                        if (nfs_map_group_to_gid(clp, (char *)p, len, gid) != 0)
+                        if (nfs_map_group_to_gid(clp, (char *)p, len, gid) == 0)
+                                ret = NFS_ATTR_FATTR_GROUP;
+                        else
                                dprintk("%s: nfs_map_group_to_gid failed!\n",
                                                __func__);
                } else
@@ -2660,13 +2682,14 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
                bitmap[1] &= ~FATTR4_WORD1_OWNER_GROUP;
        }
        dprintk("%s: gid=%d\n", __func__, (int)*gid);
-        return 0;
+        return ret;
 }
 static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rdev)
 {
        uint32_t major = 0, minor = 0;
        __be32 *p;
+        int ret = 0;
        *rdev = MKDEV(0,0);
        if (unlikely(bitmap[1] & (FATTR4_WORD1_RAWDEV - 1U)))
@@ -2681,9 +2704,10 @@ static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rde
                if (MAJOR(tmp) == major && MINOR(tmp) == minor)
                        *rdev = tmp;
                bitmap[1] &= ~ FATTR4_WORD1_RAWDEV;
+                ret = NFS_ATTR_FATTR_RDEV;
        }
        dprintk("%s: rdev=(0x%x:0x%x)\n", __func__, major, minor);
-        return 0;
+        return ret;
 }
 static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -2740,6 +2764,7 @@ static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uin
 static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *used)
 {
        __be32 *p;
+        int ret = 0;
        *used = 0;
        if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_USED - 1U)))
@@ -2748,10 +2773,11 @@ static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint
                READ_BUF(8);
                READ64(*used);
                bitmap[1] &= ~FATTR4_WORD1_SPACE_USED;
+                ret = NFS_ATTR_FATTR_SPACE_USED;
        }
        dprintk("%s: space used=%Lu\n", __func__,
                        (unsigned long long)*used);
-        return 0;
+        return ret;
 }
 static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time)
@@ -2778,6 +2804,8 @@ static int decode_attr_time_access(struct xdr_stream *xdr, uint32_t *bitmap, str
                return -EIO;
        if (likely(bitmap[1] & FATTR4_WORD1_TIME_ACCESS)) {
                status = decode_attr_time(xdr, time);
+                if (status == 0)
+                        status = NFS_ATTR_FATTR_ATIME;
                bitmap[1] &= ~FATTR4_WORD1_TIME_ACCESS;
        }
        dprintk("%s: atime=%ld\n", __func__, (long)time->tv_sec);
@@ -2794,6 +2822,8 @@ static int decode_attr_time_metadata(struct xdr_stream *xdr, uint32_t *bitmap, s
                return -EIO;
        if (likely(bitmap[1] & FATTR4_WORD1_TIME_METADATA)) {
                status = decode_attr_time(xdr, time);
+                if (status == 0)
+                        status = NFS_ATTR_FATTR_CTIME;
                bitmap[1] &= ~FATTR4_WORD1_TIME_METADATA;
        }
        dprintk("%s: ctime=%ld\n", __func__, (long)time->tv_sec);
@@ -2810,6 +2840,8 @@ static int decode_attr_time_modify(struct xdr_stream *xdr, uint32_t *bitmap, str
                return -EIO;
        if (likely(bitmap[1] & FATTR4_WORD1_TIME_MODIFY)) {
                status = decode_attr_time(xdr, time);
+                if (status == 0)
+                        status = NFS_ATTR_FATTR_MTIME;
                bitmap[1] &= ~FATTR4_WORD1_TIME_MODIFY;
        }
        dprintk("%s: mtime=%ld\n", __func__, (long)time->tv_sec);
@@ -2994,63 +3026,116 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons
        uint32_t attrlen,
                 bitmap[2] = {0},
                 type;
-        int status, fmode = 0;
+        int status;
+        umode_t fmode = 0;
        uint64_t fileid;
-        if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
+        status = decode_op_hdr(xdr, OP_GETATTR);
-                goto xdr_error;
+        if (status < 0)
-        if ((status = decode_attr_bitmap(xdr, bitmap)) != 0)
                goto xdr_error;
-        fattr->bitmap[0] = bitmap[0];
+        status = decode_attr_bitmap(xdr, bitmap);
-        fattr->bitmap[1] = bitmap[1];
+        if (status < 0)
+                goto xdr_error;
-        if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0)
+        status = decode_attr_length(xdr, &attrlen, &savep);
+        if (status < 0)
                goto xdr_error;
-        if ((status = decode_attr_type(xdr, bitmap, &type)) != 0)
+        status = decode_attr_type(xdr, bitmap, &type);
+        if (status < 0)
                goto xdr_error;
-        fattr->type = nfs_type2fmt[type].nfs2type;
+        fattr->mode = 0;
-        fmode = nfs_type2fmt[type].mode;
+        if (status != 0) {
+                fattr->mode |= nfs_type2fmt[type];
+                fattr->valid |= status;
+        }
-        if ((status = decode_attr_change(xdr, bitmap, &fattr->change_attr)) != 0)
+        status = decode_attr_change(xdr, bitmap, &fattr->change_attr);
+        if (status < 0)
                goto xdr_error;
-        if ((status = decode_attr_size(xdr, bitmap, &fattr->size)) != 0)
+        fattr->valid |= status;
+        status = decode_attr_size(xdr, bitmap, &fattr->size);
+        if (status < 0)
                goto xdr_error;
-        if ((status = decode_attr_fsid(xdr, bitmap, &fattr->fsid)) != 0)
+        fattr->valid |= status;
+        status = decode_attr_fsid(xdr, bitmap, &fattr->fsid);
+        if (status < 0)
                goto xdr_error;
-        if ((status = decode_attr_fileid(xdr, bitmap, &fattr->fileid)) != 0)
+        fattr->valid |= status;
+        status = decode_attr_fileid(xdr, bitmap, &fattr->fileid);
+        if (status < 0)
                goto xdr_error;
-        if ((status = decode_attr_fs_locations(xdr, bitmap, container_of(fattr,
+        fattr->valid |= status;
+        status = decode_attr_fs_locations(xdr, bitmap, container_of(fattr,
                                                struct nfs4_fs_locations,
-                                                fattr))) != 0)
+                                                fattr));
+        if (status < 0)
                goto xdr_error;
-        if ((status = decode_attr_mode(xdr, bitmap, &fattr->mode)) != 0)
+        fattr->valid |= status;
+        status = decode_attr_mode(xdr, bitmap, &fmode);
+        if (status < 0)
                goto xdr_error;
-        fattr->mode |= fmode;
+        if (status != 0) {
-        if ((status = decode_attr_nlink(xdr, bitmap, &fattr->nlink)) != 0)
+                fattr->mode |= fmode;
+                fattr->valid |= status;
+        }
+        status = decode_attr_nlink(xdr, bitmap, &fattr->nlink);
+        if (status < 0)
                goto xdr_error;
-        if ((status = decode_attr_owner(xdr, bitmap, server->nfs_client, &fattr->uid)) != 0)
+        fattr->valid |= status;
+        status = decode_attr_owner(xdr, bitmap, server->nfs_client, &fattr->uid);
+        if (status < 0)
                goto xdr_error;
-        if ((status = decode_attr_group(xdr, bitmap, server->nfs_client, &fattr->gid)) != 0)
+        fattr->valid |= status;
+        status = decode_attr_group(xdr, bitmap, server->nfs_client, &fattr->gid);
+        if (status < 0)
                goto xdr_error;
-        if ((status = decode_attr_rdev(xdr, bitmap, &fattr->rdev)) != 0)
+        fattr->valid |= status;
+        status = decode_attr_rdev(xdr, bitmap, &fattr->rdev);
+        if (status < 0)
                goto xdr_error;
-        if ((status = decode_attr_space_used(xdr, bitmap, &fattr->du.nfs3.used)) != 0)
+        fattr->valid |= status;
+        status = decode_attr_space_used(xdr, bitmap, &fattr->du.nfs3.used);
+        if (status < 0)
                goto xdr_error;
-        if ((status = decode_attr_time_access(xdr, bitmap, &fattr->atime)) != 0)
+        fattr->valid |= status;
+        status = decode_attr_time_access(xdr, bitmap, &fattr->atime);
+        if (status < 0)
                goto xdr_error;
-        if ((status = decode_attr_time_metadata(xdr, bitmap, &fattr->ctime)) != 0)
+        fattr->valid |= status;
+        status = decode_attr_time_metadata(xdr, bitmap, &fattr->ctime);
+        if (status < 0)
                goto xdr_error;
-        if ((status = decode_attr_time_modify(xdr, bitmap, &fattr->mtime)) != 0)
+        fattr->valid |= status;
+        status = decode_attr_time_modify(xdr, bitmap, &fattr->mtime);
+        if (status < 0)
                goto xdr_error;
-        if ((status = decode_attr_mounted_on_fileid(xdr, bitmap, &fileid)) != 0)
+        fattr->valid |= status;
+        status = decode_attr_mounted_on_fileid(xdr, bitmap, &fileid);
+        if (status < 0)
                goto xdr_error;
-        if (fattr->fileid == 0 && fileid != 0)
+        if (status != 0 && !(fattr->valid & status)) {
                fattr->fileid = fileid;
-        if ((status = verify_attr_len(xdr, savep, attrlen)) == 0)
+                fattr->valid |= status;
-                fattr->valid = NFS_ATTR_FATTR | NFS_ATTR_FATTR_V3 | NFS_ATTR_FATTR_V4;
+        }
+        status = verify_attr_len(xdr, savep, attrlen);
 xdr_error:
        dprintk("%s: xdr returned %d\n", __func__, -status);
        return status;
@@ -4078,9 +4163,7 @@ static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_se
        status = decode_setattr(&xdr, res);
        if (status)
                goto out;
-        status = decode_getfattr(&xdr, res->fattr, res->server);
+        decode_getfattr(&xdr, res->fattr, res->server);
-        if (status == NFS4ERR_DELAY)
-                status = 0;
 out:
        return status;
 }
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 7f079209d70a..e2975939126a 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -176,17 +176,6 @@ void nfs_release_request(struct nfs_page *req)
        kref_put(&req->wb_kref, nfs_free_request);
 }
-static int nfs_wait_bit_killable(void *word)
-{
-        int ret = 0;
-        if (fatal_signal_pending(current))
-                ret = -ERESTARTSYS;
-        else
-                schedule();
-        return ret;
-}
 /**
 * nfs_wait_on_request - Wait for a request to complete.
 * @req: request to wait upon.
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 193465210d7c..7be72d90d49d 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -663,4 +663,5 @@ const struct nfs_rpc_ops nfs_v2_clientops = {
        .commit_setup   = nfs_proc_commit_setup,
        .lock           = nfs_proc_lock,
        .lock_check_bounds = nfs_lock_check_bounds,
+        .close_context  = nfs_close_context,
 };
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index d6686f4786dc..0942fcbbad3c 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1018,6 +1018,7 @@ static int nfs_parse_mount_options(char *raw,
                case Opt_rdma:
                        mnt->flags |= NFS_MOUNT_TCP; /* for side protocols */
                        mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA;
+                        xprt_load_transport(p);
                        break;
                case Opt_acl:
                        mnt->flags &= ~NFS_MOUNT_NOACL;
@@ -1205,12 +1206,14 @@ static int nfs_parse_mount_options(char *raw,
                                /* vector side protocols to TCP */
                                mnt->flags |= NFS_MOUNT_TCP;
                                mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA;
+                                xprt_load_transport(string);
                                break;
                        default:
                                errors++;
                                dfprintk(MOUNT, "NFS:   unrecognized "
                                                "transport protocol\n");
                        }
+                        kfree(string);
                        break;
                case Opt_mountproto:
                        string = match_strdup(args);
@@ -1218,7 +1221,6 @@ static int nfs_parse_mount_options(char *raw,
                                goto out_nomem;
                        token = match_token(string,
                                            nfs_xprt_protocol_tokens, args);
-                        kfree(string);
                        switch (token) {
                        case Opt_xprt_udp:
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 9f9845859fc1..e560a78995a3 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -313,19 +313,34 @@ static int nfs_writepages_callback(struct page *page, struct writeback_control *
 int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
 {
        struct inode *inode = mapping->host;
+        unsigned long *bitlock = &NFS_I(inode)->flags;
        struct nfs_pageio_descriptor pgio;
        int err;
+        /* Stop dirtying of new pages while we sync */
+        err = wait_on_bit_lock(bitlock, NFS_INO_FLUSHING,
+                        nfs_wait_bit_killable, TASK_KILLABLE);
+        if (err)
+                goto out_err;
        nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES);
        nfs_pageio_init_write(&pgio, inode, wb_priority(wbc));
        err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio);
        nfs_pageio_complete(&pgio);
+        clear_bit_unlock(NFS_INO_FLUSHING, bitlock);
+        smp_mb__after_clear_bit();
+        wake_up_bit(bitlock, NFS_INO_FLUSHING);
        if (err < 0)
-                return err;
+                goto out_err;
-        if (pgio.pg_error < 0)
+        err = pgio.pg_error;
-                return pgio.pg_error;
+        if (err < 0)
+                goto out_err;
        return 0;
+out_err:
+        return err;
 }
 /*
@@ -404,7 +419,6 @@ nfs_mark_request_commit(struct nfs_page *req)
        struct nfs_inode *nfsi = NFS_I(inode);
        spin_lock(&inode->i_lock);
-        nfsi->ncommit++;
        set_bit(PG_CLEAN, &(req)->wb_flags);
        radix_tree_tag_set(&nfsi->nfs_page_tree,
                        req->wb_index,
@@ -524,6 +538,12 @@ static void nfs_cancel_commit_list(struct list_head *head)
 }
 #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
+static int
+nfs_need_commit(struct nfs_inode *nfsi)
+{
+        return radix_tree_tagged(&nfsi->nfs_page_tree, NFS_PAGE_TAG_COMMIT);
+}
 /*
 * nfs_scan_commit - Scan an inode for commit requests
 * @inode: NFS inode to scan
@@ -538,16 +558,18 @@ static int
 nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages)
 {
        struct nfs_inode *nfsi = NFS_I(inode);
-        int res = 0;
-        if (nfsi->ncommit != 0) {
+        if (!nfs_need_commit(nfsi))
-                res = nfs_scan_list(nfsi, dst, idx_start, npages,
+                return 0;
-                                NFS_PAGE_TAG_COMMIT);
-                nfsi->ncommit -= res;
+        return nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT);
-        }
-        return res;
 }
 #else
+static inline int nfs_need_commit(struct nfs_inode *nfsi)
+{
+        return 0;
+}
 static inline int nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages)
 {
        return 0;
@@ -820,7 +842,7 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
        data->args.stable  = NFS_UNSTABLE;
        if (how & FLUSH_STABLE) {
                data->args.stable = NFS_DATA_SYNC;
-                if (!NFS_I(inode)->ncommit)
+                if (!nfs_need_commit(NFS_I(inode)))
                        data->args.stable = NFS_FILE_SYNC;
        }
@@ -1425,18 +1447,13 @@ static int nfs_write_mapping(struct address_space *mapping, int how)
 {
        struct writeback_control wbc = {
                .bdi = mapping->backing_dev_info,
-                .sync_mode = WB_SYNC_NONE,
+                .sync_mode = WB_SYNC_ALL,
                .nr_to_write = LONG_MAX,
                .range_start = 0,
                .range_end = LLONG_MAX,
                .for_writepages = 1,
        };
-        int ret;
-        ret = __nfs_write_mapping(mapping, &wbc, how);
-        if (ret < 0)
-                return ret;
-        wbc.sync_mode = WB_SYNC_ALL;
        return __nfs_write_mapping(mapping, &wbc, how);
 }
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 3d93b2064ce5..a4ed8644d69c 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -938,10 +938,12 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size)
                char transport[16];
                int port;
                if (sscanf(buf, "%15s %4d", transport, &port) == 2) {
+                        if (port < 1 || port > 65535)
+                                return -EINVAL;
                        err = nfsd_create_serv();
                        if (!err) {
                                err = svc_create_xprt(nfsd_serv,
-                                                      transport, port,
+                                                      transport, PF_INET, port,
                                                      SVC_SOCK_ANONYMOUS);
                                if (err == -ENOENT)
                                        /* Give a reasonable perror msg for
@@ -960,7 +962,7 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size)
                char transport[16];
                int port;
                if (sscanf(&buf[1], "%15s %4d", transport, &port) == 2) {
-                        if (port == 0)
+                        if (port < 1 || port > 65535)
                                return -EINVAL;
                        if (nfsd_serv) {
                                xprt = svc_find_xprt(nfsd_serv, transport,
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 07e4f5d7baa8..bc3567bab8c4 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -229,7 +229,6 @@ int nfsd_create_serv(void)
        atomic_set(&nfsd_busy, 0);
        nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize,
-                                      AF_INET,
                                      nfsd_last_thread, nfsd, THIS_MODULE);
        if (nfsd_serv == NULL)
                err = -ENOMEM;
@@ -244,7 +243,7 @@ static int nfsd_init_socks(int port)
        if (!list_empty(&nfsd_serv->sv_permsocks))
                return 0;
-        error = svc_create_xprt(nfsd_serv, "udp", port,
+        error = svc_create_xprt(nfsd_serv, "udp", PF_INET, port,
                                        SVC_SOCK_DEFAULTS);
        if (error < 0)
                return error;
@@ -253,7 +252,7 @@ static int nfsd_init_socks(int port)
        if (error < 0)
                return error;
-        error = svc_create_xprt(nfsd_serv, "tcp", port,
+        error = svc_create_xprt(nfsd_serv, "tcp", PF_INET, port,
                                        SVC_SOCK_DEFAULTS);
        if (error < 0)
                return error;
diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c
index 34314b33dbd4..5a9e34475e37 100644
--- a/fs/ntfs/dir.c
+++ b/fs/ntfs/dir.c
@@ -32,8 +32,8 @@
 /**
 * The little endian Unicode string $I30 as a global constant.
 */
-ntfschar I30[5] = { const_cpu_to_le16('$'), const_cpu_to_le16('I'),
+ntfschar I30[5] = { cpu_to_le16('$'), cpu_to_le16('I'),
-                const_cpu_to_le16('3'), const_cpu_to_le16('0'), 0 };
+                cpu_to_le16('3'),       cpu_to_le16('0'), 0 };
 /**
 * ntfs_lookup_inode_by_name - find an inode in a directory given its name
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index 86bef156cf0a..82c5085559c6 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -1975,8 +1975,7 @@ int ntfs_read_inode_mount(struct inode *vi)
                                goto em_put_err_out;
                        next_al_entry = (ATTR_LIST_ENTRY*)((u8*)al_entry +
                                        le16_to_cpu(al_entry->length));
-                        if (le32_to_cpu(al_entry->type) >
+                        if (le32_to_cpu(al_entry->type) > le32_to_cpu(AT_DATA))
-                                        const_le32_to_cpu(AT_DATA))
                                goto em_put_err_out;
                        if (AT_DATA != al_entry->type)
                                continue;
diff --git a/fs/ntfs/layout.h b/fs/ntfs/layout.h
index 1e383328eceb..50931b1ce4b9 100644
--- a/fs/ntfs/layout.h
+++ b/fs/ntfs/layout.h
@@ -31,19 +31,8 @@
 #include "types.h"
-/*
- * Constant endianness conversion defines.
- */
-#define const_le16_to_cpu(x)    __constant_le16_to_cpu(x)
-#define const_le32_to_cpu(x)    __constant_le32_to_cpu(x)
-#define const_le64_to_cpu(x)    __constant_le64_to_cpu(x)
-#define const_cpu_to_le16(x)    __constant_cpu_to_le16(x)
-#define const_cpu_to_le32(x)    __constant_cpu_to_le32(x)
-#define const_cpu_to_le64(x)    __constant_cpu_to_le64(x)
 /* The NTFS oem_id "NTFS    " */
-#define magicNTFS       const_cpu_to_le64(0x202020205346544eULL)
+#define magicNTFS       cpu_to_le64(0x202020205346544eULL)
 /*
 * Location of bootsector on partition:
@@ -114,25 +103,25 @@ typedef struct {
 */
 enum {
        /* Found in $MFT/$DATA. */
-        magic_FILE = const_cpu_to_le32(0x454c4946), /* Mft entry. */
+        magic_FILE = cpu_to_le32(0x454c4946), /* Mft entry. */
-        magic_INDX = const_cpu_to_le32(0x58444e49), /* Index buffer. */
+        magic_INDX = cpu_to_le32(0x58444e49), /* Index buffer. */
-        magic_HOLE = const_cpu_to_le32(0x454c4f48), /* ? (NTFS 3.0+?) */
+        magic_HOLE = cpu_to_le32(0x454c4f48), /* ? (NTFS 3.0+?) */
        /* Found in $LogFile/$DATA. */
-        magic_RSTR = const_cpu_to_le32(0x52545352), /* Restart page. */
+        magic_RSTR = cpu_to_le32(0x52545352), /* Restart page. */
-        magic_RCRD = const_cpu_to_le32(0x44524352), /* Log record page. */
+        magic_RCRD = cpu_to_le32(0x44524352), /* Log record page. */
        /* Found in $LogFile/$DATA.  (May be found in $MFT/$DATA, also?) */
-        magic_CHKD = const_cpu_to_le32(0x444b4843), /* Modified by chkdsk. */
+        magic_CHKD = cpu_to_le32(0x444b4843), /* Modified by chkdsk. */
        /* Found in all ntfs record containing records. */
-        magic_BAAD = const_cpu_to_le32(0x44414142), /* Failed multi sector
+        magic_BAAD = cpu_to_le32(0x44414142), /* Failed multi sector
                                                       transfer was detected. */
        /*
         * Found in $LogFile/$DATA when a page is full of 0xff bytes and is
         * thus not initialized.  Page must be initialized before using it.
         */
-        magic_empty = const_cpu_to_le32(0xffffffff) /* Record is empty. */
+        magic_empty = cpu_to_le32(0xffffffff) /* Record is empty. */
 };
 typedef le32 NTFS_RECORD_TYPE;
@@ -258,8 +247,8 @@ typedef enum {
 * information about the mft record in which they are present.
 */
 enum {
-        MFT_RECORD_IN_USE       = const_cpu_to_le16(0x0001),
+        MFT_RECORD_IN_USE       = cpu_to_le16(0x0001),
-        MFT_RECORD_IS_DIRECTORY = const_cpu_to_le16(0x0002),
+        MFT_RECORD_IS_DIRECTORY = cpu_to_le16(0x0002),
 } __attribute__ ((__packed__));
 typedef le16 MFT_RECORD_FLAGS;
@@ -309,7 +298,7 @@ typedef le16 MFT_RECORD_FLAGS;
 * Note: The _LE versions will return a CPU endian formatted value!
 */
 #define MFT_REF_MASK_CPU 0x0000ffffffffffffULL
-#define MFT_REF_MASK_LE const_cpu_to_le64(MFT_REF_MASK_CPU)
+#define MFT_REF_MASK_LE cpu_to_le64(MFT_REF_MASK_CPU)
 typedef u64 MFT_REF;
 typedef le64 leMFT_REF;
@@ -477,25 +466,25 @@ typedef struct {
 * a revealing choice of symbol I do not know what is... (-;
 */
 enum {
-        AT_UNUSED                       = const_cpu_to_le32(         0),
+        AT_UNUSED                       = cpu_to_le32(         0),
-        AT_STANDARD_INFORMATION         = const_cpu_to_le32(      0x10),
+        AT_STANDARD_INFORMATION         = cpu_to_le32(      0x10),
-        AT_ATTRIBUTE_LIST               = const_cpu_to_le32(      0x20),
+        AT_ATTRIBUTE_LIST               = cpu_to_le32(      0x20),
-        AT_FILE_NAME                    = const_cpu_to_le32(      0x30),
+        AT_FILE_NAME                    = cpu_to_le32(      0x30),
-        AT_OBJECT_ID                    = const_cpu_to_le32(      0x40),
+        AT_OBJECT_ID                    = cpu_to_le32(      0x40),
-        AT_SECURITY_DESCRIPTOR          = const_cpu_to_le32(      0x50),
+        AT_SECURITY_DESCRIPTOR          = cpu_to_le32(      0x50),
-        AT_VOLUME_NAME                  = const_cpu_to_le32(      0x60),
+        AT_VOLUME_NAME                  = cpu_to_le32(      0x60),
-        AT_VOLUME_INFORMATION           = const_cpu_to_le32(      0x70),
+        AT_VOLUME_INFORMATION           = cpu_to_le32(      0x70),
-        AT_DATA                         = const_cpu_to_le32(      0x80),
+        AT_DATA                         = cpu_to_le32(      0x80),
-        AT_INDEX_ROOT                   = const_cpu_to_le32(      0x90),
+        AT_INDEX_ROOT                   = cpu_to_le32(      0x90),
-        AT_INDEX_ALLOCATION             = const_cpu_to_le32(      0xa0),
+        AT_INDEX_ALLOCATION             = cpu_to_le32(      0xa0),
-        AT_BITMAP                       = const_cpu_to_le32(      0xb0),
+        AT_BITMAP                       = cpu_to_le32(      0xb0),
-        AT_REPARSE_POINT                = const_cpu_to_le32(      0xc0),
+        AT_REPARSE_POINT                = cpu_to_le32(      0xc0),
-        AT_EA_INFORMATION               = const_cpu_to_le32(      0xd0),
+        AT_EA_INFORMATION               = cpu_to_le32(      0xd0),
-        AT_EA                           = const_cpu_to_le32(      0xe0),
+        AT_EA                           = cpu_to_le32(      0xe0),
-        AT_PROPERTY_SET                 = const_cpu_to_le32(      0xf0),
+        AT_PROPERTY_SET                 = cpu_to_le32(      0xf0),
-        AT_LOGGED_UTILITY_STREAM        = const_cpu_to_le32(     0x100),
+        AT_LOGGED_UTILITY_STREAM        = cpu_to_le32(     0x100),
-        AT_FIRST_USER_DEFINED_ATTRIBUTE = const_cpu_to_le32(    0x1000),
+        AT_FIRST_USER_DEFINED_ATTRIBUTE = cpu_to_le32(    0x1000),
-        AT_END                          = const_cpu_to_le32(0xffffffff)
+        AT_END                          = cpu_to_le32(0xffffffff)
 };
 typedef le32 ATTR_TYPE;
@@ -539,13 +528,13 @@ typedef le32 ATTR_TYPE;
 *      equal then the second le32 values would be compared, etc.
 */
 enum {
-        COLLATION_BINARY                = const_cpu_to_le32(0x00),
+        COLLATION_BINARY                = cpu_to_le32(0x00),
-        COLLATION_FILE_NAME             = const_cpu_to_le32(0x01),
+        COLLATION_FILE_NAME             = cpu_to_le32(0x01),
-        COLLATION_UNICODE_STRING        = const_cpu_to_le32(0x02),
+        COLLATION_UNICODE_STRING        = cpu_to_le32(0x02),
-        COLLATION_NTOFS_ULONG           = const_cpu_to_le32(0x10),
+        COLLATION_NTOFS_ULONG           = cpu_to_le32(0x10),
-        COLLATION_NTOFS_SID             = const_cpu_to_le32(0x11),
+        COLLATION_NTOFS_SID             = cpu_to_le32(0x11),
-        COLLATION_NTOFS_SECURITY_HASH   = const_cpu_to_le32(0x12),
+        COLLATION_NTOFS_SECURITY_HASH   = cpu_to_le32(0x12),
-        COLLATION_NTOFS_ULONGS          = const_cpu_to_le32(0x13),
+        COLLATION_NTOFS_ULONGS          = cpu_to_le32(0x13),
 };
 typedef le32 COLLATION_RULE;
@@ -559,25 +548,25 @@ typedef le32 COLLATION_RULE;
 * NT4.
 */
 enum {
-        ATTR_DEF_INDEXABLE      = const_cpu_to_le32(0x02), /* Attribute can be
+        ATTR_DEF_INDEXABLE      = cpu_to_le32(0x02), /* Attribute can be
                                        indexed. */
-        ATTR_DEF_MULTIPLE       = const_cpu_to_le32(0x04), /* Attribute type
+        ATTR_DEF_MULTIPLE       = cpu_to_le32(0x04), /* Attribute type
                                        can be present multiple times in the
                                        mft records of an inode. */
-        ATTR_DEF_NOT_ZERO       = const_cpu_to_le32(0x08), /* Attribute value
+        ATTR_DEF_NOT_ZERO       = cpu_to_le32(0x08), /* Attribute value
                                        must contain at least one non-zero
                                        byte. */
-        ATTR_DEF_INDEXED_UNIQUE = const_cpu_to_le32(0x10), /* Attribute must be
+        ATTR_DEF_INDEXED_UNIQUE = cpu_to_le32(0x10), /* Attribute must be
                                        indexed and the attribute value must be
                                        unique for the attribute type in all of
                                        the mft records of an inode. */
-        ATTR_DEF_NAMED_UNIQUE   = const_cpu_to_le32(0x20), /* Attribute must be
+        ATTR_DEF_NAMED_UNIQUE   = cpu_to_le32(0x20), /* Attribute must be
                                        named and the name must be unique for
                                        the attribute type in all of the mft
                                        records of an inode. */
-        ATTR_DEF_RESIDENT       = const_cpu_to_le32(0x40), /* Attribute must be
+        ATTR_DEF_RESIDENT       = cpu_to_le32(0x40), /* Attribute must be
                                        resident. */
-        ATTR_DEF_ALWAYS_LOG     = const_cpu_to_le32(0x80), /* Always log
+        ATTR_DEF_ALWAYS_LOG     = cpu_to_le32(0x80), /* Always log
                                        modifications to this attribute,
                                        regardless of whether it is resident or
                                        non-resident.  Without this, only log
@@ -614,12 +603,12 @@ typedef struct {
 * Attribute flags (16-bit).
 */
 enum {
-        ATTR_IS_COMPRESSED    = const_cpu_to_le16(0x0001),
+        ATTR_IS_COMPRESSED    = cpu_to_le16(0x0001),
-        ATTR_COMPRESSION_MASK = const_cpu_to_le16(0x00ff), /* Compression method
+        ATTR_COMPRESSION_MASK = cpu_to_le16(0x00ff), /* Compression method
                                                              mask.  Also, first
                                                              illegal value. */
-        ATTR_IS_ENCRYPTED     = const_cpu_to_le16(0x4000),
+        ATTR_IS_ENCRYPTED     = cpu_to_le16(0x4000),
-        ATTR_IS_SPARSE        = const_cpu_to_le16(0x8000),
+        ATTR_IS_SPARSE        = cpu_to_le16(0x8000),
 } __attribute__ ((__packed__));
 typedef le16 ATTR_FLAGS;
@@ -811,32 +800,32 @@ typedef ATTR_RECORD ATTR_REC;
 * flags appear in all of the above.
 */
 enum {
-        FILE_ATTR_READONLY              = const_cpu_to_le32(0x00000001),
+        FILE_ATTR_READONLY              = cpu_to_le32(0x00000001),
-        FILE_ATTR_HIDDEN                = const_cpu_to_le32(0x00000002),
+        FILE_ATTR_HIDDEN                = cpu_to_le32(0x00000002),
-        FILE_ATTR_SYSTEM                = const_cpu_to_le32(0x00000004),
+        FILE_ATTR_SYSTEM                = cpu_to_le32(0x00000004),
-        /* Old DOS volid. Unused in NT. = const_cpu_to_le32(0x00000008), */
+        /* Old DOS volid. Unused in NT. = cpu_to_le32(0x00000008), */
-        FILE_ATTR_DIRECTORY             = const_cpu_to_le32(0x00000010),
+        FILE_ATTR_DIRECTORY             = cpu_to_le32(0x00000010),
        /* Note, FILE_ATTR_DIRECTORY is not considered valid in NT.  It is
           reserved for the DOS SUBDIRECTORY flag. */
-        FILE_ATTR_ARCHIVE               = const_cpu_to_le32(0x00000020),
+        FILE_ATTR_ARCHIVE               = cpu_to_le32(0x00000020),
-        FILE_ATTR_DEVICE                = const_cpu_to_le32(0x00000040),
+        FILE_ATTR_DEVICE                = cpu_to_le32(0x00000040),
-        FILE_ATTR_NORMAL                = const_cpu_to_le32(0x00000080),
+        FILE_ATTR_NORMAL                = cpu_to_le32(0x00000080),
-        FILE_ATTR_TEMPORARY             = const_cpu_to_le32(0x00000100),
+        FILE_ATTR_TEMPORARY             = cpu_to_le32(0x00000100),
-        FILE_ATTR_SPARSE_FILE           = const_cpu_to_le32(0x00000200),
+        FILE_ATTR_SPARSE_FILE           = cpu_to_le32(0x00000200),
-        FILE_ATTR_REPARSE_POINT         = const_cpu_to_le32(0x00000400),
+        FILE_ATTR_REPARSE_POINT         = cpu_to_le32(0x00000400),
-        FILE_ATTR_COMPRESSED            = const_cpu_to_le32(0x00000800),
+        FILE_ATTR_COMPRESSED            = cpu_to_le32(0x00000800),
-        FILE_ATTR_OFFLINE               = const_cpu_to_le32(0x00001000),
+        FILE_ATTR_OFFLINE               = cpu_to_le32(0x00001000),
-        FILE_ATTR_NOT_CONTENT_INDEXED   = const_cpu_to_le32(0x00002000),
+        FILE_ATTR_NOT_CONTENT_INDEXED   = cpu_to_le32(0x00002000),
-        FILE_ATTR_ENCRYPTED             = const_cpu_to_le32(0x00004000),
+        FILE_ATTR_ENCRYPTED             = cpu_to_le32(0x00004000),
-        FILE_ATTR_VALID_FLAGS           = const_cpu_to_le32(0x00007fb7),
+        FILE_ATTR_VALID_FLAGS           = cpu_to_le32(0x00007fb7),
        /* Note, FILE_ATTR_VALID_FLAGS masks out the old DOS VolId and the
           FILE_ATTR_DEVICE and preserves everything else.  This mask is used
           to obtain all flags that are valid for reading. */
-        FILE_ATTR_VALID_SET_FLAGS       = const_cpu_to_le32(0x000031a7),
+        FILE_ATTR_VALID_SET_FLAGS       = cpu_to_le32(0x000031a7),
        /* Note, FILE_ATTR_VALID_SET_FLAGS masks out the old DOS VolId, the
           F_A_DEVICE, F_A_DIRECTORY, F_A_SPARSE_FILE, F_A_REPARSE_POINT,
           F_A_COMPRESSED, and F_A_ENCRYPTED and preserves the rest.  This mask
@@ -846,11 +835,11 @@ enum {
         * FILENAME_ATTR attributes but not in the STANDARD_INFORMATION
         * attribute of an mft record.
         */
-        FILE_ATTR_DUP_FILE_NAME_INDEX_PRESENT   = const_cpu_to_le32(0x10000000),
+        FILE_ATTR_DUP_FILE_NAME_INDEX_PRESENT   = cpu_to_le32(0x10000000),
        /* Note, this is a copy of the corresponding bit from the mft record,
           telling us whether this is a directory or not, i.e. whether it has
           an index root attribute or not. */
-        FILE_ATTR_DUP_VIEW_INDEX_PRESENT        = const_cpu_to_le32(0x20000000),
+        FILE_ATTR_DUP_VIEW_INDEX_PRESENT        = cpu_to_le32(0x20000000),
        /* Note, this is a copy of the corresponding bit from the mft record,
           telling us whether this file has a view index present (eg. object id
           index, quota index, one of the security indexes or the encrypting
@@ -1446,42 +1435,42 @@ enum {
        /* Specific rights for files and directories are as follows: */
        /* Right to read data from the file. (FILE) */
-        FILE_READ_DATA                  = const_cpu_to_le32(0x00000001),
+        FILE_READ_DATA                  = cpu_to_le32(0x00000001),
        /* Right to list contents of a directory. (DIRECTORY) */
-        FILE_LIST_DIRECTORY             = const_cpu_to_le32(0x00000001),
+        FILE_LIST_DIRECTORY             = cpu_to_le32(0x00000001),
        /* Right to write data to the file. (FILE) */
-        FILE_WRITE_DATA                 = const_cpu_to_le32(0x00000002),
+        FILE_WRITE_DATA                 = cpu_to_le32(0x00000002),
        /* Right to create a file in the directory. (DIRECTORY) */
-        FILE_ADD_FILE                   = const_cpu_to_le32(0x00000002),
+        FILE_ADD_FILE                   = cpu_to_le32(0x00000002),
        /* Right to append data to the file. (FILE) */
-        FILE_APPEND_DATA                = const_cpu_to_le32(0x00000004),
+        FILE_APPEND_DATA                = cpu_to_le32(0x00000004),
        /* Right to create a subdirectory. (DIRECTORY) */
-        FILE_ADD_SUBDIRECTORY           = const_cpu_to_le32(0x00000004),
+        FILE_ADD_SUBDIRECTORY           = cpu_to_le32(0x00000004),
        /* Right to read extended attributes. (FILE/DIRECTORY) */
-        FILE_READ_EA                    = const_cpu_to_le32(0x00000008),
+        FILE_READ_EA                    = cpu_to_le32(0x00000008),
        /* Right to write extended attributes. (FILE/DIRECTORY) */
-        FILE_WRITE_EA                   = const_cpu_to_le32(0x00000010),
+        FILE_WRITE_EA                   = cpu_to_le32(0x00000010),
        /* Right to execute a file. (FILE) */
-        FILE_EXECUTE                    = const_cpu_to_le32(0x00000020),
+        FILE_EXECUTE                    = cpu_to_le32(0x00000020),
        /* Right to traverse the directory. (DIRECTORY) */
-        FILE_TRAVERSE                   = const_cpu_to_le32(0x00000020),
+        FILE_TRAVERSE                   = cpu_to_le32(0x00000020),
        /*
         * Right to delete a directory and all the files it contains (its
         * children), even if the files are read-only. (DIRECTORY)
         */
-        FILE_DELETE_CHILD               = const_cpu_to_le32(0x00000040),
+        FILE_DELETE_CHILD               = cpu_to_le32(0x00000040),
        /* Right to read file attributes. (FILE/DIRECTORY) */
-        FILE_READ_ATTRIBUTES            = const_cpu_to_le32(0x00000080),
+        FILE_READ_ATTRIBUTES            = cpu_to_le32(0x00000080),
        /* Right to change file attributes. (FILE/DIRECTORY) */
-        FILE_WRITE_ATTRIBUTES           = const_cpu_to_le32(0x00000100),
+        FILE_WRITE_ATTRIBUTES           = cpu_to_le32(0x00000100),
        /*
         * The standard rights (bits 16 to 23).  These are independent of the
@@ -1489,27 +1478,27 @@ enum {
         */
        /* Right to delete the object. */
-        DELETE                          = const_cpu_to_le32(0x00010000),
+        DELETE                          = cpu_to_le32(0x00010000),
        /*
         * Right to read the information in the object's security descriptor,
         * not including the information in the SACL, i.e. right to read the
         * security descriptor and owner.
         */
-        READ_CONTROL                    = const_cpu_to_le32(0x00020000),
+        READ_CONTROL                    = cpu_to_le32(0x00020000),
        /* Right to modify the DACL in the object's security descriptor. */
-        WRITE_DAC                       = const_cpu_to_le32(0x00040000),
+        WRITE_DAC                       = cpu_to_le32(0x00040000),
        /* Right to change the owner in the object's security descriptor. */
-        WRITE_OWNER                     = const_cpu_to_le32(0x00080000),
+        WRITE_OWNER                     = cpu_to_le32(0x00080000),
        /*
         * Right to use the object for synchronization.  Enables a process to
         * wait until the object is in the signalled state.  Some object types
         * do not support this access right.
         */
-        SYNCHRONIZE                     = const_cpu_to_le32(0x00100000),
+        SYNCHRONIZE                     = cpu_to_le32(0x00100000),
        /*
         * The following STANDARD_RIGHTS_* are combinations of the above for
@@ -1517,25 +1506,25 @@ enum {
         */
        /* These are currently defined to READ_CONTROL. */
-        STANDARD_RIGHTS_READ            = const_cpu_to_le32(0x00020000),
+        STANDARD_RIGHTS_READ            = cpu_to_le32(0x00020000),
-        STANDARD_RIGHTS_WRITE           = const_cpu_to_le32(0x00020000),
+        STANDARD_RIGHTS_WRITE           = cpu_to_le32(0x00020000),
-        STANDARD_RIGHTS_EXECUTE         = const_cpu_to_le32(0x00020000),
+        STANDARD_RIGHTS_EXECUTE         = cpu_to_le32(0x00020000),
        /* Combines DELETE, READ_CONTROL, WRITE_DAC, and WRITE_OWNER access. */
-        STANDARD_RIGHTS_REQUIRED        = const_cpu_to_le32(0x000f0000),
+        STANDARD_RIGHTS_REQUIRED        = cpu_to_le32(0x000f0000),
        /*
         * Combines DELETE, READ_CONTROL, WRITE_DAC, WRITE_OWNER, and
         * SYNCHRONIZE access.
         */
-        STANDARD_RIGHTS_ALL             = const_cpu_to_le32(0x001f0000),
+        STANDARD_RIGHTS_ALL             = cpu_to_le32(0x001f0000),
        /*
         * The access system ACL and maximum allowed access types (bits 24 to
         * 25, bits 26 to 27 are reserved).
         */
-        ACCESS_SYSTEM_SECURITY          = const_cpu_to_le32(0x01000000),
+        ACCESS_SYSTEM_SECURITY          = cpu_to_le32(0x01000000),
-        MAXIMUM_ALLOWED                 = const_cpu_to_le32(0x02000000),
+        MAXIMUM_ALLOWED                 = cpu_to_le32(0x02000000),
        /*
         * The generic rights (bits 28 to 31).  These map onto the standard and
@@ -1543,10 +1532,10 @@ enum {
         */
        /* Read, write, and execute access. */
-        GENERIC_ALL                     = const_cpu_to_le32(0x10000000),
+        GENERIC_ALL                     = cpu_to_le32(0x10000000),
        /* Execute access. */
-        GENERIC_EXECUTE                 = const_cpu_to_le32(0x20000000),
+        GENERIC_EXECUTE                 = cpu_to_le32(0x20000000),
        /*
         * Write access.  For files, this maps onto:
@@ -1555,7 +1544,7 @@ enum {
         * For directories, the mapping has the same numerical value.  See
         * above for the descriptions of the rights granted.
         */
-        GENERIC_WRITE                   = const_cpu_to_le32(0x40000000),
+        GENERIC_WRITE                   = cpu_to_le32(0x40000000),
        /*
         * Read access.  For files, this maps onto:
@@ -1564,7 +1553,7 @@ enum {
         * For directories, the mapping has the same numberical value.  See
         * above for the descriptions of the rights granted.
         */
-        GENERIC_READ                    = const_cpu_to_le32(0x80000000),
+        GENERIC_READ                    = cpu_to_le32(0x80000000),
 };
 typedef le32 ACCESS_MASK;
@@ -1604,8 +1593,8 @@ typedef struct {
 * The object ACE flags (32-bit).
 */
 enum {
-        ACE_OBJECT_TYPE_PRESENT                 = const_cpu_to_le32(1),
+        ACE_OBJECT_TYPE_PRESENT                 = cpu_to_le32(1),
-        ACE_INHERITED_OBJECT_TYPE_PRESENT       = const_cpu_to_le32(2),
+        ACE_INHERITED_OBJECT_TYPE_PRESENT       = cpu_to_le32(2),
 };
 typedef le32 OBJECT_ACE_FLAGS;
@@ -1706,23 +1695,23 @@ typedef enum {
 *      expressed as offsets from the beginning of the security descriptor.
 */
 enum {
-        SE_OWNER_DEFAULTED              = const_cpu_to_le16(0x0001),
+        SE_OWNER_DEFAULTED              = cpu_to_le16(0x0001),
-        SE_GROUP_DEFAULTED              = const_cpu_to_le16(0x0002),
+        SE_GROUP_DEFAULTED              = cpu_to_le16(0x0002),
-        SE_DACL_PRESENT                 = const_cpu_to_le16(0x0004),
+        SE_DACL_PRESENT                 = cpu_to_le16(0x0004),
-        SE_DACL_DEFAULTED               = const_cpu_to_le16(0x0008),
+        SE_DACL_DEFAULTED               = cpu_to_le16(0x0008),
-        SE_SACL_PRESENT                 = const_cpu_to_le16(0x0010),
+        SE_SACL_PRESENT                 = cpu_to_le16(0x0010),
-        SE_SACL_DEFAULTED               = const_cpu_to_le16(0x0020),
+        SE_SACL_DEFAULTED               = cpu_to_le16(0x0020),
-        SE_DACL_AUTO_INHERIT_REQ        = const_cpu_to_le16(0x0100),
+        SE_DACL_AUTO_INHERIT_REQ        = cpu_to_le16(0x0100),
-        SE_SACL_AUTO_INHERIT_REQ        = const_cpu_to_le16(0x0200),
+        SE_SACL_AUTO_INHERIT_REQ        = cpu_to_le16(0x0200),
-        SE_DACL_AUTO_INHERITED          = const_cpu_to_le16(0x0400),
+        SE_DACL_AUTO_INHERITED          = cpu_to_le16(0x0400),
-        SE_SACL_AUTO_INHERITED          = const_cpu_to_le16(0x0800),
+        SE_SACL_AUTO_INHERITED          = cpu_to_le16(0x0800),
-        SE_DACL_PROTECTED               = const_cpu_to_le16(0x1000),
+        SE_DACL_PROTECTED               = cpu_to_le16(0x1000),
-        SE_SACL_PROTECTED               = const_cpu_to_le16(0x2000),
+        SE_SACL_PROTECTED               = cpu_to_le16(0x2000),
-        SE_RM_CONTROL_VALID             = const_cpu_to_le16(0x4000),
+        SE_RM_CONTROL_VALID             = cpu_to_le16(0x4000),
-        SE_SELF_RELATIVE                = const_cpu_to_le16(0x8000)
+        SE_SELF_RELATIVE                = cpu_to_le16(0x8000)
 } __attribute__ ((__packed__));
 typedef le16 SECURITY_DESCRIPTOR_CONTROL;
@@ -1910,21 +1899,21 @@ typedef struct {
 * Possible flags for the volume (16-bit).
 */
 enum {
-        VOLUME_IS_DIRTY                 = const_cpu_to_le16(0x0001),
+        VOLUME_IS_DIRTY                 = cpu_to_le16(0x0001),
-        VOLUME_RESIZE_LOG_FILE          = const_cpu_to_le16(0x0002),
+        VOLUME_RESIZE_LOG_FILE          = cpu_to_le16(0x0002),
-        VOLUME_UPGRADE_ON_MOUNT         = const_cpu_to_le16(0x0004),
+        VOLUME_UPGRADE_ON_MOUNT         = cpu_to_le16(0x0004),
-        VOLUME_MOUNTED_ON_NT4           = const_cpu_to_le16(0x0008),
+        VOLUME_MOUNTED_ON_NT4           = cpu_to_le16(0x0008),
-        VOLUME_DELETE_USN_UNDERWAY      = const_cpu_to_le16(0x0010),
+        VOLUME_DELETE_USN_UNDERWAY      = cpu_to_le16(0x0010),
-        VOLUME_REPAIR_OBJECT_ID         = const_cpu_to_le16(0x0020),
+        VOLUME_REPAIR_OBJECT_ID         = cpu_to_le16(0x0020),
-        VOLUME_CHKDSK_UNDERWAY          = const_cpu_to_le16(0x4000),
+        VOLUME_CHKDSK_UNDERWAY          = cpu_to_le16(0x4000),
-        VOLUME_MODIFIED_BY_CHKDSK       = const_cpu_to_le16(0x8000),
+        VOLUME_MODIFIED_BY_CHKDSK       = cpu_to_le16(0x8000),
-        VOLUME_FLAGS_MASK               = const_cpu_to_le16(0xc03f),
+        VOLUME_FLAGS_MASK               = cpu_to_le16(0xc03f),
        /* To make our life easier when checking if we must mount read-only. */
-        VOLUME_MUST_MOUNT_RO_MASK       = const_cpu_to_le16(0xc027),
+        VOLUME_MUST_MOUNT_RO_MASK       = cpu_to_le16(0xc027),
 } __attribute__ ((__packed__));
 typedef le16 VOLUME_FLAGS;
@@ -2109,26 +2098,26 @@ typedef struct {
 * The user quota flags.  Names explain meaning.
 */
 enum {
-        QUOTA_FLAG_DEFAULT_LIMITS       = const_cpu_to_le32(0x00000001),
+        QUOTA_FLAG_DEFAULT_LIMITS       = cpu_to_le32(0x00000001),
-        QUOTA_FLAG_LIMIT_REACHED        = const_cpu_to_le32(0x00000002),
+        QUOTA_FLAG_LIMIT_REACHED        = cpu_to_le32(0x00000002),
-        QUOTA_FLAG_ID_DELETED           = const_cpu_to_le32(0x00000004),
+        QUOTA_FLAG_ID_DELETED           = cpu_to_le32(0x00000004),
-        QUOTA_FLAG_USER_MASK            = const_cpu_to_le32(0x00000007),
+        QUOTA_FLAG_USER_MASK            = cpu_to_le32(0x00000007),
        /* This is a bit mask for the user quota flags. */
        /*
         * These flags are only present in the quota defaults index entry, i.e.
         * in the entry where owner_id = QUOTA_DEFAULTS_ID.
         */
-        QUOTA_FLAG_TRACKING_ENABLED     = const_cpu_to_le32(0x00000010),
+        QUOTA_FLAG_TRACKING_ENABLED     = cpu_to_le32(0x00000010),
-        QUOTA_FLAG_ENFORCEMENT_ENABLED  = const_cpu_to_le32(0x00000020),
+        QUOTA_FLAG_ENFORCEMENT_ENABLED  = cpu_to_le32(0x00000020),
-        QUOTA_FLAG_TRACKING_REQUESTED   = const_cpu_to_le32(0x00000040),
+        QUOTA_FLAG_TRACKING_REQUESTED   = cpu_to_le32(0x00000040),
-        QUOTA_FLAG_LOG_THRESHOLD        = const_cpu_to_le32(0x00000080),
+        QUOTA_FLAG_LOG_THRESHOLD        = cpu_to_le32(0x00000080),
-        QUOTA_FLAG_LOG_LIMIT            = const_cpu_to_le32(0x00000100),
+        QUOTA_FLAG_LOG_LIMIT            = cpu_to_le32(0x00000100),
-        QUOTA_FLAG_OUT_OF_DATE          = const_cpu_to_le32(0x00000200),
+        QUOTA_FLAG_OUT_OF_DATE          = cpu_to_le32(0x00000200),
-        QUOTA_FLAG_CORRUPT              = const_cpu_to_le32(0x00000400),
+        QUOTA_FLAG_CORRUPT              = cpu_to_le32(0x00000400),
-        QUOTA_FLAG_PENDING_DELETES      = const_cpu_to_le32(0x00000800),
+        QUOTA_FLAG_PENDING_DELETES      = cpu_to_le32(0x00000800),
 };
 typedef le32 QUOTA_FLAGS;
@@ -2172,9 +2161,9 @@ typedef struct {
 * Predefined owner_id values (32-bit).
 */
 enum {
-        QUOTA_INVALID_ID        = const_cpu_to_le32(0x00000000),
+        QUOTA_INVALID_ID        = cpu_to_le32(0x00000000),
-        QUOTA_DEFAULTS_ID       = const_cpu_to_le32(0x00000001),
+        QUOTA_DEFAULTS_ID       = cpu_to_le32(0x00000001),
-        QUOTA_FIRST_USER_ID     = const_cpu_to_le32(0x00000100),
+        QUOTA_FIRST_USER_ID     = cpu_to_le32(0x00000100),
 };
 /*
@@ -2189,14 +2178,14 @@ typedef enum {
 * Index entry flags (16-bit).
 */
 enum {
-        INDEX_ENTRY_NODE = const_cpu_to_le16(1), /* This entry contains a
+        INDEX_ENTRY_NODE = cpu_to_le16(1), /* This entry contains a
                        sub-node, i.e. a reference to an index block in form of
                        a virtual cluster number (see below). */
-        INDEX_ENTRY_END  = const_cpu_to_le16(2), /* This signifies the last
+        INDEX_ENTRY_END  = cpu_to_le16(2), /* This signifies the last
                        entry in an index block.  The index entry does not
                        represent a file but it can point to a sub-node. */
-        INDEX_ENTRY_SPACE_FILLER = const_cpu_to_le16(0xffff), /* gcc: Force
+        INDEX_ENTRY_SPACE_FILLER = cpu_to_le16(0xffff), /* gcc: Force
                        enum bit width to 16-bit. */
 } __attribute__ ((__packed__));
@@ -2334,26 +2323,26 @@ typedef struct {
 * These are the predefined reparse point tags:
 */
 enum {
-        IO_REPARSE_TAG_IS_ALIAS         = const_cpu_to_le32(0x20000000),
+        IO_REPARSE_TAG_IS_ALIAS         = cpu_to_le32(0x20000000),
-        IO_REPARSE_TAG_IS_HIGH_LATENCY  = const_cpu_to_le32(0x40000000),
+        IO_REPARSE_TAG_IS_HIGH_LATENCY  = cpu_to_le32(0x40000000),
-        IO_REPARSE_TAG_IS_MICROSOFT     = const_cpu_to_le32(0x80000000),
+        IO_REPARSE_TAG_IS_MICROSOFT     = cpu_to_le32(0x80000000),
-        IO_REPARSE_TAG_RESERVED_ZERO    = const_cpu_to_le32(0x00000000),
+        IO_REPARSE_TAG_RESERVED_ZERO    = cpu_to_le32(0x00000000),
-        IO_REPARSE_TAG_RESERVED_ONE     = const_cpu_to_le32(0x00000001),
+        IO_REPARSE_TAG_RESERVED_ONE     = cpu_to_le32(0x00000001),
-        IO_REPARSE_TAG_RESERVED_RANGE   = const_cpu_to_le32(0x00000001),
+        IO_REPARSE_TAG_RESERVED_RANGE   = cpu_to_le32(0x00000001),
-        IO_REPARSE_TAG_NSS              = const_cpu_to_le32(0x68000005),
+        IO_REPARSE_TAG_NSS              = cpu_to_le32(0x68000005),
-        IO_REPARSE_TAG_NSS_RECOVER      = const_cpu_to_le32(0x68000006),
+        IO_REPARSE_TAG_NSS_RECOVER      = cpu_to_le32(0x68000006),
-        IO_REPARSE_TAG_SIS              = const_cpu_to_le32(0x68000007),
+        IO_REPARSE_TAG_SIS              = cpu_to_le32(0x68000007),
-        IO_REPARSE_TAG_DFS              = const_cpu_to_le32(0x68000008),
+        IO_REPARSE_TAG_DFS              = cpu_to_le32(0x68000008),
-        IO_REPARSE_TAG_MOUNT_POINT      = const_cpu_to_le32(0x88000003),
+        IO_REPARSE_TAG_MOUNT_POINT      = cpu_to_le32(0x88000003),
-        IO_REPARSE_TAG_HSM              = const_cpu_to_le32(0xa8000004),
+        IO_REPARSE_TAG_HSM              = cpu_to_le32(0xa8000004),
-        IO_REPARSE_TAG_SYMBOLIC_LINK    = const_cpu_to_le32(0xe8000000),
+        IO_REPARSE_TAG_SYMBOLIC_LINK    = cpu_to_le32(0xe8000000),
-        IO_REPARSE_TAG_VALID_VALUES     = const_cpu_to_le32(0xe000ffff),
+        IO_REPARSE_TAG_VALID_VALUES     = cpu_to_le32(0xe000ffff),
 };
 /*
diff --git a/fs/ntfs/logfile.h b/fs/ntfs/logfile.h
index 9468e1c45ae3..b5a6f08bd35c 100644
--- a/fs/ntfs/logfile.h
+++ b/fs/ntfs/logfile.h
@@ -104,7 +104,7 @@ typedef struct {
 * in this particular client array.  Also inside the client records themselves,
 * this means that there are no client records preceding or following this one.
 */
-#define LOGFILE_NO_CLIENT       const_cpu_to_le16(0xffff)
+#define LOGFILE_NO_CLIENT       cpu_to_le16(0xffff)
 #define LOGFILE_NO_CLIENT_CPU   0xffff
 /*
@@ -112,8 +112,8 @@ typedef struct {
 * information about the log file in which they are present.
 */
 enum {
-        RESTART_VOLUME_IS_CLEAN = const_cpu_to_le16(0x0002),
+        RESTART_VOLUME_IS_CLEAN = cpu_to_le16(0x0002),
-        RESTART_SPACE_FILLER    = const_cpu_to_le16(0xffff), /* gcc: Force enum bit width to 16. */
+        RESTART_SPACE_FILLER    = cpu_to_le16(0xffff), /* gcc: Force enum bit width to 16. */
 } __attribute__ ((__packed__));
 typedef le16 RESTART_AREA_FLAGS;
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index 17d32ca6bc35..23bf68453d7d 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -2839,7 +2839,7 @@ int ntfs_extent_mft_record_free(ntfs_inode *ni, MFT_RECORD *m)
         */
        /* Mark the mft record as not in use. */
-        m->flags &= const_cpu_to_le16(~const_le16_to_cpu(MFT_RECORD_IN_USE));
+        m->flags &= ~MFT_RECORD_IN_USE;
        /* Increment the sequence number, skipping zero, if it is not zero. */
        old_seq_no = m->sequence_number;
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 4a46743b5077..f76951dcd4a6 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -618,7 +618,7 @@ static bool is_boot_sector_ntfs(const struct super_block *sb,
         * many BIOSes will refuse to boot from a bootsector if the magic is
         * incorrect, so we emit a warning.
         */
-        if (!silent && b->end_of_sector_marker != const_cpu_to_le16(0xaa55))
+        if (!silent && b->end_of_sector_marker != cpu_to_le16(0xaa55))
                ntfs_warning(sb, "Invalid end of sector marker.");
        return true;
 not_ntfs:
@@ -1242,13 +1242,13 @@ static int check_windows_hibernation_status(ntfs_volume *vol)
        u32 *kaddr, *kend;
        ntfs_name *name = NULL;
        int ret = 1;
-        static const ntfschar hiberfil[13] = { const_cpu_to_le16('h'),
+        static const ntfschar hiberfil[13] = { cpu_to_le16('h'),
-                        const_cpu_to_le16('i'), const_cpu_to_le16('b'),
+                        cpu_to_le16('i'), cpu_to_le16('b'),
-                        const_cpu_to_le16('e'), const_cpu_to_le16('r'),
+                        cpu_to_le16('e'), cpu_to_le16('r'),
-                        const_cpu_to_le16('f'), const_cpu_to_le16('i'),
+                        cpu_to_le16('f'), cpu_to_le16('i'),
-                        const_cpu_to_le16('l'), const_cpu_to_le16('.'),
+                        cpu_to_le16('l'), cpu_to_le16('.'),
-                        const_cpu_to_le16('s'), const_cpu_to_le16('y'),
+                        cpu_to_le16('s'), cpu_to_le16('y'),
-                        const_cpu_to_le16('s'), 0 };
+                        cpu_to_le16('s'), 0 };
        ntfs_debug("Entering.");
        /*
@@ -1296,7 +1296,7 @@ static int check_windows_hibernation_status(ntfs_volume *vol)
                goto iput_out;
        }
        kaddr = (u32*)page_address(page);
-        if (*(le32*)kaddr == const_cpu_to_le32(0x72626968)/*'hibr'*/) {
+        if (*(le32*)kaddr == cpu_to_le32(0x72626968)/*'hibr'*/) {
                ntfs_debug("Magic \"hibr\" found in hiberfil.sys.  Windows is "
                                "hibernated on the volume.  This is the "
                                "system volume.");
@@ -1337,12 +1337,12 @@ static bool load_and_init_quota(ntfs_volume *vol)
        MFT_REF mref;
        struct inode *tmp_ino;
        ntfs_name *name = NULL;
-        static const ntfschar Quota[7] = { const_cpu_to_le16('$'),
+        static const ntfschar Quota[7] = { cpu_to_le16('$'),
-                        const_cpu_to_le16('Q'), const_cpu_to_le16('u'),
+                        cpu_to_le16('Q'), cpu_to_le16('u'),
-                        const_cpu_to_le16('o'), const_cpu_to_le16('t'),
+                        cpu_to_le16('o'), cpu_to_le16('t'),
-                        const_cpu_to_le16('a'), 0 };
+                        cpu_to_le16('a'), 0 };
-        static ntfschar Q[3] = { const_cpu_to_le16('$'),
+        static ntfschar Q[3] = { cpu_to_le16('$'),
-                        const_cpu_to_le16('Q'), 0 };
+                        cpu_to_le16('Q'), 0 };
        ntfs_debug("Entering.");
        /*
@@ -1416,16 +1416,16 @@ static bool load_and_init_usnjrnl(ntfs_volume *vol)
        struct page *page;
        ntfs_name *name = NULL;
        USN_HEADER *uh;
-        static const ntfschar UsnJrnl[9] = { const_cpu_to_le16('$'),
+        static const ntfschar UsnJrnl[9] = { cpu_to_le16('$'),
-                        const_cpu_to_le16('U'), const_cpu_to_le16('s'),
+                        cpu_to_le16('U'), cpu_to_le16('s'),
-                        const_cpu_to_le16('n'), const_cpu_to_le16('J'),
+                        cpu_to_le16('n'), cpu_to_le16('J'),
-                        const_cpu_to_le16('r'), const_cpu_to_le16('n'),
+                        cpu_to_le16('r'), cpu_to_le16('n'),
-                        const_cpu_to_le16('l'), 0 };
+                        cpu_to_le16('l'), 0 };
-        static ntfschar Max[5] = { const_cpu_to_le16('$'),
+        static ntfschar Max[5] = { cpu_to_le16('$'),
-                        const_cpu_to_le16('M'), const_cpu_to_le16('a'),
+                        cpu_to_le16('M'), cpu_to_le16('a'),
-                        const_cpu_to_le16('x'), 0 };
+                        cpu_to_le16('x'), 0 };
-        static ntfschar J[3] = { const_cpu_to_le16('$'),
+        static ntfschar J[3] = { cpu_to_le16('$'),
-                        const_cpu_to_le16('J'), 0 };
+                        cpu_to_le16('J'), 0 };
        ntfs_debug("Entering.");
        /*
diff --git a/fs/ntfs/usnjrnl.h b/fs/ntfs/usnjrnl.h
index 4087fbdac327..00d8e6bd7c36 100644
--- a/fs/ntfs/usnjrnl.h
+++ b/fs/ntfs/usnjrnl.h
@@ -116,27 +116,27 @@ typedef struct {
 * documentation: http://www.linux-ntfs.org/
 */
 enum {
-        USN_REASON_DATA_OVERWRITE       = const_cpu_to_le32(0x00000001),
+        USN_REASON_DATA_OVERWRITE       = cpu_to_le32(0x00000001),
-        USN_REASON_DATA_EXTEND          = const_cpu_to_le32(0x00000002),
+        USN_REASON_DATA_EXTEND          = cpu_to_le32(0x00000002),
-        USN_REASON_DATA_TRUNCATION      = const_cpu_to_le32(0x00000004),
+        USN_REASON_DATA_TRUNCATION      = cpu_to_le32(0x00000004),
-        USN_REASON_NAMED_DATA_OVERWRITE = const_cpu_to_le32(0x00000010),
+        USN_REASON_NAMED_DATA_OVERWRITE = cpu_to_le32(0x00000010),
-        USN_REASON_NAMED_DATA_EXTEND    = const_cpu_to_le32(0x00000020),
+        USN_REASON_NAMED_DATA_EXTEND    = cpu_to_le32(0x00000020),
-        USN_REASON_NAMED_DATA_TRUNCATION= const_cpu_to_le32(0x00000040),
+        USN_REASON_NAMED_DATA_TRUNCATION= cpu_to_le32(0x00000040),
-        USN_REASON_FILE_CREATE          = const_cpu_to_le32(0x00000100),
+        USN_REASON_FILE_CREATE          = cpu_to_le32(0x00000100),
-        USN_REASON_FILE_DELETE          = const_cpu_to_le32(0x00000200),
+        USN_REASON_FILE_DELETE          = cpu_to_le32(0x00000200),
-        USN_REASON_EA_CHANGE            = const_cpu_to_le32(0x00000400),
+        USN_REASON_EA_CHANGE            = cpu_to_le32(0x00000400),
-        USN_REASON_SECURITY_CHANGE      = const_cpu_to_le32(0x00000800),
+        USN_REASON_SECURITY_CHANGE      = cpu_to_le32(0x00000800),
-        USN_REASON_RENAME_OLD_NAME      = const_cpu_to_le32(0x00001000),
+        USN_REASON_RENAME_OLD_NAME      = cpu_to_le32(0x00001000),
-        USN_REASON_RENAME_NEW_NAME      = const_cpu_to_le32(0x00002000),
+        USN_REASON_RENAME_NEW_NAME      = cpu_to_le32(0x00002000),
-        USN_REASON_INDEXABLE_CHANGE     = const_cpu_to_le32(0x00004000),
+        USN_REASON_INDEXABLE_CHANGE     = cpu_to_le32(0x00004000),
-        USN_REASON_BASIC_INFO_CHANGE    = const_cpu_to_le32(0x00008000),
+        USN_REASON_BASIC_INFO_CHANGE    = cpu_to_le32(0x00008000),
-        USN_REASON_HARD_LINK_CHANGE     = const_cpu_to_le32(0x00010000),
+        USN_REASON_HARD_LINK_CHANGE     = cpu_to_le32(0x00010000),
-        USN_REASON_COMPRESSION_CHANGE   = const_cpu_to_le32(0x00020000),
+        USN_REASON_COMPRESSION_CHANGE   = cpu_to_le32(0x00020000),
-        USN_REASON_ENCRYPTION_CHANGE    = const_cpu_to_le32(0x00040000),
+        USN_REASON_ENCRYPTION_CHANGE    = cpu_to_le32(0x00040000),
-        USN_REASON_OBJECT_ID_CHANGE     = const_cpu_to_le32(0x00080000),
+        USN_REASON_OBJECT_ID_CHANGE     = cpu_to_le32(0x00080000),
-        USN_REASON_REPARSE_POINT_CHANGE = const_cpu_to_le32(0x00100000),
+        USN_REASON_REPARSE_POINT_CHANGE = cpu_to_le32(0x00100000),
-        USN_REASON_STREAM_CHANGE        = const_cpu_to_le32(0x00200000),
+        USN_REASON_STREAM_CHANGE        = cpu_to_le32(0x00200000),
-        USN_REASON_CLOSE                = const_cpu_to_le32(0x80000000),
+        USN_REASON_CLOSE                = cpu_to_le32(0x80000000),
 };
 typedef le32 USN_REASON_FLAGS;
@@ -148,9 +148,9 @@ typedef le32 USN_REASON_FLAGS;
 *      http://www.linux-ntfs.org/
 */
 enum {
-        USN_SOURCE_DATA_MANAGEMENT        = const_cpu_to_le32(0x00000001),
+        USN_SOURCE_DATA_MANAGEMENT        = cpu_to_le32(0x00000001),
-        USN_SOURCE_AUXILIARY_DATA         = const_cpu_to_le32(0x00000002),
+        USN_SOURCE_AUXILIARY_DATA         = cpu_to_le32(0x00000002),
-        USN_SOURCE_REPLICATION_MANAGEMENT = const_cpu_to_le32(0x00000004),
+        USN_SOURCE_REPLICATION_MANAGEMENT = cpu_to_le32(0x00000004),
 };
 typedef le32 USN_SOURCE_INFO_FLAGS;
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index eea1d24713ea..b606496b72ec 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -154,8 +154,9 @@ out:
        return ret;
 }
-static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
+        struct page *page = vmf->page;
        struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
        struct buffer_head *di_bh = NULL;
        sigset_t blocked, oldset;
@@ -196,7 +197,8 @@ out:
        ret2 = ocfs2_vm_op_unblock_sigs(&oldset);
        if (ret2 < 0)
                mlog_errno(ret2);
+        if (ret)
+                ret = VM_FAULT_SIGBUS;
        return ret;
 }
diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c
index 4a9e0f65ae60..83adcc869437 100644
--- a/fs/proc/proc_tty.c
+++ b/fs/proc/proc_tty.c
@@ -144,16 +144,12 @@ void proc_tty_register_driver(struct tty_driver *driver)
 {
        struct proc_dir_entry *ent;
                
-        if (!driver->ops->read_proc || !driver->driver_name ||
+        if (!driver->driver_name || driver->proc_entry ||
-            driver->proc_entry)
+            !driver->ops->proc_fops)
                return;
-        ent = create_proc_entry(driver->driver_name, 0, proc_tty_driver);
+        ent = proc_create_data(driver->driver_name, 0, proc_tty_driver,
-        if (!ent)
+                               driver->ops->proc_fops, driver);
-                return;
-        ent->read_proc = driver->ops->read_proc;
-        ent->data = driver;
        driver->proc_entry = ent;
 }
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 995ef1d6686c..ebb2c417912c 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -59,7 +59,6 @@ const struct inode_operations ramfs_file_inode_operations = {
 */
 int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
 {
-        struct pagevec lru_pvec;
        unsigned long npages, xpages, loop, limit;
        struct page *pages;
        unsigned order;
@@ -102,24 +101,20 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
        memset(data, 0, newsize);
        /* attach all the pages to the inode's address space */
-        pagevec_init(&lru_pvec, 0);
        for (loop = 0; loop < npages; loop++) {
                struct page *page = pages + loop;
-                ret = add_to_page_cache(page, inode->i_mapping, loop, GFP_KERNEL);
+                ret = add_to_page_cache_lru(page, inode->i_mapping, loop,
+                                        GFP_KERNEL);
                if (ret < 0)
                        goto add_error;
-                if (!pagevec_add(&lru_pvec, page))
-                        __pagevec_lru_add_file(&lru_pvec);
                /* prevent the page from being discarded on memory pressure */
                SetPageDirty(page);
                unlock_page(page);
        }
-        pagevec_lru_add_file(&lru_pvec);
        return 0;
 fsize_exceeded:
@@ -128,10 +123,8 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize)
        return -EFBIG;
 add_error:
-        pagevec_lru_add_file(&lru_pvec);
+        while (loop < npages)
-        page_cache_release(pages + loop);
+                __free_page(pages + loop++);
-        for (loop++; loop < npages; loop++)
-                __free_page(pages + loop);
        return ret;
 }
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index b7e6ac706b87..a404fb88e456 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -33,12 +33,15 @@
 #include <linux/backing-dev.h>
 #include <linux/ramfs.h>
 #include <linux/sched.h>
+#include <linux/parser.h>
 #include <asm/uaccess.h>
 #include "internal.h"
 /* some random number */
 #define RAMFS_MAGIC     0x858458f6
+#define RAMFS_DEFAULT_MODE      0755
 static const struct super_operations ramfs_ops;
 static const struct inode_operations ramfs_dir_inode_operations;
@@ -158,12 +161,75 @@ static const struct inode_operations ramfs_dir_inode_operations = {
 static const struct super_operations ramfs_ops = {
        .statfs         = simple_statfs,
        .drop_inode     = generic_delete_inode,
+        .show_options   = generic_show_options,
+};
+struct ramfs_mount_opts {
+        umode_t mode;
+};
+enum {
+        Opt_mode,
+        Opt_err
+};
+static const match_table_t tokens = {
+        {Opt_mode, "mode=%o"},
+        {Opt_err, NULL}
+};
+struct ramfs_fs_info {
+        struct ramfs_mount_opts mount_opts;
 };
+static int ramfs_parse_options(char *data, struct ramfs_mount_opts *opts)
+{
+        substring_t args[MAX_OPT_ARGS];
+        int option;
+        int token;
+        char *p;
+        opts->mode = RAMFS_DEFAULT_MODE;
+        while ((p = strsep(&data, ",")) != NULL) {
+                if (!*p)
+                        continue;
+                token = match_token(p, tokens, args);
+                switch (token) {
+                case Opt_mode:
+                        if (match_octal(&args[0], &option))
+                                return -EINVAL;
+                        opts->mode = option & S_IALLUGO;
+                        break;
+                default:
+                        printk(KERN_ERR "ramfs: bad mount option: %s\n", p);
+                        return -EINVAL;
+                }
+        }
+        return 0;
+}
 static int ramfs_fill_super(struct super_block * sb, void * data, int silent)
 {
-        struct inode * inode;
+        struct ramfs_fs_info *fsi;
-        struct dentry * root;
+        struct inode *inode = NULL;
+        struct dentry *root;
+        int err;
+        save_mount_options(sb, data);
+        fsi = kzalloc(sizeof(struct ramfs_fs_info), GFP_KERNEL);
+        if (!fsi) {
+                err = -ENOMEM;
+                goto fail;
+        }
+        sb->s_fs_info = fsi;
+        err = ramfs_parse_options(data, &fsi->mount_opts);
+        if (err)
+                goto fail;
        sb->s_maxbytes = MAX_LFS_FILESIZE;
        sb->s_blocksize = PAGE_CACHE_SIZE;
@@ -171,17 +237,23 @@ static int ramfs_fill_super(struct super_block * sb, void * data, int silent)
        sb->s_magic = RAMFS_MAGIC;
        sb->s_op = &ramfs_ops;
        sb->s_time_gran = 1;
-        inode = ramfs_get_inode(sb, S_IFDIR | 0755, 0);
+        inode = ramfs_get_inode(sb, S_IFDIR | fsi->mount_opts.mode, 0);
-        if (!inode)
+        if (!inode) {
-                return -ENOMEM;
+                err = -ENOMEM;
+                goto fail;
+        }
        root = d_alloc_root(inode);
        if (!root) {
-                iput(inode);
+                err = -ENOMEM;
-                return -ENOMEM;
+                goto fail;
        }
        sb->s_root = root;
        return 0;
+fail:
+        kfree(fsi);
+        iput(inode);
+        return err;
 }
 int ramfs_get_sb(struct file_system_type *fs_type,
@@ -197,10 +269,16 @@ static int rootfs_get_sb(struct file_system_type *fs_type,
                            mnt);
 }
+static void ramfs_kill_sb(struct super_block *sb)
+{
+        kfree(sb->s_fs_info);
+        kill_litter_super(sb);
+}
 static struct file_system_type ramfs_fs_type = {
        .name           = "ramfs",
        .get_sb         = ramfs_get_sb,
-        .kill_sb        = kill_litter_super,
+        .kill_sb        = ramfs_kill_sb,
 };
 static struct file_system_type rootfs_fs_type = {
        .name           = "rootfs",
diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c
index 07703d3ff4a1..93e0c0281d45 100644
--- a/fs/sysfs/bin.c
+++ b/fs/sysfs/bin.c
@@ -234,7 +234,7 @@ static int bin_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
        return ret;
 }
-static int bin_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+static int bin_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
        struct file *file = vma->vm_file;
        struct bin_buffer *bb = file->private_data;
@@ -242,15 +242,15 @@ static int bin_page_mkwrite(struct vm_area_struct *vma, struct page *page)
        int ret;
        if (!bb->vm_ops)
-                return -EINVAL;
+                return VM_FAULT_SIGBUS;
        if (!bb->vm_ops->page_mkwrite)
                return 0;
        if (!sysfs_get_active_two(attr_sd))
-                return -EINVAL;
+                return VM_FAULT_SIGBUS;
-        ret = bb->vm_ops->page_mkwrite(vma, page);
+        ret = bb->vm_ops->page_mkwrite(vma, vmf);
        sysfs_put_active_two(attr_sd);
        return ret;
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 93b6de51f261..0ff89fe71e51 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1434,8 +1434,9 @@ static int ubifs_releasepage(struct page *page, gfp_t unused_gfp_flags)
 * mmap()d file has taken write protection fault and is being made
 * writable. UBIFS must ensure page is budgeted for.
 */
-static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
+        struct page *page = vmf->page;
        struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
        struct ubifs_info *c = inode->i_sb->s_fs_info;
        struct timespec now = ubifs_current_time(inode);
@@ -1447,7 +1448,7 @@ static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page)
        ubifs_assert(!(inode->i_sb->s_flags & MS_RDONLY));
        if (unlikely(c->ro_media))
-                return -EROFS;
+                return VM_FAULT_SIGBUS; /* -EROFS */
        /*
         * We have not locked @page so far so we may budget for changing the
@@ -1480,7 +1481,7 @@ static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page)
                if (err == -ENOSPC)
                        ubifs_warn("out of space for mmapped file "
                                   "(inode number %lu)", inode->i_ino);
-                return err;
+                return VM_FAULT_SIGBUS;
        }
        lock_page(page);
@@ -1520,6 +1521,8 @@ static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page)
 out_unlock:
        unlock_page(page);
        ubifs_release_budget(c, &req);
+        if (err)
+                err = VM_FAULT_SIGBUS;
        return err;
 }
diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c
index e14c4e3aea0c..f4e255441574 100644
--- a/fs/xfs/linux-2.6/xfs_file.c
+++ b/fs/xfs/linux-2.6/xfs_file.c
@@ -234,9 +234,9 @@ xfs_file_mmap(
 STATIC int
 xfs_vm_page_mkwrite(
        struct vm_area_struct   *vma,
-        struct page             *page)
+        struct vm_fault         *vmf)
 {
-        return block_page_mkwrite(vma, page, xfs_get_blocks);
+        return block_page_mkwrite(vma, vmf, xfs_get_blocks);
 }
 const struct file_operations xfs_file_operations = {
author	Felix Blyakher <felixb@sgi.com>	2009-04-01 17:58:39 -0400
committer	Felix Blyakher <felixb@sgi.com>	2009-04-01 17:58:39 -0400
commit	f36345ff9a4a77f2cc576a2777b6256d5c8798fa (patch)
tree	7ae4c607f6baae74060c2e385f744e171fbbf92b /fs
parent	1aacc064e029f0017384e463121b98f06d3a2cc3 (diff)
parent	8b53ef33d9d8fa5f771ae11cc6a6e7bc0182beec (diff)