Diffstat (limited to 'fs')
-rw-r--r--  fs/btrfs/Makefile            |    4
-rw-r--r--  fs/btrfs/acl.c               |   17
-rw-r--r--  fs/btrfs/btrfs_inode.h       |   17
-rw-r--r--  fs/btrfs/compression.c       |    3
-rw-r--r--  fs/btrfs/ctree.c             |   10
-rw-r--r--  fs/btrfs/ctree.h             |  198
-rw-r--r--  fs/btrfs/delayed-inode.c     |   50
-rw-r--r--  fs/btrfs/disk-io.c           |  430
-rw-r--r--  fs/btrfs/disk-io.h           |    4
-rw-r--r--  fs/btrfs/extent-tree.c       |  838
-rw-r--r--  fs/btrfs/extent_io.c         |  223
-rw-r--r--  fs/btrfs/extent_io.h         |   12
-rw-r--r--  fs/btrfs/file-item.c         |   17
-rw-r--r--  fs/btrfs/file.c              |   49
-rw-r--r--  fs/btrfs/free-space-cache.c  |  926
-rw-r--r--  fs/btrfs/inode-map.c         |    6
-rw-r--r--  fs/btrfs/inode.c             |  300
-rw-r--r--  fs/btrfs/ioctl.c             |   97
-rw-r--r--  fs/btrfs/print-tree.c        |    8
-rw-r--r--  fs/btrfs/reada.c             |  949
-rw-r--r--  fs/btrfs/relocation.c        |   24
-rw-r--r--  fs/btrfs/scrub.c             |  114
-rw-r--r--  fs/btrfs/super.c             |  298
-rw-r--r--  fs/btrfs/transaction.c       |  133
-rw-r--r--  fs/btrfs/tree-log.c          |   19
-rw-r--r--  fs/btrfs/volumes.c           |   77
-rw-r--r--  fs/btrfs/volumes.h           |    8
-rw-r--r--  fs/btrfs/xattr.c             |   11
-rw-r--r--  fs/cifs/cifsencrypt.c        |   54
-rw-r--r--  fs/cifs/cifsfs.c             |   10
-rw-r--r--  fs/cifs/cifssmb.c            |    3
-rw-r--r--  fs/cifs/connect.c            |    6
-rw-r--r--  fs/ext3/inode.c              |    4
-rw-r--r--  fs/ext3/namei.c              |    3
-rw-r--r--  fs/ext4/inode.c              |    4
-rw-r--r--  fs/ext4/namei.c              |    3
-rw-r--r--  fs/gfs2/log.c                |    4
-rw-r--r--  fs/gfs2/meta_io.c            |    6
-rw-r--r--  fs/gfs2/ops_fstype.c         |    2
-rw-r--r--  fs/gfs2/quota.c              |    2
-rw-r--r--  fs/hfsplus/super.c           |   15
-rw-r--r--  fs/hfsplus/wrapper.c         |    4
-rw-r--r--  fs/namei.c                   |   12
-rw-r--r--  fs/namespace.c               |    2
-rw-r--r--  fs/nfs/nfs4_fs.h             |    8
-rw-r--r--  fs/nfs/nfs4proc.c            |   20
-rw-r--r--  fs/nfs/nfs4renewd.c          |   12
-rw-r--r--  fs/nfs/nfs4state.c           |    6
-rw-r--r--  fs/nfs/super.c               |   25
-rw-r--r--  fs/nfs/write.c               |    2
-rw-r--r--  fs/proc/task_mmu.c           |   80
-rw-r--r--  fs/quota/quota.c             |    2
-rw-r--r--  fs/stat.c                    |    2
-rw-r--r--  fs/xfs/xfs_aops.c            |    3
-rw-r--r--  fs/xfs/xfs_buf_item.c        |    3
-rw-r--r--  fs/xfs/xfs_dquot_item.c      |   10
-rw-r--r--  fs/xfs/xfs_inode_item.c      |   10
-rw-r--r--  fs/xfs/xfs_linux.h           |    2
-rw-r--r--  fs/xfs/xfs_super.c           |   13
-rw-r--r--  fs/xfs/xfs_trans.h           |    2
-rw-r--r--  fs/xfs/xfs_trans_ail.c       |   83
-rw-r--r--  fs/xfs/xfs_trans_priv.h      |    8
62 files changed, 3745 insertions, 1522 deletions
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 89b6ce3634fd..c0ddfd29c5e5 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -7,7 +7,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
 	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
 	   export.o tree-log.o free-space-cache.o zlib.o lzo.o \
-	   compression.o delayed-ref.o relocation.o delayed-inode.o backref.o \
-	   scrub.o
+	   compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
+	   reada.o backref.o
 
 btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index eb159aaa5a11..89b156d85d63 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -59,22 +59,19 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
 		if (!value)
 			return ERR_PTR(-ENOMEM);
 		size = __btrfs_getxattr(inode, name, value, size);
-		if (size > 0) {
-			acl = posix_acl_from_xattr(value, size);
-			if (IS_ERR(acl)) {
-				kfree(value);
-				return acl;
-			}
-			set_cached_acl(inode, type, acl);
-		}
-		kfree(value);
+	}
+	if (size > 0) {
+		acl = posix_acl_from_xattr(value, size);
 	} else if (size == -ENOENT || size == -ENODATA || size == 0) {
 		/* FIXME, who returns -ENOENT? I think nobody */
 		acl = NULL;
-		set_cached_acl(inode, type, acl);
 	} else {
 		acl = ERR_PTR(-EIO);
 	}
+	kfree(value);
+
+	if (!IS_ERR(acl))
+		set_cached_acl(inode, type, acl);
 
 	return acl;
 }
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index d9f99a16edd6..5a5d325a3935 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -103,11 +103,6 @@ struct btrfs_inode {
 	 */
 	u64 delalloc_bytes;
 
-	/* total number of bytes that may be used for this inode for
-	 * delalloc
-	 */
-	u64 reserved_bytes;
-
 	/*
 	 * the size of the file stored in the metadata on disk. data=ordered
 	 * means the in-memory i_size might be larger than the size on disk
@@ -115,9 +110,6 @@ struct btrfs_inode {
 	 */
 	u64 disk_i_size;
 
-	/* flags field from the on disk inode */
-	u32 flags;
-
 	/*
 	 * if this is a directory then index_cnt is the counter for the index
 	 * number for new files that are created
@@ -132,6 +124,15 @@ struct btrfs_inode {
 	u64 last_unlink_trans;
 
 	/*
+	 * Number of bytes outstanding that are going to need csums. This is
+	 * used in ENOSPC accounting.
+	 */
+	u64 csum_bytes;
+
+	/* flags field from the on disk inode */
+	u32 flags;
+
+	/*
 	 * Counters to keep track of the number of extent item's we may use due
 	 * to delalloc and such. outstanding_extents is the number of extent
 	 * items we think we'll end up using, and reserved_extents is the number
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 8ec5d86f1734..14f1c5a0b2d2 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -85,7 +85,8 @@ struct compressed_bio {
 static inline int compressed_bio_size(struct btrfs_root *root,
 				      unsigned long disk_size)
 {
-	u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy);
+	u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
+
 	return sizeof(struct compressed_bio) +
 		((disk_size + root->sectorsize - 1) / root->sectorsize) *
 		csum_size;
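
Note: the sizing above rounds disk_size up to whole sectors and reserves csum_size bytes of checksum space per sector. A standalone sketch of the same arithmetic with made-up example values (user-space C, not the kernel code):

#include <stdio.h>

/* ceil(disk_size / sectorsize) sectors, csum_size checksum bytes each */
static unsigned long csum_space(unsigned long disk_size,
				unsigned long sectorsize,
				unsigned int csum_size)
{
	return ((disk_size + sectorsize - 1) / sectorsize) * csum_size;
}

int main(void)
{
	/* hypothetical values: 10000 bytes over 4K sectors, 4-byte csums */
	printf("%lu\n", csum_space(10000, 4096, 4));	/* 3 sectors -> 12 */
	return 0;
}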
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 011cab3aca8d..0fe615e4ea38 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -902,9 +902,10 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 
 	orig_ptr = btrfs_node_blockptr(mid, orig_slot);
 
-	if (level < BTRFS_MAX_LEVEL - 1)
+	if (level < BTRFS_MAX_LEVEL - 1) {
 		parent = path->nodes[level + 1];
-	pslot = path->slots[level + 1];
+		pslot = path->slots[level + 1];
+	}
 
 	/*
 	 * deal with the case where there is only one pointer in the root
@@ -1107,9 +1108,10 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
 	mid = path->nodes[level];
 	WARN_ON(btrfs_header_generation(mid) != trans->transid);
 
-	if (level < BTRFS_MAX_LEVEL - 1)
+	if (level < BTRFS_MAX_LEVEL - 1) {
 		parent = path->nodes[level + 1];
-	pslot = path->slots[level + 1];
+		pslot = path->slots[level + 1];
+	}
 
 	if (!parent)
 		return 1;
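
Both hunks close the same hole: pslot was previously assigned unconditionally, so at the top level (level == BTRFS_MAX_LEVEL - 1, where no parent exists) it read path->slots[] one past the last valid entry. A reduced illustration of the guarded form the fix adopts (hypothetical, simplified types):

#include <stddef.h>

#define MAX_LEVEL 8

struct path { void *nodes[MAX_LEVEL]; int slots[MAX_LEVEL]; };

/* read the parent node and slot only when a parent level actually exists */
static void get_parent(struct path *p, int level, void **parent, int *pslot)
{
	*parent = NULL;
	*pslot = 0;
	if (level < MAX_LEVEL - 1) {	/* braces keep both reads guarded */
		*parent = p->nodes[level + 1];
		*pslot = p->slots[level + 1];
	}
}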
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 03912c5c6f49..b9ba59ff9292 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -30,6 +30,7 @@
 #include <linux/kobject.h>
 #include <trace/events/btrfs.h>
 #include <asm/kmap_types.h>
+#include <linux/pagemap.h>
 #include "extent_io.h"
 #include "extent_map.h"
 #include "async-thread.h"
@@ -360,6 +361,47 @@ struct btrfs_header {
 #define BTRFS_LABEL_SIZE 256
 
 /*
+ * just in case we somehow lose the roots and are not able to mount,
+ * we store an array of the roots from previous transactions
+ * in the super.
+ */
+#define BTRFS_NUM_BACKUP_ROOTS 4
+struct btrfs_root_backup {
+	__le64 tree_root;
+	__le64 tree_root_gen;
+
+	__le64 chunk_root;
+	__le64 chunk_root_gen;
+
+	__le64 extent_root;
+	__le64 extent_root_gen;
+
+	__le64 fs_root;
+	__le64 fs_root_gen;
+
+	__le64 dev_root;
+	__le64 dev_root_gen;
+
+	__le64 csum_root;
+	__le64 csum_root_gen;
+
+	__le64 total_bytes;
+	__le64 bytes_used;
+	__le64 num_devices;
+	/* future */
+	__le64 unsed_64[4];
+
+	u8 tree_root_level;
+	u8 chunk_root_level;
+	u8 extent_root_level;
+	u8 fs_root_level;
+	u8 dev_root_level;
+	u8 csum_root_level;
+	/* future and to align */
+	u8 unused_8[10];
+} __attribute__ ((__packed__));
+
+/*
  * the super block basically lists the main trees of the FS
  * it currently lacks any block count etc etc
  */
@@ -405,6 +447,7 @@ struct btrfs_super_block {
 	/* future expansion */
 	__le64 reserved[31];
 	u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
+	struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS];
 } __attribute__ ((__packed__));
 
 /*
@@ -772,14 +815,8 @@ struct btrfs_space_info {
 struct btrfs_block_rsv {
 	u64 size;
 	u64 reserved;
-	u64 freed[2];
 	struct btrfs_space_info *space_info;
-	struct list_head list;
 	spinlock_t lock;
-	atomic_t usage;
-	unsigned int priority:8;
-	unsigned int durable:1;
-	unsigned int refill_used:1;
 	unsigned int full:1;
 };
 
@@ -840,10 +877,10 @@ struct btrfs_block_group_cache {
 	spinlock_t lock;
 	u64 pinned;
 	u64 reserved;
-	u64 reserved_pinned;
 	u64 bytes_super;
 	u64 flags;
 	u64 sectorsize;
+	u64 cache_generation;
 	unsigned int ro:1;
 	unsigned int dirty:1;
 	unsigned int iref:1;
@@ -899,6 +936,10 @@ struct btrfs_fs_info {
 	spinlock_t block_group_cache_lock;
 	struct rb_root block_group_cache_tree;
 
+	/* keep track of unallocated space */
+	spinlock_t free_chunk_lock;
+	u64 free_chunk_space;
+
 	struct extent_io_tree freed_extents[2];
 	struct extent_io_tree *pinned_extents;
 
@@ -916,14 +957,11 @@ struct btrfs_fs_info {
 	struct btrfs_block_rsv trans_block_rsv;
 	/* block reservation for chunk tree */
 	struct btrfs_block_rsv chunk_block_rsv;
+	/* block reservation for delayed operations */
+	struct btrfs_block_rsv delayed_block_rsv;
 
 	struct btrfs_block_rsv empty_block_rsv;
 
-	/* list of block reservations that cross multiple transactions */
-	struct list_head durable_block_rsv_list;
-
-	struct mutex durable_block_rsv_mutex;
-
 	u64 generation;
 	u64 last_trans_committed;
 
@@ -942,8 +980,8 @@ struct btrfs_fs_info {
 	wait_queue_head_t transaction_blocked_wait;
 	wait_queue_head_t async_submit_wait;
 
-	struct btrfs_super_block super_copy;
-	struct btrfs_super_block super_for_commit;
+	struct btrfs_super_block *super_copy;
+	struct btrfs_super_block *super_for_commit;
 	struct block_device *__bdev;
 	struct super_block *sb;
 	struct inode *btree_inode;
@@ -1036,6 +1074,7 @@ struct btrfs_fs_info {
 	struct btrfs_workers endio_freespace_worker;
 	struct btrfs_workers submit_workers;
 	struct btrfs_workers caching_workers;
+	struct btrfs_workers readahead_workers;
 
 	/*
 	 * fixup workers take dirty pages that didn't properly go through
@@ -1119,6 +1158,13 @@ struct btrfs_fs_info {
 	u64 fs_state;
 
 	struct btrfs_delayed_root *delayed_root;
+
+	/* readahead tree */
+	spinlock_t reada_lock;
+	struct radix_tree_root reada_tree;
+
+	/* next backup root to be overwritten */
+	int backup_root_index;
 };
 
 /*
@@ -1363,6 +1409,7 @@ struct btrfs_ioctl_defrag_range_args {
 #define BTRFS_MOUNT_ENOSPC_DEBUG	(1 << 15)
 #define BTRFS_MOUNT_AUTO_DEFRAG		(1 << 16)
 #define BTRFS_MOUNT_INODE_MAP_CACHE	(1 << 17)
+#define BTRFS_MOUNT_RECOVERY		(1 << 18)
 
 #define btrfs_clear_opt(o, opt)		((o) &= ~BTRFS_MOUNT_##opt)
 #define btrfs_set_opt(o, opt)		((o) |= BTRFS_MOUNT_##opt)
@@ -1978,6 +2025,55 @@ static inline bool btrfs_root_readonly(struct btrfs_root *root)
 	return root->root_item.flags & BTRFS_ROOT_SUBVOL_RDONLY;
 }
 
+/* struct btrfs_root_backup */
+BTRFS_SETGET_STACK_FUNCS(backup_tree_root, struct btrfs_root_backup,
+		   tree_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_tree_root_gen, struct btrfs_root_backup,
+		   tree_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_tree_root_level, struct btrfs_root_backup,
+		   tree_root_level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(backup_chunk_root, struct btrfs_root_backup,
+		   chunk_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_gen, struct btrfs_root_backup,
+		   chunk_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_level, struct btrfs_root_backup,
+		   chunk_root_level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(backup_extent_root, struct btrfs_root_backup,
+		   extent_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_extent_root_gen, struct btrfs_root_backup,
+		   extent_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_extent_root_level, struct btrfs_root_backup,
+		   extent_root_level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(backup_fs_root, struct btrfs_root_backup,
+		   fs_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_fs_root_gen, struct btrfs_root_backup,
+		   fs_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_fs_root_level, struct btrfs_root_backup,
+		   fs_root_level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(backup_dev_root, struct btrfs_root_backup,
+		   dev_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_dev_root_gen, struct btrfs_root_backup,
+		   dev_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_dev_root_level, struct btrfs_root_backup,
+		   dev_root_level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(backup_csum_root, struct btrfs_root_backup,
+		   csum_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_csum_root_gen, struct btrfs_root_backup,
+		   csum_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_csum_root_level, struct btrfs_root_backup,
+		   csum_root_level, 8);
+BTRFS_SETGET_STACK_FUNCS(backup_total_bytes, struct btrfs_root_backup,
+		   total_bytes, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup,
+		   bytes_used, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup,
+		   num_devices, 64);
+
 /* struct btrfs_super_block */
 
 BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
@@ -2129,6 +2225,11 @@ static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info)
 		(space_info->flags & BTRFS_BLOCK_GROUP_DATA));
 }
 
+static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping)
+{
+	return mapping_gfp_mask(mapping) & ~__GFP_FS;
+}
+
 /* extent-tree.c */
 static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root,
 						 unsigned num_items)
@@ -2137,6 +2238,17 @@ static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root,
 		3 * num_items;
 }
 
+/*
+ * Doing a truncate won't result in new nodes or leaves, just what we need for
+ * COW.
+ */
+static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_root *root,
+						 unsigned num_items)
+{
+	return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) *
+		num_items;
+}
+
 void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root, unsigned long count);
@@ -2146,6 +2258,9 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
 			     u64 num_bytes, u64 *refs, u64 *flags);
 int btrfs_pin_extent(struct btrfs_root *root,
 		     u64 bytenr, u64 num, int reserved);
+int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
+				    struct btrfs_root *root,
+				    u64 bytenr, u64 num_bytes);
 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root,
 			  u64 objectid, u64 offset, u64 bytenr);
@@ -2196,8 +2311,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
 		      u64 root_objectid, u64 owner, u64 offset);
 
 int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
-int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
-				u64 num_bytes, int reserve, int sinfo);
+int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
+				       u64 start, u64 len);
 int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root);
 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
@@ -2240,25 +2355,23 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv);
 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root);
 void btrfs_free_block_rsv(struct btrfs_root *root,
 			  struct btrfs_block_rsv *rsv);
-void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
-				 struct btrfs_block_rsv *rsv);
-int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
-			struct btrfs_root *root,
+int btrfs_block_rsv_add(struct btrfs_root *root,
 			struct btrfs_block_rsv *block_rsv,
 			u64 num_bytes);
-int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
-			  struct btrfs_root *root,
+int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
+				struct btrfs_block_rsv *block_rsv,
+				u64 num_bytes);
+int btrfs_block_rsv_check(struct btrfs_root *root,
+			  struct btrfs_block_rsv *block_rsv, int min_factor);
+int btrfs_block_rsv_refill(struct btrfs_root *root,
 			  struct btrfs_block_rsv *block_rsv,
-			  u64 min_reserved, int min_factor);
+			  u64 min_reserved);
 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
 			    struct btrfs_block_rsv *dst_rsv,
 			    u64 num_bytes);
 void btrfs_block_rsv_release(struct btrfs_root *root,
 			     struct btrfs_block_rsv *block_rsv,
 			     u64 num_bytes);
-int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans,
-				    struct btrfs_root *root,
-				    struct btrfs_block_rsv *rsv);
 int btrfs_set_block_group_ro(struct btrfs_root *root,
 			     struct btrfs_block_group_cache *cache);
 int btrfs_set_block_group_rw(struct btrfs_root *root,
@@ -2379,6 +2492,18 @@ static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info)
 	smp_mb();
 	return fs_info->closing;
 }
+static inline void free_fs_info(struct btrfs_fs_info *fs_info)
+{
+	kfree(fs_info->delayed_root);
+	kfree(fs_info->extent_root);
+	kfree(fs_info->tree_root);
+	kfree(fs_info->chunk_root);
+	kfree(fs_info->dev_root);
+	kfree(fs_info->csum_root);
+	kfree(fs_info->super_copy);
+	kfree(fs_info->super_for_commit);
+	kfree(fs_info);
+}
 
 /* root-item.c */
 int btrfs_find_root_ref(struct btrfs_root *tree_root,
@@ -2579,11 +2704,6 @@ int btrfs_update_inode(struct btrfs_trans_handle *trans,
 int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode);
 int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
 int btrfs_orphan_cleanup(struct btrfs_root *root);
-void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
-			       struct btrfs_pending_snapshot *pending,
-			       u64 *bytes_to_reserve);
-void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
-				struct btrfs_pending_snapshot *pending);
 void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
 			      struct btrfs_root *root);
 int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size);
@@ -2697,4 +2817,20 @@ int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid);
 int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
 			 struct btrfs_scrub_progress *progress);
 
+/* reada.c */
+struct reada_control {
+	struct btrfs_root	*root;		/* tree to prefetch */
+	struct btrfs_key	key_start;
+	struct btrfs_key	key_end;	/* exclusive */
+	atomic_t		elems;
+	struct kref		refcnt;
+	wait_queue_head_t	wait;
+};
+struct reada_control *btrfs_reada_add(struct btrfs_root *root,
+			      struct btrfs_key *start, struct btrfs_key *end);
+int btrfs_reada_wait(void *handle);
+void btrfs_reada_detach(void *handle);
+int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
+			 u64 start, int err);
+
 #endif
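
The backup_root_index field added to btrfs_fs_info above is a cursor into the BTRFS_NUM_BACKUP_ROOTS-entry super_roots ring: each commit overwrites the slot after the newest one, and recovery walks backwards from the newest. A small standalone model of that modular index arithmetic (illustrative only; the kernel versions live in disk-io.c below):

#include <stdio.h>

#define NUM_BACKUP_ROOTS 4	/* mirrors BTRFS_NUM_BACKUP_ROOTS */

static int next_slot(int newest)	/* slot the next commit overwrites */
{
	return (newest + 1) % NUM_BACKUP_ROOTS;
}

static int prev_slot(int slot)		/* next-older slot, as recovery retries use */
{
	return (slot + NUM_BACKUP_ROOTS - 1) % NUM_BACKUP_ROOTS;
}

int main(void)
{
	int s = 3;	/* pretend slot 3 holds the newest backup */

	printf("commit writes slot %d\n", next_slot(s));
	for (int i = 0; i < NUM_BACKUP_ROOTS; i++, s = prev_slot(s))
		printf("recovery try %d uses slot %d\n", i, s);
	return 0;
}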
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index b52c672f4c18..bbe8496d5339 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -591,7 +591,7 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
 		return 0;
 
 	src_rsv = trans->block_rsv;
-	dst_rsv = &root->fs_info->global_block_rsv;
+	dst_rsv = &root->fs_info->delayed_block_rsv;
 
 	num_bytes = btrfs_calc_trans_metadata_size(root, 1);
 	ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
@@ -609,7 +609,7 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
 	if (!item->bytes_reserved)
 		return;
 
-	rsv = &root->fs_info->global_block_rsv;
+	rsv = &root->fs_info->delayed_block_rsv;
 	btrfs_block_rsv_release(root, rsv,
 				item->bytes_reserved);
 }
@@ -624,13 +624,36 @@ static int btrfs_delayed_inode_reserve_metadata(
 	u64 num_bytes;
 	int ret;
 
-	if (!trans->bytes_reserved)
-		return 0;
-
 	src_rsv = trans->block_rsv;
-	dst_rsv = &root->fs_info->global_block_rsv;
+	dst_rsv = &root->fs_info->delayed_block_rsv;
 
 	num_bytes = btrfs_calc_trans_metadata_size(root, 1);
+
+	/*
+	 * btrfs_dirty_inode will update the inode under btrfs_join_transaction
+	 * which doesn't reserve space for speed. This is a problem since we
+	 * still need to reserve space for this update, so try to reserve the
+	 * space.
+	 *
+	 * Now if src_rsv == delalloc_block_rsv we'll let it just steal since
+	 * we're accounted for.
+	 */
+	if (!trans->bytes_reserved &&
+	    src_rsv != &root->fs_info->delalloc_block_rsv) {
+		ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes);
+		/*
+		 * Since we're under a transaction reserve_metadata_bytes could
+		 * try to commit the transaction which will make it return
+		 * EAGAIN to make us stop the transaction we have, so return
+		 * ENOSPC instead so that btrfs_dirty_inode knows what to do.
+		 */
+		if (ret == -EAGAIN)
+			ret = -ENOSPC;
+		if (!ret)
+			node->bytes_reserved = num_bytes;
+		return ret;
+	}
+
 	ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
 	if (!ret)
 		node->bytes_reserved = num_bytes;
@@ -646,7 +669,7 @@ static void btrfs_delayed_inode_release_metadata(struct btrfs_root *root,
 	if (!node->bytes_reserved)
 		return;
 
-	rsv = &root->fs_info->global_block_rsv;
+	rsv = &root->fs_info->delayed_block_rsv;
 	btrfs_block_rsv_release(root, rsv,
 				node->bytes_reserved);
 	node->bytes_reserved = 0;
@@ -1026,7 +1049,7 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
 	path->leave_spinning = 1;
 
 	block_rsv = trans->block_rsv;
-	trans->block_rsv = &root->fs_info->global_block_rsv;
+	trans->block_rsv = &root->fs_info->delayed_block_rsv;
 
 	delayed_root = btrfs_get_delayed_root(root);
 
@@ -1069,7 +1092,7 @@ static int __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
 	path->leave_spinning = 1;
 
 	block_rsv = trans->block_rsv;
-	trans->block_rsv = &node->root->fs_info->global_block_rsv;
+	trans->block_rsv = &node->root->fs_info->delayed_block_rsv;
 
 	ret = btrfs_insert_delayed_items(trans, path, node->root, node);
 	if (!ret)
@@ -1149,7 +1172,7 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work)
 		goto free_path;
 
 	block_rsv = trans->block_rsv;
-	trans->block_rsv = &root->fs_info->global_block_rsv;
+	trans->block_rsv = &root->fs_info->delayed_block_rsv;
 
 	ret = btrfs_insert_delayed_items(trans, path, root, delayed_node);
 	if (!ret)
@@ -1686,11 +1709,8 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
 	}
 
 	ret = btrfs_delayed_inode_reserve_metadata(trans, root, delayed_node);
-	/*
-	 * we must reserve enough space when we start a new transaction,
-	 * so reserving metadata failure is impossible
-	 */
-	BUG_ON(ret);
+	if (ret)
+		goto release_node;
 
 	fill_stack_inode_item(trans, &delayed_node->inode_item, inode);
 	delayed_node->inode_dirty = 1;
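
The delayed-inode hunks above all follow one pattern: metadata space now comes from the dedicated delayed_block_rsv, filled either by migrating bytes already reserved in the transaction's rsv or, on the unreserved btrfs_dirty_inode path, by a fresh no-flush reservation whose EAGAIN is reported as ENOSPC. A toy model of the migrate step only (plain C with a simplified rsv struct, not the kernel API):

#include <errno.h>
#include <stdio.h>

struct rsv { long reserved; };

/* move already-reserved bytes from src to dst, or fail with no side effects */
static int rsv_migrate(struct rsv *src, struct rsv *dst, long bytes)
{
	if (src->reserved < bytes)
		return -ENOSPC;
	src->reserved -= bytes;
	dst->reserved += bytes;
	return 0;
}

int main(void)
{
	struct rsv trans_rsv = { 4096 }, delayed_rsv = { 0 };

	if (rsv_migrate(&trans_rsv, &delayed_rsv, 4096) == 0)
		printf("delayed rsv holds %ld bytes\n", delayed_rsv.reserved);
	return 0;
}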
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index dc0343802535..0eb1f0951251 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -256,8 +256,7 @@ void btrfs_csum_final(u32 crc, char *result)
 static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
 			   int verify)
 {
-	u16 csum_size =
-		btrfs_super_csum_size(&root->fs_info->super_copy);
+	u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
 	char *result = NULL;
 	unsigned long len;
 	unsigned long cur_len;
@@ -367,7 +366,8 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
 	clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
 	io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
 	while (1) {
-		ret = read_extent_buffer_pages(io_tree, eb, start, 1,
+		ret = read_extent_buffer_pages(io_tree, eb, start,
+					       WAIT_COMPLETE,
 					       btree_get_extent, mirror_num);
 		if (!ret &&
 		    !verify_parent_transid(io_tree, eb, parent_transid))
@@ -608,11 +608,47 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 	end = min_t(u64, eb->len, PAGE_CACHE_SIZE);
 	end = eb->start + end - 1;
 err:
+	if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) {
+		clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags);
+		btree_readahead_hook(root, eb, eb->start, ret);
+	}
+
 	free_extent_buffer(eb);
 out:
 	return ret;
 }
 
+static int btree_io_failed_hook(struct bio *failed_bio,
+			 struct page *page, u64 start, u64 end,
+			 u64 mirror_num, struct extent_state *state)
+{
+	struct extent_io_tree *tree;
+	unsigned long len;
+	struct extent_buffer *eb;
+	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
+
+	tree = &BTRFS_I(page->mapping->host)->io_tree;
+	if (page->private == EXTENT_PAGE_PRIVATE)
+		goto out;
+	if (!page->private)
+		goto out;
+
+	len = page->private >> 2;
+	WARN_ON(len == 0);
+
+	eb = alloc_extent_buffer(tree, start, len, page);
+	if (eb == NULL)
+		goto out;
+
+	if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) {
+		clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags);
+		btree_readahead_hook(root, eb, eb->start, -EIO);
+	}
+
+out:
+	return -EIO;	/* we fixed nothing */
+}
+
 static void end_workqueue_bio(struct bio *bio, int err)
 {
 	struct end_io_wq *end_io_wq = bio->bi_private;
@@ -974,11 +1010,43 @@ int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
 	if (!buf)
 		return 0;
 	read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
-				 buf, 0, 0, btree_get_extent, 0);
+				 buf, 0, WAIT_NONE, btree_get_extent, 0);
 	free_extent_buffer(buf);
 	return ret;
 }
 
+int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
+			 int mirror_num, struct extent_buffer **eb)
+{
+	struct extent_buffer *buf = NULL;
+	struct inode *btree_inode = root->fs_info->btree_inode;
+	struct extent_io_tree *io_tree = &BTRFS_I(btree_inode)->io_tree;
+	int ret;
+
+	buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
+	if (!buf)
+		return 0;
+
+	set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags);
+
+	ret = read_extent_buffer_pages(io_tree, buf, 0, WAIT_PAGE_LOCK,
+				       btree_get_extent, mirror_num);
+	if (ret) {
+		free_extent_buffer(buf);
+		return ret;
+	}
+
+	if (test_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags)) {
+		free_extent_buffer(buf);
+		return -EIO;
+	} else if (extent_buffer_uptodate(io_tree, buf, NULL)) {
+		*eb = buf;
+	} else {
+		free_extent_buffer(buf);
+	}
+	return 0;
+}
+
 struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
 					    u64 bytenr, u32 blocksize)
 {
@@ -1135,10 +1203,12 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
 
 	generation = btrfs_root_generation(&root->root_item);
 	blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
+	root->commit_root = NULL;
 	root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
 				     blocksize, generation);
 	if (!root->node || !btrfs_buffer_uptodate(root->node, generation)) {
 		free_extent_buffer(root->node);
+		root->node = NULL;
 		return -EIO;
 	}
 	root->commit_root = btrfs_root_node(root);
@@ -1577,6 +1647,228 @@ sleep:
 	return 0;
 }
 
+/*
+ * this will find the highest generation in the array of
+ * root backups. The index of the highest array is returned,
+ * or -1 if we can't find anything.
+ *
+ * We check to make sure the array is valid by comparing the
+ * generation of the latest root in the array with the generation
+ * in the super block. If they don't match we pitch it.
+ */
+static int find_newest_super_backup(struct btrfs_fs_info *info, u64 newest_gen)
+{
+	u64 cur;
+	int newest_index = -1;
+	struct btrfs_root_backup *root_backup;
+	int i;
+
+	for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
+		root_backup = info->super_copy->super_roots + i;
+		cur = btrfs_backup_tree_root_gen(root_backup);
+		if (cur == newest_gen)
+			newest_index = i;
+	}
+
+	/* check to see if we actually wrapped around */
+	if (newest_index == BTRFS_NUM_BACKUP_ROOTS - 1) {
+		root_backup = info->super_copy->super_roots;
+		cur = btrfs_backup_tree_root_gen(root_backup);
+		if (cur == newest_gen)
+			newest_index = 0;
+	}
+	return newest_index;
+}
+
+
+/*
+ * find the oldest backup so we know where to store new entries
+ * in the backup array. This will set the backup_root_index
+ * field in the fs_info struct
+ */
+static void find_oldest_super_backup(struct btrfs_fs_info *info,
+				     u64 newest_gen)
+{
+	int newest_index = -1;
+
+	newest_index = find_newest_super_backup(info, newest_gen);
+	/* if there was garbage in there, just move along */
+	if (newest_index == -1) {
+		info->backup_root_index = 0;
+	} else {
+		info->backup_root_index = (newest_index + 1) % BTRFS_NUM_BACKUP_ROOTS;
+	}
+}
+
+/*
+ * copy all the root pointers into the super backup array.
+ * this will bump the backup pointer by one when it is
+ * done
+ */
+static void backup_super_roots(struct btrfs_fs_info *info)
+{
+	int next_backup;
+	struct btrfs_root_backup *root_backup;
+	int last_backup;
+
+	next_backup = info->backup_root_index;
+	last_backup = (next_backup + BTRFS_NUM_BACKUP_ROOTS - 1) %
+		BTRFS_NUM_BACKUP_ROOTS;
+
+	/*
+	 * just overwrite the last backup if we're at the same generation
+	 * this happens only at umount
+	 */
+	root_backup = info->super_for_commit->super_roots + last_backup;
+	if (btrfs_backup_tree_root_gen(root_backup) ==
+	    btrfs_header_generation(info->tree_root->node))
+		next_backup = last_backup;
+
+	root_backup = info->super_for_commit->super_roots + next_backup;
+
+	/*
+	 * make sure all of our padding and empty slots get zero filled
+	 * regardless of which ones we use today
+	 */
+	memset(root_backup, 0, sizeof(*root_backup));
+
+	info->backup_root_index = (next_backup + 1) % BTRFS_NUM_BACKUP_ROOTS;
+
+	btrfs_set_backup_tree_root(root_backup, info->tree_root->node->start);
+	btrfs_set_backup_tree_root_gen(root_backup,
+			btrfs_header_generation(info->tree_root->node));
+
+	btrfs_set_backup_tree_root_level(root_backup,
+			btrfs_header_level(info->tree_root->node));
+
+	btrfs_set_backup_chunk_root(root_backup, info->chunk_root->node->start);
+	btrfs_set_backup_chunk_root_gen(root_backup,
+			btrfs_header_generation(info->chunk_root->node));
+	btrfs_set_backup_chunk_root_level(root_backup,
+			btrfs_header_level(info->chunk_root->node));
+
+	btrfs_set_backup_extent_root(root_backup, info->extent_root->node->start);
+	btrfs_set_backup_extent_root_gen(root_backup,
+			btrfs_header_generation(info->extent_root->node));
+	btrfs_set_backup_extent_root_level(root_backup,
+			btrfs_header_level(info->extent_root->node));
+
+	btrfs_set_backup_fs_root(root_backup, info->fs_root->node->start);
+	btrfs_set_backup_fs_root_gen(root_backup,
+			btrfs_header_generation(info->fs_root->node));
+	btrfs_set_backup_fs_root_level(root_backup,
+			btrfs_header_level(info->fs_root->node));
+
+	btrfs_set_backup_dev_root(root_backup, info->dev_root->node->start);
+	btrfs_set_backup_dev_root_gen(root_backup,
+			btrfs_header_generation(info->dev_root->node));
+	btrfs_set_backup_dev_root_level(root_backup,
+			btrfs_header_level(info->dev_root->node));
+
+	btrfs_set_backup_csum_root(root_backup, info->csum_root->node->start);
+	btrfs_set_backup_csum_root_gen(root_backup,
+			btrfs_header_generation(info->csum_root->node));
+	btrfs_set_backup_csum_root_level(root_backup,
+			btrfs_header_level(info->csum_root->node));
+
+	btrfs_set_backup_total_bytes(root_backup,
+			btrfs_super_total_bytes(info->super_copy));
+	btrfs_set_backup_bytes_used(root_backup,
+			btrfs_super_bytes_used(info->super_copy));
+	btrfs_set_backup_num_devices(root_backup,
+			btrfs_super_num_devices(info->super_copy));
+
+	/*
+	 * if we don't copy this out to the super_copy, it won't get remembered
+	 * for the next commit
+	 */
+	memcpy(&info->super_copy->super_roots,
+	       &info->super_for_commit->super_roots,
+	       sizeof(*root_backup) * BTRFS_NUM_BACKUP_ROOTS);
+}
+
+/*
+ * this copies info out of the root backup array and back into
+ * the in-memory super block. It is meant to help iterate through
+ * the array, so you send it the number of backups you've already
+ * tried and the last backup index you used.
+ *
+ * this returns -1 when it has tried all the backups
+ */
+static noinline int next_root_backup(struct btrfs_fs_info *info,
+				     struct btrfs_super_block *super,
+				     int *num_backups_tried, int *backup_index)
+{
+	struct btrfs_root_backup *root_backup;
+	int newest = *backup_index;
+
+	if (*num_backups_tried == 0) {
+		u64 gen = btrfs_super_generation(super);
+
+		newest = find_newest_super_backup(info, gen);
+		if (newest == -1)
+			return -1;
+
+		*backup_index = newest;
+		*num_backups_tried = 1;
+	} else if (*num_backups_tried == BTRFS_NUM_BACKUP_ROOTS) {
+		/* we've tried all the backups, all done */
+		return -1;
+	} else {
+		/* jump to the next oldest backup */
+		newest = (*backup_index + BTRFS_NUM_BACKUP_ROOTS - 1) %
+			BTRFS_NUM_BACKUP_ROOTS;
+		*backup_index = newest;
+		*num_backups_tried += 1;
+	}
+	root_backup = super->super_roots + newest;
+
+	btrfs_set_super_generation(super,
+			btrfs_backup_tree_root_gen(root_backup));
+	btrfs_set_super_root(super, btrfs_backup_tree_root(root_backup));
+	btrfs_set_super_root_level(super,
+			btrfs_backup_tree_root_level(root_backup));
+	btrfs_set_super_bytes_used(super, btrfs_backup_bytes_used(root_backup));
+
+	/*
+	 * fixme: the total bytes and num_devices need to match or we should
+	 * need a fsck
+	 */
+	btrfs_set_super_total_bytes(super, btrfs_backup_total_bytes(root_backup));
+	btrfs_set_super_num_devices(super, btrfs_backup_num_devices(root_backup));
+	return 0;
+}
+
+/* helper to cleanup tree roots */
+static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
+{
+	free_extent_buffer(info->tree_root->node);
+	free_extent_buffer(info->tree_root->commit_root);
+	free_extent_buffer(info->dev_root->node);
+	free_extent_buffer(info->dev_root->commit_root);
+	free_extent_buffer(info->extent_root->node);
+	free_extent_buffer(info->extent_root->commit_root);
+	free_extent_buffer(info->csum_root->node);
+	free_extent_buffer(info->csum_root->commit_root);
+
+	info->tree_root->node = NULL;
+	info->tree_root->commit_root = NULL;
+	info->dev_root->node = NULL;
+	info->dev_root->commit_root = NULL;
+	info->extent_root->node = NULL;
+	info->extent_root->commit_root = NULL;
+	info->csum_root->node = NULL;
+	info->csum_root->commit_root = NULL;
+
+	if (chunk_root) {
+		free_extent_buffer(info->chunk_root->node);
+		free_extent_buffer(info->chunk_root->commit_root);
+		info->chunk_root->node = NULL;
+		info->chunk_root->commit_root = NULL;
+	}
+}
+
+
 struct btrfs_root *open_ctree(struct super_block *sb,
 			      struct btrfs_fs_devices *fs_devices,
 			      char *options)
@@ -1604,6 +1896,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 
 	int ret;
 	int err = -EINVAL;
+	int num_backups_tried = 0;
+	int backup_index = 0;
 
 	struct btrfs_super_block *disk_super;
 
@@ -1648,6 +1942,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	spin_lock_init(&fs_info->fs_roots_radix_lock);
 	spin_lock_init(&fs_info->delayed_iput_lock);
 	spin_lock_init(&fs_info->defrag_inodes_lock);
+	spin_lock_init(&fs_info->free_chunk_lock);
 	mutex_init(&fs_info->reloc_mutex);
 
 	init_completion(&fs_info->kobj_unregister);
@@ -1665,8 +1960,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	btrfs_init_block_rsv(&fs_info->trans_block_rsv);
 	btrfs_init_block_rsv(&fs_info->chunk_block_rsv);
 	btrfs_init_block_rsv(&fs_info->empty_block_rsv);
-	INIT_LIST_HEAD(&fs_info->durable_block_rsv_list);
-	mutex_init(&fs_info->durable_block_rsv_mutex);
+	btrfs_init_block_rsv(&fs_info->delayed_block_rsv);
 	atomic_set(&fs_info->nr_async_submits, 0);
 	atomic_set(&fs_info->async_delalloc_pages, 0);
 	atomic_set(&fs_info->async_submit_draining, 0);
@@ -1677,6 +1971,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	fs_info->metadata_ratio = 0;
 	fs_info->defrag_inodes = RB_ROOT;
 	fs_info->trans_no_join = 0;
+	fs_info->free_chunk_space = 0;
+
+	/* readahead state */
+	INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT);
+	spin_lock_init(&fs_info->reada_lock);
 
 	fs_info->thread_pool_size = min_t(unsigned long,
 					  num_online_cpus() + 2, 8);
@@ -1766,14 +2065,14 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 		goto fail_alloc;
 	}
 
-	memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy));
-	memcpy(&fs_info->super_for_commit, &fs_info->super_copy,
-	       sizeof(fs_info->super_for_commit));
+	memcpy(fs_info->super_copy, bh->b_data, sizeof(*fs_info->super_copy));
+	memcpy(fs_info->super_for_commit, fs_info->super_copy,
+	       sizeof(*fs_info->super_for_commit));
 	brelse(bh);
 
-	memcpy(fs_info->fsid, fs_info->super_copy.fsid, BTRFS_FSID_SIZE);
+	memcpy(fs_info->fsid, fs_info->super_copy->fsid, BTRFS_FSID_SIZE);
 
-	disk_super = &fs_info->super_copy;
+	disk_super = fs_info->super_copy;
 	if (!btrfs_super_root(disk_super))
 		goto fail_alloc;
 
@@ -1783,6 +2082,13 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);
 
 	/*
+	 * run through our array of backup supers and setup
+	 * our ring pointer to the oldest one
+	 */
+	generation = btrfs_super_generation(disk_super);
+	find_oldest_super_backup(fs_info, generation);
+
+	/*
 	 * In the long term, we'll store the compression type in the super
 	 * block, and it'll be used for per file compression control.
 	 */
@@ -1870,6 +2176,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	btrfs_init_workers(&fs_info->delayed_workers, "delayed-meta",
 			   fs_info->thread_pool_size,
 			   &fs_info->generic_worker);
+	btrfs_init_workers(&fs_info->readahead_workers, "readahead",
+			   fs_info->thread_pool_size,
+			   &fs_info->generic_worker);
 
 	/*
 	 * endios are largely parallel and should have a very
@@ -1880,6 +2189,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 
 	fs_info->endio_write_workers.idle_thresh = 2;
 	fs_info->endio_meta_write_workers.idle_thresh = 2;
+	fs_info->readahead_workers.idle_thresh = 2;
 
 	btrfs_start_workers(&fs_info->workers, 1);
 	btrfs_start_workers(&fs_info->generic_worker, 1);
@@ -1893,6 +2203,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	btrfs_start_workers(&fs_info->endio_freespace_worker, 1);
 	btrfs_start_workers(&fs_info->delayed_workers, 1);
 	btrfs_start_workers(&fs_info->caching_workers, 1);
+	btrfs_start_workers(&fs_info->readahead_workers, 1);
 
 	fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
 	fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
@@ -1939,7 +2250,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	if (!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) {
 		printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n",
 		       sb->s_id);
-		goto fail_chunk_root;
+		goto fail_tree_roots;
 	}
 	btrfs_set_root_node(&chunk_root->root_item, chunk_root->node);
 	chunk_root->commit_root = btrfs_root_node(chunk_root);
@@ -1954,11 +2265,12 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	if (ret) {
 		printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n",
 		       sb->s_id);
-		goto fail_chunk_root;
+		goto fail_tree_roots;
 	}
 
 	btrfs_close_extra_devices(fs_devices);
 
+retry_root_backup:
 	blocksize = btrfs_level_size(tree_root,
 				     btrfs_super_root_level(disk_super));
 	generation = btrfs_super_generation(disk_super);
@@ -1966,32 +2278,33 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	tree_root->node = read_tree_block(tree_root,
 					  btrfs_super_root(disk_super),
 					  blocksize, generation);
-	if (!tree_root->node)
-		goto fail_chunk_root;
-	if (!test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) {
+	if (!tree_root->node ||
+	    !test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) {
 		printk(KERN_WARNING "btrfs: failed to read tree root on %s\n",
 		       sb->s_id);
-		goto fail_tree_root;
+
+		goto recovery_tree_root;
 	}
+
 	btrfs_set_root_node(&tree_root->root_item, tree_root->node);
 	tree_root->commit_root = btrfs_root_node(tree_root);
 
 	ret = find_and_setup_root(tree_root, fs_info,
 				  BTRFS_EXTENT_TREE_OBJECTID, extent_root);
 	if (ret)
-		goto fail_tree_root;
+		goto recovery_tree_root;
 	extent_root->track_dirty = 1;
 
 	ret = find_and_setup_root(tree_root, fs_info,
 				  BTRFS_DEV_TREE_OBJECTID, dev_root);
 	if (ret)
-		goto fail_extent_root;
+		goto recovery_tree_root;
 	dev_root->track_dirty = 1;
 
 	ret = find_and_setup_root(tree_root, fs_info,
 				  BTRFS_CSUM_TREE_OBJECTID, csum_root);
 	if (ret)
-		goto fail_dev_root;
+		goto recovery_tree_root;
 
 	csum_root->track_dirty = 1;
 
@@ -2124,20 +2437,10 @@ fail_cleaner:
 
 fail_block_groups:
 	btrfs_free_block_groups(fs_info);
-	free_extent_buffer(csum_root->node);
-	free_extent_buffer(csum_root->commit_root);
-fail_dev_root:
-	free_extent_buffer(dev_root->node);
-	free_extent_buffer(dev_root->commit_root);
-fail_extent_root:
-	free_extent_buffer(extent_root->node);
-	free_extent_buffer(extent_root->commit_root);
-fail_tree_root:
-	free_extent_buffer(tree_root->node);
-	free_extent_buffer(tree_root->commit_root);
-fail_chunk_root:
-	free_extent_buffer(chunk_root->node);
-	free_extent_buffer(chunk_root->commit_root);
+
+fail_tree_roots:
+	free_root_pointers(fs_info, 1);
+
 fail_sb_buffer:
 	btrfs_stop_workers(&fs_info->generic_worker);
 	btrfs_stop_workers(&fs_info->fixup_workers);
@@ -2152,7 +2455,6 @@ fail_sb_buffer:
 	btrfs_stop_workers(&fs_info->delayed_workers);
 	btrfs_stop_workers(&fs_info->caching_workers);
 fail_alloc:
-	kfree(fs_info->delayed_root);
 fail_iput:
 	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
 	iput(fs_info->btree_inode);
@@ -2164,13 +2466,27 @@ fail_bdi:
2164fail_srcu: 2466fail_srcu:
2165 cleanup_srcu_struct(&fs_info->subvol_srcu); 2467 cleanup_srcu_struct(&fs_info->subvol_srcu);
2166fail: 2468fail:
2167 kfree(extent_root); 2469 free_fs_info(fs_info);
2168 kfree(tree_root);
2169 kfree(fs_info);
2170 kfree(chunk_root);
2171 kfree(dev_root);
2172 kfree(csum_root);
2173 return ERR_PTR(err); 2470 return ERR_PTR(err);
2471
2472recovery_tree_root:
2473
2474 if (!btrfs_test_opt(tree_root, RECOVERY))
2475 goto fail_tree_roots;
2476
2477 free_root_pointers(fs_info, 0);
2478
2479 /* don't use the log in recovery mode, it won't be valid */
2480 btrfs_set_super_log_root(disk_super, 0);
2481
2482 /* we can't trust the free space cache either */
2483 btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE);
2484
2485 ret = next_root_backup(fs_info, fs_info->super_copy,
2486 &num_backups_tried, &backup_index);
2487 if (ret == -1)
2488 goto fail_block_groups;
2489 goto retry_root_backup;
2174} 2490}
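The recovery path above only runs when the filesystem is mounted with -o recovery: each failed attempt to read the tree roots invalidates the stale log and space cache, then steps to the next super-block backup. A rough sketch of the retry flow (read_tree_roots and roots_done are hypothetical stand-ins for the find_and_setup_root sequence above; next_root_backup and retry_root_backup are introduced by this patch):

    retry_root_backup:
    	ret = read_tree_roots(fs_info);		/* hypothetical wrapper */
    	if (!ret)
    		goto roots_done;		/* hypothetical success label */
    	if (!btrfs_test_opt(tree_root, RECOVERY))
    		goto fail_tree_roots;		/* no fallback without -o recovery */
    	free_root_pointers(fs_info, 0);
    	btrfs_set_super_log_root(disk_super, 0);	/* log tree is stale */
    	btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE);	/* space cache too */
    	ret = next_root_backup(fs_info, fs_info->super_copy,
    			       &num_backups_tried, &backup_index);
    	if (ret == -1)				/* all backups exhausted */
    		goto fail_block_groups;
    	goto retry_root_backup;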
2175 2491
2176static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) 2492static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
@@ -2338,10 +2654,11 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
2338 int total_errors = 0; 2654 int total_errors = 0;
2339 u64 flags; 2655 u64 flags;
2340 2656
2341 max_errors = btrfs_super_num_devices(&root->fs_info->super_copy) - 1; 2657 max_errors = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
2342 do_barriers = !btrfs_test_opt(root, NOBARRIER); 2658 do_barriers = !btrfs_test_opt(root, NOBARRIER);
2659 backup_super_roots(root->fs_info);
2343 2660
2344 sb = &root->fs_info->super_for_commit; 2661 sb = root->fs_info->super_for_commit;
2345 dev_item = &sb->dev_item; 2662 dev_item = &sb->dev_item;
2346 2663
2347 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 2664 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
@@ -2545,8 +2862,6 @@ int close_ctree(struct btrfs_root *root)
2545 /* clear out the rbtree of defraggable inodes */ 2862 /* clear out the rbtree of defraggable inodes */
2546 btrfs_run_defrag_inodes(root->fs_info); 2863 btrfs_run_defrag_inodes(root->fs_info);
2547 2864
2548 btrfs_put_block_group_cache(fs_info);
2549
2550 /* 2865 /*
2551 * Here come 2 situations when btrfs is broken to flip readonly: 2866 * Here come 2 situations when btrfs is broken to flip readonly:
2552 * 2867 *
@@ -2572,6 +2887,8 @@ int close_ctree(struct btrfs_root *root)
2572 printk(KERN_ERR "btrfs: commit super ret %d\n", ret); 2887 printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
2573 } 2888 }
2574 2889
2890 btrfs_put_block_group_cache(fs_info);
2891
2575 kthread_stop(root->fs_info->transaction_kthread); 2892 kthread_stop(root->fs_info->transaction_kthread);
2576 kthread_stop(root->fs_info->cleaner_kthread); 2893 kthread_stop(root->fs_info->cleaner_kthread);
2577 2894
@@ -2603,7 +2920,6 @@ int close_ctree(struct btrfs_root *root)
2603 del_fs_roots(fs_info); 2920 del_fs_roots(fs_info);
2604 2921
2605 iput(fs_info->btree_inode); 2922 iput(fs_info->btree_inode);
2606 kfree(fs_info->delayed_root);
2607 2923
2608 btrfs_stop_workers(&fs_info->generic_worker); 2924 btrfs_stop_workers(&fs_info->generic_worker);
2609 btrfs_stop_workers(&fs_info->fixup_workers); 2925 btrfs_stop_workers(&fs_info->fixup_workers);
@@ -2617,6 +2933,7 @@ int close_ctree(struct btrfs_root *root)
2617 btrfs_stop_workers(&fs_info->submit_workers); 2933 btrfs_stop_workers(&fs_info->submit_workers);
2618 btrfs_stop_workers(&fs_info->delayed_workers); 2934 btrfs_stop_workers(&fs_info->delayed_workers);
2619 btrfs_stop_workers(&fs_info->caching_workers); 2935 btrfs_stop_workers(&fs_info->caching_workers);
2936 btrfs_stop_workers(&fs_info->readahead_workers);
2620 2937
2621 btrfs_close_devices(fs_info->fs_devices); 2938 btrfs_close_devices(fs_info->fs_devices);
2622 btrfs_mapping_tree_free(&fs_info->mapping_tree); 2939 btrfs_mapping_tree_free(&fs_info->mapping_tree);
@@ -2624,12 +2941,7 @@ int close_ctree(struct btrfs_root *root)
2624 bdi_destroy(&fs_info->bdi); 2941 bdi_destroy(&fs_info->bdi);
2625 cleanup_srcu_struct(&fs_info->subvol_srcu); 2942 cleanup_srcu_struct(&fs_info->subvol_srcu);
2626 2943
2627 kfree(fs_info->extent_root); 2944 free_fs_info(fs_info);
2628 kfree(fs_info->tree_root);
2629 kfree(fs_info->chunk_root);
2630 kfree(fs_info->dev_root);
2631 kfree(fs_info->csum_root);
2632 kfree(fs_info);
2633 2945
2634 return 0; 2946 return 0;
2635} 2947}
@@ -2735,7 +3047,8 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
2735 return ret; 3047 return ret;
2736} 3048}
2737 3049
2738int btree_lock_page_hook(struct page *page) 3050static int btree_lock_page_hook(struct page *page, void *data,
3051 void (*flush_fn)(void *))
2739{ 3052{
2740 struct inode *inode = page->mapping->host; 3053 struct inode *inode = page->mapping->host;
2741 struct btrfs_root *root = BTRFS_I(inode)->root; 3054 struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -2752,7 +3065,10 @@ int btree_lock_page_hook(struct page *page)
2752 if (!eb) 3065 if (!eb)
2753 goto out; 3066 goto out;
2754 3067
2755 btrfs_tree_lock(eb); 3068 if (!btrfs_try_tree_write_lock(eb)) {
3069 flush_fn(data);
3070 btrfs_tree_lock(eb);
3071 }
2756 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); 3072 btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
2757 3073
2758 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { 3074 if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
@@ -2767,7 +3083,10 @@ int btree_lock_page_hook(struct page *page)
2767 btrfs_tree_unlock(eb); 3083 btrfs_tree_unlock(eb);
2768 free_extent_buffer(eb); 3084 free_extent_buffer(eb);
2769out: 3085out:
2770 lock_page(page); 3086 if (!trylock_page(page)) {
3087 flush_fn(data);
3088 lock_page(page);
3089 }
2771 return 0; 3090 return 0;
2772} 3091}
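Both hunks above follow the same try-then-flush idiom: never block on a lock while the queued bios that may be holding it up sit unsubmitted. Distilled into one place (a sketch using only the calls visible above; flush_fn is whatever callback the caller registered):

    static void lock_eb_with_flush(struct extent_buffer *eb,
    			       void (*flush_fn)(void *), void *data)
    {
    	if (btrfs_try_tree_write_lock(eb))
    		return;		/* fast path: got the lock, nothing queued */
    	flush_fn(data);		/* push out the queued writes first */
    	btrfs_tree_lock(eb);	/* blocking is now safe from self-deadlock */
    }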
2773 3092
@@ -3123,6 +3442,7 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
3123static struct extent_io_ops btree_extent_io_ops = { 3442static struct extent_io_ops btree_extent_io_ops = {
3124 .write_cache_pages_lock_hook = btree_lock_page_hook, 3443 .write_cache_pages_lock_hook = btree_lock_page_hook,
3125 .readpage_end_io_hook = btree_readpage_end_io_hook, 3444 .readpage_end_io_hook = btree_readpage_end_io_hook,
3445 .readpage_io_failed_hook = btree_io_failed_hook,
3126 .submit_bio_hook = btree_submit_bio_hook, 3446 .submit_bio_hook = btree_submit_bio_hook,
3127 /* note we're sharing with inode.c for the merge bio hook */ 3447 /* note we're sharing with inode.c for the merge bio hook */
3128 .merge_bio_hook = btrfs_merge_bio_hook, 3448 .merge_bio_hook = btrfs_merge_bio_hook,
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index bec3ea4bd67f..c99d0a8f13fa 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -40,6 +40,8 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
40 u32 blocksize, u64 parent_transid); 40 u32 blocksize, u64 parent_transid);
41int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, 41int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
42 u64 parent_transid); 42 u64 parent_transid);
43int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
44 int mirror_num, struct extent_buffer **eb);
43struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, 45struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
44 u64 bytenr, u32 blocksize); 46 u64 bytenr, u32 blocksize);
45int clean_tree_block(struct btrfs_trans_handle *trans, 47int clean_tree_block(struct btrfs_trans_handle *trans,
@@ -83,8 +85,6 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
83 struct btrfs_fs_info *fs_info); 85 struct btrfs_fs_info *fs_info);
84int btrfs_add_log_tree(struct btrfs_trans_handle *trans, 86int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
85 struct btrfs_root *root); 87 struct btrfs_root *root);
86int btree_lock_page_hook(struct page *page);
87
88 88
89#ifdef CONFIG_DEBUG_LOCK_ALLOC 89#ifdef CONFIG_DEBUG_LOCK_ALLOC
90void btrfs_init_lockdep(void); 90void btrfs_init_lockdep(void);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 119f842c1d4f..18ea90c8943b 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -23,6 +23,7 @@
23#include <linux/rcupdate.h> 23#include <linux/rcupdate.h>
24#include <linux/kthread.h> 24#include <linux/kthread.h>
25#include <linux/slab.h> 25#include <linux/slab.h>
26#include <linux/ratelimit.h>
26#include "compat.h" 27#include "compat.h"
27#include "hash.h" 28#include "hash.h"
28#include "ctree.h" 29#include "ctree.h"
@@ -52,6 +53,21 @@ enum {
52 CHUNK_ALLOC_LIMITED = 2, 53 CHUNK_ALLOC_LIMITED = 2,
53}; 54};
54 55
56/*
57 * Control how reservations are dealt with.
58 *
59 * RESERVE_FREE - freeing a reservation.
60 * RESERVE_ALLOC - allocating space and we need to update bytes_may_use for
61 * ENOSPC accounting
62 * RESERVE_ALLOC_NO_ACCOUNT - allocating space and we should not update
63 * bytes_may_use as the ENOSPC accounting is done elsewhere
64 */
65enum {
66 RESERVE_FREE = 0,
67 RESERVE_ALLOC = 1,
68 RESERVE_ALLOC_NO_ACCOUNT = 2,
69};
70
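A rough illustration of how the three modes pair up over an allocation's lifetime (btrfs_update_reserved_bytes is the static helper declared just below; cache stands for an already looked-up block group):

    /* allocator reserves space and moves the bytes out of bytes_may_use */
    ret = btrfs_update_reserved_bytes(cache, num_bytes, RESERVE_ALLOC);

    /* same, but the ENOSPC accounting already happened elsewhere */
    ret = btrfs_update_reserved_bytes(cache, num_bytes, RESERVE_ALLOC_NO_ACCOUNT);

    /* the reservation is dropped without the space ever hitting disk */
    btrfs_update_reserved_bytes(cache, num_bytes, RESERVE_FREE);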
55static int update_block_group(struct btrfs_trans_handle *trans, 71static int update_block_group(struct btrfs_trans_handle *trans,
56 struct btrfs_root *root, 72 struct btrfs_root *root,
57 u64 bytenr, u64 num_bytes, int alloc); 73 u64 bytenr, u64 num_bytes, int alloc);
@@ -81,6 +97,8 @@ static int find_next_key(struct btrfs_path *path, int level,
81 struct btrfs_key *key); 97 struct btrfs_key *key);
82static void dump_space_info(struct btrfs_space_info *info, u64 bytes, 98static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
83 int dump_block_groups); 99 int dump_block_groups);
100static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
101 u64 num_bytes, int reserve);
84 102
85static noinline int 103static noinline int
86block_group_cache_done(struct btrfs_block_group_cache *cache) 104block_group_cache_done(struct btrfs_block_group_cache *cache)
@@ -104,7 +122,6 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
104 if (atomic_dec_and_test(&cache->count)) { 122 if (atomic_dec_and_test(&cache->count)) {
105 WARN_ON(cache->pinned > 0); 123 WARN_ON(cache->pinned > 0);
106 WARN_ON(cache->reserved > 0); 124 WARN_ON(cache->reserved > 0);
107 WARN_ON(cache->reserved_pinned > 0);
108 kfree(cache->free_space_ctl); 125 kfree(cache->free_space_ctl);
109 kfree(cache); 126 kfree(cache);
110 } 127 }
@@ -465,7 +482,8 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
465 * we likely hold important locks. 482 * we likely hold important locks.
466 */ 483 */
467 if (trans && (!trans->transaction->in_commit) && 484 if (trans && (!trans->transaction->in_commit) &&
468 (root && root != root->fs_info->tree_root)) { 485 (root && root != root->fs_info->tree_root) &&
486 btrfs_test_opt(root, SPACE_CACHE)) {
469 spin_lock(&cache->lock); 487 spin_lock(&cache->lock);
470 if (cache->cached != BTRFS_CACHE_NO) { 488 if (cache->cached != BTRFS_CACHE_NO) {
471 spin_unlock(&cache->lock); 489 spin_unlock(&cache->lock);
@@ -2700,6 +2718,13 @@ again:
2700 goto again; 2718 goto again;
2701 } 2719 }
2702 2720
 2721 /* We've already set up this transaction, go ahead and exit */
2722 if (block_group->cache_generation == trans->transid &&
2723 i_size_read(inode)) {
2724 dcs = BTRFS_DC_SETUP;
2725 goto out_put;
2726 }
2727
2703 /* 2728 /*
2704 * We want to set the generation to 0, that way if anything goes wrong 2729 * We want to set the generation to 0, that way if anything goes wrong
2705 * from here on out we know not to trust this cache when we load up next 2730 * from here on out we know not to trust this cache when we load up next
@@ -2749,12 +2774,15 @@ again:
2749 if (!ret) 2774 if (!ret)
2750 dcs = BTRFS_DC_SETUP; 2775 dcs = BTRFS_DC_SETUP;
2751 btrfs_free_reserved_data_space(inode, num_pages); 2776 btrfs_free_reserved_data_space(inode, num_pages);
2777
2752out_put: 2778out_put:
2753 iput(inode); 2779 iput(inode);
2754out_free: 2780out_free:
2755 btrfs_release_path(path); 2781 btrfs_release_path(path);
2756out: 2782out:
2757 spin_lock(&block_group->lock); 2783 spin_lock(&block_group->lock);
2784 if (!ret)
2785 block_group->cache_generation = trans->transid;
2758 block_group->disk_cache_state = dcs; 2786 block_group->disk_cache_state = dcs;
2759 spin_unlock(&block_group->lock); 2787 spin_unlock(&block_group->lock);
2760 2788
@@ -3122,16 +3150,13 @@ commit_trans:
3122 return -ENOSPC; 3150 return -ENOSPC;
3123 } 3151 }
3124 data_sinfo->bytes_may_use += bytes; 3152 data_sinfo->bytes_may_use += bytes;
3125 BTRFS_I(inode)->reserved_bytes += bytes;
3126 spin_unlock(&data_sinfo->lock); 3153 spin_unlock(&data_sinfo->lock);
3127 3154
3128 return 0; 3155 return 0;
3129} 3156}
3130 3157
3131/* 3158/*
3132 * called when we are clearing an delalloc extent from the 3159 * Called if we need to clear a data reservation for this inode.
3133 * inode's io_tree or there was an error for whatever reason
3134 * after calling btrfs_check_data_free_space
3135 */ 3160 */
3136void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) 3161void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
3137{ 3162{
@@ -3144,7 +3169,6 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
3144 data_sinfo = BTRFS_I(inode)->space_info; 3169 data_sinfo = BTRFS_I(inode)->space_info;
3145 spin_lock(&data_sinfo->lock); 3170 spin_lock(&data_sinfo->lock);
3146 data_sinfo->bytes_may_use -= bytes; 3171 data_sinfo->bytes_may_use -= bytes;
3147 BTRFS_I(inode)->reserved_bytes -= bytes;
3148 spin_unlock(&data_sinfo->lock); 3172 spin_unlock(&data_sinfo->lock);
3149} 3173}
3150 3174
@@ -3165,6 +3189,7 @@ static int should_alloc_chunk(struct btrfs_root *root,
3165 struct btrfs_space_info *sinfo, u64 alloc_bytes, 3189 struct btrfs_space_info *sinfo, u64 alloc_bytes,
3166 int force) 3190 int force)
3167{ 3191{
3192 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
3168 u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly; 3193 u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
3169 u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved; 3194 u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved;
3170 u64 thresh; 3195 u64 thresh;
@@ -3173,11 +3198,18 @@ static int should_alloc_chunk(struct btrfs_root *root,
3173 return 1; 3198 return 1;
3174 3199
3175 /* 3200 /*
3201 * We need to take into account the global rsv because for all intents
3202 * and purposes it's used space. Don't worry about locking the
3203 * global_rsv, it doesn't change except when the transaction commits.
3204 */
3205 num_allocated += global_rsv->size;
3206
3207 /*
3176 * in limited mode, we want to have some free space up to 3208 * in limited mode, we want to have some free space up to
3177 * about 1% of the FS size. 3209 * about 1% of the FS size.
3178 */ 3210 */
3179 if (force == CHUNK_ALLOC_LIMITED) { 3211 if (force == CHUNK_ALLOC_LIMITED) {
3180 thresh = btrfs_super_total_bytes(&root->fs_info->super_copy); 3212 thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
3181 thresh = max_t(u64, 64 * 1024 * 1024, 3213 thresh = max_t(u64, 64 * 1024 * 1024,
3182 div_factor_fine(thresh, 1)); 3214 div_factor_fine(thresh, 1));
3183 3215
@@ -3199,7 +3231,7 @@ static int should_alloc_chunk(struct btrfs_root *root,
3199 if (num_allocated + alloc_bytes < div_factor(num_bytes, 8)) 3231 if (num_allocated + alloc_bytes < div_factor(num_bytes, 8))
3200 return 0; 3232 return 0;
3201 3233
3202 thresh = btrfs_super_total_bytes(&root->fs_info->super_copy); 3234 thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
3203 3235
3204 /* 256MB or 5% of the FS */ 3236 /* 256MB or 5% of the FS */
3205 thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5)); 3237 thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5));
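For readers unfamiliar with the helpers: div_factor_fine(x, n) is roughly n percent of x and div_factor(x, n) is n tenths of x, so on a hypothetical 1 TiB filesystem the two thresholds above work out as:

    u64 thresh = 1ULL << 40;			/* 1 TiB of total_bytes */
    u64 limited = max_t(u64, 64 * 1024 * 1024,
    		    div_factor_fine(thresh, 1));	/* ~10.2 GiB (1%) */
    u64 normal = max_t(u64, 256 * 1024 * 1024,
    		   div_factor_fine(thresh, 5));	/* ~51.2 GiB (5%) */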
@@ -3302,24 +3334,26 @@ out:
3302/* 3334/*
3303 * shrink metadata reservation for delalloc 3335 * shrink metadata reservation for delalloc
3304 */ 3336 */
3305static int shrink_delalloc(struct btrfs_trans_handle *trans, 3337static int shrink_delalloc(struct btrfs_root *root, u64 to_reclaim,
3306 struct btrfs_root *root, u64 to_reclaim, int sync) 3338 bool wait_ordered)
3307{ 3339{
3308 struct btrfs_block_rsv *block_rsv; 3340 struct btrfs_block_rsv *block_rsv;
3309 struct btrfs_space_info *space_info; 3341 struct btrfs_space_info *space_info;
3342 struct btrfs_trans_handle *trans;
3310 u64 reserved; 3343 u64 reserved;
3311 u64 max_reclaim; 3344 u64 max_reclaim;
3312 u64 reclaimed = 0; 3345 u64 reclaimed = 0;
3313 long time_left; 3346 long time_left;
3314 int nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT; 3347 unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
3315 int loops = 0; 3348 int loops = 0;
3316 unsigned long progress; 3349 unsigned long progress;
3317 3350
3351 trans = (struct btrfs_trans_handle *)current->journal_info;
3318 block_rsv = &root->fs_info->delalloc_block_rsv; 3352 block_rsv = &root->fs_info->delalloc_block_rsv;
3319 space_info = block_rsv->space_info; 3353 space_info = block_rsv->space_info;
3320 3354
3321 smp_mb(); 3355 smp_mb();
3322 reserved = space_info->bytes_reserved; 3356 reserved = space_info->bytes_may_use;
3323 progress = space_info->reservation_progress; 3357 progress = space_info->reservation_progress;
3324 3358
3325 if (reserved == 0) 3359 if (reserved == 0)
@@ -3334,7 +3368,8 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
3334 } 3368 }
3335 3369
3336 max_reclaim = min(reserved, to_reclaim); 3370 max_reclaim = min(reserved, to_reclaim);
3337 3371 nr_pages = max_t(unsigned long, nr_pages,
3372 max_reclaim >> PAGE_CACHE_SHIFT);
3338 while (loops < 1024) { 3373 while (loops < 1024) {
3339 /* have the flusher threads jump in and do some IO */ 3374 /* have the flusher threads jump in and do some IO */
3340 smp_mb(); 3375 smp_mb();
@@ -3343,9 +3378,9 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
3343 writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages); 3378 writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages);
3344 3379
3345 spin_lock(&space_info->lock); 3380 spin_lock(&space_info->lock);
3346 if (reserved > space_info->bytes_reserved) 3381 if (reserved > space_info->bytes_may_use)
3347 reclaimed += reserved - space_info->bytes_reserved; 3382 reclaimed += reserved - space_info->bytes_may_use;
3348 reserved = space_info->bytes_reserved; 3383 reserved = space_info->bytes_may_use;
3349 spin_unlock(&space_info->lock); 3384 spin_unlock(&space_info->lock);
3350 3385
3351 loops++; 3386 loops++;
@@ -3356,11 +3391,15 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
3356 if (trans && trans->transaction->blocked) 3391 if (trans && trans->transaction->blocked)
3357 return -EAGAIN; 3392 return -EAGAIN;
3358 3393
3359 time_left = schedule_timeout_interruptible(1); 3394 if (wait_ordered && !trans) {
3395 btrfs_wait_ordered_extents(root, 0, 0);
3396 } else {
3397 time_left = schedule_timeout_interruptible(1);
3360 3398
3361 /* We were interrupted, exit */ 3399 /* We were interrupted, exit */
3362 if (time_left) 3400 if (time_left)
3363 break; 3401 break;
3402 }
3364 3403
3365 /* we've kicked the IO a few times, if anything has been freed, 3404 /* we've kicked the IO a few times, if anything has been freed,
3366 * exit. There is no sense in looping here for a long time 3405 * exit. There is no sense in looping here for a long time
@@ -3375,34 +3414,90 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
3375 } 3414 }
3376 3415
3377 } 3416 }
3378 if (reclaimed >= to_reclaim && !trans) 3417
3379 btrfs_wait_ordered_extents(root, 0, 0);
3380 return reclaimed >= to_reclaim; 3418 return reclaimed >= to_reclaim;
3381} 3419}
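The nr_pages change above means the writeback kick now scales with the reclaim target instead of stopping at a fixed 2 MiB worth of pages. A worked example assuming 4 KiB pages (PAGE_CACHE_SHIFT == 12):

    unsigned long nr_pages = (2 * 1024 * 1024) >> 12;	/* floor: 512 pages */
    u64 max_reclaim = 8 * 1024 * 1024;			/* want 8 MiB back */
    nr_pages = max_t(unsigned long, nr_pages,
    		 max_reclaim >> 12);			/* -> 2048 pages */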
3382 3420
3383/* 3421/**
3384 * Retries tells us how many times we've called reserve_metadata_bytes. The 3422 * may_commit_transaction - possibly commit the transaction if it is safe to
3385 * idea is if this is the first call (retries == 0) then we will add to our 3423 * @root - the root we're allocating for
3386 * reserved count if we can't make the allocation in order to hold our place 3424 * @bytes - the number of bytes we want to reserve
3387 * while we go and try and free up space. That way for retries > 1 we don't try 3425 * @force - force the commit
3388 * and add space, we just check to see if the amount of unused space is >= the
3389 * total space, meaning that our reservation is valid.
3390 * 3426 *
3391 * However if we don't intend to retry this reservation, pass -1 as retries so 3427 * This will check to make sure that committing the transaction will actually
3392 * that it short circuits this logic. 3428 * get us somewhere and then commit the transaction if it does. Otherwise it
3429 * will return -ENOSPC.
3393 */ 3430 */
3394static int reserve_metadata_bytes(struct btrfs_trans_handle *trans, 3431static int may_commit_transaction(struct btrfs_root *root,
3395 struct btrfs_root *root, 3432 struct btrfs_space_info *space_info,
3433 u64 bytes, int force)
3434{
3435 struct btrfs_block_rsv *delayed_rsv = &root->fs_info->delayed_block_rsv;
3436 struct btrfs_trans_handle *trans;
3437
3438 trans = (struct btrfs_trans_handle *)current->journal_info;
3439 if (trans)
3440 return -EAGAIN;
3441
3442 if (force)
3443 goto commit;
3444
3445 /* See if there is enough pinned space to make this reservation */
3446 spin_lock(&space_info->lock);
3447 if (space_info->bytes_pinned >= bytes) {
3448 spin_unlock(&space_info->lock);
3449 goto commit;
3450 }
3451 spin_unlock(&space_info->lock);
3452
3453 /*
3454 * See if there is some space in the delayed insertion reservation for
3455 * this reservation.
3456 */
3457 if (space_info != delayed_rsv->space_info)
3458 return -ENOSPC;
3459
3460 spin_lock(&delayed_rsv->lock);
3461 if (delayed_rsv->size < bytes) {
3462 spin_unlock(&delayed_rsv->lock);
3463 return -ENOSPC;
3464 }
3465 spin_unlock(&delayed_rsv->lock);
3466
3467commit:
3468 trans = btrfs_join_transaction(root);
3469 if (IS_ERR(trans))
3470 return -ENOSPC;
3471
3472 return btrfs_commit_transaction(trans, root);
3473}
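Both call sites this patch adds use the helper the same way, condensed here for reference (space_info and orig_bytes as in reserve_metadata_bytes below):

    /* plenty of pinned space: force the commit to reclaim it */
    ret = may_commit_transaction(root, space_info, orig_bytes, 1);

    /* last resort after flushing failed; only ever tried once */
    ret = may_commit_transaction(root, space_info, orig_bytes, 0);
    if (!ret)
    	committed = true;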
3474
3475/**
3476 * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
3477 * @root - the root we're allocating for
3478 * @block_rsv - the block_rsv we're allocating for
3479 * @orig_bytes - the number of bytes we want
 3480 * @flush - whether or not we can flush to make our reservation
3481 *
 3482 * This will reserve orig_bytes bytes from the space info associated
3483 * with the block_rsv. If there is not enough space it will make an attempt to
3484 * flush out space to make room. It will do this by flushing delalloc if
3485 * possible or committing the transaction. If flush is 0 then no attempts to
3486 * regain reservations will be made and this will fail if there is not enough
3487 * space already.
3488 */
3489static int reserve_metadata_bytes(struct btrfs_root *root,
3396 struct btrfs_block_rsv *block_rsv, 3490 struct btrfs_block_rsv *block_rsv,
3397 u64 orig_bytes, int flush) 3491 u64 orig_bytes, int flush)
3398{ 3492{
3399 struct btrfs_space_info *space_info = block_rsv->space_info; 3493 struct btrfs_space_info *space_info = block_rsv->space_info;
3400 u64 unused; 3494 u64 used;
3401 u64 num_bytes = orig_bytes; 3495 u64 num_bytes = orig_bytes;
3402 int retries = 0; 3496 int retries = 0;
3403 int ret = 0; 3497 int ret = 0;
3404 bool committed = false; 3498 bool committed = false;
3405 bool flushing = false; 3499 bool flushing = false;
3500 bool wait_ordered = false;
3406 3501
3407again: 3502again:
3408 ret = 0; 3503 ret = 0;
@@ -3419,7 +3514,7 @@ again:
3419 * deadlock since we are waiting for the flusher to finish, but 3514 * deadlock since we are waiting for the flusher to finish, but
3420 * hold the current transaction open. 3515 * hold the current transaction open.
3421 */ 3516 */
3422 if (trans) 3517 if (current->journal_info)
3423 return -EAGAIN; 3518 return -EAGAIN;
3424 ret = wait_event_interruptible(space_info->wait, 3519 ret = wait_event_interruptible(space_info->wait,
3425 !space_info->flush); 3520 !space_info->flush);
@@ -3431,9 +3526,9 @@ again:
3431 } 3526 }
3432 3527
3433 ret = -ENOSPC; 3528 ret = -ENOSPC;
3434 unused = space_info->bytes_used + space_info->bytes_reserved + 3529 used = space_info->bytes_used + space_info->bytes_reserved +
3435 space_info->bytes_pinned + space_info->bytes_readonly + 3530 space_info->bytes_pinned + space_info->bytes_readonly +
3436 space_info->bytes_may_use; 3531 space_info->bytes_may_use;
3437 3532
3438 /* 3533 /*
3439 * The idea here is that we've not already over-reserved the block group 3534 * The idea here is that we've not already over-reserved the block group
@@ -3442,10 +3537,9 @@ again:
3442 * lets start flushing stuff first and then come back and try to make 3537 * lets start flushing stuff first and then come back and try to make
3443 * our reservation. 3538 * our reservation.
3444 */ 3539 */
3445 if (unused <= space_info->total_bytes) { 3540 if (used <= space_info->total_bytes) {
3446 unused = space_info->total_bytes - unused; 3541 if (used + orig_bytes <= space_info->total_bytes) {
3447 if (unused >= num_bytes) { 3542 space_info->bytes_may_use += orig_bytes;
3448 space_info->bytes_reserved += orig_bytes;
3449 ret = 0; 3543 ret = 0;
3450 } else { 3544 } else {
3451 /* 3545 /*
@@ -3461,10 +3555,64 @@ again:
3461 * amount plus the amount of bytes that we need for this 3555 * amount plus the amount of bytes that we need for this
3462 * reservation. 3556 * reservation.
3463 */ 3557 */
3464 num_bytes = unused - space_info->total_bytes + 3558 wait_ordered = true;
3559 num_bytes = used - space_info->total_bytes +
3465 (orig_bytes * (retries + 1)); 3560 (orig_bytes * (retries + 1));
3466 } 3561 }
3467 3562
3563 if (ret) {
3564 u64 profile = btrfs_get_alloc_profile(root, 0);
3565 u64 avail;
3566
3567 /*
3568 * If we have a lot of space that's pinned, don't bother doing
3569 * the overcommit dance yet and just commit the transaction.
3570 */
3571 avail = (space_info->total_bytes - space_info->bytes_used) * 8;
3572 do_div(avail, 10);
3573 if (space_info->bytes_pinned >= avail && flush && !committed) {
3574 space_info->flush = 1;
3575 flushing = true;
3576 spin_unlock(&space_info->lock);
3577 ret = may_commit_transaction(root, space_info,
3578 orig_bytes, 1);
3579 if (ret)
3580 goto out;
3581 committed = true;
3582 goto again;
3583 }
3584
3585 spin_lock(&root->fs_info->free_chunk_lock);
3586 avail = root->fs_info->free_chunk_space;
3587
3588 /*
3589 * If we have dup, raid1 or raid10 then only half of the free
3590 * space is actually useable.
 3591 * space is actually usable.
3592 if (profile & (BTRFS_BLOCK_GROUP_DUP |
3593 BTRFS_BLOCK_GROUP_RAID1 |
3594 BTRFS_BLOCK_GROUP_RAID10))
3595 avail >>= 1;
3596
3597 /*
3598 * If we aren't flushing don't let us overcommit too much, say
3599 * 1/8th of the space. If we can flush, let it overcommit up to
3600 * 1/2 of the space.
3601 */
3602 if (flush)
3603 avail >>= 3;
3604 else
3605 avail >>= 1;
3606 spin_unlock(&root->fs_info->free_chunk_lock);
3607
3608 if (used + num_bytes < space_info->total_bytes + avail) {
3609 space_info->bytes_may_use += orig_bytes;
3610 ret = 0;
3611 } else {
3612 wait_ordered = true;
3613 }
3614 }
3615
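To make the shifts concrete, a worked example with hypothetical numbers:

    avail = 100ULL << 30;	/* 100 GiB of free chunk space */
    avail >>= 1;		/* 50 GiB: RAID1 mirrors every byte */
    /* the flush-dependent shift then leaves either          */
    /*   50 GiB >> 3 = 6.25 GiB or 50 GiB >> 1 = 25 GiB      */
    /* of overcommit allowance                               */

The reservation then overcommits only while used + num_bytes stays below total_bytes + avail.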
3468 /* 3616 /*
3469 * Couldn't make our reservation, save our place so while we're trying 3617 * Couldn't make our reservation, save our place so while we're trying
3470 * to reclaim space we can actually use it instead of somebody else 3618 * to reclaim space we can actually use it instead of somebody else
@@ -3484,7 +3632,7 @@ again:
3484 * We do synchronous shrinking since we don't actually unreserve 3632 * We do synchronous shrinking since we don't actually unreserve
3485 * metadata until after the IO is completed. 3633 * metadata until after the IO is completed.
3486 */ 3634 */
3487 ret = shrink_delalloc(trans, root, num_bytes, 1); 3635 ret = shrink_delalloc(root, num_bytes, wait_ordered);
3488 if (ret < 0) 3636 if (ret < 0)
3489 goto out; 3637 goto out;
3490 3638
@@ -3496,35 +3644,17 @@ again:
3496 * so go back around and try again. 3644 * so go back around and try again.
3497 */ 3645 */
3498 if (retries < 2) { 3646 if (retries < 2) {
3647 wait_ordered = true;
3499 retries++; 3648 retries++;
3500 goto again; 3649 goto again;
3501 } 3650 }
3502 3651
3503 /*
3504 * Not enough space to be reclaimed, don't bother committing the
3505 * transaction.
3506 */
3507 spin_lock(&space_info->lock);
3508 if (space_info->bytes_pinned < orig_bytes)
3509 ret = -ENOSPC;
3510 spin_unlock(&space_info->lock);
3511 if (ret)
3512 goto out;
3513
3514 ret = -EAGAIN;
3515 if (trans)
3516 goto out;
3517
3518 ret = -ENOSPC; 3652 ret = -ENOSPC;
3519 if (committed) 3653 if (committed)
3520 goto out; 3654 goto out;
3521 3655
3522 trans = btrfs_join_transaction(root); 3656 ret = may_commit_transaction(root, space_info, orig_bytes, 0);
3523 if (IS_ERR(trans))
3524 goto out;
3525 ret = btrfs_commit_transaction(trans, root);
3526 if (!ret) { 3657 if (!ret) {
3527 trans = NULL;
3528 committed = true; 3658 committed = true;
3529 goto again; 3659 goto again;
3530 } 3660 }
@@ -3542,10 +3672,12 @@ out:
3542static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans, 3672static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans,
3543 struct btrfs_root *root) 3673 struct btrfs_root *root)
3544{ 3674{
3545 struct btrfs_block_rsv *block_rsv; 3675 struct btrfs_block_rsv *block_rsv = NULL;
3546 if (root->ref_cows) 3676
3677 if (root->ref_cows || root == root->fs_info->csum_root)
3547 block_rsv = trans->block_rsv; 3678 block_rsv = trans->block_rsv;
3548 else 3679
3680 if (!block_rsv)
3549 block_rsv = root->block_rsv; 3681 block_rsv = root->block_rsv;
3550 3682
3551 if (!block_rsv) 3683 if (!block_rsv)
@@ -3616,7 +3748,7 @@ static void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
3616 } 3748 }
3617 if (num_bytes) { 3749 if (num_bytes) {
3618 spin_lock(&space_info->lock); 3750 spin_lock(&space_info->lock);
3619 space_info->bytes_reserved -= num_bytes; 3751 space_info->bytes_may_use -= num_bytes;
3620 space_info->reservation_progress++; 3752 space_info->reservation_progress++;
3621 spin_unlock(&space_info->lock); 3753 spin_unlock(&space_info->lock);
3622 } 3754 }
@@ -3640,9 +3772,6 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv)
3640{ 3772{
3641 memset(rsv, 0, sizeof(*rsv)); 3773 memset(rsv, 0, sizeof(*rsv));
3642 spin_lock_init(&rsv->lock); 3774 spin_lock_init(&rsv->lock);
3643 atomic_set(&rsv->usage, 1);
3644 rsv->priority = 6;
3645 INIT_LIST_HEAD(&rsv->list);
3646} 3775}
3647 3776
3648struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root) 3777struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
@@ -3663,38 +3792,38 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
3663void btrfs_free_block_rsv(struct btrfs_root *root, 3792void btrfs_free_block_rsv(struct btrfs_root *root,
3664 struct btrfs_block_rsv *rsv) 3793 struct btrfs_block_rsv *rsv)
3665{ 3794{
3666 if (rsv && atomic_dec_and_test(&rsv->usage)) { 3795 btrfs_block_rsv_release(root, rsv, (u64)-1);
3667 btrfs_block_rsv_release(root, rsv, (u64)-1); 3796 kfree(rsv);
3668 if (!rsv->durable)
3669 kfree(rsv);
3670 }
3671} 3797}
3672 3798
3673/* 3799int btrfs_block_rsv_add(struct btrfs_root *root,
3674 * make the block_rsv struct be able to capture freed space. 3800 struct btrfs_block_rsv *block_rsv,
3675 * the captured space will re-add to the the block_rsv struct 3801 u64 num_bytes)
3676 * after transaction commit
3677 */
3678void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
3679 struct btrfs_block_rsv *block_rsv)
3680{ 3802{
3681 block_rsv->durable = 1; 3803 int ret;
3682 mutex_lock(&fs_info->durable_block_rsv_mutex); 3804
3683 list_add_tail(&block_rsv->list, &fs_info->durable_block_rsv_list); 3805 if (num_bytes == 0)
3684 mutex_unlock(&fs_info->durable_block_rsv_mutex); 3806 return 0;
3807
3808 ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1);
3809 if (!ret) {
3810 block_rsv_add_bytes(block_rsv, num_bytes, 1);
3811 return 0;
3812 }
3813
3814 return ret;
3685} 3815}
3686 3816
3687int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, 3817int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
3688 struct btrfs_root *root, 3818 struct btrfs_block_rsv *block_rsv,
3689 struct btrfs_block_rsv *block_rsv, 3819 u64 num_bytes)
3690 u64 num_bytes)
3691{ 3820{
3692 int ret; 3821 int ret;
3693 3822
3694 if (num_bytes == 0) 3823 if (num_bytes == 0)
3695 return 0; 3824 return 0;
3696 3825
3697 ret = reserve_metadata_bytes(trans, root, block_rsv, num_bytes, 1); 3826 ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 0);
3698 if (!ret) { 3827 if (!ret) {
3699 block_rsv_add_bytes(block_rsv, num_bytes, 1); 3828 block_rsv_add_bytes(block_rsv, num_bytes, 1);
3700 return 0; 3829 return 0;
@@ -3703,55 +3832,52 @@ int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
3703 return ret; 3832 return ret;
3704} 3833}
3705 3834
3706int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, 3835int btrfs_block_rsv_check(struct btrfs_root *root,
3707 struct btrfs_root *root, 3836 struct btrfs_block_rsv *block_rsv, int min_factor)
3708 struct btrfs_block_rsv *block_rsv,
3709 u64 min_reserved, int min_factor)
3710{ 3837{
3711 u64 num_bytes = 0; 3838 u64 num_bytes = 0;
3712 int commit_trans = 0;
3713 int ret = -ENOSPC; 3839 int ret = -ENOSPC;
3714 3840
3715 if (!block_rsv) 3841 if (!block_rsv)
3716 return 0; 3842 return 0;
3717 3843
3718 spin_lock(&block_rsv->lock); 3844 spin_lock(&block_rsv->lock);
3719 if (min_factor > 0) 3845 num_bytes = div_factor(block_rsv->size, min_factor);
3720 num_bytes = div_factor(block_rsv->size, min_factor); 3846 if (block_rsv->reserved >= num_bytes)
3721 if (min_reserved > num_bytes) 3847 ret = 0;
3722 num_bytes = min_reserved; 3848 spin_unlock(&block_rsv->lock);
3723 3849
3724 if (block_rsv->reserved >= num_bytes) { 3850 return ret;
3851}
3852
3853int btrfs_block_rsv_refill(struct btrfs_root *root,
3854 struct btrfs_block_rsv *block_rsv,
3855 u64 min_reserved)
3856{
3857 u64 num_bytes = 0;
3858 int ret = -ENOSPC;
3859
3860 if (!block_rsv)
3861 return 0;
3862
3863 spin_lock(&block_rsv->lock);
3864 num_bytes = min_reserved;
3865 if (block_rsv->reserved >= num_bytes)
3725 ret = 0; 3866 ret = 0;
3726 } else { 3867 else
3727 num_bytes -= block_rsv->reserved; 3868 num_bytes -= block_rsv->reserved;
3728 if (block_rsv->durable &&
3729 block_rsv->freed[0] + block_rsv->freed[1] >= num_bytes)
3730 commit_trans = 1;
3731 }
3732 spin_unlock(&block_rsv->lock); 3869 spin_unlock(&block_rsv->lock);
3870
3733 if (!ret) 3871 if (!ret)
3734 return 0; 3872 return 0;
3735 3873
3736 if (block_rsv->refill_used) { 3874 ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1);
3737 ret = reserve_metadata_bytes(trans, root, block_rsv, 3875 if (!ret) {
3738 num_bytes, 0); 3876 block_rsv_add_bytes(block_rsv, num_bytes, 0);
3739 if (!ret) {
3740 block_rsv_add_bytes(block_rsv, num_bytes, 0);
3741 return 0;
3742 }
3743 }
3744
3745 if (commit_trans) {
3746 if (trans)
3747 return -EAGAIN;
3748 trans = btrfs_join_transaction(root);
3749 BUG_ON(IS_ERR(trans));
3750 ret = btrfs_commit_transaction(trans, root);
3751 return 0; 3877 return 0;
3752 } 3878 }
3753 3879
3754 return -ENOSPC; 3880 return ret;
3755} 3881}
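After this split the two entry points have distinct jobs: btrfs_block_rsv_check is a passive watermark test, while btrfs_block_rsv_refill actively reserves. Roughly (min_reserved is a caller-chosen target):

    /* passive: returns 0 while the rsv is still at least 80% full */
    if (btrfs_block_rsv_check(root, block_rsv, 8))
    	/* below the watermark; the caller decides how to react */;

    /* active: tops the rsv back up to min_reserved, flushing if needed */
    ret = btrfs_block_rsv_refill(root, block_rsv, min_reserved);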
3756 3882
3757int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv, 3883int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
@@ -3783,7 +3909,7 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
3783 u64 num_bytes; 3909 u64 num_bytes;
3784 u64 meta_used; 3910 u64 meta_used;
3785 u64 data_used; 3911 u64 data_used;
3786 int csum_size = btrfs_super_csum_size(&fs_info->super_copy); 3912 int csum_size = btrfs_super_csum_size(fs_info->super_copy);
3787 3913
3788 sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA); 3914 sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
3789 spin_lock(&sinfo->lock); 3915 spin_lock(&sinfo->lock);
@@ -3827,12 +3953,12 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
3827 if (sinfo->total_bytes > num_bytes) { 3953 if (sinfo->total_bytes > num_bytes) {
3828 num_bytes = sinfo->total_bytes - num_bytes; 3954 num_bytes = sinfo->total_bytes - num_bytes;
3829 block_rsv->reserved += num_bytes; 3955 block_rsv->reserved += num_bytes;
3830 sinfo->bytes_reserved += num_bytes; 3956 sinfo->bytes_may_use += num_bytes;
3831 } 3957 }
3832 3958
3833 if (block_rsv->reserved >= block_rsv->size) { 3959 if (block_rsv->reserved >= block_rsv->size) {
3834 num_bytes = block_rsv->reserved - block_rsv->size; 3960 num_bytes = block_rsv->reserved - block_rsv->size;
3835 sinfo->bytes_reserved -= num_bytes; 3961 sinfo->bytes_may_use -= num_bytes;
3836 sinfo->reservation_progress++; 3962 sinfo->reservation_progress++;
3837 block_rsv->reserved = block_rsv->size; 3963 block_rsv->reserved = block_rsv->size;
3838 block_rsv->full = 1; 3964 block_rsv->full = 1;
@@ -3848,16 +3974,13 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
3848 3974
3849 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); 3975 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
3850 fs_info->chunk_block_rsv.space_info = space_info; 3976 fs_info->chunk_block_rsv.space_info = space_info;
3851 fs_info->chunk_block_rsv.priority = 10;
3852 3977
3853 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); 3978 space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
3854 fs_info->global_block_rsv.space_info = space_info; 3979 fs_info->global_block_rsv.space_info = space_info;
3855 fs_info->global_block_rsv.priority = 10;
3856 fs_info->global_block_rsv.refill_used = 1;
3857 fs_info->delalloc_block_rsv.space_info = space_info; 3980 fs_info->delalloc_block_rsv.space_info = space_info;
3858 fs_info->trans_block_rsv.space_info = space_info; 3981 fs_info->trans_block_rsv.space_info = space_info;
3859 fs_info->empty_block_rsv.space_info = space_info; 3982 fs_info->empty_block_rsv.space_info = space_info;
3860 fs_info->empty_block_rsv.priority = 10; 3983 fs_info->delayed_block_rsv.space_info = space_info;
3861 3984
3862 fs_info->extent_root->block_rsv = &fs_info->global_block_rsv; 3985 fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
3863 fs_info->csum_root->block_rsv = &fs_info->global_block_rsv; 3986 fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
@@ -3865,10 +3988,6 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
3865 fs_info->tree_root->block_rsv = &fs_info->global_block_rsv; 3988 fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
3866 fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv; 3989 fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
3867 3990
3868 btrfs_add_durable_block_rsv(fs_info, &fs_info->global_block_rsv);
3869
3870 btrfs_add_durable_block_rsv(fs_info, &fs_info->delalloc_block_rsv);
3871
3872 update_global_block_rsv(fs_info); 3991 update_global_block_rsv(fs_info);
3873} 3992}
3874 3993
@@ -3881,37 +4000,8 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
3881 WARN_ON(fs_info->trans_block_rsv.reserved > 0); 4000 WARN_ON(fs_info->trans_block_rsv.reserved > 0);
3882 WARN_ON(fs_info->chunk_block_rsv.size > 0); 4001 WARN_ON(fs_info->chunk_block_rsv.size > 0);
3883 WARN_ON(fs_info->chunk_block_rsv.reserved > 0); 4002 WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
3884} 4003 WARN_ON(fs_info->delayed_block_rsv.size > 0);
3885 4004 WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
3886int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans,
3887 struct btrfs_root *root,
3888 struct btrfs_block_rsv *rsv)
3889{
3890 struct btrfs_block_rsv *trans_rsv = &root->fs_info->trans_block_rsv;
3891 u64 num_bytes;
3892 int ret;
3893
3894 /*
3895 * Truncate should be freeing data, but give us 2 items just in case it
3896 * needs to use some space. We may want to be smarter about this in the
3897 * future.
3898 */
3899 num_bytes = btrfs_calc_trans_metadata_size(root, 2);
3900
3901 /* We already have enough bytes, just return */
3902 if (rsv->reserved >= num_bytes)
3903 return 0;
3904
3905 num_bytes -= rsv->reserved;
3906
3907 /*
3908 * You should have reserved enough space before hand to do this, so this
3909 * should not fail.
3910 */
3911 ret = block_rsv_migrate_bytes(trans_rsv, rsv, num_bytes);
3912 BUG_ON(ret);
3913
3914 return 0;
3915} 4005}
3916 4006
3917void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, 4007void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
@@ -3920,9 +4010,7 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
3920 if (!trans->bytes_reserved) 4010 if (!trans->bytes_reserved)
3921 return; 4011 return;
3922 4012
3923 BUG_ON(trans->block_rsv != &root->fs_info->trans_block_rsv); 4013 btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
3924 btrfs_block_rsv_release(root, trans->block_rsv,
3925 trans->bytes_reserved);
3926 trans->bytes_reserved = 0; 4014 trans->bytes_reserved = 0;
3927} 4015}
3928 4016
@@ -3964,11 +4052,19 @@ int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
3964 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); 4052 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
3965} 4053}
3966 4054
4055/**
4056 * drop_outstanding_extent - drop an outstanding extent
4057 * @inode: the inode we're dropping the extent for
4058 *
 4059 * This is called when we are freeing up an outstanding extent, either
 4060 * after an error or after an extent is written. This will return the number of
4061 * reserved extents that need to be freed. This must be called with
4062 * BTRFS_I(inode)->lock held.
4063 */
3967static unsigned drop_outstanding_extent(struct inode *inode) 4064static unsigned drop_outstanding_extent(struct inode *inode)
3968{ 4065{
3969 unsigned dropped_extents = 0; 4066 unsigned dropped_extents = 0;
3970 4067
3971 spin_lock(&BTRFS_I(inode)->lock);
3972 BUG_ON(!BTRFS_I(inode)->outstanding_extents); 4068 BUG_ON(!BTRFS_I(inode)->outstanding_extents);
3973 BTRFS_I(inode)->outstanding_extents--; 4069 BTRFS_I(inode)->outstanding_extents--;
3974 4070
@@ -3978,19 +4074,70 @@ static unsigned drop_outstanding_extent(struct inode *inode)
3978 */ 4074 */
3979 if (BTRFS_I(inode)->outstanding_extents >= 4075 if (BTRFS_I(inode)->outstanding_extents >=
3980 BTRFS_I(inode)->reserved_extents) 4076 BTRFS_I(inode)->reserved_extents)
3981 goto out; 4077 return 0;
3982 4078
3983 dropped_extents = BTRFS_I(inode)->reserved_extents - 4079 dropped_extents = BTRFS_I(inode)->reserved_extents -
3984 BTRFS_I(inode)->outstanding_extents; 4080 BTRFS_I(inode)->outstanding_extents;
3985 BTRFS_I(inode)->reserved_extents -= dropped_extents; 4081 BTRFS_I(inode)->reserved_extents -= dropped_extents;
3986out:
3987 spin_unlock(&BTRFS_I(inode)->lock);
3988 return dropped_extents; 4082 return dropped_extents;
3989} 4083}
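A quick numeric trace of the helper: with outstanding_extents = 5 and reserved_extents = 5, completing one extent drops outstanding to 4; since 4 < 5 the function returns 5 - 4 = 1 and reserved_extents becomes 4, so the caller frees the metadata reserved for exactly one extent. Had reserved_extents still been 4 or less, it would return 0 and nothing would be freed.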
3990 4084
3991static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes) 4085/**
 4086 * calc_csum_metadata_size - return the amount of metadata space that must be
 4087 * reserved/freed for the given bytes.
4088 * @inode: the inode we're manipulating
4089 * @num_bytes: the number of bytes in question
4090 * @reserve: 1 if we are reserving space, 0 if we are freeing space
4091 *
4092 * This adjusts the number of csum_bytes in the inode and then returns the
4093 * correct amount of metadata that must either be reserved or freed. We
4094 * calculate how many checksums we can fit into one leaf and then divide the
 4095 * number of bytes that will need to be checksummed by this value to figure out
4096 * how many checksums will be required. If we are adding bytes then the number
4097 * may go up and we will return the number of additional bytes that must be
4098 * reserved. If it is going down we will return the number of bytes that must
4099 * be freed.
4100 *
4101 * This must be called with BTRFS_I(inode)->lock held.
4102 */
4103static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes,
4104 int reserve)
3992{ 4105{
3993 return num_bytes >>= 3; 4106 struct btrfs_root *root = BTRFS_I(inode)->root;
4107 u64 csum_size;
4108 int num_csums_per_leaf;
4109 int num_csums;
4110 int old_csums;
4111
4112 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM &&
4113 BTRFS_I(inode)->csum_bytes == 0)
4114 return 0;
4115
4116 old_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
4117 if (reserve)
4118 BTRFS_I(inode)->csum_bytes += num_bytes;
4119 else
4120 BTRFS_I(inode)->csum_bytes -= num_bytes;
4121 csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item);
4122 num_csums_per_leaf = (int)div64_u64(csum_size,
4123 sizeof(struct btrfs_csum_item) +
4124 sizeof(struct btrfs_disk_key));
4125 num_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
4126 num_csums = num_csums + num_csums_per_leaf - 1;
4127 num_csums = num_csums / num_csums_per_leaf;
4128
4129 old_csums = old_csums + num_csums_per_leaf - 1;
4130 old_csums = old_csums / num_csums_per_leaf;
4131
4132 /* No change, no need to reserve more */
4133 if (old_csums == num_csums)
4134 return 0;
4135
4136 if (reserve)
4137 return btrfs_calc_trans_metadata_size(root,
4138 num_csums - old_csums);
4139
4140 return btrfs_calc_trans_metadata_size(root, old_csums - num_csums);
3994} 4141}
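A worked example of the csum math (the per-leaf capacity is hypothetical; the real figure depends on the leaf size): assume a 4 KiB sectorsize and 100 csum items per leaf. Reserving for 1 MiB of fresh data moves csum_bytes from 0 to 1 MiB, i.e. from 0 checksums to 256; rounding up per leaf, that is 0 leaves before and ceil(256/100) = 3 leaves after, so the function returns btrfs_calc_trans_metadata_size(root, 3). Freeing the same range later runs the computation in reverse and returns the matching amount to release.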
3995 4142
3996int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) 4143int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
@@ -3999,9 +4146,13 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
3999 struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv; 4146 struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
4000 u64 to_reserve = 0; 4147 u64 to_reserve = 0;
4001 unsigned nr_extents = 0; 4148 unsigned nr_extents = 0;
4149 int flush = 1;
4002 int ret; 4150 int ret;
4003 4151
4004 if (btrfs_transaction_in_commit(root->fs_info)) 4152 if (btrfs_is_free_space_inode(root, inode))
4153 flush = 0;
4154
4155 if (flush && btrfs_transaction_in_commit(root->fs_info))
4005 schedule_timeout(1); 4156 schedule_timeout(1);
4006 4157
4007 num_bytes = ALIGN(num_bytes, root->sectorsize); 4158 num_bytes = ALIGN(num_bytes, root->sectorsize);
@@ -4017,18 +4168,29 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4017 4168
4018 to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents); 4169 to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
4019 } 4170 }
4171 to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
4020 spin_unlock(&BTRFS_I(inode)->lock); 4172 spin_unlock(&BTRFS_I(inode)->lock);
4021 4173
4022 to_reserve += calc_csum_metadata_size(inode, num_bytes); 4174 ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
4023 ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1);
4024 if (ret) { 4175 if (ret) {
4176 u64 to_free = 0;
4025 unsigned dropped; 4177 unsigned dropped;
4178
4179 spin_lock(&BTRFS_I(inode)->lock);
4180 dropped = drop_outstanding_extent(inode);
4181 to_free = calc_csum_metadata_size(inode, num_bytes, 0);
4182 spin_unlock(&BTRFS_I(inode)->lock);
4183 to_free += btrfs_calc_trans_metadata_size(root, dropped);
4184
4026 /* 4185 /*
4027 * We don't need the return value since our reservation failed, 4186 * Somebody could have come in and twiddled with the
4028 * we just need to clean up our counter. 4187 * reservation, so if we have to free more than we would have
4188 * reserved from this reservation go ahead and release those
4189 * bytes.
4029 */ 4190 */
4030 dropped = drop_outstanding_extent(inode); 4191 to_free -= to_reserve;
4031 WARN_ON(dropped > 1); 4192 if (to_free)
4193 btrfs_block_rsv_release(root, block_rsv, to_free);
4032 return ret; 4194 return ret;
4033 } 4195 }
4034 4196
@@ -4037,6 +4199,15 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4037 return 0; 4199 return 0;
4038} 4200}
4039 4201
4202/**
4203 * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
4204 * @inode: the inode to release the reservation for
4205 * @num_bytes: the number of bytes we're releasing
4206 *
4207 * This will release the metadata reservation for an inode. This can be called
4208 * once we complete IO for a given set of bytes to release their metadata
4209 * reservations.
4210 */
4040void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) 4211void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
4041{ 4212{
4042 struct btrfs_root *root = BTRFS_I(inode)->root; 4213 struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -4044,9 +4215,11 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
4044 unsigned dropped; 4215 unsigned dropped;
4045 4216
4046 num_bytes = ALIGN(num_bytes, root->sectorsize); 4217 num_bytes = ALIGN(num_bytes, root->sectorsize);
4218 spin_lock(&BTRFS_I(inode)->lock);
4047 dropped = drop_outstanding_extent(inode); 4219 dropped = drop_outstanding_extent(inode);
4048 4220
4049 to_free = calc_csum_metadata_size(inode, num_bytes); 4221 to_free = calc_csum_metadata_size(inode, num_bytes, 0);
4222 spin_unlock(&BTRFS_I(inode)->lock);
4050 if (dropped > 0) 4223 if (dropped > 0)
4051 to_free += btrfs_calc_trans_metadata_size(root, dropped); 4224 to_free += btrfs_calc_trans_metadata_size(root, dropped);
4052 4225
@@ -4054,6 +4227,21 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
4054 to_free); 4227 to_free);
4055} 4228}
4056 4229
4230/**
4231 * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc
4232 * @inode: inode we're writing to
4233 * @num_bytes: the number of bytes we want to allocate
4234 *
 4235 * This will do the following things:
4236 *
4237 * o reserve space in the data space info for num_bytes
4238 * o reserve space in the metadata space info based on number of outstanding
4239 * extents and how much csums will be needed
 4240 * o add to the inode's ->delalloc_bytes
4241 * o add it to the fs_info's delalloc inodes list.
4242 *
4243 * This will return 0 for success and -ENOSPC if there is no space left.
4244 */
4057int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes) 4245int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
4058{ 4246{
4059 int ret; 4247 int ret;
@@ -4071,6 +4259,19 @@ int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
4071 return 0; 4259 return 0;
4072} 4260}
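A sketch of how a write path is expected to pair the two calls (the copy step is hypothetical; btrfs_delalloc_release_space is documented just below):

    ret = btrfs_delalloc_reserve_space(inode, num_bytes);
    if (ret)
    	return ret;	/* -ENOSPC: nothing was reserved */

    ret = copy_into_page_cache(inode, buf, num_bytes);	/* hypothetical */
    if (ret)	/* error before the data became delalloc: give it all back */
    	btrfs_delalloc_release_space(inode, num_bytes);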
4073 4261
4262/**
4263 * btrfs_delalloc_release_space - release data and metadata space for delalloc
4264 * @inode: inode we're releasing space for
4265 * @num_bytes: the number of bytes we want to free up
4266 *
4267 * This must be matched with a call to btrfs_delalloc_reserve_space. This is
4268 * called in the case that we don't need the metadata AND data reservations
 4269 * anymore, for example if there is an error or we insert an inline extent.
4270 *
4271 * This function will release the metadata space that was not used and will
4272 * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
4273 * list if there are no delalloc bytes left.
4274 */
4074void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes) 4275void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
4075{ 4276{
4076 btrfs_delalloc_release_metadata(inode, num_bytes); 4277 btrfs_delalloc_release_metadata(inode, num_bytes);
@@ -4090,12 +4291,12 @@ static int update_block_group(struct btrfs_trans_handle *trans,
4090 4291
4091 /* block accounting for super block */ 4292 /* block accounting for super block */
4092 spin_lock(&info->delalloc_lock); 4293 spin_lock(&info->delalloc_lock);
4093 old_val = btrfs_super_bytes_used(&info->super_copy); 4294 old_val = btrfs_super_bytes_used(info->super_copy);
4094 if (alloc) 4295 if (alloc)
4095 old_val += num_bytes; 4296 old_val += num_bytes;
4096 else 4297 else
4097 old_val -= num_bytes; 4298 old_val -= num_bytes;
4098 btrfs_set_super_bytes_used(&info->super_copy, old_val); 4299 btrfs_set_super_bytes_used(info->super_copy, old_val);
4099 spin_unlock(&info->delalloc_lock); 4300 spin_unlock(&info->delalloc_lock);
4100 4301
4101 while (total) { 4302 while (total) {
@@ -4123,7 +4324,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
4123 spin_lock(&cache->space_info->lock); 4324 spin_lock(&cache->space_info->lock);
4124 spin_lock(&cache->lock); 4325 spin_lock(&cache->lock);
4125 4326
4126 if (btrfs_super_cache_generation(&info->super_copy) != 0 && 4327 if (btrfs_test_opt(root, SPACE_CACHE) &&
4127 cache->disk_cache_state < BTRFS_DC_CLEAR) 4328 cache->disk_cache_state < BTRFS_DC_CLEAR)
4128 cache->disk_cache_state = BTRFS_DC_CLEAR; 4329 cache->disk_cache_state = BTRFS_DC_CLEAR;
4129 4330
@@ -4135,7 +4336,6 @@ static int update_block_group(struct btrfs_trans_handle *trans,
4135 btrfs_set_block_group_used(&cache->item, old_val); 4336 btrfs_set_block_group_used(&cache->item, old_val);
4136 cache->reserved -= num_bytes; 4337 cache->reserved -= num_bytes;
4137 cache->space_info->bytes_reserved -= num_bytes; 4338 cache->space_info->bytes_reserved -= num_bytes;
4138 cache->space_info->reservation_progress++;
4139 cache->space_info->bytes_used += num_bytes; 4339 cache->space_info->bytes_used += num_bytes;
4140 cache->space_info->disk_used += num_bytes * factor; 4340 cache->space_info->disk_used += num_bytes * factor;
4141 spin_unlock(&cache->lock); 4341 spin_unlock(&cache->lock);
@@ -4187,7 +4387,6 @@ static int pin_down_extent(struct btrfs_root *root,
4187 if (reserved) { 4387 if (reserved) {
4188 cache->reserved -= num_bytes; 4388 cache->reserved -= num_bytes;
4189 cache->space_info->bytes_reserved -= num_bytes; 4389 cache->space_info->bytes_reserved -= num_bytes;
4190 cache->space_info->reservation_progress++;
4191 } 4390 }
4192 spin_unlock(&cache->lock); 4391 spin_unlock(&cache->lock);
4193 spin_unlock(&cache->space_info->lock); 4392 spin_unlock(&cache->space_info->lock);
@@ -4215,45 +4414,82 @@ int btrfs_pin_extent(struct btrfs_root *root,
4215} 4414}
4216 4415
4217/* 4416/*
4218 * update size of reserved extents. this function may return -EAGAIN 4417 * this function must be called within a transaction
4219 * if 'reserve' is true or 'sinfo' is false. 4418 */
4419int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
4420 struct btrfs_root *root,
4421 u64 bytenr, u64 num_bytes)
4422{
4423 struct btrfs_block_group_cache *cache;
4424
4425 cache = btrfs_lookup_block_group(root->fs_info, bytenr);
4426 BUG_ON(!cache);
4427
4428 /*
4429 * pull in the free space cache (if any) so that our pin
4430 * removes the free space from the cache. We have load_only set
4431 * to one because the slow code to read in the free extents does check
4432 * the pinned extents.
4433 */
4434 cache_block_group(cache, trans, root, 1);
4435
4436 pin_down_extent(root, cache, bytenr, num_bytes, 0);
4437
4438 /* remove us from the free space cache (if we're there at all) */
4439 btrfs_remove_free_space(cache, bytenr, num_bytes);
4440 btrfs_put_block_group(cache);
4441 return 0;
4442}
4443
4444/**
4445 * btrfs_update_reserved_bytes - update the block_group and space info counters
4446 * @cache: The cache we are manipulating
4447 * @num_bytes: The number of bytes in question
4448 * @reserve: One of the reservation enums
4449 *
4450 * This is called by the allocator when it reserves space, or by somebody who is
4451 * freeing space that was never actually used on disk. For example if you
4452 * reserve some space for a new leaf in transaction A and before transaction A
4453 * commits you free that leaf, you call this with reserve set to 0 in order to
4454 * clear the reservation.
4455 *
4456 * Metadata reservations should be called with RESERVE_ALLOC so we do the proper
4457 * ENOSPC accounting. For data we handle the reservation through clearing the
4458 * delalloc bits in the io_tree. We have to do this since we could end up
4459 * allocating less disk space for the amount of data we have reserved in the
4460 * case of compression.
4461 *
4462 * If this is a reservation and the block group has become read only we cannot
4463 * make the reservation and return -EAGAIN, otherwise this function always
4464 * succeeds.
4220 */ 4465 */
4221int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, 4466static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
4222 u64 num_bytes, int reserve, int sinfo) 4467 u64 num_bytes, int reserve)
4223{ 4468{
4469 struct btrfs_space_info *space_info = cache->space_info;
4224 int ret = 0; 4470 int ret = 0;
4225 if (sinfo) { 4471 spin_lock(&space_info->lock);
4226 struct btrfs_space_info *space_info = cache->space_info; 4472 spin_lock(&cache->lock);
4227 spin_lock(&space_info->lock); 4473 if (reserve != RESERVE_FREE) {
4228 spin_lock(&cache->lock);
4229 if (reserve) {
4230 if (cache->ro) {
4231 ret = -EAGAIN;
4232 } else {
4233 cache->reserved += num_bytes;
4234 space_info->bytes_reserved += num_bytes;
4235 }
4236 } else {
4237 if (cache->ro)
4238 space_info->bytes_readonly += num_bytes;
4239 cache->reserved -= num_bytes;
4240 space_info->bytes_reserved -= num_bytes;
4241 space_info->reservation_progress++;
4242 }
4243 spin_unlock(&cache->lock);
4244 spin_unlock(&space_info->lock);
4245 } else {
4246 spin_lock(&cache->lock);
4247 if (cache->ro) { 4474 if (cache->ro) {
4248 ret = -EAGAIN; 4475 ret = -EAGAIN;
4249 } else { 4476 } else {
4250 if (reserve) 4477 cache->reserved += num_bytes;
4251 cache->reserved += num_bytes; 4478 space_info->bytes_reserved += num_bytes;
4252 else 4479 if (reserve == RESERVE_ALLOC) {
4253 cache->reserved -= num_bytes; 4480 BUG_ON(space_info->bytes_may_use < num_bytes);
4481 space_info->bytes_may_use -= num_bytes;
4482 }
4254 } 4483 }
4255 spin_unlock(&cache->lock); 4484 } else {
4485 if (cache->ro)
4486 space_info->bytes_readonly += num_bytes;
4487 cache->reserved -= num_bytes;
4488 space_info->bytes_reserved -= num_bytes;
4489 space_info->reservation_progress++;
4256 } 4490 }
4491 spin_unlock(&cache->lock);
4492 spin_unlock(&space_info->lock);
4257 return ret; 4493 return ret;
4258} 4494}
4259 4495
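The kernel-doc above is dense, so as a reading aid here is a minimal userspace sketch of the same accounting. Everything except the RESERVE_* names and the branch structure is invented for illustration:

/* reserve_model.c - toy model of the btrfs_update_reserved_bytes() logic */
#include <stdio.h>
#include <stdint.h>

enum { RESERVE_FREE = 0, RESERVE_ALLOC = 1, RESERVE_ALLOC_NO_ACCOUNT = 2 };

struct space {
	uint64_t reserved;   /* bytes handed out by the allocator */
	uint64_t may_use;    /* bytes promised but not yet placed on disk */
	uint64_t readonly;   /* bytes stranded in read-only block groups */
	int ro;              /* block group went read-only */
};

static int update_reserved(struct space *s, uint64_t n, int reserve)
{
	if (reserve != RESERVE_FREE) {
		if (s->ro)
			return -1;          /* -EAGAIN in the kernel */
		s->reserved += n;
		if (reserve == RESERVE_ALLOC)
			s->may_use -= n;    /* promise becomes a reservation */
	} else {
		if (s->ro)
			s->readonly += n;
		s->reserved -= n;
	}
	return 0;
}

int main(void)
{
	struct space s = { .may_use = 4096 };

	update_reserved(&s, 4096, RESERVE_ALLOC);   /* allocator claims space */
	update_reserved(&s, 4096, RESERVE_FREE);    /* freed before commit */
	printf("reserved=%llu may_use=%llu\n",
	       (unsigned long long)s.reserved, (unsigned long long)s.may_use);
	return 0;
}

Note that RESERVE_ALLOC_NO_ACCOUNT takes the reserve branch but skips the bytes_may_use adjustment, which is why the data path (accounted through delalloc bits instead) selects it in find_free_extent() below.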
@@ -4319,13 +4555,8 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
4319 spin_lock(&cache->lock); 4555 spin_lock(&cache->lock);
4320 cache->pinned -= len; 4556 cache->pinned -= len;
4321 cache->space_info->bytes_pinned -= len; 4557 cache->space_info->bytes_pinned -= len;
4322 if (cache->ro) { 4558 if (cache->ro)
4323 cache->space_info->bytes_readonly += len; 4559 cache->space_info->bytes_readonly += len;
4324 } else if (cache->reserved_pinned > 0) {
4325 len = min(len, cache->reserved_pinned);
4326 cache->reserved_pinned -= len;
4327 cache->space_info->bytes_reserved += len;
4328 }
4329 spin_unlock(&cache->lock); 4560 spin_unlock(&cache->lock);
4330 spin_unlock(&cache->space_info->lock); 4561 spin_unlock(&cache->space_info->lock);
4331 } 4562 }
@@ -4340,11 +4571,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
4340{ 4571{
4341 struct btrfs_fs_info *fs_info = root->fs_info; 4572 struct btrfs_fs_info *fs_info = root->fs_info;
4342 struct extent_io_tree *unpin; 4573 struct extent_io_tree *unpin;
4343 struct btrfs_block_rsv *block_rsv;
4344 struct btrfs_block_rsv *next_rsv;
4345 u64 start; 4574 u64 start;
4346 u64 end; 4575 u64 end;
4347 int idx;
4348 int ret; 4576 int ret;
4349 4577
4350 if (fs_info->pinned_extents == &fs_info->freed_extents[0]) 4578 if (fs_info->pinned_extents == &fs_info->freed_extents[0])
@@ -4367,30 +4595,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
4367 cond_resched(); 4595 cond_resched();
4368 } 4596 }
4369 4597
4370 mutex_lock(&fs_info->durable_block_rsv_mutex);
4371 list_for_each_entry_safe(block_rsv, next_rsv,
4372 &fs_info->durable_block_rsv_list, list) {
4373
4374 idx = trans->transid & 0x1;
4375 if (block_rsv->freed[idx] > 0) {
4376 block_rsv_add_bytes(block_rsv,
4377 block_rsv->freed[idx], 0);
4378 block_rsv->freed[idx] = 0;
4379 }
4380 if (atomic_read(&block_rsv->usage) == 0) {
4381 btrfs_block_rsv_release(root, block_rsv, (u64)-1);
4382
4383 if (block_rsv->freed[0] == 0 &&
4384 block_rsv->freed[1] == 0) {
4385 list_del_init(&block_rsv->list);
4386 kfree(block_rsv);
4387 }
4388 } else {
4389 btrfs_block_rsv_release(root, block_rsv, 0);
4390 }
4391 }
4392 mutex_unlock(&fs_info->durable_block_rsv_mutex);
4393
4394 return 0; 4598 return 0;
4395} 4599}
4396 4600
@@ -4668,7 +4872,6 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
4668 struct extent_buffer *buf, 4872 struct extent_buffer *buf,
4669 u64 parent, int last_ref) 4873 u64 parent, int last_ref)
4670{ 4874{
4671 struct btrfs_block_rsv *block_rsv;
4672 struct btrfs_block_group_cache *cache = NULL; 4875 struct btrfs_block_group_cache *cache = NULL;
4673 int ret; 4876 int ret;
4674 4877
@@ -4683,64 +4886,24 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
4683 if (!last_ref) 4886 if (!last_ref)
4684 return; 4887 return;
4685 4888
4686 block_rsv = get_block_rsv(trans, root);
4687 cache = btrfs_lookup_block_group(root->fs_info, buf->start); 4889 cache = btrfs_lookup_block_group(root->fs_info, buf->start);
4688 if (block_rsv->space_info != cache->space_info)
4689 goto out;
4690 4890
4691 if (btrfs_header_generation(buf) == trans->transid) { 4891 if (btrfs_header_generation(buf) == trans->transid) {
4692 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) { 4892 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
4693 ret = check_ref_cleanup(trans, root, buf->start); 4893 ret = check_ref_cleanup(trans, root, buf->start);
4694 if (!ret) 4894 if (!ret)
4695 goto pin; 4895 goto out;
4696 } 4896 }
4697 4897
4698 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { 4898 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
4699 pin_down_extent(root, cache, buf->start, buf->len, 1); 4899 pin_down_extent(root, cache, buf->start, buf->len, 1);
4700 goto pin; 4900 goto out;
4701 } 4901 }
4702 4902
4703 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); 4903 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
4704 4904
4705 btrfs_add_free_space(cache, buf->start, buf->len); 4905 btrfs_add_free_space(cache, buf->start, buf->len);
4706 ret = btrfs_update_reserved_bytes(cache, buf->len, 0, 0); 4906 btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE);
4707 if (ret == -EAGAIN) {
4708 /* block group became read-only */
4709 btrfs_update_reserved_bytes(cache, buf->len, 0, 1);
4710 goto out;
4711 }
4712
4713 ret = 1;
4714 spin_lock(&block_rsv->lock);
4715 if (block_rsv->reserved < block_rsv->size) {
4716 block_rsv->reserved += buf->len;
4717 ret = 0;
4718 }
4719 spin_unlock(&block_rsv->lock);
4720
4721 if (ret) {
4722 spin_lock(&cache->space_info->lock);
4723 cache->space_info->bytes_reserved -= buf->len;
4724 cache->space_info->reservation_progress++;
4725 spin_unlock(&cache->space_info->lock);
4726 }
4727 goto out;
4728 }
4729pin:
4730 if (block_rsv->durable && !cache->ro) {
4731 ret = 0;
4732 spin_lock(&cache->lock);
4733 if (!cache->ro) {
4734 cache->reserved_pinned += buf->len;
4735 ret = 1;
4736 }
4737 spin_unlock(&cache->lock);
4738
4739 if (ret) {
4740 spin_lock(&block_rsv->lock);
4741 block_rsv->freed[trans->transid & 0x1] += buf->len;
4742 spin_unlock(&block_rsv->lock);
4743 }
4744 } 4907 }
4745out: 4908out:
4746 /* 4909 /*
@@ -4883,10 +5046,13 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
4883 int last_ptr_loop = 0; 5046 int last_ptr_loop = 0;
4884 int loop = 0; 5047 int loop = 0;
4885 int index = 0; 5048 int index = 0;
5049 int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ?
5050 RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC;
4886 bool found_uncached_bg = false; 5051 bool found_uncached_bg = false;
4887 bool failed_cluster_refill = false; 5052 bool failed_cluster_refill = false;
4888 bool failed_alloc = false; 5053 bool failed_alloc = false;
4889 bool use_cluster = true; 5054 bool use_cluster = true;
5055 bool have_caching_bg = false;
4890 u64 ideal_cache_percent = 0; 5056 u64 ideal_cache_percent = 0;
4891 u64 ideal_cache_offset = 0; 5057 u64 ideal_cache_offset = 0;
4892 5058
@@ -4969,6 +5135,7 @@ ideal_cache:
4969 } 5135 }
4970 } 5136 }
4971search: 5137search:
5138 have_caching_bg = false;
4972 down_read(&space_info->groups_sem); 5139 down_read(&space_info->groups_sem);
4973 list_for_each_entry(block_group, &space_info->block_groups[index], 5140 list_for_each_entry(block_group, &space_info->block_groups[index],
4974 list) { 5141 list) {
@@ -5177,6 +5344,8 @@ refill_cluster:
5177 failed_alloc = true; 5344 failed_alloc = true;
5178 goto have_block_group; 5345 goto have_block_group;
5179 } else if (!offset) { 5346 } else if (!offset) {
5347 if (!cached)
5348 have_caching_bg = true;
5180 goto loop; 5349 goto loop;
5181 } 5350 }
5182checks: 5351checks:
@@ -5202,8 +5371,8 @@ checks:
5202 search_start - offset); 5371 search_start - offset);
5203 BUG_ON(offset > search_start); 5372 BUG_ON(offset > search_start);
5204 5373
5205 ret = btrfs_update_reserved_bytes(block_group, num_bytes, 1, 5374 ret = btrfs_update_reserved_bytes(block_group, num_bytes,
5206 (data & BTRFS_BLOCK_GROUP_DATA)); 5375 alloc_type);
5207 if (ret == -EAGAIN) { 5376 if (ret == -EAGAIN) {
5208 btrfs_add_free_space(block_group, offset, num_bytes); 5377 btrfs_add_free_space(block_group, offset, num_bytes);
5209 goto loop; 5378 goto loop;
@@ -5227,6 +5396,9 @@ loop:
5227 } 5396 }
5228 up_read(&space_info->groups_sem); 5397 up_read(&space_info->groups_sem);
5229 5398
5399 if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg)
5400 goto search;
5401
5230 if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES) 5402 if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
5231 goto search; 5403 goto search;
5232 5404
@@ -5325,7 +5497,8 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
5325 int index = 0; 5497 int index = 0;
5326 5498
5327 spin_lock(&info->lock); 5499 spin_lock(&info->lock);
5328 printk(KERN_INFO "space_info has %llu free, is %sfull\n", 5500 printk(KERN_INFO "space_info %llu has %llu free, is %sfull\n",
5501 (unsigned long long)info->flags,
5329 (unsigned long long)(info->total_bytes - info->bytes_used - 5502 (unsigned long long)(info->total_bytes - info->bytes_used -
5330 info->bytes_pinned - info->bytes_reserved - 5503 info->bytes_pinned - info->bytes_reserved -
5331 info->bytes_readonly), 5504 info->bytes_readonly),
@@ -5411,7 +5584,8 @@ again:
5411 return ret; 5584 return ret;
5412} 5585}
5413 5586
5414int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len) 5587static int __btrfs_free_reserved_extent(struct btrfs_root *root,
5588 u64 start, u64 len, int pin)
5415{ 5589{
5416 struct btrfs_block_group_cache *cache; 5590 struct btrfs_block_group_cache *cache;
5417 int ret = 0; 5591 int ret = 0;
@@ -5426,8 +5600,12 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
5426 if (btrfs_test_opt(root, DISCARD)) 5600 if (btrfs_test_opt(root, DISCARD))
5427 ret = btrfs_discard_extent(root, start, len, NULL); 5601 ret = btrfs_discard_extent(root, start, len, NULL);
5428 5602
5429 btrfs_add_free_space(cache, start, len); 5603 if (pin)
5430 btrfs_update_reserved_bytes(cache, len, 0, 1); 5604 pin_down_extent(root, cache, start, len, 1);
5605 else {
5606 btrfs_add_free_space(cache, start, len);
5607 btrfs_update_reserved_bytes(cache, len, RESERVE_FREE);
5608 }
5431 btrfs_put_block_group(cache); 5609 btrfs_put_block_group(cache);
5432 5610
5433 trace_btrfs_reserved_extent_free(root, start, len); 5611 trace_btrfs_reserved_extent_free(root, start, len);
@@ -5435,6 +5613,18 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
5435 return ret; 5613 return ret;
5436} 5614}
5437 5615
5616int btrfs_free_reserved_extent(struct btrfs_root *root,
5617 u64 start, u64 len)
5618{
5619 return __btrfs_free_reserved_extent(root, start, len, 0);
5620}
5621
5622int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
5623 u64 start, u64 len)
5624{
5625 return __btrfs_free_reserved_extent(root, start, len, 1);
5626}
5627
5438static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, 5628static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
5439 struct btrfs_root *root, 5629 struct btrfs_root *root,
5440 u64 parent, u64 root_objectid, 5630 u64 parent, u64 root_objectid,
@@ -5630,7 +5820,8 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
5630 put_caching_control(caching_ctl); 5820 put_caching_control(caching_ctl);
5631 } 5821 }
5632 5822
5633 ret = btrfs_update_reserved_bytes(block_group, ins->offset, 1, 1); 5823 ret = btrfs_update_reserved_bytes(block_group, ins->offset,
5824 RESERVE_ALLOC_NO_ACCOUNT);
5634 BUG_ON(ret); 5825 BUG_ON(ret);
5635 btrfs_put_block_group(block_group); 5826 btrfs_put_block_group(block_group);
5636 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, 5827 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
@@ -5687,8 +5878,7 @@ use_block_rsv(struct btrfs_trans_handle *trans,
5687 block_rsv = get_block_rsv(trans, root); 5878 block_rsv = get_block_rsv(trans, root);
5688 5879
5689 if (block_rsv->size == 0) { 5880 if (block_rsv->size == 0) {
5690 ret = reserve_metadata_bytes(trans, root, block_rsv, 5881 ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
5691 blocksize, 0);
5692 /* 5882 /*
5693 * If we couldn't reserve metadata bytes try and use some from 5883 * If we couldn't reserve metadata bytes try and use some from
5694 * the global reserve. 5884 * the global reserve.
@@ -5708,13 +5898,15 @@ use_block_rsv(struct btrfs_trans_handle *trans,
5708 if (!ret) 5898 if (!ret)
5709 return block_rsv; 5899 return block_rsv;
5710 if (ret) { 5900 if (ret) {
5711 WARN_ON(1); 5901 static DEFINE_RATELIMIT_STATE(_rs,
5712 ret = reserve_metadata_bytes(trans, root, block_rsv, blocksize, 5902 DEFAULT_RATELIMIT_INTERVAL,
5713 0); 5903 /*DEFAULT_RATELIMIT_BURST*/ 2);
5904 if (__ratelimit(&_rs)) {
5905 printk(KERN_DEBUG "btrfs: block rsv returned %d\n", ret);
5906 WARN_ON(1);
5907 }
5908 ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
5714 if (!ret) { 5909 if (!ret) {
5715 spin_lock(&block_rsv->lock);
5716 block_rsv->size += blocksize;
5717 spin_unlock(&block_rsv->lock);
5718 return block_rsv; 5910 return block_rsv;
5719 } else if (ret && block_rsv != global_rsv) { 5911 } else if (ret && block_rsv != global_rsv) {
5720 ret = block_rsv_use_bytes(global_rsv, blocksize); 5912 ret = block_rsv_use_bytes(global_rsv, blocksize);
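The replacement warning path above throttles its printk with a ratelimit state allowing a burst of 2 per interval. A standalone model of that throttle, using a fake integer clock rather than the kernel's __ratelimit():

#include <stdio.h>

struct rs { long begin; int printed; long interval; int burst; };

static int ratelimit(struct rs *r, long now)
{
	if (now - r->begin >= r->interval) {  /* new window: reset budget */
		r->begin = now;
		r->printed = 0;
	}
	if (r->printed >= r->burst)
		return 0;                     /* suppressed */
	r->printed++;
	return 1;                             /* allowed */
}

int main(void)
{
	struct rs r = { .interval = 5, .burst = 2 };

	for (long t = 0; t < 12; t++)
		if (ratelimit(&r, t))
			printf("t=%ld: block rsv warning emitted\n", t);
	return 0;
}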
@@ -6592,12 +6784,9 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
6592 cache->bytes_super - btrfs_block_group_used(&cache->item); 6784 cache->bytes_super - btrfs_block_group_used(&cache->item);
6593 6785
6594 if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned + 6786 if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
6595 sinfo->bytes_may_use + sinfo->bytes_readonly + 6787 sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes +
6596 cache->reserved_pinned + num_bytes + min_allocable_bytes <= 6788 min_allocable_bytes <= sinfo->total_bytes) {
6597 sinfo->total_bytes) {
6598 sinfo->bytes_readonly += num_bytes; 6789 sinfo->bytes_readonly += num_bytes;
6599 sinfo->bytes_reserved += cache->reserved_pinned;
6600 cache->reserved_pinned = 0;
6601 cache->ro = 1; 6790 cache->ro = 1;
6602 ret = 0; 6791 ret = 0;
6603 } 6792 }
@@ -6964,7 +7153,8 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
6964 struct btrfs_space_info, 7153 struct btrfs_space_info,
6965 list); 7154 list);
6966 if (space_info->bytes_pinned > 0 || 7155 if (space_info->bytes_pinned > 0 ||
6967 space_info->bytes_reserved > 0) { 7156 space_info->bytes_reserved > 0 ||
7157 space_info->bytes_may_use > 0) {
6968 WARN_ON(1); 7158 WARN_ON(1);
6969 dump_space_info(space_info, 0, 0); 7159 dump_space_info(space_info, 0, 0);
6970 } 7160 }
@@ -7006,14 +7196,12 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7006 return -ENOMEM; 7196 return -ENOMEM;
7007 path->reada = 1; 7197 path->reada = 1;
7008 7198
7009 cache_gen = btrfs_super_cache_generation(&root->fs_info->super_copy); 7199 cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
7010 if (cache_gen != 0 && 7200 if (btrfs_test_opt(root, SPACE_CACHE) &&
7011 btrfs_super_generation(&root->fs_info->super_copy) != cache_gen) 7201 btrfs_super_generation(root->fs_info->super_copy) != cache_gen)
7012 need_clear = 1; 7202 need_clear = 1;
7013 if (btrfs_test_opt(root, CLEAR_CACHE)) 7203 if (btrfs_test_opt(root, CLEAR_CACHE))
7014 need_clear = 1; 7204 need_clear = 1;
7015 if (!btrfs_test_opt(root, SPACE_CACHE) && cache_gen)
7016 printk(KERN_INFO "btrfs: disk space caching is enabled\n");
7017 7205
7018 while (1) { 7206 while (1) {
7019 ret = find_first_block_group(root, path, &key); 7207 ret = find_first_block_group(root, path, &key);
@@ -7252,7 +7440,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
7252 goto out; 7440 goto out;
7253 } 7441 }
7254 7442
7255 inode = lookup_free_space_inode(root, block_group, path); 7443 inode = lookup_free_space_inode(tree_root, block_group, path);
7256 if (!IS_ERR(inode)) { 7444 if (!IS_ERR(inode)) {
7257 ret = btrfs_orphan_add(trans, inode); 7445 ret = btrfs_orphan_add(trans, inode);
7258 BUG_ON(ret); 7446 BUG_ON(ret);
@@ -7268,7 +7456,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
7268 spin_unlock(&block_group->lock); 7456 spin_unlock(&block_group->lock);
7269 } 7457 }
7270 /* One for our lookup ref */ 7458 /* One for our lookup ref */
7271 iput(inode); 7459 btrfs_add_delayed_iput(inode);
7272 } 7460 }
7273 7461
7274 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 7462 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
@@ -7339,7 +7527,7 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
7339 int mixed = 0; 7527 int mixed = 0;
7340 int ret; 7528 int ret;
7341 7529
7342 disk_super = &fs_info->super_copy; 7530 disk_super = fs_info->super_copy;
7343 if (!btrfs_super_root(disk_super)) 7531 if (!btrfs_super_root(disk_super))
7344 return 1; 7532 return 1;
7345 7533
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 624ef10d36cc..1f87c4d0e7a0 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -895,6 +895,194 @@ search_again:
895 goto again; 895 goto again;
896} 896}
897 897
898/**
 899 * convert_extent_bit - convert all bits in a given range from one bit to another
900 * @tree: the io tree to search
901 * @start: the start offset in bytes
902 * @end: the end offset in bytes (inclusive)
903 * @bits: the bits to set in this range
904 * @clear_bits: the bits to clear in this range
905 * @mask: the allocation mask
906 *
907 * This will go through and set bits for the given range. If any states exist
908 * already in this range they are set with the given bit and cleared of the
909 * clear_bits. This is only meant to be used by things that are mergeable, ie
910 * converting from say DELALLOC to DIRTY. This is not meant to be used with
911 * boundary bits like LOCK.
912 */
913int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
914 int bits, int clear_bits, gfp_t mask)
915{
916 struct extent_state *state;
917 struct extent_state *prealloc = NULL;
918 struct rb_node *node;
919 int err = 0;
920 u64 last_start;
921 u64 last_end;
922
923again:
924 if (!prealloc && (mask & __GFP_WAIT)) {
925 prealloc = alloc_extent_state(mask);
926 if (!prealloc)
927 return -ENOMEM;
928 }
929
930 spin_lock(&tree->lock);
931 /*
932 * this search will find all the extents that end after
933 * our range starts.
934 */
935 node = tree_search(tree, start);
936 if (!node) {
937 prealloc = alloc_extent_state_atomic(prealloc);
 938 if (!prealloc) { err = -ENOMEM; goto out; }
940 err = insert_state(tree, prealloc, start, end, &bits);
941 prealloc = NULL;
942 BUG_ON(err == -EEXIST);
943 goto out;
944 }
945 state = rb_entry(node, struct extent_state, rb_node);
946hit_next:
947 last_start = state->start;
948 last_end = state->end;
949
950 /*
951 * | ---- desired range ---- |
952 * | state |
953 *
954 * Just lock what we found and keep going
955 */
956 if (state->start == start && state->end <= end) {
957 struct rb_node *next_node;
958
959 set_state_bits(tree, state, &bits);
960 clear_state_bit(tree, state, &clear_bits, 0);
961
962 merge_state(tree, state);
963 if (last_end == (u64)-1)
964 goto out;
965
966 start = last_end + 1;
967 next_node = rb_next(&state->rb_node);
968 if (next_node && start < end && prealloc && !need_resched()) {
969 state = rb_entry(next_node, struct extent_state,
970 rb_node);
971 if (state->start == start)
972 goto hit_next;
973 }
974 goto search_again;
975 }
976
977 /*
978 * | ---- desired range ---- |
979 * | state |
980 * or
981 * | ------------- state -------------- |
982 *
983 * We need to split the extent we found, and may flip bits on
984 * second half.
985 *
986 * If the extent we found extends past our
987 * range, we just split and search again. It'll get split
988 * again the next time though.
989 *
990 * If the extent we found is inside our range, we set the
991 * desired bit on it.
992 */
993 if (state->start < start) {
994 prealloc = alloc_extent_state_atomic(prealloc);
 995 if (!prealloc) { err = -ENOMEM; goto out; }
997 err = split_state(tree, state, prealloc, start);
998 BUG_ON(err == -EEXIST);
999 prealloc = NULL;
1000 if (err)
1001 goto out;
1002 if (state->end <= end) {
1003 set_state_bits(tree, state, &bits);
1004 clear_state_bit(tree, state, &clear_bits, 0);
1005 merge_state(tree, state);
1006 if (last_end == (u64)-1)
1007 goto out;
1008 start = last_end + 1;
1009 }
1010 goto search_again;
1011 }
1012 /*
1013 * | ---- desired range ---- |
1014 * | state | or | state |
1015 *
1016 * There's a hole, we need to insert something in it and
1017 * ignore the extent we found.
1018 */
1019 if (state->start > start) {
1020 u64 this_end;
1021 if (end < last_start)
1022 this_end = end;
1023 else
1024 this_end = last_start - 1;
1025
1026 prealloc = alloc_extent_state_atomic(prealloc);
1027 if (!prealloc) { err = -ENOMEM; goto out; }
1029
1030 /*
1031 * Avoid freeing 'prealloc' if it can be merged with
1032 * the later extent.
1033 */
1034 err = insert_state(tree, prealloc, start, this_end,
1035 &bits);
1036 BUG_ON(err == -EEXIST);
1037 if (err) {
1038 free_extent_state(prealloc);
1039 prealloc = NULL;
1040 goto out;
1041 }
1042 prealloc = NULL;
1043 start = this_end + 1;
1044 goto search_again;
1045 }
1046 /*
1047 * | ---- desired range ---- |
1048 * | state |
1049 * We need to split the extent, and set the bit
1050 * on the first half
1051 */
1052 if (state->start <= end && state->end > end) {
1053 prealloc = alloc_extent_state_atomic(prealloc);
1054 if (!prealloc) { err = -ENOMEM; goto out; }
1056
1057 err = split_state(tree, state, prealloc, end + 1);
1058 BUG_ON(err == -EEXIST);
1059
1060 set_state_bits(tree, prealloc, &bits);
1061 clear_state_bit(tree, prealloc, &clear_bits, 0);
1062
1063 merge_state(tree, prealloc);
1064 prealloc = NULL;
1065 goto out;
1066 }
1067
1068 goto search_again;
1069
1070out:
1071 spin_unlock(&tree->lock);
1072 if (prealloc)
1073 free_extent_state(prealloc);
1074
1075 return err;
1076
1077search_again:
1078 if (start > end)
1079 goto out;
1080 spin_unlock(&tree->lock);
1081 if (mask & __GFP_WAIT)
1082 cond_resched();
1083 goto again;
1084}
1085
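As a toy of what convert_extent_bit() does to states that fall fully inside the range: it deliberately omits the split, merge and preallocation machinery that makes up most of the real function, and the bit names here are illustrative only.

#include <stdio.h>
#include <stdint.h>

struct state { uint64_t start, end; unsigned bits; };

/* set set_bits and drop clear_bits on every state touching [start, end] */
static void convert_range(struct state *s, int n, uint64_t start, uint64_t end,
			  unsigned set_bits, unsigned clear_bits)
{
	for (int i = 0; i < n; i++) {
		if (s[i].end < start || s[i].start > end)
			continue;               /* outside the range */
		s[i].bits |= set_bits;          /* like set_state_bits() */
		s[i].bits &= ~clear_bits;       /* like clear_state_bit() */
	}
}

int main(void)
{
	enum { DELALLOC = 1 << 0, DIRTY = 1 << 1 };
	struct state st[] = { { 0, 4095, DELALLOC }, { 4096, 8191, DELALLOC } };

	/* the mergeable case the comment above describes: DELALLOC -> DIRTY */
	convert_range(st, 2, 0, 4095, DIRTY, DELALLOC);
	printf("st[0].bits=%u st[1].bits=%u\n", st[0].bits, st[1].bits);
	return 0;
}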
898/* wrappers around set/clear extent bit */ 1086/* wrappers around set/clear extent bit */
899int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, 1087int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
900 gfp_t mask) 1088 gfp_t mask)
@@ -920,7 +1108,7 @@ int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
920 struct extent_state **cached_state, gfp_t mask) 1108 struct extent_state **cached_state, gfp_t mask)
921{ 1109{
922 return set_extent_bit(tree, start, end, 1110 return set_extent_bit(tree, start, end,
923 EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE, 1111 EXTENT_DELALLOC | EXTENT_UPTODATE,
924 0, NULL, cached_state, mask); 1112 0, NULL, cached_state, mask);
925} 1113}
926 1114
@@ -2102,7 +2290,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
2102 if (tree->ops && tree->ops->readpage_io_failed_hook) 2290 if (tree->ops && tree->ops->readpage_io_failed_hook)
2103 ret = tree->ops->readpage_io_failed_hook( 2291 ret = tree->ops->readpage_io_failed_hook(
2104 bio, page, start, end, 2292 bio, page, start, end,
2105 failed_mirror, NULL); 2293 failed_mirror, state);
2106 else 2294 else
2107 ret = bio_readpage_error(bio, page, start, end, 2295 ret = bio_readpage_error(bio, page, start, end,
2108 failed_mirror, NULL); 2296 failed_mirror, NULL);
@@ -2511,6 +2699,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2511 int compressed; 2699 int compressed;
2512 int write_flags; 2700 int write_flags;
2513 unsigned long nr_written = 0; 2701 unsigned long nr_written = 0;
2702 bool fill_delalloc = true;
2514 2703
2515 if (wbc->sync_mode == WB_SYNC_ALL) 2704 if (wbc->sync_mode == WB_SYNC_ALL)
2516 write_flags = WRITE_SYNC; 2705 write_flags = WRITE_SYNC;
@@ -2520,6 +2709,9 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2520 trace___extent_writepage(page, inode, wbc); 2709 trace___extent_writepage(page, inode, wbc);
2521 2710
2522 WARN_ON(!PageLocked(page)); 2711 WARN_ON(!PageLocked(page));
2712
2713 ClearPageError(page);
2714
2523 pg_offset = i_size & (PAGE_CACHE_SIZE - 1); 2715 pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
2524 if (page->index > end_index || 2716 if (page->index > end_index ||
2525 (page->index == end_index && !pg_offset)) { 2717 (page->index == end_index && !pg_offset)) {
@@ -2541,10 +2733,13 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2541 2733
2542 set_page_extent_mapped(page); 2734 set_page_extent_mapped(page);
2543 2735
2736 if (!tree->ops || !tree->ops->fill_delalloc)
2737 fill_delalloc = false;
2738
2544 delalloc_start = start; 2739 delalloc_start = start;
2545 delalloc_end = 0; 2740 delalloc_end = 0;
2546 page_started = 0; 2741 page_started = 0;
2547 if (!epd->extent_locked) { 2742 if (!epd->extent_locked && fill_delalloc) {
2548 u64 delalloc_to_write = 0; 2743 u64 delalloc_to_write = 0;
2549 /* 2744 /*
2550 * make sure the wbc mapping index is at least updated 2745 * make sure the wbc mapping index is at least updated
@@ -2796,10 +2991,16 @@ retry:
2796 * swizzled back from swapper_space to tmpfs file 2991 * swizzled back from swapper_space to tmpfs file
2797 * mapping 2992 * mapping
2798 */ 2993 */
2799 if (tree->ops && tree->ops->write_cache_pages_lock_hook) 2994 if (tree->ops &&
2800 tree->ops->write_cache_pages_lock_hook(page); 2995 tree->ops->write_cache_pages_lock_hook) {
2801 else 2996 tree->ops->write_cache_pages_lock_hook(page,
2802 lock_page(page); 2997 data, flush_fn);
2998 } else {
2999 if (!trylock_page(page)) {
3000 flush_fn(data);
3001 lock_page(page);
3002 }
3003 }
2803 3004
2804 if (unlikely(page->mapping != mapping)) { 3005 if (unlikely(page->mapping != mapping)) {
2805 unlock_page(page); 3006 unlock_page(page);
@@ -3579,6 +3780,7 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
3579 PAGECACHE_TAG_DIRTY); 3780 PAGECACHE_TAG_DIRTY);
3580 } 3781 }
3581 spin_unlock_irq(&page->mapping->tree_lock); 3782 spin_unlock_irq(&page->mapping->tree_lock);
3783 ClearPageError(page);
3582 unlock_page(page); 3784 unlock_page(page);
3583 } 3785 }
3584 return 0; 3786 return 0;
@@ -3724,8 +3926,7 @@ int extent_buffer_uptodate(struct extent_io_tree *tree,
3724} 3926}
3725 3927
3726int read_extent_buffer_pages(struct extent_io_tree *tree, 3928int read_extent_buffer_pages(struct extent_io_tree *tree,
3727 struct extent_buffer *eb, 3929 struct extent_buffer *eb, u64 start, int wait,
3728 u64 start, int wait,
3729 get_extent_t *get_extent, int mirror_num) 3930 get_extent_t *get_extent, int mirror_num)
3730{ 3931{
3731 unsigned long i; 3932 unsigned long i;
@@ -3761,7 +3962,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
3761 num_pages = num_extent_pages(eb->start, eb->len); 3962 num_pages = num_extent_pages(eb->start, eb->len);
3762 for (i = start_i; i < num_pages; i++) { 3963 for (i = start_i; i < num_pages; i++) {
3763 page = extent_buffer_page(eb, i); 3964 page = extent_buffer_page(eb, i);
3764 if (!wait) { 3965 if (wait == WAIT_NONE) {
3765 if (!trylock_page(page)) 3966 if (!trylock_page(page))
3766 goto unlock_exit; 3967 goto unlock_exit;
3767 } else { 3968 } else {
@@ -3805,7 +4006,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
3805 if (bio) 4006 if (bio)
3806 submit_one_bio(READ, bio, mirror_num, bio_flags); 4007 submit_one_bio(READ, bio, mirror_num, bio_flags);
3807 4008
3808 if (ret || !wait) 4009 if (ret || wait != WAIT_COMPLETE)
3809 return ret; 4010 return ret;
3810 4011
3811 for (i = start_i; i < num_pages; i++) { 4012 for (i = start_i; i < num_pages; i++) {
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index a8e20b672922..feb9be0e23bc 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -17,7 +17,8 @@
17#define EXTENT_NODATASUM (1 << 10) 17#define EXTENT_NODATASUM (1 << 10)
18#define EXTENT_DO_ACCOUNTING (1 << 11) 18#define EXTENT_DO_ACCOUNTING (1 << 11)
19#define EXTENT_FIRST_DELALLOC (1 << 12) 19#define EXTENT_FIRST_DELALLOC (1 << 12)
20#define EXTENT_DAMAGED (1 << 13) 20#define EXTENT_NEED_WAIT (1 << 13)
21#define EXTENT_DAMAGED (1 << 14)
21#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) 22#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
22#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) 23#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
23 24
@@ -33,6 +34,7 @@
33#define EXTENT_BUFFER_BLOCKING 1 34#define EXTENT_BUFFER_BLOCKING 1
34#define EXTENT_BUFFER_DIRTY 2 35#define EXTENT_BUFFER_DIRTY 2
35#define EXTENT_BUFFER_CORRUPT 3 36#define EXTENT_BUFFER_CORRUPT 3
37#define EXTENT_BUFFER_READAHEAD 4 /* this got triggered by readahead */
36 38
37/* these are flags for extent_clear_unlock_delalloc */ 39/* these are flags for extent_clear_unlock_delalloc */
38#define EXTENT_CLEAR_UNLOCK_PAGE 0x1 40#define EXTENT_CLEAR_UNLOCK_PAGE 0x1
@@ -86,7 +88,8 @@ struct extent_io_ops {
86 struct extent_state *other); 88 struct extent_state *other);
87 void (*split_extent_hook)(struct inode *inode, 89 void (*split_extent_hook)(struct inode *inode,
88 struct extent_state *orig, u64 split); 90 struct extent_state *orig, u64 split);
89 int (*write_cache_pages_lock_hook)(struct page *page); 91 int (*write_cache_pages_lock_hook)(struct page *page, void *data,
92 void (*flush_fn)(void *));
90}; 93};
91 94
92struct extent_io_tree { 95struct extent_io_tree {
@@ -215,6 +218,8 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
215 gfp_t mask); 218 gfp_t mask);
216int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, 219int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
217 gfp_t mask); 220 gfp_t mask);
221int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
222 int bits, int clear_bits, gfp_t mask);
218int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, 223int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
219 struct extent_state **cached_state, gfp_t mask); 224 struct extent_state **cached_state, gfp_t mask);
220int find_first_extent_bit(struct extent_io_tree *tree, u64 start, 225int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
@@ -249,6 +254,9 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
249struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, 254struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
250 u64 start, unsigned long len); 255 u64 start, unsigned long len);
251void free_extent_buffer(struct extent_buffer *eb); 256void free_extent_buffer(struct extent_buffer *eb);
257#define WAIT_NONE 0
258#define WAIT_COMPLETE 1
259#define WAIT_PAGE_LOCK 2
252int read_extent_buffer_pages(struct extent_io_tree *tree, 260int read_extent_buffer_pages(struct extent_io_tree *tree,
253 struct extent_buffer *eb, u64 start, int wait, 261 struct extent_buffer *eb, u64 start, int wait,
254 get_extent_t *get_extent, int mirror_num); 262 get_extent_t *get_extent, int mirror_num);
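The three WAIT_* constants replace the old boolean wait flag: WAIT_NONE trylocks pages and gives up on contention, WAIT_COMPLETE locks and waits for the read to finish, and WAIT_PAGE_LOCK locks the pages without waiting for I/O completion, which is what the new readahead code wants. A trivial sketch of the dispatch, with the helper invented purely for illustration:

#include <stdio.h>

#define WAIT_NONE      0	/* trylock pages, give up if contended */
#define WAIT_COMPLETE  1	/* lock pages and wait for the read to finish */
#define WAIT_PAGE_LOCK 2	/* lock pages, completion checked later */

/* hypothetical helper naming the behaviour each constant selects */
static const char *wait_mode(int wait)
{
	switch (wait) {
	case WAIT_NONE:		return "opportunistic, may skip busy pages";
	case WAIT_COMPLETE:	return "synchronous read";
	case WAIT_PAGE_LOCK:	return "asynchronous, pages locked only";
	default:		return "invalid";
	}
}

int main(void)
{
	for (int w = WAIT_NONE; w <= WAIT_PAGE_LOCK; w++)
		printf("%d: %s\n", w, wait_mode(w));
	return 0;
}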
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index a1cb7821becd..c7fb3a4247d3 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -91,8 +91,7 @@ struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
91 struct btrfs_csum_item *item; 91 struct btrfs_csum_item *item;
92 struct extent_buffer *leaf; 92 struct extent_buffer *leaf;
93 u64 csum_offset = 0; 93 u64 csum_offset = 0;
94 u16 csum_size = 94 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
95 btrfs_super_csum_size(&root->fs_info->super_copy);
96 int csums_in_item; 95 int csums_in_item;
97 96
98 file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; 97 file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
@@ -162,8 +161,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
162 u64 item_last_offset = 0; 161 u64 item_last_offset = 0;
163 u64 disk_bytenr; 162 u64 disk_bytenr;
164 u32 diff; 163 u32 diff;
165 u16 csum_size = 164 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
166 btrfs_super_csum_size(&root->fs_info->super_copy);
167 int ret; 165 int ret;
168 struct btrfs_path *path; 166 struct btrfs_path *path;
169 struct btrfs_csum_item *item = NULL; 167 struct btrfs_csum_item *item = NULL;
@@ -290,7 +288,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
290 int ret; 288 int ret;
291 size_t size; 289 size_t size;
292 u64 csum_end; 290 u64 csum_end;
293 u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy); 291 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
294 292
295 path = btrfs_alloc_path(); 293 path = btrfs_alloc_path();
296 if (!path) 294 if (!path)
@@ -492,8 +490,7 @@ static noinline int truncate_one_csum(struct btrfs_trans_handle *trans,
492 u64 bytenr, u64 len) 490 u64 bytenr, u64 len)
493{ 491{
494 struct extent_buffer *leaf; 492 struct extent_buffer *leaf;
495 u16 csum_size = 493 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
496 btrfs_super_csum_size(&root->fs_info->super_copy);
497 u64 csum_end; 494 u64 csum_end;
498 u64 end_byte = bytenr + len; 495 u64 end_byte = bytenr + len;
499 u32 blocksize_bits = root->fs_info->sb->s_blocksize_bits; 496 u32 blocksize_bits = root->fs_info->sb->s_blocksize_bits;
@@ -549,8 +546,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
549 u64 csum_end; 546 u64 csum_end;
550 struct extent_buffer *leaf; 547 struct extent_buffer *leaf;
551 int ret; 548 int ret;
552 u16 csum_size = 549 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
553 btrfs_super_csum_size(&root->fs_info->super_copy);
554 int blocksize_bits = root->fs_info->sb->s_blocksize_bits; 550 int blocksize_bits = root->fs_info->sb->s_blocksize_bits;
555 551
556 root = root->fs_info->csum_root; 552 root = root->fs_info->csum_root;
@@ -676,8 +672,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
676 struct btrfs_sector_sum *sector_sum; 672 struct btrfs_sector_sum *sector_sum;
677 u32 nritems; 673 u32 nritems;
678 u32 ins_size; 674 u32 ins_size;
679 u16 csum_size = 675 u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
680 btrfs_super_csum_size(&root->fs_info->super_copy);
681 676
682 path = btrfs_alloc_path(); 677 path = btrfs_alloc_path();
683 if (!path) 678 if (!path)
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index a381cd22f518..f2e928289600 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1036,11 +1036,13 @@ out:
1036 * on error we return an unlocked page and the error value 1036 * on error we return an unlocked page and the error value
1037 * on success we return a locked page and 0 1037 * on success we return a locked page and 0
1038 */ 1038 */
1039static int prepare_uptodate_page(struct page *page, u64 pos) 1039static int prepare_uptodate_page(struct page *page, u64 pos,
1040 bool force_uptodate)
1040{ 1041{
1041 int ret = 0; 1042 int ret = 0;
1042 1043
1043 if ((pos & (PAGE_CACHE_SIZE - 1)) && !PageUptodate(page)) { 1044 if (((pos & (PAGE_CACHE_SIZE - 1)) || force_uptodate) &&
1045 !PageUptodate(page)) {
1044 ret = btrfs_readpage(NULL, page); 1046 ret = btrfs_readpage(NULL, page);
1045 if (ret) 1047 if (ret)
1046 return ret; 1048 return ret;
@@ -1061,12 +1063,13 @@ static int prepare_uptodate_page(struct page *page, u64 pos)
1061static noinline int prepare_pages(struct btrfs_root *root, struct file *file, 1063static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
1062 struct page **pages, size_t num_pages, 1064 struct page **pages, size_t num_pages,
1063 loff_t pos, unsigned long first_index, 1065 loff_t pos, unsigned long first_index,
1064 size_t write_bytes) 1066 size_t write_bytes, bool force_uptodate)
1065{ 1067{
1066 struct extent_state *cached_state = NULL; 1068 struct extent_state *cached_state = NULL;
1067 int i; 1069 int i;
1068 unsigned long index = pos >> PAGE_CACHE_SHIFT; 1070 unsigned long index = pos >> PAGE_CACHE_SHIFT;
1069 struct inode *inode = fdentry(file)->d_inode; 1071 struct inode *inode = fdentry(file)->d_inode;
1072 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
1070 int err = 0; 1073 int err = 0;
1071 int faili = 0; 1074 int faili = 0;
1072 u64 start_pos; 1075 u64 start_pos;
@@ -1078,7 +1081,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
1078again: 1081again:
1079 for (i = 0; i < num_pages; i++) { 1082 for (i = 0; i < num_pages; i++) {
1080 pages[i] = find_or_create_page(inode->i_mapping, index + i, 1083 pages[i] = find_or_create_page(inode->i_mapping, index + i,
1081 GFP_NOFS); 1084 mask);
1082 if (!pages[i]) { 1085 if (!pages[i]) {
1083 faili = i - 1; 1086 faili = i - 1;
1084 err = -ENOMEM; 1087 err = -ENOMEM;
@@ -1086,10 +1089,11 @@ again:
1086 } 1089 }
1087 1090
1088 if (i == 0) 1091 if (i == 0)
1089 err = prepare_uptodate_page(pages[i], pos); 1092 err = prepare_uptodate_page(pages[i], pos,
1093 force_uptodate);
1090 if (i == num_pages - 1) 1094 if (i == num_pages - 1)
1091 err = prepare_uptodate_page(pages[i], 1095 err = prepare_uptodate_page(pages[i],
1092 pos + write_bytes); 1096 pos + write_bytes, false);
1093 if (err) { 1097 if (err) {
1094 page_cache_release(pages[i]); 1098 page_cache_release(pages[i]);
1095 faili = i - 1; 1099 faili = i - 1;
@@ -1158,6 +1162,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1158 size_t num_written = 0; 1162 size_t num_written = 0;
1159 int nrptrs; 1163 int nrptrs;
1160 int ret = 0; 1164 int ret = 0;
1165 bool force_page_uptodate = false;
1161 1166
1162 nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) / 1167 nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) /
1163 PAGE_CACHE_SIZE, PAGE_CACHE_SIZE / 1168 PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
@@ -1200,7 +1205,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1200 * contents of pages from loop to loop 1205 * contents of pages from loop to loop
1201 */ 1206 */
1202 ret = prepare_pages(root, file, pages, num_pages, 1207 ret = prepare_pages(root, file, pages, num_pages,
1203 pos, first_index, write_bytes); 1208 pos, first_index, write_bytes,
1209 force_page_uptodate);
1204 if (ret) { 1210 if (ret) {
1205 btrfs_delalloc_release_space(inode, 1211 btrfs_delalloc_release_space(inode,
1206 num_pages << PAGE_CACHE_SHIFT); 1212 num_pages << PAGE_CACHE_SHIFT);
@@ -1217,12 +1223,15 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
1217 if (copied < write_bytes) 1223 if (copied < write_bytes)
1218 nrptrs = 1; 1224 nrptrs = 1;
1219 1225
1220 if (copied == 0) 1226 if (copied == 0) {
1227 force_page_uptodate = true;
1221 dirty_pages = 0; 1228 dirty_pages = 0;
1222 else 1229 } else {
1230 force_page_uptodate = false;
1223 dirty_pages = (copied + offset + 1231 dirty_pages = (copied + offset +
1224 PAGE_CACHE_SIZE - 1) >> 1232 PAGE_CACHE_SIZE - 1) >>
1225 PAGE_CACHE_SHIFT; 1233 PAGE_CACHE_SHIFT;
1234 }
1226 1235
1227 /* 1236 /*
1228 * If we had a short copy we need to release the excess delalloc 1237
@@ -1607,10 +1616,6 @@ static long btrfs_fallocate(struct file *file, int mode,
1607 goto out; 1616 goto out;
1608 } 1617 }
1609 1618
1610 ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
1611 if (ret)
1612 goto out;
1613
1614 locked_end = alloc_end - 1; 1619 locked_end = alloc_end - 1;
1615 while (1) { 1620 while (1) {
1616 struct btrfs_ordered_extent *ordered; 1621 struct btrfs_ordered_extent *ordered;
@@ -1656,11 +1661,27 @@ static long btrfs_fallocate(struct file *file, int mode,
1656 if (em->block_start == EXTENT_MAP_HOLE || 1661 if (em->block_start == EXTENT_MAP_HOLE ||
1657 (cur_offset >= inode->i_size && 1662 (cur_offset >= inode->i_size &&
1658 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { 1663 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
1664
1665 /*
1666 * Make sure we have enough space before we do the
1667 * allocation.
1668 */
1669 ret = btrfs_check_data_free_space(inode, last_byte -
1670 cur_offset);
1671 if (ret) {
1672 free_extent_map(em);
1673 break;
1674 }
1675
1659 ret = btrfs_prealloc_file_range(inode, mode, cur_offset, 1676 ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
1660 last_byte - cur_offset, 1677 last_byte - cur_offset,
1661 1 << inode->i_blkbits, 1678 1 << inode->i_blkbits,
1662 offset + len, 1679 offset + len,
1663 &alloc_hint); 1680 &alloc_hint);
1681
1682 /* Let go of our reservation. */
1683 btrfs_free_reserved_data_space(inode, last_byte -
1684 cur_offset);
1664 if (ret < 0) { 1685 if (ret < 0) {
1665 free_extent_map(em); 1686 free_extent_map(em);
1666 break; 1687 break;
@@ -1686,8 +1707,6 @@ static long btrfs_fallocate(struct file *file, int mode,
1686 } 1707 }
1687 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, 1708 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
1688 &cached_state, GFP_NOFS); 1709 &cached_state, GFP_NOFS);
1689
1690 btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
1691out: 1710out:
1692 mutex_unlock(&inode->i_mutex); 1711 mutex_unlock(&inode->i_mutex);
1693 return ret; 1712 return ret;
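The fallocate hunk above moves the data-space reservation from one up-front call covering the whole range to a reserve/allocate/release cycle per hole, so a mostly-allocated range no longer demands space it will not use. A sketch of that pattern; reserve(), release() and prealloc() are stand-ins for btrfs_check_data_free_space(), btrfs_free_reserved_data_space() and btrfs_prealloc_file_range():

#include <stdio.h>
#include <stdint.h>

static uint64_t pool = 1 << 20;  /* pretend free data space */

static int reserve(uint64_t n)  { if (pool < n) return -1; pool -= n; return 0; }
static void release(uint64_t n) { pool += n; }
static int prealloc(uint64_t off, uint64_t len) { (void)off; (void)len; return 0; }

int main(void)
{
	uint64_t cur = 0, end = 256 * 1024, step = 64 * 1024;

	while (cur < end) {
		/* reserve only what this hole needs, not the whole range */
		if (reserve(step))
			break;
		int ret = prealloc(cur, step);
		/* the reservation's job ends once the extent exists */
		release(step);
		if (ret < 0)
			break;
		cur += step;
	}
	printf("preallocated up to %llu\n", (unsigned long long)cur);
	return 0;
}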
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 41ac927401d0..7a15fcfb3e1f 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -20,6 +20,7 @@
20#include <linux/sched.h> 20#include <linux/sched.h>
21#include <linux/slab.h> 21#include <linux/slab.h>
22#include <linux/math64.h> 22#include <linux/math64.h>
23#include <linux/ratelimit.h>
23#include "ctree.h" 24#include "ctree.h"
24#include "free-space-cache.h" 25#include "free-space-cache.h"
25#include "transaction.h" 26#include "transaction.h"
@@ -84,6 +85,7 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root,
84 *block_group, struct btrfs_path *path) 85 *block_group, struct btrfs_path *path)
85{ 86{
86 struct inode *inode = NULL; 87 struct inode *inode = NULL;
88 u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
87 89
88 spin_lock(&block_group->lock); 90 spin_lock(&block_group->lock);
89 if (block_group->inode) 91 if (block_group->inode)
@@ -98,13 +100,14 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root,
98 return inode; 100 return inode;
99 101
100 spin_lock(&block_group->lock); 102 spin_lock(&block_group->lock);
 101 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) { 103 if ((BTRFS_I(inode)->flags & flags) != flags) {
102 printk(KERN_INFO "Old style space inode found, converting.\n"); 104 printk(KERN_INFO "Old style space inode found, converting.\n");
103 BTRFS_I(inode)->flags &= ~BTRFS_INODE_NODATASUM; 105 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM |
106 BTRFS_INODE_NODATACOW;
104 block_group->disk_cache_state = BTRFS_DC_CLEAR; 107 block_group->disk_cache_state = BTRFS_DC_CLEAR;
105 } 108 }
106 109
107 if (!btrfs_fs_closing(root->fs_info)) { 110 if (!block_group->iref) {
108 block_group->inode = igrab(inode); 111 block_group->inode = igrab(inode);
109 block_group->iref = 1; 112 block_group->iref = 1;
110 } 113 }
@@ -122,12 +125,17 @@ int __create_free_space_inode(struct btrfs_root *root,
122 struct btrfs_free_space_header *header; 125 struct btrfs_free_space_header *header;
123 struct btrfs_inode_item *inode_item; 126 struct btrfs_inode_item *inode_item;
124 struct extent_buffer *leaf; 127 struct extent_buffer *leaf;
128 u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC;
125 int ret; 129 int ret;
126 130
127 ret = btrfs_insert_empty_inode(trans, root, path, ino); 131 ret = btrfs_insert_empty_inode(trans, root, path, ino);
128 if (ret) 132 if (ret)
129 return ret; 133 return ret;
130 134
 135 /* We inline crcs for the free disk space cache */
136 if (ino != BTRFS_FREE_INO_OBJECTID)
137 flags |= BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
138
131 leaf = path->nodes[0]; 139 leaf = path->nodes[0];
132 inode_item = btrfs_item_ptr(leaf, path->slots[0], 140 inode_item = btrfs_item_ptr(leaf, path->slots[0],
133 struct btrfs_inode_item); 141 struct btrfs_inode_item);
@@ -140,8 +148,7 @@ int __create_free_space_inode(struct btrfs_root *root,
140 btrfs_set_inode_uid(leaf, inode_item, 0); 148 btrfs_set_inode_uid(leaf, inode_item, 0);
141 btrfs_set_inode_gid(leaf, inode_item, 0); 149 btrfs_set_inode_gid(leaf, inode_item, 0);
142 btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600); 150 btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600);
143 btrfs_set_inode_flags(leaf, inode_item, BTRFS_INODE_NOCOMPRESS | 151 btrfs_set_inode_flags(leaf, inode_item, flags);
144 BTRFS_INODE_PREALLOC);
145 btrfs_set_inode_nlink(leaf, inode_item, 1); 152 btrfs_set_inode_nlink(leaf, inode_item, 1);
146 btrfs_set_inode_transid(leaf, inode_item, trans->transid); 153 btrfs_set_inode_transid(leaf, inode_item, trans->transid);
147 btrfs_set_inode_block_group(leaf, inode_item, offset); 154 btrfs_set_inode_block_group(leaf, inode_item, offset);
@@ -191,16 +198,24 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
191 struct inode *inode) 198 struct inode *inode)
192{ 199{
193 struct btrfs_block_rsv *rsv; 200 struct btrfs_block_rsv *rsv;
201 u64 needed_bytes;
194 loff_t oldsize; 202 loff_t oldsize;
195 int ret = 0; 203 int ret = 0;
196 204
197 rsv = trans->block_rsv; 205 rsv = trans->block_rsv;
198 trans->block_rsv = root->orphan_block_rsv; 206 trans->block_rsv = &root->fs_info->global_block_rsv;
199 ret = btrfs_block_rsv_check(trans, root, 207
200 root->orphan_block_rsv, 208 /* 1 for slack space, 1 for updating the inode */
201 0, 5); 209 needed_bytes = btrfs_calc_trunc_metadata_size(root, 1) +
202 if (ret) 210 btrfs_calc_trans_metadata_size(root, 1);
203 return ret; 211
212 spin_lock(&trans->block_rsv->lock);
213 if (trans->block_rsv->reserved < needed_bytes) {
214 spin_unlock(&trans->block_rsv->lock);
215 trans->block_rsv = rsv;
216 return -ENOSPC;
217 }
218 spin_unlock(&trans->block_rsv->lock);
204 219
205 oldsize = i_size_read(inode); 220 oldsize = i_size_read(inode);
206 btrfs_i_size_write(inode, 0); 221 btrfs_i_size_write(inode, 0);
@@ -213,13 +228,15 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
213 ret = btrfs_truncate_inode_items(trans, root, inode, 228 ret = btrfs_truncate_inode_items(trans, root, inode,
214 0, BTRFS_EXTENT_DATA_KEY); 229 0, BTRFS_EXTENT_DATA_KEY);
215 230
216 trans->block_rsv = rsv;
217 if (ret) { 231 if (ret) {
232 trans->block_rsv = rsv;
218 WARN_ON(1); 233 WARN_ON(1);
219 return ret; 234 return ret;
220 } 235 }
221 236
222 ret = btrfs_update_inode(trans, root, inode); 237 ret = btrfs_update_inode(trans, root, inode);
238 trans->block_rsv = rsv;
239
223 return ret; 240 return ret;
224} 241}
225 242
@@ -242,26 +259,342 @@ static int readahead_cache(struct inode *inode)
242 return 0; 259 return 0;
243} 260}
244 261
262struct io_ctl {
263 void *cur, *orig;
264 struct page *page;
265 struct page **pages;
266 struct btrfs_root *root;
267 unsigned long size;
268 int index;
269 int num_pages;
270 unsigned check_crcs:1;
271};
272
273static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode,
274 struct btrfs_root *root)
275{
276 memset(io_ctl, 0, sizeof(struct io_ctl));
277 io_ctl->num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
278 PAGE_CACHE_SHIFT;
279 io_ctl->pages = kzalloc(sizeof(struct page *) * io_ctl->num_pages,
280 GFP_NOFS);
281 if (!io_ctl->pages)
282 return -ENOMEM;
283 io_ctl->root = root;
284 if (btrfs_ino(inode) != BTRFS_FREE_INO_OBJECTID)
285 io_ctl->check_crcs = 1;
286 return 0;
287}
288
289static void io_ctl_free(struct io_ctl *io_ctl)
290{
291 kfree(io_ctl->pages);
292}
293
294static void io_ctl_unmap_page(struct io_ctl *io_ctl)
295{
296 if (io_ctl->cur) {
297 kunmap(io_ctl->page);
298 io_ctl->cur = NULL;
299 io_ctl->orig = NULL;
300 }
301}
302
303static void io_ctl_map_page(struct io_ctl *io_ctl, int clear)
304{
305 WARN_ON(io_ctl->cur);
306 BUG_ON(io_ctl->index >= io_ctl->num_pages);
307 io_ctl->page = io_ctl->pages[io_ctl->index++];
308 io_ctl->cur = kmap(io_ctl->page);
309 io_ctl->orig = io_ctl->cur;
310 io_ctl->size = PAGE_CACHE_SIZE;
311 if (clear)
312 memset(io_ctl->cur, 0, PAGE_CACHE_SIZE);
313}
314
315static void io_ctl_drop_pages(struct io_ctl *io_ctl)
316{
317 int i;
318
319 io_ctl_unmap_page(io_ctl);
320
321 for (i = 0; i < io_ctl->num_pages; i++) {
322 ClearPageChecked(io_ctl->pages[i]);
323 unlock_page(io_ctl->pages[i]);
324 page_cache_release(io_ctl->pages[i]);
325 }
326}
327
328static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct inode *inode,
329 int uptodate)
330{
331 struct page *page;
332 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
333 int i;
334
335 for (i = 0; i < io_ctl->num_pages; i++) {
336 page = find_or_create_page(inode->i_mapping, i, mask);
337 if (!page) {
338 io_ctl_drop_pages(io_ctl);
339 return -ENOMEM;
340 }
341 io_ctl->pages[i] = page;
342 if (uptodate && !PageUptodate(page)) {
343 btrfs_readpage(NULL, page);
344 lock_page(page);
345 if (!PageUptodate(page)) {
346 printk(KERN_ERR "btrfs: error reading free "
347 "space cache\n");
348 io_ctl_drop_pages(io_ctl);
349 return -EIO;
350 }
351 }
352 }
353
354 return 0;
355}
356
357static void io_ctl_set_generation(struct io_ctl *io_ctl, u64 generation)
358{
359 u64 *val;
360
361 io_ctl_map_page(io_ctl, 1);
362
363 /*
364 * Skip the csum areas. If we don't check crcs then we just have a
365 * 64bit chunk at the front of the first page.
366 */
367 if (io_ctl->check_crcs) {
368 io_ctl->cur += (sizeof(u32) * io_ctl->num_pages);
369 io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages);
370 } else {
371 io_ctl->cur += sizeof(u64);
372 io_ctl->size -= sizeof(u64) * 2;
373 }
374
375 val = io_ctl->cur;
376 *val = cpu_to_le64(generation);
377 io_ctl->cur += sizeof(u64);
378}
379
380static int io_ctl_check_generation(struct io_ctl *io_ctl, u64 generation)
381{
382 u64 *gen;
383
384 /*
385 * Skip the crc area. If we don't check crcs then we just have a 64bit
386 * chunk at the front of the first page.
387 */
388 if (io_ctl->check_crcs) {
389 io_ctl->cur += sizeof(u32) * io_ctl->num_pages;
390 io_ctl->size -= sizeof(u64) +
391 (sizeof(u32) * io_ctl->num_pages);
392 } else {
393 io_ctl->cur += sizeof(u64);
394 io_ctl->size -= sizeof(u64) * 2;
395 }
396
397 gen = io_ctl->cur;
398 if (le64_to_cpu(*gen) != generation) {
399 printk_ratelimited(KERN_ERR "btrfs: space cache generation "
400 "(%Lu) does not match inode (%Lu)\n", *gen,
401 generation);
402 io_ctl_unmap_page(io_ctl);
403 return -EIO;
404 }
405 io_ctl->cur += sizeof(u64);
406 return 0;
407}
408
409static void io_ctl_set_crc(struct io_ctl *io_ctl, int index)
410{
411 u32 *tmp;
412 u32 crc = ~(u32)0;
413 unsigned offset = 0;
414
415 if (!io_ctl->check_crcs) {
416 io_ctl_unmap_page(io_ctl);
417 return;
418 }
419
420 if (index == 0)
 421 offset = sizeof(u32) * io_ctl->num_pages;
422
423 crc = btrfs_csum_data(io_ctl->root, io_ctl->orig + offset, crc,
424 PAGE_CACHE_SIZE - offset);
425 btrfs_csum_final(crc, (char *)&crc);
426 io_ctl_unmap_page(io_ctl);
427 tmp = kmap(io_ctl->pages[0]);
428 tmp += index;
429 *tmp = crc;
430 kunmap(io_ctl->pages[0]);
431}
432
433static int io_ctl_check_crc(struct io_ctl *io_ctl, int index)
434{
435 u32 *tmp, val;
436 u32 crc = ~(u32)0;
437 unsigned offset = 0;
438
439 if (!io_ctl->check_crcs) {
440 io_ctl_map_page(io_ctl, 0);
441 return 0;
442 }
443
444 if (index == 0)
445 offset = sizeof(u32) * io_ctl->num_pages;
446
447 tmp = kmap(io_ctl->pages[0]);
448 tmp += index;
449 val = *tmp;
450 kunmap(io_ctl->pages[0]);
451
452 io_ctl_map_page(io_ctl, 0);
453 crc = btrfs_csum_data(io_ctl->root, io_ctl->orig + offset, crc,
454 PAGE_CACHE_SIZE - offset);
455 btrfs_csum_final(crc, (char *)&crc);
456 if (val != crc) {
457 printk_ratelimited(KERN_ERR "btrfs: csum mismatch on free "
458 "space cache\n");
459 io_ctl_unmap_page(io_ctl);
460 return -EIO;
461 }
462
463 return 0;
464}
465
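From the skip arithmetic in io_ctl_set_generation() and io_ctl_check_generation() above, the first page of the cache begins with either a per-page array of u32 crcs followed by the u64 generation, or a u64 placeholder (the old bogus-crc slot) followed by the u64 generation. A sketch that only computes where the entries start; it is an offset illustration, not an on-disk format spec:

#include <stdio.h>
#include <stdint.h>

/* byte offset of the first free-space entry in page 0 */
static size_t entries_offset(int num_pages, int check_crcs)
{
	if (check_crcs)
		/* one u32 crc per cache page, then the u64 generation */
		return sizeof(uint32_t) * num_pages + sizeof(uint64_t);
	/* u64 placeholder (old "bogus crc") plus the u64 generation */
	return 2 * sizeof(uint64_t);
}

int main(void)
{
	printf("4 pages, crcs inlined: entries at %zu\n", entries_offset(4, 1));
	printf("no inline crcs:        entries at %zu\n", entries_offset(4, 0));
	return 0;
}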
466static int io_ctl_add_entry(struct io_ctl *io_ctl, u64 offset, u64 bytes,
467 void *bitmap)
468{
469 struct btrfs_free_space_entry *entry;
470
471 if (!io_ctl->cur)
472 return -ENOSPC;
473
474 entry = io_ctl->cur;
475 entry->offset = cpu_to_le64(offset);
476 entry->bytes = cpu_to_le64(bytes);
477 entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP :
478 BTRFS_FREE_SPACE_EXTENT;
479 io_ctl->cur += sizeof(struct btrfs_free_space_entry);
480 io_ctl->size -= sizeof(struct btrfs_free_space_entry);
481
482 if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
483 return 0;
484
485 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
486
487 /* No more pages to map */
488 if (io_ctl->index >= io_ctl->num_pages)
489 return 0;
490
491 /* map the next page */
492 io_ctl_map_page(io_ctl, 1);
493 return 0;
494}
495
496static int io_ctl_add_bitmap(struct io_ctl *io_ctl, void *bitmap)
497{
498 if (!io_ctl->cur)
499 return -ENOSPC;
500
501 /*
502 * If we aren't at the start of the current page, unmap this one and
503 * map the next one if there is any left.
504 */
505 if (io_ctl->cur != io_ctl->orig) {
506 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
507 if (io_ctl->index >= io_ctl->num_pages)
508 return -ENOSPC;
509 io_ctl_map_page(io_ctl, 0);
510 }
511
512 memcpy(io_ctl->cur, bitmap, PAGE_CACHE_SIZE);
513 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
514 if (io_ctl->index < io_ctl->num_pages)
515 io_ctl_map_page(io_ctl, 0);
516 return 0;
517}
518
519static void io_ctl_zero_remaining_pages(struct io_ctl *io_ctl)
520{
521 /*
522 * If we're not on the boundary we know we've modified the page and we
523 * need to crc the page.
524 */
525 if (io_ctl->cur != io_ctl->orig)
526 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
527 else
528 io_ctl_unmap_page(io_ctl);
529
530 while (io_ctl->index < io_ctl->num_pages) {
531 io_ctl_map_page(io_ctl, 1);
532 io_ctl_set_crc(io_ctl, io_ctl->index - 1);
533 }
534}
535
536static int io_ctl_read_entry(struct io_ctl *io_ctl,
537 struct btrfs_free_space *entry, u8 *type)
538{
539 struct btrfs_free_space_entry *e;
540
541 e = io_ctl->cur;
542 entry->offset = le64_to_cpu(e->offset);
543 entry->bytes = le64_to_cpu(e->bytes);
544 *type = e->type;
545 io_ctl->cur += sizeof(struct btrfs_free_space_entry);
546 io_ctl->size -= sizeof(struct btrfs_free_space_entry);
547
548 if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
549 return 0;
550
551 io_ctl_unmap_page(io_ctl);
552
553 if (io_ctl->index >= io_ctl->num_pages)
554 return 0;
555
556 return io_ctl_check_crc(io_ctl, io_ctl->index);
557}
558
559static int io_ctl_read_bitmap(struct io_ctl *io_ctl,
560 struct btrfs_free_space *entry)
561{
562 int ret;
563
564 if (io_ctl->cur && io_ctl->cur != io_ctl->orig)
565 io_ctl_unmap_page(io_ctl);
566
567 ret = io_ctl_check_crc(io_ctl, io_ctl->index);
568 if (ret)
569 return ret;
570
571 memcpy(entry->bitmap, io_ctl->cur, PAGE_CACHE_SIZE);
572 io_ctl_unmap_page(io_ctl);
573
574 return 0;
575}
576
245int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, 577int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
246 struct btrfs_free_space_ctl *ctl, 578 struct btrfs_free_space_ctl *ctl,
247 struct btrfs_path *path, u64 offset) 579 struct btrfs_path *path, u64 offset)
248{ 580{
249 struct btrfs_free_space_header *header; 581 struct btrfs_free_space_header *header;
250 struct extent_buffer *leaf; 582 struct extent_buffer *leaf;
251 struct page *page; 583 struct io_ctl io_ctl;
252 struct btrfs_key key; 584 struct btrfs_key key;
585 struct btrfs_free_space *e, *n;
253 struct list_head bitmaps; 586 struct list_head bitmaps;
254 u64 num_entries; 587 u64 num_entries;
255 u64 num_bitmaps; 588 u64 num_bitmaps;
256 u64 generation; 589 u64 generation;
257 pgoff_t index = 0; 590 u8 type;
258 int ret = 0; 591 int ret = 0;
259 592
260 INIT_LIST_HEAD(&bitmaps); 593 INIT_LIST_HEAD(&bitmaps);
261 594
262 /* Nothing in the space cache, goodbye */ 595 /* Nothing in the space cache, goodbye */
263 if (!i_size_read(inode)) 596 if (!i_size_read(inode))
264 goto out; 597 return 0;
265 598
266 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 599 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
267 key.offset = offset; 600 key.offset = offset;
@@ -269,11 +602,10 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
269 602
270 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 603 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
271 if (ret < 0) 604 if (ret < 0)
272 goto out; 605 return 0;
273 else if (ret > 0) { 606 else if (ret > 0) {
274 btrfs_release_path(path); 607 btrfs_release_path(path);
275 ret = 0; 608 return 0;
276 goto out;
277 } 609 }
278 610
279 ret = -1; 611 ret = -1;
@@ -291,169 +623,100 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
291 " not match free space cache generation (%llu)\n", 623 " not match free space cache generation (%llu)\n",
292 (unsigned long long)BTRFS_I(inode)->generation, 624 (unsigned long long)BTRFS_I(inode)->generation,
293 (unsigned long long)generation); 625 (unsigned long long)generation);
294 goto out; 626 return 0;
295 } 627 }
296 628
297 if (!num_entries) 629 if (!num_entries)
298 goto out; 630 return 0;
299 631
 632 ret = io_ctl_init(&io_ctl, inode, root); if (ret) return ret;
300 ret = readahead_cache(inode); 633 ret = readahead_cache(inode);
301 if (ret) 634 if (ret)
302 goto out; 635 goto out;
303 636
304 while (1) { 637 ret = io_ctl_prepare_pages(&io_ctl, inode, 1);
305 struct btrfs_free_space_entry *entry; 638 if (ret)
306 struct btrfs_free_space *e; 639 goto out;
307 void *addr;
308 unsigned long offset = 0;
309 int need_loop = 0;
310 640
311 if (!num_entries && !num_bitmaps) 641 ret = io_ctl_check_crc(&io_ctl, 0);
312 break; 642 if (ret)
643 goto free_cache;
313 644
314 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); 645 ret = io_ctl_check_generation(&io_ctl, generation);
315 if (!page) 646 if (ret)
647 goto free_cache;
648
649 while (num_entries) {
650 e = kmem_cache_zalloc(btrfs_free_space_cachep,
651 GFP_NOFS);
652 if (!e)
316 goto free_cache; 653 goto free_cache;
317 654
318 if (!PageUptodate(page)) { 655 ret = io_ctl_read_entry(&io_ctl, e, &type);
319 btrfs_readpage(NULL, page); 656 if (ret) {
320 lock_page(page); 657 kmem_cache_free(btrfs_free_space_cachep, e);
321 if (!PageUptodate(page)) { 658 goto free_cache;
322 unlock_page(page);
323 page_cache_release(page);
324 printk(KERN_ERR "btrfs: error reading free "
325 "space cache\n");
326 goto free_cache;
327 }
328 } 659 }
329 addr = kmap(page);
330 660
331 if (index == 0) { 661 if (!e->bytes) {
332 u64 *gen; 662 kmem_cache_free(btrfs_free_space_cachep, e);
663 goto free_cache;
664 }
333 665
334 /* 666 if (type == BTRFS_FREE_SPACE_EXTENT) {
335 * We put a bogus crc in the front of the first page in 667 spin_lock(&ctl->tree_lock);
336 * case old kernels try to mount a fs with the new 668 ret = link_free_space(ctl, e);
337 * format to make sure they discard the cache. 669 spin_unlock(&ctl->tree_lock);
338 */ 670 if (ret) {
339 addr += sizeof(u64); 671 printk(KERN_ERR "Duplicate entries in "
340 offset += sizeof(u64); 672 "free space cache, dumping\n");
341 673 kmem_cache_free(btrfs_free_space_cachep, e);
342 gen = addr;
343 if (*gen != BTRFS_I(inode)->generation) {
344 printk(KERN_ERR "btrfs: space cache generation"
345 " (%llu) does not match inode (%llu)\n",
346 (unsigned long long)*gen,
347 (unsigned long long)
348 BTRFS_I(inode)->generation);
349 kunmap(page);
350 unlock_page(page);
351 page_cache_release(page);
352 goto free_cache; 674 goto free_cache;
353 } 675 }
354 addr += sizeof(u64); 676 } else {
355 offset += sizeof(u64); 677 BUG_ON(!num_bitmaps);
356 } 678 num_bitmaps--;
357 entry = addr; 679 e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
358 680 if (!e->bitmap) {
359 while (1) { 681 kmem_cache_free(
360 if (!num_entries) 682 btrfs_free_space_cachep, e);
361 break;
362
363 need_loop = 1;
364 e = kmem_cache_zalloc(btrfs_free_space_cachep,
365 GFP_NOFS);
366 if (!e) {
367 kunmap(page);
368 unlock_page(page);
369 page_cache_release(page);
370 goto free_cache; 683 goto free_cache;
371 } 684 }
372 685 spin_lock(&ctl->tree_lock);
373 e->offset = le64_to_cpu(entry->offset); 686 ret = link_free_space(ctl, e);
374 e->bytes = le64_to_cpu(entry->bytes); 687 ctl->total_bitmaps++;
375 if (!e->bytes) { 688 ctl->op->recalc_thresholds(ctl);
376 kunmap(page); 689 spin_unlock(&ctl->tree_lock);
690 if (ret) {
691 printk(KERN_ERR "Duplicate entries in "
692 "free space cache, dumping\n");
377 kmem_cache_free(btrfs_free_space_cachep, e); 693 kmem_cache_free(btrfs_free_space_cachep, e);
378 unlock_page(page);
379 page_cache_release(page);
380 goto free_cache; 694 goto free_cache;
381 } 695 }
382 696 list_add_tail(&e->list, &bitmaps);
383 if (entry->type == BTRFS_FREE_SPACE_EXTENT) {
384 spin_lock(&ctl->tree_lock);
385 ret = link_free_space(ctl, e);
386 spin_unlock(&ctl->tree_lock);
387 if (ret) {
388 printk(KERN_ERR "Duplicate entries in "
389 "free space cache, dumping\n");
390 kunmap(page);
391 unlock_page(page);
392 page_cache_release(page);
393 goto free_cache;
394 }
395 } else {
396 e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
397 if (!e->bitmap) {
398 kunmap(page);
399 kmem_cache_free(
400 btrfs_free_space_cachep, e);
401 unlock_page(page);
402 page_cache_release(page);
403 goto free_cache;
404 }
405 spin_lock(&ctl->tree_lock);
406 ret = link_free_space(ctl, e);
407 ctl->total_bitmaps++;
408 ctl->op->recalc_thresholds(ctl);
409 spin_unlock(&ctl->tree_lock);
410 if (ret) {
411 printk(KERN_ERR "Duplicate entries in "
412 "free space cache, dumping\n");
413 kunmap(page);
414 unlock_page(page);
415 page_cache_release(page);
416 goto free_cache;
417 }
418 list_add_tail(&e->list, &bitmaps);
419 }
420
421 num_entries--;
422 offset += sizeof(struct btrfs_free_space_entry);
423 if (offset + sizeof(struct btrfs_free_space_entry) >=
424 PAGE_CACHE_SIZE)
425 break;
426 entry++;
427 } 697 }
428 698
429 /* 699 num_entries--;
430 * We read an entry out of this page, we need to move on to the 700 }
431 * next page.
432 */
433 if (need_loop) {
434 kunmap(page);
435 goto next;
436 }
437 701
438 /* 702 /*
439 * The bitmaps were written out after all the extent entries, 703 * The bitmaps were written out after all the extent entries,
440 * so read their payloads back last, in the queued order. 704 * so read their payloads back last, in the queued order.
441 */ 705 */
442 e = list_entry(bitmaps.next, struct btrfs_free_space, list); 706 list_for_each_entry_safe(e, n, &bitmaps, list) {
443 list_del_init(&e->list); 707 list_del_init(&e->list);
444 memcpy(e->bitmap, addr, PAGE_CACHE_SIZE); 708 ret = io_ctl_read_bitmap(&io_ctl, e);
445 kunmap(page); 709 if (ret)
446 num_bitmaps--; 710 goto free_cache;
447next:
448 unlock_page(page);
449 page_cache_release(page);
450 index++;
451 } 711 }
452 712
713 io_ctl_drop_pages(&io_ctl);
453 ret = 1; 714 ret = 1;
454out: 715out:
716 io_ctl_free(&io_ctl);
455 return ret; 717 return ret;
456free_cache: 718free_cache:
719 io_ctl_drop_pages(&io_ctl);
457 __btrfs_remove_free_space_cache(ctl); 720 __btrfs_remove_free_space_cache(ctl);
458 goto out; 721 goto out;
459} 722}
@@ -465,7 +728,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
465 struct btrfs_root *root = fs_info->tree_root; 728 struct btrfs_root *root = fs_info->tree_root;
466 struct inode *inode; 729 struct inode *inode;
467 struct btrfs_path *path; 730 struct btrfs_path *path;
468 int ret; 731 int ret = 0;
469 bool matched; 732 bool matched;
470 u64 used = btrfs_block_group_used(&block_group->item); 733 u64 used = btrfs_block_group_used(&block_group->item);
471 734
@@ -497,6 +760,14 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
497 return 0; 760 return 0;
498 } 761 }
499 762
763 /* We may have converted the inode and made the cache invalid. */
764 spin_lock(&block_group->lock);
765 if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
766 spin_unlock(&block_group->lock);
767 goto out;
768 }
769 spin_unlock(&block_group->lock);
770
500 ret = __load_free_space_cache(fs_info->tree_root, inode, ctl, 771 ret = __load_free_space_cache(fs_info->tree_root, inode, ctl,
501 path, block_group->key.objectid); 772 path, block_group->key.objectid);
502 btrfs_free_path(path); 773 btrfs_free_path(path);
@@ -530,6 +801,19 @@ out:
530 return ret; 801 return ret;
531} 802}
532 803
804/**
805 * __btrfs_write_out_cache - write out cached info to an inode
806 * @root: the root the inode belongs to
807 * @ctl: the free space cache we are going to write out
808 * @block_group: the block_group for this cache if it belongs to a block_group
809 * @trans: the trans handle
810 * @path: the path to use
811 * @offset: the offset for the key we'll insert
812 *
813 * This function writes out a free space cache struct to disk for quick recovery
814 * on mount. This will return 0 if it was successful in writing the cache out,
815 * and -1 if it was not.
816 */
533int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, 817int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
534 struct btrfs_free_space_ctl *ctl, 818 struct btrfs_free_space_ctl *ctl,
535 struct btrfs_block_group_cache *block_group, 819 struct btrfs_block_group_cache *block_group,
@@ -540,42 +824,24 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
540 struct extent_buffer *leaf; 824 struct extent_buffer *leaf;
541 struct rb_node *node; 825 struct rb_node *node;
542 struct list_head *pos, *n; 826 struct list_head *pos, *n;
543 struct page **pages;
544 struct page *page;
545 struct extent_state *cached_state = NULL; 827 struct extent_state *cached_state = NULL;
546 struct btrfs_free_cluster *cluster = NULL; 828 struct btrfs_free_cluster *cluster = NULL;
547 struct extent_io_tree *unpin = NULL; 829 struct extent_io_tree *unpin = NULL;
830 struct io_ctl io_ctl;
548 struct list_head bitmap_list; 831 struct list_head bitmap_list;
549 struct btrfs_key key; 832 struct btrfs_key key;
550 u64 start, end, len; 833 u64 start, end, len;
551 u64 bytes = 0;
552 u32 crc = ~(u32)0;
553 int index = 0, num_pages = 0;
554 int entries = 0; 834 int entries = 0;
555 int bitmaps = 0; 835 int bitmaps = 0;
556 int ret = -1; 836 int ret;
557 bool next_page = false; 837 int err = -1;
558 bool out_of_space = false;
559 838
560 INIT_LIST_HEAD(&bitmap_list); 839 INIT_LIST_HEAD(&bitmap_list);
561 840
562 node = rb_first(&ctl->free_space_offset);
563 if (!node)
564 return 0;
565
566 if (!i_size_read(inode)) 841 if (!i_size_read(inode))
567 return -1; 842 return -1;
568 843
569 num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> 844 io_ctl_init(&io_ctl, inode, root);
570 PAGE_CACHE_SHIFT;
571
572 filemap_write_and_wait(inode->i_mapping);
573 btrfs_wait_ordered_range(inode, inode->i_size &
574 ~(root->sectorsize - 1), (u64)-1);
575
576 pages = kzalloc(sizeof(struct page *) * num_pages, GFP_NOFS);
577 if (!pages)
578 return -1;
579 845
580 /* Get the cluster for this block_group if it exists */ 846 /* Get the cluster for this block_group if it exists */
581 if (block_group && !list_empty(&block_group->cluster_list)) 847 if (block_group && !list_empty(&block_group->cluster_list))
@@ -589,30 +855,9 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
589 */ 855 */
590 unpin = root->fs_info->pinned_extents; 856 unpin = root->fs_info->pinned_extents;
591 857
592 /* 858 /* Lock all pages first so we can lock the extent safely. */
593 * Lock all pages first so we can lock the extent safely. 859 io_ctl_prepare_pages(&io_ctl, inode, 0);
594 *
595 * NOTE: Because we hold the ref the entire time we're going to write to
596 * the page find_get_page should never fail, so we don't do a check
597 * after find_get_page at this point. Just putting this here so people
598 * know and don't freak out.
599 */
600 while (index < num_pages) {
601 page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
602 if (!page) {
603 int i;
604
605 for (i = 0; i < num_pages; i++) {
606 unlock_page(pages[i]);
607 page_cache_release(pages[i]);
608 }
609 goto out;
610 }
611 pages[index] = page;
612 index++;
613 }
614 860
615 index = 0;
616 lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, 861 lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
617 0, &cached_state, GFP_NOFS); 862 0, &cached_state, GFP_NOFS);
618 863
@@ -623,189 +868,111 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
623 if (block_group) 868 if (block_group)
624 start = block_group->key.objectid; 869 start = block_group->key.objectid;
625 870
626 /* Write out the extent entries */ 871 node = rb_first(&ctl->free_space_offset);
627 do { 872 if (!node && cluster) {
628 struct btrfs_free_space_entry *entry; 873 node = rb_first(&cluster->root);
629 void *addr, *orig; 874 cluster = NULL;
630 unsigned long offset = 0; 875 }
631 876
632 next_page = false; 877 /* Make sure we can fit our crcs into the first page */
878 if (io_ctl.check_crcs &&
879 (io_ctl.num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE) {
880 WARN_ON(1);
881 goto out_nospc;
882 }
633 883
634 if (index >= num_pages) { 884 io_ctl_set_generation(&io_ctl, trans->transid);
635 out_of_space = true;
636 break;
637 }
638 885
639 page = pages[index]; 886 /* Write out the extent entries */
887 while (node) {
888 struct btrfs_free_space *e;
640 889
641 orig = addr = kmap(page); 890 e = rb_entry(node, struct btrfs_free_space, offset_index);
642 if (index == 0) { 891 entries++;
643 u64 *gen;
644 892
645 /* 893 ret = io_ctl_add_entry(&io_ctl, e->offset, e->bytes,
646 * We're going to put in a bogus crc for this page to 894 e->bitmap);
647 * make sure that old kernels who aren't aware of this 895 if (ret)
648 * format will be sure to discard the cache. 896 goto out_nospc;
649 */
650 addr += sizeof(u64);
651 offset += sizeof(u64);
652 897
653 gen = addr; 898 if (e->bitmap) {
654 *gen = trans->transid; 899 list_add_tail(&e->list, &bitmap_list);
655 addr += sizeof(u64); 900 bitmaps++;
656 offset += sizeof(u64);
657 } 901 }
658 entry = addr; 902 node = rb_next(node);
659 903 if (!node && cluster) {
660 memset(addr, 0, PAGE_CACHE_SIZE - offset); 904 node = rb_first(&cluster->root);
661 while (node && !next_page) { 905 cluster = NULL;
662 struct btrfs_free_space *e;
663
664 e = rb_entry(node, struct btrfs_free_space, offset_index);
665 entries++;
666
667 entry->offset = cpu_to_le64(e->offset);
668 entry->bytes = cpu_to_le64(e->bytes);
669 if (e->bitmap) {
670 entry->type = BTRFS_FREE_SPACE_BITMAP;
671 list_add_tail(&e->list, &bitmap_list);
672 bitmaps++;
673 } else {
674 entry->type = BTRFS_FREE_SPACE_EXTENT;
675 }
676 node = rb_next(node);
677 if (!node && cluster) {
678 node = rb_first(&cluster->root);
679 cluster = NULL;
680 }
681 offset += sizeof(struct btrfs_free_space_entry);
682 if (offset + sizeof(struct btrfs_free_space_entry) >=
683 PAGE_CACHE_SIZE)
684 next_page = true;
685 entry++;
686 } 906 }
907 }
687 908
688 /* 909 /*
689 * We want to add any pinned extents to our free space cache 910 * We want to add any pinned extents to our free space cache
690 * so we don't leak the space 911 * so we don't leak the space
691 */ 912 */
692 while (block_group && !next_page && 913 while (block_group && (start < block_group->key.objectid +
693 (start < block_group->key.objectid + 914 block_group->key.offset)) {
694 block_group->key.offset)) { 915 ret = find_first_extent_bit(unpin, start, &start, &end,
695 ret = find_first_extent_bit(unpin, start, &start, &end, 916 EXTENT_DIRTY);
696 EXTENT_DIRTY); 917 if (ret) {
697 if (ret) { 918 ret = 0;
698 ret = 0; 919 break;
699 break;
700 }
701
702 /* This pinned extent is out of our range */
703 if (start >= block_group->key.objectid +
704 block_group->key.offset)
705 break;
706
707 len = block_group->key.objectid +
708 block_group->key.offset - start;
709 len = min(len, end + 1 - start);
710
711 entries++;
712 entry->offset = cpu_to_le64(start);
713 entry->bytes = cpu_to_le64(len);
714 entry->type = BTRFS_FREE_SPACE_EXTENT;
715
716 start = end + 1;
717 offset += sizeof(struct btrfs_free_space_entry);
718 if (offset + sizeof(struct btrfs_free_space_entry) >=
719 PAGE_CACHE_SIZE)
720 next_page = true;
721 entry++;
722 } 920 }
723 921
724 /* Generate bogus crc value */ 922 /* This pinned extent is out of our range */
725 if (index == 0) { 923 if (start >= block_group->key.objectid +
726 u32 *tmp; 924 block_group->key.offset)
727 crc = btrfs_csum_data(root, orig + sizeof(u64), crc, 925 break;
728 PAGE_CACHE_SIZE - sizeof(u64));
729 btrfs_csum_final(crc, (char *)&crc);
730 crc++;
731 tmp = orig;
732 *tmp = crc;
733 }
734 926
735 kunmap(page); 927 len = block_group->key.objectid +
928 block_group->key.offset - start;
929 len = min(len, end + 1 - start);
736 930
737 bytes += PAGE_CACHE_SIZE; 931 entries++;
932 ret = io_ctl_add_entry(&io_ctl, start, len, NULL);
933 if (ret)
934 goto out_nospc;
738 935
739 index++; 936 start = end + 1;
740 } while (node || next_page); 937 }
741 938
742 /* Write out the bitmaps */ 939 /* Write out the bitmaps */
743 list_for_each_safe(pos, n, &bitmap_list) { 940 list_for_each_safe(pos, n, &bitmap_list) {
744 void *addr;
745 struct btrfs_free_space *entry = 941 struct btrfs_free_space *entry =
746 list_entry(pos, struct btrfs_free_space, list); 942 list_entry(pos, struct btrfs_free_space, list);
747 943
748 if (index >= num_pages) { 944 ret = io_ctl_add_bitmap(&io_ctl, entry->bitmap);
749 out_of_space = true; 945 if (ret)
750 break; 946 goto out_nospc;
751 }
752 page = pages[index];
753
754 addr = kmap(page);
755 memcpy(addr, entry->bitmap, PAGE_CACHE_SIZE);
756 kunmap(page);
757 bytes += PAGE_CACHE_SIZE;
758
759 list_del_init(&entry->list); 947 list_del_init(&entry->list);
760 index++;
761 }
762
763 if (out_of_space) {
764 btrfs_drop_pages(pages, num_pages);
765 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
766 i_size_read(inode) - 1, &cached_state,
767 GFP_NOFS);
768 ret = 0;
769 goto out;
770 } 948 }
771 949
772 /* Zero out the rest of the pages just to make sure */ 950 /* Zero out the rest of the pages just to make sure */
773 while (index < num_pages) { 951 io_ctl_zero_remaining_pages(&io_ctl);
774 void *addr;
775
776 page = pages[index];
777 addr = kmap(page);
778 memset(addr, 0, PAGE_CACHE_SIZE);
779 kunmap(page);
780 bytes += PAGE_CACHE_SIZE;
781 index++;
782 }
783 952
784 ret = btrfs_dirty_pages(root, inode, pages, num_pages, 0, 953 ret = btrfs_dirty_pages(root, inode, io_ctl.pages, io_ctl.num_pages,
785 bytes, &cached_state); 954 0, i_size_read(inode), &cached_state);
786 btrfs_drop_pages(pages, num_pages); 955 io_ctl_drop_pages(&io_ctl);
787 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0, 956 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
788 i_size_read(inode) - 1, &cached_state, GFP_NOFS); 957 i_size_read(inode) - 1, &cached_state, GFP_NOFS);
789 958
790 if (ret) { 959 if (ret)
791 ret = 0;
792 goto out; 960 goto out;
793 }
794 961
795 BTRFS_I(inode)->generation = trans->transid;
796 962
797 filemap_write_and_wait(inode->i_mapping); 963 ret = filemap_write_and_wait(inode->i_mapping);
964 if (ret)
965 goto out;
798 966
799 key.objectid = BTRFS_FREE_SPACE_OBJECTID; 967 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
800 key.offset = offset; 968 key.offset = offset;
801 key.type = 0; 969 key.type = 0;
802 970
803 ret = btrfs_search_slot(trans, root, &key, path, 1, 1); 971 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
804 if (ret < 0) { 972 if (ret < 0) {
805 ret = -1; 973 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
806 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1, 974 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL,
807 EXTENT_DIRTY | EXTENT_DELALLOC | 975 GFP_NOFS);
808 EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS);
809 goto out; 976 goto out;
810 } 977 }
811 leaf = path->nodes[0]; 978 leaf = path->nodes[0];
@@ -816,15 +983,16 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
816 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 983 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
817 if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID || 984 if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
818 found_key.offset != offset) { 985 found_key.offset != offset) {
819 ret = -1; 986 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0,
820 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1, 987 inode->i_size - 1,
821 EXTENT_DIRTY | EXTENT_DELALLOC | 988 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0,
822 EXTENT_DO_ACCOUNTING, 0, 0, NULL, 989 NULL, GFP_NOFS);
823 GFP_NOFS);
824 btrfs_release_path(path); 990 btrfs_release_path(path);
825 goto out; 991 goto out;
826 } 992 }
827 } 993 }
994
995 BTRFS_I(inode)->generation = trans->transid;
828 header = btrfs_item_ptr(leaf, path->slots[0], 996 header = btrfs_item_ptr(leaf, path->slots[0],
829 struct btrfs_free_space_header); 997 struct btrfs_free_space_header);
830 btrfs_set_free_space_entries(leaf, header, entries); 998 btrfs_set_free_space_entries(leaf, header, entries);
@@ -833,16 +1001,26 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
833 btrfs_mark_buffer_dirty(leaf); 1001 btrfs_mark_buffer_dirty(leaf);
834 btrfs_release_path(path); 1002 btrfs_release_path(path);
835 1003
836 ret = 1; 1004 err = 0;
837
838out: 1005out:
839 kfree(pages); 1006 io_ctl_free(&io_ctl);
840 if (ret != 1) { 1007 if (err) {
841 invalidate_inode_pages2_range(inode->i_mapping, 0, index); 1008 invalidate_inode_pages2(inode->i_mapping);
842 BTRFS_I(inode)->generation = 0; 1009 BTRFS_I(inode)->generation = 0;
843 } 1010 }
844 btrfs_update_inode(trans, root, inode); 1011 btrfs_update_inode(trans, root, inode);
845 return ret; 1012 return err;
1013
1014out_nospc:
1015 list_for_each_safe(pos, n, &bitmap_list) {
1016 struct btrfs_free_space *entry =
1017 list_entry(pos, struct btrfs_free_space, list);
1018 list_del_init(&entry->list);
1019 }
1020 io_ctl_drop_pages(&io_ctl);
1021 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
1022 i_size_read(inode) - 1, &cached_state, GFP_NOFS);
1023 goto out;
846} 1024}
847 1025
848int btrfs_write_out_cache(struct btrfs_root *root, 1026int btrfs_write_out_cache(struct btrfs_root *root,
@@ -869,14 +1047,15 @@ int btrfs_write_out_cache(struct btrfs_root *root,
869 1047
870 ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans, 1048 ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans,
871 path, block_group->key.objectid); 1049 path, block_group->key.objectid);
872 if (ret < 0) { 1050 if (ret) {
873 spin_lock(&block_group->lock); 1051 spin_lock(&block_group->lock);
874 block_group->disk_cache_state = BTRFS_DC_ERROR; 1052 block_group->disk_cache_state = BTRFS_DC_ERROR;
875 spin_unlock(&block_group->lock); 1053 spin_unlock(&block_group->lock);
876 ret = 0; 1054 ret = 0;
877 1055#ifdef DEBUG
878 printk(KERN_ERR "btrfs: failed to write free space cache " 1056 printk(KERN_ERR "btrfs: failed to write free space cache "
879 "for block group %llu\n", block_group->key.objectid); 1057 "for block group %llu\n", block_group->key.objectid);
1058#endif
880 } 1059 }
881 1060
882 iput(inode); 1061 iput(inode);
@@ -1701,6 +1880,7 @@ again:
1701 ctl->total_bitmaps--; 1880 ctl->total_bitmaps--;
1702 } 1881 }
1703 kmem_cache_free(btrfs_free_space_cachep, info); 1882 kmem_cache_free(btrfs_free_space_cachep, info);
1883 ret = 0;
1704 goto out_lock; 1884 goto out_lock;
1705 } 1885 }
1706 1886
@@ -1708,7 +1888,8 @@ again:
1708 unlink_free_space(ctl, info); 1888 unlink_free_space(ctl, info);
1709 info->offset += bytes; 1889 info->offset += bytes;
1710 info->bytes -= bytes; 1890 info->bytes -= bytes;
1711 link_free_space(ctl, info); 1891 ret = link_free_space(ctl, info);
1892 WARN_ON(ret);
1712 goto out_lock; 1893 goto out_lock;
1713 } 1894 }
1714 1895
@@ -2472,9 +2653,19 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
2472 spin_unlock(&ctl->tree_lock); 2653 spin_unlock(&ctl->tree_lock);
2473 2654
2474 if (bytes >= minlen) { 2655 if (bytes >= minlen) {
2475 int update_ret; 2656 struct btrfs_space_info *space_info;
2476 update_ret = btrfs_update_reserved_bytes(block_group, 2657 int update = 0;
2477 bytes, 1, 1); 2658
2659 space_info = block_group->space_info;
2660 spin_lock(&space_info->lock);
2661 spin_lock(&block_group->lock);
2662 if (!block_group->ro) {
2663 block_group->reserved += bytes;
2664 space_info->bytes_reserved += bytes;
2665 update = 1;
2666 }
2667 spin_unlock(&block_group->lock);
2668 spin_unlock(&space_info->lock);
2478 2669
2479 ret = btrfs_error_discard_extent(fs_info->extent_root, 2670 ret = btrfs_error_discard_extent(fs_info->extent_root,
2480 start, 2671 start,
@@ -2482,9 +2673,16 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
2482 &actually_trimmed); 2673 &actually_trimmed);
2483 2674
2484 btrfs_add_free_space(block_group, start, bytes); 2675 btrfs_add_free_space(block_group, start, bytes);
2485 if (!update_ret) 2676 if (update) {
2486 btrfs_update_reserved_bytes(block_group, 2677 spin_lock(&space_info->lock);
2487 bytes, 0, 1); 2678 spin_lock(&block_group->lock);
2679 if (block_group->ro)
2680 space_info->bytes_readonly += bytes;
2681 block_group->reserved -= bytes;
2682 space_info->bytes_reserved -= bytes;
2683 spin_unlock(&space_info->lock);
2684 spin_unlock(&block_group->lock);
2685 }
2488 2686
2489 if (ret) 2687 if (ret)
2490 break; 2688 break;
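The two hunks above open-code what btrfs_update_reserved_bytes() used to do for trim: bytes are moved into the reserved counters only while the group is writable, and when they come back a group that has gone read-only in the meantime credits them to bytes_readonly instead. A standalone model of that take/return pairing (locking reduced to comments; the field updates and the space_info-outside, block_group-inside lock order follow the hunk):

	struct model_space_info  { long bytes_reserved, bytes_readonly; };
	struct model_block_group {
		struct model_space_info *space_info;
		long reserved;
		int ro;
	};

	/* Take the reservation; returns 1 only if it was actually taken. */
	static int trim_reserve(struct model_block_group *bg, long bytes)
	{
		int taken = 0;

		/* spin_lock(&space_info->lock); spin_lock(&bg->lock); */
		if (!bg->ro) {
			bg->reserved += bytes;
			bg->space_info->bytes_reserved += bytes;
			taken = 1;
		}
		/* unlock both, inner lock first */
		return taken;
	}

	/* Return it; a group that went ro keeps the bytes as readonly. */
	static void trim_unreserve(struct model_block_group *bg, long bytes)
	{
		/* spin_lock(&space_info->lock); spin_lock(&bg->lock); */
		if (bg->ro)
			bg->space_info->bytes_readonly += bytes;
		bg->reserved -= bytes;
		bg->space_info->bytes_reserved -= bytes;
		/* unlock both, inner lock first */
	}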
@@ -2643,9 +2841,13 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root,
2643 return 0; 2841 return 0;
2644 2842
2645 ret = __btrfs_write_out_cache(root, inode, ctl, NULL, trans, path, 0); 2843 ret = __btrfs_write_out_cache(root, inode, ctl, NULL, trans, path, 0);
2646 if (ret < 0) 2844 if (ret) {
2845 btrfs_delalloc_release_metadata(inode, inode->i_size);
2846#ifdef DEBUG
2647 printk(KERN_ERR "btrfs: failed to write free ino cache " 2847 printk(KERN_ERR "btrfs: failed to write free ino cache "
2648 "for root %llu\n", root->root_key.objectid); 2848 "for root %llu\n", root->root_key.objectid);
2849#endif
2850 }
2649 2851
2650 iput(inode); 2852 iput(inode);
2651 return ret; 2853 return ret;
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index b4087e0fa871..53dcbdf446cd 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -465,14 +465,16 @@ again:
465 /* Just to make sure we have enough space */ 465 /* Just to make sure we have enough space */
466 prealloc += 8 * PAGE_CACHE_SIZE; 466 prealloc += 8 * PAGE_CACHE_SIZE;
467 467
468 ret = btrfs_check_data_free_space(inode, prealloc); 468 ret = btrfs_delalloc_reserve_space(inode, prealloc);
469 if (ret) 469 if (ret)
470 goto out_put; 470 goto out_put;
471 471
472 ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc, 472 ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc,
473 prealloc, prealloc, &alloc_hint); 473 prealloc, prealloc, &alloc_hint);
474 if (ret) 474 if (ret) {
475 btrfs_delalloc_release_space(inode, prealloc);
475 goto out_put; 476 goto out_put;
477 }
476 btrfs_free_reserved_data_space(inode, prealloc); 478 btrfs_free_reserved_data_space(inode, prealloc);
477 479
478out_put: 480out_put:
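This inode-map.c hunk switches the prealloc to a combined data-plus-metadata reservation and adds the release that was missing on failure. The pairing is: reserve both halves up front; if the prealloc fails, give everything back; if it succeeds, release only the data half, because the metadata half now backs the delalloc that was created. A stub model of the flow (the stubs stand in for btrfs_delalloc_reserve_space, btrfs_delalloc_release_space and btrfs_free_reserved_data_space; none of this is kernel code):

	static int  reserve_data_and_meta(long bytes) { return 0; /* stub */ }
	static void release_data_and_meta(long bytes) { /* stub */ }
	static void release_data_only(long bytes)     { /* stub */ }
	static int  do_prealloc(long bytes)           { return 0; /* stub */ }

	static int prealloc_ino_cache(long prealloc)
	{
		int ret = reserve_data_and_meta(prealloc);

		if (ret)
			return ret;
		ret = do_prealloc(prealloc);
		if (ret) {
			/* failure: nothing was created, undo both halves */
			release_data_and_meta(prealloc);
			return ret;
		}
		/* success: the metadata half stays behind the new delalloc */
		release_data_only(prealloc);
		return 0;
	}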
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 9327f45434e8..9d0eaa57d4ee 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -393,7 +393,10 @@ again:
393 (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) { 393 (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) {
394 WARN_ON(pages); 394 WARN_ON(pages);
395 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); 395 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
396 BUG_ON(!pages); 396 if (!pages) {
397 /* just bail out to the uncompressed code */
398 goto cont;
399 }
397 400
398 if (BTRFS_I(inode)->force_compress) 401 if (BTRFS_I(inode)->force_compress)
399 compress_type = BTRFS_I(inode)->force_compress; 402 compress_type = BTRFS_I(inode)->force_compress;
@@ -424,6 +427,7 @@ again:
424 will_compress = 1; 427 will_compress = 1;
425 } 428 }
426 } 429 }
430cont:
427 if (start == 0) { 431 if (start == 0) {
428 trans = btrfs_join_transaction(root); 432 trans = btrfs_join_transaction(root);
429 BUG_ON(IS_ERR(trans)); 433 BUG_ON(IS_ERR(trans));
@@ -820,7 +824,7 @@ static noinline int cow_file_range(struct inode *inode,
820 } 824 }
821 825
822 BUG_ON(disk_num_bytes > 826 BUG_ON(disk_num_bytes >
823 btrfs_super_total_bytes(&root->fs_info->super_copy)); 827 btrfs_super_total_bytes(root->fs_info->super_copy));
824 828
825 alloc_hint = get_extent_allocation_hint(inode, start, num_bytes); 829 alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
826 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); 830 btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
@@ -1792,12 +1796,12 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1792 } 1796 }
1793 ret = 0; 1797 ret = 0;
1794out: 1798out:
1795 if (nolock) { 1799 if (root != root->fs_info->tree_root)
1796 if (trans)
1797 btrfs_end_transaction_nolock(trans, root);
1798 } else {
1799 btrfs_delalloc_release_metadata(inode, ordered_extent->len); 1800 btrfs_delalloc_release_metadata(inode, ordered_extent->len);
1800 if (trans) 1801 if (trans) {
1802 if (nolock)
1803 btrfs_end_transaction_nolock(trans, root);
1804 else
1801 btrfs_end_transaction(trans, root); 1805 btrfs_end_transaction(trans, root);
1802 } 1806 }
1803 1807
@@ -1931,89 +1935,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
1931 up_read(&root->fs_info->cleanup_work_sem); 1935 up_read(&root->fs_info->cleanup_work_sem);
1932} 1936}
1933 1937
1934/*
1935 * calculate extra metadata reservation when snapshotting a subvolume
1936 * contains orphan files.
1937 */
1938void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
1939 struct btrfs_pending_snapshot *pending,
1940 u64 *bytes_to_reserve)
1941{
1942 struct btrfs_root *root;
1943 struct btrfs_block_rsv *block_rsv;
1944 u64 num_bytes;
1945 int index;
1946
1947 root = pending->root;
1948 if (!root->orphan_block_rsv || list_empty(&root->orphan_list))
1949 return;
1950
1951 block_rsv = root->orphan_block_rsv;
1952
1953 /* orphan block reservation for the snapshot */
1954 num_bytes = block_rsv->size;
1955
1956 /*
1957 * after the snapshot is created, COWing tree blocks may use more
1958 * space than it frees. So we should make sure there is enough
1959 * reserved space.
1960 */
1961 index = trans->transid & 0x1;
1962 if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) {
1963 num_bytes += block_rsv->size -
1964 (block_rsv->reserved + block_rsv->freed[index]);
1965 }
1966
1967 *bytes_to_reserve += num_bytes;
1968}
1969
1970void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
1971 struct btrfs_pending_snapshot *pending)
1972{
1973 struct btrfs_root *root = pending->root;
1974 struct btrfs_root *snap = pending->snap;
1975 struct btrfs_block_rsv *block_rsv;
1976 u64 num_bytes;
1977 int index;
1978 int ret;
1979
1980 if (!root->orphan_block_rsv || list_empty(&root->orphan_list))
1981 return;
1982
1983 /* refill source subvolume's orphan block reservation */
1984 block_rsv = root->orphan_block_rsv;
1985 index = trans->transid & 0x1;
1986 if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) {
1987 num_bytes = block_rsv->size -
1988 (block_rsv->reserved + block_rsv->freed[index]);
1989 ret = btrfs_block_rsv_migrate(&pending->block_rsv,
1990 root->orphan_block_rsv,
1991 num_bytes);
1992 BUG_ON(ret);
1993 }
1994
1995 /* setup orphan block reservation for the snapshot */
1996 block_rsv = btrfs_alloc_block_rsv(snap);
1997 BUG_ON(!block_rsv);
1998
1999 btrfs_add_durable_block_rsv(root->fs_info, block_rsv);
2000 snap->orphan_block_rsv = block_rsv;
2001
2002 num_bytes = root->orphan_block_rsv->size;
2003 ret = btrfs_block_rsv_migrate(&pending->block_rsv,
2004 block_rsv, num_bytes);
2005 BUG_ON(ret);
2006
2007#if 0
2008 /* insert orphan item for the snapshot */
2009 WARN_ON(!root->orphan_item_inserted);
2010 ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
2011 snap->root_key.objectid);
2012 BUG_ON(ret);
2013 snap->orphan_item_inserted = 1;
2014#endif
2015}
2016
2017enum btrfs_orphan_cleanup_state { 1938enum btrfs_orphan_cleanup_state {
2018 ORPHAN_CLEANUP_STARTED = 1, 1939 ORPHAN_CLEANUP_STARTED = 1,
2019 ORPHAN_CLEANUP_DONE = 2, 1940 ORPHAN_CLEANUP_DONE = 2,
@@ -2099,9 +2020,6 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
2099 } 2020 }
2100 spin_unlock(&root->orphan_lock); 2021 spin_unlock(&root->orphan_lock);
2101 2022
2102 if (block_rsv)
2103 btrfs_add_durable_block_rsv(root->fs_info, block_rsv);
2104
2105 /* grab metadata reservation from transaction handle */ 2023 /* grab metadata reservation from transaction handle */
2106 if (reserve) { 2024 if (reserve) {
2107 ret = btrfs_orphan_reserve_metadata(trans, inode); 2025 ret = btrfs_orphan_reserve_metadata(trans, inode);
@@ -2168,6 +2086,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
2168 struct btrfs_key key, found_key; 2086 struct btrfs_key key, found_key;
2169 struct btrfs_trans_handle *trans; 2087 struct btrfs_trans_handle *trans;
2170 struct inode *inode; 2088 struct inode *inode;
2089 u64 last_objectid = 0;
2171 int ret = 0, nr_unlink = 0, nr_truncate = 0; 2090 int ret = 0, nr_unlink = 0, nr_truncate = 0;
2172 2091
2173 if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED)) 2092 if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
@@ -2219,41 +2138,49 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
2219 * crossing root thing. we store the inode number in the 2138 * crossing root thing. we store the inode number in the
2220 * offset of the orphan item. 2139 * offset of the orphan item.
2221 */ 2140 */
2141
2142 if (found_key.offset == last_objectid) {
2143 printk(KERN_ERR "btrfs: Error removing orphan entry, "
2144 "stopping orphan cleanup\n");
2145 ret = -EINVAL;
2146 goto out;
2147 }
2148
2149 last_objectid = found_key.offset;
2150
2222 found_key.objectid = found_key.offset; 2151 found_key.objectid = found_key.offset;
2223 found_key.type = BTRFS_INODE_ITEM_KEY; 2152 found_key.type = BTRFS_INODE_ITEM_KEY;
2224 found_key.offset = 0; 2153 found_key.offset = 0;
2225 inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL); 2154 inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
2226 if (IS_ERR(inode)) { 2155 ret = PTR_RET(inode);
2227 ret = PTR_ERR(inode); 2156 if (ret && ret != -ESTALE)
2228 goto out; 2157 goto out;
2229 }
2230 2158
2231 /* 2159 /*
2232 * add this inode to the orphan list so btrfs_orphan_del does 2160 * Inode is already gone but the orphan item is still there,
2233 * the proper thing when we hit it 2161 * kill the orphan item.
2234 */
2235 spin_lock(&root->orphan_lock);
2236 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
2237 spin_unlock(&root->orphan_lock);
2238
2239 /*
2240 * if this is a bad inode, means we actually succeeded in
2241 * removing the inode, but not the orphan record, which means
2242 * we need to manually delete the orphan since iput will just
2243 * do a destroy_inode
2244 */ 2162 */
2245 if (is_bad_inode(inode)) { 2163 if (ret == -ESTALE) {
2246 trans = btrfs_start_transaction(root, 0); 2164 trans = btrfs_start_transaction(root, 1);
2247 if (IS_ERR(trans)) { 2165 if (IS_ERR(trans)) {
2248 ret = PTR_ERR(trans); 2166 ret = PTR_ERR(trans);
2249 goto out; 2167 goto out;
2250 } 2168 }
2251 btrfs_orphan_del(trans, inode); 2169 ret = btrfs_del_orphan_item(trans, root,
2170 found_key.objectid);
2171 BUG_ON(ret);
2252 btrfs_end_transaction(trans, root); 2172 btrfs_end_transaction(trans, root);
2253 iput(inode);
2254 continue; 2173 continue;
2255 } 2174 }
2256 2175
2176 /*
2177 * add this inode to the orphan list so btrfs_orphan_del does
2178 * the proper thing when we hit it
2179 */
2180 spin_lock(&root->orphan_lock);
2181 list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
2182 spin_unlock(&root->orphan_lock);
2183
2257 /* if we have links, this was a truncate, let's do that */ 2184
2258 if (inode->i_nlink) { 2185 if (inode->i_nlink) {
2259 if (!S_ISREG(inode->i_mode)) { 2186 if (!S_ISREG(inode->i_mode)) {
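The last_objectid guard added in this hunk is what keeps orphan cleanup from looping forever when an orphan item cannot be removed: if the search hands back the same key twice in a row, no progress is being made and the loop errors out instead of spinning. A minimal model of the guard (a hypothetical helper, not the kernel code):

	#include <errno.h>

	/* Returns -EINVAL once the scan key stops advancing. */
	static int orphan_progress(unsigned long long *last_objectid,
				   unsigned long long cur_objectid)
	{
		if (cur_objectid == *last_objectid)
			return -EINVAL;	/* same item seen twice: bail out */
		*last_objectid = cur_objectid;
		return 0;
	}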
@@ -2687,7 +2614,16 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
2687 u64 ino = btrfs_ino(inode); 2614 u64 ino = btrfs_ino(inode);
2688 u64 dir_ino = btrfs_ino(dir); 2615 u64 dir_ino = btrfs_ino(dir);
2689 2616
2690 trans = btrfs_start_transaction(root, 10); 2617 /*
2618 * 1 for the possible orphan item
2619 * 1 for the dir item
2620 * 1 for the dir index
2621 * 1 for the inode ref
2622 * 1 for the inode ref in the tree log
2623 * 2 for the dir entries in the log
2624 * 1 for the inode
2625 */
2626 trans = btrfs_start_transaction(root, 8);
2691 if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC) 2627 if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
2692 return trans; 2628 return trans;
2693 2629
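The replacement comment enumerates exactly what a worst-case unlink may touch, and the counts sum to the units handed to btrfs_start_transaction():

	1 (orphan item) + 1 (dir item) + 1 (dir index) + 1 (inode ref)
	  + 1 (inode ref in the log) + 2 (dir entries in the log)
	  + 1 (inode) = 8

which is why the old blind reservation of 10 items could be tightened to 8.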
@@ -2710,7 +2646,8 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
2710 return ERR_PTR(-ENOMEM); 2646 return ERR_PTR(-ENOMEM);
2711 } 2647 }
2712 2648
2713 trans = btrfs_start_transaction(root, 0); 2649 /* 1 for the orphan item */
2650 trans = btrfs_start_transaction(root, 1);
2714 if (IS_ERR(trans)) { 2651 if (IS_ERR(trans)) {
2715 btrfs_free_path(path); 2652 btrfs_free_path(path);
2716 root->fs_info->enospc_unlink = 0; 2653 root->fs_info->enospc_unlink = 0;
@@ -2815,6 +2752,12 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
2815 err = 0; 2752 err = 0;
2816out: 2753out:
2817 btrfs_free_path(path); 2754 btrfs_free_path(path);
2755 /* Migrate the orphan reservation over */
2756 if (!err)
2757 err = btrfs_block_rsv_migrate(trans->block_rsv,
2758 &root->fs_info->global_block_rsv,
2759 trans->bytes_reserved);
2760
2818 if (err) { 2761 if (err) {
2819 btrfs_end_transaction(trans, root); 2762 btrfs_end_transaction(trans, root);
2820 root->fs_info->enospc_unlink = 0; 2763 root->fs_info->enospc_unlink = 0;
@@ -2829,6 +2772,9 @@ static void __unlink_end_trans(struct btrfs_trans_handle *trans,
2829 struct btrfs_root *root) 2772 struct btrfs_root *root)
2830{ 2773{
2831 if (trans->block_rsv == &root->fs_info->global_block_rsv) { 2774 if (trans->block_rsv == &root->fs_info->global_block_rsv) {
2775 btrfs_block_rsv_release(root, trans->block_rsv,
2776 trans->bytes_reserved);
2777 trans->block_rsv = &root->fs_info->trans_block_rsv;
2832 BUG_ON(!root->fs_info->enospc_unlink); 2778 BUG_ON(!root->fs_info->enospc_unlink);
2833 root->fs_info->enospc_unlink = 0; 2779 root->fs_info->enospc_unlink = 0;
2834 } 2780 }
@@ -3220,6 +3166,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
3220 pgoff_t index = from >> PAGE_CACHE_SHIFT; 3166 pgoff_t index = from >> PAGE_CACHE_SHIFT;
3221 unsigned offset = from & (PAGE_CACHE_SIZE-1); 3167 unsigned offset = from & (PAGE_CACHE_SIZE-1);
3222 struct page *page; 3168 struct page *page;
3169 gfp_t mask = btrfs_alloc_write_mask(mapping);
3223 int ret = 0; 3170 int ret = 0;
3224 u64 page_start; 3171 u64 page_start;
3225 u64 page_end; 3172 u64 page_end;
@@ -3232,7 +3179,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
3232 3179
3233 ret = -ENOMEM; 3180 ret = -ENOMEM;
3234again: 3181again:
3235 page = find_or_create_page(mapping, index, GFP_NOFS); 3182 page = find_or_create_page(mapping, index, mask);
3236 if (!page) { 3183 if (!page) {
3237 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); 3184 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
3238 goto out; 3185 goto out;
@@ -3465,6 +3412,8 @@ void btrfs_evict_inode(struct inode *inode)
3465{ 3412{
3466 struct btrfs_trans_handle *trans; 3413 struct btrfs_trans_handle *trans;
3467 struct btrfs_root *root = BTRFS_I(inode)->root; 3414 struct btrfs_root *root = BTRFS_I(inode)->root;
3415 struct btrfs_block_rsv *rsv, *global_rsv;
3416 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
3468 unsigned long nr; 3417 unsigned long nr;
3469 int ret; 3418 int ret;
3470 3419
@@ -3492,22 +3441,55 @@ void btrfs_evict_inode(struct inode *inode)
3492 goto no_delete; 3441 goto no_delete;
3493 } 3442 }
3494 3443
3444 rsv = btrfs_alloc_block_rsv(root);
3445 if (!rsv) {
3446 btrfs_orphan_del(NULL, inode);
3447 goto no_delete;
3448 }
3449 rsv->size = min_size;
3450 global_rsv = &root->fs_info->global_block_rsv;
3451
3495 btrfs_i_size_write(inode, 0); 3452 btrfs_i_size_write(inode, 0);
3496 3453
3454 /*
3455 * This is a bit simpler than btrfs_truncate since
3456 *
3457 * 1) We've already reserved our space for our orphan item in the
3458 * unlink.
3459 * 2) We're going to delete the inode item, so we don't need to update
3460 * it at all.
3461 *
3462 * So we just need to reserve some slack space in case we add bytes when
3463 * doing the truncate.
3464 */
3497 while (1) { 3465 while (1) {
3498 trans = btrfs_join_transaction(root); 3466 ret = btrfs_block_rsv_refill(root, rsv, min_size);
3499 BUG_ON(IS_ERR(trans)); 3467
3500 trans->block_rsv = root->orphan_block_rsv; 3468 /*
3469 * Try and steal from the global reserve since we will
3470 * likely not use this space anyway, we want to try as
3471 * hard as possible to get this to work.
3472 */
3473 if (ret)
3474 ret = btrfs_block_rsv_migrate(global_rsv, rsv, min_size);
3501 3475
3502 ret = btrfs_block_rsv_check(trans, root,
3503 root->orphan_block_rsv, 0, 5);
3504 if (ret) { 3476 if (ret) {
3505 BUG_ON(ret != -EAGAIN); 3477 printk(KERN_WARNING "Could not get space for a "
3506 ret = btrfs_commit_transaction(trans, root); 3478 "delete, will truncate on mount %d\n", ret);
3507 BUG_ON(ret); 3479 btrfs_orphan_del(NULL, inode);
3508 continue; 3480 btrfs_free_block_rsv(root, rsv);
3481 goto no_delete;
3482 }
3483
3484 trans = btrfs_start_transaction(root, 0);
3485 if (IS_ERR(trans)) {
3486 btrfs_orphan_del(NULL, inode);
3487 btrfs_free_block_rsv(root, rsv);
3488 goto no_delete;
3509 } 3489 }
3510 3490
3491 trans->block_rsv = rsv;
3492
3511 ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0); 3493 ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
3512 if (ret != -EAGAIN) 3494 if (ret != -EAGAIN)
3513 break; 3495 break;
@@ -3516,14 +3498,17 @@ void btrfs_evict_inode(struct inode *inode)
3516 btrfs_end_transaction(trans, root); 3498 btrfs_end_transaction(trans, root);
3517 trans = NULL; 3499 trans = NULL;
3518 btrfs_btree_balance_dirty(root, nr); 3500 btrfs_btree_balance_dirty(root, nr);
3519
3520 } 3501 }
3521 3502
3503 btrfs_free_block_rsv(root, rsv);
3504
3522 if (ret == 0) { 3505 if (ret == 0) {
3506 trans->block_rsv = root->orphan_block_rsv;
3523 ret = btrfs_orphan_del(trans, inode); 3507 ret = btrfs_orphan_del(trans, inode);
3524 BUG_ON(ret); 3508 BUG_ON(ret);
3525 } 3509 }
3526 3510
3511 trans->block_rsv = &root->fs_info->trans_block_rsv;
3527 if (!(root == root->fs_info->tree_root || 3512 if (!(root == root->fs_info->tree_root ||
3528 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)) 3513 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID))
3529 btrfs_return_ino(root, btrfs_ino(inode)); 3514 btrfs_return_ino(root, btrfs_ino(inode));
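The eviction loop now follows a refill-or-steal pattern each iteration: top up a private reservation to min_size, fall back to migrating bytes out of the global reserve, and only when both fail give up and leave the orphan item for the next mount to truncate. A stub model of that decision (refill and steal_from_global stand in for btrfs_block_rsv_refill and btrfs_block_rsv_migrate; the stub return values are made up):

	#include <errno.h>

	static int refill(long need)            { return -ENOSPC; /* stub */ }
	static int steal_from_global(long need) { return 0;       /* stub */ }

	static int get_evict_slack(long min_size)
	{
		int ret = refill(min_size);

		if (ret)	/* the normal reserve was short... */
			ret = steal_from_global(min_size);
		return ret;	/* still nonzero: caller drops to no_delete */
	}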
@@ -5647,8 +5632,7 @@ again:
5647 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) { 5632 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
5648 ret = btrfs_ordered_update_i_size(inode, 0, ordered); 5633 ret = btrfs_ordered_update_i_size(inode, 0, ordered);
5649 if (!ret) 5634 if (!ret)
5650 ret = btrfs_update_inode(trans, root, inode); 5635 err = btrfs_update_inode(trans, root, inode);
5651 err = ret;
5652 goto out; 5636 goto out;
5653 } 5637 }
5654 5638
@@ -6393,6 +6377,7 @@ static int btrfs_truncate(struct inode *inode)
6393 struct btrfs_trans_handle *trans; 6377 struct btrfs_trans_handle *trans;
6394 unsigned long nr; 6378 unsigned long nr;
6395 u64 mask = root->sectorsize - 1; 6379 u64 mask = root->sectorsize - 1;
6380 u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
6396 6381
6397 ret = btrfs_truncate_page(inode->i_mapping, inode->i_size); 6382 ret = btrfs_truncate_page(inode->i_mapping, inode->i_size);
6398 if (ret) 6383 if (ret)
@@ -6440,19 +6425,23 @@ static int btrfs_truncate(struct inode *inode)
6440 rsv = btrfs_alloc_block_rsv(root); 6425 rsv = btrfs_alloc_block_rsv(root);
6441 if (!rsv) 6426 if (!rsv)
6442 return -ENOMEM; 6427 return -ENOMEM;
6443 btrfs_add_durable_block_rsv(root->fs_info, rsv); 6428 rsv->size = min_size;
6444 6429
6430 /*
6431 * 1 for the truncate slack space
6432 * 1 for the orphan item we're going to add
6433 * 1 for the orphan item deletion
6434 * 1 for updating the inode.
6435 */
6445 trans = btrfs_start_transaction(root, 4); 6436 trans = btrfs_start_transaction(root, 4);
6446 if (IS_ERR(trans)) { 6437 if (IS_ERR(trans)) {
6447 err = PTR_ERR(trans); 6438 err = PTR_ERR(trans);
6448 goto out; 6439 goto out;
6449 } 6440 }
6450 6441
6451 /* 6442 /* Migrate the slack space for the truncate to our reserve */
6452 * Reserve space for the truncate process. Truncate should be adding 6443 ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv,
6453 * space, but if there are snapshots it may end up using space. 6444 min_size);
6454 */
6455 ret = btrfs_truncate_reserve_metadata(trans, root, rsv);
6456 BUG_ON(ret); 6445 BUG_ON(ret);
6457 6446
6458 ret = btrfs_orphan_add(trans, inode); 6447 ret = btrfs_orphan_add(trans, inode);
@@ -6461,21 +6450,6 @@ static int btrfs_truncate(struct inode *inode)
6461 goto out; 6450 goto out;
6462 } 6451 }
6463 6452
6464 nr = trans->blocks_used;
6465 btrfs_end_transaction(trans, root);
6466 btrfs_btree_balance_dirty(root, nr);
6467
6468 /*
6469 * Ok so we've already migrated our bytes over for the truncate, so here
6470 * just reserve the one slot we need for updating the inode.
6471 */
6472 trans = btrfs_start_transaction(root, 1);
6473 if (IS_ERR(trans)) {
6474 err = PTR_ERR(trans);
6475 goto out;
6476 }
6477 trans->block_rsv = rsv;
6478
6479 /* 6453 /*
6480 * setattr is responsible for setting the ordered_data_close flag, 6454 * setattr is responsible for setting the ordered_data_close flag,
6481 * but that is only tested during the last file release. That 6455 * but that is only tested during the last file release. That
@@ -6497,20 +6471,30 @@ static int btrfs_truncate(struct inode *inode)
6497 btrfs_add_ordered_operation(trans, root, inode); 6471 btrfs_add_ordered_operation(trans, root, inode);
6498 6472
6499 while (1) { 6473 while (1) {
6474 ret = btrfs_block_rsv_refill(root, rsv, min_size);
6475 if (ret) {
6476 /*
6477 * This can only happen with the original transaction we
6478 * started above, every other time we shouldn't have a
6479 * transaction started yet.
6480 */
6481 if (ret == -EAGAIN)
6482 goto end_trans;
6483 err = ret;
6484 break;
6485 }
6486
6500 if (!trans) { 6487 if (!trans) {
6501 trans = btrfs_start_transaction(root, 3); 6488 /* Just need the 1 for updating the inode */
6489 trans = btrfs_start_transaction(root, 1);
6502 if (IS_ERR(trans)) { 6490 if (IS_ERR(trans)) {
6503 err = PTR_ERR(trans); 6491 err = PTR_ERR(trans);
6504 goto out; 6492 goto out;
6505 } 6493 }
6506
6507 ret = btrfs_truncate_reserve_metadata(trans, root,
6508 rsv);
6509 BUG_ON(ret);
6510
6511 trans->block_rsv = rsv;
6512 } 6494 }
6513 6495
6496 trans->block_rsv = rsv;
6497
6514 ret = btrfs_truncate_inode_items(trans, root, inode, 6498 ret = btrfs_truncate_inode_items(trans, root, inode,
6515 inode->i_size, 6499 inode->i_size,
6516 BTRFS_EXTENT_DATA_KEY); 6500 BTRFS_EXTENT_DATA_KEY);
@@ -6525,7 +6509,7 @@ static int btrfs_truncate(struct inode *inode)
6525 err = ret; 6509 err = ret;
6526 break; 6510 break;
6527 } 6511 }
6528 6512end_trans:
6529 nr = trans->blocks_used; 6513 nr = trans->blocks_used;
6530 btrfs_end_transaction(trans, root); 6514 btrfs_end_transaction(trans, root);
6531 trans = NULL; 6515 trans = NULL;
@@ -6607,9 +6591,9 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
6607 ei->last_sub_trans = 0; 6591 ei->last_sub_trans = 0;
6608 ei->logged_trans = 0; 6592 ei->logged_trans = 0;
6609 ei->delalloc_bytes = 0; 6593 ei->delalloc_bytes = 0;
6610 ei->reserved_bytes = 0;
6611 ei->disk_i_size = 0; 6594 ei->disk_i_size = 0;
6612 ei->flags = 0; 6595 ei->flags = 0;
6596 ei->csum_bytes = 0;
6613 ei->index_cnt = (u64)-1; 6597 ei->index_cnt = (u64)-1;
6614 ei->last_unlink_trans = 0; 6598 ei->last_unlink_trans = 0;
6615 6599
@@ -6655,6 +6639,8 @@ void btrfs_destroy_inode(struct inode *inode)
6655 WARN_ON(inode->i_data.nrpages); 6639 WARN_ON(inode->i_data.nrpages);
6656 WARN_ON(BTRFS_I(inode)->outstanding_extents); 6640 WARN_ON(BTRFS_I(inode)->outstanding_extents);
6657 WARN_ON(BTRFS_I(inode)->reserved_extents); 6641 WARN_ON(BTRFS_I(inode)->reserved_extents);
6642 WARN_ON(BTRFS_I(inode)->delalloc_bytes);
6643 WARN_ON(BTRFS_I(inode)->csum_bytes);
6658 6644
6659 /* 6645 /*
6660 * This can happen where we create an inode, but somebody else also 6646 * This can happen where we create an inode, but somebody else also
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 7f57efa76d11..cc9893990341 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -118,7 +118,7 @@ void btrfs_update_iflags(struct inode *inode)
118/* 118/*
119 * Inherit flags from the parent inode. 119 * Inherit flags from the parent inode.
120 * 120 *
121 * Unlike extN we don't have any flags we don't want to inherit currently. 121 * Currently only the compression flags and the cow flags are inherited.
122 */ 122 */
123void btrfs_inherit_iflags(struct inode *inode, struct inode *dir) 123void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
124{ 124{
@@ -129,12 +129,17 @@ void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
129 129
130 flags = BTRFS_I(dir)->flags; 130 flags = BTRFS_I(dir)->flags;
131 131
132 if (S_ISREG(inode->i_mode)) 132 if (flags & BTRFS_INODE_NOCOMPRESS) {
133 flags &= ~BTRFS_INODE_DIRSYNC; 133 BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS;
134 else if (!S_ISDIR(inode->i_mode)) 134 BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
135 flags &= (BTRFS_INODE_NODUMP | BTRFS_INODE_NOATIME); 135 } else if (flags & BTRFS_INODE_COMPRESS) {
136 BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS;
137 BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS;
138 }
139
140 if (flags & BTRFS_INODE_NODATACOW)
141 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
136 142
137 BTRFS_I(inode)->flags = flags;
138 btrfs_update_iflags(inode); 143 btrfs_update_iflags(inode);
139} 144}
140 145
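The rewritten btrfs_inherit_iflags() narrows inheritance to three flags with a clear precedence: a NOCOMPRESS parent forces NOCOMPRESS on the child, otherwise a COMPRESS parent propagates COMPRESS, and NODATACOW always carries over. A self-contained model of those rules (the flag values here are invented; only the precedence mirrors the hunk):

	#define F_COMPRESS   0x1u
	#define F_NOCOMPRESS 0x2u
	#define F_NODATACOW  0x4u

	static unsigned int inherit_flags(unsigned int child, unsigned int parent)
	{
		if (parent & F_NOCOMPRESS) {		/* NOCOMPRESS wins */
			child &= ~F_COMPRESS;
			child |= F_NOCOMPRESS;
		} else if (parent & F_COMPRESS) {
			child &= ~F_NOCOMPRESS;
			child |= F_COMPRESS;
		}
		if (parent & F_NODATACOW)		/* always inherited */
			child |= F_NODATACOW;
		return child;
	}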
@@ -278,6 +283,7 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
278 struct fstrim_range range; 283 struct fstrim_range range;
279 u64 minlen = ULLONG_MAX; 284 u64 minlen = ULLONG_MAX;
280 u64 num_devices = 0; 285 u64 num_devices = 0;
286 u64 total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy);
281 int ret; 287 int ret;
282 288
283 if (!capable(CAP_SYS_ADMIN)) 289 if (!capable(CAP_SYS_ADMIN))
@@ -296,12 +302,15 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
296 } 302 }
297 } 303 }
298 rcu_read_unlock(); 304 rcu_read_unlock();
305
299 if (!num_devices) 306 if (!num_devices)
300 return -EOPNOTSUPP; 307 return -EOPNOTSUPP;
301
302 if (copy_from_user(&range, arg, sizeof(range))) 308 if (copy_from_user(&range, arg, sizeof(range)))
303 return -EFAULT; 309 return -EFAULT;
310 if (range.start > total_bytes)
311 return -EINVAL;
304 312
313 range.len = min(range.len, total_bytes - range.start);
305 range.minlen = max(range.minlen, minlen); 314 range.minlen = max(range.minlen, minlen);
306 ret = btrfs_trim_fs(root, &range); 315 ret = btrfs_trim_fs(root, &range);
307 if (ret < 0) 316 if (ret < 0)
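With the new clamp, a FITRIM request is validated against the filesystem size before being passed down. A worked example (sizes invented): on a 100 GiB filesystem, start = 120 GiB fails outright with -EINVAL, while start = 90 GiB with len = 20 GiB is accepted but truncated to

	range.len = min(20 GiB, 100 GiB - 90 GiB) = 10 GiB

so btrfs_trim_fs() never sees a range that runs past the end of the filesystem.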
@@ -761,7 +770,7 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len,
761 int ret = 1; 770 int ret = 1;
762 771
763 /* 772 /*
764 * make sure that once we start defragging and extent, we keep on 773 * make sure that once we start defragging an extent, we keep on
765 * defragging it 774 * defragging it
766 */ 775 */
767 if (start < *defrag_end) 776 if (start < *defrag_end)
@@ -806,7 +815,6 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len,
806 * extent will force at least part of that big extent to be defragged. 815 * extent will force at least part of that big extent to be defragged.
807 */ 816 */
808 if (ret) { 817 if (ret) {
809 *last_len += len;
810 *defrag_end = extent_map_end(em); 818 *defrag_end = extent_map_end(em);
811 } else { 819 } else {
812 *last_len = 0; 820 *last_len = 0;
@@ -844,6 +852,7 @@ static int cluster_pages_for_defrag(struct inode *inode,
844 int i_done; 852 int i_done;
845 struct btrfs_ordered_extent *ordered; 853 struct btrfs_ordered_extent *ordered;
846 struct extent_state *cached_state = NULL; 854 struct extent_state *cached_state = NULL;
855 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
847 856
848 if (isize == 0) 857 if (isize == 0)
849 return 0; 858 return 0;
@@ -861,7 +870,7 @@ again:
861 for (i = 0; i < num_pages; i++) { 870 for (i = 0; i < num_pages; i++) {
862 struct page *page; 871 struct page *page;
863 page = find_or_create_page(inode->i_mapping, 872 page = find_or_create_page(inode->i_mapping,
864 start_index + i, GFP_NOFS); 873 start_index + i, mask);
865 if (!page) 874 if (!page)
866 break; 875 break;
867 876
@@ -973,18 +982,20 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
973 struct btrfs_super_block *disk_super; 982 struct btrfs_super_block *disk_super;
974 struct file_ra_state *ra = NULL; 983 struct file_ra_state *ra = NULL;
975 unsigned long last_index; 984 unsigned long last_index;
985 u64 isize = i_size_read(inode);
976 u64 features; 986 u64 features;
977 u64 last_len = 0; 987 u64 last_len = 0;
978 u64 skip = 0; 988 u64 skip = 0;
979 u64 defrag_end = 0; 989 u64 defrag_end = 0;
980 u64 newer_off = range->start; 990 u64 newer_off = range->start;
981 int newer_left = 0;
982 unsigned long i; 991 unsigned long i;
992 unsigned long ra_index = 0;
983 int ret; 993 int ret;
984 int defrag_count = 0; 994 int defrag_count = 0;
985 int compress_type = BTRFS_COMPRESS_ZLIB; 995 int compress_type = BTRFS_COMPRESS_ZLIB;
986 int extent_thresh = range->extent_thresh; 996 int extent_thresh = range->extent_thresh;
987 int newer_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT; 997 int max_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT;
998 int cluster = max_cluster;
988 u64 new_align = ~((u64)128 * 1024 - 1); 999 u64 new_align = ~((u64)128 * 1024 - 1);
989 struct page **pages = NULL; 1000 struct page **pages = NULL;
990 1001
@@ -998,7 +1009,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
998 compress_type = range->compress_type; 1009 compress_type = range->compress_type;
999 } 1010 }
1000 1011
1001 if (inode->i_size == 0) 1012 if (isize == 0)
1002 return 0; 1013 return 0;
1003 1014
1004 /* 1015 /*
@@ -1014,7 +1025,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1014 ra = &file->f_ra; 1025 ra = &file->f_ra;
1015 } 1026 }
1016 1027
1017 pages = kmalloc(sizeof(struct page *) * newer_cluster, 1028 pages = kmalloc(sizeof(struct page *) * max_cluster,
1018 GFP_NOFS); 1029 GFP_NOFS);
1019 if (!pages) { 1030 if (!pages) {
1020 ret = -ENOMEM; 1031 ret = -ENOMEM;
@@ -1023,10 +1034,10 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1023 1034
1024 /* find the last page to defrag */ 1035 /* find the last page to defrag */
1025 if (range->start + range->len > range->start) { 1036 if (range->start + range->len > range->start) {
1026 last_index = min_t(u64, inode->i_size - 1, 1037 last_index = min_t(u64, isize - 1,
1027 range->start + range->len - 1) >> PAGE_CACHE_SHIFT; 1038 range->start + range->len - 1) >> PAGE_CACHE_SHIFT;
1028 } else { 1039 } else {
1029 last_index = (inode->i_size - 1) >> PAGE_CACHE_SHIFT; 1040 last_index = (isize - 1) >> PAGE_CACHE_SHIFT;
1030 } 1041 }
1031 1042
1032 if (newer_than) { 1043 if (newer_than) {
@@ -1039,16 +1050,24 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1039 * the extents in the file evenly spaced 1050 * the extents in the file evenly spaced
1040 */ 1051 */
1041 i = (newer_off & new_align) >> PAGE_CACHE_SHIFT; 1052 i = (newer_off & new_align) >> PAGE_CACHE_SHIFT;
1042 newer_left = newer_cluster;
1043 } else 1053 } else
1044 goto out_ra; 1054 goto out_ra;
1045 } else { 1055 } else {
1046 i = range->start >> PAGE_CACHE_SHIFT; 1056 i = range->start >> PAGE_CACHE_SHIFT;
1047 } 1057 }
1048 if (!max_to_defrag) 1058 if (!max_to_defrag)
1049 max_to_defrag = last_index - 1; 1059 max_to_defrag = last_index;
1060
1061 /*
1062 * make writeback start from i, so the defrag range can be
1063 * written sequentially.
1064 */
1065 if (i < inode->i_mapping->writeback_index)
1066 inode->i_mapping->writeback_index = i;
1050 1067
1051 while (i <= last_index && defrag_count < max_to_defrag) { 1068 while (i <= last_index && defrag_count < max_to_defrag &&
1069 (i < (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
1070 PAGE_CACHE_SHIFT)) {
1052 /* 1071 /*
1053 * make sure we stop running if someone unmounts 1072 * make sure we stop running if someone unmounts
1054 * the FS 1073 * the FS
@@ -1071,18 +1090,31 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1071 i = max(i + 1, next); 1090 i = max(i + 1, next);
1072 continue; 1091 continue;
1073 } 1092 }
1093
1094 if (!newer_than) {
1095 cluster = (PAGE_CACHE_ALIGN(defrag_end) >>
1096 PAGE_CACHE_SHIFT) - i;
1097 cluster = min(cluster, max_cluster);
1098 } else {
1099 cluster = max_cluster;
1100 }
1101
1074 if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) 1102 if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
1075 BTRFS_I(inode)->force_compress = compress_type; 1103 BTRFS_I(inode)->force_compress = compress_type;
1076 1104
1077 btrfs_force_ra(inode->i_mapping, ra, file, i, newer_cluster); 1105 if (i + cluster > ra_index) {
1106 ra_index = max(i, ra_index);
1107 btrfs_force_ra(inode->i_mapping, ra, file, ra_index,
1108 cluster);
1109 ra_index += max_cluster;
1110 }
1078 1111
1079 ret = cluster_pages_for_defrag(inode, pages, i, newer_cluster); 1112 ret = cluster_pages_for_defrag(inode, pages, i, cluster);
1080 if (ret < 0) 1113 if (ret < 0)
1081 goto out_ra; 1114 goto out_ra;
1082 1115
1083 defrag_count += ret; 1116 defrag_count += ret;
1084 balance_dirty_pages_ratelimited_nr(inode->i_mapping, ret); 1117 balance_dirty_pages_ratelimited_nr(inode->i_mapping, ret);
1085 i += ret;
1086 1118
1087 if (newer_than) { 1119 if (newer_than) {
1088 if (newer_off == (u64)-1) 1120 if (newer_off == (u64)-1)
@@ -1097,12 +1129,17 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1097 if (!ret) { 1129 if (!ret) {
1098 range->start = newer_off; 1130 range->start = newer_off;
1099 i = (newer_off & new_align) >> PAGE_CACHE_SHIFT; 1131 i = (newer_off & new_align) >> PAGE_CACHE_SHIFT;
1100 newer_left = newer_cluster;
1101 } else { 1132 } else {
1102 break; 1133 break;
1103 } 1134 }
1104 } else { 1135 } else {
1105 i++; 1136 if (ret > 0) {
1137 i += ret;
1138 last_len += ret << PAGE_CACHE_SHIFT;
1139 } else {
1140 i++;
1141 last_len = 0;
1142 }
1106 } 1143 }
1107 } 1144 }
1108 1145
@@ -1128,16 +1165,14 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
1128 mutex_unlock(&inode->i_mutex); 1165 mutex_unlock(&inode->i_mutex);
1129 } 1166 }
1130 1167
1131 disk_super = &root->fs_info->super_copy; 1168 disk_super = root->fs_info->super_copy;
1132 features = btrfs_super_incompat_flags(disk_super); 1169 features = btrfs_super_incompat_flags(disk_super);
1133 if (range->compress_type == BTRFS_COMPRESS_LZO) { 1170 if (range->compress_type == BTRFS_COMPRESS_LZO) {
1134 features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO; 1171 features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
1135 btrfs_set_super_incompat_flags(disk_super, features); 1172 btrfs_set_super_incompat_flags(disk_super, features);
1136 } 1173 }
1137 1174
1138 if (!file) 1175 ret = defrag_count;
1139 kfree(ra);
1140 return defrag_count;
1141 1176
1142out_ra: 1177out_ra:
1143 if (!file) 1178 if (!file)
@@ -2579,7 +2614,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
2579 return PTR_ERR(trans); 2614 return PTR_ERR(trans);
2580 } 2615 }
2581 2616
2582 dir_id = btrfs_super_root_dir(&root->fs_info->super_copy); 2617 dir_id = btrfs_super_root_dir(root->fs_info->super_copy);
2583 di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path, 2618 di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path,
2584 dir_id, "default", 7, 1); 2619 dir_id, "default", 7, 1);
2585 if (IS_ERR_OR_NULL(di)) { 2620 if (IS_ERR_OR_NULL(di)) {
@@ -2595,7 +2630,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
2595 btrfs_mark_buffer_dirty(path->nodes[0]); 2630 btrfs_mark_buffer_dirty(path->nodes[0]);
2596 btrfs_free_path(path); 2631 btrfs_free_path(path);
2597 2632
2598 disk_super = &root->fs_info->super_copy; 2633 disk_super = root->fs_info->super_copy;
2599 features = btrfs_super_incompat_flags(disk_super); 2634 features = btrfs_super_incompat_flags(disk_super);
2600 if (!(features & BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL)) { 2635 if (!(features & BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL)) {
2601 features |= BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL; 2636 features |= BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL;
@@ -2862,7 +2897,7 @@ static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
2862 int i; 2897 int i;
2863 unsigned long rel_ptr; 2898 unsigned long rel_ptr;
2864 int size; 2899 int size;
2865 struct btrfs_ioctl_ino_path_args *ipa; 2900 struct btrfs_ioctl_ino_path_args *ipa = NULL;
2866 struct inode_fs_paths *ipath = NULL; 2901 struct inode_fs_paths *ipath = NULL;
2867 struct btrfs_path *path; 2902 struct btrfs_path *path;
2868 2903
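
[Editor's note on the defrag hunks above: a worked example of the new clustering, with illustrative numbers assuming 4 KiB pages; max_cluster's value is not shown in this excerpt and is only assumed to be large enough here.]

	/*
	 * Without -o newer_than, the readahead/defrag window is clipped to
	 * the extent being defragged: e.g. with i = 100 and
	 * defrag_end = 460800 bytes, PAGE_CACHE_ALIGN(defrag_end) >>
	 * PAGE_CACHE_SHIFT = 113, so cluster = min(113 - 100, max_cluster)
	 * = 13 pages. Readahead is also re-issued only once i catches up
	 * with ra_index, rather than on every loop iteration as before.
	 */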
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index fb2605d998e9..f38e452486b8 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -158,8 +158,7 @@ static void print_extent_ref_v0(struct extent_buffer *eb, int slot)
158void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) 158void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
159{ 159{
160 int i; 160 int i;
161 u32 type; 161 u32 type, nr;
162 u32 nr = btrfs_header_nritems(l);
163 struct btrfs_item *item; 162 struct btrfs_item *item;
164 struct btrfs_root_item *ri; 163 struct btrfs_root_item *ri;
165 struct btrfs_dir_item *di; 164 struct btrfs_dir_item *di;
@@ -172,6 +171,11 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
172 struct btrfs_key key; 171 struct btrfs_key key;
173 struct btrfs_key found_key; 172 struct btrfs_key found_key;
174 173
174 if (!l)
175 return;
176
177 nr = btrfs_header_nritems(l);
178
175 printk(KERN_INFO "leaf %llu total ptrs %d free space %d\n", 179 printk(KERN_INFO "leaf %llu total ptrs %d free space %d\n",
176 (unsigned long long)btrfs_header_bytenr(l), nr, 180 (unsigned long long)btrfs_header_bytenr(l), nr,
177 btrfs_leaf_free_space(root, l)); 181 btrfs_leaf_free_space(root, l));
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
new file mode 100644
index 000000000000..cd857119ba8a
--- /dev/null
+++ b/fs/btrfs/reada.c
@@ -0,0 +1,949 @@
1/*
2 * Copyright (C) 2011 STRATO. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 16 * Boston, MA 02111-1307, USA.
17 */
18
19#include <linux/sched.h>
20#include <linux/pagemap.h>
21#include <linux/writeback.h>
22#include <linux/blkdev.h>
23#include <linux/rbtree.h>
24#include <linux/slab.h>
25#include <linux/workqueue.h>
26#include "ctree.h"
27#include "volumes.h"
28#include "disk-io.h"
29#include "transaction.h"
30
31#undef DEBUG
32
33/*
34 * This is the implementation for the generic read ahead framework.
35 *
36 * To trigger a readahead, btrfs_reada_add must be called. It will start
37 * a read ahead for the given range [start, end) on tree root. The returned
38 * handle can either be used to wait on the readahead to finish
39 * (btrfs_reada_wait), or to send it to the background (btrfs_reada_detach).
40 *
41 * The read ahead works as follows:
42 * On btrfs_reada_add, the root of the tree is inserted into a radix_tree.
43 * reada_start_machine will then search for extents to prefetch and trigger
44 * some reads. When a read finishes for a node, all contained node/leaf
45 * pointers that lie in the given range will also be enqueued. The reads will
46 * be triggered in sequential order, thus giving a big win over a naive
47 * enumeration. It will also make use of multi-device layouts. Each disk
 48 * will have its own read pointer and all disks will be utilized in parallel.
 49 * Also, no two disks will read both sides of a mirror simultaneously, as this
50 * would waste seeking capacity. Instead both disks will read different parts
51 * of the filesystem.
52 * Any number of readaheads can be started in parallel. The read order will be
53 * determined globally, i.e. 2 parallel readaheads will normally finish faster
54 * than the 2 started one after another.
55 */
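
[Editor's sketch, not part of the patch: how a caller might drive this interface, based on the btrfs_reada_add/btrfs_reada_wait/btrfs_reada_detach signatures defined later in this file; the whole-tree key range and the caller context are illustrative assumptions.]

	static void reada_example(struct btrfs_root *root)
	{
		struct reada_control *rc;
		struct btrfs_key key_start = { .objectid = 0, .type = 0, .offset = 0 };
		struct btrfs_key key_end = {
			.objectid = (u64)-1, .type = (u8)-1, .offset = (u64)-1
		};

		/* enqueue the root node; readahead fans out from there */
		rc = btrfs_reada_add(root, &key_start, &key_end);
		if (IS_ERR(rc))
			return;

		/* either block until the prefetch has finished ... */
		btrfs_reada_wait(rc);
		/*
		 * ... or hand it to the background instead:
		 *	btrfs_reada_detach(rc);
		 * each of the two drops the caller's reference, so call
		 * exactly one of them per handle
		 */
	}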
56
57#define MAX_MIRRORS 2
58#define MAX_IN_FLIGHT 6
59
60struct reada_extctl {
61 struct list_head list;
62 struct reada_control *rc;
63 u64 generation;
64};
65
66struct reada_extent {
67 u64 logical;
68 struct btrfs_key top;
69 u32 blocksize;
70 int err;
71 struct list_head extctl;
72 struct kref refcnt;
73 spinlock_t lock;
74 struct reada_zone *zones[MAX_MIRRORS];
75 int nzones;
76 struct btrfs_device *scheduled_for;
77};
78
79struct reada_zone {
80 u64 start;
81 u64 end;
82 u64 elems;
83 struct list_head list;
84 spinlock_t lock;
85 int locked;
86 struct btrfs_device *device;
87 struct btrfs_device *devs[MAX_MIRRORS]; /* full list, incl self */
88 int ndevs;
89 struct kref refcnt;
90};
91
92struct reada_machine_work {
93 struct btrfs_work work;
94 struct btrfs_fs_info *fs_info;
95};
96
97static void reada_extent_put(struct btrfs_fs_info *, struct reada_extent *);
98static void reada_control_release(struct kref *kref);
99static void reada_zone_release(struct kref *kref);
100static void reada_start_machine(struct btrfs_fs_info *fs_info);
101static void __reada_start_machine(struct btrfs_fs_info *fs_info);
102
103static int reada_add_block(struct reada_control *rc, u64 logical,
104 struct btrfs_key *top, int level, u64 generation);
105
106/* recurses */
107/* in case of err, eb might be NULL */
108static int __readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
109 u64 start, int err)
110{
111 int level = 0;
112 int nritems;
113 int i;
114 u64 bytenr;
115 u64 generation;
116 struct reada_extent *re;
117 struct btrfs_fs_info *fs_info = root->fs_info;
118 struct list_head list;
119 unsigned long index = start >> PAGE_CACHE_SHIFT;
120 struct btrfs_device *for_dev;
121
122 if (eb)
123 level = btrfs_header_level(eb);
124
125 /* find extent */
126 spin_lock(&fs_info->reada_lock);
127 re = radix_tree_lookup(&fs_info->reada_tree, index);
128 if (re)
129 kref_get(&re->refcnt);
130 spin_unlock(&fs_info->reada_lock);
131
132 if (!re)
133 return -1;
134
135 spin_lock(&re->lock);
136 /*
137 * just take the full list from the extent. afterwards we
138 * don't need the lock anymore
139 */
140 list_replace_init(&re->extctl, &list);
141 for_dev = re->scheduled_for;
142 re->scheduled_for = NULL;
143 spin_unlock(&re->lock);
144
145 if (err == 0) {
146 nritems = level ? btrfs_header_nritems(eb) : 0;
147 generation = btrfs_header_generation(eb);
148 /*
149 * FIXME: currently we just set nritems to 0 if this is a leaf,
 150	 * effectively ignoring the content. In a later step we could
 151	 * trigger more readahead depending on the content, e.g.
152 * fetch the checksums for the extents in the leaf.
153 */
154 } else {
155 /*
156 * this is the error case, the extent buffer has not been
157 * read correctly. We won't access anything from it and
 158	 * just clean up our data structures. Effectively this will
159 * cut the branch below this node from read ahead.
160 */
161 nritems = 0;
162 generation = 0;
163 }
164
165 for (i = 0; i < nritems; i++) {
166 struct reada_extctl *rec;
167 u64 n_gen;
168 struct btrfs_key key;
169 struct btrfs_key next_key;
170
171 btrfs_node_key_to_cpu(eb, &key, i);
172 if (i + 1 < nritems)
173 btrfs_node_key_to_cpu(eb, &next_key, i + 1);
174 else
175 next_key = re->top;
176 bytenr = btrfs_node_blockptr(eb, i);
177 n_gen = btrfs_node_ptr_generation(eb, i);
178
179 list_for_each_entry(rec, &list, list) {
180 struct reada_control *rc = rec->rc;
181
182 /*
183 * if the generation doesn't match, just ignore this
184 * extctl. This will probably cut off a branch from
185 * prefetch. Alternatively one could start a new (sub-)
186 * prefetch for this branch, starting again from root.
187 * FIXME: move the generation check out of this loop
188 */
189#ifdef DEBUG
190 if (rec->generation != generation) {
191 printk(KERN_DEBUG "generation mismatch for "
192 "(%llu,%d,%llu) %llu != %llu\n",
193 key.objectid, key.type, key.offset,
194 rec->generation, generation);
195 }
196#endif
197 if (rec->generation == generation &&
198 btrfs_comp_cpu_keys(&key, &rc->key_end) < 0 &&
199 btrfs_comp_cpu_keys(&next_key, &rc->key_start) > 0)
200 reada_add_block(rc, bytenr, &next_key,
201 level - 1, n_gen);
202 }
203 }
204 /*
205 * free extctl records
206 */
207 while (!list_empty(&list)) {
208 struct reada_control *rc;
209 struct reada_extctl *rec;
210
211 rec = list_first_entry(&list, struct reada_extctl, list);
212 list_del(&rec->list);
213 rc = rec->rc;
214 kfree(rec);
215
216 kref_get(&rc->refcnt);
217 if (atomic_dec_and_test(&rc->elems)) {
218 kref_put(&rc->refcnt, reada_control_release);
219 wake_up(&rc->wait);
220 }
221 kref_put(&rc->refcnt, reada_control_release);
222
223 reada_extent_put(fs_info, re); /* one ref for each entry */
224 }
225 reada_extent_put(fs_info, re); /* our ref */
226 if (for_dev)
227 atomic_dec(&for_dev->reada_in_flight);
228
229 return 0;
230}
231
232/*
 233 * start is passed separately in case eb is NULL, which may be the case with
234 * failed I/O
235 */
236int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
237 u64 start, int err)
238{
239 int ret;
240
241 ret = __readahead_hook(root, eb, start, err);
242
243 reada_start_machine(root->fs_info);
244
245 return ret;
246}
247
248static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info,
249 struct btrfs_device *dev, u64 logical,
250 struct btrfs_bio *multi)
251{
252 int ret;
253 int looped = 0;
254 struct reada_zone *zone;
255 struct btrfs_block_group_cache *cache = NULL;
256 u64 start;
257 u64 end;
258 int i;
259
260again:
261 zone = NULL;
262 spin_lock(&fs_info->reada_lock);
263 ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone,
264 logical >> PAGE_CACHE_SHIFT, 1);
265 if (ret == 1)
266 kref_get(&zone->refcnt);
267 spin_unlock(&fs_info->reada_lock);
268
269 if (ret == 1) {
270 if (logical >= zone->start && logical < zone->end)
271 return zone;
272 spin_lock(&fs_info->reada_lock);
273 kref_put(&zone->refcnt, reada_zone_release);
274 spin_unlock(&fs_info->reada_lock);
275 }
276
277 if (looped)
278 return NULL;
279
280 cache = btrfs_lookup_block_group(fs_info, logical);
281 if (!cache)
282 return NULL;
283
284 start = cache->key.objectid;
285 end = start + cache->key.offset - 1;
286 btrfs_put_block_group(cache);
287
288 zone = kzalloc(sizeof(*zone), GFP_NOFS);
289 if (!zone)
290 return NULL;
291
292 zone->start = start;
293 zone->end = end;
294 INIT_LIST_HEAD(&zone->list);
295 spin_lock_init(&zone->lock);
296 zone->locked = 0;
297 kref_init(&zone->refcnt);
298 zone->elems = 0;
299 zone->device = dev; /* our device always sits at index 0 */
300 for (i = 0; i < multi->num_stripes; ++i) {
301 /* bounds have already been checked */
302 zone->devs[i] = multi->stripes[i].dev;
303 }
304 zone->ndevs = multi->num_stripes;
305
306 spin_lock(&fs_info->reada_lock);
307 ret = radix_tree_insert(&dev->reada_zones,
308 (unsigned long)zone->end >> PAGE_CACHE_SHIFT,
309 zone);
310 spin_unlock(&fs_info->reada_lock);
311
312 if (ret) {
313 kfree(zone);
314 looped = 1;
315 goto again;
316 }
317
318 return zone;
319}
320
321static struct reada_extent *reada_find_extent(struct btrfs_root *root,
322 u64 logical,
323 struct btrfs_key *top, int level)
324{
325 int ret;
326 int looped = 0;
327 struct reada_extent *re = NULL;
328 struct btrfs_fs_info *fs_info = root->fs_info;
329 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
330 struct btrfs_bio *multi = NULL;
331 struct btrfs_device *dev;
332 u32 blocksize;
333 u64 length;
334 int nzones = 0;
335 int i;
336 unsigned long index = logical >> PAGE_CACHE_SHIFT;
337
338again:
339 spin_lock(&fs_info->reada_lock);
340 re = radix_tree_lookup(&fs_info->reada_tree, index);
341 if (re)
342 kref_get(&re->refcnt);
343 spin_unlock(&fs_info->reada_lock);
344
345 if (re || looped)
346 return re;
347
348 re = kzalloc(sizeof(*re), GFP_NOFS);
349 if (!re)
350 return NULL;
351
352 blocksize = btrfs_level_size(root, level);
353 re->logical = logical;
354 re->blocksize = blocksize;
355 re->top = *top;
356 INIT_LIST_HEAD(&re->extctl);
357 spin_lock_init(&re->lock);
358 kref_init(&re->refcnt);
359
360 /*
361 * map block
362 */
363 length = blocksize;
364 ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, &multi, 0);
365 if (ret || !multi || length < blocksize)
366 goto error;
367
368 if (multi->num_stripes > MAX_MIRRORS) {
369 printk(KERN_ERR "btrfs readahead: more than %d copies not "
370 "supported", MAX_MIRRORS);
371 goto error;
372 }
373
374 for (nzones = 0; nzones < multi->num_stripes; ++nzones) {
375 struct reada_zone *zone;
376
377 dev = multi->stripes[nzones].dev;
378 zone = reada_find_zone(fs_info, dev, logical, multi);
379 if (!zone)
380 break;
381
382 re->zones[nzones] = zone;
383 spin_lock(&zone->lock);
384 if (!zone->elems)
385 kref_get(&zone->refcnt);
386 ++zone->elems;
387 spin_unlock(&zone->lock);
388 spin_lock(&fs_info->reada_lock);
389 kref_put(&zone->refcnt, reada_zone_release);
390 spin_unlock(&fs_info->reada_lock);
391 }
392 re->nzones = nzones;
393 if (nzones == 0) {
 394		/* not a single zone found, error out */
395 goto error;
396 }
397
398 /* insert extent in reada_tree + all per-device trees, all or nothing */
399 spin_lock(&fs_info->reada_lock);
400 ret = radix_tree_insert(&fs_info->reada_tree, index, re);
401 if (ret) {
402 spin_unlock(&fs_info->reada_lock);
403 if (ret != -ENOMEM) {
404 /* someone inserted the extent in the meantime */
405 looped = 1;
406 }
407 goto error;
408 }
409 for (i = 0; i < nzones; ++i) {
410 dev = multi->stripes[i].dev;
411 ret = radix_tree_insert(&dev->reada_extents, index, re);
412 if (ret) {
413 while (--i >= 0) {
414 dev = multi->stripes[i].dev;
415 BUG_ON(dev == NULL);
416 radix_tree_delete(&dev->reada_extents, index);
417 }
418 BUG_ON(fs_info == NULL);
419 radix_tree_delete(&fs_info->reada_tree, index);
420 spin_unlock(&fs_info->reada_lock);
421 goto error;
422 }
423 }
424 spin_unlock(&fs_info->reada_lock);
425
426 return re;
427
428error:
429 while (nzones) {
430 struct reada_zone *zone;
431
432 --nzones;
433 zone = re->zones[nzones];
434 kref_get(&zone->refcnt);
435 spin_lock(&zone->lock);
436 --zone->elems;
437 if (zone->elems == 0) {
438 /*
439 * no fs_info->reada_lock needed, as this can't be
440 * the last ref
441 */
442 kref_put(&zone->refcnt, reada_zone_release);
443 }
444 spin_unlock(&zone->lock);
445
446 spin_lock(&fs_info->reada_lock);
447 kref_put(&zone->refcnt, reada_zone_release);
448 spin_unlock(&fs_info->reada_lock);
449 }
450 kfree(re);
451 if (looped)
452 goto again;
453 return NULL;
454}
455
456static void reada_kref_dummy(struct kref *kr)
457{
458}
459
460static void reada_extent_put(struct btrfs_fs_info *fs_info,
461 struct reada_extent *re)
462{
463 int i;
464 unsigned long index = re->logical >> PAGE_CACHE_SHIFT;
465
466 spin_lock(&fs_info->reada_lock);
467 if (!kref_put(&re->refcnt, reada_kref_dummy)) {
468 spin_unlock(&fs_info->reada_lock);
469 return;
470 }
471
472 radix_tree_delete(&fs_info->reada_tree, index);
473 for (i = 0; i < re->nzones; ++i) {
474 struct reada_zone *zone = re->zones[i];
475
476 radix_tree_delete(&zone->device->reada_extents, index);
477 }
478
479 spin_unlock(&fs_info->reada_lock);
480
481 for (i = 0; i < re->nzones; ++i) {
482 struct reada_zone *zone = re->zones[i];
483
484 kref_get(&zone->refcnt);
485 spin_lock(&zone->lock);
486 --zone->elems;
487 if (zone->elems == 0) {
488 /* no fs_info->reada_lock needed, as this can't be
489 * the last ref */
490 kref_put(&zone->refcnt, reada_zone_release);
491 }
492 spin_unlock(&zone->lock);
493
494 spin_lock(&fs_info->reada_lock);
495 kref_put(&zone->refcnt, reada_zone_release);
496 spin_unlock(&fs_info->reada_lock);
497 }
498 if (re->scheduled_for)
499 atomic_dec(&re->scheduled_for->reada_in_flight);
500
501 kfree(re);
502}
503
504static void reada_zone_release(struct kref *kref)
505{
506 struct reada_zone *zone = container_of(kref, struct reada_zone, refcnt);
507
508 radix_tree_delete(&zone->device->reada_zones,
509 zone->end >> PAGE_CACHE_SHIFT);
510
511 kfree(zone);
512}
513
514static void reada_control_release(struct kref *kref)
515{
516 struct reada_control *rc = container_of(kref, struct reada_control,
517 refcnt);
518
519 kfree(rc);
520}
521
522static int reada_add_block(struct reada_control *rc, u64 logical,
523 struct btrfs_key *top, int level, u64 generation)
524{
525 struct btrfs_root *root = rc->root;
526 struct reada_extent *re;
527 struct reada_extctl *rec;
528
529 re = reada_find_extent(root, logical, top, level); /* takes one ref */
530 if (!re)
531 return -1;
532
533 rec = kzalloc(sizeof(*rec), GFP_NOFS);
534 if (!rec) {
535 reada_extent_put(root->fs_info, re);
536 return -1;
537 }
538
539 rec->rc = rc;
540 rec->generation = generation;
541 atomic_inc(&rc->elems);
542
543 spin_lock(&re->lock);
544 list_add_tail(&rec->list, &re->extctl);
545 spin_unlock(&re->lock);
546
547 /* leave the ref on the extent */
548
549 return 0;
550}
551
552/*
553 * called with fs_info->reada_lock held
554 */
555static void reada_peer_zones_set_lock(struct reada_zone *zone, int lock)
556{
557 int i;
558 unsigned long index = zone->end >> PAGE_CACHE_SHIFT;
559
560 for (i = 0; i < zone->ndevs; ++i) {
561 struct reada_zone *peer;
562 peer = radix_tree_lookup(&zone->devs[i]->reada_zones, index);
563 if (peer && peer->device != zone->device)
564 peer->locked = lock;
565 }
566}
567
568/*
569 * called with fs_info->reada_lock held
570 */
571static int reada_pick_zone(struct btrfs_device *dev)
572{
573 struct reada_zone *top_zone = NULL;
574 struct reada_zone *top_locked_zone = NULL;
575 u64 top_elems = 0;
576 u64 top_locked_elems = 0;
577 unsigned long index = 0;
578 int ret;
579
580 if (dev->reada_curr_zone) {
581 reada_peer_zones_set_lock(dev->reada_curr_zone, 0);
582 kref_put(&dev->reada_curr_zone->refcnt, reada_zone_release);
583 dev->reada_curr_zone = NULL;
584 }
585 /* pick the zone with the most elements */
586 while (1) {
587 struct reada_zone *zone;
588
589 ret = radix_tree_gang_lookup(&dev->reada_zones,
590 (void **)&zone, index, 1);
591 if (ret == 0)
592 break;
593 index = (zone->end >> PAGE_CACHE_SHIFT) + 1;
594 if (zone->locked) {
595 if (zone->elems > top_locked_elems) {
596 top_locked_elems = zone->elems;
597 top_locked_zone = zone;
598 }
599 } else {
600 if (zone->elems > top_elems) {
601 top_elems = zone->elems;
602 top_zone = zone;
603 }
604 }
605 }
606 if (top_zone)
607 dev->reada_curr_zone = top_zone;
608 else if (top_locked_zone)
609 dev->reada_curr_zone = top_locked_zone;
610 else
611 return 0;
612
613 dev->reada_next = dev->reada_curr_zone->start;
614 kref_get(&dev->reada_curr_zone->refcnt);
615 reada_peer_zones_set_lock(dev->reada_curr_zone, 1);
616
617 return 1;
618}
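
[Editor's note, not part of the patch: how the locked/unlocked preference above ties back to the mirror comment at the top of the file.]

	/*
	 * A zone is marked "locked" while a peer device is reading it (see
	 * reada_peer_zones_set_lock()). Preferring the fullest unlocked
	 * zone keeps two disks from reading both sides of the same mirror
	 * at once; a locked zone is picked only as a fallback, when every
	 * zone with elements is already being covered by a peer.
	 */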
619
620static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
621 struct btrfs_device *dev)
622{
623 struct reada_extent *re = NULL;
624 int mirror_num = 0;
625 struct extent_buffer *eb = NULL;
626 u64 logical;
627 u32 blocksize;
628 int ret;
629 int i;
630 int need_kick = 0;
631
632 spin_lock(&fs_info->reada_lock);
633 if (dev->reada_curr_zone == NULL) {
634 ret = reada_pick_zone(dev);
635 if (!ret) {
636 spin_unlock(&fs_info->reada_lock);
637 return 0;
638 }
639 }
640 /*
641 * FIXME currently we issue the reads one extent at a time. If we have
 642	 * a contiguous block of extents, we could also coalesce them or use
643 * plugging to speed things up
644 */
645 ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
646 dev->reada_next >> PAGE_CACHE_SHIFT, 1);
647 if (ret == 0 || re->logical >= dev->reada_curr_zone->end) {
648 ret = reada_pick_zone(dev);
649 if (!ret) {
650 spin_unlock(&fs_info->reada_lock);
651 return 0;
652 }
653 re = NULL;
654 ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
655 dev->reada_next >> PAGE_CACHE_SHIFT, 1);
656 }
657 if (ret == 0) {
658 spin_unlock(&fs_info->reada_lock);
659 return 0;
660 }
661 dev->reada_next = re->logical + re->blocksize;
662 kref_get(&re->refcnt);
663
664 spin_unlock(&fs_info->reada_lock);
665
666 /*
667 * find mirror num
668 */
669 for (i = 0; i < re->nzones; ++i) {
670 if (re->zones[i]->device == dev) {
671 mirror_num = i + 1;
672 break;
673 }
674 }
675 logical = re->logical;
676 blocksize = re->blocksize;
677
678 spin_lock(&re->lock);
679 if (re->scheduled_for == NULL) {
680 re->scheduled_for = dev;
681 need_kick = 1;
682 }
683 spin_unlock(&re->lock);
684
685 reada_extent_put(fs_info, re);
686
687 if (!need_kick)
688 return 0;
689
690 atomic_inc(&dev->reada_in_flight);
691 ret = reada_tree_block_flagged(fs_info->extent_root, logical, blocksize,
692 mirror_num, &eb);
693 if (ret)
694 __readahead_hook(fs_info->extent_root, NULL, logical, ret);
695 else if (eb)
696 __readahead_hook(fs_info->extent_root, eb, eb->start, ret);
697
698 if (eb)
699 free_extent_buffer(eb);
700
701 return 1;
702
703}
704
705static void reada_start_machine_worker(struct btrfs_work *work)
706{
707 struct reada_machine_work *rmw;
708 struct btrfs_fs_info *fs_info;
709
710 rmw = container_of(work, struct reada_machine_work, work);
711 fs_info = rmw->fs_info;
712
713 kfree(rmw);
714
715 __reada_start_machine(fs_info);
716}
717
718static void __reada_start_machine(struct btrfs_fs_info *fs_info)
719{
720 struct btrfs_device *device;
721 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
722 u64 enqueued;
723 u64 total = 0;
724 int i;
725
726 do {
727 enqueued = 0;
728 list_for_each_entry(device, &fs_devices->devices, dev_list) {
729 if (atomic_read(&device->reada_in_flight) <
730 MAX_IN_FLIGHT)
731 enqueued += reada_start_machine_dev(fs_info,
732 device);
733 }
734 total += enqueued;
735 } while (enqueued && total < 10000);
736
737 if (enqueued == 0)
738 return;
739
740 /*
741 * If everything is already in the cache, this is effectively single
742 * threaded. To a) not hold the caller for too long and b) to utilize
743 * more cores, we broke the loop above after 10000 iterations and now
744 * enqueue to workers to finish it. This will distribute the load to
745 * the cores.
746 */
747 for (i = 0; i < 2; ++i)
748 reada_start_machine(fs_info);
749}
750
751static void reada_start_machine(struct btrfs_fs_info *fs_info)
752{
753 struct reada_machine_work *rmw;
754
755 rmw = kzalloc(sizeof(*rmw), GFP_NOFS);
756 if (!rmw) {
757 /* FIXME we cannot handle this properly right now */
758 BUG();
759 }
760 rmw->work.func = reada_start_machine_worker;
761 rmw->fs_info = fs_info;
762
763 btrfs_queue_worker(&fs_info->readahead_workers, &rmw->work);
764}
765
766#ifdef DEBUG
767static void dump_devs(struct btrfs_fs_info *fs_info, int all)
768{
769 struct btrfs_device *device;
770 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
771 unsigned long index;
772 int ret;
773 int i;
774 int j;
775 int cnt;
776
777 spin_lock(&fs_info->reada_lock);
778 list_for_each_entry(device, &fs_devices->devices, dev_list) {
779 printk(KERN_DEBUG "dev %lld has %d in flight\n", device->devid,
780 atomic_read(&device->reada_in_flight));
781 index = 0;
782 while (1) {
783 struct reada_zone *zone;
784 ret = radix_tree_gang_lookup(&device->reada_zones,
785 (void **)&zone, index, 1);
786 if (ret == 0)
787 break;
788 printk(KERN_DEBUG " zone %llu-%llu elems %llu locked "
789 "%d devs", zone->start, zone->end, zone->elems,
790 zone->locked);
791 for (j = 0; j < zone->ndevs; ++j) {
792 printk(KERN_CONT " %lld",
793 zone->devs[j]->devid);
794 }
795 if (device->reada_curr_zone == zone)
796 printk(KERN_CONT " curr off %llu",
797 device->reada_next - zone->start);
798 printk(KERN_CONT "\n");
799 index = (zone->end >> PAGE_CACHE_SHIFT) + 1;
800 }
801 cnt = 0;
802 index = 0;
803 while (all) {
804 struct reada_extent *re = NULL;
805
806 ret = radix_tree_gang_lookup(&device->reada_extents,
807 (void **)&re, index, 1);
808 if (ret == 0)
809 break;
810 printk(KERN_DEBUG
811 " re: logical %llu size %u empty %d for %lld",
812 re->logical, re->blocksize,
813 list_empty(&re->extctl), re->scheduled_for ?
814 re->scheduled_for->devid : -1);
815
816 for (i = 0; i < re->nzones; ++i) {
817 printk(KERN_CONT " zone %llu-%llu devs",
818 re->zones[i]->start,
819 re->zones[i]->end);
820 for (j = 0; j < re->zones[i]->ndevs; ++j) {
821 printk(KERN_CONT " %lld",
822 re->zones[i]->devs[j]->devid);
823 }
824 }
825 printk(KERN_CONT "\n");
826 index = (re->logical >> PAGE_CACHE_SHIFT) + 1;
827 if (++cnt > 15)
828 break;
829 }
830 }
831
832 index = 0;
833 cnt = 0;
834 while (all) {
835 struct reada_extent *re = NULL;
836
837 ret = radix_tree_gang_lookup(&fs_info->reada_tree, (void **)&re,
838 index, 1);
839 if (ret == 0)
840 break;
841 if (!re->scheduled_for) {
842 index = (re->logical >> PAGE_CACHE_SHIFT) + 1;
843 continue;
844 }
845 printk(KERN_DEBUG
846 "re: logical %llu size %u list empty %d for %lld",
847 re->logical, re->blocksize, list_empty(&re->extctl),
848 re->scheduled_for ? re->scheduled_for->devid : -1);
 849		for (i = 0; i < re->nzones; ++i) {
 850			printk(KERN_CONT " zone %llu-%llu devs",
 851				re->zones[i]->start,
 852				re->zones[i]->end);
 853			for (j = 0; j < re->zones[i]->ndevs; ++j) {
 854				printk(KERN_CONT " %lld",
 855					re->zones[i]->devs[j]->devid);
 856			}
 857		}
863 printk(KERN_CONT "\n");
864 index = (re->logical >> PAGE_CACHE_SHIFT) + 1;
865 }
866 spin_unlock(&fs_info->reada_lock);
867}
868#endif
869
870/*
871 * interface
872 */
873struct reada_control *btrfs_reada_add(struct btrfs_root *root,
874 struct btrfs_key *key_start, struct btrfs_key *key_end)
875{
876 struct reada_control *rc;
877 u64 start;
878 u64 generation;
879 int level;
880 struct extent_buffer *node;
881 static struct btrfs_key max_key = {
882 .objectid = (u64)-1,
883 .type = (u8)-1,
884 .offset = (u64)-1
885 };
886
887 rc = kzalloc(sizeof(*rc), GFP_NOFS);
888 if (!rc)
889 return ERR_PTR(-ENOMEM);
890
891 rc->root = root;
892 rc->key_start = *key_start;
893 rc->key_end = *key_end;
894 atomic_set(&rc->elems, 0);
895 init_waitqueue_head(&rc->wait);
896 kref_init(&rc->refcnt);
897 kref_get(&rc->refcnt); /* one ref for having elements */
898
899 node = btrfs_root_node(root);
900 start = node->start;
901 level = btrfs_header_level(node);
902 generation = btrfs_header_generation(node);
903 free_extent_buffer(node);
904
905 reada_add_block(rc, start, &max_key, level, generation);
906
907 reada_start_machine(root->fs_info);
908
909 return rc;
910}
911
912#ifdef DEBUG
913int btrfs_reada_wait(void *handle)
914{
915 struct reada_control *rc = handle;
916
917 while (atomic_read(&rc->elems)) {
918 wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0,
919 5 * HZ);
920 dump_devs(rc->root->fs_info, rc->elems < 10 ? 1 : 0);
921 }
922
923 dump_devs(rc->root->fs_info, rc->elems < 10 ? 1 : 0);
924
925 kref_put(&rc->refcnt, reada_control_release);
926
927 return 0;
928}
929#else
930int btrfs_reada_wait(void *handle)
931{
932 struct reada_control *rc = handle;
933
934 while (atomic_read(&rc->elems)) {
935 wait_event(rc->wait, atomic_read(&rc->elems) == 0);
936 }
937
938 kref_put(&rc->refcnt, reada_control_release);
939
940 return 0;
941}
942#endif
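
[Editor's summary of the reada_control lifetime implied by the code above; commentary only, not part of the patch.]

	/*
	 * btrfs_reada_add()    kref_init() gives the caller's ref, a second
	 *                      kref_get() stands for "elements outstanding"
	 * reada_add_block()    bumps rc->elems once per enqueued block
	 * __readahead_hook()   when the last element completes, drops the
	 *                      "elements" ref and wakes rc->wait
	 * btrfs_reada_wait(),
	 * btrfs_reada_detach() drop the caller's ref; the final kref_put()
	 *                      frees rc via reada_control_release()
	 */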
943
944void btrfs_reada_detach(void *handle)
945{
946 struct reada_control *rc = handle;
947
948 kref_put(&rc->refcnt, reada_control_release);
949}
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 59bb1764273d..24d654ce7a06 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2041,8 +2041,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
2041 BUG_ON(IS_ERR(trans)); 2041 BUG_ON(IS_ERR(trans));
2042 trans->block_rsv = rc->block_rsv; 2042 trans->block_rsv = rc->block_rsv;
2043 2043
2044 ret = btrfs_block_rsv_check(trans, root, rc->block_rsv, 2044 ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved);
2045 min_reserved, 0);
2046 if (ret) { 2045 if (ret) {
2047 BUG_ON(ret != -EAGAIN); 2046 BUG_ON(ret != -EAGAIN);
2048 ret = btrfs_commit_transaction(trans, root); 2047 ret = btrfs_commit_transaction(trans, root);
@@ -2152,8 +2151,7 @@ int prepare_to_merge(struct reloc_control *rc, int err)
2152again: 2151again:
2153 if (!err) { 2152 if (!err) {
2154 num_bytes = rc->merging_rsv_size; 2153 num_bytes = rc->merging_rsv_size;
2155 ret = btrfs_block_rsv_add(NULL, root, rc->block_rsv, 2154 ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes);
2156 num_bytes);
2157 if (ret) 2155 if (ret)
2158 err = ret; 2156 err = ret;
2159 } 2157 }
@@ -2427,7 +2425,7 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans,
2427 num_bytes = calcu_metadata_size(rc, node, 1) * 2; 2425 num_bytes = calcu_metadata_size(rc, node, 1) * 2;
2428 2426
2429 trans->block_rsv = rc->block_rsv; 2427 trans->block_rsv = rc->block_rsv;
2430 ret = btrfs_block_rsv_add(trans, root, rc->block_rsv, num_bytes); 2428 ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes);
2431 if (ret) { 2429 if (ret) {
2432 if (ret == -EAGAIN) 2430 if (ret == -EAGAIN)
2433 rc->commit_transaction = 1; 2431 rc->commit_transaction = 1;
@@ -2922,6 +2920,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
2922 unsigned long last_index; 2920 unsigned long last_index;
2923 struct page *page; 2921 struct page *page;
2924 struct file_ra_state *ra; 2922 struct file_ra_state *ra;
2923 gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
2925 int nr = 0; 2924 int nr = 0;
2926 int ret = 0; 2925 int ret = 0;
2927 2926
@@ -2956,7 +2955,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
2956 ra, NULL, index, 2955 ra, NULL, index,
2957 last_index + 1 - index); 2956 last_index + 1 - index);
2958 page = find_or_create_page(inode->i_mapping, index, 2957 page = find_or_create_page(inode->i_mapping, index,
2959 GFP_NOFS); 2958 mask);
2960 if (!page) { 2959 if (!page) {
2961 btrfs_delalloc_release_metadata(inode, 2960 btrfs_delalloc_release_metadata(inode,
2962 PAGE_CACHE_SIZE); 2961 PAGE_CACHE_SIZE);
@@ -3323,8 +3322,11 @@ static int find_data_references(struct reloc_control *rc,
3323 } 3322 }
3324 3323
3325 key.objectid = ref_objectid; 3324 key.objectid = ref_objectid;
3326 key.offset = ref_offset;
3327 key.type = BTRFS_EXTENT_DATA_KEY; 3325 key.type = BTRFS_EXTENT_DATA_KEY;
3326 if (ref_offset > ((u64)-1 << 32))
3327 key.offset = 0;
3328 else
3329 key.offset = ref_offset;
3328 3330
3329 path->search_commit_root = 1; 3331 path->search_commit_root = 1;
3330 path->skip_locking = 1; 3332 path->skip_locking = 1;
@@ -3645,14 +3647,11 @@ int prepare_to_relocate(struct reloc_control *rc)
3645 * btrfs_init_reloc_root will use them when there 3647 * btrfs_init_reloc_root will use them when there
3646 * is no reservation in transaction handle. 3648 * is no reservation in transaction handle.
3647 */ 3649 */
3648 ret = btrfs_block_rsv_add(NULL, rc->extent_root, rc->block_rsv, 3650 ret = btrfs_block_rsv_add(rc->extent_root, rc->block_rsv,
3649 rc->extent_root->nodesize * 256); 3651 rc->extent_root->nodesize * 256);
3650 if (ret) 3652 if (ret)
3651 return ret; 3653 return ret;
3652 3654
3653 rc->block_rsv->refill_used = 1;
3654 btrfs_add_durable_block_rsv(rc->extent_root->fs_info, rc->block_rsv);
3655
3656 memset(&rc->cluster, 0, sizeof(rc->cluster)); 3655 memset(&rc->cluster, 0, sizeof(rc->cluster));
3657 rc->search_start = rc->block_group->key.objectid; 3656 rc->search_start = rc->block_group->key.objectid;
3658 rc->extents_found = 0; 3657 rc->extents_found = 0;
@@ -3777,8 +3776,7 @@ restart:
3777 } 3776 }
3778 } 3777 }
3779 3778
3780 ret = btrfs_block_rsv_check(trans, rc->extent_root, 3779 ret = btrfs_block_rsv_check(rc->extent_root, rc->block_rsv, 5);
3781 rc->block_rsv, 0, 5);
3782 if (ret < 0) { 3780 if (ret < 0) {
3783 if (ret != -EAGAIN) { 3781 if (ret != -EAGAIN) {
3784 err = ret; 3782 err = ret;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index eba42e5fd5fd..94cd3a19e9c8 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -33,15 +33,12 @@
33 * any can be found. 33 * any can be found.
34 * 34 *
35 * Future enhancements: 35 * Future enhancements:
36 * - To enhance the performance, better read-ahead strategies for the
37 * extent-tree can be employed.
38 * - In case an unrepairable extent is encountered, track which files are 36 * - In case an unrepairable extent is encountered, track which files are
39 * affected and report them 37 * affected and report them
40 * - In case of a read error on files with nodatasum, map the file and read 38 * - In case of a read error on files with nodatasum, map the file and read
41 * the extent to trigger a writeback of the good copy 39 * the extent to trigger a writeback of the good copy
42 * - track and record media errors, throw out bad devices 40 * - track and record media errors, throw out bad devices
43 * - add a mode to also read unallocated space 41 * - add a mode to also read unallocated space
44 * - make the prefetch cancellable
45 */ 42 */
46 43
47struct scrub_bio; 44struct scrub_bio;
@@ -209,7 +206,7 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev)
209 atomic_set(&sdev->in_flight, 0); 206 atomic_set(&sdev->in_flight, 0);
210 atomic_set(&sdev->fixup_cnt, 0); 207 atomic_set(&sdev->fixup_cnt, 0);
211 atomic_set(&sdev->cancel_req, 0); 208 atomic_set(&sdev->cancel_req, 0);
212 sdev->csum_size = btrfs_super_csum_size(&fs_info->super_copy); 209 sdev->csum_size = btrfs_super_csum_size(fs_info->super_copy);
213 INIT_LIST_HEAD(&sdev->csum_list); 210 INIT_LIST_HEAD(&sdev->csum_list);
214 211
215 spin_lock_init(&sdev->list_lock); 212 spin_lock_init(&sdev->list_lock);
@@ -1130,13 +1127,16 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
1130 int slot; 1127 int slot;
1131 int i; 1128 int i;
1132 u64 nstripes; 1129 u64 nstripes;
1133 int start_stripe;
1134 struct extent_buffer *l; 1130 struct extent_buffer *l;
1135 struct btrfs_key key; 1131 struct btrfs_key key;
1136 u64 physical; 1132 u64 physical;
1137 u64 logical; 1133 u64 logical;
1138 u64 generation; 1134 u64 generation;
1139 int mirror_num; 1135 int mirror_num;
1136 struct reada_control *reada1;
1137 struct reada_control *reada2;
1138 struct btrfs_key key_start;
1139 struct btrfs_key key_end;
1140 1140
1141 u64 increment = map->stripe_len; 1141 u64 increment = map->stripe_len;
1142 u64 offset; 1142 u64 offset;
@@ -1168,81 +1168,67 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
1168 if (!path) 1168 if (!path)
1169 return -ENOMEM; 1169 return -ENOMEM;
1170 1170
1171 path->reada = 2;
1172 path->search_commit_root = 1; 1171 path->search_commit_root = 1;
1173 path->skip_locking = 1; 1172 path->skip_locking = 1;
1174 1173
1175 /* 1174 /*
 1176 * find all extents for each stripe and just read them to get 1175 * trigger the readahead for extent tree and csum tree and wait for
1177 * them into the page cache 1176 * completion. During readahead, the scrub is officially paused
1178 * FIXME: we can do better. build a more intelligent prefetching 1177 * to not hold off transaction commits
1179 */ 1178 */
1180 logical = base + offset; 1179 logical = base + offset;
1181 physical = map->stripes[num].physical;
1182 ret = 0;
1183 for (i = 0; i < nstripes; ++i) {
1184 key.objectid = logical;
1185 key.type = BTRFS_EXTENT_ITEM_KEY;
1186 key.offset = (u64)0;
1187
1188 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1189 if (ret < 0)
1190 goto out_noplug;
1191
1192 /*
1193 * we might miss half an extent here, but that doesn't matter,
1194 * as it's only the prefetch
1195 */
1196 while (1) {
1197 l = path->nodes[0];
1198 slot = path->slots[0];
1199 if (slot >= btrfs_header_nritems(l)) {
1200 ret = btrfs_next_leaf(root, path);
1201 if (ret == 0)
1202 continue;
1203 if (ret < 0)
1204 goto out_noplug;
1205 1180
1206 break; 1181 wait_event(sdev->list_wait,
1207 } 1182 atomic_read(&sdev->in_flight) == 0);
1208 btrfs_item_key_to_cpu(l, &key, slot); 1183 atomic_inc(&fs_info->scrubs_paused);
1184 wake_up(&fs_info->scrub_pause_wait);
1209 1185
1210 if (key.objectid >= logical + map->stripe_len) 1186 /* FIXME it might be better to start readahead at commit root */
1211 break; 1187 key_start.objectid = logical;
1188 key_start.type = BTRFS_EXTENT_ITEM_KEY;
1189 key_start.offset = (u64)0;
1190 key_end.objectid = base + offset + nstripes * increment;
1191 key_end.type = BTRFS_EXTENT_ITEM_KEY;
1192 key_end.offset = (u64)0;
1193 reada1 = btrfs_reada_add(root, &key_start, &key_end);
1194
1195 key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
1196 key_start.type = BTRFS_EXTENT_CSUM_KEY;
1197 key_start.offset = logical;
1198 key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
1199 key_end.type = BTRFS_EXTENT_CSUM_KEY;
1200 key_end.offset = base + offset + nstripes * increment;
1201 reada2 = btrfs_reada_add(csum_root, &key_start, &key_end);
1202
1203 if (!IS_ERR(reada1))
1204 btrfs_reada_wait(reada1);
1205 if (!IS_ERR(reada2))
1206 btrfs_reada_wait(reada2);
1212 1207
1213 path->slots[0]++; 1208 mutex_lock(&fs_info->scrub_lock);
1214 } 1209 while (atomic_read(&fs_info->scrub_pause_req)) {
1215 btrfs_release_path(path); 1210 mutex_unlock(&fs_info->scrub_lock);
1216 logical += increment; 1211 wait_event(fs_info->scrub_pause_wait,
1217 physical += map->stripe_len; 1212 atomic_read(&fs_info->scrub_pause_req) == 0);
1218 cond_resched(); 1213 mutex_lock(&fs_info->scrub_lock);
1219 } 1214 }
1215 atomic_dec(&fs_info->scrubs_paused);
1216 mutex_unlock(&fs_info->scrub_lock);
1217 wake_up(&fs_info->scrub_pause_wait);
1220 1218
1221 /* 1219 /*
1222 * collect all data csums for the stripe to avoid seeking during 1220 * collect all data csums for the stripe to avoid seeking during
1223 * the scrub. This might currently (crc32) end up to be about 1MB 1221 * the scrub. This might currently (crc32) end up to be about 1MB
1224 */ 1222 */
1225 start_stripe = 0;
1226 blk_start_plug(&plug); 1223 blk_start_plug(&plug);
1227again:
1228 logical = base + offset + start_stripe * increment;
1229 for (i = start_stripe; i < nstripes; ++i) {
1230 ret = btrfs_lookup_csums_range(csum_root, logical,
1231 logical + map->stripe_len - 1,
1232 &sdev->csum_list, 1);
1233 if (ret)
1234 goto out;
1235 1224
1236 logical += increment;
1237 cond_resched();
1238 }
1239 /* 1225 /*
1240 * now find all extents for each stripe and scrub them 1226 * now find all extents for each stripe and scrub them
1241 */ 1227 */
1242 logical = base + offset + start_stripe * increment; 1228 logical = base + offset;
1243 physical = map->stripes[num].physical + start_stripe * map->stripe_len; 1229 physical = map->stripes[num].physical;
1244 ret = 0; 1230 ret = 0;
1245 for (i = start_stripe; i < nstripes; ++i) { 1231 for (i = 0; i < nstripes; ++i) {
1246 /* 1232 /*
1247 * canceled? 1233 * canceled?
1248 */ 1234 */
@@ -1271,11 +1257,14 @@ again:
1271 atomic_dec(&fs_info->scrubs_paused); 1257 atomic_dec(&fs_info->scrubs_paused);
1272 mutex_unlock(&fs_info->scrub_lock); 1258 mutex_unlock(&fs_info->scrub_lock);
1273 wake_up(&fs_info->scrub_pause_wait); 1259 wake_up(&fs_info->scrub_pause_wait);
1274 scrub_free_csums(sdev);
1275 start_stripe = i;
1276 goto again;
1277 } 1260 }
1278 1261
1262 ret = btrfs_lookup_csums_range(csum_root, logical,
1263 logical + map->stripe_len - 1,
1264 &sdev->csum_list, 1);
1265 if (ret)
1266 goto out;
1267
1279 key.objectid = logical; 1268 key.objectid = logical;
1280 key.type = BTRFS_EXTENT_ITEM_KEY; 1269 key.type = BTRFS_EXTENT_ITEM_KEY;
1281 key.offset = (u64)0; 1270 key.offset = (u64)0;
@@ -1371,7 +1360,6 @@ next:
1371 1360
1372out: 1361out:
1373 blk_finish_plug(&plug); 1362 blk_finish_plug(&plug);
1374out_noplug:
1375 btrfs_free_path(path); 1363 btrfs_free_path(path);
1376 return ret < 0 ? ret : 0; 1364 return ret < 0 ? ret : 0;
1377} 1365}
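
[Editor's summary of the reworked scrub_stripe() above; illustrative commentary, not part of the patch.]

	/*
	 * New order of operations per stripe set:
	 *  1. wait for in-flight scrub bios, then count the scrub as paused
	 *     so the readahead cannot hold off transaction commits
	 *  2. start two btrfs_reada_add() ranges covering all stripes:
	 *     EXTENT_ITEM keys on the extent root and EXTENT_CSUM keys on
	 *     the csum root, and wait for both; reada errors are ignored,
	 *     since the prefetch is only an optimization
	 *  3. honour any pending scrub_pause_req before unpausing
	 *  4. csums are now looked up per stripe inside the main loop,
	 *     which lets the old start_stripe/again: restart logic go away
	 */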
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 15634d4648d7..57080dffdfc6 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -40,6 +40,7 @@
40#include <linux/magic.h> 40#include <linux/magic.h>
41#include <linux/slab.h> 41#include <linux/slab.h>
42#include <linux/cleancache.h> 42#include <linux/cleancache.h>
43#include <linux/mnt_namespace.h>
43#include "compat.h" 44#include "compat.h"
44#include "delayed-inode.h" 45#include "delayed-inode.h"
45#include "ctree.h" 46#include "ctree.h"
@@ -58,6 +59,7 @@
58#include <trace/events/btrfs.h> 59#include <trace/events/btrfs.h>
59 60
60static const struct super_operations btrfs_super_ops; 61static const struct super_operations btrfs_super_ops;
62static struct file_system_type btrfs_fs_type;
61 63
62static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno, 64static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno,
63 char nbuf[16]) 65 char nbuf[16])
@@ -162,7 +164,7 @@ enum {
162 Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard, 164 Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
163 Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, 165 Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed,
164 Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, 166 Opt_enospc_debug, Opt_subvolrootid, Opt_defrag,
165 Opt_inode_cache, Opt_err, 167 Opt_inode_cache, Opt_no_space_cache, Opt_recovery, Opt_err,
166}; 168};
167 169
168static match_table_t tokens = { 170static match_table_t tokens = {
@@ -195,6 +197,8 @@ static match_table_t tokens = {
195 {Opt_subvolrootid, "subvolrootid=%d"}, 197 {Opt_subvolrootid, "subvolrootid=%d"},
196 {Opt_defrag, "autodefrag"}, 198 {Opt_defrag, "autodefrag"},
197 {Opt_inode_cache, "inode_cache"}, 199 {Opt_inode_cache, "inode_cache"},
200 {Opt_no_space_cache, "no_space_cache"},
201 {Opt_recovery, "recovery"},
198 {Opt_err, NULL}, 202 {Opt_err, NULL},
199}; 203};
200 204
@@ -206,14 +210,19 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
206{ 210{
207 struct btrfs_fs_info *info = root->fs_info; 211 struct btrfs_fs_info *info = root->fs_info;
208 substring_t args[MAX_OPT_ARGS]; 212 substring_t args[MAX_OPT_ARGS];
209 char *p, *num, *orig; 213 char *p, *num, *orig = NULL;
214 u64 cache_gen;
210 int intarg; 215 int intarg;
211 int ret = 0; 216 int ret = 0;
212 char *compress_type; 217 char *compress_type;
213 bool compress_force = false; 218 bool compress_force = false;
214 219
220 cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
221 if (cache_gen)
222 btrfs_set_opt(info->mount_opt, SPACE_CACHE);
223
215 if (!options) 224 if (!options)
216 return 0; 225 goto out;
217 226
218 /* 227 /*
219 * strsep changes the string, duplicate it because parse_options 228 * strsep changes the string, duplicate it because parse_options
@@ -360,9 +369,12 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
360 btrfs_set_opt(info->mount_opt, DISCARD); 369 btrfs_set_opt(info->mount_opt, DISCARD);
361 break; 370 break;
362 case Opt_space_cache: 371 case Opt_space_cache:
363 printk(KERN_INFO "btrfs: enabling disk space caching\n");
364 btrfs_set_opt(info->mount_opt, SPACE_CACHE); 372 btrfs_set_opt(info->mount_opt, SPACE_CACHE);
365 break; 373 break;
374 case Opt_no_space_cache:
375 printk(KERN_INFO "btrfs: disabling disk space caching\n");
376 btrfs_clear_opt(info->mount_opt, SPACE_CACHE);
377 break;
366 case Opt_inode_cache: 378 case Opt_inode_cache:
367 printk(KERN_INFO "btrfs: enabling inode map caching\n"); 379 printk(KERN_INFO "btrfs: enabling inode map caching\n");
368 btrfs_set_opt(info->mount_opt, INODE_MAP_CACHE); 380 btrfs_set_opt(info->mount_opt, INODE_MAP_CACHE);
@@ -381,6 +393,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
381 printk(KERN_INFO "btrfs: enabling auto defrag"); 393 printk(KERN_INFO "btrfs: enabling auto defrag");
382 btrfs_set_opt(info->mount_opt, AUTO_DEFRAG); 394 btrfs_set_opt(info->mount_opt, AUTO_DEFRAG);
383 break; 395 break;
396 case Opt_recovery:
 397			printk(KERN_INFO "btrfs: enabling auto recovery\n");
398 btrfs_set_opt(info->mount_opt, RECOVERY);
399 break;
384 case Opt_err: 400 case Opt_err:
385 printk(KERN_INFO "btrfs: unrecognized mount option " 401 printk(KERN_INFO "btrfs: unrecognized mount option "
386 "'%s'\n", p); 402 "'%s'\n", p);
@@ -391,6 +407,8 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
391 } 407 }
392 } 408 }
393out: 409out:
410 if (!ret && btrfs_test_opt(root, SPACE_CACHE))
411 printk(KERN_INFO "btrfs: disk space caching is enabled\n");
394 kfree(orig); 412 kfree(orig);
395 return ret; 413 return ret;
396} 414}
@@ -406,12 +424,12 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
406 u64 *subvol_rootid, struct btrfs_fs_devices **fs_devices) 424 u64 *subvol_rootid, struct btrfs_fs_devices **fs_devices)
407{ 425{
408 substring_t args[MAX_OPT_ARGS]; 426 substring_t args[MAX_OPT_ARGS];
409 char *opts, *orig, *p; 427 char *device_name, *opts, *orig, *p;
410 int error = 0; 428 int error = 0;
411 int intarg; 429 int intarg;
412 430
413 if (!options) 431 if (!options)
414 goto out; 432 return 0;
415 433
416 /* 434 /*
417 * strsep changes the string, duplicate it because parse_options 435 * strsep changes the string, duplicate it because parse_options
@@ -457,29 +475,24 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
457 } 475 }
458 break; 476 break;
459 case Opt_device: 477 case Opt_device:
460 error = btrfs_scan_one_device(match_strdup(&args[0]), 478 device_name = match_strdup(&args[0]);
479 if (!device_name) {
480 error = -ENOMEM;
481 goto out;
482 }
483 error = btrfs_scan_one_device(device_name,
461 flags, holder, fs_devices); 484 flags, holder, fs_devices);
485 kfree(device_name);
462 if (error) 486 if (error)
463 goto out_free_opts; 487 goto out;
464 break; 488 break;
465 default: 489 default:
466 break; 490 break;
467 } 491 }
468 } 492 }
469 493
470 out_free_opts: 494out:
471 kfree(orig); 495 kfree(orig);
472 out:
473 /*
474 * If no subvolume name is specified we use the default one. Allocate
475 * a copy of the string "." here so that code later in the
476 * mount path doesn't care if it's the default volume or another one.
477 */
478 if (!*subvol_name) {
479 *subvol_name = kstrdup(".", GFP_KERNEL);
480 if (!*subvol_name)
481 return -ENOMEM;
482 }
483 return error; 496 return error;
484} 497}
485 498
@@ -492,7 +505,6 @@ static struct dentry *get_default_root(struct super_block *sb,
492 struct btrfs_path *path; 505 struct btrfs_path *path;
493 struct btrfs_key location; 506 struct btrfs_key location;
494 struct inode *inode; 507 struct inode *inode;
495 struct dentry *dentry;
496 u64 dir_id; 508 u64 dir_id;
497 int new = 0; 509 int new = 0;
498 510
@@ -517,7 +529,7 @@ static struct dentry *get_default_root(struct super_block *sb,
517 * will mount by default if we haven't been given a specific subvolume 529 * will mount by default if we haven't been given a specific subvolume
518 * to mount. 530 * to mount.
519 */ 531 */
520 dir_id = btrfs_super_root_dir(&root->fs_info->super_copy); 532 dir_id = btrfs_super_root_dir(root->fs_info->super_copy);
521 di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0); 533 di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0);
522 if (IS_ERR(di)) { 534 if (IS_ERR(di)) {
523 btrfs_free_path(path); 535 btrfs_free_path(path);
@@ -566,29 +578,7 @@ setup_root:
566 return dget(sb->s_root); 578 return dget(sb->s_root);
567 } 579 }
568 580
569 if (new) { 581 return d_obtain_alias(inode);
570 const struct qstr name = { .name = "/", .len = 1 };
571
572 /*
573 * New inode, we need to make the dentry a sibling of s_root so
574 * everything gets cleaned up properly on unmount.
575 */
576 dentry = d_alloc(sb->s_root, &name);
577 if (!dentry) {
578 iput(inode);
579 return ERR_PTR(-ENOMEM);
580 }
581 d_splice_alias(inode, dentry);
582 } else {
583 /*
584 * We found the inode in cache, just find a dentry for it and
585 * put the reference to the inode we just got.
586 */
587 dentry = d_find_alias(inode);
588 iput(inode);
589 }
590
591 return dentry;
592} 582}
593 583
594static int btrfs_fill_super(struct super_block *sb, 584static int btrfs_fill_super(struct super_block *sb,
@@ -719,6 +709,8 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
719 seq_puts(seq, ",noacl"); 709 seq_puts(seq, ",noacl");
720 if (btrfs_test_opt(root, SPACE_CACHE)) 710 if (btrfs_test_opt(root, SPACE_CACHE))
721 seq_puts(seq, ",space_cache"); 711 seq_puts(seq, ",space_cache");
712 else
713 seq_puts(seq, ",no_space_cache");
722 if (btrfs_test_opt(root, CLEAR_CACHE)) 714 if (btrfs_test_opt(root, CLEAR_CACHE))
723 seq_puts(seq, ",clear_cache"); 715 seq_puts(seq, ",clear_cache");
724 if (btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED)) 716 if (btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED))
@@ -753,6 +745,137 @@ static int btrfs_set_super(struct super_block *s, void *data)
753 return set_anon_super(s, data); 745 return set_anon_super(s, data);
754} 746}
755 747
748/*
749 * subvolumes are identified by ino 256
750 */
751static inline int is_subvolume_inode(struct inode *inode)
752{
753 if (inode && inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
754 return 1;
755 return 0;
756}
757
758/*
759 * This will strip out the subvol=%s argument for an argument string and add
760 * subvolid=0 to make sure we get the actual tree root for path walking to the
761 * subvol we want.
762 */
763static char *setup_root_args(char *args)
764{
765 unsigned copied = 0;
766 unsigned len = strlen(args) + 2;
767 char *pos;
768 char *ret;
769
770 /*
771 * We need the same args as before, but minus
772 *
773 * subvol=a
774 *
775 * and add
776 *
777 * subvolid=0
778 *
779 * which is a difference of 2 characters, so we allocate strlen(args) +
780 * 2 characters.
781 */
782 ret = kzalloc(len * sizeof(char), GFP_NOFS);
783 if (!ret)
784 return NULL;
785 pos = strstr(args, "subvol=");
786
787 /* This shouldn't happen, but just in case.. */
788 if (!pos) {
789 kfree(ret);
790 return NULL;
791 }
792
793 /*
 794	 * The subvol=<> arg is not at the front of the string, copy everything
795 * up to that into ret.
796 */
797 if (pos != args) {
798 *pos = '\0';
799 strcpy(ret, args);
800 copied += strlen(args);
801 pos++;
802 }
803
804 strncpy(ret + copied, "subvolid=0", len - copied);
805
806 /* Length of subvolid=0 */
807 copied += 10;
808
809 /*
810 * If there is no , after the subvol= option then we know there's no
811 * other options and we can just return.
812 */
813 pos = strchr(pos, ',');
814 if (!pos)
815 return ret;
816
817 /* Copy the rest of the arguments into our buffer */
818 strncpy(ret + copied, pos, len - copied);
819 copied += strlen(pos);
820
821 return ret;
822}
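
[Editor's note: a worked example of the rewrite performed by setup_root_args(); the option string is illustrative.]

	/*
	 *	setup_root_args("noatime,subvol=snap1,compress=lzo")
	 *		-> "noatime,subvolid=0,compress=lzo"
	 *
	 * "subvol=<name>" becomes "subvolid=0"; everything else is copied
	 * through. The +2 in the allocation covers the worst case, where
	 * "subvolid=0" (10 bytes) replaces the shortest possible
	 * "subvol=x" (8 bytes).
	 */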
823
824static struct dentry *mount_subvol(const char *subvol_name, int flags,
825 const char *device_name, char *data)
826{
827 struct super_block *s;
828 struct dentry *root;
829 struct vfsmount *mnt;
830 struct mnt_namespace *ns_private;
831 char *newargs;
832 struct path path;
833 int error;
834
835 newargs = setup_root_args(data);
836 if (!newargs)
837 return ERR_PTR(-ENOMEM);
838 mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name,
839 newargs);
840 kfree(newargs);
841 if (IS_ERR(mnt))
842 return ERR_CAST(mnt);
843
844 ns_private = create_mnt_ns(mnt);
845 if (IS_ERR(ns_private)) {
846 mntput(mnt);
847 return ERR_CAST(ns_private);
848 }
849
850 /*
851 * This will trigger the automount of the subvol so we can just
852 * drop the mnt we have here and return the dentry that we
853 * found.
854 */
855 error = vfs_path_lookup(mnt->mnt_root, mnt, subvol_name,
856 LOOKUP_FOLLOW, &path);
857 put_mnt_ns(ns_private);
858 if (error)
859 return ERR_PTR(error);
860
861 if (!is_subvolume_inode(path.dentry->d_inode)) {
862 path_put(&path);
863 mntput(mnt);
864 error = -EINVAL;
865 printk(KERN_ERR "btrfs: '%s' is not a valid subvolume\n",
866 subvol_name);
867 return ERR_PTR(-EINVAL);
868 }
869
870 /* Get a ref to the sb and the dentry we found and return it */
871 s = path.mnt->mnt_sb;
872 atomic_inc(&s->s_active);
873 root = dget(path.dentry);
874 path_put(&path);
875 down_write(&s->s_umount);
876
877 return root;
878}
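
[Editor's summary of mount_subvol() above; commentary only, not part of the patch.]

	/*
	 * mount_subvol() re-enters btrfs_mount() via vfs_kern_mount() with
	 * the rewritten "subvolid=0" options, wraps that mount in a private
	 * namespace, and lets vfs_path_lookup(..., LOOKUP_FOLLOW, ...)
	 * trigger the subvolume automount. It returns the subvolume dentry
	 * with an extra s_active reference and s_umount held, which is what
	 * the VFS expects from a ->mount() implementation.
	 */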
756 879
757/* 880/*
758 * Find a superblock for the given device / mount point. 881 * Find a superblock for the given device / mount point.
@@ -784,13 +907,19 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
784 if (error) 907 if (error)
785 return ERR_PTR(error); 908 return ERR_PTR(error);
786 909
910 if (subvol_name) {
911 root = mount_subvol(subvol_name, flags, device_name, data);
912 kfree(subvol_name);
913 return root;
914 }
915
787 error = btrfs_scan_one_device(device_name, mode, fs_type, &fs_devices); 916 error = btrfs_scan_one_device(device_name, mode, fs_type, &fs_devices);
788 if (error) 917 if (error)
789 goto error_free_subvol_name; 918 return ERR_PTR(error);
790 919
791 error = btrfs_open_devices(fs_devices, mode, fs_type); 920 error = btrfs_open_devices(fs_devices, mode, fs_type);
792 if (error) 921 if (error)
793 goto error_free_subvol_name; 922 return ERR_PTR(error);
794 923
795 if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) { 924 if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) {
796 error = -EACCES; 925 error = -EACCES;
@@ -813,88 +942,57 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
813 fs_info->fs_devices = fs_devices; 942 fs_info->fs_devices = fs_devices;
814 tree_root->fs_info = fs_info; 943 tree_root->fs_info = fs_info;
815 944
945 fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
946 fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
947 if (!fs_info->super_copy || !fs_info->super_for_commit) {
948 error = -ENOMEM;
949 goto error_close_devices;
950 }
951
816 bdev = fs_devices->latest_bdev; 952 bdev = fs_devices->latest_bdev;
817 s = sget(fs_type, btrfs_test_super, btrfs_set_super, tree_root); 953 s = sget(fs_type, btrfs_test_super, btrfs_set_super, tree_root);
818 if (IS_ERR(s)) 954 if (IS_ERR(s)) {
819 goto error_s; 955 error = PTR_ERR(s);
956 goto error_close_devices;
957 }
820 958
821 if (s->s_root) { 959 if (s->s_root) {
822 if ((flags ^ s->s_flags) & MS_RDONLY) { 960 if ((flags ^ s->s_flags) & MS_RDONLY) {
823 deactivate_locked_super(s); 961 deactivate_locked_super(s);
824 error = -EBUSY; 962 return ERR_PTR(-EBUSY);
825 goto error_close_devices;
826 } 963 }
827 964
828 btrfs_close_devices(fs_devices); 965 btrfs_close_devices(fs_devices);
829 kfree(fs_info); 966 free_fs_info(fs_info);
830 kfree(tree_root); 967 kfree(tree_root);
831 } else { 968 } else {
832 char b[BDEVNAME_SIZE]; 969 char b[BDEVNAME_SIZE];
833 970
834 s->s_flags = flags | MS_NOSEC; 971 s->s_flags = flags | MS_NOSEC;
835 strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); 972 strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
973 btrfs_sb(s)->fs_info->bdev_holder = fs_type;
836 error = btrfs_fill_super(s, fs_devices, data, 974 error = btrfs_fill_super(s, fs_devices, data,
837 flags & MS_SILENT ? 1 : 0); 975 flags & MS_SILENT ? 1 : 0);
838 if (error) { 976 if (error) {
839 deactivate_locked_super(s); 977 deactivate_locked_super(s);
840 goto error_free_subvol_name; 978 return ERR_PTR(error);
841 } 979 }
842 980
843 btrfs_sb(s)->fs_info->bdev_holder = fs_type;
844 s->s_flags |= MS_ACTIVE; 981 s->s_flags |= MS_ACTIVE;
845 } 982 }
846 983
847 /* if they gave us a subvolume name bind mount into that */ 984 root = get_default_root(s, subvol_objectid);
848 if (strcmp(subvol_name, ".")) { 985 if (IS_ERR(root)) {
849 struct dentry *new_root; 986 deactivate_locked_super(s);
850 987 return root;
851 root = get_default_root(s, subvol_rootid);
852 if (IS_ERR(root)) {
853 error = PTR_ERR(root);
854 deactivate_locked_super(s);
855 goto error_free_subvol_name;
856 }
857
858 mutex_lock(&root->d_inode->i_mutex);
859 new_root = lookup_one_len(subvol_name, root,
860 strlen(subvol_name));
861 mutex_unlock(&root->d_inode->i_mutex);
862
863 if (IS_ERR(new_root)) {
864 dput(root);
865 deactivate_locked_super(s);
866 error = PTR_ERR(new_root);
867 goto error_free_subvol_name;
868 }
869 if (!new_root->d_inode) {
870 dput(root);
871 dput(new_root);
872 deactivate_locked_super(s);
873 error = -ENXIO;
874 goto error_free_subvol_name;
875 }
876 dput(root);
877 root = new_root;
878 } else {
879 root = get_default_root(s, subvol_objectid);
880 if (IS_ERR(root)) {
881 error = PTR_ERR(root);
882 deactivate_locked_super(s);
883 goto error_free_subvol_name;
884 }
885 } 988 }
886 989
887 kfree(subvol_name);
888 return root; 990 return root;
889 991
890error_s:
891 error = PTR_ERR(s);
892error_close_devices: 992error_close_devices:
893 btrfs_close_devices(fs_devices); 993 btrfs_close_devices(fs_devices);
894 kfree(fs_info); 994 free_fs_info(fs_info);
895 kfree(tree_root); 995 kfree(tree_root);
896error_free_subvol_name:
897 kfree(subvol_name);
898 return ERR_PTR(error); 996 return ERR_PTR(error);
899} 997}
900 998
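
With subvolume mounts diverted into mount_subvol() before any devices are opened, btrfs_mount() can return ERR_PTR(error) directly and the old error_free_subvol_name label disappears; the remaining unwind only has to undo device opening and the fs_info allocations. free_fs_info() is not shown in this hunk, but from its call sites it evidently frees the two kzalloc'd super block copies along with fs_info itself. A userspace sketch of that allocate-both-or-unwind shape, with invented names:

    #include <stdlib.h>

    struct fs_info { void *super_copy; void *super_for_commit; };

    #define SUPER_INFO_SIZE 4096

    /* Allocate both superblock copies, unwinding on failure through a
     * single cleanup path, the same shape as error_close_devices above. */
    static struct fs_info *alloc_fs_info(void)
    {
        struct fs_info *fi = calloc(1, sizeof(*fi));

        if (!fi)
            return NULL;
        fi->super_copy = calloc(1, SUPER_INFO_SIZE);
        fi->super_for_commit = calloc(1, SUPER_INFO_SIZE);
        if (!fi->super_copy || !fi->super_for_commit)
            goto fail;
        return fi;
    fail:
        free(fi->super_copy);       /* free(NULL) is safe */
        free(fi->super_for_commit);
        free(fi);
        return NULL;
    }
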
@@ -919,7 +1017,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
919 if (root->fs_info->fs_devices->rw_devices == 0) 1017 if (root->fs_info->fs_devices->rw_devices == 0)
920 return -EACCES; 1018 return -EACCES;
921 1019
922 if (btrfs_super_log_root(&root->fs_info->super_copy) != 0) 1020 if (btrfs_super_log_root(root->fs_info->super_copy) != 0)
923 return -EINVAL; 1021 return -EINVAL;
924 1022
925 ret = btrfs_cleanup_fs_roots(root->fs_info); 1023 ret = btrfs_cleanup_fs_roots(root->fs_info);
@@ -1085,7 +1183,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
1085static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) 1183static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
1086{ 1184{
1087 struct btrfs_root *root = btrfs_sb(dentry->d_sb); 1185 struct btrfs_root *root = btrfs_sb(dentry->d_sb);
1088 struct btrfs_super_block *disk_super = &root->fs_info->super_copy; 1186 struct btrfs_super_block *disk_super = root->fs_info->super_copy;
1089 struct list_head *head = &root->fs_info->space_info; 1187 struct list_head *head = &root->fs_info->space_info;
1090 struct btrfs_space_info *found; 1188 struct btrfs_space_info *found;
1091 u64 total_used = 0; 1189 u64 total_used = 0;
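
The `&root->fs_info->super_copy` to `root->fs_info->super_copy` churn here and in the files below comes from super_copy and super_for_commit turning into kzalloc'd pointers instead of embedded structs. The subtle hazard in such a conversion is sizeof: the transaction.c hunk further down has to change sizeof(root->fs_info->super_copy) to sizeof(*root->fs_info->super_copy), or the commit memcpy would copy only a pointer's worth of bytes. A small demonstration (the 4096 size is illustrative):

    #include <stdio.h>

    struct super_block_copy { char data[4096]; };

    struct fs_info_embedded { struct super_block_copy super_copy; };
    struct fs_info_pointer  { struct super_block_copy *super_copy; };

    int main(void)
    {
        /* With the member embedded, sizeof(e.super_copy) is 4096. After
         * converting it to a pointer, the same expression silently becomes
         * sizeof(void *); every sizeof/memcpy site must switch to
         * sizeof(*p.super_copy). */
        struct fs_info_embedded e;
        struct fs_info_pointer p = { .super_copy = &e.super_copy };

        printf("embedded: %zu bytes\n", sizeof(e.super_copy));   /* 4096 */
        printf("pointer:  %zu bytes\n", sizeof(p.super_copy));   /* 8 on LP64 */
        printf("pointee:  %zu bytes\n", sizeof(*p.super_copy));  /* 4096 */
        return 0;
    }
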
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index e24b7964a155..29f782cc2cc9 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -275,7 +275,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
275 */ 275 */
276 if (num_items > 0 && root != root->fs_info->chunk_root) { 276 if (num_items > 0 && root != root->fs_info->chunk_root) {
277 num_bytes = btrfs_calc_trans_metadata_size(root, num_items); 277 num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
278 ret = btrfs_block_rsv_add(NULL, root, 278 ret = btrfs_block_rsv_add(root,
279 &root->fs_info->trans_block_rsv, 279 &root->fs_info->trans_block_rsv,
280 num_bytes); 280 num_bytes);
281 if (ret) 281 if (ret)
@@ -418,8 +418,8 @@ static int should_end_transaction(struct btrfs_trans_handle *trans,
418 struct btrfs_root *root) 418 struct btrfs_root *root)
419{ 419{
420 int ret; 420 int ret;
421 ret = btrfs_block_rsv_check(trans, root, 421
422 &root->fs_info->global_block_rsv, 0, 5); 422 ret = btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5);
423 return ret ? 1 : 0; 423 return ret ? 1 : 0;
424} 424}
425 425
@@ -427,17 +427,26 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
427 struct btrfs_root *root) 427 struct btrfs_root *root)
428{ 428{
429 struct btrfs_transaction *cur_trans = trans->transaction; 429 struct btrfs_transaction *cur_trans = trans->transaction;
430 struct btrfs_block_rsv *rsv = trans->block_rsv;
430 int updates; 431 int updates;
431 432
432 smp_mb(); 433 smp_mb();
433 if (cur_trans->blocked || cur_trans->delayed_refs.flushing) 434 if (cur_trans->blocked || cur_trans->delayed_refs.flushing)
434 return 1; 435 return 1;
435 436
437 /*
438 * We need to do this in case we're deleting csums so the global block
439 * rsv gets used instead of the csum block rsv.
440 */
441 trans->block_rsv = NULL;
442
436 updates = trans->delayed_ref_updates; 443 updates = trans->delayed_ref_updates;
437 trans->delayed_ref_updates = 0; 444 trans->delayed_ref_updates = 0;
438 if (updates) 445 if (updates)
439 btrfs_run_delayed_refs(trans, root, updates); 446 btrfs_run_delayed_refs(trans, root, updates);
440 447
448 trans->block_rsv = rsv;
449
441 return should_end_transaction(trans, root); 450 return should_end_transaction(trans, root);
442} 451}
443 452
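
The save/clear/restore of trans->block_rsv around btrfs_run_delayed_refs() makes the callee fall back to the global reservation while csums are deleted, then puts the handle's own rsv back for the caller. The shape, sketched with plain counters (the reservation semantics are simplified here):

    #include <stddef.h>

    struct rsv { long bytes; };

    struct handle {
        struct rsv *block_rsv;   /* NULL means "use the global rsv" */
    };

    static void run_work(struct handle *h, struct rsv *global_rsv)
    {
        struct rsv *pool = h->block_rsv ? h->block_rsv : global_rsv;
        pool->bytes -= 1;        /* stand-in for the real work */
    }

    static void run_work_from_global(struct handle *h, struct rsv *global_rsv)
    {
        struct rsv *saved = h->block_rsv;

        h->block_rsv = NULL;     /* force the fallback */
        run_work(h, global_rsv);
        h->block_rsv = saved;    /* restore for the caller */
    }
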
@@ -453,6 +462,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
453 return 0; 462 return 0;
454 } 463 }
455 464
465 btrfs_trans_release_metadata(trans, root);
466 trans->block_rsv = NULL;
456 while (count < 4) { 467 while (count < 4) {
457 unsigned long cur = trans->delayed_ref_updates; 468 unsigned long cur = trans->delayed_ref_updates;
458 trans->delayed_ref_updates = 0; 469 trans->delayed_ref_updates = 0;
@@ -473,8 +484,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
473 count++; 484 count++;
474 } 485 }
475 486
476 btrfs_trans_release_metadata(trans, root);
477
478 if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) && 487 if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&
479 should_end_transaction(trans, root)) { 488 should_end_transaction(trans, root)) {
480 trans->transaction->blocked = 1; 489 trans->transaction->blocked = 1;
@@ -562,50 +571,21 @@ int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans,
562int btrfs_write_marked_extents(struct btrfs_root *root, 571int btrfs_write_marked_extents(struct btrfs_root *root,
563 struct extent_io_tree *dirty_pages, int mark) 572 struct extent_io_tree *dirty_pages, int mark)
564{ 573{
565 int ret;
566 int err = 0; 574 int err = 0;
567 int werr = 0; 575 int werr = 0;
568 struct page *page; 576 struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
569 struct inode *btree_inode = root->fs_info->btree_inode;
570 u64 start = 0; 577 u64 start = 0;
571 u64 end; 578 u64 end;
572 unsigned long index;
573 579
574 while (1) { 580 while (!find_first_extent_bit(dirty_pages, start, &start, &end,
575 ret = find_first_extent_bit(dirty_pages, start, &start, &end, 581 mark)) {
576 mark); 582 convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, mark,
577 if (ret) 583 GFP_NOFS);
578 break; 584 err = filemap_fdatawrite_range(mapping, start, end);
579 while (start <= end) { 585 if (err)
580 cond_resched(); 586 werr = err;
581 587 cond_resched();
582 index = start >> PAGE_CACHE_SHIFT; 588 start = end + 1;
583 start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
584 page = find_get_page(btree_inode->i_mapping, index);
585 if (!page)
586 continue;
587
588 btree_lock_page_hook(page);
589 if (!page->mapping) {
590 unlock_page(page);
591 page_cache_release(page);
592 continue;
593 }
594
595 if (PageWriteback(page)) {
596 if (PageDirty(page))
597 wait_on_page_writeback(page);
598 else {
599 unlock_page(page);
600 page_cache_release(page);
601 continue;
602 }
603 }
604 err = write_one_page(page, 0);
605 if (err)
606 werr = err;
607 page_cache_release(page);
608 }
609 } 589 }
610 if (err) 590 if (err)
611 werr = err; 591 werr = err;
@@ -621,39 +601,20 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
621int btrfs_wait_marked_extents(struct btrfs_root *root, 601int btrfs_wait_marked_extents(struct btrfs_root *root,
622 struct extent_io_tree *dirty_pages, int mark) 602 struct extent_io_tree *dirty_pages, int mark)
623{ 603{
624 int ret;
625 int err = 0; 604 int err = 0;
626 int werr = 0; 605 int werr = 0;
627 struct page *page; 606 struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
628 struct inode *btree_inode = root->fs_info->btree_inode;
629 u64 start = 0; 607 u64 start = 0;
630 u64 end; 608 u64 end;
631 unsigned long index;
632 609
633 while (1) { 610 while (!find_first_extent_bit(dirty_pages, start, &start, &end,
634 ret = find_first_extent_bit(dirty_pages, start, &start, &end, 611 EXTENT_NEED_WAIT)) {
635 mark); 612 clear_extent_bits(dirty_pages, start, end, EXTENT_NEED_WAIT, GFP_NOFS);
636 if (ret) 613 err = filemap_fdatawait_range(mapping, start, end);
637 break; 614 if (err)
638 615 werr = err;
639 clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS); 616 cond_resched();
640 while (start <= end) { 617 start = end + 1;
641 index = start >> PAGE_CACHE_SHIFT;
642 start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
643 page = find_get_page(btree_inode->i_mapping, index);
644 if (!page)
645 continue;
646 if (PageDirty(page)) {
647 btree_lock_page_hook(page);
648 wait_on_page_writeback(page);
649 err = write_one_page(page, 0);
650 if (err)
651 werr = err;
652 }
653 wait_on_page_writeback(page);
654 page_cache_release(page);
655 cond_resched();
656 }
657 } 618 }
658 if (err) 619 if (err)
659 werr = err; 620 werr = err;
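
Both loops above replace the old page-at-a-time walk with range-granular calls into the page cache: the write pass converts the caller's mark to EXTENT_NEED_WAIT as it issues filemap_fdatawrite_range(), and the wait pass walks exactly those EXTENT_NEED_WAIT ranges, clearing the bit before filemap_fdatawait_range(). A compact model of the two phases over a sorted array standing in for the extent_io tree, with the kernel calls left as comments:

    #include <stddef.h>

    struct range { unsigned long long start, end; unsigned flags; };

    #define F_DIRTY     0x1u
    #define F_NEED_WAIT 0x2u

    /* First range at or after "start" carrying any of "bits"; stands in
     * for find_first_extent_bit(). Assumes r[] is sorted by start. */
    static struct range *first_with_bits(struct range *r, size_t n,
                                         unsigned long long start,
                                         unsigned bits)
    {
        for (size_t i = 0; i < n; i++)
            if (r[i].end >= start && (r[i].flags & bits))
                return &r[i];
        return NULL;
    }

    /* Phase one (btrfs_write_marked_extents): issue writeback range by
     * range, converting the dirty mark to NEED_WAIT so phase two knows
     * exactly which ranges are in flight. */
    static void write_marked(struct range *r, size_t n)
    {
        unsigned long long start = 0;
        struct range *cur;

        while ((cur = first_with_bits(r, n, start, F_DIRTY))) {
            cur->flags = (cur->flags & ~F_DIRTY) | F_NEED_WAIT;
            /* filemap_fdatawrite_range(mapping, cur->start, cur->end) */
            start = cur->end + 1;
        }
    }

    /* Phase two (btrfs_wait_marked_extents): clear NEED_WAIT, then wait. */
    static void wait_marked(struct range *r, size_t n)
    {
        unsigned long long start = 0;
        struct range *cur;

        while ((cur = first_with_bits(r, n, start, F_NEED_WAIT))) {
            cur->flags &= ~F_NEED_WAIT;
            /* filemap_fdatawait_range(mapping, cur->start, cur->end) */
            start = cur->end + 1;
        }
    }
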
@@ -673,7 +634,12 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
673 634
674 ret = btrfs_write_marked_extents(root, dirty_pages, mark); 635 ret = btrfs_write_marked_extents(root, dirty_pages, mark);
675 ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark); 636 ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark);
676 return ret || ret2; 637
638 if (ret)
639 return ret;
640 if (ret2)
641 return ret2;
642 return 0;
677} 643}
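
The old `return ret || ret2` collapsed any failure to 1, which is not a negative errno; returning the first nonzero value keeps the real error code for callers. Reduced to its essence:

    /* "ret || ret2" turns -EIO into 1; a caller comparing against a
     * specific errno, or feeding the value to ERR_PTR(), then sees
     * garbage. Return the first real error instead. */
    static int combine_errors(int ret, int ret2)
    {
        if (ret)
            return ret;
        return ret2;
    }
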
678 644
679int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, 645int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
@@ -911,10 +877,9 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
911 } 877 }
912 878
913 btrfs_reloc_pre_snapshot(trans, pending, &to_reserve); 879 btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
914 btrfs_orphan_pre_snapshot(trans, pending, &to_reserve);
915 880
916 if (to_reserve > 0) { 881 if (to_reserve > 0) {
917 ret = btrfs_block_rsv_add(trans, root, &pending->block_rsv, 882 ret = btrfs_block_rsv_add(root, &pending->block_rsv,
918 to_reserve); 883 to_reserve);
919 if (ret) { 884 if (ret) {
920 pending->error = ret; 885 pending->error = ret;
@@ -1002,7 +967,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
1002 BUG_ON(IS_ERR(pending->snap)); 967 BUG_ON(IS_ERR(pending->snap));
1003 968
1004 btrfs_reloc_post_snapshot(trans, pending); 969 btrfs_reloc_post_snapshot(trans, pending);
1005 btrfs_orphan_post_snapshot(trans, pending);
1006fail: 970fail:
1007 kfree(new_root_item); 971 kfree(new_root_item);
1008 trans->block_rsv = rsv; 972 trans->block_rsv = rsv;
@@ -1032,7 +996,7 @@ static void update_super_roots(struct btrfs_root *root)
1032 struct btrfs_root_item *root_item; 996 struct btrfs_root_item *root_item;
1033 struct btrfs_super_block *super; 997 struct btrfs_super_block *super;
1034 998
1035 super = &root->fs_info->super_copy; 999 super = root->fs_info->super_copy;
1036 1000
1037 root_item = &root->fs_info->chunk_root->root_item; 1001 root_item = &root->fs_info->chunk_root->root_item;
1038 super->chunk_root = root_item->bytenr; 1002 super->chunk_root = root_item->bytenr;
@@ -1043,7 +1007,7 @@ static void update_super_roots(struct btrfs_root *root)
1043 super->root = root_item->bytenr; 1007 super->root = root_item->bytenr;
1044 super->generation = root_item->generation; 1008 super->generation = root_item->generation;
1045 super->root_level = root_item->level; 1009 super->root_level = root_item->level;
1046 if (super->cache_generation != 0 || btrfs_test_opt(root, SPACE_CACHE)) 1010 if (btrfs_test_opt(root, SPACE_CACHE))
1047 super->cache_generation = root_item->generation; 1011 super->cache_generation = root_item->generation;
1048} 1012}
1049 1013
@@ -1168,14 +1132,15 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1168 1132
1169 btrfs_run_ordered_operations(root, 0); 1133 btrfs_run_ordered_operations(root, 0);
1170 1134
1135 btrfs_trans_release_metadata(trans, root);
1136 trans->block_rsv = NULL;
1137
1171 /* make a pass through all the delayed refs we have so far 1138 /* make a pass through all the delayed refs we have so far
1172 * any running procs may add more while we are here 1139
1173 */ 1140 */
1174 ret = btrfs_run_delayed_refs(trans, root, 0); 1141 ret = btrfs_run_delayed_refs(trans, root, 0);
1175 BUG_ON(ret); 1142 BUG_ON(ret);
1176 1143
1177 btrfs_trans_release_metadata(trans, root);
1178
1179 cur_trans = trans->transaction; 1144 cur_trans = trans->transaction;
1180 /* 1145 /*
1181 * set the flushing flag so procs in this transaction have to 1146 * set the flushing flag so procs in this transaction have to
@@ -1341,12 +1306,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1341 update_super_roots(root); 1306 update_super_roots(root);
1342 1307
1343 if (!root->fs_info->log_root_recovering) { 1308 if (!root->fs_info->log_root_recovering) {
1344 btrfs_set_super_log_root(&root->fs_info->super_copy, 0); 1309 btrfs_set_super_log_root(root->fs_info->super_copy, 0);
1345 btrfs_set_super_log_root_level(&root->fs_info->super_copy, 0); 1310 btrfs_set_super_log_root_level(root->fs_info->super_copy, 0);
1346 } 1311 }
1347 1312
1348 memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy, 1313 memcpy(root->fs_info->super_for_commit, root->fs_info->super_copy,
1349 sizeof(root->fs_info->super_copy)); 1314 sizeof(*root->fs_info->super_copy));
1350 1315
1351 trans->transaction->blocked = 0; 1316 trans->transaction->blocked = 0;
1352 spin_lock(&root->fs_info->trans_lock); 1317 spin_lock(&root->fs_info->trans_lock);
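
btrfs_commit_transaction() now releases the handle's unused metadata reservation, and clears trans->block_rsv, before running the delayed refs rather than after; presumably this lets the heavy commit work draw on the returned space. A toy version of the reordering:

    struct pool { long free_bytes; };
    struct txn  { struct pool *pool; long reserved; };

    /* Hand back whatever the transaction reserved but did not use. */
    static void release_metadata(struct txn *t)
    {
        t->pool->free_bytes += t->reserved;
        t->reserved = 0;
    }

    static void commit(struct txn *t)
    {
        release_metadata(t);    /* moved ahead of the delayed-ref run */
        /* run_delayed_refs(), snapshot creation and the rest follow,
         * with the handle's spare reservation already back in the pool. */
    }
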
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 786639fca067..f4d81c06d48f 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -276,8 +276,9 @@ static int process_one_buffer(struct btrfs_root *log,
276 struct walk_control *wc, u64 gen) 276 struct walk_control *wc, u64 gen)
277{ 277{
278 if (wc->pin) 278 if (wc->pin)
279 btrfs_pin_extent(log->fs_info->extent_root, 279 btrfs_pin_extent_for_log_replay(wc->trans,
280 eb->start, eb->len, 0); 280 log->fs_info->extent_root,
281 eb->start, eb->len);
281 282
282 if (btrfs_buffer_uptodate(eb, gen)) { 283 if (btrfs_buffer_uptodate(eb, gen)) {
283 if (wc->write) 284 if (wc->write)
@@ -1760,7 +1761,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1760 1761
1761 WARN_ON(root_owner != 1762 WARN_ON(root_owner !=
1762 BTRFS_TREE_LOG_OBJECTID); 1763 BTRFS_TREE_LOG_OBJECTID);
1763 ret = btrfs_free_reserved_extent(root, 1764 ret = btrfs_free_and_pin_reserved_extent(root,
1764 bytenr, blocksize); 1765 bytenr, blocksize);
1765 BUG_ON(ret); 1766 BUG_ON(ret);
1766 } 1767 }
@@ -1828,7 +1829,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
1828 btrfs_tree_unlock(next); 1829 btrfs_tree_unlock(next);
1829 1830
1830 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); 1831 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
1831 ret = btrfs_free_reserved_extent(root, 1832 ret = btrfs_free_and_pin_reserved_extent(root,
1832 path->nodes[*level]->start, 1833 path->nodes[*level]->start,
1833 path->nodes[*level]->len); 1834 path->nodes[*level]->len);
1834 BUG_ON(ret); 1835 BUG_ON(ret);
@@ -1897,7 +1898,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
1897 1898
1898 WARN_ON(log->root_key.objectid != 1899 WARN_ON(log->root_key.objectid !=
1899 BTRFS_TREE_LOG_OBJECTID); 1900 BTRFS_TREE_LOG_OBJECTID);
1900 ret = btrfs_free_reserved_extent(log, next->start, 1901 ret = btrfs_free_and_pin_reserved_extent(log, next->start,
1901 next->len); 1902 next->len);
1902 BUG_ON(ret); 1903 BUG_ON(ret);
1903 } 1904 }
@@ -2013,10 +2014,10 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2013 /* wait for previous tree log sync to complete */ 2014 /* wait for previous tree log sync to complete */
2014 if (atomic_read(&root->log_commit[(index1 + 1) % 2])) 2015 if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
2015 wait_log_commit(trans, root, root->log_transid - 1); 2016 wait_log_commit(trans, root, root->log_transid - 1);
2016
2017 while (1) { 2017 while (1) {
2018 unsigned long batch = root->log_batch; 2018 unsigned long batch = root->log_batch;
2019 if (root->log_multiple_pids) { 2019 /* when we're on an ssd, just kick the log commit out */
2020 if (!btrfs_test_opt(root, SSD) && root->log_multiple_pids) {
2020 mutex_unlock(&root->log_mutex); 2021 mutex_unlock(&root->log_mutex);
2021 schedule_timeout_uninterruptible(1); 2022 schedule_timeout_uninterruptible(1);
2022 mutex_lock(&root->log_mutex); 2023 mutex_lock(&root->log_mutex);
@@ -2117,9 +2118,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2117 BUG_ON(ret); 2118 BUG_ON(ret);
2118 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); 2119 btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
2119 2120
2120 btrfs_set_super_log_root(&root->fs_info->super_for_commit, 2121 btrfs_set_super_log_root(root->fs_info->super_for_commit,
2121 log_root_tree->node->start); 2122 log_root_tree->node->start);
2122 btrfs_set_super_log_root_level(&root->fs_info->super_for_commit, 2123 btrfs_set_super_log_root_level(root->fs_info->super_for_commit,
2123 btrfs_header_level(log_root_tree->node)); 2124 btrfs_header_level(log_root_tree->node));
2124 2125
2125 log_root_tree->log_batch = 0; 2126 log_root_tree->log_batch = 0;
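
The log-commit hunk adds an SSD test to the batching heuristic: on rotating media a writer that sees other PIDs in the log yields for a tick so more of them pile into one commit, while on an SSD the commit is cheap enough that, per the added comment, it is simply kicked out. The predicate, isolated:

    #include <stdbool.h>

    /* Batching writers into one log commit: worth a short sleep only on
     * spinning media, and only when other PIDs are actually writing. */
    struct log_state { unsigned long batch; bool multiple_pids; bool ssd; };

    static bool should_wait_for_batch(const struct log_state *s)
    {
        return !s->ssd && s->multiple_pids;
    }
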
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 18baac5a3f6c..f8e2943101a1 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -366,6 +366,14 @@ static noinline int device_list_add(const char *path,
366 } 366 }
367 INIT_LIST_HEAD(&device->dev_alloc_list); 367 INIT_LIST_HEAD(&device->dev_alloc_list);
368 368
369 /* init readahead state */
370 spin_lock_init(&device->reada_lock);
371 device->reada_curr_zone = NULL;
372 atomic_set(&device->reada_in_flight, 0);
373 device->reada_next = 0;
374 INIT_RADIX_TREE(&device->reada_zones, GFP_NOFS & ~__GFP_WAIT);
375 INIT_RADIX_TREE(&device->reada_extents, GFP_NOFS & ~__GFP_WAIT);
376
369 mutex_lock(&fs_devices->device_list_mutex); 377 mutex_lock(&fs_devices->device_list_mutex);
370 list_add_rcu(&device->dev_list, &fs_devices->devices); 378 list_add_rcu(&device->dev_list, &fs_devices->devices);
371 mutex_unlock(&fs_devices->device_list_mutex); 379 mutex_unlock(&fs_devices->device_list_mutex);
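
Each device now carries readahead state for the new reada.c, initialized before the device is published on the RCU list; the radix trees take GFP_NOFS with __GFP_WAIT masked off, presumably because insertions happen under reada_lock and must not sleep. A userspace analogue of giving every field a sane empty state up front, with the radix trees reduced to a pointer:

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stddef.h>

    struct reada_zone;

    /* Per-device readahead bookkeeping, mirroring the fields added to
     * struct btrfs_device for reada.c. */
    struct device_reada {
        pthread_spinlock_t lock;
        atomic_int in_flight;
        unsigned long long next;
        struct reada_zone *curr_zone;
    };

    static int device_reada_init(struct device_reada *r)
    {
        int err = pthread_spin_init(&r->lock, PTHREAD_PROCESS_PRIVATE);

        if (err)
            return err;
        atomic_init(&r->in_flight, 0);
        r->next = 0;
        r->curr_zone = NULL;
        return 0;
    }
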
@@ -597,10 +605,8 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
597 set_blocksize(bdev, 4096); 605 set_blocksize(bdev, 4096);
598 606
599 bh = btrfs_read_dev_super(bdev); 607 bh = btrfs_read_dev_super(bdev);
600 if (!bh) { 608 if (!bh)
601 ret = -EINVAL;
602 goto error_close; 609 goto error_close;
603 }
604 610
605 disk_super = (struct btrfs_super_block *)bh->b_data; 611 disk_super = (struct btrfs_super_block *)bh->b_data;
606 devid = btrfs_stack_device_id(&disk_super->dev_item); 612 devid = btrfs_stack_device_id(&disk_super->dev_item);
@@ -655,7 +661,7 @@ error:
655 continue; 661 continue;
656 } 662 }
657 if (fs_devices->open_devices == 0) { 663 if (fs_devices->open_devices == 0) {
658 ret = -EIO; 664 ret = -EINVAL;
659 goto out; 665 goto out;
660 } 666 }
661 fs_devices->seeding = seeding; 667 fs_devices->seeding = seeding;
@@ -1013,8 +1019,13 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
1013 } 1019 }
1014 BUG_ON(ret); 1020 BUG_ON(ret);
1015 1021
1016 if (device->bytes_used > 0) 1022 if (device->bytes_used > 0) {
1017 device->bytes_used -= btrfs_dev_extent_length(leaf, extent); 1023 u64 len = btrfs_dev_extent_length(leaf, extent);
1024 device->bytes_used -= len;
1025 spin_lock(&root->fs_info->free_chunk_lock);
1026 root->fs_info->free_chunk_space += len;
1027 spin_unlock(&root->fs_info->free_chunk_lock);
1028 }
1018 ret = btrfs_del_item(trans, root, path); 1029 ret = btrfs_del_item(trans, root, path);
1019 1030
1020out: 1031out:
@@ -1356,6 +1367,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1356 if (ret) 1367 if (ret)
1357 goto error_undo; 1368 goto error_undo;
1358 1369
1370 spin_lock(&root->fs_info->free_chunk_lock);
1371 root->fs_info->free_chunk_space = device->total_bytes -
1372 device->bytes_used;
1373 spin_unlock(&root->fs_info->free_chunk_lock);
1374
1359 device->in_fs_metadata = 0; 1375 device->in_fs_metadata = 0;
1360 btrfs_scrub_cancel_dev(root, device); 1376 btrfs_scrub_cancel_dev(root, device);
1361 1377
@@ -1387,8 +1403,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1387 call_rcu(&device->rcu, free_device); 1403 call_rcu(&device->rcu, free_device);
1388 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 1404 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1389 1405
1390 num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1; 1406 num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
1391 btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices); 1407 btrfs_set_super_num_devices(root->fs_info->super_copy, num_devices);
1392 1408
1393 if (cur_devices->open_devices == 0) { 1409 if (cur_devices->open_devices == 0) {
1394 struct btrfs_fs_devices *fs_devices; 1410 struct btrfs_fs_devices *fs_devices;
@@ -1450,7 +1466,7 @@ static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans,
1450 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; 1466 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
1451 struct btrfs_fs_devices *old_devices; 1467 struct btrfs_fs_devices *old_devices;
1452 struct btrfs_fs_devices *seed_devices; 1468 struct btrfs_fs_devices *seed_devices;
1453 struct btrfs_super_block *disk_super = &root->fs_info->super_copy; 1469 struct btrfs_super_block *disk_super = root->fs_info->super_copy;
1454 struct btrfs_device *device; 1470 struct btrfs_device *device;
1455 u64 super_flags; 1471 u64 super_flags;
1456 1472
@@ -1691,15 +1707,19 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1691 root->fs_info->fs_devices->num_can_discard++; 1707 root->fs_info->fs_devices->num_can_discard++;
1692 root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; 1708 root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
1693 1709
1710 spin_lock(&root->fs_info->free_chunk_lock);
1711 root->fs_info->free_chunk_space += device->total_bytes;
1712 spin_unlock(&root->fs_info->free_chunk_lock);
1713
1694 if (!blk_queue_nonrot(bdev_get_queue(bdev))) 1714 if (!blk_queue_nonrot(bdev_get_queue(bdev)))
1695 root->fs_info->fs_devices->rotating = 1; 1715 root->fs_info->fs_devices->rotating = 1;
1696 1716
1697 total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy); 1717 total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy);
1698 btrfs_set_super_total_bytes(&root->fs_info->super_copy, 1718 btrfs_set_super_total_bytes(root->fs_info->super_copy,
1699 total_bytes + device->total_bytes); 1719 total_bytes + device->total_bytes);
1700 1720
1701 total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy); 1721 total_bytes = btrfs_super_num_devices(root->fs_info->super_copy);
1702 btrfs_set_super_num_devices(&root->fs_info->super_copy, 1722 btrfs_set_super_num_devices(root->fs_info->super_copy,
1703 total_bytes + 1); 1723 total_bytes + 1);
1704 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); 1724 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1705 1725
@@ -1790,7 +1810,7 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
1790 struct btrfs_device *device, u64 new_size) 1810 struct btrfs_device *device, u64 new_size)
1791{ 1811{
1792 struct btrfs_super_block *super_copy = 1812 struct btrfs_super_block *super_copy =
1793 &device->dev_root->fs_info->super_copy; 1813 device->dev_root->fs_info->super_copy;
1794 u64 old_total = btrfs_super_total_bytes(super_copy); 1814 u64 old_total = btrfs_super_total_bytes(super_copy);
1795 u64 diff = new_size - device->total_bytes; 1815 u64 diff = new_size - device->total_bytes;
1796 1816
@@ -1849,7 +1869,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
1849static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64 1869static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
1850 chunk_offset) 1870 chunk_offset)
1851{ 1871{
1852 struct btrfs_super_block *super_copy = &root->fs_info->super_copy; 1872 struct btrfs_super_block *super_copy = root->fs_info->super_copy;
1853 struct btrfs_disk_key *disk_key; 1873 struct btrfs_disk_key *disk_key;
1854 struct btrfs_chunk *chunk; 1874 struct btrfs_chunk *chunk;
1855 u8 *ptr; 1875 u8 *ptr;
@@ -2175,7 +2195,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
2175 bool retried = false; 2195 bool retried = false;
2176 struct extent_buffer *l; 2196 struct extent_buffer *l;
2177 struct btrfs_key key; 2197 struct btrfs_key key;
2178 struct btrfs_super_block *super_copy = &root->fs_info->super_copy; 2198 struct btrfs_super_block *super_copy = root->fs_info->super_copy;
2179 u64 old_total = btrfs_super_total_bytes(super_copy); 2199 u64 old_total = btrfs_super_total_bytes(super_copy);
2180 u64 old_size = device->total_bytes; 2200 u64 old_size = device->total_bytes;
2181 u64 diff = device->total_bytes - new_size; 2201 u64 diff = device->total_bytes - new_size;
@@ -2192,8 +2212,12 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
2192 lock_chunks(root); 2212 lock_chunks(root);
2193 2213
2194 device->total_bytes = new_size; 2214 device->total_bytes = new_size;
2195 if (device->writeable) 2215 if (device->writeable) {
2196 device->fs_devices->total_rw_bytes -= diff; 2216 device->fs_devices->total_rw_bytes -= diff;
2217 spin_lock(&root->fs_info->free_chunk_lock);
2218 root->fs_info->free_chunk_space -= diff;
2219 spin_unlock(&root->fs_info->free_chunk_lock);
2220 }
2197 unlock_chunks(root); 2221 unlock_chunks(root);
2198 2222
2199again: 2223again:
@@ -2257,6 +2281,9 @@ again:
2257 device->total_bytes = old_size; 2281 device->total_bytes = old_size;
2258 if (device->writeable) 2282 if (device->writeable)
2259 device->fs_devices->total_rw_bytes += diff; 2283 device->fs_devices->total_rw_bytes += diff;
2284 spin_lock(&root->fs_info->free_chunk_lock);
2285 root->fs_info->free_chunk_space += diff;
2286 spin_unlock(&root->fs_info->free_chunk_lock);
2260 unlock_chunks(root); 2287 unlock_chunks(root);
2261 goto done; 2288 goto done;
2262 } 2289 }
@@ -2292,7 +2319,7 @@ static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
2292 struct btrfs_key *key, 2319 struct btrfs_key *key,
2293 struct btrfs_chunk *chunk, int item_size) 2320 struct btrfs_chunk *chunk, int item_size)
2294{ 2321{
2295 struct btrfs_super_block *super_copy = &root->fs_info->super_copy; 2322 struct btrfs_super_block *super_copy = root->fs_info->super_copy;
2296 struct btrfs_disk_key disk_key; 2323 struct btrfs_disk_key disk_key;
2297 u32 array_size; 2324 u32 array_size;
2298 u8 *ptr; 2325 u8 *ptr;
@@ -2615,6 +2642,11 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
2615 index++; 2642 index++;
2616 } 2643 }
2617 2644
2645 spin_lock(&extent_root->fs_info->free_chunk_lock);
2646 extent_root->fs_info->free_chunk_space -= (stripe_size *
2647 map->num_stripes);
2648 spin_unlock(&extent_root->fs_info->free_chunk_lock);
2649
2618 index = 0; 2650 index = 0;
2619 stripe = &chunk->stripe; 2651 stripe = &chunk->stripe;
2620 while (index < map->num_stripes) { 2652 while (index < map->num_stripes) {
@@ -3626,15 +3658,20 @@ static int read_one_dev(struct btrfs_root *root,
3626 fill_device_from_item(leaf, dev_item, device); 3658 fill_device_from_item(leaf, dev_item, device);
3627 device->dev_root = root->fs_info->dev_root; 3659 device->dev_root = root->fs_info->dev_root;
3628 device->in_fs_metadata = 1; 3660 device->in_fs_metadata = 1;
3629 if (device->writeable) 3661 if (device->writeable) {
3630 device->fs_devices->total_rw_bytes += device->total_bytes; 3662 device->fs_devices->total_rw_bytes += device->total_bytes;
3663 spin_lock(&root->fs_info->free_chunk_lock);
3664 root->fs_info->free_chunk_space += device->total_bytes -
3665 device->bytes_used;
3666 spin_unlock(&root->fs_info->free_chunk_lock);
3667 }
3631 ret = 0; 3668 ret = 0;
3632 return ret; 3669 return ret;
3633} 3670}
3634 3671
3635int btrfs_read_sys_array(struct btrfs_root *root) 3672int btrfs_read_sys_array(struct btrfs_root *root)
3636{ 3673{
3637 struct btrfs_super_block *super_copy = &root->fs_info->super_copy; 3674 struct btrfs_super_block *super_copy = root->fs_info->super_copy;
3638 struct extent_buffer *sb; 3675 struct extent_buffer *sb;
3639 struct btrfs_disk_key *disk_key; 3676 struct btrfs_disk_key *disk_key;
3640 struct btrfs_chunk *chunk; 3677 struct btrfs_chunk *chunk;
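
Every hunk in this file that changes how many bytes a device has allocated also adjusts fs_info->free_chunk_space under free_chunk_lock: up when a device is added or a dev extent is freed, down when chunks are carved out or a device shrinks. The counter discipline, as a minimal sketch (what consumes the counter, likely the statfs-side space estimate, sits outside this diff):

    #include <pthread.h>

    /* A shared byte counter updated under its own spinlock, as
     * fs_info->free_chunk_space is in the hunks above. */
    struct space {
        pthread_spinlock_t lock;
        unsigned long long free_bytes;
    };

    static void space_add(struct space *s, unsigned long long n)
    {
        pthread_spin_lock(&s->lock);
        s->free_bytes += n;        /* device added / dev extent freed */
        pthread_spin_unlock(&s->lock);
    }

    static void space_sub(struct space *s, unsigned long long n)
    {
        pthread_spin_lock(&s->lock);
        s->free_bytes -= n;        /* chunk allocated / device shrunk */
        pthread_spin_unlock(&s->lock);
    }
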
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 71f4f3f67495..ab5b1c49f352 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -92,6 +92,14 @@ struct btrfs_device {
92 struct btrfs_work work; 92 struct btrfs_work work;
93 struct rcu_head rcu; 93 struct rcu_head rcu;
94 struct work_struct rcu_work; 94 struct work_struct rcu_work;
95
96 /* readahead state */
97 spinlock_t reada_lock;
98 atomic_t reada_in_flight;
99 u64 reada_next;
100 struct reada_zone *reada_curr_zone;
101 struct radix_tree_root reada_zones;
102 struct radix_tree_root reada_extents;
95}; 103};
96 104
97struct btrfs_fs_devices { 105struct btrfs_fs_devices {
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 69565e5fc6a0..a76e41c04b71 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -127,6 +127,17 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
127again: 127again:
128 ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode), 128 ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode),
129 name, name_len, value, size); 129 name, name_len, value, size);
130 /*
131 * If we're setting an xattr to a new value but the new value is say
132 * exactly BTRFS_MAX_XATTR_SIZE, we could end up with EOVERFLOW getting
133 * back from split_leaf. This is because it thinks we'll be extending
134 * the existing item size, but we're asking for enough space to add the
135 * item itself. So if we get EOVERFLOW just set ret to EEXIST and let
136 * the rest of the function figure it out.
137 */
138 if (ret == -EOVERFLOW)
139 ret = -EEXIST;
140
130 if (ret == -EEXIST) { 141 if (ret == -EEXIST) {
131 if (flags & XATTR_CREATE) 142 if (flags & XATTR_CREATE)
132 goto out; 143 goto out;
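
The comment explains the quirk; operationally the insert path now reports EEXIST both for a genuine duplicate and for a maximum-sized replacement that tripped split_leaf's EOVERFLOW, and the caller's flags decide what happens next. A sketch of that dispatch, using the standard XATTR_CREATE value but an invented function name:

    #include <errno.h>

    #define XATTR_CREATE 0x1   /* standard "fail if the attr exists" flag */

    static int handle_insert_result(int ret, int flags)
    {
        if (ret == -EOVERFLOW)             /* max-sized value hit split_leaf */
            ret = -EEXIST;
        if (ret == -EEXIST && (flags & XATTR_CREATE))
            return -EEXIST;                /* caller wanted a fresh xattr */
        if (ret == -EEXIST)
            return 0;                      /* proceed to the replace path */
        return ret;
    }
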
diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c
index e76bfeb68267..30acd22147e1 100644
--- a/fs/cifs/cifsencrypt.c
+++ b/fs/cifs/cifsencrypt.c
@@ -351,9 +351,7 @@ static int
351build_avpair_blob(struct cifs_ses *ses, const struct nls_table *nls_cp) 351build_avpair_blob(struct cifs_ses *ses, const struct nls_table *nls_cp)
352{ 352{
353 unsigned int dlen; 353 unsigned int dlen;
354 unsigned int wlen; 354 unsigned int size = 2 * sizeof(struct ntlmssp2_name);
355 unsigned int size = 6 * sizeof(struct ntlmssp2_name);
356 __le64 curtime;
357 char *defdmname = "WORKGROUP"; 355 char *defdmname = "WORKGROUP";
358 unsigned char *blobptr; 356 unsigned char *blobptr;
359 struct ntlmssp2_name *attrptr; 357 struct ntlmssp2_name *attrptr;
@@ -365,15 +363,14 @@ build_avpair_blob(struct cifs_ses *ses, const struct nls_table *nls_cp)
365 } 363 }
366 364
367 dlen = strlen(ses->domainName); 365 dlen = strlen(ses->domainName);
368 wlen = strlen(ses->server->hostname);
369 366
370 /* The length of this blob is a size which is 367 /*
371 * six times the size of a structure which holds name/size + 368 * The length of this blob is two times the size of a
372 * two times the unicode length of a domain name + 369 * structure (av pair) which holds name/size
373 * two times the unicode length of a server name + 370 * ( for NTLMSSP_AV_NB_DOMAIN_NAME followed by NTLMSSP_AV_EOL ) +
374 * size of a timestamp (which is 8 bytes). 371 * unicode length of a netbios domain name
375 */ 372 */
376 ses->auth_key.len = size + 2 * (2 * dlen) + 2 * (2 * wlen) + 8; 373 ses->auth_key.len = size + 2 * dlen;
377 ses->auth_key.response = kzalloc(ses->auth_key.len, GFP_KERNEL); 374 ses->auth_key.response = kzalloc(ses->auth_key.len, GFP_KERNEL);
378 if (!ses->auth_key.response) { 375 if (!ses->auth_key.response) {
379 ses->auth_key.len = 0; 376 ses->auth_key.len = 0;
@@ -384,44 +381,15 @@ build_avpair_blob(struct cifs_ses *ses, const struct nls_table *nls_cp)
384 blobptr = ses->auth_key.response; 381 blobptr = ses->auth_key.response;
385 attrptr = (struct ntlmssp2_name *) blobptr; 382 attrptr = (struct ntlmssp2_name *) blobptr;
386 383
384 /*
385 * As defined in MS-NTLM 3.3.2, just this av pair field
386 * is sufficient as part of the temp
387 */
387 attrptr->type = cpu_to_le16(NTLMSSP_AV_NB_DOMAIN_NAME); 388 attrptr->type = cpu_to_le16(NTLMSSP_AV_NB_DOMAIN_NAME);
388 attrptr->length = cpu_to_le16(2 * dlen); 389 attrptr->length = cpu_to_le16(2 * dlen);
389 blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name); 390 blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name);
390 cifs_strtoUCS((__le16 *)blobptr, ses->domainName, dlen, nls_cp); 391 cifs_strtoUCS((__le16 *)blobptr, ses->domainName, dlen, nls_cp);
391 392
392 blobptr += 2 * dlen;
393 attrptr = (struct ntlmssp2_name *) blobptr;
394
395 attrptr->type = cpu_to_le16(NTLMSSP_AV_NB_COMPUTER_NAME);
396 attrptr->length = cpu_to_le16(2 * wlen);
397 blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name);
398 cifs_strtoUCS((__le16 *)blobptr, ses->server->hostname, wlen, nls_cp);
399
400 blobptr += 2 * wlen;
401 attrptr = (struct ntlmssp2_name *) blobptr;
402
403 attrptr->type = cpu_to_le16(NTLMSSP_AV_DNS_DOMAIN_NAME);
404 attrptr->length = cpu_to_le16(2 * dlen);
405 blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name);
406 cifs_strtoUCS((__le16 *)blobptr, ses->domainName, dlen, nls_cp);
407
408 blobptr += 2 * dlen;
409 attrptr = (struct ntlmssp2_name *) blobptr;
410
411 attrptr->type = cpu_to_le16(NTLMSSP_AV_DNS_COMPUTER_NAME);
412 attrptr->length = cpu_to_le16(2 * wlen);
413 blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name);
414 cifs_strtoUCS((__le16 *)blobptr, ses->server->hostname, wlen, nls_cp);
415
416 blobptr += 2 * wlen;
417 attrptr = (struct ntlmssp2_name *) blobptr;
418
419 attrptr->type = cpu_to_le16(NTLMSSP_AV_TIMESTAMP);
420 attrptr->length = cpu_to_le16(sizeof(__le64));
421 blobptr = (unsigned char *)attrptr + sizeof(struct ntlmssp2_name);
422 curtime = cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME));
423 memcpy(blobptr, &curtime, sizeof(__le64));
424
425 return 0; 393 return 0;
426} 394}
427 395
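
The rewritten blob is just one NTLMSSP_AV_NB_DOMAIN_NAME pair plus the terminating AV_EOL pair that the size calculation reserves and kzalloc leaves zeroed, which the cited spec text says is sufficient for the temp. A userspace reconstruction of that layout; the av-pair type value follows MS-NLMP, and the ASCII loop stands in for cifs_strtoUCS():

    #include <stdint.h>
    #include <stdlib.h>
    #include <string.h>

    #define AV_EOL            0
    #define AV_NB_DOMAIN_NAME 2   /* MsvAvNbDomainName in MS-NLMP */

    struct av_pair {
        uint16_t type;    /* little-endian on the wire */
        uint16_t length;  /* bytes of value that follow */
    };

    static unsigned char *build_blob(const char *domain, size_t *out_len)
    {
        size_t dlen = strlen(domain);
        size_t len = 2 * sizeof(struct av_pair) + 2 * dlen;
        unsigned char *blob = calloc(1, len);
        struct av_pair *av = (struct av_pair *)blob;

        if (!blob)
            return NULL;
        av->type = AV_NB_DOMAIN_NAME;         /* cpu_to_le16() in the kernel */
        av->length = (uint16_t)(2 * dlen);
        for (size_t i = 0; i < dlen; i++) {   /* naive ASCII -> UTF-16LE */
            blob[sizeof(*av) + 2 * i] = (unsigned char)domain[i];
            blob[sizeof(*av) + 2 * i + 1] = 0;
        }
        /* the zeroed tail already encodes AV_EOL: type 0, length 0 */
        *out_len = len;
        return blob;
    }
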
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index f93eb948d071..54b8f1e7da94 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -548,6 +548,12 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb)
548 struct inode *dir = dentry->d_inode; 548 struct inode *dir = dentry->d_inode;
549 struct dentry *child; 549 struct dentry *child;
550 550
551 if (!dir) {
552 dput(dentry);
553 dentry = ERR_PTR(-ENOENT);
554 break;
555 }
556
551 /* skip separators */ 557 /* skip separators */
552 while (*s == sep) 558 while (*s == sep)
553 s++; 559 s++;
@@ -563,10 +569,6 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb)
563 mutex_unlock(&dir->i_mutex); 569 mutex_unlock(&dir->i_mutex);
564 dput(dentry); 570 dput(dentry);
565 dentry = child; 571 dentry = child;
566 if (!dentry->d_inode) {
567 dput(dentry);
568 dentry = ERR_PTR(-ENOENT);
569 }
570 } while (!IS_ERR(dentry)); 572 } while (!IS_ERR(dentry));
571 _FreeXid(xid); 573 _FreeXid(xid);
572 kfree(full_path); 574 kfree(full_path);
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index aac37d99a487..a80f7bd97b90 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -4079,7 +4079,8 @@ int CIFSFindNext(const int xid, struct cifs_tcon *tcon,
4079 T2_FNEXT_RSP_PARMS *parms; 4079 T2_FNEXT_RSP_PARMS *parms;
4080 char *response_data; 4080 char *response_data;
4081 int rc = 0; 4081 int rc = 0;
4082 int bytes_returned, name_len; 4082 int bytes_returned;
4083 unsigned int name_len;
4083 __u16 params, byte_count; 4084 __u16 params, byte_count;
4084 4085
4085 cFYI(1, "In FindNext"); 4086 cFYI(1, "In FindNext");
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 633c246b6775..71beb0201970 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1298,7 +1298,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
1298 /* ignore */ 1298 /* ignore */
1299 } else if (strnicmp(data, "guest", 5) == 0) { 1299 } else if (strnicmp(data, "guest", 5) == 0) {
1300 /* ignore */ 1300 /* ignore */
1301 } else if (strnicmp(data, "rw", 2) == 0) { 1301 } else if (strnicmp(data, "rw", 2) == 0 && strlen(data) == 2) {
1302 /* ignore */ 1302 /* ignore */
1303 } else if (strnicmp(data, "ro", 2) == 0) { 1303 } else if (strnicmp(data, "ro", 2) == 0) {
1304 /* ignore */ 1304 /* ignore */
@@ -1401,7 +1401,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
1401 vol->server_ino = 1; 1401 vol->server_ino = 1;
1402 } else if (strnicmp(data, "noserverino", 9) == 0) { 1402 } else if (strnicmp(data, "noserverino", 9) == 0) {
1403 vol->server_ino = 0; 1403 vol->server_ino = 0;
1404 } else if (strnicmp(data, "rwpidforward", 4) == 0) { 1404 } else if (strnicmp(data, "rwpidforward", 12) == 0) {
1405 vol->rwpidforward = 1; 1405 vol->rwpidforward = 1;
1406 } else if (strnicmp(data, "cifsacl", 7) == 0) { 1406 } else if (strnicmp(data, "cifsacl", 7) == 0) {
1407 vol->cifs_acl = 1; 1407 vol->cifs_acl = 1;
@@ -2018,7 +2018,7 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info)
2018 warned_on_ntlm = true; 2018 warned_on_ntlm = true;
2019 cERROR(1, "default security mechanism requested. The default " 2019 cERROR(1, "default security mechanism requested. The default "
2020 "security mechanism will be upgraded from ntlm to " 2020 "security mechanism will be upgraded from ntlm to "
2021 "ntlmv2 in kernel release 3.1"); 2021 "ntlmv2 in kernel release 3.2");
2022 } 2022 }
2023 ses->overrideSecFlg = volume_info->secFlg; 2023 ses->overrideSecFlg = volume_info->secFlg;
2024 2024
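
The two option fixes above are the same bug in opposite directions: matching "rw" with length 2 also accepted "rwpidforward", and matching "rwpidforward" with length 4 accepted anything starting with "rwpi". The kernel keeps strnicmp and patches the lengths; a generalized token matcher shows why the comparison length must be the full token and the input must end there:

    #include <stdio.h>
    #include <string.h>
    #include <strings.h>

    static int token_is(const char *data, const char *tok)
    {
        size_t n = strlen(tok);

        return strncasecmp(data, tok, n) == 0 && strlen(data) == n;
    }

    int main(void)
    {
        printf("%d\n", token_is("rwpidforward", "rw"));  /* 0: no false match */
        printf("%d\n", token_is("rw", "rw"));            /* 1 */
        return 0;
    }
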
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 04da6acde85d..12661e1deedd 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1134,7 +1134,7 @@ struct buffer_head *ext3_bread(handle_t *handle, struct inode *inode,
1134 return bh; 1134 return bh;
1135 if (buffer_uptodate(bh)) 1135 if (buffer_uptodate(bh))
1136 return bh; 1136 return bh;
1137 ll_rw_block(READ_META, 1, &bh); 1137 ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &bh);
1138 wait_on_buffer(bh); 1138 wait_on_buffer(bh);
1139 if (buffer_uptodate(bh)) 1139 if (buffer_uptodate(bh))
1140 return bh; 1140 return bh;
@@ -2807,7 +2807,7 @@ make_io:
2807 trace_ext3_load_inode(inode); 2807 trace_ext3_load_inode(inode);
2808 get_bh(bh); 2808 get_bh(bh);
2809 bh->b_end_io = end_buffer_read_sync; 2809 bh->b_end_io = end_buffer_read_sync;
2810 submit_bh(READ_META, bh); 2810 submit_bh(READ | REQ_META | REQ_PRIO, bh);
2811 wait_on_buffer(bh); 2811 wait_on_buffer(bh);
2812 if (!buffer_uptodate(bh)) { 2812 if (!buffer_uptodate(bh)) {
2813 ext3_error(inode->i_sb, "ext3_get_inode_loc", 2813 ext3_error(inode->i_sb, "ext3_get_inode_loc",
diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c
index 5571708b6a58..0629e09f6511 100644
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -922,7 +922,8 @@ restart:
922 bh = ext3_getblk(NULL, dir, b++, 0, &err); 922 bh = ext3_getblk(NULL, dir, b++, 0, &err);
923 bh_use[ra_max] = bh; 923 bh_use[ra_max] = bh;
924 if (bh) 924 if (bh)
925 ll_rw_block(READ_META, 1, &bh); 925 ll_rw_block(READ | REQ_META | REQ_PRIO,
926 1, &bh);
926 } 927 }
927 } 928 }
928 if ((bh = bh_use[ra_ptr++]) == NULL) 929 if ((bh = bh_use[ra_ptr++]) == NULL)
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 18d2558b7624..986e2388f031 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -647,7 +647,7 @@ struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode,
647 return bh; 647 return bh;
648 if (buffer_uptodate(bh)) 648 if (buffer_uptodate(bh))
649 return bh; 649 return bh;
650 ll_rw_block(READ_META, 1, &bh); 650 ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &bh);
651 wait_on_buffer(bh); 651 wait_on_buffer(bh);
652 if (buffer_uptodate(bh)) 652 if (buffer_uptodate(bh))
653 return bh; 653 return bh;
@@ -3298,7 +3298,7 @@ make_io:
3298 trace_ext4_load_inode(inode); 3298 trace_ext4_load_inode(inode);
3299 get_bh(bh); 3299 get_bh(bh);
3300 bh->b_end_io = end_buffer_read_sync; 3300 bh->b_end_io = end_buffer_read_sync;
3301 submit_bh(READ_META, bh); 3301 submit_bh(READ | REQ_META | REQ_PRIO, bh);
3302 wait_on_buffer(bh); 3302 wait_on_buffer(bh);
3303 if (!buffer_uptodate(bh)) { 3303 if (!buffer_uptodate(bh)) {
3304 EXT4_ERROR_INODE_BLOCK(inode, block, 3304 EXT4_ERROR_INODE_BLOCK(inode, block,
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index f8068c7bae9f..1c924faeb6c8 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -922,7 +922,8 @@ restart:
922 bh = ext4_getblk(NULL, dir, b++, 0, &err); 922 bh = ext4_getblk(NULL, dir, b++, 0, &err);
923 bh_use[ra_max] = bh; 923 bh_use[ra_max] = bh;
924 if (bh) 924 if (bh)
925 ll_rw_block(READ_META, 1, &bh); 925 ll_rw_block(READ | REQ_META | REQ_PRIO,
926 1, &bh);
926 } 927 }
927 } 928 }
928 if ((bh = bh_use[ra_ptr++]) == NULL) 929 if ((bh = bh_use[ra_ptr++]) == NULL)
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 85c62923ee29..598646434362 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -624,9 +624,9 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull)
624 bh->b_end_io = end_buffer_write_sync; 624 bh->b_end_io = end_buffer_write_sync;
625 get_bh(bh); 625 get_bh(bh);
626 if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags)) 626 if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags))
627 submit_bh(WRITE_SYNC | REQ_META, bh); 627 submit_bh(WRITE_SYNC | REQ_META | REQ_PRIO, bh);
628 else 628 else
629 submit_bh(WRITE_FLUSH_FUA | REQ_META, bh); 629 submit_bh(WRITE_FLUSH_FUA | REQ_META | REQ_PRIO, bh);
630 wait_on_buffer(bh); 630 wait_on_buffer(bh);
631 631
632 if (!buffer_uptodate(bh)) 632 if (!buffer_uptodate(bh))
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 747238cd9f96..be29858900f6 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -37,7 +37,7 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb
37{ 37{
38 struct buffer_head *bh, *head; 38 struct buffer_head *bh, *head;
39 int nr_underway = 0; 39 int nr_underway = 0;
40 int write_op = REQ_META | 40 int write_op = REQ_META | REQ_PRIO |
41 (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE); 41 (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
42 42
43 BUG_ON(!PageLocked(page)); 43 BUG_ON(!PageLocked(page));
@@ -225,7 +225,7 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
225 } 225 }
226 bh->b_end_io = end_buffer_read_sync; 226 bh->b_end_io = end_buffer_read_sync;
227 get_bh(bh); 227 get_bh(bh);
228 submit_bh(READ_SYNC | REQ_META, bh); 228 submit_bh(READ_SYNC | REQ_META | REQ_PRIO, bh);
229 if (!(flags & DIO_WAIT)) 229 if (!(flags & DIO_WAIT))
230 return 0; 230 return 0;
231 231
@@ -435,7 +435,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen)
435 if (buffer_uptodate(first_bh)) 435 if (buffer_uptodate(first_bh))
436 goto out; 436 goto out;
437 if (!buffer_locked(first_bh)) 437 if (!buffer_locked(first_bh))
438 ll_rw_block(READ_SYNC | REQ_META, 1, &first_bh); 438 ll_rw_block(READ_SYNC | REQ_META | REQ_PRIO, 1, &first_bh);
439 439
440 dblock++; 440 dblock++;
441 extlen--; 441 extlen--;
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 3bc073a4cf82..079587e53849 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -224,7 +224,7 @@ static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector, int silent)
224 224
225 bio->bi_end_io = end_bio_io_page; 225 bio->bi_end_io = end_bio_io_page;
226 bio->bi_private = page; 226 bio->bi_private = page;
227 submit_bio(READ_SYNC | REQ_META, bio); 227 submit_bio(READ_SYNC | REQ_META | REQ_PRIO, bio);
228 wait_on_page_locked(page); 228 wait_on_page_locked(page);
229 bio_put(bio); 229 bio_put(bio);
230 if (!PageUptodate(page)) { 230 if (!PageUptodate(page)) {
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 42e8d23bc047..0e8bb13381e4 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -709,7 +709,7 @@ get_a_page:
709 set_buffer_uptodate(bh); 709 set_buffer_uptodate(bh);
710 710
711 if (!buffer_uptodate(bh)) { 711 if (!buffer_uptodate(bh)) {
712 ll_rw_block(READ_META, 1, &bh); 712 ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &bh);
713 wait_on_buffer(bh); 713 wait_on_buffer(bh);
714 if (!buffer_uptodate(bh)) 714 if (!buffer_uptodate(bh))
715 goto unlock_out; 715 goto unlock_out;
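
The READ_META replacement running through ext3, ext4 and gfs2 splits what used to be one macro into explicit bits: REQ_META stays a pure annotation that the request carries metadata, and REQ_PRIO separately asks the I/O scheduler for priority treatment. A toy of the flag composition (the values here are illustrative, not the kernel's):

    #include <stdio.h>

    #define REQ_READ  0x0u        /* READ is 0 in the block layer too */
    #define REQ_META  (1u << 0)   /* "this is metadata" annotation */
    #define REQ_PRIO  (1u << 1)   /* "boost this" for the scheduler */

    static void submit(unsigned flags)
    {
        printf("meta=%d prio=%d\n",
               !!(flags & REQ_META), !!(flags & REQ_PRIO));
    }

    int main(void)
    {
        submit(REQ_READ | REQ_META | REQ_PRIO);  /* metadata read, boosted */
        return 0;
    }
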
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index c106ca22e812..d24a9b666a23 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -344,6 +344,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
344 struct inode *root, *inode; 344 struct inode *root, *inode;
345 struct qstr str; 345 struct qstr str;
346 struct nls_table *nls = NULL; 346 struct nls_table *nls = NULL;
347 u64 last_fs_block, last_fs_page;
347 int err; 348 int err;
348 349
349 err = -EINVAL; 350 err = -EINVAL;
@@ -399,9 +400,13 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
399 if (!sbi->rsrc_clump_blocks) 400 if (!sbi->rsrc_clump_blocks)
400 sbi->rsrc_clump_blocks = 1; 401 sbi->rsrc_clump_blocks = 1;
401 402
402 err = generic_check_addressable(sbi->alloc_blksz_shift, 403 err = -EFBIG;
403 sbi->total_blocks); 404 last_fs_block = sbi->total_blocks - 1;
404 if (err) { 405 last_fs_page = (last_fs_block << sbi->alloc_blksz_shift) >>
406 PAGE_CACHE_SHIFT;
407
408 if ((last_fs_block > (sector_t)(~0ULL) >> (sbi->alloc_blksz_shift - 9)) ||
409 (last_fs_page > (pgoff_t)(~0ULL))) {
405 printk(KERN_ERR "hfs: filesystem size too large.\n"); 410 printk(KERN_ERR "hfs: filesystem size too large.\n");
406 goto out_free_vhdr; 411 goto out_free_vhdr;
407 } 412 }
@@ -525,8 +530,8 @@ out_close_cat_tree:
525out_close_ext_tree: 530out_close_ext_tree:
526 hfs_btree_close(sbi->ext_tree); 531 hfs_btree_close(sbi->ext_tree);
527out_free_vhdr: 532out_free_vhdr:
528 kfree(sbi->s_vhdr); 533 kfree(sbi->s_vhdr_buf);
529 kfree(sbi->s_backup_vhdr); 534 kfree(sbi->s_backup_vhdr_buf);
530out_unload_nls: 535out_unload_nls:
531 unload_nls(sbi->nls); 536 unload_nls(sbi->nls);
532 unload_nls(nls); 537 unload_nls(nls);
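
The hfsplus change replaces generic_check_addressable() with the arithmetic spelled out: the last filesystem block must still be expressible as a 512-byte sector index once scaled by alloc_blksz_shift, and the last byte's page number must fit pgoff_t. A compilable version with 32-bit stand-in typedefs, where the limits can actually trip:

    #include <stdint.h>
    #include <stdio.h>

    typedef uint32_t sector32_t;   /* sector_t on a 32-bit kernel */
    typedef uint32_t pgoff32_t;    /* pgoff_t on 32-bit */

    #define PAGE_SHIFT 12

    /* Allocation blocks are at least 512 bytes, so blksz_shift >= 9. */
    static int addressable(uint64_t total_blocks, unsigned blksz_shift)
    {
        uint64_t last_block = total_blocks - 1;
        uint64_t last_page = (last_block << blksz_shift) >> PAGE_SHIFT;

        if (last_block > ((uint64_t)(sector32_t)~0) >> (blksz_shift - 9))
            return 0;   /* block number overflows sector_t */
        if (last_page > (pgoff32_t)~0)
            return 0;   /* page index overflows pgoff_t */
        return 1;
    }

    int main(void)
    {
        /* 2^30 blocks of 4 KiB = 4 TiB: the last 512-byte sector index
         * needs 33 bits, too big for a 32-bit sector_t. Prints 0. */
        printf("%d\n", addressable(1ULL << 30, 12));
        return 0;
    }
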
diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c
index 10e515a0d452..7daf4b852d1c 100644
--- a/fs/hfsplus/wrapper.c
+++ b/fs/hfsplus/wrapper.c
@@ -272,9 +272,9 @@ reread:
272 return 0; 272 return 0;
273 273
274out_free_backup_vhdr: 274out_free_backup_vhdr:
275 kfree(sbi->s_backup_vhdr); 275 kfree(sbi->s_backup_vhdr_buf);
276out_free_vhdr: 276out_free_vhdr:
277 kfree(sbi->s_vhdr); 277 kfree(sbi->s_vhdr_buf);
278out: 278out:
279 return error; 279 return error;
280} 280}
diff --git a/fs/namei.c b/fs/namei.c
index b52bc685465f..0b3138de2a3b 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -721,12 +721,6 @@ static int follow_automount(struct path *path, unsigned flags,
721 if (!path->dentry->d_op || !path->dentry->d_op->d_automount) 721 if (!path->dentry->d_op || !path->dentry->d_op->d_automount)
722 return -EREMOTE; 722 return -EREMOTE;
723 723
724 /* We don't want to mount if someone supplied AT_NO_AUTOMOUNT
725 * and this is the terminal part of the path.
726 */
727 if ((flags & LOOKUP_NO_AUTOMOUNT) && !(flags & LOOKUP_PARENT))
728 return -EISDIR; /* we actually want to stop here */
729
730 /* We don't want to mount if someone's just doing a stat - 724 /* We don't want to mount if someone's just doing a stat -
731 * unless they're stat'ing a directory and appended a '/' to 725 * unless they're stat'ing a directory and appended a '/' to
732 * the name. 726 * the name.
@@ -739,7 +733,7 @@ static int follow_automount(struct path *path, unsigned flags,
739 * of the daemon to instantiate them before they can be used. 733 * of the daemon to instantiate them before they can be used.
740 */ 734 */
741 if (!(flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY | 735 if (!(flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY |
742 LOOKUP_OPEN | LOOKUP_CREATE)) && 736 LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) &&
743 path->dentry->d_inode) 737 path->dentry->d_inode)
744 return -EISDIR; 738 return -EISDIR;
745 739
@@ -2616,6 +2610,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
2616 if (!dir->i_op->rmdir) 2610 if (!dir->i_op->rmdir)
2617 return -EPERM; 2611 return -EPERM;
2618 2612
2613 dget(dentry);
2619 mutex_lock(&dentry->d_inode->i_mutex); 2614 mutex_lock(&dentry->d_inode->i_mutex);
2620 2615
2621 error = -EBUSY; 2616 error = -EBUSY;
@@ -2636,6 +2631,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
2636 2631
2637out: 2632out:
2638 mutex_unlock(&dentry->d_inode->i_mutex); 2633 mutex_unlock(&dentry->d_inode->i_mutex);
2634 dput(dentry);
2639 if (!error) 2635 if (!error)
2640 d_delete(dentry); 2636 d_delete(dentry);
2641 return error; 2637 return error;
@@ -3025,6 +3021,7 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
3025 if (error) 3021 if (error)
3026 return error; 3022 return error;
3027 3023
3024 dget(new_dentry);
3028 if (target) 3025 if (target)
3029 mutex_lock(&target->i_mutex); 3026 mutex_lock(&target->i_mutex);
3030 3027
@@ -3045,6 +3042,7 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry,
3045out: 3042out:
3046 if (target) 3043 if (target)
3047 mutex_unlock(&target->i_mutex); 3044 mutex_unlock(&target->i_mutex);
3045 dput(new_dentry);
3048 if (!error) 3046 if (!error)
3049 if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) 3047 if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
3050 d_move(old_dentry,new_dentry); 3048 d_move(old_dentry,new_dentry);
diff --git a/fs/namespace.c b/fs/namespace.c
index 22bfe8273c68..b4febb29d3bb 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1757,7 +1757,7 @@ static int do_loopback(struct path *path, char *old_name,
1757 return err; 1757 return err;
1758 if (!old_name || !*old_name) 1758 if (!old_name || !*old_name)
1759 return -EINVAL; 1759 return -EINVAL;
1760 err = kern_path(old_name, LOOKUP_FOLLOW, &old_path); 1760 err = kern_path(old_name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &old_path);
1761 if (err) 1761 if (err)
1762 return err; 1762 return err;
1763 1763
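
The namei.c and namespace.c hunks belong together: the special LOOKUP_NO_AUTOMOUNT case is gone, a plain stat-style lookup now stops short of automounting simply because it carries none of the listed intent flags, and callers that do want the automount, such as do_loopback(), pass LOOKUP_AUTOMOUNT explicitly. The decision, reduced to a predicate (flag values illustrative):

    #define LOOKUP_PARENT    0x01
    #define LOOKUP_DIRECTORY 0x02
    #define LOOKUP_OPEN      0x04
    #define LOOKUP_CREATE    0x08
    #define LOOKUP_AUTOMOUNT 0x10

    static int want_automount(unsigned flags, int have_inode)
    {
        if (!(flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY |
                       LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) &&
            have_inode)
            return 0;   /* plain stat: leave the automount point alone */
        return 1;
    }
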
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 1ec1a85fa71c..3e93e9a1bee1 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -56,6 +56,9 @@ enum nfs4_session_state {
56 NFS4_SESSION_DRAINING, 56 NFS4_SESSION_DRAINING,
57}; 57};
58 58
59#define NFS4_RENEW_TIMEOUT 0x01
60#define NFS4_RENEW_DELEGATION_CB 0x02
61
59struct nfs4_minor_version_ops { 62struct nfs4_minor_version_ops {
60 u32 minor_version; 63 u32 minor_version;
61 64
@@ -225,7 +228,7 @@ struct nfs4_state_recovery_ops {
225}; 228};
226 229
227struct nfs4_state_maintenance_ops { 230struct nfs4_state_maintenance_ops {
228 int (*sched_state_renewal)(struct nfs_client *, struct rpc_cred *); 231 int (*sched_state_renewal)(struct nfs_client *, struct rpc_cred *, unsigned);
229 struct rpc_cred * (*get_state_renewal_cred_locked)(struct nfs_client *); 232 struct rpc_cred * (*get_state_renewal_cred_locked)(struct nfs_client *);
230 int (*renew_lease)(struct nfs_client *, struct rpc_cred *); 233 int (*renew_lease)(struct nfs_client *, struct rpc_cred *);
231}; 234};
@@ -237,8 +240,6 @@ extern const struct inode_operations nfs4_dir_inode_operations;
237extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *); 240extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *);
238extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, struct rpc_cred *); 241extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, struct rpc_cred *);
239extern int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred); 242extern int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred);
240extern int nfs4_proc_async_renew(struct nfs_client *, struct rpc_cred *);
241extern int nfs4_proc_renew(struct nfs_client *, struct rpc_cred *);
242extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *); 243extern int nfs4_init_clientid(struct nfs_client *, struct rpc_cred *);
243extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *); 244extern int nfs41_init_clientid(struct nfs_client *, struct rpc_cred *);
244extern int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait, bool roc); 245extern int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait, bool roc);
@@ -349,6 +350,7 @@ extern void nfs4_close_sync(struct nfs4_state *, fmode_t);
349extern void nfs4_state_set_mode_locked(struct nfs4_state *, fmode_t); 350extern void nfs4_state_set_mode_locked(struct nfs4_state *, fmode_t);
350extern void nfs4_schedule_lease_recovery(struct nfs_client *); 351extern void nfs4_schedule_lease_recovery(struct nfs_client *);
351extern void nfs4_schedule_state_manager(struct nfs_client *); 352extern void nfs4_schedule_state_manager(struct nfs_client *);
353extern void nfs4_schedule_path_down_recovery(struct nfs_client *clp);
352extern void nfs4_schedule_stateid_recovery(const struct nfs_server *, struct nfs4_state *); 354extern void nfs4_schedule_stateid_recovery(const struct nfs_server *, struct nfs4_state *);
353extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags); 355extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags);
354extern void nfs41_handle_recall_slot(struct nfs_client *clp); 356extern void nfs41_handle_recall_slot(struct nfs_client *clp);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 8c77039e7a81..4700fae1ada0 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -3374,9 +3374,13 @@ static void nfs4_renew_done(struct rpc_task *task, void *calldata)
3374 3374
3375 if (task->tk_status < 0) { 3375 if (task->tk_status < 0) {
3376 /* Unless we're shutting down, schedule state recovery! */ 3376 /* Unless we're shutting down, schedule state recovery! */
3377 if (test_bit(NFS_CS_RENEWD, &clp->cl_res_state) != 0) 3377 if (test_bit(NFS_CS_RENEWD, &clp->cl_res_state) == 0)
3378 return;
3379 if (task->tk_status != NFS4ERR_CB_PATH_DOWN) {
3378 nfs4_schedule_lease_recovery(clp); 3380 nfs4_schedule_lease_recovery(clp);
3379 return; 3381 return;
3382 }
3383 nfs4_schedule_path_down_recovery(clp);
3380 } 3384 }
3381 do_renew_lease(clp, timestamp); 3385 do_renew_lease(clp, timestamp);
3382} 3386}
@@ -3386,7 +3390,7 @@ static const struct rpc_call_ops nfs4_renew_ops = {
3386 .rpc_release = nfs4_renew_release, 3390 .rpc_release = nfs4_renew_release,
3387}; 3391};
3388 3392
3389int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred) 3393static int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred, unsigned renew_flags)
3390{ 3394{
3391 struct rpc_message msg = { 3395 struct rpc_message msg = {
3392 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENEW], 3396 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENEW],
@@ -3395,9 +3399,11 @@ int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred)
3395 }; 3399 };
3396 struct nfs4_renewdata *data; 3400 struct nfs4_renewdata *data;
3397 3401
3402 if (renew_flags == 0)
3403 return 0;
3398 if (!atomic_inc_not_zero(&clp->cl_count)) 3404 if (!atomic_inc_not_zero(&clp->cl_count))
3399 return -EIO; 3405 return -EIO;
3400 data = kmalloc(sizeof(*data), GFP_KERNEL); 3406 data = kmalloc(sizeof(*data), GFP_NOFS);
3401 if (data == NULL) 3407 if (data == NULL)
3402 return -ENOMEM; 3408 return -ENOMEM;
3403 data->client = clp; 3409 data->client = clp;
@@ -3406,7 +3412,7 @@ int nfs4_proc_async_renew(struct nfs_client *clp, struct rpc_cred *cred)
3406 &nfs4_renew_ops, data); 3412 &nfs4_renew_ops, data);
3407} 3413}
3408 3414
3409int nfs4_proc_renew(struct nfs_client *clp, struct rpc_cred *cred) 3415static int nfs4_proc_renew(struct nfs_client *clp, struct rpc_cred *cred)
3410{ 3416{
3411 struct rpc_message msg = { 3417 struct rpc_message msg = {
3412 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENEW], 3418 .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RENEW],
@@ -5504,11 +5510,13 @@ static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, struct rpc_
5504 return rpc_run_task(&task_setup_data); 5510 return rpc_run_task(&task_setup_data);
5505} 5511}
5506 5512
5507static int nfs41_proc_async_sequence(struct nfs_client *clp, struct rpc_cred *cred) 5513static int nfs41_proc_async_sequence(struct nfs_client *clp, struct rpc_cred *cred, unsigned renew_flags)
5508{ 5514{
5509 struct rpc_task *task; 5515 struct rpc_task *task;
5510 int ret = 0; 5516 int ret = 0;
5511 5517
5518 if ((renew_flags & NFS4_RENEW_TIMEOUT) == 0)
5519 return 0;
5512 task = _nfs41_proc_sequence(clp, cred); 5520 task = _nfs41_proc_sequence(clp, cred);
5513 if (IS_ERR(task)) 5521 if (IS_ERR(task))
5514 ret = PTR_ERR(task); 5522 ret = PTR_ERR(task);
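[Editor's note: both renewal paths above now take a renew_flags argument and decide for themselves whether an RPC is worth sending: the v4.0 RENEW goes out for any reason, while the v4.1 SEQUENCE is only sent for a lease timeout. A standalone sketch of that gating follows; the flag names appear in the hunks above, but their values and everything else here are assumptions for illustration.]

#include <stdio.h>

#define NFS4_RENEW_TIMEOUT       0x01   /* values assumed; only the */
#define NFS4_RENEW_DELEGATION_CB 0x02   /* names appear in the diff */

/* v4.0: any reason to renew sends a RENEW */
static int proc_async_renew(unsigned renew_flags)
{
	if (renew_flags == 0)
		return 0;
	printf("v4.0: send RENEW (flags=0x%x)\n", renew_flags);
	return 0;
}

/* v4.1: only a lease timeout warrants a SEQUENCE */
static int proc_async_sequence(unsigned renew_flags)
{
	if ((renew_flags & NFS4_RENEW_TIMEOUT) == 0)
		return 0;
	printf("v4.1: send SEQUENCE\n");
	return 0;
}

int main(void)
{
	proc_async_renew(NFS4_RENEW_DELEGATION_CB);    /* RENEW sent */
	proc_async_sequence(NFS4_RENEW_DELEGATION_CB); /* skipped */
	return 0;
}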
diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c
index df8e7f3ca56d..dc484c0eae7f 100644
--- a/fs/nfs/nfs4renewd.c
+++ b/fs/nfs/nfs4renewd.c
@@ -60,6 +60,7 @@ nfs4_renew_state(struct work_struct *work)
60 struct rpc_cred *cred; 60 struct rpc_cred *cred;
61 long lease; 61 long lease;
62 unsigned long last, now; 62 unsigned long last, now;
63 unsigned renew_flags = 0;
63 64
64 ops = clp->cl_mvops->state_renewal_ops; 65 ops = clp->cl_mvops->state_renewal_ops;
65 dprintk("%s: start\n", __func__); 66 dprintk("%s: start\n", __func__);
@@ -72,18 +73,23 @@ nfs4_renew_state(struct work_struct *work)
72 last = clp->cl_last_renewal; 73 last = clp->cl_last_renewal;
73 now = jiffies; 74 now = jiffies;
74 /* Are we close to a lease timeout? */ 75 /* Are we close to a lease timeout? */
75 if (time_after(now, last + lease/3)) { 76 if (time_after(now, last + lease/3))
77 renew_flags |= NFS4_RENEW_TIMEOUT;
78 if (nfs_delegations_present(clp))
79 renew_flags |= NFS4_RENEW_DELEGATION_CB;
80
81 if (renew_flags != 0) {
76 cred = ops->get_state_renewal_cred_locked(clp); 82 cred = ops->get_state_renewal_cred_locked(clp);
77 spin_unlock(&clp->cl_lock); 83 spin_unlock(&clp->cl_lock);
78 if (cred == NULL) { 84 if (cred == NULL) {
79 if (!nfs_delegations_present(clp)) { 85 if (!(renew_flags & NFS4_RENEW_DELEGATION_CB)) {
80 set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); 86 set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state);
81 goto out; 87 goto out;
82 } 88 }
83 nfs_expire_all_delegations(clp); 89 nfs_expire_all_delegations(clp);
84 } else { 90 } else {
85 /* Queue an asynchronous RENEW. */ 91 /* Queue an asynchronous RENEW. */
86 ops->sched_state_renewal(clp, cred); 92 ops->sched_state_renewal(clp, cred, renew_flags);
87 put_rpccred(cred); 93 put_rpccred(cred);
88 goto out_exp; 94 goto out_exp;
89 } 95 }
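[Editor's note: the hunk above computes the renew_flags that the previous sketch consumes. A userspace mirror of that decision, with the same assumed flag values:]

#include <stdio.h>

#define NFS4_RENEW_TIMEOUT       0x01   /* assumed values, as above */
#define NFS4_RENEW_DELEGATION_CB 0x02

/* Mirror of the decision in nfs4_renew_state(): renew when a third of
 * the lease has elapsed, and independently when held delegations need
 * the callback path kept alive. */
static unsigned compute_renew_flags(unsigned long now, unsigned long last,
				    long lease, int delegations)
{
	unsigned flags = 0;

	if (now > last + lease / 3)
		flags |= NFS4_RENEW_TIMEOUT;
	if (delegations)
		flags |= NFS4_RENEW_DELEGATION_CB;
	return flags;
}

int main(void)
{
	/* Fresh lease but delegations held: flags = 0x02, so a renewal
	 * is still attempted, and a missing cred now only expires the
	 * delegations instead of declaring the whole lease expired. */
	printf("flags=0x%x\n", compute_renew_flags(60, 50, 90, 1));
	return 0;
}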
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 72ab97ef3d61..39914be40b03 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1038,6 +1038,12 @@ void nfs4_schedule_lease_recovery(struct nfs_client *clp)
1038 nfs4_schedule_state_manager(clp); 1038 nfs4_schedule_state_manager(clp);
1039} 1039}
1040 1040
1041void nfs4_schedule_path_down_recovery(struct nfs_client *clp)
1042{
1043 nfs_handle_cb_pathdown(clp);
1044 nfs4_schedule_state_manager(clp);
1045}
1046
1041static int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state) 1047static int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state)
1042{ 1048{
1043 1049
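[Editor's note: a toy contrast of the two recovery entry points the state code now exposes; the printf bodies stand in for the real state-manager work and are assumptions, not the kernel implementation.]

#include <stdio.h>

static void schedule_state_manager(void) { printf("run state manager\n"); }

static void schedule_lease_recovery(void)
{
	printf("treat lease as expired\n");	 /* heavyweight path */
	schedule_state_manager();
}

static void schedule_path_down_recovery(void)
{
	printf("rebind the callback channel\n"); /* targeted path */
	schedule_state_manager();
}

int main(void)
{
	printf("-- NFS4ERR_CB_PATH_DOWN --\n");
	schedule_path_down_recovery();	/* no longer a full lease recovery */
	printf("-- other lease errors --\n");
	schedule_lease_recovery();
	return 0;
}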
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index b961ceac66b4..5b19b6aabe18 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2035,9 +2035,6 @@ static inline void nfs_initialise_sb(struct super_block *sb)
2035 sb->s_blocksize = nfs_block_bits(server->wsize, 2035 sb->s_blocksize = nfs_block_bits(server->wsize,
2036 &sb->s_blocksize_bits); 2036 &sb->s_blocksize_bits);
2037 2037
2038 if (server->flags & NFS_MOUNT_NOAC)
2039 sb->s_flags |= MS_SYNCHRONOUS;
2040
2041 sb->s_bdi = &server->backing_dev_info; 2038 sb->s_bdi = &server->backing_dev_info;
2042 2039
2043 nfs_super_set_maxbytes(sb, server->maxfilesize); 2040 nfs_super_set_maxbytes(sb, server->maxfilesize);
@@ -2249,6 +2246,10 @@ static struct dentry *nfs_fs_mount(struct file_system_type *fs_type,
2249 if (server->flags & NFS_MOUNT_UNSHARED) 2246 if (server->flags & NFS_MOUNT_UNSHARED)
2250 compare_super = NULL; 2247 compare_super = NULL;
2251 2248
2249 /* -o noac implies -o sync */
2250 if (server->flags & NFS_MOUNT_NOAC)
2251 sb_mntdata.mntflags |= MS_SYNCHRONOUS;
2252
2252 /* Get a superblock - note that we may end up sharing one that already exists */ 2253 /* Get a superblock - note that we may end up sharing one that already exists */
2253 s = sget(fs_type, compare_super, nfs_set_super, &sb_mntdata); 2254 s = sget(fs_type, compare_super, nfs_set_super, &sb_mntdata);
2254 if (IS_ERR(s)) { 2255 if (IS_ERR(s)) {
@@ -2361,6 +2362,10 @@ nfs_xdev_mount(struct file_system_type *fs_type, int flags,
2361 if (server->flags & NFS_MOUNT_UNSHARED) 2362 if (server->flags & NFS_MOUNT_UNSHARED)
2362 compare_super = NULL; 2363 compare_super = NULL;
2363 2364
2365 /* -o noac implies -o sync */
2366 if (server->flags & NFS_MOUNT_NOAC)
2367 sb_mntdata.mntflags |= MS_SYNCHRONOUS;
2368
2364 /* Get a superblock - note that we may end up sharing one that already exists */ 2369 /* Get a superblock - note that we may end up sharing one that already exists */
2365 s = sget(&nfs_fs_type, compare_super, nfs_set_super, &sb_mntdata); 2370 s = sget(&nfs_fs_type, compare_super, nfs_set_super, &sb_mntdata);
2366 if (IS_ERR(s)) { 2371 if (IS_ERR(s)) {
@@ -2628,6 +2633,10 @@ nfs4_remote_mount(struct file_system_type *fs_type, int flags,
2628 if (server->flags & NFS4_MOUNT_UNSHARED) 2633 if (server->flags & NFS4_MOUNT_UNSHARED)
2629 compare_super = NULL; 2634 compare_super = NULL;
2630 2635
2636 /* -o noac implies -o sync */
2637 if (server->flags & NFS_MOUNT_NOAC)
2638 sb_mntdata.mntflags |= MS_SYNCHRONOUS;
2639
2631 /* Get a superblock - note that we may end up sharing one that already exists */ 2640 /* Get a superblock - note that we may end up sharing one that already exists */
2632 s = sget(&nfs4_fs_type, compare_super, nfs_set_super, &sb_mntdata); 2641 s = sget(&nfs4_fs_type, compare_super, nfs_set_super, &sb_mntdata);
2633 if (IS_ERR(s)) { 2642 if (IS_ERR(s)) {
@@ -2789,7 +2798,7 @@ static struct dentry *nfs_follow_remote_path(struct vfsmount *root_mnt,
2789 goto out_put_mnt_ns; 2798 goto out_put_mnt_ns;
2790 2799
2791 ret = vfs_path_lookup(root_mnt->mnt_root, root_mnt, 2800 ret = vfs_path_lookup(root_mnt->mnt_root, root_mnt,
2792 export_path, LOOKUP_FOLLOW, &path); 2801 export_path, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path);
2793 2802
2794 nfs_referral_loop_unprotect(); 2803 nfs_referral_loop_unprotect();
2795 put_mnt_ns(ns_private); 2804 put_mnt_ns(ns_private);
@@ -2916,6 +2925,10 @@ nfs4_xdev_mount(struct file_system_type *fs_type, int flags,
2916 if (server->flags & NFS4_MOUNT_UNSHARED) 2925 if (server->flags & NFS4_MOUNT_UNSHARED)
2917 compare_super = NULL; 2926 compare_super = NULL;
2918 2927
2928 /* -o noac implies -o sync */
2929 if (server->flags & NFS_MOUNT_NOAC)
2930 sb_mntdata.mntflags |= MS_SYNCHRONOUS;
2931
2919 /* Get a superblock - note that we may end up sharing one that already exists */ 2932 /* Get a superblock - note that we may end up sharing one that already exists */
2920 s = sget(&nfs4_fs_type, compare_super, nfs_set_super, &sb_mntdata); 2933 s = sget(&nfs4_fs_type, compare_super, nfs_set_super, &sb_mntdata);
2921 if (IS_ERR(s)) { 2934 if (IS_ERR(s)) {
@@ -3003,6 +3016,10 @@ nfs4_remote_referral_mount(struct file_system_type *fs_type, int flags,
3003 if (server->flags & NFS4_MOUNT_UNSHARED) 3016 if (server->flags & NFS4_MOUNT_UNSHARED)
3004 compare_super = NULL; 3017 compare_super = NULL;
3005 3018
3019 /* -o noac implies -o sync */
3020 if (server->flags & NFS_MOUNT_NOAC)
3021 sb_mntdata.mntflags |= MS_SYNCHRONOUS;
3022
3006 /* Get a superblock - note that we may end up sharing one that already exists */ 3023 /* Get a superblock - note that we may end up sharing one that already exists */
3007 s = sget(&nfs4_fs_type, compare_super, nfs_set_super, &sb_mntdata); 3024 s = sget(&nfs4_fs_type, compare_super, nfs_set_super, &sb_mntdata);
3008 if (IS_ERR(s)) { 3025 if (IS_ERR(s)) {
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index b39b37f80913..c9bd2a6b7d4b 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -958,7 +958,7 @@ static int nfs_flush_multi(struct nfs_pageio_descriptor *desc, struct list_head
958 if (!data) 958 if (!data)
959 goto out_bad; 959 goto out_bad;
960 data->pagevec[0] = page; 960 data->pagevec[0] = page;
961 nfs_write_rpcsetup(req, data, wsize, offset, desc->pg_ioflags); 961 nfs_write_rpcsetup(req, data, len, offset, desc->pg_ioflags);
962 list_add(&data->list, res); 962 list_add(&data->list, res);
963 requests++; 963 requests++;
964 nbytes -= len; 964 nbytes -= len;
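[Editor's note: the one-word fix above matters because the final chunk of a multi-page write can be shorter than wsize. A minimal sketch of the splitting arithmetic (the loop shape is inferred from the hunk's context lines):]

#include <stdio.h>

/* Each RPC must be set up with the size of *this* chunk (len), not the
 * full wsize, or the final short chunk is over-counted. */
int main(void)
{
	unsigned nbytes = 10000, wsize = 4096, offset = 0;

	while (nbytes) {
		unsigned len = nbytes < wsize ? nbytes : wsize;

		printf("RPC: offset=%u len=%u\n", offset, len); /* was wsize */
		nbytes -= len;
		offset += len;
	}
	return 0;
}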
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 25b6a887adb9..5afaa58a8630 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -877,30 +877,54 @@ struct numa_maps_private {
877 struct numa_maps md; 877 struct numa_maps md;
878}; 878};
879 879
880static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty) 880static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty,
881 unsigned long nr_pages)
881{ 882{
882 int count = page_mapcount(page); 883 int count = page_mapcount(page);
883 884
884 md->pages++; 885 md->pages += nr_pages;
885 if (pte_dirty || PageDirty(page)) 886 if (pte_dirty || PageDirty(page))
886 md->dirty++; 887 md->dirty += nr_pages;
887 888
888 if (PageSwapCache(page)) 889 if (PageSwapCache(page))
889 md->swapcache++; 890 md->swapcache += nr_pages;
890 891
891 if (PageActive(page) || PageUnevictable(page)) 892 if (PageActive(page) || PageUnevictable(page))
892 md->active++; 893 md->active += nr_pages;
893 894
894 if (PageWriteback(page)) 895 if (PageWriteback(page))
895 md->writeback++; 896 md->writeback += nr_pages;
896 897
897 if (PageAnon(page)) 898 if (PageAnon(page))
898 md->anon++; 899 md->anon += nr_pages;
899 900
900 if (count > md->mapcount_max) 901 if (count > md->mapcount_max)
901 md->mapcount_max = count; 902 md->mapcount_max = count;
902 903
903 md->node[page_to_nid(page)]++; 904 md->node[page_to_nid(page)] += nr_pages;
905}
906
907static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma,
908 unsigned long addr)
909{
910 struct page *page;
911 int nid;
912
913 if (!pte_present(pte))
914 return NULL;
915
916 page = vm_normal_page(vma, addr, pte);
917 if (!page)
918 return NULL;
919
920 if (PageReserved(page))
921 return NULL;
922
923 nid = page_to_nid(page);
924 if (!node_isset(nid, node_states[N_HIGH_MEMORY]))
925 return NULL;
926
927 return page;
904} 928}
905 929
906static int gather_pte_stats(pmd_t *pmd, unsigned long addr, 930static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
@@ -912,26 +936,32 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
912 pte_t *pte; 936 pte_t *pte;
913 937
914 md = walk->private; 938 md = walk->private;
915 orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); 939 spin_lock(&walk->mm->page_table_lock);
916 do { 940 if (pmd_trans_huge(*pmd)) {
917 struct page *page; 941 if (pmd_trans_splitting(*pmd)) {
918 int nid; 942 spin_unlock(&walk->mm->page_table_lock);
943 wait_split_huge_page(md->vma->anon_vma, pmd);
944 } else {
945 pte_t huge_pte = *(pte_t *)pmd;
946 struct page *page;
919 947
920 if (!pte_present(*pte)) 948 page = can_gather_numa_stats(huge_pte, md->vma, addr);
921 continue; 949 if (page)
950 gather_stats(page, md, pte_dirty(huge_pte),
951 HPAGE_PMD_SIZE/PAGE_SIZE);
952 spin_unlock(&walk->mm->page_table_lock);
953 return 0;
954 }
955 } else {
956 spin_unlock(&walk->mm->page_table_lock);
957 }
922 958
923 page = vm_normal_page(md->vma, addr, *pte); 959 orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
960 do {
961 struct page *page = can_gather_numa_stats(*pte, md->vma, addr);
924 if (!page) 962 if (!page)
925 continue; 963 continue;
926 964 gather_stats(page, md, pte_dirty(*pte), 1);
927 if (PageReserved(page))
928 continue;
929
930 nid = page_to_nid(page);
931 if (!node_isset(nid, node_states[N_HIGH_MEMORY]))
932 continue;
933
934 gather_stats(page, md, pte_dirty(*pte));
935 965
936 } while (pte++, addr += PAGE_SIZE, addr != end); 966 } while (pte++, addr += PAGE_SIZE, addr != end);
937 pte_unmap_unlock(orig_pte, ptl); 967 pte_unmap_unlock(orig_pte, ptl);
@@ -952,7 +982,7 @@ static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask,
952 return 0; 982 return 0;
953 983
954 md = walk->private; 984 md = walk->private;
955 gather_stats(page, md, pte_dirty(*pte)); 985 gather_stats(page, md, pte_dirty(*pte), 1);
956 return 0; 986 return 0;
957} 987}
958 988
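[Editor's note: the core of the task_mmu.c rework is that gather_stats() now accounts nr_pages base pages per call, so a transparent huge page is handled by one huge-pmd visit instead of per-pte walking. A standalone sketch with assumed page sizes and a trimmed-down stats struct:]

#include <stdio.h>

#define PAGE_SIZE	4096u
#define HPAGE_PMD_SIZE	(2u * 1024 * 1024)	/* typical x86-64 THP */

struct numa_stats { unsigned long pages, anon; };

/* One call now accounts for nr_pages base pages at once. */
static void gather_stats(struct numa_stats *md, int anon,
			 unsigned long nr_pages)
{
	md->pages += nr_pages;
	if (anon)
		md->anon += nr_pages;
}

int main(void)
{
	struct numa_stats md = {0};

	gather_stats(&md, 1, 1);			  /* ordinary pte */
	gather_stats(&md, 1, HPAGE_PMD_SIZE / PAGE_SIZE); /* one huge pmd */
	printf("pages=%lu anon=%lu\n", md.pages, md.anon);
	return 0;
}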
diff --git a/fs/quota/quota.c b/fs/quota/quota.c
index b34bdb25490c..10b6be3ca280 100644
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -355,7 +355,7 @@ SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special,
355 * resolution (think about autofs) and thus deadlocks could arise. 355 * resolution (think about autofs) and thus deadlocks could arise.
356 */ 356 */
357 if (cmds == Q_QUOTAON) { 357 if (cmds == Q_QUOTAON) {
358 ret = user_path_at(AT_FDCWD, addr, LOOKUP_FOLLOW, &path); 358 ret = user_path_at(AT_FDCWD, addr, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path);
359 if (ret) 359 if (ret)
360 pathp = ERR_PTR(ret); 360 pathp = ERR_PTR(ret);
361 else 361 else
diff --git a/fs/stat.c b/fs/stat.c
index ba5316ffac61..78a3aa83c7ea 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -81,8 +81,6 @@ int vfs_fstatat(int dfd, const char __user *filename, struct kstat *stat,
81 81
82 if (!(flag & AT_SYMLINK_NOFOLLOW)) 82 if (!(flag & AT_SYMLINK_NOFOLLOW))
83 lookup_flags |= LOOKUP_FOLLOW; 83 lookup_flags |= LOOKUP_FOLLOW;
84 if (flag & AT_NO_AUTOMOUNT)
85 lookup_flags |= LOOKUP_NO_AUTOMOUNT;
86 if (flag & AT_EMPTY_PATH) 84 if (flag & AT_EMPTY_PATH)
87 lookup_flags |= LOOKUP_EMPTY; 85 lookup_flags |= LOOKUP_EMPTY;
88 86
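[Editor's note: the quota.c and stat.c hunks are two sides of the same flag inversion: crossing automount points is now opt-in via LOOKUP_AUTOMOUNT, so vfs_fstatat() drops the LOOKUP_NO_AUTOMOUNT translation and callers that do want to descend, like quotactl, must say so. A toy model; the flag values are assumptions:]

#include <stdio.h>

#define LOOKUP_FOLLOW    0x01	/* values assumed for the demo */
#define LOOKUP_AUTOMOUNT 0x04

static void lookup(const char *who, unsigned flags)
{
	printf("%s: %scross automount points\n", who,
	       (flags & LOOKUP_AUTOMOUNT) ? "" : "do not ");
}

int main(void)
{
	lookup("stat(2)", LOOKUP_FOLLOW);		      /* stays put */
	lookup("quotactl", LOOKUP_FOLLOW | LOOKUP_AUTOMOUNT); /* descends */
	return 0;
}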
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 63e971e2b837..8c37dde4c521 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1300,6 +1300,7 @@ xfs_end_io_direct_write(
1300 bool is_async) 1300 bool is_async)
1301{ 1301{
1302 struct xfs_ioend *ioend = iocb->private; 1302 struct xfs_ioend *ioend = iocb->private;
1303 struct inode *inode = ioend->io_inode;
1303 1304
1304 /* 1305 /*
1305 * blockdev_direct_IO can return an error even after the I/O 1306 * blockdev_direct_IO can return an error even after the I/O
@@ -1331,7 +1332,7 @@ xfs_end_io_direct_write(
1331 } 1332 }
1332 1333
1333 /* XXX: probably should move into the real I/O completion handler */ 1334 /* XXX: probably should move into the real I/O completion handler */
1334 inode_dio_done(ioend->io_inode); 1335 inode_dio_done(inode);
1335} 1336}
1336 1337
1337STATIC ssize_t 1338STATIC ssize_t
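[Editor's note: the xfs_aops.c hunk hoists the inode dereference ahead of the I/O completion handling, presumably because the ioend can be freed once the completion runs. A toy illustration of why the local copy matters; every name here is a stand-in, not the XFS implementation:]

#include <stdio.h>
#include <stdlib.h>

struct ioend { int inode; };

int main(void)
{
	struct ioend *ioend = malloc(sizeof(*ioend));

	if (!ioend)
		return 1;
	ioend->inode = 42;

	int inode = ioend->inode;	/* take the value early ... */
	free(ioend);			/* ... completion may free ioend */
	printf("inode_dio_done(%d)\n", inode); /* safe: ioend untouched */
	return 0;
}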
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index cac2ecfa6746..ef43fce519a1 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -629,7 +629,7 @@ xfs_buf_item_push(
629 * the xfsbufd to get this buffer written. We have to unlock the buffer 629 * the xfsbufd to get this buffer written. We have to unlock the buffer
630 * to allow the xfsbufd to write it, too. 630 * to allow the xfsbufd to write it, too.
631 */ 631 */
632STATIC void 632STATIC bool
633xfs_buf_item_pushbuf( 633xfs_buf_item_pushbuf(
634 struct xfs_log_item *lip) 634 struct xfs_log_item *lip)
635{ 635{
@@ -643,6 +643,7 @@ xfs_buf_item_pushbuf(
643 643
644 xfs_buf_delwri_promote(bp); 644 xfs_buf_delwri_promote(bp);
645 xfs_buf_relse(bp); 645 xfs_buf_relse(bp);
646 return true;
646} 647}
647 648
648STATIC void 649STATIC void
diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c
index 9e0e2fa3f2c8..bb3f71d236d2 100644
--- a/fs/xfs/xfs_dquot_item.c
+++ b/fs/xfs/xfs_dquot_item.c
@@ -183,13 +183,14 @@ xfs_qm_dqunpin_wait(
183 * search the buffer cache can be a time consuming thing, and AIL lock is a 183 * search the buffer cache can be a time consuming thing, and AIL lock is a
184 * spinlock. 184 * spinlock.
185 */ 185 */
186STATIC void 186STATIC bool
187xfs_qm_dquot_logitem_pushbuf( 187xfs_qm_dquot_logitem_pushbuf(
188 struct xfs_log_item *lip) 188 struct xfs_log_item *lip)
189{ 189{
190 struct xfs_dq_logitem *qlip = DQUOT_ITEM(lip); 190 struct xfs_dq_logitem *qlip = DQUOT_ITEM(lip);
191 struct xfs_dquot *dqp = qlip->qli_dquot; 191 struct xfs_dquot *dqp = qlip->qli_dquot;
192 struct xfs_buf *bp; 192 struct xfs_buf *bp;
193 bool ret = true;
193 194
194 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 195 ASSERT(XFS_DQ_IS_LOCKED(dqp));
195 196
@@ -201,17 +202,20 @@ xfs_qm_dquot_logitem_pushbuf(
201 if (completion_done(&dqp->q_flush) || 202 if (completion_done(&dqp->q_flush) ||
202 !(lip->li_flags & XFS_LI_IN_AIL)) { 203 !(lip->li_flags & XFS_LI_IN_AIL)) {
203 xfs_dqunlock(dqp); 204 xfs_dqunlock(dqp);
204 return; 205 return true;
205 } 206 }
206 207
207 bp = xfs_incore(dqp->q_mount->m_ddev_targp, qlip->qli_format.qlf_blkno, 208 bp = xfs_incore(dqp->q_mount->m_ddev_targp, qlip->qli_format.qlf_blkno,
208 dqp->q_mount->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK); 209 dqp->q_mount->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK);
209 xfs_dqunlock(dqp); 210 xfs_dqunlock(dqp);
210 if (!bp) 211 if (!bp)
211 return; 212 return true;
212 if (XFS_BUF_ISDELAYWRITE(bp)) 213 if (XFS_BUF_ISDELAYWRITE(bp))
213 xfs_buf_delwri_promote(bp); 214 xfs_buf_delwri_promote(bp);
215 if (xfs_buf_ispinned(bp))
216 ret = false;
214 xfs_buf_relse(bp); 217 xfs_buf_relse(bp);
218 return ret;
215} 219}
216 220
217/* 221/*
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 588406dc6a35..836ad80d4f2b 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -708,13 +708,14 @@ xfs_inode_item_committed(
708 * marked delayed write. If that's the case, we'll promote it and that will 708 * marked delayed write. If that's the case, we'll promote it and that will
709 * allow the caller to write the buffer by triggering the xfsbufd to run. 709 * allow the caller to write the buffer by triggering the xfsbufd to run.
710 */ 710 */
711STATIC void 711STATIC bool
712xfs_inode_item_pushbuf( 712xfs_inode_item_pushbuf(
713 struct xfs_log_item *lip) 713 struct xfs_log_item *lip)
714{ 714{
715 struct xfs_inode_log_item *iip = INODE_ITEM(lip); 715 struct xfs_inode_log_item *iip = INODE_ITEM(lip);
716 struct xfs_inode *ip = iip->ili_inode; 716 struct xfs_inode *ip = iip->ili_inode;
717 struct xfs_buf *bp; 717 struct xfs_buf *bp;
718 bool ret = true;
718 719
719 ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED)); 720 ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED));
720 721
@@ -725,7 +726,7 @@ xfs_inode_item_pushbuf(
725 if (completion_done(&ip->i_flush) || 726 if (completion_done(&ip->i_flush) ||
726 !(lip->li_flags & XFS_LI_IN_AIL)) { 727 !(lip->li_flags & XFS_LI_IN_AIL)) {
727 xfs_iunlock(ip, XFS_ILOCK_SHARED); 728 xfs_iunlock(ip, XFS_ILOCK_SHARED);
728 return; 729 return true;
729 } 730 }
730 731
731 bp = xfs_incore(ip->i_mount->m_ddev_targp, iip->ili_format.ilf_blkno, 732 bp = xfs_incore(ip->i_mount->m_ddev_targp, iip->ili_format.ilf_blkno,
@@ -733,10 +734,13 @@ xfs_inode_item_pushbuf(
733 734
734 xfs_iunlock(ip, XFS_ILOCK_SHARED); 735 xfs_iunlock(ip, XFS_ILOCK_SHARED);
735 if (!bp) 736 if (!bp)
736 return; 737 return true;
737 if (XFS_BUF_ISDELAYWRITE(bp)) 738 if (XFS_BUF_ISDELAYWRITE(bp))
738 xfs_buf_delwri_promote(bp); 739 xfs_buf_delwri_promote(bp);
740 if (xfs_buf_ispinned(bp))
741 ret = false;
739 xfs_buf_relse(bp); 742 xfs_buf_relse(bp);
743 return ret;
740} 744}
741 745
742/* 746/*
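[Editor's note: the buf, dquot and inode log-item conversions above all share one shape: ->iop_pushbuf now returns false only when the backing buffer turned out to be pinned, so the AIL can count the item as stuck and force the log. A minimal sketch of that common pattern, with the locking and lookup details elided:]

#include <stdbool.h>
#include <stdio.h>

struct buf { bool delwri; bool pinned; };

static bool pushbuf(struct buf *bp)
{
	if (!bp)
		return true;		/* nothing to push: not stuck */
	if (bp->delwri)
		printf("promote delwri buffer\n");
	return !bp->pinned;		/* pinned => caller flushes the log */
}

int main(void)
{
	struct buf pinned = { .delwri = true, .pinned = true };

	printf("pushbuf -> %d\n", pushbuf(&pinned));	/* 0: item stuck */
	return 0;
}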
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index 1e8a45e74c3e..828662f70d64 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -68,6 +68,8 @@
68#include <linux/ctype.h> 68#include <linux/ctype.h>
69#include <linux/writeback.h> 69#include <linux/writeback.h>
70#include <linux/capability.h> 70#include <linux/capability.h>
71#include <linux/kthread.h>
72#include <linux/freezer.h>
71#include <linux/list_sort.h> 73#include <linux/list_sort.h>
72 74
73#include <asm/page.h> 75#include <asm/page.h>
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 2366c54cc4fa..5cf06b85fd9d 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1652,24 +1652,13 @@ xfs_init_workqueues(void)
1652 */ 1652 */
1653 xfs_syncd_wq = alloc_workqueue("xfssyncd", WQ_CPU_INTENSIVE, 8); 1653 xfs_syncd_wq = alloc_workqueue("xfssyncd", WQ_CPU_INTENSIVE, 8);
1654 if (!xfs_syncd_wq) 1654 if (!xfs_syncd_wq)
1655 goto out; 1655 return -ENOMEM;
1656
1657 xfs_ail_wq = alloc_workqueue("xfsail", WQ_CPU_INTENSIVE, 8);
1658 if (!xfs_ail_wq)
1659 goto out_destroy_syncd;
1660
1661 return 0; 1656 return 0;
1662
1663out_destroy_syncd:
1664 destroy_workqueue(xfs_syncd_wq);
1665out:
1666 return -ENOMEM;
1667} 1657}
1668 1658
1669STATIC void 1659STATIC void
1670xfs_destroy_workqueues(void) 1660xfs_destroy_workqueues(void)
1671{ 1661{
1672 destroy_workqueue(xfs_ail_wq);
1673 destroy_workqueue(xfs_syncd_wq); 1662 destroy_workqueue(xfs_syncd_wq);
1674} 1663}
1675 1664
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 06a9759b6352..53597f4db9b5 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -350,7 +350,7 @@ typedef struct xfs_item_ops {
350 void (*iop_unlock)(xfs_log_item_t *); 350 void (*iop_unlock)(xfs_log_item_t *);
351 xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t); 351 xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t);
352 void (*iop_push)(xfs_log_item_t *); 352 void (*iop_push)(xfs_log_item_t *);
353 void (*iop_pushbuf)(xfs_log_item_t *); 353 bool (*iop_pushbuf)(xfs_log_item_t *);
354 void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t); 354 void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t);
355} xfs_item_ops_t; 355} xfs_item_ops_t;
356 356
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index c15aa29fa169..3a1e7ca54c2d 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -28,8 +28,6 @@
28#include "xfs_trans_priv.h" 28#include "xfs_trans_priv.h"
29#include "xfs_error.h" 29#include "xfs_error.h"
30 30
31struct workqueue_struct *xfs_ail_wq; /* AIL workqueue */
32
33#ifdef DEBUG 31#ifdef DEBUG
34/* 32/*
35 * Check that the list is sorted as it should be. 33 * Check that the list is sorted as it should be.
@@ -356,16 +354,10 @@ xfs_ail_delete(
356 xfs_trans_ail_cursor_clear(ailp, lip); 354 xfs_trans_ail_cursor_clear(ailp, lip);
357} 355}
358 356
359/* 357static long
360 * xfs_ail_worker does the work of pushing on the AIL. It will requeue itself 358xfsaild_push(
361 * to run at a later time if there is more work to do to complete the push. 359 struct xfs_ail *ailp)
362 */
363STATIC void
364xfs_ail_worker(
365 struct work_struct *work)
366{ 360{
367 struct xfs_ail *ailp = container_of(to_delayed_work(work),
368 struct xfs_ail, xa_work);
369 xfs_mount_t *mp = ailp->xa_mount; 361 xfs_mount_t *mp = ailp->xa_mount;
370 struct xfs_ail_cursor cur; 362 struct xfs_ail_cursor cur;
371 xfs_log_item_t *lip; 363 xfs_log_item_t *lip;
@@ -427,8 +419,13 @@ xfs_ail_worker(
427 419
428 case XFS_ITEM_PUSHBUF: 420 case XFS_ITEM_PUSHBUF:
429 XFS_STATS_INC(xs_push_ail_pushbuf); 421 XFS_STATS_INC(xs_push_ail_pushbuf);
430 IOP_PUSHBUF(lip); 422
431 ailp->xa_last_pushed_lsn = lsn; 423 if (!IOP_PUSHBUF(lip)) {
424 stuck++;
425 flush_log = 1;
426 } else {
427 ailp->xa_last_pushed_lsn = lsn;
428 }
432 push_xfsbufd = 1; 429 push_xfsbufd = 1;
433 break; 430 break;
434 431
@@ -440,7 +437,6 @@ xfs_ail_worker(
440 437
441 case XFS_ITEM_LOCKED: 438 case XFS_ITEM_LOCKED:
442 XFS_STATS_INC(xs_push_ail_locked); 439 XFS_STATS_INC(xs_push_ail_locked);
443 ailp->xa_last_pushed_lsn = lsn;
444 stuck++; 440 stuck++;
445 break; 441 break;
446 442
@@ -501,20 +497,6 @@ out_done:
501 /* We're past our target or empty, so idle */ 497 /* We're past our target or empty, so idle */
502 ailp->xa_last_pushed_lsn = 0; 498 ailp->xa_last_pushed_lsn = 0;
503 499
504 /*
505 * We clear the XFS_AIL_PUSHING_BIT first before checking
506 * whether the target has changed. If the target has changed,
507 * this pushes the requeue race directly onto the result of the
508 * atomic test/set bit, so we are guaranteed that either the
509 * the pusher that changed the target or ourselves will requeue
510 * the work (but not both).
511 */
512 clear_bit(XFS_AIL_PUSHING_BIT, &ailp->xa_flags);
513 smp_rmb();
514 if (XFS_LSN_CMP(ailp->xa_target, target) == 0 ||
515 test_and_set_bit(XFS_AIL_PUSHING_BIT, &ailp->xa_flags))
516 return;
517
518 tout = 50; 500 tout = 50;
519 } else if (XFS_LSN_CMP(lsn, target) >= 0) { 501 } else if (XFS_LSN_CMP(lsn, target) >= 0) {
520 /* 502 /*
@@ -537,9 +519,30 @@ out_done:
537 tout = 20; 519 tout = 20;
538 } 520 }
539 521
540 /* There is more to do, requeue us. */ 522 return tout;
541 queue_delayed_work(xfs_syncd_wq, &ailp->xa_work, 523}
542 msecs_to_jiffies(tout)); 524
525static int
526xfsaild(
527 void *data)
528{
529 struct xfs_ail *ailp = data;
530 long tout = 0; /* milliseconds */
531
532 while (!kthread_should_stop()) {
533 if (tout && tout <= 20)
534 __set_current_state(TASK_KILLABLE);
535 else
536 __set_current_state(TASK_INTERRUPTIBLE);
537 schedule_timeout(tout ?
538 msecs_to_jiffies(tout) : MAX_SCHEDULE_TIMEOUT);
539
540 try_to_freeze();
541
542 tout = xfsaild_push(ailp);
543 }
544
545 return 0;
543} 546}
544 547
545/* 548/*
@@ -574,8 +577,9 @@ xfs_ail_push(
574 */ 577 */
575 smp_wmb(); 578 smp_wmb();
576 xfs_trans_ail_copy_lsn(ailp, &ailp->xa_target, &threshold_lsn); 579 xfs_trans_ail_copy_lsn(ailp, &ailp->xa_target, &threshold_lsn);
577 if (!test_and_set_bit(XFS_AIL_PUSHING_BIT, &ailp->xa_flags)) 580 smp_wmb();
578 queue_delayed_work(xfs_syncd_wq, &ailp->xa_work, 0); 581
582 wake_up_process(ailp->xa_task);
579} 583}
580 584
581/* 585/*
@@ -813,9 +817,18 @@ xfs_trans_ail_init(
813 INIT_LIST_HEAD(&ailp->xa_ail); 817 INIT_LIST_HEAD(&ailp->xa_ail);
814 INIT_LIST_HEAD(&ailp->xa_cursors); 818 INIT_LIST_HEAD(&ailp->xa_cursors);
815 spin_lock_init(&ailp->xa_lock); 819 spin_lock_init(&ailp->xa_lock);
816 INIT_DELAYED_WORK(&ailp->xa_work, xfs_ail_worker); 820
821 ailp->xa_task = kthread_run(xfsaild, ailp, "xfsaild/%s",
822 ailp->xa_mount->m_fsname);
823 if (IS_ERR(ailp->xa_task))
824 goto out_free_ailp;
825
817 mp->m_ail = ailp; 826 mp->m_ail = ailp;
818 return 0; 827 return 0;
828
829out_free_ailp:
830 kmem_free(ailp);
831 return ENOMEM;
819} 832}
820 833
821void 834void
@@ -824,6 +837,6 @@ xfs_trans_ail_destroy(
824{ 837{
825 struct xfs_ail *ailp = mp->m_ail; 838 struct xfs_ail *ailp = mp->m_ail;
826 839
827 cancel_delayed_work_sync(&ailp->xa_work); 840 kthread_stop(ailp->xa_task);
828 kmem_free(ailp); 841 kmem_free(ailp);
829} 842}
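[Editor's note: the xfs_trans_ail.c rework replaces the delayed-work requeue dance with a dedicated kthread: sleep for whatever timeout the last push asked for (0 means sleep until xfs_ail_push() wakes us), push, repeat. A userspace sketch of that control flow; kthread, freezer and scheduler details are elided and the push results are made up:]

#include <stdio.h>

static long push_once(int round)
{
	printf("push round %d\n", round);
	return round < 2 ? 20 : 0;	/* work remains twice, then idle */
}

int main(void)
{
	long tout = 0;			/* milliseconds, as in xfsaild() */

	for (int round = 0; round < 4; round++) {
		if (tout)
			printf("sleep %ld ms\n", tout);
		else
			printf("sleep until woken by xfs_ail_push()\n");
		tout = push_once(round);
	}
	return 0;
}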
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index 212946b97239..22750b5e4a8f 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -64,23 +64,17 @@ struct xfs_ail_cursor {
64 */ 64 */
65struct xfs_ail { 65struct xfs_ail {
66 struct xfs_mount *xa_mount; 66 struct xfs_mount *xa_mount;
67 struct task_struct *xa_task;
67 struct list_head xa_ail; 68 struct list_head xa_ail;
68 xfs_lsn_t xa_target; 69 xfs_lsn_t xa_target;
69 struct list_head xa_cursors; 70 struct list_head xa_cursors;
70 spinlock_t xa_lock; 71 spinlock_t xa_lock;
71 struct delayed_work xa_work;
72 xfs_lsn_t xa_last_pushed_lsn; 72 xfs_lsn_t xa_last_pushed_lsn;
73 unsigned long xa_flags;
74}; 73};
75 74
76#define XFS_AIL_PUSHING_BIT 0
77
78/* 75/*
79 * From xfs_trans_ail.c 76 * From xfs_trans_ail.c
80 */ 77 */
81
82extern struct workqueue_struct *xfs_ail_wq; /* AIL workqueue */
83
84void xfs_trans_ail_update_bulk(struct xfs_ail *ailp, 78void xfs_trans_ail_update_bulk(struct xfs_ail *ailp,
85 struct xfs_ail_cursor *cur, 79 struct xfs_ail_cursor *cur,
86 struct xfs_log_item **log_items, int nr_items, 80 struct xfs_log_item **log_items, int nr_items,