Diffstat (limited to 'fs/btrfs')
46 files changed, 5421 insertions, 1518 deletions
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index ccd25ba7a9ac..9a8622a5b867 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -5,6 +5,9 @@ config BTRFS_FS
 	select ZLIB_DEFLATE
 	select LZO_COMPRESS
 	select LZO_DECOMPRESS
+	select RAID6_PQ
+	select XOR_BLOCKS
+
 	help
 	  Btrfs is a new filesystem with extents, writable snapshotting,
 	  support for multiple devices and many more features.
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 7df3e0f0ee51..3932224f99e9 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -8,7 +8,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
 	   export.o tree-log.o free-space-cache.o zlib.o lzo.o \
 	   compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
-	   reada.o backref.o ulist.o qgroup.o send.o dev-replace.o
+	   reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o

 btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
 btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 04edf69be875..bd605c87adfd 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -352,11 +352,8 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
 		err = __resolve_indirect_ref(fs_info, search_commit_root,
 					     time_seq, ref, parents,
 					     extent_item_pos);
-		if (err) {
-			if (ret == 0)
-				ret = err;
+		if (err)
 			continue;
-		}

 		/* we put the first parent into the ref at hand */
 		ULIST_ITER_INIT(&uiter);
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index d61feca79455..310a7f6d09b1 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -19,7 +19,7 @@
 #ifndef __BTRFS_BACKREF__
 #define __BTRFS_BACKREF__

-#include "ioctl.h"
+#include <linux/btrfs.h>
 #include "ulist.h"
 #include "extent_io.h"

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 2a8c242bc4f5..d9b97d4960e6 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -40,6 +40,8 @@
 #define BTRFS_INODE_HAS_ASYNC_EXTENT		6
 #define BTRFS_INODE_NEEDS_FULL_SYNC		7
 #define BTRFS_INODE_COPY_EVERYTHING		8
+#define BTRFS_INODE_IN_DELALLOC_LIST		9
+#define BTRFS_INODE_READDIO_NEED_LOCK		10

 /* in memory btrfs inode */
 struct btrfs_inode {
@@ -216,4 +218,22 @@ static inline int btrfs_inode_in_log(struct inode *inode, u64 generation)
 	return 0;
 }

+/*
+ * Disable DIO read nolock optimization, so new dio readers will be forced
+ * to grab i_mutex. It is used to avoid the endless truncate due to
+ * nonlocked dio read.
+ */
+static inline void btrfs_inode_block_unlocked_dio(struct inode *inode)
+{
+	set_bit(BTRFS_INODE_READDIO_NEED_LOCK, &BTRFS_I(inode)->runtime_flags);
+	smp_mb();
+}
+
+static inline void btrfs_inode_resume_unlocked_dio(struct inode *inode)
+{
+	smp_mb__before_clear_bit();
+	clear_bit(BTRFS_INODE_READDIO_NEED_LOCK,
+		  &BTRFS_I(inode)->runtime_flags);
+}
+
 #endif
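The smp_mb() in btrfs_inode_block_unlocked_dio() pairs with a barrier on the DIO read side: a reader must observe the bit before deciding it may skip i_mutex. A minimal sketch of that reader check (a hypothetical helper; the real test sits inline in the btrfs DIO read path, not in this hunk):

	/* hypothetical reader side, pairing with btrfs_inode_block_unlocked_dio() */
	static inline int btrfs_dio_read_needs_lock(struct inode *inode)
	{
		smp_mb();	/* pairs with the smp_mb() after set_bit() above */
		return test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
				&BTRFS_I(inode)->runtime_flags);
	}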
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 11d47bfb62b4..18af6f48781a 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -813,8 +813,7 @@ static int btrfsic_process_superblock_dev_mirror(
 			(bh->b_data + (dev_bytenr & 4095));

 	if (btrfs_super_bytenr(super_tmp) != dev_bytenr ||
-	    strncmp((char *)(&(super_tmp->magic)), BTRFS_MAGIC,
-		    sizeof(super_tmp->magic)) ||
+	    super_tmp->magic != cpu_to_le64(BTRFS_MAGIC) ||
 	    memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE) ||
 	    btrfs_super_nodesize(super_tmp) != state->metablock_size ||
 	    btrfs_super_leafsize(super_tmp) != state->metablock_size ||
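The strncmp() goes away because BTRFS_MAGIC becomes a u64 (see the ctree.h hunk below): the eight ASCII bytes "_BHRfS_M" read little-endian are exactly 0x4D5F53665248425F. A standalone userspace check of that equivalence (illustrative only, not part of the patch):

	#include <stdio.h>
	#include <stdint.h>
	#include <string.h>

	int main(void)
	{
		uint64_t magic = 0x4D5F53665248425FULL;	/* BTRFS_MAGIC */
		char buf[9] = { 0 };
		int i;

		/* the superblock stores the magic little-endian: low byte first */
		for (i = 0; i < 8; i++)
			buf[i] = (magic >> (8 * i)) & 0xff;
		printf("%s\n", buf);	/* prints _BHRfS_M */
		return strcmp(buf, "_BHRfS_M") != 0;
	}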
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 94ab2f80e7e3..15b94089abc4 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -372,7 +372,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
 		page = compressed_pages[pg_index];
 		page->mapping = inode->i_mapping;
 		if (bio->bi_size)
-			ret = io_tree->ops->merge_bio_hook(page, 0,
+			ret = io_tree->ops->merge_bio_hook(WRITE, page, 0,
 							   PAGE_CACHE_SIZE,
 							   bio, 0);
 		else
@@ -655,7 +655,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 		page->index = em_start >> PAGE_CACHE_SHIFT;

 		if (comp_bio->bi_size)
-			ret = tree->ops->merge_bio_hook(page, 0,
+			ret = tree->ops->merge_bio_hook(READ, page, 0,
 							PAGE_CACHE_SIZE,
 							comp_bio, 0);
 		else
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index eea5da7a2b9a..ecd25a1b4e51 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1138,6 +1138,7 @@ __tree_mod_log_rewind(struct extent_buffer *eb, u64 time_seq,
 		switch (tm->op) {
 		case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
 			BUG_ON(tm->slot < n);
+			/* Fallthrough */
 		case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
 		case MOD_LOG_KEY_REMOVE:
 			btrfs_set_node_key(eb, &tm->key, tm->slot);
@@ -1222,7 +1223,7 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,

 	__tree_mod_log_rewind(eb_rewin, time_seq, tm);
 	WARN_ON(btrfs_header_nritems(eb_rewin) >
-		BTRFS_NODEPTRS_PER_BLOCK(fs_info->fs_root));
+		BTRFS_NODEPTRS_PER_BLOCK(fs_info->tree_root));

 	return eb_rewin;
 }
@@ -1441,7 +1442,7 @@ int btrfs_comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2)
  */
 int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, struct extent_buffer *parent,
-		       int start_slot, int cache_only, u64 *last_ret,
+		       int start_slot, u64 *last_ret,
 		       struct btrfs_key *progress)
 {
 	struct extent_buffer *cur;
@@ -1461,8 +1462,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 	struct btrfs_disk_key disk_key;

 	parent_level = btrfs_header_level(parent);
-	if (cache_only && parent_level != 1)
-		return 0;

 	WARN_ON(trans->transaction != root->fs_info->running_transaction);
 	WARN_ON(trans->transid != root->fs_info->generation);
@@ -1508,10 +1507,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 		else
 			uptodate = 0;
 		if (!cur || !uptodate) {
-			if (cache_only) {
-				free_extent_buffer(cur);
-				continue;
-			}
 			if (!cur) {
 				cur = read_tree_block(root, blocknr,
 						      blocksize, gen);
@@ -4825,8 +4820,8 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)

 /*
  * A helper function to walk down the tree starting at min_key, and looking
- * for nodes or leaves that are either in cache or have a minimum
- * transaction id. This is used by the btree defrag code, and tree logging
+ * for nodes or leaves that are have a minimum transaction id.
+ * This is used by the btree defrag code, and tree logging
  *
  * This does not cow, but it does stuff the starting key it finds back
  * into min_key, so you can call btrfs_search_slot with cow=1 on the
@@ -4847,7 +4842,7 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
  */
 int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
 			 struct btrfs_key *max_key,
-			 struct btrfs_path *path, int cache_only,
+			 struct btrfs_path *path,
 			 u64 min_trans)
 {
 	struct extent_buffer *cur;
@@ -4887,15 +4882,12 @@ again:
 		if (sret && slot > 0)
 			slot--;
 		/*
-		 * check this node pointer against the cache_only and
-		 * min_trans parameters. If it isn't in cache or is too
-		 * old, skip to the next one.
+		 * check this node pointer against the min_trans parameters.
+		 * If it is too old, old, skip to the next one.
 		 */
 		while (slot < nritems) {
 			u64 blockptr;
 			u64 gen;
-			struct extent_buffer *tmp;
-			struct btrfs_disk_key disk_key;

 			blockptr = btrfs_node_blockptr(cur, slot);
 			gen = btrfs_node_ptr_generation(cur, slot);
@@ -4903,27 +4895,7 @@ again:
 				slot++;
 				continue;
 			}
-			if (!cache_only)
-				break;
-
-			if (max_key) {
-				btrfs_node_key(cur, &disk_key, slot);
-				if (comp_keys(&disk_key, max_key) >= 0) {
-					ret = 1;
-					goto out;
-				}
-			}
-
-			tmp = btrfs_find_tree_block(root, blockptr,
-					    btrfs_level_size(root, level - 1));
-
-			if (tmp && btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
-				free_extent_buffer(tmp);
-				break;
-			}
-			if (tmp)
-				free_extent_buffer(tmp);
-			slot++;
+			break;
 		}
 find_next_key:
 		/*
@@ -4934,7 +4906,7 @@ find_next_key:
 		path->slots[level] = slot;
 		btrfs_set_path_blocking(path);
 		sret = btrfs_find_next_key(root, path, min_key, level,
-					  cache_only, min_trans);
+					  min_trans);
 		if (sret == 0) {
 			btrfs_release_path(path);
 			goto again;
@@ -5399,8 +5371,7 @@ out:
 /*
  * this is similar to btrfs_next_leaf, but does not try to preserve
  * and fixup the path. It looks for and returns the next key in the
- * tree based on the current path and the cache_only and min_trans
- * parameters.
+ * tree based on the current path and the min_trans parameters.
  *
  * 0 is returned if another key is found, < 0 if there are any errors
  * and 1 is returned if there are no higher keys in the tree
@@ -5409,8 +5380,7 @@ out:
  * calling this function.
  */
 int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
-			struct btrfs_key *key, int level,
-			int cache_only, u64 min_trans)
+			struct btrfs_key *key, int level, u64 min_trans)
 {
 	int slot;
 	struct extent_buffer *c;
@@ -5461,22 +5431,8 @@ next:
 		if (level == 0)
 			btrfs_item_key_to_cpu(c, key, slot);
 		else {
-			u64 blockptr = btrfs_node_blockptr(c, slot);
 			u64 gen = btrfs_node_ptr_generation(c, slot);

-			if (cache_only) {
-				struct extent_buffer *cur;
-				cur = btrfs_find_tree_block(root, blockptr,
-					    btrfs_level_size(root, level - 1));
-				if (!cur ||
-				    btrfs_buffer_uptodate(cur, gen, 1) <= 0) {
-					slot++;
-					if (cur)
-						free_extent_buffer(cur);
-					goto next;
-				}
-				free_extent_buffer(cur);
-			}
 			if (gen < min_trans) {
 				slot++;
 				goto next;
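With cache_only gone, btrfs_search_forward() walks purely by generation. A hypothetical caller iterating everything newer than min_trans might look like this (simplified key advancement, not a call site from this patch; root and min_trans assumed in scope):

	struct btrfs_key min_key = { .objectid = 0, .type = 0, .offset = 0 };
	struct btrfs_key max_key = { .objectid = (u64)-1, .type = (u8)-1,
				     .offset = (u64)-1 };
	struct btrfs_path *path = btrfs_alloc_path();
	int ret;

	while (1) {
		ret = btrfs_search_forward(root, &min_key, &max_key, path,
					   min_trans);
		if (ret)		/* 1: no more keys, < 0: error */
			break;
		/* process path->nodes[0], slot path->slots[0] here */
		btrfs_release_path(path);
		min_key.offset++;	/* naive advance; real callers step the key properly */
	}
	btrfs_free_path(path);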
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 547b7b05727f..0d82922179db 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -31,10 +31,10 @@
 #include <trace/events/btrfs.h>
 #include <asm/kmap_types.h>
 #include <linux/pagemap.h>
+#include <linux/btrfs.h>
 #include "extent_io.h"
 #include "extent_map.h"
 #include "async-thread.h"
-#include "ioctl.h"

 struct btrfs_trans_handle;
 struct btrfs_transaction;
@@ -46,7 +46,7 @@ extern struct kmem_cache *btrfs_path_cachep;
 extern struct kmem_cache *btrfs_free_space_cachep;
 struct btrfs_ordered_sum;

-#define BTRFS_MAGIC "_BHRfS_M"
+#define BTRFS_MAGIC 0x4D5F53665248425FULL /* ascii _BHRfS_M, no null */

 #define BTRFS_MAX_MIRRORS 3

@@ -191,6 +191,8 @@ static int btrfs_csum_sizes[] = { 4, 0 };
 /* ioprio of readahead is set to idle */
 #define BTRFS_IOPRIO_READA (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0))

+#define BTRFS_DIRTY_METADATA_THRESH	(32 * 1024 * 1024)
+
 /*
  * The key defines the order in the tree, and so it also defines (optimal)
  * block layout.
@@ -336,7 +338,10 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes)
 /*
  * File system states
  */
+#define BTRFS_FS_STATE_ERROR		0
+#define BTRFS_FS_STATE_REMOUNTING	1

+/* Super block flags */
 /* Errors detected */
 #define BTRFS_SUPER_FLAG_ERROR		(1ULL << 2)

@@ -502,6 +507,7 @@ struct btrfs_super_block {
 #define BTRFS_FEATURE_INCOMPAT_BIG_METADATA	(1ULL << 5)

 #define BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF	(1ULL << 6)
+#define BTRFS_FEATURE_INCOMPAT_RAID56		(1ULL << 7)

 #define BTRFS_FEATURE_COMPAT_SUPP	0ULL
 #define BTRFS_FEATURE_COMPAT_RO_SUPP	0ULL
@@ -511,6 +517,7 @@ struct btrfs_super_block {
	 BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS |		\
	 BTRFS_FEATURE_INCOMPAT_BIG_METADATA |		\
	 BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO |		\
+	 BTRFS_FEATURE_INCOMPAT_RAID56 |		\
	 BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)

 /*
@@ -952,8 +959,20 @@ struct btrfs_dev_replace_item {
 #define BTRFS_BLOCK_GROUP_RAID1		(1ULL << 4)
 #define BTRFS_BLOCK_GROUP_DUP		(1ULL << 5)
 #define BTRFS_BLOCK_GROUP_RAID10	(1ULL << 6)
+#define BTRFS_BLOCK_GROUP_RAID5		(1 << 7)
+#define BTRFS_BLOCK_GROUP_RAID6		(1 << 8)
 #define BTRFS_BLOCK_GROUP_RESERVED	BTRFS_AVAIL_ALLOC_BIT_SINGLE
-#define BTRFS_NR_RAID_TYPES		5
+
+enum btrfs_raid_types {
+	BTRFS_RAID_RAID10,
+	BTRFS_RAID_RAID1,
+	BTRFS_RAID_DUP,
+	BTRFS_RAID_RAID0,
+	BTRFS_RAID_SINGLE,
+	BTRFS_RAID_RAID5,
+	BTRFS_RAID_RAID6,
+	BTRFS_NR_RAID_TYPES
+};

 #define BTRFS_BLOCK_GROUP_TYPE_MASK	(BTRFS_BLOCK_GROUP_DATA |    \
 					 BTRFS_BLOCK_GROUP_SYSTEM |  \
@@ -961,6 +980,8 @@ struct btrfs_dev_replace_item {

 #define BTRFS_BLOCK_GROUP_PROFILE_MASK	(BTRFS_BLOCK_GROUP_RAID0 |   \
 					 BTRFS_BLOCK_GROUP_RAID1 |   \
+					 BTRFS_BLOCK_GROUP_RAID5 |   \
+					 BTRFS_BLOCK_GROUP_RAID6 |   \
 					 BTRFS_BLOCK_GROUP_DUP |     \
 					 BTRFS_BLOCK_GROUP_RAID10)
 /*
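The enum gives each profile a dense index (usable as an array slot), while the BTRFS_BLOCK_GROUP_* bits stay the on-disk representation. A flags-to-index helper in the spirit of this enum would look like the following (a sketch; the actual mapping function lives elsewhere in the btrfs sources):

	static int raid_index_from_flags(u64 flags)
	{
		if (flags & BTRFS_BLOCK_GROUP_RAID10)
			return BTRFS_RAID_RAID10;
		if (flags & BTRFS_BLOCK_GROUP_RAID1)
			return BTRFS_RAID_RAID1;
		if (flags & BTRFS_BLOCK_GROUP_DUP)
			return BTRFS_RAID_DUP;
		if (flags & BTRFS_BLOCK_GROUP_RAID0)
			return BTRFS_RAID_RAID0;
		if (flags & BTRFS_BLOCK_GROUP_RAID5)
			return BTRFS_RAID_RAID5;
		if (flags & BTRFS_BLOCK_GROUP_RAID6)
			return BTRFS_RAID_RAID6;
		return BTRFS_RAID_SINGLE;	/* no profile bit set */
	}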
@@ -1185,6 +1206,10 @@ struct btrfs_block_group_cache {
 	u64 flags;
 	u64 sectorsize;
 	u64 cache_generation;
+
+	/* for raid56, this is a full stripe, without parity */
+	unsigned long full_stripe_len;
+
 	unsigned int ro:1;
 	unsigned int dirty:1;
 	unsigned int iref:1;
@@ -1225,6 +1250,28 @@ struct seq_list {
 	u64 seq;
 };

+enum btrfs_orphan_cleanup_state {
+	ORPHAN_CLEANUP_STARTED	= 1,
+	ORPHAN_CLEANUP_DONE	= 2,
+};
+
+/* used by the raid56 code to lock stripes for read/modify/write */
+struct btrfs_stripe_hash {
+	struct list_head hash_list;
+	wait_queue_head_t wait;
+	spinlock_t lock;
+};
+
+/* used by the raid56 code to lock stripes for read/modify/write */
+struct btrfs_stripe_hash_table {
+	struct list_head stripe_cache;
+	spinlock_t cache_lock;
+	int cache_size;
+	struct btrfs_stripe_hash table[];
+};
+
+#define BTRFS_STRIPE_HASH_TABLE_BITS 11
+
 /* fs_info */
 struct reloc_control;
 struct btrfs_device;
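With BTRFS_STRIPE_HASH_TABLE_BITS of 11 the flexible table[] array gets 1 << 11 = 2048 buckets. Sizing the allocation would go roughly like this (a sketch, assuming the allocator lives in the new raid56 code, which also has to initialize each bucket's lock, list and waitqueue):

	int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
	struct btrfs_stripe_hash_table *table;

	table = kzalloc(sizeof(*table) +
			num_entries * sizeof(struct btrfs_stripe_hash),
			GFP_NOFS);
	if (!table)
		return -ENOMEM;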
@@ -1250,6 +1297,7 @@ struct btrfs_fs_info {

 	/* block group cache stuff */
 	spinlock_t block_group_cache_lock;
+	u64 first_logical_byte;
 	struct rb_root block_group_cache_tree;

 	/* keep track of unallocated space */
@@ -1288,7 +1336,23 @@ struct btrfs_fs_info {
 	u64 last_trans_log_full_commit;
 	unsigned long mount_opt;
 	unsigned long compress_type:4;
+	/*
+	 * It is a suggestive number, the read side is safe even it gets a
+	 * wrong number because we will write out the data into a regular
+	 * extent. The write side(mount/remount) is under ->s_umount lock,
+	 * so it is also safe.
+	 */
 	u64 max_inline;
+	/*
+	 * Protected by ->chunk_mutex and sb->s_umount.
+	 *
+	 * The reason that we use two lock to protect it is because only
+	 * remount and mount operations can change it and these two operations
+	 * are under sb->s_umount, but the read side (chunk allocation) can not
+	 * acquire sb->s_umount or the deadlock would happen. So we use two
+	 * locks to protect it. On the write side, we must acquire two locks,
+	 * and on the read side, we just need acquire one of them.
+	 */
 	u64 alloc_start;
 	struct btrfs_transaction *running_transaction;
 	wait_queue_head_t transaction_throttle;
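The two-lock rule for alloc_start can be made concrete with a pair of hypothetical accessors (illustrative only; the write side is mount/remount, so sb->s_umount is already held there):

	static void set_alloc_start(struct btrfs_fs_info *fs_info, u64 start)
	{
		/* write side: sb->s_umount held by the caller, take the second lock */
		mutex_lock(&fs_info->chunk_mutex);
		fs_info->alloc_start = start;
		mutex_unlock(&fs_info->chunk_mutex);
	}

	static u64 get_alloc_start(struct btrfs_fs_info *fs_info)
	{
		u64 start;

		/* read side: either lock suffices; chunk allocation takes this one */
		mutex_lock(&fs_info->chunk_mutex);
		start = fs_info->alloc_start;
		mutex_unlock(&fs_info->chunk_mutex);
		return start;
	}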
@@ -1307,6 +1371,13 @@ struct btrfs_fs_info {
 	struct mutex cleaner_mutex;
 	struct mutex chunk_mutex;
 	struct mutex volume_mutex;
+
+	/* this is used during read/modify/write to make sure
+	 * no two ios are trying to mod the same stripe at the same
+	 * time
+	 */
+	struct btrfs_stripe_hash_table *stripe_hash_table;
+
 	/*
 	 * this protects the ordered operations list only while we are
 	 * processing all of the entries on it. This way we make
@@ -1365,6 +1436,7 @@ struct btrfs_fs_info {
 	 */
 	struct list_head ordered_extents;

+	spinlock_t delalloc_lock;
 	/*
 	 * all of the inodes that have delalloc bytes. It is possible for
 	 * this list to be empty even when there is still dirty data=ordered
@@ -1373,13 +1445,6 @@ struct btrfs_fs_info {
 	struct list_head delalloc_inodes;

 	/*
-	 * special rename and truncate targets that must be on disk before
-	 * we're allowed to commit. This is basically the ext3 style
-	 * data=ordered list.
-	 */
-	struct list_head ordered_operations;
-
-	/*
 	 * there is a pool of worker threads for checksumming during writes
 	 * and a pool for checksumming after reads. This is because readers
 	 * can run with FS locks held, and the writers may be waiting for
@@ -1395,6 +1460,8 @@ struct btrfs_fs_info {
 	struct btrfs_workers flush_workers;
 	struct btrfs_workers endio_workers;
 	struct btrfs_workers endio_meta_workers;
+	struct btrfs_workers endio_raid56_workers;
+	struct btrfs_workers rmw_workers;
 	struct btrfs_workers endio_meta_write_workers;
 	struct btrfs_workers endio_write_workers;
 	struct btrfs_workers endio_freespace_worker;
@@ -1423,10 +1490,12 @@ struct btrfs_fs_info {

 	u64 total_pinned;

-	/* protected by the delalloc lock, used to keep from writing
-	 * metadata until there is a nice batch
-	 */
-	u64 dirty_metadata_bytes;
+	/* used to keep from writing metadata until there is a nice batch */
+	struct percpu_counter dirty_metadata_bytes;
+	struct percpu_counter delalloc_bytes;
+	s32 dirty_metadata_batch;
+	s32 delalloc_batch;
+
 	struct list_head dirty_cowonly_roots;

 	struct btrfs_fs_devices *fs_devices;
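The percpu counters trade an exact, lock-protected total for cheap unsynchronized updates, with the s32 batch fields bounding per-CPU drift. Call sites follow this pattern (an illustrative sketch; btrfs_btree_balance_dirty() stands in for whatever writeback kick the caller uses):

	static void account_dirty_metadata(struct btrfs_fs_info *fs_info, s64 len)
	{
		/* no global lock: the delta folds into the per-CPU counter */
		__percpu_counter_add(&fs_info->dirty_metadata_bytes, len,
				     fs_info->dirty_metadata_batch);

		/* the flush decision reads an approximate sum */
		if (percpu_counter_compare(&fs_info->dirty_metadata_bytes,
					   BTRFS_DIRTY_METADATA_THRESH) > 0)
			btrfs_btree_balance_dirty(fs_info->tree_root);
	}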
@@ -1442,9 +1511,6 @@ struct btrfs_fs_info {

 	struct reloc_control *reloc_ctl;

-	spinlock_t delalloc_lock;
-	u64 delalloc_bytes;
-
 	/* data_alloc_cluster is only used in ssd mode */
 	struct btrfs_free_cluster data_alloc_cluster;

@@ -1456,6 +1522,8 @@ struct btrfs_fs_info {
 	struct rb_root defrag_inodes;
 	atomic_t defrag_running;

+	/* Used to protect avail_{data, metadata, system}_alloc_bits */
+	seqlock_t profiles_lock;
 	/*
 	 * these three are in extended format (availability of single
 	 * chunks is denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other
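Readers of the avail_*_alloc_bits retry under the seqlock instead of blocking the writer; a minimal reader sketch (hypothetical helper):

	static u64 read_avail_data_alloc_bits(struct btrfs_fs_info *fs_info)
	{
		u64 bits;
		unsigned int seq;

		do {
			seq = read_seqbegin(&fs_info->profiles_lock);
			bits = fs_info->avail_data_alloc_bits;
		} while (read_seqretry(&fs_info->profiles_lock, seq));

		return bits;
	}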
@@ -1520,7 +1588,7 @@ struct btrfs_fs_info {
 	u64 qgroup_seq;

 	/* filesystem state */
-	u64 fs_state;
+	unsigned long fs_state;

 	struct btrfs_delayed_root *delayed_root;

@@ -1623,6 +1691,9 @@ struct btrfs_root {

 	struct list_head root_list;

+	spinlock_t log_extents_lock[2];
+	struct list_head logged_list[2];
+
 	spinlock_t orphan_lock;
 	atomic_t orphan_inodes;
 	struct btrfs_block_rsv *orphan_block_rsv;
@@ -1832,6 +1903,7 @@ struct btrfs_ioctl_defrag_range_args {

 #define btrfs_clear_opt(o, opt)		((o) &= ~BTRFS_MOUNT_##opt)
 #define btrfs_set_opt(o, opt)		((o) |= BTRFS_MOUNT_##opt)
+#define btrfs_raw_test_opt(o, opt)	((o) & BTRFS_MOUNT_##opt)
 #define btrfs_test_opt(root, opt)	((root)->fs_info->mount_opt & \
 					 BTRFS_MOUNT_##opt)
 /*
@@ -2936,8 +3008,7 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
 			     u64 num_bytes, u64 *refs, u64 *flags);
 int btrfs_pin_extent(struct btrfs_root *root,
 		     u64 bytenr, u64 num, int reserved);
-int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
-				    struct btrfs_root *root,
+int btrfs_pin_extent_for_log_replay(struct btrfs_root *root,
 				    u64 bytenr, u64 num_bytes);
 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root,
@@ -3035,8 +3106,13 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
 int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
 				  struct inode *inode);
 void btrfs_orphan_release_metadata(struct inode *inode);
-int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
-				struct btrfs_pending_snapshot *pending);
+int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
+				     struct btrfs_block_rsv *rsv,
+				     int nitems,
+				     u64 *qgroup_reserved);
+void btrfs_subvolume_release_metadata(struct btrfs_root *root,
+				      struct btrfs_block_rsv *rsv,
+				      u64 qgroup_reserved);
 int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes);
 void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes);
 int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes);
@@ -3092,10 +3168,10 @@ struct extent_buffer *btrfs_root_node(struct btrfs_root *root);
 struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root);
 int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
 			struct btrfs_key *key, int lowest_level,
-			int cache_only, u64 min_trans);
+			u64 min_trans);
 int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
 			 struct btrfs_key *max_key,
-			 struct btrfs_path *path, int cache_only,
+			 struct btrfs_path *path,
 			 u64 min_trans);
 enum btrfs_compare_tree_result {
 	BTRFS_COMPARE_TREE_NEW,
@@ -3148,7 +3224,7 @@ int btrfs_search_slot_for_read(struct btrfs_root *root,
 			       int find_higher, int return_any);
 int btrfs_realloc_node(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, struct extent_buffer *parent,
-		       int start_slot, int cache_only, u64 *last_ret,
+		       int start_slot, u64 *last_ret,
 		       struct btrfs_key *progress);
 void btrfs_release_path(struct btrfs_path *p);
 struct btrfs_path *btrfs_alloc_path(void);
@@ -3459,9 +3535,9 @@ int btrfs_writepages(struct address_space *mapping,
 		     struct writeback_control *wbc);
 int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *new_root, u64 new_dirid);
-int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
-			 size_t size, struct bio *bio, unsigned long bio_flags);
-
+int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset,
+			 size_t size, struct bio *bio,
+			 unsigned long bio_flags);
 int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 int btrfs_readpage(struct file *file, struct page *page);
 void btrfs_evict_inode(struct inode *inode);
@@ -3543,7 +3619,7 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,

 /* tree-defrag.c */
 int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
-			struct btrfs_root *root, int cache_only);
+			struct btrfs_root *root);

 /* sysfs.c */
 int btrfs_init_sysfs(void);
@@ -3620,11 +3696,14 @@ __printf(5, 6)
 void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
 		   unsigned int line, int errno, const char *fmt, ...);

+/*
+ * If BTRFS_MOUNT_PANIC_ON_FATAL_ERROR is in mount_opt, __btrfs_panic
+ * will panic(). Otherwise we BUG() here.
+ */
 #define btrfs_panic(fs_info, errno, fmt, args...)			\
 do {									\
-	struct btrfs_fs_info *_i = (fs_info);				\
-	__btrfs_panic(_i, __func__, __LINE__, errno, fmt, ##args);	\
-	BUG_ON(!(_i->mount_opt & BTRFS_MOUNT_PANIC_ON_FATAL_ERROR));	\
+	__btrfs_panic(fs_info, __func__, __LINE__, errno, fmt, ##args);	\
+	BUG();								\
 } while (0)

 /* acl.c */
@@ -3745,4 +3824,11 @@ static inline int is_fstree(u64 rootid)
 		return 1;
 	return 0;
 }
+
+static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info)
+{
+	return signal_pending(current);
+}
+
+
 #endif
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 34836036f01b..0b278b117cbe 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -875,7 +875,6 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
 				     struct btrfs_delayed_item *delayed_item)
 {
 	struct extent_buffer *leaf;
-	struct btrfs_item *item;
 	char *ptr;
 	int ret;

@@ -886,7 +885,6 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,

 	leaf = path->nodes[0];

-	item = btrfs_item_nr(leaf, path->slots[0]);
 	ptr = btrfs_item_ptr(leaf, path->slots[0], char);

 	write_extent_buffer(leaf, delayed_item->data, (unsigned long)ptr,
@@ -1065,32 +1063,25 @@ static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node)
 	}
 }

-static int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
-				      struct btrfs_root *root,
-				      struct btrfs_path *path,
-				      struct btrfs_delayed_node *node)
+static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
+					struct btrfs_root *root,
+					struct btrfs_path *path,
+					struct btrfs_delayed_node *node)
 {
 	struct btrfs_key key;
 	struct btrfs_inode_item *inode_item;
 	struct extent_buffer *leaf;
 	int ret;

-	mutex_lock(&node->mutex);
-	if (!node->inode_dirty) {
-		mutex_unlock(&node->mutex);
-		return 0;
-	}
-
 	key.objectid = node->inode_id;
 	btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
 	key.offset = 0;
+
 	ret = btrfs_lookup_inode(trans, root, path, &key, 1);
 	if (ret > 0) {
 		btrfs_release_path(path);
-		mutex_unlock(&node->mutex);
 		return -ENOENT;
 	} else if (ret < 0) {
-		mutex_unlock(&node->mutex);
 		return ret;
 	}

@@ -1105,11 +1096,47 @@ static int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,

 	btrfs_delayed_inode_release_metadata(root, node);
 	btrfs_release_delayed_inode(node);
-	mutex_unlock(&node->mutex);

 	return 0;
 }

+static inline int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
+					     struct btrfs_root *root,
+					     struct btrfs_path *path,
+					     struct btrfs_delayed_node *node)
+{
+	int ret;
+
+	mutex_lock(&node->mutex);
+	if (!node->inode_dirty) {
+		mutex_unlock(&node->mutex);
+		return 0;
+	}
+
+	ret = __btrfs_update_delayed_inode(trans, root, path, node);
+	mutex_unlock(&node->mutex);
+	return ret;
+}
+
+static inline int
+__btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
+				   struct btrfs_path *path,
+				   struct btrfs_delayed_node *node)
+{
+	int ret;
+
+	ret = btrfs_insert_delayed_items(trans, path, node->root, node);
+	if (ret)
+		return ret;
+
+	ret = btrfs_delete_delayed_items(trans, path, node->root, node);
+	if (ret)
+		return ret;
+
+	ret = btrfs_update_delayed_inode(trans, node->root, path, node);
+	return ret;
+}
+
 /*
  * Called when committing the transaction.
  * Returns 0 on success.
@@ -1119,7 +1146,6 @@ static int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
 static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
 				     struct btrfs_root *root, int nr)
 {
-	struct btrfs_root *curr_root = root;
 	struct btrfs_delayed_root *delayed_root;
 	struct btrfs_delayed_node *curr_node, *prev_node;
 	struct btrfs_path *path;
@@ -1142,15 +1168,8 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans,

 	curr_node = btrfs_first_delayed_node(delayed_root);
 	while (curr_node && (!count || (count && nr--))) {
-		curr_root = curr_node->root;
-		ret = btrfs_insert_delayed_items(trans, path, curr_root,
-						 curr_node);
-		if (!ret)
-			ret = btrfs_delete_delayed_items(trans, path,
-						curr_root, curr_node);
-		if (!ret)
-			ret = btrfs_update_delayed_inode(trans, curr_root,
-						path, curr_node);
+		ret = __btrfs_commit_inode_delayed_items(trans, path,
+							 curr_node);
 		if (ret) {
 			btrfs_release_delayed_node(curr_node);
 			curr_node = NULL;
@@ -1183,51 +1202,93 @@ int btrfs_run_delayed_items_nr(struct btrfs_trans_handle *trans,
 	return __btrfs_run_delayed_items(trans, root, nr);
 }

-static int __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
-					      struct btrfs_delayed_node *node)
+int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
+				     struct inode *inode)
 {
+	struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);
 	struct btrfs_path *path;
 	struct btrfs_block_rsv *block_rsv;
 	int ret;

+	if (!delayed_node)
+		return 0;
+
+	mutex_lock(&delayed_node->mutex);
+	if (!delayed_node->count) {
+		mutex_unlock(&delayed_node->mutex);
+		btrfs_release_delayed_node(delayed_node);
+		return 0;
+	}
+	mutex_unlock(&delayed_node->mutex);
+
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
 	path->leave_spinning = 1;

 	block_rsv = trans->block_rsv;
-	trans->block_rsv = &node->root->fs_info->delayed_block_rsv;
+	trans->block_rsv = &delayed_node->root->fs_info->delayed_block_rsv;

-	ret = btrfs_insert_delayed_items(trans, path, node->root, node);
-	if (!ret)
-		ret = btrfs_delete_delayed_items(trans, path, node->root, node);
-	if (!ret)
-		ret = btrfs_update_delayed_inode(trans, node->root, path, node);
-	btrfs_free_path(path);
+	ret = __btrfs_commit_inode_delayed_items(trans, path, delayed_node);

+	btrfs_release_delayed_node(delayed_node);
+	btrfs_free_path(path);
 	trans->block_rsv = block_rsv;
+
 	return ret;
 }

-int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
-				     struct inode *inode)
+int btrfs_commit_inode_delayed_inode(struct inode *inode)
 {
+	struct btrfs_trans_handle *trans;
 	struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);
+	struct btrfs_path *path;
+	struct btrfs_block_rsv *block_rsv;
 	int ret;

 	if (!delayed_node)
 		return 0;

 	mutex_lock(&delayed_node->mutex);
-	if (!delayed_node->count) {
+	if (!delayed_node->inode_dirty) {
 		mutex_unlock(&delayed_node->mutex);
 		btrfs_release_delayed_node(delayed_node);
 		return 0;
 	}
 	mutex_unlock(&delayed_node->mutex);

-	ret = __btrfs_commit_inode_delayed_items(trans, delayed_node);
+	trans = btrfs_join_transaction(delayed_node->root);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		goto out;
+	}
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		ret = -ENOMEM;
+		goto trans_out;
+	}
+	path->leave_spinning = 1;
+
+	block_rsv = trans->block_rsv;
+	trans->block_rsv = &delayed_node->root->fs_info->delayed_block_rsv;
+
+	mutex_lock(&delayed_node->mutex);
+	if (delayed_node->inode_dirty)
+		ret = __btrfs_update_delayed_inode(trans, delayed_node->root,
+						   path, delayed_node);
+	else
+		ret = 0;
+	mutex_unlock(&delayed_node->mutex);
+
+	btrfs_free_path(path);
+	trans->block_rsv = block_rsv;
+trans_out:
+	btrfs_end_transaction(trans, delayed_node->root);
+	btrfs_btree_balance_dirty(delayed_node->root);
+out:
 	btrfs_release_delayed_node(delayed_node);
+
 	return ret;
 }

@@ -1258,7 +1319,6 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work)
 	struct btrfs_root *root;
 	struct btrfs_block_rsv *block_rsv;
 	int need_requeue = 0;
-	int ret;

 	async_node = container_of(work, struct btrfs_async_delayed_node, work);

@@ -1277,14 +1337,7 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work)
 	block_rsv = trans->block_rsv;
 	trans->block_rsv = &root->fs_info->delayed_block_rsv;

-	ret = btrfs_insert_delayed_items(trans, path, root, delayed_node);
-	if (!ret)
-		ret = btrfs_delete_delayed_items(trans, path, root,
-						 delayed_node);
-
-	if (!ret)
-		btrfs_update_delayed_inode(trans, root, path, delayed_node);
-
+	__btrfs_commit_inode_delayed_items(trans, path, delayed_node);
 	/*
 	 * Maybe new delayed items have been inserted, so we need requeue
 	 * the work. Besides that, we must dequeue the empty delayed nodes
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
index 4f808e1baeed..78b6ad0fc669 100644
--- a/fs/btrfs/delayed-inode.h
+++ b/fs/btrfs/delayed-inode.h
@@ -117,6 +117,7 @@ int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
 /* Used for evicting the inode. */
 void btrfs_remove_delayed_node(struct inode *inode);
 void btrfs_kill_delayed_inode_items(struct inode *inode);
+int btrfs_commit_inode_delayed_inode(struct inode *inode);


 int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
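The new entry point flushes only the dirty inode item and, as the delayed-inode.c hunk above shows, joins its own transaction internally, so the caller needs no trans handle. A hypothetical caller on the eviction path would use it roughly as:

	/* during inode eviction: push just the delayed inode item to disk */
	static int example_flush_delayed_inode(struct inode *inode)
	{
		return btrfs_commit_inode_delayed_inode(inode);
	}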
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index ae9411773397..b7a0641ead77 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -23,6 +23,10 @@
 #include "delayed-ref.h"
 #include "transaction.h"

+struct kmem_cache *btrfs_delayed_ref_head_cachep;
+struct kmem_cache *btrfs_delayed_tree_ref_cachep;
+struct kmem_cache *btrfs_delayed_data_ref_cachep;
+struct kmem_cache *btrfs_delayed_extent_op_cachep;
 /*
  * delayed back reference update tracking. For subvolume trees
  * we queue up extent allocations and backref maintenance for
@@ -422,6 +426,14 @@ again:
 	return 1;
 }

+void btrfs_release_ref_cluster(struct list_head *cluster)
+{
+	struct list_head *pos, *q;
+
+	list_for_each_safe(pos, q, cluster)
+		list_del_init(pos);
+}
+
 /*
  * helper function to update an extent delayed ref in the
  * rbtree. existing and update must both have the same
@@ -511,7 +523,7 @@ update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
 					ref->extent_op->flags_to_set;
 			existing_ref->extent_op->update_flags = 1;
 		}
-		kfree(ref->extent_op);
+		btrfs_free_delayed_extent_op(ref->extent_op);
 	}
 }
 /*
@@ -592,7 +604,7 @@ static noinline void add_delayed_ref_head(struct btrfs_fs_info *fs_info,
 		 * we've updated the existing ref, free the newly
 		 * allocated ref
 		 */
-		kfree(head_ref);
+		kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
 	} else {
 		delayed_refs->num_heads++;
 		delayed_refs->num_heads_ready++;
@@ -653,7 +665,7 @@ static noinline void add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
 		 * we've updated the existing ref, free the newly
 		 * allocated ref
 		 */
-		kfree(full_ref);
+		kmem_cache_free(btrfs_delayed_tree_ref_cachep, full_ref);
 	} else {
 		delayed_refs->num_entries++;
 		trans->delayed_ref_updates++;
@@ -714,7 +726,7 @@ static noinline void add_delayed_data_ref(struct btrfs_fs_info *fs_info,
 		 * we've updated the existing ref, free the newly
 		 * allocated ref
 		 */
-		kfree(full_ref);
+		kmem_cache_free(btrfs_delayed_data_ref_cachep, full_ref);
 	} else {
 		delayed_refs->num_entries++;
 		trans->delayed_ref_updates++;
| @@ -738,13 +750,13 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, | |||
| 738 | struct btrfs_delayed_ref_root *delayed_refs; | 750 | struct btrfs_delayed_ref_root *delayed_refs; |
| 739 | 751 | ||
| 740 | BUG_ON(extent_op && extent_op->is_data); | 752 | BUG_ON(extent_op && extent_op->is_data); |
| 741 | ref = kmalloc(sizeof(*ref), GFP_NOFS); | 753 | ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS); |
| 742 | if (!ref) | 754 | if (!ref) |
| 743 | return -ENOMEM; | 755 | return -ENOMEM; |
| 744 | 756 | ||
| 745 | head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS); | 757 | head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS); |
| 746 | if (!head_ref) { | 758 | if (!head_ref) { |
| 747 | kfree(ref); | 759 | kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref); |
| 748 | return -ENOMEM; | 760 | return -ENOMEM; |
| 749 | } | 761 | } |
| 750 | 762 | ||
| @@ -786,13 +798,13 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, | |||
| 786 | struct btrfs_delayed_ref_root *delayed_refs; | 798 | struct btrfs_delayed_ref_root *delayed_refs; |
| 787 | 799 | ||
| 788 | BUG_ON(extent_op && !extent_op->is_data); | 800 | BUG_ON(extent_op && !extent_op->is_data); |
| 789 | ref = kmalloc(sizeof(*ref), GFP_NOFS); | 801 | ref = kmem_cache_alloc(btrfs_delayed_data_ref_cachep, GFP_NOFS); |
| 790 | if (!ref) | 802 | if (!ref) |
| 791 | return -ENOMEM; | 803 | return -ENOMEM; |
| 792 | 804 | ||
| 793 | head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS); | 805 | head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS); |
| 794 | if (!head_ref) { | 806 | if (!head_ref) { |
| 795 | kfree(ref); | 807 | kmem_cache_free(btrfs_delayed_data_ref_cachep, ref); |
| 796 | return -ENOMEM; | 808 | return -ENOMEM; |
| 797 | } | 809 | } |
| 798 | 810 | ||
| @@ -826,7 +838,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, | |||
| 826 | struct btrfs_delayed_ref_head *head_ref; | 838 | struct btrfs_delayed_ref_head *head_ref; |
| 827 | struct btrfs_delayed_ref_root *delayed_refs; | 839 | struct btrfs_delayed_ref_root *delayed_refs; |
| 828 | 840 | ||
| 829 | head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS); | 841 | head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS); |
| 830 | if (!head_ref) | 842 | if (!head_ref) |
| 831 | return -ENOMEM; | 843 | return -ENOMEM; |
| 832 | 844 | ||
| @@ -860,3 +872,51 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr) | |||
| 860 | return btrfs_delayed_node_to_head(ref); | 872 | return btrfs_delayed_node_to_head(ref); |
| 861 | return NULL; | 873 | return NULL; |
| 862 | } | 874 | } |
| 875 | |||
| 876 | void btrfs_delayed_ref_exit(void) | ||
| 877 | { | ||
| 878 | if (btrfs_delayed_ref_head_cachep) | ||
| 879 | kmem_cache_destroy(btrfs_delayed_ref_head_cachep); | ||
| 880 | if (btrfs_delayed_tree_ref_cachep) | ||
| 881 | kmem_cache_destroy(btrfs_delayed_tree_ref_cachep); | ||
| 882 | if (btrfs_delayed_data_ref_cachep) | ||
| 883 | kmem_cache_destroy(btrfs_delayed_data_ref_cachep); | ||
| 884 | if (btrfs_delayed_extent_op_cachep) | ||
| 885 | kmem_cache_destroy(btrfs_delayed_extent_op_cachep); | ||
| 886 | } | ||
| 887 | |||
| 888 | int btrfs_delayed_ref_init(void) | ||
| 889 | { | ||
| 890 | btrfs_delayed_ref_head_cachep = kmem_cache_create( | ||
| 891 | "btrfs_delayed_ref_head", | ||
| 892 | sizeof(struct btrfs_delayed_ref_head), 0, | ||
| 893 | SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); | ||
| 894 | if (!btrfs_delayed_ref_head_cachep) | ||
| 895 | goto fail; | ||
| 896 | |||
| 897 | btrfs_delayed_tree_ref_cachep = kmem_cache_create( | ||
| 898 | "btrfs_delayed_tree_ref", | ||
| 899 | sizeof(struct btrfs_delayed_tree_ref), 0, | ||
| 900 | SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); | ||
| 901 | if (!btrfs_delayed_tree_ref_cachep) | ||
| 902 | goto fail; | ||
| 903 | |||
| 904 | btrfs_delayed_data_ref_cachep = kmem_cache_create( | ||
| 905 | "btrfs_delayed_data_ref", | ||
| 906 | sizeof(struct btrfs_delayed_data_ref), 0, | ||
| 907 | SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); | ||
| 908 | if (!btrfs_delayed_data_ref_cachep) | ||
| 909 | goto fail; | ||
| 910 | |||
| 911 | btrfs_delayed_extent_op_cachep = kmem_cache_create( | ||
| 912 | "btrfs_delayed_extent_op", | ||
| 913 | sizeof(struct btrfs_delayed_extent_op), 0, | ||
| 914 | SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL); | ||
| 915 | if (!btrfs_delayed_extent_op_cachep) | ||
| 916 | goto fail; | ||
| 917 | |||
| 918 | return 0; | ||
| 919 | fail: | ||
| 920 | btrfs_delayed_ref_exit(); | ||
| 921 | return -ENOMEM; | ||
| 922 | } | ||
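
The btrfs_delayed_ref_init()/btrfs_delayed_ref_exit() pair above follows the usual slab-cache lifecycle: create every cache up front and, on any failure, fall back to the exit path, which tolerates partial initialization by checking each pointer before destroying it (kmem_cache_destroy() is not NULL-safe on these kernels). A reduced sketch with a single hypothetical cache:

    #include <linux/errno.h>
    #include <linux/slab.h>
    #include <linux/types.h>

    struct demo_ref { u64 bytenr; };

    static struct kmem_cache *demo_ref_cachep;

    static void demo_ref_exit(void)
    {
        if (demo_ref_cachep)
            kmem_cache_destroy(demo_ref_cachep);
    }

    static int demo_ref_init(void)
    {
        demo_ref_cachep = kmem_cache_create("demo_ref",
                sizeof(struct demo_ref), 0,
                SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
        if (!demo_ref_cachep) {
            demo_ref_exit();    /* unwinds whatever was created */
            return -ENOMEM;
        }
        return 0;
    }
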
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index c9d703693df0..f75fcaf79aeb 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h | |||
| @@ -132,6 +132,15 @@ struct btrfs_delayed_ref_root { | |||
| 132 | unsigned long num_heads_ready; | 132 | unsigned long num_heads_ready; |
| 133 | 133 | ||
| 134 | /* | 134 | /* |
| 135 | * bumped when someone is making progress on the delayed | ||
| 136 | * refs, so that other procs know they are just adding to | ||
| 137 | * contention instead of helping | ||
| 138 | */ | ||
| 139 | atomic_t procs_running_refs; | ||
| 140 | atomic_t ref_seq; | ||
| 141 | wait_queue_head_t wait; | ||
| 142 | |||
| 143 | /* | ||
| 135 | * set when the tree is flushing before a transaction commit, | 144 | * set when the tree is flushing before a transaction commit, |
| 136 | * used by the throttling code to decide if new updates need | 145 | * used by the throttling code to decide if new updates need |
| 137 | * to be run right away | 146 | * to be run right away |
| @@ -141,12 +150,47 @@ struct btrfs_delayed_ref_root { | |||
| 141 | u64 run_delayed_start; | 150 | u64 run_delayed_start; |
| 142 | }; | 151 | }; |
| 143 | 152 | ||
| 153 | extern struct kmem_cache *btrfs_delayed_ref_head_cachep; | ||
| 154 | extern struct kmem_cache *btrfs_delayed_tree_ref_cachep; | ||
| 155 | extern struct kmem_cache *btrfs_delayed_data_ref_cachep; | ||
| 156 | extern struct kmem_cache *btrfs_delayed_extent_op_cachep; | ||
| 157 | |||
| 158 | int btrfs_delayed_ref_init(void); | ||
| 159 | void btrfs_delayed_ref_exit(void); | ||
| 160 | |||
| 161 | static inline struct btrfs_delayed_extent_op * | ||
| 162 | btrfs_alloc_delayed_extent_op(void) | ||
| 163 | { | ||
| 164 | return kmem_cache_alloc(btrfs_delayed_extent_op_cachep, GFP_NOFS); | ||
| 165 | } | ||
| 166 | |||
| 167 | static inline void | ||
| 168 | btrfs_free_delayed_extent_op(struct btrfs_delayed_extent_op *op) | ||
| 169 | { | ||
| 170 | if (op) | ||
| 171 | kmem_cache_free(btrfs_delayed_extent_op_cachep, op); | ||
| 172 | } | ||
| 173 | |||
| 144 | static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) | 174 | static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref) |
| 145 | { | 175 | { |
| 146 | WARN_ON(atomic_read(&ref->refs) == 0); | 176 | WARN_ON(atomic_read(&ref->refs) == 0); |
| 147 | if (atomic_dec_and_test(&ref->refs)) { | 177 | if (atomic_dec_and_test(&ref->refs)) { |
| 148 | WARN_ON(ref->in_tree); | 178 | WARN_ON(ref->in_tree); |
| 149 | kfree(ref); | 179 | switch (ref->type) { |
| 180 | case BTRFS_TREE_BLOCK_REF_KEY: | ||
| 181 | case BTRFS_SHARED_BLOCK_REF_KEY: | ||
| 182 | kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref); | ||
| 183 | break; | ||
| 184 | case BTRFS_EXTENT_DATA_REF_KEY: | ||
| 185 | case BTRFS_SHARED_DATA_REF_KEY: | ||
| 186 | kmem_cache_free(btrfs_delayed_data_ref_cachep, ref); | ||
| 187 | break; | ||
| 188 | case 0: | ||
| 189 | kmem_cache_free(btrfs_delayed_ref_head_cachep, ref); | ||
| 190 | break; | ||
| 191 | default: | ||
| 192 | BUG(); | ||
| 193 | } | ||
| 150 | } | 194 | } |
| 151 | } | 195 | } |
| 152 | 196 | ||
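
Since delayed refs now come from three different caches, btrfs_put_delayed_ref() has to dispatch on ref->type to hand each object back to the slab it was allocated from (type 0 marks a ref head). The same tagged-free idiom in isolation (all names hypothetical):

    #include <linux/bug.h>
    #include <linux/slab.h>

    enum node_kind { NODE_TREE, NODE_DATA };

    struct node {
        enum node_kind kind;
    };

    static struct kmem_cache *tree_cachep, *data_cachep;

    static void node_free(struct node *n)
    {
        switch (n->kind) {
        case NODE_TREE:
            kmem_cache_free(tree_cachep, n);
            break;
        case NODE_DATA:
            kmem_cache_free(data_cachep, n);
            break;
        default:
            BUG();    /* object freed to the wrong allocator */
        }
    }

The btrfs_free_delayed_extent_op() wrapper keeps an explicit NULL check because, unlike kfree(), kmem_cache_free() must not be passed NULL.
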
| @@ -176,8 +220,14 @@ struct btrfs_delayed_ref_head * | |||
| 176 | btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr); | 220 | btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr); |
| 177 | int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, | 221 | int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, |
| 178 | struct btrfs_delayed_ref_head *head); | 222 | struct btrfs_delayed_ref_head *head); |
| 223 | static inline void btrfs_delayed_ref_unlock(struct btrfs_delayed_ref_head *head) | ||
| 224 | { | ||
| 225 | mutex_unlock(&head->mutex); | ||
| 226 | } | ||
| 227 | |||
| 179 | int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, | 228 | int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, |
| 180 | struct list_head *cluster, u64 search_start); | 229 | struct list_head *cluster, u64 search_start); |
| 230 | void btrfs_release_ref_cluster(struct list_head *cluster); | ||
| 181 | 231 | ||
| 182 | int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, | 232 | int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, |
| 183 | struct btrfs_delayed_ref_root *delayed_refs, | 233 | struct btrfs_delayed_ref_root *delayed_refs, |
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 66dbc8dbddf7..7ba7b3900cb8 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c | |||
| @@ -465,7 +465,11 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, | |||
| 465 | * flush all outstanding I/O and inode extent mappings before the | 465 | * flush all outstanding I/O and inode extent mappings before the |
| 466 | * copy operation is declared as being finished | 466 | * copy operation is declared as being finished |
| 467 | */ | 467 | */ |
| 468 | btrfs_start_delalloc_inodes(root, 0); | 468 | ret = btrfs_start_delalloc_inodes(root, 0); |
| 469 | if (ret) { | ||
| 470 | mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); | ||
| 471 | return ret; | ||
| 472 | } | ||
| 469 | btrfs_wait_ordered_extents(root, 0); | 473 | btrfs_wait_ordered_extents(root, 0); |
| 470 | 474 | ||
| 471 | trans = btrfs_start_transaction(root, 0); | 475 | trans = btrfs_start_transaction(root, 0); |
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index a8f652dc940b..02369a3c162e 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c | |||
| @@ -46,6 +46,7 @@ | |||
| 46 | #include "check-integrity.h" | 46 | #include "check-integrity.h" |
| 47 | #include "rcu-string.h" | 47 | #include "rcu-string.h" |
| 48 | #include "dev-replace.h" | 48 | #include "dev-replace.h" |
| 49 | #include "raid56.h" | ||
| 49 | 50 | ||
| 50 | #ifdef CONFIG_X86 | 51 | #ifdef CONFIG_X86 |
| 51 | #include <asm/cpufeature.h> | 52 | #include <asm/cpufeature.h> |
| @@ -56,7 +57,8 @@ static void end_workqueue_fn(struct btrfs_work *work); | |||
| 56 | static void free_fs_root(struct btrfs_root *root); | 57 | static void free_fs_root(struct btrfs_root *root); |
| 57 | static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, | 58 | static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, |
| 58 | int read_only); | 59 | int read_only); |
| 59 | static void btrfs_destroy_ordered_operations(struct btrfs_root *root); | 60 | static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t, |
| 61 | struct btrfs_root *root); | ||
| 60 | static void btrfs_destroy_ordered_extents(struct btrfs_root *root); | 62 | static void btrfs_destroy_ordered_extents(struct btrfs_root *root); |
| 61 | static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, | 63 | static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, |
| 62 | struct btrfs_root *root); | 64 | struct btrfs_root *root); |
| @@ -420,7 +422,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root, | |||
| 420 | static int csum_dirty_buffer(struct btrfs_root *root, struct page *page) | 422 | static int csum_dirty_buffer(struct btrfs_root *root, struct page *page) |
| 421 | { | 423 | { |
| 422 | struct extent_io_tree *tree; | 424 | struct extent_io_tree *tree; |
| 423 | u64 start = (u64)page->index << PAGE_CACHE_SHIFT; | 425 | u64 start = page_offset(page); |
| 424 | u64 found_start; | 426 | u64 found_start; |
| 425 | struct extent_buffer *eb; | 427 | struct extent_buffer *eb; |
| 426 | 428 | ||
| @@ -639,8 +641,15 @@ err: | |||
| 639 | btree_readahead_hook(root, eb, eb->start, ret); | 641 | btree_readahead_hook(root, eb, eb->start, ret); |
| 640 | } | 642 | } |
| 641 | 643 | ||
| 642 | if (ret) | 644 | if (ret) { |
| 645 | /* | ||
| 646 | * our io error hook is going to dec the io pages | ||
| 647 | * again, we have to make sure it has something | ||
| 648 | * to decrement | ||
| 649 | */ | ||
| 650 | atomic_inc(&eb->io_pages); | ||
| 643 | clear_extent_buffer_uptodate(eb); | 651 | clear_extent_buffer_uptodate(eb); |
| 652 | } | ||
| 644 | free_extent_buffer(eb); | 653 | free_extent_buffer(eb); |
| 645 | out: | 654 | out: |
| 646 | return ret; | 655 | return ret; |
| @@ -654,6 +663,7 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror) | |||
| 654 | eb = (struct extent_buffer *)page->private; | 663 | eb = (struct extent_buffer *)page->private; |
| 655 | set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); | 664 | set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); |
| 656 | eb->read_mirror = failed_mirror; | 665 | eb->read_mirror = failed_mirror; |
| 666 | atomic_dec(&eb->io_pages); | ||
| 657 | if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) | 667 | if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) |
| 658 | btree_readahead_hook(root, eb, eb->start, -EIO); | 668 | btree_readahead_hook(root, eb, eb->start, -EIO); |
| 659 | return -EIO; /* we fixed nothing */ | 669 | return -EIO; /* we fixed nothing */ |
| @@ -670,17 +680,23 @@ static void end_workqueue_bio(struct bio *bio, int err) | |||
| 670 | end_io_wq->work.flags = 0; | 680 | end_io_wq->work.flags = 0; |
| 671 | 681 | ||
| 672 | if (bio->bi_rw & REQ_WRITE) { | 682 | if (bio->bi_rw & REQ_WRITE) { |
| 673 | if (end_io_wq->metadata == 1) | 683 | if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) |
| 674 | btrfs_queue_worker(&fs_info->endio_meta_write_workers, | 684 | btrfs_queue_worker(&fs_info->endio_meta_write_workers, |
| 675 | &end_io_wq->work); | 685 | &end_io_wq->work); |
| 676 | else if (end_io_wq->metadata == 2) | 686 | else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE) |
| 677 | btrfs_queue_worker(&fs_info->endio_freespace_worker, | 687 | btrfs_queue_worker(&fs_info->endio_freespace_worker, |
| 678 | &end_io_wq->work); | 688 | &end_io_wq->work); |
| 689 | else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) | ||
| 690 | btrfs_queue_worker(&fs_info->endio_raid56_workers, | ||
| 691 | &end_io_wq->work); | ||
| 679 | else | 692 | else |
| 680 | btrfs_queue_worker(&fs_info->endio_write_workers, | 693 | btrfs_queue_worker(&fs_info->endio_write_workers, |
| 681 | &end_io_wq->work); | 694 | &end_io_wq->work); |
| 682 | } else { | 695 | } else { |
| 683 | if (end_io_wq->metadata) | 696 | if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) |
| 697 | btrfs_queue_worker(&fs_info->endio_raid56_workers, | ||
| 698 | &end_io_wq->work); | ||
| 699 | else if (end_io_wq->metadata) | ||
| 684 | btrfs_queue_worker(&fs_info->endio_meta_workers, | 700 | btrfs_queue_worker(&fs_info->endio_meta_workers, |
| 685 | &end_io_wq->work); | 701 | &end_io_wq->work); |
| 686 | else | 702 | else |
| @@ -695,6 +711,7 @@ static void end_workqueue_bio(struct bio *bio, int err) | |||
| 695 | * 0 - if data | 711 | * 0 - if data |
| 696 | * 1 - if normal metadata | 712 | * 1 - if normal metadata |
| 697 | * 2 - if writing to the free space cache area | 713 | * 2 - if writing to the free space cache area |
| 714 | * 3 - raid parity work | ||
| 698 | */ | 715 | */ |
| 699 | int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, | 716 | int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, |
| 700 | int metadata) | 717 | int metadata) |
| @@ -946,18 +963,20 @@ static int btree_writepages(struct address_space *mapping, | |||
| 946 | struct writeback_control *wbc) | 963 | struct writeback_control *wbc) |
| 947 | { | 964 | { |
| 948 | struct extent_io_tree *tree; | 965 | struct extent_io_tree *tree; |
| 966 | struct btrfs_fs_info *fs_info; | ||
| 967 | int ret; | ||
| 968 | |||
| 949 | tree = &BTRFS_I(mapping->host)->io_tree; | 969 | tree = &BTRFS_I(mapping->host)->io_tree; |
| 950 | if (wbc->sync_mode == WB_SYNC_NONE) { | 970 | if (wbc->sync_mode == WB_SYNC_NONE) { |
| 951 | struct btrfs_root *root = BTRFS_I(mapping->host)->root; | ||
| 952 | u64 num_dirty; | ||
| 953 | unsigned long thresh = 32 * 1024 * 1024; | ||
| 954 | 971 | ||
| 955 | if (wbc->for_kupdate) | 972 | if (wbc->for_kupdate) |
| 956 | return 0; | 973 | return 0; |
| 957 | 974 | ||
| 975 | fs_info = BTRFS_I(mapping->host)->root->fs_info; | ||
| 958 | /* this is a bit racy, but that's ok */ | 976 | /* this is a bit racy, but that's ok */ |
| 959 | num_dirty = root->fs_info->dirty_metadata_bytes; | 977 | ret = percpu_counter_compare(&fs_info->dirty_metadata_bytes, |
| 960 | if (num_dirty < thresh) | 978 | BTRFS_DIRTY_METADATA_THRESH); |
| 979 | if (ret < 0) | ||
| 961 | return 0; | 980 | return 0; |
| 962 | } | 981 | } |
| 963 | return btree_write_cache_pages(mapping, wbc); | 982 | return btree_write_cache_pages(mapping, wbc); |
| @@ -1125,24 +1144,16 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, | |||
| 1125 | void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, | 1144 | void clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, |
| 1126 | struct extent_buffer *buf) | 1145 | struct extent_buffer *buf) |
| 1127 | { | 1146 | { |
| 1147 | struct btrfs_fs_info *fs_info = root->fs_info; | ||
| 1148 | |||
| 1128 | if (btrfs_header_generation(buf) == | 1149 | if (btrfs_header_generation(buf) == |
| 1129 | root->fs_info->running_transaction->transid) { | 1150 | fs_info->running_transaction->transid) { |
| 1130 | btrfs_assert_tree_locked(buf); | 1151 | btrfs_assert_tree_locked(buf); |
| 1131 | 1152 | ||
| 1132 | if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) { | 1153 | if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) { |
| 1133 | spin_lock(&root->fs_info->delalloc_lock); | 1154 | __percpu_counter_add(&fs_info->dirty_metadata_bytes, |
| 1134 | if (root->fs_info->dirty_metadata_bytes >= buf->len) | 1155 | -buf->len, |
| 1135 | root->fs_info->dirty_metadata_bytes -= buf->len; | 1156 | fs_info->dirty_metadata_batch); |
| 1136 | else { | ||
| 1137 | spin_unlock(&root->fs_info->delalloc_lock); | ||
| 1138 | btrfs_panic(root->fs_info, -EOVERFLOW, | ||
| 1139 | "Can't clear %lu bytes from " | ||
| 1140 | " dirty_mdatadata_bytes (%llu)", | ||
| 1141 | buf->len, | ||
| 1142 | root->fs_info->dirty_metadata_bytes); | ||
| 1143 | } | ||
| 1144 | spin_unlock(&root->fs_info->delalloc_lock); | ||
| 1145 | |||
| 1146 | /* ugh, clear_extent_buffer_dirty needs to lock the page */ | 1157 | /* ugh, clear_extent_buffer_dirty needs to lock the page */ |
| 1147 | btrfs_set_lock_blocking(buf); | 1158 | btrfs_set_lock_blocking(buf); |
| 1148 | clear_extent_buffer_dirty(buf); | 1159 | clear_extent_buffer_dirty(buf); |
| @@ -1178,9 +1189,13 @@ static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, | |||
| 1178 | 1189 | ||
| 1179 | INIT_LIST_HEAD(&root->dirty_list); | 1190 | INIT_LIST_HEAD(&root->dirty_list); |
| 1180 | INIT_LIST_HEAD(&root->root_list); | 1191 | INIT_LIST_HEAD(&root->root_list); |
| 1192 | INIT_LIST_HEAD(&root->logged_list[0]); | ||
| 1193 | INIT_LIST_HEAD(&root->logged_list[1]); | ||
| 1181 | spin_lock_init(&root->orphan_lock); | 1194 | spin_lock_init(&root->orphan_lock); |
| 1182 | spin_lock_init(&root->inode_lock); | 1195 | spin_lock_init(&root->inode_lock); |
| 1183 | spin_lock_init(&root->accounting_lock); | 1196 | spin_lock_init(&root->accounting_lock); |
| 1197 | spin_lock_init(&root->log_extents_lock[0]); | ||
| 1198 | spin_lock_init(&root->log_extents_lock[1]); | ||
| 1184 | mutex_init(&root->objectid_mutex); | 1199 | mutex_init(&root->objectid_mutex); |
| 1185 | mutex_init(&root->log_mutex); | 1200 | mutex_init(&root->log_mutex); |
| 1186 | init_waitqueue_head(&root->log_writer_wait); | 1201 | init_waitqueue_head(&root->log_writer_wait); |
| @@ -2004,10 +2019,24 @@ int open_ctree(struct super_block *sb, | |||
| 2004 | goto fail_srcu; | 2019 | goto fail_srcu; |
| 2005 | } | 2020 | } |
| 2006 | 2021 | ||
| 2022 | ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0); | ||
| 2023 | if (ret) { | ||
| 2024 | err = ret; | ||
| 2025 | goto fail_bdi; | ||
| 2026 | } | ||
| 2027 | fs_info->dirty_metadata_batch = PAGE_CACHE_SIZE * | ||
| 2028 | (1 + ilog2(nr_cpu_ids)); | ||
| 2029 | |||
| 2030 | ret = percpu_counter_init(&fs_info->delalloc_bytes, 0); | ||
| 2031 | if (ret) { | ||
| 2032 | err = ret; | ||
| 2033 | goto fail_dirty_metadata_bytes; | ||
| 2034 | } | ||
| 2035 | |||
| 2007 | fs_info->btree_inode = new_inode(sb); | 2036 | fs_info->btree_inode = new_inode(sb); |
| 2008 | if (!fs_info->btree_inode) { | 2037 | if (!fs_info->btree_inode) { |
| 2009 | err = -ENOMEM; | 2038 | err = -ENOMEM; |
| 2010 | goto fail_bdi; | 2039 | goto fail_delalloc_bytes; |
| 2011 | } | 2040 | } |
| 2012 | 2041 | ||
| 2013 | mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS); | 2042 | mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS); |
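
The two percpu counters are created before the btree inode so every later failure path can unwind them in reverse order through the new fail_delalloc_bytes and fail_dirty_metadata_bytes labels. The ordering, reduced to a sketch:

    #include <linux/percpu_counter.h>

    static int counters_init(struct btrfs_fs_info *fs_info)
    {
        int ret;

        ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0);
        if (ret)
            return ret;

        ret = percpu_counter_init(&fs_info->delalloc_bytes, 0);
        if (ret)
            goto fail_dirty_metadata_bytes;

        return 0;

    fail_dirty_metadata_bytes:
        /* release only what was set up before the failure */
        percpu_counter_destroy(&fs_info->dirty_metadata_bytes);
        return ret;
    }
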
| @@ -2017,7 +2046,6 @@ int open_ctree(struct super_block *sb, | |||
| 2017 | INIT_LIST_HEAD(&fs_info->dead_roots); | 2046 | INIT_LIST_HEAD(&fs_info->dead_roots); |
| 2018 | INIT_LIST_HEAD(&fs_info->delayed_iputs); | 2047 | INIT_LIST_HEAD(&fs_info->delayed_iputs); |
| 2019 | INIT_LIST_HEAD(&fs_info->delalloc_inodes); | 2048 | INIT_LIST_HEAD(&fs_info->delalloc_inodes); |
| 2020 | INIT_LIST_HEAD(&fs_info->ordered_operations); | ||
| 2021 | INIT_LIST_HEAD(&fs_info->caching_block_groups); | 2049 | INIT_LIST_HEAD(&fs_info->caching_block_groups); |
| 2022 | spin_lock_init(&fs_info->delalloc_lock); | 2050 | spin_lock_init(&fs_info->delalloc_lock); |
| 2023 | spin_lock_init(&fs_info->trans_lock); | 2051 | spin_lock_init(&fs_info->trans_lock); |
| @@ -2028,6 +2056,7 @@ int open_ctree(struct super_block *sb, | |||
| 2028 | spin_lock_init(&fs_info->tree_mod_seq_lock); | 2056 | spin_lock_init(&fs_info->tree_mod_seq_lock); |
| 2029 | rwlock_init(&fs_info->tree_mod_log_lock); | 2057 | rwlock_init(&fs_info->tree_mod_log_lock); |
| 2030 | mutex_init(&fs_info->reloc_mutex); | 2058 | mutex_init(&fs_info->reloc_mutex); |
| 2059 | seqlock_init(&fs_info->profiles_lock); | ||
| 2031 | 2060 | ||
| 2032 | init_completion(&fs_info->kobj_unregister); | 2061 | init_completion(&fs_info->kobj_unregister); |
| 2033 | INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); | 2062 | INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); |
| @@ -2126,6 +2155,7 @@ int open_ctree(struct super_block *sb, | |||
| 2126 | 2155 | ||
| 2127 | spin_lock_init(&fs_info->block_group_cache_lock); | 2156 | spin_lock_init(&fs_info->block_group_cache_lock); |
| 2128 | fs_info->block_group_cache_tree = RB_ROOT; | 2157 | fs_info->block_group_cache_tree = RB_ROOT; |
| 2158 | fs_info->first_logical_byte = (u64)-1; | ||
| 2129 | 2159 | ||
| 2130 | extent_io_tree_init(&fs_info->freed_extents[0], | 2160 | extent_io_tree_init(&fs_info->freed_extents[0], |
| 2131 | fs_info->btree_inode->i_mapping); | 2161 | fs_info->btree_inode->i_mapping); |
| @@ -2165,6 +2195,12 @@ int open_ctree(struct super_block *sb, | |||
| 2165 | init_waitqueue_head(&fs_info->transaction_blocked_wait); | 2195 | init_waitqueue_head(&fs_info->transaction_blocked_wait); |
| 2166 | init_waitqueue_head(&fs_info->async_submit_wait); | 2196 | init_waitqueue_head(&fs_info->async_submit_wait); |
| 2167 | 2197 | ||
| 2198 | ret = btrfs_alloc_stripe_hash_table(fs_info); | ||
| 2199 | if (ret) { | ||
| 2200 | err = ret; | ||
| 2201 | goto fail_alloc; | ||
| 2202 | } | ||
| 2203 | |||
| 2168 | __setup_root(4096, 4096, 4096, 4096, tree_root, | 2204 | __setup_root(4096, 4096, 4096, 4096, tree_root, |
| 2169 | fs_info, BTRFS_ROOT_TREE_OBJECTID); | 2205 | fs_info, BTRFS_ROOT_TREE_OBJECTID); |
| 2170 | 2206 | ||
| @@ -2187,7 +2223,8 @@ int open_ctree(struct super_block *sb, | |||
| 2187 | goto fail_alloc; | 2223 | goto fail_alloc; |
| 2188 | 2224 | ||
| 2189 | /* check FS state, whether FS is broken. */ | 2225 | /* check FS state, whether FS is broken. */ |
| 2190 | fs_info->fs_state |= btrfs_super_flags(disk_super); | 2226 | if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR) |
| 2227 | set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state); | ||
| 2191 | 2228 | ||
| 2192 | ret = btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY); | 2229 | ret = btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY); |
| 2193 | if (ret) { | 2230 | if (ret) { |
| @@ -2261,6 +2298,8 @@ int open_ctree(struct super_block *sb, | |||
| 2261 | leafsize = btrfs_super_leafsize(disk_super); | 2298 | leafsize = btrfs_super_leafsize(disk_super); |
| 2262 | sectorsize = btrfs_super_sectorsize(disk_super); | 2299 | sectorsize = btrfs_super_sectorsize(disk_super); |
| 2263 | stripesize = btrfs_super_stripesize(disk_super); | 2300 | stripesize = btrfs_super_stripesize(disk_super); |
| 2301 | fs_info->dirty_metadata_batch = leafsize * (1 + ilog2(nr_cpu_ids)); | ||
| 2302 | fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids)); | ||
| 2264 | 2303 | ||
| 2265 | /* | 2304 | /* |
| 2266 | * mixed block groups end up with duplicate but slightly offset | 2305 | * mixed block groups end up with duplicate but slightly offset |
| @@ -2332,6 +2371,12 @@ int open_ctree(struct super_block *sb, | |||
| 2332 | btrfs_init_workers(&fs_info->endio_meta_write_workers, | 2371 | btrfs_init_workers(&fs_info->endio_meta_write_workers, |
| 2333 | "endio-meta-write", fs_info->thread_pool_size, | 2372 | "endio-meta-write", fs_info->thread_pool_size, |
| 2334 | &fs_info->generic_worker); | 2373 | &fs_info->generic_worker); |
| 2374 | btrfs_init_workers(&fs_info->endio_raid56_workers, | ||
| 2375 | "endio-raid56", fs_info->thread_pool_size, | ||
| 2376 | &fs_info->generic_worker); | ||
| 2377 | btrfs_init_workers(&fs_info->rmw_workers, | ||
| 2378 | "rmw", fs_info->thread_pool_size, | ||
| 2379 | &fs_info->generic_worker); | ||
| 2335 | btrfs_init_workers(&fs_info->endio_write_workers, "endio-write", | 2380 | btrfs_init_workers(&fs_info->endio_write_workers, "endio-write", |
| 2336 | fs_info->thread_pool_size, | 2381 | fs_info->thread_pool_size, |
| 2337 | &fs_info->generic_worker); | 2382 | &fs_info->generic_worker); |
| @@ -2350,6 +2395,8 @@ int open_ctree(struct super_block *sb, | |||
| 2350 | */ | 2395 | */ |
| 2351 | fs_info->endio_workers.idle_thresh = 4; | 2396 | fs_info->endio_workers.idle_thresh = 4; |
| 2352 | fs_info->endio_meta_workers.idle_thresh = 4; | 2397 | fs_info->endio_meta_workers.idle_thresh = 4; |
| 2398 | fs_info->endio_raid56_workers.idle_thresh = 4; | ||
| 2399 | fs_info->rmw_workers.idle_thresh = 2; | ||
| 2353 | 2400 | ||
| 2354 | fs_info->endio_write_workers.idle_thresh = 2; | 2401 | fs_info->endio_write_workers.idle_thresh = 2; |
| 2355 | fs_info->endio_meta_write_workers.idle_thresh = 2; | 2402 | fs_info->endio_meta_write_workers.idle_thresh = 2; |
| @@ -2366,6 +2413,8 @@ int open_ctree(struct super_block *sb, | |||
| 2366 | ret |= btrfs_start_workers(&fs_info->fixup_workers); | 2413 | ret |= btrfs_start_workers(&fs_info->fixup_workers); |
| 2367 | ret |= btrfs_start_workers(&fs_info->endio_workers); | 2414 | ret |= btrfs_start_workers(&fs_info->endio_workers); |
| 2368 | ret |= btrfs_start_workers(&fs_info->endio_meta_workers); | 2415 | ret |= btrfs_start_workers(&fs_info->endio_meta_workers); |
| 2416 | ret |= btrfs_start_workers(&fs_info->rmw_workers); | ||
| 2417 | ret |= btrfs_start_workers(&fs_info->endio_raid56_workers); | ||
| 2369 | ret |= btrfs_start_workers(&fs_info->endio_meta_write_workers); | 2418 | ret |= btrfs_start_workers(&fs_info->endio_meta_write_workers); |
| 2370 | ret |= btrfs_start_workers(&fs_info->endio_write_workers); | 2419 | ret |= btrfs_start_workers(&fs_info->endio_write_workers); |
| 2371 | ret |= btrfs_start_workers(&fs_info->endio_freespace_worker); | 2420 | ret |= btrfs_start_workers(&fs_info->endio_freespace_worker); |
| @@ -2390,8 +2439,7 @@ int open_ctree(struct super_block *sb, | |||
| 2390 | sb->s_blocksize = sectorsize; | 2439 | sb->s_blocksize = sectorsize; |
| 2391 | sb->s_blocksize_bits = blksize_bits(sectorsize); | 2440 | sb->s_blocksize_bits = blksize_bits(sectorsize); |
| 2392 | 2441 | ||
| 2393 | if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC, | 2442 | if (disk_super->magic != cpu_to_le64(BTRFS_MAGIC)) { |
| 2394 | sizeof(disk_super->magic))) { | ||
| 2395 | printk(KERN_INFO "btrfs: valid FS not found on %s\n", sb->s_id); | 2443 | printk(KERN_INFO "btrfs: valid FS not found on %s\n", sb->s_id); |
| 2396 | goto fail_sb_buffer; | 2444 | goto fail_sb_buffer; |
| 2397 | } | 2445 | } |
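
With BTRFS_MAGIC redefined as a u64 elsewhere in this series, the superblock magic becomes a single 64-bit compare: convert the host constant to on-disk (little-endian) order once instead of running strncmp() over the raw bytes. A minimal sketch:

    #include <asm/byteorder.h>
    #include <linux/types.h>

    /* BTRFS_MAGIC is 0x4D5F53665248425FULL, i.e. "_BHRfS_M" read as a
     * little-endian u64 */
    static bool super_magic_ok(__le64 disk_magic)
    {
        return disk_magic == cpu_to_le64(BTRFS_MAGIC);
    }
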
| @@ -2694,13 +2742,13 @@ fail_cleaner: | |||
| 2694 | * kthreads | 2742 | * kthreads |
| 2695 | */ | 2743 | */ |
| 2696 | filemap_write_and_wait(fs_info->btree_inode->i_mapping); | 2744 | filemap_write_and_wait(fs_info->btree_inode->i_mapping); |
| 2697 | invalidate_inode_pages2(fs_info->btree_inode->i_mapping); | ||
| 2698 | 2745 | ||
| 2699 | fail_block_groups: | 2746 | fail_block_groups: |
| 2700 | btrfs_free_block_groups(fs_info); | 2747 | btrfs_free_block_groups(fs_info); |
| 2701 | 2748 | ||
| 2702 | fail_tree_roots: | 2749 | fail_tree_roots: |
| 2703 | free_root_pointers(fs_info, 1); | 2750 | free_root_pointers(fs_info, 1); |
| 2751 | invalidate_inode_pages2(fs_info->btree_inode->i_mapping); | ||
| 2704 | 2752 | ||
| 2705 | fail_sb_buffer: | 2753 | fail_sb_buffer: |
| 2706 | btrfs_stop_workers(&fs_info->generic_worker); | 2754 | btrfs_stop_workers(&fs_info->generic_worker); |
| @@ -2710,6 +2758,8 @@ fail_sb_buffer: | |||
| 2710 | btrfs_stop_workers(&fs_info->workers); | 2758 | btrfs_stop_workers(&fs_info->workers); |
| 2711 | btrfs_stop_workers(&fs_info->endio_workers); | 2759 | btrfs_stop_workers(&fs_info->endio_workers); |
| 2712 | btrfs_stop_workers(&fs_info->endio_meta_workers); | 2760 | btrfs_stop_workers(&fs_info->endio_meta_workers); |
| 2761 | btrfs_stop_workers(&fs_info->endio_raid56_workers); | ||
| 2762 | btrfs_stop_workers(&fs_info->rmw_workers); | ||
| 2713 | btrfs_stop_workers(&fs_info->endio_meta_write_workers); | 2763 | btrfs_stop_workers(&fs_info->endio_meta_write_workers); |
| 2714 | btrfs_stop_workers(&fs_info->endio_write_workers); | 2764 | btrfs_stop_workers(&fs_info->endio_write_workers); |
| 2715 | btrfs_stop_workers(&fs_info->endio_freespace_worker); | 2765 | btrfs_stop_workers(&fs_info->endio_freespace_worker); |
| @@ -2721,13 +2771,17 @@ fail_alloc: | |||
| 2721 | fail_iput: | 2771 | fail_iput: |
| 2722 | btrfs_mapping_tree_free(&fs_info->mapping_tree); | 2772 | btrfs_mapping_tree_free(&fs_info->mapping_tree); |
| 2723 | 2773 | ||
| 2724 | invalidate_inode_pages2(fs_info->btree_inode->i_mapping); | ||
| 2725 | iput(fs_info->btree_inode); | 2774 | iput(fs_info->btree_inode); |
| 2775 | fail_delalloc_bytes: | ||
| 2776 | percpu_counter_destroy(&fs_info->delalloc_bytes); | ||
| 2777 | fail_dirty_metadata_bytes: | ||
| 2778 | percpu_counter_destroy(&fs_info->dirty_metadata_bytes); | ||
| 2726 | fail_bdi: | 2779 | fail_bdi: |
| 2727 | bdi_destroy(&fs_info->bdi); | 2780 | bdi_destroy(&fs_info->bdi); |
| 2728 | fail_srcu: | 2781 | fail_srcu: |
| 2729 | cleanup_srcu_struct(&fs_info->subvol_srcu); | 2782 | cleanup_srcu_struct(&fs_info->subvol_srcu); |
| 2730 | fail: | 2783 | fail: |
| 2784 | btrfs_free_stripe_hash_table(fs_info); | ||
| 2731 | btrfs_close_devices(fs_info->fs_devices); | 2785 | btrfs_close_devices(fs_info->fs_devices); |
| 2732 | return err; | 2786 | return err; |
| 2733 | 2787 | ||
| @@ -2795,8 +2849,7 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev) | |||
| 2795 | 2849 | ||
| 2796 | super = (struct btrfs_super_block *)bh->b_data; | 2850 | super = (struct btrfs_super_block *)bh->b_data; |
| 2797 | if (btrfs_super_bytenr(super) != bytenr || | 2851 | if (btrfs_super_bytenr(super) != bytenr || |
| 2798 | strncmp((char *)(&super->magic), BTRFS_MAGIC, | 2852 | super->magic != cpu_to_le64(BTRFS_MAGIC)) { |
| 2799 | sizeof(super->magic))) { | ||
| 2800 | brelse(bh); | 2853 | brelse(bh); |
| 2801 | continue; | 2854 | continue; |
| 2802 | } | 2855 | } |
| @@ -3076,11 +3129,16 @@ int btrfs_calc_num_tolerated_disk_barrier_failures( | |||
| 3076 | ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) | 3129 | ((flags & BTRFS_BLOCK_GROUP_PROFILE_MASK) |
| 3077 | == 0))) | 3130 | == 0))) |
| 3078 | num_tolerated_disk_barrier_failures = 0; | 3131 | num_tolerated_disk_barrier_failures = 0; |
| 3079 | else if (num_tolerated_disk_barrier_failures > 1 | 3132 | else if (num_tolerated_disk_barrier_failures > 1) { |
| 3080 | && | 3133 | if (flags & (BTRFS_BLOCK_GROUP_RAID1 | |
| 3081 | (flags & (BTRFS_BLOCK_GROUP_RAID1 | | 3134 | BTRFS_BLOCK_GROUP_RAID5 | |
| 3082 | BTRFS_BLOCK_GROUP_RAID10))) | 3135 | BTRFS_BLOCK_GROUP_RAID10)) { |
| 3083 | num_tolerated_disk_barrier_failures = 1; | 3136 | num_tolerated_disk_barrier_failures = 1; |
| 3137 | } else if (flags & | ||
| 3138 | BTRFS_BLOCK_GROUP_RAID6) { | ||
| 3139 | num_tolerated_disk_barrier_failures = 2; | ||
| 3140 | } | ||
| 3141 | } | ||
| 3084 | } | 3142 | } |
| 3085 | } | 3143 | } |
| 3086 | up_read(&sinfo->groups_sem); | 3144 | up_read(&sinfo->groups_sem); |
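
The barrier logic above now distinguishes the parity profiles: RAID1/RAID5/RAID10 survive the loss of one device, RAID6 survives two, and the filesystem-wide value is still the minimum over all block groups. Restated as a per-profile helper (hypothetical; it ignores the min-over-groups pass and the metadata special cases):

    static int profile_barrier_tolerance(u64 flags)
    {
        if (flags & BTRFS_BLOCK_GROUP_RAID6)
            return 2;    /* two parity stripes per row */
        if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
                     BTRFS_BLOCK_GROUP_RAID5 |
                     BTRFS_BLOCK_GROUP_RAID10))
            return 1;    /* one mirror copy or one parity stripe */
        return 0;        /* single, dup, raid0 */
    }
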
| @@ -3195,6 +3253,11 @@ void btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) | |||
| 3195 | if (btrfs_root_refs(&root->root_item) == 0) | 3253 | if (btrfs_root_refs(&root->root_item) == 0) |
| 3196 | synchronize_srcu(&fs_info->subvol_srcu); | 3254 | synchronize_srcu(&fs_info->subvol_srcu); |
| 3197 | 3255 | ||
| 3256 | if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { | ||
| 3257 | btrfs_free_log(NULL, root); | ||
| 3258 | btrfs_free_log_root_tree(NULL, fs_info); | ||
| 3259 | } | ||
| 3260 | |||
| 3198 | __btrfs_remove_free_space_cache(root->free_ino_pinned); | 3261 | __btrfs_remove_free_space_cache(root->free_ino_pinned); |
| 3199 | __btrfs_remove_free_space_cache(root->free_ino_ctl); | 3262 | __btrfs_remove_free_space_cache(root->free_ino_ctl); |
| 3200 | free_fs_root(root); | 3263 | free_fs_root(root); |
| @@ -3339,7 +3402,7 @@ int close_ctree(struct btrfs_root *root) | |||
| 3339 | printk(KERN_ERR "btrfs: commit super ret %d\n", ret); | 3402 | printk(KERN_ERR "btrfs: commit super ret %d\n", ret); |
| 3340 | } | 3403 | } |
| 3341 | 3404 | ||
| 3342 | if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) | 3405 | if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) |
| 3343 | btrfs_error_commit_super(root); | 3406 | btrfs_error_commit_super(root); |
| 3344 | 3407 | ||
| 3345 | btrfs_put_block_group_cache(fs_info); | 3408 | btrfs_put_block_group_cache(fs_info); |
| @@ -3352,9 +3415,9 @@ int close_ctree(struct btrfs_root *root) | |||
| 3352 | 3415 | ||
| 3353 | btrfs_free_qgroup_config(root->fs_info); | 3416 | btrfs_free_qgroup_config(root->fs_info); |
| 3354 | 3417 | ||
| 3355 | if (fs_info->delalloc_bytes) { | 3418 | if (percpu_counter_sum(&fs_info->delalloc_bytes)) { |
| 3356 | printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n", | 3419 | printk(KERN_INFO "btrfs: at unmount delalloc count %lld\n", |
| 3357 | (unsigned long long)fs_info->delalloc_bytes); | 3420 | percpu_counter_sum(&fs_info->delalloc_bytes)); |
| 3358 | } | 3421 | } |
| 3359 | 3422 | ||
| 3360 | free_extent_buffer(fs_info->extent_root->node); | 3423 | free_extent_buffer(fs_info->extent_root->node); |
| @@ -3384,6 +3447,8 @@ int close_ctree(struct btrfs_root *root) | |||
| 3384 | btrfs_stop_workers(&fs_info->workers); | 3447 | btrfs_stop_workers(&fs_info->workers); |
| 3385 | btrfs_stop_workers(&fs_info->endio_workers); | 3448 | btrfs_stop_workers(&fs_info->endio_workers); |
| 3386 | btrfs_stop_workers(&fs_info->endio_meta_workers); | 3449 | btrfs_stop_workers(&fs_info->endio_meta_workers); |
| 3450 | btrfs_stop_workers(&fs_info->endio_raid56_workers); | ||
| 3451 | btrfs_stop_workers(&fs_info->rmw_workers); | ||
| 3387 | btrfs_stop_workers(&fs_info->endio_meta_write_workers); | 3452 | btrfs_stop_workers(&fs_info->endio_meta_write_workers); |
| 3388 | btrfs_stop_workers(&fs_info->endio_write_workers); | 3453 | btrfs_stop_workers(&fs_info->endio_write_workers); |
| 3389 | btrfs_stop_workers(&fs_info->endio_freespace_worker); | 3454 | btrfs_stop_workers(&fs_info->endio_freespace_worker); |
| @@ -3401,9 +3466,13 @@ int close_ctree(struct btrfs_root *root) | |||
| 3401 | btrfs_close_devices(fs_info->fs_devices); | 3466 | btrfs_close_devices(fs_info->fs_devices); |
| 3402 | btrfs_mapping_tree_free(&fs_info->mapping_tree); | 3467 | btrfs_mapping_tree_free(&fs_info->mapping_tree); |
| 3403 | 3468 | ||
| 3469 | percpu_counter_destroy(&fs_info->dirty_metadata_bytes); | ||
| 3470 | percpu_counter_destroy(&fs_info->delalloc_bytes); | ||
| 3404 | bdi_destroy(&fs_info->bdi); | 3471 | bdi_destroy(&fs_info->bdi); |
| 3405 | cleanup_srcu_struct(&fs_info->subvol_srcu); | 3472 | cleanup_srcu_struct(&fs_info->subvol_srcu); |
| 3406 | 3473 | ||
| 3474 | btrfs_free_stripe_hash_table(fs_info); | ||
| 3475 | |||
| 3407 | return 0; | 3476 | return 0; |
| 3408 | } | 3477 | } |
| 3409 | 3478 | ||
| @@ -3443,11 +3512,10 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) | |||
| 3443 | (unsigned long long)transid, | 3512 | (unsigned long long)transid, |
| 3444 | (unsigned long long)root->fs_info->generation); | 3513 | (unsigned long long)root->fs_info->generation); |
| 3445 | was_dirty = set_extent_buffer_dirty(buf); | 3514 | was_dirty = set_extent_buffer_dirty(buf); |
| 3446 | if (!was_dirty) { | 3515 | if (!was_dirty) |
| 3447 | spin_lock(&root->fs_info->delalloc_lock); | 3516 | __percpu_counter_add(&root->fs_info->dirty_metadata_bytes, |
| 3448 | root->fs_info->dirty_metadata_bytes += buf->len; | 3517 | buf->len, |
| 3449 | spin_unlock(&root->fs_info->delalloc_lock); | 3518 | root->fs_info->dirty_metadata_batch); |
| 3450 | } | ||
| 3451 | } | 3519 | } |
| 3452 | 3520 | ||
| 3453 | static void __btrfs_btree_balance_dirty(struct btrfs_root *root, | 3521 | static void __btrfs_btree_balance_dirty(struct btrfs_root *root, |
| @@ -3457,8 +3525,7 @@ static void __btrfs_btree_balance_dirty(struct btrfs_root *root, | |||
| 3457 | * looks as though older kernels can get into trouble with | 3525 | * looks as though older kernels can get into trouble with |
| 3458 | * this code, they end up stuck in balance_dirty_pages forever | 3526 | * this code, they end up stuck in balance_dirty_pages forever |
| 3459 | */ | 3527 | */ |
| 3460 | u64 num_dirty; | 3528 | int ret; |
| 3461 | unsigned long thresh = 32 * 1024 * 1024; | ||
| 3462 | 3529 | ||
| 3463 | if (current->flags & PF_MEMALLOC) | 3530 | if (current->flags & PF_MEMALLOC) |
| 3464 | return; | 3531 | return; |
| @@ -3466,9 +3533,9 @@ static void __btrfs_btree_balance_dirty(struct btrfs_root *root, | |||
| 3466 | if (flush_delayed) | 3533 | if (flush_delayed) |
| 3467 | btrfs_balance_delayed_items(root); | 3534 | btrfs_balance_delayed_items(root); |
| 3468 | 3535 | ||
| 3469 | num_dirty = root->fs_info->dirty_metadata_bytes; | 3536 | ret = percpu_counter_compare(&root->fs_info->dirty_metadata_bytes, |
| 3470 | 3537 | BTRFS_DIRTY_METADATA_THRESH); | |
| 3471 | if (num_dirty > thresh) { | 3538 | if (ret > 0) { |
| 3472 | balance_dirty_pages_ratelimited( | 3539 | balance_dirty_pages_ratelimited( |
| 3473 | root->fs_info->btree_inode->i_mapping); | 3540 | root->fs_info->btree_inode->i_mapping); |
| 3474 | } | 3541 | } |
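
Both throttling sites now test the shared BTRFS_DIRTY_METADATA_THRESH (32 MiB in this series) through percpu_counter_compare(), which answers from the cheap approximate sum and only folds the per-cpu deltas when the value is close to the threshold. The test reduces to:

    #include <linux/percpu_counter.h>

    static bool over_dirty_metadata_thresh(struct btrfs_fs_info *fs_info)
    {
        return percpu_counter_compare(&fs_info->dirty_metadata_bytes,
                                      BTRFS_DIRTY_METADATA_THRESH) > 0;
    }
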
| @@ -3518,7 +3585,8 @@ void btrfs_error_commit_super(struct btrfs_root *root) | |||
| 3518 | btrfs_cleanup_transaction(root); | 3585 | btrfs_cleanup_transaction(root); |
| 3519 | } | 3586 | } |
| 3520 | 3587 | ||
| 3521 | static void btrfs_destroy_ordered_operations(struct btrfs_root *root) | 3588 | static void btrfs_destroy_ordered_operations(struct btrfs_transaction *t, |
| 3589 | struct btrfs_root *root) | ||
| 3522 | { | 3590 | { |
| 3523 | struct btrfs_inode *btrfs_inode; | 3591 | struct btrfs_inode *btrfs_inode; |
| 3524 | struct list_head splice; | 3592 | struct list_head splice; |
| @@ -3528,7 +3596,7 @@ static void btrfs_destroy_ordered_operations(struct btrfs_root *root) | |||
| 3528 | mutex_lock(&root->fs_info->ordered_operations_mutex); | 3596 | mutex_lock(&root->fs_info->ordered_operations_mutex); |
| 3529 | spin_lock(&root->fs_info->ordered_extent_lock); | 3597 | spin_lock(&root->fs_info->ordered_extent_lock); |
| 3530 | 3598 | ||
| 3531 | list_splice_init(&root->fs_info->ordered_operations, &splice); | 3599 | list_splice_init(&t->ordered_operations, &splice); |
| 3532 | while (!list_empty(&splice)) { | 3600 | while (!list_empty(&splice)) { |
| 3533 | btrfs_inode = list_entry(splice.next, struct btrfs_inode, | 3601 | btrfs_inode = list_entry(splice.next, struct btrfs_inode, |
| 3534 | ordered_operations); | 3602 | ordered_operations); |
| @@ -3544,35 +3612,16 @@ static void btrfs_destroy_ordered_operations(struct btrfs_root *root) | |||
| 3544 | 3612 | ||
| 3545 | static void btrfs_destroy_ordered_extents(struct btrfs_root *root) | 3613 | static void btrfs_destroy_ordered_extents(struct btrfs_root *root) |
| 3546 | { | 3614 | { |
| 3547 | struct list_head splice; | ||
| 3548 | struct btrfs_ordered_extent *ordered; | 3615 | struct btrfs_ordered_extent *ordered; |
| 3549 | struct inode *inode; | ||
| 3550 | |||
| 3551 | INIT_LIST_HEAD(&splice); | ||
| 3552 | 3616 | ||
| 3553 | spin_lock(&root->fs_info->ordered_extent_lock); | 3617 | spin_lock(&root->fs_info->ordered_extent_lock); |
| 3554 | 3618 | /* | |
| 3555 | list_splice_init(&root->fs_info->ordered_extents, &splice); | 3619 | * This will just short circuit the ordered completion stuff which will |
| 3556 | while (!list_empty(&splice)) { | 3620 | * make sure the ordered extent gets properly cleaned up. |
| 3557 | ordered = list_entry(splice.next, struct btrfs_ordered_extent, | 3621 | */ |
| 3558 | root_extent_list); | 3622 | list_for_each_entry(ordered, &root->fs_info->ordered_extents, |
| 3559 | 3623 | root_extent_list) | |
| 3560 | list_del_init(&ordered->root_extent_list); | 3624 | set_bit(BTRFS_ORDERED_IOERR, &ordered->flags); |
| 3561 | atomic_inc(&ordered->refs); | ||
| 3562 | |||
| 3563 | /* the inode may be getting freed (in sys_unlink path). */ | ||
| 3564 | inode = igrab(ordered->inode); | ||
| 3565 | |||
| 3566 | spin_unlock(&root->fs_info->ordered_extent_lock); | ||
| 3567 | if (inode) | ||
| 3568 | iput(inode); | ||
| 3569 | |||
| 3570 | atomic_set(&ordered->refs, 1); | ||
| 3571 | btrfs_put_ordered_extent(ordered); | ||
| 3572 | |||
| 3573 | spin_lock(&root->fs_info->ordered_extent_lock); | ||
| 3574 | } | ||
| 3575 | |||
| 3576 | spin_unlock(&root->fs_info->ordered_extent_lock); | 3625 | spin_unlock(&root->fs_info->ordered_extent_lock); |
| 3577 | } | 3626 | } |
| 3578 | 3627 | ||
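
Teardown no longer drops references by hand (the removed loop); it only tags each ordered extent and lets the normal completion path observe BTRFS_ORDERED_IOERR and do the cleanup. The fail-by-flag idiom in isolation (hypothetical names, bit 0 standing in for the IOERR bit):

    #include <linux/bitops.h>
    #include <linux/list.h>
    #include <linux/spinlock.h>

    struct work_item {
        unsigned long flags;        /* bit 0: "failed", hypothetical */
        struct list_head list;
    };

    static void fail_all_pending(spinlock_t *lock, struct list_head *head)
    {
        struct work_item *w;

        spin_lock(lock);
        list_for_each_entry(w, head, list)
            set_bit(0, &w->flags);  /* completion path will see it */
        spin_unlock(lock);
    }
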
| @@ -3594,11 +3643,11 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, | |||
| 3594 | } | 3643 | } |
| 3595 | 3644 | ||
| 3596 | while ((node = rb_first(&delayed_refs->root)) != NULL) { | 3645 | while ((node = rb_first(&delayed_refs->root)) != NULL) { |
| 3597 | ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); | 3646 | struct btrfs_delayed_ref_head *head = NULL; |
| 3598 | 3647 | ||
| 3648 | ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); | ||
| 3599 | atomic_set(&ref->refs, 1); | 3649 | atomic_set(&ref->refs, 1); |
| 3600 | if (btrfs_delayed_ref_is_head(ref)) { | 3650 | if (btrfs_delayed_ref_is_head(ref)) { |
| 3601 | struct btrfs_delayed_ref_head *head; | ||
| 3602 | 3651 | ||
| 3603 | head = btrfs_delayed_node_to_head(ref); | 3652 | head = btrfs_delayed_node_to_head(ref); |
| 3604 | if (!mutex_trylock(&head->mutex)) { | 3653 | if (!mutex_trylock(&head->mutex)) { |
| @@ -3614,16 +3663,18 @@ int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, | |||
| 3614 | continue; | 3663 | continue; |
| 3615 | } | 3664 | } |
| 3616 | 3665 | ||
| 3617 | kfree(head->extent_op); | 3666 | btrfs_free_delayed_extent_op(head->extent_op); |
| 3618 | delayed_refs->num_heads--; | 3667 | delayed_refs->num_heads--; |
| 3619 | if (list_empty(&head->cluster)) | 3668 | if (list_empty(&head->cluster)) |
| 3620 | delayed_refs->num_heads_ready--; | 3669 | delayed_refs->num_heads_ready--; |
| 3621 | list_del_init(&head->cluster); | 3670 | list_del_init(&head->cluster); |
| 3622 | } | 3671 | } |
| 3672 | |||
| 3623 | ref->in_tree = 0; | 3673 | ref->in_tree = 0; |
| 3624 | rb_erase(&ref->rb_node, &delayed_refs->root); | 3674 | rb_erase(&ref->rb_node, &delayed_refs->root); |
| 3625 | delayed_refs->num_entries--; | 3675 | delayed_refs->num_entries--; |
| 3626 | 3676 | if (head) | |
| 3677 | mutex_unlock(&head->mutex); | ||
| 3627 | spin_unlock(&delayed_refs->lock); | 3678 | spin_unlock(&delayed_refs->lock); |
| 3628 | btrfs_put_delayed_ref(ref); | 3679 | btrfs_put_delayed_ref(ref); |
| 3629 | 3680 | ||
| @@ -3671,6 +3722,8 @@ static void btrfs_destroy_delalloc_inodes(struct btrfs_root *root) | |||
| 3671 | delalloc_inodes); | 3722 | delalloc_inodes); |
| 3672 | 3723 | ||
| 3673 | list_del_init(&btrfs_inode->delalloc_inodes); | 3724 | list_del_init(&btrfs_inode->delalloc_inodes); |
| 3725 | clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, | ||
| 3726 | &btrfs_inode->runtime_flags); | ||
| 3674 | 3727 | ||
| 3675 | btrfs_invalidate_inodes(btrfs_inode->root); | 3728 | btrfs_invalidate_inodes(btrfs_inode->root); |
| 3676 | } | 3729 | } |
| @@ -3823,10 +3876,8 @@ int btrfs_cleanup_transaction(struct btrfs_root *root) | |||
| 3823 | 3876 | ||
| 3824 | while (!list_empty(&list)) { | 3877 | while (!list_empty(&list)) { |
| 3825 | t = list_entry(list.next, struct btrfs_transaction, list); | 3878 | t = list_entry(list.next, struct btrfs_transaction, list); |
| 3826 | if (!t) | ||
| 3827 | break; | ||
| 3828 | 3879 | ||
| 3829 | btrfs_destroy_ordered_operations(root); | 3880 | btrfs_destroy_ordered_operations(t, root); |
| 3830 | 3881 | ||
| 3831 | btrfs_destroy_ordered_extents(root); | 3882 | btrfs_destroy_ordered_extents(root); |
| 3832 | 3883 | ||
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 305c33efb0e3..034d7dc552b2 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h | |||
| @@ -25,6 +25,13 @@ | |||
| 25 | #define BTRFS_SUPER_MIRROR_MAX 3 | 25 | #define BTRFS_SUPER_MIRROR_MAX 3 |
| 26 | #define BTRFS_SUPER_MIRROR_SHIFT 12 | 26 | #define BTRFS_SUPER_MIRROR_SHIFT 12 |
| 27 | 27 | ||
| 28 | enum { | ||
| 29 | BTRFS_WQ_ENDIO_DATA = 0, | ||
| 30 | BTRFS_WQ_ENDIO_METADATA = 1, | ||
| 31 | BTRFS_WQ_ENDIO_FREE_SPACE = 2, | ||
| 32 | BTRFS_WQ_ENDIO_RAID56 = 3, | ||
| 33 | }; | ||
| 34 | |||
| 28 | static inline u64 btrfs_sb_offset(int mirror) | 35 | static inline u64 btrfs_sb_offset(int mirror) |
| 29 | { | 36 | { |
| 30 | u64 start = 16 * 1024; | 37 | u64 start = 16 * 1024; |
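
Callers of btrfs_bio_wq_end_io() now pass one of these named values instead of a bare 0/1/2, which is what lets the raid56 paths route completions to their own end-io queue. An illustrative (hypothetical) call site:

    static int queue_metadata_endio(struct btrfs_fs_info *info,
                                    struct bio *bio)
    {
        return btrfs_bio_wq_end_io(info, bio, BTRFS_WQ_ENDIO_METADATA);
    }
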
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index cf54bdfee334..3e074dab2d57 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c | |||
| @@ -31,6 +31,7 @@ | |||
| 31 | #include "print-tree.h" | 31 | #include "print-tree.h" |
| 32 | #include "transaction.h" | 32 | #include "transaction.h" |
| 33 | #include "volumes.h" | 33 | #include "volumes.h" |
| 34 | #include "raid56.h" | ||
| 34 | #include "locking.h" | 35 | #include "locking.h" |
| 35 | #include "free-space-cache.h" | 36 | #include "free-space-cache.h" |
| 36 | #include "math.h" | 37 | #include "math.h" |
| @@ -72,8 +73,7 @@ enum { | |||
| 72 | RESERVE_ALLOC_NO_ACCOUNT = 2, | 73 | RESERVE_ALLOC_NO_ACCOUNT = 2, |
| 73 | }; | 74 | }; |
| 74 | 75 | ||
| 75 | static int update_block_group(struct btrfs_trans_handle *trans, | 76 | static int update_block_group(struct btrfs_root *root, |
| 76 | struct btrfs_root *root, | ||
| 77 | u64 bytenr, u64 num_bytes, int alloc); | 77 | u64 bytenr, u64 num_bytes, int alloc); |
| 78 | static int __btrfs_free_extent(struct btrfs_trans_handle *trans, | 78 | static int __btrfs_free_extent(struct btrfs_trans_handle *trans, |
| 79 | struct btrfs_root *root, | 79 | struct btrfs_root *root, |
| @@ -103,6 +103,8 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes, | |||
| 103 | int dump_block_groups); | 103 | int dump_block_groups); |
| 104 | static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, | 104 | static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache, |
| 105 | u64 num_bytes, int reserve); | 105 | u64 num_bytes, int reserve); |
| 106 | static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, | ||
| 107 | u64 num_bytes); | ||
| 106 | 108 | ||
| 107 | static noinline int | 109 | static noinline int |
| 108 | block_group_cache_done(struct btrfs_block_group_cache *cache) | 110 | block_group_cache_done(struct btrfs_block_group_cache *cache) |
| @@ -162,6 +164,10 @@ static int btrfs_add_block_group_cache(struct btrfs_fs_info *info, | |||
| 162 | rb_link_node(&block_group->cache_node, parent, p); | 164 | rb_link_node(&block_group->cache_node, parent, p); |
| 163 | rb_insert_color(&block_group->cache_node, | 165 | rb_insert_color(&block_group->cache_node, |
| 164 | &info->block_group_cache_tree); | 166 | &info->block_group_cache_tree); |
| 167 | |||
| 168 | if (info->first_logical_byte > block_group->key.objectid) | ||
| 169 | info->first_logical_byte = block_group->key.objectid; | ||
| 170 | |||
| 165 | spin_unlock(&info->block_group_cache_lock); | 171 | spin_unlock(&info->block_group_cache_lock); |
| 166 | 172 | ||
| 167 | return 0; | 173 | return 0; |
| @@ -203,8 +209,11 @@ block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr, | |||
| 203 | break; | 209 | break; |
| 204 | } | 210 | } |
| 205 | } | 211 | } |
| 206 | if (ret) | 212 | if (ret) { |
| 207 | btrfs_get_block_group(ret); | 213 | btrfs_get_block_group(ret); |
| 214 | if (bytenr == 0 && info->first_logical_byte > ret->key.objectid) | ||
| 215 | info->first_logical_byte = ret->key.objectid; | ||
| 216 | } | ||
| 208 | spin_unlock(&info->block_group_cache_lock); | 217 | spin_unlock(&info->block_group_cache_lock); |
| 209 | 218 | ||
| 210 | return ret; | 219 | return ret; |
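
first_logical_byte caches the lowest block-group objectid, seeded to (u64)-1 at mount, so searches that start from bytenr 0 can be redirected to the first real block group without a tree walk. The maintenance, under the same lock that guards the rbtree, reduces to:

    static void track_first_logical_byte(struct btrfs_fs_info *info,
                                         u64 objectid)
    {
        spin_lock(&info->block_group_cache_lock);
        if (info->first_logical_byte > objectid)
            info->first_logical_byte = objectid;
        spin_unlock(&info->block_group_cache_lock);
    }
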
| @@ -468,8 +477,6 @@ out: | |||
| 468 | } | 477 | } |
| 469 | 478 | ||
| 470 | static int cache_block_group(struct btrfs_block_group_cache *cache, | 479 | static int cache_block_group(struct btrfs_block_group_cache *cache, |
| 471 | struct btrfs_trans_handle *trans, | ||
| 472 | struct btrfs_root *root, | ||
| 473 | int load_cache_only) | 480 | int load_cache_only) |
| 474 | { | 481 | { |
| 475 | DEFINE_WAIT(wait); | 482 | DEFINE_WAIT(wait); |
| @@ -527,12 +534,6 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, | |||
| 527 | cache->cached = BTRFS_CACHE_FAST; | 534 | cache->cached = BTRFS_CACHE_FAST; |
| 528 | spin_unlock(&cache->lock); | 535 | spin_unlock(&cache->lock); |
| 529 | 536 | ||
| 530 | /* | ||
| 531 | * We can't do the read from on-disk cache during a commit since we need | ||
| 532 | * to have the normal tree locking. Also if we are currently trying to | ||
| 533 | * allocate blocks for the tree root we can't do the fast caching since | ||
| 534 | * we likely hold important locks. | ||
| 535 | */ | ||
| 536 | if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) { | 537 | if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) { |
| 537 | ret = load_free_space_cache(fs_info, cache); | 538 | ret = load_free_space_cache(fs_info, cache); |
| 538 | 539 | ||
| @@ -1852,6 +1853,8 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, | |||
| 1852 | *actual_bytes = discarded_bytes; | 1853 | *actual_bytes = discarded_bytes; |
| 1853 | 1854 | ||
| 1854 | 1855 | ||
| 1856 | if (ret == -EOPNOTSUPP) | ||
| 1857 | ret = 0; | ||
| 1855 | return ret; | 1858 | return ret; |
| 1856 | } | 1859 | } |
| 1857 | 1860 | ||
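
Swallowing -EOPNOTSUPP makes discard best-effort: a device without TRIM/UNMAP support no longer fails the extent-free path. The convention in isolation (a sketch, not the btrfs helper itself):

    #include <linux/blkdev.h>

    static int discard_best_effort(struct block_device *bdev,
                                   u64 start, u64 len)
    {
        int ret = blkdev_issue_discard(bdev, start >> 9, len >> 9,
                                       GFP_NOFS, 0);

        if (ret == -EOPNOTSUPP)
            ret = 0;    /* no discard support; not an error */
        return ret;
    }
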
| @@ -2143,7 +2146,6 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, | |||
| 2143 | node->num_bytes); | 2146 | node->num_bytes); |
| 2144 | } | 2147 | } |
| 2145 | } | 2148 | } |
| 2146 | mutex_unlock(&head->mutex); | ||
| 2147 | return ret; | 2149 | return ret; |
| 2148 | } | 2150 | } |
| 2149 | 2151 | ||
| @@ -2258,7 +2260,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, | |||
| 2258 | * process of being added. Don't run this ref yet. | 2260 | * process of being added. Don't run this ref yet. |
| 2259 | */ | 2261 | */ |
| 2260 | list_del_init(&locked_ref->cluster); | 2262 | list_del_init(&locked_ref->cluster); |
| 2261 | mutex_unlock(&locked_ref->mutex); | 2263 | btrfs_delayed_ref_unlock(locked_ref); |
| 2262 | locked_ref = NULL; | 2264 | locked_ref = NULL; |
| 2263 | delayed_refs->num_heads_ready++; | 2265 | delayed_refs->num_heads_ready++; |
| 2264 | spin_unlock(&delayed_refs->lock); | 2266 | spin_unlock(&delayed_refs->lock); |
| @@ -2285,7 +2287,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, | |||
| 2285 | ref = &locked_ref->node; | 2287 | ref = &locked_ref->node; |
| 2286 | 2288 | ||
| 2287 | if (extent_op && must_insert_reserved) { | 2289 | if (extent_op && must_insert_reserved) { |
| 2288 | kfree(extent_op); | 2290 | btrfs_free_delayed_extent_op(extent_op); |
| 2289 | extent_op = NULL; | 2291 | extent_op = NULL; |
| 2290 | } | 2292 | } |
| 2291 | 2293 | ||
| @@ -2294,28 +2296,25 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, | |||
| 2294 | 2296 | ||
| 2295 | ret = run_delayed_extent_op(trans, root, | 2297 | ret = run_delayed_extent_op(trans, root, |
| 2296 | ref, extent_op); | 2298 | ref, extent_op); |
| 2297 | kfree(extent_op); | 2299 | btrfs_free_delayed_extent_op(extent_op); |
| 2298 | 2300 | ||
| 2299 | if (ret) { | 2301 | if (ret) { |
| 2300 | list_del_init(&locked_ref->cluster); | 2302 | printk(KERN_DEBUG |
| 2301 | mutex_unlock(&locked_ref->mutex); | 2303 | "btrfs: run_delayed_extent_op " |
| 2302 | 2304 | "returned %d\n", ret); | |
| 2303 | printk(KERN_DEBUG "btrfs: run_delayed_extent_op returned %d\n", ret); | ||
| 2304 | spin_lock(&delayed_refs->lock); | 2305 | spin_lock(&delayed_refs->lock); |
| 2306 | btrfs_delayed_ref_unlock(locked_ref); | ||
| 2305 | return ret; | 2307 | return ret; |
| 2306 | } | 2308 | } |
| 2307 | 2309 | ||
| 2308 | goto next; | 2310 | goto next; |
| 2309 | } | 2311 | } |
| 2310 | |||
| 2311 | list_del_init(&locked_ref->cluster); | ||
| 2312 | locked_ref = NULL; | ||
| 2313 | } | 2312 | } |
| 2314 | 2313 | ||
| 2315 | ref->in_tree = 0; | 2314 | ref->in_tree = 0; |
| 2316 | rb_erase(&ref->rb_node, &delayed_refs->root); | 2315 | rb_erase(&ref->rb_node, &delayed_refs->root); |
| 2317 | delayed_refs->num_entries--; | 2316 | delayed_refs->num_entries--; |
| 2318 | if (locked_ref) { | 2317 | if (!btrfs_delayed_ref_is_head(ref)) { |
| 2319 | /* | 2318 | /* |
| 2320 | * when we play the delayed ref, also correct the | 2319 | * when we play the delayed ref, also correct the |
| 2321 | * ref_mod on head | 2320 | * ref_mod on head |
| @@ -2337,20 +2336,29 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, | |||
| 2337 | ret = run_one_delayed_ref(trans, root, ref, extent_op, | 2336 | ret = run_one_delayed_ref(trans, root, ref, extent_op, |
| 2338 | must_insert_reserved); | 2337 | must_insert_reserved); |
| 2339 | 2338 | ||
| 2340 | btrfs_put_delayed_ref(ref); | 2339 | btrfs_free_delayed_extent_op(extent_op); |
| 2341 | kfree(extent_op); | ||
| 2342 | count++; | ||
| 2343 | |||
| 2344 | if (ret) { | 2340 | if (ret) { |
| 2345 | if (locked_ref) { | 2341 | btrfs_delayed_ref_unlock(locked_ref); |
| 2346 | list_del_init(&locked_ref->cluster); | 2342 | btrfs_put_delayed_ref(ref); |
| 2347 | mutex_unlock(&locked_ref->mutex); | 2343 | printk(KERN_DEBUG |
| 2348 | } | 2344 | "btrfs: run_one_delayed_ref returned %d\n", ret); |
| 2349 | printk(KERN_DEBUG "btrfs: run_one_delayed_ref returned %d\n", ret); | ||
| 2350 | spin_lock(&delayed_refs->lock); | 2345 | spin_lock(&delayed_refs->lock); |
| 2351 | return ret; | 2346 | return ret; |
| 2352 | } | 2347 | } |
| 2353 | 2348 | ||
| 2349 | /* | ||
| 2350 | * If this node is a head, that means all the refs in this head | ||
| 2351 | * have been dealt with, and we will pick the next head to deal | ||
| 2352 | * with, so we must unlock the head and drop it from the cluster | ||
| 2353 | * list before we release it. | ||
| 2354 | */ | ||
| 2355 | if (btrfs_delayed_ref_is_head(ref)) { | ||
| 2356 | list_del_init(&locked_ref->cluster); | ||
| 2357 | btrfs_delayed_ref_unlock(locked_ref); | ||
| 2358 | locked_ref = NULL; | ||
| 2359 | } | ||
| 2360 | btrfs_put_delayed_ref(ref); | ||
| 2361 | count++; | ||
| 2354 | next: | 2362 | next: |
| 2355 | cond_resched(); | 2363 | cond_resched(); |
| 2356 | spin_lock(&delayed_refs->lock); | 2364 | spin_lock(&delayed_refs->lock); |
| @@ -2435,6 +2443,16 @@ int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, | |||
| 2435 | return ret; | 2443 | return ret; |
| 2436 | } | 2444 | } |
| 2437 | 2445 | ||
| 2446 | static int refs_newer(struct btrfs_delayed_ref_root *delayed_refs, int seq, | ||
| 2447 | int count) | ||
| 2448 | { | ||
| 2449 | int val = atomic_read(&delayed_refs->ref_seq); | ||
| 2450 | |||
| 2451 | if (val < seq || val >= seq + count) | ||
| 2452 | return 1; | ||
| 2453 | return 0; | ||
| 2454 | } | ||
| 2455 | |||
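The refs_newer() helper added above treats delayed_refs->ref_seq as a monotonically increasing count of processed refs and reports 1 once the counter has left the half-open window [seq, seq + count): either at least count refs ran since the snapshot, or the counter moved below it. A minimal userspace sketch of the same predicate (window_passed is an illustrative name, not a kernel symbol):

    #include <assert.h>

    /* same test as refs_newer(), on a plain int instead of atomic_t */
    static int window_passed(int now, int seq, int count)
    {
        return now < seq || now >= seq + count;
    }

    int main(void)
    {
        int seq = 100;                          /* snapshot before waiting */
        assert(!window_passed(100, seq, 256));  /* no progress yet         */
        assert(!window_passed(300, seq, 256));  /* progress, but < 256     */
        assert(window_passed(356, seq, 256));   /* >= 256 refs ran         */
        assert(window_passed(50, seq, 256));    /* counter moved below seq */
        return 0;
    }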
| 2438 | /* | 2456 | /* |
| 2439 | * this starts processing the delayed reference count updates and | 2457 | * this starts processing the delayed reference count updates and |
| 2440 | * extent insertions we have queued up so far. count can be | 2458 | * extent insertions we have queued up so far. count can be |
| @@ -2469,6 +2487,44 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, | |||
| 2469 | 2487 | ||
| 2470 | delayed_refs = &trans->transaction->delayed_refs; | 2488 | delayed_refs = &trans->transaction->delayed_refs; |
| 2471 | INIT_LIST_HEAD(&cluster); | 2489 | INIT_LIST_HEAD(&cluster); |
| 2490 | if (count == 0) { | ||
| 2491 | count = delayed_refs->num_entries * 2; | ||
| 2492 | run_most = 1; | ||
| 2493 | } | ||
| 2494 | |||
| 2495 | if (!run_all && !run_most) { | ||
| 2496 | int old; | ||
| 2497 | int seq = atomic_read(&delayed_refs->ref_seq); | ||
| 2498 | |||
| 2499 | progress: | ||
| 2500 | old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1); | ||
| 2501 | if (old) { | ||
| 2502 | DEFINE_WAIT(__wait); | ||
| 2503 | if (delayed_refs->num_entries < 16348) | ||
| 2504 | return 0; | ||
| 2505 | |||
| 2506 | prepare_to_wait(&delayed_refs->wait, &__wait, | ||
| 2507 | TASK_UNINTERRUPTIBLE); | ||
| 2508 | |||
| 2509 | old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1); | ||
| 2510 | if (old) { | ||
| 2511 | schedule(); | ||
| 2512 | finish_wait(&delayed_refs->wait, &__wait); | ||
| 2513 | |||
| 2514 | if (!refs_newer(delayed_refs, seq, 256)) | ||
| 2515 | goto progress; | ||
| 2516 | else | ||
| 2517 | return 0; | ||
| 2518 | } else { | ||
| 2519 | finish_wait(&delayed_refs->wait, &__wait); | ||
| 2520 | goto again; | ||
| 2521 | } | ||
| 2522 | } | ||
| 2523 | |||
| 2524 | } else { | ||
| 2525 | atomic_inc(&delayed_refs->procs_running_refs); | ||
| 2526 | } | ||
| 2527 | |||
| 2472 | again: | 2528 | again: |
| 2473 | loops = 0; | 2529 | loops = 0; |
| 2474 | spin_lock(&delayed_refs->lock); | 2530 | spin_lock(&delayed_refs->lock); |
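The hunk above turns btrfs_run_delayed_refs() into a single-runner gate: the first caller wins the atomic_cmpxchg() on procs_running_refs and does the work, while later callers sleep on delayed_refs->wait until enough refs have run on their behalf, or retake the gate themselves. Below is a compilable userspace analogue using C11 atomics and a condition variable; the small-backlog early return (num_entries < 16348) is omitted, and all names are illustrative rather than kernel API:

    #include <pthread.h>
    #include <stdatomic.h>

    static atomic_int running;    /* plays procs_running_refs */
    static atomic_int progress;   /* plays ref_seq            */
    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t waitq = PTHREAD_COND_INITIALIZER;

    static void run_refs(void (*do_batch)(void))
    {
        int seq = atomic_load(&progress);   /* snapshot, as in the kernel */

        for (;;) {
            int expected = 0;
            if (atomic_compare_exchange_strong(&running, &expected, 1))
                break;                      /* gate won: we are the runner */

            /* someone else is running: sleep, as prepare_to_wait()/
             * schedule() do on delayed_refs->wait in the kernel */
            pthread_mutex_lock(&lock);
            if (atomic_load(&running))
                pthread_cond_wait(&waitq, &lock);
            pthread_mutex_unlock(&lock);

            /* enough refs run on our behalf?  done; else retry the gate */
            int now = atomic_load(&progress);
            if (now < seq || now >= seq + 256)
                return;
        }

        do_batch();                         /* run a cluster of refs      */
        atomic_fetch_add(&progress, 256);   /* atomic_add(ret, &ref_seq)  */

        pthread_mutex_lock(&lock);          /* drop the gate, wake waiters */
        atomic_store(&running, 0);
        pthread_cond_broadcast(&waitq);
        pthread_mutex_unlock(&lock);
    }

    static void batch(void) { /* delayed refs would run here */ }

    int main(void)
    {
        run_refs(batch);
        return 0;
    }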
| @@ -2477,10 +2533,6 @@ again: | |||
| 2477 | delayed_refs->run_delayed_start = find_middle(&delayed_refs->root); | 2533 | delayed_refs->run_delayed_start = find_middle(&delayed_refs->root); |
| 2478 | #endif | 2534 | #endif |
| 2479 | 2535 | ||
| 2480 | if (count == 0) { | ||
| 2481 | count = delayed_refs->num_entries * 2; | ||
| 2482 | run_most = 1; | ||
| 2483 | } | ||
| 2484 | while (1) { | 2536 | while (1) { |
| 2485 | if (!(run_all || run_most) && | 2537 | if (!(run_all || run_most) && |
| 2486 | delayed_refs->num_heads_ready < 64) | 2538 | delayed_refs->num_heads_ready < 64) |
| @@ -2500,11 +2552,15 @@ again: | |||
| 2500 | 2552 | ||
| 2501 | ret = run_clustered_refs(trans, root, &cluster); | 2553 | ret = run_clustered_refs(trans, root, &cluster); |
| 2502 | if (ret < 0) { | 2554 | if (ret < 0) { |
| 2555 | btrfs_release_ref_cluster(&cluster); | ||
| 2503 | spin_unlock(&delayed_refs->lock); | 2556 | spin_unlock(&delayed_refs->lock); |
| 2504 | btrfs_abort_transaction(trans, root, ret); | 2557 | btrfs_abort_transaction(trans, root, ret); |
| 2558 | atomic_dec(&delayed_refs->procs_running_refs); | ||
| 2505 | return ret; | 2559 | return ret; |
| 2506 | } | 2560 | } |
| 2507 | 2561 | ||
| 2562 | atomic_add(ret, &delayed_refs->ref_seq); | ||
| 2563 | |||
| 2508 | count -= min_t(unsigned long, ret, count); | 2564 | count -= min_t(unsigned long, ret, count); |
| 2509 | 2565 | ||
| 2510 | if (count == 0) | 2566 | if (count == 0) |
| @@ -2573,6 +2629,11 @@ again: | |||
| 2573 | goto again; | 2629 | goto again; |
| 2574 | } | 2630 | } |
| 2575 | out: | 2631 | out: |
| 2632 | atomic_dec(&delayed_refs->procs_running_refs); | ||
| 2633 | smp_mb(); | ||
| 2634 | if (waitqueue_active(&delayed_refs->wait)) | ||
| 2635 | wake_up(&delayed_refs->wait); | ||
| 2636 | |||
| 2576 | spin_unlock(&delayed_refs->lock); | 2637 | spin_unlock(&delayed_refs->lock); |
| 2577 | assert_qgroups_uptodate(trans); | 2638 | assert_qgroups_uptodate(trans); |
| 2578 | return 0; | 2639 | return 0; |
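The out: path above pairs atomic_dec() with an explicit smp_mb() before the lock-free waitqueue_active() test. The barrier matters because atomic_dec() carries no ordering: without it the decrement could become visible after the emptiness check, and a sleeper that registered in between would never be woken. A two-sided sketch of the idiom using C11 seq_cst atomics, which supply the barrier implicitly; the names are illustrative:

    #include <stdatomic.h>
    #include <stdbool.h>

    static atomic_int procs_running;
    static atomic_bool have_waiter;       /* plays waitqueue_active() */

    static void waker_side(void)
    {
        atomic_fetch_sub(&procs_running, 1);  /* the state change      */
        /* seq_cst ordering here does what smp_mb() does in the kernel:
         * the decrement is visible before we peek at the wait queue  */
        if (atomic_load(&have_waiter)) {
            /* wake_up(&delayed_refs->wait) goes here */
        }
    }

    static bool sleeper_should_block(void)
    {
        atomic_store(&have_waiter, true);     /* prepare_to_wait()     */
        /* re-check the condition only after registering as a waiter  */
        return atomic_load(&procs_running) != 0;
    }

    int main(void)
    {
        atomic_store(&procs_running, 1);
        if (sleeper_should_block())
            waker_side();                     /* would wake the sleeper */
        return 0;
    }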
| @@ -2586,7 +2647,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, | |||
| 2586 | struct btrfs_delayed_extent_op *extent_op; | 2647 | struct btrfs_delayed_extent_op *extent_op; |
| 2587 | int ret; | 2648 | int ret; |
| 2588 | 2649 | ||
| 2589 | extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); | 2650 | extent_op = btrfs_alloc_delayed_extent_op(); |
| 2590 | if (!extent_op) | 2651 | if (!extent_op) |
| 2591 | return -ENOMEM; | 2652 | return -ENOMEM; |
| 2592 | 2653 | ||
| @@ -2598,7 +2659,7 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, | |||
| 2598 | ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr, | 2659 | ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr, |
| 2599 | num_bytes, extent_op); | 2660 | num_bytes, extent_op); |
| 2600 | if (ret) | 2661 | if (ret) |
| 2601 | kfree(extent_op); | 2662 | btrfs_free_delayed_extent_op(extent_op); |
| 2602 | return ret; | 2663 | return ret; |
| 2603 | } | 2664 | } |
| 2604 | 2665 | ||
| @@ -3223,12 +3284,14 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) | |||
| 3223 | u64 extra_flags = chunk_to_extended(flags) & | 3284 | u64 extra_flags = chunk_to_extended(flags) & |
| 3224 | BTRFS_EXTENDED_PROFILE_MASK; | 3285 | BTRFS_EXTENDED_PROFILE_MASK; |
| 3225 | 3286 | ||
| 3287 | write_seqlock(&fs_info->profiles_lock); | ||
| 3226 | if (flags & BTRFS_BLOCK_GROUP_DATA) | 3288 | if (flags & BTRFS_BLOCK_GROUP_DATA) |
| 3227 | fs_info->avail_data_alloc_bits |= extra_flags; | 3289 | fs_info->avail_data_alloc_bits |= extra_flags; |
| 3228 | if (flags & BTRFS_BLOCK_GROUP_METADATA) | 3290 | if (flags & BTRFS_BLOCK_GROUP_METADATA) |
| 3229 | fs_info->avail_metadata_alloc_bits |= extra_flags; | 3291 | fs_info->avail_metadata_alloc_bits |= extra_flags; |
| 3230 | if (flags & BTRFS_BLOCK_GROUP_SYSTEM) | 3292 | if (flags & BTRFS_BLOCK_GROUP_SYSTEM) |
| 3231 | fs_info->avail_system_alloc_bits |= extra_flags; | 3293 | fs_info->avail_system_alloc_bits |= extra_flags; |
| 3294 | write_sequnlock(&fs_info->profiles_lock); | ||
| 3232 | } | 3295 | } |
| 3233 | 3296 | ||
| 3234 | /* | 3297 | /* |
| @@ -3276,6 +3339,7 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) | |||
| 3276 | u64 num_devices = root->fs_info->fs_devices->rw_devices + | 3339 | u64 num_devices = root->fs_info->fs_devices->rw_devices + |
| 3277 | root->fs_info->fs_devices->missing_devices; | 3340 | root->fs_info->fs_devices->missing_devices; |
| 3278 | u64 target; | 3341 | u64 target; |
| 3342 | u64 tmp; | ||
| 3279 | 3343 | ||
| 3280 | /* | 3344 | /* |
| 3281 | * see if restripe for this chunk_type is in progress, if so | 3345 | * see if restripe for this chunk_type is in progress, if so |
| @@ -3292,40 +3356,48 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) | |||
| 3292 | } | 3356 | } |
| 3293 | spin_unlock(&root->fs_info->balance_lock); | 3357 | spin_unlock(&root->fs_info->balance_lock); |
| 3294 | 3358 | ||
| 3359 | /* First, mask out the RAID levels which aren't possible */ | ||
| 3295 | if (num_devices == 1) | 3360 | if (num_devices == 1) |
| 3296 | flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0); | 3361 | flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0 | |
| 3362 | BTRFS_BLOCK_GROUP_RAID5); | ||
| 3363 | if (num_devices < 3) | ||
| 3364 | flags &= ~BTRFS_BLOCK_GROUP_RAID6; | ||
| 3297 | if (num_devices < 4) | 3365 | if (num_devices < 4) |
| 3298 | flags &= ~BTRFS_BLOCK_GROUP_RAID10; | 3366 | flags &= ~BTRFS_BLOCK_GROUP_RAID10; |
| 3299 | 3367 | ||
| 3300 | if ((flags & BTRFS_BLOCK_GROUP_DUP) && | 3368 | tmp = flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 | |
| 3301 | (flags & (BTRFS_BLOCK_GROUP_RAID1 | | 3369 | BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 | |
| 3302 | BTRFS_BLOCK_GROUP_RAID10))) { | 3370 | BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10); |
| 3303 | flags &= ~BTRFS_BLOCK_GROUP_DUP; | 3371 | flags &= ~tmp; |
| 3304 | } | ||
| 3305 | |||
| 3306 | if ((flags & BTRFS_BLOCK_GROUP_RAID1) && | ||
| 3307 | (flags & BTRFS_BLOCK_GROUP_RAID10)) { | ||
| 3308 | flags &= ~BTRFS_BLOCK_GROUP_RAID1; | ||
| 3309 | } | ||
| 3310 | 3372 | ||
| 3311 | if ((flags & BTRFS_BLOCK_GROUP_RAID0) && | 3373 | if (tmp & BTRFS_BLOCK_GROUP_RAID6) |
| 3312 | ((flags & BTRFS_BLOCK_GROUP_RAID1) | | 3374 | tmp = BTRFS_BLOCK_GROUP_RAID6; |
| 3313 | (flags & BTRFS_BLOCK_GROUP_RAID10) | | 3375 | else if (tmp & BTRFS_BLOCK_GROUP_RAID5) |
| 3314 | (flags & BTRFS_BLOCK_GROUP_DUP))) { | 3376 | tmp = BTRFS_BLOCK_GROUP_RAID5; |
| 3315 | flags &= ~BTRFS_BLOCK_GROUP_RAID0; | 3377 | else if (tmp & BTRFS_BLOCK_GROUP_RAID10) |
| 3316 | } | 3378 | tmp = BTRFS_BLOCK_GROUP_RAID10; |
| 3379 | else if (tmp & BTRFS_BLOCK_GROUP_RAID1) | ||
| 3380 | tmp = BTRFS_BLOCK_GROUP_RAID1; | ||
| 3381 | else if (tmp & BTRFS_BLOCK_GROUP_RAID0) | ||
| 3382 | tmp = BTRFS_BLOCK_GROUP_RAID0; | ||
| 3317 | 3383 | ||
| 3318 | return extended_to_chunk(flags); | 3384 | return extended_to_chunk(flags | tmp); |
| 3319 | } | 3385 | } |
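The rewritten reduction above no longer clears conflicting bits pairwise; it collects every redundancy profile into tmp and keeps only the most-preferred one, in the fixed order RAID6 > RAID5 > RAID10 > RAID1 > RAID0 (DUP survives only when nothing else is set). A standalone sketch of the same ladder; the flag bit values are copied from the on-disk format of this era but should be treated as assumptions here:

    #include <stdint.h>
    #include <stdio.h>

    #define RAID0  (1ULL << 3)      /* BTRFS_BLOCK_GROUP_* bit values */
    #define RAID1  (1ULL << 4)
    #define DUP    (1ULL << 5)
    #define RAID10 (1ULL << 6)
    #define RAID5  (1ULL << 7)
    #define RAID6  (1ULL << 8)

    /* keep only the most-preferred profile that survived the
     * num_devices masking -- the same if/else ladder as above */
    static uint64_t pick_profile(uint64_t tmp)
    {
        if (tmp & RAID6)  return RAID6;
        if (tmp & RAID5)  return RAID5;
        if (tmp & RAID10) return RAID10;
        if (tmp & RAID1)  return RAID1;
        if (tmp & RAID0)  return RAID0;
        return tmp & DUP;   /* DUP (or nothing) passes through */
    }

    int main(void)
    {
        /* e.g. a balance left both the RAID1 and RAID5 bits set */
        printf("%#llx\n", (unsigned long long)pick_profile(RAID1 | RAID5));
        return 0;           /* prints 0x80: RAID5 wins           */
    }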
| 3320 | 3386 | ||
| 3321 | static u64 get_alloc_profile(struct btrfs_root *root, u64 flags) | 3387 | static u64 get_alloc_profile(struct btrfs_root *root, u64 flags) |
| 3322 | { | 3388 | { |
| 3323 | if (flags & BTRFS_BLOCK_GROUP_DATA) | 3389 | unsigned seq; |
| 3324 | flags |= root->fs_info->avail_data_alloc_bits; | 3390 | |
| 3325 | else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) | 3391 | do { |
| 3326 | flags |= root->fs_info->avail_system_alloc_bits; | 3392 | seq = read_seqbegin(&root->fs_info->profiles_lock); |
| 3327 | else if (flags & BTRFS_BLOCK_GROUP_METADATA) | 3393 | |
| 3328 | flags |= root->fs_info->avail_metadata_alloc_bits; | 3394 | if (flags & BTRFS_BLOCK_GROUP_DATA) |
| 3395 | flags |= root->fs_info->avail_data_alloc_bits; | ||
| 3396 | else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) | ||
| 3397 | flags |= root->fs_info->avail_system_alloc_bits; | ||
| 3398 | else if (flags & BTRFS_BLOCK_GROUP_METADATA) | ||
| 3399 | flags |= root->fs_info->avail_metadata_alloc_bits; | ||
| 3400 | } while (read_seqretry(&root->fs_info->profiles_lock, seq)); | ||
| 3329 | 3401 | ||
| 3330 | return btrfs_reduce_alloc_profile(root, flags); | 3402 | return btrfs_reduce_alloc_profile(root, flags); |
| 3331 | } | 3403 | } |
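The avail_*_alloc_bits fields are now published under fs_info->profiles_lock, a seqlock: writers (set_avail_alloc_bits above, clear_avail_alloc_bits later in this diff) bump the sequence around the update, and the reader in get_alloc_profile retries until it sees a stable even sequence. A minimal userspace rendition of the pattern; memory ordering and writer-vs-writer exclusion are simplified here, since the kernel's seqlock_t handles both:

    #include <stdatomic.h>
    #include <stdint.h>

    struct seqlock {
        atomic_uint seq;                 /* odd while a write is in flight */
        uint64_t avail_data_alloc_bits;
    };

    static void writer_update(struct seqlock *sl, uint64_t extra_flags)
    {
        atomic_fetch_add(&sl->seq, 1);   /* write_seqlock(): seq goes odd  */
        sl->avail_data_alloc_bits |= extra_flags;
        atomic_fetch_add(&sl->seq, 1);   /* write_sequnlock(): even again  */
    }

    static uint64_t reader_snapshot(struct seqlock *sl)
    {
        unsigned start;
        uint64_t flags;

        do {
            while ((start = atomic_load(&sl->seq)) & 1)
                ;                        /* writer active: spin            */
            flags = sl->avail_data_alloc_bits;
        } while (atomic_load(&sl->seq) != start);   /* read_seqretry()     */
        return flags;
    }

    int main(void)
    {
        static struct seqlock sl;        /* zero-initialized               */
        writer_update(&sl, 1 << 4);      /* publish e.g. a RAID1 bit       */
        return reader_snapshot(&sl) == (1 << 4) ? 0 : 1;
    }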
| @@ -3333,6 +3405,7 @@ static u64 get_alloc_profile(struct btrfs_root *root, u64 flags) | |||
| 3333 | u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) | 3405 | u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) |
| 3334 | { | 3406 | { |
| 3335 | u64 flags; | 3407 | u64 flags; |
| 3408 | u64 ret; | ||
| 3336 | 3409 | ||
| 3337 | if (data) | 3410 | if (data) |
| 3338 | flags = BTRFS_BLOCK_GROUP_DATA; | 3411 | flags = BTRFS_BLOCK_GROUP_DATA; |
| @@ -3341,7 +3414,8 @@ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) | |||
| 3341 | else | 3414 | else |
| 3342 | flags = BTRFS_BLOCK_GROUP_METADATA; | 3415 | flags = BTRFS_BLOCK_GROUP_METADATA; |
| 3343 | 3416 | ||
| 3344 | return get_alloc_profile(root, flags); | 3417 | ret = get_alloc_profile(root, flags); |
| 3418 | return ret; | ||
| 3345 | } | 3419 | } |
| 3346 | 3420 | ||
| 3347 | /* | 3421 | /* |
| @@ -3357,7 +3431,7 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes) | |||
| 3357 | int ret = 0, committed = 0, alloc_chunk = 1; | 3431 | int ret = 0, committed = 0, alloc_chunk = 1; |
| 3358 | 3432 | ||
| 3359 | /* make sure bytes are sectorsize aligned */ | 3433 | /* make sure bytes are sectorsize aligned */ |
| 3360 | bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); | 3434 | bytes = ALIGN(bytes, root->sectorsize); |
| 3361 | 3435 | ||
| 3362 | if (root == root->fs_info->tree_root || | 3436 | if (root == root->fs_info->tree_root || |
| 3363 | BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) { | 3437 | BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) { |
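Several hunks in this commit replace open-coded round-up masks with ALIGN() or round_down(); for a power-of-two sectorsize the results are bit-for-bit identical. A quick check (the macro body is reproduced here for the sketch and assumes a power-of-two alignment):

    #include <assert.h>
    #include <stdint.h>

    #define ALIGN(x, a) (((x) + (a) - 1) & ~((uint64_t)(a) - 1))

    int main(void)
    {
        uint64_t sectorsize = 4096;

        assert(ALIGN(1, sectorsize) == 4096);     /* round up        */
        assert(ALIGN(4096, sectorsize) == 4096);  /* already aligned */
        assert(ALIGN(4097, sectorsize) == 8192);
        /* identical to the expression it replaces above */
        assert(ALIGN(4097, sectorsize) ==
               ((4097 + sectorsize - 1) & ~(sectorsize - 1)));
        return 0;
    }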
| @@ -3452,7 +3526,7 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes) | |||
| 3452 | struct btrfs_space_info *data_sinfo; | 3526 | struct btrfs_space_info *data_sinfo; |
| 3453 | 3527 | ||
| 3454 | /* make sure bytes are sectorsize aligned */ | 3528 | /* make sure bytes are sectorsize aligned */ |
| 3455 | bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); | 3529 | bytes = ALIGN(bytes, root->sectorsize); |
| 3456 | 3530 | ||
| 3457 | data_sinfo = root->fs_info->data_sinfo; | 3531 | data_sinfo = root->fs_info->data_sinfo; |
| 3458 | spin_lock(&data_sinfo->lock); | 3532 | spin_lock(&data_sinfo->lock); |
| @@ -3516,8 +3590,10 @@ static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type) | |||
| 3516 | { | 3590 | { |
| 3517 | u64 num_dev; | 3591 | u64 num_dev; |
| 3518 | 3592 | ||
| 3519 | if (type & BTRFS_BLOCK_GROUP_RAID10 || | 3593 | if (type & (BTRFS_BLOCK_GROUP_RAID10 | |
| 3520 | type & BTRFS_BLOCK_GROUP_RAID0) | 3594 | BTRFS_BLOCK_GROUP_RAID0 | |
| 3595 | BTRFS_BLOCK_GROUP_RAID5 | | ||
| 3596 | BTRFS_BLOCK_GROUP_RAID6)) | ||
| 3521 | num_dev = root->fs_info->fs_devices->rw_devices; | 3597 | num_dev = root->fs_info->fs_devices->rw_devices; |
| 3522 | else if (type & BTRFS_BLOCK_GROUP_RAID1) | 3598 | else if (type & BTRFS_BLOCK_GROUP_RAID1) |
| 3523 | num_dev = 2; | 3599 | num_dev = 2; |
| @@ -3564,6 +3640,10 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans, | |||
| 3564 | int wait_for_alloc = 0; | 3640 | int wait_for_alloc = 0; |
| 3565 | int ret = 0; | 3641 | int ret = 0; |
| 3566 | 3642 | ||
| 3643 | /* Don't re-enter if we're already allocating a chunk */ | ||
| 3644 | if (trans->allocating_chunk) | ||
| 3645 | return -ENOSPC; | ||
| 3646 | |||
| 3567 | space_info = __find_space_info(extent_root->fs_info, flags); | 3647 | space_info = __find_space_info(extent_root->fs_info, flags); |
| 3568 | if (!space_info) { | 3648 | if (!space_info) { |
| 3569 | ret = update_space_info(extent_root->fs_info, flags, | 3649 | ret = update_space_info(extent_root->fs_info, flags, |
| @@ -3606,6 +3686,8 @@ again: | |||
| 3606 | goto again; | 3686 | goto again; |
| 3607 | } | 3687 | } |
| 3608 | 3688 | ||
| 3689 | trans->allocating_chunk = true; | ||
| 3690 | |||
| 3609 | /* | 3691 | /* |
| 3610 | * If we have mixed data/metadata chunks we want to make sure we keep | 3692 | * If we have mixed data/metadata chunks we want to make sure we keep |
| 3611 | * allocating mixed chunks instead of individual chunks. | 3693 | * allocating mixed chunks instead of individual chunks. |
| @@ -3632,19 +3714,20 @@ again: | |||
| 3632 | check_system_chunk(trans, extent_root, flags); | 3714 | check_system_chunk(trans, extent_root, flags); |
| 3633 | 3715 | ||
| 3634 | ret = btrfs_alloc_chunk(trans, extent_root, flags); | 3716 | ret = btrfs_alloc_chunk(trans, extent_root, flags); |
| 3635 | if (ret < 0 && ret != -ENOSPC) | 3717 | trans->allocating_chunk = false; |
| 3636 | goto out; | ||
| 3637 | 3718 | ||
| 3638 | spin_lock(&space_info->lock); | 3719 | spin_lock(&space_info->lock); |
| 3720 | if (ret < 0 && ret != -ENOSPC) | ||
| 3721 | goto out; | ||
| 3639 | if (ret) | 3722 | if (ret) |
| 3640 | space_info->full = 1; | 3723 | space_info->full = 1; |
| 3641 | else | 3724 | else |
| 3642 | ret = 1; | 3725 | ret = 1; |
| 3643 | 3726 | ||
| 3644 | space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; | 3727 | space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; |
| 3728 | out: | ||
| 3645 | space_info->chunk_alloc = 0; | 3729 | space_info->chunk_alloc = 0; |
| 3646 | spin_unlock(&space_info->lock); | 3730 | spin_unlock(&space_info->lock); |
| 3647 | out: | ||
| 3648 | mutex_unlock(&fs_info->chunk_mutex); | 3731 | mutex_unlock(&fs_info->chunk_mutex); |
| 3649 | return ret; | 3732 | return ret; |
| 3650 | } | 3733 | } |
| @@ -3653,13 +3736,31 @@ static int can_overcommit(struct btrfs_root *root, | |||
| 3653 | struct btrfs_space_info *space_info, u64 bytes, | 3736 | struct btrfs_space_info *space_info, u64 bytes, |
| 3654 | enum btrfs_reserve_flush_enum flush) | 3737 | enum btrfs_reserve_flush_enum flush) |
| 3655 | { | 3738 | { |
| 3739 | struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; | ||
| 3656 | u64 profile = btrfs_get_alloc_profile(root, 0); | 3740 | u64 profile = btrfs_get_alloc_profile(root, 0); |
| 3741 | u64 rsv_size = 0; | ||
| 3657 | u64 avail; | 3742 | u64 avail; |
| 3658 | u64 used; | 3743 | u64 used; |
| 3744 | u64 to_add; | ||
| 3659 | 3745 | ||
| 3660 | used = space_info->bytes_used + space_info->bytes_reserved + | 3746 | used = space_info->bytes_used + space_info->bytes_reserved + |
| 3661 | space_info->bytes_pinned + space_info->bytes_readonly + | 3747 | space_info->bytes_pinned + space_info->bytes_readonly; |
| 3662 | space_info->bytes_may_use; | 3748 | |
| 3749 | spin_lock(&global_rsv->lock); | ||
| 3750 | rsv_size = global_rsv->size; | ||
| 3751 | spin_unlock(&global_rsv->lock); | ||
| 3752 | |||
| 3753 | /* | ||
| 3754 | * We only want to allow over committing if we have lots of actual space | ||
| 3755 | * free, but if we don't have enough space to handle the global reserve | ||
| 3756 | * space then we could end up having a real enospc problem when trying | ||
| 3757 | * to allocate a chunk or some other such important allocation. | ||
| 3758 | */ | ||
| 3759 | rsv_size <<= 1; | ||
| 3760 | if (used + rsv_size >= space_info->total_bytes) | ||
| 3761 | return 0; | ||
| 3762 | |||
| 3763 | used += space_info->bytes_may_use; | ||
| 3663 | 3764 | ||
| 3664 | spin_lock(&root->fs_info->free_chunk_lock); | 3765 | spin_lock(&root->fs_info->free_chunk_lock); |
| 3665 | avail = root->fs_info->free_chunk_space; | 3766 | avail = root->fs_info->free_chunk_space; |
| @@ -3667,28 +3768,60 @@ static int can_overcommit(struct btrfs_root *root, | |||
| 3667 | 3768 | ||
| 3668 | /* | 3769 | /* |
| 3669 | * If we have dup, raid1 or raid10 then only half of the free | 3770 | * If we have dup, raid1 or raid10 then only half of the free |
| 3670 | * space is actually useable. | 3771 | * space is actually useable. For raid56, the space info used |
| 3772 | * doesn't include the parity drive, so we don't have to | ||
| 3773 | * change the math | ||
| 3671 | */ | 3774 | */ |
| 3672 | if (profile & (BTRFS_BLOCK_GROUP_DUP | | 3775 | if (profile & (BTRFS_BLOCK_GROUP_DUP | |
| 3673 | BTRFS_BLOCK_GROUP_RAID1 | | 3776 | BTRFS_BLOCK_GROUP_RAID1 | |
| 3674 | BTRFS_BLOCK_GROUP_RAID10)) | 3777 | BTRFS_BLOCK_GROUP_RAID10)) |
| 3675 | avail >>= 1; | 3778 | avail >>= 1; |
| 3676 | 3779 | ||
| 3780 | to_add = space_info->total_bytes; | ||
| 3781 | |||
| 3677 | /* | 3782 | /* |
| 3678 | * If we aren't flushing all things, let us overcommit up to | 3783 | * If we aren't flushing all things, let us overcommit up to |
| 3679 | * 1/2th of the space. If we can flush, don't let us overcommit | 3784 | * 1/2th of the space. If we can flush, don't let us overcommit |
| 3680 | * too much, let it overcommit up to 1/8 of the space. | 3785 | * too much, let it overcommit up to 1/8 of the space. |
| 3681 | */ | 3786 | */ |
| 3682 | if (flush == BTRFS_RESERVE_FLUSH_ALL) | 3787 | if (flush == BTRFS_RESERVE_FLUSH_ALL) |
| 3683 | avail >>= 3; | 3788 | to_add >>= 3; |
| 3684 | else | 3789 | else |
| 3685 | avail >>= 1; | 3790 | to_add >>= 1; |
| 3686 | 3791 | ||
| 3687 | if (used + bytes < space_info->total_bytes + avail) | 3792 | /* |
| 3793 | * Limit the overcommit to the amount of free space we could possibly | ||
| 3794 | * allocate for chunks. | ||
| 3795 | */ | ||
| 3796 | to_add = min(avail, to_add); | ||
| 3797 | |||
| 3798 | if (used + bytes < space_info->total_bytes + to_add) | ||
| 3688 | return 1; | 3799 | return 1; |
| 3689 | return 0; | 3800 | return 0; |
| 3690 | } | 3801 | } |
| 3691 | 3802 | ||
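The reworked can_overcommit() adds two separate gates: refuse outright once used space plus twice the global reserve reaches total_bytes, and otherwise cap the overcommit allowance at min(free chunk space after the RAID halving, total/8 for FLUSH_ALL or total/2 otherwise). A worked numeric example with made-up figures (the bytes_may_use split between the two gates is glossed over):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t total    = 8ULL << 30;    /* 8 GiB metadata space    */
        uint64_t used     = 5ULL << 30;    /* used+reserved+pinned+ro */
        uint64_t rsv_size = 512ULL << 20;  /* global block reserve    */
        uint64_t avail    = 3ULL << 30;    /* free chunk space        */
        uint64_t bytes    = 256ULL << 20;  /* this reservation        */

        /* gate 1: keep headroom for twice the global reserve */
        if (used + (rsv_size << 1) >= total) {
            puts("no overcommit: reserve headroom gone");
            return 0;
        }

        /* gate 2: overcommit at most total/8 (FLUSH_ALL), and never
         * more than we could still turn into real chunks */
        uint64_t to_add = total >> 3;              /* 1 GiB */
        if (avail < to_add)
            to_add = avail;

        puts(used + bytes < total + to_add ? "overcommit ok"
                                           : "no overcommit");
        return 0;
    }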
| 3803 | void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root, | ||
| 3804 | unsigned long nr_pages) | ||
| 3805 | { | ||
| 3806 | struct super_block *sb = root->fs_info->sb; | ||
| 3807 | int started; | ||
| 3808 | |||
| 3809 | /* If we can not start writeback, just sync all the delalloc file. */ | ||
| 3810 | started = try_to_writeback_inodes_sb_nr(sb, nr_pages, | ||
| 3811 | WB_REASON_FS_FREE_SPACE); | ||
| 3812 | if (!started) { | ||
| 3813 | /* | ||
| 3814 | * We needn't worry the filesystem going from r/w to r/o though | ||
| 3815 | * we don't acquire ->s_umount mutex, because the filesystem | ||
| 3816 | * should guarantee the delalloc inodes list be empty after | ||
| 3817 | * the filesystem is readonly(all dirty pages are written to | ||
| 3818 | * the disk). | ||
| 3819 | */ | ||
| 3820 | btrfs_start_delalloc_inodes(root, 0); | ||
| 3821 | btrfs_wait_ordered_extents(root, 0); | ||
| 3822 | } | ||
| 3823 | } | ||
| 3824 | |||
| 3692 | /* | 3825 | /* |
| 3693 | * shrink metadata reservation for delalloc | 3826 | * shrink metadata reservation for delalloc |
| 3694 | */ | 3827 | */ |
| @@ -3710,7 +3843,8 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, | |||
| 3710 | space_info = block_rsv->space_info; | 3843 | space_info = block_rsv->space_info; |
| 3711 | 3844 | ||
| 3712 | smp_mb(); | 3845 | smp_mb(); |
| 3713 | delalloc_bytes = root->fs_info->delalloc_bytes; | 3846 | delalloc_bytes = percpu_counter_sum_positive( |
| 3847 | &root->fs_info->delalloc_bytes); | ||
| 3714 | if (delalloc_bytes == 0) { | 3848 | if (delalloc_bytes == 0) { |
| 3715 | if (trans) | 3849 | if (trans) |
| 3716 | return; | 3850 | return; |
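fs_info->delalloc_bytes has become a struct percpu_counter elsewhere in this commit, so shrink_delalloc() now uses percpu_counter_sum_positive() to fold the per-CPU deltas into one non-negative total; reading only the shared count would miss bytes still batched on other CPUs. A toy model of what the summed read does (the kernel's lib/percpu_counter.c adds locking and batching thresholds):

    #include <stdint.h>
    #include <stdio.h>

    #define NR_CPUS 4

    struct pcpu_counter {
        int64_t count;              /* global part, updated in batches */
        int32_t pcpu[NR_CPUS];      /* per-cpu deltas not yet folded   */
    };

    static int64_t sum_positive(const struct pcpu_counter *c)
    {
        int64_t ret = c->count;
        for (int cpu = 0; cpu < NR_CPUS; cpu++)
            ret += c->pcpu[cpu];
        return ret < 0 ? 0 : ret;   /* the _positive clamp             */
    }

    int main(void)
    {
        struct pcpu_counter delalloc = {
            .count = 1 << 20,                  /* 1 MiB already folded */
            .pcpu  = { 4096, -8192, 0, 512 }
        };
        /* exact total, including deltas a cheap read would miss */
        printf("%lld\n", (long long)sum_positive(&delalloc));
        return 0;
    }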
| @@ -3721,10 +3855,7 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, | |||
| 3721 | while (delalloc_bytes && loops < 3) { | 3855 | while (delalloc_bytes && loops < 3) { |
| 3722 | max_reclaim = min(delalloc_bytes, to_reclaim); | 3856 | max_reclaim = min(delalloc_bytes, to_reclaim); |
| 3723 | nr_pages = max_reclaim >> PAGE_CACHE_SHIFT; | 3857 | nr_pages = max_reclaim >> PAGE_CACHE_SHIFT; |
| 3724 | try_to_writeback_inodes_sb_nr(root->fs_info->sb, | 3858 | btrfs_writeback_inodes_sb_nr(root, nr_pages); |
| 3725 | nr_pages, | ||
| 3726 | WB_REASON_FS_FREE_SPACE); | ||
| 3727 | |||
| 3728 | /* | 3859 | /* |
| 3729 | * We need to wait for the async pages to actually start before | 3860 | * We need to wait for the async pages to actually start before |
| 3730 | * we do anything. | 3861 | * we do anything. |
| @@ -3752,7 +3883,8 @@ static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig, | |||
| 3752 | break; | 3883 | break; |
| 3753 | } | 3884 | } |
| 3754 | smp_mb(); | 3885 | smp_mb(); |
| 3755 | delalloc_bytes = root->fs_info->delalloc_bytes; | 3886 | delalloc_bytes = percpu_counter_sum_positive( |
| 3887 | &root->fs_info->delalloc_bytes); | ||
| 3756 | } | 3888 | } |
| 3757 | } | 3889 | } |
| 3758 | 3890 | ||
| @@ -4016,6 +4148,15 @@ again: | |||
| 4016 | goto again; | 4148 | goto again; |
| 4017 | 4149 | ||
| 4018 | out: | 4150 | out: |
| 4151 | if (ret == -ENOSPC && | ||
| 4152 | unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) { | ||
| 4153 | struct btrfs_block_rsv *global_rsv = | ||
| 4154 | &root->fs_info->global_block_rsv; | ||
| 4155 | |||
| 4156 | if (block_rsv != global_rsv && | ||
| 4157 | !block_rsv_use_bytes(global_rsv, orig_bytes)) | ||
| 4158 | ret = 0; | ||
| 4159 | } | ||
| 4019 | if (flushing) { | 4160 | if (flushing) { |
| 4020 | spin_lock(&space_info->lock); | 4161 | spin_lock(&space_info->lock); |
| 4021 | space_info->flush = 0; | 4162 | space_info->flush = 0; |
| @@ -4402,19 +4543,60 @@ void btrfs_orphan_release_metadata(struct inode *inode) | |||
| 4402 | btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes); | 4543 | btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes); |
| 4403 | } | 4544 | } |
| 4404 | 4545 | ||
| 4405 | int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans, | 4546 | /* |
| 4406 | struct btrfs_pending_snapshot *pending) | 4547 | * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation |
| 4548 | * root: the root of the parent directory | ||
| 4549 | * rsv: block reservation | ||
| 4550 | * items: the number of items that we need do reservation | ||
| 4551 | * qgroup_reserved: used to return the reserved size in qgroup | ||
| 4552 | * | ||
| 4553 | * This function is used to reserve the space for snapshot/subvolume | ||
| 4554 | * creation and deletion. Those operations are different with the | ||
| 4555 | * common file/directory operations, they change two fs/file trees | ||
| 4556 | * and root tree, the number of items that the qgroup reserves is | ||
| 4557 | * different with the free space reservation. So we can not use | ||
| 4558 | * the space reseravtion mechanism in start_transaction(). | ||
| 4559 | */ | ||
| 4560 | int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, | ||
| 4561 | struct btrfs_block_rsv *rsv, | ||
| 4562 | int items, | ||
| 4563 | u64 *qgroup_reserved) | ||
| 4407 | { | 4564 | { |
| 4408 | struct btrfs_root *root = pending->root; | 4565 | u64 num_bytes; |
| 4409 | struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root); | 4566 | int ret; |
| 4410 | struct btrfs_block_rsv *dst_rsv = &pending->block_rsv; | 4567 | |
| 4411 | /* | 4568 | if (root->fs_info->quota_enabled) { |
| 4412 | * two for root back/forward refs, two for directory entries, | 4569 | /* One for parent inode, two for dir entries */ |
| 4413 | * one for root of the snapshot and one for parent inode. | 4570 | num_bytes = 3 * root->leafsize; |
| 4414 | */ | 4571 | ret = btrfs_qgroup_reserve(root, num_bytes); |
| 4415 | u64 num_bytes = btrfs_calc_trans_metadata_size(root, 6); | 4572 | if (ret) |
| 4416 | dst_rsv->space_info = src_rsv->space_info; | 4573 | return ret; |
| 4417 | return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); | 4574 | } else { |
| 4575 | num_bytes = 0; | ||
| 4576 | } | ||
| 4577 | |||
| 4578 | *qgroup_reserved = num_bytes; | ||
| 4579 | |||
| 4580 | num_bytes = btrfs_calc_trans_metadata_size(root, items); | ||
| 4581 | rsv->space_info = __find_space_info(root->fs_info, | ||
| 4582 | BTRFS_BLOCK_GROUP_METADATA); | ||
| 4583 | ret = btrfs_block_rsv_add(root, rsv, num_bytes, | ||
| 4584 | BTRFS_RESERVE_FLUSH_ALL); | ||
| 4585 | if (ret) { | ||
| 4586 | if (*qgroup_reserved) | ||
| 4587 | btrfs_qgroup_free(root, *qgroup_reserved); | ||
| 4588 | } | ||
| 4589 | |||
| 4590 | return ret; | ||
| 4591 | } | ||
| 4592 | |||
| 4593 | void btrfs_subvolume_release_metadata(struct btrfs_root *root, | ||
| 4594 | struct btrfs_block_rsv *rsv, | ||
| 4595 | u64 qgroup_reserved) | ||
| 4596 | { | ||
| 4597 | btrfs_block_rsv_release(root, rsv, (u64)-1); | ||
| 4598 | if (qgroup_reserved) | ||
| 4599 | btrfs_qgroup_free(root, qgroup_reserved); | ||
| 4418 | } | 4600 | } |
| 4419 | 4601 | ||
| 4420 | /** | 4602 | /** |
| @@ -4522,6 +4704,8 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) | |||
| 4522 | enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; | 4704 | enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL; |
| 4523 | int ret = 0; | 4705 | int ret = 0; |
| 4524 | bool delalloc_lock = true; | 4706 | bool delalloc_lock = true; |
| 4707 | u64 to_free = 0; | ||
| 4708 | unsigned dropped; | ||
| 4525 | 4709 | ||
| 4526 | /* If we are a free space inode we need to not flush since we will be in | 4710 | /* If we are a free space inode we need to not flush since we will be in |
| 4527 | * the middle of a transaction commit. We also don't need the delalloc | 4711 | * the middle of a transaction commit. We also don't need the delalloc |
| @@ -4565,54 +4749,19 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) | |||
| 4565 | csum_bytes = BTRFS_I(inode)->csum_bytes; | 4749 | csum_bytes = BTRFS_I(inode)->csum_bytes; |
| 4566 | spin_unlock(&BTRFS_I(inode)->lock); | 4750 | spin_unlock(&BTRFS_I(inode)->lock); |
| 4567 | 4751 | ||
| 4568 | if (root->fs_info->quota_enabled) | 4752 | if (root->fs_info->quota_enabled) { |
| 4569 | ret = btrfs_qgroup_reserve(root, num_bytes + | 4753 | ret = btrfs_qgroup_reserve(root, num_bytes + |
| 4570 | nr_extents * root->leafsize); | 4754 | nr_extents * root->leafsize); |
| 4755 | if (ret) | ||
| 4756 | goto out_fail; | ||
| 4757 | } | ||
| 4571 | 4758 | ||
| 4572 | /* | 4759 | ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush); |
| 4573 | * ret != 0 here means the qgroup reservation failed, we go straight to | 4760 | if (unlikely(ret)) { |
| 4574 | * the shared error handling then. | 4761 | if (root->fs_info->quota_enabled) |
| 4575 | */ | ||
| 4576 | if (ret == 0) | ||
| 4577 | ret = reserve_metadata_bytes(root, block_rsv, | ||
| 4578 | to_reserve, flush); | ||
| 4579 | |||
| 4580 | if (ret) { | ||
| 4581 | u64 to_free = 0; | ||
| 4582 | unsigned dropped; | ||
| 4583 | |||
| 4584 | spin_lock(&BTRFS_I(inode)->lock); | ||
| 4585 | dropped = drop_outstanding_extent(inode); | ||
| 4586 | /* | ||
| 4587 | * If the inodes csum_bytes is the same as the original | ||
| 4588 | * csum_bytes then we know we haven't raced with any free()ers | ||
| 4589 | * so we can just reduce our inodes csum bytes and carry on. | ||
| 4590 | * Otherwise we have to do the normal free thing to account for | ||
| 4591 | * the case that the free side didn't free up its reserve | ||
| 4592 | * because of this outstanding reservation. | ||
| 4593 | */ | ||
| 4594 | if (BTRFS_I(inode)->csum_bytes == csum_bytes) | ||
| 4595 | calc_csum_metadata_size(inode, num_bytes, 0); | ||
| 4596 | else | ||
| 4597 | to_free = calc_csum_metadata_size(inode, num_bytes, 0); | ||
| 4598 | spin_unlock(&BTRFS_I(inode)->lock); | ||
| 4599 | if (dropped) | ||
| 4600 | to_free += btrfs_calc_trans_metadata_size(root, dropped); | ||
| 4601 | |||
| 4602 | if (to_free) { | ||
| 4603 | btrfs_block_rsv_release(root, block_rsv, to_free); | ||
| 4604 | trace_btrfs_space_reservation(root->fs_info, | ||
| 4605 | "delalloc", | ||
| 4606 | btrfs_ino(inode), | ||
| 4607 | to_free, 0); | ||
| 4608 | } | ||
| 4609 | if (root->fs_info->quota_enabled) { | ||
| 4610 | btrfs_qgroup_free(root, num_bytes + | 4762 | btrfs_qgroup_free(root, num_bytes + |
| 4611 | nr_extents * root->leafsize); | 4763 | nr_extents * root->leafsize); |
| 4612 | } | 4764 | goto out_fail; |
| 4613 | if (delalloc_lock) | ||
| 4614 | mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); | ||
| 4615 | return ret; | ||
| 4616 | } | 4765 | } |
| 4617 | 4766 | ||
| 4618 | spin_lock(&BTRFS_I(inode)->lock); | 4767 | spin_lock(&BTRFS_I(inode)->lock); |
| @@ -4633,6 +4782,34 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) | |||
| 4633 | block_rsv_add_bytes(block_rsv, to_reserve, 1); | 4782 | block_rsv_add_bytes(block_rsv, to_reserve, 1); |
| 4634 | 4783 | ||
| 4635 | return 0; | 4784 | return 0; |
| 4785 | |||
| 4786 | out_fail: | ||
| 4787 | spin_lock(&BTRFS_I(inode)->lock); | ||
| 4788 | dropped = drop_outstanding_extent(inode); | ||
| 4789 | /* | ||
| 4790 | * If the inodes csum_bytes is the same as the original | ||
| 4791 | * csum_bytes then we know we haven't raced with any free()ers | ||
| 4792 | * so we can just reduce our inodes csum bytes and carry on. | ||
| 4793 | * Otherwise we have to do the normal free thing to account for | ||
| 4794 | * the case that the free side didn't free up its reserve | ||
| 4795 | * because of this outstanding reservation. | ||
| 4796 | */ | ||
| 4797 | if (BTRFS_I(inode)->csum_bytes == csum_bytes) | ||
| 4798 | calc_csum_metadata_size(inode, num_bytes, 0); | ||
| 4799 | else | ||
| 4800 | to_free = calc_csum_metadata_size(inode, num_bytes, 0); | ||
| 4801 | spin_unlock(&BTRFS_I(inode)->lock); | ||
| 4802 | if (dropped) | ||
| 4803 | to_free += btrfs_calc_trans_metadata_size(root, dropped); | ||
| 4804 | |||
| 4805 | if (to_free) { | ||
| 4806 | btrfs_block_rsv_release(root, block_rsv, to_free); | ||
| 4807 | trace_btrfs_space_reservation(root->fs_info, "delalloc", | ||
| 4808 | btrfs_ino(inode), to_free, 0); | ||
| 4809 | } | ||
| 4810 | if (delalloc_lock) | ||
| 4811 | mutex_unlock(&BTRFS_I(inode)->delalloc_mutex); | ||
| 4812 | return ret; | ||
| 4636 | } | 4813 | } |
| 4637 | 4814 | ||
| 4638 | /** | 4815 | /** |
| @@ -4654,7 +4831,8 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) | |||
| 4654 | spin_lock(&BTRFS_I(inode)->lock); | 4831 | spin_lock(&BTRFS_I(inode)->lock); |
| 4655 | dropped = drop_outstanding_extent(inode); | 4832 | dropped = drop_outstanding_extent(inode); |
| 4656 | 4833 | ||
| 4657 | to_free = calc_csum_metadata_size(inode, num_bytes, 0); | 4834 | if (num_bytes) |
| 4835 | to_free = calc_csum_metadata_size(inode, num_bytes, 0); | ||
| 4658 | spin_unlock(&BTRFS_I(inode)->lock); | 4836 | spin_unlock(&BTRFS_I(inode)->lock); |
| 4659 | if (dropped > 0) | 4837 | if (dropped > 0) |
| 4660 | to_free += btrfs_calc_trans_metadata_size(root, dropped); | 4838 | to_free += btrfs_calc_trans_metadata_size(root, dropped); |
| @@ -4721,8 +4899,7 @@ void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes) | |||
| 4721 | btrfs_free_reserved_data_space(inode, num_bytes); | 4899 | btrfs_free_reserved_data_space(inode, num_bytes); |
| 4722 | } | 4900 | } |
| 4723 | 4901 | ||
| 4724 | static int update_block_group(struct btrfs_trans_handle *trans, | 4902 | static int update_block_group(struct btrfs_root *root, |
| 4725 | struct btrfs_root *root, | ||
| 4726 | u64 bytenr, u64 num_bytes, int alloc) | 4903 | u64 bytenr, u64 num_bytes, int alloc) |
| 4727 | { | 4904 | { |
| 4728 | struct btrfs_block_group_cache *cache = NULL; | 4905 | struct btrfs_block_group_cache *cache = NULL; |
| @@ -4759,7 +4936,7 @@ static int update_block_group(struct btrfs_trans_handle *trans, | |||
| 4759 | * space back to the block group, otherwise we will leak space. | 4936 | * space back to the block group, otherwise we will leak space. |
| 4760 | */ | 4937 | */ |
| 4761 | if (!alloc && cache->cached == BTRFS_CACHE_NO) | 4938 | if (!alloc && cache->cached == BTRFS_CACHE_NO) |
| 4762 | cache_block_group(cache, trans, NULL, 1); | 4939 | cache_block_group(cache, 1); |
| 4763 | 4940 | ||
| 4764 | byte_in_group = bytenr - cache->key.objectid; | 4941 | byte_in_group = bytenr - cache->key.objectid; |
| 4765 | WARN_ON(byte_in_group > cache->key.offset); | 4942 | WARN_ON(byte_in_group > cache->key.offset); |
| @@ -4809,6 +4986,13 @@ static u64 first_logical_byte(struct btrfs_root *root, u64 search_start) | |||
| 4809 | struct btrfs_block_group_cache *cache; | 4986 | struct btrfs_block_group_cache *cache; |
| 4810 | u64 bytenr; | 4987 | u64 bytenr; |
| 4811 | 4988 | ||
| 4989 | spin_lock(&root->fs_info->block_group_cache_lock); | ||
| 4990 | bytenr = root->fs_info->first_logical_byte; | ||
| 4991 | spin_unlock(&root->fs_info->block_group_cache_lock); | ||
| 4992 | |||
| 4993 | if (bytenr < (u64)-1) | ||
| 4994 | return bytenr; | ||
| 4995 | |||
| 4812 | cache = btrfs_lookup_first_block_group(root->fs_info, search_start); | 4996 | cache = btrfs_lookup_first_block_group(root->fs_info, search_start); |
| 4813 | if (!cache) | 4997 | if (!cache) |
| 4814 | return 0; | 4998 | return 0; |
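first_logical_byte() now consults a cached value, fs_info->first_logical_byte, with (u64)-1 serving as the "not cached" sentinel; the block-group removal hunk later in this diff resets it to the sentinel when the cached group goes away. The shape of the fast path, in an illustrative standalone form:

    #include <stdint.h>

    #define UNCACHED ((uint64_t)-1)

    static uint64_t cached_first_byte = UNCACHED;

    static uint64_t slow_rbtree_lookup(void)  /* stands in for the old walk */
    {
        return 0;
    }

    static uint64_t first_logical_byte(void)
    {
        /* btrfs reads the cache under block_group_cache_lock */
        uint64_t bytenr = cached_first_byte;

        if (bytenr < UNCACHED)          /* any value but the sentinel */
            return bytenr;
        return slow_rbtree_lookup();    /* fall back to the rbtree    */
    }

    int main(void)
    {
        cached_first_byte = 1024 * 1024;   /* lowest block group start */
        return first_logical_byte() == 1024 * 1024 ? 0 : 1;
    }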
| @@ -4859,8 +5043,7 @@ int btrfs_pin_extent(struct btrfs_root *root, | |||
| 4859 | /* | 5043 | /* |
| 4860 | * this function must be called within transaction | 5044 | * this function must be called within transaction |
| 4861 | */ | 5045 | */ |
| 4862 | int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, | 5046 | int btrfs_pin_extent_for_log_replay(struct btrfs_root *root, |
| 4863 | struct btrfs_root *root, | ||
| 4864 | u64 bytenr, u64 num_bytes) | 5047 | u64 bytenr, u64 num_bytes) |
| 4865 | { | 5048 | { |
| 4866 | struct btrfs_block_group_cache *cache; | 5049 | struct btrfs_block_group_cache *cache; |
| @@ -4874,7 +5057,7 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, | |||
| 4874 | * to one because the slow code to read in the free extents does check | 5057 | * to one because the slow code to read in the free extents does check |
| 4875 | * the pinned extents. | 5058 | * the pinned extents. |
| 4876 | */ | 5059 | */ |
| 4877 | cache_block_group(cache, trans, root, 1); | 5060 | cache_block_group(cache, 1); |
| 4878 | 5061 | ||
| 4879 | pin_down_extent(root, cache, bytenr, num_bytes, 0); | 5062 | pin_down_extent(root, cache, bytenr, num_bytes, 0); |
| 4880 | 5063 | ||
| @@ -5271,7 +5454,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, | |||
| 5271 | } | 5454 | } |
| 5272 | } | 5455 | } |
| 5273 | 5456 | ||
| 5274 | ret = update_block_group(trans, root, bytenr, num_bytes, 0); | 5457 | ret = update_block_group(root, bytenr, num_bytes, 0); |
| 5275 | if (ret) { | 5458 | if (ret) { |
| 5276 | btrfs_abort_transaction(trans, extent_root, ret); | 5459 | btrfs_abort_transaction(trans, extent_root, ret); |
| 5277 | goto out; | 5460 | goto out; |
| @@ -5316,7 +5499,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, | |||
| 5316 | if (head->extent_op) { | 5499 | if (head->extent_op) { |
| 5317 | if (!head->must_insert_reserved) | 5500 | if (!head->must_insert_reserved) |
| 5318 | goto out; | 5501 | goto out; |
| 5319 | kfree(head->extent_op); | 5502 | btrfs_free_delayed_extent_op(head->extent_op); |
| 5320 | head->extent_op = NULL; | 5503 | head->extent_op = NULL; |
| 5321 | } | 5504 | } |
| 5322 | 5505 | ||
| @@ -5439,10 +5622,11 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, | |||
| 5439 | return ret; | 5622 | return ret; |
| 5440 | } | 5623 | } |
| 5441 | 5624 | ||
| 5442 | static u64 stripe_align(struct btrfs_root *root, u64 val) | 5625 | static u64 stripe_align(struct btrfs_root *root, |
| 5626 | struct btrfs_block_group_cache *cache, | ||
| 5627 | u64 val, u64 num_bytes) | ||
| 5443 | { | 5628 | { |
| 5444 | u64 mask = ((u64)root->stripesize - 1); | 5629 | u64 ret = ALIGN(val, root->stripesize); |
| 5445 | u64 ret = (val + mask) & ~mask; | ||
| 5446 | return ret; | 5630 | return ret; |
| 5447 | } | 5631 | } |
| 5448 | 5632 | ||
| @@ -5462,7 +5646,6 @@ wait_block_group_cache_progress(struct btrfs_block_group_cache *cache, | |||
| 5462 | u64 num_bytes) | 5646 | u64 num_bytes) |
| 5463 | { | 5647 | { |
| 5464 | struct btrfs_caching_control *caching_ctl; | 5648 | struct btrfs_caching_control *caching_ctl; |
| 5465 | DEFINE_WAIT(wait); | ||
| 5466 | 5649 | ||
| 5467 | caching_ctl = get_caching_control(cache); | 5650 | caching_ctl = get_caching_control(cache); |
| 5468 | if (!caching_ctl) | 5651 | if (!caching_ctl) |
| @@ -5479,7 +5662,6 @@ static noinline int | |||
| 5479 | wait_block_group_cache_done(struct btrfs_block_group_cache *cache) | 5662 | wait_block_group_cache_done(struct btrfs_block_group_cache *cache) |
| 5480 | { | 5663 | { |
| 5481 | struct btrfs_caching_control *caching_ctl; | 5664 | struct btrfs_caching_control *caching_ctl; |
| 5482 | DEFINE_WAIT(wait); | ||
| 5483 | 5665 | ||
| 5484 | caching_ctl = get_caching_control(cache); | 5666 | caching_ctl = get_caching_control(cache); |
| 5485 | if (!caching_ctl) | 5667 | if (!caching_ctl) |
| @@ -5493,20 +5675,20 @@ wait_block_group_cache_done(struct btrfs_block_group_cache *cache) | |||
| 5493 | 5675 | ||
| 5494 | int __get_raid_index(u64 flags) | 5676 | int __get_raid_index(u64 flags) |
| 5495 | { | 5677 | { |
| 5496 | int index; | ||
| 5497 | |||
| 5498 | if (flags & BTRFS_BLOCK_GROUP_RAID10) | 5678 | if (flags & BTRFS_BLOCK_GROUP_RAID10) |
| 5499 | index = 0; | 5679 | return BTRFS_RAID_RAID10; |
| 5500 | else if (flags & BTRFS_BLOCK_GROUP_RAID1) | 5680 | else if (flags & BTRFS_BLOCK_GROUP_RAID1) |
| 5501 | index = 1; | 5681 | return BTRFS_RAID_RAID1; |
| 5502 | else if (flags & BTRFS_BLOCK_GROUP_DUP) | 5682 | else if (flags & BTRFS_BLOCK_GROUP_DUP) |
| 5503 | index = 2; | 5683 | return BTRFS_RAID_DUP; |
| 5504 | else if (flags & BTRFS_BLOCK_GROUP_RAID0) | 5684 | else if (flags & BTRFS_BLOCK_GROUP_RAID0) |
| 5505 | index = 3; | 5685 | return BTRFS_RAID_RAID0; |
| 5506 | else | 5686 | else if (flags & BTRFS_BLOCK_GROUP_RAID5) |
| 5507 | index = 4; | 5687 | return BTRFS_RAID_RAID5; |
| 5688 | else if (flags & BTRFS_BLOCK_GROUP_RAID6) | ||
| 5689 | return BTRFS_RAID_RAID6; | ||
| 5508 | 5690 | ||
| 5509 | return index; | 5691 | return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */ |
| 5510 | } | 5692 | } |
| 5511 | 5693 | ||
| 5512 | static int get_block_group_index(struct btrfs_block_group_cache *cache) | 5694 | static int get_block_group_index(struct btrfs_block_group_cache *cache) |
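__get_raid_index() now returns named BTRFS_RAID_* constants instead of the bare 0-4, and the btrfs_can_relocate() hunk below swaps its magic numbers for the same names. The enum order assumed here matches the mapping visible in that hunk (RAID10=0, RAID1=1, DUP=2, RAID0=3); a small sketch with most flag checks elided:

    #include <assert.h>
    #include <stdint.h>

    enum btrfs_raid_types {        /* order assumed from this diff  */
        BTRFS_RAID_RAID10 = 0,
        BTRFS_RAID_RAID1,          /* 1 */
        BTRFS_RAID_DUP,            /* 2 */
        BTRFS_RAID_RAID0,          /* 3 */
        BTRFS_RAID_SINGLE,         /* 4 */
        BTRFS_RAID_RAID5,
        BTRFS_RAID_RAID6,
        BTRFS_NR_RAID_TYPES
    };

    #define BG_RAID10 (1ULL << 6)  /* block group flag bits         */
    #define BG_RAID6  (1ULL << 8)

    static int get_raid_index(uint64_t flags)
    {
        if (flags & BG_RAID10) return BTRFS_RAID_RAID10;
        if (flags & BG_RAID6)  return BTRFS_RAID_RAID6;
        /* ... remaining flag checks elided ... */
        return BTRFS_RAID_SINGLE;
    }

    int main(void)
    {
        assert(get_raid_index(BG_RAID10) == 0);  /* the old magic 0  */
        assert(get_raid_index(BG_RAID6) == BTRFS_RAID_RAID6);
        assert(get_raid_index(0) == BTRFS_RAID_SINGLE);
        return 0;
    }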
| @@ -5649,6 +5831,8 @@ search: | |||
| 5649 | if (!block_group_bits(block_group, data)) { | 5831 | if (!block_group_bits(block_group, data)) { |
| 5650 | u64 extra = BTRFS_BLOCK_GROUP_DUP | | 5832 | u64 extra = BTRFS_BLOCK_GROUP_DUP | |
| 5651 | BTRFS_BLOCK_GROUP_RAID1 | | 5833 | BTRFS_BLOCK_GROUP_RAID1 | |
| 5834 | BTRFS_BLOCK_GROUP_RAID5 | | ||
| 5835 | BTRFS_BLOCK_GROUP_RAID6 | | ||
| 5652 | BTRFS_BLOCK_GROUP_RAID10; | 5836 | BTRFS_BLOCK_GROUP_RAID10; |
| 5653 | 5837 | ||
| 5654 | /* | 5838 | /* |
| @@ -5664,8 +5848,7 @@ have_block_group: | |||
| 5664 | cached = block_group_cache_done(block_group); | 5848 | cached = block_group_cache_done(block_group); |
| 5665 | if (unlikely(!cached)) { | 5849 | if (unlikely(!cached)) { |
| 5666 | found_uncached_bg = true; | 5850 | found_uncached_bg = true; |
| 5667 | ret = cache_block_group(block_group, trans, | 5851 | ret = cache_block_group(block_group, 0); |
| 5668 | orig_root, 0); | ||
| 5669 | BUG_ON(ret < 0); | 5852 | BUG_ON(ret < 0); |
| 5670 | ret = 0; | 5853 | ret = 0; |
| 5671 | } | 5854 | } |
| @@ -5678,6 +5861,7 @@ have_block_group: | |||
| 5678 | * lets look there | 5861 | * lets look there |
| 5679 | */ | 5862 | */ |
| 5680 | if (last_ptr) { | 5863 | if (last_ptr) { |
| 5864 | unsigned long aligned_cluster; | ||
| 5681 | /* | 5865 | /* |
| 5682 | * the refill lock keeps out other | 5866 | * the refill lock keeps out other |
| 5683 | * people trying to start a new cluster | 5867 | * people trying to start a new cluster |
| @@ -5744,11 +5928,15 @@ refill_cluster: | |||
| 5744 | goto unclustered_alloc; | 5928 | goto unclustered_alloc; |
| 5745 | } | 5929 | } |
| 5746 | 5930 | ||
| 5931 | aligned_cluster = max_t(unsigned long, | ||
| 5932 | empty_cluster + empty_size, | ||
| 5933 | block_group->full_stripe_len); | ||
| 5934 | |||
| 5747 | /* allocate a cluster in this block group */ | 5935 | /* allocate a cluster in this block group */ |
| 5748 | ret = btrfs_find_space_cluster(trans, root, | 5936 | ret = btrfs_find_space_cluster(trans, root, |
| 5749 | block_group, last_ptr, | 5937 | block_group, last_ptr, |
| 5750 | search_start, num_bytes, | 5938 | search_start, num_bytes, |
| 5751 | empty_cluster + empty_size); | 5939 | aligned_cluster); |
| 5752 | if (ret == 0) { | 5940 | if (ret == 0) { |
| 5753 | /* | 5941 | /* |
| 5754 | * now pull our allocation out of this | 5942 | * now pull our allocation out of this |
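The aligned_cluster change above makes the cluster request at least one full stripe wide, so raid5/6 allocations do not force read-modify-write cycles on partial stripes. A worked example with an assumed geometry (4-device raid5, 64 KiB stripes, 64 KiB empty_cluster; none of these figures come from the diff):

    #include <stdio.h>

    #define max_t(type, a, b) ((type)(a) > (type)(b) ? (type)(a) : (type)(b))

    int main(void)
    {
        unsigned long stripe_len      = 64 * 1024;
        unsigned long nr_data_stripes = 3;              /* 4 devs - 1 parity */
        unsigned long full_stripe_len = stripe_len * nr_data_stripes;

        unsigned long empty_cluster = 64 * 1024;
        unsigned long empty_size    = 0;

        unsigned long aligned_cluster = max_t(unsigned long,
                                              empty_cluster + empty_size,
                                              full_stripe_len);

        /* 192 KiB: one full stripe rather than the bare 64 KiB */
        printf("cluster request: %lu KiB\n", aligned_cluster / 1024);
        return 0;
    }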
| @@ -5819,7 +6007,8 @@ unclustered_alloc: | |||
| 5819 | goto loop; | 6007 | goto loop; |
| 5820 | } | 6008 | } |
| 5821 | checks: | 6009 | checks: |
| 5822 | search_start = stripe_align(root, offset); | 6010 | search_start = stripe_align(root, used_block_group, |
| 6011 | offset, num_bytes); | ||
| 5823 | 6012 | ||
| 5824 | /* move on to the next group */ | 6013 | /* move on to the next group */ |
| 5825 | if (search_start + num_bytes > | 6014 | if (search_start + num_bytes > |
| @@ -5970,7 +6159,7 @@ again: | |||
| 5970 | if (ret == -ENOSPC) { | 6159 | if (ret == -ENOSPC) { |
| 5971 | if (!final_tried) { | 6160 | if (!final_tried) { |
| 5972 | num_bytes = num_bytes >> 1; | 6161 | num_bytes = num_bytes >> 1; |
| 5973 | num_bytes = num_bytes & ~(root->sectorsize - 1); | 6162 | num_bytes = round_down(num_bytes, root->sectorsize); |
| 5974 | num_bytes = max(num_bytes, min_alloc_size); | 6163 | num_bytes = max(num_bytes, min_alloc_size); |
| 5975 | if (num_bytes == min_alloc_size) | 6164 | if (num_bytes == min_alloc_size) |
| 5976 | final_tried = true; | 6165 | final_tried = true; |
| @@ -6094,7 +6283,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, | |||
| 6094 | btrfs_mark_buffer_dirty(path->nodes[0]); | 6283 | btrfs_mark_buffer_dirty(path->nodes[0]); |
| 6095 | btrfs_free_path(path); | 6284 | btrfs_free_path(path); |
| 6096 | 6285 | ||
| 6097 | ret = update_block_group(trans, root, ins->objectid, ins->offset, 1); | 6286 | ret = update_block_group(root, ins->objectid, ins->offset, 1); |
| 6098 | if (ret) { /* -ENOENT, logic error */ | 6287 | if (ret) { /* -ENOENT, logic error */ |
| 6099 | printk(KERN_ERR "btrfs update block group failed for %llu " | 6288 | printk(KERN_ERR "btrfs update block group failed for %llu " |
| 6100 | "%llu\n", (unsigned long long)ins->objectid, | 6289 | "%llu\n", (unsigned long long)ins->objectid, |
| @@ -6158,7 +6347,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, | |||
| 6158 | btrfs_mark_buffer_dirty(leaf); | 6347 | btrfs_mark_buffer_dirty(leaf); |
| 6159 | btrfs_free_path(path); | 6348 | btrfs_free_path(path); |
| 6160 | 6349 | ||
| 6161 | ret = update_block_group(trans, root, ins->objectid, ins->offset, 1); | 6350 | ret = update_block_group(root, ins->objectid, ins->offset, 1); |
| 6162 | if (ret) { /* -ENOENT, logic error */ | 6351 | if (ret) { /* -ENOENT, logic error */ |
| 6163 | printk(KERN_ERR "btrfs update block group failed for %llu " | 6352 | printk(KERN_ERR "btrfs update block group failed for %llu " |
| 6164 | "%llu\n", (unsigned long long)ins->objectid, | 6353 | "%llu\n", (unsigned long long)ins->objectid, |
| @@ -6201,7 +6390,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, | |||
| 6201 | u64 num_bytes = ins->offset; | 6390 | u64 num_bytes = ins->offset; |
| 6202 | 6391 | ||
| 6203 | block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); | 6392 | block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); |
| 6204 | cache_block_group(block_group, trans, NULL, 0); | 6393 | cache_block_group(block_group, 0); |
| 6205 | caching_ctl = get_caching_control(block_group); | 6394 | caching_ctl = get_caching_control(block_group); |
| 6206 | 6395 | ||
| 6207 | if (!caching_ctl) { | 6396 | if (!caching_ctl) { |
| @@ -6315,12 +6504,14 @@ use_block_rsv(struct btrfs_trans_handle *trans, | |||
| 6315 | if (!ret) | 6504 | if (!ret) |
| 6316 | return block_rsv; | 6505 | return block_rsv; |
| 6317 | if (ret && !block_rsv->failfast) { | 6506 | if (ret && !block_rsv->failfast) { |
| 6318 | static DEFINE_RATELIMIT_STATE(_rs, | 6507 | if (btrfs_test_opt(root, ENOSPC_DEBUG)) { |
| 6319 | DEFAULT_RATELIMIT_INTERVAL, | 6508 | static DEFINE_RATELIMIT_STATE(_rs, |
| 6320 | /*DEFAULT_RATELIMIT_BURST*/ 2); | 6509 | DEFAULT_RATELIMIT_INTERVAL * 10, |
| 6321 | if (__ratelimit(&_rs)) | 6510 | /*DEFAULT_RATELIMIT_BURST*/ 1); |
| 6322 | WARN(1, KERN_DEBUG "btrfs: block rsv returned %d\n", | 6511 | if (__ratelimit(&_rs)) |
| 6323 | ret); | 6512 | WARN(1, KERN_DEBUG |
| 6513 | "btrfs: block rsv returned %d\n", ret); | ||
| 6514 | } | ||
| 6324 | ret = reserve_metadata_bytes(root, block_rsv, blocksize, | 6515 | ret = reserve_metadata_bytes(root, block_rsv, blocksize, |
| 6325 | BTRFS_RESERVE_NO_FLUSH); | 6516 | BTRFS_RESERVE_NO_FLUSH); |
| 6326 | if (!ret) { | 6517 | if (!ret) { |
| @@ -6386,7 +6577,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, | |||
| 6386 | 6577 | ||
| 6387 | if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { | 6578 | if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { |
| 6388 | struct btrfs_delayed_extent_op *extent_op; | 6579 | struct btrfs_delayed_extent_op *extent_op; |
| 6389 | extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); | 6580 | extent_op = btrfs_alloc_delayed_extent_op(); |
| 6390 | BUG_ON(!extent_op); /* -ENOMEM */ | 6581 | BUG_ON(!extent_op); /* -ENOMEM */ |
| 6391 | if (key) | 6582 | if (key) |
| 6392 | memcpy(&extent_op->key, key, sizeof(extent_op->key)); | 6583 | memcpy(&extent_op->key, key, sizeof(extent_op->key)); |
| @@ -7189,6 +7380,7 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) | |||
| 7189 | root->fs_info->fs_devices->missing_devices; | 7380 | root->fs_info->fs_devices->missing_devices; |
| 7190 | 7381 | ||
| 7191 | stripped = BTRFS_BLOCK_GROUP_RAID0 | | 7382 | stripped = BTRFS_BLOCK_GROUP_RAID0 | |
| 7383 | BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 | | ||
| 7192 | BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; | 7384 | BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; |
| 7193 | 7385 | ||
| 7194 | if (num_devices == 1) { | 7386 | if (num_devices == 1) { |
| @@ -7467,16 +7659,16 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr) | |||
| 7467 | index = get_block_group_index(block_group); | 7659 | index = get_block_group_index(block_group); |
| 7468 | } | 7660 | } |
| 7469 | 7661 | ||
| 7470 | if (index == 0) { | 7662 | if (index == BTRFS_RAID_RAID10) { |
| 7471 | dev_min = 4; | 7663 | dev_min = 4; |
| 7472 | /* Divide by 2 */ | 7664 | /* Divide by 2 */ |
| 7473 | min_free >>= 1; | 7665 | min_free >>= 1; |
| 7474 | } else if (index == 1) { | 7666 | } else if (index == BTRFS_RAID_RAID1) { |
| 7475 | dev_min = 2; | 7667 | dev_min = 2; |
| 7476 | } else if (index == 2) { | 7668 | } else if (index == BTRFS_RAID_DUP) { |
| 7477 | /* Multiply by 2 */ | 7669 | /* Multiply by 2 */ |
| 7478 | min_free <<= 1; | 7670 | min_free <<= 1; |
| 7479 | } else if (index == 3) { | 7671 | } else if (index == BTRFS_RAID_RAID0) { |
| 7480 | dev_min = fs_devices->rw_devices; | 7672 | dev_min = fs_devices->rw_devices; |
| 7481 | do_div(min_free, dev_min); | 7673 | do_div(min_free, dev_min); |
| 7482 | } | 7674 | } |
| @@ -7637,11 +7829,13 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) | |||
| 7637 | space_info = list_entry(info->space_info.next, | 7829 | space_info = list_entry(info->space_info.next, |
| 7638 | struct btrfs_space_info, | 7830 | struct btrfs_space_info, |
| 7639 | list); | 7831 | list); |
| 7640 | if (space_info->bytes_pinned > 0 || | 7832 | if (btrfs_test_opt(info->tree_root, ENOSPC_DEBUG)) { |
| 7641 | space_info->bytes_reserved > 0 || | 7833 | if (space_info->bytes_pinned > 0 || |
| 7642 | space_info->bytes_may_use > 0) { | 7834 | space_info->bytes_reserved > 0 || |
| 7643 | WARN_ON(1); | 7835 | space_info->bytes_may_use > 0) { |
| 7644 | dump_space_info(space_info, 0, 0); | 7836 | WARN_ON(1); |
| 7837 | dump_space_info(space_info, 0, 0); | ||
| 7838 | } | ||
| 7645 | } | 7839 | } |
| 7646 | list_del(&space_info->list); | 7840 | list_del(&space_info->list); |
| 7647 | kfree(space_info); | 7841 | kfree(space_info); |
| @@ -7740,7 +7934,9 @@ int btrfs_read_block_groups(struct btrfs_root *root) | |||
| 7740 | btrfs_release_path(path); | 7934 | btrfs_release_path(path); |
| 7741 | cache->flags = btrfs_block_group_flags(&cache->item); | 7935 | cache->flags = btrfs_block_group_flags(&cache->item); |
| 7742 | cache->sectorsize = root->sectorsize; | 7936 | cache->sectorsize = root->sectorsize; |
| 7743 | 7937 | cache->full_stripe_len = btrfs_full_stripe_len(root, | |
| 7938 | &root->fs_info->mapping_tree, | ||
| 7939 | found_key.objectid); | ||
| 7744 | btrfs_init_free_space_ctl(cache); | 7940 | btrfs_init_free_space_ctl(cache); |
| 7745 | 7941 | ||
| 7746 | /* | 7942 | /* |
| @@ -7794,6 +7990,8 @@ int btrfs_read_block_groups(struct btrfs_root *root) | |||
| 7794 | if (!(get_alloc_profile(root, space_info->flags) & | 7990 | if (!(get_alloc_profile(root, space_info->flags) & |
| 7795 | (BTRFS_BLOCK_GROUP_RAID10 | | 7991 | (BTRFS_BLOCK_GROUP_RAID10 | |
| 7796 | BTRFS_BLOCK_GROUP_RAID1 | | 7992 | BTRFS_BLOCK_GROUP_RAID1 | |
| 7993 | BTRFS_BLOCK_GROUP_RAID5 | | ||
| 7994 | BTRFS_BLOCK_GROUP_RAID6 | | ||
| 7797 | BTRFS_BLOCK_GROUP_DUP))) | 7995 | BTRFS_BLOCK_GROUP_DUP))) |
| 7798 | continue; | 7996 | continue; |
| 7799 | /* | 7997 | /* |
| @@ -7869,6 +8067,9 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, | |||
| 7869 | cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; | 8067 | cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; |
| 7870 | cache->sectorsize = root->sectorsize; | 8068 | cache->sectorsize = root->sectorsize; |
| 7871 | cache->fs_info = root->fs_info; | 8069 | cache->fs_info = root->fs_info; |
| 8070 | cache->full_stripe_len = btrfs_full_stripe_len(root, | ||
| 8071 | &root->fs_info->mapping_tree, | ||
| 8072 | chunk_offset); | ||
| 7872 | 8073 | ||
| 7873 | atomic_set(&cache->count, 1); | 8074 | atomic_set(&cache->count, 1); |
| 7874 | spin_lock_init(&cache->lock); | 8075 | spin_lock_init(&cache->lock); |
| @@ -7918,12 +8119,14 @@ static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) | |||
| 7918 | u64 extra_flags = chunk_to_extended(flags) & | 8119 | u64 extra_flags = chunk_to_extended(flags) & |
| 7919 | BTRFS_EXTENDED_PROFILE_MASK; | 8120 | BTRFS_EXTENDED_PROFILE_MASK; |
| 7920 | 8121 | ||
| 8122 | write_seqlock(&fs_info->profiles_lock); | ||
| 7921 | if (flags & BTRFS_BLOCK_GROUP_DATA) | 8123 | if (flags & BTRFS_BLOCK_GROUP_DATA) |
| 7922 | fs_info->avail_data_alloc_bits &= ~extra_flags; | 8124 | fs_info->avail_data_alloc_bits &= ~extra_flags; |
| 7923 | if (flags & BTRFS_BLOCK_GROUP_METADATA) | 8125 | if (flags & BTRFS_BLOCK_GROUP_METADATA) |
| 7924 | fs_info->avail_metadata_alloc_bits &= ~extra_flags; | 8126 | fs_info->avail_metadata_alloc_bits &= ~extra_flags; |
| 7925 | if (flags & BTRFS_BLOCK_GROUP_SYSTEM) | 8127 | if (flags & BTRFS_BLOCK_GROUP_SYSTEM) |
| 7926 | fs_info->avail_system_alloc_bits &= ~extra_flags; | 8128 | fs_info->avail_system_alloc_bits &= ~extra_flags; |
| 8129 | write_sequnlock(&fs_info->profiles_lock); | ||
| 7927 | } | 8130 | } |
| 7928 | 8131 | ||
| 7929 | int btrfs_remove_block_group(struct btrfs_trans_handle *trans, | 8132 | int btrfs_remove_block_group(struct btrfs_trans_handle *trans, |
| @@ -8022,6 +8225,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, | |||
| 8022 | spin_lock(&root->fs_info->block_group_cache_lock); | 8225 | spin_lock(&root->fs_info->block_group_cache_lock); |
| 8023 | rb_erase(&block_group->cache_node, | 8226 | rb_erase(&block_group->cache_node, |
| 8024 | &root->fs_info->block_group_cache_tree); | 8227 | &root->fs_info->block_group_cache_tree); |
| 8228 | |||
| 8229 | if (root->fs_info->first_logical_byte == block_group->key.objectid) | ||
| 8230 | root->fs_info->first_logical_byte = (u64)-1; | ||
| 8025 | spin_unlock(&root->fs_info->block_group_cache_lock); | 8231 | spin_unlock(&root->fs_info->block_group_cache_lock); |
| 8026 | 8232 | ||
| 8027 | down_write(&block_group->space_info->groups_sem); | 8233 | down_write(&block_group->space_info->groups_sem); |
| @@ -8144,7 +8350,7 @@ int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range) | |||
| 8144 | 8350 | ||
| 8145 | if (end - start >= range->minlen) { | 8351 | if (end - start >= range->minlen) { |
| 8146 | if (!block_group_cache_done(cache)) { | 8352 | if (!block_group_cache_done(cache)) { |
| 8147 | ret = cache_block_group(cache, NULL, root, 0); | 8353 | ret = cache_block_group(cache, 0); |
| 8148 | if (!ret) | 8354 | if (!ret) |
| 8149 | wait_block_group_cache_done(cache); | 8355 | wait_block_group_cache_done(cache); |
| 8150 | } | 8356 | } |
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 1b319df29eee..f173c5af6461 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c | |||
| @@ -4,7 +4,6 @@ | |||
| 4 | #include <linux/mm.h> | 4 | #include <linux/mm.h> |
| 5 | #include <linux/pagemap.h> | 5 | #include <linux/pagemap.h> |
| 6 | #include <linux/page-flags.h> | 6 | #include <linux/page-flags.h> |
| 7 | #include <linux/module.h> | ||
| 8 | #include <linux/spinlock.h> | 7 | #include <linux/spinlock.h> |
| 9 | #include <linux/blkdev.h> | 8 | #include <linux/blkdev.h> |
| 10 | #include <linux/swap.h> | 9 | #include <linux/swap.h> |
| @@ -1834,7 +1833,7 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, | |||
| 1834 | */ | 1833 | */ |
| 1835 | static void check_page_uptodate(struct extent_io_tree *tree, struct page *page) | 1834 | static void check_page_uptodate(struct extent_io_tree *tree, struct page *page) |
| 1836 | { | 1835 | { |
| 1837 | u64 start = (u64)page->index << PAGE_CACHE_SHIFT; | 1836 | u64 start = page_offset(page); |
| 1838 | u64 end = start + PAGE_CACHE_SIZE - 1; | 1837 | u64 end = start + PAGE_CACHE_SIZE - 1; |
| 1839 | if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL)) | 1838 | if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1, NULL)) |
| 1840 | SetPageUptodate(page); | 1839 | SetPageUptodate(page); |
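The repeated open-coded shift is replaced by page_offset() here and throughout this file. For reference, the helper (paraphrased from include/linux/pagemap.h of this era) is the same computation behind a name:

    /* paraphrased from include/linux/pagemap.h */
    static inline loff_t page_offset(struct page *page)
    {
            return ((loff_t)page->index) << PAGE_CACHE_SHIFT;
    }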
| @@ -1846,7 +1845,7 @@ static void check_page_uptodate(struct extent_io_tree *tree, struct page *page) | |||
| 1846 | */ | 1845 | */ |
| 1847 | static void check_page_locked(struct extent_io_tree *tree, struct page *page) | 1846 | static void check_page_locked(struct extent_io_tree *tree, struct page *page) |
| 1848 | { | 1847 | { |
| 1849 | u64 start = (u64)page->index << PAGE_CACHE_SHIFT; | 1848 | u64 start = page_offset(page); |
| 1850 | u64 end = start + PAGE_CACHE_SIZE - 1; | 1849 | u64 end = start + PAGE_CACHE_SIZE - 1; |
| 1851 | if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) | 1850 | if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0, NULL)) |
| 1852 | unlock_page(page); | 1851 | unlock_page(page); |
| @@ -1895,13 +1894,11 @@ static int free_io_failure(struct inode *inode, struct io_failure_record *rec, | |||
| 1895 | if (ret) | 1894 | if (ret) |
| 1896 | err = ret; | 1895 | err = ret; |
| 1897 | 1896 | ||
| 1898 | if (did_repair) { | 1897 | ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start, |
| 1899 | ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start, | 1898 | rec->start + rec->len - 1, |
| 1900 | rec->start + rec->len - 1, | 1899 | EXTENT_DAMAGED, GFP_NOFS); |
| 1901 | EXTENT_DAMAGED, GFP_NOFS); | 1900 | if (ret && !err) |
| 1902 | if (ret && !err) | 1901 | err = ret; |
| 1903 | err = ret; | ||
| 1904 | } | ||
| 1905 | 1902 | ||
| 1906 | kfree(rec); | 1903 | kfree(rec); |
| 1907 | return err; | 1904 | return err; |
| @@ -1932,10 +1929,15 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start, | |||
| 1932 | u64 map_length = 0; | 1929 | u64 map_length = 0; |
| 1933 | u64 sector; | 1930 | u64 sector; |
| 1934 | struct btrfs_bio *bbio = NULL; | 1931 | struct btrfs_bio *bbio = NULL; |
| 1932 | struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree; | ||
| 1935 | int ret; | 1933 | int ret; |
| 1936 | 1934 | ||
| 1937 | BUG_ON(!mirror_num); | 1935 | BUG_ON(!mirror_num); |
| 1938 | 1936 | ||
| 1937 | /* we can't repair anything in raid56 yet */ | ||
| 1938 | if (btrfs_is_parity_mirror(map_tree, logical, length, mirror_num)) | ||
| 1939 | return 0; | ||
| 1940 | |||
| 1939 | bio = bio_alloc(GFP_NOFS, 1); | 1941 | bio = bio_alloc(GFP_NOFS, 1); |
| 1940 | if (!bio) | 1942 | if (!bio) |
| 1941 | return -EIO; | 1943 | return -EIO; |
| @@ -1960,7 +1962,7 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start, | |||
| 1960 | return -EIO; | 1962 | return -EIO; |
| 1961 | } | 1963 | } |
| 1962 | bio->bi_bdev = dev->bdev; | 1964 | bio->bi_bdev = dev->bdev; |
| 1963 | bio_add_page(bio, page, length, start-page_offset(page)); | 1965 | bio_add_page(bio, page, length, start - page_offset(page)); |
| 1964 | btrfsic_submit_bio(WRITE_SYNC, bio); | 1966 | btrfsic_submit_bio(WRITE_SYNC, bio); |
| 1965 | wait_for_completion(&compl); | 1967 | wait_for_completion(&compl); |
| 1966 | 1968 | ||
| @@ -2052,6 +2054,7 @@ static int clean_io_failure(u64 start, struct page *page) | |||
| 2052 | failrec->failed_mirror); | 2054 | failrec->failed_mirror); |
| 2053 | did_repair = !ret; | 2055 | did_repair = !ret; |
| 2054 | } | 2056 | } |
| 2057 | ret = 0; | ||
| 2055 | } | 2058 | } |
| 2056 | 2059 | ||
| 2057 | out: | 2060 | out: |
| @@ -2293,8 +2296,7 @@ static void end_bio_extent_writepage(struct bio *bio, int err) | |||
| 2293 | struct page *page = bvec->bv_page; | 2296 | struct page *page = bvec->bv_page; |
| 2294 | tree = &BTRFS_I(page->mapping->host)->io_tree; | 2297 | tree = &BTRFS_I(page->mapping->host)->io_tree; |
| 2295 | 2298 | ||
| 2296 | start = ((u64)page->index << PAGE_CACHE_SHIFT) + | 2299 | start = page_offset(page) + bvec->bv_offset; |
| 2297 | bvec->bv_offset; | ||
| 2298 | end = start + bvec->bv_len - 1; | 2300 | end = start + bvec->bv_len - 1; |
| 2299 | 2301 | ||
| 2300 | if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) | 2302 | if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) |
| @@ -2353,8 +2355,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err) | |||
| 2353 | (long int)bio->bi_bdev); | 2355 | (long int)bio->bi_bdev); |
| 2354 | tree = &BTRFS_I(page->mapping->host)->io_tree; | 2356 | tree = &BTRFS_I(page->mapping->host)->io_tree; |
| 2355 | 2357 | ||
| 2356 | start = ((u64)page->index << PAGE_CACHE_SHIFT) + | 2358 | start = page_offset(page) + bvec->bv_offset; |
| 2357 | bvec->bv_offset; | ||
| 2358 | end = start + bvec->bv_len - 1; | 2359 | end = start + bvec->bv_len - 1; |
| 2359 | 2360 | ||
| 2360 | if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) | 2361 | if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) |
| @@ -2471,7 +2472,7 @@ static int __must_check submit_one_bio(int rw, struct bio *bio, | |||
| 2471 | struct extent_io_tree *tree = bio->bi_private; | 2472 | struct extent_io_tree *tree = bio->bi_private; |
| 2472 | u64 start; | 2473 | u64 start; |
| 2473 | 2474 | ||
| 2474 | start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset; | 2475 | start = page_offset(page) + bvec->bv_offset; |
| 2475 | 2476 | ||
| 2476 | bio->bi_private = NULL; | 2477 | bio->bi_private = NULL; |
| 2477 | 2478 | ||
| @@ -2489,13 +2490,13 @@ static int __must_check submit_one_bio(int rw, struct bio *bio, | |||
| 2489 | return ret; | 2490 | return ret; |
| 2490 | } | 2491 | } |
| 2491 | 2492 | ||
| 2492 | static int merge_bio(struct extent_io_tree *tree, struct page *page, | 2493 | static int merge_bio(int rw, struct extent_io_tree *tree, struct page *page, |
| 2493 | unsigned long offset, size_t size, struct bio *bio, | 2494 | unsigned long offset, size_t size, struct bio *bio, |
| 2494 | unsigned long bio_flags) | 2495 | unsigned long bio_flags) |
| 2495 | { | 2496 | { |
| 2496 | int ret = 0; | 2497 | int ret = 0; |
| 2497 | if (tree->ops && tree->ops->merge_bio_hook) | 2498 | if (tree->ops && tree->ops->merge_bio_hook) |
| 2498 | ret = tree->ops->merge_bio_hook(page, offset, size, bio, | 2499 | ret = tree->ops->merge_bio_hook(rw, page, offset, size, bio, |
| 2499 | bio_flags); | 2500 | bio_flags); |
| 2500 | BUG_ON(ret < 0); | 2501 | BUG_ON(ret < 0); |
| 2501 | return ret; | 2502 | return ret; |
| @@ -2530,7 +2531,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree, | |||
| 2530 | sector; | 2531 | sector; |
| 2531 | 2532 | ||
| 2532 | if (prev_bio_flags != bio_flags || !contig || | 2533 | if (prev_bio_flags != bio_flags || !contig || |
| 2533 | merge_bio(tree, page, offset, page_size, bio, bio_flags) || | 2534 | merge_bio(rw, tree, page, offset, page_size, bio, bio_flags) || |
| 2534 | bio_add_page(bio, page, page_size, offset) < page_size) { | 2535 | bio_add_page(bio, page, page_size, offset) < page_size) { |
| 2535 | ret = submit_one_bio(rw, bio, mirror_num, | 2536 | ret = submit_one_bio(rw, bio, mirror_num, |
| 2536 | prev_bio_flags); | 2537 | prev_bio_flags); |
| @@ -2595,7 +2596,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree, | |||
| 2595 | unsigned long *bio_flags) | 2596 | unsigned long *bio_flags) |
| 2596 | { | 2597 | { |
| 2597 | struct inode *inode = page->mapping->host; | 2598 | struct inode *inode = page->mapping->host; |
| 2598 | u64 start = (u64)page->index << PAGE_CACHE_SHIFT; | 2599 | u64 start = page_offset(page); |
| 2599 | u64 page_end = start + PAGE_CACHE_SIZE - 1; | 2600 | u64 page_end = start + PAGE_CACHE_SIZE - 1; |
| 2600 | u64 end; | 2601 | u64 end; |
| 2601 | u64 cur = start; | 2602 | u64 cur = start; |
| @@ -2648,6 +2649,8 @@ static int __extent_read_full_page(struct extent_io_tree *tree, | |||
| 2648 | } | 2649 | } |
| 2649 | } | 2650 | } |
| 2650 | while (cur <= end) { | 2651 | while (cur <= end) { |
| 2652 | unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1; | ||
| 2653 | |||
| 2651 | if (cur >= last_byte) { | 2654 | if (cur >= last_byte) { |
| 2652 | char *userpage; | 2655 | char *userpage; |
| 2653 | struct extent_state *cached = NULL; | 2656 | struct extent_state *cached = NULL; |
| @@ -2682,7 +2685,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree, | |||
| 2682 | 2685 | ||
| 2683 | iosize = min(extent_map_end(em) - cur, end - cur + 1); | 2686 | iosize = min(extent_map_end(em) - cur, end - cur + 1); |
| 2684 | cur_end = min(extent_map_end(em) - 1, end); | 2687 | cur_end = min(extent_map_end(em) - 1, end); |
| 2685 | iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); | 2688 | iosize = ALIGN(iosize, blocksize); |
| 2686 | if (this_bio_flag & EXTENT_BIO_COMPRESSED) { | 2689 | if (this_bio_flag & EXTENT_BIO_COMPRESSED) { |
| 2687 | disk_io_size = em->block_len; | 2690 | disk_io_size = em->block_len; |
| 2688 | sector = em->block_start >> 9; | 2691 | sector = em->block_start >> 9; |
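ALIGN() replaces the open-coded round-up mask in this hunk and several below. Paraphrased from include/linux/kernel.h, it is the same arithmetic, valid when the alignment is a power of two:

    /* paraphrased from include/linux/kernel.h; 'a' must be a power of two */
    #define __ALIGN_KERNEL_MASK(x, mask)    (((x) + (mask)) & ~(mask))
    #define ALIGN(x, a)     __ALIGN_KERNEL_MASK((x), (typeof(x))(a) - 1)

    /* e.g. ALIGN(5000, 4096) == 8192, ALIGN(4096, 4096) == 4096 */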
| @@ -2735,26 +2738,17 @@ static int __extent_read_full_page(struct extent_io_tree *tree, | |||
| 2735 | continue; | 2738 | continue; |
| 2736 | } | 2739 | } |
| 2737 | 2740 | ||
| 2738 | ret = 0; | 2741 | pnr -= page->index; |
| 2739 | if (tree->ops && tree->ops->readpage_io_hook) { | 2742 | ret = submit_extent_page(READ, tree, page, |
| 2740 | ret = tree->ops->readpage_io_hook(page, cur, | ||
| 2741 | cur + iosize - 1); | ||
| 2742 | } | ||
| 2743 | if (!ret) { | ||
| 2744 | unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1; | ||
| 2745 | pnr -= page->index; | ||
| 2746 | ret = submit_extent_page(READ, tree, page, | ||
| 2747 | sector, disk_io_size, pg_offset, | 2743 | sector, disk_io_size, pg_offset, |
| 2748 | bdev, bio, pnr, | 2744 | bdev, bio, pnr, |
| 2749 | end_bio_extent_readpage, mirror_num, | 2745 | end_bio_extent_readpage, mirror_num, |
| 2750 | *bio_flags, | 2746 | *bio_flags, |
| 2751 | this_bio_flag); | 2747 | this_bio_flag); |
| 2752 | if (!ret) { | 2748 | if (!ret) { |
| 2753 | nr++; | 2749 | nr++; |
| 2754 | *bio_flags = this_bio_flag; | 2750 | *bio_flags = this_bio_flag; |
| 2755 | } | 2751 | } else { |
| 2756 | } | ||
| 2757 | if (ret) { | ||
| 2758 | SetPageError(page); | 2752 | SetPageError(page); |
| 2759 | unlock_extent(tree, cur, cur + iosize - 1); | 2753 | unlock_extent(tree, cur, cur + iosize - 1); |
| 2760 | } | 2754 | } |
| @@ -2806,7 +2800,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, | |||
| 2806 | struct inode *inode = page->mapping->host; | 2800 | struct inode *inode = page->mapping->host; |
| 2807 | struct extent_page_data *epd = data; | 2801 | struct extent_page_data *epd = data; |
| 2808 | struct extent_io_tree *tree = epd->tree; | 2802 | struct extent_io_tree *tree = epd->tree; |
| 2809 | u64 start = (u64)page->index << PAGE_CACHE_SHIFT; | 2803 | u64 start = page_offset(page); |
| 2810 | u64 delalloc_start; | 2804 | u64 delalloc_start; |
| 2811 | u64 page_end = start + PAGE_CACHE_SIZE - 1; | 2805 | u64 page_end = start + PAGE_CACHE_SIZE - 1; |
| 2812 | u64 end; | 2806 | u64 end; |
| @@ -2982,7 +2976,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, | |||
| 2982 | BUG_ON(extent_map_end(em) <= cur); | 2976 | BUG_ON(extent_map_end(em) <= cur); |
| 2983 | BUG_ON(end < cur); | 2977 | BUG_ON(end < cur); |
| 2984 | iosize = min(extent_map_end(em) - cur, end - cur + 1); | 2978 | iosize = min(extent_map_end(em) - cur, end - cur + 1); |
| 2985 | iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); | 2979 | iosize = ALIGN(iosize, blocksize); |
| 2986 | sector = (em->block_start + extent_offset) >> 9; | 2980 | sector = (em->block_start + extent_offset) >> 9; |
| 2987 | bdev = em->bdev; | 2981 | bdev = em->bdev; |
| 2988 | block_start = em->block_start; | 2982 | block_start = em->block_start; |
| @@ -3124,12 +3118,9 @@ static int lock_extent_buffer_for_io(struct extent_buffer *eb, | |||
| 3124 | set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); | 3118 | set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); |
| 3125 | spin_unlock(&eb->refs_lock); | 3119 | spin_unlock(&eb->refs_lock); |
| 3126 | btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); | 3120 | btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); |
| 3127 | spin_lock(&fs_info->delalloc_lock); | 3121 | __percpu_counter_add(&fs_info->dirty_metadata_bytes, |
| 3128 | if (fs_info->dirty_metadata_bytes >= eb->len) | 3122 | -eb->len, |
| 3129 | fs_info->dirty_metadata_bytes -= eb->len; | 3123 | fs_info->dirty_metadata_batch); |
| 3130 | else | ||
| 3131 | WARN_ON(1); | ||
| 3132 | spin_unlock(&fs_info->delalloc_lock); | ||
| 3133 | ret = 1; | 3124 | ret = 1; |
| 3134 | } else { | 3125 | } else { |
| 3135 | spin_unlock(&eb->refs_lock); | 3126 | spin_unlock(&eb->refs_lock); |
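This trades the delalloc_lock-protected u64 for a percpu counter; dirty_metadata_batch bounds how far a CPU-local delta may drift before being folded into the shared count. A sketch of the pattern under the 3.8-era API, with the init call assumed (it is not part of this hunk):

    /* setup, presumably in open_ctree(); count starts at 0 */
    percpu_counter_init(&fs_info->dirty_metadata_bytes, 0);

    /* hot path: touches only a per-CPU delta until it exceeds the batch */
    __percpu_counter_add(&fs_info->dirty_metadata_bytes, -eb->len,
                         fs_info->dirty_metadata_batch);

    /* slow path: exact non-negative sum across all CPUs */
    s64 dirty = percpu_counter_sum_positive(&fs_info->dirty_metadata_bytes);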
| @@ -3446,15 +3437,9 @@ retry: | |||
| 3446 | * swizzled back from swapper_space to tmpfs file | 3437 | * swizzled back from swapper_space to tmpfs file |
| 3447 | * mapping | 3438 | * mapping |
| 3448 | */ | 3439 | */ |
| 3449 | if (tree->ops && | 3440 | if (!trylock_page(page)) { |
| 3450 | tree->ops->write_cache_pages_lock_hook) { | 3441 | flush_fn(data); |
| 3451 | tree->ops->write_cache_pages_lock_hook(page, | 3442 | lock_page(page); |
| 3452 | data, flush_fn); | ||
| 3453 | } else { | ||
| 3454 | if (!trylock_page(page)) { | ||
| 3455 | flush_fn(data); | ||
| 3456 | lock_page(page); | ||
| 3457 | } | ||
| 3458 | } | 3443 | } |
| 3459 | 3444 | ||
| 3460 | if (unlikely(page->mapping != mapping)) { | 3445 | if (unlikely(page->mapping != mapping)) { |
| @@ -3674,11 +3659,11 @@ int extent_invalidatepage(struct extent_io_tree *tree, | |||
| 3674 | struct page *page, unsigned long offset) | 3659 | struct page *page, unsigned long offset) |
| 3675 | { | 3660 | { |
| 3676 | struct extent_state *cached_state = NULL; | 3661 | struct extent_state *cached_state = NULL; |
| 3677 | u64 start = ((u64)page->index << PAGE_CACHE_SHIFT); | 3662 | u64 start = page_offset(page); |
| 3678 | u64 end = start + PAGE_CACHE_SIZE - 1; | 3663 | u64 end = start + PAGE_CACHE_SIZE - 1; |
| 3679 | size_t blocksize = page->mapping->host->i_sb->s_blocksize; | 3664 | size_t blocksize = page->mapping->host->i_sb->s_blocksize; |
| 3680 | 3665 | ||
| 3681 | start += (offset + blocksize - 1) & ~(blocksize - 1); | 3666 | start += ALIGN(offset, blocksize); |
| 3682 | if (start > end) | 3667 | if (start > end) |
| 3683 | return 0; | 3668 | return 0; |
| 3684 | 3669 | ||
| @@ -3700,7 +3685,7 @@ int try_release_extent_state(struct extent_map_tree *map, | |||
| 3700 | struct extent_io_tree *tree, struct page *page, | 3685 | struct extent_io_tree *tree, struct page *page, |
| 3701 | gfp_t mask) | 3686 | gfp_t mask) |
| 3702 | { | 3687 | { |
| 3703 | u64 start = (u64)page->index << PAGE_CACHE_SHIFT; | 3688 | u64 start = page_offset(page); |
| 3704 | u64 end = start + PAGE_CACHE_SIZE - 1; | 3689 | u64 end = start + PAGE_CACHE_SIZE - 1; |
| 3705 | int ret = 1; | 3690 | int ret = 1; |
| 3706 | 3691 | ||
| @@ -3739,7 +3724,7 @@ int try_release_extent_mapping(struct extent_map_tree *map, | |||
| 3739 | gfp_t mask) | 3724 | gfp_t mask) |
| 3740 | { | 3725 | { |
| 3741 | struct extent_map *em; | 3726 | struct extent_map *em; |
| 3742 | u64 start = (u64)page->index << PAGE_CACHE_SHIFT; | 3727 | u64 start = page_offset(page); |
| 3743 | u64 end = start + PAGE_CACHE_SIZE - 1; | 3728 | u64 end = start + PAGE_CACHE_SIZE - 1; |
| 3744 | 3729 | ||
| 3745 | if ((mask & __GFP_WAIT) && | 3730 | if ((mask & __GFP_WAIT) && |
| @@ -3797,7 +3782,7 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode, | |||
| 3797 | len = last - offset; | 3782 | len = last - offset; |
| 3798 | if (len == 0) | 3783 | if (len == 0) |
| 3799 | break; | 3784 | break; |
| 3800 | len = (len + sectorsize - 1) & ~(sectorsize - 1); | 3785 | len = ALIGN(len, sectorsize); |
| 3801 | em = get_extent(inode, NULL, 0, offset, len, 0); | 3786 | em = get_extent(inode, NULL, 0, offset, len, 0); |
| 3802 | if (IS_ERR_OR_NULL(em)) | 3787 | if (IS_ERR_OR_NULL(em)) |
| 3803 | return em; | 3788 | return em; |
| @@ -3995,8 +3980,6 @@ static void __free_extent_buffer(struct extent_buffer *eb) | |||
| 3995 | list_del(&eb->leak_list); | 3980 | list_del(&eb->leak_list); |
| 3996 | spin_unlock_irqrestore(&leak_lock, flags); | 3981 | spin_unlock_irqrestore(&leak_lock, flags); |
| 3997 | #endif | 3982 | #endif |
| 3998 | if (eb->pages && eb->pages != eb->inline_pages) | ||
| 3999 | kfree(eb->pages); | ||
| 4000 | kmem_cache_free(extent_buffer_cache, eb); | 3983 | kmem_cache_free(extent_buffer_cache, eb); |
| 4001 | } | 3984 | } |
| 4002 | 3985 | ||
| @@ -4037,19 +4020,12 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, | |||
| 4037 | atomic_set(&eb->refs, 1); | 4020 | atomic_set(&eb->refs, 1); |
| 4038 | atomic_set(&eb->io_pages, 0); | 4021 | atomic_set(&eb->io_pages, 0); |
| 4039 | 4022 | ||
| 4040 | if (len > MAX_INLINE_EXTENT_BUFFER_SIZE) { | 4023 | /* |
| 4041 | struct page **pages; | 4024 | * Sanity checks, currently the maximum is 64k covered by 16x 4k pages |
| 4042 | int num_pages = (len + PAGE_CACHE_SIZE - 1) >> | 4025 | */ |
| 4043 | PAGE_CACHE_SHIFT; | 4026 | BUILD_BUG_ON(BTRFS_MAX_METADATA_BLOCKSIZE |
| 4044 | pages = kzalloc(num_pages, mask); | 4027 | > MAX_INLINE_EXTENT_BUFFER_SIZE); |
| 4045 | if (!pages) { | 4028 | BUG_ON(len > MAX_INLINE_EXTENT_BUFFER_SIZE); |
| 4046 | __free_extent_buffer(eb); | ||
| 4047 | return NULL; | ||
| 4048 | } | ||
| 4049 | eb->pages = pages; | ||
| 4050 | } else { | ||
| 4051 | eb->pages = eb->inline_pages; | ||
| 4052 | } | ||
| 4053 | 4029 | ||
| 4054 | return eb; | 4030 | return eb; |
| 4055 | } | 4031 | } |
| @@ -4180,6 +4156,7 @@ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) | |||
| 4180 | 4156 | ||
| 4181 | static void check_buffer_tree_ref(struct extent_buffer *eb) | 4157 | static void check_buffer_tree_ref(struct extent_buffer *eb) |
| 4182 | { | 4158 | { |
| 4159 | int refs; | ||
| 4183 | /* the ref bit is tricky. We have to make sure it is set | 4160 | /* the ref bit is tricky. We have to make sure it is set |
| 4184 | * if we have the buffer dirty. Otherwise the | 4161 | * if we have the buffer dirty. Otherwise the |
| 4185 | * code to free a buffer can end up dropping a dirty | 4162 | * code to free a buffer can end up dropping a dirty |
| @@ -4200,6 +4177,10 @@ static void check_buffer_tree_ref(struct extent_buffer *eb) | |||
| 4200 | * So bump the ref count first, then set the bit. If someone | 4177 | * So bump the ref count first, then set the bit. If someone |
| 4201 | * beat us to it, drop the ref we added. | 4178 | * beat us to it, drop the ref we added. |
| 4202 | */ | 4179 | */ |
| 4180 | refs = atomic_read(&eb->refs); | ||
| 4181 | if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) | ||
| 4182 | return; | ||
| 4183 | |||
| 4203 | spin_lock(&eb->refs_lock); | 4184 | spin_lock(&eb->refs_lock); |
| 4204 | if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) | 4185 | if (!test_and_set_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) |
| 4205 | atomic_inc(&eb->refs); | 4186 | atomic_inc(&eb->refs); |
| @@ -4401,9 +4382,20 @@ static int release_extent_buffer(struct extent_buffer *eb, gfp_t mask) | |||
| 4401 | 4382 | ||
| 4402 | void free_extent_buffer(struct extent_buffer *eb) | 4383 | void free_extent_buffer(struct extent_buffer *eb) |
| 4403 | { | 4384 | { |
| 4385 | int refs; | ||
| 4386 | int old; | ||
| 4404 | if (!eb) | 4387 | if (!eb) |
| 4405 | return; | 4388 | return; |
| 4406 | 4389 | ||
| 4390 | while (1) { | ||
| 4391 | refs = atomic_read(&eb->refs); | ||
| 4392 | if (refs <= 3) | ||
| 4393 | break; | ||
| 4394 | old = atomic_cmpxchg(&eb->refs, refs, refs - 1); | ||
| 4395 | if (old == refs) | ||
| 4396 | return; | ||
| 4397 | } | ||
| 4398 | |||
| 4407 | spin_lock(&eb->refs_lock); | 4399 | spin_lock(&eb->refs_lock); |
| 4408 | if (atomic_read(&eb->refs) == 2 && | 4400 | if (atomic_read(&eb->refs) == 2 && |
| 4409 | test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags)) | 4401 | test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags)) |
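The new loop in free_extent_buffer() drops a reference without taking refs_lock whenever the count is comfortably above the release threshold, and falls back to the locked path only near it. The same compare-and-swap idiom in isolation (a hypothetical helper, not part of the patch):

    static bool fast_ref_put(atomic_t *refs, int threshold)
    {
            int cur = atomic_read(refs);

            while (cur > threshold) {
                    int old = atomic_cmpxchg(refs, cur, cur - 1);

                    if (old == cur)
                            return true;    /* decremented locklessly */
                    cur = old;              /* lost a race; retry */
            }
            return false;                   /* caller takes refs_lock */
    }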
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 2eacfabd3263..6068a1985560 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h | |||
| @@ -72,10 +72,9 @@ struct extent_io_ops { | |||
| 72 | int (*writepage_start_hook)(struct page *page, u64 start, u64 end); | 72 | int (*writepage_start_hook)(struct page *page, u64 start, u64 end); |
| 73 | int (*writepage_io_hook)(struct page *page, u64 start, u64 end); | 73 | int (*writepage_io_hook)(struct page *page, u64 start, u64 end); |
| 74 | extent_submit_bio_hook_t *submit_bio_hook; | 74 | extent_submit_bio_hook_t *submit_bio_hook; |
| 75 | int (*merge_bio_hook)(struct page *page, unsigned long offset, | 75 | int (*merge_bio_hook)(int rw, struct page *page, unsigned long offset, |
| 76 | size_t size, struct bio *bio, | 76 | size_t size, struct bio *bio, |
| 77 | unsigned long bio_flags); | 77 | unsigned long bio_flags); |
| 78 | int (*readpage_io_hook)(struct page *page, u64 start, u64 end); | ||
| 79 | int (*readpage_io_failed_hook)(struct page *page, int failed_mirror); | 78 | int (*readpage_io_failed_hook)(struct page *page, int failed_mirror); |
| 80 | int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end, | 79 | int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end, |
| 81 | struct extent_state *state, int mirror); | 80 | struct extent_state *state, int mirror); |
| @@ -90,8 +89,6 @@ struct extent_io_ops { | |||
| 90 | struct extent_state *other); | 89 | struct extent_state *other); |
| 91 | void (*split_extent_hook)(struct inode *inode, | 90 | void (*split_extent_hook)(struct inode *inode, |
| 92 | struct extent_state *orig, u64 split); | 91 | struct extent_state *orig, u64 split); |
| 93 | int (*write_cache_pages_lock_hook)(struct page *page, void *data, | ||
| 94 | void (*flush_fn)(void *)); | ||
| 95 | }; | 92 | }; |
| 96 | 93 | ||
| 97 | struct extent_io_tree { | 94 | struct extent_io_tree { |
| @@ -161,8 +158,7 @@ struct extent_buffer { | |||
| 161 | */ | 158 | */ |
| 162 | wait_queue_head_t read_lock_wq; | 159 | wait_queue_head_t read_lock_wq; |
| 163 | wait_queue_head_t lock_wq; | 160 | wait_queue_head_t lock_wq; |
| 164 | struct page *inline_pages[INLINE_EXTENT_BUFFER_PAGES]; | 161 | struct page *pages[INLINE_EXTENT_BUFFER_PAGES]; |
| 165 | struct page **pages; | ||
| 166 | }; | 162 | }; |
| 167 | 163 | ||
| 168 | static inline void extent_set_compress_type(unsigned long *bio_flags, | 164 | static inline void extent_set_compress_type(unsigned long *bio_flags, |
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index fdb7a8db3b57..2834ca5768ea 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c | |||
| @@ -1,6 +1,5 @@ | |||
| 1 | #include <linux/err.h> | 1 | #include <linux/err.h> |
| 2 | #include <linux/slab.h> | 2 | #include <linux/slab.h> |
| 3 | #include <linux/module.h> | ||
| 4 | #include <linux/spinlock.h> | 3 | #include <linux/spinlock.h> |
| 5 | #include <linux/hardirq.h> | 4 | #include <linux/hardirq.h> |
| 6 | #include "ctree.h" | 5 | #include "ctree.h" |
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 94aa53b38721..ec160202be3e 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c | |||
| @@ -684,6 +684,24 @@ out: | |||
| 684 | return ret; | 684 | return ret; |
| 685 | } | 685 | } |
| 686 | 686 | ||
| 687 | static u64 btrfs_sector_sum_left(struct btrfs_ordered_sum *sums, | ||
| 688 | struct btrfs_sector_sum *sector_sum, | ||
| 689 | u64 total_bytes, u64 sectorsize) | ||
| 690 | { | ||
| 691 | u64 tmp = sectorsize; | ||
| 692 | u64 next_sector = sector_sum->bytenr; | ||
| 693 | struct btrfs_sector_sum *next = sector_sum + 1; | ||
| 694 | |||
| 695 | while ((tmp + total_bytes) < sums->len) { | ||
| 696 | if (next_sector + sectorsize != next->bytenr) | ||
| 697 | break; | ||
| 698 | tmp += sectorsize; | ||
| 699 | next_sector = next->bytenr; | ||
| 700 | next++; | ||
| 701 | } | ||
| 702 | return tmp; | ||
| 703 | } | ||
| 704 | |||
| 687 | int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, | 705 | int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, |
| 688 | struct btrfs_root *root, | 706 | struct btrfs_root *root, |
| 689 | struct btrfs_ordered_sum *sums) | 707 | struct btrfs_ordered_sum *sums) |
| @@ -789,20 +807,32 @@ again: | |||
| 789 | goto insert; | 807 | goto insert; |
| 790 | } | 808 | } |
| 791 | 809 | ||
| 792 | if (csum_offset >= btrfs_item_size_nr(leaf, path->slots[0]) / | 810 | if (csum_offset == btrfs_item_size_nr(leaf, path->slots[0]) / |
| 793 | csum_size) { | 811 | csum_size) { |
| 794 | u32 diff = (csum_offset + 1) * csum_size; | 812 | int extend_nr; |
| 813 | u64 tmp; | ||
| 814 | u32 diff; | ||
| 815 | u32 free_space; | ||
| 795 | 816 | ||
| 796 | /* | 817 | if (btrfs_leaf_free_space(root, leaf) < |
| 797 | * is the item big enough already? we dropped our lock | 818 | sizeof(struct btrfs_item) + csum_size * 2) |
| 798 | * before and need to recheck | 819 | goto insert; |
| 799 | */ | 820 | |
| 800 | if (diff < btrfs_item_size_nr(leaf, path->slots[0])) | 821 | free_space = btrfs_leaf_free_space(root, leaf) - |
| 801 | goto csum; | 822 | sizeof(struct btrfs_item) - csum_size; |
| 823 | tmp = btrfs_sector_sum_left(sums, sector_sum, total_bytes, | ||
| 824 | root->sectorsize); | ||
| 825 | tmp >>= root->fs_info->sb->s_blocksize_bits; | ||
| 826 | WARN_ON(tmp < 1); | ||
| 827 | |||
| 828 | extend_nr = max_t(int, 1, (int)tmp); | ||
| 829 | diff = (csum_offset + extend_nr) * csum_size; | ||
| 830 | diff = min(diff, MAX_CSUM_ITEMS(root, csum_size) * csum_size); | ||
| 802 | 831 | ||
| 803 | diff = diff - btrfs_item_size_nr(leaf, path->slots[0]); | 832 | diff = diff - btrfs_item_size_nr(leaf, path->slots[0]); |
| 804 | if (diff != csum_size) | 833 | diff = min(free_space, diff); |
| 805 | goto insert; | 834 | diff /= csum_size; |
| 835 | diff *= csum_size; | ||
| 806 | 836 | ||
| 807 | btrfs_extend_item(trans, root, path, diff); | 837 | btrfs_extend_item(trans, root, path, diff); |
| 808 | goto csum; | 838 | goto csum; |
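A worked example of the new extension math, under assumed values: crc32c checksums (csum_size = 4), 4K sectors (s_blocksize_bits = 12), csum_offset = 100, 32 contiguous sectors of pending checksums, 200 bytes free in the leaf, and sizeof(struct btrfs_item) = 25:

    tmp        = btrfs_sector_sum_left(...) >> 12           = 32
    extend_nr  = max(1, 32)                                 = 32
    diff       = (csum_offset + extend_nr) * csum_size
               = (100 + 32) * 4                             = 528
    diff       = min(528, MAX_CSUM_ITEMS * 4)               = 528 (no cap)
    diff      -= item_size (100 * 4)                        = 128
    free_space = 200 - 25 - 4                               = 171
    diff       = min(171, 128), rounded down to csum_size   = 128

So the item grows by 128 bytes, room for 32 more checksums, where the old code would only ever extend by a single csum_size.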
| @@ -812,19 +842,14 @@ insert: | |||
| 812 | btrfs_release_path(path); | 842 | btrfs_release_path(path); |
| 813 | csum_offset = 0; | 843 | csum_offset = 0; |
| 814 | if (found_next) { | 844 | if (found_next) { |
| 815 | u64 tmp = total_bytes + root->sectorsize; | 845 | u64 tmp; |
| 816 | u64 next_sector = sector_sum->bytenr; | ||
| 817 | struct btrfs_sector_sum *next = sector_sum + 1; | ||
| 818 | 846 | ||
| 819 | while (tmp < sums->len) { | 847 | tmp = btrfs_sector_sum_left(sums, sector_sum, total_bytes, |
| 820 | if (next_sector + root->sectorsize != next->bytenr) | 848 | root->sectorsize); |
| 821 | break; | ||
| 822 | tmp += root->sectorsize; | ||
| 823 | next_sector = next->bytenr; | ||
| 824 | next++; | ||
| 825 | } | ||
| 826 | tmp = min(tmp, next_offset - file_key.offset); | ||
| 827 | tmp >>= root->fs_info->sb->s_blocksize_bits; | 849 | tmp >>= root->fs_info->sb->s_blocksize_bits; |
| 850 | tmp = min(tmp, (next_offset - file_key.offset) >> | ||
| 851 | root->fs_info->sb->s_blocksize_bits); | ||
| 852 | |||
| 828 | tmp = max((u64)1, tmp); | 853 | tmp = max((u64)1, tmp); |
| 829 | tmp = min(tmp, (u64)MAX_CSUM_ITEMS(root, csum_size)); | 854 | tmp = min(tmp, (u64)MAX_CSUM_ITEMS(root, csum_size)); |
| 830 | ins_size = csum_size * tmp; | 855 | ins_size = csum_size * tmp; |
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 4b241fe9d2fe..af1d0605a5c1 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c | |||
| @@ -30,11 +30,11 @@ | |||
| 30 | #include <linux/statfs.h> | 30 | #include <linux/statfs.h> |
| 31 | #include <linux/compat.h> | 31 | #include <linux/compat.h> |
| 32 | #include <linux/slab.h> | 32 | #include <linux/slab.h> |
| 33 | #include <linux/btrfs.h> | ||
| 33 | #include "ctree.h" | 34 | #include "ctree.h" |
| 34 | #include "disk-io.h" | 35 | #include "disk-io.h" |
| 35 | #include "transaction.h" | 36 | #include "transaction.h" |
| 36 | #include "btrfs_inode.h" | 37 | #include "btrfs_inode.h" |
| 37 | #include "ioctl.h" | ||
| 38 | #include "print-tree.h" | 38 | #include "print-tree.h" |
| 39 | #include "tree-log.h" | 39 | #include "tree-log.h" |
| 40 | #include "locking.h" | 40 | #include "locking.h" |
| @@ -374,6 +374,11 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) | |||
| 374 | 374 | ||
| 375 | atomic_inc(&fs_info->defrag_running); | 375 | atomic_inc(&fs_info->defrag_running); |
| 376 | while(1) { | 376 | while(1) { |
| 377 | /* Pause the auto defragger. */ | ||
| 378 | if (test_bit(BTRFS_FS_STATE_REMOUNTING, | ||
| 379 | &fs_info->fs_state)) | ||
| 380 | break; | ||
| 381 | |||
| 377 | if (!__need_auto_defrag(fs_info->tree_root)) | 382 | if (!__need_auto_defrag(fs_info->tree_root)) |
| 378 | break; | 383 | break; |
| 379 | 384 | ||
| @@ -505,8 +510,7 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode, | |||
| 505 | loff_t isize = i_size_read(inode); | 510 | loff_t isize = i_size_read(inode); |
| 506 | 511 | ||
| 507 | start_pos = pos & ~((u64)root->sectorsize - 1); | 512 | start_pos = pos & ~((u64)root->sectorsize - 1); |
| 508 | num_bytes = (write_bytes + pos - start_pos + | 513 | num_bytes = ALIGN(write_bytes + pos - start_pos, root->sectorsize); |
| 509 | root->sectorsize - 1) & ~((u64)root->sectorsize - 1); | ||
| 510 | 514 | ||
| 511 | end_of_last_block = start_pos + num_bytes - 1; | 515 | end_of_last_block = start_pos + num_bytes - 1; |
| 512 | err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, | 516 | err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, |
| @@ -1544,7 +1548,7 @@ static ssize_t btrfs_file_aio_write(struct kiocb *iocb, | |||
| 1544 | * although we have opened a file as writable, we have | 1548 | * although we have opened a file as writable, we have |
| 1545 | * to stop this write operation to ensure FS consistency. | 1549 | * to stop this write operation to ensure FS consistency. |
| 1546 | */ | 1550 | */ |
| 1547 | if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { | 1551 | if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) { |
| 1548 | mutex_unlock(&inode->i_mutex); | 1552 | mutex_unlock(&inode->i_mutex); |
| 1549 | err = -EROFS; | 1553 | err = -EROFS; |
| 1550 | goto out; | 1554 | goto out; |
| @@ -1627,7 +1631,20 @@ int btrfs_release_file(struct inode *inode, struct file *filp) | |||
| 1627 | */ | 1631 | */ |
| 1628 | if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, | 1632 | if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, |
| 1629 | &BTRFS_I(inode)->runtime_flags)) { | 1633 | &BTRFS_I(inode)->runtime_flags)) { |
| 1630 | btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode); | 1634 | struct btrfs_trans_handle *trans; |
| 1635 | struct btrfs_root *root = BTRFS_I(inode)->root; | ||
| 1636 | |||
| 1637 | /* | ||
| 1638 | * We need to block on a committing transaction to keep us from | ||
| 1639 | * throwing an ordered operation onto the list and causing | ||
| 1640 | * something like sync to deadlock trying to flush out this | ||
| 1641 | * inode. | ||
| 1642 | */ | ||
| 1643 | trans = btrfs_start_transaction(root, 0); | ||
| 1644 | if (IS_ERR(trans)) | ||
| 1645 | return PTR_ERR(trans); | ||
| 1646 | btrfs_add_ordered_operation(trans, BTRFS_I(inode)->root, inode); | ||
| 1647 | btrfs_end_transaction(trans, root); | ||
| 1631 | if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT) | 1648 | if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT) |
| 1632 | filemap_flush(inode->i_mapping); | 1649 | filemap_flush(inode->i_mapping); |
| 1633 | } | 1650 | } |
| @@ -1654,16 +1671,21 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) | |||
| 1654 | struct btrfs_root *root = BTRFS_I(inode)->root; | 1671 | struct btrfs_root *root = BTRFS_I(inode)->root; |
| 1655 | int ret = 0; | 1672 | int ret = 0; |
| 1656 | struct btrfs_trans_handle *trans; | 1673 | struct btrfs_trans_handle *trans; |
| 1674 | bool full_sync = 0; | ||
| 1657 | 1675 | ||
| 1658 | trace_btrfs_sync_file(file, datasync); | 1676 | trace_btrfs_sync_file(file, datasync); |
| 1659 | 1677 | ||
| 1660 | /* | 1678 | /* |
| 1661 | * We write the dirty pages in the range and wait until they complete | 1679 | * We write the dirty pages in the range and wait until they complete |
| 1662 | * out of the ->i_mutex. If so, we can flush the dirty pages by | 1680 | * out of the ->i_mutex. If so, we can flush the dirty pages by |
| 1663 | * multiple tasks in parallel, which improves performance. | 1681 | * multiple tasks in parallel, which improves performance. See |
| 1682 | * btrfs_wait_ordered_range for an explanation of the ASYNC check. | ||
| 1664 | */ | 1683 | */ |
| 1665 | atomic_inc(&BTRFS_I(inode)->sync_writers); | 1684 | atomic_inc(&BTRFS_I(inode)->sync_writers); |
| 1666 | ret = filemap_write_and_wait_range(inode->i_mapping, start, end); | 1685 | ret = filemap_fdatawrite_range(inode->i_mapping, start, end); |
| 1686 | if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, | ||
| 1687 | &BTRFS_I(inode)->runtime_flags)) | ||
| 1688 | ret = filemap_fdatawrite_range(inode->i_mapping, start, end); | ||
| 1667 | atomic_dec(&BTRFS_I(inode)->sync_writers); | 1689 | atomic_dec(&BTRFS_I(inode)->sync_writers); |
| 1668 | if (ret) | 1690 | if (ret) |
| 1669 | return ret; | 1691 | return ret; |
| @@ -1675,7 +1697,10 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) | |||
| 1675 | * range being left. | 1697 | * range being left. |
| 1676 | */ | 1698 | */ |
| 1677 | atomic_inc(&root->log_batch); | 1699 | atomic_inc(&root->log_batch); |
| 1678 | btrfs_wait_ordered_range(inode, start, end - start + 1); | 1700 | full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, |
| 1701 | &BTRFS_I(inode)->runtime_flags); | ||
| 1702 | if (full_sync) | ||
| 1703 | btrfs_wait_ordered_range(inode, start, end - start + 1); | ||
| 1679 | atomic_inc(&root->log_batch); | 1704 | atomic_inc(&root->log_batch); |
| 1680 | 1705 | ||
| 1681 | /* | 1706 | /* |
| @@ -1742,13 +1767,25 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) | |||
| 1742 | 1767 | ||
| 1743 | if (ret != BTRFS_NO_LOG_SYNC) { | 1768 | if (ret != BTRFS_NO_LOG_SYNC) { |
| 1744 | if (ret > 0) { | 1769 | if (ret > 0) { |
| 1770 | /* | ||
| 1771 | * If we didn't already wait for ordered extents we need | ||
| 1772 | * to do that now. | ||
| 1773 | */ | ||
| 1774 | if (!full_sync) | ||
| 1775 | btrfs_wait_ordered_range(inode, start, | ||
| 1776 | end - start + 1); | ||
| 1745 | ret = btrfs_commit_transaction(trans, root); | 1777 | ret = btrfs_commit_transaction(trans, root); |
| 1746 | } else { | 1778 | } else { |
| 1747 | ret = btrfs_sync_log(trans, root); | 1779 | ret = btrfs_sync_log(trans, root); |
| 1748 | if (ret == 0) | 1780 | if (ret == 0) { |
| 1749 | ret = btrfs_end_transaction(trans, root); | 1781 | ret = btrfs_end_transaction(trans, root); |
| 1750 | else | 1782 | } else { |
| 1783 | if (!full_sync) | ||
| 1784 | btrfs_wait_ordered_range(inode, start, | ||
| 1785 | end - | ||
| 1786 | start + 1); | ||
| 1751 | ret = btrfs_commit_transaction(trans, root); | 1787 | ret = btrfs_commit_transaction(trans, root); |
| 1788 | } | ||
| 1752 | } | 1789 | } |
| 1753 | } else { | 1790 | } else { |
| 1754 | ret = btrfs_end_transaction(trans, root); | 1791 | ret = btrfs_end_transaction(trans, root); |
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 0be7a8742a43..1f84fc09c1a8 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c | |||
| @@ -1356,6 +1356,8 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) | |||
| 1356 | u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit; | 1356 | u64 bytes_per_bg = BITS_PER_BITMAP * ctl->unit; |
| 1357 | int max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg); | 1357 | int max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg); |
| 1358 | 1358 | ||
| 1359 | max_bitmaps = max(max_bitmaps, 1); | ||
| 1360 | |||
| 1359 | BUG_ON(ctl->total_bitmaps > max_bitmaps); | 1361 | BUG_ON(ctl->total_bitmaps > max_bitmaps); |
| 1360 | 1362 | ||
| 1361 | /* | 1363 | /* |
| @@ -1463,10 +1465,14 @@ static int search_bitmap(struct btrfs_free_space_ctl *ctl, | |||
| 1463 | } | 1465 | } |
| 1464 | 1466 | ||
| 1465 | static struct btrfs_free_space * | 1467 | static struct btrfs_free_space * |
| 1466 | find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes) | 1468 | find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, |
| 1469 | unsigned long align) | ||
| 1467 | { | 1470 | { |
| 1468 | struct btrfs_free_space *entry; | 1471 | struct btrfs_free_space *entry; |
| 1469 | struct rb_node *node; | 1472 | struct rb_node *node; |
| 1473 | u64 ctl_off; | ||
| 1474 | u64 tmp; | ||
| 1475 | u64 align_off; | ||
| 1470 | int ret; | 1476 | int ret; |
| 1471 | 1477 | ||
| 1472 | if (!ctl->free_space_offset.rb_node) | 1478 | if (!ctl->free_space_offset.rb_node) |
| @@ -1481,15 +1487,34 @@ find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes) | |||
| 1481 | if (entry->bytes < *bytes) | 1487 | if (entry->bytes < *bytes) |
| 1482 | continue; | 1488 | continue; |
| 1483 | 1489 | ||
| 1490 | /* make sure the space returned is big enough | ||
| 1491 | * to match our requested alignment | ||
| 1492 | */ | ||
| 1493 | if (*bytes >= align) { | ||
| 1494 | ctl_off = entry->offset - ctl->start; | ||
| 1495 | tmp = ctl_off + align - 1; | ||
| 1496 | do_div(tmp, align); | ||
| 1497 | tmp = tmp * align + ctl->start; | ||
| 1498 | align_off = tmp - entry->offset; | ||
| 1499 | } else { | ||
| 1500 | align_off = 0; | ||
| 1501 | tmp = entry->offset; | ||
| 1502 | } | ||
| 1503 | |||
| 1504 | if (entry->bytes < *bytes + align_off) | ||
| 1505 | continue; | ||
| 1506 | |||
| 1484 | if (entry->bitmap) { | 1507 | if (entry->bitmap) { |
| 1485 | ret = search_bitmap(ctl, entry, offset, bytes); | 1508 | ret = search_bitmap(ctl, entry, &tmp, bytes); |
| 1486 | if (!ret) | 1509 | if (!ret) { |
| 1510 | *offset = tmp; | ||
| 1487 | return entry; | 1511 | return entry; |
| 1512 | } | ||
| 1488 | continue; | 1513 | continue; |
| 1489 | } | 1514 | } |
| 1490 | 1515 | ||
| 1491 | *offset = entry->offset; | 1516 | *offset = tmp; |
| 1492 | *bytes = entry->bytes; | 1517 | *bytes = entry->bytes - align_off; |
| 1493 | return entry; | 1518 | return entry; |
| 1494 | } | 1519 | } |
| 1495 | 1520 | ||
| @@ -1636,10 +1661,14 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl, | |||
| 1636 | } | 1661 | } |
| 1637 | 1662 | ||
| 1638 | /* | 1663 | /* |
| 1639 | * some block groups are so tiny they can't be enveloped by a bitmap, so | 1664 | * The original block groups from mkfs can be really small, like 8 |
| 1640 | * don't even bother to create a bitmap for this | 1665 | * megabytes, so don't bother with a bitmap for those entries. However |
| 1666 | * some block groups can be smaller than what a bitmap would cover but | ||
| 1667 | * are still large enough that they could overflow the 32k memory limit, | ||
| 1668 | * so still allow those block groups to have a bitmap | ||
| 1669 | * entry. | ||
| 1641 | */ | 1670 | */ |
| 1642 | if (BITS_PER_BITMAP * ctl->unit > block_group->key.offset) | 1671 | if (((BITS_PER_BITMAP * ctl->unit) >> 1) > block_group->key.offset) |
| 1643 | return false; | 1672 | return false; |
| 1644 | 1673 | ||
| 1645 | return true; | 1674 | return true; |
| @@ -2095,9 +2124,12 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group, | |||
| 2095 | struct btrfs_free_space *entry = NULL; | 2124 | struct btrfs_free_space *entry = NULL; |
| 2096 | u64 bytes_search = bytes + empty_size; | 2125 | u64 bytes_search = bytes + empty_size; |
| 2097 | u64 ret = 0; | 2126 | u64 ret = 0; |
| 2127 | u64 align_gap = 0; | ||
| 2128 | u64 align_gap_len = 0; | ||
| 2098 | 2129 | ||
| 2099 | spin_lock(&ctl->tree_lock); | 2130 | spin_lock(&ctl->tree_lock); |
| 2100 | entry = find_free_space(ctl, &offset, &bytes_search); | 2131 | entry = find_free_space(ctl, &offset, &bytes_search, |
| 2132 | block_group->full_stripe_len); | ||
| 2101 | if (!entry) | 2133 | if (!entry) |
| 2102 | goto out; | 2134 | goto out; |
| 2103 | 2135 | ||
| @@ -2107,9 +2139,15 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group, | |||
| 2107 | if (!entry->bytes) | 2139 | if (!entry->bytes) |
| 2108 | free_bitmap(ctl, entry); | 2140 | free_bitmap(ctl, entry); |
| 2109 | } else { | 2141 | } else { |
| 2142 | |||
| 2110 | unlink_free_space(ctl, entry); | 2143 | unlink_free_space(ctl, entry); |
| 2111 | entry->offset += bytes; | 2144 | align_gap_len = offset - entry->offset; |
| 2112 | entry->bytes -= bytes; | 2145 | align_gap = entry->offset; |
| 2146 | |||
| 2147 | entry->offset = offset + bytes; | ||
| 2148 | WARN_ON(entry->bytes < bytes + align_gap_len); | ||
| 2149 | |||
| 2150 | entry->bytes -= bytes + align_gap_len; | ||
| 2113 | if (!entry->bytes) | 2151 | if (!entry->bytes) |
| 2114 | kmem_cache_free(btrfs_free_space_cachep, entry); | 2152 | kmem_cache_free(btrfs_free_space_cachep, entry); |
| 2115 | else | 2153 | else |
| @@ -2119,6 +2157,8 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group, | |||
| 2119 | out: | 2157 | out: |
| 2120 | spin_unlock(&ctl->tree_lock); | 2158 | spin_unlock(&ctl->tree_lock); |
| 2121 | 2159 | ||
| 2160 | if (align_gap_len) | ||
| 2161 | __btrfs_add_free_space(ctl, align_gap, align_gap_len); | ||
| 2122 | return ret; | 2162 | return ret; |
| 2123 | } | 2163 | } |
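With alignment in play, btrfs_find_space_for_alloc() may now carve its allocation out of the middle of a free-space entry, so the bytes skipped over for alignment are handed back rather than leaked. A worked example with assumed numbers (ctl->start = 0, full_stripe_len = 2048):

    entry covers [1000, 5096), request = 2048 bytes
    aligned offset    = 2048   (first 2048-aligned offset >= 1000)
    align_gap         = 1000   (the entry's old offset)
    align_gap_len     = 1048   (2048 - 1000)
    entry after carve = [4096, 5096)

    the [1000, 2048) gap goes back through __btrfs_add_free_space()
    once tree_lock is dropped.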
| 2124 | 2164 | ||
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 55c07b650378..c226daefd65d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c | |||
| @@ -39,12 +39,13 @@ | |||
| 39 | #include <linux/slab.h> | 39 | #include <linux/slab.h> |
| 40 | #include <linux/ratelimit.h> | 40 | #include <linux/ratelimit.h> |
| 41 | #include <linux/mount.h> | 41 | #include <linux/mount.h> |
| 42 | #include <linux/btrfs.h> | ||
| 43 | #include <linux/blkdev.h> | ||
| 42 | #include "compat.h" | 44 | #include "compat.h" |
| 43 | #include "ctree.h" | 45 | #include "ctree.h" |
| 44 | #include "disk-io.h" | 46 | #include "disk-io.h" |
| 45 | #include "transaction.h" | 47 | #include "transaction.h" |
| 46 | #include "btrfs_inode.h" | 48 | #include "btrfs_inode.h" |
| 47 | #include "ioctl.h" | ||
| 48 | #include "print-tree.h" | 49 | #include "print-tree.h" |
| 49 | #include "ordered-data.h" | 50 | #include "ordered-data.h" |
| 50 | #include "xattr.h" | 51 | #include "xattr.h" |
| @@ -54,6 +55,7 @@ | |||
| 54 | #include "locking.h" | 55 | #include "locking.h" |
| 55 | #include "free-space-cache.h" | 56 | #include "free-space-cache.h" |
| 56 | #include "inode-map.h" | 57 | #include "inode-map.h" |
| 58 | #include "backref.h" | ||
| 57 | 59 | ||
| 58 | struct btrfs_iget_args { | 60 | struct btrfs_iget_args { |
| 59 | u64 ino; | 61 | u64 ino; |
| @@ -231,8 +233,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, | |||
| 231 | u64 isize = i_size_read(inode); | 233 | u64 isize = i_size_read(inode); |
| 232 | u64 actual_end = min(end + 1, isize); | 234 | u64 actual_end = min(end + 1, isize); |
| 233 | u64 inline_len = actual_end - start; | 235 | u64 inline_len = actual_end - start; |
| 234 | u64 aligned_end = (end + root->sectorsize - 1) & | 236 | u64 aligned_end = ALIGN(end, root->sectorsize); |
| 235 | ~((u64)root->sectorsize - 1); | ||
| 236 | u64 data_len = inline_len; | 237 | u64 data_len = inline_len; |
| 237 | int ret; | 238 | int ret; |
| 238 | 239 | ||
| @@ -265,6 +266,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, | |||
| 265 | return 1; | 266 | return 1; |
| 266 | } | 267 | } |
| 267 | 268 | ||
| 269 | set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); | ||
| 268 | btrfs_delalloc_release_metadata(inode, end + 1 - start); | 270 | btrfs_delalloc_release_metadata(inode, end + 1 - start); |
| 269 | btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); | 271 | btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); |
| 270 | return 0; | 272 | return 0; |
| @@ -389,7 +391,7 @@ again: | |||
| 389 | * a compressed extent to 128k. | 391 | * a compressed extent to 128k. |
| 390 | */ | 392 | */ |
| 391 | total_compressed = min(total_compressed, max_uncompressed); | 393 | total_compressed = min(total_compressed, max_uncompressed); |
| 392 | num_bytes = (end - start + blocksize) & ~(blocksize - 1); | 394 | num_bytes = ALIGN(end - start + 1, blocksize); |
| 393 | num_bytes = max(blocksize, num_bytes); | 395 | num_bytes = max(blocksize, num_bytes); |
| 394 | total_in = 0; | 396 | total_in = 0; |
| 395 | ret = 0; | 397 | ret = 0; |
| @@ -488,15 +490,13 @@ cont: | |||
| 488 | * up to a block size boundary so the allocator does sane | 490 | * up to a block size boundary so the allocator does sane |
| 489 | * things | 491 | * things |
| 490 | */ | 492 | */ |
| 491 | total_compressed = (total_compressed + blocksize - 1) & | 493 | total_compressed = ALIGN(total_compressed, blocksize); |
| 492 | ~(blocksize - 1); | ||
| 493 | 494 | ||
| 494 | /* | 495 | /* |
| 495 | * one last check to make sure the compression is really a | 496 | * one last check to make sure the compression is really a |
| 496 | * win, compare the page count read with the blocks on disk | 497 | * win, compare the page count read with the blocks on disk |
| 497 | */ | 498 | */ |
| 498 | total_in = (total_in + PAGE_CACHE_SIZE - 1) & | 499 | total_in = ALIGN(total_in, PAGE_CACHE_SIZE); |
| 499 | ~(PAGE_CACHE_SIZE - 1); | ||
| 500 | if (total_compressed >= total_in) { | 500 | if (total_compressed >= total_in) { |
| 501 | will_compress = 0; | 501 | will_compress = 0; |
| 502 | } else { | 502 | } else { |
| @@ -608,7 +608,7 @@ static noinline int submit_compressed_extents(struct inode *inode, | |||
| 608 | if (list_empty(&async_cow->extents)) | 608 | if (list_empty(&async_cow->extents)) |
| 609 | return 0; | 609 | return 0; |
| 610 | 610 | ||
| 611 | 611 | again: | |
| 612 | while (!list_empty(&async_cow->extents)) { | 612 | while (!list_empty(&async_cow->extents)) { |
| 613 | async_extent = list_entry(async_cow->extents.next, | 613 | async_extent = list_entry(async_cow->extents.next, |
| 614 | struct async_extent, list); | 614 | struct async_extent, list); |
| @@ -648,6 +648,8 @@ retry: | |||
| 648 | async_extent->ram_size - 1, | 648 | async_extent->ram_size - 1, |
| 649 | btrfs_get_extent, | 649 | btrfs_get_extent, |
| 650 | WB_SYNC_ALL); | 650 | WB_SYNC_ALL); |
| 651 | else if (ret) | ||
| 652 | unlock_page(async_cow->locked_page); | ||
| 651 | kfree(async_extent); | 653 | kfree(async_extent); |
| 652 | cond_resched(); | 654 | cond_resched(); |
| 653 | continue; | 655 | continue; |
| @@ -672,6 +674,7 @@ retry: | |||
| 672 | 674 | ||
| 673 | if (ret) { | 675 | if (ret) { |
| 674 | int i; | 676 | int i; |
| 677 | |||
| 675 | for (i = 0; i < async_extent->nr_pages; i++) { | 678 | for (i = 0; i < async_extent->nr_pages; i++) { |
| 676 | WARN_ON(async_extent->pages[i]->mapping); | 679 | WARN_ON(async_extent->pages[i]->mapping); |
| 677 | page_cache_release(async_extent->pages[i]); | 680 | page_cache_release(async_extent->pages[i]); |
| @@ -679,12 +682,10 @@ retry: | |||
| 679 | kfree(async_extent->pages); | 682 | kfree(async_extent->pages); |
| 680 | async_extent->nr_pages = 0; | 683 | async_extent->nr_pages = 0; |
| 681 | async_extent->pages = NULL; | 684 | async_extent->pages = NULL; |
| 682 | unlock_extent(io_tree, async_extent->start, | 685 | |
| 683 | async_extent->start + | ||
| 684 | async_extent->ram_size - 1); | ||
| 685 | if (ret == -ENOSPC) | 686 | if (ret == -ENOSPC) |
| 686 | goto retry; | 687 | goto retry; |
| 687 | goto out_free; /* JDM: Requeue? */ | 688 | goto out_free; |
| 688 | } | 689 | } |
| 689 | 690 | ||
| 690 | /* | 691 | /* |
| @@ -696,10 +697,13 @@ retry: | |||
| 696 | async_extent->ram_size - 1, 0); | 697 | async_extent->ram_size - 1, 0); |
| 697 | 698 | ||
| 698 | em = alloc_extent_map(); | 699 | em = alloc_extent_map(); |
| 699 | BUG_ON(!em); /* -ENOMEM */ | 700 | if (!em) |
| 701 | goto out_free_reserve; | ||
| 700 | em->start = async_extent->start; | 702 | em->start = async_extent->start; |
| 701 | em->len = async_extent->ram_size; | 703 | em->len = async_extent->ram_size; |
| 702 | em->orig_start = em->start; | 704 | em->orig_start = em->start; |
| 705 | em->mod_start = em->start; | ||
| 706 | em->mod_len = em->len; | ||
| 703 | 707 | ||
| 704 | em->block_start = ins.objectid; | 708 | em->block_start = ins.objectid; |
| 705 | em->block_len = ins.offset; | 709 | em->block_len = ins.offset; |
| @@ -726,6 +730,9 @@ retry: | |||
| 726 | async_extent->ram_size - 1, 0); | 730 | async_extent->ram_size - 1, 0); |
| 727 | } | 731 | } |
| 728 | 732 | ||
| 733 | if (ret) | ||
| 734 | goto out_free_reserve; | ||
| 735 | |||
| 729 | ret = btrfs_add_ordered_extent_compress(inode, | 736 | ret = btrfs_add_ordered_extent_compress(inode, |
| 730 | async_extent->start, | 737 | async_extent->start, |
| 731 | ins.objectid, | 738 | ins.objectid, |
| @@ -733,7 +740,8 @@ retry: | |||
| 733 | ins.offset, | 740 | ins.offset, |
| 734 | BTRFS_ORDERED_COMPRESSED, | 741 | BTRFS_ORDERED_COMPRESSED, |
| 735 | async_extent->compress_type); | 742 | async_extent->compress_type); |
| 736 | BUG_ON(ret); /* -ENOMEM */ | 743 | if (ret) |
| 744 | goto out_free_reserve; | ||
| 737 | 745 | ||
| 738 | /* | 746 | /* |
| 739 | * clear dirty, set writeback and unlock the pages. | 747 | * clear dirty, set writeback and unlock the pages. |
| @@ -754,18 +762,30 @@ retry: | |||
| 754 | ins.objectid, | 762 | ins.objectid, |
| 755 | ins.offset, async_extent->pages, | 763 | ins.offset, async_extent->pages, |
| 756 | async_extent->nr_pages); | 764 | async_extent->nr_pages); |
| 757 | |||
| 758 | BUG_ON(ret); /* -ENOMEM */ | ||
| 759 | alloc_hint = ins.objectid + ins.offset; | 765 | alloc_hint = ins.objectid + ins.offset; |
| 760 | kfree(async_extent); | 766 | kfree(async_extent); |
| 767 | if (ret) | ||
| 768 | goto out; | ||
| 761 | cond_resched(); | 769 | cond_resched(); |
| 762 | } | 770 | } |
| 763 | ret = 0; | 771 | ret = 0; |
| 764 | out: | 772 | out: |
| 765 | return ret; | 773 | return ret; |
| 774 | out_free_reserve: | ||
| 775 | btrfs_free_reserved_extent(root, ins.objectid, ins.offset); | ||
| 766 | out_free: | 776 | out_free: |
| 777 | extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, | ||
| 778 | async_extent->start, | ||
| 779 | async_extent->start + | ||
| 780 | async_extent->ram_size - 1, | ||
| 781 | NULL, EXTENT_CLEAR_UNLOCK_PAGE | | ||
| 782 | EXTENT_CLEAR_UNLOCK | | ||
| 783 | EXTENT_CLEAR_DELALLOC | | ||
| 784 | EXTENT_CLEAR_DIRTY | | ||
| 785 | EXTENT_SET_WRITEBACK | | ||
| 786 | EXTENT_END_WRITEBACK); | ||
| 767 | kfree(async_extent); | 787 | kfree(async_extent); |
| 768 | goto out; | 788 | goto again; |
| 769 | } | 789 | } |
| 770 | 790 | ||
| 771 | static u64 get_extent_allocation_hint(struct inode *inode, u64 start, | 791 | static u64 get_extent_allocation_hint(struct inode *inode, u64 start, |
| @@ -834,7 +854,7 @@ static noinline int __cow_file_range(struct btrfs_trans_handle *trans, | |||
| 834 | 854 | ||
| 835 | BUG_ON(btrfs_is_free_space_inode(inode)); | 855 | BUG_ON(btrfs_is_free_space_inode(inode)); |
| 836 | 856 | ||
| 837 | num_bytes = (end - start + blocksize) & ~(blocksize - 1); | 857 | num_bytes = ALIGN(end - start + 1, blocksize); |
| 838 | num_bytes = max(blocksize, num_bytes); | 858 | num_bytes = max(blocksize, num_bytes); |
| 839 | disk_num_bytes = num_bytes; | 859 | disk_num_bytes = num_bytes; |
| 840 | 860 | ||
| @@ -892,6 +912,8 @@ static noinline int __cow_file_range(struct btrfs_trans_handle *trans, | |||
| 892 | em->orig_start = em->start; | 912 | em->orig_start = em->start; |
| 893 | ram_size = ins.offset; | 913 | ram_size = ins.offset; |
| 894 | em->len = ins.offset; | 914 | em->len = ins.offset; |
| 915 | em->mod_start = em->start; | ||
| 916 | em->mod_len = em->len; | ||
| 895 | 917 | ||
| 896 | em->block_start = ins.objectid; | 918 | em->block_start = ins.objectid; |
| 897 | em->block_len = ins.offset; | 919 | em->block_len = ins.offset; |
| @@ -1338,6 +1360,8 @@ out_check: | |||
| 1338 | em->block_start = disk_bytenr; | 1360 | em->block_start = disk_bytenr; |
| 1339 | em->orig_block_len = disk_num_bytes; | 1361 | em->orig_block_len = disk_num_bytes; |
| 1340 | em->bdev = root->fs_info->fs_devices->latest_bdev; | 1362 | em->bdev = root->fs_info->fs_devices->latest_bdev; |
| 1363 | em->mod_start = em->start; | ||
| 1364 | em->mod_len = em->len; | ||
| 1341 | set_bit(EXTENT_FLAG_PINNED, &em->flags); | 1365 | set_bit(EXTENT_FLAG_PINNED, &em->flags); |
| 1342 | set_bit(EXTENT_FLAG_FILLING, &em->flags); | 1366 | set_bit(EXTENT_FLAG_FILLING, &em->flags); |
| 1343 | em->generation = -1; | 1367 | em->generation = -1; |
| @@ -1508,14 +1532,22 @@ static void btrfs_set_bit_hook(struct inode *inode, | |||
| 1508 | spin_unlock(&BTRFS_I(inode)->lock); | 1532 | spin_unlock(&BTRFS_I(inode)->lock); |
| 1509 | } | 1533 | } |
| 1510 | 1534 | ||
| 1511 | spin_lock(&root->fs_info->delalloc_lock); | 1535 | __percpu_counter_add(&root->fs_info->delalloc_bytes, len, |
| 1536 | root->fs_info->delalloc_batch); | ||
| 1537 | spin_lock(&BTRFS_I(inode)->lock); | ||
| 1512 | BTRFS_I(inode)->delalloc_bytes += len; | 1538 | BTRFS_I(inode)->delalloc_bytes += len; |
| 1513 | root->fs_info->delalloc_bytes += len; | 1539 | if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST, |
| 1514 | if (do_list && list_empty(&BTRFS_I(inode)->delalloc_inodes)) { | 1540 | &BTRFS_I(inode)->runtime_flags)) { |
| 1515 | list_add_tail(&BTRFS_I(inode)->delalloc_inodes, | 1541 | spin_lock(&root->fs_info->delalloc_lock); |
| 1516 | &root->fs_info->delalloc_inodes); | 1542 | if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) { |
| 1543 | list_add_tail(&BTRFS_I(inode)->delalloc_inodes, | ||
| 1544 | &root->fs_info->delalloc_inodes); | ||
| 1545 | set_bit(BTRFS_INODE_IN_DELALLOC_LIST, | ||
| 1546 | &BTRFS_I(inode)->runtime_flags); | ||
| 1547 | } | ||
| 1548 | spin_unlock(&root->fs_info->delalloc_lock); | ||
| 1517 | } | 1549 | } |
| 1518 | spin_unlock(&root->fs_info->delalloc_lock); | 1550 | spin_unlock(&BTRFS_I(inode)->lock); |
| 1519 | } | 1551 | } |
| 1520 | } | 1552 | } |
| 1521 | 1553 | ||
| @@ -1550,15 +1582,22 @@ static void btrfs_clear_bit_hook(struct inode *inode, | |||
| 1550 | && do_list) | 1582 | && do_list) |
| 1551 | btrfs_free_reserved_data_space(inode, len); | 1583 | btrfs_free_reserved_data_space(inode, len); |
| 1552 | 1584 | ||
| 1553 | spin_lock(&root->fs_info->delalloc_lock); | 1585 | __percpu_counter_add(&root->fs_info->delalloc_bytes, -len, |
| 1554 | root->fs_info->delalloc_bytes -= len; | 1586 | root->fs_info->delalloc_batch); |
| 1587 | spin_lock(&BTRFS_I(inode)->lock); | ||
| 1555 | BTRFS_I(inode)->delalloc_bytes -= len; | 1588 | BTRFS_I(inode)->delalloc_bytes -= len; |
| 1556 | |||
| 1557 | if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 && | 1589 | if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 && |
| 1558 | !list_empty(&BTRFS_I(inode)->delalloc_inodes)) { | 1590 | test_bit(BTRFS_INODE_IN_DELALLOC_LIST, |
| 1559 | list_del_init(&BTRFS_I(inode)->delalloc_inodes); | 1591 | &BTRFS_I(inode)->runtime_flags)) { |
| 1592 | spin_lock(&root->fs_info->delalloc_lock); | ||
| 1593 | if (!list_empty(&BTRFS_I(inode)->delalloc_inodes)) { | ||
| 1594 | list_del_init(&BTRFS_I(inode)->delalloc_inodes); | ||
| 1595 | clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, | ||
| 1596 | &BTRFS_I(inode)->runtime_flags); | ||
| 1597 | } | ||
| 1598 | spin_unlock(&root->fs_info->delalloc_lock); | ||
| 1560 | } | 1599 | } |
| 1561 | spin_unlock(&root->fs_info->delalloc_lock); | 1600 | spin_unlock(&BTRFS_I(inode)->lock); |
| 1562 | } | 1601 | } |
| 1563 | } | 1602 | } |
| 1564 | 1603 | ||
| @@ -1566,7 +1605,7 @@ static void btrfs_clear_bit_hook(struct inode *inode, | |||
| 1566 | * extent_io.c merge_bio_hook, this must check the chunk tree to make sure | 1605 | * extent_io.c merge_bio_hook, this must check the chunk tree to make sure |
| 1567 | * we don't create bios that span stripes or chunks | 1606 | * we don't create bios that span stripes or chunks |
| 1568 | */ | 1607 | */ |
| 1569 | int btrfs_merge_bio_hook(struct page *page, unsigned long offset, | 1608 | int btrfs_merge_bio_hook(int rw, struct page *page, unsigned long offset, |
| 1570 | size_t size, struct bio *bio, | 1609 | size_t size, struct bio *bio, |
| 1571 | unsigned long bio_flags) | 1610 | unsigned long bio_flags) |
| 1572 | { | 1611 | { |
| @@ -1581,7 +1620,7 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset, | |||
| 1581 | 1620 | ||
| 1582 | length = bio->bi_size; | 1621 | length = bio->bi_size; |
| 1583 | map_length = length; | 1622 | map_length = length; |
| 1584 | ret = btrfs_map_block(root->fs_info, READ, logical, | 1623 | ret = btrfs_map_block(root->fs_info, rw, logical, |
| 1585 | &map_length, NULL, 0); | 1624 | &map_length, NULL, 0); |
| 1586 | /* Will always return 0 with map_multi == NULL */ | 1625 | /* Will always return 0 with map_multi == NULL */ |
| 1587 | BUG_ON(ret < 0); | 1626 | BUG_ON(ret < 0); |
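merge_bio_hook now forwards the caller's rw flags instead of hard-coding READ: with RAID5/6 the length that can be mapped contiguously at a logical address may differ between reads and writes (writes can be bounded by full-stripe geometry), so the merge decision has to ask for the right direction. A hedged sketch of the check the hook performs (field and function names per this kernel series):

    /*
     * Sketch: would growing this bio by 'size' bytes cross a
     * stripe/chunk boundary for the given direction?  Returns
     * nonzero if the new page must go into a separate bio.
     */
    static int bio_would_span(struct btrfs_fs_info *fs_info, int rw,
                              u64 logical, struct bio *bio, size_t size)
    {
            u64 length = bio->bi_size;
            u64 map_length = length;

            if (btrfs_map_block(fs_info, rw, logical, &map_length,
                                NULL, 0) < 0)
                    return 1;       /* be conservative on error */
            return map_length < length + size;
    }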
| @@ -1892,6 +1931,640 @@ out: | |||
| 1892 | return ret; | 1931 | return ret; |
| 1893 | } | 1932 | } |
| 1894 | 1933 | ||
| 1934 | /* snapshot-aware defrag */ | ||
| 1935 | struct sa_defrag_extent_backref { | ||
| 1936 | struct rb_node node; | ||
| 1937 | struct old_sa_defrag_extent *old; | ||
| 1938 | u64 root_id; | ||
| 1939 | u64 inum; | ||
| 1940 | u64 file_pos; | ||
| 1941 | u64 extent_offset; | ||
| 1942 | u64 num_bytes; | ||
| 1943 | u64 generation; | ||
| 1944 | }; | ||
| 1945 | |||
| 1946 | struct old_sa_defrag_extent { | ||
| 1947 | struct list_head list; | ||
| 1948 | struct new_sa_defrag_extent *new; | ||
| 1949 | |||
| 1950 | u64 extent_offset; | ||
| 1951 | u64 bytenr; | ||
| 1952 | u64 offset; | ||
| 1953 | u64 len; | ||
| 1954 | int count; | ||
| 1955 | }; | ||
| 1956 | |||
| 1957 | struct new_sa_defrag_extent { | ||
| 1958 | struct rb_root root; | ||
| 1959 | struct list_head head; | ||
| 1960 | struct btrfs_path *path; | ||
| 1961 | struct inode *inode; | ||
| 1962 | u64 file_pos; | ||
| 1963 | u64 len; | ||
| 1964 | u64 bytenr; | ||
| 1965 | u64 disk_len; | ||
| 1966 | u8 compress_type; | ||
| 1967 | }; | ||
| 1968 | |||
| 1969 | static int backref_comp(struct sa_defrag_extent_backref *b1, | ||
| 1970 | struct sa_defrag_extent_backref *b2) | ||
| 1971 | { | ||
| 1972 | if (b1->root_id < b2->root_id) | ||
| 1973 | return -1; | ||
| 1974 | else if (b1->root_id > b2->root_id) | ||
| 1975 | return 1; | ||
| 1976 | |||
| 1977 | if (b1->inum < b2->inum) | ||
| 1978 | return -1; | ||
| 1979 | else if (b1->inum > b2->inum) | ||
| 1980 | return 1; | ||
| 1981 | |||
| 1982 | if (b1->file_pos < b2->file_pos) | ||
| 1983 | return -1; | ||
| 1984 | else if (b1->file_pos > b2->file_pos) | ||
| 1985 | return 1; | ||
| 1986 | |||
| 1987 | /* | ||
| 1988 | * [------------------------------] ===> (a range of space) | ||
| 1989 | * |<--->| |<---->| =============> (fs/file tree A) | ||
| 1990 | * |<---------------------------->| ===> (fs/file tree B) | ||
| 1991 | * | ||
| 1992 | * A range of space can refer to two file extents in one tree while | ||
| 1993 | * refer to only one file extent in another tree. | ||
| 1994 | * | ||
| 1995 | * So we may process a disk offset more than once (two extents in A) | ||
| 1996 | * and land on the same extent (one extent in B), then insert two | ||
| 1997 | * identical backrefs (both referring to the extent in B). | ||
| 1998 | */ | ||
| 1999 | return 0; | ||
| 2000 | } | ||
| 2001 | |||
| 2002 | static void backref_insert(struct rb_root *root, | ||
| 2003 | struct sa_defrag_extent_backref *backref) | ||
| 2004 | { | ||
| 2005 | struct rb_node **p = &root->rb_node; | ||
| 2006 | struct rb_node *parent = NULL; | ||
| 2007 | struct sa_defrag_extent_backref *entry; | ||
| 2008 | int ret; | ||
| 2009 | |||
| 2010 | while (*p) { | ||
| 2011 | parent = *p; | ||
| 2012 | entry = rb_entry(parent, struct sa_defrag_extent_backref, node); | ||
| 2013 | |||
| 2014 | ret = backref_comp(backref, entry); | ||
| 2015 | if (ret < 0) | ||
| 2016 | p = &(*p)->rb_left; | ||
| 2017 | else | ||
| 2018 | p = &(*p)->rb_right; | ||
| 2019 | } | ||
| 2020 | |||
| 2021 | rb_link_node(&backref->node, parent, p); | ||
| 2022 | rb_insert_color(&backref->node, root); | ||
| 2023 | } | ||
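backref_comp orders backrefs by (root_id, inum, file_pos) only, and the diagram above explains why true duplicates can occur. backref_insert tolerates them: an equal key compares as "not less", so the walk descends right and the duplicate is linked as a new leaf next to its twin. A generic sketch of this duplicates-allowed rbtree insert (standard <linux/rbtree.h> usage):

    #include <linux/rbtree.h>

    struct item {
            struct rb_node node;
            u64 key;
    };

    /* equal keys always go right, so duplicates end up adjacent in
     * an in-order traversal rather than being rejected */
    static void item_insert(struct rb_root *root, struct item *ins)
    {
            struct rb_node **p = &root->rb_node;
            struct rb_node *parent = NULL;

            while (*p) {
                    struct item *cur;

                    parent = *p;
                    cur = rb_entry(parent, struct item, node);
                    if (ins->key < cur->key)
                            p = &(*p)->rb_left;
                    else
                            p = &(*p)->rb_right;
            }
            rb_link_node(&ins->node, parent, p);
            rb_insert_color(&ins->node, root);
    }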
| 2024 | |||
| 2025 | /* | ||
| 2026 | * Note the backref might have changed, in which case we just return 0. | ||
| 2027 | */ | ||
| 2028 | static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id, | ||
| 2029 | void *ctx) | ||
| 2030 | { | ||
| 2031 | struct btrfs_file_extent_item *extent; | ||
| 2032 | struct btrfs_fs_info *fs_info; | ||
| 2033 | struct old_sa_defrag_extent *old = ctx; | ||
| 2034 | struct new_sa_defrag_extent *new = old->new; | ||
| 2035 | struct btrfs_path *path = new->path; | ||
| 2036 | struct btrfs_key key; | ||
| 2037 | struct btrfs_root *root; | ||
| 2038 | struct sa_defrag_extent_backref *backref; | ||
| 2039 | struct extent_buffer *leaf; | ||
| 2040 | struct inode *inode = new->inode; | ||
| 2041 | int slot; | ||
| 2042 | int ret; | ||
| 2043 | u64 extent_offset; | ||
| 2044 | u64 num_bytes; | ||
| 2045 | |||
| 2046 | if (BTRFS_I(inode)->root->root_key.objectid == root_id && | ||
| 2047 | inum == btrfs_ino(inode)) | ||
| 2048 | return 0; | ||
| 2049 | |||
| 2050 | key.objectid = root_id; | ||
| 2051 | key.type = BTRFS_ROOT_ITEM_KEY; | ||
| 2052 | key.offset = (u64)-1; | ||
| 2053 | |||
| 2054 | fs_info = BTRFS_I(inode)->root->fs_info; | ||
| 2055 | root = btrfs_read_fs_root_no_name(fs_info, &key); | ||
| 2056 | if (IS_ERR(root)) { | ||
| 2057 | if (PTR_ERR(root) == -ENOENT) | ||
| 2058 | return 0; | ||
| 2059 | WARN_ON(1); | ||
| 2060 | pr_debug("inum=%llu, offset=%llu, root_id=%llu\n", | ||
| 2061 | inum, offset, root_id); | ||
| 2062 | return PTR_ERR(root); | ||
| 2063 | } | ||
| 2064 | |||
| 2065 | key.objectid = inum; | ||
| 2066 | key.type = BTRFS_EXTENT_DATA_KEY; | ||
| 2067 | if (offset > (u64)-1 << 32) | ||
| 2068 | key.offset = 0; | ||
| 2069 | else | ||
| 2070 | key.offset = offset; | ||
| 2071 | |||
| 2072 | ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); | ||
| 2073 | if (ret < 0) { | ||
| 2074 | WARN_ON(1); | ||
| 2075 | return ret; | ||
| 2076 | } | ||
| 2077 | |||
| 2078 | while (1) { | ||
| 2079 | cond_resched(); | ||
| 2080 | |||
| 2081 | leaf = path->nodes[0]; | ||
| 2082 | slot = path->slots[0]; | ||
| 2083 | |||
| 2084 | if (slot >= btrfs_header_nritems(leaf)) { | ||
| 2085 | ret = btrfs_next_leaf(root, path); | ||
| 2086 | if (ret < 0) { | ||
| 2087 | goto out; | ||
| 2088 | } else if (ret > 0) { | ||
| 2089 | ret = 0; | ||
| 2090 | goto out; | ||
| 2091 | } | ||
| 2092 | continue; | ||
| 2093 | } | ||
| 2094 | |||
| 2095 | path->slots[0]++; | ||
| 2096 | |||
| 2097 | btrfs_item_key_to_cpu(leaf, &key, slot); | ||
| 2098 | |||
| 2099 | if (key.objectid > inum) | ||
| 2100 | goto out; | ||
| 2101 | |||
| 2102 | if (key.objectid < inum || key.type != BTRFS_EXTENT_DATA_KEY) | ||
| 2103 | continue; | ||
| 2104 | |||
| 2105 | extent = btrfs_item_ptr(leaf, slot, | ||
| 2106 | struct btrfs_file_extent_item); | ||
| 2107 | |||
| 2108 | if (btrfs_file_extent_disk_bytenr(leaf, extent) != old->bytenr) | ||
| 2109 | continue; | ||
| 2110 | |||
| 2111 | extent_offset = btrfs_file_extent_offset(leaf, extent); | ||
| 2112 | if (key.offset - extent_offset != offset) | ||
| 2113 | continue; | ||
| 2114 | |||
| 2115 | num_bytes = btrfs_file_extent_num_bytes(leaf, extent); | ||
| 2116 | if (extent_offset >= old->extent_offset + old->offset + | ||
| 2117 | old->len || extent_offset + num_bytes <= | ||
| 2118 | old->extent_offset + old->offset) | ||
| 2119 | continue; | ||
| 2120 | |||
| 2121 | break; | ||
| 2122 | } | ||
| 2123 | |||
| 2124 | backref = kmalloc(sizeof(*backref), GFP_NOFS); | ||
| 2125 | if (!backref) { | ||
| 2126 | ret = -ENOMEM; | ||
| 2127 | goto out; | ||
| 2128 | } | ||
| 2129 | |||
| 2130 | backref->root_id = root_id; | ||
| 2131 | backref->inum = inum; | ||
| 2132 | backref->file_pos = offset + extent_offset; | ||
| 2133 | backref->num_bytes = num_bytes; | ||
| 2134 | backref->extent_offset = extent_offset; | ||
| 2135 | backref->generation = btrfs_file_extent_generation(leaf, extent); | ||
| 2136 | backref->old = old; | ||
| 2137 | backref_insert(&new->root, backref); | ||
| 2138 | old->count++; | ||
| 2139 | out: | ||
| 2140 | btrfs_release_path(path); | ||
| 2141 | WARN_ON(ret); | ||
| 2142 | return ret; | ||
| 2143 | } | ||
| 2144 | |||
| 2145 | static noinline bool record_extent_backrefs(struct btrfs_path *path, | ||
| 2146 | struct new_sa_defrag_extent *new) | ||
| 2147 | { | ||
| 2148 | struct btrfs_fs_info *fs_info = BTRFS_I(new->inode)->root->fs_info; | ||
| 2149 | struct old_sa_defrag_extent *old, *tmp; | ||
| 2150 | int ret; | ||
| 2151 | |||
| 2152 | new->path = path; | ||
| 2153 | |||
| 2154 | list_for_each_entry_safe(old, tmp, &new->head, list) { | ||
| 2155 | ret = iterate_inodes_from_logical(old->bytenr, fs_info, | ||
| 2156 | path, record_one_backref, | ||
| 2157 | old); | ||
| 2158 | BUG_ON(ret < 0 && ret != -ENOENT); | ||
| 2159 | |||
| 2160 | /* no backref to be processed for this extent */ | ||
| 2161 | if (!old->count) { | ||
| 2162 | list_del(&old->list); | ||
| 2163 | kfree(old); | ||
| 2164 | } | ||
| 2165 | } | ||
| 2166 | |||
| 2167 | if (list_empty(&new->head)) | ||
| 2168 | return false; | ||
| 2169 | |||
| 2170 | return true; | ||
| 2171 | } | ||
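record_extent_backrefs resolves who references each old extent by handing record_one_backref to iterate_inodes_from_logical, which walks every (inode, offset, root) triple referring to the given disk bytenr and invokes the callback once per triple. The contract, per backref.h of this series, is roughly:

    /* return 0 to continue the walk, nonzero to abort it */
    typedef int (*iterate_extent_inodes_t)(u64 inum, u64 offset,
                                           u64 root, void *ctx);

    /* sketch of the per-old-extent call made above; 'old' doubles
     * as the ctx cookie so the callback can find its extent */
    static int record_backrefs_for(struct old_sa_defrag_extent *old,
                                   struct btrfs_fs_info *fs_info,
                                   struct btrfs_path *path)
    {
            return iterate_inodes_from_logical(old->bytenr, fs_info,
                                               path, record_one_backref,
                                               old);
    }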
| 2172 | |||
| 2173 | static int relink_is_mergable(struct extent_buffer *leaf, | ||
| 2174 | struct btrfs_file_extent_item *fi, | ||
| 2175 | u64 disk_bytenr) | ||
| 2176 | { | ||
| 2177 | if (btrfs_file_extent_disk_bytenr(leaf, fi) != disk_bytenr) | ||
| 2178 | return 0; | ||
| 2179 | |||
| 2180 | if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG) | ||
| 2181 | return 0; | ||
| 2182 | |||
| 2183 | if (btrfs_file_extent_compression(leaf, fi) || | ||
| 2184 | btrfs_file_extent_encryption(leaf, fi) || | ||
| 2185 | btrfs_file_extent_other_encoding(leaf, fi)) | ||
| 2186 | return 0; | ||
| 2187 | |||
| 2188 | return 1; | ||
| 2189 | } | ||
| 2190 | |||
| 2191 | /* | ||
| 2192 | * Note the backref might have changed, in which case we just return 0. | ||
| 2193 | */ | ||
| 2194 | static noinline int relink_extent_backref(struct btrfs_path *path, | ||
| 2195 | struct sa_defrag_extent_backref *prev, | ||
| 2196 | struct sa_defrag_extent_backref *backref) | ||
| 2197 | { | ||
| 2198 | struct btrfs_file_extent_item *extent; | ||
| 2199 | struct btrfs_file_extent_item *item; | ||
| 2200 | struct btrfs_ordered_extent *ordered; | ||
| 2201 | struct btrfs_trans_handle *trans; | ||
| 2202 | struct btrfs_fs_info *fs_info; | ||
| 2203 | struct btrfs_root *root; | ||
| 2204 | struct btrfs_key key; | ||
| 2205 | struct extent_buffer *leaf; | ||
| 2206 | struct old_sa_defrag_extent *old = backref->old; | ||
| 2207 | struct new_sa_defrag_extent *new = old->new; | ||
| 2208 | struct inode *src_inode = new->inode; | ||
| 2209 | struct inode *inode; | ||
| 2210 | struct extent_state *cached = NULL; | ||
| 2211 | int ret = 0; | ||
| 2212 | u64 start; | ||
| 2213 | u64 len; | ||
| 2214 | u64 lock_start; | ||
| 2215 | u64 lock_end; | ||
| 2216 | bool merge = false; | ||
| 2217 | int index; | ||
| 2218 | |||
| 2219 | if (prev && prev->root_id == backref->root_id && | ||
| 2220 | prev->inum == backref->inum && | ||
| 2221 | prev->file_pos + prev->num_bytes == backref->file_pos) | ||
| 2222 | merge = true; | ||
| 2223 | |||
| 2224 | /* step 1: get root */ | ||
| 2225 | key.objectid = backref->root_id; | ||
| 2226 | key.type = BTRFS_ROOT_ITEM_KEY; | ||
| 2227 | key.offset = (u64)-1; | ||
| 2228 | |||
| 2229 | fs_info = BTRFS_I(src_inode)->root->fs_info; | ||
| 2230 | index = srcu_read_lock(&fs_info->subvol_srcu); | ||
| 2231 | |||
| 2232 | root = btrfs_read_fs_root_no_name(fs_info, &key); | ||
| 2233 | if (IS_ERR(root)) { | ||
| 2234 | srcu_read_unlock(&fs_info->subvol_srcu, index); | ||
| 2235 | if (PTR_ERR(root) == -ENOENT) | ||
| 2236 | return 0; | ||
| 2237 | return PTR_ERR(root); | ||
| 2238 | } | ||
| 2239 | if (btrfs_root_refs(&root->root_item) == 0) { | ||
| 2240 | srcu_read_unlock(&fs_info->subvol_srcu, index); | ||
| 2241 | /* treat ENOENT as 0 */ | ||
| 2242 | return 0; | ||
| 2243 | } | ||
| 2244 | |||
| 2245 | /* step 2: get inode */ | ||
| 2246 | key.objectid = backref->inum; | ||
| 2247 | key.type = BTRFS_INODE_ITEM_KEY; | ||
| 2248 | key.offset = 0; | ||
| 2249 | |||
| 2250 | inode = btrfs_iget(fs_info->sb, &key, root, NULL); | ||
| 2251 | if (IS_ERR(inode)) { | ||
| 2252 | srcu_read_unlock(&fs_info->subvol_srcu, index); | ||
| 2253 | return 0; | ||
| 2254 | } | ||
| 2255 | |||
| 2256 | srcu_read_unlock(&fs_info->subvol_srcu, index); | ||
| 2257 | |||
| 2258 | /* step 3: relink backref */ | ||
| 2259 | lock_start = backref->file_pos; | ||
| 2260 | lock_end = backref->file_pos + backref->num_bytes - 1; | ||
| 2261 | lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, lock_end, | ||
| 2262 | 0, &cached); | ||
| 2263 | |||
| 2264 | ordered = btrfs_lookup_first_ordered_extent(inode, lock_end); | ||
| 2265 | if (ordered) { | ||
| 2266 | btrfs_put_ordered_extent(ordered); | ||
| 2267 | goto out_unlock; | ||
| 2268 | } | ||
| 2269 | |||
| 2270 | trans = btrfs_join_transaction(root); | ||
| 2271 | if (IS_ERR(trans)) { | ||
| 2272 | ret = PTR_ERR(trans); | ||
| 2273 | goto out_unlock; | ||
| 2274 | } | ||
| 2275 | |||
| 2276 | key.objectid = backref->inum; | ||
| 2277 | key.type = BTRFS_EXTENT_DATA_KEY; | ||
| 2278 | key.offset = backref->file_pos; | ||
| 2279 | |||
| 2280 | ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); | ||
| 2281 | if (ret < 0) { | ||
| 2282 | goto out_free_path; | ||
| 2283 | } else if (ret > 0) { | ||
| 2284 | ret = 0; | ||
| 2285 | goto out_free_path; | ||
| 2286 | } | ||
| 2287 | |||
| 2288 | extent = btrfs_item_ptr(path->nodes[0], path->slots[0], | ||
| 2289 | struct btrfs_file_extent_item); | ||
| 2290 | |||
| 2291 | if (btrfs_file_extent_generation(path->nodes[0], extent) != | ||
| 2292 | backref->generation) | ||
| 2293 | goto out_free_path; | ||
| 2294 | |||
| 2295 | btrfs_release_path(path); | ||
| 2296 | |||
| 2297 | start = backref->file_pos; | ||
| 2298 | if (backref->extent_offset < old->extent_offset + old->offset) | ||
| 2299 | start += old->extent_offset + old->offset - | ||
| 2300 | backref->extent_offset; | ||
| 2301 | |||
| 2302 | len = min(backref->extent_offset + backref->num_bytes, | ||
| 2303 | old->extent_offset + old->offset + old->len); | ||
| 2304 | len -= max(backref->extent_offset, old->extent_offset + old->offset); | ||
| 2305 | |||
| 2306 | ret = btrfs_drop_extents(trans, root, inode, start, | ||
| 2307 | start + len, 1); | ||
| 2308 | if (ret) | ||
| 2309 | goto out_free_path; | ||
| 2310 | again: | ||
| 2311 | key.objectid = btrfs_ino(inode); | ||
| 2312 | key.type = BTRFS_EXTENT_DATA_KEY; | ||
| 2313 | key.offset = start; | ||
| 2314 | |||
| 2315 | if (merge) { | ||
| 2316 | struct btrfs_file_extent_item *fi; | ||
| 2317 | u64 extent_len; | ||
| 2318 | struct btrfs_key found_key; | ||
| 2319 | |||
| 2320 | ret = btrfs_search_slot(trans, root, &key, path, 1, 1); | ||
| 2321 | if (ret < 0) | ||
| 2322 | goto out_free_path; | ||
| 2323 | |||
| 2324 | path->slots[0]--; | ||
| 2325 | leaf = path->nodes[0]; | ||
| 2326 | btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); | ||
| 2327 | |||
| 2328 | fi = btrfs_item_ptr(leaf, path->slots[0], | ||
| 2329 | struct btrfs_file_extent_item); | ||
| 2330 | extent_len = btrfs_file_extent_num_bytes(leaf, fi); | ||
| 2331 | |||
| 2332 | if (relink_is_mergable(leaf, fi, new->bytenr) && | ||
| 2333 | extent_len + found_key.offset == start) { | ||
| 2334 | btrfs_set_file_extent_num_bytes(leaf, fi, | ||
| 2335 | extent_len + len); | ||
| 2336 | btrfs_mark_buffer_dirty(leaf); | ||
| 2337 | inode_add_bytes(inode, len); | ||
| 2338 | |||
| 2339 | ret = 1; | ||
| 2340 | goto out_free_path; | ||
| 2341 | } else { | ||
| 2342 | merge = false; | ||
| 2343 | btrfs_release_path(path); | ||
| 2344 | goto again; | ||
| 2345 | } | ||
| 2346 | } | ||
| 2347 | |||
| 2348 | ret = btrfs_insert_empty_item(trans, root, path, &key, | ||
| 2349 | sizeof(*extent)); | ||
| 2350 | if (ret) { | ||
| 2351 | btrfs_abort_transaction(trans, root, ret); | ||
| 2352 | goto out_free_path; | ||
| 2353 | } | ||
| 2354 | |||
| 2355 | leaf = path->nodes[0]; | ||
| 2356 | item = btrfs_item_ptr(leaf, path->slots[0], | ||
| 2357 | struct btrfs_file_extent_item); | ||
| 2358 | btrfs_set_file_extent_disk_bytenr(leaf, item, new->bytenr); | ||
| 2359 | btrfs_set_file_extent_disk_num_bytes(leaf, item, new->disk_len); | ||
| 2360 | btrfs_set_file_extent_offset(leaf, item, start - new->file_pos); | ||
| 2361 | btrfs_set_file_extent_num_bytes(leaf, item, len); | ||
| 2362 | btrfs_set_file_extent_ram_bytes(leaf, item, new->len); | ||
| 2363 | btrfs_set_file_extent_generation(leaf, item, trans->transid); | ||
| 2364 | btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG); | ||
| 2365 | btrfs_set_file_extent_compression(leaf, item, new->compress_type); | ||
| 2366 | btrfs_set_file_extent_encryption(leaf, item, 0); | ||
| 2367 | btrfs_set_file_extent_other_encoding(leaf, item, 0); | ||
| 2368 | |||
| 2369 | btrfs_mark_buffer_dirty(leaf); | ||
| 2370 | inode_add_bytes(inode, len); | ||
| 2371 | |||
| 2372 | ret = btrfs_inc_extent_ref(trans, root, new->bytenr, | ||
| 2373 | new->disk_len, 0, | ||
| 2374 | backref->root_id, backref->inum, | ||
| 2375 | new->file_pos, 0); /* start - extent_offset */ | ||
| 2376 | if (ret) { | ||
| 2377 | btrfs_abort_transaction(trans, root, ret); | ||
| 2378 | goto out_free_path; | ||
| 2379 | } | ||
| 2380 | |||
| 2381 | ret = 1; | ||
| 2382 | out_free_path: | ||
| 2383 | btrfs_release_path(path); | ||
| 2384 | btrfs_end_transaction(trans, root); | ||
| 2385 | out_unlock: | ||
| 2386 | unlock_extent_cached(&BTRFS_I(inode)->io_tree, lock_start, lock_end, | ||
| 2387 | &cached, GFP_NOFS); | ||
| 2388 | iput(inode); | ||
| 2389 | return ret; | ||
| 2390 | } | ||
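The start/len arithmetic above clips a backref's file range to the part of the old extent that the defrag actually rewrote, so only that window is dropped and relinked. A worked example with illustrative numbers:

    /*
     * Illustrative values (k = KiB):
     *   backref->file_pos = 100k, backref->extent_offset = 0k,
     *   backref->num_bytes = 64k  -> backref covers extent bytes [0k, 64k)
     *   old->extent_offset = 16k, old->offset = 0k, old->len = 32k
     *                             -> old range covers extent bytes [16k, 48k)
     *
     * start = 100k + (16k + 0k - 0k)              = 116k
     * len   = min(0k + 64k, 16k + 0k + 32k)
     *         - max(0k, 16k + 0k) = 48k - 16k     = 32k
     *
     * so only file bytes [116k, 148k) of this inode are relinked.
     */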
| 2391 | |||
| 2392 | static void relink_file_extents(struct new_sa_defrag_extent *new) | ||
| 2393 | { | ||
| 2394 | struct btrfs_path *path; | ||
| 2395 | struct old_sa_defrag_extent *old, *tmp; | ||
| 2396 | struct sa_defrag_extent_backref *backref; | ||
| 2397 | struct sa_defrag_extent_backref *prev = NULL; | ||
| 2398 | struct inode *inode; | ||
| 2399 | struct btrfs_root *root; | ||
| 2400 | struct rb_node *node; | ||
| 2401 | int ret; | ||
| 2402 | |||
| 2403 | inode = new->inode; | ||
| 2404 | root = BTRFS_I(inode)->root; | ||
| 2405 | |||
| 2406 | path = btrfs_alloc_path(); | ||
| 2407 | if (!path) | ||
| 2408 | return; | ||
| 2409 | |||
| 2410 | if (!record_extent_backrefs(path, new)) { | ||
| 2411 | btrfs_free_path(path); | ||
| 2412 | goto out; | ||
| 2413 | } | ||
| 2414 | btrfs_release_path(path); | ||
| 2415 | |||
| 2416 | while (1) { | ||
| 2417 | node = rb_first(&new->root); | ||
| 2418 | if (!node) | ||
| 2419 | break; | ||
| 2420 | rb_erase(node, &new->root); | ||
| 2421 | |||
| 2422 | backref = rb_entry(node, struct sa_defrag_extent_backref, node); | ||
| 2423 | |||
| 2424 | ret = relink_extent_backref(path, prev, backref); | ||
| 2425 | WARN_ON(ret < 0); | ||
| 2426 | |||
| 2427 | kfree(prev); | ||
| 2428 | |||
| 2429 | if (ret == 1) | ||
| 2430 | prev = backref; | ||
| 2431 | else | ||
| 2432 | prev = NULL; | ||
| 2433 | cond_resched(); | ||
| 2434 | } | ||
| 2435 | kfree(prev); | ||
| 2436 | |||
| 2437 | btrfs_free_path(path); | ||
| 2438 | |||
| 2439 | list_for_each_entry_safe(old, tmp, &new->head, list) { | ||
| 2440 | list_del(&old->list); | ||
| 2441 | kfree(old); | ||
| 2442 | } | ||
| 2443 | out: | ||
| 2444 | atomic_dec(&root->fs_info->defrag_running); | ||
| 2445 | wake_up(&root->fs_info->transaction_wait); | ||
| 2446 | |||
| 2447 | kfree(new); | ||
| 2448 | } | ||
| 2449 | |||
| 2450 | static struct new_sa_defrag_extent * | ||
| 2451 | record_old_file_extents(struct inode *inode, | ||
| 2452 | struct btrfs_ordered_extent *ordered) | ||
| 2453 | { | ||
| 2454 | struct btrfs_root *root = BTRFS_I(inode)->root; | ||
| 2455 | struct btrfs_path *path; | ||
| 2456 | struct btrfs_key key; | ||
| 2457 | struct old_sa_defrag_extent *old, *tmp; | ||
| 2458 | struct new_sa_defrag_extent *new; | ||
| 2459 | int ret; | ||
| 2460 | |||
| 2461 | new = kmalloc(sizeof(*new), GFP_NOFS); | ||
| 2462 | if (!new) | ||
| 2463 | return NULL; | ||
| 2464 | |||
| 2465 | new->inode = inode; | ||
| 2466 | new->file_pos = ordered->file_offset; | ||
| 2467 | new->len = ordered->len; | ||
| 2468 | new->bytenr = ordered->start; | ||
| 2469 | new->disk_len = ordered->disk_len; | ||
| 2470 | new->compress_type = ordered->compress_type; | ||
| 2471 | new->root = RB_ROOT; | ||
| 2472 | INIT_LIST_HEAD(&new->head); | ||
| 2473 | |||
| 2474 | path = btrfs_alloc_path(); | ||
| 2475 | if (!path) | ||
| 2476 | goto out_kfree; | ||
| 2477 | |||
| 2478 | key.objectid = btrfs_ino(inode); | ||
| 2479 | key.type = BTRFS_EXTENT_DATA_KEY; | ||
| 2480 | key.offset = new->file_pos; | ||
| 2481 | |||
| 2482 | ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); | ||
| 2483 | if (ret < 0) | ||
| 2484 | goto out_free_path; | ||
| 2485 | if (ret > 0 && path->slots[0] > 0) | ||
| 2486 | path->slots[0]--; | ||
| 2487 | |||
| 2488 | /* find out all the old extents for the file range */ | ||
| 2489 | while (1) { | ||
| 2490 | struct btrfs_file_extent_item *extent; | ||
| 2491 | struct extent_buffer *l; | ||
| 2492 | int slot; | ||
| 2493 | u64 num_bytes; | ||
| 2494 | u64 offset; | ||
| 2495 | u64 end; | ||
| 2496 | u64 disk_bytenr; | ||
| 2497 | u64 extent_offset; | ||
| 2498 | |||
| 2499 | l = path->nodes[0]; | ||
| 2500 | slot = path->slots[0]; | ||
| 2501 | |||
| 2502 | if (slot >= btrfs_header_nritems(l)) { | ||
| 2503 | ret = btrfs_next_leaf(root, path); | ||
| 2504 | if (ret < 0) | ||
| 2505 | goto out_free_list; | ||
| 2506 | else if (ret > 0) | ||
| 2507 | break; | ||
| 2508 | continue; | ||
| 2509 | } | ||
| 2510 | |||
| 2511 | btrfs_item_key_to_cpu(l, &key, slot); | ||
| 2512 | |||
| 2513 | if (key.objectid != btrfs_ino(inode)) | ||
| 2514 | break; | ||
| 2515 | if (key.type != BTRFS_EXTENT_DATA_KEY) | ||
| 2516 | break; | ||
| 2517 | if (key.offset >= new->file_pos + new->len) | ||
| 2518 | break; | ||
| 2519 | |||
| 2520 | extent = btrfs_item_ptr(l, slot, struct btrfs_file_extent_item); | ||
| 2521 | |||
| 2522 | num_bytes = btrfs_file_extent_num_bytes(l, extent); | ||
| 2523 | if (key.offset + num_bytes < new->file_pos) | ||
| 2524 | goto next; | ||
| 2525 | |||
| 2526 | disk_bytenr = btrfs_file_extent_disk_bytenr(l, extent); | ||
| 2527 | if (!disk_bytenr) | ||
| 2528 | goto next; | ||
| 2529 | |||
| 2530 | extent_offset = btrfs_file_extent_offset(l, extent); | ||
| 2531 | |||
| 2532 | old = kmalloc(sizeof(*old), GFP_NOFS); | ||
| 2533 | if (!old) | ||
| 2534 | goto out_free_list; | ||
| 2535 | |||
| 2536 | offset = max(new->file_pos, key.offset); | ||
| 2537 | end = min(new->file_pos + new->len, key.offset + num_bytes); | ||
| 2538 | |||
| 2539 | old->bytenr = disk_bytenr; | ||
| 2540 | old->extent_offset = extent_offset; | ||
| 2541 | old->offset = offset - key.offset; | ||
| 2542 | old->len = end - offset; | ||
| 2543 | old->new = new; | ||
| 2544 | old->count = 0; | ||
| 2545 | list_add_tail(&old->list, &new->head); | ||
| 2546 | next: | ||
| 2547 | path->slots[0]++; | ||
| 2548 | cond_resched(); | ||
| 2549 | } | ||
| 2550 | |||
| 2551 | btrfs_free_path(path); | ||
| 2552 | atomic_inc(&root->fs_info->defrag_running); | ||
| 2553 | |||
| 2554 | return new; | ||
| 2555 | |||
| 2556 | out_free_list: | ||
| 2557 | list_for_each_entry_safe(old, tmp, &new->head, list) { | ||
| 2558 | list_del(&old->list); | ||
| 2559 | kfree(old); | ||
| 2560 | } | ||
| 2561 | out_free_path: | ||
| 2562 | btrfs_free_path(path); | ||
| 2563 | out_kfree: | ||
| 2564 | kfree(new); | ||
| 2565 | return NULL; | ||
| 2566 | } | ||
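record_old_file_extents opens with a common btrfs search idiom: look up (ino, EXTENT_DATA, file_pos) and, when there is no exact match (ret > 0), step back one slot, since the extent item covering file_pos may be keyed at an earlier offset. A minimal sketch of the idiom in isolation:

    /* position 'path' at the extent item that may cover 'pos' */
    static int find_covering_extent(struct btrfs_root *root,
                                    struct btrfs_path *path,
                                    u64 ino, u64 pos)
    {
            struct btrfs_key key;
            int ret;

            key.objectid = ino;
            key.type = BTRFS_EXTENT_DATA_KEY;
            key.offset = pos;

            ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
            if (ret < 0)
                    return ret;
            /* no exact key: the previous item may still overlap pos */
            if (ret > 0 && path->slots[0] > 0)
                    path->slots[0]--;
            return 0;
    }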
| 2567 | |||
| 1895 | /* | 2568 | /* |
| 1896 | * helper function for btrfs_finish_ordered_io, this | 2569 | * helper function for btrfs_finish_ordered_io, this |
| 1897 | * just reads in some of the csum leaves to prime them into ram | 2570 | * just reads in some of the csum leaves to prime them into ram |
| @@ -1909,6 +2582,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) | |||
| 1909 | struct btrfs_trans_handle *trans = NULL; | 2582 | struct btrfs_trans_handle *trans = NULL; |
| 1910 | struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; | 2583 | struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; |
| 1911 | struct extent_state *cached_state = NULL; | 2584 | struct extent_state *cached_state = NULL; |
| 2585 | struct new_sa_defrag_extent *new = NULL; | ||
| 1912 | int compress_type = 0; | 2586 | int compress_type = 0; |
| 1913 | int ret; | 2587 | int ret; |
| 1914 | bool nolock; | 2588 | bool nolock; |
| @@ -1943,6 +2617,20 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) | |||
| 1943 | ordered_extent->file_offset + ordered_extent->len - 1, | 2617 | ordered_extent->file_offset + ordered_extent->len - 1, |
| 1944 | 0, &cached_state); | 2618 | 0, &cached_state); |
| 1945 | 2619 | ||
| 2620 | ret = test_range_bit(io_tree, ordered_extent->file_offset, | ||
| 2621 | ordered_extent->file_offset + ordered_extent->len - 1, | ||
| 2622 | EXTENT_DEFRAG, 1, cached_state); | ||
| 2623 | if (ret) { | ||
| 2624 | u64 last_snapshot = btrfs_root_last_snapshot(&root->root_item); | ||
| 2625 | if (last_snapshot >= BTRFS_I(inode)->generation) | ||
| 2626 | /* the inode is shared */ | ||
| 2627 | new = record_old_file_extents(inode, ordered_extent); | ||
| 2628 | |||
| 2629 | clear_extent_bit(io_tree, ordered_extent->file_offset, | ||
| 2630 | ordered_extent->file_offset + ordered_extent->len - 1, | ||
| 2631 | EXTENT_DEFRAG, 0, 0, &cached_state, GFP_NOFS); | ||
| 2632 | } | ||
| 2633 | |||
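This block is the trigger for snapshot-aware defrag: if the completed range still carries EXTENT_DEFRAG and the root's last snapshot is at least as new as the inode's generation, the pre-defrag extents may be shared with a snapshot and get recorded for later relinking. A worked reading of the condition (numbers are illustrative):

    /*
     * last_snapshot = 120 (root last snapshotted at transid 120)
     * generation    = 100 (inode generation, transid 100)
     * 120 >= 100 -> a snapshot may still reference the old extents,
     *               so record them before the defragged data lands.
     *
     * If instead generation were 130, the inode postdates every
     * snapshot of this root, nothing can share its extents, and
     * the relink machinery is skipped entirely.
     */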
| 1946 | if (nolock) | 2634 | if (nolock) |
| 1947 | trans = btrfs_join_transaction_nolock(root); | 2635 | trans = btrfs_join_transaction_nolock(root); |
| 1948 | else | 2636 | else |
| @@ -2001,17 +2689,33 @@ out: | |||
| 2001 | if (trans) | 2689 | if (trans) |
| 2002 | btrfs_end_transaction(trans, root); | 2690 | btrfs_end_transaction(trans, root); |
| 2003 | 2691 | ||
| 2004 | if (ret) | 2692 | if (ret) { |
| 2005 | clear_extent_uptodate(io_tree, ordered_extent->file_offset, | 2693 | clear_extent_uptodate(io_tree, ordered_extent->file_offset, |
| 2006 | ordered_extent->file_offset + | 2694 | ordered_extent->file_offset + |
| 2007 | ordered_extent->len - 1, NULL, GFP_NOFS); | 2695 | ordered_extent->len - 1, NULL, GFP_NOFS); |
| 2008 | 2696 | ||
| 2697 | /* | ||
| 2698 | * If the ordered extent had an IOERR or something else went | ||
| 2699 | * wrong, we need to return the space for this ordered extent | ||
| 2700 | * back to the allocator. | ||
| 2701 | */ | ||
| 2702 | if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) && | ||
| 2703 | !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) | ||
| 2704 | btrfs_free_reserved_extent(root, ordered_extent->start, | ||
| 2705 | ordered_extent->disk_len); | ||
| 2706 | } | ||
| 2707 | |||
| 2708 | |||
| 2009 | /* | 2709 | /* |
| 2010 | * This needs to be done to make sure anybody waiting knows we are done | 2710 | * This needs to be done to make sure anybody waiting knows we are done |
| 2011 | * updating everything for this ordered extent. | 2711 | * updating everything for this ordered extent. |
| 2012 | */ | 2712 | */ |
| 2013 | btrfs_remove_ordered_extent(inode, ordered_extent); | 2713 | btrfs_remove_ordered_extent(inode, ordered_extent); |
| 2014 | 2714 | ||
| 2715 | /* for snapshot-aware defrag */ | ||
| 2716 | if (new) | ||
| 2717 | relink_file_extents(new); | ||
| 2718 | |||
| 2015 | /* once for us */ | 2719 | /* once for us */ |
| 2016 | btrfs_put_ordered_extent(ordered_extent); | 2720 | btrfs_put_ordered_extent(ordered_extent); |
| 2017 | /* once for the tree */ | 2721 | /* once for the tree */ |
| @@ -2062,7 +2766,7 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, | |||
| 2062 | static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, | 2766 | static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, |
| 2063 | struct extent_state *state, int mirror) | 2767 | struct extent_state *state, int mirror) |
| 2064 | { | 2768 | { |
| 2065 | size_t offset = start - ((u64)page->index << PAGE_CACHE_SHIFT); | 2769 | size_t offset = start - page_offset(page); |
| 2066 | struct inode *inode = page->mapping->host; | 2770 | struct inode *inode = page->mapping->host; |
| 2067 | struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; | 2771 | struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; |
| 2068 | char *kaddr; | 2772 | char *kaddr; |
| @@ -2167,11 +2871,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root) | |||
| 2167 | } | 2871 | } |
| 2168 | } | 2872 | } |
| 2169 | 2873 | ||
| 2170 | enum btrfs_orphan_cleanup_state { | ||
| 2171 | ORPHAN_CLEANUP_STARTED = 1, | ||
| 2172 | ORPHAN_CLEANUP_DONE = 2, | ||
| 2173 | }; | ||
| 2174 | |||
| 2175 | /* | 2874 | /* |
| 2176 | * This is called in transaction commit time. If there are no orphan | 2875 | * This is called in transaction commit time. If there are no orphan |
| 2177 | * files in the subvolume, it removes orphan item and frees block_rsv | 2876 | * files in the subvolume, it removes orphan item and frees block_rsv |
| @@ -2469,6 +3168,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) | |||
| 2469 | */ | 3168 | */ |
| 2470 | set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, | 3169 | set_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, |
| 2471 | &BTRFS_I(inode)->runtime_flags); | 3170 | &BTRFS_I(inode)->runtime_flags); |
| 3171 | atomic_inc(&root->orphan_inodes); | ||
| 2472 | 3172 | ||
| 2473 | /* if we have links, this was a truncate, lets do that */ | 3173 | /* if we have links, this was a truncate, lets do that */ |
| 2474 | if (inode->i_nlink) { | 3174 | if (inode->i_nlink) { |
| @@ -2491,6 +3191,8 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) | |||
| 2491 | goto out; | 3191 | goto out; |
| 2492 | 3192 | ||
| 2493 | ret = btrfs_truncate(inode); | 3193 | ret = btrfs_truncate(inode); |
| 3194 | if (ret) | ||
| 3195 | btrfs_orphan_del(NULL, inode); | ||
| 2494 | } else { | 3196 | } else { |
| 2495 | nr_unlink++; | 3197 | nr_unlink++; |
| 2496 | } | 3198 | } |
| @@ -2709,34 +3411,41 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, | |||
| 2709 | struct btrfs_inode_item *item, | 3411 | struct btrfs_inode_item *item, |
| 2710 | struct inode *inode) | 3412 | struct inode *inode) |
| 2711 | { | 3413 | { |
| 2712 | btrfs_set_inode_uid(leaf, item, i_uid_read(inode)); | 3414 | struct btrfs_map_token token; |
| 2713 | btrfs_set_inode_gid(leaf, item, i_gid_read(inode)); | 3415 | |
| 2714 | btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size); | 3416 | btrfs_init_map_token(&token); |
| 2715 | btrfs_set_inode_mode(leaf, item, inode->i_mode); | 3417 | |
| 2716 | btrfs_set_inode_nlink(leaf, item, inode->i_nlink); | 3418 | btrfs_set_token_inode_uid(leaf, item, i_uid_read(inode), &token); |
| 3419 | btrfs_set_token_inode_gid(leaf, item, i_gid_read(inode), &token); | ||
| 3420 | btrfs_set_token_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size, | ||
| 3421 | &token); | ||
| 3422 | btrfs_set_token_inode_mode(leaf, item, inode->i_mode, &token); | ||
| 3423 | btrfs_set_token_inode_nlink(leaf, item, inode->i_nlink, &token); | ||
| 2717 | 3424 | ||
| 2718 | btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item), | 3425 | btrfs_set_token_timespec_sec(leaf, btrfs_inode_atime(item), |
| 2719 | inode->i_atime.tv_sec); | 3426 | inode->i_atime.tv_sec, &token); |
| 2720 | btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item), | 3427 | btrfs_set_token_timespec_nsec(leaf, btrfs_inode_atime(item), |
| 2721 | inode->i_atime.tv_nsec); | 3428 | inode->i_atime.tv_nsec, &token); |
| 2722 | 3429 | ||
| 2723 | btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item), | 3430 | btrfs_set_token_timespec_sec(leaf, btrfs_inode_mtime(item), |
| 2724 | inode->i_mtime.tv_sec); | 3431 | inode->i_mtime.tv_sec, &token); |
| 2725 | btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item), | 3432 | btrfs_set_token_timespec_nsec(leaf, btrfs_inode_mtime(item), |
| 2726 | inode->i_mtime.tv_nsec); | 3433 | inode->i_mtime.tv_nsec, &token); |
| 2727 | 3434 | ||
| 2728 | btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item), | 3435 | btrfs_set_token_timespec_sec(leaf, btrfs_inode_ctime(item), |
| 2729 | inode->i_ctime.tv_sec); | 3436 | inode->i_ctime.tv_sec, &token); |
| 2730 | btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item), | 3437 | btrfs_set_token_timespec_nsec(leaf, btrfs_inode_ctime(item), |
| 2731 | inode->i_ctime.tv_nsec); | 3438 | inode->i_ctime.tv_nsec, &token); |
| 2732 | 3439 | ||
| 2733 | btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode)); | 3440 | btrfs_set_token_inode_nbytes(leaf, item, inode_get_bytes(inode), |
| 2734 | btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation); | 3441 | &token); |
| 2735 | btrfs_set_inode_sequence(leaf, item, inode->i_version); | 3442 | btrfs_set_token_inode_generation(leaf, item, BTRFS_I(inode)->generation, |
| 2736 | btrfs_set_inode_transid(leaf, item, trans->transid); | 3443 | &token); |
| 2737 | btrfs_set_inode_rdev(leaf, item, inode->i_rdev); | 3444 | btrfs_set_token_inode_sequence(leaf, item, inode->i_version, &token); |
| 2738 | btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags); | 3445 | btrfs_set_token_inode_transid(leaf, item, trans->transid, &token); |
| 2739 | btrfs_set_inode_block_group(leaf, item, 0); | 3446 | btrfs_set_token_inode_rdev(leaf, item, inode->i_rdev, &token); |
| 3447 | btrfs_set_token_inode_flags(leaf, item, BTRFS_I(inode)->flags, &token); | ||
| 3448 | btrfs_set_token_inode_block_group(leaf, item, 0, &token); | ||
| 2740 | } | 3449 | } |
| 2741 | 3450 | ||
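fill_inode_item switches all of its setters to the _token variants. A btrfs_map_token caches the extent-buffer page mapping across consecutive calls, so writing a dozen adjacent fields of one inode item costs one mapping lookup instead of one per field. A simplified, hypothetical illustration of the idea (the real structure and revalidation logic live in ctree.h; map_buffer_page is a stand-in name):

    struct token_sketch {
            struct extent_buffer *eb;   /* buffer the cache is valid for */
            char *kaddr;                /* cached mapping, NULL if cold */
    };

    static void set_field_cached(struct token_sketch *tok,
                                 struct extent_buffer *eb,
                                 unsigned long off, u32 val)
    {
            /*
             * Simplified: the real code also revalidates when 'off'
             * falls outside the currently mapped page.
             */
            if (tok->eb != eb || !tok->kaddr)
                    tok->kaddr = map_buffer_page(eb, off); /* stand-in */
            put_unaligned_le32(val, tok->kaddr + off);
    }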
| 2742 | /* | 3451 | /* |
| @@ -3304,7 +4013,6 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, | |||
| 3304 | u64 extent_num_bytes = 0; | 4013 | u64 extent_num_bytes = 0; |
| 3305 | u64 extent_offset = 0; | 4014 | u64 extent_offset = 0; |
| 3306 | u64 item_end = 0; | 4015 | u64 item_end = 0; |
| 3307 | u64 mask = root->sectorsize - 1; | ||
| 3308 | u32 found_type = (u8)-1; | 4016 | u32 found_type = (u8)-1; |
| 3309 | int found_extent; | 4017 | int found_extent; |
| 3310 | int del_item; | 4018 | int del_item; |
| @@ -3328,7 +4036,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, | |||
| 3328 | * extent just the way it is. | 4036 | * extent just the way it is. |
| 3329 | */ | 4037 | */ |
| 3330 | if (root->ref_cows || root == root->fs_info->tree_root) | 4038 | if (root->ref_cows || root == root->fs_info->tree_root) |
| 3331 | btrfs_drop_extent_cache(inode, (new_size + mask) & (~mask), (u64)-1, 0); | 4039 | btrfs_drop_extent_cache(inode, ALIGN(new_size, |
| 4040 | root->sectorsize), (u64)-1, 0); | ||
| 3332 | 4041 | ||
| 3333 | /* | 4042 | /* |
| 3334 | * This function is also used to drop the items in the log tree before | 4043 | * This function is also used to drop the items in the log tree before |
| @@ -3407,10 +4116,9 @@ search_again: | |||
| 3407 | if (!del_item) { | 4116 | if (!del_item) { |
| 3408 | u64 orig_num_bytes = | 4117 | u64 orig_num_bytes = |
| 3409 | btrfs_file_extent_num_bytes(leaf, fi); | 4118 | btrfs_file_extent_num_bytes(leaf, fi); |
| 3410 | extent_num_bytes = new_size - | 4119 | extent_num_bytes = ALIGN(new_size - |
| 3411 | found_key.offset + root->sectorsize - 1; | 4120 | found_key.offset, |
| 3412 | extent_num_bytes = extent_num_bytes & | 4121 | root->sectorsize); |
| 3413 | ~((u64)root->sectorsize - 1); | ||
| 3414 | btrfs_set_file_extent_num_bytes(leaf, fi, | 4122 | btrfs_set_file_extent_num_bytes(leaf, fi, |
| 3415 | extent_num_bytes); | 4123 | extent_num_bytes); |
| 3416 | num_dec = (orig_num_bytes - | 4124 | num_dec = (orig_num_bytes - |
| @@ -3646,9 +4354,8 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) | |||
| 3646 | struct extent_map *em = NULL; | 4354 | struct extent_map *em = NULL; |
| 3647 | struct extent_state *cached_state = NULL; | 4355 | struct extent_state *cached_state = NULL; |
| 3648 | struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; | 4356 | struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; |
| 3649 | u64 mask = root->sectorsize - 1; | 4357 | u64 hole_start = ALIGN(oldsize, root->sectorsize); |
| 3650 | u64 hole_start = (oldsize + mask) & ~mask; | 4358 | u64 block_end = ALIGN(size, root->sectorsize); |
| 3651 | u64 block_end = (size + mask) & ~mask; | ||
| 3652 | u64 last_byte; | 4359 | u64 last_byte; |
| 3653 | u64 cur_offset; | 4360 | u64 cur_offset; |
| 3654 | u64 hole_size; | 4361 | u64 hole_size; |
| @@ -3681,7 +4388,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) | |||
| 3681 | break; | 4388 | break; |
| 3682 | } | 4389 | } |
| 3683 | last_byte = min(extent_map_end(em), block_end); | 4390 | last_byte = min(extent_map_end(em), block_end); |
| 3684 | last_byte = (last_byte + mask) & ~mask; | 4391 | last_byte = ALIGN(last_byte, root->sectorsize); |
| 3685 | if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { | 4392 | if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { |
| 3686 | struct extent_map *hole_em; | 4393 | struct extent_map *hole_em; |
| 3687 | hole_size = last_byte - cur_offset; | 4394 | hole_size = last_byte - cur_offset; |
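The hunks in this region replace open-coded round-up-to-sectorsize masking with the kernel's ALIGN() macro; for power-of-two alignments the two forms are bit-identical. A worked example:

    #include <linux/kernel.h>       /* ALIGN() */

    /*
     * For power-of-two 'a': ALIGN(x, a) == (x + a - 1) & ~(a - 1).
     * With sectorsize = 4096:
     *   ALIGN(0, 4096)    == 0
     *   ALIGN(1, 4096)    == 4096
     *   ALIGN(4096, 4096) == 4096
     *   ALIGN(4097, 4096) == 8192
     */
    static inline u64 round_up_to_sector(u64 x, u64 sectorsize)
    {
            return ALIGN(x, sectorsize);    /* == (x + mask) & ~mask */
    }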
| @@ -3832,6 +4539,12 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) | |||
| 3832 | 4539 | ||
| 3833 | /* we don't support swapfiles, so vmtruncate shouldn't fail */ | 4540 | /* we don't support swapfiles, so vmtruncate shouldn't fail */ |
| 3834 | truncate_setsize(inode, newsize); | 4541 | truncate_setsize(inode, newsize); |
| 4542 | |||
| 4543 | /* Disable nonlocked read DIO to avoid the endless truncate */ | ||
| 4544 | btrfs_inode_block_unlocked_dio(inode); | ||
| 4545 | inode_dio_wait(inode); | ||
| 4546 | btrfs_inode_resume_unlocked_dio(inode); | ||
| 4547 | |||
| 3835 | ret = btrfs_truncate(inode); | 4548 | ret = btrfs_truncate(inode); |
| 3836 | if (ret && inode->i_nlink) | 4549 | if (ret && inode->i_nlink) |
| 3837 | btrfs_orphan_del(NULL, inode); | 4550 | btrfs_orphan_del(NULL, inode); |
| @@ -3904,6 +4617,12 @@ void btrfs_evict_inode(struct inode *inode) | |||
| 3904 | goto no_delete; | 4617 | goto no_delete; |
| 3905 | } | 4618 | } |
| 3906 | 4619 | ||
| 4620 | ret = btrfs_commit_inode_delayed_inode(inode); | ||
| 4621 | if (ret) { | ||
| 4622 | btrfs_orphan_del(NULL, inode); | ||
| 4623 | goto no_delete; | ||
| 4624 | } | ||
| 4625 | |||
| 3907 | rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP); | 4626 | rsv = btrfs_alloc_block_rsv(root, BTRFS_BLOCK_RSV_TEMP); |
| 3908 | if (!rsv) { | 4627 | if (!rsv) { |
| 3909 | btrfs_orphan_del(NULL, inode); | 4628 | btrfs_orphan_del(NULL, inode); |
| @@ -3941,7 +4660,7 @@ void btrfs_evict_inode(struct inode *inode) | |||
| 3941 | goto no_delete; | 4660 | goto no_delete; |
| 3942 | } | 4661 | } |
| 3943 | 4662 | ||
| 3944 | trans = btrfs_start_transaction_lflush(root, 1); | 4663 | trans = btrfs_join_transaction(root); |
| 3945 | if (IS_ERR(trans)) { | 4664 | if (IS_ERR(trans)) { |
| 3946 | btrfs_orphan_del(NULL, inode); | 4665 | btrfs_orphan_del(NULL, inode); |
| 3947 | btrfs_free_block_rsv(root, rsv); | 4666 | btrfs_free_block_rsv(root, rsv); |
| @@ -3955,9 +4674,6 @@ void btrfs_evict_inode(struct inode *inode) | |||
| 3955 | break; | 4674 | break; |
| 3956 | 4675 | ||
| 3957 | trans->block_rsv = &root->fs_info->trans_block_rsv; | 4676 | trans->block_rsv = &root->fs_info->trans_block_rsv; |
| 3958 | ret = btrfs_update_inode(trans, root, inode); | ||
| 3959 | BUG_ON(ret); | ||
| 3960 | |||
| 3961 | btrfs_end_transaction(trans, root); | 4677 | btrfs_end_transaction(trans, root); |
| 3962 | trans = NULL; | 4678 | trans = NULL; |
| 3963 | btrfs_btree_balance_dirty(root); | 4679 | btrfs_btree_balance_dirty(root); |
| @@ -4854,7 +5570,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, | |||
| 4854 | if (btrfs_test_opt(root, NODATASUM)) | 5570 | if (btrfs_test_opt(root, NODATASUM)) |
| 4855 | BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; | 5571 | BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; |
| 4856 | if (btrfs_test_opt(root, NODATACOW)) | 5572 | if (btrfs_test_opt(root, NODATACOW)) |
| 4857 | BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; | 5573 | BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW | |
| 5574 | BTRFS_INODE_NODATASUM; | ||
| 4858 | } | 5575 | } |
| 4859 | 5576 | ||
| 4860 | insert_inode_hash(inode); | 5577 | insert_inode_hash(inode); |
| @@ -5006,12 +5723,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, | |||
| 5006 | goto out_unlock; | 5723 | goto out_unlock; |
| 5007 | } | 5724 | } |
| 5008 | 5725 | ||
| 5009 | err = btrfs_update_inode(trans, root, inode); | ||
| 5010 | if (err) { | ||
| 5011 | drop_inode = 1; | ||
| 5012 | goto out_unlock; | ||
| 5013 | } | ||
| 5014 | |||
| 5015 | /* | 5726 | /* |
| 5016 | * If the active LSM wants to access the inode during | 5727 | * If the active LSM wants to access the inode during |
| 5017 | * d_instantiate it needs these. Smack checks to see | 5728 | * d_instantiate it needs these. Smack checks to see |
| @@ -5396,8 +6107,7 @@ again: | |||
| 5396 | } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { | 6107 | } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { |
| 5397 | size_t size; | 6108 | size_t size; |
| 5398 | size = btrfs_file_extent_inline_len(leaf, item); | 6109 | size = btrfs_file_extent_inline_len(leaf, item); |
| 5399 | extent_end = (extent_start + size + root->sectorsize - 1) & | 6110 | extent_end = ALIGN(extent_start + size, root->sectorsize); |
| 5400 | ~((u64)root->sectorsize - 1); | ||
| 5401 | } | 6111 | } |
| 5402 | 6112 | ||
| 5403 | if (start >= extent_end) { | 6113 | if (start >= extent_end) { |
| @@ -5469,8 +6179,7 @@ again: | |||
| 5469 | copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset, | 6179 | copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset, |
| 5470 | size - extent_offset); | 6180 | size - extent_offset); |
| 5471 | em->start = extent_start + extent_offset; | 6181 | em->start = extent_start + extent_offset; |
| 5472 | em->len = (copy_size + root->sectorsize - 1) & | 6182 | em->len = ALIGN(copy_size, root->sectorsize); |
| 5473 | ~((u64)root->sectorsize - 1); | ||
| 5474 | em->orig_block_len = em->len; | 6183 | em->orig_block_len = em->len; |
| 5475 | em->orig_start = em->start; | 6184 | em->orig_start = em->start; |
| 5476 | if (compress_type) { | 6185 | if (compress_type) { |
| @@ -5949,6 +6658,8 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start, | |||
| 5949 | 6658 | ||
| 5950 | em->start = start; | 6659 | em->start = start; |
| 5951 | em->orig_start = orig_start; | 6660 | em->orig_start = orig_start; |
| 6661 | em->mod_start = start; | ||
| 6662 | em->mod_len = len; | ||
| 5952 | em->len = len; | 6663 | em->len = len; |
| 5953 | em->block_len = block_len; | 6664 | em->block_len = block_len; |
| 5954 | em->block_start = block_start; | 6665 | em->block_start = block_start; |
| @@ -5990,16 +6701,12 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, | |||
| 5990 | u64 len = bh_result->b_size; | 6701 | u64 len = bh_result->b_size; |
| 5991 | struct btrfs_trans_handle *trans; | 6702 | struct btrfs_trans_handle *trans; |
| 5992 | int unlock_bits = EXTENT_LOCKED; | 6703 | int unlock_bits = EXTENT_LOCKED; |
| 5993 | int ret; | 6704 | int ret = 0; |
| 5994 | 6705 | ||
| 5995 | if (create) { | 6706 | if (create) |
| 5996 | ret = btrfs_delalloc_reserve_space(inode, len); | ||
| 5997 | if (ret) | ||
| 5998 | return ret; | ||
| 5999 | unlock_bits |= EXTENT_DELALLOC | EXTENT_DIRTY; | 6707 | unlock_bits |= EXTENT_DELALLOC | EXTENT_DIRTY; |
| 6000 | } else { | 6708 | else |
| 6001 | len = min_t(u64, len, root->sectorsize); | 6709 | len = min_t(u64, len, root->sectorsize); |
| 6002 | } | ||
| 6003 | 6710 | ||
| 6004 | lockstart = start; | 6711 | lockstart = start; |
| 6005 | lockend = start + len - 1; | 6712 | lockend = start + len - 1; |
| @@ -6011,14 +6718,6 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, | |||
| 6011 | if (lock_extent_direct(inode, lockstart, lockend, &cached_state, create)) | 6718 | if (lock_extent_direct(inode, lockstart, lockend, &cached_state, create)) |
| 6012 | return -ENOTBLK; | 6719 | return -ENOTBLK; |
| 6013 | 6720 | ||
| 6014 | if (create) { | ||
| 6015 | ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, | ||
| 6016 | lockend, EXTENT_DELALLOC, NULL, | ||
| 6017 | &cached_state, GFP_NOFS); | ||
| 6018 | if (ret) | ||
| 6019 | goto unlock_err; | ||
| 6020 | } | ||
| 6021 | |||
| 6022 | em = btrfs_get_extent(inode, NULL, 0, start, len, 0); | 6721 | em = btrfs_get_extent(inode, NULL, 0, start, len, 0); |
| 6023 | if (IS_ERR(em)) { | 6722 | if (IS_ERR(em)) { |
| 6024 | ret = PTR_ERR(em); | 6723 | ret = PTR_ERR(em); |
| @@ -6050,7 +6749,6 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, | |||
| 6050 | if (!create && (em->block_start == EXTENT_MAP_HOLE || | 6749 | if (!create && (em->block_start == EXTENT_MAP_HOLE || |
| 6051 | test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { | 6750 | test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { |
| 6052 | free_extent_map(em); | 6751 | free_extent_map(em); |
| 6053 | ret = 0; | ||
| 6054 | goto unlock_err; | 6752 | goto unlock_err; |
| 6055 | } | 6753 | } |
| 6056 | 6754 | ||
| @@ -6148,6 +6846,15 @@ unlock: | |||
| 6148 | */ | 6846 | */ |
| 6149 | if (start + len > i_size_read(inode)) | 6847 | if (start + len > i_size_read(inode)) |
| 6150 | i_size_write(inode, start + len); | 6848 | i_size_write(inode, start + len); |
| 6849 | |||
| 6850 | spin_lock(&BTRFS_I(inode)->lock); | ||
| 6851 | BTRFS_I(inode)->outstanding_extents++; | ||
| 6852 | spin_unlock(&BTRFS_I(inode)->lock); | ||
| 6853 | |||
| 6854 | ret = set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, | ||
| 6855 | lockstart + len - 1, EXTENT_DELALLOC, NULL, | ||
| 6856 | &cached_state, GFP_NOFS); | ||
| 6857 | BUG_ON(ret); | ||
| 6151 | } | 6858 | } |
| 6152 | 6859 | ||
| 6153 | /* | 6860 | /* |
| @@ -6156,24 +6863,9 @@ unlock: | |||
| 6156 | * aren't using if there is any left over space. | 6863 | * aren't using if there is any left over space. |
| 6157 | */ | 6864 | */ |
| 6158 | if (lockstart < lockend) { | 6865 | if (lockstart < lockend) { |
| 6159 | if (create && len < lockend - lockstart) { | 6866 | clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, |
| 6160 | clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, | 6867 | lockend, unlock_bits, 1, 0, |
| 6161 | lockstart + len - 1, | 6868 | &cached_state, GFP_NOFS); |
| 6162 | unlock_bits | EXTENT_DEFRAG, 1, 0, | ||
| 6163 | &cached_state, GFP_NOFS); | ||
| 6164 | /* | ||
| 6165 | * Beside unlock, we also need to cleanup reserved space | ||
| 6166 | * for the left range by attaching EXTENT_DO_ACCOUNTING. | ||
| 6167 | */ | ||
| 6168 | clear_extent_bit(&BTRFS_I(inode)->io_tree, | ||
| 6169 | lockstart + len, lockend, | ||
| 6170 | unlock_bits | EXTENT_DO_ACCOUNTING | | ||
| 6171 | EXTENT_DEFRAG, 1, 0, NULL, GFP_NOFS); | ||
| 6172 | } else { | ||
| 6173 | clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, | ||
| 6174 | lockend, unlock_bits, 1, 0, | ||
| 6175 | &cached_state, GFP_NOFS); | ||
| 6176 | } | ||
| 6177 | } else { | 6869 | } else { |
| 6178 | free_extent_state(cached_state); | 6870 | free_extent_state(cached_state); |
| 6179 | } | 6871 | } |
| @@ -6183,9 +6875,6 @@ unlock: | |||
| 6183 | return 0; | 6875 | return 0; |
| 6184 | 6876 | ||
| 6185 | unlock_err: | 6877 | unlock_err: |
| 6186 | if (create) | ||
| 6187 | unlock_bits |= EXTENT_DO_ACCOUNTING; | ||
| 6188 | |||
| 6189 | clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, | 6878 | clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, |
| 6190 | unlock_bits, 1, 0, &cached_state, GFP_NOFS); | 6879 | unlock_bits, 1, 0, &cached_state, GFP_NOFS); |
| 6191 | return ret; | 6880 | return ret; |
| @@ -6426,19 +7115,24 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, | |||
| 6426 | int async_submit = 0; | 7115 | int async_submit = 0; |
| 6427 | 7116 | ||
| 6428 | map_length = orig_bio->bi_size; | 7117 | map_length = orig_bio->bi_size; |
| 6429 | ret = btrfs_map_block(root->fs_info, READ, start_sector << 9, | 7118 | ret = btrfs_map_block(root->fs_info, rw, start_sector << 9, |
| 6430 | &map_length, NULL, 0); | 7119 | &map_length, NULL, 0); |
| 6431 | if (ret) { | 7120 | if (ret) { |
| 6432 | bio_put(orig_bio); | 7121 | bio_put(orig_bio); |
| 6433 | return -EIO; | 7122 | return -EIO; |
| 6434 | } | 7123 | } |
| 6435 | |||
| 6436 | if (map_length >= orig_bio->bi_size) { | 7124 | if (map_length >= orig_bio->bi_size) { |
| 6437 | bio = orig_bio; | 7125 | bio = orig_bio; |
| 6438 | goto submit; | 7126 | goto submit; |
| 6439 | } | 7127 | } |
| 6440 | 7128 | ||
| 6441 | async_submit = 1; | 7129 | /* async crcs make it difficult to collect full stripe writes. */ |
| 7130 | if (btrfs_get_alloc_profile(root, 1) & | ||
| 7131 | (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) | ||
| 7132 | async_submit = 0; | ||
| 7133 | else | ||
| 7134 | async_submit = 1; | ||
| 7135 | |||
| 6442 | bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS); | 7136 | bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS); |
| 6443 | if (!bio) | 7137 | if (!bio) |
| 6444 | return -ENOMEM; | 7138 | return -ENOMEM; |
| @@ -6480,7 +7174,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, | |||
| 6480 | bio->bi_end_io = btrfs_end_dio_bio; | 7174 | bio->bi_end_io = btrfs_end_dio_bio; |
| 6481 | 7175 | ||
| 6482 | map_length = orig_bio->bi_size; | 7176 | map_length = orig_bio->bi_size; |
| 6483 | ret = btrfs_map_block(root->fs_info, READ, | 7177 | ret = btrfs_map_block(root->fs_info, rw, |
| 6484 | start_sector << 9, | 7178 | start_sector << 9, |
| 6485 | &map_length, NULL, 0); | 7179 | &map_length, NULL, 0); |
| 6486 | if (ret) { | 7180 | if (ret) { |
| @@ -6623,15 +7317,60 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, | |||
| 6623 | { | 7317 | { |
| 6624 | struct file *file = iocb->ki_filp; | 7318 | struct file *file = iocb->ki_filp; |
| 6625 | struct inode *inode = file->f_mapping->host; | 7319 | struct inode *inode = file->f_mapping->host; |
| 7320 | size_t count = 0; | ||
| 7321 | int flags = 0; | ||
| 7322 | bool wakeup = true; | ||
| 7323 | bool relock = false; | ||
| 7324 | ssize_t ret; | ||
| 6626 | 7325 | ||
| 6627 | if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov, | 7326 | if (check_direct_IO(BTRFS_I(inode)->root, rw, iocb, iov, |
| 6628 | offset, nr_segs)) | 7327 | offset, nr_segs)) |
| 6629 | return 0; | 7328 | return 0; |
| 6630 | 7329 | ||
| 6631 | return __blockdev_direct_IO(rw, iocb, inode, | 7330 | atomic_inc(&inode->i_dio_count); |
| 6632 | BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev, | 7331 | smp_mb__after_atomic_inc(); |
| 6633 | iov, offset, nr_segs, btrfs_get_blocks_direct, NULL, | 7332 | |
| 6634 | btrfs_submit_direct, 0); | 7333 | if (rw & WRITE) { |
| 7334 | count = iov_length(iov, nr_segs); | ||
| 7335 | /* | ||
| 7336 | * If the write DIO is beyond the EOF, we need to update | ||
| 7337 | * the isize, and that update is protected by i_mutex, so | ||
| 7338 | * we cannot unlock the i_mutex in this case. | ||
| 7339 | */ | ||
| 7340 | if (offset + count <= inode->i_size) { | ||
| 7341 | mutex_unlock(&inode->i_mutex); | ||
| 7342 | relock = true; | ||
| 7343 | } | ||
| 7344 | ret = btrfs_delalloc_reserve_space(inode, count); | ||
| 7345 | if (ret) | ||
| 7346 | goto out; | ||
| 7347 | } else if (unlikely(test_bit(BTRFS_INODE_READDIO_NEED_LOCK, | ||
| 7348 | &BTRFS_I(inode)->runtime_flags))) { | ||
| 7349 | inode_dio_done(inode); | ||
| 7350 | flags = DIO_LOCKING | DIO_SKIP_HOLES; | ||
| 7351 | wakeup = false; | ||
| 7352 | } | ||
| 7353 | |||
| 7354 | ret = __blockdev_direct_IO(rw, iocb, inode, | ||
| 7355 | BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev, | ||
| 7356 | iov, offset, nr_segs, btrfs_get_blocks_direct, NULL, | ||
| 7357 | btrfs_submit_direct, flags); | ||
| 7358 | if (rw & WRITE) { | ||
| 7359 | if (ret < 0 && ret != -EIOCBQUEUED) | ||
| 7360 | btrfs_delalloc_release_space(inode, count); | ||
| 7361 | else if (ret >= 0 && (size_t)ret < count) | ||
| 7362 | btrfs_delalloc_release_space(inode, | ||
| 7363 | count - (size_t)ret); | ||
| 7364 | else | ||
| 7365 | btrfs_delalloc_release_metadata(inode, 0); | ||
| 7366 | } | ||
| 7367 | out: | ||
| 7368 | if (wakeup) | ||
| 7369 | inode_dio_done(inode); | ||
| 7370 | if (relock) | ||
| 7371 | mutex_lock(&inode->i_mutex); | ||
| 7372 | |||
| 7373 | return ret; | ||
| 6635 | } | 7374 | } |
| 6636 | 7375 | ||
| 6637 | #define BTRFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC) | 7376 | #define BTRFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC) |
| @@ -6735,8 +7474,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned long offset) | |||
| 6735 | return; | 7474 | return; |
| 6736 | } | 7475 | } |
| 6737 | lock_extent_bits(tree, page_start, page_end, 0, &cached_state); | 7476 | lock_extent_bits(tree, page_start, page_end, 0, &cached_state); |
| 6738 | ordered = btrfs_lookup_ordered_extent(inode, | 7477 | ordered = btrfs_lookup_ordered_extent(inode, page_offset(page)); |
| 6739 | page_offset(page)); | ||
| 6740 | if (ordered) { | 7478 | if (ordered) { |
| 6741 | /* | 7479 | /* |
| 6742 | * IO on this page will never be started, so we need | 7480 | * IO on this page will never be started, so we need |
| @@ -7216,8 +7954,9 @@ int btrfs_drop_inode(struct inode *inode) | |||
| 7216 | { | 7954 | { |
| 7217 | struct btrfs_root *root = BTRFS_I(inode)->root; | 7955 | struct btrfs_root *root = BTRFS_I(inode)->root; |
| 7218 | 7956 | ||
| 7957 | /* the snap/subvol tree is being deleted */ | ||
| 7219 | if (btrfs_root_refs(&root->root_item) == 0 && | 7958 | if (btrfs_root_refs(&root->root_item) == 0 && |
| 7220 | !btrfs_is_free_space_inode(inode)) | 7959 | root != root->fs_info->tree_root) |
| 7221 | return 1; | 7960 | return 1; |
| 7222 | else | 7961 | else |
| 7223 | return generic_drop_inode(inode); | 7962 | return generic_drop_inode(inode); |
| @@ -7299,40 +8038,22 @@ fail: | |||
| 7299 | static int btrfs_getattr(struct vfsmount *mnt, | 8038 | static int btrfs_getattr(struct vfsmount *mnt, |
| 7300 | struct dentry *dentry, struct kstat *stat) | 8039 | struct dentry *dentry, struct kstat *stat) |
| 7301 | { | 8040 | { |
| 8041 | u64 delalloc_bytes; | ||
| 7302 | struct inode *inode = dentry->d_inode; | 8042 | struct inode *inode = dentry->d_inode; |
| 7303 | u32 blocksize = inode->i_sb->s_blocksize; | 8043 | u32 blocksize = inode->i_sb->s_blocksize; |
| 7304 | 8044 | ||
| 7305 | generic_fillattr(inode, stat); | 8045 | generic_fillattr(inode, stat); |
| 7306 | stat->dev = BTRFS_I(inode)->root->anon_dev; | 8046 | stat->dev = BTRFS_I(inode)->root->anon_dev; |
| 7307 | stat->blksize = PAGE_CACHE_SIZE; | 8047 | stat->blksize = PAGE_CACHE_SIZE; |
| 8048 | |||
| 8049 | spin_lock(&BTRFS_I(inode)->lock); | ||
| 8050 | delalloc_bytes = BTRFS_I(inode)->delalloc_bytes; | ||
| 8051 | spin_unlock(&BTRFS_I(inode)->lock); | ||
| 7308 | stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) + | 8052 | stat->blocks = (ALIGN(inode_get_bytes(inode), blocksize) + |
| 7309 | ALIGN(BTRFS_I(inode)->delalloc_bytes, blocksize)) >> 9; | 8053 | ALIGN(delalloc_bytes, blocksize)) >> 9; |
| 7310 | return 0; | 8054 | return 0; |
| 7311 | } | 8055 | } |
| 7312 | 8056 | ||
| 7313 | /* | ||
| 7314 | * If a file is moved, it will inherit the cow and compression flags of the new | ||
| 7315 | * directory. | ||
| 7316 | */ | ||
| 7317 | static void fixup_inode_flags(struct inode *dir, struct inode *inode) | ||
| 7318 | { | ||
| 7319 | struct btrfs_inode *b_dir = BTRFS_I(dir); | ||
| 7320 | struct btrfs_inode *b_inode = BTRFS_I(inode); | ||
| 7321 | |||
| 7322 | if (b_dir->flags & BTRFS_INODE_NODATACOW) | ||
| 7323 | b_inode->flags |= BTRFS_INODE_NODATACOW; | ||
| 7324 | else | ||
| 7325 | b_inode->flags &= ~BTRFS_INODE_NODATACOW; | ||
| 7326 | |||
| 7327 | if (b_dir->flags & BTRFS_INODE_COMPRESS) { | ||
| 7328 | b_inode->flags |= BTRFS_INODE_COMPRESS; | ||
| 7329 | b_inode->flags &= ~BTRFS_INODE_NOCOMPRESS; | ||
| 7330 | } else { | ||
| 7331 | b_inode->flags &= ~(BTRFS_INODE_COMPRESS | | ||
| 7332 | BTRFS_INODE_NOCOMPRESS); | ||
| 7333 | } | ||
| 7334 | } | ||
| 7335 | |||
| 7336 | static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, | 8057 | static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, |
| 7337 | struct inode *new_dir, struct dentry *new_dentry) | 8058 | struct inode *new_dir, struct dentry *new_dentry) |
| 7338 | { | 8059 | { |
| @@ -7498,8 +8219,6 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, | |||
| 7498 | } | 8219 | } |
| 7499 | } | 8220 | } |
| 7500 | 8221 | ||
| 7501 | fixup_inode_flags(new_dir, old_inode); | ||
| 7502 | |||
| 7503 | ret = btrfs_add_link(trans, new_dir, old_inode, | 8222 | ret = btrfs_add_link(trans, new_dir, old_inode, |
| 7504 | new_dentry->d_name.name, | 8223 | new_dentry->d_name.name, |
| 7505 | new_dentry->d_name.len, 0, index); | 8224 | new_dentry->d_name.len, 0, index); |
| @@ -7583,7 +8302,7 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) | |||
| 7583 | 8302 | ||
| 7584 | INIT_LIST_HEAD(&works); | 8303 | INIT_LIST_HEAD(&works); |
| 7585 | INIT_LIST_HEAD(&splice); | 8304 | INIT_LIST_HEAD(&splice); |
| 7586 | again: | 8305 | |
| 7587 | spin_lock(&root->fs_info->delalloc_lock); | 8306 | spin_lock(&root->fs_info->delalloc_lock); |
| 7588 | list_splice_init(&root->fs_info->delalloc_inodes, &splice); | 8307 | list_splice_init(&root->fs_info->delalloc_inodes, &splice); |
| 7589 | while (!list_empty(&splice)) { | 8308 | while (!list_empty(&splice)) { |
| @@ -7593,8 +8312,11 @@ again: | |||
| 7593 | list_del_init(&binode->delalloc_inodes); | 8312 | list_del_init(&binode->delalloc_inodes); |
| 7594 | 8313 | ||
| 7595 | inode = igrab(&binode->vfs_inode); | 8314 | inode = igrab(&binode->vfs_inode); |
| 7596 | if (!inode) | 8315 | if (!inode) { |
| 8316 | clear_bit(BTRFS_INODE_IN_DELALLOC_LIST, | ||
| 8317 | &binode->runtime_flags); | ||
| 7597 | continue; | 8318 | continue; |
| 8319 | } | ||
| 7598 | 8320 | ||
| 7599 | list_add_tail(&binode->delalloc_inodes, | 8321 | list_add_tail(&binode->delalloc_inodes, |
| 7600 | &root->fs_info->delalloc_inodes); | 8322 | &root->fs_info->delalloc_inodes); |
| @@ -7619,13 +8341,6 @@ again: | |||
| 7619 | btrfs_wait_and_free_delalloc_work(work); | 8341 | btrfs_wait_and_free_delalloc_work(work); |
| 7620 | } | 8342 | } |
| 7621 | 8343 | ||
| 7622 | spin_lock(&root->fs_info->delalloc_lock); | ||
| 7623 | if (!list_empty(&root->fs_info->delalloc_inodes)) { | ||
| 7624 | spin_unlock(&root->fs_info->delalloc_lock); | ||
| 7625 | goto again; | ||
| 7626 | } | ||
| 7627 | spin_unlock(&root->fs_info->delalloc_lock); | ||
| 7628 | |||
| 7629 | /* the filemap_flush will queue IO into the worker threads, but | 8344 | /* the filemap_flush will queue IO into the worker threads, but |
| 7630 | * we have to make sure the IO is actually started and that | 8345 | * we have to make sure the IO is actually started and that |
| 7631 | * ordered extents get created before we return | 8346 | * ordered extents get created before we return |
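The two hunks above drop the goto-again rescan from btrfs_start_delalloc_inodes(): with other tasks free to keep queueing delalloc inodes, re-splicing the list until it stayed empty could loop indefinitely, so a single pass over the snapshot taken under delalloc_lock is now considered enough. Clearing BTRFS_INODE_IN_DELALLOC_LIST when igrab() fails keeps runtime_flags consistent for an inode that is already on its way to being freed — a reading inferred from the diff itself, not from a changelog.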
| @@ -7801,8 +8516,9 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, | |||
| 7801 | } | 8516 | } |
| 7802 | } | 8517 | } |
| 7803 | 8518 | ||
| 7804 | ret = btrfs_reserve_extent(trans, root, num_bytes, min_size, | 8519 | ret = btrfs_reserve_extent(trans, root, |
| 7805 | 0, *alloc_hint, &ins, 1); | 8520 | min(num_bytes, 256ULL * 1024 * 1024), |
| 8521 | min_size, 0, *alloc_hint, &ins, 1); | ||
| 7806 | if (ret) { | 8522 | if (ret) { |
| 7807 | if (own_trans) | 8523 | if (own_trans) |
| 7808 | btrfs_end_transaction(trans, root); | 8524 | btrfs_end_transaction(trans, root); |
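The min() added above caps each reservation at 256MB, so a large preallocation request is carved into a series of smaller extents instead of one huge allocation the allocator may be unable to satisfy. A runnable sketch of the resulting chunking (illustration only, not the kernel loop):

    #include <stdio.h>
    #include <stdint.h>

    #define MIN(a, b) ((a) < (b) ? (a) : (b))

    int main(void)
    {
    	uint64_t num_bytes = 1ULL << 30;		/* 1 GiB request */
    	const uint64_t cap = 256ULL * 1024 * 1024;	/* per-extent cap */
    	int extents = 0;

    	while (num_bytes > 0) {
    		uint64_t chunk = MIN(num_bytes, cap);	/* the new clamp */
    		num_bytes -= chunk;
    		extents++;
    	}
    	printf("served in %d extents\n", extents);	/* prints 4 */
    	return 0;
    }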
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index c3f09f71bedd..c83086fdda05 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c | |||
| @@ -42,12 +42,12 @@ | |||
| 42 | #include <linux/slab.h> | 42 | #include <linux/slab.h> |
| 43 | #include <linux/blkdev.h> | 43 | #include <linux/blkdev.h> |
| 44 | #include <linux/uuid.h> | 44 | #include <linux/uuid.h> |
| 45 | #include <linux/btrfs.h> | ||
| 45 | #include "compat.h" | 46 | #include "compat.h" |
| 46 | #include "ctree.h" | 47 | #include "ctree.h" |
| 47 | #include "disk-io.h" | 48 | #include "disk-io.h" |
| 48 | #include "transaction.h" | 49 | #include "transaction.h" |
| 49 | #include "btrfs_inode.h" | 50 | #include "btrfs_inode.h" |
| 50 | #include "ioctl.h" | ||
| 51 | #include "print-tree.h" | 51 | #include "print-tree.h" |
| 52 | #include "volumes.h" | 52 | #include "volumes.h" |
| 53 | #include "locking.h" | 53 | #include "locking.h" |
| @@ -363,46 +363,52 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) | |||
| 363 | return 0; | 363 | return 0; |
| 364 | } | 364 | } |
| 365 | 365 | ||
| 366 | static noinline int create_subvol(struct btrfs_root *root, | 366 | static noinline int create_subvol(struct inode *dir, |
| 367 | struct dentry *dentry, | 367 | struct dentry *dentry, |
| 368 | char *name, int namelen, | 368 | char *name, int namelen, |
| 369 | u64 *async_transid, | 369 | u64 *async_transid, |
| 370 | struct btrfs_qgroup_inherit **inherit) | 370 | struct btrfs_qgroup_inherit *inherit) |
| 371 | { | 371 | { |
| 372 | struct btrfs_trans_handle *trans; | 372 | struct btrfs_trans_handle *trans; |
| 373 | struct btrfs_key key; | 373 | struct btrfs_key key; |
| 374 | struct btrfs_root_item root_item; | 374 | struct btrfs_root_item root_item; |
| 375 | struct btrfs_inode_item *inode_item; | 375 | struct btrfs_inode_item *inode_item; |
| 376 | struct extent_buffer *leaf; | 376 | struct extent_buffer *leaf; |
| 377 | struct btrfs_root *root = BTRFS_I(dir)->root; | ||
| 377 | struct btrfs_root *new_root; | 378 | struct btrfs_root *new_root; |
| 378 | struct dentry *parent = dentry->d_parent; | 379 | struct btrfs_block_rsv block_rsv; |
| 379 | struct inode *dir; | ||
| 380 | struct timespec cur_time = CURRENT_TIME; | 380 | struct timespec cur_time = CURRENT_TIME; |
| 381 | int ret; | 381 | int ret; |
| 382 | int err; | 382 | int err; |
| 383 | u64 objectid; | 383 | u64 objectid; |
| 384 | u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID; | 384 | u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID; |
| 385 | u64 index = 0; | 385 | u64 index = 0; |
| 386 | u64 qgroup_reserved; | ||
| 386 | uuid_le new_uuid; | 387 | uuid_le new_uuid; |
| 387 | 388 | ||
| 388 | ret = btrfs_find_free_objectid(root->fs_info->tree_root, &objectid); | 389 | ret = btrfs_find_free_objectid(root->fs_info->tree_root, &objectid); |
| 389 | if (ret) | 390 | if (ret) |
| 390 | return ret; | 391 | return ret; |
| 391 | 392 | ||
| 392 | dir = parent->d_inode; | 393 | btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP); |
| 393 | |||
| 394 | /* | 394 | /* |
| 395 | * 1 - inode item | 395 | * The same as the snapshot creation, please see the comment |
| 396 | * 2 - refs | 396 | * of create_snapshot(). |
| 397 | * 1 - root item | ||
| 398 | * 2 - dir items | ||
| 399 | */ | 397 | */ |
| 400 | trans = btrfs_start_transaction(root, 6); | 398 | ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, |
| 401 | if (IS_ERR(trans)) | 399 | 7, &qgroup_reserved); |
| 402 | return PTR_ERR(trans); | 400 | if (ret) |
| 401 | return ret; | ||
| 402 | |||
| 403 | trans = btrfs_start_transaction(root, 0); | ||
| 404 | if (IS_ERR(trans)) { | ||
| 405 | ret = PTR_ERR(trans); | ||
| 406 | goto out; | ||
| 407 | } | ||
| 408 | trans->block_rsv = &block_rsv; | ||
| 409 | trans->bytes_reserved = block_rsv.size; | ||
| 403 | 410 | ||
| 404 | ret = btrfs_qgroup_inherit(trans, root->fs_info, 0, objectid, | 411 | ret = btrfs_qgroup_inherit(trans, root->fs_info, 0, objectid, inherit); |
| 405 | inherit ? *inherit : NULL); | ||
| 406 | if (ret) | 412 | if (ret) |
| 407 | goto fail; | 413 | goto fail; |
| 408 | 414 | ||
| @@ -516,6 +522,8 @@ static noinline int create_subvol(struct btrfs_root *root, | |||
| 516 | BUG_ON(ret); | 522 | BUG_ON(ret); |
| 517 | 523 | ||
| 518 | fail: | 524 | fail: |
| 525 | trans->block_rsv = NULL; | ||
| 526 | trans->bytes_reserved = 0; | ||
| 519 | if (async_transid) { | 527 | if (async_transid) { |
| 520 | *async_transid = trans->transid; | 528 | *async_transid = trans->transid; |
| 521 | err = btrfs_commit_transaction_async(trans, root, 1); | 529 | err = btrfs_commit_transaction_async(trans, root, 1); |
| @@ -527,13 +535,15 @@ fail: | |||
| 527 | 535 | ||
| 528 | if (!ret) | 536 | if (!ret) |
| 529 | d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry)); | 537 | d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry)); |
| 530 | 538 | out: | |
| 539 | btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved); | ||
| 531 | return ret; | 540 | return ret; |
| 532 | } | 541 | } |
| 533 | 542 | ||
| 534 | static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, | 543 | static int create_snapshot(struct btrfs_root *root, struct inode *dir, |
| 535 | char *name, int namelen, u64 *async_transid, | 544 | struct dentry *dentry, char *name, int namelen, |
| 536 | bool readonly, struct btrfs_qgroup_inherit **inherit) | 545 | u64 *async_transid, bool readonly, |
| 546 | struct btrfs_qgroup_inherit *inherit) | ||
| 537 | { | 547 | { |
| 538 | struct inode *inode; | 548 | struct inode *inode; |
| 539 | struct btrfs_pending_snapshot *pending_snapshot; | 549 | struct btrfs_pending_snapshot *pending_snapshot; |
| @@ -549,23 +559,31 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, | |||
| 549 | 559 | ||
| 550 | btrfs_init_block_rsv(&pending_snapshot->block_rsv, | 560 | btrfs_init_block_rsv(&pending_snapshot->block_rsv, |
| 551 | BTRFS_BLOCK_RSV_TEMP); | 561 | BTRFS_BLOCK_RSV_TEMP); |
| 562 | /* | ||
| 563 | * 1 - parent dir inode | ||
| 564 | * 2 - dir entries | ||
| 565 | * 1 - root item | ||
| 566 | * 2 - root ref/backref | ||
| 567 | * 1 - root of snapshot | ||
| 568 | */ | ||
| 569 | ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root, | ||
| 570 | &pending_snapshot->block_rsv, 7, | ||
| 571 | &pending_snapshot->qgroup_reserved); | ||
| 572 | if (ret) | ||
| 573 | goto out; | ||
| 574 | |||
| 552 | pending_snapshot->dentry = dentry; | 575 | pending_snapshot->dentry = dentry; |
| 553 | pending_snapshot->root = root; | 576 | pending_snapshot->root = root; |
| 554 | pending_snapshot->readonly = readonly; | 577 | pending_snapshot->readonly = readonly; |
| 555 | if (inherit) { | 578 | pending_snapshot->dir = dir; |
| 556 | pending_snapshot->inherit = *inherit; | 579 | pending_snapshot->inherit = inherit; |
| 557 | *inherit = NULL; /* take responsibility to free it */ | ||
| 558 | } | ||
| 559 | 580 | ||
| 560 | trans = btrfs_start_transaction(root->fs_info->extent_root, 6); | 581 | trans = btrfs_start_transaction(root, 0); |
| 561 | if (IS_ERR(trans)) { | 582 | if (IS_ERR(trans)) { |
| 562 | ret = PTR_ERR(trans); | 583 | ret = PTR_ERR(trans); |
| 563 | goto fail; | 584 | goto fail; |
| 564 | } | 585 | } |
| 565 | 586 | ||
| 566 | ret = btrfs_snap_reserve_metadata(trans, pending_snapshot); | ||
| 567 | BUG_ON(ret); | ||
| 568 | |||
| 569 | spin_lock(&root->fs_info->trans_lock); | 587 | spin_lock(&root->fs_info->trans_lock); |
| 570 | list_add(&pending_snapshot->list, | 588 | list_add(&pending_snapshot->list, |
| 571 | &trans->transaction->pending_snapshots); | 589 | &trans->transaction->pending_snapshots); |
| @@ -602,6 +620,10 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, | |||
| 602 | d_instantiate(dentry, inode); | 620 | d_instantiate(dentry, inode); |
| 603 | ret = 0; | 621 | ret = 0; |
| 604 | fail: | 622 | fail: |
| 623 | btrfs_subvolume_release_metadata(BTRFS_I(dir)->root, | ||
| 624 | &pending_snapshot->block_rsv, | ||
| 625 | pending_snapshot->qgroup_reserved); | ||
| 626 | out: | ||
| 605 | kfree(pending_snapshot); | 627 | kfree(pending_snapshot); |
| 606 | return ret; | 628 | return ret; |
| 607 | } | 629 | } |
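The unit count passed to btrfs_subvolume_reserve_metadata() above is the sum of the enumerated tree operations: 1 (parent dir inode) + 2 (dir entries) + 1 (root item) + 2 (root ref/backref) + 1 (root of snapshot) = 7. create_subvol() reuses the same figure, and both error paths now release the reservation through btrfs_subvolume_release_metadata() whether or not the transaction ever started.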
| @@ -695,7 +717,7 @@ static noinline int btrfs_mksubvol(struct path *parent, | |||
| 695 | char *name, int namelen, | 717 | char *name, int namelen, |
| 696 | struct btrfs_root *snap_src, | 718 | struct btrfs_root *snap_src, |
| 697 | u64 *async_transid, bool readonly, | 719 | u64 *async_transid, bool readonly, |
| 698 | struct btrfs_qgroup_inherit **inherit) | 720 | struct btrfs_qgroup_inherit *inherit) |
| 699 | { | 721 | { |
| 700 | struct inode *dir = parent->dentry->d_inode; | 722 | struct inode *dir = parent->dentry->d_inode; |
| 701 | struct dentry *dentry; | 723 | struct dentry *dentry; |
| @@ -732,11 +754,11 @@ static noinline int btrfs_mksubvol(struct path *parent, | |||
| 732 | goto out_up_read; | 754 | goto out_up_read; |
| 733 | 755 | ||
| 734 | if (snap_src) { | 756 | if (snap_src) { |
| 735 | error = create_snapshot(snap_src, dentry, name, namelen, | 757 | error = create_snapshot(snap_src, dir, dentry, name, namelen, |
| 736 | async_transid, readonly, inherit); | 758 | async_transid, readonly, inherit); |
| 737 | } else { | 759 | } else { |
| 738 | error = create_subvol(BTRFS_I(dir)->root, dentry, | 760 | error = create_subvol(dir, dentry, name, namelen, |
| 739 | name, namelen, async_transid, inherit); | 761 | async_transid, inherit); |
| 740 | } | 762 | } |
| 741 | if (!error) | 763 | if (!error) |
| 742 | fsnotify_mkdir(dir, dentry); | 764 | fsnotify_mkdir(dir, dentry); |
| @@ -818,7 +840,7 @@ static int find_new_extents(struct btrfs_root *root, | |||
| 818 | 840 | ||
| 819 | while(1) { | 841 | while(1) { |
| 820 | ret = btrfs_search_forward(root, &min_key, &max_key, | 842 | ret = btrfs_search_forward(root, &min_key, &max_key, |
| 821 | path, 0, newer_than); | 843 | path, newer_than); |
| 822 | if (ret != 0) | 844 | if (ret != 0) |
| 823 | goto none; | 845 | goto none; |
| 824 | if (min_key.objectid != ino) | 846 | if (min_key.objectid != ino) |
| @@ -1206,6 +1228,12 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, | |||
| 1206 | if (!(inode->i_sb->s_flags & MS_ACTIVE)) | 1228 | if (!(inode->i_sb->s_flags & MS_ACTIVE)) |
| 1207 | break; | 1229 | break; |
| 1208 | 1230 | ||
| 1231 | if (btrfs_defrag_cancelled(root->fs_info)) { | ||
| 1232 | printk(KERN_DEBUG "btrfs: defrag_file cancelled\n"); | ||
| 1233 | ret = -EAGAIN; | ||
| 1234 | break; | ||
| 1235 | } | ||
| 1236 | |||
| 1209 | if (!should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT, | 1237 | if (!should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT, |
| 1210 | extent_thresh, &last_len, &skip, | 1238 | extent_thresh, &last_len, &skip, |
| 1211 | &defrag_end, range->flags & | 1239 | &defrag_end, range->flags & |
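With the new cancellation point, a long-running defrag can bail out mid-file with -EAGAIN whenever btrfs_defrag_cancelled() — defined outside this hunk — says so. A userspace caller that wants to finish the job can simply retry; the helper below is a hypothetical sketch assuming the uapi definitions from linux/btrfs.h:

    #include <errno.h>
    #include <sys/ioctl.h>
    #include <linux/btrfs.h>	/* BTRFS_IOC_DEFRAG_RANGE */

    /* Hypothetical helper: restart a range defrag that the kernel
     * cancelled partway through. */
    static int defrag_with_retry(int fd,
    			     struct btrfs_ioctl_defrag_range_args *args)
    {
    	int ret;

    	do {
    		ret = ioctl(fd, BTRFS_IOC_DEFRAG_RANGE, args);
    	} while (ret < 0 && errno == EAGAIN);
    	return ret;
    }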
| @@ -1329,9 +1357,6 @@ static noinline int btrfs_ioctl_resize(struct file *file, | |||
| 1329 | int ret = 0; | 1357 | int ret = 0; |
| 1330 | int mod = 0; | 1358 | int mod = 0; |
| 1331 | 1359 | ||
| 1332 | if (root->fs_info->sb->s_flags & MS_RDONLY) | ||
| 1333 | return -EROFS; | ||
| 1334 | |||
| 1335 | if (!capable(CAP_SYS_ADMIN)) | 1360 | if (!capable(CAP_SYS_ADMIN)) |
| 1336 | return -EPERM; | 1361 | return -EPERM; |
| 1337 | 1362 | ||
| @@ -1363,6 +1388,10 @@ static noinline int btrfs_ioctl_resize(struct file *file, | |||
| 1363 | *devstr = '\0'; | 1388 | *devstr = '\0'; |
| 1364 | devstr = vol_args->name; | 1389 | devstr = vol_args->name; |
| 1365 | devid = simple_strtoull(devstr, &end, 10); | 1390 | devid = simple_strtoull(devstr, &end, 10); |
| 1391 | if (!devid) { | ||
| 1392 | ret = -EINVAL; | ||
| 1393 | goto out_free; | ||
| 1394 | } | ||
| 1366 | printk(KERN_INFO "btrfs: resizing devid %llu\n", | 1395 | printk(KERN_INFO "btrfs: resizing devid %llu\n", |
| 1367 | (unsigned long long)devid); | 1396 | (unsigned long long)devid); |
| 1368 | } | 1397 | } |
| @@ -1371,7 +1400,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, | |||
| 1371 | if (!device) { | 1400 | if (!device) { |
| 1372 | printk(KERN_INFO "btrfs: resizer unable to find device %llu\n", | 1401 | printk(KERN_INFO "btrfs: resizer unable to find device %llu\n", |
| 1373 | (unsigned long long)devid); | 1402 | (unsigned long long)devid); |
| 1374 | ret = -EINVAL; | 1403 | ret = -ENODEV; |
| 1375 | goto out_free; | 1404 | goto out_free; |
| 1376 | } | 1405 | } |
| 1377 | 1406 | ||
| @@ -1379,7 +1408,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, | |||
| 1379 | printk(KERN_INFO "btrfs: resizer unable to apply on " | 1408 | printk(KERN_INFO "btrfs: resizer unable to apply on " |
| 1380 | "readonly device %llu\n", | 1409 | "readonly device %llu\n", |
| 1381 | (unsigned long long)devid); | 1410 | (unsigned long long)devid); |
| 1382 | ret = -EINVAL; | 1411 | ret = -EPERM; |
| 1383 | goto out_free; | 1412 | goto out_free; |
| 1384 | } | 1413 | } |
| 1385 | 1414 | ||
| @@ -1401,7 +1430,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, | |||
| 1401 | } | 1430 | } |
| 1402 | 1431 | ||
| 1403 | if (device->is_tgtdev_for_dev_replace) { | 1432 | if (device->is_tgtdev_for_dev_replace) { |
| 1404 | ret = -EINVAL; | 1433 | ret = -EPERM; |
| 1405 | goto out_free; | 1434 | goto out_free; |
| 1406 | } | 1435 | } |
| 1407 | 1436 | ||
| @@ -1457,7 +1486,7 @@ out: | |||
| 1457 | static noinline int btrfs_ioctl_snap_create_transid(struct file *file, | 1486 | static noinline int btrfs_ioctl_snap_create_transid(struct file *file, |
| 1458 | char *name, unsigned long fd, int subvol, | 1487 | char *name, unsigned long fd, int subvol, |
| 1459 | u64 *transid, bool readonly, | 1488 | u64 *transid, bool readonly, |
| 1460 | struct btrfs_qgroup_inherit **inherit) | 1489 | struct btrfs_qgroup_inherit *inherit) |
| 1461 | { | 1490 | { |
| 1462 | int namelen; | 1491 | int namelen; |
| 1463 | int ret = 0; | 1492 | int ret = 0; |
| @@ -1566,7 +1595,7 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file, | |||
| 1566 | 1595 | ||
| 1567 | ret = btrfs_ioctl_snap_create_transid(file, vol_args->name, | 1596 | ret = btrfs_ioctl_snap_create_transid(file, vol_args->name, |
| 1568 | vol_args->fd, subvol, ptr, | 1597 | vol_args->fd, subvol, ptr, |
| 1569 | readonly, &inherit); | 1598 | readonly, inherit); |
| 1570 | 1599 | ||
| 1571 | if (ret == 0 && ptr && | 1600 | if (ret == 0 && ptr && |
| 1572 | copy_to_user(arg + | 1601 | copy_to_user(arg + |
| @@ -1863,7 +1892,7 @@ static noinline int search_ioctl(struct inode *inode, | |||
| 1863 | path->keep_locks = 1; | 1892 | path->keep_locks = 1; |
| 1864 | 1893 | ||
| 1865 | while(1) { | 1894 | while(1) { |
| 1866 | ret = btrfs_search_forward(root, &key, &max_key, path, 0, | 1895 | ret = btrfs_search_forward(root, &key, &max_key, path, |
| 1867 | sk->min_transid); | 1896 | sk->min_transid); |
| 1868 | if (ret != 0) { | 1897 | if (ret != 0) { |
| 1869 | if (ret > 0) | 1898 | if (ret > 0) |
| @@ -2035,6 +2064,8 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, | |||
| 2035 | struct btrfs_root *dest = NULL; | 2064 | struct btrfs_root *dest = NULL; |
| 2036 | struct btrfs_ioctl_vol_args *vol_args; | 2065 | struct btrfs_ioctl_vol_args *vol_args; |
| 2037 | struct btrfs_trans_handle *trans; | 2066 | struct btrfs_trans_handle *trans; |
| 2067 | struct btrfs_block_rsv block_rsv; | ||
| 2068 | u64 qgroup_reserved; | ||
| 2038 | int namelen; | 2069 | int namelen; |
| 2039 | int ret; | 2070 | int ret; |
| 2040 | int err = 0; | 2071 | int err = 0; |
| @@ -2124,12 +2155,23 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, | |||
| 2124 | if (err) | 2155 | if (err) |
| 2125 | goto out_up_write; | 2156 | goto out_up_write; |
| 2126 | 2157 | ||
| 2158 | btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP); | ||
| 2159 | /* | ||
| 2160 | * One for dir inode, two for dir entries, two for root | ||
| 2161 | * ref/backref. | ||
| 2162 | */ | ||
| 2163 | err = btrfs_subvolume_reserve_metadata(root, &block_rsv, | ||
| 2164 | 5, &qgroup_reserved); | ||
| 2165 | if (err) | ||
| 2166 | goto out_up_write; | ||
| 2167 | |||
| 2127 | trans = btrfs_start_transaction(root, 0); | 2168 | trans = btrfs_start_transaction(root, 0); |
| 2128 | if (IS_ERR(trans)) { | 2169 | if (IS_ERR(trans)) { |
| 2129 | err = PTR_ERR(trans); | 2170 | err = PTR_ERR(trans); |
| 2130 | goto out_up_write; | 2171 | goto out_release; |
| 2131 | } | 2172 | } |
| 2132 | trans->block_rsv = &root->fs_info->global_block_rsv; | 2173 | trans->block_rsv = &block_rsv; |
| 2174 | trans->bytes_reserved = block_rsv.size; | ||
| 2133 | 2175 | ||
| 2134 | ret = btrfs_unlink_subvol(trans, root, dir, | 2176 | ret = btrfs_unlink_subvol(trans, root, dir, |
| 2135 | dest->root_key.objectid, | 2177 | dest->root_key.objectid, |
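Here the reservation comment sums to the 5 units requested: 1 (dir inode) + 2 (dir entries) + 2 (root ref/backref) = 5, mirroring the accounting style used by create_snapshot() above; if starting the transaction fails, the new out_release label returns the metadata before unwinding.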
| @@ -2159,10 +2201,14 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, | |||
| 2159 | } | 2201 | } |
| 2160 | } | 2202 | } |
| 2161 | out_end_trans: | 2203 | out_end_trans: |
| 2204 | trans->block_rsv = NULL; | ||
| 2205 | trans->bytes_reserved = 0; | ||
| 2162 | ret = btrfs_end_transaction(trans, root); | 2206 | ret = btrfs_end_transaction(trans, root); |
| 2163 | if (ret && !err) | 2207 | if (ret && !err) |
| 2164 | err = ret; | 2208 | err = ret; |
| 2165 | inode->i_flags |= S_DEAD; | 2209 | inode->i_flags |= S_DEAD; |
| 2210 | out_release: | ||
| 2211 | btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved); | ||
| 2166 | out_up_write: | 2212 | out_up_write: |
| 2167 | up_write(&root->fs_info->subvol_sem); | 2213 | up_write(&root->fs_info->subvol_sem); |
| 2168 | out_unlock: | 2214 | out_unlock: |
| @@ -2171,6 +2217,12 @@ out_unlock: | |||
| 2171 | shrink_dcache_sb(root->fs_info->sb); | 2217 | shrink_dcache_sb(root->fs_info->sb); |
| 2172 | btrfs_invalidate_inodes(dest); | 2218 | btrfs_invalidate_inodes(dest); |
| 2173 | d_delete(dentry); | 2219 | d_delete(dentry); |
| 2220 | |||
| 2221 | /* the last ref */ | ||
| 2222 | if (dest->cache_inode) { | ||
| 2223 | iput(dest->cache_inode); | ||
| 2224 | dest->cache_inode = NULL; | ||
| 2225 | } | ||
| 2174 | } | 2226 | } |
| 2175 | out_dput: | 2227 | out_dput: |
| 2176 | dput(dentry); | 2228 | dput(dentry); |
| @@ -2211,10 +2263,10 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) | |||
| 2211 | ret = -EPERM; | 2263 | ret = -EPERM; |
| 2212 | goto out; | 2264 | goto out; |
| 2213 | } | 2265 | } |
| 2214 | ret = btrfs_defrag_root(root, 0); | 2266 | ret = btrfs_defrag_root(root); |
| 2215 | if (ret) | 2267 | if (ret) |
| 2216 | goto out; | 2268 | goto out; |
| 2217 | ret = btrfs_defrag_root(root->fs_info->extent_root, 0); | 2269 | ret = btrfs_defrag_root(root->fs_info->extent_root); |
| 2218 | break; | 2270 | break; |
| 2219 | case S_IFREG: | 2271 | case S_IFREG: |
| 2220 | if (!(file->f_mode & FMODE_WRITE)) { | 2272 | if (!(file->f_mode & FMODE_WRITE)) { |
| @@ -3111,7 +3163,7 @@ static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root, | |||
| 3111 | u64 transid; | 3163 | u64 transid; |
| 3112 | int ret; | 3164 | int ret; |
| 3113 | 3165 | ||
| 3114 | trans = btrfs_attach_transaction(root); | 3166 | trans = btrfs_attach_transaction_barrier(root); |
| 3115 | if (IS_ERR(trans)) { | 3167 | if (IS_ERR(trans)) { |
| 3116 | if (PTR_ERR(trans) != -ENOENT) | 3168 | if (PTR_ERR(trans) != -ENOENT) |
| 3117 | return PTR_ERR(trans); | 3169 | return PTR_ERR(trans); |
| @@ -3289,7 +3341,7 @@ static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg) | |||
| 3289 | struct inode_fs_paths *ipath = NULL; | 3341 | struct inode_fs_paths *ipath = NULL; |
| 3290 | struct btrfs_path *path; | 3342 | struct btrfs_path *path; |
| 3291 | 3343 | ||
| 3292 | if (!capable(CAP_SYS_ADMIN)) | 3344 | if (!capable(CAP_DAC_READ_SEARCH)) |
| 3293 | return -EPERM; | 3345 | return -EPERM; |
| 3294 | 3346 | ||
| 3295 | path = btrfs_alloc_path(); | 3347 | path = btrfs_alloc_path(); |
| @@ -3914,6 +3966,65 @@ out: | |||
| 3914 | return ret; | 3966 | return ret; |
| 3915 | } | 3967 | } |
| 3916 | 3968 | ||
| 3969 | static int btrfs_ioctl_get_fslabel(struct file *file, void __user *arg) | ||
| 3970 | { | ||
| 3971 | struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; | ||
| 3972 | const char *label = root->fs_info->super_copy->label; | ||
| 3973 | size_t len = strnlen(label, BTRFS_LABEL_SIZE); | ||
| 3974 | int ret; | ||
| 3975 | |||
| 3976 | if (len == BTRFS_LABEL_SIZE) { | ||
| 3977 | pr_warn("btrfs: label is too long, returning the first %zu bytes\n", | ||
| 3978 | --len); | ||
| 3979 | } | ||
| 3980 | |||
| 3981 | mutex_lock(&root->fs_info->volume_mutex); | ||
| 3982 | ret = copy_to_user(arg, label, len); | ||
| 3983 | mutex_unlock(&root->fs_info->volume_mutex); | ||
| 3984 | |||
| 3985 | return ret ? -EFAULT : 0; | ||
| 3986 | } | ||
| 3987 | |||
| 3988 | static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg) | ||
| 3989 | { | ||
| 3990 | struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; | ||
| 3991 | struct btrfs_super_block *super_block = root->fs_info->super_copy; | ||
| 3992 | struct btrfs_trans_handle *trans; | ||
| 3993 | char label[BTRFS_LABEL_SIZE]; | ||
| 3994 | int ret; | ||
| 3995 | |||
| 3996 | if (!capable(CAP_SYS_ADMIN)) | ||
| 3997 | return -EPERM; | ||
| 3998 | |||
| 3999 | if (copy_from_user(label, arg, sizeof(label))) | ||
| 4000 | return -EFAULT; | ||
| 4001 | |||
| 4002 | if (strnlen(label, BTRFS_LABEL_SIZE) == BTRFS_LABEL_SIZE) { | ||
| 4003 | pr_err("btrfs: unable to set label with more than %d bytes\n", | ||
| 4004 | BTRFS_LABEL_SIZE - 1); | ||
| 4005 | return -EINVAL; | ||
| 4006 | } | ||
| 4007 | |||
| 4008 | ret = mnt_want_write_file(file); | ||
| 4009 | if (ret) | ||
| 4010 | return ret; | ||
| 4011 | |||
| 4012 | mutex_lock(&root->fs_info->volume_mutex); | ||
| 4013 | trans = btrfs_start_transaction(root, 0); | ||
| 4014 | if (IS_ERR(trans)) { | ||
| 4015 | ret = PTR_ERR(trans); | ||
| 4016 | goto out_unlock; | ||
| 4017 | } | ||
| 4018 | |||
| 4019 | strcpy(super_block->label, label); | ||
| 4020 | ret = btrfs_end_transaction(trans, root); | ||
| 4021 | |||
| 4022 | out_unlock: | ||
| 4023 | mutex_unlock(&root->fs_info->volume_mutex); | ||
| 4024 | mnt_drop_write_file(file); | ||
| 4025 | return ret; | ||
| 4026 | } | ||
| 4027 | |||
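The two new ioctls give userspace a race-free way to read and write the label stored in the superblock (the set path goes through a transaction, the get path through volume_mutex). A minimal userspace sketch — the mount point is a placeholder, and the macro definitions are assumed to come from include/uapi/linux/btrfs.h:

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/btrfs.h>	/* BTRFS_IOC_{GET,SET}_FSLABEL, BTRFS_LABEL_SIZE */

    int main(void)
    {
    	char label[BTRFS_LABEL_SIZE] = { 0 };
    	int fd = open("/mnt/btrfs", O_RDONLY);	/* hypothetical mount point */

    	if (fd < 0)
    		return 1;
    	if (ioctl(fd, BTRFS_IOC_GET_FSLABEL, label) == 0)
    		printf("label: %s\n", label);

    	/* must stay shorter than BTRFS_LABEL_SIZE or the kernel rejects it */
    	strncpy(label, "backups", BTRFS_LABEL_SIZE - 1);
    	if (ioctl(fd, BTRFS_IOC_SET_FSLABEL, label) != 0)
    		perror("set label");
    	close(fd);
    	return 0;
    }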
| 3917 | long btrfs_ioctl(struct file *file, unsigned int | 4028 | long btrfs_ioctl(struct file *file, unsigned int |
| 3918 | cmd, unsigned long arg) | 4029 | cmd, unsigned long arg) |
| 3919 | { | 4030 | { |
| @@ -4014,6 +4125,10 @@ long btrfs_ioctl(struct file *file, unsigned int | |||
| 4014 | return btrfs_ioctl_qgroup_limit(file, argp); | 4125 | return btrfs_ioctl_qgroup_limit(file, argp); |
| 4015 | case BTRFS_IOC_DEV_REPLACE: | 4126 | case BTRFS_IOC_DEV_REPLACE: |
| 4016 | return btrfs_ioctl_dev_replace(root, argp); | 4127 | return btrfs_ioctl_dev_replace(root, argp); |
| 4128 | case BTRFS_IOC_GET_FSLABEL: | ||
| 4129 | return btrfs_ioctl_get_fslabel(file, argp); | ||
| 4130 | case BTRFS_IOC_SET_FSLABEL: | ||
| 4131 | return btrfs_ioctl_set_fslabel(file, argp); | ||
| 4017 | } | 4132 | } |
| 4018 | 4133 | ||
| 4019 | return -ENOTTY; | 4134 | return -ENOTTY; |
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h deleted file mode 100644 index dabca9cc8c2e..000000000000 --- a/fs/btrfs/ioctl.h +++ /dev/null | |||
| @@ -1,502 +0,0 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (C) 2007 Oracle. All rights reserved. | ||
| 3 | * | ||
| 4 | * This program is free software; you can redistribute it and/or | ||
| 5 | * modify it under the terms of the GNU General Public | ||
| 6 | * License v2 as published by the Free Software Foundation. | ||
| 7 | * | ||
| 8 | * This program is distributed in the hope that it will be useful, | ||
| 9 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 10 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
| 11 | * General Public License for more details. | ||
| 12 | * | ||
| 13 | * You should have received a copy of the GNU General Public | ||
| 14 | * License along with this program; if not, write to the | ||
| 15 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
| 16 | * Boston, MA 021110-1307, USA. | ||
| 17 | */ | ||
| 18 | |||
| 19 | #ifndef __IOCTL_ | ||
| 20 | #define __IOCTL_ | ||
| 21 | #include <linux/ioctl.h> | ||
| 22 | |||
| 23 | #define BTRFS_IOCTL_MAGIC 0x94 | ||
| 24 | #define BTRFS_VOL_NAME_MAX 255 | ||
| 25 | |||
| 26 | /* this should be 4k */ | ||
| 27 | #define BTRFS_PATH_NAME_MAX 4087 | ||
| 28 | struct btrfs_ioctl_vol_args { | ||
| 29 | __s64 fd; | ||
| 30 | char name[BTRFS_PATH_NAME_MAX + 1]; | ||
| 31 | }; | ||
| 32 | |||
| 33 | #define BTRFS_DEVICE_PATH_NAME_MAX 1024 | ||
| 34 | |||
| 35 | #define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0) | ||
| 36 | #define BTRFS_SUBVOL_RDONLY (1ULL << 1) | ||
| 37 | #define BTRFS_SUBVOL_QGROUP_INHERIT (1ULL << 2) | ||
| 38 | #define BTRFS_FSID_SIZE 16 | ||
| 39 | #define BTRFS_UUID_SIZE 16 | ||
| 40 | |||
| 41 | #define BTRFS_QGROUP_INHERIT_SET_LIMITS (1ULL << 0) | ||
| 42 | |||
| 43 | struct btrfs_qgroup_limit { | ||
| 44 | __u64 flags; | ||
| 45 | __u64 max_rfer; | ||
| 46 | __u64 max_excl; | ||
| 47 | __u64 rsv_rfer; | ||
| 48 | __u64 rsv_excl; | ||
| 49 | }; | ||
| 50 | |||
| 51 | struct btrfs_qgroup_inherit { | ||
| 52 | __u64 flags; | ||
| 53 | __u64 num_qgroups; | ||
| 54 | __u64 num_ref_copies; | ||
| 55 | __u64 num_excl_copies; | ||
| 56 | struct btrfs_qgroup_limit lim; | ||
| 57 | __u64 qgroups[0]; | ||
| 58 | }; | ||
| 59 | |||
| 60 | struct btrfs_ioctl_qgroup_limit_args { | ||
| 61 | __u64 qgroupid; | ||
| 62 | struct btrfs_qgroup_limit lim; | ||
| 63 | }; | ||
| 64 | |||
| 65 | #define BTRFS_SUBVOL_NAME_MAX 4039 | ||
| 66 | struct btrfs_ioctl_vol_args_v2 { | ||
| 67 | __s64 fd; | ||
| 68 | __u64 transid; | ||
| 69 | __u64 flags; | ||
| 70 | union { | ||
| 71 | struct { | ||
| 72 | __u64 size; | ||
| 73 | struct btrfs_qgroup_inherit __user *qgroup_inherit; | ||
| 74 | }; | ||
| 75 | __u64 unused[4]; | ||
| 76 | }; | ||
| 77 | char name[BTRFS_SUBVOL_NAME_MAX + 1]; | ||
| 78 | }; | ||
| 79 | |||
| 80 | /* | ||
| 81 | * structure to report errors and progress to userspace, either as a | ||
| 82 | * result of a finished scrub, a canceled scrub or a progress inquiry | ||
| 83 | */ | ||
| 84 | struct btrfs_scrub_progress { | ||
| 85 | __u64 data_extents_scrubbed; /* # of data extents scrubbed */ | ||
| 86 | __u64 tree_extents_scrubbed; /* # of tree extents scrubbed */ | ||
| 87 | __u64 data_bytes_scrubbed; /* # of data bytes scrubbed */ | ||
| 88 | __u64 tree_bytes_scrubbed; /* # of tree bytes scrubbed */ | ||
| 89 | __u64 read_errors; /* # of read errors encountered (EIO) */ | ||
| 90 | __u64 csum_errors; /* # of failed csum checks */ | ||
| 91 | __u64 verify_errors; /* # of occurrences where the metadata | ||
| 92 | * of a tree block did not match the | ||
| 93 | * expected values, like generation or | ||
| 94 | * logical */ | ||
| 95 | __u64 no_csum; /* # of 4k data blocks for which no csum | ||
| 96 | * is present, probably the result of | ||
| 97 | * data written with nodatasum */ | ||
| 98 | __u64 csum_discards; /* # of csum for which no data was found | ||
| 99 | * in the extent tree. */ | ||
| 100 | __u64 super_errors; /* # of bad super blocks encountered */ | ||
| 101 | __u64 malloc_errors; /* # of internal kmalloc errors. These | ||
| 102 | * will likely cause an incomplete | ||
| 103 | * scrub */ | ||
| 104 | __u64 uncorrectable_errors; /* # of errors where either no intact | ||
| 105 | * copy was found or the writeback | ||
| 106 | * failed */ | ||
| 107 | __u64 corrected_errors; /* # of errors corrected */ | ||
| 108 | __u64 last_physical; /* last physical address scrubbed. In | ||
| 109 | * case a scrub was aborted, this can | ||
| 110 | * be used to restart the scrub */ | ||
| 111 | __u64 unverified_errors; /* # of occurrences where a read for a | ||
| 112 | * full (64k) bio failed, but the re- | ||
| 113 | * check succeeded for each 4k piece. | ||
| 114 | * Intermittent error. */ | ||
| 115 | }; | ||
| 116 | |||
| 117 | #define BTRFS_SCRUB_READONLY 1 | ||
| 118 | struct btrfs_ioctl_scrub_args { | ||
| 119 | __u64 devid; /* in */ | ||
| 120 | __u64 start; /* in */ | ||
| 121 | __u64 end; /* in */ | ||
| 122 | __u64 flags; /* in */ | ||
| 123 | struct btrfs_scrub_progress progress; /* out */ | ||
| 124 | /* pad to 1k */ | ||
| 125 | __u64 unused[(1024-32-sizeof(struct btrfs_scrub_progress))/8]; | ||
| 126 | }; | ||
| 127 | |||
| 128 | #define BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_ALWAYS 0 | ||
| 129 | #define BTRFS_IOCTL_DEV_REPLACE_CONT_READING_FROM_SRCDEV_MODE_AVOID 1 | ||
| 130 | struct btrfs_ioctl_dev_replace_start_params { | ||
| 131 | __u64 srcdevid; /* in, if 0, use srcdev_name instead */ | ||
| 132 | __u64 cont_reading_from_srcdev_mode; /* in, see #define | ||
| 133 | * above */ | ||
| 134 | __u8 srcdev_name[BTRFS_DEVICE_PATH_NAME_MAX + 1]; /* in */ | ||
| 135 | __u8 tgtdev_name[BTRFS_DEVICE_PATH_NAME_MAX + 1]; /* in */ | ||
| 136 | }; | ||
| 137 | |||
| 138 | #define BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED 0 | ||
| 139 | #define BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED 1 | ||
| 140 | #define BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED 2 | ||
| 141 | #define BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED 3 | ||
| 142 | #define BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED 4 | ||
| 143 | struct btrfs_ioctl_dev_replace_status_params { | ||
| 144 | __u64 replace_state; /* out, see #define above */ | ||
| 145 | __u64 progress_1000; /* out, 0 <= x <= 1000 */ | ||
| 146 | __u64 time_started; /* out, seconds since 1-Jan-1970 */ | ||
| 147 | __u64 time_stopped; /* out, seconds since 1-Jan-1970 */ | ||
| 148 | __u64 num_write_errors; /* out */ | ||
| 149 | __u64 num_uncorrectable_read_errors; /* out */ | ||
| 150 | }; | ||
| 151 | |||
| 152 | #define BTRFS_IOCTL_DEV_REPLACE_CMD_START 0 | ||
| 153 | #define BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS 1 | ||
| 154 | #define BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL 2 | ||
| 155 | #define BTRFS_IOCTL_DEV_REPLACE_RESULT_NO_ERROR 0 | ||
| 156 | #define BTRFS_IOCTL_DEV_REPLACE_RESULT_NOT_STARTED 1 | ||
| 157 | #define BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED 2 | ||
| 158 | struct btrfs_ioctl_dev_replace_args { | ||
| 159 | __u64 cmd; /* in */ | ||
| 160 | __u64 result; /* out */ | ||
| 161 | |||
| 162 | union { | ||
| 163 | struct btrfs_ioctl_dev_replace_start_params start; | ||
| 164 | struct btrfs_ioctl_dev_replace_status_params status; | ||
| 165 | }; /* in/out */ | ||
| 166 | |||
| 167 | __u64 spare[64]; | ||
| 168 | }; | ||
| 169 | |||
| 170 | struct btrfs_ioctl_dev_info_args { | ||
| 171 | __u64 devid; /* in/out */ | ||
| 172 | __u8 uuid[BTRFS_UUID_SIZE]; /* in/out */ | ||
| 173 | __u64 bytes_used; /* out */ | ||
| 174 | __u64 total_bytes; /* out */ | ||
| 175 | __u64 unused[379]; /* pad to 4k */ | ||
| 176 | __u8 path[BTRFS_DEVICE_PATH_NAME_MAX]; /* out */ | ||
| 177 | }; | ||
| 178 | |||
| 179 | struct btrfs_ioctl_fs_info_args { | ||
| 180 | __u64 max_id; /* out */ | ||
| 181 | __u64 num_devices; /* out */ | ||
| 182 | __u8 fsid[BTRFS_FSID_SIZE]; /* out */ | ||
| 183 | __u64 reserved[124]; /* pad to 1k */ | ||
| 184 | }; | ||
| 185 | |||
| 186 | /* balance control ioctl modes */ | ||
| 187 | #define BTRFS_BALANCE_CTL_PAUSE 1 | ||
| 188 | #define BTRFS_BALANCE_CTL_CANCEL 2 | ||
| 189 | |||
| 190 | /* | ||
| 191 | * this is packed, because it should be exactly the same as its disk | ||
| 192 | * byte order counterpart (struct btrfs_disk_balance_args) | ||
| 193 | */ | ||
| 194 | struct btrfs_balance_args { | ||
| 195 | __u64 profiles; | ||
| 196 | __u64 usage; | ||
| 197 | __u64 devid; | ||
| 198 | __u64 pstart; | ||
| 199 | __u64 pend; | ||
| 200 | __u64 vstart; | ||
| 201 | __u64 vend; | ||
| 202 | |||
| 203 | __u64 target; | ||
| 204 | |||
| 205 | __u64 flags; | ||
| 206 | |||
| 207 | __u64 unused[8]; | ||
| 208 | } __attribute__ ((__packed__)); | ||
| 209 | |||
| 210 | /* report balance progress to userspace */ | ||
| 211 | struct btrfs_balance_progress { | ||
| 212 | __u64 expected; /* estimated # of chunks that will be | ||
| 213 | * relocated to fulfill the request */ | ||
| 214 | __u64 considered; /* # of chunks we have considered so far */ | ||
| 215 | __u64 completed; /* # of chunks relocated so far */ | ||
| 216 | }; | ||
| 217 | |||
| 218 | #define BTRFS_BALANCE_STATE_RUNNING (1ULL << 0) | ||
| 219 | #define BTRFS_BALANCE_STATE_PAUSE_REQ (1ULL << 1) | ||
| 220 | #define BTRFS_BALANCE_STATE_CANCEL_REQ (1ULL << 2) | ||
| 221 | |||
| 222 | struct btrfs_ioctl_balance_args { | ||
| 223 | __u64 flags; /* in/out */ | ||
| 224 | __u64 state; /* out */ | ||
| 225 | |||
| 226 | struct btrfs_balance_args data; /* in/out */ | ||
| 227 | struct btrfs_balance_args meta; /* in/out */ | ||
| 228 | struct btrfs_balance_args sys; /* in/out */ | ||
| 229 | |||
| 230 | struct btrfs_balance_progress stat; /* out */ | ||
| 231 | |||
| 232 | __u64 unused[72]; /* pad to 1k */ | ||
| 233 | }; | ||
| 234 | |||
| 235 | #define BTRFS_INO_LOOKUP_PATH_MAX 4080 | ||
| 236 | struct btrfs_ioctl_ino_lookup_args { | ||
| 237 | __u64 treeid; | ||
| 238 | __u64 objectid; | ||
| 239 | char name[BTRFS_INO_LOOKUP_PATH_MAX]; | ||
| 240 | }; | ||
| 241 | |||
| 242 | struct btrfs_ioctl_search_key { | ||
| 243 | /* which root are we searching. 0 is the tree of tree roots */ | ||
| 244 | __u64 tree_id; | ||
| 245 | |||
| 246 | /* keys returned will be >= min and <= max */ | ||
| 247 | __u64 min_objectid; | ||
| 248 | __u64 max_objectid; | ||
| 249 | |||
| 250 | /* keys returned will be >= min and <= max */ | ||
| 251 | __u64 min_offset; | ||
| 252 | __u64 max_offset; | ||
| 253 | |||
| 254 | /* max and min transids to search for */ | ||
| 255 | __u64 min_transid; | ||
| 256 | __u64 max_transid; | ||
| 257 | |||
| 258 | /* keys returned will be >= min and <= max */ | ||
| 259 | __u32 min_type; | ||
| 260 | __u32 max_type; | ||
| 261 | |||
| 262 | /* | ||
| 263 | * how many items did userland ask for, and how many are we | ||
| 264 | * returning | ||
| 265 | */ | ||
| 266 | __u32 nr_items; | ||
| 267 | |||
| 268 | /* align to 64 bits */ | ||
| 269 | __u32 unused; | ||
| 270 | |||
| 271 | /* some extra for later */ | ||
| 272 | __u64 unused1; | ||
| 273 | __u64 unused2; | ||
| 274 | __u64 unused3; | ||
| 275 | __u64 unused4; | ||
| 276 | }; | ||
| 277 | |||
| 278 | struct btrfs_ioctl_search_header { | ||
| 279 | __u64 transid; | ||
| 280 | __u64 objectid; | ||
| 281 | __u64 offset; | ||
| 282 | __u32 type; | ||
| 283 | __u32 len; | ||
| 284 | }; | ||
| 285 | |||
| 286 | #define BTRFS_SEARCH_ARGS_BUFSIZE (4096 - sizeof(struct btrfs_ioctl_search_key)) | ||
| 287 | /* | ||
| 288 | * the buf is an array of search headers where | ||
| 289 | * each header is followed by the actual item | ||
| 290 | * the type field is expanded to 32 bits for alignment | ||
| 291 | */ | ||
| 292 | struct btrfs_ioctl_search_args { | ||
| 293 | struct btrfs_ioctl_search_key key; | ||
| 294 | char buf[BTRFS_SEARCH_ARGS_BUFSIZE]; | ||
| 295 | }; | ||
| 296 | |||
| 297 | struct btrfs_ioctl_clone_range_args { | ||
| 298 | __s64 src_fd; | ||
| 299 | __u64 src_offset, src_length; | ||
| 300 | __u64 dest_offset; | ||
| 301 | }; | ||
| 302 | |||
| 303 | /* flags for the defrag range ioctl */ | ||
| 304 | #define BTRFS_DEFRAG_RANGE_COMPRESS 1 | ||
| 305 | #define BTRFS_DEFRAG_RANGE_START_IO 2 | ||
| 306 | |||
| 307 | struct btrfs_ioctl_space_info { | ||
| 308 | __u64 flags; | ||
| 309 | __u64 total_bytes; | ||
| 310 | __u64 used_bytes; | ||
| 311 | }; | ||
| 312 | |||
| 313 | struct btrfs_ioctl_space_args { | ||
| 314 | __u64 space_slots; | ||
| 315 | __u64 total_spaces; | ||
| 316 | struct btrfs_ioctl_space_info spaces[0]; | ||
| 317 | }; | ||
| 318 | |||
| 319 | struct btrfs_data_container { | ||
| 320 | __u32 bytes_left; /* out -- bytes not needed to deliver output */ | ||
| 321 | __u32 bytes_missing; /* out -- additional bytes needed for result */ | ||
| 322 | __u32 elem_cnt; /* out */ | ||
| 323 | __u32 elem_missed; /* out */ | ||
| 324 | __u64 val[0]; /* out */ | ||
| 325 | }; | ||
| 326 | |||
| 327 | struct btrfs_ioctl_ino_path_args { | ||
| 328 | __u64 inum; /* in */ | ||
| 329 | __u64 size; /* in */ | ||
| 330 | __u64 reserved[4]; | ||
| 331 | /* struct btrfs_data_container *fspath; out */ | ||
| 332 | __u64 fspath; /* out */ | ||
| 333 | }; | ||
| 334 | |||
| 335 | struct btrfs_ioctl_logical_ino_args { | ||
| 336 | __u64 logical; /* in */ | ||
| 337 | __u64 size; /* in */ | ||
| 338 | __u64 reserved[4]; | ||
| 339 | /* struct btrfs_data_container *inodes; out */ | ||
| 340 | __u64 inodes; | ||
| 341 | }; | ||
| 342 | |||
| 343 | enum btrfs_dev_stat_values { | ||
| 344 | /* disk I/O failure stats */ | ||
| 345 | BTRFS_DEV_STAT_WRITE_ERRS, /* EIO or EREMOTEIO from lower layers */ | ||
| 346 | BTRFS_DEV_STAT_READ_ERRS, /* EIO or EREMOTEIO from lower layers */ | ||
| 347 | BTRFS_DEV_STAT_FLUSH_ERRS, /* EIO or EREMOTEIO from lower layers */ | ||
| 348 | |||
| 349 | /* stats for indirect indications for I/O failures */ | ||
| 350 | BTRFS_DEV_STAT_CORRUPTION_ERRS, /* checksum error, bytenr error or | ||
| 351 | * contents is illegal: this is an | ||
| 352 | * indication that the block was damaged | ||
| 353 | * during read or write, or written to | ||
| 354 | * wrong location or read from wrong | ||
| 355 | * location */ | ||
| 356 | BTRFS_DEV_STAT_GENERATION_ERRS, /* an indication that blocks have not | ||
| 357 | * been written */ | ||
| 358 | |||
| 359 | BTRFS_DEV_STAT_VALUES_MAX | ||
| 360 | }; | ||
| 361 | |||
| 362 | /* Reset statistics after reading; needs SYS_ADMIN capability */ | ||
| 363 | #define BTRFS_DEV_STATS_RESET (1ULL << 0) | ||
| 364 | |||
| 365 | struct btrfs_ioctl_get_dev_stats { | ||
| 366 | __u64 devid; /* in */ | ||
| 367 | __u64 nr_items; /* in/out */ | ||
| 368 | __u64 flags; /* in/out */ | ||
| 369 | |||
| 370 | /* out values: */ | ||
| 371 | __u64 values[BTRFS_DEV_STAT_VALUES_MAX]; | ||
| 372 | |||
| 373 | __u64 unused[128 - 2 - BTRFS_DEV_STAT_VALUES_MAX]; /* pad to 1k */ | ||
| 374 | }; | ||
| 375 | |||
| 376 | #define BTRFS_QUOTA_CTL_ENABLE 1 | ||
| 377 | #define BTRFS_QUOTA_CTL_DISABLE 2 | ||
| 378 | #define BTRFS_QUOTA_CTL_RESCAN 3 | ||
| 379 | struct btrfs_ioctl_quota_ctl_args { | ||
| 380 | __u64 cmd; | ||
| 381 | __u64 status; | ||
| 382 | }; | ||
| 383 | |||
| 384 | struct btrfs_ioctl_qgroup_assign_args { | ||
| 385 | __u64 assign; | ||
| 386 | __u64 src; | ||
| 387 | __u64 dst; | ||
| 388 | }; | ||
| 389 | |||
| 390 | struct btrfs_ioctl_qgroup_create_args { | ||
| 391 | __u64 create; | ||
| 392 | __u64 qgroupid; | ||
| 393 | }; | ||
| 394 | struct btrfs_ioctl_timespec { | ||
| 395 | __u64 sec; | ||
| 396 | __u32 nsec; | ||
| 397 | }; | ||
| 398 | |||
| 399 | struct btrfs_ioctl_received_subvol_args { | ||
| 400 | char uuid[BTRFS_UUID_SIZE]; /* in */ | ||
| 401 | __u64 stransid; /* in */ | ||
| 402 | __u64 rtransid; /* out */ | ||
| 403 | struct btrfs_ioctl_timespec stime; /* in */ | ||
| 404 | struct btrfs_ioctl_timespec rtime; /* out */ | ||
| 405 | __u64 flags; /* in */ | ||
| 406 | __u64 reserved[16]; /* in */ | ||
| 407 | }; | ||
| 408 | |||
| 409 | struct btrfs_ioctl_send_args { | ||
| 410 | __s64 send_fd; /* in */ | ||
| 411 | __u64 clone_sources_count; /* in */ | ||
| 412 | __u64 __user *clone_sources; /* in */ | ||
| 413 | __u64 parent_root; /* in */ | ||
| 414 | __u64 flags; /* in */ | ||
| 415 | __u64 reserved[4]; /* in */ | ||
| 416 | }; | ||
| 417 | |||
| 418 | #define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ | ||
| 419 | struct btrfs_ioctl_vol_args) | ||
| 420 | #define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \ | ||
| 421 | struct btrfs_ioctl_vol_args) | ||
| 422 | #define BTRFS_IOC_RESIZE _IOW(BTRFS_IOCTL_MAGIC, 3, \ | ||
| 423 | struct btrfs_ioctl_vol_args) | ||
| 424 | #define BTRFS_IOC_SCAN_DEV _IOW(BTRFS_IOCTL_MAGIC, 4, \ | ||
| 425 | struct btrfs_ioctl_vol_args) | ||
| 426 | /* trans start and trans end are dangerous, and only for | ||
| 427 | * use by applications that know how to avoid the | ||
| 428 | * resulting deadlocks | ||
| 429 | */ | ||
| 430 | #define BTRFS_IOC_TRANS_START _IO(BTRFS_IOCTL_MAGIC, 6) | ||
| 431 | #define BTRFS_IOC_TRANS_END _IO(BTRFS_IOCTL_MAGIC, 7) | ||
| 432 | #define BTRFS_IOC_SYNC _IO(BTRFS_IOCTL_MAGIC, 8) | ||
| 433 | |||
| 434 | #define BTRFS_IOC_CLONE _IOW(BTRFS_IOCTL_MAGIC, 9, int) | ||
| 435 | #define BTRFS_IOC_ADD_DEV _IOW(BTRFS_IOCTL_MAGIC, 10, \ | ||
| 436 | struct btrfs_ioctl_vol_args) | ||
| 437 | #define BTRFS_IOC_RM_DEV _IOW(BTRFS_IOCTL_MAGIC, 11, \ | ||
| 438 | struct btrfs_ioctl_vol_args) | ||
| 439 | #define BTRFS_IOC_BALANCE _IOW(BTRFS_IOCTL_MAGIC, 12, \ | ||
| 440 | struct btrfs_ioctl_vol_args) | ||
| 441 | |||
| 442 | #define BTRFS_IOC_CLONE_RANGE _IOW(BTRFS_IOCTL_MAGIC, 13, \ | ||
| 443 | struct btrfs_ioctl_clone_range_args) | ||
| 444 | |||
| 445 | #define BTRFS_IOC_SUBVOL_CREATE _IOW(BTRFS_IOCTL_MAGIC, 14, \ | ||
| 446 | struct btrfs_ioctl_vol_args) | ||
| 447 | #define BTRFS_IOC_SNAP_DESTROY _IOW(BTRFS_IOCTL_MAGIC, 15, \ | ||
| 448 | struct btrfs_ioctl_vol_args) | ||
| 449 | #define BTRFS_IOC_DEFRAG_RANGE _IOW(BTRFS_IOCTL_MAGIC, 16, \ | ||
| 450 | struct btrfs_ioctl_defrag_range_args) | ||
| 451 | #define BTRFS_IOC_TREE_SEARCH _IOWR(BTRFS_IOCTL_MAGIC, 17, \ | ||
| 452 | struct btrfs_ioctl_search_args) | ||
| 453 | #define BTRFS_IOC_INO_LOOKUP _IOWR(BTRFS_IOCTL_MAGIC, 18, \ | ||
| 454 | struct btrfs_ioctl_ino_lookup_args) | ||
| 455 | #define BTRFS_IOC_DEFAULT_SUBVOL _IOW(BTRFS_IOCTL_MAGIC, 19, u64) | ||
| 456 | #define BTRFS_IOC_SPACE_INFO _IOWR(BTRFS_IOCTL_MAGIC, 20, \ | ||
| 457 | struct btrfs_ioctl_space_args) | ||
| 458 | #define BTRFS_IOC_START_SYNC _IOR(BTRFS_IOCTL_MAGIC, 24, __u64) | ||
| 459 | #define BTRFS_IOC_WAIT_SYNC _IOW(BTRFS_IOCTL_MAGIC, 22, __u64) | ||
| 460 | #define BTRFS_IOC_SNAP_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 23, \ | ||
| 461 | struct btrfs_ioctl_vol_args_v2) | ||
| 462 | #define BTRFS_IOC_SUBVOL_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 24, \ | ||
| 463 | struct btrfs_ioctl_vol_args_v2) | ||
| 464 | #define BTRFS_IOC_SUBVOL_GETFLAGS _IOR(BTRFS_IOCTL_MAGIC, 25, __u64) | ||
| 465 | #define BTRFS_IOC_SUBVOL_SETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 26, __u64) | ||
| 466 | #define BTRFS_IOC_SCRUB _IOWR(BTRFS_IOCTL_MAGIC, 27, \ | ||
| 467 | struct btrfs_ioctl_scrub_args) | ||
| 468 | #define BTRFS_IOC_SCRUB_CANCEL _IO(BTRFS_IOCTL_MAGIC, 28) | ||
| 469 | #define BTRFS_IOC_SCRUB_PROGRESS _IOWR(BTRFS_IOCTL_MAGIC, 29, \ | ||
| 470 | struct btrfs_ioctl_scrub_args) | ||
| 471 | #define BTRFS_IOC_DEV_INFO _IOWR(BTRFS_IOCTL_MAGIC, 30, \ | ||
| 472 | struct btrfs_ioctl_dev_info_args) | ||
| 473 | #define BTRFS_IOC_FS_INFO _IOR(BTRFS_IOCTL_MAGIC, 31, \ | ||
| 474 | struct btrfs_ioctl_fs_info_args) | ||
| 475 | #define BTRFS_IOC_BALANCE_V2 _IOWR(BTRFS_IOCTL_MAGIC, 32, \ | ||
| 476 | struct btrfs_ioctl_balance_args) | ||
| 477 | #define BTRFS_IOC_BALANCE_CTL _IOW(BTRFS_IOCTL_MAGIC, 33, int) | ||
| 478 | #define BTRFS_IOC_BALANCE_PROGRESS _IOR(BTRFS_IOCTL_MAGIC, 34, \ | ||
| 479 | struct btrfs_ioctl_balance_args) | ||
| 480 | #define BTRFS_IOC_INO_PATHS _IOWR(BTRFS_IOCTL_MAGIC, 35, \ | ||
| 481 | struct btrfs_ioctl_ino_path_args) | ||
| 482 | #define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \ | ||
| 483 | struct btrfs_ioctl_ino_path_args) | ||
| 484 | #define BTRFS_IOC_SET_RECEIVED_SUBVOL _IOWR(BTRFS_IOCTL_MAGIC, 37, \ | ||
| 485 | struct btrfs_ioctl_received_subvol_args) | ||
| 486 | #define BTRFS_IOC_SEND _IOW(BTRFS_IOCTL_MAGIC, 38, struct btrfs_ioctl_send_args) | ||
| 487 | #define BTRFS_IOC_DEVICES_READY _IOR(BTRFS_IOCTL_MAGIC, 39, \ | ||
| 488 | struct btrfs_ioctl_vol_args) | ||
| 489 | #define BTRFS_IOC_QUOTA_CTL _IOWR(BTRFS_IOCTL_MAGIC, 40, \ | ||
| 490 | struct btrfs_ioctl_quota_ctl_args) | ||
| 491 | #define BTRFS_IOC_QGROUP_ASSIGN _IOW(BTRFS_IOCTL_MAGIC, 41, \ | ||
| 492 | struct btrfs_ioctl_qgroup_assign_args) | ||
| 493 | #define BTRFS_IOC_QGROUP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 42, \ | ||
| 494 | struct btrfs_ioctl_qgroup_create_args) | ||
| 495 | #define BTRFS_IOC_QGROUP_LIMIT _IOR(BTRFS_IOCTL_MAGIC, 43, \ | ||
| 496 | struct btrfs_ioctl_qgroup_limit_args) | ||
| 497 | #define BTRFS_IOC_GET_DEV_STATS _IOWR(BTRFS_IOCTL_MAGIC, 52, \ | ||
| 498 | struct btrfs_ioctl_get_dev_stats) | ||
| 499 | #define BTRFS_IOC_DEV_REPLACE _IOWR(BTRFS_IOCTL_MAGIC, 53, \ | ||
| 500 | struct btrfs_ioctl_dev_replace_args) | ||
| 501 | |||
| 502 | #endif | ||
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 2a1762c66041..e95df435d897 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c | |||
| @@ -113,11 +113,10 @@ again: | |||
| 113 | read_unlock(&eb->lock); | 113 | read_unlock(&eb->lock); |
| 114 | return; | 114 | return; |
| 115 | } | 115 | } |
| 116 | read_unlock(&eb->lock); | ||
| 117 | wait_event(eb->write_lock_wq, atomic_read(&eb->blocking_writers) == 0); | ||
| 118 | read_lock(&eb->lock); | ||
| 119 | if (atomic_read(&eb->blocking_writers)) { | 116 | if (atomic_read(&eb->blocking_writers)) { |
| 120 | read_unlock(&eb->lock); | 117 | read_unlock(&eb->lock); |
| 118 | wait_event(eb->write_lock_wq, | ||
| 119 | atomic_read(&eb->blocking_writers) == 0); | ||
| 121 | goto again; | 120 | goto again; |
| 122 | } | 121 | } |
| 123 | atomic_inc(&eb->read_locks); | 122 | atomic_inc(&eb->read_locks); |
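The locking.c hunk above simplifies the read-lock slow path: the old code dropped the lock, slept, re-took the lock, and tested blocking_writers once more before looping. Now the re-check happens while the lock from the first test is still held; if writers are pending, the lock is dropped exactly once, the thread sleeps on write_lock_wq, and control restarts at 'again', which re-validates every condition under a fresh read_lock.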
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index e5ed56729607..dc08d77b717e 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c | |||
| @@ -196,6 +196,9 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, | |||
| 196 | entry->file_offset = file_offset; | 196 | entry->file_offset = file_offset; |
| 197 | entry->start = start; | 197 | entry->start = start; |
| 198 | entry->len = len; | 198 | entry->len = len; |
| 199 | if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) && | ||
| 200 | !(type == BTRFS_ORDERED_NOCOW)) | ||
| 201 | entry->csum_bytes_left = disk_len; | ||
| 199 | entry->disk_len = disk_len; | 202 | entry->disk_len = disk_len; |
| 200 | entry->bytes_left = len; | 203 | entry->bytes_left = len; |
| 201 | entry->inode = igrab(inode); | 204 | entry->inode = igrab(inode); |
| @@ -213,6 +216,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, | |||
| 213 | INIT_LIST_HEAD(&entry->root_extent_list); | 216 | INIT_LIST_HEAD(&entry->root_extent_list); |
| 214 | INIT_LIST_HEAD(&entry->work_list); | 217 | INIT_LIST_HEAD(&entry->work_list); |
| 215 | init_completion(&entry->completion); | 218 | init_completion(&entry->completion); |
| 219 | INIT_LIST_HEAD(&entry->log_list); | ||
| 216 | 220 | ||
| 217 | trace_btrfs_ordered_extent_add(inode, entry); | 221 | trace_btrfs_ordered_extent_add(inode, entry); |
| 218 | 222 | ||
| @@ -270,6 +274,10 @@ void btrfs_add_ordered_sum(struct inode *inode, | |||
| 270 | tree = &BTRFS_I(inode)->ordered_tree; | 274 | tree = &BTRFS_I(inode)->ordered_tree; |
| 271 | spin_lock_irq(&tree->lock); | 275 | spin_lock_irq(&tree->lock); |
| 272 | list_add_tail(&sum->list, &entry->list); | 276 | list_add_tail(&sum->list, &entry->list); |
| 277 | WARN_ON(entry->csum_bytes_left < sum->len); | ||
| 278 | entry->csum_bytes_left -= sum->len; | ||
| 279 | if (entry->csum_bytes_left == 0) | ||
| 280 | wake_up(&entry->wait); | ||
| 273 | spin_unlock_irq(&tree->lock); | 281 | spin_unlock_irq(&tree->lock); |
| 274 | } | 282 | } |
| 275 | 283 | ||
| @@ -405,6 +413,66 @@ out: | |||
| 405 | return ret == 0; | 413 | return ret == 0; |
| 406 | } | 414 | } |
| 407 | 415 | ||
| 416 | /* Must be called either under a log transaction or with the log_mutex held */ | ||
| 417 | void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode) | ||
| 418 | { | ||
| 419 | struct btrfs_ordered_inode_tree *tree; | ||
| 420 | struct btrfs_ordered_extent *ordered; | ||
| 421 | struct rb_node *n; | ||
| 422 | int index = log->log_transid % 2; | ||
| 423 | |||
| 424 | tree = &BTRFS_I(inode)->ordered_tree; | ||
| 425 | spin_lock_irq(&tree->lock); | ||
| 426 | for (n = rb_first(&tree->tree); n; n = rb_next(n)) { | ||
| 427 | ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node); | ||
| 428 | spin_lock(&log->log_extents_lock[index]); | ||
| 429 | if (list_empty(&ordered->log_list)) { | ||
| 430 | list_add_tail(&ordered->log_list, &log->logged_list[index]); | ||
| 431 | atomic_inc(&ordered->refs); | ||
| 432 | } | ||
| 433 | spin_unlock(&log->log_extents_lock[index]); | ||
| 434 | } | ||
| 435 | spin_unlock_irq(&tree->lock); | ||
| 436 | } | ||
| 437 | |||
| 438 | void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid) | ||
| 439 | { | ||
| 440 | struct btrfs_ordered_extent *ordered; | ||
| 441 | int index = transid % 2; | ||
| 442 | |||
| 443 | spin_lock_irq(&log->log_extents_lock[index]); | ||
| 444 | while (!list_empty(&log->logged_list[index])) { | ||
| 445 | ordered = list_first_entry(&log->logged_list[index], | ||
| 446 | struct btrfs_ordered_extent, | ||
| 447 | log_list); | ||
| 448 | list_del_init(&ordered->log_list); | ||
| 449 | spin_unlock_irq(&log->log_extents_lock[index]); | ||
| 450 | wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE, | ||
| 451 | &ordered->flags)); | ||
| 452 | btrfs_put_ordered_extent(ordered); | ||
| 453 | spin_lock_irq(&log->log_extents_lock[index]); | ||
| 454 | } | ||
| 455 | spin_unlock_irq(&log->log_extents_lock[index]); | ||
| 456 | } | ||
| 457 | |||
| 458 | void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid) | ||
| 459 | { | ||
| 460 | struct btrfs_ordered_extent *ordered; | ||
| 461 | int index = transid % 2; | ||
| 462 | |||
| 463 | spin_lock_irq(&log->log_extents_lock[index]); | ||
| 464 | while (!list_empty(&log->logged_list[index])) { | ||
| 465 | ordered = list_first_entry(&log->logged_list[index], | ||
| 466 | struct btrfs_ordered_extent, | ||
| 467 | log_list); | ||
| 468 | list_del_init(&ordered->log_list); | ||
| 469 | spin_unlock_irq(&log->log_extents_lock[index]); | ||
| 470 | btrfs_put_ordered_extent(ordered); | ||
| 471 | spin_lock_irq(&log->log_extents_lock[index]); | ||
| 472 | } | ||
| 473 | spin_unlock_irq(&log->log_extents_lock[index]); | ||
| 474 | } | ||
| 475 | |||
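The three helpers above double-buffer the per-log lists on transaction parity: index = transid % 2 alternates between the two logged_list[]/log_extents_lock[] slots, so the extents of log transaction N can be waited on (or discarded on abort via btrfs_free_logged_extents()) while transaction N+1 already collects into the other slot. Note the lock dance in the wait loop: the spinlock is dropped around the sleep and re-taken before the list is examined again.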
| 408 | /* | 476 | /* |
| 409 | * used to drop a reference on an ordered extent. This will free | 477 | * used to drop a reference on an ordered extent. This will free |
| 410 | * the extent if the last reference is dropped | 478 | * the extent if the last reference is dropped |
| @@ -544,10 +612,12 @@ void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput) | |||
| 544 | * extra check to make sure the ordered operation list really is empty | 612 | * extra check to make sure the ordered operation list really is empty |
| 545 | * before we return | 613 | * before we return |
| 546 | */ | 614 | */ |
| 547 | int btrfs_run_ordered_operations(struct btrfs_root *root, int wait) | 615 | int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans, |
| 616 | struct btrfs_root *root, int wait) | ||
| 548 | { | 617 | { |
| 549 | struct btrfs_inode *btrfs_inode; | 618 | struct btrfs_inode *btrfs_inode; |
| 550 | struct inode *inode; | 619 | struct inode *inode; |
| 620 | struct btrfs_transaction *cur_trans = trans->transaction; | ||
| 551 | struct list_head splice; | 621 | struct list_head splice; |
| 552 | struct list_head works; | 622 | struct list_head works; |
| 553 | struct btrfs_delalloc_work *work, *next; | 623 | struct btrfs_delalloc_work *work, *next; |
| @@ -558,14 +628,10 @@ int btrfs_run_ordered_operations(struct btrfs_root *root, int wait) | |||
| 558 | 628 | ||
| 559 | mutex_lock(&root->fs_info->ordered_operations_mutex); | 629 | mutex_lock(&root->fs_info->ordered_operations_mutex); |
| 560 | spin_lock(&root->fs_info->ordered_extent_lock); | 630 | spin_lock(&root->fs_info->ordered_extent_lock); |
| 561 | again: | 631 | list_splice_init(&cur_trans->ordered_operations, &splice); |
| 562 | list_splice_init(&root->fs_info->ordered_operations, &splice); | ||
| 563 | |||
| 564 | while (!list_empty(&splice)) { | 632 | while (!list_empty(&splice)) { |
| 565 | |||
| 566 | btrfs_inode = list_entry(splice.next, struct btrfs_inode, | 633 | btrfs_inode = list_entry(splice.next, struct btrfs_inode, |
| 567 | ordered_operations); | 634 | ordered_operations); |
| 568 | |||
| 569 | inode = &btrfs_inode->vfs_inode; | 635 | inode = &btrfs_inode->vfs_inode; |
| 570 | 636 | ||
| 571 | list_del_init(&btrfs_inode->ordered_operations); | 637 | list_del_init(&btrfs_inode->ordered_operations); |
| @@ -574,24 +640,22 @@ again: | |||
| 574 | * the inode may be getting freed (in sys_unlink path). | 640 | * the inode may be getting freed (in sys_unlink path). |
| 575 | */ | 641 | */ |
| 576 | inode = igrab(inode); | 642 | inode = igrab(inode); |
| 577 | |||
| 578 | if (!wait && inode) { | ||
| 579 | list_add_tail(&BTRFS_I(inode)->ordered_operations, | ||
| 580 | &root->fs_info->ordered_operations); | ||
| 581 | } | ||
| 582 | |||
| 583 | if (!inode) | 643 | if (!inode) |
| 584 | continue; | 644 | continue; |
| 645 | |||
| 646 | if (!wait) | ||
| 647 | list_add_tail(&BTRFS_I(inode)->ordered_operations, | ||
| 648 | &cur_trans->ordered_operations); | ||
| 585 | spin_unlock(&root->fs_info->ordered_extent_lock); | 649 | spin_unlock(&root->fs_info->ordered_extent_lock); |
| 586 | 650 | ||
| 587 | work = btrfs_alloc_delalloc_work(inode, wait, 1); | 651 | work = btrfs_alloc_delalloc_work(inode, wait, 1); |
| 588 | if (!work) { | 652 | if (!work) { |
| 653 | spin_lock(&root->fs_info->ordered_extent_lock); | ||
| 589 | if (list_empty(&BTRFS_I(inode)->ordered_operations)) | 654 | if (list_empty(&BTRFS_I(inode)->ordered_operations)) |
| 590 | list_add_tail(&btrfs_inode->ordered_operations, | 655 | list_add_tail(&btrfs_inode->ordered_operations, |
| 591 | &splice); | 656 | &splice); |
| 592 | spin_lock(&root->fs_info->ordered_extent_lock); | ||
| 593 | list_splice_tail(&splice, | 657 | list_splice_tail(&splice, |
| 594 | &root->fs_info->ordered_operations); | 658 | &cur_trans->ordered_operations); |
| 595 | spin_unlock(&root->fs_info->ordered_extent_lock); | 659 | spin_unlock(&root->fs_info->ordered_extent_lock); |
| 596 | ret = -ENOMEM; | 660 | ret = -ENOMEM; |
| 597 | goto out; | 661 | goto out; |
| @@ -603,9 +667,6 @@ again: | |||
| 603 | cond_resched(); | 667 | cond_resched(); |
| 604 | spin_lock(&root->fs_info->ordered_extent_lock); | 668 | spin_lock(&root->fs_info->ordered_extent_lock); |
| 605 | } | 669 | } |
| 606 | if (wait && !list_empty(&root->fs_info->ordered_operations)) | ||
| 607 | goto again; | ||
| 608 | |||
| 609 | spin_unlock(&root->fs_info->ordered_extent_lock); | 670 | spin_unlock(&root->fs_info->ordered_extent_lock); |
| 610 | out: | 671 | out: |
| 611 | list_for_each_entry_safe(work, next, &works, list) { | 672 | list_for_each_entry_safe(work, next, &works, list) { |
| @@ -974,6 +1035,7 @@ out: | |||
| 974 | void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, | 1035 | void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, |
| 975 | struct btrfs_root *root, struct inode *inode) | 1036 | struct btrfs_root *root, struct inode *inode) |
| 976 | { | 1037 | { |
| 1038 | struct btrfs_transaction *cur_trans = trans->transaction; | ||
| 977 | u64 last_mod; | 1039 | u64 last_mod; |
| 978 | 1040 | ||
| 979 | last_mod = max(BTRFS_I(inode)->generation, BTRFS_I(inode)->last_trans); | 1041 | last_mod = max(BTRFS_I(inode)->generation, BTRFS_I(inode)->last_trans); |
| @@ -988,7 +1050,7 @@ void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, | |||
| 988 | spin_lock(&root->fs_info->ordered_extent_lock); | 1050 | spin_lock(&root->fs_info->ordered_extent_lock); |
| 989 | if (list_empty(&BTRFS_I(inode)->ordered_operations)) { | 1051 | if (list_empty(&BTRFS_I(inode)->ordered_operations)) { |
| 990 | list_add_tail(&BTRFS_I(inode)->ordered_operations, | 1052 | list_add_tail(&BTRFS_I(inode)->ordered_operations, |
| 991 | &root->fs_info->ordered_operations); | 1053 | &cur_trans->ordered_operations); |
| 992 | } | 1054 | } |
| 993 | spin_unlock(&root->fs_info->ordered_extent_lock); | 1055 | spin_unlock(&root->fs_info->ordered_extent_lock); |
| 994 | } | 1056 | } |
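The net effect of the two ordered-data.c hunks above is that the ordered-operations list moves from the global fs_info to the transaction that dirtied the inodes, so each commit only flushes its own work instead of rescanning a filesystem-wide list. A minimal sketch of the tracking pattern, with hypothetical stand-in names (struct txn and txn_track are not btrfs symbols):

    #include <linux/list.h>
    #include <linux/spinlock.h>

    struct txn {                            /* stand-in for btrfs_transaction */
            spinlock_t lock;
            struct list_head ordered_operations;
    };

    /* Track an inode at most once per transaction; an empty list_head
     * doubles as the "not yet tracked" marker, as in the hunks above. */
    static void txn_track(struct txn *t, struct list_head *entry)
    {
            spin_lock(&t->lock);
            if (list_empty(entry))
                    list_add_tail(entry, &t->ordered_operations);
            spin_unlock(&t->lock);
    }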
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index f29d4bf5fbe7..8eadfe406cdd 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h | |||
| @@ -79,6 +79,8 @@ struct btrfs_ordered_sum { | |||
| 79 | #define BTRFS_ORDERED_UPDATED_ISIZE 7 /* indicates whether this ordered extent | 79 | #define BTRFS_ORDERED_UPDATED_ISIZE 7 /* indicates whether this ordered extent |
| 80 | * has done its due diligence in updating | 80 | * has done its due diligence in updating |
| 81 | * the isize. */ | 81 | * the isize. */ |
| 82 | #define BTRFS_ORDERED_LOGGED_CSUM 8 /* We've logged the csums on this | ||
| 83 | ordered extent */ | ||
| 82 | 84 | ||
| 83 | struct btrfs_ordered_extent { | 85 | struct btrfs_ordered_extent { |
| 84 | /* logical offset in the file */ | 86 | /* logical offset in the file */ |
| @@ -96,6 +98,9 @@ struct btrfs_ordered_extent { | |||
| 96 | /* number of bytes that still need writing */ | 98 | /* number of bytes that still need writing */ |
| 97 | u64 bytes_left; | 99 | u64 bytes_left; |
| 98 | 100 | ||
| 101 | /* number of bytes that still need csumming */ | ||
| 102 | u64 csum_bytes_left; | ||
| 103 | |||
| 99 | /* | 104 | /* |
| 100 | * the end of the ordered extent which is behind it but | 105 | * the end of the ordered extent which is behind it but |
| 101 | * didn't update disk_i_size. Please see the comment of | 106 | * didn't update disk_i_size. Please see the comment of |
| @@ -118,6 +123,9 @@ struct btrfs_ordered_extent { | |||
| 118 | /* list of checksums for insertion when the extent io is done */ | 123 | /* list of checksums for insertion when the extent io is done */ |
| 119 | struct list_head list; | 124 | struct list_head list; |
| 120 | 125 | ||
| 126 | /* list entry used if the tree log needs to wait on this extent */ | ||
| 127 | struct list_head log_list; | ||
| 128 | |||
| 121 | /* used to wait for the BTRFS_ORDERED_COMPLETE bit */ | 129 | /* used to wait for the BTRFS_ORDERED_COMPLETE bit */ |
| 122 | wait_queue_head_t wait; | 130 | wait_queue_head_t wait; |
| 123 | 131 | ||
| @@ -189,11 +197,15 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode, | |||
| 189 | int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, | 197 | int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, |
| 190 | struct btrfs_ordered_extent *ordered); | 198 | struct btrfs_ordered_extent *ordered); |
| 191 | int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum); | 199 | int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum); |
| 192 | int btrfs_run_ordered_operations(struct btrfs_root *root, int wait); | 200 | int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans, |
| 201 | struct btrfs_root *root, int wait); | ||
| 193 | void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, | 202 | void btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, |
| 194 | struct btrfs_root *root, | 203 | struct btrfs_root *root, |
| 195 | struct inode *inode); | 204 | struct inode *inode); |
| 196 | void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput); | 205 | void btrfs_wait_ordered_extents(struct btrfs_root *root, int delay_iput); |
| 206 | void btrfs_get_logged_extents(struct btrfs_root *log, struct inode *inode); | ||
| 207 | void btrfs_wait_logged_extents(struct btrfs_root *log, u64 transid); | ||
| 208 | void btrfs_free_logged_extents(struct btrfs_root *log, u64 transid); | ||
| 197 | int __init ordered_data_init(void); | 209 | int __init ordered_data_init(void); |
| 198 | void ordered_data_exit(void); | 210 | void ordered_data_exit(void); |
| 199 | #endif | 211 | #endif |
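The new csum_bytes_left counter and log_list entry back the three logged-extents helpers declared above. A plausible fsync-side sequence, sketched from these declarations alone (copy_csums_to_log is a hypothetical placeholder; the real tree-log changes live elsewhere in this patch):

    /* sketch: locking and most error handling omitted */
    btrfs_get_logged_extents(log, inode);        /* collect extents on log_list */
    ret = copy_csums_to_log(log, inode);         /* hypothetical helper */
    if (ret)
            btrfs_free_logged_extents(log, transid);  /* error: drop the list */
    else
            btrfs_wait_logged_extents(log, transid);  /* csum_bytes_left -> 0 */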
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 50d95fd190a5..920957ecb27e 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c | |||
| @@ -294,6 +294,7 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) | |||
| 294 | btrfs_dev_extent_chunk_offset(l, dev_extent), | 294 | btrfs_dev_extent_chunk_offset(l, dev_extent), |
| 295 | (unsigned long long) | 295 | (unsigned long long) |
| 296 | btrfs_dev_extent_length(l, dev_extent)); | 296 | btrfs_dev_extent_length(l, dev_extent)); |
| 297 | break; | ||
| 297 | case BTRFS_DEV_STATS_KEY: | 298 | case BTRFS_DEV_STATS_KEY: |
| 298 | printk(KERN_INFO "\t\tdevice stats\n"); | 299 | printk(KERN_INFO "\t\tdevice stats\n"); |
| 299 | break; | 300 | break; |
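The one-line print-tree.c change above closes a switch fall-through: without the break, every BTRFS_DEV_EXTENT_KEY item also printed the "device stats" line of the case that follows. In miniature (print_dev_extent stands in for the multi-line printk above):

    switch (type) {
    case BTRFS_DEV_EXTENT_KEY:
            print_dev_extent(l, dev_extent);
            break;                          /* the added line */
    case BTRFS_DEV_STATS_KEY:
            printk(KERN_INFO "\t\tdevice stats\n");  /* ran for both keys before */
            break;
    }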
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index a5c856234323..aee4b1cc3d98 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c | |||
| @@ -23,13 +23,13 @@ | |||
| 23 | #include <linux/rbtree.h> | 23 | #include <linux/rbtree.h> |
| 24 | #include <linux/slab.h> | 24 | #include <linux/slab.h> |
| 25 | #include <linux/workqueue.h> | 25 | #include <linux/workqueue.h> |
| 26 | #include <linux/btrfs.h> | ||
| 26 | 27 | ||
| 27 | #include "ctree.h" | 28 | #include "ctree.h" |
| 28 | #include "transaction.h" | 29 | #include "transaction.h" |
| 29 | #include "disk-io.h" | 30 | #include "disk-io.h" |
| 30 | #include "locking.h" | 31 | #include "locking.h" |
| 31 | #include "ulist.h" | 32 | #include "ulist.h" |
| 32 | #include "ioctl.h" | ||
| 33 | #include "backref.h" | 33 | #include "backref.h" |
| 34 | 34 | ||
| 35 | /* TODO XXX FIXME | 35 | /* TODO XXX FIXME |
| @@ -620,7 +620,9 @@ static int update_qgroup_limit_item(struct btrfs_trans_handle *trans, | |||
| 620 | key.offset = qgroupid; | 620 | key.offset = qgroupid; |
| 621 | 621 | ||
| 622 | path = btrfs_alloc_path(); | 622 | path = btrfs_alloc_path(); |
| 623 | BUG_ON(!path); | 623 | if (!path) |
| 624 | return -ENOMEM; | ||
| 625 | |||
| 624 | ret = btrfs_search_slot(trans, root, &key, path, 0, 1); | 626 | ret = btrfs_search_slot(trans, root, &key, path, 0, 1); |
| 625 | if (ret > 0) | 627 | if (ret > 0) |
| 626 | ret = -ENOENT; | 628 | ret = -ENOENT; |
| @@ -661,7 +663,9 @@ static int update_qgroup_info_item(struct btrfs_trans_handle *trans, | |||
| 661 | key.offset = qgroup->qgroupid; | 663 | key.offset = qgroup->qgroupid; |
| 662 | 664 | ||
| 663 | path = btrfs_alloc_path(); | 665 | path = btrfs_alloc_path(); |
| 664 | BUG_ON(!path); | 666 | if (!path) |
| 667 | return -ENOMEM; | ||
| 668 | |||
| 665 | ret = btrfs_search_slot(trans, root, &key, path, 0, 1); | 669 | ret = btrfs_search_slot(trans, root, &key, path, 0, 1); |
| 666 | if (ret > 0) | 670 | if (ret > 0) |
| 667 | ret = -ENOENT; | 671 | ret = -ENOENT; |
| @@ -702,7 +706,9 @@ static int update_qgroup_status_item(struct btrfs_trans_handle *trans, | |||
| 702 | key.offset = 0; | 706 | key.offset = 0; |
| 703 | 707 | ||
| 704 | path = btrfs_alloc_path(); | 708 | path = btrfs_alloc_path(); |
| 705 | BUG_ON(!path); | 709 | if (!path) |
| 710 | return -ENOMEM; | ||
| 711 | |||
| 706 | ret = btrfs_search_slot(trans, root, &key, path, 0, 1); | 712 | ret = btrfs_search_slot(trans, root, &key, path, 0, 1); |
| 707 | if (ret > 0) | 713 | if (ret > 0) |
| 708 | ret = -ENOENT; | 714 | ret = -ENOENT; |
| @@ -732,33 +738,38 @@ static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans, | |||
| 732 | { | 738 | { |
| 733 | struct btrfs_path *path; | 739 | struct btrfs_path *path; |
| 734 | struct btrfs_key key; | 740 | struct btrfs_key key; |
| 741 | struct extent_buffer *leaf = NULL; | ||
| 735 | int ret; | 742 | int ret; |
| 736 | 743 | int nr = 0; | |
| 737 | if (!root) | ||
| 738 | return -EINVAL; | ||
| 739 | 744 | ||
| 740 | path = btrfs_alloc_path(); | 745 | path = btrfs_alloc_path(); |
| 741 | if (!path) | 746 | if (!path) |
| 742 | return -ENOMEM; | 747 | return -ENOMEM; |
| 743 | 748 | ||
| 744 | while (1) { | 749 | path->leave_spinning = 1; |
| 745 | key.objectid = 0; | ||
| 746 | key.offset = 0; | ||
| 747 | key.type = 0; | ||
| 748 | 750 | ||
| 749 | path->leave_spinning = 1; | 751 | key.objectid = 0; |
| 752 | key.offset = 0; | ||
| 753 | key.type = 0; | ||
| 754 | |||
| 755 | while (1) { | ||
| 750 | ret = btrfs_search_slot(trans, root, &key, path, -1, 1); | 756 | ret = btrfs_search_slot(trans, root, &key, path, -1, 1); |
| 751 | if (ret > 0) { | 757 | if (ret < 0) |
| 752 | if (path->slots[0] == 0) | 758 | goto out; |
| 753 | break; | 759 | leaf = path->nodes[0]; |
| 754 | path->slots[0]--; | 760 | nr = btrfs_header_nritems(leaf); |
| 755 | } else if (ret < 0) { | 761 | if (!nr) |
| 756 | break; | 762 | break; |
| 757 | } | 763 | /* |
| 758 | 764 | * delete the leaves one by one | |
| 759 | ret = btrfs_del_item(trans, root, path); | 765 | * since the whole tree is going |
| 766 | * to be deleted. | ||
| 767 | */ | ||
| 768 | path->slots[0] = 0; | ||
| 769 | ret = btrfs_del_items(trans, root, path, 0, nr); | ||
| 760 | if (ret) | 770 | if (ret) |
| 761 | goto out; | 771 | goto out; |
| 772 | |||
| 762 | btrfs_release_path(path); | 773 | btrfs_release_path(path); |
| 763 | } | 774 | } |
| 764 | ret = 0; | 775 | ret = 0; |
| @@ -847,6 +858,10 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans, | |||
| 847 | int ret = 0; | 858 | int ret = 0; |
| 848 | 859 | ||
| 849 | spin_lock(&fs_info->qgroup_lock); | 860 | spin_lock(&fs_info->qgroup_lock); |
| 861 | if (!fs_info->quota_root) { | ||
| 862 | spin_unlock(&fs_info->qgroup_lock); | ||
| 863 | return 0; | ||
| 864 | } | ||
| 850 | fs_info->quota_enabled = 0; | 865 | fs_info->quota_enabled = 0; |
| 851 | fs_info->pending_quota_state = 0; | 866 | fs_info->pending_quota_state = 0; |
| 852 | quota_root = fs_info->quota_root; | 867 | quota_root = fs_info->quota_root; |
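The btrfs_clean_quota_tree() rewrite above stops doing one search per deleted item and instead empties the left-most leaf with a single btrfs_del_items() call, so teardown costs one search per leaf rather than one per item. The loop reduced to its essentials:

    /* simplified shape of the leaf-at-a-time delete loop */
    while (1) {
            ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
            if (ret < 0)
                    goto out;                       /* tree/IO error */
            nr = btrfs_header_nritems(path->nodes[0]);
            if (!nr)
                    break;                          /* tree is empty now */
            path->slots[0] = 0;                     /* start of the leaf */
            ret = btrfs_del_items(trans, root, path, 0, nr);  /* whole leaf */
            if (ret)
                    goto out;
            btrfs_release_path(path);               /* re-search from the top */
    }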
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c new file mode 100644 index 000000000000..07222053c7d8 --- /dev/null +++ b/fs/btrfs/raid56.c | |||
| @@ -0,0 +1,2099 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (C) 2012 Fusion-io All rights reserved. | ||
| 3 | * Copyright (C) 2012 Intel Corp. All rights reserved. | ||
| 4 | * | ||
| 5 | * This program is free software; you can redistribute it and/or | ||
| 6 | * modify it under the terms of the GNU General Public | ||
| 7 | * License v2 as published by the Free Software Foundation. | ||
| 8 | * | ||
| 9 | * This program is distributed in the hope that it will be useful, | ||
| 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
| 12 | * General Public License for more details. | ||
| 13 | * | ||
| 14 | * You should have received a copy of the GNU General Public | ||
| 15 | * License along with this program; if not, write to the | ||
| 16 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
| 17 | * Boston, MA 02111-1307, USA. | ||
| 18 | */ | ||
| 19 | #include <linux/sched.h> | ||
| 20 | #include <linux/wait.h> | ||
| 21 | #include <linux/bio.h> | ||
| 22 | #include <linux/slab.h> | ||
| 23 | #include <linux/buffer_head.h> | ||
| 24 | #include <linux/blkdev.h> | ||
| 25 | #include <linux/random.h> | ||
| 26 | #include <linux/iocontext.h> | ||
| 27 | #include <linux/capability.h> | ||
| 28 | #include <linux/ratelimit.h> | ||
| 29 | #include <linux/kthread.h> | ||
| 30 | #include <linux/raid/pq.h> | ||
| 31 | #include <linux/hash.h> | ||
| 32 | #include <linux/list_sort.h> | ||
| 33 | #include <linux/raid/xor.h> | ||
| 34 | #include <asm/div64.h> | ||
| 35 | #include "compat.h" | ||
| 36 | #include "ctree.h" | ||
| 37 | #include "extent_map.h" | ||
| 38 | #include "disk-io.h" | ||
| 39 | #include "transaction.h" | ||
| 40 | #include "print-tree.h" | ||
| 41 | #include "volumes.h" | ||
| 42 | #include "raid56.h" | ||
| 43 | #include "async-thread.h" | ||
| 44 | #include "check-integrity.h" | ||
| 45 | #include "rcu-string.h" | ||
| 46 | |||
| 47 | /* set when additional merges to this rbio are not allowed */ | ||
| 48 | #define RBIO_RMW_LOCKED_BIT 1 | ||
| 49 | |||
| 50 | /* | ||
| 51 | * set when this rbio is sitting in the hash, but it is just a cache | ||
| 52 | * of past RMW | ||
| 53 | */ | ||
| 54 | #define RBIO_CACHE_BIT 2 | ||
| 55 | |||
| 56 | /* | ||
| 57 | * set when it is safe to trust the stripe_pages for caching | ||
| 58 | */ | ||
| 59 | #define RBIO_CACHE_READY_BIT 3 | ||
| 60 | |||
| 61 | |||
| 62 | #define RBIO_CACHE_SIZE 1024 | ||
| 63 | |||
| 64 | struct btrfs_raid_bio { | ||
| 65 | struct btrfs_fs_info *fs_info; | ||
| 66 | struct btrfs_bio *bbio; | ||
| 67 | |||
| 68 | /* | ||
| 69 | * logical block numbers for the start of each stripe | ||
| 70 | * The last one or two are p/q. These are sorted, | ||
| 71 | * so raid_map[0] is the start of our full stripe | ||
| 72 | */ | ||
| 73 | u64 *raid_map; | ||
| 74 | |||
| 75 | /* while we're doing rmw on a stripe | ||
| 76 | * we put it into a hash table so we can | ||
| 77 | * lock the stripe and merge more rbios | ||
| 78 | * into it. | ||
| 79 | */ | ||
| 80 | struct list_head hash_list; | ||
| 81 | |||
| 82 | /* | ||
| 83 | * LRU list for the stripe cache | ||
| 84 | */ | ||
| 85 | struct list_head stripe_cache; | ||
| 86 | |||
| 87 | /* | ||
| 88 | * for scheduling work in the helper threads | ||
| 89 | */ | ||
| 90 | struct btrfs_work work; | ||
| 91 | |||
| 92 | /* | ||
| 93 | * bio list and bio_list_lock are used | ||
| 94 | * to add more bios into the stripe | ||
| 95 | * in hopes of avoiding the full rmw | ||
| 96 | */ | ||
| 97 | struct bio_list bio_list; | ||
| 98 | spinlock_t bio_list_lock; | ||
| 99 | |||
| 100 | /* also protected by the bio_list_lock, the | ||
| 101 | * plug list is used by the plugging code | ||
| 102 | * to collect partial bios while plugged. The | ||
| 103 | * stripe locking code also uses it to hand off | ||
| 104 | * the stripe lock to the next pending IO | ||
| 105 | */ | ||
| 106 | struct list_head plug_list; | ||
| 107 | |||
| 108 | /* | ||
| 109 | * flags that tell us if it is safe to | ||
| 110 | * merge with this bio | ||
| 111 | */ | ||
| 112 | unsigned long flags; | ||
| 113 | |||
| 114 | /* size of each individual stripe on disk */ | ||
| 115 | int stripe_len; | ||
| 116 | |||
| 117 | /* number of data stripes (no p/q) */ | ||
| 118 | int nr_data; | ||
| 119 | |||
| 120 | /* | ||
| 121 | * set if we're doing a parity rebuild | ||
| 122 | * for a read from higher up, which is handled | ||
| 123 | * differently from a parity rebuild as part of | ||
| 124 | * rmw | ||
| 125 | */ | ||
| 126 | int read_rebuild; | ||
| 127 | |||
| 128 | /* first bad stripe */ | ||
| 129 | int faila; | ||
| 130 | |||
| 131 | /* second bad stripe (for raid6 use) */ | ||
| 132 | int failb; | ||
| 133 | |||
| 134 | /* | ||
| 135 | * number of pages needed to represent the full | ||
| 136 | * stripe | ||
| 137 | */ | ||
| 138 | int nr_pages; | ||
| 139 | |||
| 140 | /* | ||
| 141 | * size of all the bios in the bio_list. This | ||
| 142 | * helps us decide if the rbio maps to a full | ||
| 143 | * stripe or not | ||
| 144 | */ | ||
| 145 | int bio_list_bytes; | ||
| 146 | |||
| 147 | atomic_t refs; | ||
| 148 | |||
| 149 | /* | ||
| 150 | * these are two arrays of pointers. We allocate the | ||
| 151 | * rbio big enough to hold them both and set up their | ||
| 152 | * locations when the rbio is allocated | ||
| 153 | */ | ||
| 154 | |||
| 155 | /* pointers to pages that we allocated for | ||
| 156 | * reading/writing stripes directly from the disk (including P/Q) | ||
| 157 | */ | ||
| 158 | struct page **stripe_pages; | ||
| 159 | |||
| 160 | /* | ||
| 161 | * pointers to the pages in the bio_list. Stored | ||
| 162 | * here for faster lookup | ||
| 163 | */ | ||
| 164 | struct page **bio_pages; | ||
| 165 | }; | ||
| 166 | |||
| 167 | static int __raid56_parity_recover(struct btrfs_raid_bio *rbio); | ||
| 168 | static noinline void finish_rmw(struct btrfs_raid_bio *rbio); | ||
| 169 | static void rmw_work(struct btrfs_work *work); | ||
| 170 | static void read_rebuild_work(struct btrfs_work *work); | ||
| 171 | static void async_rmw_stripe(struct btrfs_raid_bio *rbio); | ||
| 172 | static void async_read_rebuild(struct btrfs_raid_bio *rbio); | ||
| 173 | static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio); | ||
| 174 | static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed); | ||
| 175 | static void __free_raid_bio(struct btrfs_raid_bio *rbio); | ||
| 176 | static void index_rbio_pages(struct btrfs_raid_bio *rbio); | ||
| 177 | static int alloc_rbio_pages(struct btrfs_raid_bio *rbio); | ||
| 178 | |||
| 179 | /* | ||
| 180 | * the stripe hash table is used for locking, and to collect | ||
| 181 | * bios in hopes of making a full stripe | ||
| 182 | */ | ||
| 183 | int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info) | ||
| 184 | { | ||
| 185 | struct btrfs_stripe_hash_table *table; | ||
| 186 | struct btrfs_stripe_hash_table *x; | ||
| 187 | struct btrfs_stripe_hash *cur; | ||
| 188 | struct btrfs_stripe_hash *h; | ||
| 189 | int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS; | ||
| 190 | int i; | ||
| 191 | int table_size; | ||
| 192 | |||
| 193 | if (info->stripe_hash_table) | ||
| 194 | return 0; | ||
| 195 | |||
| 196 | /* | ||
| 197 | * The table is large, starting with order 4 and can go as high as | ||
| 198 | * order 7 in case lock debugging is turned on. | ||
| 199 | * | ||
| 200 | * Try harder to allocate and fall back to vmalloc to lower the chance | ||
| 201 | * of a failing mount. | ||
| 202 | */ | ||
| 203 | table_size = sizeof(*table) + sizeof(*h) * num_entries; | ||
| 204 | table = kzalloc(table_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT); | ||
| 205 | if (!table) { | ||
| 206 | table = vzalloc(table_size); | ||
| 207 | if (!table) | ||
| 208 | return -ENOMEM; | ||
| 209 | } | ||
| 210 | |||
| 211 | spin_lock_init(&table->cache_lock); | ||
| 212 | INIT_LIST_HEAD(&table->stripe_cache); | ||
| 213 | |||
| 214 | h = table->table; | ||
| 215 | |||
| 216 | for (i = 0; i < num_entries; i++) { | ||
| 217 | cur = h + i; | ||
| 218 | INIT_LIST_HEAD(&cur->hash_list); | ||
| 219 | spin_lock_init(&cur->lock); | ||
| 220 | init_waitqueue_head(&cur->wait); | ||
| 221 | } | ||
| 222 | |||
| 223 | x = cmpxchg(&info->stripe_hash_table, NULL, table); | ||
| 224 | if (x) { /* lost the publish race: free our table, not the installed one */ | ||
| 225 | if (is_vmalloc_addr(table)) | ||
| 226 | vfree(table); | ||
| 227 | else | ||
| 228 | kfree(table); | ||
| 229 | } | ||
| 230 | return 0; | ||
| 231 | } | ||
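btrfs_alloc_stripe_hash_table() combines two reusable patterns: a quiet kzalloc (__GFP_NOWARN) with a vzalloc fallback for the large order-4..7 table, and a lock-free publish via cmpxchg so racing mounts install exactly one table. The allocate/free halves in generic form (a sketch against the mm API of this kernel series, where __GFP_REPEAT still exists):

    #include <linux/mm.h>
    #include <linux/slab.h>
    #include <linux/vmalloc.h>

    static void *alloc_big_table(size_t size)
    {
            /* try physically contiguous memory first, without warning */
            void *t = kzalloc(size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);

            if (!t)
                    t = vzalloc(size);      /* fall back to vmalloc space */
            return t;
    }

    static void free_big_table(void *t)
    {
            if (is_vmalloc_addr(t))         /* must match the allocator used */
                    vfree(t);
            else
                    kfree(t);
    }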
| 232 | |||
| 233 | /* | ||
| 234 | * caching an rbio means to copy anything from the | ||
| 235 | * bio_pages array into the stripe_pages array. We | ||
| 236 | * use the page uptodate bit in the stripe cache array | ||
| 237 | * to indicate if it has valid data | ||
| 238 | * | ||
| 239 | * once the caching is done, we set the cache ready | ||
| 240 | * bit. | ||
| 241 | */ | ||
| 242 | static void cache_rbio_pages(struct btrfs_raid_bio *rbio) | ||
| 243 | { | ||
| 244 | int i; | ||
| 245 | char *s; | ||
| 246 | char *d; | ||
| 247 | int ret; | ||
| 248 | |||
| 249 | ret = alloc_rbio_pages(rbio); | ||
| 250 | if (ret) | ||
| 251 | return; | ||
| 252 | |||
| 253 | for (i = 0; i < rbio->nr_pages; i++) { | ||
| 254 | if (!rbio->bio_pages[i]) | ||
| 255 | continue; | ||
| 256 | |||
| 257 | s = kmap(rbio->bio_pages[i]); | ||
| 258 | d = kmap(rbio->stripe_pages[i]); | ||
| 259 | |||
| 260 | memcpy(d, s, PAGE_CACHE_SIZE); | ||
| 261 | |||
| 262 | kunmap(rbio->bio_pages[i]); | ||
| 263 | kunmap(rbio->stripe_pages[i]); | ||
| 264 | SetPageUptodate(rbio->stripe_pages[i]); | ||
| 265 | } | ||
| 266 | set_bit(RBIO_CACHE_READY_BIT, &rbio->flags); | ||
| 267 | } | ||
| 268 | |||
| 269 | /* | ||
| 270 | * we hash on the first logical address of the stripe | ||
| 271 | */ | ||
| 272 | static int rbio_bucket(struct btrfs_raid_bio *rbio) | ||
| 273 | { | ||
| 274 | u64 num = rbio->raid_map[0]; | ||
| 275 | |||
| 276 | /* | ||
| 277 | * we shift down quite a bit. We're using byte | ||
| 278 | * addressing, and most of the lower bits are zeros. | ||
| 279 | * This tends to upset hash_64, and it consistently | ||
| 280 | * returns just one or two different values. | ||
| 281 | * | ||
| 282 | * shifting off the lower bits fixes things. | ||
| 283 | */ | ||
| 284 | return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS); | ||
| 285 | } | ||
| 286 | |||
| 287 | /* | ||
| 288 | * stealing an rbio means taking all the uptodate pages from the stripe | ||
| 289 | * array in the source rbio and putting them into the destination rbio | ||
| 290 | */ | ||
| 291 | static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest) | ||
| 292 | { | ||
| 293 | int i; | ||
| 294 | struct page *s; | ||
| 295 | struct page *d; | ||
| 296 | |||
| 297 | if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags)) | ||
| 298 | return; | ||
| 299 | |||
| 300 | for (i = 0; i < dest->nr_pages; i++) { | ||
| 301 | s = src->stripe_pages[i]; | ||
| 302 | if (!s || !PageUptodate(s)) { | ||
| 303 | continue; | ||
| 304 | } | ||
| 305 | |||
| 306 | d = dest->stripe_pages[i]; | ||
| 307 | if (d) | ||
| 308 | __free_page(d); | ||
| 309 | |||
| 310 | dest->stripe_pages[i] = s; | ||
| 311 | src->stripe_pages[i] = NULL; | ||
| 312 | } | ||
| 313 | } | ||
| 314 | |||
| 315 | /* | ||
| 316 | * merging means we take the bio_list from the victim and | ||
| 317 | * splice it into the destination. The victim should | ||
| 318 | * be discarded afterwards. | ||
| 319 | * | ||
| 320 | * must be called with dest->rbio_list_lock held | ||
| 321 | */ | ||
| 322 | static void merge_rbio(struct btrfs_raid_bio *dest, | ||
| 323 | struct btrfs_raid_bio *victim) | ||
| 324 | { | ||
| 325 | bio_list_merge(&dest->bio_list, &victim->bio_list); | ||
| 326 | dest->bio_list_bytes += victim->bio_list_bytes; | ||
| 327 | bio_list_init(&victim->bio_list); | ||
| 328 | } | ||
| 329 | |||
| 330 | /* | ||
| 331 | * used to prune items that are in the cache. The caller | ||
| 332 | * must hold the hash table lock. | ||
| 333 | */ | ||
| 334 | static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio) | ||
| 335 | { | ||
| 336 | int bucket = rbio_bucket(rbio); | ||
| 337 | struct btrfs_stripe_hash_table *table; | ||
| 338 | struct btrfs_stripe_hash *h; | ||
| 339 | int freeit = 0; | ||
| 340 | |||
| 341 | /* | ||
| 342 | * check the bit again under the hash table lock. | ||
| 343 | */ | ||
| 344 | if (!test_bit(RBIO_CACHE_BIT, &rbio->flags)) | ||
| 345 | return; | ||
| 346 | |||
| 347 | table = rbio->fs_info->stripe_hash_table; | ||
| 348 | h = table->table + bucket; | ||
| 349 | |||
| 350 | /* hold the lock for the bucket because we may be | ||
| 351 | * removing it from the hash table | ||
| 352 | */ | ||
| 353 | spin_lock(&h->lock); | ||
| 354 | |||
| 355 | /* | ||
| 356 | * hold the lock for the bio list because we need | ||
| 357 | * to make sure the bio list is empty | ||
| 358 | */ | ||
| 359 | spin_lock(&rbio->bio_list_lock); | ||
| 360 | |||
| 361 | if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) { | ||
| 362 | list_del_init(&rbio->stripe_cache); | ||
| 363 | table->cache_size -= 1; | ||
| 364 | freeit = 1; | ||
| 365 | |||
| 366 | /* if the bio list isn't empty, this rbio is | ||
| 367 | * still involved in an IO. We take it out | ||
| 368 | * of the cache list, and drop the ref that | ||
| 369 | * was held for the list. | ||
| 370 | * | ||
| 371 | * If the bio_list was empty, we also remove | ||
| 372 | * the rbio from the hash_table, and drop | ||
| 373 | * the corresponding ref | ||
| 374 | */ | ||
| 375 | if (bio_list_empty(&rbio->bio_list)) { | ||
| 376 | if (!list_empty(&rbio->hash_list)) { | ||
| 377 | list_del_init(&rbio->hash_list); | ||
| 378 | atomic_dec(&rbio->refs); | ||
| 379 | BUG_ON(!list_empty(&rbio->plug_list)); | ||
| 380 | } | ||
| 381 | } | ||
| 382 | } | ||
| 383 | |||
| 384 | spin_unlock(&rbio->bio_list_lock); | ||
| 385 | spin_unlock(&h->lock); | ||
| 386 | |||
| 387 | if (freeit) | ||
| 388 | __free_raid_bio(rbio); | ||
| 389 | } | ||
| 390 | |||
| 391 | /* | ||
| 392 | * prune a given rbio from the cache | ||
| 393 | */ | ||
| 394 | static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio) | ||
| 395 | { | ||
| 396 | struct btrfs_stripe_hash_table *table; | ||
| 397 | unsigned long flags; | ||
| 398 | |||
| 399 | if (!test_bit(RBIO_CACHE_BIT, &rbio->flags)) | ||
| 400 | return; | ||
| 401 | |||
| 402 | table = rbio->fs_info->stripe_hash_table; | ||
| 403 | |||
| 404 | spin_lock_irqsave(&table->cache_lock, flags); | ||
| 405 | __remove_rbio_from_cache(rbio); | ||
| 406 | spin_unlock_irqrestore(&table->cache_lock, flags); | ||
| 407 | } | ||
| 408 | |||
| 409 | /* | ||
| 410 | * remove everything in the cache | ||
| 411 | */ | ||
| 412 | void btrfs_clear_rbio_cache(struct btrfs_fs_info *info) | ||
| 413 | { | ||
| 414 | struct btrfs_stripe_hash_table *table; | ||
| 415 | unsigned long flags; | ||
| 416 | struct btrfs_raid_bio *rbio; | ||
| 417 | |||
| 418 | table = info->stripe_hash_table; | ||
| 419 | |||
| 420 | spin_lock_irqsave(&table->cache_lock, flags); | ||
| 421 | while (!list_empty(&table->stripe_cache)) { | ||
| 422 | rbio = list_entry(table->stripe_cache.next, | ||
| 423 | struct btrfs_raid_bio, | ||
| 424 | stripe_cache); | ||
| 425 | __remove_rbio_from_cache(rbio); | ||
| 426 | } | ||
| 427 | spin_unlock_irqrestore(&table->cache_lock, flags); | ||
| 428 | } | ||
| 429 | |||
| 430 | /* | ||
| 431 | * remove all cached entries and free the hash table | ||
| 432 | * used by unmount | ||
| 433 | */ | ||
| 434 | void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info) | ||
| 435 | { | ||
| 436 | if (!info->stripe_hash_table) | ||
| 437 | return; | ||
| 438 | btrfs_clear_rbio_cache(info); | ||
| 439 | if (is_vmalloc_addr(info->stripe_hash_table)) | ||
| 440 | vfree(info->stripe_hash_table); | ||
| 441 | else | ||
| 442 | kfree(info->stripe_hash_table); | ||
| 443 | info->stripe_hash_table = NULL; | ||
| 444 | } | ||
| 445 | |||
| 446 | /* | ||
| 447 | * insert an rbio into the stripe cache. It | ||
| 448 | * must have already been prepared by calling | ||
| 449 | * cache_rbio_pages | ||
| 450 | * | ||
| 451 | * If this rbio was already cached, it gets | ||
| 452 | * moved to the front of the lru. | ||
| 453 | * | ||
| 454 | * If the size of the rbio cache is too big, we | ||
| 455 | * prune an item. | ||
| 456 | */ | ||
| 457 | static void cache_rbio(struct btrfs_raid_bio *rbio) | ||
| 458 | { | ||
| 459 | struct btrfs_stripe_hash_table *table; | ||
| 460 | unsigned long flags; | ||
| 461 | |||
| 462 | if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags)) | ||
| 463 | return; | ||
| 464 | |||
| 465 | table = rbio->fs_info->stripe_hash_table; | ||
| 466 | |||
| 467 | spin_lock_irqsave(&table->cache_lock, flags); | ||
| 468 | spin_lock(&rbio->bio_list_lock); | ||
| 469 | |||
| 470 | /* bump our ref if we were not in the list before */ | ||
| 471 | if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags)) | ||
| 472 | atomic_inc(&rbio->refs); | ||
| 473 | |||
| 474 | if (!list_empty(&rbio->stripe_cache)) { | ||
| 475 | list_move(&rbio->stripe_cache, &table->stripe_cache); | ||
| 476 | } else { | ||
| 477 | list_add(&rbio->stripe_cache, &table->stripe_cache); | ||
| 478 | table->cache_size += 1; | ||
| 479 | } | ||
| 480 | |||
| 481 | spin_unlock(&rbio->bio_list_lock); | ||
| 482 | |||
| 483 | if (table->cache_size > RBIO_CACHE_SIZE) { | ||
| 484 | struct btrfs_raid_bio *found; | ||
| 485 | |||
| 486 | found = list_entry(table->stripe_cache.prev, | ||
| 487 | struct btrfs_raid_bio, | ||
| 488 | stripe_cache); | ||
| 489 | |||
| 490 | if (found != rbio) | ||
| 491 | __remove_rbio_from_cache(found); | ||
| 492 | } | ||
| 493 | |||
| 494 | spin_unlock_irqrestore(&table->cache_lock, flags); | ||
| 495 | return; | ||
| 496 | } | ||
| 497 | |||
| 498 | /* | ||
| 499 | * helper function to run the xor_blocks api. It is only | ||
| 500 | * able to do MAX_XOR_BLOCKS at a time, so we need to | ||
| 501 | * loop through. | ||
| 502 | */ | ||
| 503 | static void run_xor(void **pages, int src_cnt, ssize_t len) | ||
| 504 | { | ||
| 505 | int src_off = 0; | ||
| 506 | int xor_src_cnt = 0; | ||
| 507 | void *dest = pages[src_cnt]; | ||
| 508 | |||
| 509 | while (src_cnt > 0) { | ||
| 510 | xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS); | ||
| 511 | xor_blocks(xor_src_cnt, len, dest, pages + src_off); | ||
| 512 | |||
| 513 | src_cnt -= xor_src_cnt; | ||
| 514 | src_off += xor_src_cnt; | ||
| 515 | } | ||
| 516 | } | ||
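run_xor() expects the destination buffer to sit just past the last source in the pages array, which is how the raid5 branch of finish_rmw() below drives it: seed the parity page with the first data page, then XOR in the remaining nr_data - 1 pages:

    /* computing P for one page column, as finish_rmw() does below */
    memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
    run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE);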
| 517 | |||
| 518 | /* | ||
| 519 | * returns true if the bio list inside this rbio | ||
| 520 | * covers an entire stripe (no rmw required). | ||
| 521 | * Must be called with the bio list lock held, or | ||
| 522 | * at a time when you know it is impossible to add | ||
| 523 | * new bios into the list | ||
| 524 | */ | ||
| 525 | static int __rbio_is_full(struct btrfs_raid_bio *rbio) | ||
| 526 | { | ||
| 527 | unsigned long size = rbio->bio_list_bytes; | ||
| 528 | int ret = 1; | ||
| 529 | |||
| 530 | if (size != rbio->nr_data * rbio->stripe_len) | ||
| 531 | ret = 0; | ||
| 532 | |||
| 533 | BUG_ON(size > rbio->nr_data * rbio->stripe_len); | ||
| 534 | return ret; | ||
| 535 | } | ||
| 536 | |||
| 537 | static int rbio_is_full(struct btrfs_raid_bio *rbio) | ||
| 538 | { | ||
| 539 | unsigned long flags; | ||
| 540 | int ret; | ||
| 541 | |||
| 542 | spin_lock_irqsave(&rbio->bio_list_lock, flags); | ||
| 543 | ret = __rbio_is_full(rbio); | ||
| 544 | spin_unlock_irqrestore(&rbio->bio_list_lock, flags); | ||
| 545 | return ret; | ||
| 546 | } | ||
| 547 | |||
| 548 | /* | ||
| 549 | * returns 1 if it is safe to merge two rbios together. | ||
| 550 | * The merging is safe if the two rbios correspond to | ||
| 551 | * the same stripe and if they are both going in the same | ||
| 552 | * direction (read vs write), and if neither one is | ||
| 553 | * locked for final IO | ||
| 554 | * | ||
| 555 | * The caller is responsible for locking such that | ||
| 556 | * rmw_locked is safe to test | ||
| 557 | */ | ||
| 558 | static int rbio_can_merge(struct btrfs_raid_bio *last, | ||
| 559 | struct btrfs_raid_bio *cur) | ||
| 560 | { | ||
| 561 | if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) || | ||
| 562 | test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) | ||
| 563 | return 0; | ||
| 564 | |||
| 565 | /* | ||
| 566 | * we can't merge with cached rbios, since the | ||
| 567 | * idea is that when we merge the destination | ||
| 568 | * rbio is going to run our IO for us. We can | ||
| 569 | * steal from cached rbio's though, other functions | ||
| 570 | * handle that. | ||
| 571 | */ | ||
| 572 | if (test_bit(RBIO_CACHE_BIT, &last->flags) || | ||
| 573 | test_bit(RBIO_CACHE_BIT, &cur->flags)) | ||
| 574 | return 0; | ||
| 575 | |||
| 576 | if (last->raid_map[0] != | ||
| 577 | cur->raid_map[0]) | ||
| 578 | return 0; | ||
| 579 | |||
| 580 | /* reads can't merge with writes */ | ||
| 581 | if (last->read_rebuild != | ||
| 582 | cur->read_rebuild) { | ||
| 583 | return 0; | ||
| 584 | } | ||
| 585 | |||
| 586 | return 1; | ||
| 587 | } | ||
| 588 | |||
| 589 | /* | ||
| 590 | * helper to index into the pstripe | ||
| 591 | */ | ||
| 592 | static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index) | ||
| 593 | { | ||
| 594 | index += (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT; | ||
| 595 | return rbio->stripe_pages[index]; | ||
| 596 | } | ||
| 597 | |||
| 598 | /* | ||
| 599 | * helper to index into the qstripe, returns null | ||
| 600 | * if there is no qstripe | ||
| 601 | */ | ||
| 602 | static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index) | ||
| 603 | { | ||
| 604 | if (rbio->nr_data + 1 == rbio->bbio->num_stripes) | ||
| 605 | return NULL; | ||
| 606 | |||
| 607 | index += ((rbio->nr_data + 1) * rbio->stripe_len) >> | ||
| 608 | PAGE_CACHE_SHIFT; | ||
| 609 | return rbio->stripe_pages[index]; | ||
| 610 | } | ||
| 611 | |||
| 612 | /* | ||
| 613 | * The first stripe in the table for a logical address | ||
| 614 | * has the lock. rbios are added in one of three ways: | ||
| 615 | * | ||
| 616 | * 1) Nobody has the stripe locked yet. The rbio is given | ||
| 617 | * the lock and 0 is returned. The caller must start the IO | ||
| 618 | * themselves. | ||
| 619 | * | ||
| 620 | * 2) Someone has the stripe locked, but we're able to merge | ||
| 621 | * with the lock owner. The rbio is freed and the IO will | ||
| 622 | * start automatically along with the existing rbio. 1 is returned. | ||
| 623 | * | ||
| 624 | * 3) Someone has the stripe locked, but we're not able to merge. | ||
| 625 | * The rbio is added to the lock owner's plug list, or merged into | ||
| 626 | * an rbio already on the plug list. When the lock owner unlocks, | ||
| 627 | * the next rbio on the list is run and the IO is started automatically. | ||
| 628 | * 1 is returned | ||
| 629 | * | ||
| 630 | * If we return 0, the caller still owns the rbio and must continue with | ||
| 631 | * IO submission. If we return 1, the caller must assume the rbio has | ||
| 632 | * already been freed. | ||
| 633 | */ | ||
| 634 | static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio) | ||
| 635 | { | ||
| 636 | int bucket = rbio_bucket(rbio); | ||
| 637 | struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket; | ||
| 638 | struct btrfs_raid_bio *cur; | ||
| 639 | struct btrfs_raid_bio *pending; | ||
| 640 | unsigned long flags; | ||
| 641 | DEFINE_WAIT(wait); | ||
| 642 | struct btrfs_raid_bio *freeit = NULL; | ||
| 643 | struct btrfs_raid_bio *cache_drop = NULL; | ||
| 644 | int ret = 0; | ||
| 645 | int walk = 0; | ||
| 646 | |||
| 647 | spin_lock_irqsave(&h->lock, flags); | ||
| 648 | list_for_each_entry(cur, &h->hash_list, hash_list) { | ||
| 649 | walk++; | ||
| 650 | if (cur->raid_map[0] == rbio->raid_map[0]) { | ||
| 651 | spin_lock(&cur->bio_list_lock); | ||
| 652 | |||
| 653 | /* can we steal this cached rbio's pages? */ | ||
| 654 | if (bio_list_empty(&cur->bio_list) && | ||
| 655 | list_empty(&cur->plug_list) && | ||
| 656 | test_bit(RBIO_CACHE_BIT, &cur->flags) && | ||
| 657 | !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) { | ||
| 658 | list_del_init(&cur->hash_list); | ||
| 659 | atomic_dec(&cur->refs); | ||
| 660 | |||
| 661 | steal_rbio(cur, rbio); | ||
| 662 | cache_drop = cur; | ||
| 663 | spin_unlock(&cur->bio_list_lock); | ||
| 664 | |||
| 665 | goto lockit; | ||
| 666 | } | ||
| 667 | |||
| 668 | /* can we merge into the lock owner? */ | ||
| 669 | if (rbio_can_merge(cur, rbio)) { | ||
| 670 | merge_rbio(cur, rbio); | ||
| 671 | spin_unlock(&cur->bio_list_lock); | ||
| 672 | freeit = rbio; | ||
| 673 | ret = 1; | ||
| 674 | goto out; | ||
| 675 | } | ||
| 676 | |||
| 677 | |||
| 678 | /* | ||
| 679 | * we couldn't merge with the running | ||
| 680 | * rbio, see if we can merge with the | ||
| 681 | * pending ones. We don't have to | ||
| 682 | * check for rmw_locked because there | ||
| 683 | * is no way they are inside finish_rmw | ||
| 684 | * right now | ||
| 685 | */ | ||
| 686 | list_for_each_entry(pending, &cur->plug_list, | ||
| 687 | plug_list) { | ||
| 688 | if (rbio_can_merge(pending, rbio)) { | ||
| 689 | merge_rbio(pending, rbio); | ||
| 690 | spin_unlock(&cur->bio_list_lock); | ||
| 691 | freeit = rbio; | ||
| 692 | ret = 1; | ||
| 693 | goto out; | ||
| 694 | } | ||
| 695 | } | ||
| 696 | |||
| 697 | /* no merging, put us on the tail of the plug list, | ||
| 698 | * our rbio will be started when the currently | ||
| 699 | * running rbio unlocks | ||
| 700 | */ | ||
| 701 | list_add_tail(&rbio->plug_list, &cur->plug_list); | ||
| 702 | spin_unlock(&cur->bio_list_lock); | ||
| 703 | ret = 1; | ||
| 704 | goto out; | ||
| 705 | } | ||
| 706 | } | ||
| 707 | lockit: | ||
| 708 | atomic_inc(&rbio->refs); | ||
| 709 | list_add(&rbio->hash_list, &h->hash_list); | ||
| 710 | out: | ||
| 711 | spin_unlock_irqrestore(&h->lock, flags); | ||
| 712 | if (cache_drop) | ||
| 713 | remove_rbio_from_cache(cache_drop); | ||
| 714 | if (freeit) | ||
| 715 | __free_raid_bio(freeit); | ||
| 716 | return ret; | ||
| 717 | } | ||
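Per the three cases documented above, a submitter treats any non-zero return as a complete hand-off of the rbio. A sketch of the expected calling pattern (the real submit paths appear later in this file):

    if (lock_stripe_add(rbio))
            return 0;       /* merged or queued; the lock owner runs the IO */

    /* returned 0: we hold the stripe lock and must start the IO ourselves */
    async_rmw_stripe(rbio);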
| 718 | |||
| 719 | /* | ||
| 720 | * called as rmw or parity rebuild is completed. If the plug list has more | ||
| 721 | * rbios waiting for this stripe, the next one on the list will be started | ||
| 722 | */ | ||
| 723 | static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) | ||
| 724 | { | ||
| 725 | int bucket; | ||
| 726 | struct btrfs_stripe_hash *h; | ||
| 727 | unsigned long flags; | ||
| 728 | int keep_cache = 0; | ||
| 729 | |||
| 730 | bucket = rbio_bucket(rbio); | ||
| 731 | h = rbio->fs_info->stripe_hash_table->table + bucket; | ||
| 732 | |||
| 733 | if (list_empty(&rbio->plug_list)) | ||
| 734 | cache_rbio(rbio); | ||
| 735 | |||
| 736 | spin_lock_irqsave(&h->lock, flags); | ||
| 737 | spin_lock(&rbio->bio_list_lock); | ||
| 738 | |||
| 739 | if (!list_empty(&rbio->hash_list)) { | ||
| 740 | /* | ||
| 741 | * if we're still cached and there is no other IO | ||
| 742 | * to perform, just leave this rbio here for others | ||
| 743 | * to steal from later | ||
| 744 | */ | ||
| 745 | if (list_empty(&rbio->plug_list) && | ||
| 746 | test_bit(RBIO_CACHE_BIT, &rbio->flags)) { | ||
| 747 | keep_cache = 1; | ||
| 748 | clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); | ||
| 749 | BUG_ON(!bio_list_empty(&rbio->bio_list)); | ||
| 750 | goto done; | ||
| 751 | } | ||
| 752 | |||
| 753 | list_del_init(&rbio->hash_list); | ||
| 754 | atomic_dec(&rbio->refs); | ||
| 755 | |||
| 756 | /* | ||
| 757 | * we use the plug list to hold all the rbios | ||
| 758 | * waiting for the chance to lock this stripe. | ||
| 759 | * hand the lock over to one of them. | ||
| 760 | */ | ||
| 761 | if (!list_empty(&rbio->plug_list)) { | ||
| 762 | struct btrfs_raid_bio *next; | ||
| 763 | struct list_head *head = rbio->plug_list.next; | ||
| 764 | |||
| 765 | next = list_entry(head, struct btrfs_raid_bio, | ||
| 766 | plug_list); | ||
| 767 | |||
| 768 | list_del_init(&rbio->plug_list); | ||
| 769 | |||
| 770 | list_add(&next->hash_list, &h->hash_list); | ||
| 771 | atomic_inc(&next->refs); | ||
| 772 | spin_unlock(&rbio->bio_list_lock); | ||
| 773 | spin_unlock_irqrestore(&h->lock, flags); | ||
| 774 | |||
| 775 | if (next->read_rebuild) | ||
| 776 | async_read_rebuild(next); | ||
| 777 | else { | ||
| 778 | steal_rbio(rbio, next); | ||
| 779 | async_rmw_stripe(next); | ||
| 780 | } | ||
| 781 | |||
| 782 | goto done_nolock; | ||
| 783 | } else if (waitqueue_active(&h->wait)) { | ||
| 784 | spin_unlock(&rbio->bio_list_lock); | ||
| 785 | spin_unlock_irqrestore(&h->lock, flags); | ||
| 786 | wake_up(&h->wait); | ||
| 787 | goto done_nolock; | ||
| 788 | } | ||
| 789 | } | ||
| 790 | done: | ||
| 791 | spin_unlock(&rbio->bio_list_lock); | ||
| 792 | spin_unlock_irqrestore(&h->lock, flags); | ||
| 793 | |||
| 794 | done_nolock: | ||
| 795 | if (!keep_cache) | ||
| 796 | remove_rbio_from_cache(rbio); | ||
| 797 | } | ||
| 798 | |||
| 799 | static void __free_raid_bio(struct btrfs_raid_bio *rbio) | ||
| 800 | { | ||
| 801 | int i; | ||
| 802 | |||
| 803 | WARN_ON(atomic_read(&rbio->refs) < 0); | ||
| 804 | if (!atomic_dec_and_test(&rbio->refs)) | ||
| 805 | return; | ||
| 806 | |||
| 807 | WARN_ON(!list_empty(&rbio->stripe_cache)); | ||
| 808 | WARN_ON(!list_empty(&rbio->hash_list)); | ||
| 809 | WARN_ON(!bio_list_empty(&rbio->bio_list)); | ||
| 810 | |||
| 811 | for (i = 0; i < rbio->nr_pages; i++) { | ||
| 812 | if (rbio->stripe_pages[i]) { | ||
| 813 | __free_page(rbio->stripe_pages[i]); | ||
| 814 | rbio->stripe_pages[i] = NULL; | ||
| 815 | } | ||
| 816 | } | ||
| 817 | kfree(rbio->raid_map); | ||
| 818 | kfree(rbio->bbio); | ||
| 819 | kfree(rbio); | ||
| 820 | } | ||
| 821 | |||
| 822 | static void free_raid_bio(struct btrfs_raid_bio *rbio) | ||
| 823 | { | ||
| 824 | unlock_stripe(rbio); | ||
| 825 | __free_raid_bio(rbio); | ||
| 826 | } | ||
| 827 | |||
| 828 | /* | ||
| 829 | * this frees the rbio and runs through all the bios in the | ||
| 830 | * bio_list and calls end_io on them | ||
| 831 | */ | ||
| 832 | static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate) | ||
| 833 | { | ||
| 834 | struct bio *cur = bio_list_get(&rbio->bio_list); | ||
| 835 | struct bio *next; | ||
| 836 | free_raid_bio(rbio); | ||
| 837 | |||
| 838 | while (cur) { | ||
| 839 | next = cur->bi_next; | ||
| 840 | cur->bi_next = NULL; | ||
| 841 | if (uptodate) | ||
| 842 | set_bit(BIO_UPTODATE, &cur->bi_flags); | ||
| 843 | bio_endio(cur, err); | ||
| 844 | cur = next; | ||
| 845 | } | ||
| 846 | } | ||
| 847 | |||
| 848 | /* | ||
| 849 | * end io function used by finish_rmw. When we finally | ||
| 850 | * get here, we've written a full stripe | ||
| 851 | */ | ||
| 852 | static void raid_write_end_io(struct bio *bio, int err) | ||
| 853 | { | ||
| 854 | struct btrfs_raid_bio *rbio = bio->bi_private; | ||
| 855 | |||
| 856 | if (err) | ||
| 857 | fail_bio_stripe(rbio, bio); | ||
| 858 | |||
| 859 | bio_put(bio); | ||
| 860 | |||
| 861 | if (!atomic_dec_and_test(&rbio->bbio->stripes_pending)) | ||
| 862 | return; | ||
| 863 | |||
| 864 | err = 0; | ||
| 865 | |||
| 866 | /* OK, we have written all the stripes we need to. */ | ||
| 867 | if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors) | ||
| 868 | err = -EIO; | ||
| 869 | |||
| 870 | rbio_orig_end_io(rbio, err, 0); | ||
| 871 | return; | ||
| 872 | } | ||
| 873 | |||
| 874 | /* | ||
| 875 | * the read/modify/write code wants to use the original bio for | ||
| 876 | * any pages it included, and then use the rbio for everything | ||
| 877 | * else. This function decides if a given index (stripe number) | ||
| 878 | * and page number in that stripe fall inside the original bio | ||
| 879 | * or the rbio. | ||
| 880 | * | ||
| 881 | * if you set bio_list_only, you'll get a NULL back for any ranges | ||
| 882 | * that are outside the bio_list | ||
| 883 | * | ||
| 884 | * This doesn't take any refs on anything, you get a bare page pointer | ||
| 885 | * and the caller must bump refs as required. | ||
| 886 | * | ||
| 887 | * You must call index_rbio_pages once before you can trust | ||
| 888 | * the answers from this function. | ||
| 889 | */ | ||
| 890 | static struct page *page_in_rbio(struct btrfs_raid_bio *rbio, | ||
| 891 | int index, int pagenr, int bio_list_only) | ||
| 892 | { | ||
| 893 | int chunk_page; | ||
| 894 | struct page *p = NULL; | ||
| 895 | |||
| 896 | chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr; | ||
| 897 | |||
| 898 | spin_lock_irq(&rbio->bio_list_lock); | ||
| 899 | p = rbio->bio_pages[chunk_page]; | ||
| 900 | spin_unlock_irq(&rbio->bio_list_lock); | ||
| 901 | |||
| 902 | if (p || bio_list_only) | ||
| 903 | return p; | ||
| 904 | |||
| 905 | return rbio->stripe_pages[chunk_page]; | ||
| 906 | } | ||
| 907 | |||
| 908 | /* | ||
| 909 | * number of pages we need for the entire stripe across all the | ||
| 910 | * drives | ||
| 911 | */ | ||
| 912 | static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes) | ||
| 913 | { | ||
| 914 | unsigned long nr = stripe_len * nr_stripes; | ||
| 915 | return (nr + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | ||
| 916 | } | ||
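A quick worked example of the page math: with the default 64 KiB stripe_len across three stripes (two data plus P) and 4 KiB pages,

    /* nr = 65536 * 3 = 196608 bytes
     * (196608 + 4095) >> 12 = 48 pages for the whole rbio */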
| 917 | |||
| 918 | /* | ||
| 919 | * allocation and initial setup for the btrfs_raid_bio. Note | ||
| 920 | * that this does not allocate any pages for rbio->stripe_pages. | ||
| 921 | */ | ||
| 922 | static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root, | ||
| 923 | struct btrfs_bio *bbio, u64 *raid_map, | ||
| 924 | u64 stripe_len) | ||
| 925 | { | ||
| 926 | struct btrfs_raid_bio *rbio; | ||
| 927 | int nr_data = 0; | ||
| 928 | int num_pages = rbio_nr_pages(stripe_len, bbio->num_stripes); | ||
| 929 | void *p; | ||
| 930 | |||
| 931 | rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2, | ||
| 932 | GFP_NOFS); | ||
| 933 | if (!rbio) { | ||
| 934 | kfree(raid_map); | ||
| 935 | kfree(bbio); | ||
| 936 | return ERR_PTR(-ENOMEM); | ||
| 937 | } | ||
| 938 | |||
| 939 | bio_list_init(&rbio->bio_list); | ||
| 940 | INIT_LIST_HEAD(&rbio->plug_list); | ||
| 941 | spin_lock_init(&rbio->bio_list_lock); | ||
| 942 | INIT_LIST_HEAD(&rbio->stripe_cache); | ||
| 943 | INIT_LIST_HEAD(&rbio->hash_list); | ||
| 944 | rbio->bbio = bbio; | ||
| 945 | rbio->raid_map = raid_map; | ||
| 946 | rbio->fs_info = root->fs_info; | ||
| 947 | rbio->stripe_len = stripe_len; | ||
| 948 | rbio->nr_pages = num_pages; | ||
| 949 | rbio->faila = -1; | ||
| 950 | rbio->failb = -1; | ||
| 951 | atomic_set(&rbio->refs, 1); | ||
| 952 | |||
| 953 | /* | ||
| 954 | * the stripe_pages and bio_pages array point to the extra | ||
| 955 | * memory we allocated past the end of the rbio | ||
| 956 | */ | ||
| 957 | p = rbio + 1; | ||
| 958 | rbio->stripe_pages = p; | ||
| 959 | rbio->bio_pages = p + sizeof(struct page *) * num_pages; | ||
| 960 | |||
| 961 | if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE) | ||
| 962 | nr_data = bbio->num_stripes - 2; | ||
| 963 | else | ||
| 964 | nr_data = bbio->num_stripes - 1; | ||
| 965 | |||
| 966 | rbio->nr_data = nr_data; | ||
| 967 | return rbio; | ||
| 968 | } | ||
| 969 | |||
| 970 | /* allocate pages for all the stripes in the bio, including parity */ | ||
| 971 | static int alloc_rbio_pages(struct btrfs_raid_bio *rbio) | ||
| 972 | { | ||
| 973 | int i; | ||
| 974 | struct page *page; | ||
| 975 | |||
| 976 | for (i = 0; i < rbio->nr_pages; i++) { | ||
| 977 | if (rbio->stripe_pages[i]) | ||
| 978 | continue; | ||
| 979 | page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); | ||
| 980 | if (!page) | ||
| 981 | return -ENOMEM; | ||
| 982 | rbio->stripe_pages[i] = page; | ||
| 983 | ClearPageUptodate(page); | ||
| 984 | } | ||
| 985 | return 0; | ||
| 986 | } | ||
| 987 | |||
| 988 | /* allocate pages for just the p/q stripes */ | ||
| 989 | static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio) | ||
| 990 | { | ||
| 991 | int i; | ||
| 992 | struct page *page; | ||
| 993 | |||
| 994 | i = (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT; | ||
| 995 | |||
| 996 | for (; i < rbio->nr_pages; i++) { | ||
| 997 | if (rbio->stripe_pages[i]) | ||
| 998 | continue; | ||
| 999 | page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); | ||
| 1000 | if (!page) | ||
| 1001 | return -ENOMEM; | ||
| 1002 | rbio->stripe_pages[i] = page; | ||
| 1003 | } | ||
| 1004 | return 0; | ||
| 1005 | } | ||
| 1006 | |||
| 1007 | /* | ||
| 1008 | * add a single page from a specific stripe into our list of bios for IO | ||
| 1009 | * this will try to merge into existing bios if possible, and returns | ||
| 1010 | * zero if all went well. | ||
| 1011 | */ | ||
| 1012 | int rbio_add_io_page(struct btrfs_raid_bio *rbio, | ||
| 1013 | struct bio_list *bio_list, | ||
| 1014 | struct page *page, | ||
| 1015 | int stripe_nr, | ||
| 1016 | unsigned long page_index, | ||
| 1017 | unsigned long bio_max_len) | ||
| 1018 | { | ||
| 1019 | struct bio *last = bio_list->tail; | ||
| 1020 | u64 last_end = 0; | ||
| 1021 | int ret; | ||
| 1022 | struct bio *bio; | ||
| 1023 | struct btrfs_bio_stripe *stripe; | ||
| 1024 | u64 disk_start; | ||
| 1025 | |||
| 1026 | stripe = &rbio->bbio->stripes[stripe_nr]; | ||
| 1027 | disk_start = stripe->physical + (page_index << PAGE_CACHE_SHIFT); | ||
| 1028 | |||
| 1029 | /* if the device is missing, just fail this stripe */ | ||
| 1030 | if (!stripe->dev->bdev) | ||
| 1031 | return fail_rbio_index(rbio, stripe_nr); | ||
| 1032 | |||
| 1033 | /* see if we can add this page onto our existing bio */ | ||
| 1034 | if (last) { | ||
| 1035 | last_end = (u64)last->bi_sector << 9; | ||
| 1036 | last_end += last->bi_size; | ||
| 1037 | |||
| 1038 | /* | ||
| 1039 | * we can't merge these if they are from different | ||
| 1040 | * devices or if they are not contiguous | ||
| 1041 | */ | ||
| 1042 | if (last_end == disk_start && stripe->dev->bdev && | ||
| 1043 | test_bit(BIO_UPTODATE, &last->bi_flags) && | ||
| 1044 | last->bi_bdev == stripe->dev->bdev) { | ||
| 1045 | ret = bio_add_page(last, page, PAGE_CACHE_SIZE, 0); | ||
| 1046 | if (ret == PAGE_CACHE_SIZE) | ||
| 1047 | return 0; | ||
| 1048 | } | ||
| 1049 | } | ||
| 1050 | |||
| 1051 | /* put a new bio on the list */ | ||
| 1052 | bio = bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT ?: 1); | ||
| 1053 | if (!bio) | ||
| 1054 | return -ENOMEM; | ||
| 1055 | |||
| 1056 | bio->bi_size = 0; | ||
| 1057 | bio->bi_bdev = stripe->dev->bdev; | ||
| 1058 | bio->bi_sector = disk_start >> 9; | ||
| 1059 | set_bit(BIO_UPTODATE, &bio->bi_flags); | ||
| 1060 | |||
| 1061 | bio_add_page(bio, page, PAGE_CACHE_SIZE, 0); | ||
| 1062 | bio_list_add(bio_list, bio); | ||
| 1063 | return 0; | ||
| 1064 | } | ||
| 1065 | |||
| 1066 | /* | ||
| 1067 | * while we're doing the read/modify/write cycle, we could | ||
| 1068 | * have errors in reading pages off the disk. This checks | ||
| 1069 | * for errors and if we're not able to read the page it'll | ||
| 1070 | * trigger parity reconstruction. The rmw will be finished | ||
| 1071 | * after we've reconstructed the failed stripes | ||
| 1072 | */ | ||
| 1073 | static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio) | ||
| 1074 | { | ||
| 1075 | if (rbio->faila >= 0 || rbio->failb >= 0) { | ||
| 1076 | BUG_ON(rbio->faila == rbio->bbio->num_stripes - 1); | ||
| 1077 | __raid56_parity_recover(rbio); | ||
| 1078 | } else { | ||
| 1079 | finish_rmw(rbio); | ||
| 1080 | } | ||
| 1081 | } | ||
| 1082 | |||
| 1083 | /* | ||
| 1084 | * these are just the pages from the rbio array, not from anything | ||
| 1085 | * the FS sent down to us | ||
| 1086 | */ | ||
| 1087 | static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, int page) | ||
| 1088 | { | ||
| 1089 | int index; | ||
| 1090 | index = stripe * (rbio->stripe_len >> PAGE_CACHE_SHIFT); | ||
| 1091 | index += page; | ||
| 1092 | return rbio->stripe_pages[index]; | ||
| 1093 | } | ||
| 1094 | |||
| 1095 | /* | ||
| 1096 | * helper function to walk our bio list and populate the bio_pages array with | ||
| 1097 | * the result. This seems expensive, but it is faster than constantly | ||
| 1098 | * searching through the bio list as we set up the IO in finish_rmw or stripe | ||
| 1099 | * reconstruction. | ||
| 1100 | * | ||
| 1101 | * This must be called before you trust the answers from page_in_rbio | ||
| 1102 | */ | ||
| 1103 | static void index_rbio_pages(struct btrfs_raid_bio *rbio) | ||
| 1104 | { | ||
| 1105 | struct bio *bio; | ||
| 1106 | u64 start; | ||
| 1107 | unsigned long stripe_offset; | ||
| 1108 | unsigned long page_index; | ||
| 1109 | struct page *p; | ||
| 1110 | int i; | ||
| 1111 | |||
| 1112 | spin_lock_irq(&rbio->bio_list_lock); | ||
| 1113 | bio_list_for_each(bio, &rbio->bio_list) { | ||
| 1114 | start = (u64)bio->bi_sector << 9; | ||
| 1115 | stripe_offset = start - rbio->raid_map[0]; | ||
| 1116 | page_index = stripe_offset >> PAGE_CACHE_SHIFT; | ||
| 1117 | |||
| 1118 | for (i = 0; i < bio->bi_vcnt; i++) { | ||
| 1119 | p = bio->bi_io_vec[i].bv_page; | ||
| 1120 | rbio->bio_pages[page_index + i] = p; | ||
| 1121 | } | ||
| 1122 | } | ||
| 1123 | spin_unlock_irq(&rbio->bio_list_lock); | ||
| 1124 | } | ||
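The indexing works because every bio spans page-aligned offsets from the start of the full stripe. For one bio (numbers are illustrative):

    /* raid_map[0] = 0x40000000 (1 GiB), bio starts 12 KiB in:
     *   start         = 0x40003000  (bi_sector << 9)
     *   stripe_offset = 0x3000
     *   page_index    = 0x3000 >> PAGE_CACHE_SHIFT = 3 */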
| 1125 | |||
| 1126 | /* | ||
| 1127 | * this is called from one of two situations. We either | ||
| 1128 | * have a full stripe from the higher layers, or we've read all | ||
| 1129 | * the missing bits off disk. | ||
| 1130 | * | ||
| 1131 | * This will calculate the parity and then send down any | ||
| 1132 | * changed blocks. | ||
| 1133 | */ | ||
| 1134 | static noinline void finish_rmw(struct btrfs_raid_bio *rbio) | ||
| 1135 | { | ||
| 1136 | struct btrfs_bio *bbio = rbio->bbio; | ||
| 1137 | void *pointers[bbio->num_stripes]; | ||
| 1138 | int stripe_len = rbio->stripe_len; | ||
| 1139 | int nr_data = rbio->nr_data; | ||
| 1140 | int stripe; | ||
| 1141 | int pagenr; | ||
| 1142 | int p_stripe = -1; | ||
| 1143 | int q_stripe = -1; | ||
| 1144 | struct bio_list bio_list; | ||
| 1145 | struct bio *bio; | ||
| 1146 | int pages_per_stripe = stripe_len >> PAGE_CACHE_SHIFT; | ||
| 1147 | int ret; | ||
| 1148 | |||
| 1149 | bio_list_init(&bio_list); | ||
| 1150 | |||
| 1151 | if (bbio->num_stripes - rbio->nr_data == 1) { | ||
| 1152 | p_stripe = bbio->num_stripes - 1; | ||
| 1153 | } else if (bbio->num_stripes - rbio->nr_data == 2) { | ||
| 1154 | p_stripe = bbio->num_stripes - 2; | ||
| 1155 | q_stripe = bbio->num_stripes - 1; | ||
| 1156 | } else { | ||
| 1157 | BUG(); | ||
| 1158 | } | ||
| 1159 | |||
| 1160 | /* at this point we either have a full stripe, | ||
| 1161 | * or we've read the full stripe from the drive. | ||
| 1162 | * recalculate the parity and write the new results. | ||
| 1163 | * | ||
| 1164 | * We're not allowed to add any new bios to the | ||
| 1165 | * bio list here, anyone else that wants to | ||
| 1166 | * change this stripe needs to do their own rmw. | ||
| 1167 | */ | ||
| 1168 | spin_lock_irq(&rbio->bio_list_lock); | ||
| 1169 | set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); | ||
| 1170 | spin_unlock_irq(&rbio->bio_list_lock); | ||
| 1171 | |||
| 1172 | atomic_set(&rbio->bbio->error, 0); | ||
| 1173 | |||
| 1174 | /* | ||
| 1175 | * now that we've set rmw_locked, run through the | ||
| 1176 | * bio list one last time and map the page pointers | ||
| 1177 | * | ||
| 1178 | * We don't cache full rbios because we're assuming | ||
| 1179 | * the higher layers are unlikely to use this area of | ||
| 1180 | * the disk again soon. If they do use it again, | ||
| 1181 | * hopefully they will send another full bio. | ||
| 1182 | */ | ||
| 1183 | index_rbio_pages(rbio); | ||
| 1184 | if (!rbio_is_full(rbio)) | ||
| 1185 | cache_rbio_pages(rbio); | ||
| 1186 | else | ||
| 1187 | clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); | ||
| 1188 | |||
| 1189 | for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) { | ||
| 1190 | struct page *p; | ||
| 1191 | /* first collect one page from each data stripe */ | ||
| 1192 | for (stripe = 0; stripe < nr_data; stripe++) { | ||
| 1193 | p = page_in_rbio(rbio, stripe, pagenr, 0); | ||
| 1194 | pointers[stripe] = kmap(p); | ||
| 1195 | } | ||
| 1196 | |||
| 1197 | /* then add the parity stripe */ | ||
| 1198 | p = rbio_pstripe_page(rbio, pagenr); | ||
| 1199 | SetPageUptodate(p); | ||
| 1200 | pointers[stripe++] = kmap(p); | ||
| 1201 | |||
| 1202 | if (q_stripe != -1) { | ||
| 1203 | |||
| 1204 | /* | ||
| 1205 | * raid6, add the qstripe and call the | ||
| 1206 | * library function to fill in our p/q | ||
| 1207 | */ | ||
| 1208 | p = rbio_qstripe_page(rbio, pagenr); | ||
| 1209 | SetPageUptodate(p); | ||
| 1210 | pointers[stripe++] = kmap(p); | ||
| 1211 | |||
| 1212 | raid6_call.gen_syndrome(bbio->num_stripes, PAGE_SIZE, | ||
| 1213 | pointers); | ||
| 1214 | } else { | ||
| 1215 | /* raid5 */ | ||
| 1216 | memcpy(pointers[nr_data], pointers[0], PAGE_SIZE); | ||
| 1217 | run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE); | ||
| 1218 | } | ||
| 1219 | |||
| 1220 | |||
| 1221 | for (stripe = 0; stripe < bbio->num_stripes; stripe++) | ||
| 1222 | kunmap(page_in_rbio(rbio, stripe, pagenr, 0)); | ||
| 1223 | } | ||
| 1224 | |||
| 1225 | /* | ||
| 1226 | * time to start writing. Make bios for everything from the | ||
| 1227 | * higher layers (the bio_list in our rbio) and our p/q. Ignore | ||
| 1228 | * everything else. | ||
| 1229 | */ | ||
| 1230 | for (stripe = 0; stripe < bbio->num_stripes; stripe++) { | ||
| 1231 | for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) { | ||
| 1232 | struct page *page; | ||
| 1233 | if (stripe < rbio->nr_data) { | ||
| 1234 | page = page_in_rbio(rbio, stripe, pagenr, 1); | ||
| 1235 | if (!page) | ||
| 1236 | continue; | ||
| 1237 | } else { | ||
| 1238 | page = rbio_stripe_page(rbio, stripe, pagenr); | ||
| 1239 | } | ||
| 1240 | |||
| 1241 | ret = rbio_add_io_page(rbio, &bio_list, | ||
| 1242 | page, stripe, pagenr, rbio->stripe_len); | ||
| 1243 | if (ret) | ||
| 1244 | goto cleanup; | ||
| 1245 | } | ||
| 1246 | } | ||
| 1247 | |||
| 1248 | atomic_set(&bbio->stripes_pending, bio_list_size(&bio_list)); | ||
| 1249 | BUG_ON(atomic_read(&bbio->stripes_pending) == 0); | ||
| 1250 | |||
| 1251 | while (1) { | ||
| 1252 | bio = bio_list_pop(&bio_list); | ||
| 1253 | if (!bio) | ||
| 1254 | break; | ||
| 1255 | |||
| 1256 | bio->bi_private = rbio; | ||
| 1257 | bio->bi_end_io = raid_write_end_io; | ||
| 1258 | BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags)); | ||
| 1259 | submit_bio(WRITE, bio); | ||
| 1260 | } | ||
| 1261 | return; | ||
| 1262 | |||
| 1263 | cleanup: | ||
| 1264 | rbio_orig_end_io(rbio, -EIO, 0); | ||
| 1265 | } | ||
| 1266 | |||
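An aside on the parity math above: for RAID5 the P stripe is just the byte-wise XOR of the data stripes, which is exactly what the memcpy plus run_xor pair computes, while the RAID6 branch hands all the pages to raid6_call.gen_syndrome() to produce P and Q together. A minimal user-space sketch of the XOR version and the recovery property it buys (plain C; buffer names and the 4K size are illustrative, not kernel API):

#include <assert.h>
#include <stdlib.h>
#include <string.h>

#define STRIPE_PAGE 4096	/* stand-in for PAGE_SIZE; 4K assumed */

static void xor_into(unsigned char *dst, const unsigned char *src, size_t n)
{
	size_t i;

	for (i = 0; i < n; i++)
		dst[i] ^= src[i];
}

int main(void)
{
	unsigned char *data[3];
	unsigned char parity[STRIPE_PAGE], rebuilt[STRIPE_PAGE];
	int i;

	for (i = 0; i < 3; i++) {
		data[i] = malloc(STRIPE_PAGE);
		memset(data[i], 0x11 * (i + 1), STRIPE_PAGE);
	}

	/* same shape as the memcpy + run_xor pair in finish_rmw() */
	memcpy(parity, data[0], STRIPE_PAGE);
	for (i = 1; i < 3; i++)
		xor_into(parity, data[i], STRIPE_PAGE);

	/* the recovery property: P XOR the survivors rebuilds the loss */
	memcpy(rebuilt, parity, STRIPE_PAGE);
	xor_into(rebuilt, data[0], STRIPE_PAGE);
	xor_into(rebuilt, data[2], STRIPE_PAGE);
	assert(memcmp(rebuilt, data[1], STRIPE_PAGE) == 0);

	for (i = 0; i < 3; i++)
		free(data[i]);
	return 0;
}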
| 1267 | /* | ||
| 1268 | * helper to find the stripe number for a given bio. Used to figure out which | ||
| 1269 | * stripe has failed. This expects the bio to correspond to a physical disk, | ||
| 1270 | * so it looks up based on physical sector numbers. | ||
| 1271 | */ | ||
| 1272 | static int find_bio_stripe(struct btrfs_raid_bio *rbio, | ||
| 1273 | struct bio *bio) | ||
| 1274 | { | ||
| 1275 | u64 physical = bio->bi_sector; | ||
| 1276 | u64 stripe_start; | ||
| 1277 | int i; | ||
| 1278 | struct btrfs_bio_stripe *stripe; | ||
| 1279 | |||
| 1280 | physical <<= 9; | ||
| 1281 | |||
| 1282 | for (i = 0; i < rbio->bbio->num_stripes; i++) { | ||
| 1283 | stripe = &rbio->bbio->stripes[i]; | ||
| 1284 | stripe_start = stripe->physical; | ||
| 1285 | if (physical >= stripe_start && | ||
| 1286 | physical < stripe_start + rbio->stripe_len) { | ||
| 1287 | return i; | ||
| 1288 | } | ||
| 1289 | } | ||
| 1290 | return -1; | ||
| 1291 | } | ||
| 1292 | |||
| 1293 | /* | ||
| 1294 | * helper to find the stripe number for a given | ||
| 1295 | * bio (before mapping). Used to figure out which stripe has | ||
| 1296 | * failed. This looks up based on logical block numbers. | ||
| 1297 | */ | ||
| 1298 | static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio, | ||
| 1299 | struct bio *bio) | ||
| 1300 | { | ||
| 1301 | u64 logical = bio->bi_sector; | ||
| 1302 | u64 stripe_start; | ||
| 1303 | int i; | ||
| 1304 | |||
| 1305 | logical <<= 9; | ||
| 1306 | |||
| 1307 | for (i = 0; i < rbio->nr_data; i++) { | ||
| 1308 | stripe_start = rbio->raid_map[i]; | ||
| 1309 | if (logical >= stripe_start && | ||
| 1310 | logical < stripe_start + rbio->stripe_len) { | ||
| 1311 | return i; | ||
| 1312 | } | ||
| 1313 | } | ||
| 1314 | return -1; | ||
| 1315 | } | ||
| 1316 | |||
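Both lookups above do the same two steps: shift bi_sector left by 9 to turn 512-byte sectors into a byte address, then test half-open containment in [stripe_start, stripe_start + stripe_len). A tiny standalone check of the arithmetic (all values hypothetical):

#include <assert.h>
#include <stdint.h>

static int addr_in_stripe(uint64_t addr, uint64_t start, uint64_t len)
{
	return addr >= start && addr < start + len;
}

int main(void)
{
	uint64_t sector = 2048;		/* bi_sector counts 512-byte units */
	uint64_t addr = sector << 9;	/* 2048 * 512 = 1 MiB */

	assert(addr == 1048576);
	assert(addr_in_stripe(addr, 1048576, 65536));	/* first byte of stripe */
	assert(!addr_in_stripe(addr, 1048576 + 65536, 65536));
	return 0;
}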
| 1317 | /* | ||
| 1318 | * returns -EIO if we had too many failures | ||
| 1319 | */ | ||
| 1320 | static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed) | ||
| 1321 | { | ||
| 1322 | unsigned long flags; | ||
| 1323 | int ret = 0; | ||
| 1324 | |||
| 1325 | spin_lock_irqsave(&rbio->bio_list_lock, flags); | ||
| 1326 | |||
| 1327 | /* we already know this stripe is bad, move on */ | ||
| 1328 | if (rbio->faila == failed || rbio->failb == failed) | ||
| 1329 | goto out; | ||
| 1330 | |||
| 1331 | if (rbio->faila == -1) { | ||
| 1332 | /* first failure on this rbio */ | ||
| 1333 | rbio->faila = failed; | ||
| 1334 | atomic_inc(&rbio->bbio->error); | ||
| 1335 | } else if (rbio->failb == -1) { | ||
| 1336 | /* second failure on this rbio */ | ||
| 1337 | rbio->failb = failed; | ||
| 1338 | atomic_inc(&rbio->bbio->error); | ||
| 1339 | } else { | ||
| 1340 | ret = -EIO; | ||
| 1341 | } | ||
| 1342 | out: | ||
| 1343 | spin_unlock_irqrestore(&rbio->bio_list_lock, flags); | ||
| 1344 | |||
| 1345 | return ret; | ||
| 1346 | } | ||
| 1347 | |||
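fail_rbio_index() caps the damage at two distinct stripes per rbio, which is all P (one loss) and P+Q (two losses) can reconstruct; a third distinct failure is reported as -EIO. The same bookkeeping outside the kernel, minus the locking (names illustrative):

#include <assert.h>
#include <errno.h>

struct fail_state {
	int faila;	/* -1 means unused */
	int failb;
};

static int record_failure(struct fail_state *s, int stripe)
{
	if (s->faila == stripe || s->failb == stripe)
		return 0;		/* already known bad */
	if (s->faila == -1)
		s->faila = stripe;
	else if (s->failb == -1)
		s->failb = stripe;
	else
		return -EIO;		/* third distinct failure: unrecoverable */
	return 0;
}

int main(void)
{
	struct fail_state s = { -1, -1 };

	assert(record_failure(&s, 2) == 0);
	assert(record_failure(&s, 2) == 0);	/* duplicate is harmless */
	assert(record_failure(&s, 4) == 0);
	assert(record_failure(&s, 5) == -EIO);
	return 0;
}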
| 1348 | /* | ||
| 1349 | * helper to fail a stripe based on a physical disk | ||
| 1350 | * bio. | ||
| 1351 | */ | ||
| 1352 | static int fail_bio_stripe(struct btrfs_raid_bio *rbio, | ||
| 1353 | struct bio *bio) | ||
| 1354 | { | ||
| 1355 | int failed = find_bio_stripe(rbio, bio); | ||
| 1356 | |||
| 1357 | if (failed < 0) | ||
| 1358 | return -EIO; | ||
| 1359 | |||
| 1360 | return fail_rbio_index(rbio, failed); | ||
| 1361 | } | ||
| 1362 | |||
| 1363 | /* | ||
| 1364 | * this sets each page in the bio uptodate. It should only be used on private | ||
| 1365 | * rbio pages, nothing that comes in from the higher layers | ||
| 1366 | */ | ||
| 1367 | static void set_bio_pages_uptodate(struct bio *bio) | ||
| 1368 | { | ||
| 1369 | int i; | ||
| 1370 | struct page *p; | ||
| 1371 | |||
| 1372 | for (i = 0; i < bio->bi_vcnt; i++) { | ||
| 1373 | p = bio->bi_io_vec[i].bv_page; | ||
| 1374 | SetPageUptodate(p); | ||
| 1375 | } | ||
| 1376 | } | ||
| 1377 | |||
| 1378 | /* | ||
| 1379 | * end io for the read phase of the rmw cycle. All the bios here are physical | ||
| 1380 | * stripe bios we've read from the disk so we can recalculate the parity of the | ||
| 1381 | * stripe. | ||
| 1382 | * | ||
| 1383 | * This will usually kick off finish_rmw once all the bios are read in, but it | ||
| 1384 | * may trigger parity reconstruction if we had any errors along the way | ||
| 1385 | */ | ||
| 1386 | static void raid_rmw_end_io(struct bio *bio, int err) | ||
| 1387 | { | ||
| 1388 | struct btrfs_raid_bio *rbio = bio->bi_private; | ||
| 1389 | |||
| 1390 | if (err) | ||
| 1391 | fail_bio_stripe(rbio, bio); | ||
| 1392 | else | ||
| 1393 | set_bio_pages_uptodate(bio); | ||
| 1394 | |||
| 1395 | bio_put(bio); | ||
| 1396 | |||
| 1397 | if (!atomic_dec_and_test(&rbio->bbio->stripes_pending)) | ||
| 1398 | return; | ||
| 1399 | |||
| 1400 | err = 0; | ||
| 1401 | if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors) | ||
| 1402 | goto cleanup; | ||
| 1403 | |||
| 1404 | /* | ||
| 1405 | * this will normally call finish_rmw to start our write | ||
| 1406 | * but if there are any failed stripes we'll reconstruct | ||
| 1407 | * from parity first | ||
| 1408 | */ | ||
| 1409 | validate_rbio_for_rmw(rbio); | ||
| 1410 | return; | ||
| 1411 | |||
| 1412 | cleanup: | ||
| 1413 | |||
| 1414 | rbio_orig_end_io(rbio, -EIO, 0); | ||
| 1415 | } | ||
| 1416 | |||
| 1417 | static void async_rmw_stripe(struct btrfs_raid_bio *rbio) | ||
| 1418 | { | ||
| 1419 | rbio->work.flags = 0; | ||
| 1420 | rbio->work.func = rmw_work; | ||
| 1421 | |||
| 1422 | btrfs_queue_worker(&rbio->fs_info->rmw_workers, | ||
| 1423 | &rbio->work); | ||
| 1424 | } | ||
| 1425 | |||
| 1426 | static void async_read_rebuild(struct btrfs_raid_bio *rbio) | ||
| 1427 | { | ||
| 1428 | rbio->work.flags = 0; | ||
| 1429 | rbio->work.func = read_rebuild_work; | ||
| 1430 | |||
| 1431 | btrfs_queue_worker(&rbio->fs_info->rmw_workers, | ||
| 1432 | &rbio->work); | ||
| 1433 | } | ||
| 1434 | |||
| 1435 | /* | ||
| 1436 | * the stripe must be locked by the caller. It will | ||
| 1437 | * unlock after all the writes are done | ||
| 1438 | */ | ||
| 1439 | static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) | ||
| 1440 | { | ||
| 1441 | int bios_to_read = 0; | ||
| 1442 | struct btrfs_bio *bbio = rbio->bbio; | ||
| 1443 | struct bio_list bio_list; | ||
| 1444 | int ret; | ||
| 1445 | int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | ||
| 1446 | int pagenr; | ||
| 1447 | int stripe; | ||
| 1448 | struct bio *bio; | ||
| 1449 | |||
| 1450 | bio_list_init(&bio_list); | ||
| 1451 | |||
| 1452 | ret = alloc_rbio_pages(rbio); | ||
| 1453 | if (ret) | ||
| 1454 | goto cleanup; | ||
| 1455 | |||
| 1456 | index_rbio_pages(rbio); | ||
| 1457 | |||
| 1458 | atomic_set(&rbio->bbio->error, 0); | ||
| 1459 | /* | ||
| 1460 | * build a list of bios to read all the missing parts of this | ||
| 1461 | * stripe | ||
| 1462 | */ | ||
| 1463 | for (stripe = 0; stripe < rbio->nr_data; stripe++) { | ||
| 1464 | for (pagenr = 0; pagenr < nr_pages; pagenr++) { | ||
| 1465 | struct page *page; | ||
| 1466 | /* | ||
| 1467 | * we want to find all the pages missing from | ||
| 1468 | * the rbio and read them from the disk. If | ||
| 1469 | * page_in_rbio finds a page in the bio list | ||
| 1470 | * we don't need to read it off the stripe. | ||
| 1471 | */ | ||
| 1472 | page = page_in_rbio(rbio, stripe, pagenr, 1); | ||
| 1473 | if (page) | ||
| 1474 | continue; | ||
| 1475 | |||
| 1476 | page = rbio_stripe_page(rbio, stripe, pagenr); | ||
| 1477 | /* | ||
| 1478 | * the bio cache may have handed us an uptodate | ||
| 1479 | * page. If so, be happy and use it | ||
| 1480 | */ | ||
| 1481 | if (PageUptodate(page)) | ||
| 1482 | continue; | ||
| 1483 | |||
| 1484 | ret = rbio_add_io_page(rbio, &bio_list, page, | ||
| 1485 | stripe, pagenr, rbio->stripe_len); | ||
| 1486 | if (ret) | ||
| 1487 | goto cleanup; | ||
| 1488 | } | ||
| 1489 | } | ||
| 1490 | |||
| 1491 | bios_to_read = bio_list_size(&bio_list); | ||
| 1492 | if (!bios_to_read) { | ||
| 1493 | /* | ||
| 1494 | * this can happen if others have merged with | ||
| 1495 | * us; it means there is nothing left to read. | ||
| 1496 | * But if there are missing devices it may not be | ||
| 1497 | * safe to do the full stripe write yet. | ||
| 1498 | */ | ||
| 1499 | goto finish; | ||
| 1500 | } | ||
| 1501 | |||
| 1502 | /* | ||
| 1503 | * the bbio may be freed once we submit the last bio. Make sure | ||
| 1504 | * not to touch it after that | ||
| 1505 | */ | ||
| 1506 | atomic_set(&bbio->stripes_pending, bios_to_read); | ||
| 1507 | while (1) { | ||
| 1508 | bio = bio_list_pop(&bio_list); | ||
| 1509 | if (!bio) | ||
| 1510 | break; | ||
| 1511 | |||
| 1512 | bio->bi_private = rbio; | ||
| 1513 | bio->bi_end_io = raid_rmw_end_io; | ||
| 1514 | |||
| 1515 | btrfs_bio_wq_end_io(rbio->fs_info, bio, | ||
| 1516 | BTRFS_WQ_ENDIO_RAID56); | ||
| 1517 | |||
| 1518 | BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags)); | ||
| 1519 | submit_bio(READ, bio); | ||
| 1520 | } | ||
| 1521 | /* the actual write will happen once the reads are done */ | ||
| 1522 | return 0; | ||
| 1523 | |||
| 1524 | cleanup: | ||
| 1525 | rbio_orig_end_io(rbio, -EIO, 0); | ||
| 1526 | return -EIO; | ||
| 1527 | |||
| 1528 | finish: | ||
| 1529 | validate_rbio_for_rmw(rbio); | ||
| 1530 | return 0; | ||
| 1531 | } | ||
| 1532 | |||
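One idiom in raid56_rmw_stripe() worth spelling out: nr_pages is (stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT, the usual round-up division by a power of two. A quick check, assuming 4K pages:

#include <assert.h>
#include <stdint.h>

#define PG_SHIFT 12
#define PG_SIZE  (1UL << PG_SHIFT)	/* 4096 */

int main(void)
{
	uint64_t stripe_len = 65536;	/* exact multiple: 16 pages */

	assert(((stripe_len + PG_SIZE - 1) >> PG_SHIFT) == 16);

	stripe_len = 65537;		/* one byte over rounds up to 17 */
	assert(((stripe_len + PG_SIZE - 1) >> PG_SHIFT) == 17);
	return 0;
}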
| 1533 | /* | ||
| 1534 | * if the upper layers pass in a full stripe, we thank them by only allocating | ||
| 1535 | * enough pages to hold the parity, and sending it all down quickly. | ||
| 1536 | */ | ||
| 1537 | static int full_stripe_write(struct btrfs_raid_bio *rbio) | ||
| 1538 | { | ||
| 1539 | int ret; | ||
| 1540 | |||
| 1541 | ret = alloc_rbio_parity_pages(rbio); | ||
| 1542 | if (ret) | ||
| 1543 | return ret; | ||
| 1544 | |||
| 1545 | ret = lock_stripe_add(rbio); | ||
| 1546 | if (ret == 0) | ||
| 1547 | finish_rmw(rbio); | ||
| 1548 | return 0; | ||
| 1549 | } | ||
| 1550 | |||
| 1551 | /* | ||
| 1552 | * partial stripe writes get handed over to async helpers. | ||
| 1553 | * We're really hoping to merge a few more writes into this | ||
| 1554 | * rbio before calculating new parity | ||
| 1555 | */ | ||
| 1556 | static int partial_stripe_write(struct btrfs_raid_bio *rbio) | ||
| 1557 | { | ||
| 1558 | int ret; | ||
| 1559 | |||
| 1560 | ret = lock_stripe_add(rbio); | ||
| 1561 | if (ret == 0) | ||
| 1562 | async_rmw_stripe(rbio); | ||
| 1563 | return 0; | ||
| 1564 | } | ||
| 1565 | |||
| 1566 | /* | ||
| 1567 | * sometimes while we were reading from the drive to | ||
| 1568 | * recalculate parity, enough new bios come in to create | ||
| 1569 | * a full stripe. So we do a check here to see if we can | ||
| 1570 | * go directly to finish_rmw | ||
| 1571 | */ | ||
| 1572 | static int __raid56_parity_write(struct btrfs_raid_bio *rbio) | ||
| 1573 | { | ||
| 1574 | /* head off into rmw land if we don't have a full stripe */ | ||
| 1575 | if (!rbio_is_full(rbio)) | ||
| 1576 | return partial_stripe_write(rbio); | ||
| 1577 | return full_stripe_write(rbio); | ||
| 1578 | } | ||
| 1579 | |||
| 1580 | /* | ||
| 1581 | * We use plugging call backs to collect full stripes. | ||
| 1582 | * Any time we get a partial stripe write while plugged | ||
| 1583 | * we collect it into a list. When the unplug comes down, | ||
| 1584 | * we sort the list by logical block number and merge | ||
| 1585 | * everything we can into the same rbios | ||
| 1586 | */ | ||
| 1587 | struct btrfs_plug_cb { | ||
| 1588 | struct blk_plug_cb cb; | ||
| 1589 | struct btrfs_fs_info *info; | ||
| 1590 | struct list_head rbio_list; | ||
| 1591 | struct btrfs_work work; | ||
| 1592 | }; | ||
| 1593 | |||
| 1594 | /* | ||
| 1595 | * rbios on the plug list are sorted for easier merging. | ||
| 1596 | */ | ||
| 1597 | static int plug_cmp(void *priv, struct list_head *a, struct list_head *b) | ||
| 1598 | { | ||
| 1599 | struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio, | ||
| 1600 | plug_list); | ||
| 1601 | struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio, | ||
| 1602 | plug_list); | ||
| 1603 | u64 a_sector = ra->bio_list.head->bi_sector; | ||
| 1604 | u64 b_sector = rb->bio_list.head->bi_sector; | ||
| 1605 | |||
| 1606 | if (a_sector < b_sector) | ||
| 1607 | return -1; | ||
| 1608 | if (a_sector > b_sector) | ||
| 1609 | return 1; | ||
| 1610 | return 0; | ||
| 1611 | } | ||
| 1612 | |||
| 1613 | static void run_plug(struct btrfs_plug_cb *plug) | ||
| 1614 | { | ||
| 1615 | struct btrfs_raid_bio *cur; | ||
| 1616 | struct btrfs_raid_bio *last = NULL; | ||
| 1617 | |||
| 1618 | /* | ||
| 1619 | * sort our plug list then try to merge | ||
| 1620 | * everything we can in hopes of creating full | ||
| 1621 | * stripes. | ||
| 1622 | */ | ||
| 1623 | list_sort(NULL, &plug->rbio_list, plug_cmp); | ||
| 1624 | while (!list_empty(&plug->rbio_list)) { | ||
| 1625 | cur = list_entry(plug->rbio_list.next, | ||
| 1626 | struct btrfs_raid_bio, plug_list); | ||
| 1627 | list_del_init(&cur->plug_list); | ||
| 1628 | |||
| 1629 | if (rbio_is_full(cur)) { | ||
| 1630 | /* we have a full stripe, send it down */ | ||
| 1631 | full_stripe_write(cur); | ||
| 1632 | continue; | ||
| 1633 | } | ||
| 1634 | if (last) { | ||
| 1635 | if (rbio_can_merge(last, cur)) { | ||
| 1636 | merge_rbio(last, cur); | ||
| 1637 | __free_raid_bio(cur); | ||
| 1638 | continue; | ||
| 1639 | |||
| 1640 | } | ||
| 1641 | __raid56_parity_write(last); | ||
| 1642 | } | ||
| 1643 | last = cur; | ||
| 1644 | } | ||
| 1645 | if (last) { | ||
| 1646 | __raid56_parity_write(last); | ||
| 1647 | } | ||
| 1648 | kfree(plug); | ||
| 1649 | } | ||
| 1650 | |||
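run_plug() leans on the sort to make merging a single linear pass: once rbios are ordered by starting sector, only neighbours can possibly merge, so one trailing "last" cursor is enough. The same sort-then-scan pattern on a plain array, with qsort standing in for list_sort and a toy adjacency test standing in for rbio_can_merge():

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

struct req { uint64_t start, len; };

static int req_cmp(const void *a, const void *b)
{
	const struct req *ra = a, *rb = b;

	if (ra->start < rb->start)
		return -1;
	return ra->start > rb->start;
}

int main(void)
{
	struct req reqs[] = { {200, 50}, {0, 100}, {100, 100} };
	int i, merged = 0;

	qsort(reqs, 3, sizeof(reqs[0]), req_cmp);
	/* one pass: in sorted order only neighbours can be adjacent */
	for (i = 1; i < 3; i++)
		if (reqs[i - 1].start + reqs[i - 1].len == reqs[i].start)
			merged++;
	assert(merged == 2);	/* 0..100, 100..200, 200..250 chain up */
	return 0;
}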
| 1651 | /* | ||
| 1652 | * if the unplug comes from schedule, we have to push the | ||
| 1653 | * work off to a helper thread | ||
| 1654 | */ | ||
| 1655 | static void unplug_work(struct btrfs_work *work) | ||
| 1656 | { | ||
| 1657 | struct btrfs_plug_cb *plug; | ||
| 1658 | plug = container_of(work, struct btrfs_plug_cb, work); | ||
| 1659 | run_plug(plug); | ||
| 1660 | } | ||
| 1661 | |||
| 1662 | static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule) | ||
| 1663 | { | ||
| 1664 | struct btrfs_plug_cb *plug; | ||
| 1665 | plug = container_of(cb, struct btrfs_plug_cb, cb); | ||
| 1666 | |||
| 1667 | if (from_schedule) { | ||
| 1668 | plug->work.flags = 0; | ||
| 1669 | plug->work.func = unplug_work; | ||
| 1670 | btrfs_queue_worker(&plug->info->rmw_workers, | ||
| 1671 | &plug->work); | ||
| 1672 | return; | ||
| 1673 | } | ||
| 1674 | run_plug(plug); | ||
| 1675 | } | ||
| 1676 | |||
| 1677 | /* | ||
| 1678 | * our main entry point for writes from the rest of the FS. | ||
| 1679 | */ | ||
| 1680 | int raid56_parity_write(struct btrfs_root *root, struct bio *bio, | ||
| 1681 | struct btrfs_bio *bbio, u64 *raid_map, | ||
| 1682 | u64 stripe_len) | ||
| 1683 | { | ||
| 1684 | struct btrfs_raid_bio *rbio; | ||
| 1685 | struct btrfs_plug_cb *plug = NULL; | ||
| 1686 | struct blk_plug_cb *cb; | ||
| 1687 | |||
| 1688 | rbio = alloc_rbio(root, bbio, raid_map, stripe_len); | ||
| 1689 | if (IS_ERR(rbio)) { | ||
| 1690 | kfree(raid_map); | ||
| 1691 | kfree(bbio); | ||
| 1692 | return PTR_ERR(rbio); | ||
| 1693 | } | ||
| 1694 | bio_list_add(&rbio->bio_list, bio); | ||
| 1695 | rbio->bio_list_bytes = bio->bi_size; | ||
| 1696 | |||
| 1697 | /* | ||
| 1698 | * don't plug on full rbios, just get them out the door | ||
| 1699 | * as quickly as we can | ||
| 1700 | */ | ||
| 1701 | if (rbio_is_full(rbio)) | ||
| 1702 | return full_stripe_write(rbio); | ||
| 1703 | |||
| 1704 | cb = blk_check_plugged(btrfs_raid_unplug, root->fs_info, | ||
| 1705 | sizeof(*plug)); | ||
| 1706 | if (cb) { | ||
| 1707 | plug = container_of(cb, struct btrfs_plug_cb, cb); | ||
| 1708 | if (!plug->info) { | ||
| 1709 | plug->info = root->fs_info; | ||
| 1710 | INIT_LIST_HEAD(&plug->rbio_list); | ||
| 1711 | } | ||
| 1712 | list_add_tail(&rbio->plug_list, &plug->rbio_list); | ||
| 1713 | } else { | ||
| 1714 | return __raid56_parity_write(rbio); | ||
| 1715 | } | ||
| 1716 | return 0; | ||
| 1717 | } | ||
| 1718 | |||
| 1719 | /* | ||
| 1720 | * all parity reconstruction happens here. We've read in everything | ||
| 1721 | * we can find from the drives and this does the heavy lifting of | ||
| 1722 | * sorting the good from the bad. | ||
| 1723 | */ | ||
| 1724 | static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) | ||
| 1725 | { | ||
| 1726 | int pagenr, stripe; | ||
| 1727 | void **pointers; | ||
| 1728 | int faila = -1, failb = -1; | ||
| 1729 | int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | ||
| 1730 | struct page *page; | ||
| 1731 | int err; | ||
| 1732 | int i; | ||
| 1733 | |||
| 1734 | pointers = kzalloc(rbio->bbio->num_stripes * sizeof(void *), | ||
| 1735 | GFP_NOFS); | ||
| 1736 | if (!pointers) { | ||
| 1737 | err = -ENOMEM; | ||
| 1738 | goto cleanup_io; | ||
| 1739 | } | ||
| 1740 | |||
| 1741 | faila = rbio->faila; | ||
| 1742 | failb = rbio->failb; | ||
| 1743 | |||
| 1744 | if (rbio->read_rebuild) { | ||
| 1745 | spin_lock_irq(&rbio->bio_list_lock); | ||
| 1746 | set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); | ||
| 1747 | spin_unlock_irq(&rbio->bio_list_lock); | ||
| 1748 | } | ||
| 1749 | |||
| 1750 | index_rbio_pages(rbio); | ||
| 1751 | |||
| 1752 | for (pagenr = 0; pagenr < nr_pages; pagenr++) { | ||
| 1753 | /* setup our array of pointers with pages | ||
| 1754 | * from each stripe | ||
| 1755 | */ | ||
| 1756 | for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) { | ||
| 1757 | /* | ||
| 1758 | * if we're rebuilding a read, we have to use | ||
| 1759 | * pages from the bio list | ||
| 1760 | */ | ||
| 1761 | if (rbio->read_rebuild && | ||
| 1762 | (stripe == faila || stripe == failb)) { | ||
| 1763 | page = page_in_rbio(rbio, stripe, pagenr, 0); | ||
| 1764 | } else { | ||
| 1765 | page = rbio_stripe_page(rbio, stripe, pagenr); | ||
| 1766 | } | ||
| 1767 | pointers[stripe] = kmap(page); | ||
| 1768 | } | ||
| 1769 | |||
| 1770 | /* all raid6 handling here */ | ||
| 1771 | if (rbio->raid_map[rbio->bbio->num_stripes - 1] == | ||
| 1772 | RAID6_Q_STRIPE) { | ||
| 1773 | |||
| 1774 | /* | ||
| 1775 | * single failure, rebuild from parity raid5 | ||
| 1776 | * style | ||
| 1777 | */ | ||
| 1778 | if (failb < 0) { | ||
| 1779 | if (faila == rbio->nr_data) { | ||
| 1780 | /* | ||
| 1781 | * Just the P stripe has failed, without | ||
| 1782 | * a bad data or Q stripe. | ||
| 1783 | * TODO, we should redo the xor here. | ||
| 1784 | */ | ||
| 1785 | err = -EIO; | ||
| 1786 | goto cleanup; | ||
| 1787 | } | ||
| 1788 | /* | ||
| 1789 | * a single failure in raid6 is rebuilt | ||
| 1790 | * in the pstripe code below | ||
| 1791 | */ | ||
| 1792 | goto pstripe; | ||
| 1793 | } | ||
| 1794 | |||
| 1795 | /* make sure our ps and qs are in order */ | ||
| 1796 | if (faila > failb) { | ||
| 1797 | int tmp = failb; | ||
| 1798 | failb = faila; | ||
| 1799 | faila = tmp; | ||
| 1800 | } | ||
| 1801 | |||
| 1802 | /* if the Q stripe failed, do a P stripe reconstruction | ||
| 1803 | * from the xors. | ||
| 1804 | * If both the Q stripe and the P stripe failed, we're | ||
| 1805 | * here due to a crc mismatch and we can't give them the | ||
| 1806 | * data they want | ||
| 1807 | */ | ||
| 1808 | if (rbio->raid_map[failb] == RAID6_Q_STRIPE) { | ||
| 1809 | if (rbio->raid_map[faila] == RAID5_P_STRIPE) { | ||
| 1810 | err = -EIO; | ||
| 1811 | goto cleanup; | ||
| 1812 | } | ||
| 1813 | /* | ||
| 1814 | * otherwise we have one bad data stripe and | ||
| 1815 | * a good P stripe. raid5! | ||
| 1816 | */ | ||
| 1817 | goto pstripe; | ||
| 1818 | } | ||
| 1819 | |||
| 1820 | if (rbio->raid_map[failb] == RAID5_P_STRIPE) { | ||
| 1821 | raid6_datap_recov(rbio->bbio->num_stripes, | ||
| 1822 | PAGE_SIZE, faila, pointers); | ||
| 1823 | } else { | ||
| 1824 | raid6_2data_recov(rbio->bbio->num_stripes, | ||
| 1825 | PAGE_SIZE, faila, failb, | ||
| 1826 | pointers); | ||
| 1827 | } | ||
| 1828 | } else { | ||
| 1829 | void *p; | ||
| 1830 | |||
| 1831 | /* rebuild from P stripe here (raid5 or raid6) */ | ||
| 1832 | BUG_ON(failb != -1); | ||
| 1833 | pstripe: | ||
| 1834 | /* Copy parity block into failed block to start with */ | ||
| 1835 | memcpy(pointers[faila], | ||
| 1836 | pointers[rbio->nr_data], | ||
| 1837 | PAGE_CACHE_SIZE); | ||
| 1838 | |||
| 1839 | /* rearrange the pointer array */ | ||
| 1840 | p = pointers[faila]; | ||
| 1841 | for (stripe = faila; stripe < rbio->nr_data - 1; stripe++) | ||
| 1842 | pointers[stripe] = pointers[stripe + 1]; | ||
| 1843 | pointers[rbio->nr_data - 1] = p; | ||
| 1844 | |||
| 1845 | /* xor in the rest */ | ||
| 1846 | run_xor(pointers, rbio->nr_data - 1, PAGE_CACHE_SIZE); | ||
| 1847 | } | ||
| 1848 | /* if we're doing this rebuild as part of an rmw, go through | ||
| 1849 | * and set all of our private rbio pages in the | ||
| 1850 | * failed stripes as uptodate. This way finish_rmw will | ||
| 1851 | * know they can be trusted. If this was a read reconstruction, | ||
| 1852 | * other endio functions will fiddle the uptodate bits | ||
| 1853 | */ | ||
| 1854 | if (!rbio->read_rebuild) { | ||
| 1855 | for (i = 0; i < nr_pages; i++) { | ||
| 1856 | if (faila != -1) { | ||
| 1857 | page = rbio_stripe_page(rbio, faila, i); | ||
| 1858 | SetPageUptodate(page); | ||
| 1859 | } | ||
| 1860 | if (failb != -1) { | ||
| 1861 | page = rbio_stripe_page(rbio, failb, i); | ||
| 1862 | SetPageUptodate(page); | ||
| 1863 | } | ||
| 1864 | } | ||
| 1865 | } | ||
| 1866 | for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) { | ||
| 1867 | /* | ||
| 1868 | * if we're rebuilding a read, we have to use | ||
| 1869 | * pages from the bio list | ||
| 1870 | */ | ||
| 1871 | if (rbio->read_rebuild && | ||
| 1872 | (stripe == faila || stripe == failb)) { | ||
| 1873 | page = page_in_rbio(rbio, stripe, pagenr, 0); | ||
| 1874 | } else { | ||
| 1875 | page = rbio_stripe_page(rbio, stripe, pagenr); | ||
| 1876 | } | ||
| 1877 | kunmap(page); | ||
| 1878 | } | ||
| 1879 | } | ||
| 1880 | |||
| 1881 | err = 0; | ||
| 1882 | cleanup: | ||
| 1883 | kfree(pointers); | ||
| 1884 | |||
| 1885 | cleanup_io: | ||
| 1886 | |||
| 1887 | if (rbio->read_rebuild) { | ||
| 1888 | if (err == 0) | ||
| 1889 | cache_rbio_pages(rbio); | ||
| 1890 | else | ||
| 1891 | clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); | ||
| 1892 | |||
| 1893 | rbio_orig_end_io(rbio, err, err == 0); | ||
| 1894 | } else if (err == 0) { | ||
| 1895 | rbio->faila = -1; | ||
| 1896 | rbio->failb = -1; | ||
| 1897 | finish_rmw(rbio); | ||
| 1898 | } else { | ||
| 1899 | rbio_orig_end_io(rbio, err, 0); | ||
| 1900 | } | ||
| 1901 | } | ||
| 1902 | |||
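The subtlest part of the function above is the pstripe path: parity is copied into the failed stripe's page, the pointer array is rotated so that page lands in the slot run_xor() treats as the destination, and XOR-ing the surviving data stripes into it turns P back into the missing data. A user-space rendering of those three steps with small buffers (run_xor_sketch assumes the destination-is-last convention the callers here appear to use; sizes and names are illustrative):

#include <assert.h>
#include <string.h>

#define SZ 16
#define NR_DATA 3	/* three data stripes + one parity, RAID5-style */

/* dest = ptrs[src_cnt]; XOR the first src_cnt buffers into it */
static void run_xor_sketch(unsigned char **ptrs, int src_cnt, size_t len)
{
	int s;
	size_t i;

	for (s = 0; s < src_cnt; s++)
		for (i = 0; i < len; i++)
			ptrs[src_cnt][i] ^= ptrs[s][i];
}

int main(void)
{
	unsigned char d0[SZ], d1[SZ], d2[SZ], p[SZ], want[SZ];
	unsigned char *ptrs[NR_DATA + 1] = { d0, d1, d2, p };
	unsigned char *hold;
	int faila = 1, stripe;
	size_t i;

	memset(d0, 0xa5, SZ);
	memset(d1, 0x3c, SZ);
	memset(d2, 0x71, SZ);
	memcpy(want, d1, SZ);
	for (i = 0; i < SZ; i++)
		p[i] = d0[i] ^ d1[i] ^ d2[i];
	memset(d1, 0, SZ);	/* "lose" stripe 1 */

	/* the three steps from the pstripe path above */
	memcpy(ptrs[faila], ptrs[NR_DATA], SZ);		/* copy P in */
	hold = ptrs[faila];				/* rotate pointers */
	for (stripe = faila; stripe < NR_DATA - 1; stripe++)
		ptrs[stripe] = ptrs[stripe + 1];
	ptrs[NR_DATA - 1] = hold;
	run_xor_sketch(ptrs, NR_DATA - 1, SZ);		/* P ^ survivors */

	assert(memcmp(d1, want, SZ) == 0);		/* d1 recovered */
	return 0;
}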
| 1903 | /* | ||
| 1904 | * This is called only for stripes we've read from disk to | ||
| 1905 | * reconstruct the parity. | ||
| 1906 | */ | ||
| 1907 | static void raid_recover_end_io(struct bio *bio, int err) | ||
| 1908 | { | ||
| 1909 | struct btrfs_raid_bio *rbio = bio->bi_private; | ||
| 1910 | |||
| 1911 | /* | ||
| 1912 | * we only read stripe pages off the disk, set them | ||
| 1913 | * up to date if there were no errors | ||
| 1914 | */ | ||
| 1915 | if (err) | ||
| 1916 | fail_bio_stripe(rbio, bio); | ||
| 1917 | else | ||
| 1918 | set_bio_pages_uptodate(bio); | ||
| 1919 | bio_put(bio); | ||
| 1920 | |||
| 1921 | if (!atomic_dec_and_test(&rbio->bbio->stripes_pending)) | ||
| 1922 | return; | ||
| 1923 | |||
| 1924 | if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors) | ||
| 1925 | rbio_orig_end_io(rbio, -EIO, 0); | ||
| 1926 | else | ||
| 1927 | __raid_recover_end_io(rbio); | ||
| 1928 | } | ||
| 1929 | |||
| 1930 | /* | ||
| 1931 | * reads everything we need off the disk to reconstruct | ||
| 1932 | * the parity. endio handlers trigger final reconstruction | ||
| 1933 | * when the IO is done. | ||
| 1934 | * | ||
| 1935 | * This is used both for reads from the higher layers and for | ||
| 1936 | * parity construction required to finish an rmw cycle. | ||
| 1937 | */ | ||
| 1938 | static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) | ||
| 1939 | { | ||
| 1940 | int bios_to_read = 0; | ||
| 1941 | struct btrfs_bio *bbio = rbio->bbio; | ||
| 1942 | struct bio_list bio_list; | ||
| 1943 | int ret; | ||
| 1944 | int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | ||
| 1945 | int pagenr; | ||
| 1946 | int stripe; | ||
| 1947 | struct bio *bio; | ||
| 1948 | |||
| 1949 | bio_list_init(&bio_list); | ||
| 1950 | |||
| 1951 | ret = alloc_rbio_pages(rbio); | ||
| 1952 | if (ret) | ||
| 1953 | goto cleanup; | ||
| 1954 | |||
| 1955 | atomic_set(&rbio->bbio->error, 0); | ||
| 1956 | |||
| 1957 | /* | ||
| 1958 | * read everything that hasn't failed. Thanks to the | ||
| 1959 | * stripe cache, it is possible that some or all of these | ||
| 1960 | * pages are going to be uptodate. | ||
| 1961 | */ | ||
| 1962 | for (stripe = 0; stripe < bbio->num_stripes; stripe++) { | ||
| 1963 | if (rbio->faila == stripe || | ||
| 1964 | rbio->failb == stripe) | ||
| 1965 | continue; | ||
| 1966 | |||
| 1967 | for (pagenr = 0; pagenr < nr_pages; pagenr++) { | ||
| 1968 | struct page *p; | ||
| 1969 | |||
| 1970 | /* | ||
| 1971 | * the rmw code may have already read this | ||
| 1972 | * page in | ||
| 1973 | */ | ||
| 1974 | p = rbio_stripe_page(rbio, stripe, pagenr); | ||
| 1975 | if (PageUptodate(p)) | ||
| 1976 | continue; | ||
| 1977 | |||
| 1978 | ret = rbio_add_io_page(rbio, &bio_list, | ||
| 1979 | rbio_stripe_page(rbio, stripe, pagenr), | ||
| 1980 | stripe, pagenr, rbio->stripe_len); | ||
| 1981 | if (ret < 0) | ||
| 1982 | goto cleanup; | ||
| 1983 | } | ||
| 1984 | } | ||
| 1985 | |||
| 1986 | bios_to_read = bio_list_size(&bio_list); | ||
| 1987 | if (!bios_to_read) { | ||
| 1988 | /* | ||
| 1989 | * we might have no bios to read just because the pages | ||
| 1990 | * were up to date, or we might have no bios to read because | ||
| 1991 | * the devices were gone. | ||
| 1992 | */ | ||
| 1993 | if (atomic_read(&rbio->bbio->error) <= rbio->bbio->max_errors) { | ||
| 1994 | __raid_recover_end_io(rbio); | ||
| 1995 | goto out; | ||
| 1996 | } else { | ||
| 1997 | goto cleanup; | ||
| 1998 | } | ||
| 1999 | } | ||
| 2000 | |||
| 2001 | /* | ||
| 2002 | * the bbio may be freed once we submit the last bio. Make sure | ||
| 2003 | * not to touch it after that | ||
| 2004 | */ | ||
| 2005 | atomic_set(&bbio->stripes_pending, bios_to_read); | ||
| 2006 | while (1) { | ||
| 2007 | bio = bio_list_pop(&bio_list); | ||
| 2008 | if (!bio) | ||
| 2009 | break; | ||
| 2010 | |||
| 2011 | bio->bi_private = rbio; | ||
| 2012 | bio->bi_end_io = raid_recover_end_io; | ||
| 2013 | |||
| 2014 | btrfs_bio_wq_end_io(rbio->fs_info, bio, | ||
| 2015 | BTRFS_WQ_ENDIO_RAID56); | ||
| 2016 | |||
| 2017 | BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags)); | ||
| 2018 | submit_bio(READ, bio); | ||
| 2019 | } | ||
| 2020 | out: | ||
| 2021 | return 0; | ||
| 2022 | |||
| 2023 | cleanup: | ||
| 2024 | if (rbio->read_rebuild) | ||
| 2025 | rbio_orig_end_io(rbio, -EIO, 0); | ||
| 2026 | return -EIO; | ||
| 2027 | } | ||
| 2028 | |||
| 2029 | /* | ||
| 2030 | * the main entry point for reads from the higher layers. This | ||
| 2031 | * is really only called when the normal read path had a failure, | ||
| 2032 | * so we assume the bio they send down corresponds to a failed part | ||
| 2033 | * of the drive. | ||
| 2034 | */ | ||
| 2035 | int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, | ||
| 2036 | struct btrfs_bio *bbio, u64 *raid_map, | ||
| 2037 | u64 stripe_len, int mirror_num) | ||
| 2038 | { | ||
| 2039 | struct btrfs_raid_bio *rbio; | ||
| 2040 | int ret; | ||
| 2041 | |||
| 2042 | rbio = alloc_rbio(root, bbio, raid_map, stripe_len); | ||
| 2043 | if (IS_ERR(rbio)) { | ||
| 2044 | return PTR_ERR(rbio); | ||
| 2045 | } | ||
| 2046 | |||
| 2047 | rbio->read_rebuild = 1; | ||
| 2048 | bio_list_add(&rbio->bio_list, bio); | ||
| 2049 | rbio->bio_list_bytes = bio->bi_size; | ||
| 2050 | |||
| 2051 | rbio->faila = find_logical_bio_stripe(rbio, bio); | ||
| 2052 | if (rbio->faila == -1) { | ||
| 2053 | BUG(); | ||
| 2054 | kfree(rbio); | ||
| 2055 | return -EIO; | ||
| 2056 | } | ||
| 2057 | |||
| 2058 | /* | ||
| 2059 | * reconstruct from the q stripe if they are | ||
| 2060 | * asking for mirror 3 | ||
| 2061 | */ | ||
| 2062 | if (mirror_num == 3) | ||
| 2063 | rbio->failb = bbio->num_stripes - 2; | ||
| 2064 | |||
| 2065 | ret = lock_stripe_add(rbio); | ||
| 2066 | |||
| 2067 | /* | ||
| 2068 | * __raid56_parity_recover will end the bio with | ||
| 2069 | * any errors it hits. We don't want to return | ||
| 2070 | * its error value up the stack because our caller | ||
| 2071 | * will end up calling bio_endio with any nonzero | ||
| 2072 | * return | ||
| 2073 | */ | ||
| 2074 | if (ret == 0) | ||
| 2075 | __raid56_parity_recover(rbio); | ||
| 2076 | /* | ||
| 2077 | * our rbio has been added to the list of | ||
| 2078 | * rbios that will be handled after the | ||
| 2079 | * current lock owner is done | ||
| 2080 | */ | ||
| 2081 | return 0; | ||
| 2082 | |||
| 2083 | } | ||
| 2084 | |||
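The mirror_num == 3 case above deliberately marks the P stripe (num_stripes - 2) as failed too, so reconstruction must come from Q rather than P. A sketch of that convention as this function uses it (the mirror numbering is inferred from this code, not a documented ABI):

#include <assert.h>

int main(void)
{
	int num_stripes = 6;	/* e.g. 4 data + P + Q */
	int faila = 2;		/* the data stripe the bio mapped to */
	int failb = -1;
	int mirror_num = 3;

	if (mirror_num == 3)
		failb = num_stripes - 2;	/* force P "failed": use Q */

	assert(failb == 4);
	assert(faila != failb);
	return 0;
}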
| 2085 | static void rmw_work(struct btrfs_work *work) | ||
| 2086 | { | ||
| 2087 | struct btrfs_raid_bio *rbio; | ||
| 2088 | |||
| 2089 | rbio = container_of(work, struct btrfs_raid_bio, work); | ||
| 2090 | raid56_rmw_stripe(rbio); | ||
| 2091 | } | ||
| 2092 | |||
| 2093 | static void read_rebuild_work(struct btrfs_work *work) | ||
| 2094 | { | ||
| 2095 | struct btrfs_raid_bio *rbio; | ||
| 2096 | |||
| 2097 | rbio = container_of(work, struct btrfs_raid_bio, work); | ||
| 2098 | __raid56_parity_recover(rbio); | ||
| 2099 | } | ||
diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h new file mode 100644 index 000000000000..ea5d73bfdfbe --- /dev/null +++ b/fs/btrfs/raid56.h | |||
| @@ -0,0 +1,51 @@ | |||
| 1 | /* | ||
| 2 | * Copyright (C) 2012 Fusion-io All rights reserved. | ||
| 3 | * Copyright (C) 2012 Intel Corp. All rights reserved. | ||
| 4 | * | ||
| 5 | * This program is free software; you can redistribute it and/or | ||
| 6 | * modify it under the terms of the GNU General Public | ||
| 7 | * License v2 as published by the Free Software Foundation. | ||
| 8 | * | ||
| 9 | * This program is distributed in the hope that it will be useful, | ||
| 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
| 12 | * General Public License for more details. | ||
| 13 | * | ||
| 14 | * You should have received a copy of the GNU General Public | ||
| 15 | * License along with this program; if not, write to the | ||
| 16 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | ||
| 17 | * Boston, MA 021110-1307, USA. | ||
| 18 | */ | ||
| 19 | |||
| 20 | #ifndef __BTRFS_RAID56__ | ||
| 21 | #define __BTRFS_RAID56__ | ||
| 22 | static inline int nr_parity_stripes(struct map_lookup *map) | ||
| 23 | { | ||
| 24 | if (map->type & BTRFS_BLOCK_GROUP_RAID5) | ||
| 25 | return 1; | ||
| 26 | else if (map->type & BTRFS_BLOCK_GROUP_RAID6) | ||
| 27 | return 2; | ||
| 28 | else | ||
| 29 | return 0; | ||
| 30 | } | ||
| 31 | |||
| 32 | static inline int nr_data_stripes(struct map_lookup *map) | ||
| 33 | { | ||
| 34 | return map->num_stripes - nr_parity_stripes(map); | ||
| 35 | } | ||
| 36 | #define RAID5_P_STRIPE ((u64)-2) | ||
| 37 | #define RAID6_Q_STRIPE ((u64)-1) | ||
| 38 | |||
| 39 | #define is_parity_stripe(x) (((x) == RAID5_P_STRIPE) || \ | ||
| 40 | ((x) == RAID6_Q_STRIPE)) | ||
| 41 | |||
| 42 | int raid56_parity_recover(struct btrfs_root *root, struct bio *bio, | ||
| 43 | struct btrfs_bio *bbio, u64 *raid_map, | ||
| 44 | u64 stripe_len, int mirror_num); | ||
| 45 | int raid56_parity_write(struct btrfs_root *root, struct bio *bio, | ||
| 46 | struct btrfs_bio *bbio, u64 *raid_map, | ||
| 47 | u64 stripe_len); | ||
| 48 | |||
| 49 | int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info); | ||
| 50 | void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info); | ||
| 51 | #endif | ||
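For the raid56.h helpers just added, a quick usage illustration: nr_parity_stripes() keys off the block-group type bits and nr_data_stripes() subtracts it from num_stripes. The sketch re-implements them over a stand-in map with made-up flag values (the real BTRFS_BLOCK_GROUP_* bits live in ctree.h):

#include <assert.h>
#include <stdint.h>

/* stand-in flag bits; the real BTRFS_BLOCK_GROUP_* values differ */
#define BG_RAID5 (1ULL << 7)
#define BG_RAID6 (1ULL << 8)

struct map_lookup_sketch {
	uint64_t type;
	int num_stripes;
};

static int nr_parity(const struct map_lookup_sketch *map)
{
	if (map->type & BG_RAID5)
		return 1;
	if (map->type & BG_RAID6)
		return 2;
	return 0;
}

int main(void)
{
	struct map_lookup_sketch r6 = { BG_RAID6, 6 };

	assert(nr_parity(&r6) == 2);
	assert(r6.num_stripes - nr_parity(&r6) == 4);	/* nr_data_stripes */
	return 0;
}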
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 17c306bf177a..50695dc5e2ab 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c | |||
| @@ -3017,7 +3017,7 @@ static int relocate_file_extent_cluster(struct inode *inode, | |||
| 3017 | } | 3017 | } |
| 3018 | } | 3018 | } |
| 3019 | 3019 | ||
| 3020 | page_start = (u64)page->index << PAGE_CACHE_SHIFT; | 3020 | page_start = page_offset(page); |
| 3021 | page_end = page_start + PAGE_CACHE_SIZE - 1; | 3021 | page_end = page_start + PAGE_CACHE_SIZE - 1; |
| 3022 | 3022 | ||
| 3023 | lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end); | 3023 | lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end); |
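The relocation.c hunk is a pure cleanup: page_offset(page) is defined in pagemap.h as ((loff_t)page->index << PAGE_CACHE_SHIFT), so the helper and the open-coded shift yield the same byte offset. Numerically, with an assumed 4K page size:

#include <assert.h>
#include <stdint.h>

#define PG_SHIFT 12	/* assuming 4K pages */

int main(void)
{
	uint64_t index = 300;	/* hypothetical page->index */

	assert((index << PG_SHIFT) == 300 * 4096);	/* 1228800 bytes */
	return 0;
}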
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 67783e03d121..53c3501fa4ca 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c | |||
| @@ -28,6 +28,7 @@ | |||
| 28 | #include "dev-replace.h" | 28 | #include "dev-replace.h" |
| 29 | #include "check-integrity.h" | 29 | #include "check-integrity.h" |
| 30 | #include "rcu-string.h" | 30 | #include "rcu-string.h" |
| 31 | #include "raid56.h" | ||
| 31 | 32 | ||
| 32 | /* | 33 | /* |
| 33 | * This is only the first step towards a full-features scrub. It reads all | 34 | * This is only the first step towards a full-features scrub. It reads all |
| @@ -2254,6 +2255,13 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, | |||
| 2254 | struct btrfs_device *extent_dev; | 2255 | struct btrfs_device *extent_dev; |
| 2255 | int extent_mirror_num; | 2256 | int extent_mirror_num; |
| 2256 | 2257 | ||
| 2258 | if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | | ||
| 2259 | BTRFS_BLOCK_GROUP_RAID6)) { | ||
| 2260 | if (num >= nr_data_stripes(map)) { | ||
| 2261 | return 0; | ||
| 2262 | } | ||
| 2263 | } | ||
| 2264 | |||
| 2257 | nstripes = length; | 2265 | nstripes = length; |
| 2258 | offset = 0; | 2266 | offset = 0; |
| 2259 | do_div(nstripes, map->stripe_len); | 2267 | do_div(nstripes, map->stripe_len); |
| @@ -2708,7 +2716,7 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, | |||
| 2708 | int ret; | 2716 | int ret; |
| 2709 | struct btrfs_root *root = sctx->dev_root; | 2717 | struct btrfs_root *root = sctx->dev_root; |
| 2710 | 2718 | ||
| 2711 | if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) | 2719 | if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) |
| 2712 | return -EIO; | 2720 | return -EIO; |
| 2713 | 2721 | ||
| 2714 | gen = root->fs_info->last_trans_committed; | 2722 | gen = root->fs_info->last_trans_committed; |
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index f4ab7a9260eb..f7a8b861058b 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c | |||
| @@ -85,6 +85,7 @@ struct send_ctx { | |||
| 85 | u32 send_max_size; | 85 | u32 send_max_size; |
| 86 | u64 total_send_size; | 86 | u64 total_send_size; |
| 87 | u64 cmd_send_size[BTRFS_SEND_C_MAX + 1]; | 87 | u64 cmd_send_size[BTRFS_SEND_C_MAX + 1]; |
| 88 | u64 flags; /* 'flags' member of btrfs_ioctl_send_args is u64 */ | ||
| 88 | 89 | ||
| 89 | struct vfsmount *mnt; | 90 | struct vfsmount *mnt; |
| 90 | 91 | ||
| @@ -3709,6 +3710,39 @@ out: | |||
| 3709 | return ret; | 3710 | return ret; |
| 3710 | } | 3711 | } |
| 3711 | 3712 | ||
| 3713 | /* | ||
| 3714 | * Send an update extent command to user space. | ||
| 3715 | */ | ||
| 3716 | static int send_update_extent(struct send_ctx *sctx, | ||
| 3717 | u64 offset, u32 len) | ||
| 3718 | { | ||
| 3719 | int ret = 0; | ||
| 3720 | struct fs_path *p; | ||
| 3721 | |||
| 3722 | p = fs_path_alloc(sctx); | ||
| 3723 | if (!p) | ||
| 3724 | return -ENOMEM; | ||
| 3725 | |||
| 3726 | ret = begin_cmd(sctx, BTRFS_SEND_C_UPDATE_EXTENT); | ||
| 3727 | if (ret < 0) | ||
| 3728 | goto out; | ||
| 3729 | |||
| 3730 | ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); | ||
| 3731 | if (ret < 0) | ||
| 3732 | goto out; | ||
| 3733 | |||
| 3734 | TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); | ||
| 3735 | TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); | ||
| 3736 | TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, len); | ||
| 3737 | |||
| 3738 | ret = send_cmd(sctx); | ||
| 3739 | |||
| 3740 | tlv_put_failure: | ||
| 3741 | out: | ||
| 3742 | fs_path_free(sctx, p); | ||
| 3743 | return ret; | ||
| 3744 | } | ||
| 3745 | |||
| 3712 | static int send_write_or_clone(struct send_ctx *sctx, | 3746 | static int send_write_or_clone(struct send_ctx *sctx, |
| 3713 | struct btrfs_path *path, | 3747 | struct btrfs_path *path, |
| 3714 | struct btrfs_key *key, | 3748 | struct btrfs_key *key, |
| @@ -3744,7 +3778,11 @@ static int send_write_or_clone(struct send_ctx *sctx, | |||
| 3744 | goto out; | 3778 | goto out; |
| 3745 | } | 3779 | } |
| 3746 | 3780 | ||
| 3747 | if (!clone_root) { | 3781 | if (clone_root) { |
| 3782 | ret = send_clone(sctx, offset, len, clone_root); | ||
| 3783 | } else if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) { | ||
| 3784 | ret = send_update_extent(sctx, offset, len); | ||
| 3785 | } else { | ||
| 3748 | while (pos < len) { | 3786 | while (pos < len) { |
| 3749 | l = len - pos; | 3787 | l = len - pos; |
| 3750 | if (l > BTRFS_SEND_READ_SIZE) | 3788 | if (l > BTRFS_SEND_READ_SIZE) |
| @@ -3757,10 +3795,7 @@ static int send_write_or_clone(struct send_ctx *sctx, | |||
| 3757 | pos += ret; | 3795 | pos += ret; |
| 3758 | } | 3796 | } |
| 3759 | ret = 0; | 3797 | ret = 0; |
| 3760 | } else { | ||
| 3761 | ret = send_clone(sctx, offset, len, clone_root); | ||
| 3762 | } | 3798 | } |
| 3763 | |||
| 3764 | out: | 3799 | out: |
| 3765 | return ret; | 3800 | return ret; |
| 3766 | } | 3801 | } |
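With the new BTRFS_SEND_FLAG_NO_FILE_DATA, send emits BTRFS_SEND_C_UPDATE_EXTENT records (path, offset, length) in place of file data, and the ioctl entry point below rejects any other flag bit. A hedged user-space sketch of requesting such a stream; the struct and ioctl names follow the uapi header this series introduces, but the field spellings should be verified against the installed linux/btrfs.h:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/btrfs.h>	/* the header this series moves the ABI into */

int main(int argc, char **argv)
{
	struct btrfs_ioctl_send_args args;
	int root_fd;

	if (argc < 2)
		return 1;
	root_fd = open(argv[1], O_RDONLY);	/* a read-only subvolume */
	if (root_fd < 0)
		return 1;

	memset(&args, 0, sizeof(args));
	args.send_fd = 1;	/* stream to stdout for the sketch */
	args.flags = BTRFS_SEND_FLAG_NO_FILE_DATA;	/* metadata-only stream */

	if (ioctl(root_fd, BTRFS_IOC_SEND, &args) < 0)
		perror("BTRFS_IOC_SEND");
	return 0;
}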
| @@ -4536,7 +4571,6 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) | |||
| 4536 | struct btrfs_fs_info *fs_info; | 4571 | struct btrfs_fs_info *fs_info; |
| 4537 | struct btrfs_ioctl_send_args *arg = NULL; | 4572 | struct btrfs_ioctl_send_args *arg = NULL; |
| 4538 | struct btrfs_key key; | 4573 | struct btrfs_key key; |
| 4539 | struct file *filp = NULL; | ||
| 4540 | struct send_ctx *sctx = NULL; | 4574 | struct send_ctx *sctx = NULL; |
| 4541 | u32 i; | 4575 | u32 i; |
| 4542 | u64 *clone_sources_tmp = NULL; | 4576 | u64 *clone_sources_tmp = NULL; |
| @@ -4561,6 +4595,11 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) | |||
| 4561 | goto out; | 4595 | goto out; |
| 4562 | } | 4596 | } |
| 4563 | 4597 | ||
| 4598 | if (arg->flags & ~BTRFS_SEND_FLAG_NO_FILE_DATA) { | ||
| 4599 | ret = -EINVAL; | ||
| 4600 | goto out; | ||
| 4601 | } | ||
| 4602 | |||
| 4564 | sctx = kzalloc(sizeof(struct send_ctx), GFP_NOFS); | 4603 | sctx = kzalloc(sizeof(struct send_ctx), GFP_NOFS); |
| 4565 | if (!sctx) { | 4604 | if (!sctx) { |
| 4566 | ret = -ENOMEM; | 4605 | ret = -ENOMEM; |
| @@ -4572,6 +4611,8 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) | |||
| 4572 | INIT_RADIX_TREE(&sctx->name_cache, GFP_NOFS); | 4611 | INIT_RADIX_TREE(&sctx->name_cache, GFP_NOFS); |
| 4573 | INIT_LIST_HEAD(&sctx->name_cache_list); | 4612 | INIT_LIST_HEAD(&sctx->name_cache_list); |
| 4574 | 4613 | ||
| 4614 | sctx->flags = arg->flags; | ||
| 4615 | |||
| 4575 | sctx->send_filp = fget(arg->send_fd); | 4616 | sctx->send_filp = fget(arg->send_fd); |
| 4576 | if (IS_ERR(sctx->send_filp)) { | 4617 | if (IS_ERR(sctx->send_filp)) { |
| 4577 | ret = PTR_ERR(sctx->send_filp); | 4618 | ret = PTR_ERR(sctx->send_filp); |
| @@ -4673,8 +4714,6 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) | |||
| 4673 | goto out; | 4714 | goto out; |
| 4674 | 4715 | ||
| 4675 | out: | 4716 | out: |
| 4676 | if (filp) | ||
| 4677 | fput(filp); | ||
| 4678 | kfree(arg); | 4717 | kfree(arg); |
| 4679 | vfree(clone_sources_tmp); | 4718 | vfree(clone_sources_tmp); |
| 4680 | 4719 | ||
diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h index 1bf4f32fd4ef..8bb18f7ccaa6 100644 --- a/fs/btrfs/send.h +++ b/fs/btrfs/send.h | |||
| @@ -86,6 +86,7 @@ enum btrfs_send_cmd { | |||
| 86 | BTRFS_SEND_C_UTIMES, | 86 | BTRFS_SEND_C_UTIMES, |
| 87 | 87 | ||
| 88 | BTRFS_SEND_C_END, | 88 | BTRFS_SEND_C_END, |
| 89 | BTRFS_SEND_C_UPDATE_EXTENT, | ||
| 89 | __BTRFS_SEND_C_MAX, | 90 | __BTRFS_SEND_C_MAX, |
| 90 | }; | 91 | }; |
| 91 | #define BTRFS_SEND_C_MAX (__BTRFS_SEND_C_MAX - 1) | 92 | #define BTRFS_SEND_C_MAX (__BTRFS_SEND_C_MAX - 1) |
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index d8982e9601d3..68a29a1ea068 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c | |||
| @@ -41,13 +41,13 @@ | |||
| 41 | #include <linux/slab.h> | 41 | #include <linux/slab.h> |
| 42 | #include <linux/cleancache.h> | 42 | #include <linux/cleancache.h> |
| 43 | #include <linux/ratelimit.h> | 43 | #include <linux/ratelimit.h> |
| 44 | #include <linux/btrfs.h> | ||
| 44 | #include "compat.h" | 45 | #include "compat.h" |
| 45 | #include "delayed-inode.h" | 46 | #include "delayed-inode.h" |
| 46 | #include "ctree.h" | 47 | #include "ctree.h" |
| 47 | #include "disk-io.h" | 48 | #include "disk-io.h" |
| 48 | #include "transaction.h" | 49 | #include "transaction.h" |
| 49 | #include "btrfs_inode.h" | 50 | #include "btrfs_inode.h" |
| 50 | #include "ioctl.h" | ||
| 51 | #include "print-tree.h" | 51 | #include "print-tree.h" |
| 52 | #include "xattr.h" | 52 | #include "xattr.h" |
| 53 | #include "volumes.h" | 53 | #include "volumes.h" |
| @@ -63,8 +63,7 @@ | |||
| 63 | static const struct super_operations btrfs_super_ops; | 63 | static const struct super_operations btrfs_super_ops; |
| 64 | static struct file_system_type btrfs_fs_type; | 64 | static struct file_system_type btrfs_fs_type; |
| 65 | 65 | ||
| 66 | static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno, | 66 | static const char *btrfs_decode_error(int errno, char nbuf[16]) |
| 67 | char nbuf[16]) | ||
| 68 | { | 67 | { |
| 69 | char *errstr = NULL; | 68 | char *errstr = NULL; |
| 70 | 69 | ||
| @@ -98,7 +97,7 @@ static void __save_error_info(struct btrfs_fs_info *fs_info) | |||
| 98 | * today we only save the error info into ram. Long term we'll | 97 | * today we only save the error info into ram. Long term we'll |
| 99 | * also send it down to the disk | 98 | * also send it down to the disk |
| 100 | */ | 99 | */ |
| 101 | fs_info->fs_state = BTRFS_SUPER_FLAG_ERROR; | 100 | set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state); |
| 102 | } | 101 | } |
| 103 | 102 | ||
| 104 | static void save_error_info(struct btrfs_fs_info *fs_info) | 103 | static void save_error_info(struct btrfs_fs_info *fs_info) |
| @@ -114,7 +113,7 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info) | |||
| 114 | if (sb->s_flags & MS_RDONLY) | 113 | if (sb->s_flags & MS_RDONLY) |
| 115 | return; | 114 | return; |
| 116 | 115 | ||
| 117 | if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { | 116 | if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { |
| 118 | sb->s_flags |= MS_RDONLY; | 117 | sb->s_flags |= MS_RDONLY; |
| 119 | printk(KERN_INFO "btrfs is forced readonly\n"); | 118 | printk(KERN_INFO "btrfs is forced readonly\n"); |
| 120 | /* | 119 | /* |
| @@ -142,8 +141,6 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, | |||
| 142 | struct super_block *sb = fs_info->sb; | 141 | struct super_block *sb = fs_info->sb; |
| 143 | char nbuf[16]; | 142 | char nbuf[16]; |
| 144 | const char *errstr; | 143 | const char *errstr; |
| 145 | va_list args; | ||
| 146 | va_start(args, fmt); | ||
| 147 | 144 | ||
| 148 | /* | 145 | /* |
| 149 | * Special case: if the error is EROFS, and we're already | 146 | * Special case: if the error is EROFS, and we're already |
| @@ -152,15 +149,18 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, | |||
| 152 | if (errno == -EROFS && (sb->s_flags & MS_RDONLY)) | 149 | if (errno == -EROFS && (sb->s_flags & MS_RDONLY)) |
| 153 | return; | 150 | return; |
| 154 | 151 | ||
| 155 | errstr = btrfs_decode_error(fs_info, errno, nbuf); | 152 | errstr = btrfs_decode_error(errno, nbuf); |
| 156 | if (fmt) { | 153 | if (fmt) { |
| 157 | struct va_format vaf = { | 154 | struct va_format vaf; |
| 158 | .fmt = fmt, | 155 | va_list args; |
| 159 | .va = &args, | 156 | |
| 160 | }; | 157 | va_start(args, fmt); |
| 158 | vaf.fmt = fmt; | ||
| 159 | vaf.va = &args; | ||
| 161 | 160 | ||
| 162 | printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s (%pV)\n", | 161 | printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s (%pV)\n", |
| 163 | sb->s_id, function, line, errstr, &vaf); | 162 | sb->s_id, function, line, errstr, &vaf); |
| 163 | va_end(args); | ||
| 164 | } else { | 164 | } else { |
| 165 | printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s\n", | 165 | printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s\n", |
| 166 | sb->s_id, function, line, errstr); | 166 | sb->s_id, function, line, errstr); |
| @@ -171,7 +171,6 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, | |||
| 171 | save_error_info(fs_info); | 171 | save_error_info(fs_info); |
| 172 | btrfs_handle_error(fs_info); | 172 | btrfs_handle_error(fs_info); |
| 173 | } | 173 | } |
| 174 | va_end(args); | ||
| 175 | } | 174 | } |
| 176 | 175 | ||
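The __btrfs_std_error() rework above is a correctness fix, not just tidying: the old code ran va_start() at the top yet could take the EROFS early return without a matching va_end(), and set up the va_list even when fmt was NULL. The fix brackets va_start/va_end tightly around the single use. The portable pattern in user-space terms:

#include <stdarg.h>
#include <stdio.h>

static void report(int errnum, const char *fmt, ...)
{
	if (errnum == 0)
		return;		/* early return: no va_start() yet, so safe */

	if (fmt) {
		va_list args;

		va_start(args, fmt);	/* start/end bracket the only use */
		vfprintf(stderr, fmt, args);
		va_end(args);
	}
	fprintf(stderr, ": error %d\n", errnum);
}

int main(void)
{
	report(0, "ignored");
	report(5, "writing %s", "superblock");
	return 0;
}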
| 177 | static const char * const logtypes[] = { | 176 | static const char * const logtypes[] = { |
| @@ -261,7 +260,7 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, | |||
| 261 | char nbuf[16]; | 260 | char nbuf[16]; |
| 262 | const char *errstr; | 261 | const char *errstr; |
| 263 | 262 | ||
| 264 | errstr = btrfs_decode_error(root->fs_info, errno, nbuf); | 263 | errstr = btrfs_decode_error(errno, nbuf); |
| 265 | btrfs_printk(root->fs_info, | 264 | btrfs_printk(root->fs_info, |
| 266 | "%s:%d: Aborting unused transaction(%s).\n", | 265 | "%s:%d: Aborting unused transaction(%s).\n", |
| 267 | function, line, errstr); | 266 | function, line, errstr); |
| @@ -289,8 +288,8 @@ void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, | |||
| 289 | va_start(args, fmt); | 288 | va_start(args, fmt); |
| 290 | vaf.va = &args; | 289 | vaf.va = &args; |
| 291 | 290 | ||
| 292 | errstr = btrfs_decode_error(fs_info, errno, nbuf); | 291 | errstr = btrfs_decode_error(errno, nbuf); |
| 293 | if (fs_info->mount_opt & BTRFS_MOUNT_PANIC_ON_FATAL_ERROR) | 292 | if (fs_info && (fs_info->mount_opt & BTRFS_MOUNT_PANIC_ON_FATAL_ERROR)) |
| 294 | panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (%s)\n", | 293 | panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (%s)\n", |
| 295 | s_id, function, line, &vaf, errstr); | 294 | s_id, function, line, &vaf, errstr); |
| 296 | 295 | ||
| @@ -438,6 +437,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) | |||
| 438 | case Opt_compress_force: | 437 | case Opt_compress_force: |
| 439 | case Opt_compress_force_type: | 438 | case Opt_compress_force_type: |
| 440 | compress_force = true; | 439 | compress_force = true; |
| 440 | /* Fallthrough */ | ||
| 441 | case Opt_compress: | 441 | case Opt_compress: |
| 442 | case Opt_compress_type: | 442 | case Opt_compress_type: |
| 443 | if (token == Opt_compress || | 443 | if (token == Opt_compress || |
| @@ -519,7 +519,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) | |||
| 519 | case Opt_alloc_start: | 519 | case Opt_alloc_start: |
| 520 | num = match_strdup(&args[0]); | 520 | num = match_strdup(&args[0]); |
| 521 | if (num) { | 521 | if (num) { |
| 522 | mutex_lock(&info->chunk_mutex); | ||
| 522 | info->alloc_start = memparse(num, NULL); | 523 | info->alloc_start = memparse(num, NULL); |
| 524 | mutex_unlock(&info->chunk_mutex); | ||
| 523 | kfree(num); | 525 | kfree(num); |
| 524 | printk(KERN_INFO | 526 | printk(KERN_INFO |
| 525 | "btrfs: allocations start at %llu\n", | 527 | "btrfs: allocations start at %llu\n", |
| @@ -876,7 +878,7 @@ int btrfs_sync_fs(struct super_block *sb, int wait) | |||
| 876 | 878 | ||
| 877 | btrfs_wait_ordered_extents(root, 0); | 879 | btrfs_wait_ordered_extents(root, 0); |
| 878 | 880 | ||
| 879 | trans = btrfs_attach_transaction(root); | 881 | trans = btrfs_attach_transaction_barrier(root); |
| 880 | if (IS_ERR(trans)) { | 882 | if (IS_ERR(trans)) { |
| 881 | /* no transaction, don't bother */ | 883 | /* no transaction, don't bother */ |
| 882 | if (PTR_ERR(trans) == -ENOENT) | 884 | if (PTR_ERR(trans) == -ENOENT) |
| @@ -1200,6 +1202,38 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, | |||
| 1200 | new_pool_size); | 1202 | new_pool_size); |
| 1201 | } | 1203 | } |
| 1202 | 1204 | ||
| 1205 | static inline void btrfs_remount_prepare(struct btrfs_fs_info *fs_info, | ||
| 1206 | unsigned long old_opts, int flags) | ||
| 1207 | { | ||
| 1208 | set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); | ||
| 1209 | |||
| 1210 | if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) && | ||
| 1211 | (!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) || | ||
| 1212 | (flags & MS_RDONLY))) { | ||
| 1213 | /* wait for any defraggers to finish */ | ||
| 1214 | wait_event(fs_info->transaction_wait, | ||
| 1215 | (atomic_read(&fs_info->defrag_running) == 0)); | ||
| 1216 | if (flags & MS_RDONLY) | ||
| 1217 | sync_filesystem(fs_info->sb); | ||
| 1218 | } | ||
| 1219 | } | ||
| 1220 | |||
| 1221 | static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info, | ||
| 1222 | unsigned long old_opts) | ||
| 1223 | { | ||
| 1224 | /* | ||
| 1225 | * We need to clean up all defraggable inodes if autodefrag is | ||
| 1226 | * turned off or the fs is R/O. | ||
| 1227 | */ | ||
| 1228 | if (btrfs_raw_test_opt(old_opts, AUTO_DEFRAG) && | ||
| 1229 | (!btrfs_raw_test_opt(fs_info->mount_opt, AUTO_DEFRAG) || | ||
| 1230 | (fs_info->sb->s_flags & MS_RDONLY))) { | ||
| 1231 | btrfs_cleanup_defrag_inodes(fs_info); | ||
| 1232 | } | ||
| 1233 | |||
| 1234 | clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); | ||
| 1235 | } | ||
| 1236 | |||
| 1203 | static int btrfs_remount(struct super_block *sb, int *flags, char *data) | 1237 | static int btrfs_remount(struct super_block *sb, int *flags, char *data) |
| 1204 | { | 1238 | { |
| 1205 | struct btrfs_fs_info *fs_info = btrfs_sb(sb); | 1239 | struct btrfs_fs_info *fs_info = btrfs_sb(sb); |
| @@ -1213,6 +1247,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) | |||
| 1213 | unsigned int old_metadata_ratio = fs_info->metadata_ratio; | 1247 | unsigned int old_metadata_ratio = fs_info->metadata_ratio; |
| 1214 | int ret; | 1248 | int ret; |
| 1215 | 1249 | ||
| 1250 | btrfs_remount_prepare(fs_info, old_opts, *flags); | ||
| 1251 | |||
| 1216 | ret = btrfs_parse_options(root, data); | 1252 | ret = btrfs_parse_options(root, data); |
| 1217 | if (ret) { | 1253 | if (ret) { |
| 1218 | ret = -EINVAL; | 1254 | ret = -EINVAL; |
| @@ -1223,7 +1259,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) | |||
| 1223 | fs_info->thread_pool_size, old_thread_pool_size); | 1259 | fs_info->thread_pool_size, old_thread_pool_size); |
| 1224 | 1260 | ||
| 1225 | if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) | 1261 | if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) |
| 1226 | return 0; | 1262 | goto out; |
| 1227 | 1263 | ||
| 1228 | if (*flags & MS_RDONLY) { | 1264 | if (*flags & MS_RDONLY) { |
| 1229 | /* | 1265 | /* |
| @@ -1278,7 +1314,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) | |||
| 1278 | } | 1314 | } |
| 1279 | sb->s_flags &= ~MS_RDONLY; | 1315 | sb->s_flags &= ~MS_RDONLY; |
| 1280 | } | 1316 | } |
| 1281 | 1317 | out: | |
| 1318 | btrfs_remount_cleanup(fs_info, old_opts); | ||
| 1282 | return 0; | 1319 | return 0; |
| 1283 | 1320 | ||
| 1284 | restore: | 1321 | restore: |
| @@ -1289,10 +1326,13 @@ restore: | |||
| 1289 | fs_info->mount_opt = old_opts; | 1326 | fs_info->mount_opt = old_opts; |
| 1290 | fs_info->compress_type = old_compress_type; | 1327 | fs_info->compress_type = old_compress_type; |
| 1291 | fs_info->max_inline = old_max_inline; | 1328 | fs_info->max_inline = old_max_inline; |
| 1329 | mutex_lock(&fs_info->chunk_mutex); | ||
| 1292 | fs_info->alloc_start = old_alloc_start; | 1330 | fs_info->alloc_start = old_alloc_start; |
| 1331 | mutex_unlock(&fs_info->chunk_mutex); | ||
| 1293 | btrfs_resize_thread_pool(fs_info, | 1332 | btrfs_resize_thread_pool(fs_info, |
| 1294 | old_thread_pool_size, fs_info->thread_pool_size); | 1333 | old_thread_pool_size, fs_info->thread_pool_size); |
| 1295 | fs_info->metadata_ratio = old_metadata_ratio; | 1334 | fs_info->metadata_ratio = old_metadata_ratio; |
| 1335 | btrfs_remount_cleanup(fs_info, old_opts); | ||
| 1296 | return ret; | 1336 | return ret; |
| 1297 | } | 1337 | } |
| 1298 | 1338 | ||
| @@ -1559,7 +1599,7 @@ static int btrfs_freeze(struct super_block *sb) | |||
| 1559 | struct btrfs_trans_handle *trans; | 1599 | struct btrfs_trans_handle *trans; |
| 1560 | struct btrfs_root *root = btrfs_sb(sb)->tree_root; | 1600 | struct btrfs_root *root = btrfs_sb(sb)->tree_root; |
| 1561 | 1601 | ||
| 1562 | trans = btrfs_attach_transaction(root); | 1602 | trans = btrfs_attach_transaction_barrier(root); |
| 1563 | if (IS_ERR(trans)) { | 1603 | if (IS_ERR(trans)) { |
| 1564 | /* no transaction, don't bother */ | 1604 | /* no transaction, don't bother */ |
| 1565 | if (PTR_ERR(trans) == -ENOENT) | 1605 | if (PTR_ERR(trans) == -ENOENT) |
| @@ -1684,10 +1724,14 @@ static int __init init_btrfs_fs(void) | |||
| 1684 | if (err) | 1724 | if (err) |
| 1685 | goto free_delayed_inode; | 1725 | goto free_delayed_inode; |
| 1686 | 1726 | ||
| 1687 | err = btrfs_interface_init(); | 1727 | err = btrfs_delayed_ref_init(); |
| 1688 | if (err) | 1728 | if (err) |
| 1689 | goto free_auto_defrag; | 1729 | goto free_auto_defrag; |
| 1690 | 1730 | ||
| 1731 | err = btrfs_interface_init(); | ||
| 1732 | if (err) | ||
| 1733 | goto free_delayed_ref; | ||
| 1734 | |||
| 1691 | err = register_filesystem(&btrfs_fs_type); | 1735 | err = register_filesystem(&btrfs_fs_type); |
| 1692 | if (err) | 1736 | if (err) |
| 1693 | goto unregister_ioctl; | 1737 | goto unregister_ioctl; |
| @@ -1699,6 +1743,8 @@ static int __init init_btrfs_fs(void) | |||
| 1699 | 1743 | ||
| 1700 | unregister_ioctl: | 1744 | unregister_ioctl: |
| 1701 | btrfs_interface_exit(); | 1745 | btrfs_interface_exit(); |
| 1746 | free_delayed_ref: | ||
| 1747 | btrfs_delayed_ref_exit(); | ||
| 1702 | free_auto_defrag: | 1748 | free_auto_defrag: |
| 1703 | btrfs_auto_defrag_exit(); | 1749 | btrfs_auto_defrag_exit(); |
| 1704 | free_delayed_inode: | 1750 | free_delayed_inode: |
| @@ -1720,6 +1766,7 @@ free_compress: | |||
| 1720 | static void __exit exit_btrfs_fs(void) | 1766 | static void __exit exit_btrfs_fs(void) |
| 1721 | { | 1767 | { |
| 1722 | btrfs_destroy_cachep(); | 1768 | btrfs_destroy_cachep(); |
| 1769 | btrfs_delayed_ref_exit(); | ||
| 1723 | btrfs_auto_defrag_exit(); | 1770 | btrfs_auto_defrag_exit(); |
| 1724 | btrfs_delayed_inode_exit(); | 1771 | btrfs_delayed_inode_exit(); |
| 1725 | ordered_data_exit(); | 1772 | ordered_data_exit(); |
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index daac9ae6d731..5b326cd60a4a 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c | |||
| @@ -21,7 +21,6 @@ | |||
| 21 | #include <linux/spinlock.h> | 21 | #include <linux/spinlock.h> |
| 22 | #include <linux/completion.h> | 22 | #include <linux/completion.h> |
| 23 | #include <linux/buffer_head.h> | 23 | #include <linux/buffer_head.h> |
| 24 | #include <linux/module.h> | ||
| 25 | #include <linux/kobject.h> | 24 | #include <linux/kobject.h> |
| 26 | 25 | ||
| 27 | #include "ctree.h" | 26 | #include "ctree.h" |
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 4c0067c4f76d..e52da6fb1165 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c | |||
| @@ -40,7 +40,6 @@ void put_transaction(struct btrfs_transaction *transaction) | |||
| 40 | if (atomic_dec_and_test(&transaction->use_count)) { | 40 | if (atomic_dec_and_test(&transaction->use_count)) { |
| 41 | BUG_ON(!list_empty(&transaction->list)); | 41 | BUG_ON(!list_empty(&transaction->list)); |
| 42 | WARN_ON(transaction->delayed_refs.root.rb_node); | 42 | WARN_ON(transaction->delayed_refs.root.rb_node); |
| 43 | memset(transaction, 0, sizeof(*transaction)); | ||
| 44 | kmem_cache_free(btrfs_transaction_cachep, transaction); | 43 | kmem_cache_free(btrfs_transaction_cachep, transaction); |
| 45 | } | 44 | } |
| 46 | } | 45 | } |
| @@ -51,6 +50,14 @@ static noinline void switch_commit_root(struct btrfs_root *root) | |||
| 51 | root->commit_root = btrfs_root_node(root); | 50 | root->commit_root = btrfs_root_node(root); |
| 52 | } | 51 | } |
| 53 | 52 | ||
| 53 | static inline int can_join_transaction(struct btrfs_transaction *trans, | ||
| 54 | int type) | ||
| 55 | { | ||
| 56 | return !(trans->in_commit && | ||
| 57 | type != TRANS_JOIN && | ||
| 58 | type != TRANS_JOIN_NOLOCK); | ||
| 59 | } | ||
| 60 | |||
| 54 | /* | 61 | /* |
| 55 | * either allocate a new transaction or hop into the existing one | 62 | * either allocate a new transaction or hop into the existing one |
| 56 | */ | 63 | */ |
| @@ -62,7 +69,7 @@ static noinline int join_transaction(struct btrfs_root *root, int type) | |||
| 62 | spin_lock(&fs_info->trans_lock); | 69 | spin_lock(&fs_info->trans_lock); |
| 63 | loop: | 70 | loop: |
| 64 | /* The file system has been taken offline. No new transactions. */ | 71 | /* The file system has been taken offline. No new transactions. */ |
| 65 | if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { | 72 | if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { |
| 66 | spin_unlock(&fs_info->trans_lock); | 73 | spin_unlock(&fs_info->trans_lock); |
| 67 | return -EROFS; | 74 | return -EROFS; |
| 68 | } | 75 | } |
| @@ -86,6 +93,10 @@ loop: | |||
| 86 | spin_unlock(&fs_info->trans_lock); | 93 | spin_unlock(&fs_info->trans_lock); |
| 87 | return cur_trans->aborted; | 94 | return cur_trans->aborted; |
| 88 | } | 95 | } |
| 96 | if (!can_join_transaction(cur_trans, type)) { | ||
| 97 | spin_unlock(&fs_info->trans_lock); | ||
| 98 | return -EBUSY; | ||
| 99 | } | ||
| 89 | atomic_inc(&cur_trans->use_count); | 100 | atomic_inc(&cur_trans->use_count); |
| 90 | atomic_inc(&cur_trans->num_writers); | 101 | atomic_inc(&cur_trans->num_writers); |
| 91 | cur_trans->num_joined++; | 102 | cur_trans->num_joined++; |
| @@ -113,7 +124,7 @@ loop: | |||
| 113 | */ | 124 | */ |
| 114 | kmem_cache_free(btrfs_transaction_cachep, cur_trans); | 125 | kmem_cache_free(btrfs_transaction_cachep, cur_trans); |
| 115 | goto loop; | 126 | goto loop; |
| 116 | } else if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { | 127 | } else if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) { |
| 117 | spin_unlock(&fs_info->trans_lock); | 128 | spin_unlock(&fs_info->trans_lock); |
| 118 | kmem_cache_free(btrfs_transaction_cachep, cur_trans); | 129 | kmem_cache_free(btrfs_transaction_cachep, cur_trans); |
| 119 | return -EROFS; | 130 | return -EROFS; |
| @@ -155,8 +166,12 @@ loop: | |||
| 155 | 166 | ||
| 156 | spin_lock_init(&cur_trans->commit_lock); | 167 | spin_lock_init(&cur_trans->commit_lock); |
| 157 | spin_lock_init(&cur_trans->delayed_refs.lock); | 168 | spin_lock_init(&cur_trans->delayed_refs.lock); |
| 169 | atomic_set(&cur_trans->delayed_refs.procs_running_refs, 0); | ||
| 170 | atomic_set(&cur_trans->delayed_refs.ref_seq, 0); | ||
| 171 | init_waitqueue_head(&cur_trans->delayed_refs.wait); | ||
| 158 | 172 | ||
| 159 | INIT_LIST_HEAD(&cur_trans->pending_snapshots); | 173 | INIT_LIST_HEAD(&cur_trans->pending_snapshots); |
| 174 | INIT_LIST_HEAD(&cur_trans->ordered_operations); | ||
| 160 | list_add_tail(&cur_trans->list, &fs_info->trans_list); | 175 | list_add_tail(&cur_trans->list, &fs_info->trans_list); |
| 161 | extent_io_tree_init(&cur_trans->dirty_pages, | 176 | extent_io_tree_init(&cur_trans->dirty_pages, |
| 162 | fs_info->btree_inode->i_mapping); | 177 | fs_info->btree_inode->i_mapping); |
| @@ -301,7 +316,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, int type, | |||
| 301 | int ret; | 316 | int ret; |
| 302 | u64 qgroup_reserved = 0; | 317 | u64 qgroup_reserved = 0; |
| 303 | 318 | ||
| 304 | if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) | 319 | if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) |
| 305 | return ERR_PTR(-EROFS); | 320 | return ERR_PTR(-EROFS); |
| 306 | 321 | ||
| 307 | if (current->journal_info) { | 322 | if (current->journal_info) { |
| @@ -359,8 +374,11 @@ again: | |||
| 359 | 374 | ||
| 360 | do { | 375 | do { |
| 361 | ret = join_transaction(root, type); | 376 | ret = join_transaction(root, type); |
| 362 | if (ret == -EBUSY) | 377 | if (ret == -EBUSY) { |
| 363 | wait_current_trans(root); | 378 | wait_current_trans(root); |
| 379 | if (unlikely(type == TRANS_ATTACH)) | ||
| 380 | ret = -ENOENT; | ||
| 381 | } | ||
| 364 | } while (ret == -EBUSY); | 382 | } while (ret == -EBUSY); |
| 365 | 383 | ||
| 366 | if (ret < 0) { | 384 | if (ret < 0) { |
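With can_join_transaction() refusing new joiners once a commit has started, the retry loop above gains one subtlety: a TRANS_ATTACH caller that hits -EBUSY waits out the commit and then reports -ENOENT, because by that point there is no longer a running transaction to attach to. A hedged sketch of what such a caller observes, following the freeze path shown earlier:

	trans = btrfs_attach_transaction(root);
	if (IS_ERR(trans)) {
		if (PTR_ERR(trans) == -ENOENT)
			return 0;	/* nothing running, nothing to commit */
		return PTR_ERR(trans);	/* e.g. -EROFS after an fs error */
	}
	return btrfs_commit_transaction(trans, root);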
| @@ -382,9 +400,10 @@ again: | |||
| 382 | h->block_rsv = NULL; | 400 | h->block_rsv = NULL; |
| 383 | h->orig_rsv = NULL; | 401 | h->orig_rsv = NULL; |
| 384 | h->aborted = 0; | 402 | h->aborted = 0; |
| 385 | h->qgroup_reserved = qgroup_reserved; | 403 | h->qgroup_reserved = 0; |
| 386 | h->delayed_ref_elem.seq = 0; | 404 | h->delayed_ref_elem.seq = 0; |
| 387 | h->type = type; | 405 | h->type = type; |
| 406 | h->allocating_chunk = false; | ||
| 388 | INIT_LIST_HEAD(&h->qgroup_ref_list); | 407 | INIT_LIST_HEAD(&h->qgroup_ref_list); |
| 389 | INIT_LIST_HEAD(&h->new_bgs); | 408 | INIT_LIST_HEAD(&h->new_bgs); |
| 390 | 409 | ||
| @@ -400,6 +419,7 @@ again: | |||
| 400 | h->block_rsv = &root->fs_info->trans_block_rsv; | 419 | h->block_rsv = &root->fs_info->trans_block_rsv; |
| 401 | h->bytes_reserved = num_bytes; | 420 | h->bytes_reserved = num_bytes; |
| 402 | } | 421 | } |
| 422 | h->qgroup_reserved = qgroup_reserved; | ||
| 403 | 423 | ||
| 404 | got_it: | 424 | got_it: |
| 405 | btrfs_record_root_in_trans(h, root); | 425 | btrfs_record_root_in_trans(h, root); |
| @@ -451,11 +471,43 @@ struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root | |||
| 451 | return start_transaction(root, 0, TRANS_USERSPACE, 0); | 471 | return start_transaction(root, 0, TRANS_USERSPACE, 0); |
| 452 | } | 472 | } |
| 453 | 473 | ||
| 474 | /* | ||
| 475 | * btrfs_attach_transaction() - catch the running transaction | ||
| 476 | * | ||
| 477 | * It is used when we want to commit the current transaction, but | ||
| 478 | * don't want to start a new one. | ||
| 479 | * | ||
| 480 | * Note: If this function returns -ENOENT, it just means there is no | ||
| 481 | * running transaction. But it is possible that an inactive transaction | ||
| 482 | * is still in memory, not fully on disk. If you need to be sure there is no | ||
| 483 | * inactive transaction in the fs when -ENOENT is returned, you should | ||
| 484 | * invoke | ||
| 485 | * btrfs_attach_transaction_barrier() | ||
| 486 | */ | ||
| 454 | struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root) | 487 | struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root) |
| 455 | { | 488 | { |
| 456 | return start_transaction(root, 0, TRANS_ATTACH, 0); | 489 | return start_transaction(root, 0, TRANS_ATTACH, 0); |
| 457 | } | 490 | } |
| 458 | 491 | ||
| 492 | /* | ||
| 493 | * btrfs_attach_transaction_barrier() - catch the running transaction | ||
| 494 | * | ||
| 495 | * It is similar to the above function; the difference is that this one | ||
| 496 | * will also wait for all inactive transactions until they fully | ||
| 497 | * complete. | ||
| 498 | */ | ||
| 499 | struct btrfs_trans_handle * | ||
| 500 | btrfs_attach_transaction_barrier(struct btrfs_root *root) | ||
| 501 | { | ||
| 502 | struct btrfs_trans_handle *trans; | ||
| 503 | |||
| 504 | trans = start_transaction(root, 0, TRANS_ATTACH, 0); | ||
| 505 | if (IS_ERR(trans) && PTR_ERR(trans) == -ENOENT) | ||
| 506 | btrfs_wait_for_commit(root, 0); | ||
| 507 | |||
| 508 | return trans; | ||
| 509 | } | ||
| 510 | |||
| 459 | /* wait for a transaction commit to be fully complete */ | 511 | /* wait for a transaction commit to be fully complete */ |
| 460 | static noinline void wait_for_commit(struct btrfs_root *root, | 512 | static noinline void wait_for_commit(struct btrfs_root *root, |
| 461 | struct btrfs_transaction *commit) | 513 | struct btrfs_transaction *commit) |
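The barrier variant exists for callers like btrfs_freeze() (switched over in an earlier hunk): plain attach can return -ENOENT while a just-finished transaction is still being written out, whereas the barrier version calls btrfs_wait_for_commit(root, 0) in that case, so by the time it returns -ENOENT everything really is on disk. The distinction, sketched with the APIs from this patch:

	/* may report "no transaction" while a commit is still in flight */
	trans = btrfs_attach_transaction(root);

	/* reports "no transaction" only after prior commits hit the disk,
	 * which is what freeze/sync style callers actually need */
	trans = btrfs_attach_transaction_barrier(root);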
| @@ -587,7 +639,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, | |||
| 587 | if (!list_empty(&trans->new_bgs)) | 639 | if (!list_empty(&trans->new_bgs)) |
| 588 | btrfs_create_pending_block_groups(trans, root); | 640 | btrfs_create_pending_block_groups(trans, root); |
| 589 | 641 | ||
| 590 | while (count < 2) { | 642 | while (count < 1) { |
| 591 | unsigned long cur = trans->delayed_ref_updates; | 643 | unsigned long cur = trans->delayed_ref_updates; |
| 592 | trans->delayed_ref_updates = 0; | 644 | trans->delayed_ref_updates = 0; |
| 593 | if (cur && | 645 | if (cur && |
| @@ -599,6 +651,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, | |||
| 599 | } | 651 | } |
| 600 | count++; | 652 | count++; |
| 601 | } | 653 | } |
| 654 | |||
| 602 | btrfs_trans_release_metadata(trans, root); | 655 | btrfs_trans_release_metadata(trans, root); |
| 603 | trans->block_rsv = NULL; | 656 | trans->block_rsv = NULL; |
| 604 | 657 | ||
| @@ -644,12 +697,10 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, | |||
| 644 | btrfs_run_delayed_iputs(root); | 697 | btrfs_run_delayed_iputs(root); |
| 645 | 698 | ||
| 646 | if (trans->aborted || | 699 | if (trans->aborted || |
| 647 | root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) { | 700 | test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) |
| 648 | err = -EIO; | 701 | err = -EIO; |
| 649 | } | ||
| 650 | assert_qgroups_uptodate(trans); | 702 | assert_qgroups_uptodate(trans); |
| 651 | 703 | ||
| 652 | memset(trans, 0, sizeof(*trans)); | ||
| 653 | kmem_cache_free(btrfs_trans_handle_cachep, trans); | 704 | kmem_cache_free(btrfs_trans_handle_cachep, trans); |
| 654 | return err; | 705 | return err; |
| 655 | } | 706 | } |
| @@ -696,7 +747,9 @@ int btrfs_write_marked_extents(struct btrfs_root *root, | |||
| 696 | struct extent_state *cached_state = NULL; | 747 | struct extent_state *cached_state = NULL; |
| 697 | u64 start = 0; | 748 | u64 start = 0; |
| 698 | u64 end; | 749 | u64 end; |
| 750 | struct blk_plug plug; | ||
| 699 | 751 | ||
| 752 | blk_start_plug(&plug); | ||
| 700 | while (!find_first_extent_bit(dirty_pages, start, &start, &end, | 753 | while (!find_first_extent_bit(dirty_pages, start, &start, &end, |
| 701 | mark, &cached_state)) { | 754 | mark, &cached_state)) { |
| 702 | convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, | 755 | convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, |
| @@ -710,6 +763,7 @@ int btrfs_write_marked_extents(struct btrfs_root *root, | |||
| 710 | } | 763 | } |
| 711 | if (err) | 764 | if (err) |
| 712 | werr = err; | 765 | werr = err; |
| 766 | blk_finish_plug(&plug); | ||
| 713 | return werr; | 767 | return werr; |
| 714 | } | 768 | } |
| 715 | 769 | ||
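Bracketing the write loop with blk_start_plug()/blk_finish_plug() batches the bios generated for each dirty range on a per-task plug list, letting the block layer merge adjacent requests before dispatch instead of issuing them one at a time. The general shape of the pattern (the range helpers below are hypothetical placeholders):

	struct blk_plug plug;

	blk_start_plug(&plug);			/* bios now queue on the task */
	while (next_dirty_range(&start, &end))
		write_range(start, end);	/* submissions accumulate */
	blk_finish_plug(&plug);			/* flush the batch, merged */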
| @@ -960,10 +1014,10 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans, | |||
| 960 | } | 1014 | } |
| 961 | 1015 | ||
| 962 | /* | 1016 | /* |
| 963 | * defrag a given btree. If cacheonly == 1, this won't read from the disk, | 1017 | * defrag a given btree. |
| 964 | * otherwise every leaf in the btree is read and defragged. | 1018 | * Every leaf in the btree is read and defragged. |
| 965 | */ | 1019 | */ |
| 966 | int btrfs_defrag_root(struct btrfs_root *root, int cacheonly) | 1020 | int btrfs_defrag_root(struct btrfs_root *root) |
| 967 | { | 1021 | { |
| 968 | struct btrfs_fs_info *info = root->fs_info; | 1022 | struct btrfs_fs_info *info = root->fs_info; |
| 969 | struct btrfs_trans_handle *trans; | 1023 | struct btrfs_trans_handle *trans; |
| @@ -977,7 +1031,7 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly) | |||
| 977 | if (IS_ERR(trans)) | 1031 | if (IS_ERR(trans)) |
| 978 | return PTR_ERR(trans); | 1032 | return PTR_ERR(trans); |
| 979 | 1033 | ||
| 980 | ret = btrfs_defrag_leaves(trans, root, cacheonly); | 1034 | ret = btrfs_defrag_leaves(trans, root); |
| 981 | 1035 | ||
| 982 | btrfs_end_transaction(trans, root); | 1036 | btrfs_end_transaction(trans, root); |
| 983 | btrfs_btree_balance_dirty(info->tree_root); | 1037 | btrfs_btree_balance_dirty(info->tree_root); |
| @@ -985,6 +1039,12 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly) | |||
| 985 | 1039 | ||
| 986 | if (btrfs_fs_closing(root->fs_info) || ret != -EAGAIN) | 1040 | if (btrfs_fs_closing(root->fs_info) || ret != -EAGAIN) |
| 987 | break; | 1041 | break; |
| 1042 | |||
| 1043 | if (btrfs_defrag_cancelled(root->fs_info)) { | ||
| 1044 | printk(KERN_DEBUG "btrfs: defrag_root cancelled\n"); | ||
| 1045 | ret = -EAGAIN; | ||
| 1046 | break; | ||
| 1047 | } | ||
| 988 | } | 1048 | } |
| 989 | root->defrag_running = 0; | 1049 | root->defrag_running = 0; |
| 990 | return ret; | 1050 | return ret; |
| @@ -1007,7 +1067,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, | |||
| 1007 | struct inode *parent_inode; | 1067 | struct inode *parent_inode; |
| 1008 | struct btrfs_path *path; | 1068 | struct btrfs_path *path; |
| 1009 | struct btrfs_dir_item *dir_item; | 1069 | struct btrfs_dir_item *dir_item; |
| 1010 | struct dentry *parent; | ||
| 1011 | struct dentry *dentry; | 1070 | struct dentry *dentry; |
| 1012 | struct extent_buffer *tmp; | 1071 | struct extent_buffer *tmp; |
| 1013 | struct extent_buffer *old; | 1072 | struct extent_buffer *old; |
| @@ -1022,7 +1081,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, | |||
| 1022 | path = btrfs_alloc_path(); | 1081 | path = btrfs_alloc_path(); |
| 1023 | if (!path) { | 1082 | if (!path) { |
| 1024 | ret = pending->error = -ENOMEM; | 1083 | ret = pending->error = -ENOMEM; |
| 1025 | goto path_alloc_fail; | 1084 | return ret; |
| 1026 | } | 1085 | } |
| 1027 | 1086 | ||
| 1028 | new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS); | 1087 | new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS); |
| @@ -1062,10 +1121,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, | |||
| 1062 | 1121 | ||
| 1063 | rsv = trans->block_rsv; | 1122 | rsv = trans->block_rsv; |
| 1064 | trans->block_rsv = &pending->block_rsv; | 1123 | trans->block_rsv = &pending->block_rsv; |
| 1124 | trans->bytes_reserved = trans->block_rsv->reserved; | ||
| 1065 | 1125 | ||
| 1066 | dentry = pending->dentry; | 1126 | dentry = pending->dentry; |
| 1067 | parent = dget_parent(dentry); | 1127 | parent_inode = pending->dir; |
| 1068 | parent_inode = parent->d_inode; | ||
| 1069 | parent_root = BTRFS_I(parent_inode)->root; | 1128 | parent_root = BTRFS_I(parent_inode)->root; |
| 1070 | record_root_in_trans(trans, parent_root); | 1129 | record_root_in_trans(trans, parent_root); |
| 1071 | 1130 | ||
| @@ -1213,14 +1272,12 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, | |||
| 1213 | if (ret) | 1272 | if (ret) |
| 1214 | btrfs_abort_transaction(trans, root, ret); | 1273 | btrfs_abort_transaction(trans, root, ret); |
| 1215 | fail: | 1274 | fail: |
| 1216 | dput(parent); | ||
| 1217 | trans->block_rsv = rsv; | 1275 | trans->block_rsv = rsv; |
| 1276 | trans->bytes_reserved = 0; | ||
| 1218 | no_free_objectid: | 1277 | no_free_objectid: |
| 1219 | kfree(new_root_item); | 1278 | kfree(new_root_item); |
| 1220 | root_item_alloc_fail: | 1279 | root_item_alloc_fail: |
| 1221 | btrfs_free_path(path); | 1280 | btrfs_free_path(path); |
| 1222 | path_alloc_fail: | ||
| 1223 | btrfs_block_rsv_release(root, &pending->block_rsv, (u64)-1); | ||
| 1224 | return ret; | 1281 | return ret; |
| 1225 | } | 1282 | } |
| 1226 | 1283 | ||
| @@ -1306,13 +1363,13 @@ static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root, | |||
| 1306 | struct btrfs_async_commit { | 1363 | struct btrfs_async_commit { |
| 1307 | struct btrfs_trans_handle *newtrans; | 1364 | struct btrfs_trans_handle *newtrans; |
| 1308 | struct btrfs_root *root; | 1365 | struct btrfs_root *root; |
| 1309 | struct delayed_work work; | 1366 | struct work_struct work; |
| 1310 | }; | 1367 | }; |
| 1311 | 1368 | ||
| 1312 | static void do_async_commit(struct work_struct *work) | 1369 | static void do_async_commit(struct work_struct *work) |
| 1313 | { | 1370 | { |
| 1314 | struct btrfs_async_commit *ac = | 1371 | struct btrfs_async_commit *ac = |
| 1315 | container_of(work, struct btrfs_async_commit, work.work); | 1372 | container_of(work, struct btrfs_async_commit, work); |
| 1316 | 1373 | ||
| 1317 | /* | 1374 | /* |
| 1318 | * We've got freeze protection passed with the transaction. | 1375 | * We've got freeze protection passed with the transaction. |
| @@ -1340,7 +1397,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, | |||
| 1340 | if (!ac) | 1397 | if (!ac) |
| 1341 | return -ENOMEM; | 1398 | return -ENOMEM; |
| 1342 | 1399 | ||
| 1343 | INIT_DELAYED_WORK(&ac->work, do_async_commit); | 1400 | INIT_WORK(&ac->work, do_async_commit); |
| 1344 | ac->root = root; | 1401 | ac->root = root; |
| 1345 | ac->newtrans = btrfs_join_transaction(root); | 1402 | ac->newtrans = btrfs_join_transaction(root); |
| 1346 | if (IS_ERR(ac->newtrans)) { | 1403 | if (IS_ERR(ac->newtrans)) { |
| @@ -1364,7 +1421,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, | |||
| 1364 | &root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], | 1421 | &root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], |
| 1365 | 1, _THIS_IP_); | 1422 | 1, _THIS_IP_); |
| 1366 | 1423 | ||
| 1367 | schedule_delayed_work(&ac->work, 0); | 1424 | schedule_work(&ac->work); |
| 1368 | 1425 | ||
| 1369 | /* wait for transaction to start and unblock */ | 1426 | /* wait for transaction to start and unblock */ |
| 1370 | if (wait_for_unblock) | 1427 | if (wait_for_unblock) |
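The async commit always scheduled its delayed_work with a zero delay, so the patch drops the timer machinery in favor of a plain work_struct plus schedule_work(). The reduced pattern, sketched on a hypothetical container struct:

	struct my_async {			/* hypothetical example */
		struct work_struct work;
		int payload;
	};

	static void my_fn(struct work_struct *work)
	{
		struct my_async *a = container_of(work, struct my_async, work);
		/* ... use a->payload, then free a ... */
	}

	INIT_WORK(&a->work, my_fn);
	schedule_work(&a->work);	/* runs soon on the system workqueue */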
| @@ -1384,6 +1441,7 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, | |||
| 1384 | struct btrfs_root *root, int err) | 1441 | struct btrfs_root *root, int err) |
| 1385 | { | 1442 | { |
| 1386 | struct btrfs_transaction *cur_trans = trans->transaction; | 1443 | struct btrfs_transaction *cur_trans = trans->transaction; |
| 1444 | DEFINE_WAIT(wait); | ||
| 1387 | 1445 | ||
| 1388 | WARN_ON(trans->use_count > 1); | 1446 | WARN_ON(trans->use_count > 1); |
| 1389 | 1447 | ||
| @@ -1392,8 +1450,13 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, | |||
| 1392 | spin_lock(&root->fs_info->trans_lock); | 1450 | spin_lock(&root->fs_info->trans_lock); |
| 1393 | list_del_init(&cur_trans->list); | 1451 | list_del_init(&cur_trans->list); |
| 1394 | if (cur_trans == root->fs_info->running_transaction) { | 1452 | if (cur_trans == root->fs_info->running_transaction) { |
| 1453 | root->fs_info->trans_no_join = 1; | ||
| 1454 | spin_unlock(&root->fs_info->trans_lock); | ||
| 1455 | wait_event(cur_trans->writer_wait, | ||
| 1456 | atomic_read(&cur_trans->num_writers) == 1); | ||
| 1457 | |||
| 1458 | spin_lock(&root->fs_info->trans_lock); | ||
| 1395 | root->fs_info->running_transaction = NULL; | 1459 | root->fs_info->running_transaction = NULL; |
| 1396 | root->fs_info->trans_no_join = 0; | ||
| 1397 | } | 1460 | } |
| 1398 | spin_unlock(&root->fs_info->trans_lock); | 1461 | spin_unlock(&root->fs_info->trans_lock); |
| 1399 | 1462 | ||
| @@ -1427,7 +1490,9 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans, | |||
| 1427 | } | 1490 | } |
| 1428 | 1491 | ||
| 1429 | if (flush_on_commit || snap_pending) { | 1492 | if (flush_on_commit || snap_pending) { |
| 1430 | btrfs_start_delalloc_inodes(root, 1); | 1493 | ret = btrfs_start_delalloc_inodes(root, 1); |
| 1494 | if (ret) | ||
| 1495 | return ret; | ||
| 1431 | btrfs_wait_ordered_extents(root, 1); | 1496 | btrfs_wait_ordered_extents(root, 1); |
| 1432 | } | 1497 | } |
| 1433 | 1498 | ||
| @@ -1449,9 +1514,9 @@ static int btrfs_flush_all_pending_stuffs(struct btrfs_trans_handle *trans, | |||
| 1449 | * it here and know for sure that nothing new will be added | 1514 | * it here and know for sure that nothing new will be added |
| 1450 | * to the list | 1515 | * to the list |
| 1451 | */ | 1516 | */ |
| 1452 | btrfs_run_ordered_operations(root, 1); | 1517 | ret = btrfs_run_ordered_operations(trans, root, 1); |
| 1453 | 1518 | ||
| 1454 | return 0; | 1519 | return ret; |
| 1455 | } | 1520 | } |
| 1456 | 1521 | ||
| 1457 | /* | 1522 | /* |
| @@ -1472,27 +1537,35 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, | |||
| 1472 | int should_grow = 0; | 1537 | int should_grow = 0; |
| 1473 | unsigned long now = get_seconds(); | 1538 | unsigned long now = get_seconds(); |
| 1474 | 1539 | ||
| 1475 | ret = btrfs_run_ordered_operations(root, 0); | 1540 | ret = btrfs_run_ordered_operations(trans, root, 0); |
| 1476 | if (ret) { | 1541 | if (ret) { |
| 1477 | btrfs_abort_transaction(trans, root, ret); | 1542 | btrfs_abort_transaction(trans, root, ret); |
| 1478 | goto cleanup_transaction; | 1543 | btrfs_end_transaction(trans, root); |
| 1544 | return ret; | ||
| 1479 | } | 1545 | } |
| 1480 | 1546 | ||
| 1481 | /* Stop the commit early if ->aborted is set */ | 1547 | /* Stop the commit early if ->aborted is set */ |
| 1482 | if (unlikely(ACCESS_ONCE(cur_trans->aborted))) { | 1548 | if (unlikely(ACCESS_ONCE(cur_trans->aborted))) { |
| 1483 | ret = cur_trans->aborted; | 1549 | ret = cur_trans->aborted; |
| 1484 | goto cleanup_transaction; | 1550 | btrfs_end_transaction(trans, root); |
| 1551 | return ret; | ||
| 1485 | } | 1552 | } |
| 1486 | 1553 | ||
| 1487 | /* make a pass through all the delayed refs we have so far | 1554 | /* make a pass through all the delayed refs we have so far |
| 1488 | * any running procs may add more while we are here | 1555 | * any running procs may add more while we are here |
| 1489 | */ | 1556 | */ |
| 1490 | ret = btrfs_run_delayed_refs(trans, root, 0); | 1557 | ret = btrfs_run_delayed_refs(trans, root, 0); |
| 1491 | if (ret) | 1558 | if (ret) { |
| 1492 | goto cleanup_transaction; | 1559 | btrfs_end_transaction(trans, root); |
| 1560 | return ret; | ||
| 1561 | } | ||
| 1493 | 1562 | ||
| 1494 | btrfs_trans_release_metadata(trans, root); | 1563 | btrfs_trans_release_metadata(trans, root); |
| 1495 | trans->block_rsv = NULL; | 1564 | trans->block_rsv = NULL; |
| 1565 | if (trans->qgroup_reserved) { | ||
| 1566 | btrfs_qgroup_free(root, trans->qgroup_reserved); | ||
| 1567 | trans->qgroup_reserved = 0; | ||
| 1568 | } | ||
| 1496 | 1569 | ||
| 1497 | cur_trans = trans->transaction; | 1570 | cur_trans = trans->transaction; |
| 1498 | 1571 | ||
| @@ -1506,8 +1579,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, | |||
| 1506 | btrfs_create_pending_block_groups(trans, root); | 1579 | btrfs_create_pending_block_groups(trans, root); |
| 1507 | 1580 | ||
| 1508 | ret = btrfs_run_delayed_refs(trans, root, 0); | 1581 | ret = btrfs_run_delayed_refs(trans, root, 0); |
| 1509 | if (ret) | 1582 | if (ret) { |
| 1510 | goto cleanup_transaction; | 1583 | btrfs_end_transaction(trans, root); |
| 1584 | return ret; | ||
| 1585 | } | ||
| 1511 | 1586 | ||
| 1512 | spin_lock(&cur_trans->commit_lock); | 1587 | spin_lock(&cur_trans->commit_lock); |
| 1513 | if (cur_trans->in_commit) { | 1588 | if (cur_trans->in_commit) { |
| @@ -1771,6 +1846,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, | |||
| 1771 | cleanup_transaction: | 1846 | cleanup_transaction: |
| 1772 | btrfs_trans_release_metadata(trans, root); | 1847 | btrfs_trans_release_metadata(trans, root); |
| 1773 | trans->block_rsv = NULL; | 1848 | trans->block_rsv = NULL; |
| 1849 | if (trans->qgroup_reserved) { | ||
| 1850 | btrfs_qgroup_free(root, trans->qgroup_reserved); | ||
| 1851 | trans->qgroup_reserved = 0; | ||
| 1852 | } | ||
| 1774 | btrfs_printk(root->fs_info, "Skipping commit of aborted transaction.\n"); | 1853 | btrfs_printk(root->fs_info, "Skipping commit of aborted transaction.\n"); |
| 1775 | // WARN_ON(1); | 1854 | // WARN_ON(1); |
| 1776 | if (current->journal_info == trans) | 1855 | if (current->journal_info == trans) |
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 0e8aa1e6c287..3c8e0d25c8e4 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h | |||
| @@ -43,6 +43,7 @@ struct btrfs_transaction { | |||
| 43 | wait_queue_head_t writer_wait; | 43 | wait_queue_head_t writer_wait; |
| 44 | wait_queue_head_t commit_wait; | 44 | wait_queue_head_t commit_wait; |
| 45 | struct list_head pending_snapshots; | 45 | struct list_head pending_snapshots; |
| 46 | struct list_head ordered_operations; | ||
| 46 | struct btrfs_delayed_ref_root delayed_refs; | 47 | struct btrfs_delayed_ref_root delayed_refs; |
| 47 | int aborted; | 48 | int aborted; |
| 48 | }; | 49 | }; |
| @@ -68,6 +69,7 @@ struct btrfs_trans_handle { | |||
| 68 | struct btrfs_block_rsv *orig_rsv; | 69 | struct btrfs_block_rsv *orig_rsv; |
| 69 | short aborted; | 70 | short aborted; |
| 70 | short adding_csums; | 71 | short adding_csums; |
| 72 | bool allocating_chunk; | ||
| 71 | enum btrfs_trans_type type; | 73 | enum btrfs_trans_type type; |
| 72 | /* | 74 | /* |
| 73 | * this root is only needed to validate that the root passed to | 75 | * this root is only needed to validate that the root passed to |
| @@ -82,11 +84,13 @@ struct btrfs_trans_handle { | |||
| 82 | 84 | ||
| 83 | struct btrfs_pending_snapshot { | 85 | struct btrfs_pending_snapshot { |
| 84 | struct dentry *dentry; | 86 | struct dentry *dentry; |
| 87 | struct inode *dir; | ||
| 85 | struct btrfs_root *root; | 88 | struct btrfs_root *root; |
| 86 | struct btrfs_root *snap; | 89 | struct btrfs_root *snap; |
| 87 | struct btrfs_qgroup_inherit *inherit; | 90 | struct btrfs_qgroup_inherit *inherit; |
| 88 | /* block reservation for the operation */ | 91 | /* block reservation for the operation */ |
| 89 | struct btrfs_block_rsv block_rsv; | 92 | struct btrfs_block_rsv block_rsv; |
| 93 | u64 qgroup_reserved; | ||
| 90 | /* extra metadata reservation for relocation */ | 94 | /* extra metadata reservation for relocation */ |
| 91 | int error; | 95 | int error; |
| 92 | bool readonly; | 96 | bool readonly; |
| @@ -110,13 +114,15 @@ struct btrfs_trans_handle *btrfs_start_transaction_lflush( | |||
| 110 | struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root); | 114 | struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root); |
| 111 | struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root); | 115 | struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root); |
| 112 | struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root); | 116 | struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root); |
| 117 | struct btrfs_trans_handle *btrfs_attach_transaction_barrier( | ||
| 118 | struct btrfs_root *root); | ||
| 113 | struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root); | 119 | struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root); |
| 114 | int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid); | 120 | int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid); |
| 115 | int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, | 121 | int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, |
| 116 | struct btrfs_root *root); | 122 | struct btrfs_root *root); |
| 117 | 123 | ||
| 118 | int btrfs_add_dead_root(struct btrfs_root *root); | 124 | int btrfs_add_dead_root(struct btrfs_root *root); |
| 119 | int btrfs_defrag_root(struct btrfs_root *root, int cacheonly); | 125 | int btrfs_defrag_root(struct btrfs_root *root); |
| 120 | int btrfs_clean_old_snapshots(struct btrfs_root *root); | 126 | int btrfs_clean_old_snapshots(struct btrfs_root *root); |
| 121 | int btrfs_commit_transaction(struct btrfs_trans_handle *trans, | 127 | int btrfs_commit_transaction(struct btrfs_trans_handle *trans, |
| 122 | struct btrfs_root *root); | 128 | struct btrfs_root *root); |
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c index 3b580ee8ab1d..94e05c1f118a 100644 --- a/fs/btrfs/tree-defrag.c +++ b/fs/btrfs/tree-defrag.c | |||
| @@ -23,13 +23,14 @@ | |||
| 23 | #include "transaction.h" | 23 | #include "transaction.h" |
| 24 | #include "locking.h" | 24 | #include "locking.h" |
| 25 | 25 | ||
| 26 | /* defrag all the leaves in a given btree. If cache_only == 1, don't read | 26 | /* |
| 27 | * things from disk, otherwise read all the leaves and try to get key order to | 27 | * Defrag all the leaves in a given btree. |
| 28 | * Read all the leaves and try to get key order to | ||
| 28 | * better reflect disk order | 29 | * better reflect disk order |
| 29 | */ | 30 | */ |
| 30 | 31 | ||
| 31 | int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, | 32 | int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, |
| 32 | struct btrfs_root *root, int cache_only) | 33 | struct btrfs_root *root) |
| 33 | { | 34 | { |
| 34 | struct btrfs_path *path = NULL; | 35 | struct btrfs_path *path = NULL; |
| 35 | struct btrfs_key key; | 36 | struct btrfs_key key; |
| @@ -41,9 +42,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, | |||
| 41 | u64 last_ret = 0; | 42 | u64 last_ret = 0; |
| 42 | u64 min_trans = 0; | 43 | u64 min_trans = 0; |
| 43 | 44 | ||
| 44 | if (cache_only) | ||
| 45 | goto out; | ||
| 46 | |||
| 47 | if (root->fs_info->extent_root == root) { | 45 | if (root->fs_info->extent_root == root) { |
| 48 | /* | 46 | /* |
| 49 | * there's recursion here right now in the tree locking, | 47 | * there's recursion here right now in the tree locking, |
| @@ -86,11 +84,8 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, | |||
| 86 | } | 84 | } |
| 87 | 85 | ||
| 88 | path->keep_locks = 1; | 86 | path->keep_locks = 1; |
| 89 | if (cache_only) | ||
| 90 | min_trans = root->defrag_trans_start; | ||
| 91 | 87 | ||
| 92 | ret = btrfs_search_forward(root, &key, NULL, path, | 88 | ret = btrfs_search_forward(root, &key, NULL, path, min_trans); |
| 93 | cache_only, min_trans); | ||
| 94 | if (ret < 0) | 89 | if (ret < 0) |
| 95 | goto out; | 90 | goto out; |
| 96 | if (ret > 0) { | 91 | if (ret > 0) { |
| @@ -109,11 +104,11 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, | |||
| 109 | goto out; | 104 | goto out; |
| 110 | } | 105 | } |
| 111 | path->slots[1] = btrfs_header_nritems(path->nodes[1]); | 106 | path->slots[1] = btrfs_header_nritems(path->nodes[1]); |
| 112 | next_key_ret = btrfs_find_next_key(root, path, &key, 1, cache_only, | 107 | next_key_ret = btrfs_find_next_key(root, path, &key, 1, |
| 113 | min_trans); | 108 | min_trans); |
| 114 | ret = btrfs_realloc_node(trans, root, | 109 | ret = btrfs_realloc_node(trans, root, |
| 115 | path->nodes[1], 0, | 110 | path->nodes[1], 0, |
| 116 | cache_only, &last_ret, | 111 | &last_ret, |
| 117 | &root->defrag_progress); | 112 | &root->defrag_progress); |
| 118 | if (ret) { | 113 | if (ret) { |
| 119 | WARN_ON(ret == -EAGAIN); | 114 | WARN_ON(ret == -EAGAIN); |
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 9027bb1e7466..c7ef569eb22a 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c | |||
| @@ -278,8 +278,7 @@ static int process_one_buffer(struct btrfs_root *log, | |||
| 278 | struct walk_control *wc, u64 gen) | 278 | struct walk_control *wc, u64 gen) |
| 279 | { | 279 | { |
| 280 | if (wc->pin) | 280 | if (wc->pin) |
| 281 | btrfs_pin_extent_for_log_replay(wc->trans, | 281 | btrfs_pin_extent_for_log_replay(log->fs_info->extent_root, |
| 282 | log->fs_info->extent_root, | ||
| 283 | eb->start, eb->len); | 282 | eb->start, eb->len); |
| 284 | 283 | ||
| 285 | if (btrfs_buffer_uptodate(eb, gen, 0)) { | 284 | if (btrfs_buffer_uptodate(eb, gen, 0)) { |
| @@ -485,7 +484,6 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, | |||
| 485 | struct btrfs_key *key) | 484 | struct btrfs_key *key) |
| 486 | { | 485 | { |
| 487 | int found_type; | 486 | int found_type; |
| 488 | u64 mask = root->sectorsize - 1; | ||
| 489 | u64 extent_end; | 487 | u64 extent_end; |
| 490 | u64 start = key->offset; | 488 | u64 start = key->offset; |
| 491 | u64 saved_nbytes; | 489 | u64 saved_nbytes; |
| @@ -502,7 +500,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, | |||
| 502 | extent_end = start + btrfs_file_extent_num_bytes(eb, item); | 500 | extent_end = start + btrfs_file_extent_num_bytes(eb, item); |
| 503 | else if (found_type == BTRFS_FILE_EXTENT_INLINE) { | 501 | else if (found_type == BTRFS_FILE_EXTENT_INLINE) { |
| 504 | size = btrfs_file_extent_inline_len(eb, item); | 502 | size = btrfs_file_extent_inline_len(eb, item); |
| 505 | extent_end = (start + size + mask) & ~mask; | 503 | extent_end = ALIGN(start + size, root->sectorsize); |
| 506 | } else { | 504 | } else { |
| 507 | ret = 0; | 505 | ret = 0; |
| 508 | goto out; | 506 | goto out; |
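The open-coded (start + size + mask) & ~mask rounding is replaced with the kernel's ALIGN() macro, which performs the same power-of-two round-up. A standalone userspace check of the equivalence (ALIGN reproduced here for illustration; btrfs sector sizes are powers of two):

	#include <assert.h>
	#include <stdint.h>

	#define ALIGN(x, a) (((x) + (a) - 1) & ~((uint64_t)(a) - 1))

	int main(void)
	{
		uint64_t sectorsize = 4096;
		uint64_t mask = sectorsize - 1;

		for (uint64_t v = 0; v < 3 * sectorsize; v += 511)
			assert(((v + mask) & ~mask) == ALIGN(v, sectorsize));
		return 0;
	}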
| @@ -2281,6 +2279,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, | |||
| 2281 | unsigned long log_transid = 0; | 2279 | unsigned long log_transid = 0; |
| 2282 | 2280 | ||
| 2283 | mutex_lock(&root->log_mutex); | 2281 | mutex_lock(&root->log_mutex); |
| 2282 | log_transid = root->log_transid; | ||
| 2284 | index1 = root->log_transid % 2; | 2283 | index1 = root->log_transid % 2; |
| 2285 | if (atomic_read(&root->log_commit[index1])) { | 2284 | if (atomic_read(&root->log_commit[index1])) { |
| 2286 | wait_log_commit(trans, root, root->log_transid); | 2285 | wait_log_commit(trans, root, root->log_transid); |
| @@ -2308,11 +2307,11 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, | |||
| 2308 | /* bail out if we need to do a full commit */ | 2307 | /* bail out if we need to do a full commit */ |
| 2309 | if (root->fs_info->last_trans_log_full_commit == trans->transid) { | 2308 | if (root->fs_info->last_trans_log_full_commit == trans->transid) { |
| 2310 | ret = -EAGAIN; | 2309 | ret = -EAGAIN; |
| 2310 | btrfs_free_logged_extents(log, log_transid); | ||
| 2311 | mutex_unlock(&root->log_mutex); | 2311 | mutex_unlock(&root->log_mutex); |
| 2312 | goto out; | 2312 | goto out; |
| 2313 | } | 2313 | } |
| 2314 | 2314 | ||
| 2315 | log_transid = root->log_transid; | ||
| 2316 | if (log_transid % 2 == 0) | 2315 | if (log_transid % 2 == 0) |
| 2317 | mark = EXTENT_DIRTY; | 2316 | mark = EXTENT_DIRTY; |
| 2318 | else | 2317 | else |
| @@ -2324,6 +2323,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, | |||
| 2324 | ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark); | 2323 | ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark); |
| 2325 | if (ret) { | 2324 | if (ret) { |
| 2326 | btrfs_abort_transaction(trans, root, ret); | 2325 | btrfs_abort_transaction(trans, root, ret); |
| 2326 | btrfs_free_logged_extents(log, log_transid); | ||
| 2327 | mutex_unlock(&root->log_mutex); | 2327 | mutex_unlock(&root->log_mutex); |
| 2328 | goto out; | 2328 | goto out; |
| 2329 | } | 2329 | } |
| @@ -2363,6 +2363,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, | |||
| 2363 | } | 2363 | } |
| 2364 | root->fs_info->last_trans_log_full_commit = trans->transid; | 2364 | root->fs_info->last_trans_log_full_commit = trans->transid; |
| 2365 | btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); | 2365 | btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); |
| 2366 | btrfs_free_logged_extents(log, log_transid); | ||
| 2366 | mutex_unlock(&log_root_tree->log_mutex); | 2367 | mutex_unlock(&log_root_tree->log_mutex); |
| 2367 | ret = -EAGAIN; | 2368 | ret = -EAGAIN; |
| 2368 | goto out; | 2369 | goto out; |
| @@ -2373,6 +2374,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, | |||
| 2373 | btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); | 2374 | btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); |
| 2374 | wait_log_commit(trans, log_root_tree, | 2375 | wait_log_commit(trans, log_root_tree, |
| 2375 | log_root_tree->log_transid); | 2376 | log_root_tree->log_transid); |
| 2377 | btrfs_free_logged_extents(log, log_transid); | ||
| 2376 | mutex_unlock(&log_root_tree->log_mutex); | 2378 | mutex_unlock(&log_root_tree->log_mutex); |
| 2377 | ret = 0; | 2379 | ret = 0; |
| 2378 | goto out; | 2380 | goto out; |
| @@ -2392,6 +2394,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, | |||
| 2392 | */ | 2394 | */ |
| 2393 | if (root->fs_info->last_trans_log_full_commit == trans->transid) { | 2395 | if (root->fs_info->last_trans_log_full_commit == trans->transid) { |
| 2394 | btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); | 2396 | btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); |
| 2397 | btrfs_free_logged_extents(log, log_transid); | ||
| 2395 | mutex_unlock(&log_root_tree->log_mutex); | 2398 | mutex_unlock(&log_root_tree->log_mutex); |
| 2396 | ret = -EAGAIN; | 2399 | ret = -EAGAIN; |
| 2397 | goto out_wake_log_root; | 2400 | goto out_wake_log_root; |
| @@ -2402,10 +2405,12 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, | |||
| 2402 | EXTENT_DIRTY | EXTENT_NEW); | 2405 | EXTENT_DIRTY | EXTENT_NEW); |
| 2403 | if (ret) { | 2406 | if (ret) { |
| 2404 | btrfs_abort_transaction(trans, root, ret); | 2407 | btrfs_abort_transaction(trans, root, ret); |
| 2408 | btrfs_free_logged_extents(log, log_transid); | ||
| 2405 | mutex_unlock(&log_root_tree->log_mutex); | 2409 | mutex_unlock(&log_root_tree->log_mutex); |
| 2406 | goto out_wake_log_root; | 2410 | goto out_wake_log_root; |
| 2407 | } | 2411 | } |
| 2408 | btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); | 2412 | btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); |
| 2413 | btrfs_wait_logged_extents(log, log_transid); | ||
| 2409 | 2414 | ||
| 2410 | btrfs_set_super_log_root(root->fs_info->super_for_commit, | 2415 | btrfs_set_super_log_root(root->fs_info->super_for_commit, |
| 2411 | log_root_tree->node->start); | 2416 | log_root_tree->node->start); |
| @@ -2461,8 +2466,10 @@ static void free_log_tree(struct btrfs_trans_handle *trans, | |||
| 2461 | .process_func = process_one_buffer | 2466 | .process_func = process_one_buffer |
| 2462 | }; | 2467 | }; |
| 2463 | 2468 | ||
| 2464 | ret = walk_log_tree(trans, log, &wc); | 2469 | if (trans) { |
| 2465 | BUG_ON(ret); | 2470 | ret = walk_log_tree(trans, log, &wc); |
| 2471 | BUG_ON(ret); | ||
| 2472 | } | ||
| 2466 | 2473 | ||
| 2467 | while (1) { | 2474 | while (1) { |
| 2468 | ret = find_first_extent_bit(&log->dirty_log_pages, | 2475 | ret = find_first_extent_bit(&log->dirty_log_pages, |
| @@ -2475,6 +2482,14 @@ static void free_log_tree(struct btrfs_trans_handle *trans, | |||
| 2475 | EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS); | 2482 | EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS); |
| 2476 | } | 2483 | } |
| 2477 | 2484 | ||
| 2485 | /* | ||
| 2486 | * We may have short-circuited the log tree with the full commit logic | ||
| 2487 | * and left ordered extents on our list, so clear these out to keep us | ||
| 2488 | * from leaking inodes and memory. | ||
| 2489 | */ | ||
| 2490 | btrfs_free_logged_extents(log, 0); | ||
| 2491 | btrfs_free_logged_extents(log, 1); | ||
| 2492 | |||
| 2478 | free_extent_buffer(log->node); | 2493 | free_extent_buffer(log->node); |
| 2479 | kfree(log); | 2494 | kfree(log); |
| 2480 | } | 2495 | } |
| @@ -2724,7 +2739,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, | |||
| 2724 | path->keep_locks = 1; | 2739 | path->keep_locks = 1; |
| 2725 | 2740 | ||
| 2726 | ret = btrfs_search_forward(root, &min_key, &max_key, | 2741 | ret = btrfs_search_forward(root, &min_key, &max_key, |
| 2727 | path, 0, trans->transid); | 2742 | path, trans->transid); |
| 2728 | 2743 | ||
| 2729 | /* | 2744 | /* |
| 2730 | * we didn't find anything from this transaction, see if there | 2745 | * we didn't find anything from this transaction, see if there |
| @@ -3271,16 +3286,21 @@ static int log_one_extent(struct btrfs_trans_handle *trans, | |||
| 3271 | struct btrfs_root *log = root->log_root; | 3286 | struct btrfs_root *log = root->log_root; |
| 3272 | struct btrfs_file_extent_item *fi; | 3287 | struct btrfs_file_extent_item *fi; |
| 3273 | struct extent_buffer *leaf; | 3288 | struct extent_buffer *leaf; |
| 3289 | struct btrfs_ordered_extent *ordered; | ||
| 3274 | struct list_head ordered_sums; | 3290 | struct list_head ordered_sums; |
| 3275 | struct btrfs_map_token token; | 3291 | struct btrfs_map_token token; |
| 3276 | struct btrfs_key key; | 3292 | struct btrfs_key key; |
| 3277 | u64 csum_offset = em->mod_start - em->start; | 3293 | u64 mod_start = em->mod_start; |
| 3278 | u64 csum_len = em->mod_len; | 3294 | u64 mod_len = em->mod_len; |
| 3295 | u64 csum_offset; | ||
| 3296 | u64 csum_len; | ||
| 3279 | u64 extent_offset = em->start - em->orig_start; | 3297 | u64 extent_offset = em->start - em->orig_start; |
| 3280 | u64 block_len; | 3298 | u64 block_len; |
| 3281 | int ret; | 3299 | int ret; |
| 3300 | int index = log->log_transid % 2; | ||
| 3282 | bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; | 3301 | bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; |
| 3283 | 3302 | ||
| 3303 | insert: | ||
| 3284 | INIT_LIST_HEAD(&ordered_sums); | 3304 | INIT_LIST_HEAD(&ordered_sums); |
| 3285 | btrfs_init_map_token(&token); | 3305 | btrfs_init_map_token(&token); |
| 3286 | key.objectid = btrfs_ino(inode); | 3306 | key.objectid = btrfs_ino(inode); |
| @@ -3296,6 +3316,23 @@ static int log_one_extent(struct btrfs_trans_handle *trans, | |||
| 3296 | leaf = path->nodes[0]; | 3316 | leaf = path->nodes[0]; |
| 3297 | fi = btrfs_item_ptr(leaf, path->slots[0], | 3317 | fi = btrfs_item_ptr(leaf, path->slots[0], |
| 3298 | struct btrfs_file_extent_item); | 3318 | struct btrfs_file_extent_item); |
| 3319 | |||
| 3320 | /* | ||
| 3321 | * If we are overwriting an inline extent with a real one then we need | ||
| 3322 | * to just delete the inline extent as it may not be large enough to | ||
| 3323 | * have the entire file_extent_item. | ||
| 3324 | */ | ||
| 3325 | if (ret && btrfs_token_file_extent_type(leaf, fi, &token) == | ||
| 3326 | BTRFS_FILE_EXTENT_INLINE) { | ||
| 3327 | ret = btrfs_del_item(trans, log, path); | ||
| 3328 | btrfs_release_path(path); | ||
| 3329 | if (ret) { | ||
| 3330 | path->really_keep_locks = 0; | ||
| 3331 | return ret; | ||
| 3332 | } | ||
| 3333 | goto insert; | ||
| 3334 | } | ||
| 3335 | |||
| 3299 | btrfs_set_token_file_extent_generation(leaf, fi, em->generation, | 3336 | btrfs_set_token_file_extent_generation(leaf, fi, em->generation, |
| 3300 | &token); | 3337 | &token); |
| 3301 | if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { | 3338 | if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { |
| @@ -3362,6 +3399,92 @@ static int log_one_extent(struct btrfs_trans_handle *trans, | |||
| 3362 | csum_len = block_len; | 3399 | csum_len = block_len; |
| 3363 | } | 3400 | } |
| 3364 | 3401 | ||
| 3402 | /* | ||
| 3403 | * First check and see if our csums are on our outstanding ordered | ||
| 3404 | * extents. | ||
| 3405 | */ | ||
| 3406 | again: | ||
| 3407 | spin_lock_irq(&log->log_extents_lock[index]); | ||
| 3408 | list_for_each_entry(ordered, &log->logged_list[index], log_list) { | ||
| 3409 | struct btrfs_ordered_sum *sum; | ||
| 3410 | |||
| 3411 | if (!mod_len) | ||
| 3412 | break; | ||
| 3413 | |||
| 3414 | if (ordered->inode != inode) | ||
| 3415 | continue; | ||
| 3416 | |||
| 3417 | if (ordered->file_offset + ordered->len <= mod_start || | ||
| 3418 | mod_start + mod_len <= ordered->file_offset) | ||
| 3419 | continue; | ||
| 3420 | |||
| 3421 | /* | ||
| 3422 | * We are going to copy all the csums on this ordered extent, so | ||
| 3423 | * go ahead and adjust mod_start and mod_len in case this | ||
| 3424 | * ordered extent has already been logged. | ||
| 3425 | */ | ||
| 3426 | if (ordered->file_offset > mod_start) { | ||
| 3427 | if (ordered->file_offset + ordered->len >= | ||
| 3428 | mod_start + mod_len) | ||
| 3429 | mod_len = ordered->file_offset - mod_start; | ||
| 3430 | /* | ||
| 3431 | * If we have this case | ||
| 3432 | * | ||
| 3433 | * |--------- logged extent ---------| | ||
| 3434 | * |----- ordered extent ----| | ||
| 3435 | * | ||
| 3436 | * Just don't mess with mod_start and mod_len, we'll | ||
| 3437 | * just end up logging more csums than we need and it | ||
| 3438 | * will be ok. | ||
| 3439 | */ | ||
| 3440 | } else { | ||
| 3441 | if (ordered->file_offset + ordered->len < | ||
| 3442 | mod_start + mod_len) { | ||
| 3443 | mod_len = (mod_start + mod_len) - | ||
| 3444 | (ordered->file_offset + ordered->len); | ||
| 3445 | mod_start = ordered->file_offset + | ||
| 3446 | ordered->len; | ||
| 3447 | } else { | ||
| 3448 | mod_len = 0; | ||
| 3449 | } | ||
| 3450 | } | ||
| 3451 | |||
| 3452 | /* | ||
| 3453 | * This keeps us from looping over the above case of an ordered | ||
| 3454 | * extent that falls entirely inside the logged extent. | ||
| 3455 | */ | ||
| 3456 | if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, | ||
| 3457 | &ordered->flags)) | ||
| 3458 | continue; | ||
| 3459 | atomic_inc(&ordered->refs); | ||
| 3460 | spin_unlock_irq(&log->log_extents_lock[index]); | ||
| 3461 | /* | ||
| 3462 | * we've dropped the lock, we must either break or | ||
| 3463 | * start over after this. | ||
| 3464 | */ | ||
| 3465 | |||
| 3466 | wait_event(ordered->wait, ordered->csum_bytes_left == 0); | ||
| 3467 | |||
| 3468 | list_for_each_entry(sum, &ordered->list, list) { | ||
| 3469 | ret = btrfs_csum_file_blocks(trans, log, sum); | ||
| 3470 | if (ret) { | ||
| 3471 | btrfs_put_ordered_extent(ordered); | ||
| 3472 | goto unlocked; | ||
| 3473 | } | ||
| 3474 | } | ||
| 3475 | btrfs_put_ordered_extent(ordered); | ||
| 3476 | goto again; | ||
| 3477 | |||
| 3478 | } | ||
| 3479 | spin_unlock_irq(&log->log_extents_lock[index]); | ||
| 3480 | unlocked: | ||
| 3481 | |||
| 3482 | if (!mod_len || ret) | ||
| 3483 | return ret; | ||
| 3484 | |||
| 3485 | csum_offset = mod_start - em->start; | ||
| 3486 | csum_len = mod_len; | ||
| 3487 | |||
| 3365 | /* block start is already adjusted for the file extent offset. */ | 3488 | /* block start is already adjusted for the file extent offset. */ |
| 3366 | ret = btrfs_lookup_csums_range(log->fs_info->csum_root, | 3489 | ret = btrfs_lookup_csums_range(log->fs_info->csum_root, |
| 3367 | em->block_start + csum_offset, | 3490 | em->block_start + csum_offset, |
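The mod_start/mod_len adjustments above amount to interval subtraction: each ordered extent whose csums get copied is carved out of the not-yet-logged range, but only head and tail overlaps shrink it; an ordered extent strictly inside the range is deliberately left alone, at the cost of logging a few redundant csums. A plain-C restatement of that trimming, runnable in userspace:

	#include <stdint.h>
	#include <stdio.h>

	/* Carve [o_start, o_start + o_len) out of [*mod_start, +*mod_len). */
	static void trim(uint64_t *mod_start, uint64_t *mod_len,
			 uint64_t o_start, uint64_t o_len)
	{
		if (o_start + o_len <= *mod_start ||
		    *mod_start + *mod_len <= o_start)
			return;					/* no overlap */
		if (o_start > *mod_start) {
			if (o_start + o_len >= *mod_start + *mod_len)
				*mod_len = o_start - *mod_start; /* tail cut */
			/* else: ordered extent fully inside, leave as-is */
		} else if (o_start + o_len < *mod_start + *mod_len) {
			*mod_len = (*mod_start + *mod_len) -
				   (o_start + o_len);		/* head cut */
			*mod_start = o_start + o_len;
		} else {
			*mod_len = 0;				/* fully covered */
		}
	}

	int main(void)
	{
		uint64_t s = 0, l = 100;

		trim(&s, &l, 80, 40);	/* tail overlap: range becomes [0, 80) */
		printf("[%llu, +%llu)\n",
		       (unsigned long long)s, (unsigned long long)l);
		return 0;
	}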
| @@ -3393,6 +3516,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, | |||
| 3393 | struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree; | 3516 | struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree; |
| 3394 | u64 test_gen; | 3517 | u64 test_gen; |
| 3395 | int ret = 0; | 3518 | int ret = 0; |
| 3519 | int num = 0; | ||
| 3396 | 3520 | ||
| 3397 | INIT_LIST_HEAD(&extents); | 3521 | INIT_LIST_HEAD(&extents); |
| 3398 | 3522 | ||
| @@ -3401,16 +3525,31 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, | |||
| 3401 | 3525 | ||
| 3402 | list_for_each_entry_safe(em, n, &tree->modified_extents, list) { | 3526 | list_for_each_entry_safe(em, n, &tree->modified_extents, list) { |
| 3403 | list_del_init(&em->list); | 3527 | list_del_init(&em->list); |
| 3528 | |||
| 3529 | /* | ||
| 3530 | * Just an arbitrary cap: logging gets really CPU intensive | ||
| 3531 | * once we start getting a lot of extents, and once we have that | ||
| 3532 | * many we are better off just committing since it will be | ||
| 3533 | * faster. | ||
| 3534 | */ | ||
| 3535 | if (++num > 32768) { | ||
| 3536 | list_del_init(&tree->modified_extents); | ||
| 3537 | ret = -EFBIG; | ||
| 3538 | goto process; | ||
| 3539 | } | ||
| 3540 | |||
| 3404 | if (em->generation <= test_gen) | 3541 | if (em->generation <= test_gen) |
| 3405 | continue; | 3542 | continue; |
| 3406 | /* Need a ref to keep it from getting evicted from cache */ | 3543 | /* Need a ref to keep it from getting evicted from cache */ |
| 3407 | atomic_inc(&em->refs); | 3544 | atomic_inc(&em->refs); |
| 3408 | set_bit(EXTENT_FLAG_LOGGING, &em->flags); | 3545 | set_bit(EXTENT_FLAG_LOGGING, &em->flags); |
| 3409 | list_add_tail(&em->list, &extents); | 3546 | list_add_tail(&em->list, &extents); |
| 3547 | num++; | ||
| 3410 | } | 3548 | } |
| 3411 | 3549 | ||
| 3412 | list_sort(NULL, &extents, extent_cmp); | 3550 | list_sort(NULL, &extents, extent_cmp); |
| 3413 | 3551 | ||
| 3552 | process: | ||
| 3414 | while (!list_empty(&extents)) { | 3553 | while (!list_empty(&extents)) { |
| 3415 | em = list_entry(extents.next, struct extent_map, list); | 3554 | em = list_entry(extents.next, struct extent_map, list); |
| 3416 | 3555 | ||
| @@ -3513,6 +3652,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, | |||
| 3513 | 3652 | ||
| 3514 | mutex_lock(&BTRFS_I(inode)->log_mutex); | 3653 | mutex_lock(&BTRFS_I(inode)->log_mutex); |
| 3515 | 3654 | ||
| 3655 | btrfs_get_logged_extents(log, inode); | ||
| 3656 | |||
| 3516 | /* | 3657 | /* |
| 3517 | * a brute force approach to making sure we get the most uptodate | 3658 | * a brute force approach to making sure we get the most uptodate |
| 3518 | * copies of everything. | 3659 | * copies of everything. |
| @@ -3558,7 +3699,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, | |||
| 3558 | while (1) { | 3699 | while (1) { |
| 3559 | ins_nr = 0; | 3700 | ins_nr = 0; |
| 3560 | ret = btrfs_search_forward(root, &min_key, &max_key, | 3701 | ret = btrfs_search_forward(root, &min_key, &max_key, |
| 3561 | path, 0, trans->transid); | 3702 | path, trans->transid); |
| 3562 | if (ret != 0) | 3703 | if (ret != 0) |
| 3563 | break; | 3704 | break; |
| 3564 | again: | 3705 | again: |
| @@ -3656,6 +3797,8 @@ log_extents: | |||
| 3656 | BTRFS_I(inode)->logged_trans = trans->transid; | 3797 | BTRFS_I(inode)->logged_trans = trans->transid; |
| 3657 | BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans; | 3798 | BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans; |
| 3658 | out_unlock: | 3799 | out_unlock: |
| 3800 | if (err) | ||
| 3801 | btrfs_free_logged_extents(log, log->log_transid); | ||
| 3659 | mutex_unlock(&BTRFS_I(inode)->log_mutex); | 3802 | mutex_unlock(&BTRFS_I(inode)->log_mutex); |
| 3660 | 3803 | ||
| 3661 | btrfs_free_path(path); | 3804 | btrfs_free_path(path); |
| @@ -3822,7 +3965,6 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, | |||
| 3822 | end_trans: | 3965 | end_trans: |
| 3823 | dput(old_parent); | 3966 | dput(old_parent); |
| 3824 | if (ret < 0) { | 3967 | if (ret < 0) { |
| 3825 | WARN_ON(ret != -ENOSPC); | ||
| 3826 | root->fs_info->last_trans_log_full_commit = trans->transid; | 3968 | root->fs_info->last_trans_log_full_commit = trans->transid; |
| 3827 | ret = 1; | 3969 | ret = 1; |
| 3828 | } | 3970 | } |
diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c index 99be4c138db6..ddc61cad0080 100644 --- a/fs/btrfs/ulist.c +++ b/fs/btrfs/ulist.c | |||
| @@ -5,7 +5,7 @@ | |||
| 5 | */ | 5 | */ |
| 6 | 6 | ||
| 7 | #include <linux/slab.h> | 7 | #include <linux/slab.h> |
| 8 | #include <linux/module.h> | 8 | #include <linux/export.h> |
| 9 | #include "ulist.h" | 9 | #include "ulist.h" |
| 10 | 10 | ||
| 11 | /* | 11 | /* |
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 5cbb7f4b1672..35bb2d4ed29f 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c | |||
| @@ -25,6 +25,8 @@ | |||
| 25 | #include <linux/capability.h> | 25 | #include <linux/capability.h> |
| 26 | #include <linux/ratelimit.h> | 26 | #include <linux/ratelimit.h> |
| 27 | #include <linux/kthread.h> | 27 | #include <linux/kthread.h> |
| 28 | #include <linux/raid/pq.h> | ||
| 29 | #include <asm/div64.h> | ||
| 28 | #include "compat.h" | 30 | #include "compat.h" |
| 29 | #include "ctree.h" | 31 | #include "ctree.h" |
| 30 | #include "extent_map.h" | 32 | #include "extent_map.h" |
| @@ -32,6 +34,7 @@ | |||
| 32 | #include "transaction.h" | 34 | #include "transaction.h" |
| 33 | #include "print-tree.h" | 35 | #include "print-tree.h" |
| 34 | #include "volumes.h" | 36 | #include "volumes.h" |
| 37 | #include "raid56.h" | ||
| 35 | #include "async-thread.h" | 38 | #include "async-thread.h" |
| 36 | #include "check-integrity.h" | 39 | #include "check-integrity.h" |
| 37 | #include "rcu-string.h" | 40 | #include "rcu-string.h" |
| @@ -647,6 +650,7 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) | |||
| 647 | new_device->writeable = 0; | 650 | new_device->writeable = 0; |
| 648 | new_device->in_fs_metadata = 0; | 651 | new_device->in_fs_metadata = 0; |
| 649 | new_device->can_discard = 0; | 652 | new_device->can_discard = 0; |
| 653 | spin_lock_init(&new_device->io_lock); | ||
| 650 | list_replace_rcu(&device->dev_list, &new_device->dev_list); | 654 | list_replace_rcu(&device->dev_list, &new_device->dev_list); |
| 651 | 655 | ||
| 652 | call_rcu(&device->rcu, free_device); | 656 | call_rcu(&device->rcu, free_device); |
| @@ -792,26 +796,75 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, | |||
| 792 | return ret; | 796 | return ret; |
| 793 | } | 797 | } |
| 794 | 798 | ||
| 799 | /* | ||
| 800 | * Look for a btrfs signature on a device. This may be called out of the mount path | ||
| 801 | * and we are not allowed to call set_blocksize during the scan. The superblock | ||
| 802 | * is read via the pagecache. | ||
| 803 | */ | ||
| 795 | int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, | 804 | int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, |
| 796 | struct btrfs_fs_devices **fs_devices_ret) | 805 | struct btrfs_fs_devices **fs_devices_ret) |
| 797 | { | 806 | { |
| 798 | struct btrfs_super_block *disk_super; | 807 | struct btrfs_super_block *disk_super; |
| 799 | struct block_device *bdev; | 808 | struct block_device *bdev; |
| 800 | struct buffer_head *bh; | 809 | struct page *page; |
| 801 | int ret; | 810 | void *p; |
| 811 | int ret = -EINVAL; | ||
| 802 | u64 devid; | 812 | u64 devid; |
| 803 | u64 transid; | 813 | u64 transid; |
| 804 | u64 total_devices; | 814 | u64 total_devices; |
| 815 | u64 bytenr; | ||
| 816 | pgoff_t index; | ||
| 805 | 817 | ||
| 818 | /* | ||
| 819 | * we would like to check all the supers, but that would make | ||
| 820 | * a btrfs mount succeed after a mkfs from a different FS. | ||
| 821 | * So, we need to add a special mount option to scan for | ||
| 822 | * later supers, using BTRFS_SUPER_MIRROR_MAX instead | ||
| 823 | */ | ||
| 824 | bytenr = btrfs_sb_offset(0); | ||
| 806 | flags |= FMODE_EXCL; | 825 | flags |= FMODE_EXCL; |
| 807 | mutex_lock(&uuid_mutex); | 826 | mutex_lock(&uuid_mutex); |
| 808 | ret = btrfs_get_bdev_and_sb(path, flags, holder, 0, &bdev, &bh); | 827 | |
| 809 | if (ret) | 828 | bdev = blkdev_get_by_path(path, flags, holder); |
| 829 | |||
| 830 | if (IS_ERR(bdev)) { | ||
| 831 | ret = PTR_ERR(bdev); | ||
| 810 | goto error; | 832 | goto error; |
| 811 | disk_super = (struct btrfs_super_block *)bh->b_data; | 833 | } |
| 834 | |||
| 835 | /* make sure our super fits in the device */ | ||
| 836 | if (bytenr + PAGE_CACHE_SIZE >= i_size_read(bdev->bd_inode)) | ||
| 837 | goto error_bdev_put; | ||
| 838 | |||
| 839 | /* make sure our super fits in the page */ | ||
| 840 | if (sizeof(*disk_super) > PAGE_CACHE_SIZE) | ||
| 841 | goto error_bdev_put; | ||
| 842 | |||
| 843 | /* make sure our super doesn't straddle pages on disk */ | ||
| 844 | index = bytenr >> PAGE_CACHE_SHIFT; | ||
| 845 | if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_CACHE_SHIFT != index) | ||
| 846 | goto error_bdev_put; | ||
| 847 | |||
| 848 | /* pull in the page with our super */ | ||
| 849 | page = read_cache_page_gfp(bdev->bd_inode->i_mapping, | ||
| 850 | index, GFP_NOFS); | ||
| 851 | |||
| 852 | if (IS_ERR_OR_NULL(page)) | ||
| 853 | goto error_bdev_put; | ||
| 854 | |||
| 855 | p = kmap(page); | ||
| 856 | |||
| 857 | /* align our pointer to the offset of the super block */ | ||
| 858 | disk_super = p + (bytenr & ~PAGE_CACHE_MASK); | ||
| 859 | |||
| 860 | if (btrfs_super_bytenr(disk_super) != bytenr || | ||
| 861 | disk_super->magic != cpu_to_le64(BTRFS_MAGIC)) | ||
| 862 | goto error_unmap; | ||
| 863 | |||
| 812 | devid = btrfs_stack_device_id(&disk_super->dev_item); | 864 | devid = btrfs_stack_device_id(&disk_super->dev_item); |
| 813 | transid = btrfs_super_generation(disk_super); | 865 | transid = btrfs_super_generation(disk_super); |
| 814 | total_devices = btrfs_super_num_devices(disk_super); | 866 | total_devices = btrfs_super_num_devices(disk_super); |
| 867 | |||
| 815 | if (disk_super->label[0]) { | 868 | if (disk_super->label[0]) { |
| 816 | if (disk_super->label[BTRFS_LABEL_SIZE - 1]) | 869 | if (disk_super->label[BTRFS_LABEL_SIZE - 1]) |
| 817 | disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0'; | 870 | disk_super->label[BTRFS_LABEL_SIZE - 1] = '\0'; |
| @@ -819,12 +872,19 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, | |||
| 819 | } else { | 872 | } else { |
| 820 | printk(KERN_INFO "device fsid %pU ", disk_super->fsid); | 873 | printk(KERN_INFO "device fsid %pU ", disk_super->fsid); |
| 821 | } | 874 | } |
| 875 | |||
| 822 | printk(KERN_CONT "devid %llu transid %llu %s\n", | 876 | printk(KERN_CONT "devid %llu transid %llu %s\n", |
| 823 | (unsigned long long)devid, (unsigned long long)transid, path); | 877 | (unsigned long long)devid, (unsigned long long)transid, path); |
| 878 | |||
| 824 | ret = device_list_add(path, disk_super, devid, fs_devices_ret); | 879 | ret = device_list_add(path, disk_super, devid, fs_devices_ret); |
| 825 | if (!ret && fs_devices_ret) | 880 | if (!ret && fs_devices_ret) |
| 826 | (*fs_devices_ret)->total_devices = total_devices; | 881 | (*fs_devices_ret)->total_devices = total_devices; |
| 827 | brelse(bh); | 882 | |
| 883 | error_unmap: | ||
| 884 | kunmap(page); | ||
| 885 | page_cache_release(page); | ||
| 886 | |||
| 887 | error_bdev_put: | ||
| 828 | blkdev_put(bdev, flags); | 888 | blkdev_put(bdev, flags); |
| 829 | error: | 889 | error: |
| 830 | mutex_unlock(&uuid_mutex); | 890 | mutex_unlock(&uuid_mutex); |
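The new scan path avoids set_blocksize() by pulling the primary superblock through the page cache. Before reading, it verifies three things: the super fits inside the device, it fits inside one page, and it does not straddle a page boundary; only then is the page fetched and the pointer adjusted to the in-page offset. A minimal userspace sketch of that arithmetic (the 4KiB page size, the 4KiB super size, and the 64KiB primary super offset are assumptions mirroring common kernel constants):

```c
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE  4096ULL		/* assumed, arch-dependent */
#define PAGE_SHIFT 12
#define PAGE_MASK  (~(PAGE_SIZE - 1))
#define SUPER_SIZE 4096ULL		/* sizeof(struct btrfs_super_block), assumed */

/* Mirror of the guards in btrfs_scan_one_device(): return the in-page
 * offset of the super, or -1 if reading it via the page cache is unsafe. */
static long long super_page_offset(uint64_t bytenr, uint64_t dev_size,
				   uint64_t *index)
{
	/* super must fit in the device */
	if (bytenr + PAGE_SIZE >= dev_size)
		return -1;
	/* super must fit in one page */
	if (SUPER_SIZE > PAGE_SIZE)
		return -1;
	*index = bytenr >> PAGE_SHIFT;
	/* super must not straddle two pages on disk */
	if ((bytenr + SUPER_SIZE - 1) >> PAGE_SHIFT != *index)
		return -1;
	return bytenr & ~PAGE_MASK;
}

int main(void)
{
	uint64_t index;
	long long off = super_page_offset(64 * 1024, 1ULL << 30, &index);

	printf("page index %llu, offset in page %lld\n",
	       (unsigned long long)index, off);
	return 0;
}
```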
| @@ -1372,14 +1432,19 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) | |||
| 1372 | u64 devid; | 1432 | u64 devid; |
| 1373 | u64 num_devices; | 1433 | u64 num_devices; |
| 1374 | u8 *dev_uuid; | 1434 | u8 *dev_uuid; |
| 1435 | unsigned seq; | ||
| 1375 | int ret = 0; | 1436 | int ret = 0; |
| 1376 | bool clear_super = false; | 1437 | bool clear_super = false; |
| 1377 | 1438 | ||
| 1378 | mutex_lock(&uuid_mutex); | 1439 | mutex_lock(&uuid_mutex); |
| 1379 | 1440 | ||
| 1380 | all_avail = root->fs_info->avail_data_alloc_bits | | 1441 | do { |
| 1381 | root->fs_info->avail_system_alloc_bits | | 1442 | seq = read_seqbegin(&root->fs_info->profiles_lock); |
| 1382 | root->fs_info->avail_metadata_alloc_bits; | 1443 | |
| 1444 | all_avail = root->fs_info->avail_data_alloc_bits | | ||
| 1445 | root->fs_info->avail_system_alloc_bits | | ||
| 1446 | root->fs_info->avail_metadata_alloc_bits; | ||
| 1447 | } while (read_seqretry(&root->fs_info->profiles_lock, seq)); | ||
| 1383 | 1448 | ||
| 1384 | num_devices = root->fs_info->fs_devices->num_devices; | 1449 | num_devices = root->fs_info->fs_devices->num_devices; |
| 1385 | btrfs_dev_replace_lock(&root->fs_info->dev_replace); | 1450 | btrfs_dev_replace_lock(&root->fs_info->dev_replace); |
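Both this hunk and the later btrfs_balance() change replace an unlocked multi-word read of the avail_*_alloc_bits fields with a seqlock reader loop: snapshot the words, then retry if a writer raced in between. A self-contained sketch of the reader side (toy stand-ins for the kernel API; real seqlocks also issue memory barriers, omitted here):

```c
#include <stdint.h>
#include <stdio.h>

/* Toy stand-in for the kernel seqlock reader API, illustration only. */
struct seqlock { volatile unsigned sequence; };

static unsigned read_seqbegin(struct seqlock *sl)
{
	unsigned seq;

	/* spin while a writer holds the lock (odd sequence) */
	do {
		seq = sl->sequence;
	} while (seq & 1);
	return seq;
}

static int read_seqretry(struct seqlock *sl, unsigned seq)
{
	/* retry if a writer started or completed since read_seqbegin() */
	return sl->sequence != seq;
}

int main(void)
{
	struct seqlock profiles_lock = { 0 };
	uint64_t avail_data = 0x1, avail_sys = 0x2, avail_meta = 0x4;
	uint64_t all_avail;
	unsigned seq;

	do {
		seq = read_seqbegin(&profiles_lock);
		all_avail = avail_data | avail_sys | avail_meta;
	} while (read_seqretry(&profiles_lock, seq));

	printf("all_avail = 0x%llx\n", (unsigned long long)all_avail);
	return 0;
}
```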
| @@ -1403,6 +1468,21 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) | |||
| 1403 | goto out; | 1468 | goto out; |
| 1404 | } | 1469 | } |
| 1405 | 1470 | ||
| 1471 | if ((all_avail & BTRFS_BLOCK_GROUP_RAID5) && | ||
| 1472 | root->fs_info->fs_devices->rw_devices <= 2) { | ||
| 1473 | printk(KERN_ERR "btrfs: unable to go below two " | ||
| 1474 | "devices on raid5\n"); | ||
| 1475 | ret = -EINVAL; | ||
| 1476 | goto out; | ||
| 1477 | } | ||
| 1478 | if ((all_avail & BTRFS_BLOCK_GROUP_RAID6) && | ||
| 1479 | root->fs_info->fs_devices->rw_devices <= 3) { | ||
| 1480 | printk(KERN_ERR "btrfs: unable to go below three " | ||
| 1481 | "devices on raid6\n"); | ||
| 1482 | ret = -EINVAL; | ||
| 1483 | goto out; | ||
| 1484 | } | ||
| 1485 | |||
| 1406 | if (strcmp(device_path, "missing") == 0) { | 1486 | if (strcmp(device_path, "missing") == 0) { |
| 1407 | struct list_head *devices; | 1487 | struct list_head *devices; |
| 1408 | struct btrfs_device *tmp; | 1488 | struct btrfs_device *tmp; |
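The new RAID5/6 checks extend the existing device-removal floors: whichever profile is in use dictates how many rw devices must remain. A sketch of the resulting floor table (the RAID1/RAID10 floors come from pre-existing checks above this hunk, not shown; block-group bit values are illustrative, not the kernel's):

```c
#include <stdint.h>
#include <stdio.h>

#define BG_RAID1  (1ULL << 0)	/* illustrative bit values */
#define BG_RAID10 (1ULL << 1)
#define BG_RAID5  (1ULL << 2)
#define BG_RAID6  (1ULL << 3)

/* Sketch of the removal floor enforced in btrfs_rm_device(): the
 * minimum number of rw devices that must remain in use. */
static unsigned min_rw_devices(uint64_t all_avail)
{
	if (all_avail & BG_RAID10)
		return 4;
	if (all_avail & BG_RAID6)
		return 3;
	if (all_avail & (BG_RAID1 | BG_RAID5))
		return 2;
	return 1;
}

int main(void)
{
	printf("raid6 floor: %u devices\n", min_rw_devices(BG_RAID6));
	printf("raid5 floor: %u devices\n", min_rw_devices(BG_RAID5));
	return 0;
}
```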
| @@ -2616,7 +2696,7 @@ static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, | |||
| 2616 | chunk_used = btrfs_block_group_used(&cache->item); | 2696 | chunk_used = btrfs_block_group_used(&cache->item); |
| 2617 | 2697 | ||
| 2618 | if (bargs->usage == 0) | 2698 | if (bargs->usage == 0) |
| 2619 | user_thresh = 0; | 2699 | user_thresh = 1; |
| 2620 | else if (bargs->usage > 100) | 2700 | else if (bargs->usage > 100) |
| 2621 | user_thresh = cache->key.offset; | 2701 | user_thresh = cache->key.offset; |
| 2622 | else | 2702 | else |
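The one-byte threshold fixes a subtle dead case: with usage=0 the filter is meant to select completely empty chunks, but the old user_thresh = 0 made the chunk_used < user_thresh test unsatisfiable, so nothing ever matched. A sketch of the matching logic, written here to return nonzero when a chunk passes the filter (the percentage scaling for 1..99 is an assumption based on the surrounding code):

```c
#include <stdint.h>
#include <stdio.h>

/* Sketch of the usage filter's threshold: nonzero means the chunk is
 * considered under-used and should be relocated by the balance. */
static int usage_matches(uint64_t chunk_used, uint64_t chunk_size,
			 uint32_t usage /* percent, 0..100 */)
{
	uint64_t user_thresh;

	if (usage == 0)
		user_thresh = 1;		/* only fully empty chunks */
	else if (usage > 100)
		user_thresh = chunk_size;	/* match everything */
	else
		user_thresh = chunk_size * usage / 100;	/* assumed scaling */

	return chunk_used < user_thresh;
}

int main(void)
{
	/* empty 1GiB chunk, usage=0: matches now; with the old
	 * user_thresh = 0 it could never match */
	printf("%d\n", usage_matches(0, 1ULL << 30, 0));	/* 1 */
	printf("%d\n", usage_matches(4096, 1ULL << 30, 0));	/* 0 */
	return 0;
}
```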
| @@ -2664,11 +2744,15 @@ static int chunk_drange_filter(struct extent_buffer *leaf, | |||
| 2664 | return 0; | 2744 | return 0; |
| 2665 | 2745 | ||
| 2666 | if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP | | 2746 | if (btrfs_chunk_type(leaf, chunk) & (BTRFS_BLOCK_GROUP_DUP | |
| 2667 | BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) | 2747 | BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10)) { |
| 2668 | factor = 2; | 2748 | factor = num_stripes / 2; |
| 2669 | else | 2749 | } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID5) { |
| 2670 | factor = 1; | 2750 | factor = num_stripes - 1; |
| 2671 | factor = num_stripes / factor; | 2751 | } else if (btrfs_chunk_type(leaf, chunk) & BTRFS_BLOCK_GROUP_RAID6) { |
| 2752 | factor = num_stripes - 2; | ||
| 2753 | } else { | ||
| 2754 | factor = num_stripes; | ||
| 2755 | } | ||
| 2672 | 2756 | ||
| 2673 | for (i = 0; i < num_stripes; i++) { | 2757 | for (i = 0; i < num_stripes; i++) { |
| 2674 | stripe = btrfs_stripe_nr(chunk, i); | 2758 | stripe = btrfs_stripe_nr(chunk, i); |
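chunk_drange_filter() now needs to know how many of a chunk's stripes carry distinct data, since that is what divides the chunk length into a per-device stripe length: mirrored profiles halve num_stripes, RAID5/6 subtract their one or two parity stripes, and everything else uses all stripes. A standalone sketch of the factor:

```c
#include <stdint.h>
#include <stdio.h>

#define BG_DUP    (1ULL << 0)	/* illustrative bit values */
#define BG_RAID1  (1ULL << 1)
#define BG_RAID10 (1ULL << 2)
#define BG_RAID5  (1ULL << 3)
#define BG_RAID6  (1ULL << 4)

static int stripe_factor(uint64_t type, int num_stripes)
{
	if (type & (BG_DUP | BG_RAID1 | BG_RAID10))
		return num_stripes / 2;		/* two copies of the data */
	if (type & BG_RAID5)
		return num_stripes - 1;		/* one parity stripe */
	if (type & BG_RAID6)
		return num_stripes - 2;		/* P and Q stripes */
	return num_stripes;			/* raid0/single */
}

int main(void)
{
	/* chunk length / factor gives the per-device stripe length */
	printf("raid6 over 6 devs: %d data stripes\n",
	       stripe_factor(BG_RAID6, 6));	/* 4 */
	return 0;
}
```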
| @@ -2985,6 +3069,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl, | |||
| 2985 | int mixed = 0; | 3069 | int mixed = 0; |
| 2986 | int ret; | 3070 | int ret; |
| 2987 | u64 num_devices; | 3071 | u64 num_devices; |
| 3072 | unsigned seq; | ||
| 2988 | 3073 | ||
| 2989 | if (btrfs_fs_closing(fs_info) || | 3074 | if (btrfs_fs_closing(fs_info) || |
| 2990 | atomic_read(&fs_info->balance_pause_req) || | 3075 | atomic_read(&fs_info->balance_pause_req) || |
| @@ -3027,7 +3112,9 @@ int btrfs_balance(struct btrfs_balance_control *bctl, | |||
| 3027 | allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); | 3112 | allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1); |
| 3028 | else | 3113 | else |
| 3029 | allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | | 3114 | allowed |= (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | |
| 3030 | BTRFS_BLOCK_GROUP_RAID10); | 3115 | BTRFS_BLOCK_GROUP_RAID10 | |
| 3116 | BTRFS_BLOCK_GROUP_RAID5 | | ||
| 3117 | BTRFS_BLOCK_GROUP_RAID6); | ||
| 3031 | 3118 | ||
| 3032 | if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && | 3119 | if ((bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && |
| 3033 | (!alloc_profile_is_valid(bctl->data.target, 1) || | 3120 | (!alloc_profile_is_valid(bctl->data.target, 1) || |
| @@ -3067,23 +3154,29 @@ int btrfs_balance(struct btrfs_balance_control *bctl, | |||
| 3067 | 3154 | ||
| 3068 | /* allow to reduce meta or sys integrity only if force set */ | 3155 | /* allow to reduce meta or sys integrity only if force set */ |
| 3069 | allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | | 3156 | allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | |
| 3070 | BTRFS_BLOCK_GROUP_RAID10; | 3157 | BTRFS_BLOCK_GROUP_RAID10 | |
| 3071 | if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && | 3158 | BTRFS_BLOCK_GROUP_RAID5 | |
| 3072 | (fs_info->avail_system_alloc_bits & allowed) && | 3159 | BTRFS_BLOCK_GROUP_RAID6; |
| 3073 | !(bctl->sys.target & allowed)) || | 3160 | do { |
| 3074 | ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && | 3161 | seq = read_seqbegin(&fs_info->profiles_lock); |
| 3075 | (fs_info->avail_metadata_alloc_bits & allowed) && | 3162 | |
| 3076 | !(bctl->meta.target & allowed))) { | 3163 | if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) && |
| 3077 | if (bctl->flags & BTRFS_BALANCE_FORCE) { | 3164 | (fs_info->avail_system_alloc_bits & allowed) && |
| 3078 | printk(KERN_INFO "btrfs: force reducing metadata " | 3165 | !(bctl->sys.target & allowed)) || |
| 3079 | "integrity\n"); | 3166 | ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) && |
| 3080 | } else { | 3167 | (fs_info->avail_metadata_alloc_bits & allowed) && |
| 3081 | printk(KERN_ERR "btrfs: balance will reduce metadata " | 3168 | !(bctl->meta.target & allowed))) { |
| 3082 | "integrity, use force if you want this\n"); | 3169 | if (bctl->flags & BTRFS_BALANCE_FORCE) { |
| 3083 | ret = -EINVAL; | 3170 | printk(KERN_INFO "btrfs: force reducing metadata " |
| 3084 | goto out; | 3171 | "integrity\n"); |
| 3172 | } else { | ||
| 3173 | printk(KERN_ERR "btrfs: balance will reduce metadata " | ||
| 3174 | "integrity, use force if you want this\n"); | ||
| 3175 | ret = -EINVAL; | ||
| 3176 | goto out; | ||
| 3177 | } | ||
| 3085 | } | 3178 | } |
| 3086 | } | 3179 | } while (read_seqretry(&fs_info->profiles_lock, seq)); |
| 3087 | 3180 | ||
| 3088 | if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { | 3181 | if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { |
| 3089 | int num_tolerated_disk_barrier_failures; | 3182 | int num_tolerated_disk_barrier_failures; |
| @@ -3127,21 +3220,16 @@ int btrfs_balance(struct btrfs_balance_control *bctl, | |||
| 3127 | mutex_lock(&fs_info->balance_mutex); | 3220 | mutex_lock(&fs_info->balance_mutex); |
| 3128 | atomic_dec(&fs_info->balance_running); | 3221 | atomic_dec(&fs_info->balance_running); |
| 3129 | 3222 | ||
| 3130 | if (bargs) { | ||
| 3131 | memset(bargs, 0, sizeof(*bargs)); | ||
| 3132 | update_ioctl_balance_args(fs_info, 0, bargs); | ||
| 3133 | } | ||
| 3134 | |||
| 3135 | if ((ret && ret != -ECANCELED && ret != -ENOSPC) || | ||
| 3136 | balance_need_close(fs_info)) { | ||
| 3137 | __cancel_balance(fs_info); | ||
| 3138 | } | ||
| 3139 | |||
| 3140 | if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { | 3223 | if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { |
| 3141 | fs_info->num_tolerated_disk_barrier_failures = | 3224 | fs_info->num_tolerated_disk_barrier_failures = |
| 3142 | btrfs_calc_num_tolerated_disk_barrier_failures(fs_info); | 3225 | btrfs_calc_num_tolerated_disk_barrier_failures(fs_info); |
| 3143 | } | 3226 | } |
| 3144 | 3227 | ||
| 3228 | if (bargs) { | ||
| 3229 | memset(bargs, 0, sizeof(*bargs)); | ||
| 3230 | update_ioctl_balance_args(fs_info, 0, bargs); | ||
| 3231 | } | ||
| 3232 | |||
| 3145 | wake_up(&fs_info->balance_wait_q); | 3233 | wake_up(&fs_info->balance_wait_q); |
| 3146 | 3234 | ||
| 3147 | return ret; | 3235 | return ret; |
| @@ -3504,13 +3592,86 @@ static int btrfs_cmp_device_info(const void *a, const void *b) | |||
| 3504 | } | 3592 | } |
| 3505 | 3593 | ||
| 3506 | struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { | 3594 | struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { |
| 3507 | { 2, 1, 0, 4, 2, 2 /* raid10 */ }, | 3595 | [BTRFS_RAID_RAID10] = { |
| 3508 | { 1, 1, 2, 2, 2, 2 /* raid1 */ }, | 3596 | .sub_stripes = 2, |
| 3509 | { 1, 2, 1, 1, 1, 2 /* dup */ }, | 3597 | .dev_stripes = 1, |
| 3510 | { 1, 1, 0, 2, 1, 1 /* raid0 */ }, | 3598 | .devs_max = 0, /* 0 == as many as possible */ |
| 3511 | { 1, 1, 1, 1, 1, 1 /* single */ }, | 3599 | .devs_min = 4, |
| 3600 | .devs_increment = 2, | ||
| 3601 | .ncopies = 2, | ||
| 3602 | }, | ||
| 3603 | [BTRFS_RAID_RAID1] = { | ||
| 3604 | .sub_stripes = 1, | ||
| 3605 | .dev_stripes = 1, | ||
| 3606 | .devs_max = 2, | ||
| 3607 | .devs_min = 2, | ||
| 3608 | .devs_increment = 2, | ||
| 3609 | .ncopies = 2, | ||
| 3610 | }, | ||
| 3611 | [BTRFS_RAID_DUP] = { | ||
| 3612 | .sub_stripes = 1, | ||
| 3613 | .dev_stripes = 2, | ||
| 3614 | .devs_max = 1, | ||
| 3615 | .devs_min = 1, | ||
| 3616 | .devs_increment = 1, | ||
| 3617 | .ncopies = 2, | ||
| 3618 | }, | ||
| 3619 | [BTRFS_RAID_RAID0] = { | ||
| 3620 | .sub_stripes = 1, | ||
| 3621 | .dev_stripes = 1, | ||
| 3622 | .devs_max = 0, | ||
| 3623 | .devs_min = 2, | ||
| 3624 | .devs_increment = 1, | ||
| 3625 | .ncopies = 1, | ||
| 3626 | }, | ||
| 3627 | [BTRFS_RAID_SINGLE] = { | ||
| 3628 | .sub_stripes = 1, | ||
| 3629 | .dev_stripes = 1, | ||
| 3630 | .devs_max = 1, | ||
| 3631 | .devs_min = 1, | ||
| 3632 | .devs_increment = 1, | ||
| 3633 | .ncopies = 1, | ||
| 3634 | }, | ||
| 3635 | [BTRFS_RAID_RAID5] = { | ||
| 3636 | .sub_stripes = 1, | ||
| 3637 | .dev_stripes = 1, | ||
| 3638 | .devs_max = 0, | ||
| 3639 | .devs_min = 2, | ||
| 3640 | .devs_increment = 1, | ||
| 3641 | .ncopies = 2, | ||
| 3642 | }, | ||
| 3643 | [BTRFS_RAID_RAID6] = { | ||
| 3644 | .sub_stripes = 1, | ||
| 3645 | .dev_stripes = 1, | ||
| 3646 | .devs_max = 0, | ||
| 3647 | .devs_min = 3, | ||
| 3648 | .devs_increment = 1, | ||
| 3649 | .ncopies = 3, | ||
| 3650 | }, | ||
| 3512 | }; | 3651 | }; |
| 3513 | 3652 | ||
| 3653 | static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target) | ||
| 3654 | { | ||
| 3655 | /* TODO allow them to set a preferred stripe size */ | ||
| 3656 | return 64 * 1024; | ||
| 3657 | } | ||
| 3658 | |||
| 3659 | static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type) | ||
| 3660 | { | ||
| 3661 | u64 features; | ||
| 3662 | |||
| 3663 | if (!(type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6))) | ||
| 3664 | return; | ||
| 3665 | |||
| 3666 | features = btrfs_super_incompat_flags(info->super_copy); | ||
| 3667 | if (features & BTRFS_FEATURE_INCOMPAT_RAID56) | ||
| 3668 | return; | ||
| 3669 | |||
| 3670 | features |= BTRFS_FEATURE_INCOMPAT_RAID56; | ||
| 3671 | btrfs_set_super_incompat_flags(info->super_copy, features); | ||
| 3672 | printk(KERN_INFO "btrfs: setting RAID5/6 feature flag\n"); | ||
| 3673 | } | ||
| 3674 | |||
| 3514 | static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, | 3675 | static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, |
| 3515 | struct btrfs_root *extent_root, | 3676 | struct btrfs_root *extent_root, |
| 3516 | struct map_lookup **map_ret, | 3677 | struct map_lookup **map_ret, |
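Replacing the positional array with designated initializers also documents the allocator's contract: clamp the usable device count between devs_min and devs_max (0 meaning unbounded), round down to a multiple of devs_increment, and place dev_stripes stripes per device. A sketch of consuming one row (the exact clamping order in __btrfs_alloc_chunk is an assumption):

```c
#include <stdio.h>

struct raid_attr {
	int sub_stripes;
	int dev_stripes;	/* stripes placed on each device */
	int devs_max;		/* 0 == as many as possible */
	int devs_min;
	int devs_increment;
	int ncopies;		/* copies of each data block */
};

static const struct raid_attr raid6_attr = {
	.sub_stripes = 1, .dev_stripes = 1, .devs_max = 0,
	.devs_min = 3, .devs_increment = 1, .ncopies = 3,
};

/* Return the stripe count for a new chunk, or -1 if too few devices. */
static int stripes_for(const struct raid_attr *a, int avail_devs)
{
	int ndevs = avail_devs;

	if (a->devs_max && ndevs > a->devs_max)
		ndevs = a->devs_max;
	ndevs -= ndevs % a->devs_increment;	/* e.g. keep pairs for raid10 */
	if (ndevs < a->devs_min)
		return -1;
	return ndevs * a->dev_stripes;
}

int main(void)
{
	printf("raid6 on 5 devices: %d stripes\n",
	       stripes_for(&raid6_attr, 5));
	return 0;
}
```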
| @@ -3526,6 +3687,8 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, | |||
| 3526 | struct btrfs_device_info *devices_info = NULL; | 3687 | struct btrfs_device_info *devices_info = NULL; |
| 3527 | u64 total_avail; | 3688 | u64 total_avail; |
| 3528 | int num_stripes; /* total number of stripes to allocate */ | 3689 | int num_stripes; /* total number of stripes to allocate */ |
| 3690 | int data_stripes; /* number of stripes that count for | ||
| 3691 | block group size */ | ||
| 3529 | int sub_stripes; /* sub_stripes info for map */ | 3692 | int sub_stripes; /* sub_stripes info for map */ |
| 3530 | int dev_stripes; /* stripes per dev */ | 3693 | int dev_stripes; /* stripes per dev */ |
| 3531 | int devs_max; /* max devs to use */ | 3694 | int devs_max; /* max devs to use */ |
| @@ -3537,6 +3700,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, | |||
| 3537 | u64 max_chunk_size; | 3700 | u64 max_chunk_size; |
| 3538 | u64 stripe_size; | 3701 | u64 stripe_size; |
| 3539 | u64 num_bytes; | 3702 | u64 num_bytes; |
| 3703 | u64 raid_stripe_len = BTRFS_STRIPE_LEN; | ||
| 3540 | int ndevs; | 3704 | int ndevs; |
| 3541 | int i; | 3705 | int i; |
| 3542 | int j; | 3706 | int j; |
| @@ -3631,12 +3795,16 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, | |||
| 3631 | if (max_avail < BTRFS_STRIPE_LEN * dev_stripes) | 3795 | if (max_avail < BTRFS_STRIPE_LEN * dev_stripes) |
| 3632 | continue; | 3796 | continue; |
| 3633 | 3797 | ||
| 3798 | if (ndevs == fs_devices->rw_devices) { | ||
| 3799 | WARN(1, "%s: found more than %llu devices\n", | ||
| 3800 | __func__, fs_devices->rw_devices); | ||
| 3801 | break; | ||
| 3802 | } | ||
| 3634 | devices_info[ndevs].dev_offset = dev_offset; | 3803 | devices_info[ndevs].dev_offset = dev_offset; |
| 3635 | devices_info[ndevs].max_avail = max_avail; | 3804 | devices_info[ndevs].max_avail = max_avail; |
| 3636 | devices_info[ndevs].total_avail = total_avail; | 3805 | devices_info[ndevs].total_avail = total_avail; |
| 3637 | devices_info[ndevs].dev = device; | 3806 | devices_info[ndevs].dev = device; |
| 3638 | ++ndevs; | 3807 | ++ndevs; |
| 3639 | WARN_ON(ndevs > fs_devices->rw_devices); | ||
| 3640 | } | 3808 | } |
| 3641 | 3809 | ||
| 3642 | /* | 3810 | /* |
| @@ -3662,16 +3830,48 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, | |||
| 3662 | stripe_size = devices_info[ndevs-1].max_avail; | 3830 | stripe_size = devices_info[ndevs-1].max_avail; |
| 3663 | num_stripes = ndevs * dev_stripes; | 3831 | num_stripes = ndevs * dev_stripes; |
| 3664 | 3832 | ||
| 3665 | if (stripe_size * ndevs > max_chunk_size * ncopies) { | 3833 | /* |
| 3666 | stripe_size = max_chunk_size * ncopies; | 3834 | * this will have to be fixed for RAID1 and RAID10 over |
| 3667 | do_div(stripe_size, ndevs); | 3835 | * more drives |
| 3836 | */ | ||
| 3837 | data_stripes = num_stripes / ncopies; | ||
| 3838 | |||
| 3839 | if (type & BTRFS_BLOCK_GROUP_RAID5) { | ||
| 3840 | raid_stripe_len = find_raid56_stripe_len(ndevs - 1, | ||
| 3841 | btrfs_super_stripesize(info->super_copy)); | ||
| 3842 | data_stripes = num_stripes - 1; | ||
| 3843 | } | ||
| 3844 | if (type & BTRFS_BLOCK_GROUP_RAID6) { | ||
| 3845 | raid_stripe_len = find_raid56_stripe_len(ndevs - 2, | ||
| 3846 | btrfs_super_stripesize(info->super_copy)); | ||
| 3847 | data_stripes = num_stripes - 2; | ||
| 3848 | } | ||
| 3849 | |||
| 3850 | /* | ||
| 3851 | * Use the number of data stripes to figure out how big this chunk | ||
| 3852 | * is really going to be in terms of logical address space, | ||
| 3853 | * and compare that answer with the max chunk size | ||
| 3854 | */ | ||
| 3855 | if (stripe_size * data_stripes > max_chunk_size) { | ||
| 3856 | u64 mask = (1ULL << 24) - 1; | ||
| 3857 | stripe_size = max_chunk_size; | ||
| 3858 | do_div(stripe_size, data_stripes); | ||
| 3859 | |||
| 3860 | /* bump the answer up to a 16MB boundary */ | ||
| 3861 | stripe_size = (stripe_size + mask) & ~mask; | ||
| 3862 | |||
| 3863 | /* but don't go higher than the limits we found | ||
| 3864 | * while searching for free extents | ||
| 3865 | */ | ||
| 3866 | if (stripe_size > devices_info[ndevs-1].max_avail) | ||
| 3867 | stripe_size = devices_info[ndevs-1].max_avail; | ||
| 3668 | } | 3868 | } |
| 3669 | 3869 | ||
| 3670 | do_div(stripe_size, dev_stripes); | 3870 | do_div(stripe_size, dev_stripes); |
| 3671 | 3871 | ||
| 3672 | /* align to BTRFS_STRIPE_LEN */ | 3872 | /* align to BTRFS_STRIPE_LEN */ |
| 3673 | do_div(stripe_size, BTRFS_STRIPE_LEN); | 3873 | do_div(stripe_size, raid_stripe_len); |
| 3674 | stripe_size *= BTRFS_STRIPE_LEN; | 3874 | stripe_size *= raid_stripe_len; |
| 3675 | 3875 | ||
| 3676 | map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); | 3876 | map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); |
| 3677 | if (!map) { | 3877 | if (!map) { |
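The sizing now reasons in logical bytes: stripe_size * data_stripes is the chunk's logical size, and when that exceeds the cap the per-device stripe is recomputed from the cap, rounded up to a 16MiB boundary, clamped to the smallest participating device's free space, then aligned down to the RAID stripe length. A worked sketch of the arithmetic (dev_stripes is taken as 1 here, so the intermediate division by dev_stripes drops out):

```c
#include <stdint.h>
#include <stdio.h>

static uint64_t clamp_stripe_size(uint64_t stripe_size, uint64_t max_avail,
				  uint64_t max_chunk_size, int data_stripes,
				  uint64_t raid_stripe_len)
{
	const uint64_t mask = (1ULL << 24) - 1;	/* 16MiB - 1 */

	if (stripe_size * data_stripes > max_chunk_size) {
		stripe_size = max_chunk_size / data_stripes;
		/* bump up to a 16MiB boundary ... */
		stripe_size = (stripe_size + mask) & ~mask;
		/* ... but never past the free space we actually found */
		if (stripe_size > max_avail)
			stripe_size = max_avail;
	}
	/* align to the RAID5/6 stripe length (64KiB by default) */
	stripe_size -= stripe_size % raid_stripe_len;
	return stripe_size;
}

int main(void)
{
	/* 6-device raid6 (4 data stripes), 10GiB cap, 100GiB free/dev */
	uint64_t sz = clamp_stripe_size(100ULL << 30, 100ULL << 30,
					10ULL << 30, 4, 64 * 1024);
	printf("stripe_size = %llu MiB\n",
	       (unsigned long long)(sz >> 20));	/* 2560 MiB */
	return 0;
}
```

The block group's logical size then follows as num_bytes = stripe_size * data_stripes, exactly as the hunk computes it.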
| @@ -3689,14 +3889,14 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, | |||
| 3689 | } | 3889 | } |
| 3690 | } | 3890 | } |
| 3691 | map->sector_size = extent_root->sectorsize; | 3891 | map->sector_size = extent_root->sectorsize; |
| 3692 | map->stripe_len = BTRFS_STRIPE_LEN; | 3892 | map->stripe_len = raid_stripe_len; |
| 3693 | map->io_align = BTRFS_STRIPE_LEN; | 3893 | map->io_align = raid_stripe_len; |
| 3694 | map->io_width = BTRFS_STRIPE_LEN; | 3894 | map->io_width = raid_stripe_len; |
| 3695 | map->type = type; | 3895 | map->type = type; |
| 3696 | map->sub_stripes = sub_stripes; | 3896 | map->sub_stripes = sub_stripes; |
| 3697 | 3897 | ||
| 3698 | *map_ret = map; | 3898 | *map_ret = map; |
| 3699 | num_bytes = stripe_size * (num_stripes / ncopies); | 3899 | num_bytes = stripe_size * data_stripes; |
| 3700 | 3900 | ||
| 3701 | *stripe_size_out = stripe_size; | 3901 | *stripe_size_out = stripe_size; |
| 3702 | *num_bytes_out = num_bytes; | 3902 | *num_bytes_out = num_bytes; |
| @@ -3718,15 +3918,10 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, | |||
| 3718 | write_lock(&em_tree->lock); | 3918 | write_lock(&em_tree->lock); |
| 3719 | ret = add_extent_mapping(em_tree, em); | 3919 | ret = add_extent_mapping(em_tree, em); |
| 3720 | write_unlock(&em_tree->lock); | 3920 | write_unlock(&em_tree->lock); |
| 3721 | free_extent_map(em); | 3921 | if (ret) { |
| 3722 | if (ret) | 3922 | free_extent_map(em); |
| 3723 | goto error; | ||
| 3724 | |||
| 3725 | ret = btrfs_make_block_group(trans, extent_root, 0, type, | ||
| 3726 | BTRFS_FIRST_CHUNK_TREE_OBJECTID, | ||
| 3727 | start, num_bytes); | ||
| 3728 | if (ret) | ||
| 3729 | goto error; | 3923 | goto error; |
| 3924 | } | ||
| 3730 | 3925 | ||
| 3731 | for (i = 0; i < map->num_stripes; ++i) { | 3926 | for (i = 0; i < map->num_stripes; ++i) { |
| 3732 | struct btrfs_device *device; | 3927 | struct btrfs_device *device; |
| @@ -3739,15 +3934,44 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, | |||
| 3739 | info->chunk_root->root_key.objectid, | 3934 | info->chunk_root->root_key.objectid, |
| 3740 | BTRFS_FIRST_CHUNK_TREE_OBJECTID, | 3935 | BTRFS_FIRST_CHUNK_TREE_OBJECTID, |
| 3741 | start, dev_offset, stripe_size); | 3936 | start, dev_offset, stripe_size); |
| 3742 | if (ret) { | 3937 | if (ret) |
| 3743 | btrfs_abort_transaction(trans, extent_root, ret); | 3938 | goto error_dev_extent; |
| 3744 | goto error; | 3939 | } |
| 3745 | } | 3940 | |
| 3941 | ret = btrfs_make_block_group(trans, extent_root, 0, type, | ||
| 3942 | BTRFS_FIRST_CHUNK_TREE_OBJECTID, | ||
| 3943 | start, num_bytes); | ||
| 3944 | if (ret) { | ||
| 3945 | i = map->num_stripes - 1; | ||
| 3946 | goto error_dev_extent; | ||
| 3746 | } | 3947 | } |
| 3747 | 3948 | ||
| 3949 | free_extent_map(em); | ||
| 3950 | check_raid56_incompat_flag(extent_root->fs_info, type); | ||
| 3951 | |||
| 3748 | kfree(devices_info); | 3952 | kfree(devices_info); |
| 3749 | return 0; | 3953 | return 0; |
| 3750 | 3954 | ||
| 3955 | error_dev_extent: | ||
| 3956 | for (; i >= 0; i--) { | ||
| 3957 | struct btrfs_device *device; | ||
| 3958 | int err; | ||
| 3959 | |||
| 3960 | device = map->stripes[i].dev; | ||
| 3961 | err = btrfs_free_dev_extent(trans, device, start); | ||
| 3962 | if (err) { | ||
| 3963 | btrfs_abort_transaction(trans, extent_root, err); | ||
| 3964 | break; | ||
| 3965 | } | ||
| 3966 | } | ||
| 3967 | write_lock(&em_tree->lock); | ||
| 3968 | remove_extent_mapping(em_tree, em); | ||
| 3969 | write_unlock(&em_tree->lock); | ||
| 3970 | |||
| 3971 | /* One for our allocation */ | ||
| 3972 | free_extent_map(em); | ||
| 3973 | /* One for the tree reference */ | ||
| 3974 | free_extent_map(em); | ||
| 3751 | error: | 3975 | error: |
| 3752 | kfree(map); | 3976 | kfree(map); |
| 3753 | kfree(devices_info); | 3977 | kfree(devices_info); |
| @@ -3887,10 +4111,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, | |||
| 3887 | if (ret) | 4111 | if (ret) |
| 3888 | return ret; | 4112 | return ret; |
| 3889 | 4113 | ||
| 3890 | alloc_profile = BTRFS_BLOCK_GROUP_METADATA | | 4114 | alloc_profile = btrfs_get_alloc_profile(extent_root, 0); |
| 3891 | fs_info->avail_metadata_alloc_bits; | ||
| 3892 | alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); | ||
| 3893 | |||
| 3894 | ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, | 4115 | ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, |
| 3895 | &stripe_size, chunk_offset, alloc_profile); | 4116 | &stripe_size, chunk_offset, alloc_profile); |
| 3896 | if (ret) | 4117 | if (ret) |
| @@ -3898,10 +4119,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, | |||
| 3898 | 4119 | ||
| 3899 | sys_chunk_offset = chunk_offset + chunk_size; | 4120 | sys_chunk_offset = chunk_offset + chunk_size; |
| 3900 | 4121 | ||
| 3901 | alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM | | 4122 | alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0); |
| 3902 | fs_info->avail_system_alloc_bits; | ||
| 3903 | alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); | ||
| 3904 | |||
| 3905 | ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map, | 4123 | ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map, |
| 3906 | &sys_chunk_size, &sys_stripe_size, | 4124 | &sys_chunk_size, &sys_stripe_size, |
| 3907 | sys_chunk_offset, alloc_profile); | 4125 | sys_chunk_offset, alloc_profile); |
| @@ -4014,6 +4232,10 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) | |||
| 4014 | ret = map->num_stripes; | 4232 | ret = map->num_stripes; |
| 4015 | else if (map->type & BTRFS_BLOCK_GROUP_RAID10) | 4233 | else if (map->type & BTRFS_BLOCK_GROUP_RAID10) |
| 4016 | ret = map->sub_stripes; | 4234 | ret = map->sub_stripes; |
| 4235 | else if (map->type & BTRFS_BLOCK_GROUP_RAID5) | ||
| 4236 | ret = 2; | ||
| 4237 | else if (map->type & BTRFS_BLOCK_GROUP_RAID6) | ||
| 4238 | ret = 3; | ||
| 4017 | else | 4239 | else |
| 4018 | ret = 1; | 4240 | ret = 1; |
| 4019 | free_extent_map(em); | 4241 | free_extent_map(em); |
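For btrfs_num_copies(), RAID5 and RAID6 report 2 and 3 respectively: a block can be read directly, rebuilt from P, and (for RAID6) rebuilt from Q, even though no literal second copy exists on disk. A sketch of the mapping (profile bit values illustrative):

```c
#include <stdint.h>
#include <stdio.h>

#define BG_RAID1  (1ULL << 0)	/* illustrative bit values */
#define BG_DUP    (1ULL << 1)
#define BG_RAID10 (1ULL << 2)
#define BG_RAID5  (1ULL << 3)
#define BG_RAID6  (1ULL << 4)

static int num_copies(uint64_t type, int num_stripes, int sub_stripes)
{
	if (type & (BG_DUP | BG_RAID1))
		return num_stripes;	/* every stripe is a full copy */
	if (type & BG_RAID10)
		return sub_stripes;
	if (type & BG_RAID5)
		return 2;	/* data, or rebuild from parity */
	if (type & BG_RAID6)
		return 3;	/* data, P rebuild, or Q rebuild */
	return 1;
}

int main(void)
{
	printf("raid6 copies: %d\n", num_copies(BG_RAID6, 6, 1));
	return 0;
}
```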
| @@ -4026,6 +4248,52 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) | |||
| 4026 | return ret; | 4248 | return ret; |
| 4027 | } | 4249 | } |
| 4028 | 4250 | ||
| 4251 | unsigned long btrfs_full_stripe_len(struct btrfs_root *root, | ||
| 4252 | struct btrfs_mapping_tree *map_tree, | ||
| 4253 | u64 logical) | ||
| 4254 | { | ||
| 4255 | struct extent_map *em; | ||
| 4256 | struct map_lookup *map; | ||
| 4257 | struct extent_map_tree *em_tree = &map_tree->map_tree; | ||
| 4258 | unsigned long len = root->sectorsize; | ||
| 4259 | |||
| 4260 | read_lock(&em_tree->lock); | ||
| 4261 | em = lookup_extent_mapping(em_tree, logical, len); | ||
| 4262 | read_unlock(&em_tree->lock); | ||
| 4263 | BUG_ON(!em); | ||
| 4264 | |||
| 4265 | BUG_ON(em->start > logical || em->start + em->len < logical); | ||
| 4266 | map = (struct map_lookup *)em->bdev; | ||
| 4267 | if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | | ||
| 4268 | BTRFS_BLOCK_GROUP_RAID6)) { | ||
| 4269 | len = map->stripe_len * nr_data_stripes(map); | ||
| 4270 | } | ||
| 4271 | free_extent_map(em); | ||
| 4272 | return len; | ||
| 4273 | } | ||
| 4274 | |||
| 4275 | int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree, | ||
| 4276 | u64 logical, u64 len, int mirror_num) | ||
| 4277 | { | ||
| 4278 | struct extent_map *em; | ||
| 4279 | struct map_lookup *map; | ||
| 4280 | struct extent_map_tree *em_tree = &map_tree->map_tree; | ||
| 4281 | int ret = 0; | ||
| 4282 | |||
| 4283 | read_lock(&em_tree->lock); | ||
| 4284 | em = lookup_extent_mapping(em_tree, logical, len); | ||
| 4285 | read_unlock(&em_tree->lock); | ||
| 4286 | BUG_ON(!em); | ||
| 4287 | |||
| 4288 | BUG_ON(em->start > logical || em->start + em->len < logical); | ||
| 4289 | map = (struct map_lookup *)em->bdev; | ||
| 4290 | if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | | ||
| 4291 | BTRFS_BLOCK_GROUP_RAID6)) | ||
| 4292 | ret = 1; | ||
| 4293 | free_extent_map(em); | ||
| 4294 | return ret; | ||
| 4295 | } | ||
| 4296 | |||
| 4029 | static int find_live_mirror(struct btrfs_fs_info *fs_info, | 4297 | static int find_live_mirror(struct btrfs_fs_info *fs_info, |
| 4030 | struct map_lookup *map, int first, int num, | 4298 | struct map_lookup *map, int first, int num, |
| 4031 | int optimal, int dev_replace_is_ongoing) | 4299 | int optimal, int dev_replace_is_ongoing) |
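Both new helpers lean on nr_data_stripes(), which lives in the new raid56 code rather than this hunk; presumably it subtracts the parity stripes (one for RAID5, two for RAID6) from num_stripes, so a full stripe covers stripe_len * nr_data_stripes bytes of logical address space. A sketch under that assumption:

```c
#include <stdint.h>
#include <stdio.h>

#define BG_RAID5 (1ULL << 0)	/* illustrative bit values */
#define BG_RAID6 (1ULL << 1)

struct map_lookup { uint64_t type; int num_stripes; uint64_t stripe_len; };

static int nr_parity_stripes(const struct map_lookup *map)
{
	if (map->type & BG_RAID5)
		return 1;
	if (map->type & BG_RAID6)
		return 2;
	return 0;
}

static int nr_data_stripes(const struct map_lookup *map)
{
	return map->num_stripes - nr_parity_stripes(map);
}

int main(void)
{
	struct map_lookup map = { BG_RAID6, 6, 64 * 1024 };

	printf("full stripe: %llu KiB\n", (unsigned long long)
	       (map.stripe_len * nr_data_stripes(&map) / 1024)); /* 256 */
	return 0;
}
```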
| @@ -4063,10 +4331,39 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, | |||
| 4063 | return optimal; | 4331 | return optimal; |
| 4064 | } | 4332 | } |
| 4065 | 4333 | ||
| 4334 | static inline int parity_smaller(u64 a, u64 b) | ||
| 4335 | { | ||
| 4336 | return a > b; | ||
| 4337 | } | ||
| 4338 | |||
| 4339 | /* Bubble-sort the stripe set to put the parity/syndrome stripes last */ | ||
| 4340 | static void sort_parity_stripes(struct btrfs_bio *bbio, u64 *raid_map) | ||
| 4341 | { | ||
| 4342 | struct btrfs_bio_stripe s; | ||
| 4343 | int i; | ||
| 4344 | u64 l; | ||
| 4345 | int again = 1; | ||
| 4346 | |||
| 4347 | while (again) { | ||
| 4348 | again = 0; | ||
| 4349 | for (i = 0; i < bbio->num_stripes - 1; i++) { | ||
| 4350 | if (parity_smaller(raid_map[i], raid_map[i+1])) { | ||
| 4351 | s = bbio->stripes[i]; | ||
| 4352 | l = raid_map[i]; | ||
| 4353 | bbio->stripes[i] = bbio->stripes[i+1]; | ||
| 4354 | raid_map[i] = raid_map[i+1]; | ||
| 4355 | bbio->stripes[i+1] = s; | ||
| 4356 | raid_map[i+1] = l; | ||
| 4357 | again = 1; | ||
| 4358 | } | ||
| 4359 | } | ||
| 4360 | } | ||
| 4361 | } | ||
| 4362 | |||
| 4066 | static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, | 4363 | static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, |
| 4067 | u64 logical, u64 *length, | 4364 | u64 logical, u64 *length, |
| 4068 | struct btrfs_bio **bbio_ret, | 4365 | struct btrfs_bio **bbio_ret, |
| 4069 | int mirror_num) | 4366 | int mirror_num, u64 **raid_map_ret) |
| 4070 | { | 4367 | { |
| 4071 | struct extent_map *em; | 4368 | struct extent_map *em; |
| 4072 | struct map_lookup *map; | 4369 | struct map_lookup *map; |
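The bubble sort works because of the parity sentinels in raid_map: RAID5_P_STRIPE and RAID6_Q_STRIPE are assumed here to be (u64)-2 and (u64)-1, the two largest 64-bit values, so sorting the map ascending necessarily pushes P and then Q behind every real logical address. A standalone sketch:

```c
#include <stdint.h>
#include <stdio.h>

#define RAID5_P_STRIPE ((uint64_t)-2)	/* assumed sentinel values */
#define RAID6_Q_STRIPE ((uint64_t)-1)

/* Bubble-sort stripe ids by raid_map, mirroring sort_parity_stripes(). */
static void sort_stripes(int *stripe, uint64_t *raid_map, int n)
{
	int again = 1;

	while (again) {
		again = 0;
		for (int i = 0; i < n - 1; i++) {
			if (raid_map[i] > raid_map[i + 1]) {
				int s = stripe[i];
				uint64_t l = raid_map[i];

				stripe[i] = stripe[i + 1];
				raid_map[i] = raid_map[i + 1];
				stripe[i + 1] = s;
				raid_map[i + 1] = l;
				again = 1;
			}
		}
	}
}

int main(void)
{
	/* rotated 4-disk raid6 stripe-set laid out as [d1, P, Q, d0] */
	uint64_t raid_map[4] = { 65536, RAID5_P_STRIPE, RAID6_Q_STRIPE, 0 };
	int stripe[4] = { 0, 1, 2, 3 };

	sort_stripes(stripe, raid_map, 4);
	for (int i = 0; i < 4; i++)
		printf("slot %d -> disk %d\n", i, stripe[i]);
	return 0;
}
```

With the parity slots always last, the raid56 code can treat stripes[0..nr_data-1] as data and the tail as P and Q.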
| @@ -4078,6 +4375,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, | |||
| 4078 | u64 stripe_nr; | 4375 | u64 stripe_nr; |
| 4079 | u64 stripe_nr_orig; | 4376 | u64 stripe_nr_orig; |
| 4080 | u64 stripe_nr_end; | 4377 | u64 stripe_nr_end; |
| 4378 | u64 stripe_len; | ||
| 4379 | u64 *raid_map = NULL; | ||
| 4081 | int stripe_index; | 4380 | int stripe_index; |
| 4082 | int i; | 4381 | int i; |
| 4083 | int ret = 0; | 4382 | int ret = 0; |
| @@ -4089,6 +4388,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, | |||
| 4089 | int num_alloc_stripes; | 4388 | int num_alloc_stripes; |
| 4090 | int patch_the_first_stripe_for_dev_replace = 0; | 4389 | int patch_the_first_stripe_for_dev_replace = 0; |
| 4091 | u64 physical_to_patch_in_first_stripe = 0; | 4390 | u64 physical_to_patch_in_first_stripe = 0; |
| 4391 | u64 raid56_full_stripe_start = (u64)-1; | ||
| 4092 | 4392 | ||
| 4093 | read_lock(&em_tree->lock); | 4393 | read_lock(&em_tree->lock); |
| 4094 | em = lookup_extent_mapping(em_tree, logical, *length); | 4394 | em = lookup_extent_mapping(em_tree, logical, *length); |
| @@ -4105,29 +4405,63 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, | |||
| 4105 | map = (struct map_lookup *)em->bdev; | 4405 | map = (struct map_lookup *)em->bdev; |
| 4106 | offset = logical - em->start; | 4406 | offset = logical - em->start; |
| 4107 | 4407 | ||
| 4408 | if (mirror_num > map->num_stripes) | ||
| 4409 | mirror_num = 0; | ||
| 4410 | |||
| 4411 | stripe_len = map->stripe_len; | ||
| 4108 | stripe_nr = offset; | 4412 | stripe_nr = offset; |
| 4109 | /* | 4413 | /* |
| 4110 | * stripe_nr counts the total number of stripes we have to stride | 4414 | * stripe_nr counts the total number of stripes we have to stride |
| 4111 | * to get to this block | 4415 | * to get to this block |
| 4112 | */ | 4416 | */ |
| 4113 | do_div(stripe_nr, map->stripe_len); | 4417 | do_div(stripe_nr, stripe_len); |
| 4114 | 4418 | ||
| 4115 | stripe_offset = stripe_nr * map->stripe_len; | 4419 | stripe_offset = stripe_nr * stripe_len; |
| 4116 | BUG_ON(offset < stripe_offset); | 4420 | BUG_ON(offset < stripe_offset); |
| 4117 | 4421 | ||
| 4118 | /* stripe_offset is the offset of this block in its stripe*/ | 4422 | /* stripe_offset is the offset of this block in its stripe*/ |
| 4119 | stripe_offset = offset - stripe_offset; | 4423 | stripe_offset = offset - stripe_offset; |
| 4120 | 4424 | ||
| 4121 | if (rw & REQ_DISCARD) | 4425 | /* if we're here for raid56, we need to know the stripe aligned start */ |
| 4426 | if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) { | ||
| 4427 | unsigned long full_stripe_len = stripe_len * nr_data_stripes(map); | ||
| 4428 | raid56_full_stripe_start = offset; | ||
| 4429 | |||
| 4430 | /* allow a write of a full stripe, but make sure we don't | ||
| 4431 | * allow straddling of stripes | ||
| 4432 | */ | ||
| 4433 | do_div(raid56_full_stripe_start, full_stripe_len); | ||
| 4434 | raid56_full_stripe_start *= full_stripe_len; | ||
| 4435 | } | ||
| 4436 | |||
| 4437 | if (rw & REQ_DISCARD) { | ||
| 4438 | /* we don't discard raid56 yet */ | ||
| 4439 | if (map->type & | ||
| 4440 | (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6)) { | ||
| 4441 | ret = -EOPNOTSUPP; | ||
| 4442 | goto out; | ||
| 4443 | } | ||
| 4122 | *length = min_t(u64, em->len - offset, *length); | 4444 | *length = min_t(u64, em->len - offset, *length); |
| 4123 | else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { | 4445 | } else if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { |
| 4124 | /* we limit the length of each bio to what fits in a stripe */ | 4446 | u64 max_len; |
| 4125 | *length = min_t(u64, em->len - offset, | 4447 | /* For writes to RAID[56], allow a full stripe set across all disks. |
| 4126 | map->stripe_len - stripe_offset); | 4448 | For other RAID types and for RAID[56] reads, just allow a single |
| 4449 | stripe (on a single disk). */ | ||
| 4450 | if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6) && | ||
| 4451 | (rw & REQ_WRITE)) { | ||
| 4452 | max_len = stripe_len * nr_data_stripes(map) - | ||
| 4453 | (offset - raid56_full_stripe_start); | ||
| 4454 | } else { | ||
| 4455 | /* we limit the length of each bio to what fits in a stripe */ | ||
| 4456 | max_len = stripe_len - stripe_offset; | ||
| 4457 | } | ||
| 4458 | *length = min_t(u64, em->len - offset, max_len); | ||
| 4127 | } else { | 4459 | } else { |
| 4128 | *length = em->len - offset; | 4460 | *length = em->len - offset; |
| 4129 | } | 4461 | } |
| 4130 | 4462 | ||
| 4463 | /* This is for when we're called from btrfs_merge_bio_hook() and all | ||
| 4464 | it cares about is the length */ | ||
| 4131 | if (!bbio_ret) | 4465 | if (!bbio_ret) |
| 4132 | goto out; | 4466 | goto out; |
| 4133 | 4467 | ||
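For RAID5/6 the mapping first snaps the chunk-relative offset down to the start of its full stripe (the set of data stripes sharing one parity strip), and a write may then extend from the original offset to the end of that full stripe. A worked example of the arithmetic:

```c
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t stripe_len = 64 * 1024;
	int nr_data = 4;			/* raid6 over 6 devices */
	uint64_t full_stripe_len = stripe_len * nr_data;	/* 256KiB */
	uint64_t offset = 700 * 1024;		/* offset within the chunk */

	/* round down to the full-stripe boundary, as the do_div pair does */
	uint64_t full_stripe_start = offset / full_stripe_len;
	full_stripe_start *= full_stripe_len;

	/* max length of one write bio: to the end of this full stripe */
	uint64_t max_len = full_stripe_len - (offset - full_stripe_start);

	printf("full stripe starts at %llu KiB, write may span %llu KiB\n",
	       (unsigned long long)(full_stripe_start / 1024),
	       (unsigned long long)(max_len / 1024));	/* 512, 68 */
	return 0;
}
```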
| @@ -4160,7 +4494,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, | |||
| 4160 | u64 physical_of_found = 0; | 4494 | u64 physical_of_found = 0; |
| 4161 | 4495 | ||
| 4162 | ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, | 4496 | ret = __btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, |
| 4163 | logical, &tmp_length, &tmp_bbio, 0); | 4497 | logical, &tmp_length, &tmp_bbio, 0, NULL); |
| 4164 | if (ret) { | 4498 | if (ret) { |
| 4165 | WARN_ON(tmp_bbio != NULL); | 4499 | WARN_ON(tmp_bbio != NULL); |
| 4166 | goto out; | 4500 | goto out; |
| @@ -4221,11 +4555,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, | |||
| 4221 | num_stripes = 1; | 4555 | num_stripes = 1; |
| 4222 | stripe_index = 0; | 4556 | stripe_index = 0; |
| 4223 | stripe_nr_orig = stripe_nr; | 4557 | stripe_nr_orig = stripe_nr; |
| 4224 | stripe_nr_end = (offset + *length + map->stripe_len - 1) & | 4558 | stripe_nr_end = ALIGN(offset + *length, map->stripe_len); |
| 4225 | (~(map->stripe_len - 1)); | ||
| 4226 | do_div(stripe_nr_end, map->stripe_len); | 4559 | do_div(stripe_nr_end, map->stripe_len); |
| 4227 | stripe_end_offset = stripe_nr_end * map->stripe_len - | 4560 | stripe_end_offset = stripe_nr_end * map->stripe_len - |
| 4228 | (offset + *length); | 4561 | (offset + *length); |
| 4562 | |||
| 4229 | if (map->type & BTRFS_BLOCK_GROUP_RAID0) { | 4563 | if (map->type & BTRFS_BLOCK_GROUP_RAID0) { |
| 4230 | if (rw & REQ_DISCARD) | 4564 | if (rw & REQ_DISCARD) |
| 4231 | num_stripes = min_t(u64, map->num_stripes, | 4565 | num_stripes = min_t(u64, map->num_stripes, |
| @@ -4276,6 +4610,65 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, | |||
| 4276 | dev_replace_is_ongoing); | 4610 | dev_replace_is_ongoing); |
| 4277 | mirror_num = stripe_index - old_stripe_index + 1; | 4611 | mirror_num = stripe_index - old_stripe_index + 1; |
| 4278 | } | 4612 | } |
| 4613 | |||
| 4614 | } else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | | ||
| 4615 | BTRFS_BLOCK_GROUP_RAID6)) { | ||
| 4616 | u64 tmp; | ||
| 4617 | |||
| 4618 | if (bbio_ret && ((rw & REQ_WRITE) || mirror_num > 1) | ||
| 4619 | && raid_map_ret) { | ||
| 4620 | int i, rot; | ||
| 4621 | |||
| 4622 | /* push stripe_nr back to the start of the full stripe */ | ||
| 4623 | stripe_nr = raid56_full_stripe_start; | ||
| 4624 | do_div(stripe_nr, stripe_len); | ||
| 4625 | |||
| 4626 | stripe_index = do_div(stripe_nr, nr_data_stripes(map)); | ||
| 4627 | |||
| 4628 | /* RAID[56] write or recovery. Return all stripes */ | ||
| 4629 | num_stripes = map->num_stripes; | ||
| 4630 | max_errors = nr_parity_stripes(map); | ||
| 4631 | |||
| 4632 | raid_map = kmalloc(sizeof(u64) * num_stripes, | ||
| 4633 | GFP_NOFS); | ||
| 4634 | if (!raid_map) { | ||
| 4635 | ret = -ENOMEM; | ||
| 4636 | goto out; | ||
| 4637 | } | ||
| 4638 | |||
| 4639 | /* Work out the disk rotation on this stripe-set */ | ||
| 4640 | tmp = stripe_nr; | ||
| 4641 | rot = do_div(tmp, num_stripes); | ||
| 4642 | |||
| 4643 | /* Fill in the logical address of each stripe */ | ||
| 4644 | tmp = stripe_nr * nr_data_stripes(map); | ||
| 4645 | for (i = 0; i < nr_data_stripes(map); i++) | ||
| 4646 | raid_map[(i+rot) % num_stripes] = | ||
| 4647 | em->start + (tmp + i) * map->stripe_len; | ||
| 4648 | |||
| 4649 | raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE; | ||
| 4650 | if (map->type & BTRFS_BLOCK_GROUP_RAID6) | ||
| 4651 | raid_map[(i+rot+1) % num_stripes] = | ||
| 4652 | RAID6_Q_STRIPE; | ||
| 4653 | |||
| 4654 | *length = map->stripe_len; | ||
| 4655 | stripe_index = 0; | ||
| 4656 | stripe_offset = 0; | ||
| 4657 | } else { | ||
| 4658 | /* | ||
| 4659 | * Mirror #0 or #1 means the original data block. | ||
| 4660 | * Mirror #2 is RAID5 parity block. | ||
| 4661 | * Mirror #3 is RAID6 Q block. | ||
| 4662 | */ | ||
| 4663 | stripe_index = do_div(stripe_nr, nr_data_stripes(map)); | ||
| 4664 | if (mirror_num > 1) | ||
| 4665 | stripe_index = nr_data_stripes(map) + | ||
| 4666 | mirror_num - 2; | ||
| 4667 | |||
| 4668 | /* We distribute the parity blocks across stripes */ | ||
| 4669 | tmp = stripe_nr + stripe_index; | ||
| 4670 | stripe_index = do_div(tmp, map->num_stripes); | ||
| 4671 | } | ||
| 4279 | } else { | 4672 | } else { |
| 4280 | /* | 4673 | /* |
| 4281 | * after this do_div call, stripe_nr is the number of stripes | 4674 | * after this do_div call, stripe_nr is the number of stripes |
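The write/recovery branch returns the whole stripe-set along with raid_map, which records the logical address served by each physical slot; parity positions get sentinel values, and rot = stripe_nr % num_stripes rotates parity across devices from one stripe-set to the next. A standalone sketch that prints the resulting layout (sentinel values assumed as above):

```c
#include <stdint.h>
#include <stdio.h>

#define RAID5_P_STRIPE ((uint64_t)-2)	/* assumed sentinel values */
#define RAID6_Q_STRIPE ((uint64_t)-1)

int main(void)
{
	const int num_stripes = 4, nr_data = 2;		/* 4-disk raid6 */
	const uint64_t stripe_len = 64 * 1024;
	const uint64_t em_start = 0;	/* chunk's logical start */

	for (uint64_t stripe_nr = 0; stripe_nr < 4; stripe_nr++) {
		uint64_t raid_map[4];
		int i, rot = stripe_nr % num_stripes;	/* disk rotation */
		uint64_t tmp = stripe_nr * nr_data;

		/* fill in the logical address of each data stripe */
		for (i = 0; i < nr_data; i++)
			raid_map[(i + rot) % num_stripes] =
				em_start + (tmp + i) * stripe_len;
		/* parity slots follow the data, wrapping around */
		raid_map[(i + rot) % num_stripes] = RAID5_P_STRIPE;
		raid_map[(i + rot + 1) % num_stripes] = RAID6_Q_STRIPE;

		printf("stripe-set %llu:", (unsigned long long)stripe_nr);
		for (i = 0; i < num_stripes; i++) {
			if (raid_map[i] == RAID5_P_STRIPE)
				printf("     P");
			else if (raid_map[i] == RAID6_Q_STRIPE)
				printf("     Q");
			else
				printf(" %4lluK", (unsigned long long)
				       (raid_map[i] / 1024));
		}
		printf("\n");
	}
	return 0;
}
```

Running it shows P and Q stepping one disk to the right on each successive stripe-set, which is what keeps parity I/O from concentrating on a single device.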
| @@ -4384,8 +4777,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, | |||
| 4384 | if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) { | 4777 | if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) { |
| 4385 | if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | | 4778 | if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | |
| 4386 | BTRFS_BLOCK_GROUP_RAID10 | | 4779 | BTRFS_BLOCK_GROUP_RAID10 | |
| 4780 | BTRFS_BLOCK_GROUP_RAID5 | | ||
| 4387 | BTRFS_BLOCK_GROUP_DUP)) { | 4781 | BTRFS_BLOCK_GROUP_DUP)) { |
| 4388 | max_errors = 1; | 4782 | max_errors = 1; |
| 4783 | } else if (map->type & BTRFS_BLOCK_GROUP_RAID6) { | ||
| 4784 | max_errors = 2; | ||
| 4389 | } | 4785 | } |
| 4390 | } | 4786 | } |
| 4391 | 4787 | ||
| @@ -4486,6 +4882,10 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, | |||
| 4486 | bbio->stripes[0].physical = physical_to_patch_in_first_stripe; | 4882 | bbio->stripes[0].physical = physical_to_patch_in_first_stripe; |
| 4487 | bbio->mirror_num = map->num_stripes + 1; | 4883 | bbio->mirror_num = map->num_stripes + 1; |
| 4488 | } | 4884 | } |
| 4885 | if (raid_map) { | ||
| 4886 | sort_parity_stripes(bbio, raid_map); | ||
| 4887 | *raid_map_ret = raid_map; | ||
| 4888 | } | ||
| 4489 | out: | 4889 | out: |
| 4490 | if (dev_replace_is_ongoing) | 4890 | if (dev_replace_is_ongoing) |
| 4491 | btrfs_dev_replace_unlock(dev_replace); | 4891 | btrfs_dev_replace_unlock(dev_replace); |
| @@ -4498,7 +4898,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, | |||
| 4498 | struct btrfs_bio **bbio_ret, int mirror_num) | 4898 | struct btrfs_bio **bbio_ret, int mirror_num) |
| 4499 | { | 4899 | { |
| 4500 | return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret, | 4900 | return __btrfs_map_block(fs_info, rw, logical, length, bbio_ret, |
| 4501 | mirror_num); | 4901 | mirror_num, NULL); |
| 4502 | } | 4902 | } |
| 4503 | 4903 | ||
| 4504 | int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, | 4904 | int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, |
| @@ -4512,6 +4912,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, | |||
| 4512 | u64 bytenr; | 4912 | u64 bytenr; |
| 4513 | u64 length; | 4913 | u64 length; |
| 4514 | u64 stripe_nr; | 4914 | u64 stripe_nr; |
| 4915 | u64 rmap_len; | ||
| 4515 | int i, j, nr = 0; | 4916 | int i, j, nr = 0; |
| 4516 | 4917 | ||
| 4517 | read_lock(&em_tree->lock); | 4918 | read_lock(&em_tree->lock); |
| @@ -4522,10 +4923,17 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, | |||
| 4522 | map = (struct map_lookup *)em->bdev; | 4923 | map = (struct map_lookup *)em->bdev; |
| 4523 | 4924 | ||
| 4524 | length = em->len; | 4925 | length = em->len; |
| 4926 | rmap_len = map->stripe_len; | ||
| 4927 | |||
| 4525 | if (map->type & BTRFS_BLOCK_GROUP_RAID10) | 4928 | if (map->type & BTRFS_BLOCK_GROUP_RAID10) |
| 4526 | do_div(length, map->num_stripes / map->sub_stripes); | 4929 | do_div(length, map->num_stripes / map->sub_stripes); |
| 4527 | else if (map->type & BTRFS_BLOCK_GROUP_RAID0) | 4930 | else if (map->type & BTRFS_BLOCK_GROUP_RAID0) |
| 4528 | do_div(length, map->num_stripes); | 4931 | do_div(length, map->num_stripes); |
| 4932 | else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 | | ||
| 4933 | BTRFS_BLOCK_GROUP_RAID6)) { | ||
| 4934 | do_div(length, nr_data_stripes(map)); | ||
| 4935 | rmap_len = map->stripe_len * nr_data_stripes(map); | ||
| 4936 | } | ||
| 4529 | 4937 | ||
| 4530 | buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS); | 4938 | buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS); |
| 4531 | BUG_ON(!buf); /* -ENOMEM */ | 4939 | BUG_ON(!buf); /* -ENOMEM */ |
| @@ -4545,8 +4953,11 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, | |||
| 4545 | do_div(stripe_nr, map->sub_stripes); | 4953 | do_div(stripe_nr, map->sub_stripes); |
| 4546 | } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) { | 4954 | } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) { |
| 4547 | stripe_nr = stripe_nr * map->num_stripes + i; | 4955 | stripe_nr = stripe_nr * map->num_stripes + i; |
| 4548 | } | 4956 | } /* else if RAID[56], multiply by nr_data_stripes(). |
| 4549 | bytenr = chunk_start + stripe_nr * map->stripe_len; | 4957 | * Alternatively, just use rmap_len below instead of |
| 4958 | * map->stripe_len */ | ||
| 4959 | |||
| 4960 | bytenr = chunk_start + stripe_nr * rmap_len; | ||
| 4550 | WARN_ON(nr >= map->num_stripes); | 4961 | WARN_ON(nr >= map->num_stripes); |
| 4551 | for (j = 0; j < nr; j++) { | 4962 | for (j = 0; j < nr; j++) { |
| 4552 | if (buf[j] == bytenr) | 4963 | if (buf[j] == bytenr) |
| @@ -4560,7 +4971,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, | |||
| 4560 | 4971 | ||
| 4561 | *logical = buf; | 4972 | *logical = buf; |
| 4562 | *naddrs = nr; | 4973 | *naddrs = nr; |
| 4563 | *stripe_len = map->stripe_len; | 4974 | *stripe_len = rmap_len; |
| 4564 | 4975 | ||
| 4565 | free_extent_map(em); | 4976 | free_extent_map(em); |
| 4566 | return 0; | 4977 | return 0; |
| @@ -4634,7 +5045,7 @@ static void btrfs_end_bio(struct bio *bio, int err) | |||
| 4634 | bio->bi_bdev = (struct block_device *) | 5045 | bio->bi_bdev = (struct block_device *) |
| 4635 | (unsigned long)bbio->mirror_num; | 5046 | (unsigned long)bbio->mirror_num; |
| 4636 | /* only send an error to the higher layers if it is | 5047 | /* only send an error to the higher layers if it is |
| 4637 | * beyond the tolerance of the multi-bio | 5048 | * beyond the tolerance of the btrfs bio |
| 4638 | */ | 5049 | */ |
| 4639 | if (atomic_read(&bbio->error) > bbio->max_errors) { | 5050 | if (atomic_read(&bbio->error) > bbio->max_errors) { |
| 4640 | err = -EIO; | 5051 | err = -EIO; |
| @@ -4668,13 +5079,18 @@ struct async_sched { | |||
| 4668 | * This will add one bio to the pending list for a device and make sure | 5079 | * This will add one bio to the pending list for a device and make sure |
| 4669 | * the work struct is scheduled. | 5080 | * the work struct is scheduled. |
| 4670 | */ | 5081 | */ |
| 4671 | static noinline void schedule_bio(struct btrfs_root *root, | 5082 | noinline void btrfs_schedule_bio(struct btrfs_root *root, |
| 4672 | struct btrfs_device *device, | 5083 | struct btrfs_device *device, |
| 4673 | int rw, struct bio *bio) | 5084 | int rw, struct bio *bio) |
| 4674 | { | 5085 | { |
| 4675 | int should_queue = 1; | 5086 | int should_queue = 1; |
| 4676 | struct btrfs_pending_bios *pending_bios; | 5087 | struct btrfs_pending_bios *pending_bios; |
| 4677 | 5088 | ||
| 5089 | if (device->missing || !device->bdev) { | ||
| 5090 | bio_endio(bio, -EIO); | ||
| 5091 | return; | ||
| 5092 | } | ||
| 5093 | |||
| 4678 | /* don't bother with additional async steps for reads, right now */ | 5094 | /* don't bother with additional async steps for reads, right now */ |
| 4679 | if (!(rw & REQ_WRITE)) { | 5095 | if (!(rw & REQ_WRITE)) { |
| 4680 | bio_get(bio); | 5096 | bio_get(bio); |
| @@ -4772,7 +5188,7 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio, | |||
| 4772 | #endif | 5188 | #endif |
| 4773 | bio->bi_bdev = dev->bdev; | 5189 | bio->bi_bdev = dev->bdev; |
| 4774 | if (async) | 5190 | if (async) |
| 4775 | schedule_bio(root, dev, rw, bio); | 5191 | btrfs_schedule_bio(root, dev, rw, bio); |
| 4776 | else | 5192 | else |
| 4777 | btrfsic_submit_bio(rw, bio); | 5193 | btrfsic_submit_bio(rw, bio); |
| 4778 | } | 5194 | } |
| @@ -4831,6 +5247,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, | |||
| 4831 | u64 logical = (u64)bio->bi_sector << 9; | 5247 | u64 logical = (u64)bio->bi_sector << 9; |
| 4832 | u64 length = 0; | 5248 | u64 length = 0; |
| 4833 | u64 map_length; | 5249 | u64 map_length; |
| 5250 | u64 *raid_map = NULL; | ||
| 4834 | int ret; | 5251 | int ret; |
| 4835 | int dev_nr = 0; | 5252 | int dev_nr = 0; |
| 4836 | int total_devs = 1; | 5253 | int total_devs = 1; |
| @@ -4839,12 +5256,30 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, | |||
| 4839 | length = bio->bi_size; | 5256 | length = bio->bi_size; |
| 4840 | map_length = length; | 5257 | map_length = length; |
| 4841 | 5258 | ||
| 4842 | ret = btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio, | 5259 | ret = __btrfs_map_block(root->fs_info, rw, logical, &map_length, &bbio, |
| 4843 | mirror_num); | 5260 | mirror_num, &raid_map); |
| 4844 | if (ret) | 5261 | if (ret) /* -ENOMEM */ |
| 4845 | return ret; | 5262 | return ret; |
| 4846 | 5263 | ||
| 4847 | total_devs = bbio->num_stripes; | 5264 | total_devs = bbio->num_stripes; |
| 5265 | bbio->orig_bio = first_bio; | ||
| 5266 | bbio->private = first_bio->bi_private; | ||
| 5267 | bbio->end_io = first_bio->bi_end_io; | ||
| 5268 | atomic_set(&bbio->stripes_pending, bbio->num_stripes); | ||
| 5269 | |||
| 5270 | if (raid_map) { | ||
| 5271 | /* In this case, map_length has been set to the length of | ||
| 5272 | a single stripe, not the whole write */ | ||
| 5273 | if (rw & WRITE) { | ||
| 5274 | return raid56_parity_write(root, bio, bbio, | ||
| 5275 | raid_map, map_length); | ||
| 5276 | } else { | ||
| 5277 | return raid56_parity_recover(root, bio, bbio, | ||
| 5278 | raid_map, map_length, | ||
| 5279 | mirror_num); | ||
| 5280 | } | ||
| 5281 | } | ||
| 5282 | |||
| 4848 | if (map_length < length) { | 5283 | if (map_length < length) { |
| 4849 | printk(KERN_CRIT "btrfs: mapping failed logical %llu bio len %llu " | 5284 | printk(KERN_CRIT "btrfs: mapping failed logical %llu bio len %llu " |
| 4850 | "len %llu\n", (unsigned long long)logical, | 5285 | "len %llu\n", (unsigned long long)logical, |
| @@ -4853,11 +5288,6 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, | |||
| 4853 | BUG(); | 5288 | BUG(); |
| 4854 | } | 5289 | } |
| 4855 | 5290 | ||
| 4856 | bbio->orig_bio = first_bio; | ||
| 4857 | bbio->private = first_bio->bi_private; | ||
| 4858 | bbio->end_io = first_bio->bi_end_io; | ||
| 4859 | atomic_set(&bbio->stripes_pending, bbio->num_stripes); | ||
| 4860 | |||
| 4861 | while (dev_nr < total_devs) { | 5291 | while (dev_nr < total_devs) { |
| 4862 | dev = bbio->stripes[dev_nr].dev; | 5292 | dev = bbio->stripes[dev_nr].dev; |
| 4863 | if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) { | 5293 | if (!dev || !dev->bdev || (rw & WRITE && !dev->writeable)) { |
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index d3c3939ac751..062d8604d35b 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h | |||
| @@ -21,8 +21,8 @@ | |||
| 21 | 21 | ||
| 22 | #include <linux/bio.h> | 22 | #include <linux/bio.h> |
| 23 | #include <linux/sort.h> | 23 | #include <linux/sort.h> |
| 24 | #include <linux/btrfs.h> | ||
| 24 | #include "async-thread.h" | 25 | #include "async-thread.h" |
| 25 | #include "ioctl.h" | ||
| 26 | 26 | ||
| 27 | #define BTRFS_STRIPE_LEN (64 * 1024) | 27 | #define BTRFS_STRIPE_LEN (64 * 1024) |
| 28 | 28 | ||
| @@ -321,7 +321,14 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, | |||
| 321 | void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info, | 321 | void btrfs_init_dev_replace_tgtdev_for_resume(struct btrfs_fs_info *fs_info, |
| 322 | struct btrfs_device *tgtdev); | 322 | struct btrfs_device *tgtdev); |
| 323 | int btrfs_scratch_superblock(struct btrfs_device *device); | 323 | int btrfs_scratch_superblock(struct btrfs_device *device); |
| 324 | 324 | void btrfs_schedule_bio(struct btrfs_root *root, | |
| 325 | struct btrfs_device *device, | ||
| 326 | int rw, struct bio *bio); | ||
| 327 | int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree, | ||
| 328 | u64 logical, u64 len, int mirror_num); | ||
| 329 | unsigned long btrfs_full_stripe_len(struct btrfs_root *root, | ||
| 330 | struct btrfs_mapping_tree *map_tree, | ||
| 331 | u64 logical); | ||
| 325 | static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, | 332 | static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, |
| 326 | int index) | 333 | int index) |
| 327 | { | 334 | { |
