Diffstat (limited to 'fs/btrfs')
-rw-r--r--  fs/btrfs/Kconfig            |    2
-rw-r--r--  fs/btrfs/Makefile           |    4
-rw-r--r--  fs/btrfs/acl.c              |   47
-rw-r--r--  fs/btrfs/btrfs_inode.h      |   23
-rw-r--r--  fs/btrfs/compression.c      |  419
-rw-r--r--  fs/btrfs/compression.h      |   72
-rw-r--r--  fs/btrfs/ctree.c            |  305
-rw-r--r--  fs/btrfs/ctree.h            |  474
-rw-r--r--  fs/btrfs/delayed-inode.c    | 1773
-rw-r--r--  fs/btrfs/delayed-inode.h    |  145
-rw-r--r--  fs/btrfs/delayed-ref.c      |  120
-rw-r--r--  fs/btrfs/delayed-ref.h      |    6
-rw-r--r--  fs/btrfs/dir-item.c         |   76
-rw-r--r--  fs/btrfs/disk-io.c          |  958
-rw-r--r--  fs/btrfs/disk-io.h          |   20
-rw-r--r--  fs/btrfs/export.c           |  107
-rw-r--r--  fs/btrfs/extent-tree.c      | 3159
-rw-r--r--  fs/btrfs/extent_io.c        |  806
-rw-r--r--  fs/btrfs/extent_io.h        |   67
-rw-r--r--  fs/btrfs/extent_map.c       |   20
-rw-r--r--  fs/btrfs/extent_map.h       |    7
-rw-r--r--  fs/btrfs/file-item.c        |   46
-rw-r--r--  fs/btrfs/file.c             |  937
-rw-r--r--  fs/btrfs/free-space-cache.c | 2158
-rw-r--r--  fs/btrfs/free-space-cache.h |   68
-rw-r--r--  fs/btrfs/inode-item.c       |    2
-rw-r--r--  fs/btrfs/inode-map.c        |  477
-rw-r--r--  fs/btrfs/inode-map.h        |   13
-rw-r--r--  fs/btrfs/inode.c            | 2207
-rw-r--r--  fs/btrfs/ioctl.c            | 1340
-rw-r--r--  fs/btrfs/ioctl.h            |  120
-rw-r--r--  fs/btrfs/locking.c          |   25
-rw-r--r--  fs/btrfs/locking.h          |    2
-rw-r--r--  fs/btrfs/lzo.c              |  427
-rw-r--r--  fs/btrfs/ordered-data.c     |   97
-rw-r--r--  fs/btrfs/ordered-data.h     |   11
-rw-r--r--  fs/btrfs/orphan.c           |    6
-rw-r--r--  fs/btrfs/print-tree.c       |    1
-rw-r--r--  fs/btrfs/ref-cache.c        |  164
-rw-r--r--  fs/btrfs/ref-cache.h        |   24
-rw-r--r--  fs/btrfs/relocation.c       |  275
-rw-r--r--  fs/btrfs/root-tree.c        |   87
-rw-r--r--  fs/btrfs/scrub.c            | 1395
-rw-r--r--  fs/btrfs/super.c            |  504
-rw-r--r--  fs/btrfs/sysfs.c            |  223
-rw-r--r--  fs/btrfs/transaction.c      |  748
-rw-r--r--  fs/btrfs/transaction.h      |   41
-rw-r--r--  fs/btrfs/tree-defrag.c      |    4
-rw-r--r--  fs/btrfs/tree-log.c         |  341
-rw-r--r--  fs/btrfs/tree-log.h         |    1
-rw-r--r--  fs/btrfs/version.sh         |   43
-rw-r--r--  fs/btrfs/volumes.c          | 1094
-rw-r--r--  fs/btrfs/volumes.h          |   43
-rw-r--r--  fs/btrfs/xattr.c            |   75
-rw-r--r--  fs/btrfs/xattr.h            |    3
-rw-r--r--  fs/btrfs/zlib.c             |  377
56 files changed, 15026 insertions, 6963 deletions
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index 7bb3c020e570..ecb9fd3be143 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -4,6 +4,8 @@ config BTRFS_FS
         select LIBCRC32C
         select ZLIB_INFLATE
         select ZLIB_DEFLATE
+        select LZO_COMPRESS
+        select LZO_DECOMPRESS
         help
           Btrfs is a new filesystem with extents, writable snapshotting,
           support for multiple devices and many more features.
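
Selecting LZO_COMPRESS and LZO_DECOMPRESS pulls the kernel's shared LZO
library into any build that enables btrfs; the new fs/btrfs/lzo.c backend
(see the diffstat above) then becomes selectable per mount, e.g. with
"mount -o compress=lzo <dev> <mountpoint>".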
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index a35eb36b32fd..9b72dcf1cd25 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -6,5 +6,5 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
            transaction.o inode.o file.o tree-defrag.o \
            extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
            extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
-           export.o tree-log.o acl.o free-space-cache.o zlib.o \
-           compression.o delayed-ref.o relocation.o
+           export.o tree-log.o acl.o free-space-cache.o zlib.o lzo.o \
+           compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 2222d161c7b6..f66fc9959733 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -37,6 +37,9 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
        char *value = NULL;
        struct posix_acl *acl;

+       if (!IS_POSIXACL(inode))
+               return NULL;
+
        acl = get_cached_acl(inode, type);
        if (acl != ACL_NOT_CACHED)
                return acl;
@@ -60,8 +63,10 @@ static struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
        size = __btrfs_getxattr(inode, name, value, size);
        if (size > 0) {
                acl = posix_acl_from_xattr(value, size);
-               if (IS_ERR(acl))
+               if (IS_ERR(acl)) {
+                       kfree(value);
                        return acl;
+               }
                set_cached_acl(inode, type, acl);
        }
        kfree(value);
@@ -82,6 +87,9 @@ static int btrfs_xattr_acl_get(struct dentry *dentry, const char *name,
        struct posix_acl *acl;
        int ret = 0;

+       if (!IS_POSIXACL(dentry->d_inode))
+               return -EOPNOTSUPP;
+
        acl = btrfs_get_acl(dentry->d_inode, type);

        if (IS_ERR(acl))
@@ -162,7 +170,7 @@ static int btrfs_xattr_acl_set(struct dentry *dentry, const char *name,
        int ret;
        struct posix_acl *acl = NULL;

-       if (!is_owner_or_cap(dentry->d_inode))
+       if (!inode_owner_or_capable(dentry->d_inode))
                return -EPERM;

        if (!IS_POSIXACL(dentry->d_inode))
@@ -170,33 +178,40 @@ static int btrfs_xattr_acl_set(struct dentry *dentry, const char *name,

        if (value) {
                acl = posix_acl_from_xattr(value, size);
-               if (acl == NULL) {
-                       value = NULL;
-                       size = 0;
-               } else if (IS_ERR(acl)) {
+               if (IS_ERR(acl))
                        return PTR_ERR(acl);
+
+               if (acl) {
+                       ret = posix_acl_valid(acl);
+                       if (ret)
+                               goto out;
                }
        }

        ret = btrfs_set_acl(NULL, dentry->d_inode, acl, type);
-
+out:
        posix_acl_release(acl);

        return ret;
 }

-int btrfs_check_acl(struct inode *inode, int mask)
+int btrfs_check_acl(struct inode *inode, int mask, unsigned int flags)
 {
-       struct posix_acl *acl;
        int error = -EAGAIN;

-       acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS);
+       if (flags & IPERM_FLAG_RCU) {
+               if (!negative_cached_acl(inode, ACL_TYPE_ACCESS))
+                       error = -ECHILD;

-       if (IS_ERR(acl))
-               return PTR_ERR(acl);
-       if (acl) {
-               error = posix_acl_permission(inode, acl, mask);
-               posix_acl_release(acl);
+       } else {
+               struct posix_acl *acl;
+               acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS);
+               if (IS_ERR(acl))
+                       return PTR_ERR(acl);
+               if (acl) {
+                       error = posix_acl_permission(inode, acl, mask);
+                       posix_acl_release(acl);
+               }
        }

        return error;
@@ -273,7 +288,7 @@ int btrfs_acl_chmod(struct inode *inode)
                return 0;

        acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS);
-       if (IS_ERR(acl) || !acl)
+       if (IS_ERR_OR_NULL(acl))
                return PTR_ERR(acl);

        clone = posix_acl_clone(acl, GFP_KERNEL);
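
Two of the acl.c hunks above are straight bug fixes rather than interface
churn: btrfs_get_acl() leaked the xattr scratch buffer whenever
posix_acl_from_xattr() returned an ERR_PTR, and btrfs_xattr_acl_set()
applied user-supplied ACLs without running posix_acl_valid() over them
first. A minimal sketch of the corrected lookup error path, with the name
selection and size probing elided (the buffer size is assumed known here
purely for illustration):

        value = kmalloc(size, GFP_NOFS);        /* size assumed known */
        if (!value)
                return ERR_PTR(-ENOMEM);
        size = __btrfs_getxattr(inode, name, value, size);
        if (size > 0) {
                acl = posix_acl_from_xattr(value, size);
                if (IS_ERR(acl)) {
                        kfree(value);   /* the leak the hunk closes */
                        return acl;
                }
                set_cached_acl(inode, type, acl);
        }
        kfree(value);
        return acl;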
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 6ad63f17eca0..52d7eca8c7bf 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -22,6 +22,7 @@
 #include "extent_map.h"
 #include "extent_io.h"
 #include "ordered-data.h"
+#include "delayed-inode.h"

 /* in memory btrfs inode */
 struct btrfs_inode {
@@ -120,9 +121,6 @@ struct btrfs_inode {
         */
        u64 index_cnt;

-       /* the start of block group preferred for allocations. */
-       u64 block_group;
-
        /* the fsync log has some corner cases that mean we have to check
         * directories to see if any unlinks have been done before
         * the directory was logged. See tree-log.c for all the
@@ -136,9 +134,8 @@ struct btrfs_inode {
         * items we think we'll end up using, and reserved_extents is the number
         * of extent items we've reserved metadata for.
         */
-       spinlock_t accounting_lock;
        atomic_t outstanding_extents;
-       int reserved_extents;
+       atomic_t reserved_extents;

        /*
         * ordered_data_close is set by truncate when a file that used
@@ -153,20 +150,34 @@ struct btrfs_inode {
        unsigned ordered_data_close:1;
        unsigned orphan_meta_reserved:1;
        unsigned dummy_inode:1;
+       unsigned in_defrag:1;

        /*
         * always compress this one file
         */
-       unsigned force_compress:1;
+       unsigned force_compress:4;
+
+       struct btrfs_delayed_node *delayed_node;

        struct inode vfs_inode;
 };

+extern unsigned char btrfs_filetype_table[];
+
 static inline struct btrfs_inode *BTRFS_I(struct inode *inode)
 {
        return container_of(inode, struct btrfs_inode, vfs_inode);
 }

+static inline u64 btrfs_ino(struct inode *inode)
+{
+       u64 ino = BTRFS_I(inode)->location.objectid;
+
+       if (ino <= BTRFS_FIRST_FREE_OBJECTID)
+               ino = inode->i_ino;
+       return ino;
+}
+
 static inline void btrfs_i_size_write(struct inode *inode, u64 size)
 {
        i_size_write(inode, size);
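
The btrfs_ino() helper added above exists because, with the delayed-inode
and cached free-inode-number work in this series, the number the VFS hands
out in i_ino is no longer guaranteed to match the objectid under which the
inode's items are keyed on disk; location.objectid is authoritative for
anything past BTRFS_FIRST_FREE_OBJECTID. Callers that build keys or print
diagnostics are converted from inode->i_ino to the helper, roughly like
this (usage sketch only):

        struct btrfs_key key;

        key.objectid = btrfs_ino(inode);        /* not inode->i_ino */
        key.type = BTRFS_INODE_ITEM_KEY;
        key.offset = 0;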
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 396039b3a8a2..bfe42b03eaf9 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -62,6 +62,9 @@ struct compressed_bio {
        /* number of bytes on disk */
        unsigned long compressed_len;

+       /* the compression algorithm for this bio */
+       int compress_type;
+
        /* number of compressed pages in the array */
        unsigned long nr_pages;

@@ -91,23 +94,10 @@ static inline int compressed_bio_size(struct btrfs_root *root,
 static struct bio *compressed_bio_alloc(struct block_device *bdev,
                                        u64 first_byte, gfp_t gfp_flags)
 {
-       struct bio *bio;
        int nr_vecs;

        nr_vecs = bio_get_nr_vecs(bdev);
-       bio = bio_alloc(gfp_flags, nr_vecs);
-
-       if (bio == NULL && (current->flags & PF_MEMALLOC)) {
-               while (!bio && (nr_vecs /= 2))
-                       bio = bio_alloc(gfp_flags, nr_vecs);
-       }
-
-       if (bio) {
-               bio->bi_size = 0;
-               bio->bi_bdev = bdev;
-               bio->bi_sector = first_byte >> 9;
-       }
-       return bio;
+       return btrfs_bio_alloc(bdev, first_byte >> 9, nr_vecs, gfp_flags);
 }

 static int check_compressed_csum(struct inode *inode,
@@ -135,9 +125,10 @@ static int check_compressed_csum(struct inode *inode,
        kunmap_atomic(kaddr, KM_USER0);

        if (csum != *cb_sum) {
-               printk(KERN_INFO "btrfs csum failed ino %lu "
+               printk(KERN_INFO "btrfs csum failed ino %llu "
                       "extent %llu csum %u "
-                      "wanted %u mirror %d\n", inode->i_ino,
+                      "wanted %u mirror %d\n",
+                      (unsigned long long)btrfs_ino(inode),
                       (unsigned long long)disk_start,
                       csum, *cb_sum, cb->mirror_num);
                ret = -EIO;
@@ -163,7 +154,6 @@ fail:
  */
 static void end_compressed_bio_read(struct bio *bio, int err)
 {
-       struct extent_io_tree *tree;
        struct compressed_bio *cb = bio->bi_private;
        struct inode *inode;
        struct page *page;
@@ -187,12 +177,12 @@ static void end_compressed_bio_read(struct bio *bio, int err)
        /* ok, we're the last bio for this extent, lets start
         * the decompression.
         */
-       tree = &BTRFS_I(inode)->io_tree;
-       ret = btrfs_zlib_decompress_biovec(cb->compressed_pages,
+       ret = btrfs_decompress_biovec(cb->compress_type,
+                                     cb->compressed_pages,
                                      cb->start,
                                      cb->orig_bio->bi_io_vec,
                                      cb->orig_bio->bi_vcnt,
                                      cb->compressed_len);
 csum_failed:
        if (ret)
                cb->errors = 1;
@@ -343,7 +333,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
        struct compressed_bio *cb;
        unsigned long bytes_left;
        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
-       int page_index = 0;
+       int pg_index = 0;
        struct page *page;
        u64 first_byte = disk_start;
        struct block_device *bdev;
@@ -351,6 +341,8 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,

        WARN_ON(start & ((u64)PAGE_CACHE_SIZE - 1));
        cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS);
+       if (!cb)
+               return -ENOMEM;
        atomic_set(&cb->pending_bios, 0);
        cb->errors = 0;
        cb->inode = inode;
@@ -365,14 +357,18 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
        bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;

        bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS);
+       if(!bio) {
+               kfree(cb);
+               return -ENOMEM;
+       }
        bio->bi_private = cb;
        bio->bi_end_io = end_compressed_bio_write;
        atomic_inc(&cb->pending_bios);

        /* create and submit bios for the compressed pages */
        bytes_left = compressed_len;
-       for (page_index = 0; page_index < cb->nr_pages; page_index++) {
-               page = compressed_pages[page_index];
+       for (pg_index = 0; pg_index < cb->nr_pages; pg_index++) {
+               page = compressed_pages[pg_index];
                page->mapping = inode->i_mapping;
                if (bio->bi_size)
                        ret = io_tree->ops->merge_bio_hook(page, 0,
@@ -437,7 +433,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
                                     struct compressed_bio *cb)
 {
        unsigned long end_index;
-       unsigned long page_index;
+       unsigned long pg_index;
        u64 last_offset;
        u64 isize = i_size_read(inode);
        int ret;
@@ -461,13 +457,13 @@ static noinline int add_ra_bio_pages(struct inode *inode,
        end_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;

        while (last_offset < compressed_end) {
-               page_index = last_offset >> PAGE_CACHE_SHIFT;
+               pg_index = last_offset >> PAGE_CACHE_SHIFT;

-               if (page_index > end_index)
+               if (pg_index > end_index)
                        break;

                rcu_read_lock();
-               page = radix_tree_lookup(&mapping->page_tree, page_index);
+               page = radix_tree_lookup(&mapping->page_tree, pg_index);
                rcu_read_unlock();
                if (page) {
                        misses++;
@@ -481,7 +477,7 @@ static noinline int add_ra_bio_pages(struct inode *inode,
                if (!page)
                        break;

-               if (add_to_page_cache_lru(page, mapping, page_index,
+               if (add_to_page_cache_lru(page, mapping, pg_index,
                                          GFP_NOFS)) {
                        page_cache_release(page);
                        goto next;
@@ -565,7 +561,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
        unsigned long uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE;
        unsigned long compressed_len;
        unsigned long nr_pages;
-       unsigned long page_index;
+       unsigned long pg_index;
        struct page *page;
        struct block_device *bdev;
        struct bio *comp_bio;
@@ -573,7 +569,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
        u64 em_len;
        u64 em_start;
        struct extent_map *em;
-       int ret;
+       int ret = -ENOMEM;
        u32 *sums;

        tree = &BTRFS_I(inode)->io_tree;
@@ -588,6 +584,9 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,

        compressed_len = em->block_len;
        cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS);
+       if (!cb)
+               goto out;
+
        atomic_set(&cb->pending_bios, 0);
        cb->errors = 0;
        cb->inode = inode;
@@ -603,17 +602,23 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,

        cb->len = uncompressed_len;
        cb->compressed_len = compressed_len;
+       cb->compress_type = extent_compress_type(bio_flags);
        cb->orig_bio = bio;

        nr_pages = (compressed_len + PAGE_CACHE_SIZE - 1) /
                                 PAGE_CACHE_SIZE;
-       cb->compressed_pages = kmalloc(sizeof(struct page *) * nr_pages,
+       cb->compressed_pages = kzalloc(sizeof(struct page *) * nr_pages,
                                       GFP_NOFS);
+       if (!cb->compressed_pages)
+               goto fail1;
+
        bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;

-       for (page_index = 0; page_index < nr_pages; page_index++) {
-               cb->compressed_pages[page_index] = alloc_page(GFP_NOFS |
+       for (pg_index = 0; pg_index < nr_pages; pg_index++) {
+               cb->compressed_pages[pg_index] = alloc_page(GFP_NOFS |
                                                              __GFP_HIGHMEM);
+               if (!cb->compressed_pages[pg_index])
+                       goto fail2;
        }
        cb->nr_pages = nr_pages;

@@ -624,12 +629,14 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
        cb->len = uncompressed_len;

        comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS);
+       if (!comp_bio)
+               goto fail2;
        comp_bio->bi_private = cb;
        comp_bio->bi_end_io = end_compressed_bio_read;
        atomic_inc(&cb->pending_bios);

-       for (page_index = 0; page_index < nr_pages; page_index++) {
-               page = cb->compressed_pages[page_index];
+       for (pg_index = 0; pg_index < nr_pages; pg_index++) {
+               page = cb->compressed_pages[pg_index];
                page->mapping = inode->i_mapping;
                page->index = em_start >> PAGE_CACHE_SHIFT;

@@ -657,8 +664,9 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
                        atomic_inc(&cb->pending_bios);

                        if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
-                               btrfs_lookup_bio_sums(root, inode, comp_bio,
-                                                     sums);
+                               ret = btrfs_lookup_bio_sums(root, inode,
+                                                           comp_bio, sums);
+                               BUG_ON(ret);
                        }
                        sums += (comp_bio->bi_size + root->sectorsize - 1) /
                                root->sectorsize;
@@ -683,12 +691,339 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
        ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0);
        BUG_ON(ret);

-       if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM))
-               btrfs_lookup_bio_sums(root, inode, comp_bio, sums);
+       if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) {
+               ret = btrfs_lookup_bio_sums(root, inode, comp_bio, sums);
+               BUG_ON(ret);
+       }

        ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0);
        BUG_ON(ret);

        bio_put(comp_bio);
        return 0;
+
+fail2:
+       for (pg_index = 0; pg_index < nr_pages; pg_index++)
+               free_page((unsigned long)cb->compressed_pages[pg_index]);
+
+       kfree(cb->compressed_pages);
+fail1:
+       kfree(cb);
+out:
+       free_extent_map(em);
+       return ret;
+}
+
+static struct list_head comp_idle_workspace[BTRFS_COMPRESS_TYPES];
+static spinlock_t comp_workspace_lock[BTRFS_COMPRESS_TYPES];
+static int comp_num_workspace[BTRFS_COMPRESS_TYPES];
+static atomic_t comp_alloc_workspace[BTRFS_COMPRESS_TYPES];
+static wait_queue_head_t comp_workspace_wait[BTRFS_COMPRESS_TYPES];
+
+struct btrfs_compress_op *btrfs_compress_op[] = {
+       &btrfs_zlib_compress,
+       &btrfs_lzo_compress,
+};
+
+int __init btrfs_init_compress(void)
+{
+       int i;
+
+       for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) {
+               INIT_LIST_HEAD(&comp_idle_workspace[i]);
+               spin_lock_init(&comp_workspace_lock[i]);
+               atomic_set(&comp_alloc_workspace[i], 0);
+               init_waitqueue_head(&comp_workspace_wait[i]);
+       }
+       return 0;
+}
+
+/*
+ * this finds an available workspace or allocates a new one
+ * ERR_PTR is returned if things go bad.
+ */
+static struct list_head *find_workspace(int type)
+{
+       struct list_head *workspace;
+       int cpus = num_online_cpus();
+       int idx = type - 1;
+
+       struct list_head *idle_workspace        = &comp_idle_workspace[idx];
+       spinlock_t *workspace_lock              = &comp_workspace_lock[idx];
+       atomic_t *alloc_workspace               = &comp_alloc_workspace[idx];
+       wait_queue_head_t *workspace_wait       = &comp_workspace_wait[idx];
+       int *num_workspace                      = &comp_num_workspace[idx];
+again:
+       spin_lock(workspace_lock);
+       if (!list_empty(idle_workspace)) {
+               workspace = idle_workspace->next;
+               list_del(workspace);
+               (*num_workspace)--;
+               spin_unlock(workspace_lock);
+               return workspace;
+
+       }
+       if (atomic_read(alloc_workspace) > cpus) {
+               DEFINE_WAIT(wait);
+
+               spin_unlock(workspace_lock);
+               prepare_to_wait(workspace_wait, &wait, TASK_UNINTERRUPTIBLE);
+               if (atomic_read(alloc_workspace) > cpus && !*num_workspace)
+                       schedule();
+               finish_wait(workspace_wait, &wait);
+               goto again;
+       }
+       atomic_inc(alloc_workspace);
+       spin_unlock(workspace_lock);
+
+       workspace = btrfs_compress_op[idx]->alloc_workspace();
+       if (IS_ERR(workspace)) {
+               atomic_dec(alloc_workspace);
+               wake_up(workspace_wait);
+       }
+       return workspace;
+}
+
+/*
+ * put a workspace struct back on the list or free it if we have enough
+ * idle ones sitting around
+ */
+static void free_workspace(int type, struct list_head *workspace)
+{
+       int idx = type - 1;
+       struct list_head *idle_workspace        = &comp_idle_workspace[idx];
+       spinlock_t *workspace_lock              = &comp_workspace_lock[idx];
+       atomic_t *alloc_workspace               = &comp_alloc_workspace[idx];
+       wait_queue_head_t *workspace_wait       = &comp_workspace_wait[idx];
+       int *num_workspace                      = &comp_num_workspace[idx];
+
+       spin_lock(workspace_lock);
+       if (*num_workspace < num_online_cpus()) {
+               list_add_tail(workspace, idle_workspace);
+               (*num_workspace)++;
+               spin_unlock(workspace_lock);
+               goto wake;
+       }
+       spin_unlock(workspace_lock);
+
+       btrfs_compress_op[idx]->free_workspace(workspace);
+       atomic_dec(alloc_workspace);
+wake:
+       if (waitqueue_active(workspace_wait))
+               wake_up(workspace_wait);
+}
+
+/*
+ * cleanup function for module exit
+ */
+static void free_workspaces(void)
+{
+       struct list_head *workspace;
+       int i;
+
+       for (i = 0; i < BTRFS_COMPRESS_TYPES; i++) {
+               while (!list_empty(&comp_idle_workspace[i])) {
+                       workspace = comp_idle_workspace[i].next;
+                       list_del(workspace);
+                       btrfs_compress_op[i]->free_workspace(workspace);
+                       atomic_dec(&comp_alloc_workspace[i]);
+               }
+       }
+}
+
+/*
+ * given an address space and start/len, compress the bytes.
+ *
+ * pages are allocated to hold the compressed result and stored
+ * in 'pages'
+ *
+ * out_pages is used to return the number of pages allocated. There
+ * may be pages allocated even if we return an error
+ *
+ * total_in is used to return the number of bytes actually read. It
+ * may be smaller then len if we had to exit early because we
+ * ran out of room in the pages array or because we cross the
+ * max_out threshold.
+ *
+ * total_out is used to return the total number of compressed bytes
+ *
+ * max_out tells us the max number of bytes that we're allowed to
+ * stuff into pages
+ */
+int btrfs_compress_pages(int type, struct address_space *mapping,
+                        u64 start, unsigned long len,
+                        struct page **pages,
+                        unsigned long nr_dest_pages,
+                        unsigned long *out_pages,
+                        unsigned long *total_in,
+                        unsigned long *total_out,
+                        unsigned long max_out)
+{
+       struct list_head *workspace;
+       int ret;
+
+       workspace = find_workspace(type);
+       if (IS_ERR(workspace))
+               return -1;
+
+       ret = btrfs_compress_op[type-1]->compress_pages(workspace, mapping,
+                                                     start, len, pages,
+                                                     nr_dest_pages, out_pages,
+                                                     total_in, total_out,
+                                                     max_out);
+       free_workspace(type, workspace);
+       return ret;
+}
+
+/*
+ * pages_in is an array of pages with compressed data.
+ *
+ * disk_start is the starting logical offset of this array in the file
+ *
+ * bvec is a bio_vec of pages from the file that we want to decompress into
+ *
+ * vcnt is the count of pages in the biovec
+ *
+ * srclen is the number of bytes in pages_in
+ *
+ * The basic idea is that we have a bio that was created by readpages.
+ * The pages in the bio are for the uncompressed data, and they may not
+ * be contiguous. They all correspond to the range of bytes covered by
+ * the compressed extent.
+ */
+int btrfs_decompress_biovec(int type, struct page **pages_in, u64 disk_start,
+                           struct bio_vec *bvec, int vcnt, size_t srclen)
+{
+       struct list_head *workspace;
+       int ret;
+
+       workspace = find_workspace(type);
+       if (IS_ERR(workspace))
+               return -ENOMEM;
+
+       ret = btrfs_compress_op[type-1]->decompress_biovec(workspace, pages_in,
+                                                        disk_start,
+                                                        bvec, vcnt, srclen);
+       free_workspace(type, workspace);
+       return ret;
+}
+
+/*
+ * a less complex decompression routine. Our compressed data fits in a
+ * single page, and we want to read a single page out of it.
+ * start_byte tells us the offset into the compressed data we're interested in
+ */
+int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
+                    unsigned long start_byte, size_t srclen, size_t destlen)
+{
+       struct list_head *workspace;
+       int ret;
+
+       workspace = find_workspace(type);
+       if (IS_ERR(workspace))
+               return -ENOMEM;
+
+       ret = btrfs_compress_op[type-1]->decompress(workspace, data_in,
+                                                 dest_page, start_byte,
+                                                 srclen, destlen);
+
+       free_workspace(type, workspace);
+       return ret;
+}
+
+void btrfs_exit_compress(void)
+{
+       free_workspaces();
+}
+
+/*
+ * Copy uncompressed data from working buffer to pages.
+ *
+ * buf_start is the byte offset we're of the start of our workspace buffer.
+ *
+ * total_out is the last byte of the buffer
+ */
+int btrfs_decompress_buf2page(char *buf, unsigned long buf_start,
+                             unsigned long total_out, u64 disk_start,
+                             struct bio_vec *bvec, int vcnt,
+                             unsigned long *pg_index,
+                             unsigned long *pg_offset)
+{
+       unsigned long buf_offset;
+       unsigned long current_buf_start;
+       unsigned long start_byte;
+       unsigned long working_bytes = total_out - buf_start;
+       unsigned long bytes;
+       char *kaddr;
+       struct page *page_out = bvec[*pg_index].bv_page;
+
+       /*
+        * start byte is the first byte of the page we're currently
+        * copying into relative to the start of the compressed data.
+        */
+       start_byte = page_offset(page_out) - disk_start;
+
+       /* we haven't yet hit data corresponding to this page */
+       if (total_out <= start_byte)
+               return 1;
+
+       /*
+        * the start of the data we care about is offset into
+        * the middle of our working buffer
+        */
+       if (total_out > start_byte && buf_start < start_byte) {
+               buf_offset = start_byte - buf_start;
+               working_bytes -= buf_offset;
+       } else {
+               buf_offset = 0;
+       }
+       current_buf_start = buf_start;
+
+       /* copy bytes from the working buffer into the pages */
+       while (working_bytes > 0) {
+               bytes = min(PAGE_CACHE_SIZE - *pg_offset,
+                           PAGE_CACHE_SIZE - buf_offset);
+               bytes = min(bytes, working_bytes);
+               kaddr = kmap_atomic(page_out, KM_USER0);
+               memcpy(kaddr + *pg_offset, buf + buf_offset, bytes);
+               kunmap_atomic(kaddr, KM_USER0);
+               flush_dcache_page(page_out);
+
+               *pg_offset += bytes;
+               buf_offset += bytes;
+               working_bytes -= bytes;
+               current_buf_start += bytes;
+
+               /* check if we need to pick another page */
+               if (*pg_offset == PAGE_CACHE_SIZE) {
+                       (*pg_index)++;
+                       if (*pg_index >= vcnt)
+                               return 0;
+
+                       page_out = bvec[*pg_index].bv_page;
+                       *pg_offset = 0;
+                       start_byte = page_offset(page_out) - disk_start;
+
+                       /*
+                        * make sure our new page is covered by this
+                        * working buffer
+                        */
+                       if (total_out <= start_byte)
+                               return 1;
+
+                       /*
+                        * the next page in the biovec might not be adjacent
+                        * to the last page, but it might still be found
+                        * inside this working buffer. bump our offset pointer
+                        */
+                       if (total_out > start_byte &&
+                           current_buf_start < start_byte) {
+                               buf_offset = start_byte - buf_start;
+                               working_bytes = total_out - start_byte;
+                               current_buf_start = buf_start + buf_offset;
+                       }
+               }
+       }
+
+       return 1;
 }
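
The bulk of the new compression.c code is a small pool allocator: one idle
list, lock, allocation counter and wait queue per compression type, with at
most num_online_cpus() workspaces ever live for each type. find_workspace()
hands out an idle workspace or allocates a fresh one through the per-type
ops table, and callers past the CPU cap sleep until free_workspace() either
re-idles a workspace or frees it and wakes a waiter. Consumers never touch
the pool directly; they go through the type-dispatched wrappers, roughly
like this (a hedged sketch, with the destination array sized arbitrarily):

        struct page *dest[16];  /* assumed cap; real callers size this */
        unsigned long out_pages = 0, total_in = 0, total_out = 0;
        int ret;

        ret = btrfs_compress_pages(BTRFS_COMPRESS_ZLIB, inode->i_mapping,
                                   start, len, dest, 16, &out_pages,
                                   &total_in, &total_out,
                                   16 * PAGE_CACHE_SIZE);
        /* even on failure, out_pages pages may have been allocated
         * and still need to be freed by the caller */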
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 421f5b4aa715..a12059f4f0fd 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -19,24 +19,27 @@
 #ifndef __BTRFS_COMPRESSION_
 #define __BTRFS_COMPRESSION_

-int btrfs_zlib_decompress(unsigned char *data_in,
-                         struct page *dest_page,
-                         unsigned long start_byte,
-                         size_t srclen, size_t destlen);
-int btrfs_zlib_compress_pages(struct address_space *mapping,
-                             u64 start, unsigned long len,
-                             struct page **pages,
-                             unsigned long nr_dest_pages,
-                             unsigned long *out_pages,
-                             unsigned long *total_in,
-                             unsigned long *total_out,
-                             unsigned long max_out);
-int btrfs_zlib_decompress_biovec(struct page **pages_in,
-                                u64 disk_start,
-                                struct bio_vec *bvec,
-                                int vcnt,
-                                size_t srclen);
-void btrfs_zlib_exit(void);
+int btrfs_init_compress(void);
+void btrfs_exit_compress(void);
+
+int btrfs_compress_pages(int type, struct address_space *mapping,
+                        u64 start, unsigned long len,
+                        struct page **pages,
+                        unsigned long nr_dest_pages,
+                        unsigned long *out_pages,
+                        unsigned long *total_in,
+                        unsigned long *total_out,
+                        unsigned long max_out);
+int btrfs_decompress_biovec(int type, struct page **pages_in, u64 disk_start,
+                           struct bio_vec *bvec, int vcnt, size_t srclen);
+int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
+                    unsigned long start_byte, size_t srclen, size_t destlen);
+int btrfs_decompress_buf2page(char *buf, unsigned long buf_start,
+                             unsigned long total_out, u64 disk_start,
+                             struct bio_vec *bvec, int vcnt,
+                             unsigned long *pg_index,
+                             unsigned long *pg_offset);
+
 int btrfs_submit_compressed_write(struct inode *inode, u64 start,
                                  unsigned long len, u64 disk_start,
                                  unsigned long compressed_len,
@@ -44,4 +47,37 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start,
                                  unsigned long nr_pages);
 int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
                                 int mirror_num, unsigned long bio_flags);
+
+struct btrfs_compress_op {
+       struct list_head *(*alloc_workspace)(void);
+
+       void (*free_workspace)(struct list_head *workspace);
+
+       int (*compress_pages)(struct list_head *workspace,
+                             struct address_space *mapping,
+                             u64 start, unsigned long len,
+                             struct page **pages,
+                             unsigned long nr_dest_pages,
+                             unsigned long *out_pages,
+                             unsigned long *total_in,
+                             unsigned long *total_out,
+                             unsigned long max_out);
+
+       int (*decompress_biovec)(struct list_head *workspace,
+                                struct page **pages_in,
+                                u64 disk_start,
+                                struct bio_vec *bvec,
+                                int vcnt,
+                                size_t srclen);
+
+       int (*decompress)(struct list_head *workspace,
+                         unsigned char *data_in,
+                         struct page *dest_page,
+                         unsigned long start_byte,
+                         size_t srclen, size_t destlen);
+};
+
+extern struct btrfs_compress_op btrfs_zlib_compress;
+extern struct btrfs_compress_op btrfs_lzo_compress;
+
 #endif
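
compression.h now treats compression as a small strategy interface: each
backend exports a btrfs_compress_op with its own workspace lifecycle plus
the three data operations, and compression.c dispatches through
btrfs_compress_op[type - 1]. A hypothetical third backend would only need
to fill in the same slots; nothing below exists in this patch, it is purely
illustrative:

        /* hypothetical backend, for illustration only */
        static struct list_head *dummy_alloc_workspace(void)
        {
                struct list_head *ws = kzalloc(sizeof(*ws), GFP_NOFS);

                if (!ws)
                        return ERR_PTR(-ENOMEM);
                INIT_LIST_HEAD(ws);
                return ws;
        }

        static void dummy_free_workspace(struct list_head *ws)
        {
                kfree(ws);
        }

        struct btrfs_compress_op btrfs_dummy_compress = {
                .alloc_workspace        = dummy_alloc_workspace,
                .free_workspace         = dummy_free_workspace,
                /* .compress_pages, .decompress_biovec and .decompress
                 * follow the prototypes declared in the struct above */
        };

The real backends (zlib.c and the new lzo.c) embed the list_head at the top
of a larger workspace struct that carries their scratch buffers, so the
pool code can shuffle opaque list heads while each backend recovers its own
state with container_of().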
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index c3df14ce2cc2..2e667868e0d2 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -38,18 +38,11 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
                              struct extent_buffer *src_buf);
 static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                   struct btrfs_path *path, int level, int slot);
-static int setup_items_for_insert(struct btrfs_trans_handle *trans,
-                       struct btrfs_root *root, struct btrfs_path *path,
-                       struct btrfs_key *cpu_key, u32 *data_size,
-                       u32 total_data, u32 total_size, int nr);
-

 struct btrfs_path *btrfs_alloc_path(void)
 {
        struct btrfs_path *path;
        path = kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
-       if (path)
-               path->reada = 1;
        return path;
 }

@@ -105,7 +98,9 @@ noinline void btrfs_clear_path_blocking(struct btrfs_path *p,
 /* this also releases the path */
 void btrfs_free_path(struct btrfs_path *p)
 {
-       btrfs_release_path(NULL, p);
+       if (!p)
+               return;
+       btrfs_release_path(p);
        kmem_cache_free(btrfs_path_cachep, p);
 }

@@ -115,7 +110,7 @@ void btrfs_free_path(struct btrfs_path *p)
  *
  * It is safe to call this on paths that no locks or extent buffers held.
  */
-noinline void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
+noinline void btrfs_release_path(struct btrfs_path *p)
 {
        int i;

@@ -145,10 +140,11 @@ noinline void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p)
 struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
 {
        struct extent_buffer *eb;
-       spin_lock(&root->node_lock);
-       eb = root->node;
+
+       rcu_read_lock();
+       eb = rcu_dereference(root->node);
        extent_buffer_get(eb);
-       spin_unlock(&root->node_lock);
+       rcu_read_unlock();
        return eb;
 }

@@ -163,14 +159,8 @@ struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
        while (1) {
                eb = btrfs_root_node(root);
                btrfs_tree_lock(eb);
-
-               spin_lock(&root->node_lock);
-               if (eb == root->node) {
-                       spin_unlock(&root->node_lock);
+               if (eb == root->node)
                        break;
-               }
-               spin_unlock(&root->node_lock);
-
                btrfs_tree_unlock(eb);
                free_extent_buffer(eb);
        }
@@ -200,7 +190,6 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
                      struct extent_buffer **cow_ret, u64 new_root_objectid)
 {
        struct extent_buffer *cow;
-       u32 nritems;
        int ret = 0;
        int level;
        struct btrfs_disk_key disk_key;
@@ -210,7 +199,6 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
        WARN_ON(root->ref_cows && trans->transid != root->last_trans);

        level = btrfs_header_level(buf);
-       nritems = btrfs_header_nritems(buf);
        if (level == 0)
                btrfs_item_key(buf, &disk_key, 0);
        else
@@ -458,10 +446,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
        else
                parent_start = 0;

-       spin_lock(&root->node_lock);
-       root->node = cow;
        extent_buffer_get(cow);
-       spin_unlock(&root->node_lock);
+       rcu_assign_pointer(root->node, cow);

        btrfs_free_tree_block(trans, root, buf, parent_start,
                              last_ref);
@@ -542,6 +528,9 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,

        ret = __btrfs_cow_block(trans, root, buf, parent,
                                 parent_slot, cow_ret, search_start, 0);
+
+       trace_btrfs_cow_block(root, buf, *cow_ret);
+
        return ret;
 }

@@ -686,6 +675,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
                if (!cur) {
                        cur = read_tree_block(root, blocknr,
                                              blocksize, gen);
+                       if (!cur)
+                               return -EIO;
                } else if (!uptodate) {
                        btrfs_read_buffer(cur, gen);
                }
@@ -732,122 +723,6 @@ static inline unsigned int leaf_data_end(struct btrfs_root *root,
        return btrfs_item_offset_nr(leaf, nr - 1);
 }

-/*
- * extra debugging checks to make sure all the items in a key are
- * well formed and in the proper order
- */
-static int check_node(struct btrfs_root *root, struct btrfs_path *path,
-                     int level)
-{
-       struct extent_buffer *parent = NULL;
-       struct extent_buffer *node = path->nodes[level];
-       struct btrfs_disk_key parent_key;
-       struct btrfs_disk_key node_key;
-       int parent_slot;
-       int slot;
-       struct btrfs_key cpukey;
-       u32 nritems = btrfs_header_nritems(node);
-
-       if (path->nodes[level + 1])
-               parent = path->nodes[level + 1];
-
-       slot = path->slots[level];
-       BUG_ON(nritems == 0);
-       if (parent) {
-               parent_slot = path->slots[level + 1];
-               btrfs_node_key(parent, &parent_key, parent_slot);
-               btrfs_node_key(node, &node_key, 0);
-               BUG_ON(memcmp(&parent_key, &node_key,
-                             sizeof(struct btrfs_disk_key)));
-               BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
-                      btrfs_header_bytenr(node));
-       }
-       BUG_ON(nritems > BTRFS_NODEPTRS_PER_BLOCK(root));
-       if (slot != 0) {
-               btrfs_node_key_to_cpu(node, &cpukey, slot - 1);
-               btrfs_node_key(node, &node_key, slot);
-               BUG_ON(comp_keys(&node_key, &cpukey) <= 0);
-       }
-       if (slot < nritems - 1) {
-               btrfs_node_key_to_cpu(node, &cpukey, slot + 1);
-               btrfs_node_key(node, &node_key, slot);
-               BUG_ON(comp_keys(&node_key, &cpukey) >= 0);
-       }
-       return 0;
-}
-
-/*
- * extra checking to make sure all the items in a leaf are
- * well formed and in the proper order
- */
-static int check_leaf(struct btrfs_root *root, struct btrfs_path *path,
-                     int level)
-{
-       struct extent_buffer *leaf = path->nodes[level];
-       struct extent_buffer *parent = NULL;
-       int parent_slot;
-       struct btrfs_key cpukey;
-       struct btrfs_disk_key parent_key;
-       struct btrfs_disk_key leaf_key;
-       int slot = path->slots[0];
-
-       u32 nritems = btrfs_header_nritems(leaf);
-
-       if (path->nodes[level + 1])
-               parent = path->nodes[level + 1];
-
-       if (nritems == 0)
-               return 0;
-
-       if (parent) {
-               parent_slot = path->slots[level + 1];
-               btrfs_node_key(parent, &parent_key, parent_slot);
-               btrfs_item_key(leaf, &leaf_key, 0);
-
-               BUG_ON(memcmp(&parent_key, &leaf_key,
-                      sizeof(struct btrfs_disk_key)));
-               BUG_ON(btrfs_node_blockptr(parent, parent_slot) !=
-                      btrfs_header_bytenr(leaf));
-       }
-       if (slot != 0 && slot < nritems - 1) {
-               btrfs_item_key(leaf, &leaf_key, slot);
-               btrfs_item_key_to_cpu(leaf, &cpukey, slot - 1);
-               if (comp_keys(&leaf_key, &cpukey) <= 0) {
-                       btrfs_print_leaf(root, leaf);
-                       printk(KERN_CRIT "slot %d offset bad key\n", slot);
-                       BUG_ON(1);
-               }
-               if (btrfs_item_offset_nr(leaf, slot - 1) !=
-                      btrfs_item_end_nr(leaf, slot)) {
-                       btrfs_print_leaf(root, leaf);
-                       printk(KERN_CRIT "slot %d offset bad\n", slot);
-                       BUG_ON(1);
-               }
-       }
-       if (slot < nritems - 1) {
-               btrfs_item_key(leaf, &leaf_key, slot);
-               btrfs_item_key_to_cpu(leaf, &cpukey, slot + 1);
-               BUG_ON(comp_keys(&leaf_key, &cpukey) >= 0);
-               if (btrfs_item_offset_nr(leaf, slot) !=
-                       btrfs_item_end_nr(leaf, slot + 1)) {
-                       btrfs_print_leaf(root, leaf);
-                       printk(KERN_CRIT "slot %d offset bad\n", slot);
-                       BUG_ON(1);
-               }
-       }
-       BUG_ON(btrfs_item_offset_nr(leaf, 0) +
-              btrfs_item_size_nr(leaf, 0) != BTRFS_LEAF_DATA_SIZE(root));
-       return 0;
-}
-
-static noinline int check_block(struct btrfs_root *root,
-                               struct btrfs_path *path, int level)
-{
-       return 0;
-       if (level == 0)
-               return check_leaf(root, path, level);
-       return check_node(root, path, level);
-}

 /*
  * search for key in the extent_buffer. The items start at offset p,
@@ -1008,7 +883,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
        int wret;
        int pslot;
        int orig_slot = path->slots[level];
-       int err_on_enospc = 0;
        u64 orig_ptr;

        if (level == 0)
@@ -1047,9 +921,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
                        goto enospc;
                }

-               spin_lock(&root->node_lock);
-               root->node = child;
-               spin_unlock(&root->node_lock);
+               rcu_assign_pointer(root->node, child);

                add_root_to_dirty_list(root);
                btrfs_tree_unlock(child);
@@ -1071,8 +943,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
            BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
                return 0;

-       if (btrfs_header_nritems(mid) < 2)
-               err_on_enospc = 1;
+       btrfs_header_nritems(mid);

        left = read_node_slot(root, parent, pslot - 1);
        if (left) {
@@ -1103,8 +974,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
                wret = push_node_left(trans, root, left, mid, 1);
                if (wret < 0)
                        ret = wret;
-               if (btrfs_header_nritems(mid) < 2)
-                       err_on_enospc = 1;
+               btrfs_header_nritems(mid);
        }

        /*
@@ -1191,7 +1061,6 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
                }
        }
        /* double check we haven't messed things up */
-       check_block(root, path, level);
        if (orig_ptr !=
            btrfs_node_blockptr(path->nodes[level], path->slots[level]))
                BUG();
@@ -1224,14 +1093,12 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
        int wret;
        int pslot;
        int orig_slot = path->slots[level];
-       u64 orig_ptr;

        if (level == 0)
                return 1;

        mid = path->nodes[level];
        WARN_ON(btrfs_header_generation(mid) != trans->transid);
-       orig_ptr = btrfs_node_blockptr(mid, orig_slot);

        if (level < BTRFS_MAX_LEVEL - 1)
                parent = path->nodes[level + 1];
@@ -1355,11 +1222,13 @@ static void reada_for_search(struct btrfs_root *root,
        u64 search;
        u64 target;
        u64 nread = 0;
+       u64 gen;
        int direction = path->reada;
        struct extent_buffer *eb;
        u32 nr;
        u32 blocksize;
        u32 nscan = 0;
+       bool map = true;

        if (level != 1)
                return;
@@ -1381,7 +1250,19 @@ static void reada_for_search(struct btrfs_root *root,

        nritems = btrfs_header_nritems(node);
        nr = slot;
+       if (node->map_token || path->skip_locking)
+               map = false;
+
        while (1) {
+               if (map && !node->map_token) {
+                       unsigned long offset = btrfs_node_key_ptr_offset(nr);
+                       map_private_extent_buffer(node, offset,
+                                                 sizeof(struct btrfs_key_ptr),
+                                                 &node->map_token,
+                                                 &node->kaddr,
+                                                 &node->map_start,
+                                                 &node->map_len, KM_USER1);
+               }
                if (direction < 0) {
                        if (nr == 0)
                                break;
@@ -1399,14 +1280,23 @@ static void reada_for_search(struct btrfs_root *root,
                search = btrfs_node_blockptr(node, nr);
                if ((search <= target && target - search <= 65536) ||
                    (search > target && search - target <= 65536)) {
-                       readahead_tree_block(root, search, blocksize,
-                                            btrfs_node_ptr_generation(node, nr));
+                       gen = btrfs_node_ptr_generation(node, nr);
+                       if (map && node->map_token) {
+                               unmap_extent_buffer(node, node->map_token,
+                                                   KM_USER1);
+                               node->map_token = NULL;
+                       }
+                       readahead_tree_block(root, search, blocksize, gen);
                        nread += blocksize;
                }
                nscan++;
                if ((nread > 65536 || nscan > 32))
                        break;
        }
+       if (map && node->map_token) {
+               unmap_extent_buffer(node, node->map_token, KM_USER1);
+               node->map_token = NULL;
+       }
 }

 /*
@@ -1454,7 +1344,7 @@ static noinline int reada_for_balance(struct btrfs_root *root,
                ret = -EAGAIN;

        /* release the whole path */
-       btrfs_release_path(root, path);
+       btrfs_release_path(path);

        /* read the blocks */
        if (block1)
@@ -1577,13 +1467,33 @@ read_block_for_search(struct btrfs_trans_handle *trans,
        blocksize = btrfs_level_size(root, level - 1);

        tmp = btrfs_find_tree_block(root, blocknr, blocksize);
-       if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
-               /*
-                * we found an up to date block without sleeping, return
-                * right away
-                */
-               *eb_ret = tmp;
-               return 0;
+       if (tmp) {
+               if (btrfs_buffer_uptodate(tmp, 0)) {
+                       if (btrfs_buffer_uptodate(tmp, gen)) {
+                               /*
+                                * we found an up to date block without
+                                * sleeping, return
+                                * right away
+                                */
+                               *eb_ret = tmp;
+                               return 0;
+                       }
+                       /* the pages were up to date, but we failed
+                        * the generation number check. Do a full
+                        * read for the generation number that is correct.
+                        * We must do this without dropping locks so
+                        * we can trust our generation number
+                        */
+                       free_extent_buffer(tmp);
+                       tmp = read_tree_block(root, blocknr, blocksize, gen);
+                       if (tmp && btrfs_buffer_uptodate(tmp, gen)) {
+                               *eb_ret = tmp;
+                               return 0;
+                       }
+                       free_extent_buffer(tmp);
+                       btrfs_release_path(p);
+                       return -EIO;
+               }
        }

        /*
@@ -1596,12 +1506,11 @@ read_block_for_search(struct btrfs_trans_handle *trans,
        btrfs_unlock_up_safe(p, level + 1);
        btrfs_set_path_blocking(p);

-       if (tmp)
-               free_extent_buffer(tmp);
+       free_extent_buffer(tmp);
        if (p->reada)
                reada_for_search(root, p, level, slot, key->objectid);

-       btrfs_release_path(NULL, p);
+       btrfs_release_path(p);

        ret = -EAGAIN;
        tmp = read_tree_block(root, blocknr, blocksize, 0);
@@ -1670,7 +1579,7 @@ setup_nodes_for_search(struct btrfs_trans_handle *trans,
        }
        b = p->nodes[level];
        if (!b) {
-               btrfs_release_path(NULL, p);
+               btrfs_release_path(p);
                goto again;
        }
        BUG_ON(btrfs_header_nritems(b) == 1);
@@ -1760,9 +1669,6 @@ again:
        }
 cow_done:
        BUG_ON(!cow && ins_len);
-       if (level != btrfs_header_level(b))
-               WARN_ON(1);
-       level = btrfs_header_level(b);

        p->nodes[level] = b;
        if (!p->skip_locking)
@@ -1784,12 +1690,6 @@ cow_done:
                if (!cow)
                        btrfs_unlock_up_safe(p, level + 1);

-               ret = check_block(root, p, level);
-               if (ret) {
-                       ret = -1;
-                       goto done;
-               }
-
                ret = bin_search(b, key, level, &slot);

                if (level != 0) {
@@ -1866,7 +1766,7 @@ done:
        if (!p->leave_spinning)
                btrfs_set_path_blocking(p);
        if (ret < 0)
-               btrfs_release_path(root, p);
+               btrfs_release_path(p);
        return ret;
 }

@@ -2116,10 +2016,8 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,

        btrfs_mark_buffer_dirty(c);

-       spin_lock(&root->node_lock);
        old = root->node;
-       root->node = c;
-       spin_unlock(&root->node_lock);
+       rcu_assign_pointer(root->node, c);

        /* the super has an extra ref to root->node */
        free_extent_buffer(old);
@@ -2502,6 +2400,9 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
        btrfs_assert_tree_locked(path->nodes[1]);

        right = read_node_slot(root, upper, slot + 1);
+       if (right == NULL)
+               return 1;
+
        btrfs_tree_lock(right);
        btrfs_set_lock_blocking(right);

@@ -2548,7 +2449,6 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
 {
        struct btrfs_disk_key disk_key;
        struct extent_buffer *right = path->nodes[0];
-       int slot;
        int i;
        int push_space = 0;
        int push_items = 0;
@@ -2560,8 +2460,6 @@ static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
        u32 this_item_size;
        u32 old_left_item_size;

-       slot = path->slots[1];
-
        if (empty)
                nr = min(right_nritems, max_slot);
        else
@@ -2755,6 +2653,9 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
        btrfs_assert_tree_locked(path->nodes[1]);

        left = read_node_slot(root, path->nodes[1], slot - 1);
+       if (left == NULL)
+               return 1;
+
        btrfs_tree_lock(left);
        btrfs_set_lock_blocking(left);

@@ -3138,7 +3039,7 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
                                    struct btrfs_file_extent_item);
                extent_len = btrfs_file_extent_num_bytes(leaf, fi);
        }
-       btrfs_release_path(root, path);
+       btrfs_release_path(path);

        path->keep_locks = 1;
        path->search_for_split = 1;
@@ -3328,9 +3229,7 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans,
                        struct btrfs_path *path,
                        u32 new_size, int from_end)
 {
-       int ret = 0;
        int slot;
-       int slot_orig;
        struct extent_buffer *leaf;
        struct btrfs_item *item;
        u32 nritems;
@@ -3340,7 +3239,6 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans,
        unsigned int size_diff;
        int i;

-       slot_orig = path->slots[0];
        leaf = path->nodes[0];
        slot = path->slots[0];

@@ -3428,12 +3326,11 @@ int btrfs_truncate_item(struct btrfs_trans_handle *trans,
        btrfs_set_item_size(leaf, item, new_size);
        btrfs_mark_buffer_dirty(leaf);

-       ret = 0;
        if (btrfs_leaf_free_space(root, leaf) < 0) {
                btrfs_print_leaf(root, leaf);
                BUG();
        }
-       return ret;
+       return 0;
 }

 /*
@@ -3443,9 +3340,7 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans,
                      struct btrfs_root *root, struct btrfs_path *path,
                      u32 data_size)
 {
-       int ret = 0;
        int slot;
-       int slot_orig;
        struct extent_buffer *leaf;
        struct btrfs_item *item;
        u32 nritems;
@@ -3454,7 +3349,6 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans,
        unsigned int old_size;
        int i;

-       slot_orig = path->slots[0];
        leaf = path->nodes[0];

        nritems = btrfs_header_nritems(leaf);
@@ -3510,12 +3404,11 @@ int btrfs_extend_item(struct btrfs_trans_handle *trans,
        btrfs_set_item_size(leaf, item, old_size + data_size);
        btrfs_mark_buffer_dirty(leaf);

-       ret = 0;
        if (btrfs_leaf_free_space(root, leaf) < 0) {
                btrfs_print_leaf(root, leaf);
                BUG();
        }
-       return ret;
+       return 0;
 }

 /*
@@ -3675,11 +3568,10 @@ out:
  * to save stack depth by doing the bulk of the work in a function
  * that doesn't call btrfs_search_slot
  */
-static noinline_for_stack int
-setup_items_for_insert(struct btrfs_trans_handle *trans,
-                      struct btrfs_root *root, struct btrfs_path *path,
-                      struct btrfs_key *cpu_key, u32 *data_size,
-                      u32 total_data, u32 total_size, int nr)
+int setup_items_for_insert(struct btrfs_trans_handle *trans,
+                          struct btrfs_root *root, struct btrfs_path *path,
+                          struct btrfs_key *cpu_key, u32 *data_size,
+                          u32 total_data, u32 total_size, int nr)
 {
        struct btrfs_item *item;
        int i;
@@ -3763,7 +3655,6 @@ setup_items_for_insert(struct btrfs_trans_handle *trans,

        ret = 0;
        if (slot == 0) {
-               struct btrfs_disk_key disk_key;
                btrfs_cpu_key_to_disk(&disk_key, cpu_key);
                ret = fixup_low_keys(trans, root, path, &disk_key, 1);
        }
@@ -3787,7 +3678,6 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
                             struct btrfs_key *cpu_key, u32 *data_size,
3788 int nr) 3679 int nr)
3789{ 3680{
3790 struct extent_buffer *leaf;
3791 int ret = 0; 3681 int ret = 0;
3792 int slot; 3682 int slot;
3793 int i; 3683 int i;
@@ -3804,7 +3694,6 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
3804 if (ret < 0) 3694 if (ret < 0)
3805 goto out; 3695 goto out;
3806 3696
3807 leaf = path->nodes[0];
3808 slot = path->slots[0]; 3697 slot = path->slots[0];
3809 BUG_ON(slot < 0); 3698 BUG_ON(slot < 0);
3810 3699
@@ -3829,7 +3718,8 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
3829 unsigned long ptr; 3718 unsigned long ptr;
3830 3719
3831 path = btrfs_alloc_path(); 3720 path = btrfs_alloc_path();
3832 BUG_ON(!path); 3721 if (!path)
3722 return -ENOMEM;
3833 ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size); 3723 ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
3834 if (!ret) { 3724 if (!ret) {
3835 leaf = path->nodes[0]; 3725 leaf = path->nodes[0];
@@ -4066,7 +3956,7 @@ int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
4066 else 3956 else
4067 return 1; 3957 return 1;
4068 3958
4069 btrfs_release_path(root, path); 3959 btrfs_release_path(path);
4070 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 3960 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4071 if (ret < 0) 3961 if (ret < 0)
4072 return ret; 3962 return ret;
@@ -4190,7 +4080,7 @@ find_next_key:
4190 sret = btrfs_find_next_key(root, path, min_key, level, 4080 sret = btrfs_find_next_key(root, path, min_key, level,
4191 cache_only, min_trans); 4081 cache_only, min_trans);
4192 if (sret == 0) { 4082 if (sret == 0) {
4193 btrfs_release_path(root, path); 4083 btrfs_release_path(path);
4194 goto again; 4084 goto again;
4195 } else { 4085 } else {
4196 goto out; 4086 goto out;
@@ -4206,6 +4096,7 @@ find_next_key:
4206 } 4096 }
4207 btrfs_set_path_blocking(path); 4097 btrfs_set_path_blocking(path);
4208 cur = read_node_slot(root, cur, slot); 4098 cur = read_node_slot(root, cur, slot);
4099 BUG_ON(!cur);
4209 4100
4210 btrfs_tree_lock(cur); 4101 btrfs_tree_lock(cur);
4211 4102
@@ -4268,7 +4159,7 @@ next:
4268 btrfs_node_key_to_cpu(c, &cur_key, slot); 4159 btrfs_node_key_to_cpu(c, &cur_key, slot);
4269 4160
4270 orig_lowest = path->lowest_level; 4161 orig_lowest = path->lowest_level;
4271 btrfs_release_path(root, path); 4162 btrfs_release_path(path);
4272 path->lowest_level = level; 4163 path->lowest_level = level;
4273 ret = btrfs_search_slot(NULL, root, &cur_key, path, 4164 ret = btrfs_search_slot(NULL, root, &cur_key, path,
4274 0, 0); 4165 0, 0);
@@ -4345,7 +4236,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
4345again: 4236again:
4346 level = 1; 4237 level = 1;
4347 next = NULL; 4238 next = NULL;
4348 btrfs_release_path(root, path); 4239 btrfs_release_path(path);
4349 4240
4350 path->keep_locks = 1; 4241 path->keep_locks = 1;
4351 4242
@@ -4401,7 +4292,7 @@ again:
4401 goto again; 4292 goto again;
4402 4293
4403 if (ret < 0) { 4294 if (ret < 0) {
4404 btrfs_release_path(root, path); 4295 btrfs_release_path(path);
4405 goto done; 4296 goto done;
4406 } 4297 }
4407 4298
@@ -4440,7 +4331,7 @@ again:
4440 goto again; 4331 goto again;
4441 4332
4442 if (ret < 0) { 4333 if (ret < 0) {
4443 btrfs_release_path(root, path); 4334 btrfs_release_path(path);
4444 goto done; 4335 goto done;
4445 } 4336 }
4446 4337
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index eaf286abad17..3b859a3e6a0e 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -19,18 +19,21 @@
19#ifndef __BTRFS_CTREE__ 19#ifndef __BTRFS_CTREE__
20#define __BTRFS_CTREE__ 20#define __BTRFS_CTREE__
21 21
22#include <linux/version.h>
23#include <linux/mm.h> 22#include <linux/mm.h>
24#include <linux/highmem.h> 23#include <linux/highmem.h>
25#include <linux/fs.h> 24#include <linux/fs.h>
25#include <linux/rwsem.h>
26#include <linux/completion.h> 26#include <linux/completion.h>
27#include <linux/backing-dev.h> 27#include <linux/backing-dev.h>
28#include <linux/wait.h> 28#include <linux/wait.h>
29#include <linux/slab.h> 29#include <linux/slab.h>
30#include <linux/kobject.h>
31#include <trace/events/btrfs.h>
30#include <asm/kmap_types.h> 32#include <asm/kmap_types.h>
31#include "extent_io.h" 33#include "extent_io.h"
32#include "extent_map.h" 34#include "extent_map.h"
33#include "async-thread.h" 35#include "async-thread.h"
36#include "ioctl.h"
34 37
35struct btrfs_trans_handle; 38struct btrfs_trans_handle;
36struct btrfs_transaction; 39struct btrfs_transaction;
@@ -39,6 +42,7 @@ extern struct kmem_cache *btrfs_trans_handle_cachep;
39extern struct kmem_cache *btrfs_transaction_cachep; 42extern struct kmem_cache *btrfs_transaction_cachep;
40extern struct kmem_cache *btrfs_bit_radix_cachep; 43extern struct kmem_cache *btrfs_bit_radix_cachep;
41extern struct kmem_cache *btrfs_path_cachep; 44extern struct kmem_cache *btrfs_path_cachep;
45extern struct kmem_cache *btrfs_free_space_cachep;
42struct btrfs_ordered_sum; 46struct btrfs_ordered_sum;
43 47
44#define BTRFS_MAGIC "_BHRfS_M" 48#define BTRFS_MAGIC "_BHRfS_M"
@@ -99,6 +103,15 @@ struct btrfs_ordered_sum;
99 */ 103 */
100#define BTRFS_EXTENT_CSUM_OBJECTID -10ULL 104#define BTRFS_EXTENT_CSUM_OBJECTID -10ULL
101 105
106/* For storing free space cache */
107#define BTRFS_FREE_SPACE_OBJECTID -11ULL
108
109/*
110 * The inode number assigned to the special inode for storing
111 * the free ino cache
112 */
113#define BTRFS_FREE_INO_OBJECTID -12ULL
114
102/* dummy objectid represents multiple objectids */ 115/* dummy objectid represents multiple objectids */
103#define BTRFS_MULTIPLE_OBJECTIDS -255ULL 116#define BTRFS_MULTIPLE_OBJECTIDS -255ULL
104 117
@@ -181,7 +194,6 @@ struct btrfs_mapping_tree {
181 struct extent_map_tree map_tree; 194 struct extent_map_tree map_tree;
182}; 195};
183 196
184#define BTRFS_UUID_SIZE 16
185struct btrfs_dev_item { 197struct btrfs_dev_item {
186 /* the internal btrfs device id */ 198 /* the internal btrfs device id */
187 __le64 devid; 199 __le64 devid;
@@ -265,6 +277,22 @@ struct btrfs_chunk {
265 /* additional stripes go here */ 277 /* additional stripes go here */
266} __attribute__ ((__packed__)); 278} __attribute__ ((__packed__));
267 279
280#define BTRFS_FREE_SPACE_EXTENT 1
281#define BTRFS_FREE_SPACE_BITMAP 2
282
283struct btrfs_free_space_entry {
284 __le64 offset;
285 __le64 bytes;
286 u8 type;
287} __attribute__ ((__packed__));
288
289struct btrfs_free_space_header {
290 struct btrfs_disk_key location;
291 __le64 generation;
292 __le64 num_entries;
293 __le64 num_bitmaps;
294} __attribute__ ((__packed__));
295
268static inline unsigned long btrfs_chunk_item_size(int num_stripes) 296static inline unsigned long btrfs_chunk_item_size(int num_stripes)
269{ 297{
270 BUG_ON(num_stripes == 0); 298 BUG_ON(num_stripes == 0);
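The two structures added above define the on-disk layout of the free
space cache: one btrfs_free_space_header at the front of the cache
inode, followed by packed records. A hypothetical helper, just to show
the arithmetic (each packed entry is 8 + 8 + 1 = 17 bytes):

	/* sketch: bytes consumed by the extent/bitmap entry records */
	static inline u64 free_space_entry_bytes(u64 num_entries)
	{
		return num_entries * sizeof(struct btrfs_free_space_entry);
	}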
@@ -272,9 +300,16 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes)
272 sizeof(struct btrfs_stripe) * (num_stripes - 1); 300 sizeof(struct btrfs_stripe) * (num_stripes - 1);
273} 301}
274 302
275#define BTRFS_FSID_SIZE 16
276#define BTRFS_HEADER_FLAG_WRITTEN (1ULL << 0) 303#define BTRFS_HEADER_FLAG_WRITTEN (1ULL << 0)
277#define BTRFS_HEADER_FLAG_RELOC (1ULL << 1) 304#define BTRFS_HEADER_FLAG_RELOC (1ULL << 1)
305
306/*
307 * File system states
308 */
309
310/* Errors detected */
311#define BTRFS_SUPER_FLAG_ERROR (1ULL << 2)
312
278#define BTRFS_SUPER_FLAG_SEEDING (1ULL << 32) 313#define BTRFS_SUPER_FLAG_SEEDING (1ULL << 32)
279#define BTRFS_SUPER_FLAG_METADUMP (1ULL << 33) 314#define BTRFS_SUPER_FLAG_METADUMP (1ULL << 33)
280 315
@@ -365,8 +400,10 @@ struct btrfs_super_block {
365 400
366 char label[BTRFS_LABEL_SIZE]; 401 char label[BTRFS_LABEL_SIZE];
367 402
403 __le64 cache_generation;
404
368 /* future expansion */ 405 /* future expansion */
369 __le64 reserved[32]; 406 __le64 reserved[31];
370 u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE]; 407 u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
371} __attribute__ ((__packed__)); 408} __attribute__ ((__packed__));
372 409
@@ -375,13 +412,17 @@ struct btrfs_super_block {
375 * ones specified below then we will fail to mount 412 * ones specified below then we will fail to mount
376 */ 413 */
377#define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF (1ULL << 0) 414#define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF (1ULL << 0)
378#define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL (2ULL << 0) 415#define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL (1ULL << 1)
416#define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS (1ULL << 2)
417#define BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO (1ULL << 3)
379 418
380#define BTRFS_FEATURE_COMPAT_SUPP 0ULL 419#define BTRFS_FEATURE_COMPAT_SUPP 0ULL
381#define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL 420#define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL
382#define BTRFS_FEATURE_INCOMPAT_SUPP \ 421#define BTRFS_FEATURE_INCOMPAT_SUPP \
383 (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \ 422 (BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \
384 BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL) 423 BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \
424 BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \
425 BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO)
385 426
386/* 427/*
387 * A leaf is full of items. offset and size tell us where to find 428 * A leaf is full of items. offset and size tell us where to find
@@ -474,6 +515,12 @@ struct btrfs_extent_item_v0 {
474/* use full backrefs for extent pointers in the block */ 515/* use full backrefs for extent pointers in the block */
475#define BTRFS_BLOCK_FLAG_FULL_BACKREF (1ULL << 8) 516#define BTRFS_BLOCK_FLAG_FULL_BACKREF (1ULL << 8)
476 517
518/*
519 * this flag is only used internally by scrub and may be changed at any time
520 * it is only declared here to avoid collisions
521 */
522#define BTRFS_EXTENT_FLAG_SUPER (1ULL << 48)
523
477struct btrfs_tree_block_info { 524struct btrfs_tree_block_info {
478 struct btrfs_disk_key key; 525 struct btrfs_disk_key key;
479 u8 level; 526 u8 level;
@@ -528,9 +575,11 @@ struct btrfs_timespec {
528} __attribute__ ((__packed__)); 575} __attribute__ ((__packed__));
529 576
530enum btrfs_compression_type { 577enum btrfs_compression_type {
531 BTRFS_COMPRESS_NONE = 0, 578 BTRFS_COMPRESS_NONE = 0,
532 BTRFS_COMPRESS_ZLIB = 1, 579 BTRFS_COMPRESS_ZLIB = 1,
533 BTRFS_COMPRESS_LAST = 2, 580 BTRFS_COMPRESS_LZO = 2,
581 BTRFS_COMPRESS_TYPES = 2,
582 BTRFS_COMPRESS_LAST = 3,
534}; 583};
535 584
536struct btrfs_inode_item { 585struct btrfs_inode_item {
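With LZO added to the enum above, mount-option parsing has two real
compressor names to map. A sketch of that mapping (assuming the
compress_type field added to btrfs_fs_info later in this patch; not
verbatim kernel code):

	if (strcmp(str, "zlib") == 0)
		fs_info->compress_type = BTRFS_COMPRESS_ZLIB;
	else if (strcmp(str, "lzo") == 0)
		/* also needs BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO on disk */
		fs_info->compress_type = BTRFS_COMPRESS_LZO;
	else
		return -EINVAL;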
@@ -574,6 +623,8 @@ struct btrfs_dir_item {
574 u8 type; 623 u8 type;
575} __attribute__ ((__packed__)); 624} __attribute__ ((__packed__));
576 625
626#define BTRFS_ROOT_SUBVOL_RDONLY (1ULL << 0)
627
577struct btrfs_root_item { 628struct btrfs_root_item {
578 struct btrfs_inode_item inode; 629 struct btrfs_inode_item inode;
579 __le64 generation; 630 __le64 generation;
@@ -675,9 +726,10 @@ struct btrfs_block_group_item {
675struct btrfs_space_info { 726struct btrfs_space_info {
676 u64 flags; 727 u64 flags;
677 728
678 u64 total_bytes; /* total bytes in the space */ 729 u64 total_bytes; /* total bytes in the space,
730 this doesn't take mirrors into account */
679 u64 bytes_used; /* total bytes used, 731 u64 bytes_used; /* total bytes used,
680 this does't take mirrors into account */ 732 this doesn't take mirrors into account */
681 u64 bytes_pinned; /* total bytes pinned, will be freed when the 733 u64 bytes_pinned; /* total bytes pinned, will be freed when the
682 transaction finishes */ 734 transaction finishes */
683 u64 bytes_reserved; /* total bytes the allocator has reserved for 735 u64 bytes_reserved; /* total bytes the allocator has reserved for
@@ -687,11 +739,24 @@ struct btrfs_space_info {
687 u64 bytes_may_use; /* number of bytes that may be used for 739 u64 bytes_may_use; /* number of bytes that may be used for
688 delalloc/allocations */ 740 delalloc/allocations */
689 u64 disk_used; /* total bytes used on disk */ 741 u64 disk_used; /* total bytes used on disk */
742 u64 disk_total; /* total bytes on disk, takes mirrors into
743 account */
744
745 /*
746 * we bump reservation progress every time we decrement
747 * bytes_reserved. This way people waiting for reservations
748 * know something good has happened and they can check
749 * for progress. The number here isn't to be trusted, it
750 * just shows reclaim activity
751 */
752 unsigned long reservation_progress;
690 753
691 int full; /* indicates that we cannot allocate any more 754 unsigned int full:1; /* indicates that we cannot allocate any more
692 chunks for this space */ 755 chunks for this space */
693 int force_alloc; /* set if we need to force a chunk alloc for 756 unsigned int chunk_alloc:1; /* set if we are allocating a chunk */
694 this space */ 757
758 unsigned int force_alloc; /* set if we need to force a chunk
759 alloc for this space */
695 760
696 struct list_head list; 761 struct list_head list;
697 762
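The reservation_progress counter documented above gives waiters a cheap
"did anything get freed?" signal. An illustrative waiter loop under
that scheme (try_reserve() is a hypothetical placeholder, not a btrfs
function):

	unsigned long progress = space_info->reservation_progress;

	while (!try_reserve(space_info, num_bytes)) {
		if (space_info->reservation_progress == progress)
			break;	/* no reclaim activity since we last looked */
		progress = space_info->reservation_progress;
	}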
@@ -732,9 +797,6 @@ struct btrfs_free_cluster {
732 /* first extent starting offset */ 797 /* first extent starting offset */
733 u64 window_start; 798 u64 window_start;
734 799
735 /* if this cluster simply points at a bitmap in the block group */
736 bool points_to_bitmap;
737
738 struct btrfs_block_group_cache *block_group; 800 struct btrfs_block_group_cache *block_group;
739 /* 801 /*
740 * when a cluster is allocated from a block group, we put the 802 * when a cluster is allocated from a block group, we put the
@@ -750,6 +812,14 @@ enum btrfs_caching_type {
750 BTRFS_CACHE_FINISHED = 2, 812 BTRFS_CACHE_FINISHED = 2,
751}; 813};
752 814
815enum btrfs_disk_cache_state {
816 BTRFS_DC_WRITTEN = 0,
817 BTRFS_DC_ERROR = 1,
818 BTRFS_DC_CLEAR = 2,
819 BTRFS_DC_SETUP = 3,
820 BTRFS_DC_NEED_WRITE = 4,
821};
822
753struct btrfs_caching_control { 823struct btrfs_caching_control {
754 struct list_head list; 824 struct list_head list;
755 struct mutex mutex; 825 struct mutex mutex;
@@ -763,6 +833,7 @@ struct btrfs_block_group_cache {
763 struct btrfs_key key; 833 struct btrfs_key key;
764 struct btrfs_block_group_item item; 834 struct btrfs_block_group_item item;
765 struct btrfs_fs_info *fs_info; 835 struct btrfs_fs_info *fs_info;
836 struct inode *inode;
766 spinlock_t lock; 837 spinlock_t lock;
767 u64 pinned; 838 u64 pinned;
768 u64 reserved; 839 u64 reserved;
@@ -770,11 +841,11 @@ struct btrfs_block_group_cache {
770 u64 bytes_super; 841 u64 bytes_super;
771 u64 flags; 842 u64 flags;
772 u64 sectorsize; 843 u64 sectorsize;
773 int extents_thresh; 844 unsigned int ro:1;
774 int free_extents; 845 unsigned int dirty:1;
775 int total_bitmaps; 846 unsigned int iref:1;
776 int ro; 847
777 int dirty; 848 int disk_cache_state;
778 849
779 /* cache tracking stuff */ 850 /* cache tracking stuff */
780 int cached; 851 int cached;
@@ -784,9 +855,7 @@ struct btrfs_block_group_cache {
784 struct btrfs_space_info *space_info; 855 struct btrfs_space_info *space_info;
785 856
786 /* free space cache stuff */ 857 /* free space cache stuff */
787 spinlock_t tree_lock; 858 struct btrfs_free_space_ctl *free_space_ctl;
788 struct rb_root free_space_offset;
789 u64 free_space;
790 859
791 /* block group cache stuff */ 860 /* block group cache stuff */
792 struct rb_node cache_node; 861 struct rb_node cache_node;
@@ -806,6 +875,7 @@ struct btrfs_block_group_cache {
806struct reloc_control; 875struct reloc_control;
807struct btrfs_device; 876struct btrfs_device;
808struct btrfs_fs_devices; 877struct btrfs_fs_devices;
878struct btrfs_delayed_root;
809struct btrfs_fs_info { 879struct btrfs_fs_info {
810 u8 fsid[BTRFS_FSID_SIZE]; 880 u8 fsid[BTRFS_FSID_SIZE];
811 u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; 881 u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
@@ -832,7 +902,10 @@ struct btrfs_fs_info {
832 /* logical->physical extent mapping */ 902 /* logical->physical extent mapping */
833 struct btrfs_mapping_tree mapping_tree; 903 struct btrfs_mapping_tree mapping_tree;
834 904
835 /* block reservation for extent, checksum and root tree */ 905 /*
906 * block reservation for extent, checksum, root tree and
907 * delayed dir index item
908 */
836 struct btrfs_block_rsv global_block_rsv; 909 struct btrfs_block_rsv global_block_rsv;
837 /* block reservation for delay allocation */ 910 /* block reservation for delay allocation */
838 struct btrfs_block_rsv delalloc_block_rsv; 911 struct btrfs_block_rsv delalloc_block_rsv;
@@ -856,13 +929,14 @@ struct btrfs_fs_info {
856 * is required instead of the faster short fsync log commits 929 * is required instead of the faster short fsync log commits
857 */ 930 */
858 u64 last_trans_log_full_commit; 931 u64 last_trans_log_full_commit;
859 u64 open_ioctl_trans; 932 unsigned long mount_opt:20;
860 unsigned long mount_opt; 933 unsigned long compress_type:4;
861 u64 max_inline; 934 u64 max_inline;
862 u64 alloc_start; 935 u64 alloc_start;
863 struct btrfs_transaction *running_transaction; 936 struct btrfs_transaction *running_transaction;
864 wait_queue_head_t transaction_throttle; 937 wait_queue_head_t transaction_throttle;
865 wait_queue_head_t transaction_wait; 938 wait_queue_head_t transaction_wait;
939 wait_queue_head_t transaction_blocked_wait;
866 wait_queue_head_t async_submit_wait; 940 wait_queue_head_t async_submit_wait;
867 941
868 struct btrfs_super_block super_copy; 942 struct btrfs_super_block super_copy;
@@ -871,7 +945,6 @@ struct btrfs_fs_info {
871 struct super_block *sb; 945 struct super_block *sb;
872 struct inode *btree_inode; 946 struct inode *btree_inode;
873 struct backing_dev_info bdi; 947 struct backing_dev_info bdi;
874 struct mutex trans_mutex;
875 struct mutex tree_log_mutex; 948 struct mutex tree_log_mutex;
876 struct mutex transaction_kthread_mutex; 949 struct mutex transaction_kthread_mutex;
877 struct mutex cleaner_mutex; 950 struct mutex cleaner_mutex;
@@ -892,6 +965,13 @@ struct btrfs_fs_info {
892 struct rw_semaphore subvol_sem; 965 struct rw_semaphore subvol_sem;
893 struct srcu_struct subvol_srcu; 966 struct srcu_struct subvol_srcu;
894 967
968 spinlock_t trans_lock;
969 /*
970 * the reloc mutex goes with the trans lock, it is taken
971 * during commit to protect us from the relocation code
972 */
973 struct mutex reloc_mutex;
974
895 struct list_head trans_list; 975 struct list_head trans_list;
896 struct list_head hashers; 976 struct list_head hashers;
897 struct list_head dead_roots; 977 struct list_head dead_roots;
@@ -904,6 +984,7 @@ struct btrfs_fs_info {
904 atomic_t async_submit_draining; 984 atomic_t async_submit_draining;
905 atomic_t nr_async_bios; 985 atomic_t nr_async_bios;
906 atomic_t async_delalloc_pages; 986 atomic_t async_delalloc_pages;
987 atomic_t open_ioctl_trans;
907 988
908 /* 989 /*
909 * this is used by the balancing code to wait for all the pending 990 * this is used by the balancing code to wait for all the pending
@@ -949,6 +1030,7 @@ struct btrfs_fs_info {
949 struct btrfs_workers endio_meta_workers; 1030 struct btrfs_workers endio_meta_workers;
950 struct btrfs_workers endio_meta_write_workers; 1031 struct btrfs_workers endio_meta_write_workers;
951 struct btrfs_workers endio_write_workers; 1032 struct btrfs_workers endio_write_workers;
1033 struct btrfs_workers endio_freespace_worker;
952 struct btrfs_workers submit_workers; 1034 struct btrfs_workers submit_workers;
953 /* 1035 /*
954 * fixup workers take dirty pages that didn't properly go through 1036 * fixup workers take dirty pages that didn't properly go through
@@ -956,6 +1038,7 @@ struct btrfs_fs_info {
956 * for the sys_munmap function call path 1038 * for the sys_munmap function call path
957 */ 1039 */
958 struct btrfs_workers fixup_workers; 1040 struct btrfs_workers fixup_workers;
1041 struct btrfs_workers delayed_workers;
959 struct task_struct *transaction_kthread; 1042 struct task_struct *transaction_kthread;
960 struct task_struct *cleaner_kthread; 1043 struct task_struct *cleaner_kthread;
961 int thread_pool_size; 1044 int thread_pool_size;
@@ -966,6 +1049,7 @@ struct btrfs_fs_info {
966 int closing; 1049 int closing;
967 int log_root_recovering; 1050 int log_root_recovering;
968 int enospc_unlink; 1051 int enospc_unlink;
1052 int trans_no_join;
969 1053
970 u64 total_pinned; 1054 u64 total_pinned;
971 1055
@@ -987,7 +1071,6 @@ struct btrfs_fs_info {
987 struct reloc_control *reloc_ctl; 1071 struct reloc_control *reloc_ctl;
988 1072
989 spinlock_t delalloc_lock; 1073 spinlock_t delalloc_lock;
990 spinlock_t new_trans_lock;
991 u64 delalloc_bytes; 1074 u64 delalloc_bytes;
992 1075
993 /* data_alloc_cluster is only used in ssd mode */ 1076 /* data_alloc_cluster is only used in ssd mode */
@@ -996,6 +1079,11 @@ struct btrfs_fs_info {
996 /* all metadata allocations go through this cluster */ 1079 /* all metadata allocations go through this cluster */
997 struct btrfs_free_cluster meta_alloc_cluster; 1080 struct btrfs_free_cluster meta_alloc_cluster;
998 1081
1082 /* auto defrag inodes go here */
1083 spinlock_t defrag_inodes_lock;
1084 struct rb_root defrag_inodes;
1085 atomic_t defrag_running;
1086
999 spinlock_t ref_cache_lock; 1087 spinlock_t ref_cache_lock;
1000 u64 total_ref_cache_size; 1088 u64 total_ref_cache_size;
1001 1089
@@ -1010,6 +1098,22 @@ struct btrfs_fs_info {
1010 unsigned metadata_ratio; 1098 unsigned metadata_ratio;
1011 1099
1012 void *bdev_holder; 1100 void *bdev_holder;
1101
1102 /* private scrub information */
1103 struct mutex scrub_lock;
1104 atomic_t scrubs_running;
1105 atomic_t scrub_pause_req;
1106 atomic_t scrubs_paused;
1107 atomic_t scrub_cancel_req;
1108 wait_queue_head_t scrub_pause_wait;
1109 struct rw_semaphore scrub_super_lock;
1110 int scrub_workers_refcnt;
1111 struct btrfs_workers scrub_workers;
1112
1113 /* filesystem state */
1114 u64 fs_state;
1115
1116 struct btrfs_delayed_root *delayed_root;
1013}; 1117};
1014 1118
1015/* 1119/*
@@ -1019,9 +1123,6 @@ struct btrfs_fs_info {
1019struct btrfs_root { 1123struct btrfs_root {
1020 struct extent_buffer *node; 1124 struct extent_buffer *node;
1021 1125
1022 /* the node lock is held while changing the node pointer */
1023 spinlock_t node_lock;
1024
1025 struct extent_buffer *commit_root; 1126 struct extent_buffer *commit_root;
1026 struct btrfs_root *log_root; 1127 struct btrfs_root *log_root;
1027 struct btrfs_root *reloc_root; 1128 struct btrfs_root *reloc_root;
@@ -1038,6 +1139,16 @@ struct btrfs_root {
1038 spinlock_t accounting_lock; 1139 spinlock_t accounting_lock;
1039 struct btrfs_block_rsv *block_rsv; 1140 struct btrfs_block_rsv *block_rsv;
1040 1141
1142 /* free ino cache stuff */
1143 struct mutex fs_commit_mutex;
1144 struct btrfs_free_space_ctl *free_ino_ctl;
1145 enum btrfs_caching_type cached;
1146 spinlock_t cache_lock;
1147 wait_queue_head_t cache_wait;
1148 struct btrfs_free_space_ctl *free_ino_pinned;
1149 u64 cache_progress;
1150 struct inode *cache_inode;
1151
1041 struct mutex log_mutex; 1152 struct mutex log_mutex;
1042 wait_queue_head_t log_writer_wait; 1153 wait_queue_head_t log_writer_wait;
1043 wait_queue_head_t log_commit_wait[2]; 1154 wait_queue_head_t log_commit_wait[2];
@@ -1066,6 +1177,14 @@ struct btrfs_root {
1066 u32 type; 1177 u32 type;
1067 1178
1068 u64 highest_objectid; 1179 u64 highest_objectid;
1180
1181 /* btrfs_record_root_in_trans is a multi-step process,
1182 * and it can race with the balancing code. But the
 1183 * race is very small, and it can only happen the first time the root
1184 * is added to each transaction. So in_trans_setup
1185 * is used to tell us when more checks are required
1186 */
1187 unsigned long in_trans_setup;
1069 int ref_cows; 1188 int ref_cows;
1070 int track_dirty; 1189 int track_dirty;
1071 int in_radix; 1190 int in_radix;
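The in_trans_setup flag above lets btrfs_record_root_in_trans() keep a
cheap fast path. A simplified sketch of the check the comment describes
(not verbatim kernel code):

	if (root->last_trans == trans->transid &&
	    !root->in_trans_setup)
		return 0;	/* already fully recorded in this transaction */
	/* otherwise fall through to the locked, multi-step setup */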
@@ -1075,7 +1194,6 @@ struct btrfs_root {
1075 struct btrfs_key defrag_max; 1194 struct btrfs_key defrag_max;
1076 int defrag_running; 1195 int defrag_running;
1077 char *name; 1196 char *name;
1078 int in_sysfs;
1079 1197
1080 /* the dirty list is only used by non-reference counted roots */ 1198 /* the dirty list is only used by non-reference counted roots */
1081 struct list_head dirty_list; 1199 struct list_head dirty_list;
@@ -1093,12 +1211,49 @@ struct btrfs_root {
1093 struct rb_root inode_tree; 1211 struct rb_root inode_tree;
1094 1212
1095 /* 1213 /*
1214 * radix tree that keeps track of delayed nodes of every inode,
1215 * protected by inode_lock
1216 */
1217 struct radix_tree_root delayed_nodes_tree;
1218 /*
1096 * right now this just gets used so that a root has its own devid 1219 * right now this just gets used so that a root has its own devid
1097 * for stat. It may be used for more later 1220 * for stat. It may be used for more later
1098 */ 1221 */
1099 struct super_block anon_super; 1222 struct super_block anon_super;
1100}; 1223};
1101 1224
1225struct btrfs_ioctl_defrag_range_args {
1226 /* start of the defrag operation */
1227 __u64 start;
1228
1229 /* number of bytes to defrag, use (u64)-1 to say all */
1230 __u64 len;
1231
1232 /*
1233 * flags for the operation, which can include turning
1234 * on compression for this one defrag
1235 */
1236 __u64 flags;
1237
1238 /*
1239 * any extent bigger than this will be considered
 1240 * already defragged. Use 0 to take the kernel default.
1241 * Use 1 to say every single extent must be rewritten
1242 */
1243 __u32 extent_thresh;
1244
1245 /*
1246 * which compression method to use if turning on compression
1247 * for this defrag operation. If unspecified, zlib will
1248 * be used
1249 */
1250 __u32 compress_type;
1251
1252 /* spare for later */
1253 __u32 unused[4];
1254};
1255
1256
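The defrag args structure above is the payload of the range-defrag
ioctl. An illustrative userspace invocation (assuming the
BTRFS_IOC_DEFRAG_RANGE ioctl number and BTRFS_DEFRAG_RANGE_COMPRESS
flag from ioctl.h; error handling elided):

	struct btrfs_ioctl_defrag_range_args range = {
		.start = 0,
		.len = (__u64)-1,			/* whole file */
		.extent_thresh = 0,			/* kernel default */
		.compress_type = BTRFS_COMPRESS_LZO,
		.flags = BTRFS_DEFRAG_RANGE_COMPRESS,	/* recompress as we go */
	};

	ioctl(fd, BTRFS_IOC_DEFRAG_RANGE, &range);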
1102/* 1257/*
1103 * inode items have the data typically returned from stat and store other 1258 * inode items have the data typically returned from stat and store other
1104 * info about object characteristics. There is one for every file and dir in 1259 * info about object characteristics. There is one for every file and dir in
@@ -1180,6 +1335,11 @@ struct btrfs_root {
1180 */ 1335 */
1181#define BTRFS_STRING_ITEM_KEY 253 1336#define BTRFS_STRING_ITEM_KEY 253
1182 1337
1338/*
1339 * Flags for mount options.
1340 *
1341 * Note: don't forget to add new options to btrfs_show_options()
1342 */
1183#define BTRFS_MOUNT_NODATASUM (1 << 0) 1343#define BTRFS_MOUNT_NODATASUM (1 << 0)
1184#define BTRFS_MOUNT_NODATACOW (1 << 1) 1344#define BTRFS_MOUNT_NODATACOW (1 << 1)
1185#define BTRFS_MOUNT_NOBARRIER (1 << 2) 1345#define BTRFS_MOUNT_NOBARRIER (1 << 2)
@@ -1192,6 +1352,12 @@ struct btrfs_root {
1192#define BTRFS_MOUNT_NOSSD (1 << 9) 1352#define BTRFS_MOUNT_NOSSD (1 << 9)
1193#define BTRFS_MOUNT_DISCARD (1 << 10) 1353#define BTRFS_MOUNT_DISCARD (1 << 10)
1194#define BTRFS_MOUNT_FORCE_COMPRESS (1 << 11) 1354#define BTRFS_MOUNT_FORCE_COMPRESS (1 << 11)
1355#define BTRFS_MOUNT_SPACE_CACHE (1 << 12)
1356#define BTRFS_MOUNT_CLEAR_CACHE (1 << 13)
1357#define BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED (1 << 14)
1358#define BTRFS_MOUNT_ENOSPC_DEBUG (1 << 15)
1359#define BTRFS_MOUNT_AUTO_DEFRAG (1 << 16)
1360#define BTRFS_MOUNT_INODE_MAP_CACHE (1 << 17)
1195 1361
1196#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) 1362#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
1197#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) 1363#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
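The new mount flags above are manipulated through the token-pasting
macros shown in context. A sketch of their use while parsing mount
options (illustrative only):

	/* "space_cache" seen on the command line */
	btrfs_set_opt(info->mount_opt, SPACE_CACHE);	/* ORs in BTRFS_MOUNT_SPACE_CACHE */

	/* later, once the on-disk cache has been rebuilt */
	btrfs_clear_opt(info->mount_opt, CLEAR_CACHE);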
@@ -1211,6 +1377,9 @@ struct btrfs_root {
1211#define BTRFS_INODE_NODUMP (1 << 8) 1377#define BTRFS_INODE_NODUMP (1 << 8)
1212#define BTRFS_INODE_NOATIME (1 << 9) 1378#define BTRFS_INODE_NOATIME (1 << 9)
1213#define BTRFS_INODE_DIRSYNC (1 << 10) 1379#define BTRFS_INODE_DIRSYNC (1 << 10)
1380#define BTRFS_INODE_COMPRESS (1 << 11)
1381
1382#define BTRFS_INODE_ROOT_ITEM_INIT (1 << 31)
1214 1383
1215/* some macros to generate set/get funcs for the struct fields. This 1384/* some macros to generate set/get funcs for the struct fields. This
1216 * assumes there is a lefoo_to_cpu for every type, so let's make a simple 1385
@@ -1364,26 +1533,12 @@ static inline u64 btrfs_stripe_offset_nr(struct extent_buffer *eb,
1364 return btrfs_stripe_offset(eb, btrfs_stripe_nr(c, nr)); 1533 return btrfs_stripe_offset(eb, btrfs_stripe_nr(c, nr));
1365} 1534}
1366 1535
1367static inline void btrfs_set_stripe_offset_nr(struct extent_buffer *eb,
1368 struct btrfs_chunk *c, int nr,
1369 u64 val)
1370{
1371 btrfs_set_stripe_offset(eb, btrfs_stripe_nr(c, nr), val);
1372}
1373
1374static inline u64 btrfs_stripe_devid_nr(struct extent_buffer *eb, 1536static inline u64 btrfs_stripe_devid_nr(struct extent_buffer *eb,
1375 struct btrfs_chunk *c, int nr) 1537 struct btrfs_chunk *c, int nr)
1376{ 1538{
1377 return btrfs_stripe_devid(eb, btrfs_stripe_nr(c, nr)); 1539 return btrfs_stripe_devid(eb, btrfs_stripe_nr(c, nr));
1378} 1540}
1379 1541
1380static inline void btrfs_set_stripe_devid_nr(struct extent_buffer *eb,
1381 struct btrfs_chunk *c, int nr,
1382 u64 val)
1383{
1384 btrfs_set_stripe_devid(eb, btrfs_stripe_nr(c, nr), val);
1385}
1386
1387/* struct btrfs_block_group_item */ 1542/* struct btrfs_block_group_item */
1388BTRFS_SETGET_STACK_FUNCS(block_group_used, struct btrfs_block_group_item, 1543BTRFS_SETGET_STACK_FUNCS(block_group_used, struct btrfs_block_group_item,
1389 used, 64); 1544 used, 64);
@@ -1441,14 +1596,6 @@ btrfs_inode_ctime(struct btrfs_inode_item *inode_item)
1441 return (struct btrfs_timespec *)ptr; 1596 return (struct btrfs_timespec *)ptr;
1442} 1597}
1443 1598
1444static inline struct btrfs_timespec *
1445btrfs_inode_otime(struct btrfs_inode_item *inode_item)
1446{
1447 unsigned long ptr = (unsigned long)inode_item;
1448 ptr += offsetof(struct btrfs_inode_item, otime);
1449 return (struct btrfs_timespec *)ptr;
1450}
1451
1452BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_timespec, sec, 64); 1599BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_timespec, sec, 64);
1453BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32); 1600BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32);
1454 1601
@@ -1665,6 +1812,27 @@ static inline void btrfs_set_dir_item_key(struct extent_buffer *eb,
1665 write_eb_member(eb, item, struct btrfs_dir_item, location, key); 1812 write_eb_member(eb, item, struct btrfs_dir_item, location, key);
1666} 1813}
1667 1814
1815BTRFS_SETGET_FUNCS(free_space_entries, struct btrfs_free_space_header,
1816 num_entries, 64);
1817BTRFS_SETGET_FUNCS(free_space_bitmaps, struct btrfs_free_space_header,
1818 num_bitmaps, 64);
1819BTRFS_SETGET_FUNCS(free_space_generation, struct btrfs_free_space_header,
1820 generation, 64);
1821
1822static inline void btrfs_free_space_key(struct extent_buffer *eb,
1823 struct btrfs_free_space_header *h,
1824 struct btrfs_disk_key *key)
1825{
1826 read_eb_member(eb, h, struct btrfs_free_space_header, location, key);
1827}
1828
1829static inline void btrfs_set_free_space_key(struct extent_buffer *eb,
1830 struct btrfs_free_space_header *h,
1831 struct btrfs_disk_key *key)
1832{
1833 write_eb_member(eb, h, struct btrfs_free_space_header, location, key);
1834}
1835
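A sketch of reading a cache file's header through the accessors above,
given a path already positioned on the BTRFS_FREE_SPACE_OBJECTID item
(illustrative, error handling elided):

	struct btrfs_free_space_header *header;
	struct btrfs_disk_key disk_key;
	u64 generation, entries;

	header = btrfs_item_ptr(leaf, path->slots[0],
				struct btrfs_free_space_header);
	generation = btrfs_free_space_generation(leaf, header);
	entries = btrfs_free_space_entries(leaf, header);
	btrfs_free_space_key(leaf, header, &disk_key);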
1668/* struct btrfs_disk_key */ 1836/* struct btrfs_disk_key */
1669BTRFS_SETGET_STACK_FUNCS(disk_key_objectid, struct btrfs_disk_key, 1837BTRFS_SETGET_STACK_FUNCS(disk_key_objectid, struct btrfs_disk_key,
1670 objectid, 64); 1838 objectid, 64);
@@ -1778,33 +1946,6 @@ static inline u8 *btrfs_header_chunk_tree_uuid(struct extent_buffer *eb)
1778 return (u8 *)ptr; 1946 return (u8 *)ptr;
1779} 1947}
1780 1948
1781static inline u8 *btrfs_super_fsid(struct extent_buffer *eb)
1782{
1783 unsigned long ptr = offsetof(struct btrfs_super_block, fsid);
1784 return (u8 *)ptr;
1785}
1786
1787static inline u8 *btrfs_header_csum(struct extent_buffer *eb)
1788{
1789 unsigned long ptr = offsetof(struct btrfs_header, csum);
1790 return (u8 *)ptr;
1791}
1792
1793static inline struct btrfs_node *btrfs_buffer_node(struct extent_buffer *eb)
1794{
1795 return NULL;
1796}
1797
1798static inline struct btrfs_leaf *btrfs_buffer_leaf(struct extent_buffer *eb)
1799{
1800 return NULL;
1801}
1802
1803static inline struct btrfs_header *btrfs_buffer_header(struct extent_buffer *eb)
1804{
1805 return NULL;
1806}
1807
1808static inline int btrfs_is_leaf(struct extent_buffer *eb) 1949static inline int btrfs_is_leaf(struct extent_buffer *eb)
1809{ 1950{
1810 return btrfs_header_level(eb) == 0; 1951 return btrfs_header_level(eb) == 0;
@@ -1829,6 +1970,11 @@ BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64);
1829BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item, 1970BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item,
1830 last_snapshot, 64); 1971 last_snapshot, 64);
1831 1972
1973static inline bool btrfs_root_readonly(struct btrfs_root *root)
1974{
1975 return root->root_item.flags & BTRFS_ROOT_SUBVOL_RDONLY;
1976}
1977
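btrfs_root_readonly() above backs the new BTRFS_ROOT_SUBVOL_RDONLY
flag. The typical guard it enables looks like this (a sketch of the
pattern, not a specific call site):

	if (btrfs_root_readonly(root))
		return -EROFS;	/* refuse writes into a read-only subvolume */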
1832/* struct btrfs_super_block */ 1978/* struct btrfs_super_block */
1833 1979
1834BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64); 1980BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
@@ -1876,6 +2022,8 @@ BTRFS_SETGET_STACK_FUNCS(super_incompat_flags, struct btrfs_super_block,
1876 incompat_flags, 64); 2022 incompat_flags, 64);
1877BTRFS_SETGET_STACK_FUNCS(super_csum_type, struct btrfs_super_block, 2023BTRFS_SETGET_STACK_FUNCS(super_csum_type, struct btrfs_super_block,
1878 csum_type, 16); 2024 csum_type, 16);
2025BTRFS_SETGET_STACK_FUNCS(super_cache_generation, struct btrfs_super_block,
2026 cache_generation, 64);
1879 2027
1880static inline int btrfs_super_csum_size(struct btrfs_super_block *s) 2028static inline int btrfs_super_csum_size(struct btrfs_super_block *s)
1881{ 2029{
@@ -1951,22 +2099,6 @@ static inline struct btrfs_root *btrfs_sb(struct super_block *sb)
1951 return sb->s_fs_info; 2099 return sb->s_fs_info;
1952} 2100}
1953 2101
1954static inline int btrfs_set_root_name(struct btrfs_root *root,
1955 const char *name, int len)
1956{
1957 /* if we already have a name just free it */
1958 kfree(root->name);
1959
1960 root->name = kmalloc(len+1, GFP_KERNEL);
1961 if (!root->name)
1962 return -ENOMEM;
1963
1964 memcpy(root->name, name, len);
1965 root->name[len] = '\0';
1966
1967 return 0;
1968}
1969
1970static inline u32 btrfs_level_size(struct btrfs_root *root, int level) 2102static inline u32 btrfs_level_size(struct btrfs_root *root, int level)
1971{ 2103{
1972 if (level == 0) 2104 if (level == 0)
@@ -1988,7 +2120,20 @@ static inline struct dentry *fdentry(struct file *file)
1988 return file->f_path.dentry; 2120 return file->f_path.dentry;
1989} 2121}
1990 2122
2123static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info)
2124{
2125 return ((space_info->flags & BTRFS_BLOCK_GROUP_METADATA) &&
2126 (space_info->flags & BTRFS_BLOCK_GROUP_DATA));
2127}
2128
1991/* extent-tree.c */ 2129/* extent-tree.c */
2130static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root,
2131 int num_items)
2132{
2133 return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) *
2134 3 * num_items;
2135}
2136
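Worked example for btrfs_calc_trans_metadata_size() above, assuming 4K
leaves and nodes and BTRFS_MAX_LEVEL == 8: one item can dirty a full
path of 8 tree blocks, tripled to leave room for splits and CoW:

	/* (4096 + 4096 * 7) * 3 * 1 == 98304 bytes reserved per item */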
1992void btrfs_put_block_group(struct btrfs_block_group_cache *cache); 2137void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
1993int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, 2138int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
1994 struct btrfs_root *root, unsigned long count); 2139 struct btrfs_root *root, unsigned long count);
@@ -1998,12 +2143,9 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
1998 u64 num_bytes, u64 *refs, u64 *flags); 2143 u64 num_bytes, u64 *refs, u64 *flags);
1999int btrfs_pin_extent(struct btrfs_root *root, 2144int btrfs_pin_extent(struct btrfs_root *root,
2000 u64 bytenr, u64 num, int reserved); 2145 u64 bytenr, u64 num, int reserved);
2001int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
2002 struct btrfs_root *root, struct extent_buffer *leaf);
2003int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, 2146int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
2004 struct btrfs_root *root, 2147 struct btrfs_root *root,
2005 u64 objectid, u64 offset, u64 bytenr); 2148 u64 objectid, u64 offset, u64 bytenr);
2006int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy);
2007struct btrfs_block_group_cache *btrfs_lookup_block_group( 2149struct btrfs_block_group_cache *btrfs_lookup_block_group(
2008 struct btrfs_fs_info *info, 2150 struct btrfs_fs_info *info,
2009 u64 bytenr); 2151 u64 bytenr);
@@ -2051,6 +2193,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
2051 u64 root_objectid, u64 owner, u64 offset); 2193 u64 root_objectid, u64 owner, u64 offset);
2052 2194
2053int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len); 2195int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
2196int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
2197 u64 num_bytes, int reserve, int sinfo);
2054int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans, 2198int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
2055 struct btrfs_root *root); 2199 struct btrfs_root *root);
2056int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, 2200int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
@@ -2073,13 +2217,14 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
2073int btrfs_remove_block_group(struct btrfs_trans_handle *trans, 2217int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
2074 struct btrfs_root *root, u64 group_start); 2218 struct btrfs_root *root, u64 group_start);
2075u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); 2219u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
2220u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data);
2076void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde); 2221void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde);
2077void btrfs_clear_space_info_full(struct btrfs_fs_info *info); 2222void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
2078int btrfs_check_data_free_space(struct inode *inode, u64 bytes); 2223int btrfs_check_data_free_space(struct inode *inode, u64 bytes);
2079void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes); 2224void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes);
2080int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans, 2225int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
2081 struct btrfs_root *root, 2226 struct btrfs_root *root,
2082 int num_items, int *retries); 2227 int num_items);
2083void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, 2228void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
2084 struct btrfs_root *root); 2229 struct btrfs_root *root);
2085int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, 2230int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
@@ -2100,7 +2245,7 @@ void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
2100int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, 2245int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
2101 struct btrfs_root *root, 2246 struct btrfs_root *root,
2102 struct btrfs_block_rsv *block_rsv, 2247 struct btrfs_block_rsv *block_rsv,
2103 u64 num_bytes, int *retries); 2248 u64 num_bytes);
2104int btrfs_block_rsv_check(struct btrfs_trans_handle *trans, 2249int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
2105 struct btrfs_root *root, 2250 struct btrfs_root *root,
2106 struct btrfs_block_rsv *block_rsv, 2251 struct btrfs_block_rsv *block_rsv,
@@ -2111,10 +2256,24 @@ int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
2111void btrfs_block_rsv_release(struct btrfs_root *root, 2256void btrfs_block_rsv_release(struct btrfs_root *root,
2112 struct btrfs_block_rsv *block_rsv, 2257 struct btrfs_block_rsv *block_rsv,
2113 u64 num_bytes); 2258 u64 num_bytes);
2259int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans,
2260 struct btrfs_root *root,
2261 struct btrfs_block_rsv *rsv);
2114int btrfs_set_block_group_ro(struct btrfs_root *root, 2262int btrfs_set_block_group_ro(struct btrfs_root *root,
2115 struct btrfs_block_group_cache *cache); 2263 struct btrfs_block_group_cache *cache);
2116int btrfs_set_block_group_rw(struct btrfs_root *root, 2264int btrfs_set_block_group_rw(struct btrfs_root *root,
2117 struct btrfs_block_group_cache *cache); 2265 struct btrfs_block_group_cache *cache);
2266void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
2267u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo);
2268int btrfs_error_unpin_extent_range(struct btrfs_root *root,
2269 u64 start, u64 end);
2270int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr,
2271 u64 num_bytes, u64 *actual_bytes);
2272int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
2273 struct btrfs_root *root, u64 type);
2274int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range);
2275
2276int btrfs_init_space_info(struct btrfs_fs_info *fs_info);
2118/* ctree.c */ 2277/* ctree.c */
2119int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, 2278int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
2120 int level, int *slot); 2279 int level, int *slot);
@@ -2166,10 +2325,12 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
2166 struct btrfs_root *root, struct extent_buffer *parent, 2325 struct btrfs_root *root, struct extent_buffer *parent,
2167 int start_slot, int cache_only, u64 *last_ret, 2326 int start_slot, int cache_only, u64 *last_ret,
2168 struct btrfs_key *progress); 2327 struct btrfs_key *progress);
2169void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p); 2328void btrfs_release_path(struct btrfs_path *p);
2170struct btrfs_path *btrfs_alloc_path(void); 2329struct btrfs_path *btrfs_alloc_path(void);
2171void btrfs_free_path(struct btrfs_path *p); 2330void btrfs_free_path(struct btrfs_path *p);
2172void btrfs_set_path_blocking(struct btrfs_path *p); 2331void btrfs_set_path_blocking(struct btrfs_path *p);
2332void btrfs_clear_path_blocking(struct btrfs_path *p,
2333 struct extent_buffer *held);
2173void btrfs_unlock_up_safe(struct btrfs_path *p, int level); 2334void btrfs_unlock_up_safe(struct btrfs_path *p, int level);
2174 2335
2175int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, 2336int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
@@ -2181,13 +2342,12 @@ static inline int btrfs_del_item(struct btrfs_trans_handle *trans,
2181 return btrfs_del_items(trans, root, path, path->slots[0], 1); 2342 return btrfs_del_items(trans, root, path, path->slots[0], 1);
2182} 2343}
2183 2344
2345int setup_items_for_insert(struct btrfs_trans_handle *trans,
2346 struct btrfs_root *root, struct btrfs_path *path,
2347 struct btrfs_key *cpu_key, u32 *data_size,
2348 u32 total_data, u32 total_size, int nr);
2184int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root 2349int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root
2185 *root, struct btrfs_key *key, void *data, u32 data_size); 2350 *root, struct btrfs_key *key, void *data, u32 data_size);
2186int btrfs_insert_some_items(struct btrfs_trans_handle *trans,
2187 struct btrfs_root *root,
2188 struct btrfs_path *path,
2189 struct btrfs_key *cpu_key, u32 *data_size,
2190 int nr);
2191int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, 2351int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
2192 struct btrfs_root *root, 2352 struct btrfs_root *root,
2193 struct btrfs_path *path, 2353 struct btrfs_path *path,
@@ -2211,6 +2371,15 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
2211 struct btrfs_root *root, 2371 struct btrfs_root *root,
2212 struct extent_buffer *node, 2372 struct extent_buffer *node,
2213 struct extent_buffer *parent); 2373 struct extent_buffer *parent);
2374static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info)
2375{
2376 /*
2377 * Get synced with close_ctree()
2378 */
2379 smp_mb();
2380 return fs_info->closing;
2381}
2382
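The smp_mb() in btrfs_fs_closing() above orders the read of
fs_info->closing against close_ctree() setting it. Illustrative use in
a long-running loop (the work helpers are hypothetical):

	while (have_more_work(ctl)) {
		if (btrfs_fs_closing(fs_info))
			break;		/* unmount started, bail out early */
		process_one_item(ctl);
	}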
2214/* root-item.c */ 2383/* root-item.c */
2215int btrfs_find_root_ref(struct btrfs_root *tree_root, 2384int btrfs_find_root_ref(struct btrfs_root *tree_root,
2216 struct btrfs_path *path, 2385 struct btrfs_path *path,
@@ -2233,16 +2402,16 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
2233 *item); 2402 *item);
2234int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct 2403int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct
2235 btrfs_root_item *item, struct btrfs_key *key); 2404 btrfs_root_item *item, struct btrfs_key *key);
2236int btrfs_search_root(struct btrfs_root *root, u64 search_start,
2237 u64 *found_objectid);
2238int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid); 2405int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid);
2239int btrfs_find_orphan_roots(struct btrfs_root *tree_root); 2406int btrfs_find_orphan_roots(struct btrfs_root *tree_root);
2240int btrfs_set_root_node(struct btrfs_root_item *item, 2407int btrfs_set_root_node(struct btrfs_root_item *item,
2241 struct extent_buffer *node); 2408 struct extent_buffer *node);
2409void btrfs_check_and_init_root_item(struct btrfs_root_item *item);
2410
2242/* dir-item.c */ 2411/* dir-item.c */
2243int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, 2412int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
2244 struct btrfs_root *root, const char *name, 2413 struct btrfs_root *root, const char *name,
2245 int name_len, u64 dir, 2414 int name_len, struct inode *dir,
2246 struct btrfs_key *location, u8 type, u64 index); 2415 struct btrfs_key *location, u8 type, u64 index);
2247struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, 2416struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
2248 struct btrfs_root *root, 2417 struct btrfs_root *root,
@@ -2276,6 +2445,9 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
2276 struct btrfs_path *path, u64 dir, 2445 struct btrfs_path *path, u64 dir,
2277 const char *name, u16 name_len, 2446 const char *name, u16 name_len,
2278 int mod); 2447 int mod);
2448int verify_dir_item(struct btrfs_root *root,
2449 struct extent_buffer *leaf,
2450 struct btrfs_dir_item *dir_item);
2279 2451
2280/* orphan.c */ 2452/* orphan.c */
2281int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, 2453int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
@@ -2284,12 +2456,6 @@ int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
2284 struct btrfs_root *root, u64 offset); 2456 struct btrfs_root *root, u64 offset);
2285int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset); 2457int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset);
2286 2458
2287/* inode-map.c */
2288int btrfs_find_free_objectid(struct btrfs_trans_handle *trans,
2289 struct btrfs_root *fs_root,
2290 u64 dirid, u64 *objectid);
2291int btrfs_find_highest_inode(struct btrfs_root *fs_root, u64 *objectid);
2292
2293/* inode-item.c */ 2459/* inode-item.c */
2294int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, 2460int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
2295 struct btrfs_root *root, 2461 struct btrfs_root *root,
@@ -2334,8 +2500,6 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
2334 struct btrfs_ordered_sum *sums); 2500 struct btrfs_ordered_sum *sums);
2335int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, 2501int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode,
2336 struct bio *bio, u64 file_start, int contig); 2502 struct bio *bio, u64 file_start, int contig);
2337int btrfs_csum_file_bytes(struct btrfs_root *root, struct inode *inode,
2338 u64 start, unsigned long len);
2339struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans, 2503struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
2340 struct btrfs_root *root, 2504 struct btrfs_root *root,
2341 struct btrfs_path *path, 2505 struct btrfs_path *path,
@@ -2343,8 +2507,8 @@ struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
2343int btrfs_csum_truncate(struct btrfs_trans_handle *trans, 2507int btrfs_csum_truncate(struct btrfs_trans_handle *trans,
2344 struct btrfs_root *root, struct btrfs_path *path, 2508 struct btrfs_root *root, struct btrfs_path *path,
2345 u64 isize); 2509 u64 isize);
2346int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, 2510int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
2347 u64 end, struct list_head *list); 2511 struct list_head *list, int search_commit);
2348/* inode.c */ 2512/* inode.c */
2349 2513
2350/* RHEL and EL kernels have a patch that renames PG_checked to FsMisc */ 2514/* RHEL and EL kernels have a patch that renames PG_checked to FsMisc */
@@ -2373,14 +2537,12 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
2373 u32 min_type); 2537 u32 min_type);
2374 2538
2375int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput); 2539int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput);
2376int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput);
2377int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end, 2540int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end,
2378 struct extent_state **cached_state); 2541 struct extent_state **cached_state);
2379int btrfs_writepages(struct address_space *mapping, 2542int btrfs_writepages(struct address_space *mapping,
2380 struct writeback_control *wbc); 2543 struct writeback_control *wbc);
2381int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, 2544int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
2382 struct btrfs_root *new_root, 2545 struct btrfs_root *new_root, u64 new_dirid);
2383 u64 new_dirid, u64 alloc_hint);
2384int btrfs_merge_bio_hook(struct page *page, unsigned long offset, 2546int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
2385 size_t size, struct bio *bio, unsigned long bio_flags); 2547 size_t size, struct bio *bio, unsigned long bio_flags);
2386 2548
@@ -2390,9 +2552,8 @@ unsigned long btrfs_force_ra(struct address_space *mapping,
2390int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); 2552int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
2391int btrfs_readpage(struct file *file, struct page *page); 2553int btrfs_readpage(struct file *file, struct page *page);
2392void btrfs_evict_inode(struct inode *inode); 2554void btrfs_evict_inode(struct inode *inode);
2393void btrfs_put_inode(struct inode *inode);
2394int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc); 2555int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc);
2395void btrfs_dirty_inode(struct inode *inode); 2556void btrfs_dirty_inode(struct inode *inode, int flags);
2396struct inode *btrfs_alloc_inode(struct super_block *sb); 2557struct inode *btrfs_alloc_inode(struct super_block *sb);
2397void btrfs_destroy_inode(struct inode *inode); 2558void btrfs_destroy_inode(struct inode *inode);
2398int btrfs_drop_inode(struct inode *inode); 2559int btrfs_drop_inode(struct inode *inode);
@@ -2401,17 +2562,15 @@ void btrfs_destroy_cachep(void);
2401long btrfs_ioctl_trans_end(struct file *file); 2562long btrfs_ioctl_trans_end(struct file *file);
2402struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, 2563struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
2403 struct btrfs_root *root, int *was_new); 2564 struct btrfs_root *root, int *was_new);
2404int btrfs_commit_write(struct file *file, struct page *page,
2405 unsigned from, unsigned to);
2406struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, 2565struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
2407 size_t page_offset, u64 start, u64 end, 2566 size_t pg_offset, u64 start, u64 end,
2408 int create); 2567 int create);
2409int btrfs_update_inode(struct btrfs_trans_handle *trans, 2568int btrfs_update_inode(struct btrfs_trans_handle *trans,
2410 struct btrfs_root *root, 2569 struct btrfs_root *root,
2411 struct inode *inode); 2570 struct inode *inode);
2412int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode); 2571int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode);
2413int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode); 2572int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
2414void btrfs_orphan_cleanup(struct btrfs_root *root); 2573int btrfs_orphan_cleanup(struct btrfs_root *root);
2415void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans, 2574void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
2416 struct btrfs_pending_snapshot *pending, 2575 struct btrfs_pending_snapshot *pending,
2417 u64 *bytes_to_reserve); 2576 u64 *bytes_to_reserve);
@@ -2419,31 +2578,44 @@ void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
2419 struct btrfs_pending_snapshot *pending); 2578 struct btrfs_pending_snapshot *pending);
2420void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans, 2579void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
2421 struct btrfs_root *root); 2580 struct btrfs_root *root);
2422int btrfs_cont_expand(struct inode *inode, loff_t size); 2581int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size);
2423int btrfs_invalidate_inodes(struct btrfs_root *root); 2582int btrfs_invalidate_inodes(struct btrfs_root *root);
2424void btrfs_add_delayed_iput(struct inode *inode); 2583void btrfs_add_delayed_iput(struct inode *inode);
2425void btrfs_run_delayed_iputs(struct btrfs_root *root); 2584void btrfs_run_delayed_iputs(struct btrfs_root *root);
2426int btrfs_prealloc_file_range(struct inode *inode, int mode, 2585int btrfs_prealloc_file_range(struct inode *inode, int mode,
2427 u64 start, u64 num_bytes, u64 min_size, 2586 u64 start, u64 num_bytes, u64 min_size,
2428 loff_t actual_len, u64 *alloc_hint); 2587 loff_t actual_len, u64 *alloc_hint);
2588int btrfs_prealloc_file_range_trans(struct inode *inode,
2589 struct btrfs_trans_handle *trans, int mode,
2590 u64 start, u64 num_bytes, u64 min_size,
2591 loff_t actual_len, u64 *alloc_hint);
2429extern const struct dentry_operations btrfs_dentry_operations; 2592extern const struct dentry_operations btrfs_dentry_operations;
2430 2593
2431/* ioctl.c */ 2594/* ioctl.c */
2432long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); 2595long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
2433void btrfs_update_iflags(struct inode *inode); 2596void btrfs_update_iflags(struct inode *inode);
2434void btrfs_inherit_iflags(struct inode *inode, struct inode *dir); 2597void btrfs_inherit_iflags(struct inode *inode, struct inode *dir);
2435 2598int btrfs_defrag_file(struct inode *inode, struct file *file,
2599 struct btrfs_ioctl_defrag_range_args *range,
2600 u64 newer_than, unsigned long max_pages);
2436/* file.c */ 2601/* file.c */
2602int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
2603 struct inode *inode);
2604int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info);
2437int btrfs_sync_file(struct file *file, int datasync); 2605int btrfs_sync_file(struct file *file, int datasync);
2438int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, 2606int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
2439 int skip_pinned); 2607 int skip_pinned);
2440int btrfs_check_file(struct btrfs_root *root, struct inode *inode);
2441extern const struct file_operations btrfs_file_operations; 2608extern const struct file_operations btrfs_file_operations;
2442int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode, 2609int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
2443 u64 start, u64 end, u64 *hint_byte, int drop_cache); 2610 u64 start, u64 end, u64 *hint_byte, int drop_cache);
2444int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, 2611int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
2445 struct inode *inode, u64 start, u64 end); 2612 struct inode *inode, u64 start, u64 end);
2446int btrfs_release_file(struct inode *inode, struct file *file); 2613int btrfs_release_file(struct inode *inode, struct file *file);
2614void btrfs_drop_pages(struct page **pages, size_t num_pages);
2615int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
2616 struct page **pages, size_t num_pages,
2617 loff_t pos, size_t write_bytes,
2618 struct extent_state **cached);
2447 2619
2448/* tree-defrag.c */ 2620/* tree-defrag.c */
2449int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, 2621int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
@@ -2452,10 +2624,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
2452/* sysfs.c */ 2624/* sysfs.c */
2453int btrfs_init_sysfs(void); 2625int btrfs_init_sysfs(void);
2454void btrfs_exit_sysfs(void); 2626void btrfs_exit_sysfs(void);
2455int btrfs_sysfs_add_super(struct btrfs_fs_info *fs);
2456int btrfs_sysfs_add_root(struct btrfs_root *root);
2457void btrfs_sysfs_del_root(struct btrfs_root *root);
2458void btrfs_sysfs_del_super(struct btrfs_fs_info *root);
2459 2627
2460/* xattr.c */ 2628/* xattr.c */
2461ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size); 2629ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
@@ -2463,10 +2631,18 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size);
2463/* super.c */ 2631/* super.c */
2464int btrfs_parse_options(struct btrfs_root *root, char *options); 2632int btrfs_parse_options(struct btrfs_root *root, char *options);
2465int btrfs_sync_fs(struct super_block *sb, int wait); 2633int btrfs_sync_fs(struct super_block *sb, int wait);
2634void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
2635 unsigned int line, int errno);
2636
2637#define btrfs_std_error(fs_info, errno) \
2638do { \
2639 if ((errno)) \
2640 __btrfs_std_error((fs_info), __func__, __LINE__, (errno));\
2641} while (0)
2466 2642
2467/* acl.c */ 2643/* acl.c */
2468#ifdef CONFIG_BTRFS_FS_POSIX_ACL 2644#ifdef CONFIG_BTRFS_FS_POSIX_ACL
2469int btrfs_check_acl(struct inode *inode, int mask); 2645int btrfs_check_acl(struct inode *inode, int mask, unsigned int flags);
2470#else 2646#else
2471#define btrfs_check_acl NULL 2647#define btrfs_check_acl NULL
2472#endif 2648#endif
@@ -2490,4 +2666,18 @@ void btrfs_reloc_pre_snapshot(struct btrfs_trans_handle *trans,
2490 u64 *bytes_to_reserve); 2666 u64 *bytes_to_reserve);
2491void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, 2667void btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
2492 struct btrfs_pending_snapshot *pending); 2668 struct btrfs_pending_snapshot *pending);
2669
2670/* scrub.c */
2671int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
2672 struct btrfs_scrub_progress *progress, int readonly);
2673int btrfs_scrub_pause(struct btrfs_root *root);
2674int btrfs_scrub_pause_super(struct btrfs_root *root);
2675int btrfs_scrub_continue(struct btrfs_root *root);
2676int btrfs_scrub_continue_super(struct btrfs_root *root);
2677int btrfs_scrub_cancel(struct btrfs_root *root);
2678int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev);
2679int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid);
2680int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
2681 struct btrfs_scrub_progress *progress);
2682
2493#endif 2683#endif
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
new file mode 100644
index 000000000000..98c68e658a9b
--- /dev/null
+++ b/fs/btrfs/delayed-inode.c
@@ -0,0 +1,1773 @@
1/*
2 * Copyright (C) 2011 Fujitsu. All rights reserved.
3 * Written by Miao Xie <miaox@cn.fujitsu.com>
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public
7 * License v2 as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public
15 * License along with this program; if not, write to the
16 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
17 * Boston, MA 021110-1307, USA.
18 */
19
20#include <linux/slab.h>
21#include "delayed-inode.h"
22#include "disk-io.h"
23#include "transaction.h"
24
25#define BTRFS_DELAYED_WRITEBACK 400
26#define BTRFS_DELAYED_BACKGROUND 100
27
28static struct kmem_cache *delayed_node_cache;
29
30int __init btrfs_delayed_inode_init(void)
31{
32 delayed_node_cache = kmem_cache_create("delayed_node",
33 sizeof(struct btrfs_delayed_node),
34 0,
35 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
36 NULL);
37 if (!delayed_node_cache)
38 return -ENOMEM;
39 return 0;
40}
41
42void btrfs_delayed_inode_exit(void)
43{
44 if (delayed_node_cache)
45 kmem_cache_destroy(delayed_node_cache);
46}
47
48static inline void btrfs_init_delayed_node(
49 struct btrfs_delayed_node *delayed_node,
50 struct btrfs_root *root, u64 inode_id)
51{
52 delayed_node->root = root;
53 delayed_node->inode_id = inode_id;
54 atomic_set(&delayed_node->refs, 0);
55 delayed_node->count = 0;
56 delayed_node->in_list = 0;
57 delayed_node->inode_dirty = 0;
58 delayed_node->ins_root = RB_ROOT;
59 delayed_node->del_root = RB_ROOT;
60 mutex_init(&delayed_node->mutex);
61 delayed_node->index_cnt = 0;
62 INIT_LIST_HEAD(&delayed_node->n_list);
63 INIT_LIST_HEAD(&delayed_node->p_list);
64 delayed_node->bytes_reserved = 0;
65}
66
67static inline int btrfs_is_continuous_delayed_item(
68 struct btrfs_delayed_item *item1,
69 struct btrfs_delayed_item *item2)
70{
71 if (item1->key.type == BTRFS_DIR_INDEX_KEY &&
72 item1->key.objectid == item2->key.objectid &&
73 item1->key.type == item2->key.type &&
74 item1->key.offset + 1 == item2->key.offset)
75 return 1;
76 return 0;
77}
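"Continuous" here means dir index keys for the same directory with adjacent offsets, which is what later allows insertions and deletions to be batched into a single leaf operation. With hypothetical key values for a directory whose objectid is 257: (257, DIR_INDEX, 5) followed by (257, DIR_INDEX, 6) is continuous, while (257, DIR_INDEX, 5) followed by (257, DIR_INDEX, 7) is not.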
78
79static inline struct btrfs_delayed_root *btrfs_get_delayed_root(
80 struct btrfs_root *root)
81{
82 return root->fs_info->delayed_root;
83}
84
85static struct btrfs_delayed_node *btrfs_get_delayed_node(struct inode *inode)
86{
87 struct btrfs_inode *btrfs_inode = BTRFS_I(inode);
88 struct btrfs_root *root = btrfs_inode->root;
89 u64 ino = btrfs_ino(inode);
90 struct btrfs_delayed_node *node;
91
92 node = ACCESS_ONCE(btrfs_inode->delayed_node);
93 if (node) {
94 atomic_inc(&node->refs);
95 return node;
96 }
97
98 spin_lock(&root->inode_lock);
99 node = radix_tree_lookup(&root->delayed_nodes_tree, ino);
100 if (node) {
101 if (btrfs_inode->delayed_node) {
102 atomic_inc(&node->refs); /* can be accessed */
103 BUG_ON(btrfs_inode->delayed_node != node);
104 spin_unlock(&root->inode_lock);
105 return node;
106 }
107 btrfs_inode->delayed_node = node;
108 atomic_inc(&node->refs); /* can be accessed */
109 atomic_inc(&node->refs); /* cached in the inode */
110 spin_unlock(&root->inode_lock);
111 return node;
112 }
113 spin_unlock(&root->inode_lock);
114
115 return NULL;
116}
117
118static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node(
119 struct inode *inode)
120{
121 struct btrfs_delayed_node *node;
122 struct btrfs_inode *btrfs_inode = BTRFS_I(inode);
123 struct btrfs_root *root = btrfs_inode->root;
124 u64 ino = btrfs_ino(inode);
125 int ret;
126
127again:
128 node = btrfs_get_delayed_node(inode);
129 if (node)
130 return node;
131
132 node = kmem_cache_alloc(delayed_node_cache, GFP_NOFS);
133 if (!node)
134 return ERR_PTR(-ENOMEM);
135 btrfs_init_delayed_node(node, root, ino);
136
137 atomic_inc(&node->refs); /* cached in the btrfs inode */
138 atomic_inc(&node->refs); /* can be accessed */
139
140 ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
141 if (ret) {
142 kmem_cache_free(delayed_node_cache, node);
143 return ERR_PTR(ret);
144 }
145
146 spin_lock(&root->inode_lock);
147 ret = radix_tree_insert(&root->delayed_nodes_tree, ino, node);
148 if (ret == -EEXIST) {
149 kmem_cache_free(delayed_node_cache, node);
150 spin_unlock(&root->inode_lock);
151 radix_tree_preload_end();
152 goto again;
153 }
154 btrfs_inode->delayed_node = node;
155 spin_unlock(&root->inode_lock);
156 radix_tree_preload_end();
157
158 return node;
159}
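The function above is the usual optimistic lookup-or-create pattern: look up without allocating, allocate outside the lock, insert under root->inode_lock, and restart from the top when another task won the race (-EEXIST). A minimal sketch of that control flow, with hypothetical table_lookup()/table_insert()/table_lock() helpers standing in for the real radix-tree and spinlock calls:

struct node *get_or_create(struct table *t, unsigned long id)
{
	struct node *n;

again:
	n = table_lookup(t, id);	/* takes a reference on a hit */
	if (n)
		return n;

	n = node_alloc(id);		/* allocate outside the lock */
	if (!n)
		return NULL;

	table_lock(t);
	if (table_insert(t, id, n) == -EEXIST) {
		/* lost the race: free ours, pick up the winner's node */
		table_unlock(t);
		node_free(n);
		goto again;
	}
	table_unlock(t);
	return n;
}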
160
161/*
162 * Call this while holding delayed_node->mutex.
163 *
164 * If mod = 1, add this node into the prepare list.
165 */
166static void btrfs_queue_delayed_node(struct btrfs_delayed_root *root,
167 struct btrfs_delayed_node *node,
168 int mod)
169{
170 spin_lock(&root->lock);
171 if (node->in_list) {
172 if (!list_empty(&node->p_list))
173 list_move_tail(&node->p_list, &root->prepare_list);
174 else if (mod)
175 list_add_tail(&node->p_list, &root->prepare_list);
176 } else {
177 list_add_tail(&node->n_list, &root->node_list);
178 list_add_tail(&node->p_list, &root->prepare_list);
179 atomic_inc(&node->refs); /* inserted into list */
180 root->nodes++;
181 node->in_list = 1;
182 }
183 spin_unlock(&root->lock);
184}
185
186/* Call it when holding delayed_node->mutex */
187static void btrfs_dequeue_delayed_node(struct btrfs_delayed_root *root,
188 struct btrfs_delayed_node *node)
189{
190 spin_lock(&root->lock);
191 if (node->in_list) {
192 root->nodes--;
193 atomic_dec(&node->refs); /* not in the list */
194 list_del_init(&node->n_list);
195 if (!list_empty(&node->p_list))
196 list_del_init(&node->p_list);
197 node->in_list = 0;
198 }
199 spin_unlock(&root->lock);
200}
201
202struct btrfs_delayed_node *btrfs_first_delayed_node(
203 struct btrfs_delayed_root *delayed_root)
204{
205 struct list_head *p;
206 struct btrfs_delayed_node *node = NULL;
207
208 spin_lock(&delayed_root->lock);
209 if (list_empty(&delayed_root->node_list))
210 goto out;
211
212 p = delayed_root->node_list.next;
213 node = list_entry(p, struct btrfs_delayed_node, n_list);
214 atomic_inc(&node->refs);
215out:
216 spin_unlock(&delayed_root->lock);
217
218 return node;
219}
220
221struct btrfs_delayed_node *btrfs_next_delayed_node(
222 struct btrfs_delayed_node *node)
223{
224 struct btrfs_delayed_root *delayed_root;
225 struct list_head *p;
226 struct btrfs_delayed_node *next = NULL;
227
228 delayed_root = node->root->fs_info->delayed_root;
229 spin_lock(&delayed_root->lock);
230 if (!node->in_list) { /* not in the list */
231 if (list_empty(&delayed_root->node_list))
232 goto out;
233 p = delayed_root->node_list.next;
234 } else if (list_is_last(&node->n_list, &delayed_root->node_list))
235 goto out;
236 else
237 p = node->n_list.next;
238
239 next = list_entry(p, struct btrfs_delayed_node, n_list);
240 atomic_inc(&next->refs);
241out:
242 spin_unlock(&delayed_root->lock);
243
244 return next;
245}
246
247static void __btrfs_release_delayed_node(
248 struct btrfs_delayed_node *delayed_node,
249 int mod)
250{
251 struct btrfs_delayed_root *delayed_root;
252
253 if (!delayed_node)
254 return;
255
256 delayed_root = delayed_node->root->fs_info->delayed_root;
257
258 mutex_lock(&delayed_node->mutex);
259 if (delayed_node->count)
260 btrfs_queue_delayed_node(delayed_root, delayed_node, mod);
261 else
262 btrfs_dequeue_delayed_node(delayed_root, delayed_node);
263 mutex_unlock(&delayed_node->mutex);
264
265 if (atomic_dec_and_test(&delayed_node->refs)) {
266 struct btrfs_root *root = delayed_node->root;
267 spin_lock(&root->inode_lock);
268 if (atomic_read(&delayed_node->refs) == 0) {
269 radix_tree_delete(&root->delayed_nodes_tree,
270 delayed_node->inode_id);
271 kmem_cache_free(delayed_node_cache, delayed_node);
272 }
273 spin_unlock(&root->inode_lock);
274 }
275}
276
277static inline void btrfs_release_delayed_node(struct btrfs_delayed_node *node)
278{
279 __btrfs_release_delayed_node(node, 0);
280}
281
282struct btrfs_delayed_node *btrfs_first_prepared_delayed_node(
283 struct btrfs_delayed_root *delayed_root)
284{
285 struct list_head *p;
286 struct btrfs_delayed_node *node = NULL;
287
288 spin_lock(&delayed_root->lock);
289 if (list_empty(&delayed_root->prepare_list))
290 goto out;
291
292 p = delayed_root->prepare_list.next;
293 list_del_init(p);
294 node = list_entry(p, struct btrfs_delayed_node, p_list);
295 atomic_inc(&node->refs);
296out:
297 spin_unlock(&delayed_root->lock);
298
299 return node;
300}
301
302static inline void btrfs_release_prepared_delayed_node(
303 struct btrfs_delayed_node *node)
304{
305 __btrfs_release_delayed_node(node, 1);
306}
307
308struct btrfs_delayed_item *btrfs_alloc_delayed_item(u32 data_len)
309{
310 struct btrfs_delayed_item *item;
311 item = kmalloc(sizeof(*item) + data_len, GFP_NOFS);
312 if (item) {
313 item->data_len = data_len;
314 item->ins_or_del = 0;
315 item->bytes_reserved = 0;
316 item->delayed_node = NULL;
317 atomic_set(&item->refs, 1);
318 }
319 return item;
320}
321
322/*
323 * __btrfs_lookup_delayed_item - look up the delayed item by key
324 * @delayed_node: pointer to the delayed node
325 * @key: the key to look up
326 * @prev: used to store the prev item if the right item isn't found
327 * @next: used to store the next item if the right item isn't found
328 *
329 * Note: if we don't find the right item, we will return the prev item and
330 * the next item.
331 */
332static struct btrfs_delayed_item *__btrfs_lookup_delayed_item(
333 struct rb_root *root,
334 struct btrfs_key *key,
335 struct btrfs_delayed_item **prev,
336 struct btrfs_delayed_item **next)
337{
338 struct rb_node *node, *prev_node = NULL;
339 struct btrfs_delayed_item *delayed_item = NULL;
340 int ret = 0;
341
342 node = root->rb_node;
343
344 while (node) {
345 delayed_item = rb_entry(node, struct btrfs_delayed_item,
346 rb_node);
347 prev_node = node;
348 ret = btrfs_comp_cpu_keys(&delayed_item->key, key);
349 if (ret < 0)
350 node = node->rb_right;
351 else if (ret > 0)
352 node = node->rb_left;
353 else
354 return delayed_item;
355 }
356
357 if (prev) {
358 if (!prev_node)
359 *prev = NULL;
360 else if (ret < 0)
361 *prev = delayed_item;
362 else if ((node = rb_prev(prev_node)) != NULL) {
363 *prev = rb_entry(node, struct btrfs_delayed_item,
364 rb_node);
365 } else
366 *prev = NULL;
367 }
368
369 if (next) {
370 if (!prev_node)
371 *next = NULL;
372 else if (ret > 0)
373 *next = delayed_item;
374 else if ((node = rb_next(prev_node)) != NULL) {
375 *next = rb_entry(node, struct btrfs_delayed_item,
376 rb_node);
377 } else
378 *next = NULL;
379 }
380 return NULL;
381}
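The prev/next out-parameters hand callers the neighbours of a missing key, which is what the __btrfs_search_delayed_*_item() wrappers below use to resume iteration at the nearest delayed item. A self-contained userspace model of the same semantics over a sorted array:

#include <stdio.h>

static int lookup(const int *a, int n, int key, int *prev, int *next)
{
	int lo = 0, hi = n - 1;

	*prev = *next = -1;
	while (lo <= hi) {
		int mid = (lo + hi) / 2;

		if (a[mid] < key) {
			*prev = a[mid];		/* best smaller candidate */
			lo = mid + 1;
		} else if (a[mid] > key) {
			*next = a[mid];		/* best larger candidate */
			hi = mid - 1;
		} else {
			return mid;		/* exact hit */
		}
	}
	return -1;
}

int main(void)
{
	int a[] = { 2, 5, 9, 14 };
	int prev, next;

	if (lookup(a, 4, 7, &prev, &next) < 0)
		printf("miss: prev=%d next=%d\n", prev, next); /* 5 and 9 */
	return 0;
}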
382
383struct btrfs_delayed_item *__btrfs_lookup_delayed_insertion_item(
384 struct btrfs_delayed_node *delayed_node,
385 struct btrfs_key *key)
386{
387 struct btrfs_delayed_item *item;
388
389 item = __btrfs_lookup_delayed_item(&delayed_node->ins_root, key,
390 NULL, NULL);
391 return item;
392}
393
394struct btrfs_delayed_item *__btrfs_lookup_delayed_deletion_item(
395 struct btrfs_delayed_node *delayed_node,
396 struct btrfs_key *key)
397{
398 struct btrfs_delayed_item *item;
399
400 item = __btrfs_lookup_delayed_item(&delayed_node->del_root, key,
401 NULL, NULL);
402 return item;
403}
404
405struct btrfs_delayed_item *__btrfs_search_delayed_insertion_item(
406 struct btrfs_delayed_node *delayed_node,
407 struct btrfs_key *key)
408{
409 struct btrfs_delayed_item *item, *next;
410
411 item = __btrfs_lookup_delayed_item(&delayed_node->ins_root, key,
412 NULL, &next);
413 if (!item)
414 item = next;
415
416 return item;
417}
418
419struct btrfs_delayed_item *__btrfs_search_delayed_deletion_item(
420 struct btrfs_delayed_node *delayed_node,
421 struct btrfs_key *key)
422{
423 struct btrfs_delayed_item *item, *next;
424
425 item = __btrfs_lookup_delayed_item(&delayed_node->del_root, key,
426 NULL, &next);
427 if (!item)
428 item = next;
429
430 return item;
431}
432
433static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node,
434 struct btrfs_delayed_item *ins,
435 int action)
436{
437 struct rb_node **p, *node;
438 struct rb_node *parent_node = NULL;
439 struct rb_root *root;
440 struct btrfs_delayed_item *item;
441 int cmp;
442
443 if (action == BTRFS_DELAYED_INSERTION_ITEM)
444 root = &delayed_node->ins_root;
445 else if (action == BTRFS_DELAYED_DELETION_ITEM)
446 root = &delayed_node->del_root;
447 else
448 BUG();
449 p = &root->rb_node;
450 node = &ins->rb_node;
451
452 while (*p) {
453 parent_node = *p;
454 item = rb_entry(parent_node, struct btrfs_delayed_item,
455 rb_node);
456
457 cmp = btrfs_comp_cpu_keys(&item->key, &ins->key);
458 if (cmp < 0)
459 p = &(*p)->rb_right;
460 else if (cmp > 0)
461 p = &(*p)->rb_left;
462 else
463 return -EEXIST;
464 }
465
466 rb_link_node(node, parent_node, p);
467 rb_insert_color(node, root);
468 ins->delayed_node = delayed_node;
469 ins->ins_or_del = action;
470
471 if (ins->key.type == BTRFS_DIR_INDEX_KEY &&
472 action == BTRFS_DELAYED_INSERTION_ITEM &&
473 ins->key.offset >= delayed_node->index_cnt)
474 delayed_node->index_cnt = ins->key.offset + 1;
475
476 delayed_node->count++;
477 atomic_inc(&delayed_node->root->fs_info->delayed_root->items);
478 return 0;
479}
480
481static int __btrfs_add_delayed_insertion_item(struct btrfs_delayed_node *node,
482 struct btrfs_delayed_item *item)
483{
484 return __btrfs_add_delayed_item(node, item,
485 BTRFS_DELAYED_INSERTION_ITEM);
486}
487
488static int __btrfs_add_delayed_deletion_item(struct btrfs_delayed_node *node,
489 struct btrfs_delayed_item *item)
490{
491 return __btrfs_add_delayed_item(node, item,
492 BTRFS_DELAYED_DELETION_ITEM);
493}
494
495static void __btrfs_remove_delayed_item(struct btrfs_delayed_item *delayed_item)
496{
497 struct rb_root *root;
498 struct btrfs_delayed_root *delayed_root;
499
500 delayed_root = delayed_item->delayed_node->root->fs_info->delayed_root;
501
502 BUG_ON(!delayed_root);
503 BUG_ON(delayed_item->ins_or_del != BTRFS_DELAYED_DELETION_ITEM &&
504 delayed_item->ins_or_del != BTRFS_DELAYED_INSERTION_ITEM);
505
506 if (delayed_item->ins_or_del == BTRFS_DELAYED_INSERTION_ITEM)
507 root = &delayed_item->delayed_node->ins_root;
508 else
509 root = &delayed_item->delayed_node->del_root;
510
511 rb_erase(&delayed_item->rb_node, root);
512 delayed_item->delayed_node->count--;
513 atomic_dec(&delayed_root->items);
514 if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND &&
515 waitqueue_active(&delayed_root->wait))
516 wake_up(&delayed_root->wait);
517}
518
519static void btrfs_release_delayed_item(struct btrfs_delayed_item *item)
520{
521 if (item) {
522 __btrfs_remove_delayed_item(item);
523 if (atomic_dec_and_test(&item->refs))
524 kfree(item);
525 }
526}
527
528struct btrfs_delayed_item *__btrfs_first_delayed_insertion_item(
529 struct btrfs_delayed_node *delayed_node)
530{
531 struct rb_node *p;
532 struct btrfs_delayed_item *item = NULL;
533
534 p = rb_first(&delayed_node->ins_root);
535 if (p)
536 item = rb_entry(p, struct btrfs_delayed_item, rb_node);
537
538 return item;
539}
540
541struct btrfs_delayed_item *__btrfs_first_delayed_deletion_item(
542 struct btrfs_delayed_node *delayed_node)
543{
544 struct rb_node *p;
545 struct btrfs_delayed_item *item = NULL;
546
547 p = rb_first(&delayed_node->del_root);
548 if (p)
549 item = rb_entry(p, struct btrfs_delayed_item, rb_node);
550
551 return item;
552}
553
554struct btrfs_delayed_item *__btrfs_next_delayed_item(
555 struct btrfs_delayed_item *item)
556{
557 struct rb_node *p;
558 struct btrfs_delayed_item *next = NULL;
559
560 p = rb_next(&item->rb_node);
561 if (p)
562 next = rb_entry(p, struct btrfs_delayed_item, rb_node);
563
564 return next;
565}
566
567static inline struct btrfs_root *btrfs_get_fs_root(struct btrfs_root *root,
568 u64 root_id)
569{
570 struct btrfs_key root_key;
571
572 if (root->objectid == root_id)
573 return root;
574
575 root_key.objectid = root_id;
576 root_key.type = BTRFS_ROOT_ITEM_KEY;
577 root_key.offset = (u64)-1;
578 return btrfs_read_fs_root_no_name(root->fs_info, &root_key);
579}
580
581static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
582 struct btrfs_root *root,
583 struct btrfs_delayed_item *item)
584{
585 struct btrfs_block_rsv *src_rsv;
586 struct btrfs_block_rsv *dst_rsv;
587 u64 num_bytes;
588 int ret;
589
590 if (!trans->bytes_reserved)
591 return 0;
592
593 src_rsv = trans->block_rsv;
594 dst_rsv = &root->fs_info->global_block_rsv;
595
596 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
597 ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
598 if (!ret)
599 item->bytes_reserved = num_bytes;
600
601 return ret;
602}
603
604static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
605 struct btrfs_delayed_item *item)
606{
607 struct btrfs_block_rsv *rsv;
608
609 if (!item->bytes_reserved)
610 return;
611
612 rsv = &root->fs_info->global_block_rsv;
613 btrfs_block_rsv_release(root, rsv,
614 item->bytes_reserved);
615}
616
617static int btrfs_delayed_inode_reserve_metadata(
618 struct btrfs_trans_handle *trans,
619 struct btrfs_root *root,
620 struct btrfs_delayed_node *node)
621{
622 struct btrfs_block_rsv *src_rsv;
623 struct btrfs_block_rsv *dst_rsv;
624 u64 num_bytes;
625 int ret;
626
627 if (!trans->bytes_reserved)
628 return 0;
629
630 src_rsv = trans->block_rsv;
631 dst_rsv = &root->fs_info->global_block_rsv;
632
633 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
634 ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
635 if (!ret)
636 node->bytes_reserved = num_bytes;
637
638 return ret;
639}
640
641static void btrfs_delayed_inode_release_metadata(struct btrfs_root *root,
642 struct btrfs_delayed_node *node)
643{
644 struct btrfs_block_rsv *rsv;
645
646 if (!node->bytes_reserved)
647 return;
648
649 rsv = &root->fs_info->global_block_rsv;
650 btrfs_block_rsv_release(root, rsv,
651 node->bytes_reserved);
652 node->bytes_reserved = 0;
653}
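Each delayed item (and each dirty delayed inode) carries a metadata budget with it: reserving migrates btrfs_calc_trans_metadata_size(root, 1) bytes from the transaction's reservation into the global reservation, and the matching release happens only after the item has actually been written into the tree. A toy model of that hand-off (hypothetical names; the -ENOSPC return mirrors the behaviour of the real migrate helper):

#include <errno.h>

struct rsv { long bytes; };

static int rsv_migrate(struct rsv *src, struct rsv *dst, long n)
{
	if (src->bytes < n)
		return -ENOSPC;		/* budget exhausted */
	src->bytes -= n;
	dst->bytes += n;
	return 0;
}

static void rsv_release(struct rsv *r, long n)
{
	r->bytes -= n;			/* space is usable again */
}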
654
655/*
656 * This helper inserts as many continuous items as will fit into the same
657 * leaf, according to the leaf's free space.
658 */
659static int btrfs_batch_insert_items(struct btrfs_trans_handle *trans,
660 struct btrfs_root *root,
661 struct btrfs_path *path,
662 struct btrfs_delayed_item *item)
663{
664 struct btrfs_delayed_item *curr, *next;
665 int free_space;
666 int total_data_size = 0, total_size = 0;
667 struct extent_buffer *leaf;
668 char *data_ptr;
669 struct btrfs_key *keys;
670 u32 *data_size;
671 struct list_head head;
672 int slot;
673 int nitems;
674 int i;
675 int ret = 0;
676
677 BUG_ON(!path->nodes[0]);
678
679 leaf = path->nodes[0];
680 free_space = btrfs_leaf_free_space(root, leaf);
681 INIT_LIST_HEAD(&head);
682
683 next = item;
684 nitems = 0;
685
686 /*
687 * count the number of continuous items that we can insert in one batch
688 */
689 while (total_size + next->data_len + sizeof(struct btrfs_item) <=
690 free_space) {
691 total_data_size += next->data_len;
692 total_size += next->data_len + sizeof(struct btrfs_item);
693 list_add_tail(&next->tree_list, &head);
694 nitems++;
695
696 curr = next;
697 next = __btrfs_next_delayed_item(curr);
698 if (!next)
699 break;
700
701 if (!btrfs_is_continuous_delayed_item(curr, next))
702 break;
703 }
704
705 if (!nitems) {
706 ret = 0;
707 goto out;
708 }
709
710 /*
711 * we need to allocate some memory space, but it might cause the task
712 * to sleep, so we set all locked nodes in the path to blocking locks
713 * first.
714 */
715 btrfs_set_path_blocking(path);
716
717 keys = kmalloc(sizeof(struct btrfs_key) * nitems, GFP_NOFS);
718 if (!keys) {
719 ret = -ENOMEM;
720 goto out;
721 }
722
723 data_size = kmalloc(sizeof(u32) * nitems, GFP_NOFS);
724 if (!data_size) {
725 ret = -ENOMEM;
726 goto error;
727 }
728
729 /* get keys of all the delayed items */
730 i = 0;
731 list_for_each_entry(next, &head, tree_list) {
732 keys[i] = next->key;
733 data_size[i] = next->data_len;
734 i++;
735 }
736
737 /* reset all the locked nodes in the path to spinning locks. */
738 btrfs_clear_path_blocking(path, NULL);
739
740 /* insert the keys of the items */
741 ret = setup_items_for_insert(trans, root, path, keys, data_size,
742 total_data_size, total_size, nitems);
743 if (ret)
744 goto error;
745
746 /* insert the dir index items */
747 slot = path->slots[0];
748 list_for_each_entry_safe(curr, next, &head, tree_list) {
749 data_ptr = btrfs_item_ptr(leaf, slot, char);
750 write_extent_buffer(leaf, &curr->data,
751 (unsigned long)data_ptr,
752 curr->data_len);
753 slot++;
754
755 btrfs_delayed_item_release_metadata(root, curr);
756
757 list_del(&curr->tree_list);
758 btrfs_release_delayed_item(curr);
759 }
760
761error:
762 kfree(data_size);
763 kfree(keys);
764out:
765 return ret;
766}
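The loop above charges each batched item its payload (data_len) plus one leaf item header, i.e. sizeof(struct btrfs_item), 25 bytes on disk. A worked example with made-up numbers: a leaf with 1000 bytes free takes 15 dir index items of data_len 40, since 15 * (40 + 25) = 975 fits but a 16th item would need 1040 bytes:

#include <stdio.h>

int main(void)
{
	int free_space = 1000;	/* bytes left in the leaf (illustrative) */
	int header = 25;	/* sizeof(struct btrfs_item) on disk */
	int data_len = 40;	/* a small dir index item (illustrative) */
	int total = 0, nitems = 0;

	while (total + data_len + header <= free_space) {
		total += data_len + header;
		nitems++;
	}
	printf("%d items fit, %d bytes used\n", nitems, total); /* 15, 975 */
	return 0;
}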
767
768/*
769 * This helper only handles simple insertions that needn't extend an item
770 * for new data, such as directory name index insertion and inode insertion.
771 */
772static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans,
773 struct btrfs_root *root,
774 struct btrfs_path *path,
775 struct btrfs_delayed_item *delayed_item)
776{
777 struct extent_buffer *leaf;
778 struct btrfs_item *item;
779 char *ptr;
780 int ret;
781
782 ret = btrfs_insert_empty_item(trans, root, path, &delayed_item->key,
783 delayed_item->data_len);
784 if (ret < 0 && ret != -EEXIST)
785 return ret;
786
787 leaf = path->nodes[0];
788
789 item = btrfs_item_nr(leaf, path->slots[0]);
790 ptr = btrfs_item_ptr(leaf, path->slots[0], char);
791
792 write_extent_buffer(leaf, delayed_item->data, (unsigned long)ptr,
793 delayed_item->data_len);
794 btrfs_mark_buffer_dirty(leaf);
795
796 btrfs_delayed_item_release_metadata(root, delayed_item);
797 return 0;
798}
799
800/*
801 * we insert an item first, and then, if continuous items follow it, we try
802 * to insert those items into the same leaf.
803 */
804static int btrfs_insert_delayed_items(struct btrfs_trans_handle *trans,
805 struct btrfs_path *path,
806 struct btrfs_root *root,
807 struct btrfs_delayed_node *node)
808{
809 struct btrfs_delayed_item *curr, *prev;
810 int ret = 0;
811
812do_again:
813 mutex_lock(&node->mutex);
814 curr = __btrfs_first_delayed_insertion_item(node);
815 if (!curr)
816 goto insert_end;
817
818 ret = btrfs_insert_delayed_item(trans, root, path, curr);
819 if (ret < 0) {
820 btrfs_release_path(path);
821 goto insert_end;
822 }
823
824 prev = curr;
825 curr = __btrfs_next_delayed_item(prev);
826 if (curr && btrfs_is_continuous_delayed_item(prev, curr)) {
827 /* insert the continuous items into the same leaf */
828 path->slots[0]++;
829 btrfs_batch_insert_items(trans, root, path, curr);
830 }
831 btrfs_release_delayed_item(prev);
832 btrfs_mark_buffer_dirty(path->nodes[0]);
833
834 btrfs_release_path(path);
835 mutex_unlock(&node->mutex);
836 goto do_again;
837
838insert_end:
839 mutex_unlock(&node->mutex);
840 return ret;
841}
842
843static int btrfs_batch_delete_items(struct btrfs_trans_handle *trans,
844 struct btrfs_root *root,
845 struct btrfs_path *path,
846 struct btrfs_delayed_item *item)
847{
848 struct btrfs_delayed_item *curr, *next;
849 struct extent_buffer *leaf;
850 struct btrfs_key key;
851 struct list_head head;
852 int nitems, i, last_item;
853 int ret = 0;
854
855 BUG_ON(!path->nodes[0]);
856
857 leaf = path->nodes[0];
858
859 i = path->slots[0];
860 last_item = btrfs_header_nritems(leaf) - 1;
861 if (i > last_item)
862 return -ENOENT; /* FIXME: Is errno suitable? */
863
864 next = item;
865 INIT_LIST_HEAD(&head);
866 btrfs_item_key_to_cpu(leaf, &key, i);
867 nitems = 0;
868 /*
869 * count the number of dir index items that we can delete in one batch
870 */
871 while (btrfs_comp_cpu_keys(&next->key, &key) == 0) {
872 list_add_tail(&next->tree_list, &head);
873 nitems++;
874
875 curr = next;
876 next = __btrfs_next_delayed_item(curr);
877 if (!next)
878 break;
879
880 if (!btrfs_is_continuous_delayed_item(curr, next))
881 break;
882
883 i++;
884 if (i > last_item)
885 break;
886 btrfs_item_key_to_cpu(leaf, &key, i);
887 }
888
889 if (!nitems)
890 return 0;
891
892 ret = btrfs_del_items(trans, root, path, path->slots[0], nitems);
893 if (ret)
894 goto out;
895
896 list_for_each_entry_safe(curr, next, &head, tree_list) {
897 btrfs_delayed_item_release_metadata(root, curr);
898 list_del(&curr->tree_list);
899 btrfs_release_delayed_item(curr);
900 }
901
902out:
903 return ret;
904}
905
906static int btrfs_delete_delayed_items(struct btrfs_trans_handle *trans,
907 struct btrfs_path *path,
908 struct btrfs_root *root,
909 struct btrfs_delayed_node *node)
910{
911 struct btrfs_delayed_item *curr, *prev;
912 int ret = 0;
913
914do_again:
915 mutex_lock(&node->mutex);
916 curr = __btrfs_first_delayed_deletion_item(node);
917 if (!curr)
918 goto delete_fail;
919
920 ret = btrfs_search_slot(trans, root, &curr->key, path, -1, 1);
921 if (ret < 0)
922 goto delete_fail;
923 else if (ret > 0) {
924 /*
925 * can't find the tree item this delayed item points to, so the
926 * delayed item is stale; just drop it.
927 */
928 prev = curr;
929 curr = __btrfs_next_delayed_item(prev);
930 btrfs_release_delayed_item(prev);
931 ret = 0;
932 btrfs_release_path(path);
933 if (curr)
934 goto do_again;
935 else
936 goto delete_fail;
937 }
938
939 btrfs_batch_delete_items(trans, root, path, curr);
940 btrfs_release_path(path);
941 mutex_unlock(&node->mutex);
942 goto do_again;
943
944delete_fail:
945 btrfs_release_path(path);
946 mutex_unlock(&node->mutex);
947 return ret;
948}
949
950static void btrfs_release_delayed_inode(struct btrfs_delayed_node *delayed_node)
951{
952 struct btrfs_delayed_root *delayed_root;
953
954 if (delayed_node && delayed_node->inode_dirty) {
955 BUG_ON(!delayed_node->root);
956 delayed_node->inode_dirty = 0;
957 delayed_node->count--;
958
959 delayed_root = delayed_node->root->fs_info->delayed_root;
960 atomic_dec(&delayed_root->items);
961 if (atomic_read(&delayed_root->items) <
962 BTRFS_DELAYED_BACKGROUND &&
963 waitqueue_active(&delayed_root->wait))
964 wake_up(&delayed_root->wait);
965 }
966}
967
968static int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
969 struct btrfs_root *root,
970 struct btrfs_path *path,
971 struct btrfs_delayed_node *node)
972{
973 struct btrfs_key key;
974 struct btrfs_inode_item *inode_item;
975 struct extent_buffer *leaf;
976 int ret;
977
978 mutex_lock(&node->mutex);
979 if (!node->inode_dirty) {
980 mutex_unlock(&node->mutex);
981 return 0;
982 }
983
984 key.objectid = node->inode_id;
985 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
986 key.offset = 0;
987 ret = btrfs_lookup_inode(trans, root, path, &key, 1);
988 if (ret > 0) {
989 btrfs_release_path(path);
990 mutex_unlock(&node->mutex);
991 return -ENOENT;
992 } else if (ret < 0) {
993 mutex_unlock(&node->mutex);
994 return ret;
995 }
996
997 btrfs_unlock_up_safe(path, 1);
998 leaf = path->nodes[0];
999 inode_item = btrfs_item_ptr(leaf, path->slots[0],
1000 struct btrfs_inode_item);
1001 write_extent_buffer(leaf, &node->inode_item, (unsigned long)inode_item,
1002 sizeof(struct btrfs_inode_item));
1003 btrfs_mark_buffer_dirty(leaf);
1004 btrfs_release_path(path);
1005
1006 btrfs_delayed_inode_release_metadata(root, node);
1007 btrfs_release_delayed_inode(node);
1008 mutex_unlock(&node->mutex);
1009
1010 return 0;
1011}
1012
1013/* Called when committing the transaction. */
1014int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
1015 struct btrfs_root *root)
1016{
1017 struct btrfs_delayed_root *delayed_root;
1018 struct btrfs_delayed_node *curr_node, *prev_node;
1019 struct btrfs_path *path;
1020 struct btrfs_block_rsv *block_rsv;
1021 int ret = 0;
1022
1023 path = btrfs_alloc_path();
1024 if (!path)
1025 return -ENOMEM;
1026 path->leave_spinning = 1;
1027
1028 block_rsv = trans->block_rsv;
1029 trans->block_rsv = &root->fs_info->global_block_rsv;
1030
1031 delayed_root = btrfs_get_delayed_root(root);
1032
1033 curr_node = btrfs_first_delayed_node(delayed_root);
1034 while (curr_node) {
1035 root = curr_node->root;
1036 ret = btrfs_insert_delayed_items(trans, path, root,
1037 curr_node);
1038 if (!ret)
1039 ret = btrfs_delete_delayed_items(trans, path, root,
1040 curr_node);
1041 if (!ret)
1042 ret = btrfs_update_delayed_inode(trans, root, path,
1043 curr_node);
1044 if (ret) {
1045 btrfs_release_delayed_node(curr_node);
1046 break;
1047 }
1048
1049 prev_node = curr_node;
1050 curr_node = btrfs_next_delayed_node(curr_node);
1051 btrfs_release_delayed_node(prev_node);
1052 }
1053
1054 btrfs_free_path(path);
1055 trans->block_rsv = block_rsv;
1056 return ret;
1057}
1058
1059static int __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
1060 struct btrfs_delayed_node *node)
1061{
1062 struct btrfs_path *path;
1063 struct btrfs_block_rsv *block_rsv;
1064 int ret;
1065
1066 path = btrfs_alloc_path();
1067 if (!path)
1068 return -ENOMEM;
1069 path->leave_spinning = 1;
1070
1071 block_rsv = trans->block_rsv;
1072 trans->block_rsv = &node->root->fs_info->global_block_rsv;
1073
1074 ret = btrfs_insert_delayed_items(trans, path, node->root, node);
1075 if (!ret)
1076 ret = btrfs_delete_delayed_items(trans, path, node->root, node);
1077 if (!ret)
1078 ret = btrfs_update_delayed_inode(trans, node->root, path, node);
1079 btrfs_free_path(path);
1080
1081 trans->block_rsv = block_rsv;
1082 return ret;
1083}
1084
1085int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
1086 struct inode *inode)
1087{
1088 struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);
1089 int ret;
1090
1091 if (!delayed_node)
1092 return 0;
1093
1094 mutex_lock(&delayed_node->mutex);
1095 if (!delayed_node->count) {
1096 mutex_unlock(&delayed_node->mutex);
1097 btrfs_release_delayed_node(delayed_node);
1098 return 0;
1099 }
1100 mutex_unlock(&delayed_node->mutex);
1101
1102 ret = __btrfs_commit_inode_delayed_items(trans, delayed_node);
1103 btrfs_release_delayed_node(delayed_node);
1104 return ret;
1105}
1106
1107void btrfs_remove_delayed_node(struct inode *inode)
1108{
1109 struct btrfs_delayed_node *delayed_node;
1110
1111 delayed_node = ACCESS_ONCE(BTRFS_I(inode)->delayed_node);
1112 if (!delayed_node)
1113 return;
1114
1115 BTRFS_I(inode)->delayed_node = NULL;
1116 btrfs_release_delayed_node(delayed_node);
1117}
1118
1119struct btrfs_async_delayed_node {
1120 struct btrfs_root *root;
1121 struct btrfs_delayed_node *delayed_node;
1122 struct btrfs_work work;
1123};
1124
1125static void btrfs_async_run_delayed_node_done(struct btrfs_work *work)
1126{
1127 struct btrfs_async_delayed_node *async_node;
1128 struct btrfs_trans_handle *trans;
1129 struct btrfs_path *path;
1130 struct btrfs_delayed_node *delayed_node = NULL;
1131 struct btrfs_root *root;
1132 struct btrfs_block_rsv *block_rsv;
1133 unsigned long nr = 0;
1134 int need_requeue = 0;
1135 int ret;
1136
1137 async_node = container_of(work, struct btrfs_async_delayed_node, work);
1138
1139 path = btrfs_alloc_path();
1140 if (!path)
1141 goto out;
1142 path->leave_spinning = 1;
1143
1144 delayed_node = async_node->delayed_node;
1145 root = delayed_node->root;
1146
1147 trans = btrfs_join_transaction(root);
1148 if (IS_ERR(trans))
1149 goto free_path;
1150
1151 block_rsv = trans->block_rsv;
1152 trans->block_rsv = &root->fs_info->global_block_rsv;
1153
1154 ret = btrfs_insert_delayed_items(trans, path, root, delayed_node);
1155 if (!ret)
1156 ret = btrfs_delete_delayed_items(trans, path, root,
1157 delayed_node);
1158
1159 if (!ret)
1160 btrfs_update_delayed_inode(trans, root, path, delayed_node);
1161
1162 /*
1163 * Maybe new delayed items have been inserted, so we need to requeue
1164 * the work. Besides that, we must dequeue the empty delayed nodes
1165 * to avoid a race between the delayed-items balance and the worker.
1166 * The race goes like this:
1167 * Task1 Worker thread
1168 * count == 0, needn't requeue
1169 * also needn't insert the
1170 * delayed node into prepare
1171 * list again.
1172 * add lots of delayed items
1173 * queue the delayed node
1174 * already in the list,
1175 * and not in the prepare
1176 * list, it means the delayed
1177 * node is being dealt with
1178 * by the worker.
1179 * do delayed items balance
1180 * the delayed node is being
1181 * dealt with by the worker
1182 * now, just wait.
1183 * the worker goto idle.
1184 * Task1 will sleep until the transaction is committed.
1185 */
1186 mutex_lock(&delayed_node->mutex);
1187 if (delayed_node->count)
1188 need_requeue = 1;
1189 else
1190 btrfs_dequeue_delayed_node(root->fs_info->delayed_root,
1191 delayed_node);
1192 mutex_unlock(&delayed_node->mutex);
1193
1194 nr = trans->blocks_used;
1195
1196 trans->block_rsv = block_rsv;
1197 btrfs_end_transaction_dmeta(trans, root);
1198 __btrfs_btree_balance_dirty(root, nr);
1199free_path:
1200 btrfs_free_path(path);
1201out:
1202 if (need_requeue)
1203 btrfs_requeue_work(&async_node->work);
1204 else {
1205 btrfs_release_prepared_delayed_node(delayed_node);
1206 kfree(async_node);
1207 }
1208}
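The requeue decision above is the classic recheck-under-the-lock idiom: producers add items and queue the node while holding delayed_node->mutex, so a final count check under the same mutex cannot miss a concurrent insertion; either the worker sees the new item and requeues itself, or the node is dequeued and the next producer queues it again. Condensed, with hypothetical requeue()/dequeue() helpers:

	mutex_lock(&node->mutex);
	if (node->count)
		requeue(node);	/* new work slipped in while flushing */
	else
		dequeue(node);	/* drained: producers must queue it anew */
	mutex_unlock(&node->mutex);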
1209
1210static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root,
1211 struct btrfs_root *root, int all)
1212{
1213 struct btrfs_async_delayed_node *async_node;
1214 struct btrfs_delayed_node *curr;
1215 int count = 0;
1216
1217again:
1218 curr = btrfs_first_prepared_delayed_node(delayed_root);
1219 if (!curr)
1220 return 0;
1221
1222 async_node = kmalloc(sizeof(*async_node), GFP_NOFS);
1223 if (!async_node) {
1224 btrfs_release_prepared_delayed_node(curr);
1225 return -ENOMEM;
1226 }
1227
1228 async_node->root = root;
1229 async_node->delayed_node = curr;
1230
1231 async_node->work.func = btrfs_async_run_delayed_node_done;
1232 async_node->work.flags = 0;
1233
1234 btrfs_queue_worker(&root->fs_info->delayed_workers, &async_node->work);
1235 count++;
1236
1237 if (all || count < 4)
1238 goto again;
1239
1240 return 0;
1241}
1242
1243void btrfs_assert_delayed_root_empty(struct btrfs_root *root)
1244{
1245 struct btrfs_delayed_root *delayed_root;
1246 delayed_root = btrfs_get_delayed_root(root);
1247 WARN_ON(btrfs_first_delayed_node(delayed_root));
1248}
1249
1250void btrfs_balance_delayed_items(struct btrfs_root *root)
1251{
1252 struct btrfs_delayed_root *delayed_root;
1253
1254 delayed_root = btrfs_get_delayed_root(root);
1255
1256 if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND)
1257 return;
1258
1259 if (atomic_read(&delayed_root->items) >= BTRFS_DELAYED_WRITEBACK) {
1260 int ret;
1261 ret = btrfs_wq_run_delayed_node(delayed_root, root, 1);
1262 if (ret)
1263 return;
1264
1265 wait_event_interruptible_timeout(
1266 delayed_root->wait,
1267 (atomic_read(&delayed_root->items) <
1268 BTRFS_DELAYED_BACKGROUND),
1269 HZ);
1270 return;
1271 }
1272
1273 btrfs_wq_run_delayed_node(delayed_root, root, 0);
1274}
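This is where the two thresholds defined at the top of the file get their meaning: below BTRFS_DELAYED_BACKGROUND (100) items nothing happens; from there up to BTRFS_DELAYED_WRITEBACK (400) a background flush of up to four prepared nodes is kicked off; and at or above the writeback mark every prepared node is flushed and the caller waits, interruptibly and for at most HZ jiffies (one second), for the backlog to drop back under the background mark. The policy, restated as a stand-alone sketch:

enum balance_action { DO_NOTHING, FLUSH_SOME, FLUSH_ALL_AND_WAIT };

static enum balance_action balance_policy(int items)
{
	if (items < 100)	/* BTRFS_DELAYED_BACKGROUND */
		return DO_NOTHING;
	if (items >= 400)	/* BTRFS_DELAYED_WRITEBACK */
		return FLUSH_ALL_AND_WAIT;
	return FLUSH_SOME;	/* at most 4 prepared nodes */
}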
1275
1276int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
1277 struct btrfs_root *root, const char *name,
1278 int name_len, struct inode *dir,
1279 struct btrfs_disk_key *disk_key, u8 type,
1280 u64 index)
1281{
1282 struct btrfs_delayed_node *delayed_node;
1283 struct btrfs_delayed_item *delayed_item;
1284 struct btrfs_dir_item *dir_item;
1285 int ret;
1286
1287 delayed_node = btrfs_get_or_create_delayed_node(dir);
1288 if (IS_ERR(delayed_node))
1289 return PTR_ERR(delayed_node);
1290
1291 delayed_item = btrfs_alloc_delayed_item(sizeof(*dir_item) + name_len);
1292 if (!delayed_item) {
1293 ret = -ENOMEM;
1294 goto release_node;
1295 }
1296
1297 ret = btrfs_delayed_item_reserve_metadata(trans, root, delayed_item);
1298 /*
1299 * we reserved enough space when we started the transaction, so
1300 * a metadata reservation failure here is impossible
1301 */
1302 BUG_ON(ret);
1303
1304 delayed_item->key.objectid = btrfs_ino(dir);
1305 btrfs_set_key_type(&delayed_item->key, BTRFS_DIR_INDEX_KEY);
1306 delayed_item->key.offset = index;
1307
1308 dir_item = (struct btrfs_dir_item *)delayed_item->data;
1309 dir_item->location = *disk_key;
1310 dir_item->transid = cpu_to_le64(trans->transid);
1311 dir_item->data_len = 0;
1312 dir_item->name_len = cpu_to_le16(name_len);
1313 dir_item->type = type;
1314 memcpy((char *)(dir_item + 1), name, name_len);
1315
1316 mutex_lock(&delayed_node->mutex);
1317 ret = __btrfs_add_delayed_insertion_item(delayed_node, delayed_item);
1318 if (unlikely(ret)) {
1319 printk(KERN_ERR "error adding delayed dir index item (name: %s) "
1320 "into the insertion tree of the delayed node "
1321 "(root id: %llu, inode id: %llu, errno: %d)\n",
1322 name,
1323 (unsigned long long)delayed_node->root->objectid,
1324 (unsigned long long)delayed_node->inode_id,
1325 ret);
1326 BUG();
1327 }
1328 mutex_unlock(&delayed_node->mutex);
1329
1330release_node:
1331 btrfs_release_delayed_node(delayed_node);
1332 return ret;
1333}
1334
1335static int btrfs_delete_delayed_insertion_item(struct btrfs_root *root,
1336 struct btrfs_delayed_node *node,
1337 struct btrfs_key *key)
1338{
1339 struct btrfs_delayed_item *item;
1340
1341 mutex_lock(&node->mutex);
1342 item = __btrfs_lookup_delayed_insertion_item(node, key);
1343 if (!item) {
1344 mutex_unlock(&node->mutex);
1345 return 1;
1346 }
1347
1348 btrfs_delayed_item_release_metadata(root, item);
1349 btrfs_release_delayed_item(item);
1350 mutex_unlock(&node->mutex);
1351 return 0;
1352}
1353
1354int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
1355 struct btrfs_root *root, struct inode *dir,
1356 u64 index)
1357{
1358 struct btrfs_delayed_node *node;
1359 struct btrfs_delayed_item *item;
1360 struct btrfs_key item_key;
1361 int ret;
1362
1363 node = btrfs_get_or_create_delayed_node(dir);
1364 if (IS_ERR(node))
1365 return PTR_ERR(node);
1366
1367 item_key.objectid = btrfs_ino(dir);
1368 btrfs_set_key_type(&item_key, BTRFS_DIR_INDEX_KEY);
1369 item_key.offset = index;
1370
1371 ret = btrfs_delete_delayed_insertion_item(root, node, &item_key);
1372 if (!ret)
1373 goto end;
1374
1375 item = btrfs_alloc_delayed_item(0);
1376 if (!item) {
1377 ret = -ENOMEM;
1378 goto end;
1379 }
1380
1381 item->key = item_key;
1382
1383 ret = btrfs_delayed_item_reserve_metadata(trans, root, item);
1384 /*
1385 * we reserved enough space when we started the transaction, so
1386 * a metadata reservation failure here is impossible.
1387 */
1388 BUG_ON(ret);
1389
1390 mutex_lock(&node->mutex);
1391 ret = __btrfs_add_delayed_deletion_item(node, item);
1392 if (unlikely(ret)) {
1393 printk(KERN_ERR "error adding delayed dir index item (index: %llu) "
1394 "into the deletion tree of the delayed node "
1395 "(root id: %llu, inode id: %llu, errno: %d)\n",
1396 (unsigned long long)index,
1397 (unsigned long long)node->root->objectid,
1398 (unsigned long long)node->inode_id,
1399 ret);
1400 BUG();
1401 }
1402 mutex_unlock(&node->mutex);
1403end:
1404 btrfs_release_delayed_node(node);
1405 return ret;
1406}
1407
1408int btrfs_inode_delayed_dir_index_count(struct inode *inode)
1409{
1410 struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode);
1411
1412 if (!delayed_node)
1413 return -ENOENT;
1414
1415 /*
1416 * Since we hold the i_mutex of this directory, no new directory index
1417 * can be added into the delayed node and index_cnt cannot be updated
1418 * right now. So we needn't lock the delayed node.
1419 */
1420 if (!delayed_node->index_cnt) {
1421 btrfs_release_delayed_node(delayed_node);
1422 return -EINVAL;
1423 }
1424
1425 BTRFS_I(inode)->index_cnt = delayed_node->index_cnt;
1426 btrfs_release_delayed_node(delayed_node);
1427 return 0;
1428}
1429
1430void btrfs_get_delayed_items(struct inode *inode, struct list_head *ins_list,
1431 struct list_head *del_list)
1432{
1433 struct btrfs_delayed_node *delayed_node;
1434 struct btrfs_delayed_item *item;
1435
1436 delayed_node = btrfs_get_delayed_node(inode);
1437 if (!delayed_node)
1438 return;
1439
1440 mutex_lock(&delayed_node->mutex);
1441 item = __btrfs_first_delayed_insertion_item(delayed_node);
1442 while (item) {
1443 atomic_inc(&item->refs);
1444 list_add_tail(&item->readdir_list, ins_list);
1445 item = __btrfs_next_delayed_item(item);
1446 }
1447
1448 item = __btrfs_first_delayed_deletion_item(delayed_node);
1449 while (item) {
1450 atomic_inc(&item->refs);
1451 list_add_tail(&item->readdir_list, del_list);
1452 item = __btrfs_next_delayed_item(item);
1453 }
1454 mutex_unlock(&delayed_node->mutex);
1455 /*
1456 * This delayed node is still cached in the btrfs inode, so refs
1457 * must be > 1 now, and we needn't check whether it is going to be
1458 * freed.
1459 *
1460 * Besides that, this function is used for readdir only, and we do
1461 * not insert/delete delayed items during that period. So we also
1462 * needn't requeue or dequeue this delayed node.
1463 */
1464 atomic_dec(&delayed_node->refs);
1465}
1466
1467void btrfs_put_delayed_items(struct list_head *ins_list,
1468 struct list_head *del_list)
1469{
1470 struct btrfs_delayed_item *curr, *next;
1471
1472 list_for_each_entry_safe(curr, next, ins_list, readdir_list) {
1473 list_del(&curr->readdir_list);
1474 if (atomic_dec_and_test(&curr->refs))
1475 kfree(curr);
1476 }
1477
1478 list_for_each_entry_safe(curr, next, del_list, readdir_list) {
1479 list_del(&curr->readdir_list);
1480 if (atomic_dec_and_test(&curr->refs))
1481 kfree(curr);
1482 }
1483}
1484
1485int btrfs_should_delete_dir_index(struct list_head *del_list,
1486 u64 index)
1487{
1488 struct btrfs_delayed_item *curr, *next;
1489 int ret;
1490
1491 if (list_empty(del_list))
1492 return 0;
1493
1494 list_for_each_entry_safe(curr, next, del_list, readdir_list) {
1495 if (curr->key.offset > index)
1496 break;
1497
1498 list_del(&curr->readdir_list);
1499 ret = (curr->key.offset == index);
1500
1501 if (atomic_dec_and_test(&curr->refs))
1502 kfree(curr);
1503
1504 if (ret)
1505 return 1;
1506 else
1507 continue;
1508 }
1509 return 0;
1510}
1511
1512/*
1513 * btrfs_readdir_delayed_dir_index - read dir info stored in the delayed tree
1514 *
1515 */
1516int btrfs_readdir_delayed_dir_index(struct file *filp, void *dirent,
1517 filldir_t filldir,
1518 struct list_head *ins_list)
1519{
1520 struct btrfs_dir_item *di;
1521 struct btrfs_delayed_item *curr, *next;
1522 struct btrfs_key location;
1523 char *name;
1524 int name_len;
1525 int over = 0;
1526 unsigned char d_type;
1527
1528 if (list_empty(ins_list))
1529 return 0;
1530
1531 /*
1532 * The data of a delayed item can never change, so we needn't
1533 * lock it. And since we hold the i_mutex of the directory,
1534 * nobody can delete any directory index now.
1535 */
1536 list_for_each_entry_safe(curr, next, ins_list, readdir_list) {
1537 list_del(&curr->readdir_list);
1538
1539 if (curr->key.offset < filp->f_pos) {
1540 if (atomic_dec_and_test(&curr->refs))
1541 kfree(curr);
1542 continue;
1543 }
1544
1545 filp->f_pos = curr->key.offset;
1546
1547 di = (struct btrfs_dir_item *)curr->data;
1548 name = (char *)(di + 1);
1549 name_len = le16_to_cpu(di->name_len);
1550
1551 d_type = btrfs_filetype_table[di->type];
1552 btrfs_disk_key_to_cpu(&location, &di->location);
1553
1554 over = filldir(dirent, name, name_len, curr->key.offset,
1555 location.objectid, d_type);
1556
1557 if (atomic_dec_and_test(&curr->refs))
1558 kfree(curr);
1559
1560 if (over)
1561 return 1;
1562 }
1563 return 0;
1564}
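Taken together, the last four functions form the readdir-side protocol for delayed items. A sketch of the call order a reader follows (in this patch series the caller is btrfs_real_readdir() in inode.c; this is not a copy of that function):

	LIST_HEAD(ins_list);
	LIST_HEAD(del_list);

	btrfs_get_delayed_items(inode, &ins_list, &del_list);

	/* inside the loop over the on-disk dir index items: */
	if (btrfs_should_delete_dir_index(&del_list, found_index))
		continue;	/* a delayed deletion hides this entry */

	/* after the on-disk items are exhausted: */
	btrfs_readdir_delayed_dir_index(filp, dirent, filldir, &ins_list);

	btrfs_put_delayed_items(&ins_list, &del_list);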
1565
1566BTRFS_SETGET_STACK_FUNCS(stack_inode_generation, struct btrfs_inode_item,
1567 generation, 64);
1568BTRFS_SETGET_STACK_FUNCS(stack_inode_sequence, struct btrfs_inode_item,
1569 sequence, 64);
1570BTRFS_SETGET_STACK_FUNCS(stack_inode_transid, struct btrfs_inode_item,
1571 transid, 64);
1572BTRFS_SETGET_STACK_FUNCS(stack_inode_size, struct btrfs_inode_item, size, 64);
1573BTRFS_SETGET_STACK_FUNCS(stack_inode_nbytes, struct btrfs_inode_item,
1574 nbytes, 64);
1575BTRFS_SETGET_STACK_FUNCS(stack_inode_block_group, struct btrfs_inode_item,
1576 block_group, 64);
1577BTRFS_SETGET_STACK_FUNCS(stack_inode_nlink, struct btrfs_inode_item, nlink, 32);
1578BTRFS_SETGET_STACK_FUNCS(stack_inode_uid, struct btrfs_inode_item, uid, 32);
1579BTRFS_SETGET_STACK_FUNCS(stack_inode_gid, struct btrfs_inode_item, gid, 32);
1580BTRFS_SETGET_STACK_FUNCS(stack_inode_mode, struct btrfs_inode_item, mode, 32);
1581BTRFS_SETGET_STACK_FUNCS(stack_inode_rdev, struct btrfs_inode_item, rdev, 64);
1582BTRFS_SETGET_STACK_FUNCS(stack_inode_flags, struct btrfs_inode_item, flags, 64);
1583
1584BTRFS_SETGET_STACK_FUNCS(stack_timespec_sec, struct btrfs_timespec, sec, 64);
1585BTRFS_SETGET_STACK_FUNCS(stack_timespec_nsec, struct btrfs_timespec, nsec, 32);
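BTRFS_SETGET_STACK_FUNCS() (defined in ctree.h) stamps out endian-safe getter/setter pairs for fields of an on-stack copy of an on-disk structure. Roughly, the size line above expands to the following (sketched from the 64-bit flavour of the macro; the on-disk field is little-endian):

static inline u64 btrfs_stack_inode_size(struct btrfs_inode_item *s)
{
	return le64_to_cpu(s->size);
}

static inline void btrfs_set_stack_inode_size(struct btrfs_inode_item *s,
					      u64 val)
{
	s->size = cpu_to_le64(val);
}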
1586
1587static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
1588 struct btrfs_inode_item *inode_item,
1589 struct inode *inode)
1590{
1591 btrfs_set_stack_inode_uid(inode_item, inode->i_uid);
1592 btrfs_set_stack_inode_gid(inode_item, inode->i_gid);
1593 btrfs_set_stack_inode_size(inode_item, BTRFS_I(inode)->disk_i_size);
1594 btrfs_set_stack_inode_mode(inode_item, inode->i_mode);
1595 btrfs_set_stack_inode_nlink(inode_item, inode->i_nlink);
1596 btrfs_set_stack_inode_nbytes(inode_item, inode_get_bytes(inode));
1597 btrfs_set_stack_inode_generation(inode_item,
1598 BTRFS_I(inode)->generation);
1599 btrfs_set_stack_inode_sequence(inode_item, BTRFS_I(inode)->sequence);
1600 btrfs_set_stack_inode_transid(inode_item, trans->transid);
1601 btrfs_set_stack_inode_rdev(inode_item, inode->i_rdev);
1602 btrfs_set_stack_inode_flags(inode_item, BTRFS_I(inode)->flags);
1603 btrfs_set_stack_inode_block_group(inode_item, 0);
1604
1605 btrfs_set_stack_timespec_sec(btrfs_inode_atime(inode_item),
1606 inode->i_atime.tv_sec);
1607 btrfs_set_stack_timespec_nsec(btrfs_inode_atime(inode_item),
1608 inode->i_atime.tv_nsec);
1609
1610 btrfs_set_stack_timespec_sec(btrfs_inode_mtime(inode_item),
1611 inode->i_mtime.tv_sec);
1612 btrfs_set_stack_timespec_nsec(btrfs_inode_mtime(inode_item),
1613 inode->i_mtime.tv_nsec);
1614
1615 btrfs_set_stack_timespec_sec(btrfs_inode_ctime(inode_item),
1616 inode->i_ctime.tv_sec);
1617 btrfs_set_stack_timespec_nsec(btrfs_inode_ctime(inode_item),
1618 inode->i_ctime.tv_nsec);
1619}
1620
1621int btrfs_fill_inode(struct inode *inode, u32 *rdev)
1622{
1623 struct btrfs_delayed_node *delayed_node;
1624 struct btrfs_inode_item *inode_item;
1625 struct btrfs_timespec *tspec;
1626
1627 delayed_node = btrfs_get_delayed_node(inode);
1628 if (!delayed_node)
1629 return -ENOENT;
1630
1631 mutex_lock(&delayed_node->mutex);
1632 if (!delayed_node->inode_dirty) {
1633 mutex_unlock(&delayed_node->mutex);
1634 btrfs_release_delayed_node(delayed_node);
1635 return -ENOENT;
1636 }
1637
1638 inode_item = &delayed_node->inode_item;
1639
1640 inode->i_uid = btrfs_stack_inode_uid(inode_item);
1641 inode->i_gid = btrfs_stack_inode_gid(inode_item);
1642 btrfs_i_size_write(inode, btrfs_stack_inode_size(inode_item));
1643 inode->i_mode = btrfs_stack_inode_mode(inode_item);
1644 inode->i_nlink = btrfs_stack_inode_nlink(inode_item);
1645 inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item));
1646 BTRFS_I(inode)->generation = btrfs_stack_inode_generation(inode_item);
1647 BTRFS_I(inode)->sequence = btrfs_stack_inode_sequence(inode_item);
1648 inode->i_rdev = 0;
1649 *rdev = btrfs_stack_inode_rdev(inode_item);
1650 BTRFS_I(inode)->flags = btrfs_stack_inode_flags(inode_item);
1651
1652 tspec = btrfs_inode_atime(inode_item);
1653 inode->i_atime.tv_sec = btrfs_stack_timespec_sec(tspec);
1654 inode->i_atime.tv_nsec = btrfs_stack_timespec_nsec(tspec);
1655
1656 tspec = btrfs_inode_mtime(inode_item);
1657 inode->i_mtime.tv_sec = btrfs_stack_timespec_sec(tspec);
1658 inode->i_mtime.tv_nsec = btrfs_stack_timespec_nsec(tspec);
1659
1660 tspec = btrfs_inode_ctime(inode_item);
1661 inode->i_ctime.tv_sec = btrfs_stack_timespec_sec(tspec);
1662 inode->i_ctime.tv_nsec = btrfs_stack_timespec_nsec(tspec);
1663
1664 inode->i_generation = BTRFS_I(inode)->generation;
1665 BTRFS_I(inode)->index_cnt = (u64)-1;
1666
1667 mutex_unlock(&delayed_node->mutex);
1668 btrfs_release_delayed_node(delayed_node);
1669 return 0;
1670}
1671
1672int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
1673 struct btrfs_root *root, struct inode *inode)
1674{
1675 struct btrfs_delayed_node *delayed_node;
1676 int ret = 0;
1677
1678 delayed_node = btrfs_get_or_create_delayed_node(inode);
1679 if (IS_ERR(delayed_node))
1680 return PTR_ERR(delayed_node);
1681
1682 mutex_lock(&delayed_node->mutex);
1683 if (delayed_node->inode_dirty) {
1684 fill_stack_inode_item(trans, &delayed_node->inode_item, inode);
1685 goto release_node;
1686 }
1687
1688 ret = btrfs_delayed_inode_reserve_metadata(trans, root, delayed_node);
1689 /*
1690 * we must reserve enough space when we start a new transaction,
1691 * so reserving metadata failure is impossible
1692 */
1693 BUG_ON(ret);
1694
1695 fill_stack_inode_item(trans, &delayed_node->inode_item, inode);
1696 delayed_node->inode_dirty = 1;
1697 delayed_node->count++;
1698 atomic_inc(&root->fs_info->delayed_root->items);
1699release_node:
1700 mutex_unlock(&delayed_node->mutex);
1701 btrfs_release_delayed_node(delayed_node);
1702 return ret;
1703}
1704
1705static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node)
1706{
1707 struct btrfs_root *root = delayed_node->root;
1708 struct btrfs_delayed_item *curr_item, *prev_item;
1709
1710 mutex_lock(&delayed_node->mutex);
1711 curr_item = __btrfs_first_delayed_insertion_item(delayed_node);
1712 while (curr_item) {
1713 btrfs_delayed_item_release_metadata(root, curr_item);
1714 prev_item = curr_item;
1715 curr_item = __btrfs_next_delayed_item(prev_item);
1716 btrfs_release_delayed_item(prev_item);
1717 }
1718
1719 curr_item = __btrfs_first_delayed_deletion_item(delayed_node);
1720 while (curr_item) {
1721 btrfs_delayed_item_release_metadata(root, curr_item);
1722 prev_item = curr_item;
1723 curr_item = __btrfs_next_delayed_item(prev_item);
1724 btrfs_release_delayed_item(prev_item);
1725 }
1726
1727 if (delayed_node->inode_dirty) {
1728 btrfs_delayed_inode_release_metadata(root, delayed_node);
1729 btrfs_release_delayed_inode(delayed_node);
1730 }
1731 mutex_unlock(&delayed_node->mutex);
1732}
1733
1734void btrfs_kill_delayed_inode_items(struct inode *inode)
1735{
1736 struct btrfs_delayed_node *delayed_node;
1737
1738 delayed_node = btrfs_get_delayed_node(inode);
1739 if (!delayed_node)
1740 return;
1741
1742 __btrfs_kill_delayed_node(delayed_node);
1743 btrfs_release_delayed_node(delayed_node);
1744}
1745
1746void btrfs_kill_all_delayed_nodes(struct btrfs_root *root)
1747{
1748 u64 inode_id = 0;
1749 struct btrfs_delayed_node *delayed_nodes[8];
1750 int i, n;
1751
1752 while (1) {
1753 spin_lock(&root->inode_lock);
1754 n = radix_tree_gang_lookup(&root->delayed_nodes_tree,
1755 (void **)delayed_nodes, inode_id,
1756 ARRAY_SIZE(delayed_nodes));
1757 if (!n) {
1758 spin_unlock(&root->inode_lock);
1759 break;
1760 }
1761
1762 inode_id = delayed_nodes[n - 1]->inode_id + 1;
1763
1764 for (i = 0; i < n; i++)
1765 atomic_inc(&delayed_nodes[i]->refs);
1766 spin_unlock(&root->inode_lock);
1767
1768 for (i = 0; i < n; i++) {
1769 __btrfs_kill_delayed_node(delayed_nodes[i]);
1770 btrfs_release_delayed_node(delayed_nodes[i]);
1771 }
1772 }
1773}
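
The loop above drains every delayed node by walking root->delayed_nodes_tree in batches of eight: radix_tree_gang_lookup() returns up to ARRAY_SIZE(delayed_nodes) entries at or after inode_id, and the cursor is advanced past the last hit so the scan always makes progress. A minimal userspace sketch of the same batched-cursor idiom (toy table and names are assumptions, not btrfs code):

#include <stdio.h>
#include <stddef.h>

#define BATCH 8

struct node { unsigned long long id; };

static struct node table[] = {
	{1}, {4}, {9}, {10}, {11}, {12}, {13}, {14}, {15}, {23}
};

/* toy stand-in for radix_tree_gang_lookup(): up to max hits at or after first */
static size_t gang_lookup(struct node **out, unsigned long long first, size_t max)
{
	size_t n = 0;
	for (size_t i = 0; i < sizeof(table) / sizeof(table[0]) && n < max; i++)
		if (table[i].id >= first)
			out[n++] = &table[i];
	return n;
}

int main(void)
{
	struct node *nodes[BATCH];
	unsigned long long cursor = 0;
	size_t n;

	while ((n = gang_lookup(nodes, cursor, BATCH)) != 0) {
		/* advance past the last entry so the next pass makes progress */
		cursor = nodes[n - 1]->id + 1;
		for (size_t i = 0; i < n; i++)
			printf("killing node %llu\n", nodes[i]->id);
	}
	return 0;
}
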
diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h
new file mode 100644
index 000000000000..8d27af4bd8b9
--- /dev/null
+++ b/fs/btrfs/delayed-inode.h
@@ -0,0 +1,145 @@
1/*
2 * Copyright (C) 2011 Fujitsu. All rights reserved.
3 * Written by Miao Xie <miaox@cn.fujitsu.com>
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public
7 * License v2 as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public
15 * License along with this program; if not, write to the
16 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
17 * Boston, MA 02111-1307, USA.
18 */
19
20#ifndef __DELAYED_TREE_OPERATION_H
21#define __DELAYED_TREE_OPERATION_H
22
23#include <linux/rbtree.h>
24#include <linux/spinlock.h>
25#include <linux/mutex.h>
26#include <linux/list.h>
27#include <linux/wait.h>
28#include <asm/atomic.h>
29
30#include "ctree.h"
31
32/* types of the delayed item */
33#define BTRFS_DELAYED_INSERTION_ITEM 1
34#define BTRFS_DELAYED_DELETION_ITEM 2
35
36struct btrfs_delayed_root {
37 spinlock_t lock;
38 struct list_head node_list;
39 /*
40 * Used for delayed nodes which are waiting to be dealt with by the
41 * worker. If the delayed node is inserted into the work queue, we
42 * drop it from this list.
43 */
44 struct list_head prepare_list;
45 atomic_t items; /* for delayed items */
46 int nodes; /* for delayed nodes */
47 wait_queue_head_t wait;
48};
49
50struct btrfs_delayed_node {
51 u64 inode_id;
52 u64 bytes_reserved;
53 struct btrfs_root *root;
54 /* Used to add the node into the delayed root's node list. */
55 struct list_head n_list;
56 /*
57 * Used to add the node into the prepare list; the nodes in this list
58 * are waiting to be dealt with by the async worker.
59 */
60 struct list_head p_list;
61 struct rb_root ins_root;
62 struct rb_root del_root;
63 struct mutex mutex;
64 struct btrfs_inode_item inode_item;
65 atomic_t refs;
66 u64 index_cnt;
67 bool in_list;
68 bool inode_dirty;
69 int count;
70};
71
72struct btrfs_delayed_item {
73 struct rb_node rb_node;
74 struct btrfs_key key;
75 struct list_head tree_list; /* used for batch insert/delete items */
76 struct list_head readdir_list; /* used for readdir items */
77 u64 bytes_reserved;
78 struct btrfs_delayed_node *delayed_node;
79 atomic_t refs;
80 int ins_or_del;
81 u32 data_len;
82 char data[0];
83};
84
85static inline void btrfs_init_delayed_root(
86 struct btrfs_delayed_root *delayed_root)
87{
88 atomic_set(&delayed_root->items, 0);
89 delayed_root->nodes = 0;
90 spin_lock_init(&delayed_root->lock);
91 init_waitqueue_head(&delayed_root->wait);
92 INIT_LIST_HEAD(&delayed_root->node_list);
93 INIT_LIST_HEAD(&delayed_root->prepare_list);
94}
95
96int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
97 struct btrfs_root *root, const char *name,
98 int name_len, struct inode *dir,
99 struct btrfs_disk_key *disk_key, u8 type,
100 u64 index);
101
102int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
103 struct btrfs_root *root, struct inode *dir,
104 u64 index);
105
106int btrfs_inode_delayed_dir_index_count(struct inode *inode);
107
108int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
109 struct btrfs_root *root);
110
111void btrfs_balance_delayed_items(struct btrfs_root *root);
112
113int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
114 struct inode *inode);
115/* Used for evicting the inode. */
116void btrfs_remove_delayed_node(struct inode *inode);
117void btrfs_kill_delayed_inode_items(struct inode *inode);
118
119
120int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
121 struct btrfs_root *root, struct inode *inode);
122int btrfs_fill_inode(struct inode *inode, u32 *rdev);
123
124/* Used for drop dead root */
125void btrfs_kill_all_delayed_nodes(struct btrfs_root *root);
126
127/* Used for readdir() */
128void btrfs_get_delayed_items(struct inode *inode, struct list_head *ins_list,
129 struct list_head *del_list);
130void btrfs_put_delayed_items(struct list_head *ins_list,
131 struct list_head *del_list);
132int btrfs_should_delete_dir_index(struct list_head *del_list,
133 u64 index);
134int btrfs_readdir_delayed_dir_index(struct file *filp, void *dirent,
135 filldir_t filldir,
136 struct list_head *ins_list);
137
138/* for init */
139int __init btrfs_delayed_inode_init(void);
140void btrfs_delayed_inode_exit(void);
141
142/* for debugging */
143void btrfs_assert_delayed_root_empty(struct btrfs_root *root);
144
145#endif
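
The readdir helpers declared above are meant to be used together. A hedged sketch of the calling sequence follows; everything except the four btrfs_*_delayed_* calls (the filp/filldir plumbing, the on-disk index walk) is an assumption for illustration, and the real consumer is btrfs_real_readdir() in inode.c:

/* sketch only: compiles in in-tree context, not standalone */
static int example_readdir(struct file *filp, void *dirent, filldir_t filldir)
{
	struct inode *inode = filp->f_path.dentry->d_inode;
	LIST_HEAD(ins_list);
	LIST_HEAD(del_list);
	int ret;

	/* snapshot this directory's pending insertions and deletions */
	btrfs_get_delayed_items(inode, &ins_list, &del_list);

	/*
	 * ... walk the committed BTRFS_DIR_INDEX_KEY items here, and for
	 * each index skip entries shadowed by a pending deletion:
	 *
	 *	if (btrfs_should_delete_dir_index(&del_list, index))
	 *		continue;
	 */

	/* then emit entries that so far exist only as delayed insertions */
	ret = btrfs_readdir_delayed_dir_index(filp, dirent, filldir, &ins_list);

	btrfs_put_delayed_items(&ins_list, &del_list);
	return ret;
}
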
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index e807b143b857..125cf76fcd08 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -281,44 +281,6 @@ again:
281} 281}
282 282
283/* 283/*
284 * This checks to see if there are any delayed refs in the
285 * btree for a given bytenr. It returns one if it finds any
286 * and zero otherwise.
287 *
288 * If it only finds a head node, it returns 0.
289 *
290 * The idea is to use this when deciding if you can safely delete an
291 * extent from the extent allocation tree. There may be a pending
292 * ref in the rbtree that adds or removes references, so as long as this
293 * returns one you need to leave the BTRFS_EXTENT_ITEM in the extent
294 * allocation tree.
295 */
296int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr)
297{
298 struct btrfs_delayed_ref_node *ref;
299 struct btrfs_delayed_ref_root *delayed_refs;
300 struct rb_node *prev_node;
301 int ret = 0;
302
303 delayed_refs = &trans->transaction->delayed_refs;
304 spin_lock(&delayed_refs->lock);
305
306 ref = find_ref_head(&delayed_refs->root, bytenr, NULL);
307 if (ref) {
308 prev_node = rb_prev(&ref->rb_node);
309 if (!prev_node)
310 goto out;
311 ref = rb_entry(prev_node, struct btrfs_delayed_ref_node,
312 rb_node);
313 if (ref->bytenr == bytenr)
314 ret = 1;
315 }
316out:
317 spin_unlock(&delayed_refs->lock);
318 return ret;
319}
320
321/*
322 * helper function to update an extent delayed ref in the 284 * helper function to update an extent delayed ref in the
323 * rbtree. existing and update must both have the same 285 * rbtree. existing and update must both have the same
324 * bytenr and parent 286 * bytenr and parent
@@ -483,6 +445,8 @@ static noinline int add_delayed_ref_head(struct btrfs_trans_handle *trans,
483 INIT_LIST_HEAD(&head_ref->cluster); 445 INIT_LIST_HEAD(&head_ref->cluster);
484 mutex_init(&head_ref->mutex); 446 mutex_init(&head_ref->mutex);
485 447
448 trace_btrfs_delayed_ref_head(ref, head_ref, action);
449
486 existing = tree_insert(&delayed_refs->root, &ref->rb_node); 450 existing = tree_insert(&delayed_refs->root, &ref->rb_node);
487 451
488 if (existing) { 452 if (existing) {
@@ -537,6 +501,8 @@ static noinline int add_delayed_tree_ref(struct btrfs_trans_handle *trans,
537 } 501 }
538 full_ref->level = level; 502 full_ref->level = level;
539 503
504 trace_btrfs_delayed_tree_ref(ref, full_ref, action);
505
540 existing = tree_insert(&delayed_refs->root, &ref->rb_node); 506 existing = tree_insert(&delayed_refs->root, &ref->rb_node);
541 507
542 if (existing) { 508 if (existing) {
@@ -591,6 +557,8 @@ static noinline int add_delayed_data_ref(struct btrfs_trans_handle *trans,
591 full_ref->objectid = owner; 557 full_ref->objectid = owner;
592 full_ref->offset = offset; 558 full_ref->offset = offset;
593 559
560 trace_btrfs_delayed_data_ref(ref, full_ref, action);
561
594 existing = tree_insert(&delayed_refs->root, &ref->rb_node); 562 existing = tree_insert(&delayed_refs->root, &ref->rb_node);
595 563
596 if (existing) { 564 if (existing) {
@@ -741,79 +709,3 @@ btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
741 return btrfs_delayed_node_to_head(ref); 709 return btrfs_delayed_node_to_head(ref);
742 return NULL; 710 return NULL;
743} 711}
744
745/*
746 * add a delayed ref to the tree. This does all of the accounting required
747 * to make sure the delayed ref is eventually processed before this
748 * transaction commits.
749 *
750 * The main point of this call is to add and remove a backreference in a single
751 * shot, taking the lock only once, and only searching for the head node once.
752 *
753 * It is the same as doing a ref add and delete in two separate calls.
754 */
755#if 0
756int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
757 u64 bytenr, u64 num_bytes, u64 orig_parent,
758 u64 parent, u64 orig_ref_root, u64 ref_root,
759 u64 orig_ref_generation, u64 ref_generation,
760 u64 owner_objectid, int pin)
761{
762 struct btrfs_delayed_ref *ref;
763 struct btrfs_delayed_ref *old_ref;
764 struct btrfs_delayed_ref_head *head_ref;
765 struct btrfs_delayed_ref_root *delayed_refs;
766 int ret;
767
768 ref = kmalloc(sizeof(*ref), GFP_NOFS);
769 if (!ref)
770 return -ENOMEM;
771
772 old_ref = kmalloc(sizeof(*old_ref), GFP_NOFS);
773 if (!old_ref) {
774 kfree(ref);
775 return -ENOMEM;
776 }
777
778 /*
779 * the parent = 0 case comes from cases where we don't actually
780 * know the parent yet. It will get updated later via a add/drop
781 * pair.
782 */
783 if (parent == 0)
784 parent = bytenr;
785 if (orig_parent == 0)
786 orig_parent = bytenr;
787
788 head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
789 if (!head_ref) {
790 kfree(ref);
791 kfree(old_ref);
792 return -ENOMEM;
793 }
794 delayed_refs = &trans->transaction->delayed_refs;
795 spin_lock(&delayed_refs->lock);
796
797 /*
798 * insert both the head node and the new ref without dropping
799 * the spin lock
800 */
801 ret = __btrfs_add_delayed_ref(trans, &head_ref->node, bytenr, num_bytes,
802 (u64)-1, 0, 0, 0,
803 BTRFS_UPDATE_DELAYED_HEAD, 0);
804 BUG_ON(ret);
805
806 ret = __btrfs_add_delayed_ref(trans, &ref->node, bytenr, num_bytes,
807 parent, ref_root, ref_generation,
808 owner_objectid, BTRFS_ADD_DELAYED_REF, 0);
809 BUG_ON(ret);
810
811 ret = __btrfs_add_delayed_ref(trans, &old_ref->node, bytenr, num_bytes,
812 orig_parent, orig_ref_root,
813 orig_ref_generation, owner_objectid,
814 BTRFS_DROP_DELAYED_REF, pin);
815 BUG_ON(ret);
816 spin_unlock(&delayed_refs->lock);
817 return 0;
818}
819#endif
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 50e3cf92fbda..e287e3b0eab0 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -166,12 +166,6 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
166 166
167struct btrfs_delayed_ref_head * 167struct btrfs_delayed_ref_head *
168btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr); 168btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr);
169int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr);
170int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
171 u64 bytenr, u64 num_bytes, u64 orig_parent,
172 u64 parent, u64 orig_ref_root, u64 ref_root,
173 u64 orig_ref_generation, u64 ref_generation,
174 u64 owner_objectid, int pin);
175int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans, 169int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
176 struct btrfs_delayed_ref_head *head); 170 struct btrfs_delayed_ref_head *head);
177int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans, 171int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index e9103b3baa49..685f2593c4f0 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -50,7 +50,6 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle
50 if (di) 50 if (di)
51 return ERR_PTR(-EEXIST); 51 return ERR_PTR(-EEXIST);
52 ret = btrfs_extend_item(trans, root, path, data_size); 52 ret = btrfs_extend_item(trans, root, path, data_size);
53 WARN_ON(ret > 0);
54 } 53 }
55 if (ret < 0) 54 if (ret < 0)
56 return ERR_PTR(ret); 55 return ERR_PTR(ret);
@@ -124,8 +123,9 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
124 * to use for the second index (if one is created). 123 * to use for the second index (if one is created).
125 */ 124 */
126int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root 125int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
127 *root, const char *name, int name_len, u64 dir, 126 *root, const char *name, int name_len,
128 struct btrfs_key *location, u8 type, u64 index) 127 struct inode *dir, struct btrfs_key *location,
128 u8 type, u64 index)
129{ 129{
130 int ret = 0; 130 int ret = 0;
131 int ret2 = 0; 131 int ret2 = 0;
@@ -137,13 +137,17 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
137 struct btrfs_disk_key disk_key; 137 struct btrfs_disk_key disk_key;
138 u32 data_size; 138 u32 data_size;
139 139
140 key.objectid = dir; 140 key.objectid = btrfs_ino(dir);
141 btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY); 141 btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
142 key.offset = btrfs_name_hash(name, name_len); 142 key.offset = btrfs_name_hash(name, name_len);
143 143
144 path = btrfs_alloc_path(); 144 path = btrfs_alloc_path();
145 if (!path)
146 return -ENOMEM;
145 path->leave_spinning = 1; 147 path->leave_spinning = 1;
146 148
149 btrfs_cpu_key_to_disk(&disk_key, location);
150
147 data_size = sizeof(*dir_item) + name_len; 151 data_size = sizeof(*dir_item) + name_len;
148 dir_item = insert_with_overflow(trans, root, path, &key, data_size, 152 dir_item = insert_with_overflow(trans, root, path, &key, data_size,
149 name, name_len); 153 name, name_len);
@@ -151,11 +155,10 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
151 ret = PTR_ERR(dir_item); 155 ret = PTR_ERR(dir_item);
152 if (ret == -EEXIST) 156 if (ret == -EEXIST)
153 goto second_insert; 157 goto second_insert;
154 goto out; 158 goto out_free;
155 } 159 }
156 160
157 leaf = path->nodes[0]; 161 leaf = path->nodes[0];
158 btrfs_cpu_key_to_disk(&disk_key, location);
159 btrfs_set_dir_item_key(leaf, dir_item, &disk_key); 162 btrfs_set_dir_item_key(leaf, dir_item, &disk_key);
160 btrfs_set_dir_type(leaf, dir_item, type); 163 btrfs_set_dir_type(leaf, dir_item, type);
161 btrfs_set_dir_data_len(leaf, dir_item, 0); 164 btrfs_set_dir_data_len(leaf, dir_item, 0);
@@ -170,29 +173,13 @@ second_insert:
170 /* FIXME, use some real flag for selecting the extra index */ 173 /* FIXME, use some real flag for selecting the extra index */
171 if (root == root->fs_info->tree_root) { 174 if (root == root->fs_info->tree_root) {
172 ret = 0; 175 ret = 0;
173 goto out; 176 goto out_free;
174 } 177 }
175 btrfs_release_path(root, path); 178 btrfs_release_path(path);
176 179
177 btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY); 180 ret2 = btrfs_insert_delayed_dir_index(trans, root, name, name_len, dir,
178 key.offset = index; 181 &disk_key, type, index);
179 dir_item = insert_with_overflow(trans, root, path, &key, data_size, 182out_free:
180 name, name_len);
181 if (IS_ERR(dir_item)) {
182 ret2 = PTR_ERR(dir_item);
183 goto out;
184 }
185 leaf = path->nodes[0];
186 btrfs_cpu_key_to_disk(&disk_key, location);
187 btrfs_set_dir_item_key(leaf, dir_item, &disk_key);
188 btrfs_set_dir_type(leaf, dir_item, type);
189 btrfs_set_dir_data_len(leaf, dir_item, 0);
190 btrfs_set_dir_name_len(leaf, dir_item, name_len);
191 btrfs_set_dir_transid(leaf, dir_item, trans->transid);
192 name_ptr = (unsigned long)(dir_item + 1);
193 write_extent_buffer(leaf, name, name_ptr, name_len);
194 btrfs_mark_buffer_dirty(leaf);
195out:
196 btrfs_free_path(path); 183 btrfs_free_path(path);
197 if (ret) 184 if (ret)
198 return ret; 185 return ret;
@@ -377,6 +364,9 @@ struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root,
377 364
378 leaf = path->nodes[0]; 365 leaf = path->nodes[0];
379 dir_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item); 366 dir_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item);
367 if (verify_dir_item(root, leaf, dir_item))
368 return NULL;
369
380 total_len = btrfs_item_size_nr(leaf, path->slots[0]); 370 total_len = btrfs_item_size_nr(leaf, path->slots[0]);
381 while (cur < total_len) { 371 while (cur < total_len) {
382 this_len = sizeof(*dir_item) + 372 this_len = sizeof(*dir_item) +
@@ -427,5 +417,37 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
427 ret = btrfs_truncate_item(trans, root, path, 417 ret = btrfs_truncate_item(trans, root, path,
428 item_len - sub_item_len, 1); 418 item_len - sub_item_len, 1);
429 } 419 }
420 return ret;
421}
422
423int verify_dir_item(struct btrfs_root *root,
424 struct extent_buffer *leaf,
425 struct btrfs_dir_item *dir_item)
426{
427 u16 namelen = BTRFS_NAME_LEN;
428 u8 type = btrfs_dir_type(leaf, dir_item);
429
430 if (type >= BTRFS_FT_MAX) {
431 printk(KERN_CRIT "btrfs: invalid dir item type: %d\n",
432 (int)type);
433 return 1;
434 }
435
436 if (type == BTRFS_FT_XATTR)
437 namelen = XATTR_NAME_MAX;
438
439 if (btrfs_dir_name_len(leaf, dir_item) > namelen) {
440 printk(KERN_CRIT "btrfs: invalid dir item name len: %u\n",
441 (unsigned)btrfs_dir_name_len(leaf, dir_item));
442 return 1;
443 }
444
445 /* BTRFS_MAX_XATTR_SIZE is the same for all dir items */
446 if (btrfs_dir_data_len(leaf, dir_item) > BTRFS_MAX_XATTR_SIZE(root)) {
447 printk(KERN_CRIT "btrfs: invalid dir item data len: %u\n",
448 (unsigned)btrfs_dir_data_len(leaf, dir_item));
449 return 1;
450 }
451
430 return 0; 452 return 0;
431} 453}
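
For reference, every directory entry lives under two keys: a BTRFS_DIR_ITEM_KEY whose offset is the name hash (for lookups), and a BTRFS_DIR_INDEX_KEY whose offset is the sequential index (for readdir order) — the second one is what now goes through btrfs_insert_delayed_dir_index() above. A toy restatement of the key construction (key type constants copied from ctree.h; the hash below is a stand-in, the kernel uses the crc-based btrfs_name_hash()):

#include <stdio.h>

typedef unsigned long long u64;

#define BTRFS_DIR_ITEM_KEY	84
#define BTRFS_DIR_INDEX_KEY	96

struct toy_key {
	u64 objectid;		/* inode number of the directory */
	unsigned char type;
	u64 offset;
};

/* stand-in for btrfs_name_hash(), which is crc32c-based in the kernel */
static u64 toy_name_hash(const char *name)
{
	u64 h = 5381;
	while (*name)
		h = h * 33 + (unsigned char)*name++;
	return h;
}

int main(void)
{
	const char *name = "foo";
	u64 dir_ino = 256, index = 2;

	struct toy_key by_hash = {
		.objectid = dir_ino,
		.type = BTRFS_DIR_ITEM_KEY,
		.offset = toy_name_hash(name),	/* lookup by name */
	};
	struct toy_key by_index = {
		.objectid = dir_ino,
		.type = BTRFS_DIR_INDEX_KEY,
		.offset = index,		/* readdir order */
	};

	printf("(%llu, %u, %llu) and (%llu, %u, %llu)\n",
	       by_hash.objectid, by_hash.type, by_hash.offset,
	       by_index.objectid, by_index.type, by_index.offset);
	return 0;
}
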
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 64f10082f048..1ac8db5dc0a3 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -28,6 +28,9 @@
28#include <linux/freezer.h> 28#include <linux/freezer.h>
29#include <linux/crc32c.h> 29#include <linux/crc32c.h>
30#include <linux/slab.h> 30#include <linux/slab.h>
31#include <linux/migrate.h>
32#include <linux/ratelimit.h>
33#include <asm/unaligned.h>
31#include "compat.h" 34#include "compat.h"
32#include "ctree.h" 35#include "ctree.h"
33#include "disk-io.h" 36#include "disk-io.h"
@@ -39,10 +42,25 @@
39#include "locking.h" 42#include "locking.h"
40#include "tree-log.h" 43#include "tree-log.h"
41#include "free-space-cache.h" 44#include "free-space-cache.h"
45#include "inode-map.h"
42 46
43static struct extent_io_ops btree_extent_io_ops; 47static struct extent_io_ops btree_extent_io_ops;
44static void end_workqueue_fn(struct btrfs_work *work); 48static void end_workqueue_fn(struct btrfs_work *work);
45static void free_fs_root(struct btrfs_root *root); 49static void free_fs_root(struct btrfs_root *root);
50static void btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
51 int read_only);
52static int btrfs_destroy_ordered_operations(struct btrfs_root *root);
53static int btrfs_destroy_ordered_extents(struct btrfs_root *root);
54static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
55 struct btrfs_root *root);
56static int btrfs_destroy_pending_snapshots(struct btrfs_transaction *t);
57static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root);
58static int btrfs_destroy_marked_extents(struct btrfs_root *root,
59 struct extent_io_tree *dirty_pages,
60 int mark);
61static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
62 struct extent_io_tree *pinned_extents);
63static int btrfs_cleanup_transaction(struct btrfs_root *root);
46 64
47/* 65/*
48 * end_io_wq structs are used to do processing in task context when an IO is 66 * end_io_wq structs are used to do processing in task context when an IO is
@@ -121,7 +139,7 @@ static const char *btrfs_eb_name[BTRFS_MAX_LEVEL + 1] = {
121 * that covers the entire device 139 * that covers the entire device
122 */ 140 */
123static struct extent_map *btree_get_extent(struct inode *inode, 141static struct extent_map *btree_get_extent(struct inode *inode,
124 struct page *page, size_t page_offset, u64 start, u64 len, 142 struct page *page, size_t pg_offset, u64 start, u64 len,
125 int create) 143 int create)
126{ 144{
127 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 145 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
@@ -138,7 +156,7 @@ static struct extent_map *btree_get_extent(struct inode *inode,
138 } 156 }
139 read_unlock(&em_tree->lock); 157 read_unlock(&em_tree->lock);
140 158
141 em = alloc_extent_map(GFP_NOFS); 159 em = alloc_extent_map();
142 if (!em) { 160 if (!em) {
143 em = ERR_PTR(-ENOMEM); 161 em = ERR_PTR(-ENOMEM);
144 goto out; 162 goto out;
@@ -183,7 +201,7 @@ u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len)
183 201
184void btrfs_csum_final(u32 crc, char *result) 202void btrfs_csum_final(u32 crc, char *result)
185{ 203{
186 *(__le32 *)result = ~cpu_to_le32(crc); 204 put_unaligned_le32(~crc, result);
187} 205}
188 206
189/* 207/*
@@ -238,14 +256,12 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
238 memcpy(&found, result, csum_size); 256 memcpy(&found, result, csum_size);
239 257
240 read_extent_buffer(buf, &val, 0, csum_size); 258 read_extent_buffer(buf, &val, 0, csum_size);
241 if (printk_ratelimit()) { 259 printk_ratelimited(KERN_INFO "btrfs: %s checksum verify "
242 printk(KERN_INFO "btrfs: %s checksum verify "
243 "failed on %llu wanted %X found %X " 260 "failed on %llu wanted %X found %X "
244 "level %d\n", 261 "level %d\n",
245 root->fs_info->sb->s_id, 262 root->fs_info->sb->s_id,
246 (unsigned long long)buf->start, val, found, 263 (unsigned long long)buf->start, val, found,
247 btrfs_header_level(buf)); 264 btrfs_header_level(buf));
248 }
249 if (result != (char *)&inline_result) 265 if (result != (char *)&inline_result)
250 kfree(result); 266 kfree(result);
251 return 1; 267 return 1;
@@ -280,13 +296,11 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
280 ret = 0; 296 ret = 0;
281 goto out; 297 goto out;
282 } 298 }
283 if (printk_ratelimit()) { 299 printk_ratelimited("parent transid verify failed on %llu wanted %llu "
284 printk("parent transid verify failed on %llu wanted %llu "
285 "found %llu\n", 300 "found %llu\n",
286 (unsigned long long)eb->start, 301 (unsigned long long)eb->start,
287 (unsigned long long)parent_transid, 302 (unsigned long long)parent_transid,
288 (unsigned long long)btrfs_header_generation(eb)); 303 (unsigned long long)btrfs_header_generation(eb));
289 }
290 ret = 1; 304 ret = 1;
291 clear_extent_buffer_uptodate(io_tree, eb, &cached_state); 305 clear_extent_buffer_uptodate(io_tree, eb, &cached_state);
292out: 306out:
@@ -308,6 +322,7 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
308 int num_copies = 0; 322 int num_copies = 0;
309 int mirror_num = 0; 323 int mirror_num = 0;
310 324
325 clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
311 io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree; 326 io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
312 while (1) { 327 while (1) {
313 ret = read_extent_buffer_pages(io_tree, eb, start, 1, 328 ret = read_extent_buffer_pages(io_tree, eb, start, 1,
@@ -316,6 +331,14 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
316 !verify_parent_transid(io_tree, eb, parent_transid)) 331 !verify_parent_transid(io_tree, eb, parent_transid))
317 return ret; 332 return ret;
318 333
334 /*
335 * This buffer's crc is fine, but its contents are corrupted, so
336 * there is no reason to read the other copies; they won't be
337 * any less wrong.
338 */
339 if (test_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags))
340 return ret;
341
319 num_copies = btrfs_num_copies(&root->fs_info->mapping_tree, 342 num_copies = btrfs_num_copies(&root->fs_info->mapping_tree,
320 eb->start, eb->len); 343 eb->start, eb->len);
321 if (num_copies == 1) 344 if (num_copies == 1)
@@ -338,24 +361,33 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
338 struct extent_io_tree *tree; 361 struct extent_io_tree *tree;
339 u64 start = (u64)page->index << PAGE_CACHE_SHIFT; 362 u64 start = (u64)page->index << PAGE_CACHE_SHIFT;
340 u64 found_start; 363 u64 found_start;
341 int found_level;
342 unsigned long len; 364 unsigned long len;
343 struct extent_buffer *eb; 365 struct extent_buffer *eb;
344 int ret; 366 int ret;
345 367
346 tree = &BTRFS_I(page->mapping->host)->io_tree; 368 tree = &BTRFS_I(page->mapping->host)->io_tree;
347 369
348 if (page->private == EXTENT_PAGE_PRIVATE) 370 if (page->private == EXTENT_PAGE_PRIVATE) {
371 WARN_ON(1);
349 goto out; 372 goto out;
350 if (!page->private) 373 }
374 if (!page->private) {
375 WARN_ON(1);
351 goto out; 376 goto out;
377 }
352 len = page->private >> 2; 378 len = page->private >> 2;
353 WARN_ON(len == 0); 379 WARN_ON(len == 0);
354 380
355 eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS); 381 eb = alloc_extent_buffer(tree, start, len, page);
382 if (eb == NULL) {
383 WARN_ON(1);
384 goto out;
385 }
356 ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE, 386 ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE,
357 btrfs_header_generation(eb)); 387 btrfs_header_generation(eb));
358 BUG_ON(ret); 388 BUG_ON(ret);
389 WARN_ON(!btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN));
390
359 found_start = btrfs_header_bytenr(eb); 391 found_start = btrfs_header_bytenr(eb);
360 if (found_start != start) { 392 if (found_start != start) {
361 WARN_ON(1); 393 WARN_ON(1);
@@ -369,8 +401,6 @@ static int csum_dirty_buffer(struct btrfs_root *root, struct page *page)
369 WARN_ON(1); 401 WARN_ON(1);
370 goto err; 402 goto err;
371 } 403 }
372 found_level = btrfs_header_level(eb);
373
374 csum_tree_block(root, eb, 0); 404 csum_tree_block(root, eb, 0);
375err: 405err:
376 free_extent_buffer(eb); 406 free_extent_buffer(eb);
@@ -397,6 +427,73 @@ static int check_tree_block_fsid(struct btrfs_root *root,
397 return ret; 427 return ret;
398} 428}
399 429
430#define CORRUPT(reason, eb, root, slot) \
431 printk(KERN_CRIT "btrfs: corrupt leaf, %s: block=%llu, " \
432 "root=%llu, slot=%d\n", reason, \
433 (unsigned long long)btrfs_header_bytenr(eb), \
434 (unsigned long long)root->objectid, slot)
435
436static noinline int check_leaf(struct btrfs_root *root,
437 struct extent_buffer *leaf)
438{
439 struct btrfs_key key;
440 struct btrfs_key leaf_key;
441 u32 nritems = btrfs_header_nritems(leaf);
442 int slot;
443
444 if (nritems == 0)
445 return 0;
446
447 /* Check the 0 item */
448 if (btrfs_item_offset_nr(leaf, 0) + btrfs_item_size_nr(leaf, 0) !=
449 BTRFS_LEAF_DATA_SIZE(root)) {
450 CORRUPT("invalid item offset size pair", leaf, root, 0);
451 return -EIO;
452 }
453
454 /*
455 * Check to make sure each items keys are in the correct order and their
456 * offsets make sense. We only have to loop through nritems-1 because
457 * we check the current slot against the next slot, which verifies the
458 * next slot's offset+size makes sense and that the current's slot
459 * offset is correct.
460 */
461 for (slot = 0; slot < nritems - 1; slot++) {
462 btrfs_item_key_to_cpu(leaf, &leaf_key, slot);
463 btrfs_item_key_to_cpu(leaf, &key, slot + 1);
464
465 /* Make sure the keys are in the right order */
466 if (btrfs_comp_cpu_keys(&leaf_key, &key) >= 0) {
467 CORRUPT("bad key order", leaf, root, slot);
468 return -EIO;
469 }
470
471 /*
472 * Make sure the offset and ends are right, remember that the
473 * item data starts at the end of the leaf and grows towards the
474 * front.
475 */
476 if (btrfs_item_offset_nr(leaf, slot) !=
477 btrfs_item_end_nr(leaf, slot + 1)) {
478 CORRUPT("slot offset bad", leaf, root, slot);
479 return -EIO;
480 }
481
482 /*
483 * Check to make sure that we don't point outside of the leaf,
484 * just in case all the items are consistent with each other, but
485 * all point outside of the leaf.
486 */
487 if (btrfs_item_end_nr(leaf, slot) >
488 BTRFS_LEAF_DATA_SIZE(root)) {
489 CORRUPT("slot end outside of leaf", leaf, root, slot);
490 return -EIO;
491 }
492 }
493
494 return 0;
495}
496
400#ifdef CONFIG_DEBUG_LOCK_ALLOC 497#ifdef CONFIG_DEBUG_LOCK_ALLOC
401void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level) 498void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level)
402{ 499{
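
check_leaf()'s two offset rules are easiest to see with concrete numbers. A toy model follows, assuming a 4096-byte leaf data area (the real bound is BTRFS_LEAF_DATA_SIZE(root), and btrfs_item_end_nr() is offset + size):

#include <stdio.h>
#include <assert.h>

#define LEAF_DATA_SIZE 4096

struct item { unsigned offset, size; };

int main(void)
{
	/*
	 * Item data is packed at the tail of the leaf and grows toward
	 * the front, so slot 0 owns the bytes ending at LEAF_DATA_SIZE.
	 */
	struct item items[] = {
		{ 4096 - 100, 100 },	/* slot 0: [3996, 4096) */
		{ 4096 - 150,  50 },	/* slot 1: [3946, 3996) */
		{ 4096 - 170,  20 },	/* slot 2: [3926, 3946) */
	};
	int nritems = 3;

	/* rule 1: slot 0 must end exactly at the leaf data size */
	assert(items[0].offset + items[0].size == LEAF_DATA_SIZE);

	/* rule 2: each slot must start exactly where the next slot ends */
	for (int slot = 0; slot < nritems - 1; slot++)
		assert(items[slot].offset ==
		       items[slot + 1].offset + items[slot + 1].size);

	puts("toy leaf layout is consistent");
	return 0;
}
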
@@ -426,16 +523,18 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
426 len = page->private >> 2; 523 len = page->private >> 2;
427 WARN_ON(len == 0); 524 WARN_ON(len == 0);
428 525
429 eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS); 526 eb = alloc_extent_buffer(tree, start, len, page);
527 if (eb == NULL) {
528 ret = -EIO;
529 goto out;
530 }
430 531
431 found_start = btrfs_header_bytenr(eb); 532 found_start = btrfs_header_bytenr(eb);
432 if (found_start != start) { 533 if (found_start != start) {
433 if (printk_ratelimit()) { 534 printk_ratelimited(KERN_INFO "btrfs bad tree block start "
434 printk(KERN_INFO "btrfs bad tree block start "
435 "%llu %llu\n", 535 "%llu %llu\n",
436 (unsigned long long)found_start, 536 (unsigned long long)found_start,
437 (unsigned long long)eb->start); 537 (unsigned long long)eb->start);
438 }
439 ret = -EIO; 538 ret = -EIO;
440 goto err; 539 goto err;
441 } 540 }
@@ -447,10 +546,8 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
447 goto err; 546 goto err;
448 } 547 }
449 if (check_tree_block_fsid(root, eb)) { 548 if (check_tree_block_fsid(root, eb)) {
450 if (printk_ratelimit()) { 549 printk_ratelimited(KERN_INFO "btrfs bad fsid on block %llu\n",
451 printk(KERN_INFO "btrfs bad fsid on block %llu\n",
452 (unsigned long long)eb->start); 550 (unsigned long long)eb->start);
453 }
454 ret = -EIO; 551 ret = -EIO;
455 goto err; 552 goto err;
456 } 553 }
@@ -459,8 +556,20 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
459 btrfs_set_buffer_lockdep_class(eb, found_level); 556 btrfs_set_buffer_lockdep_class(eb, found_level);
460 557
461 ret = csum_tree_block(root, eb, 1); 558 ret = csum_tree_block(root, eb, 1);
462 if (ret) 559 if (ret) {
463 ret = -EIO; 560 ret = -EIO;
561 goto err;
562 }
563
564 /*
565 * If this is a leaf block and it is corrupt, set the corrupt bit so
566 * that we don't try to read the other copies of this block, just
567 * return -EIO.
568 */
569 if (found_level == 0 && check_leaf(root, eb)) {
570 set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
571 ret = -EIO;
572 }
464 573
465 end = min_t(u64, eb->len, PAGE_CACHE_SIZE); 574 end = min_t(u64, eb->len, PAGE_CACHE_SIZE);
466 end = eb->start + end - 1; 575 end = eb->start + end - 1;
@@ -481,9 +590,12 @@ static void end_workqueue_bio(struct bio *bio, int err)
481 end_io_wq->work.flags = 0; 590 end_io_wq->work.flags = 0;
482 591
483 if (bio->bi_rw & REQ_WRITE) { 592 if (bio->bi_rw & REQ_WRITE) {
484 if (end_io_wq->metadata) 593 if (end_io_wq->metadata == 1)
485 btrfs_queue_worker(&fs_info->endio_meta_write_workers, 594 btrfs_queue_worker(&fs_info->endio_meta_write_workers,
486 &end_io_wq->work); 595 &end_io_wq->work);
596 else if (end_io_wq->metadata == 2)
597 btrfs_queue_worker(&fs_info->endio_freespace_worker,
598 &end_io_wq->work);
487 else 599 else
488 btrfs_queue_worker(&fs_info->endio_write_workers, 600 btrfs_queue_worker(&fs_info->endio_write_workers,
489 &end_io_wq->work); 601 &end_io_wq->work);
@@ -497,6 +609,13 @@ static void end_workqueue_bio(struct bio *bio, int err)
497 } 609 }
498} 610}
499 611
612/*
613 * For the metadata arg you want
614 *
615 * 0 - if data
616 * 1 - if normal metadata
617 * 2 - if writing to the free space cache area
618 */
500int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, 619int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
501 int metadata) 620 int metadata)
502{ 621{
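
A hedged sketch of a submit path choosing the metadata class documented above; only btrfs_bio_wq_end_io() and its 0/1/2 convention come from this patch, the surrounding helper and flags are invented for illustration:

/* sketch, in-tree context assumed; not a real btrfs function */
static int example_submit(struct btrfs_fs_info *info, struct bio *bio,
			  int is_metadata, int is_free_space_cache)
{
	int metadata = 0;		/* 0 - plain data */

	if (is_metadata)
		metadata = 1;		/* 1 - normal metadata */
	if (is_free_space_cache)
		metadata = 2;		/* 2 - free space cache writeout */

	/* route this bio's completion to the matching end_io worker pool */
	return btrfs_bio_wq_end_io(info, bio, metadata);
}
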
@@ -525,19 +644,11 @@ unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info)
525 return 256 * limit; 644 return 256 * limit;
526} 645}
527 646
528int btrfs_congested_async(struct btrfs_fs_info *info, int iodone)
529{
530 return atomic_read(&info->nr_async_bios) >
531 btrfs_async_submit_limit(info);
532}
533
534static void run_one_async_start(struct btrfs_work *work) 647static void run_one_async_start(struct btrfs_work *work)
535{ 648{
536 struct btrfs_fs_info *fs_info;
537 struct async_submit_bio *async; 649 struct async_submit_bio *async;
538 650
539 async = container_of(work, struct async_submit_bio, work); 651 async = container_of(work, struct async_submit_bio, work);
540 fs_info = BTRFS_I(async->inode)->root->fs_info;
541 async->submit_bio_start(async->inode, async->rw, async->bio, 652 async->submit_bio_start(async->inode, async->rw, async->bio,
542 async->mirror_num, async->bio_flags, 653 async->mirror_num, async->bio_flags,
543 async->bio_offset); 654 async->bio_offset);
@@ -688,6 +799,27 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
688 __btree_submit_bio_done); 799 __btree_submit_bio_done);
689} 800}
690 801
802#ifdef CONFIG_MIGRATION
803static int btree_migratepage(struct address_space *mapping,
804 struct page *newpage, struct page *page)
805{
806 /*
807 * we can't safely write a btree page from here;
808 * we haven't done the locking hook
809 */
810 if (PageDirty(page))
811 return -EAGAIN;
812 /*
813 * Buffers may be managed in a filesystem specific way.
814 * We must have no buffers or drop them.
815 */
816 if (page_has_private(page) &&
817 !try_to_release_page(page, GFP_KERNEL))
818 return -EAGAIN;
819 return migrate_page(mapping, newpage, page);
820}
821#endif
822
691static int btree_writepage(struct page *page, struct writeback_control *wbc) 823static int btree_writepage(struct page *page, struct writeback_control *wbc)
692{ 824{
693 struct extent_io_tree *tree; 825 struct extent_io_tree *tree;
@@ -702,8 +834,7 @@ static int btree_writepage(struct page *page, struct writeback_control *wbc)
702 } 834 }
703 835
704 redirty_page_for_writepage(wbc, page); 836 redirty_page_for_writepage(wbc, page);
705 eb = btrfs_find_tree_block(root, page_offset(page), 837 eb = btrfs_find_tree_block(root, page_offset(page), PAGE_CACHE_SIZE);
706 PAGE_CACHE_SIZE);
707 WARN_ON(!eb); 838 WARN_ON(!eb);
708 839
709 was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); 840 was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
@@ -793,7 +924,9 @@ static const struct address_space_operations btree_aops = {
793 .writepages = btree_writepages, 924 .writepages = btree_writepages,
794 .releasepage = btree_releasepage, 925 .releasepage = btree_releasepage,
795 .invalidatepage = btree_invalidatepage, 926 .invalidatepage = btree_invalidatepage,
796 .sync_page = block_sync_page, 927#ifdef CONFIG_MIGRATION
928 .migratepage = btree_migratepage,
929#endif
797}; 930};
798 931
799int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, 932int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
@@ -818,7 +951,7 @@ struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
818 struct inode *btree_inode = root->fs_info->btree_inode; 951 struct inode *btree_inode = root->fs_info->btree_inode;
819 struct extent_buffer *eb; 952 struct extent_buffer *eb;
820 eb = find_extent_buffer(&BTRFS_I(btree_inode)->io_tree, 953 eb = find_extent_buffer(&BTRFS_I(btree_inode)->io_tree,
821 bytenr, blocksize, GFP_NOFS); 954 bytenr, blocksize);
822 return eb; 955 return eb;
823} 956}
824 957
@@ -829,7 +962,7 @@ struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
829 struct extent_buffer *eb; 962 struct extent_buffer *eb;
830 963
831 eb = alloc_extent_buffer(&BTRFS_I(btree_inode)->io_tree, 964 eb = alloc_extent_buffer(&BTRFS_I(btree_inode)->io_tree,
832 bytenr, blocksize, NULL, GFP_NOFS); 965 bytenr, blocksize, NULL);
833 return eb; 966 return eb;
834} 967}
835 968
@@ -850,12 +983,8 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
850 u32 blocksize, u64 parent_transid) 983 u32 blocksize, u64 parent_transid)
851{ 984{
852 struct extent_buffer *buf = NULL; 985 struct extent_buffer *buf = NULL;
853 struct inode *btree_inode = root->fs_info->btree_inode;
854 struct extent_io_tree *io_tree;
855 int ret; 986 int ret;
856 987
857 io_tree = &BTRFS_I(btree_inode)->io_tree;
858
859 buf = btrfs_find_create_tree_block(root, bytenr, blocksize); 988 buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
860 if (!buf) 989 if (!buf)
861 return NULL; 990 return NULL;
@@ -915,15 +1044,14 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
915 root->last_trans = 0; 1044 root->last_trans = 0;
916 root->highest_objectid = 0; 1045 root->highest_objectid = 0;
917 root->name = NULL; 1046 root->name = NULL;
918 root->in_sysfs = 0;
919 root->inode_tree = RB_ROOT; 1047 root->inode_tree = RB_ROOT;
1048 INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC);
920 root->block_rsv = NULL; 1049 root->block_rsv = NULL;
921 root->orphan_block_rsv = NULL; 1050 root->orphan_block_rsv = NULL;
922 1051
923 INIT_LIST_HEAD(&root->dirty_list); 1052 INIT_LIST_HEAD(&root->dirty_list);
924 INIT_LIST_HEAD(&root->orphan_list); 1053 INIT_LIST_HEAD(&root->orphan_list);
925 INIT_LIST_HEAD(&root->root_list); 1054 INIT_LIST_HEAD(&root->root_list);
926 spin_lock_init(&root->node_lock);
927 spin_lock_init(&root->orphan_lock); 1055 spin_lock_init(&root->orphan_lock);
928 spin_lock_init(&root->inode_lock); 1056 spin_lock_init(&root->inode_lock);
929 spin_lock_init(&root->accounting_lock); 1057 spin_lock_init(&root->accounting_lock);
@@ -939,7 +1067,7 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize,
939 root->log_transid = 0; 1067 root->log_transid = 0;
940 root->last_log_commit = 0; 1068 root->last_log_commit = 0;
941 extent_io_tree_init(&root->dirty_log_pages, 1069 extent_io_tree_init(&root->dirty_log_pages,
942 fs_info->btree_inode->i_mapping, GFP_NOFS); 1070 fs_info->btree_inode->i_mapping);
943 1071
944 memset(&root->root_key, 0, sizeof(root->root_key)); 1072 memset(&root->root_key, 0, sizeof(root->root_key));
945 memset(&root->root_item, 0, sizeof(root->root_item)); 1073 memset(&root->root_item, 0, sizeof(root->root_item));
@@ -980,7 +1108,10 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
980 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); 1108 blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
981 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), 1109 root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
982 blocksize, generation); 1110 blocksize, generation);
983 BUG_ON(!root->node); 1111 if (!root->node || !btrfs_buffer_uptodate(root->node, generation)) {
1112 free_extent_buffer(root->node);
1113 return -EIO;
1114 }
984 root->commit_root = btrfs_root_node(root); 1115 root->commit_root = btrfs_root_node(root);
985 return 0; 1116 return 0;
986} 1117}
@@ -1104,7 +1235,10 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
1104 root, fs_info, location->objectid); 1235 root, fs_info, location->objectid);
1105 1236
1106 path = btrfs_alloc_path(); 1237 path = btrfs_alloc_path();
1107 BUG_ON(!path); 1238 if (!path) {
1239 kfree(root);
1240 return ERR_PTR(-ENOMEM);
1241 }
1108 ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0); 1242 ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0);
1109 if (ret == 0) { 1243 if (ret == 0) {
1110 l = path->nodes[0]; 1244 l = path->nodes[0];
@@ -1115,6 +1249,7 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
1115 } 1249 }
1116 btrfs_free_path(path); 1250 btrfs_free_path(path);
1117 if (ret) { 1251 if (ret) {
1252 kfree(root);
1118 if (ret > 0) 1253 if (ret > 0)
1119 ret = -ENOENT; 1254 ret = -ENOENT;
1120 return ERR_PTR(ret); 1255 return ERR_PTR(ret);
@@ -1127,27 +1262,14 @@ struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
1127 root->commit_root = btrfs_root_node(root); 1262 root->commit_root = btrfs_root_node(root);
1128 BUG_ON(!root->node); 1263 BUG_ON(!root->node);
1129out: 1264out:
1130 if (location->objectid != BTRFS_TREE_LOG_OBJECTID) 1265 if (location->objectid != BTRFS_TREE_LOG_OBJECTID) {
1131 root->ref_cows = 1; 1266 root->ref_cows = 1;
1267 btrfs_check_and_init_root_item(&root->root_item);
1268 }
1132 1269
1133 return root; 1270 return root;
1134} 1271}
1135 1272
1136struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
1137 u64 root_objectid)
1138{
1139 struct btrfs_root *root;
1140
1141 if (root_objectid == BTRFS_ROOT_TREE_OBJECTID)
1142 return fs_info->tree_root;
1143 if (root_objectid == BTRFS_EXTENT_TREE_OBJECTID)
1144 return fs_info->extent_root;
1145
1146 root = radix_tree_lookup(&fs_info->fs_roots_radix,
1147 (unsigned long)root_objectid);
1148 return root;
1149}
1150
1151struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, 1273struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
1152 struct btrfs_key *location) 1274 struct btrfs_key *location)
1153{ 1275{
@@ -1176,7 +1298,22 @@ again:
1176 if (IS_ERR(root)) 1298 if (IS_ERR(root))
1177 return root; 1299 return root;
1178 1300
1179 set_anon_super(&root->anon_super, NULL); 1301 root->free_ino_ctl = kzalloc(sizeof(*root->free_ino_ctl), GFP_NOFS);
1302 root->free_ino_pinned = kzalloc(sizeof(*root->free_ino_pinned),
1303 GFP_NOFS);
1304 if (!root->free_ino_pinned || !root->free_ino_ctl) {
1305 ret = -ENOMEM;
1306 goto fail;
1307 }
1308
1309 btrfs_init_free_ino_ctl(root);
1310 mutex_init(&root->fs_commit_mutex);
1311 spin_lock_init(&root->cache_lock);
1312 init_waitqueue_head(&root->cache_wait);
1313
1314 ret = set_anon_super(&root->anon_super, NULL);
1315 if (ret)
1316 goto fail;
1180 1317
1181 if (btrfs_root_refs(&root->root_item) == 0) { 1318 if (btrfs_root_refs(&root->root_item) == 0) {
1182 ret = -ENOENT; 1319 ret = -ENOENT;
@@ -1219,41 +1356,6 @@ fail:
1219 return ERR_PTR(ret); 1356 return ERR_PTR(ret);
1220} 1357}
1221 1358
1222struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
1223 struct btrfs_key *location,
1224 const char *name, int namelen)
1225{
1226 return btrfs_read_fs_root_no_name(fs_info, location);
1227#if 0
1228 struct btrfs_root *root;
1229 int ret;
1230
1231 root = btrfs_read_fs_root_no_name(fs_info, location);
1232 if (!root)
1233 return NULL;
1234
1235 if (root->in_sysfs)
1236 return root;
1237
1238 ret = btrfs_set_root_name(root, name, namelen);
1239 if (ret) {
1240 free_extent_buffer(root->node);
1241 kfree(root);
1242 return ERR_PTR(ret);
1243 }
1244
1245 ret = btrfs_sysfs_add_root(root);
1246 if (ret) {
1247 free_extent_buffer(root->node);
1248 kfree(root->name);
1249 kfree(root);
1250 return ERR_PTR(ret);
1251 }
1252 root->in_sysfs = 1;
1253 return root;
1254#endif
1255}
1256
1257static int btrfs_congested_fn(void *congested_data, int bdi_bits) 1359static int btrfs_congested_fn(void *congested_data, int bdi_bits)
1258{ 1360{
1259 struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data; 1361 struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data;
@@ -1261,7 +1363,8 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
1261 struct btrfs_device *device; 1363 struct btrfs_device *device;
1262 struct backing_dev_info *bdi; 1364 struct backing_dev_info *bdi;
1263 1365
1264 list_for_each_entry(device, &info->fs_devices->devices, dev_list) { 1366 rcu_read_lock();
1367 list_for_each_entry_rcu(device, &info->fs_devices->devices, dev_list) {
1265 if (!device->bdev) 1368 if (!device->bdev)
1266 continue; 1369 continue;
1267 bdi = blk_get_backing_dev_info(device->bdev); 1370 bdi = blk_get_backing_dev_info(device->bdev);
@@ -1270,86 +1373,11 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits)
1270 break; 1373 break;
1271 } 1374 }
1272 } 1375 }
1376 rcu_read_unlock();
1273 return ret; 1377 return ret;
1274} 1378}
1275 1379
1276/* 1380/*
1277 * this unplugs every device on the box, and it is only used when page
1278 * is null
1279 */
1280static void __unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
1281{
1282 struct btrfs_device *device;
1283 struct btrfs_fs_info *info;
1284
1285 info = (struct btrfs_fs_info *)bdi->unplug_io_data;
1286 list_for_each_entry(device, &info->fs_devices->devices, dev_list) {
1287 if (!device->bdev)
1288 continue;
1289
1290 bdi = blk_get_backing_dev_info(device->bdev);
1291 if (bdi->unplug_io_fn)
1292 bdi->unplug_io_fn(bdi, page);
1293 }
1294}
1295
1296static void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
1297{
1298 struct inode *inode;
1299 struct extent_map_tree *em_tree;
1300 struct extent_map *em;
1301 struct address_space *mapping;
1302 u64 offset;
1303
1304 /* the generic O_DIRECT read code does this */
1305 if (1 || !page) {
1306 __unplug_io_fn(bdi, page);
1307 return;
1308 }
1309
1310 /*
1311 * page->mapping may change at any time. Get a consistent copy
1312 * and use that for everything below
1313 */
1314 smp_mb();
1315 mapping = page->mapping;
1316 if (!mapping)
1317 return;
1318
1319 inode = mapping->host;
1320
1321 /*
1322 * don't do the expensive searching for a small number of
1323 * devices
1324 */
1325 if (BTRFS_I(inode)->root->fs_info->fs_devices->open_devices <= 2) {
1326 __unplug_io_fn(bdi, page);
1327 return;
1328 }
1329
1330 offset = page_offset(page);
1331
1332 em_tree = &BTRFS_I(inode)->extent_tree;
1333 read_lock(&em_tree->lock);
1334 em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE);
1335 read_unlock(&em_tree->lock);
1336 if (!em) {
1337 __unplug_io_fn(bdi, page);
1338 return;
1339 }
1340
1341 if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
1342 free_extent_map(em);
1343 __unplug_io_fn(bdi, page);
1344 return;
1345 }
1346 offset = offset - em->start;
1347 btrfs_unplug_page(&BTRFS_I(inode)->root->fs_info->mapping_tree,
1348 em->block_start + offset, page);
1349 free_extent_map(em);
1350}
1351
1352/*
1353 * If this fails, caller must call bdi_destroy() to get rid of the 1381 * If this fails, caller must call bdi_destroy() to get rid of the
1354 * bdi again. 1382 * bdi again.
1355 */ 1383 */
@@ -1363,8 +1391,6 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi)
1363 return err; 1391 return err;
1364 1392
1365 bdi->ra_pages = default_backing_dev_info.ra_pages; 1393 bdi->ra_pages = default_backing_dev_info.ra_pages;
1366 bdi->unplug_io_fn = btrfs_unplug_io_fn;
1367 bdi->unplug_io_data = info;
1368 bdi->congested_fn = btrfs_congested_fn; 1394 bdi->congested_fn = btrfs_congested_fn;
1369 bdi->congested_data = info; 1395 bdi->congested_data = info;
1370 return 0; 1396 return 0;
@@ -1377,7 +1403,6 @@ static int bio_ready_for_csum(struct bio *bio)
1377 u64 start = 0; 1403 u64 start = 0;
1378 struct page *page; 1404 struct page *page;
1379 struct extent_io_tree *io_tree = NULL; 1405 struct extent_io_tree *io_tree = NULL;
1380 struct btrfs_fs_info *info = NULL;
1381 struct bio_vec *bvec; 1406 struct bio_vec *bvec;
1382 int i; 1407 int i;
1383 int ret; 1408 int ret;
@@ -1396,7 +1421,6 @@ static int bio_ready_for_csum(struct bio *bio)
1396 buf_len = page->private >> 2; 1421 buf_len = page->private >> 2;
1397 start = page_offset(page) + bvec->bv_offset; 1422 start = page_offset(page) + bvec->bv_offset;
1398 io_tree = &BTRFS_I(page->mapping->host)->io_tree; 1423 io_tree = &BTRFS_I(page->mapping->host)->io_tree;
1399 info = BTRFS_I(page->mapping->host)->root->fs_info;
1400 } 1424 }
1401 /* are we fully contained in this bio? */ 1425 /* are we fully contained in this bio? */
1402 if (buf_len <= length) 1426 if (buf_len <= length)
@@ -1452,6 +1476,7 @@ static int cleaner_kthread(void *arg)
1452 btrfs_run_delayed_iputs(root); 1476 btrfs_run_delayed_iputs(root);
1453 btrfs_clean_old_snapshots(root); 1477 btrfs_clean_old_snapshots(root);
1454 mutex_unlock(&root->fs_info->cleaner_mutex); 1478 mutex_unlock(&root->fs_info->cleaner_mutex);
1479 btrfs_run_defrag_inodes(root->fs_info);
1455 } 1480 }
1456 1481
1457 if (freezing(current)) { 1482 if (freezing(current)) {
@@ -1481,24 +1506,25 @@ static int transaction_kthread(void *arg)
1481 vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); 1506 vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
1482 mutex_lock(&root->fs_info->transaction_kthread_mutex); 1507 mutex_lock(&root->fs_info->transaction_kthread_mutex);
1483 1508
1484 spin_lock(&root->fs_info->new_trans_lock); 1509 spin_lock(&root->fs_info->trans_lock);
1485 cur = root->fs_info->running_transaction; 1510 cur = root->fs_info->running_transaction;
1486 if (!cur) { 1511 if (!cur) {
1487 spin_unlock(&root->fs_info->new_trans_lock); 1512 spin_unlock(&root->fs_info->trans_lock);
1488 goto sleep; 1513 goto sleep;
1489 } 1514 }
1490 1515
1491 now = get_seconds(); 1516 now = get_seconds();
1492 if (!cur->blocked && 1517 if (!cur->blocked &&
1493 (now < cur->start_time || now - cur->start_time < 30)) { 1518 (now < cur->start_time || now - cur->start_time < 30)) {
1494 spin_unlock(&root->fs_info->new_trans_lock); 1519 spin_unlock(&root->fs_info->trans_lock);
1495 delay = HZ * 5; 1520 delay = HZ * 5;
1496 goto sleep; 1521 goto sleep;
1497 } 1522 }
1498 transid = cur->transid; 1523 transid = cur->transid;
1499 spin_unlock(&root->fs_info->new_trans_lock); 1524 spin_unlock(&root->fs_info->trans_lock);
1500 1525
1501 trans = btrfs_join_transaction(root, 1); 1526 trans = btrfs_join_transaction(root);
1527 BUG_ON(IS_ERR(trans));
1502 if (transid == trans->transid) { 1528 if (transid == trans->transid) {
1503 ret = btrfs_commit_transaction(trans, root); 1529 ret = btrfs_commit_transaction(trans, root);
1504 BUG_ON(ret); 1530 BUG_ON(ret);
@@ -1539,10 +1565,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1539 GFP_NOFS); 1565 GFP_NOFS);
1540 struct btrfs_root *csum_root = kzalloc(sizeof(struct btrfs_root), 1566 struct btrfs_root *csum_root = kzalloc(sizeof(struct btrfs_root),
1541 GFP_NOFS); 1567 GFP_NOFS);
1542 struct btrfs_root *tree_root = kzalloc(sizeof(struct btrfs_root), 1568 struct btrfs_root *tree_root = btrfs_sb(sb);
1543 GFP_NOFS); 1569 struct btrfs_fs_info *fs_info = NULL;
1544 struct btrfs_fs_info *fs_info = kzalloc(sizeof(*fs_info),
1545 GFP_NOFS);
1546 struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root), 1570 struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root),
1547 GFP_NOFS); 1571 GFP_NOFS);
1548 struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root), 1572 struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root),
@@ -1554,11 +1578,12 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1554 1578
1555 struct btrfs_super_block *disk_super; 1579 struct btrfs_super_block *disk_super;
1556 1580
1557 if (!extent_root || !tree_root || !fs_info || 1581 if (!extent_root || !tree_root || !tree_root->fs_info ||
1558 !chunk_root || !dev_root || !csum_root) { 1582 !chunk_root || !dev_root || !csum_root) {
1559 err = -ENOMEM; 1583 err = -ENOMEM;
1560 goto fail; 1584 goto fail;
1561 } 1585 }
1586 fs_info = tree_root->fs_info;
1562 1587
1563 ret = init_srcu_struct(&fs_info->subvol_srcu); 1588 ret = init_srcu_struct(&fs_info->subvol_srcu);
1564 if (ret) { 1589 if (ret) {
@@ -1578,6 +1603,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1578 goto fail_bdi; 1603 goto fail_bdi;
1579 } 1604 }
1580 1605
1606 fs_info->btree_inode->i_mapping->flags &= ~__GFP_FS;
1607
1581 INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); 1608 INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
1582 INIT_LIST_HEAD(&fs_info->trans_list); 1609 INIT_LIST_HEAD(&fs_info->trans_list);
1583 INIT_LIST_HEAD(&fs_info->dead_roots); 1610 INIT_LIST_HEAD(&fs_info->dead_roots);
@@ -1587,10 +1614,12 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1587 INIT_LIST_HEAD(&fs_info->ordered_operations); 1614 INIT_LIST_HEAD(&fs_info->ordered_operations);
1588 INIT_LIST_HEAD(&fs_info->caching_block_groups); 1615 INIT_LIST_HEAD(&fs_info->caching_block_groups);
1589 spin_lock_init(&fs_info->delalloc_lock); 1616 spin_lock_init(&fs_info->delalloc_lock);
1590 spin_lock_init(&fs_info->new_trans_lock); 1617 spin_lock_init(&fs_info->trans_lock);
1591 spin_lock_init(&fs_info->ref_cache_lock); 1618 spin_lock_init(&fs_info->ref_cache_lock);
1592 spin_lock_init(&fs_info->fs_roots_radix_lock); 1619 spin_lock_init(&fs_info->fs_roots_radix_lock);
1593 spin_lock_init(&fs_info->delayed_iput_lock); 1620 spin_lock_init(&fs_info->delayed_iput_lock);
1621 spin_lock_init(&fs_info->defrag_inodes_lock);
1622 mutex_init(&fs_info->reloc_mutex);
1594 1623
1595 init_completion(&fs_info->kobj_unregister); 1624 init_completion(&fs_info->kobj_unregister);
1596 fs_info->tree_root = tree_root; 1625 fs_info->tree_root = tree_root;
@@ -1613,15 +1642,34 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1613 atomic_set(&fs_info->async_delalloc_pages, 0); 1642 atomic_set(&fs_info->async_delalloc_pages, 0);
1614 atomic_set(&fs_info->async_submit_draining, 0); 1643 atomic_set(&fs_info->async_submit_draining, 0);
1615 atomic_set(&fs_info->nr_async_bios, 0); 1644 atomic_set(&fs_info->nr_async_bios, 0);
1645 atomic_set(&fs_info->defrag_running, 0);
1616 fs_info->sb = sb; 1646 fs_info->sb = sb;
1617 fs_info->max_inline = 8192 * 1024; 1647 fs_info->max_inline = 8192 * 1024;
1618 fs_info->metadata_ratio = 0; 1648 fs_info->metadata_ratio = 0;
1649 fs_info->defrag_inodes = RB_ROOT;
1650 fs_info->trans_no_join = 0;
1619 1651
1620 fs_info->thread_pool_size = min_t(unsigned long, 1652 fs_info->thread_pool_size = min_t(unsigned long,
1621 num_online_cpus() + 2, 8); 1653 num_online_cpus() + 2, 8);
1622 1654
1623 INIT_LIST_HEAD(&fs_info->ordered_extents); 1655 INIT_LIST_HEAD(&fs_info->ordered_extents);
1624 spin_lock_init(&fs_info->ordered_extent_lock); 1656 spin_lock_init(&fs_info->ordered_extent_lock);
1657 fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root),
1658 GFP_NOFS);
1659 if (!fs_info->delayed_root) {
1660 err = -ENOMEM;
1661 goto fail_iput;
1662 }
1663 btrfs_init_delayed_root(fs_info->delayed_root);
1664
1665 mutex_init(&fs_info->scrub_lock);
1666 atomic_set(&fs_info->scrubs_running, 0);
1667 atomic_set(&fs_info->scrub_pause_req, 0);
1668 atomic_set(&fs_info->scrubs_paused, 0);
1669 atomic_set(&fs_info->scrub_cancel_req, 0);
1670 init_waitqueue_head(&fs_info->scrub_pause_wait);
1671 init_rwsem(&fs_info->scrub_super_lock);
1672 fs_info->scrub_workers_refcnt = 0;
1625 1673
1626 sb->s_blocksize = 4096; 1674 sb->s_blocksize = 4096;
1627 sb->s_blocksize_bits = blksize_bits(4096); 1675 sb->s_blocksize_bits = blksize_bits(4096);
@@ -1640,10 +1688,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1640 1688
1641 RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node); 1689 RB_CLEAR_NODE(&BTRFS_I(fs_info->btree_inode)->rb_node);
1642 extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree, 1690 extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree,
1643 fs_info->btree_inode->i_mapping, 1691 fs_info->btree_inode->i_mapping);
1644 GFP_NOFS); 1692 extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree);
1645 extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree,
1646 GFP_NOFS);
1647 1693
1648 BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops; 1694 BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops;
1649 1695
@@ -1657,14 +1703,13 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1657 fs_info->block_group_cache_tree = RB_ROOT; 1703 fs_info->block_group_cache_tree = RB_ROOT;
1658 1704
1659 extent_io_tree_init(&fs_info->freed_extents[0], 1705 extent_io_tree_init(&fs_info->freed_extents[0],
1660 fs_info->btree_inode->i_mapping, GFP_NOFS); 1706 fs_info->btree_inode->i_mapping);
1661 extent_io_tree_init(&fs_info->freed_extents[1], 1707 extent_io_tree_init(&fs_info->freed_extents[1],
1662 fs_info->btree_inode->i_mapping, GFP_NOFS); 1708 fs_info->btree_inode->i_mapping);
1663 fs_info->pinned_extents = &fs_info->freed_extents[0]; 1709 fs_info->pinned_extents = &fs_info->freed_extents[0];
1664 fs_info->do_barriers = 1; 1710 fs_info->do_barriers = 1;
1665 1711
1666 1712
1667 mutex_init(&fs_info->trans_mutex);
1668 mutex_init(&fs_info->ordered_operations_mutex); 1713 mutex_init(&fs_info->ordered_operations_mutex);
1669 mutex_init(&fs_info->tree_log_mutex); 1714 mutex_init(&fs_info->tree_log_mutex);
1670 mutex_init(&fs_info->chunk_mutex); 1715 mutex_init(&fs_info->chunk_mutex);
@@ -1680,15 +1725,17 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1680 1725
1681 init_waitqueue_head(&fs_info->transaction_throttle); 1726 init_waitqueue_head(&fs_info->transaction_throttle);
1682 init_waitqueue_head(&fs_info->transaction_wait); 1727 init_waitqueue_head(&fs_info->transaction_wait);
1728 init_waitqueue_head(&fs_info->transaction_blocked_wait);
1683 init_waitqueue_head(&fs_info->async_submit_wait); 1729 init_waitqueue_head(&fs_info->async_submit_wait);
1684 1730
1685 __setup_root(4096, 4096, 4096, 4096, tree_root, 1731 __setup_root(4096, 4096, 4096, 4096, tree_root,
1686 fs_info, BTRFS_ROOT_TREE_OBJECTID); 1732 fs_info, BTRFS_ROOT_TREE_OBJECTID);
1687 1733
1688
1689 bh = btrfs_read_dev_super(fs_devices->latest_bdev); 1734 bh = btrfs_read_dev_super(fs_devices->latest_bdev);
1690 if (!bh) 1735 if (!bh) {
1691 goto fail_iput; 1736 err = -EINVAL;
1737 goto fail_alloc;
1738 }
1692 1739
1693 memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy)); 1740 memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy));
1694 memcpy(&fs_info->super_for_commit, &fs_info->super_copy, 1741 memcpy(&fs_info->super_for_commit, &fs_info->super_copy,
@@ -1699,12 +1746,23 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1699 1746
1700 disk_super = &fs_info->super_copy; 1747 disk_super = &fs_info->super_copy;
1701 if (!btrfs_super_root(disk_super)) 1748 if (!btrfs_super_root(disk_super))
1702 goto fail_iput; 1749 goto fail_alloc;
1750
1751 /* check FS state, whether FS is broken. */
1752 fs_info->fs_state |= btrfs_super_flags(disk_super);
1753
1754 btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);
1755
1756 /*
1757 * In the long term, we'll store the compression type in the super
1758 * block, and it'll be used for per file compression control.
1759 */
1760 fs_info->compress_type = BTRFS_COMPRESS_ZLIB;
1703 1761
1704 ret = btrfs_parse_options(tree_root, options); 1762 ret = btrfs_parse_options(tree_root, options);
1705 if (ret) { 1763 if (ret) {
1706 err = ret; 1764 err = ret;
1707 goto fail_iput; 1765 goto fail_alloc;
1708 } 1766 }
1709 1767
1710 features = btrfs_super_incompat_flags(disk_super) & 1768 features = btrfs_super_incompat_flags(disk_super) &
@@ -1714,14 +1772,14 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1714 "unsupported optional features (%Lx).\n", 1772 "unsupported optional features (%Lx).\n",
1715 (unsigned long long)features); 1773 (unsigned long long)features);
1716 err = -EINVAL; 1774 err = -EINVAL;
1717 goto fail_iput; 1775 goto fail_alloc;
1718 } 1776 }
1719 1777
1720 features = btrfs_super_incompat_flags(disk_super); 1778 features = btrfs_super_incompat_flags(disk_super);
1721 if (!(features & BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF)) { 1779 features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
1722 features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF; 1780 if (tree_root->fs_info->compress_type & BTRFS_COMPRESS_LZO)
1723 btrfs_set_super_incompat_flags(disk_super, features); 1781 features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
1724 } 1782 btrfs_set_super_incompat_flags(disk_super, features);
1725 1783
1726 features = btrfs_super_compat_ro_flags(disk_super) & 1784 features = btrfs_super_compat_ro_flags(disk_super) &
1727 ~BTRFS_FEATURE_COMPAT_RO_SUPP; 1785 ~BTRFS_FEATURE_COMPAT_RO_SUPP;
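The incompat handling above now unconditionally ORs in MIXED_BACKREF and, when LZO compression is selected, also persists COMPRESS_LZO so that older kernels refuse the mount rather than misparse LZO extents. A hedged sketch of this widening-only flag update (the bit values below are illustrative stand-ins):

    #include <stdint.h>

    #define INCOMPAT_MIXED_BACKREF  (1ULL << 0)     /* stand-in bits */
    #define INCOMPAT_COMPRESS_LZO   (1ULL << 3)

    static uint64_t add_incompat_flags(uint64_t super_flags, int using_lzo)
    {
            /* The mask only ever grows; a kernel that does not
             * recognize a set bit must refuse the mount. */
            super_flags |= INCOMPAT_MIXED_BACKREF;
            if (using_lzo)
                    super_flags |= INCOMPAT_COMPRESS_LZO;
            return super_flags;
    }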
@@ -1730,7 +1788,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1730 "unsupported option features (%Lx).\n", 1788 "unsupported option features (%Lx).\n",
1731 (unsigned long long)features); 1789 (unsigned long long)features);
1732 err = -EINVAL; 1790 err = -EINVAL;
1733 goto fail_iput; 1791 goto fail_alloc;
1734 } 1792 }
1735 1793
1736 btrfs_init_workers(&fs_info->generic_worker, 1794 btrfs_init_workers(&fs_info->generic_worker,
@@ -1775,6 +1833,11 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1775 btrfs_init_workers(&fs_info->endio_write_workers, "endio-write", 1833 btrfs_init_workers(&fs_info->endio_write_workers, "endio-write",
1776 fs_info->thread_pool_size, 1834 fs_info->thread_pool_size,
1777 &fs_info->generic_worker); 1835 &fs_info->generic_worker);
1836 btrfs_init_workers(&fs_info->endio_freespace_worker, "freespace-write",
1837 1, &fs_info->generic_worker);
1838 btrfs_init_workers(&fs_info->delayed_workers, "delayed-meta",
1839 fs_info->thread_pool_size,
1840 &fs_info->generic_worker);
1778 1841
1779 /* 1842 /*
1780 * endios are largely parallel and should have a very 1843 * endios are largely parallel and should have a very
@@ -1795,6 +1858,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1795 btrfs_start_workers(&fs_info->endio_meta_workers, 1); 1858 btrfs_start_workers(&fs_info->endio_meta_workers, 1);
1796 btrfs_start_workers(&fs_info->endio_meta_write_workers, 1); 1859 btrfs_start_workers(&fs_info->endio_meta_write_workers, 1);
1797 btrfs_start_workers(&fs_info->endio_write_workers, 1); 1860 btrfs_start_workers(&fs_info->endio_write_workers, 1);
1861 btrfs_start_workers(&fs_info->endio_freespace_worker, 1);
1862 btrfs_start_workers(&fs_info->delayed_workers, 1);
1798 1863
1799 fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); 1864 fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
1800 fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, 1865 fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
@@ -1903,6 +1968,12 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1903 fs_info->metadata_alloc_profile = (u64)-1; 1968 fs_info->metadata_alloc_profile = (u64)-1;
1904 fs_info->system_alloc_profile = fs_info->metadata_alloc_profile; 1969 fs_info->system_alloc_profile = fs_info->metadata_alloc_profile;
1905 1970
1971 ret = btrfs_init_space_info(fs_info);
1972 if (ret) {
1973 printk(KERN_ERR "Failed to initialize space info: %d\n", ret);
1974 goto fail_block_groups;
1975 }
1976
1906 ret = btrfs_read_block_groups(extent_root); 1977 ret = btrfs_read_block_groups(extent_root);
1907 if (ret) { 1978 if (ret) {
1908 printk(KERN_ERR "Failed to read block groups: %d\n", ret); 1979 printk(KERN_ERR "Failed to read block groups: %d\n", ret);
@@ -1928,7 +1999,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1928 btrfs_set_opt(fs_info->mount_opt, SSD); 1999 btrfs_set_opt(fs_info->mount_opt, SSD);
1929 } 2000 }
1930 2001
1931 if (btrfs_super_log_root(disk_super) != 0) { 2002 /* do not make disk changes in broken FS */
2003 if (btrfs_super_log_root(disk_super) != 0 &&
2004 !(fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)) {
1932 u64 bytenr = btrfs_super_log_root(disk_super); 2005 u64 bytenr = btrfs_super_log_root(disk_super);
1933 2006
1934 if (fs_devices->rw_devices == 0) { 2007 if (fs_devices->rw_devices == 0) {
@@ -1992,8 +2065,14 @@ struct btrfs_root *open_ctree(struct super_block *sb,
1992 2065
1993 if (!(sb->s_flags & MS_RDONLY)) { 2066 if (!(sb->s_flags & MS_RDONLY)) {
1994 down_read(&fs_info->cleanup_work_sem); 2067 down_read(&fs_info->cleanup_work_sem);
1995 btrfs_orphan_cleanup(fs_info->fs_root); 2068 err = btrfs_orphan_cleanup(fs_info->fs_root);
2069 if (!err)
2070 err = btrfs_orphan_cleanup(fs_info->tree_root);
1996 up_read(&fs_info->cleanup_work_sem); 2071 up_read(&fs_info->cleanup_work_sem);
2072 if (err) {
2073 close_ctree(tree_root);
2074 return ERR_PTR(err);
2075 }
1997 } 2076 }
1998 2077
1999 return tree_root; 2078 return tree_root;
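open_ctree() now reports orphan-cleanup failures by tearing everything down and returning ERR_PTR(err). For reference, the ERR_PTR encoding packs a small negative errno into the pointer value itself; the helpers below are paraphrased from the kernel's include/linux/err.h:

    #define MAX_ERRNO       4095

    static inline void *ERR_PTR(long error)
    {
            return (void *)error;
    }

    static inline long PTR_ERR(const void *ptr)
    {
            return (long)ptr;
    }

    static inline int IS_ERR(const void *ptr)
    {
            return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
    }

Callers therefore check IS_ERR(tree_root) rather than comparing against NULL.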
@@ -2035,7 +2114,11 @@ fail_sb_buffer:
2035 btrfs_stop_workers(&fs_info->endio_meta_workers); 2114 btrfs_stop_workers(&fs_info->endio_meta_workers);
2036 btrfs_stop_workers(&fs_info->endio_meta_write_workers); 2115 btrfs_stop_workers(&fs_info->endio_meta_write_workers);
2037 btrfs_stop_workers(&fs_info->endio_write_workers); 2116 btrfs_stop_workers(&fs_info->endio_write_workers);
2117 btrfs_stop_workers(&fs_info->endio_freespace_worker);
2038 btrfs_stop_workers(&fs_info->submit_workers); 2118 btrfs_stop_workers(&fs_info->submit_workers);
2119 btrfs_stop_workers(&fs_info->delayed_workers);
2120fail_alloc:
2121 kfree(fs_info->delayed_root);
2039fail_iput: 2122fail_iput:
2040 invalidate_inode_pages2(fs_info->btree_inode->i_mapping); 2123 invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
2041 iput(fs_info->btree_inode); 2124 iput(fs_info->btree_inode);
@@ -2063,11 +2146,9 @@ static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
2063 if (uptodate) { 2146 if (uptodate) {
2064 set_buffer_uptodate(bh); 2147 set_buffer_uptodate(bh);
2065 } else { 2148 } else {
2066 if (!buffer_eopnotsupp(bh) && printk_ratelimit()) { 2149 printk_ratelimited(KERN_WARNING "lost page write due to "
2067 printk(KERN_WARNING "lost page write due to "
2068 "I/O error on %s\n", 2150 "I/O error on %s\n",
2069 bdevname(bh->b_bdev, b)); 2151 bdevname(bh->b_bdev, b));
2070 }
2071 /* note, we don't set_buffer_write_io_error because we have 2152 /* note, we don't set_buffer_write_io_error because we have
2072 * our own ways of dealing with the IO errors 2153 * our own ways of dealing with the IO errors
2073 */ 2154 */
@@ -2200,21 +2281,10 @@ static int write_dev_supers(struct btrfs_device *device,
2200 bh->b_end_io = btrfs_end_buffer_write_sync; 2281 bh->b_end_io = btrfs_end_buffer_write_sync;
2201 } 2282 }
2202 2283
2203 if (i == last_barrier && do_barriers && device->barriers) { 2284 if (i == last_barrier && do_barriers)
2204 ret = submit_bh(WRITE_BARRIER, bh); 2285 ret = submit_bh(WRITE_FLUSH_FUA, bh);
2205 if (ret == -EOPNOTSUPP) { 2286 else
2206 printk("btrfs: disabling barriers on dev %s\n",
2207 device->name);
2208 set_buffer_uptodate(bh);
2209 device->barriers = 0;
2210 /* one reference for submit_bh */
2211 get_bh(bh);
2212 lock_buffer(bh);
2213 ret = submit_bh(WRITE_SYNC, bh);
2214 }
2215 } else {
2216 ret = submit_bh(WRITE_SYNC, bh); 2287 ret = submit_bh(WRITE_SYNC, bh);
2217 }
2218 2288
2219 if (ret) 2289 if (ret)
2220 errors++; 2290 errors++;
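The barrier fallback above could be deleted because the block layer now degrades flush/FUA requests itself on devices without a volatile write cache, so -EOPNOTSUPP no longer reaches the filesystem. A self-contained sketch of the simplified submission logic (the flag names and struct here are stand-ins, not the real request flags):

    #include <stdio.h>

    enum req_flags { WRITE_SYNC, WRITE_FLUSH_FUA };   /* stand-ins */

    struct buffer_head { int dummy; };

    static int submit_bh(enum req_flags op, struct buffer_head *bh)
    {
            (void)bh;
            printf("submit: %s\n",
                   op == WRITE_FLUSH_FUA ? "flush+FUA" : "sync");
            return 0;
    }

    /* Only the designated barrier copy gets flush+FUA semantics. */
    static int submit_super(struct buffer_head *bh, int last_barrier,
                            int do_barriers)
    {
            if (last_barrier && do_barriers)
                    return submit_bh(WRITE_FLUSH_FUA, bh);
            return submit_bh(WRITE_SYNC, bh);
    }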
@@ -2242,7 +2312,7 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
2242 2312
2243 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 2313 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
2244 head = &root->fs_info->fs_devices->devices; 2314 head = &root->fs_info->fs_devices->devices;
2245 list_for_each_entry(dev, head, dev_list) { 2315 list_for_each_entry_rcu(dev, head, dev_list) {
2246 if (!dev->bdev) { 2316 if (!dev->bdev) {
2247 total_errors++; 2317 total_errors++;
2248 continue; 2318 continue;
@@ -2275,7 +2345,7 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
2275 } 2345 }
2276 2346
2277 total_errors = 0; 2347 total_errors = 0;
2278 list_for_each_entry(dev, head, dev_list) { 2348 list_for_each_entry_rcu(dev, head, dev_list) {
2279 if (!dev->bdev) 2349 if (!dev->bdev)
2280 continue; 2350 continue;
2281 if (!dev->in_fs_metadata || !dev->writeable) 2351 if (!dev->in_fs_metadata || !dev->writeable)
@@ -2313,12 +2383,15 @@ int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
2313 if (btrfs_root_refs(&root->root_item) == 0) 2383 if (btrfs_root_refs(&root->root_item) == 0)
2314 synchronize_srcu(&fs_info->subvol_srcu); 2384 synchronize_srcu(&fs_info->subvol_srcu);
2315 2385
2386 __btrfs_remove_free_space_cache(root->free_ino_pinned);
2387 __btrfs_remove_free_space_cache(root->free_ino_ctl);
2316 free_fs_root(root); 2388 free_fs_root(root);
2317 return 0; 2389 return 0;
2318} 2390}
2319 2391
2320static void free_fs_root(struct btrfs_root *root) 2392static void free_fs_root(struct btrfs_root *root)
2321{ 2393{
2394 iput(root->cache_inode);
2322 WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree)); 2395 WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
2323 if (root->anon_super.s_dev) { 2396 if (root->anon_super.s_dev) {
2324 down_write(&root->anon_super.s_umount); 2397 down_write(&root->anon_super.s_umount);
@@ -2326,6 +2399,8 @@ static void free_fs_root(struct btrfs_root *root)
2326 } 2399 }
2327 free_extent_buffer(root->node); 2400 free_extent_buffer(root->node);
2328 free_extent_buffer(root->commit_root); 2401 free_extent_buffer(root->commit_root);
2402 kfree(root->free_ino_ctl);
2403 kfree(root->free_ino_pinned);
2329 kfree(root->name); 2404 kfree(root->name);
2330 kfree(root); 2405 kfree(root);
2331} 2406}
@@ -2378,8 +2453,12 @@ int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
2378 2453
2379 root_objectid = gang[ret - 1]->root_key.objectid + 1; 2454 root_objectid = gang[ret - 1]->root_key.objectid + 1;
2380 for (i = 0; i < ret; i++) { 2455 for (i = 0; i < ret; i++) {
2456 int err;
2457
2381 root_objectid = gang[i]->root_key.objectid; 2458 root_objectid = gang[i]->root_key.objectid;
2382 btrfs_orphan_cleanup(gang[i]); 2459 err = btrfs_orphan_cleanup(gang[i]);
2460 if (err)
2461 return err;
2383 } 2462 }
2384 root_objectid++; 2463 root_objectid++;
2385 } 2464 }
@@ -2400,11 +2479,15 @@ int btrfs_commit_super(struct btrfs_root *root)
2400 down_write(&root->fs_info->cleanup_work_sem); 2479 down_write(&root->fs_info->cleanup_work_sem);
2401 up_write(&root->fs_info->cleanup_work_sem); 2480 up_write(&root->fs_info->cleanup_work_sem);
2402 2481
2403 trans = btrfs_join_transaction(root, 1); 2482 trans = btrfs_join_transaction(root);
2483 if (IS_ERR(trans))
2484 return PTR_ERR(trans);
2404 ret = btrfs_commit_transaction(trans, root); 2485 ret = btrfs_commit_transaction(trans, root);
2405 BUG_ON(ret); 2486 BUG_ON(ret);
2406 /* run commit again to drop the original snapshot */ 2487 /* run commit again to drop the original snapshot */
2407 trans = btrfs_join_transaction(root, 1); 2488 trans = btrfs_join_transaction(root);
2489 if (IS_ERR(trans))
2490 return PTR_ERR(trans);
2408 btrfs_commit_transaction(trans, root); 2491 btrfs_commit_transaction(trans, root);
2409 ret = btrfs_write_and_wait_transaction(NULL, root); 2492 ret = btrfs_write_and_wait_transaction(NULL, root);
2410 BUG_ON(ret); 2493 BUG_ON(ret);
@@ -2421,8 +2504,38 @@ int close_ctree(struct btrfs_root *root)
2421 fs_info->closing = 1; 2504 fs_info->closing = 1;
2422 smp_mb(); 2505 smp_mb();
2423 2506
2507 btrfs_scrub_cancel(root);
2508
2509 /* wait for any defraggers to finish */
2510 wait_event(fs_info->transaction_wait,
2511 (atomic_read(&fs_info->defrag_running) == 0));
2512
2513 /* clear out the rbtree of defraggable inodes */
2514 btrfs_run_defrag_inodes(root->fs_info);
2515
2516 btrfs_put_block_group_cache(fs_info);
2517
2518 /*
2519 * Two situations can arise when a broken btrfs flips read-only:
2520 *
2521 * 1. btrfs flipped read-only somewhere else before reaching
2522 * btrfs_commit_super: sb->s_flags has the MS_RDONLY flag set,
2523 * so btrfs skips writing the sb directly in order to keep the
2524 * ERROR state on disk.
2525 *
2526 * 2. btrfs flipped read-only inside btrfs_commit_super itself:
2527 * btrfs cannot write the sb via btrfs_commit_super, and since
2528 * fs_state has the BTRFS_SUPER_FLAG_ERROR flag set, btrfs
2529 * cleans up all FS resources first and writes the sb afterwards.
2530 */
2424 if (!(fs_info->sb->s_flags & MS_RDONLY)) { 2531 if (!(fs_info->sb->s_flags & MS_RDONLY)) {
2425 ret = btrfs_commit_super(root); 2532 ret = btrfs_commit_super(root);
2533 if (ret)
2534 printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
2535 }
2536
2537 if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
2538 ret = btrfs_error_commit_super(root);
2426 if (ret) 2539 if (ret)
2427 printk(KERN_ERR "btrfs: commit super ret %d\n", ret); 2540 printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
2428 } 2541 }
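Note that the two branches are not exclusive: a filesystem that went read-only with the error flag set skips the normal commit but still gets btrfs_error_commit_super(), so the superblock is written only after cleanup. A tiny sketch of the decision (MS_RDONLY uses its real value; the error bit is a stand-in for BTRFS_SUPER_FLAG_ERROR):

    #include <stdio.h>

    #define MS_RDONLY               0x1
    #define SUPER_FLAG_ERROR        0x2     /* stand-in bit */

    static void close_sketch(unsigned int sb_flags, unsigned int fs_state)
    {
            if (!(sb_flags & MS_RDONLY))
                    printf("btrfs_commit_super: normal final commit\n");
            if (fs_state & SUPER_FLAG_ERROR)
                    printf("btrfs_error_commit_super: cleanup, then write sb\n");
    }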
@@ -2458,6 +2571,7 @@ int close_ctree(struct btrfs_root *root)
2458 del_fs_roots(fs_info); 2571 del_fs_roots(fs_info);
2459 2572
2460 iput(fs_info->btree_inode); 2573 iput(fs_info->btree_inode);
2574 kfree(fs_info->delayed_root);
2461 2575
2462 btrfs_stop_workers(&fs_info->generic_worker); 2576 btrfs_stop_workers(&fs_info->generic_worker);
2463 btrfs_stop_workers(&fs_info->fixup_workers); 2577 btrfs_stop_workers(&fs_info->fixup_workers);
@@ -2467,7 +2581,9 @@ int close_ctree(struct btrfs_root *root)
2467 btrfs_stop_workers(&fs_info->endio_meta_workers); 2581 btrfs_stop_workers(&fs_info->endio_meta_workers);
2468 btrfs_stop_workers(&fs_info->endio_meta_write_workers); 2582 btrfs_stop_workers(&fs_info->endio_meta_write_workers);
2469 btrfs_stop_workers(&fs_info->endio_write_workers); 2583 btrfs_stop_workers(&fs_info->endio_write_workers);
2584 btrfs_stop_workers(&fs_info->endio_freespace_worker);
2470 btrfs_stop_workers(&fs_info->submit_workers); 2585 btrfs_stop_workers(&fs_info->submit_workers);
2586 btrfs_stop_workers(&fs_info->delayed_workers);
2471 2587
2472 btrfs_close_devices(fs_info->fs_devices); 2588 btrfs_close_devices(fs_info->fs_devices);
2473 btrfs_mapping_tree_free(&fs_info->mapping_tree); 2589 btrfs_mapping_tree_free(&fs_info->mapping_tree);
@@ -2480,6 +2596,8 @@ int close_ctree(struct btrfs_root *root)
2480 kfree(fs_info->chunk_root); 2596 kfree(fs_info->chunk_root);
2481 kfree(fs_info->dev_root); 2597 kfree(fs_info->dev_root);
2482 kfree(fs_info->csum_root); 2598 kfree(fs_info->csum_root);
2599 kfree(fs_info);
2600
2483 return 0; 2601 return 0;
2484} 2602}
2485 2603
@@ -2542,6 +2660,29 @@ void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
2542 if (current->flags & PF_MEMALLOC) 2660 if (current->flags & PF_MEMALLOC)
2543 return; 2661 return;
2544 2662
2663 btrfs_balance_delayed_items(root);
2664
2665 num_dirty = root->fs_info->dirty_metadata_bytes;
2666
2667 if (num_dirty > thresh) {
2668 balance_dirty_pages_ratelimited_nr(
2669 root->fs_info->btree_inode->i_mapping, 1);
2670 }
2671 return;
2672}
2673
2674void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
2675{
2676 /*
2677 * It looks as though older kernels can get into trouble with
2678 * this code; they end up stuck in balance_dirty_pages forever
2679 */
2680 u64 num_dirty;
2681 unsigned long thresh = 32 * 1024 * 1024;
2682
2683 if (current->flags & PF_MEMALLOC)
2684 return;
2685
2545 num_dirty = root->fs_info->dirty_metadata_bytes; 2686 num_dirty = root->fs_info->dirty_metadata_bytes;
2546 2687
2547 if (num_dirty > thresh) { 2688 if (num_dirty > thresh) {
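Both entry points share the same 32MB dirty-metadata threshold; the __ variant merely skips btrfs_balance_delayed_items() for the older-kernel case described in its comment. A compact sketch of the shared check (the state variable and callbacks are stand-ins):

    #include <stdint.h>

    static uint64_t dirty_metadata_bytes;   /* stand-in for fs_info state */

    static void balance_if_needed(void)
    {
            const uint64_t thresh = 32ULL * 1024 * 1024;    /* 32MB */

            if (dirty_metadata_bytes > thresh)
                    ;       /* nudge balance_dirty_pages here */
    }

    static void btree_balance_dirty(void)           /* normal path */
    {
            /* balance delayed metadata items first, then... */
            balance_if_needed();
    }

    static void btree_balance_dirty_safe(void)      /* __ variant */
    {
            balance_if_needed();
    }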
@@ -2574,7 +2715,7 @@ int btree_lock_page_hook(struct page *page)
2574 goto out; 2715 goto out;
2575 2716
2576 len = page->private >> 2; 2717 len = page->private >> 2;
2577 eb = find_extent_buffer(io_tree, bytenr, len, GFP_NOFS); 2718 eb = find_extent_buffer(io_tree, bytenr, len);
2578 if (!eb) 2719 if (!eb)
2579 goto out; 2720 goto out;
2580 2721
@@ -2597,6 +2738,355 @@ out:
2597 return 0; 2738 return 0;
2598} 2739}
2599 2740
2741static void btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
2742 int read_only)
2743{
2744 if (read_only)
2745 return;
2746
2747 if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
2748 printk(KERN_WARNING "warning: mounting fs with errors, "
2749 "running btrfsck is recommended\n");
2750}
2751
2752int btrfs_error_commit_super(struct btrfs_root *root)
2753{
2754 int ret;
2755
2756 mutex_lock(&root->fs_info->cleaner_mutex);
2757 btrfs_run_delayed_iputs(root);
2758 mutex_unlock(&root->fs_info->cleaner_mutex);
2759
2760 down_write(&root->fs_info->cleanup_work_sem);
2761 up_write(&root->fs_info->cleanup_work_sem);
2762
2763 /* cleanup FS via transaction */
2764 btrfs_cleanup_transaction(root);
2765
2766 ret = write_ctree_super(NULL, root, 0);
2767
2768 return ret;
2769}
2770
2771static int btrfs_destroy_ordered_operations(struct btrfs_root *root)
2772{
2773 struct btrfs_inode *btrfs_inode;
2774 struct list_head splice;
2775
2776 INIT_LIST_HEAD(&splice);
2777
2778 mutex_lock(&root->fs_info->ordered_operations_mutex);
2779 spin_lock(&root->fs_info->ordered_extent_lock);
2780
2781 list_splice_init(&root->fs_info->ordered_operations, &splice);
2782 while (!list_empty(&splice)) {
2783 btrfs_inode = list_entry(splice.next, struct btrfs_inode,
2784 ordered_operations);
2785
2786 list_del_init(&btrfs_inode->ordered_operations);
2787
2788 btrfs_invalidate_inodes(btrfs_inode->root);
2789 }
2790
2791 spin_unlock(&root->fs_info->ordered_extent_lock);
2792 mutex_unlock(&root->fs_info->ordered_operations_mutex);
2793
2794 return 0;
2795}
2796
2797static int btrfs_destroy_ordered_extents(struct btrfs_root *root)
2798{
2799 struct list_head splice;
2800 struct btrfs_ordered_extent *ordered;
2801 struct inode *inode;
2802
2803 INIT_LIST_HEAD(&splice);
2804
2805 spin_lock(&root->fs_info->ordered_extent_lock);
2806
2807 list_splice_init(&root->fs_info->ordered_extents, &splice);
2808 while (!list_empty(&splice)) {
2809 ordered = list_entry(splice.next, struct btrfs_ordered_extent,
2810 root_extent_list);
2811
2812 list_del_init(&ordered->root_extent_list);
2813 atomic_inc(&ordered->refs);
2814
2815 /* the inode may be getting freed (in sys_unlink path). */
2816 inode = igrab(ordered->inode);
2817
2818 spin_unlock(&root->fs_info->ordered_extent_lock);
2819 if (inode)
2820 iput(inode);
2821
2822 atomic_set(&ordered->refs, 1);
2823 btrfs_put_ordered_extent(ordered);
2824
2825 spin_lock(&root->fs_info->ordered_extent_lock);
2826 }
2827
2828 spin_unlock(&root->fs_info->ordered_extent_lock);
2829
2830 return 0;
2831}
2832
2833static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
2834 struct btrfs_root *root)
2835{
2836 struct rb_node *node;
2837 struct btrfs_delayed_ref_root *delayed_refs;
2838 struct btrfs_delayed_ref_node *ref;
2839 int ret = 0;
2840
2841 delayed_refs = &trans->delayed_refs;
2842
2843 spin_lock(&delayed_refs->lock);
2844 if (delayed_refs->num_entries == 0) {
2845 spin_unlock(&delayed_refs->lock);
2846 printk(KERN_INFO "delayed_refs has NO entry\n");
2847 return ret;
2848 }
2849
2850 node = rb_first(&delayed_refs->root);
2851 while (node) {
2852 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2853 node = rb_next(node);
2854
2855 ref->in_tree = 0;
2856 rb_erase(&ref->rb_node, &delayed_refs->root);
2857 delayed_refs->num_entries--;
2858
2859 atomic_set(&ref->refs, 1);
2860 if (btrfs_delayed_ref_is_head(ref)) {
2861 struct btrfs_delayed_ref_head *head;
2862
2863 head = btrfs_delayed_node_to_head(ref);
2864 mutex_lock(&head->mutex);
2865 kfree(head->extent_op);
2866 delayed_refs->num_heads--;
2867 if (list_empty(&head->cluster))
2868 delayed_refs->num_heads_ready--;
2869 list_del_init(&head->cluster);
2870 mutex_unlock(&head->mutex);
2871 }
2872
2873 spin_unlock(&delayed_refs->lock);
2874 btrfs_put_delayed_ref(ref);
2875
2876 cond_resched();
2877 spin_lock(&delayed_refs->lock);
2878 }
2879
2880 spin_unlock(&delayed_refs->lock);
2881
2882 return ret;
2883}
2884
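btrfs_destroy_delayed_refs() above drains the ref tree with a classic pattern: detach one node under the spinlock, drop the lock for the potentially slow put, reschedule, and retake the lock. A userspace analogue with a pthread mutex standing in for the spinlock:

    #include <pthread.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

    static void drain(int *entries)
    {
            pthread_mutex_lock(&lock);
            while (*entries > 0) {
                    (*entries)--;           /* detach one node, locked */
                    pthread_mutex_unlock(&lock);
                    /* put/free the detached node here; may block */
                    pthread_mutex_lock(&lock);      /* after cond_resched() */
            }
            pthread_mutex_unlock(&lock);
    }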
2885static int btrfs_destroy_pending_snapshots(struct btrfs_transaction *t)
2886{
2887 struct btrfs_pending_snapshot *snapshot;
2888 struct list_head splice;
2889
2890 INIT_LIST_HEAD(&splice);
2891
2892 list_splice_init(&t->pending_snapshots, &splice);
2893
2894 while (!list_empty(&splice)) {
2895 snapshot = list_entry(splice.next,
2896 struct btrfs_pending_snapshot,
2897 list);
2898
2899 list_del_init(&snapshot->list);
2900
2901 kfree(snapshot);
2902 }
2903
2904 return 0;
2905}
2906
2907static int btrfs_destroy_delalloc_inodes(struct btrfs_root *root)
2908{
2909 struct btrfs_inode *btrfs_inode;
2910 struct list_head splice;
2911
2912 INIT_LIST_HEAD(&splice);
2913
2914 spin_lock(&root->fs_info->delalloc_lock);
2915 list_splice_init(&root->fs_info->delalloc_inodes, &splice);
2916
2917 while (!list_empty(&splice)) {
2918 btrfs_inode = list_entry(splice.next, struct btrfs_inode,
2919 delalloc_inodes);
2920
2921 list_del_init(&btrfs_inode->delalloc_inodes);
2922
2923 btrfs_invalidate_inodes(btrfs_inode->root);
2924 }
2925
2926 spin_unlock(&root->fs_info->delalloc_lock);
2927
2928 return 0;
2929}
2930
2931static int btrfs_destroy_marked_extents(struct btrfs_root *root,
2932 struct extent_io_tree *dirty_pages,
2933 int mark)
2934{
2935 int ret;
2936 struct page *page;
2937 struct inode *btree_inode = root->fs_info->btree_inode;
2938 struct extent_buffer *eb;
2939 u64 start = 0;
2940 u64 end;
2941 u64 offset;
2942 unsigned long index;
2943
2944 while (1) {
2945 ret = find_first_extent_bit(dirty_pages, start, &start, &end,
2946 mark);
2947 if (ret)
2948 break;
2949
2950 clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS);
2951 while (start <= end) {
2952 index = start >> PAGE_CACHE_SHIFT;
2953 start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
2954 page = find_get_page(btree_inode->i_mapping, index);
2955 if (!page)
2956 continue;
2957 offset = page_offset(page);
2958
2959 spin_lock(&dirty_pages->buffer_lock);
2960 eb = radix_tree_lookup(
2961 &(&BTRFS_I(page->mapping->host)->io_tree)->buffer,
2962 offset >> PAGE_CACHE_SHIFT);
2963 spin_unlock(&dirty_pages->buffer_lock);
2964 if (eb) {
2965 ret = test_and_clear_bit(EXTENT_BUFFER_DIRTY,
2966 &eb->bflags);
2967 atomic_set(&eb->refs, 1);
2968 }
2969 if (PageWriteback(page))
2970 end_page_writeback(page);
2971
2972 lock_page(page);
2973 if (PageDirty(page)) {
2974 clear_page_dirty_for_io(page);
2975 spin_lock_irq(&page->mapping->tree_lock);
2976 radix_tree_tag_clear(&page->mapping->page_tree,
2977 page_index(page),
2978 PAGECACHE_TAG_DIRTY);
2979 spin_unlock_irq(&page->mapping->tree_lock);
2980 }
2981
2982 page->mapping->a_ops->invalidatepage(page, 0);
2983 unlock_page(page);
2984 }
2985 }
2986
2987 return ret;
2988}
2989
2990static int btrfs_destroy_pinned_extent(struct btrfs_root *root,
2991 struct extent_io_tree *pinned_extents)
2992{
2993 struct extent_io_tree *unpin;
2994 u64 start;
2995 u64 end;
2996 int ret;
2997
2998 unpin = pinned_extents;
2999 while (1) {
3000 ret = find_first_extent_bit(unpin, 0, &start, &end,
3001 EXTENT_DIRTY);
3002 if (ret)
3003 break;
3004
3005 /* opt_discard */
3006 if (btrfs_test_opt(root, DISCARD))
3007 ret = btrfs_error_discard_extent(root, start,
3008 end + 1 - start,
3009 NULL);
3010
3011 clear_extent_dirty(unpin, start, end, GFP_NOFS);
3012 btrfs_error_unpin_extent_range(root, start, end);
3013 cond_resched();
3014 }
3015
3016 return 0;
3017}
3018
3019static int btrfs_cleanup_transaction(struct btrfs_root *root)
3020{
3021 struct btrfs_transaction *t;
3022 LIST_HEAD(list);
3023
3024 WARN_ON(1);
3025
3026 mutex_lock(&root->fs_info->transaction_kthread_mutex);
3027
3028 spin_lock(&root->fs_info->trans_lock);
3029 list_splice_init(&root->fs_info->trans_list, &list);
3030 root->fs_info->trans_no_join = 1;
3031 spin_unlock(&root->fs_info->trans_lock);
3032
3033 while (!list_empty(&list)) {
3034 t = list_entry(list.next, struct btrfs_transaction, list);
3035 if (!t)
3036 break;
3037
3038 btrfs_destroy_ordered_operations(root);
3039
3040 btrfs_destroy_ordered_extents(root);
3041
3042 btrfs_destroy_delayed_refs(t, root);
3043
3044 btrfs_block_rsv_release(root,
3045 &root->fs_info->trans_block_rsv,
3046 t->dirty_pages.dirty_bytes);
3047
3048 /* FIXME: cleanup wait for commit */
3049 t->in_commit = 1;
3050 t->blocked = 1;
3051 if (waitqueue_active(&root->fs_info->transaction_blocked_wait))
3052 wake_up(&root->fs_info->transaction_blocked_wait);
3053
3054 t->blocked = 0;
3055 if (waitqueue_active(&root->fs_info->transaction_wait))
3056 wake_up(&root->fs_info->transaction_wait);
3057
3058 t->commit_done = 1;
3059 if (waitqueue_active(&t->commit_wait))
3060 wake_up(&t->commit_wait);
3061
3062 btrfs_destroy_pending_snapshots(t);
3063
3064 btrfs_destroy_delalloc_inodes(root);
3065
3066 spin_lock(&root->fs_info->trans_lock);
3067 root->fs_info->running_transaction = NULL;
3068 spin_unlock(&root->fs_info->trans_lock);
3069
3070 btrfs_destroy_marked_extents(root, &t->dirty_pages,
3071 EXTENT_DIRTY);
3072
3073 btrfs_destroy_pinned_extent(root,
3074 root->fs_info->pinned_extents);
3075
3076 atomic_set(&t->use_count, 0);
3077 list_del_init(&t->list);
3078 memset(t, 0, sizeof(*t));
3079 kmem_cache_free(btrfs_transaction_cachep, t);
3080 }
3081
3082 spin_lock(&root->fs_info->trans_lock);
3083 root->fs_info->trans_no_join = 0;
3084 spin_unlock(&root->fs_info->trans_lock);
3085 mutex_unlock(&root->fs_info->transaction_kthread_mutex);
3086
3087 return 0;
3088}
3089
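The wakeup choreography in btrfs_cleanup_transaction() forces a dying transaction through its normal state machine (blocked, unblocked, committed) so any sleepers observe progress and exit. A userspace analogue with condition variables standing in for the wait queues:

    #include <pthread.h>
    #include <stdbool.h>

    static pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t blocked_wait = PTHREAD_COND_INITIALIZER;
    static pthread_cond_t commit_wait = PTHREAD_COND_INITIALIZER;
    static bool blocked, commit_done;

    static void force_through(void)
    {
            pthread_mutex_lock(&m);
            blocked = true;
            pthread_cond_broadcast(&blocked_wait);  /* wake blocked waiters */
            blocked = false;
            commit_done = true;
            pthread_cond_broadcast(&commit_wait);   /* wake commit waiters */
            pthread_mutex_unlock(&m);
    }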
2600static struct extent_io_ops btree_extent_io_ops = { 3090static struct extent_io_ops btree_extent_io_ops = {
2601 .write_cache_pages_lock_hook = btree_lock_page_hook, 3091 .write_cache_pages_lock_hook = btree_lock_page_hook,
2602 .readpage_end_io_hook = btree_readpage_end_io_hook, 3092 .readpage_end_io_hook = btree_readpage_end_io_hook,
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 88e825a0bf21..a0b610a67aae 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -52,37 +52,23 @@ int write_ctree_super(struct btrfs_trans_handle *trans,
52 struct btrfs_root *root, int max_mirrors); 52 struct btrfs_root *root, int max_mirrors);
53struct buffer_head *btrfs_read_dev_super(struct block_device *bdev); 53struct buffer_head *btrfs_read_dev_super(struct block_device *bdev);
54int btrfs_commit_super(struct btrfs_root *root); 54int btrfs_commit_super(struct btrfs_root *root);
55int btrfs_error_commit_super(struct btrfs_root *root);
55struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, 56struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
56 u64 bytenr, u32 blocksize); 57 u64 bytenr, u32 blocksize);
57struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
58 u64 root_objectid);
59struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
60 struct btrfs_key *location,
61 const char *name, int namelen);
62struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, 58struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root,
63 struct btrfs_key *location); 59 struct btrfs_key *location);
64struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, 60struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
65 struct btrfs_key *location); 61 struct btrfs_key *location);
66int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info); 62int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info);
67int btrfs_insert_dev_radix(struct btrfs_root *root,
68 struct block_device *bdev,
69 u64 device_id,
70 u64 block_start,
71 u64 num_blocks);
72void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr); 63void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr);
64void __btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr);
73int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); 65int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root);
74void btrfs_mark_buffer_dirty(struct extent_buffer *buf); 66void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
75void btrfs_mark_buffer_dirty_nonblocking(struct extent_buffer *buf);
76int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid); 67int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid);
77int btrfs_set_buffer_uptodate(struct extent_buffer *buf); 68int btrfs_set_buffer_uptodate(struct extent_buffer *buf);
78int wait_on_tree_block_writeback(struct btrfs_root *root,
79 struct extent_buffer *buf);
80int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid); 69int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid);
81u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len); 70u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len);
82void btrfs_csum_final(u32 crc, char *result); 71void btrfs_csum_final(u32 crc, char *result);
83int btrfs_open_device(struct btrfs_device *dev);
84int btrfs_verify_block_csum(struct btrfs_root *root,
85 struct extent_buffer *buf);
86int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, 72int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
87 int metadata); 73 int metadata);
88int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, 74int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
@@ -90,8 +76,6 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode,
90 unsigned long bio_flags, u64 bio_offset, 76 unsigned long bio_flags, u64 bio_offset,
91 extent_submit_bio_hook_t *submit_bio_start, 77 extent_submit_bio_hook_t *submit_bio_start,
92 extent_submit_bio_hook_t *submit_bio_done); 78 extent_submit_bio_hook_t *submit_bio_done);
93
94int btrfs_congested_async(struct btrfs_fs_info *info, int iodone);
95unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info); 79unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info);
96int btrfs_write_tree_block(struct extent_buffer *buf); 80int btrfs_write_tree_block(struct extent_buffer *buf);
97int btrfs_wait_tree_block_writeback(struct extent_buffer *buf); 81int btrfs_wait_tree_block_writeback(struct extent_buffer *buf);
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 951ef09b82f4..1b8dc33778f9 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -21,14 +21,18 @@ static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len,
21 int len = *max_len; 21 int len = *max_len;
22 int type; 22 int type;
23 23
24 if ((len < BTRFS_FID_SIZE_NON_CONNECTABLE) || 24 if (connectable && (len < BTRFS_FID_SIZE_CONNECTABLE)) {
25 (connectable && len < BTRFS_FID_SIZE_CONNECTABLE)) 25 *max_len = BTRFS_FID_SIZE_CONNECTABLE;
26 return 255; 26 return 255;
27 } else if (len < BTRFS_FID_SIZE_NON_CONNECTABLE) {
28 *max_len = BTRFS_FID_SIZE_NON_CONNECTABLE;
29 return 255;
30 }
27 31
28 len = BTRFS_FID_SIZE_NON_CONNECTABLE; 32 len = BTRFS_FID_SIZE_NON_CONNECTABLE;
29 type = FILEID_BTRFS_WITHOUT_PARENT; 33 type = FILEID_BTRFS_WITHOUT_PARENT;
30 34
31 fid->objectid = inode->i_ino; 35 fid->objectid = btrfs_ino(inode);
32 fid->root_objectid = BTRFS_I(inode)->root->objectid; 36 fid->root_objectid = BTRFS_I(inode)->root->objectid;
33 fid->gen = inode->i_generation; 37 fid->gen = inode->i_generation;
34 38
@@ -65,7 +69,6 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
65{ 69{
66 struct btrfs_fs_info *fs_info = btrfs_sb(sb)->fs_info; 70 struct btrfs_fs_info *fs_info = btrfs_sb(sb)->fs_info;
67 struct btrfs_root *root; 71 struct btrfs_root *root;
68 struct dentry *dentry;
69 struct inode *inode; 72 struct inode *inode;
70 struct btrfs_key key; 73 struct btrfs_key key;
71 int index; 74 int index;
@@ -108,10 +111,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
108 return ERR_PTR(-ESTALE); 111 return ERR_PTR(-ESTALE);
109 } 112 }
110 113
111 dentry = d_obtain_alias(inode); 114 return d_obtain_alias(inode);
112 if (!IS_ERR(dentry))
113 dentry->d_op = &btrfs_dentry_operations;
114 return dentry;
115fail: 115fail:
116 srcu_read_unlock(&fs_info->subvol_srcu, index); 116 srcu_read_unlock(&fs_info->subvol_srcu, index);
117 return ERR_PTR(err); 117 return ERR_PTR(err);
@@ -166,7 +166,6 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
166static struct dentry *btrfs_get_parent(struct dentry *child) 166static struct dentry *btrfs_get_parent(struct dentry *child)
167{ 167{
168 struct inode *dir = child->d_inode; 168 struct inode *dir = child->d_inode;
169 static struct dentry *dentry;
170 struct btrfs_root *root = BTRFS_I(dir)->root; 169 struct btrfs_root *root = BTRFS_I(dir)->root;
171 struct btrfs_path *path; 170 struct btrfs_path *path;
172 struct extent_buffer *leaf; 171 struct extent_buffer *leaf;
@@ -176,14 +175,16 @@ static struct dentry *btrfs_get_parent(struct dentry *child)
176 int ret; 175 int ret;
177 176
178 path = btrfs_alloc_path(); 177 path = btrfs_alloc_path();
178 if (!path)
179 return ERR_PTR(-ENOMEM);
179 180
180 if (dir->i_ino == BTRFS_FIRST_FREE_OBJECTID) { 181 if (btrfs_ino(dir) == BTRFS_FIRST_FREE_OBJECTID) {
181 key.objectid = root->root_key.objectid; 182 key.objectid = root->root_key.objectid;
182 key.type = BTRFS_ROOT_BACKREF_KEY; 183 key.type = BTRFS_ROOT_BACKREF_KEY;
183 key.offset = (u64)-1; 184 key.offset = (u64)-1;
184 root = root->fs_info->tree_root; 185 root = root->fs_info->tree_root;
185 } else { 186 } else {
186 key.objectid = dir->i_ino; 187 key.objectid = btrfs_ino(dir);
187 key.type = BTRFS_INODE_REF_KEY; 188 key.type = BTRFS_INODE_REF_KEY;
188 key.offset = (u64)-1; 189 key.offset = (u64)-1;
189 } 190 }
@@ -223,18 +224,94 @@ static struct dentry *btrfs_get_parent(struct dentry *child)
223 224
224 key.type = BTRFS_INODE_ITEM_KEY; 225 key.type = BTRFS_INODE_ITEM_KEY;
225 key.offset = 0; 226 key.offset = 0;
226 dentry = d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root, NULL)); 227 return d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root, NULL));
227 if (!IS_ERR(dentry))
228 dentry->d_op = &btrfs_dentry_operations;
229 return dentry;
230fail: 228fail:
231 btrfs_free_path(path); 229 btrfs_free_path(path);
232 return ERR_PTR(ret); 230 return ERR_PTR(ret);
233} 231}
234 232
233static int btrfs_get_name(struct dentry *parent, char *name,
234 struct dentry *child)
235{
236 struct inode *inode = child->d_inode;
237 struct inode *dir = parent->d_inode;
238 struct btrfs_path *path;
239 struct btrfs_root *root = BTRFS_I(dir)->root;
240 struct btrfs_inode_ref *iref;
241 struct btrfs_root_ref *rref;
242 struct extent_buffer *leaf;
243 unsigned long name_ptr;
244 struct btrfs_key key;
245 int name_len;
246 int ret;
247 u64 ino;
248
249 if (!dir || !inode)
250 return -EINVAL;
251
252 if (!S_ISDIR(dir->i_mode))
253 return -EINVAL;
254
255 ino = btrfs_ino(inode);
256
257 path = btrfs_alloc_path();
258 if (!path)
259 return -ENOMEM;
260 path->leave_spinning = 1;
261
262 if (ino == BTRFS_FIRST_FREE_OBJECTID) {
263 key.objectid = BTRFS_I(inode)->root->root_key.objectid;
264 key.type = BTRFS_ROOT_BACKREF_KEY;
265 key.offset = (u64)-1;
266 root = root->fs_info->tree_root;
267 } else {
268 key.objectid = ino;
269 key.offset = btrfs_ino(dir);
270 key.type = BTRFS_INODE_REF_KEY;
271 }
272
273 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
274 if (ret < 0) {
275 btrfs_free_path(path);
276 return ret;
277 } else if (ret > 0) {
278 if (ino == BTRFS_FIRST_FREE_OBJECTID) {
279 path->slots[0]--;
280 } else {
281 btrfs_free_path(path);
282 return -ENOENT;
283 }
284 }
285 leaf = path->nodes[0];
286
287 if (ino == BTRFS_FIRST_FREE_OBJECTID) {
288 rref = btrfs_item_ptr(leaf, path->slots[0],
289 struct btrfs_root_ref);
290 name_ptr = (unsigned long)(rref + 1);
291 name_len = btrfs_root_ref_name_len(leaf, rref);
292 } else {
293 iref = btrfs_item_ptr(leaf, path->slots[0],
294 struct btrfs_inode_ref);
295 name_ptr = (unsigned long)(iref + 1);
296 name_len = btrfs_inode_ref_name_len(leaf, iref);
297 }
298
299 read_extent_buffer(leaf, name, name_ptr, name_len);
300 btrfs_free_path(path);
301
302 /*
303 * we have to add the null termination to make sure that
304 * reconnect_path gets the right length from strlen
305 */
306 name[name_len] = '\0';
307
308 return 0;
309}
310
235const struct export_operations btrfs_export_ops = { 311const struct export_operations btrfs_export_ops = {
236 .encode_fh = btrfs_encode_fh, 312 .encode_fh = btrfs_encode_fh,
237 .fh_to_dentry = btrfs_fh_to_dentry, 313 .fh_to_dentry = btrfs_fh_to_dentry,
238 .fh_to_parent = btrfs_fh_to_parent, 314 .fh_to_parent = btrfs_fh_to_parent,
239 .get_parent = btrfs_get_parent, 315 .get_parent = btrfs_get_parent,
316 .get_name = btrfs_get_name,
240}; 317};
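A note on the NUL termination in btrfs_get_name(): names stored in btree items carry an explicit length and no terminator, while the export layer's reconnect_path() runs strlen() on the result buffer. The essential move, in isolation:

    #include <string.h>

    /* Copy a length-delimited name into a caller buffer that will later
     * be consumed by strlen-based code. */
    static void copy_name(char *dst, const char *item_data, int name_len)
    {
            memcpy(dst, item_data, name_len);
            dst[name_len] = '\0';
    }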
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 32d094002a57..71cd456fdb60 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -33,11 +33,28 @@
33#include "locking.h" 33#include "locking.h"
34#include "free-space-cache.h" 34#include "free-space-cache.h"
35 35
36/* Control flags for do_chunk_alloc's force field.
37 * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
38 * if we really need one.
39 *
40 * CHUNK_ALLOC_FORCE means it must try to allocate one.
41 *
42 * CHUNK_ALLOC_LIMITED means to only try to allocate one
43 * if we have very few chunks already allocated. This is
44 * used as part of the clustering code to help make sure
45 * we have a good pool of storage to cluster in, without
46 * filling the FS with empty chunks.
47 *
48 */
49enum {
50 CHUNK_ALLOC_NO_FORCE = 0,
51 CHUNK_ALLOC_FORCE = 1,
52 CHUNK_ALLOC_LIMITED = 2,
53};
54
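How a caller might act on the three modes, as a hypothetical sketch: the real policy lives in do_chunk_alloc and weighs free space and allocation profile; the small bound below is purely illustrative.

    enum {
            CHUNK_ALLOC_NO_FORCE = 0,
            CHUNK_ALLOC_FORCE = 1,
            CHUNK_ALLOC_LIMITED = 2,
    };

    static int should_alloc_chunk_sketch(int force, unsigned long num_chunks,
                                         int space_low)
    {
            if (force == CHUNK_ALLOC_FORCE)
                    return 1;
            if (force == CHUNK_ALLOC_LIMITED)
                    return num_chunks < 3;  /* illustrative small bound */
            return space_low;               /* CHUNK_ALLOC_NO_FORCE */
    }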
36static int update_block_group(struct btrfs_trans_handle *trans, 55static int update_block_group(struct btrfs_trans_handle *trans,
37 struct btrfs_root *root, 56 struct btrfs_root *root,
38 u64 bytenr, u64 num_bytes, int alloc); 57 u64 bytenr, u64 num_bytes, int alloc);
39static int update_reserved_bytes(struct btrfs_block_group_cache *cache,
40 u64 num_bytes, int reserve, int sinfo);
41static int __btrfs_free_extent(struct btrfs_trans_handle *trans, 58static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
42 struct btrfs_root *root, 59 struct btrfs_root *root,
43 u64 bytenr, u64 num_bytes, u64 parent, 60 u64 bytenr, u64 num_bytes, u64 parent,
@@ -77,7 +94,7 @@ static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
77 return (cache->flags & bits) == bits; 94 return (cache->flags & bits) == bits;
78} 95}
79 96
80void btrfs_get_block_group(struct btrfs_block_group_cache *cache) 97static void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
81{ 98{
82 atomic_inc(&cache->count); 99 atomic_inc(&cache->count);
83} 100}
@@ -88,6 +105,7 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
88 WARN_ON(cache->pinned > 0); 105 WARN_ON(cache->pinned > 0);
89 WARN_ON(cache->reserved > 0); 106 WARN_ON(cache->reserved > 0);
90 WARN_ON(cache->reserved_pinned > 0); 107 WARN_ON(cache->reserved_pinned > 0);
108 kfree(cache->free_space_ctl);
91 kfree(cache); 109 kfree(cache);
92 } 110 }
93} 111}
@@ -242,6 +260,12 @@ get_caching_control(struct btrfs_block_group_cache *cache)
242 return NULL; 260 return NULL;
243 } 261 }
244 262
263 /* We're loading it the fast way, so we don't have a caching_ctl. */
264 if (!cache->caching_ctl) {
265 spin_unlock(&cache->lock);
266 return NULL;
267 }
268
245 ctl = cache->caching_ctl; 269 ctl = cache->caching_ctl;
246 atomic_inc(&ctl->count); 270 atomic_inc(&ctl->count);
247 spin_unlock(&cache->lock); 271 spin_unlock(&cache->lock);
@@ -314,11 +338,6 @@ static int caching_kthread(void *data)
314 if (!path) 338 if (!path)
315 return -ENOMEM; 339 return -ENOMEM;
316 340
317 exclude_super_stripes(extent_root, block_group);
318 spin_lock(&block_group->space_info->lock);
319 block_group->space_info->bytes_readonly += block_group->bytes_super;
320 spin_unlock(&block_group->space_info->lock);
321
322 last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); 341 last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
323 342
324 /* 343 /*
@@ -329,7 +348,7 @@ static int caching_kthread(void *data)
329 */ 348 */
330 path->skip_locking = 1; 349 path->skip_locking = 1;
331 path->search_commit_root = 1; 350 path->search_commit_root = 1;
332 path->reada = 2; 351 path->reada = 1;
333 352
334 key.objectid = last; 353 key.objectid = last;
335 key.offset = 0; 354 key.offset = 0;
@@ -347,8 +366,7 @@ again:
347 nritems = btrfs_header_nritems(leaf); 366 nritems = btrfs_header_nritems(leaf);
348 367
349 while (1) { 368 while (1) {
350 smp_mb(); 369 if (btrfs_fs_closing(fs_info) > 1) {
351 if (fs_info->closing > 1) {
352 last = (u64)-1; 370 last = (u64)-1;
353 break; 371 break;
354 } 372 }
@@ -360,15 +378,18 @@ again:
360 if (ret) 378 if (ret)
361 break; 379 break;
362 380
363 caching_ctl->progress = last; 381 if (need_resched() ||
364 btrfs_release_path(extent_root, path); 382 btrfs_next_leaf(extent_root, path)) {
365 up_read(&fs_info->extent_commit_sem); 383 caching_ctl->progress = last;
366 mutex_unlock(&caching_ctl->mutex); 384 btrfs_release_path(path);
367 if (btrfs_transaction_in_commit(fs_info)) 385 up_read(&fs_info->extent_commit_sem);
368 schedule_timeout(1); 386 mutex_unlock(&caching_ctl->mutex);
369 else
370 cond_resched(); 387 cond_resched();
371 goto again; 388 goto again;
389 }
390 leaf = path->nodes[0];
391 nritems = btrfs_header_nritems(leaf);
392 continue;
372 } 393 }
373 394
374 if (key.objectid < block_group->key.objectid) { 395 if (key.objectid < block_group->key.objectid) {
@@ -421,7 +442,10 @@ err:
421 return 0; 442 return 0;
422} 443}
423 444
424static int cache_block_group(struct btrfs_block_group_cache *cache) 445static int cache_block_group(struct btrfs_block_group_cache *cache,
446 struct btrfs_trans_handle *trans,
447 struct btrfs_root *root,
448 int load_cache_only)
425{ 449{
426 struct btrfs_fs_info *fs_info = cache->fs_info; 450 struct btrfs_fs_info *fs_info = cache->fs_info;
427 struct btrfs_caching_control *caching_ctl; 451 struct btrfs_caching_control *caching_ctl;
@@ -432,7 +456,42 @@ static int cache_block_group(struct btrfs_block_group_cache *cache)
432 if (cache->cached != BTRFS_CACHE_NO) 456 if (cache->cached != BTRFS_CACHE_NO)
433 return 0; 457 return 0;
434 458
435 caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_KERNEL); 459 /*
460 * We can't do the read from on-disk cache during a commit since we need
461 * to have the normal tree locking. Also if we are currently trying to
462 * allocate blocks for the tree root we can't do the fast caching since
463 * we likely hold important locks.
464 */
465 if (trans && (!trans->transaction->in_commit) &&
466 (root && root != root->fs_info->tree_root)) {
467 spin_lock(&cache->lock);
468 if (cache->cached != BTRFS_CACHE_NO) {
469 spin_unlock(&cache->lock);
470 return 0;
471 }
472 cache->cached = BTRFS_CACHE_STARTED;
473 spin_unlock(&cache->lock);
474
475 ret = load_free_space_cache(fs_info, cache);
476
477 spin_lock(&cache->lock);
478 if (ret == 1) {
479 cache->cached = BTRFS_CACHE_FINISHED;
480 cache->last_byte_to_unpin = (u64)-1;
481 } else {
482 cache->cached = BTRFS_CACHE_NO;
483 }
484 spin_unlock(&cache->lock);
485 if (ret == 1) {
486 free_excluded_extents(fs_info->extent_root, cache);
487 return 0;
488 }
489 }
490
491 if (load_cache_only)
492 return 0;
493
494 caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
436 BUG_ON(!caching_ctl); 495 BUG_ON(!caching_ctl);
437 496
438 INIT_LIST_HEAD(&caching_ctl->list); 497 INIT_LIST_HEAD(&caching_ctl->list);
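The fast path above claims the block group under its spinlock, drops the lock for the blocking read of the on-disk free-space cache, then publishes either FINISHED or rolls back to NO so the kthread path can still run. A userspace analogue of that state protocol (a mutex stands in for the spinlock):

    #include <pthread.h>

    enum cache_state { CACHE_NO, CACHE_STARTED, CACHE_FINISHED };

    static pthread_mutex_t lk = PTHREAD_MUTEX_INITIALIZER;

    static int fast_cache(enum cache_state *st, int (*load)(void))
    {
            pthread_mutex_lock(&lk);
            if (*st != CACHE_NO) {          /* lost the race: nothing to do */
                    pthread_mutex_unlock(&lk);
                    return 0;
            }
            *st = CACHE_STARTED;
            pthread_mutex_unlock(&lk);

            int ok = load();                /* may block: no lock held */

            pthread_mutex_lock(&lk);
            *st = ok ? CACHE_FINISHED : CACHE_NO;
            pthread_mutex_unlock(&lk);
            return 0;
    }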
@@ -509,7 +568,7 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
509 568
510 rcu_read_lock(); 569 rcu_read_lock();
511 list_for_each_entry_rcu(found, head, list) { 570 list_for_each_entry_rcu(found, head, list) {
512 if (found->flags == flags) { 571 if (found->flags & flags) {
513 rcu_read_unlock(); 572 rcu_read_unlock();
514 return found; 573 return found;
515 } 574 }
@@ -542,6 +601,15 @@ static u64 div_factor(u64 num, int factor)
542 return num; 601 return num;
543} 602}
544 603
604static u64 div_factor_fine(u64 num, int factor)
605{
606 if (factor == 100)
607 return num;
608 num *= factor;
609 do_div(num, 100);
610 return num;
611}
612
545u64 btrfs_find_block_group(struct btrfs_root *root, 613u64 btrfs_find_block_group(struct btrfs_root *root,
546 u64 search_start, u64 search_hint, int owner) 614 u64 search_start, u64 search_hint, int owner)
547{ 615{
@@ -689,8 +757,12 @@ again:
689 atomic_inc(&head->node.refs); 757 atomic_inc(&head->node.refs);
690 spin_unlock(&delayed_refs->lock); 758 spin_unlock(&delayed_refs->lock);
691 759
692 btrfs_release_path(root->fs_info->extent_root, path); 760 btrfs_release_path(path);
693 761
762 /*
763 * Mutex was contended, block until it's released and try
764 * again
765 */
694 mutex_lock(&head->mutex); 766 mutex_lock(&head->mutex);
695 mutex_unlock(&head->mutex); 767 mutex_unlock(&head->mutex);
696 btrfs_put_delayed_ref(&head->node); 768 btrfs_put_delayed_ref(&head->node);
@@ -869,7 +941,7 @@ static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
869 break; 941 break;
870 } 942 }
871 } 943 }
872 btrfs_release_path(root, path); 944 btrfs_release_path(path);
873 945
874 if (owner < BTRFS_FIRST_FREE_OBJECTID) 946 if (owner < BTRFS_FIRST_FREE_OBJECTID)
875 new_size += sizeof(*bi); 947 new_size += sizeof(*bi);
@@ -882,7 +954,6 @@ static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
882 BUG_ON(ret); 954 BUG_ON(ret);
883 955
884 ret = btrfs_extend_item(trans, root, path, new_size); 956 ret = btrfs_extend_item(trans, root, path, new_size);
885 BUG_ON(ret);
886 957
887 leaf = path->nodes[0]; 958 leaf = path->nodes[0];
888 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 959 item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
@@ -977,7 +1048,7 @@ again:
977 return 0; 1048 return 0;
978#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 1049#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
979 key.type = BTRFS_EXTENT_REF_V0_KEY; 1050 key.type = BTRFS_EXTENT_REF_V0_KEY;
980 btrfs_release_path(root, path); 1051 btrfs_release_path(path);
981 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1052 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
982 if (ret < 0) { 1053 if (ret < 0) {
983 err = ret; 1054 err = ret;
@@ -1015,7 +1086,7 @@ again:
1015 if (match_extent_data_ref(leaf, ref, root_objectid, 1086 if (match_extent_data_ref(leaf, ref, root_objectid,
1016 owner, offset)) { 1087 owner, offset)) {
1017 if (recow) { 1088 if (recow) {
1018 btrfs_release_path(root, path); 1089 btrfs_release_path(path);
1019 goto again; 1090 goto again;
1020 } 1091 }
1021 err = 0; 1092 err = 0;
@@ -1076,7 +1147,7 @@ static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
1076 if (match_extent_data_ref(leaf, ref, root_objectid, 1147 if (match_extent_data_ref(leaf, ref, root_objectid,
1077 owner, offset)) 1148 owner, offset))
1078 break; 1149 break;
1079 btrfs_release_path(root, path); 1150 btrfs_release_path(path);
1080 key.offset++; 1151 key.offset++;
1081 ret = btrfs_insert_empty_item(trans, root, path, &key, 1152 ret = btrfs_insert_empty_item(trans, root, path, &key,
1082 size); 1153 size);
@@ -1102,7 +1173,7 @@ static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
1102 btrfs_mark_buffer_dirty(leaf); 1173 btrfs_mark_buffer_dirty(leaf);
1103 ret = 0; 1174 ret = 0;
1104fail: 1175fail:
1105 btrfs_release_path(root, path); 1176 btrfs_release_path(path);
1106 return ret; 1177 return ret;
1107} 1178}
1108 1179
@@ -1228,7 +1299,7 @@ static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
1228 ret = -ENOENT; 1299 ret = -ENOENT;
1229#ifdef BTRFS_COMPAT_EXTENT_TREE_V0 1300#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1230 if (ret == -ENOENT && parent) { 1301 if (ret == -ENOENT && parent) {
1231 btrfs_release_path(root, path); 1302 btrfs_release_path(path);
1232 key.type = BTRFS_EXTENT_REF_V0_KEY; 1303 key.type = BTRFS_EXTENT_REF_V0_KEY;
1233 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 1304 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1234 if (ret > 0) 1305 if (ret > 0)
@@ -1257,7 +1328,7 @@ static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
1257 } 1328 }
1258 1329
1259 ret = btrfs_insert_empty_item(trans, root, path, &key, 0); 1330 ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1260 btrfs_release_path(root, path); 1331 btrfs_release_path(path);
1261 return ret; 1332 return ret;
1262} 1333}
1263 1334
@@ -1490,7 +1561,6 @@ int setup_inline_extent_backref(struct btrfs_trans_handle *trans,
1490 size = btrfs_extent_inline_ref_size(type); 1561 size = btrfs_extent_inline_ref_size(type);
1491 1562
1492 ret = btrfs_extend_item(trans, root, path, size); 1563 ret = btrfs_extend_item(trans, root, path, size);
1493 BUG_ON(ret);
1494 1564
1495 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); 1565 ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1496 refs = btrfs_extent_refs(leaf, ei); 1566 refs = btrfs_extent_refs(leaf, ei);
@@ -1543,7 +1613,7 @@ static int lookup_extent_backref(struct btrfs_trans_handle *trans,
1543 if (ret != -ENOENT) 1613 if (ret != -ENOENT)
1544 return ret; 1614 return ret;
1545 1615
1546 btrfs_release_path(root, path); 1616 btrfs_release_path(path);
1547 *ref_ret = NULL; 1617 *ref_ret = NULL;
1548 1618
1549 if (owner < BTRFS_FIRST_FREE_OBJECTID) { 1619 if (owner < BTRFS_FIRST_FREE_OBJECTID) {
@@ -1619,7 +1689,6 @@ int update_inline_extent_backref(struct btrfs_trans_handle *trans,
1619 end - ptr - size); 1689 end - ptr - size);
1620 item_size -= size; 1690 item_size -= size;
1621 ret = btrfs_truncate_item(trans, root, path, item_size, 1); 1691 ret = btrfs_truncate_item(trans, root, path, item_size, 1);
1622 BUG_ON(ret);
1623 } 1692 }
1624 btrfs_mark_buffer_dirty(leaf); 1693 btrfs_mark_buffer_dirty(leaf);
1625 return 0; 1694 return 0;
@@ -1692,40 +1761,45 @@ static int remove_extent_backref(struct btrfs_trans_handle *trans,
1692 return ret; 1761 return ret;
1693} 1762}
1694 1763
1695static void btrfs_issue_discard(struct block_device *bdev, 1764static int btrfs_issue_discard(struct block_device *bdev,
1696 u64 start, u64 len) 1765 u64 start, u64 len)
1697{ 1766{
1698 blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL, 1767 return blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_NOFS, 0);
1699 BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
1700} 1768}
1701 1769
1702static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, 1770static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1703 u64 num_bytes) 1771 u64 num_bytes, u64 *actual_bytes)
1704{ 1772{
1705 int ret; 1773 int ret;
1706 u64 map_length = num_bytes; 1774 u64 discarded_bytes = 0;
1707 struct btrfs_multi_bio *multi = NULL; 1775 struct btrfs_multi_bio *multi = NULL;
1708 1776
1709 if (!btrfs_test_opt(root, DISCARD))
1710 return 0;
1711 1777
1712 /* Tell the block device(s) that the sectors can be discarded */ 1778 /* Tell the block device(s) that the sectors can be discarded */
1713 ret = btrfs_map_block(&root->fs_info->mapping_tree, READ, 1779 ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD,
1714 bytenr, &map_length, &multi, 0); 1780 bytenr, &num_bytes, &multi, 0);
1715 if (!ret) { 1781 if (!ret) {
1716 struct btrfs_bio_stripe *stripe = multi->stripes; 1782 struct btrfs_bio_stripe *stripe = multi->stripes;
1717 int i; 1783 int i;
1718 1784
1719 if (map_length > num_bytes)
1720 map_length = num_bytes;
1721 1785
1722 for (i = 0; i < multi->num_stripes; i++, stripe++) { 1786 for (i = 0; i < multi->num_stripes; i++, stripe++) {
1723 btrfs_issue_discard(stripe->dev->bdev, 1787 ret = btrfs_issue_discard(stripe->dev->bdev,
1724 stripe->physical, 1788 stripe->physical,
1725 map_length); 1789 stripe->length);
1790 if (!ret)
1791 discarded_bytes += stripe->length;
1792 else if (ret != -EOPNOTSUPP)
1793 break;
1726 } 1794 }
1727 kfree(multi); 1795 kfree(multi);
1728 } 1796 }
1797 if (discarded_bytes && ret == -EOPNOTSUPP)
1798 ret = 0;
1799
1800 if (actual_bytes)
1801 *actual_bytes = discarded_bytes;
1802
1729 1803
1730 return ret; 1804 return ret;
1731} 1805}
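The reworked discard path accumulates per-stripe byte counts and treats -EOPNOTSUPP as fatal only when nothing was discarded at all. The accounting logic in isolation (stand-in types; the real code walks btrfs_bio_stripe entries):

    #include <errno.h>
    #include <stdint.h>

    struct stripe { uint64_t length; int supports_discard; };

    static int discard_stripes(struct stripe *s, int n, uint64_t *actual)
    {
            uint64_t discarded = 0;
            int ret = 0;

            for (int i = 0; i < n; i++) {
                    ret = s[i].supports_discard ? 0 : -EOPNOTSUPP;
                    if (!ret)
                            discarded += s[i].length;
                    else if (ret != -EOPNOTSUPP)
                            break;          /* a real error stops the walk */
            }
            if (discarded && ret == -EOPNOTSUPP)
                    ret = 0;                /* partial success is success */
            if (actual)
                    *actual = discarded;
            return ret;
    }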
@@ -1792,7 +1866,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1792 __run_delayed_extent_op(extent_op, leaf, item); 1866 __run_delayed_extent_op(extent_op, leaf, item);
1793 1867
1794 btrfs_mark_buffer_dirty(leaf); 1868 btrfs_mark_buffer_dirty(leaf);
1795 btrfs_release_path(root->fs_info->extent_root, path); 1869 btrfs_release_path(path);
1796 1870
1797 path->reada = 1; 1871 path->reada = 1;
1798 path->leave_spinning = 1; 1872 path->leave_spinning = 1;
@@ -2227,6 +2301,10 @@ again:
2227 atomic_inc(&ref->refs); 2301 atomic_inc(&ref->refs);
2228 2302
2229 spin_unlock(&delayed_refs->lock); 2303 spin_unlock(&delayed_refs->lock);
2304 /*
2305 * Mutex was contended, block until it's
2306 * released and try again
2307 */
2230 mutex_lock(&head->mutex); 2308 mutex_lock(&head->mutex);
2231 mutex_unlock(&head->mutex); 2309 mutex_unlock(&head->mutex);
2232 2310
@@ -2291,8 +2369,12 @@ static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
2291 atomic_inc(&head->node.refs); 2369 atomic_inc(&head->node.refs);
2292 spin_unlock(&delayed_refs->lock); 2370 spin_unlock(&delayed_refs->lock);
2293 2371
2294 btrfs_release_path(root->fs_info->extent_root, path); 2372 btrfs_release_path(path);
2295 2373
2374 /*
2375 * Mutex was contended, block until it's released and let
2376 * caller try again
2377 */
2296 mutex_lock(&head->mutex); 2378 mutex_lock(&head->mutex);
2297 mutex_unlock(&head->mutex); 2379 mutex_unlock(&head->mutex);
2298 btrfs_put_delayed_ref(&head->node); 2380 btrfs_put_delayed_ref(&head->node);
@@ -2440,126 +2522,6 @@ out:
2440 return ret; 2522 return ret;
2441} 2523}
2442 2524
2443#if 0
2444int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2445 struct extent_buffer *buf, u32 nr_extents)
2446{
2447 struct btrfs_key key;
2448 struct btrfs_file_extent_item *fi;
2449 u64 root_gen;
2450 u32 nritems;
2451 int i;
2452 int level;
2453 int ret = 0;
2454 int shared = 0;
2455
2456 if (!root->ref_cows)
2457 return 0;
2458
2459 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
2460 shared = 0;
2461 root_gen = root->root_key.offset;
2462 } else {
2463 shared = 1;
2464 root_gen = trans->transid - 1;
2465 }
2466
2467 level = btrfs_header_level(buf);
2468 nritems = btrfs_header_nritems(buf);
2469
2470 if (level == 0) {
2471 struct btrfs_leaf_ref *ref;
2472 struct btrfs_extent_info *info;
2473
2474 ref = btrfs_alloc_leaf_ref(root, nr_extents);
2475 if (!ref) {
2476 ret = -ENOMEM;
2477 goto out;
2478 }
2479
2480 ref->root_gen = root_gen;
2481 ref->bytenr = buf->start;
2482 ref->owner = btrfs_header_owner(buf);
2483 ref->generation = btrfs_header_generation(buf);
2484 ref->nritems = nr_extents;
2485 info = ref->extents;
2486
2487 for (i = 0; nr_extents > 0 && i < nritems; i++) {
2488 u64 disk_bytenr;
2489 btrfs_item_key_to_cpu(buf, &key, i);
2490 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
2491 continue;
2492 fi = btrfs_item_ptr(buf, i,
2493 struct btrfs_file_extent_item);
2494 if (btrfs_file_extent_type(buf, fi) ==
2495 BTRFS_FILE_EXTENT_INLINE)
2496 continue;
2497 disk_bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
2498 if (disk_bytenr == 0)
2499 continue;
2500
2501 info->bytenr = disk_bytenr;
2502 info->num_bytes =
2503 btrfs_file_extent_disk_num_bytes(buf, fi);
2504 info->objectid = key.objectid;
2505 info->offset = key.offset;
2506 info++;
2507 }
2508
2509 ret = btrfs_add_leaf_ref(root, ref, shared);
2510 if (ret == -EEXIST && shared) {
2511 struct btrfs_leaf_ref *old;
2512 old = btrfs_lookup_leaf_ref(root, ref->bytenr);
2513 BUG_ON(!old);
2514 btrfs_remove_leaf_ref(root, old);
2515 btrfs_free_leaf_ref(root, old);
2516 ret = btrfs_add_leaf_ref(root, ref, shared);
2517 }
2518 WARN_ON(ret);
2519 btrfs_free_leaf_ref(root, ref);
2520 }
2521out:
2522 return ret;
2523}
2524
2525/* when a block goes through cow, we update the reference counts of
2526 * everything that block points to. The internal pointers of the block
2527 * can be in just about any order, and it is likely to have clusters of
2528 * things that are close together and clusters of things that are not.
2529 *
2530 * To help reduce the seeks that come with updating all of these reference
2531 * counts, sort them by byte number before actual updates are done.
2532 *
2533 * struct refsort is used to match byte number to slot in the btree block.
2534 * we sort based on the byte number and then use the slot to actually
2535 * find the item.
2536 *
2537 * struct refsort is smaller than struct btrfs_item and smaller than
2538 * struct btrfs_key_ptr. Since we're currently limited to the page size
2539 * for a btree block, there's no way for a kmalloc of refsorts for a
2540 * single node to be bigger than a page.
2541 */
2542struct refsort {
2543 u64 bytenr;
2544 u32 slot;
2545};
2546
2547/*
2548 * for passing into sort()
2549 */
2550static int refsort_cmp(const void *a_void, const void *b_void)
2551{
2552 const struct refsort *a = a_void;
2553 const struct refsort *b = b_void;
2554
2555 if (a->bytenr < b->bytenr)
2556 return -1;
2557 if (a->bytenr > b->bytenr)
2558 return 1;
2559 return 0;
2560}
2561#endif
2562
2563static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, 2525static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
2564 struct btrfs_root *root, 2526 struct btrfs_root *root,
2565 struct extent_buffer *buf, 2527 struct extent_buffer *buf,
@@ -2662,7 +2624,7 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans,
2662 bi = btrfs_item_ptr_offset(leaf, path->slots[0]); 2624 bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
2663 write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item)); 2625 write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
2664 btrfs_mark_buffer_dirty(leaf); 2626 btrfs_mark_buffer_dirty(leaf);
2665 btrfs_release_path(extent_root, path); 2627 btrfs_release_path(path);
2666fail: 2628fail:
2667 if (ret) 2629 if (ret)
2668 return ret; 2630 return ret;
@@ -2688,6 +2650,111 @@ next_block_group(struct btrfs_root *root,
2688 return cache; 2650 return cache;
2689} 2651}
2690 2652
2653static int cache_save_setup(struct btrfs_block_group_cache *block_group,
2654 struct btrfs_trans_handle *trans,
2655 struct btrfs_path *path)
2656{
2657 struct btrfs_root *root = block_group->fs_info->tree_root;
2658 struct inode *inode = NULL;
2659 u64 alloc_hint = 0;
2660 int dcs = BTRFS_DC_ERROR;
2661 int num_pages = 0;
2662 int retries = 0;
2663 int ret = 0;
2664
2665 /*
2666 * If this block group is smaller than 100 megs, don't bother caching the
2667 * block group.
2668 */
2669 if (block_group->key.offset < (100 * 1024 * 1024)) {
2670 spin_lock(&block_group->lock);
2671 block_group->disk_cache_state = BTRFS_DC_WRITTEN;
2672 spin_unlock(&block_group->lock);
2673 return 0;
2674 }
2675
2676again:
2677 inode = lookup_free_space_inode(root, block_group, path);
2678 if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
2679 ret = PTR_ERR(inode);
2680 btrfs_release_path(path);
2681 goto out;
2682 }
2683
2684 if (IS_ERR(inode)) {
2685 BUG_ON(retries);
2686 retries++;
2687
2688 if (block_group->ro)
2689 goto out_free;
2690
2691 ret = create_free_space_inode(root, trans, block_group, path);
2692 if (ret)
2693 goto out_free;
2694 goto again;
2695 }
2696
2697 /*
2698 * We want to set the generation to 0 so that if anything goes wrong
2699 * from here on out, we know not to trust this cache when we load up next
2700 * time.
2701 */
2702 BTRFS_I(inode)->generation = 0;
2703 ret = btrfs_update_inode(trans, root, inode);
2704 WARN_ON(ret);
2705
2706 if (i_size_read(inode) > 0) {
2707 ret = btrfs_truncate_free_space_cache(root, trans, path,
2708 inode);
2709 if (ret)
2710 goto out_put;
2711 }
2712
2713 spin_lock(&block_group->lock);
2714 if (block_group->cached != BTRFS_CACHE_FINISHED) {
2715 /* We're not cached, don't bother trying to write stuff out */
2716 dcs = BTRFS_DC_WRITTEN;
2717 spin_unlock(&block_group->lock);
2718 goto out_put;
2719 }
2720 spin_unlock(&block_group->lock);
2721
2722 num_pages = (int)div64_u64(block_group->key.offset, 1024 * 1024 * 1024);
2723 if (!num_pages)
2724 num_pages = 1;
2725
2726 /*
2727 * Just to make absolutely sure we have enough space, we're going to
2728 * preallocate 16 pages worth of space for each block group. In
2729 * practice we ought to use at most 8, but we need extra space so we can
2730 * add our header and have a terminator between the extents and the
2731 * bitmaps.
2732 */
2733 num_pages *= 16;
2734 num_pages *= PAGE_CACHE_SIZE;
2735
2736 ret = btrfs_check_data_free_space(inode, num_pages);
2737 if (ret)
2738 goto out_put;
2739
2740 ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
2741 num_pages, num_pages,
2742 &alloc_hint);
2743 if (!ret)
2744 dcs = BTRFS_DC_SETUP;
2745 btrfs_free_reserved_data_space(inode, num_pages);
2746out_put:
2747 iput(inode);
2748out_free:
2749 btrfs_release_path(path);
2750out:
2751 spin_lock(&block_group->lock);
2752 block_group->disk_cache_state = dcs;
2753 spin_unlock(&block_group->lock);
2754
2755 return ret;
2756}
2757
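
cache_save_setup() sizes the free-space cache file at 16 pages per gigabyte of block group, with a one-gigabyte floor, and preallocates that much data space up front. A sketch of just the sizing arithmetic, assuming 4 KiB pages:

/*
 * Cache file size per block group, mirroring cache_save_setup():
 * 16 pages per GiB of block group, at least 16 pages total.
 * PAGE_CACHE_SIZE is assumed to be 4096 here.
 */
static u64 cache_file_bytes(u64 block_group_bytes)
{
	u64 num_pages = div64_u64(block_group_bytes, 1024 * 1024 * 1024);

	if (!num_pages)
		num_pages = 1;
	return num_pages * 16 * 4096;
}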
2691int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, 2758int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
2692 struct btrfs_root *root) 2759 struct btrfs_root *root)
2693{ 2760{
@@ -2700,6 +2767,25 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
2700 if (!path) 2767 if (!path)
2701 return -ENOMEM; 2768 return -ENOMEM;
2702 2769
2770again:
2771 while (1) {
2772 cache = btrfs_lookup_first_block_group(root->fs_info, last);
2773 while (cache) {
2774 if (cache->disk_cache_state == BTRFS_DC_CLEAR)
2775 break;
2776 cache = next_block_group(root, cache);
2777 }
2778 if (!cache) {
2779 if (last == 0)
2780 break;
2781 last = 0;
2782 continue;
2783 }
2784 err = cache_save_setup(cache, trans, path);
2785 last = cache->key.objectid + cache->key.offset;
2786 btrfs_put_block_group(cache);
2787 }
2788
2703 while (1) { 2789 while (1) {
2704 if (last == 0) { 2790 if (last == 0) {
2705 err = btrfs_run_delayed_refs(trans, root, 2791 err = btrfs_run_delayed_refs(trans, root,
@@ -2709,6 +2795,11 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
2709 2795
2710 cache = btrfs_lookup_first_block_group(root->fs_info, last); 2796 cache = btrfs_lookup_first_block_group(root->fs_info, last);
2711 while (cache) { 2797 while (cache) {
2798 if (cache->disk_cache_state == BTRFS_DC_CLEAR) {
2799 btrfs_put_block_group(cache);
2800 goto again;
2801 }
2802
2712 if (cache->dirty) 2803 if (cache->dirty)
2713 break; 2804 break;
2714 cache = next_block_group(root, cache); 2805 cache = next_block_group(root, cache);
@@ -2720,6 +2811,8 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
2720 continue; 2811 continue;
2721 } 2812 }
2722 2813
2814 if (cache->disk_cache_state == BTRFS_DC_SETUP)
2815 cache->disk_cache_state = BTRFS_DC_NEED_WRITE;
2723 cache->dirty = 0; 2816 cache->dirty = 0;
2724 last = cache->key.objectid + cache->key.offset; 2817 last = cache->key.objectid + cache->key.offset;
2725 2818
@@ -2728,6 +2821,52 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
2728 btrfs_put_block_group(cache); 2821 btrfs_put_block_group(cache);
2729 } 2822 }
2730 2823
2824 while (1) {
2825 /*
2826 * I don't think this is needed since we're just marking our
2827 * preallocated extent as written, but just in case, it can't
2828 * hurt.
2829 */
2830 if (last == 0) {
2831 err = btrfs_run_delayed_refs(trans, root,
2832 (unsigned long)-1);
2833 BUG_ON(err);
2834 }
2835
2836 cache = btrfs_lookup_first_block_group(root->fs_info, last);
2837 while (cache) {
2838 /*
2839 * Really this shouldn't happen, but it could if we
2840 * couldn't write the entire preallocated extent and
2841 * splitting the extent resulted in a new block.
2842 */
2843 if (cache->dirty) {
2844 btrfs_put_block_group(cache);
2845 goto again;
2846 }
2847 if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
2848 break;
2849 cache = next_block_group(root, cache);
2850 }
2851 if (!cache) {
2852 if (last == 0)
2853 break;
2854 last = 0;
2855 continue;
2856 }
2857
2858 btrfs_write_out_cache(root, trans, cache, path);
2859
2860 /*
2861 * If we didn't have an error then the cache state is still
2862 * NEED_WRITE, so we can set it to WRITTEN.
2863 */
2864 if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
2865 cache->disk_cache_state = BTRFS_DC_WRITTEN;
2866 last = cache->key.objectid + cache->key.offset;
2867 btrfs_put_block_group(cache);
2868 }
2869
2731 btrfs_free_path(path); 2870 btrfs_free_path(path);
2732 return 0; 2871 return 0;
2733} 2872}
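
btrfs_write_dirty_block_groups() now makes three passes over the block groups: set up a cache inode for every group still in BTRFS_DC_CLEAR, write the dirty block group items (restarting from scratch if a group fell back to DC_CLEAR in the meantime), and finally write out the caches left in DC_NEED_WRITE. All three passes share the same restart-scan shape, sketched below; wants_work() and do_work() are placeholders for the per-pass logic.

/*
 * Scan shape shared by the three passes above: walk block groups by
 * key, wrap around once from zero, and restart the whole scan when
 * state changes underneath us.
 */
static int wants_work(struct btrfs_block_group_cache *cache);
static int do_work(struct btrfs_block_group_cache *cache);

static void scan_block_groups(struct btrfs_root *root)
{
	struct btrfs_block_group_cache *cache;
	u64 last = 0;

again:
	while (1) {
		cache = btrfs_lookup_first_block_group(root->fs_info, last);
		while (cache && !wants_work(cache))
			cache = next_block_group(root, cache);
		if (!cache) {
			if (last == 0)
				break;		/* wrapped: done */
			last = 0;		/* wrap to the start */
			continue;
		}
		if (do_work(cache)) {
			btrfs_put_block_group(cache);
			goto again;		/* state changed: rescan */
		}
		last = cache->key.objectid + cache->key.offset;
		btrfs_put_block_group(cache);
	}
}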
@@ -2763,6 +2902,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
2763 if (found) { 2902 if (found) {
2764 spin_lock(&found->lock); 2903 spin_lock(&found->lock);
2765 found->total_bytes += total_bytes; 2904 found->total_bytes += total_bytes;
2905 found->disk_total += total_bytes * factor;
2766 found->bytes_used += bytes_used; 2906 found->bytes_used += bytes_used;
2767 found->disk_used += bytes_used * factor; 2907 found->disk_used += bytes_used * factor;
2768 found->full = 0; 2908 found->full = 0;
@@ -2782,6 +2922,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
2782 BTRFS_BLOCK_GROUP_SYSTEM | 2922 BTRFS_BLOCK_GROUP_SYSTEM |
2783 BTRFS_BLOCK_GROUP_METADATA); 2923 BTRFS_BLOCK_GROUP_METADATA);
2784 found->total_bytes = total_bytes; 2924 found->total_bytes = total_bytes;
2925 found->disk_total = total_bytes * factor;
2785 found->bytes_used = bytes_used; 2926 found->bytes_used = bytes_used;
2786 found->disk_used = bytes_used * factor; 2927 found->disk_used = bytes_used * factor;
2787 found->bytes_pinned = 0; 2928 found->bytes_pinned = 0;
@@ -2789,7 +2930,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags,
2789 found->bytes_readonly = 0; 2930 found->bytes_readonly = 0;
2790 found->bytes_may_use = 0; 2931 found->bytes_may_use = 0;
2791 found->full = 0; 2932 found->full = 0;
2792 found->force_alloc = 0; 2933 found->force_alloc = CHUNK_ALLOC_NO_FORCE;
2934 found->chunk_alloc = 0;
2793 *space_info = found; 2935 *space_info = found;
2794 list_add_rcu(&found->list, &info->space_info); 2936 list_add_rcu(&found->list, &info->space_info);
2795 atomic_set(&found->caching_threads, 0); 2937 atomic_set(&found->caching_threads, 0);
@@ -2814,7 +2956,13 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
2814 2956
2815u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) 2957u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
2816{ 2958{
2817 u64 num_devices = root->fs_info->fs_devices->rw_devices; 2959 /*
2960 * we add in the count of missing devices because we want
2961 * to make sure that any RAID levels on a degraded FS
2962 * continue to be honored.
2963 */
2964 u64 num_devices = root->fs_info->fs_devices->rw_devices +
2965 root->fs_info->fs_devices->missing_devices;
2818 2966
2819 if (num_devices == 1) 2967 if (num_devices == 1)
2820 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0); 2968 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
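
Counting missing devices matters for degraded mounts: a two-disk RAID1 with one disk absent still reports num_devices == 2, so new chunks keep the RAID1 profile instead of silently dropping to single. A sketch of the reduction rules applied here; the flags are the real BTRFS_BLOCK_GROUP_* values, but the helper itself is a simplification of btrfs_reduce_alloc_profile().

/*
 * Strip RAID profiles the current device count (rw + missing)
 * cannot sustain, as the function above does.
 */
static u64 reduce_profile(u64 flags, u64 num_devices)
{
	if (num_devices == 1)
		flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
	if (num_devices < 4)
		flags &= ~BTRFS_BLOCK_GROUP_RAID10;
	return flags;
}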
@@ -2854,7 +3002,7 @@ static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
2854 return btrfs_reduce_alloc_profile(root, flags); 3002 return btrfs_reduce_alloc_profile(root, flags);
2855} 3003}
2856 3004
2857static u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data) 3005u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
2858{ 3006{
2859 u64 flags; 3007 u64 flags;
2860 3008
@@ -2883,11 +3031,17 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
2883 struct btrfs_space_info *data_sinfo; 3031 struct btrfs_space_info *data_sinfo;
2884 struct btrfs_root *root = BTRFS_I(inode)->root; 3032 struct btrfs_root *root = BTRFS_I(inode)->root;
2885 u64 used; 3033 u64 used;
2886 int ret = 0, committed = 0; 3034 int ret = 0, committed = 0, alloc_chunk = 1;
2887 3035
2888 /* make sure bytes are sectorsize aligned */ 3036 /* make sure bytes are sectorsize aligned */
2889 bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); 3037 bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
2890 3038
3039 if (root == root->fs_info->tree_root ||
3040 BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) {
3041 alloc_chunk = 0;
3042 committed = 1;
3043 }
3044
2891 data_sinfo = BTRFS_I(inode)->space_info; 3045 data_sinfo = BTRFS_I(inode)->space_info;
2892 if (!data_sinfo) 3046 if (!data_sinfo)
2893 goto alloc; 3047 goto alloc;
@@ -2906,23 +3060,28 @@ again:
2906 * if we don't have enough free bytes in this space then we need 3060 * if we don't have enough free bytes in this space then we need
2907 * to alloc a new chunk. 3061 * to alloc a new chunk.
2908 */ 3062 */
2909 if (!data_sinfo->full) { 3063 if (!data_sinfo->full && alloc_chunk) {
2910 u64 alloc_target; 3064 u64 alloc_target;
2911 3065
2912 data_sinfo->force_alloc = 1; 3066 data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
2913 spin_unlock(&data_sinfo->lock); 3067 spin_unlock(&data_sinfo->lock);
2914alloc: 3068alloc:
2915 alloc_target = btrfs_get_alloc_profile(root, 1); 3069 alloc_target = btrfs_get_alloc_profile(root, 1);
2916 trans = btrfs_join_transaction(root, 1); 3070 trans = btrfs_join_transaction(root);
2917 if (IS_ERR(trans)) 3071 if (IS_ERR(trans))
2918 return PTR_ERR(trans); 3072 return PTR_ERR(trans);
2919 3073
2920 ret = do_chunk_alloc(trans, root->fs_info->extent_root, 3074 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
2921 bytes + 2 * 1024 * 1024, 3075 bytes + 2 * 1024 * 1024,
2922 alloc_target, 0); 3076 alloc_target,
3077 CHUNK_ALLOC_NO_FORCE);
2923 btrfs_end_transaction(trans, root); 3078 btrfs_end_transaction(trans, root);
2924 if (ret < 0) 3079 if (ret < 0) {
2925 return ret; 3080 if (ret != -ENOSPC)
3081 return ret;
3082 else
3083 goto commit_trans;
3084 }
2926 3085
2927 if (!data_sinfo) { 3086 if (!data_sinfo) {
2928 btrfs_set_inode_space_info(root, inode); 3087 btrfs_set_inode_space_info(root, inode);
@@ -2930,12 +3089,21 @@ alloc:
2930 } 3089 }
2931 goto again; 3090 goto again;
2932 } 3091 }
3092
3093 /*
3094 * If we have less pinned bytes than we want to allocate then
3095 * don't bother committing the transaction; it won't help us.
3096 */
3097 if (data_sinfo->bytes_pinned < bytes)
3098 committed = 1;
2933 spin_unlock(&data_sinfo->lock); 3099 spin_unlock(&data_sinfo->lock);
2934 3100
2935 /* commit the current transaction and try again */ 3101 /* commit the current transaction and try again */
2936 if (!committed && !root->fs_info->open_ioctl_trans) { 3102commit_trans:
3103 if (!committed &&
3104 !atomic_read(&root->fs_info->open_ioctl_trans)) {
2937 committed = 1; 3105 committed = 1;
2938 trans = btrfs_join_transaction(root, 1); 3106 trans = btrfs_join_transaction(root);
2939 if (IS_ERR(trans)) 3107 if (IS_ERR(trans))
2940 return PTR_ERR(trans); 3108 return PTR_ERR(trans);
2941 ret = btrfs_commit_transaction(trans, root); 3109 ret = btrfs_commit_transaction(trans, root);
@@ -2944,18 +3112,6 @@ alloc:
2944 goto again; 3112 goto again;
2945 } 3113 }
2946 3114
2947#if 0 /* I hope we never need this code again, just in case */
2948 printk(KERN_ERR "no space left, need %llu, %llu bytes_used, "
2949 "%llu bytes_reserved, " "%llu bytes_pinned, "
2950 "%llu bytes_readonly, %llu may use %llu total\n",
2951 (unsigned long long)bytes,
2952 (unsigned long long)data_sinfo->bytes_used,
2953 (unsigned long long)data_sinfo->bytes_reserved,
2954 (unsigned long long)data_sinfo->bytes_pinned,
2955 (unsigned long long)data_sinfo->bytes_readonly,
2956 (unsigned long long)data_sinfo->bytes_may_use,
2957 (unsigned long long)data_sinfo->total_bytes);
2958#endif
2959 return -ENOSPC; 3115 return -ENOSPC;
2960 } 3116 }
2961 data_sinfo->bytes_may_use += bytes; 3117 data_sinfo->bytes_may_use += bytes;
@@ -2993,24 +3149,56 @@ static void force_metadata_allocation(struct btrfs_fs_info *info)
2993 rcu_read_lock(); 3149 rcu_read_lock();
2994 list_for_each_entry_rcu(found, head, list) { 3150 list_for_each_entry_rcu(found, head, list) {
2995 if (found->flags & BTRFS_BLOCK_GROUP_METADATA) 3151 if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
2996 found->force_alloc = 1; 3152 found->force_alloc = CHUNK_ALLOC_FORCE;
2997 } 3153 }
2998 rcu_read_unlock(); 3154 rcu_read_unlock();
2999} 3155}
3000 3156
3001static int should_alloc_chunk(struct btrfs_space_info *sinfo, 3157static int should_alloc_chunk(struct btrfs_root *root,
3002 u64 alloc_bytes) 3158 struct btrfs_space_info *sinfo, u64 alloc_bytes,
3159 int force)
3003{ 3160{
3004 u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly; 3161 u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
3162 u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved;
3163 u64 thresh;
3164
3165 if (force == CHUNK_ALLOC_FORCE)
3166 return 1;
3005 3167
3006 if (sinfo->bytes_used + sinfo->bytes_reserved + 3168 /*
3007 alloc_bytes + 256 * 1024 * 1024 < num_bytes) 3169 * in limited mode, we want to have some free space up to
3170 * about 1% of the FS size.
3171 */
3172 if (force == CHUNK_ALLOC_LIMITED) {
3173 thresh = btrfs_super_total_bytes(&root->fs_info->super_copy);
3174 thresh = max_t(u64, 64 * 1024 * 1024,
3175 div_factor_fine(thresh, 1));
3176
3177 if (num_bytes - num_allocated < thresh)
3178 return 1;
3179 }
3180
3181 /*
3182 * we have two similar checks here, one based on percentage
3183 * and once based on a hard number of 256MB. The idea
3184 * is that if we have a good amount of free
3185 * room, don't allocate a chunk. A good mount is
3186 * less than 80% utilized of the chunks we have allocated,
3187 * or more than 256MB free
3188 */
3189 if (num_allocated + alloc_bytes + 256 * 1024 * 1024 < num_bytes)
3008 return 0; 3190 return 0;
3009 3191
3010 if (sinfo->bytes_used + sinfo->bytes_reserved + 3192 if (num_allocated + alloc_bytes < div_factor(num_bytes, 8))
3011 alloc_bytes < div_factor(num_bytes, 8))
3012 return 0; 3193 return 0;
3013 3194
3195 thresh = btrfs_super_total_bytes(&root->fs_info->super_copy);
3196
3197 /* 256MB or 5% of the FS */
3198 thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5));
3199
3200 if (num_bytes > thresh && sinfo->bytes_used < div_factor(num_bytes, 3))
3201 return 0;
3014 return 1; 3202 return 1;
3015} 3203}
3016 3204
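
The thresholds above lean on two btrfs helpers: div_factor(n, f) computes n * f / 10 and div_factor_fine(n, f) computes n * f / 100. So CHUNK_ALLOC_LIMITED allocates while free room in existing chunks is below max(64MB, 1% of the FS), and the general path refuses a new chunk while there is 256MB spare or utilization is under 80%, and also skips allocation on space larger than max(256MB, 5% of the FS) that is still under 30% used. A sketch of the assumed helper semantics:

/*
 * Assumed semantics of the helpers used in should_alloc_chunk():
 * tenths and hundredths of a u64, using do_div() so this also works
 * on 32-bit without a native 64-bit divide.
 */
static inline u64 div_factor(u64 num, int factor)	/* num * f / 10 */
{
	num *= factor;
	do_div(num, 10);
	return num;
}

static inline u64 div_factor_fine(u64 num, int factor)	/* num * f / 100 */
{
	num *= factor;
	do_div(num, 100);
	return num;
}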
@@ -3020,10 +3208,9 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3020{ 3208{
3021 struct btrfs_space_info *space_info; 3209 struct btrfs_space_info *space_info;
3022 struct btrfs_fs_info *fs_info = extent_root->fs_info; 3210 struct btrfs_fs_info *fs_info = extent_root->fs_info;
3211 int wait_for_alloc = 0;
3023 int ret = 0; 3212 int ret = 0;
3024 3213
3025 mutex_lock(&fs_info->chunk_mutex);
3026
3027 flags = btrfs_reduce_alloc_profile(extent_root, flags); 3214 flags = btrfs_reduce_alloc_profile(extent_root, flags);
3028 3215
3029 space_info = __find_space_info(extent_root->fs_info, flags); 3216 space_info = __find_space_info(extent_root->fs_info, flags);
@@ -3034,20 +3221,47 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3034 } 3221 }
3035 BUG_ON(!space_info); 3222 BUG_ON(!space_info);
3036 3223
3224again:
3037 spin_lock(&space_info->lock); 3225 spin_lock(&space_info->lock);
3038 if (space_info->force_alloc) 3226 if (space_info->force_alloc)
3039 force = 1; 3227 force = space_info->force_alloc;
3040 if (space_info->full) { 3228 if (space_info->full) {
3041 spin_unlock(&space_info->lock); 3229 spin_unlock(&space_info->lock);
3042 goto out; 3230 return 0;
3043 } 3231 }
3044 3232
3045 if (!force && !should_alloc_chunk(space_info, alloc_bytes)) { 3233 if (!should_alloc_chunk(extent_root, space_info, alloc_bytes, force)) {
3046 spin_unlock(&space_info->lock); 3234 spin_unlock(&space_info->lock);
3047 goto out; 3235 return 0;
3236 } else if (space_info->chunk_alloc) {
3237 wait_for_alloc = 1;
3238 } else {
3239 space_info->chunk_alloc = 1;
3048 } 3240 }
3241
3049 spin_unlock(&space_info->lock); 3242 spin_unlock(&space_info->lock);
3050 3243
3244 mutex_lock(&fs_info->chunk_mutex);
3245
3246 /*
3247 * The chunk_mutex is held throughout the entirety of a chunk
3248 * allocation, so once we've acquired the chunk_mutex we know that the
3249 * other guy is done and we need to recheck and see if we should
3250 * allocate.
3251 */
3252 if (wait_for_alloc) {
3253 mutex_unlock(&fs_info->chunk_mutex);
3254 wait_for_alloc = 0;
3255 goto again;
3256 }
3257
3258 /*
3259 * If we have mixed data/metadata chunks we want to make sure we keep
3260 * allocating mixed chunks instead of individual chunks.
3261 */
3262 if (btrfs_mixed_space_info(space_info))
3263 flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
3264
3051 /* 3265 /*
3052 * if we're doing a data chunk, go ahead and make sure that 3266 * if we're doing a data chunk, go ahead and make sure that
3053 * we keep a reasonable number of metadata chunks allocated in the 3267 * we keep a reasonable number of metadata chunks allocated in the
@@ -3066,167 +3280,220 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3066 space_info->full = 1; 3280 space_info->full = 1;
3067 else 3281 else
3068 ret = 1; 3282 ret = 1;
3069 space_info->force_alloc = 0; 3283
3284 space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
3285 space_info->chunk_alloc = 0;
3070 spin_unlock(&space_info->lock); 3286 spin_unlock(&space_info->lock);
3071out:
3072 mutex_unlock(&extent_root->fs_info->chunk_mutex); 3287 mutex_unlock(&extent_root->fs_info->chunk_mutex);
3073 return ret; 3288 return ret;
3074} 3289}
3075 3290
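
The new chunk_alloc flag and wait_for_alloc handshake serialize concurrent allocators without holding chunk_mutex across the should-allocate check: the first caller claims the flag under space_info->lock, while latecomers take and immediately release chunk_mutex (which the winner holds for the entire allocation) and then loop back to re-evaluate, since the chunk they wanted may already exist. A condensed sketch of the protocol; should_alloc() abbreviates the force/threshold checks and error handling is simplified.

/*
 * Condensed from do_chunk_alloc() above: one allocator at a time,
 * waiters block on chunk_mutex and re-check rather than allocating
 * a duplicate chunk.
 */
static int should_alloc(struct btrfs_space_info *info);

static int alloc_chunk_once(struct btrfs_fs_info *fs_info,
			    struct btrfs_space_info *info,
			    struct btrfs_trans_handle *trans, u64 flags)
{
	int wait_for_alloc;
	int ret;

again:
	spin_lock(&info->lock);
	if (info->full || !should_alloc(info)) {
		spin_unlock(&info->lock);
		return 0;
	}
	wait_for_alloc = info->chunk_alloc;	/* allocation in flight? */
	if (!wait_for_alloc)
		info->chunk_alloc = 1;		/* claim the slot */
	spin_unlock(&info->lock);

	mutex_lock(&fs_info->chunk_mutex);
	if (wait_for_alloc) {
		/* the winner is done; what we need may now exist */
		mutex_unlock(&fs_info->chunk_mutex);
		goto again;
	}
	ret = btrfs_alloc_chunk(trans, fs_info->extent_root, flags);

	spin_lock(&info->lock);
	if (ret)
		info->full = 1;
	info->chunk_alloc = 0;			/* release the slot */
	spin_unlock(&info->lock);

	mutex_unlock(&fs_info->chunk_mutex);
	return ret;
}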
3076static int maybe_allocate_chunk(struct btrfs_trans_handle *trans,
3077 struct btrfs_root *root,
3078 struct btrfs_space_info *sinfo, u64 num_bytes)
3079{
3080 int ret;
3081 int end_trans = 0;
3082
3083 if (sinfo->full)
3084 return 0;
3085
3086 spin_lock(&sinfo->lock);
3087 ret = should_alloc_chunk(sinfo, num_bytes + 2 * 1024 * 1024);
3088 spin_unlock(&sinfo->lock);
3089 if (!ret)
3090 return 0;
3091
3092 if (!trans) {
3093 trans = btrfs_join_transaction(root, 1);
3094 BUG_ON(IS_ERR(trans));
3095 end_trans = 1;
3096 }
3097
3098 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3099 num_bytes + 2 * 1024 * 1024,
3100 get_alloc_profile(root, sinfo->flags), 0);
3101
3102 if (end_trans)
3103 btrfs_end_transaction(trans, root);
3104
3105 return ret == 1 ? 1 : 0;
3106}
3107
3108/* 3291/*
3109 * shrink metadata reservation for delalloc 3292 * shrink metadata reservation for delalloc
3110 */ 3293 */
3111static int shrink_delalloc(struct btrfs_trans_handle *trans, 3294static int shrink_delalloc(struct btrfs_trans_handle *trans,
3112 struct btrfs_root *root, u64 to_reclaim) 3295 struct btrfs_root *root, u64 to_reclaim, int sync)
3113{ 3296{
3114 struct btrfs_block_rsv *block_rsv; 3297 struct btrfs_block_rsv *block_rsv;
3298 struct btrfs_space_info *space_info;
3115 u64 reserved; 3299 u64 reserved;
3116 u64 max_reclaim; 3300 u64 max_reclaim;
3117 u64 reclaimed = 0; 3301 u64 reclaimed = 0;
3118 int pause = 1; 3302 long time_left;
3119 int ret; 3303 int nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
3304 int loops = 0;
3305 unsigned long progress;
3120 3306
3121 block_rsv = &root->fs_info->delalloc_block_rsv; 3307 block_rsv = &root->fs_info->delalloc_block_rsv;
3122 spin_lock(&block_rsv->lock); 3308 space_info = block_rsv->space_info;
3123 reserved = block_rsv->reserved; 3309
3124 spin_unlock(&block_rsv->lock); 3310 smp_mb();
3311 reserved = space_info->bytes_reserved;
3312 progress = space_info->reservation_progress;
3125 3313
3126 if (reserved == 0) 3314 if (reserved == 0)
3127 return 0; 3315 return 0;
3128 3316
3129 max_reclaim = min(reserved, to_reclaim); 3317 max_reclaim = min(reserved, to_reclaim);
3130 3318
3131 while (1) { 3319 while (loops < 1024) {
3132 ret = btrfs_start_one_delalloc_inode(root, trans ? 1 : 0); 3320 /* have the flusher threads jump in and do some IO */
3133 if (!ret) { 3321 smp_mb();
3134 __set_current_state(TASK_INTERRUPTIBLE); 3322 nr_pages = min_t(unsigned long, nr_pages,
3135 schedule_timeout(pause); 3323 root->fs_info->delalloc_bytes >> PAGE_CACHE_SHIFT);
3136 pause <<= 1; 3324 writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages);
3137 if (pause > HZ / 10)
3138 pause = HZ / 10;
3139 } else {
3140 pause = 1;
3141 }
3142 3325
3143 spin_lock(&block_rsv->lock); 3326 spin_lock(&space_info->lock);
3144 if (reserved > block_rsv->reserved) 3327 if (reserved > space_info->bytes_reserved)
3145 reclaimed = reserved - block_rsv->reserved; 3328 reclaimed += reserved - space_info->bytes_reserved;
3146 reserved = block_rsv->reserved; 3329 reserved = space_info->bytes_reserved;
3147 spin_unlock(&block_rsv->lock); 3330 spin_unlock(&space_info->lock);
3331
3332 loops++;
3148 3333
3149 if (reserved == 0 || reclaimed >= max_reclaim) 3334 if (reserved == 0 || reclaimed >= max_reclaim)
3150 break; 3335 break;
3151 3336
3152 if (trans && trans->transaction->blocked) 3337 if (trans && trans->transaction->blocked)
3153 return -EAGAIN; 3338 return -EAGAIN;
3339
3340 time_left = schedule_timeout_interruptible(1);
3341
3342 /* We were interrupted, exit */
3343 if (time_left)
3344 break;
3345
3346 /* we've kicked the IO a few times; if anything has been freed,
3347 * exit. There is no sense in looping here for a long time
3348 * when we really need to commit the transaction, or there are
3349 * just too many writers without enough free space
3350 */
3351
3352 if (loops > 3) {
3353 smp_mb();
3354 if (progress != space_info->reservation_progress)
3355 break;
3356 }
3357
3154 } 3358 }
3155 return reclaimed >= to_reclaim; 3359 return reclaimed >= to_reclaim;
3156} 3360}
3157 3361
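
The rewritten shrink_delalloc() no longer pushes one delalloc inode at a time. Each pass asks the idle flusher threads to write back up to 2MB worth of pages (writeback_inodes_sb_nr_if_idle), sleeps for a jiffy, and measures progress through space_info->bytes_reserved plus the new reservation_progress counter, bailing out after a few fruitless passes instead of spinning for seconds. A sketch of the poll-or-bail loop; sample_reserved() abbreviates the spin-locked reads of bytes_reserved.

/*
 * Shape of the reclaim loop above: kick background writeback, nap
 * for a jiffy, stop once enough was reclaimed, a signal arrived, or
 * several passes made no progress at all.
 */
static u64 sample_reserved(struct btrfs_space_info *space_info);

static int reclaim_loop(struct super_block *sb,
			struct btrfs_space_info *space_info,
			u64 max_reclaim, unsigned long nr_pages)
{
	unsigned long progress = space_info->reservation_progress;
	u64 reclaimed = 0;
	int loops;

	for (loops = 0; loops < 1024; loops++) {
		writeback_inodes_sb_nr_if_idle(sb, nr_pages);

		reclaimed += sample_reserved(space_info);
		if (reclaimed >= max_reclaim)
			break;

		if (schedule_timeout_interruptible(1))
			break;		/* interrupted by a signal */

		/* no one has freed anything in a few passes: give up */
		if (loops > 3 &&
		    progress == space_info->reservation_progress)
			break;
	}
	return reclaimed >= max_reclaim;
}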
3158static int should_retry_reserve(struct btrfs_trans_handle *trans, 3362/*
3159 struct btrfs_root *root, 3363 * Retries tells us how many times we've called reserve_metadata_bytes. The
3160 struct btrfs_block_rsv *block_rsv, 3364 * idea is that if this is the first call (retries == 0) then we will add to our
3161 u64 num_bytes, int *retries) 3365 * reserved count if we can't make the allocation in order to hold our place
3366 * while we go and try and free up space. That way for retries > 1 we don't try
3367 * and add space; we just check to see if the amount of unused space is >= the
3368 * total space, meaning that our reservation is valid.
3369 *
3370 * However if we don't intend to retry this reservation, pass -1 as retries so
3371 * that it short circuits this logic.
3372 */
3373static int reserve_metadata_bytes(struct btrfs_trans_handle *trans,
3374 struct btrfs_root *root,
3375 struct btrfs_block_rsv *block_rsv,
3376 u64 orig_bytes, int flush)
3162{ 3377{
3163 struct btrfs_space_info *space_info = block_rsv->space_info; 3378 struct btrfs_space_info *space_info = block_rsv->space_info;
3164 int ret; 3379 u64 unused;
3380 u64 num_bytes = orig_bytes;
3381 int retries = 0;
3382 int ret = 0;
3383 bool reserved = false;
3384 bool committed = false;
3165 3385
3166 if ((*retries) > 2) 3386again:
3167 return -ENOSPC; 3387 ret = -ENOSPC;
3388 if (reserved)
3389 num_bytes = 0;
3168 3390
3169 ret = maybe_allocate_chunk(trans, root, space_info, num_bytes); 3391 spin_lock(&space_info->lock);
3170 if (ret) 3392 unused = space_info->bytes_used + space_info->bytes_reserved +
3171 return 1; 3393 space_info->bytes_pinned + space_info->bytes_readonly +
3394 space_info->bytes_may_use;
3172 3395
3173 if (trans && trans->transaction->in_commit) 3396 /*
3174 return -ENOSPC; 3397 * The idea here is that if we've not already over-reserved the block group
3398 * then we can go ahead and save our reservation first and then start
3399 * flushing if we need to. Otherwise if we've already overcommitted,
3400 * let's start flushing stuff first and then come back and try to make
3401 * our reservation.
3402 */
3403 if (unused <= space_info->total_bytes) {
3404 unused = space_info->total_bytes - unused;
3405 if (unused >= num_bytes) {
3406 if (!reserved)
3407 space_info->bytes_reserved += orig_bytes;
3408 ret = 0;
3409 } else {
3410 /*
3411 * Ok set num_bytes to orig_bytes since we aren't
3412 * overcommitted; this way we only try and reclaim what
3413 * we need.
3414 */
3415 num_bytes = orig_bytes;
3416 }
3417 } else {
3418 /*
3419 * Ok we're over committed, set num_bytes to the overcommitted
3420 * amount plus the amount of bytes that we need for this
3421 * reservation.
3422 */
3423 num_bytes = unused - space_info->total_bytes +
3424 (orig_bytes * (retries + 1));
3425 }
3175 3426
3176 ret = shrink_delalloc(trans, root, num_bytes); 3427 /*
3177 if (ret) 3428 * Couldn't make our reservation, save our place so while we're trying
3178 return ret; 3429 * to reclaim space we can actually use it instead of somebody else
3430 * stealing it from us.
3431 */
3432 if (ret && !reserved) {
3433 space_info->bytes_reserved += orig_bytes;
3434 reserved = true;
3435 }
3179 3436
3180 spin_lock(&space_info->lock);
3181 if (space_info->bytes_pinned < num_bytes)
3182 ret = 1;
3183 spin_unlock(&space_info->lock); 3437 spin_unlock(&space_info->lock);
3184 if (ret)
3185 return -ENOSPC;
3186 3438
3187 (*retries)++; 3439 if (!ret)
3188 3440 return 0;
3189 if (trans)
3190 return -EAGAIN;
3191 3441
3192 trans = btrfs_join_transaction(root, 1); 3442 if (!flush)
3193 BUG_ON(IS_ERR(trans)); 3443 goto out;
3194 ret = btrfs_commit_transaction(trans, root);
3195 BUG_ON(ret);
3196 3444
3197 return 1; 3445 /*
3198} 3446 * We do synchronous shrinking since we don't actually unreserve
3447 * metadata until after the IO is completed.
3448 */
3449 ret = shrink_delalloc(trans, root, num_bytes, 1);
3450 if (ret > 0)
3451 return 0;
3452 else if (ret < 0)
3453 goto out;
3199 3454
3200static int reserve_metadata_bytes(struct btrfs_block_rsv *block_rsv, 3455 /*
3201 u64 num_bytes) 3456 * So if we were overcommitted it's possible that somebody else flushed
3202{ 3457 * out enough space and we simply didn't have enough space to reclaim,
3203 struct btrfs_space_info *space_info = block_rsv->space_info; 3458 * so go back around and try again.
3204 u64 unused; 3459 */
3205 int ret = -ENOSPC; 3460 if (retries < 2) {
3461 retries++;
3462 goto again;
3463 }
3206 3464
3207 spin_lock(&space_info->lock); 3465 spin_lock(&space_info->lock);
3208 unused = space_info->bytes_used + space_info->bytes_reserved + 3466 /*
3209 space_info->bytes_pinned + space_info->bytes_readonly; 3467 * Not enough space to be reclaimed; don't bother committing the
3468 * transaction.
3469 */
3470 if (space_info->bytes_pinned < orig_bytes)
3471 ret = -ENOSPC;
3472 spin_unlock(&space_info->lock);
3473 if (ret)
3474 goto out;
3210 3475
3211 if (unused < space_info->total_bytes) 3476 ret = -EAGAIN;
3212 unused = space_info->total_bytes - unused; 3477 if (trans || committed)
3213 else 3478 goto out;
3214 unused = 0;
3215 3479
3216 if (unused >= num_bytes) { 3480 ret = -ENOSPC;
3217 if (block_rsv->priority >= 10) { 3481 trans = btrfs_join_transaction(root);
3218 space_info->bytes_reserved += num_bytes; 3482 if (IS_ERR(trans))
3219 ret = 0; 3483 goto out;
3220 } else { 3484 ret = btrfs_commit_transaction(trans, root);
3221 if ((unused + block_rsv->reserved) * 3485 if (!ret) {
3222 block_rsv->priority >= 3486 trans = NULL;
3223 (num_bytes + block_rsv->reserved) * 10) { 3487 committed = true;
3224 space_info->bytes_reserved += num_bytes; 3488 goto again;
3225 ret = 0; 3489 }
3226 } 3490
3227 } 3491out:
3492 if (reserved) {
3493 spin_lock(&space_info->lock);
3494 space_info->bytes_reserved -= orig_bytes;
3495 spin_unlock(&space_info->lock);
3228 } 3496 }
3229 spin_unlock(&space_info->lock);
3230 3497
3231 return ret; 3498 return ret;
3232} 3499}
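
The consolidated reserve_metadata_bytes() follows a fixed escalation. First it tries to take the reservation outright; failing that, it still parks orig_bytes in bytes_reserved so that space it reclaims cannot be stolen by another reserver. It then flushes delalloc synchronously and retries up to twice, and commits the transaction only as a last resort, and only when bytes_pinned indicates a commit could actually free enough. A compressed sketch of the order; the helper names are placeholders and the park/unpark of orig_bytes is elided.

/*
 * Escalation order of reserve_metadata_bytes() above, compressed.
 * try_reserve(), flush_delalloc() and commit_transaction() stand in
 * for the real steps; error unwinding is omitted.
 */
static int try_reserve(struct btrfs_space_info *info, u64 bytes);
static void flush_delalloc(struct btrfs_space_info *info, u64 bytes);
static int commit_transaction(void);

static int reserve_escalate(struct btrfs_space_info *info, u64 bytes,
			    int flush)
{
	int retries;

	for (retries = 0; retries <= 2; retries++) {
		if (try_reserve(info, bytes))
			return 0;		/* fast path */
		if (!flush)
			return -ENOSPC;		/* caller said don't wait */
		flush_delalloc(info, bytes);	/* synchronous shrink */
	}

	if (info->bytes_pinned < bytes)
		return -ENOSPC;			/* a commit cannot help */

	if (commit_transaction())
		return -ENOSPC;
	return try_reserve(info, bytes) ? 0 : -ENOSPC;
}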
@@ -3273,8 +3540,8 @@ static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
3273 spin_unlock(&block_rsv->lock); 3540 spin_unlock(&block_rsv->lock);
3274} 3541}
3275 3542
3276void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv, 3543static void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
3277 struct btrfs_block_rsv *dest, u64 num_bytes) 3544 struct btrfs_block_rsv *dest, u64 num_bytes)
3278{ 3545{
3279 struct btrfs_space_info *space_info = block_rsv->space_info; 3546 struct btrfs_space_info *space_info = block_rsv->space_info;
3280 3547
@@ -3293,10 +3560,23 @@ void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
3293 3560
3294 if (num_bytes > 0) { 3561 if (num_bytes > 0) {
3295 if (dest) { 3562 if (dest) {
3296 block_rsv_add_bytes(dest, num_bytes, 0); 3563 spin_lock(&dest->lock);
3297 } else { 3564 if (!dest->full) {
3565 u64 bytes_to_add;
3566
3567 bytes_to_add = dest->size - dest->reserved;
3568 bytes_to_add = min(num_bytes, bytes_to_add);
3569 dest->reserved += bytes_to_add;
3570 if (dest->reserved >= dest->size)
3571 dest->full = 1;
3572 num_bytes -= bytes_to_add;
3573 }
3574 spin_unlock(&dest->lock);
3575 }
3576 if (num_bytes) {
3298 spin_lock(&space_info->lock); 3577 spin_lock(&space_info->lock);
3299 space_info->bytes_reserved -= num_bytes; 3578 space_info->bytes_reserved -= num_bytes;
3579 space_info->reservation_progress++;
3300 spin_unlock(&space_info->lock); 3580 spin_unlock(&space_info->lock);
3301 } 3581 }
3302 } 3582 }
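
Released bytes are no longer dumped wholesale into the destination reservation: the destination is topped up only to its declared size, and whatever is left over goes straight back to the space_info, bumping reservation_progress so shrink_delalloc() can see it. A sketch under that reading:

/*
 * Refill dest only up to dest->size; return the remainder to the
 * pool. Mirrors the new branch in block_rsv_release_bytes() above.
 */
static void refill_block_rsv(struct btrfs_block_rsv *dest,
			     struct btrfs_space_info *space_info,
			     u64 num_bytes)
{
	spin_lock(&dest->lock);
	if (!dest->full) {
		u64 room = min(num_bytes, dest->size - dest->reserved);

		dest->reserved += room;
		if (dest->reserved >= dest->size)
			dest->full = 1;
		num_bytes -= room;
	}
	spin_unlock(&dest->lock);

	if (num_bytes) {
		spin_lock(&space_info->lock);
		space_info->bytes_reserved -= num_bytes;
		space_info->reservation_progress++;
		spin_unlock(&space_info->lock);
	}
}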
@@ -3328,18 +3608,14 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
3328{ 3608{
3329 struct btrfs_block_rsv *block_rsv; 3609 struct btrfs_block_rsv *block_rsv;
3330 struct btrfs_fs_info *fs_info = root->fs_info; 3610 struct btrfs_fs_info *fs_info = root->fs_info;
3331 u64 alloc_target;
3332 3611
3333 block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS); 3612 block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
3334 if (!block_rsv) 3613 if (!block_rsv)
3335 return NULL; 3614 return NULL;
3336 3615
3337 btrfs_init_block_rsv(block_rsv); 3616 btrfs_init_block_rsv(block_rsv);
3338
3339 alloc_target = btrfs_get_alloc_profile(root, 0);
3340 block_rsv->space_info = __find_space_info(fs_info, 3617 block_rsv->space_info = __find_space_info(fs_info,
3341 BTRFS_BLOCK_GROUP_METADATA); 3618 BTRFS_BLOCK_GROUP_METADATA);
3342
3343 return block_rsv; 3619 return block_rsv;
3344} 3620}
3345 3621
@@ -3370,23 +3646,19 @@ void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
3370int btrfs_block_rsv_add(struct btrfs_trans_handle *trans, 3646int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
3371 struct btrfs_root *root, 3647 struct btrfs_root *root,
3372 struct btrfs_block_rsv *block_rsv, 3648 struct btrfs_block_rsv *block_rsv,
3373 u64 num_bytes, int *retries) 3649 u64 num_bytes)
3374{ 3650{
3375 int ret; 3651 int ret;
3376 3652
3377 if (num_bytes == 0) 3653 if (num_bytes == 0)
3378 return 0; 3654 return 0;
3379again: 3655
3380 ret = reserve_metadata_bytes(block_rsv, num_bytes); 3656 ret = reserve_metadata_bytes(trans, root, block_rsv, num_bytes, 1);
3381 if (!ret) { 3657 if (!ret) {
3382 block_rsv_add_bytes(block_rsv, num_bytes, 1); 3658 block_rsv_add_bytes(block_rsv, num_bytes, 1);
3383 return 0; 3659 return 0;
3384 } 3660 }
3385 3661
3386 ret = should_retry_reserve(trans, root, block_rsv, num_bytes, retries);
3387 if (ret > 0)
3388 goto again;
3389
3390 return ret; 3662 return ret;
3391} 3663}
3392 3664
@@ -3421,7 +3693,8 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
3421 return 0; 3693 return 0;
3422 3694
3423 if (block_rsv->refill_used) { 3695 if (block_rsv->refill_used) {
3424 ret = reserve_metadata_bytes(block_rsv, num_bytes); 3696 ret = reserve_metadata_bytes(trans, root, block_rsv,
3697 num_bytes, 0);
3425 if (!ret) { 3698 if (!ret) {
3426 block_rsv_add_bytes(block_rsv, num_bytes, 0); 3699 block_rsv_add_bytes(block_rsv, num_bytes, 0);
3427 return 0; 3700 return 0;
@@ -3432,17 +3705,12 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
3432 if (trans) 3705 if (trans)
3433 return -EAGAIN; 3706 return -EAGAIN;
3434 3707
3435 trans = btrfs_join_transaction(root, 1); 3708 trans = btrfs_join_transaction(root);
3436 BUG_ON(IS_ERR(trans)); 3709 BUG_ON(IS_ERR(trans));
3437 ret = btrfs_commit_transaction(trans, root); 3710 ret = btrfs_commit_transaction(trans, root);
3438 return 0; 3711 return 0;
3439 } 3712 }
3440 3713
3441 WARN_ON(1);
3442 printk(KERN_INFO"block_rsv size %llu reserved %llu freed %llu %llu\n",
3443 block_rsv->size, block_rsv->reserved,
3444 block_rsv->freed[0], block_rsv->freed[1]);
3445
3446 return -ENOSPC; 3714 return -ENOSPC;
3447} 3715}
3448 3716
@@ -3476,23 +3744,7 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
3476 u64 meta_used; 3744 u64 meta_used;
3477 u64 data_used; 3745 u64 data_used;
3478 int csum_size = btrfs_super_csum_size(&fs_info->super_copy); 3746 int csum_size = btrfs_super_csum_size(&fs_info->super_copy);
3479#if 0
3480 /*
3481 * per tree used space accounting can be inaccurate, so we
3482 * can't rely on it.
3483 */
3484 spin_lock(&fs_info->extent_root->accounting_lock);
3485 num_bytes = btrfs_root_used(&fs_info->extent_root->root_item);
3486 spin_unlock(&fs_info->extent_root->accounting_lock);
3487
3488 spin_lock(&fs_info->csum_root->accounting_lock);
3489 num_bytes += btrfs_root_used(&fs_info->csum_root->root_item);
3490 spin_unlock(&fs_info->csum_root->accounting_lock);
3491 3747
3492 spin_lock(&fs_info->tree_root->accounting_lock);
3493 num_bytes += btrfs_root_used(&fs_info->tree_root->root_item);
3494 spin_unlock(&fs_info->tree_root->accounting_lock);
3495#endif
3496 sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA); 3748 sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
3497 spin_lock(&sinfo->lock); 3749 spin_lock(&sinfo->lock);
3498 data_used = sinfo->bytes_used; 3750 data_used = sinfo->bytes_used;
@@ -3500,6 +3752,8 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
3500 3752
3501 sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); 3753 sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
3502 spin_lock(&sinfo->lock); 3754 spin_lock(&sinfo->lock);
3755 if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA)
3756 data_used = 0;
3503 meta_used = sinfo->bytes_used; 3757 meta_used = sinfo->bytes_used;
3504 spin_unlock(&sinfo->lock); 3758 spin_unlock(&sinfo->lock);
3505 3759
@@ -3527,7 +3781,8 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
3527 block_rsv->size = num_bytes; 3781 block_rsv->size = num_bytes;
3528 3782
3529 num_bytes = sinfo->bytes_used + sinfo->bytes_pinned + 3783 num_bytes = sinfo->bytes_used + sinfo->bytes_pinned +
3530 sinfo->bytes_reserved + sinfo->bytes_readonly; 3784 sinfo->bytes_reserved + sinfo->bytes_readonly +
3785 sinfo->bytes_may_use;
3531 3786
3532 if (sinfo->total_bytes > num_bytes) { 3787 if (sinfo->total_bytes > num_bytes) {
3533 num_bytes = sinfo->total_bytes - num_bytes; 3788 num_bytes = sinfo->total_bytes - num_bytes;
@@ -3538,13 +3793,11 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
3538 if (block_rsv->reserved >= block_rsv->size) { 3793 if (block_rsv->reserved >= block_rsv->size) {
3539 num_bytes = block_rsv->reserved - block_rsv->size; 3794 num_bytes = block_rsv->reserved - block_rsv->size;
3540 sinfo->bytes_reserved -= num_bytes; 3795 sinfo->bytes_reserved -= num_bytes;
3796 sinfo->reservation_progress++;
3541 block_rsv->reserved = block_rsv->size; 3797 block_rsv->reserved = block_rsv->size;
3542 block_rsv->full = 1; 3798 block_rsv->full = 1;
3543 } 3799 }
3544#if 0 3800
3545 printk(KERN_INFO"global block rsv size %llu reserved %llu\n",
3546 block_rsv->size, block_rsv->reserved);
3547#endif
3548 spin_unlock(&sinfo->lock); 3801 spin_unlock(&sinfo->lock);
3549 spin_unlock(&block_rsv->lock); 3802 spin_unlock(&block_rsv->lock);
3550} 3803}
@@ -3590,15 +3843,40 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
3590 WARN_ON(fs_info->chunk_block_rsv.reserved > 0); 3843 WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
3591} 3844}
3592 3845
3593static u64 calc_trans_metadata_size(struct btrfs_root *root, int num_items) 3846int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans,
3847 struct btrfs_root *root,
3848 struct btrfs_block_rsv *rsv)
3594{ 3849{
3595 return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) * 3850 struct btrfs_block_rsv *trans_rsv = &root->fs_info->trans_block_rsv;
3596 3 * num_items; 3851 u64 num_bytes;
3852 int ret;
3853
3854 /*
3855 * Truncate should be freeing data, but give us 2 items just in case it
3856 * needs to use some space. We may want to be smarter about this in the
3857 * future.
3858 */
3859 num_bytes = btrfs_calc_trans_metadata_size(root, 2);
3860
3861 /* We already have enough bytes, just return */
3862 if (rsv->reserved >= num_bytes)
3863 return 0;
3864
3865 num_bytes -= rsv->reserved;
3866
3867 /*
3868 * You should have reserved enough space beforehand to do this, so this
3869 * should not fail.
3870 */
3871 ret = block_rsv_migrate_bytes(trans_rsv, rsv, num_bytes);
3872 BUG_ON(ret);
3873
3874 return 0;
3597} 3875}
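
The renamed btrfs_calc_trans_metadata_size() keeps the formula visible in the removed lines above: each item is charged a worst-case COW of one leaf plus one node per remaining tree level, multiplied by three as a safety factor for the several trees a single change can touch. A sketch of the arithmetic:

/*
 * Worst-case metadata charge per item, per the formula above:
 * (leafsize + nodesize * (BTRFS_MAX_LEVEL - 1)) * 3 * num_items.
 */
static u64 calc_metadata_size(u32 leafsize, u32 nodesize, int num_items)
{
	return (u64)(leafsize + nodesize * (BTRFS_MAX_LEVEL - 1)) *
	       3 * num_items;
}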
3598 3876
3599int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans, 3877int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
3600 struct btrfs_root *root, 3878 struct btrfs_root *root,
3601 int num_items, int *retries) 3879 int num_items)
3602{ 3880{
3603 u64 num_bytes; 3881 u64 num_bytes;
3604 int ret; 3882 int ret;
@@ -3606,9 +3884,9 @@ int btrfs_trans_reserve_metadata(struct btrfs_trans_handle *trans,
3606 if (num_items == 0 || root->fs_info->chunk_root == root) 3884 if (num_items == 0 || root->fs_info->chunk_root == root)
3607 return 0; 3885 return 0;
3608 3886
3609 num_bytes = calc_trans_metadata_size(root, num_items); 3887 num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
3610 ret = btrfs_block_rsv_add(trans, root, &root->fs_info->trans_block_rsv, 3888 ret = btrfs_block_rsv_add(trans, root, &root->fs_info->trans_block_rsv,
3611 num_bytes, retries); 3889 num_bytes);
3612 if (!ret) { 3890 if (!ret) {
3613 trans->bytes_reserved += num_bytes; 3891 trans->bytes_reserved += num_bytes;
3614 trans->block_rsv = &root->fs_info->trans_block_rsv; 3892 trans->block_rsv = &root->fs_info->trans_block_rsv;
@@ -3636,23 +3914,18 @@ int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
3636 struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv; 3914 struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv;
3637 3915
3638 /* 3916 /*
3639 * one for deleting orphan item, one for updating inode and 3917 * added it, so this takes the reservation, which we can release later
3640 * two for calling btrfs_truncate_inode_items. 3918 * added it, so this takes the reservation so we can release it later
3641 * 3919 * when we are truly done with the orphan item.
3642 * btrfs_truncate_inode_items is a delete operation, it frees
3643 * more space than it uses in most cases. So two units of
3644 * metadata space should be enough for calling it many times.
3645 * If all of the metadata space is used, we can commit
3646 * transaction and use space it freed.
3647 */ 3920 */
3648 u64 num_bytes = calc_trans_metadata_size(root, 4); 3921 u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
3649 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); 3922 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
3650} 3923}
3651 3924
3652void btrfs_orphan_release_metadata(struct inode *inode) 3925void btrfs_orphan_release_metadata(struct inode *inode)
3653{ 3926{
3654 struct btrfs_root *root = BTRFS_I(inode)->root; 3927 struct btrfs_root *root = BTRFS_I(inode)->root;
3655 u64 num_bytes = calc_trans_metadata_size(root, 4); 3928 u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
3656 btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes); 3929 btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
3657} 3930}
3658 3931
@@ -3666,7 +3939,7 @@ int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
3666 * two for root back/forward refs, two for directory entries 3939 * two for root back/forward refs, two for directory entries
3667 * and one for root of the snapshot. 3940 * and one for root of the snapshot.
3668 */ 3941 */
3669 u64 num_bytes = calc_trans_metadata_size(root, 5); 3942 u64 num_bytes = btrfs_calc_trans_metadata_size(root, 5);
3670 dst_rsv->space_info = src_rsv->space_info; 3943 dst_rsv->space_info = src_rsv->space_info;
3671 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes); 3944 return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
3672} 3945}
@@ -3682,43 +3955,37 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
3682 struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv; 3955 struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
3683 u64 to_reserve; 3956 u64 to_reserve;
3684 int nr_extents; 3957 int nr_extents;
3685 int retries = 0; 3958 int reserved_extents;
3686 int ret; 3959 int ret;
3687 3960
3688 if (btrfs_transaction_in_commit(root->fs_info)) 3961 if (btrfs_transaction_in_commit(root->fs_info))
3689 schedule_timeout(1); 3962 schedule_timeout(1);
3690 3963
3691 num_bytes = ALIGN(num_bytes, root->sectorsize); 3964 num_bytes = ALIGN(num_bytes, root->sectorsize);
3692again: 3965
3693 spin_lock(&BTRFS_I(inode)->accounting_lock);
3694 nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents) + 1; 3966 nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents) + 1;
3695 if (nr_extents > BTRFS_I(inode)->reserved_extents) { 3967 reserved_extents = atomic_read(&BTRFS_I(inode)->reserved_extents);
3696 nr_extents -= BTRFS_I(inode)->reserved_extents; 3968
3697 to_reserve = calc_trans_metadata_size(root, nr_extents); 3969 if (nr_extents > reserved_extents) {
3970 nr_extents -= reserved_extents;
3971 to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
3698 } else { 3972 } else {
3699 nr_extents = 0; 3973 nr_extents = 0;
3700 to_reserve = 0; 3974 to_reserve = 0;
3701 } 3975 }
3702 3976
3703 to_reserve += calc_csum_metadata_size(inode, num_bytes); 3977 to_reserve += calc_csum_metadata_size(inode, num_bytes);
3704 ret = reserve_metadata_bytes(block_rsv, to_reserve); 3978 ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1);
3705 if (ret) { 3979 if (ret)
3706 spin_unlock(&BTRFS_I(inode)->accounting_lock);
3707 ret = should_retry_reserve(NULL, root, block_rsv, to_reserve,
3708 &retries);
3709 if (ret > 0)
3710 goto again;
3711 return ret; 3980 return ret;
3712 }
3713 3981
3714 BTRFS_I(inode)->reserved_extents += nr_extents; 3982 atomic_add(nr_extents, &BTRFS_I(inode)->reserved_extents);
3715 atomic_inc(&BTRFS_I(inode)->outstanding_extents); 3983 atomic_inc(&BTRFS_I(inode)->outstanding_extents);
3716 spin_unlock(&BTRFS_I(inode)->accounting_lock);
3717 3984
3718 block_rsv_add_bytes(block_rsv, to_reserve, 1); 3985 block_rsv_add_bytes(block_rsv, to_reserve, 1);
3719 3986
3720 if (block_rsv->size > 512 * 1024 * 1024) 3987 if (block_rsv->size > 512 * 1024 * 1024)
3721 shrink_delalloc(NULL, root, to_reserve); 3988 shrink_delalloc(NULL, root, to_reserve, 0);
3722 3989
3723 return 0; 3990 return 0;
3724} 3991}
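
btrfs_delalloc_reserve_metadata() now works without accounting_lock: it charges metadata only for the extents not already covered (outstanding + 1 minus reserved_extents) plus checksum space for the data range, then bumps the counters with atomics. A sketch of the sizing decision; per_extent_bytes and csum_bytes stand in for the results of btrfs_calc_trans_metadata_size() and calc_csum_metadata_size().

/*
 * How much to reserve for a delalloc write, per the logic above:
 * only extents beyond what is already reserved cost metadata, and
 * checksum space is always added for the data range itself.
 */
static u64 delalloc_to_reserve(int outstanding, int reserved,
			       u64 per_extent_bytes, u64 csum_bytes)
{
	int nr = outstanding + 1;	/* the write we are about to do */

	if (nr <= reserved)
		return csum_bytes;	/* extents already paid for */
	return (u64)(nr - reserved) * per_extent_bytes + csum_bytes;
}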
@@ -3728,23 +3995,34 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
3728 struct btrfs_root *root = BTRFS_I(inode)->root; 3995 struct btrfs_root *root = BTRFS_I(inode)->root;
3729 u64 to_free; 3996 u64 to_free;
3730 int nr_extents; 3997 int nr_extents;
3998 int reserved_extents;
3731 3999
3732 num_bytes = ALIGN(num_bytes, root->sectorsize); 4000 num_bytes = ALIGN(num_bytes, root->sectorsize);
3733 atomic_dec(&BTRFS_I(inode)->outstanding_extents); 4001 atomic_dec(&BTRFS_I(inode)->outstanding_extents);
4002 WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents) < 0);
3734 4003
3735 spin_lock(&BTRFS_I(inode)->accounting_lock); 4004 reserved_extents = atomic_read(&BTRFS_I(inode)->reserved_extents);
3736 nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents); 4005 do {
3737 if (nr_extents < BTRFS_I(inode)->reserved_extents) { 4006 int old, new;
3738 nr_extents = BTRFS_I(inode)->reserved_extents - nr_extents; 4007
3739 BTRFS_I(inode)->reserved_extents -= nr_extents; 4008 nr_extents = atomic_read(&BTRFS_I(inode)->outstanding_extents);
3740 } else { 4009 if (nr_extents >= reserved_extents) {
3741 nr_extents = 0; 4010 nr_extents = 0;
3742 } 4011 break;
3743 spin_unlock(&BTRFS_I(inode)->accounting_lock); 4012 }
4013 old = reserved_extents;
4014 nr_extents = reserved_extents - nr_extents;
4015 new = reserved_extents - nr_extents;
4016 old = atomic_cmpxchg(&BTRFS_I(inode)->reserved_extents,
4017 reserved_extents, new);
4018 if (likely(old == reserved_extents))
4019 break;
4020 reserved_extents = old;
4021 } while (1);
3744 4022
3745 to_free = calc_csum_metadata_size(inode, num_bytes); 4023 to_free = calc_csum_metadata_size(inode, num_bytes);
3746 if (nr_extents > 0) 4024 if (nr_extents > 0)
3747 to_free += calc_trans_metadata_size(root, nr_extents); 4025 to_free += btrfs_calc_trans_metadata_size(root, nr_extents);
3748 4026
3749 btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv, 4027 btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
3750 to_free); 4028 to_free);
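
With accounting_lock gone, shrinking reserved_extents races against concurrent reservations, so the release path uses the classic atomic_cmpxchg() retry loop: take a snapshot, compute the new value from it, and install it only if the counter has not moved in the meantime. A standalone sketch of the idiom:

/*
 * Lock-free release of excess reserved extents, as in
 * btrfs_delalloc_release_metadata() above: retry the cmpxchg until
 * our snapshot of the counter is still current at swap time.
 */
static int release_excess_extents(atomic_t *reserved, int outstanding)
{
	int old = atomic_read(reserved);

	while (old > outstanding) {
		int seen = atomic_cmpxchg(reserved, old, outstanding);

		if (seen == old)
			return old - outstanding; /* extents released */
		old = seen;	/* lost a race: retry with fresh value */
	}
	return 0;	/* nothing in excess, nothing to release */
}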
@@ -3777,12 +4055,12 @@ static int update_block_group(struct btrfs_trans_handle *trans,
3777 struct btrfs_root *root, 4055 struct btrfs_root *root,
3778 u64 bytenr, u64 num_bytes, int alloc) 4056 u64 bytenr, u64 num_bytes, int alloc)
3779{ 4057{
3780 struct btrfs_block_group_cache *cache; 4058 struct btrfs_block_group_cache *cache = NULL;
3781 struct btrfs_fs_info *info = root->fs_info; 4059 struct btrfs_fs_info *info = root->fs_info;
3782 int factor;
3783 u64 total = num_bytes; 4060 u64 total = num_bytes;
3784 u64 old_val; 4061 u64 old_val;
3785 u64 byte_in_group; 4062 u64 byte_in_group;
4063 int factor;
3786 4064
3787 /* block accounting for super block */ 4065 /* block accounting for super block */
3788 spin_lock(&info->delalloc_lock); 4066 spin_lock(&info->delalloc_lock);
@@ -3804,11 +4082,25 @@ static int update_block_group(struct btrfs_trans_handle *trans,
3804 factor = 2; 4082 factor = 2;
3805 else 4083 else
3806 factor = 1; 4084 factor = 1;
4085 /*
4086 * If this block group has free space cache written out, we
4087 * need to make sure to load it if we are removing space. This
4088 * is because we need the unpinning stage to actually add the
4089 * space back to the block group; otherwise we will leak space.
4090 */
4091 if (!alloc && cache->cached == BTRFS_CACHE_NO)
4092 cache_block_group(cache, trans, NULL, 1);
4093
3807 byte_in_group = bytenr - cache->key.objectid; 4094 byte_in_group = bytenr - cache->key.objectid;
3808 WARN_ON(byte_in_group > cache->key.offset); 4095 WARN_ON(byte_in_group > cache->key.offset);
3809 4096
3810 spin_lock(&cache->space_info->lock); 4097 spin_lock(&cache->space_info->lock);
3811 spin_lock(&cache->lock); 4098 spin_lock(&cache->lock);
4099
4100 if (btrfs_super_cache_generation(&info->super_copy) != 0 &&
4101 cache->disk_cache_state < BTRFS_DC_CLEAR)
4102 cache->disk_cache_state = BTRFS_DC_CLEAR;
4103
3812 cache->dirty = 1; 4104 cache->dirty = 1;
3813 old_val = btrfs_block_group_used(&cache->item); 4105 old_val = btrfs_block_group_used(&cache->item);
3814 num_bytes = min(total, cache->key.offset - byte_in_group); 4106 num_bytes = min(total, cache->key.offset - byte_in_group);
@@ -3817,6 +4109,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
3817 btrfs_set_block_group_used(&cache->item, old_val); 4109 btrfs_set_block_group_used(&cache->item, old_val);
3818 cache->reserved -= num_bytes; 4110 cache->reserved -= num_bytes;
3819 cache->space_info->bytes_reserved -= num_bytes; 4111 cache->space_info->bytes_reserved -= num_bytes;
4112 cache->space_info->reservation_progress++;
3820 cache->space_info->bytes_used += num_bytes; 4113 cache->space_info->bytes_used += num_bytes;
3821 cache->space_info->disk_used += num_bytes * factor; 4114 cache->space_info->disk_used += num_bytes * factor;
3822 spin_unlock(&cache->lock); 4115 spin_unlock(&cache->lock);
@@ -3868,6 +4161,7 @@ static int pin_down_extent(struct btrfs_root *root,
3868 if (reserved) { 4161 if (reserved) {
3869 cache->reserved -= num_bytes; 4162 cache->reserved -= num_bytes;
3870 cache->space_info->bytes_reserved -= num_bytes; 4163 cache->space_info->bytes_reserved -= num_bytes;
4164 cache->space_info->reservation_progress++;
3871 } 4165 }
3872 spin_unlock(&cache->lock); 4166 spin_unlock(&cache->lock);
3873 spin_unlock(&cache->space_info->lock); 4167 spin_unlock(&cache->space_info->lock);
@@ -3898,8 +4192,8 @@ int btrfs_pin_extent(struct btrfs_root *root,
3898 * update size of reserved extents. this function may return -EAGAIN 4192 * update size of reserved extents. this function may return -EAGAIN
3899 * if 'reserve' is true or 'sinfo' is false. 4193 * if 'reserve' is true or 'sinfo' is false.
3900 */ 4194 */
3901static int update_reserved_bytes(struct btrfs_block_group_cache *cache, 4195int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
3902 u64 num_bytes, int reserve, int sinfo) 4196 u64 num_bytes, int reserve, int sinfo)
3903{ 4197{
3904 int ret = 0; 4198 int ret = 0;
3905 if (sinfo) { 4199 if (sinfo) {
@@ -3918,6 +4212,7 @@ static int update_reserved_bytes(struct btrfs_block_group_cache *cache,
3918 space_info->bytes_readonly += num_bytes; 4212 space_info->bytes_readonly += num_bytes;
3919 cache->reserved -= num_bytes; 4213 cache->reserved -= num_bytes;
3920 space_info->bytes_reserved -= num_bytes; 4214 space_info->bytes_reserved -= num_bytes;
4215 space_info->reservation_progress++;
3921 } 4216 }
3922 spin_unlock(&cache->lock); 4217 spin_unlock(&cache->lock);
3923 spin_unlock(&space_info->lock); 4218 spin_unlock(&space_info->lock);
@@ -4037,7 +4332,9 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
4037 if (ret) 4332 if (ret)
4038 break; 4333 break;
4039 4334
4040 ret = btrfs_discard_extent(root, start, end + 1 - start); 4335 if (btrfs_test_opt(root, DISCARD))
4336 ret = btrfs_discard_extent(root, start,
4337 end + 1 - start, NULL);
4041 4338
4042 clear_extent_dirty(unpin, start, end, GFP_NOFS); 4339 clear_extent_dirty(unpin, start, end, GFP_NOFS);
4043 unpin_extent_range(root, start, end); 4340 unpin_extent_range(root, start, end);
@@ -4134,7 +4431,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
4134 NULL, refs_to_drop, 4431 NULL, refs_to_drop,
4135 is_data); 4432 is_data);
4136 BUG_ON(ret); 4433 BUG_ON(ret);
4137 btrfs_release_path(extent_root, path); 4434 btrfs_release_path(path);
4138 path->leave_spinning = 1; 4435 path->leave_spinning = 1;
4139 4436
4140 key.objectid = bytenr; 4437 key.objectid = bytenr;
@@ -4173,7 +4470,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
4173 owner_objectid, 0); 4470 owner_objectid, 0);
4174 BUG_ON(ret < 0); 4471 BUG_ON(ret < 0);
4175 4472
4176 btrfs_release_path(extent_root, path); 4473 btrfs_release_path(path);
4177 path->leave_spinning = 1; 4474 path->leave_spinning = 1;
4178 4475
4179 key.objectid = bytenr; 4476 key.objectid = bytenr;
@@ -4243,7 +4540,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
4243 ret = btrfs_del_items(trans, extent_root, path, path->slots[0], 4540 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
4244 num_to_del); 4541 num_to_del);
4245 BUG_ON(ret); 4542 BUG_ON(ret);
4246 btrfs_release_path(extent_root, path); 4543 btrfs_release_path(path);
4247 4544
4248 if (is_data) { 4545 if (is_data) {
4249 ret = btrfs_del_csums(trans, root, bytenr, num_bytes); 4546 ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
@@ -4378,10 +4675,10 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
4378 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); 4675 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
4379 4676
4380 btrfs_add_free_space(cache, buf->start, buf->len); 4677 btrfs_add_free_space(cache, buf->start, buf->len);
4381 ret = update_reserved_bytes(cache, buf->len, 0, 0); 4678 ret = btrfs_update_reserved_bytes(cache, buf->len, 0, 0);
4382 if (ret == -EAGAIN) { 4679 if (ret == -EAGAIN) {
4383 /* block group became read-only */ 4680 /* block group became read-only */
4384 update_reserved_bytes(cache, buf->len, 0, 1); 4681 btrfs_update_reserved_bytes(cache, buf->len, 0, 1);
4385 goto out; 4682 goto out;
4386 } 4683 }
4387 4684
@@ -4396,6 +4693,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
4396 if (ret) { 4693 if (ret) {
4397 spin_lock(&cache->space_info->lock); 4694 spin_lock(&cache->space_info->lock);
4398 cache->space_info->bytes_reserved -= buf->len; 4695 cache->space_info->bytes_reserved -= buf->len;
4696 cache->space_info->reservation_progress++;
4399 spin_unlock(&cache->space_info->lock); 4697 spin_unlock(&cache->space_info->lock);
4400 } 4698 }
4401 goto out; 4699 goto out;
@@ -4417,6 +4715,11 @@ pin:
4417 } 4715 }
4418 } 4716 }
4419out: 4717out:
4718 /*
4719 * Deleting the buffer, clear the corrupt flag since it doesn't matter
4720 * anymore.
4721 */
4722 clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
4420 btrfs_put_block_group(cache); 4723 btrfs_put_block_group(cache);
4421} 4724}
4422 4725
@@ -4480,7 +4783,7 @@ wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
4480 return 0; 4783 return 0;
4481 4784
4482 wait_event(caching_ctl->wait, block_group_cache_done(cache) || 4785 wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
4483 (cache->free_space >= num_bytes)); 4786 (cache->free_space_ctl->free_space >= num_bytes));
4484 4787
4485 put_caching_control(caching_ctl); 4788 put_caching_control(caching_ctl);
4486 return 0; 4789 return 0;
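Note: the wait condition now reaches the free-space counter through block_group->free_space_ctl rather than a field on the block group itself, matching the free-space-cache rework elsewhere in this series, where the bookkeeping moves into a shared control structure. A sketch of the indirection, names hypothetical:

#include <stdio.h>

/* The free-space bookkeeping lives in its own control structure so
 * the same code can serve more than one kind of cache. */
struct free_space_ctl {
        unsigned long long free_space;
};

struct block_group {
        struct free_space_ctl *free_space_ctl;
};

static int has_room(const struct block_group *bg, unsigned long long need)
{
        return bg->free_space_ctl->free_space >= need;
}

int main(void)
{
        struct free_space_ctl ctl = { .free_space = 1 << 20 };
        struct block_group bg = { .free_space_ctl = &ctl };

        printf("fits: %d\n", has_room(&bg, 4096));
        return 0;
}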
@@ -4539,7 +4842,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
4539 u64 num_bytes, u64 empty_size, 4842 u64 num_bytes, u64 empty_size,
4540 u64 search_start, u64 search_end, 4843 u64 search_start, u64 search_end,
4541 u64 hint_byte, struct btrfs_key *ins, 4844 u64 hint_byte, struct btrfs_key *ins,
4542 int data) 4845 u64 data)
4543{ 4846{
4544 int ret = 0; 4847 int ret = 0;
4545 struct btrfs_root *root = orig_root->fs_info->extent_root; 4848 struct btrfs_root *root = orig_root->fs_info->extent_root;
@@ -4555,6 +4858,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
4555 bool found_uncached_bg = false; 4858 bool found_uncached_bg = false;
4556 bool failed_cluster_refill = false; 4859 bool failed_cluster_refill = false;
4557 bool failed_alloc = false; 4860 bool failed_alloc = false;
4861 bool use_cluster = true;
4558 u64 ideal_cache_percent = 0; 4862 u64 ideal_cache_percent = 0;
4559 u64 ideal_cache_offset = 0; 4863 u64 ideal_cache_offset = 0;
4560 4864
@@ -4565,20 +4869,28 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
4565 4869
4566 space_info = __find_space_info(root->fs_info, data); 4870 space_info = __find_space_info(root->fs_info, data);
4567 if (!space_info) { 4871 if (!space_info) {
4568 printk(KERN_ERR "No space info for %d\n", data); 4872 printk(KERN_ERR "No space info for %llu\n", data);
4569 return -ENOSPC; 4873 return -ENOSPC;
4570 } 4874 }
4571 4875
4876 /*
4877 * If the space info is for both data and metadata it means we have a
4878 * small filesystem and we can't use the clustering stuff.
4879 */
4880 if (btrfs_mixed_space_info(space_info))
4881 use_cluster = false;
4882
4572 if (orig_root->ref_cows || empty_size) 4883 if (orig_root->ref_cows || empty_size)
4573 allowed_chunk_alloc = 1; 4884 allowed_chunk_alloc = 1;
4574 4885
4575 if (data & BTRFS_BLOCK_GROUP_METADATA) { 4886 if (data & BTRFS_BLOCK_GROUP_METADATA && use_cluster) {
4576 last_ptr = &root->fs_info->meta_alloc_cluster; 4887 last_ptr = &root->fs_info->meta_alloc_cluster;
4577 if (!btrfs_test_opt(root, SSD)) 4888 if (!btrfs_test_opt(root, SSD))
4578 empty_cluster = 64 * 1024; 4889 empty_cluster = 64 * 1024;
4579 } 4890 }
4580 4891
4581 if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) { 4892 if ((data & BTRFS_BLOCK_GROUP_DATA) && use_cluster &&
4893 btrfs_test_opt(root, SSD)) {
4582 last_ptr = &root->fs_info->data_alloc_cluster; 4894 last_ptr = &root->fs_info->data_alloc_cluster;
4583 } 4895 }
4584 4896
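Note: the comment in the hunk explains the new use_cluster flag: on a small filesystem where one space info backs both data and metadata, the cluster allocators are skipped. A sketch of the kind of flag test this relies on, with the bit values invented:

#include <stdio.h>

#define BG_DATA     (1ULL << 0)         /* hypothetical profile bits */
#define BG_METADATA (1ULL << 1)

struct space_info { unsigned long long flags; };

/* Mixed = the same space info backs both data and metadata. */
static int mixed_space_info(const struct space_info *si)
{
        return (si->flags & BG_DATA) && (si->flags & BG_METADATA);
}

int main(void)
{
        struct space_info si = { .flags = BG_DATA | BG_METADATA };
        int use_cluster = !mixed_space_info(&si);

        printf("use_cluster = %d\n", use_cluster);
        return 0;
}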
@@ -4638,10 +4950,34 @@ search:
4638 btrfs_get_block_group(block_group); 4950 btrfs_get_block_group(block_group);
4639 search_start = block_group->key.objectid; 4951 search_start = block_group->key.objectid;
4640 4952
4953 /*
4954 * this can happen if we end up cycling through all the
4955 * raid types, but we want to make sure we only allocate
4956 * for the proper type.
4957 */
4958 if (!block_group_bits(block_group, data)) {
4959 u64 extra = BTRFS_BLOCK_GROUP_DUP |
4960 BTRFS_BLOCK_GROUP_RAID1 |
4961 BTRFS_BLOCK_GROUP_RAID10;
4962
4963 /*
4964 * if they asked for extra copies and this block group
4965 * doesn't provide them, bail. This does allow us to
4966 * fill raid0 from raid1.
4967 */
4968 if ((data & extra) && !(block_group->flags & extra))
4969 goto loop;
4970 }
4971
4641have_block_group: 4972have_block_group:
4642 if (unlikely(block_group->cached == BTRFS_CACHE_NO)) { 4973 if (unlikely(block_group->cached == BTRFS_CACHE_NO)) {
4643 u64 free_percent; 4974 u64 free_percent;
4644 4975
4976 ret = cache_block_group(block_group, trans,
4977 orig_root, 1);
4978 if (block_group->cached == BTRFS_CACHE_FINISHED)
4979 goto have_block_group;
4980
4645 free_percent = btrfs_block_group_used(&block_group->item); 4981 free_percent = btrfs_block_group_used(&block_group->item);
4646 free_percent *= 100; 4982 free_percent *= 100;
4647 free_percent = div64_u64(free_percent, 4983 free_percent = div64_u64(free_percent,
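Note: the new block_group_bits() check lets the allocator cycle through raid types without handing out the wrong redundancy. Filling a raid0 request from a raid1 group is fine, but a request that asked for extra copies (DUP/RAID1/RAID10) must skip groups that cannot provide them. A condensed sketch of that skip decision, with the profile bits invented:

#include <stdio.h>

#define BG_DUP    (1ULL << 0)           /* hypothetical profile bits */
#define BG_RAID1  (1ULL << 1)
#define BG_RAID10 (1ULL << 2)

/* Return 0 when the request wants extra copies that this block
 * group's profile does not provide. */
static int group_usable(unsigned long long group_flags,
                        unsigned long long wanted)
{
        unsigned long long extra = BG_DUP | BG_RAID1 | BG_RAID10;

        if ((wanted & extra) && !(group_flags & extra))
                return 0;       /* asked for redundancy, group has none */
        return 1;               /* ok: also fills raid0 from raid1 */
}

int main(void)
{
        printf("%d\n", group_usable(0, BG_RAID1));      /* 0: skip  */
        printf("%d\n", group_usable(BG_RAID1, 0));      /* 1: allow */
        return 0;
}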
@@ -4662,7 +4998,8 @@ have_block_group:
4662 if (loop > LOOP_CACHING_NOWAIT || 4998 if (loop > LOOP_CACHING_NOWAIT ||
4663 (loop > LOOP_FIND_IDEAL && 4999 (loop > LOOP_FIND_IDEAL &&
4664 atomic_read(&space_info->caching_threads) < 2)) { 5000 atomic_read(&space_info->caching_threads) < 2)) {
4665 ret = cache_block_group(block_group); 5001 ret = cache_block_group(block_group, trans,
5002 orig_root, 0);
4666 BUG_ON(ret); 5003 BUG_ON(ret);
4667 } 5004 }
4668 found_uncached_bg = true; 5005 found_uncached_bg = true;
@@ -4682,6 +5019,15 @@ have_block_group:
4682 if (unlikely(block_group->ro)) 5019 if (unlikely(block_group->ro))
4683 goto loop; 5020 goto loop;
4684 5021
5022 spin_lock(&block_group->free_space_ctl->tree_lock);
5023 if (cached &&
5024 block_group->free_space_ctl->free_space <
5025 num_bytes + empty_size) {
5026 spin_unlock(&block_group->free_space_ctl->tree_lock);
5027 goto loop;
5028 }
5029 spin_unlock(&block_group->free_space_ctl->tree_lock);
5030
4685 /* 5031 /*
4686 * Ok we want to try and use the cluster allocator, so lets look 5032 * Ok we want to try and use the cluster allocator, so lets look
4687 * there, unless we are on LOOP_NO_EMPTY_SIZE, since we will 5033 * there, unless we are on LOOP_NO_EMPTY_SIZE, since we will
@@ -4830,7 +5176,7 @@ checks:
4830 search_start - offset); 5176 search_start - offset);
4831 BUG_ON(offset > search_start); 5177 BUG_ON(offset > search_start);
4832 5178
4833 ret = update_reserved_bytes(block_group, num_bytes, 1, 5179 ret = btrfs_update_reserved_bytes(block_group, num_bytes, 1,
4834 (data & BTRFS_BLOCK_GROUP_DATA)); 5180 (data & BTRFS_BLOCK_GROUP_DATA));
4835 if (ret == -EAGAIN) { 5181 if (ret == -EAGAIN) {
4836 btrfs_add_free_space(block_group, offset, num_bytes); 5182 btrfs_add_free_space(block_group, offset, num_bytes);
@@ -4845,6 +5191,7 @@ checks:
4845 btrfs_add_free_space(block_group, offset, 5191 btrfs_add_free_space(block_group, offset,
4846 search_start - offset); 5192 search_start - offset);
4847 BUG_ON(offset > search_start); 5193 BUG_ON(offset > search_start);
5194 btrfs_put_block_group(block_group);
4848 break; 5195 break;
4849loop: 5196loop:
4850 failed_cluster_refill = false; 5197 failed_cluster_refill = false;
@@ -4867,9 +5214,7 @@ loop:
4867 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try 5214 * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
4868 * again 5215 * again
4869 */ 5216 */
4870 if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE && 5217 if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) {
4871 (found_uncached_bg || empty_size || empty_cluster ||
4872 allowed_chunk_alloc)) {
4873 index = 0; 5218 index = 0;
4874 if (loop == LOOP_FIND_IDEAL && found_uncached_bg) { 5219 if (loop == LOOP_FIND_IDEAL && found_uncached_bg) {
4875 found_uncached_bg = false; 5220 found_uncached_bg = false;
@@ -4909,40 +5254,39 @@ loop:
4909 goto search; 5254 goto search;
4910 } 5255 }
4911 5256
4912 if (loop < LOOP_CACHING_WAIT) { 5257 loop++;
4913 loop++;
4914 goto search;
4915 }
4916 5258
4917 if (loop == LOOP_ALLOC_CHUNK) { 5259 if (loop == LOOP_ALLOC_CHUNK) {
4918 empty_size = 0; 5260 if (allowed_chunk_alloc) {
4919 empty_cluster = 0; 5261 ret = do_chunk_alloc(trans, root, num_bytes +
4920 } 5262 2 * 1024 * 1024, data,
5263 CHUNK_ALLOC_LIMITED);
5264 allowed_chunk_alloc = 0;
5265 if (ret == 1)
5266 done_chunk_alloc = 1;
5267 } else if (!done_chunk_alloc &&
5268 space_info->force_alloc ==
5269 CHUNK_ALLOC_NO_FORCE) {
5270 space_info->force_alloc = CHUNK_ALLOC_LIMITED;
5271 }
4921 5272
4922 if (allowed_chunk_alloc) { 5273 /*
4923 ret = do_chunk_alloc(trans, root, num_bytes + 5274 * We didn't allocate a chunk, go ahead and drop the
4924 2 * 1024 * 1024, data, 1); 5275 * empty size and loop again.
4925 allowed_chunk_alloc = 0; 5276 */
4926 done_chunk_alloc = 1; 5277 if (!done_chunk_alloc)
4927 } else if (!done_chunk_alloc) { 5278 loop = LOOP_NO_EMPTY_SIZE;
4928 space_info->force_alloc = 1;
4929 } 5279 }
4930 5280
4931 if (loop < LOOP_NO_EMPTY_SIZE) { 5281 if (loop == LOOP_NO_EMPTY_SIZE) {
4932 loop++; 5282 empty_size = 0;
4933 goto search; 5283 empty_cluster = 0;
4934 } 5284 }
4935 ret = -ENOSPC; 5285
5286 goto search;
4936 } else if (!ins->objectid) { 5287 } else if (!ins->objectid) {
4937 ret = -ENOSPC; 5288 ret = -ENOSPC;
4938 } 5289 } else if (ins->objectid) {
4939
4940 /* we found what we needed */
4941 if (ins->objectid) {
4942 if (!(data & BTRFS_BLOCK_GROUP_DATA))
4943 trans->block_group = block_group->key.objectid;
4944
4945 btrfs_put_block_group(block_group);
4946 ret = 0; 5290 ret = 0;
4947 } 5291 }
4948 5292
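Note: the rewritten tail of find_free_extent() replaces the old chain of loop < ... special cases with one unconditional loop++ and per-state handling: LOOP_ALLOC_CHUNK tries (or arms) a chunk allocation, and LOOP_NO_EMPTY_SIZE drops the clustering slack for a final pass. The state names below come from the surrounding hunks; everything else is an illustrative sketch:

#include <stdio.h>

enum loop_state {                       /* names as used above */
        LOOP_FIND_IDEAL,
        LOOP_CACHING_NOWAIT,
        LOOP_CACHING_WAIT,
        LOOP_ALLOC_CHUNK,
        LOOP_NO_EMPTY_SIZE,
};

int main(void)
{
        int loop = LOOP_FIND_IDEAL;
        int found = 0;                  /* never set in this sketch */
        int empty_size = 4096;

        while (!found && loop <= LOOP_NO_EMPTY_SIZE) {
                /* ... scan all block groups at the current strictness ... */
                loop++;
                if (loop == LOOP_ALLOC_CHUNK)
                        printf("try to allocate a fresh chunk\n");
                if (loop == LOOP_NO_EMPTY_SIZE) {
                        empty_size = 0; /* last chance: no cluster slack */
                        printf("retry with empty_size=%d\n", empty_size);
                }
        }
        return found ? 0 : 1;           /* 1 ~ -ENOSPC */
}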
@@ -5011,7 +5355,8 @@ again:
5011 */ 5355 */
5012 if (empty_size || root->ref_cows) 5356 if (empty_size || root->ref_cows)
5013 ret = do_chunk_alloc(trans, root->fs_info->extent_root, 5357 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
5014 num_bytes + 2 * 1024 * 1024, data, 0); 5358 num_bytes + 2 * 1024 * 1024, data,
5359 CHUNK_ALLOC_NO_FORCE);
5015 5360
5016 WARN_ON(num_bytes < root->sectorsize); 5361 WARN_ON(num_bytes < root->sectorsize);
5017 ret = find_free_extent(trans, root, num_bytes, empty_size, 5362 ret = find_free_extent(trans, root, num_bytes, empty_size,
@@ -5023,10 +5368,10 @@ again:
5023 num_bytes = num_bytes & ~(root->sectorsize - 1); 5368 num_bytes = num_bytes & ~(root->sectorsize - 1);
5024 num_bytes = max(num_bytes, min_alloc_size); 5369 num_bytes = max(num_bytes, min_alloc_size);
5025 do_chunk_alloc(trans, root->fs_info->extent_root, 5370 do_chunk_alloc(trans, root->fs_info->extent_root,
5026 num_bytes, data, 1); 5371 num_bytes, data, CHUNK_ALLOC_FORCE);
5027 goto again; 5372 goto again;
5028 } 5373 }
5029 if (ret == -ENOSPC) { 5374 if (ret == -ENOSPC && btrfs_test_opt(root, ENOSPC_DEBUG)) {
5030 struct btrfs_space_info *sinfo; 5375 struct btrfs_space_info *sinfo;
5031 5376
5032 sinfo = __find_space_info(root->fs_info, data); 5377 sinfo = __find_space_info(root->fs_info, data);
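Note: do_chunk_alloc() now takes a named force level instead of a bare 0/1: CHUNK_ALLOC_NO_FORCE for opportunistic pre-allocation, CHUNK_ALLOC_LIMITED for the allocator's own retry, and CHUNK_ALLOC_FORCE when a chunk is mandatory. A sketch of how such a policy argument might feed a decision helper; the thresholds are invented for illustration:

#include <stdio.h>

enum chunk_alloc_enum {                 /* names as used in the hunks above */
        CHUNK_ALLOC_NO_FORCE,           /* only if space is nearly full */
        CHUNK_ALLOC_LIMITED,            /* allocator retry: more willing */
        CHUNK_ALLOC_FORCE,              /* caller insists on a new chunk */
};

static int should_alloc_chunk(unsigned long long used,
                              unsigned long long total, int force)
{
        if (force == CHUNK_ALLOC_FORCE)
                return 1;
        if (force == CHUNK_ALLOC_LIMITED)
                return used * 10 >= total * 7;  /* >= 70% used (invented) */
        return used * 10 >= total * 9;          /* >= 90% used (invented) */
}

int main(void)
{
        printf("%d\n", should_alloc_chunk(80, 100, CHUNK_ALLOC_NO_FORCE));
        printf("%d\n", should_alloc_chunk(80, 100, CHUNK_ALLOC_LIMITED));
        return 0;
}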
@@ -5036,6 +5381,8 @@ again:
5036 dump_space_info(sinfo, num_bytes, 1); 5381 dump_space_info(sinfo, num_bytes, 1);
5037 } 5382 }
5038 5383
5384 trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset);
5385
5039 return ret; 5386 return ret;
5040} 5387}
5041 5388
@@ -5051,12 +5398,15 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
5051 return -ENOSPC; 5398 return -ENOSPC;
5052 } 5399 }
5053 5400
5054 ret = btrfs_discard_extent(root, start, len); 5401 if (btrfs_test_opt(root, DISCARD))
5402 ret = btrfs_discard_extent(root, start, len, NULL);
5055 5403
5056 btrfs_add_free_space(cache, start, len); 5404 btrfs_add_free_space(cache, start, len);
5057 update_reserved_bytes(cache, len, 0, 1); 5405 btrfs_update_reserved_bytes(cache, len, 0, 1);
5058 btrfs_put_block_group(cache); 5406 btrfs_put_block_group(cache);
5059 5407
5408 trace_btrfs_reserved_extent_free(root, start, len);
5409
5060 return ret; 5410 return ret;
5061} 5411}
5062 5412
@@ -5083,7 +5433,8 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
5083 size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type); 5433 size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
5084 5434
5085 path = btrfs_alloc_path(); 5435 path = btrfs_alloc_path();
5086 BUG_ON(!path); 5436 if (!path)
5437 return -ENOMEM;
5087 5438
5088 path->leave_spinning = 1; 5439 path->leave_spinning = 1;
5089 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, 5440 ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
@@ -5219,7 +5570,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
5219 u64 num_bytes = ins->offset; 5570 u64 num_bytes = ins->offset;
5220 5571
5221 block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); 5572 block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
5222 cache_block_group(block_group); 5573 cache_block_group(block_group, trans, NULL, 0);
5223 caching_ctl = get_caching_control(block_group); 5574 caching_ctl = get_caching_control(block_group);
5224 5575
5225 if (!caching_ctl) { 5576 if (!caching_ctl) {
@@ -5253,7 +5604,7 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
5253 put_caching_control(caching_ctl); 5604 put_caching_control(caching_ctl);
5254 } 5605 }
5255 5606
5256 ret = update_reserved_bytes(block_group, ins->offset, 1, 1); 5607 ret = btrfs_update_reserved_bytes(block_group, ins->offset, 1, 1);
5257 BUG_ON(ret); 5608 BUG_ON(ret);
5258 btrfs_put_block_group(block_group); 5609 btrfs_put_block_group(block_group);
5259 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid, 5610 ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
@@ -5304,25 +5655,47 @@ use_block_rsv(struct btrfs_trans_handle *trans,
5304 struct btrfs_root *root, u32 blocksize) 5655 struct btrfs_root *root, u32 blocksize)
5305{ 5656{
5306 struct btrfs_block_rsv *block_rsv; 5657 struct btrfs_block_rsv *block_rsv;
5658 struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
5307 int ret; 5659 int ret;
5308 5660
5309 block_rsv = get_block_rsv(trans, root); 5661 block_rsv = get_block_rsv(trans, root);
5310 5662
5311 if (block_rsv->size == 0) { 5663 if (block_rsv->size == 0) {
5312 ret = reserve_metadata_bytes(block_rsv, blocksize); 5664 ret = reserve_metadata_bytes(trans, root, block_rsv,
5313 if (ret) 5665 blocksize, 0);
5666 /*
5667 * If we couldn't reserve metadata bytes, try to use some from
5668 * the global reserve.
5669 */
5670 if (ret && block_rsv != global_rsv) {
5671 ret = block_rsv_use_bytes(global_rsv, blocksize);
5672 if (!ret)
5673 return global_rsv;
5314 return ERR_PTR(ret); 5674 return ERR_PTR(ret);
5675 } else if (ret) {
5676 return ERR_PTR(ret);
5677 }
5315 return block_rsv; 5678 return block_rsv;
5316 } 5679 }
5317 5680
5318 ret = block_rsv_use_bytes(block_rsv, blocksize); 5681 ret = block_rsv_use_bytes(block_rsv, blocksize);
5319 if (!ret) 5682 if (!ret)
5320 return block_rsv; 5683 return block_rsv;
5321 5684 if (ret) {
5322 WARN_ON(1); 5685 WARN_ON(1);
5323 printk(KERN_INFO"block_rsv size %llu reserved %llu freed %llu %llu\n", 5686 ret = reserve_metadata_bytes(trans, root, block_rsv, blocksize,
5324 block_rsv->size, block_rsv->reserved, 5687 0);
5325 block_rsv->freed[0], block_rsv->freed[1]); 5688 if (!ret) {
5689 spin_lock(&block_rsv->lock);
5690 block_rsv->size += blocksize;
5691 spin_unlock(&block_rsv->lock);
5692 return block_rsv;
5693 } else if (ret && block_rsv != global_rsv) {
5694 ret = block_rsv_use_bytes(global_rsv, blocksize);
5695 if (!ret)
5696 return global_rsv;
5697 }
5698 }
5326 5699
5327 return ERR_PTR(-ENOSPC); 5700 return ERR_PTR(-ENOSPC);
5328} 5701}
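Note: the reworked use_block_rsv() gains a fallback ladder: use the transaction's reserve if it has room, otherwise retry the reservation, and only then borrow from the new global_rsv before returning ERR_PTR(-ENOSPC). A condensed sketch of that ordering, with all of the helpers reduced to hypothetical miniatures:

#include <errno.h>
#include <stdio.h>

struct rsv { long long reserved; };

/* Take bytes from a reserve; fail without side effects if short. */
static int rsv_use_bytes(struct rsv *r, long long n)
{
        if (r->reserved < n)
                return -ENOSPC;
        r->reserved -= n;
        return 0;
}

static struct rsv *pick_rsv(struct rsv *mine, struct rsv *global,
                            long long blocksize)
{
        if (rsv_use_bytes(mine, blocksize) == 0)
                return mine;            /* normal case */
        /* ... a real version would retry the reservation here ... */
        if (rsv_use_bytes(global, blocksize) == 0)
                return global;          /* borrow from the global pool */
        return NULL;                    /* ~ ERR_PTR(-ENOSPC) */
}

int main(void)
{
        struct rsv mine = { 0 }, global = { 8192 };

        printf("%s\n", pick_rsv(&mine, &global, 4096) == &global
                       ? "borrowed from global" : "other");
        return 0;
}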
@@ -5422,7 +5795,6 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
5422 u64 generation; 5795 u64 generation;
5423 u64 refs; 5796 u64 refs;
5424 u64 flags; 5797 u64 flags;
5425 u64 last = 0;
5426 u32 nritems; 5798 u32 nritems;
5427 u32 blocksize; 5799 u32 blocksize;
5428 struct btrfs_key key; 5800 struct btrfs_key key;
@@ -5490,7 +5862,6 @@ reada:
5490 generation); 5862 generation);
5491 if (ret) 5863 if (ret)
5492 break; 5864 break;
5493 last = bytenr + blocksize;
5494 nread++; 5865 nread++;
5495 } 5866 }
5496 wc->reada_slot = slot; 5867 wc->reada_slot = slot;
@@ -5666,6 +6037,8 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
5666 if (reada && level == 1) 6037 if (reada && level == 1)
5667 reada_walk_down(trans, root, wc, path); 6038 reada_walk_down(trans, root, wc, path);
5668 next = read_tree_block(root, bytenr, blocksize, generation); 6039 next = read_tree_block(root, bytenr, blocksize, generation);
6040 if (!next)
6041 return -EIO;
5669 btrfs_tree_lock(next); 6042 btrfs_tree_lock(next);
5670 btrfs_set_lock_blocking(next); 6043 btrfs_set_lock_blocking(next);
5671 } 6044 }
@@ -5898,6 +6271,8 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
5898 BUG_ON(!wc); 6271 BUG_ON(!wc);
5899 6272
5900 trans = btrfs_start_transaction(tree_root, 0); 6273 trans = btrfs_start_transaction(tree_root, 0);
6274 BUG_ON(IS_ERR(trans));
6275
5901 if (block_rsv) 6276 if (block_rsv)
5902 trans->block_rsv = block_rsv; 6277 trans->block_rsv = block_rsv;
5903 6278
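Note: btrfs_start_transaction() reports failure as an ERR_PTR rather than NULL, so this hunk and the next one assert with IS_ERR() instead of assuming success. For readers unfamiliar with the convention, a userspace imitation (the MAX_ERRNO value matches the kernel's; everything else is invented):

#include <errno.h>
#include <stdio.h>

/* Small negative errno values are smuggled inside the pointer. */
#define MAX_ERRNO 4095

static void *ERR_PTR(long err) { return (void *)err; }
static long PTR_ERR(const void *p) { return (long)p; }
static int IS_ERR(const void *p)
{
        return (unsigned long)p >= (unsigned long)-MAX_ERRNO;
}

static void *start_transaction(int fail)
{
        static int handle;              /* stand-in for a real handle */
        return fail ? ERR_PTR(-ENOMEM) : &handle;
}

int main(void)
{
        void *trans = start_transaction(1);

        if (IS_ERR(trans))
                printf("start_transaction failed: %ld\n", PTR_ERR(trans));
        return 0;
}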
@@ -5995,11 +6370,12 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
5995 6370
5996 btrfs_end_transaction_throttle(trans, tree_root); 6371 btrfs_end_transaction_throttle(trans, tree_root);
5997 trans = btrfs_start_transaction(tree_root, 0); 6372 trans = btrfs_start_transaction(tree_root, 0);
6373 BUG_ON(IS_ERR(trans));
5998 if (block_rsv) 6374 if (block_rsv)
5999 trans->block_rsv = block_rsv; 6375 trans->block_rsv = block_rsv;
6000 } 6376 }
6001 } 6377 }
6002 btrfs_release_path(root, path); 6378 btrfs_release_path(path);
6003 BUG_ON(err); 6379 BUG_ON(err);
6004 6380
6005 ret = btrfs_del_root(trans, tree_root, &root->root_key); 6381 ret = btrfs_del_root(trans, tree_root, &root->root_key);
@@ -6010,9 +6386,13 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
6010 NULL, NULL); 6386 NULL, NULL);
6011 BUG_ON(ret < 0); 6387 BUG_ON(ret < 0);
6012 if (ret > 0) { 6388 if (ret > 0) {
6013 ret = btrfs_del_orphan_item(trans, tree_root, 6389 /* if we fail to delete the orphan item this time
6014 root->root_key.objectid); 6390 * around, it'll get picked up the next time.
6015 BUG_ON(ret); 6391 *
6392 * The most common failure here is just -ENOENT.
6393 */
6394 btrfs_del_orphan_item(trans, tree_root,
6395 root->root_key.objectid);
6016 } 6396 }
6017 } 6397 }
6018 6398
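Note: the hunk above downgrades a BUG_ON() to best-effort cleanup: the orphan item delete is allowed to fail because the next cleanup pass will find the item again, and the usual failure is a harmless -ENOENT. A tiny sketch of that tolerance, with the helper hypothetical:

#include <errno.h>
#include <stdio.h>

/* Hypothetical stand-in: deleting an item that is already gone
 * reports -ENOENT, which is harmless for orphan cleanup. */
static int del_orphan_item(int present)
{
        return present ? 0 : -ENOENT;
}

int main(void)
{
        int ret = del_orphan_item(0);

        if (ret && ret != -ENOENT)
                fprintf(stderr, "real failure: %d\n", ret);
        /* -ENOENT falls through: a later cleanup pass would simply
         * find nothing left to do. */
        return 0;
}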
@@ -6050,10 +6430,14 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
6050 BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); 6430 BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
6051 6431
6052 path = btrfs_alloc_path(); 6432 path = btrfs_alloc_path();
6053 BUG_ON(!path); 6433 if (!path)
6434 return -ENOMEM;
6054 6435
6055 wc = kzalloc(sizeof(*wc), GFP_NOFS); 6436 wc = kzalloc(sizeof(*wc), GFP_NOFS);
6056 BUG_ON(!wc); 6437 if (!wc) {
6438 btrfs_free_path(path);
6439 return -ENOMEM;
6440 }
6057 6441
6058 btrfs_assert_tree_locked(parent); 6442 btrfs_assert_tree_locked(parent);
6059 parent_level = btrfs_header_level(parent); 6443 parent_level = btrfs_header_level(parent);
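Note: two allocation BUG_ON()s become real error paths here, and the second failure now unwinds the first allocation before returning -ENOMEM. The same unwinding shape in miniature; the function and type names are invented:

#include <stdlib.h>

struct path { int dummy; };
struct walk_control { int dummy; };

/* The pattern the hunk adopts: each allocation failure releases the
 * allocations that already succeeded instead of crashing. */
static int drop_subtree_prologue(struct path **pathp,
                                 struct walk_control **wcp)
{
        struct path *path = calloc(1, sizeof(*path));
        struct walk_control *wc;

        if (!path)
                return -1;              /* ~ -ENOMEM */

        wc = calloc(1, sizeof(*wc));
        if (!wc) {
                free(path);             /* release what we already hold */
                return -1;
        }
        *pathp = path;
        *wcp = wc;
        return 0;
}

int main(void)
{
        struct path *p;
        struct walk_control *wc;

        if (drop_subtree_prologue(&p, &wc) == 0) {
                free(wc);
                free(p);
        }
        return 0;
}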
@@ -6095,1500 +6479,20 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
6095 return ret; 6479 return ret;
6096} 6480}
6097 6481
6098#if 0
6099static unsigned long calc_ra(unsigned long start, unsigned long last,
6100 unsigned long nr)
6101{
6102 return min(last, start + nr - 1);
6103}
6104
6105static noinline int relocate_inode_pages(struct inode *inode, u64 start,
6106 u64 len)
6107{
6108 u64 page_start;
6109 u64 page_end;
6110 unsigned long first_index;
6111 unsigned long last_index;
6112 unsigned long i;
6113 struct page *page;
6114 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
6115 struct file_ra_state *ra;
6116 struct btrfs_ordered_extent *ordered;
6117 unsigned int total_read = 0;
6118 unsigned int total_dirty = 0;
6119 int ret = 0;
6120
6121 ra = kzalloc(sizeof(*ra), GFP_NOFS);
6122
6123 mutex_lock(&inode->i_mutex);
6124 first_index = start >> PAGE_CACHE_SHIFT;
6125 last_index = (start + len - 1) >> PAGE_CACHE_SHIFT;
6126
6127 /* make sure the dirty trick played by the caller works */
6128 ret = invalidate_inode_pages2_range(inode->i_mapping,
6129 first_index, last_index);
6130 if (ret)
6131 goto out_unlock;
6132
6133 file_ra_state_init(ra, inode->i_mapping);
6134
6135 for (i = first_index ; i <= last_index; i++) {
6136 if (total_read % ra->ra_pages == 0) {
6137 btrfs_force_ra(inode->i_mapping, ra, NULL, i,
6138 calc_ra(i, last_index, ra->ra_pages));
6139 }
6140 total_read++;
6141again:
6142 if (((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode))
6143 BUG_ON(1);
6144 page = grab_cache_page(inode->i_mapping, i);
6145 if (!page) {
6146 ret = -ENOMEM;
6147 goto out_unlock;
6148 }
6149 if (!PageUptodate(page)) {
6150 btrfs_readpage(NULL, page);
6151 lock_page(page);
6152 if (!PageUptodate(page)) {
6153 unlock_page(page);
6154 page_cache_release(page);
6155 ret = -EIO;
6156 goto out_unlock;
6157 }
6158 }
6159 wait_on_page_writeback(page);
6160
6161 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
6162 page_end = page_start + PAGE_CACHE_SIZE - 1;
6163 lock_extent(io_tree, page_start, page_end, GFP_NOFS);
6164
6165 ordered = btrfs_lookup_ordered_extent(inode, page_start);
6166 if (ordered) {
6167 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
6168 unlock_page(page);
6169 page_cache_release(page);
6170 btrfs_start_ordered_extent(inode, ordered, 1);
6171 btrfs_put_ordered_extent(ordered);
6172 goto again;
6173 }
6174 set_page_extent_mapped(page);
6175
6176 if (i == first_index)
6177 set_extent_bits(io_tree, page_start, page_end,
6178 EXTENT_BOUNDARY, GFP_NOFS);
6179 btrfs_set_extent_delalloc(inode, page_start, page_end);
6180
6181 set_page_dirty(page);
6182 total_dirty++;
6183
6184 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
6185 unlock_page(page);
6186 page_cache_release(page);
6187 }
6188
6189out_unlock:
6190 kfree(ra);
6191 mutex_unlock(&inode->i_mutex);
6192 balance_dirty_pages_ratelimited_nr(inode->i_mapping, total_dirty);
6193 return ret;
6194}
6195
6196static noinline int relocate_data_extent(struct inode *reloc_inode,
6197 struct btrfs_key *extent_key,
6198 u64 offset)
6199{
6200 struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
6201 struct extent_map_tree *em_tree = &BTRFS_I(reloc_inode)->extent_tree;
6202 struct extent_map *em;
6203 u64 start = extent_key->objectid - offset;
6204 u64 end = start + extent_key->offset - 1;
6205
6206 em = alloc_extent_map(GFP_NOFS);
6207 BUG_ON(!em || IS_ERR(em));
6208
6209 em->start = start;
6210 em->len = extent_key->offset;
6211 em->block_len = extent_key->offset;
6212 em->block_start = extent_key->objectid;
6213 em->bdev = root->fs_info->fs_devices->latest_bdev;
6214 set_bit(EXTENT_FLAG_PINNED, &em->flags);
6215
6216 /* setup extent map to cheat btrfs_readpage */
6217 lock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS);
6218 while (1) {
6219 int ret;
6220 write_lock(&em_tree->lock);
6221 ret = add_extent_mapping(em_tree, em);
6222 write_unlock(&em_tree->lock);
6223 if (ret != -EEXIST) {
6224 free_extent_map(em);
6225 break;
6226 }
6227 btrfs_drop_extent_cache(reloc_inode, start, end, 0);
6228 }
6229 unlock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS);
6230
6231 return relocate_inode_pages(reloc_inode, start, extent_key->offset);
6232}
6233
6234struct btrfs_ref_path {
6235 u64 extent_start;
6236 u64 nodes[BTRFS_MAX_LEVEL];
6237 u64 root_objectid;
6238 u64 root_generation;
6239 u64 owner_objectid;
6240 u32 num_refs;
6241 int lowest_level;
6242 int current_level;
6243 int shared_level;
6244
6245 struct btrfs_key node_keys[BTRFS_MAX_LEVEL];
6246 u64 new_nodes[BTRFS_MAX_LEVEL];
6247};
6248
6249struct disk_extent {
6250 u64 ram_bytes;
6251 u64 disk_bytenr;
6252 u64 disk_num_bytes;
6253 u64 offset;
6254 u64 num_bytes;
6255 u8 compression;
6256 u8 encryption;
6257 u16 other_encoding;
6258};
6259
6260static int is_cowonly_root(u64 root_objectid)
6261{
6262 if (root_objectid == BTRFS_ROOT_TREE_OBJECTID ||
6263 root_objectid == BTRFS_EXTENT_TREE_OBJECTID ||
6264 root_objectid == BTRFS_CHUNK_TREE_OBJECTID ||
6265 root_objectid == BTRFS_DEV_TREE_OBJECTID ||
6266 root_objectid == BTRFS_TREE_LOG_OBJECTID ||
6267 root_objectid == BTRFS_CSUM_TREE_OBJECTID)
6268 return 1;
6269 return 0;
6270}
6271
6272static noinline int __next_ref_path(struct btrfs_trans_handle *trans,
6273 struct btrfs_root *extent_root,
6274 struct btrfs_ref_path *ref_path,
6275 int first_time)
6276{
6277 struct extent_buffer *leaf;
6278 struct btrfs_path *path;
6279 struct btrfs_extent_ref *ref;
6280 struct btrfs_key key;
6281 struct btrfs_key found_key;
6282 u64 bytenr;
6283 u32 nritems;
6284 int level;
6285 int ret = 1;
6286
6287 path = btrfs_alloc_path();
6288 if (!path)
6289 return -ENOMEM;
6290
6291 if (first_time) {
6292 ref_path->lowest_level = -1;
6293 ref_path->current_level = -1;
6294 ref_path->shared_level = -1;
6295 goto walk_up;
6296 }
6297walk_down:
6298 level = ref_path->current_level - 1;
6299 while (level >= -1) {
6300 u64 parent;
6301 if (level < ref_path->lowest_level)
6302 break;
6303
6304 if (level >= 0)
6305 bytenr = ref_path->nodes[level];
6306 else
6307 bytenr = ref_path->extent_start;
6308 BUG_ON(bytenr == 0);
6309
6310 parent = ref_path->nodes[level + 1];
6311 ref_path->nodes[level + 1] = 0;
6312 ref_path->current_level = level;
6313 BUG_ON(parent == 0);
6314
6315 key.objectid = bytenr;
6316 key.offset = parent + 1;
6317 key.type = BTRFS_EXTENT_REF_KEY;
6318
6319 ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0);
6320 if (ret < 0)
6321 goto out;
6322 BUG_ON(ret == 0);
6323
6324 leaf = path->nodes[0];
6325 nritems = btrfs_header_nritems(leaf);
6326 if (path->slots[0] >= nritems) {
6327 ret = btrfs_next_leaf(extent_root, path);
6328 if (ret < 0)
6329 goto out;
6330 if (ret > 0)
6331 goto next;
6332 leaf = path->nodes[0];
6333 }
6334
6335 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
6336 if (found_key.objectid == bytenr &&
6337 found_key.type == BTRFS_EXTENT_REF_KEY) {
6338 if (level < ref_path->shared_level)
6339 ref_path->shared_level = level;
6340 goto found;
6341 }
6342next:
6343 level--;
6344 btrfs_release_path(extent_root, path);
6345 cond_resched();
6346 }
6347 /* reached lowest level */
6348 ret = 1;
6349 goto out;
6350walk_up:
6351 level = ref_path->current_level;
6352 while (level < BTRFS_MAX_LEVEL - 1) {
6353 u64 ref_objectid;
6354
6355 if (level >= 0)
6356 bytenr = ref_path->nodes[level];
6357 else
6358 bytenr = ref_path->extent_start;
6359
6360 BUG_ON(bytenr == 0);
6361
6362 key.objectid = bytenr;
6363 key.offset = 0;
6364 key.type = BTRFS_EXTENT_REF_KEY;
6365
6366 ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0);
6367 if (ret < 0)
6368 goto out;
6369
6370 leaf = path->nodes[0];
6371 nritems = btrfs_header_nritems(leaf);
6372 if (path->slots[0] >= nritems) {
6373 ret = btrfs_next_leaf(extent_root, path);
6374 if (ret < 0)
6375 goto out;
6376 if (ret > 0) {
6377 /* the extent was freed by someone */
6378 if (ref_path->lowest_level == level)
6379 goto out;
6380 btrfs_release_path(extent_root, path);
6381 goto walk_down;
6382 }
6383 leaf = path->nodes[0];
6384 }
6385
6386 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
6387 if (found_key.objectid != bytenr ||
6388 found_key.type != BTRFS_EXTENT_REF_KEY) {
6389 /* the extent was freed by someone */
6390 if (ref_path->lowest_level == level) {
6391 ret = 1;
6392 goto out;
6393 }
6394 btrfs_release_path(extent_root, path);
6395 goto walk_down;
6396 }
6397found:
6398 ref = btrfs_item_ptr(leaf, path->slots[0],
6399 struct btrfs_extent_ref);
6400 ref_objectid = btrfs_ref_objectid(leaf, ref);
6401 if (ref_objectid < BTRFS_FIRST_FREE_OBJECTID) {
6402 if (first_time) {
6403 level = (int)ref_objectid;
6404 BUG_ON(level >= BTRFS_MAX_LEVEL);
6405 ref_path->lowest_level = level;
6406 ref_path->current_level = level;
6407 ref_path->nodes[level] = bytenr;
6408 } else {
6409 WARN_ON(ref_objectid != level);
6410 }
6411 } else {
6412 WARN_ON(level != -1);
6413 }
6414 first_time = 0;
6415
6416 if (ref_path->lowest_level == level) {
6417 ref_path->owner_objectid = ref_objectid;
6418 ref_path->num_refs = btrfs_ref_num_refs(leaf, ref);
6419 }
6420
6421 /*
6422 * the block is tree root or the block isn't in reference
6423 * counted tree.
6424 */
6425 if (found_key.objectid == found_key.offset ||
6426 is_cowonly_root(btrfs_ref_root(leaf, ref))) {
6427 ref_path->root_objectid = btrfs_ref_root(leaf, ref);
6428 ref_path->root_generation =
6429 btrfs_ref_generation(leaf, ref);
6430 if (level < 0) {
6431 /* special reference from the tree log */
6432 ref_path->nodes[0] = found_key.offset;
6433 ref_path->current_level = 0;
6434 }
6435 ret = 0;
6436 goto out;
6437 }
6438
6439 level++;
6440 BUG_ON(ref_path->nodes[level] != 0);
6441 ref_path->nodes[level] = found_key.offset;
6442 ref_path->current_level = level;
6443
6444 /*
6445 * the reference was created in the running transaction,
6446 * no need to continue walking up.
6447 */
6448 if (btrfs_ref_generation(leaf, ref) == trans->transid) {
6449 ref_path->root_objectid = btrfs_ref_root(leaf, ref);
6450 ref_path->root_generation =
6451 btrfs_ref_generation(leaf, ref);
6452 ret = 0;
6453 goto out;
6454 }
6455
6456 btrfs_release_path(extent_root, path);
6457 cond_resched();
6458 }
6459 /* reached max tree level, but no tree root found. */
6460 BUG();
6461out:
6462 btrfs_free_path(path);
6463 return ret;
6464}
6465
6466static int btrfs_first_ref_path(struct btrfs_trans_handle *trans,
6467 struct btrfs_root *extent_root,
6468 struct btrfs_ref_path *ref_path,
6469 u64 extent_start)
6470{
6471 memset(ref_path, 0, sizeof(*ref_path));
6472 ref_path->extent_start = extent_start;
6473
6474 return __next_ref_path(trans, extent_root, ref_path, 1);
6475}
6476
6477static int btrfs_next_ref_path(struct btrfs_trans_handle *trans,
6478 struct btrfs_root *extent_root,
6479 struct btrfs_ref_path *ref_path)
6480{
6481 return __next_ref_path(trans, extent_root, ref_path, 0);
6482}
6483
6484static noinline int get_new_locations(struct inode *reloc_inode,
6485 struct btrfs_key *extent_key,
6486 u64 offset, int no_fragment,
6487 struct disk_extent **extents,
6488 int *nr_extents)
6489{
6490 struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
6491 struct btrfs_path *path;
6492 struct btrfs_file_extent_item *fi;
6493 struct extent_buffer *leaf;
6494 struct disk_extent *exts = *extents;
6495 struct btrfs_key found_key;
6496 u64 cur_pos;
6497 u64 last_byte;
6498 u32 nritems;
6499 int nr = 0;
6500 int max = *nr_extents;
6501 int ret;
6502
6503 WARN_ON(!no_fragment && *extents);
6504 if (!exts) {
6505 max = 1;
6506 exts = kmalloc(sizeof(*exts) * max, GFP_NOFS);
6507 if (!exts)
6508 return -ENOMEM;
6509 }
6510
6511 path = btrfs_alloc_path();
6512 BUG_ON(!path);
6513
6514 cur_pos = extent_key->objectid - offset;
6515 last_byte = extent_key->objectid + extent_key->offset;
6516 ret = btrfs_lookup_file_extent(NULL, root, path, reloc_inode->i_ino,
6517 cur_pos, 0);
6518 if (ret < 0)
6519 goto out;
6520 if (ret > 0) {
6521 ret = -ENOENT;
6522 goto out;
6523 }
6524
6525 while (1) {
6526 leaf = path->nodes[0];
6527 nritems = btrfs_header_nritems(leaf);
6528 if (path->slots[0] >= nritems) {
6529 ret = btrfs_next_leaf(root, path);
6530 if (ret < 0)
6531 goto out;
6532 if (ret > 0)
6533 break;
6534 leaf = path->nodes[0];
6535 }
6536
6537 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
6538 if (found_key.offset != cur_pos ||
6539 found_key.type != BTRFS_EXTENT_DATA_KEY ||
6540 found_key.objectid != reloc_inode->i_ino)
6541 break;
6542
6543 fi = btrfs_item_ptr(leaf, path->slots[0],
6544 struct btrfs_file_extent_item);
6545 if (btrfs_file_extent_type(leaf, fi) !=
6546 BTRFS_FILE_EXTENT_REG ||
6547 btrfs_file_extent_disk_bytenr(leaf, fi) == 0)
6548 break;
6549
6550 if (nr == max) {
6551 struct disk_extent *old = exts;
6552 max *= 2;
6553 exts = kzalloc(sizeof(*exts) * max, GFP_NOFS);
6554 memcpy(exts, old, sizeof(*exts) * nr);
6555 if (old != *extents)
6556 kfree(old);
6557 }
6558
6559 exts[nr].disk_bytenr =
6560 btrfs_file_extent_disk_bytenr(leaf, fi);
6561 exts[nr].disk_num_bytes =
6562 btrfs_file_extent_disk_num_bytes(leaf, fi);
6563 exts[nr].offset = btrfs_file_extent_offset(leaf, fi);
6564 exts[nr].num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
6565 exts[nr].ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi);
6566 exts[nr].compression = btrfs_file_extent_compression(leaf, fi);
6567 exts[nr].encryption = btrfs_file_extent_encryption(leaf, fi);
6568 exts[nr].other_encoding = btrfs_file_extent_other_encoding(leaf,
6569 fi);
6570 BUG_ON(exts[nr].offset > 0);
6571 BUG_ON(exts[nr].compression || exts[nr].encryption);
6572 BUG_ON(exts[nr].num_bytes != exts[nr].disk_num_bytes);
6573
6574 cur_pos += exts[nr].num_bytes;
6575 nr++;
6576
6577 if (cur_pos + offset >= last_byte)
6578 break;
6579
6580 if (no_fragment) {
6581 ret = 1;
6582 goto out;
6583 }
6584 path->slots[0]++;
6585 }
6586
6587 BUG_ON(cur_pos + offset > last_byte);
6588 if (cur_pos + offset < last_byte) {
6589 ret = -ENOENT;
6590 goto out;
6591 }
6592 ret = 0;
6593out:
6594 btrfs_free_path(path);
6595 if (ret) {
6596 if (exts != *extents)
6597 kfree(exts);
6598 } else {
6599 *extents = exts;
6600 *nr_extents = nr;
6601 }
6602 return ret;
6603}
6604
6605static noinline int replace_one_extent(struct btrfs_trans_handle *trans,
6606 struct btrfs_root *root,
6607 struct btrfs_path *path,
6608 struct btrfs_key *extent_key,
6609 struct btrfs_key *leaf_key,
6610 struct btrfs_ref_path *ref_path,
6611 struct disk_extent *new_extents,
6612 int nr_extents)
6613{
6614 struct extent_buffer *leaf;
6615 struct btrfs_file_extent_item *fi;
6616 struct inode *inode = NULL;
6617 struct btrfs_key key;
6618 u64 lock_start = 0;
6619 u64 lock_end = 0;
6620 u64 num_bytes;
6621 u64 ext_offset;
6622 u64 search_end = (u64)-1;
6623 u32 nritems;
6624 int nr_scaned = 0;
6625 int extent_locked = 0;
6626 int extent_type;
6627 int ret;
6628
6629 memcpy(&key, leaf_key, sizeof(key));
6630 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
6631 if (key.objectid < ref_path->owner_objectid ||
6632 (key.objectid == ref_path->owner_objectid &&
6633 key.type < BTRFS_EXTENT_DATA_KEY)) {
6634 key.objectid = ref_path->owner_objectid;
6635 key.type = BTRFS_EXTENT_DATA_KEY;
6636 key.offset = 0;
6637 }
6638 }
6639
6640 while (1) {
6641 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
6642 if (ret < 0)
6643 goto out;
6644
6645 leaf = path->nodes[0];
6646 nritems = btrfs_header_nritems(leaf);
6647next:
6648 if (extent_locked && ret > 0) {
6649 /*
6650 * the file extent item was modified by someone
6651 * before the extent got locked.
6652 */
6653 unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
6654 lock_end, GFP_NOFS);
6655 extent_locked = 0;
6656 }
6657
6658 if (path->slots[0] >= nritems) {
6659 if (++nr_scaned > 2)
6660 break;
6661
6662 BUG_ON(extent_locked);
6663 ret = btrfs_next_leaf(root, path);
6664 if (ret < 0)
6665 goto out;
6666 if (ret > 0)
6667 break;
6668 leaf = path->nodes[0];
6669 nritems = btrfs_header_nritems(leaf);
6670 }
6671
6672 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
6673
6674 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) {
6675 if ((key.objectid > ref_path->owner_objectid) ||
6676 (key.objectid == ref_path->owner_objectid &&
6677 key.type > BTRFS_EXTENT_DATA_KEY) ||
6678 key.offset >= search_end)
6679 break;
6680 }
6681
6682 if (inode && key.objectid != inode->i_ino) {
6683 BUG_ON(extent_locked);
6684 btrfs_release_path(root, path);
6685 mutex_unlock(&inode->i_mutex);
6686 iput(inode);
6687 inode = NULL;
6688 continue;
6689 }
6690
6691 if (key.type != BTRFS_EXTENT_DATA_KEY) {
6692 path->slots[0]++;
6693 ret = 1;
6694 goto next;
6695 }
6696 fi = btrfs_item_ptr(leaf, path->slots[0],
6697 struct btrfs_file_extent_item);
6698 extent_type = btrfs_file_extent_type(leaf, fi);
6699 if ((extent_type != BTRFS_FILE_EXTENT_REG &&
6700 extent_type != BTRFS_FILE_EXTENT_PREALLOC) ||
6701 (btrfs_file_extent_disk_bytenr(leaf, fi) !=
6702 extent_key->objectid)) {
6703 path->slots[0]++;
6704 ret = 1;
6705 goto next;
6706 }
6707
6708 num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
6709 ext_offset = btrfs_file_extent_offset(leaf, fi);
6710
6711 if (search_end == (u64)-1) {
6712 search_end = key.offset - ext_offset +
6713 btrfs_file_extent_ram_bytes(leaf, fi);
6714 }
6715
6716 if (!extent_locked) {
6717 lock_start = key.offset;
6718 lock_end = lock_start + num_bytes - 1;
6719 } else {
6720 if (lock_start > key.offset ||
6721 lock_end + 1 < key.offset + num_bytes) {
6722 unlock_extent(&BTRFS_I(inode)->io_tree,
6723 lock_start, lock_end, GFP_NOFS);
6724 extent_locked = 0;
6725 }
6726 }
6727
6728 if (!inode) {
6729 btrfs_release_path(root, path);
6730
6731 inode = btrfs_iget_locked(root->fs_info->sb,
6732 key.objectid, root);
6733 if (inode->i_state & I_NEW) {
6734 BTRFS_I(inode)->root = root;
6735 BTRFS_I(inode)->location.objectid =
6736 key.objectid;
6737 BTRFS_I(inode)->location.type =
6738 BTRFS_INODE_ITEM_KEY;
6739 BTRFS_I(inode)->location.offset = 0;
6740 btrfs_read_locked_inode(inode);
6741 unlock_new_inode(inode);
6742 }
6743 /*
6744 * some code calls btrfs_commit_transaction while
6745 * holding the i_mutex, so we can't use mutex_lock
6746 * here.
6747 */
6748 if (is_bad_inode(inode) ||
6749 !mutex_trylock(&inode->i_mutex)) {
6750 iput(inode);
6751 inode = NULL;
6752 key.offset = (u64)-1;
6753 goto skip;
6754 }
6755 }
6756
6757 if (!extent_locked) {
6758 struct btrfs_ordered_extent *ordered;
6759
6760 btrfs_release_path(root, path);
6761
6762 lock_extent(&BTRFS_I(inode)->io_tree, lock_start,
6763 lock_end, GFP_NOFS);
6764 ordered = btrfs_lookup_first_ordered_extent(inode,
6765 lock_end);
6766 if (ordered &&
6767 ordered->file_offset <= lock_end &&
6768 ordered->file_offset + ordered->len > lock_start) {
6769 unlock_extent(&BTRFS_I(inode)->io_tree,
6770 lock_start, lock_end, GFP_NOFS);
6771 btrfs_start_ordered_extent(inode, ordered, 1);
6772 btrfs_put_ordered_extent(ordered);
6773 key.offset += num_bytes;
6774 goto skip;
6775 }
6776 if (ordered)
6777 btrfs_put_ordered_extent(ordered);
6778
6779 extent_locked = 1;
6780 continue;
6781 }
6782
6783 if (nr_extents == 1) {
6784 /* update extent pointer in place */
6785 btrfs_set_file_extent_disk_bytenr(leaf, fi,
6786 new_extents[0].disk_bytenr);
6787 btrfs_set_file_extent_disk_num_bytes(leaf, fi,
6788 new_extents[0].disk_num_bytes);
6789 btrfs_mark_buffer_dirty(leaf);
6790
6791 btrfs_drop_extent_cache(inode, key.offset,
6792 key.offset + num_bytes - 1, 0);
6793
6794 ret = btrfs_inc_extent_ref(trans, root,
6795 new_extents[0].disk_bytenr,
6796 new_extents[0].disk_num_bytes,
6797 leaf->start,
6798 root->root_key.objectid,
6799 trans->transid,
6800 key.objectid);
6801 BUG_ON(ret);
6802
6803 ret = btrfs_free_extent(trans, root,
6804 extent_key->objectid,
6805 extent_key->offset,
6806 leaf->start,
6807 btrfs_header_owner(leaf),
6808 btrfs_header_generation(leaf),
6809 key.objectid, 0);
6810 BUG_ON(ret);
6811
6812 btrfs_release_path(root, path);
6813 key.offset += num_bytes;
6814 } else {
6815 BUG_ON(1);
6816#if 0
6817 u64 alloc_hint;
6818 u64 extent_len;
6819 int i;
6820 /*
6821 * drop the old extent pointer first, then insert the
6822 * new pointers one by one
6823 */
6824 btrfs_release_path(root, path);
6825 ret = btrfs_drop_extents(trans, root, inode, key.offset,
6826 key.offset + num_bytes,
6827 key.offset, &alloc_hint);
6828 BUG_ON(ret);
6829
6830 for (i = 0; i < nr_extents; i++) {
6831 if (ext_offset >= new_extents[i].num_bytes) {
6832 ext_offset -= new_extents[i].num_bytes;
6833 continue;
6834 }
6835 extent_len = min(new_extents[i].num_bytes -
6836 ext_offset, num_bytes);
6837
6838 ret = btrfs_insert_empty_item(trans, root,
6839 path, &key,
6840 sizeof(*fi));
6841 BUG_ON(ret);
6842
6843 leaf = path->nodes[0];
6844 fi = btrfs_item_ptr(leaf, path->slots[0],
6845 struct btrfs_file_extent_item);
6846 btrfs_set_file_extent_generation(leaf, fi,
6847 trans->transid);
6848 btrfs_set_file_extent_type(leaf, fi,
6849 BTRFS_FILE_EXTENT_REG);
6850 btrfs_set_file_extent_disk_bytenr(leaf, fi,
6851 new_extents[i].disk_bytenr);
6852 btrfs_set_file_extent_disk_num_bytes(leaf, fi,
6853 new_extents[i].disk_num_bytes);
6854 btrfs_set_file_extent_ram_bytes(leaf, fi,
6855 new_extents[i].ram_bytes);
6856
6857 btrfs_set_file_extent_compression(leaf, fi,
6858 new_extents[i].compression);
6859 btrfs_set_file_extent_encryption(leaf, fi,
6860 new_extents[i].encryption);
6861 btrfs_set_file_extent_other_encoding(leaf, fi,
6862 new_extents[i].other_encoding);
6863
6864 btrfs_set_file_extent_num_bytes(leaf, fi,
6865 extent_len);
6866 ext_offset += new_extents[i].offset;
6867 btrfs_set_file_extent_offset(leaf, fi,
6868 ext_offset);
6869 btrfs_mark_buffer_dirty(leaf);
6870
6871 btrfs_drop_extent_cache(inode, key.offset,
6872 key.offset + extent_len - 1, 0);
6873
6874 ret = btrfs_inc_extent_ref(trans, root,
6875 new_extents[i].disk_bytenr,
6876 new_extents[i].disk_num_bytes,
6877 leaf->start,
6878 root->root_key.objectid,
6879 trans->transid, key.objectid);
6880 BUG_ON(ret);
6881 btrfs_release_path(root, path);
6882
6883 inode_add_bytes(inode, extent_len);
6884
6885 ext_offset = 0;
6886 num_bytes -= extent_len;
6887 key.offset += extent_len;
6888
6889 if (num_bytes == 0)
6890 break;
6891 }
6892 BUG_ON(i >= nr_extents);
6893#endif
6894 }
6895
6896 if (extent_locked) {
6897 unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
6898 lock_end, GFP_NOFS);
6899 extent_locked = 0;
6900 }
6901skip:
6902 if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS &&
6903 key.offset >= search_end)
6904 break;
6905
6906 cond_resched();
6907 }
6908 ret = 0;
6909out:
6910 btrfs_release_path(root, path);
6911 if (inode) {
6912 mutex_unlock(&inode->i_mutex);
6913 if (extent_locked) {
6914 unlock_extent(&BTRFS_I(inode)->io_tree, lock_start,
6915 lock_end, GFP_NOFS);
6916 }
6917 iput(inode);
6918 }
6919 return ret;
6920}
6921
6922int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans,
6923 struct btrfs_root *root,
6924 struct extent_buffer *buf, u64 orig_start)
6925{
6926 int level;
6927 int ret;
6928
6929 BUG_ON(btrfs_header_generation(buf) != trans->transid);
6930 BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
6931
6932 level = btrfs_header_level(buf);
6933 if (level == 0) {
6934 struct btrfs_leaf_ref *ref;
6935 struct btrfs_leaf_ref *orig_ref;
6936
6937 orig_ref = btrfs_lookup_leaf_ref(root, orig_start);
6938 if (!orig_ref)
6939 return -ENOENT;
6940
6941 ref = btrfs_alloc_leaf_ref(root, orig_ref->nritems);
6942 if (!ref) {
6943 btrfs_free_leaf_ref(root, orig_ref);
6944 return -ENOMEM;
6945 }
6946
6947 ref->nritems = orig_ref->nritems;
6948 memcpy(ref->extents, orig_ref->extents,
6949 sizeof(ref->extents[0]) * ref->nritems);
6950
6951 btrfs_free_leaf_ref(root, orig_ref);
6952
6953 ref->root_gen = trans->transid;
6954 ref->bytenr = buf->start;
6955 ref->owner = btrfs_header_owner(buf);
6956 ref->generation = btrfs_header_generation(buf);
6957
6958 ret = btrfs_add_leaf_ref(root, ref, 0);
6959 WARN_ON(ret);
6960 btrfs_free_leaf_ref(root, ref);
6961 }
6962 return 0;
6963}
6964
6965static noinline int invalidate_extent_cache(struct btrfs_root *root,
6966 struct extent_buffer *leaf,
6967 struct btrfs_block_group_cache *group,
6968 struct btrfs_root *target_root)
6969{
6970 struct btrfs_key key;
6971 struct inode *inode = NULL;
6972 struct btrfs_file_extent_item *fi;
6973 struct extent_state *cached_state = NULL;
6974 u64 num_bytes;
6975 u64 skip_objectid = 0;
6976 u32 nritems;
6977 u32 i;
6978
6979 nritems = btrfs_header_nritems(leaf);
6980 for (i = 0; i < nritems; i++) {
6981 btrfs_item_key_to_cpu(leaf, &key, i);
6982 if (key.objectid == skip_objectid ||
6983 key.type != BTRFS_EXTENT_DATA_KEY)
6984 continue;
6985 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
6986 if (btrfs_file_extent_type(leaf, fi) ==
6987 BTRFS_FILE_EXTENT_INLINE)
6988 continue;
6989 if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0)
6990 continue;
6991 if (!inode || inode->i_ino != key.objectid) {
6992 iput(inode);
6993 inode = btrfs_ilookup(target_root->fs_info->sb,
6994 key.objectid, target_root, 1);
6995 }
6996 if (!inode) {
6997 skip_objectid = key.objectid;
6998 continue;
6999 }
7000 num_bytes = btrfs_file_extent_num_bytes(leaf, fi);
7001
7002 lock_extent_bits(&BTRFS_I(inode)->io_tree, key.offset,
7003 key.offset + num_bytes - 1, 0, &cached_state,
7004 GFP_NOFS);
7005 btrfs_drop_extent_cache(inode, key.offset,
7006 key.offset + num_bytes - 1, 1);
7007 unlock_extent_cached(&BTRFS_I(inode)->io_tree, key.offset,
7008 key.offset + num_bytes - 1, &cached_state,
7009 GFP_NOFS);
7010 cond_resched();
7011 }
7012 iput(inode);
7013 return 0;
7014}
7015
7016static noinline int replace_extents_in_leaf(struct btrfs_trans_handle *trans,
7017 struct btrfs_root *root,
7018 struct extent_buffer *leaf,
7019 struct btrfs_block_group_cache *group,
7020 struct inode *reloc_inode)
7021{
7022 struct btrfs_key key;
7023 struct btrfs_key extent_key;
7024 struct btrfs_file_extent_item *fi;
7025 struct btrfs_leaf_ref *ref;
7026 struct disk_extent *new_extent;
7027 u64 bytenr;
7028 u64 num_bytes;
7029 u32 nritems;
7030 u32 i;
7031 int ext_index;
7032 int nr_extent;
7033 int ret;
7034
7035 new_extent = kmalloc(sizeof(*new_extent), GFP_NOFS);
7036 BUG_ON(!new_extent);
7037
7038 ref = btrfs_lookup_leaf_ref(root, leaf->start);
7039 BUG_ON(!ref);
7040
7041 ext_index = -1;
7042 nritems = btrfs_header_nritems(leaf);
7043 for (i = 0; i < nritems; i++) {
7044 btrfs_item_key_to_cpu(leaf, &key, i);
7045 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
7046 continue;
7047 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
7048 if (btrfs_file_extent_type(leaf, fi) ==
7049 BTRFS_FILE_EXTENT_INLINE)
7050 continue;
7051 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
7052 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
7053 if (bytenr == 0)
7054 continue;
7055
7056 ext_index++;
7057 if (bytenr >= group->key.objectid + group->key.offset ||
7058 bytenr + num_bytes <= group->key.objectid)
7059 continue;
7060
7061 extent_key.objectid = bytenr;
7062 extent_key.offset = num_bytes;
7063 extent_key.type = BTRFS_EXTENT_ITEM_KEY;
7064 nr_extent = 1;
7065 ret = get_new_locations(reloc_inode, &extent_key,
7066 group->key.objectid, 1,
7067 &new_extent, &nr_extent);
7068 if (ret > 0)
7069 continue;
7070 BUG_ON(ret < 0);
7071
7072 BUG_ON(ref->extents[ext_index].bytenr != bytenr);
7073 BUG_ON(ref->extents[ext_index].num_bytes != num_bytes);
7074 ref->extents[ext_index].bytenr = new_extent->disk_bytenr;
7075 ref->extents[ext_index].num_bytes = new_extent->disk_num_bytes;
7076
7077 btrfs_set_file_extent_disk_bytenr(leaf, fi,
7078 new_extent->disk_bytenr);
7079 btrfs_set_file_extent_disk_num_bytes(leaf, fi,
7080 new_extent->disk_num_bytes);
7081 btrfs_mark_buffer_dirty(leaf);
7082
7083 ret = btrfs_inc_extent_ref(trans, root,
7084 new_extent->disk_bytenr,
7085 new_extent->disk_num_bytes,
7086 leaf->start,
7087 root->root_key.objectid,
7088 trans->transid, key.objectid);
7089 BUG_ON(ret);
7090
7091 ret = btrfs_free_extent(trans, root,
7092 bytenr, num_bytes, leaf->start,
7093 btrfs_header_owner(leaf),
7094 btrfs_header_generation(leaf),
7095 key.objectid, 0);
7096 BUG_ON(ret);
7097 cond_resched();
7098 }
7099 kfree(new_extent);
7100 BUG_ON(ext_index + 1 != ref->nritems);
7101 btrfs_free_leaf_ref(root, ref);
7102 return 0;
7103}
7104
7105int btrfs_free_reloc_root(struct btrfs_trans_handle *trans,
7106 struct btrfs_root *root)
7107{
7108 struct btrfs_root *reloc_root;
7109 int ret;
7110
7111 if (root->reloc_root) {
7112 reloc_root = root->reloc_root;
7113 root->reloc_root = NULL;
7114 list_add(&reloc_root->dead_list,
7115 &root->fs_info->dead_reloc_roots);
7116
7117 btrfs_set_root_bytenr(&reloc_root->root_item,
7118 reloc_root->node->start);
7119 btrfs_set_root_level(&root->root_item,
7120 btrfs_header_level(reloc_root->node));
7121 memset(&reloc_root->root_item.drop_progress, 0,
7122 sizeof(struct btrfs_disk_key));
7123 reloc_root->root_item.drop_level = 0;
7124
7125 ret = btrfs_update_root(trans, root->fs_info->tree_root,
7126 &reloc_root->root_key,
7127 &reloc_root->root_item);
7128 BUG_ON(ret);
7129 }
7130 return 0;
7131}
7132
7133int btrfs_drop_dead_reloc_roots(struct btrfs_root *root)
7134{
7135 struct btrfs_trans_handle *trans;
7136 struct btrfs_root *reloc_root;
7137 struct btrfs_root *prev_root = NULL;
7138 struct list_head dead_roots;
7139 int ret;
7140 unsigned long nr;
7141
7142 INIT_LIST_HEAD(&dead_roots);
7143 list_splice_init(&root->fs_info->dead_reloc_roots, &dead_roots);
7144
7145 while (!list_empty(&dead_roots)) {
7146 reloc_root = list_entry(dead_roots.prev,
7147 struct btrfs_root, dead_list);
7148 list_del_init(&reloc_root->dead_list);
7149
7150 BUG_ON(reloc_root->commit_root != NULL);
7151 while (1) {
7152 trans = btrfs_join_transaction(root, 1);
7153 BUG_ON(!trans);
7154
7155 mutex_lock(&root->fs_info->drop_mutex);
7156 ret = btrfs_drop_snapshot(trans, reloc_root);
7157 if (ret != -EAGAIN)
7158 break;
7159 mutex_unlock(&root->fs_info->drop_mutex);
7160
7161 nr = trans->blocks_used;
7162 ret = btrfs_end_transaction(trans, root);
7163 BUG_ON(ret);
7164 btrfs_btree_balance_dirty(root, nr);
7165 }
7166
7167 free_extent_buffer(reloc_root->node);
7168
7169 ret = btrfs_del_root(trans, root->fs_info->tree_root,
7170 &reloc_root->root_key);
7171 BUG_ON(ret);
7172 mutex_unlock(&root->fs_info->drop_mutex);
7173
7174 nr = trans->blocks_used;
7175 ret = btrfs_end_transaction(trans, root);
7176 BUG_ON(ret);
7177 btrfs_btree_balance_dirty(root, nr);
7178
7179 kfree(prev_root);
7180 prev_root = reloc_root;
7181 }
7182 if (prev_root) {
7183 btrfs_remove_leaf_refs(prev_root, (u64)-1, 0);
7184 kfree(prev_root);
7185 }
7186 return 0;
7187}
7188
7189int btrfs_add_dead_reloc_root(struct btrfs_root *root)
7190{
7191 list_add(&root->dead_list, &root->fs_info->dead_reloc_roots);
7192 return 0;
7193}
7194
7195int btrfs_cleanup_reloc_trees(struct btrfs_root *root)
7196{
7197 struct btrfs_root *reloc_root;
7198 struct btrfs_trans_handle *trans;
7199 struct btrfs_key location;
7200 int found;
7201 int ret;
7202
7203 mutex_lock(&root->fs_info->tree_reloc_mutex);
7204 ret = btrfs_find_dead_roots(root, BTRFS_TREE_RELOC_OBJECTID, NULL);
7205 BUG_ON(ret);
7206 found = !list_empty(&root->fs_info->dead_reloc_roots);
7207 mutex_unlock(&root->fs_info->tree_reloc_mutex);
7208
7209 if (found) {
7210 trans = btrfs_start_transaction(root, 1);
7211 BUG_ON(!trans);
7212 ret = btrfs_commit_transaction(trans, root);
7213 BUG_ON(ret);
7214 }
7215
7216 location.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID;
7217 location.offset = (u64)-1;
7218 location.type = BTRFS_ROOT_ITEM_KEY;
7219
7220 reloc_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
7221 BUG_ON(!reloc_root);
7222 btrfs_orphan_cleanup(reloc_root);
7223 return 0;
7224}
7225
7226static noinline int init_reloc_tree(struct btrfs_trans_handle *trans,
7227 struct btrfs_root *root)
7228{
7229 struct btrfs_root *reloc_root;
7230 struct extent_buffer *eb;
7231 struct btrfs_root_item *root_item;
7232 struct btrfs_key root_key;
7233 int ret;
7234
7235 BUG_ON(!root->ref_cows);
7236 if (root->reloc_root)
7237 return 0;
7238
7239 root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
7240 BUG_ON(!root_item);
7241
7242 ret = btrfs_copy_root(trans, root, root->commit_root,
7243 &eb, BTRFS_TREE_RELOC_OBJECTID);
7244 BUG_ON(ret);
7245
7246 root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
7247 root_key.offset = root->root_key.objectid;
7248 root_key.type = BTRFS_ROOT_ITEM_KEY;
7249
7250 memcpy(root_item, &root->root_item, sizeof(root_item));
7251 btrfs_set_root_refs(root_item, 0);
7252 btrfs_set_root_bytenr(root_item, eb->start);
7253 btrfs_set_root_level(root_item, btrfs_header_level(eb));
7254 btrfs_set_root_generation(root_item, trans->transid);
7255
7256 btrfs_tree_unlock(eb);
7257 free_extent_buffer(eb);
7258
7259 ret = btrfs_insert_root(trans, root->fs_info->tree_root,
7260 &root_key, root_item);
7261 BUG_ON(ret);
7262 kfree(root_item);
7263
7264 reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
7265 &root_key);
7266 BUG_ON(!reloc_root);
7267 reloc_root->last_trans = trans->transid;
7268 reloc_root->commit_root = NULL;
7269 reloc_root->ref_tree = &root->fs_info->reloc_ref_tree;
7270
7271 root->reloc_root = reloc_root;
7272 return 0;
7273}
7274
7275/*
7276 * Core function of space balance.
7277 *
7278 * The idea is to use reloc trees to relocate tree blocks in reference
7279 * counted roots. There is one reloc tree for each subvol, and all
7280 * reloc trees share the same root key objectid. Reloc trees are snapshots
7281 * of the latest committed roots of subvols (root->commit_root).
7282 *
7283 * To relocate a tree block referenced by a subvol, there are two steps.
7284 * COW the block through subvol's reloc tree, then update block pointer
7285 * in the subvol to point to the new block. Since all reloc trees share
7286 * the same root key objectid, doing special handling for tree blocks owned
7287 * by them is easy. Once a tree block has been COWed in one reloc tree,
7288 * we can use the resulting new block directly when the same block is
7289 * required to COW again through other reloc trees. This way, relocated
7290 * tree blocks are shared between reloc trees, so they are also shared
7291 * between subvols.
7292 */
7293static noinline int relocate_one_path(struct btrfs_trans_handle *trans,
7294 struct btrfs_root *root,
7295 struct btrfs_path *path,
7296 struct btrfs_key *first_key,
7297 struct btrfs_ref_path *ref_path,
7298 struct btrfs_block_group_cache *group,
7299 struct inode *reloc_inode)
7300{
7301 struct btrfs_root *reloc_root;
7302 struct extent_buffer *eb = NULL;
7303 struct btrfs_key *keys;
7304 u64 *nodes;
7305 int level;
7306 int shared_level;
7307 int lowest_level = 0;
7308 int ret;
7309
7310 if (ref_path->owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
7311 lowest_level = ref_path->owner_objectid;
7312
7313 if (!root->ref_cows) {
7314 path->lowest_level = lowest_level;
7315 ret = btrfs_search_slot(trans, root, first_key, path, 0, 1);
7316 BUG_ON(ret < 0);
7317 path->lowest_level = 0;
7318 btrfs_release_path(root, path);
7319 return 0;
7320 }
7321
7322 mutex_lock(&root->fs_info->tree_reloc_mutex);
7323 ret = init_reloc_tree(trans, root);
7324 BUG_ON(ret);
7325 reloc_root = root->reloc_root;
7326
7327 shared_level = ref_path->shared_level;
7328 ref_path->shared_level = BTRFS_MAX_LEVEL - 1;
7329
7330 keys = ref_path->node_keys;
7331 nodes = ref_path->new_nodes;
7332 memset(&keys[shared_level + 1], 0,
7333 sizeof(*keys) * (BTRFS_MAX_LEVEL - shared_level - 1));
7334 memset(&nodes[shared_level + 1], 0,
7335 sizeof(*nodes) * (BTRFS_MAX_LEVEL - shared_level - 1));
7336
7337 if (nodes[lowest_level] == 0) {
7338 path->lowest_level = lowest_level;
7339 ret = btrfs_search_slot(trans, reloc_root, first_key, path,
7340 0, 1);
7341 BUG_ON(ret);
7342 for (level = lowest_level; level < BTRFS_MAX_LEVEL; level++) {
7343 eb = path->nodes[level];
7344 if (!eb || eb == reloc_root->node)
7345 break;
7346 nodes[level] = eb->start;
7347 if (level == 0)
7348 btrfs_item_key_to_cpu(eb, &keys[level], 0);
7349 else
7350 btrfs_node_key_to_cpu(eb, &keys[level], 0);
7351 }
7352 if (nodes[0] &&
7353 ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
7354 eb = path->nodes[0];
7355 ret = replace_extents_in_leaf(trans, reloc_root, eb,
7356 group, reloc_inode);
7357 BUG_ON(ret);
7358 }
7359 btrfs_release_path(reloc_root, path);
7360 } else {
7361 ret = btrfs_merge_path(trans, reloc_root, keys, nodes,
7362 lowest_level);
7363 BUG_ON(ret);
7364 }
7365
7366 /*
7367 * replace tree blocks in the fs tree with tree blocks in
7368 * the reloc tree.
7369 */
7370 ret = btrfs_merge_path(trans, root, keys, nodes, lowest_level);
7371 BUG_ON(ret < 0);
7372
7373 if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
7374 ret = btrfs_search_slot(trans, reloc_root, first_key, path,
7375 0, 0);
7376 BUG_ON(ret);
7377 extent_buffer_get(path->nodes[0]);
7378 eb = path->nodes[0];
7379 btrfs_release_path(reloc_root, path);
7380 ret = invalidate_extent_cache(reloc_root, eb, group, root);
7381 BUG_ON(ret);
7382 free_extent_buffer(eb);
7383 }
7384
7385 mutex_unlock(&root->fs_info->tree_reloc_mutex);
7386 path->lowest_level = 0;
7387 return 0;
7388}
7389
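The block-sharing behaviour described in the comment above relocate_one_path() can be modelled in a few lines. Below is a minimal userspace sketch, not kernel code: the lookup table, the names and the toy allocator are all invented stand-ins for the bookkeeping the reloc trees do implicitly. Once a block has been COWed through one reloc tree, every later request for the same old block reuses the new copy.

/*
 * Toy model: map old block addresses to their relocated copies so a
 * second COW request for the same block is satisfied by sharing.
 */
#include <stdio.h>
#include <stdint.h>

#define MAX_BLOCKS 16

static uint64_t old_blk[MAX_BLOCKS];	/* old block address    */
static uint64_t new_blk[MAX_BLOCKS];	/* relocated copy       */
static int nr_relocated;
static uint64_t next_free = 1000;	/* toy allocator cursor */

static uint64_t cow_through_reloc_tree(uint64_t old)
{
	int i;

	/* already relocated via another reloc tree? share the copy */
	for (i = 0; i < nr_relocated; i++)
		if (old_blk[i] == old)
			return new_blk[i];

	/* first COW: pick a new location and remember the mapping */
	old_blk[nr_relocated] = old;
	new_blk[nr_relocated] = next_free++;
	return new_blk[nr_relocated++];
}

int main(void)
{
	/* two subvols referencing the same old block 42 get one copy */
	printf("subvol A sees %llu\n",
	       (unsigned long long)cow_through_reloc_tree(42));
	printf("subvol B sees %llu\n",
	       (unsigned long long)cow_through_reloc_tree(42));
	return 0;
}
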
7390static noinline int relocate_tree_block(struct btrfs_trans_handle *trans,
7391 struct btrfs_root *root,
7392 struct btrfs_path *path,
7393 struct btrfs_key *first_key,
7394 struct btrfs_ref_path *ref_path)
7395{
7396 int ret;
7397
7398 ret = relocate_one_path(trans, root, path, first_key,
7399 ref_path, NULL, NULL);
7400 BUG_ON(ret);
7401
7402 return 0;
7403}
7404
7405static noinline int del_extent_zero(struct btrfs_trans_handle *trans,
7406 struct btrfs_root *extent_root,
7407 struct btrfs_path *path,
7408 struct btrfs_key *extent_key)
7409{
7410 int ret;
7411
7412 ret = btrfs_search_slot(trans, extent_root, extent_key, path, -1, 1);
7413 if (ret)
7414 goto out;
7415 ret = btrfs_del_item(trans, extent_root, path);
7416out:
7417 btrfs_release_path(extent_root, path);
7418 return ret;
7419}
7420
7421static noinline struct btrfs_root *read_ref_root(struct btrfs_fs_info *fs_info,
7422 struct btrfs_ref_path *ref_path)
7423{
7424 struct btrfs_key root_key;
7425
7426 root_key.objectid = ref_path->root_objectid;
7427 root_key.type = BTRFS_ROOT_ITEM_KEY;
7428 if (is_cowonly_root(ref_path->root_objectid))
7429 root_key.offset = 0;
7430 else
7431 root_key.offset = (u64)-1;
7432
7433 return btrfs_read_fs_root_no_name(fs_info, &root_key);
7434}
7435
7436static noinline int relocate_one_extent(struct btrfs_root *extent_root,
7437 struct btrfs_path *path,
7438 struct btrfs_key *extent_key,
7439 struct btrfs_block_group_cache *group,
7440 struct inode *reloc_inode, int pass)
7441{
7442 struct btrfs_trans_handle *trans;
7443 struct btrfs_root *found_root;
7444 struct btrfs_ref_path *ref_path = NULL;
7445 struct disk_extent *new_extents = NULL;
7446 int nr_extents = 0;
7447 int loops;
7448 int ret;
7449 int level;
7450 struct btrfs_key first_key;
7451 u64 prev_block = 0;
7452
7453
7454 trans = btrfs_start_transaction(extent_root, 1);
7455 BUG_ON(!trans);
7456
7457 if (extent_key->objectid == 0) {
7458 ret = del_extent_zero(trans, extent_root, path, extent_key);
7459 goto out;
7460 }
7461
7462 ref_path = kmalloc(sizeof(*ref_path), GFP_NOFS);
7463 if (!ref_path) {
7464 ret = -ENOMEM;
7465 goto out;
7466 }
7467
7468 for (loops = 0; ; loops++) {
7469 if (loops == 0) {
7470 ret = btrfs_first_ref_path(trans, extent_root, ref_path,
7471 extent_key->objectid);
7472 } else {
7473 ret = btrfs_next_ref_path(trans, extent_root, ref_path);
7474 }
7475 if (ret < 0)
7476 goto out;
7477 if (ret > 0)
7478 break;
7479
7480 if (ref_path->root_objectid == BTRFS_TREE_LOG_OBJECTID ||
7481 ref_path->root_objectid == BTRFS_TREE_RELOC_OBJECTID)
7482 continue;
7483
7484 found_root = read_ref_root(extent_root->fs_info, ref_path);
7485 BUG_ON(!found_root);
7486 /*
7487 * for a reference counted tree, only process reference paths
7488 * rooted at the latest committed root.
7489 */
7490 if (found_root->ref_cows &&
7491 ref_path->root_generation != found_root->root_key.offset)
7492 continue;
7493
7494 if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
7495 if (pass == 0) {
7496 /*
7497 * copy data extents to new locations
7498 */
7499 u64 group_start = group->key.objectid;
7500 ret = relocate_data_extent(reloc_inode,
7501 extent_key,
7502 group_start);
7503 if (ret < 0)
7504 goto out;
7505 break;
7506 }
7507 level = 0;
7508 } else {
7509 level = ref_path->owner_objectid;
7510 }
7511
7512 if (prev_block != ref_path->nodes[level]) {
7513 struct extent_buffer *eb;
7514 u64 block_start = ref_path->nodes[level];
7515 u64 block_size = btrfs_level_size(found_root, level);
7516
7517 eb = read_tree_block(found_root, block_start,
7518 block_size, 0);
7519 btrfs_tree_lock(eb);
7520 BUG_ON(level != btrfs_header_level(eb));
7521
7522 if (level == 0)
7523 btrfs_item_key_to_cpu(eb, &first_key, 0);
7524 else
7525 btrfs_node_key_to_cpu(eb, &first_key, 0);
7526
7527 btrfs_tree_unlock(eb);
7528 free_extent_buffer(eb);
7529 prev_block = block_start;
7530 }
7531
7532 mutex_lock(&extent_root->fs_info->trans_mutex);
7533 btrfs_record_root_in_trans(found_root);
7534 mutex_unlock(&extent_root->fs_info->trans_mutex);
7535 if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
7536 /*
7537 * try to update data extent references while
7538 * keeping metadata shared between snapshots.
7539 */
7540 if (pass == 1) {
7541 ret = relocate_one_path(trans, found_root,
7542 path, &first_key, ref_path,
7543 group, reloc_inode);
7544 if (ret < 0)
7545 goto out;
7546 continue;
7547 }
7548 /*
7549 * use the fallback method to process the remaining
7550 * references.
7551 */
7552 if (!new_extents) {
7553 u64 group_start = group->key.objectid;
7554 new_extents = kmalloc(sizeof(*new_extents),
7555 GFP_NOFS);
7556 nr_extents = 1;
7557 ret = get_new_locations(reloc_inode,
7558 extent_key,
7559 group_start, 1,
7560 &new_extents,
7561 &nr_extents);
7562 if (ret)
7563 goto out;
7564 }
7565 ret = replace_one_extent(trans, found_root,
7566 path, extent_key,
7567 &first_key, ref_path,
7568 new_extents, nr_extents);
7569 } else {
7570 ret = relocate_tree_block(trans, found_root, path,
7571 &first_key, ref_path);
7572 }
7573 if (ret < 0)
7574 goto out;
7575 }
7576 ret = 0;
7577out:
7578 btrfs_end_transaction(trans, extent_root);
7579 kfree(new_extents);
7580 kfree(ref_path);
7581 return ret;
7582}
7583#endif
7584
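relocate_one_extent() above drives the work in passes: on pass 0 only file data is copied into the new block group (relocate_data_extent()), on pass 1 the remaining references are rewritten through relocate_one_path(), and whatever cannot be handled that way falls back to get_new_locations()/replace_one_extent(). A rough userspace model of that dispatch, with invented names and no error handling:

#include <stdio.h>

enum ref_kind { REF_DATA, REF_METADATA };

static void relocate_ref(enum ref_kind kind, int pass)
{
	if (kind == REF_DATA && pass == 0) {
		printf("pass 0: copy data extent to the new block group\n");
		return;
	}
	if (kind == REF_DATA) {
		printf("pass 1: rewrite the data extent reference\n");
		return;
	}
	printf("pass %d: relocate the tree block holding the ref\n", pass);
}

int main(void)
{
	int pass;

	for (pass = 0; pass < 2; pass++) {
		relocate_ref(REF_DATA, pass);
		relocate_ref(REF_METADATA, pass);
	}
	return 0;
}
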
7585static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) 6482static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
7586{ 6483{
7587 u64 num_devices; 6484 u64 num_devices;
7588 u64 stripped = BTRFS_BLOCK_GROUP_RAID0 | 6485 u64 stripped = BTRFS_BLOCK_GROUP_RAID0 |
7589 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; 6486 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
7590 6487
7591 num_devices = root->fs_info->fs_devices->rw_devices; 6488 /*
6489 * we add in the count of missing devices because we want
6490 * to make sure that any RAID levels on a degraded FS
6491 * continue to be honored.
6492 */
6493 num_devices = root->fs_info->fs_devices->rw_devices +
6494 root->fs_info->fs_devices->missing_devices;
6495
7592 if (num_devices == 1) { 6496 if (num_devices == 1) {
7593 stripped |= BTRFS_BLOCK_GROUP_DUP; 6497 stripped |= BTRFS_BLOCK_GROUP_DUP;
7594 stripped = flags & ~stripped; 6498 stripped = flags & ~stripped;
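
The hunk above folds missing_devices into the device count. A tiny standalone example with made-up numbers shows why: on a two-device RAID1 array that lost one device, counting only rw_devices would yield 1, and the profile reduction below would silently treat the filesystem as single-device DUP.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t rw_devices = 1;	/* still present and writable */
	uint64_t missing_devices = 1;	/* dropped out of the array   */
	uint64_t num_devices = rw_devices + missing_devices;

	printf("effective devices: %llu (%s)\n",
	       (unsigned long long)num_devices,
	       num_devices == 1 ? "RAID1 would degrade to DUP"
				: "RAID1 honored");
	return 0;
}
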
@@ -7636,13 +6540,14 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache)
7636 6540
7637 if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned + 6541 if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
7638 sinfo->bytes_may_use + sinfo->bytes_readonly + 6542 sinfo->bytes_may_use + sinfo->bytes_readonly +
7639 cache->reserved_pinned + num_bytes < sinfo->total_bytes) { 6543 cache->reserved_pinned + num_bytes <= sinfo->total_bytes) {
7640 sinfo->bytes_readonly += num_bytes; 6544 sinfo->bytes_readonly += num_bytes;
7641 sinfo->bytes_reserved += cache->reserved_pinned; 6545 sinfo->bytes_reserved += cache->reserved_pinned;
7642 cache->reserved_pinned = 0; 6546 cache->reserved_pinned = 0;
7643 cache->ro = 1; 6547 cache->ro = 1;
7644 ret = 0; 6548 ret = 0;
7645 } 6549 }
6550
7646 spin_unlock(&cache->lock); 6551 spin_unlock(&cache->lock);
7647 spin_unlock(&sinfo->lock); 6552 spin_unlock(&sinfo->lock);
7648 return ret; 6553 return ret;
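
The comparison fix above ('<' becomes '<=') matters exactly at the boundary where the read-only reservation consumes the last free byte of the space info. A self-contained sketch with invented sizes:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t total_bytes = 1024;	/* sinfo->total_bytes         */
	uint64_t committed   = 768;	/* used+reserved+pinned+...   */
	uint64_t num_bytes   = 256;	/* cost of flipping read-only */

	printf("strict '<'  : %s\n",
	       committed + num_bytes <  total_bytes ? "ro ok" : "refused");
	printf("relaxed '<=': %s\n",
	       committed + num_bytes <= total_bytes ? "ro ok" : "refused");
	return 0;
}
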
@@ -7658,18 +6563,20 @@ int btrfs_set_block_group_ro(struct btrfs_root *root,
7658 6563
7659 BUG_ON(cache->ro); 6564 BUG_ON(cache->ro);
7660 6565
7661 trans = btrfs_join_transaction(root, 1); 6566 trans = btrfs_join_transaction(root);
7662 BUG_ON(IS_ERR(trans)); 6567 BUG_ON(IS_ERR(trans));
7663 6568
7664 alloc_flags = update_block_group_flags(root, cache->flags); 6569 alloc_flags = update_block_group_flags(root, cache->flags);
7665 if (alloc_flags != cache->flags) 6570 if (alloc_flags != cache->flags)
7666 do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1); 6571 do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
6572 CHUNK_ALLOC_FORCE);
7667 6573
7668 ret = set_block_group_ro(cache); 6574 ret = set_block_group_ro(cache);
7669 if (!ret) 6575 if (!ret)
7670 goto out; 6576 goto out;
7671 alloc_flags = get_alloc_profile(root, cache->space_info->flags); 6577 alloc_flags = get_alloc_profile(root, cache->space_info->flags);
7672 ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags, 1); 6578 ret = do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
6579 CHUNK_ALLOC_FORCE);
7673 if (ret < 0) 6580 if (ret < 0)
7674 goto out; 6581 goto out;
7675 ret = set_block_group_ro(cache); 6582 ret = set_block_group_ro(cache);
@@ -7678,6 +6585,70 @@ out:
7678 return ret; 6585 return ret;
7679} 6586}
7680 6587
6588int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
6589 struct btrfs_root *root, u64 type)
6590{
6591 u64 alloc_flags = get_alloc_profile(root, type);
6592 return do_chunk_alloc(trans, root, 2 * 1024 * 1024, alloc_flags,
6593 CHUNK_ALLOC_FORCE);
6594}
6595
6596/*
6597 * helper to account the unused space of all the readonly block groups in
6598 * the list. Takes mirrors into account.
6599 */
6600static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list)
6601{
6602 struct btrfs_block_group_cache *block_group;
6603 u64 free_bytes = 0;
6604 int factor;
6605
6606 list_for_each_entry(block_group, groups_list, list) {
6607 spin_lock(&block_group->lock);
6608
6609 if (!block_group->ro) {
6610 spin_unlock(&block_group->lock);
6611 continue;
6612 }
6613
6614 if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 |
6615 BTRFS_BLOCK_GROUP_RAID10 |
6616 BTRFS_BLOCK_GROUP_DUP))
6617 factor = 2;
6618 else
6619 factor = 1;
6620
6621 free_bytes += (block_group->key.offset -
6622 btrfs_block_group_used(&block_group->item)) *
6623 factor;
6624
6625 spin_unlock(&block_group->lock);
6626 }
6627
6628 return free_bytes;
6629}
6630
6631/*
6632 * helper to account the unused space of all the readonly block groups in
6633 * the space_info. Takes mirrors into account.
6634 */
6635u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
6636{
6637 int i;
6638 u64 free_bytes = 0;
6639
6640 spin_lock(&sinfo->lock);
6641
6642 for(i = 0; i < BTRFS_NR_RAID_TYPES; i++)
6643 if (!list_empty(&sinfo->block_groups[i]))
6644 free_bytes += __btrfs_get_ro_block_group_free_space(
6645 &sinfo->block_groups[i]);
6646
6647 spin_unlock(&sinfo->lock);
6648
6649 return free_bytes;
6650}
6651
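As a worked example of the factor applied above (sizes invented): a 1 GiB read-only RAID1 block group with 256 MiB used reports 1.5 GiB of raw free space, because every logical byte occupies two bytes on disk.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t group_size = 1024 * 1024 * 1024ULL;	/* key.offset  */
	uint64_t used = 256 * 1024 * 1024ULL;		/* bytes used  */
	int factor = 2;			/* RAID1/RAID10/DUP mirror cost */

	printf("raw free bytes: %llu\n",
	       (unsigned long long)((group_size - used) * factor));
	return 0;
}
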
7681int btrfs_set_block_group_rw(struct btrfs_root *root, 6652int btrfs_set_block_group_rw(struct btrfs_root *root,
7682 struct btrfs_block_group_cache *cache) 6653 struct btrfs_block_group_cache *cache)
7683{ 6654{
@@ -7758,7 +6729,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
7758 mutex_lock(&root->fs_info->chunk_mutex); 6729 mutex_lock(&root->fs_info->chunk_mutex);
7759 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { 6730 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
7760 u64 min_free = btrfs_block_group_used(&block_group->item); 6731 u64 min_free = btrfs_block_group_used(&block_group->item);
7761 u64 dev_offset, max_avail; 6732 u64 dev_offset;
7762 6733
7763 /* 6734 /*
7764 * check to make sure we can actually find a chunk with enough 6735 * check to make sure we can actually find a chunk with enough
@@ -7766,7 +6737,7 @@ int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
7766 */ 6737 */
7767 if (device->total_bytes > device->bytes_used + min_free) { 6738 if (device->total_bytes > device->bytes_used + min_free) {
7768 ret = find_free_dev_extent(NULL, device, min_free, 6739 ret = find_free_dev_extent(NULL, device, min_free,
7769 &dev_offset, &max_avail); 6740 &dev_offset, NULL);
7770 if (!ret) 6741 if (!ret)
7771 break; 6742 break;
7772 ret = -1; 6743 ret = -1;
@@ -7814,6 +6785,40 @@ out:
7814 return ret; 6785 return ret;
7815} 6786}
7816 6787
6788void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
6789{
6790 struct btrfs_block_group_cache *block_group;
6791 u64 last = 0;
6792
6793 while (1) {
6794 struct inode *inode;
6795
6796 block_group = btrfs_lookup_first_block_group(info, last);
6797 while (block_group) {
6798 spin_lock(&block_group->lock);
6799 if (block_group->iref)
6800 break;
6801 spin_unlock(&block_group->lock);
6802 block_group = next_block_group(info->tree_root,
6803 block_group);
6804 }
6805 if (!block_group) {
6806 if (last == 0)
6807 break;
6808 last = 0;
6809 continue;
6810 }
6811
6812 inode = block_group->inode;
6813 block_group->iref = 0;
6814 block_group->inode = NULL;
6815 spin_unlock(&block_group->lock);
6816 iput(inode);
6817 last = block_group->key.objectid + block_group->key.offset;
6818 btrfs_put_block_group(block_group);
6819 }
6820}
6821
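btrfs_put_block_group_cache() above scans with a resumable cursor and wraps around once it falls off the end. Termination relies on every visited group dropping its iref, so a full pass that starts from zero and finds nothing left can break out. A toy model of the same loop shape (the flags array and lookup helper are invented):

#include <stdio.h>

static int pending[4] = { 1, 1, 1, 1 };	/* toy "iref still held" flags */

static int lookup_first_from(int cursor)
{
	int i;

	for (i = cursor; i < 4; i++)
		if (pending[i])
			return i;
	return -1;
}

int main(void)
{
	int last = 2;	/* start mid-way to exercise the wrap-around */
	int idx;

	while (1) {
		idx = lookup_first_from(last);
		if (idx < 0) {
			if (last == 0)
				break;	/* full pass found nothing: done */
			last = 0;	/* wrap around and rescan        */
			continue;
		}
		printf("drop inode ref of block group %d\n", idx);
		pending[idx] = 0;
		last = idx + 1;
	}
	return 0;
}
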
7817int btrfs_free_block_groups(struct btrfs_fs_info *info) 6822int btrfs_free_block_groups(struct btrfs_fs_info *info)
7818{ 6823{
7819 struct btrfs_block_group_cache *block_group; 6824 struct btrfs_block_group_cache *block_group;
@@ -7845,6 +6850,13 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
7845 if (block_group->cached == BTRFS_CACHE_STARTED) 6850 if (block_group->cached == BTRFS_CACHE_STARTED)
7846 wait_block_group_cache_done(block_group); 6851 wait_block_group_cache_done(block_group);
7847 6852
6853 /*
6854 * We haven't cached this block group, which means we could
6855 * possibly have excluded extents on this block group.
6856 */
6857 if (block_group->cached == BTRFS_CACHE_NO)
6858 free_excluded_extents(info->extent_root, block_group);
6859
7848 btrfs_remove_free_space_cache(block_group); 6860 btrfs_remove_free_space_cache(block_group);
7849 btrfs_put_block_group(block_group); 6861 btrfs_put_block_group(block_group);
7850 6862
@@ -7897,6 +6909,8 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7897 struct btrfs_key key; 6909 struct btrfs_key key;
7898 struct btrfs_key found_key; 6910 struct btrfs_key found_key;
7899 struct extent_buffer *leaf; 6911 struct extent_buffer *leaf;
6912 int need_clear = 0;
6913 u64 cache_gen;
7900 6914
7901 root = info->extent_root; 6915 root = info->extent_root;
7902 key.objectid = 0; 6916 key.objectid = 0;
@@ -7905,6 +6919,16 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7905 path = btrfs_alloc_path(); 6919 path = btrfs_alloc_path();
7906 if (!path) 6920 if (!path)
7907 return -ENOMEM; 6921 return -ENOMEM;
6922 path->reada = 1;
6923
6924 cache_gen = btrfs_super_cache_generation(&root->fs_info->super_copy);
6925 if (cache_gen != 0 &&
6926 btrfs_super_generation(&root->fs_info->super_copy) != cache_gen)
6927 need_clear = 1;
6928 if (btrfs_test_opt(root, CLEAR_CACHE))
6929 need_clear = 1;
6930 if (!btrfs_test_opt(root, SPACE_CACHE) && cache_gen)
6931 printk(KERN_INFO "btrfs: disk space caching is enabled\n");
7908 6932
7909 while (1) { 6933 while (1) {
7910 ret = find_first_block_group(root, path, &key); 6934 ret = find_first_block_group(root, path, &key);
@@ -7912,7 +6936,6 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7912 break; 6936 break;
7913 if (ret != 0) 6937 if (ret != 0)
7914 goto error; 6938 goto error;
7915
7916 leaf = path->nodes[0]; 6939 leaf = path->nodes[0];
7917 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 6940 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
7918 cache = kzalloc(sizeof(*cache), GFP_NOFS); 6941 cache = kzalloc(sizeof(*cache), GFP_NOFS);
@@ -7920,21 +6943,22 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7920 ret = -ENOMEM; 6943 ret = -ENOMEM;
7921 goto error; 6944 goto error;
7922 } 6945 }
6946 cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
6947 GFP_NOFS);
6948 if (!cache->free_space_ctl) {
6949 kfree(cache);
6950 ret = -ENOMEM;
6951 goto error;
6952 }
7923 6953
7924 atomic_set(&cache->count, 1); 6954 atomic_set(&cache->count, 1);
7925 spin_lock_init(&cache->lock); 6955 spin_lock_init(&cache->lock);
7926 spin_lock_init(&cache->tree_lock);
7927 cache->fs_info = info; 6956 cache->fs_info = info;
7928 INIT_LIST_HEAD(&cache->list); 6957 INIT_LIST_HEAD(&cache->list);
7929 INIT_LIST_HEAD(&cache->cluster_list); 6958 INIT_LIST_HEAD(&cache->cluster_list);
7930 6959
7931 /* 6960 if (need_clear)
7932 * we only want to have 32k of ram per block group for keeping 6961 cache->disk_cache_state = BTRFS_DC_CLEAR;
7933 * track of free space, and if we pass 1/2 of that we want to
7934 * start converting things over to using bitmaps
7935 */
7936 cache->extents_thresh = ((1024 * 32) / 2) /
7937 sizeof(struct btrfs_free_space);
7938 6962
7939 read_extent_buffer(leaf, &cache->item, 6963 read_extent_buffer(leaf, &cache->item,
7940 btrfs_item_ptr_offset(leaf, path->slots[0]), 6964 btrfs_item_ptr_offset(leaf, path->slots[0]),
@@ -7942,10 +6966,19 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7942 memcpy(&cache->key, &found_key, sizeof(found_key)); 6966 memcpy(&cache->key, &found_key, sizeof(found_key));
7943 6967
7944 key.objectid = found_key.objectid + found_key.offset; 6968 key.objectid = found_key.objectid + found_key.offset;
7945 btrfs_release_path(root, path); 6969 btrfs_release_path(path);
7946 cache->flags = btrfs_block_group_flags(&cache->item); 6970 cache->flags = btrfs_block_group_flags(&cache->item);
7947 cache->sectorsize = root->sectorsize; 6971 cache->sectorsize = root->sectorsize;
7948 6972
6973 btrfs_init_free_space_ctl(cache);
6974
6975 /*
6976 * We need to exclude the super stripes now so that the space
6977 * info has super bytes accounted for, otherwise we'll think
6978 * we have more space than we actually do.
6979 */
6980 exclude_super_stripes(root, cache);
6981
7949 /* 6982 /*
7950 * check for two cases, either we are full, and therefore 6983 * check for two cases, either we are full, and therefore
7951 * don't need to bother with the caching work since we won't 6984 * don't need to bother with the caching work since we won't
@@ -7954,12 +6987,10 @@ int btrfs_read_block_groups(struct btrfs_root *root)
7954 * time, particularly in the full case. 6987 * time, particularly in the full case.
7955 */ 6988 */
7956 if (found_key.offset == btrfs_block_group_used(&cache->item)) { 6989 if (found_key.offset == btrfs_block_group_used(&cache->item)) {
7957 exclude_super_stripes(root, cache);
7958 cache->last_byte_to_unpin = (u64)-1; 6990 cache->last_byte_to_unpin = (u64)-1;
7959 cache->cached = BTRFS_CACHE_FINISHED; 6991 cache->cached = BTRFS_CACHE_FINISHED;
7960 free_excluded_extents(root, cache); 6992 free_excluded_extents(root, cache);
7961 } else if (btrfs_block_group_used(&cache->item) == 0) { 6993 } else if (btrfs_block_group_used(&cache->item) == 0) {
7962 exclude_super_stripes(root, cache);
7963 cache->last_byte_to_unpin = (u64)-1; 6994 cache->last_byte_to_unpin = (u64)-1;
7964 cache->cached = BTRFS_CACHE_FINISHED; 6995 cache->cached = BTRFS_CACHE_FINISHED;
7965 add_new_free_space(cache, root->fs_info, 6996 add_new_free_space(cache, root->fs_info,
@@ -8027,25 +7058,26 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
8027 cache = kzalloc(sizeof(*cache), GFP_NOFS); 7058 cache = kzalloc(sizeof(*cache), GFP_NOFS);
8028 if (!cache) 7059 if (!cache)
8029 return -ENOMEM; 7060 return -ENOMEM;
7061 cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
7062 GFP_NOFS);
7063 if (!cache->free_space_ctl) {
7064 kfree(cache);
7065 return -ENOMEM;
7066 }
8030 7067
8031 cache->key.objectid = chunk_offset; 7068 cache->key.objectid = chunk_offset;
8032 cache->key.offset = size; 7069 cache->key.offset = size;
8033 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; 7070 cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
8034 cache->sectorsize = root->sectorsize; 7071 cache->sectorsize = root->sectorsize;
7072 cache->fs_info = root->fs_info;
8035 7073
8036 /*
8037 * we only want to have 32k of ram per block group for keeping track
8038 * of free space, and if we pass 1/2 of that we want to start
8039 * converting things over to using bitmaps
8040 */
8041 cache->extents_thresh = ((1024 * 32) / 2) /
8042 sizeof(struct btrfs_free_space);
8043 atomic_set(&cache->count, 1); 7074 atomic_set(&cache->count, 1);
8044 spin_lock_init(&cache->lock); 7075 spin_lock_init(&cache->lock);
8045 spin_lock_init(&cache->tree_lock);
8046 INIT_LIST_HEAD(&cache->list); 7076 INIT_LIST_HEAD(&cache->list);
8047 INIT_LIST_HEAD(&cache->cluster_list); 7077 INIT_LIST_HEAD(&cache->cluster_list);
8048 7078
7079 btrfs_init_free_space_ctl(cache);
7080
8049 btrfs_set_block_group_used(&cache->item, bytes_used); 7081 btrfs_set_block_group_used(&cache->item, bytes_used);
8050 btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid); 7082 btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
8051 cache->flags = type; 7083 cache->flags = type;
@@ -8088,8 +7120,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
8088 struct btrfs_path *path; 7120 struct btrfs_path *path;
8089 struct btrfs_block_group_cache *block_group; 7121 struct btrfs_block_group_cache *block_group;
8090 struct btrfs_free_cluster *cluster; 7122 struct btrfs_free_cluster *cluster;
7123 struct btrfs_root *tree_root = root->fs_info->tree_root;
8091 struct btrfs_key key; 7124 struct btrfs_key key;
7125 struct inode *inode;
8092 int ret; 7126 int ret;
7127 int factor;
8093 7128
8094 root = root->fs_info->extent_root; 7129 root = root->fs_info->extent_root;
8095 7130
@@ -8097,7 +7132,19 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
8097 BUG_ON(!block_group); 7132 BUG_ON(!block_group);
8098 BUG_ON(!block_group->ro); 7133 BUG_ON(!block_group->ro);
8099 7134
7135 /*
7136 * Free the reserved super bytes from this block group before
7137 * removing it.
7138 */
7139 free_excluded_extents(root, block_group);
7140
8100 memcpy(&key, &block_group->key, sizeof(key)); 7141 memcpy(&key, &block_group->key, sizeof(key));
7142 if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
7143 BTRFS_BLOCK_GROUP_RAID1 |
7144 BTRFS_BLOCK_GROUP_RAID10))
7145 factor = 2;
7146 else
7147 factor = 1;
8101 7148
8102 /* make sure this block group isn't part of an allocation cluster */ 7149 /* make sure this block group isn't part of an allocation cluster */
8103 cluster = &root->fs_info->data_alloc_cluster; 7150 cluster = &root->fs_info->data_alloc_cluster;
@@ -8117,6 +7164,40 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
8117 path = btrfs_alloc_path(); 7164 path = btrfs_alloc_path();
8118 BUG_ON(!path); 7165 BUG_ON(!path);
8119 7166
7167 inode = lookup_free_space_inode(root, block_group, path);
7168 if (!IS_ERR(inode)) {
7169 btrfs_orphan_add(trans, inode);
7170 clear_nlink(inode);
7171 /* One for the block groups ref */
7172 spin_lock(&block_group->lock);
7173 if (block_group->iref) {
7174 block_group->iref = 0;
7175 block_group->inode = NULL;
7176 spin_unlock(&block_group->lock);
7177 iput(inode);
7178 } else {
7179 spin_unlock(&block_group->lock);
7180 }
7181 /* One for our lookup ref */
7182 iput(inode);
7183 }
7184
7185 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
7186 key.offset = block_group->key.objectid;
7187 key.type = 0;
7188
7189 ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
7190 if (ret < 0)
7191 goto out;
7192 if (ret > 0)
7193 btrfs_release_path(path);
7194 if (ret == 0) {
7195 ret = btrfs_del_item(trans, tree_root, path);
7196 if (ret)
7197 goto out;
7198 btrfs_release_path(path);
7199 }
7200
8120 spin_lock(&root->fs_info->block_group_cache_lock); 7201 spin_lock(&root->fs_info->block_group_cache_lock);
8121 rb_erase(&block_group->cache_node, 7202 rb_erase(&block_group->cache_node,
8122 &root->fs_info->block_group_cache_tree); 7203 &root->fs_info->block_group_cache_tree);
@@ -8138,8 +7219,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
8138 spin_lock(&block_group->space_info->lock); 7219 spin_lock(&block_group->space_info->lock);
8139 block_group->space_info->total_bytes -= block_group->key.offset; 7220 block_group->space_info->total_bytes -= block_group->key.offset;
8140 block_group->space_info->bytes_readonly -= block_group->key.offset; 7221 block_group->space_info->bytes_readonly -= block_group->key.offset;
7222 block_group->space_info->disk_total -= block_group->key.offset * factor;
8141 spin_unlock(&block_group->space_info->lock); 7223 spin_unlock(&block_group->space_info->lock);
8142 7224
7225 memcpy(&key, &block_group->key, sizeof(key));
7226
8143 btrfs_clear_space_info_full(root->fs_info); 7227 btrfs_clear_space_info_full(root->fs_info);
8144 7228
8145 btrfs_put_block_group(block_group); 7229 btrfs_put_block_group(block_group);
@@ -8156,3 +7240,100 @@ out:
8156 btrfs_free_path(path); 7240 btrfs_free_path(path);
8157 return ret; 7241 return ret;
8158} 7242}
7243
7244int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
7245{
7246 struct btrfs_space_info *space_info;
7247 struct btrfs_super_block *disk_super;
7248 u64 features;
7249 u64 flags;
7250 int mixed = 0;
7251 int ret;
7252
7253 disk_super = &fs_info->super_copy;
7254 if (!btrfs_super_root(disk_super))
7255 return 1;
7256
7257 features = btrfs_super_incompat_flags(disk_super);
7258 if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
7259 mixed = 1;
7260
7261 flags = BTRFS_BLOCK_GROUP_SYSTEM;
7262 ret = update_space_info(fs_info, flags, 0, 0, &space_info);
7263 if (ret)
7264 goto out;
7265
7266 if (mixed) {
7267 flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
7268 ret = update_space_info(fs_info, flags, 0, 0, &space_info);
7269 } else {
7270 flags = BTRFS_BLOCK_GROUP_METADATA;
7271 ret = update_space_info(fs_info, flags, 0, 0, &space_info);
7272 if (ret)
7273 goto out;
7274
7275 flags = BTRFS_BLOCK_GROUP_DATA;
7276 ret = update_space_info(fs_info, flags, 0, 0, &space_info);
7277 }
7278out:
7279 return ret;
7280}
7281
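The control flow of btrfs_init_space_info() above reduces to one decision: a filesystem with the mixed incompat bit gets a single combined metadata+data space info, everything else gets the two separately, and SYSTEM is created in both cases. A trivial sketch of that branching, with printf standing in for update_space_info():

#include <stdio.h>

static void init_space_info(int mixed)
{
	/* the SYSTEM space info is always created first */
	printf("space info: SYSTEM\n");
	if (mixed) {
		printf("space info: METADATA|DATA (mixed)\n");
	} else {
		printf("space info: METADATA\n");
		printf("space info: DATA\n");
	}
}

int main(void)
{
	init_space_info(0);	/* regular filesystem                      */
	init_space_info(1);	/* BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS set */
	return 0;
}
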
7282int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
7283{
7284 return unpin_extent_range(root, start, end);
7285}
7286
7287int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr,
7288 u64 num_bytes, u64 *actual_bytes)
7289{
7290 return btrfs_discard_extent(root, bytenr, num_bytes, actual_bytes);
7291}
7292
7293int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
7294{
7295 struct btrfs_fs_info *fs_info = root->fs_info;
7296 struct btrfs_block_group_cache *cache = NULL;
7297 u64 group_trimmed;
7298 u64 start;
7299 u64 end;
7300 u64 trimmed = 0;
7301 int ret = 0;
7302
7303 cache = btrfs_lookup_block_group(fs_info, range->start);
7304
7305 while (cache) {
7306 if (cache->key.objectid >= (range->start + range->len)) {
7307 btrfs_put_block_group(cache);
7308 break;
7309 }
7310
7311 start = max(range->start, cache->key.objectid);
7312 end = min(range->start + range->len,
7313 cache->key.objectid + cache->key.offset);
7314
7315 if (end - start >= range->minlen) {
7316 if (!block_group_cache_done(cache)) {
7317 ret = cache_block_group(cache, NULL, root, 0);
7318 if (!ret)
7319 wait_block_group_cache_done(cache);
7320 }
7321 ret = btrfs_trim_block_group(cache,
7322 &group_trimmed,
7323 start,
7324 end,
7325 range->minlen);
7326
7327 trimmed += group_trimmed;
7328 if (ret) {
7329 btrfs_put_block_group(cache);
7330 break;
7331 }
7332 }
7333
7334 cache = next_block_group(fs_info->tree_root, cache);
7335 }
7336
7337 range->len = trimmed;
7338 return ret;
7339}
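
Each iteration of btrfs_trim_fs() above clamps the user-supplied window to the current block group before trimming, so only the overlap of [range->start, range->start + range->len) with the group is passed down. A standalone sketch of just that clamping arithmetic, with invented group boundaries:

#include <stdio.h>
#include <stdint.h>

static uint64_t max64(uint64_t a, uint64_t b) { return a > b ? a : b; }
static uint64_t min64(uint64_t a, uint64_t b) { return a < b ? a : b; }

int main(void)
{
	uint64_t range_start = 1500, range_len = 2000;	/* user window */
	/* toy block groups: { key.objectid, key.offset } */
	uint64_t groups[3][2] = { {0, 1024}, {1024, 2048}, {3072, 1024} };
	int i;

	for (i = 0; i < 3; i++) {
		uint64_t start = max64(range_start, groups[i][0]);
		uint64_t end = min64(range_start + range_len,
				     groups[i][0] + groups[i][1]);

		if (end > start)
			printf("group %d: trim [%llu, %llu)\n", i,
			       (unsigned long long)start,
			       (unsigned long long)end);
	}
	return 0;
}
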
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index d74e6af9b53a..7055d11c1efd 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -10,6 +10,8 @@
10#include <linux/swap.h> 10#include <linux/swap.h>
11#include <linux/writeback.h> 11#include <linux/writeback.h>
12#include <linux/pagevec.h> 12#include <linux/pagevec.h>
13#include <linux/prefetch.h>
14#include <linux/cleancache.h>
13#include "extent_io.h" 15#include "extent_io.h"
14#include "extent_map.h" 16#include "extent_map.h"
15#include "compat.h" 17#include "compat.h"
@@ -101,10 +103,10 @@ void extent_io_exit(void)
101} 103}
102 104
103void extent_io_tree_init(struct extent_io_tree *tree, 105void extent_io_tree_init(struct extent_io_tree *tree,
104 struct address_space *mapping, gfp_t mask) 106 struct address_space *mapping)
105{ 107{
106 tree->state = RB_ROOT; 108 tree->state = RB_ROOT;
107 tree->buffer = RB_ROOT; 109 INIT_RADIX_TREE(&tree->buffer, GFP_ATOMIC);
108 tree->ops = NULL; 110 tree->ops = NULL;
109 tree->dirty_bytes = 0; 111 tree->dirty_bytes = 0;
110 spin_lock_init(&tree->lock); 112 spin_lock_init(&tree->lock);
@@ -235,50 +237,6 @@ static inline struct rb_node *tree_search(struct extent_io_tree *tree,
235 return ret; 237 return ret;
236} 238}
237 239
238static struct extent_buffer *buffer_tree_insert(struct extent_io_tree *tree,
239 u64 offset, struct rb_node *node)
240{
241 struct rb_root *root = &tree->buffer;
242 struct rb_node **p = &root->rb_node;
243 struct rb_node *parent = NULL;
244 struct extent_buffer *eb;
245
246 while (*p) {
247 parent = *p;
248 eb = rb_entry(parent, struct extent_buffer, rb_node);
249
250 if (offset < eb->start)
251 p = &(*p)->rb_left;
252 else if (offset > eb->start)
253 p = &(*p)->rb_right;
254 else
255 return eb;
256 }
257
258 rb_link_node(node, parent, p);
259 rb_insert_color(node, root);
260 return NULL;
261}
262
263static struct extent_buffer *buffer_search(struct extent_io_tree *tree,
264 u64 offset)
265{
266 struct rb_root *root = &tree->buffer;
267 struct rb_node *n = root->rb_node;
268 struct extent_buffer *eb;
269
270 while (n) {
271 eb = rb_entry(n, struct extent_buffer, rb_node);
272 if (offset < eb->start)
273 n = n->rb_left;
274 else if (offset > eb->start)
275 n = n->rb_right;
276 else
277 return eb;
278 }
279 return NULL;
280}
281
282static void merge_cb(struct extent_io_tree *tree, struct extent_state *new, 240static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
283 struct extent_state *other) 241 struct extent_state *other)
284{ 242{
@@ -483,6 +441,15 @@ static int clear_state_bit(struct extent_io_tree *tree,
483 return ret; 441 return ret;
484} 442}
485 443
444static struct extent_state *
445alloc_extent_state_atomic(struct extent_state *prealloc)
446{
447 if (!prealloc)
448 prealloc = alloc_extent_state(GFP_ATOMIC);
449
450 return prealloc;
451}
452
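alloc_extent_state_atomic() above implements a preallocate-then-reuse pattern: callers carry a spare extent_state into the locked region and only fall back to a GFP_ATOMIC allocation when the spare has already been consumed. A userspace sketch of the calling pattern, with malloc standing in for the atomic allocation:

#include <stdio.h>
#include <stdlib.h>

struct state { int dummy; };

static struct state *alloc_state_atomic(struct state *prealloc)
{
	/* reuse the spare if we still have one, else allocate */
	if (!prealloc)
		prealloc = malloc(sizeof(*prealloc));
	return prealloc;
}

int main(void)
{
	struct state *prealloc = NULL;
	int i;

	for (i = 0; i < 3; i++) {
		prealloc = alloc_state_atomic(prealloc);
		if (!prealloc)
			return 1;	/* allocation failed */
		if (i == 1) {
			/* pretend this iteration consumed the spare */
			free(prealloc);
			prealloc = NULL;
		}
	}
	free(prealloc);
	return 0;
}
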
486/* 453/*
487 * clear some bits on a range in the tree. This may require splitting 454 * clear some bits on a range in the tree. This may require splitting
488 * or inserting elements in the tree, so the gfp mask is used to 455 * or inserting elements in the tree, so the gfp mask is used to
@@ -573,8 +540,8 @@ hit_next:
573 */ 540 */
574 541
575 if (state->start < start) { 542 if (state->start < start) {
576 if (!prealloc) 543 prealloc = alloc_extent_state_atomic(prealloc);
577 prealloc = alloc_extent_state(GFP_ATOMIC); 544 BUG_ON(!prealloc);
578 err = split_state(tree, state, prealloc, start); 545 err = split_state(tree, state, prealloc, start);
579 BUG_ON(err == -EEXIST); 546 BUG_ON(err == -EEXIST);
580 prealloc = NULL; 547 prealloc = NULL;
@@ -595,8 +562,8 @@ hit_next:
595 * on the first half 562 * on the first half
596 */ 563 */
597 if (state->start <= end && state->end > end) { 564 if (state->start <= end && state->end > end) {
598 if (!prealloc) 565 prealloc = alloc_extent_state_atomic(prealloc);
599 prealloc = alloc_extent_state(GFP_ATOMIC); 566 BUG_ON(!prealloc);
600 err = split_state(tree, state, prealloc, end + 1); 567 err = split_state(tree, state, prealloc, end + 1);
601 BUG_ON(err == -EEXIST); 568 BUG_ON(err == -EEXIST);
602 if (wake) 569 if (wake)
@@ -734,6 +701,15 @@ static void cache_state(struct extent_state *state,
734 } 701 }
735} 702}
736 703
704static void uncache_state(struct extent_state **cached_ptr)
705{
706 if (cached_ptr && (*cached_ptr)) {
707 struct extent_state *state = *cached_ptr;
708 *cached_ptr = NULL;
709 free_extent_state(state);
710 }
711}
712
737/* 713/*
738 * set some bits on a range in the tree. This may require allocations or 714 * set some bits on a range in the tree. This may require allocations or
739 * sleeping, so the gfp mask is used to indicate what is allowed. 715 * sleeping, so the gfp mask is used to indicate what is allowed.
@@ -760,8 +736,7 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
760again: 736again:
761 if (!prealloc && (mask & __GFP_WAIT)) { 737 if (!prealloc && (mask & __GFP_WAIT)) {
762 prealloc = alloc_extent_state(mask); 738 prealloc = alloc_extent_state(mask);
763 if (!prealloc) 739 BUG_ON(!prealloc);
764 return -ENOMEM;
765 } 740 }
766 741
767 spin_lock(&tree->lock); 742 spin_lock(&tree->lock);
@@ -778,6 +753,8 @@ again:
778 */ 753 */
779 node = tree_search(tree, start); 754 node = tree_search(tree, start);
780 if (!node) { 755 if (!node) {
756 prealloc = alloc_extent_state_atomic(prealloc);
757 BUG_ON(!prealloc);
781 err = insert_state(tree, prealloc, start, end, &bits); 758 err = insert_state(tree, prealloc, start, end, &bits);
782 prealloc = NULL; 759 prealloc = NULL;
783 BUG_ON(err == -EEXIST); 760 BUG_ON(err == -EEXIST);
@@ -806,20 +783,18 @@ hit_next:
806 if (err) 783 if (err)
807 goto out; 784 goto out;
808 785
786 next_node = rb_next(node);
809 cache_state(state, cached_state); 787 cache_state(state, cached_state);
810 merge_state(tree, state); 788 merge_state(tree, state);
811 if (last_end == (u64)-1) 789 if (last_end == (u64)-1)
812 goto out; 790 goto out;
813 791
814 start = last_end + 1; 792 start = last_end + 1;
815 if (start < end && prealloc && !need_resched()) { 793 if (next_node && start < end && prealloc && !need_resched()) {
816 next_node = rb_next(node); 794 state = rb_entry(next_node, struct extent_state,
817 if (next_node) { 795 rb_node);
818 state = rb_entry(next_node, struct extent_state, 796 if (state->start == start)
819 rb_node); 797 goto hit_next;
820 if (state->start == start)
821 goto hit_next;
822 }
823 } 798 }
824 goto search_again; 799 goto search_again;
825 } 800 }
@@ -846,6 +821,9 @@ hit_next:
846 err = -EEXIST; 821 err = -EEXIST;
847 goto out; 822 goto out;
848 } 823 }
824
825 prealloc = alloc_extent_state_atomic(prealloc);
826 BUG_ON(!prealloc);
849 err = split_state(tree, state, prealloc, start); 827 err = split_state(tree, state, prealloc, start);
850 BUG_ON(err == -EEXIST); 828 BUG_ON(err == -EEXIST);
851 prealloc = NULL; 829 prealloc = NULL;
@@ -876,14 +854,25 @@ hit_next:
876 this_end = end; 854 this_end = end;
877 else 855 else
878 this_end = last_start - 1; 856 this_end = last_start - 1;
857
858 prealloc = alloc_extent_state_atomic(prealloc);
859 BUG_ON(!prealloc);
860
861 /*
862 * Avoid freeing 'prealloc' if it can be merged with
863 * the later extent.
864 */
865 atomic_inc(&prealloc->refs);
879 err = insert_state(tree, prealloc, start, this_end, 866 err = insert_state(tree, prealloc, start, this_end,
880 &bits); 867 &bits);
881 BUG_ON(err == -EEXIST); 868 BUG_ON(err == -EEXIST);
882 if (err) { 869 if (err) {
870 free_extent_state(prealloc);
883 prealloc = NULL; 871 prealloc = NULL;
884 goto out; 872 goto out;
885 } 873 }
886 cache_state(prealloc, cached_state); 874 cache_state(prealloc, cached_state);
875 free_extent_state(prealloc);
887 prealloc = NULL; 876 prealloc = NULL;
888 start = this_end + 1; 877 start = this_end + 1;
889 goto search_again; 878 goto search_again;
@@ -900,6 +889,9 @@ hit_next:
900 err = -EEXIST; 889 err = -EEXIST;
901 goto out; 890 goto out;
902 } 891 }
892
893 prealloc = alloc_extent_state_atomic(prealloc);
894 BUG_ON(!prealloc);
903 err = split_state(tree, state, prealloc, end + 1); 895 err = split_state(tree, state, prealloc, end + 1);
904 BUG_ON(err == -EEXIST); 896 BUG_ON(err == -EEXIST);
905 897
@@ -976,18 +968,11 @@ int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
976 NULL, mask); 968 NULL, mask);
977} 969}
978 970
979static int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
980 gfp_t mask)
981{
982 return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0,
983 NULL, mask);
984}
985
986int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, 971int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
987 gfp_t mask) 972 struct extent_state **cached_state, gfp_t mask)
988{ 973{
989 return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL, 974 return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0,
990 NULL, mask); 975 NULL, cached_state, mask);
991} 976}
992 977
993static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, 978static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
@@ -998,11 +983,6 @@ static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
998 cached_state, mask); 983 cached_state, mask);
999} 984}
1000 985
1001int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end)
1002{
1003 return wait_extent_bit(tree, start, end, EXTENT_WRITEBACK);
1004}
1005
1006/* 986/*
1007 * either insert or lock the state struct between start and end; use mask to tell 987 * either insert or lock the state struct between start and end; use mask to tell
1008 * us if waiting is desired. 988 * us if waiting is desired.
@@ -1056,33 +1036,13 @@ int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
1056 mask); 1036 mask);
1057} 1037}
1058 1038
1059int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, 1039int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask)
1060 gfp_t mask)
1061{ 1040{
1062 return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL, 1041 return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL,
1063 mask); 1042 mask);
1064} 1043}
1065 1044
1066/* 1045/*
1067 * helper function to set pages and extents in the tree dirty
1068 */
1069int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end)
1070{
1071 unsigned long index = start >> PAGE_CACHE_SHIFT;
1072 unsigned long end_index = end >> PAGE_CACHE_SHIFT;
1073 struct page *page;
1074
1075 while (index <= end_index) {
1076 page = find_get_page(tree->mapping, index);
1077 BUG_ON(!page);
1078 __set_page_dirty_nobuffers(page);
1079 page_cache_release(page);
1080 index++;
1081 }
1082 return 0;
1083}
1084
1085/*
1086 * helper function to set both pages and extents in the tree writeback 1046 * helper function to set both pages and extents in the tree writeback
1087 */ 1047 */
1088static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) 1048static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
@@ -1477,12 +1437,13 @@ int extent_clear_unlock_delalloc(struct inode *inode,
1477 */ 1437 */
1478u64 count_range_bits(struct extent_io_tree *tree, 1438u64 count_range_bits(struct extent_io_tree *tree,
1479 u64 *start, u64 search_end, u64 max_bytes, 1439 u64 *start, u64 search_end, u64 max_bytes,
1480 unsigned long bits) 1440 unsigned long bits, int contig)
1481{ 1441{
1482 struct rb_node *node; 1442 struct rb_node *node;
1483 struct extent_state *state; 1443 struct extent_state *state;
1484 u64 cur_start = *start; 1444 u64 cur_start = *start;
1485 u64 total_bytes = 0; 1445 u64 total_bytes = 0;
1446 u64 last = 0;
1486 int found = 0; 1447 int found = 0;
1487 1448
1488 if (search_end <= cur_start) { 1449 if (search_end <= cur_start) {
@@ -1507,15 +1468,20 @@ u64 count_range_bits(struct extent_io_tree *tree,
1507 state = rb_entry(node, struct extent_state, rb_node); 1468 state = rb_entry(node, struct extent_state, rb_node);
1508 if (state->start > search_end) 1469 if (state->start > search_end)
1509 break; 1470 break;
1510 if (state->end >= cur_start && (state->state & bits)) { 1471 if (contig && found && state->start > last + 1)
1472 break;
1473 if (state->end >= cur_start && (state->state & bits) == bits) {
1511 total_bytes += min(search_end, state->end) + 1 - 1474 total_bytes += min(search_end, state->end) + 1 -
1512 max(cur_start, state->start); 1475 max(cur_start, state->start);
1513 if (total_bytes >= max_bytes) 1476 if (total_bytes >= max_bytes)
1514 break; 1477 break;
1515 if (!found) { 1478 if (!found) {
1516 *start = state->start; 1479 *start = max(cur_start, state->start);
1517 found = 1; 1480 found = 1;
1518 } 1481 }
1482 last = state->end;
1483 } else if (contig && found) {
1484 break;
1519 } 1485 }
1520 node = rb_next(node); 1486 node = rb_next(node);
1521 if (!node) 1487 if (!node)
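
The contig flag threaded through count_range_bits() above changes the accounting from "sum every matching range inside the window" to "stop at the first gap between matching ranges". A toy model over invented, sorted ranges:

#include <stdio.h>
#include <stdint.h>

struct range { uint64_t start, end; };	/* inclusive, sorted */

static uint64_t count_bytes(const struct range *r, int n, int contig)
{
	uint64_t total = 0, last = 0;
	int i, found = 0;

	for (i = 0; i < n; i++) {
		if (contig && found && r[i].start > last + 1)
			break;		/* gap: stop in contig mode */
		total += r[i].end + 1 - r[i].start;
		last = r[i].end;
		found = 1;
	}
	return total;
}

int main(void)
{
	struct range r[] = { {0, 99}, {100, 199}, {300, 399} };

	printf("all matches: %llu\n",
	       (unsigned long long)count_bytes(r, 3, 0));	/* 300 */
	printf("contiguous : %llu\n",
	       (unsigned long long)count_bytes(r, 3, 1));	/* 200 */
	return 0;
}
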
@@ -1773,6 +1739,9 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
1773 1739
1774 do { 1740 do {
1775 struct page *page = bvec->bv_page; 1741 struct page *page = bvec->bv_page;
1742 struct extent_state *cached = NULL;
1743 struct extent_state *state;
1744
1776 tree = &BTRFS_I(page->mapping->host)->io_tree; 1745 tree = &BTRFS_I(page->mapping->host)->io_tree;
1777 1746
1778 start = ((u64)page->index << PAGE_CACHE_SHIFT) + 1747 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
@@ -1787,9 +1756,20 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
1787 if (++bvec <= bvec_end) 1756 if (++bvec <= bvec_end)
1788 prefetchw(&bvec->bv_page->flags); 1757 prefetchw(&bvec->bv_page->flags);
1789 1758
1759 spin_lock(&tree->lock);
1760 state = find_first_extent_bit_state(tree, start, EXTENT_LOCKED);
1761 if (state && state->start == start) {
1762 /*
1763 * take a reference on the state; unlock will drop
1764 * the ref
1765 */
1766 cache_state(state, &cached);
1767 }
1768 spin_unlock(&tree->lock);
1769
1790 if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) { 1770 if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
1791 ret = tree->ops->readpage_end_io_hook(page, start, end, 1771 ret = tree->ops->readpage_end_io_hook(page, start, end,
1792 NULL); 1772 state);
1793 if (ret) 1773 if (ret)
1794 uptodate = 0; 1774 uptodate = 0;
1795 } 1775 }
@@ -1802,15 +1782,16 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
1802 test_bit(BIO_UPTODATE, &bio->bi_flags); 1782 test_bit(BIO_UPTODATE, &bio->bi_flags);
1803 if (err) 1783 if (err)
1804 uptodate = 0; 1784 uptodate = 0;
1785 uncache_state(&cached);
1805 continue; 1786 continue;
1806 } 1787 }
1807 } 1788 }
1808 1789
1809 if (uptodate) { 1790 if (uptodate) {
1810 set_extent_uptodate(tree, start, end, 1791 set_extent_uptodate(tree, start, end, &cached,
1811 GFP_ATOMIC); 1792 GFP_ATOMIC);
1812 } 1793 }
1813 unlock_extent(tree, start, end, GFP_ATOMIC); 1794 unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC);
1814 1795
1815 if (whole_page) { 1796 if (whole_page) {
1816 if (uptodate) { 1797 if (uptodate) {
@@ -1834,47 +1815,9 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
1834 bio_put(bio); 1815 bio_put(bio);
1835} 1816}
1836 1817
1837/* 1818struct bio *
1838 * IO done from prepare_write is pretty simple, we just unlock 1819btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
1839 * the structs in the extent tree when done, and set the uptodate bits 1820 gfp_t gfp_flags)
1840 * as appropriate.
1841 */
1842static void end_bio_extent_preparewrite(struct bio *bio, int err)
1843{
1844 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1845 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
1846 struct extent_io_tree *tree;
1847 u64 start;
1848 u64 end;
1849
1850 do {
1851 struct page *page = bvec->bv_page;
1852 tree = &BTRFS_I(page->mapping->host)->io_tree;
1853
1854 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
1855 bvec->bv_offset;
1856 end = start + bvec->bv_len - 1;
1857
1858 if (--bvec >= bio->bi_io_vec)
1859 prefetchw(&bvec->bv_page->flags);
1860
1861 if (uptodate) {
1862 set_extent_uptodate(tree, start, end, GFP_ATOMIC);
1863 } else {
1864 ClearPageUptodate(page);
1865 SetPageError(page);
1866 }
1867
1868 unlock_extent(tree, start, end, GFP_ATOMIC);
1869
1870 } while (bvec >= bio->bi_io_vec);
1871
1872 bio_put(bio);
1873}
1874
1875static struct bio *
1876extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
1877 gfp_t gfp_flags)
1878{ 1821{
1879 struct bio *bio; 1822 struct bio *bio;
1880 1823
@@ -1901,17 +1844,15 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
1901 struct page *page = bvec->bv_page; 1844 struct page *page = bvec->bv_page;
1902 struct extent_io_tree *tree = bio->bi_private; 1845 struct extent_io_tree *tree = bio->bi_private;
1903 u64 start; 1846 u64 start;
1904 u64 end;
1905 1847
1906 start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset; 1848 start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset;
1907 end = start + bvec->bv_len - 1;
1908 1849
1909 bio->bi_private = NULL; 1850 bio->bi_private = NULL;
1910 1851
1911 bio_get(bio); 1852 bio_get(bio);
1912 1853
1913 if (tree->ops && tree->ops->submit_bio_hook) 1854 if (tree->ops && tree->ops->submit_bio_hook)
1914 tree->ops->submit_bio_hook(page->mapping->host, rw, bio, 1855 ret = tree->ops->submit_bio_hook(page->mapping->host, rw, bio,
1915 mirror_num, bio_flags, start); 1856 mirror_num, bio_flags, start);
1916 else 1857 else
1917 submit_bio(rw, bio); 1858 submit_bio(rw, bio);
@@ -1965,7 +1906,9 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
1965 else 1906 else
1966 nr = bio_get_nr_vecs(bdev); 1907 nr = bio_get_nr_vecs(bdev);
1967 1908
1968 bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH); 1909 bio = btrfs_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH);
1910 if (!bio)
1911 return -ENOMEM;
1969 1912
1970 bio_add_page(bio, page, page_size, offset); 1913 bio_add_page(bio, page, page_size, offset);
1971 bio->bi_end_io = end_io_func; 1914 bio->bi_end_io = end_io_func;
@@ -1990,6 +1933,7 @@ void set_page_extent_mapped(struct page *page)
1990 1933
1991static void set_page_extent_head(struct page *page, unsigned long len) 1934static void set_page_extent_head(struct page *page, unsigned long len)
1992{ 1935{
1936 WARN_ON(!PagePrivate(page));
1993 set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2); 1937 set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2);
1994} 1938}
1995 1939
@@ -2019,7 +1963,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
2019 struct btrfs_ordered_extent *ordered; 1963 struct btrfs_ordered_extent *ordered;
2020 int ret; 1964 int ret;
2021 int nr = 0; 1965 int nr = 0;
2022 size_t page_offset = 0; 1966 size_t pg_offset = 0;
2023 size_t iosize; 1967 size_t iosize;
2024 size_t disk_io_size; 1968 size_t disk_io_size;
2025 size_t blocksize = inode->i_sb->s_blocksize; 1969 size_t blocksize = inode->i_sb->s_blocksize;
@@ -2027,6 +1971,13 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
2027 1971
2028 set_page_extent_mapped(page); 1972 set_page_extent_mapped(page);
2029 1973
1974 if (!PageUptodate(page)) {
1975 if (cleancache_get_page(page) == 0) {
1976 BUG_ON(blocksize != PAGE_SIZE);
1977 goto out;
1978 }
1979 }
1980
2030 end = page_end; 1981 end = page_end;
2031 while (1) { 1982 while (1) {
2032 lock_extent(tree, start, end, GFP_NOFS); 1983 lock_extent(tree, start, end, GFP_NOFS);
@@ -2053,19 +2004,22 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
2053 while (cur <= end) { 2004 while (cur <= end) {
2054 if (cur >= last_byte) { 2005 if (cur >= last_byte) {
2055 char *userpage; 2006 char *userpage;
2056 iosize = PAGE_CACHE_SIZE - page_offset; 2007 struct extent_state *cached = NULL;
2008
2009 iosize = PAGE_CACHE_SIZE - pg_offset;
2057 userpage = kmap_atomic(page, KM_USER0); 2010 userpage = kmap_atomic(page, KM_USER0);
2058 memset(userpage + page_offset, 0, iosize); 2011 memset(userpage + pg_offset, 0, iosize);
2059 flush_dcache_page(page); 2012 flush_dcache_page(page);
2060 kunmap_atomic(userpage, KM_USER0); 2013 kunmap_atomic(userpage, KM_USER0);
2061 set_extent_uptodate(tree, cur, cur + iosize - 1, 2014 set_extent_uptodate(tree, cur, cur + iosize - 1,
2062 GFP_NOFS); 2015 &cached, GFP_NOFS);
2063 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); 2016 unlock_extent_cached(tree, cur, cur + iosize - 1,
2017 &cached, GFP_NOFS);
2064 break; 2018 break;
2065 } 2019 }
2066 em = get_extent(inode, page, page_offset, cur, 2020 em = get_extent(inode, page, pg_offset, cur,
2067 end - cur + 1, 0); 2021 end - cur + 1, 0);
2068 if (IS_ERR(em) || !em) { 2022 if (IS_ERR_OR_NULL(em)) {
2069 SetPageError(page); 2023 SetPageError(page);
2070 unlock_extent(tree, cur, end, GFP_NOFS); 2024 unlock_extent(tree, cur, end, GFP_NOFS);
2071 break; 2025 break;
@@ -2074,8 +2028,11 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
2074 BUG_ON(extent_map_end(em) <= cur); 2028 BUG_ON(extent_map_end(em) <= cur);
2075 BUG_ON(end < cur); 2029 BUG_ON(end < cur);
2076 2030
2077 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) 2031 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
2078 this_bio_flag = EXTENT_BIO_COMPRESSED; 2032 this_bio_flag = EXTENT_BIO_COMPRESSED;
2033 extent_set_compress_type(&this_bio_flag,
2034 em->compress_type);
2035 }
2079 2036
2080 iosize = min(extent_map_end(em) - cur, end - cur + 1); 2037 iosize = min(extent_map_end(em) - cur, end - cur + 1);
2081 cur_end = min(extent_map_end(em) - 1, end); 2038 cur_end = min(extent_map_end(em) - 1, end);
@@ -2097,16 +2054,19 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
2097 /* we've found a hole, just zero and go on */ 2054 /* we've found a hole, just zero and go on */
2098 if (block_start == EXTENT_MAP_HOLE) { 2055 if (block_start == EXTENT_MAP_HOLE) {
2099 char *userpage; 2056 char *userpage;
2057 struct extent_state *cached = NULL;
2058
2100 userpage = kmap_atomic(page, KM_USER0); 2059 userpage = kmap_atomic(page, KM_USER0);
2101 memset(userpage + page_offset, 0, iosize); 2060 memset(userpage + pg_offset, 0, iosize);
2102 flush_dcache_page(page); 2061 flush_dcache_page(page);
2103 kunmap_atomic(userpage, KM_USER0); 2062 kunmap_atomic(userpage, KM_USER0);
2104 2063
2105 set_extent_uptodate(tree, cur, cur + iosize - 1, 2064 set_extent_uptodate(tree, cur, cur + iosize - 1,
2106 GFP_NOFS); 2065 &cached, GFP_NOFS);
2107 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); 2066 unlock_extent_cached(tree, cur, cur + iosize - 1,
2067 &cached, GFP_NOFS);
2108 cur = cur + iosize; 2068 cur = cur + iosize;
2109 page_offset += iosize; 2069 pg_offset += iosize;
2110 continue; 2070 continue;
2111 } 2071 }
2112 /* the get_extent function already copied into the page */ 2072 /* the get_extent function already copied into the page */
@@ -2115,7 +2075,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
2115 check_page_uptodate(tree, page); 2075 check_page_uptodate(tree, page);
2116 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); 2076 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
2117 cur = cur + iosize; 2077 cur = cur + iosize;
2118 page_offset += iosize; 2078 pg_offset += iosize;
2119 continue; 2079 continue;
2120 } 2080 }
2121 /* we have an inline extent but it didn't get marked up 2081 /* we have an inline extent but it didn't get marked up
@@ -2125,7 +2085,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
2125 SetPageError(page); 2085 SetPageError(page);
2126 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); 2086 unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS);
2127 cur = cur + iosize; 2087 cur = cur + iosize;
2128 page_offset += iosize; 2088 pg_offset += iosize;
2129 continue; 2089 continue;
2130 } 2090 }
2131 2091
@@ -2138,7 +2098,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
2138 unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1; 2098 unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
2139 pnr -= page->index; 2099 pnr -= page->index;
2140 ret = submit_extent_page(READ, tree, page, 2100 ret = submit_extent_page(READ, tree, page,
2141 sector, disk_io_size, page_offset, 2101 sector, disk_io_size, pg_offset,
2142 bdev, bio, pnr, 2102 bdev, bio, pnr,
2143 end_bio_extent_readpage, mirror_num, 2103 end_bio_extent_readpage, mirror_num,
2144 *bio_flags, 2104 *bio_flags,
@@ -2149,8 +2109,9 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
2149 if (ret) 2109 if (ret)
2150 SetPageError(page); 2110 SetPageError(page);
2151 cur = cur + iosize; 2111 cur = cur + iosize;
2152 page_offset += iosize; 2112 pg_offset += iosize;
2153 } 2113 }
2114out:
2154 if (!nr) { 2115 if (!nr) {
2155 if (!PageError(page)) 2116 if (!PageError(page))
2156 SetPageUptodate(page); 2117 SetPageUptodate(page);
@@ -2169,7 +2130,7 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
2169 ret = __extent_read_full_page(tree, page, get_extent, &bio, 0, 2130 ret = __extent_read_full_page(tree, page, get_extent, &bio, 0,
2170 &bio_flags); 2131 &bio_flags);
2171 if (bio) 2132 if (bio)
2172 submit_one_bio(READ, bio, 0, bio_flags); 2133 ret = submit_one_bio(READ, bio, 0, bio_flags);
2173 return ret; 2134 return ret;
2174} 2135}
2175 2136
@@ -2204,7 +2165,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2204 u64 last_byte = i_size_read(inode); 2165 u64 last_byte = i_size_read(inode);
2205 u64 block_start; 2166 u64 block_start;
2206 u64 iosize; 2167 u64 iosize;
2207 u64 unlock_start;
2208 sector_t sector; 2168 sector_t sector;
2209 struct extent_state *cached_state = NULL; 2169 struct extent_state *cached_state = NULL;
2210 struct extent_map *em; 2170 struct extent_map *em;
@@ -2223,10 +2183,12 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2223 unsigned long nr_written = 0; 2183 unsigned long nr_written = 0;
2224 2184
2225 if (wbc->sync_mode == WB_SYNC_ALL) 2185 if (wbc->sync_mode == WB_SYNC_ALL)
2226 write_flags = WRITE_SYNC_PLUG; 2186 write_flags = WRITE_SYNC;
2227 else 2187 else
2228 write_flags = WRITE; 2188 write_flags = WRITE;
2229 2189
2190 trace___extent_writepage(page, inode, wbc);
2191
2230 WARN_ON(!PageLocked(page)); 2192 WARN_ON(!PageLocked(page));
2231 pg_offset = i_size & (PAGE_CACHE_SIZE - 1); 2193 pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
2232 if (page->index > end_index || 2194 if (page->index > end_index ||
@@ -2329,7 +2291,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2329 if (tree->ops && tree->ops->writepage_end_io_hook) 2291 if (tree->ops && tree->ops->writepage_end_io_hook)
2330 tree->ops->writepage_end_io_hook(page, start, 2292 tree->ops->writepage_end_io_hook(page, start,
2331 page_end, NULL, 1); 2293 page_end, NULL, 1);
2332 unlock_start = page_end + 1;
2333 goto done; 2294 goto done;
2334 } 2295 }
2335 2296
@@ -2340,12 +2301,11 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2340 if (tree->ops && tree->ops->writepage_end_io_hook) 2301 if (tree->ops && tree->ops->writepage_end_io_hook)
2341 tree->ops->writepage_end_io_hook(page, cur, 2302 tree->ops->writepage_end_io_hook(page, cur,
2342 page_end, NULL, 1); 2303 page_end, NULL, 1);
2343 unlock_start = page_end + 1;
2344 break; 2304 break;
2345 } 2305 }
2346 em = epd->get_extent(inode, page, pg_offset, cur, 2306 em = epd->get_extent(inode, page, pg_offset, cur,
2347 end - cur + 1, 1); 2307 end - cur + 1, 1);
2348 if (IS_ERR(em) || !em) { 2308 if (IS_ERR_OR_NULL(em)) {
2349 SetPageError(page); 2309 SetPageError(page);
2350 break; 2310 break;
2351 } 2311 }
@@ -2387,7 +2347,6 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2387 2347
2388 cur += iosize; 2348 cur += iosize;
2389 pg_offset += iosize; 2349 pg_offset += iosize;
2390 unlock_start = cur;
2391 continue; 2350 continue;
2392 } 2351 }
2393 /* leave this out until we have a page_mkwrite call */ 2352 /* leave this out until we have a page_mkwrite call */
@@ -2473,7 +2432,6 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
2473 pgoff_t index; 2432 pgoff_t index;
2474 pgoff_t end; /* Inclusive */ 2433 pgoff_t end; /* Inclusive */
2475 int scanned = 0; 2434 int scanned = 0;
2476 int range_whole = 0;
2477 2435
2478 pagevec_init(&pvec, 0); 2436 pagevec_init(&pvec, 0);
2479 if (wbc->range_cyclic) { 2437 if (wbc->range_cyclic) {
@@ -2482,8 +2440,6 @@ static int extent_write_cache_pages(struct extent_io_tree *tree,
2482 } else { 2440 } else {
2483 index = wbc->range_start >> PAGE_CACHE_SHIFT; 2441 index = wbc->range_start >> PAGE_CACHE_SHIFT;
2484 end = wbc->range_end >> PAGE_CACHE_SHIFT; 2442 end = wbc->range_end >> PAGE_CACHE_SHIFT;
2485 if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
2486 range_whole = 1;
2487 scanned = 1; 2443 scanned = 1;
2488 } 2444 }
2489retry: 2445retry:
@@ -2689,7 +2645,7 @@ int extent_readpages(struct extent_io_tree *tree,
2689 prefetchw(&page->flags); 2645 prefetchw(&page->flags);
2690 list_del(&page->lru); 2646 list_del(&page->lru);
2691 if (!add_to_page_cache_lru(page, mapping, 2647 if (!add_to_page_cache_lru(page, mapping,
2692 page->index, GFP_KERNEL)) { 2648 page->index, GFP_NOFS)) {
2693 __extent_read_full_page(tree, page, get_extent, 2649 __extent_read_full_page(tree, page, get_extent,
2694 &bio, 0, &bio_flags); 2650 &bio, 0, &bio_flags);
2695 } 2651 }
@@ -2728,123 +2684,6 @@ int extent_invalidatepage(struct extent_io_tree *tree,
2728} 2684}
2729 2685
2730/* 2686/*
2731 * simple commit_write call, set_range_dirty is used to mark both
2732 * the pages and the extent records as dirty
2733 */
2734int extent_commit_write(struct extent_io_tree *tree,
2735 struct inode *inode, struct page *page,
2736 unsigned from, unsigned to)
2737{
2738 loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
2739
2740 set_page_extent_mapped(page);
2741 set_page_dirty(page);
2742
2743 if (pos > inode->i_size) {
2744 i_size_write(inode, pos);
2745 mark_inode_dirty(inode);
2746 }
2747 return 0;
2748}
2749
2750int extent_prepare_write(struct extent_io_tree *tree,
2751 struct inode *inode, struct page *page,
2752 unsigned from, unsigned to, get_extent_t *get_extent)
2753{
2754 u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
2755 u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
2756 u64 block_start;
2757 u64 orig_block_start;
2758 u64 block_end;
2759 u64 cur_end;
2760 struct extent_map *em;
2761 unsigned blocksize = 1 << inode->i_blkbits;
2762 size_t page_offset = 0;
2763 size_t block_off_start;
2764 size_t block_off_end;
2765 int err = 0;
2766 int iocount = 0;
2767 int ret = 0;
2768 int isnew;
2769
2770 set_page_extent_mapped(page);
2771
2772 block_start = (page_start + from) & ~((u64)blocksize - 1);
2773 block_end = (page_start + to - 1) | (blocksize - 1);
2774 orig_block_start = block_start;
2775
2776 lock_extent(tree, page_start, page_end, GFP_NOFS);
2777 while (block_start <= block_end) {
2778 em = get_extent(inode, page, page_offset, block_start,
2779 block_end - block_start + 1, 1);
2780 if (IS_ERR(em) || !em)
2781 goto err;
2782
2783 cur_end = min(block_end, extent_map_end(em) - 1);
2784 block_off_start = block_start & (PAGE_CACHE_SIZE - 1);
2785 block_off_end = block_off_start + blocksize;
2786 isnew = clear_extent_new(tree, block_start, cur_end, GFP_NOFS);
2787
2788 if (!PageUptodate(page) && isnew &&
2789 (block_off_end > to || block_off_start < from)) {
2790 void *kaddr;
2791
2792 kaddr = kmap_atomic(page, KM_USER0);
2793 if (block_off_end > to)
2794 memset(kaddr + to, 0, block_off_end - to);
2795 if (block_off_start < from)
2796 memset(kaddr + block_off_start, 0,
2797 from - block_off_start);
2798 flush_dcache_page(page);
2799 kunmap_atomic(kaddr, KM_USER0);
2800 }
2801 if ((em->block_start != EXTENT_MAP_HOLE &&
2802 em->block_start != EXTENT_MAP_INLINE) &&
2803 !isnew && !PageUptodate(page) &&
2804 (block_off_end > to || block_off_start < from) &&
2805 !test_range_bit(tree, block_start, cur_end,
2806 EXTENT_UPTODATE, 1, NULL)) {
2807 u64 sector;
2808 u64 extent_offset = block_start - em->start;
2809 size_t iosize;
2810 sector = (em->block_start + extent_offset) >> 9;
2811 iosize = (cur_end - block_start + blocksize) &
2812 ~((u64)blocksize - 1);
2813 /*
2814 * we've already got the extent locked, but we
2815 * need to split the state such that our end_bio
2816 * handler can clear the lock.
2817 */
2818 set_extent_bit(tree, block_start,
2819 block_start + iosize - 1,
2820 EXTENT_LOCKED, 0, NULL, NULL, GFP_NOFS);
2821 ret = submit_extent_page(READ, tree, page,
2822 sector, iosize, page_offset, em->bdev,
2823 NULL, 1,
2824 end_bio_extent_preparewrite, 0,
2825 0, 0);
2826 iocount++;
2827 block_start = block_start + iosize;
2828 } else {
2829 set_extent_uptodate(tree, block_start, cur_end,
2830 GFP_NOFS);
2831 unlock_extent(tree, block_start, cur_end, GFP_NOFS);
2832 block_start = cur_end + 1;
2833 }
2834 page_offset = block_start & (PAGE_CACHE_SIZE - 1);
2835 free_extent_map(em);
2836 }
2837 if (iocount) {
2838 wait_extent_bit(tree, orig_block_start,
2839 block_end, EXTENT_LOCKED);
2840 }
2841 check_page_uptodate(tree, page);
2842err:
2843 /* FIXME, zero out newly allocated blocks on error */
2844 return err;
2845}
2846
2847/*
2848 * a helper for releasepage, this tests for areas of the page that 2687 * a helper for releasepage, this tests for areas of the page that
2849 * are locked or under IO and drops the related state bits if it is safe 2688 * are locked or under IO and drops the related state bits if it is safe
2850 * to drop the page. 2689 * to drop the page.
@@ -2867,9 +2706,17 @@ int try_release_extent_state(struct extent_map_tree *map,
2867 * at this point we can safely clear everything except the 2706 * at this point we can safely clear everything except the
2868 * locked bit and the nodatasum bit 2707 * locked bit and the nodatasum bit
2869 */ 2708 */
2870 clear_extent_bit(tree, start, end, 2709 ret = clear_extent_bit(tree, start, end,
2871 ~(EXTENT_LOCKED | EXTENT_NODATASUM), 2710 ~(EXTENT_LOCKED | EXTENT_NODATASUM),
2872 0, 0, NULL, mask); 2711 0, 0, NULL, mask);
2712
2713 /* if clear_extent_bit failed with -ENOMEM,
2714 * we can't allow the release to continue.
2715 */
2716 if (ret < 0)
2717 ret = 0;
2718 else
2719 ret = 1;
2873 } 2720 }
2874 return ret; 2721 return ret;
2875} 2722}
@@ -2894,7 +2741,7 @@ int try_release_extent_mapping(struct extent_map_tree *map,
2894 len = end - start + 1; 2741 len = end - start + 1;
2895 write_lock(&map->lock); 2742 write_lock(&map->lock);
2896 em = lookup_extent_mapping(map, start, len); 2743 em = lookup_extent_mapping(map, start, len);
2897 if (!em || IS_ERR(em)) { 2744 if (IS_ERR_OR_NULL(em)) {
2898 write_unlock(&map->lock); 2745 write_unlock(&map->lock);
2899 break; 2746 break;
2900 } 2747 }
@@ -2922,76 +2769,169 @@ int try_release_extent_mapping(struct extent_map_tree *map,
2922 return try_release_extent_state(map, tree, page, mask); 2769 return try_release_extent_state(map, tree, page, mask);
2923} 2770}
2924 2771
2925sector_t extent_bmap(struct address_space *mapping, sector_t iblock, 2772/*
2926 get_extent_t *get_extent) 2773 * helper function for fiemap, which doesn't want to see any holes.
2774 * This maps until we find something past 'last'
2775 */
2776static struct extent_map *get_extent_skip_holes(struct inode *inode,
2777 u64 offset,
2778 u64 last,
2779 get_extent_t *get_extent)
2927{ 2780{
2928 struct inode *inode = mapping->host; 2781 u64 sectorsize = BTRFS_I(inode)->root->sectorsize;
2929 struct extent_state *cached_state = NULL;
2930 u64 start = iblock << inode->i_blkbits;
2931 sector_t sector = 0;
2932 size_t blksize = (1 << inode->i_blkbits);
2933 struct extent_map *em; 2782 struct extent_map *em;
2783 u64 len;
2934 2784
2935 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + blksize - 1, 2785 if (offset >= last)
2936 0, &cached_state, GFP_NOFS); 2786 return NULL;
2937 em = get_extent(inode, NULL, 0, start, blksize, 0);
2938 unlock_extent_cached(&BTRFS_I(inode)->io_tree, start,
2939 start + blksize - 1, &cached_state, GFP_NOFS);
2940 if (!em || IS_ERR(em))
2941 return 0;
2942 2787
2943 if (em->block_start > EXTENT_MAP_LAST_BYTE) 2788 while (1) {
2944 goto out; 2789 len = last - offset;
2790 if (len == 0)
2791 break;
2792 len = (len + sectorsize - 1) & ~(sectorsize - 1);
2793 em = get_extent(inode, NULL, 0, offset, len, 0);
2794 if (IS_ERR_OR_NULL(em))
2795 return em;
2945 2796
2946 sector = (em->block_start + start - em->start) >> inode->i_blkbits; 2797 /* if this isn't a hole return it */
2947out: 2798 if (!test_bit(EXTENT_FLAG_VACANCY, &em->flags) &&
2948 free_extent_map(em); 2799 em->block_start != EXTENT_MAP_HOLE) {
2949 return sector; 2800 return em;
2801 }
2802
2803 /* this is a hole, advance to the next extent */
2804 offset = extent_map_end(em);
2805 free_extent_map(em);
2806 if (offset >= last)
2807 break;
2808 }
2809 return NULL;
2950} 2810}
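
The rounding in get_extent_skip_holes() relies on sectorsize being a power
of two. A minimal standalone sketch of that idiom, with hypothetical names:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* round len up to the next multiple of sectorsize;
 * only valid when sectorsize is a power of two */
static uint64_t round_up_pow2(uint64_t len, uint64_t sectorsize)
{
        assert((sectorsize & (sectorsize - 1)) == 0);
        return (len + sectorsize - 1) & ~(sectorsize - 1);
}

int main(void)
{
        /* 4KiB sectors: 1 rounds up to 4096, 4096 stays put */
        printf("%llu\n", (unsigned long long)round_up_pow2(1, 4096));
        printf("%llu\n", (unsigned long long)round_up_pow2(4096, 4096));
        return 0;
}
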
2951 2811
2952int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 2812int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2953 __u64 start, __u64 len, get_extent_t *get_extent) 2813 __u64 start, __u64 len, get_extent_t *get_extent)
2954{ 2814{
2955 int ret; 2815 int ret = 0;
2956 u64 off = start; 2816 u64 off = start;
2957 u64 max = start + len; 2817 u64 max = start + len;
2958 u32 flags = 0; 2818 u32 flags = 0;
2819 u32 found_type;
2820 u64 last;
2821 u64 last_for_get_extent = 0;
2959 u64 disko = 0; 2822 u64 disko = 0;
2823 u64 isize = i_size_read(inode);
2824 struct btrfs_key found_key;
2960 struct extent_map *em = NULL; 2825 struct extent_map *em = NULL;
2961 struct extent_state *cached_state = NULL; 2826 struct extent_state *cached_state = NULL;
2827 struct btrfs_path *path;
2828 struct btrfs_file_extent_item *item;
2962 int end = 0; 2829 int end = 0;
2963 u64 em_start = 0, em_len = 0; 2830 u64 em_start = 0;
2831 u64 em_len = 0;
2832 u64 em_end = 0;
2964 unsigned long emflags; 2833 unsigned long emflags;
2965 ret = 0;
2966 2834
2967 if (len == 0) 2835 if (len == 0)
2968 return -EINVAL; 2836 return -EINVAL;
2969 2837
2838 path = btrfs_alloc_path();
2839 if (!path)
2840 return -ENOMEM;
2841 path->leave_spinning = 1;
2842
2843 /*
2844 * lookup the last file extent. We're not using i_size here
2845 * because there might be preallocation past i_size
2846 */
2847 ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root,
2848 path, btrfs_ino(inode), -1, 0);
2849 if (ret < 0) {
2850 btrfs_free_path(path);
2851 return ret;
2852 }
2853 WARN_ON(!ret);
2854 path->slots[0]--;
2855 item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2856 struct btrfs_file_extent_item);
2857 btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]);
2858 found_type = btrfs_key_type(&found_key);
2859
2860 /* No extents, but there might be delalloc bits */
2861 if (found_key.objectid != btrfs_ino(inode) ||
2862 found_type != BTRFS_EXTENT_DATA_KEY) {
2863 /* have to trust i_size as the end */
2864 last = (u64)-1;
2865 last_for_get_extent = isize;
2866 } else {
2867 /*
2868 * remember the start of the last extent. There are a
2869 * bunch of different factors that go into the length of the
2870 * extent, so it's much less complex to remember where it started
2871 */
2872 last = found_key.offset;
2873 last_for_get_extent = last + 1;
2874 }
2875 btrfs_free_path(path);
2876
2877 /*
2878 * we might have some extents allocated but more delalloc past those
2879 * extents. so, we trust isize unless the start of the last extent is
2880 * beyond isize
2881 */
2882 if (last < isize) {
2883 last = (u64)-1;
2884 last_for_get_extent = isize;
2885 }
2886
2970 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0, 2887 lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len, 0,
2971 &cached_state, GFP_NOFS); 2888 &cached_state, GFP_NOFS);
2972 em = get_extent(inode, NULL, 0, off, max - off, 0); 2889
2890 em = get_extent_skip_holes(inode, off, last_for_get_extent,
2891 get_extent);
2973 if (!em) 2892 if (!em)
2974 goto out; 2893 goto out;
2975 if (IS_ERR(em)) { 2894 if (IS_ERR(em)) {
2976 ret = PTR_ERR(em); 2895 ret = PTR_ERR(em);
2977 goto out; 2896 goto out;
2978 } 2897 }
2898
2979 while (!end) { 2899 while (!end) {
2980 off = em->start + em->len; 2900 u64 offset_in_extent;
2981 if (off >= max) 2901
2982 end = 1; 2902 /* break if the extent we found is outside the range */
2903 if (em->start >= max || extent_map_end(em) < off)
2904 break;
2983 2905
2984 em_start = em->start; 2906 /*
2985 em_len = em->len; 2907 * get_extent may return an extent that starts before our
2908 * requested range. We have to make sure the ranges
2909 * we return to fiemap always move forward and don't
2910 * overlap, so adjust the offsets here
2911 */
2912 em_start = max(em->start, off);
2986 2913
2914 /*
2915 * record the offset from the start of the extent
2916 * for adjusting the disk offset below
2917 */
2918 offset_in_extent = em_start - em->start;
2919 em_end = extent_map_end(em);
2920 em_len = em_end - em_start;
2921 emflags = em->flags;
2987 disko = 0; 2922 disko = 0;
2988 flags = 0; 2923 flags = 0;
2989 2924
2925 /*
2926 * bump off for our next call to get_extent
2927 */
2928 off = extent_map_end(em);
2929 if (off >= max)
2930 end = 1;
2931
2990 if (em->block_start == EXTENT_MAP_LAST_BYTE) { 2932 if (em->block_start == EXTENT_MAP_LAST_BYTE) {
2991 end = 1; 2933 end = 1;
2992 flags |= FIEMAP_EXTENT_LAST; 2934 flags |= FIEMAP_EXTENT_LAST;
2993 } else if (em->block_start == EXTENT_MAP_HOLE) {
2994 flags |= FIEMAP_EXTENT_UNWRITTEN;
2995 } else if (em->block_start == EXTENT_MAP_INLINE) { 2935 } else if (em->block_start == EXTENT_MAP_INLINE) {
2996 flags |= (FIEMAP_EXTENT_DATA_INLINE | 2936 flags |= (FIEMAP_EXTENT_DATA_INLINE |
2997 FIEMAP_EXTENT_NOT_ALIGNED); 2937 FIEMAP_EXTENT_NOT_ALIGNED);
@@ -2999,32 +2939,32 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
2999 flags |= (FIEMAP_EXTENT_DELALLOC | 2939 flags |= (FIEMAP_EXTENT_DELALLOC |
3000 FIEMAP_EXTENT_UNKNOWN); 2940 FIEMAP_EXTENT_UNKNOWN);
3001 } else { 2941 } else {
3002 disko = em->block_start; 2942 disko = em->block_start + offset_in_extent;
3003 } 2943 }
3004 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) 2944 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
3005 flags |= FIEMAP_EXTENT_ENCODED; 2945 flags |= FIEMAP_EXTENT_ENCODED;
3006 2946
3007 emflags = em->flags;
3008 free_extent_map(em); 2947 free_extent_map(em);
3009 em = NULL; 2948 em = NULL;
2949 if ((em_start >= last) || em_len == (u64)-1 ||
2950 (last == (u64)-1 && isize <= em_end)) {
2951 flags |= FIEMAP_EXTENT_LAST;
2952 end = 1;
2953 }
3010 2954
3011 if (!end) { 2955 /* now scan forward to see if this is really the last extent. */
3012 em = get_extent(inode, NULL, 0, off, max - off, 0); 2956 em = get_extent_skip_holes(inode, off, last_for_get_extent,
3013 if (!em) 2957 get_extent);
3014 goto out; 2958 if (IS_ERR(em)) {
3015 if (IS_ERR(em)) { 2959 ret = PTR_ERR(em);
3016 ret = PTR_ERR(em); 2960 goto out;
3017 goto out;
3018 }
3019 emflags = em->flags;
3020 } 2961 }
3021 if (test_bit(EXTENT_FLAG_VACANCY, &emflags)) { 2962 if (!em) {
3022 flags |= FIEMAP_EXTENT_LAST; 2963 flags |= FIEMAP_EXTENT_LAST;
3023 end = 1; 2964 end = 1;
3024 } 2965 }
3025
3026 ret = fiemap_fill_next_extent(fieinfo, em_start, disko, 2966 ret = fiemap_fill_next_extent(fieinfo, em_start, disko,
3027 em_len, flags); 2967 em_len, flags);
3028 if (ret) 2968 if (ret)
3029 goto out_free; 2969 goto out_free;
3030 } 2970 }
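
The adjustment above keeps the ranges handed to fiemap moving strictly
forward even when get_extent() returns an extent that starts before the
requested offset. An illustrative standalone restatement (the struct and
field names are simplified stand-ins, not the kernel's):

#include <stdint.h>
#include <stdio.h>

struct map { uint64_t start, end, block_start; };  /* end is exclusive */

/* clamp a mapping that may begin before the requested offset */
static void clamp_for_fiemap(const struct map *em, uint64_t off,
                             uint64_t *em_start, uint64_t *em_len,
                             uint64_t *disko)
{
        uint64_t offset_in_extent;

        *em_start = em->start > off ? em->start : off;
        offset_in_extent = *em_start - em->start;
        *em_len = em->end - *em_start;
        /* shift the disk offset by the same amount we clamped */
        *disko = em->block_start + offset_in_extent;
}

int main(void)
{
        struct map em = { .start = 0, .end = 8192, .block_start = 1 << 20 };
        uint64_t s, l, d;

        clamp_for_fiemap(&em, 4096, &s, &l, &d);
        printf("start=%llu len=%llu disk=%llu\n", (unsigned long long)s,
               (unsigned long long)l, (unsigned long long)d);
        return 0;
}
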
@@ -3078,6 +3018,8 @@ static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
3078#endif 3018#endif
3079 3019
3080 eb = kmem_cache_zalloc(extent_buffer_cache, mask); 3020 eb = kmem_cache_zalloc(extent_buffer_cache, mask);
3021 if (eb == NULL)
3022 return NULL;
3081 eb->start = start; 3023 eb->start = start;
3082 eb->len = len; 3024 eb->len = len;
3083 spin_lock_init(&eb->lock); 3025 spin_lock_init(&eb->lock);
@@ -3104,10 +3046,42 @@ static void __free_extent_buffer(struct extent_buffer *eb)
3104 kmem_cache_free(extent_buffer_cache, eb); 3046 kmem_cache_free(extent_buffer_cache, eb);
3105} 3047}
3106 3048
3049/*
3050 * Helper for releasing extent buffer pages.
3051 */
3052static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
3053 unsigned long start_idx)
3054{
3055 unsigned long index;
3056 struct page *page;
3057
3058 if (!eb->first_page)
3059 return;
3060
3061 index = num_extent_pages(eb->start, eb->len);
3062 if (start_idx >= index)
3063 return;
3064
3065 do {
3066 index--;
3067 page = extent_buffer_page(eb, index);
3068 if (page)
3069 page_cache_release(page);
3070 } while (index != start_idx);
3071}
3072
3073/*
3074 * Helper for releasing the extent buffer.
3075 */
3076static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
3077{
3078 btrfs_release_extent_buffer_page(eb, 0);
3079 __free_extent_buffer(eb);
3080}
3081
3107struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, 3082struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
3108 u64 start, unsigned long len, 3083 u64 start, unsigned long len,
3109 struct page *page0, 3084 struct page *page0)
3110 gfp_t mask)
3111{ 3085{
3112 unsigned long num_pages = num_extent_pages(start, len); 3086 unsigned long num_pages = num_extent_pages(start, len);
3113 unsigned long i; 3087 unsigned long i;
@@ -3117,18 +3091,18 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
3117 struct page *p; 3091 struct page *p;
3118 struct address_space *mapping = tree->mapping; 3092 struct address_space *mapping = tree->mapping;
3119 int uptodate = 1; 3093 int uptodate = 1;
3094 int ret;
3120 3095
3121 spin_lock(&tree->buffer_lock); 3096 rcu_read_lock();
3122 eb = buffer_search(tree, start); 3097 eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
3123 if (eb) { 3098 if (eb && atomic_inc_not_zero(&eb->refs)) {
3124 atomic_inc(&eb->refs); 3099 rcu_read_unlock();
3125 spin_unlock(&tree->buffer_lock);
3126 mark_page_accessed(eb->first_page); 3100 mark_page_accessed(eb->first_page);
3127 return eb; 3101 return eb;
3128 } 3102 }
3129 spin_unlock(&tree->buffer_lock); 3103 rcu_read_unlock();
3130 3104
3131 eb = __alloc_extent_buffer(tree, start, len, mask); 3105 eb = __alloc_extent_buffer(tree, start, len, GFP_NOFS);
3132 if (!eb) 3106 if (!eb)
3133 return NULL; 3107 return NULL;
3134 3108
@@ -3145,7 +3119,7 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
3145 i = 0; 3119 i = 0;
3146 } 3120 }
3147 for (; i < num_pages; i++, index++) { 3121 for (; i < num_pages; i++, index++) {
3148 p = find_or_create_page(mapping, index, mask | __GFP_HIGHMEM); 3122 p = find_or_create_page(mapping, index, GFP_NOFS | __GFP_HIGHMEM);
3149 if (!p) { 3123 if (!p) {
3150 WARN_ON(1); 3124 WARN_ON(1);
3151 goto free_eb; 3125 goto free_eb;
@@ -3160,50 +3134,77 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
3160 } 3134 }
3161 if (!PageUptodate(p)) 3135 if (!PageUptodate(p))
3162 uptodate = 0; 3136 uptodate = 0;
3163 unlock_page(p); 3137
3138 /*
3139 * see below about how we avoid a nasty race with release page
3140 * and why we unlock later
3141 */
3142 if (i != 0)
3143 unlock_page(p);
3164 } 3144 }
3165 if (uptodate) 3145 if (uptodate)
3166 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); 3146 set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
3167 3147
3148 ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
3149 if (ret)
3150 goto free_eb;
3151
3168 spin_lock(&tree->buffer_lock); 3152 spin_lock(&tree->buffer_lock);
3169 exists = buffer_tree_insert(tree, start, &eb->rb_node); 3153 ret = radix_tree_insert(&tree->buffer, start >> PAGE_CACHE_SHIFT, eb);
3170 if (exists) { 3154 if (ret == -EEXIST) {
3155 exists = radix_tree_lookup(&tree->buffer,
3156 start >> PAGE_CACHE_SHIFT);
3171 /* add one reference for the caller */ 3157 /* add one reference for the caller */
3172 atomic_inc(&exists->refs); 3158 atomic_inc(&exists->refs);
3173 spin_unlock(&tree->buffer_lock); 3159 spin_unlock(&tree->buffer_lock);
3160 radix_tree_preload_end();
3174 goto free_eb; 3161 goto free_eb;
3175 } 3162 }
3176 /* add one reference for the tree */ 3163 /* add one reference for the tree */
3177 atomic_inc(&eb->refs); 3164 atomic_inc(&eb->refs);
3178 spin_unlock(&tree->buffer_lock); 3165 spin_unlock(&tree->buffer_lock);
3166 radix_tree_preload_end();
3167
3168 /*
3169 * there is a race where release page may have
3170 * tried to find this extent buffer in the radix
3171 * but failed. It will tell the VM it is safe to
3172 * reclaim the page, and it will clear the page private bit.
3173 * We must make sure to set the page private bit properly
3174 * after the extent buffer is in the radix tree so
3175 * it doesn't get lost
3176 */
3177 set_page_extent_mapped(eb->first_page);
3178 set_page_extent_head(eb->first_page, eb->len);
3179 if (!page0)
3180 unlock_page(eb->first_page);
3179 return eb; 3181 return eb;
3180 3182
3181free_eb: 3183free_eb:
3184 if (eb->first_page && !page0)
3185 unlock_page(eb->first_page);
3186
3182 if (!atomic_dec_and_test(&eb->refs)) 3187 if (!atomic_dec_and_test(&eb->refs))
3183 return exists; 3188 return exists;
3184 for (index = 1; index < i; index++) 3189 btrfs_release_extent_buffer(eb);
3185 page_cache_release(extent_buffer_page(eb, index));
3186 page_cache_release(extent_buffer_page(eb, 0));
3187 __free_extent_buffer(eb);
3188 return exists; 3190 return exists;
3189} 3191}
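
The insertion path above follows the usual radix-tree preload protocol:
preload outside the spinlock so the insert under it cannot fail on
allocation, and fall back to the already-inserted buffer on -EEXIST. A
condensed kernel-style fragment of just that protocol (error paths
elided, kernel context assumed, not a drop-in):

        /* preload may sleep, so do it before taking the spinlock */
        ret = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
        if (ret)
                goto free_eb;

        spin_lock(&tree->buffer_lock);
        ret = radix_tree_insert(&tree->buffer, index, eb);
        if (ret == -EEXIST) {
                /* lost the race: reference the winner instead */
                exists = radix_tree_lookup(&tree->buffer, index);
                atomic_inc(&exists->refs);
        }
        spin_unlock(&tree->buffer_lock);
        radix_tree_preload_end();
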
3190 3192
3191struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, 3193struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
3192 u64 start, unsigned long len, 3194 u64 start, unsigned long len)
3193 gfp_t mask)
3194{ 3195{
3195 struct extent_buffer *eb; 3196 struct extent_buffer *eb;
3196 3197
3197 spin_lock(&tree->buffer_lock); 3198 rcu_read_lock();
3198 eb = buffer_search(tree, start); 3199 eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
3199 if (eb) 3200 if (eb && atomic_inc_not_zero(&eb->refs)) {
3200 atomic_inc(&eb->refs); 3201 rcu_read_unlock();
3201 spin_unlock(&tree->buffer_lock);
3202
3203 if (eb)
3204 mark_page_accessed(eb->first_page); 3202 mark_page_accessed(eb->first_page);
3203 return eb;
3204 }
3205 rcu_read_unlock();
3205 3206
3206 return eb; 3207 return NULL;
3207} 3208}
3208 3209
3209void free_extent_buffer(struct extent_buffer *eb) 3210void free_extent_buffer(struct extent_buffer *eb)
@@ -3232,10 +3233,11 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
3232 continue; 3233 continue;
3233 3234
3234 lock_page(page); 3235 lock_page(page);
3236 WARN_ON(!PagePrivate(page));
3237
3238 set_page_extent_mapped(page);
3235 if (i == 0) 3239 if (i == 0)
3236 set_page_extent_head(page, eb->len); 3240 set_page_extent_head(page, eb->len);
3237 else
3238 set_page_private(page, EXTENT_PAGE_PRIVATE);
3239 3241
3240 clear_page_dirty_for_io(page); 3242 clear_page_dirty_for_io(page);
3241 spin_lock_irq(&page->mapping->tree_lock); 3243 spin_lock_irq(&page->mapping->tree_lock);
@@ -3250,13 +3252,6 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
3250 return 0; 3252 return 0;
3251} 3253}
3252 3254
3253int wait_on_extent_buffer_writeback(struct extent_io_tree *tree,
3254 struct extent_buffer *eb)
3255{
3256 return wait_on_extent_writeback(tree, eb->start,
3257 eb->start + eb->len - 1);
3258}
3259
3260int set_extent_buffer_dirty(struct extent_io_tree *tree, 3255int set_extent_buffer_dirty(struct extent_io_tree *tree,
3261 struct extent_buffer *eb) 3256 struct extent_buffer *eb)
3262{ 3257{
@@ -3302,7 +3297,7 @@ int set_extent_buffer_uptodate(struct extent_io_tree *tree,
3302 num_pages = num_extent_pages(eb->start, eb->len); 3297 num_pages = num_extent_pages(eb->start, eb->len);
3303 3298
3304 set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, 3299 set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1,
3305 GFP_NOFS); 3300 NULL, GFP_NOFS);
3306 for (i = 0; i < num_pages; i++) { 3301 for (i = 0; i < num_pages; i++) {
3307 page = extent_buffer_page(eb, i); 3302 page = extent_buffer_page(eb, i);
3308 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) || 3303 if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
@@ -3425,6 +3420,13 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
3425 3420
3426 for (i = start_i; i < num_pages; i++) { 3421 for (i = start_i; i < num_pages; i++) {
3427 page = extent_buffer_page(eb, i); 3422 page = extent_buffer_page(eb, i);
3423
3424 WARN_ON(!PagePrivate(page));
3425
3426 set_page_extent_mapped(page);
3427 if (i == 0)
3428 set_page_extent_head(page, eb->len);
3429
3428 if (inc_all_pages) 3430 if (inc_all_pages)
3429 page_cache_get(page); 3431 page_cache_get(page);
3430 if (!PageUptodate(page)) { 3432 if (!PageUptodate(page)) {
@@ -3530,6 +3532,7 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
3530 "wanted %lu %lu\n", (unsigned long long)eb->start, 3532 "wanted %lu %lu\n", (unsigned long long)eb->start,
3531 eb->len, start, min_len); 3533 eb->len, start, min_len);
3532 WARN_ON(1); 3534 WARN_ON(1);
3535 return -EINVAL;
3533 } 3536 }
3534 3537
3535 p = extent_buffer_page(eb, i); 3538 p = extent_buffer_page(eb, i);
@@ -3722,6 +3725,12 @@ static void move_pages(struct page *dst_page, struct page *src_page,
3722 kunmap_atomic(dst_kaddr, KM_USER0); 3725 kunmap_atomic(dst_kaddr, KM_USER0);
3723} 3726}
3724 3727
3728static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
3729{
3730 unsigned long distance = (src > dst) ? src - dst : dst - src;
3731 return distance < len;
3732}
3733
3725static void copy_pages(struct page *dst_page, struct page *src_page, 3734static void copy_pages(struct page *dst_page, struct page *src_page,
3726 unsigned long dst_off, unsigned long src_off, 3735 unsigned long dst_off, unsigned long src_off,
3727 unsigned long len) 3736 unsigned long len)
@@ -3729,10 +3738,12 @@ static void copy_pages(struct page *dst_page, struct page *src_page,
3729 char *dst_kaddr = kmap_atomic(dst_page, KM_USER0); 3738 char *dst_kaddr = kmap_atomic(dst_page, KM_USER0);
3730 char *src_kaddr; 3739 char *src_kaddr;
3731 3740
3732 if (dst_page != src_page) 3741 if (dst_page != src_page) {
3733 src_kaddr = kmap_atomic(src_page, KM_USER1); 3742 src_kaddr = kmap_atomic(src_page, KM_USER1);
3734 else 3743 } else {
3735 src_kaddr = dst_kaddr; 3744 src_kaddr = dst_kaddr;
3745 BUG_ON(areas_overlap(src_off, dst_off, len));
3746 }
3736 3747
3737 memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len); 3748 memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len);
3738 kunmap_atomic(dst_kaddr, KM_USER0); 3749 kunmap_atomic(dst_kaddr, KM_USER0);
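
areas_overlap() reduces the overlap test for two equal-length ranges to
|src - dst| < len. A standalone sketch of the check and the
memmove-vs-memcpy choice it guards:

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/* ranges [src, src+len) and [dst, dst+len) overlap exactly
 * when the distance between their starts is less than len */
static bool areas_overlap(unsigned long src, unsigned long dst,
                          unsigned long len)
{
        unsigned long distance = (src > dst) ? src - dst : dst - src;
        return distance < len;
}

int main(void)
{
        char buf[16] = "abcdefgh";

        /* an overlapping copy within one buffer needs memmove */
        if (areas_overlap(0, 2, 8))
                memmove(buf + 2, buf, 8);
        else
                memcpy(buf + 2, buf, 8);
        printf("%s\n", buf);    /* prints "ababcdefgh" */
        return 0;
}
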
@@ -3807,7 +3818,7 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
3807 "len %lu len %lu\n", dst_offset, len, dst->len); 3818 "len %lu len %lu\n", dst_offset, len, dst->len);
3808 BUG_ON(1); 3819 BUG_ON(1);
3809 } 3820 }
3810 if (dst_offset < src_offset) { 3821 if (!areas_overlap(src_offset, dst_offset, len)) {
3811 memcpy_extent_buffer(dst, dst_offset, src_offset, len); 3822 memcpy_extent_buffer(dst, dst_offset, src_offset, len);
3812 return; 3823 return;
3813 } 3824 }
@@ -3833,34 +3844,47 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
3833 } 3844 }
3834} 3845}
3835 3846
3847static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head)
3848{
3849 struct extent_buffer *eb =
3850 container_of(head, struct extent_buffer, rcu_head);
3851
3852 btrfs_release_extent_buffer(eb);
3853}
3854
3836int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page) 3855int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
3837{ 3856{
3838 u64 start = page_offset(page); 3857 u64 start = page_offset(page);
3839 struct extent_buffer *eb; 3858 struct extent_buffer *eb;
3840 int ret = 1; 3859 int ret = 1;
3841 unsigned long i;
3842 unsigned long num_pages;
3843 3860
3844 spin_lock(&tree->buffer_lock); 3861 spin_lock(&tree->buffer_lock);
3845 eb = buffer_search(tree, start); 3862 eb = radix_tree_lookup(&tree->buffer, start >> PAGE_CACHE_SHIFT);
3846 if (!eb) 3863 if (!eb) {
3847 goto out; 3864 spin_unlock(&tree->buffer_lock);
3865 return ret;
3866 }
3848 3867
3849 if (atomic_read(&eb->refs) > 1) { 3868 if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
3850 ret = 0; 3869 ret = 0;
3851 goto out; 3870 goto out;
3852 } 3871 }
3853 if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { 3872
3873 /*
3874 * set @eb->refs to 0 if it is already 1, and then release the @eb.
3875 * Or go back.
3876 */
3877 if (atomic_cmpxchg(&eb->refs, 1, 0) != 1) {
3854 ret = 0; 3878 ret = 0;
3855 goto out; 3879 goto out;
3856 } 3880 }
3857 /* at this point we can safely release the extent buffer */ 3881
3858 num_pages = num_extent_pages(eb->start, eb->len); 3882 radix_tree_delete(&tree->buffer, start >> PAGE_CACHE_SHIFT);
3859 for (i = 0; i < num_pages; i++)
3860 page_cache_release(extent_buffer_page(eb, i));
3861 rb_erase(&eb->rb_node, &tree->buffer);
3862 __free_extent_buffer(eb);
3863out: 3883out:
3864 spin_unlock(&tree->buffer_lock); 3884 spin_unlock(&tree->buffer_lock);
3885
3886 /* at this point we can safely release the extent buffer */
3887 if (atomic_read(&eb->refs) == 0)
3888 call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
3865 return ret; 3889 return ret;
3866} 3890}
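
try_release_extent_buffer() is the release half of the RCU scheme: only
the 1 -> 0 refcount transition, made exclusive by atomic_cmpxchg(), may
remove the buffer from the radix tree, and the free itself is deferred
through call_rcu() so concurrent lockless lookups never touch freed
memory. Condensed (kernel context assumed, not a drop-in):

        spin_lock(&tree->buffer_lock);
        /* only the thread that moves refs from 1 to 0 may free */
        if (atomic_cmpxchg(&eb->refs, 1, 0) == 1)
                radix_tree_delete(&tree->buffer,
                                  start >> PAGE_CACHE_SHIFT);
        spin_unlock(&tree->buffer_lock);

        if (atomic_read(&eb->refs) == 0)
                /* readers inside rcu_read_lock() finish first */
                call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu);
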
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 5691c7b590da..a11a92ee2d30 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -20,13 +20,18 @@
20#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) 20#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
21#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC) 21#define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
22 22
23/* flags for bio submission */ 23/*
24 * flags for bio submission. The high bits indicate the compression
25 * type for this bio
26 */
24#define EXTENT_BIO_COMPRESSED 1 27#define EXTENT_BIO_COMPRESSED 1
28#define EXTENT_BIO_FLAG_SHIFT 16
25 29
26/* these are bit numbers for test/set bit */ 30/* these are bit numbers for test/set bit */
27#define EXTENT_BUFFER_UPTODATE 0 31#define EXTENT_BUFFER_UPTODATE 0
28#define EXTENT_BUFFER_BLOCKING 1 32#define EXTENT_BUFFER_BLOCKING 1
29#define EXTENT_BUFFER_DIRTY 2 33#define EXTENT_BUFFER_DIRTY 2
34#define EXTENT_BUFFER_CORRUPT 3
30 35
31/* these are flags for extent_clear_unlock_delalloc */ 36/* these are flags for extent_clear_unlock_delalloc */
32#define EXTENT_CLEAR_UNLOCK_PAGE 0x1 37#define EXTENT_CLEAR_UNLOCK_PAGE 0x1
@@ -85,7 +90,7 @@ struct extent_io_ops {
85 90
86struct extent_io_tree { 91struct extent_io_tree {
87 struct rb_root state; 92 struct rb_root state;
88 struct rb_root buffer; 93 struct radix_tree_root buffer;
89 struct address_space *mapping; 94 struct address_space *mapping;
90 u64 dirty_bytes; 95 u64 dirty_bytes;
91 spinlock_t lock; 96 spinlock_t lock;
@@ -121,9 +126,9 @@ struct extent_buffer {
121 unsigned long map_len; 126 unsigned long map_len;
122 struct page *first_page; 127 struct page *first_page;
123 unsigned long bflags; 128 unsigned long bflags;
124 atomic_t refs;
125 struct list_head leak_list; 129 struct list_head leak_list;
126 struct rb_node rb_node; 130 struct rcu_head rcu_head;
131 atomic_t refs;
127 132
128 /* the spinlock is used to protect most operations */ 133 /* the spinlock is used to protect most operations */
129 spinlock_t lock; 134 spinlock_t lock;
@@ -135,25 +140,27 @@ struct extent_buffer {
135 wait_queue_head_t lock_wq; 140 wait_queue_head_t lock_wq;
136}; 141};
137 142
138struct extent_map_tree; 143static inline void extent_set_compress_type(unsigned long *bio_flags,
144 int compress_type)
145{
146 *bio_flags |= compress_type << EXTENT_BIO_FLAG_SHIFT;
147}
139 148
140static inline struct extent_state *extent_state_next(struct extent_state *state) 149static inline int extent_compress_type(unsigned long bio_flags)
141{ 150{
142 struct rb_node *node; 151 return bio_flags >> EXTENT_BIO_FLAG_SHIFT;
143 node = rb_next(&state->rb_node);
144 if (!node)
145 return NULL;
146 return rb_entry(node, struct extent_state, rb_node);
147} 152}
148 153
154struct extent_map_tree;
155
149typedef struct extent_map *(get_extent_t)(struct inode *inode, 156typedef struct extent_map *(get_extent_t)(struct inode *inode,
150 struct page *page, 157 struct page *page,
151 size_t page_offset, 158 size_t pg_offset,
152 u64 start, u64 len, 159 u64 start, u64 len,
153 int create); 160 int create);
154 161
155void extent_io_tree_init(struct extent_io_tree *tree, 162void extent_io_tree_init(struct extent_io_tree *tree,
156 struct address_space *mapping, gfp_t mask); 163 struct address_space *mapping);
157int try_release_extent_mapping(struct extent_map_tree *map, 164int try_release_extent_mapping(struct extent_map_tree *map,
158 struct extent_io_tree *tree, struct page *page, 165 struct extent_io_tree *tree, struct page *page,
159 gfp_t mask); 166 gfp_t mask);
@@ -176,7 +183,7 @@ void extent_io_exit(void);
176 183
177u64 count_range_bits(struct extent_io_tree *tree, 184u64 count_range_bits(struct extent_io_tree *tree,
178 u64 *start, u64 search_end, 185 u64 *start, u64 search_end,
179 u64 max_bytes, unsigned long bits); 186 u64 max_bytes, unsigned long bits, int contig);
180 187
181void free_extent_state(struct extent_state *state); 188void free_extent_state(struct extent_state *state);
182int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, 189int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
@@ -192,21 +199,15 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
192 int bits, int exclusive_bits, u64 *failed_start, 199 int bits, int exclusive_bits, u64 *failed_start,
193 struct extent_state **cached_state, gfp_t mask); 200 struct extent_state **cached_state, gfp_t mask);
194int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, 201int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
195 gfp_t mask); 202 struct extent_state **cached_state, gfp_t mask);
196int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, 203int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
197 gfp_t mask); 204 gfp_t mask);
198int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, 205int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
199 gfp_t mask); 206 gfp_t mask);
200int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, 207int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
201 gfp_t mask); 208 gfp_t mask);
202int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
203 gfp_t mask);
204int clear_extent_ordered_metadata(struct extent_io_tree *tree, u64 start,
205 u64 end, gfp_t mask);
206int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, 209int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
207 struct extent_state **cached_state, gfp_t mask); 210 struct extent_state **cached_state, gfp_t mask);
208int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
209 gfp_t mask);
210int find_first_extent_bit(struct extent_io_tree *tree, u64 start, 211int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
211 u64 *start_ret, u64 *end_ret, int bits); 212 u64 *start_ret, u64 *end_ret, int bits);
212struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree, 213struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
@@ -227,28 +228,17 @@ int extent_readpages(struct extent_io_tree *tree,
227 struct address_space *mapping, 228 struct address_space *mapping,
228 struct list_head *pages, unsigned nr_pages, 229 struct list_head *pages, unsigned nr_pages,
229 get_extent_t get_extent); 230 get_extent_t get_extent);
230int extent_prepare_write(struct extent_io_tree *tree,
231 struct inode *inode, struct page *page,
232 unsigned from, unsigned to, get_extent_t *get_extent);
233int extent_commit_write(struct extent_io_tree *tree,
234 struct inode *inode, struct page *page,
235 unsigned from, unsigned to);
236sector_t extent_bmap(struct address_space *mapping, sector_t iblock,
237 get_extent_t *get_extent);
238int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 231int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
239 __u64 start, __u64 len, get_extent_t *get_extent); 232 __u64 start, __u64 len, get_extent_t *get_extent);
240int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end);
241int set_state_private(struct extent_io_tree *tree, u64 start, u64 private); 233int set_state_private(struct extent_io_tree *tree, u64 start, u64 private);
242int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private); 234int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private);
243void set_page_extent_mapped(struct page *page); 235void set_page_extent_mapped(struct page *page);
244 236
245struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, 237struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
246 u64 start, unsigned long len, 238 u64 start, unsigned long len,
247 struct page *page0, 239 struct page *page0);
248 gfp_t mask);
249struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, 240struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
250 u64 start, unsigned long len, 241 u64 start, unsigned long len);
251 gfp_t mask);
252void free_extent_buffer(struct extent_buffer *eb); 242void free_extent_buffer(struct extent_buffer *eb);
253int read_extent_buffer_pages(struct extent_io_tree *tree, 243int read_extent_buffer_pages(struct extent_io_tree *tree,
254 struct extent_buffer *eb, u64 start, int wait, 244 struct extent_buffer *eb, u64 start, int wait,
@@ -276,16 +266,11 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
276 unsigned long src_offset, unsigned long len); 266 unsigned long src_offset, unsigned long len);
277void memset_extent_buffer(struct extent_buffer *eb, char c, 267void memset_extent_buffer(struct extent_buffer *eb, char c,
278 unsigned long start, unsigned long len); 268 unsigned long start, unsigned long len);
279int wait_on_extent_buffer_writeback(struct extent_io_tree *tree,
280 struct extent_buffer *eb);
281int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end);
282int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits); 269int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits);
283int clear_extent_buffer_dirty(struct extent_io_tree *tree, 270int clear_extent_buffer_dirty(struct extent_io_tree *tree,
284 struct extent_buffer *eb); 271 struct extent_buffer *eb);
285int set_extent_buffer_dirty(struct extent_io_tree *tree, 272int set_extent_buffer_dirty(struct extent_io_tree *tree,
286 struct extent_buffer *eb); 273 struct extent_buffer *eb);
287int test_extent_buffer_dirty(struct extent_io_tree *tree,
288 struct extent_buffer *eb);
289int set_extent_buffer_uptodate(struct extent_io_tree *tree, 274int set_extent_buffer_uptodate(struct extent_io_tree *tree,
290 struct extent_buffer *eb); 275 struct extent_buffer *eb);
291int clear_extent_buffer_uptodate(struct extent_io_tree *tree, 276int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
@@ -303,11 +288,13 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset,
303 unsigned long *map_start, 288 unsigned long *map_start,
304 unsigned long *map_len, int km); 289 unsigned long *map_len, int km);
305void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km); 290void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km);
306int release_extent_buffer_tail_pages(struct extent_buffer *eb);
307int extent_range_uptodate(struct extent_io_tree *tree, 291int extent_range_uptodate(struct extent_io_tree *tree,
308 u64 start, u64 end); 292 u64 start, u64 end);
309int extent_clear_unlock_delalloc(struct inode *inode, 293int extent_clear_unlock_delalloc(struct inode *inode,
310 struct extent_io_tree *tree, 294 struct extent_io_tree *tree,
311 u64 start, u64 end, struct page *locked_page, 295 u64 start, u64 end, struct page *locked_page,
312 unsigned long op); 296 unsigned long op);
297struct bio *
298btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
299 gfp_t gfp_flags);
313#endif 300#endif
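
The new inline helpers pack the compression type into the high bits of
the bio flags word, leaving the low bits for flags like
EXTENT_BIO_COMPRESSED. A standalone sketch mirroring the header above
(the type id passed in is illustrative, not a btrfs constant):

#include <stdio.h>

#define EXTENT_BIO_COMPRESSED   1
#define EXTENT_BIO_FLAG_SHIFT   16

/* store the compression type above the flag bits */
static void set_compress_type(unsigned long *bio_flags, int type)
{
        *bio_flags |= (unsigned long)type << EXTENT_BIO_FLAG_SHIFT;
}

static int get_compress_type(unsigned long bio_flags)
{
        return bio_flags >> EXTENT_BIO_FLAG_SHIFT;
}

int main(void)
{
        unsigned long flags = EXTENT_BIO_COMPRESSED;

        set_compress_type(&flags, 2);
        printf("compressed=%lu type=%d\n",
               flags & EXTENT_BIO_COMPRESSED, get_compress_type(flags));
        return 0;
}
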
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 454ca52d6451..2d0410344ea3 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -3,6 +3,7 @@
3#include <linux/module.h> 3#include <linux/module.h>
4#include <linux/spinlock.h> 4#include <linux/spinlock.h>
5#include <linux/hardirq.h> 5#include <linux/hardirq.h>
6#include "ctree.h"
6#include "extent_map.h" 7#include "extent_map.h"
7 8
8 9
@@ -27,12 +28,11 @@ void extent_map_exit(void)
27/** 28/**
28 * extent_map_tree_init - initialize extent map tree 29 * extent_map_tree_init - initialize extent map tree
29 * @tree: tree to initialize 30 * @tree: tree to initialize
30 * @mask: flags for memory allocations during tree operations
31 * 31 *
32 * Initialize the extent tree @tree. Should be called for each new inode 32 * Initialize the extent tree @tree. Should be called for each new inode
33 * or other user of the extent_map interface. 33 * or other user of the extent_map interface.
34 */ 34 */
35void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask) 35void extent_map_tree_init(struct extent_map_tree *tree)
36{ 36{
37 tree->map = RB_ROOT; 37 tree->map = RB_ROOT;
38 rwlock_init(&tree->lock); 38 rwlock_init(&tree->lock);
@@ -40,20 +40,20 @@ void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask)
40 40
41/** 41/**
42 * alloc_extent_map - allocate new extent map structure 42 * alloc_extent_map - allocate new extent map structure
43 * @mask: memory allocation flags
44 * 43 *
45 * Allocate a new extent_map structure. The new structure is 44 * Allocate a new extent_map structure. The new structure is
46 * returned with a reference count of one and needs to be 45 * returned with a reference count of one and needs to be
47 * freed using free_extent_map() 46 * freed using free_extent_map()
48 */ 47 */
49struct extent_map *alloc_extent_map(gfp_t mask) 48struct extent_map *alloc_extent_map(void)
50{ 49{
51 struct extent_map *em; 50 struct extent_map *em;
52 em = kmem_cache_alloc(extent_map_cache, mask); 51 em = kmem_cache_alloc(extent_map_cache, GFP_NOFS);
53 if (!em || IS_ERR(em)) 52 if (!em)
54 return em; 53 return NULL;
55 em->in_tree = 0; 54 em->in_tree = 0;
56 em->flags = 0; 55 em->flags = 0;
56 em->compress_type = BTRFS_COMPRESS_NONE;
57 atomic_set(&em->refs, 1); 57 atomic_set(&em->refs, 1);
58 return em; 58 return em;
59} 59}
@@ -241,7 +241,7 @@ out:
241 * Insert @em into @tree or perform a simple forward/backward merge with 241 * Insert @em into @tree or perform a simple forward/backward merge with
242 * existing mappings. The extent_map struct passed in will be inserted 242 * existing mappings. The extent_map struct passed in will be inserted
243 * into the tree directly, with an additional reference taken, or a 243 * into the tree directly, with an additional reference taken, or a
244 * reference dropped if the merge attempt was successfull. 244 * reference dropped if the merge attempt was successful.
245 */ 245 */
246int add_extent_mapping(struct extent_map_tree *tree, 246int add_extent_mapping(struct extent_map_tree *tree,
247 struct extent_map *em) 247 struct extent_map *em)
@@ -335,7 +335,7 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
335 goto out; 335 goto out;
336 } 336 }
337 if (IS_ERR(rb_node)) { 337 if (IS_ERR(rb_node)) {
338 em = ERR_PTR(PTR_ERR(rb_node)); 338 em = ERR_CAST(rb_node);
339 goto out; 339 goto out;
340 } 340 }
341 em = rb_entry(rb_node, struct extent_map, rb_node); 341 em = rb_entry(rb_node, struct extent_map, rb_node);
@@ -384,7 +384,7 @@ struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
384 goto out; 384 goto out;
385 } 385 }
386 if (IS_ERR(rb_node)) { 386 if (IS_ERR(rb_node)) {
387 em = ERR_PTR(PTR_ERR(rb_node)); 387 em = ERR_CAST(rb_node);
388 goto out; 388 goto out;
389 } 389 }
390 em = rb_entry(rb_node, struct extent_map, rb_node); 390 em = rb_entry(rb_node, struct extent_map, rb_node);
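
Several hunks in this patch convert open-coded "!em || IS_ERR(em)" tests
to IS_ERR_OR_NULL() and replace ERR_PTR(PTR_ERR(...)) with ERR_CAST(). A
simplified userspace re-creation of the idiom, for illustration only:

#include <stdio.h>

/* simplified take on the kernel's err.h: small negative errno
 * values encoded at the very top of the address space */
#define MAX_ERRNO       4095

static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
        return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}
static inline int IS_ERR_OR_NULL(const void *ptr)
{
        return !ptr || IS_ERR(ptr);
}
/* ERR_CAST: carry an error pointer across pointer types */
static inline void *ERR_CAST(const void *ptr) { return (void *)ptr; }

int main(void)
{
        void *p = ERR_PTR(-12);         /* -ENOMEM */

        printf("%d %d %ld\n", IS_ERR(p), IS_ERR_OR_NULL(NULL), PTR_ERR(p));
        return 0;
}
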
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index ab6d74b6e647..33a7890b1f40 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -26,7 +26,8 @@ struct extent_map {
26 unsigned long flags; 26 unsigned long flags;
27 struct block_device *bdev; 27 struct block_device *bdev;
28 atomic_t refs; 28 atomic_t refs;
29 int in_tree; 29 unsigned int in_tree:1;
30 unsigned int compress_type:4;
30}; 31};
31 32
32struct extent_map_tree { 33struct extent_map_tree {
@@ -48,14 +49,14 @@ static inline u64 extent_map_block_end(struct extent_map *em)
48 return em->block_start + em->block_len; 49 return em->block_start + em->block_len;
49} 50}
50 51
51void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask); 52void extent_map_tree_init(struct extent_map_tree *tree);
52struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, 53struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
53 u64 start, u64 len); 54 u64 start, u64 len);
54int add_extent_mapping(struct extent_map_tree *tree, 55int add_extent_mapping(struct extent_map_tree *tree,
55 struct extent_map *em); 56 struct extent_map *em);
56int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em); 57int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em);
57 58
58struct extent_map *alloc_extent_map(gfp_t mask); 59struct extent_map *alloc_extent_map(void);
59void free_extent_map(struct extent_map *em); 60void free_extent_map(struct extent_map *em);
60int __init extent_map_init(void); 61int __init extent_map_init(void);
61void extent_map_exit(void); 62void extent_map_exit(void);
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index a562a250ae77..90d4ee52cd45 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -48,7 +48,8 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
48 struct extent_buffer *leaf; 48 struct extent_buffer *leaf;
49 49
50 path = btrfs_alloc_path(); 50 path = btrfs_alloc_path();
51 BUG_ON(!path); 51 if (!path)
52 return -ENOMEM;
52 file_key.objectid = objectid; 53 file_key.objectid = objectid;
53 file_key.offset = pos; 54 file_key.offset = pos;
54 btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY); 55 btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY);
@@ -169,6 +170,8 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
169 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 170 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
170 171
171 path = btrfs_alloc_path(); 172 path = btrfs_alloc_path();
173 if (!path)
174 return -ENOMEM;
172 if (bio->bi_size > PAGE_CACHE_SIZE * 8) 175 if (bio->bi_size > PAGE_CACHE_SIZE * 8)
173 path->reada = 2; 176 path->reada = 2;
174 177
@@ -190,7 +193,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
190 u32 item_size; 193 u32 item_size;
191 194
192 if (item) 195 if (item)
193 btrfs_release_path(root, path); 196 btrfs_release_path(path);
194 item = btrfs_lookup_csum(NULL, root->fs_info->csum_root, 197 item = btrfs_lookup_csum(NULL, root->fs_info->csum_root,
195 path, disk_bytenr, 0); 198 path, disk_bytenr, 0);
196 if (IS_ERR(item)) { 199 if (IS_ERR(item)) {
@@ -205,12 +208,13 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
205 EXTENT_NODATASUM, GFP_NOFS); 208 EXTENT_NODATASUM, GFP_NOFS);
206 } else { 209 } else {
207 printk(KERN_INFO "btrfs no csum found " 210 printk(KERN_INFO "btrfs no csum found "
208 "for inode %lu start %llu\n", 211 "for inode %llu start %llu\n",
209 inode->i_ino, 212 (unsigned long long)
213 btrfs_ino(inode),
210 (unsigned long long)offset); 214 (unsigned long long)offset);
211 } 215 }
212 item = NULL; 216 item = NULL;
213 btrfs_release_path(root, path); 217 btrfs_release_path(path);
214 goto found; 218 goto found;
215 } 219 }
216 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 220 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
@@ -263,7 +267,7 @@ int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
263} 267}
264 268
265int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, 269int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
266 struct list_head *list) 270 struct list_head *list, int search_commit)
267{ 271{
268 struct btrfs_key key; 272 struct btrfs_key key;
269 struct btrfs_path *path; 273 struct btrfs_path *path;
@@ -280,6 +284,12 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
280 path = btrfs_alloc_path(); 284 path = btrfs_alloc_path();
281 BUG_ON(!path); 285 BUG_ON(!path);
282 286
287 if (search_commit) {
288 path->skip_locking = 1;
289 path->reada = 2;
290 path->search_commit_root = 1;
291 }
292
283 key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; 293 key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
284 key.offset = start; 294 key.offset = start;
285 key.type = BTRFS_EXTENT_CSUM_KEY; 295 key.type = BTRFS_EXTENT_CSUM_KEY;
@@ -492,7 +502,6 @@ static noinline int truncate_one_csum(struct btrfs_trans_handle *trans,
492 u32 new_size = (bytenr - key->offset) >> blocksize_bits; 502 u32 new_size = (bytenr - key->offset) >> blocksize_bits;
493 new_size *= csum_size; 503 new_size *= csum_size;
494 ret = btrfs_truncate_item(trans, root, path, new_size, 1); 504 ret = btrfs_truncate_item(trans, root, path, new_size, 1);
495 BUG_ON(ret);
496 } else if (key->offset >= bytenr && csum_end > end_byte && 505 } else if (key->offset >= bytenr && csum_end > end_byte &&
497 end_byte > key->offset) { 506 end_byte > key->offset) {
498 /* 507 /*
@@ -505,7 +514,6 @@ static noinline int truncate_one_csum(struct btrfs_trans_handle *trans,
505 new_size *= csum_size; 514 new_size *= csum_size;
506 515
507 ret = btrfs_truncate_item(trans, root, path, new_size, 0); 516 ret = btrfs_truncate_item(trans, root, path, new_size, 0);
508 BUG_ON(ret);
509 517
510 key->offset = end_byte; 518 key->offset = end_byte;
511 ret = btrfs_set_item_key_safe(trans, root, path, key); 519 ret = btrfs_set_item_key_safe(trans, root, path, key);
@@ -536,6 +544,8 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
536 root = root->fs_info->csum_root; 544 root = root->fs_info->csum_root;
537 545
538 path = btrfs_alloc_path(); 546 path = btrfs_alloc_path();
547 if (!path)
548 return -ENOMEM;
539 549
540 while (1) { 550 while (1) {
541 key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; 551 key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
@@ -546,9 +556,12 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
546 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 556 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
547 if (ret > 0) { 557 if (ret > 0) {
548 if (path->slots[0] == 0) 558 if (path->slots[0] == 0)
549 goto out; 559 break;
550 path->slots[0]--; 560 path->slots[0]--;
561 } else if (ret < 0) {
562 break;
551 } 563 }
564
552 leaf = path->nodes[0]; 565 leaf = path->nodes[0];
553 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 566 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
554 567
@@ -571,7 +584,8 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
571 /* delete the entire item, it is inside our range */ 584 /* delete the entire item, it is inside our range */
572 if (key.offset >= bytenr && csum_end <= end_byte) { 585 if (key.offset >= bytenr && csum_end <= end_byte) {
573 ret = btrfs_del_item(trans, root, path); 586 ret = btrfs_del_item(trans, root, path);
574 BUG_ON(ret); 587 if (ret)
588 goto out;
575 if (key.offset == bytenr) 589 if (key.offset == bytenr)
576 break; 590 break;
577 } else if (key.offset < bytenr && csum_end > end_byte) { 591 } else if (key.offset < bytenr && csum_end > end_byte) {
@@ -623,11 +637,12 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
623 if (key.offset < bytenr) 637 if (key.offset < bytenr)
624 break; 638 break;
625 } 639 }
626 btrfs_release_path(root, path); 640 btrfs_release_path(path);
627 } 641 }
642 ret = 0;
628out: 643out:
629 btrfs_free_path(path); 644 btrfs_free_path(path);
630 return 0; 645 return ret;
631} 646}
632 647
633int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, 648int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
@@ -714,7 +729,7 @@ again:
714 * at this point, we know the tree has an item, but it isn't big 729 * at this point, we know the tree has an item, but it isn't big
715 * enough yet to put our csum in. Grow it 730 * enough yet to put our csum in. Grow it
716 */ 731 */
717 btrfs_release_path(root, path); 732 btrfs_release_path(path);
718 ret = btrfs_search_slot(trans, root, &file_key, path, 733 ret = btrfs_search_slot(trans, root, &file_key, path,
719 csum_size, 1); 734 csum_size, 1);
720 if (ret < 0) 735 if (ret < 0)
@@ -753,12 +768,11 @@ again:
753 goto insert; 768 goto insert;
754 769
755 ret = btrfs_extend_item(trans, root, path, diff); 770 ret = btrfs_extend_item(trans, root, path, diff);
756 BUG_ON(ret);
757 goto csum; 771 goto csum;
758 } 772 }
759 773
760insert: 774insert:
761 btrfs_release_path(root, path); 775 btrfs_release_path(path);
762 csum_offset = 0; 776 csum_offset = 0;
763 if (found_next) { 777 if (found_next) {
764 u64 tmp = total_bytes + root->sectorsize; 778 u64 tmp = total_bytes + root->sectorsize;
@@ -842,7 +856,7 @@ next_sector:
842 } 856 }
843 btrfs_mark_buffer_dirty(path->nodes[0]); 857 btrfs_mark_buffer_dirty(path->nodes[0]);
844 if (total_bytes < sums->len) { 858 if (total_bytes < sums->len) {
845 btrfs_release_path(root, path); 859 btrfs_release_path(path);
846 cond_resched(); 860 cond_resched();
847 goto again; 861 goto again;
848 } 862 }
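
The file-item.c hunks above replace BUG_ON() on allocation failure with a
plain -ENOMEM return and route item-deletion errors to a single exit
label instead of crashing. A condensed kernel-style sketch of the shape
these functions converge on (do_work() is a hypothetical placeholder):

static int example_op(void)
{
        struct btrfs_path *path;
        int ret;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;         /* was BUG_ON(!path) */

        ret = do_work(path);            /* hypothetical helper */
        if (ret)
                goto out;               /* propagate, don't BUG_ON */

        ret = 0;
out:
        btrfs_free_path(path);
        return ret;
}
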
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index e354c33df082..fa4ef18b66b1 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -24,6 +24,7 @@
24#include <linux/string.h> 24#include <linux/string.h>
25#include <linux/backing-dev.h> 25#include <linux/backing-dev.h>
26#include <linux/mpage.h> 26#include <linux/mpage.h>
27#include <linux/falloc.h>
27#include <linux/swap.h> 28#include <linux/swap.h>
28#include <linux/writeback.h> 29#include <linux/writeback.h>
29#include <linux/statfs.h> 30#include <linux/statfs.h>
@@ -39,16 +40,274 @@
39#include "locking.h" 40#include "locking.h"
40#include "compat.h" 41#include "compat.h"
41 42
43/*
44 * when auto defrag is enabled we
45 * queue up these defrag structs to remember which
46 * inodes need defragging passes
47 */
48struct inode_defrag {
49 struct rb_node rb_node;
50 /* objectid */
51 u64 ino;
52 /*
53 * transid where the defrag was added, we search for
54 * extents newer than this
55 */
56 u64 transid;
57
58 /* root objectid */
59 u64 root;
60
61 /* last offset we were able to defrag */
62 u64 last_offset;
63
64 /* if we've wrapped around back to zero once already */
65 int cycled;
66};
67
 68/* insert a record for an inode into the defrag tree. The lock
69 * must be held already
70 *
71 * If you're inserting a record for an older transid than an
72 * existing record, the transid already in the tree is lowered
73 *
 74 * If an existing record is found, the defrag item you
75 * pass in is freed
76 */
77static int __btrfs_add_inode_defrag(struct inode *inode,
78 struct inode_defrag *defrag)
79{
80 struct btrfs_root *root = BTRFS_I(inode)->root;
81 struct inode_defrag *entry;
82 struct rb_node **p;
83 struct rb_node *parent = NULL;
84
85 p = &root->fs_info->defrag_inodes.rb_node;
86 while (*p) {
87 parent = *p;
88 entry = rb_entry(parent, struct inode_defrag, rb_node);
89
90 if (defrag->ino < entry->ino)
91 p = &parent->rb_left;
92 else if (defrag->ino > entry->ino)
93 p = &parent->rb_right;
94 else {
95 /* if we're reinserting an entry for
96 * an old defrag run, make sure to
97 * lower the transid of our existing record
98 */
99 if (defrag->transid < entry->transid)
100 entry->transid = defrag->transid;
101 if (defrag->last_offset > entry->last_offset)
102 entry->last_offset = defrag->last_offset;
103 goto exists;
104 }
105 }
106 BTRFS_I(inode)->in_defrag = 1;
107 rb_link_node(&defrag->rb_node, parent, p);
108 rb_insert_color(&defrag->rb_node, &root->fs_info->defrag_inodes);
109 return 0;
110
111exists:
112 kfree(defrag);
113 return 0;
114
115}
116
117/*
118 * insert a defrag record for this inode if auto defrag is
119 * enabled
120 */
121int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
122 struct inode *inode)
123{
124 struct btrfs_root *root = BTRFS_I(inode)->root;
125 struct inode_defrag *defrag;
126 int ret = 0;
127 u64 transid;
128
129 if (!btrfs_test_opt(root, AUTO_DEFRAG))
130 return 0;
131
132 if (btrfs_fs_closing(root->fs_info))
133 return 0;
134
135 if (BTRFS_I(inode)->in_defrag)
136 return 0;
137
138 if (trans)
139 transid = trans->transid;
140 else
141 transid = BTRFS_I(inode)->root->last_trans;
142
143 defrag = kzalloc(sizeof(*defrag), GFP_NOFS);
144 if (!defrag)
145 return -ENOMEM;
146
147 defrag->ino = btrfs_ino(inode);
148 defrag->transid = transid;
149 defrag->root = root->root_key.objectid;
150
151 spin_lock(&root->fs_info->defrag_inodes_lock);
152 if (!BTRFS_I(inode)->in_defrag)
153 ret = __btrfs_add_inode_defrag(inode, defrag);
154 spin_unlock(&root->fs_info->defrag_inodes_lock);
155 return ret;
156}
157
158/*
159 * must be called with the defrag_inodes lock held
160 */
161struct inode_defrag *btrfs_find_defrag_inode(struct btrfs_fs_info *info, u64 ino,
162 struct rb_node **next)
163{
164 struct inode_defrag *entry = NULL;
165 struct rb_node *p;
166 struct rb_node *parent = NULL;
167
168 p = info->defrag_inodes.rb_node;
169 while (p) {
170 parent = p;
171 entry = rb_entry(parent, struct inode_defrag, rb_node);
172
173 if (ino < entry->ino)
174 p = parent->rb_left;
175 else if (ino > entry->ino)
176 p = parent->rb_right;
177 else
178 return entry;
179 }
180
181 if (next) {
182 while (parent && ino > entry->ino) {
183 parent = rb_next(parent);
184 entry = rb_entry(parent, struct inode_defrag, rb_node);
185 }
186 *next = parent;
187 }
188 return NULL;
189}
190
191/*
192 * run through the list of inodes in the FS that need
193 * defragging
194 */
195int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
196{
197 struct inode_defrag *defrag;
198 struct btrfs_root *inode_root;
199 struct inode *inode;
200 struct rb_node *n;
201 struct btrfs_key key;
202 struct btrfs_ioctl_defrag_range_args range;
203 u64 first_ino = 0;
204 int num_defrag;
205 int defrag_batch = 1024;
206
207 memset(&range, 0, sizeof(range));
208 range.len = (u64)-1;
209
210 atomic_inc(&fs_info->defrag_running);
211 spin_lock(&fs_info->defrag_inodes_lock);
212 while (1) {
213 n = NULL;
214
215 /* find an inode to defrag */
216 defrag = btrfs_find_defrag_inode(fs_info, first_ino, &n);
217 if (!defrag) {
218 if (n)
219 defrag = rb_entry(n, struct inode_defrag, rb_node);
220 else if (first_ino) {
221 first_ino = 0;
222 continue;
223 } else {
224 break;
225 }
226 }
227
228 /* remove it from the rbtree */
229 first_ino = defrag->ino + 1;
230 rb_erase(&defrag->rb_node, &fs_info->defrag_inodes);
231
232 if (btrfs_fs_closing(fs_info))
233 goto next_free;
234
235 spin_unlock(&fs_info->defrag_inodes_lock);
236
237 /* get the inode */
238 key.objectid = defrag->root;
239 btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
240 key.offset = (u64)-1;
241 inode_root = btrfs_read_fs_root_no_name(fs_info, &key);
242 if (IS_ERR(inode_root))
243 goto next;
244
245 key.objectid = defrag->ino;
246 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
247 key.offset = 0;
248
249 inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL);
250 if (IS_ERR(inode))
251 goto next;
252
253 /* do a chunk of defrag */
254 BTRFS_I(inode)->in_defrag = 0;
255 range.start = defrag->last_offset;
256 num_defrag = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
257 defrag_batch);
258 /*
259 * if we filled the whole defrag batch, there
260 * must be more work to do. Queue this defrag
261 * again
262 */
263 if (num_defrag == defrag_batch) {
264 defrag->last_offset = range.start;
265 __btrfs_add_inode_defrag(inode, defrag);
266 /*
267 * we don't want to kfree defrag, we added it back to
268 * the rbtree
269 */
270 defrag = NULL;
271 } else if (defrag->last_offset && !defrag->cycled) {
272 /*
273 * we didn't fill our defrag batch, but
274 * we didn't start at zero. Make sure we loop
275 * around to the start of the file.
276 */
277 defrag->last_offset = 0;
278 defrag->cycled = 1;
279 __btrfs_add_inode_defrag(inode, defrag);
280 defrag = NULL;
281 }
282
283 iput(inode);
284next:
285 spin_lock(&fs_info->defrag_inodes_lock);
286next_free:
287 kfree(defrag);
288 }
289 spin_unlock(&fs_info->defrag_inodes_lock);
290
291 atomic_dec(&fs_info->defrag_running);
292
293 /*
294 * during unmount, we use the transaction_wait queue to
295 * wait for the defragger to stop
296 */
297 wake_up(&fs_info->transaction_wait);
298 return 0;
299}
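Each pass above does the same work the defrag ioctl does for a single file, driven by the same btrfs_ioctl_defrag_range_args. A minimal userspace counterpart, assuming those definitions are available from <linux/btrfs.h> (older userspace copied them from the kernel's fs/btrfs/ioctl.h):

#include <fcntl.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/btrfs.h>

int defrag_whole_file(const char *path)
{
	struct btrfs_ioctl_defrag_range_args range;
	int fd, ret;

	fd = open(path, O_RDWR);
	if (fd < 0)
		return -1;

	memset(&range, 0, sizeof(range));
	range.start = 0;
	range.len = (__u64)-1;	/* whole file, as in the kernel loop above */

	ret = ioctl(fd, BTRFS_IOC_DEFRAG_RANGE, &range);
	close(fd);
	return ret;
}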
42 300
43/* simple helper to fault in pages and copy. This should go away 301/* simple helper to fault in pages and copy. This should go away
44 * and be replaced with calls into generic code. 302 * and be replaced with calls into generic code.
45 */ 303 */
46static noinline int btrfs_copy_from_user(loff_t pos, int num_pages, 304static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
47 int write_bytes, 305 size_t write_bytes,
48 struct page **prepared_pages, 306 struct page **prepared_pages,
49 struct iov_iter *i) 307 struct iov_iter *i)
50{ 308{
51 size_t copied; 309 size_t copied = 0;
310 size_t total_copied = 0;
52 int pg = 0; 311 int pg = 0;
53 int offset = pos & (PAGE_CACHE_SIZE - 1); 312 int offset = pos & (PAGE_CACHE_SIZE - 1);
54 313
@@ -56,23 +315,38 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
56 size_t count = min_t(size_t, 315 size_t count = min_t(size_t,
57 PAGE_CACHE_SIZE - offset, write_bytes); 316 PAGE_CACHE_SIZE - offset, write_bytes);
58 struct page *page = prepared_pages[pg]; 317 struct page *page = prepared_pages[pg];
59again: 318 /*
60 if (unlikely(iov_iter_fault_in_readable(i, count))) 319 * Copy data from userspace to the current page
61 return -EFAULT; 320 *
62 321 * Disable pagefault to avoid recursive lock since
63 /* Copy data from userspace to the current page */ 322 * the pages are already locked
64 copied = iov_iter_copy_from_user(page, i, offset, count); 323 */
324 pagefault_disable();
325 copied = iov_iter_copy_from_user_atomic(page, i, offset, count);
326 pagefault_enable();
65 327
66 /* Flush processor's dcache for this page */ 328 /* Flush processor's dcache for this page */
67 flush_dcache_page(page); 329 flush_dcache_page(page);
330
331 /*
332 * if we get a partial write, we can end up with
333 * partially up to date pages. These add
334 * a lot of complexity, so make sure they don't
335 * happen by forcing this copy to be retried.
336 *
337 * The rest of the btrfs_file_write code will fall
338 * back to page at a time copies after we return 0.
339 */
340 if (!PageUptodate(page) && copied < count)
341 copied = 0;
342
68 iov_iter_advance(i, copied); 343 iov_iter_advance(i, copied);
69 write_bytes -= copied; 344 write_bytes -= copied;
345 total_copied += copied;
70 346
71 if (unlikely(copied == 0)) { 347 /* Return to btrfs_file_aio_write to fault page */
72 count = min_t(size_t, PAGE_CACHE_SIZE - offset, 348 if (unlikely(copied == 0))
73 iov_iter_single_seg_count(i)); 349 break;
74 goto again;
75 }
76 350
77 if (unlikely(copied < PAGE_CACHE_SIZE - offset)) { 351 if (unlikely(copied < PAGE_CACHE_SIZE - offset)) {
78 offset += copied; 352 offset += copied;
@@ -81,18 +355,16 @@ again:
81 offset = 0; 355 offset = 0;
82 } 356 }
83 } 357 }
84 return 0; 358 return total_copied;
85} 359}
86 360
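The retry loop moved out of this helper: it now copies what it can with page faults disabled and returns the running total, letting the caller fault pages in and call again. A self-contained sketch of that convention, with copy_chunk() standing in for iov_iter_copy_from_user_atomic():

#include <stddef.h>
#include <string.h>

/* stands in for iov_iter_copy_from_user_atomic(); copies at most 8
 * bytes per call to mimic a partial copy */
static size_t copy_chunk(void *dst, const void *src, size_t len)
{
	size_t n = len < 8 ? len : 8;

	memcpy(dst, src, n);
	return n;
}

static size_t copy_all(void *dst, const void *src, size_t len)
{
	size_t total_copied = 0;

	while (len > 0) {
		size_t copied = copy_chunk((char *)dst + total_copied,
					   (const char *)src + total_copied,
					   len);

		total_copied += copied;
		len -= copied;
		/* zero bytes copied means the source page was not
		 * faulted in: return the running total so the caller
		 * can fault it in and retry, as btrfs_copy_from_user
		 * now does */
		if (copied == 0)
			break;
	}
	return total_copied;
}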
87/* 361/*
88 * unlocks pages after btrfs_file_write is done with them 362 * unlocks pages after btrfs_file_write is done with them
89 */ 363 */
90static noinline void btrfs_drop_pages(struct page **pages, size_t num_pages) 364void btrfs_drop_pages(struct page **pages, size_t num_pages)
91{ 365{
92 size_t i; 366 size_t i;
93 for (i = 0; i < num_pages; i++) { 367 for (i = 0; i < num_pages; i++) {
94 if (!pages[i])
95 break;
96 /* page checked is some magic around finding pages that 368 /* page checked is some magic around finding pages that
97 * have been modified without going through btrfs_set_page_dirty 369 * have been modified without going through btrfs_set_page_dirty
98 * clear it here 370 * clear it here
@@ -112,17 +384,13 @@ static noinline void btrfs_drop_pages(struct page **pages, size_t num_pages)
112 * this also makes the decision about creating an inline extent vs 384 * this also makes the decision about creating an inline extent vs
113 * doing real data extents, marking pages dirty and delalloc as required. 385 * doing real data extents, marking pages dirty and delalloc as required.
114 */ 386 */
115static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans, 387int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
116 struct btrfs_root *root, 388 struct page **pages, size_t num_pages,
117 struct file *file, 389 loff_t pos, size_t write_bytes,
118 struct page **pages, 390 struct extent_state **cached)
119 size_t num_pages,
120 loff_t pos,
121 size_t write_bytes)
122{ 391{
123 int err = 0; 392 int err = 0;
124 int i; 393 int i;
125 struct inode *inode = fdentry(file)->d_inode;
126 u64 num_bytes; 394 u64 num_bytes;
127 u64 start_pos; 395 u64 start_pos;
128 u64 end_of_last_block; 396 u64 end_of_last_block;
@@ -135,8 +403,9 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
135 403
136 end_of_last_block = start_pos + num_bytes - 1; 404 end_of_last_block = start_pos + num_bytes - 1;
137 err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, 405 err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
138 NULL); 406 cached);
139 BUG_ON(err); 407 if (err)
408 return err;
140 409
141 for (i = 0; i < num_pages; i++) { 410 for (i = 0; i < num_pages; i++) {
142 struct page *p = pages[i]; 411 struct page *p = pages[i];
@@ -144,13 +413,14 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
144 ClearPageChecked(p); 413 ClearPageChecked(p);
145 set_page_dirty(p); 414 set_page_dirty(p);
146 } 415 }
147 if (end_pos > isize) { 416
417 /*
418 * we've only changed i_size in ram, and we haven't updated
419 * the disk i_size. There is no need to log the inode
420 * at this time.
421 */
422 if (end_pos > isize)
148 i_size_write(inode, end_pos); 423 i_size_write(inode, end_pos);
149 /* we've only changed i_size in ram, and we haven't updated
150 * the disk i_size. There is no need to log the inode
151 * at this time.
152 */
153 }
154 return 0; 424 return 0;
155} 425}
156 426
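btrfs_dirty_pages derives start_pos and num_bytes by rounding pos down and the write length up to sector boundaries (the computation sits just above this hunk). A worked example of that arithmetic with a 4K block size:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t blocksize = 4096;	/* root->sectorsize in the kernel */
	uint64_t pos = 5000, write_bytes = 3000;

	uint64_t start_pos = pos & ~(blocksize - 1);		 /* 4096 */
	uint64_t num_bytes = (write_bytes + pos - start_pos +
			      blocksize - 1) & ~(blocksize - 1); /* 4096 */

	/* the write touches bytes 5000..7999, so one whole block past
	 * start_pos covers it: end_of_last_block = 8191 */
	printf("start_pos=%llu num_bytes=%llu end_of_last_block=%llu\n",
	       (unsigned long long)start_pos,
	       (unsigned long long)num_bytes,
	       (unsigned long long)(start_pos + num_bytes - 1));
	return 0;
}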
@@ -178,9 +448,10 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
178 } 448 }
179 while (1) { 449 while (1) {
180 if (!split) 450 if (!split)
181 split = alloc_extent_map(GFP_NOFS); 451 split = alloc_extent_map();
182 if (!split2) 452 if (!split2)
183 split2 = alloc_extent_map(GFP_NOFS); 453 split2 = alloc_extent_map();
454 BUG_ON(!split || !split2);
184 455
185 write_lock(&em_tree->lock); 456 write_lock(&em_tree->lock);
186 em = lookup_extent_mapping(em_tree, start, len); 457 em = lookup_extent_mapping(em_tree, start, len);
@@ -220,6 +491,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
220 491
221 split->bdev = em->bdev; 492 split->bdev = em->bdev;
222 split->flags = flags; 493 split->flags = flags;
494 split->compress_type = em->compress_type;
223 ret = add_extent_mapping(em_tree, split); 495 ret = add_extent_mapping(em_tree, split);
224 BUG_ON(ret); 496 BUG_ON(ret);
225 free_extent_map(split); 497 free_extent_map(split);
@@ -234,6 +506,7 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
234 split->len = em->start + em->len - (start + len); 506 split->len = em->start + em->len - (start + len);
235 split->bdev = em->bdev; 507 split->bdev = em->bdev;
236 split->flags = flags; 508 split->flags = flags;
509 split->compress_type = em->compress_type;
237 510
238 if (compressed) { 511 if (compressed) {
239 split->block_len = em->block_len; 512 split->block_len = em->block_len;
@@ -282,6 +555,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
282 struct btrfs_path *path; 555 struct btrfs_path *path;
283 struct btrfs_key key; 556 struct btrfs_key key;
284 struct btrfs_key new_key; 557 struct btrfs_key new_key;
558 u64 ino = btrfs_ino(inode);
285 u64 search_start = start; 559 u64 search_start = start;
286 u64 disk_bytenr = 0; 560 u64 disk_bytenr = 0;
287 u64 num_bytes = 0; 561 u64 num_bytes = 0;
@@ -302,14 +576,14 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode,
302 576
303 while (1) { 577 while (1) {
304 recow = 0; 578 recow = 0;
305 ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino, 579 ret = btrfs_lookup_file_extent(trans, root, path, ino,
306 search_start, -1); 580 search_start, -1);
307 if (ret < 0) 581 if (ret < 0)
308 break; 582 break;
309 if (ret > 0 && path->slots[0] > 0 && search_start == start) { 583 if (ret > 0 && path->slots[0] > 0 && search_start == start) {
310 leaf = path->nodes[0]; 584 leaf = path->nodes[0];
311 btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1); 585 btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
312 if (key.objectid == inode->i_ino && 586 if (key.objectid == ino &&
313 key.type == BTRFS_EXTENT_DATA_KEY) 587 key.type == BTRFS_EXTENT_DATA_KEY)
314 path->slots[0]--; 588 path->slots[0]--;
315 } 589 }
@@ -330,7 +604,7 @@ next_slot:
330 } 604 }
331 605
332 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 606 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
333 if (key.objectid > inode->i_ino || 607 if (key.objectid > ino ||
334 key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end) 608 key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end)
335 break; 609 break;
336 610
@@ -360,7 +634,7 @@ next_slot:
360 634
361 search_start = max(key.offset, start); 635 search_start = max(key.offset, start);
362 if (recow) { 636 if (recow) {
363 btrfs_release_path(root, path); 637 btrfs_release_path(path);
364 continue; 638 continue;
365 } 639 }
366 640
@@ -377,7 +651,7 @@ next_slot:
377 ret = btrfs_duplicate_item(trans, root, path, 651 ret = btrfs_duplicate_item(trans, root, path,
378 &new_key); 652 &new_key);
379 if (ret == -EAGAIN) { 653 if (ret == -EAGAIN) {
380 btrfs_release_path(root, path); 654 btrfs_release_path(path);
381 continue; 655 continue;
382 } 656 }
383 if (ret < 0) 657 if (ret < 0)
@@ -500,7 +774,7 @@ next_slot:
500 del_nr = 0; 774 del_nr = 0;
501 del_slot = 0; 775 del_slot = 0;
502 776
503 btrfs_release_path(root, path); 777 btrfs_release_path(path);
504 continue; 778 continue;
505 } 779 }
506 780
@@ -576,6 +850,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
576 int del_slot = 0; 850 int del_slot = 0;
577 int recow; 851 int recow;
578 int ret; 852 int ret;
853 u64 ino = btrfs_ino(inode);
579 854
580 btrfs_drop_extent_cache(inode, start, end - 1, 0); 855 btrfs_drop_extent_cache(inode, start, end - 1, 0);
581 856
@@ -584,18 +859,19 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
584again: 859again:
585 recow = 0; 860 recow = 0;
586 split = start; 861 split = start;
587 key.objectid = inode->i_ino; 862 key.objectid = ino;
588 key.type = BTRFS_EXTENT_DATA_KEY; 863 key.type = BTRFS_EXTENT_DATA_KEY;
589 key.offset = split; 864 key.offset = split;
590 865
591 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 866 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
867 if (ret < 0)
868 goto out;
592 if (ret > 0 && path->slots[0] > 0) 869 if (ret > 0 && path->slots[0] > 0)
593 path->slots[0]--; 870 path->slots[0]--;
594 871
595 leaf = path->nodes[0]; 872 leaf = path->nodes[0];
596 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 873 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
597 BUG_ON(key.objectid != inode->i_ino || 874 BUG_ON(key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY);
598 key.type != BTRFS_EXTENT_DATA_KEY);
599 fi = btrfs_item_ptr(leaf, path->slots[0], 875 fi = btrfs_item_ptr(leaf, path->slots[0],
600 struct btrfs_file_extent_item); 876 struct btrfs_file_extent_item);
601 BUG_ON(btrfs_file_extent_type(leaf, fi) != 877 BUG_ON(btrfs_file_extent_type(leaf, fi) !=
@@ -612,7 +888,7 @@ again:
612 other_start = 0; 888 other_start = 0;
613 other_end = start; 889 other_end = start;
614 if (extent_mergeable(leaf, path->slots[0] - 1, 890 if (extent_mergeable(leaf, path->slots[0] - 1,
615 inode->i_ino, bytenr, orig_offset, 891 ino, bytenr, orig_offset,
616 &other_start, &other_end)) { 892 &other_start, &other_end)) {
617 new_key.offset = end; 893 new_key.offset = end;
618 btrfs_set_item_key_safe(trans, root, path, &new_key); 894 btrfs_set_item_key_safe(trans, root, path, &new_key);
@@ -635,7 +911,7 @@ again:
635 other_start = end; 911 other_start = end;
636 other_end = 0; 912 other_end = 0;
637 if (extent_mergeable(leaf, path->slots[0] + 1, 913 if (extent_mergeable(leaf, path->slots[0] + 1,
638 inode->i_ino, bytenr, orig_offset, 914 ino, bytenr, orig_offset,
639 &other_start, &other_end)) { 915 &other_start, &other_end)) {
640 fi = btrfs_item_ptr(leaf, path->slots[0], 916 fi = btrfs_item_ptr(leaf, path->slots[0],
641 struct btrfs_file_extent_item); 917 struct btrfs_file_extent_item);
@@ -663,7 +939,7 @@ again:
663 new_key.offset = split; 939 new_key.offset = split;
664 ret = btrfs_duplicate_item(trans, root, path, &new_key); 940 ret = btrfs_duplicate_item(trans, root, path, &new_key);
665 if (ret == -EAGAIN) { 941 if (ret == -EAGAIN) {
666 btrfs_release_path(root, path); 942 btrfs_release_path(path);
667 goto again; 943 goto again;
668 } 944 }
669 BUG_ON(ret < 0); 945 BUG_ON(ret < 0);
@@ -684,7 +960,7 @@ again:
684 960
685 ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, 961 ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0,
686 root->root_key.objectid, 962 root->root_key.objectid,
687 inode->i_ino, orig_offset); 963 ino, orig_offset);
688 BUG_ON(ret); 964 BUG_ON(ret);
689 965
690 if (split == start) { 966 if (split == start) {
@@ -700,10 +976,10 @@ again:
700 other_start = end; 976 other_start = end;
701 other_end = 0; 977 other_end = 0;
702 if (extent_mergeable(leaf, path->slots[0] + 1, 978 if (extent_mergeable(leaf, path->slots[0] + 1,
703 inode->i_ino, bytenr, orig_offset, 979 ino, bytenr, orig_offset,
704 &other_start, &other_end)) { 980 &other_start, &other_end)) {
705 if (recow) { 981 if (recow) {
706 btrfs_release_path(root, path); 982 btrfs_release_path(path);
707 goto again; 983 goto again;
708 } 984 }
709 extent_end = other_end; 985 extent_end = other_end;
@@ -711,16 +987,16 @@ again:
711 del_nr++; 987 del_nr++;
712 ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 988 ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
713 0, root->root_key.objectid, 989 0, root->root_key.objectid,
714 inode->i_ino, orig_offset); 990 ino, orig_offset);
715 BUG_ON(ret); 991 BUG_ON(ret);
716 } 992 }
717 other_start = 0; 993 other_start = 0;
718 other_end = start; 994 other_end = start;
719 if (extent_mergeable(leaf, path->slots[0] - 1, 995 if (extent_mergeable(leaf, path->slots[0] - 1,
720 inode->i_ino, bytenr, orig_offset, 996 ino, bytenr, orig_offset,
721 &other_start, &other_end)) { 997 &other_start, &other_end)) {
722 if (recow) { 998 if (recow) {
723 btrfs_release_path(root, path); 999 btrfs_release_path(path);
724 goto again; 1000 goto again;
725 } 1001 }
726 key.offset = other_start; 1002 key.offset = other_start;
@@ -728,7 +1004,7 @@ again:
728 del_nr++; 1004 del_nr++;
729 ret = btrfs_free_extent(trans, root, bytenr, num_bytes, 1005 ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
730 0, root->root_key.objectid, 1006 0, root->root_key.objectid,
731 inode->i_ino, orig_offset); 1007 ino, orig_offset);
732 BUG_ON(ret); 1008 BUG_ON(ret);
733 } 1009 }
734 if (del_nr == 0) { 1010 if (del_nr == 0) {
@@ -755,6 +1031,27 @@ out:
755} 1031}
756 1032
757/* 1033/*
1034 * on error we return an unlocked page and the error value;
1035 * on success we return a locked page and 0
1036 */
1037static int prepare_uptodate_page(struct page *page, u64 pos)
1038{
1039 int ret = 0;
1040
1041 if ((pos & (PAGE_CACHE_SIZE - 1)) && !PageUptodate(page)) {
1042 ret = btrfs_readpage(NULL, page);
1043 if (ret)
1044 return ret;
1045 lock_page(page);
1046 if (!PageUptodate(page)) {
1047 unlock_page(page);
1048 return -EIO;
1049 }
1050 }
1051 return 0;
1052}
1053
1054/*
758 * this gets pages into the page cache and locks them down, it also properly 1055 * this gets pages into the page cache and locks them down, it also properly
759 * waits for data=ordered extents to finish before allowing the pages to be 1056 * waits for data=ordered extents to finish before allowing the pages to be
760 * modified. 1057 * modified.
@@ -769,6 +1066,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
769 unsigned long index = pos >> PAGE_CACHE_SHIFT; 1066 unsigned long index = pos >> PAGE_CACHE_SHIFT;
770 struct inode *inode = fdentry(file)->d_inode; 1067 struct inode *inode = fdentry(file)->d_inode;
771 int err = 0; 1068 int err = 0;
1069 int faili = 0;
772 u64 start_pos; 1070 u64 start_pos;
773 u64 last_pos; 1071 u64 last_pos;
774 1072
@@ -776,21 +1074,33 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
776 last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT; 1074 last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT;
777 1075
778 if (start_pos > inode->i_size) { 1076 if (start_pos > inode->i_size) {
779 err = btrfs_cont_expand(inode, start_pos); 1077 err = btrfs_cont_expand(inode, i_size_read(inode), start_pos);
780 if (err) 1078 if (err)
781 return err; 1079 return err;
782 } 1080 }
783 1081
784 memset(pages, 0, num_pages * sizeof(struct page *));
785again: 1082again:
786 for (i = 0; i < num_pages; i++) { 1083 for (i = 0; i < num_pages; i++) {
787 pages[i] = grab_cache_page(inode->i_mapping, index + i); 1084 pages[i] = grab_cache_page(inode->i_mapping, index + i);
788 if (!pages[i]) { 1085 if (!pages[i]) {
1086 faili = i - 1;
789 err = -ENOMEM; 1087 err = -ENOMEM;
790 BUG_ON(1); 1088 goto fail;
1089 }
1090
1091 if (i == 0)
1092 err = prepare_uptodate_page(pages[i], pos);
1093 if (i == num_pages - 1)
1094 err = prepare_uptodate_page(pages[i],
1095 pos + write_bytes);
1096 if (err) {
1097 page_cache_release(pages[i]);
1098 faili = i - 1;
1099 goto fail;
791 } 1100 }
792 wait_on_page_writeback(pages[i]); 1101 wait_on_page_writeback(pages[i]);
793 } 1102 }
1103 err = 0;
794 if (start_pos < inode->i_size) { 1104 if (start_pos < inode->i_size) {
795 struct btrfs_ordered_extent *ordered; 1105 struct btrfs_ordered_extent *ordered;
796 lock_extent_bits(&BTRFS_I(inode)->io_tree, 1106 lock_extent_bits(&BTRFS_I(inode)->io_tree,
@@ -830,199 +1140,264 @@ again:
830 WARN_ON(!PageLocked(pages[i])); 1140 WARN_ON(!PageLocked(pages[i]));
831 } 1141 }
832 return 0; 1142 return 0;
1143fail:
1144 while (faili >= 0) {
1145 unlock_page(pages[faili]);
1146 page_cache_release(pages[faili]);
1147 faili--;
1148 }
1149 return err;
1150
833} 1151}
834 1152
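The new fail: path releases only the pages that were actually acquired, walking faili backwards instead of hitting the old BUG_ON(). The same unwind shape in a self-contained sketch, with malloc/free standing in for grab_cache_page and unlock/release:

#include <stdlib.h>

static int grab_all(void **slots, int n)
{
	int i, faili = -1;

	for (i = 0; i < n; i++) {
		slots[i] = malloc(64);	/* stands in for grab_cache_page() */
		if (!slots[i]) {
			faili = i - 1;
			goto fail;
		}
	}
	return 0;
fail:
	/* release only what we actually acquired, newest first */
	while (faili >= 0) {
		free(slots[faili]);	/* unlock_page + page_cache_release */
		slots[faili] = NULL;
		faili--;
	}
	return -1;
}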
835static ssize_t btrfs_file_aio_write(struct kiocb *iocb, 1153static noinline ssize_t __btrfs_buffered_write(struct file *file,
836 const struct iovec *iov, 1154 struct iov_iter *i,
837 unsigned long nr_segs, loff_t pos) 1155 loff_t pos)
838{ 1156{
839 struct file *file = iocb->ki_filp;
840 struct inode *inode = fdentry(file)->d_inode; 1157 struct inode *inode = fdentry(file)->d_inode;
841 struct btrfs_root *root = BTRFS_I(inode)->root; 1158 struct btrfs_root *root = BTRFS_I(inode)->root;
842 struct page *pinned[2];
843 struct page **pages = NULL; 1159 struct page **pages = NULL;
844 struct iov_iter i;
845 loff_t *ppos = &iocb->ki_pos;
846 loff_t start_pos;
847 ssize_t num_written = 0;
848 ssize_t err = 0;
849 size_t count;
850 size_t ocount;
851 int ret = 0;
852 int nrptrs;
853 unsigned long first_index; 1160 unsigned long first_index;
854 unsigned long last_index; 1161 unsigned long last_index;
855 int will_write; 1162 size_t num_written = 0;
856 int buffered = 0; 1163 int nrptrs;
1164 int ret = 0;
857 1165
858 will_write = ((file->f_flags & O_DSYNC) || IS_SYNC(inode) || 1166 nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) /
859 (file->f_flags & O_DIRECT)); 1167 PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
1168 (sizeof(struct page *)));
1169 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
1170 if (!pages)
1171 return -ENOMEM;
860 1172
861 pinned[0] = NULL; 1173 first_index = pos >> PAGE_CACHE_SHIFT;
862 pinned[1] = NULL; 1174 last_index = (pos + iov_iter_count(i)) >> PAGE_CACHE_SHIFT;
863 1175
864 start_pos = pos; 1176 while (iov_iter_count(i) > 0) {
1177 size_t offset = pos & (PAGE_CACHE_SIZE - 1);
1178 size_t write_bytes = min(iov_iter_count(i),
1179 nrptrs * (size_t)PAGE_CACHE_SIZE -
1180 offset);
1181 size_t num_pages = (write_bytes + offset +
1182 PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1183 size_t dirty_pages;
1184 size_t copied;
865 1185
866 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); 1186 WARN_ON(num_pages > nrptrs);
867 1187
868 mutex_lock(&inode->i_mutex); 1188 /*
1189 * Fault pages before locking them in prepare_pages
1190 * to avoid recursive lock
1191 */
1192 if (unlikely(iov_iter_fault_in_readable(i, write_bytes))) {
1193 ret = -EFAULT;
1194 break;
1195 }
869 1196
870 err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ); 1197 ret = btrfs_delalloc_reserve_space(inode,
871 if (err) 1198 num_pages << PAGE_CACHE_SHIFT);
872 goto out; 1199 if (ret)
873 count = ocount; 1200 break;
874 1201
875 current->backing_dev_info = inode->i_mapping->backing_dev_info; 1202 /*
876 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); 1203 * This is going to set up the pages array with the number of
877 if (err) 1204 * pages we want, so we don't really need to worry about the
878 goto out; 1205 * contents of pages from loop to loop
1206 */
1207 ret = prepare_pages(root, file, pages, num_pages,
1208 pos, first_index, last_index,
1209 write_bytes);
1210 if (ret) {
1211 btrfs_delalloc_release_space(inode,
1212 num_pages << PAGE_CACHE_SHIFT);
1213 break;
1214 }
879 1215
880 if (count == 0) 1216 copied = btrfs_copy_from_user(pos, num_pages,
881 goto out; 1217 write_bytes, pages, i);
882 1218
883 err = file_remove_suid(file); 1219 /*
884 if (err) 1220 * if we have trouble faulting in the pages, fall
885 goto out; 1221 * back to one page at a time
1222 */
1223 if (copied < write_bytes)
1224 nrptrs = 1;
886 1225
887 file_update_time(file); 1226 if (copied == 0)
888 BTRFS_I(inode)->sequence++; 1227 dirty_pages = 0;
1228 else
1229 dirty_pages = (copied + offset +
1230 PAGE_CACHE_SIZE - 1) >>
1231 PAGE_CACHE_SHIFT;
889 1232
890 if (unlikely(file->f_flags & O_DIRECT)) {
891 num_written = generic_file_direct_write(iocb, iov, &nr_segs,
892 pos, ppos, count,
893 ocount);
894 /* 1233 /*
895 * the generic O_DIRECT will update in-memory i_size after the 1234 * If we had a short copy we need to release the excess delalloc
896 * DIOs are done. But our endio handlers that update the on 1235 * bytes we reserved. We need to increment outstanding_extents
897 * disk i_size never update past the in memory i_size. So we 1236 * because btrfs_delalloc_release_space will decrement it, but
898 * need one more update here to catch any additions to the 1237 * we still have an outstanding extent for the chunk we actually
899 * file 1238 * managed to copy.
900 */ 1239 */
901 if (inode->i_size != BTRFS_I(inode)->disk_i_size) { 1240 if (num_pages > dirty_pages) {
902 btrfs_ordered_update_i_size(inode, inode->i_size, NULL); 1241 if (copied > 0)
903 mark_inode_dirty(inode); 1242 atomic_inc(
1243 &BTRFS_I(inode)->outstanding_extents);
1244 btrfs_delalloc_release_space(inode,
1245 (num_pages - dirty_pages) <<
1246 PAGE_CACHE_SHIFT);
904 } 1247 }
905 1248
906 if (num_written < 0) { 1249 if (copied > 0) {
907 ret = num_written; 1250 ret = btrfs_dirty_pages(root, inode, pages,
908 num_written = 0; 1251 dirty_pages, pos, copied,
909 goto out; 1252 NULL);
910 } else if (num_written == count) { 1253 if (ret) {
911 /* pick up pos changes done by the generic code */ 1254 btrfs_delalloc_release_space(inode,
912 pos = *ppos; 1255 dirty_pages << PAGE_CACHE_SHIFT);
913 goto out; 1256 btrfs_drop_pages(pages, num_pages);
1257 break;
1258 }
914 } 1259 }
915 /* 1260
916 * We are going to do buffered for the rest of the range, so we 1261 btrfs_drop_pages(pages, num_pages);
917 * need to make sure to invalidate the buffered pages when we're 1262
918 * done. 1263 cond_resched();
919 */ 1264
920 buffered = 1; 1265 balance_dirty_pages_ratelimited_nr(inode->i_mapping,
921 pos += num_written; 1266 dirty_pages);
1267 if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
1268 btrfs_btree_balance_dirty(root, 1);
1269 btrfs_throttle(root);
1270
1271 pos += copied;
1272 num_written += copied;
922 } 1273 }
923 1274
924 iov_iter_init(&i, iov, nr_segs, count, num_written); 1275 kfree(pages);
925 nrptrs = min((iov_iter_count(&i) + PAGE_CACHE_SIZE - 1) /
926 PAGE_CACHE_SIZE, PAGE_CACHE_SIZE /
927 (sizeof(struct page *)));
928 pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
929 1276
930 /* generic_write_checks can change our pos */ 1277 return num_written ? num_written : ret;
931 start_pos = pos; 1278}
932 1279
933 first_index = pos >> PAGE_CACHE_SHIFT; 1280static ssize_t __btrfs_direct_write(struct kiocb *iocb,
934 last_index = (pos + iov_iter_count(&i)) >> PAGE_CACHE_SHIFT; 1281 const struct iovec *iov,
1282 unsigned long nr_segs, loff_t pos,
1283 loff_t *ppos, size_t count, size_t ocount)
1284{
1285 struct file *file = iocb->ki_filp;
1286 struct inode *inode = fdentry(file)->d_inode;
1287 struct iov_iter i;
1288 ssize_t written;
1289 ssize_t written_buffered;
1290 loff_t endbyte;
1291 int err;
1292
1293 written = generic_file_direct_write(iocb, iov, &nr_segs, pos, ppos,
1294 count, ocount);
935 1295
936 /* 1296 /*
937 * there are lots of better ways to do this, but this code 1297 * the generic O_DIRECT will update in-memory i_size after the
938 * makes sure the first and last page in the file range are 1298 * DIOs are done. But our endio handlers that update the on
939 * up to date and ready for cow 1299 * disk i_size never update past the in memory i_size. So we
1300 * need one more update here to catch any additions to the
1301 * file
940 */ 1302 */
941 if ((pos & (PAGE_CACHE_SIZE - 1))) { 1303 if (inode->i_size != BTRFS_I(inode)->disk_i_size) {
942 pinned[0] = grab_cache_page(inode->i_mapping, first_index); 1304 btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
943 if (!PageUptodate(pinned[0])) { 1305 mark_inode_dirty(inode);
944 ret = btrfs_readpage(NULL, pinned[0]);
945 BUG_ON(ret);
946 wait_on_page_locked(pinned[0]);
947 } else {
948 unlock_page(pinned[0]);
949 }
950 } 1306 }
951 if ((pos + iov_iter_count(&i)) & (PAGE_CACHE_SIZE - 1)) { 1307
952 pinned[1] = grab_cache_page(inode->i_mapping, last_index); 1308 if (written < 0 || written == count)
953 if (!PageUptodate(pinned[1])) { 1309 return written;
954 ret = btrfs_readpage(NULL, pinned[1]); 1310
955 BUG_ON(ret); 1311 pos += written;
956 wait_on_page_locked(pinned[1]); 1312 count -= written;
957 } else { 1313 iov_iter_init(&i, iov, nr_segs, count, written);
958 unlock_page(pinned[1]); 1314 written_buffered = __btrfs_buffered_write(file, &i, pos);
959 } 1315 if (written_buffered < 0) {
1316 err = written_buffered;
1317 goto out;
960 } 1318 }
1319 endbyte = pos + written_buffered - 1;
1320 err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte);
1321 if (err)
1322 goto out;
1323 written += written_buffered;
1324 *ppos = pos + written_buffered;
1325 invalidate_mapping_pages(file->f_mapping, pos >> PAGE_CACHE_SHIFT,
1326 endbyte >> PAGE_CACHE_SHIFT);
1327out:
1328 return written ? written : err;
1329}
961 1330
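__btrfs_direct_write mirrors what an application sees from the other side: O_DIRECT transfers need aligned buffers, and whatever cannot be written directly falls back to the buffered path and is then flushed and invalidated. A minimal sketch of an aligned direct write from userspace (sizes and names are illustrative):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

int direct_write_block(const char *path, const char *msg)
{
	void *buf;
	int fd, ret = -1;

	/* O_DIRECT wants block-aligned memory and lengths */
	if (posix_memalign(&buf, 4096, 4096))
		return -1;
	memset(buf, 0, 4096);
	strncpy(buf, msg, 4095);

	fd = open(path, O_WRONLY | O_CREAT | O_DIRECT, 0644);
	if (fd >= 0) {
		if (write(fd, buf, 4096) == 4096)
			ret = 0;
		close(fd);
	}
	free(buf);
	return ret;
}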
962 while (iov_iter_count(&i) > 0) { 1331static ssize_t btrfs_file_aio_write(struct kiocb *iocb,
963 size_t offset = pos & (PAGE_CACHE_SIZE - 1); 1332 const struct iovec *iov,
964 size_t write_bytes = min(iov_iter_count(&i), 1333 unsigned long nr_segs, loff_t pos)
965 nrptrs * (size_t)PAGE_CACHE_SIZE - 1334{
966 offset); 1335 struct file *file = iocb->ki_filp;
967 size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >> 1336 struct inode *inode = fdentry(file)->d_inode;
968 PAGE_CACHE_SHIFT; 1337 struct btrfs_root *root = BTRFS_I(inode)->root;
1338 loff_t *ppos = &iocb->ki_pos;
1339 ssize_t num_written = 0;
1340 ssize_t err = 0;
1341 size_t count, ocount;
969 1342
970 WARN_ON(num_pages > nrptrs); 1343 vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
971 memset(pages, 0, sizeof(struct page *) * nrptrs);
972 1344
973 ret = btrfs_delalloc_reserve_space(inode, write_bytes); 1345 mutex_lock(&inode->i_mutex);
974 if (ret)
975 goto out;
976 1346
977 ret = prepare_pages(root, file, pages, num_pages, 1347 err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
978 pos, first_index, last_index, 1348 if (err) {
979 write_bytes); 1349 mutex_unlock(&inode->i_mutex);
980 if (ret) { 1350 goto out;
981 btrfs_delalloc_release_space(inode, write_bytes); 1351 }
982 goto out; 1352 count = ocount;
983 }
984 1353
985 ret = btrfs_copy_from_user(pos, num_pages, 1354 current->backing_dev_info = inode->i_mapping->backing_dev_info;
986 write_bytes, pages, &i); 1355 err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
987 if (ret == 0) { 1356 if (err) {
988 dirty_and_release_pages(NULL, root, file, pages, 1357 mutex_unlock(&inode->i_mutex);
989 num_pages, pos, write_bytes); 1358 goto out;
990 } 1359 }
991 1360
992 btrfs_drop_pages(pages, num_pages); 1361 if (count == 0) {
993 if (ret) { 1362 mutex_unlock(&inode->i_mutex);
994 btrfs_delalloc_release_space(inode, write_bytes); 1363 goto out;
995 goto out; 1364 }
996 }
997 1365
998 if (will_write) { 1366 err = file_remove_suid(file);
999 filemap_fdatawrite_range(inode->i_mapping, pos, 1367 if (err) {
1000 pos + write_bytes - 1); 1368 mutex_unlock(&inode->i_mutex);
1001 } else { 1369 goto out;
1002 balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1370 }
1003 num_pages);
1004 if (num_pages <
1005 (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
1006 btrfs_btree_balance_dirty(root, 1);
1007 btrfs_throttle(root);
1008 }
1009 1371
1010 pos += write_bytes; 1372 /*
1011 num_written += write_bytes; 1373 * If BTRFS flips readonly because of an unexpected internal
1374 * error (fs_info->fs_state now has BTRFS_SUPER_FLAG_ERROR),
1375 * we have to stop this write even though the file was opened
1376 * writable, to keep the FS consistent.
1377 */
1378 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
1379 mutex_unlock(&inode->i_mutex);
1380 err = -EROFS;
1381 goto out;
1382 }
1012 1383
1013 cond_resched(); 1384 file_update_time(file);
1385 BTRFS_I(inode)->sequence++;
1386
1387 if (unlikely(file->f_flags & O_DIRECT)) {
1388 num_written = __btrfs_direct_write(iocb, iov, nr_segs,
1389 pos, ppos, count, ocount);
1390 } else {
1391 struct iov_iter i;
1392
1393 iov_iter_init(&i, iov, nr_segs, count, num_written);
1394
1395 num_written = __btrfs_buffered_write(file, &i, pos);
1396 if (num_written > 0)
1397 *ppos = pos + num_written;
1014 } 1398 }
1015out:
1016 mutex_unlock(&inode->i_mutex);
1017 if (ret)
1018 err = ret;
1019 1399
1020 kfree(pages); 1400 mutex_unlock(&inode->i_mutex);
1021 if (pinned[0])
1022 page_cache_release(pinned[0]);
1023 if (pinned[1])
1024 page_cache_release(pinned[1]);
1025 *ppos = pos;
1026 1401
1027 /* 1402 /*
1028 * we want to make sure fsync finds this change 1403 * we want to make sure fsync finds this change
@@ -1037,36 +1412,12 @@ out:
1037 * one running right now. 1412 * one running right now.
1038 */ 1413 */
1039 BTRFS_I(inode)->last_trans = root->fs_info->generation + 1; 1414 BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
1040 1415 if (num_written > 0 || num_written == -EIOCBQUEUED) {
1041 if (num_written > 0 && will_write) { 1416 err = generic_write_sync(file, pos, num_written);
1042 struct btrfs_trans_handle *trans; 1417 if (err < 0 && num_written > 0)
1043
1044 err = btrfs_wait_ordered_range(inode, start_pos, num_written);
1045 if (err)
1046 num_written = err; 1418 num_written = err;
1047
1048 if ((file->f_flags & O_DSYNC) || IS_SYNC(inode)) {
1049 trans = btrfs_start_transaction(root, 0);
1050 ret = btrfs_log_dentry_safe(trans, root,
1051 file->f_dentry);
1052 if (ret == 0) {
1053 ret = btrfs_sync_log(trans, root);
1054 if (ret == 0)
1055 btrfs_end_transaction(trans, root);
1056 else
1057 btrfs_commit_transaction(trans, root);
1058 } else if (ret != BTRFS_NO_LOG_SYNC) {
1059 btrfs_commit_transaction(trans, root);
1060 } else {
1061 btrfs_end_transaction(trans, root);
1062 }
1063 }
1064 if (file->f_flags & O_DIRECT && buffered) {
1065 invalidate_mapping_pages(inode->i_mapping,
1066 start_pos >> PAGE_CACHE_SHIFT,
1067 (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT);
1068 }
1069 } 1419 }
1420out:
1070 current->backing_dev_info = NULL; 1421 current->backing_dev_info = NULL;
1071 return num_written ? num_written : err; 1422 return num_written ? num_written : err;
1072} 1423}
@@ -1109,6 +1460,7 @@ int btrfs_sync_file(struct file *file, int datasync)
1109 int ret = 0; 1460 int ret = 0;
1110 struct btrfs_trans_handle *trans; 1461 struct btrfs_trans_handle *trans;
1111 1462
1463 trace_btrfs_sync_file(file, datasync);
1112 1464
1113 /* we wait first, since the writeback may change the inode */ 1465 /* we wait first, since the writeback may change the inode */
1114 root->log_batch++; 1466 root->log_batch++;
@@ -1128,14 +1480,12 @@ int btrfs_sync_file(struct file *file, int datasync)
1128 * the current transaction, we can bail out now without any 1480 * the current transaction, we can bail out now without any
1129 * syncing 1481 * syncing
1130 */ 1482 */
1131 mutex_lock(&root->fs_info->trans_mutex); 1483 smp_mb();
1132 if (BTRFS_I(inode)->last_trans <= 1484 if (BTRFS_I(inode)->last_trans <=
1133 root->fs_info->last_trans_committed) { 1485 root->fs_info->last_trans_committed) {
1134 BTRFS_I(inode)->last_trans = 0; 1486 BTRFS_I(inode)->last_trans = 0;
1135 mutex_unlock(&root->fs_info->trans_mutex);
1136 goto out; 1487 goto out;
1137 } 1488 }
1138 mutex_unlock(&root->fs_info->trans_mutex);
1139 1489
1140 /* 1490 /*
1141 * ok we haven't committed the transaction yet, let's do a commit 1491 * ok we haven't committed the transaction yet, let's do a commit
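Dropping the trans_mutex pair for smp_mb() works because, roughly, the fsync path only needs an ordered read of last_trans_committed against its own last_trans. A C11 sketch of the same ordering, with acquire/release standing in for the kernel barrier and illustrative names:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

static _Atomic uint64_t last_trans_committed;

/* committer: everything done before this store is visible to
 * any reader that observes the new transid */
static void publish_commit(uint64_t transid)
{
	atomic_store_explicit(&last_trans_committed, transid,
			      memory_order_release);
}

/* fsync path: if our modification is already committed, skip syncing */
static bool already_committed(uint64_t inode_last_trans)
{
	return inode_last_trans <=
	       atomic_load_explicit(&last_trans_committed,
				    memory_order_acquire);
}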
@@ -1202,6 +1552,118 @@ static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma)
1202 return 0; 1552 return 0;
1203} 1553}
1204 1554
1555static long btrfs_fallocate(struct file *file, int mode,
1556 loff_t offset, loff_t len)
1557{
1558 struct inode *inode = file->f_path.dentry->d_inode;
1559 struct extent_state *cached_state = NULL;
1560 u64 cur_offset;
1561 u64 last_byte;
1562 u64 alloc_start;
1563 u64 alloc_end;
1564 u64 alloc_hint = 0;
1565 u64 locked_end;
1566 u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
1567 struct extent_map *em;
1568 int ret;
1569
1570 alloc_start = offset & ~mask;
1571 alloc_end = (offset + len + mask) & ~mask;
1572
1573 /* We only support the FALLOC_FL_KEEP_SIZE mode */
1574 if (mode & ~FALLOC_FL_KEEP_SIZE)
1575 return -EOPNOTSUPP;
1576
1577 /*
1578 * wait for ordered IO before we have any locks. We'll loop again
1579 * below with the locks held.
1580 */
1581 btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
1582
1583 mutex_lock(&inode->i_mutex);
1584 ret = inode_newsize_ok(inode, alloc_end);
1585 if (ret)
1586 goto out;
1587
1588 if (alloc_start > inode->i_size) {
1589 ret = btrfs_cont_expand(inode, i_size_read(inode),
1590 alloc_start);
1591 if (ret)
1592 goto out;
1593 }
1594
1595 ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
1596 if (ret)
1597 goto out;
1598
1599 locked_end = alloc_end - 1;
1600 while (1) {
1601 struct btrfs_ordered_extent *ordered;
1602
1603 /* the extent lock is ordered inside the running
1604 * transaction
1605 */
1606 lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
1607 locked_end, 0, &cached_state, GFP_NOFS);
1608 ordered = btrfs_lookup_first_ordered_extent(inode,
1609 alloc_end - 1);
1610 if (ordered &&
1611 ordered->file_offset + ordered->len > alloc_start &&
1612 ordered->file_offset < alloc_end) {
1613 btrfs_put_ordered_extent(ordered);
1614 unlock_extent_cached(&BTRFS_I(inode)->io_tree,
1615 alloc_start, locked_end,
1616 &cached_state, GFP_NOFS);
1617 /*
1618 * we can't wait on the range with the transaction
1619 * running or with the extent lock held
1620 */
1621 btrfs_wait_ordered_range(inode, alloc_start,
1622 alloc_end - alloc_start);
1623 } else {
1624 if (ordered)
1625 btrfs_put_ordered_extent(ordered);
1626 break;
1627 }
1628 }
1629
1630 cur_offset = alloc_start;
1631 while (1) {
1632 em = btrfs_get_extent(inode, NULL, 0, cur_offset,
1633 alloc_end - cur_offset, 0);
1634 BUG_ON(IS_ERR_OR_NULL(em));
1635 last_byte = min(extent_map_end(em), alloc_end);
1636 last_byte = (last_byte + mask) & ~mask;
1637 if (em->block_start == EXTENT_MAP_HOLE ||
1638 (cur_offset >= inode->i_size &&
1639 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
1640 ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
1641 last_byte - cur_offset,
1642 1 << inode->i_blkbits,
1643 offset + len,
1644 &alloc_hint);
1645 if (ret < 0) {
1646 free_extent_map(em);
1647 break;
1648 }
1649 }
1650 free_extent_map(em);
1651
1652 cur_offset = last_byte;
1653 if (cur_offset >= alloc_end) {
1654 ret = 0;
1655 break;
1656 }
1657 }
1658 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
1659 &cached_state, GFP_NOFS);
1660
1661 btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
1662out:
1663 mutex_unlock(&inode->i_mutex);
1664 return ret;
1665}
1666
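From userspace this is reached through fallocate(2); since only FALLOC_FL_KEEP_SIZE is accepted here, any other mode bits get -EOPNOTSUPP. A minimal call that preallocates past EOF without growing i_size (offset and length are illustrative):

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>

int prealloc_keep_size(int fd, off_t offset)
{
	/* preallocate 16MiB; i_size is left alone because of KEEP_SIZE */
	return fallocate(fd, FALLOC_FL_KEEP_SIZE, offset, 16 * 1024 * 1024);
}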
1205const struct file_operations btrfs_file_operations = { 1667const struct file_operations btrfs_file_operations = {
1206 .llseek = generic_file_llseek, 1668 .llseek = generic_file_llseek,
1207 .read = do_sync_read, 1669 .read = do_sync_read,
@@ -1213,6 +1675,7 @@ const struct file_operations btrfs_file_operations = {
1213 .open = generic_file_open, 1675 .open = generic_file_open,
1214 .release = btrfs_release_file, 1676 .release = btrfs_release_file,
1215 .fsync = btrfs_sync_file, 1677 .fsync = btrfs_sync_file,
1678 .fallocate = btrfs_fallocate,
1216 .unlocked_ioctl = btrfs_ioctl, 1679 .unlocked_ioctl = btrfs_ioctl,
1217#ifdef CONFIG_COMPAT 1680#ifdef CONFIG_COMPAT
1218 .compat_ioctl = btrfs_ioctl, 1681 .compat_ioctl = btrfs_ioctl,
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index f488fac04d99..bf0d61567f3d 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -23,34 +23,937 @@
23#include "ctree.h" 23#include "ctree.h"
24#include "free-space-cache.h" 24#include "free-space-cache.h"
25#include "transaction.h" 25#include "transaction.h"
26#include "disk-io.h"
27#include "extent_io.h"
28#include "inode-map.h"
26 29
27#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8) 30#define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8)
28#define MAX_CACHE_BYTES_PER_GIG (32 * 1024) 31#define MAX_CACHE_BYTES_PER_GIG (32 * 1024)
29 32
30static inline unsigned long offset_to_bit(u64 bitmap_start, u64 sectorsize, 33static int link_free_space(struct btrfs_free_space_ctl *ctl,
34 struct btrfs_free_space *info);
35
36static struct inode *__lookup_free_space_inode(struct btrfs_root *root,
37 struct btrfs_path *path,
38 u64 offset)
39{
40 struct btrfs_key key;
41 struct btrfs_key location;
42 struct btrfs_disk_key disk_key;
43 struct btrfs_free_space_header *header;
44 struct extent_buffer *leaf;
45 struct inode *inode = NULL;
46 int ret;
47
48 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
49 key.offset = offset;
50 key.type = 0;
51
52 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
53 if (ret < 0)
54 return ERR_PTR(ret);
55 if (ret > 0) {
56 btrfs_release_path(path);
57 return ERR_PTR(-ENOENT);
58 }
59
60 leaf = path->nodes[0];
61 header = btrfs_item_ptr(leaf, path->slots[0],
62 struct btrfs_free_space_header);
63 btrfs_free_space_key(leaf, header, &disk_key);
64 btrfs_disk_key_to_cpu(&location, &disk_key);
65 btrfs_release_path(path);
66
67 inode = btrfs_iget(root->fs_info->sb, &location, root, NULL);
68 if (!inode)
69 return ERR_PTR(-ENOENT);
70 if (IS_ERR(inode))
71 return inode;
72 if (is_bad_inode(inode)) {
73 iput(inode);
74 return ERR_PTR(-ENOENT);
75 }
76
77 inode->i_mapping->flags &= ~__GFP_FS;
78
79 return inode;
80}
81
82struct inode *lookup_free_space_inode(struct btrfs_root *root,
83 struct btrfs_block_group_cache
84 *block_group, struct btrfs_path *path)
85{
86 struct inode *inode = NULL;
87
88 spin_lock(&block_group->lock);
89 if (block_group->inode)
90 inode = igrab(block_group->inode);
91 spin_unlock(&block_group->lock);
92 if (inode)
93 return inode;
94
95 inode = __lookup_free_space_inode(root, path,
96 block_group->key.objectid);
97 if (IS_ERR(inode))
98 return inode;
99
100 spin_lock(&block_group->lock);
101 if (!btrfs_fs_closing(root->fs_info)) {
102 block_group->inode = igrab(inode);
103 block_group->iref = 1;
104 }
105 spin_unlock(&block_group->lock);
106
107 return inode;
108}
109
110int __create_free_space_inode(struct btrfs_root *root,
111 struct btrfs_trans_handle *trans,
112 struct btrfs_path *path, u64 ino, u64 offset)
113{
114 struct btrfs_key key;
115 struct btrfs_disk_key disk_key;
116 struct btrfs_free_space_header *header;
117 struct btrfs_inode_item *inode_item;
118 struct extent_buffer *leaf;
119 int ret;
120
121 ret = btrfs_insert_empty_inode(trans, root, path, ino);
122 if (ret)
123 return ret;
124
125 leaf = path->nodes[0];
126 inode_item = btrfs_item_ptr(leaf, path->slots[0],
127 struct btrfs_inode_item);
128 btrfs_item_key(leaf, &disk_key, path->slots[0]);
129 memset_extent_buffer(leaf, 0, (unsigned long)inode_item,
130 sizeof(*inode_item));
131 btrfs_set_inode_generation(leaf, inode_item, trans->transid);
132 btrfs_set_inode_size(leaf, inode_item, 0);
133 btrfs_set_inode_nbytes(leaf, inode_item, 0);
134 btrfs_set_inode_uid(leaf, inode_item, 0);
135 btrfs_set_inode_gid(leaf, inode_item, 0);
136 btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600);
137 btrfs_set_inode_flags(leaf, inode_item, BTRFS_INODE_NOCOMPRESS |
138 BTRFS_INODE_PREALLOC | BTRFS_INODE_NODATASUM);
139 btrfs_set_inode_nlink(leaf, inode_item, 1);
140 btrfs_set_inode_transid(leaf, inode_item, trans->transid);
141 btrfs_set_inode_block_group(leaf, inode_item, offset);
142 btrfs_mark_buffer_dirty(leaf);
143 btrfs_release_path(path);
144
145 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
146 key.offset = offset;
147 key.type = 0;
148
149 ret = btrfs_insert_empty_item(trans, root, path, &key,
150 sizeof(struct btrfs_free_space_header));
151 if (ret < 0) {
152 btrfs_release_path(path);
153 return ret;
154 }
155 leaf = path->nodes[0];
156 header = btrfs_item_ptr(leaf, path->slots[0],
157 struct btrfs_free_space_header);
158 memset_extent_buffer(leaf, 0, (unsigned long)header, sizeof(*header));
159 btrfs_set_free_space_key(leaf, header, &disk_key);
160 btrfs_mark_buffer_dirty(leaf);
161 btrfs_release_path(path);
162
163 return 0;
164}
165
166int create_free_space_inode(struct btrfs_root *root,
167 struct btrfs_trans_handle *trans,
168 struct btrfs_block_group_cache *block_group,
169 struct btrfs_path *path)
170{
171 int ret;
172 u64 ino;
173
174 ret = btrfs_find_free_objectid(root, &ino);
175 if (ret < 0)
176 return ret;
177
178 return __create_free_space_inode(root, trans, path, ino,
179 block_group->key.objectid);
180}
181
182int btrfs_truncate_free_space_cache(struct btrfs_root *root,
183 struct btrfs_trans_handle *trans,
184 struct btrfs_path *path,
185 struct inode *inode)
186{
187 loff_t oldsize;
188 int ret = 0;
189
190 trans->block_rsv = root->orphan_block_rsv;
191 ret = btrfs_block_rsv_check(trans, root,
192 root->orphan_block_rsv,
193 0, 5);
194 if (ret)
195 return ret;
196
197 oldsize = i_size_read(inode);
198 btrfs_i_size_write(inode, 0);
199 truncate_pagecache(inode, oldsize, 0);
200
201 /*
202 * We don't need an orphan item because truncating the free space cache
203 * will never be split across transactions.
204 */
205 ret = btrfs_truncate_inode_items(trans, root, inode,
206 0, BTRFS_EXTENT_DATA_KEY);
207 if (ret) {
208 WARN_ON(1);
209 return ret;
210 }
211
212 ret = btrfs_update_inode(trans, root, inode);
213 return ret;
214}
215
216static int readahead_cache(struct inode *inode)
217{
218 struct file_ra_state *ra;
219 unsigned long last_index;
220
221 ra = kzalloc(sizeof(*ra), GFP_NOFS);
222 if (!ra)
223 return -ENOMEM;
224
225 file_ra_state_init(ra, inode->i_mapping);
226 last_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT;
227
228 page_cache_sync_readahead(inode->i_mapping, ra, NULL, 0, last_index);
229
230 kfree(ra);
231
232 return 0;
233}
234
235int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
236 struct btrfs_free_space_ctl *ctl,
237 struct btrfs_path *path, u64 offset)
238{
239 struct btrfs_free_space_header *header;
240 struct extent_buffer *leaf;
241 struct page *page;
242 u32 *checksums = NULL, *crc;
243 char *disk_crcs = NULL;
244 struct btrfs_key key;
245 struct list_head bitmaps;
246 u64 num_entries;
247 u64 num_bitmaps;
248 u64 generation;
249 u32 cur_crc = ~(u32)0;
250 pgoff_t index = 0;
251 unsigned long first_page_offset;
252 int num_checksums;
253 int ret = 0;
254
255 INIT_LIST_HEAD(&bitmaps);
256
257 /* Nothing in the space cache, goodbye */
258 if (!i_size_read(inode))
259 goto out;
260
261 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
262 key.offset = offset;
263 key.type = 0;
264
265 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
266 if (ret < 0)
267 goto out;
268 else if (ret > 0) {
269 btrfs_release_path(path);
270 ret = 0;
271 goto out;
272 }
273
274 ret = -1;
275
276 leaf = path->nodes[0];
277 header = btrfs_item_ptr(leaf, path->slots[0],
278 struct btrfs_free_space_header);
279 num_entries = btrfs_free_space_entries(leaf, header);
280 num_bitmaps = btrfs_free_space_bitmaps(leaf, header);
281 generation = btrfs_free_space_generation(leaf, header);
282 btrfs_release_path(path);
283
284 if (BTRFS_I(inode)->generation != generation) {
285 printk(KERN_ERR "btrfs: free space inode generation (%llu) did"
286 " not match free space cache generation (%llu)\n",
287 (unsigned long long)BTRFS_I(inode)->generation,
288 (unsigned long long)generation);
289 goto out;
290 }
291
292 if (!num_entries)
293 goto out;
294
295 /* Setup everything for doing checksumming */
296 num_checksums = i_size_read(inode) / PAGE_CACHE_SIZE;
297 checksums = crc = kzalloc(sizeof(u32) * num_checksums, GFP_NOFS);
298 if (!checksums)
299 goto out;
300 first_page_offset = (sizeof(u32) * num_checksums) + sizeof(u64);
301 disk_crcs = kzalloc(first_page_offset, GFP_NOFS);
302 if (!disk_crcs)
303 goto out;
304
305 ret = readahead_cache(inode);
306 if (ret)
307 goto out;
308
309 while (1) {
310 struct btrfs_free_space_entry *entry;
311 struct btrfs_free_space *e;
312 void *addr;
313 unsigned long offset = 0;
314 unsigned long start_offset = 0;
315 int need_loop = 0;
316
317 if (!num_entries && !num_bitmaps)
318 break;
319
320 if (index == 0) {
321 start_offset = first_page_offset;
322 offset = start_offset;
323 }
324
325 page = grab_cache_page(inode->i_mapping, index);
326 if (!page)
327 goto free_cache;
328
329 if (!PageUptodate(page)) {
330 btrfs_readpage(NULL, page);
331 lock_page(page);
332 if (!PageUptodate(page)) {
333 unlock_page(page);
334 page_cache_release(page);
335 printk(KERN_ERR "btrfs: error reading free "
336 "space cache\n");
337 goto free_cache;
338 }
339 }
340 addr = kmap(page);
341
342 if (index == 0) {
343 u64 *gen;
344
345 memcpy(disk_crcs, addr, first_page_offset);
346 gen = addr + (sizeof(u32) * num_checksums);
347 if (*gen != BTRFS_I(inode)->generation) {
348 printk(KERN_ERR "btrfs: space cache generation"
349 " (%llu) does not match inode (%llu)\n",
350 (unsigned long long)*gen,
351 (unsigned long long)
352 BTRFS_I(inode)->generation);
353 kunmap(page);
354 unlock_page(page);
355 page_cache_release(page);
356 goto free_cache;
357 }
358 crc = (u32 *)disk_crcs;
359 }
360 entry = addr + start_offset;
361
362 /* First lets check our crc before we do anything fun */
363 cur_crc = ~(u32)0;
364 cur_crc = btrfs_csum_data(root, addr + start_offset, cur_crc,
365 PAGE_CACHE_SIZE - start_offset);
366 btrfs_csum_final(cur_crc, (char *)&cur_crc);
367 if (cur_crc != *crc) {
368 printk(KERN_ERR "btrfs: crc mismatch for page %lu\n",
369 index);
370 kunmap(page);
371 unlock_page(page);
372 page_cache_release(page);
373 goto free_cache;
374 }
375 crc++;
376
377 while (1) {
378 if (!num_entries)
379 break;
380
381 need_loop = 1;
382 e = kmem_cache_zalloc(btrfs_free_space_cachep,
383 GFP_NOFS);
384 if (!e) {
385 kunmap(page);
386 unlock_page(page);
387 page_cache_release(page);
388 goto free_cache;
389 }
390
391 e->offset = le64_to_cpu(entry->offset);
392 e->bytes = le64_to_cpu(entry->bytes);
393 if (!e->bytes) {
394 kunmap(page);
395 kmem_cache_free(btrfs_free_space_cachep, e);
396 unlock_page(page);
397 page_cache_release(page);
398 goto free_cache;
399 }
400
401 if (entry->type == BTRFS_FREE_SPACE_EXTENT) {
402 spin_lock(&ctl->tree_lock);
403 ret = link_free_space(ctl, e);
404 spin_unlock(&ctl->tree_lock);
405 if (ret) {
406 printk(KERN_ERR "Duplicate entries in "
407 "free space cache, dumping\n");
408 kunmap(page);
409 unlock_page(page);
410 page_cache_release(page);
411 goto free_cache;
412 }
413 } else {
414 e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
415 if (!e->bitmap) {
416 kunmap(page);
417 kmem_cache_free(
418 btrfs_free_space_cachep, e);
419 unlock_page(page);
420 page_cache_release(page);
421 goto free_cache;
422 }
423 spin_lock(&ctl->tree_lock);
424 ret = link_free_space(ctl, e);
425 ctl->total_bitmaps++;
426 ctl->op->recalc_thresholds(ctl);
427 spin_unlock(&ctl->tree_lock);
428 if (ret) {
429 printk(KERN_ERR "Duplicate entries in "
430 "free space cache, dumping\n");
431 kunmap(page);
432 unlock_page(page);
433 page_cache_release(page);
434 goto free_cache;
435 }
436 list_add_tail(&e->list, &bitmaps);
437 }
438
439 num_entries--;
440 offset += sizeof(struct btrfs_free_space_entry);
441 if (offset + sizeof(struct btrfs_free_space_entry) >=
442 PAGE_CACHE_SIZE)
443 break;
444 entry++;
445 }
446
447 /*
448 * We read an entry out of this page, we need to move on to the
449 * next page.
450 */
451 if (need_loop) {
452 kunmap(page);
453 goto next;
454 }
455
456 /*
457 * The bitmap contents are stored after all the entries, in the
458 * same order that their bitmap entries were added to the cache.
459 */
460 e = list_entry(bitmaps.next, struct btrfs_free_space, list);
461 list_del_init(&e->list);
462 memcpy(e->bitmap, addr, PAGE_CACHE_SIZE);
463 kunmap(page);
464 num_bitmaps--;
465next:
466 unlock_page(page);
467 page_cache_release(page);
468 index++;
469 }
470
471 ret = 1;
472out:
473 kfree(checksums);
474 kfree(disk_crcs);
475 return ret;
476free_cache:
477 __btrfs_remove_free_space_cache(ctl);
478 goto out;
479}
480
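btrfs_csum_data() here is crc32c and btrfs_csum_final() just inverts the result, so the per-page checks above can be reproduced with a plain crc32c. A minimal bit-at-a-time sketch (a table-driven or hardware crc32c would be used in practice):

#include <stddef.h>
#include <stdint.h>

/* raw reflected CRC update, Castagnoli polynomial 0x1EDC6F41
 * (0x82F63B78 reversed); no pre/post inversion, like the kernel core */
static uint32_t crc32c(uint32_t crc, const void *buf, size_t len)
{
	const uint8_t *p = buf;

	while (len--) {
		crc ^= *p++;
		for (int k = 0; k < 8; k++)
			crc = (crc >> 1) ^ (0x82F63B78 & -(crc & 1));
	}
	return crc;
}

/* mirrors the sequence above: seed with ~0, checksum the page past
 * the header, then invert (btrfs_csum_final) before comparing with
 * the stored u32 */
static uint32_t page_csum(const void *page_data, size_t start, size_t size)
{
	uint32_t crc = crc32c(~(uint32_t)0,
			      (const uint8_t *)page_data + start,
			      size - start);
	return ~crc;
}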
481int load_free_space_cache(struct btrfs_fs_info *fs_info,
482 struct btrfs_block_group_cache *block_group)
483{
484 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
485 struct btrfs_root *root = fs_info->tree_root;
486 struct inode *inode;
487 struct btrfs_path *path;
488 int ret;
489 bool matched;
490 u64 used = btrfs_block_group_used(&block_group->item);
491
492 /*
493 * If we're unmounting then just return, since this does a search on the
494 * normal root and not the commit root and we could deadlock.
495 */
496 if (btrfs_fs_closing(fs_info))
497 return 0;
498
499 /*
500 * If this block group has been marked to be cleared for one reason or
501 * another then we can't trust the on disk cache, so just return.
502 */
503 spin_lock(&block_group->lock);
504 if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
505 spin_unlock(&block_group->lock);
506 return 0;
507 }
508 spin_unlock(&block_group->lock);
509
510 path = btrfs_alloc_path();
511 if (!path)
512 return 0;
513
514 inode = lookup_free_space_inode(root, block_group, path);
515 if (IS_ERR(inode)) {
516 btrfs_free_path(path);
517 return 0;
518 }
519
520 ret = __load_free_space_cache(fs_info->tree_root, inode, ctl,
521 path, block_group->key.objectid);
522 btrfs_free_path(path);
523 if (ret <= 0)
524 goto out;
525
526 spin_lock(&ctl->tree_lock);
527 matched = (ctl->free_space == (block_group->key.offset - used -
528 block_group->bytes_super));
529 spin_unlock(&ctl->tree_lock);
530
531 if (!matched) {
532 __btrfs_remove_free_space_cache(ctl);
533 printk(KERN_ERR "block group %llu has the wrong amount of free "
534 "space\n", block_group->key.objectid);
535 ret = -1;
536 }
537out:
538 if (ret < 0) {
539 /* This cache is bogus, make sure it gets cleared */
540 spin_lock(&block_group->lock);
541 block_group->disk_cache_state = BTRFS_DC_CLEAR;
542 spin_unlock(&block_group->lock);
543 ret = 0;
544
545 printk(KERN_ERR "btrfs: failed to load free space cache "
546 "for block group %llu\n", block_group->key.objectid);
547 }
548
549 iput(inode);
550 return ret;
551}
552
553int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
554 struct btrfs_free_space_ctl *ctl,
555 struct btrfs_block_group_cache *block_group,
556 struct btrfs_trans_handle *trans,
557 struct btrfs_path *path, u64 offset)
558{
559 struct btrfs_free_space_header *header;
560 struct extent_buffer *leaf;
561 struct rb_node *node;
562 struct list_head *pos, *n;
563 struct page **pages;
564 struct page *page;
565 struct extent_state *cached_state = NULL;
566 struct btrfs_free_cluster *cluster = NULL;
567 struct extent_io_tree *unpin = NULL;
568 struct list_head bitmap_list;
569 struct btrfs_key key;
570 u64 start, end, len;
571 u64 bytes = 0;
572 u32 *crc, *checksums;
573 unsigned long first_page_offset;
574 int index = 0, num_pages = 0;
575 int entries = 0;
576 int bitmaps = 0;
577 int ret = -1;
578 bool next_page = false;
579 bool out_of_space = false;
580
581 INIT_LIST_HEAD(&bitmap_list);
582
583 node = rb_first(&ctl->free_space_offset);
584 if (!node)
585 return 0;
586
587 if (!i_size_read(inode))
588 return -1;
589
590 num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
591 PAGE_CACHE_SHIFT;
592
593 /* Since the first page has all of our checksums and our generation we
594 * need to calculate the offset into the page at which we can start writing
595 * our entries.
596 */
597 first_page_offset = (sizeof(u32) * num_pages) + sizeof(u64);
598
599 filemap_write_and_wait(inode->i_mapping);
600 btrfs_wait_ordered_range(inode, inode->i_size &
601 ~(root->sectorsize - 1), (u64)-1);
602
603 /* make sure we don't overflow that first page */
604 if (first_page_offset + sizeof(struct btrfs_free_space_entry) >= PAGE_CACHE_SIZE) {
605 /* this is really the same as running out of space, where we also return 0 */
606 printk(KERN_CRIT "Btrfs: free space cache was too big for the crc page\n");
607 ret = 0;
608 goto out_update;
609 }
610
611 /* We need a checksum per page. */
612 crc = checksums = kzalloc(sizeof(u32) * num_pages, GFP_NOFS);
613 if (!crc)
614 return -1;
615
616 pages = kzalloc(sizeof(struct page *) * num_pages, GFP_NOFS);
617 if (!pages) {
618 kfree(crc);
619 return -1;
620 }
621
622 /* Get the cluster for this block_group if it exists */
623 if (block_group && !list_empty(&block_group->cluster_list))
624 cluster = list_entry(block_group->cluster_list.next,
625 struct btrfs_free_cluster,
626 block_group_list);
627
628 /*
629 * We shouldn't have switched the pinned extents yet so this is the
630 * right one
631 */
632 unpin = root->fs_info->pinned_extents;
633
634 /*
635 * Lock all pages first so we can lock the extent safely.
636 *
637 * NOTE: Because we hold the ref the entire time we're going to write to
638 * the page, find_get_page should never fail, so we don't do a check
639 * after find_get_page at this point. Just putting this here so people
640 * know and don't freak out.
641 */
642 while (index < num_pages) {
643 page = grab_cache_page(inode->i_mapping, index);
644 if (!page) {
645 int i;
646
647 for (i = 0; i < index; i++) {
648 unlock_page(pages[i]);
649 page_cache_release(pages[i]);
650 }
651 goto out_free;
652 }
653 pages[index] = page;
654 index++;
655 }
656
657 index = 0;
658 lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
659 0, &cached_state, GFP_NOFS);
660
661 /*
662 * When searching for pinned extents, we need to start at our start
663 * offset.
664 */
665 if (block_group)
666 start = block_group->key.objectid;
667
668 /* Write out the extent entries */
669 do {
670 struct btrfs_free_space_entry *entry;
671 void *addr;
672 unsigned long offset = 0;
673 unsigned long start_offset = 0;
674
675 next_page = false;
676
677 if (index == 0) {
678 start_offset = first_page_offset;
679 offset = start_offset;
680 }
681
682 if (index >= num_pages) {
683 out_of_space = true;
684 break;
685 }
686
687 page = pages[index];
688
689 addr = kmap(page);
690 entry = addr + start_offset;
691
692 memset(addr, 0, PAGE_CACHE_SIZE);
693 while (node && !next_page) {
694 struct btrfs_free_space *e;
695
696 e = rb_entry(node, struct btrfs_free_space, offset_index);
697 entries++;
698
699 entry->offset = cpu_to_le64(e->offset);
700 entry->bytes = cpu_to_le64(e->bytes);
701 if (e->bitmap) {
702 entry->type = BTRFS_FREE_SPACE_BITMAP;
703 list_add_tail(&e->list, &bitmap_list);
704 bitmaps++;
705 } else {
706 entry->type = BTRFS_FREE_SPACE_EXTENT;
707 }
708 node = rb_next(node);
709 if (!node && cluster) {
710 node = rb_first(&cluster->root);
711 cluster = NULL;
712 }
713 offset += sizeof(struct btrfs_free_space_entry);
714 if (offset + sizeof(struct btrfs_free_space_entry) >=
715 PAGE_CACHE_SIZE)
716 next_page = true;
717 entry++;
718 }
719
720 /*
721 * We want to add any pinned extents to our free space cache
722 * so we don't leak the space
723 */
724 while (block_group && !next_page &&
725 (start < block_group->key.objectid +
726 block_group->key.offset)) {
727 ret = find_first_extent_bit(unpin, start, &start, &end,
728 EXTENT_DIRTY);
729 if (ret) {
730 ret = 0;
731 break;
732 }
733
734 /* This pinned extent is out of our range */
735 if (start >= block_group->key.objectid +
736 block_group->key.offset)
737 break;
738
739 len = block_group->key.objectid +
740 block_group->key.offset - start;
741 len = min(len, end + 1 - start);
742
743 entries++;
744 entry->offset = cpu_to_le64(start);
745 entry->bytes = cpu_to_le64(len);
746 entry->type = BTRFS_FREE_SPACE_EXTENT;
747
748 start = end + 1;
749 offset += sizeof(struct btrfs_free_space_entry);
750 if (offset + sizeof(struct btrfs_free_space_entry) >=
751 PAGE_CACHE_SIZE)
752 next_page = true;
753 entry++;
754 }
755 *crc = ~(u32)0;
756 *crc = btrfs_csum_data(root, addr + start_offset, *crc,
757 PAGE_CACHE_SIZE - start_offset);
758 kunmap(page);
759
760 btrfs_csum_final(*crc, (char *)crc);
761 crc++;
762
763 bytes += PAGE_CACHE_SIZE;
764
765 index++;
766 } while (node || next_page);
767
768 /* Write out the bitmaps */
769 list_for_each_safe(pos, n, &bitmap_list) {
770 void *addr;
771 struct btrfs_free_space *entry =
772 list_entry(pos, struct btrfs_free_space, list);
773
774 if (index >= num_pages) {
775 out_of_space = true;
776 break;
777 }
778 page = pages[index];
779
780 addr = kmap(page);
781 memcpy(addr, entry->bitmap, PAGE_CACHE_SIZE);
782 *crc = ~(u32)0;
783 *crc = btrfs_csum_data(root, addr, *crc, PAGE_CACHE_SIZE);
784 kunmap(page);
785 btrfs_csum_final(*crc, (char *)crc);
786 crc++;
787 bytes += PAGE_CACHE_SIZE;
788
789 list_del_init(&entry->list);
790 index++;
791 }
792
793 if (out_of_space) {
794 btrfs_drop_pages(pages, num_pages);
795 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
796 i_size_read(inode) - 1, &cached_state,
797 GFP_NOFS);
798 ret = 0;
799 goto out_free;
800 }
801
802 /* Zero out the rest of the pages just to make sure */
803 while (index < num_pages) {
804 void *addr;
805
806 page = pages[index];
807 addr = kmap(page);
808 memset(addr, 0, PAGE_CACHE_SIZE);
809 kunmap(page);
810 bytes += PAGE_CACHE_SIZE;
811 index++;
812 }
813
814 /* Write the checksums and trans id to the first page */
815 {
816 void *addr;
817 u64 *gen;
818
819 page = pages[0];
820
821 addr = kmap(page);
822 memcpy(addr, checksums, sizeof(u32) * num_pages);
823 gen = addr + (sizeof(u32) * num_pages);
824 *gen = trans->transid;
825 kunmap(page);
826 }
827
828 ret = btrfs_dirty_pages(root, inode, pages, num_pages, 0,
829 bytes, &cached_state);
830 btrfs_drop_pages(pages, num_pages);
831 unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
832 i_size_read(inode) - 1, &cached_state, GFP_NOFS);
833
834 if (ret) {
835 ret = 0;
836 goto out_free;
837 }
838
839 BTRFS_I(inode)->generation = trans->transid;
840
841 filemap_write_and_wait(inode->i_mapping);
842
843 key.objectid = BTRFS_FREE_SPACE_OBJECTID;
844 key.offset = offset;
845 key.type = 0;
846
847 ret = btrfs_search_slot(trans, root, &key, path, 1, 1);
848 if (ret < 0) {
849 ret = -1;
850 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1,
851 EXTENT_DIRTY | EXTENT_DELALLOC |
852 EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS);
853 goto out_free;
854 }
855 leaf = path->nodes[0];
856 if (ret > 0) {
857 struct btrfs_key found_key;
858 BUG_ON(!path->slots[0]);
859 path->slots[0]--;
860 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
861 if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
862 found_key.offset != offset) {
863 ret = -1;
864 clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1,
865 EXTENT_DIRTY | EXTENT_DELALLOC |
866 EXTENT_DO_ACCOUNTING, 0, 0, NULL,
867 GFP_NOFS);
868 btrfs_release_path(path);
869 goto out_free;
870 }
871 }
872 header = btrfs_item_ptr(leaf, path->slots[0],
873 struct btrfs_free_space_header);
874 btrfs_set_free_space_entries(leaf, header, entries);
875 btrfs_set_free_space_bitmaps(leaf, header, bitmaps);
876 btrfs_set_free_space_generation(leaf, header, trans->transid);
877 btrfs_mark_buffer_dirty(leaf);
878 btrfs_release_path(path);
879
880 ret = 1;
881
882out_free:
883 kfree(checksums);
884 kfree(pages);
885
886out_update:
887 if (ret != 1) {
888 invalidate_inode_pages2_range(inode->i_mapping, 0, index);
889 BTRFS_I(inode)->generation = 0;
890 }
891 btrfs_update_inode(trans, root, inode);
892 return ret;
893}
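
The cache file that __btrfs_write_out_cache produces starts with a header
page: one u32 crc per page, then a u64 transaction generation, with the free
space entries beginning at first_page_offset. A sketch of that arithmetic,
assuming 4K pages and a 17-byte entry (two u64s plus a type byte):

#include <stdio.h>
#include <stdint.h>

#define CACHE_PAGE_SIZE 4096

int main(void)
{
        int num_pages = 5; /* hypothetical number of pages in the cache file */

        /* page 0: one crc32c per page, then the u64 generation */
        size_t first_page_offset = sizeof(uint32_t) * num_pages + sizeof(uint64_t);

        printf("entries start at byte %zu of page 0\n", first_page_offset);

        /* the writer's guard: at least one entry must still fit in page 0 */
        if (first_page_offset + 17 >= CACHE_PAGE_SIZE)
                printf("free space cache too big for the crc page\n");
        return 0;
}
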
894
895int btrfs_write_out_cache(struct btrfs_root *root,
896 struct btrfs_trans_handle *trans,
897 struct btrfs_block_group_cache *block_group,
898 struct btrfs_path *path)
899{
900 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
901 struct inode *inode;
902 int ret = 0;
903
904 root = root->fs_info->tree_root;
905
906 spin_lock(&block_group->lock);
907 if (block_group->disk_cache_state < BTRFS_DC_SETUP) {
908 spin_unlock(&block_group->lock);
909 return 0;
910 }
911 spin_unlock(&block_group->lock);
912
913 inode = lookup_free_space_inode(root, block_group, path);
914 if (IS_ERR(inode))
915 return 0;
916
917 ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans,
918 path, block_group->key.objectid);
919 if (ret < 0) {
920 spin_lock(&block_group->lock);
921 block_group->disk_cache_state = BTRFS_DC_ERROR;
922 spin_unlock(&block_group->lock);
923 ret = 0;
924
925 printk(KERN_ERR "btrfs: failed to write free space cache "
926 "for block group %llu\n", block_group->key.objectid);
927 }
928
929 iput(inode);
930 return ret;
931}
932
933static inline unsigned long offset_to_bit(u64 bitmap_start, u32 unit,
31 u64 offset) 934 u64 offset)
32{ 935{
33 BUG_ON(offset < bitmap_start); 936 BUG_ON(offset < bitmap_start);
34 offset -= bitmap_start; 937 offset -= bitmap_start;
35 return (unsigned long)(div64_u64(offset, sectorsize)); 938 return (unsigned long)(div_u64(offset, unit));
36} 939}
37 940
38static inline unsigned long bytes_to_bits(u64 bytes, u64 sectorsize) 941static inline unsigned long bytes_to_bits(u64 bytes, u32 unit)
39{ 942{
40 return (unsigned long)(div64_u64(bytes, sectorsize)); 943 return (unsigned long)(div_u64(bytes, unit));
41} 944}
42 945
43static inline u64 offset_to_bitmap(struct btrfs_block_group_cache *block_group, 946static inline u64 offset_to_bitmap(struct btrfs_free_space_ctl *ctl,
44 u64 offset) 947 u64 offset)
45{ 948{
46 u64 bitmap_start; 949 u64 bitmap_start;
47 u64 bytes_per_bitmap; 950 u64 bytes_per_bitmap;
48 951
49 bytes_per_bitmap = BITS_PER_BITMAP * block_group->sectorsize; 952 bytes_per_bitmap = BITS_PER_BITMAP * ctl->unit;
50 bitmap_start = offset - block_group->key.objectid; 953 bitmap_start = offset - ctl->start;
51 bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap); 954 bitmap_start = div64_u64(bitmap_start, bytes_per_bitmap);
52 bitmap_start *= bytes_per_bitmap; 955 bitmap_start *= bytes_per_bitmap;
53 bitmap_start += block_group->key.objectid; 956 bitmap_start += ctl->start;
54 957
55 return bitmap_start; 958 return bitmap_start;
56} 959}
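
The helpers above are plain unit arithmetic: offset_to_bit turns a byte
offset into a bit index inside one bitmap, and offset_to_bitmap rounds an
offset down to the start of the bitmap window that covers it. A userspace
sketch, assuming a 4K unit and one 4K page per bitmap (32768 bits, so each
bitmap covers 128MB):

#include <stdio.h>
#include <stdint.h>

#define UNIT            4096ULL
#define BITS_PER_BITMAP (4096 * 8)

static unsigned long offset_to_bit(uint64_t bitmap_start, uint64_t offset)
{
        return (unsigned long)((offset - bitmap_start) / UNIT);
}

static uint64_t offset_to_bitmap(uint64_t ctl_start, uint64_t offset)
{
        uint64_t bytes_per_bitmap = BITS_PER_BITMAP * UNIT; /* 128MB */

        /* round down to the window containing 'offset' */
        return ctl_start +
               (offset - ctl_start) / bytes_per_bitmap * bytes_per_bitmap;
}

int main(void)
{
        uint64_t start = 1ULL << 30; /* hypothetical block group start */
        uint64_t off   = start + 200ULL * 1024 * 1024;

        uint64_t bmap = offset_to_bitmap(start, off);
        printf("offset sits in the bitmap at %llu, bit %lu\n",
               (unsigned long long)bmap, offset_to_bit(bmap, off));
        return 0;
}
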
@@ -85,10 +988,16 @@ static int tree_insert_offset(struct rb_root *root, u64 offset,
85 * logically. 988 * logically.
86 */ 989 */
87 if (bitmap) { 990 if (bitmap) {
88 WARN_ON(info->bitmap); 991 if (info->bitmap) {
992 WARN_ON_ONCE(1);
993 return -EEXIST;
994 }
89 p = &(*p)->rb_right; 995 p = &(*p)->rb_right;
90 } else { 996 } else {
91 WARN_ON(!info->bitmap); 997 if (!info->bitmap) {
998 WARN_ON_ONCE(1);
999 return -EEXIST;
1000 }
92 p = &(*p)->rb_left; 1001 p = &(*p)->rb_left;
93 } 1002 }
94 } 1003 }
@@ -108,10 +1017,10 @@ static int tree_insert_offset(struct rb_root *root, u64 offset,
108 * offset. 1017 * offset.
109 */ 1018 */
110static struct btrfs_free_space * 1019static struct btrfs_free_space *
111tree_search_offset(struct btrfs_block_group_cache *block_group, 1020tree_search_offset(struct btrfs_free_space_ctl *ctl,
112 u64 offset, int bitmap_only, int fuzzy) 1021 u64 offset, int bitmap_only, int fuzzy)
113{ 1022{
114 struct rb_node *n = block_group->free_space_offset.rb_node; 1023 struct rb_node *n = ctl->free_space_offset.rb_node;
115 struct btrfs_free_space *entry, *prev = NULL; 1024 struct btrfs_free_space *entry, *prev = NULL;
116 1025
117 /* find entry that is closest to the 'offset' */ 1026 /* find entry that is closest to the 'offset' */
@@ -207,8 +1116,7 @@ tree_search_offset(struct btrfs_block_group_cache *block_group,
207 break; 1116 break;
208 } 1117 }
209 } 1118 }
210 if (entry->offset + BITS_PER_BITMAP * 1119 if (entry->offset + BITS_PER_BITMAP * ctl->unit > offset)
211 block_group->sectorsize > offset)
212 return entry; 1120 return entry;
213 } else if (entry->offset + entry->bytes > offset) 1121 } else if (entry->offset + entry->bytes > offset)
214 return entry; 1122 return entry;
@@ -219,7 +1127,7 @@ tree_search_offset(struct btrfs_block_group_cache *block_group,
219 while (1) { 1127 while (1) {
220 if (entry->bitmap) { 1128 if (entry->bitmap) {
221 if (entry->offset + BITS_PER_BITMAP * 1129 if (entry->offset + BITS_PER_BITMAP *
222 block_group->sectorsize > offset) 1130 ctl->unit > offset)
223 break; 1131 break;
224 } else { 1132 } else {
225 if (entry->offset + entry->bytes > offset) 1133 if (entry->offset + entry->bytes > offset)
@@ -234,53 +1142,69 @@ tree_search_offset(struct btrfs_block_group_cache *block_group,
234 return entry; 1142 return entry;
235} 1143}
236 1144
237static void unlink_free_space(struct btrfs_block_group_cache *block_group, 1145static inline void
1146__unlink_free_space(struct btrfs_free_space_ctl *ctl,
1147 struct btrfs_free_space *info)
1148{
1149 rb_erase(&info->offset_index, &ctl->free_space_offset);
1150 ctl->free_extents--;
1151}
1152
1153static void unlink_free_space(struct btrfs_free_space_ctl *ctl,
238 struct btrfs_free_space *info) 1154 struct btrfs_free_space *info)
239{ 1155{
240 rb_erase(&info->offset_index, &block_group->free_space_offset); 1156 __unlink_free_space(ctl, info);
241 block_group->free_extents--; 1157 ctl->free_space -= info->bytes;
242 block_group->free_space -= info->bytes;
243} 1158}
244 1159
245static int link_free_space(struct btrfs_block_group_cache *block_group, 1160static int link_free_space(struct btrfs_free_space_ctl *ctl,
246 struct btrfs_free_space *info) 1161 struct btrfs_free_space *info)
247{ 1162{
248 int ret = 0; 1163 int ret = 0;
249 1164
250 BUG_ON(!info->bitmap && !info->bytes); 1165 BUG_ON(!info->bitmap && !info->bytes);
251 ret = tree_insert_offset(&block_group->free_space_offset, info->offset, 1166 ret = tree_insert_offset(&ctl->free_space_offset, info->offset,
252 &info->offset_index, (info->bitmap != NULL)); 1167 &info->offset_index, (info->bitmap != NULL));
253 if (ret) 1168 if (ret)
254 return ret; 1169 return ret;
255 1170
256 block_group->free_space += info->bytes; 1171 ctl->free_space += info->bytes;
257 block_group->free_extents++; 1172 ctl->free_extents++;
258 return ret; 1173 return ret;
259} 1174}
260 1175
261static void recalculate_thresholds(struct btrfs_block_group_cache *block_group) 1176static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
262{ 1177{
1178 struct btrfs_block_group_cache *block_group = ctl->private;
263 u64 max_bytes; 1179 u64 max_bytes;
264 u64 bitmap_bytes; 1180 u64 bitmap_bytes;
265 u64 extent_bytes; 1181 u64 extent_bytes;
1182 u64 size = block_group->key.offset;
1183 u64 bytes_per_bg = BITS_PER_BITMAP * block_group->sectorsize;
1184 int max_bitmaps = div64_u64(size + bytes_per_bg - 1, bytes_per_bg);
1185
1186 BUG_ON(ctl->total_bitmaps > max_bitmaps);
266 1187
267 /* 1188 /*
268 * The goal is to keep the total amount of memory used per 1gb of space 1189 * The goal is to keep the total amount of memory used per 1gb of space
269 * at or below 32k, so we need to adjust how much memory we allow to be 1190 * at or below 32k, so we need to adjust how much memory we allow to be
270 * used by extent based free space tracking 1191 * used by extent based free space tracking
271 */ 1192 */
272 max_bytes = MAX_CACHE_BYTES_PER_GIG * 1193 if (size < 1024 * 1024 * 1024)
273 (div64_u64(block_group->key.offset, 1024 * 1024 * 1024)); 1194 max_bytes = MAX_CACHE_BYTES_PER_GIG;
1195 else
1196 max_bytes = MAX_CACHE_BYTES_PER_GIG *
1197 div64_u64(size, 1024 * 1024 * 1024);
274 1198
275 /* 1199 /*
276 * we want to account for 1 more bitmap than what we have so we can make 1200 * we want to account for 1 more bitmap than what we have so we can make
277 * sure we don't go over our overall goal of MAX_CACHE_BYTES_PER_GIG as 1201 * sure we don't go over our overall goal of MAX_CACHE_BYTES_PER_GIG as
278 * we add more bitmaps. 1202 * we add more bitmaps.
279 */ 1203 */
280 bitmap_bytes = (block_group->total_bitmaps + 1) * PAGE_CACHE_SIZE; 1204 bitmap_bytes = (ctl->total_bitmaps + 1) * PAGE_CACHE_SIZE;
281 1205
282 if (bitmap_bytes >= max_bytes) { 1206 if (bitmap_bytes >= max_bytes) {
283 block_group->extents_thresh = 0; 1207 ctl->extents_thresh = 0;
284 return; 1208 return;
285 } 1209 }
286 1210
@@ -291,47 +1215,43 @@ static void recalculate_thresholds(struct btrfs_block_group_cache *block_group)
291 extent_bytes = max_bytes - bitmap_bytes; 1215 extent_bytes = max_bytes - bitmap_bytes;
292 extent_bytes = min_t(u64, extent_bytes, div64_u64(max_bytes, 2)); 1216 extent_bytes = min_t(u64, extent_bytes, div64_u64(max_bytes, 2));
293 1217
294 block_group->extents_thresh = 1218 ctl->extents_thresh =
295 div64_u64(extent_bytes, (sizeof(struct btrfs_free_space))); 1219 div64_u64(extent_bytes, (sizeof(struct btrfs_free_space)));
296} 1220}
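
recalculate_thresholds splits a per-group memory budget between bitmap pages
and extent entries. A standalone sketch of the math, assuming the kernel's
32K-per-GB budget (MAX_CACHE_BYTES_PER_GIG), 4K pages, and a 64-byte
struct btrfs_free_space; the group size and bitmap count are hypothetical:

#include <stdio.h>
#include <stdint.h>

#define MAX_CACHE_BYTES_PER_GIG (32ULL * 1024)
#define PAGE_BYTES              4096ULL
#define GIG                     (1024ULL * 1024 * 1024)

int main(void)
{
        uint64_t size = 10 * GIG; /* block group size */
        int total_bitmaps = 3;    /* bitmaps currently in use */

        /* small groups still get the full 32K budget */
        uint64_t max_bytes = size < GIG ? MAX_CACHE_BYTES_PER_GIG
                                        : MAX_CACHE_BYTES_PER_GIG * (size / GIG);

        /* reserve room for one more bitmap than we currently have */
        uint64_t bitmap_bytes = (uint64_t)(total_bitmaps + 1) * PAGE_BYTES;
        if (bitmap_bytes >= max_bytes) {
                printf("extents_thresh = 0\n");
                return 0;
        }

        /* the remainder, capped at half the budget, is for extent entries */
        uint64_t extent_bytes = max_bytes - bitmap_bytes;
        if (extent_bytes > max_bytes / 2)
                extent_bytes = max_bytes / 2;

        printf("extents_thresh = %llu entries\n",
               (unsigned long long)(extent_bytes / 64));
        return 0;
}
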
297 1221
298static void bitmap_clear_bits(struct btrfs_block_group_cache *block_group, 1222static void bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
299 struct btrfs_free_space *info, u64 offset, 1223 struct btrfs_free_space *info, u64 offset,
300 u64 bytes) 1224 u64 bytes)
301{ 1225{
302 unsigned long start, end; 1226 unsigned long start, count;
303 unsigned long i;
304 1227
305 start = offset_to_bit(info->offset, block_group->sectorsize, offset); 1228 start = offset_to_bit(info->offset, ctl->unit, offset);
306 end = start + bytes_to_bits(bytes, block_group->sectorsize); 1229 count = bytes_to_bits(bytes, ctl->unit);
307 BUG_ON(end > BITS_PER_BITMAP); 1230 BUG_ON(start + count > BITS_PER_BITMAP);
308 1231
309 for (i = start; i < end; i++) 1232 bitmap_clear(info->bitmap, start, count);
310 clear_bit(i, info->bitmap);
311 1233
312 info->bytes -= bytes; 1234 info->bytes -= bytes;
313 block_group->free_space -= bytes; 1235 ctl->free_space -= bytes;
314} 1236}
315 1237
316static void bitmap_set_bits(struct btrfs_block_group_cache *block_group, 1238static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl,
317 struct btrfs_free_space *info, u64 offset, 1239 struct btrfs_free_space *info, u64 offset,
318 u64 bytes) 1240 u64 bytes)
319{ 1241{
320 unsigned long start, end; 1242 unsigned long start, count;
321 unsigned long i;
322 1243
323 start = offset_to_bit(info->offset, block_group->sectorsize, offset); 1244 start = offset_to_bit(info->offset, ctl->unit, offset);
324 end = start + bytes_to_bits(bytes, block_group->sectorsize); 1245 count = bytes_to_bits(bytes, ctl->unit);
325 BUG_ON(end > BITS_PER_BITMAP); 1246 BUG_ON(start + count > BITS_PER_BITMAP);
326 1247
327 for (i = start; i < end; i++) 1248 bitmap_set(info->bitmap, start, count);
328 set_bit(i, info->bitmap);
329 1249
330 info->bytes += bytes; 1250 info->bytes += bytes;
331 block_group->free_space += bytes; 1251 ctl->free_space += bytes;
332} 1252}
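
The change above swaps the open-coded set_bit()/clear_bit() loops for the
kernel's bitmap_set()/bitmap_clear() helpers, which work a word at a time.
A naive userspace equivalent with the same effect, bit by bit:

#include <stdio.h>
#include <limits.h>

#define LONG_BITS (sizeof(unsigned long) * CHAR_BIT)

static void bitmap_set(unsigned long *map, unsigned int start, unsigned int count)
{
        for (unsigned int i = start; i < start + count; i++)
                map[i / LONG_BITS] |= 1UL << (i % LONG_BITS);
}

static void bitmap_clear(unsigned long *map, unsigned int start, unsigned int count)
{
        for (unsigned int i = start; i < start + count; i++)
                map[i / LONG_BITS] &= ~(1UL << (i % LONG_BITS));
}

int main(void)
{
        unsigned long map[4] = { 0 };

        bitmap_set(map, 10, 20);  /* 20 units freed starting at bit 10 */
        bitmap_clear(map, 16, 4); /* 4 units allocated back out of the run */
        printf("word 0 = %#lx\n", map[0]);
        return 0;
}
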
333 1253
334static int search_bitmap(struct btrfs_block_group_cache *block_group, 1254static int search_bitmap(struct btrfs_free_space_ctl *ctl,
335 struct btrfs_free_space *bitmap_info, u64 *offset, 1255 struct btrfs_free_space *bitmap_info, u64 *offset,
336 u64 *bytes) 1256 u64 *bytes)
337{ 1257{
@@ -339,9 +1259,9 @@ static int search_bitmap(struct btrfs_block_group_cache *block_group,
339 unsigned long bits, i; 1259 unsigned long bits, i;
340 unsigned long next_zero; 1260 unsigned long next_zero;
341 1261
342 i = offset_to_bit(bitmap_info->offset, block_group->sectorsize, 1262 i = offset_to_bit(bitmap_info->offset, ctl->unit,
343 max_t(u64, *offset, bitmap_info->offset)); 1263 max_t(u64, *offset, bitmap_info->offset));
344 bits = bytes_to_bits(*bytes, block_group->sectorsize); 1264 bits = bytes_to_bits(*bytes, ctl->unit);
345 1265
346 for (i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i); 1266 for (i = find_next_bit(bitmap_info->bitmap, BITS_PER_BITMAP, i);
347 i < BITS_PER_BITMAP; 1267 i < BITS_PER_BITMAP;
@@ -356,29 +1276,25 @@ static int search_bitmap(struct btrfs_block_group_cache *block_group,
356 } 1276 }
357 1277
358 if (found_bits) { 1278 if (found_bits) {
359 *offset = (u64)(i * block_group->sectorsize) + 1279 *offset = (u64)(i * ctl->unit) + bitmap_info->offset;
360 bitmap_info->offset; 1280 *bytes = (u64)(found_bits) * ctl->unit;
361 *bytes = (u64)(found_bits) * block_group->sectorsize;
362 return 0; 1281 return 0;
363 } 1282 }
364 1283
365 return -1; 1284 return -1;
366} 1285}
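
search_bitmap looks for a run of set bits at least as long as the request:
hop to the next set bit, measure the run up to the next zero bit, and stop
at the first run that is big enough. A naive sketch of that walk (the
kernel does the same with the word-at-a-time find_next_bit() and
find_next_zero_bit()):

#include <stdio.h>

#define NBITS 64

static int test_bit(const unsigned char *map, int i)
{
        return map[i / 8] >> (i % 8) & 1;
}

static int search_bitmap(const unsigned char *map, int want, int *found_len)
{
        for (int i = 0; i < NBITS; i++) {
                if (!test_bit(map, i))
                        continue;
                int next_zero = i;
                while (next_zero < NBITS && test_bit(map, next_zero))
                        next_zero++;
                if (next_zero - i >= want) {
                        *found_len = next_zero - i;
                        return i; /* start of a big-enough run */
                }
                i = next_zero; /* skip past this run and keep looking */
        }
        return -1;
}

int main(void)
{
        unsigned char map[NBITS / 8] = { 0 };
        for (int i = 20; i < 29; i++) /* one 9-bit free run */
                map[i / 8] |= 1 << (i % 8);

        int len, start = search_bitmap(map, 8, &len);
        printf("run of %d bits at bit %d\n", len, start);
        return 0;
}
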
367 1286
368static struct btrfs_free_space *find_free_space(struct btrfs_block_group_cache 1287static struct btrfs_free_space *
369 *block_group, u64 *offset, 1288find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes)
370 u64 *bytes, int debug)
371{ 1289{
372 struct btrfs_free_space *entry; 1290 struct btrfs_free_space *entry;
373 struct rb_node *node; 1291 struct rb_node *node;
374 int ret; 1292 int ret;
375 1293
376 if (!block_group->free_space_offset.rb_node) 1294 if (!ctl->free_space_offset.rb_node)
377 return NULL; 1295 return NULL;
378 1296
379 entry = tree_search_offset(block_group, 1297 entry = tree_search_offset(ctl, offset_to_bitmap(ctl, *offset), 0, 1);
380 offset_to_bitmap(block_group, *offset),
381 0, 1);
382 if (!entry) 1298 if (!entry)
383 return NULL; 1299 return NULL;
384 1300
@@ -388,7 +1304,7 @@ static struct btrfs_free_space *find_free_space(struct btrfs_block_group_cache
388 continue; 1304 continue;
389 1305
390 if (entry->bitmap) { 1306 if (entry->bitmap) {
391 ret = search_bitmap(block_group, entry, offset, bytes); 1307 ret = search_bitmap(ctl, entry, offset, bytes);
392 if (!ret) 1308 if (!ret)
393 return entry; 1309 return entry;
394 continue; 1310 continue;
@@ -402,23 +1318,28 @@ static struct btrfs_free_space *find_free_space(struct btrfs_block_group_cache
402 return NULL; 1318 return NULL;
403} 1319}
404 1320
405static void add_new_bitmap(struct btrfs_block_group_cache *block_group, 1321static void add_new_bitmap(struct btrfs_free_space_ctl *ctl,
406 struct btrfs_free_space *info, u64 offset) 1322 struct btrfs_free_space *info, u64 offset)
407{ 1323{
408 u64 bytes_per_bg = BITS_PER_BITMAP * block_group->sectorsize; 1324 info->offset = offset_to_bitmap(ctl, offset);
409 int max_bitmaps = (int)div64_u64(block_group->key.offset +
410 bytes_per_bg - 1, bytes_per_bg);
411 BUG_ON(block_group->total_bitmaps >= max_bitmaps);
412
413 info->offset = offset_to_bitmap(block_group, offset);
414 info->bytes = 0; 1325 info->bytes = 0;
415 link_free_space(block_group, info); 1326 link_free_space(ctl, info);
416 block_group->total_bitmaps++; 1327 ctl->total_bitmaps++;
417 1328
418 recalculate_thresholds(block_group); 1329 ctl->op->recalc_thresholds(ctl);
419} 1330}
420 1331
421static noinline int remove_from_bitmap(struct btrfs_block_group_cache *block_group, 1332static void free_bitmap(struct btrfs_free_space_ctl *ctl,
1333 struct btrfs_free_space *bitmap_info)
1334{
1335 unlink_free_space(ctl, bitmap_info);
1336 kfree(bitmap_info->bitmap);
1337 kmem_cache_free(btrfs_free_space_cachep, bitmap_info);
1338 ctl->total_bitmaps--;
1339 ctl->op->recalc_thresholds(ctl);
1340}
1341
1342static noinline int remove_from_bitmap(struct btrfs_free_space_ctl *ctl,
422 struct btrfs_free_space *bitmap_info, 1343 struct btrfs_free_space *bitmap_info,
423 u64 *offset, u64 *bytes) 1344 u64 *offset, u64 *bytes)
424{ 1345{
@@ -427,8 +1348,7 @@ static noinline int remove_from_bitmap(struct btrfs_block_group_cache *block_gro
427 int ret; 1348 int ret;
428 1349
429again: 1350again:
430 end = bitmap_info->offset + 1351 end = bitmap_info->offset + (u64)(BITS_PER_BITMAP * ctl->unit) - 1;
431 (u64)(BITS_PER_BITMAP * block_group->sectorsize) - 1;
432 1352
433 /* 1353 /*
434 * XXX - this can go away after a few releases. 1354 * XXX - this can go away after a few releases.
@@ -442,29 +1362,23 @@ again:
442 */ 1362 */
443 search_start = *offset; 1363 search_start = *offset;
444 search_bytes = *bytes; 1364 search_bytes = *bytes;
445 ret = search_bitmap(block_group, bitmap_info, &search_start, 1365 search_bytes = min(search_bytes, end - search_start + 1);
446 &search_bytes); 1366 ret = search_bitmap(ctl, bitmap_info, &search_start, &search_bytes);
447 BUG_ON(ret < 0 || search_start != *offset); 1367 BUG_ON(ret < 0 || search_start != *offset);
448 1368
449 if (*offset > bitmap_info->offset && *offset + *bytes > end) { 1369 if (*offset > bitmap_info->offset && *offset + *bytes > end) {
450 bitmap_clear_bits(block_group, bitmap_info, *offset, 1370 bitmap_clear_bits(ctl, bitmap_info, *offset, end - *offset + 1);
451 end - *offset + 1);
452 *bytes -= end - *offset + 1; 1371 *bytes -= end - *offset + 1;
453 *offset = end + 1; 1372 *offset = end + 1;
454 } else if (*offset >= bitmap_info->offset && *offset + *bytes <= end) { 1373 } else if (*offset >= bitmap_info->offset && *offset + *bytes <= end) {
455 bitmap_clear_bits(block_group, bitmap_info, *offset, *bytes); 1374 bitmap_clear_bits(ctl, bitmap_info, *offset, *bytes);
456 *bytes = 0; 1375 *bytes = 0;
457 } 1376 }
458 1377
459 if (*bytes) { 1378 if (*bytes) {
460 struct rb_node *next = rb_next(&bitmap_info->offset_index); 1379 struct rb_node *next = rb_next(&bitmap_info->offset_index);
461 if (!bitmap_info->bytes) { 1380 if (!bitmap_info->bytes)
462 unlink_free_space(block_group, bitmap_info); 1381 free_bitmap(ctl, bitmap_info);
463 kfree(bitmap_info->bitmap);
464 kfree(bitmap_info);
465 block_group->total_bitmaps--;
466 recalculate_thresholds(block_group);
467 }
468 1382
469 /* 1383 /*
470 * no entry after this bitmap, but we still have bytes to 1384 * no entry after this bitmap, but we still have bytes to
@@ -491,38 +1405,59 @@ again:
491 */ 1405 */
492 search_start = *offset; 1406 search_start = *offset;
493 search_bytes = *bytes; 1407 search_bytes = *bytes;
494 ret = search_bitmap(block_group, bitmap_info, &search_start, 1408 ret = search_bitmap(ctl, bitmap_info, &search_start,
495 &search_bytes); 1409 &search_bytes);
496 if (ret < 0 || search_start != *offset) 1410 if (ret < 0 || search_start != *offset)
497 return -EAGAIN; 1411 return -EAGAIN;
498 1412
499 goto again; 1413 goto again;
500 } else if (!bitmap_info->bytes) { 1414 } else if (!bitmap_info->bytes)
501 unlink_free_space(block_group, bitmap_info); 1415 free_bitmap(ctl, bitmap_info);
502 kfree(bitmap_info->bitmap);
503 kfree(bitmap_info);
504 block_group->total_bitmaps--;
505 recalculate_thresholds(block_group);
506 }
507 1416
508 return 0; 1417 return 0;
509} 1418}
510 1419
511static int insert_into_bitmap(struct btrfs_block_group_cache *block_group, 1420static u64 add_bytes_to_bitmap(struct btrfs_free_space_ctl *ctl,
512 struct btrfs_free_space *info) 1421 struct btrfs_free_space *info, u64 offset,
1422 u64 bytes)
513{ 1423{
514 struct btrfs_free_space *bitmap_info; 1424 u64 bytes_to_set = 0;
515 int added = 0; 1425 u64 end;
516 u64 bytes, offset, end; 1426
517 int ret; 1427 end = info->offset + (u64)(BITS_PER_BITMAP * ctl->unit);
1428
1429 bytes_to_set = min(end - offset, bytes);
1430
1431 bitmap_set_bits(ctl, info, offset, bytes_to_set);
1432
1433 return bytes_to_set;
1434
1435}
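
add_bytes_to_bitmap can only set bits inside the fixed window one bitmap
covers, so it clamps the insert to min(end - offset, bytes) and the caller
loops (the again: label in insert_into_bitmap below) with the remainder in
the next window. A sketch of that split, assuming 128MB windows:

#include <stdio.h>
#include <stdint.h>

#define BITMAP_BYTES (128ULL * 1024 * 1024) /* BITS_PER_BITMAP * unit */

static uint64_t add_bytes_to_bitmap(uint64_t info_offset, uint64_t offset,
                                    uint64_t bytes)
{
        uint64_t end = info_offset + BITMAP_BYTES;

        /* bitmap_set_bits() would flip the bits here */
        return end - offset < bytes ? end - offset : bytes;
}

int main(void)
{
        uint64_t bitmap_start = 0;
        uint64_t offset = 120ULL * 1024 * 1024; /* insert spans the boundary */
        uint64_t bytes = 16ULL * 1024 * 1024;

        while (bytes) {
                uint64_t set = add_bytes_to_bitmap(bitmap_start, offset, bytes);

                printf("set %lluMB at %lluMB\n",
                       (unsigned long long)(set >> 20),
                       (unsigned long long)(offset >> 20));
                offset += set;
                bytes -= set;
                bitmap_start += BITMAP_BYTES; /* move to the next window */
        }
        return 0;
}
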
1436
1437static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
1438 struct btrfs_free_space *info)
1439{
1440 struct btrfs_block_group_cache *block_group = ctl->private;
518 1441
519 /* 1442 /*
520 * If we are below the extents threshold then we can add this as an 1443 * If we are below the extents threshold then we can add this as an
521 * extent, and don't have to deal with the bitmap 1444 * extent, and don't have to deal with the bitmap
522 */ 1445 */
523 if (block_group->free_extents < block_group->extents_thresh && 1446 if (ctl->free_extents < ctl->extents_thresh) {
524 info->bytes > block_group->sectorsize * 4) 1447 /*
525 return 0; 1448 * If this block group has some small extents we don't want to
1449 * use up all of our free slots in the cache with them, we want
1450 * to reserve them for larger extents; however, if we have plenty
1451 * of cache left then go ahead and add them, no sense in adding
1452 * the overhead of a bitmap if we don't have to.
1453 */
1454 if (info->bytes <= block_group->sectorsize * 4) {
1455 if (ctl->free_extents * 2 <= ctl->extents_thresh)
1456 return false;
1457 } else {
1458 return false;
1459 }
1460 }
526 1461
527 /* 1462 /*
528 * some block groups are so tiny they can't be enveloped by a bitmap, so 1463 * some block groups are so tiny they can't be enveloped by a bitmap, so
@@ -530,35 +1465,85 @@ static int insert_into_bitmap(struct btrfs_block_group_cache *block_group,
530 */ 1465 */
531 if (BITS_PER_BITMAP * block_group->sectorsize > 1466 if (BITS_PER_BITMAP * block_group->sectorsize >
532 block_group->key.offset) 1467 block_group->key.offset)
533 return 0; 1468 return false;
1469
1470 return true;
1471}
1472
1473static struct btrfs_free_space_op free_space_op = {
1474 .recalc_thresholds = recalculate_thresholds,
1475 .use_bitmap = use_bitmap,
1476};
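
btrfs_free_space_op is a small vtable: the generic ctl code calls
recalc_thresholds and use_bitmap through it, so another user of the same
rb-tree machinery can plug in different policies. A minimal standalone
sketch of the pattern, with made-up policy bodies:

#include <stdio.h>
#include <stdbool.h>

struct ctl;

struct free_space_op {
        void (*recalc_thresholds)(struct ctl *ctl);
        bool (*use_bitmap)(struct ctl *ctl);
};

struct ctl {
        int free_extents;
        int extents_thresh;
        const struct free_space_op *op;
};

static void bg_recalc(struct ctl *ctl)
{
        ctl->extents_thresh = 512; /* stand-in for the real budget math */
}

static bool bg_use_bitmap(struct ctl *ctl)
{
        return ctl->free_extents >= ctl->extents_thresh;
}

static const struct free_space_op block_group_op = {
        .recalc_thresholds = bg_recalc,
        .use_bitmap        = bg_use_bitmap,
};

int main(void)
{
        struct ctl ctl = { .free_extents = 600, .op = &block_group_op };

        ctl.op->recalc_thresholds(&ctl);
        printf("use bitmap: %s\n", ctl.op->use_bitmap(&ctl) ? "yes" : "no");
        return 0;
}
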
1477
1478static int insert_into_bitmap(struct btrfs_free_space_ctl *ctl,
1479 struct btrfs_free_space *info)
1480{
1481 struct btrfs_free_space *bitmap_info;
1482 struct btrfs_block_group_cache *block_group = NULL;
1483 int added = 0;
1484 u64 bytes, offset, bytes_added;
1485 int ret;
534 1486
535 bytes = info->bytes; 1487 bytes = info->bytes;
536 offset = info->offset; 1488 offset = info->offset;
537 1489
1490 if (!ctl->op->use_bitmap(ctl, info))
1491 return 0;
1492
1493 if (ctl->op == &free_space_op)
1494 block_group = ctl->private;
538again: 1495again:
539 bitmap_info = tree_search_offset(block_group, 1496 /*
540 offset_to_bitmap(block_group, offset), 1497 * Since we link bitmaps right into the cluster we need to see if we
1498 * have a cluster here, and if so and it has our bitmap we need to add
1499 * the free space to that bitmap.
1500 */
1501 if (block_group && !list_empty(&block_group->cluster_list)) {
1502 struct btrfs_free_cluster *cluster;
1503 struct rb_node *node;
1504 struct btrfs_free_space *entry;
1505
1506 cluster = list_entry(block_group->cluster_list.next,
1507 struct btrfs_free_cluster,
1508 block_group_list);
1509 spin_lock(&cluster->lock);
1510 node = rb_first(&cluster->root);
1511 if (!node) {
1512 spin_unlock(&cluster->lock);
1513 goto no_cluster_bitmap;
1514 }
1515
1516 entry = rb_entry(node, struct btrfs_free_space, offset_index);
1517 if (!entry->bitmap) {
1518 spin_unlock(&cluster->lock);
1519 goto no_cluster_bitmap;
1520 }
1521
1522 if (entry->offset == offset_to_bitmap(ctl, offset)) {
1523 bytes_added = add_bytes_to_bitmap(ctl, entry,
1524 offset, bytes);
1525 bytes -= bytes_added;
1526 offset += bytes_added;
1527 }
1528 spin_unlock(&cluster->lock);
1529 if (!bytes) {
1530 ret = 1;
1531 goto out;
1532 }
1533 }
1534
1535no_cluster_bitmap:
1536 bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
541 1, 0); 1537 1, 0);
542 if (!bitmap_info) { 1538 if (!bitmap_info) {
543 BUG_ON(added); 1539 BUG_ON(added);
544 goto new_bitmap; 1540 goto new_bitmap;
545 } 1541 }
546 1542
547 end = bitmap_info->offset + 1543 bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes);
548 (u64)(BITS_PER_BITMAP * block_group->sectorsize); 1544 bytes -= bytes_added;
549 1545 offset += bytes_added;
550 if (offset >= bitmap_info->offset && offset + bytes > end) { 1546 added = 0;
551 bitmap_set_bits(block_group, bitmap_info, offset,
552 end - offset);
553 bytes -= end - offset;
554 offset = end;
555 added = 0;
556 } else if (offset >= bitmap_info->offset && offset + bytes <= end) {
557 bitmap_set_bits(block_group, bitmap_info, offset, bytes);
558 bytes = 0;
559 } else {
560 BUG();
561 }
562 1547
563 if (!bytes) { 1548 if (!bytes) {
564 ret = 1; 1549 ret = 1;
@@ -568,19 +1553,19 @@ again:
568 1553
569new_bitmap: 1554new_bitmap:
570 if (info && info->bitmap) { 1555 if (info && info->bitmap) {
571 add_new_bitmap(block_group, info, offset); 1556 add_new_bitmap(ctl, info, offset);
572 added = 1; 1557 added = 1;
573 info = NULL; 1558 info = NULL;
574 goto again; 1559 goto again;
575 } else { 1560 } else {
576 spin_unlock(&block_group->tree_lock); 1561 spin_unlock(&ctl->tree_lock);
577 1562
578 /* no pre-allocated info, allocate a new one */ 1563 /* no pre-allocated info, allocate a new one */
579 if (!info) { 1564 if (!info) {
580 info = kzalloc(sizeof(struct btrfs_free_space), 1565 info = kmem_cache_zalloc(btrfs_free_space_cachep,
581 GFP_NOFS); 1566 GFP_NOFS);
582 if (!info) { 1567 if (!info) {
583 spin_lock(&block_group->tree_lock); 1568 spin_lock(&ctl->tree_lock);
584 ret = -ENOMEM; 1569 ret = -ENOMEM;
585 goto out; 1570 goto out;
586 } 1571 }
@@ -588,7 +1573,7 @@ new_bitmap:
588 1573
589 /* allocate the bitmap */ 1574 /* allocate the bitmap */
590 info->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS); 1575 info->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
591 spin_lock(&block_group->tree_lock); 1576 spin_lock(&ctl->tree_lock);
592 if (!info->bitmap) { 1577 if (!info->bitmap) {
593 ret = -ENOMEM; 1578 ret = -ENOMEM;
594 goto out; 1579 goto out;
@@ -600,77 +1585,94 @@ out:
600 if (info) { 1585 if (info) {
601 if (info->bitmap) 1586 if (info->bitmap)
602 kfree(info->bitmap); 1587 kfree(info->bitmap);
603 kfree(info); 1588 kmem_cache_free(btrfs_free_space_cachep, info);
604 } 1589 }
605 1590
606 return ret; 1591 return ret;
607} 1592}
608 1593
609int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, 1594static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl,
610 u64 offset, u64 bytes) 1595 struct btrfs_free_space *info, bool update_stat)
611{ 1596{
612 struct btrfs_free_space *right_info = NULL; 1597 struct btrfs_free_space *left_info;
613 struct btrfs_free_space *left_info = NULL; 1598 struct btrfs_free_space *right_info;
614 struct btrfs_free_space *info = NULL; 1599 bool merged = false;
615 int ret = 0; 1600 u64 offset = info->offset;
616 1601 u64 bytes = info->bytes;
617 info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS);
618 if (!info)
619 return -ENOMEM;
620
621 info->offset = offset;
622 info->bytes = bytes;
623
624 spin_lock(&block_group->tree_lock);
625 1602
626 /* 1603 /*
627 * first we want to see if there is free space adjacent to the range we 1604 * first we want to see if there is free space adjacent to the range we
628 * are adding, if there is remove that struct and add a new one to 1605 * are adding, if there is remove that struct and add a new one to
629 * cover the entire range 1606 * cover the entire range
630 */ 1607 */
631 right_info = tree_search_offset(block_group, offset + bytes, 0, 0); 1608 right_info = tree_search_offset(ctl, offset + bytes, 0, 0);
632 if (right_info && rb_prev(&right_info->offset_index)) 1609 if (right_info && rb_prev(&right_info->offset_index))
633 left_info = rb_entry(rb_prev(&right_info->offset_index), 1610 left_info = rb_entry(rb_prev(&right_info->offset_index),
634 struct btrfs_free_space, offset_index); 1611 struct btrfs_free_space, offset_index);
635 else 1612 else
636 left_info = tree_search_offset(block_group, offset - 1, 0, 0); 1613 left_info = tree_search_offset(ctl, offset - 1, 0, 0);
637
638 /*
639 * If there was no extent directly to the left or right of this new
640 * extent then we know we're going to have to allocate a new extent, so
641 * before we do that see if we need to drop this into a bitmap
642 */
643 if ((!left_info || left_info->bitmap) &&
644 (!right_info || right_info->bitmap)) {
645 ret = insert_into_bitmap(block_group, info);
646
647 if (ret < 0) {
648 goto out;
649 } else if (ret) {
650 ret = 0;
651 goto out;
652 }
653 }
654 1614
655 if (right_info && !right_info->bitmap) { 1615 if (right_info && !right_info->bitmap) {
656 unlink_free_space(block_group, right_info); 1616 if (update_stat)
1617 unlink_free_space(ctl, right_info);
1618 else
1619 __unlink_free_space(ctl, right_info);
657 info->bytes += right_info->bytes; 1620 info->bytes += right_info->bytes;
658 kfree(right_info); 1621 kmem_cache_free(btrfs_free_space_cachep, right_info);
1622 merged = true;
659 } 1623 }
660 1624
661 if (left_info && !left_info->bitmap && 1625 if (left_info && !left_info->bitmap &&
662 left_info->offset + left_info->bytes == offset) { 1626 left_info->offset + left_info->bytes == offset) {
663 unlink_free_space(block_group, left_info); 1627 if (update_stat)
1628 unlink_free_space(ctl, left_info);
1629 else
1630 __unlink_free_space(ctl, left_info);
664 info->offset = left_info->offset; 1631 info->offset = left_info->offset;
665 info->bytes += left_info->bytes; 1632 info->bytes += left_info->bytes;
666 kfree(left_info); 1633 kmem_cache_free(btrfs_free_space_cachep, left_info);
1634 merged = true;
667 } 1635 }
668 1636
669 ret = link_free_space(block_group, info); 1637 return merged;
1638}
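
try_merge_free_space absorbs an immediate right neighbor (one starting
exactly at offset + bytes) and an immediate left neighbor (one ending
exactly at offset) into the entry being added. A sketch of just the
interval arithmetic, with plain structs standing in for the rb-tree
lookups:

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

struct space { uint64_t offset, bytes; bool present; };

static void try_merge(struct space *info, struct space *left,
                      struct space *right)
{
        if (right->present && right->offset == info->offset + info->bytes) {
                info->bytes += right->bytes; /* unlink and absorb right */
                right->present = false;
        }
        if (left->present && left->offset + left->bytes == info->offset) {
                info->offset = left->offset; /* unlink and absorb left */
                info->bytes += left->bytes;
                left->present = false;
        }
}

int main(void)
{
        struct space left  = { 0,     4096, true };
        struct space info  = { 4096,  8192, true };
        struct space right = { 12288, 4096, true };

        try_merge(&info, &left, &right);
        printf("merged extent: offset=%llu bytes=%llu\n",
               (unsigned long long)info.offset,
               (unsigned long long)info.bytes);
        return 0;
}
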
1639
1640int __btrfs_add_free_space(struct btrfs_free_space_ctl *ctl,
1641 u64 offset, u64 bytes)
1642{
1643 struct btrfs_free_space *info;
1644 int ret = 0;
1645
1646 info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
1647 if (!info)
1648 return -ENOMEM;
1649
1650 info->offset = offset;
1651 info->bytes = bytes;
1652
1653 spin_lock(&ctl->tree_lock);
1654
1655 if (try_merge_free_space(ctl, info, true))
1656 goto link;
1657
1658 /*
1659 * If there was no extent directly to the left or right of this new
1660 * extent then we know we're going to have to allocate a new extent, so
1661 * before we do that see if we need to drop this into a bitmap
1662 */
1663 ret = insert_into_bitmap(ctl, info);
1664 if (ret < 0) {
1665 goto out;
1666 } else if (ret) {
1667 ret = 0;
1668 goto out;
1669 }
1670link:
1671 ret = link_free_space(ctl, info);
670 if (ret) 1672 if (ret)
671 kfree(info); 1673 kmem_cache_free(btrfs_free_space_cachep, info);
672out: 1674out:
673 spin_unlock(&block_group->tree_lock); 1675 spin_unlock(&ctl->tree_lock);
674 1676
675 if (ret) { 1677 if (ret) {
676 printk(KERN_CRIT "btrfs: unable to add free space :%d\n", ret); 1678 printk(KERN_CRIT "btrfs: unable to add free space :%d\n", ret);
@@ -683,21 +1685,21 @@ out:
683int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, 1685int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
684 u64 offset, u64 bytes) 1686 u64 offset, u64 bytes)
685{ 1687{
1688 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
686 struct btrfs_free_space *info; 1689 struct btrfs_free_space *info;
687 struct btrfs_free_space *next_info = NULL; 1690 struct btrfs_free_space *next_info = NULL;
688 int ret = 0; 1691 int ret = 0;
689 1692
690 spin_lock(&block_group->tree_lock); 1693 spin_lock(&ctl->tree_lock);
691 1694
692again: 1695again:
693 info = tree_search_offset(block_group, offset, 0, 0); 1696 info = tree_search_offset(ctl, offset, 0, 0);
694 if (!info) { 1697 if (!info) {
695 /* 1698 /*
696 * oops didn't find an extent that matched the space we wanted 1699 * oops didn't find an extent that matched the space we wanted
697 * to remove, look for a bitmap instead 1700 * to remove, look for a bitmap instead
698 */ 1701 */
699 info = tree_search_offset(block_group, 1702 info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
700 offset_to_bitmap(block_group, offset),
701 1, 0); 1703 1, 0);
702 if (!info) { 1704 if (!info) {
703 WARN_ON(1); 1705 WARN_ON(1);
@@ -712,8 +1714,8 @@ again:
712 offset_index); 1714 offset_index);
713 1715
714 if (next_info->bitmap) 1716 if (next_info->bitmap)
715 end = next_info->offset + BITS_PER_BITMAP * 1717 end = next_info->offset +
716 block_group->sectorsize - 1; 1718 BITS_PER_BITMAP * ctl->unit - 1;
717 else 1719 else
718 end = next_info->offset + next_info->bytes; 1720 end = next_info->offset + next_info->bytes;
719 1721
@@ -733,20 +1735,20 @@ again:
733 } 1735 }
734 1736
735 if (info->bytes == bytes) { 1737 if (info->bytes == bytes) {
736 unlink_free_space(block_group, info); 1738 unlink_free_space(ctl, info);
737 if (info->bitmap) { 1739 if (info->bitmap) {
738 kfree(info->bitmap); 1740 kfree(info->bitmap);
739 block_group->total_bitmaps--; 1741 ctl->total_bitmaps--;
740 } 1742 }
741 kfree(info); 1743 kmem_cache_free(btrfs_free_space_cachep, info);
742 goto out_lock; 1744 goto out_lock;
743 } 1745 }
744 1746
745 if (!info->bitmap && info->offset == offset) { 1747 if (!info->bitmap && info->offset == offset) {
746 unlink_free_space(block_group, info); 1748 unlink_free_space(ctl, info);
747 info->offset += bytes; 1749 info->offset += bytes;
748 info->bytes -= bytes; 1750 info->bytes -= bytes;
749 link_free_space(block_group, info); 1751 link_free_space(ctl, info);
750 goto out_lock; 1752 goto out_lock;
751 } 1753 }
752 1754
@@ -760,13 +1762,13 @@ again:
760 * first unlink the old info and then 1762 * first unlink the old info and then
761 * insert it again after the hole we're creating 1763 * insert it again after the hole we're creating
762 */ 1764 */
763 unlink_free_space(block_group, info); 1765 unlink_free_space(ctl, info);
764 if (offset + bytes < info->offset + info->bytes) { 1766 if (offset + bytes < info->offset + info->bytes) {
765 u64 old_end = info->offset + info->bytes; 1767 u64 old_end = info->offset + info->bytes;
766 1768
767 info->offset = offset + bytes; 1769 info->offset = offset + bytes;
768 info->bytes = old_end - info->offset; 1770 info->bytes = old_end - info->offset;
769 ret = link_free_space(block_group, info); 1771 ret = link_free_space(ctl, info);
770 WARN_ON(ret); 1772 WARN_ON(ret);
771 if (ret) 1773 if (ret)
772 goto out_lock; 1774 goto out_lock;
@@ -774,9 +1776,9 @@ again:
774 /* the hole we're creating ends at the end 1776 /* the hole we're creating ends at the end
775 * of the info struct, just free the info 1777 * of the info struct, just free the info
776 */ 1778 */
777 kfree(info); 1779 kmem_cache_free(btrfs_free_space_cachep, info);
778 } 1780 }
779 spin_unlock(&block_group->tree_lock); 1781 spin_unlock(&ctl->tree_lock);
780 1782
781 /* step two, insert a new info struct to cover 1783 /* step two, insert a new info struct to cover
782 * anything before the hole 1784 * anything before the hole
@@ -787,12 +1789,12 @@ again:
787 goto out; 1789 goto out;
788 } 1790 }
789 1791
790 ret = remove_from_bitmap(block_group, info, &offset, &bytes); 1792 ret = remove_from_bitmap(ctl, info, &offset, &bytes);
791 if (ret == -EAGAIN) 1793 if (ret == -EAGAIN)
792 goto again; 1794 goto again;
793 BUG_ON(ret); 1795 BUG_ON(ret);
794out_lock: 1796out_lock:
795 spin_unlock(&block_group->tree_lock); 1797 spin_unlock(&ctl->tree_lock);
796out: 1798out:
797 return ret; 1799 return ret;
798} 1800}
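
When the removed range lands in the middle of an extent, the code above
keeps the tail in place (shrink and re-link the entry) and re-adds the head
as fresh free space. A sketch of that split with hypothetical sizes:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint64_t info_offset = 0, info_bytes = 1024 * 1024; /* one 1MB extent */
        uint64_t offset = 256 * 1024, bytes = 128 * 1024;   /* hole to remove */

        /* tail: everything after the hole stays as the re-linked entry */
        uint64_t old_end = info_offset + info_bytes;
        uint64_t tail_offset = offset + bytes;
        uint64_t tail_bytes = old_end - tail_offset;

        /* head: everything before the hole is added back separately */
        uint64_t head_offset = info_offset;
        uint64_t head_bytes = offset - info_offset;

        printf("head %llu+%llu, tail %llu+%llu\n",
               (unsigned long long)head_offset, (unsigned long long)head_bytes,
               (unsigned long long)tail_offset, (unsigned long long)tail_bytes);
        return 0;
}
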
@@ -800,11 +1802,12 @@ out:
800void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, 1802void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
801 u64 bytes) 1803 u64 bytes)
802{ 1804{
1805 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
803 struct btrfs_free_space *info; 1806 struct btrfs_free_space *info;
804 struct rb_node *n; 1807 struct rb_node *n;
805 int count = 0; 1808 int count = 0;
806 1809
807 for (n = rb_first(&block_group->free_space_offset); n; n = rb_next(n)) { 1810 for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) {
808 info = rb_entry(n, struct btrfs_free_space, offset_index); 1811 info = rb_entry(n, struct btrfs_free_space, offset_index);
809 if (info->bytes >= bytes) 1812 if (info->bytes >= bytes)
810 count++; 1813 count++;
@@ -819,19 +1822,23 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
819 "\n", count); 1822 "\n", count);
820} 1823}
821 1824
822u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group) 1825void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group)
823{ 1826{
824 struct btrfs_free_space *info; 1827 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
825 struct rb_node *n;
826 u64 ret = 0;
827 1828
828 for (n = rb_first(&block_group->free_space_offset); n; 1829 spin_lock_init(&ctl->tree_lock);
829 n = rb_next(n)) { 1830 ctl->unit = block_group->sectorsize;
830 info = rb_entry(n, struct btrfs_free_space, offset_index); 1831 ctl->start = block_group->key.objectid;
831 ret += info->bytes; 1832 ctl->private = block_group;
832 } 1833 ctl->op = &free_space_op;
833 1834
834 return ret; 1835 /*
1836 * we only want to have 32k of ram per block group for keeping
1837 * track of free space, and if we pass 1/2 of that we want to
1838 * start converting things over to using bitmaps
1839 */
1840 ctl->extents_thresh = ((1024 * 32) / 2) /
1841 sizeof(struct btrfs_free_space);
835} 1842}
836 1843
837/* 1844/*
@@ -845,31 +1852,31 @@ __btrfs_return_cluster_to_free_space(
845 struct btrfs_block_group_cache *block_group, 1852 struct btrfs_block_group_cache *block_group,
846 struct btrfs_free_cluster *cluster) 1853 struct btrfs_free_cluster *cluster)
847{ 1854{
1855 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
848 struct btrfs_free_space *entry; 1856 struct btrfs_free_space *entry;
849 struct rb_node *node; 1857 struct rb_node *node;
850 bool bitmap;
851 1858
852 spin_lock(&cluster->lock); 1859 spin_lock(&cluster->lock);
853 if (cluster->block_group != block_group) 1860 if (cluster->block_group != block_group)
854 goto out; 1861 goto out;
855 1862
856 bitmap = cluster->points_to_bitmap;
857 cluster->block_group = NULL; 1863 cluster->block_group = NULL;
858 cluster->window_start = 0; 1864 cluster->window_start = 0;
859 list_del_init(&cluster->block_group_list); 1865 list_del_init(&cluster->block_group_list);
860 cluster->points_to_bitmap = false;
861
862 if (bitmap)
863 goto out;
864 1866
865 node = rb_first(&cluster->root); 1867 node = rb_first(&cluster->root);
866 while (node) { 1868 while (node) {
1869 bool bitmap;
1870
867 entry = rb_entry(node, struct btrfs_free_space, offset_index); 1871 entry = rb_entry(node, struct btrfs_free_space, offset_index);
868 node = rb_next(&entry->offset_index); 1872 node = rb_next(&entry->offset_index);
869 rb_erase(&entry->offset_index, &cluster->root); 1873 rb_erase(&entry->offset_index, &cluster->root);
870 BUG_ON(entry->bitmap); 1874
871 tree_insert_offset(&block_group->free_space_offset, 1875 bitmap = (entry->bitmap != NULL);
872 entry->offset, &entry->offset_index, 0); 1876 if (!bitmap)
1877 try_merge_free_space(ctl, entry, false);
1878 tree_insert_offset(&ctl->free_space_offset,
1879 entry->offset, &entry->offset_index, bitmap);
873 } 1880 }
874 cluster->root = RB_ROOT; 1881 cluster->root = RB_ROOT;
875 1882
@@ -879,14 +1886,41 @@ out:
879 return 0; 1886 return 0;
880} 1887}
881 1888
882void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group) 1889void __btrfs_remove_free_space_cache_locked(struct btrfs_free_space_ctl *ctl)
883{ 1890{
884 struct btrfs_free_space *info; 1891 struct btrfs_free_space *info;
885 struct rb_node *node; 1892 struct rb_node *node;
1893
1894 while ((node = rb_last(&ctl->free_space_offset)) != NULL) {
1895 info = rb_entry(node, struct btrfs_free_space, offset_index);
1896 if (!info->bitmap) {
1897 unlink_free_space(ctl, info);
1898 kmem_cache_free(btrfs_free_space_cachep, info);
1899 } else {
1900 free_bitmap(ctl, info);
1901 }
1902 if (need_resched()) {
1903 spin_unlock(&ctl->tree_lock);
1904 cond_resched();
1905 spin_lock(&ctl->tree_lock);
1906 }
1907 }
1908}
1909
1910void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl)
1911{
1912 spin_lock(&ctl->tree_lock);
1913 __btrfs_remove_free_space_cache_locked(ctl);
1914 spin_unlock(&ctl->tree_lock);
1915}
1916
1917void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
1918{
1919 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
886 struct btrfs_free_cluster *cluster; 1920 struct btrfs_free_cluster *cluster;
887 struct list_head *head; 1921 struct list_head *head;
888 1922
889 spin_lock(&block_group->tree_lock); 1923 spin_lock(&ctl->tree_lock);
890 while ((head = block_group->cluster_list.next) != 1924 while ((head = block_group->cluster_list.next) !=
891 &block_group->cluster_list) { 1925 &block_group->cluster_list) {
892 cluster = list_entry(head, struct btrfs_free_cluster, 1926 cluster = list_entry(head, struct btrfs_free_cluster,
@@ -895,62 +1929,46 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group)
895 WARN_ON(cluster->block_group != block_group); 1929 WARN_ON(cluster->block_group != block_group);
896 __btrfs_return_cluster_to_free_space(block_group, cluster); 1930 __btrfs_return_cluster_to_free_space(block_group, cluster);
897 if (need_resched()) { 1931 if (need_resched()) {
898 spin_unlock(&block_group->tree_lock); 1932 spin_unlock(&ctl->tree_lock);
899 cond_resched();
900 spin_lock(&block_group->tree_lock);
901 }
902 }
903
904 while ((node = rb_last(&block_group->free_space_offset)) != NULL) {
905 info = rb_entry(node, struct btrfs_free_space, offset_index);
906 unlink_free_space(block_group, info);
907 if (info->bitmap)
908 kfree(info->bitmap);
909 kfree(info);
910 if (need_resched()) {
911 spin_unlock(&block_group->tree_lock);
912 cond_resched(); 1933 cond_resched();
913 spin_lock(&block_group->tree_lock); 1934 spin_lock(&ctl->tree_lock);
914 } 1935 }
915 } 1936 }
1937 __btrfs_remove_free_space_cache_locked(ctl);
1938 spin_unlock(&ctl->tree_lock);
916 1939
917 spin_unlock(&block_group->tree_lock);
918} 1940}
919 1941
920u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group, 1942u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
921 u64 offset, u64 bytes, u64 empty_size) 1943 u64 offset, u64 bytes, u64 empty_size)
922{ 1944{
1945 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
923 struct btrfs_free_space *entry = NULL; 1946 struct btrfs_free_space *entry = NULL;
924 u64 bytes_search = bytes + empty_size; 1947 u64 bytes_search = bytes + empty_size;
925 u64 ret = 0; 1948 u64 ret = 0;
926 1949
927 spin_lock(&block_group->tree_lock); 1950 spin_lock(&ctl->tree_lock);
928 entry = find_free_space(block_group, &offset, &bytes_search, 0); 1951 entry = find_free_space(ctl, &offset, &bytes_search);
929 if (!entry) 1952 if (!entry)
930 goto out; 1953 goto out;
931 1954
932 ret = offset; 1955 ret = offset;
933 if (entry->bitmap) { 1956 if (entry->bitmap) {
934 bitmap_clear_bits(block_group, entry, offset, bytes); 1957 bitmap_clear_bits(ctl, entry, offset, bytes);
935 if (!entry->bytes) { 1958 if (!entry->bytes)
936 unlink_free_space(block_group, entry); 1959 free_bitmap(ctl, entry);
937 kfree(entry->bitmap);
938 kfree(entry);
939 block_group->total_bitmaps--;
940 recalculate_thresholds(block_group);
941 }
942 } else { 1960 } else {
943 unlink_free_space(block_group, entry); 1961 unlink_free_space(ctl, entry);
944 entry->offset += bytes; 1962 entry->offset += bytes;
945 entry->bytes -= bytes; 1963 entry->bytes -= bytes;
946 if (!entry->bytes) 1964 if (!entry->bytes)
947 kfree(entry); 1965 kmem_cache_free(btrfs_free_space_cachep, entry);
948 else 1966 else
949 link_free_space(block_group, entry); 1967 link_free_space(ctl, entry);
950 } 1968 }
951 1969
952out: 1970out:
953 spin_unlock(&block_group->tree_lock); 1971 spin_unlock(&ctl->tree_lock);
954 1972
955 return ret; 1973 return ret;
956} 1974}
@@ -967,6 +1985,7 @@ int btrfs_return_cluster_to_free_space(
967 struct btrfs_block_group_cache *block_group, 1985 struct btrfs_block_group_cache *block_group,
968 struct btrfs_free_cluster *cluster) 1986 struct btrfs_free_cluster *cluster)
969{ 1987{
1988 struct btrfs_free_space_ctl *ctl;
970 int ret; 1989 int ret;
971 1990
972 /* first, get a safe pointer to the block group */ 1991 /* first, get a safe pointer to the block group */
@@ -985,10 +2004,12 @@ int btrfs_return_cluster_to_free_space(
985 atomic_inc(&block_group->count); 2004 atomic_inc(&block_group->count);
986 spin_unlock(&cluster->lock); 2005 spin_unlock(&cluster->lock);
987 2006
2007 ctl = block_group->free_space_ctl;
2008
988 /* now return any extents the cluster had on it */ 2009 /* now return any extents the cluster had on it */
989 spin_lock(&block_group->tree_lock); 2010 spin_lock(&ctl->tree_lock);
990 ret = __btrfs_return_cluster_to_free_space(block_group, cluster); 2011 ret = __btrfs_return_cluster_to_free_space(block_group, cluster);
991 spin_unlock(&block_group->tree_lock); 2012 spin_unlock(&ctl->tree_lock);
992 2013
993 /* finally drop our ref */ 2014 /* finally drop our ref */
994 btrfs_put_block_group(block_group); 2015 btrfs_put_block_group(block_group);
@@ -997,48 +2018,24 @@ int btrfs_return_cluster_to_free_space(
997 2018
998static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group, 2019static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group,
999 struct btrfs_free_cluster *cluster, 2020 struct btrfs_free_cluster *cluster,
2021 struct btrfs_free_space *entry,
1000 u64 bytes, u64 min_start) 2022 u64 bytes, u64 min_start)
1001{ 2023{
1002 struct btrfs_free_space *entry; 2024 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
1003 int err; 2025 int err;
1004 u64 search_start = cluster->window_start; 2026 u64 search_start = cluster->window_start;
1005 u64 search_bytes = bytes; 2027 u64 search_bytes = bytes;
1006 u64 ret = 0; 2028 u64 ret = 0;
1007 2029
1008 spin_lock(&block_group->tree_lock);
1009 spin_lock(&cluster->lock);
1010
1011 if (!cluster->points_to_bitmap)
1012 goto out;
1013
1014 if (cluster->block_group != block_group)
1015 goto out;
1016
1017 /*
1018 * search_start is the beginning of the bitmap, but at some point it may
1019 * be a good idea to point to the actual start of the free area in the
1020 * bitmap, so do the offset_to_bitmap trick anyway, and set bitmap_only
1021 * to 1 to make sure we get the bitmap entry
1022 */
1023 entry = tree_search_offset(block_group,
1024 offset_to_bitmap(block_group, search_start),
1025 1, 0);
1026 if (!entry || !entry->bitmap)
1027 goto out;
1028
1029 search_start = min_start; 2030 search_start = min_start;
1030 search_bytes = bytes; 2031 search_bytes = bytes;
1031 2032
1032 err = search_bitmap(block_group, entry, &search_start, 2033 err = search_bitmap(ctl, entry, &search_start, &search_bytes);
1033 &search_bytes);
1034 if (err) 2034 if (err)
1035 goto out; 2035 return 0;
1036 2036
1037 ret = search_start; 2037 ret = search_start;
1038 bitmap_clear_bits(block_group, entry, ret, bytes); 2038 bitmap_clear_bits(ctl, entry, ret, bytes);
1039out:
1040 spin_unlock(&cluster->lock);
1041 spin_unlock(&block_group->tree_lock);
1042 2039
1043 return ret; 2040 return ret;
1044} 2041}
@@ -1052,14 +2049,11 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
1052 struct btrfs_free_cluster *cluster, u64 bytes, 2049 struct btrfs_free_cluster *cluster, u64 bytes,
1053 u64 min_start) 2050 u64 min_start)
1054{ 2051{
2052 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
1055 struct btrfs_free_space *entry = NULL; 2053 struct btrfs_free_space *entry = NULL;
1056 struct rb_node *node; 2054 struct rb_node *node;
1057 u64 ret = 0; 2055 u64 ret = 0;
1058 2056
1059 if (cluster->points_to_bitmap)
1060 return btrfs_alloc_from_bitmap(block_group, cluster, bytes,
1061 min_start);
1062
1063 spin_lock(&cluster->lock); 2057 spin_lock(&cluster->lock);
1064 if (bytes > cluster->max_size) 2058 if (bytes > cluster->max_size)
1065 goto out; 2059 goto out;
@@ -1072,11 +2066,9 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
1072 goto out; 2066 goto out;
1073 2067
1074 entry = rb_entry(node, struct btrfs_free_space, offset_index); 2068 entry = rb_entry(node, struct btrfs_free_space, offset_index);
1075
1076 while(1) { 2069 while(1) {
1077 if (entry->bytes < bytes || entry->offset < min_start) { 2070 if (entry->bytes < bytes ||
1078 struct rb_node *node; 2071 (!entry->bitmap && entry->offset < min_start)) {
1079
1080 node = rb_next(&entry->offset_index); 2072 node = rb_next(&entry->offset_index);
1081 if (!node) 2073 if (!node)
1082 break; 2074 break;
@@ -1084,20 +2076,52 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
1084 offset_index); 2076 offset_index);
1085 continue; 2077 continue;
1086 } 2078 }
1087 ret = entry->offset;
1088 2079
1089 entry->offset += bytes; 2080 if (entry->bitmap) {
1090 entry->bytes -= bytes; 2081 ret = btrfs_alloc_from_bitmap(block_group,
2082 cluster, entry, bytes,
2083 min_start);
2084 if (ret == 0) {
2085 node = rb_next(&entry->offset_index);
2086 if (!node)
2087 break;
2088 entry = rb_entry(node, struct btrfs_free_space,
2089 offset_index);
2090 continue;
2091 }
2092 } else {
1091 2093
1092 if (entry->bytes == 0) { 2094 ret = entry->offset;
1093 rb_erase(&entry->offset_index, &cluster->root); 2095
1094 kfree(entry); 2096 entry->offset += bytes;
2097 entry->bytes -= bytes;
1095 } 2098 }
2099
2100 if (entry->bytes == 0)
2101 rb_erase(&entry->offset_index, &cluster->root);
1096 break; 2102 break;
1097 } 2103 }
1098out: 2104out:
1099 spin_unlock(&cluster->lock); 2105 spin_unlock(&cluster->lock);
1100 2106
2107 if (!ret)
2108 return 0;
2109
2110 spin_lock(&ctl->tree_lock);
2111
2112 ctl->free_space -= bytes;
2113 if (entry->bytes == 0) {
2114 ctl->free_extents--;
2115 if (entry->bitmap) {
2116 kfree(entry->bitmap);
2117 ctl->total_bitmaps--;
2118 ctl->op->recalc_thresholds(ctl);
2119 }
2120 kmem_cache_free(btrfs_free_space_cachep, entry);
2121 }
2122
2123 spin_unlock(&ctl->tree_lock);
2124
1101 return ret; 2125 return ret;
1102} 2126}
1103 2127
@@ -1106,6 +2130,7 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
1106 struct btrfs_free_cluster *cluster, 2130 struct btrfs_free_cluster *cluster,
1107 u64 offset, u64 bytes, u64 min_bytes) 2131 u64 offset, u64 bytes, u64 min_bytes)
1108{ 2132{
2133 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
1109 unsigned long next_zero; 2134 unsigned long next_zero;
1110 unsigned long i; 2135 unsigned long i;
1111 unsigned long search_bits; 2136 unsigned long search_bits;
@@ -1113,12 +2138,13 @@ static int btrfs_bitmap_cluster(struct btrfs_block_group_cache *block_group,
1113 unsigned long found_bits; 2138 unsigned long found_bits;
1114 unsigned long start = 0; 2139 unsigned long start = 0;
1115 unsigned long total_found = 0; 2140 unsigned long total_found = 0;
2141 int ret;
1116 bool found = false; 2142 bool found = false;
1117 2143
1118 i = offset_to_bit(entry->offset, block_group->sectorsize, 2144 i = offset_to_bit(entry->offset, block_group->sectorsize,
1119 max_t(u64, offset, entry->offset)); 2145 max_t(u64, offset, entry->offset));
1120 search_bits = bytes_to_bits(min_bytes, block_group->sectorsize); 2146 search_bits = bytes_to_bits(bytes, block_group->sectorsize);
1121 total_bits = bytes_to_bits(bytes, block_group->sectorsize); 2147 total_bits = bytes_to_bits(min_bytes, block_group->sectorsize);
1122 2148
1123again: 2149again:
1124 found_bits = 0; 2150 found_bits = 0;
@@ -1135,7 +2161,7 @@ again:
1135 } 2161 }
1136 2162
1137 if (!found_bits) 2163 if (!found_bits)
1138 return -1; 2164 return -ENOSPC;
1139 2165
1140 if (!found) { 2166 if (!found) {
1141 start = i; 2167 start = i;
@@ -1159,131 +2185,67 @@ again:
1159 2185
1160 cluster->window_start = start * block_group->sectorsize + 2186 cluster->window_start = start * block_group->sectorsize +
1161 entry->offset; 2187 entry->offset;
1162 cluster->points_to_bitmap = true; 2188 rb_erase(&entry->offset_index, &ctl->free_space_offset);
2189 ret = tree_insert_offset(&cluster->root, entry->offset,
2190 &entry->offset_index, 1);
2191 BUG_ON(ret);
1163 2192
1164 return 0; 2193 return 0;
1165} 2194}
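
Note the hunk above also swaps the two conversions, so search_bits is now derived from bytes and total_bits from min_bytes. The underlying index math maps byte offsets onto per-sector bit positions relative to the owning bitmap entry; here is a self-contained sketch assuming the simplest possible helper bodies (the real offset_to_bit()/bytes_to_bits() are not part of this diff, so these definitions are an assumption):

#include <stdint.h>
#include <stdio.h>

/* Assumed-simple models: one bit per sector, indices relative to the
 * bitmap entry's start offset. */
static uint64_t offset_to_bit(uint64_t entry_offset, uint32_t sectorsize,
                              uint64_t offset)
{
        return (offset - entry_offset) / sectorsize;
}

static uint64_t bytes_to_bits(uint64_t bytes, uint32_t sectorsize)
{
        return bytes / sectorsize;
}

int main(void)
{
        /* With 4K sectors, byte 1M inside a bitmap entry starting at 0
         * is bit 256, and a 64K run is 16 consecutive bits. */
        printf("bit=%llu run=%llu\n",
               (unsigned long long)offset_to_bit(0, 4096, 1024 * 1024),
               (unsigned long long)bytes_to_bits(64 * 1024, 4096));
        return 0;
}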
1166 2195
1167/* 2196/*
1168 * here we try to find a cluster of blocks in a block group. The goal 2197 * This searches the block group for just extents to fill the cluster with.
1169 * is to find at least bytes free and up to empty_size + bytes free.
1170 * We might not find them all in one contiguous area.
1171 *
1172 * returns zero and sets up cluster if things worked out, otherwise
1173 * it returns -enospc
1174 */ 2198 */
1175int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, 2199static noinline int
1176 struct btrfs_root *root, 2200setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
1177 struct btrfs_block_group_cache *block_group, 2201 struct btrfs_free_cluster *cluster,
1178 struct btrfs_free_cluster *cluster, 2202 struct list_head *bitmaps, u64 offset, u64 bytes,
1179 u64 offset, u64 bytes, u64 empty_size) 2203 u64 min_bytes)
1180{ 2204{
2205 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2206 struct btrfs_free_space *first = NULL;
1181 struct btrfs_free_space *entry = NULL; 2207 struct btrfs_free_space *entry = NULL;
2208 struct btrfs_free_space *prev = NULL;
2209 struct btrfs_free_space *last;
1182 struct rb_node *node; 2210 struct rb_node *node;
1183 struct btrfs_free_space *next;
1184 struct btrfs_free_space *last = NULL;
1185 u64 min_bytes;
1186 u64 window_start; 2211 u64 window_start;
1187 u64 window_free; 2212 u64 window_free;
1188 u64 max_extent = 0; 2213 u64 max_extent;
1189 bool found_bitmap = false; 2214 u64 max_gap = 128 * 1024;
1190 int ret;
1191 2215
1192 /* for metadata, allow allocates with more holes */ 2216 entry = tree_search_offset(ctl, offset, 0, 1);
1193 if (btrfs_test_opt(root, SSD_SPREAD)) { 2217 if (!entry)
1194 min_bytes = bytes + empty_size; 2218 return -ENOSPC;
1195 } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
1196 /*
1197 * we want to do larger allocations when we are
1198 * flushing out the delayed refs, it helps prevent
1199 * making more work as we go along.
1200 */
1201 if (trans->transaction->delayed_refs.flushing)
1202 min_bytes = max(bytes, (bytes + empty_size) >> 1);
1203 else
1204 min_bytes = max(bytes, (bytes + empty_size) >> 4);
1205 } else
1206 min_bytes = max(bytes, (bytes + empty_size) >> 2);
1207
1208 spin_lock(&block_group->tree_lock);
1209 spin_lock(&cluster->lock);
1210
1211 /* someone already found a cluster, hooray */
1212 if (cluster->block_group) {
1213 ret = 0;
1214 goto out;
1215 }
1216again:
1217 entry = tree_search_offset(block_group, offset, found_bitmap, 1);
1218 if (!entry) {
1219 ret = -ENOSPC;
1220 goto out;
1221 }
1222 2219
1223 /* 2220 /*
1224 * If found_bitmap is true, we exhausted our search for extent entries, 2221 * We don't want bitmaps, so just move along until we find a normal
1225 * and we just want to search all of the bitmaps that we can find, and 2222 * extent entry.
1226 * ignore any extent entries we find.
1227 */ 2223 */
1228 while (entry->bitmap || found_bitmap || 2224 while (entry->bitmap) {
1229 (!entry->bitmap && entry->bytes < min_bytes)) { 2225 if (list_empty(&entry->list))
1230 struct rb_node *node = rb_next(&entry->offset_index); 2226 list_add_tail(&entry->list, bitmaps);
1231 2227 node = rb_next(&entry->offset_index);
1232 if (entry->bitmap && entry->bytes > bytes + empty_size) { 2228 if (!node)
1233 ret = btrfs_bitmap_cluster(block_group, entry, cluster, 2229 return -ENOSPC;
1234 offset, bytes + empty_size,
1235 min_bytes);
1236 if (!ret)
1237 goto got_it;
1238 }
1239
1240 if (!node) {
1241 ret = -ENOSPC;
1242 goto out;
1243 }
1244 entry = rb_entry(node, struct btrfs_free_space, offset_index); 2230 entry = rb_entry(node, struct btrfs_free_space, offset_index);
1245 } 2231 }
1246 2232
1247 /*
1248 * We already searched all the extent entries from the passed in offset
1249 * to the end and didn't find enough space for the cluster, and we also
1250 * didn't find any bitmaps that met our criteria, just go ahead and exit
1251 */
1252 if (found_bitmap) {
1253 ret = -ENOSPC;
1254 goto out;
1255 }
1256
1257 cluster->points_to_bitmap = false;
1258 window_start = entry->offset; 2233 window_start = entry->offset;
1259 window_free = entry->bytes; 2234 window_free = entry->bytes;
1260 last = entry;
1261 max_extent = entry->bytes; 2235 max_extent = entry->bytes;
2236 first = entry;
2237 last = entry;
2238 prev = entry;
1262 2239
1263 while (1) { 2240 while (window_free <= min_bytes) {
1264 /* out window is just right, lets fill it */ 2241 node = rb_next(&entry->offset_index);
1265 if (window_free >= bytes + empty_size) 2242 if (!node)
1266 break; 2243 return -ENOSPC;
1267 2244 entry = rb_entry(node, struct btrfs_free_space, offset_index);
1268 node = rb_next(&last->offset_index);
1269 if (!node) {
1270 if (found_bitmap)
1271 goto again;
1272 ret = -ENOSPC;
1273 goto out;
1274 }
1275 next = rb_entry(node, struct btrfs_free_space, offset_index);
1276 2245
1277 /* 2246 if (entry->bitmap) {
1278 * we found a bitmap, so if this search doesn't result in a 2247 if (list_empty(&entry->list))
1279 * cluster, we know to go and search again for the bitmaps and 2248 list_add_tail(&entry->list, bitmaps);
1280 * start looking for space there
1281 */
1282 if (next->bitmap) {
1283 if (!found_bitmap)
1284 offset = next->offset;
1285 found_bitmap = true;
1286 last = next;
1287 continue; 2249 continue;
1288 } 2250 }
1289 2251
@@ -1291,60 +2253,190 @@ again:
1291 * we haven't filled the empty size and the window is 2253 * we haven't filled the empty size and the window is
1292 * very large. reset and try again 2254 * very large. reset and try again
1293 */ 2255 */
1294 if (next->offset - (last->offset + last->bytes) > 128 * 1024 || 2256 if (entry->offset - (prev->offset + prev->bytes) > max_gap ||
1295 next->offset - window_start > (bytes + empty_size) * 2) { 2257 entry->offset - window_start > (min_bytes * 2)) {
1296 entry = next; 2258 first = entry;
1297 window_start = entry->offset; 2259 window_start = entry->offset;
1298 window_free = entry->bytes; 2260 window_free = entry->bytes;
1299 last = entry; 2261 last = entry;
1300 max_extent = entry->bytes; 2262 max_extent = entry->bytes;
1301 } else { 2263 } else {
1302 last = next; 2264 last = entry;
1303 window_free += next->bytes; 2265 window_free += entry->bytes;
1304 if (entry->bytes > max_extent) 2266 if (entry->bytes > max_extent)
1305 max_extent = entry->bytes; 2267 max_extent = entry->bytes;
1306 } 2268 }
2269 prev = entry;
1307 } 2270 }
1308 2271
1309 cluster->window_start = entry->offset; 2272 cluster->window_start = first->offset;
2273
2274 node = &first->offset_index;
1310 2275
1311 /* 2276 /*
1312 * now we've found our entries, pull them out of the free space 2277 * now we've found our entries, pull them out of the free space
1313 * cache and put them into the cluster rbtree 2278 * cache and put them into the cluster rbtree
1314 *
1315 * The cluster includes an rbtree, but only uses the offset index
1316 * of each free space cache entry.
1317 */ 2279 */
1318 while (1) { 2280 do {
2281 int ret;
2282
2283 entry = rb_entry(node, struct btrfs_free_space, offset_index);
1319 node = rb_next(&entry->offset_index); 2284 node = rb_next(&entry->offset_index);
1320 if (entry->bitmap && node) { 2285 if (entry->bitmap)
1321 entry = rb_entry(node, struct btrfs_free_space,
1322 offset_index);
1323 continue; 2286 continue;
1324 } else if (entry->bitmap && !node) {
1325 break;
1326 }
1327 2287
1328 rb_erase(&entry->offset_index, &block_group->free_space_offset); 2288 rb_erase(&entry->offset_index, &ctl->free_space_offset);
1329 ret = tree_insert_offset(&cluster->root, entry->offset, 2289 ret = tree_insert_offset(&cluster->root, entry->offset,
1330 &entry->offset_index, 0); 2290 &entry->offset_index, 0);
1331 BUG_ON(ret); 2291 BUG_ON(ret);
2292 } while (node && entry != last);
1332 2293
1333 if (!node || entry == last) 2294 cluster->max_size = max_extent;
1334 break;
1335 2295
2296 return 0;
2297}
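
setup_cluster_no_bitmap() above grows a window of extent entries until it holds more than min_bytes of free space, restarting whenever the gap to the next extent exceeds max_gap (128K) or the window spans more than twice min_bytes. A compilable toy model of that windowing decision, flattened onto an array instead of the rbtree and ignoring bitmap entries (both deliberate simplifications):

#include <stddef.h>
#include <stdint.h>

struct toy_extent {
        uint64_t offset;
        uint64_t bytes;
};

/* Returns 1 and sets *first_idx to the window's first extent when a
 * window holding more than min_bytes is found, 0 otherwise. */
static int pick_window(const struct toy_extent *e, size_t n,
                       uint64_t min_bytes, size_t *first_idx)
{
        const uint64_t max_gap = 128 * 1024;
        uint64_t window_start, window_free;
        size_t first = 0, i;

        if (n == 0)
                return 0;
        window_start = e[0].offset;
        window_free = e[0].bytes;

        for (i = 1; window_free <= min_bytes && i < n; i++) {
                uint64_t gap = e[i].offset -
                               (e[i - 1].offset + e[i - 1].bytes);

                if (gap > max_gap ||
                    e[i].offset - window_start > min_bytes * 2) {
                        /* window too sparse or too wide: restart it here */
                        first = i;
                        window_start = e[i].offset;
                        window_free = e[i].bytes;
                } else {
                        window_free += e[i].bytes;
                }
        }
        *first_idx = first;
        return window_free > min_bytes;
}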
2298
2299/*
2300 * This specifically looks for bitmaps that may work in the cluster, we assume
2301 * that we have already failed to find extents that will work.
2302 */
2303static noinline int
2304setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
2305 struct btrfs_free_cluster *cluster,
2306 struct list_head *bitmaps, u64 offset, u64 bytes,
2307 u64 min_bytes)
2308{
2309 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2310 struct btrfs_free_space *entry;
2311 struct rb_node *node;
2312 int ret = -ENOSPC;
2313
2314 if (ctl->total_bitmaps == 0)
2315 return -ENOSPC;
2316
2317 /*
2318 * First check our cached list of bitmaps and see if there is an entry
2319 * here that will work.
2320 */
2321 list_for_each_entry(entry, bitmaps, list) {
2322 if (entry->bytes < min_bytes)
2323 continue;
2324 ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
2325 bytes, min_bytes);
2326 if (!ret)
2327 return 0;
2328 }
2329
2330 /*
2331 * If there are entries on our list but we are still here, then none of
2332 * them worked, so get the next entry after the last entry in the list
2333 * and start the search from there.
2334 */
2335 if (!list_empty(bitmaps)) {
2336 entry = list_entry(bitmaps->prev, struct btrfs_free_space,
2337 list);
2338 node = rb_next(&entry->offset_index);
2339 if (!node)
2340 return -ENOSPC;
1336 entry = rb_entry(node, struct btrfs_free_space, offset_index); 2341 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2342 goto search;
1337 } 2343 }
1338 2344
1339 cluster->max_size = max_extent; 2345 entry = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 0, 1);
1340got_it: 2346 if (!entry)
1341 ret = 0; 2347 return -ENOSPC;
1342 atomic_inc(&block_group->count); 2348
1343 list_add_tail(&cluster->block_group_list, &block_group->cluster_list); 2349search:
1344 cluster->block_group = block_group; 2350 node = &entry->offset_index;
2351 do {
2352 entry = rb_entry(node, struct btrfs_free_space, offset_index);
2353 node = rb_next(&entry->offset_index);
2354 if (!entry->bitmap)
2355 continue;
2356 if (entry->bytes < min_bytes)
2357 continue;
2358 ret = btrfs_bitmap_cluster(block_group, entry, cluster, offset,
2359 bytes, min_bytes);
2360 } while (ret && node);
2361
2362 return ret;
2363}
2364
2365/*
2366 * here we try to find a cluster of blocks in a block group. The goal
2367 * is to find at least bytes free and up to empty_size + bytes free.
2368 * We might not find them all in one contiguous area.
2369 *
2370 * returns zero and sets up cluster if things worked out, otherwise
2371 * it returns -enospc
2372 */
2373int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
2374 struct btrfs_root *root,
2375 struct btrfs_block_group_cache *block_group,
2376 struct btrfs_free_cluster *cluster,
2377 u64 offset, u64 bytes, u64 empty_size)
2378{
2379 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2380 struct list_head bitmaps;
2381 struct btrfs_free_space *entry, *tmp;
2382 u64 min_bytes;
2383 int ret;
2384
2385 /* for metadata, allow allocates with more holes */
2386 if (btrfs_test_opt(root, SSD_SPREAD)) {
2387 min_bytes = bytes + empty_size;
2388 } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) {
2389 /*
2390 * we want to do larger allocations when we are
2391 * flushing out the delayed refs; it helps prevent
2392 * making more work as we go along.
2393 */
2394 if (trans->transaction->delayed_refs.flushing)
2395 min_bytes = max(bytes, (bytes + empty_size) >> 1);
2396 else
2397 min_bytes = max(bytes, (bytes + empty_size) >> 4);
2398 } else
2399 min_bytes = max(bytes, (bytes + empty_size) >> 2);
2400
2401 spin_lock(&ctl->tree_lock);
2402
2403 /*
2404 * If we know we don't have enough space to make a cluster don't even
2405 * bother doing all the work to try and find one.
2406 */
2407 if (ctl->free_space < min_bytes) {
2408 spin_unlock(&ctl->tree_lock);
2409 return -ENOSPC;
2410 }
2411
2412 spin_lock(&cluster->lock);
2413
2414 /* someone already found a cluster, hooray */
2415 if (cluster->block_group) {
2416 ret = 0;
2417 goto out;
2418 }
2419
2420 INIT_LIST_HEAD(&bitmaps);
2421 ret = setup_cluster_no_bitmap(block_group, cluster, &bitmaps, offset,
2422 bytes, min_bytes);
2423 if (ret)
2424 ret = setup_cluster_bitmap(block_group, cluster, &bitmaps,
2425 offset, bytes, min_bytes);
2426
2427 /* Clear our temporary list */
2428 list_for_each_entry_safe(entry, tmp, &bitmaps, list)
2429 list_del_init(&entry->list);
2430
2431 if (!ret) {
2432 atomic_inc(&block_group->count);
2433 list_add_tail(&cluster->block_group_list,
2434 &block_group->cluster_list);
2435 cluster->block_group = block_group;
2436 }
1345out: 2437out:
1346 spin_unlock(&cluster->lock); 2438 spin_unlock(&cluster->lock);
1347 spin_unlock(&block_group->tree_lock); 2439 spin_unlock(&ctl->tree_lock);
1348 2440
1349 return ret; 2441 return ret;
1350} 2442}
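
For reference, the min_bytes heuristic above relaxes the contiguity requirement by allocation type. A worked example for the data branch (the final else): requesting bytes = 1M with empty_size = 4M yields min_bytes = max(1M, 5M >> 2) = 1.25M, so the cluster setup only needs to find 1.25M of reasonably close free space rather than the full 5M. As a standalone expression:

#include <stdint.h>

static inline uint64_t max_u64(uint64_t a, uint64_t b)
{
        return a > b ? a : b;
}

/* Data block groups: min_bytes = max(bytes, (bytes + empty_size) >> 2).
 * E.g. bytes = 1M, empty_size = 4M -> 1310720 (1.25M). */
static uint64_t data_cluster_min_bytes(uint64_t bytes, uint64_t empty_size)
{
        return max_u64(bytes, (bytes + empty_size) >> 2);
}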
@@ -1358,8 +2450,244 @@ void btrfs_init_free_cluster(struct btrfs_free_cluster *cluster)
1358 spin_lock_init(&cluster->refill_lock); 2450 spin_lock_init(&cluster->refill_lock);
1359 cluster->root = RB_ROOT; 2451 cluster->root = RB_ROOT;
1360 cluster->max_size = 0; 2452 cluster->max_size = 0;
1361 cluster->points_to_bitmap = false;
1362 INIT_LIST_HEAD(&cluster->block_group_list); 2453 INIT_LIST_HEAD(&cluster->block_group_list);
1363 cluster->block_group = NULL; 2454 cluster->block_group = NULL;
1364} 2455}
1365 2456
2457int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
2458 u64 *trimmed, u64 start, u64 end, u64 minlen)
2459{
2460 struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
2461 struct btrfs_free_space *entry = NULL;
2462 struct btrfs_fs_info *fs_info = block_group->fs_info;
2463 u64 bytes = 0;
2464 u64 actually_trimmed;
2465 int ret = 0;
2466
2467 *trimmed = 0;
2468
2469 while (start < end) {
2470 spin_lock(&ctl->tree_lock);
2471
2472 if (ctl->free_space < minlen) {
2473 spin_unlock(&ctl->tree_lock);
2474 break;
2475 }
2476
2477 entry = tree_search_offset(ctl, start, 0, 1);
2478 if (!entry)
2479 entry = tree_search_offset(ctl,
2480 offset_to_bitmap(ctl, start),
2481 1, 1);
2482
2483 if (!entry || entry->offset >= end) {
2484 spin_unlock(&ctl->tree_lock);
2485 break;
2486 }
2487
2488 if (entry->bitmap) {
2489 ret = search_bitmap(ctl, entry, &start, &bytes);
2490 if (!ret) {
2491 if (start >= end) {
2492 spin_unlock(&ctl->tree_lock);
2493 break;
2494 }
2495 bytes = min(bytes, end - start);
2496 bitmap_clear_bits(ctl, entry, start, bytes);
2497 if (entry->bytes == 0)
2498 free_bitmap(ctl, entry);
2499 } else {
2500 start = entry->offset + BITS_PER_BITMAP *
2501 block_group->sectorsize;
2502 spin_unlock(&ctl->tree_lock);
2503 ret = 0;
2504 continue;
2505 }
2506 } else {
2507 start = entry->offset;
2508 bytes = min(entry->bytes, end - start);
2509 unlink_free_space(ctl, entry);
2510 kmem_cache_free(btrfs_free_space_cachep, entry);
2511 }
2512
2513 spin_unlock(&ctl->tree_lock);
2514
2515 if (bytes >= minlen) {
2516 int update_ret;
2517 update_ret = btrfs_update_reserved_bytes(block_group,
2518 bytes, 1, 1);
2519
2520 ret = btrfs_error_discard_extent(fs_info->extent_root,
2521 start,
2522 bytes,
2523 &actually_trimmed);
2524
2525 btrfs_add_free_space(block_group, start, bytes);
2526 if (!update_ret)
2527 btrfs_update_reserved_bytes(block_group,
2528 bytes, 0, 1);
2529
2530 if (ret)
2531 break;
2532 *trimmed += actually_trimmed;
2533 }
2534 start += bytes;
2535 bytes = 0;
2536
2537 if (fatal_signal_pending(current)) {
2538 ret = -ERESTARTSYS;
2539 break;
2540 }
2541
2542 cond_resched();
2543 }
2544
2545 return ret;
2546}
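
A hypothetical caller sketch for the new trim entry point, using the signature declared in free-space-cache.h below. The real caller (the FITRIM path) iterates block groups; that iteration and the range_start/range_end/min_len/total_trimmed names are illustrative assumptions here:

u64 group_trimmed = 0;
int ret;

/* Trim everything in [range_start, range_end) within this block group,
 * discarding only free ranges of at least min_len bytes. */
ret = btrfs_trim_block_group(block_group, &group_trimmed,
                             range_start, range_end, min_len);
if (!ret)
        total_trimmed += group_trimmed;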
2547
2548/*
2549 * Find the left-most item in the cache tree, and then return the
2550 * smallest inode number in the item.
2551 *
2552 * Note: the returned inode number may not be the smallest one in
2553 * the tree, if the left-most item is a bitmap.
2554 */
2555u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root)
2556{
2557 struct btrfs_free_space_ctl *ctl = fs_root->free_ino_ctl;
2558 struct btrfs_free_space *entry = NULL;
2559 u64 ino = 0;
2560
2561 spin_lock(&ctl->tree_lock);
2562
2563 if (RB_EMPTY_ROOT(&ctl->free_space_offset))
2564 goto out;
2565
2566 entry = rb_entry(rb_first(&ctl->free_space_offset),
2567 struct btrfs_free_space, offset_index);
2568
2569 if (!entry->bitmap) {
2570 ino = entry->offset;
2571
2572 unlink_free_space(ctl, entry);
2573 entry->offset++;
2574 entry->bytes--;
2575 if (!entry->bytes)
2576 kmem_cache_free(btrfs_free_space_cachep, entry);
2577 else
2578 link_free_space(ctl, entry);
2579 } else {
2580 u64 offset = 0;
2581 u64 count = 1;
2582 int ret;
2583
2584 ret = search_bitmap(ctl, entry, &offset, &count);
2585 BUG_ON(ret);
2586
2587 ino = offset;
2588 bitmap_clear_bits(ctl, entry, offset, 1);
2589 if (entry->bytes == 0)
2590 free_bitmap(ctl, entry);
2591 }
2592out:
2593 spin_unlock(&ctl->tree_lock);
2594
2595 return ino;
2596}
2597
2598struct inode *lookup_free_ino_inode(struct btrfs_root *root,
2599 struct btrfs_path *path)
2600{
2601 struct inode *inode = NULL;
2602
2603 spin_lock(&root->cache_lock);
2604 if (root->cache_inode)
2605 inode = igrab(root->cache_inode);
2606 spin_unlock(&root->cache_lock);
2607 if (inode)
2608 return inode;
2609
2610 inode = __lookup_free_space_inode(root, path, 0);
2611 if (IS_ERR(inode))
2612 return inode;
2613
2614 spin_lock(&root->cache_lock);
2615 if (!btrfs_fs_closing(root->fs_info))
2616 root->cache_inode = igrab(inode);
2617 spin_unlock(&root->cache_lock);
2618
2619 return inode;
2620}
2621
2622int create_free_ino_inode(struct btrfs_root *root,
2623 struct btrfs_trans_handle *trans,
2624 struct btrfs_path *path)
2625{
2626 return __create_free_space_inode(root, trans, path,
2627 BTRFS_FREE_INO_OBJECTID, 0);
2628}
2629
2630int load_free_ino_cache(struct btrfs_fs_info *fs_info, struct btrfs_root *root)
2631{
2632 struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
2633 struct btrfs_path *path;
2634 struct inode *inode;
2635 int ret = 0;
2636 u64 root_gen = btrfs_root_generation(&root->root_item);
2637
2638 if (!btrfs_test_opt(root, INODE_MAP_CACHE))
2639 return 0;
2640
2641 /*
2642 * If we're unmounting then just return, since this does a search on the
2643 * normal root and not the commit root and we could deadlock.
2644 */
2645 if (btrfs_fs_closing(fs_info))
2646 return 0;
2647
2648 path = btrfs_alloc_path();
2649 if (!path)
2650 return 0;
2651
2652 inode = lookup_free_ino_inode(root, path);
2653 if (IS_ERR(inode))
2654 goto out;
2655
2656 if (root_gen != BTRFS_I(inode)->generation)
2657 goto out_put;
2658
2659 ret = __load_free_space_cache(root, inode, ctl, path, 0);
2660
2661 if (ret < 0)
2662 printk(KERN_ERR "btrfs: failed to load free ino cache for "
2663 "root %llu\n", root->root_key.objectid);
2664out_put:
2665 iput(inode);
2666out:
2667 btrfs_free_path(path);
2668 return ret;
2669}
2670
2671int btrfs_write_out_ino_cache(struct btrfs_root *root,
2672 struct btrfs_trans_handle *trans,
2673 struct btrfs_path *path)
2674{
2675 struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
2676 struct inode *inode;
2677 int ret;
2678
2679 if (!btrfs_test_opt(root, INODE_MAP_CACHE))
2680 return 0;
2681
2682 inode = lookup_free_ino_inode(root, path);
2683 if (IS_ERR(inode))
2684 return 0;
2685
2686 ret = __btrfs_write_out_cache(root, inode, ctl, NULL, trans, path, 0);
2687 if (ret < 0)
2688 printk(KERN_ERR "btrfs: failed to write free ino cache "
2689 "for root %llu\n", root->root_key.objectid);
2690
2691 iput(inode);
2692 return ret;
2693}
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index 890a8e79011b..8f2613f779ed 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -27,17 +27,75 @@ struct btrfs_free_space {
27 struct list_head list; 27 struct list_head list;
28}; 28};
29 29
30int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, 30struct btrfs_free_space_ctl {
31 u64 bytenr, u64 size); 31 spinlock_t tree_lock;
32 struct rb_root free_space_offset;
33 u64 free_space;
34 int extents_thresh;
35 int free_extents;
36 int total_bitmaps;
37 int unit;
38 u64 start;
39 struct btrfs_free_space_op *op;
40 void *private;
41};
42
43struct btrfs_free_space_op {
44 void (*recalc_thresholds)(struct btrfs_free_space_ctl *ctl);
45 bool (*use_bitmap)(struct btrfs_free_space_ctl *ctl,
46 struct btrfs_free_space *info);
47};
48
49struct inode *lookup_free_space_inode(struct btrfs_root *root,
50 struct btrfs_block_group_cache
51 *block_group, struct btrfs_path *path);
52int create_free_space_inode(struct btrfs_root *root,
53 struct btrfs_trans_handle *trans,
54 struct btrfs_block_group_cache *block_group,
55 struct btrfs_path *path);
56
57int btrfs_truncate_free_space_cache(struct btrfs_root *root,
58 struct btrfs_trans_handle *trans,
59 struct btrfs_path *path,
60 struct inode *inode);
61int load_free_space_cache(struct btrfs_fs_info *fs_info,
62 struct btrfs_block_group_cache *block_group);
63int btrfs_write_out_cache(struct btrfs_root *root,
64 struct btrfs_trans_handle *trans,
65 struct btrfs_block_group_cache *block_group,
66 struct btrfs_path *path);
67
68struct inode *lookup_free_ino_inode(struct btrfs_root *root,
69 struct btrfs_path *path);
70int create_free_ino_inode(struct btrfs_root *root,
71 struct btrfs_trans_handle *trans,
72 struct btrfs_path *path);
73int load_free_ino_cache(struct btrfs_fs_info *fs_info,
74 struct btrfs_root *root);
75int btrfs_write_out_ino_cache(struct btrfs_root *root,
76 struct btrfs_trans_handle *trans,
77 struct btrfs_path *path);
78
79void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group);
80int __btrfs_add_free_space(struct btrfs_free_space_ctl *ctl,
81 u64 bytenr, u64 size);
82static inline int
83btrfs_add_free_space(struct btrfs_block_group_cache *block_group,
84 u64 bytenr, u64 size)
85{
86 return __btrfs_add_free_space(block_group->free_space_ctl,
87 bytenr, size);
88}
32int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, 89int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group,
33 u64 bytenr, u64 size); 90 u64 bytenr, u64 size);
91void __btrfs_remove_free_space_cache(struct btrfs_free_space_ctl *ctl);
34void btrfs_remove_free_space_cache(struct btrfs_block_group_cache 92void btrfs_remove_free_space_cache(struct btrfs_block_group_cache
35 *block_group); 93 *block_group);
36u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group, 94u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
37 u64 offset, u64 bytes, u64 empty_size); 95 u64 offset, u64 bytes, u64 empty_size);
96u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root);
38void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, 97void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
39 u64 bytes); 98 u64 bytes);
40u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group);
41int btrfs_find_space_cluster(struct btrfs_trans_handle *trans, 99int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
42 struct btrfs_root *root, 100 struct btrfs_root *root,
43 struct btrfs_block_group_cache *block_group, 101 struct btrfs_block_group_cache *block_group,
@@ -50,4 +108,6 @@ u64 btrfs_alloc_from_cluster(struct btrfs_block_group_cache *block_group,
50int btrfs_return_cluster_to_free_space( 108int btrfs_return_cluster_to_free_space(
51 struct btrfs_block_group_cache *block_group, 109 struct btrfs_block_group_cache *block_group,
52 struct btrfs_free_cluster *cluster); 110 struct btrfs_free_cluster *cluster);
111int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
112 u64 *trimmed, u64 start, u64 end, u64 minlen);
53#endif 113#endif
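
The header above splits free-space tracking into a generic btrfs_free_space_ctl plus a btrfs_free_space_op policy, and the two callbacks are the only policy hooks the generic code consults. A minimal hypothetical op that never converts extent entries into bitmaps, mirroring the pinned_free_ino_op pattern used by the inode map code later in this diff:

static void extents_only_recalc_thresholds(struct btrfs_free_space_ctl *ctl)
{
        /* nothing to recalculate: we never create bitmaps */
}

static bool extents_only_use_bitmap(struct btrfs_free_space_ctl *ctl,
                                    struct btrfs_free_space *info)
{
        return false;
}

static struct btrfs_free_space_op extents_only_op = {
        .recalc_thresholds = extents_only_recalc_thresholds,
        .use_bitmap = extents_only_use_bitmap,
};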
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 64f1150bb48d..baa74f3db691 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -130,7 +130,6 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
130 item_size - (ptr + sub_item_len - item_start)); 130 item_size - (ptr + sub_item_len - item_start));
131 ret = btrfs_truncate_item(trans, root, path, 131 ret = btrfs_truncate_item(trans, root, path,
132 item_size - sub_item_len, 1); 132 item_size - sub_item_len, 1);
133 BUG_ON(ret);
134out: 133out:
135 btrfs_free_path(path); 134 btrfs_free_path(path);
136 return ret; 135 return ret;
@@ -167,7 +166,6 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
167 166
168 old_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); 167 old_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
169 ret = btrfs_extend_item(trans, root, path, ins_len); 168 ret = btrfs_extend_item(trans, root, path, ins_len);
170 BUG_ON(ret);
171 ref = btrfs_item_ptr(path->nodes[0], path->slots[0], 169 ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
172 struct btrfs_inode_ref); 170 struct btrfs_inode_ref);
173 ref = (struct btrfs_inode_ref *)((unsigned long)ref + old_size); 171 ref = (struct btrfs_inode_ref *)((unsigned long)ref + old_size);
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index c56eb5909172..b4087e0fa871 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -16,11 +16,476 @@
16 * Boston, MA 021110-1307, USA. 16 * Boston, MA 021110-1307, USA.
17 */ 17 */
18 18
19#include <linux/delay.h>
20#include <linux/kthread.h>
21#include <linux/pagemap.h>
22
19#include "ctree.h" 23#include "ctree.h"
20#include "disk-io.h" 24#include "disk-io.h"
25#include "free-space-cache.h"
26#include "inode-map.h"
21#include "transaction.h" 27#include "transaction.h"
22 28
23int btrfs_find_highest_inode(struct btrfs_root *root, u64 *objectid) 29static int caching_kthread(void *data)
30{
31 struct btrfs_root *root = data;
32 struct btrfs_fs_info *fs_info = root->fs_info;
33 struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
34 struct btrfs_key key;
35 struct btrfs_path *path;
36 struct extent_buffer *leaf;
37 u64 last = (u64)-1;
38 int slot;
39 int ret;
40
41 if (!btrfs_test_opt(root, INODE_MAP_CACHE))
42 return 0;
43
44 path = btrfs_alloc_path();
45 if (!path)
46 return -ENOMEM;
47
48 /* Since the commit root is read-only, we can safely skip locking. */
49 path->skip_locking = 1;
50 path->search_commit_root = 1;
51 path->reada = 2;
52
53 key.objectid = BTRFS_FIRST_FREE_OBJECTID;
54 key.offset = 0;
55 key.type = BTRFS_INODE_ITEM_KEY;
56again:
57 /* need to make sure the commit_root doesn't disappear */
58 mutex_lock(&root->fs_commit_mutex);
59
60 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
61 if (ret < 0)
62 goto out;
63
64 while (1) {
65 if (btrfs_fs_closing(fs_info))
66 goto out;
67
68 leaf = path->nodes[0];
69 slot = path->slots[0];
70 if (slot >= btrfs_header_nritems(leaf)) {
71 ret = btrfs_next_leaf(root, path);
72 if (ret < 0)
73 goto out;
74 else if (ret > 0)
75 break;
76
77 if (need_resched() ||
78 btrfs_transaction_in_commit(fs_info)) {
79 leaf = path->nodes[0];
80
81 if (btrfs_header_nritems(leaf) == 0) {
82 WARN_ON(1);
83 break;
84 }
85
86 /*
87 * Save the key so we can advance forward
88 * in the next search.
89 */
90 btrfs_item_key_to_cpu(leaf, &key, 0);
91 btrfs_release_path(path);
92 root->cache_progress = last;
93 mutex_unlock(&root->fs_commit_mutex);
94 schedule_timeout(1);
95 goto again;
96 } else
97 continue;
98 }
99
100 btrfs_item_key_to_cpu(leaf, &key, slot);
101
102 if (key.type != BTRFS_INODE_ITEM_KEY)
103 goto next;
104
105 if (key.objectid >= root->highest_objectid)
106 break;
107
108 if (last != (u64)-1 && last + 1 != key.objectid) {
109 __btrfs_add_free_space(ctl, last + 1,
110 key.objectid - last - 1);
111 wake_up(&root->cache_wait);
112 }
113
114 last = key.objectid;
115next:
116 path->slots[0]++;
117 }
118
119 if (last < root->highest_objectid - 1) {
120 __btrfs_add_free_space(ctl, last + 1,
121 root->highest_objectid - last - 1);
122 }
123
124 spin_lock(&root->cache_lock);
125 root->cached = BTRFS_CACHE_FINISHED;
126 spin_unlock(&root->cache_lock);
127
128 root->cache_progress = (u64)-1;
129 btrfs_unpin_free_ino(root);
130out:
131 wake_up(&root->cache_wait);
132 mutex_unlock(&root->fs_commit_mutex);
133
134 btrfs_free_path(path);
135
136 return ret;
137}
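
The gap bookkeeping in the scan above publishes each run of unused inode numbers between consecutive inode items. A worked example: with the previous item at objectid 256 and the next at 260, numbers 257..259 are free, so the kthread adds a chunk of key.objectid - last - 1 == 3 starting at last + 1 == 257:

/* last == 256, key.objectid == 260 */
__btrfs_add_free_space(ctl, last + 1,            /* offset 257 */
                       key.objectid - last - 1); /* count 3 (257..259) */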
138
139static void start_caching(struct btrfs_root *root)
140{
141 struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
142 struct task_struct *tsk;
143 int ret;
144 u64 objectid;
145
146 if (!btrfs_test_opt(root, INODE_MAP_CACHE))
147 return;
148
149 spin_lock(&root->cache_lock);
150 if (root->cached != BTRFS_CACHE_NO) {
151 spin_unlock(&root->cache_lock);
152 return;
153 }
154
155 root->cached = BTRFS_CACHE_STARTED;
156 spin_unlock(&root->cache_lock);
157
158 ret = load_free_ino_cache(root->fs_info, root);
159 if (ret == 1) {
160 spin_lock(&root->cache_lock);
161 root->cached = BTRFS_CACHE_FINISHED;
162 spin_unlock(&root->cache_lock);
163 return;
164 }
165
166 /*
167 * It can be quite time-consuming to fill the cache by searching
168 * through the subvolume tree, and this can keep the ino allocation
169 * path waiting. Therefore we quickly find the highest inode number
170 * at startup; we then know that any inode number falling in
171 * [highest_ino + 1, BTRFS_LAST_FREE_OBJECTID] is free to use.
172 */
173 ret = btrfs_find_free_objectid(root, &objectid);
174 if (!ret && objectid <= BTRFS_LAST_FREE_OBJECTID) {
175 __btrfs_add_free_space(ctl, objectid,
176 BTRFS_LAST_FREE_OBJECTID - objectid + 1);
177 }
178
179 tsk = kthread_run(caching_kthread, root, "btrfs-ino-cache-%llu\n",
180 root->root_key.objectid);
181 BUG_ON(IS_ERR(tsk));
182}
183
184int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid)
185{
186 if (!btrfs_test_opt(root, INODE_MAP_CACHE))
187 return btrfs_find_free_objectid(root, objectid);
188
189again:
190 *objectid = btrfs_find_ino_for_alloc(root);
191
192 if (*objectid != 0)
193 return 0;
194
195 start_caching(root);
196
197 wait_event(root->cache_wait,
198 root->cached == BTRFS_CACHE_FINISHED ||
199 root->free_ino_ctl->free_space > 0);
200
201 if (root->cached == BTRFS_CACHE_FINISHED &&
202 root->free_ino_ctl->free_space == 0)
203 return -ENOSPC;
204 else
205 goto again;
206}
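
A hypothetical allocation-path sketch using this API (argument lists abbreviated; btrfs_new_inode() takes more parameters than shown): take a number from the cache, and hand it back on failure so it is not lost until the cache is rebuilt:

struct inode *inode;
u64 objectid;
int err;

err = btrfs_find_free_ino(root, &objectid);
if (err)
        return err;

inode = btrfs_new_inode(trans, root, /* ..., */ objectid /* , ... */);
if (IS_ERR(inode)) {
        btrfs_return_ino(root, objectid);  /* recycle the number */
        return PTR_ERR(inode);
}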
207
208void btrfs_return_ino(struct btrfs_root *root, u64 objectid)
209{
210 struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
211 struct btrfs_free_space_ctl *pinned = root->free_ino_pinned;
212
213 if (!btrfs_test_opt(root, INODE_MAP_CACHE))
214 return;
215
216again:
217 if (root->cached == BTRFS_CACHE_FINISHED) {
218 __btrfs_add_free_space(ctl, objectid, 1);
219 } else {
220 /*
221 * If we are in the process of caching free ino chunks,
222 * we leave the inode number in the pinned tree until a
223 * transaction is committed or the caching work is done,
224 * to avoid adding the same number to the free_ino tree
225 * twice across transactions.
226 */
227
228 mutex_lock(&root->fs_commit_mutex);
229 spin_lock(&root->cache_lock);
230 if (root->cached == BTRFS_CACHE_FINISHED) {
231 spin_unlock(&root->cache_lock);
232 mutex_unlock(&root->fs_commit_mutex);
233 goto again;
234 }
235 spin_unlock(&root->cache_lock);
236
237 start_caching(root);
238
239 if (objectid <= root->cache_progress ||
240 objectid > root->highest_objectid)
241 __btrfs_add_free_space(ctl, objectid, 1);
242 else
243 __btrfs_add_free_space(pinned, objectid, 1);
244
245 mutex_unlock(&root->fs_commit_mutex);
246 }
247}
248
249/*
250 * When a transaction is committed, we'll move those inode numbers which
251 * are smaller than root->cache_progress from pinned tree to free_ino tree,
252 * and others will just be dropped, because the commit root we were
253 * searching has changed.
254 *
255 * Must be called with root->fs_commit_mutex held
256 */
257void btrfs_unpin_free_ino(struct btrfs_root *root)
258{
259 struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
260 struct rb_root *rbroot = &root->free_ino_pinned->free_space_offset;
261 struct btrfs_free_space *info;
262 struct rb_node *n;
263 u64 count;
264
265 if (!btrfs_test_opt(root, INODE_MAP_CACHE))
266 return;
267
268 while (1) {
269 n = rb_first(rbroot);
270 if (!n)
271 break;
272
273 info = rb_entry(n, struct btrfs_free_space, offset_index);
274 BUG_ON(info->bitmap);
275
276 if (info->offset > root->cache_progress)
277 goto free;
278 else if (info->offset + info->bytes > root->cache_progress)
279 count = root->cache_progress - info->offset + 1;
280 else
281 count = info->bytes;
282
283 __btrfs_add_free_space(ctl, info->offset, count);
284free:
285 rb_erase(&info->offset_index, rbroot);
286 kfree(info);
287 }
288}
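
A worked example of the straddling case above: with cache_progress at 100 and a pinned chunk of {offset 95, bytes 11} (covering 95..105), count = 100 - 95 + 1 = 6, so only 95..100 go back to the free_ino tree and 101..105 are dropped:

/* info->offset == 95, info->bytes == 11, root->cache_progress == 100 */
count = root->cache_progress - info->offset + 1;  /* 6 */
__btrfs_add_free_space(ctl, info->offset, count); /* returns 95..100 */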
289
290#define INIT_THRESHOLD (((1024 * 32) / 2) / sizeof(struct btrfs_free_space))
291#define INODES_PER_BITMAP (PAGE_CACHE_SIZE * 8)
292
293/*
294 * The goal is to keep the memory used by the free_ino tree from
295 * exceeding what it would use if we stored bitmaps only.
296 */
297static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
298{
299 struct btrfs_free_space *info;
300 struct rb_node *n;
301 int max_ino;
302 int max_bitmaps;
303
304 n = rb_last(&ctl->free_space_offset);
305 if (!n) {
306 ctl->extents_thresh = INIT_THRESHOLD;
307 return;
308 }
309 info = rb_entry(n, struct btrfs_free_space, offset_index);
310
311 /*
312 * Find the maximum inode number in the filesystem. Note that we
313 * ignore the fact that this can be a bitmap, because we are not
314 * doing a precise calculation.
315 */
316 max_ino = info->bytes - 1;
317
318 max_bitmaps = ALIGN(max_ino, INODES_PER_BITMAP) / INODES_PER_BITMAP;
319 if (max_bitmaps <= ctl->total_bitmaps) {
320 ctl->extents_thresh = 0;
321 return;
322 }
323
324 ctl->extents_thresh = (max_bitmaps - ctl->total_bitmaps) *
325 PAGE_CACHE_SIZE / sizeof(*info);
326}
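
A compilable worked example of the threshold math above, under stated assumptions: PAGE_CACHE_SIZE of 4096 and a 64-byte struct btrfs_free_space (both are build-dependent and chosen only for illustration):

#include <stdio.h>

int main(void)
{
        unsigned long entry_size = 64;   /* assumed sizeof(struct btrfs_free_space) */
        unsigned long page_size = 4096;  /* assumed PAGE_CACHE_SIZE */
        unsigned long init_thresh = ((1024 * 32) / 2) / entry_size;
        unsigned long per_page = page_size / entry_size;

        /* A 16K budget buys 256 cached extent entries before bitmaps
         * kick in, and every bitmap page not yet in use leaves room for
         * 64 more extent entries. */
        printf("INIT_THRESHOLD=%lu extents_per_page=%lu\n",
               init_thresh, per_page);
        return 0;
}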
327
328/*
329 * We don't fall back to a bitmap if we are below the extents
330 * threshold or if this chunk of inode numbers is a big one.
331 */
332static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
333 struct btrfs_free_space *info)
334{
335 if (ctl->free_extents < ctl->extents_thresh ||
336 info->bytes > INODES_PER_BITMAP / 10)
337 return false;
338
339 return true;
340}
341
342static struct btrfs_free_space_op free_ino_op = {
343 .recalc_thresholds = recalculate_thresholds,
344 .use_bitmap = use_bitmap,
345};
346
347static void pinned_recalc_thresholds(struct btrfs_free_space_ctl *ctl)
348{
349}
350
351static bool pinned_use_bitmap(struct btrfs_free_space_ctl *ctl,
352 struct btrfs_free_space *info)
353{
354 /*
355 * We always use extents for two reasons:
356 *
357 * - The pinned tree is only used while the caching work is
358 * in progress.
359 * - It keeps the code simpler. See btrfs_unpin_free_ino().
360 */
361 return false;
362}
363
364static struct btrfs_free_space_op pinned_free_ino_op = {
365 .recalc_thresholds = pinned_recalc_thresholds,
366 .use_bitmap = pinned_use_bitmap,
367};
368
369void btrfs_init_free_ino_ctl(struct btrfs_root *root)
370{
371 struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
372 struct btrfs_free_space_ctl *pinned = root->free_ino_pinned;
373
374 spin_lock_init(&ctl->tree_lock);
375 ctl->unit = 1;
376 ctl->start = 0;
377 ctl->private = NULL;
378 ctl->op = &free_ino_op;
379
380 /*
381 * Initially we allow the use of 16K of ram to cache chunks of
382 * inode numbers before we resort to bitmaps. This is somewhat
383 * arbitrary, but it will be adjusted in runtime.
384 */
385 ctl->extents_thresh = INIT_THRESHOLD;
386
387 spin_lock_init(&pinned->tree_lock);
388 pinned->unit = 1;
389 pinned->start = 0;
390 pinned->private = NULL;
391 pinned->extents_thresh = 0;
392 pinned->op = &pinned_free_ino_op;
393}
394
395int btrfs_save_ino_cache(struct btrfs_root *root,
396 struct btrfs_trans_handle *trans)
397{
398 struct btrfs_free_space_ctl *ctl = root->free_ino_ctl;
399 struct btrfs_path *path;
400 struct inode *inode;
401 u64 alloc_hint = 0;
402 int ret;
403 int prealloc;
404 bool retry = false;
405
406 /* only fs tree and subvol/snap needs ino cache */
407 if (root->root_key.objectid != BTRFS_FS_TREE_OBJECTID &&
408 (root->root_key.objectid < BTRFS_FIRST_FREE_OBJECTID ||
409 root->root_key.objectid > BTRFS_LAST_FREE_OBJECTID))
410 return 0;
411
412 /* Don't save inode cache if we are deleting this root */
413 if (btrfs_root_refs(&root->root_item) == 0 &&
414 root != root->fs_info->tree_root)
415 return 0;
416
417 if (!btrfs_test_opt(root, INODE_MAP_CACHE))
418 return 0;
419
420 path = btrfs_alloc_path();
421 if (!path)
422 return -ENOMEM;
423
424again:
425 inode = lookup_free_ino_inode(root, path);
426 if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
427 ret = PTR_ERR(inode);
428 goto out;
429 }
430
431 if (IS_ERR(inode)) {
432 BUG_ON(retry);
433 retry = true;
434
435 ret = create_free_ino_inode(root, trans, path);
436 if (ret)
437 goto out;
438 goto again;
439 }
440
441 BTRFS_I(inode)->generation = 0;
442 ret = btrfs_update_inode(trans, root, inode);
443 WARN_ON(ret);
444
445 if (i_size_read(inode) > 0) {
446 ret = btrfs_truncate_free_space_cache(root, trans, path, inode);
447 if (ret)
448 goto out_put;
449 }
450
451 spin_lock(&root->cache_lock);
452 if (root->cached != BTRFS_CACHE_FINISHED) {
453 ret = -1;
454 spin_unlock(&root->cache_lock);
455 goto out_put;
456 }
457 spin_unlock(&root->cache_lock);
458
459 spin_lock(&ctl->tree_lock);
460 prealloc = sizeof(struct btrfs_free_space) * ctl->free_extents;
461 prealloc = ALIGN(prealloc, PAGE_CACHE_SIZE);
462 prealloc += ctl->total_bitmaps * PAGE_CACHE_SIZE;
463 spin_unlock(&ctl->tree_lock);
464
465 /* Just to make sure we have enough space */
466 prealloc += 8 * PAGE_CACHE_SIZE;
467
468 ret = btrfs_check_data_free_space(inode, prealloc);
469 if (ret)
470 goto out_put;
471
472 ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc,
473 prealloc, prealloc, &alloc_hint);
474 if (ret)
475 goto out_put;
476 btrfs_free_reserved_data_space(inode, prealloc);
477
478out_put:
479 iput(inode);
480out:
481 if (ret == 0)
482 ret = btrfs_write_out_ino_cache(root, trans, path);
483
484 btrfs_free_path(path);
485 return ret;
486}
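
A worked example of the preallocation sizing above, assuming 4K pages and 64-byte entries (both build-dependent): with 1000 cached extents and 2 bitmaps, the reservation made before the cache is written out comes to ALIGN(64000, 4096) + 2 * 4096 + 8 * 4096 = 65536 + 8192 + 32768 = 106496 bytes:

/* ctl->free_extents == 1000, ctl->total_bitmaps == 2 (illustrative) */
prealloc = ALIGN(1000 * 64, 4096);      /*  65536 */
prealloc += 2 * 4096;                   /* + 8192 */
prealloc += 8 * 4096;                   /* +32768 -> 106496 total */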
487
488static int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid)
24{ 489{
25 struct btrfs_path *path; 490 struct btrfs_path *path;
26 int ret; 491 int ret;
@@ -30,7 +495,8 @@ int btrfs_find_highest_inode(struct btrfs_root *root, u64 *objectid)
30 int slot; 495 int slot;
31 496
32 path = btrfs_alloc_path(); 497 path = btrfs_alloc_path();
33 BUG_ON(!path); 498 if (!path)
499 return -ENOMEM;
34 500
35 search_key.objectid = BTRFS_LAST_FREE_OBJECTID; 501 search_key.objectid = BTRFS_LAST_FREE_OBJECTID;
36 search_key.type = -1; 502 search_key.type = -1;
@@ -54,15 +520,14 @@ error:
54 return ret; 520 return ret;
55} 521}
56 522
57int btrfs_find_free_objectid(struct btrfs_trans_handle *trans, 523int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid)
58 struct btrfs_root *root,
59 u64 dirid, u64 *objectid)
60{ 524{
61 int ret; 525 int ret;
62 mutex_lock(&root->objectid_mutex); 526 mutex_lock(&root->objectid_mutex);
63 527
64 if (unlikely(root->highest_objectid < BTRFS_FIRST_FREE_OBJECTID)) { 528 if (unlikely(root->highest_objectid < BTRFS_FIRST_FREE_OBJECTID)) {
65 ret = btrfs_find_highest_inode(root, &root->highest_objectid); 529 ret = btrfs_find_highest_objectid(root,
530 &root->highest_objectid);
66 if (ret) 531 if (ret)
67 goto out; 532 goto out;
68 } 533 }
diff --git a/fs/btrfs/inode-map.h b/fs/btrfs/inode-map.h
new file mode 100644
index 000000000000..ddb347bfee23
--- /dev/null
+++ b/fs/btrfs/inode-map.h
@@ -0,0 +1,13 @@
1#ifndef __BTRFS_INODE_MAP
2#define __BTRFS_INODE_MAP
3
4void btrfs_init_free_ino_ctl(struct btrfs_root *root);
5void btrfs_unpin_free_ino(struct btrfs_root *root);
6void btrfs_return_ino(struct btrfs_root *root, u64 objectid);
7int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid);
8int btrfs_save_ino_cache(struct btrfs_root *root,
9 struct btrfs_trans_handle *trans);
10
11int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid);
12
13#endif
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index c03864406af3..3601f0aebddf 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -37,6 +37,7 @@
37#include <linux/posix_acl.h> 37#include <linux/posix_acl.h>
38#include <linux/falloc.h> 38#include <linux/falloc.h>
39#include <linux/slab.h> 39#include <linux/slab.h>
40#include <linux/ratelimit.h>
40#include "compat.h" 41#include "compat.h"
41#include "ctree.h" 42#include "ctree.h"
42#include "disk-io.h" 43#include "disk-io.h"
@@ -50,6 +51,8 @@
50#include "tree-log.h" 51#include "tree-log.h"
51#include "compression.h" 52#include "compression.h"
52#include "locking.h" 53#include "locking.h"
54#include "free-space-cache.h"
55#include "inode-map.h"
53 56
54struct btrfs_iget_args { 57struct btrfs_iget_args {
55 u64 ino; 58 u64 ino;
@@ -70,6 +73,7 @@ static struct kmem_cache *btrfs_inode_cachep;
70struct kmem_cache *btrfs_trans_handle_cachep; 73struct kmem_cache *btrfs_trans_handle_cachep;
71struct kmem_cache *btrfs_transaction_cachep; 74struct kmem_cache *btrfs_transaction_cachep;
72struct kmem_cache *btrfs_path_cachep; 75struct kmem_cache *btrfs_path_cachep;
76struct kmem_cache *btrfs_free_space_cachep;
73 77
74#define S_SHIFT 12 78#define S_SHIFT 12
75static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = { 79static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
@@ -82,7 +86,8 @@ static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
82 [S_IFLNK >> S_SHIFT] = BTRFS_FT_SYMLINK, 86 [S_IFLNK >> S_SHIFT] = BTRFS_FT_SYMLINK,
83}; 87};
84 88
85static void btrfs_truncate(struct inode *inode); 89static int btrfs_setsize(struct inode *inode, loff_t newsize);
90static int btrfs_truncate(struct inode *inode);
86static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end); 91static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end);
87static noinline int cow_file_range(struct inode *inode, 92static noinline int cow_file_range(struct inode *inode,
88 struct page *locked_page, 93 struct page *locked_page,
@@ -90,13 +95,14 @@ static noinline int cow_file_range(struct inode *inode,
90 unsigned long *nr_written, int unlock); 95 unsigned long *nr_written, int unlock);
91 96
92static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, 97static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
93 struct inode *inode, struct inode *dir) 98 struct inode *inode, struct inode *dir,
99 const struct qstr *qstr)
94{ 100{
95 int err; 101 int err;
96 102
97 err = btrfs_init_acl(trans, inode, dir); 103 err = btrfs_init_acl(trans, inode, dir);
98 if (!err) 104 if (!err)
99 err = btrfs_xattr_security_init(trans, inode, dir); 105 err = btrfs_xattr_security_init(trans, inode, dir, qstr);
100 return err; 106 return err;
101} 107}
102 108
@@ -108,6 +114,7 @@ static int btrfs_init_inode_security(struct btrfs_trans_handle *trans,
108static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, 114static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
109 struct btrfs_root *root, struct inode *inode, 115 struct btrfs_root *root, struct inode *inode,
110 u64 start, size_t size, size_t compressed_size, 116 u64 start, size_t size, size_t compressed_size,
117 int compress_type,
111 struct page **compressed_pages) 118 struct page **compressed_pages)
112{ 119{
113 struct btrfs_key key; 120 struct btrfs_key key;
@@ -122,21 +129,17 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
122 size_t cur_size = size; 129 size_t cur_size = size;
123 size_t datasize; 130 size_t datasize;
124 unsigned long offset; 131 unsigned long offset;
125 int use_compress = 0;
126 132
127 if (compressed_size && compressed_pages) { 133 if (compressed_size && compressed_pages)
128 use_compress = 1;
129 cur_size = compressed_size; 134 cur_size = compressed_size;
130 }
131 135
132 path = btrfs_alloc_path(); 136 path = btrfs_alloc_path();
133 if (!path) 137 if (!path)
134 return -ENOMEM; 138 return -ENOMEM;
135 139
136 path->leave_spinning = 1; 140 path->leave_spinning = 1;
137 btrfs_set_trans_block_group(trans, inode);
138 141
139 key.objectid = inode->i_ino; 142 key.objectid = btrfs_ino(inode);
140 key.offset = start; 143 key.offset = start;
141 btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY); 144 btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
142 datasize = btrfs_file_extent_calc_inline_size(cur_size); 145 datasize = btrfs_file_extent_calc_inline_size(cur_size);
@@ -159,7 +162,7 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
159 btrfs_set_file_extent_ram_bytes(leaf, ei, size); 162 btrfs_set_file_extent_ram_bytes(leaf, ei, size);
160 ptr = btrfs_file_extent_inline_start(ei); 163 ptr = btrfs_file_extent_inline_start(ei);
161 164
162 if (use_compress) { 165 if (compress_type != BTRFS_COMPRESS_NONE) {
163 struct page *cpage; 166 struct page *cpage;
164 int i = 0; 167 int i = 0;
165 while (compressed_size > 0) { 168 while (compressed_size > 0) {
@@ -176,7 +179,7 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
176 compressed_size -= cur_size; 179 compressed_size -= cur_size;
177 } 180 }
178 btrfs_set_file_extent_compression(leaf, ei, 181 btrfs_set_file_extent_compression(leaf, ei,
179 BTRFS_COMPRESS_ZLIB); 182 compress_type);
180 } else { 183 } else {
181 page = find_get_page(inode->i_mapping, 184 page = find_get_page(inode->i_mapping,
182 start >> PAGE_CACHE_SHIFT); 185 start >> PAGE_CACHE_SHIFT);
@@ -217,7 +220,7 @@ fail:
217static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, 220static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
218 struct btrfs_root *root, 221 struct btrfs_root *root,
219 struct inode *inode, u64 start, u64 end, 222 struct inode *inode, u64 start, u64 end,
220 size_t compressed_size, 223 size_t compressed_size, int compress_type,
221 struct page **compressed_pages) 224 struct page **compressed_pages)
222{ 225{
223 u64 isize = i_size_read(inode); 226 u64 isize = i_size_read(inode);
@@ -250,7 +253,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
250 inline_len = min_t(u64, isize, actual_end); 253 inline_len = min_t(u64, isize, actual_end);
251 ret = insert_inline_extent(trans, root, inode, start, 254 ret = insert_inline_extent(trans, root, inode, start,
252 inline_len, compressed_size, 255 inline_len, compressed_size,
253 compressed_pages); 256 compress_type, compressed_pages);
254 BUG_ON(ret); 257 BUG_ON(ret);
255 btrfs_delalloc_release_metadata(inode, end + 1 - start); 258 btrfs_delalloc_release_metadata(inode, end + 1 - start);
256 btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0); 259 btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
@@ -263,6 +266,7 @@ struct async_extent {
263 u64 compressed_size; 266 u64 compressed_size;
264 struct page **pages; 267 struct page **pages;
265 unsigned long nr_pages; 268 unsigned long nr_pages;
269 int compress_type;
266 struct list_head list; 270 struct list_head list;
267}; 271};
268 272
@@ -280,16 +284,19 @@ static noinline int add_async_extent(struct async_cow *cow,
280 u64 start, u64 ram_size, 284 u64 start, u64 ram_size,
281 u64 compressed_size, 285 u64 compressed_size,
282 struct page **pages, 286 struct page **pages,
283 unsigned long nr_pages) 287 unsigned long nr_pages,
288 int compress_type)
284{ 289{
285 struct async_extent *async_extent; 290 struct async_extent *async_extent;
286 291
287 async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS); 292 async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS);
293 BUG_ON(!async_extent);
288 async_extent->start = start; 294 async_extent->start = start;
289 async_extent->ram_size = ram_size; 295 async_extent->ram_size = ram_size;
290 async_extent->compressed_size = compressed_size; 296 async_extent->compressed_size = compressed_size;
291 async_extent->pages = pages; 297 async_extent->pages = pages;
292 async_extent->nr_pages = nr_pages; 298 async_extent->nr_pages = nr_pages;
299 async_extent->compress_type = compress_type;
293 list_add_tail(&async_extent->list, &cow->extents); 300 list_add_tail(&async_extent->list, &cow->extents);
294 return 0; 301 return 0;
295} 302}
@@ -319,8 +326,6 @@ static noinline int compress_file_range(struct inode *inode,
319 struct btrfs_root *root = BTRFS_I(inode)->root; 326 struct btrfs_root *root = BTRFS_I(inode)->root;
320 struct btrfs_trans_handle *trans; 327 struct btrfs_trans_handle *trans;
321 u64 num_bytes; 328 u64 num_bytes;
322 u64 orig_start;
323 u64 disk_num_bytes;
324 u64 blocksize = root->sectorsize; 329 u64 blocksize = root->sectorsize;
325 u64 actual_end; 330 u64 actual_end;
326 u64 isize = i_size_read(inode); 331 u64 isize = i_size_read(inode);
@@ -334,8 +339,11 @@ static noinline int compress_file_range(struct inode *inode,
334 unsigned long max_uncompressed = 128 * 1024; 339 unsigned long max_uncompressed = 128 * 1024;
335 int i; 340 int i;
336 int will_compress; 341 int will_compress;
342 int compress_type = root->fs_info->compress_type;
337 343
338 orig_start = start; 344 /* if this is a small write inside eof, kick off a defragbot */
345 if (end <= BTRFS_I(inode)->disk_i_size && (end - start + 1) < 16 * 1024)
346 btrfs_add_inode_defrag(NULL, inode);
339 347
340 actual_end = min_t(u64, isize, end + 1); 348 actual_end = min_t(u64, isize, end + 1);
341again: 349again:
@@ -371,7 +379,6 @@ again:
371 total_compressed = min(total_compressed, max_uncompressed); 379 total_compressed = min(total_compressed, max_uncompressed);
372 num_bytes = (end - start + blocksize) & ~(blocksize - 1); 380 num_bytes = (end - start + blocksize) & ~(blocksize - 1);
373 num_bytes = max(blocksize, num_bytes); 381 num_bytes = max(blocksize, num_bytes);
374 disk_num_bytes = num_bytes;
375 total_in = 0; 382 total_in = 0;
376 ret = 0; 383 ret = 0;
377 384
@@ -382,16 +389,22 @@ again:
382 */ 389 */
383 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) && 390 if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) &&
384 (btrfs_test_opt(root, COMPRESS) || 391 (btrfs_test_opt(root, COMPRESS) ||
385 (BTRFS_I(inode)->force_compress))) { 392 (BTRFS_I(inode)->force_compress) ||
393 (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) {
386 WARN_ON(pages); 394 WARN_ON(pages);
387 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); 395 pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
396 BUG_ON(!pages);
397
398 if (BTRFS_I(inode)->force_compress)
399 compress_type = BTRFS_I(inode)->force_compress;
388 400
389 ret = btrfs_zlib_compress_pages(inode->i_mapping, start, 401 ret = btrfs_compress_pages(compress_type,
390 total_compressed, pages, 402 inode->i_mapping, start,
391 nr_pages, &nr_pages_ret, 403 total_compressed, pages,
392 &total_in, 404 nr_pages, &nr_pages_ret,
393 &total_compressed, 405 &total_in,
394 max_compressed); 406 &total_compressed,
407 max_compressed);
395 408
396 if (!ret) { 409 if (!ret) {
397 unsigned long offset = total_compressed & 410 unsigned long offset = total_compressed &
@@ -412,9 +425,8 @@ again:
412 } 425 }
413 } 426 }
414 if (start == 0) { 427 if (start == 0) {
415 trans = btrfs_join_transaction(root, 1); 428 trans = btrfs_join_transaction(root);
416 BUG_ON(!trans); 429 BUG_ON(IS_ERR(trans));
417 btrfs_set_trans_block_group(trans, inode);
418 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 430 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
419 431
420 /* lets try to make an inline extent */ 432 /* lets try to make an inline extent */
@@ -423,12 +435,13 @@ again:
423 * to make an uncompressed inline extent. 435 * to make an uncompressed inline extent.
424 */ 436 */
425 ret = cow_file_range_inline(trans, root, inode, 437 ret = cow_file_range_inline(trans, root, inode,
426 start, end, 0, NULL); 438 start, end, 0, 0, NULL);
427 } else { 439 } else {
428 /* try making a compressed inline extent */ 440 /* try making a compressed inline extent */
429 ret = cow_file_range_inline(trans, root, inode, 441 ret = cow_file_range_inline(trans, root, inode,
430 start, end, 442 start, end,
431 total_compressed, pages); 443 total_compressed,
444 compress_type, pages);
432 } 445 }
433 if (ret == 0) { 446 if (ret == 0) {
434 /* 447 /*
@@ -467,7 +480,6 @@ again:
467 if (total_compressed >= total_in) { 480 if (total_compressed >= total_in) {
468 will_compress = 0; 481 will_compress = 0;
469 } else { 482 } else {
470 disk_num_bytes = total_compressed;
471 num_bytes = total_in; 483 num_bytes = total_in;
472 } 484 }
473 } 485 }
@@ -499,9 +511,10 @@ again:
499 * and will submit them to the elevator. 511 * and will submit them to the elevator.
500 */ 512 */
501 add_async_extent(async_cow, start, num_bytes, 513 add_async_extent(async_cow, start, num_bytes,
502 total_compressed, pages, nr_pages_ret); 514 total_compressed, pages, nr_pages_ret,
515 compress_type);
503 516
504 if (start + num_bytes < end && start + num_bytes < actual_end) { 517 if (start + num_bytes < end) {
505 start += num_bytes; 518 start += num_bytes;
506 pages = NULL; 519 pages = NULL;
507 cond_resched(); 520 cond_resched();
@@ -521,7 +534,8 @@ cleanup_and_bail_uncompressed:
521 __set_page_dirty_nobuffers(locked_page); 534 __set_page_dirty_nobuffers(locked_page);
522 /* unlocked later on in the async handlers */ 535 /* unlocked later on in the async handlers */
523 } 536 }
524 add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0); 537 add_async_extent(async_cow, start, end - start + 1,
538 0, NULL, 0, BTRFS_COMPRESS_NONE);
525 *num_added += 1; 539 *num_added += 1;
526 } 540 }
527 541
@@ -607,7 +621,9 @@ retry:
607 async_extent->start + async_extent->ram_size - 1, 621 async_extent->start + async_extent->ram_size - 1,
608 GFP_NOFS); 622 GFP_NOFS);
609 623
610 trans = btrfs_join_transaction(root, 1); 624 trans = btrfs_join_transaction(root);
625 BUG_ON(IS_ERR(trans));
626 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
611 ret = btrfs_reserve_extent(trans, root, 627 ret = btrfs_reserve_extent(trans, root,
612 async_extent->compressed_size, 628 async_extent->compressed_size,
613 async_extent->compressed_size, 629 async_extent->compressed_size,
@@ -638,7 +654,8 @@ retry:
638 async_extent->start + 654 async_extent->start +
639 async_extent->ram_size - 1, 0); 655 async_extent->ram_size - 1, 0);
640 656
641 em = alloc_extent_map(GFP_NOFS); 657 em = alloc_extent_map();
658 BUG_ON(!em);
642 em->start = async_extent->start; 659 em->start = async_extent->start;
643 em->len = async_extent->ram_size; 660 em->len = async_extent->ram_size;
644 em->orig_start = em->start; 661 em->orig_start = em->start;
@@ -646,6 +663,7 @@ retry:
646 em->block_start = ins.objectid; 663 em->block_start = ins.objectid;
647 em->block_len = ins.offset; 664 em->block_len = ins.offset;
648 em->bdev = root->fs_info->fs_devices->latest_bdev; 665 em->bdev = root->fs_info->fs_devices->latest_bdev;
666 em->compress_type = async_extent->compress_type;
649 set_bit(EXTENT_FLAG_PINNED, &em->flags); 667 set_bit(EXTENT_FLAG_PINNED, &em->flags);
650 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 668 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
651 669
@@ -662,11 +680,13 @@ retry:
662 async_extent->ram_size - 1, 0); 680 async_extent->ram_size - 1, 0);
663 } 681 }
664 682
665 ret = btrfs_add_ordered_extent(inode, async_extent->start, 683 ret = btrfs_add_ordered_extent_compress(inode,
666 ins.objectid, 684 async_extent->start,
667 async_extent->ram_size, 685 ins.objectid,
668 ins.offset, 686 async_extent->ram_size,
669 BTRFS_ORDERED_COMPRESSED); 687 ins.offset,
688 BTRFS_ORDERED_COMPRESSED,
689 async_extent->compress_type);
670 BUG_ON(ret); 690 BUG_ON(ret);
671 691
672 /* 692 /*
@@ -730,6 +750,15 @@ static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
730 return alloc_hint; 750 return alloc_hint;
731} 751}
732 752
753static inline bool is_free_space_inode(struct btrfs_root *root,
754 struct inode *inode)
755{
756 if (root == root->fs_info->tree_root ||
757 BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID)
758 return true;
759 return false;
760}
761
733/* 762/*
734 * when extent_io.c finds a delayed allocation range in the file, 763 * when extent_io.c finds a delayed allocation range in the file,
735 * the callbacks end up in this code. The basic idea is to 764 * the callbacks end up in this code. The basic idea is to
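
The is_free_space_inode() helper introduced above gates most of the changes that follow (nolock transaction joins, delalloc list handling, ordered-extent completion). A minimal userspace restatement of the predicate, assuming BTRFS_FREE_INO_OBJECTID is the -12ULL constant from ctree.h and reducing roots to opaque tags:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* sketch only: the objectid mirrors what I take BTRFS_FREE_INO_OBJECTID
 * to be; root identity is reduced to an integer tag */
#define FREE_INO_OBJECTID ((uint64_t)-12)

static bool is_free_space_inode(int root_tag, int tree_root_tag,
				uint64_t location_objectid)
{
	/* per-block-group space cache inodes live in the tree root;
	 * the per-root inode-number cache uses the FREE_INO objectid */
	return root_tag == tree_root_tag ||
	       location_objectid == FREE_INO_OBJECTID;
}

int main(void)
{
	printf("%d %d %d\n",
	       is_free_space_inode(0, 0, 256),			/* 1 */
	       is_free_space_inode(1, 0, FREE_INO_OBJECTID),	/* 1 */
	       is_free_space_inode(1, 0, 256));			/* 0 */
	return 0;
}
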
@@ -757,29 +786,29 @@ static noinline int cow_file_range(struct inode *inode,
757 u64 disk_num_bytes; 786 u64 disk_num_bytes;
758 u64 cur_alloc_size; 787 u64 cur_alloc_size;
759 u64 blocksize = root->sectorsize; 788 u64 blocksize = root->sectorsize;
760 u64 actual_end;
761 u64 isize = i_size_read(inode);
762 struct btrfs_key ins; 789 struct btrfs_key ins;
763 struct extent_map *em; 790 struct extent_map *em;
764 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 791 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
765 int ret = 0; 792 int ret = 0;
766 793
767 trans = btrfs_join_transaction(root, 1); 794 BUG_ON(is_free_space_inode(root, inode));
768 BUG_ON(!trans); 795 trans = btrfs_join_transaction(root);
769 btrfs_set_trans_block_group(trans, inode); 796 BUG_ON(IS_ERR(trans));
770 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 797 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
771 798
772 actual_end = min_t(u64, isize, end + 1);
773
774 num_bytes = (end - start + blocksize) & ~(blocksize - 1); 799 num_bytes = (end - start + blocksize) & ~(blocksize - 1);
775 num_bytes = max(blocksize, num_bytes); 800 num_bytes = max(blocksize, num_bytes);
776 disk_num_bytes = num_bytes; 801 disk_num_bytes = num_bytes;
777 ret = 0; 802 ret = 0;
778 803
804 /* if this is a small write inside eof, kick off defrag */
805 if (end <= BTRFS_I(inode)->disk_i_size && num_bytes < 64 * 1024)
806 btrfs_add_inode_defrag(trans, inode);
807
779 if (start == 0) { 808 if (start == 0) {
780 /* lets try to make an inline extent */ 809 /* lets try to make an inline extent */
781 ret = cow_file_range_inline(trans, root, inode, 810 ret = cow_file_range_inline(trans, root, inode,
782 start, end, 0, NULL); 811 start, end, 0, 0, NULL);
783 if (ret == 0) { 812 if (ret == 0) {
784 extent_clear_unlock_delalloc(inode, 813 extent_clear_unlock_delalloc(inode,
785 &BTRFS_I(inode)->io_tree, 814 &BTRFS_I(inode)->io_tree,
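
The rounding above, num_bytes = (end - start + blocksize) & ~(blocksize - 1), converts the inclusive byte range [start, end] into a whole number of sectors: end - start is length - 1, so this is the standard (len + bs - 1) & ~(bs - 1) round-up for a power-of-two blocksize. A standalone check of the arithmetic:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* cow_file_range()'s sector round-up, extracted for inspection */
static uint64_t range_to_bytes(uint64_t start, uint64_t end, uint64_t bs)
{
	return (end - start + bs) & ~(bs - 1);
}

int main(void)
{
	/* 5000 bytes ([0, 4999]) on 4 KiB sectors -> two sectors */
	assert(range_to_bytes(0, 4999, 4096) == 8192);
	/* since end >= start the result is already at least one sector,
	 * so the kernel's following max(blocksize, num_bytes) is defensive */
	assert(range_to_bytes(0, 0, 4096) == 4096);
	printf("ok\n");
	return 0;
}
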
@@ -814,7 +843,8 @@ static noinline int cow_file_range(struct inode *inode,
814 (u64)-1, &ins, 1); 843 (u64)-1, &ins, 1);
815 BUG_ON(ret); 844 BUG_ON(ret);
816 845
817 em = alloc_extent_map(GFP_NOFS); 846 em = alloc_extent_map();
847 BUG_ON(!em);
818 em->start = start; 848 em->start = start;
819 em->orig_start = em->start; 849 em->orig_start = em->start;
820 ram_size = ins.offset; 850 ram_size = ins.offset;
@@ -941,6 +971,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
941 1, 0, NULL, GFP_NOFS); 971 1, 0, NULL, GFP_NOFS);
942 while (start < end) { 972 while (start < end) {
943 async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS); 973 async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS);
974 BUG_ON(!async_cow);
944 async_cow->inode = inode; 975 async_cow->inode = inode;
945 async_cow->root = root; 976 async_cow->root = root;
946 async_cow->locked_page = locked_page; 977 async_cow->locked_page = locked_page;
@@ -994,7 +1025,7 @@ static noinline int csum_exist_in_range(struct btrfs_root *root,
994 LIST_HEAD(list); 1025 LIST_HEAD(list);
995 1026
996 ret = btrfs_lookup_csums_range(root->fs_info->csum_root, bytenr, 1027 ret = btrfs_lookup_csums_range(root->fs_info->csum_root, bytenr,
997 bytenr + num_bytes - 1, &list); 1028 bytenr + num_bytes - 1, &list, 0);
998 if (ret == 0 && list_empty(&list)) 1029 if (ret == 0 && list_empty(&list))
999 return 0; 1030 return 0;
1000 1031
@@ -1035,23 +1066,33 @@ static noinline int run_delalloc_nocow(struct inode *inode,
1035 int type; 1066 int type;
1036 int nocow; 1067 int nocow;
1037 int check_prev = 1; 1068 int check_prev = 1;
1069 bool nolock;
1070 u64 ino = btrfs_ino(inode);
1038 1071
1039 path = btrfs_alloc_path(); 1072 path = btrfs_alloc_path();
1040 BUG_ON(!path); 1073 BUG_ON(!path);
1041 trans = btrfs_join_transaction(root, 1); 1074
1042 BUG_ON(!trans); 1075 nolock = is_free_space_inode(root, inode);
1076
1077 if (nolock)
1078 trans = btrfs_join_transaction_nolock(root);
1079 else
1080 trans = btrfs_join_transaction(root);
1081
1082 BUG_ON(IS_ERR(trans));
1083 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1043 1084
1044 cow_start = (u64)-1; 1085 cow_start = (u64)-1;
1045 cur_offset = start; 1086 cur_offset = start;
1046 while (1) { 1087 while (1) {
1047 ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino, 1088 ret = btrfs_lookup_file_extent(trans, root, path, ino,
1048 cur_offset, 0); 1089 cur_offset, 0);
1049 BUG_ON(ret < 0); 1090 BUG_ON(ret < 0);
1050 if (ret > 0 && path->slots[0] > 0 && check_prev) { 1091 if (ret > 0 && path->slots[0] > 0 && check_prev) {
1051 leaf = path->nodes[0]; 1092 leaf = path->nodes[0];
1052 btrfs_item_key_to_cpu(leaf, &found_key, 1093 btrfs_item_key_to_cpu(leaf, &found_key,
1053 path->slots[0] - 1); 1094 path->slots[0] - 1);
1054 if (found_key.objectid == inode->i_ino && 1095 if (found_key.objectid == ino &&
1055 found_key.type == BTRFS_EXTENT_DATA_KEY) 1096 found_key.type == BTRFS_EXTENT_DATA_KEY)
1056 path->slots[0]--; 1097 path->slots[0]--;
1057 } 1098 }
@@ -1072,7 +1113,7 @@ next_slot:
1072 num_bytes = 0; 1113 num_bytes = 0;
1073 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 1114 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1074 1115
1075 if (found_key.objectid > inode->i_ino || 1116 if (found_key.objectid > ino ||
1076 found_key.type > BTRFS_EXTENT_DATA_KEY || 1117 found_key.type > BTRFS_EXTENT_DATA_KEY ||
1077 found_key.offset > end) 1118 found_key.offset > end)
1078 break; 1119 break;
@@ -1107,7 +1148,7 @@ next_slot:
1107 goto out_check; 1148 goto out_check;
1108 if (btrfs_extent_readonly(root, disk_bytenr)) 1149 if (btrfs_extent_readonly(root, disk_bytenr))
1109 goto out_check; 1150 goto out_check;
1110 if (btrfs_cross_ref_exist(trans, root, inode->i_ino, 1151 if (btrfs_cross_ref_exist(trans, root, ino,
1111 found_key.offset - 1152 found_key.offset -
1112 extent_offset, disk_bytenr)) 1153 extent_offset, disk_bytenr))
1113 goto out_check; 1154 goto out_check;
@@ -1144,7 +1185,7 @@ out_check:
1144 goto next_slot; 1185 goto next_slot;
1145 } 1186 }
1146 1187
1147 btrfs_release_path(root, path); 1188 btrfs_release_path(path);
1148 if (cow_start != (u64)-1) { 1189 if (cow_start != (u64)-1) {
1149 ret = cow_file_range(inode, locked_page, cow_start, 1190 ret = cow_file_range(inode, locked_page, cow_start,
1150 found_key.offset - 1, page_started, 1191 found_key.offset - 1, page_started,
@@ -1157,7 +1198,8 @@ out_check:
1157 struct extent_map *em; 1198 struct extent_map *em;
1158 struct extent_map_tree *em_tree; 1199 struct extent_map_tree *em_tree;
1159 em_tree = &BTRFS_I(inode)->extent_tree; 1200 em_tree = &BTRFS_I(inode)->extent_tree;
1160 em = alloc_extent_map(GFP_NOFS); 1201 em = alloc_extent_map();
1202 BUG_ON(!em);
1161 em->start = cur_offset; 1203 em->start = cur_offset;
1162 em->orig_start = em->start; 1204 em->orig_start = em->start;
1163 em->len = num_bytes; 1205 em->len = num_bytes;
@@ -1201,7 +1243,7 @@ out_check:
1201 if (cur_offset > end) 1243 if (cur_offset > end)
1202 break; 1244 break;
1203 } 1245 }
1204 btrfs_release_path(root, path); 1246 btrfs_release_path(path);
1205 1247
1206 if (cur_offset <= end && cow_start == (u64)-1) 1248 if (cur_offset <= end && cow_start == (u64)-1)
1207 cow_start = cur_offset; 1249 cow_start = cur_offset;
@@ -1211,8 +1253,13 @@ out_check:
1211 BUG_ON(ret); 1253 BUG_ON(ret);
1212 } 1254 }
1213 1255
1214 ret = btrfs_end_transaction(trans, root); 1256 if (nolock) {
1215 BUG_ON(ret); 1257 ret = btrfs_end_transaction_nolock(trans, root);
1258 BUG_ON(ret);
1259 } else {
1260 ret = btrfs_end_transaction(trans, root);
1261 BUG_ON(ret);
1262 }
1216 btrfs_free_path(path); 1263 btrfs_free_path(path);
1217 return 0; 1264 return 0;
1218} 1265}
@@ -1234,7 +1281,8 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
1234 ret = run_delalloc_nocow(inode, locked_page, start, end, 1281 ret = run_delalloc_nocow(inode, locked_page, start, end,
1235 page_started, 0, nr_written); 1282 page_started, 0, nr_written);
1236 else if (!btrfs_test_opt(root, COMPRESS) && 1283 else if (!btrfs_test_opt(root, COMPRESS) &&
1237 !(BTRFS_I(inode)->force_compress)) 1284 !(BTRFS_I(inode)->force_compress) &&
1285 !(BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))
1238 ret = cow_file_range(inode, locked_page, start, end, 1286 ret = cow_file_range(inode, locked_page, start, end,
1239 page_started, nr_written, 1); 1287 page_started, nr_written, 1);
1240 else 1288 else
@@ -1283,12 +1331,13 @@ static int btrfs_set_bit_hook(struct inode *inode,
1283 1331
1284 /* 1332 /*
1285 * set_bit and clear bit hooks normally require _irqsave/restore 1333 * set_bit and clear bit hooks normally require _irqsave/restore
1286 * but in this case, we are only testeing for the DELALLOC 1334 * but in this case, we are only testing for the DELALLOC
1287 * bit, which is only set or cleared with irqs on 1335 * bit, which is only set or cleared with irqs on
1288 */ 1336 */
1289 if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { 1337 if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1290 struct btrfs_root *root = BTRFS_I(inode)->root; 1338 struct btrfs_root *root = BTRFS_I(inode)->root;
1291 u64 len = state->end + 1 - state->start; 1339 u64 len = state->end + 1 - state->start;
1340 bool do_list = !is_free_space_inode(root, inode);
1292 1341
1293 if (*bits & EXTENT_FIRST_DELALLOC) 1342 if (*bits & EXTENT_FIRST_DELALLOC)
1294 *bits &= ~EXTENT_FIRST_DELALLOC; 1343 *bits &= ~EXTENT_FIRST_DELALLOC;
@@ -1298,7 +1347,7 @@ static int btrfs_set_bit_hook(struct inode *inode,
1298 spin_lock(&root->fs_info->delalloc_lock); 1347 spin_lock(&root->fs_info->delalloc_lock);
1299 BTRFS_I(inode)->delalloc_bytes += len; 1348 BTRFS_I(inode)->delalloc_bytes += len;
1300 root->fs_info->delalloc_bytes += len; 1349 root->fs_info->delalloc_bytes += len;
1301 if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) { 1350 if (do_list && list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1302 list_add_tail(&BTRFS_I(inode)->delalloc_inodes, 1351 list_add_tail(&BTRFS_I(inode)->delalloc_inodes,
1303 &root->fs_info->delalloc_inodes); 1352 &root->fs_info->delalloc_inodes);
1304 } 1353 }
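
The new do_list flag in both hooks keeps free-space inodes out of the fs-wide delalloc list while still counting their bytes. A toy model of that split, with stand-in types in place of the kernel's lists and locks:

#include <stdbool.h>
#include <stdio.h>

struct toy_inode { unsigned long delalloc_bytes; bool on_list; };
static unsigned long fs_delalloc_bytes;

static void set_delalloc(struct toy_inode *i, unsigned long len, bool do_list)
{
	i->delalloc_bytes += len;
	fs_delalloc_bytes += len;
	if (do_list && !i->on_list)
		i->on_list = true;	/* list_add_tail() in the kernel */
}

static void clear_delalloc(struct toy_inode *i, unsigned long len, bool do_list)
{
	i->delalloc_bytes -= len;
	fs_delalloc_bytes -= len;
	if (do_list && i->delalloc_bytes == 0 && i->on_list)
		i->on_list = false;	/* list_del_init() in the kernel */
}

int main(void)
{
	struct toy_inode cache = {0}, file = {0};
	set_delalloc(&file, 4096, true);
	set_delalloc(&cache, 4096, false);	/* counted, never listed */
	printf("fs=%lu file_listed=%d cache_listed=%d\n",
	       fs_delalloc_bytes, file.on_list, cache.on_list);
	clear_delalloc(&file, 4096, true);
	clear_delalloc(&cache, 4096, false);
	return 0;
}
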
@@ -1315,12 +1364,13 @@ static int btrfs_clear_bit_hook(struct inode *inode,
1315{ 1364{
1316 /* 1365 /*
1317 * set_bit and clear bit hooks normally require _irqsave/restore 1366 * set_bit and clear bit hooks normally require _irqsave/restore
1318 * but in this case, we are only testeing for the DELALLOC 1367 * but in this case, we are only testing for the DELALLOC
1319 * bit, which is only set or cleared with irqs on 1368 * bit, which is only set or cleared with irqs on
1320 */ 1369 */
1321 if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { 1370 if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
1322 struct btrfs_root *root = BTRFS_I(inode)->root; 1371 struct btrfs_root *root = BTRFS_I(inode)->root;
1323 u64 len = state->end + 1 - state->start; 1372 u64 len = state->end + 1 - state->start;
1373 bool do_list = !is_free_space_inode(root, inode);
1324 1374
1325 if (*bits & EXTENT_FIRST_DELALLOC) 1375 if (*bits & EXTENT_FIRST_DELALLOC)
1326 *bits &= ~EXTENT_FIRST_DELALLOC; 1376 *bits &= ~EXTENT_FIRST_DELALLOC;
@@ -1330,14 +1380,15 @@ static int btrfs_clear_bit_hook(struct inode *inode,
1330 if (*bits & EXTENT_DO_ACCOUNTING) 1380 if (*bits & EXTENT_DO_ACCOUNTING)
1331 btrfs_delalloc_release_metadata(inode, len); 1381 btrfs_delalloc_release_metadata(inode, len);
1332 1382
1333 if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) 1383 if (root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID
1384 && do_list)
1334 btrfs_free_reserved_data_space(inode, len); 1385 btrfs_free_reserved_data_space(inode, len);
1335 1386
1336 spin_lock(&root->fs_info->delalloc_lock); 1387 spin_lock(&root->fs_info->delalloc_lock);
1337 root->fs_info->delalloc_bytes -= len; 1388 root->fs_info->delalloc_bytes -= len;
1338 BTRFS_I(inode)->delalloc_bytes -= len; 1389 BTRFS_I(inode)->delalloc_bytes -= len;
1339 1390
1340 if (BTRFS_I(inode)->delalloc_bytes == 0 && 1391 if (do_list && BTRFS_I(inode)->delalloc_bytes == 0 &&
1341 !list_empty(&BTRFS_I(inode)->delalloc_inodes)) { 1392 !list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
1342 list_del_init(&BTRFS_I(inode)->delalloc_inodes); 1393 list_del_init(&BTRFS_I(inode)->delalloc_inodes);
1343 } 1394 }
@@ -1372,7 +1423,7 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset,
1372 1423
1373 if (map_length < length + size) 1424 if (map_length < length + size)
1374 return 1; 1425 return 1;
1375 return 0; 1426 return ret;
1376} 1427}
1377 1428
1378/* 1429/*
@@ -1426,15 +1477,21 @@ static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
1426 1477
1427 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; 1478 skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
1428 1479
1429 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); 1480 if (is_free_space_inode(root, inode))
1481 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 2);
1482 else
1483 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
1430 BUG_ON(ret); 1484 BUG_ON(ret);
1431 1485
1432 if (!(rw & REQ_WRITE)) { 1486 if (!(rw & REQ_WRITE)) {
1433 if (bio_flags & EXTENT_BIO_COMPRESSED) { 1487 if (bio_flags & EXTENT_BIO_COMPRESSED) {
1434 return btrfs_submit_compressed_read(inode, bio, 1488 return btrfs_submit_compressed_read(inode, bio,
1435 mirror_num, bio_flags); 1489 mirror_num, bio_flags);
1436 } else if (!skip_sum) 1490 } else if (!skip_sum) {
1437 btrfs_lookup_bio_sums(root, inode, bio, NULL); 1491 ret = btrfs_lookup_bio_sums(root, inode, bio, NULL);
1492 if (ret)
1493 return ret;
1494 }
1438 goto mapit; 1495 goto mapit;
1439 } else if (!skip_sum) { 1496 } else if (!skip_sum) {
1440 /* csum items have already been cloned */ 1497 /* csum items have already been cloned */
@@ -1462,8 +1519,6 @@ static noinline int add_pending_csums(struct btrfs_trans_handle *trans,
1462{ 1519{
1463 struct btrfs_ordered_sum *sum; 1520 struct btrfs_ordered_sum *sum;
1464 1521
1465 btrfs_set_trans_block_group(trans, inode);
1466
1467 list_for_each_entry(sum, list, list) { 1522 list_for_each_entry(sum, list, list) {
1468 btrfs_csum_file_blocks(trans, 1523 btrfs_csum_file_blocks(trans,
1469 BTRFS_I(inode)->root->fs_info->csum_root, sum); 1524 BTRFS_I(inode)->root->fs_info->csum_root, sum);
@@ -1534,6 +1589,7 @@ out:
1534out_page: 1589out_page:
1535 unlock_page(page); 1590 unlock_page(page);
1536 page_cache_release(page); 1591 page_cache_release(page);
1592 kfree(fixup);
1537} 1593}
1538 1594
1539/* 1595/*
@@ -1605,7 +1661,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1605 &hint, 0); 1661 &hint, 0);
1606 BUG_ON(ret); 1662 BUG_ON(ret);
1607 1663
1608 ins.objectid = inode->i_ino; 1664 ins.objectid = btrfs_ino(inode);
1609 ins.offset = file_pos; 1665 ins.offset = file_pos;
1610 ins.type = BTRFS_EXTENT_DATA_KEY; 1666 ins.type = BTRFS_EXTENT_DATA_KEY;
1611 ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*fi)); 1667 ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*fi));
@@ -1636,7 +1692,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
1636 ins.type = BTRFS_EXTENT_ITEM_KEY; 1692 ins.type = BTRFS_EXTENT_ITEM_KEY;
1637 ret = btrfs_alloc_reserved_file_extent(trans, root, 1693 ret = btrfs_alloc_reserved_file_extent(trans, root,
1638 root->root_key.objectid, 1694 root->root_key.objectid,
1639 inode->i_ino, file_pos, &ins); 1695 btrfs_ino(inode), file_pos, &ins);
1640 BUG_ON(ret); 1696 BUG_ON(ret);
1641 btrfs_free_path(path); 1697 btrfs_free_path(path);
1642 1698
@@ -1660,8 +1716,9 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1660 struct btrfs_ordered_extent *ordered_extent = NULL; 1716 struct btrfs_ordered_extent *ordered_extent = NULL;
1661 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 1717 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
1662 struct extent_state *cached_state = NULL; 1718 struct extent_state *cached_state = NULL;
1663 int compressed = 0; 1719 int compress_type = 0;
1664 int ret; 1720 int ret;
1721 bool nolock;
1665 1722
1666 ret = btrfs_dec_test_ordered_pending(inode, &ordered_extent, start, 1723 ret = btrfs_dec_test_ordered_pending(inode, &ordered_extent, start,
1667 end - start + 1); 1724 end - start + 1);
@@ -1669,12 +1726,17 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1669 return 0; 1726 return 0;
1670 BUG_ON(!ordered_extent); 1727 BUG_ON(!ordered_extent);
1671 1728
1729 nolock = is_free_space_inode(root, inode);
1730
1672 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { 1731 if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
1673 BUG_ON(!list_empty(&ordered_extent->list)); 1732 BUG_ON(!list_empty(&ordered_extent->list));
1674 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); 1733 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
1675 if (!ret) { 1734 if (!ret) {
1676 trans = btrfs_join_transaction(root, 1); 1735 if (nolock)
1677 btrfs_set_trans_block_group(trans, inode); 1736 trans = btrfs_join_transaction_nolock(root);
1737 else
1738 trans = btrfs_join_transaction(root);
1739 BUG_ON(IS_ERR(trans));
1678 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 1740 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1679 ret = btrfs_update_inode(trans, root, inode); 1741 ret = btrfs_update_inode(trans, root, inode);
1680 BUG_ON(ret); 1742 BUG_ON(ret);
@@ -1686,27 +1748,31 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1686 ordered_extent->file_offset + ordered_extent->len - 1, 1748 ordered_extent->file_offset + ordered_extent->len - 1,
1687 0, &cached_state, GFP_NOFS); 1749 0, &cached_state, GFP_NOFS);
1688 1750
1689 trans = btrfs_join_transaction(root, 1); 1751 if (nolock)
1690 btrfs_set_trans_block_group(trans, inode); 1752 trans = btrfs_join_transaction_nolock(root);
1753 else
1754 trans = btrfs_join_transaction(root);
1755 BUG_ON(IS_ERR(trans));
1691 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 1756 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
1692 1757
1693 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) 1758 if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
1694 compressed = 1; 1759 compress_type = ordered_extent->compress_type;
1695 if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { 1760 if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) {
1696 BUG_ON(compressed); 1761 BUG_ON(compress_type);
1697 ret = btrfs_mark_extent_written(trans, inode, 1762 ret = btrfs_mark_extent_written(trans, inode,
1698 ordered_extent->file_offset, 1763 ordered_extent->file_offset,
1699 ordered_extent->file_offset + 1764 ordered_extent->file_offset +
1700 ordered_extent->len); 1765 ordered_extent->len);
1701 BUG_ON(ret); 1766 BUG_ON(ret);
1702 } else { 1767 } else {
1768 BUG_ON(root == root->fs_info->tree_root);
1703 ret = insert_reserved_file_extent(trans, inode, 1769 ret = insert_reserved_file_extent(trans, inode,
1704 ordered_extent->file_offset, 1770 ordered_extent->file_offset,
1705 ordered_extent->start, 1771 ordered_extent->start,
1706 ordered_extent->disk_len, 1772 ordered_extent->disk_len,
1707 ordered_extent->len, 1773 ordered_extent->len,
1708 ordered_extent->len, 1774 ordered_extent->len,
1709 compressed, 0, 0, 1775 compress_type, 0, 0,
1710 BTRFS_FILE_EXTENT_REG); 1776 BTRFS_FILE_EXTENT_REG);
1711 unpin_extent_cache(&BTRFS_I(inode)->extent_tree, 1777 unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
1712 ordered_extent->file_offset, 1778 ordered_extent->file_offset,
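
Replacing the old compressed boolean with compress_type records which algorithm produced the extent now that LZO exists alongside zlib; BTRFS_COMPRESS_NONE is 0, so truth tests such as BUG_ON(compress_type) keep their old boolean meaning. A sketch using what I take to be the compression.h values:

#include <stdio.h>

enum {
	BTRFS_COMPRESS_NONE = 0,
	BTRFS_COMPRESS_ZLIB = 1,
	BTRFS_COMPRESS_LZO  = 2,
};

static const char *name(int t)
{
	switch (t) {
	case BTRFS_COMPRESS_ZLIB: return "zlib";
	case BTRFS_COMPRESS_LZO:  return "lzo";
	default:		   return "none";
	}
}

int main(void)
{
	int compress_type = BTRFS_COMPRESS_LZO;
	/* a bool could only say "compressed"; the type says how */
	printf("%s (truthy=%d)\n", name(compress_type), !!compress_type);
	return 0;
}
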
@@ -1720,13 +1786,22 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
1720 add_pending_csums(trans, inode, ordered_extent->file_offset, 1786 add_pending_csums(trans, inode, ordered_extent->file_offset,
1721 &ordered_extent->list); 1787 &ordered_extent->list);
1722 1788
1723 btrfs_ordered_update_i_size(inode, 0, ordered_extent); 1789 ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent);
1724 ret = btrfs_update_inode(trans, root, inode); 1790 if (!ret) {
1725 BUG_ON(ret); 1791 ret = btrfs_update_inode(trans, root, inode);
1792 BUG_ON(ret);
1793 }
1794 ret = 0;
1726out: 1795out:
1727 btrfs_delalloc_release_metadata(inode, ordered_extent->len); 1796 if (nolock) {
1728 if (trans) 1797 if (trans)
1729 btrfs_end_transaction(trans, root); 1798 btrfs_end_transaction_nolock(trans, root);
1799 } else {
1800 btrfs_delalloc_release_metadata(inode, ordered_extent->len);
1801 if (trans)
1802 btrfs_end_transaction(trans, root);
1803 }
1804
1730 /* once for us */ 1805 /* once for us */
1731 btrfs_put_ordered_extent(ordered_extent); 1806 btrfs_put_ordered_extent(ordered_extent);
1732 /* once for the tree */ 1807 /* once for the tree */
@@ -1738,6 +1813,8 @@ out:
1738static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, 1813static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
1739 struct extent_state *state, int uptodate) 1814 struct extent_state *state, int uptodate)
1740{ 1815{
1816 trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
1817
1741 ClearPagePrivate2(page); 1818 ClearPagePrivate2(page);
1742 return btrfs_finish_ordered_io(page->mapping->host, start, end); 1819 return btrfs_finish_ordered_io(page->mapping->host, start, end);
1743} 1820}
@@ -1793,7 +1870,7 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
1793 } 1870 }
1794 read_unlock(&em_tree->lock); 1871 read_unlock(&em_tree->lock);
1795 1872
1796 if (!em || IS_ERR(em)) { 1873 if (IS_ERR_OR_NULL(em)) {
1797 kfree(failrec); 1874 kfree(failrec);
1798 return -EIO; 1875 return -EIO;
1799 } 1876 }
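
The switch from !em || IS_ERR(em) to IS_ERR_OR_NULL(em) folds both failure shapes into one test. A userspace re-creation of the kernel's err.h idiom (MAX_ERRNO and the pointer encoding are as I recall them; treat this as a sketch):

#include <stdio.h>

#define MAX_ERRNO 4095
static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long IS_ERR(const void *p)
{
	/* error codes occupy the top page of the address space */
	return (unsigned long)p >= (unsigned long)-MAX_ERRNO;
}
static inline long IS_ERR_OR_NULL(const void *p)
{
	return !p || IS_ERR(p);
}

int main(void)
{
	void *ok = &ok, *null = NULL, *err = ERR_PTR(-12 /* -ENOMEM */);
	printf("%ld %ld %ld\n", IS_ERR_OR_NULL(ok),
	       IS_ERR_OR_NULL(null), IS_ERR_OR_NULL(err));	/* 0 1 1 */
	return 0;
}
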
@@ -1802,6 +1879,8 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
1802 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { 1879 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
1803 logical = em->block_start; 1880 logical = em->block_start;
1804 failrec->bio_flags = EXTENT_BIO_COMPRESSED; 1881 failrec->bio_flags = EXTENT_BIO_COMPRESSED;
1882 extent_set_compress_type(&failrec->bio_flags,
1883 em->compress_type);
1805 } 1884 }
1806 failrec->logical = logical; 1885 failrec->logical = logical;
1807 free_extent_map(em); 1886 free_extent_map(em);
@@ -1846,10 +1925,10 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
1846 else 1925 else
1847 rw = READ; 1926 rw = READ;
1848 1927
1849 BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio, 1928 ret = BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
1850 failrec->last_mirror, 1929 failrec->last_mirror,
1851 failrec->bio_flags, 0); 1930 failrec->bio_flags, 0);
1852 return 0; 1931 return ret;
1853} 1932}
1854 1933
1855/* 1934/*
@@ -1865,7 +1944,7 @@ static int btrfs_clean_io_failures(struct inode *inode, u64 start)
1865 1944
1866 private = 0; 1945 private = 0;
1867 if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private, 1946 if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
1868 (u64)-1, 1, EXTENT_DIRTY)) { 1947 (u64)-1, 1, EXTENT_DIRTY, 0)) {
1869 ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, 1948 ret = get_state_private(&BTRFS_I(inode)->io_failure_tree,
1870 start, &private_failure); 1949 start, &private_failure);
1871 if (ret == 0) { 1950 if (ret == 0) {
@@ -1907,7 +1986,7 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
1907 } 1986 }
1908 1987
1909 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) 1988 if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
1910 return 0; 1989 goto good;
1911 1990
1912 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID && 1991 if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID &&
1913 test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) { 1992 test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) {
@@ -1940,12 +2019,11 @@ good:
1940 return 0; 2019 return 0;
1941 2020
1942zeroit: 2021zeroit:
1943 if (printk_ratelimit()) { 2022 printk_ratelimited(KERN_INFO "btrfs csum failed ino %llu off %llu csum %u "
1944 printk(KERN_INFO "btrfs csum failed ino %lu off %llu csum %u " 2023 "private %llu\n",
1945 "private %llu\n", page->mapping->host->i_ino, 2024 (unsigned long long)btrfs_ino(page->mapping->host),
1946 (unsigned long long)start, csum, 2025 (unsigned long long)start, csum,
1947 (unsigned long long)private); 2026 (unsigned long long)private);
1948 }
1949 memset(kaddr + offset, 1, end - start + 1); 2027 memset(kaddr + offset, 1, end - start + 1);
1950 flush_dcache_page(page); 2028 flush_dcache_page(page);
1951 kunmap_atomic(kaddr, KM_USER0); 2029 kunmap_atomic(kaddr, KM_USER0);
@@ -2161,8 +2239,6 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
2161 insert = 1; 2239 insert = 1;
2162#endif 2240#endif
2163 insert = 1; 2241 insert = 1;
2164 } else {
2165 WARN_ON(!BTRFS_I(inode)->orphan_meta_reserved);
2166 } 2242 }
2167 2243
2168 if (!BTRFS_I(inode)->orphan_meta_reserved) { 2244 if (!BTRFS_I(inode)->orphan_meta_reserved) {
@@ -2182,7 +2258,7 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
2182 2258
2183 /* insert an orphan item to track this unlinked/truncated file */ 2259 /* insert an orphan item to track this unlinked/truncated file */
2184 if (insert >= 1) { 2260 if (insert >= 1) {
2185 ret = btrfs_insert_orphan_item(trans, root, inode->i_ino); 2261 ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode));
2186 BUG_ON(ret); 2262 BUG_ON(ret);
2187 } 2263 }
2188 2264
@@ -2219,7 +2295,7 @@ int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
2219 spin_unlock(&root->orphan_lock); 2295 spin_unlock(&root->orphan_lock);
2220 2296
2221 if (trans && delete_item) { 2297 if (trans && delete_item) {
2222 ret = btrfs_del_orphan_item(trans, root, inode->i_ino); 2298 ret = btrfs_del_orphan_item(trans, root, btrfs_ino(inode));
2223 BUG_ON(ret); 2299 BUG_ON(ret);
2224 } 2300 }
2225 2301
@@ -2233,21 +2309,23 @@ int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode)
2233 * this cleans up any orphans that may be left on the list from the last use 2309 * this cleans up any orphans that may be left on the list from the last use
2234 * of this root. 2310 * of this root.
2235 */ 2311 */
2236void btrfs_orphan_cleanup(struct btrfs_root *root) 2312int btrfs_orphan_cleanup(struct btrfs_root *root)
2237{ 2313{
2238 struct btrfs_path *path; 2314 struct btrfs_path *path;
2239 struct extent_buffer *leaf; 2315 struct extent_buffer *leaf;
2240 struct btrfs_item *item;
2241 struct btrfs_key key, found_key; 2316 struct btrfs_key key, found_key;
2242 struct btrfs_trans_handle *trans; 2317 struct btrfs_trans_handle *trans;
2243 struct inode *inode; 2318 struct inode *inode;
2244 int ret = 0, nr_unlink = 0, nr_truncate = 0; 2319 int ret = 0, nr_unlink = 0, nr_truncate = 0;
2245 2320
2246 if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED)) 2321 if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
2247 return; 2322 return 0;
2248 2323
2249 path = btrfs_alloc_path(); 2324 path = btrfs_alloc_path();
2250 BUG_ON(!path); 2325 if (!path) {
2326 ret = -ENOMEM;
2327 goto out;
2328 }
2251 path->reada = -1; 2329 path->reada = -1;
2252 2330
2253 key.objectid = BTRFS_ORPHAN_OBJECTID; 2331 key.objectid = BTRFS_ORPHAN_OBJECTID;
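
The cmpxchg() at the top of btrfs_orphan_cleanup() makes the function idempotent: only the caller that moves orphan_cleanup_state from 0 to ORPHAN_CLEANUP_STARTED proceeds; everyone else sees a nonzero old value and returns at once. The same gate in portable C11 atomics:

#include <stdatomic.h>
#include <stdio.h>

enum { ORPHAN_CLEANUP_STARTED = 1 };
static _Atomic int cleanup_state;	/* zero-initialized, like the kernel field */

static int try_start_cleanup(void)
{
	int expected = 0;
	/* succeeds only if the state was still 0 */
	return atomic_compare_exchange_strong(&cleanup_state, &expected,
					      ORPHAN_CLEANUP_STARTED);
}

int main(void)
{
	int first = try_start_cleanup();	/* 1: we won the race */
	int again = try_start_cleanup();	/* 0: already started */
	printf("first=%d again=%d\n", first, again);
	return 0;
}
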
@@ -2256,18 +2334,16 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2256 2334
2257 while (1) { 2335 while (1) {
2258 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 2336 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2259 if (ret < 0) { 2337 if (ret < 0)
2260 printk(KERN_ERR "Error searching slot for orphan: %d" 2338 goto out;
2261 "\n", ret);
2262 break;
2263 }
2264 2339
2265 /* 2340 /*
2266 * if ret == 0 means we found what we were searching for, which 2341 * if ret == 0 means we found what we were searching for, which
2267 * is weird, but possible, so only screw with path if we didnt 2342 * is weird, but possible, so only screw with path if we didn't
2268 * find the key and see if we have stuff that matches 2343 * find the key and see if we have stuff that matches
2269 */ 2344 */
2270 if (ret > 0) { 2345 if (ret > 0) {
2346 ret = 0;
2271 if (path->slots[0] == 0) 2347 if (path->slots[0] == 0)
2272 break; 2348 break;
2273 path->slots[0]--; 2349 path->slots[0]--;
@@ -2275,7 +2351,6 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2275 2351
2276 /* pull out the item */ 2352 /* pull out the item */
2277 leaf = path->nodes[0]; 2353 leaf = path->nodes[0];
2278 item = btrfs_item_nr(leaf, path->slots[0]);
2279 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 2354 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
2280 2355
2281 /* make sure the item matches what we want */ 2356 /* make sure the item matches what we want */
@@ -2285,7 +2360,7 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2285 break; 2360 break;
2286 2361
2287 /* release the path since we're done with it */ 2362 /* release the path since we're done with it */
2288 btrfs_release_path(root, path); 2363 btrfs_release_path(path);
2289 2364
2290 /* 2365 /*
2291 * this is where we are basically btrfs_lookup, without the 2366 * this is where we are basically btrfs_lookup, without the
@@ -2296,7 +2371,10 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2296 found_key.type = BTRFS_INODE_ITEM_KEY; 2371 found_key.type = BTRFS_INODE_ITEM_KEY;
2297 found_key.offset = 0; 2372 found_key.offset = 0;
2298 inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL); 2373 inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
2299 BUG_ON(IS_ERR(inode)); 2374 if (IS_ERR(inode)) {
2375 ret = PTR_ERR(inode);
2376 goto out;
2377 }
2300 2378
2301 /* 2379 /*
2302 * add this inode to the orphan list so btrfs_orphan_del does 2380 * add this inode to the orphan list so btrfs_orphan_del does
@@ -2314,6 +2392,10 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2314 */ 2392 */
2315 if (is_bad_inode(inode)) { 2393 if (is_bad_inode(inode)) {
2316 trans = btrfs_start_transaction(root, 0); 2394 trans = btrfs_start_transaction(root, 0);
2395 if (IS_ERR(trans)) {
2396 ret = PTR_ERR(trans);
2397 goto out;
2398 }
2317 btrfs_orphan_del(trans, inode); 2399 btrfs_orphan_del(trans, inode);
2318 btrfs_end_transaction(trans, root); 2400 btrfs_end_transaction(trans, root);
2319 iput(inode); 2401 iput(inode);
@@ -2322,17 +2404,22 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2322 2404
2323 /* if we have links, this was a truncate, lets do that */ 2405 /* if we have links, this was a truncate, lets do that */
2324 if (inode->i_nlink) { 2406 if (inode->i_nlink) {
2407 if (!S_ISREG(inode->i_mode)) {
2408 WARN_ON(1);
2409 iput(inode);
2410 continue;
2411 }
2325 nr_truncate++; 2412 nr_truncate++;
2326 btrfs_truncate(inode); 2413 ret = btrfs_truncate(inode);
2327 } else { 2414 } else {
2328 nr_unlink++; 2415 nr_unlink++;
2329 } 2416 }
2330 2417
2331 /* this will do delete_inode and everything for us */ 2418 /* this will do delete_inode and everything for us */
2332 iput(inode); 2419 iput(inode);
2420 if (ret)
2421 goto out;
2333 } 2422 }
2334 btrfs_free_path(path);
2335
2336 root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE; 2423 root->orphan_cleanup_state = ORPHAN_CLEANUP_DONE;
2337 2424
2338 if (root->orphan_block_rsv) 2425 if (root->orphan_block_rsv)
@@ -2340,14 +2427,21 @@ void btrfs_orphan_cleanup(struct btrfs_root *root)
2340 (u64)-1); 2427 (u64)-1);
2341 2428
2342 if (root->orphan_block_rsv || root->orphan_item_inserted) { 2429 if (root->orphan_block_rsv || root->orphan_item_inserted) {
2343 trans = btrfs_join_transaction(root, 1); 2430 trans = btrfs_join_transaction(root);
2344 btrfs_end_transaction(trans, root); 2431 if (!IS_ERR(trans))
2432 btrfs_end_transaction(trans, root);
2345 } 2433 }
2346 2434
2347 if (nr_unlink) 2435 if (nr_unlink)
2348 printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink); 2436 printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink);
2349 if (nr_truncate) 2437 if (nr_truncate)
2350 printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate); 2438 printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate);
2439
2440out:
2441 if (ret)
2442 printk(KERN_CRIT "btrfs: could not do orphan cleanup %d\n", ret);
2443 btrfs_free_path(path);
2444 return ret;
2351} 2445}
2352 2446
2353/* 2447/*
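
With the void-to-int conversion, btrfs_orphan_cleanup() funnels every failure through one out label that logs once and frees the path, instead of BUG()ing per step. The control-flow shape, with placeholder resources (malloc/free stand in for btrfs_alloc_path()/btrfs_free_path(), and the latter accepts NULL just as free() does):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

static int cleanup(int simulate_search_error)
{
	int ret = 0;
	char *path = malloc(64);

	if (!path) {
		ret = -ENOMEM;
		goto out;
	}
	if (simulate_search_error) {	/* e.g. btrfs_search_slot() < 0 */
		ret = -EIO;
		goto out;
	}
out:
	if (ret)
		fprintf(stderr, "could not do orphan cleanup %d\n", ret);
	free(path);
	return ret;
}

int main(void)
{
	return cleanup(1) ? 1 : 0;
}
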
@@ -2413,12 +2507,17 @@ static void btrfs_read_locked_inode(struct inode *inode)
2413 struct btrfs_root *root = BTRFS_I(inode)->root; 2507 struct btrfs_root *root = BTRFS_I(inode)->root;
2414 struct btrfs_key location; 2508 struct btrfs_key location;
2415 int maybe_acls; 2509 int maybe_acls;
2416 u64 alloc_group_block;
2417 u32 rdev; 2510 u32 rdev;
2418 int ret; 2511 int ret;
2512 bool filled = false;
2513
2514 ret = btrfs_fill_inode(inode, &rdev);
2515 if (!ret)
2516 filled = true;
2419 2517
2420 path = btrfs_alloc_path(); 2518 path = btrfs_alloc_path();
2421 BUG_ON(!path); 2519 BUG_ON(!path);
2520 path->leave_spinning = 1;
2422 memcpy(&location, &BTRFS_I(inode)->location, sizeof(location)); 2521 memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
2423 2522
2424 ret = btrfs_lookup_inode(NULL, root, path, &location, 0); 2523 ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
@@ -2426,8 +2525,18 @@ static void btrfs_read_locked_inode(struct inode *inode)
2426 goto make_bad; 2525 goto make_bad;
2427 2526
2428 leaf = path->nodes[0]; 2527 leaf = path->nodes[0];
2528
2529 if (filled)
2530 goto cache_acl;
2531
2429 inode_item = btrfs_item_ptr(leaf, path->slots[0], 2532 inode_item = btrfs_item_ptr(leaf, path->slots[0],
2430 struct btrfs_inode_item); 2533 struct btrfs_inode_item);
2534 if (!leaf->map_token)
2535 map_private_extent_buffer(leaf, (unsigned long)inode_item,
2536 sizeof(struct btrfs_inode_item),
2537 &leaf->map_token, &leaf->kaddr,
2538 &leaf->map_start, &leaf->map_len,
2539 KM_USER1);
2431 2540
2432 inode->i_mode = btrfs_inode_mode(leaf, inode_item); 2541 inode->i_mode = btrfs_inode_mode(leaf, inode_item);
2433 inode->i_nlink = btrfs_inode_nlink(leaf, inode_item); 2542 inode->i_nlink = btrfs_inode_nlink(leaf, inode_item);
@@ -2456,21 +2565,22 @@ static void btrfs_read_locked_inode(struct inode *inode)
2456 2565
2457 BTRFS_I(inode)->index_cnt = (u64)-1; 2566 BTRFS_I(inode)->index_cnt = (u64)-1;
2458 BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item); 2567 BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
2459 2568cache_acl:
2460 alloc_group_block = btrfs_inode_block_group(leaf, inode_item);
2461
2462 /* 2569 /*
2463 * try to precache a NULL acl entry for files that don't have 2570 * try to precache a NULL acl entry for files that don't have
2464 * any xattrs or acls 2571 * any xattrs or acls
2465 */ 2572 */
2466 maybe_acls = acls_after_inode_item(leaf, path->slots[0], inode->i_ino); 2573 maybe_acls = acls_after_inode_item(leaf, path->slots[0],
2574 btrfs_ino(inode));
2467 if (!maybe_acls) 2575 if (!maybe_acls)
2468 cache_no_acl(inode); 2576 cache_no_acl(inode);
2469 2577
2470 BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0, 2578 if (leaf->map_token) {
2471 alloc_group_block, 0); 2579 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
2580 leaf->map_token = NULL;
2581 }
2582
2472 btrfs_free_path(path); 2583 btrfs_free_path(path);
2473 inode_item = NULL;
2474 2584
2475 switch (inode->i_mode & S_IFMT) { 2585 switch (inode->i_mode & S_IFMT) {
2476 case S_IFREG: 2586 case S_IFREG:
@@ -2514,6 +2624,13 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
2514 struct btrfs_inode_item *item, 2624 struct btrfs_inode_item *item,
2515 struct inode *inode) 2625 struct inode *inode)
2516{ 2626{
2627 if (!leaf->map_token)
2628 map_private_extent_buffer(leaf, (unsigned long)item,
2629 sizeof(struct btrfs_inode_item),
2630 &leaf->map_token, &leaf->kaddr,
2631 &leaf->map_start, &leaf->map_len,
2632 KM_USER1);
2633
2517 btrfs_set_inode_uid(leaf, item, inode->i_uid); 2634 btrfs_set_inode_uid(leaf, item, inode->i_uid);
2518 btrfs_set_inode_gid(leaf, item, inode->i_gid); 2635 btrfs_set_inode_gid(leaf, item, inode->i_gid);
2519 btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size); 2636 btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size);
@@ -2541,7 +2658,12 @@ static void fill_inode_item(struct btrfs_trans_handle *trans,
2541 btrfs_set_inode_transid(leaf, item, trans->transid); 2658 btrfs_set_inode_transid(leaf, item, trans->transid);
2542 btrfs_set_inode_rdev(leaf, item, inode->i_rdev); 2659 btrfs_set_inode_rdev(leaf, item, inode->i_rdev);
2543 btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags); 2660 btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags);
2544 btrfs_set_inode_block_group(leaf, item, BTRFS_I(inode)->block_group); 2661 btrfs_set_inode_block_group(leaf, item, 0);
2662
2663 if (leaf->map_token) {
2664 unmap_extent_buffer(leaf, leaf->map_token, KM_USER1);
2665 leaf->map_token = NULL;
2666 }
2545} 2667}
2546 2668
2547/* 2669/*
@@ -2555,11 +2677,28 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
2555 struct extent_buffer *leaf; 2677 struct extent_buffer *leaf;
2556 int ret; 2678 int ret;
2557 2679
2680 /*
2681 * If the inode is a free space inode, we can deadlock during commit
2682 * if we put it into the delayed code.
2683 *
2684 * The data relocation inode should also be directly updated
2685 * without delay
2686 */
2687 if (!is_free_space_inode(root, inode)
2688 && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) {
2689 ret = btrfs_delayed_update_inode(trans, root, inode);
2690 if (!ret)
2691 btrfs_set_inode_last_trans(trans, inode);
2692 return ret;
2693 }
2694
2558 path = btrfs_alloc_path(); 2695 path = btrfs_alloc_path();
2559 BUG_ON(!path); 2696 if (!path)
2697 return -ENOMEM;
2698
2560 path->leave_spinning = 1; 2699 path->leave_spinning = 1;
2561 ret = btrfs_lookup_inode(trans, root, path, 2700 ret = btrfs_lookup_inode(trans, root, path, &BTRFS_I(inode)->location,
2562 &BTRFS_I(inode)->location, 1); 2701 1);
2563 if (ret) { 2702 if (ret) {
2564 if (ret > 0) 2703 if (ret > 0)
2565 ret = -ENOENT; 2704 ret = -ENOENT;
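
The comment above encodes a small decision table: inodes normally go through the delayed-inode machinery, but the free-space cache inodes (commit-time deadlock risk) and the data relocation tree bypass it and write the item directly. Restated as a standalone predicate with stand-in inputs:

#include <stdbool.h>
#include <stdio.h>

enum update_path { UPDATE_DELAYED, UPDATE_DIRECT };

static enum update_path pick_update_path(bool free_space_inode,
					 bool data_reloc_root)
{
	if (!free_space_inode && !data_reloc_root)
		return UPDATE_DELAYED;	/* btrfs_delayed_update_inode() */
	return UPDATE_DIRECT;		/* search + fill_inode_item() */
}

int main(void)
{
	printf("%d %d %d\n",
	       pick_update_path(false, false),	/* 0: delayed */
	       pick_update_path(true, false),	/* 1: direct */
	       pick_update_path(false, true));	/* 1: direct */
	return 0;
}
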
@@ -2569,7 +2708,7 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
2569 btrfs_unlock_up_safe(path, 1); 2708 btrfs_unlock_up_safe(path, 1);
2570 leaf = path->nodes[0]; 2709 leaf = path->nodes[0];
2571 inode_item = btrfs_item_ptr(leaf, path->slots[0], 2710 inode_item = btrfs_item_ptr(leaf, path->slots[0],
2572 struct btrfs_inode_item); 2711 struct btrfs_inode_item);
2573 2712
2574 fill_inode_item(trans, leaf, inode_item, inode); 2713 fill_inode_item(trans, leaf, inode_item, inode);
2575 btrfs_mark_buffer_dirty(leaf); 2714 btrfs_mark_buffer_dirty(leaf);
@@ -2580,16 +2719,15 @@ failed:
2580 return ret; 2719 return ret;
2581} 2720}
2582 2721
2583
2584/* 2722/*
2585 * unlink helper that gets used here in inode.c and in the tree logging 2723 * unlink helper that gets used here in inode.c and in the tree logging
2586 * recovery code. It removes a link in a directory with a given name, and 2724 * recovery code. It removes a link in a directory with a given name, and
2587 * also drops the back refs in the inode to the directory 2725 * also drops the back refs in the inode to the directory
2588 */ 2726 */
2589int btrfs_unlink_inode(struct btrfs_trans_handle *trans, 2727static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
2590 struct btrfs_root *root, 2728 struct btrfs_root *root,
2591 struct inode *dir, struct inode *inode, 2729 struct inode *dir, struct inode *inode,
2592 const char *name, int name_len) 2730 const char *name, int name_len)
2593{ 2731{
2594 struct btrfs_path *path; 2732 struct btrfs_path *path;
2595 int ret = 0; 2733 int ret = 0;
@@ -2597,15 +2735,17 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
2597 struct btrfs_dir_item *di; 2735 struct btrfs_dir_item *di;
2598 struct btrfs_key key; 2736 struct btrfs_key key;
2599 u64 index; 2737 u64 index;
2738 u64 ino = btrfs_ino(inode);
2739 u64 dir_ino = btrfs_ino(dir);
2600 2740
2601 path = btrfs_alloc_path(); 2741 path = btrfs_alloc_path();
2602 if (!path) { 2742 if (!path) {
2603 ret = -ENOMEM; 2743 ret = -ENOMEM;
2604 goto err; 2744 goto out;
2605 } 2745 }
2606 2746
2607 path->leave_spinning = 1; 2747 path->leave_spinning = 1;
2608 di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino, 2748 di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
2609 name, name_len, -1); 2749 name, name_len, -1);
2610 if (IS_ERR(di)) { 2750 if (IS_ERR(di)) {
2611 ret = PTR_ERR(di); 2751 ret = PTR_ERR(di);
@@ -2620,38 +2760,29 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
2620 ret = btrfs_delete_one_dir_name(trans, root, path, di); 2760 ret = btrfs_delete_one_dir_name(trans, root, path, di);
2621 if (ret) 2761 if (ret)
2622 goto err; 2762 goto err;
2623 btrfs_release_path(root, path); 2763 btrfs_release_path(path);
2624 2764
2625 ret = btrfs_del_inode_ref(trans, root, name, name_len, 2765 ret = btrfs_del_inode_ref(trans, root, name, name_len, ino,
2626 inode->i_ino, 2766 dir_ino, &index);
2627 dir->i_ino, &index);
2628 if (ret) { 2767 if (ret) {
2629 printk(KERN_INFO "btrfs failed to delete reference to %.*s, " 2768 printk(KERN_INFO "btrfs failed to delete reference to %.*s, "
2630 "inode %lu parent %lu\n", name_len, name, 2769 "inode %llu parent %llu\n", name_len, name,
2631 inode->i_ino, dir->i_ino); 2770 (unsigned long long)ino, (unsigned long long)dir_ino);
2632 goto err; 2771 goto err;
2633 } 2772 }
2634 2773
2635 di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, 2774 ret = btrfs_delete_delayed_dir_index(trans, root, dir, index);
2636 index, name, name_len, -1); 2775 if (ret)
2637 if (IS_ERR(di)) {
2638 ret = PTR_ERR(di);
2639 goto err;
2640 }
2641 if (!di) {
2642 ret = -ENOENT;
2643 goto err; 2776 goto err;
2644 }
2645 ret = btrfs_delete_one_dir_name(trans, root, path, di);
2646 btrfs_release_path(root, path);
2647 2777
2648 ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len, 2778 ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len,
2649 inode, dir->i_ino); 2779 inode, dir_ino);
2650 BUG_ON(ret != 0 && ret != -ENOENT); 2780 BUG_ON(ret != 0 && ret != -ENOENT);
2651 2781
2652 ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len, 2782 ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len,
2653 dir, index); 2783 dir, index);
2654 BUG_ON(ret); 2784 if (ret == -ENOENT)
2785 ret = 0;
2655err: 2786err:
2656 btrfs_free_path(path); 2787 btrfs_free_path(path);
2657 if (ret) 2788 if (ret)
@@ -2660,22 +2791,36 @@ err:
2660 btrfs_i_size_write(dir, dir->i_size - name_len * 2); 2791 btrfs_i_size_write(dir, dir->i_size - name_len * 2);
2661 inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME; 2792 inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME;
2662 btrfs_update_inode(trans, root, dir); 2793 btrfs_update_inode(trans, root, dir);
2663 btrfs_drop_nlink(inode);
2664 ret = btrfs_update_inode(trans, root, inode);
2665out: 2794out:
2666 return ret; 2795 return ret;
2667} 2796}
2668 2797
2798int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
2799 struct btrfs_root *root,
2800 struct inode *dir, struct inode *inode,
2801 const char *name, int name_len)
2802{
2803 int ret;
2804 ret = __btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
2805 if (!ret) {
2806 btrfs_drop_nlink(inode);
2807 ret = btrfs_update_inode(trans, root, inode);
2808 }
2809 return ret;
2810}
2811
2812
2669/* helper to check if there is any shared block in the path */ 2813/* helper to check if there is any shared block in the path */
2670static int check_path_shared(struct btrfs_root *root, 2814static int check_path_shared(struct btrfs_root *root,
2671 struct btrfs_path *path) 2815 struct btrfs_path *path)
2672{ 2816{
2673 struct extent_buffer *eb; 2817 struct extent_buffer *eb;
2674 int level; 2818 int level;
2675 int ret;
2676 u64 refs = 1; 2819 u64 refs = 1;
2677 2820
2678 for (level = 0; level < BTRFS_MAX_LEVEL; level++) { 2821 for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
2822 int ret;
2823
2679 if (!path->nodes[level]) 2824 if (!path->nodes[level])
2680 break; 2825 break;
2681 eb = path->nodes[level]; 2826 eb = path->nodes[level];
@@ -2709,12 +2854,14 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
2709 int check_link = 1; 2854 int check_link = 1;
2710 int err = -ENOSPC; 2855 int err = -ENOSPC;
2711 int ret; 2856 int ret;
2857 u64 ino = btrfs_ino(inode);
2858 u64 dir_ino = btrfs_ino(dir);
2712 2859
2713 trans = btrfs_start_transaction(root, 10); 2860 trans = btrfs_start_transaction(root, 10);
2714 if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC) 2861 if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
2715 return trans; 2862 return trans;
2716 2863
2717 if (inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) 2864 if (ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
2718 return ERR_PTR(-ENOSPC); 2865 return ERR_PTR(-ENOSPC);
2719 2866
2720 /* check if someone else holds a reference */ 2867 /* check if someone else holds a reference */
@@ -2755,7 +2902,7 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
2755 } else { 2902 } else {
2756 check_link = 0; 2903 check_link = 0;
2757 } 2904 }
2758 btrfs_release_path(root, path); 2905 btrfs_release_path(path);
2759 2906
2760 ret = btrfs_lookup_inode(trans, root, path, 2907 ret = btrfs_lookup_inode(trans, root, path,
2761 &BTRFS_I(inode)->location, 0); 2908 &BTRFS_I(inode)->location, 0);
@@ -2769,11 +2916,11 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
2769 } else { 2916 } else {
2770 check_link = 0; 2917 check_link = 0;
2771 } 2918 }
2772 btrfs_release_path(root, path); 2919 btrfs_release_path(path);
2773 2920
2774 if (ret == 0 && S_ISREG(inode->i_mode)) { 2921 if (ret == 0 && S_ISREG(inode->i_mode)) {
2775 ret = btrfs_lookup_file_extent(trans, root, path, 2922 ret = btrfs_lookup_file_extent(trans, root, path,
2776 inode->i_ino, (u64)-1, 0); 2923 ino, (u64)-1, 0);
2777 if (ret < 0) { 2924 if (ret < 0) {
2778 err = ret; 2925 err = ret;
2779 goto out; 2926 goto out;
@@ -2781,7 +2928,7 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
2781 BUG_ON(ret == 0); 2928 BUG_ON(ret == 0);
2782 if (check_path_shared(root, path)) 2929 if (check_path_shared(root, path))
2783 goto out; 2930 goto out;
2784 btrfs_release_path(root, path); 2931 btrfs_release_path(path);
2785 } 2932 }
2786 2933
2787 if (!check_link) { 2934 if (!check_link) {
@@ -2789,7 +2936,7 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
2789 goto out; 2936 goto out;
2790 } 2937 }
2791 2938
2792 di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino, 2939 di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
2793 dentry->d_name.name, dentry->d_name.len, 0); 2940 dentry->d_name.name, dentry->d_name.len, 0);
2794 if (IS_ERR(di)) { 2941 if (IS_ERR(di)) {
2795 err = PTR_ERR(di); 2942 err = PTR_ERR(di);
@@ -2802,11 +2949,11 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
2802 err = 0; 2949 err = 0;
2803 goto out; 2950 goto out;
2804 } 2951 }
2805 btrfs_release_path(root, path); 2952 btrfs_release_path(path);
2806 2953
2807 ref = btrfs_lookup_inode_ref(trans, root, path, 2954 ref = btrfs_lookup_inode_ref(trans, root, path,
2808 dentry->d_name.name, dentry->d_name.len, 2955 dentry->d_name.name, dentry->d_name.len,
2809 inode->i_ino, dir->i_ino, 0); 2956 ino, dir_ino, 0);
2810 if (IS_ERR(ref)) { 2957 if (IS_ERR(ref)) {
2811 err = PTR_ERR(ref); 2958 err = PTR_ERR(ref);
2812 goto out; 2959 goto out;
@@ -2815,9 +2962,17 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
2815 if (check_path_shared(root, path)) 2962 if (check_path_shared(root, path))
2816 goto out; 2963 goto out;
2817 index = btrfs_inode_ref_index(path->nodes[0], ref); 2964 index = btrfs_inode_ref_index(path->nodes[0], ref);
2818 btrfs_release_path(root, path); 2965 btrfs_release_path(path);
2819 2966
2820 di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, index, 2967 /*
2968 * This is a commit root search; if we can look up the inode item and
2969 * other related items in the commit root, it means the transaction of
2970 * dir/file creation has been committed, and the dir index item whose
2971 * insertion we delayed has also been inserted into the commit root. So
2972 * we needn't worry about the delayed insertion of the dir index item
2973 * here.
2974 */
2975 di = btrfs_lookup_dir_index_item(trans, root, path, dir_ino, index,
2821 dentry->d_name.name, dentry->d_name.len, 0); 2976 dentry->d_name.name, dentry->d_name.len, 0);
2822 if (IS_ERR(di)) { 2977 if (IS_ERR(di)) {
2823 err = PTR_ERR(di); 2978 err = PTR_ERR(di);
@@ -2862,8 +3017,6 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
2862 if (IS_ERR(trans)) 3017 if (IS_ERR(trans))
2863 return PTR_ERR(trans); 3018 return PTR_ERR(trans);
2864 3019
2865 btrfs_set_trans_block_group(trans, dir);
2866
2867 btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0); 3020 btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0);
2868 3021
2869 ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, 3022 ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
@@ -2892,47 +3045,41 @@ int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
2892 struct btrfs_key key; 3045 struct btrfs_key key;
2893 u64 index; 3046 u64 index;
2894 int ret; 3047 int ret;
3048 u64 dir_ino = btrfs_ino(dir);
2895 3049
2896 path = btrfs_alloc_path(); 3050 path = btrfs_alloc_path();
2897 if (!path) 3051 if (!path)
2898 return -ENOMEM; 3052 return -ENOMEM;
2899 3053
2900 di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino, 3054 di = btrfs_lookup_dir_item(trans, root, path, dir_ino,
2901 name, name_len, -1); 3055 name, name_len, -1);
2902 BUG_ON(!di || IS_ERR(di)); 3056 BUG_ON(IS_ERR_OR_NULL(di));
2903 3057
2904 leaf = path->nodes[0]; 3058 leaf = path->nodes[0];
2905 btrfs_dir_item_key_to_cpu(leaf, di, &key); 3059 btrfs_dir_item_key_to_cpu(leaf, di, &key);
2906 WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid); 3060 WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
2907 ret = btrfs_delete_one_dir_name(trans, root, path, di); 3061 ret = btrfs_delete_one_dir_name(trans, root, path, di);
2908 BUG_ON(ret); 3062 BUG_ON(ret);
2909 btrfs_release_path(root, path); 3063 btrfs_release_path(path);
2910 3064
2911 ret = btrfs_del_root_ref(trans, root->fs_info->tree_root, 3065 ret = btrfs_del_root_ref(trans, root->fs_info->tree_root,
2912 objectid, root->root_key.objectid, 3066 objectid, root->root_key.objectid,
2913 dir->i_ino, &index, name, name_len); 3067 dir_ino, &index, name, name_len);
2914 if (ret < 0) { 3068 if (ret < 0) {
2915 BUG_ON(ret != -ENOENT); 3069 BUG_ON(ret != -ENOENT);
2916 di = btrfs_search_dir_index_item(root, path, dir->i_ino, 3070 di = btrfs_search_dir_index_item(root, path, dir_ino,
2917 name, name_len); 3071 name, name_len);
2918 BUG_ON(!di || IS_ERR(di)); 3072 BUG_ON(IS_ERR_OR_NULL(di));
2919 3073
2920 leaf = path->nodes[0]; 3074 leaf = path->nodes[0];
2921 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 3075 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2922 btrfs_release_path(root, path); 3076 btrfs_release_path(path);
2923 index = key.offset; 3077 index = key.offset;
2924 } 3078 }
3079 btrfs_release_path(path);
2925 3080
2926 di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, 3081 ret = btrfs_delete_delayed_dir_index(trans, root, dir, index);
2927 index, name, name_len, -1);
2928 BUG_ON(!di || IS_ERR(di));
2929
2930 leaf = path->nodes[0];
2931 btrfs_dir_item_key_to_cpu(leaf, di, &key);
2932 WARN_ON(key.type != BTRFS_ROOT_ITEM_KEY || key.objectid != objectid);
2933 ret = btrfs_delete_one_dir_name(trans, root, path, di);
2934 BUG_ON(ret); 3082 BUG_ON(ret);
2935 btrfs_release_path(root, path);
2936 3083
2937 btrfs_i_size_write(dir, dir->i_size - name_len * 2); 3084 btrfs_i_size_write(dir, dir->i_size - name_len * 2);
2938 dir->i_mtime = dir->i_ctime = CURRENT_TIME; 3085 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
@@ -2952,16 +3099,14 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
2952 unsigned long nr = 0; 3099 unsigned long nr = 0;
2953 3100
2954 if (inode->i_size > BTRFS_EMPTY_DIR_SIZE || 3101 if (inode->i_size > BTRFS_EMPTY_DIR_SIZE ||
2955 inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) 3102 btrfs_ino(inode) == BTRFS_FIRST_FREE_OBJECTID)
2956 return -ENOTEMPTY; 3103 return -ENOTEMPTY;
2957 3104
2958 trans = __unlink_start_trans(dir, dentry); 3105 trans = __unlink_start_trans(dir, dentry);
2959 if (IS_ERR(trans)) 3106 if (IS_ERR(trans))
2960 return PTR_ERR(trans); 3107 return PTR_ERR(trans);
2961 3108
2962 btrfs_set_trans_block_group(trans, dir); 3109 if (unlikely(btrfs_ino(inode) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
2963
2964 if (unlikely(inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
2965 err = btrfs_unlink_subvol(trans, root, dir, 3110 err = btrfs_unlink_subvol(trans, root, dir,
2966 BTRFS_I(inode)->location.objectid, 3111 BTRFS_I(inode)->location.objectid,
2967 dentry->d_name.name, 3112 dentry->d_name.name,
@@ -2986,178 +3131,6 @@ out:
2986 return err; 3131 return err;
2987} 3132}
2988 3133
2989#if 0
2990/*
2991 * when truncating bytes in a file, it is possible to avoid reading
2992 * the leaves that contain only checksum items. This can be the
2993 * majority of the IO required to delete a large file, but it must
2994 * be done carefully.
2995 *
2996 * The keys in the level just above the leaves are checked to make sure
2997 * the lowest key in a given leaf is a csum key, and starts at an offset
2998 * after the new size.
2999 *
3000 * Then the key for the next leaf is checked to make sure it also has
3001 * a checksum item for the same file. If it does, we know our target leaf
3002 * contains only checksum items, and it can be safely freed without reading
3003 * it.
3004 *
3005 * This is just an optimization targeted at large files. It may do
3006 * nothing. It will return 0 unless things went badly.
3007 */
3008static noinline int drop_csum_leaves(struct btrfs_trans_handle *trans,
3009 struct btrfs_root *root,
3010 struct btrfs_path *path,
3011 struct inode *inode, u64 new_size)
3012{
3013 struct btrfs_key key;
3014 int ret;
3015 int nritems;
3016 struct btrfs_key found_key;
3017 struct btrfs_key other_key;
3018 struct btrfs_leaf_ref *ref;
3019 u64 leaf_gen;
3020 u64 leaf_start;
3021
3022 path->lowest_level = 1;
3023 key.objectid = inode->i_ino;
3024 key.type = BTRFS_CSUM_ITEM_KEY;
3025 key.offset = new_size;
3026again:
3027 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
3028 if (ret < 0)
3029 goto out;
3030
3031 if (path->nodes[1] == NULL) {
3032 ret = 0;
3033 goto out;
3034 }
3035 ret = 0;
3036 btrfs_node_key_to_cpu(path->nodes[1], &found_key, path->slots[1]);
3037 nritems = btrfs_header_nritems(path->nodes[1]);
3038
3039 if (!nritems)
3040 goto out;
3041
3042 if (path->slots[1] >= nritems)
3043 goto next_node;
3044
3045 /* did we find a key greater than anything we want to delete? */
3046 if (found_key.objectid > inode->i_ino ||
3047 (found_key.objectid == inode->i_ino && found_key.type > key.type))
3048 goto out;
3049
3050 /* we check the next key in the node to make sure the leaf contains
3051 * only checksum items. This comparison doesn't work if our
3052 * leaf is the last one in the node
3053 */
3054 if (path->slots[1] + 1 >= nritems) {
3055next_node:
3056 /* search forward from the last key in the node, this
3057 * will bring us into the next node in the tree
3058 */
3059 btrfs_node_key_to_cpu(path->nodes[1], &found_key, nritems - 1);
3060
3061 /* unlikely, but we inc below, so check to be safe */
3062 if (found_key.offset == (u64)-1)
3063 goto out;
3064
3065 /* search_forward needs a path with locks held, do the
3066 * search again for the original key. It is possible
3067 * this will race with a balance and return a path that
3068 * we could modify, but this drop is just an optimization
3069 * and is allowed to miss some leaves.
3070 */
3071 btrfs_release_path(root, path);
3072 found_key.offset++;
3073
3074 /* setup a max key for search_forward */
3075 other_key.offset = (u64)-1;
3076 other_key.type = key.type;
3077 other_key.objectid = key.objectid;
3078
3079 path->keep_locks = 1;
3080 ret = btrfs_search_forward(root, &found_key, &other_key,
3081 path, 0, 0);
3082 path->keep_locks = 0;
3083 if (ret || found_key.objectid != key.objectid ||
3084 found_key.type != key.type) {
3085 ret = 0;
3086 goto out;
3087 }
3088
3089 key.offset = found_key.offset;
3090 btrfs_release_path(root, path);
3091 cond_resched();
3092 goto again;
3093 }
3094
3095 /* we know there's one more slot after us in the tree,
3096 * read that key so we can verify it is also a checksum item
3097 */
3098 btrfs_node_key_to_cpu(path->nodes[1], &other_key, path->slots[1] + 1);
3099
3100 if (found_key.objectid < inode->i_ino)
3101 goto next_key;
3102
3103 if (found_key.type != key.type || found_key.offset < new_size)
3104 goto next_key;
3105
3106 /*
3107 * if the key for the next leaf isn't a csum key from this objectid,
3108 * we can't be sure there aren't good items inside this leaf.
3109 * Bail out
3110 */
3111 if (other_key.objectid != inode->i_ino || other_key.type != key.type)
3112 goto out;
3113
3114 leaf_start = btrfs_node_blockptr(path->nodes[1], path->slots[1]);
3115 leaf_gen = btrfs_node_ptr_generation(path->nodes[1], path->slots[1]);
3116 /*
3117 * it is safe to delete this leaf, it contains only
3118 * csum items from this inode at an offset >= new_size
3119 */
3120 ret = btrfs_del_leaf(trans, root, path, leaf_start);
3121 BUG_ON(ret);
3122
3123 if (root->ref_cows && leaf_gen < trans->transid) {
3124 ref = btrfs_alloc_leaf_ref(root, 0);
3125 if (ref) {
3126 ref->root_gen = root->root_key.offset;
3127 ref->bytenr = leaf_start;
3128 ref->owner = 0;
3129 ref->generation = leaf_gen;
3130 ref->nritems = 0;
3131
3132 btrfs_sort_leaf_ref(ref);
3133
3134 ret = btrfs_add_leaf_ref(root, ref, 0);
3135 WARN_ON(ret);
3136 btrfs_free_leaf_ref(root, ref);
3137 } else {
3138 WARN_ON(1);
3139 }
3140 }
3141next_key:
3142 btrfs_release_path(root, path);
3143
3144 if (other_key.objectid == inode->i_ino &&
3145 other_key.type == key.type && other_key.offset > key.offset) {
3146 key.offset = other_key.offset;
3147 cond_resched();
3148 goto again;
3149 }
3150 ret = 0;
3151out:
3152 /* fixup any changes we've made to the path */
3153 path->lowest_level = 0;
3154 path->keep_locks = 0;
3155 btrfs_release_path(root, path);
3156 return ret;
3157}
3158
3159#endif
3160
3161/* 3134/*
3162 * this can truncate away extent items, csum items and directory items. 3135 * this can truncate away extent items, csum items and directory items.
3163 * It starts at a high offset and removes keys until it can't find 3136 * It starts at a high offset and removes keys until it can't find
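The hunks that follow show the key setup for that walk: the search key is pinned to the inode's objectid with the largest possible type and offset, so each btrfs_search_slot() lands just past the inode's last remaining item. A minimal sketch of the descending pattern, using only helpers visible in this diff (item inspection, deletion batching and error unwinding are elided; this is not the full function):

static int truncate_walk_sketch(struct btrfs_trans_handle *trans,
                                struct btrfs_root *root,
                                struct btrfs_path *path, u64 ino)
{
        struct btrfs_key key;
        int ret;

        key.objectid = ino;     /* all items for this inode ... */
        key.type = (u8)-1;      /* ... at the largest key type ... */
        key.offset = (u64)-1;   /* ... and the largest offset */

        while (1) {
                /* ins_len -1, cow 1: position for deletion under COW */
                ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
                if (ret < 0)
                        return ret;
                if (ret > 0) {
                        /* the synthetic key never exists; step back to
                         * the last item that sorts before it */
                        if (path->slots[0] == 0)
                                break;
                        path->slots[0]--;
                }
                /* examine/delete items here, lowering key.offset so the
                 * next search resumes below what was just removed */
                btrfs_release_path(path);
        }
        return 0;
}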
@@ -3193,17 +3166,27 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
3193 int encoding; 3166 int encoding;
3194 int ret; 3167 int ret;
3195 int err = 0; 3168 int err = 0;
3169 u64 ino = btrfs_ino(inode);
3196 3170
3197 BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY); 3171 BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY);
3198 3172
3199 if (root->ref_cows) 3173 if (root->ref_cows || root == root->fs_info->tree_root)
3200 btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0); 3174 btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0);
3201 3175
3176 /*
3177 * This function is also used to drop the items in the log tree before
3178 * we relog the inode, so if root != BTRFS_I(inode)->root, it means
3179 * it is used to drop the loged items. So we shouldn't kill the delayed
3180 * items.
3181 */
3182 if (min_type == 0 && root == BTRFS_I(inode)->root)
3183 btrfs_kill_delayed_inode_items(inode);
3184
3202 path = btrfs_alloc_path(); 3185 path = btrfs_alloc_path();
3203 BUG_ON(!path); 3186 BUG_ON(!path);
3204 path->reada = -1; 3187 path->reada = -1;
3205 3188
3206 key.objectid = inode->i_ino; 3189 key.objectid = ino;
3207 key.offset = (u64)-1; 3190 key.offset = (u64)-1;
3208 key.type = (u8)-1; 3191 key.type = (u8)-1;
3209 3192
@@ -3231,7 +3214,7 @@ search_again:
3231 found_type = btrfs_key_type(&found_key); 3214 found_type = btrfs_key_type(&found_key);
3232 encoding = 0; 3215 encoding = 0;
3233 3216
3234 if (found_key.objectid != inode->i_ino) 3217 if (found_key.objectid != ino)
3235 break; 3218 break;
3236 3219
3237 if (found_type < min_type) 3220 if (found_type < min_type)
@@ -3321,7 +3304,6 @@ search_again:
3321 btrfs_file_extent_calc_inline_size(size); 3304 btrfs_file_extent_calc_inline_size(size);
3322 ret = btrfs_truncate_item(trans, root, path, 3305 ret = btrfs_truncate_item(trans, root, path,
3323 size, 1); 3306 size, 1);
3324 BUG_ON(ret);
3325 } else if (root->ref_cows) { 3307 } else if (root->ref_cows) {
3326 inode_sub_bytes(inode, item_end + 1 - 3308 inode_sub_bytes(inode, item_end + 1 -
3327 found_key.offset); 3309 found_key.offset);
@@ -3344,12 +3326,13 @@ delete:
3344 } else { 3326 } else {
3345 break; 3327 break;
3346 } 3328 }
3347 if (found_extent && root->ref_cows) { 3329 if (found_extent && (root->ref_cows ||
3330 root == root->fs_info->tree_root)) {
3348 btrfs_set_path_blocking(path); 3331 btrfs_set_path_blocking(path);
3349 ret = btrfs_free_extent(trans, root, extent_start, 3332 ret = btrfs_free_extent(trans, root, extent_start,
3350 extent_num_bytes, 0, 3333 extent_num_bytes, 0,
3351 btrfs_header_owner(leaf), 3334 btrfs_header_owner(leaf),
3352 inode->i_ino, extent_offset); 3335 ino, extent_offset);
3353 BUG_ON(ret); 3336 BUG_ON(ret);
3354 } 3337 }
3355 3338
@@ -3358,7 +3341,9 @@ delete:
3358 3341
3359 if (path->slots[0] == 0 || 3342 if (path->slots[0] == 0 ||
3360 path->slots[0] != pending_del_slot) { 3343 path->slots[0] != pending_del_slot) {
3361 if (root->ref_cows) { 3344 if (root->ref_cows &&
3345 BTRFS_I(inode)->location.objectid !=
3346 BTRFS_FREE_INO_OBJECTID) {
3362 err = -EAGAIN; 3347 err = -EAGAIN;
3363 goto out; 3348 goto out;
3364 } 3349 }
@@ -3369,7 +3354,7 @@ delete:
3369 BUG_ON(ret); 3354 BUG_ON(ret);
3370 pending_del_nr = 0; 3355 pending_del_nr = 0;
3371 } 3356 }
3372 btrfs_release_path(root, path); 3357 btrfs_release_path(path);
3373 goto search_again; 3358 goto search_again;
3374 } else { 3359 } else {
3375 path->slots[0]--; 3360 path->slots[0]--;
@@ -3485,7 +3470,13 @@ out:
3485 return ret; 3470 return ret;
3486} 3471}
3487 3472
3488int btrfs_cont_expand(struct inode *inode, loff_t size) 3473/*
3474 * This function puts in dummy file extents for the area we're creating a hole
3475 * for. So if we are truncating this file to a larger size we need to insert
3476 * these file extents so that btrfs_get_extent will return an EXTENT_MAP_HOLE for
3477 * the range between oldsize and size
3478 */
3479int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
3489{ 3480{
3490 struct btrfs_trans_handle *trans; 3481 struct btrfs_trans_handle *trans;
3491 struct btrfs_root *root = BTRFS_I(inode)->root; 3482 struct btrfs_root *root = BTRFS_I(inode)->root;
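The hunks below show the loop this function runs between the sector-aligned hole_start and block_end: every mapping that is not preallocated gets an explicit on-disk hole, i.e. a file extent whose disk_bytenr is 0. A condensed sketch of that loop body, with the locking, per-iteration transaction start/end and extent-cache dropping elided:

        u64 cur_offset = hole_start;
        while (cur_offset < block_end) {
                struct extent_map *em;
                u64 last_byte, hole_size;

                em = btrfs_get_extent(inode, NULL, 0, cur_offset,
                                      block_end - cur_offset, 0);
                last_byte = min(extent_map_end(em), block_end);
                last_byte = (last_byte + mask) & ~mask; /* sector align */

                if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
                        hole_size = last_byte - cur_offset;
                        /* disk_bytenr 0 == explicit hole on disk */
                        err = btrfs_insert_file_extent(trans, root,
                                                       btrfs_ino(inode),
                                                       cur_offset, 0, 0,
                                                       hole_size, 0,
                                                       hole_size, 0, 0, 0);
                        if (err)
                                break;
                }
                free_extent_map(em);
                cur_offset = last_byte;
        }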
@@ -3493,7 +3484,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
3493 struct extent_map *em = NULL; 3484 struct extent_map *em = NULL;
3494 struct extent_state *cached_state = NULL; 3485 struct extent_state *cached_state = NULL;
3495 u64 mask = root->sectorsize - 1; 3486 u64 mask = root->sectorsize - 1;
3496 u64 hole_start = (inode->i_size + mask) & ~mask; 3487 u64 hole_start = (oldsize + mask) & ~mask;
3497 u64 block_end = (size + mask) & ~mask; 3488 u64 block_end = (size + mask) & ~mask;
3498 u64 last_byte; 3489 u64 last_byte;
3499 u64 cur_offset; 3490 u64 cur_offset;
@@ -3521,7 +3512,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
3521 while (1) { 3512 while (1) {
3522 em = btrfs_get_extent(inode, NULL, 0, cur_offset, 3513 em = btrfs_get_extent(inode, NULL, 0, cur_offset,
3523 block_end - cur_offset, 0); 3514 block_end - cur_offset, 0);
3524 BUG_ON(IS_ERR(em) || !em); 3515 BUG_ON(IS_ERR_OR_NULL(em));
3525 last_byte = min(extent_map_end(em), block_end); 3516 last_byte = min(extent_map_end(em), block_end);
3526 last_byte = (last_byte + mask) & ~mask; 3517 last_byte = (last_byte + mask) & ~mask;
3527 if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { 3518 if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
@@ -3533,18 +3524,19 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
3533 err = PTR_ERR(trans); 3524 err = PTR_ERR(trans);
3534 break; 3525 break;
3535 } 3526 }
3536 btrfs_set_trans_block_group(trans, inode);
3537 3527
3538 err = btrfs_drop_extents(trans, inode, cur_offset, 3528 err = btrfs_drop_extents(trans, inode, cur_offset,
3539 cur_offset + hole_size, 3529 cur_offset + hole_size,
3540 &hint_byte, 1); 3530 &hint_byte, 1);
3541 BUG_ON(err); 3531 if (err)
3532 break;
3542 3533
3543 err = btrfs_insert_file_extent(trans, root, 3534 err = btrfs_insert_file_extent(trans, root,
3544 inode->i_ino, cur_offset, 0, 3535 btrfs_ino(inode), cur_offset, 0,
3545 0, hole_size, 0, hole_size, 3536 0, hole_size, 0, hole_size,
3546 0, 0, 0); 3537 0, 0, 0);
3547 BUG_ON(err); 3538 if (err)
3539 break;
3548 3540
3549 btrfs_drop_extent_cache(inode, hole_start, 3541 btrfs_drop_extent_cache(inode, hole_start,
3550 last_byte - 1, 0); 3542 last_byte - 1, 0);
@@ -3564,94 +3556,58 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
3564 return err; 3556 return err;
3565} 3557}
3566 3558
3567static int btrfs_setattr_size(struct inode *inode, struct iattr *attr) 3559static int btrfs_setsize(struct inode *inode, loff_t newsize)
3568{ 3560{
3569 struct btrfs_root *root = BTRFS_I(inode)->root; 3561 loff_t oldsize = i_size_read(inode);
3570 struct btrfs_trans_handle *trans;
3571 unsigned long nr;
3572 int ret; 3562 int ret;
3573 3563
3574 if (attr->ia_size == inode->i_size) 3564 if (newsize == oldsize)
3575 return 0; 3565 return 0;
3576 3566
3577 if (attr->ia_size > inode->i_size) { 3567 if (newsize > oldsize) {
3578 unsigned long limit; 3568 i_size_write(inode, newsize);
3579 limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; 3569 btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
3580 if (attr->ia_size > inode->i_sb->s_maxbytes) 3570 truncate_pagecache(inode, oldsize, newsize);
3581 return -EFBIG; 3571 ret = btrfs_cont_expand(inode, oldsize, newsize);
3582 if (limit != RLIM_INFINITY && attr->ia_size > limit) {
3583 send_sig(SIGXFSZ, current, 0);
3584 return -EFBIG;
3585 }
3586 }
3587
3588 trans = btrfs_start_transaction(root, 5);
3589 if (IS_ERR(trans))
3590 return PTR_ERR(trans);
3591
3592 btrfs_set_trans_block_group(trans, inode);
3593
3594 ret = btrfs_orphan_add(trans, inode);
3595 BUG_ON(ret);
3596
3597 nr = trans->blocks_used;
3598 btrfs_end_transaction(trans, root);
3599 btrfs_btree_balance_dirty(root, nr);
3600
3601 if (attr->ia_size > inode->i_size) {
3602 ret = btrfs_cont_expand(inode, attr->ia_size);
3603 if (ret) { 3572 if (ret) {
3604 btrfs_truncate(inode); 3573 btrfs_setsize(inode, oldsize);
3605 return ret; 3574 return ret;
3606 } 3575 }
3607 3576
3608 i_size_write(inode, attr->ia_size); 3577 mark_inode_dirty(inode);
3609 btrfs_ordered_update_i_size(inode, inode->i_size, NULL); 3578 } else {
3610 3579
3611 trans = btrfs_start_transaction(root, 0); 3580 /*
3612 BUG_ON(IS_ERR(trans)); 3581 * We're truncating a file that used to have good data down to
3613 btrfs_set_trans_block_group(trans, inode); 3582 * zero. Make sure it gets into the ordered flush list so that
3614 trans->block_rsv = root->orphan_block_rsv; 3583 * any new writes get down to disk quickly.
3615 BUG_ON(!trans->block_rsv); 3584 */
3585 if (newsize == 0)
3586 BTRFS_I(inode)->ordered_data_close = 1;
3616 3587
3617 ret = btrfs_update_inode(trans, root, inode); 3588 /* we don't support swapfiles, so vmtruncate shouldn't fail */
3618 BUG_ON(ret); 3589 truncate_setsize(inode, newsize);
3619 if (inode->i_nlink > 0) { 3590 ret = btrfs_truncate(inode);
3620 ret = btrfs_orphan_del(trans, inode);
3621 BUG_ON(ret);
3622 }
3623 nr = trans->blocks_used;
3624 btrfs_end_transaction(trans, root);
3625 btrfs_btree_balance_dirty(root, nr);
3626 return 0;
3627 } 3591 }
3628 3592
3629 /* 3593 return ret;
3630 * We're truncating a file that used to have good data down to
3631 * zero. Make sure it gets into the ordered flush list so that
3632 * any new writes get down to disk quickly.
3633 */
3634 if (attr->ia_size == 0)
3635 BTRFS_I(inode)->ordered_data_close = 1;
3636
3637 /* we don't support swapfiles, so vmtruncate shouldn't fail */
3638 ret = vmtruncate(inode, attr->ia_size);
3639 BUG_ON(ret);
3640
3641 return 0;
3642} 3594}
3643 3595
3644static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) 3596static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
3645{ 3597{
3646 struct inode *inode = dentry->d_inode; 3598 struct inode *inode = dentry->d_inode;
3599 struct btrfs_root *root = BTRFS_I(inode)->root;
3647 int err; 3600 int err;
3648 3601
3602 if (btrfs_root_readonly(root))
3603 return -EROFS;
3604
3649 err = inode_change_ok(inode, attr); 3605 err = inode_change_ok(inode, attr);
3650 if (err) 3606 if (err)
3651 return err; 3607 return err;
3652 3608
3653 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { 3609 if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
3654 err = btrfs_setattr_size(inode, attr); 3610 err = btrfs_setsize(inode, attr->ia_size);
3655 if (err) 3611 if (err)
3656 return err; 3612 return err;
3657 } 3613 }
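One subtlety in the rewritten btrfs_setsize() above: the grow path bumps i_size and truncates the page cache before the holes are actually inserted, so on failure it calls itself with the old size to roll the in-core state back. A sketch isolating just that path, using the names from the hunk:

        if (newsize > oldsize) {
                i_size_write(inode, newsize);
                btrfs_ordered_update_i_size(inode, i_size_read(inode), NULL);
                truncate_pagecache(inode, oldsize, newsize);

                ret = btrfs_cont_expand(inode, oldsize, newsize);
                if (ret) {
                        /* undo the speculative size bump */
                        btrfs_setsize(inode, oldsize);
                        return ret;
                }
                mark_inode_dirty(inode);
        }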
@@ -3674,8 +3630,11 @@ void btrfs_evict_inode(struct inode *inode)
3674 unsigned long nr; 3630 unsigned long nr;
3675 int ret; 3631 int ret;
3676 3632
3633 trace_btrfs_inode_evict(inode);
3634
3677 truncate_inode_pages(&inode->i_data, 0); 3635 truncate_inode_pages(&inode->i_data, 0);
3678 if (inode->i_nlink && btrfs_root_refs(&root->root_item) != 0) 3636 if (inode->i_nlink && (btrfs_root_refs(&root->root_item) != 0 ||
3637 is_free_space_inode(root, inode)))
3679 goto no_delete; 3638 goto no_delete;
3680 3639
3681 if (is_bad_inode(inode)) { 3640 if (is_bad_inode(inode)) {
@@ -3698,9 +3657,8 @@ void btrfs_evict_inode(struct inode *inode)
3698 btrfs_i_size_write(inode, 0); 3657 btrfs_i_size_write(inode, 0);
3699 3658
3700 while (1) { 3659 while (1) {
3701 trans = btrfs_start_transaction(root, 0); 3660 trans = btrfs_join_transaction(root);
3702 BUG_ON(IS_ERR(trans)); 3661 BUG_ON(IS_ERR(trans));
3703 btrfs_set_trans_block_group(trans, inode);
3704 trans->block_rsv = root->orphan_block_rsv; 3662 trans->block_rsv = root->orphan_block_rsv;
3705 3663
3706 ret = btrfs_block_rsv_check(trans, root, 3664 ret = btrfs_block_rsv_check(trans, root,
@@ -3728,6 +3686,10 @@ void btrfs_evict_inode(struct inode *inode)
3728 BUG_ON(ret); 3686 BUG_ON(ret);
3729 } 3687 }
3730 3688
3689 if (!(root == root->fs_info->tree_root ||
3690 root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID))
3691 btrfs_return_ino(root, btrfs_ino(inode));
3692
3731 nr = trans->blocks_used; 3693 nr = trans->blocks_used;
3732 btrfs_end_transaction(trans, root); 3694 btrfs_end_transaction(trans, root);
3733 btrfs_btree_balance_dirty(root, nr); 3695 btrfs_btree_balance_dirty(root, nr);
@@ -3753,12 +3715,12 @@ static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
3753 path = btrfs_alloc_path(); 3715 path = btrfs_alloc_path();
3754 BUG_ON(!path); 3716 BUG_ON(!path);
3755 3717
3756 di = btrfs_lookup_dir_item(NULL, root, path, dir->i_ino, name, 3718 di = btrfs_lookup_dir_item(NULL, root, path, btrfs_ino(dir), name,
3757 namelen, 0); 3719 namelen, 0);
3758 if (IS_ERR(di)) 3720 if (IS_ERR(di))
3759 ret = PTR_ERR(di); 3721 ret = PTR_ERR(di);
3760 3722
3761 if (!di || IS_ERR(di)) 3723 if (IS_ERR_OR_NULL(di))
3762 goto out_err; 3724 goto out_err;
3763 3725
3764 btrfs_dir_item_key_to_cpu(path->nodes[0], di, location); 3726 btrfs_dir_item_key_to_cpu(path->nodes[0], di, location);
@@ -3806,7 +3768,7 @@ static int fixup_tree_root_location(struct btrfs_root *root,
3806 3768
3807 leaf = path->nodes[0]; 3769 leaf = path->nodes[0];
3808 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); 3770 ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
3809 if (btrfs_root_ref_dirid(leaf, ref) != dir->i_ino || 3771 if (btrfs_root_ref_dirid(leaf, ref) != btrfs_ino(dir) ||
3810 btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len) 3772 btrfs_root_ref_name_len(leaf, ref) != dentry->d_name.len)
3811 goto out; 3773 goto out;
3812 3774
@@ -3816,7 +3778,7 @@ static int fixup_tree_root_location(struct btrfs_root *root,
3816 if (ret) 3778 if (ret)
3817 goto out; 3779 goto out;
3818 3780
3819 btrfs_release_path(root->fs_info->tree_root, path); 3781 btrfs_release_path(path);
3820 3782
3821 new_root = btrfs_read_fs_root_no_name(root->fs_info, location); 3783 new_root = btrfs_read_fs_root_no_name(root->fs_info, location);
3822 if (IS_ERR(new_root)) { 3784 if (IS_ERR(new_root)) {
@@ -3845,11 +3807,12 @@ static void inode_tree_add(struct inode *inode)
3845 struct btrfs_inode *entry; 3807 struct btrfs_inode *entry;
3846 struct rb_node **p; 3808 struct rb_node **p;
3847 struct rb_node *parent; 3809 struct rb_node *parent;
3810 u64 ino = btrfs_ino(inode);
3848again: 3811again:
3849 p = &root->inode_tree.rb_node; 3812 p = &root->inode_tree.rb_node;
3850 parent = NULL; 3813 parent = NULL;
3851 3814
3852 if (hlist_unhashed(&inode->i_hash)) 3815 if (inode_unhashed(inode))
3853 return; 3816 return;
3854 3817
3855 spin_lock(&root->inode_lock); 3818 spin_lock(&root->inode_lock);
@@ -3857,9 +3820,9 @@ again:
3857 parent = *p; 3820 parent = *p;
3858 entry = rb_entry(parent, struct btrfs_inode, rb_node); 3821 entry = rb_entry(parent, struct btrfs_inode, rb_node);
3859 3822
3860 if (inode->i_ino < entry->vfs_inode.i_ino) 3823 if (ino < btrfs_ino(&entry->vfs_inode))
3861 p = &parent->rb_left; 3824 p = &parent->rb_left;
3862 else if (inode->i_ino > entry->vfs_inode.i_ino) 3825 else if (ino > btrfs_ino(&entry->vfs_inode))
3863 p = &parent->rb_right; 3826 p = &parent->rb_right;
3864 else { 3827 else {
3865 WARN_ON(!(entry->vfs_inode.i_state & 3828 WARN_ON(!(entry->vfs_inode.i_state &
@@ -3888,7 +3851,14 @@ static void inode_tree_del(struct inode *inode)
3888 } 3851 }
3889 spin_unlock(&root->inode_lock); 3852 spin_unlock(&root->inode_lock);
3890 3853
3891 if (empty && btrfs_root_refs(&root->root_item) == 0) { 3854 /*
3855 * Free space cache has inodes in the tree root, but the tree root has a
3856 * root_refs of 0, so this could end up dropping the tree root as a
3857 * snapshot, so we need the extra root != root->fs_info->tree_root check to
3858 * make sure we don't drop it.
3859 */
3860 if (empty && btrfs_root_refs(&root->root_item) == 0 &&
3861 root != root->fs_info->tree_root) {
3892 synchronize_srcu(&root->fs_info->subvol_srcu); 3862 synchronize_srcu(&root->fs_info->subvol_srcu);
3893 spin_lock(&root->inode_lock); 3863 spin_lock(&root->inode_lock);
3894 empty = RB_EMPTY_ROOT(&root->inode_tree); 3864 empty = RB_EMPTY_ROOT(&root->inode_tree);
@@ -3916,9 +3886,9 @@ again:
3916 prev = node; 3886 prev = node;
3917 entry = rb_entry(node, struct btrfs_inode, rb_node); 3887 entry = rb_entry(node, struct btrfs_inode, rb_node);
3918 3888
3919 if (objectid < entry->vfs_inode.i_ino) 3889 if (objectid < btrfs_ino(&entry->vfs_inode))
3920 node = node->rb_left; 3890 node = node->rb_left;
3921 else if (objectid > entry->vfs_inode.i_ino) 3891 else if (objectid > btrfs_ino(&entry->vfs_inode))
3922 node = node->rb_right; 3892 node = node->rb_right;
3923 else 3893 else
3924 break; 3894 break;
@@ -3926,7 +3896,7 @@ again:
3926 if (!node) { 3896 if (!node) {
3927 while (prev) { 3897 while (prev) {
3928 entry = rb_entry(prev, struct btrfs_inode, rb_node); 3898 entry = rb_entry(prev, struct btrfs_inode, rb_node);
3929 if (objectid <= entry->vfs_inode.i_ino) { 3899 if (objectid <= btrfs_ino(&entry->vfs_inode)) {
3930 node = prev; 3900 node = prev;
3931 break; 3901 break;
3932 } 3902 }
@@ -3935,7 +3905,7 @@ again:
3935 } 3905 }
3936 while (node) { 3906 while (node) {
3937 entry = rb_entry(node, struct btrfs_inode, rb_node); 3907 entry = rb_entry(node, struct btrfs_inode, rb_node);
3938 objectid = entry->vfs_inode.i_ino + 1; 3908 objectid = btrfs_ino(&entry->vfs_inode) + 1;
3939 inode = igrab(&entry->vfs_inode); 3909 inode = igrab(&entry->vfs_inode);
3940 if (inode) { 3910 if (inode) {
3941 spin_unlock(&root->inode_lock); 3911 spin_unlock(&root->inode_lock);
@@ -3973,7 +3943,7 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p)
3973static int btrfs_find_actor(struct inode *inode, void *opaque) 3943static int btrfs_find_actor(struct inode *inode, void *opaque)
3974{ 3944{
3975 struct btrfs_iget_args *args = opaque; 3945 struct btrfs_iget_args *args = opaque;
3976 return args->ino == inode->i_ino && 3946 return args->ino == btrfs_ino(inode) &&
3977 args->root == BTRFS_I(inode)->root; 3947 args->root == BTRFS_I(inode)->root;
3978} 3948}
3979 3949
@@ -4008,7 +3978,6 @@ struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location,
4008 BTRFS_I(inode)->root = root; 3978 BTRFS_I(inode)->root = root;
4009 memcpy(&BTRFS_I(inode)->location, location, sizeof(*location)); 3979 memcpy(&BTRFS_I(inode)->location, location, sizeof(*location));
4010 btrfs_read_locked_inode(inode); 3980 btrfs_read_locked_inode(inode);
4011
4012 inode_tree_add(inode); 3981 inode_tree_add(inode);
4013 unlock_new_inode(inode); 3982 unlock_new_inode(inode);
4014 if (new) 3983 if (new)
@@ -4049,8 +4018,6 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
4049 int index; 4018 int index;
4050 int ret; 4019 int ret;
4051 4020
4052 dentry->d_op = &btrfs_dentry_operations;
4053
4054 if (dentry->d_name.len > BTRFS_NAME_LEN) 4021 if (dentry->d_name.len > BTRFS_NAME_LEN)
4055 return ERR_PTR(-ENAMETOOLONG); 4022 return ERR_PTR(-ENAMETOOLONG);
4056 4023
@@ -4082,17 +4049,19 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
4082 } 4049 }
4083 srcu_read_unlock(&root->fs_info->subvol_srcu, index); 4050 srcu_read_unlock(&root->fs_info->subvol_srcu, index);
4084 4051
4085 if (root != sub_root) { 4052 if (!IS_ERR(inode) && root != sub_root) {
4086 down_read(&root->fs_info->cleanup_work_sem); 4053 down_read(&root->fs_info->cleanup_work_sem);
4087 if (!(inode->i_sb->s_flags & MS_RDONLY)) 4054 if (!(inode->i_sb->s_flags & MS_RDONLY))
4088 btrfs_orphan_cleanup(sub_root); 4055 ret = btrfs_orphan_cleanup(sub_root);
4089 up_read(&root->fs_info->cleanup_work_sem); 4056 up_read(&root->fs_info->cleanup_work_sem);
4057 if (ret)
4058 inode = ERR_PTR(ret);
4090 } 4059 }
4091 4060
4092 return inode; 4061 return inode;
4093} 4062}
4094 4063
4095static int btrfs_dentry_delete(struct dentry *dentry) 4064static int btrfs_dentry_delete(const struct dentry *dentry)
4096{ 4065{
4097 struct btrfs_root *root; 4066 struct btrfs_root *root;
4098 4067
@@ -4119,7 +4088,7 @@ static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
4119 return d_splice_alias(inode, dentry); 4088 return d_splice_alias(inode, dentry);
4120} 4089}
4121 4090
4122static unsigned char btrfs_filetype_table[] = { 4091unsigned char btrfs_filetype_table[] = {
4123 DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK 4092 DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK
4124}; 4093};
4125 4094
@@ -4133,11 +4102,11 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
4133 struct btrfs_key key; 4102 struct btrfs_key key;
4134 struct btrfs_key found_key; 4103 struct btrfs_key found_key;
4135 struct btrfs_path *path; 4104 struct btrfs_path *path;
4105 struct list_head ins_list;
4106 struct list_head del_list;
4136 int ret; 4107 int ret;
4137 u32 nritems;
4138 struct extent_buffer *leaf; 4108 struct extent_buffer *leaf;
4139 int slot; 4109 int slot;
4140 int advance;
4141 unsigned char d_type; 4110 unsigned char d_type;
4142 int over = 0; 4111 int over = 0;
4143 u32 di_cur; 4112 u32 di_cur;
@@ -4147,6 +4116,7 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
4147 char tmp_name[32]; 4116 char tmp_name[32];
4148 char *name_ptr; 4117 char *name_ptr;
4149 int name_len; 4118 int name_len;
4119 int is_curr = 0; /* filp->f_pos points to the current index? */
4150 4120
4151 /* FIXME, use a real flag for deciding about the key type */ 4121 /* FIXME, use a real flag for deciding about the key type */
4152 if (root->fs_info->tree_root == root) 4122 if (root->fs_info->tree_root == root)
@@ -4154,9 +4124,7 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
4154 4124
4155 /* special case for "." */ 4125 /* special case for "." */
4156 if (filp->f_pos == 0) { 4126 if (filp->f_pos == 0) {
4157 over = filldir(dirent, ".", 1, 4127 over = filldir(dirent, ".", 1, 1, btrfs_ino(inode), DT_DIR);
4158 1, inode->i_ino,
4159 DT_DIR);
4160 if (over) 4128 if (over)
4161 return 0; 4129 return 0;
4162 filp->f_pos = 1; 4130 filp->f_pos = 1;
@@ -4171,36 +4139,37 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
4171 filp->f_pos = 2; 4139 filp->f_pos = 2;
4172 } 4140 }
4173 path = btrfs_alloc_path(); 4141 path = btrfs_alloc_path();
4174 path->reada = 2; 4142 if (!path)
4143 return -ENOMEM;
4144
4145 path->reada = 1;
4146
4147 if (key_type == BTRFS_DIR_INDEX_KEY) {
4148 INIT_LIST_HEAD(&ins_list);
4149 INIT_LIST_HEAD(&del_list);
4150 btrfs_get_delayed_items(inode, &ins_list, &del_list);
4151 }
4175 4152
4176 btrfs_set_key_type(&key, key_type); 4153 btrfs_set_key_type(&key, key_type);
4177 key.offset = filp->f_pos; 4154 key.offset = filp->f_pos;
4178 key.objectid = inode->i_ino; 4155 key.objectid = btrfs_ino(inode);
4179 4156
4180 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 4157 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4181 if (ret < 0) 4158 if (ret < 0)
4182 goto err; 4159 goto err;
4183 advance = 0;
4184 4160
4185 while (1) { 4161 while (1) {
4186 leaf = path->nodes[0]; 4162 leaf = path->nodes[0];
4187 nritems = btrfs_header_nritems(leaf);
4188 slot = path->slots[0]; 4163 slot = path->slots[0];
4189 if (advance || slot >= nritems) { 4164 if (slot >= btrfs_header_nritems(leaf)) {
4190 if (slot >= nritems - 1) { 4165 ret = btrfs_next_leaf(root, path);
4191 ret = btrfs_next_leaf(root, path); 4166 if (ret < 0)
4192 if (ret) 4167 goto err;
4193 break; 4168 else if (ret > 0)
4194 leaf = path->nodes[0]; 4169 break;
4195 nritems = btrfs_header_nritems(leaf); 4170 continue;
4196 slot = path->slots[0];
4197 } else {
4198 slot++;
4199 path->slots[0]++;
4200 }
4201 } 4171 }
4202 4172
4203 advance = 1;
4204 item = btrfs_item_nr(leaf, slot); 4173 item = btrfs_item_nr(leaf, slot);
4205 btrfs_item_key_to_cpu(leaf, &found_key, slot); 4174 btrfs_item_key_to_cpu(leaf, &found_key, slot);
4206 4175
@@ -4209,9 +4178,14 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
4209 if (btrfs_key_type(&found_key) != key_type) 4178 if (btrfs_key_type(&found_key) != key_type)
4210 break; 4179 break;
4211 if (found_key.offset < filp->f_pos) 4180 if (found_key.offset < filp->f_pos)
4212 continue; 4181 goto next;
4182 if (key_type == BTRFS_DIR_INDEX_KEY &&
4183 btrfs_should_delete_dir_index(&del_list,
4184 found_key.offset))
4185 goto next;
4213 4186
4214 filp->f_pos = found_key.offset; 4187 filp->f_pos = found_key.offset;
4188 is_curr = 1;
4215 4189
4216 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); 4190 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
4217 di_cur = 0; 4191 di_cur = 0;
@@ -4220,6 +4194,9 @@ static int btrfs_real_readdir(struct file *filp, void *dirent,
4220 while (di_cur < di_total) { 4194 while (di_cur < di_total) {
4221 struct btrfs_key location; 4195 struct btrfs_key location;
4222 4196
4197 if (verify_dir_item(root, leaf, di))
4198 break;
4199
4223 name_len = btrfs_dir_name_len(leaf, di); 4200 name_len = btrfs_dir_name_len(leaf, di);
4224 if (name_len <= sizeof(tmp_name)) { 4201 if (name_len <= sizeof(tmp_name)) {
4225 name_ptr = tmp_name; 4202 name_ptr = tmp_name;
@@ -4259,6 +4236,17 @@ skip:
4259 di_cur += di_len; 4236 di_cur += di_len;
4260 di = (struct btrfs_dir_item *)((char *)di + di_len); 4237 di = (struct btrfs_dir_item *)((char *)di + di_len);
4261 } 4238 }
4239next:
4240 path->slots[0]++;
4241 }
4242
4243 if (key_type == BTRFS_DIR_INDEX_KEY) {
4244 if (is_curr)
4245 filp->f_pos++;
4246 ret = btrfs_readdir_delayed_dir_index(filp, dirent, filldir,
4247 &ins_list);
4248 if (ret)
4249 goto nopos;
4262 } 4250 }
4263 4251
4264 /* Reached end of directory/root. Bump pos past the last item. */ 4252 /* Reached end of directory/root. Bump pos past the last item. */
@@ -4273,6 +4261,8 @@ skip:
4273nopos: 4261nopos:
4274 ret = 0; 4262 ret = 0;
4275err: 4263err:
4264 if (key_type == BTRFS_DIR_INDEX_KEY)
4265 btrfs_put_delayed_items(&ins_list, &del_list);
4276 btrfs_free_path(path); 4266 btrfs_free_path(path);
4277 return ret; 4267 return ret;
4278} 4268}
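The delayed-item plumbing added above turns readdir into a two-source merge: on-disk dir index items are emitted unless a pending delayed deletion shadows them, then the delayed (not yet committed) insertions past f_pos are emitted. A compressed sketch of that control flow; next_on_disk_index() is a hypothetical stand-in for the btrfs_search_slot()/btrfs_next_leaf() walk shown in the hunks:

        /* 1) snapshot the in-memory delayed insertions and deletions */
        INIT_LIST_HEAD(&ins_list);
        INIT_LIST_HEAD(&del_list);
        btrfs_get_delayed_items(inode, &ins_list, &del_list);

        /* 2) walk the on-disk index, skipping shadowed entries */
        while (next_on_disk_index(path, &found_key) == 0) {
                if (btrfs_should_delete_dir_index(&del_list,
                                                  found_key.offset))
                        continue;
                filp->f_pos = found_key.offset;
                is_curr = 1;
                /* filldir() the on-disk entry here */
        }

        /* 3) emit delayed insertions after the last on-disk entry */
        if (is_curr)
                filp->f_pos++;
        ret = btrfs_readdir_delayed_dir_index(filp, dirent, filldir,
                                              &ins_list);

        /* 4) always drop the references taken in step 1 */
        btrfs_put_delayed_items(&ins_list, &del_list);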
@@ -4282,14 +4272,25 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
4282 struct btrfs_root *root = BTRFS_I(inode)->root; 4272 struct btrfs_root *root = BTRFS_I(inode)->root;
4283 struct btrfs_trans_handle *trans; 4273 struct btrfs_trans_handle *trans;
4284 int ret = 0; 4274 int ret = 0;
4275 bool nolock = false;
4285 4276
4286 if (BTRFS_I(inode)->dummy_inode) 4277 if (BTRFS_I(inode)->dummy_inode)
4287 return 0; 4278 return 0;
4288 4279
4280 if (btrfs_fs_closing(root->fs_info) && is_free_space_inode(root, inode))
4281 nolock = true;
4282
4289 if (wbc->sync_mode == WB_SYNC_ALL) { 4283 if (wbc->sync_mode == WB_SYNC_ALL) {
4290 trans = btrfs_join_transaction(root, 1); 4284 if (nolock)
4291 btrfs_set_trans_block_group(trans, inode); 4285 trans = btrfs_join_transaction_nolock(root);
4292 ret = btrfs_commit_transaction(trans, root); 4286 else
4287 trans = btrfs_join_transaction(root);
4288 if (IS_ERR(trans))
4289 return PTR_ERR(trans);
4290 if (nolock)
4291 ret = btrfs_end_transaction_nolock(trans, root);
4292 else
4293 ret = btrfs_commit_transaction(trans, root);
4293 } 4294 }
4294 return ret; 4295 return ret;
4295} 4296}
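The nolock variant exists because the free-space-cache inode can be written back while the filesystem is closing and the normal transaction machinery is being torn down. A sketch of the selection logic, assuming is_free_space_inode() and btrfs_fs_closing() behave as used in this hunk:

        bool nolock = btrfs_fs_closing(root->fs_info) &&
                      is_free_space_inode(root, inode);

        trans = nolock ? btrfs_join_transaction_nolock(root)
                       : btrfs_join_transaction(root);
        if (IS_ERR(trans))
                return PTR_ERR(trans);

        /* the nolock path must not commit; it only ends the handle */
        ret = nolock ? btrfs_end_transaction_nolock(trans, root)
                     : btrfs_commit_transaction(trans, root);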
@@ -4300,7 +4301,7 @@ int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc)
4300 * FIXME, needs more benchmarking...there are no reasons other than performance 4301 * FIXME, needs more benchmarking...there are no reasons other than performance
4301 * to keep or drop this code. 4302 * to keep or drop this code.
4302 */ 4303 */
4303void btrfs_dirty_inode(struct inode *inode) 4304void btrfs_dirty_inode(struct inode *inode, int flags)
4304{ 4305{
4305 struct btrfs_root *root = BTRFS_I(inode)->root; 4306 struct btrfs_root *root = BTRFS_I(inode)->root;
4306 struct btrfs_trans_handle *trans; 4307 struct btrfs_trans_handle *trans;
@@ -4309,8 +4310,8 @@ void btrfs_dirty_inode(struct inode *inode)
4309 if (BTRFS_I(inode)->dummy_inode) 4310 if (BTRFS_I(inode)->dummy_inode)
4310 return; 4311 return;
4311 4312
4312 trans = btrfs_join_transaction(root, 1); 4313 trans = btrfs_join_transaction(root);
4313 btrfs_set_trans_block_group(trans, inode); 4314 BUG_ON(IS_ERR(trans));
4314 4315
4315 ret = btrfs_update_inode(trans, root, inode); 4316 ret = btrfs_update_inode(trans, root, inode);
4316 if (ret && ret == -ENOSPC) { 4317 if (ret && ret == -ENOSPC) {
@@ -4318,25 +4319,24 @@ void btrfs_dirty_inode(struct inode *inode)
4318 btrfs_end_transaction(trans, root); 4319 btrfs_end_transaction(trans, root);
4319 trans = btrfs_start_transaction(root, 1); 4320 trans = btrfs_start_transaction(root, 1);
4320 if (IS_ERR(trans)) { 4321 if (IS_ERR(trans)) {
4321 if (printk_ratelimit()) { 4322 printk_ratelimited(KERN_ERR "btrfs: fail to "
4322 printk(KERN_ERR "btrfs: fail to " 4323 "dirty inode %llu error %ld\n",
4323 "dirty inode %lu error %ld\n", 4324 (unsigned long long)btrfs_ino(inode),
4324 inode->i_ino, PTR_ERR(trans)); 4325 PTR_ERR(trans));
4325 }
4326 return; 4326 return;
4327 } 4327 }
4328 btrfs_set_trans_block_group(trans, inode);
4329 4328
4330 ret = btrfs_update_inode(trans, root, inode); 4329 ret = btrfs_update_inode(trans, root, inode);
4331 if (ret) { 4330 if (ret) {
4332 if (printk_ratelimit()) { 4331 printk_ratelimited(KERN_ERR "btrfs: fail to "
4333 printk(KERN_ERR "btrfs: fail to " 4332 "dirty inode %llu error %d\n",
4334 "dirty inode %lu error %d\n", 4333 (unsigned long long)btrfs_ino(inode),
4335 inode->i_ino, ret); 4334 ret);
4336 }
4337 } 4335 }
4338 } 4336 }
4339 btrfs_end_transaction(trans, root); 4337 btrfs_end_transaction(trans, root);
4338 if (BTRFS_I(inode)->delayed_node)
4339 btrfs_balance_delayed_items(root);
4340} 4340}
4341 4341
4342/* 4342/*
@@ -4352,7 +4352,7 @@ static int btrfs_set_inode_index_count(struct inode *inode)
4352 struct extent_buffer *leaf; 4352 struct extent_buffer *leaf;
4353 int ret; 4353 int ret;
4354 4354
4355 key.objectid = inode->i_ino; 4355 key.objectid = btrfs_ino(inode);
4356 btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY); 4356 btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY);
4357 key.offset = (u64)-1; 4357 key.offset = (u64)-1;
4358 4358
@@ -4384,7 +4384,7 @@ static int btrfs_set_inode_index_count(struct inode *inode)
4384 leaf = path->nodes[0]; 4384 leaf = path->nodes[0];
4385 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 4385 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
4386 4386
4387 if (found_key.objectid != inode->i_ino || 4387 if (found_key.objectid != btrfs_ino(inode) ||
4388 btrfs_key_type(&found_key) != BTRFS_DIR_INDEX_KEY) { 4388 btrfs_key_type(&found_key) != BTRFS_DIR_INDEX_KEY) {
4389 BTRFS_I(inode)->index_cnt = 2; 4389 BTRFS_I(inode)->index_cnt = 2;
4390 goto out; 4390 goto out;
@@ -4405,9 +4405,12 @@ int btrfs_set_inode_index(struct inode *dir, u64 *index)
4405 int ret = 0; 4405 int ret = 0;
4406 4406
4407 if (BTRFS_I(dir)->index_cnt == (u64)-1) { 4407 if (BTRFS_I(dir)->index_cnt == (u64)-1) {
4408 ret = btrfs_set_inode_index_count(dir); 4408 ret = btrfs_inode_delayed_dir_index_count(dir);
4409 if (ret) 4409 if (ret) {
4410 return ret; 4410 ret = btrfs_set_inode_index_count(dir);
4411 if (ret)
4412 return ret;
4413 }
4411 } 4414 }
4412 4415
4413 *index = BTRFS_I(dir)->index_cnt; 4416 *index = BTRFS_I(dir)->index_cnt;
@@ -4420,8 +4423,8 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
4420 struct btrfs_root *root, 4423 struct btrfs_root *root,
4421 struct inode *dir, 4424 struct inode *dir,
4422 const char *name, int name_len, 4425 const char *name, int name_len,
4423 u64 ref_objectid, u64 objectid, 4426 u64 ref_objectid, u64 objectid, int mode,
4424 u64 alloc_hint, int mode, u64 *index) 4427 u64 *index)
4425{ 4428{
4426 struct inode *inode; 4429 struct inode *inode;
4427 struct btrfs_inode_item *inode_item; 4430 struct btrfs_inode_item *inode_item;
@@ -4438,12 +4441,23 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
4438 BUG_ON(!path); 4441 BUG_ON(!path);
4439 4442
4440 inode = new_inode(root->fs_info->sb); 4443 inode = new_inode(root->fs_info->sb);
4441 if (!inode) 4444 if (!inode) {
4445 btrfs_free_path(path);
4442 return ERR_PTR(-ENOMEM); 4446 return ERR_PTR(-ENOMEM);
4447 }
4448
4449 /*
4450 * we have to initialize this early, so we can reclaim the inode
4451 * number if we fail afterwards in this function.
4452 */
4453 inode->i_ino = objectid;
4443 4454
4444 if (dir) { 4455 if (dir) {
4456 trace_btrfs_inode_request(dir);
4457
4445 ret = btrfs_set_inode_index(dir, index); 4458 ret = btrfs_set_inode_index(dir, index);
4446 if (ret) { 4459 if (ret) {
4460 btrfs_free_path(path);
4447 iput(inode); 4461 iput(inode);
4448 return ERR_PTR(ret); 4462 return ERR_PTR(ret);
4449 } 4463 }
@@ -4456,14 +4470,13 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
4456 BTRFS_I(inode)->index_cnt = 2; 4470 BTRFS_I(inode)->index_cnt = 2;
4457 BTRFS_I(inode)->root = root; 4471 BTRFS_I(inode)->root = root;
4458 BTRFS_I(inode)->generation = trans->transid; 4472 BTRFS_I(inode)->generation = trans->transid;
4473 inode->i_generation = BTRFS_I(inode)->generation;
4459 btrfs_set_inode_space_info(root, inode); 4474 btrfs_set_inode_space_info(root, inode);
4460 4475
4461 if (mode & S_IFDIR) 4476 if (mode & S_IFDIR)
4462 owner = 0; 4477 owner = 0;
4463 else 4478 else
4464 owner = 1; 4479 owner = 1;
4465 BTRFS_I(inode)->block_group =
4466 btrfs_find_block_group(root, 0, alloc_hint, owner);
4467 4480
4468 key[0].objectid = objectid; 4481 key[0].objectid = objectid;
4469 btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY); 4482 btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY);
@@ -4482,7 +4495,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
4482 goto fail; 4495 goto fail;
4483 4496
4484 inode_init_owner(inode, dir, mode); 4497 inode_init_owner(inode, dir, mode);
4485 inode->i_ino = objectid;
4486 inode_set_bytes(inode, 0); 4498 inode_set_bytes(inode, 0);
4487 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; 4499 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
4488 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], 4500 inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
@@ -4509,12 +4521,17 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
4509 if ((mode & S_IFREG)) { 4521 if ((mode & S_IFREG)) {
4510 if (btrfs_test_opt(root, NODATASUM)) 4522 if (btrfs_test_opt(root, NODATASUM))
4511 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; 4523 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM;
4512 if (btrfs_test_opt(root, NODATACOW)) 4524 if (btrfs_test_opt(root, NODATACOW) ||
4525 (BTRFS_I(dir)->flags & BTRFS_INODE_NODATACOW))
4513 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW; 4526 BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
4514 } 4527 }
4515 4528
4516 insert_inode_hash(inode); 4529 insert_inode_hash(inode);
4517 inode_tree_add(inode); 4530 inode_tree_add(inode);
4531
4532 trace_btrfs_inode_new(inode);
4533 btrfs_set_inode_last_trans(trans, inode);
4534
4518 return inode; 4535 return inode;
4519fail: 4536fail:
4520 if (dir) 4537 if (dir)
@@ -4542,29 +4559,29 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
4542 int ret = 0; 4559 int ret = 0;
4543 struct btrfs_key key; 4560 struct btrfs_key key;
4544 struct btrfs_root *root = BTRFS_I(parent_inode)->root; 4561 struct btrfs_root *root = BTRFS_I(parent_inode)->root;
4562 u64 ino = btrfs_ino(inode);
4563 u64 parent_ino = btrfs_ino(parent_inode);
4545 4564
4546 if (unlikely(inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) { 4565 if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
4547 memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key)); 4566 memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key));
4548 } else { 4567 } else {
4549 key.objectid = inode->i_ino; 4568 key.objectid = ino;
4550 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); 4569 btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
4551 key.offset = 0; 4570 key.offset = 0;
4552 } 4571 }
4553 4572
4554 if (unlikely(inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) { 4573 if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
4555 ret = btrfs_add_root_ref(trans, root->fs_info->tree_root, 4574 ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
4556 key.objectid, root->root_key.objectid, 4575 key.objectid, root->root_key.objectid,
4557 parent_inode->i_ino, 4576 parent_ino, index, name, name_len);
4558 index, name, name_len);
4559 } else if (add_backref) { 4577 } else if (add_backref) {
4560 ret = btrfs_insert_inode_ref(trans, root, 4578 ret = btrfs_insert_inode_ref(trans, root, name, name_len, ino,
4561 name, name_len, inode->i_ino, 4579 parent_ino, index);
4562 parent_inode->i_ino, index);
4563 } 4580 }
4564 4581
4565 if (ret == 0) { 4582 if (ret == 0) {
4566 ret = btrfs_insert_dir_item(trans, root, name, name_len, 4583 ret = btrfs_insert_dir_item(trans, root, name, name_len,
4567 parent_inode->i_ino, &key, 4584 parent_inode, &key,
4568 btrfs_inode_type(inode), index); 4585 btrfs_inode_type(inode), index);
4569 BUG_ON(ret); 4586 BUG_ON(ret);
4570 4587
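btrfs_add_link() has to special-case subvolume roots: an "inode" whose number is BTRFS_FIRST_FREE_OBJECTID is the top directory of another tree, so its back reference is a root ref in the tree of tree roots rather than an inode ref. A sketch of that selection, with names from the hunk above (the real function only adds the inode ref when add_backref is set):

        if (unlikely(ino == BTRFS_FIRST_FREE_OBJECTID)) {
                /* dir entry points at a subvolume root: the key is the
                 * subvolume's root key, the backref a root ref */
                memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key));
                ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
                                         key.objectid,
                                         root->root_key.objectid,
                                         parent_ino, index, name, name_len);
        } else {
                /* plain inode: key is (ino, INODE_ITEM, 0), the backref
                 * an inode ref in the same tree */
                key.objectid = ino;
                btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
                key.offset = 0;
                ret = btrfs_insert_inode_ref(trans, root, name, name_len,
                                             ino, parent_ino, index);
        }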
@@ -4577,12 +4594,12 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
4577} 4594}
4578 4595
4579static int btrfs_add_nondir(struct btrfs_trans_handle *trans, 4596static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
4580 struct dentry *dentry, struct inode *inode, 4597 struct inode *dir, struct dentry *dentry,
4581 int backref, u64 index) 4598 struct inode *inode, int backref, u64 index)
4582{ 4599{
4583 int err = btrfs_add_link(trans, dentry->d_parent->d_inode, 4600 int err = btrfs_add_link(trans, dir, inode,
4584 inode, dentry->d_name.name, 4601 dentry->d_name.name, dentry->d_name.len,
4585 dentry->d_name.len, backref, index); 4602 backref, index);
4586 if (!err) { 4603 if (!err) {
4587 d_instantiate(dentry, inode); 4604 d_instantiate(dentry, inode);
4588 return 0; 4605 return 0;
@@ -4607,10 +4624,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
4607 if (!new_valid_dev(rdev)) 4624 if (!new_valid_dev(rdev))
4608 return -EINVAL; 4625 return -EINVAL;
4609 4626
4610 err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
4611 if (err)
4612 return err;
4613
4614 /* 4627 /*
4615 * 2 for inode item and ref 4628 * 2 for inode item and ref
4616 * 2 for dir items 4629 * 2 for dir items
@@ -4620,24 +4633,25 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
4620 if (IS_ERR(trans)) 4633 if (IS_ERR(trans))
4621 return PTR_ERR(trans); 4634 return PTR_ERR(trans);
4622 4635
4623 btrfs_set_trans_block_group(trans, dir); 4636 err = btrfs_find_free_ino(root, &objectid);
4637 if (err)
4638 goto out_unlock;
4624 4639
4625 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 4640 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
4626 dentry->d_name.len, 4641 dentry->d_name.len, btrfs_ino(dir), objectid,
4627 dentry->d_parent->d_inode->i_ino, objectid, 4642 mode, &index);
4628 BTRFS_I(dir)->block_group, mode, &index); 4643 if (IS_ERR(inode)) {
4629 err = PTR_ERR(inode); 4644 err = PTR_ERR(inode);
4630 if (IS_ERR(inode))
4631 goto out_unlock; 4645 goto out_unlock;
4646 }
4632 4647
4633 err = btrfs_init_inode_security(trans, inode, dir); 4648 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
4634 if (err) { 4649 if (err) {
4635 drop_inode = 1; 4650 drop_inode = 1;
4636 goto out_unlock; 4651 goto out_unlock;
4637 } 4652 }
4638 4653
4639 btrfs_set_trans_block_group(trans, inode); 4654 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
4640 err = btrfs_add_nondir(trans, dentry, inode, 0, index);
4641 if (err) 4655 if (err)
4642 drop_inode = 1; 4656 drop_inode = 1;
4643 else { 4657 else {
@@ -4645,8 +4659,6 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
4645 init_special_inode(inode, inode->i_mode, rdev); 4659 init_special_inode(inode, inode->i_mode, rdev);
4646 btrfs_update_inode(trans, root, inode); 4660 btrfs_update_inode(trans, root, inode);
4647 } 4661 }
4648 btrfs_update_inode_block_group(trans, inode);
4649 btrfs_update_inode_block_group(trans, dir);
4650out_unlock: 4662out_unlock:
4651 nr = trans->blocks_used; 4663 nr = trans->blocks_used;
4652 btrfs_end_transaction_throttle(trans, root); 4664 btrfs_end_transaction_throttle(trans, root);
@@ -4670,9 +4682,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
4670 u64 objectid; 4682 u64 objectid;
4671 u64 index = 0; 4683 u64 index = 0;
4672 4684
4673 err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
4674 if (err)
4675 return err;
4676 /* 4685 /*
4677 * 2 for inode item and ref 4686 * 2 for inode item and ref
4678 * 2 for dir items 4687 * 2 for dir items
@@ -4682,25 +4691,25 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
4682 if (IS_ERR(trans)) 4691 if (IS_ERR(trans))
4683 return PTR_ERR(trans); 4692 return PTR_ERR(trans);
4684 4693
4685 btrfs_set_trans_block_group(trans, dir); 4694 err = btrfs_find_free_ino(root, &objectid);
4695 if (err)
4696 goto out_unlock;
4686 4697
4687 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 4698 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
4688 dentry->d_name.len, 4699 dentry->d_name.len, btrfs_ino(dir), objectid,
4689 dentry->d_parent->d_inode->i_ino, 4700 mode, &index);
4690 objectid, BTRFS_I(dir)->block_group, mode, 4701 if (IS_ERR(inode)) {
4691 &index); 4702 err = PTR_ERR(inode);
4692 err = PTR_ERR(inode);
4693 if (IS_ERR(inode))
4694 goto out_unlock; 4703 goto out_unlock;
4704 }
4695 4705
4696 err = btrfs_init_inode_security(trans, inode, dir); 4706 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
4697 if (err) { 4707 if (err) {
4698 drop_inode = 1; 4708 drop_inode = 1;
4699 goto out_unlock; 4709 goto out_unlock;
4700 } 4710 }
4701 4711
4702 btrfs_set_trans_block_group(trans, inode); 4712 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
4703 err = btrfs_add_nondir(trans, dentry, inode, 0, index);
4704 if (err) 4713 if (err)
4705 drop_inode = 1; 4714 drop_inode = 1;
4706 else { 4715 else {
@@ -4710,8 +4719,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
4710 inode->i_op = &btrfs_file_inode_operations; 4719 inode->i_op = &btrfs_file_inode_operations;
4711 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 4720 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
4712 } 4721 }
4713 btrfs_update_inode_block_group(trans, inode);
4714 btrfs_update_inode_block_group(trans, dir);
4715out_unlock: 4722out_unlock:
4716 nr = trans->blocks_used; 4723 nr = trans->blocks_used;
4717 btrfs_end_transaction_throttle(trans, root); 4724 btrfs_end_transaction_throttle(trans, root);
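Note the ordering change common to btrfs_mknod(), btrfs_create() and btrfs_mkdir(): the transaction is reserved first and only then is an objectid pulled from the free-ino cache, so a failure after allocation can hand the number back (see btrfs_return_ino() in the evict path earlier in this diff). A condensed sketch of the shared sequence:

        trans = btrfs_start_transaction(root, 5); /* inode+ref, dirs, parent */
        if (IS_ERR(trans))
                return PTR_ERR(trans);

        err = btrfs_find_free_ino(root, &objectid);
        if (err)
                goto out_unlock;

        inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
                                dentry->d_name.len, btrfs_ino(dir),
                                objectid, mode, &index);
        if (IS_ERR(inode)) {
                err = PTR_ERR(inode);
                goto out_unlock;
        }

        err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
        if (!err)
                err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);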
@@ -4734,41 +4741,42 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
4734 int err; 4741 int err;
4735 int drop_inode = 0; 4742 int drop_inode = 0;
4736 4743
4737 if (inode->i_nlink == 0)
4738 return -ENOENT;
4739
4740 /* do not allow sys_link's with other subvols of the same device */ 4744 /* do not allow sys_link's with other subvols of the same device */
4741 if (root->objectid != BTRFS_I(inode)->root->objectid) 4745 if (root->objectid != BTRFS_I(inode)->root->objectid)
4742 return -EPERM; 4746 return -EXDEV;
4743 4747
4744 btrfs_inc_nlink(inode); 4748 if (inode->i_nlink == ~0U)
4749 return -EMLINK;
4745 4750
4746 err = btrfs_set_inode_index(dir, &index); 4751 err = btrfs_set_inode_index(dir, &index);
4747 if (err) 4752 if (err)
4748 goto fail; 4753 goto fail;
4749 4754
4750 /* 4755 /*
4751 * 1 item for inode ref 4756 * 2 items for inode and inode ref
4752 * 2 items for dir items 4757 * 2 items for dir items
4758 * 1 item for parent inode
4753 */ 4759 */
4754 trans = btrfs_start_transaction(root, 3); 4760 trans = btrfs_start_transaction(root, 5);
4755 if (IS_ERR(trans)) { 4761 if (IS_ERR(trans)) {
4756 err = PTR_ERR(trans); 4762 err = PTR_ERR(trans);
4757 goto fail; 4763 goto fail;
4758 } 4764 }
4759 4765
4760 btrfs_set_trans_block_group(trans, dir); 4766 btrfs_inc_nlink(inode);
4761 atomic_inc(&inode->i_count); 4767 inode->i_ctime = CURRENT_TIME;
4768 ihold(inode);
4762 4769
4763 err = btrfs_add_nondir(trans, dentry, inode, 1, index); 4770 err = btrfs_add_nondir(trans, dir, dentry, inode, 1, index);
4764 4771
4765 if (err) { 4772 if (err) {
4766 drop_inode = 1; 4773 drop_inode = 1;
4767 } else { 4774 } else {
4768 btrfs_update_inode_block_group(trans, dir); 4775 struct dentry *parent = dget_parent(dentry);
4769 err = btrfs_update_inode(trans, root, inode); 4776 err = btrfs_update_inode(trans, root, inode);
4770 BUG_ON(err); 4777 BUG_ON(err);
4771 btrfs_log_new_name(trans, inode, NULL, dentry->d_parent); 4778 btrfs_log_new_name(trans, inode, NULL, parent);
4779 dput(parent);
4772 } 4780 }
4773 4781
4774 nr = trans->blocks_used; 4782 nr = trans->blocks_used;
@@ -4793,10 +4801,6 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
4793 u64 index = 0; 4801 u64 index = 0;
4794 unsigned long nr = 1; 4802 unsigned long nr = 1;
4795 4803
4796 err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
4797 if (err)
4798 return err;
4799
4800 /* 4804 /*
4801 * 2 items for inode and ref 4805 * 2 items for inode and ref
4802 * 2 items for dir items 4806 * 2 items for dir items
@@ -4805,13 +4809,14 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
4805 trans = btrfs_start_transaction(root, 5); 4809 trans = btrfs_start_transaction(root, 5);
4806 if (IS_ERR(trans)) 4810 if (IS_ERR(trans))
4807 return PTR_ERR(trans); 4811 return PTR_ERR(trans);
4808 btrfs_set_trans_block_group(trans, dir); 4812
4813 err = btrfs_find_free_ino(root, &objectid);
4814 if (err)
4815 goto out_fail;
4809 4816
4810 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 4817 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
4811 dentry->d_name.len, 4818 dentry->d_name.len, btrfs_ino(dir), objectid,
4812 dentry->d_parent->d_inode->i_ino, objectid, 4819 S_IFDIR | mode, &index);
4813 BTRFS_I(dir)->block_group, S_IFDIR | mode,
4814 &index);
4815 if (IS_ERR(inode)) { 4820 if (IS_ERR(inode)) {
4816 err = PTR_ERR(inode); 4821 err = PTR_ERR(inode);
4817 goto out_fail; 4822 goto out_fail;
@@ -4819,29 +4824,25 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
4819 4824
4820 drop_on_err = 1; 4825 drop_on_err = 1;
4821 4826
4822 err = btrfs_init_inode_security(trans, inode, dir); 4827 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
4823 if (err) 4828 if (err)
4824 goto out_fail; 4829 goto out_fail;
4825 4830
4826 inode->i_op = &btrfs_dir_inode_operations; 4831 inode->i_op = &btrfs_dir_inode_operations;
4827 inode->i_fop = &btrfs_dir_file_operations; 4832 inode->i_fop = &btrfs_dir_file_operations;
4828 btrfs_set_trans_block_group(trans, inode);
4829 4833
4830 btrfs_i_size_write(inode, 0); 4834 btrfs_i_size_write(inode, 0);
4831 err = btrfs_update_inode(trans, root, inode); 4835 err = btrfs_update_inode(trans, root, inode);
4832 if (err) 4836 if (err)
4833 goto out_fail; 4837 goto out_fail;
4834 4838
4835 err = btrfs_add_link(trans, dentry->d_parent->d_inode, 4839 err = btrfs_add_link(trans, dir, inode, dentry->d_name.name,
4836 inode, dentry->d_name.name, 4840 dentry->d_name.len, 0, index);
4837 dentry->d_name.len, 0, index);
4838 if (err) 4841 if (err)
4839 goto out_fail; 4842 goto out_fail;
4840 4843
4841 d_instantiate(dentry, inode); 4844 d_instantiate(dentry, inode);
4842 drop_on_err = 0; 4845 drop_on_err = 0;
4843 btrfs_update_inode_block_group(trans, inode);
4844 btrfs_update_inode_block_group(trans, dir);
4845 4846
4846out_fail: 4847out_fail:
4847 nr = trans->blocks_used; 4848 nr = trans->blocks_used;
@@ -4886,19 +4887,23 @@ static noinline int uncompress_inline(struct btrfs_path *path,
4886 size_t max_size; 4887 size_t max_size;
4887 unsigned long inline_size; 4888 unsigned long inline_size;
4888 unsigned long ptr; 4889 unsigned long ptr;
4890 int compress_type;
4889 4891
4890 WARN_ON(pg_offset != 0); 4892 WARN_ON(pg_offset != 0);
4893 compress_type = btrfs_file_extent_compression(leaf, item);
4891 max_size = btrfs_file_extent_ram_bytes(leaf, item); 4894 max_size = btrfs_file_extent_ram_bytes(leaf, item);
4892 inline_size = btrfs_file_extent_inline_item_len(leaf, 4895 inline_size = btrfs_file_extent_inline_item_len(leaf,
4893 btrfs_item_nr(leaf, path->slots[0])); 4896 btrfs_item_nr(leaf, path->slots[0]));
4894 tmp = kmalloc(inline_size, GFP_NOFS); 4897 tmp = kmalloc(inline_size, GFP_NOFS);
4898 if (!tmp)
4899 return -ENOMEM;
4895 ptr = btrfs_file_extent_inline_start(item); 4900 ptr = btrfs_file_extent_inline_start(item);
4896 4901
4897 read_extent_buffer(leaf, tmp, ptr, inline_size); 4902 read_extent_buffer(leaf, tmp, ptr, inline_size);
4898 4903
4899 max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size); 4904 max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size);
4900 ret = btrfs_zlib_decompress(tmp, page, extent_offset, 4905 ret = btrfs_decompress(compress_type, tmp, page,
4901 inline_size, max_size); 4906 extent_offset, inline_size, max_size);
4902 if (ret) { 4907 if (ret) {
4903 char *kaddr = kmap_atomic(page, KM_USER0); 4908 char *kaddr = kmap_atomic(page, KM_USER0);
4904 unsigned long copy_size = min_t(u64, 4909 unsigned long copy_size = min_t(u64,
@@ -4929,7 +4934,7 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
4929 u64 bytenr; 4934 u64 bytenr;
4930 u64 extent_start = 0; 4935 u64 extent_start = 0;
4931 u64 extent_end = 0; 4936 u64 extent_end = 0;
4932 u64 objectid = inode->i_ino; 4937 u64 objectid = btrfs_ino(inode);
4933 u32 found_type; 4938 u32 found_type;
4934 struct btrfs_path *path = NULL; 4939 struct btrfs_path *path = NULL;
4935 struct btrfs_root *root = BTRFS_I(inode)->root; 4940 struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -4940,7 +4945,7 @@ struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
4940 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 4945 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
4941 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; 4946 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
4942 struct btrfs_trans_handle *trans = NULL; 4947 struct btrfs_trans_handle *trans = NULL;
4943 int compressed; 4948 int compress_type;
4944 4949
4945again: 4950again:
4946 read_lock(&em_tree->lock); 4951 read_lock(&em_tree->lock);
@@ -4957,7 +4962,7 @@ again:
4957 else 4962 else
4958 goto out; 4963 goto out;
4959 } 4964 }
4960 em = alloc_extent_map(GFP_NOFS); 4965 em = alloc_extent_map();
4961 if (!em) { 4966 if (!em) {
4962 err = -ENOMEM; 4967 err = -ENOMEM;
4963 goto out; 4968 goto out;
@@ -4970,7 +4975,15 @@ again:
4970 4975
4971 if (!path) { 4976 if (!path) {
4972 path = btrfs_alloc_path(); 4977 path = btrfs_alloc_path();
4973 BUG_ON(!path); 4978 if (!path) {
4979 err = -ENOMEM;
4980 goto out;
4981 }
4982 /*
4983 * Chances are we'll be called again, so go ahead and do
4984 * readahead
4985 */
4986 path->reada = 1;
4974 } 4987 }
4975 4988
4976 ret = btrfs_lookup_file_extent(trans, root, path, 4989 ret = btrfs_lookup_file_extent(trans, root, path,
@@ -4999,7 +5012,7 @@ again:
4999 5012
5000 found_type = btrfs_file_extent_type(leaf, item); 5013 found_type = btrfs_file_extent_type(leaf, item);
5001 extent_start = found_key.offset; 5014 extent_start = found_key.offset;
5002 compressed = btrfs_file_extent_compression(leaf, item); 5015 compress_type = btrfs_file_extent_compression(leaf, item);
5003 if (found_type == BTRFS_FILE_EXTENT_REG || 5016 if (found_type == BTRFS_FILE_EXTENT_REG ||
5004 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 5017 found_type == BTRFS_FILE_EXTENT_PREALLOC) {
5005 extent_end = extent_start + 5018 extent_end = extent_start +
@@ -5045,8 +5058,9 @@ again:
5045 em->block_start = EXTENT_MAP_HOLE; 5058 em->block_start = EXTENT_MAP_HOLE;
5046 goto insert; 5059 goto insert;
5047 } 5060 }
5048 if (compressed) { 5061 if (compress_type != BTRFS_COMPRESS_NONE) {
5049 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 5062 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
5063 em->compress_type = compress_type;
5050 em->block_start = bytenr; 5064 em->block_start = bytenr;
5051 em->block_len = btrfs_file_extent_disk_num_bytes(leaf, 5065 em->block_len = btrfs_file_extent_disk_num_bytes(leaf,
5052 item); 5066 item);
@@ -5080,12 +5094,14 @@ again:
5080 em->len = (copy_size + root->sectorsize - 1) & 5094 em->len = (copy_size + root->sectorsize - 1) &
5081 ~((u64)root->sectorsize - 1); 5095 ~((u64)root->sectorsize - 1);
5082 em->orig_start = EXTENT_MAP_INLINE; 5096 em->orig_start = EXTENT_MAP_INLINE;
5083 if (compressed) 5097 if (compress_type) {
5084 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); 5098 set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
5099 em->compress_type = compress_type;
5100 }
5085 ptr = btrfs_file_extent_inline_start(item) + extent_offset; 5101 ptr = btrfs_file_extent_inline_start(item) + extent_offset;
5086 if (create == 0 && !PageUptodate(page)) { 5102 if (create == 0 && !PageUptodate(page)) {
5087 if (btrfs_file_extent_compression(leaf, item) == 5103 if (btrfs_file_extent_compression(leaf, item) !=
5088 BTRFS_COMPRESS_ZLIB) { 5104 BTRFS_COMPRESS_NONE) {
5089 ret = uncompress_inline(path, inode, page, 5105 ret = uncompress_inline(path, inode, page,
5090 pg_offset, 5106 pg_offset,
5091 extent_offset, item); 5107 extent_offset, item);
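With lzo in the tree, callers no longer hardcode zlib: the on-disk file extent records a compress_type and btrfs_decompress() dispatches on it, as uncompress_inline() does above. A sketch of that call pattern with the buffer handling simplified; tmp, inline_size and max_size are set up as in the earlier hunk:

        int compress_type = btrfs_file_extent_compression(leaf, item);

        if (compress_type != BTRFS_COMPRESS_NONE) {
                /* inline payload -> private buffer -> page */
                read_extent_buffer(leaf, tmp,
                                   btrfs_file_extent_inline_start(item),
                                   inline_size);
                ret = btrfs_decompress(compress_type, tmp, page,
                                       extent_offset, inline_size,
                                       max_size);
        }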
@@ -5108,8 +5124,12 @@ again:
5108 kunmap(page); 5124 kunmap(page);
5109 free_extent_map(em); 5125 free_extent_map(em);
5110 em = NULL; 5126 em = NULL;
5111 btrfs_release_path(root, path); 5127
5112 trans = btrfs_join_transaction(root, 1); 5128 btrfs_release_path(path);
5129 trans = btrfs_join_transaction(root);
5130
5131 if (IS_ERR(trans))
5132 return ERR_CAST(trans);
5113 goto again; 5133 goto again;
5114 } 5134 }
5115 map = kmap(page); 5135 map = kmap(page);
@@ -5119,7 +5139,7 @@ again:
5119 btrfs_mark_buffer_dirty(leaf); 5139 btrfs_mark_buffer_dirty(leaf);
5120 } 5140 }
5121 set_extent_uptodate(io_tree, em->start, 5141 set_extent_uptodate(io_tree, em->start,
5122 extent_map_end(em) - 1, GFP_NOFS); 5142 extent_map_end(em) - 1, NULL, GFP_NOFS);
5123 goto insert; 5143 goto insert;
5124 } else { 5144 } else {
5125 printk(KERN_ERR "btrfs unknown found_type %d\n", found_type); 5145 printk(KERN_ERR "btrfs unknown found_type %d\n", found_type);
@@ -5132,7 +5152,7 @@ not_found_em:
5132 em->block_start = EXTENT_MAP_HOLE; 5152 em->block_start = EXTENT_MAP_HOLE;
5133 set_bit(EXTENT_FLAG_VACANCY, &em->flags); 5153 set_bit(EXTENT_FLAG_VACANCY, &em->flags);
5134insert: 5154insert:
5135 btrfs_release_path(root, path); 5155 btrfs_release_path(path);
5136 if (em->start > start || extent_map_end(em) <= start) { 5156 if (em->start > start || extent_map_end(em) <= start) {
5137 printk(KERN_ERR "Btrfs: bad extent! em: [%llu %llu] passed " 5157 printk(KERN_ERR "Btrfs: bad extent! em: [%llu %llu] passed "
5138 "[%llu %llu]\n", (unsigned long long)em->start, 5158 "[%llu %llu]\n", (unsigned long long)em->start,
@@ -5186,6 +5206,9 @@ insert:
5186 } 5206 }
5187 write_unlock(&em_tree->lock); 5207 write_unlock(&em_tree->lock);
5188out: 5208out:
5209
5210 trace_btrfs_get_extent(root, em);
5211
5189 if (path) 5212 if (path)
5190 btrfs_free_path(path); 5213 btrfs_free_path(path);
5191 if (trans) { 5214 if (trans) {
@@ -5200,22 +5223,160 @@ out:
5200 return em; 5223 return em;
5201} 5224}
5202 5225
5226struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page,
5227 size_t pg_offset, u64 start, u64 len,
5228 int create)
5229{
5230 struct extent_map *em;
5231 struct extent_map *hole_em = NULL;
5232 u64 range_start = start;
5233 u64 end;
5234 u64 found;
5235 u64 found_end;
5236 int err = 0;
5237
5238 em = btrfs_get_extent(inode, page, pg_offset, start, len, create);
5239 if (IS_ERR(em))
5240 return em;
5241 if (em) {
5242 /*
5243 * if our em maps to a hole, there might
5244 * actually be delalloc bytes behind it
5245 */
5246 if (em->block_start != EXTENT_MAP_HOLE)
5247 return em;
5248 else
5249 hole_em = em;
5250 }
5251
5252 /* check to see if we've wrapped (len == -1 or similar) */
5253 end = start + len;
5254 if (end < start)
5255 end = (u64)-1;
5256 else
5257 end -= 1;
5258
5259 em = NULL;
5260
 5261 /* ok, we didn't find anything, let's look for delalloc */
5262 found = count_range_bits(&BTRFS_I(inode)->io_tree, &range_start,
5263 end, len, EXTENT_DELALLOC, 1);
5264 found_end = range_start + found;
5265 if (found_end < range_start)
5266 found_end = (u64)-1;
5267
5268 /*
5269 * we didn't find anything useful, return
5270 * the original results from get_extent()
5271 */
5272 if (range_start > end || found_end <= start) {
5273 em = hole_em;
5274 hole_em = NULL;
5275 goto out;
5276 }
5277
5278 /* adjust the range_start to make sure it doesn't
5279 * go backwards from the start they passed in
5280 */
 5281 range_start = max(start, range_start);
5282 found = found_end - range_start;
5283
5284 if (found > 0) {
5285 u64 hole_start = start;
5286 u64 hole_len = len;
5287
5288 em = alloc_extent_map();
5289 if (!em) {
5290 err = -ENOMEM;
5291 goto out;
5292 }
5293 /*
5294 * when btrfs_get_extent can't find anything it
5295 * returns one huge hole
5296 *
5297 * make sure what it found really fits our range, and
5298 * adjust to make sure it is based on the start from
5299 * the caller
5300 */
5301 if (hole_em) {
5302 u64 calc_end = extent_map_end(hole_em);
5303
5304 if (calc_end <= start || (hole_em->start > end)) {
5305 free_extent_map(hole_em);
5306 hole_em = NULL;
5307 } else {
5308 hole_start = max(hole_em->start, start);
5309 hole_len = calc_end - hole_start;
5310 }
5311 }
5312 em->bdev = NULL;
5313 if (hole_em && range_start > hole_start) {
5314 /* our hole starts before our delalloc, so we
5315 * have to return just the parts of the hole
5316 * that go until the delalloc starts
5317 */
5318 em->len = min(hole_len,
5319 range_start - hole_start);
5320 em->start = hole_start;
5321 em->orig_start = hole_start;
5322 /*
5323 * don't adjust block start at all,
5324 * it is fixed at EXTENT_MAP_HOLE
5325 */
5326 em->block_start = hole_em->block_start;
5327 em->block_len = hole_len;
5328 } else {
5329 em->start = range_start;
5330 em->len = found;
5331 em->orig_start = range_start;
5332 em->block_start = EXTENT_MAP_DELALLOC;
5333 em->block_len = found;
5334 }
5335 } else if (hole_em) {
5336 return hole_em;
5337 }
5338out:
5339
5340 free_extent_map(hole_em);
5341 if (err) {
5342 free_extent_map(em);
5343 return ERR_PTR(err);
5344 }
5345 return em;
5346}
5347
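
/*
 * Illustrative sketch, not part of the patch: btrfs_get_extent_fiemap above
 * intersects the caller's [start, start+len) window with whatever delalloc
 * range it finds, guarding against u64 wraparound when len is huge
 * (len == -1). Runnable userspace version of just that arithmetic; all
 * names are hypothetical:
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t range_end(uint64_t start, uint64_t len)
{
	uint64_t end = start + len;

	/* a huge len wraps past the top; clamp to the maximum offset */
	return end < start ? (uint64_t)-1 : end - 1;
}

static uint64_t clamp_found_start(uint64_t start, uint64_t found_start)
{
	/* never report delalloc before the offset the caller asked about */
	return found_start > start ? found_start : start;
}

int main(void)
{
	printf("%llu\n", (unsigned long long)range_end(4096, (uint64_t)-1));
	printf("%llu\n", (unsigned long long)clamp_found_start(8192, 4096));
	return 0;
}
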
5203static struct extent_map *btrfs_new_extent_direct(struct inode *inode, 5348static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
5349 struct extent_map *em,
5204 u64 start, u64 len) 5350 u64 start, u64 len)
5205{ 5351{
5206 struct btrfs_root *root = BTRFS_I(inode)->root; 5352 struct btrfs_root *root = BTRFS_I(inode)->root;
5207 struct btrfs_trans_handle *trans; 5353 struct btrfs_trans_handle *trans;
5208 struct extent_map *em;
5209 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 5354 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
5210 struct btrfs_key ins; 5355 struct btrfs_key ins;
5211 u64 alloc_hint; 5356 u64 alloc_hint;
5212 int ret; 5357 int ret;
5358 bool insert = false;
5213 5359
5214 btrfs_drop_extent_cache(inode, start, start + len - 1, 0); 5360 /*
 5361 * Ok, if the extent map we looked up is a hole and is for the exact
 5362 * range we want, there is no reason to allocate a new one. However, if
 5363 * it is not right, then we need to free this one and drop the cache for
5364 * our range.
5365 */
5366 if (em->block_start != EXTENT_MAP_HOLE || em->start != start ||
5367 em->len != len) {
5368 free_extent_map(em);
5369 em = NULL;
5370 insert = true;
5371 btrfs_drop_extent_cache(inode, start, start + len - 1, 0);
5372 }
5215 5373
5216 trans = btrfs_join_transaction(root, 0); 5374 trans = btrfs_join_transaction(root);
5217 if (!trans) 5375 if (IS_ERR(trans))
5218 return ERR_PTR(-ENOMEM); 5376 return ERR_CAST(trans);
5377
5378 if (start <= BTRFS_I(inode)->disk_i_size && len < 64 * 1024)
5379 btrfs_add_inode_defrag(trans, inode);
5219 5380
5220 trans->block_rsv = &root->fs_info->delalloc_block_rsv; 5381 trans->block_rsv = &root->fs_info->delalloc_block_rsv;
5221 5382
@@ -5227,10 +5388,12 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
5227 goto out; 5388 goto out;
5228 } 5389 }
5229 5390
5230 em = alloc_extent_map(GFP_NOFS);
5231 if (!em) { 5391 if (!em) {
5232 em = ERR_PTR(-ENOMEM); 5392 em = alloc_extent_map();
5233 goto out; 5393 if (!em) {
5394 em = ERR_PTR(-ENOMEM);
5395 goto out;
5396 }
5234 } 5397 }
5235 5398
5236 em->start = start; 5399 em->start = start;
@@ -5240,9 +5403,15 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
5240 em->block_start = ins.objectid; 5403 em->block_start = ins.objectid;
5241 em->block_len = ins.offset; 5404 em->block_len = ins.offset;
5242 em->bdev = root->fs_info->fs_devices->latest_bdev; 5405 em->bdev = root->fs_info->fs_devices->latest_bdev;
5406
5407 /*
5408 * We need to do this because if we're using the original em we searched
5409 * for, we could have EXTENT_FLAG_VACANCY set, and we don't want that.
5410 */
5411 em->flags = 0;
5243 set_bit(EXTENT_FLAG_PINNED, &em->flags); 5412 set_bit(EXTENT_FLAG_PINNED, &em->flags);
5244 5413
5245 while (1) { 5414 while (insert) {
5246 write_lock(&em_tree->lock); 5415 write_lock(&em_tree->lock);
5247 ret = add_extent_mapping(em_tree, em); 5416 ret = add_extent_mapping(em_tree, em);
5248 write_unlock(&em_tree->lock); 5417 write_unlock(&em_tree->lock);
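
/*
 * Illustrative sketch, not part of the patch: btrfs_new_extent_direct now
 * recycles the extent map handed in when it already describes exactly the
 * hole being filled, and only reinserts into the tree when it had to drop
 * the old one and allocate fresh. Generic shape of that decision under
 * invented types and names:
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct mapping { uint64_t start, len; bool is_hole; };

static struct mapping *reuse_or_alloc(struct mapping *m, uint64_t start,
				      uint64_t len, bool *needs_insert)
{
	if (m && m->is_hole && m->start == start && m->len == len) {
		*needs_insert = false;	/* already in the tree, recycle it */
		return m;
	}
	free(m);			/* wrong shape: drop it and start over */
	*needs_insert = true;
	return calloc(1, sizeof(*m));
}

int main(void)
{
	bool insert;
	struct mapping *m = reuse_or_alloc(NULL, 0, 4096, &insert);

	printf("insert=%d\n", insert);
	free(m);
	return 0;
}
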
@@ -5286,7 +5455,7 @@ static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,
5286 if (!path) 5455 if (!path)
5287 return -ENOMEM; 5456 return -ENOMEM;
5288 5457
5289 ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino, 5458 ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(inode),
5290 offset, 0); 5459 offset, 0);
5291 if (ret < 0) 5460 if (ret < 0)
5292 goto out; 5461 goto out;
@@ -5303,7 +5472,7 @@ static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,
5303 ret = 0; 5472 ret = 0;
5304 leaf = path->nodes[0]; 5473 leaf = path->nodes[0];
5305 btrfs_item_key_to_cpu(leaf, &key, slot); 5474 btrfs_item_key_to_cpu(leaf, &key, slot);
5306 if (key.objectid != inode->i_ino || 5475 if (key.objectid != btrfs_ino(inode) ||
5307 key.type != BTRFS_EXTENT_DATA_KEY) { 5476 key.type != BTRFS_EXTENT_DATA_KEY) {
5308 /* not our file or wrong item type, must cow */ 5477 /* not our file or wrong item type, must cow */
5309 goto out; 5478 goto out;
@@ -5337,7 +5506,7 @@ static noinline int can_nocow_odirect(struct btrfs_trans_handle *trans,
5337 * look for other files referencing this extent, if we 5506 * look for other files referencing this extent, if we
5338 * find any we must cow 5507 * find any we must cow
5339 */ 5508 */
5340 if (btrfs_cross_ref_exist(trans, root, inode->i_ino, 5509 if (btrfs_cross_ref_exist(trans, root, btrfs_ino(inode),
5341 key.offset - backref_offset, disk_bytenr)) 5510 key.offset - backref_offset, disk_bytenr))
5342 goto out; 5511 goto out;
5343 5512
@@ -5438,8 +5607,8 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
5438 * to make sure the current transaction stays open 5607 * to make sure the current transaction stays open
5439 * while we look for nocow cross refs 5608 * while we look for nocow cross refs
5440 */ 5609 */
5441 trans = btrfs_join_transaction(root, 0); 5610 trans = btrfs_join_transaction(root);
5442 if (!trans) 5611 if (IS_ERR(trans))
5443 goto must_cow; 5612 goto must_cow;
5444 5613
5445 if (can_nocow_odirect(trans, inode, start, len) == 1) { 5614 if (can_nocow_odirect(trans, inode, start, len) == 1) {
@@ -5460,8 +5629,7 @@ must_cow:
5460 * it above 5629 * it above
5461 */ 5630 */
5462 len = bh_result->b_size; 5631 len = bh_result->b_size;
5463 free_extent_map(em); 5632 em = btrfs_new_extent_direct(inode, em, start, len);
5464 em = btrfs_new_extent_direct(inode, start, len);
5465 if (IS_ERR(em)) 5633 if (IS_ERR(em))
5466 return PTR_ERR(em); 5634 return PTR_ERR(em);
5467 len = min(len, em->len - (start - em->start)); 5635 len = min(len, em->len - (start - em->start));
@@ -5490,13 +5658,21 @@ struct btrfs_dio_private {
5490 u64 bytes; 5658 u64 bytes;
5491 u32 *csums; 5659 u32 *csums;
5492 void *private; 5660 void *private;
5661
5662 /* number of bios pending for this dio */
5663 atomic_t pending_bios;
5664
5665 /* IO errors */
5666 int errors;
5667
5668 struct bio *orig_bio;
5493}; 5669};
5494 5670
5495static void btrfs_endio_direct_read(struct bio *bio, int err) 5671static void btrfs_endio_direct_read(struct bio *bio, int err)
5496{ 5672{
5673 struct btrfs_dio_private *dip = bio->bi_private;
5497 struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1; 5674 struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
5498 struct bio_vec *bvec = bio->bi_io_vec; 5675 struct bio_vec *bvec = bio->bi_io_vec;
5499 struct btrfs_dio_private *dip = bio->bi_private;
5500 struct inode *inode = dip->inode; 5676 struct inode *inode = dip->inode;
5501 struct btrfs_root *root = BTRFS_I(inode)->root; 5677 struct btrfs_root *root = BTRFS_I(inode)->root;
5502 u64 start; 5678 u64 start;
@@ -5520,9 +5696,10 @@ static void btrfs_endio_direct_read(struct bio *bio, int err)
5520 5696
5521 flush_dcache_page(bvec->bv_page); 5697 flush_dcache_page(bvec->bv_page);
5522 if (csum != *private) { 5698 if (csum != *private) {
5523 printk(KERN_ERR "btrfs csum failed ino %lu off" 5699 printk(KERN_ERR "btrfs csum failed ino %llu off"
5524 " %llu csum %u private %u\n", 5700 " %llu csum %u private %u\n",
5525 inode->i_ino, (unsigned long long)start, 5701 (unsigned long long)btrfs_ino(inode),
5702 (unsigned long long)start,
5526 csum, *private); 5703 csum, *private);
5527 err = -EIO; 5704 err = -EIO;
5528 } 5705 }
@@ -5539,6 +5716,10 @@ static void btrfs_endio_direct_read(struct bio *bio, int err)
5539 5716
5540 kfree(dip->csums); 5717 kfree(dip->csums);
5541 kfree(dip); 5718 kfree(dip);
5719
5720 /* If we had a csum failure make sure to clear the uptodate flag */
5721 if (err)
5722 clear_bit(BIO_UPTODATE, &bio->bi_flags);
5542 dio_end_io(bio, err); 5723 dio_end_io(bio, err);
5543} 5724}
5544 5725
@@ -5550,20 +5731,23 @@ static void btrfs_endio_direct_write(struct bio *bio, int err)
5550 struct btrfs_trans_handle *trans; 5731 struct btrfs_trans_handle *trans;
5551 struct btrfs_ordered_extent *ordered = NULL; 5732 struct btrfs_ordered_extent *ordered = NULL;
5552 struct extent_state *cached_state = NULL; 5733 struct extent_state *cached_state = NULL;
5734 u64 ordered_offset = dip->logical_offset;
5735 u64 ordered_bytes = dip->bytes;
5553 int ret; 5736 int ret;
5554 5737
5555 if (err) 5738 if (err)
5556 goto out_done; 5739 goto out_done;
5557 5740again:
5558 ret = btrfs_dec_test_ordered_pending(inode, &ordered, 5741 ret = btrfs_dec_test_first_ordered_pending(inode, &ordered,
5559 dip->logical_offset, dip->bytes); 5742 &ordered_offset,
5743 ordered_bytes);
5560 if (!ret) 5744 if (!ret)
5561 goto out_done; 5745 goto out_test;
5562 5746
5563 BUG_ON(!ordered); 5747 BUG_ON(!ordered);
5564 5748
5565 trans = btrfs_join_transaction(root, 1); 5749 trans = btrfs_join_transaction(root);
5566 if (!trans) { 5750 if (IS_ERR(trans)) {
5567 err = -ENOMEM; 5751 err = -ENOMEM;
5568 goto out; 5752 goto out;
5569 } 5753 }
@@ -5609,8 +5793,10 @@ static void btrfs_endio_direct_write(struct bio *bio, int err)
5609 } 5793 }
5610 5794
5611 add_pending_csums(trans, inode, ordered->file_offset, &ordered->list); 5795 add_pending_csums(trans, inode, ordered->file_offset, &ordered->list);
5612 btrfs_ordered_update_i_size(inode, 0, ordered); 5796 ret = btrfs_ordered_update_i_size(inode, 0, ordered);
5613 btrfs_update_inode(trans, root, inode); 5797 if (!ret)
5798 btrfs_update_inode(trans, root, inode);
5799 ret = 0;
5614out_unlock: 5800out_unlock:
5615 unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset, 5801 unlock_extent_cached(&BTRFS_I(inode)->io_tree, ordered->file_offset,
5616 ordered->file_offset + ordered->len - 1, 5802 ordered->file_offset + ordered->len - 1,
@@ -5618,13 +5804,29 @@ out_unlock:
5618out: 5804out:
5619 btrfs_delalloc_release_metadata(inode, ordered->len); 5805 btrfs_delalloc_release_metadata(inode, ordered->len);
5620 btrfs_end_transaction(trans, root); 5806 btrfs_end_transaction(trans, root);
5807 ordered_offset = ordered->file_offset + ordered->len;
5621 btrfs_put_ordered_extent(ordered); 5808 btrfs_put_ordered_extent(ordered);
5622 btrfs_put_ordered_extent(ordered); 5809 btrfs_put_ordered_extent(ordered);
5810
5811out_test:
5812 /*
5813 * our bio might span multiple ordered extents. If we haven't
5814 * completed the accounting for the whole dio, go back and try again
5815 */
5816 if (ordered_offset < dip->logical_offset + dip->bytes) {
5817 ordered_bytes = dip->logical_offset + dip->bytes -
5818 ordered_offset;
5819 goto again;
5820 }
5623out_done: 5821out_done:
5624 bio->bi_private = dip->private; 5822 bio->bi_private = dip->private;
5625 5823
5626 kfree(dip->csums); 5824 kfree(dip->csums);
5627 kfree(dip); 5825 kfree(dip);
5826
5827 /* If we had an error make sure to clear the uptodate flag */
5828 if (err)
5829 clear_bit(BIO_UPTODATE, &bio->bi_flags);
5628 dio_end_io(bio, err); 5830 dio_end_io(bio, err);
5629} 5831}
5630 5832
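
/*
 * Illustrative sketch, not part of the patch: the again/out_test loop in
 * btrfs_endio_direct_write above retires ordered extents one at a time
 * until the whole dio range is accounted for. Runnable userspace analogue
 * of that cursor-advance loop; the extent sizes are made up:
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const uint64_t dio_start = 0, dio_bytes = 12288;
	const uint64_t extent_len[] = { 4096, 4096, 4096 };
	uint64_t offset = dio_start;
	int i = 0;

	while (offset < dio_start + dio_bytes) {
		/* complete one ordered extent, then advance the cursor */
		printf("retire extent at %llu len %llu\n",
		       (unsigned long long)offset,
		       (unsigned long long)extent_len[i]);
		offset += extent_len[i++];
	}
	return 0;
}
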
@@ -5639,13 +5841,207 @@ static int __btrfs_submit_bio_start_direct_io(struct inode *inode, int rw,
5639 return 0; 5841 return 0;
5640} 5842}
5641 5843
5844static void btrfs_end_dio_bio(struct bio *bio, int err)
5845{
5846 struct btrfs_dio_private *dip = bio->bi_private;
5847
5848 if (err) {
5849 printk(KERN_ERR "btrfs direct IO failed ino %llu rw %lu "
5850 "sector %#Lx len %u err no %d\n",
5851 (unsigned long long)btrfs_ino(dip->inode), bio->bi_rw,
5852 (unsigned long long)bio->bi_sector, bio->bi_size, err);
5853 dip->errors = 1;
5854
5855 /*
 5856 * before the atomic variable reaches zero, we must make sure
5857 * dip->errors is perceived to be set.
5858 */
5859 smp_mb__before_atomic_dec();
5860 }
5861
5862 /* if there are more bios still pending for this dio, just exit */
5863 if (!atomic_dec_and_test(&dip->pending_bios))
5864 goto out;
5865
5866 if (dip->errors)
5867 bio_io_error(dip->orig_bio);
5868 else {
5869 set_bit(BIO_UPTODATE, &dip->orig_bio->bi_flags);
5870 bio_endio(dip->orig_bio, 0);
5871 }
5872out:
5873 bio_put(bio);
5874}
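
/*
 * Illustrative sketch, not part of the patch: the split-dio completion above
 * is the "last one out completes the parent" refcount pattern - every
 * sub-bio holds a reference, and errors are recorded before the decrement
 * so the final decrementer is guaranteed to observe them. Runnable C11
 * version with stdatomic; names are invented:
 */
#include <stdatomic.h>
#include <stdio.h>

struct dio_state {
	atomic_int pending;
	atomic_int errors;
};

static void sub_io_done(struct dio_state *s, int err)
{
	if (err)	/* publish the error before dropping our reference */
		atomic_store(&s->errors, 1);

	/* the seq_cst RMW orders the store above before the final check */
	if (atomic_fetch_sub(&s->pending, 1) == 1)
		printf("dio complete, err=%d\n", atomic_load(&s->errors));
}

int main(void)
{
	struct dio_state s = { .pending = 3, .errors = 0 };

	sub_io_done(&s, 0);
	sub_io_done(&s, -5);
	sub_io_done(&s, 0);	/* last drop reports err=1 */
	return 0;
}
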
5875
5876static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev,
5877 u64 first_sector, gfp_t gfp_flags)
5878{
5879 int nr_vecs = bio_get_nr_vecs(bdev);
5880 return btrfs_bio_alloc(bdev, first_sector, nr_vecs, gfp_flags);
5881}
5882
5883static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
5884 int rw, u64 file_offset, int skip_sum,
5885 u32 *csums, int async_submit)
5886{
5887 int write = rw & REQ_WRITE;
5888 struct btrfs_root *root = BTRFS_I(inode)->root;
5889 int ret;
5890
5891 bio_get(bio);
5892 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
5893 if (ret)
5894 goto err;
5895
5896 if (skip_sum)
5897 goto map;
5898
5899 if (write && async_submit) {
5900 ret = btrfs_wq_submit_bio(root->fs_info,
5901 inode, rw, bio, 0, 0,
5902 file_offset,
5903 __btrfs_submit_bio_start_direct_io,
5904 __btrfs_submit_bio_done);
5905 goto err;
5906 } else if (write) {
5907 /*
5908 * If we aren't doing async submit, calculate the csum of the
5909 * bio now.
5910 */
5911 ret = btrfs_csum_one_bio(root, inode, bio, file_offset, 1);
5912 if (ret)
5913 goto err;
5914 } else if (!skip_sum) {
5915 ret = btrfs_lookup_bio_sums_dio(root, inode, bio,
5916 file_offset, csums);
5917 if (ret)
5918 goto err;
5919 }
5920
5921map:
5922 ret = btrfs_map_bio(root, rw, bio, 0, async_submit);
5923err:
5924 bio_put(bio);
5925 return ret;
5926}
5927
5928static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
5929 int skip_sum)
5930{
5931 struct inode *inode = dip->inode;
5932 struct btrfs_root *root = BTRFS_I(inode)->root;
5933 struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
5934 struct bio *bio;
5935 struct bio *orig_bio = dip->orig_bio;
5936 struct bio_vec *bvec = orig_bio->bi_io_vec;
5937 u64 start_sector = orig_bio->bi_sector;
5938 u64 file_offset = dip->logical_offset;
5939 u64 submit_len = 0;
5940 u64 map_length;
5941 int nr_pages = 0;
5942 u32 *csums = dip->csums;
5943 int ret = 0;
5944 int async_submit = 0;
5945 int write = rw & REQ_WRITE;
5946
5947 map_length = orig_bio->bi_size;
5948 ret = btrfs_map_block(map_tree, READ, start_sector << 9,
5949 &map_length, NULL, 0);
5950 if (ret) {
5951 bio_put(orig_bio);
5952 return -EIO;
5953 }
5954
5955 if (map_length >= orig_bio->bi_size) {
5956 bio = orig_bio;
5957 goto submit;
5958 }
5959
5960 async_submit = 1;
5961 bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev, start_sector, GFP_NOFS);
5962 if (!bio)
5963 return -ENOMEM;
5964 bio->bi_private = dip;
5965 bio->bi_end_io = btrfs_end_dio_bio;
5966 atomic_inc(&dip->pending_bios);
5967
5968 while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) {
5969 if (unlikely(map_length < submit_len + bvec->bv_len ||
5970 bio_add_page(bio, bvec->bv_page, bvec->bv_len,
5971 bvec->bv_offset) < bvec->bv_len)) {
5972 /*
 5973 * inc the count before we submit the bio so
 5974 * we know the end IO handler can't fire before
 5975 * the count is raised. Otherwise, the dip might get freed
 5976 * before we're done setting it up
5977 */
5978 atomic_inc(&dip->pending_bios);
5979 ret = __btrfs_submit_dio_bio(bio, inode, rw,
5980 file_offset, skip_sum,
5981 csums, async_submit);
5982 if (ret) {
5983 bio_put(bio);
5984 atomic_dec(&dip->pending_bios);
5985 goto out_err;
5986 }
5987
 5988 /* Writes use the ordered csums */
5989 if (!write && !skip_sum)
5990 csums = csums + nr_pages;
5991 start_sector += submit_len >> 9;
5992 file_offset += submit_len;
5993
5994 submit_len = 0;
5995 nr_pages = 0;
5996
5997 bio = btrfs_dio_bio_alloc(orig_bio->bi_bdev,
5998 start_sector, GFP_NOFS);
5999 if (!bio)
6000 goto out_err;
6001 bio->bi_private = dip;
6002 bio->bi_end_io = btrfs_end_dio_bio;
6003
6004 map_length = orig_bio->bi_size;
6005 ret = btrfs_map_block(map_tree, READ, start_sector << 9,
6006 &map_length, NULL, 0);
6007 if (ret) {
6008 bio_put(bio);
6009 goto out_err;
6010 }
6011 } else {
6012 submit_len += bvec->bv_len;
 6013 nr_pages++;
6014 bvec++;
6015 }
6016 }
6017
6018submit:
6019 ret = __btrfs_submit_dio_bio(bio, inode, rw, file_offset, skip_sum,
6020 csums, async_submit);
6021 if (!ret)
6022 return 0;
6023
6024 bio_put(bio);
6025out_err:
6026 dip->errors = 1;
6027 /*
 6028 * before the atomic variable reaches zero, we must
6029 * make sure dip->errors is perceived to be set.
6030 */
6031 smp_mb__before_atomic_dec();
6032 if (atomic_dec_and_test(&dip->pending_bios))
6033 bio_io_error(dip->orig_bio);
6034
6035 /* bio_end_io() will handle error, so we needn't return it */
6036 return 0;
6037}
6038
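
/*
 * Illustrative sketch, not part of the patch: btrfs_submit_direct_hook above
 * splits the original bio whenever the next page would cross what
 * btrfs_map_block reports as contiguous on disk. Simplified userspace
 * analogue of that chunking loop; stripe_len and the sizes are invented:
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const uint64_t stripe_len = 65536;	/* contiguous device extent */
	const uint64_t page = 4096;
	uint64_t total = 20 * page, submitted = 0, chunk = 0;

	while (submitted + chunk < total) {
		if (chunk + page > stripe_len) {
			printf("submit chunk of %llu\n",
			       (unsigned long long)chunk);
			submitted += chunk;	/* start the next bio */
			chunk = 0;
			continue;
		}
		chunk += page;
	}
	if (chunk)
		printf("submit final chunk of %llu\n",
		       (unsigned long long)chunk);
	return 0;
}
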
5642static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode, 6039static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
5643 loff_t file_offset) 6040 loff_t file_offset)
5644{ 6041{
5645 struct btrfs_root *root = BTRFS_I(inode)->root; 6042 struct btrfs_root *root = BTRFS_I(inode)->root;
5646 struct btrfs_dio_private *dip; 6043 struct btrfs_dio_private *dip;
5647 struct bio_vec *bvec = bio->bi_io_vec; 6044 struct bio_vec *bvec = bio->bi_io_vec;
5648 u64 start;
5649 int skip_sum; 6045 int skip_sum;
5650 int write = rw & REQ_WRITE; 6046 int write = rw & REQ_WRITE;
5651 int ret = 0; 6047 int ret = 0;
@@ -5659,9 +6055,11 @@ static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
5659 } 6055 }
5660 dip->csums = NULL; 6056 dip->csums = NULL;
5661 6057
 5662 if (!skip_sum) { 6058 /* Writes use the ordered csum stuff, so we don't need dip->csums */
6059 if (!write && !skip_sum) {
5663 dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS); 6060 dip->csums = kmalloc(sizeof(u32) * bio->bi_vcnt, GFP_NOFS);
5664 if (!dip->csums) { 6061 if (!dip->csums) {
6062 kfree(dip);
5665 ret = -ENOMEM; 6063 ret = -ENOMEM;
5666 goto free_ordered; 6064 goto free_ordered;
5667 } 6065 }
@@ -5671,7 +6069,6 @@ static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
5671 dip->inode = inode; 6069 dip->inode = inode;
5672 dip->logical_offset = file_offset; 6070 dip->logical_offset = file_offset;
5673 6071
5674 start = dip->logical_offset;
5675 dip->bytes = 0; 6072 dip->bytes = 0;
5676 do { 6073 do {
5677 dip->bytes += bvec->bv_len; 6074 dip->bytes += bvec->bv_len;
@@ -5680,36 +6077,18 @@ static void btrfs_submit_direct(int rw, struct bio *bio, struct inode *inode,
5680 6077
5681 dip->disk_bytenr = (u64)bio->bi_sector << 9; 6078 dip->disk_bytenr = (u64)bio->bi_sector << 9;
5682 bio->bi_private = dip; 6079 bio->bi_private = dip;
6080 dip->errors = 0;
6081 dip->orig_bio = bio;
6082 atomic_set(&dip->pending_bios, 0);
5683 6083
5684 if (write) 6084 if (write)
5685 bio->bi_end_io = btrfs_endio_direct_write; 6085 bio->bi_end_io = btrfs_endio_direct_write;
5686 else 6086 else
5687 bio->bi_end_io = btrfs_endio_direct_read; 6087 bio->bi_end_io = btrfs_endio_direct_read;
5688 6088
5689 ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); 6089 ret = btrfs_submit_direct_hook(rw, dip, skip_sum);
5690 if (ret) 6090 if (!ret)
5691 goto out_err;
5692
5693 if (write && !skip_sum) {
5694 ret = btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info,
5695 inode, rw, bio, 0, 0,
5696 dip->logical_offset,
5697 __btrfs_submit_bio_start_direct_io,
5698 __btrfs_submit_bio_done);
5699 if (ret)
5700 goto out_err;
5701 return; 6091 return;
5702 } else if (!skip_sum)
5703 btrfs_lookup_bio_sums_dio(root, inode, bio,
5704 dip->logical_offset, dip->csums);
5705
5706 ret = btrfs_map_bio(root, rw, bio, 0, 1);
5707 if (ret)
5708 goto out_err;
5709 return;
5710out_err:
5711 kfree(dip->csums);
5712 kfree(dip);
5713free_ordered: 6092free_ordered:
5714 /* 6093 /*
5715 * If this is a write, we need to clean up the reserved space and kill 6094 * If this is a write, we need to clean up the reserved space and kill
@@ -5717,8 +6096,7 @@ free_ordered:
5717 */ 6096 */
5718 if (write) { 6097 if (write) {
5719 struct btrfs_ordered_extent *ordered; 6098 struct btrfs_ordered_extent *ordered;
5720 ordered = btrfs_lookup_ordered_extent(inode, 6099 ordered = btrfs_lookup_ordered_extent(inode, file_offset);
5721 dip->logical_offset);
5722 if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) && 6100 if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) &&
5723 !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) 6101 !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags))
5724 btrfs_free_reserved_extent(root, ordered->start, 6102 btrfs_free_reserved_extent(root, ordered->start,
@@ -5734,6 +6112,7 @@ static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *io
5734 unsigned long nr_segs) 6112 unsigned long nr_segs)
5735{ 6113{
5736 int seg; 6114 int seg;
6115 int i;
5737 size_t size; 6116 size_t size;
5738 unsigned long addr; 6117 unsigned long addr;
5739 unsigned blocksize_mask = root->sectorsize - 1; 6118 unsigned blocksize_mask = root->sectorsize - 1;
@@ -5748,8 +6127,22 @@ static ssize_t check_direct_IO(struct btrfs_root *root, int rw, struct kiocb *io
5748 addr = (unsigned long)iov[seg].iov_base; 6127 addr = (unsigned long)iov[seg].iov_base;
5749 size = iov[seg].iov_len; 6128 size = iov[seg].iov_len;
5750 end += size; 6129 end += size;
5751 if ((addr & blocksize_mask) || (size & blocksize_mask)) 6130 if ((addr & blocksize_mask) || (size & blocksize_mask))
5752 goto out; 6131 goto out;
6132
6133 /* If this is a write we don't need to check anymore */
6134 if (rw & WRITE)
6135 continue;
6136
6137 /*
6138 * Check to make sure we don't have duplicate iov_base's in this
6139 * iovec, if so return EINVAL, otherwise we'll get csum errors
6140 * when reading back.
6141 */
6142 for (i = seg + 1; i < nr_segs; i++) {
6143 if (iov[seg].iov_base == iov[i].iov_base)
6144 goto out;
6145 }
5753 } 6146 }
5754 retval = 0; 6147 retval = 0;
5755out: 6148out:
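
/*
 * Illustrative sketch, not part of the patch: the O(n^2) scan added above
 * rejects read iovecs that alias the same user buffer, since the dio read
 * path would otherwise race against itself and trip csum verification.
 * Standalone, runnable version of the same check:
 */
#include <stdio.h>
#include <sys/uio.h>

static int has_duplicate_base(const struct iovec *iov, unsigned long nr_segs)
{
	unsigned long seg, i;

	for (seg = 0; seg < nr_segs; seg++)
		for (i = seg + 1; i < nr_segs; i++)
			if (iov[seg].iov_base == iov[i].iov_base)
				return 1;
	return 0;
}

int main(void)
{
	char buf[8192];
	struct iovec iov[2] = {
		{ .iov_base = buf, .iov_len = 4096 },
		{ .iov_base = buf, .iov_len = 4096 },	/* aliases seg 0 */
	};

	printf("duplicate: %d\n", has_duplicate_base(iov, 2));
	return 0;
}
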
@@ -5850,7 +6243,7 @@ out:
5850static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 6243static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
5851 __u64 start, __u64 len) 6244 __u64 start, __u64 len)
5852{ 6245{
5853 return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent); 6246 return extent_fiemap(inode, fieinfo, start, len, btrfs_get_extent_fiemap);
5854} 6247}
5855 6248
5856int btrfs_readpage(struct file *file, struct page *page) 6249int btrfs_readpage(struct file *file, struct page *page)
@@ -6100,30 +6493,97 @@ out:
6100 return ret; 6493 return ret;
6101} 6494}
6102 6495
6103static void btrfs_truncate(struct inode *inode) 6496static int btrfs_truncate(struct inode *inode)
6104{ 6497{
6105 struct btrfs_root *root = BTRFS_I(inode)->root; 6498 struct btrfs_root *root = BTRFS_I(inode)->root;
6499 struct btrfs_block_rsv *rsv;
6106 int ret; 6500 int ret;
6501 int err = 0;
6107 struct btrfs_trans_handle *trans; 6502 struct btrfs_trans_handle *trans;
6108 unsigned long nr; 6503 unsigned long nr;
6109 u64 mask = root->sectorsize - 1; 6504 u64 mask = root->sectorsize - 1;
6110 6505
6111 if (!S_ISREG(inode->i_mode)) {
6112 WARN_ON(1);
6113 return;
6114 }
6115
6116 ret = btrfs_truncate_page(inode->i_mapping, inode->i_size); 6506 ret = btrfs_truncate_page(inode->i_mapping, inode->i_size);
6117 if (ret) 6507 if (ret)
6118 return; 6508 return ret;
6119 6509
6120 btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); 6510 btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
6121 btrfs_ordered_update_i_size(inode, inode->i_size, NULL); 6511 btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
6122 6512
6123 trans = btrfs_start_transaction(root, 0); 6513 /*
6124 BUG_ON(IS_ERR(trans)); 6514 * Yes ladies and gentelment, this is indeed ugly. The fact is we have
6125 btrfs_set_trans_block_group(trans, inode); 6515 * 3 things going on here
6126 trans->block_rsv = root->orphan_block_rsv; 6516 *
6517 * 1) We need to reserve space for our orphan item and the space to
6518 * delete our orphan item. Lord knows we don't want to have a dangling
6519 * orphan item because we didn't reserve space to remove it.
6520 *
6521 * 2) We need to reserve space to update our inode.
6522 *
6523 * 3) We need to have something to cache all the space that is going to
 6524 * be freed up by the truncate operation, but also have some slack
6525 * space reserved in case it uses space during the truncate (thank you
6526 * very much snapshotting).
6527 *
 6528 * And we need these to all be separate. The fact is we can use a lot of
 6529 * space doing the truncate, and we have no earthly idea how much space
 6530 * we will use, so we need the truncate reservation to be separate so it
6531 * doesn't end up using space reserved for updating the inode or
6532 * removing the orphan item. We also need to be able to stop the
6533 * transaction and start a new one, which means we need to be able to
 6534 * update the inode several times, and we have no way of knowing how
 6535 * many times that will be, so we can't just reserve 1 item for the
 6536 * entirety of the operation, so that has to be done separately as well.
6537 * Then there is the orphan item, which does indeed need to be held on
6538 * to for the whole operation, and we need nobody to touch this reserved
6539 * space except the orphan code.
6540 *
6541 * So that leaves us with
6542 *
6543 * 1) root->orphan_block_rsv - for the orphan deletion.
6544 * 2) rsv - for the truncate reservation, which we will steal from the
6545 * transaction reservation.
 6546 * 3) fs_info->trans_block_rsv - this will have 1 item's worth left for
6547 * updating the inode.
6548 */
6549 rsv = btrfs_alloc_block_rsv(root);
6550 if (!rsv)
6551 return -ENOMEM;
6552 btrfs_add_durable_block_rsv(root->fs_info, rsv);
6553
6554 trans = btrfs_start_transaction(root, 4);
6555 if (IS_ERR(trans)) {
6556 err = PTR_ERR(trans);
6557 goto out;
6558 }
6559
6560 /*
6561 * Reserve space for the truncate process. Truncate should be adding
6562 * space, but if there are snapshots it may end up using space.
6563 */
6564 ret = btrfs_truncate_reserve_metadata(trans, root, rsv);
6565 BUG_ON(ret);
6566
6567 ret = btrfs_orphan_add(trans, inode);
6568 if (ret) {
6569 btrfs_end_transaction(trans, root);
6570 goto out;
6571 }
6572
6573 nr = trans->blocks_used;
6574 btrfs_end_transaction(trans, root);
6575 btrfs_btree_balance_dirty(root, nr);
6576
6577 /*
6578 * Ok so we've already migrated our bytes over for the truncate, so here
6579 * just reserve the one slot we need for updating the inode.
6580 */
6581 trans = btrfs_start_transaction(root, 1);
6582 if (IS_ERR(trans)) {
6583 err = PTR_ERR(trans);
6584 goto out;
6585 }
6586 trans->block_rsv = rsv;
6127 6587
6128 /* 6588 /*
6129 * setattr is responsible for setting the ordered_data_close flag, 6589 * setattr is responsible for setting the ordered_data_close flag,
@@ -6147,30 +6607,33 @@ static void btrfs_truncate(struct inode *inode)
6147 6607
6148 while (1) { 6608 while (1) {
6149 if (!trans) { 6609 if (!trans) {
6150 trans = btrfs_start_transaction(root, 0); 6610 trans = btrfs_start_transaction(root, 3);
6151 BUG_ON(IS_ERR(trans)); 6611 if (IS_ERR(trans)) {
6152 btrfs_set_trans_block_group(trans, inode); 6612 err = PTR_ERR(trans);
6153 trans->block_rsv = root->orphan_block_rsv; 6613 goto out;
6154 } 6614 }
6155 6615
6156 ret = btrfs_block_rsv_check(trans, root, 6616 ret = btrfs_truncate_reserve_metadata(trans, root,
6157 root->orphan_block_rsv, 0, 5); 6617 rsv);
6158 if (ret) {
6159 BUG_ON(ret != -EAGAIN);
6160 ret = btrfs_commit_transaction(trans, root);
6161 BUG_ON(ret); 6618 BUG_ON(ret);
6162 trans = NULL; 6619
6163 continue; 6620 trans->block_rsv = rsv;
6164 } 6621 }
6165 6622
6166 ret = btrfs_truncate_inode_items(trans, root, inode, 6623 ret = btrfs_truncate_inode_items(trans, root, inode,
6167 inode->i_size, 6624 inode->i_size,
6168 BTRFS_EXTENT_DATA_KEY); 6625 BTRFS_EXTENT_DATA_KEY);
6169 if (ret != -EAGAIN) 6626 if (ret != -EAGAIN) {
6627 err = ret;
6170 break; 6628 break;
6629 }
6171 6630
6631 trans->block_rsv = &root->fs_info->trans_block_rsv;
6172 ret = btrfs_update_inode(trans, root, inode); 6632 ret = btrfs_update_inode(trans, root, inode);
6173 BUG_ON(ret); 6633 if (ret) {
6634 err = ret;
6635 break;
6636 }
6174 6637
6175 nr = trans->blocks_used; 6638 nr = trans->blocks_used;
6176 btrfs_end_transaction(trans, root); 6639 btrfs_end_transaction(trans, root);
@@ -6179,32 +6642,48 @@ static void btrfs_truncate(struct inode *inode)
6179 } 6642 }
6180 6643
6181 if (ret == 0 && inode->i_nlink > 0) { 6644 if (ret == 0 && inode->i_nlink > 0) {
6645 trans->block_rsv = root->orphan_block_rsv;
6182 ret = btrfs_orphan_del(trans, inode); 6646 ret = btrfs_orphan_del(trans, inode);
6183 BUG_ON(ret); 6647 if (ret)
6648 err = ret;
6649 } else if (ret && inode->i_nlink > 0) {
6650 /*
6651 * Failed to do the truncate, remove us from the in memory
6652 * orphan list.
6653 */
6654 ret = btrfs_orphan_del(NULL, inode);
6184 } 6655 }
6185 6656
6657 trans->block_rsv = &root->fs_info->trans_block_rsv;
6186 ret = btrfs_update_inode(trans, root, inode); 6658 ret = btrfs_update_inode(trans, root, inode);
6187 BUG_ON(ret); 6659 if (ret && !err)
6660 err = ret;
6188 6661
6189 nr = trans->blocks_used; 6662 nr = trans->blocks_used;
6190 ret = btrfs_end_transaction_throttle(trans, root); 6663 ret = btrfs_end_transaction_throttle(trans, root);
6191 BUG_ON(ret);
6192 btrfs_btree_balance_dirty(root, nr); 6664 btrfs_btree_balance_dirty(root, nr);
6665
6666out:
6667 btrfs_free_block_rsv(root, rsv);
6668
6669 if (ret && !err)
6670 err = ret;
6671
6672 return err;
6193} 6673}
6194 6674
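
/*
 * Illustrative sketch, not part of the patch: the truncate loop above is a
 * chunked, restartable operation - each pass runs under a fresh transaction
 * and reservation, drops what it can, and -EAGAIN means "come back with a
 * new transaction". Compilable skeleton of that shape; everything here is
 * a stand-in, not a btrfs API:
 */
#include <stdio.h>

#define EAGAIN_SIM 11

static int chunks_left = 3;

static int truncate_one_chunk(void)
{
	/* drop a bounded amount of items, then ask to be called again */
	return --chunks_left > 0 ? -EAGAIN_SIM : 0;
}

int main(void)
{
	int ret;

	do {
		/* new transaction + fresh metadata reservation per pass */
		ret = truncate_one_chunk();
		printf("pass done, ret=%d\n", ret);
	} while (ret == -EAGAIN_SIM);
	return 0;
}
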
6195/* 6675/*
6196 * create a new subvolume directory/inode (helper for the ioctl). 6676 * create a new subvolume directory/inode (helper for the ioctl).
6197 */ 6677 */
6198int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, 6678int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
6199 struct btrfs_root *new_root, 6679 struct btrfs_root *new_root, u64 new_dirid)
6200 u64 new_dirid, u64 alloc_hint)
6201{ 6680{
6202 struct inode *inode; 6681 struct inode *inode;
6203 int err; 6682 int err;
6204 u64 index = 0; 6683 u64 index = 0;
6205 6684
6206 inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, new_dirid, 6685 inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, new_dirid,
6207 new_dirid, alloc_hint, S_IFDIR | 0700, &index); 6686 new_dirid, S_IFDIR | 0700, &index);
6208 if (IS_ERR(inode)) 6687 if (IS_ERR(inode))
6209 return PTR_ERR(inode); 6688 return PTR_ERR(inode);
6210 inode->i_op = &btrfs_dir_inode_operations; 6689 inode->i_op = &btrfs_dir_inode_operations;
@@ -6256,19 +6735,21 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
6256 ei->index_cnt = (u64)-1; 6735 ei->index_cnt = (u64)-1;
6257 ei->last_unlink_trans = 0; 6736 ei->last_unlink_trans = 0;
6258 6737
6259 spin_lock_init(&ei->accounting_lock);
6260 atomic_set(&ei->outstanding_extents, 0); 6738 atomic_set(&ei->outstanding_extents, 0);
6261 ei->reserved_extents = 0; 6739 atomic_set(&ei->reserved_extents, 0);
6262 6740
6263 ei->ordered_data_close = 0; 6741 ei->ordered_data_close = 0;
6264 ei->orphan_meta_reserved = 0; 6742 ei->orphan_meta_reserved = 0;
6265 ei->dummy_inode = 0; 6743 ei->dummy_inode = 0;
6266 ei->force_compress = 0; 6744 ei->in_defrag = 0;
6745 ei->force_compress = BTRFS_COMPRESS_NONE;
6746
6747 ei->delayed_node = NULL;
6267 6748
6268 inode = &ei->vfs_inode; 6749 inode = &ei->vfs_inode;
6269 extent_map_tree_init(&ei->extent_tree, GFP_NOFS); 6750 extent_map_tree_init(&ei->extent_tree);
6270 extent_io_tree_init(&ei->io_tree, &inode->i_data, GFP_NOFS); 6751 extent_io_tree_init(&ei->io_tree, &inode->i_data);
6271 extent_io_tree_init(&ei->io_failure_tree, &inode->i_data, GFP_NOFS); 6752 extent_io_tree_init(&ei->io_failure_tree, &inode->i_data);
6272 mutex_init(&ei->log_mutex); 6753 mutex_init(&ei->log_mutex);
6273 btrfs_ordered_inode_tree_init(&ei->ordered_tree); 6754 btrfs_ordered_inode_tree_init(&ei->ordered_tree);
6274 INIT_LIST_HEAD(&ei->i_orphan); 6755 INIT_LIST_HEAD(&ei->i_orphan);
@@ -6279,6 +6760,13 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
6279 return inode; 6760 return inode;
6280} 6761}
6281 6762
6763static void btrfs_i_callback(struct rcu_head *head)
6764{
6765 struct inode *inode = container_of(head, struct inode, i_rcu);
6766 INIT_LIST_HEAD(&inode->i_dentry);
6767 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
6768}
6769
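
/*
 * Illustrative sketch, not part of the patch: freeing the inode through
 * call_rcu() lets lock-free readers that still hold a pointer finish
 * before the memory is recycled. Userspace simulation of the shape of
 * that API - queue a callback, run it after the "grace period"; this is
 * not a real RCU implementation and all names are invented:
 */
#include <stdio.h>
#include <stdlib.h>

struct rcu_head_sim {
	struct rcu_head_sim *next;
	void (*func)(struct rcu_head_sim *head);
};

static struct rcu_head_sim *pending;

static void call_rcu_sim(struct rcu_head_sim *head,
			 void (*func)(struct rcu_head_sim *head))
{
	head->func = func;
	head->next = pending;
	pending = head;	/* deferred, not freed inline */
}

static void grace_period_sim(void)
{
	while (pending) {
		struct rcu_head_sim *head = pending;

		pending = head->next;
		head->func(head);
	}
}

struct fake_inode {
	struct rcu_head_sim rcu;	/* first member, so the cast below works */
};

static void free_fake_inode(struct rcu_head_sim *head)
{
	free((struct fake_inode *)head);
	puts("inode freed after grace period");
}

int main(void)
{
	struct fake_inode *inode = malloc(sizeof(*inode));

	if (!inode)
		return 1;
	call_rcu_sim(&inode->rcu, free_fake_inode);
	grace_period_sim();
	return 0;
}
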
6282void btrfs_destroy_inode(struct inode *inode) 6770void btrfs_destroy_inode(struct inode *inode)
6283{ 6771{
6284 struct btrfs_ordered_extent *ordered; 6772 struct btrfs_ordered_extent *ordered;
@@ -6287,7 +6775,7 @@ void btrfs_destroy_inode(struct inode *inode)
6287 WARN_ON(!list_empty(&inode->i_dentry)); 6775 WARN_ON(!list_empty(&inode->i_dentry));
6288 WARN_ON(inode->i_data.nrpages); 6776 WARN_ON(inode->i_data.nrpages);
6289 WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents)); 6777 WARN_ON(atomic_read(&BTRFS_I(inode)->outstanding_extents));
6290 WARN_ON(BTRFS_I(inode)->reserved_extents); 6778 WARN_ON(atomic_read(&BTRFS_I(inode)->reserved_extents));
6291 6779
6292 /* 6780 /*
6293 * This can happen where we create an inode, but somebody else also 6781 * This can happen where we create an inode, but somebody else also
@@ -6310,8 +6798,8 @@ void btrfs_destroy_inode(struct inode *inode)
6310 6798
6311 spin_lock(&root->orphan_lock); 6799 spin_lock(&root->orphan_lock);
6312 if (!list_empty(&BTRFS_I(inode)->i_orphan)) { 6800 if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
6313 printk(KERN_INFO "BTRFS: inode %lu still on the orphan list\n", 6801 printk(KERN_INFO "BTRFS: inode %llu still on the orphan list\n",
6314 inode->i_ino); 6802 (unsigned long long)btrfs_ino(inode));
6315 list_del_init(&BTRFS_I(inode)->i_orphan); 6803 list_del_init(&BTRFS_I(inode)->i_orphan);
6316 } 6804 }
6317 spin_unlock(&root->orphan_lock); 6805 spin_unlock(&root->orphan_lock);
@@ -6333,14 +6821,16 @@ void btrfs_destroy_inode(struct inode *inode)
6333 inode_tree_del(inode); 6821 inode_tree_del(inode);
6334 btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); 6822 btrfs_drop_extent_cache(inode, 0, (u64)-1, 0);
6335free: 6823free:
6336 kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); 6824 btrfs_remove_delayed_node(inode);
6825 call_rcu(&inode->i_rcu, btrfs_i_callback);
6337} 6826}
6338 6827
6339int btrfs_drop_inode(struct inode *inode) 6828int btrfs_drop_inode(struct inode *inode)
6340{ 6829{
6341 struct btrfs_root *root = BTRFS_I(inode)->root; 6830 struct btrfs_root *root = BTRFS_I(inode)->root;
6342 6831
6343 if (btrfs_root_refs(&root->root_item) == 0) 6832 if (btrfs_root_refs(&root->root_item) == 0 &&
6833 !is_free_space_inode(root, inode))
6344 return 1; 6834 return 1;
6345 else 6835 else
6346 return generic_drop_inode(inode); 6836 return generic_drop_inode(inode);
@@ -6363,6 +6853,8 @@ void btrfs_destroy_cachep(void)
6363 kmem_cache_destroy(btrfs_transaction_cachep); 6853 kmem_cache_destroy(btrfs_transaction_cachep);
6364 if (btrfs_path_cachep) 6854 if (btrfs_path_cachep)
6365 kmem_cache_destroy(btrfs_path_cachep); 6855 kmem_cache_destroy(btrfs_path_cachep);
6856 if (btrfs_free_space_cachep)
6857 kmem_cache_destroy(btrfs_free_space_cachep);
6366} 6858}
6367 6859
6368int btrfs_init_cachep(void) 6860int btrfs_init_cachep(void)
@@ -6391,6 +6883,12 @@ int btrfs_init_cachep(void)
6391 if (!btrfs_path_cachep) 6883 if (!btrfs_path_cachep)
6392 goto fail; 6884 goto fail;
6393 6885
6886 btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space_cache",
6887 sizeof(struct btrfs_free_space), 0,
6888 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, NULL);
6889 if (!btrfs_free_space_cachep)
6890 goto fail;
6891
6394 return 0; 6892 return 0;
6395fail: 6893fail:
6396 btrfs_destroy_cachep(); 6894 btrfs_destroy_cachep();
@@ -6409,6 +6907,26 @@ static int btrfs_getattr(struct vfsmount *mnt,
6409 return 0; 6907 return 0;
6410} 6908}
6411 6909
6910/*
6911 * If a file is moved, it will inherit the cow and compression flags of the new
6912 * directory.
6913 */
6914static void fixup_inode_flags(struct inode *dir, struct inode *inode)
6915{
6916 struct btrfs_inode *b_dir = BTRFS_I(dir);
6917 struct btrfs_inode *b_inode = BTRFS_I(inode);
6918
6919 if (b_dir->flags & BTRFS_INODE_NODATACOW)
6920 b_inode->flags |= BTRFS_INODE_NODATACOW;
6921 else
6922 b_inode->flags &= ~BTRFS_INODE_NODATACOW;
6923
6924 if (b_dir->flags & BTRFS_INODE_COMPRESS)
6925 b_inode->flags |= BTRFS_INODE_COMPRESS;
6926 else
6927 b_inode->flags &= ~BTRFS_INODE_COMPRESS;
6928}
6929
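
/*
 * Illustrative sketch, not part of the patch: the per-flag if/else pairs in
 * fixup_inode_flags above are equivalent to masking the inherited bits out
 * of the child and copying them in from the parent. The flag values below
 * are invented for illustration:
 */
#include <stdio.h>

#define FLAG_NODATACOW	0x1u
#define FLAG_COMPRESS	0x2u
#define INHERIT_MASK	(FLAG_NODATACOW | FLAG_COMPRESS)

static unsigned int inherit_flags(unsigned int dir, unsigned int inode)
{
	return (inode & ~INHERIT_MASK) | (dir & INHERIT_MASK);
}

int main(void)
{
	/* child had compress set; moving under a nodatacow dir swaps it */
	printf("%#x\n", inherit_flags(FLAG_NODATACOW, FLAG_COMPRESS));
	return 0;
}
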
6412static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, 6930static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
6413 struct inode *new_dir, struct dentry *new_dentry) 6931 struct inode *new_dir, struct dentry *new_dentry)
6414{ 6932{
@@ -6421,16 +6939,17 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
6421 u64 index = 0; 6939 u64 index = 0;
6422 u64 root_objectid; 6940 u64 root_objectid;
6423 int ret; 6941 int ret;
6942 u64 old_ino = btrfs_ino(old_inode);
6424 6943
6425 if (new_dir->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) 6944 if (btrfs_ino(new_dir) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
6426 return -EPERM; 6945 return -EPERM;
6427 6946
6428 /* we only allow rename subvolume link between subvolumes */ 6947 /* we only allow rename subvolume link between subvolumes */
6429 if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest) 6948 if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
6430 return -EXDEV; 6949 return -EXDEV;
6431 6950
6432 if (old_inode->i_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID || 6951 if (old_ino == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID ||
6433 (new_inode && new_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) 6952 (new_inode && btrfs_ino(new_inode) == BTRFS_FIRST_FREE_OBJECTID))
6434 return -ENOTEMPTY; 6953 return -ENOTEMPTY;
6435 6954
6436 if (S_ISDIR(old_inode->i_mode) && new_inode && 6955 if (S_ISDIR(old_inode->i_mode) && new_inode &&
@@ -6446,7 +6965,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
6446 filemap_flush(old_inode->i_mapping); 6965 filemap_flush(old_inode->i_mapping);
6447 6966
6448 /* close the racy window with snapshot create/destroy ioctl */ 6967 /* close the racy window with snapshot create/destroy ioctl */
6449 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) 6968 if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
6450 down_read(&root->fs_info->subvol_sem); 6969 down_read(&root->fs_info->subvol_sem);
6451 /* 6970 /*
6452 * We want to reserve the absolute worst case amount of items. So if 6971 * We want to reserve the absolute worst case amount of items. So if
@@ -6457,10 +6976,10 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
6457 * should cover the worst case number of items we'll modify. 6976 * should cover the worst case number of items we'll modify.
6458 */ 6977 */
6459 trans = btrfs_start_transaction(root, 20); 6978 trans = btrfs_start_transaction(root, 20);
6460 if (IS_ERR(trans)) 6979 if (IS_ERR(trans)) {
6461 return PTR_ERR(trans); 6980 ret = PTR_ERR(trans);
6462 6981 goto out_notrans;
6463 btrfs_set_trans_block_group(trans, new_dir); 6982 }
6464 6983
6465 if (dest != root) 6984 if (dest != root)
6466 btrfs_record_root_in_trans(trans, dest); 6985 btrfs_record_root_in_trans(trans, dest);
@@ -6469,15 +6988,15 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
6469 if (ret) 6988 if (ret)
6470 goto out_fail; 6989 goto out_fail;
6471 6990
6472 if (unlikely(old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) { 6991 if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
6473 /* force full log commit if subvolume involved. */ 6992 /* force full log commit if subvolume involved. */
6474 root->fs_info->last_trans_log_full_commit = trans->transid; 6993 root->fs_info->last_trans_log_full_commit = trans->transid;
6475 } else { 6994 } else {
6476 ret = btrfs_insert_inode_ref(trans, dest, 6995 ret = btrfs_insert_inode_ref(trans, dest,
6477 new_dentry->d_name.name, 6996 new_dentry->d_name.name,
6478 new_dentry->d_name.len, 6997 new_dentry->d_name.len,
6479 old_inode->i_ino, 6998 old_ino,
6480 new_dir->i_ino, index); 6999 btrfs_ino(new_dir), index);
6481 if (ret) 7000 if (ret)
6482 goto out_fail; 7001 goto out_fail;
6483 /* 7002 /*
@@ -6493,10 +7012,8 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
6493 * make sure the inode gets flushed if it is replacing 7012 * make sure the inode gets flushed if it is replacing
6494 * something. 7013 * something.
6495 */ 7014 */
6496 if (new_inode && new_inode->i_size && 7015 if (new_inode && new_inode->i_size && S_ISREG(old_inode->i_mode))
6497 old_inode && S_ISREG(old_inode->i_mode)) {
6498 btrfs_add_ordered_operation(trans, root, old_inode); 7016 btrfs_add_ordered_operation(trans, root, old_inode);
6499 }
6500 7017
6501 old_dir->i_ctime = old_dir->i_mtime = ctime; 7018 old_dir->i_ctime = old_dir->i_mtime = ctime;
6502 new_dir->i_ctime = new_dir->i_mtime = ctime; 7019 new_dir->i_ctime = new_dir->i_mtime = ctime;
@@ -6505,23 +7022,24 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
6505 if (old_dentry->d_parent != new_dentry->d_parent) 7022 if (old_dentry->d_parent != new_dentry->d_parent)
6506 btrfs_record_unlink_dir(trans, old_dir, old_inode, 1); 7023 btrfs_record_unlink_dir(trans, old_dir, old_inode, 1);
6507 7024
6508 if (unlikely(old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)) { 7025 if (unlikely(old_ino == BTRFS_FIRST_FREE_OBJECTID)) {
6509 root_objectid = BTRFS_I(old_inode)->root->root_key.objectid; 7026 root_objectid = BTRFS_I(old_inode)->root->root_key.objectid;
6510 ret = btrfs_unlink_subvol(trans, root, old_dir, root_objectid, 7027 ret = btrfs_unlink_subvol(trans, root, old_dir, root_objectid,
6511 old_dentry->d_name.name, 7028 old_dentry->d_name.name,
6512 old_dentry->d_name.len); 7029 old_dentry->d_name.len);
6513 } else { 7030 } else {
6514 btrfs_inc_nlink(old_dentry->d_inode); 7031 ret = __btrfs_unlink_inode(trans, root, old_dir,
6515 ret = btrfs_unlink_inode(trans, root, old_dir, 7032 old_dentry->d_inode,
6516 old_dentry->d_inode, 7033 old_dentry->d_name.name,
6517 old_dentry->d_name.name, 7034 old_dentry->d_name.len);
6518 old_dentry->d_name.len); 7035 if (!ret)
7036 ret = btrfs_update_inode(trans, root, old_inode);
6519 } 7037 }
6520 BUG_ON(ret); 7038 BUG_ON(ret);
6521 7039
6522 if (new_inode) { 7040 if (new_inode) {
6523 new_inode->i_ctime = CURRENT_TIME; 7041 new_inode->i_ctime = CURRENT_TIME;
6524 if (unlikely(new_inode->i_ino == 7042 if (unlikely(btrfs_ino(new_inode) ==
6525 BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) { 7043 BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)) {
6526 root_objectid = BTRFS_I(new_inode)->location.objectid; 7044 root_objectid = BTRFS_I(new_inode)->location.objectid;
6527 ret = btrfs_unlink_subvol(trans, dest, new_dir, 7045 ret = btrfs_unlink_subvol(trans, dest, new_dir,
@@ -6542,20 +7060,23 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
6542 } 7060 }
6543 } 7061 }
6544 7062
7063 fixup_inode_flags(new_dir, old_inode);
7064
6545 ret = btrfs_add_link(trans, new_dir, old_inode, 7065 ret = btrfs_add_link(trans, new_dir, old_inode,
6546 new_dentry->d_name.name, 7066 new_dentry->d_name.name,
6547 new_dentry->d_name.len, 0, index); 7067 new_dentry->d_name.len, 0, index);
6548 BUG_ON(ret); 7068 BUG_ON(ret);
6549 7069
6550 if (old_inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) { 7070 if (old_ino != BTRFS_FIRST_FREE_OBJECTID) {
6551 btrfs_log_new_name(trans, old_inode, old_dir, 7071 struct dentry *parent = dget_parent(new_dentry);
6552 new_dentry->d_parent); 7072 btrfs_log_new_name(trans, old_inode, old_dir, parent);
7073 dput(parent);
6553 btrfs_end_log_trans(root); 7074 btrfs_end_log_trans(root);
6554 } 7075 }
6555out_fail: 7076out_fail:
6556 btrfs_end_transaction_throttle(trans, root); 7077 btrfs_end_transaction_throttle(trans, root);
6557 7078out_notrans:
6558 if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) 7079 if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
6559 up_read(&root->fs_info->subvol_sem); 7080 up_read(&root->fs_info->subvol_sem);
6560 7081
6561 return ret; 7082 return ret;
@@ -6609,38 +7130,6 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput)
6609 return 0; 7130 return 0;
6610} 7131}
6611 7132
6612int btrfs_start_one_delalloc_inode(struct btrfs_root *root, int delay_iput)
6613{
6614 struct btrfs_inode *binode;
6615 struct inode *inode = NULL;
6616
6617 spin_lock(&root->fs_info->delalloc_lock);
6618 while (!list_empty(&root->fs_info->delalloc_inodes)) {
6619 binode = list_entry(root->fs_info->delalloc_inodes.next,
6620 struct btrfs_inode, delalloc_inodes);
6621 inode = igrab(&binode->vfs_inode);
6622 if (inode) {
6623 list_move_tail(&binode->delalloc_inodes,
6624 &root->fs_info->delalloc_inodes);
6625 break;
6626 }
6627
6628 list_del_init(&binode->delalloc_inodes);
6629 cond_resched_lock(&root->fs_info->delalloc_lock);
6630 }
6631 spin_unlock(&root->fs_info->delalloc_lock);
6632
6633 if (inode) {
6634 write_inode_now(inode, 0);
6635 if (delay_iput)
6636 btrfs_add_delayed_iput(inode);
6637 else
6638 iput(inode);
6639 return 1;
6640 }
6641 return 0;
6642}
6643
6644static int btrfs_symlink(struct inode *dir, struct dentry *dentry, 7133static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
6645 const char *symname) 7134 const char *symname)
6646{ 7135{
@@ -6664,9 +7153,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
6664 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root)) 7153 if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
6665 return -ENAMETOOLONG; 7154 return -ENAMETOOLONG;
6666 7155
6667 err = btrfs_find_free_objectid(NULL, root, dir->i_ino, &objectid);
6668 if (err)
6669 return err;
6670 /* 7156 /*
6671 * 2 items for inode item and ref 7157 * 2 items for inode item and ref
6672 * 2 items for dir items 7158 * 2 items for dir items
@@ -6676,25 +7162,25 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
6676 if (IS_ERR(trans)) 7162 if (IS_ERR(trans))
6677 return PTR_ERR(trans); 7163 return PTR_ERR(trans);
6678 7164
6679 btrfs_set_trans_block_group(trans, dir); 7165 err = btrfs_find_free_ino(root, &objectid);
7166 if (err)
7167 goto out_unlock;
6680 7168
6681 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, 7169 inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name,
6682 dentry->d_name.len, 7170 dentry->d_name.len, btrfs_ino(dir), objectid,
6683 dentry->d_parent->d_inode->i_ino, objectid, 7171 S_IFLNK|S_IRWXUGO, &index);
6684 BTRFS_I(dir)->block_group, S_IFLNK|S_IRWXUGO, 7172 if (IS_ERR(inode)) {
6685 &index); 7173 err = PTR_ERR(inode);
6686 err = PTR_ERR(inode);
6687 if (IS_ERR(inode))
6688 goto out_unlock; 7174 goto out_unlock;
7175 }
6689 7176
6690 err = btrfs_init_inode_security(trans, inode, dir); 7177 err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
6691 if (err) { 7178 if (err) {
6692 drop_inode = 1; 7179 drop_inode = 1;
6693 goto out_unlock; 7180 goto out_unlock;
6694 } 7181 }
6695 7182
6696 btrfs_set_trans_block_group(trans, inode); 7183 err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
6697 err = btrfs_add_nondir(trans, dentry, inode, 0, index);
6698 if (err) 7184 if (err)
6699 drop_inode = 1; 7185 drop_inode = 1;
6700 else { 7186 else {
@@ -6704,14 +7190,12 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
6704 inode->i_op = &btrfs_file_inode_operations; 7190 inode->i_op = &btrfs_file_inode_operations;
6705 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; 7191 BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
6706 } 7192 }
6707 btrfs_update_inode_block_group(trans, inode);
6708 btrfs_update_inode_block_group(trans, dir);
6709 if (drop_inode) 7193 if (drop_inode)
6710 goto out_unlock; 7194 goto out_unlock;
6711 7195
6712 path = btrfs_alloc_path(); 7196 path = btrfs_alloc_path();
6713 BUG_ON(!path); 7197 BUG_ON(!path);
6714 key.objectid = inode->i_ino; 7198 key.objectid = btrfs_ino(inode);
6715 key.offset = 0; 7199 key.offset = 0;
6716 btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY); 7200 btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
6717 datasize = btrfs_file_extent_calc_inline_size(name_len); 7201 datasize = btrfs_file_extent_calc_inline_size(name_len);
@@ -6719,6 +7203,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
6719 datasize); 7203 datasize);
6720 if (err) { 7204 if (err) {
6721 drop_inode = 1; 7205 drop_inode = 1;
7206 btrfs_free_path(path);
6722 goto out_unlock; 7207 goto out_unlock;
6723 } 7208 }
6724 leaf = path->nodes[0]; 7209 leaf = path->nodes[0];
@@ -6757,27 +7242,34 @@ out_unlock:
6757 return err; 7242 return err;
6758} 7243}
6759 7244
6760int btrfs_prealloc_file_range(struct inode *inode, int mode, 7245static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
6761 u64 start, u64 num_bytes, u64 min_size, 7246 u64 start, u64 num_bytes, u64 min_size,
6762 loff_t actual_len, u64 *alloc_hint) 7247 loff_t actual_len, u64 *alloc_hint,
7248 struct btrfs_trans_handle *trans)
6763{ 7249{
6764 struct btrfs_trans_handle *trans;
6765 struct btrfs_root *root = BTRFS_I(inode)->root; 7250 struct btrfs_root *root = BTRFS_I(inode)->root;
6766 struct btrfs_key ins; 7251 struct btrfs_key ins;
6767 u64 cur_offset = start; 7252 u64 cur_offset = start;
7253 u64 i_size;
6768 int ret = 0; 7254 int ret = 0;
7255 bool own_trans = true;
6769 7256
7257 if (trans)
7258 own_trans = false;
6770 while (num_bytes > 0) { 7259 while (num_bytes > 0) {
6771 trans = btrfs_start_transaction(root, 3); 7260 if (own_trans) {
6772 if (IS_ERR(trans)) { 7261 trans = btrfs_start_transaction(root, 3);
6773 ret = PTR_ERR(trans); 7262 if (IS_ERR(trans)) {
6774 break; 7263 ret = PTR_ERR(trans);
7264 break;
7265 }
6775 } 7266 }
6776 7267
6777 ret = btrfs_reserve_extent(trans, root, num_bytes, min_size, 7268 ret = btrfs_reserve_extent(trans, root, num_bytes, min_size,
6778 0, *alloc_hint, (u64)-1, &ins, 1); 7269 0, *alloc_hint, (u64)-1, &ins, 1);
6779 if (ret) { 7270 if (ret) {
6780 btrfs_end_transaction(trans, root); 7271 if (own_trans)
7272 btrfs_end_transaction(trans, root);
6781 break; 7273 break;
6782 } 7274 }
6783 7275
@@ -6800,121 +7292,38 @@ int btrfs_prealloc_file_range(struct inode *inode, int mode,
6800 (actual_len > inode->i_size) && 7292 (actual_len > inode->i_size) &&
6801 (cur_offset > inode->i_size)) { 7293 (cur_offset > inode->i_size)) {
6802 if (cur_offset > actual_len) 7294 if (cur_offset > actual_len)
6803 i_size_write(inode, actual_len); 7295 i_size = actual_len;
6804 else 7296 else
6805 i_size_write(inode, cur_offset); 7297 i_size = cur_offset;
6806 i_size_write(inode, cur_offset); 7298 i_size_write(inode, i_size);
6807 btrfs_ordered_update_i_size(inode, cur_offset, NULL); 7299 btrfs_ordered_update_i_size(inode, i_size, NULL);
6808 } 7300 }
6809 7301
6810 ret = btrfs_update_inode(trans, root, inode); 7302 ret = btrfs_update_inode(trans, root, inode);
6811 BUG_ON(ret); 7303 BUG_ON(ret);
6812 7304
6813 btrfs_end_transaction(trans, root); 7305 if (own_trans)
7306 btrfs_end_transaction(trans, root);
6814 } 7307 }
6815 return ret; 7308 return ret;
6816} 7309}
6817 7310
6818static long btrfs_fallocate(struct inode *inode, int mode, 7311int btrfs_prealloc_file_range(struct inode *inode, int mode,
6819 loff_t offset, loff_t len) 7312 u64 start, u64 num_bytes, u64 min_size,
7313 loff_t actual_len, u64 *alloc_hint)
6820{ 7314{
6821 struct extent_state *cached_state = NULL; 7315 return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
6822 u64 cur_offset; 7316 min_size, actual_len, alloc_hint,
6823 u64 last_byte; 7317 NULL);
6824 u64 alloc_start; 7318}
6825 u64 alloc_end;
6826 u64 alloc_hint = 0;
6827 u64 locked_end;
6828 u64 mask = BTRFS_I(inode)->root->sectorsize - 1;
6829 struct extent_map *em;
6830 int ret;
6831
6832 alloc_start = offset & ~mask;
6833 alloc_end = (offset + len + mask) & ~mask;
6834
6835 /*
6836 * wait for ordered IO before we have any locks. We'll loop again
6837 * below with the locks held.
6838 */
6839 btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start);
6840
6841 mutex_lock(&inode->i_mutex);
6842 if (alloc_start > inode->i_size) {
6843 ret = btrfs_cont_expand(inode, alloc_start);
6844 if (ret)
6845 goto out;
6846 }
6847
6848 ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
6849 if (ret)
6850 goto out;
6851
6852 locked_end = alloc_end - 1;
6853 while (1) {
6854 struct btrfs_ordered_extent *ordered;
6855
6856 /* the extent lock is ordered inside the running
6857 * transaction
6858 */
6859 lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
6860 locked_end, 0, &cached_state, GFP_NOFS);
6861 ordered = btrfs_lookup_first_ordered_extent(inode,
6862 alloc_end - 1);
6863 if (ordered &&
6864 ordered->file_offset + ordered->len > alloc_start &&
6865 ordered->file_offset < alloc_end) {
6866 btrfs_put_ordered_extent(ordered);
6867 unlock_extent_cached(&BTRFS_I(inode)->io_tree,
6868 alloc_start, locked_end,
6869 &cached_state, GFP_NOFS);
6870 /*
6871 * we can't wait on the range with the transaction
6872 * running or with the extent lock held
6873 */
6874 btrfs_wait_ordered_range(inode, alloc_start,
6875 alloc_end - alloc_start);
6876 } else {
6877 if (ordered)
6878 btrfs_put_ordered_extent(ordered);
6879 break;
6880 }
6881 }
6882
6883 cur_offset = alloc_start;
6884 while (1) {
6885 em = btrfs_get_extent(inode, NULL, 0, cur_offset,
6886 alloc_end - cur_offset, 0);
6887 BUG_ON(IS_ERR(em) || !em);
6888 last_byte = min(extent_map_end(em), alloc_end);
6889 last_byte = (last_byte + mask) & ~mask;
6890 if (em->block_start == EXTENT_MAP_HOLE ||
6891 (cur_offset >= inode->i_size &&
6892 !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
6893 ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
6894 last_byte - cur_offset,
6895 1 << inode->i_blkbits,
6896 offset + len,
6897 &alloc_hint);
6898 if (ret < 0) {
6899 free_extent_map(em);
6900 break;
6901 }
6902 }
6903 free_extent_map(em);
6904
6905 cur_offset = last_byte;
6906 if (cur_offset >= alloc_end) {
6907 ret = 0;
6908 break;
6909 }
6910 }
6911 unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
6912 &cached_state, GFP_NOFS);
6913 7319
6914 btrfs_free_reserved_data_space(inode, alloc_end - alloc_start); 7320int btrfs_prealloc_file_range_trans(struct inode *inode,
6915out: 7321 struct btrfs_trans_handle *trans, int mode,
6916 mutex_unlock(&inode->i_mutex); 7322 u64 start, u64 num_bytes, u64 min_size,
6917 return ret; 7323 loff_t actual_len, u64 *alloc_hint)
7324{
7325 return __btrfs_prealloc_file_range(inode, mode, start, num_bytes,
7326 min_size, actual_len, alloc_hint, trans);
6918} 7327}
6919 7328
6920static int btrfs_set_page_dirty(struct page *page) 7329static int btrfs_set_page_dirty(struct page *page)
@@ -6922,11 +7331,15 @@ static int btrfs_set_page_dirty(struct page *page)
6922 return __set_page_dirty_nobuffers(page); 7331 return __set_page_dirty_nobuffers(page);
6923} 7332}
6924 7333
6925static int btrfs_permission(struct inode *inode, int mask) 7334static int btrfs_permission(struct inode *inode, int mask, unsigned int flags)
6926{ 7335{
7336 struct btrfs_root *root = BTRFS_I(inode)->root;
7337
7338 if (btrfs_root_readonly(root) && (mask & MAY_WRITE))
7339 return -EROFS;
6927 if ((BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) && (mask & MAY_WRITE)) 7340 if ((BTRFS_I(inode)->flags & BTRFS_INODE_READONLY) && (mask & MAY_WRITE))
6928 return -EACCES; 7341 return -EACCES;
6929 return generic_permission(inode, mask, btrfs_check_acl); 7342 return generic_permission(inode, mask, flags, btrfs_check_acl);
6930} 7343}
6931 7344
6932static const struct inode_operations btrfs_dir_inode_operations = { 7345static const struct inode_operations btrfs_dir_inode_operations = {
@@ -6995,7 +7408,6 @@ static const struct address_space_operations btrfs_aops = {
6995 .writepage = btrfs_writepage, 7408 .writepage = btrfs_writepage,
6996 .writepages = btrfs_writepages, 7409 .writepages = btrfs_writepages,
6997 .readpages = btrfs_readpages, 7410 .readpages = btrfs_readpages,
6998 .sync_page = block_sync_page,
6999 .direct_IO = btrfs_direct_IO, 7411 .direct_IO = btrfs_direct_IO,
7000 .invalidatepage = btrfs_invalidatepage, 7412 .invalidatepage = btrfs_invalidatepage,
7001 .releasepage = btrfs_releasepage, 7413 .releasepage = btrfs_releasepage,
@@ -7011,7 +7423,6 @@ static const struct address_space_operations btrfs_symlink_aops = {
7011}; 7423};
7012 7424
7013static const struct inode_operations btrfs_file_inode_operations = { 7425static const struct inode_operations btrfs_file_inode_operations = {
7014 .truncate = btrfs_truncate,
7015 .getattr = btrfs_getattr, 7426 .getattr = btrfs_getattr,
7016 .setattr = btrfs_setattr, 7427 .setattr = btrfs_setattr,
7017 .setxattr = btrfs_setxattr, 7428 .setxattr = btrfs_setxattr,
@@ -7019,7 +7430,6 @@ static const struct inode_operations btrfs_file_inode_operations = {
7019 .listxattr = btrfs_listxattr, 7430 .listxattr = btrfs_listxattr,
7020 .removexattr = btrfs_removexattr, 7431 .removexattr = btrfs_removexattr,
7021 .permission = btrfs_permission, 7432 .permission = btrfs_permission,
7022 .fallocate = btrfs_fallocate,
7023 .fiemap = btrfs_fiemap, 7433 .fiemap = btrfs_fiemap,
7024}; 7434};
7025static const struct inode_operations btrfs_special_inode_operations = { 7435static const struct inode_operations btrfs_special_inode_operations = {
@@ -7035,6 +7445,7 @@ static const struct inode_operations btrfs_symlink_inode_operations = {
7035 .readlink = generic_readlink, 7445 .readlink = generic_readlink,
7036 .follow_link = page_follow_link_light, 7446 .follow_link = page_follow_link_light,
7037 .put_link = page_put_link, 7447 .put_link = page_put_link,
7448 .getattr = btrfs_getattr,
7038 .permission = btrfs_permission, 7449 .permission = btrfs_permission,
7039 .setxattr = btrfs_setxattr, 7450 .setxattr = btrfs_setxattr,
7040 .getxattr = btrfs_getxattr, 7451 .getxattr = btrfs_getxattr,
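
With .fallocate dropped from btrfs_file_inode_operations (preallocation moves to the VFS file_operations path in this kernel series), userspace still reaches it through the plain fallocate(2) syscall. A minimal sketch, assuming a btrfs mount and the hypothetical path /mnt/btrfs/testfile:

/* Minimal sketch: btrfs preallocation from userspace via fallocate(2).
 * /mnt/btrfs/testfile is a hypothetical path. FALLOC_FL_KEEP_SIZE
 * preallocates past EOF without moving i_size, the case the
 * FALLOC_FL_KEEP_SIZE tests in __btrfs_prealloc_file_range() handle.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>
#include <linux/falloc.h>

int main(void)
{
	int fd = open("/mnt/btrfs/testfile", O_RDWR | O_CREAT, 0644);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* preallocate 16MB beyond EOF, leaving i_size untouched */
	if (fallocate(fd, FALLOC_FL_KEEP_SIZE, 0, 16 * 1024 * 1024) < 0)
		perror("fallocate");
	close(fd);
	return 0;
}
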
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 9254b3d58dbe..a3c4751e07db 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -40,6 +40,7 @@
40#include <linux/xattr.h> 40#include <linux/xattr.h>
41#include <linux/vmalloc.h> 41#include <linux/vmalloc.h>
42#include <linux/slab.h> 42#include <linux/slab.h>
43#include <linux/blkdev.h>
43#include "compat.h" 44#include "compat.h"
44#include "ctree.h" 45#include "ctree.h"
45#include "disk-io.h" 46#include "disk-io.h"
@@ -49,6 +50,7 @@
49#include "print-tree.h" 50#include "print-tree.h"
50#include "volumes.h" 51#include "volumes.h"
51#include "locking.h" 52#include "locking.h"
53#include "inode-map.h"
52 54
53/* Mask out flags that are inappropriate for the given type of inode. */ 55/* Mask out flags that are inappropriate for the given type of inode. */
54static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags) 56static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
@@ -80,6 +82,13 @@ static unsigned int btrfs_flags_to_ioctl(unsigned int flags)
80 iflags |= FS_NOATIME_FL; 82 iflags |= FS_NOATIME_FL;
81 if (flags & BTRFS_INODE_DIRSYNC) 83 if (flags & BTRFS_INODE_DIRSYNC)
82 iflags |= FS_DIRSYNC_FL; 84 iflags |= FS_DIRSYNC_FL;
85 if (flags & BTRFS_INODE_NODATACOW)
86 iflags |= FS_NOCOW_FL;
87
88 if ((flags & BTRFS_INODE_COMPRESS) && !(flags & BTRFS_INODE_NOCOMPRESS))
89 iflags |= FS_COMPR_FL;
90 else if (flags & BTRFS_INODE_NOCOMPRESS)
91 iflags |= FS_NOCOMP_FL;
83 92
84 return iflags; 93 return iflags;
85} 94}
@@ -138,6 +147,21 @@ static int btrfs_ioctl_getflags(struct file *file, void __user *arg)
138 return 0; 147 return 0;
139} 148}
140 149
150static int check_flags(unsigned int flags)
151{
152 if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL |
153 FS_NOATIME_FL | FS_NODUMP_FL |
154 FS_SYNC_FL | FS_DIRSYNC_FL |
155 FS_NOCOMP_FL | FS_COMPR_FL |
156 FS_NOCOW_FL))
157 return -EOPNOTSUPP;
158
159 if ((flags & FS_NOCOMP_FL) && (flags & FS_COMPR_FL))
160 return -EINVAL;
161
162 return 0;
163}
164
141static int btrfs_ioctl_setflags(struct file *file, void __user *arg) 165static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
142{ 166{
143 struct inode *inode = file->f_path.dentry->d_inode; 167 struct inode *inode = file->f_path.dentry->d_inode;
@@ -147,15 +171,17 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
147 unsigned int flags, oldflags; 171 unsigned int flags, oldflags;
148 int ret; 172 int ret;
149 173
174 if (btrfs_root_readonly(root))
175 return -EROFS;
176
150 if (copy_from_user(&flags, arg, sizeof(flags))) 177 if (copy_from_user(&flags, arg, sizeof(flags)))
151 return -EFAULT; 178 return -EFAULT;
152 179
153 if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \ 180 ret = check_flags(flags);
154 FS_NOATIME_FL | FS_NODUMP_FL | \ 181 if (ret)
155 FS_SYNC_FL | FS_DIRSYNC_FL)) 182 return ret;
156 return -EOPNOTSUPP;
157 183
158 if (!is_owner_or_cap(inode)) 184 if (!inode_owner_or_capable(inode))
159 return -EACCES; 185 return -EACCES;
160 186
161 mutex_lock(&inode->i_mutex); 187 mutex_lock(&inode->i_mutex);
@@ -197,10 +223,28 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
197 ip->flags |= BTRFS_INODE_DIRSYNC; 223 ip->flags |= BTRFS_INODE_DIRSYNC;
198 else 224 else
199 ip->flags &= ~BTRFS_INODE_DIRSYNC; 225 ip->flags &= ~BTRFS_INODE_DIRSYNC;
226 if (flags & FS_NOCOW_FL)
227 ip->flags |= BTRFS_INODE_NODATACOW;
228 else
229 ip->flags &= ~BTRFS_INODE_NODATACOW;
200 230
231 /*
232 * The COMPRESS flag can only be changed by users, while the NOCOMPRESS
233 * flag may be changed automatically if compression code won't make
234 * things smaller.
235 */
236 if (flags & FS_NOCOMP_FL) {
237 ip->flags &= ~BTRFS_INODE_COMPRESS;
238 ip->flags |= BTRFS_INODE_NOCOMPRESS;
239 } else if (flags & FS_COMPR_FL) {
240 ip->flags |= BTRFS_INODE_COMPRESS;
241 ip->flags &= ~BTRFS_INODE_NOCOMPRESS;
242 } else {
243 ip->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS);
244 }
201 245
202 trans = btrfs_join_transaction(root, 1); 246 trans = btrfs_join_transaction(root);
203 BUG_ON(!trans); 247 BUG_ON(IS_ERR(trans));
204 248
205 ret = btrfs_update_inode(trans, root, inode); 249 ret = btrfs_update_inode(trans, root, inode);
206 BUG_ON(ret); 250 BUG_ON(ret);
@@ -210,9 +254,11 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
210 btrfs_end_transaction(trans, root); 254 btrfs_end_transaction(trans, root);
211 255
212 mnt_drop_write(file->f_path.mnt); 256 mnt_drop_write(file->f_path.mnt);
257
258 ret = 0;
213 out_unlock: 259 out_unlock:
214 mutex_unlock(&inode->i_mutex); 260 mutex_unlock(&inode->i_mutex);
215 return 0; 261 return ret;
216} 262}
217 263
218static int btrfs_ioctl_getversion(struct file *file, int __user *arg) 264static int btrfs_ioctl_getversion(struct file *file, int __user *arg)
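
check_flags() above widens the accepted FS_* mask with the NOCOW and compression bits, so the ordinary chattr-style attribute ioctls now reach them. A minimal sketch, assuming a hypothetical target file; NOCOW only influences extents written after the flag is set, so it is typically applied while the file is still empty:

/* Sketch: toggling NOCOW through the generic FS_IOC_GETFLAGS/SETFLAGS
 * ioctls handled by btrfs_ioctl_setflags() above. Path is hypothetical.
 */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/fs.h>

#ifndef FS_NOCOW_FL
#define FS_NOCOW_FL 0x00800000	/* matches the flag checked above */
#endif

int main(void)
{
	int flags;
	int fd = open("/mnt/btrfs/nocow.dat", O_RDWR | O_CREAT, 0644);

	if (fd < 0 || ioctl(fd, FS_IOC_GETFLAGS, &flags) < 0) {
		perror("getflags");
		return 1;
	}
	flags |= FS_NOCOW_FL;	/* maps to BTRFS_INODE_NODATACOW above */
	if (ioctl(fd, FS_IOC_SETFLAGS, &flags) < 0)
		perror("setflags");
	close(fd);
	return 0;
}
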
@@ -222,9 +268,54 @@ static int btrfs_ioctl_getversion(struct file *file, int __user *arg)
222 return put_user(inode->i_generation, arg); 268 return put_user(inode->i_generation, arg);
223} 269}
224 270
271static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
272{
273 struct btrfs_root *root = fdentry(file)->d_sb->s_fs_info;
274 struct btrfs_fs_info *fs_info = root->fs_info;
275 struct btrfs_device *device;
276 struct request_queue *q;
277 struct fstrim_range range;
278 u64 minlen = ULLONG_MAX;
279 u64 num_devices = 0;
280 int ret;
281
282 if (!capable(CAP_SYS_ADMIN))
283 return -EPERM;
284
285 rcu_read_lock();
286 list_for_each_entry_rcu(device, &fs_info->fs_devices->devices,
287 dev_list) {
288 if (!device->bdev)
289 continue;
290 q = bdev_get_queue(device->bdev);
291 if (blk_queue_discard(q)) {
292 num_devices++;
293 minlen = min((u64)q->limits.discard_granularity,
294 minlen);
295 }
296 }
297 rcu_read_unlock();
298 if (!num_devices)
299 return -EOPNOTSUPP;
300
301 if (copy_from_user(&range, arg, sizeof(range)))
302 return -EFAULT;
303
304 range.minlen = max(range.minlen, minlen);
305 ret = btrfs_trim_fs(root, &range);
306 if (ret < 0)
307 return ret;
308
309 if (copy_to_user(arg, &range, sizeof(range)))
310 return -EFAULT;
311
312 return 0;
313}
314
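
btrfs_ioctl_fitrim() clamps the caller's minlen to the coarsest discard granularity among the discard-capable devices, then forwards the range to btrfs_trim_fs(). A sketch of the userspace side, assuming FITRIM and struct fstrim_range from <linux/fs.h> and a hypothetical mount point:

/* Sketch: issuing FITRIM against a btrfs mount, as serviced by
 * btrfs_ioctl_fitrim() above. On return, range.len holds the number
 * of bytes actually trimmed.
 */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/fs.h>	/* FITRIM, struct fstrim_range */

int main(void)
{
	struct fstrim_range range = {
		.start = 0,
		.len = (__u64)-1,	/* whole filesystem */
		.minlen = 0,		/* kernel raises this to the
					 * device discard granularity */
	};
	int fd = open("/mnt/btrfs", O_RDONLY);

	if (fd < 0 || ioctl(fd, FITRIM, &range) < 0) {
		perror("FITRIM");
		return 1;
	}
	printf("trimmed %llu bytes\n", (unsigned long long)range.len);
	close(fd);
	return 0;
}
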
225static noinline int create_subvol(struct btrfs_root *root, 315static noinline int create_subvol(struct btrfs_root *root,
226 struct dentry *dentry, 316 struct dentry *dentry,
227 char *name, int namelen) 317 char *name, int namelen,
318 u64 *async_transid)
228{ 319{
229 struct btrfs_trans_handle *trans; 320 struct btrfs_trans_handle *trans;
230 struct btrfs_key key; 321 struct btrfs_key key;
@@ -232,17 +323,22 @@ static noinline int create_subvol(struct btrfs_root *root,
232 struct btrfs_inode_item *inode_item; 323 struct btrfs_inode_item *inode_item;
233 struct extent_buffer *leaf; 324 struct extent_buffer *leaf;
234 struct btrfs_root *new_root; 325 struct btrfs_root *new_root;
235 struct inode *dir = dentry->d_parent->d_inode; 326 struct dentry *parent = dget_parent(dentry);
327 struct inode *dir;
236 int ret; 328 int ret;
237 int err; 329 int err;
238 u64 objectid; 330 u64 objectid;
239 u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID; 331 u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
240 u64 index = 0; 332 u64 index = 0;
241 333
242 ret = btrfs_find_free_objectid(NULL, root->fs_info->tree_root, 334 ret = btrfs_find_free_objectid(root->fs_info->tree_root, &objectid);
243 0, &objectid); 335 if (ret) {
244 if (ret) 336 dput(parent);
245 return ret; 337 return ret;
338 }
339
340 dir = parent->d_inode;
341
246 /* 342 /*
247 * 1 - inode item 343 * 1 - inode item
248 * 2 - refs 344 * 2 - refs
@@ -250,8 +346,10 @@ static noinline int create_subvol(struct btrfs_root *root,
250 * 2 - dir items 346 * 2 - dir items
251 */ 347 */
252 trans = btrfs_start_transaction(root, 6); 348 trans = btrfs_start_transaction(root, 6);
253 if (IS_ERR(trans)) 349 if (IS_ERR(trans)) {
350 dput(parent);
254 return PTR_ERR(trans); 351 return PTR_ERR(trans);
352 }
255 353
256 leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 354 leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
257 0, objectid, NULL, 0, 0, 0); 355 0, objectid, NULL, 0, 0, 0);
@@ -282,6 +380,10 @@ static noinline int create_subvol(struct btrfs_root *root,
282 inode_item->nbytes = cpu_to_le64(root->leafsize); 380 inode_item->nbytes = cpu_to_le64(root->leafsize);
283 inode_item->mode = cpu_to_le32(S_IFDIR | 0755); 381 inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
284 382
383 root_item.flags = 0;
384 root_item.byte_limit = 0;
385 inode_item->flags = cpu_to_le64(BTRFS_INODE_ROOT_ITEM_INIT);
386
285 btrfs_set_root_bytenr(&root_item, leaf->start); 387 btrfs_set_root_bytenr(&root_item, leaf->start);
286 btrfs_set_root_generation(&root_item, trans->transid); 388 btrfs_set_root_generation(&root_item, trans->transid);
287 btrfs_set_root_level(&root_item, 0); 389 btrfs_set_root_level(&root_item, 0);
@@ -312,8 +414,7 @@ static noinline int create_subvol(struct btrfs_root *root,
312 414
313 btrfs_record_root_in_trans(trans, new_root); 415 btrfs_record_root_in_trans(trans, new_root);
314 416
315 ret = btrfs_create_subvol_root(trans, new_root, new_dirid, 417 ret = btrfs_create_subvol_root(trans, new_root, new_dirid);
316 BTRFS_I(dir)->block_group);
317 /* 418 /*
318 * insert the directory item 419 * insert the directory item
319 */ 420 */
@@ -321,7 +422,7 @@ static noinline int create_subvol(struct btrfs_root *root,
321 BUG_ON(ret); 422 BUG_ON(ret);
322 423
323 ret = btrfs_insert_dir_item(trans, root, 424 ret = btrfs_insert_dir_item(trans, root,
324 name, namelen, dir->i_ino, &key, 425 name, namelen, dir, &key,
325 BTRFS_FT_DIR, index); 426 BTRFS_FT_DIR, index);
326 if (ret) 427 if (ret)
327 goto fail; 428 goto fail;
@@ -332,21 +433,30 @@ static noinline int create_subvol(struct btrfs_root *root,
332 433
333 ret = btrfs_add_root_ref(trans, root->fs_info->tree_root, 434 ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
334 objectid, root->root_key.objectid, 435 objectid, root->root_key.objectid,
335 dir->i_ino, index, name, namelen); 436 btrfs_ino(dir), index, name, namelen);
336 437
337 BUG_ON(ret); 438 BUG_ON(ret);
338 439
339 d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry)); 440 d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry));
340fail: 441fail:
341 err = btrfs_commit_transaction(trans, root); 442 dput(parent);
443 if (async_transid) {
444 *async_transid = trans->transid;
445 err = btrfs_commit_transaction_async(trans, root, 1);
446 } else {
447 err = btrfs_commit_transaction(trans, root);
448 }
342 if (err && !ret) 449 if (err && !ret)
343 ret = err; 450 ret = err;
344 return ret; 451 return ret;
345} 452}
346 453
347static int create_snapshot(struct btrfs_root *root, struct dentry *dentry) 454static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
455 char *name, int namelen, u64 *async_transid,
456 bool readonly)
348{ 457{
349 struct inode *inode; 458 struct inode *inode;
459 struct dentry *parent;
350 struct btrfs_pending_snapshot *pending_snapshot; 460 struct btrfs_pending_snapshot *pending_snapshot;
351 struct btrfs_trans_handle *trans; 461 struct btrfs_trans_handle *trans;
352 int ret; 462 int ret;
@@ -361,6 +471,7 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry)
361 btrfs_init_block_rsv(&pending_snapshot->block_rsv); 471 btrfs_init_block_rsv(&pending_snapshot->block_rsv);
362 pending_snapshot->dentry = dentry; 472 pending_snapshot->dentry = dentry;
363 pending_snapshot->root = root; 473 pending_snapshot->root = root;
474 pending_snapshot->readonly = readonly;
364 475
365 trans = btrfs_start_transaction(root->fs_info->extent_root, 5); 476 trans = btrfs_start_transaction(root->fs_info->extent_root, 5);
366 if (IS_ERR(trans)) { 477 if (IS_ERR(trans)) {
@@ -371,18 +482,31 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry)
371 ret = btrfs_snap_reserve_metadata(trans, pending_snapshot); 482 ret = btrfs_snap_reserve_metadata(trans, pending_snapshot);
372 BUG_ON(ret); 483 BUG_ON(ret);
373 484
485 spin_lock(&root->fs_info->trans_lock);
374 list_add(&pending_snapshot->list, 486 list_add(&pending_snapshot->list,
375 &trans->transaction->pending_snapshots); 487 &trans->transaction->pending_snapshots);
376 ret = btrfs_commit_transaction(trans, root->fs_info->extent_root); 488 spin_unlock(&root->fs_info->trans_lock);
489 if (async_transid) {
490 *async_transid = trans->transid;
491 ret = btrfs_commit_transaction_async(trans,
492 root->fs_info->extent_root, 1);
493 } else {
494 ret = btrfs_commit_transaction(trans,
495 root->fs_info->extent_root);
496 }
377 BUG_ON(ret); 497 BUG_ON(ret);
378 498
379 ret = pending_snapshot->error; 499 ret = pending_snapshot->error;
380 if (ret) 500 if (ret)
381 goto fail; 501 goto fail;
382 502
383 btrfs_orphan_cleanup(pending_snapshot->snap); 503 ret = btrfs_orphan_cleanup(pending_snapshot->snap);
504 if (ret)
505 goto fail;
384 506
385 inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry); 507 parent = dget_parent(dentry);
508 inode = btrfs_lookup_dentry(parent->d_inode, dentry);
509 dput(parent);
386 if (IS_ERR(inode)) { 510 if (IS_ERR(inode)) {
387 ret = PTR_ERR(inode); 511 ret = PTR_ERR(inode);
388 goto fail; 512 goto fail;
@@ -395,6 +519,76 @@ fail:
395 return ret; 519 return ret;
396} 520}
397 521
522/* copy of check_sticky in fs/namei.c
523 * It's inline, so the penalty for filesystems that don't use the
524 * sticky bit is minimal.
525 */
526static inline int btrfs_check_sticky(struct inode *dir, struct inode *inode)
527{
528 uid_t fsuid = current_fsuid();
529
530 if (!(dir->i_mode & S_ISVTX))
531 return 0;
532 if (inode->i_uid == fsuid)
533 return 0;
534 if (dir->i_uid == fsuid)
535 return 0;
536 return !capable(CAP_FOWNER);
537}
538
539/* copy of may_delete in fs/namei.c
540 * Check whether we can remove a link victim from directory dir, check
541 * whether the type of victim is right.
542 * 1. We can't do it if dir is read-only (done in permission())
543 * 2. We should have write and exec permissions on dir
544 * 3. We can't remove anything from append-only dir
545 * 4. We can't do anything with immutable dir (done in permission())
546 * 5. If the sticky bit on dir is set we should either
547 * a. be owner of dir, or
548 * b. be owner of victim, or
549 * c. have CAP_FOWNER capability
550 * 6. If the victim is append-only or immutable we can't do anything with
551 * links pointing to it.
552 * 7. If we were asked to remove a directory and victim isn't one - ENOTDIR.
553 * 8. If we were asked to remove a non-directory and victim isn't one - EISDIR.
554 * 9. We can't remove a root or mountpoint.
555 * 10. We don't allow removal of NFS sillyrenamed files; it's handled by
556 * nfs_async_unlink().
557 */
558
559static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir)
560{
561 int error;
562
563 if (!victim->d_inode)
564 return -ENOENT;
565
566 BUG_ON(victim->d_parent->d_inode != dir);
567 audit_inode_child(victim, dir);
568
569 error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
570 if (error)
571 return error;
572 if (IS_APPEND(dir))
573 return -EPERM;
574 if (btrfs_check_sticky(dir, victim->d_inode) ||
575 IS_APPEND(victim->d_inode) ||
576 IS_IMMUTABLE(victim->d_inode) || IS_SWAPFILE(victim->d_inode))
577 return -EPERM;
578 if (isdir) {
579 if (!S_ISDIR(victim->d_inode->i_mode))
580 return -ENOTDIR;
581 if (IS_ROOT(victim))
582 return -EBUSY;
583 } else if (S_ISDIR(victim->d_inode->i_mode))
584 return -EISDIR;
585 if (IS_DEADDIR(dir))
586 return -ENOENT;
587 if (victim->d_flags & DCACHE_NFSFS_RENAMED)
588 return -EBUSY;
589 return 0;
590}
591
398/* copy of may_create in fs/namei.c */ 592/* copy of may_create in fs/namei.c */
399static inline int btrfs_may_create(struct inode *dir, struct dentry *child) 593static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
400{ 594{
@@ -412,7 +606,8 @@ static inline int btrfs_may_create(struct inode *dir, struct dentry *child)
412 */ 606 */
413static noinline int btrfs_mksubvol(struct path *parent, 607static noinline int btrfs_mksubvol(struct path *parent,
414 char *name, int namelen, 608 char *name, int namelen,
415 struct btrfs_root *snap_src) 609 struct btrfs_root *snap_src,
610 u64 *async_transid, bool readonly)
416{ 611{
417 struct inode *dir = parent->dentry->d_inode; 612 struct inode *dir = parent->dentry->d_inode;
418 struct dentry *dentry; 613 struct dentry *dentry;
@@ -443,10 +638,11 @@ static noinline int btrfs_mksubvol(struct path *parent,
443 goto out_up_read; 638 goto out_up_read;
444 639
445 if (snap_src) { 640 if (snap_src) {
446 error = create_snapshot(snap_src, dentry); 641 error = create_snapshot(snap_src, dentry,
642 name, namelen, async_transid, readonly);
447 } else { 643 } else {
448 error = create_subvol(BTRFS_I(dir)->root, dentry, 644 error = create_subvol(BTRFS_I(dir)->root, dentry,
449 name, namelen); 645 name, namelen, async_transid);
450 } 646 }
451 if (!error) 647 if (!error)
452 fsnotify_mkdir(dir, dentry); 648 fsnotify_mkdir(dir, dentry);
@@ -461,6 +657,107 @@ out_unlock:
461 return error; 657 return error;
462} 658}
463 659
660/*
661 * When we're defragging a range, we don't want to kick it off again
662 * if it is really just waiting for delalloc to send it down.
663 * If we find a nice big extent or delalloc range for the bytes in the
664 * file you want to defrag, we return 0 to let you know to skip this
665 * part of the file.
666 */
667static int check_defrag_in_cache(struct inode *inode, u64 offset, int thresh)
668{
669 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
670 struct extent_map *em = NULL;
671 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
672 u64 end;
673
674 read_lock(&em_tree->lock);
675 em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE);
676 read_unlock(&em_tree->lock);
677
678 if (em) {
679 end = extent_map_end(em);
680 free_extent_map(em);
681 if (end - offset > thresh)
682 return 0;
683 }
684 /* if we already have a nice delalloc here, just stop */
685 thresh /= 2;
686 end = count_range_bits(io_tree, &offset, offset + thresh,
687 thresh, EXTENT_DELALLOC, 1);
688 if (end >= thresh)
689 return 0;
690 return 1;
691}
692
693/*
694 * helper function to walk through a file and find extents
695 * newer than a specific transid, and smaller than thresh.
696 *
697 * This is used by the defragging code to find new and small
698 * extents
699 */
700static int find_new_extents(struct btrfs_root *root,
701 struct inode *inode, u64 newer_than,
702 u64 *off, int thresh)
703{
704 struct btrfs_path *path;
705 struct btrfs_key min_key;
706 struct btrfs_key max_key;
707 struct extent_buffer *leaf;
708 struct btrfs_file_extent_item *extent;
709 int type;
710 int ret;
711 u64 ino = btrfs_ino(inode);
712
713 path = btrfs_alloc_path();
714 if (!path)
715 return -ENOMEM;
716
717 min_key.objectid = ino;
718 min_key.type = BTRFS_EXTENT_DATA_KEY;
719 min_key.offset = *off;
720
721 max_key.objectid = ino;
722 max_key.type = (u8)-1;
723 max_key.offset = (u64)-1;
724
725 path->keep_locks = 1;
726
727 while (1) {
728 ret = btrfs_search_forward(root, &min_key, &max_key,
729 path, 0, newer_than);
730 if (ret != 0)
731 goto none;
732 if (min_key.objectid != ino)
733 goto none;
734 if (min_key.type != BTRFS_EXTENT_DATA_KEY)
735 goto none;
736
737 leaf = path->nodes[0];
738 extent = btrfs_item_ptr(leaf, path->slots[0],
739 struct btrfs_file_extent_item);
740
741 type = btrfs_file_extent_type(leaf, extent);
742 if (type == BTRFS_FILE_EXTENT_REG &&
743 btrfs_file_extent_num_bytes(leaf, extent) < thresh &&
744 check_defrag_in_cache(inode, min_key.offset, thresh)) {
745 *off = min_key.offset;
746 btrfs_free_path(path);
747 return 0;
748 }
749
750 if (min_key.offset == (u64)-1)
751 goto none;
752
753 min_key.offset++;
754 btrfs_release_path(path);
755 }
756none:
757 btrfs_free_path(path);
758 return -ENOENT;
759}
760
464static int should_defrag_range(struct inode *inode, u64 start, u64 len, 761static int should_defrag_range(struct inode *inode, u64 start, u64 len,
465 int thresh, u64 *last_len, u64 *skip, 762 int thresh, u64 *last_len, u64 *skip,
466 u64 *defrag_end) 763 u64 *defrag_end)
@@ -470,10 +767,6 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len,
470 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; 767 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
471 int ret = 1; 768 int ret = 1;
472 769
473
474 if (thresh == 0)
475 thresh = 256 * 1024;
476
477 /* 770 /*
478 * make sure that once we start defragging an extent, we keep on 771
479 * defragging it 772 * defragging it
@@ -532,28 +825,208 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len,
532 return ret; 825 return ret;
533} 826}
534 827
535static int btrfs_defrag_file(struct file *file, 828/*
536 struct btrfs_ioctl_defrag_range_args *range) 829 * it doesn't do much good to defrag one or two pages
830 * at a time. This pulls in a nice chunk of pages
831 * to COW and defrag.
832 *
833 * It also makes sure the delalloc code has enough
834 * dirty data to avoid making new small extents as part
835 * of the defrag
836 *
837 * It's a good idea to start RA on this range
838 * before calling this.
839 */
840static int cluster_pages_for_defrag(struct inode *inode,
841 struct page **pages,
842 unsigned long start_index,
843 int num_pages)
537{ 844{
538 struct inode *inode = fdentry(file)->d_inode; 845 unsigned long file_end;
539 struct btrfs_root *root = BTRFS_I(inode)->root; 846 u64 isize = i_size_read(inode);
540 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
541 struct btrfs_ordered_extent *ordered;
542 struct page *page;
543 unsigned long last_index;
544 unsigned long ra_pages = root->fs_info->bdi.ra_pages;
545 unsigned long total_read = 0;
546 u64 page_start; 847 u64 page_start;
547 u64 page_end; 848 u64 page_end;
849 int ret;
850 int i;
851 int i_done;
852 struct btrfs_ordered_extent *ordered;
853 struct extent_state *cached_state = NULL;
854
855 if (isize == 0)
856 return 0;
857 file_end = (isize - 1) >> PAGE_CACHE_SHIFT;
858
859 ret = btrfs_delalloc_reserve_space(inode,
860 num_pages << PAGE_CACHE_SHIFT);
861 if (ret)
862 return ret;
863again:
864 ret = 0;
865 i_done = 0;
866
867 /* step one, lock all the pages */
868 for (i = 0; i < num_pages; i++) {
869 struct page *page;
870 page = grab_cache_page(inode->i_mapping,
871 start_index + i);
872 if (!page)
873 break;
874
875 if (!PageUptodate(page)) {
876 btrfs_readpage(NULL, page);
877 lock_page(page);
878 if (!PageUptodate(page)) {
879 unlock_page(page);
880 page_cache_release(page);
881 ret = -EIO;
882 break;
883 }
884 }
885 isize = i_size_read(inode);
886 file_end = (isize - 1) >> PAGE_CACHE_SHIFT;
887 if (!isize || page->index > file_end ||
888 page->mapping != inode->i_mapping) {
889 /* whoops, we blew past eof, skip this page */
890 unlock_page(page);
891 page_cache_release(page);
892 break;
893 }
894 pages[i] = page;
895 i_done++;
896 }
897 if (!i_done || ret)
898 goto out;
899
900 if (!(inode->i_sb->s_flags & MS_ACTIVE))
901 goto out;
902
903 /*
904 * so now we have a nice long stream of locked
905 * and up to date pages, lets wait on them
906 */
907 for (i = 0; i < i_done; i++)
908 wait_on_page_writeback(pages[i]);
909
910 page_start = page_offset(pages[0]);
911 page_end = page_offset(pages[i_done - 1]) + PAGE_CACHE_SIZE;
912
913 lock_extent_bits(&BTRFS_I(inode)->io_tree,
914 page_start, page_end - 1, 0, &cached_state,
915 GFP_NOFS);
916 ordered = btrfs_lookup_first_ordered_extent(inode, page_end - 1);
917 if (ordered &&
918 ordered->file_offset + ordered->len > page_start &&
919 ordered->file_offset < page_end) {
920 btrfs_put_ordered_extent(ordered);
921 unlock_extent_cached(&BTRFS_I(inode)->io_tree,
922 page_start, page_end - 1,
923 &cached_state, GFP_NOFS);
924 for (i = 0; i < i_done; i++) {
925 unlock_page(pages[i]);
926 page_cache_release(pages[i]);
927 }
928 btrfs_wait_ordered_range(inode, page_start,
929 page_end - page_start);
930 goto again;
931 }
932 if (ordered)
933 btrfs_put_ordered_extent(ordered);
934
935 clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start,
936 page_end - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
937 EXTENT_DO_ACCOUNTING, 0, 0, &cached_state,
938 GFP_NOFS);
939
940 if (i_done != num_pages) {
941 atomic_inc(&BTRFS_I(inode)->outstanding_extents);
942 btrfs_delalloc_release_space(inode,
943 (num_pages - i_done) << PAGE_CACHE_SHIFT);
944 }
945
946
947 btrfs_set_extent_delalloc(inode, page_start, page_end - 1,
948 &cached_state);
949
950 unlock_extent_cached(&BTRFS_I(inode)->io_tree,
951 page_start, page_end - 1, &cached_state,
952 GFP_NOFS);
953
954 for (i = 0; i < i_done; i++) {
955 clear_page_dirty_for_io(pages[i]);
956 ClearPageChecked(pages[i]);
957 set_page_extent_mapped(pages[i]);
958 set_page_dirty(pages[i]);
959 unlock_page(pages[i]);
960 page_cache_release(pages[i]);
961 }
962 return i_done;
963out:
964 for (i = 0; i < i_done; i++) {
965 unlock_page(pages[i]);
966 page_cache_release(pages[i]);
967 }
968 btrfs_delalloc_release_space(inode, num_pages << PAGE_CACHE_SHIFT);
969 return ret;
970
971}
972
973int btrfs_defrag_file(struct inode *inode, struct file *file,
974 struct btrfs_ioctl_defrag_range_args *range,
975 u64 newer_than, unsigned long max_to_defrag)
976{
977 struct btrfs_root *root = BTRFS_I(inode)->root;
978 struct btrfs_super_block *disk_super;
979 struct file_ra_state *ra = NULL;
980 unsigned long last_index;
981 u64 features;
548 u64 last_len = 0; 982 u64 last_len = 0;
549 u64 skip = 0; 983 u64 skip = 0;
550 u64 defrag_end = 0; 984 u64 defrag_end = 0;
985 u64 newer_off = range->start;
986 int newer_left = 0;
551 unsigned long i; 987 unsigned long i;
552 int ret; 988 int ret;
989 int defrag_count = 0;
990 int compress_type = BTRFS_COMPRESS_ZLIB;
991 int extent_thresh = range->extent_thresh;
992 int newer_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT;
993 u64 new_align = ~((u64)128 * 1024 - 1);
994 struct page **pages = NULL;
995
996 if (extent_thresh == 0)
997 extent_thresh = 256 * 1024;
998
999 if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) {
1000 if (range->compress_type > BTRFS_COMPRESS_TYPES)
1001 return -EINVAL;
1002 if (range->compress_type)
1003 compress_type = range->compress_type;
1004 }
553 1005
554 if (inode->i_size == 0) 1006 if (inode->i_size == 0)
555 return 0; 1007 return 0;
556 1008
1009 /*
1010 * if we were not given a file, allocate a readahead
1011 * context
1012 */
1013 if (!file) {
1014 ra = kzalloc(sizeof(*ra), GFP_NOFS);
1015 if (!ra)
1016 return -ENOMEM;
1017 file_ra_state_init(ra, inode->i_mapping);
1018 } else {
1019 ra = &file->f_ra;
1020 }
1021
1022 pages = kmalloc(sizeof(struct page *) * newer_cluster,
1023 GFP_NOFS);
1024 if (!pages) {
1025 ret = -ENOMEM;
1026 goto out_ra;
1027 }
1028
1029 /* find the last page to defrag */
557 if (range->start + range->len > range->start) { 1030 if (range->start + range->len > range->start) {
558 last_index = min_t(u64, inode->i_size - 1, 1031 last_index = min_t(u64, inode->i_size - 1,
559 range->start + range->len - 1) >> PAGE_CACHE_SHIFT; 1032 range->start + range->len - 1) >> PAGE_CACHE_SHIFT;
@@ -561,11 +1034,37 @@ static int btrfs_defrag_file(struct file *file,
561 last_index = (inode->i_size - 1) >> PAGE_CACHE_SHIFT; 1034 last_index = (inode->i_size - 1) >> PAGE_CACHE_SHIFT;
562 } 1035 }
563 1036
564 i = range->start >> PAGE_CACHE_SHIFT; 1037 if (newer_than) {
565 while (i <= last_index) { 1038 ret = find_new_extents(root, inode, newer_than,
566 if (!should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT, 1039 &newer_off, 64 * 1024);
1040 if (!ret) {
1041 range->start = newer_off;
1042 /*
1043 * we always align our defrag to help keep
1044 * the extents in the file evenly spaced
1045 */
1046 i = (newer_off & new_align) >> PAGE_CACHE_SHIFT;
1047 newer_left = newer_cluster;
1048 } else
1049 goto out_ra;
1050 } else {
1051 i = range->start >> PAGE_CACHE_SHIFT;
1052 }
1053 if (!max_to_defrag)
1054 max_to_defrag = last_index - 1;
1055
1056 while (i <= last_index && defrag_count < max_to_defrag) {
1057 /*
1058 * make sure we stop running if someone unmounts
1059 * the FS
1060 */
1061 if (!(inode->i_sb->s_flags & MS_ACTIVE))
1062 break;
1063
1064 if (!newer_than &&
1065 !should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT,
567 PAGE_CACHE_SIZE, 1066 PAGE_CACHE_SIZE,
568 range->extent_thresh, 1067 extent_thresh,
569 &last_len, &skip, 1068 &last_len, &skip,
570 &defrag_end)) { 1069 &defrag_end)) {
571 unsigned long next; 1070 unsigned long next;
@@ -577,92 +1076,39 @@ static int btrfs_defrag_file(struct file *file,
577 i = max(i + 1, next); 1076 i = max(i + 1, next);
578 continue; 1077 continue;
579 } 1078 }
580
581 if (total_read % ra_pages == 0) {
582 btrfs_force_ra(inode->i_mapping, &file->f_ra, file, i,
583 min(last_index, i + ra_pages - 1));
584 }
585 total_read++;
586 mutex_lock(&inode->i_mutex);
587 if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) 1079 if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
588 BTRFS_I(inode)->force_compress = 1; 1080 BTRFS_I(inode)->force_compress = compress_type;
589 1081
590 ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); 1082 btrfs_force_ra(inode->i_mapping, ra, file, i, newer_cluster);
591 if (ret)
592 goto err_unlock;
593again:
594 if (inode->i_size == 0 ||
595 i > ((inode->i_size - 1) >> PAGE_CACHE_SHIFT)) {
596 ret = 0;
597 goto err_reservations;
598 }
599 1083
600 page = grab_cache_page(inode->i_mapping, i); 1084 ret = cluster_pages_for_defrag(inode, pages, i, newer_cluster);
601 if (!page) { 1085 if (ret < 0)
602 ret = -ENOMEM; 1086 goto out_ra;
603 goto err_reservations;
604 }
605
606 if (!PageUptodate(page)) {
607 btrfs_readpage(NULL, page);
608 lock_page(page);
609 if (!PageUptodate(page)) {
610 unlock_page(page);
611 page_cache_release(page);
612 ret = -EIO;
613 goto err_reservations;
614 }
615 }
616
617 if (page->mapping != inode->i_mapping) {
618 unlock_page(page);
619 page_cache_release(page);
620 goto again;
621 }
622
623 wait_on_page_writeback(page);
624 1087
625 if (PageDirty(page)) { 1088 defrag_count += ret;
626 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); 1089 balance_dirty_pages_ratelimited_nr(inode->i_mapping, ret);
627 goto loop_unlock; 1090 i += ret;
628 }
629 1091
630 page_start = (u64)page->index << PAGE_CACHE_SHIFT; 1092 if (newer_than) {
631 page_end = page_start + PAGE_CACHE_SIZE - 1; 1093 if (newer_off == (u64)-1)
632 lock_extent(io_tree, page_start, page_end, GFP_NOFS); 1094 break;
633 1095
634 ordered = btrfs_lookup_ordered_extent(inode, page_start); 1096 newer_off = max(newer_off + 1,
635 if (ordered) { 1097 (u64)i << PAGE_CACHE_SHIFT);
636 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 1098
637 unlock_page(page); 1099 ret = find_new_extents(root, inode,
638 page_cache_release(page); 1100 newer_than, &newer_off,
639 btrfs_start_ordered_extent(inode, ordered, 1); 1101 64 * 1024);
640 btrfs_put_ordered_extent(ordered); 1102 if (!ret) {
641 goto again; 1103 range->start = newer_off;
1104 i = (newer_off & new_align) >> PAGE_CACHE_SHIFT;
1105 newer_left = newer_cluster;
1106 } else {
1107 break;
1108 }
1109 } else {
1110 i++;
642 } 1111 }
643 set_page_extent_mapped(page);
644
645 /*
646 * this makes sure page_mkwrite is called on the
647 * page if it is dirtied again later
648 */
649 clear_page_dirty_for_io(page);
650 clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start,
651 page_end, EXTENT_DIRTY | EXTENT_DELALLOC |
652 EXTENT_DO_ACCOUNTING, GFP_NOFS);
653
654 btrfs_set_extent_delalloc(inode, page_start, page_end, NULL);
655 ClearPageChecked(page);
656 set_page_dirty(page);
657 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
658
659loop_unlock:
660 unlock_page(page);
661 page_cache_release(page);
662 mutex_unlock(&inode->i_mutex);
663
664 balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1);
665 i++;
666 } 1112 }
667 1113
668 if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO)) 1114 if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO))
@@ -683,16 +1129,25 @@ loop_unlock:
683 atomic_dec(&root->fs_info->async_submit_draining); 1129 atomic_dec(&root->fs_info->async_submit_draining);
684 1130
685 mutex_lock(&inode->i_mutex); 1131 mutex_lock(&inode->i_mutex);
686 BTRFS_I(inode)->force_compress = 0; 1132 BTRFS_I(inode)->force_compress = BTRFS_COMPRESS_NONE;
687 mutex_unlock(&inode->i_mutex); 1133 mutex_unlock(&inode->i_mutex);
688 } 1134 }
689 1135
690 return 0; 1136 disk_super = &root->fs_info->super_copy;
1137 features = btrfs_super_incompat_flags(disk_super);
1138 if (range->compress_type == BTRFS_COMPRESS_LZO) {
1139 features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
1140 btrfs_set_super_incompat_flags(disk_super, features);
1141 }
691 1142
692err_reservations: 1143 if (!file)
693 btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); 1144 kfree(ra);
694err_unlock: 1145 return defrag_count;
695 mutex_unlock(&inode->i_mutex); 1146
1147out_ra:
1148 if (!file)
1149 kfree(ra);
1150 kfree(pages);
696 return ret; 1151 return ret;
697} 1152}
698 1153
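
From userspace the reworked defrag path is driven through BTRFS_IOC_DEFRAG_RANGE. A sketch assuming the ioctl number and btrfs_ioctl_defrag_range_args layout from this series' ioctl.h and a hypothetical target file; compress_type selects the LZO path that also sets the incompat feature bit above:

/* Sketch: requesting defrag-with-compression, which ends up in
 * btrfs_defrag_file() above.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/types.h>

#define BTRFS_IOCTL_MAGIC		0x94
#define BTRFS_DEFRAG_RANGE_COMPRESS	1
#define BTRFS_COMPRESS_LZO		2

struct btrfs_ioctl_defrag_range_args {
	__u64 start;		/* start of the defrag operation */
	__u64 len;		/* bytes to defrag, (u64)-1 for "to EOF" */
	__u64 flags;
	__u32 extent_thresh;	/* 0 means the 256K default above */
	__u32 compress_type;
	__u32 unused[4];
};
#define BTRFS_IOC_DEFRAG_RANGE \
	_IOW(BTRFS_IOCTL_MAGIC, 16, struct btrfs_ioctl_defrag_range_args)

int main(void)
{
	struct btrfs_ioctl_defrag_range_args args;
	int fd = open("/mnt/btrfs/bigfile", O_RDWR);

	memset(&args, 0, sizeof(args));
	args.len = (__u64)-1;
	args.flags = BTRFS_DEFRAG_RANGE_COMPRESS;
	args.compress_type = BTRFS_COMPRESS_LZO;

	if (fd < 0 || ioctl(fd, BTRFS_IOC_DEFRAG_RANGE, &args) < 0)
		perror("defrag");
	close(fd);
	return 0;
}
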
@@ -708,7 +1163,6 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
708 char *sizestr; 1163 char *sizestr;
709 char *devstr = NULL; 1164 char *devstr = NULL;
710 int ret = 0; 1165 int ret = 0;
711 int namelen;
712 int mod = 0; 1166 int mod = 0;
713 1167
714 if (root->fs_info->sb->s_flags & MS_RDONLY) 1168 if (root->fs_info->sb->s_flags & MS_RDONLY)
@@ -722,7 +1176,6 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
722 return PTR_ERR(vol_args); 1176 return PTR_ERR(vol_args);
723 1177
724 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; 1178 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
725 namelen = strlen(vol_args->name);
726 1179
727 mutex_lock(&root->fs_info->volume_mutex); 1180 mutex_lock(&root->fs_info->volume_mutex);
728 sizestr = vol_args->name; 1181 sizestr = vol_args->name;
@@ -789,6 +1242,10 @@ static noinline int btrfs_ioctl_resize(struct btrfs_root *root,
789 1242
790 if (new_size > old_size) { 1243 if (new_size > old_size) {
791 trans = btrfs_start_transaction(root, 0); 1244 trans = btrfs_start_transaction(root, 0);
1245 if (IS_ERR(trans)) {
1246 ret = PTR_ERR(trans);
1247 goto out_unlock;
1248 }
792 ret = btrfs_grow_device(trans, device, new_size); 1249 ret = btrfs_grow_device(trans, device, new_size);
793 btrfs_commit_transaction(trans, root); 1250 btrfs_commit_transaction(trans, root);
794 } else { 1251 } else {
@@ -801,11 +1258,14 @@ out_unlock:
801 return ret; 1258 return ret;
802} 1259}
803 1260
804static noinline int btrfs_ioctl_snap_create(struct file *file, 1261static noinline int btrfs_ioctl_snap_create_transid(struct file *file,
805 void __user *arg, int subvol) 1262 char *name,
1263 unsigned long fd,
1264 int subvol,
1265 u64 *transid,
1266 bool readonly)
806{ 1267{
807 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; 1268 struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root;
808 struct btrfs_ioctl_vol_args *vol_args;
809 struct file *src_file; 1269 struct file *src_file;
810 int namelen; 1270 int namelen;
811 int ret = 0; 1271 int ret = 0;
@@ -813,23 +1273,18 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
813 if (root->fs_info->sb->s_flags & MS_RDONLY) 1273 if (root->fs_info->sb->s_flags & MS_RDONLY)
814 return -EROFS; 1274 return -EROFS;
815 1275
816 vol_args = memdup_user(arg, sizeof(*vol_args)); 1276 namelen = strlen(name);
817 if (IS_ERR(vol_args)) 1277 if (strchr(name, '/')) {
818 return PTR_ERR(vol_args);
819
820 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
821 namelen = strlen(vol_args->name);
822 if (strchr(vol_args->name, '/')) {
823 ret = -EINVAL; 1278 ret = -EINVAL;
824 goto out; 1279 goto out;
825 } 1280 }
826 1281
827 if (subvol) { 1282 if (subvol) {
828 ret = btrfs_mksubvol(&file->f_path, vol_args->name, namelen, 1283 ret = btrfs_mksubvol(&file->f_path, name, namelen,
829 NULL); 1284 NULL, transid, readonly);
830 } else { 1285 } else {
831 struct inode *src_inode; 1286 struct inode *src_inode;
832 src_file = fget(vol_args->fd); 1287 src_file = fget(fd);
833 if (!src_file) { 1288 if (!src_file) {
834 ret = -EINVAL; 1289 ret = -EINVAL;
835 goto out; 1290 goto out;
@@ -843,15 +1298,155 @@ static noinline int btrfs_ioctl_snap_create(struct file *file,
843 fput(src_file); 1298 fput(src_file);
844 goto out; 1299 goto out;
845 } 1300 }
846 ret = btrfs_mksubvol(&file->f_path, vol_args->name, namelen, 1301 ret = btrfs_mksubvol(&file->f_path, name, namelen,
847 BTRFS_I(src_inode)->root); 1302 BTRFS_I(src_inode)->root,
1303 transid, readonly);
848 fput(src_file); 1304 fput(src_file);
849 } 1305 }
850out: 1306out:
1307 return ret;
1308}
1309
1310static noinline int btrfs_ioctl_snap_create(struct file *file,
1311 void __user *arg, int subvol)
1312{
1313 struct btrfs_ioctl_vol_args *vol_args;
1314 int ret;
1315
1316 vol_args = memdup_user(arg, sizeof(*vol_args));
1317 if (IS_ERR(vol_args))
1318 return PTR_ERR(vol_args);
1319 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
1320
1321 ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
1322 vol_args->fd, subvol,
1323 NULL, false);
1324
1325 kfree(vol_args);
1326 return ret;
1327}
1328
1329static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
1330 void __user *arg, int subvol)
1331{
1332 struct btrfs_ioctl_vol_args_v2 *vol_args;
1333 int ret;
1334 u64 transid = 0;
1335 u64 *ptr = NULL;
1336 bool readonly = false;
1337
1338 vol_args = memdup_user(arg, sizeof(*vol_args));
1339 if (IS_ERR(vol_args))
1340 return PTR_ERR(vol_args);
1341 vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
1342
1343 if (vol_args->flags &
1344 ~(BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY)) {
1345 ret = -EOPNOTSUPP;
1346 goto out;
1347 }
1348
1349 if (vol_args->flags & BTRFS_SUBVOL_CREATE_ASYNC)
1350 ptr = &transid;
1351 if (vol_args->flags & BTRFS_SUBVOL_RDONLY)
1352 readonly = true;
1353
1354 ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
1355 vol_args->fd, subvol,
1356 ptr, readonly);
1357
1358 if (ret == 0 && ptr &&
1359 copy_to_user(arg +
1360 offsetof(struct btrfs_ioctl_vol_args_v2,
1361 transid), ptr, sizeof(*ptr)))
1362 ret = -EFAULT;
1363out:
851 kfree(vol_args); 1364 kfree(vol_args);
852 return ret; 1365 return ret;
853} 1366}
854 1367
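
A sketch of the userspace side of the v2 interface, assuming the vol_args_v2 layout and ioctl number from this series' ioctl.h and hypothetical paths; with BTRFS_SUBVOL_CREATE_ASYNC the call returns before the commit and hands back the transaction id to wait on:

/* Sketch: creating a read-only snapshot asynchronously through
 * BTRFS_IOC_SNAP_CREATE_V2, handled by btrfs_ioctl_snap_create_v2()
 * above. transid comes back filled in when CREATE_ASYNC is set.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/types.h>

#define BTRFS_IOCTL_MAGIC		0x94
#define BTRFS_SUBVOL_NAME_MAX		4039
#define BTRFS_SUBVOL_CREATE_ASYNC	(1ULL << 0)
#define BTRFS_SUBVOL_RDONLY		(1ULL << 1)

struct btrfs_ioctl_vol_args_v2 {
	__s64 fd;		/* fd of the source subvolume */
	__u64 transid;		/* out: filled in for async creation */
	__u64 flags;
	__u64 unused[4];
	char name[BTRFS_SUBVOL_NAME_MAX + 1];
};
#define BTRFS_IOC_SNAP_CREATE_V2 \
	_IOW(BTRFS_IOCTL_MAGIC, 23, struct btrfs_ioctl_vol_args_v2)

int main(void)
{
	struct btrfs_ioctl_vol_args_v2 args;
	int src = open("/mnt/btrfs/subvol", O_RDONLY);
	int dst = open("/mnt/btrfs/snapshots", O_RDONLY);

	memset(&args, 0, sizeof(args));
	args.fd = src;
	args.flags = BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY;
	strcpy(args.name, "snap-ro");

	if (src < 0 || dst < 0 ||
	    ioctl(dst, BTRFS_IOC_SNAP_CREATE_V2, &args) < 0) {
		perror("snap create v2");
		return 1;
	}
	printf("snapshot queued in transid %llu\n",
	       (unsigned long long)args.transid);
	return 0;
}
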
1368static noinline int btrfs_ioctl_subvol_getflags(struct file *file,
1369 void __user *arg)
1370{
1371 struct inode *inode = fdentry(file)->d_inode;
1372 struct btrfs_root *root = BTRFS_I(inode)->root;
1373 int ret = 0;
1374 u64 flags = 0;
1375
1376 if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID)
1377 return -EINVAL;
1378
1379 down_read(&root->fs_info->subvol_sem);
1380 if (btrfs_root_readonly(root))
1381 flags |= BTRFS_SUBVOL_RDONLY;
1382 up_read(&root->fs_info->subvol_sem);
1383
1384 if (copy_to_user(arg, &flags, sizeof(flags)))
1385 ret = -EFAULT;
1386
1387 return ret;
1388}
1389
1390static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
1391 void __user *arg)
1392{
1393 struct inode *inode = fdentry(file)->d_inode;
1394 struct btrfs_root *root = BTRFS_I(inode)->root;
1395 struct btrfs_trans_handle *trans;
1396 u64 root_flags;
1397 u64 flags;
1398 int ret = 0;
1399
1400 if (root->fs_info->sb->s_flags & MS_RDONLY)
1401 return -EROFS;
1402
1403 if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID)
1404 return -EINVAL;
1405
1406 if (copy_from_user(&flags, arg, sizeof(flags)))
1407 return -EFAULT;
1408
1409 if (flags & BTRFS_SUBVOL_CREATE_ASYNC)
1410 return -EINVAL;
1411
1412 if (flags & ~BTRFS_SUBVOL_RDONLY)
1413 return -EOPNOTSUPP;
1414
1415 if (!inode_owner_or_capable(inode))
1416 return -EACCES;
1417
1418 down_write(&root->fs_info->subvol_sem);
1419
1420 /* nothing to do */
1421 if (!!(flags & BTRFS_SUBVOL_RDONLY) == btrfs_root_readonly(root))
1422 goto out;
1423
1424 root_flags = btrfs_root_flags(&root->root_item);
1425 if (flags & BTRFS_SUBVOL_RDONLY)
1426 btrfs_set_root_flags(&root->root_item,
1427 root_flags | BTRFS_ROOT_SUBVOL_RDONLY);
1428 else
1429 btrfs_set_root_flags(&root->root_item,
1430 root_flags & ~BTRFS_ROOT_SUBVOL_RDONLY);
1431
1432 trans = btrfs_start_transaction(root, 1);
1433 if (IS_ERR(trans)) {
1434 ret = PTR_ERR(trans);
1435 goto out_reset;
1436 }
1437
1438 ret = btrfs_update_root(trans, root->fs_info->tree_root,
1439 &root->root_key, &root->root_item);
1440
1441 btrfs_commit_transaction(trans, root);
1442out_reset:
1443 if (ret)
1444 btrfs_set_root_flags(&root->root_item, root_flags);
1445out:
1446 up_write(&root->fs_info->subvol_sem);
1447 return ret;
1448}
1449
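
A sketch of flipping the read-only bit from userspace, assuming the ioctl numbers from this series' ioctl.h and a hypothetical subvolume path; the fd must reference the subvolume root itself, per the BTRFS_FIRST_FREE_OBJECTID check above:

/* Sketch: toggling the subvolume read-only bit through
 * BTRFS_IOC_SUBVOL_GETFLAGS/SETFLAGS, handled above.
 */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/types.h>

#define BTRFS_IOCTL_MAGIC	0x94
#define BTRFS_SUBVOL_RDONLY	(1ULL << 1)
#define BTRFS_IOC_SUBVOL_GETFLAGS _IOR(BTRFS_IOCTL_MAGIC, 25, __u64)
#define BTRFS_IOC_SUBVOL_SETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 26, __u64)

int main(void)
{
	__u64 flags;
	int fd = open("/mnt/btrfs/subvol", O_RDONLY);

	if (fd < 0 || ioctl(fd, BTRFS_IOC_SUBVOL_GETFLAGS, &flags) < 0) {
		perror("getflags");
		return 1;
	}
	flags |= BTRFS_SUBVOL_RDONLY;
	if (ioctl(fd, BTRFS_IOC_SUBVOL_SETFLAGS, &flags) < 0)
		perror("setflags");
	close(fd);
	return 0;
}
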
855/* 1450/*
856 * helper to check if the subvolume references other subvolumes 1451 * helper to check if the subvolume references other subvolumes
857 */ 1452 */
@@ -928,7 +1523,6 @@ static noinline int copy_to_sk(struct btrfs_root *root,
928 int nritems; 1523 int nritems;
929 int i; 1524 int i;
930 int slot; 1525 int slot;
931 int found = 0;
932 int ret = 0; 1526 int ret = 0;
933 1527
934 leaf = path->nodes[0]; 1528 leaf = path->nodes[0];
@@ -975,7 +1569,7 @@ static noinline int copy_to_sk(struct btrfs_root *root,
975 item_off, item_len); 1569 item_off, item_len);
976 *sk_offset += item_len; 1570 *sk_offset += item_len;
977 } 1571 }
978 found++; 1572 (*num_found)++;
979 1573
980 if (*num_found >= sk->nr_items) 1574 if (*num_found >= sk->nr_items)
981 break; 1575 break;
@@ -994,7 +1588,6 @@ advance_key:
994 } else 1588 } else
995 ret = 1; 1589 ret = 1;
996overflow: 1590overflow:
997 *num_found += found;
998 return ret; 1591 return ret;
999} 1592}
1000 1593
@@ -1051,7 +1644,7 @@ static noinline int search_ioctl(struct inode *inode,
1051 } 1644 }
1052 ret = copy_to_sk(root, path, &key, sk, args->buf, 1645 ret = copy_to_sk(root, path, &key, sk, args->buf,
1053 &sk_offset, &num_found); 1646 &sk_offset, &num_found);
1054 btrfs_release_path(root, path); 1647 btrfs_release_path(path);
1055 if (ret || num_found >= sk->nr_items) 1648 if (ret || num_found >= sk->nr_items)
1056 break; 1649 break;
1057 1650
@@ -1073,14 +1666,10 @@ static noinline int btrfs_ioctl_tree_search(struct file *file,
1073 if (!capable(CAP_SYS_ADMIN)) 1666 if (!capable(CAP_SYS_ADMIN))
1074 return -EPERM; 1667 return -EPERM;
1075 1668
1076 args = kmalloc(sizeof(*args), GFP_KERNEL); 1669 args = memdup_user(argp, sizeof(*args));
1077 if (!args) 1670 if (IS_ERR(args))
1078 return -ENOMEM; 1671 return PTR_ERR(args);
1079 1672
1080 if (copy_from_user(args, argp, sizeof(*args))) {
1081 kfree(args);
1082 return -EFAULT;
1083 }
1084 inode = fdentry(file)->d_inode; 1673 inode = fdentry(file)->d_inode;
1085 ret = search_ioctl(inode, args); 1674 ret = search_ioctl(inode, args);
1086 if (ret == 0 && copy_to_user(argp, args, sizeof(*args))) 1675 if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
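
A sketch of the userspace side of the tree-search interface, assuming the structures and ioctl number from this series' ioctl.h and a hypothetical mount point; on return the kernel rewrites key.nr_items with the number of items copied into buf, which is what the copy_to_sk() counting fix above feeds. Requires CAP_SYS_ADMIN:

/* Sketch: walking the root tree with BTRFS_IOC_TREE_SEARCH, served by
 * search_ioctl()/copy_to_sk() above.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/types.h>

#define BTRFS_IOCTL_MAGIC 0x94

struct btrfs_ioctl_search_key {
	__u64 tree_id;
	__u64 min_objectid, max_objectid;
	__u64 min_offset, max_offset;
	__u64 min_transid, max_transid;
	__u32 min_type, max_type;
	__u32 nr_items;		/* in: max wanted, out: number found */
	__u32 unused;
	__u64 unused1, unused2, unused3, unused4;
};

struct btrfs_ioctl_search_args {
	struct btrfs_ioctl_search_key key;
	char buf[4096 - sizeof(struct btrfs_ioctl_search_key)];
};
#define BTRFS_IOC_TREE_SEARCH \
	_IOWR(BTRFS_IOCTL_MAGIC, 17, struct btrfs_ioctl_search_args)

int main(void)
{
	struct btrfs_ioctl_search_args args;
	int fd = open("/mnt/btrfs", O_RDONLY);

	memset(&args, 0, sizeof(args));
	args.key.tree_id = 1;			/* root tree */
	args.key.max_objectid = (__u64)-1;	/* match everything */
	args.key.max_offset = (__u64)-1;
	args.key.max_transid = (__u64)-1;
	args.key.max_type = (__u32)-1;
	args.key.nr_items = 64;

	if (fd < 0 || ioctl(fd, BTRFS_IOC_TREE_SEARCH, &args) < 0) {
		perror("tree search");
		return 1;
	}
	printf("found %u items\n", args.key.nr_items);
	close(fd);
	return 0;
}
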
@@ -1162,7 +1751,7 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
1162 if (key.offset == BTRFS_FIRST_FREE_OBJECTID) 1751 if (key.offset == BTRFS_FIRST_FREE_OBJECTID)
1163 break; 1752 break;
1164 1753
1165 btrfs_release_path(root, path); 1754 btrfs_release_path(path);
1166 key.objectid = key.offset; 1755 key.objectid = key.offset;
1167 key.offset = (u64)-1; 1756 key.offset = (u64)-1;
1168 dirid = key.objectid; 1757 dirid = key.objectid;
@@ -1188,14 +1777,10 @@ static noinline int btrfs_ioctl_ino_lookup(struct file *file,
1188 if (!capable(CAP_SYS_ADMIN)) 1777 if (!capable(CAP_SYS_ADMIN))
1189 return -EPERM; 1778 return -EPERM;
1190 1779
1191 args = kmalloc(sizeof(*args), GFP_KERNEL); 1780 args = memdup_user(argp, sizeof(*args));
1192 if (!args) 1781 if (IS_ERR(args))
1193 return -ENOMEM; 1782 return PTR_ERR(args);
1194 1783
1195 if (copy_from_user(args, argp, sizeof(*args))) {
1196 kfree(args);
1197 return -EFAULT;
1198 }
1199 inode = fdentry(file)->d_inode; 1784 inode = fdentry(file)->d_inode;
1200 1785
1201 if (args->treeid == 0) 1786 if (args->treeid == 0)
@@ -1227,9 +1812,6 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
1227 int ret; 1812 int ret;
1228 int err = 0; 1813 int err = 0;
1229 1814
1230 if (!capable(CAP_SYS_ADMIN))
1231 return -EPERM;
1232
1233 vol_args = memdup_user(arg, sizeof(*vol_args)); 1815 vol_args = memdup_user(arg, sizeof(*vol_args));
1234 if (IS_ERR(vol_args)) 1816 if (IS_ERR(vol_args))
1235 return PTR_ERR(vol_args); 1817 return PTR_ERR(vol_args);
@@ -1259,12 +1841,50 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
1259 } 1841 }
1260 1842
1261 inode = dentry->d_inode; 1843 inode = dentry->d_inode;
1262 if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) { 1844 dest = BTRFS_I(inode)->root;
1845 if (!capable(CAP_SYS_ADMIN)) {
1846 /*
1847 * Regular user. Only allow this with a special mount
1848 * option, when the user has write+exec access to the
1849 * subvol root, and when rmdir(2) would have been
1850 * allowed.
1851 *
1852 * Note that this is _not_ a check that the subvol is
1853 * empty or doesn't contain data that we wouldn't
1854 * otherwise be able to delete.
1855 *
1856 * Users who want to delete empty subvols should try
1857 * rmdir(2).
1858 */
1859 err = -EPERM;
1860 if (!btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED))
1861 goto out_dput;
1862
1863 /*
1864 * Do not allow deletion if the parent dir is the same
1865 * as the dir to be deleted. That means the ioctl
1866 * must be called on the dentry referencing the root
1867 * of the subvol, not a random directory contained
1868 * within it.
1869 */
1263 err = -EINVAL; 1870 err = -EINVAL;
1264 goto out_dput; 1871 if (root == dest)
1872 goto out_dput;
1873
1874 err = inode_permission(inode, MAY_WRITE | MAY_EXEC);
1875 if (err)
1876 goto out_dput;
1877
1878 /* check if subvolume may be deleted by a non-root user */
1879 err = btrfs_may_delete(dir, dentry, 1);
1880 if (err)
1881 goto out_dput;
1265 } 1882 }
1266 1883
1267 dest = BTRFS_I(inode)->root; 1884 if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) {
1885 err = -EINVAL;
1886 goto out_dput;
1887 }
1268 1888
1269 mutex_lock(&inode->i_mutex); 1889 mutex_lock(&inode->i_mutex);
1270 err = d_invalidate(dentry); 1890 err = d_invalidate(dentry);
@@ -1304,7 +1924,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
1304 BUG_ON(ret); 1924 BUG_ON(ret);
1305 } 1925 }
1306 1926
1307 ret = btrfs_commit_transaction(trans, root); 1927 ret = btrfs_end_transaction(trans, root);
1308 BUG_ON(ret); 1928 BUG_ON(ret);
1309 inode->i_flags |= S_DEAD; 1929 inode->i_flags |= S_DEAD;
1310out_up_write: 1930out_up_write:
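
A sketch of the unprivileged deletion path, assuming a filesystem mounted with -o user_subvol_rm_allowed, the vol_args layout from this series' ioctl.h, and hypothetical paths; the caller only needs write+exec on the parent plus the rmdir-style checks in btrfs_may_delete():

/* Sketch: deleting a subvolume as a non-root user through
 * BTRFS_IOC_SNAP_DESTROY, relying on the USER_SUBVOL_RM_ALLOWED
 * branch added above.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/types.h>

#define BTRFS_IOCTL_MAGIC	0x94
#define BTRFS_PATH_NAME_MAX	4087

struct btrfs_ioctl_vol_args {
	__s64 fd;
	char name[BTRFS_PATH_NAME_MAX + 1];
};
#define BTRFS_IOC_SNAP_DESTROY \
	_IOW(BTRFS_IOCTL_MAGIC, 15, struct btrfs_ioctl_vol_args)

int main(void)
{
	struct btrfs_ioctl_vol_args args;
	/* fd of the directory containing the subvolume to delete */
	int dir = open("/mnt/btrfs/snapshots", O_RDONLY);

	memset(&args, 0, sizeof(args));
	strcpy(args.name, "snap-old");

	if (dir < 0 || ioctl(dir, BTRFS_IOC_SNAP_DESTROY, &args) < 0)
		perror("snap destroy");
	close(dir);
	return 0;
}
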
@@ -1333,6 +1953,9 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
1333 struct btrfs_ioctl_defrag_range_args *range; 1953 struct btrfs_ioctl_defrag_range_args *range;
1334 int ret; 1954 int ret;
1335 1955
1956 if (btrfs_root_readonly(root))
1957 return -EROFS;
1958
1336 ret = mnt_want_write(file->f_path.mnt); 1959 ret = mnt_want_write(file->f_path.mnt);
1337 if (ret) 1960 if (ret)
1338 return ret; 1961 return ret;
@@ -1376,7 +1999,10 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
1376 /* the rest are all set to zero by kzalloc */ 1999 /* the rest are all set to zero by kzalloc */
1377 range->len = (u64)-1; 2000 range->len = (u64)-1;
1378 } 2001 }
1379 ret = btrfs_defrag_file(file, range); 2002 ret = btrfs_defrag_file(fdentry(file)->d_inode, file,
2003 range, 0, 0);
2004 if (ret > 0)
2005 ret = 0;
1380 kfree(range); 2006 kfree(range);
1381 break; 2007 break;
1382 default: 2008 default:
@@ -1428,6 +2054,80 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg)
1428 return ret; 2054 return ret;
1429} 2055}
1430 2056
2057static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg)
2058{
2059 struct btrfs_ioctl_fs_info_args *fi_args;
2060 struct btrfs_device *device;
2061 struct btrfs_device *next;
2062 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
2063 int ret = 0;
2064
2065 if (!capable(CAP_SYS_ADMIN))
2066 return -EPERM;
2067
2068 fi_args = kzalloc(sizeof(*fi_args), GFP_KERNEL);
2069 if (!fi_args)
2070 return -ENOMEM;
2071
2072 fi_args->num_devices = fs_devices->num_devices;
2073 memcpy(&fi_args->fsid, root->fs_info->fsid, sizeof(fi_args->fsid));
2074
2075 mutex_lock(&fs_devices->device_list_mutex);
2076 list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
2077 if (device->devid > fi_args->max_id)
2078 fi_args->max_id = device->devid;
2079 }
2080 mutex_unlock(&fs_devices->device_list_mutex);
2081
2082 if (copy_to_user(arg, fi_args, sizeof(*fi_args)))
2083 ret = -EFAULT;
2084
2085 kfree(fi_args);
2086 return ret;
2087}
2088
2089static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg)
2090{
2091 struct btrfs_ioctl_dev_info_args *di_args;
2092 struct btrfs_device *dev;
2093 struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
2094 int ret = 0;
2095 char *s_uuid = NULL;
2096 char empty_uuid[BTRFS_UUID_SIZE] = {0};
2097
2098 if (!capable(CAP_SYS_ADMIN))
2099 return -EPERM;
2100
2101 di_args = memdup_user(arg, sizeof(*di_args));
2102 if (IS_ERR(di_args))
2103 return PTR_ERR(di_args);
2104
2105 if (memcmp(empty_uuid, di_args->uuid, BTRFS_UUID_SIZE) != 0)
2106 s_uuid = di_args->uuid;
2107
2108 mutex_lock(&fs_devices->device_list_mutex);
2109 dev = btrfs_find_device(root, di_args->devid, s_uuid, NULL);
2110 mutex_unlock(&fs_devices->device_list_mutex);
2111
2112 if (!dev) {
2113 ret = -ENODEV;
2114 goto out;
2115 }
2116
2117 di_args->devid = dev->devid;
2118 di_args->bytes_used = dev->bytes_used;
2119 di_args->total_bytes = dev->total_bytes;
2120 memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid));
2121 strncpy(di_args->path, dev->name, sizeof(di_args->path));
2122
2123out:
2124 if (ret == 0 && copy_to_user(arg, di_args, sizeof(*di_args)))
2125 ret = -EFAULT;
2126
2127 kfree(di_args);
2128 return ret;
2129}
2130
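
A sketch chaining the two new queries, assuming the argument layouts and ioctl numbers from this series' ioctl.h and a hypothetical mount point; devids may be sparse, so -ENODEV for an individual id is skipped rather than treated as fatal:

/* Sketch: BTRFS_IOC_FS_INFO, then BTRFS_IOC_DEV_INFO for each possible
 * devid up to max_id, mirroring how the handlers above expect to be
 * driven. A zeroed uuid means "match by devid only".
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/types.h>

#define BTRFS_IOCTL_MAGIC 0x94

struct btrfs_ioctl_fs_info_args {
	__u64 max_id;		/* out */
	__u64 num_devices;	/* out */
	__u8 fsid[16];		/* out */
	__u64 reserved[124];	/* pad to 1k */
};
struct btrfs_ioctl_dev_info_args {
	__u64 devid;		/* in/out */
	__u8 uuid[16];		/* in/out */
	__u64 bytes_used;	/* out */
	__u64 total_bytes;	/* out */
	__u64 unused[379];	/* pad to 4k */
	__u8 path[1024];	/* out */
};
#define BTRFS_IOC_FS_INFO \
	_IOR(BTRFS_IOCTL_MAGIC, 31, struct btrfs_ioctl_fs_info_args)
#define BTRFS_IOC_DEV_INFO \
	_IOWR(BTRFS_IOCTL_MAGIC, 30, struct btrfs_ioctl_dev_info_args)

int main(void)
{
	struct btrfs_ioctl_fs_info_args fi;
	struct btrfs_ioctl_dev_info_args di;
	__u64 devid;
	int fd = open("/mnt/btrfs", O_RDONLY);

	if (fd < 0 || ioctl(fd, BTRFS_IOC_FS_INFO, &fi) < 0) {
		perror("fs info");
		return 1;
	}
	for (devid = 1; devid <= fi.max_id; devid++) {
		memset(&di, 0, sizeof(di));	/* zero uuid: match any */
		di.devid = devid;
		if (ioctl(fd, BTRFS_IOC_DEV_INFO, &di) < 0)
			continue;		/* devid hole: -ENODEV */
		printf("devid %llu: %s, %llu/%llu bytes used\n",
		       (unsigned long long)di.devid, (char *)di.path,
		       (unsigned long long)di.bytes_used,
		       (unsigned long long)di.total_bytes);
	}
	close(fd);
	return 0;
}
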
1431static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, 2131static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1432 u64 off, u64 olen, u64 destoff) 2132 u64 off, u64 olen, u64 destoff)
1433{ 2133{
@@ -1461,6 +2161,9 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1461 if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND)) 2161 if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND))
1462 return -EINVAL; 2162 return -EINVAL;
1463 2163
2164 if (btrfs_root_readonly(root))
2165 return -EROFS;
2166
1464 ret = mnt_want_write(file->f_path.mnt); 2167 ret = mnt_want_write(file->f_path.mnt);
1465 if (ret) 2168 if (ret)
1466 return ret; 2169 return ret;
@@ -1502,11 +2205,11 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1502 path->reada = 2; 2205 path->reada = 2;
1503 2206
1504 if (inode < src) { 2207 if (inode < src) {
1505 mutex_lock(&inode->i_mutex); 2208 mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
1506 mutex_lock(&src->i_mutex); 2209 mutex_lock_nested(&src->i_mutex, I_MUTEX_CHILD);
1507 } else { 2210 } else {
1508 mutex_lock(&src->i_mutex); 2211 mutex_lock_nested(&src->i_mutex, I_MUTEX_PARENT);
1509 mutex_lock(&inode->i_mutex); 2212 mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
1510 } 2213 }
1511 2214
1512 /* determine range to clone */ 2215 /* determine range to clone */
@@ -1517,12 +2220,11 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1517 olen = len = src->i_size - off; 2220 olen = len = src->i_size - off;
1518 /* if we extend to eof, continue to block boundary */ 2221 /* if we extend to eof, continue to block boundary */
1519 if (off + len == src->i_size) 2222 if (off + len == src->i_size)
1520 len = ((src->i_size + bs-1) & ~(bs-1)) 2223 len = ALIGN(src->i_size, bs) - off;
1521 - off;
1522 2224
1523 /* verify the end result is block aligned */ 2225 /* verify the end result is block aligned */
1524 if ((off & (bs-1)) || 2226 if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs) ||
1525 ((off + len) & (bs-1))) 2227 !IS_ALIGNED(destoff, bs))
1526 goto out_unlock; 2228 goto out_unlock;
1527 2229
1528 /* do any pending delalloc/csum calc on src, one way or 2230 /* do any pending delalloc/csum calc on src, one way or
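
A sketch of a reflink-style clone against the checks above, assuming the clone_range layout and ioctl number from this series' ioctl.h and hypothetical paths; both offsets must be block-aligned, while a zero length means clone through EOF, whose unaligned tail is rounded up to the block boundary by the ALIGN() logic above:

/* Sketch: reflinking a block-aligned range between two files with
 * BTRFS_IOC_CLONE_RANGE, the path being patched above.
 */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/types.h>

#define BTRFS_IOCTL_MAGIC 0x94

struct btrfs_ioctl_clone_range_args {
	__s64 src_fd;
	__u64 src_offset;
	__u64 src_length;
	__u64 dest_offset;
};
#define BTRFS_IOC_CLONE_RANGE \
	_IOW(BTRFS_IOCTL_MAGIC, 13, struct btrfs_ioctl_clone_range_args)

int main(void)
{
	int src = open("/mnt/btrfs/src.dat", O_RDONLY);
	int dst = open("/mnt/btrfs/dst.dat", O_RDWR | O_CREAT, 0644);
	struct btrfs_ioctl_clone_range_args args = {
		.src_fd = src,
		.src_offset = 0,	/* must be block aligned */
		.src_length = 0,	/* 0: clone through EOF */
		.dest_offset = 0,	/* must be block aligned */
	};

	if (src < 0 || dst < 0 ||
	    ioctl(dst, BTRFS_IOC_CLONE_RANGE, &args) < 0)
		perror("clone range");
	return 0;
}
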
@@ -1530,17 +2232,19 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1530 while (1) { 2232 while (1) {
1531 struct btrfs_ordered_extent *ordered; 2233 struct btrfs_ordered_extent *ordered;
1532 lock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS); 2234 lock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
1533 ordered = btrfs_lookup_first_ordered_extent(inode, off+len); 2235 ordered = btrfs_lookup_first_ordered_extent(src, off+len);
1534 if (BTRFS_I(src)->delalloc_bytes == 0 && !ordered) 2236 if (!ordered &&
2237 !test_range_bit(&BTRFS_I(src)->io_tree, off, off+len,
2238 EXTENT_DELALLOC, 0, NULL))
1535 break; 2239 break;
1536 unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS); 2240 unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
1537 if (ordered) 2241 if (ordered)
1538 btrfs_put_ordered_extent(ordered); 2242 btrfs_put_ordered_extent(ordered);
1539 btrfs_wait_ordered_range(src, off, off+len); 2243 btrfs_wait_ordered_range(src, off, len);
1540 } 2244 }
1541 2245
1542 /* clone data */ 2246 /* clone data */
1543 key.objectid = src->i_ino; 2247 key.objectid = btrfs_ino(src);
1544 key.type = BTRFS_EXTENT_DATA_KEY; 2248 key.type = BTRFS_EXTENT_DATA_KEY;
1545 key.offset = 0; 2249 key.offset = 0;
1546 2250
@@ -1567,7 +2271,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1567 2271
1568 btrfs_item_key_to_cpu(leaf, &key, slot); 2272 btrfs_item_key_to_cpu(leaf, &key, slot);
1569 if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY || 2273 if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY ||
1570 key.objectid != src->i_ino) 2274 key.objectid != btrfs_ino(src))
1571 break; 2275 break;
1572 2276
1573 if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) { 2277 if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) {
@@ -1603,15 +2307,18 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1603 datal = btrfs_file_extent_ram_bytes(leaf, 2307 datal = btrfs_file_extent_ram_bytes(leaf,
1604 extent); 2308 extent);
1605 } 2309 }
1606 btrfs_release_path(root, path); 2310 btrfs_release_path(path);
1607 2311
1608 if (key.offset + datal < off || 2312 if (key.offset + datal <= off ||
1609 key.offset >= off+len) 2313 key.offset >= off+len)
1610 goto next; 2314 goto next;
1611 2315
1612 memcpy(&new_key, &key, sizeof(new_key)); 2316 memcpy(&new_key, &key, sizeof(new_key));
1613 new_key.objectid = inode->i_ino; 2317 new_key.objectid = btrfs_ino(inode);
1614 new_key.offset = key.offset + destoff - off; 2318 if (off <= key.offset)
2319 new_key.offset = key.offset + destoff - off;
2320 else
2321 new_key.offset = destoff;
1615 2322
1616 trans = btrfs_start_transaction(root, 1); 2323 trans = btrfs_start_transaction(root, 1);
1617 if (IS_ERR(trans)) { 2324 if (IS_ERR(trans)) {
@@ -1661,7 +2368,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1661 ret = btrfs_inc_extent_ref(trans, root, 2368 ret = btrfs_inc_extent_ref(trans, root,
1662 disko, diskl, 0, 2369 disko, diskl, 0,
1663 root->root_key.objectid, 2370 root->root_key.objectid,
1664 inode->i_ino, 2371 btrfs_ino(inode),
1665 new_key.offset - datao); 2372 new_key.offset - datao);
1666 BUG_ON(ret); 2373 BUG_ON(ret);
1667 } 2374 }
@@ -1710,7 +2417,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1710 } 2417 }
1711 2418
1712 btrfs_mark_buffer_dirty(leaf); 2419 btrfs_mark_buffer_dirty(leaf);
1713 btrfs_release_path(root, path); 2420 btrfs_release_path(path);
1714 2421
1715 inode->i_mtime = inode->i_ctime = CURRENT_TIME; 2422 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
1716 2423
@@ -1720,8 +2427,8 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1720 * but shouldn't round up the file size 2427 * but shouldn't round up the file size
1721 */ 2428 */
1722 endoff = new_key.offset + datal; 2429 endoff = new_key.offset + datal;
1723 if (endoff > off+olen) 2430 if (endoff > destoff+olen)
1724 endoff = off+olen; 2431 endoff = destoff+olen;
1725 if (endoff > inode->i_size) 2432 if (endoff > inode->i_size)
1726 btrfs_i_size_write(inode, endoff); 2433 btrfs_i_size_write(inode, endoff);
1727 2434
@@ -1731,12 +2438,12 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
1731 btrfs_end_transaction(trans, root); 2438 btrfs_end_transaction(trans, root);
1732 } 2439 }
1733next: 2440next:
1734 btrfs_release_path(root, path); 2441 btrfs_release_path(path);
1735 key.offset++; 2442 key.offset++;
1736 } 2443 }
1737 ret = 0; 2444 ret = 0;
1738out: 2445out:
1739 btrfs_release_path(root, path); 2446 btrfs_release_path(path);
1740 unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS); 2447 unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS);
1741out_unlock: 2448out_unlock:
1742 mutex_unlock(&src->i_mutex); 2449 mutex_unlock(&src->i_mutex);
@@ -1781,26 +2488,26 @@ static long btrfs_ioctl_trans_start(struct file *file)
1781 if (file->private_data) 2488 if (file->private_data)
1782 goto out; 2489 goto out;
1783 2490
2491 ret = -EROFS;
2492 if (btrfs_root_readonly(root))
2493 goto out;
2494
1784 ret = mnt_want_write(file->f_path.mnt); 2495 ret = mnt_want_write(file->f_path.mnt);
1785 if (ret) 2496 if (ret)
1786 goto out; 2497 goto out;
1787 2498
1788 mutex_lock(&root->fs_info->trans_mutex); 2499 atomic_inc(&root->fs_info->open_ioctl_trans);
1789 root->fs_info->open_ioctl_trans++;
1790 mutex_unlock(&root->fs_info->trans_mutex);
1791 2500
1792 ret = -ENOMEM; 2501 ret = -ENOMEM;
1793 trans = btrfs_start_ioctl_transaction(root, 0); 2502 trans = btrfs_start_ioctl_transaction(root);
1794 if (!trans) 2503 if (IS_ERR(trans))
1795 goto out_drop; 2504 goto out_drop;
1796 2505
1797 file->private_data = trans; 2506 file->private_data = trans;
1798 return 0; 2507 return 0;
1799 2508
1800out_drop: 2509out_drop:
1801 mutex_lock(&root->fs_info->trans_mutex); 2510 atomic_dec(&root->fs_info->open_ioctl_trans);
1802 root->fs_info->open_ioctl_trans--;
1803 mutex_unlock(&root->fs_info->trans_mutex);
1804 mnt_drop_write(file->f_path.mnt); 2511 mnt_drop_write(file->f_path.mnt);
1805out: 2512out:
1806 return ret; 2513 return ret;
@@ -1847,9 +2554,9 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
1847 path->leave_spinning = 1; 2554 path->leave_spinning = 1;
1848 2555
1849 trans = btrfs_start_transaction(root, 1); 2556 trans = btrfs_start_transaction(root, 1);
1850 if (!trans) { 2557 if (IS_ERR(trans)) {
1851 btrfs_free_path(path); 2558 btrfs_free_path(path);
1852 return -ENOMEM; 2559 return PTR_ERR(trans);
1853 } 2560 }
1854 2561
1855 dir_id = btrfs_super_root_dir(&root->fs_info->super_copy); 2562 dir_id = btrfs_super_root_dir(&root->fs_info->super_copy);
@@ -1879,35 +2586,80 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
1879 return 0; 2586 return 0;
1880} 2587}
1881 2588
2589static void get_block_group_info(struct list_head *groups_list,
2590 struct btrfs_ioctl_space_info *space)
2591{
2592 struct btrfs_block_group_cache *block_group;
2593
2594 space->total_bytes = 0;
2595 space->used_bytes = 0;
2596 space->flags = 0;
2597 list_for_each_entry(block_group, groups_list, list) {
2598 space->flags = block_group->flags;
2599 space->total_bytes += block_group->key.offset;
2600 space->used_bytes +=
2601 btrfs_block_group_used(&block_group->item);
2602 }
2603}
2604
1882long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg) 2605long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
1883{ 2606{
1884 struct btrfs_ioctl_space_args space_args; 2607 struct btrfs_ioctl_space_args space_args;
1885 struct btrfs_ioctl_space_info space; 2608 struct btrfs_ioctl_space_info space;
1886 struct btrfs_ioctl_space_info *dest; 2609 struct btrfs_ioctl_space_info *dest;
1887 struct btrfs_ioctl_space_info *dest_orig; 2610 struct btrfs_ioctl_space_info *dest_orig;
1888 struct btrfs_ioctl_space_info *user_dest; 2611 struct btrfs_ioctl_space_info __user *user_dest;
1889 struct btrfs_space_info *info; 2612 struct btrfs_space_info *info;
2613 u64 types[] = {BTRFS_BLOCK_GROUP_DATA,
2614 BTRFS_BLOCK_GROUP_SYSTEM,
2615 BTRFS_BLOCK_GROUP_METADATA,
2616 BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA};
2617 int num_types = 4;
1890 int alloc_size; 2618 int alloc_size;
1891 int ret = 0; 2619 int ret = 0;
1892 int slot_count = 0; 2620 u64 slot_count = 0;
2621 int i, c;
1893 2622
1894 if (copy_from_user(&space_args, 2623 if (copy_from_user(&space_args,
1895 (struct btrfs_ioctl_space_args __user *)arg, 2624 (struct btrfs_ioctl_space_args __user *)arg,
1896 sizeof(space_args))) 2625 sizeof(space_args)))
1897 return -EFAULT; 2626 return -EFAULT;
1898 2627
1899 /* first we count slots */ 2628 for (i = 0; i < num_types; i++) {
1900 rcu_read_lock(); 2629 struct btrfs_space_info *tmp;
1901 list_for_each_entry_rcu(info, &root->fs_info->space_info, list) 2630
1902 slot_count++; 2631 info = NULL;
1903 rcu_read_unlock(); 2632 rcu_read_lock();
2633 list_for_each_entry_rcu(tmp, &root->fs_info->space_info,
2634 list) {
2635 if (tmp->flags == types[i]) {
2636 info = tmp;
2637 break;
2638 }
2639 }
2640 rcu_read_unlock();
2641
2642 if (!info)
2643 continue;
2644
2645 down_read(&info->groups_sem);
2646 for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
2647 if (!list_empty(&info->block_groups[c]))
2648 slot_count++;
2649 }
2650 up_read(&info->groups_sem);
2651 }
1904 2652
1905 /* space_slots == 0 means they are asking for a count */ 2653 /* space_slots == 0 means they are asking for a count */
1906 if (space_args.space_slots == 0) { 2654 if (space_args.space_slots == 0) {
1907 space_args.total_spaces = slot_count; 2655 space_args.total_spaces = slot_count;
1908 goto out; 2656 goto out;
1909 } 2657 }
2658
2659 slot_count = min_t(u64, space_args.space_slots, slot_count);
2660
1910 alloc_size = sizeof(*dest) * slot_count; 2661 alloc_size = sizeof(*dest) * slot_count;
2662
1911 /* we generally have at most 6 or so space infos, one for each raid 2663 /* we generally have at most 6 or so space infos, one for each raid
1912 * level. So, a whole page should be more than enough for everyone 2664 * level. So, a whole page should be more than enough for everyone
1913 */ 2665 */
@@ -1921,27 +2673,40 @@ long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
1921 dest_orig = dest; 2673 dest_orig = dest;
1922 2674
1923 /* now we have a buffer to copy into */ 2675 /* now we have a buffer to copy into */
1924 rcu_read_lock(); 2676 for (i = 0; i < num_types; i++) {
1925 list_for_each_entry_rcu(info, &root->fs_info->space_info, list) { 2677 struct btrfs_space_info *tmp;
1926 /* make sure we don't copy more than we allocated
1927 * in our buffer
1928 */
1929 if (slot_count == 0)
1930 break;
1931 slot_count--;
1932 2678
1933 /* make sure userland has enough room in their buffer */ 2679 if (!slot_count)
1934 if (space_args.total_spaces >= space_args.space_slots)
1935 break; 2680 break;
1936 2681
1937 space.flags = info->flags; 2682 info = NULL;
1938 space.total_bytes = info->total_bytes; 2683 rcu_read_lock();
1939 space.used_bytes = info->bytes_used; 2684 list_for_each_entry_rcu(tmp, &root->fs_info->space_info,
1940 memcpy(dest, &space, sizeof(space)); 2685 list) {
1941 dest++; 2686 if (tmp->flags == types[i]) {
1942 space_args.total_spaces++; 2687 info = tmp;
2688 break;
2689 }
2690 }
2691 rcu_read_unlock();
2692
2693 if (!info)
2694 continue;
2695 down_read(&info->groups_sem);
2696 for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
2697 if (!list_empty(&info->block_groups[c])) {
2698 get_block_group_info(&info->block_groups[c],
2699 &space);
2700 memcpy(dest, &space, sizeof(space));
2701 dest++;
2702 space_args.total_spaces++;
2703 slot_count--;
2704 }
2705 if (!slot_count)
2706 break;
2707 }
2708 up_read(&info->groups_sem);
1943 } 2709 }
1944 rcu_read_unlock();
1945 2710
1946 user_dest = (struct btrfs_ioctl_space_info *) 2711 user_dest = (struct btrfs_ioctl_space_info *)
1947 (arg + sizeof(struct btrfs_ioctl_space_args)); 2712 (arg + sizeof(struct btrfs_ioctl_space_args));
@@ -1976,14 +2741,101 @@ long btrfs_ioctl_trans_end(struct file *file)
1976 2741
1977 btrfs_end_transaction(trans, root); 2742 btrfs_end_transaction(trans, root);
1978 2743
1979 mutex_lock(&root->fs_info->trans_mutex); 2744 atomic_dec(&root->fs_info->open_ioctl_trans);
1980 root->fs_info->open_ioctl_trans--;
1981 mutex_unlock(&root->fs_info->trans_mutex);
1982 2745
1983 mnt_drop_write(file->f_path.mnt); 2746 mnt_drop_write(file->f_path.mnt);
1984 return 0; 2747 return 0;
1985} 2748}
1986 2749
2750static noinline long btrfs_ioctl_start_sync(struct file *file, void __user *argp)
2751{
2752 struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root;
2753 struct btrfs_trans_handle *trans;
2754 u64 transid;
2755 int ret;
2756
2757 trans = btrfs_start_transaction(root, 0);
2758 if (IS_ERR(trans))
2759 return PTR_ERR(trans);
2760 transid = trans->transid;
2761 ret = btrfs_commit_transaction_async(trans, root, 0);
2762 if (ret) {
2763 btrfs_end_transaction(trans, root);
2764 return ret;
2765 }
2766
2767 if (argp)
2768 if (copy_to_user(argp, &transid, sizeof(transid)))
2769 return -EFAULT;
2770 return 0;
2771}
2772
2773static noinline long btrfs_ioctl_wait_sync(struct file *file, void __user *argp)
2774{
2775 struct btrfs_root *root = BTRFS_I(file->f_dentry->d_inode)->root;
2776 u64 transid;
2777
2778 if (argp) {
2779 if (copy_from_user(&transid, argp, sizeof(transid)))
2780 return -EFAULT;
2781 } else {
2782 transid = 0; /* current trans */
2783 }
2784 return btrfs_wait_for_commit(root, transid);
2785}
2786
2787static long btrfs_ioctl_scrub(struct btrfs_root *root, void __user *arg)
2788{
2789 int ret;
2790 struct btrfs_ioctl_scrub_args *sa;
2791
2792 if (!capable(CAP_SYS_ADMIN))
2793 return -EPERM;
2794
2795 sa = memdup_user(arg, sizeof(*sa));
2796 if (IS_ERR(sa))
2797 return PTR_ERR(sa);
2798
2799 ret = btrfs_scrub_dev(root, sa->devid, sa->start, sa->end,
2800 &sa->progress, sa->flags & BTRFS_SCRUB_READONLY);
2801
2802 if (copy_to_user(arg, sa, sizeof(*sa)))
2803 ret = -EFAULT;
2804
2805 kfree(sa);
2806 return ret;
2807}
2808
2809static long btrfs_ioctl_scrub_cancel(struct btrfs_root *root, void __user *arg)
2810{
2811 if (!capable(CAP_SYS_ADMIN))
2812 return -EPERM;
2813
2814 return btrfs_scrub_cancel(root);
2815}
2816
2817static long btrfs_ioctl_scrub_progress(struct btrfs_root *root,
2818 void __user *arg)
2819{
2820 struct btrfs_ioctl_scrub_args *sa;
2821 int ret;
2822
2823 if (!capable(CAP_SYS_ADMIN))
2824 return -EPERM;
2825
2826 sa = memdup_user(arg, sizeof(*sa));
2827 if (IS_ERR(sa))
2828 return PTR_ERR(sa);
2829
2830 ret = btrfs_scrub_progress(root, sa->devid, &sa->progress);
2831
2832 if (copy_to_user(arg, sa, sizeof(*sa)))
2833 ret = -EFAULT;
2834
2835 kfree(sa);
2836 return ret;
2837}
2838
1987long btrfs_ioctl(struct file *file, unsigned int 2839long btrfs_ioctl(struct file *file, unsigned int
1988 cmd, unsigned long arg) 2840 cmd, unsigned long arg)
1989{ 2841{
@@ -1997,12 +2849,20 @@ long btrfs_ioctl(struct file *file, unsigned int
1997 return btrfs_ioctl_setflags(file, argp); 2849 return btrfs_ioctl_setflags(file, argp);
1998 case FS_IOC_GETVERSION: 2850 case FS_IOC_GETVERSION:
1999 return btrfs_ioctl_getversion(file, argp); 2851 return btrfs_ioctl_getversion(file, argp);
2852 case FITRIM:
2853 return btrfs_ioctl_fitrim(file, argp);
2000 case BTRFS_IOC_SNAP_CREATE: 2854 case BTRFS_IOC_SNAP_CREATE:
2001 return btrfs_ioctl_snap_create(file, argp, 0); 2855 return btrfs_ioctl_snap_create(file, argp, 0);
2856 case BTRFS_IOC_SNAP_CREATE_V2:
2857 return btrfs_ioctl_snap_create_v2(file, argp, 0);
2002 case BTRFS_IOC_SUBVOL_CREATE: 2858 case BTRFS_IOC_SUBVOL_CREATE:
2003 return btrfs_ioctl_snap_create(file, argp, 1); 2859 return btrfs_ioctl_snap_create(file, argp, 1);
2004 case BTRFS_IOC_SNAP_DESTROY: 2860 case BTRFS_IOC_SNAP_DESTROY:
2005 return btrfs_ioctl_snap_destroy(file, argp); 2861 return btrfs_ioctl_snap_destroy(file, argp);
2862 case BTRFS_IOC_SUBVOL_GETFLAGS:
2863 return btrfs_ioctl_subvol_getflags(file, argp);
2864 case BTRFS_IOC_SUBVOL_SETFLAGS:
2865 return btrfs_ioctl_subvol_setflags(file, argp);
2006 case BTRFS_IOC_DEFAULT_SUBVOL: 2866 case BTRFS_IOC_DEFAULT_SUBVOL:
2007 return btrfs_ioctl_default_subvol(file, argp); 2867 return btrfs_ioctl_default_subvol(file, argp);
2008 case BTRFS_IOC_DEFRAG: 2868 case BTRFS_IOC_DEFRAG:
@@ -2015,6 +2875,10 @@ long btrfs_ioctl(struct file *file, unsigned int
2015 return btrfs_ioctl_add_dev(root, argp); 2875 return btrfs_ioctl_add_dev(root, argp);
2016 case BTRFS_IOC_RM_DEV: 2876 case BTRFS_IOC_RM_DEV:
2017 return btrfs_ioctl_rm_dev(root, argp); 2877 return btrfs_ioctl_rm_dev(root, argp);
2878 case BTRFS_IOC_FS_INFO:
2879 return btrfs_ioctl_fs_info(root, argp);
2880 case BTRFS_IOC_DEV_INFO:
2881 return btrfs_ioctl_dev_info(root, argp);
2018 case BTRFS_IOC_BALANCE: 2882 case BTRFS_IOC_BALANCE:
2019 return btrfs_balance(root->fs_info->dev_root); 2883 return btrfs_balance(root->fs_info->dev_root);
2020 case BTRFS_IOC_CLONE: 2884 case BTRFS_IOC_CLONE:
@@ -2034,6 +2898,16 @@ long btrfs_ioctl(struct file *file, unsigned int
2034 case BTRFS_IOC_SYNC: 2898 case BTRFS_IOC_SYNC:
2035 btrfs_sync_fs(file->f_dentry->d_sb, 1); 2899 btrfs_sync_fs(file->f_dentry->d_sb, 1);
2036 return 0; 2900 return 0;
2901 case BTRFS_IOC_START_SYNC:
2902 return btrfs_ioctl_start_sync(file, argp);
2903 case BTRFS_IOC_WAIT_SYNC:
2904 return btrfs_ioctl_wait_sync(file, argp);
2905 case BTRFS_IOC_SCRUB:
2906 return btrfs_ioctl_scrub(root, argp);
2907 case BTRFS_IOC_SCRUB_CANCEL:
2908 return btrfs_ioctl_scrub_cancel(root, argp);
2909 case BTRFS_IOC_SCRUB_PROGRESS:
2910 return btrfs_ioctl_scrub_progress(root, argp);
2037 } 2911 }
2038 2912
2039 return -ENOTTY; 2913 return -ENOTTY;
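
To show how the new START_SYNC/WAIT_SYNC pair added above is meant to be driven from userspace, here is a minimal sketch (illustrative only, not part of the patch): it assumes the ioctl definitions from the ioctl.h hunk below are installed, and the mount-point path is hypothetical. BTRFS_IOC_START_SYNC kicks off an async commit and reports its transid; BTRFS_IOC_WAIT_SYNC then blocks until exactly that transid is safely on disk.

/* hypothetical userspace helper, error handling trimmed */
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/types.h>
#include "ioctl.h"              /* btrfs ioctl ABI, as defined below */

int btrfs_async_sync_wait(const char *mnt)
{
        __u64 transid;
        int ret, fd = open(mnt, O_RDONLY);

        if (fd < 0)
                return -1;
        /* start the commit; returns as soon as it is queued */
        ret = ioctl(fd, BTRFS_IOC_START_SYNC, &transid);
        if (!ret)
                /* wait for the specific transaction we started */
                ret = ioctl(fd, BTRFS_IOC_WAIT_SYNC, &transid);
        close(fd);
        return ret;
}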
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 424694aa517f..ad1ea789fcb4 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -22,14 +22,93 @@
22 22
23#define BTRFS_IOCTL_MAGIC 0x94 23#define BTRFS_IOCTL_MAGIC 0x94
24#define BTRFS_VOL_NAME_MAX 255 24#define BTRFS_VOL_NAME_MAX 255
25#define BTRFS_PATH_NAME_MAX 4087
26 25
27/* this should be 4k */ 26/* this should be 4k */
27#define BTRFS_PATH_NAME_MAX 4087
28struct btrfs_ioctl_vol_args { 28struct btrfs_ioctl_vol_args {
29 __s64 fd; 29 __s64 fd;
30 char name[BTRFS_PATH_NAME_MAX + 1]; 30 char name[BTRFS_PATH_NAME_MAX + 1];
31}; 31};
32 32
33#define BTRFS_SUBVOL_CREATE_ASYNC (1ULL << 0)
34#define BTRFS_SUBVOL_RDONLY (1ULL << 1)
35#define BTRFS_FSID_SIZE 16
36#define BTRFS_UUID_SIZE 16
37
38#define BTRFS_SUBVOL_NAME_MAX 4039
39struct btrfs_ioctl_vol_args_v2 {
40 __s64 fd;
41 __u64 transid;
42 __u64 flags;
43 __u64 unused[4];
44 char name[BTRFS_SUBVOL_NAME_MAX + 1];
45};
46
47/*
48 * structure to report errors and progress to userspace, either as a
49 * result of a finished scrub, a canceled scrub or a progress inquiry
50 */
51struct btrfs_scrub_progress {
52 __u64 data_extents_scrubbed; /* # of data extents scrubbed */
53 __u64 tree_extents_scrubbed; /* # of tree extents scrubbed */
54 __u64 data_bytes_scrubbed; /* # of data bytes scrubbed */
55 __u64 tree_bytes_scrubbed; /* # of tree bytes scrubbed */
56 __u64 read_errors; /* # of read errors encountered (EIO) */
57 __u64 csum_errors; /* # of failed csum checks */
58 __u64 verify_errors; /* # of occurrences where the metadata
59 * of a tree block did not match the
60 * expected values, like generation or
61 * logical */
62 __u64 no_csum; /* # of 4k data blocks for which no csum
63 * is present, probably the result of
64 * data written with nodatasum */
65 __u64 csum_discards; /* # of csums for which no data was found
66 * in the extent tree. */
67 __u64 super_errors; /* # of bad super blocks encountered */
68 __u64 malloc_errors; /* # of internal kmalloc errors. These
69 * will likely cause an incomplete
70 * scrub */
71 __u64 uncorrectable_errors; /* # of errors where either no intact
72 * copy was found or the writeback
73 * failed */
74 __u64 corrected_errors; /* # of errors corrected */
75 __u64 last_physical; /* last physical address scrubbed. In
76 * case a scrub was aborted, this can
77 * be used to restart the scrub */
78 __u64 unverified_errors; /* # of occurrences where a read for a
79 * full (64k) bio failed, but the re-
80 * check succeeded for each 4k piece.
81 * Intermittent error. */
82};
83
84#define BTRFS_SCRUB_READONLY 1
85struct btrfs_ioctl_scrub_args {
86 __u64 devid; /* in */
87 __u64 start; /* in */
88 __u64 end; /* in */
89 __u64 flags; /* in */
90 struct btrfs_scrub_progress progress; /* out */
91 /* pad to 1k */
92 __u64 unused[(1024-32-sizeof(struct btrfs_scrub_progress))/8];
93};
94
95#define BTRFS_DEVICE_PATH_NAME_MAX 1024
96struct btrfs_ioctl_dev_info_args {
97 __u64 devid; /* in/out */
98 __u8 uuid[BTRFS_UUID_SIZE]; /* in/out */
99 __u64 bytes_used; /* out */
100 __u64 total_bytes; /* out */
101 __u64 unused[379]; /* pad to 4k */
102 __u8 path[BTRFS_DEVICE_PATH_NAME_MAX]; /* out */
103};
104
105struct btrfs_ioctl_fs_info_args {
106 __u64 max_id; /* out */
107 __u64 num_devices; /* out */
108 __u8 fsid[BTRFS_FSID_SIZE]; /* out */
109 __u64 reserved[124]; /* pad to 1k */
110};
111
33#define BTRFS_INO_LOOKUP_PATH_MAX 4080 112#define BTRFS_INO_LOOKUP_PATH_MAX 4080
34struct btrfs_ioctl_ino_lookup_args { 113struct btrfs_ioctl_ino_lookup_args {
35 __u64 treeid; 114 __u64 treeid;
@@ -102,30 +181,6 @@ struct btrfs_ioctl_clone_range_args {
102#define BTRFS_DEFRAG_RANGE_COMPRESS 1 181#define BTRFS_DEFRAG_RANGE_COMPRESS 1
103#define BTRFS_DEFRAG_RANGE_START_IO 2 182#define BTRFS_DEFRAG_RANGE_START_IO 2
104 183
105struct btrfs_ioctl_defrag_range_args {
106 /* start of the defrag operation */
107 __u64 start;
108
109 /* number of bytes to defrag, use (u64)-1 to say all */
110 __u64 len;
111
112 /*
113 * flags for the operation, which can include turning
114 * on compression for this one defrag
115 */
116 __u64 flags;
117
118 /*
119 * any extent bigger than this will be considered
120 * already defragged. Use 0 to take the kernel default
121 * Use 1 to say every single extent must be rewritten
122 */
123 __u32 extent_thresh;
124
125 /* spare for later */
126 __u32 unused[5];
127};
128
129struct btrfs_ioctl_space_info { 184struct btrfs_ioctl_space_info {
130 __u64 flags; 185 __u64 flags;
131 __u64 total_bytes; 186 __u64 total_bytes;
@@ -178,4 +233,19 @@ struct btrfs_ioctl_space_args {
178#define BTRFS_IOC_DEFAULT_SUBVOL _IOW(BTRFS_IOCTL_MAGIC, 19, u64) 233#define BTRFS_IOC_DEFAULT_SUBVOL _IOW(BTRFS_IOCTL_MAGIC, 19, u64)
179#define BTRFS_IOC_SPACE_INFO _IOWR(BTRFS_IOCTL_MAGIC, 20, \ 234#define BTRFS_IOC_SPACE_INFO _IOWR(BTRFS_IOCTL_MAGIC, 20, \
180 struct btrfs_ioctl_space_args) 235 struct btrfs_ioctl_space_args)
236#define BTRFS_IOC_START_SYNC _IOR(BTRFS_IOCTL_MAGIC, 24, __u64)
237#define BTRFS_IOC_WAIT_SYNC _IOW(BTRFS_IOCTL_MAGIC, 22, __u64)
238#define BTRFS_IOC_SNAP_CREATE_V2 _IOW(BTRFS_IOCTL_MAGIC, 23, \
239 struct btrfs_ioctl_vol_args_v2)
240#define BTRFS_IOC_SUBVOL_GETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 25, __u64)
241#define BTRFS_IOC_SUBVOL_SETFLAGS _IOW(BTRFS_IOCTL_MAGIC, 26, __u64)
242#define BTRFS_IOC_SCRUB _IOWR(BTRFS_IOCTL_MAGIC, 27, \
243 struct btrfs_ioctl_scrub_args)
244#define BTRFS_IOC_SCRUB_CANCEL _IO(BTRFS_IOCTL_MAGIC, 28)
245#define BTRFS_IOC_SCRUB_PROGRESS _IOWR(BTRFS_IOCTL_MAGIC, 29, \
246 struct btrfs_ioctl_scrub_args)
247#define BTRFS_IOC_DEV_INFO _IOWR(BTRFS_IOCTL_MAGIC, 30, \
248 struct btrfs_ioctl_dev_info_args)
249#define BTRFS_IOC_FS_INFO _IOR(BTRFS_IOCTL_MAGIC, 31, \
250 struct btrfs_ioctl_fs_info_args)
181#endif 251#endif
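
As a usage sketch for the scrub ABI defined in this header (an illustration, not part of the patch): a read-only scrub of a single device via BTRFS_IOC_SCRUB. The ioctl blocks until the scrub finishes or is canceled, after which the embedded btrfs_scrub_progress holds the final counters. The devid value and the open descriptor on the mount point are illustrative; CAP_SYS_ADMIN is required.

#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/types.h>
#include "ioctl.h"              /* definitions from this header */

int scrub_one_device(int mnt_fd, __u64 devid)
{
        struct btrfs_ioctl_scrub_args sa;

        memset(&sa, 0, sizeof(sa));
        sa.devid = devid;
        sa.start = 0;
        sa.end   = (__u64)-1;                   /* whole device */
        sa.flags = BTRFS_SCRUB_READONLY;        /* verify, never rewrite */

        if (ioctl(mnt_fd, BTRFS_IOC_SCRUB, &sa) < 0)
                return -1;

        printf("csum errors %llu, corrected %llu, uncorrectable %llu\n",
               (unsigned long long)sa.progress.csum_errors,
               (unsigned long long)sa.progress.corrected_errors,
               (unsigned long long)sa.progress.uncorrectable_errors);
        return 0;
}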
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 6151f2ea38bb..66fa43dc3f0f 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -185,31 +185,6 @@ sleep:
185 return 0; 185 return 0;
186} 186}
187 187
188/*
189 * Very quick trylock, this does not spin or schedule. It returns
190 * 1 with the spinlock held if it was able to take the lock, or it
191 * returns zero if it was unable to take the lock.
192 *
193 * After this call, scheduling is not safe without first calling
194 * btrfs_set_lock_blocking()
195 */
196int btrfs_try_tree_lock(struct extent_buffer *eb)
197{
198 if (spin_trylock(&eb->lock)) {
199 if (test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) {
200 /*
201 * we've got the spinlock, but the real owner is
202 * blocking. Drop the spinlock and return failure
203 */
204 spin_unlock(&eb->lock);
205 return 0;
206 }
207 return 1;
208 }
209 /* someone else has the spinlock giveup */
210 return 0;
211}
212
213int btrfs_tree_unlock(struct extent_buffer *eb) 188int btrfs_tree_unlock(struct extent_buffer *eb)
214{ 189{
215 /* 190 /*
diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h
index 6c4ce457168c..5c33a560a2f1 100644
--- a/fs/btrfs/locking.h
+++ b/fs/btrfs/locking.h
@@ -21,8 +21,6 @@
21 21
22int btrfs_tree_lock(struct extent_buffer *eb); 22int btrfs_tree_lock(struct extent_buffer *eb);
23int btrfs_tree_unlock(struct extent_buffer *eb); 23int btrfs_tree_unlock(struct extent_buffer *eb);
24
25int btrfs_try_tree_lock(struct extent_buffer *eb);
26int btrfs_try_spin_lock(struct extent_buffer *eb); 24int btrfs_try_spin_lock(struct extent_buffer *eb);
27 25
28void btrfs_set_lock_blocking(struct extent_buffer *eb); 26void btrfs_set_lock_blocking(struct extent_buffer *eb);
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
new file mode 100644
index 000000000000..a178f5ebea78
--- /dev/null
+++ b/fs/btrfs/lzo.c
@@ -0,0 +1,427 @@
1/*
2 * Copyright (C) 2008 Oracle. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/kernel.h>
20#include <linux/slab.h>
21#include <linux/vmalloc.h>
22#include <linux/init.h>
23#include <linux/err.h>
24#include <linux/sched.h>
25#include <linux/pagemap.h>
26#include <linux/bio.h>
27#include <linux/lzo.h>
28#include "compression.h"
29
30#define LZO_LEN 4
31
32struct workspace {
33 void *mem;
34 void *buf; /* where decompressed data goes */
35 void *cbuf; /* where compressed data goes */
36 struct list_head list;
37};
38
39static void lzo_free_workspace(struct list_head *ws)
40{
41 struct workspace *workspace = list_entry(ws, struct workspace, list);
42
43 vfree(workspace->buf);
44 vfree(workspace->cbuf);
45 vfree(workspace->mem);
46 kfree(workspace);
47}
48
49static struct list_head *lzo_alloc_workspace(void)
50{
51 struct workspace *workspace;
52
53 workspace = kzalloc(sizeof(*workspace), GFP_NOFS);
54 if (!workspace)
55 return ERR_PTR(-ENOMEM);
56
57 workspace->mem = vmalloc(LZO1X_MEM_COMPRESS);
58 workspace->buf = vmalloc(lzo1x_worst_compress(PAGE_CACHE_SIZE));
59 workspace->cbuf = vmalloc(lzo1x_worst_compress(PAGE_CACHE_SIZE));
60 if (!workspace->mem || !workspace->buf || !workspace->cbuf)
61 goto fail;
62
63 INIT_LIST_HEAD(&workspace->list);
64
65 return &workspace->list;
66fail:
67 lzo_free_workspace(&workspace->list);
68 return ERR_PTR(-ENOMEM);
69}
70
71static inline void write_compress_length(char *buf, size_t len)
72{
73 __le32 dlen;
74
75 dlen = cpu_to_le32(len);
76 memcpy(buf, &dlen, LZO_LEN);
77}
78
79static inline size_t read_compress_length(char *buf)
80{
81 __le32 dlen;
82
83 memcpy(&dlen, buf, LZO_LEN);
84 return le32_to_cpu(dlen);
85}
86
87static int lzo_compress_pages(struct list_head *ws,
88 struct address_space *mapping,
89 u64 start, unsigned long len,
90 struct page **pages,
91 unsigned long nr_dest_pages,
92 unsigned long *out_pages,
93 unsigned long *total_in,
94 unsigned long *total_out,
95 unsigned long max_out)
96{
97 struct workspace *workspace = list_entry(ws, struct workspace, list);
98 int ret = 0;
99 char *data_in;
100 char *cpage_out;
101 int nr_pages = 0;
102 struct page *in_page = NULL;
103 struct page *out_page = NULL;
104 unsigned long bytes_left;
105
106 size_t in_len;
107 size_t out_len;
108 char *buf;
109 unsigned long tot_in = 0;
110 unsigned long tot_out = 0;
111 unsigned long pg_bytes_left;
112 unsigned long out_offset;
113 unsigned long bytes;
114
115 *out_pages = 0;
116 *total_out = 0;
117 *total_in = 0;
118
119 in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
120 data_in = kmap(in_page);
121
122 /*
123 * store the size of all chunks of compressed data in
124 * the first 4 bytes
125 */
126 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
127 if (out_page == NULL) {
128 ret = -ENOMEM;
129 goto out;
130 }
131 cpage_out = kmap(out_page);
132 out_offset = LZO_LEN;
133 tot_out = LZO_LEN;
134 pages[0] = out_page;
135 nr_pages = 1;
136 pg_bytes_left = PAGE_CACHE_SIZE - LZO_LEN;
137
138 /* compress at most one page of data each time */
139 in_len = min(len, PAGE_CACHE_SIZE);
140 while (tot_in < len) {
141 ret = lzo1x_1_compress(data_in, in_len, workspace->cbuf,
142 &out_len, workspace->mem);
143 if (ret != LZO_E_OK) {
144 printk(KERN_DEBUG "btrfs lzo compress in loop returned %d\n",
145 ret);
146 ret = -1;
147 goto out;
148 }
149
150 /* store the size of this chunk of compressed data */
151 write_compress_length(cpage_out + out_offset, out_len);
152 tot_out += LZO_LEN;
153 out_offset += LZO_LEN;
154 pg_bytes_left -= LZO_LEN;
155
156 tot_in += in_len;
157 tot_out += out_len;
158
159 /* copy bytes from the working buffer into the pages */
160 buf = workspace->cbuf;
161 while (out_len) {
162 bytes = min_t(unsigned long, pg_bytes_left, out_len);
163
164 memcpy(cpage_out + out_offset, buf, bytes);
165
166 out_len -= bytes;
167 pg_bytes_left -= bytes;
168 buf += bytes;
169 out_offset += bytes;
170
171 /*
172 * we need another page for writing out.
173 *
174 * Note if there's less than 4 bytes left, we just
175 * skip to a new page.
176 */
177 if ((out_len == 0 && pg_bytes_left < LZO_LEN) ||
178 pg_bytes_left == 0) {
179 if (pg_bytes_left) {
180 memset(cpage_out + out_offset, 0,
181 pg_bytes_left);
182 tot_out += pg_bytes_left;
183 }
184
185 /* we're done, don't allocate new page */
186 if (out_len == 0 && tot_in >= len)
187 break;
188
189 kunmap(out_page);
190 if (nr_pages == nr_dest_pages) {
191 out_page = NULL;
192 ret = -1;
193 goto out;
194 }
195
196 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
197 if (out_page == NULL) {
198 ret = -ENOMEM;
199 goto out;
200 }
201 cpage_out = kmap(out_page);
202 pages[nr_pages++] = out_page;
203
204 pg_bytes_left = PAGE_CACHE_SIZE;
205 out_offset = 0;
206 }
207 }
208
209 /* we're making it bigger, give up */
210 if (tot_in > 8192 && tot_in < tot_out)
211 goto out;
212
213 /* we're all done */
214 if (tot_in >= len)
215 break;
216
217 if (tot_out > max_out)
218 break;
219
220 bytes_left = len - tot_in;
221 kunmap(in_page);
222 page_cache_release(in_page);
223
224 start += PAGE_CACHE_SIZE;
225 in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT);
226 data_in = kmap(in_page);
227 in_len = min(bytes_left, PAGE_CACHE_SIZE);
228 }
229
230 if (tot_out > tot_in)
231 goto out;
232
233 /* store the size of all chunks of compressed data */
234 cpage_out = kmap(pages[0]);
235 write_compress_length(cpage_out, tot_out);
236
237 kunmap(pages[0]);
238
239 ret = 0;
240 *total_out = tot_out;
241 *total_in = tot_in;
242out:
243 *out_pages = nr_pages;
244 if (out_page)
245 kunmap(out_page);
246
247 if (in_page) {
248 kunmap(in_page);
249 page_cache_release(in_page);
250 }
251
252 return ret;
253}
254
255static int lzo_decompress_biovec(struct list_head *ws,
256 struct page **pages_in,
257 u64 disk_start,
258 struct bio_vec *bvec,
259 int vcnt,
260 size_t srclen)
261{
262 struct workspace *workspace = list_entry(ws, struct workspace, list);
263 int ret = 0, ret2;
264 char *data_in;
265 unsigned long page_in_index = 0;
266 unsigned long page_out_index = 0;
267 unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) /
268 PAGE_CACHE_SIZE;
269 unsigned long buf_start;
270 unsigned long buf_offset = 0;
271 unsigned long bytes;
272 unsigned long working_bytes;
273 unsigned long pg_offset;
274
275 size_t in_len;
276 size_t out_len;
277 unsigned long in_offset;
278 unsigned long in_page_bytes_left;
279 unsigned long tot_in;
280 unsigned long tot_out;
281 unsigned long tot_len;
282 char *buf;
283 bool may_late_unmap, need_unmap;
284
285 data_in = kmap(pages_in[0]);
286 tot_len = read_compress_length(data_in);
287
288 tot_in = LZO_LEN;
289 in_offset = LZO_LEN;
290 tot_len = min_t(size_t, srclen, tot_len);
291 in_page_bytes_left = PAGE_CACHE_SIZE - LZO_LEN;
292
293 tot_out = 0;
294 pg_offset = 0;
295
296 while (tot_in < tot_len) {
297 in_len = read_compress_length(data_in + in_offset);
298 in_page_bytes_left -= LZO_LEN;
299 in_offset += LZO_LEN;
300 tot_in += LZO_LEN;
301
302 tot_in += in_len;
303 working_bytes = in_len;
304 may_late_unmap = need_unmap = false;
305
306 /* fast path: avoid using the working buffer */
307 if (in_page_bytes_left >= in_len) {
308 buf = data_in + in_offset;
309 bytes = in_len;
310 may_late_unmap = true;
311 goto cont;
312 }
313
314 /* copy bytes from the pages into the working buffer */
315 buf = workspace->cbuf;
316 buf_offset = 0;
317 while (working_bytes) {
318 bytes = min(working_bytes, in_page_bytes_left);
319
320 memcpy(buf + buf_offset, data_in + in_offset, bytes);
321 buf_offset += bytes;
322cont:
323 working_bytes -= bytes;
324 in_page_bytes_left -= bytes;
325 in_offset += bytes;
326
327 /* check if we need to pick another page */
328 if ((working_bytes == 0 && in_page_bytes_left < LZO_LEN)
329 || in_page_bytes_left == 0) {
330 tot_in += in_page_bytes_left;
331
332 if (working_bytes == 0 && tot_in >= tot_len)
333 break;
334
335 if (page_in_index + 1 >= total_pages_in) {
336 ret = -1;
337 goto done;
338 }
339
340 if (may_late_unmap)
341 need_unmap = true;
342 else
343 kunmap(pages_in[page_in_index]);
344
345 data_in = kmap(pages_in[++page_in_index]);
346
347 in_page_bytes_left = PAGE_CACHE_SIZE;
348 in_offset = 0;
349 }
350 }
351
352 out_len = lzo1x_worst_compress(PAGE_CACHE_SIZE);
353 ret = lzo1x_decompress_safe(buf, in_len, workspace->buf,
354 &out_len);
355 if (need_unmap)
356 kunmap(pages_in[page_in_index - 1]);
357 if (ret != LZO_E_OK) {
358 printk(KERN_WARNING "btrfs decompress failed\n");
359 ret = -1;
360 break;
361 }
362
363 buf_start = tot_out;
364 tot_out += out_len;
365
366 ret2 = btrfs_decompress_buf2page(workspace->buf, buf_start,
367 tot_out, disk_start,
368 bvec, vcnt,
369 &page_out_index, &pg_offset);
370 if (ret2 == 0)
371 break;
372 }
373done:
374 kunmap(pages_in[page_in_index]);
375 return ret;
376}
377
378static int lzo_decompress(struct list_head *ws, unsigned char *data_in,
379 struct page *dest_page,
380 unsigned long start_byte,
381 size_t srclen, size_t destlen)
382{
383 struct workspace *workspace = list_entry(ws, struct workspace, list);
384 size_t in_len;
385 size_t out_len;
386 size_t tot_len;
387 int ret = 0;
388 char *kaddr;
389 unsigned long bytes;
390
391 BUG_ON(srclen < LZO_LEN);
392
393 tot_len = read_compress_length(data_in);
394 data_in += LZO_LEN;
395
396 in_len = read_compress_length(data_in);
397 data_in += LZO_LEN;
398
399 out_len = PAGE_CACHE_SIZE;
400 ret = lzo1x_decompress_safe(data_in, in_len, workspace->buf, &out_len);
401 if (ret != LZO_E_OK) {
402 printk(KERN_WARNING "btrfs decompress failed!\n");
403 ret = -1;
404 goto out;
405 }
406
407 if (out_len < start_byte) {
408 ret = -1;
409 goto out;
410 }
411
412 bytes = min_t(unsigned long, destlen, out_len - start_byte);
413
414 kaddr = kmap_atomic(dest_page, KM_USER0);
415 memcpy(kaddr, workspace->buf + start_byte, bytes);
416 kunmap_atomic(kaddr, KM_USER0);
417out:
418 return ret;
419}
420
421struct btrfs_compress_op btrfs_lzo_compress = {
422 .alloc_workspace = lzo_alloc_workspace,
423 .free_workspace = lzo_free_workspace,
424 .compress_pages = lzo_compress_pages,
425 .decompress_biovec = lzo_decompress_biovec,
426 .decompress = lzo_decompress,
427};
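
The new file above defines btrfs' on-disk LZO framing: a 4-byte little-endian total length, followed by repeated (4-byte little-endian segment length, segment bytes), with segments laid out so a length header never straddles a page. A sketch of the same walk over an already-contiguous buffer (illustrative only: the kernel code must kmap page by page, and the page-boundary zero padding it emits is ignored here):

#include <stdint.h>
#include <stddef.h>

static uint32_t lzo_le32(const unsigned char *p)
{
        return (uint32_t)p[0] | ((uint32_t)p[1] << 8) |
               ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
}

/* returns the number of compressed segments, or -1 on bad framing */
int walk_lzo_frames(const unsigned char *buf, size_t buflen)
{
        uint32_t tot_len;
        size_t off = 4;
        int segs = 0;

        if (buflen < 4)
                return -1;
        tot_len = lzo_le32(buf);        /* includes every length header */
        if (tot_len > buflen)
                return -1;

        while (off + 4 <= tot_len) {
                uint32_t seg_len = lzo_le32(buf + off);

                off += 4;
                if (seg_len == 0 || off + seg_len > tot_len)
                        return -1;
                /* lzo1x_decompress_safe(buf + off, seg_len, ...) here */
                off += seg_len;
                segs++;
        }
        return segs;
}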
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index e56c72bc5add..a1c940425307 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -141,7 +141,7 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
141 u64 file_offset) 141 u64 file_offset)
142{ 142{
143 struct rb_root *root = &tree->tree; 143 struct rb_root *root = &tree->tree;
144 struct rb_node *prev; 144 struct rb_node *prev = NULL;
145 struct rb_node *ret; 145 struct rb_node *ret;
146 struct btrfs_ordered_extent *entry; 146 struct btrfs_ordered_extent *entry;
147 147
@@ -172,7 +172,7 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
172 */ 172 */
173static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, 173static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
174 u64 start, u64 len, u64 disk_len, 174 u64 start, u64 len, u64 disk_len,
175 int type, int dio) 175 int type, int dio, int compress_type)
176{ 176{
177 struct btrfs_ordered_inode_tree *tree; 177 struct btrfs_ordered_inode_tree *tree;
178 struct rb_node *node; 178 struct rb_node *node;
@@ -189,6 +189,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
189 entry->disk_len = disk_len; 189 entry->disk_len = disk_len;
190 entry->bytes_left = len; 190 entry->bytes_left = len;
191 entry->inode = inode; 191 entry->inode = inode;
192 entry->compress_type = compress_type;
192 if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE) 193 if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
193 set_bit(type, &entry->flags); 194 set_bit(type, &entry->flags);
194 195
@@ -201,6 +202,8 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
201 INIT_LIST_HEAD(&entry->list); 202 INIT_LIST_HEAD(&entry->list);
202 INIT_LIST_HEAD(&entry->root_extent_list); 203 INIT_LIST_HEAD(&entry->root_extent_list);
203 204
205 trace_btrfs_ordered_extent_add(inode, entry);
206
204 spin_lock(&tree->lock); 207 spin_lock(&tree->lock);
205 node = tree_insert(&tree->tree, file_offset, 208 node = tree_insert(&tree->tree, file_offset,
206 &entry->rb_node); 209 &entry->rb_node);
@@ -220,14 +223,25 @@ int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
220 u64 start, u64 len, u64 disk_len, int type) 223 u64 start, u64 len, u64 disk_len, int type)
221{ 224{
222 return __btrfs_add_ordered_extent(inode, file_offset, start, len, 225 return __btrfs_add_ordered_extent(inode, file_offset, start, len,
223 disk_len, type, 0); 226 disk_len, type, 0,
227 BTRFS_COMPRESS_NONE);
224} 228}
225 229
226int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset, 230int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
227 u64 start, u64 len, u64 disk_len, int type) 231 u64 start, u64 len, u64 disk_len, int type)
228{ 232{
229 return __btrfs_add_ordered_extent(inode, file_offset, start, len, 233 return __btrfs_add_ordered_extent(inode, file_offset, start, len,
230 disk_len, type, 1); 234 disk_len, type, 1,
235 BTRFS_COMPRESS_NONE);
236}
237
238int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
239 u64 start, u64 len, u64 disk_len,
240 int type, int compress_type)
241{
242 return __btrfs_add_ordered_extent(inode, file_offset, start, len,
243 disk_len, type, 0,
244 compress_type);
231} 245}
232 246
233/* 247/*
@@ -250,6 +264,73 @@ int btrfs_add_ordered_sum(struct inode *inode,
250 264
251/* 265/*
252 * this is used to account for finished IO across a given range 266 * this is used to account for finished IO across a given range
267 * of the file. The IO may span ordered extents. If
268 * a given ordered_extent is completely done, 1 is returned, otherwise
269 * 0.
270 *
271 * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used
272 * to make sure this function only returns 1 once for a given ordered extent.
273 *
274 * file_offset is updated to one byte past the range that is recorded as
275 * complete. This allows you to walk forward in the file.
276 */
277int btrfs_dec_test_first_ordered_pending(struct inode *inode,
278 struct btrfs_ordered_extent **cached,
279 u64 *file_offset, u64 io_size)
280{
281 struct btrfs_ordered_inode_tree *tree;
282 struct rb_node *node;
283 struct btrfs_ordered_extent *entry = NULL;
284 int ret;
285 u64 dec_end;
286 u64 dec_start;
287 u64 to_dec;
288
289 tree = &BTRFS_I(inode)->ordered_tree;
290 spin_lock(&tree->lock);
291 node = tree_search(tree, *file_offset);
292 if (!node) {
293 ret = 1;
294 goto out;
295 }
296
297 entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
298 if (!offset_in_entry(entry, *file_offset)) {
299 ret = 1;
300 goto out;
301 }
302
303 dec_start = max(*file_offset, entry->file_offset);
304 dec_end = min(*file_offset + io_size, entry->file_offset +
305 entry->len);
306 *file_offset = dec_end;
307 if (dec_start > dec_end) {
308 printk(KERN_CRIT "bad ordering dec_start %llu end %llu\n",
309 (unsigned long long)dec_start,
310 (unsigned long long)dec_end);
311 }
312 to_dec = dec_end - dec_start;
313 if (to_dec > entry->bytes_left) {
314 printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n",
315 (unsigned long long)entry->bytes_left,
316 (unsigned long long)to_dec);
317 }
318 entry->bytes_left -= to_dec;
319 if (entry->bytes_left == 0)
320 ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
321 else
322 ret = 1;
323out:
324 if (!ret && cached && entry) {
325 *cached = entry;
326 atomic_inc(&entry->refs);
327 }
328 spin_unlock(&tree->lock);
329 return ret == 0;
330}
331
332/*
333 * this is used to account for finished IO across a given range
253 * of the file. The IO should not span ordered extents. If 334 * of the file. The IO should not span ordered extents. If
254 * a given ordered_extent is completely done, 1 is returned, otherwise 335 * a given ordered_extent is completely done, 1 is returned, otherwise
255 * 0. 336 * 0.
@@ -308,6 +389,8 @@ int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
308 struct list_head *cur; 389 struct list_head *cur;
309 struct btrfs_ordered_sum *sum; 390 struct btrfs_ordered_sum *sum;
310 391
392 trace_btrfs_ordered_extent_put(entry->inode, entry);
393
311 if (atomic_dec_and_test(&entry->refs)) { 394 if (atomic_dec_and_test(&entry->refs)) {
312 while (!list_empty(&entry->list)) { 395 while (!list_empty(&entry->list)) {
313 cur = entry->list.next; 396 cur = entry->list.next;
@@ -341,6 +424,8 @@ static int __btrfs_remove_ordered_extent(struct inode *inode,
341 spin_lock(&root->fs_info->ordered_extent_lock); 424 spin_lock(&root->fs_info->ordered_extent_lock);
342 list_del_init(&entry->root_extent_list); 425 list_del_init(&entry->root_extent_list);
343 426
427 trace_btrfs_ordered_extent_remove(inode, entry);
428
344 /* 429 /*
345 * we have no more ordered extents for this inode and 430 * we have no more ordered extents for this inode and
346 * no dirty pages. We can safely remove it from the 431 * no dirty pages. We can safely remove it from the
@@ -506,6 +591,8 @@ void btrfs_start_ordered_extent(struct inode *inode,
506 u64 start = entry->file_offset; 591 u64 start = entry->file_offset;
507 u64 end = start + entry->len - 1; 592 u64 end = start + entry->len - 1;
508 593
594 trace_btrfs_ordered_extent_start(inode, entry);
595
509 /* 596 /*
510 * pages in the range can be dirty, clean or writeback. We 597 * pages in the range can be dirty, clean or writeback. We
511 * start IO on any dirty ones so the wait doesn't stall waiting 598 * start IO on any dirty ones so the wait doesn't stall waiting
@@ -526,7 +613,6 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
526{ 613{
527 u64 end; 614 u64 end;
528 u64 orig_end; 615 u64 orig_end;
529 u64 wait_end;
530 struct btrfs_ordered_extent *ordered; 616 struct btrfs_ordered_extent *ordered;
531 int found; 617 int found;
532 618
@@ -537,7 +623,6 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
537 if (orig_end > INT_LIMIT(loff_t)) 623 if (orig_end > INT_LIMIT(loff_t))
538 orig_end = INT_LIMIT(loff_t); 624 orig_end = INT_LIMIT(loff_t);
539 } 625 }
540 wait_end = orig_end;
541again: 626again:
542 /* start IO across the range first to instantiate any delalloc 627 /* start IO across the range first to instantiate any delalloc
543 * extents 628 * extents
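
The caller pattern the new btrfs_dec_test_first_ordered_pending helper enables, sketched in kernel style (illustrative, modeled loosely on an endio path; finish_ordered() is a hypothetical stand-in for the caller's completion work). Because the helper advances *file_offset itself, the loop walks forward across every ordered extent the IO spanned; it assumes the range is fully covered by ordered extents, which the write path arranges:

static void finish_completed_range(struct inode *inode, u64 start, u64 len)
{
        u64 cur = start;

        while (cur < start + len) {
                struct btrfs_ordered_extent *ordered = NULL;

                if (btrfs_dec_test_first_ordered_pending(inode, &ordered,
                                                         &cur,
                                                         start + len - cur)) {
                        /* this ordered extent is now fully written */
                        finish_ordered(inode, ordered); /* caller's work */
                        btrfs_put_ordered_extent(ordered);
                }
        }
}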
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 8ac365492a3f..ff1f69aa1883 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -68,7 +68,7 @@ struct btrfs_ordered_sum {
68 68
69#define BTRFS_ORDERED_NOCOW 2 /* set when we want to write in place */ 69#define BTRFS_ORDERED_NOCOW 2 /* set when we want to write in place */
70 70
71#define BTRFS_ORDERED_COMPRESSED 3 /* writing a compressed extent */ 71#define BTRFS_ORDERED_COMPRESSED 3 /* writing a zlib compressed extent */
72 72
73#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */ 73#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */
74 74
@@ -93,6 +93,9 @@ struct btrfs_ordered_extent {
93 /* flags (described above) */ 93 /* flags (described above) */
94 unsigned long flags; 94 unsigned long flags;
95 95
96 /* compression algorithm */
97 int compress_type;
98
96 /* reference count */ 99 /* reference count */
97 atomic_t refs; 100 atomic_t refs;
98 101
@@ -141,10 +144,16 @@ int btrfs_remove_ordered_extent(struct inode *inode,
141int btrfs_dec_test_ordered_pending(struct inode *inode, 144int btrfs_dec_test_ordered_pending(struct inode *inode,
142 struct btrfs_ordered_extent **cached, 145 struct btrfs_ordered_extent **cached,
143 u64 file_offset, u64 io_size); 146 u64 file_offset, u64 io_size);
147int btrfs_dec_test_first_ordered_pending(struct inode *inode,
148 struct btrfs_ordered_extent **cached,
149 u64 *file_offset, u64 io_size);
144int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, 150int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
145 u64 start, u64 len, u64 disk_len, int type); 151 u64 start, u64 len, u64 disk_len, int type);
146int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset, 152int btrfs_add_ordered_extent_dio(struct inode *inode, u64 file_offset,
147 u64 start, u64 len, u64 disk_len, int type); 153 u64 start, u64 len, u64 disk_len, int type);
154int btrfs_add_ordered_extent_compress(struct inode *inode, u64 file_offset,
155 u64 start, u64 len, u64 disk_len,
156 int type, int compress_type);
148int btrfs_add_ordered_sum(struct inode *inode, 157int btrfs_add_ordered_sum(struct inode *inode,
149 struct btrfs_ordered_extent *entry, 158 struct btrfs_ordered_extent *entry,
150 struct btrfs_ordered_sum *sum); 159 struct btrfs_ordered_sum *sum);
diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c
index 79cba5fbc28e..f8be250963a0 100644
--- a/fs/btrfs/orphan.c
+++ b/fs/btrfs/orphan.c
@@ -56,8 +56,12 @@ int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
56 return -ENOMEM; 56 return -ENOMEM;
57 57
58 ret = btrfs_search_slot(trans, root, &key, path, -1, 1); 58 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
59 if (ret) 59 if (ret < 0)
60 goto out; 60 goto out;
61 if (ret) {
62 ret = -ENOENT;
63 goto out;
64 }
61 65
62 ret = btrfs_del_item(trans, root, path); 66 ret = btrfs_del_item(trans, root, path);
63 67
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 0d126be22b63..fb2605d998e9 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -260,6 +260,7 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
260#else 260#else
261 BUG(); 261 BUG();
262#endif 262#endif
263 break;
263 case BTRFS_BLOCK_GROUP_ITEM_KEY: 264 case BTRFS_BLOCK_GROUP_ITEM_KEY:
264 bi = btrfs_item_ptr(l, i, 265 bi = btrfs_item_ptr(l, i,
265 struct btrfs_block_group_item); 266 struct btrfs_block_group_item);
diff --git a/fs/btrfs/ref-cache.c b/fs/btrfs/ref-cache.c
index a97314cf6bd6..82d569cb6267 100644
--- a/fs/btrfs/ref-cache.c
+++ b/fs/btrfs/ref-cache.c
@@ -23,56 +23,6 @@
23#include "ref-cache.h" 23#include "ref-cache.h"
24#include "transaction.h" 24#include "transaction.h"
25 25
26/*
27 * leaf refs are used to cache the information about which extents
28 * a given leaf has references on. This allows us to process that leaf
29 * in btrfs_drop_snapshot without needing to read it back from disk.
30 */
31
32/*
33 * kmalloc a leaf reference struct and update the counters for the
34 * total ref cache size
35 */
36struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(struct btrfs_root *root,
37 int nr_extents)
38{
39 struct btrfs_leaf_ref *ref;
40 size_t size = btrfs_leaf_ref_size(nr_extents);
41
42 ref = kmalloc(size, GFP_NOFS);
43 if (ref) {
44 spin_lock(&root->fs_info->ref_cache_lock);
45 root->fs_info->total_ref_cache_size += size;
46 spin_unlock(&root->fs_info->ref_cache_lock);
47
48 memset(ref, 0, sizeof(*ref));
49 atomic_set(&ref->usage, 1);
50 INIT_LIST_HEAD(&ref->list);
51 }
52 return ref;
53}
54
55/*
56 * free a leaf reference struct and update the counters for the
57 * total ref cache size
58 */
59void btrfs_free_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref)
60{
61 if (!ref)
62 return;
63 WARN_ON(atomic_read(&ref->usage) == 0);
64 if (atomic_dec_and_test(&ref->usage)) {
65 size_t size = btrfs_leaf_ref_size(ref->nritems);
66
67 BUG_ON(ref->in_tree);
68 kfree(ref);
69
70 spin_lock(&root->fs_info->ref_cache_lock);
71 root->fs_info->total_ref_cache_size -= size;
72 spin_unlock(&root->fs_info->ref_cache_lock);
73 }
74}
75
76static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr, 26static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr,
77 struct rb_node *node) 27 struct rb_node *node)
78{ 28{
@@ -116,117 +66,3 @@ static struct rb_node *tree_search(struct rb_root *root, u64 bytenr)
116 } 66 }
117 return NULL; 67 return NULL;
118} 68}
119
120int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen,
121 int shared)
122{
123 struct btrfs_leaf_ref *ref = NULL;
124 struct btrfs_leaf_ref_tree *tree = root->ref_tree;
125
126 if (shared)
127 tree = &root->fs_info->shared_ref_tree;
128 if (!tree)
129 return 0;
130
131 spin_lock(&tree->lock);
132 while (!list_empty(&tree->list)) {
133 ref = list_entry(tree->list.next, struct btrfs_leaf_ref, list);
134 BUG_ON(ref->tree != tree);
135 if (ref->root_gen > max_root_gen)
136 break;
137 if (!xchg(&ref->in_tree, 0)) {
138 cond_resched_lock(&tree->lock);
139 continue;
140 }
141
142 rb_erase(&ref->rb_node, &tree->root);
143 list_del_init(&ref->list);
144
145 spin_unlock(&tree->lock);
146 btrfs_free_leaf_ref(root, ref);
147 cond_resched();
148 spin_lock(&tree->lock);
149 }
150 spin_unlock(&tree->lock);
151 return 0;
152}
153
154/*
155 * find the leaf ref for a given extent. This returns the ref struct with
156 * a usage reference incremented
157 */
158struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root,
159 u64 bytenr)
160{
161 struct rb_node *rb;
162 struct btrfs_leaf_ref *ref = NULL;
163 struct btrfs_leaf_ref_tree *tree = root->ref_tree;
164again:
165 if (tree) {
166 spin_lock(&tree->lock);
167 rb = tree_search(&tree->root, bytenr);
168 if (rb)
169 ref = rb_entry(rb, struct btrfs_leaf_ref, rb_node);
170 if (ref)
171 atomic_inc(&ref->usage);
172 spin_unlock(&tree->lock);
173 if (ref)
174 return ref;
175 }
176 if (tree != &root->fs_info->shared_ref_tree) {
177 tree = &root->fs_info->shared_ref_tree;
178 goto again;
179 }
180 return NULL;
181}
182
183/*
184 * add a fully filled in leaf ref struct
185 * remove all the refs older than a given root generation
186 */
187int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref,
188 int shared)
189{
190 int ret = 0;
191 struct rb_node *rb;
192 struct btrfs_leaf_ref_tree *tree = root->ref_tree;
193
194 if (shared)
195 tree = &root->fs_info->shared_ref_tree;
196
197 spin_lock(&tree->lock);
198 rb = tree_insert(&tree->root, ref->bytenr, &ref->rb_node);
199 if (rb) {
200 ret = -EEXIST;
201 } else {
202 atomic_inc(&ref->usage);
203 ref->tree = tree;
204 ref->in_tree = 1;
205 list_add_tail(&ref->list, &tree->list);
206 }
207 spin_unlock(&tree->lock);
208 return ret;
209}
210
211/*
212 * remove a single leaf ref from the tree. This drops the ref held by the tree
213 * only
214 */
215int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref)
216{
217 struct btrfs_leaf_ref_tree *tree;
218
219 if (!xchg(&ref->in_tree, 0))
220 return 0;
221
222 tree = ref->tree;
223 spin_lock(&tree->lock);
224
225 rb_erase(&ref->rb_node, &tree->root);
226 list_del_init(&ref->list);
227
228 spin_unlock(&tree->lock);
229
230 btrfs_free_leaf_ref(root, ref);
231 return 0;
232}
diff --git a/fs/btrfs/ref-cache.h b/fs/btrfs/ref-cache.h
index e2a55cb2072b..24f7001f6387 100644
--- a/fs/btrfs/ref-cache.h
+++ b/fs/btrfs/ref-cache.h
@@ -49,28 +49,4 @@ static inline size_t btrfs_leaf_ref_size(int nr_extents)
49 return sizeof(struct btrfs_leaf_ref) + 49 return sizeof(struct btrfs_leaf_ref) +
50 sizeof(struct btrfs_extent_info) * nr_extents; 50 sizeof(struct btrfs_extent_info) * nr_extents;
51} 51}
52
53static inline void btrfs_leaf_ref_tree_init(struct btrfs_leaf_ref_tree *tree)
54{
55 tree->root = RB_ROOT;
56 INIT_LIST_HEAD(&tree->list);
57 spin_lock_init(&tree->lock);
58}
59
60static inline int btrfs_leaf_ref_tree_empty(struct btrfs_leaf_ref_tree *tree)
61{
62 return RB_EMPTY_ROOT(&tree->root);
63}
64
65void btrfs_leaf_ref_tree_init(struct btrfs_leaf_ref_tree *tree);
66struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(struct btrfs_root *root,
67 int nr_extents);
68void btrfs_free_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref);
69struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root,
70 u64 bytenr);
71int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref,
72 int shared);
73int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen,
74 int shared);
75int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref);
76#endif 52#endif
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index b37d723b9d4a..5e0a3dc79a45 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -29,6 +29,8 @@
29#include "locking.h" 29#include "locking.h"
30#include "btrfs_inode.h" 30#include "btrfs_inode.h"
31#include "async-thread.h" 31#include "async-thread.h"
32#include "free-space-cache.h"
33#include "inode-map.h"
32 34
33/* 35/*
34 * backref_node, mapping_node and tree_block start with this 36 * backref_node, mapping_node and tree_block start with this
@@ -178,8 +180,6 @@ struct reloc_control {
178 u64 search_start; 180 u64 search_start;
179 u64 extents_found; 181 u64 extents_found;
180 182
181 int block_rsv_retries;
182
183 unsigned int stage:8; 183 unsigned int stage:8;
184 unsigned int create_reloc_tree:1; 184 unsigned int create_reloc_tree:1;
185 unsigned int merge_reloc_tree:1; 185 unsigned int merge_reloc_tree:1;
@@ -508,6 +508,7 @@ static int update_backref_cache(struct btrfs_trans_handle *trans,
508 return 1; 508 return 1;
509} 509}
510 510
511
511static int should_ignore_root(struct btrfs_root *root) 512static int should_ignore_root(struct btrfs_root *root)
512{ 513{
513 struct btrfs_root *reloc_root; 514 struct btrfs_root *reloc_root;
@@ -530,7 +531,6 @@ static int should_ignore_root(struct btrfs_root *root)
530 */ 531 */
531 return 1; 532 return 1;
532} 533}
533
534/* 534/*
535 * find reloc tree by address of tree root 535 * find reloc tree by address of tree root
536 */ 536 */
@@ -677,6 +677,8 @@ struct backref_node *build_backref_tree(struct reloc_control *rc,
677 err = -ENOMEM; 677 err = -ENOMEM;
678 goto out; 678 goto out;
679 } 679 }
680 path1->reada = 1;
681 path2->reada = 2;
680 682
681 node = alloc_backref_node(cache); 683 node = alloc_backref_node(cache);
682 if (!node) { 684 if (!node) {
@@ -710,7 +712,7 @@ again:
710 WARN_ON(cur->checked); 712 WARN_ON(cur->checked);
711 if (!list_empty(&cur->upper)) { 713 if (!list_empty(&cur->upper)) {
712 /* 714 /*
713 * the backref was added previously when processsing 715 * the backref was added previously when processing
714 * backref of type BTRFS_TREE_BLOCK_REF_KEY 716 * backref of type BTRFS_TREE_BLOCK_REF_KEY
715 */ 717 */
716 BUG_ON(!list_is_singular(&cur->upper)); 718 BUG_ON(!list_is_singular(&cur->upper));
@@ -962,7 +964,7 @@ again:
962 lower = upper; 964 lower = upper;
963 upper = NULL; 965 upper = NULL;
964 } 966 }
965 btrfs_release_path(root, path2); 967 btrfs_release_path(path2);
966next: 968next:
967 if (ptr < end) { 969 if (ptr < end) {
968 ptr += btrfs_extent_inline_ref_size(key.type); 970 ptr += btrfs_extent_inline_ref_size(key.type);
@@ -975,7 +977,7 @@ next:
975 if (ptr >= end) 977 if (ptr >= end)
976 path1->slots[0]++; 978 path1->slots[0]++;
977 } 979 }
978 btrfs_release_path(rc->extent_root, path1); 980 btrfs_release_path(path1);
979 981
980 cur->checked = 1; 982 cur->checked = 1;
981 WARN_ON(exist); 983 WARN_ON(exist);
@@ -1158,6 +1160,7 @@ static int clone_backref_node(struct btrfs_trans_handle *trans,
1158 new_node->bytenr = dest->node->start; 1160 new_node->bytenr = dest->node->start;
1159 new_node->level = node->level; 1161 new_node->level = node->level;
1160 new_node->lowest = node->lowest; 1162 new_node->lowest = node->lowest;
1163 new_node->checked = 1;
1161 new_node->root = dest; 1164 new_node->root = dest;
1162 1165
1163 if (!node->lowest) { 1166 if (!node->lowest) {
@@ -1365,7 +1368,7 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
1365 int ret; 1368 int ret;
1366 1369
1367 if (!root->reloc_root) 1370 if (!root->reloc_root)
1368 return 0; 1371 goto out;
1369 1372
1370 reloc_root = root->reloc_root; 1373 reloc_root = root->reloc_root;
1371 root_item = &reloc_root->root_item; 1374 root_item = &reloc_root->root_item;
@@ -1387,6 +1390,8 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
1387 ret = btrfs_update_root(trans, root->fs_info->tree_root, 1390 ret = btrfs_update_root(trans, root->fs_info->tree_root,
1388 &reloc_root->root_key, root_item); 1391 &reloc_root->root_key, root_item);
1389 BUG_ON(ret); 1392 BUG_ON(ret);
1393
1394out:
1390 return 0; 1395 return 0;
1391} 1396}
1392 1397
@@ -1409,9 +1414,9 @@ again:
1409 prev = node; 1414 prev = node;
1410 entry = rb_entry(node, struct btrfs_inode, rb_node); 1415 entry = rb_entry(node, struct btrfs_inode, rb_node);
1411 1416
1412 if (objectid < entry->vfs_inode.i_ino) 1417 if (objectid < btrfs_ino(&entry->vfs_inode))
1413 node = node->rb_left; 1418 node = node->rb_left;
1414 else if (objectid > entry->vfs_inode.i_ino) 1419 else if (objectid > btrfs_ino(&entry->vfs_inode))
1415 node = node->rb_right; 1420 node = node->rb_right;
1416 else 1421 else
1417 break; 1422 break;
@@ -1419,7 +1424,7 @@ again:
1419 if (!node) { 1424 if (!node) {
1420 while (prev) { 1425 while (prev) {
1421 entry = rb_entry(prev, struct btrfs_inode, rb_node); 1426 entry = rb_entry(prev, struct btrfs_inode, rb_node);
1422 if (objectid <= entry->vfs_inode.i_ino) { 1427 if (objectid <= btrfs_ino(&entry->vfs_inode)) {
1423 node = prev; 1428 node = prev;
1424 break; 1429 break;
1425 } 1430 }
@@ -1434,7 +1439,7 @@ again:
1434 return inode; 1439 return inode;
1435 } 1440 }
1436 1441
1437 objectid = entry->vfs_inode.i_ino + 1; 1442 objectid = btrfs_ino(&entry->vfs_inode) + 1;
1438 if (cond_resched_lock(&root->inode_lock)) 1443 if (cond_resched_lock(&root->inode_lock))
1439 goto again; 1444 goto again;
1440 1445
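
The three hunks above convert find_next_inode()'s red-black tree walk from raw inode->i_ino comparisons to btrfs_ino(). The walk itself is a lower-bound search: return the cached inode with the smallest key greater than or equal to objectid. A self-contained sketch of that search on a toy binary search tree (editorial illustration; struct node, mk() and lower_bound() are invented names, not kernel APIs):

#include <stdio.h>
#include <stdlib.h>

struct node { unsigned long long key; struct node *l, *r; };

/* Smallest key >= x: remember the last node we stepped left from. */
static struct node *lower_bound(struct node *root, unsigned long long x)
{
        struct node *best = NULL;

        while (root) {
                if (x <= root->key) {
                        best = root;    /* candidate; a smaller match may sit left */
                        root = root->l;
                } else {
                        root = root->r;
                }
        }
        return best;
}

static struct node *mk(unsigned long long k, struct node *l, struct node *r)
{
        struct node *n = malloc(sizeof(*n));
        n->key = k;
        n->l = l;
        n->r = r;
        return n;
}

int main(void)
{
        struct node *t = mk(10, mk(5, NULL, NULL), mk(20, NULL, NULL));
        struct node *hit = lower_bound(t, 7);

        printf("%llu\n", hit ? hit->key : 0ULL);        /* prints 10 */
        return 0;
}

The kernel variant records the previous node during descent and then scans forward with rb_next(), but it computes the same lower bound.
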
@@ -1470,7 +1475,7 @@ static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
1470 return -ENOMEM; 1475 return -ENOMEM;
1471 1476
1472 bytenr -= BTRFS_I(reloc_inode)->index_cnt; 1477 bytenr -= BTRFS_I(reloc_inode)->index_cnt;
1473 ret = btrfs_lookup_file_extent(NULL, root, path, reloc_inode->i_ino, 1478 ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(reloc_inode),
1474 bytenr, 0); 1479 bytenr, 0);
1475 if (ret < 0) 1480 if (ret < 0)
1476 goto out; 1481 goto out;
@@ -1558,11 +1563,11 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
1558 if (first) { 1563 if (first) {
1559 inode = find_next_inode(root, key.objectid); 1564 inode = find_next_inode(root, key.objectid);
1560 first = 0; 1565 first = 0;
1561 } else if (inode && inode->i_ino < key.objectid) { 1566 } else if (inode && btrfs_ino(inode) < key.objectid) {
1562 btrfs_add_delayed_iput(inode); 1567 btrfs_add_delayed_iput(inode);
1563 inode = find_next_inode(root, key.objectid); 1568 inode = find_next_inode(root, key.objectid);
1564 } 1569 }
1565 if (inode && inode->i_ino == key.objectid) { 1570 if (inode && btrfs_ino(inode) == key.objectid) {
1566 end = key.offset + 1571 end = key.offset +
1567 btrfs_file_extent_num_bytes(leaf, fi); 1572 btrfs_file_extent_num_bytes(leaf, fi);
1568 WARN_ON(!IS_ALIGNED(key.offset, 1573 WARN_ON(!IS_ALIGNED(key.offset,
@@ -1724,6 +1729,7 @@ again:
1724 1729
1725 eb = read_tree_block(dest, old_bytenr, blocksize, 1730 eb = read_tree_block(dest, old_bytenr, blocksize,
1726 old_ptr_gen); 1731 old_ptr_gen);
1732 BUG_ON(!eb);
1727 btrfs_tree_lock(eb); 1733 btrfs_tree_lock(eb);
1728 if (cow) { 1734 if (cow) {
1729 ret = btrfs_cow_block(trans, dest, eb, parent, 1735 ret = btrfs_cow_block(trans, dest, eb, parent,
@@ -1748,7 +1754,7 @@ again:
1748 1754
1749 btrfs_node_key_to_cpu(path->nodes[level], &key, 1755 btrfs_node_key_to_cpu(path->nodes[level], &key,
1750 path->slots[level]); 1756 path->slots[level]);
1751 btrfs_release_path(src, path); 1757 btrfs_release_path(path);
1752 1758
1753 path->lowest_level = level; 1759 path->lowest_level = level;
1754 ret = btrfs_search_slot(trans, src, &key, path, 0, 1); 1760 ret = btrfs_search_slot(trans, src, &key, path, 0, 1);
@@ -1892,6 +1898,7 @@ static int invalidate_extent_cache(struct btrfs_root *root,
1892 struct inode *inode = NULL; 1898 struct inode *inode = NULL;
1893 u64 objectid; 1899 u64 objectid;
1894 u64 start, end; 1900 u64 start, end;
1901 u64 ino;
1895 1902
1896 objectid = min_key->objectid; 1903 objectid = min_key->objectid;
1897 while (1) { 1904 while (1) {
@@ -1904,17 +1911,18 @@ static int invalidate_extent_cache(struct btrfs_root *root,
1904 inode = find_next_inode(root, objectid); 1911 inode = find_next_inode(root, objectid);
1905 if (!inode) 1912 if (!inode)
1906 break; 1913 break;
1914 ino = btrfs_ino(inode);
1907 1915
1908 if (inode->i_ino > max_key->objectid) { 1916 if (ino > max_key->objectid) {
1909 iput(inode); 1917 iput(inode);
1910 break; 1918 break;
1911 } 1919 }
1912 1920
1913 objectid = inode->i_ino + 1; 1921 objectid = ino + 1;
1914 if (!S_ISREG(inode->i_mode)) 1922 if (!S_ISREG(inode->i_mode))
1915 continue; 1923 continue;
1916 1924
1917 if (unlikely(min_key->objectid == inode->i_ino)) { 1925 if (unlikely(min_key->objectid == ino)) {
1918 if (min_key->type > BTRFS_EXTENT_DATA_KEY) 1926 if (min_key->type > BTRFS_EXTENT_DATA_KEY)
1919 continue; 1927 continue;
1920 if (min_key->type < BTRFS_EXTENT_DATA_KEY) 1928 if (min_key->type < BTRFS_EXTENT_DATA_KEY)
@@ -1927,7 +1935,7 @@ static int invalidate_extent_cache(struct btrfs_root *root,
1927 start = 0; 1935 start = 0;
1928 } 1936 }
1929 1937
1930 if (unlikely(max_key->objectid == inode->i_ino)) { 1938 if (unlikely(max_key->objectid == ino)) {
1931 if (max_key->type < BTRFS_EXTENT_DATA_KEY) 1939 if (max_key->type < BTRFS_EXTENT_DATA_KEY)
1932 continue; 1940 continue;
1933 if (max_key->type > BTRFS_EXTENT_DATA_KEY) { 1941 if (max_key->type > BTRFS_EXTENT_DATA_KEY) {
@@ -1995,6 +2003,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
1995 path = btrfs_alloc_path(); 2003 path = btrfs_alloc_path();
1996 if (!path) 2004 if (!path)
1997 return -ENOMEM; 2005 return -ENOMEM;
2006 path->reada = 1;
1998 2007
1999 reloc_root = root->reloc_root; 2008 reloc_root = root->reloc_root;
2000 root_item = &reloc_root->root_item; 2009 root_item = &reloc_root->root_item;
@@ -2029,6 +2038,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
2029 2038
2030 while (1) { 2039 while (1) {
2031 trans = btrfs_start_transaction(root, 0); 2040 trans = btrfs_start_transaction(root, 0);
2041 BUG_ON(IS_ERR(trans));
2032 trans->block_rsv = rc->block_rsv; 2042 trans->block_rsv = rc->block_rsv;
2033 2043
2034 ret = btrfs_block_rsv_check(trans, root, rc->block_rsv, 2044 ret = btrfs_block_rsv_check(trans, root, rc->block_rsv,
@@ -2133,29 +2143,34 @@ int prepare_to_merge(struct reloc_control *rc, int err)
2133 LIST_HEAD(reloc_roots); 2143 LIST_HEAD(reloc_roots);
2134 u64 num_bytes = 0; 2144 u64 num_bytes = 0;
2135 int ret; 2145 int ret;
2136 int retries = 0;
2137 2146
2138 mutex_lock(&root->fs_info->trans_mutex); 2147 mutex_lock(&root->fs_info->reloc_mutex);
2139 rc->merging_rsv_size += root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2; 2148 rc->merging_rsv_size += root->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
2140 rc->merging_rsv_size += rc->nodes_relocated * 2; 2149 rc->merging_rsv_size += rc->nodes_relocated * 2;
2141 mutex_unlock(&root->fs_info->trans_mutex); 2150 mutex_unlock(&root->fs_info->reloc_mutex);
2151
2142again: 2152again:
2143 if (!err) { 2153 if (!err) {
2144 num_bytes = rc->merging_rsv_size; 2154 num_bytes = rc->merging_rsv_size;
2145 ret = btrfs_block_rsv_add(NULL, root, rc->block_rsv, 2155 ret = btrfs_block_rsv_add(NULL, root, rc->block_rsv,
2146 num_bytes, &retries); 2156 num_bytes);
2147 if (ret) 2157 if (ret)
2148 err = ret; 2158 err = ret;
2149 } 2159 }
2150 2160
2151 trans = btrfs_join_transaction(rc->extent_root, 1); 2161 trans = btrfs_join_transaction(rc->extent_root);
2162 if (IS_ERR(trans)) {
2163 if (!err)
2164 btrfs_block_rsv_release(rc->extent_root,
2165 rc->block_rsv, num_bytes);
2166 return PTR_ERR(trans);
2167 }
2152 2168
2153 if (!err) { 2169 if (!err) {
2154 if (num_bytes != rc->merging_rsv_size) { 2170 if (num_bytes != rc->merging_rsv_size) {
2155 btrfs_end_transaction(trans, rc->extent_root); 2171 btrfs_end_transaction(trans, rc->extent_root);
2156 btrfs_block_rsv_release(rc->extent_root, 2172 btrfs_block_rsv_release(rc->extent_root,
2157 rc->block_rsv, num_bytes); 2173 rc->block_rsv, num_bytes);
2158 retries = 0;
2159 goto again; 2174 goto again;
2160 } 2175 }
2161 } 2176 }
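
This hunk is typical of the patch's transaction-handling changes: btrfs_join_transaction() now returns an ERR_PTR-encoded error instead of being assumed to succeed, and the caller must release the block reservation before propagating the failure. For readers unfamiliar with the idiom, here is a minimal userspace re-implementation of ERR_PTR/PTR_ERR/IS_ERR (the real macros live in include/linux/err.h; join_transaction() below is a stand-in, not the btrfs function):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

#define MAX_ERRNO 4095

/* Encode a small negative errno in the pointer value itself. */
static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
        return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

struct trans { int id; };

static struct trans *join_transaction(int simulate_failure)
{
        if (simulate_failure)
                return ERR_PTR(-ENOMEM);
        struct trans *t = malloc(sizeof(*t));
        if (!t)
                return ERR_PTR(-ENOMEM);
        t->id = 1;
        return t;
}

int main(void)
{
        struct trans *t = join_transaction(1);

        if (IS_ERR(t)) {
                /* release anything reserved earlier, then propagate */
                fprintf(stderr, "join failed: %ld\n", PTR_ERR(t));
                return 1;
        }
        free(t);
        return 0;
}

Packing the errno into the pointer lets one return channel carry both results and failures, which is why these new call sites test IS_ERR() rather than comparing against NULL.
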
@@ -2202,9 +2217,16 @@ int merge_reloc_roots(struct reloc_control *rc)
2202 int ret; 2217 int ret;
2203again: 2218again:
2204 root = rc->extent_root; 2219 root = rc->extent_root;
2205 mutex_lock(&root->fs_info->trans_mutex); 2220
2221 /*
2222 * this serializes us with btrfs_record_root_in_transaction,
2223 * we have to make sure nobody is in the middle of
2224 * adding their roots to the list while we are
2225 * doing this splice
2226 */
2227 mutex_lock(&root->fs_info->reloc_mutex);
2206 list_splice_init(&rc->reloc_roots, &reloc_roots); 2228 list_splice_init(&rc->reloc_roots, &reloc_roots);
2207 mutex_unlock(&root->fs_info->trans_mutex); 2229 mutex_unlock(&root->fs_info->reloc_mutex);
2208 2230
2209 while (!list_empty(&reloc_roots)) { 2231 while (!list_empty(&reloc_roots)) {
2210 found = 1; 2232 found = 1;
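
The new reloc_mutex-protected splice above follows a common kernel pattern: detach the whole shared list in O(1) while holding the lock, then walk the private copy with the lock dropped so producers are never blocked for long. A userspace sketch of the same drain-and-process shape (pthreads and a toy singly-linked list; none of these names are btrfs code):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct node { int val; struct node *next; };

static struct node *shared_head;        /* filled by producers under the lock */
static pthread_mutex_t shared_lock = PTHREAD_MUTEX_INITIALIZER;

static void drain_and_process(void)
{
        struct node *local;

        /* O(1) detach under the lock - the analogue of list_splice_init() */
        pthread_mutex_lock(&shared_lock);
        local = shared_head;
        shared_head = NULL;
        pthread_mutex_unlock(&shared_lock);

        /* walk the private copy with the lock dropped */
        while (local) {
                struct node *n = local;
                local = local->next;
                printf("processing %d\n", n->val);
                free(n);
        }
}

int main(void)
{
        for (int i = 0; i < 3; i++) {
                struct node *n = malloc(sizeof(*n));
                n->val = i;
                n->next = shared_head;
                shared_head = n;
        }
        drain_and_process();
        return 0;
}
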
@@ -2340,7 +2362,7 @@ struct btrfs_root *select_one_root(struct btrfs_trans_handle *trans,
2340 root = next->root; 2362 root = next->root;
2341 BUG_ON(!root); 2363 BUG_ON(!root);
2342 2364
2343 /* no other choice for non-refernce counted tree */ 2365 /* no other choice for non-reference counted tree */
2344 if (!root->ref_cows) 2366 if (!root->ref_cows)
2345 return root; 2367 return root;
2346 2368
@@ -2405,15 +2427,13 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans,
2405 num_bytes = calcu_metadata_size(rc, node, 1) * 2; 2427 num_bytes = calcu_metadata_size(rc, node, 1) * 2;
2406 2428
2407 trans->block_rsv = rc->block_rsv; 2429 trans->block_rsv = rc->block_rsv;
2408 ret = btrfs_block_rsv_add(trans, root, rc->block_rsv, num_bytes, 2430 ret = btrfs_block_rsv_add(trans, root, rc->block_rsv, num_bytes);
2409 &rc->block_rsv_retries);
2410 if (ret) { 2431 if (ret) {
2411 if (ret == -EAGAIN) 2432 if (ret == -EAGAIN)
2412 rc->commit_transaction = 1; 2433 rc->commit_transaction = 1;
2413 return ret; 2434 return ret;
2414 } 2435 }
2415 2436
2416 rc->block_rsv_retries = 0;
2417 return 0; 2437 return 0;
2418} 2438}
2419 2439
@@ -2492,7 +2512,7 @@ static int do_relocation(struct btrfs_trans_handle *trans,
2492 path->locks[upper->level] = 0; 2512 path->locks[upper->level] = 0;
2493 2513
2494 slot = path->slots[upper->level]; 2514 slot = path->slots[upper->level];
2495 btrfs_release_path(NULL, path); 2515 btrfs_release_path(path);
2496 } else { 2516 } else {
2497 ret = btrfs_bin_search(upper->eb, key, upper->level, 2517 ret = btrfs_bin_search(upper->eb, key, upper->level,
2498 &slot); 2518 &slot);
@@ -2510,6 +2530,10 @@ static int do_relocation(struct btrfs_trans_handle *trans,
2510 blocksize = btrfs_level_size(root, node->level); 2530 blocksize = btrfs_level_size(root, node->level);
2511 generation = btrfs_node_ptr_generation(upper->eb, slot); 2531 generation = btrfs_node_ptr_generation(upper->eb, slot);
2512 eb = read_tree_block(root, bytenr, blocksize, generation); 2532 eb = read_tree_block(root, bytenr, blocksize, generation);
2533 if (!eb) {
2534 err = -EIO;
2535 goto next;
2536 }
2513 btrfs_tree_lock(eb); 2537 btrfs_tree_lock(eb);
2514 btrfs_set_lock_blocking(eb); 2538 btrfs_set_lock_blocking(eb);
2515 2539
@@ -2667,6 +2691,7 @@ static int get_tree_block_key(struct reloc_control *rc,
2667 BUG_ON(block->key_ready); 2691 BUG_ON(block->key_ready);
2668 eb = read_tree_block(rc->extent_root, block->bytenr, 2692 eb = read_tree_block(rc->extent_root, block->bytenr,
2669 block->key.objectid, block->key.offset); 2693 block->key.objectid, block->key.offset);
2694 BUG_ON(!eb);
2670 WARN_ON(btrfs_header_level(eb) != block->level); 2695 WARN_ON(btrfs_header_level(eb) != block->level);
2671 if (block->level == 0) 2696 if (block->level == 0)
2672 btrfs_item_key_to_cpu(eb, &block->key, 0); 2697 btrfs_item_key_to_cpu(eb, &block->key, 0);
@@ -2728,7 +2753,7 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,
2728 } else { 2753 } else {
2729 path->lowest_level = node->level; 2754 path->lowest_level = node->level;
2730 ret = btrfs_search_slot(trans, root, key, path, 0, 1); 2755 ret = btrfs_search_slot(trans, root, key, path, 0, 1);
2731 btrfs_release_path(root, path); 2756 btrfs_release_path(path);
2732 if (ret > 0) 2757 if (ret > 0)
2733 ret = 0; 2758 ret = 0;
2734 } 2759 }
@@ -2861,7 +2886,7 @@ int setup_extent_mapping(struct inode *inode, u64 start, u64 end,
2861 struct extent_map *em; 2886 struct extent_map *em;
2862 int ret = 0; 2887 int ret = 0;
2863 2888
2864 em = alloc_extent_map(GFP_NOFS); 2889 em = alloc_extent_map();
2865 if (!em) 2890 if (!em)
2866 return -ENOMEM; 2891 return -ENOMEM;
2867 2892
@@ -3099,6 +3124,8 @@ static int add_tree_block(struct reloc_control *rc,
3099 BUG_ON(item_size != sizeof(struct btrfs_extent_item_v0)); 3124 BUG_ON(item_size != sizeof(struct btrfs_extent_item_v0));
3100 ret = get_ref_objectid_v0(rc, path, extent_key, 3125 ret = get_ref_objectid_v0(rc, path, extent_key,
3101 &ref_owner, NULL); 3126 &ref_owner, NULL);
3127 if (ret < 0)
3128 return ret;
3102 BUG_ON(ref_owner >= BTRFS_MAX_LEVEL); 3129 BUG_ON(ref_owner >= BTRFS_MAX_LEVEL);
3103 level = (int)ref_owner; 3130 level = (int)ref_owner;
3104 /* FIXME: get real generation */ 3131 /* FIXME: get real generation */
@@ -3108,7 +3135,7 @@ static int add_tree_block(struct reloc_control *rc,
3108#endif 3135#endif
3109 } 3136 }
3110 3137
3111 btrfs_release_path(rc->extent_root, path); 3138 btrfs_release_path(path);
3112 3139
3113 BUG_ON(level == -1); 3140 BUG_ON(level == -1);
3114 3141
@@ -3191,6 +3218,55 @@ static int block_use_full_backref(struct reloc_control *rc,
3191 return ret; 3218 return ret;
3192} 3219}
3193 3220
3221static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
3222 struct inode *inode, u64 ino)
3223{
3224 struct btrfs_key key;
3225 struct btrfs_path *path;
3226 struct btrfs_root *root = fs_info->tree_root;
3227 struct btrfs_trans_handle *trans;
3228 unsigned long nr;
3229 int ret = 0;
3230
3231 if (inode)
3232 goto truncate;
3233
3234 key.objectid = ino;
3235 key.type = BTRFS_INODE_ITEM_KEY;
3236 key.offset = 0;
3237
3238 inode = btrfs_iget(fs_info->sb, &key, root, NULL);
3239 if (IS_ERR_OR_NULL(inode) || is_bad_inode(inode)) {
3240 if (inode && !IS_ERR(inode))
3241 iput(inode);
3242 return -ENOENT;
3243 }
3244
3245truncate:
3246 path = btrfs_alloc_path();
3247 if (!path) {
3248 ret = -ENOMEM;
3249 goto out;
3250 }
3251
3252 trans = btrfs_join_transaction(root);
3253 if (IS_ERR(trans)) {
3254 btrfs_free_path(path);
3255 ret = PTR_ERR(trans);
3256 goto out;
3257 }
3258
3259 ret = btrfs_truncate_free_space_cache(root, trans, path, inode);
3260
3261 btrfs_free_path(path);
3262 nr = trans->blocks_used;
3263 btrfs_end_transaction(trans, root);
3264 btrfs_btree_balance_dirty(root, nr);
3265out:
3266 iput(inode);
3267 return ret;
3268}
3269
3194/* 3270/*
3195 * helper to add tree blocks for backref of type BTRFS_EXTENT_DATA_REF_KEY 3271 * helper to add tree blocks for backref of type BTRFS_EXTENT_DATA_REF_KEY
3196 * this function scans the fs tree to find blocks that reference the data extent 3272 * this function scans the fs tree to find blocks that reference the data extent
@@ -3217,15 +3293,28 @@ static int find_data_references(struct reloc_control *rc,
3217 int counted; 3293 int counted;
3218 int ret; 3294 int ret;
3219 3295
3220 path = btrfs_alloc_path();
3221 if (!path)
3222 return -ENOMEM;
3223
3224 ref_root = btrfs_extent_data_ref_root(leaf, ref); 3296 ref_root = btrfs_extent_data_ref_root(leaf, ref);
3225 ref_objectid = btrfs_extent_data_ref_objectid(leaf, ref); 3297 ref_objectid = btrfs_extent_data_ref_objectid(leaf, ref);
3226 ref_offset = btrfs_extent_data_ref_offset(leaf, ref); 3298 ref_offset = btrfs_extent_data_ref_offset(leaf, ref);
3227 ref_count = btrfs_extent_data_ref_count(leaf, ref); 3299 ref_count = btrfs_extent_data_ref_count(leaf, ref);
3228 3300
3301 /*
3302 * This is an extent belonging to the free space cache, let's just delete
3303 * it and redo the search.
3304 */
3305 if (ref_root == BTRFS_ROOT_TREE_OBJECTID) {
3306 ret = delete_block_group_cache(rc->extent_root->fs_info,
3307 NULL, ref_objectid);
3308 if (ret != -ENOENT)
3309 return ret;
3310 ret = 0;
3311 }
3312
3313 path = btrfs_alloc_path();
3314 if (!path)
3315 return -ENOMEM;
3316 path->reada = 1;
3317
3229 root = read_fs_root(rc->extent_root->fs_info, ref_root); 3318 root = read_fs_root(rc->extent_root->fs_info, ref_root);
3230 if (IS_ERR(root)) { 3319 if (IS_ERR(root)) {
3231 err = PTR_ERR(root); 3320 err = PTR_ERR(root);
@@ -3433,7 +3522,7 @@ int add_data_references(struct reloc_control *rc,
3433 } 3522 }
3434 path->slots[0]++; 3523 path->slots[0]++;
3435 } 3524 }
3436 btrfs_release_path(rc->extent_root, path); 3525 btrfs_release_path(path);
3437 if (err) 3526 if (err)
3438 free_block_list(blocks); 3527 free_block_list(blocks);
3439 return err; 3528 return err;
@@ -3496,7 +3585,7 @@ next:
3496 EXTENT_DIRTY); 3585 EXTENT_DIRTY);
3497 3586
3498 if (ret == 0 && start <= key.objectid) { 3587 if (ret == 0 && start <= key.objectid) {
3499 btrfs_release_path(rc->extent_root, path); 3588 btrfs_release_path(path);
3500 rc->search_start = end + 1; 3589 rc->search_start = end + 1;
3501 } else { 3590 } else {
3502 rc->search_start = key.objectid + key.offset; 3591 rc->search_start = key.objectid + key.offset;
@@ -3504,24 +3593,26 @@ next:
3504 return 0; 3593 return 0;
3505 } 3594 }
3506 } 3595 }
3507 btrfs_release_path(rc->extent_root, path); 3596 btrfs_release_path(path);
3508 return ret; 3597 return ret;
3509} 3598}
3510 3599
3511static void set_reloc_control(struct reloc_control *rc) 3600static void set_reloc_control(struct reloc_control *rc)
3512{ 3601{
3513 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3602 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3514 mutex_lock(&fs_info->trans_mutex); 3603
3604 mutex_lock(&fs_info->reloc_mutex);
3515 fs_info->reloc_ctl = rc; 3605 fs_info->reloc_ctl = rc;
3516 mutex_unlock(&fs_info->trans_mutex); 3606 mutex_unlock(&fs_info->reloc_mutex);
3517} 3607}
3518 3608
3519static void unset_reloc_control(struct reloc_control *rc) 3609static void unset_reloc_control(struct reloc_control *rc)
3520{ 3610{
3521 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info; 3611 struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3522 mutex_lock(&fs_info->trans_mutex); 3612
3613 mutex_lock(&fs_info->reloc_mutex);
3523 fs_info->reloc_ctl = NULL; 3614 fs_info->reloc_ctl = NULL;
3524 mutex_unlock(&fs_info->trans_mutex); 3615 mutex_unlock(&fs_info->reloc_mutex);
3525} 3616}
3526 3617
3527static int check_extent_flags(u64 flags) 3618static int check_extent_flags(u64 flags)
@@ -3554,8 +3645,7 @@ int prepare_to_relocate(struct reloc_control *rc)
3554 * is no reservation in transaction handle. 3645 * is no reservation in transaction handle.
3555 */ 3646 */
3556 ret = btrfs_block_rsv_add(NULL, rc->extent_root, rc->block_rsv, 3647 ret = btrfs_block_rsv_add(NULL, rc->extent_root, rc->block_rsv,
3557 rc->extent_root->nodesize * 256, 3648 rc->extent_root->nodesize * 256);
3558 &rc->block_rsv_retries);
3559 if (ret) 3649 if (ret)
3560 return ret; 3650 return ret;
3561 3651
@@ -3567,12 +3657,12 @@ int prepare_to_relocate(struct reloc_control *rc)
3567 rc->extents_found = 0; 3657 rc->extents_found = 0;
3568 rc->nodes_relocated = 0; 3658 rc->nodes_relocated = 0;
3569 rc->merging_rsv_size = 0; 3659 rc->merging_rsv_size = 0;
3570 rc->block_rsv_retries = 0;
3571 3660
3572 rc->create_reloc_tree = 1; 3661 rc->create_reloc_tree = 1;
3573 set_reloc_control(rc); 3662 set_reloc_control(rc);
3574 3663
3575 trans = btrfs_join_transaction(rc->extent_root, 1); 3664 trans = btrfs_join_transaction(rc->extent_root);
3665 BUG_ON(IS_ERR(trans));
3576 btrfs_commit_transaction(trans, rc->extent_root); 3666 btrfs_commit_transaction(trans, rc->extent_root);
3577 return 0; 3667 return 0;
3578} 3668}
@@ -3589,10 +3679,12 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3589 u32 item_size; 3679 u32 item_size;
3590 int ret; 3680 int ret;
3591 int err = 0; 3681 int err = 0;
3682 int progress = 0;
3592 3683
3593 path = btrfs_alloc_path(); 3684 path = btrfs_alloc_path();
3594 if (!path) 3685 if (!path)
3595 return -ENOMEM; 3686 return -ENOMEM;
3687 path->reada = 1;
3596 3688
3597 ret = prepare_to_relocate(rc); 3689 ret = prepare_to_relocate(rc);
3598 if (ret) { 3690 if (ret) {
@@ -3601,8 +3693,10 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3601 } 3693 }
3602 3694
3603 while (1) { 3695 while (1) {
3696 progress++;
3604 trans = btrfs_start_transaction(rc->extent_root, 0); 3697 trans = btrfs_start_transaction(rc->extent_root, 0);
3605 3698 BUG_ON(IS_ERR(trans));
3699restart:
3606 if (update_backref_cache(trans, &rc->backref_cache)) { 3700 if (update_backref_cache(trans, &rc->backref_cache)) {
3607 btrfs_end_transaction(trans, rc->extent_root); 3701 btrfs_end_transaction(trans, rc->extent_root);
3608 continue; 3702 continue;
@@ -3639,7 +3733,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3639 flags = BTRFS_EXTENT_FLAG_DATA; 3733 flags = BTRFS_EXTENT_FLAG_DATA;
3640 3734
3641 if (path_change) { 3735 if (path_change) {
3642 btrfs_release_path(rc->extent_root, path); 3736 btrfs_release_path(path);
3643 3737
3644 path->search_commit_root = 1; 3738 path->search_commit_root = 1;
3645 path->skip_locking = 1; 3739 path->skip_locking = 1;
@@ -3662,7 +3756,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3662 (flags & BTRFS_EXTENT_FLAG_DATA)) { 3756 (flags & BTRFS_EXTENT_FLAG_DATA)) {
3663 ret = add_data_references(rc, &key, path, &blocks); 3757 ret = add_data_references(rc, &key, path, &blocks);
3664 } else { 3758 } else {
3665 btrfs_release_path(rc->extent_root, path); 3759 btrfs_release_path(path);
3666 ret = 0; 3760 ret = 0;
3667 } 3761 }
3668 if (ret < 0) { 3762 if (ret < 0) {
@@ -3715,8 +3809,17 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3715 } 3809 }
3716 } 3810 }
3717 } 3811 }
3812 if (trans && progress && err == -ENOSPC) {
3813 ret = btrfs_force_chunk_alloc(trans, rc->extent_root,
3814 rc->block_group->flags);
3815 if (ret == 0) {
3816 err = 0;
3817 progress = 0;
3818 goto restart;
3819 }
3820 }
3718 3821
3719 btrfs_release_path(rc->extent_root, path); 3822 btrfs_release_path(path);
3720 clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY, 3823 clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY,
3721 GFP_NOFS); 3824 GFP_NOFS);
3722 3825
@@ -3748,8 +3851,11 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3748 btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1); 3851 btrfs_block_rsv_release(rc->extent_root, rc->block_rsv, (u64)-1);
3749 3852
3750 /* get rid of pinned extents */ 3853 /* get rid of pinned extents */
3751 trans = btrfs_join_transaction(rc->extent_root, 1); 3854 trans = btrfs_join_transaction(rc->extent_root);
3752 btrfs_commit_transaction(trans, rc->extent_root); 3855 if (IS_ERR(trans))
3856 err = PTR_ERR(trans);
3857 else
3858 btrfs_commit_transaction(trans, rc->extent_root);
3753out_free: 3859out_free:
3754 btrfs_free_block_rsv(rc->extent_root, rc->block_rsv); 3860 btrfs_free_block_rsv(rc->extent_root, rc->block_rsv);
3755 btrfs_free_path(path); 3861 btrfs_free_path(path);
@@ -3781,7 +3887,7 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
3781 btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS | 3887 btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS |
3782 BTRFS_INODE_PREALLOC); 3888 BTRFS_INODE_PREALLOC);
3783 btrfs_mark_buffer_dirty(leaf); 3889 btrfs_mark_buffer_dirty(leaf);
3784 btrfs_release_path(root, path); 3890 btrfs_release_path(path);
3785out: 3891out:
3786 btrfs_free_path(path); 3892 btrfs_free_path(path);
3787 return ret; 3893 return ret;
@@ -3811,7 +3917,7 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
3811 if (IS_ERR(trans)) 3917 if (IS_ERR(trans))
3812 return ERR_CAST(trans); 3918 return ERR_CAST(trans);
3813 3919
3814 err = btrfs_find_free_objectid(trans, root, objectid, &objectid); 3920 err = btrfs_find_free_objectid(root, &objectid);
3815 if (err) 3921 if (err)
3816 goto out; 3922 goto out;
3817 3923
@@ -3849,7 +3955,7 @@ static struct reloc_control *alloc_reloc_control(void)
3849 INIT_LIST_HEAD(&rc->reloc_roots); 3955 INIT_LIST_HEAD(&rc->reloc_roots);
3850 backref_cache_init(&rc->backref_cache); 3956 backref_cache_init(&rc->backref_cache);
3851 mapping_tree_init(&rc->reloc_root_tree); 3957 mapping_tree_init(&rc->reloc_root_tree);
3852 extent_io_tree_init(&rc->processed_blocks, NULL, GFP_NOFS); 3958 extent_io_tree_init(&rc->processed_blocks, NULL);
3853 return rc; 3959 return rc;
3854} 3960}
3855 3961
@@ -3860,6 +3966,8 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3860{ 3966{
3861 struct btrfs_fs_info *fs_info = extent_root->fs_info; 3967 struct btrfs_fs_info *fs_info = extent_root->fs_info;
3862 struct reloc_control *rc; 3968 struct reloc_control *rc;
3969 struct inode *inode;
3970 struct btrfs_path *path;
3863 int ret; 3971 int ret;
3864 int rw = 0; 3972 int rw = 0;
3865 int err = 0; 3973 int err = 0;
@@ -3882,6 +3990,26 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
3882 rw = 1; 3990 rw = 1;
3883 } 3991 }
3884 3992
3993 path = btrfs_alloc_path();
3994 if (!path) {
3995 err = -ENOMEM;
3996 goto out;
3997 }
3998
3999 inode = lookup_free_space_inode(fs_info->tree_root, rc->block_group,
4000 path);
4001 btrfs_free_path(path);
4002
4003 if (!IS_ERR(inode))
4004 ret = delete_block_group_cache(fs_info, inode, 0);
4005 else
4006 ret = PTR_ERR(inode);
4007
4008 if (ret && ret != -ENOENT) {
4009 err = ret;
4010 goto out;
4011 }
4012
3885 rc->data_inode = create_reloc_inode(fs_info, rc->block_group); 4013 rc->data_inode = create_reloc_inode(fs_info, rc->block_group);
3886 if (IS_ERR(rc->data_inode)) { 4014 if (IS_ERR(rc->data_inode)) {
3887 err = PTR_ERR(rc->data_inode); 4015 err = PTR_ERR(rc->data_inode);
@@ -3945,6 +4073,7 @@ static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
3945 int ret; 4073 int ret;
3946 4074
3947 trans = btrfs_start_transaction(root->fs_info->tree_root, 0); 4075 trans = btrfs_start_transaction(root->fs_info->tree_root, 0);
4076 BUG_ON(IS_ERR(trans));
3948 4077
3949 memset(&root->root_item.drop_progress, 0, 4078 memset(&root->root_item.drop_progress, 0,
3950 sizeof(root->root_item.drop_progress)); 4079 sizeof(root->root_item.drop_progress));
@@ -3981,6 +4110,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
3981 path = btrfs_alloc_path(); 4110 path = btrfs_alloc_path();
3982 if (!path) 4111 if (!path)
3983 return -ENOMEM; 4112 return -ENOMEM;
4113 path->reada = -1;
3984 4114
3985 key.objectid = BTRFS_TREE_RELOC_OBJECTID; 4115 key.objectid = BTRFS_TREE_RELOC_OBJECTID;
3986 key.type = BTRFS_ROOT_ITEM_KEY; 4116 key.type = BTRFS_ROOT_ITEM_KEY;
@@ -4000,7 +4130,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
4000 } 4130 }
4001 leaf = path->nodes[0]; 4131 leaf = path->nodes[0];
4002 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 4132 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4003 btrfs_release_path(root->fs_info->tree_root, path); 4133 btrfs_release_path(path);
4004 4134
4005 if (key.objectid != BTRFS_TREE_RELOC_OBJECTID || 4135 if (key.objectid != BTRFS_TREE_RELOC_OBJECTID ||
4006 key.type != BTRFS_ROOT_ITEM_KEY) 4136 key.type != BTRFS_ROOT_ITEM_KEY)
@@ -4032,7 +4162,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
4032 4162
4033 key.offset--; 4163 key.offset--;
4034 } 4164 }
4035 btrfs_release_path(root->fs_info->tree_root, path); 4165 btrfs_release_path(path);
4036 4166
4037 if (list_empty(&reloc_roots)) 4167 if (list_empty(&reloc_roots))
4038 goto out; 4168 goto out;
@@ -4047,7 +4177,12 @@ int btrfs_recover_relocation(struct btrfs_root *root)
4047 4177
4048 set_reloc_control(rc); 4178 set_reloc_control(rc);
4049 4179
4050 trans = btrfs_join_transaction(rc->extent_root, 1); 4180 trans = btrfs_join_transaction(rc->extent_root);
4181 if (IS_ERR(trans)) {
4182 unset_reloc_control(rc);
4183 err = PTR_ERR(trans);
4184 goto out_free;
4185 }
4051 4186
4052 rc->merge_reloc_tree = 1; 4187 rc->merge_reloc_tree = 1;
4053 4188
@@ -4076,10 +4211,14 @@ int btrfs_recover_relocation(struct btrfs_root *root)
4076 4211
4077 unset_reloc_control(rc); 4212 unset_reloc_control(rc);
4078 4213
4079 trans = btrfs_join_transaction(rc->extent_root, 1); 4214 trans = btrfs_join_transaction(rc->extent_root);
4080 btrfs_commit_transaction(trans, rc->extent_root); 4215 if (IS_ERR(trans))
4081out: 4216 err = PTR_ERR(trans);
4217 else
4218 btrfs_commit_transaction(trans, rc->extent_root);
4219out_free:
4082 kfree(rc); 4220 kfree(rc);
4221out:
4083 while (!list_empty(&reloc_roots)) { 4222 while (!list_empty(&reloc_roots)) {
4084 reloc_root = list_entry(reloc_roots.next, 4223 reloc_root = list_entry(reloc_roots.next,
4085 struct btrfs_root, root_list); 4224 struct btrfs_root, root_list);
@@ -4097,7 +4236,7 @@ out:
4097 if (IS_ERR(fs_root)) 4236 if (IS_ERR(fs_root))
4098 err = PTR_ERR(fs_root); 4237 err = PTR_ERR(fs_root);
4099 else 4238 else
4100 btrfs_orphan_cleanup(fs_root); 4239 err = btrfs_orphan_cleanup(fs_root);
4101 } 4240 }
4102 return err; 4241 return err;
4103} 4242}
@@ -4124,7 +4263,7 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
4124 4263
4125 disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt; 4264 disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt;
4126 ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr, 4265 ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr,
4127 disk_bytenr + len - 1, &list); 4266 disk_bytenr + len - 1, &list, 0);
4128 4267
4129 while (!list_empty(&list)) { 4268 while (!list_empty(&list)) {
4130 sums = list_entry(list.next, struct btrfs_ordered_sum, list); 4269 sums = list_entry(list.next, struct btrfs_ordered_sum, list);
@@ -4143,7 +4282,7 @@ int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len)
4143 btrfs_add_ordered_sum(inode, ordered, sums); 4282 btrfs_add_ordered_sum(inode, ordered, sums);
4144 } 4283 }
4145 btrfs_put_ordered_extent(ordered); 4284 btrfs_put_ordered_extent(ordered);
4146 return 0; 4285 return ret;
4147} 4286}
4148 4287
4149void btrfs_reloc_cow_block(struct btrfs_trans_handle *trans, 4288void btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 2d958be761c8..ebe45443de06 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -22,53 +22,6 @@
22#include "print-tree.h" 22#include "print-tree.h"
23 23
24/* 24/*
25 * search forward for a root, starting with objectid 'search_start'
26 * if a root key is found, the objectid we find is filled into 'found_objectid'
27 * and 0 is returned. < 0 is returned on error, 1 if there is nothing
28 * left in the tree.
29 */
30int btrfs_search_root(struct btrfs_root *root, u64 search_start,
31 u64 *found_objectid)
32{
33 struct btrfs_path *path;
34 struct btrfs_key search_key;
35 int ret;
36
37 root = root->fs_info->tree_root;
38 search_key.objectid = search_start;
39 search_key.type = (u8)-1;
40 search_key.offset = (u64)-1;
41
42 path = btrfs_alloc_path();
43 BUG_ON(!path);
44again:
45 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
46 if (ret < 0)
47 goto out;
48 if (ret == 0) {
49 ret = 1;
50 goto out;
51 }
52 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
53 ret = btrfs_next_leaf(root, path);
54 if (ret)
55 goto out;
56 }
57 btrfs_item_key_to_cpu(path->nodes[0], &search_key, path->slots[0]);
58 if (search_key.type != BTRFS_ROOT_ITEM_KEY) {
59 search_key.offset++;
60 btrfs_release_path(root, path);
61 goto again;
62 }
63 ret = 0;
64 *found_objectid = search_key.objectid;
65
66out:
67 btrfs_free_path(path);
68 return ret;
69}
70
71/*
72 * lookup the root with the highest offset for a given objectid. The key we do 25 * lookup the root with the highest offset for a given objectid. The key we do
73 * find is copied into 'key'. If we find something return 0, otherwise 1, < 0 26 * find is copied into 'key'. If we find something return 0, otherwise 1, < 0
74 * on error. 27 * on error.
@@ -88,7 +41,8 @@ int btrfs_find_last_root(struct btrfs_root *root, u64 objectid,
88 search_key.offset = (u64)-1; 41 search_key.offset = (u64)-1;
89 42
90 path = btrfs_alloc_path(); 43 path = btrfs_alloc_path();
91 BUG_ON(!path); 44 if (!path)
45 return -ENOMEM;
92 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); 46 ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
93 if (ret < 0) 47 if (ret < 0)
94 goto out; 48 goto out;
@@ -181,7 +135,6 @@ int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root
181int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid) 135int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid)
182{ 136{
183 struct btrfs_root *dead_root; 137 struct btrfs_root *dead_root;
184 struct btrfs_item *item;
185 struct btrfs_root_item *ri; 138 struct btrfs_root_item *ri;
186 struct btrfs_key key; 139 struct btrfs_key key;
187 struct btrfs_key found_key; 140 struct btrfs_key found_key;
@@ -214,7 +167,6 @@ again:
214 nritems = btrfs_header_nritems(leaf); 167 nritems = btrfs_header_nritems(leaf);
215 slot = path->slots[0]; 168 slot = path->slots[0];
216 } 169 }
217 item = btrfs_item_nr(leaf, slot);
218 btrfs_item_key_to_cpu(leaf, &key, slot); 170 btrfs_item_key_to_cpu(leaf, &key, slot);
219 if (btrfs_key_type(&key) != BTRFS_ROOT_ITEM_KEY) 171 if (btrfs_key_type(&key) != BTRFS_ROOT_ITEM_KEY)
220 goto next; 172 goto next;
@@ -231,7 +183,7 @@ again:
231 183
232 memcpy(&found_key, &key, sizeof(key)); 184 memcpy(&found_key, &key, sizeof(key));
233 key.offset++; 185 key.offset++;
234 btrfs_release_path(root, path); 186 btrfs_release_path(path);
235 dead_root = 187 dead_root =
236 btrfs_read_fs_root_no_radix(root->fs_info->tree_root, 188 btrfs_read_fs_root_no_radix(root->fs_info->tree_root,
237 &found_key); 189 &found_key);
@@ -293,7 +245,7 @@ int btrfs_find_orphan_roots(struct btrfs_root *tree_root)
293 } 245 }
294 246
295 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 247 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
296 btrfs_release_path(tree_root, path); 248 btrfs_release_path(path);
297 249
298 if (key.objectid != BTRFS_ORPHAN_OBJECTID || 250 if (key.objectid != BTRFS_ORPHAN_OBJECTID ||
299 key.type != BTRFS_ORPHAN_ITEM_KEY) 251 key.type != BTRFS_ORPHAN_ITEM_KEY)
@@ -334,7 +286,8 @@ int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
334 struct extent_buffer *leaf; 286 struct extent_buffer *leaf;
335 287
336 path = btrfs_alloc_path(); 288 path = btrfs_alloc_path();
337 BUG_ON(!path); 289 if (!path)
290 return -ENOMEM;
338 ret = btrfs_search_slot(trans, root, key, path, -1, 1); 291 ret = btrfs_search_slot(trans, root, key, path, -1, 1);
339 if (ret < 0) 292 if (ret < 0)
340 goto out; 293 goto out;
@@ -385,18 +338,22 @@ again:
385 *sequence = btrfs_root_ref_sequence(leaf, ref); 338 *sequence = btrfs_root_ref_sequence(leaf, ref);
386 339
387 ret = btrfs_del_item(trans, tree_root, path); 340 ret = btrfs_del_item(trans, tree_root, path);
388 BUG_ON(ret); 341 if (ret) {
342 err = ret;
343 goto out;
344 }
389 } else 345 } else
390 err = -ENOENT; 346 err = -ENOENT;
391 347
392 if (key.type == BTRFS_ROOT_BACKREF_KEY) { 348 if (key.type == BTRFS_ROOT_BACKREF_KEY) {
393 btrfs_release_path(tree_root, path); 349 btrfs_release_path(path);
394 key.objectid = ref_id; 350 key.objectid = ref_id;
395 key.type = BTRFS_ROOT_REF_KEY; 351 key.type = BTRFS_ROOT_REF_KEY;
396 key.offset = root_id; 352 key.offset = root_id;
397 goto again; 353 goto again;
398 } 354 }
399 355
356out:
400 btrfs_free_path(path); 357 btrfs_free_path(path);
401 return err; 358 return err;
402} 359}
@@ -463,7 +420,7 @@ again:
463 btrfs_mark_buffer_dirty(leaf); 420 btrfs_mark_buffer_dirty(leaf);
464 421
465 if (key.type == BTRFS_ROOT_BACKREF_KEY) { 422 if (key.type == BTRFS_ROOT_BACKREF_KEY) {
466 btrfs_release_path(tree_root, path); 423 btrfs_release_path(path);
467 key.objectid = ref_id; 424 key.objectid = ref_id;
468 key.type = BTRFS_ROOT_REF_KEY; 425 key.type = BTRFS_ROOT_REF_KEY;
469 key.offset = root_id; 426 key.offset = root_id;
@@ -473,3 +430,21 @@ again:
473 btrfs_free_path(path); 430 btrfs_free_path(path);
474 return 0; 431 return 0;
475} 432}
433
434/*
435 * Old btrfs versions forget to init root_item->flags and root_item->byte_limit
436 * for subvolumes. To work around this problem, we steal a bit from
437 * root_item->inode_item->flags, and use it to indicate if those fields
438 * have been properly initialized.
439 */
440void btrfs_check_and_init_root_item(struct btrfs_root_item *root_item)
441{
442 u64 inode_flags = le64_to_cpu(root_item->inode.flags);
443
444 if (!(inode_flags & BTRFS_INODE_ROOT_ITEM_INIT)) {
445 inode_flags |= BTRFS_INODE_ROOT_ITEM_INIT;
446 root_item->inode.flags = cpu_to_le64(inode_flags);
447 root_item->flags = 0;
448 root_item->byte_limit = 0;
449 }
450}
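
btrfs_check_and_init_root_item() repairs the stale fields lazily by dedicating one flag bit as an "already initialized" marker. The pattern in isolation (host-endian fields and an illustrative bit value; the real code works on the little-endian on-disk inode item and uses BTRFS_INODE_ROOT_ITEM_INIT):

#include <stdint.h>
#include <stdio.h>

#define ITEM_INIT_FLAG (1ULL << 31)     /* illustrative bit, not the real value */

struct item {
        uint64_t inode_flags;   /* written by both old and new code */
        uint64_t flags;         /* may hold garbage from old writers */
        uint64_t byte_limit;    /* may hold garbage from old writers */
};

static void check_and_init(struct item *it)
{
        /* marker clear: the other fields were never initialized */
        if (!(it->inode_flags & ITEM_INIT_FLAG)) {
                it->inode_flags |= ITEM_INIT_FLAG;
                it->flags = 0;
                it->byte_limit = 0;
        }
}

int main(void)
{
        struct item it = { .inode_flags = 0, .flags = 0xdead, .byte_limit = 7 };

        check_and_init(&it);    /* repairs once; later calls are no-ops */
        printf("flags=%llu byte_limit=%llu\n",
               (unsigned long long)it.flags, (unsigned long long)it.byte_limit);
        return 0;
}
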
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
new file mode 100644
index 000000000000..a8d03d5efb5d
--- /dev/null
+++ b/fs/btrfs/scrub.c
@@ -0,0 +1,1395 @@
1/*
2 * Copyright (C) 2011 STRATO. All rights reserved.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public
6 * License v2 as published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * General Public License for more details.
12 *
13 * You should have received a copy of the GNU General Public
14 * License along with this program; if not, write to the
15 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16 * Boston, MA 021110-1307, USA.
17 */
18
19#include <linux/blkdev.h>
20#include "ctree.h"
21#include "volumes.h"
22#include "disk-io.h"
23#include "ordered-data.h"
24
25/*
26 * This is only the first step towards a full-featured scrub. It reads all
27 * extents and super blocks and verifies the checksums. In case a bad checksum
28 * is found or the extent cannot be read, good data will be written back if
29 * any can be found.
30 *
31 * Future enhancements:
32 * - To enhance performance, better read-ahead strategies for the
33 * extent-tree can be employed.
34 * - In case an unrepairable extent is encountered, track which files are
35 * affected and report them
36 * - In case of a read error on files with nodatasum, map the file and read
37 * the extent to trigger a writeback of the good copy
38 * - track and record media errors, throw out bad devices
39 * - add a mode to also read unallocated space
40 * - make the prefetch cancellable
41 */
42
43struct scrub_bio;
44struct scrub_page;
45struct scrub_dev;
46static void scrub_bio_end_io(struct bio *bio, int err);
47static void scrub_checksum(struct btrfs_work *work);
48static int scrub_checksum_data(struct scrub_dev *sdev,
49 struct scrub_page *spag, void *buffer);
50static int scrub_checksum_tree_block(struct scrub_dev *sdev,
51 struct scrub_page *spag, u64 logical,
52 void *buffer);
53static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer);
54static int scrub_fixup_check(struct scrub_bio *sbio, int ix);
55static void scrub_fixup_end_io(struct bio *bio, int err);
56static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
57 struct page *page);
58static void scrub_fixup(struct scrub_bio *sbio, int ix);
59
60#define SCRUB_PAGES_PER_BIO 16 /* 64k per bio */
61#define SCRUB_BIOS_PER_DEV 16 /* 1 MB per device in flight */
62
63struct scrub_page {
64 u64 flags; /* extent flags */
65 u64 generation;
66 u64 mirror_num;
67 int have_csum;
68 u8 csum[BTRFS_CSUM_SIZE];
69};
70
71struct scrub_bio {
72 int index;
73 struct scrub_dev *sdev;
74 struct bio *bio;
75 int err;
76 u64 logical;
77 u64 physical;
78 struct scrub_page spag[SCRUB_PAGES_PER_BIO];
79 u64 count;
80 int next_free;
81 struct btrfs_work work;
82};
83
84struct scrub_dev {
85 struct scrub_bio *bios[SCRUB_BIOS_PER_DEV];
86 struct btrfs_device *dev;
87 int first_free;
88 int curr;
89 atomic_t in_flight;
90 spinlock_t list_lock;
91 wait_queue_head_t list_wait;
92 u16 csum_size;
93 struct list_head csum_list;
94 atomic_t cancel_req;
95 int readonly;
96 /*
97 * statistics
98 */
99 struct btrfs_scrub_progress stat;
100 spinlock_t stat_lock;
101};
102
103static void scrub_free_csums(struct scrub_dev *sdev)
104{
105 while (!list_empty(&sdev->csum_list)) {
106 struct btrfs_ordered_sum *sum;
107 sum = list_first_entry(&sdev->csum_list,
108 struct btrfs_ordered_sum, list);
109 list_del(&sum->list);
110 kfree(sum);
111 }
112}
113
114static void scrub_free_bio(struct bio *bio)
115{
116 int i;
117 struct page *last_page = NULL;
118
119 if (!bio)
120 return;
121
122 for (i = 0; i < bio->bi_vcnt; ++i) {
123 if (bio->bi_io_vec[i].bv_page == last_page)
124 continue;
125 last_page = bio->bi_io_vec[i].bv_page;
126 __free_page(last_page);
127 }
128 bio_put(bio);
129}
130
131static noinline_for_stack void scrub_free_dev(struct scrub_dev *sdev)
132{
133 int i;
134
135 if (!sdev)
136 return;
137
138 for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) {
139 struct scrub_bio *sbio = sdev->bios[i];
140
141 if (!sbio)
142 break;
143
144 scrub_free_bio(sbio->bio);
145 kfree(sbio);
146 }
147
148 scrub_free_csums(sdev);
149 kfree(sdev);
150}
151
152static noinline_for_stack
153struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev)
154{
155 struct scrub_dev *sdev;
156 int i;
157 struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
158
159 sdev = kzalloc(sizeof(*sdev), GFP_NOFS);
160 if (!sdev)
161 goto nomem;
162 sdev->dev = dev;
163 for (i = 0; i < SCRUB_BIOS_PER_DEV; ++i) {
164 struct scrub_bio *sbio;
165
166 sbio = kzalloc(sizeof(*sbio), GFP_NOFS);
167 if (!sbio)
168 goto nomem;
169 sdev->bios[i] = sbio;
170
171 sbio->index = i;
172 sbio->sdev = sdev;
173 sbio->count = 0;
174 sbio->work.func = scrub_checksum;
175
176 if (i != SCRUB_BIOS_PER_DEV-1)
177 sdev->bios[i]->next_free = i + 1;
178 else
179 sdev->bios[i]->next_free = -1;
180 }
181 sdev->first_free = 0;
182 sdev->curr = -1;
183 atomic_set(&sdev->in_flight, 0);
184 atomic_set(&sdev->cancel_req, 0);
185 sdev->csum_size = btrfs_super_csum_size(&fs_info->super_copy);
186 INIT_LIST_HEAD(&sdev->csum_list);
187
188 spin_lock_init(&sdev->list_lock);
189 spin_lock_init(&sdev->stat_lock);
190 init_waitqueue_head(&sdev->list_wait);
191 return sdev;
192
193nomem:
194 scrub_free_dev(sdev);
195 return ERR_PTR(-ENOMEM);
196}
197
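
scrub_setup_dev() threads a free list through the fixed bios[] array with integer indices (first_free plus a per-slot next_free), and scrub_checksum() later pushes finished bios back the same way under list_lock. The index-threaded free list on its own, as a runnable sketch (an array of ints stands in for the scrub_bio slots; all names are illustrative):

#include <stdio.h>

#define N 4

static int next_free[N];        /* each slot names the next free slot */
static int first_free;          /* head of the free list, -1 when empty */

static void freelist_init(void)
{
        for (int i = 0; i < N; i++)
                next_free[i] = (i != N - 1) ? i + 1 : -1;
        first_free = 0;
}

static int freelist_get(void)   /* pop a free slot, -1 if none */
{
        int i = first_free;

        if (i != -1)
                first_free = next_free[i];
        return i;
}

static void freelist_put(int i) /* push a finished slot back */
{
        next_free[i] = first_free;
        first_free = i;
}

int main(void)
{
        freelist_init();
        int a = freelist_get(), b = freelist_get();
        printf("got %d and %d\n", a, b);
        freelist_put(a);
        printf("next: %d\n", freelist_get());   /* reuses slot a */
        return 0;
}

Using indices instead of pointers keeps the list inside the preallocated array, so no allocation can fail on the put path.
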
198/*
199 * scrub_recheck_error gets called when either verification of the page
200 * failed or the bio failed to read, e.g. with EIO. In the latter case,
201 * recheck_error gets called for every page in the bio, even though only
202 * one may be bad
203 */
204static void scrub_recheck_error(struct scrub_bio *sbio, int ix)
205{
206 if (sbio->err) {
207 if (scrub_fixup_io(READ, sbio->sdev->dev->bdev,
208 (sbio->physical + ix * PAGE_SIZE) >> 9,
209 sbio->bio->bi_io_vec[ix].bv_page) == 0) {
210 if (scrub_fixup_check(sbio, ix) == 0)
211 return;
212 }
213 }
214
215 scrub_fixup(sbio, ix);
216}
217
218static int scrub_fixup_check(struct scrub_bio *sbio, int ix)
219{
220 int ret = 1;
221 struct page *page;
222 void *buffer;
223 u64 flags = sbio->spag[ix].flags;
224
225 page = sbio->bio->bi_io_vec[ix].bv_page;
226 buffer = kmap_atomic(page, KM_USER0);
227 if (flags & BTRFS_EXTENT_FLAG_DATA) {
228 ret = scrub_checksum_data(sbio->sdev,
229 sbio->spag + ix, buffer);
230 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
231 ret = scrub_checksum_tree_block(sbio->sdev,
232 sbio->spag + ix,
233 sbio->logical + ix * PAGE_SIZE,
234 buffer);
235 } else {
236 WARN_ON(1);
237 }
238 kunmap_atomic(buffer, KM_USER0);
239
240 return ret;
241}
242
243static void scrub_fixup_end_io(struct bio *bio, int err)
244{
245 complete((struct completion *)bio->bi_private);
246}
247
248static void scrub_fixup(struct scrub_bio *sbio, int ix)
249{
250 struct scrub_dev *sdev = sbio->sdev;
251 struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
252 struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
253 struct btrfs_multi_bio *multi = NULL;
254 u64 logical = sbio->logical + ix * PAGE_SIZE;
255 u64 length;
256 int i;
257 int ret;
258 DECLARE_COMPLETION_ONSTACK(complete);
259
260 if ((sbio->spag[ix].flags & BTRFS_EXTENT_FLAG_DATA) &&
261 (sbio->spag[ix].have_csum == 0)) {
262 /*
263 * nodatasum, don't try to fix anything
264 * FIXME: we can do better, open the inode and trigger a
265 * writeback
266 */
267 goto uncorrectable;
268 }
269
270 length = PAGE_SIZE;
271 ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length,
272 &multi, 0);
273 if (ret || !multi || length < PAGE_SIZE) {
274 printk(KERN_ERR
275 "scrub_fixup: btrfs_map_block failed us for %llu\n",
276 (unsigned long long)logical);
277 WARN_ON(1);
278 return;
279 }
280
281 if (multi->num_stripes == 1)
282 /* there aren't any replicas */
283 goto uncorrectable;
284
285 /*
286 * first find a good copy
287 */
288 for (i = 0; i < multi->num_stripes; ++i) {
289 if (i == sbio->spag[ix].mirror_num)
290 continue;
291
292 if (scrub_fixup_io(READ, multi->stripes[i].dev->bdev,
293 multi->stripes[i].physical >> 9,
294 sbio->bio->bi_io_vec[ix].bv_page)) {
295 /* I/O-error, this is not a good copy */
296 continue;
297 }
298
299 if (scrub_fixup_check(sbio, ix) == 0)
300 break;
301 }
302 if (i == multi->num_stripes)
303 goto uncorrectable;
304
305 if (!sdev->readonly) {
306 /*
307 * bi_io_vec[ix].bv_page now contains good data, write it back
308 */
309 if (scrub_fixup_io(WRITE, sdev->dev->bdev,
310 (sbio->physical + ix * PAGE_SIZE) >> 9,
311 sbio->bio->bi_io_vec[ix].bv_page)) {
312 /* I/O-error, writeback failed, give up */
313 goto uncorrectable;
314 }
315 }
316
317 kfree(multi);
318 spin_lock(&sdev->stat_lock);
319 ++sdev->stat.corrected_errors;
320 spin_unlock(&sdev->stat_lock);
321
322 if (printk_ratelimit())
323 printk(KERN_ERR "btrfs: fixed up at %llu\n",
324 (unsigned long long)logical);
325 return;
326
327uncorrectable:
328 kfree(multi);
329 spin_lock(&sdev->stat_lock);
330 ++sdev->stat.uncorrectable_errors;
331 spin_unlock(&sdev->stat_lock);
332
333 if (printk_ratelimit())
334 printk(KERN_ERR "btrfs: unable to fixup at %llu\n",
335 (unsigned long long)logical);
336}
337
338static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
339 struct page *page)
340{
341 struct bio *bio = NULL;
342 int ret;
343 DECLARE_COMPLETION_ONSTACK(complete);
344
345 bio = bio_alloc(GFP_NOFS, 1);
346 bio->bi_bdev = bdev;
347 bio->bi_sector = sector;
348 bio_add_page(bio, page, PAGE_SIZE, 0);
349 bio->bi_end_io = scrub_fixup_end_io;
350 bio->bi_private = &complete;
351 submit_bio(rw, bio);
352
353 /* this will also unplug the queue */
354 wait_for_completion(&complete);
355
356 ret = !test_bit(BIO_UPTODATE, &bio->bi_flags);
357 bio_put(bio);
358 return ret;
359}
360
361static void scrub_bio_end_io(struct bio *bio, int err)
362{
363 struct scrub_bio *sbio = bio->bi_private;
364 struct scrub_dev *sdev = sbio->sdev;
365 struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
366
367 sbio->err = err;
368 sbio->bio = bio;
369
370 btrfs_queue_worker(&fs_info->scrub_workers, &sbio->work);
371}
372
373static void scrub_checksum(struct btrfs_work *work)
374{
375 struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
376 struct scrub_dev *sdev = sbio->sdev;
377 struct page *page;
378 void *buffer;
379 int i;
380 u64 flags;
381 u64 logical;
382 int ret;
383
384 if (sbio->err) {
385 for (i = 0; i < sbio->count; ++i)
386 scrub_recheck_error(sbio, i);
387
388 sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1);
389 sbio->bio->bi_flags |= 1 << BIO_UPTODATE;
390 sbio->bio->bi_phys_segments = 0;
391 sbio->bio->bi_idx = 0;
392
393 for (i = 0; i < sbio->count; i++) {
394 struct bio_vec *bi;
395 bi = &sbio->bio->bi_io_vec[i];
396 bi->bv_offset = 0;
397 bi->bv_len = PAGE_SIZE;
398 }
399
400 spin_lock(&sdev->stat_lock);
401 ++sdev->stat.read_errors;
402 spin_unlock(&sdev->stat_lock);
403 goto out;
404 }
405 for (i = 0; i < sbio->count; ++i) {
406 page = sbio->bio->bi_io_vec[i].bv_page;
407 buffer = kmap_atomic(page, KM_USER0);
408 flags = sbio->spag[i].flags;
409 logical = sbio->logical + i * PAGE_SIZE;
410 ret = 0;
411 if (flags & BTRFS_EXTENT_FLAG_DATA) {
412 ret = scrub_checksum_data(sdev, sbio->spag + i, buffer);
413 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
414 ret = scrub_checksum_tree_block(sdev, sbio->spag + i,
415 logical, buffer);
416 } else if (flags & BTRFS_EXTENT_FLAG_SUPER) {
417 BUG_ON(i);
418 (void)scrub_checksum_super(sbio, buffer);
419 } else {
420 WARN_ON(1);
421 }
422 kunmap_atomic(buffer, KM_USER0);
423 if (ret)
424 scrub_recheck_error(sbio, i);
425 }
426
427out:
428 scrub_free_bio(sbio->bio);
429 sbio->bio = NULL;
430 spin_lock(&sdev->list_lock);
431 sbio->next_free = sdev->first_free;
432 sdev->first_free = sbio->index;
433 spin_unlock(&sdev->list_lock);
434 atomic_dec(&sdev->in_flight);
435 wake_up(&sdev->list_wait);
436}
437
438static int scrub_checksum_data(struct scrub_dev *sdev,
439 struct scrub_page *spag, void *buffer)
440{
441 u8 csum[BTRFS_CSUM_SIZE];
442 u32 crc = ~(u32)0;
443 int fail = 0;
444 struct btrfs_root *root = sdev->dev->dev_root;
445
446 if (!spag->have_csum)
447 return 0;
448
449 crc = btrfs_csum_data(root, buffer, crc, PAGE_SIZE);
450 btrfs_csum_final(crc, csum);
451 if (memcmp(csum, spag->csum, sdev->csum_size))
452 fail = 1;
453
454 spin_lock(&sdev->stat_lock);
455 ++sdev->stat.data_extents_scrubbed;
456 sdev->stat.data_bytes_scrubbed += PAGE_SIZE;
457 if (fail)
458 ++sdev->stat.csum_errors;
459 spin_unlock(&sdev->stat_lock);
460
461 return fail;
462}
463
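
scrub_checksum_data() recomputes the checksum over the kmapped page and memcmp()s it against the csum carried in the scrub_page. btrfs data checksums are CRC32C (Castagnoli); a dependency-free userspace sketch of the same verify step (bitwise CRC32C, far slower than the kernel's table- or hardware-assisted version; the stored value below is the standard CRC-32C test vector for "123456789"):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Bit-at-a-time reflected CRC32C, polynomial 0x82F63B78. */
static uint32_t crc32c(uint32_t crc, const void *buf, size_t len)
{
        const uint8_t *p = buf;

        crc = ~crc;
        while (len--) {
                crc ^= *p++;
                for (int k = 0; k < 8; k++)
                        crc = (crc >> 1) ^ (0x82F63B78U & -(crc & 1U));
        }
        return ~crc;
}

int main(void)
{
        const char data[] = "123456789";
        uint32_t stored = 0xE3069283;   /* known CRC-32C of "123456789" */
        uint32_t got = crc32c(0, data, strlen(data));

        printf("%s\n", got == stored ? "csum ok" : "csum mismatch");
        return 0;
}

The kernel plumbing differs slightly (btrfs_csum_data() seeds with ~0 and btrfs_csum_final() applies the final inversion), but the comparison against the stored csum is the same.
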
464static int scrub_checksum_tree_block(struct scrub_dev *sdev,
465 struct scrub_page *spag, u64 logical,
466 void *buffer)
467{
468 struct btrfs_header *h;
469 struct btrfs_root *root = sdev->dev->dev_root;
470 struct btrfs_fs_info *fs_info = root->fs_info;
471 u8 csum[BTRFS_CSUM_SIZE];
472 u32 crc = ~(u32)0;
473 int fail = 0;
474 int crc_fail = 0;
475
476 /*
477 * we don't use the getter functions here, as we
478 * a) don't have an extent buffer and
479 * b) the page is already kmapped
480 */
481 h = (struct btrfs_header *)buffer;
482
483 if (logical != le64_to_cpu(h->bytenr))
484 ++fail;
485
486 if (spag->generation != le64_to_cpu(h->generation))
487 ++fail;
488
489 if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
490 ++fail;
491
492 if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
493 BTRFS_UUID_SIZE))
494 ++fail;
495
496 crc = btrfs_csum_data(root, buffer + BTRFS_CSUM_SIZE, crc,
497 PAGE_SIZE - BTRFS_CSUM_SIZE);
498 btrfs_csum_final(crc, csum);
499 if (memcmp(csum, h->csum, sdev->csum_size))
500 ++crc_fail;
501
502 spin_lock(&sdev->stat_lock);
503 ++sdev->stat.tree_extents_scrubbed;
504 sdev->stat.tree_bytes_scrubbed += PAGE_SIZE;
505 if (crc_fail)
506 ++sdev->stat.csum_errors;
507 if (fail)
508 ++sdev->stat.verify_errors;
509 spin_unlock(&sdev->stat_lock);
510
511 return fail || crc_fail;
512}
513
514static int scrub_checksum_super(struct scrub_bio *sbio, void *buffer)
515{
516 struct btrfs_super_block *s;
517 u64 logical;
518 struct scrub_dev *sdev = sbio->sdev;
519 struct btrfs_root *root = sdev->dev->dev_root;
520 struct btrfs_fs_info *fs_info = root->fs_info;
521 u8 csum[BTRFS_CSUM_SIZE];
522 u32 crc = ~(u32)0;
523 int fail = 0;
524
525 s = (struct btrfs_super_block *)buffer;
526 logical = sbio->logical;
527
528 if (logical != le64_to_cpu(s->bytenr))
529 ++fail;
530
531 if (sbio->spag[0].generation != le64_to_cpu(s->generation))
532 ++fail;
533
534 if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
535 ++fail;
536
537 crc = btrfs_csum_data(root, buffer + BTRFS_CSUM_SIZE, crc,
538 PAGE_SIZE - BTRFS_CSUM_SIZE);
539 btrfs_csum_final(crc, csum);
540 if (memcmp(csum, s->csum, sbio->sdev->csum_size))
541 ++fail;
542
543 if (fail) {
544 /*
545 * if we find an error in a super block, we just report it.
546 * They will get written with the next transaction commit
547 * anyway
548 */
549 spin_lock(&sdev->stat_lock);
550 ++sdev->stat.super_errors;
551 spin_unlock(&sdev->stat_lock);
552 }
553
554 return fail;
555}
556
557static int scrub_submit(struct scrub_dev *sdev)
558{
559 struct scrub_bio *sbio;
560 struct bio *bio;
561 int i;
562
563 if (sdev->curr == -1)
564 return 0;
565
566 sbio = sdev->bios[sdev->curr];
567
568 bio = bio_alloc(GFP_NOFS, sbio->count);
569 if (!bio)
570 goto nomem;
571
572 bio->bi_private = sbio;
573 bio->bi_end_io = scrub_bio_end_io;
574 bio->bi_bdev = sdev->dev->bdev;
575 bio->bi_sector = sbio->physical >> 9;
576
577 for (i = 0; i < sbio->count; ++i) {
578 struct page *page;
579 int ret;
580
581 page = alloc_page(GFP_NOFS);
582 if (!page)
583 goto nomem;
584
585 ret = bio_add_page(bio, page, PAGE_SIZE, 0);
586 if (!ret) {
587 __free_page(page);
588 goto nomem;
589 }
590 }
591
592 sbio->err = 0;
593 sdev->curr = -1;
594 atomic_inc(&sdev->in_flight);
595
596 submit_bio(READ, bio);
597
598 return 0;
599
600nomem:
601 scrub_free_bio(bio);
602
603 return -ENOMEM;
604}
605
606static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len,
607 u64 physical, u64 flags, u64 gen, u64 mirror_num,
608 u8 *csum, int force)
609{
610 struct scrub_bio *sbio;
611
612again:
613 /*
614 * grab a fresh bio or wait for one to become available
615 */
616 while (sdev->curr == -1) {
617 spin_lock(&sdev->list_lock);
618 sdev->curr = sdev->first_free;
619 if (sdev->curr != -1) {
620 sdev->first_free = sdev->bios[sdev->curr]->next_free;
621 sdev->bios[sdev->curr]->next_free = -1;
622 sdev->bios[sdev->curr]->count = 0;
623 spin_unlock(&sdev->list_lock);
624 } else {
625 spin_unlock(&sdev->list_lock);
626 wait_event(sdev->list_wait, sdev->first_free != -1);
627 }
628 }
629 sbio = sdev->bios[sdev->curr];
630 if (sbio->count == 0) {
631 sbio->physical = physical;
632 sbio->logical = logical;
633 } else if (sbio->physical + sbio->count * PAGE_SIZE != physical ||
634 sbio->logical + sbio->count * PAGE_SIZE != logical) {
635 int ret;
636
637 ret = scrub_submit(sdev);
638 if (ret)
639 return ret;
640 goto again;
641 }
642 sbio->spag[sbio->count].flags = flags;
643 sbio->spag[sbio->count].generation = gen;
644 sbio->spag[sbio->count].have_csum = 0;
645 sbio->spag[sbio->count].mirror_num = mirror_num;
646 if (csum) {
647 sbio->spag[sbio->count].have_csum = 1;
648 memcpy(sbio->spag[sbio->count].csum, csum, sdev->csum_size);
649 }
650 ++sbio->count;
651 if (sbio->count == SCRUB_PAGES_PER_BIO || force) {
652 int ret;
653
654 ret = scrub_submit(sdev);
655 if (ret)
656 return ret;
657 }
658
659 return 0;
660}
661
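
scrub_page() above accumulates physically and logically contiguous pages into the current bio and submits early when the run breaks or SCRUB_PAGES_PER_BIO is reached. The accumulate-or-flush logic reduced to its skeleton (offsets stand in for pages, flush() plays the role of scrub_submit(); all names invented for illustration):

#include <stdio.h>

#define BATCH_MAX 16
#define PAGE_SZ   4096ULL

static unsigned long long batch_start;
static int batch_count;

static void flush(void)
{
        if (batch_count)
                printf("submit: %d page(s) at %llu\n", batch_count, batch_start);
        batch_count = 0;
}

/* Add one page; flush first when the run is not contiguous, and
 * flush after when the batch fills - mirroring scrub_page()'s checks. */
static void add_page(unsigned long long physical)
{
        if (batch_count &&
            batch_start + batch_count * PAGE_SZ != physical)
                flush();
        if (batch_count == 0)
                batch_start = physical;
        batch_count++;
        if (batch_count == BATCH_MAX)
                flush();
}

int main(void)
{
        add_page(0);
        add_page(PAGE_SZ);      /* contiguous: joins the batch */
        add_page(10 * PAGE_SZ); /* gap: flushes the first batch */
        flush();
        return 0;
}
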
662static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
663 u8 *csum)
664{
665 struct btrfs_ordered_sum *sum = NULL;
666 int ret = 0;
667 unsigned long i;
668 unsigned long num_sectors;
669 u32 sectorsize = sdev->dev->dev_root->sectorsize;
670
671 while (!list_empty(&sdev->csum_list)) {
672 sum = list_first_entry(&sdev->csum_list,
673 struct btrfs_ordered_sum, list);
674 if (sum->bytenr > logical)
675 return 0;
676 if (sum->bytenr + sum->len > logical)
677 break;
678
679 ++sdev->stat.csum_discards;
680 list_del(&sum->list);
681 kfree(sum);
682 sum = NULL;
683 }
684 if (!sum)
685 return 0;
686
687 num_sectors = sum->len / sectorsize;
688 for (i = 0; i < num_sectors; ++i) {
689 if (sum->sums[i].bytenr == logical) {
690 memcpy(csum, &sum->sums[i].sum, sdev->csum_size);
691 ret = 1;
692 break;
693 }
694 }
695 if (ret && i == num_sectors - 1) {
696 list_del(&sum->list);
697 kfree(sum);
698 }
699 return ret;
700}
701
702/* scrub extent tries to collect up to 64 kB for each bio */
703static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len,
704 u64 physical, u64 flags, u64 gen, u64 mirror_num)
705{
706 int ret;
707 u8 csum[BTRFS_CSUM_SIZE];
708
709 while (len) {
710 u64 l = min_t(u64, len, PAGE_SIZE);
711 int have_csum = 0;
712
713 if (flags & BTRFS_EXTENT_FLAG_DATA) {
714 /* push csums to sbio */
715 have_csum = scrub_find_csum(sdev, logical, l, csum);
716 if (have_csum == 0)
717 ++sdev->stat.no_csum;
718 }
719 ret = scrub_page(sdev, logical, l, physical, flags, gen,
720 mirror_num, have_csum ? csum : NULL, 0);
721 if (ret)
722 return ret;
723 len -= l;
724 logical += l;
725 physical += l;
726 }
727 return 0;
728}
729
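To put numbers on the 64 kB comment before scrub_extent(): the extent is walked in
page-sized steps, and scrub_page() flushes a bio once SCRUB_PAGES_PER_BIO pages are
queued. A back-of-the-envelope sketch (assuming 4KiB pages and SCRUB_PAGES_PER_BIO
= 16, which is what the 64 kB figure implies; not a quote of the kernel headers):

	#include <stdio.h>

	#define SCRUB_PAGE_SIZE 4096ULL
	#define SCRUB_PAGES_PER_BIO 16	/* assumed: 64kB / 4KiB */

	int main(void)
	{
		unsigned long long len = 192 * 1024;	/* a 192kB extent */
		unsigned long long pages = len / SCRUB_PAGE_SIZE;

		/* 48 page-sized scrub_page() calls, filling 3 full bios */
		printf("%llu pages -> %llu bios\n",
		       pages, pages / SCRUB_PAGES_PER_BIO);
		return 0;
	}
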
730static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
731 struct map_lookup *map, int num, u64 base, u64 length)
732{
733 struct btrfs_path *path;
734 struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
735 struct btrfs_root *root = fs_info->extent_root;
736 struct btrfs_root *csum_root = fs_info->csum_root;
737 struct btrfs_extent_item *extent;
738 struct blk_plug plug;
739 u64 flags;
740 int ret;
741 int slot;
742 int i;
743 u64 nstripes;
744 int start_stripe;
745 struct extent_buffer *l;
746 struct btrfs_key key;
747 u64 physical;
748 u64 logical;
749 u64 generation;
750 u64 mirror_num;
751
752 u64 increment = map->stripe_len;
753 u64 offset;
754
755 nstripes = length;
756 offset = 0;
757 do_div(nstripes, map->stripe_len);
758 if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
759 offset = map->stripe_len * num;
760 increment = map->stripe_len * map->num_stripes;
761 mirror_num = 0;
762 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
763 int factor = map->num_stripes / map->sub_stripes;
764 offset = map->stripe_len * (num / map->sub_stripes);
765 increment = map->stripe_len * factor;
766 mirror_num = num % map->sub_stripes;
767 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
768 increment = map->stripe_len;
769 mirror_num = num % map->num_stripes;
770 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
771 increment = map->stripe_len;
772 mirror_num = num % map->num_stripes;
773 } else {
774 increment = map->stripe_len;
775 mirror_num = 0;
776 }
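	/*
	 * e.g. for RAID10 with num_stripes = 4 and sub_stripes = 2,
	 * device num = 3 yields factor = 2, so this device starts at
	 * offset = stripe_len * (3 / 2) = stripe_len, advances by
	 * increment = 2 * stripe_len per step and scrubs copy
	 * mirror_num = 3 % 2 = 1 of each stripe
	 */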
777
778 path = btrfs_alloc_path();
779 if (!path)
780 return -ENOMEM;
781
782 path->reada = 2;
783 path->search_commit_root = 1;
784 path->skip_locking = 1;
785
786 /*
787 * find all extents for each stripe and just read them to get
788 * them into the page cache
789 * FIXME: we can do better. build a more intelligent prefetcher
790 */
791 logical = base + offset;
792 physical = map->stripes[num].physical;
793 ret = 0;
794 for (i = 0; i < nstripes; ++i) {
795 key.objectid = logical;
796 key.type = BTRFS_EXTENT_ITEM_KEY;
797 key.offset = (u64)0;
798
799 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
800 if (ret < 0)
801 goto out_noplug;
802
803 /*
804 * we might miss half an extent here, but that doesn't matter,
805 * as it's only the prefetch
806 */
807 while (1) {
808 l = path->nodes[0];
809 slot = path->slots[0];
810 if (slot >= btrfs_header_nritems(l)) {
811 ret = btrfs_next_leaf(root, path);
812 if (ret == 0)
813 continue;
814 if (ret < 0)
815 goto out_noplug;
816
817 break;
818 }
819 btrfs_item_key_to_cpu(l, &key, slot);
820
821 if (key.objectid >= logical + map->stripe_len)
822 break;
823
824 path->slots[0]++;
825 }
826 btrfs_release_path(path);
827 logical += increment;
828 physical += map->stripe_len;
829 cond_resched();
830 }
831
832 /*
833 * collect all data csums for the stripe to avoid seeking during
834 * the scrub. With crc32 checksums this currently amounts to about 1MB
835 */
836 start_stripe = 0;
837 blk_start_plug(&plug);
838again:
839 logical = base + offset + start_stripe * increment;
840 for (i = start_stripe; i < nstripes; ++i) {
841 ret = btrfs_lookup_csums_range(csum_root, logical,
842 logical + map->stripe_len - 1,
843 &sdev->csum_list, 1);
844 if (ret)
845 goto out;
846
847 logical += increment;
848 cond_resched();
849 }
850 /*
851 * now find all extents for each stripe and scrub them
852 */
853 logical = base + offset + start_stripe * increment;
854 physical = map->stripes[num].physical + start_stripe * map->stripe_len;
855 ret = 0;
856 for (i = start_stripe; i < nstripes; ++i) {
857 /*
858 * canceled?
859 */
860 if (atomic_read(&fs_info->scrub_cancel_req) ||
861 atomic_read(&sdev->cancel_req)) {
862 ret = -ECANCELED;
863 goto out;
864 }
865 /*
866 * check to see if we have to pause
867 */
868 if (atomic_read(&fs_info->scrub_pause_req)) {
869 /* push queued extents */
870 scrub_submit(sdev);
871 wait_event(sdev->list_wait,
872 atomic_read(&sdev->in_flight) == 0);
873 atomic_inc(&fs_info->scrubs_paused);
874 wake_up(&fs_info->scrub_pause_wait);
875 mutex_lock(&fs_info->scrub_lock);
876 while (atomic_read(&fs_info->scrub_pause_req)) {
877 mutex_unlock(&fs_info->scrub_lock);
878 wait_event(fs_info->scrub_pause_wait,
879 atomic_read(&fs_info->scrub_pause_req) == 0);
880 mutex_lock(&fs_info->scrub_lock);
881 }
882 atomic_dec(&fs_info->scrubs_paused);
883 mutex_unlock(&fs_info->scrub_lock);
884 wake_up(&fs_info->scrub_pause_wait);
885 scrub_free_csums(sdev);
886 start_stripe = i;
887 goto again;
888 }
889
890 key.objectid = logical;
891 key.type = BTRFS_EXTENT_ITEM_KEY;
892 key.offset = (u64)0;
893
894 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
895 if (ret < 0)
896 goto out;
897 if (ret > 0) {
898 ret = btrfs_previous_item(root, path, 0,
899 BTRFS_EXTENT_ITEM_KEY);
900 if (ret < 0)
901 goto out;
902 if (ret > 0) {
903 /* there's no smaller item, so stick with the
904 * larger one */
905 btrfs_release_path(path);
906 ret = btrfs_search_slot(NULL, root, &key,
907 path, 0, 0);
908 if (ret < 0)
909 goto out;
910 }
911 }
912
913 while (1) {
914 l = path->nodes[0];
915 slot = path->slots[0];
916 if (slot >= btrfs_header_nritems(l)) {
917 ret = btrfs_next_leaf(root, path);
918 if (ret == 0)
919 continue;
920 if (ret < 0)
921 goto out;
922
923 break;
924 }
925 btrfs_item_key_to_cpu(l, &key, slot);
926
927 if (key.objectid + key.offset <= logical)
928 goto next;
929
930 if (key.objectid >= logical + map->stripe_len)
931 break;
932
933 if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY)
934 goto next;
935
936 extent = btrfs_item_ptr(l, slot,
937 struct btrfs_extent_item);
938 flags = btrfs_extent_flags(l, extent);
939 generation = btrfs_extent_generation(l, extent);
940
941 if (key.objectid < logical &&
942 (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {
943 printk(KERN_ERR
944 "btrfs scrub: tree block %llu spanning "
945 "stripes, ignored. logical=%llu\n",
946 (unsigned long long)key.objectid,
947 (unsigned long long)logical);
948 goto next;
949 }
950
951 /*
952 * trim extent to this stripe
953 */
954 if (key.objectid < logical) {
955 key.offset -= logical - key.objectid;
956 key.objectid = logical;
957 }
958 if (key.objectid + key.offset >
959 logical + map->stripe_len) {
960 key.offset = logical + map->stripe_len -
961 key.objectid;
962 }
963
964 ret = scrub_extent(sdev, key.objectid, key.offset,
965 key.objectid - logical + physical,
966 flags, generation, mirror_num);
967 if (ret)
968 goto out;
969
970next:
971 path->slots[0]++;
972 }
973 btrfs_release_path(path);
974 logical += increment;
975 physical += map->stripe_len;
976 spin_lock(&sdev->stat_lock);
977 sdev->stat.last_physical = physical;
978 spin_unlock(&sdev->stat_lock);
979 }
980 /* push queued extents */
981 scrub_submit(sdev);
982
983out:
984 blk_finish_plug(&plug);
985out_noplug:
986 btrfs_free_path(path);
987 return ret < 0 ? ret : 0;
988}
989
990static noinline_for_stack int scrub_chunk(struct scrub_dev *sdev,
991 u64 chunk_tree, u64 chunk_objectid, u64 chunk_offset, u64 length)
992{
993 struct btrfs_mapping_tree *map_tree =
994 &sdev->dev->dev_root->fs_info->mapping_tree;
995 struct map_lookup *map;
996 struct extent_map *em;
997 int i;
998 int ret = -EINVAL;
999
1000 read_lock(&map_tree->map_tree.lock);
1001 em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);
1002 read_unlock(&map_tree->map_tree.lock);
1003
1004 if (!em)
1005 return -EINVAL;
1006
1007 map = (struct map_lookup *)em->bdev;
1008 if (em->start != chunk_offset)
1009 goto out;
1010
1011 if (em->len < length)
1012 goto out;
1013
1014 for (i = 0; i < map->num_stripes; ++i) {
1015 if (map->stripes[i].dev == sdev->dev) {
1016 ret = scrub_stripe(sdev, map, i, chunk_offset, length);
1017 if (ret)
1018 goto out;
1019 }
1020 }
1021out:
1022 free_extent_map(em);
1023
1024 return ret;
1025}
1026
1027static noinline_for_stack
1028int scrub_enumerate_chunks(struct scrub_dev *sdev, u64 start, u64 end)
1029{
1030 struct btrfs_dev_extent *dev_extent = NULL;
1031 struct btrfs_path *path;
1032 struct btrfs_root *root = sdev->dev->dev_root;
1033 struct btrfs_fs_info *fs_info = root->fs_info;
1034 u64 length;
1035 u64 chunk_tree;
1036 u64 chunk_objectid;
1037 u64 chunk_offset;
1038 int ret;
1039 int slot;
1040 struct extent_buffer *l;
1041 struct btrfs_key key;
1042 struct btrfs_key found_key;
1043 struct btrfs_block_group_cache *cache;
1044
1045 path = btrfs_alloc_path();
1046 if (!path)
1047 return -ENOMEM;
1048
1049 path->reada = 2;
1050 path->search_commit_root = 1;
1051 path->skip_locking = 1;
1052
1053 key.objectid = sdev->dev->devid;
1054 key.offset = 0ull;
1055 key.type = BTRFS_DEV_EXTENT_KEY;
1056
1057
1058 while (1) {
1059 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1060 if (ret < 0)
1061 break;
1062 if (ret > 0) {
1063 if (path->slots[0] >=
1064 btrfs_header_nritems(path->nodes[0])) {
1065 ret = btrfs_next_leaf(root, path);
1066 if (ret)
1067 break;
1068 }
1069 }
1070
1071 l = path->nodes[0];
1072 slot = path->slots[0];
1073
1074 btrfs_item_key_to_cpu(l, &found_key, slot);
1075
1076 if (found_key.objectid != sdev->dev->devid)
1077 break;
1078
1079 if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY)
1080 break;
1081
1082 if (found_key.offset >= end)
1083 break;
1084
1085 if (found_key.offset < key.offset)
1086 break;
1087
1088 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
1089 length = btrfs_dev_extent_length(l, dev_extent);
1090
1091 if (found_key.offset + length <= start) {
1092 key.offset = found_key.offset + length;
1093 btrfs_release_path(path);
1094 continue;
1095 }
1096
1097 chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
1098 chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
1099 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
1100
1101 /*
1102 * get a reference on the corresponding block group to prevent
1103 * the chunk from going away while we scrub it
1104 */
1105 cache = btrfs_lookup_block_group(fs_info, chunk_offset);
1106 if (!cache) {
1107 ret = -ENOENT;
1108 break;
1109 }
1110 ret = scrub_chunk(sdev, chunk_tree, chunk_objectid,
1111 chunk_offset, length);
1112 btrfs_put_block_group(cache);
1113 if (ret)
1114 break;
1115
1116 key.offset = found_key.offset + length;
1117 btrfs_release_path(path);
1118 }
1119
1120 btrfs_free_path(path);
1121
1122 /*
1123 * ret can still be 1 from search_slot or next_leaf,
1124 * that's not an error
1125 */
1126 return ret < 0 ? ret : 0;
1127}
1128
1129static noinline_for_stack int scrub_supers(struct scrub_dev *sdev)
1130{
1131 int i;
1132 u64 bytenr;
1133 u64 gen;
1134 int ret;
1135 struct btrfs_device *device = sdev->dev;
1136 struct btrfs_root *root = device->dev_root;
1137
1138 gen = root->fs_info->last_trans_committed;
1139
1140 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
1141 bytenr = btrfs_sb_offset(i);
1142 if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes)
1143 break;
1144
1145 ret = scrub_page(sdev, bytenr, PAGE_SIZE, bytenr,
1146 BTRFS_EXTENT_FLAG_SUPER, gen, i, NULL, 1);
1147 if (ret)
1148 return ret;
1149 }
1150 wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0);
1151
1152 return 0;
1153}
1154
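scrub_supers() above reads every superblock copy that fits on the device. A small
sketch of where those copies live, assuming the conventional btrfs mirror layout
(BTRFS_SUPER_MIRROR_MAX = 3 and a 16KiB base shifted by 12 bits per mirror; an
assumption about the on-disk format, not taken from this patch):

	#include <stdio.h>

	int main(void)
	{
		unsigned long long base = 16ULL * 1024;
		int i;

		printf("mirror 0: %llu\n", 64ULL * 1024);	/* 64KiB */
		for (i = 1; i < 3; i++)		/* 64MiB, then 256GiB */
			printf("mirror %d: %llu\n", i, base << (12 * i));
		return 0;
	}

Copies that would land past device->total_bytes are simply skipped by the loop in
scrub_supers().
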
1155/*
1156 * get a reference on fs_info->scrub_workers; start the workers if necessary
1157 */
1158static noinline_for_stack int scrub_workers_get(struct btrfs_root *root)
1159{
1160 struct btrfs_fs_info *fs_info = root->fs_info;
1161
1162 mutex_lock(&fs_info->scrub_lock);
1163 if (fs_info->scrub_workers_refcnt == 0) {
1164 btrfs_init_workers(&fs_info->scrub_workers, "scrub",
1165 fs_info->thread_pool_size, &fs_info->generic_worker);
1166 fs_info->scrub_workers.idle_thresh = 4;
1167 btrfs_start_workers(&fs_info->scrub_workers, 1);
1168 }
1169 ++fs_info->scrub_workers_refcnt;
1170 mutex_unlock(&fs_info->scrub_lock);
1171
1172 return 0;
1173}
1174
1175static noinline_for_stack void scrub_workers_put(struct btrfs_root *root)
1176{
1177 struct btrfs_fs_info *fs_info = root->fs_info;
1178
1179 mutex_lock(&fs_info->scrub_lock);
1180 if (--fs_info->scrub_workers_refcnt == 0)
1181 btrfs_stop_workers(&fs_info->scrub_workers);
1182 WARN_ON(fs_info->scrub_workers_refcnt < 0);
1183 mutex_unlock(&fs_info->scrub_lock);
1184}
1185
1186
1187int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
1188 struct btrfs_scrub_progress *progress, int readonly)
1189{
1190 struct scrub_dev *sdev;
1191 struct btrfs_fs_info *fs_info = root->fs_info;
1192 int ret;
1193 struct btrfs_device *dev;
1194
1195 if (btrfs_fs_closing(root->fs_info))
1196 return -EINVAL;
1197
1198 /*
1199 * check some assumptions
1200 */
1201 if (root->sectorsize != PAGE_SIZE ||
1202 root->sectorsize != root->leafsize ||
1203 root->sectorsize != root->nodesize) {
1204 printk(KERN_ERR "btrfs_scrub: size assumptions fail\n");
1205 return -EINVAL;
1206 }
1207
1208 ret = scrub_workers_get(root);
1209 if (ret)
1210 return ret;
1211
1212 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1213 dev = btrfs_find_device(root, devid, NULL, NULL);
1214 if (!dev || dev->missing) {
1215 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1216 scrub_workers_put(root);
1217 return -ENODEV;
1218 }
1219 mutex_lock(&fs_info->scrub_lock);
1220
1221 if (!dev->in_fs_metadata) {
1222 mutex_unlock(&fs_info->scrub_lock);
1223 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1224 scrub_workers_put(root);
1225 return -ENODEV;
1226 }
1227
1228 if (dev->scrub_device) {
1229 mutex_unlock(&fs_info->scrub_lock);
1230 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1231 scrub_workers_put(root);
1232 return -EINPROGRESS;
1233 }
1234 sdev = scrub_setup_dev(dev);
1235 if (IS_ERR(sdev)) {
1236 mutex_unlock(&fs_info->scrub_lock);
1237 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1238 scrub_workers_put(root);
1239 return PTR_ERR(sdev);
1240 }
1241 sdev->readonly = readonly;
1242 dev->scrub_device = sdev;
1243
1244 atomic_inc(&fs_info->scrubs_running);
1245 mutex_unlock(&fs_info->scrub_lock);
1246 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1247
1248 down_read(&fs_info->scrub_super_lock);
1249 ret = scrub_supers(sdev);
1250 up_read(&fs_info->scrub_super_lock);
1251
1252 if (!ret)
1253 ret = scrub_enumerate_chunks(sdev, start, end);
1254
1255 wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0);
1256
1257 atomic_dec(&fs_info->scrubs_running);
1258 wake_up(&fs_info->scrub_pause_wait);
1259
1260 if (progress)
1261 memcpy(progress, &sdev->stat, sizeof(*progress));
1262
1263 mutex_lock(&fs_info->scrub_lock);
1264 dev->scrub_device = NULL;
1265 mutex_unlock(&fs_info->scrub_lock);
1266
1267 scrub_free_dev(sdev);
1268 scrub_workers_put(root);
1269
1270 return ret;
1271}
1272
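/*
 * pause handshake: btrfs_scrub_pause() raises scrub_pause_req and then
 * waits until every running scrub has parked itself, i.e. until
 * scrubs_paused catches up with scrubs_running. The workers notice the
 * request in scrub_stripe(), flush their queued bios, bump
 * scrubs_paused and sleep until btrfs_scrub_continue() withdraws the
 * request.
 */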
1273int btrfs_scrub_pause(struct btrfs_root *root)
1274{
1275 struct btrfs_fs_info *fs_info = root->fs_info;
1276
1277 mutex_lock(&fs_info->scrub_lock);
1278 atomic_inc(&fs_info->scrub_pause_req);
1279 while (atomic_read(&fs_info->scrubs_paused) !=
1280 atomic_read(&fs_info->scrubs_running)) {
1281 mutex_unlock(&fs_info->scrub_lock);
1282 wait_event(fs_info->scrub_pause_wait,
1283 atomic_read(&fs_info->scrubs_paused) ==
1284 atomic_read(&fs_info->scrubs_running));
1285 mutex_lock(&fs_info->scrub_lock);
1286 }
1287 mutex_unlock(&fs_info->scrub_lock);
1288
1289 return 0;
1290}
1291
1292int btrfs_scrub_continue(struct btrfs_root *root)
1293{
1294 struct btrfs_fs_info *fs_info = root->fs_info;
1295
1296 atomic_dec(&fs_info->scrub_pause_req);
1297 wake_up(&fs_info->scrub_pause_wait);
1298 return 0;
1299}
1300
1301int btrfs_scrub_pause_super(struct btrfs_root *root)
1302{
1303 down_write(&root->fs_info->scrub_super_lock);
1304 return 0;
1305}
1306
1307int btrfs_scrub_continue_super(struct btrfs_root *root)
1308{
1309 up_write(&root->fs_info->scrub_super_lock);
1310 return 0;
1311}
1312
1313int btrfs_scrub_cancel(struct btrfs_root *root)
1314{
1315 struct btrfs_fs_info *fs_info = root->fs_info;
1316
1317 mutex_lock(&fs_info->scrub_lock);
1318 if (!atomic_read(&fs_info->scrubs_running)) {
1319 mutex_unlock(&fs_info->scrub_lock);
1320 return -ENOTCONN;
1321 }
1322
1323 atomic_inc(&fs_info->scrub_cancel_req);
1324 while (atomic_read(&fs_info->scrubs_running)) {
1325 mutex_unlock(&fs_info->scrub_lock);
1326 wait_event(fs_info->scrub_pause_wait,
1327 atomic_read(&fs_info->scrubs_running) == 0);
1328 mutex_lock(&fs_info->scrub_lock);
1329 }
1330 atomic_dec(&fs_info->scrub_cancel_req);
1331 mutex_unlock(&fs_info->scrub_lock);
1332
1333 return 0;
1334}
1335
1336int btrfs_scrub_cancel_dev(struct btrfs_root *root, struct btrfs_device *dev)
1337{
1338 struct btrfs_fs_info *fs_info = root->fs_info;
1339 struct scrub_dev *sdev;
1340
1341 mutex_lock(&fs_info->scrub_lock);
1342 sdev = dev->scrub_device;
1343 if (!sdev) {
1344 mutex_unlock(&fs_info->scrub_lock);
1345 return -ENOTCONN;
1346 }
1347 atomic_inc(&sdev->cancel_req);
1348 while (dev->scrub_device) {
1349 mutex_unlock(&fs_info->scrub_lock);
1350 wait_event(fs_info->scrub_pause_wait,
1351 dev->scrub_device == NULL);
1352 mutex_lock(&fs_info->scrub_lock);
1353 }
1354 mutex_unlock(&fs_info->scrub_lock);
1355
1356 return 0;
1357}
1358int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid)
1359{
1360 struct btrfs_fs_info *fs_info = root->fs_info;
1361 struct btrfs_device *dev;
1362 int ret;
1363
1364 /*
1365 * we have to hold the device_list_mutex here so the device
1366 * does not go away in cancel_dev. FIXME: find a better solution
1367 */
1368 mutex_lock(&fs_info->fs_devices->device_list_mutex);
1369 dev = btrfs_find_device(root, devid, NULL, NULL);
1370 if (!dev) {
1371 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
1372 return -ENODEV;
1373 }
1374 ret = btrfs_scrub_cancel_dev(root, dev);
1375 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
1376
1377 return ret;
1378}
1379
1380int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
1381 struct btrfs_scrub_progress *progress)
1382{
1383 struct btrfs_device *dev;
1384 struct scrub_dev *sdev = NULL;
1385
1386 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1387 dev = btrfs_find_device(root, devid, NULL, NULL);
1388 if (dev)
1389 sdev = dev->scrub_device;
1390 if (sdev)
1391 memcpy(progress, &sdev->stat, sizeof(*progress));
1392 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1393
1394 return dev ? (sdev ? 0 : -ENOTCONN) : -ENODEV;
1395}
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 1776dbd8dc98..15634d4648d7 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -39,7 +39,9 @@
39#include <linux/miscdevice.h> 39#include <linux/miscdevice.h>
40#include <linux/magic.h> 40#include <linux/magic.h>
41#include <linux/slab.h> 41#include <linux/slab.h>
42#include <linux/cleancache.h>
42#include "compat.h" 43#include "compat.h"
44#include "delayed-inode.h"
43#include "ctree.h" 45#include "ctree.h"
44#include "disk-io.h" 46#include "disk-io.h"
45#include "transaction.h" 47#include "transaction.h"
@@ -52,8 +54,95 @@
52#include "export.h" 54#include "export.h"
53#include "compression.h" 55#include "compression.h"
54 56
57#define CREATE_TRACE_POINTS
58#include <trace/events/btrfs.h>
59
55static const struct super_operations btrfs_super_ops; 60static const struct super_operations btrfs_super_ops;
56 61
62static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno,
63 char nbuf[16])
64{
65 char *errstr = NULL;
66
67 switch (errno) {
68 case -EIO:
69 errstr = "IO failure";
70 break;
71 case -ENOMEM:
72 errstr = "Out of memory";
73 break;
74 case -EROFS:
75 errstr = "Readonly filesystem";
76 break;
77 default:
78 if (nbuf) {
79 if (snprintf(nbuf, 16, "error %d", -errno) >= 0)
80 errstr = nbuf;
81 }
82 break;
83 }
84
85 return errstr;
86}
87
88static void __save_error_info(struct btrfs_fs_info *fs_info)
89{
90 /*
 91 * today we only save the error info in RAM; long term we'll
 92 * also write it down to the disk
93 */
94 fs_info->fs_state = BTRFS_SUPER_FLAG_ERROR;
95}
96
97/* NOTE:
 98 * We defer the write_super work to umount time to avoid a
 99 * deadlock, since umount holds all the locks.
100 */
101static void save_error_info(struct btrfs_fs_info *fs_info)
102{
103 __save_error_info(fs_info);
104}
105
106/* btrfs handles errors by forcing the filesystem read-only */
107static void btrfs_handle_error(struct btrfs_fs_info *fs_info)
108{
109 struct super_block *sb = fs_info->sb;
110
111 if (sb->s_flags & MS_RDONLY)
112 return;
113
114 if (fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR) {
115 sb->s_flags |= MS_RDONLY;
116 printk(KERN_INFO "btrfs is forced readonly\n");
117 }
118}
119
120/*
121 * __btrfs_std_error decodes expected errors from the caller and
122 * invokes the appropriate error response.
123 */
124void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
125 unsigned int line, int errno)
126{
127 struct super_block *sb = fs_info->sb;
128 char nbuf[16];
129 const char *errstr;
130
131 /*
132 * Special case: if the error is EROFS, and we're already
133 * under MS_RDONLY, then it is safe here.
134 */
135 if (errno == -EROFS && (sb->s_flags & MS_RDONLY))
136 return;
137
138 errstr = btrfs_decode_error(fs_info, errno, nbuf);
139 printk(KERN_CRIT "BTRFS error (device %s) in %s:%d: %s\n",
140 sb->s_id, function, line, errstr);
141 save_error_info(fs_info);
142
143 btrfs_handle_error(fs_info);
144}
145
57static void btrfs_put_super(struct super_block *sb) 146static void btrfs_put_super(struct super_block *sb)
58{ 147{
59 struct btrfs_root *root = btrfs_sb(sb); 148 struct btrfs_root *root = btrfs_sb(sb);
@@ -61,14 +150,19 @@ static void btrfs_put_super(struct super_block *sb)
61 150
62 ret = close_ctree(root); 151 ret = close_ctree(root);
63 sb->s_fs_info = NULL; 152 sb->s_fs_info = NULL;
153
154 (void)ret; /* FIXME: need to fix VFS to return error? */
64} 155}
65 156
66enum { 157enum {
67 Opt_degraded, Opt_subvol, Opt_subvolid, Opt_device, Opt_nodatasum, 158 Opt_degraded, Opt_subvol, Opt_subvolid, Opt_device, Opt_nodatasum,
68 Opt_nodatacow, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd, 159 Opt_nodatacow, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd,
69 Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress, 160 Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress,
70 Opt_compress_force, Opt_notreelog, Opt_ratio, Opt_flushoncommit, 161 Opt_compress_type, Opt_compress_force, Opt_compress_force_type,
71 Opt_discard, Opt_err, 162 Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
163 Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed,
164 Opt_enospc_debug, Opt_subvolrootid, Opt_defrag,
165 Opt_inode_cache, Opt_err,
72}; 166};
73 167
74static match_table_t tokens = { 168static match_table_t tokens = {
@@ -83,7 +177,9 @@ static match_table_t tokens = {
83 {Opt_alloc_start, "alloc_start=%s"}, 177 {Opt_alloc_start, "alloc_start=%s"},
84 {Opt_thread_pool, "thread_pool=%d"}, 178 {Opt_thread_pool, "thread_pool=%d"},
85 {Opt_compress, "compress"}, 179 {Opt_compress, "compress"},
180 {Opt_compress_type, "compress=%s"},
86 {Opt_compress_force, "compress-force"}, 181 {Opt_compress_force, "compress-force"},
182 {Opt_compress_force_type, "compress-force=%s"},
87 {Opt_ssd, "ssd"}, 183 {Opt_ssd, "ssd"},
88 {Opt_ssd_spread, "ssd_spread"}, 184 {Opt_ssd_spread, "ssd_spread"},
89 {Opt_nossd, "nossd"}, 185 {Opt_nossd, "nossd"},
@@ -92,6 +188,13 @@ static match_table_t tokens = {
92 {Opt_flushoncommit, "flushoncommit"}, 188 {Opt_flushoncommit, "flushoncommit"},
93 {Opt_ratio, "metadata_ratio=%d"}, 189 {Opt_ratio, "metadata_ratio=%d"},
94 {Opt_discard, "discard"}, 190 {Opt_discard, "discard"},
191 {Opt_space_cache, "space_cache"},
192 {Opt_clear_cache, "clear_cache"},
193 {Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"},
194 {Opt_enospc_debug, "enospc_debug"},
195 {Opt_subvolrootid, "subvolrootid=%d"},
196 {Opt_defrag, "autodefrag"},
197 {Opt_inode_cache, "inode_cache"},
95 {Opt_err, NULL}, 198 {Opt_err, NULL},
96}; 199};
97 200
@@ -106,6 +209,8 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
106 char *p, *num, *orig; 209 char *p, *num, *orig;
107 int intarg; 210 int intarg;
108 int ret = 0; 211 int ret = 0;
212 char *compress_type;
213 bool compress_force = false;
109 214
110 if (!options) 215 if (!options)
111 return 0; 216 return 0;
@@ -133,6 +238,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
133 break; 238 break;
134 case Opt_subvol: 239 case Opt_subvol:
135 case Opt_subvolid: 240 case Opt_subvolid:
241 case Opt_subvolrootid:
136 case Opt_device: 242 case Opt_device:
137 /* 243 /*
138 * These are parsed by btrfs_parse_early_options 244 * These are parsed by btrfs_parse_early_options
@@ -148,14 +254,32 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
148 btrfs_set_opt(info->mount_opt, NODATACOW); 254 btrfs_set_opt(info->mount_opt, NODATACOW);
149 btrfs_set_opt(info->mount_opt, NODATASUM); 255 btrfs_set_opt(info->mount_opt, NODATASUM);
150 break; 256 break;
151 case Opt_compress:
152 printk(KERN_INFO "btrfs: use compression\n");
153 btrfs_set_opt(info->mount_opt, COMPRESS);
154 break;
155 case Opt_compress_force: 257 case Opt_compress_force:
156 printk(KERN_INFO "btrfs: forcing compression\n"); 258 case Opt_compress_force_type:
157 btrfs_set_opt(info->mount_opt, FORCE_COMPRESS); 259 compress_force = true;
260 case Opt_compress:
261 case Opt_compress_type:
262 if (token == Opt_compress ||
263 token == Opt_compress_force ||
264 strcmp(args[0].from, "zlib") == 0) {
265 compress_type = "zlib";
266 info->compress_type = BTRFS_COMPRESS_ZLIB;
267 } else if (strcmp(args[0].from, "lzo") == 0) {
268 compress_type = "lzo";
269 info->compress_type = BTRFS_COMPRESS_LZO;
270 } else {
271 ret = -EINVAL;
272 goto out;
273 }
274
158 btrfs_set_opt(info->mount_opt, COMPRESS); 275 btrfs_set_opt(info->mount_opt, COMPRESS);
276 if (compress_force) {
277 btrfs_set_opt(info->mount_opt, FORCE_COMPRESS);
278 pr_info("btrfs: force %s compression\n",
279 compress_type);
280 } else
281 pr_info("btrfs: use %s compression\n",
282 compress_type);
159 break; 283 break;
160 case Opt_ssd: 284 case Opt_ssd:
161 printk(KERN_INFO "btrfs: use ssd allocation scheme\n"); 285 printk(KERN_INFO "btrfs: use ssd allocation scheme\n");
@@ -235,6 +359,28 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
235 case Opt_discard: 359 case Opt_discard:
236 btrfs_set_opt(info->mount_opt, DISCARD); 360 btrfs_set_opt(info->mount_opt, DISCARD);
237 break; 361 break;
362 case Opt_space_cache:
363 printk(KERN_INFO "btrfs: enabling disk space caching\n");
364 btrfs_set_opt(info->mount_opt, SPACE_CACHE);
365 break;
366 case Opt_inode_cache:
367 printk(KERN_INFO "btrfs: enabling inode map caching\n");
368 btrfs_set_opt(info->mount_opt, INODE_MAP_CACHE);
369 break;
370 case Opt_clear_cache:
371 printk(KERN_INFO "btrfs: force clearing of disk cache\n");
372 btrfs_set_opt(info->mount_opt, CLEAR_CACHE);
373 break;
374 case Opt_user_subvol_rm_allowed:
375 btrfs_set_opt(info->mount_opt, USER_SUBVOL_RM_ALLOWED);
376 break;
377 case Opt_enospc_debug:
378 btrfs_set_opt(info->mount_opt, ENOSPC_DEBUG);
379 break;
380 case Opt_defrag:
381 printk(KERN_INFO "btrfs: enabling auto defrag\n");
382 btrfs_set_opt(info->mount_opt, AUTO_DEFRAG);
383 break;
238 case Opt_err: 384 case Opt_err:
239 printk(KERN_INFO "btrfs: unrecognized mount option " 385 printk(KERN_INFO "btrfs: unrecognized mount option "
240 "'%s'\n", p); 386 "'%s'\n", p);
@@ -257,10 +403,10 @@ out:
257 */ 403 */
258static int btrfs_parse_early_options(const char *options, fmode_t flags, 404static int btrfs_parse_early_options(const char *options, fmode_t flags,
259 void *holder, char **subvol_name, u64 *subvol_objectid, 405 void *holder, char **subvol_name, u64 *subvol_objectid,
260 struct btrfs_fs_devices **fs_devices) 406 u64 *subvol_rootid, struct btrfs_fs_devices **fs_devices)
261{ 407{
262 substring_t args[MAX_OPT_ARGS]; 408 substring_t args[MAX_OPT_ARGS];
263 char *opts, *p; 409 char *opts, *orig, *p;
264 int error = 0; 410 int error = 0;
265 int intarg; 411 int intarg;
266 412
@@ -274,6 +420,7 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
274 opts = kstrdup(options, GFP_KERNEL); 420 opts = kstrdup(options, GFP_KERNEL);
275 if (!opts) 421 if (!opts)
276 return -ENOMEM; 422 return -ENOMEM;
423 orig = opts;
277 424
278 while ((p = strsep(&opts, ",")) != NULL) { 425 while ((p = strsep(&opts, ",")) != NULL) {
279 int token; 426 int token;
@@ -297,6 +444,18 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
297 *subvol_objectid = intarg; 444 *subvol_objectid = intarg;
298 } 445 }
299 break; 446 break;
447 case Opt_subvolrootid:
448 intarg = 0;
449 error = match_int(&args[0], &intarg);
450 if (!error) {
451 /* we want the original fs_tree */
452 if (!intarg)
453 *subvol_rootid =
454 BTRFS_FS_TREE_OBJECTID;
455 else
456 *subvol_rootid = intarg;
457 }
458 break;
300 case Opt_device: 459 case Opt_device:
301 error = btrfs_scan_one_device(match_strdup(&args[0]), 460 error = btrfs_scan_one_device(match_strdup(&args[0]),
302 flags, holder, fs_devices); 461 flags, holder, fs_devices);
@@ -309,7 +468,7 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
309 } 468 }
310 469
311 out_free_opts: 470 out_free_opts:
312 kfree(opts); 471 kfree(orig);
313 out: 472 out:
314 /* 473 /*
315 * If no subvolume name is specified we use the default one. Allocate 474 * If no subvolume name is specified we use the default one. Allocate
@@ -360,8 +519,10 @@ static struct dentry *get_default_root(struct super_block *sb,
360 */ 519 */
361 dir_id = btrfs_super_root_dir(&root->fs_info->super_copy); 520 dir_id = btrfs_super_root_dir(&root->fs_info->super_copy);
362 di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0); 521 di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0);
363 if (IS_ERR(di)) 522 if (IS_ERR(di)) {
523 btrfs_free_path(path);
364 return ERR_CAST(di); 524 return ERR_CAST(di);
525 }
365 if (!di) { 526 if (!di) {
366 /* 527 /*
367 * Ok the default dir item isn't there. This is weird since 528 * Ok the default dir item isn't there. This is weird since
@@ -380,7 +541,7 @@ static struct dentry *get_default_root(struct super_block *sb,
380find_root: 541find_root:
381 new_root = btrfs_read_fs_root_no_name(root->fs_info, &location); 542 new_root = btrfs_read_fs_root_no_name(root->fs_info, &location);
382 if (IS_ERR(new_root)) 543 if (IS_ERR(new_root))
383 return ERR_PTR(PTR_ERR(new_root)); 544 return ERR_CAST(new_root);
384 545
385 if (btrfs_root_refs(&new_root->root_item) == 0) 546 if (btrfs_root_refs(&new_root->root_item) == 0)
386 return ERR_PTR(-ENOENT); 547 return ERR_PTR(-ENOENT);
@@ -436,7 +597,6 @@ static int btrfs_fill_super(struct super_block *sb,
436{ 597{
437 struct inode *inode; 598 struct inode *inode;
438 struct dentry *root_dentry; 599 struct dentry *root_dentry;
439 struct btrfs_super_block *disk_super;
440 struct btrfs_root *tree_root; 600 struct btrfs_root *tree_root;
441 struct btrfs_key key; 601 struct btrfs_key key;
442 int err; 602 int err;
@@ -444,6 +604,7 @@ static int btrfs_fill_super(struct super_block *sb,
444 sb->s_maxbytes = MAX_LFS_FILESIZE; 604 sb->s_maxbytes = MAX_LFS_FILESIZE;
445 sb->s_magic = BTRFS_SUPER_MAGIC; 605 sb->s_magic = BTRFS_SUPER_MAGIC;
446 sb->s_op = &btrfs_super_ops; 606 sb->s_op = &btrfs_super_ops;
607 sb->s_d_op = &btrfs_dentry_operations;
447 sb->s_export_op = &btrfs_export_ops; 608 sb->s_export_op = &btrfs_export_ops;
448 sb->s_xattr = btrfs_xattr_handlers; 609 sb->s_xattr = btrfs_xattr_handlers;
449 sb->s_time_gran = 1; 610 sb->s_time_gran = 1;
@@ -458,7 +619,6 @@ static int btrfs_fill_super(struct super_block *sb,
458 return PTR_ERR(tree_root); 619 return PTR_ERR(tree_root);
459 } 620 }
460 sb->s_fs_info = tree_root; 621 sb->s_fs_info = tree_root;
461 disk_super = &tree_root->fs_info->super_copy;
462 622
463 key.objectid = BTRFS_FIRST_FREE_OBJECTID; 623 key.objectid = BTRFS_FIRST_FREE_OBJECTID;
464 key.type = BTRFS_INODE_ITEM_KEY; 624 key.type = BTRFS_INODE_ITEM_KEY;
@@ -479,6 +639,7 @@ static int btrfs_fill_super(struct super_block *sb,
479 sb->s_root = root_dentry; 639 sb->s_root = root_dentry;
480 640
481 save_mount_options(sb, data); 641 save_mount_options(sb, data);
642 cleancache_init_fs(sb);
482 return 0; 643 return 0;
483 644
484fail_close: 645fail_close:
@@ -492,6 +653,8 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
492 struct btrfs_root *root = btrfs_sb(sb); 653 struct btrfs_root *root = btrfs_sb(sb);
493 int ret; 654 int ret;
494 655
656 trace_btrfs_sync_fs(wait);
657
495 if (!wait) { 658 if (!wait) {
496 filemap_flush(root->fs_info->btree_inode->i_mapping); 659 filemap_flush(root->fs_info->btree_inode->i_mapping);
497 return 0; 660 return 0;
@@ -501,6 +664,8 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
501 btrfs_wait_ordered_extents(root, 0, 0); 664 btrfs_wait_ordered_extents(root, 0, 0);
502 665
503 trans = btrfs_start_transaction(root, 0); 666 trans = btrfs_start_transaction(root, 0);
667 if (IS_ERR(trans))
668 return PTR_ERR(trans);
504 ret = btrfs_commit_transaction(trans, root); 669 ret = btrfs_commit_transaction(trans, root);
505 return ret; 670 return ret;
506} 671}
@@ -509,6 +674,7 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
509{ 674{
510 struct btrfs_root *root = btrfs_sb(vfs->mnt_sb); 675 struct btrfs_root *root = btrfs_sb(vfs->mnt_sb);
511 struct btrfs_fs_info *info = root->fs_info; 676 struct btrfs_fs_info *info = root->fs_info;
677 char *compress_type;
512 678
513 if (btrfs_test_opt(root, DEGRADED)) 679 if (btrfs_test_opt(root, DEGRADED))
514 seq_puts(seq, ",degraded"); 680 seq_puts(seq, ",degraded");
@@ -527,8 +693,16 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
527 if (info->thread_pool_size != min_t(unsigned long, 693 if (info->thread_pool_size != min_t(unsigned long,
528 num_online_cpus() + 2, 8)) 694 num_online_cpus() + 2, 8))
529 seq_printf(seq, ",thread_pool=%d", info->thread_pool_size); 695 seq_printf(seq, ",thread_pool=%d", info->thread_pool_size);
530 if (btrfs_test_opt(root, COMPRESS)) 696 if (btrfs_test_opt(root, COMPRESS)) {
531 seq_puts(seq, ",compress"); 697 if (info->compress_type == BTRFS_COMPRESS_ZLIB)
698 compress_type = "zlib";
699 else
700 compress_type = "lzo";
701 if (btrfs_test_opt(root, FORCE_COMPRESS))
702 seq_printf(seq, ",compress-force=%s", compress_type);
703 else
704 seq_printf(seq, ",compress=%s", compress_type);
705 }
532 if (btrfs_test_opt(root, NOSSD)) 706 if (btrfs_test_opt(root, NOSSD))
533 seq_puts(seq, ",nossd"); 707 seq_puts(seq, ",nossd");
534 if (btrfs_test_opt(root, SSD_SPREAD)) 708 if (btrfs_test_opt(root, SSD_SPREAD))
@@ -543,46 +717,74 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
543 seq_puts(seq, ",discard"); 717 seq_puts(seq, ",discard");
544 if (!(root->fs_info->sb->s_flags & MS_POSIXACL)) 718 if (!(root->fs_info->sb->s_flags & MS_POSIXACL))
545 seq_puts(seq, ",noacl"); 719 seq_puts(seq, ",noacl");
720 if (btrfs_test_opt(root, SPACE_CACHE))
721 seq_puts(seq, ",space_cache");
722 if (btrfs_test_opt(root, CLEAR_CACHE))
723 seq_puts(seq, ",clear_cache");
724 if (btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED))
725 seq_puts(seq, ",user_subvol_rm_allowed");
726 if (btrfs_test_opt(root, ENOSPC_DEBUG))
727 seq_puts(seq, ",enospc_debug");
728 if (btrfs_test_opt(root, AUTO_DEFRAG))
729 seq_puts(seq, ",autodefrag");
730 if (btrfs_test_opt(root, INODE_MAP_CACHE))
731 seq_puts(seq, ",inode_cache");
546 return 0; 732 return 0;
547} 733}
548 734
549static int btrfs_test_super(struct super_block *s, void *data) 735static int btrfs_test_super(struct super_block *s, void *data)
550{ 736{
551 struct btrfs_fs_devices *test_fs_devices = data; 737 struct btrfs_root *test_root = data;
552 struct btrfs_root *root = btrfs_sb(s); 738 struct btrfs_root *root = btrfs_sb(s);
553 739
554 return root->fs_info->fs_devices == test_fs_devices; 740 /*
741 * If this super block is going away, return false as it
742 * can't match as an existing super block.
743 */
744 if (!atomic_read(&s->s_active))
745 return 0;
746 return root->fs_info->fs_devices == test_root->fs_info->fs_devices;
555} 747}
556 748
749static int btrfs_set_super(struct super_block *s, void *data)
750{
751 s->s_fs_info = data;
752
753 return set_anon_super(s, data);
754}
755
756
557/* 757/*
558 * Find a superblock for the given device / mount point. 758 * Find a superblock for the given device / mount point.
559 * 759 *
560 * Note: This is based on get_sb_bdev from fs/super.c with a few additions 760 * Note: This is based on get_sb_bdev from fs/super.c with a few additions
561 * for multiple device setup. Make sure to keep it in sync. 761 * for multiple device setup. Make sure to keep it in sync.
562 */ 762 */
563static int btrfs_get_sb(struct file_system_type *fs_type, int flags, 763static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
564 const char *dev_name, void *data, struct vfsmount *mnt) 764 const char *device_name, void *data)
565{ 765{
566 struct block_device *bdev = NULL; 766 struct block_device *bdev = NULL;
567 struct super_block *s; 767 struct super_block *s;
568 struct dentry *root; 768 struct dentry *root;
569 struct btrfs_fs_devices *fs_devices = NULL; 769 struct btrfs_fs_devices *fs_devices = NULL;
770 struct btrfs_root *tree_root = NULL;
771 struct btrfs_fs_info *fs_info = NULL;
570 fmode_t mode = FMODE_READ; 772 fmode_t mode = FMODE_READ;
571 char *subvol_name = NULL; 773 char *subvol_name = NULL;
572 u64 subvol_objectid = 0; 774 u64 subvol_objectid = 0;
775 u64 subvol_rootid = 0;
573 int error = 0; 776 int error = 0;
574 int found = 0;
575 777
576 if (!(flags & MS_RDONLY)) 778 if (!(flags & MS_RDONLY))
577 mode |= FMODE_WRITE; 779 mode |= FMODE_WRITE;
578 780
579 error = btrfs_parse_early_options(data, mode, fs_type, 781 error = btrfs_parse_early_options(data, mode, fs_type,
580 &subvol_name, &subvol_objectid, 782 &subvol_name, &subvol_objectid,
581 &fs_devices); 783 &subvol_rootid, &fs_devices);
582 if (error) 784 if (error)
583 return error; 785 return ERR_PTR(error);
584 786
585 error = btrfs_scan_one_device(dev_name, mode, fs_type, &fs_devices); 787 error = btrfs_scan_one_device(device_name, mode, fs_type, &fs_devices);
586 if (error) 788 if (error)
587 goto error_free_subvol_name; 789 goto error_free_subvol_name;
588 790
@@ -595,8 +797,24 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
595 goto error_close_devices; 797 goto error_close_devices;
596 } 798 }
597 799
800 /*
801 * Setup a dummy root and fs_info for test/set super. This is because
802 * we don't actually fill this stuff out until open_ctree, but we need
803 * it for searching for existing supers, so this lets us do that and
804 * then open_ctree will properly initialize everything later.
805 */
806 fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_NOFS);
807 tree_root = kzalloc(sizeof(struct btrfs_root), GFP_NOFS);
808 if (!fs_info || !tree_root) {
809 error = -ENOMEM;
810 goto error_close_devices;
811 }
812 fs_info->tree_root = tree_root;
813 fs_info->fs_devices = fs_devices;
814 tree_root->fs_info = fs_info;
815
598 bdev = fs_devices->latest_bdev; 816 bdev = fs_devices->latest_bdev;
599 s = sget(fs_type, btrfs_test_super, set_anon_super, fs_devices); 817 s = sget(fs_type, btrfs_test_super, btrfs_set_super, tree_root);
600 if (IS_ERR(s)) 818 if (IS_ERR(s))
601 goto error_s; 819 goto error_s;
602 820
@@ -607,12 +825,13 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
607 goto error_close_devices; 825 goto error_close_devices;
608 } 826 }
609 827
610 found = 1;
611 btrfs_close_devices(fs_devices); 828 btrfs_close_devices(fs_devices);
829 kfree(fs_info);
830 kfree(tree_root);
612 } else { 831 } else {
613 char b[BDEVNAME_SIZE]; 832 char b[BDEVNAME_SIZE];
614 833
615 s->s_flags = flags; 834 s->s_flags = flags | MS_NOSEC;
616 strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); 835 strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
617 error = btrfs_fill_super(s, fs_devices, data, 836 error = btrfs_fill_super(s, fs_devices, data,
618 flags & MS_SILENT ? 1 : 0); 837 flags & MS_SILENT ? 1 : 0);
@@ -625,51 +844,58 @@ static int btrfs_get_sb(struct file_system_type *fs_type, int flags,
625 s->s_flags |= MS_ACTIVE; 844 s->s_flags |= MS_ACTIVE;
626 } 845 }
627 846
628 root = get_default_root(s, subvol_objectid);
629 if (IS_ERR(root)) {
630 error = PTR_ERR(root);
631 deactivate_locked_super(s);
632 goto error;
633 }
634 /* if they gave us a subvolume name bind mount into that */ 847 /* if they gave us a subvolume name bind mount into that */
635 if (strcmp(subvol_name, ".")) { 848 if (strcmp(subvol_name, ".")) {
636 struct dentry *new_root; 849 struct dentry *new_root;
850
851 root = get_default_root(s, subvol_rootid);
852 if (IS_ERR(root)) {
853 error = PTR_ERR(root);
854 deactivate_locked_super(s);
855 goto error_free_subvol_name;
856 }
857
637 mutex_lock(&root->d_inode->i_mutex); 858 mutex_lock(&root->d_inode->i_mutex);
638 new_root = lookup_one_len(subvol_name, root, 859 new_root = lookup_one_len(subvol_name, root,
639 strlen(subvol_name)); 860 strlen(subvol_name));
640 mutex_unlock(&root->d_inode->i_mutex); 861 mutex_unlock(&root->d_inode->i_mutex);
641 862
642 if (IS_ERR(new_root)) { 863 if (IS_ERR(new_root)) {
864 dput(root);
643 deactivate_locked_super(s); 865 deactivate_locked_super(s);
644 error = PTR_ERR(new_root); 866 error = PTR_ERR(new_root);
645 dput(root); 867 goto error_free_subvol_name;
646 goto error_close_devices;
647 } 868 }
648 if (!new_root->d_inode) { 869 if (!new_root->d_inode) {
649 dput(root); 870 dput(root);
650 dput(new_root); 871 dput(new_root);
651 deactivate_locked_super(s); 872 deactivate_locked_super(s);
652 error = -ENXIO; 873 error = -ENXIO;
653 goto error_close_devices; 874 goto error_free_subvol_name;
654 } 875 }
655 dput(root); 876 dput(root);
656 root = new_root; 877 root = new_root;
878 } else {
879 root = get_default_root(s, subvol_objectid);
880 if (IS_ERR(root)) {
881 error = PTR_ERR(root);
882 deactivate_locked_super(s);
883 goto error_free_subvol_name;
884 }
657 } 885 }
658 886
659 mnt->mnt_sb = s;
660 mnt->mnt_root = root;
661
662 kfree(subvol_name); 887 kfree(subvol_name);
663 return 0; 888 return root;
664 889
665error_s: 890error_s:
666 error = PTR_ERR(s); 891 error = PTR_ERR(s);
667error_close_devices: 892error_close_devices:
668 btrfs_close_devices(fs_devices); 893 btrfs_close_devices(fs_devices);
894 kfree(fs_info);
895 kfree(tree_root);
669error_free_subvol_name: 896error_free_subvol_name:
670 kfree(subvol_name); 897 kfree(subvol_name);
671error: 898 return ERR_PTR(error);
672 return error;
673} 899}
674 900
675static int btrfs_remount(struct super_block *sb, int *flags, char *data) 901static int btrfs_remount(struct super_block *sb, int *flags, char *data)
@@ -709,6 +935,153 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
709 return 0; 935 return 0;
710} 936}
711 937
938/* Used to sort the devices by max_avail (descending sort) */
939static int btrfs_cmp_device_free_bytes(const void *dev_info1,
940 const void *dev_info2)
941{
942 if (((struct btrfs_device_info *)dev_info1)->max_avail >
943 ((struct btrfs_device_info *)dev_info2)->max_avail)
944 return -1;
945 else if (((struct btrfs_device_info *)dev_info1)->max_avail <
946 ((struct btrfs_device_info *)dev_info2)->max_avail)
947 return 1;
948 else
949 return 0;
950}
951
952/*
953 * sort the devices by max_avail, which holds the size of the largest
954 * free extent on each device (descending sort)
955 */
956static inline void btrfs_descending_sort_devices(
957 struct btrfs_device_info *devices,
958 size_t nr_devices)
959{
960 sort(devices, nr_devices, sizeof(struct btrfs_device_info),
961 btrfs_cmp_device_free_bytes, NULL);
962}
963
964/*
965 * Helper to calculate the free space on the devices that can be used
966 * to store file data.
967 */
968static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
969{
970 struct btrfs_fs_info *fs_info = root->fs_info;
971 struct btrfs_device_info *devices_info;
972 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
973 struct btrfs_device *device;
974 u64 skip_space;
975 u64 type;
976 u64 avail_space;
977 u64 used_space;
978 u64 min_stripe_size;
979 int min_stripes = 1;
980 int i = 0, nr_devices;
981 int ret;
982
983 nr_devices = fs_info->fs_devices->rw_devices;
984 BUG_ON(!nr_devices);
985
986 devices_info = kmalloc(sizeof(*devices_info) * nr_devices,
987 GFP_NOFS);
988 if (!devices_info)
989 return -ENOMEM;
990
991 /* calc the min stripe number for data space allocation */
992 type = btrfs_get_alloc_profile(root, 1);
993 if (type & BTRFS_BLOCK_GROUP_RAID0)
994 min_stripes = 2;
995 else if (type & BTRFS_BLOCK_GROUP_RAID1)
996 min_stripes = 2;
997 else if (type & BTRFS_BLOCK_GROUP_RAID10)
998 min_stripes = 4;
999
1000 if (type & BTRFS_BLOCK_GROUP_DUP)
1001 min_stripe_size = 2 * BTRFS_STRIPE_LEN;
1002 else
1003 min_stripe_size = BTRFS_STRIPE_LEN;
1004
1005 list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
1006 if (!device->in_fs_metadata)
1007 continue;
1008
1009 avail_space = device->total_bytes - device->bytes_used;
1010
1011 /* align with stripe_len */
1012 do_div(avail_space, BTRFS_STRIPE_LEN);
1013 avail_space *= BTRFS_STRIPE_LEN;
1014
1015 /*
1016 * In order to avoid overwriting the superblock on the drive,
1017 * btrfs starts at an offset of at least 1MB when doing chunk
1018 * allocation.
1019 */
1020 skip_space = 1024 * 1024;
1021
1022 /* user can set the offset in fs_info->alloc_start. */
1023 if (fs_info->alloc_start + BTRFS_STRIPE_LEN <=
1024 device->total_bytes)
1025 skip_space = max(fs_info->alloc_start, skip_space);
1026
1027 /*
1028 * btrfs cannot use the free space in [0, skip_space - 1],
1029 * we must subtract it from the total. In order to implement
1030 * it, we account the used space in this range first.
1031 */
1032 ret = btrfs_account_dev_extents_size(device, 0, skip_space - 1,
1033 &used_space);
1034 if (ret) {
1035 kfree(devices_info);
1036 return ret;
1037 }
1038
1039 /* calc the free space in [0, skip_space - 1] */
1040 skip_space -= used_space;
1041
1042 /*
1043 * we cannot use the free space in [0, skip_space - 1], so
1044 * subtract it from the total.
1045 */
1046 if (avail_space && avail_space >= skip_space)
1047 avail_space -= skip_space;
1048 else
1049 avail_space = 0;
1050
1051 if (avail_space < min_stripe_size)
1052 continue;
1053
1054 devices_info[i].dev = device;
1055 devices_info[i].max_avail = avail_space;
1056
1057 i++;
1058 }
1059
1060 nr_devices = i;
1061
1062 btrfs_descending_sort_devices(devices_info, nr_devices);
1063
1064 i = nr_devices - 1;
1065 avail_space = 0;
1066 while (nr_devices >= min_stripes) {
1067 if (devices_info[i].max_avail >= min_stripe_size) {
1068 int j;
1069 u64 alloc_size;
1070
1071 avail_space += devices_info[i].max_avail * min_stripes;
1072 alloc_size = devices_info[i].max_avail;
1073 for (j = i + 1 - min_stripes; j <= i; j++)
1074 devices_info[j].max_avail -= alloc_size;
1075 }
1076 i--;
1077 nr_devices--;
1078 }
1079
1080 kfree(devices_info);
1081 *free_bytes = avail_space;
1082 return 0;
1083}
1084
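The loop above is a greedy estimate: with the devices sorted by free space in
descending order, it repeatedly "allocates" a stripe set of min_stripes devices
ending at the smallest remaining one. A standalone trace of the same arithmetic
(illustrative only; the units and values are made up):

	#include <stdio.h>

	int main(void)
	{
		/* free GiB per device, already sorted descending */
		unsigned long long dev[] = { 10, 5, 3 };
		int nr = 3, min_stripes = 2, i = nr - 1;
		unsigned long long avail = 0;

		while (nr >= min_stripes) {
			/* the kernel also requires >= min_stripe_size here */
			if (dev[i]) {
				unsigned long long alloc = dev[i];
				int j;

				avail += alloc * min_stripes;
				for (j = i + 1 - min_stripes; j <= i; j++)
					dev[j] -= alloc;
			}
			i--;
			nr--;
		}
		printf("%llu GiB raw\n", avail);	/* prints 10 GiB raw */
		return 0;
	}

For RAID1 (min_stripes = 2) the three devices of 10, 5 and 3 GiB thus contribute
an estimated 10 GiB of raw stripe space.
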
712static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) 1085static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
713{ 1086{
714 struct btrfs_root *root = btrfs_sb(dentry->d_sb); 1087 struct btrfs_root *root = btrfs_sb(dentry->d_sb);
@@ -716,20 +1089,39 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
716 struct list_head *head = &root->fs_info->space_info; 1089 struct list_head *head = &root->fs_info->space_info;
717 struct btrfs_space_info *found; 1090 struct btrfs_space_info *found;
718 u64 total_used = 0; 1091 u64 total_used = 0;
1092 u64 total_free_data = 0;
719 int bits = dentry->d_sb->s_blocksize_bits; 1093 int bits = dentry->d_sb->s_blocksize_bits;
720 __be32 *fsid = (__be32 *)root->fs_info->fsid; 1094 __be32 *fsid = (__be32 *)root->fs_info->fsid;
1095 int ret;
721 1096
1097 /* hold the chunk_mutex to avoid allocating new chunks */
1098 mutex_lock(&root->fs_info->chunk_mutex);
722 rcu_read_lock(); 1099 rcu_read_lock();
723 list_for_each_entry_rcu(found, head, list) 1100 list_for_each_entry_rcu(found, head, list) {
1101 if (found->flags & BTRFS_BLOCK_GROUP_DATA) {
1102 total_free_data += found->disk_total - found->disk_used;
1103 total_free_data -=
1104 btrfs_account_ro_block_groups_free_space(found);
1105 }
1106
724 total_used += found->disk_used; 1107 total_used += found->disk_used;
1108 }
725 rcu_read_unlock(); 1109 rcu_read_unlock();
726 1110
727 buf->f_namelen = BTRFS_NAME_LEN; 1111 buf->f_namelen = BTRFS_NAME_LEN;
728 buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits; 1112 buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits;
729 buf->f_bfree = buf->f_blocks - (total_used >> bits); 1113 buf->f_bfree = buf->f_blocks - (total_used >> bits);
730 buf->f_bavail = buf->f_bfree;
731 buf->f_bsize = dentry->d_sb->s_blocksize; 1114 buf->f_bsize = dentry->d_sb->s_blocksize;
732 buf->f_type = BTRFS_SUPER_MAGIC; 1115 buf->f_type = BTRFS_SUPER_MAGIC;
1116 buf->f_bavail = total_free_data;
1117 ret = btrfs_calc_avail_data_space(root, &total_free_data);
1118 if (ret) {
1119 mutex_unlock(&root->fs_info->chunk_mutex);
1120 return ret;
1121 }
1122 buf->f_bavail += total_free_data;
1123 buf->f_bavail = buf->f_bavail >> bits;
1124 mutex_unlock(&root->fs_info->chunk_mutex);
733 1125
734 /* We treat it as constant endianness (it doesn't matter _which_) 1126 /* We treat it as constant endianness (it doesn't matter _which_)
735 because we want the fsid to come out the same whether mounted 1127 because we want the fsid to come out the same whether mounted
@@ -746,7 +1138,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
746static struct file_system_type btrfs_fs_type = { 1138static struct file_system_type btrfs_fs_type = {
747 .owner = THIS_MODULE, 1139 .owner = THIS_MODULE,
748 .name = "btrfs", 1140 .name = "btrfs",
749 .get_sb = btrfs_get_sb, 1141 .mount = btrfs_mount,
750 .kill_sb = kill_anon_super, 1142 .kill_sb = kill_anon_super,
751 .fs_flags = FS_REQUIRES_DEV, 1143 .fs_flags = FS_REQUIRES_DEV,
752}; 1144};
@@ -815,6 +1207,7 @@ static const struct file_operations btrfs_ctl_fops = {
815 .unlocked_ioctl = btrfs_control_ioctl, 1207 .unlocked_ioctl = btrfs_control_ioctl,
816 .compat_ioctl = btrfs_control_ioctl, 1208 .compat_ioctl = btrfs_control_ioctl,
817 .owner = THIS_MODULE, 1209 .owner = THIS_MODULE,
1210 .llseek = noop_llseek,
818}; 1211};
819 1212
820static struct miscdevice btrfs_misc = { 1213static struct miscdevice btrfs_misc = {
@@ -845,10 +1238,14 @@ static int __init init_btrfs_fs(void)
845 if (err) 1238 if (err)
846 return err; 1239 return err;
847 1240
848 err = btrfs_init_cachep(); 1241 err = btrfs_init_compress();
849 if (err) 1242 if (err)
850 goto free_sysfs; 1243 goto free_sysfs;
851 1244
1245 err = btrfs_init_cachep();
1246 if (err)
1247 goto free_compress;
1248
852 err = extent_io_init(); 1249 err = extent_io_init();
853 if (err) 1250 if (err)
854 goto free_cachep; 1251 goto free_cachep;
@@ -857,10 +1254,14 @@ static int __init init_btrfs_fs(void)
857 if (err) 1254 if (err)
858 goto free_extent_io; 1255 goto free_extent_io;
859 1256
860 err = btrfs_interface_init(); 1257 err = btrfs_delayed_inode_init();
861 if (err) 1258 if (err)
862 goto free_extent_map; 1259 goto free_extent_map;
863 1260
1261 err = btrfs_interface_init();
1262 if (err)
1263 goto free_delayed_inode;
1264
864 err = register_filesystem(&btrfs_fs_type); 1265 err = register_filesystem(&btrfs_fs_type);
865 if (err) 1266 if (err)
866 goto unregister_ioctl; 1267 goto unregister_ioctl;
@@ -870,12 +1271,16 @@ static int __init init_btrfs_fs(void)
870 1271
871unregister_ioctl: 1272unregister_ioctl:
872 btrfs_interface_exit(); 1273 btrfs_interface_exit();
1274free_delayed_inode:
1275 btrfs_delayed_inode_exit();
873free_extent_map: 1276free_extent_map:
874 extent_map_exit(); 1277 extent_map_exit();
875free_extent_io: 1278free_extent_io:
876 extent_io_exit(); 1279 extent_io_exit();
877free_cachep: 1280free_cachep:
878 btrfs_destroy_cachep(); 1281 btrfs_destroy_cachep();
1282free_compress:
1283 btrfs_exit_compress();
879free_sysfs: 1284free_sysfs:
880 btrfs_exit_sysfs(); 1285 btrfs_exit_sysfs();
881 return err; 1286 return err;
@@ -884,13 +1289,14 @@ free_sysfs:
884static void __exit exit_btrfs_fs(void) 1289static void __exit exit_btrfs_fs(void)
885{ 1290{
886 btrfs_destroy_cachep(); 1291 btrfs_destroy_cachep();
1292 btrfs_delayed_inode_exit();
887 extent_map_exit(); 1293 extent_map_exit();
888 extent_io_exit(); 1294 extent_io_exit();
889 btrfs_interface_exit(); 1295 btrfs_interface_exit();
890 unregister_filesystem(&btrfs_fs_type); 1296 unregister_filesystem(&btrfs_fs_type);
891 btrfs_exit_sysfs(); 1297 btrfs_exit_sysfs();
892 btrfs_cleanup_fs_uuids(); 1298 btrfs_cleanup_fs_uuids();
893 btrfs_zlib_exit(); 1299 btrfs_exit_compress();
894} 1300}
895 1301
896module_init(init_btrfs_fs) 1302module_init(init_btrfs_fs)
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 4ce16ef702a3..daac9ae6d731 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -28,232 +28,9 @@
28#include "disk-io.h" 28#include "disk-io.h"
29#include "transaction.h" 29#include "transaction.h"
30 30
31static ssize_t root_blocks_used_show(struct btrfs_root *root, char *buf)
32{
33 return snprintf(buf, PAGE_SIZE, "%llu\n",
34 (unsigned long long)btrfs_root_used(&root->root_item));
35}
36
37static ssize_t root_block_limit_show(struct btrfs_root *root, char *buf)
38{
39 return snprintf(buf, PAGE_SIZE, "%llu\n",
40 (unsigned long long)btrfs_root_limit(&root->root_item));
41}
42
43static ssize_t super_blocks_used_show(struct btrfs_fs_info *fs, char *buf)
44{
45
46 return snprintf(buf, PAGE_SIZE, "%llu\n",
47 (unsigned long long)btrfs_super_bytes_used(&fs->super_copy));
48}
49
50static ssize_t super_total_blocks_show(struct btrfs_fs_info *fs, char *buf)
51{
52 return snprintf(buf, PAGE_SIZE, "%llu\n",
53 (unsigned long long)btrfs_super_total_bytes(&fs->super_copy));
54}
55
56static ssize_t super_blocksize_show(struct btrfs_fs_info *fs, char *buf)
57{
58 return snprintf(buf, PAGE_SIZE, "%llu\n",
59 (unsigned long long)btrfs_super_sectorsize(&fs->super_copy));
60}
61
62/* this is for root attrs (subvols/snapshots) */
63struct btrfs_root_attr {
64 struct attribute attr;
65 ssize_t (*show)(struct btrfs_root *, char *);
66 ssize_t (*store)(struct btrfs_root *, const char *, size_t);
67};
68
69#define ROOT_ATTR(name, mode, show, store) \
70static struct btrfs_root_attr btrfs_root_attr_##name = __ATTR(name, mode, \
71 show, store)
72
73ROOT_ATTR(blocks_used, 0444, root_blocks_used_show, NULL);
74ROOT_ATTR(block_limit, 0644, root_block_limit_show, NULL);
75
76static struct attribute *btrfs_root_attrs[] = {
77 &btrfs_root_attr_blocks_used.attr,
78 &btrfs_root_attr_block_limit.attr,
79 NULL,
80};
81
82/* this is for super attrs (actual full fs) */
83struct btrfs_super_attr {
84 struct attribute attr;
85 ssize_t (*show)(struct btrfs_fs_info *, char *);
86 ssize_t (*store)(struct btrfs_fs_info *, const char *, size_t);
87};
88
89#define SUPER_ATTR(name, mode, show, store) \
90static struct btrfs_super_attr btrfs_super_attr_##name = __ATTR(name, mode, \
91 show, store)
92
93SUPER_ATTR(blocks_used, 0444, super_blocks_used_show, NULL);
94SUPER_ATTR(total_blocks, 0444, super_total_blocks_show, NULL);
95SUPER_ATTR(blocksize, 0444, super_blocksize_show, NULL);
96
97static struct attribute *btrfs_super_attrs[] = {
98 &btrfs_super_attr_blocks_used.attr,
99 &btrfs_super_attr_total_blocks.attr,
100 &btrfs_super_attr_blocksize.attr,
101 NULL,
102};
103
104static ssize_t btrfs_super_attr_show(struct kobject *kobj,
105 struct attribute *attr, char *buf)
106{
107 struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info,
108 super_kobj);
109 struct btrfs_super_attr *a = container_of(attr,
110 struct btrfs_super_attr,
111 attr);
112
113 return a->show ? a->show(fs, buf) : 0;
114}
115
116static ssize_t btrfs_super_attr_store(struct kobject *kobj,
117 struct attribute *attr,
118 const char *buf, size_t len)
119{
120 struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info,
121 super_kobj);
122 struct btrfs_super_attr *a = container_of(attr,
123 struct btrfs_super_attr,
124 attr);
125
126 return a->store ? a->store(fs, buf, len) : 0;
127}
128
129static ssize_t btrfs_root_attr_show(struct kobject *kobj,
130 struct attribute *attr, char *buf)
131{
132 struct btrfs_root *root = container_of(kobj, struct btrfs_root,
133 root_kobj);
134 struct btrfs_root_attr *a = container_of(attr,
135 struct btrfs_root_attr,
136 attr);
137
138 return a->show ? a->show(root, buf) : 0;
139}
140
141static ssize_t btrfs_root_attr_store(struct kobject *kobj,
142 struct attribute *attr,
143 const char *buf, size_t len)
144{
145 struct btrfs_root *root = container_of(kobj, struct btrfs_root,
146 root_kobj);
147 struct btrfs_root_attr *a = container_of(attr,
148 struct btrfs_root_attr,
149 attr);
150 return a->store ? a->store(root, buf, len) : 0;
151}
152
153static void btrfs_super_release(struct kobject *kobj)
154{
155 struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info,
156 super_kobj);
157 complete(&fs->kobj_unregister);
158}
159
160static void btrfs_root_release(struct kobject *kobj)
161{
162 struct btrfs_root *root = container_of(kobj, struct btrfs_root,
163 root_kobj);
164 complete(&root->kobj_unregister);
165}
166
167static const struct sysfs_ops btrfs_super_attr_ops = {
168 .show = btrfs_super_attr_show,
169 .store = btrfs_super_attr_store,
170};
171
172static const struct sysfs_ops btrfs_root_attr_ops = {
173 .show = btrfs_root_attr_show,
174 .store = btrfs_root_attr_store,
175};
176
177static struct kobj_type btrfs_root_ktype = {
178 .default_attrs = btrfs_root_attrs,
179 .sysfs_ops = &btrfs_root_attr_ops,
180 .release = btrfs_root_release,
181};
182
183static struct kobj_type btrfs_super_ktype = {
184 .default_attrs = btrfs_super_attrs,
185 .sysfs_ops = &btrfs_super_attr_ops,
186 .release = btrfs_super_release,
187};
188
189/* /sys/fs/btrfs/ entry */ 31/* /sys/fs/btrfs/ entry */
190static struct kset *btrfs_kset; 32static struct kset *btrfs_kset;
191 33
192int btrfs_sysfs_add_super(struct btrfs_fs_info *fs)
193{
194 int error;
195 char *name;
196 char c;
197 int len = strlen(fs->sb->s_id) + 1;
198 int i;
199
200 name = kmalloc(len, GFP_NOFS);
201 if (!name) {
202 error = -ENOMEM;
203 goto fail;
204 }
205
206 for (i = 0; i < len; i++) {
207 c = fs->sb->s_id[i];
208 if (c == '/' || c == '\\')
209 c = '!';
210 name[i] = c;
211 }
212 name[len] = '\0';
213
214 fs->super_kobj.kset = btrfs_kset;
215 error = kobject_init_and_add(&fs->super_kobj, &btrfs_super_ktype,
216 NULL, "%s", name);
217 kfree(name);
218 if (error)
219 goto fail;
220
221 return 0;
222
223fail:
224 printk(KERN_ERR "btrfs: sysfs creation for super failed\n");
225 return error;
226}
227
228int btrfs_sysfs_add_root(struct btrfs_root *root)
229{
230 int error;
231
232 error = kobject_init_and_add(&root->root_kobj, &btrfs_root_ktype,
233 &root->fs_info->super_kobj,
234 "%s", root->name);
235 if (error)
236 goto fail;
237
238 return 0;
239
240fail:
241 printk(KERN_ERR "btrfs: sysfs creation for root failed\n");
242 return error;
243}
244
245void btrfs_sysfs_del_root(struct btrfs_root *root)
246{
247 kobject_put(&root->root_kobj);
248 wait_for_completion(&root->kobj_unregister);
249}
250
251void btrfs_sysfs_del_super(struct btrfs_fs_info *fs)
252{
253 kobject_put(&fs->super_kobj);
254 wait_for_completion(&fs->kobj_unregister);
255}
256
257int btrfs_init_sysfs(void) 34int btrfs_init_sysfs(void)
258{ 35{
259 btrfs_kset = kset_create_and_add("btrfs", NULL, fs_kobj); 36 btrfs_kset = kset_create_and_add("btrfs", NULL, fs_kobj);
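The hunk above strips out the per-filesystem and per-subvolume sysfs attributes, leaving only the bare /sys/fs/btrfs kset. The deleted code was a textbook instance of the kobject attribute-dispatch pattern: a kobj_type supplies sysfs_ops whose show/store callbacks recover the embedding object with container_of() and forward to a typed handler. A minimal sketch of that pattern, using illustrative names (demo_obj, demo_attr) rather than btrfs symbols:

#include <linux/kobject.h>
#include <linux/sysfs.h>

/* Illustrative object embedding a kobject, as the removed code did
 * with btrfs_fs_info::super_kobj. */
struct demo_obj {
        struct kobject kobj;
        unsigned long value;
};

struct demo_attr {
        struct attribute attr;
        ssize_t (*show)(struct demo_obj *obj, char *buf);
};

static ssize_t demo_attr_show(struct kobject *kobj, struct attribute *attr,
                              char *buf)
{
        /* Recover the embedding object and the typed attribute. */
        struct demo_obj *obj = container_of(kobj, struct demo_obj, kobj);
        struct demo_attr *a = container_of(attr, struct demo_attr, attr);

        return a->show ? a->show(obj, buf) : 0;
}

static const struct sysfs_ops demo_sysfs_ops = {
        .show = demo_attr_show,
};

An object registered through kobject_init_and_add() against a kobj_type carrying these ops gets one sysfs file per attribute, exactly as the removed btrfs_super_ktype and btrfs_root_ktype did.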
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 66e4c66cc63b..51dcec86757f 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -27,15 +27,15 @@
27#include "transaction.h" 27#include "transaction.h"
28#include "locking.h" 28#include "locking.h"
29#include "tree-log.h" 29#include "tree-log.h"
30#include "inode-map.h"
30 31
31#define BTRFS_ROOT_TRANS_TAG 0 32#define BTRFS_ROOT_TRANS_TAG 0
32 33
33static noinline void put_transaction(struct btrfs_transaction *transaction) 34static noinline void put_transaction(struct btrfs_transaction *transaction)
34{ 35{
35 WARN_ON(transaction->use_count == 0); 36 WARN_ON(atomic_read(&transaction->use_count) == 0);
36 transaction->use_count--; 37 if (atomic_dec_and_test(&transaction->use_count)) {
37 if (transaction->use_count == 0) { 38 BUG_ON(!list_empty(&transaction->list));
38 list_del_init(&transaction->list);
39 memset(transaction, 0, sizeof(*transaction)); 39 memset(transaction, 0, sizeof(*transaction));
40 kmem_cache_free(btrfs_transaction_cachep, transaction); 40 kmem_cache_free(btrfs_transaction_cachep, transaction);
41 } 41 }
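put_transaction() is converted here from a plain integer count, which was only safe under the old trans_mutex, to an atomic_t. A minimal sketch of the resulting refcount idiom, on the assumption of a generic object (ref_obj is not a btrfs type):

#include <linux/atomic.h>
#include <linux/slab.h>

struct ref_obj {
        atomic_t use_count;     /* replaces an int guarded by a mutex */
};

static void ref_obj_put(struct ref_obj *obj)
{
        /*
         * Exactly one caller sees the count hit zero, so the free
         * runs once and the count itself needs no external lock.
         */
        if (atomic_dec_and_test(&obj->use_count))
                kfree(obj);
}

Callers take references with atomic_inc(&obj->use_count); the BUG_ON(!list_empty()) added in the hunk works because, as the later commit-path changes show, the transaction is now unlinked from trans_list before its final reference is dropped.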
@@ -50,46 +50,72 @@ static noinline void switch_commit_root(struct btrfs_root *root)
50/* 50/*
51 * either allocate a new transaction or hop into the existing one 51 * either allocate a new transaction or hop into the existing one
52 */ 52 */
53static noinline int join_transaction(struct btrfs_root *root) 53static noinline int join_transaction(struct btrfs_root *root, int nofail)
54{ 54{
55 struct btrfs_transaction *cur_trans; 55 struct btrfs_transaction *cur_trans;
56
57 spin_lock(&root->fs_info->trans_lock);
58 if (root->fs_info->trans_no_join) {
59 if (!nofail) {
60 spin_unlock(&root->fs_info->trans_lock);
61 return -EBUSY;
62 }
63 }
64
56 cur_trans = root->fs_info->running_transaction; 65 cur_trans = root->fs_info->running_transaction;
57 if (!cur_trans) { 66 if (cur_trans) {
58 cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, 67 atomic_inc(&cur_trans->use_count);
59 GFP_NOFS); 68 atomic_inc(&cur_trans->num_writers);
60 BUG_ON(!cur_trans);
61 root->fs_info->generation++;
62 cur_trans->num_writers = 1;
63 cur_trans->num_joined = 0;
64 cur_trans->transid = root->fs_info->generation;
65 init_waitqueue_head(&cur_trans->writer_wait);
66 init_waitqueue_head(&cur_trans->commit_wait);
67 cur_trans->in_commit = 0;
68 cur_trans->blocked = 0;
69 cur_trans->use_count = 1;
70 cur_trans->commit_done = 0;
71 cur_trans->start_time = get_seconds();
72
73 cur_trans->delayed_refs.root = RB_ROOT;
74 cur_trans->delayed_refs.num_entries = 0;
75 cur_trans->delayed_refs.num_heads_ready = 0;
76 cur_trans->delayed_refs.num_heads = 0;
77 cur_trans->delayed_refs.flushing = 0;
78 cur_trans->delayed_refs.run_delayed_start = 0;
79 spin_lock_init(&cur_trans->delayed_refs.lock);
80
81 INIT_LIST_HEAD(&cur_trans->pending_snapshots);
82 list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
83 extent_io_tree_init(&cur_trans->dirty_pages,
84 root->fs_info->btree_inode->i_mapping,
85 GFP_NOFS);
86 spin_lock(&root->fs_info->new_trans_lock);
87 root->fs_info->running_transaction = cur_trans;
88 spin_unlock(&root->fs_info->new_trans_lock);
89 } else {
90 cur_trans->num_writers++;
91 cur_trans->num_joined++; 69 cur_trans->num_joined++;
70 spin_unlock(&root->fs_info->trans_lock);
71 return 0;
92 } 72 }
73 spin_unlock(&root->fs_info->trans_lock);
74
75 cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS);
76 if (!cur_trans)
77 return -ENOMEM;
78 spin_lock(&root->fs_info->trans_lock);
79 if (root->fs_info->running_transaction) {
80 kmem_cache_free(btrfs_transaction_cachep, cur_trans);
81 cur_trans = root->fs_info->running_transaction;
82 atomic_inc(&cur_trans->use_count);
83 atomic_inc(&cur_trans->num_writers);
84 cur_trans->num_joined++;
85 spin_unlock(&root->fs_info->trans_lock);
86 return 0;
87 }
88 atomic_set(&cur_trans->num_writers, 1);
89 cur_trans->num_joined = 0;
90 init_waitqueue_head(&cur_trans->writer_wait);
91 init_waitqueue_head(&cur_trans->commit_wait);
92 cur_trans->in_commit = 0;
93 cur_trans->blocked = 0;
94 /*
95 * One for this trans handle, one so it will live on until we
96 * commit the transaction.
97 */
98 atomic_set(&cur_trans->use_count, 2);
99 cur_trans->commit_done = 0;
100 cur_trans->start_time = get_seconds();
101
102 cur_trans->delayed_refs.root = RB_ROOT;
103 cur_trans->delayed_refs.num_entries = 0;
104 cur_trans->delayed_refs.num_heads_ready = 0;
105 cur_trans->delayed_refs.num_heads = 0;
106 cur_trans->delayed_refs.flushing = 0;
107 cur_trans->delayed_refs.run_delayed_start = 0;
108 spin_lock_init(&cur_trans->commit_lock);
109 spin_lock_init(&cur_trans->delayed_refs.lock);
110
111 INIT_LIST_HEAD(&cur_trans->pending_snapshots);
112 list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
113 extent_io_tree_init(&cur_trans->dirty_pages,
114 root->fs_info->btree_inode->i_mapping);
115 root->fs_info->generation++;
116 cur_trans->transid = root->fs_info->generation;
117 root->fs_info->running_transaction = cur_trans;
118 spin_unlock(&root->fs_info->trans_lock);
93 119
94 return 0; 120 return 0;
95} 121}
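The rewritten join_transaction() replaces trans_mutex with trans_lock, a spinlock, which forces the GFP_NOFS allocation out from under the lock; the function therefore rechecks running_transaction after reacquiring the lock and discards its allocation if another thread won the race. A sketch of the general pattern with hypothetical session/host types:

#include <linux/atomic.h>
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct session {
        atomic_t refs;
};

struct host {
        spinlock_t lock;
        struct session *running;
};

static struct session *session_get_or_create(struct host *h)
{
        struct session *s;

        spin_lock(&h->lock);
        s = h->running;
        if (s) {
                atomic_inc(&s->refs);   /* fast path: join the live one */
                spin_unlock(&h->lock);
                return s;
        }
        spin_unlock(&h->lock);

        /* The allocation may sleep, so it cannot sit under the spinlock. */
        s = kzalloc(sizeof(*s), GFP_NOFS);
        if (!s)
                return ERR_PTR(-ENOMEM);

        spin_lock(&h->lock);
        if (h->running) {               /* lost the race: keep the winner's */
                kfree(s);
                s = h->running;
                atomic_inc(&s->refs);
        } else {
                atomic_set(&s->refs, 1);
                h->running = s;
        }
        spin_unlock(&h->lock);
        return s;
}

An occasionally wasted allocation is the usual price of this shape, and it is cheaper than holding a sleeping lock across the slow path.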
@@ -100,36 +126,82 @@ static noinline int join_transaction(struct btrfs_root *root)
100 * to make sure the old root from before we joined the transaction is deleted 126 * to make sure the old root from before we joined the transaction is deleted
101 * when the transaction commits 127 * when the transaction commits
102 */ 128 */
103static noinline int record_root_in_trans(struct btrfs_trans_handle *trans, 129static int record_root_in_trans(struct btrfs_trans_handle *trans,
104 struct btrfs_root *root) 130 struct btrfs_root *root)
105{ 131{
106 if (root->ref_cows && root->last_trans < trans->transid) { 132 if (root->ref_cows && root->last_trans < trans->transid) {
107 WARN_ON(root == root->fs_info->extent_root); 133 WARN_ON(root == root->fs_info->extent_root);
108 WARN_ON(root->commit_root != root->node); 134 WARN_ON(root->commit_root != root->node);
109 135
136 /*
137 * see below for in_trans_setup usage rules
138 * we have the reloc mutex held now, so there
139 * is only one writer in this function
140 */
141 root->in_trans_setup = 1;
142
143 /* make sure readers find in_trans_setup before
144 * they find our root->last_trans update
145 */
146 smp_wmb();
147
148 spin_lock(&root->fs_info->fs_roots_radix_lock);
149 if (root->last_trans == trans->transid) {
150 spin_unlock(&root->fs_info->fs_roots_radix_lock);
151 return 0;
152 }
110 radix_tree_tag_set(&root->fs_info->fs_roots_radix, 153 radix_tree_tag_set(&root->fs_info->fs_roots_radix,
111 (unsigned long)root->root_key.objectid, 154 (unsigned long)root->root_key.objectid,
112 BTRFS_ROOT_TRANS_TAG); 155 BTRFS_ROOT_TRANS_TAG);
156 spin_unlock(&root->fs_info->fs_roots_radix_lock);
113 root->last_trans = trans->transid; 157 root->last_trans = trans->transid;
158
159 /* this is pretty tricky. We don't want to
160 * take the relocation lock in btrfs_record_root_in_trans
161 * unless we're really doing the first setup for this root in
162 * this transaction.
163 *
164 * Normally we'd use root->last_trans as a flag to decide
165 * if we want to take the expensive mutex.
166 *
167 * But, we have to set root->last_trans before we
168 * init the relocation root, otherwise, we trip over warnings
169 * in ctree.c. The solution used here is to flag ourselves
170 * with root->in_trans_setup. When this is 1, we're still
171 * fixing up the reloc trees and everyone must wait.
172 *
173 * When this is zero, they can trust root->last_trans and fly
174 * through btrfs_record_root_in_trans without having to take the
175 * lock. smp_wmb() makes sure that all the writes above are
176 * done before we pop in the zero below
177 */
114 btrfs_init_reloc_root(trans, root); 178 btrfs_init_reloc_root(trans, root);
179 smp_wmb();
180 root->in_trans_setup = 0;
115 } 181 }
116 return 0; 182 return 0;
117} 183}
118 184
185
119int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, 186int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
120 struct btrfs_root *root) 187 struct btrfs_root *root)
121{ 188{
122 if (!root->ref_cows) 189 if (!root->ref_cows)
123 return 0; 190 return 0;
124 191
125 mutex_lock(&root->fs_info->trans_mutex); 192 /*
126 if (root->last_trans == trans->transid) { 193 * see record_root_in_trans for comments about in_trans_setup usage
127 mutex_unlock(&root->fs_info->trans_mutex); 194 * and barriers
195 */
196 smp_rmb();
197 if (root->last_trans == trans->transid &&
198 !root->in_trans_setup)
128 return 0; 199 return 0;
129 }
130 200
201 mutex_lock(&root->fs_info->reloc_mutex);
131 record_root_in_trans(trans, root); 202 record_root_in_trans(trans, root);
132 mutex_unlock(&root->fs_info->trans_mutex); 203 mutex_unlock(&root->fs_info->reloc_mutex);
204
133 return 0; 205 return 0;
134} 206}
135 207
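The in_trans_setup flag added above lets btrfs_record_root_in_trans() skip the reloc_mutex on the fast path: the writer clears the flag only after all setup stores, separated by smp_wmb(), and readers pair that with smp_rmb() before trusting last_trans. A reduced sketch of the publication idiom (pub_obj is illustrative, and the writer is assumed to be serialized by an outer mutex, as the comment in the hunk requires):

#include <linux/kernel.h>
#include <linux/types.h>

struct pub_obj {
        u64 last_trans;
        int in_setup;
};

/* Writer side: assumed serialized by an outer mutex. */
static void publish(struct pub_obj *obj, u64 transid)
{
        obj->in_setup = 1;
        smp_wmb();              /* readers must see in_setup = 1 ... */
        obj->last_trans = transid;      /* ... before this update */

        /* ... slow setup work (e.g. reloc root init) goes here ... */

        smp_wmb();              /* all setup stores are visible ... */
        obj->in_setup = 0;      /* ... before the flag clears */
}

/* Reader fast path: lock-free once the object is fully published. */
static int is_published(struct pub_obj *obj, u64 transid)
{
        smp_rmb();
        return obj->last_trans == transid && !obj->in_setup;
}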
@@ -141,21 +213,23 @@ static void wait_current_trans(struct btrfs_root *root)
141{ 213{
142 struct btrfs_transaction *cur_trans; 214 struct btrfs_transaction *cur_trans;
143 215
216 spin_lock(&root->fs_info->trans_lock);
144 cur_trans = root->fs_info->running_transaction; 217 cur_trans = root->fs_info->running_transaction;
145 if (cur_trans && cur_trans->blocked) { 218 if (cur_trans && cur_trans->blocked) {
146 DEFINE_WAIT(wait); 219 DEFINE_WAIT(wait);
147 cur_trans->use_count++; 220 atomic_inc(&cur_trans->use_count);
221 spin_unlock(&root->fs_info->trans_lock);
148 while (1) { 222 while (1) {
149 prepare_to_wait(&root->fs_info->transaction_wait, &wait, 223 prepare_to_wait(&root->fs_info->transaction_wait, &wait,
150 TASK_UNINTERRUPTIBLE); 224 TASK_UNINTERRUPTIBLE);
151 if (!cur_trans->blocked) 225 if (!cur_trans->blocked)
152 break; 226 break;
153 mutex_unlock(&root->fs_info->trans_mutex);
154 schedule(); 227 schedule();
155 mutex_lock(&root->fs_info->trans_mutex);
156 } 228 }
157 finish_wait(&root->fs_info->transaction_wait, &wait); 229 finish_wait(&root->fs_info->transaction_wait, &wait);
158 put_transaction(cur_trans); 230 put_transaction(cur_trans);
231 } else {
232 spin_unlock(&root->fs_info->trans_lock);
159 } 233 }
160} 234}
161 235
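wait_current_trans() now takes a reference under trans_lock, drops the lock, and sleeps with the bare prepare_to_wait()/schedule()/finish_wait() sequence instead of bouncing trans_mutex around schedule(). The essential shape of that open-coded wait loop, assuming a simple flag-bearing struct thing:

#include <linux/sched.h>
#include <linux/wait.h>

struct thing {
        int blocked;
};

/* Sleep until obj->blocked clears; no lock is held across schedule(). */
static void wait_unblocked(wait_queue_head_t *wq, struct thing *obj)
{
        DEFINE_WAIT(wait);

        while (1) {
                prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
                if (!obj->blocked)      /* re-test after queueing, so a */
                        break;          /* wakeup in between is not lost */
                schedule();
        }
        finish_wait(wq, &wait);
}

Re-testing the condition after prepare_to_wait() is what makes the loop race-free: a wakeup arriving between the test and schedule() finds the task already on the queue.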
@@ -163,14 +237,21 @@ enum btrfs_trans_type {
163 TRANS_START, 237 TRANS_START,
164 TRANS_JOIN, 238 TRANS_JOIN,
165 TRANS_USERSPACE, 239 TRANS_USERSPACE,
240 TRANS_JOIN_NOLOCK,
166}; 241};
167 242
168static int may_wait_transaction(struct btrfs_root *root, int type) 243static int may_wait_transaction(struct btrfs_root *root, int type)
169{ 244{
170 if (!root->fs_info->log_root_recovering && 245 if (root->fs_info->log_root_recovering)
171 ((type == TRANS_START && !root->fs_info->open_ioctl_trans) || 246 return 0;
172 type == TRANS_USERSPACE)) 247
248 if (type == TRANS_USERSPACE)
173 return 1; 249 return 1;
250
251 if (type == TRANS_START &&
252 !atomic_read(&root->fs_info->open_ioctl_trans))
253 return 1;
254
174 return 0; 255 return 0;
175} 256}
176 257
@@ -181,29 +262,47 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
181 struct btrfs_transaction *cur_trans; 262 struct btrfs_transaction *cur_trans;
182 int retries = 0; 263 int retries = 0;
183 int ret; 264 int ret;
265
266 if (root->fs_info->fs_state & BTRFS_SUPER_FLAG_ERROR)
267 return ERR_PTR(-EROFS);
268
269 if (current->journal_info) {
270 WARN_ON(type != TRANS_JOIN && type != TRANS_JOIN_NOLOCK);
271 h = current->journal_info;
272 h->use_count++;
273 h->orig_rsv = h->block_rsv;
274 h->block_rsv = NULL;
275 goto got_it;
276 }
184again: 277again:
185 h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); 278 h = kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS);
186 if (!h) 279 if (!h)
187 return ERR_PTR(-ENOMEM); 280 return ERR_PTR(-ENOMEM);
188 281
189 mutex_lock(&root->fs_info->trans_mutex);
190 if (may_wait_transaction(root, type)) 282 if (may_wait_transaction(root, type))
191 wait_current_trans(root); 283 wait_current_trans(root);
192 284
193 ret = join_transaction(root); 285 do {
194 BUG_ON(ret); 286 ret = join_transaction(root, type == TRANS_JOIN_NOLOCK);
287 if (ret == -EBUSY)
288 wait_current_trans(root);
289 } while (ret == -EBUSY);
290
291 if (ret < 0) {
292 kmem_cache_free(btrfs_trans_handle_cachep, h);
293 return ERR_PTR(ret);
294 }
195 295
196 cur_trans = root->fs_info->running_transaction; 296 cur_trans = root->fs_info->running_transaction;
197 cur_trans->use_count++;
198 mutex_unlock(&root->fs_info->trans_mutex);
199 297
200 h->transid = cur_trans->transid; 298 h->transid = cur_trans->transid;
201 h->transaction = cur_trans; 299 h->transaction = cur_trans;
202 h->blocks_used = 0; 300 h->blocks_used = 0;
203 h->block_group = 0;
204 h->bytes_reserved = 0; 301 h->bytes_reserved = 0;
205 h->delayed_ref_updates = 0; 302 h->delayed_ref_updates = 0;
303 h->use_count = 1;
206 h->block_rsv = NULL; 304 h->block_rsv = NULL;
305 h->orig_rsv = NULL;
207 306
208 smp_mb(); 307 smp_mb();
209 if (cur_trans->blocked && may_wait_transaction(root, type)) { 308 if (cur_trans->blocked && may_wait_transaction(root, type)) {
@@ -212,21 +311,27 @@ again:
212 } 311 }
213 312
214 if (num_items > 0) { 313 if (num_items > 0) {
215 ret = btrfs_trans_reserve_metadata(h, root, num_items, 314 ret = btrfs_trans_reserve_metadata(h, root, num_items);
216 &retries); 315 if (ret == -EAGAIN && !retries) {
217 if (ret == -EAGAIN) { 316 retries++;
218 btrfs_commit_transaction(h, root); 317 btrfs_commit_transaction(h, root);
219 goto again; 318 goto again;
319 } else if (ret == -EAGAIN) {
320 /*
321 * We have already retried and got EAGAIN, so really we
322 * don't have space, so set ret to -ENOSPC.
323 */
324 ret = -ENOSPC;
220 } 325 }
326
221 if (ret < 0) { 327 if (ret < 0) {
222 btrfs_end_transaction(h, root); 328 btrfs_end_transaction(h, root);
223 return ERR_PTR(ret); 329 return ERR_PTR(ret);
224 } 330 }
225 } 331 }
226 332
227 mutex_lock(&root->fs_info->trans_mutex); 333got_it:
228 record_root_in_trans(h, root); 334 btrfs_record_root_in_trans(h, root);
229 mutex_unlock(&root->fs_info->trans_mutex);
230 335
231 if (!current->journal_info && type != TRANS_USERSPACE) 336 if (!current->journal_info && type != TRANS_USERSPACE)
232 current->journal_info = h; 337 current->journal_info = h;
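start_transaction() now recognizes re-entry: if current->journal_info already carries a handle, it bumps the handle's use_count instead of joining again, and __btrfs_end_transaction() later returns early until the count drains. A sketch of that nesting scheme (start_nested/end_nested are illustrative, not btrfs functions; the count needs no atomics because only the owning task ever touches its own journal_info):

#include <linux/err.h>
#include <linux/sched.h>
#include <linux/slab.h>

struct handle {
        unsigned long use_count;        /* touched only by the owner task */
};

static struct handle *start_nested(void)
{
        struct handle *h = current->journal_info;

        if (h) {                        /* already in a transaction: */
                h->use_count++;         /* just record one more level */
                return h;
        }

        h = kzalloc(sizeof(*h), GFP_NOFS);
        if (!h)
                return ERR_PTR(-ENOMEM);
        h->use_count = 1;
        current->journal_info = h;      /* mark this task as inside */
        return h;
}

static void end_nested(struct handle *h)
{
        if (--h->use_count)             /* inner end: outer frame remains */
                return;
        current->journal_info = NULL;
        kfree(h);
}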
@@ -238,16 +343,19 @@ struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
238{ 343{
239 return start_transaction(root, num_items, TRANS_START); 344 return start_transaction(root, num_items, TRANS_START);
240} 345}
241struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root, 346struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root)
242 int num_blocks)
243{ 347{
244 return start_transaction(root, 0, TRANS_JOIN); 348 return start_transaction(root, 0, TRANS_JOIN);
245} 349}
246 350
247struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r, 351struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root)
248 int num_blocks) 352{
353 return start_transaction(root, 0, TRANS_JOIN_NOLOCK);
354}
355
356struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root)
249{ 357{
250 return start_transaction(r, 0, TRANS_USERSPACE); 358 return start_transaction(root, 0, TRANS_USERSPACE);
251} 359}
252 360
253/* wait for a transaction commit to be fully complete */ 361/* wait for a transaction commit to be fully complete */
@@ -255,70 +363,72 @@ static noinline int wait_for_commit(struct btrfs_root *root,
255 struct btrfs_transaction *commit) 363 struct btrfs_transaction *commit)
256{ 364{
257 DEFINE_WAIT(wait); 365 DEFINE_WAIT(wait);
258 mutex_lock(&root->fs_info->trans_mutex);
259 while (!commit->commit_done) { 366 while (!commit->commit_done) {
260 prepare_to_wait(&commit->commit_wait, &wait, 367 prepare_to_wait(&commit->commit_wait, &wait,
261 TASK_UNINTERRUPTIBLE); 368 TASK_UNINTERRUPTIBLE);
262 if (commit->commit_done) 369 if (commit->commit_done)
263 break; 370 break;
264 mutex_unlock(&root->fs_info->trans_mutex);
265 schedule(); 371 schedule();
266 mutex_lock(&root->fs_info->trans_mutex);
267 } 372 }
268 mutex_unlock(&root->fs_info->trans_mutex);
269 finish_wait(&commit->commit_wait, &wait); 373 finish_wait(&commit->commit_wait, &wait);
270 return 0; 374 return 0;
271} 375}
272 376
273#if 0 377int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid)
274/*
275 * rate limit against the drop_snapshot code. This helps to slow down new
276 * operations if the drop_snapshot code isn't able to keep up.
277 */
278static void throttle_on_drops(struct btrfs_root *root)
279{ 378{
280 struct btrfs_fs_info *info = root->fs_info; 379 struct btrfs_transaction *cur_trans = NULL, *t;
281 int harder_count = 0; 380 int ret;
282 381
283harder: 382 ret = 0;
284 if (atomic_read(&info->throttles)) { 383 if (transid) {
285 DEFINE_WAIT(wait); 384 if (transid <= root->fs_info->last_trans_committed)
286 int thr; 385 goto out;
287 thr = atomic_read(&info->throttle_gen); 386
288 387 /* find specified transaction */
289 do { 388 spin_lock(&root->fs_info->trans_lock);
290 prepare_to_wait(&info->transaction_throttle, 389 list_for_each_entry(t, &root->fs_info->trans_list, list) {
291 &wait, TASK_UNINTERRUPTIBLE); 390 if (t->transid == transid) {
292 if (!atomic_read(&info->throttles)) { 391 cur_trans = t;
293 finish_wait(&info->transaction_throttle, &wait); 392 atomic_inc(&cur_trans->use_count);
294 break; 393 break;
295 } 394 }
296 schedule(); 395 if (t->transid > transid)
297 finish_wait(&info->transaction_throttle, &wait); 396 break;
298 } while (thr == atomic_read(&info->throttle_gen)); 397 }
299 harder_count++; 398 spin_unlock(&root->fs_info->trans_lock);
300 399 ret = -EINVAL;
301 if (root->fs_info->total_ref_cache_size > 1 * 1024 * 1024 && 400 if (!cur_trans)
302 harder_count < 2) 401 goto out; /* bad transid */
303 goto harder; 402 } else {
403 /* find newest transaction that is committing | committed */
404 spin_lock(&root->fs_info->trans_lock);
405 list_for_each_entry_reverse(t, &root->fs_info->trans_list,
406 list) {
407 if (t->in_commit) {
408 if (t->commit_done)
409 break;
410 cur_trans = t;
411 atomic_inc(&cur_trans->use_count);
412 break;
413 }
414 }
415 spin_unlock(&root->fs_info->trans_lock);
416 if (!cur_trans)
417 goto out; /* nothing committing|committed */
418 }
304 419
305 if (root->fs_info->total_ref_cache_size > 5 * 1024 * 1024 && 420 wait_for_commit(root, cur_trans);
306 harder_count < 10)
307 goto harder;
308 421
309 if (root->fs_info->total_ref_cache_size > 10 * 1024 * 1024 && 422 put_transaction(cur_trans);
310 harder_count < 20) 423 ret = 0;
311 goto harder; 424out:
312 } 425 return ret;
313} 426}
314#endif
315 427
316void btrfs_throttle(struct btrfs_root *root) 428void btrfs_throttle(struct btrfs_root *root)
317{ 429{
318 mutex_lock(&root->fs_info->trans_mutex); 430 if (!atomic_read(&root->fs_info->open_ioctl_trans))
319 if (!root->fs_info->open_ioctl_trans)
320 wait_current_trans(root); 431 wait_current_trans(root);
321 mutex_unlock(&root->fs_info->trans_mutex);
322} 432}
323 433
324static int should_end_transaction(struct btrfs_trans_handle *trans, 434static int should_end_transaction(struct btrfs_trans_handle *trans,
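The new btrfs_wait_for_commit() above walks fs_info->trans_list under trans_lock, pins the matching transaction with atomic_inc() before releasing the lock, and only then blocks in wait_for_commit(). The lookup half of that "pin under lock, wait outside it" pattern, reduced to hypothetical txn/fs types:

#include <linux/atomic.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/types.h>

struct txn {
        u64 id;
        atomic_t refs;
        struct list_head list;
};

struct fs {
        spinlock_t lock;
        struct list_head txns;  /* sorted by ascending id */
};

static struct txn *find_and_pin_txn(struct fs *fs, u64 id)
{
        struct txn *t, *found = NULL;

        spin_lock(&fs->lock);
        list_for_each_entry(t, &fs->txns, list) {
                if (t->id == id) {
                        atomic_inc(&t->refs);   /* pin before unlocking */
                        found = t;
                        break;
                }
                if (t->id > id)         /* sorted list: no match exists */
                        break;
        }
        spin_unlock(&fs->lock);
        return found;           /* caller waits, then drops the reference */
}

Taking the reference before unlocking is the load-bearing step: without it the transaction could be freed the moment the lock drops.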
@@ -336,6 +446,7 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
336 struct btrfs_transaction *cur_trans = trans->transaction; 446 struct btrfs_transaction *cur_trans = trans->transaction;
337 int updates; 447 int updates;
338 448
449 smp_mb();
339 if (cur_trans->blocked || cur_trans->delayed_refs.flushing) 450 if (cur_trans->blocked || cur_trans->delayed_refs.flushing)
340 return 1; 451 return 1;
341 452
@@ -348,12 +459,17 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
348} 459}
349 460
350static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, 461static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
351 struct btrfs_root *root, int throttle) 462 struct btrfs_root *root, int throttle, int lock)
352{ 463{
353 struct btrfs_transaction *cur_trans = trans->transaction; 464 struct btrfs_transaction *cur_trans = trans->transaction;
354 struct btrfs_fs_info *info = root->fs_info; 465 struct btrfs_fs_info *info = root->fs_info;
355 int count = 0; 466 int count = 0;
356 467
468 if (--trans->use_count) {
469 trans->block_rsv = trans->orig_rsv;
470 return 0;
471 }
472
357 while (count < 4) { 473 while (count < 4) {
358 unsigned long cur = trans->delayed_ref_updates; 474 unsigned long cur = trans->delayed_ref_updates;
359 trans->delayed_ref_updates = 0; 475 trans->delayed_ref_updates = 0;
@@ -376,26 +492,27 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
376 492
377 btrfs_trans_release_metadata(trans, root); 493 btrfs_trans_release_metadata(trans, root);
378 494
379 if (!root->fs_info->open_ioctl_trans && 495 if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&
380 should_end_transaction(trans, root)) 496 should_end_transaction(trans, root)) {
381 trans->transaction->blocked = 1; 497 trans->transaction->blocked = 1;
498 smp_wmb();
499 }
382 500
383 if (cur_trans->blocked && !cur_trans->in_commit) { 501 if (lock && cur_trans->blocked && !cur_trans->in_commit) {
384 if (throttle) 502 if (throttle)
385 return btrfs_commit_transaction(trans, root); 503 return btrfs_commit_transaction(trans, root);
386 else 504 else
387 wake_up_process(info->transaction_kthread); 505 wake_up_process(info->transaction_kthread);
388 } 506 }
389 507
390 mutex_lock(&info->trans_mutex);
391 WARN_ON(cur_trans != info->running_transaction); 508 WARN_ON(cur_trans != info->running_transaction);
392 WARN_ON(cur_trans->num_writers < 1); 509 WARN_ON(atomic_read(&cur_trans->num_writers) < 1);
393 cur_trans->num_writers--; 510 atomic_dec(&cur_trans->num_writers);
394 511
512 smp_mb();
395 if (waitqueue_active(&cur_trans->writer_wait)) 513 if (waitqueue_active(&cur_trans->writer_wait))
396 wake_up(&cur_trans->writer_wait); 514 wake_up(&cur_trans->writer_wait);
397 put_transaction(cur_trans); 515 put_transaction(cur_trans);
398 mutex_unlock(&info->trans_mutex);
399 516
400 if (current->journal_info == trans) 517 if (current->journal_info == trans)
401 current->journal_info = NULL; 518 current->journal_info = NULL;
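__btrfs_end_transaction() now decrements num_writers atomically and inserts an explicit smp_mb() before the waitqueue_active() check. A minimal sketch of that waker-side idiom (writer_count is an illustrative type):

#include <linux/atomic.h>
#include <linux/wait.h>

struct writer_count {
        atomic_t writers;
        wait_queue_head_t wq;
};

static void writer_done(struct writer_count *c)
{
        atomic_dec(&c->writers);
        /*
         * Order the decrement before the waitqueue_active() test;
         * otherwise a task queueing itself concurrently could be
         * missed and sleep forever.
         */
        smp_mb();
        if (waitqueue_active(&c->wq))
                wake_up(&c->wq);
}

The sleeper's prepare_to_wait() supplies the matching barrier, so the pair guarantees that either the waker sees the queued task or the sleeper sees the updated count.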
@@ -411,13 +528,40 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
411int btrfs_end_transaction(struct btrfs_trans_handle *trans, 528int btrfs_end_transaction(struct btrfs_trans_handle *trans,
412 struct btrfs_root *root) 529 struct btrfs_root *root)
413{ 530{
414 return __btrfs_end_transaction(trans, root, 0); 531 int ret;
532
533 ret = __btrfs_end_transaction(trans, root, 0, 1);
534 if (ret)
535 return ret;
536 return 0;
415} 537}
416 538
417int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, 539int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
418 struct btrfs_root *root) 540 struct btrfs_root *root)
419{ 541{
420 return __btrfs_end_transaction(trans, root, 1); 542 int ret;
543
544 ret = __btrfs_end_transaction(trans, root, 1, 1);
545 if (ret)
546 return ret;
547 return 0;
548}
549
550int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans,
551 struct btrfs_root *root)
552{
553 int ret;
554
555 ret = __btrfs_end_transaction(trans, root, 0, 0);
556 if (ret)
557 return ret;
558 return 0;
559}
560
561int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans,
562 struct btrfs_root *root)
563{
564 return __btrfs_end_transaction(trans, root, 1, 1);
421} 565}
422 566
423/* 567/*
@@ -643,9 +787,9 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans,
643 */ 787 */
644int btrfs_add_dead_root(struct btrfs_root *root) 788int btrfs_add_dead_root(struct btrfs_root *root)
645{ 789{
646 mutex_lock(&root->fs_info->trans_mutex); 790 spin_lock(&root->fs_info->trans_lock);
647 list_add(&root->root_list, &root->fs_info->dead_roots); 791 list_add(&root->root_list, &root->fs_info->dead_roots);
648 mutex_unlock(&root->fs_info->trans_mutex); 792 spin_unlock(&root->fs_info->trans_lock);
649 return 0; 793 return 0;
650} 794}
651 795
@@ -661,6 +805,7 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
661 int ret; 805 int ret;
662 int err = 0; 806 int err = 0;
663 807
808 spin_lock(&fs_info->fs_roots_radix_lock);
664 while (1) { 809 while (1) {
665 ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix, 810 ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix,
666 (void **)gang, 0, 811 (void **)gang, 0,
@@ -673,13 +818,20 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
673 radix_tree_tag_clear(&fs_info->fs_roots_radix, 818 radix_tree_tag_clear(&fs_info->fs_roots_radix,
674 (unsigned long)root->root_key.objectid, 819 (unsigned long)root->root_key.objectid,
675 BTRFS_ROOT_TRANS_TAG); 820 BTRFS_ROOT_TRANS_TAG);
821 spin_unlock(&fs_info->fs_roots_radix_lock);
676 822
677 btrfs_free_log(trans, root); 823 btrfs_free_log(trans, root);
678 btrfs_update_reloc_root(trans, root); 824 btrfs_update_reloc_root(trans, root);
679 btrfs_orphan_commit_root(trans, root); 825 btrfs_orphan_commit_root(trans, root);
680 826
827 btrfs_save_ino_cache(root, trans);
828
681 if (root->commit_root != root->node) { 829 if (root->commit_root != root->node) {
830 mutex_lock(&root->fs_commit_mutex);
682 switch_commit_root(root); 831 switch_commit_root(root);
832 btrfs_unpin_free_ino(root);
833 mutex_unlock(&root->fs_commit_mutex);
834
683 btrfs_set_root_node(&root->root_item, 835 btrfs_set_root_node(&root->root_item,
684 root->node); 836 root->node);
685 } 837 }
@@ -687,10 +839,12 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans,
687 err = btrfs_update_root(trans, fs_info->tree_root, 839 err = btrfs_update_root(trans, fs_info->tree_root,
688 &root->root_key, 840 &root->root_key,
689 &root->root_item); 841 &root->root_item);
842 spin_lock(&fs_info->fs_roots_radix_lock);
690 if (err) 843 if (err)
691 break; 844 break;
692 } 845 }
693 } 846 }
847 spin_unlock(&fs_info->fs_roots_radix_lock);
694 return err; 848 return err;
695} 849}
696 850
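commit_fs_roots() now holds fs_roots_radix_lock only around the radix-tree operations, dropping it for the per-root work and restarting the tagged gang lookup from index zero on each pass. A sketch of that iteration pattern over a tagged radix tree, with a hypothetical item type and DIRTY_TAG standing in for BTRFS_ROOT_TRANS_TAG:

#include <linux/kernel.h>
#include <linux/radix-tree.h>
#include <linux/spinlock.h>

#define DIRTY_TAG 0     /* illustrative tag number */

struct item {
        unsigned long index;    /* key the item is inserted under */
};

/* Stand-in for per-item work that may sleep, forcing the lock drop. */
static void process_one(struct item *it)
{
}

static void process_tagged(struct radix_tree_root *tree, spinlock_t *lock)
{
        struct item *gang[8];
        unsigned int ret, i;

        spin_lock(lock);
        while (1) {
                ret = radix_tree_gang_lookup_tag(tree, (void **)gang, 0,
                                                 ARRAY_SIZE(gang), DIRTY_TAG);
                if (!ret)
                        break;
                for (i = 0; i < ret; i++) {
                        /* Clear the tag first so the item is not found
                         * again when the lookup restarts from index 0. */
                        radix_tree_tag_clear(tree, gang[i]->index, DIRTY_TAG);
                        spin_unlock(lock);
                        process_one(gang[i]);
                        spin_lock(lock);
                }
        }
        spin_unlock(lock);
}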
@@ -720,104 +874,13 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
720 btrfs_btree_balance_dirty(info->tree_root, nr); 874 btrfs_btree_balance_dirty(info->tree_root, nr);
721 cond_resched(); 875 cond_resched();
722 876
723 if (root->fs_info->closing || ret != -EAGAIN) 877 if (btrfs_fs_closing(root->fs_info) || ret != -EAGAIN)
724 break; 878 break;
725 } 879 }
726 root->defrag_running = 0; 880 root->defrag_running = 0;
727 return ret; 881 return ret;
728} 882}
729 883
730#if 0
731/*
732 * when dropping snapshots, we generate a ton of delayed refs, and it makes
733 * sense not to join the transaction while it is trying to flush the current
734 * queue of delayed refs out.
735 *
736 * This is used by the drop snapshot code only
737 */
738static noinline int wait_transaction_pre_flush(struct btrfs_fs_info *info)
739{
740 DEFINE_WAIT(wait);
741
742 mutex_lock(&info->trans_mutex);
743 while (info->running_transaction &&
744 info->running_transaction->delayed_refs.flushing) {
745 prepare_to_wait(&info->transaction_wait, &wait,
746 TASK_UNINTERRUPTIBLE);
747 mutex_unlock(&info->trans_mutex);
748
749 schedule();
750
751 mutex_lock(&info->trans_mutex);
752 finish_wait(&info->transaction_wait, &wait);
753 }
754 mutex_unlock(&info->trans_mutex);
755 return 0;
756}
757
758/*
759 * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on
760 * all of them
761 */
762int btrfs_drop_dead_root(struct btrfs_root *root)
763{
764 struct btrfs_trans_handle *trans;
765 struct btrfs_root *tree_root = root->fs_info->tree_root;
766 unsigned long nr;
767 int ret;
768
769 while (1) {
770 /*
771 * we don't want to jump in and create a bunch of
772 * delayed refs if the transaction is starting to close
773 */
774 wait_transaction_pre_flush(tree_root->fs_info);
775 trans = btrfs_start_transaction(tree_root, 1);
776
777 /*
778 * we've joined a transaction, make sure it isn't
779 * closing right now
780 */
781 if (trans->transaction->delayed_refs.flushing) {
782 btrfs_end_transaction(trans, tree_root);
783 continue;
784 }
785
786 ret = btrfs_drop_snapshot(trans, root);
787 if (ret != -EAGAIN)
788 break;
789
790 ret = btrfs_update_root(trans, tree_root,
791 &root->root_key,
792 &root->root_item);
793 if (ret)
794 break;
795
796 nr = trans->blocks_used;
797 ret = btrfs_end_transaction(trans, tree_root);
798 BUG_ON(ret);
799
800 btrfs_btree_balance_dirty(tree_root, nr);
801 cond_resched();
802 }
803 BUG_ON(ret);
804
805 ret = btrfs_del_root(trans, tree_root, &root->root_key);
806 BUG_ON(ret);
807
808 nr = trans->blocks_used;
809 ret = btrfs_end_transaction(trans, tree_root);
810 BUG_ON(ret);
811
812 free_extent_buffer(root->node);
813 free_extent_buffer(root->commit_root);
814 kfree(root);
815
816 btrfs_btree_balance_dirty(tree_root, nr);
817 return ret;
818}
819#endif
820
821/* 884/*
822 * new snapshots need to be created at a very specific time in the 885 * new snapshots need to be created at a very specific time in the
823 * transaction commit. This does the actual creation 886 * transaction commit. This does the actual creation
@@ -832,14 +895,15 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
832 struct btrfs_root *root = pending->root; 895 struct btrfs_root *root = pending->root;
833 struct btrfs_root *parent_root; 896 struct btrfs_root *parent_root;
834 struct inode *parent_inode; 897 struct inode *parent_inode;
898 struct dentry *parent;
835 struct dentry *dentry; 899 struct dentry *dentry;
836 struct extent_buffer *tmp; 900 struct extent_buffer *tmp;
837 struct extent_buffer *old; 901 struct extent_buffer *old;
838 int ret; 902 int ret;
839 int retries = 0;
840 u64 to_reserve = 0; 903 u64 to_reserve = 0;
841 u64 index = 0; 904 u64 index = 0;
842 u64 objectid; 905 u64 objectid;
906 u64 root_flags;
843 907
844 new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS); 908 new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
845 if (!new_root_item) { 909 if (!new_root_item) {
@@ -847,7 +911,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
847 goto fail; 911 goto fail;
848 } 912 }
849 913
850 ret = btrfs_find_free_objectid(trans, tree_root, 0, &objectid); 914 ret = btrfs_find_free_objectid(tree_root, &objectid);
851 if (ret) { 915 if (ret) {
852 pending->error = ret; 916 pending->error = ret;
853 goto fail; 917 goto fail;
@@ -858,7 +922,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
858 922
859 if (to_reserve > 0) { 923 if (to_reserve > 0) {
860 ret = btrfs_block_rsv_add(trans, root, &pending->block_rsv, 924 ret = btrfs_block_rsv_add(trans, root, &pending->block_rsv,
861 to_reserve, &retries); 925 to_reserve);
862 if (ret) { 926 if (ret) {
863 pending->error = ret; 927 pending->error = ret;
864 goto fail; 928 goto fail;
@@ -872,7 +936,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
872 trans->block_rsv = &pending->block_rsv; 936 trans->block_rsv = &pending->block_rsv;
873 937
874 dentry = pending->dentry; 938 dentry = pending->dentry;
875 parent_inode = dentry->d_parent->d_inode; 939 parent = dget_parent(dentry);
940 parent_inode = parent->d_inode;
876 parent_root = BTRFS_I(parent_inode)->root; 941 parent_root = BTRFS_I(parent_inode)->root;
877 record_root_in_trans(trans, parent_root); 942 record_root_in_trans(trans, parent_root);
878 943
@@ -883,7 +948,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
883 BUG_ON(ret); 948 BUG_ON(ret);
884 ret = btrfs_insert_dir_item(trans, parent_root, 949 ret = btrfs_insert_dir_item(trans, parent_root,
885 dentry->d_name.name, dentry->d_name.len, 950 dentry->d_name.name, dentry->d_name.len,
886 parent_inode->i_ino, &key, 951 parent_inode, &key,
887 BTRFS_FT_DIR, index); 952 BTRFS_FT_DIR, index);
888 BUG_ON(ret); 953 BUG_ON(ret);
889 954
@@ -892,9 +957,26 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
892 ret = btrfs_update_inode(trans, parent_root, parent_inode); 957 ret = btrfs_update_inode(trans, parent_root, parent_inode);
893 BUG_ON(ret); 958 BUG_ON(ret);
894 959
960 /*
961 * pull in the delayed directory update
962 * and the delayed inode item
963 * otherwise we corrupt the FS during
964 * snapshot
965 */
966 ret = btrfs_run_delayed_items(trans, root);
967 BUG_ON(ret);
968
895 record_root_in_trans(trans, root); 969 record_root_in_trans(trans, root);
896 btrfs_set_root_last_snapshot(&root->root_item, trans->transid); 970 btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
897 memcpy(new_root_item, &root->root_item, sizeof(*new_root_item)); 971 memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
972 btrfs_check_and_init_root_item(new_root_item);
973
974 root_flags = btrfs_root_flags(new_root_item);
975 if (pending->readonly)
976 root_flags |= BTRFS_ROOT_SUBVOL_RDONLY;
977 else
978 root_flags &= ~BTRFS_ROOT_SUBVOL_RDONLY;
979 btrfs_set_root_flags(new_root_item, root_flags);
898 980
899 old = btrfs_lock_root_node(root); 981 old = btrfs_lock_root_node(root);
900 btrfs_cow_block(trans, root, old, NULL, 0, &old); 982 btrfs_cow_block(trans, root, old, NULL, 0, &old);
@@ -917,9 +999,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
917 */ 999 */
918 ret = btrfs_add_root_ref(trans, tree_root, objectid, 1000 ret = btrfs_add_root_ref(trans, tree_root, objectid,
919 parent_root->root_key.objectid, 1001 parent_root->root_key.objectid,
920 parent_inode->i_ino, index, 1002 btrfs_ino(parent_inode), index,
921 dentry->d_name.name, dentry->d_name.len); 1003 dentry->d_name.name, dentry->d_name.len);
922 BUG_ON(ret); 1004 BUG_ON(ret);
1005 dput(parent);
923 1006
924 key.offset = (u64)-1; 1007 key.offset = (u64)-1;
925 pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key); 1008 pending->snap = btrfs_read_fs_root_no_name(root->fs_info, &key);
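create_pending_snapshot() switches from dereferencing dentry->d_parent, which a concurrent rename can change underneath it, to dget_parent(), which returns a referenced parent that stays stable until the matching dput() after the root ref is inserted. The helper pattern in isolation (pin_parent_inode is an illustrative wrapper, not a kernel API):

#include <linux/dcache.h>
#include <linux/fs.h>

/*
 * Pin the parent: the dentry returned by dget_parent() is referenced
 * and remains valid until the caller's dput(), unlike a raw read of
 * dentry->d_parent.
 */
static struct inode *pin_parent_inode(struct dentry *dentry,
                                      struct dentry **parent_ret)
{
        struct dentry *parent = dget_parent(dentry);

        *parent_ret = parent;
        return parent->d_inode;
}

A caller pairs it as parent_inode = pin_parent_inode(dentry, &parent); ... dput(parent); once the inode is no longer needed.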
@@ -966,33 +1049,152 @@ static void update_super_roots(struct btrfs_root *root)
966 super->root = root_item->bytenr; 1049 super->root = root_item->bytenr;
967 super->generation = root_item->generation; 1050 super->generation = root_item->generation;
968 super->root_level = root_item->level; 1051 super->root_level = root_item->level;
1052 if (super->cache_generation != 0 || btrfs_test_opt(root, SPACE_CACHE))
1053 super->cache_generation = root_item->generation;
969} 1054}
970 1055
971int btrfs_transaction_in_commit(struct btrfs_fs_info *info) 1056int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
972{ 1057{
973 int ret = 0; 1058 int ret = 0;
974 spin_lock(&info->new_trans_lock); 1059 spin_lock(&info->trans_lock);
975 if (info->running_transaction) 1060 if (info->running_transaction)
976 ret = info->running_transaction->in_commit; 1061 ret = info->running_transaction->in_commit;
977 spin_unlock(&info->new_trans_lock); 1062 spin_unlock(&info->trans_lock);
978 return ret; 1063 return ret;
979} 1064}
980 1065
981int btrfs_transaction_blocked(struct btrfs_fs_info *info) 1066int btrfs_transaction_blocked(struct btrfs_fs_info *info)
982{ 1067{
983 int ret = 0; 1068 int ret = 0;
984 spin_lock(&info->new_trans_lock); 1069 spin_lock(&info->trans_lock);
985 if (info->running_transaction) 1070 if (info->running_transaction)
986 ret = info->running_transaction->blocked; 1071 ret = info->running_transaction->blocked;
987 spin_unlock(&info->new_trans_lock); 1072 spin_unlock(&info->trans_lock);
988 return ret; 1073 return ret;
989} 1074}
990 1075
1076/*
1077 * wait for the current transaction commit to start and block subsequent
1078 * transaction joins
1079 */
1080static void wait_current_trans_commit_start(struct btrfs_root *root,
1081 struct btrfs_transaction *trans)
1082{
1083 DEFINE_WAIT(wait);
1084
1085 if (trans->in_commit)
1086 return;
1087
1088 while (1) {
1089 prepare_to_wait(&root->fs_info->transaction_blocked_wait, &wait,
1090 TASK_UNINTERRUPTIBLE);
1091 if (trans->in_commit) {
1092 finish_wait(&root->fs_info->transaction_blocked_wait,
1093 &wait);
1094 break;
1095 }
1096 schedule();
1097 finish_wait(&root->fs_info->transaction_blocked_wait, &wait);
1098 }
1099}
1100
1101/*
1102 * wait for the current transaction to start and then become unblocked.
1103 * caller holds ref.
1104 */
1105static void wait_current_trans_commit_start_and_unblock(struct btrfs_root *root,
1106 struct btrfs_transaction *trans)
1107{
1108 DEFINE_WAIT(wait);
1109
1110 if (trans->commit_done || (trans->in_commit && !trans->blocked))
1111 return;
1112
1113 while (1) {
1114 prepare_to_wait(&root->fs_info->transaction_wait, &wait,
1115 TASK_UNINTERRUPTIBLE);
1116 if (trans->commit_done ||
1117 (trans->in_commit && !trans->blocked)) {
1118 finish_wait(&root->fs_info->transaction_wait,
1119 &wait);
1120 break;
1121 }
1122 schedule();
1123 finish_wait(&root->fs_info->transaction_wait,
1124 &wait);
1125 }
1126}
1127
1128/*
1129 * commit transactions asynchronously. once btrfs_commit_transaction_async
1130 * returns, any subsequent transaction will not be allowed to join.
1131 */
1132struct btrfs_async_commit {
1133 struct btrfs_trans_handle *newtrans;
1134 struct btrfs_root *root;
1135 struct delayed_work work;
1136};
1137
1138static void do_async_commit(struct work_struct *work)
1139{
1140 struct btrfs_async_commit *ac =
1141 container_of(work, struct btrfs_async_commit, work.work);
1142
1143 btrfs_commit_transaction(ac->newtrans, ac->root);
1144 kfree(ac);
1145}
1146
1147int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
1148 struct btrfs_root *root,
1149 int wait_for_unblock)
1150{
1151 struct btrfs_async_commit *ac;
1152 struct btrfs_transaction *cur_trans;
1153
1154 ac = kmalloc(sizeof(*ac), GFP_NOFS);
1155 if (!ac)
1156 return -ENOMEM;
1157
1158 INIT_DELAYED_WORK(&ac->work, do_async_commit);
1159 ac->root = root;
1160 ac->newtrans = btrfs_join_transaction(root);
1161 if (IS_ERR(ac->newtrans)) {
1162 int err = PTR_ERR(ac->newtrans);
1163 kfree(ac);
1164 return err;
1165 }
1166
1167 /* take transaction reference */
1168 cur_trans = trans->transaction;
1169 atomic_inc(&cur_trans->use_count);
1170
1171 btrfs_end_transaction(trans, root);
1172 schedule_delayed_work(&ac->work, 0);
1173
1174 /* wait for transaction to start and unblock */
1175 if (wait_for_unblock)
1176 wait_current_trans_commit_start_and_unblock(root, cur_trans);
1177 else
1178 wait_current_trans_commit_start(root, cur_trans);
1179
1180 if (current->journal_info == trans)
1181 current->journal_info = NULL;
1182
1183 put_transaction(cur_trans);
1184 return 0;
1185}
1186
1187/*
1188 * btrfs_transaction state sequence:
1189 * in_commit = 0, blocked = 0 (initial)
1190 * in_commit = 1, blocked = 1
1191 * blocked = 0
1192 * commit_done = 1
1193 */
991int btrfs_commit_transaction(struct btrfs_trans_handle *trans, 1194int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
992 struct btrfs_root *root) 1195 struct btrfs_root *root)
993{ 1196{
994 unsigned long joined = 0; 1197 unsigned long joined = 0;
995 unsigned long timeout = 1;
996 struct btrfs_transaction *cur_trans; 1198 struct btrfs_transaction *cur_trans;
997 struct btrfs_transaction *prev_trans = NULL; 1199 struct btrfs_transaction *prev_trans = NULL;
998 DEFINE_WAIT(wait); 1200 DEFINE_WAIT(wait);
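btrfs_commit_transaction_async() introduced above offloads the commit to a delayed_work item so the caller only waits for the commit to reach the state it cares about (started, or started and unblocked). A sketch of the offload half, with struct ctx and commit_ctx() as hypothetical stand-ins for the transaction handle and btrfs_commit_transaction():

#include <linux/slab.h>
#include <linux/workqueue.h>

struct ctx;                             /* hypothetical per-commit state */
static void commit_ctx(struct ctx *c)   /* stand-in for the real commit */
{
}

struct async_commit {
        struct delayed_work work;
        struct ctx *ctx;
};

static void do_async(struct work_struct *work)
{
        /* delayed_work embeds a work_struct, hence the nested .work.work */
        struct async_commit *ac =
                container_of(work, struct async_commit, work.work);

        commit_ctx(ac->ctx);            /* runs in kworker context */
        kfree(ac);
}

static int kick_async_commit(struct ctx *c)
{
        struct async_commit *ac = kmalloc(sizeof(*ac), GFP_NOFS);

        if (!ac)
                return -ENOMEM;
        INIT_DELAYED_WORK(&ac->work, do_async);
        ac->ctx = c;
        schedule_delayed_work(&ac->work, 0);    /* zero delay: run ASAP */
        return 0;
}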
@@ -1021,36 +1223,41 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1021 ret = btrfs_run_delayed_refs(trans, root, 0); 1223 ret = btrfs_run_delayed_refs(trans, root, 0);
1022 BUG_ON(ret); 1224 BUG_ON(ret);
1023 1225
1024 mutex_lock(&root->fs_info->trans_mutex); 1226 spin_lock(&cur_trans->commit_lock);
1025 if (cur_trans->in_commit) { 1227 if (cur_trans->in_commit) {
1026 cur_trans->use_count++; 1228 spin_unlock(&cur_trans->commit_lock);
1027 mutex_unlock(&root->fs_info->trans_mutex); 1229 atomic_inc(&cur_trans->use_count);
1028 btrfs_end_transaction(trans, root); 1230 btrfs_end_transaction(trans, root);
1029 1231
1030 ret = wait_for_commit(root, cur_trans); 1232 ret = wait_for_commit(root, cur_trans);
1031 BUG_ON(ret); 1233 BUG_ON(ret);
1032 1234
1033 mutex_lock(&root->fs_info->trans_mutex);
1034 put_transaction(cur_trans); 1235 put_transaction(cur_trans);
1035 mutex_unlock(&root->fs_info->trans_mutex);
1036 1236
1037 return 0; 1237 return 0;
1038 } 1238 }
1039 1239
1040 trans->transaction->in_commit = 1; 1240 trans->transaction->in_commit = 1;
1041 trans->transaction->blocked = 1; 1241 trans->transaction->blocked = 1;
1242 spin_unlock(&cur_trans->commit_lock);
1243 wake_up(&root->fs_info->transaction_blocked_wait);
1244
1245 spin_lock(&root->fs_info->trans_lock);
1042 if (cur_trans->list.prev != &root->fs_info->trans_list) { 1246 if (cur_trans->list.prev != &root->fs_info->trans_list) {
1043 prev_trans = list_entry(cur_trans->list.prev, 1247 prev_trans = list_entry(cur_trans->list.prev,
1044 struct btrfs_transaction, list); 1248 struct btrfs_transaction, list);
1045 if (!prev_trans->commit_done) { 1249 if (!prev_trans->commit_done) {
1046 prev_trans->use_count++; 1250 atomic_inc(&prev_trans->use_count);
1047 mutex_unlock(&root->fs_info->trans_mutex); 1251 spin_unlock(&root->fs_info->trans_lock);
1048 1252
1049 wait_for_commit(root, prev_trans); 1253 wait_for_commit(root, prev_trans);
1050 1254
1051 mutex_lock(&root->fs_info->trans_mutex);
1052 put_transaction(prev_trans); 1255 put_transaction(prev_trans);
1256 } else {
1257 spin_unlock(&root->fs_info->trans_lock);
1053 } 1258 }
1259 } else {
1260 spin_unlock(&root->fs_info->trans_lock);
1054 } 1261 }
1055 1262
1056 if (now < cur_trans->start_time || now - cur_trans->start_time < 1) 1263 if (now < cur_trans->start_time || now - cur_trans->start_time < 1)
@@ -1058,17 +1265,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1058 1265
1059 do { 1266 do {
1060 int snap_pending = 0; 1267 int snap_pending = 0;
1268
1061 joined = cur_trans->num_joined; 1269 joined = cur_trans->num_joined;
1062 if (!list_empty(&trans->transaction->pending_snapshots)) 1270 if (!list_empty(&trans->transaction->pending_snapshots))
1063 snap_pending = 1; 1271 snap_pending = 1;
1064 1272
1065 WARN_ON(cur_trans != trans->transaction); 1273 WARN_ON(cur_trans != trans->transaction);
1066 if (cur_trans->num_writers > 1)
1067 timeout = MAX_SCHEDULE_TIMEOUT;
1068 else if (should_grow)
1069 timeout = 1;
1070
1071 mutex_unlock(&root->fs_info->trans_mutex);
1072 1274
1073 if (flush_on_commit || snap_pending) { 1275 if (flush_on_commit || snap_pending) {
1074 btrfs_start_delalloc_inodes(root, 1); 1276 btrfs_start_delalloc_inodes(root, 1);
@@ -1076,6 +1278,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1076 BUG_ON(ret); 1278 BUG_ON(ret);
1077 } 1279 }
1078 1280
1281 ret = btrfs_run_delayed_items(trans, root);
1282 BUG_ON(ret);
1283
1079 /* 1284 /*
1080 * rename doesn't use btrfs_join_transaction, so, once we 1285
1081 * set the transaction to blocked above, we aren't going 1286 * set the transaction to blocked above, we aren't going
@@ -1088,23 +1293,51 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1088 prepare_to_wait(&cur_trans->writer_wait, &wait, 1293 prepare_to_wait(&cur_trans->writer_wait, &wait,
1089 TASK_UNINTERRUPTIBLE); 1294 TASK_UNINTERRUPTIBLE);
1090 1295
1091 smp_mb(); 1296 if (atomic_read(&cur_trans->num_writers) > 1)
1092 if (cur_trans->num_writers > 1 || should_grow) 1297 schedule_timeout(MAX_SCHEDULE_TIMEOUT);
1093 schedule_timeout(timeout); 1298 else if (should_grow)
1299 schedule_timeout(1);
1094 1300
1095 mutex_lock(&root->fs_info->trans_mutex);
1096 finish_wait(&cur_trans->writer_wait, &wait); 1301 finish_wait(&cur_trans->writer_wait, &wait);
1097 } while (cur_trans->num_writers > 1 || 1302 } while (atomic_read(&cur_trans->num_writers) > 1 ||
1098 (should_grow && cur_trans->num_joined != joined)); 1303 (should_grow && cur_trans->num_joined != joined));
1099 1304
1305 /*
1306 * Ok now we need to make sure to block out any other joins while we
1307 * commit the transaction. We could have started a join before setting
1308 * no_join so make sure to wait for num_writers to == 1 again.
1309 */
1310 spin_lock(&root->fs_info->trans_lock);
1311 root->fs_info->trans_no_join = 1;
1312 spin_unlock(&root->fs_info->trans_lock);
1313 wait_event(cur_trans->writer_wait,
1314 atomic_read(&cur_trans->num_writers) == 1);
1315
1316 /*
1317 * the reloc mutex makes sure that we stop
1318 * the balancing code from coming in and moving
1319 * extents around in the middle of the commit
1320 */
1321 mutex_lock(&root->fs_info->reloc_mutex);
1322
1323 ret = btrfs_run_delayed_items(trans, root);
1324 BUG_ON(ret);
1325
1100 ret = create_pending_snapshots(trans, root->fs_info); 1326 ret = create_pending_snapshots(trans, root->fs_info);
1101 BUG_ON(ret); 1327 BUG_ON(ret);
1102 1328
1103 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); 1329 ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
1104 BUG_ON(ret); 1330 BUG_ON(ret);
1105 1331
1332 /*
1333 * make sure none of the code above managed to slip in a
1334 * delayed item
1335 */
1336 btrfs_assert_delayed_root_empty(root);
1337
1106 WARN_ON(cur_trans != trans->transaction); 1338 WARN_ON(cur_trans != trans->transaction);
1107 1339
1340 btrfs_scrub_pause(root);
1108 /* btrfs_commit_tree_roots is responsible for getting the 1341 /* btrfs_commit_tree_roots is responsible for getting the
1109 * various roots consistent with each other. Every pointer 1342 * various roots consistent with each other. Every pointer
1110 * in the tree of tree roots has to point to the most up to date 1343 * in the tree of tree roots has to point to the most up to date
@@ -1134,9 +1367,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1134 btrfs_prepare_extent_commit(trans, root); 1367 btrfs_prepare_extent_commit(trans, root);
1135 1368
1136 cur_trans = root->fs_info->running_transaction; 1369 cur_trans = root->fs_info->running_transaction;
1137 spin_lock(&root->fs_info->new_trans_lock);
1138 root->fs_info->running_transaction = NULL;
1139 spin_unlock(&root->fs_info->new_trans_lock);
1140 1370
1141 btrfs_set_root_node(&root->fs_info->tree_root->root_item, 1371 btrfs_set_root_node(&root->fs_info->tree_root->root_item,
1142 root->fs_info->tree_root->node); 1372 root->fs_info->tree_root->node);
@@ -1157,10 +1387,14 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1157 sizeof(root->fs_info->super_copy)); 1387 sizeof(root->fs_info->super_copy));
1158 1388
1159 trans->transaction->blocked = 0; 1389 trans->transaction->blocked = 0;
1390 spin_lock(&root->fs_info->trans_lock);
1391 root->fs_info->running_transaction = NULL;
1392 root->fs_info->trans_no_join = 0;
1393 spin_unlock(&root->fs_info->trans_lock);
1394 mutex_unlock(&root->fs_info->reloc_mutex);
1160 1395
1161 wake_up(&root->fs_info->transaction_wait); 1396 wake_up(&root->fs_info->transaction_wait);
1162 1397
1163 mutex_unlock(&root->fs_info->trans_mutex);
1164 ret = btrfs_write_and_wait_transaction(trans, root); 1398 ret = btrfs_write_and_wait_transaction(trans, root);
1165 BUG_ON(ret); 1399 BUG_ON(ret);
1166 write_ctree_super(trans, root, 0); 1400 write_ctree_super(trans, root, 0);
@@ -1173,18 +1407,22 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
1173 1407
1174 btrfs_finish_extent_commit(trans, root); 1408 btrfs_finish_extent_commit(trans, root);
1175 1409
1176 mutex_lock(&root->fs_info->trans_mutex);
1177
1178 cur_trans->commit_done = 1; 1410 cur_trans->commit_done = 1;
1179 1411
1180 root->fs_info->last_trans_committed = cur_trans->transid; 1412 root->fs_info->last_trans_committed = cur_trans->transid;
1181 1413
1182 wake_up(&cur_trans->commit_wait); 1414 wake_up(&cur_trans->commit_wait);
1183 1415
1416 spin_lock(&root->fs_info->trans_lock);
1417 list_del_init(&cur_trans->list);
1418 spin_unlock(&root->fs_info->trans_lock);
1419
1184 put_transaction(cur_trans); 1420 put_transaction(cur_trans);
1185 put_transaction(cur_trans); 1421 put_transaction(cur_trans);
1186 1422
1187 mutex_unlock(&root->fs_info->trans_mutex); 1423 trace_btrfs_transaction_commit(root);
1424
1425 btrfs_scrub_continue(root);
1188 1426
1189 if (current->journal_info == trans) 1427 if (current->journal_info == trans)
1190 current->journal_info = NULL; 1428 current->journal_info = NULL;
@@ -1205,14 +1443,16 @@ int btrfs_clean_old_snapshots(struct btrfs_root *root)
1205 LIST_HEAD(list); 1443 LIST_HEAD(list);
1206 struct btrfs_fs_info *fs_info = root->fs_info; 1444 struct btrfs_fs_info *fs_info = root->fs_info;
1207 1445
1208 mutex_lock(&fs_info->trans_mutex); 1446 spin_lock(&fs_info->trans_lock);
1209 list_splice_init(&fs_info->dead_roots, &list); 1447 list_splice_init(&fs_info->dead_roots, &list);
1210 mutex_unlock(&fs_info->trans_mutex); 1448 spin_unlock(&fs_info->trans_lock);
1211 1449
1212 while (!list_empty(&list)) { 1450 while (!list_empty(&list)) {
1213 root = list_entry(list.next, struct btrfs_root, root_list); 1451 root = list_entry(list.next, struct btrfs_root, root_list);
1214 list_del(&root->root_list); 1452 list_del(&root->root_list);
1215 1453
1454 btrfs_kill_all_delayed_nodes(root);
1455
1216 if (btrfs_header_backref_rev(root->node) < 1456 if (btrfs_header_backref_rev(root->node) <
1217 BTRFS_MIXED_BACKREF_REV) 1457 BTRFS_MIXED_BACKREF_REV)
1218 btrfs_drop_snapshot(root, NULL, 0); 1458 btrfs_drop_snapshot(root, NULL, 0);
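The commit path above also gains the trans_no_join gate: the committer sets it under trans_lock, which makes join_transaction() return -EBUSY to newcomers, then waits for num_writers to fall to one before taking the reloc_mutex. A condensed sketch of the gate (fs_state is illustrative):

#include <linux/atomic.h>
#include <linux/spinlock.h>
#include <linux/wait.h>

struct fs_state {
        spinlock_t lock;
        int no_join;            /* plays the role of trans_no_join */
        atomic_t num_writers;
        wait_queue_head_t writer_wait;
};

/* Commit side: close the gate, then drain every other writer. */
static void block_joins_and_drain(struct fs_state *fs)
{
        spin_lock(&fs->lock);
        fs->no_join = 1;        /* joiners now bounce with -EBUSY */
        spin_unlock(&fs->lock);

        /* The committer still holds one writer count itself, hence == 1. */
        wait_event(fs->writer_wait, atomic_read(&fs->num_writers) == 1);
}

Joiners that raced in before the flag was set are drained by the wait_event(); TRANS_JOIN_NOLOCK callers pass nofail and are allowed through, matching the hunks above.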
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index e104986d0bfd..02564e6230ac 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -27,11 +27,13 @@ struct btrfs_transaction {
27 * total writers in this transaction, it must be zero before the 27 * total writers in this transaction, it must be zero before the
28 * transaction can end 28 * transaction can end
29 */ 29 */
30 unsigned long num_writers; 30 atomic_t num_writers;
31 atomic_t use_count;
31 32
32 unsigned long num_joined; 33 unsigned long num_joined;
34
35 spinlock_t commit_lock;
33 int in_commit; 36 int in_commit;
34 int use_count;
35 int commit_done; 37 int commit_done;
36 int blocked; 38 int blocked;
37 struct list_head list; 39 struct list_head list;
@@ -45,13 +47,14 @@ struct btrfs_transaction {
45 47
46struct btrfs_trans_handle { 48struct btrfs_trans_handle {
47 u64 transid; 49 u64 transid;
48 u64 block_group;
49 u64 bytes_reserved; 50 u64 bytes_reserved;
51 unsigned long use_count;
50 unsigned long blocks_reserved; 52 unsigned long blocks_reserved;
51 unsigned long blocks_used; 53 unsigned long blocks_used;
52 unsigned long delayed_ref_updates; 54 unsigned long delayed_ref_updates;
53 struct btrfs_transaction *transaction; 55 struct btrfs_transaction *transaction;
54 struct btrfs_block_rsv *block_rsv; 56 struct btrfs_block_rsv *block_rsv;
57 struct btrfs_block_rsv *orig_rsv;
55}; 58};
56 59
57struct btrfs_pending_snapshot { 60struct btrfs_pending_snapshot {
@@ -62,22 +65,10 @@ struct btrfs_pending_snapshot {
62 struct btrfs_block_rsv block_rsv; 65 struct btrfs_block_rsv block_rsv;
63 /* extra metadata reservation for relocation */ 66 /* extra metadata reservation for relocation */
64 int error; 67 int error;
68 bool readonly;
65 struct list_head list; 69 struct list_head list;
66}; 70};
67 71
68static inline void btrfs_set_trans_block_group(struct btrfs_trans_handle *trans,
69 struct inode *inode)
70{
71 trans->block_group = BTRFS_I(inode)->block_group;
72}
73
74static inline void btrfs_update_inode_block_group(
75 struct btrfs_trans_handle *trans,
76 struct inode *inode)
77{
78 BTRFS_I(inode)->block_group = trans->block_group;
79}
80
81static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans, 72static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
82 struct inode *inode) 73 struct inode *inode)
83{ 74{
@@ -87,25 +78,29 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
87 78
88int btrfs_end_transaction(struct btrfs_trans_handle *trans, 79int btrfs_end_transaction(struct btrfs_trans_handle *trans,
89 struct btrfs_root *root); 80 struct btrfs_root *root);
81int btrfs_end_transaction_nolock(struct btrfs_trans_handle *trans,
82 struct btrfs_root *root);
90struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, 83struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
91 int num_items); 84 int num_items);
92struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root, 85struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root);
93 int num_blocks); 86struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root);
94struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r, 87struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root);
95 int num_blocks); 88int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid);
96int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, 89int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
97 struct btrfs_root *root); 90 struct btrfs_root *root);
98int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
99 struct btrfs_root *root);
100 91
101int btrfs_add_dead_root(struct btrfs_root *root); 92int btrfs_add_dead_root(struct btrfs_root *root);
102int btrfs_drop_dead_root(struct btrfs_root *root);
103int btrfs_defrag_root(struct btrfs_root *root, int cacheonly); 93int btrfs_defrag_root(struct btrfs_root *root, int cacheonly);
104int btrfs_clean_old_snapshots(struct btrfs_root *root); 94int btrfs_clean_old_snapshots(struct btrfs_root *root);
105int btrfs_commit_transaction(struct btrfs_trans_handle *trans, 95int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
106 struct btrfs_root *root); 96 struct btrfs_root *root);
97int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
98 struct btrfs_root *root,
99 int wait_for_unblock);
107int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, 100int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
108 struct btrfs_root *root); 101 struct btrfs_root *root);
102int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans,
103 struct btrfs_root *root);
109int btrfs_should_end_transaction(struct btrfs_trans_handle *trans, 104int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
110 struct btrfs_root *root); 105 struct btrfs_root *root);
111void btrfs_throttle(struct btrfs_root *root); 106void btrfs_throttle(struct btrfs_root *root);
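For context on the prototype changes above: btrfs_join_transaction() and
btrfs_start_ioctl_transaction() no longer take a block count, only the root.
A minimal caller sketch under the new prototypes (the caller shown here is
hypothetical, and it assumes the join path, like btrfs_start_transaction(),
reports failure via ERR_PTR):

    struct btrfs_trans_handle *trans;

    trans = btrfs_join_transaction(root);   /* no num_blocks argument */
    if (IS_ERR(trans))
            return PTR_ERR(trans);
    /* ... modify tree items under this handle ... */
    return btrfs_end_transaction(trans, root);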
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index f7ac8e013ed7..3b580ee8ab1d 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -36,7 +36,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
36 int ret = 0; 36 int ret = 0;
37 int wret; 37 int wret;
38 int level; 38 int level;
39 int orig_level;
40 int is_extent = 0; 39 int is_extent = 0;
41 int next_key_ret = 0; 40 int next_key_ret = 0;
42 u64 last_ret = 0; 41 u64 last_ret = 0;
@@ -64,7 +63,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
64 return -ENOMEM; 63 return -ENOMEM;
65 64
66 level = btrfs_header_level(root->node); 65 level = btrfs_header_level(root->node);
67 orig_level = level;
68 66
69 if (level == 0) 67 if (level == 0)
70 goto out; 68 goto out;
@@ -99,7 +97,7 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
99 ret = 0; 97 ret = 0;
100 goto out; 98 goto out;
101 } 99 }
102 btrfs_release_path(root, path); 100 btrfs_release_path(path);
103 wret = btrfs_search_slot(trans, root, &key, path, 0, 1); 101 wret = btrfs_search_slot(trans, root, &key, path, 0, 1);
104 102
105 if (wret < 0) { 103 if (wret < 0) {
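The btrfs_release_path() calls in this hunk (and throughout the rest of the
series) simply lose their root argument; the search/release pattern itself is
unchanged. A minimal sketch of that pattern under the new signature, using a
made-up key purely for illustration:

    struct btrfs_path *path;
    struct btrfs_key key;
    int ret;

    key.objectid = 256;                  /* hypothetical objectid */
    key.type = BTRFS_INODE_ITEM_KEY;
    key.offset = 0;

    path = btrfs_alloc_path();
    if (!path)
            return -ENOMEM;
    ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
    if (ret < 0)
            goto out;
    /* ... read the item out of path->nodes[0] ... */
out:
    btrfs_release_path(path);            /* root argument dropped */
    btrfs_free_path(path);
    return ret;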
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index fb102a9aee9c..4ce8a9f41d1e 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -333,11 +333,17 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans,
333 goto insert; 333 goto insert;
334 334
335 if (item_size == 0) { 335 if (item_size == 0) {
336 btrfs_release_path(root, path); 336 btrfs_release_path(path);
337 return 0; 337 return 0;
338 } 338 }
339 dst_copy = kmalloc(item_size, GFP_NOFS); 339 dst_copy = kmalloc(item_size, GFP_NOFS);
340 src_copy = kmalloc(item_size, GFP_NOFS); 340 src_copy = kmalloc(item_size, GFP_NOFS);
341 if (!dst_copy || !src_copy) {
342 btrfs_release_path(path);
343 kfree(dst_copy);
344 kfree(src_copy);
345 return -ENOMEM;
346 }
341 347
342 read_extent_buffer(eb, src_copy, src_ptr, item_size); 348 read_extent_buffer(eb, src_copy, src_ptr, item_size);
343 349
@@ -355,13 +361,13 @@ static noinline int overwrite_item(struct btrfs_trans_handle *trans,
355 * sync 361 * sync
356 */ 362 */
357 if (ret == 0) { 363 if (ret == 0) {
358 btrfs_release_path(root, path); 364 btrfs_release_path(path);
359 return 0; 365 return 0;
360 } 366 }
361 367
362 } 368 }
363insert: 369insert:
364 btrfs_release_path(root, path); 370 btrfs_release_path(path);
365 /* try to insert the key into the destination tree */ 371 /* try to insert the key into the destination tree */
366 ret = btrfs_insert_empty_item(trans, root, path, 372 ret = btrfs_insert_empty_item(trans, root, path,
367 key, item_size); 373 key, item_size);
@@ -376,7 +382,6 @@ insert:
376 } else if (found_size < item_size) { 382 } else if (found_size < item_size) {
377 ret = btrfs_extend_item(trans, root, path, 383 ret = btrfs_extend_item(trans, root, path,
378 item_size - found_size); 384 item_size - found_size);
379 BUG_ON(ret);
380 } 385 }
381 } else if (ret) { 386 } else if (ret) {
382 return ret; 387 return ret;
@@ -432,7 +437,7 @@ insert:
432 } 437 }
433no_copy: 438no_copy:
434 btrfs_mark_buffer_dirty(path->nodes[0]); 439 btrfs_mark_buffer_dirty(path->nodes[0]);
435 btrfs_release_path(root, path); 440 btrfs_release_path(path);
436 return 0; 441 return 0;
437} 442}
438 443
@@ -513,7 +518,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
513 * file. This must be done before the btrfs_drop_extents run 518 * file. This must be done before the btrfs_drop_extents run
514 * so we don't try to drop this extent. 519 * so we don't try to drop this extent.
515 */ 520 */
516 ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino, 521 ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(inode),
517 start, 0); 522 start, 0);
518 523
519 if (ret == 0 && 524 if (ret == 0 &&
@@ -538,11 +543,11 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
538 * we don't have to do anything 543 * we don't have to do anything
539 */ 544 */
540 if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) { 545 if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
541 btrfs_release_path(root, path); 546 btrfs_release_path(path);
542 goto out; 547 goto out;
543 } 548 }
544 } 549 }
545 btrfs_release_path(root, path); 550 btrfs_release_path(path);
546 551
547 saved_nbytes = inode_get_bytes(inode); 552 saved_nbytes = inode_get_bytes(inode);
548 /* drop any overlapping extents */ 553 /* drop any overlapping extents */
@@ -584,6 +589,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
584 ins.objectid, ins.offset, 589 ins.objectid, ins.offset,
585 0, root->root_key.objectid, 590 0, root->root_key.objectid,
586 key->objectid, offset); 591 key->objectid, offset);
592 BUG_ON(ret);
587 } else { 593 } else {
588 /* 594 /*
589 * insert the extent pointer in the extent 595 * insert the extent pointer in the extent
@@ -594,7 +600,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
594 key->objectid, offset, &ins); 600 key->objectid, offset, &ins);
595 BUG_ON(ret); 601 BUG_ON(ret);
596 } 602 }
597 btrfs_release_path(root, path); 603 btrfs_release_path(path);
598 604
599 if (btrfs_file_extent_compression(eb, item)) { 605 if (btrfs_file_extent_compression(eb, item)) {
600 csum_start = ins.objectid; 606 csum_start = ins.objectid;
@@ -608,7 +614,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
608 614
609 ret = btrfs_lookup_csums_range(root->log_root, 615 ret = btrfs_lookup_csums_range(root->log_root,
610 csum_start, csum_end - 1, 616 csum_start, csum_end - 1,
611 &ordered_sums); 617 &ordered_sums, 0);
612 BUG_ON(ret); 618 BUG_ON(ret);
613 while (!list_empty(&ordered_sums)) { 619 while (!list_empty(&ordered_sums)) {
614 struct btrfs_ordered_sum *sums; 620 struct btrfs_ordered_sum *sums;
@@ -623,7 +629,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
623 kfree(sums); 629 kfree(sums);
624 } 630 }
625 } else { 631 } else {
626 btrfs_release_path(root, path); 632 btrfs_release_path(path);
627 } 633 }
628 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { 634 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
629 /* inline extents are easy, we just overwrite them */ 635 /* inline extents are easy, we just overwrite them */
@@ -665,11 +671,17 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
665 btrfs_dir_item_key_to_cpu(leaf, di, &location); 671 btrfs_dir_item_key_to_cpu(leaf, di, &location);
666 name_len = btrfs_dir_name_len(leaf, di); 672 name_len = btrfs_dir_name_len(leaf, di);
667 name = kmalloc(name_len, GFP_NOFS); 673 name = kmalloc(name_len, GFP_NOFS);
674 if (!name)
675 return -ENOMEM;
676
668 read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len); 677 read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
669 btrfs_release_path(root, path); 678 btrfs_release_path(path);
670 679
671 inode = read_one_inode(root, location.objectid); 680 inode = read_one_inode(root, location.objectid);
672 BUG_ON(!inode); 681 if (!inode) {
682 kfree(name);
683 return -EIO;
684 }
673 685
674 ret = link_to_fixup_dir(trans, root, path, location.objectid); 686 ret = link_to_fixup_dir(trans, root, path, location.objectid);
675 BUG_ON(ret); 687 BUG_ON(ret);
@@ -704,7 +716,7 @@ static noinline int inode_in_dir(struct btrfs_root *root,
704 goto out; 716 goto out;
705 } else 717 } else
706 goto out; 718 goto out;
707 btrfs_release_path(root, path); 719 btrfs_release_path(path);
708 720
709 di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0); 721 di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
710 if (di && !IS_ERR(di)) { 722 if (di && !IS_ERR(di)) {
@@ -715,7 +727,7 @@ static noinline int inode_in_dir(struct btrfs_root *root,
715 goto out; 727 goto out;
716 match = 1; 728 match = 1;
717out: 729out:
718 btrfs_release_path(root, path); 730 btrfs_release_path(path);
719 return match; 731 return match;
720} 732}
721 733
@@ -744,6 +756,9 @@ static noinline int backref_in_log(struct btrfs_root *log,
744 int match = 0; 756 int match = 0;
745 757
746 path = btrfs_alloc_path(); 758 path = btrfs_alloc_path();
759 if (!path)
760 return -ENOMEM;
761
747 ret = btrfs_search_slot(NULL, log, key, path, 0, 0); 762 ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
748 if (ret != 0) 763 if (ret != 0)
749 goto out; 764 goto out;
@@ -786,18 +801,13 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
786{ 801{
787 struct inode *dir; 802 struct inode *dir;
788 int ret; 803 int ret;
789 struct btrfs_key location;
790 struct btrfs_inode_ref *ref; 804 struct btrfs_inode_ref *ref;
791 struct btrfs_dir_item *di;
792 struct inode *inode; 805 struct inode *inode;
793 char *name; 806 char *name;
794 int namelen; 807 int namelen;
795 unsigned long ref_ptr; 808 unsigned long ref_ptr;
796 unsigned long ref_end; 809 unsigned long ref_end;
797 810 int search_done = 0;
798 location.objectid = key->objectid;
799 location.type = BTRFS_INODE_ITEM_KEY;
800 location.offset = 0;
801 811
802 /* 812 /*
803 * it is possible that we didn't log all the parent directories 813 * it is possible that we didn't log all the parent directories
@@ -810,7 +820,10 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
810 return -ENOENT; 820 return -ENOENT;
811 821
812 inode = read_one_inode(root, key->objectid); 822 inode = read_one_inode(root, key->objectid);
813 BUG_ON(!inode); 823 if (!inode) {
824 iput(dir);
825 return -EIO;
826 }
814 827
815 ref_ptr = btrfs_item_ptr_offset(eb, slot); 828 ref_ptr = btrfs_item_ptr_offset(eb, slot);
816 ref_end = ref_ptr + btrfs_item_size_nr(eb, slot); 829 ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
@@ -825,7 +838,7 @@ again:
825 read_extent_buffer(eb, name, (unsigned long)(ref + 1), namelen); 838 read_extent_buffer(eb, name, (unsigned long)(ref + 1), namelen);
826 839
827 /* if we already have a perfect match, we're done */ 840 /* if we already have a perfect match, we're done */
828 if (inode_in_dir(root, path, dir->i_ino, inode->i_ino, 841 if (inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode),
829 btrfs_inode_ref_index(eb, ref), 842 btrfs_inode_ref_index(eb, ref),
830 name, namelen)) { 843 name, namelen)) {
831 goto out; 844 goto out;
@@ -838,7 +851,10 @@ again:
838 * existing back reference, and we don't want to create 851 * existing back reference, and we don't want to create
839 * dangling pointers in the directory. 852 * dangling pointers in the directory.
840 */ 853 */
841conflict_again: 854
855 if (search_done)
856 goto insert;
857
842 ret = btrfs_search_slot(NULL, root, key, path, 0, 0); 858 ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
843 if (ret == 0) { 859 if (ret == 0) {
844 char *victim_name; 860 char *victim_name;
@@ -874,42 +890,26 @@ conflict_again:
874 if (!backref_in_log(log, key, victim_name, 890 if (!backref_in_log(log, key, victim_name,
875 victim_name_len)) { 891 victim_name_len)) {
876 btrfs_inc_nlink(inode); 892 btrfs_inc_nlink(inode);
877 btrfs_release_path(root, path); 893 btrfs_release_path(path);
878 894
879 ret = btrfs_unlink_inode(trans, root, dir, 895 ret = btrfs_unlink_inode(trans, root, dir,
880 inode, victim_name, 896 inode, victim_name,
881 victim_name_len); 897 victim_name_len);
882 kfree(victim_name);
883 btrfs_release_path(root, path);
884 goto conflict_again;
885 } 898 }
886 kfree(victim_name); 899 kfree(victim_name);
887 ptr = (unsigned long)(victim_ref + 1) + victim_name_len; 900 ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
888 } 901 }
889 BUG_ON(ret); 902 BUG_ON(ret);
890 }
891 btrfs_release_path(root, path);
892 903
893 /* look for a conflicting sequence number */ 904 /*
894 di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, 905 * NOTE: we have searched the root tree and checked the
895 btrfs_inode_ref_index(eb, ref), 906 * corresponding ref; there is no need to check it again.
896 name, namelen, 0); 907 */
897 if (di && !IS_ERR(di)) { 908 search_done = 1;
898 ret = drop_one_dir_item(trans, root, path, dir, di);
899 BUG_ON(ret);
900 }
901 btrfs_release_path(root, path);
902
903
904 /* look for a conflicting name */
905 di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
906 name, namelen, 0);
907 if (di && !IS_ERR(di)) {
908 ret = drop_one_dir_item(trans, root, path, dir, di);
909 BUG_ON(ret);
910 } 909 }
911 btrfs_release_path(root, path); 910 btrfs_release_path(path);
912 911
912insert:
913 /* insert our name */ 913 /* insert our name */
914 ret = btrfs_add_link(trans, dir, inode, name, namelen, 0, 914 ret = btrfs_add_link(trans, dir, inode, name, namelen, 0,
915 btrfs_inode_ref_index(eb, ref)); 915 btrfs_inode_ref_index(eb, ref));
@@ -928,7 +928,7 @@ out:
928 BUG_ON(ret); 928 BUG_ON(ret);
929 929
930out_nowrite: 930out_nowrite:
931 btrfs_release_path(root, path); 931 btrfs_release_path(path);
932 iput(dir); 932 iput(dir);
933 iput(inode); 933 iput(inode);
934 return 0; 934 return 0;
@@ -966,12 +966,15 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
966 unsigned long ptr; 966 unsigned long ptr;
967 unsigned long ptr_end; 967 unsigned long ptr_end;
968 int name_len; 968 int name_len;
969 u64 ino = btrfs_ino(inode);
969 970
970 key.objectid = inode->i_ino; 971 key.objectid = ino;
971 key.type = BTRFS_INODE_REF_KEY; 972 key.type = BTRFS_INODE_REF_KEY;
972 key.offset = (u64)-1; 973 key.offset = (u64)-1;
973 974
974 path = btrfs_alloc_path(); 975 path = btrfs_alloc_path();
976 if (!path)
977 return -ENOMEM;
975 978
976 while (1) { 979 while (1) {
977 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 980 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
@@ -984,7 +987,7 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
984 } 987 }
985 btrfs_item_key_to_cpu(path->nodes[0], &key, 988 btrfs_item_key_to_cpu(path->nodes[0], &key,
986 path->slots[0]); 989 path->slots[0]);
987 if (key.objectid != inode->i_ino || 990 if (key.objectid != ino ||
988 key.type != BTRFS_INODE_REF_KEY) 991 key.type != BTRFS_INODE_REF_KEY)
989 break; 992 break;
990 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); 993 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
@@ -1003,9 +1006,9 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
1003 if (key.offset == 0) 1006 if (key.offset == 0)
1004 break; 1007 break;
1005 key.offset--; 1008 key.offset--;
1006 btrfs_release_path(root, path); 1009 btrfs_release_path(path);
1007 } 1010 }
1008 btrfs_release_path(root, path); 1011 btrfs_release_path(path);
1009 if (nlink != inode->i_nlink) { 1012 if (nlink != inode->i_nlink) {
1010 inode->i_nlink = nlink; 1013 inode->i_nlink = nlink;
1011 btrfs_update_inode(trans, root, inode); 1014 btrfs_update_inode(trans, root, inode);
@@ -1015,10 +1018,10 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
1015 if (inode->i_nlink == 0) { 1018 if (inode->i_nlink == 0) {
1016 if (S_ISDIR(inode->i_mode)) { 1019 if (S_ISDIR(inode->i_mode)) {
1017 ret = replay_dir_deletes(trans, root, NULL, path, 1020 ret = replay_dir_deletes(trans, root, NULL, path,
1018 inode->i_ino, 1); 1021 ino, 1);
1019 BUG_ON(ret); 1022 BUG_ON(ret);
1020 } 1023 }
1021 ret = insert_orphan_item(trans, root, inode->i_ino); 1024 ret = insert_orphan_item(trans, root, ino);
1022 BUG_ON(ret); 1025 BUG_ON(ret);
1023 } 1026 }
1024 btrfs_free_path(path); 1027 btrfs_free_path(path);
@@ -1054,11 +1057,13 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
1054 break; 1057 break;
1055 1058
1056 ret = btrfs_del_item(trans, root, path); 1059 ret = btrfs_del_item(trans, root, path);
1057 BUG_ON(ret); 1060 if (ret)
1061 goto out;
1058 1062
1059 btrfs_release_path(root, path); 1063 btrfs_release_path(path);
1060 inode = read_one_inode(root, key.offset); 1064 inode = read_one_inode(root, key.offset);
1061 BUG_ON(!inode); 1065 if (!inode)
1066 return -EIO;
1062 1067
1063 ret = fixup_inode_link_count(trans, root, inode); 1068 ret = fixup_inode_link_count(trans, root, inode);
1064 BUG_ON(ret); 1069 BUG_ON(ret);
@@ -1072,8 +1077,10 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
1072 */ 1077 */
1073 key.offset = (u64)-1; 1078 key.offset = (u64)-1;
1074 } 1079 }
1075 btrfs_release_path(root, path); 1080 ret = 0;
1076 return 0; 1081out:
1082 btrfs_release_path(path);
1083 return ret;
1077} 1084}
1078 1085
1079 1086
@@ -1092,7 +1099,8 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
1092 struct inode *inode; 1099 struct inode *inode;
1093 1100
1094 inode = read_one_inode(root, objectid); 1101 inode = read_one_inode(root, objectid);
1095 BUG_ON(!inode); 1102 if (!inode)
1103 return -EIO;
1096 1104
1097 key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; 1105 key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1098 btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); 1106 btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
@@ -1100,7 +1108,7 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
1100 1108
1101 ret = btrfs_insert_empty_item(trans, root, path, &key, 0); 1109 ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1102 1110
1103 btrfs_release_path(root, path); 1111 btrfs_release_path(path);
1104 if (ret == 0) { 1112 if (ret == 0) {
1105 btrfs_inc_nlink(inode); 1113 btrfs_inc_nlink(inode);
1106 btrfs_update_inode(trans, root, inode); 1114 btrfs_update_inode(trans, root, inode);
@@ -1179,10 +1187,14 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
1179 int ret; 1187 int ret;
1180 1188
1181 dir = read_one_inode(root, key->objectid); 1189 dir = read_one_inode(root, key->objectid);
1182 BUG_ON(!dir); 1190 if (!dir)
1191 return -EIO;
1183 1192
1184 name_len = btrfs_dir_name_len(eb, di); 1193 name_len = btrfs_dir_name_len(eb, di);
1185 name = kmalloc(name_len, GFP_NOFS); 1194 name = kmalloc(name_len, GFP_NOFS);
1195 if (!name)
1196 return -ENOMEM;
1197
1186 log_type = btrfs_dir_type(eb, di); 1198 log_type = btrfs_dir_type(eb, di);
1187 read_extent_buffer(eb, name, (unsigned long)(di + 1), 1199 read_extent_buffer(eb, name, (unsigned long)(di + 1),
1188 name_len); 1200 name_len);
@@ -1193,7 +1205,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
1193 exists = 1; 1205 exists = 1;
1194 else 1206 else
1195 exists = 0; 1207 exists = 0;
1196 btrfs_release_path(root, path); 1208 btrfs_release_path(path);
1197 1209
1198 if (key->type == BTRFS_DIR_ITEM_KEY) { 1210 if (key->type == BTRFS_DIR_ITEM_KEY) {
1199 dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid, 1211 dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
@@ -1206,7 +1218,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
1206 } else { 1218 } else {
1207 BUG(); 1219 BUG();
1208 } 1220 }
1209 if (!dst_di || IS_ERR(dst_di)) { 1221 if (IS_ERR_OR_NULL(dst_di)) {
1210 /* we need a sequence number to insert, so we only 1222 /* we need a sequence number to insert, so we only
1211 * do inserts for the BTRFS_DIR_INDEX_KEY types 1223 * do inserts for the BTRFS_DIR_INDEX_KEY types
1212 */ 1224 */
@@ -1237,13 +1249,13 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
1237 if (key->type == BTRFS_DIR_INDEX_KEY) 1249 if (key->type == BTRFS_DIR_INDEX_KEY)
1238 goto insert; 1250 goto insert;
1239out: 1251out:
1240 btrfs_release_path(root, path); 1252 btrfs_release_path(path);
1241 kfree(name); 1253 kfree(name);
1242 iput(dir); 1254 iput(dir);
1243 return 0; 1255 return 0;
1244 1256
1245insert: 1257insert:
1246 btrfs_release_path(root, path); 1258 btrfs_release_path(path);
1247 ret = insert_one_name(trans, root, path, key->objectid, key->offset, 1259 ret = insert_one_name(trans, root, path, key->objectid, key->offset,
1248 name, name_len, log_type, &log_key); 1260 name, name_len, log_type, &log_key);
1249 1261
@@ -1274,6 +1286,8 @@ static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
1274 ptr_end = ptr + item_size; 1286 ptr_end = ptr + item_size;
1275 while (ptr < ptr_end) { 1287 while (ptr < ptr_end) {
1276 di = (struct btrfs_dir_item *)ptr; 1288 di = (struct btrfs_dir_item *)ptr;
1289 if (verify_dir_item(root, eb, di))
1290 return -EIO;
1277 name_len = btrfs_dir_name_len(eb, di); 1291 name_len = btrfs_dir_name_len(eb, di);
1278 ret = replay_one_name(trans, root, path, eb, di, key); 1292 ret = replay_one_name(trans, root, path, eb, di, key);
1279 BUG_ON(ret); 1293 BUG_ON(ret);
@@ -1362,7 +1376,7 @@ next:
1362 *end_ret = found_end; 1376 *end_ret = found_end;
1363 ret = 0; 1377 ret = 0;
1364out: 1378out:
1365 btrfs_release_path(root, path); 1379 btrfs_release_path(path);
1366 return ret; 1380 return ret;
1367} 1381}
1368 1382
@@ -1400,6 +1414,11 @@ again:
1400 ptr_end = ptr + item_size; 1414 ptr_end = ptr + item_size;
1401 while (ptr < ptr_end) { 1415 while (ptr < ptr_end) {
1402 di = (struct btrfs_dir_item *)ptr; 1416 di = (struct btrfs_dir_item *)ptr;
1417 if (verify_dir_item(root, eb, di)) {
1418 ret = -EIO;
1419 goto out;
1420 }
1421
1403 name_len = btrfs_dir_name_len(eb, di); 1422 name_len = btrfs_dir_name_len(eb, di);
1404 name = kmalloc(name_len, GFP_NOFS); 1423 name = kmalloc(name_len, GFP_NOFS);
1405 if (!name) { 1424 if (!name) {
@@ -1420,12 +1439,15 @@ again:
1420 dir_key->offset, 1439 dir_key->offset,
1421 name, name_len, 0); 1440 name, name_len, 0);
1422 } 1441 }
1423 if (!log_di || IS_ERR(log_di)) { 1442 if (IS_ERR_OR_NULL(log_di)) {
1424 btrfs_dir_item_key_to_cpu(eb, di, &location); 1443 btrfs_dir_item_key_to_cpu(eb, di, &location);
1425 btrfs_release_path(root, path); 1444 btrfs_release_path(path);
1426 btrfs_release_path(log, log_path); 1445 btrfs_release_path(log_path);
1427 inode = read_one_inode(root, location.objectid); 1446 inode = read_one_inode(root, location.objectid);
1428 BUG_ON(!inode); 1447 if (!inode) {
1448 kfree(name);
1449 return -EIO;
1450 }
1429 1451
1430 ret = link_to_fixup_dir(trans, root, 1452 ret = link_to_fixup_dir(trans, root,
1431 path, location.objectid); 1453 path, location.objectid);
@@ -1447,7 +1469,7 @@ again:
1447 ret = 0; 1469 ret = 0;
1448 goto out; 1470 goto out;
1449 } 1471 }
1450 btrfs_release_path(log, log_path); 1472 btrfs_release_path(log_path);
1451 kfree(name); 1473 kfree(name);
1452 1474
1453 ptr = (unsigned long)(di + 1); 1475 ptr = (unsigned long)(di + 1);
@@ -1455,8 +1477,8 @@ again:
1455 } 1477 }
1456 ret = 0; 1478 ret = 0;
1457out: 1479out:
1458 btrfs_release_path(root, path); 1480 btrfs_release_path(path);
1459 btrfs_release_path(log, log_path); 1481 btrfs_release_path(log_path);
1460 return ret; 1482 return ret;
1461} 1483}
1462 1484
@@ -1544,7 +1566,7 @@ again:
1544 break; 1566 break;
1545 dir_key.offset = found_key.offset + 1; 1567 dir_key.offset = found_key.offset + 1;
1546 } 1568 }
1547 btrfs_release_path(root, path); 1569 btrfs_release_path(path);
1548 if (range_end == (u64)-1) 1570 if (range_end == (u64)-1)
1549 break; 1571 break;
1550 range_start = range_end + 1; 1572 range_start = range_end + 1;
@@ -1555,11 +1577,11 @@ next_type:
1555 if (key_type == BTRFS_DIR_LOG_ITEM_KEY) { 1577 if (key_type == BTRFS_DIR_LOG_ITEM_KEY) {
1556 key_type = BTRFS_DIR_LOG_INDEX_KEY; 1578 key_type = BTRFS_DIR_LOG_INDEX_KEY;
1557 dir_key.type = BTRFS_DIR_INDEX_KEY; 1579 dir_key.type = BTRFS_DIR_INDEX_KEY;
1558 btrfs_release_path(root, path); 1580 btrfs_release_path(path);
1559 goto again; 1581 goto again;
1560 } 1582 }
1561out: 1583out:
1562 btrfs_release_path(root, path); 1584 btrfs_release_path(path);
1563 btrfs_free_path(log_path); 1585 btrfs_free_path(log_path);
1564 iput(dir); 1586 iput(dir);
1565 return ret; 1587 return ret;
@@ -1583,7 +1605,6 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
1583 struct btrfs_path *path; 1605 struct btrfs_path *path;
1584 struct btrfs_root *root = wc->replay_dest; 1606 struct btrfs_root *root = wc->replay_dest;
1585 struct btrfs_key key; 1607 struct btrfs_key key;
1586 u32 item_size;
1587 int level; 1608 int level;
1588 int i; 1609 int i;
1589 int ret; 1610 int ret;
@@ -1601,7 +1622,6 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
1601 nritems = btrfs_header_nritems(eb); 1622 nritems = btrfs_header_nritems(eb);
1602 for (i = 0; i < nritems; i++) { 1623 for (i = 0; i < nritems; i++) {
1603 btrfs_item_key_to_cpu(eb, &key, i); 1624 btrfs_item_key_to_cpu(eb, &key, i);
1604 item_size = btrfs_item_size_nr(eb, i);
1605 1625
1606 /* inode keys are done during the first stage */ 1626 /* inode keys are done during the first stage */
1607 if (key.type == BTRFS_INODE_ITEM_KEY && 1627 if (key.type == BTRFS_INODE_ITEM_KEY &&
@@ -1668,7 +1688,6 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1668 struct walk_control *wc) 1688 struct walk_control *wc)
1669{ 1689{
1670 u64 root_owner; 1690 u64 root_owner;
1671 u64 root_gen;
1672 u64 bytenr; 1691 u64 bytenr;
1673 u64 ptr_gen; 1692 u64 ptr_gen;
1674 struct extent_buffer *next; 1693 struct extent_buffer *next;
@@ -1698,9 +1717,10 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
1698 1717
1699 parent = path->nodes[*level]; 1718 parent = path->nodes[*level];
1700 root_owner = btrfs_header_owner(parent); 1719 root_owner = btrfs_header_owner(parent);
1701 root_gen = btrfs_header_generation(parent);
1702 1720
1703 next = btrfs_find_create_tree_block(root, bytenr, blocksize); 1721 next = btrfs_find_create_tree_block(root, bytenr, blocksize);
1722 if (!next)
1723 return -ENOMEM;
1704 1724
1705 if (*level == 1) { 1725 if (*level == 1) {
1706 wc->process_func(root, next, wc, ptr_gen); 1726 wc->process_func(root, next, wc, ptr_gen);
@@ -1749,7 +1769,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
1749 struct walk_control *wc) 1769 struct walk_control *wc)
1750{ 1770{
1751 u64 root_owner; 1771 u64 root_owner;
1752 u64 root_gen;
1753 int i; 1772 int i;
1754 int slot; 1773 int slot;
1755 int ret; 1774 int ret;
@@ -1757,8 +1776,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
1757 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { 1776 for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
1758 slot = path->slots[i]; 1777 slot = path->slots[i];
1759 if (slot + 1 < btrfs_header_nritems(path->nodes[i])) { 1778 if (slot + 1 < btrfs_header_nritems(path->nodes[i])) {
1760 struct extent_buffer *node;
1761 node = path->nodes[i];
1762 path->slots[i]++; 1779 path->slots[i]++;
1763 *level = i; 1780 *level = i;
1764 WARN_ON(*level == 0); 1781 WARN_ON(*level == 0);
@@ -1771,7 +1788,6 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
1771 parent = path->nodes[*level + 1]; 1788 parent = path->nodes[*level + 1];
1772 1789
1773 root_owner = btrfs_header_owner(parent); 1790 root_owner = btrfs_header_owner(parent);
1774 root_gen = btrfs_header_generation(parent);
1775 wc->process_func(root, path->nodes[*level], wc, 1791 wc->process_func(root, path->nodes[*level], wc,
1776 btrfs_header_generation(path->nodes[*level])); 1792 btrfs_header_generation(path->nodes[*level]));
1777 if (wc->free) { 1793 if (wc->free) {
@@ -1815,7 +1831,8 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
1815 int orig_level; 1831 int orig_level;
1816 1832
1817 path = btrfs_alloc_path(); 1833 path = btrfs_alloc_path();
1818 BUG_ON(!path); 1834 if (!path)
1835 return -ENOMEM;
1819 1836
1820 level = btrfs_header_level(log->node); 1837 level = btrfs_header_level(log->node);
1821 orig_level = level; 1838 orig_level = level;
@@ -2045,6 +2062,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2045 wait_log_commit(trans, log_root_tree, 2062 wait_log_commit(trans, log_root_tree,
2046 log_root_tree->log_transid); 2063 log_root_tree->log_transid);
2047 mutex_unlock(&log_root_tree->log_mutex); 2064 mutex_unlock(&log_root_tree->log_mutex);
2065 ret = 0;
2048 goto out; 2066 goto out;
2049 } 2067 }
2050 atomic_set(&log_root_tree->log_commit[index2], 1); 2068 atomic_set(&log_root_tree->log_commit[index2], 1);
@@ -2091,7 +2109,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
2091 * the running transaction open, so a full commit can't hop 2109 * the running transaction open, so a full commit can't hop
2092 * in and cause problems either. 2110 * in and cause problems either.
2093 */ 2111 */
2112 btrfs_scrub_pause_super(root);
2094 write_ctree_super(trans, root->fs_info->tree_root, 1); 2113 write_ctree_super(trans, root->fs_info->tree_root, 1);
2114 btrfs_scrub_continue_super(root);
2095 ret = 0; 2115 ret = 0;
2096 2116
2097 mutex_lock(&root->log_mutex); 2117 mutex_lock(&root->log_mutex);
@@ -2109,7 +2129,7 @@ out:
2109 smp_mb(); 2129 smp_mb();
2110 if (waitqueue_active(&root->log_commit_wait[index1])) 2130 if (waitqueue_active(&root->log_commit_wait[index1]))
2111 wake_up(&root->log_commit_wait[index1]); 2131 wake_up(&root->log_commit_wait[index1]);
2112 return 0; 2132 return ret;
2113} 2133}
2114 2134
2115static void free_log_tree(struct btrfs_trans_handle *trans, 2135static void free_log_tree(struct btrfs_trans_handle *trans,
@@ -2195,6 +2215,7 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2195 int ret; 2215 int ret;
2196 int err = 0; 2216 int err = 0;
2197 int bytes_del = 0; 2217 int bytes_del = 0;
2218 u64 dir_ino = btrfs_ino(dir);
2198 2219
2199 if (BTRFS_I(dir)->logged_trans < trans->transid) 2220 if (BTRFS_I(dir)->logged_trans < trans->transid)
2200 return 0; 2221 return 0;
@@ -2207,7 +2228,12 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2207 2228
2208 log = root->log_root; 2229 log = root->log_root;
2209 path = btrfs_alloc_path(); 2230 path = btrfs_alloc_path();
2210 di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino, 2231 if (!path) {
2232 err = -ENOMEM;
2233 goto out_unlock;
2234 }
2235
2236 di = btrfs_lookup_dir_item(trans, log, path, dir_ino,
2211 name, name_len, -1); 2237 name, name_len, -1);
2212 if (IS_ERR(di)) { 2238 if (IS_ERR(di)) {
2213 err = PTR_ERR(di); 2239 err = PTR_ERR(di);
@@ -2218,8 +2244,8 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2218 bytes_del += name_len; 2244 bytes_del += name_len;
2219 BUG_ON(ret); 2245 BUG_ON(ret);
2220 } 2246 }
2221 btrfs_release_path(log, path); 2247 btrfs_release_path(path);
2222 di = btrfs_lookup_dir_index_item(trans, log, path, dir->i_ino, 2248 di = btrfs_lookup_dir_index_item(trans, log, path, dir_ino,
2223 index, name, name_len, -1); 2249 index, name, name_len, -1);
2224 if (IS_ERR(di)) { 2250 if (IS_ERR(di)) {
2225 err = PTR_ERR(di); 2251 err = PTR_ERR(di);
@@ -2237,10 +2263,10 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2237 if (bytes_del) { 2263 if (bytes_del) {
2238 struct btrfs_key key; 2264 struct btrfs_key key;
2239 2265
2240 key.objectid = dir->i_ino; 2266 key.objectid = dir_ino;
2241 key.offset = 0; 2267 key.offset = 0;
2242 key.type = BTRFS_INODE_ITEM_KEY; 2268 key.type = BTRFS_INODE_ITEM_KEY;
2243 btrfs_release_path(log, path); 2269 btrfs_release_path(path);
2244 2270
2245 ret = btrfs_search_slot(trans, log, &key, path, 0, 1); 2271 ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
2246 if (ret < 0) { 2272 if (ret < 0) {
@@ -2262,10 +2288,11 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2262 btrfs_mark_buffer_dirty(path->nodes[0]); 2288 btrfs_mark_buffer_dirty(path->nodes[0]);
2263 } else 2289 } else
2264 ret = 0; 2290 ret = 0;
2265 btrfs_release_path(log, path); 2291 btrfs_release_path(path);
2266 } 2292 }
2267fail: 2293fail:
2268 btrfs_free_path(path); 2294 btrfs_free_path(path);
2295out_unlock:
2269 mutex_unlock(&BTRFS_I(dir)->log_mutex); 2296 mutex_unlock(&BTRFS_I(dir)->log_mutex);
2270 if (ret == -ENOSPC) { 2297 if (ret == -ENOSPC) {
2271 root->fs_info->last_trans_log_full_commit = trans->transid; 2298 root->fs_info->last_trans_log_full_commit = trans->transid;
@@ -2273,7 +2300,7 @@ fail:
2273 } 2300 }
2274 btrfs_end_log_trans(root); 2301 btrfs_end_log_trans(root);
2275 2302
2276 return 0; 2303 return err;
2277} 2304}
2278 2305
2279/* see comments for btrfs_del_dir_entries_in_log */ 2306/* see comments for btrfs_del_dir_entries_in_log */
@@ -2295,7 +2322,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
2295 log = root->log_root; 2322 log = root->log_root;
2296 mutex_lock(&BTRFS_I(inode)->log_mutex); 2323 mutex_lock(&BTRFS_I(inode)->log_mutex);
2297 2324
2298 ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino, 2325 ret = btrfs_del_inode_ref(trans, log, name, name_len, btrfs_ino(inode),
2299 dirid, &index); 2326 dirid, &index);
2300 mutex_unlock(&BTRFS_I(inode)->log_mutex); 2327 mutex_unlock(&BTRFS_I(inode)->log_mutex);
2301 if (ret == -ENOSPC) { 2328 if (ret == -ENOSPC) {
@@ -2336,7 +2363,7 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
2336 struct btrfs_dir_log_item); 2363 struct btrfs_dir_log_item);
2337 btrfs_set_dir_log_end(path->nodes[0], item, last_offset); 2364 btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
2338 btrfs_mark_buffer_dirty(path->nodes[0]); 2365 btrfs_mark_buffer_dirty(path->nodes[0]);
2339 btrfs_release_path(log, path); 2366 btrfs_release_path(path);
2340 return 0; 2367 return 0;
2341} 2368}
2342 2369
@@ -2361,13 +2388,14 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2361 int nritems; 2388 int nritems;
2362 u64 first_offset = min_offset; 2389 u64 first_offset = min_offset;
2363 u64 last_offset = (u64)-1; 2390 u64 last_offset = (u64)-1;
2391 u64 ino = btrfs_ino(inode);
2364 2392
2365 log = root->log_root; 2393 log = root->log_root;
2366 max_key.objectid = inode->i_ino; 2394 max_key.objectid = ino;
2367 max_key.offset = (u64)-1; 2395 max_key.offset = (u64)-1;
2368 max_key.type = key_type; 2396 max_key.type = key_type;
2369 2397
2370 min_key.objectid = inode->i_ino; 2398 min_key.objectid = ino;
2371 min_key.type = key_type; 2399 min_key.type = key_type;
2372 min_key.offset = min_offset; 2400 min_key.offset = min_offset;
2373 2401
@@ -2380,18 +2408,17 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2380 * we didn't find anything from this transaction, see if there 2408 * we didn't find anything from this transaction, see if there
2381 * is anything at all 2409 * is anything at all
2382 */ 2410 */
2383 if (ret != 0 || min_key.objectid != inode->i_ino || 2411 if (ret != 0 || min_key.objectid != ino || min_key.type != key_type) {
2384 min_key.type != key_type) { 2412 min_key.objectid = ino;
2385 min_key.objectid = inode->i_ino;
2386 min_key.type = key_type; 2413 min_key.type = key_type;
2387 min_key.offset = (u64)-1; 2414 min_key.offset = (u64)-1;
2388 btrfs_release_path(root, path); 2415 btrfs_release_path(path);
2389 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 2416 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
2390 if (ret < 0) { 2417 if (ret < 0) {
2391 btrfs_release_path(root, path); 2418 btrfs_release_path(path);
2392 return ret; 2419 return ret;
2393 } 2420 }
2394 ret = btrfs_previous_item(root, path, inode->i_ino, key_type); 2421 ret = btrfs_previous_item(root, path, ino, key_type);
2395 2422
2396 /* if ret == 0 there are items for this type, 2423 /* if ret == 0 there are items for this type,
2397 * create a range to tell us the last key of this type. 2424 * create a range to tell us the last key of this type.
@@ -2409,7 +2436,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2409 } 2436 }
2410 2437
2411 /* go backward to find any previous key */ 2438 /* go backward to find any previous key */
2412 ret = btrfs_previous_item(root, path, inode->i_ino, key_type); 2439 ret = btrfs_previous_item(root, path, ino, key_type);
2413 if (ret == 0) { 2440 if (ret == 0) {
2414 struct btrfs_key tmp; 2441 struct btrfs_key tmp;
2415 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); 2442 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
@@ -2424,7 +2451,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2424 } 2451 }
2425 } 2452 }
2426 } 2453 }
2427 btrfs_release_path(root, path); 2454 btrfs_release_path(path);
2428 2455
2429 /* find the first key from this transaction again */ 2456 /* find the first key from this transaction again */
2430 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); 2457 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
@@ -2444,8 +2471,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2444 for (i = path->slots[0]; i < nritems; i++) { 2471 for (i = path->slots[0]; i < nritems; i++) {
2445 btrfs_item_key_to_cpu(src, &min_key, i); 2472 btrfs_item_key_to_cpu(src, &min_key, i);
2446 2473
2447 if (min_key.objectid != inode->i_ino || 2474 if (min_key.objectid != ino || min_key.type != key_type)
2448 min_key.type != key_type)
2449 goto done; 2475 goto done;
2450 ret = overwrite_item(trans, log, dst_path, src, i, 2476 ret = overwrite_item(trans, log, dst_path, src, i,
2451 &min_key); 2477 &min_key);
@@ -2466,7 +2492,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2466 goto done; 2492 goto done;
2467 } 2493 }
2468 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); 2494 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
2469 if (tmp.objectid != inode->i_ino || tmp.type != key_type) { 2495 if (tmp.objectid != ino || tmp.type != key_type) {
2470 last_offset = (u64)-1; 2496 last_offset = (u64)-1;
2471 goto done; 2497 goto done;
2472 } 2498 }
@@ -2482,8 +2508,8 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2482 } 2508 }
2483 } 2509 }
2484done: 2510done:
2485 btrfs_release_path(root, path); 2511 btrfs_release_path(path);
2486 btrfs_release_path(log, dst_path); 2512 btrfs_release_path(dst_path);
2487 2513
2488 if (err == 0) { 2514 if (err == 0) {
2489 *last_offset_ret = last_offset; 2515 *last_offset_ret = last_offset;
@@ -2492,8 +2518,7 @@ done:
2492 * is valid 2518 * is valid
2493 */ 2519 */
2494 ret = insert_dir_log_key(trans, log, path, key_type, 2520 ret = insert_dir_log_key(trans, log, path, key_type,
2495 inode->i_ino, first_offset, 2521 ino, first_offset, last_offset);
2496 last_offset);
2497 if (ret) 2522 if (ret)
2498 err = ret; 2523 err = ret;
2499 } 2524 }
@@ -2579,10 +2604,11 @@ static int drop_objectid_items(struct btrfs_trans_handle *trans,
2579 break; 2604 break;
2580 2605
2581 ret = btrfs_del_item(trans, log, path); 2606 ret = btrfs_del_item(trans, log, path);
2582 BUG_ON(ret); 2607 if (ret)
2583 btrfs_release_path(log, path); 2608 break;
2609 btrfs_release_path(path);
2584 } 2610 }
2585 btrfs_release_path(log, path); 2611 btrfs_release_path(path);
2586 return ret; 2612 return ret;
2587} 2613}
2588 2614
@@ -2607,6 +2633,9 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2607 2633
2608 ins_data = kmalloc(nr * sizeof(struct btrfs_key) + 2634 ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
2609 nr * sizeof(u32), GFP_NOFS); 2635 nr * sizeof(u32), GFP_NOFS);
2636 if (!ins_data)
2637 return -ENOMEM;
2638
2610 ins_sizes = (u32 *)ins_data; 2639 ins_sizes = (u32 *)ins_data;
2611 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); 2640 ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
2612 2641
@@ -2654,6 +2683,9 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2654 extent = btrfs_item_ptr(src, start_slot + i, 2683 extent = btrfs_item_ptr(src, start_slot + i,
2655 struct btrfs_file_extent_item); 2684 struct btrfs_file_extent_item);
2656 2685
2686 if (btrfs_file_extent_generation(src, extent) < trans->transid)
2687 continue;
2688
2657 found_type = btrfs_file_extent_type(src, extent); 2689 found_type = btrfs_file_extent_type(src, extent);
2658 if (found_type == BTRFS_FILE_EXTENT_REG || 2690 if (found_type == BTRFS_FILE_EXTENT_REG ||
2659 found_type == BTRFS_FILE_EXTENT_PREALLOC) { 2691 found_type == BTRFS_FILE_EXTENT_PREALLOC) {
@@ -2678,14 +2710,14 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
2678 ret = btrfs_lookup_csums_range( 2710 ret = btrfs_lookup_csums_range(
2679 log->fs_info->csum_root, 2711 log->fs_info->csum_root,
2680 ds + cs, ds + cs + cl - 1, 2712 ds + cs, ds + cs + cl - 1,
2681 &ordered_sums); 2713 &ordered_sums, 0);
2682 BUG_ON(ret); 2714 BUG_ON(ret);
2683 } 2715 }
2684 } 2716 }
2685 } 2717 }
2686 2718
2687 btrfs_mark_buffer_dirty(dst_path->nodes[0]); 2719 btrfs_mark_buffer_dirty(dst_path->nodes[0]);
2688 btrfs_release_path(log, dst_path); 2720 btrfs_release_path(dst_path);
2689 kfree(ins_data); 2721 kfree(ins_data);
2690 2722
2691 /* 2723 /*
@@ -2729,23 +2761,29 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
2729 struct btrfs_key max_key; 2761 struct btrfs_key max_key;
2730 struct btrfs_root *log = root->log_root; 2762 struct btrfs_root *log = root->log_root;
2731 struct extent_buffer *src = NULL; 2763 struct extent_buffer *src = NULL;
2732 u32 size;
2733 int err = 0; 2764 int err = 0;
2734 int ret; 2765 int ret;
2735 int nritems; 2766 int nritems;
2736 int ins_start_slot = 0; 2767 int ins_start_slot = 0;
2737 int ins_nr; 2768 int ins_nr;
2769 u64 ino = btrfs_ino(inode);
2738 2770
2739 log = root->log_root; 2771 log = root->log_root;
2740 2772
2741 path = btrfs_alloc_path(); 2773 path = btrfs_alloc_path();
2774 if (!path)
2775 return -ENOMEM;
2742 dst_path = btrfs_alloc_path(); 2776 dst_path = btrfs_alloc_path();
2777 if (!dst_path) {
2778 btrfs_free_path(path);
2779 return -ENOMEM;
2780 }
2743 2781
2744 min_key.objectid = inode->i_ino; 2782 min_key.objectid = ino;
2745 min_key.type = BTRFS_INODE_ITEM_KEY; 2783 min_key.type = BTRFS_INODE_ITEM_KEY;
2746 min_key.offset = 0; 2784 min_key.offset = 0;
2747 2785
2748 max_key.objectid = inode->i_ino; 2786 max_key.objectid = ino;
2749 2787
2750 /* today the code can only do partial logging of directories */ 2788 /* today the code can only do partial logging of directories */
2751 if (!S_ISDIR(inode->i_mode)) 2789 if (!S_ISDIR(inode->i_mode))
@@ -2757,6 +2795,13 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
2757 max_key.type = (u8)-1; 2795 max_key.type = (u8)-1;
2758 max_key.offset = (u64)-1; 2796 max_key.offset = (u64)-1;
2759 2797
2798 ret = btrfs_commit_inode_delayed_items(trans, inode);
2799 if (ret) {
2800 btrfs_free_path(path);
2801 btrfs_free_path(dst_path);
2802 return ret;
2803 }
2804
2760 mutex_lock(&BTRFS_I(inode)->log_mutex); 2805 mutex_lock(&BTRFS_I(inode)->log_mutex);
2761 2806
2762 /* 2807 /*
@@ -2768,8 +2813,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
2768 2813
2769 if (inode_only == LOG_INODE_EXISTS) 2814 if (inode_only == LOG_INODE_EXISTS)
2770 max_key_type = BTRFS_XATTR_ITEM_KEY; 2815 max_key_type = BTRFS_XATTR_ITEM_KEY;
2771 ret = drop_objectid_items(trans, log, path, 2816 ret = drop_objectid_items(trans, log, path, ino, max_key_type);
2772 inode->i_ino, max_key_type);
2773 } else { 2817 } else {
2774 ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0); 2818 ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0);
2775 } 2819 }
@@ -2787,13 +2831,12 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
2787 break; 2831 break;
2788again: 2832again:
2789 /* note, ins_nr might be > 0 here, cleanup outside the loop */ 2833 /* note, ins_nr might be > 0 here, cleanup outside the loop */
2790 if (min_key.objectid != inode->i_ino) 2834 if (min_key.objectid != ino)
2791 break; 2835 break;
2792 if (min_key.type > max_key.type) 2836 if (min_key.type > max_key.type)
2793 break; 2837 break;
2794 2838
2795 src = path->nodes[0]; 2839 src = path->nodes[0];
2796 size = btrfs_item_size_nr(src, path->slots[0]);
2797 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { 2840 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
2798 ins_nr++; 2841 ins_nr++;
2799 goto next_slot; 2842 goto next_slot;
@@ -2830,7 +2873,7 @@ next_slot:
2830 } 2873 }
2831 ins_nr = 0; 2874 ins_nr = 0;
2832 } 2875 }
2833 btrfs_release_path(root, path); 2876 btrfs_release_path(path);
2834 2877
2835 if (min_key.offset < (u64)-1) 2878 if (min_key.offset < (u64)-1)
2836 min_key.offset++; 2879 min_key.offset++;
@@ -2853,8 +2896,8 @@ next_slot:
2853 } 2896 }
2854 WARN_ON(ins_nr); 2897 WARN_ON(ins_nr);
2855 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { 2898 if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
2856 btrfs_release_path(root, path); 2899 btrfs_release_path(path);
2857 btrfs_release_path(log, dst_path); 2900 btrfs_release_path(dst_path);
2858 ret = log_directory_changes(trans, root, inode, path, dst_path); 2901 ret = log_directory_changes(trans, root, inode, path, dst_path);
2859 if (ret) { 2902 if (ret) {
2860 err = ret; 2903 err = ret;
@@ -2884,6 +2927,7 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
2884{ 2927{
2885 int ret = 0; 2928 int ret = 0;
2886 struct btrfs_root *root; 2929 struct btrfs_root *root;
2930 struct dentry *old_parent = NULL;
2887 2931
2888 /* 2932 /*
2889 * for regular files, if its inode is already on disk, we don't 2933 * for regular files, if its inode is already on disk, we don't
@@ -2925,10 +2969,13 @@ static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
2925 if (IS_ROOT(parent)) 2969 if (IS_ROOT(parent))
2926 break; 2970 break;
2927 2971
2928 parent = parent->d_parent; 2972 parent = dget_parent(parent);
2973 dput(old_parent);
2974 old_parent = parent;
2929 inode = parent->d_inode; 2975 inode = parent->d_inode;
2930 2976
2931 } 2977 }
2978 dput(old_parent);
2932out: 2979out:
2933 return ret; 2980 return ret;
2934} 2981}
@@ -2960,6 +3007,7 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
2960{ 3007{
2961 int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL; 3008 int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
2962 struct super_block *sb; 3009 struct super_block *sb;
3010 struct dentry *old_parent = NULL;
2963 int ret = 0; 3011 int ret = 0;
2964 u64 last_committed = root->fs_info->last_trans_committed; 3012 u64 last_committed = root->fs_info->last_trans_committed;
2965 3013
@@ -3031,10 +3079,13 @@ int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
3031 if (IS_ROOT(parent)) 3079 if (IS_ROOT(parent))
3032 break; 3080 break;
3033 3081
3034 parent = parent->d_parent; 3082 parent = dget_parent(parent);
3083 dput(old_parent);
3084 old_parent = parent;
3035 } 3085 }
3036 ret = 0; 3086 ret = 0;
3037end_trans: 3087end_trans:
3088 dput(old_parent);
3038 if (ret < 0) { 3089 if (ret < 0) {
3039 BUG_ON(ret != -ENOSPC); 3090 BUG_ON(ret != -ENOSPC);
3040 root->fs_info->last_trans_log_full_commit = trans->transid; 3091 root->fs_info->last_trans_log_full_commit = trans->transid;
@@ -3054,8 +3105,13 @@ end_no_trans:
3054int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, 3105int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
3055 struct btrfs_root *root, struct dentry *dentry) 3106 struct btrfs_root *root, struct dentry *dentry)
3056{ 3107{
3057 return btrfs_log_inode_parent(trans, root, dentry->d_inode, 3108 struct dentry *parent = dget_parent(dentry);
3058 dentry->d_parent, 0); 3109 int ret;
3110
3111 ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent, 0);
3112 dput(parent);
3113
3114 return ret;
3059} 3115}
3060 3116
3061/* 3117/*
@@ -3077,16 +3133,20 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
3077 .stage = 0, 3133 .stage = 0,
3078 }; 3134 };
3079 3135
3080 fs_info->log_root_recovering = 1;
3081 path = btrfs_alloc_path(); 3136 path = btrfs_alloc_path();
3082 BUG_ON(!path); 3137 if (!path)
3138 return -ENOMEM;
3139
3140 fs_info->log_root_recovering = 1;
3083 3141
3084 trans = btrfs_start_transaction(fs_info->tree_root, 0); 3142 trans = btrfs_start_transaction(fs_info->tree_root, 0);
3143 BUG_ON(IS_ERR(trans));
3085 3144
3086 wc.trans = trans; 3145 wc.trans = trans;
3087 wc.pin = 1; 3146 wc.pin = 1;
3088 3147
3089 walk_log_tree(trans, log_root_tree, &wc); 3148 ret = walk_log_tree(trans, log_root_tree, &wc);
3149 BUG_ON(ret);
3090 3150
3091again: 3151again:
3092 key.objectid = BTRFS_TREE_LOG_OBJECTID; 3152 key.objectid = BTRFS_TREE_LOG_OBJECTID;
@@ -3104,21 +3164,20 @@ again:
3104 } 3164 }
3105 btrfs_item_key_to_cpu(path->nodes[0], &found_key, 3165 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
3106 path->slots[0]); 3166 path->slots[0]);
3107 btrfs_release_path(log_root_tree, path); 3167 btrfs_release_path(path);
3108 if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID) 3168 if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
3109 break; 3169 break;
3110 3170
3111 log = btrfs_read_fs_root_no_radix(log_root_tree, 3171 log = btrfs_read_fs_root_no_radix(log_root_tree,
3112 &found_key); 3172 &found_key);
3113 BUG_ON(!log); 3173 BUG_ON(IS_ERR(log));
3114
3115 3174
3116 tmp_key.objectid = found_key.offset; 3175 tmp_key.objectid = found_key.offset;
3117 tmp_key.type = BTRFS_ROOT_ITEM_KEY; 3176 tmp_key.type = BTRFS_ROOT_ITEM_KEY;
3118 tmp_key.offset = (u64)-1; 3177 tmp_key.offset = (u64)-1;
3119 3178
3120 wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key); 3179 wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key);
3121 BUG_ON(!wc.replay_dest); 3180 BUG_ON(IS_ERR_OR_NULL(wc.replay_dest));
3122 3181
3123 wc.replay_dest->log_root = log; 3182 wc.replay_dest->log_root = log;
3124 btrfs_record_root_in_trans(trans, wc.replay_dest); 3183 btrfs_record_root_in_trans(trans, wc.replay_dest);
@@ -3140,7 +3199,7 @@ again:
3140 if (found_key.offset == 0) 3199 if (found_key.offset == 0)
3141 break; 3200 break;
3142 } 3201 }
3143 btrfs_release_path(log_root_tree, path); 3202 btrfs_release_path(path);
3144 3203
3145 /* step one is to pin it all, step two is to replay just inodes */ 3204 /* step one is to pin it all, step two is to replay just inodes */
3146 if (wc.pin) { 3205 if (wc.pin) {
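The dget_parent() conversions in the two loops above follow the standard VFS
pattern for walking up a dentry chain: take a reference on each parent before
using it and drop the reference on the previous one, instead of chasing the
unreferenced d_parent pointer. A generic sketch of that loop (not
btrfs-specific):

    struct dentry *parent = dget_parent(dentry);
    struct dentry *old_parent;

    while (!IS_ROOT(parent)) {
            /* ... examine parent->d_inode ... */
            old_parent = parent;
            parent = dget_parent(parent);
            dput(old_parent);
    }
    dput(parent);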
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 3dfae84c8cc8..2270ac58d746 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -38,7 +38,6 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
38 struct btrfs_root *root, 38 struct btrfs_root *root,
39 const char *name, int name_len, 39 const char *name, int name_len,
40 struct inode *inode, u64 dirid); 40 struct inode *inode, u64 dirid);
41int btrfs_join_running_log_trans(struct btrfs_root *root);
42int btrfs_end_log_trans(struct btrfs_root *root); 41int btrfs_end_log_trans(struct btrfs_root *root);
43int btrfs_pin_log_trans(struct btrfs_root *root); 42int btrfs_pin_log_trans(struct btrfs_root *root);
44int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, 43int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/version.sh b/fs/btrfs/version.sh
deleted file mode 100644
index 1ca1952fd917..000000000000
--- a/fs/btrfs/version.sh
+++ /dev/null
@@ -1,43 +0,0 @@
1#!/bin/bash
2#
3# determine-version -- report a useful version for releases
4#
5# Copyright 2008, Aron Griffis <agriffis@n01se.net>
6# Copyright 2008, Oracle
7# Released under the GNU GPLv2
8
9v="v0.16"
10
11which git &> /dev/null
12if [ $? == 0 ]; then
13 git branch >& /dev/null
14 if [ $? == 0 ]; then
15 if head=`git rev-parse --verify HEAD 2>/dev/null`; then
16 if tag=`git describe --tags 2>/dev/null`; then
17 v="$tag"
18 fi
19
20 # Are there uncommitted changes?
21 git update-index --refresh --unmerged > /dev/null
22 if git diff-index --name-only HEAD | \
23 grep -v "^scripts/package" \
24 | read dummy; then
25 v="$v"-dirty
26 fi
27 fi
28 fi
29fi
30
31echo "#ifndef __BUILD_VERSION" > .build-version.h
32echo "#define __BUILD_VERSION" >> .build-version.h
33echo "#define BTRFS_BUILD_VERSION \"Btrfs $v\"" >> .build-version.h
34echo "#endif" >> .build-version.h
35
36diff -q version.h .build-version.h >& /dev/null
37
38if [ $? == 0 ]; then
39 rm .build-version.h
40 exit 0
41fi
42
43mv .build-version.h version.h
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index dd318ff280b2..19450bc53632 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -22,6 +22,7 @@
22#include <linux/blkdev.h> 22#include <linux/blkdev.h>
23#include <linux/random.h> 23#include <linux/random.h>
24#include <linux/iocontext.h> 24#include <linux/iocontext.h>
25#include <linux/capability.h>
25#include <asm/div64.h> 26#include <asm/div64.h>
26#include "compat.h" 27#include "compat.h"
27#include "ctree.h" 28#include "ctree.h"
@@ -32,38 +33,14 @@
32#include "volumes.h" 33#include "volumes.h"
33#include "async-thread.h" 34#include "async-thread.h"
34 35
35struct map_lookup {
36 u64 type;
37 int io_align;
38 int io_width;
39 int stripe_len;
40 int sector_size;
41 int num_stripes;
42 int sub_stripes;
43 struct btrfs_bio_stripe stripes[];
44};
45
46static int init_first_rw_device(struct btrfs_trans_handle *trans, 36static int init_first_rw_device(struct btrfs_trans_handle *trans,
47 struct btrfs_root *root, 37 struct btrfs_root *root,
48 struct btrfs_device *device); 38 struct btrfs_device *device);
49static int btrfs_relocate_sys_chunks(struct btrfs_root *root); 39static int btrfs_relocate_sys_chunks(struct btrfs_root *root);
50 40
51#define map_lookup_size(n) (sizeof(struct map_lookup) + \
52 (sizeof(struct btrfs_bio_stripe) * (n)))
53
54static DEFINE_MUTEX(uuid_mutex); 41static DEFINE_MUTEX(uuid_mutex);
55static LIST_HEAD(fs_uuids); 42static LIST_HEAD(fs_uuids);
56 43
57void btrfs_lock_volumes(void)
58{
59 mutex_lock(&uuid_mutex);
60}
61
62void btrfs_unlock_volumes(void)
63{
64 mutex_unlock(&uuid_mutex);
65}
66
67static void lock_chunks(struct btrfs_root *root) 44static void lock_chunks(struct btrfs_root *root)
68{ 45{
69 mutex_lock(&root->fs_info->chunk_mutex); 46 mutex_lock(&root->fs_info->chunk_mutex);
@@ -161,22 +138,25 @@ static noinline int run_scheduled_bios(struct btrfs_device *device)
161 struct bio *cur; 138 struct bio *cur;
162 int again = 0; 139 int again = 0;
163 unsigned long num_run; 140 unsigned long num_run;
164 unsigned long num_sync_run;
165 unsigned long batch_run = 0; 141 unsigned long batch_run = 0;
166 unsigned long limit; 142 unsigned long limit;
167 unsigned long last_waited = 0; 143 unsigned long last_waited = 0;
168 int force_reg = 0; 144 int force_reg = 0;
145 struct blk_plug plug;
146
147 /*
148 * this function runs all the bios we've collected for
149 * a particular device. We don't want to wander off to
150 * another device without first sending all of these down.
151 * So, set up a plug here and finish it off before we return
152 */
153 blk_start_plug(&plug);
169 154
170 bdi = blk_get_backing_dev_info(device->bdev); 155 bdi = blk_get_backing_dev_info(device->bdev);
171 fs_info = device->dev_root->fs_info; 156 fs_info = device->dev_root->fs_info;
172 limit = btrfs_async_submit_limit(fs_info); 157 limit = btrfs_async_submit_limit(fs_info);
173 limit = limit * 2 / 3; 158 limit = limit * 2 / 3;
174 159
175 /* we want to make sure that every time we switch from the sync
176 * list to the normal list, we unplug
177 */
178 num_sync_run = 0;
179
180loop: 160loop:
181 spin_lock(&device->io_lock); 161 spin_lock(&device->io_lock);
182 162
@@ -222,15 +202,6 @@ loop_lock:
222 202
223 spin_unlock(&device->io_lock); 203 spin_unlock(&device->io_lock);
224 204
225 /*
226 * if we're doing the regular priority list, make sure we unplug
227 * for any high prio bios we've sent down
228 */
229 if (pending_bios == &device->pending_bios && num_sync_run > 0) {
230 num_sync_run = 0;
231 blk_run_backing_dev(bdi, NULL);
232 }
233
234 while (pending) { 205 while (pending) {
235 206
236 rmb(); 207 rmb();
@@ -258,19 +229,11 @@ loop_lock:
258 229
259 BUG_ON(atomic_read(&cur->bi_cnt) == 0); 230 BUG_ON(atomic_read(&cur->bi_cnt) == 0);
260 231
261 if (cur->bi_rw & REQ_SYNC)
262 num_sync_run++;
263
264 submit_bio(cur->bi_rw, cur); 232 submit_bio(cur->bi_rw, cur);
265 num_run++; 233 num_run++;
266 batch_run++; 234 batch_run++;
267 if (need_resched()) { 235 if (need_resched())
268 if (num_sync_run) {
269 blk_run_backing_dev(bdi, NULL);
270 num_sync_run = 0;
271 }
272 cond_resched(); 236 cond_resched();
273 }
274 237
275 /* 238 /*
276 * we made progress, there is more work to do and the bdi 239 * we made progress, there is more work to do and the bdi
@@ -303,13 +266,8 @@ loop_lock:
303 * against it before looping 266 * against it before looping
304 */ 267 */
305 last_waited = ioc->last_waited; 268 last_waited = ioc->last_waited;
306 if (need_resched()) { 269 if (need_resched())
307 if (num_sync_run) {
308 blk_run_backing_dev(bdi, NULL);
309 num_sync_run = 0;
310 }
311 cond_resched(); 270 cond_resched();
312 }
313 continue; 271 continue;
314 } 272 }
315 spin_lock(&device->io_lock); 273 spin_lock(&device->io_lock);
@@ -322,22 +280,6 @@ loop_lock:
322 } 280 }
323 } 281 }
324 282
325 if (num_sync_run) {
326 num_sync_run = 0;
327 blk_run_backing_dev(bdi, NULL);
328 }
329 /*
330 * IO has already been through a long path to get here. Checksumming,
331 * async helper threads, perhaps compression. We've done a pretty
332 * good job of collecting a batch of IO and should just unplug
333 * the device right away.
334 *
335 * This will help anyone who is waiting on the IO, they might have
336 * already unplugged, but managed to do so before the bio they
337 * cared about found its way down here.
338 */
339 blk_run_backing_dev(bdi, NULL);
340
341 cond_resched(); 283 cond_resched();
342 if (again) 284 if (again)
343 goto loop; 285 goto loop;
@@ -348,6 +290,7 @@ loop_lock:
348 spin_unlock(&device->io_lock); 290 spin_unlock(&device->io_lock);
349 291
350done: 292done:
293 blk_finish_plug(&plug);
351 return 0; 294 return 0;
352} 295}
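
The hunks above replace btrfs' hand-rolled unplugging with the block layer's on-stack plugging API. A minimal sketch of the pattern under the 2.6.39-era submit_bio(rw, bio) signature (submit_bio_batch is a hypothetical helper, not part of this patch):

    #include <linux/bio.h>
    #include <linux/blkdev.h>

    /* Bios submitted between blk_start_plug() and blk_finish_plug() are
     * queued on the on-stack plug and flushed to the device in one batch,
     * which is why run_scheduled_bios() no longer unplugs by hand. */
    static void submit_bio_batch(struct bio **bios, int nr)
    {
    	struct blk_plug plug;
    	int i;

    	blk_start_plug(&plug);
    	for (i = 0; i < nr; i++)
    		submit_bio(bios[i]->bi_rw, bios[i]);
    	blk_finish_plug(&plug);
    }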
353 296
@@ -398,7 +341,6 @@ static noinline int device_list_add(const char *path,
398 device->work.func = pending_bios_fn; 341 device->work.func = pending_bios_fn;
399 memcpy(device->uuid, disk_super->dev_item.uuid, 342 memcpy(device->uuid, disk_super->dev_item.uuid,
400 BTRFS_UUID_SIZE); 343 BTRFS_UUID_SIZE);
401 device->barriers = 1;
402 spin_lock_init(&device->io_lock); 344 spin_lock_init(&device->io_lock);
403 device->name = kstrdup(path, GFP_NOFS); 345 device->name = kstrdup(path, GFP_NOFS);
404 if (!device->name) { 346 if (!device->name) {
@@ -408,17 +350,21 @@ static noinline int device_list_add(const char *path,
408 INIT_LIST_HEAD(&device->dev_alloc_list); 350 INIT_LIST_HEAD(&device->dev_alloc_list);
409 351
410 mutex_lock(&fs_devices->device_list_mutex); 352 mutex_lock(&fs_devices->device_list_mutex);
411 list_add(&device->dev_list, &fs_devices->devices); 353 list_add_rcu(&device->dev_list, &fs_devices->devices);
412 mutex_unlock(&fs_devices->device_list_mutex); 354 mutex_unlock(&fs_devices->device_list_mutex);
413 355
414 device->fs_devices = fs_devices; 356 device->fs_devices = fs_devices;
415 fs_devices->num_devices++; 357 fs_devices->num_devices++;
416 } else if (strcmp(device->name, path)) { 358 } else if (!device->name || strcmp(device->name, path)) {
417 name = kstrdup(path, GFP_NOFS); 359 name = kstrdup(path, GFP_NOFS);
418 if (!name) 360 if (!name)
419 return -ENOMEM; 361 return -ENOMEM;
420 kfree(device->name); 362 kfree(device->name);
421 device->name = name; 363 device->name = name;
364 if (device->missing) {
365 fs_devices->missing_devices--;
366 device->missing = 0;
367 }
422 } 368 }
423 369
424 if (found_transid > fs_devices->latest_trans) { 370 if (found_transid > fs_devices->latest_trans) {
@@ -447,7 +393,7 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
447 fs_devices->latest_trans = orig->latest_trans; 393 fs_devices->latest_trans = orig->latest_trans;
448 memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid)); 394 memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid));
449 395
450 mutex_lock(&orig->device_list_mutex); 396 /* We hold the volume lock, so it is safe to get the devices. */
451 list_for_each_entry(orig_dev, &orig->devices, dev_list) { 397 list_for_each_entry(orig_dev, &orig->devices, dev_list) {
452 device = kzalloc(sizeof(*device), GFP_NOFS); 398 device = kzalloc(sizeof(*device), GFP_NOFS);
453 if (!device) 399 if (!device)
@@ -462,7 +408,6 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
462 device->devid = orig_dev->devid; 408 device->devid = orig_dev->devid;
463 device->work.func = pending_bios_fn; 409 device->work.func = pending_bios_fn;
464 memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid)); 410 memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid));
465 device->barriers = 1;
466 spin_lock_init(&device->io_lock); 411 spin_lock_init(&device->io_lock);
467 INIT_LIST_HEAD(&device->dev_list); 412 INIT_LIST_HEAD(&device->dev_list);
468 INIT_LIST_HEAD(&device->dev_alloc_list); 413 INIT_LIST_HEAD(&device->dev_alloc_list);
@@ -471,10 +416,8 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
471 device->fs_devices = fs_devices; 416 device->fs_devices = fs_devices;
472 fs_devices->num_devices++; 417 fs_devices->num_devices++;
473 } 418 }
474 mutex_unlock(&orig->device_list_mutex);
475 return fs_devices; 419 return fs_devices;
476error: 420error:
477 mutex_unlock(&orig->device_list_mutex);
478 free_fs_devices(fs_devices); 421 free_fs_devices(fs_devices);
479 return ERR_PTR(-ENOMEM); 422 return ERR_PTR(-ENOMEM);
480} 423}
@@ -485,13 +428,13 @@ int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices)
485 428
486 mutex_lock(&uuid_mutex); 429 mutex_lock(&uuid_mutex);
487again: 430again:
488 mutex_lock(&fs_devices->device_list_mutex); 431 /* This is the initialization path, so it is safe to release the devices. */
489 list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { 432 list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
490 if (device->in_fs_metadata) 433 if (device->in_fs_metadata)
491 continue; 434 continue;
492 435
493 if (device->bdev) { 436 if (device->bdev) {
494 close_bdev_exclusive(device->bdev, device->mode); 437 blkdev_put(device->bdev, device->mode);
495 device->bdev = NULL; 438 device->bdev = NULL;
496 fs_devices->open_devices--; 439 fs_devices->open_devices--;
497 } 440 }
@@ -505,7 +448,6 @@ again:
505 kfree(device->name); 448 kfree(device->name);
506 kfree(device); 449 kfree(device);
507 } 450 }
508 mutex_unlock(&fs_devices->device_list_mutex);
509 451
510 if (fs_devices->seed) { 452 if (fs_devices->seed) {
511 fs_devices = fs_devices->seed; 453 fs_devices = fs_devices->seed;
@@ -516,6 +458,29 @@ again:
516 return 0; 458 return 0;
517} 459}
518 460
461static void __free_device(struct work_struct *work)
462{
463 struct btrfs_device *device;
464
465 device = container_of(work, struct btrfs_device, rcu_work);
466
467 if (device->bdev)
468 blkdev_put(device->bdev, device->mode);
469
470 kfree(device->name);
471 kfree(device);
472}
473
474static void free_device(struct rcu_head *head)
475{
476 struct btrfs_device *device;
477
478 device = container_of(head, struct btrfs_device, rcu);
479
480 INIT_WORK(&device->rcu_work, __free_device);
481 schedule_work(&device->rcu_work);
482}
483
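These helpers exist because blkdev_put() may sleep while RCU callbacks run in atomic context, so free_device() only bounces the real teardown to a workqueue. What the conversion buys is lock-free readers; a sketch of the reader side (count_devices_rcu is hypothetical, assuming the 2.6.39-era structures):

    #include <linux/rculist.h>

    /* Safe against concurrent removal: writers use list_del_rcu() /
     * list_replace_rcu() and free through call_rcu(), as above. */
    static int count_devices_rcu(struct btrfs_fs_devices *fs_devices)
    {
    	struct btrfs_device *device;
    	int n = 0;

    	rcu_read_lock();
    	list_for_each_entry_rcu(device, &fs_devices->devices, dev_list)
    		n++;
    	rcu_read_unlock();
    	return n;
    }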
519static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) 484static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
520{ 485{
521 struct btrfs_device *device; 486 struct btrfs_device *device;
@@ -523,20 +488,32 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
523 if (--fs_devices->opened > 0) 488 if (--fs_devices->opened > 0)
524 return 0; 489 return 0;
525 490
491 mutex_lock(&fs_devices->device_list_mutex);
526 list_for_each_entry(device, &fs_devices->devices, dev_list) { 492 list_for_each_entry(device, &fs_devices->devices, dev_list) {
527 if (device->bdev) { 493 struct btrfs_device *new_device;
528 close_bdev_exclusive(device->bdev, device->mode); 494
495 if (device->bdev)
529 fs_devices->open_devices--; 496 fs_devices->open_devices--;
530 } 497
531 if (device->writeable) { 498 if (device->writeable) {
532 list_del_init(&device->dev_alloc_list); 499 list_del_init(&device->dev_alloc_list);
533 fs_devices->rw_devices--; 500 fs_devices->rw_devices--;
534 } 501 }
535 502
536 device->bdev = NULL; 503 new_device = kmalloc(sizeof(*new_device), GFP_NOFS);
537 device->writeable = 0; 504 BUG_ON(!new_device);
538 device->in_fs_metadata = 0; 505 memcpy(new_device, device, sizeof(*new_device));
506 new_device->name = kstrdup(device->name, GFP_NOFS);
507 BUG_ON(device->name && !new_device->name);
508 new_device->bdev = NULL;
509 new_device->writeable = 0;
510 new_device->in_fs_metadata = 0;
511 list_replace_rcu(&device->dev_list, &new_device->dev_list);
512
513 call_rcu(&device->rcu, free_device);
539 } 514 }
515 mutex_unlock(&fs_devices->device_list_mutex);
516
540 WARN_ON(fs_devices->open_devices); 517 WARN_ON(fs_devices->open_devices);
541 WARN_ON(fs_devices->rw_devices); 518 WARN_ON(fs_devices->rw_devices);
542 fs_devices->opened = 0; 519 fs_devices->opened = 0;
@@ -582,13 +559,15 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
582 int seeding = 1; 559 int seeding = 1;
583 int ret = 0; 560 int ret = 0;
584 561
562 flags |= FMODE_EXCL;
563
585 list_for_each_entry(device, head, dev_list) { 564 list_for_each_entry(device, head, dev_list) {
586 if (device->bdev) 565 if (device->bdev)
587 continue; 566 continue;
588 if (!device->name) 567 if (!device->name)
589 continue; 568 continue;
590 569
591 bdev = open_bdev_exclusive(device->name, flags, holder); 570 bdev = blkdev_get_by_path(device->name, flags, holder);
592 if (IS_ERR(bdev)) { 571 if (IS_ERR(bdev)) {
593 printk(KERN_INFO "open %s failed\n", device->name); 572 printk(KERN_INFO "open %s failed\n", device->name);
594 goto error; 573 goto error;
@@ -596,8 +575,10 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
596 set_blocksize(bdev, 4096); 575 set_blocksize(bdev, 4096);
597 576
598 bh = btrfs_read_dev_super(bdev); 577 bh = btrfs_read_dev_super(bdev);
599 if (!bh) 578 if (!bh) {
579 ret = -EINVAL;
600 goto error_close; 580 goto error_close;
581 }
601 582
602 disk_super = (struct btrfs_super_block *)bh->b_data; 583 disk_super = (struct btrfs_super_block *)bh->b_data;
603 devid = btrfs_stack_device_id(&disk_super->dev_item); 584 devid = btrfs_stack_device_id(&disk_super->dev_item);
@@ -635,12 +616,13 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
635 list_add(&device->dev_alloc_list, 616 list_add(&device->dev_alloc_list,
636 &fs_devices->alloc_list); 617 &fs_devices->alloc_list);
637 } 618 }
619 brelse(bh);
638 continue; 620 continue;
639 621
640error_brelse: 622error_brelse:
641 brelse(bh); 623 brelse(bh);
642error_close: 624error_close:
643 close_bdev_exclusive(bdev, FMODE_READ); 625 blkdev_put(bdev, flags);
644error: 626error:
645 continue; 627 continue;
646 } 628 }
@@ -686,7 +668,8 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
686 668
687 mutex_lock(&uuid_mutex); 669 mutex_lock(&uuid_mutex);
688 670
689 bdev = open_bdev_exclusive(path, flags, holder); 671 flags |= FMODE_EXCL;
672 bdev = blkdev_get_by_path(path, flags, holder);
690 673
691 if (IS_ERR(bdev)) { 674 if (IS_ERR(bdev)) {
692 ret = PTR_ERR(bdev); 675 ret = PTR_ERR(bdev);
@@ -698,7 +681,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
698 goto error_close; 681 goto error_close;
699 bh = btrfs_read_dev_super(bdev); 682 bh = btrfs_read_dev_super(bdev);
700 if (!bh) { 683 if (!bh) {
701 ret = -EIO; 684 ret = -EINVAL;
702 goto error_close; 685 goto error_close;
703 } 686 }
704 disk_super = (struct btrfs_super_block *)bh->b_data; 687 disk_super = (struct btrfs_super_block *)bh->b_data;
@@ -706,77 +689,178 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
706 transid = btrfs_super_generation(disk_super); 689 transid = btrfs_super_generation(disk_super);
707 if (disk_super->label[0]) 690 if (disk_super->label[0])
708 printk(KERN_INFO "device label %s ", disk_super->label); 691 printk(KERN_INFO "device label %s ", disk_super->label);
709 else { 692 else
710 /* FIXME, make a readl uuid parser */ 693 printk(KERN_INFO "device fsid %pU ", disk_super->fsid);
711 printk(KERN_INFO "device fsid %llx-%llx ",
712 *(unsigned long long *)disk_super->fsid,
713 *(unsigned long long *)(disk_super->fsid + 8));
714 }
715 printk(KERN_CONT "devid %llu transid %llu %s\n", 694 printk(KERN_CONT "devid %llu transid %llu %s\n",
716 (unsigned long long)devid, (unsigned long long)transid, path); 695 (unsigned long long)devid, (unsigned long long)transid, path);
717 ret = device_list_add(path, disk_super, devid, fs_devices_ret); 696 ret = device_list_add(path, disk_super, devid, fs_devices_ret);
718 697
719 brelse(bh); 698 brelse(bh);
720error_close: 699error_close:
721 close_bdev_exclusive(bdev, flags); 700 blkdev_put(bdev, flags);
722error: 701error:
723 mutex_unlock(&uuid_mutex); 702 mutex_unlock(&uuid_mutex);
724 return ret; 703 return ret;
725} 704}
726 705
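Throughout this patch, open_bdev_exclusive()/close_bdev_exclusive() are converted to the blkdev_get_by_path()/blkdev_put() API, with exclusivity expressed via FMODE_EXCL. A sketch of the pairing (probe_device is hypothetical; error paths trimmed):

    #include <linux/err.h>
    #include <linux/fs.h>

    /* 'holder' identifies the exclusive owner; the mode passed to
     * blkdev_put() must include the same FMODE_EXCL used at open time. */
    static int probe_device(const char *path, void *holder)
    {
    	struct block_device *bdev;

    	bdev = blkdev_get_by_path(path, FMODE_READ | FMODE_EXCL, holder);
    	if (IS_ERR(bdev))
    		return PTR_ERR(bdev);
    	/* ... read and verify the super block here ... */
    	blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
    	return 0;
    }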
706/* helper to account the used device space in the range */
707int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
708 u64 end, u64 *length)
709{
710 struct btrfs_key key;
711 struct btrfs_root *root = device->dev_root;
712 struct btrfs_dev_extent *dev_extent;
713 struct btrfs_path *path;
714 u64 extent_end;
715 int ret;
716 int slot;
717 struct extent_buffer *l;
718
719 *length = 0;
720
721 if (start >= device->total_bytes)
722 return 0;
723
724 path = btrfs_alloc_path();
725 if (!path)
726 return -ENOMEM;
727 path->reada = 2;
728
729 key.objectid = device->devid;
730 key.offset = start;
731 key.type = BTRFS_DEV_EXTENT_KEY;
732
733 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
734 if (ret < 0)
735 goto out;
736 if (ret > 0) {
737 ret = btrfs_previous_item(root, path, key.objectid, key.type);
738 if (ret < 0)
739 goto out;
740 }
741
742 while (1) {
743 l = path->nodes[0];
744 slot = path->slots[0];
745 if (slot >= btrfs_header_nritems(l)) {
746 ret = btrfs_next_leaf(root, path);
747 if (ret == 0)
748 continue;
749 if (ret < 0)
750 goto out;
751
752 break;
753 }
754 btrfs_item_key_to_cpu(l, &key, slot);
755
756 if (key.objectid < device->devid)
757 goto next;
758
759 if (key.objectid > device->devid)
760 break;
761
762 if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
763 goto next;
764
765 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
766 extent_end = key.offset + btrfs_dev_extent_length(l,
767 dev_extent);
768 if (key.offset <= start && extent_end > end) {
769 *length = end - start + 1;
770 break;
771 } else if (key.offset <= start && extent_end > start)
772 *length += extent_end - start;
773 else if (key.offset > start && extent_end <= end)
774 *length += extent_end - key.offset;
775 else if (key.offset > start && key.offset <= end) {
776 *length += end - key.offset + 1;
777 break;
778 } else if (key.offset > end)
779 break;
780
781next:
782 path->slots[0]++;
783 }
784 ret = 0;
785out:
786 btrfs_free_path(path);
787 return ret;
788}
789
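The four branches in the loop above are clipped interval overlap between the inclusive query range [start, end] and each dev extent [key.offset, extent_end). Reduced to a single hypothetical helper for clarity (not part of the patch):

    #include <linux/kernel.h>

    /* Length of the overlap between the inclusive range [start, end]
     * and the half-open extent [ext_start, ext_end); 0 if disjoint. */
    static u64 dev_extent_overlap(u64 start, u64 end,
    			      u64 ext_start, u64 ext_end)
    {
    	u64 lo = max(start, ext_start);
    	u64 hi = min(end + 1, ext_end);

    	return hi > lo ? hi - lo : 0;
    }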
727/* 790/*
791 * find_free_dev_extent - find free space in the specified device
792 * @trans: transaction handle
793 * @device: the device in which we search for free space
794 * @num_bytes: the size of the free space that we need
795 * @start: stores the start of the free space
796 * @len: the size of the free space that we find, or the size of the max
797 * free space if we don't find suitable free space
798 *
728 * this uses a pretty simple search, the expectation is that it is 799 * this uses a pretty simple search, the expectation is that it is
729 * called very infrequently and that a given device has a small number 800 * called very infrequently and that a given device has a small number
730 * of extents 801 * of extents
802 *
803 * @start is used to store the start of the free space if we find it. But if
804 * we don't find suitable free space, it will be used to store the start
805 * position of the max free space.
806 *
807 * @len is used to store the size of the free space that we find. But if we
808 * don't find suitable free space, it is used to store the size of the max
809 * free space.
731 */ 810 */
732int find_free_dev_extent(struct btrfs_trans_handle *trans, 811int find_free_dev_extent(struct btrfs_trans_handle *trans,
733 struct btrfs_device *device, u64 num_bytes, 812 struct btrfs_device *device, u64 num_bytes,
734 u64 *start, u64 *max_avail) 813 u64 *start, u64 *len)
735{ 814{
736 struct btrfs_key key; 815 struct btrfs_key key;
737 struct btrfs_root *root = device->dev_root; 816 struct btrfs_root *root = device->dev_root;
738 struct btrfs_dev_extent *dev_extent = NULL; 817 struct btrfs_dev_extent *dev_extent;
739 struct btrfs_path *path; 818 struct btrfs_path *path;
740 u64 hole_size = 0; 819 u64 hole_size;
741 u64 last_byte = 0; 820 u64 max_hole_start;
742 u64 search_start = 0; 821 u64 max_hole_size;
822 u64 extent_end;
823 u64 search_start;
743 u64 search_end = device->total_bytes; 824 u64 search_end = device->total_bytes;
744 int ret; 825 int ret;
745 int slot = 0; 826 int slot;
746 int start_found;
747 struct extent_buffer *l; 827 struct extent_buffer *l;
748 828
749 path = btrfs_alloc_path();
750 if (!path)
751 return -ENOMEM;
752 path->reada = 2;
753 start_found = 0;
754
755 /* FIXME use last free of some kind */ 829 /* FIXME use last free of some kind */
756 830
757 /* we don't want to overwrite the superblock on the drive, 831 /* we don't want to overwrite the superblock on the drive,
758 * so we make sure to start at an offset of at least 1MB 832 * so we make sure to start at an offset of at least 1MB
759 */ 833 */
760 search_start = max((u64)1024 * 1024, search_start); 834 search_start = max(root->fs_info->alloc_start, 1024ull * 1024);
835
836 max_hole_start = search_start;
837 max_hole_size = 0;
761 838
762 if (root->fs_info->alloc_start + num_bytes <= device->total_bytes) 839 if (search_start >= search_end) {
763 search_start = max(root->fs_info->alloc_start, search_start); 840 ret = -ENOSPC;
841 goto error;
842 }
843
844 path = btrfs_alloc_path();
845 if (!path) {
846 ret = -ENOMEM;
847 goto error;
848 }
849 path->reada = 2;
764 850
765 key.objectid = device->devid; 851 key.objectid = device->devid;
766 key.offset = search_start; 852 key.offset = search_start;
767 key.type = BTRFS_DEV_EXTENT_KEY; 853 key.type = BTRFS_DEV_EXTENT_KEY;
854
768 ret = btrfs_search_slot(trans, root, &key, path, 0, 0); 855 ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
769 if (ret < 0) 856 if (ret < 0)
770 goto error; 857 goto out;
771 if (ret > 0) { 858 if (ret > 0) {
772 ret = btrfs_previous_item(root, path, key.objectid, key.type); 859 ret = btrfs_previous_item(root, path, key.objectid, key.type);
773 if (ret < 0) 860 if (ret < 0)
774 goto error; 861 goto out;
775 if (ret > 0)
776 start_found = 1;
777 } 862 }
778 l = path->nodes[0]; 863
779 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
780 while (1) { 864 while (1) {
781 l = path->nodes[0]; 865 l = path->nodes[0];
782 slot = path->slots[0]; 866 slot = path->slots[0];
@@ -785,24 +869,9 @@ int find_free_dev_extent(struct btrfs_trans_handle *trans,
785 if (ret == 0) 869 if (ret == 0)
786 continue; 870 continue;
787 if (ret < 0) 871 if (ret < 0)
788 goto error; 872 goto out;
789no_more_items: 873
790 if (!start_found) { 874 break;
791 if (search_start >= search_end) {
792 ret = -ENOSPC;
793 goto error;
794 }
795 *start = search_start;
796 start_found = 1;
797 goto check_pending;
798 }
799 *start = last_byte > search_start ?
800 last_byte : search_start;
801 if (search_end <= *start) {
802 ret = -ENOSPC;
803 goto error;
804 }
805 goto check_pending;
806 } 875 }
807 btrfs_item_key_to_cpu(l, &key, slot); 876 btrfs_item_key_to_cpu(l, &key, slot);
808 877
@@ -810,48 +879,62 @@ no_more_items:
810 goto next; 879 goto next;
811 880
812 if (key.objectid > device->devid) 881 if (key.objectid > device->devid)
813 goto no_more_items; 882 break;
814 883
815 if (key.offset >= search_start && key.offset > last_byte && 884 if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
816 start_found) { 885 goto next;
817 if (last_byte < search_start)
818 last_byte = search_start;
819 hole_size = key.offset - last_byte;
820 886
821 if (hole_size > *max_avail) 887 if (key.offset > search_start) {
822 *max_avail = hole_size; 888 hole_size = key.offset - search_start;
889
890 if (hole_size > max_hole_size) {
891 max_hole_start = search_start;
892 max_hole_size = hole_size;
893 }
823 894
824 if (key.offset > last_byte && 895 /*
825 hole_size >= num_bytes) { 896 * If this free space is greater than what we need,
826 *start = last_byte; 897 * it must be the max free space that we have found
827 goto check_pending; 898 * until now, so max_hole_start must point to the start
899 * of this free space and the length of this free space
900 * is stored in max_hole_size. Thus, we return
901 * max_hole_start and max_hole_size and go back to the
902 * caller.
903 */
904 if (hole_size >= num_bytes) {
905 ret = 0;
906 goto out;
828 } 907 }
829 } 908 }
830 if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY)
831 goto next;
832 909
833 start_found = 1;
834 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); 910 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
835 last_byte = key.offset + btrfs_dev_extent_length(l, dev_extent); 911 extent_end = key.offset + btrfs_dev_extent_length(l,
912 dev_extent);
913 if (extent_end > search_start)
914 search_start = extent_end;
836next: 915next:
837 path->slots[0]++; 916 path->slots[0]++;
838 cond_resched(); 917 cond_resched();
839 } 918 }
840check_pending:
841 /* we have to make sure we didn't find an extent that has already
842 * been allocated by the map tree or the original allocation
843 */
844 BUG_ON(*start < search_start);
845 919
846 if (*start + num_bytes > search_end) { 920 hole_size = search_end - search_start;
847 ret = -ENOSPC; 921 if (hole_size > max_hole_size) {
848 goto error; 922 max_hole_start = search_start;
923 max_hole_size = hole_size;
849 } 924 }
850 /* check for pending inserts here */
851 ret = 0;
852 925
853error: 926 /* See above. */
927 if (hole_size < num_bytes)
928 ret = -ENOSPC;
929 else
930 ret = 0;
931
932out:
854 btrfs_free_path(path); 933 btrfs_free_path(path);
934error:
935 *start = max_hole_start;
936 if (len)
937 *len = max_hole_size;
855 return ret; 938 return ret;
856} 939}
857 940
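The rewrite gives find_free_dev_extent() a sharper contract: on success, *start points at a hole of at least num_bytes; on -ENOSPC, *start/*len still describe the largest hole found, which the new chunk allocator below relies on. A hedged caller sketch (largest_or_exact_hole is hypothetical):

    /* Returns how many bytes are actually usable at *start. */
    static u64 largest_or_exact_hole(struct btrfs_trans_handle *trans,
    				 struct btrfs_device *device,
    				 u64 num_bytes, u64 *start)
    {
    	u64 len = 0;
    	int ret;

    	ret = find_free_dev_extent(trans, device, num_bytes, start, &len);
    	if (ret == 0)
    		return num_bytes;	/* exact fit found at *start */
    	if (ret == -ENOSPC)
    		return len;		/* fall back to the max hole */
    	return 0;			/* hard error */
    }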
@@ -879,14 +962,14 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
879 if (ret > 0) { 962 if (ret > 0) {
880 ret = btrfs_previous_item(root, path, key.objectid, 963 ret = btrfs_previous_item(root, path, key.objectid,
881 BTRFS_DEV_EXTENT_KEY); 964 BTRFS_DEV_EXTENT_KEY);
882 BUG_ON(ret); 965 if (ret)
966 goto out;
883 leaf = path->nodes[0]; 967 leaf = path->nodes[0];
884 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); 968 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
885 extent = btrfs_item_ptr(leaf, path->slots[0], 969 extent = btrfs_item_ptr(leaf, path->slots[0],
886 struct btrfs_dev_extent); 970 struct btrfs_dev_extent);
887 BUG_ON(found_key.offset > start || found_key.offset + 971 BUG_ON(found_key.offset > start || found_key.offset +
888 btrfs_dev_extent_length(leaf, extent) < start); 972 btrfs_dev_extent_length(leaf, extent) < start);
889 ret = 0;
890 } else if (ret == 0) { 973 } else if (ret == 0) {
891 leaf = path->nodes[0]; 974 leaf = path->nodes[0];
892 extent = btrfs_item_ptr(leaf, path->slots[0], 975 extent = btrfs_item_ptr(leaf, path->slots[0],
@@ -897,8 +980,8 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
897 if (device->bytes_used > 0) 980 if (device->bytes_used > 0)
898 device->bytes_used -= btrfs_dev_extent_length(leaf, extent); 981 device->bytes_used -= btrfs_dev_extent_length(leaf, extent);
899 ret = btrfs_del_item(trans, root, path); 982 ret = btrfs_del_item(trans, root, path);
900 BUG_ON(ret);
901 983
984out:
902 btrfs_free_path(path); 985 btrfs_free_path(path);
903 return ret; 986 return ret;
904} 987}
@@ -1098,6 +1181,10 @@ static int btrfs_rm_dev_item(struct btrfs_root *root,
1098 return -ENOMEM; 1181 return -ENOMEM;
1099 1182
1100 trans = btrfs_start_transaction(root, 0); 1183 trans = btrfs_start_transaction(root, 0);
1184 if (IS_ERR(trans)) {
1185 btrfs_free_path(path);
1186 return PTR_ERR(trans);
1187 }
1101 key.objectid = BTRFS_DEV_ITEMS_OBJECTID; 1188 key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1102 key.type = BTRFS_DEV_ITEM_KEY; 1189 key.type = BTRFS_DEV_ITEM_KEY;
1103 key.offset = device->devid; 1190 key.offset = device->devid;
@@ -1129,11 +1216,13 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1129 struct block_device *bdev; 1216 struct block_device *bdev;
1130 struct buffer_head *bh = NULL; 1217 struct buffer_head *bh = NULL;
1131 struct btrfs_super_block *disk_super; 1218 struct btrfs_super_block *disk_super;
1219 struct btrfs_fs_devices *cur_devices;
1132 u64 all_avail; 1220 u64 all_avail;
1133 u64 devid; 1221 u64 devid;
1134 u64 num_devices; 1222 u64 num_devices;
1135 u8 *dev_uuid; 1223 u8 *dev_uuid;
1136 int ret = 0; 1224 int ret = 0;
1225 bool clear_super = false;
1137 1226
1138 mutex_lock(&uuid_mutex); 1227 mutex_lock(&uuid_mutex);
1139 mutex_lock(&root->fs_info->volume_mutex); 1228 mutex_lock(&root->fs_info->volume_mutex);
@@ -1164,14 +1253,16 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1164 1253
1165 device = NULL; 1254 device = NULL;
1166 devices = &root->fs_info->fs_devices->devices; 1255 devices = &root->fs_info->fs_devices->devices;
1167 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 1256 /*
1257 * It is safe to read the devices since the volume_mutex
1258 * is held.
1259 */
1168 list_for_each_entry(tmp, devices, dev_list) { 1260 list_for_each_entry(tmp, devices, dev_list) {
1169 if (tmp->in_fs_metadata && !tmp->bdev) { 1261 if (tmp->in_fs_metadata && !tmp->bdev) {
1170 device = tmp; 1262 device = tmp;
1171 break; 1263 break;
1172 } 1264 }
1173 } 1265 }
1174 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1175 bdev = NULL; 1266 bdev = NULL;
1176 bh = NULL; 1267 bh = NULL;
1177 disk_super = NULL; 1268 disk_super = NULL;
@@ -1181,8 +1272,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1181 goto out; 1272 goto out;
1182 } 1273 }
1183 } else { 1274 } else {
1184 bdev = open_bdev_exclusive(device_path, FMODE_READ, 1275 bdev = blkdev_get_by_path(device_path, FMODE_READ | FMODE_EXCL,
1185 root->fs_info->bdev_holder); 1276 root->fs_info->bdev_holder);
1186 if (IS_ERR(bdev)) { 1277 if (IS_ERR(bdev)) {
1187 ret = PTR_ERR(bdev); 1278 ret = PTR_ERR(bdev);
1188 goto out; 1279 goto out;
@@ -1191,7 +1282,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1191 set_blocksize(bdev, 4096); 1282 set_blocksize(bdev, 4096);
1192 bh = btrfs_read_dev_super(bdev); 1283 bh = btrfs_read_dev_super(bdev);
1193 if (!bh) { 1284 if (!bh) {
1194 ret = -EIO; 1285 ret = -EINVAL;
1195 goto error_close; 1286 goto error_close;
1196 } 1287 }
1197 disk_super = (struct btrfs_super_block *)bh->b_data; 1288 disk_super = (struct btrfs_super_block *)bh->b_data;
@@ -1213,31 +1304,39 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1213 } 1304 }
1214 1305
1215 if (device->writeable) { 1306 if (device->writeable) {
1307 lock_chunks(root);
1216 list_del_init(&device->dev_alloc_list); 1308 list_del_init(&device->dev_alloc_list);
1309 unlock_chunks(root);
1217 root->fs_info->fs_devices->rw_devices--; 1310 root->fs_info->fs_devices->rw_devices--;
1311 clear_super = true;
1218 } 1312 }
1219 1313
1220 ret = btrfs_shrink_device(device, 0); 1314 ret = btrfs_shrink_device(device, 0);
1221 if (ret) 1315 if (ret)
1222 goto error_brelse; 1316 goto error_undo;
1223 1317
1224 ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device); 1318 ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device);
1225 if (ret) 1319 if (ret)
1226 goto error_brelse; 1320 goto error_undo;
1227 1321
1228 device->in_fs_metadata = 0; 1322 device->in_fs_metadata = 0;
1323 btrfs_scrub_cancel_dev(root, device);
1229 1324
1230 /* 1325 /*
1231 * the device list mutex makes sure that we don't change 1326 * the device list mutex makes sure that we don't change
1232 * the device list while someone else is writing out all 1327 * the device list while someone else is writing out all
1233 * the device supers. 1328 * the device supers.
1234 */ 1329 */
1330
1331 cur_devices = device->fs_devices;
1235 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 1332 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1236 list_del_init(&device->dev_list); 1333 list_del_rcu(&device->dev_list);
1237 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1238 1334
1239 device->fs_devices->num_devices--; 1335 device->fs_devices->num_devices--;
1240 1336
1337 if (device->missing)
1338 root->fs_info->fs_devices->missing_devices--;
1339
1241 next_device = list_entry(root->fs_info->fs_devices->devices.next, 1340 next_device = list_entry(root->fs_info->fs_devices->devices.next,
1242 struct btrfs_device, dev_list); 1341 struct btrfs_device, dev_list);
1243 if (device->bdev == root->fs_info->sb->s_bdev) 1342 if (device->bdev == root->fs_info->sb->s_bdev)
@@ -1245,34 +1344,36 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1245 if (device->bdev == root->fs_info->fs_devices->latest_bdev) 1344 if (device->bdev == root->fs_info->fs_devices->latest_bdev)
1246 root->fs_info->fs_devices->latest_bdev = next_device->bdev; 1345 root->fs_info->fs_devices->latest_bdev = next_device->bdev;
1247 1346
1248 if (device->bdev) { 1347 if (device->bdev)
1249 close_bdev_exclusive(device->bdev, device->mode);
1250 device->bdev = NULL;
1251 device->fs_devices->open_devices--; 1348 device->fs_devices->open_devices--;
1252 } 1349
1350 call_rcu(&device->rcu, free_device);
1351 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1253 1352
1254 num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1; 1353 num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1;
1255 btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices); 1354 btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices);
1256 1355
1257 if (device->fs_devices->open_devices == 0) { 1356 if (cur_devices->open_devices == 0) {
1258 struct btrfs_fs_devices *fs_devices; 1357 struct btrfs_fs_devices *fs_devices;
1259 fs_devices = root->fs_info->fs_devices; 1358 fs_devices = root->fs_info->fs_devices;
1260 while (fs_devices) { 1359 while (fs_devices) {
1261 if (fs_devices->seed == device->fs_devices) 1360 if (fs_devices->seed == cur_devices)
1262 break; 1361 break;
1263 fs_devices = fs_devices->seed; 1362 fs_devices = fs_devices->seed;
1264 } 1363 }
1265 fs_devices->seed = device->fs_devices->seed; 1364 fs_devices->seed = cur_devices->seed;
1266 device->fs_devices->seed = NULL; 1365 cur_devices->seed = NULL;
1267 __btrfs_close_devices(device->fs_devices); 1366 lock_chunks(root);
1268 free_fs_devices(device->fs_devices); 1367 __btrfs_close_devices(cur_devices);
1368 unlock_chunks(root);
1369 free_fs_devices(cur_devices);
1269 } 1370 }
1270 1371
1271 /* 1372 /*
1272 * at this point, the device is zero sized. We want to 1373 * at this point, the device is zero sized. We want to
1273 * remove it from the devices list and zero out the old super 1374 * remove it from the devices list and zero out the old super
1274 */ 1375 */
1275 if (device->writeable) { 1376 if (clear_super) {
1276 /* make sure this device isn't detected as part of 1377 /* make sure this device isn't detected as part of
1277 * the FS anymore 1378 * the FS anymore
1278 */ 1379 */
@@ -1281,19 +1382,26 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
1281 sync_dirty_buffer(bh); 1382 sync_dirty_buffer(bh);
1282 } 1383 }
1283 1384
1284 kfree(device->name);
1285 kfree(device);
1286 ret = 0; 1385 ret = 0;
1287 1386
1288error_brelse: 1387error_brelse:
1289 brelse(bh); 1388 brelse(bh);
1290error_close: 1389error_close:
1291 if (bdev) 1390 if (bdev)
1292 close_bdev_exclusive(bdev, FMODE_READ); 1391 blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
1293out: 1392out:
1294 mutex_unlock(&root->fs_info->volume_mutex); 1393 mutex_unlock(&root->fs_info->volume_mutex);
1295 mutex_unlock(&uuid_mutex); 1394 mutex_unlock(&uuid_mutex);
1296 return ret; 1395 return ret;
1396error_undo:
1397 if (device->writeable) {
1398 lock_chunks(root);
1399 list_add(&device->dev_alloc_list,
1400 &root->fs_info->fs_devices->alloc_list);
1401 unlock_chunks(root);
1402 root->fs_info->fs_devices->rw_devices++;
1403 }
1404 goto error_brelse;
1297} 1405}
1298 1406
1299/* 1407/*
@@ -1330,7 +1438,12 @@ static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans,
1330 INIT_LIST_HEAD(&seed_devices->devices); 1438 INIT_LIST_HEAD(&seed_devices->devices);
1331 INIT_LIST_HEAD(&seed_devices->alloc_list); 1439 INIT_LIST_HEAD(&seed_devices->alloc_list);
1332 mutex_init(&seed_devices->device_list_mutex); 1440 mutex_init(&seed_devices->device_list_mutex);
1333 list_splice_init(&fs_devices->devices, &seed_devices->devices); 1441
1442 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1443 list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
1444 synchronize_rcu);
1445 mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
1446
1334 list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list); 1447 list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
1335 list_for_each_entry(device, &seed_devices->devices, dev_list) { 1448 list_for_each_entry(device, &seed_devices->devices, dev_list) {
1336 device->fs_devices = seed_devices; 1449 device->fs_devices = seed_devices;
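list_splice_init_rcu() moves every entry from one RCU-protected list onto another and reinitializes the source head, invoking the supplied synchronization callback (here synchronize_rcu) so that readers still walking the old list drain first. A minimal sketch:

    #include <linux/rculist.h>

    /* Splice 'src' onto 'dst' while readers may still traverse 'src'
     * under rcu_read_lock(); safe to reuse 'src' afterwards. */
    static void move_all_rcu(struct list_head *src, struct list_head *dst)
    {
    	list_splice_init_rcu(src, dst, synchronize_rcu);
    }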
@@ -1391,7 +1504,7 @@ next_slot:
1391 goto error; 1504 goto error;
1392 leaf = path->nodes[0]; 1505 leaf = path->nodes[0];
1393 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); 1506 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1394 btrfs_release_path(root, path); 1507 btrfs_release_path(path);
1395 continue; 1508 continue;
1396 } 1509 }
1397 1510
@@ -1441,7 +1554,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1441 if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding) 1554 if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding)
1442 return -EINVAL; 1555 return -EINVAL;
1443 1556
1444 bdev = open_bdev_exclusive(device_path, 0, root->fs_info->bdev_holder); 1557 bdev = blkdev_get_by_path(device_path, FMODE_EXCL,
1558 root->fs_info->bdev_holder);
1445 if (IS_ERR(bdev)) 1559 if (IS_ERR(bdev))
1446 return PTR_ERR(bdev); 1560 return PTR_ERR(bdev);
1447 1561
@@ -1482,14 +1596,21 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1482 1596
1483 ret = find_next_devid(root, &device->devid); 1597 ret = find_next_devid(root, &device->devid);
1484 if (ret) { 1598 if (ret) {
1599 kfree(device->name);
1485 kfree(device); 1600 kfree(device);
1486 goto error; 1601 goto error;
1487 } 1602 }
1488 1603
1489 trans = btrfs_start_transaction(root, 0); 1604 trans = btrfs_start_transaction(root, 0);
1605 if (IS_ERR(trans)) {
1606 kfree(device->name);
1607 kfree(device);
1608 ret = PTR_ERR(trans);
1609 goto error;
1610 }
1611
1490 lock_chunks(root); 1612 lock_chunks(root);
1491 1613
1492 device->barriers = 1;
1493 device->writeable = 1; 1614 device->writeable = 1;
1494 device->work.func = pending_bios_fn; 1615 device->work.func = pending_bios_fn;
1495 generate_random_uuid(device->uuid); 1616 generate_random_uuid(device->uuid);
@@ -1503,7 +1624,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1503 device->dev_root = root->fs_info->dev_root; 1624 device->dev_root = root->fs_info->dev_root;
1504 device->bdev = bdev; 1625 device->bdev = bdev;
1505 device->in_fs_metadata = 1; 1626 device->in_fs_metadata = 1;
1506 device->mode = 0; 1627 device->mode = FMODE_EXCL;
1507 set_blocksize(device->bdev, 4096); 1628 set_blocksize(device->bdev, 4096);
1508 1629
1509 if (seeding_dev) { 1630 if (seeding_dev) {
@@ -1519,7 +1640,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
1519 * half setup 1640 * half setup
1520 */ 1641 */
1521 mutex_lock(&root->fs_info->fs_devices->device_list_mutex); 1642 mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
1522 list_add(&device->dev_list, &root->fs_info->fs_devices->devices); 1643 list_add_rcu(&device->dev_list, &root->fs_info->fs_devices->devices);
1523 list_add(&device->dev_alloc_list, 1644 list_add(&device->dev_alloc_list,
1524 &root->fs_info->fs_devices->alloc_list); 1645 &root->fs_info->fs_devices->alloc_list);
1525 root->fs_info->fs_devices->num_devices++; 1646 root->fs_info->fs_devices->num_devices++;
@@ -1568,7 +1689,7 @@ out:
1568 mutex_unlock(&root->fs_info->volume_mutex); 1689 mutex_unlock(&root->fs_info->volume_mutex);
1569 return ret; 1690 return ret;
1570error: 1691error:
1571 close_bdev_exclusive(bdev, 0); 1692 blkdev_put(bdev, FMODE_EXCL);
1572 if (seeding_dev) { 1693 if (seeding_dev) {
1573 mutex_unlock(&uuid_mutex); 1694 mutex_unlock(&uuid_mutex);
1574 up_write(&sb->s_umount); 1695 up_write(&sb->s_umount);
@@ -1677,10 +1798,9 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
1677 BUG_ON(ret); 1798 BUG_ON(ret);
1678 1799
1679 ret = btrfs_del_item(trans, root, path); 1800 ret = btrfs_del_item(trans, root, path);
1680 BUG_ON(ret);
1681 1801
1682 btrfs_free_path(path); 1802 btrfs_free_path(path);
1683 return 0; 1803 return ret;
1684} 1804}
1685 1805
1686static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64 1806static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
@@ -1755,7 +1875,7 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
1755 return ret; 1875 return ret;
1756 1876
1757 trans = btrfs_start_transaction(root, 0); 1877 trans = btrfs_start_transaction(root, 0);
1758 BUG_ON(!trans); 1878 BUG_ON(IS_ERR(trans));
1759 1879
1760 lock_chunks(root); 1880 lock_chunks(root);
1761 1881
@@ -1786,6 +1906,8 @@ static int btrfs_relocate_chunk(struct btrfs_root *root,
1786 1906
1787 BUG_ON(ret); 1907 BUG_ON(ret);
1788 1908
1909 trace_btrfs_chunk_free(root, map, chunk_offset, em->len);
1910
1789 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { 1911 if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
1790 ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset); 1912 ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset);
1791 BUG_ON(ret); 1913 BUG_ON(ret);
@@ -1853,7 +1975,7 @@ again:
1853 chunk = btrfs_item_ptr(leaf, path->slots[0], 1975 chunk = btrfs_item_ptr(leaf, path->slots[0],
1854 struct btrfs_chunk); 1976 struct btrfs_chunk);
1855 chunk_type = btrfs_chunk_type(leaf, chunk); 1977 chunk_type = btrfs_chunk_type(leaf, chunk);
1856 btrfs_release_path(chunk_root, path); 1978 btrfs_release_path(path);
1857 1979
1858 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) { 1980 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
1859 ret = btrfs_relocate_chunk(chunk_root, chunk_tree, 1981 ret = btrfs_relocate_chunk(chunk_root, chunk_tree,
@@ -1901,7 +2023,6 @@ int btrfs_balance(struct btrfs_root *dev_root)
1901 u64 size_to_free; 2023 u64 size_to_free;
1902 struct btrfs_path *path; 2024 struct btrfs_path *path;
1903 struct btrfs_key key; 2025 struct btrfs_key key;
1904 struct btrfs_chunk *chunk;
1905 struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root; 2026 struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root;
1906 struct btrfs_trans_handle *trans; 2027 struct btrfs_trans_handle *trans;
1907 struct btrfs_key found_key; 2028 struct btrfs_key found_key;
@@ -1909,6 +2030,9 @@ int btrfs_balance(struct btrfs_root *dev_root)
1909 if (dev_root->fs_info->sb->s_flags & MS_RDONLY) 2030 if (dev_root->fs_info->sb->s_flags & MS_RDONLY)
1910 return -EROFS; 2031 return -EROFS;
1911 2032
2033 if (!capable(CAP_SYS_ADMIN))
2034 return -EPERM;
2035
1912 mutex_lock(&dev_root->fs_info->volume_mutex); 2036 mutex_lock(&dev_root->fs_info->volume_mutex);
1913 dev_root = dev_root->fs_info->dev_root; 2037 dev_root = dev_root->fs_info->dev_root;
1914 2038
@@ -1927,7 +2051,7 @@ int btrfs_balance(struct btrfs_root *dev_root)
1927 BUG_ON(ret); 2051 BUG_ON(ret);
1928 2052
1929 trans = btrfs_start_transaction(dev_root, 0); 2053 trans = btrfs_start_transaction(dev_root, 0);
1930 BUG_ON(!trans); 2054 BUG_ON(IS_ERR(trans));
1931 2055
1932 ret = btrfs_grow_device(trans, device, old_size); 2056 ret = btrfs_grow_device(trans, device, old_size);
1933 BUG_ON(ret); 2057 BUG_ON(ret);
@@ -1965,19 +2089,17 @@ int btrfs_balance(struct btrfs_root *dev_root)
1965 if (found_key.objectid != key.objectid) 2089 if (found_key.objectid != key.objectid)
1966 break; 2090 break;
1967 2091
1968 chunk = btrfs_item_ptr(path->nodes[0],
1969 path->slots[0],
1970 struct btrfs_chunk);
1971 /* chunk zero is special */ 2092 /* chunk zero is special */
1972 if (found_key.offset == 0) 2093 if (found_key.offset == 0)
1973 break; 2094 break;
1974 2095
1975 btrfs_release_path(chunk_root, path); 2096 btrfs_release_path(path);
1976 ret = btrfs_relocate_chunk(chunk_root, 2097 ret = btrfs_relocate_chunk(chunk_root,
1977 chunk_root->root_key.objectid, 2098 chunk_root->root_key.objectid,
1978 found_key.objectid, 2099 found_key.objectid,
1979 found_key.offset); 2100 found_key.offset);
1980 BUG_ON(ret && ret != -ENOSPC); 2101 if (ret && ret != -ENOSPC)
2102 goto error;
1981 key.offset = found_key.offset - 1; 2103 key.offset = found_key.offset - 1;
1982 } 2104 }
1983 ret = 0; 2105 ret = 0;
@@ -2044,7 +2166,7 @@ again:
2044 goto done; 2166 goto done;
2045 if (ret) { 2167 if (ret) {
2046 ret = 0; 2168 ret = 0;
2047 btrfs_release_path(root, path); 2169 btrfs_release_path(path);
2048 break; 2170 break;
2049 } 2171 }
2050 2172
@@ -2053,7 +2175,7 @@ again:
2053 btrfs_item_key_to_cpu(l, &key, path->slots[0]); 2175 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
2054 2176
2055 if (key.objectid != device->devid) { 2177 if (key.objectid != device->devid) {
2056 btrfs_release_path(root, path); 2178 btrfs_release_path(path);
2057 break; 2179 break;
2058 } 2180 }
2059 2181
@@ -2061,14 +2183,14 @@ again:
2061 length = btrfs_dev_extent_length(l, dev_extent); 2183 length = btrfs_dev_extent_length(l, dev_extent);
2062 2184
2063 if (key.offset + length <= new_size) { 2185 if (key.offset + length <= new_size) {
2064 btrfs_release_path(root, path); 2186 btrfs_release_path(path);
2065 break; 2187 break;
2066 } 2188 }
2067 2189
2068 chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent); 2190 chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
2069 chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent); 2191 chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
2070 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); 2192 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
2071 btrfs_release_path(root, path); 2193 btrfs_release_path(path);
2072 2194
2073 ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid, 2195 ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid,
2074 chunk_offset); 2196 chunk_offset);
@@ -2096,6 +2218,11 @@ again:
2096 2218
2097 /* Shrinking succeeded, else we would be at "done". */ 2219 /* Shrinking succeeded, else we would be at "done". */
2098 trans = btrfs_start_transaction(root, 0); 2220 trans = btrfs_start_transaction(root, 0);
2221 if (IS_ERR(trans)) {
2222 ret = PTR_ERR(trans);
2223 goto done;
2224 }
2225
2099 lock_chunks(root); 2226 lock_chunks(root);
2100 2227
2101 device->disk_total_bytes = new_size; 2228 device->disk_total_bytes = new_size;
@@ -2139,211 +2266,243 @@ static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
2139 return 0; 2266 return 0;
2140} 2267}
2141 2268
2142static noinline u64 chunk_bytes_by_type(u64 type, u64 calc_size, 2269/*
2143 int num_stripes, int sub_stripes) 2270 * sort the devices in descending order by max_avail, total_avail
2271 */
2272static int btrfs_cmp_device_info(const void *a, const void *b)
2144{ 2273{
2145 if (type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_DUP)) 2274 const struct btrfs_device_info *di_a = a;
2146 return calc_size; 2275 const struct btrfs_device_info *di_b = b;
2147 else if (type & BTRFS_BLOCK_GROUP_RAID10) 2276
2148 return calc_size * (num_stripes / sub_stripes); 2277 if (di_a->max_avail > di_b->max_avail)
2149 else 2278 return -1;
2150 return calc_size * num_stripes; 2279 if (di_a->max_avail < di_b->max_avail)
2280 return 1;
2281 if (di_a->total_avail > di_b->total_avail)
2282 return -1;
2283 if (di_a->total_avail < di_b->total_avail)
2284 return 1;
2285 return 0;
2151} 2286}
2152 2287
2153static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, 2288static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
2154 struct btrfs_root *extent_root, 2289 struct btrfs_root *extent_root,
2155 struct map_lookup **map_ret, 2290 struct map_lookup **map_ret,
2156 u64 *num_bytes, u64 *stripe_size, 2291 u64 *num_bytes_out, u64 *stripe_size_out,
2157 u64 start, u64 type) 2292 u64 start, u64 type)
2158{ 2293{
2159 struct btrfs_fs_info *info = extent_root->fs_info; 2294 struct btrfs_fs_info *info = extent_root->fs_info;
2160 struct btrfs_device *device = NULL;
2161 struct btrfs_fs_devices *fs_devices = info->fs_devices; 2295 struct btrfs_fs_devices *fs_devices = info->fs_devices;
2162 struct list_head *cur; 2296 struct list_head *cur;
2163 struct map_lookup *map = NULL; 2297 struct map_lookup *map = NULL;
2164 struct extent_map_tree *em_tree; 2298 struct extent_map_tree *em_tree;
2165 struct extent_map *em; 2299 struct extent_map *em;
2166 struct list_head private_devs; 2300 struct btrfs_device_info *devices_info = NULL;
2167 int min_stripe_size = 1 * 1024 * 1024; 2301 u64 total_avail;
2168 u64 calc_size = 1024 * 1024 * 1024; 2302 int num_stripes; /* total number of stripes to allocate */
2169 u64 max_chunk_size = calc_size; 2303 int sub_stripes; /* sub_stripes info for map */
2170 u64 min_free; 2304 int dev_stripes; /* stripes per dev */
2171 u64 avail; 2305 int devs_max; /* max devs to use */
2172 u64 max_avail = 0; 2306 int devs_min; /* min devs needed */
2173 u64 dev_offset; 2307 int devs_increment; /* ndevs has to be a multiple of this */
2174 int num_stripes = 1; 2308 int ncopies; /* how many copies of the data we have */
2175 int min_stripes = 1;
2176 int sub_stripes = 0;
2177 int looped = 0;
2178 int ret; 2309 int ret;
2179 int index; 2310 u64 max_stripe_size;
2180 int stripe_len = 64 * 1024; 2311 u64 max_chunk_size;
2312 u64 stripe_size;
2313 u64 num_bytes;
2314 int ndevs;
2315 int i;
2316 int j;
2181 2317
2182 if ((type & BTRFS_BLOCK_GROUP_RAID1) && 2318 if ((type & BTRFS_BLOCK_GROUP_RAID1) &&
2183 (type & BTRFS_BLOCK_GROUP_DUP)) { 2319 (type & BTRFS_BLOCK_GROUP_DUP)) {
2184 WARN_ON(1); 2320 WARN_ON(1);
2185 type &= ~BTRFS_BLOCK_GROUP_DUP; 2321 type &= ~BTRFS_BLOCK_GROUP_DUP;
2186 } 2322 }
2323
2187 if (list_empty(&fs_devices->alloc_list)) 2324 if (list_empty(&fs_devices->alloc_list))
2188 return -ENOSPC; 2325 return -ENOSPC;
2189 2326
2190 if (type & (BTRFS_BLOCK_GROUP_RAID0)) { 2327 sub_stripes = 1;
2191 num_stripes = fs_devices->rw_devices; 2328 dev_stripes = 1;
2192 min_stripes = 2; 2329 devs_increment = 1;
2193 } 2330 ncopies = 1;
2331 devs_max = 0; /* 0 == as many as possible */
2332 devs_min = 1;
2333
2334 /*
2335 * define the properties of each RAID type.
2336 * FIXME: move this to a global table and use it in all RAID
2337 * calculation code
2338 */
2194 if (type & (BTRFS_BLOCK_GROUP_DUP)) { 2339 if (type & (BTRFS_BLOCK_GROUP_DUP)) {
2195 num_stripes = 2; 2340 dev_stripes = 2;
2196 min_stripes = 2; 2341 ncopies = 2;
2197 } 2342 devs_max = 1;
2198 if (type & (BTRFS_BLOCK_GROUP_RAID1)) { 2343 } else if (type & (BTRFS_BLOCK_GROUP_RAID0)) {
2199 if (fs_devices->rw_devices < 2) 2344 devs_min = 2;
2200 return -ENOSPC; 2345 } else if (type & (BTRFS_BLOCK_GROUP_RAID1)) {
2201 num_stripes = 2; 2346 devs_increment = 2;
2202 min_stripes = 2; 2347 ncopies = 2;
2203 } 2348 devs_max = 2;
2204 if (type & (BTRFS_BLOCK_GROUP_RAID10)) { 2349 devs_min = 2;
2205 num_stripes = fs_devices->rw_devices; 2350 } else if (type & (BTRFS_BLOCK_GROUP_RAID10)) {
2206 if (num_stripes < 4)
2207 return -ENOSPC;
2208 num_stripes &= ~(u32)1;
2209 sub_stripes = 2; 2351 sub_stripes = 2;
2210 min_stripes = 4; 2352 devs_increment = 2;
2353 ncopies = 2;
2354 devs_min = 4;
2355 } else {
2356 devs_max = 1;
2211 } 2357 }
2212 2358
2213 if (type & BTRFS_BLOCK_GROUP_DATA) { 2359 if (type & BTRFS_BLOCK_GROUP_DATA) {
2214 max_chunk_size = 10 * calc_size; 2360 max_stripe_size = 1024 * 1024 * 1024;
2215 min_stripe_size = 64 * 1024 * 1024; 2361 max_chunk_size = 10 * max_stripe_size;
2216 } else if (type & BTRFS_BLOCK_GROUP_METADATA) { 2362 } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
2217 max_chunk_size = 256 * 1024 * 1024; 2363 max_stripe_size = 256 * 1024 * 1024;
2218 min_stripe_size = 32 * 1024 * 1024; 2364 max_chunk_size = max_stripe_size;
2219 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { 2365 } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
2220 calc_size = 8 * 1024 * 1024; 2366 max_stripe_size = 8 * 1024 * 1024;
2221 max_chunk_size = calc_size * 2; 2367 max_chunk_size = 2 * max_stripe_size;
2222 min_stripe_size = 1 * 1024 * 1024; 2368 } else {
2369 printk(KERN_ERR "btrfs: invalid chunk type 0x%llx requested\n",
2370 type);
2371 BUG_ON(1);
2223 } 2372 }
2224 2373
2225 /* we don't want a chunk larger than 10% of writeable space */ 2374 /* we don't want a chunk larger than 10% of writeable space */
2226 max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), 2375 max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
2227 max_chunk_size); 2376 max_chunk_size);
2228 2377
2229again: 2378 devices_info = kzalloc(sizeof(*devices_info) * fs_devices->rw_devices,
2230 max_avail = 0; 2379 GFP_NOFS);
2231 if (!map || map->num_stripes != num_stripes) { 2380 if (!devices_info)
2232 kfree(map); 2381 return -ENOMEM;
2233 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
2234 if (!map)
2235 return -ENOMEM;
2236 map->num_stripes = num_stripes;
2237 }
2238
2239 if (calc_size * num_stripes > max_chunk_size) {
2240 calc_size = max_chunk_size;
2241 do_div(calc_size, num_stripes);
2242 do_div(calc_size, stripe_len);
2243 calc_size *= stripe_len;
2244 }
2245 2382
2246 /* we don't want tiny stripes */ 2383 cur = fs_devices->alloc_list.next;
2247 if (!looped)
2248 calc_size = max_t(u64, min_stripe_size, calc_size);
2249 2384
2250 /* 2385 /*
2251 * we're about to do_div by the stripe_len so lets make sure 2386 * in the first pass through the devices list, we gather information
2252 * we end up with something bigger than a stripe 2387 * about the available holes on each device.
2253 */ 2388 */
2254 calc_size = max_t(u64, calc_size, stripe_len * 4); 2389 ndevs = 0;
2390 while (cur != &fs_devices->alloc_list) {
2391 struct btrfs_device *device;
2392 u64 max_avail;
2393 u64 dev_offset;
2255 2394
2256 do_div(calc_size, stripe_len); 2395 device = list_entry(cur, struct btrfs_device, dev_alloc_list);
2257 calc_size *= stripe_len;
2258 2396
2259 cur = fs_devices->alloc_list.next; 2397 cur = cur->next;
2260 index = 0;
2261 2398
2262 if (type & BTRFS_BLOCK_GROUP_DUP) 2399 if (!device->writeable) {
2263 min_free = calc_size * 2; 2400 printk(KERN_ERR
2264 else 2401 "btrfs: read-only device in alloc_list\n");
2265 min_free = calc_size; 2402 WARN_ON(1);
2403 continue;
2404 }
2266 2405
2267 /* 2406 if (!device->in_fs_metadata)
2268 * we add 1MB because we never use the first 1MB of the device, unless 2407 continue;
2269 * we've looped, then we are likely allocating the maximum amount of
2270 * space left already
2271 */
2272 if (!looped)
2273 min_free += 1024 * 1024;
2274 2408
2275 INIT_LIST_HEAD(&private_devs);
2276 while (index < num_stripes) {
2277 device = list_entry(cur, struct btrfs_device, dev_alloc_list);
2278 BUG_ON(!device->writeable);
2279 if (device->total_bytes > device->bytes_used) 2409 if (device->total_bytes > device->bytes_used)
2280 avail = device->total_bytes - device->bytes_used; 2410 total_avail = device->total_bytes - device->bytes_used;
2281 else 2411 else
2282 avail = 0; 2412 total_avail = 0;
2283 cur = cur->next; 2413 /* avail is off by max(alloc_start, 1MB), but that is the same
2414 * for all devices, so it doesn't hurt the sorting later on
2415 */
2284 2416
2285 if (device->in_fs_metadata && avail >= min_free) { 2417 ret = find_free_dev_extent(trans, device,
2286 ret = find_free_dev_extent(trans, device, 2418 max_stripe_size * dev_stripes,
2287 min_free, &dev_offset, 2419 &dev_offset, &max_avail);
2288 &max_avail); 2420 if (ret && ret != -ENOSPC)
2289 if (ret == 0) { 2421 goto error;
2290 list_move_tail(&device->dev_alloc_list, 2422
2291 &private_devs); 2423 if (ret == 0)
2292 map->stripes[index].dev = device; 2424 max_avail = max_stripe_size * dev_stripes;
2293 map->stripes[index].physical = dev_offset; 2425
2294 index++; 2426 if (max_avail < BTRFS_STRIPE_LEN * dev_stripes)
2295 if (type & BTRFS_BLOCK_GROUP_DUP) { 2427 continue;
2296 map->stripes[index].dev = device; 2428
2297 map->stripes[index].physical = 2429 devices_info[ndevs].dev_offset = dev_offset;
2298 dev_offset + calc_size; 2430 devices_info[ndevs].max_avail = max_avail;
2299 index++; 2431 devices_info[ndevs].total_avail = total_avail;
2300 } 2432 devices_info[ndevs].dev = device;
2301 } 2433 ++ndevs;
2302 } else if (device->in_fs_metadata && avail > max_avail)
2303 max_avail = avail;
2304 if (cur == &fs_devices->alloc_list)
2305 break;
2306 } 2434 }
2307 list_splice(&private_devs, &fs_devices->alloc_list); 2435
2308 if (index < num_stripes) { 2436 /*
2309 if (index >= min_stripes) { 2437 * now sort the devices by hole size / available space
2310 num_stripes = index; 2438 */
2311 if (type & (BTRFS_BLOCK_GROUP_RAID10)) { 2439 sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
2312 num_stripes /= sub_stripes; 2440 btrfs_cmp_device_info, NULL);
2313 num_stripes *= sub_stripes; 2441
2314 } 2442 /* round down to number of usable stripes */
2315 looped = 1; 2443 ndevs -= ndevs % devs_increment;
2316 goto again; 2444
2317 } 2445 if (ndevs < devs_increment * sub_stripes || ndevs < devs_min) {
2318 if (!looped && max_avail > 0) { 2446 ret = -ENOSPC;
2319 looped = 1; 2447 goto error;
2320 calc_size = max_avail; 2448 }
2321 goto again; 2449
2450 if (devs_max && ndevs > devs_max)
2451 ndevs = devs_max;
2452 /*
2453 * the primary goal is to maximize the number of stripes, so use as many
2454 * devices as possible, even if the stripes are not maximum sized.
2455 */
2456 stripe_size = devices_info[ndevs-1].max_avail;
2457 num_stripes = ndevs * dev_stripes;
2458
2459 if (stripe_size * num_stripes > max_chunk_size * ncopies) {
2460 stripe_size = max_chunk_size * ncopies;
2461 do_div(stripe_size, num_stripes);
2462 }
2463
2464 do_div(stripe_size, dev_stripes);
2465 do_div(stripe_size, BTRFS_STRIPE_LEN);
2466 stripe_size *= BTRFS_STRIPE_LEN;
2467
2468 map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
2469 if (!map) {
2470 ret = -ENOMEM;
2471 goto error;
2472 }
2473 map->num_stripes = num_stripes;
2474
2475 for (i = 0; i < ndevs; ++i) {
2476 for (j = 0; j < dev_stripes; ++j) {
2477 int s = i * dev_stripes + j;
2478 map->stripes[s].dev = devices_info[i].dev;
2479 map->stripes[s].physical = devices_info[i].dev_offset +
2480 j * stripe_size;
2322 } 2481 }
2323 kfree(map);
2324 return -ENOSPC;
2325 } 2482 }
2326 map->sector_size = extent_root->sectorsize; 2483 map->sector_size = extent_root->sectorsize;
2327 map->stripe_len = stripe_len; 2484 map->stripe_len = BTRFS_STRIPE_LEN;
2328 map->io_align = stripe_len; 2485 map->io_align = BTRFS_STRIPE_LEN;
2329 map->io_width = stripe_len; 2486 map->io_width = BTRFS_STRIPE_LEN;
2330 map->type = type; 2487 map->type = type;
2331 map->num_stripes = num_stripes;
2332 map->sub_stripes = sub_stripes; 2488 map->sub_stripes = sub_stripes;
2333 2489
2334 *map_ret = map; 2490 *map_ret = map;
2335 *stripe_size = calc_size; 2491 num_bytes = stripe_size * (num_stripes / ncopies);
2336 *num_bytes = chunk_bytes_by_type(type, calc_size, 2492
2337 num_stripes, sub_stripes); 2493 *stripe_size_out = stripe_size;
2494 *num_bytes_out = num_bytes;
2338 2495
2339 em = alloc_extent_map(GFP_NOFS); 2496 trace_btrfs_chunk_alloc(info->chunk_root, map, start, num_bytes);
2497
2498 em = alloc_extent_map();
2340 if (!em) { 2499 if (!em) {
2341 kfree(map); 2500 ret = -ENOMEM;
2342 return -ENOMEM; 2501 goto error;
2343 } 2502 }
2344 em->bdev = (struct block_device *)map; 2503 em->bdev = (struct block_device *)map;
2345 em->start = start; 2504 em->start = start;
2346 em->len = *num_bytes; 2505 em->len = num_bytes;
2347 em->block_start = 0; 2506 em->block_start = 0;
2348 em->block_len = em->len; 2507 em->block_len = em->len;
2349 2508
@@ -2356,23 +2515,30 @@ again:
2356 2515
2357 ret = btrfs_make_block_group(trans, extent_root, 0, type, 2516 ret = btrfs_make_block_group(trans, extent_root, 0, type,
2358 BTRFS_FIRST_CHUNK_TREE_OBJECTID, 2517 BTRFS_FIRST_CHUNK_TREE_OBJECTID,
2359 start, *num_bytes); 2518 start, num_bytes);
2360 BUG_ON(ret); 2519 BUG_ON(ret);
2361 2520
2362 index = 0; 2521 for (i = 0; i < map->num_stripes; ++i) {
2363 while (index < map->num_stripes) { 2522 struct btrfs_device *device;
2364 device = map->stripes[index].dev; 2523 u64 dev_offset;
2365 dev_offset = map->stripes[index].physical; 2524
2525 device = map->stripes[i].dev;
2526 dev_offset = map->stripes[i].physical;
2366 2527
2367 ret = btrfs_alloc_dev_extent(trans, device, 2528 ret = btrfs_alloc_dev_extent(trans, device,
2368 info->chunk_root->root_key.objectid, 2529 info->chunk_root->root_key.objectid,
2369 BTRFS_FIRST_CHUNK_TREE_OBJECTID, 2530 BTRFS_FIRST_CHUNK_TREE_OBJECTID,
2370 start, dev_offset, calc_size); 2531 start, dev_offset, stripe_size);
2371 BUG_ON(ret); 2532 BUG_ON(ret);
2372 index++;
2373 } 2533 }
2374 2534
2535 kfree(devices_info);
2375 return 0; 2536 return 0;
2537
2538error:
2539 kfree(map);
2540 kfree(devices_info);
2541 return ret;
2376} 2542}
2377 2543
2378static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, 2544static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
@@ -2438,6 +2604,7 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
2438 item_size); 2604 item_size);
2439 BUG_ON(ret); 2605 BUG_ON(ret);
2440 } 2606 }
2607
2441 kfree(chunk); 2608 kfree(chunk);
2442 return 0; 2609 return 0;
2443} 2610}
@@ -2569,7 +2736,7 @@ int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
2569 2736
2570void btrfs_mapping_init(struct btrfs_mapping_tree *tree) 2737void btrfs_mapping_init(struct btrfs_mapping_tree *tree)
2571{ 2738{
2572 extent_map_tree_init(&tree->map_tree, GFP_NOFS); 2739 extent_map_tree_init(&tree->map_tree);
2573} 2740}
2574 2741
2575void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree) 2742void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
@@ -2635,14 +2802,17 @@ static int find_live_mirror(struct map_lookup *map, int first, int num,
2635static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, 2802static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
2636 u64 logical, u64 *length, 2803 u64 logical, u64 *length,
2637 struct btrfs_multi_bio **multi_ret, 2804 struct btrfs_multi_bio **multi_ret,
2638 int mirror_num, struct page *unplug_page) 2805 int mirror_num)
2639{ 2806{
2640 struct extent_map *em; 2807 struct extent_map *em;
2641 struct map_lookup *map; 2808 struct map_lookup *map;
2642 struct extent_map_tree *em_tree = &map_tree->map_tree; 2809 struct extent_map_tree *em_tree = &map_tree->map_tree;
2643 u64 offset; 2810 u64 offset;
2644 u64 stripe_offset; 2811 u64 stripe_offset;
2812 u64 stripe_end_offset;
2645 u64 stripe_nr; 2813 u64 stripe_nr;
2814 u64 stripe_nr_orig;
2815 u64 stripe_nr_end;
2646 int stripes_allocated = 8; 2816 int stripes_allocated = 8;
2647 int stripes_required = 1; 2817 int stripes_required = 1;
2648 int stripe_index; 2818 int stripe_index;
@@ -2651,7 +2821,7 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
2651 int max_errors = 0; 2821 int max_errors = 0;
2652 struct btrfs_multi_bio *multi = NULL; 2822 struct btrfs_multi_bio *multi = NULL;
2653 2823
2654 if (multi_ret && !(rw & REQ_WRITE)) 2824 if (multi_ret && !(rw & (REQ_WRITE | REQ_DISCARD)))
2655 stripes_allocated = 1; 2825 stripes_allocated = 1;
2656again: 2826again:
2657 if (multi_ret) { 2827 if (multi_ret) {
@@ -2667,11 +2837,6 @@ again:
2667 em = lookup_extent_mapping(em_tree, logical, *length); 2837 em = lookup_extent_mapping(em_tree, logical, *length);
2668 read_unlock(&em_tree->lock); 2838 read_unlock(&em_tree->lock);
2669 2839
2670 if (!em && unplug_page) {
2671 kfree(multi);
2672 return 0;
2673 }
2674
2675 if (!em) { 2840 if (!em) {
2676 printk(KERN_CRIT "unable to find logical %llu len %llu\n", 2841 printk(KERN_CRIT "unable to find logical %llu len %llu\n",
2677 (unsigned long long)logical, 2842 (unsigned long long)logical,
@@ -2697,7 +2862,15 @@ again:
2697 max_errors = 1; 2862 max_errors = 1;
2698 } 2863 }
2699 } 2864 }
2700 if (multi_ret && (rw & REQ_WRITE) && 2865 if (rw & REQ_DISCARD) {
2866 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
2867 BTRFS_BLOCK_GROUP_RAID1 |
2868 BTRFS_BLOCK_GROUP_DUP |
2869 BTRFS_BLOCK_GROUP_RAID10)) {
2870 stripes_required = map->num_stripes;
2871 }
2872 }
2873 if (multi_ret && (rw & (REQ_WRITE | REQ_DISCARD)) &&
2701 stripes_allocated < stripes_required) { 2874 stripes_allocated < stripes_required) {
2702 stripes_allocated = map->num_stripes; 2875 stripes_allocated = map->num_stripes;
2703 free_extent_map(em); 2876 free_extent_map(em);
@@ -2717,23 +2890,37 @@ again:
2717 /* stripe_offset is the offset of this block in its stripe*/ 2890 /* stripe_offset is the offset of this block in its stripe*/
2718 stripe_offset = offset - stripe_offset; 2891 stripe_offset = offset - stripe_offset;
2719 2892
2720 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | 2893 if (rw & REQ_DISCARD)
2721 BTRFS_BLOCK_GROUP_RAID10 | 2894 *length = min_t(u64, em->len - offset, *length);
2722 BTRFS_BLOCK_GROUP_DUP)) { 2895 else if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
2896 BTRFS_BLOCK_GROUP_RAID1 |
2897 BTRFS_BLOCK_GROUP_RAID10 |
2898 BTRFS_BLOCK_GROUP_DUP)) {
2723 /* we limit the length of each bio to what fits in a stripe */ 2899 /* we limit the length of each bio to what fits in a stripe */
2724 *length = min_t(u64, em->len - offset, 2900 *length = min_t(u64, em->len - offset,
2725 map->stripe_len - stripe_offset); 2901 map->stripe_len - stripe_offset);
2726 } else { 2902 } else {
2727 *length = em->len - offset; 2903 *length = em->len - offset;
2728 } 2904 }
2729 2905
2730 if (!multi_ret && !unplug_page) 2906 if (!multi_ret)
2731 goto out; 2907 goto out;
2732 2908
2733 num_stripes = 1; 2909 num_stripes = 1;
2734 stripe_index = 0; 2910 stripe_index = 0;
2735 if (map->type & BTRFS_BLOCK_GROUP_RAID1) { 2911 stripe_nr_orig = stripe_nr;
2736 if (unplug_page || (rw & REQ_WRITE)) 2912 stripe_nr_end = (offset + *length + map->stripe_len - 1) &
2913 (~(map->stripe_len - 1));
2914 do_div(stripe_nr_end, map->stripe_len);
2915 stripe_end_offset = stripe_nr_end * map->stripe_len -
2916 (offset + *length);
2917 if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
2918 if (rw & REQ_DISCARD)
2919 num_stripes = min_t(u64, map->num_stripes,
2920 stripe_nr_end - stripe_nr_orig);
2921 stripe_index = do_div(stripe_nr, map->num_stripes);
2922 } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
2923 if (rw & (REQ_WRITE | REQ_DISCARD))
2737 num_stripes = map->num_stripes; 2924 num_stripes = map->num_stripes;
2738 else if (mirror_num) 2925 else if (mirror_num)
2739 stripe_index = mirror_num - 1; 2926 stripe_index = mirror_num - 1;
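
Aside: discards are no longer clipped to a single stripe, so the mapping code first computes the span of stripes the byte range touches: stripe_nr_end rounds the end of the range up to a stripe boundary (the add-and-mask is a power-of-two ceiling), and stripe_end_offset records the overshoot. The same arithmetic, with plain 64-bit division standing in for do_div():

#include <stdint.h>

struct stripe_range {
        uint64_t nr;            /* first stripe covered by the range */
        uint64_t nr_end;        /* one past the last stripe covered */
        uint64_t end_offset;    /* unused bytes in the final stripe */
};

static struct stripe_range stripe_span(uint64_t offset, uint64_t length,
                                       uint64_t stripe_len)
{
        struct stripe_range r;

        r.nr = offset / stripe_len;
        /* round the end up to a stripe boundary, as the masking does */
        r.nr_end = (offset + length + stripe_len - 1) / stripe_len;
        /* how far the rounded end overshoots the real end of the range */
        r.end_offset = r.nr_end * stripe_len - (offset + length);
        return r;
}
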
@@ -2744,7 +2931,7 @@ again:
2744 } 2931 }
2745 2932
2746 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { 2933 } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
2747 if (rw & REQ_WRITE) 2934 if (rw & (REQ_WRITE | REQ_DISCARD))
2748 num_stripes = map->num_stripes; 2935 num_stripes = map->num_stripes;
2749 else if (mirror_num) 2936 else if (mirror_num)
2750 stripe_index = mirror_num - 1; 2937 stripe_index = mirror_num - 1;
@@ -2755,8 +2942,12 @@ again:
2755 stripe_index = do_div(stripe_nr, factor); 2942 stripe_index = do_div(stripe_nr, factor);
2756 stripe_index *= map->sub_stripes; 2943 stripe_index *= map->sub_stripes;
2757 2944
2758 if (unplug_page || (rw & REQ_WRITE)) 2945 if (rw & REQ_WRITE)
2759 num_stripes = map->sub_stripes; 2946 num_stripes = map->sub_stripes;
2947 else if (rw & REQ_DISCARD)
2948 num_stripes = min_t(u64, map->sub_stripes *
2949 (stripe_nr_end - stripe_nr_orig),
2950 map->num_stripes);
2760 else if (mirror_num) 2951 else if (mirror_num)
2761 stripe_index += mirror_num - 1; 2952 stripe_index += mirror_num - 1;
2762 else { 2953 else {
@@ -2774,24 +2965,101 @@ again:
2774 } 2965 }
2775 BUG_ON(stripe_index >= map->num_stripes); 2966 BUG_ON(stripe_index >= map->num_stripes);
2776 2967
2777 for (i = 0; i < num_stripes; i++) { 2968 if (rw & REQ_DISCARD) {
2778 if (unplug_page) { 2969 for (i = 0; i < num_stripes; i++) {
2779 struct btrfs_device *device;
2780 struct backing_dev_info *bdi;
2781
2782 device = map->stripes[stripe_index].dev;
2783 if (device->bdev) {
2784 bdi = blk_get_backing_dev_info(device->bdev);
2785 if (bdi->unplug_io_fn)
2786 bdi->unplug_io_fn(bdi, unplug_page);
2787 }
2788 } else {
2789 multi->stripes[i].physical = 2970 multi->stripes[i].physical =
2790 map->stripes[stripe_index].physical + 2971 map->stripes[stripe_index].physical +
2791 stripe_offset + stripe_nr * map->stripe_len; 2972 stripe_offset + stripe_nr * map->stripe_len;
2792 multi->stripes[i].dev = map->stripes[stripe_index].dev; 2973 multi->stripes[i].dev = map->stripes[stripe_index].dev;
2974
2975 if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
2976 u64 stripes;
2977 u32 last_stripe = 0;
2978 int j;
2979
2980 div_u64_rem(stripe_nr_end - 1,
2981 map->num_stripes,
2982 &last_stripe);
2983
2984 for (j = 0; j < map->num_stripes; j++) {
2985 u32 test;
2986
2987 div_u64_rem(stripe_nr_end - 1 - j,
2988 map->num_stripes, &test);
2989 if (test == stripe_index)
2990 break;
2991 }
2992 stripes = stripe_nr_end - 1 - j;
2993 do_div(stripes, map->num_stripes);
2994 multi->stripes[i].length = map->stripe_len *
2995 (stripes - stripe_nr + 1);
2996
2997 if (i == 0) {
2998 multi->stripes[i].length -=
2999 stripe_offset;
3000 stripe_offset = 0;
3001 }
3002 if (stripe_index == last_stripe)
3003 multi->stripes[i].length -=
3004 stripe_end_offset;
3005 } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
3006 u64 stripes;
3007 int j;
3008 int factor = map->num_stripes /
3009 map->sub_stripes;
3010 u32 last_stripe = 0;
3011
3012 div_u64_rem(stripe_nr_end - 1,
3013 factor, &last_stripe);
3014 last_stripe *= map->sub_stripes;
3015
3016 for (j = 0; j < factor; j++) {
3017 u32 test;
3018
3019 div_u64_rem(stripe_nr_end - 1 - j,
3020 factor, &test);
3021
3022 if (test ==
3023 stripe_index / map->sub_stripes)
3024 break;
3025 }
3026 stripes = stripe_nr_end - 1 - j;
3027 do_div(stripes, factor);
3028 multi->stripes[i].length = map->stripe_len *
3029 (stripes - stripe_nr + 1);
3030
3031 if (i < map->sub_stripes) {
3032 multi->stripes[i].length -=
3033 stripe_offset;
3034 if (i == map->sub_stripes - 1)
3035 stripe_offset = 0;
3036 }
3037 if (stripe_index >= last_stripe &&
3038 stripe_index <= (last_stripe +
3039 map->sub_stripes - 1)) {
3040 multi->stripes[i].length -=
3041 stripe_end_offset;
3042 }
3043 } else
3044 multi->stripes[i].length = *length;
3045
3046 stripe_index++;
3047 if (stripe_index == map->num_stripes) {
3048 /* This could only happen for RAID0/10 */
3049 stripe_index = 0;
3050 stripe_nr++;
3051 }
3052 }
3053 } else {
3054 for (i = 0; i < num_stripes; i++) {
3055 multi->stripes[i].physical =
3056 map->stripes[stripe_index].physical +
3057 stripe_offset +
3058 stripe_nr * map->stripe_len;
3059 multi->stripes[i].dev =
3060 map->stripes[stripe_index].dev;
3061 stripe_index++;
2793 } 3062 }
2794 stripe_index++;
2795 } 3063 }
2796 if (multi_ret) { 3064 if (multi_ret) {
2797 *multi_ret = multi; 3065 *multi_ret = multi;
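
Aside: the per-stripe length trimming above uses div_u64_rem() to locate, without iterating, the last stripe each device owns inside the discard range. A naive loop gives the same answer and may be easier to check against; this models the RAID0 case only and is not a substitute for the kernel code:

#include <stdint.h>

/* bytes of a RAID0 discard spanning stripes [nr, nr_end) that land on
 * device d; head_trim/tail_trim are the partial-stripe offsets at the
 * start and end of the range (stripe_offset / stripe_end_offset) */
static uint64_t raid0_dev_len(uint64_t nr, uint64_t nr_end, int num_stripes,
                              int d, uint64_t stripe_len,
                              uint64_t head_trim, uint64_t tail_trim)
{
        uint64_t owned = 0;

        for (uint64_t s = nr; s < nr_end; s++)
                if ((int)(s % num_stripes) == d)
                        owned++;
        if (!owned)
                return 0;

        uint64_t len = owned * stripe_len;
        if ((int)(nr % num_stripes) == d)
                len -= head_trim;       /* range starts mid-stripe here */
        if ((int)((nr_end - 1) % num_stripes) == d)
                len -= tail_trim;       /* range ends mid-stripe here */
        return len;
}
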
@@ -2808,7 +3076,7 @@ int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
2808 struct btrfs_multi_bio **multi_ret, int mirror_num) 3076 struct btrfs_multi_bio **multi_ret, int mirror_num)
2809{ 3077{
2810 return __btrfs_map_block(map_tree, rw, logical, length, multi_ret, 3078 return __btrfs_map_block(map_tree, rw, logical, length, multi_ret,
2811 mirror_num, NULL); 3079 mirror_num);
2812} 3080}
2813 3081
2814int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, 3082int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
@@ -2876,14 +3144,6 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
2876 return 0; 3144 return 0;
2877} 3145}
2878 3146
2879int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree,
2880 u64 logical, struct page *page)
2881{
2882 u64 length = PAGE_CACHE_SIZE;
2883 return __btrfs_map_block(map_tree, READ, logical, &length,
2884 NULL, 0, page);
2885}
2886
2887static void end_bio_multi_stripe(struct bio *bio, int err) 3147static void end_bio_multi_stripe(struct bio *bio, int err)
2888{ 3148{
2889 struct btrfs_multi_bio *multi = bio->bi_private; 3149 struct btrfs_multi_bio *multi = bio->bi_private;
@@ -3034,8 +3294,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
3034 } 3294 }
3035 bio->bi_sector = multi->stripes[dev_nr].physical >> 9; 3295 bio->bi_sector = multi->stripes[dev_nr].physical >> 9;
3036 dev = multi->stripes[dev_nr].dev; 3296 dev = multi->stripes[dev_nr].dev;
3037 BUG_ON(rw == WRITE && !dev->writeable); 3297 if (dev && dev->bdev && (rw != WRITE || dev->writeable)) {
3038 if (dev && dev->bdev) {
3039 bio->bi_bdev = dev->bdev; 3298 bio->bi_bdev = dev->bdev;
3040 if (async_submit) 3299 if (async_submit)
3041 schedule_bio(root, dev, rw, bio); 3300 schedule_bio(root, dev, rw, bio);
@@ -3084,12 +3343,13 @@ static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
3084 return NULL; 3343 return NULL;
3085 list_add(&device->dev_list, 3344 list_add(&device->dev_list,
3086 &fs_devices->devices); 3345 &fs_devices->devices);
3087 device->barriers = 1;
3088 device->dev_root = root->fs_info->dev_root; 3346 device->dev_root = root->fs_info->dev_root;
3089 device->devid = devid; 3347 device->devid = devid;
3090 device->work.func = pending_bios_fn; 3348 device->work.func = pending_bios_fn;
3091 device->fs_devices = fs_devices; 3349 device->fs_devices = fs_devices;
3350 device->missing = 1;
3092 fs_devices->num_devices++; 3351 fs_devices->num_devices++;
3352 fs_devices->missing_devices++;
3093 spin_lock_init(&device->io_lock); 3353 spin_lock_init(&device->io_lock);
3094 INIT_LIST_HEAD(&device->dev_alloc_list); 3354 INIT_LIST_HEAD(&device->dev_alloc_list);
3095 memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE); 3355 memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE);
@@ -3126,7 +3386,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
3126 free_extent_map(em); 3386 free_extent_map(em);
3127 } 3387 }
3128 3388
3129 em = alloc_extent_map(GFP_NOFS); 3389 em = alloc_extent_map();
3130 if (!em) 3390 if (!em)
3131 return -ENOMEM; 3391 return -ENOMEM;
3132 num_stripes = btrfs_chunk_num_stripes(leaf, chunk); 3392 num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
@@ -3287,6 +3547,15 @@ static int read_one_dev(struct btrfs_root *root,
3287 device = add_missing_dev(root, devid, dev_uuid); 3547 device = add_missing_dev(root, devid, dev_uuid);
3288 if (!device) 3548 if (!device)
3289 return -ENOMEM; 3549 return -ENOMEM;
3550 } else if (!device->missing) {
3551 /*
3552 * this happens when a device that was properly setup
3553 * in the device info lists suddenly goes bad.
3554 * device->bdev is NULL, and so we have to set
3555 * device->missing to one here
3556 */
3557 root->fs_info->fs_devices->missing_devices++;
3558 device->missing = 1;
3290 } 3559 }
3291 } 3560 }
3292 3561
@@ -3306,15 +3575,6 @@ static int read_one_dev(struct btrfs_root *root,
3306 return ret; 3575 return ret;
3307} 3576}
3308 3577
3309int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf)
3310{
3311 struct btrfs_dev_item *dev_item;
3312
3313 dev_item = (struct btrfs_dev_item *)offsetof(struct btrfs_super_block,
3314 dev_item);
3315 return read_one_dev(root, buf, dev_item);
3316}
3317
3318int btrfs_read_sys_array(struct btrfs_root *root) 3578int btrfs_read_sys_array(struct btrfs_root *root)
3319{ 3579{
3320 struct btrfs_super_block *super_copy = &root->fs_info->super_copy; 3580 struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
@@ -3431,7 +3691,7 @@ again:
3431 } 3691 }
3432 if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) { 3692 if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
3433 key.objectid = 0; 3693 key.objectid = 0;
3434 btrfs_release_path(root, path); 3694 btrfs_release_path(path);
3435 goto again; 3695 goto again;
3436 } 3696 }
3437 ret = 0; 3697 ret = 0;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 31b0fabdd2ea..7c12d61ae7ae 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -20,8 +20,11 @@
20#define __BTRFS_VOLUMES_ 20#define __BTRFS_VOLUMES_
21 21
22#include <linux/bio.h> 22#include <linux/bio.h>
23#include <linux/sort.h>
23#include "async-thread.h" 24#include "async-thread.h"
24 25
26#define BTRFS_STRIPE_LEN (64 * 1024)
27
25struct buffer_head; 28struct buffer_head;
26struct btrfs_pending_bios { 29struct btrfs_pending_bios {
27 struct bio *head; 30 struct bio *head;
@@ -42,15 +45,15 @@ struct btrfs_device {
42 int running_pending; 45 int running_pending;
43 u64 generation; 46 u64 generation;
44 47
45 int barriers;
46 int writeable; 48 int writeable;
47 int in_fs_metadata; 49 int in_fs_metadata;
50 int missing;
48 51
49 spinlock_t io_lock; 52 spinlock_t io_lock;
50 53
51 struct block_device *bdev; 54 struct block_device *bdev;
52 55
53 /* the mode sent to open_bdev_exclusive */ 56 /* the mode sent to blkdev_get */
54 fmode_t mode; 57 fmode_t mode;
55 58
56 char *name; 59 char *name;
@@ -82,7 +85,12 @@ struct btrfs_device {
82 /* physical drive uuid (or lvm uuid) */ 85 /* physical drive uuid (or lvm uuid) */
83 u8 uuid[BTRFS_UUID_SIZE]; 86 u8 uuid[BTRFS_UUID_SIZE];
84 87
88 /* per-device scrub information */
89 struct scrub_dev *scrub_device;
90
85 struct btrfs_work work; 91 struct btrfs_work work;
92 struct rcu_head rcu;
93 struct work_struct rcu_work;
86}; 94};
87 95
88struct btrfs_fs_devices { 96struct btrfs_fs_devices {
@@ -94,6 +102,7 @@ struct btrfs_fs_devices {
94 u64 num_devices; 102 u64 num_devices;
95 u64 open_devices; 103 u64 open_devices;
96 u64 rw_devices; 104 u64 rw_devices;
105 u64 missing_devices;
97 u64 total_rw_bytes; 106 u64 total_rw_bytes;
98 struct block_device *latest_bdev; 107 struct block_device *latest_bdev;
99 108
@@ -122,6 +131,7 @@ struct btrfs_fs_devices {
122struct btrfs_bio_stripe { 131struct btrfs_bio_stripe {
123 struct btrfs_device *dev; 132 struct btrfs_device *dev;
124 u64 physical; 133 u64 physical;
134 u64 length; /* only used for discard mappings */
125}; 135};
126 136
127struct btrfs_multi_bio { 137struct btrfs_multi_bio {
@@ -135,6 +145,30 @@ struct btrfs_multi_bio {
135 struct btrfs_bio_stripe stripes[]; 145 struct btrfs_bio_stripe stripes[];
136}; 146};
137 147
148struct btrfs_device_info {
149 struct btrfs_device *dev;
150 u64 dev_offset;
151 u64 max_avail;
152 u64 total_avail;
153};
154
155struct map_lookup {
156 u64 type;
157 int io_align;
158 int io_width;
159 int stripe_len;
160 int sector_size;
161 int num_stripes;
162 int sub_stripes;
163 struct btrfs_bio_stripe stripes[];
164};
165
166#define map_lookup_size(n) (sizeof(struct map_lookup) + \
167 (sizeof(struct btrfs_bio_stripe) * (n)))
168
169int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
170 u64 end, u64 *length);
171
138#define btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \ 172#define btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \
139 (sizeof(struct btrfs_bio_stripe) * (n))) 173 (sizeof(struct btrfs_bio_stripe) * (n)))
140 174
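
Aside: map_lookup moves into the header alongside map_lookup_size(), the usual flexible-array sizing macro: header plus n trailing btrfs_bio_stripe entries in one allocation. The pattern in miniature, with a trimmed stand-in struct and userspace malloc() in place of kmalloc():

#include <stdlib.h>

struct stripe { unsigned long long physical; };

struct lookup {
        int num_stripes;
        struct stripe stripes[];        /* flexible array member */
};

#define LOOKUP_SIZE(n) (sizeof(struct lookup) + sizeof(struct stripe) * (n))

static struct lookup *alloc_lookup(int n)
{
        struct lookup *m = malloc(LOOKUP_SIZE(n));

        if (m)
                m->num_stripes = n;     /* header and array in one block */
        return m;
}
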
@@ -156,7 +190,6 @@ void btrfs_mapping_init(struct btrfs_mapping_tree *tree);
156void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree); 190void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree);
157int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, 191int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
158 int mirror_num, int async_submit); 192 int mirror_num, int async_submit);
159int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf);
160int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, 193int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
161 fmode_t flags, void *holder); 194 fmode_t flags, void *holder);
162int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, 195int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
@@ -169,8 +202,6 @@ int btrfs_add_device(struct btrfs_trans_handle *trans,
169int btrfs_rm_device(struct btrfs_root *root, char *device_path); 202int btrfs_rm_device(struct btrfs_root *root, char *device_path);
170int btrfs_cleanup_fs_uuids(void); 203int btrfs_cleanup_fs_uuids(void);
171int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len); 204int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len);
172int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree,
173 u64 logical, struct page *page);
174int btrfs_grow_device(struct btrfs_trans_handle *trans, 205int btrfs_grow_device(struct btrfs_trans_handle *trans,
175 struct btrfs_device *device, u64 new_size); 206 struct btrfs_device *device, u64 new_size);
176struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid, 207struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
@@ -178,8 +209,6 @@ struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid,
178int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); 209int btrfs_shrink_device(struct btrfs_device *device, u64 new_size);
179int btrfs_init_new_device(struct btrfs_root *root, char *path); 210int btrfs_init_new_device(struct btrfs_root *root, char *path);
180int btrfs_balance(struct btrfs_root *dev_root); 211int btrfs_balance(struct btrfs_root *dev_root);
181void btrfs_unlock_volumes(void);
182void btrfs_lock_volumes(void);
183int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset); 212int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
184int find_free_dev_extent(struct btrfs_trans_handle *trans, 213int find_free_dev_extent(struct btrfs_trans_handle *trans,
185 struct btrfs_device *device, u64 num_bytes, 214 struct btrfs_device *device, u64 num_bytes,
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 88ecbb215878..5366fe452ab0 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -44,7 +44,7 @@ ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
44 return -ENOMEM; 44 return -ENOMEM;
45 45
46 /* lookup the xattr by name */ 46 /* lookup the xattr by name */
47 di = btrfs_lookup_xattr(NULL, root, path, inode->i_ino, name, 47 di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode), name,
48 strlen(name), 0); 48 strlen(name), 0);
49 if (!di) { 49 if (!di) {
50 ret = -ENODATA; 50 ret = -ENODATA;
@@ -103,7 +103,7 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
103 return -ENOMEM; 103 return -ENOMEM;
104 104
105 /* first lets see if we already have this xattr */ 105 /* first lets see if we already have this xattr */
106 di = btrfs_lookup_xattr(trans, root, path, inode->i_ino, name, 106 di = btrfs_lookup_xattr(trans, root, path, btrfs_ino(inode), name,
107 strlen(name), -1); 107 strlen(name), -1);
108 if (IS_ERR(di)) { 108 if (IS_ERR(di)) {
109 ret = PTR_ERR(di); 109 ret = PTR_ERR(di);
@@ -120,13 +120,13 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
120 120
121 ret = btrfs_delete_one_dir_name(trans, root, path, di); 121 ret = btrfs_delete_one_dir_name(trans, root, path, di);
122 BUG_ON(ret); 122 BUG_ON(ret);
123 btrfs_release_path(root, path); 123 btrfs_release_path(path);
124 124
125 /* if we don't have a value then we are removing the xattr */ 125 /* if we don't have a value then we are removing the xattr */
126 if (!value) 126 if (!value)
127 goto out; 127 goto out;
128 } else { 128 } else {
129 btrfs_release_path(root, path); 129 btrfs_release_path(path);
130 130
131 if (flags & XATTR_REPLACE) { 131 if (flags & XATTR_REPLACE) {
132 /* we couldn't find the attr to replace */ 132 /* we couldn't find the attr to replace */
@@ -136,7 +136,7 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
136 } 136 }
137 137
138 /* ok we have to create a completely new xattr */ 138 /* ok we have to create a completely new xattr */
139 ret = btrfs_insert_xattr_item(trans, root, path, inode->i_ino, 139 ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode),
140 name, name_len, value, size); 140 name, name_len, value, size);
141 BUG_ON(ret); 141 BUG_ON(ret);
142out: 142out:
@@ -158,8 +158,6 @@ int __btrfs_setxattr(struct btrfs_trans_handle *trans,
158 if (IS_ERR(trans)) 158 if (IS_ERR(trans))
159 return PTR_ERR(trans); 159 return PTR_ERR(trans);
160 160
161 btrfs_set_trans_block_group(trans, inode);
162
163 ret = do_setxattr(trans, inode, name, value, size, flags); 161 ret = do_setxattr(trans, inode, name, value, size, flags);
164 if (ret) 162 if (ret)
165 goto out; 163 goto out;
@@ -178,21 +176,19 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
178 struct inode *inode = dentry->d_inode; 176 struct inode *inode = dentry->d_inode;
179 struct btrfs_root *root = BTRFS_I(inode)->root; 177 struct btrfs_root *root = BTRFS_I(inode)->root;
180 struct btrfs_path *path; 178 struct btrfs_path *path;
181 struct btrfs_item *item;
182 struct extent_buffer *leaf; 179 struct extent_buffer *leaf;
183 struct btrfs_dir_item *di; 180 struct btrfs_dir_item *di;
184 int ret = 0, slot, advance; 181 int ret = 0, slot;
185 size_t total_size = 0, size_left = size; 182 size_t total_size = 0, size_left = size;
186 unsigned long name_ptr; 183 unsigned long name_ptr;
187 size_t name_len; 184 size_t name_len;
188 u32 nritems;
189 185
190 /* 186 /*
191 * ok we want all objects associated with this id. 187 * ok we want all objects associated with this id.
192 * NOTE: we set key.offset = 0; because we want to start with the 188 * NOTE: we set key.offset = 0; because we want to start with the
193 * first xattr that we find and walk forward 189 * first xattr that we find and walk forward
194 */ 190 */
195 key.objectid = inode->i_ino; 191 key.objectid = btrfs_ino(inode);
196 btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY); 192 btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY);
197 key.offset = 0; 193 key.offset = 0;
198 194
@@ -205,36 +201,25 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
205 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); 201 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
206 if (ret < 0) 202 if (ret < 0)
207 goto err; 203 goto err;
208 advance = 0; 204
209 while (1) { 205 while (1) {
210 leaf = path->nodes[0]; 206 leaf = path->nodes[0];
211 nritems = btrfs_header_nritems(leaf);
212 slot = path->slots[0]; 207 slot = path->slots[0];
213 208
214 /* this is where we start walking through the path */ 209 /* this is where we start walking through the path */
215 if (advance || slot >= nritems) { 210 if (slot >= btrfs_header_nritems(leaf)) {
216 /* 211 /*
217 * if we've reached the last slot in this leaf we need 212 * if we've reached the last slot in this leaf we need
218 * to go to the next leaf and reset everything 213 * to go to the next leaf and reset everything
219 */ 214 */
220 if (slot >= nritems-1) { 215 ret = btrfs_next_leaf(root, path);
221 ret = btrfs_next_leaf(root, path); 216 if (ret < 0)
222 if (ret) 217 goto err;
223 break; 218 else if (ret > 0)
224 leaf = path->nodes[0]; 219 break;
225 nritems = btrfs_header_nritems(leaf); 220 continue;
226 slot = path->slots[0];
227 } else {
228 /*
229 * just walking through the slots on this leaf
230 */
231 slot++;
232 path->slots[0]++;
233 }
234 } 221 }
235 advance = 1;
236 222
237 item = btrfs_item_nr(leaf, slot);
238 btrfs_item_key_to_cpu(leaf, &found_key, slot); 223 btrfs_item_key_to_cpu(leaf, &found_key, slot);
239 224
240 /* check to make sure this item is what we want */ 225 /* check to make sure this item is what we want */
@@ -244,13 +229,15 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
244 break; 229 break;
245 230
246 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); 231 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
232 if (verify_dir_item(root, leaf, di))
233 continue;
247 234
248 name_len = btrfs_dir_name_len(leaf, di); 235 name_len = btrfs_dir_name_len(leaf, di);
249 total_size += name_len + 1; 236 total_size += name_len + 1;
250 237
251 /* we are just looking for how big our buffer needs to be */ 238 /* we are just looking for how big our buffer needs to be */
252 if (!size) 239 if (!size)
253 continue; 240 goto next;
254 241
255 if (!buffer || (name_len + 1) > size_left) { 242 if (!buffer || (name_len + 1) > size_left) {
256 ret = -ERANGE; 243 ret = -ERANGE;
@@ -263,6 +250,8 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
263 250
264 size_left -= name_len + 1; 251 size_left -= name_len + 1;
265 buffer += name_len + 1; 252 buffer += name_len + 1;
253next:
254 path->slots[0]++;
266 } 255 }
267 ret = total_size; 256 ret = total_size;
268 257
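
Aside: the rewritten listxattr loop drops the advance/nritems bookkeeping in favor of the standard btrfs cursor idiom: consume the current slot, bump path->slots[0] at the 'next:' label, and call btrfs_next_leaf() only when the slot runs off the leaf. A toy, self-contained model of that control flow (the two-leaf array is invented for illustration):

#include <stdio.h>

/* toy model: two "leaves" of items; next_leaf() advances to the next one */
static int leaves[2][3] = { { 1, 2, 3 }, { 4, 5, 6 } };
static int cur_leaf, slot, nritems = 3;

static int next_leaf(void)
{
        if (++cur_leaf >= 2)
                return 1;               /* >0 means: no more leaves */
        slot = 0;
        return 0;
}

int main(void)
{
        while (1) {
                if (slot >= nritems) {
                        int ret = next_leaf();
                        if (ret > 0)
                                break;  /* walked past the last leaf */
                        continue;       /* re-check the fresh leaf */
                }
                printf("%d\n", leaves[cur_leaf][slot]);
                slot++;                 /* the 'next:' step */
        }
        return 0;
}
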
@@ -318,6 +307,15 @@ ssize_t btrfs_getxattr(struct dentry *dentry, const char *name,
318int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value, 307int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value,
319 size_t size, int flags) 308 size_t size, int flags)
320{ 309{
310 struct btrfs_root *root = BTRFS_I(dentry->d_inode)->root;
311
312 /*
313 * The permission on security.* and system.* is not checked
314 * in permission().
315 */
316 if (btrfs_root_readonly(root))
317 return -EROFS;
318
321 /* 319 /*
322 * If this is a request for a synthetic attribute in the system.* 320 * If this is a request for a synthetic attribute in the system.*
323 * namespace use the generic infrastructure to resolve a handler 321 * namespace use the generic infrastructure to resolve a handler
@@ -338,6 +336,15 @@ int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value,
338 336
339int btrfs_removexattr(struct dentry *dentry, const char *name) 337int btrfs_removexattr(struct dentry *dentry, const char *name)
340{ 338{
339 struct btrfs_root *root = BTRFS_I(dentry->d_inode)->root;
340
341 /*
342 * The permission on security.* and system.* is not checked
343 * in permission().
344 */
345 if (btrfs_root_readonly(root))
346 return -EROFS;
347
341 /* 348 /*
342 * If this is a request for a synthetic attribute in the system.* 349 * If this is a request for a synthetic attribute in the system.*
343 * namespace use the generic infrastructure to resolve a handler 350 * namespace use the generic infrastructure to resolve a handler
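
Aside: both xattr write paths now carry the same guard, because security.* and system.* writes bypass permission() and a read-only root must be rejected explicitly with -EROFS before any handler runs. Schematically (a condensed paraphrase of the two hunks above, not a literal kernel function):

static int xattr_write_allowed(struct dentry *dentry)
{
        struct btrfs_root *root = BTRFS_I(dentry->d_inode)->root;

        if (btrfs_root_readonly(root))
                return -EROFS;  /* refuse xattr writes on a read-only root */
        /* ... fall through to the namespace-specific handlers ... */
        return 0;
}
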
@@ -354,7 +361,8 @@ int btrfs_removexattr(struct dentry *dentry, const char *name)
354} 361}
355 362
356int btrfs_xattr_security_init(struct btrfs_trans_handle *trans, 363int btrfs_xattr_security_init(struct btrfs_trans_handle *trans,
357 struct inode *inode, struct inode *dir) 364 struct inode *inode, struct inode *dir,
365 const struct qstr *qstr)
358{ 366{
359 int err; 367 int err;
360 size_t len; 368 size_t len;
@@ -362,7 +370,8 @@ int btrfs_xattr_security_init(struct btrfs_trans_handle *trans,
362 char *suffix; 370 char *suffix;
363 char *name; 371 char *name;
364 372
365 err = security_inode_init_security(inode, dir, &suffix, &value, &len); 373 err = security_inode_init_security(inode, dir, qstr, &suffix, &value,
374 &len);
366 if (err) { 375 if (err) {
367 if (err == -EOPNOTSUPP) 376 if (err == -EOPNOTSUPP)
368 return 0; 377 return 0;
diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h
index 7a43fd640bbb..b3cc8039134b 100644
--- a/fs/btrfs/xattr.h
+++ b/fs/btrfs/xattr.h
@@ -37,6 +37,7 @@ extern int btrfs_setxattr(struct dentry *dentry, const char *name,
37extern int btrfs_removexattr(struct dentry *dentry, const char *name); 37extern int btrfs_removexattr(struct dentry *dentry, const char *name);
38 38
39extern int btrfs_xattr_security_init(struct btrfs_trans_handle *trans, 39extern int btrfs_xattr_security_init(struct btrfs_trans_handle *trans,
40 struct inode *inode, struct inode *dir); 40 struct inode *inode, struct inode *dir,
41 const struct qstr *qstr);
41 42
42#endif /* __XATTR__ */ 43#endif /* __XATTR__ */
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index 3e2b90eaa239..faccd47c6c46 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -32,15 +32,6 @@
32#include <linux/bio.h> 32#include <linux/bio.h>
33#include "compression.h" 33#include "compression.h"
34 34
35/* Plan: call deflate() with avail_in == *sourcelen,
36 avail_out = *dstlen - 12 and flush == Z_FINISH.
37 If it doesn't manage to finish, call it again with
38 avail_in == 0 and avail_out set to the remaining 12
39 bytes for it to clean up.
40 Q: Is 12 bytes sufficient?
41*/
42#define STREAM_END_SPACE 12
43
44struct workspace { 35struct workspace {
45 z_stream inf_strm; 36 z_stream inf_strm;
46 z_stream def_strm; 37 z_stream def_strm;
@@ -48,169 +39,63 @@ struct workspace {
48 struct list_head list; 39 struct list_head list;
49}; 40};
50 41
51static LIST_HEAD(idle_workspace); 42static void zlib_free_workspace(struct list_head *ws)
52static DEFINE_SPINLOCK(workspace_lock);
53static unsigned long num_workspace;
54static atomic_t alloc_workspace = ATOMIC_INIT(0);
55static DECLARE_WAIT_QUEUE_HEAD(workspace_wait);
56
57/*
58 * this finds an available zlib workspace or allocates a new one
59 * NULL or an ERR_PTR is returned if things go bad.
60 */
61static struct workspace *find_zlib_workspace(void)
62{ 43{
63 struct workspace *workspace; 44 struct workspace *workspace = list_entry(ws, struct workspace, list);
64 int ret;
65 int cpus = num_online_cpus();
66
67again:
68 spin_lock(&workspace_lock);
69 if (!list_empty(&idle_workspace)) {
70 workspace = list_entry(idle_workspace.next, struct workspace,
71 list);
72 list_del(&workspace->list);
73 num_workspace--;
74 spin_unlock(&workspace_lock);
75 return workspace;
76
77 }
78 spin_unlock(&workspace_lock);
79 if (atomic_read(&alloc_workspace) > cpus) {
80 DEFINE_WAIT(wait);
81 prepare_to_wait(&workspace_wait, &wait, TASK_UNINTERRUPTIBLE);
82 if (atomic_read(&alloc_workspace) > cpus)
83 schedule();
84 finish_wait(&workspace_wait, &wait);
85 goto again;
86 }
87 atomic_inc(&alloc_workspace);
88 workspace = kzalloc(sizeof(*workspace), GFP_NOFS);
89 if (!workspace) {
90 ret = -ENOMEM;
91 goto fail;
92 }
93
94 workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize());
95 if (!workspace->def_strm.workspace) {
96 ret = -ENOMEM;
97 goto fail;
98 }
99 workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize());
100 if (!workspace->inf_strm.workspace) {
101 ret = -ENOMEM;
102 goto fail_inflate;
103 }
104 workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS);
105 if (!workspace->buf) {
106 ret = -ENOMEM;
107 goto fail_kmalloc;
108 }
109 return workspace;
110 45
111fail_kmalloc:
112 vfree(workspace->inf_strm.workspace);
113fail_inflate:
114 vfree(workspace->def_strm.workspace);
115fail:
116 kfree(workspace);
117 atomic_dec(&alloc_workspace);
118 wake_up(&workspace_wait);
119 return ERR_PTR(ret);
120}
121
122/*
123 * put a workspace struct back on the list or free it if we have enough
124 * idle ones sitting around
125 */
126static int free_workspace(struct workspace *workspace)
127{
128 spin_lock(&workspace_lock);
129 if (num_workspace < num_online_cpus()) {
130 list_add_tail(&workspace->list, &idle_workspace);
131 num_workspace++;
132 spin_unlock(&workspace_lock);
133 if (waitqueue_active(&workspace_wait))
134 wake_up(&workspace_wait);
135 return 0;
136 }
137 spin_unlock(&workspace_lock);
138 vfree(workspace->def_strm.workspace); 46 vfree(workspace->def_strm.workspace);
139 vfree(workspace->inf_strm.workspace); 47 vfree(workspace->inf_strm.workspace);
140 kfree(workspace->buf); 48 kfree(workspace->buf);
141 kfree(workspace); 49 kfree(workspace);
142
143 atomic_dec(&alloc_workspace);
144 if (waitqueue_active(&workspace_wait))
145 wake_up(&workspace_wait);
146 return 0;
147} 50}
148 51
149/* 52static struct list_head *zlib_alloc_workspace(void)
150 * cleanup function for module exit
151 */
152static void free_workspaces(void)
153{ 53{
154 struct workspace *workspace; 54 struct workspace *workspace;
155 while (!list_empty(&idle_workspace)) { 55
156 workspace = list_entry(idle_workspace.next, struct workspace, 56 workspace = kzalloc(sizeof(*workspace), GFP_NOFS);
157 list); 57 if (!workspace)
158 list_del(&workspace->list); 58 return ERR_PTR(-ENOMEM);
159 vfree(workspace->def_strm.workspace); 59
160 vfree(workspace->inf_strm.workspace); 60 workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize(
161 kfree(workspace->buf); 61 MAX_WBITS, MAX_MEM_LEVEL));
162 kfree(workspace); 62 workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize());
163 atomic_dec(&alloc_workspace); 63 workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS);
164 } 64 if (!workspace->def_strm.workspace ||
65 !workspace->inf_strm.workspace || !workspace->buf)
66 goto fail;
67
68 INIT_LIST_HEAD(&workspace->list);
69
70 return &workspace->list;
71fail:
72 zlib_free_workspace(&workspace->list);
73 return ERR_PTR(-ENOMEM);
165} 74}
166 75
167/* 76static int zlib_compress_pages(struct list_head *ws,
168 * given an address space and start/len, compress the bytes. 77 struct address_space *mapping,
169 * 78 u64 start, unsigned long len,
170 * pages are allocated to hold the compressed result and stored 79 struct page **pages,
171 * in 'pages' 80 unsigned long nr_dest_pages,
172 * 81 unsigned long *out_pages,
173 * out_pages is used to return the number of pages allocated. There 82 unsigned long *total_in,
174 * may be pages allocated even if we return an error 83 unsigned long *total_out,
175 * 84 unsigned long max_out)
176 * total_in is used to return the number of bytes actually read. It
177 * may be smaller then len if we had to exit early because we
178 * ran out of room in the pages array or because we cross the
179 * max_out threshold.
180 *
181 * total_out is used to return the total number of compressed bytes
182 *
183 * max_out tells us the max number of bytes that we're allowed to
184 * stuff into pages
185 */
186int btrfs_zlib_compress_pages(struct address_space *mapping,
187 u64 start, unsigned long len,
188 struct page **pages,
189 unsigned long nr_dest_pages,
190 unsigned long *out_pages,
191 unsigned long *total_in,
192 unsigned long *total_out,
193 unsigned long max_out)
194{ 85{
86 struct workspace *workspace = list_entry(ws, struct workspace, list);
195 int ret; 87 int ret;
196 struct workspace *workspace;
197 char *data_in; 88 char *data_in;
198 char *cpage_out; 89 char *cpage_out;
199 int nr_pages = 0; 90 int nr_pages = 0;
200 struct page *in_page = NULL; 91 struct page *in_page = NULL;
201 struct page *out_page = NULL; 92 struct page *out_page = NULL;
202 int out_written = 0;
203 int in_read = 0;
204 unsigned long bytes_left; 93 unsigned long bytes_left;
205 94
206 *out_pages = 0; 95 *out_pages = 0;
207 *total_out = 0; 96 *total_out = 0;
208 *total_in = 0; 97 *total_in = 0;
209 98
210 workspace = find_zlib_workspace();
211 if (IS_ERR(workspace))
212 return -1;
213
214 if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) { 99 if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) {
215 printk(KERN_WARNING "deflateInit failed\n"); 100 printk(KERN_WARNING "deflateInit failed\n");
216 ret = -1; 101 ret = -1;
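
Aside: zlib_alloc_workspace() gets away with one combined NULL check and a single fail: path because zlib_free_workspace() tolerates a half-initialized workspace, vfree() and kfree() both being no-ops on NULL. The same pattern as a userspace sketch, where free(NULL) behaves identically:

#include <stdlib.h>

struct ws { void *a, *b, *c; };

static void ws_free(struct ws *w)
{
        if (!w)
                return;
        free(w->a);             /* free(NULL) is a no-op, like kfree/vfree */
        free(w->b);
        free(w->c);
        free(w);
}

static struct ws *ws_alloc(void)
{
        struct ws *w = calloc(1, sizeof(*w));

        if (!w)
                return NULL;
        w->a = malloc(64);
        w->b = malloc(64);
        w->c = malloc(64);
        if (!w->a || !w->b || !w->c)
                goto fail;      /* one check, one unwind path */
        return w;

fail:
        ws_free(w);
        return NULL;
}

int main(void)
{
        struct ws *w = ws_alloc();

        ws_free(w);
        return 0;
}
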
@@ -224,6 +109,10 @@ int btrfs_zlib_compress_pages(struct address_space *mapping,
224 data_in = kmap(in_page); 109 data_in = kmap(in_page);
225 110
226 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); 111 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
112 if (out_page == NULL) {
113 ret = -1;
114 goto out;
115 }
227 cpage_out = kmap(out_page); 116 cpage_out = kmap(out_page);
228 pages[0] = out_page; 117 pages[0] = out_page;
229 nr_pages = 1; 118 nr_pages = 1;
@@ -233,9 +122,6 @@ int btrfs_zlib_compress_pages(struct address_space *mapping,
233 workspace->def_strm.avail_out = PAGE_CACHE_SIZE; 122 workspace->def_strm.avail_out = PAGE_CACHE_SIZE;
234 workspace->def_strm.avail_in = min(len, PAGE_CACHE_SIZE); 123 workspace->def_strm.avail_in = min(len, PAGE_CACHE_SIZE);
235 124
236 out_written = 0;
237 in_read = 0;
238
239 while (workspace->def_strm.total_in < len) { 125 while (workspace->def_strm.total_in < len) {
240 ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH); 126 ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH);
241 if (ret != Z_OK) { 127 if (ret != Z_OK) {
@@ -265,6 +151,10 @@ int btrfs_zlib_compress_pages(struct address_space *mapping,
265 goto out; 151 goto out;
266 } 152 }
267 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); 153 out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
154 if (out_page == NULL) {
155 ret = -1;
156 goto out;
157 }
268 cpage_out = kmap(out_page); 158 cpage_out = kmap(out_page);
269 pages[nr_pages] = out_page; 159 pages[nr_pages] = out_page;
270 nr_pages++; 160 nr_pages++;
@@ -319,55 +209,26 @@ out:
319 kunmap(in_page); 209 kunmap(in_page);
320 page_cache_release(in_page); 210 page_cache_release(in_page);
321 } 211 }
322 free_workspace(workspace);
323 return ret; 212 return ret;
324} 213}
325 214
326/* 215static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in,
327 * pages_in is an array of pages with compressed data. 216 u64 disk_start,
328 * 217 struct bio_vec *bvec,
329 * disk_start is the starting logical offset of this array in the file 218 int vcnt,
330 * 219 size_t srclen)
331 * bvec is a bio_vec of pages from the file that we want to decompress into
332 *
333 * vcnt is the count of pages in the biovec
334 *
335 * srclen is the number of bytes in pages_in
336 *
337 * The basic idea is that we have a bio that was created by readpages.
338 * The pages in the bio are for the uncompressed data, and they may not
339 * be contiguous. They all correspond to the range of bytes covered by
340 * the compressed extent.
341 */
342int btrfs_zlib_decompress_biovec(struct page **pages_in,
343 u64 disk_start,
344 struct bio_vec *bvec,
345 int vcnt,
346 size_t srclen)
347{ 220{
348 int ret = 0; 221 struct workspace *workspace = list_entry(ws, struct workspace, list);
222 int ret = 0, ret2;
349 int wbits = MAX_WBITS; 223 int wbits = MAX_WBITS;
350 struct workspace *workspace;
351 char *data_in; 224 char *data_in;
352 size_t total_out = 0; 225 size_t total_out = 0;
353 unsigned long page_bytes_left;
354 unsigned long page_in_index = 0; 226 unsigned long page_in_index = 0;
355 unsigned long page_out_index = 0; 227 unsigned long page_out_index = 0;
356 struct page *page_out;
357 unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) / 228 unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) /
358 PAGE_CACHE_SIZE; 229 PAGE_CACHE_SIZE;
359 unsigned long buf_start; 230 unsigned long buf_start;
360 unsigned long buf_offset;
361 unsigned long bytes;
362 unsigned long working_bytes;
363 unsigned long pg_offset; 231 unsigned long pg_offset;
364 unsigned long start_byte;
365 unsigned long current_buf_start;
366 char *kaddr;
367
368 workspace = find_zlib_workspace();
369 if (IS_ERR(workspace))
370 return -ENOMEM;
371 232
372 data_in = kmap(pages_in[page_in_index]); 233 data_in = kmap(pages_in[page_in_index]);
373 workspace->inf_strm.next_in = data_in; 234 workspace->inf_strm.next_in = data_in;
@@ -377,8 +238,6 @@ int btrfs_zlib_decompress_biovec(struct page **pages_in,
377 workspace->inf_strm.total_out = 0; 238 workspace->inf_strm.total_out = 0;
378 workspace->inf_strm.next_out = workspace->buf; 239 workspace->inf_strm.next_out = workspace->buf;
379 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; 240 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
380 page_out = bvec[page_out_index].bv_page;
381 page_bytes_left = PAGE_CACHE_SIZE;
382 pg_offset = 0; 241 pg_offset = 0;
383 242
384 /* If it's deflate, and it's got no preset dictionary, then 243 /* If it's deflate, and it's got no preset dictionary, then
@@ -394,107 +253,29 @@ int btrfs_zlib_decompress_biovec(struct page **pages_in,
394 253
395 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { 254 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
396 printk(KERN_WARNING "inflateInit failed\n"); 255 printk(KERN_WARNING "inflateInit failed\n");
397 ret = -1; 256 return -1;
398 goto out;
399 } 257 }
400 while (workspace->inf_strm.total_in < srclen) { 258 while (workspace->inf_strm.total_in < srclen) {
401 ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH); 259 ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH);
402 if (ret != Z_OK && ret != Z_STREAM_END) 260 if (ret != Z_OK && ret != Z_STREAM_END)
403 break; 261 break;
404 /*
405 * buf start is the byte offset we're of the start of
406 * our workspace buffer
407 */
408 buf_start = total_out;
409 262
410 /* total_out is the last byte of the workspace buffer */ 263 buf_start = total_out;
411 total_out = workspace->inf_strm.total_out; 264 total_out = workspace->inf_strm.total_out;
412 265
413 working_bytes = total_out - buf_start; 266 /* we didn't make progress in this inflate call, we're done */
414 267 if (buf_start == total_out)
415 /*
416 * start byte is the first byte of the page we're currently
417 * copying into relative to the start of the compressed data.
418 */
419 start_byte = page_offset(page_out) - disk_start;
420
421 if (working_bytes == 0) {
422 /* we didn't make progress in this inflate
423 * call, we're done
424 */
425 if (ret != Z_STREAM_END)
426 ret = -1;
427 break; 268 break;
428 }
429 269
430 /* we haven't yet hit data corresponding to this page */ 270 ret2 = btrfs_decompress_buf2page(workspace->buf, buf_start,
431 if (total_out <= start_byte) 271 total_out, disk_start,
432 goto next; 272 bvec, vcnt,
433 273 &page_out_index, &pg_offset);
434 /* 274 if (ret2 == 0) {
435 * the start of the data we care about is offset into 275 ret = 0;
436 * the middle of our working buffer 276 goto done;
437 */
438 if (total_out > start_byte && buf_start < start_byte) {
439 buf_offset = start_byte - buf_start;
440 working_bytes -= buf_offset;
441 } else {
442 buf_offset = 0;
443 }
444 current_buf_start = buf_start;
445
446 /* copy bytes from the working buffer into the pages */
447 while (working_bytes > 0) {
448 bytes = min(PAGE_CACHE_SIZE - pg_offset,
449 PAGE_CACHE_SIZE - buf_offset);
450 bytes = min(bytes, working_bytes);
451 kaddr = kmap_atomic(page_out, KM_USER0);
452 memcpy(kaddr + pg_offset, workspace->buf + buf_offset,
453 bytes);
454 kunmap_atomic(kaddr, KM_USER0);
455 flush_dcache_page(page_out);
456
457 pg_offset += bytes;
458 page_bytes_left -= bytes;
459 buf_offset += bytes;
460 working_bytes -= bytes;
461 current_buf_start += bytes;
462
463 /* check if we need to pick another page */
464 if (page_bytes_left == 0) {
465 page_out_index++;
466 if (page_out_index >= vcnt) {
467 ret = 0;
468 goto done;
469 }
470
471 page_out = bvec[page_out_index].bv_page;
472 pg_offset = 0;
473 page_bytes_left = PAGE_CACHE_SIZE;
474 start_byte = page_offset(page_out) - disk_start;
475
476 /*
477 * make sure our new page is covered by this
478 * working buffer
479 */
480 if (total_out <= start_byte)
481 goto next;
482
483 /* the next page in the biovec might not
484 * be adjacent to the last page, but it
485 * might still be found inside this working
486 * buffer. bump our offset pointer
487 */
488 if (total_out > start_byte &&
489 current_buf_start < start_byte) {
490 buf_offset = start_byte - buf_start;
491 working_bytes = total_out - start_byte;
492 current_buf_start = buf_start +
493 buf_offset;
494 }
495 }
496 } 277 }
497next: 278
498 workspace->inf_strm.next_out = workspace->buf; 279 workspace->inf_strm.next_out = workspace->buf;
499 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; 280 workspace->inf_strm.avail_out = PAGE_CACHE_SIZE;
500 281
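
Aside: the hand-rolled copy loop is gone; btrfs_decompress_buf2page() now maps each inflated window [buf_start, total_out) onto the destination pages so lzo.c can share it. What the helper has to do, modeled against one flat destination buffer instead of a bio_vec (an illustration of the overlap math only; the real helper also advances the page and offset cursors):

#include <string.h>

/*
 * Copy the freshly inflated window [buf_start, total_out) of the file
 * into a destination covering [dest_start, dest_start + dest_len).
 * Returns 0 once the destination is full, 1 if more input is needed.
 */
static int buf_to_dest(const char *buf, unsigned long buf_start,
                       unsigned long total_out, unsigned long dest_start,
                       char *dest, unsigned long dest_len)
{
        unsigned long lo, hi;

        if (total_out <= dest_start)
                return 1;               /* window is entirely before dest */
        lo = buf_start > dest_start ? buf_start : dest_start;
        hi = total_out < dest_start + dest_len ?
             total_out : dest_start + dest_len;
        if (lo < hi)
                memcpy(dest + (lo - dest_start),
                       buf + (lo - buf_start), hi - lo);
        return hi == dest_start + dest_len ? 0 : 1;
}

int main(void)
{
        char dest[8] = { 0 };

        /* inflate produced file bytes 4..12; dest covers bytes 0..8 */
        buf_to_dest("ABCDEFGH", 4, 12, 0, dest, sizeof(dest));
        return 0;
}
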
@@ -521,35 +302,21 @@ done:
521 zlib_inflateEnd(&workspace->inf_strm); 302 zlib_inflateEnd(&workspace->inf_strm);
522 if (data_in) 303 if (data_in)
523 kunmap(pages_in[page_in_index]); 304 kunmap(pages_in[page_in_index]);
524out:
525 free_workspace(workspace);
526 return ret; 305 return ret;
527} 306}
528 307
529/* 308static int zlib_decompress(struct list_head *ws, unsigned char *data_in,
530 * a less complex decompression routine. Our compressed data fits in a 309 struct page *dest_page,
531 * single page, and we want to read a single page out of it. 310 unsigned long start_byte,
532 * start_byte tells us the offset into the compressed data we're interested in 311 size_t srclen, size_t destlen)
533 */
534int btrfs_zlib_decompress(unsigned char *data_in,
535 struct page *dest_page,
536 unsigned long start_byte,
537 size_t srclen, size_t destlen)
538{ 312{
313 struct workspace *workspace = list_entry(ws, struct workspace, list);
539 int ret = 0; 314 int ret = 0;
540 int wbits = MAX_WBITS; 315 int wbits = MAX_WBITS;
541 struct workspace *workspace;
542 unsigned long bytes_left = destlen; 316 unsigned long bytes_left = destlen;
543 unsigned long total_out = 0; 317 unsigned long total_out = 0;
544 char *kaddr; 318 char *kaddr;
545 319
546 if (destlen > PAGE_CACHE_SIZE)
547 return -ENOMEM;
548
549 workspace = find_zlib_workspace();
550 if (IS_ERR(workspace))
551 return -ENOMEM;
552
553 workspace->inf_strm.next_in = data_in; 320 workspace->inf_strm.next_in = data_in;
554 workspace->inf_strm.avail_in = srclen; 321 workspace->inf_strm.avail_in = srclen;
555 workspace->inf_strm.total_in = 0; 322 workspace->inf_strm.total_in = 0;
@@ -570,8 +337,7 @@ int btrfs_zlib_decompress(unsigned char *data_in,
570 337
571 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { 338 if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) {
572 printk(KERN_WARNING "inflateInit failed\n"); 339 printk(KERN_WARNING "inflateInit failed\n");
573 ret = -1; 340 return -1;
574 goto out;
575 } 341 }
576 342
577 while (bytes_left > 0) { 343 while (bytes_left > 0) {
@@ -621,12 +387,13 @@ next:
621 ret = 0; 387 ret = 0;
622 388
623 zlib_inflateEnd(&workspace->inf_strm); 389 zlib_inflateEnd(&workspace->inf_strm);
624out:
625 free_workspace(workspace);
626 return ret; 390 return ret;
627} 391}
628 392
629void btrfs_zlib_exit(void) 393struct btrfs_compress_op btrfs_zlib_compress = {
630{ 394 .alloc_workspace = zlib_alloc_workspace,
631 free_workspaces(); 395 .free_workspace = zlib_free_workspace,
632} 396 .compress_pages = zlib_compress_pages,
397 .decompress_biovec = zlib_decompress_biovec,
398 .decompress = zlib_decompress,
399};
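
Aside: with the per-file workspace cache gone, compression.c owns workspace pooling and reaches zlib only through this ops table. A schematic of the dispatch pattern it enables; the caller below is invented for illustration and is not btrfs code:

#include <linux/err.h>
#include <linux/list.h>

struct compress_op {
        struct list_head *(*alloc_workspace)(void);
        void (*free_workspace)(struct list_head *ws);
};

/* run one operation with a freshly allocated workspace */
static int with_workspace(const struct compress_op *op,
                          int (*fn)(struct list_head *ws))
{
        struct list_head *ws = op->alloc_workspace();
        int ret;

        if (IS_ERR(ws))
                return PTR_ERR(ws);
        ret = fn(ws);           /* compress/decompress using this workspace */
        op->free_workspace(ws);
        return ret;
}
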